# Example 1
def split_indices(all_indices: List[int],
                  num_folds: int,
                  scaffold: bool = False,
                  data: 'MoleculeDataset' = None,
                  shuffle: bool = True) -> List[List[int]]:
    """Split indices into ``num_folds`` folds, optionally grouping by scaffold.

    :param all_indices: The indices to split.
    :param num_folds: Number of folds to produce.
    :param scaffold: If True, keep molecules sharing a scaffold in the same
                     fold (requires ``data``).
    :param data: Dataset whose molecules define the scaffolds; required when
                 ``scaffold`` is True.
    :param shuffle: Whether to shuffle (folds for scaffold splits; the index
                    list itself, in place, for random splits).
    :return: A list of ``num_folds`` index collections (lists for scaffold
             splits, numpy arrays for random splits).
    :raises ValueError: If ``scaffold`` is True but ``data`` is None.
    """
    num_data = len(all_indices)
    if scaffold:
        if data is None:
            # Fail fast with a clear message instead of an AttributeError below.
            raise ValueError('data must be provided when scaffold=True')
        # Group indices by scaffold, largest groups first, then greedily
        # assign each group to the currently smallest fold to balance sizes.
        scaffold_to_indices = scaffold_to_smiles(data.mols(), use_indices=True)
        index_sets = sorted(scaffold_to_indices.values(),
                            key=len,
                            reverse=True)
        fold_indices = [[] for _ in range(num_folds)]
        for index_set in index_sets:
            fold_sizes = [len(fold) for fold in fold_indices]
            smallest = fold_sizes.index(min(fold_sizes))
            fold_indices[smallest] += index_set
        if shuffle:
            random.shuffle(fold_indices)
    else:  # random split
        if shuffle:
            random.shuffle(all_indices)
        fold_indices = []
        for i in range(num_folds):
            # Integer boundaries spread any remainder across the folds.
            begin = int(i * num_data / num_folds)
            end = int((i + 1) * num_data / num_folds)
            fold_indices.append(np.array(all_indices[begin:end]))
    return fold_indices
# Example 2
def split_indices(all_indices, num_folds, data, shuffle=True):
    """Split indices into ``num_folds`` scaffold-balanced folds.

    Molecules sharing a scaffold are kept in the same fold; scaffold groups
    are assigned greedily (largest first) to the currently smallest fold.

    :param all_indices: The indices being split (unused here; the scaffold
                        grouping over ``data`` determines the folds).
    :param num_folds: Number of folds to produce.
    :param data: Dataset whose molecules define the scaffolds.
    :param shuffle: Whether to shuffle the order of the folds.
    :return: A list of ``num_folds`` lists of indices.
    """
    # NOTE: leftover debug prints and the unused `num_data` local were removed.
    scaffold_to_indices = scaffold_to_smiles(data.mols(flatten=True),
                                             use_indices=True)
    index_sets = sorted(scaffold_to_indices.values(),
                        key=len,
                        reverse=True)
    fold_indices = [[] for _ in range(num_folds)]
    for index_set in index_sets:
        fold_sizes = [len(fold) for fold in fold_indices]
        smallest = fold_sizes.index(min(fold_sizes))
        fold_indices[smallest] += index_set
    if shuffle:
        random.shuffle(fold_indices)

    return fold_indices
# Example 3
def scaffold_similarity(smiles_1: List[str], smiles_2: List[str]):
    """
    Determines the similarity between the scaffolds of two lists of smiles strings.

    Computes scaffold/molecule overlap statistics between the two datasets
    and prints them to stdout; returns None.

    :param smiles_1: A list of smiles strings.
    :param smiles_2: A list of smiles strings.
    """
    # Get scaffolds: each mapping is scaffold -> collection of smiles with that scaffold
    scaffold_to_smiles_1 = scaffold_to_smiles(smiles_1)
    scaffold_to_smiles_2 = scaffold_to_smiles(smiles_2)

    scaffolds_1, smiles_sets_1 = zip(*scaffold_to_smiles_1.items())
    scaffolds_2, smiles_sets_2 = zip(*scaffold_to_smiles_2.items())

    # Invert the mappings: smiles -> scaffold. Dataset 2 entries overwrite
    # dataset 1 entries for shared smiles (presumably the same scaffold either
    # way, assuming scaffold computation is deterministic — TODO confirm).
    smiles_to_scaffold = {
        smiles: scaffold
        for scaffold, smiles_set in scaffold_to_smiles_1.items()
        for smiles in smiles_set
    }
    smiles_to_scaffold.update({
        smiles: scaffold
        for scaffold, smiles_set in scaffold_to_smiles_2.items()
        for smiles in smiles_set
    })

    # Determine similarity
    # NOTE: smiles_1/smiles_2 are rebound to sets here, deduplicating them.
    scaffolds_1, scaffolds_2 = set(scaffolds_1), set(scaffolds_2)
    smiles_1, smiles_2 = set(smiles_1), set(smiles_2)

    all_scaffolds = scaffolds_1 | scaffolds_2
    all_smiles = smiles_1 | smiles_2

    scaffolds_intersection = scaffolds_1 & scaffolds_2
    # smiles_intersection is smiles with a scaffold that appears in both datasets
    smiles_intersection = {
        smiles
        for smiles in all_smiles
        if smiles_to_scaffold[smiles] in scaffolds_intersection
    }

    # Cross-dataset coverage: molecules in one dataset whose scaffold also
    # appears in the other dataset.
    smiles_in_1_with_scaffold_in_2 = {
        smiles
        for smiles in smiles_1 if smiles_to_scaffold[smiles] in scaffolds_2
    }
    smiles_in_2_with_scaffold_in_1 = {
        smiles
        for smiles in smiles_2 if smiles_to_scaffold[smiles] in scaffolds_1
    }

    # Per-scaffold group sizes (counted before deduplication above).
    sizes_1 = np.array([len(smiles_set) for smiles_set in smiles_sets_1])
    sizes_2 = np.array([len(smiles_set) for smiles_set in smiles_sets_2])

    # Print results
    print()
    print(f'Number of molecules = {len(all_smiles):,}')
    print(f'Number of scaffolds = {len(all_scaffolds):,}')
    print()
    print(
        f'Number of scaffolds in both datasets = {len(scaffolds_intersection):,}'
    )
    print(
        f'Scaffold intersection over union = {len(scaffolds_intersection) / len(all_scaffolds):.4f}'
    )
    print()
    print(
        f'Number of molecules with scaffold in both datasets = {len(smiles_intersection):,}'
    )
    print(
        f'Molecule intersection over union = {len(smiles_intersection) / len(all_smiles):.4f}'
    )
    print()
    print(f'Number of molecules in dataset 1 = {np.sum(sizes_1):,}')
    print(f'Number of scaffolds in dataset 1 = {len(scaffolds_1):,}')
    print()
    print(f'Number of molecules in dataset 2 = {np.sum(sizes_2):,}')
    print(f'Number of scaffolds in dataset 2 = {len(scaffolds_2):,}')
    print()
    print(
        f'Percent of scaffolds in dataset 1 which are also in dataset 2 = {100 * len(scaffolds_intersection) / len(scaffolds_1):.2f}%'
    )
    print(
        f'Percent of scaffolds in dataset 2 which are also in dataset 1 = {100 * len(scaffolds_intersection) / len(scaffolds_2):.2f}%'
    )
    print()
    print(
        f'Number of molecules in dataset 1 with scaffolds in dataset 2 = {len(smiles_in_1_with_scaffold_in_2):,}'
    )
    print(
        f'Percent of molecules in dataset 1 with scaffolds in dataset 2 = {100 * len(smiles_in_1_with_scaffold_in_2) / len(smiles_1):.2f}%'
    )
    print()
    print(
        f'Number of molecules in dataset 2 with scaffolds in dataset 1 = {len(smiles_in_2_with_scaffold_in_1):,}'
    )
    print(
        f'Percent of molecules in dataset 2 with scaffolds in dataset 1 = {100 * len(smiles_in_2_with_scaffold_in_1) / len(smiles_2):.2f}%'
    )
    print()
    print(
        f'Average number of molecules per scaffold in dataset 1 = {np.mean(sizes_1):.4f} +/- {np.std(sizes_1):.4f}'
    )
    print('Percentiles for molecules per scaffold in dataset 1')
    print(' | '.join([
        f'{i}% = {int(np.percentile(sizes_1, i)):,}'
        for i in range(0, 101, 10)
    ]))
    print()
    print(
        f'Average number of molecules per scaffold in dataset 2 = {np.mean(sizes_2):.4f} +/- {np.std(sizes_2):.4f}'
    )
    print('Percentiles for molecules per scaffold in dataset 2')
    print(' | '.join([
        f'{i}% = {int(np.percentile(sizes_2, i)):,}'
        for i in range(0, 101, 10)
    ]))
def scaffold_split_num_pos(data_path: str, max_scaffold_size_in_test: int,
                           num_pos_in_test: int, percent_neg_in_test: float,
                           save_dir: str):
    """Split a binary-activity dataset by scaffold into train/test CSVs.

    All "big" scaffolds (>= ``max_scaffold_size_in_test`` molecules) go to
    train. Among the small scaffolds: mixed-activity scaffolds are split
    half/half between train and test, all-positive scaffolds are added to
    test until ``num_pos_in_test`` positives are reached, and all-negative
    scaffolds are added to test until ``percent_neg_in_test`` of all
    negatives are reached; everything else goes to train. Writes
    ``train.csv`` and ``test.csv`` to ``save_dir``.

    :param data_path: Path to a CSV with 'smiles' and 'activity' (0/1) columns.
    :param max_scaffold_size_in_test: Scaffolds of at least this size always
                                      go to train.
    :param num_pos_in_test: Target number of positive molecules in test.
    :param percent_neg_in_test: Target fraction of negative molecules in test.
    :param save_dir: Directory where train.csv and test.csv are written.
    """
    # Load data
    data = pd.read_csv(data_path)
    mols = [Chem.MolFromSmiles(smiles) for smiles in data['smiles']]

    # Determine scaffolds
    scaffold_to_indices: Dict[str,
                              Set[int]] = scaffold_to_smiles(mols,
                                                             use_indices=True)
    # Sort each index set into a list for deterministic iteration.
    scaffold_to_indices = {
        scaffold: sorted(indices)
        for scaffold, indices in scaffold_to_indices.items()
    }

    # Split scaffolds into those with all positive, all negative, or mixed activity
    pos_scaffolds, mix_scaffolds, neg_scaffolds = [], [], []
    for scaffold, indices in scaffold_to_indices.items():
        activities = {data.iloc[index]['activity'] for index in indices}

        if activities == {1}:
            pos_scaffolds.append(scaffold)
        elif activities == {0}:
            neg_scaffolds.append(scaffold)
        elif activities == {0, 1}:
            mix_scaffolds.append(scaffold)
        else:
            raise ValueError(
                f'Found activities "{activities}" but should only be 0 or 1')

    # Reproducibility: fixed seed plus sorted inputs before shuffling
    random.seed(0)
    pos_scaffolds, mix_scaffolds, neg_scaffolds = sorted(
        pos_scaffolds), sorted(mix_scaffolds), sorted(neg_scaffolds)

    # Get small scaffolds (only these are eligible for the test set)
    small_pos_scaffolds = [
        scaffold for scaffold in pos_scaffolds
        if len(scaffold_to_indices[scaffold]) < max_scaffold_size_in_test
    ]
    small_mix_scaffolds = [
        scaffold for scaffold in mix_scaffolds
        if len(scaffold_to_indices[scaffold]) < max_scaffold_size_in_test
    ]
    small_neg_scaffolds = [
        scaffold for scaffold in neg_scaffolds
        if len(scaffold_to_indices[scaffold]) < max_scaffold_size_in_test
    ]

    # Put all big scaffolds in train
    train_scaffolds = sorted(
        set.union(
            set(pos_scaffolds) - set(small_pos_scaffolds),
            set(mix_scaffolds) - set(small_mix_scaffolds),
            set(neg_scaffolds) - set(small_neg_scaffolds)))
    test_scaffolds = []

    # Mixed scaffolds (half in train, half in test)
    random.shuffle(small_mix_scaffolds)
    half = len(small_mix_scaffolds) // 2
    train_scaffolds += small_mix_scaffolds[:half]
    test_scaffolds += small_mix_scaffolds[half:]

    # Positive scaffolds (put in test until hit num_pos_in_test, rest in train)
    random.shuffle(small_pos_scaffolds)
    test_indices = sum(
        (scaffold_to_indices[scaffold] for scaffold in test_scaffolds), [])
    num_pos = sum(data.iloc[test_indices]['activity'])

    for scaffold in small_pos_scaffolds:
        scaffold_size = len(scaffold_to_indices[scaffold])
        # Only add a scaffold if it fits entirely within the remaining quota.
        if num_pos < num_pos_in_test and scaffold_size <= (num_pos_in_test -
                                                           num_pos):
            test_scaffolds.append(scaffold)
            num_pos += scaffold_size
        else:
            train_scaffolds.append(scaffold)

    # Negative scaffolds (put in test until hit percent_neg_in_test, rest in train)
    random.shuffle(small_neg_scaffolds)
    test_indices = sum(
        (scaffold_to_indices[scaffold] for scaffold in test_scaffolds), [])
    num_neg_in_test = int(percent_neg_in_test * sum(data['activity'] == 0))
    # BUG FIX: count negatives (activity == 0) already in test; the original
    # summed the activity column, which counts positives.
    num_neg = sum(data.iloc[test_indices]['activity'] == 0)

    for scaffold in small_neg_scaffolds:
        scaffold_size = len(scaffold_to_indices[scaffold])
        if num_neg < num_neg_in_test and scaffold_size <= (num_neg_in_test -
                                                           num_neg):
            test_scaffolds.append(scaffold)
            num_neg += scaffold_size
        else:
            train_scaffolds.append(scaffold)

    # Get indices
    train_indices = sum(
        (scaffold_to_indices[scaffold] for scaffold in train_scaffolds), [])
    test_indices = sum(
        (scaffold_to_indices[scaffold] for scaffold in test_scaffolds), [])

    # Checks: the split must be a disjoint, exhaustive partition
    train_scaffolds_set, test_scaffolds_set = set(train_scaffolds), set(
        test_scaffolds)
    assert len(train_scaffolds_set & test_scaffolds_set) == 0
    assert set.union(train_scaffolds_set,
                     test_scaffolds_set) == set(scaffold_to_indices.keys())

    train_indices_set, test_indices_set = set(train_indices), set(test_indices)
    assert len(train_indices_set & test_indices_set) == 0
    assert set.union(train_indices_set,
                     test_indices_set) == set(range(len(data)))

    # Shuffle test
    random.shuffle(test_indices)

    # Split data
    train, test = data.iloc[train_indices], data.iloc[test_indices]

    # Print statistics
    print('train')
    print(train['activity'].value_counts())
    print('test')
    print(test['activity'].value_counts())

    # Save scaffolds
    makedirs(save_dir)
    train.to_csv(os.path.join(save_dir, 'train.csv'), index=False)
    test.to_csv(os.path.join(save_dir, 'test.csv'), index=False)