Пример #1
0
def find_similar_mols(test_smiles: List[str],
                      train_smiles: List[str],
                      distance_measure: str,
                      model: MoleculeModel = None,
                      num_neighbors: int = None,
                      batch_size: int = 50) -> List[OrderedDict]:
    """
    For each test molecule, finds the N most similar training molecules according to some distance measure.

    Two distance measures are supported:

    * ``'embedding'``: vectors are produced by ``compute_molecule_vectors`` using ``model`` and
      compared with the cosine distance.
    * ``'morgan'``: vectors are produced by ``morgan_binary_features_generator`` and compared
      with the Jaccard distance.

    :param test_smiles: A list of test SMILES strings.
    :param train_smiles: A list of train SMILES strings.
    :param distance_measure: The distance measure to use to determine nearest neighbors
                             (``'embedding'`` or ``'morgan'``).
    :param model: A trained MoleculeModel (only needed for distance_measure == 'embedding').
    :param num_neighbors: The number of nearest training molecules to find for each test molecule.
                          If None, all training molecules are reported, ordered by increasing distance.
    :param batch_size: Batch size passed to ``compute_molecule_vectors`` (only used for 'embedding').
    :return: A list of OrderedDicts, one per test molecule, containing ``test_smiles``,
             ``test_in_train`` (whether the test SMILES string also appears in ``train_smiles``) and,
             for each neighbor rank ``k``, ``train_{k}_smiles`` and
             ``train_{k}_{distance_measure}_{metric}_distance``.
    :raises ValueError: If ``distance_measure`` is not supported, or if it is ``'embedding'``
                        and no ``model`` was provided.
    """
    print(f'Computing {distance_measure} vectors')
    if distance_measure == 'embedding':
        # Explicit check instead of `assert`, which is removed when Python runs with -O.
        if model is None:
            raise ValueError('A trained model is required for distance measure "embedding".')

        # The datasets are only consumed by the model, so build them on this path only.
        test_data = get_data_from_smiles(test_smiles)
        train_data = get_data_from_smiles(train_smiles)
        test_vecs = np.array(compute_molecule_vectors(model=model, data=test_data, batch_size=batch_size))
        train_vecs = np.array(compute_molecule_vectors(model=model, data=train_data, batch_size=batch_size))
        metric = 'cosine'
    elif distance_measure == 'morgan':
        test_vecs = np.array([morgan_binary_features_generator(smiles) for smiles in tqdm(test_smiles, total=len(test_smiles))])
        train_vecs = np.array([morgan_binary_features_generator(smiles) for smiles in tqdm(train_smiles, total=len(train_smiles))])
        metric = 'jaccard'
    else:
        raise ValueError(f'Distance measure "{distance_measure}" not supported.')

    print('Computing distances')
    # distances[i][j] is the distance between test vector i and train vector j.
    distances = cdist(test_vecs, train_vecs, metric=metric)

    print('Finding neighbors')
    # Set gives O(1) exact-string membership checks for the 'test_in_train' flag.
    train_smiles_set = set(train_smiles)
    neighbors = []
    for test_index, test_smile in enumerate(test_smiles):
        test_distances = distances[test_index]

        # Find the num_neighbors molecules in the training set which are most similar to the test molecule
        # (a slice bound of None keeps every training molecule).
        nearest_train_indices = np.argsort(test_distances)[:num_neighbors]

        # Build dictionary with distance info
        neighbor = OrderedDict()
        neighbor['test_smiles'] = test_smile
        neighbor['test_in_train'] = test_smile in train_smiles_set

        for rank, train_index in enumerate(nearest_train_indices, start=1):
            neighbor[f'train_{rank}_smiles'] = train_smiles[train_index]
            neighbor[f'train_{rank}_{distance_measure}_{metric}_distance'] = test_distances[train_index]

        neighbors.append(neighbor)

    return neighbors
Пример #2
0
def featurize_file(input_df, output_path, pretrained_model):
    """
    Computes molecule vectors for the SMILES in a DataFrame and appends them to a CSV file.

    :param input_df: A DataFrame whose first column holds the SMILES strings.
    :param output_path: Path of the CSV file to append to. The header row is written only
                        when no file exists at this path yet.
    :param pretrained_model: The model passed to ``compute_molecule_vectors``.
    """
    first_column = input_df.columns[0]
    smiles_list = input_df[first_column].tolist()
    print(len(smiles_list))

    # Each SMILES string is wrapped in its own single-element list before being loaded.
    wrapped_smiles = [[smiles] for smiles in smiles_list]
    data = get_data_from_smiles(smiles=wrapped_smiles)

    print("Starting molecule vector computation...")
    descriptors = compute_molecule_vectors(model=pretrained_model, data=data, batch_size=64)
    print("Computation finished, saving result...")

    output_df = pd.DataFrame({'smiles': smiles_list, 'descriptors': descriptors})

    # Appending to an existing file must not repeat the header row.
    write_header = not os.path.exists(output_path)
    output_df.to_csv(output_path, mode='a+', header=write_header, encoding="ascii", index=False)