def dist_smiles_from_ecfp(
    ecfp1: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect],
    ecfp2: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect],
) -> List[float]:
    """Calculate the Tanimoto distance distribution between two lists of ECFP features.

    The full Tanimoto distance matrix between ``ecfp1`` and ``ecfp2`` is
    computed and then reduced with ``calc_summary`` using
    ``calc_type='nearest'``, ``num_nearest=1`` and ``within_dset=False``.

    Parameters
    ----------
    ecfp1: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect],
        A list of ECFP fingerprints
    ecfp2: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect]
        A list of ECFP fingerprints

    Returns
    -------
    List[float]
        A list of Tanimoto distances between 0 and 1. An empty list is
        returned when either input is empty, because there are no pairs
        of compounds to compare.
    """
    # With no fingerprints on one side there is no distance matrix to
    # summarize, so short-circuit instead of handing an empty input to the
    # distance code.
    if len(ecfp1) == 0 or len(ecfp2) == 0:
        return []

    dist_mat = dist_metrics.tanimoto(ecfp1, ecfp2)
    return cd.calc_summary(dist_mat, calc_type='nearest', num_nearest=1, within_dset=False)
def calc_dist_feat_array(feat_type, dist_met, feat1, feat2=None, calc_type='nearest', num_nearest=1, **metric_kwargs):
    """Returns a vector or array of distances, either between all compounds in a single dataset or between two
    datasets, given the feature matrices for the dataset(s).

    Args:
        feat_type (str): How the data was featurized. Current options are 'ECFP' or 'descriptors'.

        dist_met (str): What distance metric to use. Current options include tanimoto, cosine, cityblock, euclidean,
            or any other metric supported by scipy.spatial.distance.pdist().

        feat1: feature matrix as a numpy array

        feat2: Optional, second feature matrix

        calc_type (str): Type of summarization to perform on rows of distance matrix. See function calc_summary
            for options.

        num_nearest (int): Additional parameter for calc_types nearest, nth_nearest and avg_n_nearest.

        metric_kwargs: Additional arguments to be passed to the SciPy functions that calculate metrics
            (pdist for one dataset, cdist for two). They are not used for the Tanimoto metric.

    Returns:
        dists: vector or array of distances, or None if feat_type is not one of the supported values.

    """
    is_ecfp = feat_type in ('ECFP', 'ecfp')

    # Tanimoto distances on fingerprints use the project's own implementation.
    if is_ecfp and dist_met == 'tanimoto':
        if feat2 is None:
            return calc_summary(dist_metrics.tanimoto(feat1), calc_type, num_nearest, within_dset=True)
        if feat2.shape[0] == 1:
            # Single query compound: summarize a vector of distances
            return calc_summary(dist_metrics.tanimoto_single(feat2, feat1)[0], calc_type, num_nearest)
        return calc_summary(dist_metrics.tanimoto(feat1, feat2), calc_type, num_nearest)

    # Every other supported combination is delegated to SciPy.
    if is_ecfp or feat_type == 'descriptors':
        if feat2 is None:
            return calc_summary(pdist(feat1, dist_met, **metric_kwargs), calc_type, num_nearest,
                                within_dset=True)
        # Forward the metric options here too, so one- and two-dataset calls
        # are computed with the same metric parameters.
        return calc_summary(cdist(feat1, feat2, dist_met, **metric_kwargs), calc_type, num_nearest)

    # Unsupported feature type: keep the historical behavior of returning None.
    return None
def diversity_plots(dset_key, datastore=True, bucket='gsk_ml', title_prefix=None, ecfp_radius=4, out_dir=None,
                    id_col='compound_id', smiles_col='rdkit_smiles', max_for_mcs=300):
    """
    Plot visualizations of diversity for an arbitrary table of compounds. At minimum, the file should contain
    columns for a compound ID and a SMILES string.

    Compounds whose SMILES strings cannot be converted to a base molecule are reported and then left out of
    all distance calculations and plots.

    Args:
        dset_key (str): Datastore key or file path for the dataset.

        datastore (bool): If True, load the dataset from the datastore; otherwise read it as a CSV file.

        bucket (str): Datastore bucket passed to the datastore lookup.

        title_prefix (str): Prefix for plot titles. Defaults to the dataset file name with underscores
            replaced by spaces.

        ecfp_radius (int): Radius passed to the Morgan fingerprint calculation.

        out_dir (str): Optional output directory. When given, each plot is written to a PDF file there,
            together with an MCS distance table, images of the closest compound pairs and the MCS UMAP
            coordinates.

        id_col (str): Column containing compound IDs.

        smiles_col (str): Column containing SMILES strings.

        max_for_mcs (int): The MCS-based table and plots are only produced when the number of usable
            compounds does not exceed this value.
    """
    def _save_pdf(figure, suffix):
        """Write figure to <out_dir>/<file_prefix>_<suffix>.pdf; do nothing when out_dir is None."""
        if out_dir is None:
            return
        # The context manager closes the PDF even if savefig raises.
        with PdfPages('%s/%s_%s.pdf' % (out_dir, file_prefix, suffix)) as pdf:
            pdf.savefig(figure)

    # Load table of compound names, IDs and SMILES strings
    if datastore:
        cmpd_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket)
    else:
        cmpd_df = pd.read_csv(dset_key, index_col=False)
    file_prefix = os.path.splitext(os.path.basename(dset_key))[0]
    if title_prefix is None:
        title_prefix = file_prefix.replace('_', ' ')
    compound_ids = cmpd_df[id_col].values
    smiles_strs = cmpd_df[smiles_col].values
    print(len(smiles_strs))

    # Strip salts, canonicalize SMILES strings and create RDKit Mol objects
    print("Canonicalizing molecules...")
    all_mols = [struct_utils.base_mol_from_smiles(smiles) for smiles in smiles_strs]
    for i, mol in enumerate(all_mols):
        if mol is None:
            print('Unable to get base molecule for compound %d = %s' % (i, compound_ids[i]))
    # Drop failed molecules from BOTH the molecule list and the ID array so that
    # fingerprints, distance matrices and labels stay the same length.
    is_usable = [mol is not None for mol in all_mols]
    base_mols = [mol for mol in all_mols if mol is not None]
    compound_ids = compound_ids[is_usable]
    ncmpds = len(base_mols)
    print("Done")

    # Generate ECFP fingerprints
    print("Computing fingerprints...")
    fps = [AllChem.GetMorganFingerprintAsBitVect(mol, ecfp_radius, 1024) for mol in base_mols]
    print("Done")

    if ncmpds <= max_for_mcs:
        # Get MCS distance matrix and draw a heatmap
        print("Computing MCS distance matrix...")
        mcs_dist = dm.mcs(base_mols)
        print("Done")

        # Long-format table with one row per unordered pair of compounds
        pairs = [(i, j) for i in range(ncmpds - 1) for j in range(i + 1, ncmpds)]
        dist_df = pd.DataFrame({
            'compound_1': [compound_ids[i] for i, _ in pairs],
            'compound_2': [compound_ids[j] for _, j in pairs],
            'dist': [mcs_dist[i, j] for i, j in pairs],
            'i': [i for i, _ in pairs],
            'j': [j for _, j in pairs],
        })
        dist_df = dist_df.sort_values(by='dist')
        print(dist_df.head(10))
        if out_dir is not None:
            dist_df.to_csv('%s/%s_mcs_dist_table.csv' % (out_dir, file_prefix), index=False)
            # Draw the (up to) ten closest pairs; small datasets have fewer pairs.
            row_i = dist_df['i'].values
            row_j = dist_df['j'].values
            for k in range(min(10, len(dist_df))):
                img_file_i = '%s/%d_%s.png' % (out_dir, k, compound_ids[row_i[k]])
                img_file_j = '%s/%d_%s.png' % (out_dir, k, compound_ids[row_j[k]])
                Draw.MolToFile(base_mols[row_i[k]], img_file_i, size=(500, 500), fitImage=False)
                Draw.MolToFile(base_mols[row_j[k]], img_file_j, size=(500, 500), fitImage=False)

        # NOTE(review): linkage() receives the square matrix, which SciPy treats as
        # observation vectors rather than as pairwise distances -- confirm this is intended.
        mcs_linkage = linkage(mcs_dist, method='complete')
        mcs_df = pd.DataFrame(mcs_dist, columns=compound_ids, index=compound_ids)
        g = sns.clustermap(mcs_df, row_linkage=mcs_linkage, col_linkage=mcs_linkage, figsize=(12, 12),
                           cmap='plasma')
        _save_pdf(g.fig, 'mcs_clustermap')

        # Draw a UMAP projection based on MCS distance
        mapper = umap.UMAP(n_neighbors=10, n_components=2, metric='precomputed', random_state=17)
        reps = mapper.fit_transform(mcs_dist)
        rep_df = pd.DataFrame.from_records(reps, columns=['x', 'y'])
        rep_df['compound_id'] = compound_ids
        fig, ax = plt.subplots(figsize=(12, 12))
        sns.scatterplot(x='x', y='y', data=rep_df, ax=ax)
        ax.set_title("%s, 2D projection based on MCS distance" % title_prefix)
        _save_pdf(fig, 'mcs_umap_proj')
        if out_dir is not None:
            rep_df.to_csv('%s/%s_mcs_umap_proj.csv' % (out_dir, file_prefix), index=False)

    # Get Tanimoto distance matrix
    print("Computing Tanimoto distance matrix...")
    tani_dist = dm.tanimoto(fps)
    print("Done")

    # Draw a UMAP projection based on Tanimoto distance
    mapper = umap.UMAP(n_neighbors=10, n_components=2, metric='precomputed', random_state=17)
    reps = mapper.fit_transform(tani_dist)
    rep_df = pd.DataFrame.from_records(reps, columns=['x', 'y'])
    rep_df['compound_id'] = compound_ids
    fig, ax = plt.subplots(figsize=(12, 12))
    sns.scatterplot(x='x', y='y', data=rep_df, ax=ax)
    ax.set_title("%s, 2D projection based on Tanimoto distance" % title_prefix)
    _save_pdf(fig, 'tani_umap_proj')

    # Draw a cluster heatmap based on Tanimoto distance
    # NOTE(review): same square-matrix caveat for linkage() as in the MCS section.
    tani_linkage = linkage(tani_dist, method='complete')
    tani_df = pd.DataFrame(tani_dist, columns=compound_ids, index=compound_ids)
    g = sns.clustermap(tani_df, row_linkage=tani_linkage, col_linkage=tani_linkage, figsize=(12, 12),
                       cmap='plasma')
    _save_pdf(g.fig, 'tanimoto_clustermap')
def obach_diversity_plots(ecfp_radius=6,
                          cmpd_file='/usr/local/data/diversity_plots/obach/LombardoSupplemental_Data_rdsmiles.csv',
                          out_dir='/usr/local/data/diversity_plots/obach'):
    """
    Plot visualizations of diversity for the compounds in the Obach, Lombardo et al PK dataset

    Produces two UMAP projections based on Tanimoto distance (all compounds, and the "main portion" with
    x > -20 and y > -20) plus a clustered heatmap of the Tanimoto distance matrix. Compounds whose SMILES
    strings cannot be converted to a base molecule are reported and left out.

    Args:
        ecfp_radius (int): Radius passed to the Morgan fingerprint calculation.

        cmpd_file (str): Path of the CSV file holding the compound table; it must have 'Name' and
            'rdkit_smiles' columns.

        out_dir (str): Directory for the output PDF files; created if missing. If None, plots are drawn
            but not saved.
    """
    # TODO: Put this dataset in the datastore where everybody else can see it
    file_prefix = 'obach'
    title_prefix = 'Obach PK compound set'
    id_col = 'Name'
    smiles_col = 'rdkit_smiles'
    if out_dir is not None:
        os.makedirs(out_dir, exist_ok=True)

    # Load table of compound names, IDs and SMILES strings
    cmpd_df = pd.read_csv(cmpd_file, index_col=False)
    compound_ids = cmpd_df[id_col].values
    smiles_strs = cmpd_df[smiles_col].values

    # Strip salts, canonicalize SMILES strings and create RDKit Mol objects
    print("Canonicalizing molecules...")
    all_mols = [struct_utils.base_mol_from_smiles(smiles) for smiles in smiles_strs]
    for i, mol in enumerate(all_mols):
        if mol is None:
            print('Unable to get base molecule for compound %d = %s' % (i, compound_ids[i]))
    # Keep the ID array aligned with the molecules that survived, otherwise the
    # distance matrix and its labels would have different lengths.
    is_usable = [mol is not None for mol in all_mols]
    base_mols = [mol for mol in all_mols if mol is not None]
    compound_ids = compound_ids[is_usable]
    print("Done")

    # Generate ECFP fingerprints
    print("Computing fingerprints...")
    fps = [AllChem.GetMorganFingerprintAsBitVect(mol, ecfp_radius, 1024) for mol in base_mols]
    print("Done")

    # Get Tanimoto distance matrix
    print("Computing Tanimoto distance matrix...")
    tani_dist = dm.tanimoto(fps)
    print("Done")

    # Draw a UMAP projection based on Tanimoto distance
    mapper = umap.UMAP(n_neighbors=10, n_components=2, metric='precomputed', random_state=17)
    reps = mapper.fit_transform(tani_dist)
    rep_df = pd.DataFrame.from_records(reps, columns=['x', 'y'])
    rep_df['compound_id'] = compound_ids

    full_fig, full_ax = plt.subplots(figsize=(12, 12))
    sns.scatterplot(x='x', y='y', data=rep_df, ax=full_ax)
    full_ax.set_title("%s, 2D projection based on Tanimoto distance" % title_prefix)

    # Second view restricted to the main portion of the projection
    main_rep_df = rep_df[(rep_df.x > -20) & (rep_df.y > -20)]
    main_fig, main_ax = plt.subplots(figsize=(12, 12))
    sns.scatterplot(x='x', y='y', data=main_rep_df, ax=main_ax)
    main_ax.set_title("%s, main portion, 2D projection based on Tanimoto distance" % title_prefix)

    if out_dir is not None:
        # Save both projections as consecutive pages; each figure has its own
        # name so the first one is no longer lost before saving.
        with PdfPages('%s/%s_tani_umap_proj.pdf' % (out_dir, file_prefix)) as pdf:
            pdf.savefig(full_fig)
            pdf.savefig(main_fig)

    # Draw a cluster heatmap based on Tanimoto distance
    # NOTE(review): linkage() receives the square matrix, which SciPy treats as
    # observation vectors rather than as pairwise distances -- confirm this is intended.
    tani_linkage = linkage(tani_dist, method='complete')
    tani_df = pd.DataFrame(tani_dist, columns=compound_ids, index=compound_ids)
    g = sns.clustermap(tani_df, row_linkage=tani_linkage, col_linkage=tani_linkage, figsize=(12, 12),
                       cmap='plasma')
    if out_dir is not None:
        with PdfPages('%s/%s_tanimoto_clustermap.pdf' % (out_dir, file_prefix)) as pdf:
            pdf.savefig(g.fig)
def diversity_plots(dset_key, datastore=True, bucket='public', title_prefix=None, ecfp_radius=4, umap_file=None,
                    out_dir=None, id_col='compound_id', smiles_col='rdkit_smiles', is_base_smiles=False,
                    response_col=None, max_for_mcs=300):
    """
    Plot visualizations of diversity for an arbitrary table of compounds. At minimum, the file should contain
    columns for a compound ID and a SMILES string. Produces a clustered heatmap display of Tanimoto distances
    between compounds along with a 2D UMAP projection plot based on ECFP fingerprints, with points colored
    according to the response variable.

    Rows with duplicate SMILES strings are dropped, and compounds whose SMILES strings cannot be converted to
    a molecule are excluded from all calculations.

    Args:
        dset_key (str): Datastore key or filepath for dataset.

        datastore (bool): Whether to load dataset from datastore or from filesystem.

        bucket (str): Name of datastore bucket containing dataset.

        title_prefix (str): Prefix for plot titles.

        ecfp_radius (int): Radius for ECFP fingerprint calculation.

        umap_file (str, optional): Path to file to write UMAP coordinates to.

        out_dir (str, optional): Output directory for plots and tables. If provided, plots will be output as
            PDF files rather than in the current notebook, and some additional CSV files will be generated.

        id_col (str): Column in dataset containing compound IDs.

        smiles_col (str): Column in dataset containing SMILES strings.

        is_base_smiles (bool): True if SMILES strings do not need to be salt-stripped and standardized.

        response_col (str): Column in dataset containing response values.

        max_for_mcs (int): Maximum dataset size for plots based on MCS distance. If the number of compounds
            does not exceed this value, an additional cluster heatmap and UMAP projection plot will be
            produced based on maximum common substructure distance.

    """
    def _save_pdf(figure, suffix):
        """Write figure to <out_dir>/<file_prefix>_<suffix>.pdf; do nothing when out_dir is None."""
        if out_dir is None:
            return
        # The context manager closes the PDF even if savefig raises.
        with PdfPages('%s/%s_%s.pdf' % (out_dir, file_prefix, suffix)) as pdf:
            pdf.savefig(figure)

    def _scatter(frame, title):
        """Draw a UMAP scatter plot of frame, colored by response when one is available."""
        figure, axes = plt.subplots(figsize=(12, 12))
        if responses is None:
            sns.scatterplot(x='x', y='y', data=frame, ax=axes)
        else:
            sns.scatterplot(x='x', y='y', hue='response', palette=colorpal, data=frame, ax=axes)
        axes.set_title(title)
        return figure

    # Load table of compound names, IDs and SMILES strings
    if datastore:
        cmpd_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket)
    else:
        cmpd_df = pd.read_csv(dset_key, index_col=False)
    cmpd_df = cmpd_df.drop_duplicates(subset=smiles_col)
    file_prefix = os.path.splitext(os.path.basename(dset_key))[0]
    if title_prefix is None:
        title_prefix = file_prefix.replace('_', ' ')
    compound_ids = cmpd_df[id_col].values
    smiles_strs = cmpd_df[smiles_col].values
    ncmpds = len(smiles_strs)

    # Strip salts, canonicalize SMILES strings and create RDKit Mol objects
    if is_base_smiles:
        base_mols = np.array([Chem.MolFromSmiles(s) for s in smiles_strs])
    else:
        print("Canonicalizing %d molecules..." % ncmpds)
        base_mols = np.array([struct_utils.base_mol_from_smiles(smiles) for smiles in smiles_strs])
        for i, mol in enumerate(base_mols):
            if mol is None:
                print('Unable to get base molecule for compound %d = %s' % (i, compound_ids[i]))
        print("Done")

    # Explicit bool dtype keeps the mask usable for indexing even when the table is empty.
    has_good_smiles = np.array([mol is not None for mol in base_mols], dtype=bool)
    base_mols = base_mols[has_good_smiles]
    cmpd_df = cmpd_df[has_good_smiles]
    ncmpds = cmpd_df.shape[0]
    compound_ids = cmpd_df[id_col].values

    # Pick a color palette suited to the kind of response values present
    responses = None
    colorpal = None
    if response_col is not None:
        responses = cmpd_df[response_col].values
        uniq_responses = set(responses)
        if uniq_responses == {0, 1}:
            # Binary response
            colorpal = {0: 'forestgreen', 1: 'red'}
        elif len(uniq_responses) <= 10:
            # Small number of categories
            colorpal = sns.color_palette('husl', n_colors=len(uniq_responses))
        else:
            # Continuous response
            colorpal = sns.blend_palette(['red', 'green', 'blue'], 12, as_cmap=True)

    # Generate ECFP fingerprints
    print("Computing fingerprints...")
    fps = [AllChem.GetMorganFingerprintAsBitVect(mol, ecfp_radius, 1024) for mol in base_mols if mol is not None]
    print("Done")

    if ncmpds <= max_for_mcs:
        # Get MCS distance matrix and draw a heatmap
        print("Computing MCS distance matrix...")
        mcs_dist = dm.mcs(base_mols)
        print("Done")

        # Long-format table with one row per unordered pair of compounds
        pairs = [(i, j) for i in range(ncmpds - 1) for j in range(i + 1, ncmpds)]
        dist_df = pd.DataFrame({
            'compound_1': [compound_ids[i] for i, _ in pairs],
            'compound_2': [compound_ids[j] for _, j in pairs],
            'dist': [mcs_dist[i, j] for i, j in pairs],
            'i': [i for i, _ in pairs],
            'j': [j for _, j in pairs],
        })
        dist_df = dist_df.sort_values(by='dist')
        print(dist_df.head(10))
        if out_dir is not None:
            dist_df.to_csv('%s/%s_mcs_dist_table.csv' % (out_dir, file_prefix), index=False)
            # Draw the (up to) ten closest pairs; datasets with fewer than five
            # compounds have fewer than ten pairs.
            row_i = dist_df['i'].values
            row_j = dist_df['j'].values
            for k in range(min(10, len(dist_df))):
                img_file_i = '%s/%d_%s.png' % (out_dir, k, compound_ids[row_i[k]])
                img_file_j = '%s/%d_%s.png' % (out_dir, k, compound_ids[row_j[k]])
                Draw.MolToFile(base_mols[row_i[k]], img_file_i, size=(500, 500), fitImage=False)
                Draw.MolToFile(base_mols[row_j[k]], img_file_j, size=(500, 500), fitImage=False)

        # NOTE(review): linkage() receives the square matrix, which SciPy treats as
        # observation vectors rather than as pairwise distances -- confirm this is intended.
        mcs_linkage = linkage(mcs_dist, method='complete')
        mcs_df = pd.DataFrame(mcs_dist, columns=compound_ids, index=compound_ids)
        g = sns.clustermap(mcs_df, row_linkage=mcs_linkage, col_linkage=mcs_linkage, figsize=(12, 12),
                           cmap='plasma')
        _save_pdf(g.fig, 'mcs_clustermap')

        # Draw a UMAP projection based on MCS distance
        mapper = umap.UMAP(n_neighbors=20, min_dist=0.1, n_components=2, metric='precomputed', random_state=17)
        reps = mapper.fit_transform(mcs_dist)
        rep_df = pd.DataFrame.from_records(reps, columns=['x', 'y'])
        rep_df['compound_id'] = compound_ids
        if responses is not None:
            rep_df['response'] = responses
        fig = _scatter(rep_df, "%s, 2D projection based on MCS distance" % title_prefix)
        _save_pdf(fig, 'mcs_umap_proj')
        if out_dir is not None:
            rep_df.to_csv('%s/%s_mcs_umap_proj.csv' % (out_dir, file_prefix), index=False)

    # Get Tanimoto distance matrix
    print("Computing Tanimoto distance matrix...")
    tani_dist = dm.tanimoto(fps)
    print("Done")

    # Draw a UMAP projection based on Tanimoto distance
    mapper = umap.UMAP(n_neighbors=20, min_dist=0.1, n_components=2, metric='precomputed', random_state=17)
    reps = mapper.fit_transform(tani_dist)
    rep_df = pd.DataFrame.from_records(reps, columns=['x', 'y'])
    rep_df['compound_id'] = compound_ids
    if responses is not None:
        rep_df['response'] = responses
    if umap_file is not None:
        rep_df.to_csv(umap_file, index=False)
        print("Wrote UMAP mapping to %s" % umap_file)
    fig = _scatter(rep_df, "%s, 2D projection based on Tanimoto distance" % title_prefix)
    _save_pdf(fig, 'tani_umap_proj')

    # Draw a cluster heatmap based on Tanimoto distance
    # NOTE(review): same square-matrix caveat for linkage() as in the MCS section.
    tani_linkage = linkage(tani_dist, method='complete')
    tani_df = pd.DataFrame(tani_dist, columns=compound_ids, index=compound_ids)
    g = sns.clustermap(tani_df, row_linkage=tani_linkage, col_linkage=tani_linkage, figsize=(12, 12),
                       cmap='plasma')
    _save_pdf(g.fig, 'tanimoto_clustermap')
def calc_dist_smiles(feat_type, dist_met, smiles_arr1, smiles_arr2=None, calc_type='nearest', num_nearest=1,
                     **metric_kwargs):
    """
    Returns an array of distances between compounds given as SMILES strings, either between all pairs of
    compounds in a single dataset or between two datasets.

    Args:
        feat_type (str): How the data is to be featurized, if dist_met is not 'mcs'. The only option supported
            currently is 'ECFP'.

        dist_met (str): What distance metric to use. Current options include 'tanimoto' and 'mcs'.

        smiles_arr1 (list): First list of SMILES strings.

        smiles_arr2 (list): Optional, second list of SMILES strings. Can have only 1 member if wanting compound
            to matrix comparison.

        calc_type (str): Type of summarization to perform on rows of distance matrix. See function calc_summary
            for options.

        num_nearest (int): Additional parameter for calc_types nearest, nth_nearest and avg_n_nearest.

        metric_kwargs: Additional arguments to be passed to functions that calculate metrics. Currently unused,
            because neither supported metric takes extra options.

    Returns:
        dists: vector or array of distances, or None if the combination of feat_type and dist_met is not
        handled.

    Raises:
        ValueError: if descriptor features ('descriptors' or 'moe') are requested with a metric other than 'mcs'.

    Todo:
        Fix the function _get_descriptors(), which is broken, and re-enable the 'descriptors' option
        for feat_type. Will need to add a parameter to indicate what kind of descriptors should be computed.

        Allow other metrics for ECFP features, as in calc_dist_diskdataset().

    """
    # Distances are "within dataset" only when a single list is compared with
    # itself. This applies to both metrics; the MCS branch used to report
    # within_dset=True even for two distinct lists.
    within_dset = smiles_arr2 is None

    if feat_type in ['ECFP', 'ecfp'] and dist_met == 'tanimoto':
        mols1 = [Chem.MolFromSmiles(s) for s in smiles_arr1]
        fprints1 = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024) for mol in mols1]
        if smiles_arr2 is None:
            fprints2 = None
        elif len(smiles_arr2) == 1:
            # Single query compound: summarize a vector of distances
            cpd_mol = Chem.MolFromSmiles(smiles_arr2[0])
            cpd_fprint = AllChem.GetMorganFingerprintAsBitVect(cpd_mol, 2, 1024)
            return calc_summary(dist_metrics.tanimoto_single(cpd_fprint, fprints1)[0], calc_type, num_nearest,
                                within_dset)
        else:
            mols2 = [Chem.MolFromSmiles(s) for s in smiles_arr2]
            fprints2 = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024) for mol in mols2]
        return calc_summary(dist_metrics.tanimoto(fprints1, fprints2), calc_type, num_nearest, within_dset)

    if dist_met == 'mcs':
        mols1 = [Chem.MolFromSmiles(s) for s in smiles_arr1]
        n_atms = [mol.GetNumAtoms() for mol in mols1]
        if smiles_arr2 is None:
            mols2 = None
        elif len(smiles_arr2) == 1:
            # Single query compound: summarize a vector of distances
            cpd_mol = Chem.MolFromSmiles(smiles_arr2[0])
            return calc_summary(dist_metrics.mcs_single(cpd_mol, mols1, n_atms)[0], calc_type, num_nearest,
                                within_dset)
        else:
            mols2 = [Chem.MolFromSmiles(s) for s in smiles_arr2]
        return calc_summary(dist_metrics.mcs(mols1, mols2), calc_type, num_nearest, within_dset)

    if feat_type in ['descriptors', 'moe']:
        raise ValueError("Descriptor features are not currently supported by calc_dist_smiles().")

    # Unsupported combination: keep the historical behavior of returning None.
    return None
def calc_dist_smiles(feat_type, dist_met, smiles_arr1, smiles_arr2=None, calc_type='nearest', num_nearest=1,
                     **metric_kwargs):
    """Returns a vector of distances, either between all compounds in a single list of SMILES strings or
    between two lists.

    Args:
        feat_type: How the data should be featurized. 'ECFP' (or 'ecfp') is handled for the tanimoto metric;
            'descriptors' and 'moe' are handled through get_descriptors(). Ignored when dist_met is 'mcs'.

        dist_met: What distance metric to use. Current options include tanimoto, mcs, or, for descriptor
            features, cosine, cityblock, euclidean, or any other metric supported by
            scipy.spatial.distance.pdist().

        smiles_arr1 (list): List of SMILES strings.

        smiles_arr2 (list): Optional, second list of SMILES strings. Can have only 1 member if wanting compound
            to matrix comparison.

        calc_type: Type of calculation to use to process distance matrix. Options: nearest, farthest,
            average, or all.

        num_nearest: Additional parameter for calc_types nearest, nth_nearest and avg_n_nearest.

        metric_kwargs: Additional arguments passed to the SciPy distance functions used for descriptor
            features.

    Returns:
        dists: vector or array of distances. None is returned when descriptors cannot be computed or when
        the combination of feat_type and dist_met is not handled.

    """
    # Only a single list compared against itself counts as "within dataset".
    # The MCS branch used to report within_dset=True for two lists as well.
    single_dset = smiles_arr2 is None

    if feat_type in ['ECFP', 'ecfp'] and dist_met == 'tanimoto':
        # TODO: Handle other metrics for fingerprints, as in calc_dist_diskdataset()
        mols1 = [Chem.MolFromSmiles(s) for s in smiles_arr1]
        fprints1 = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024) for mol in mols1]
        if not single_dset and len(smiles_arr2) == 1:
            # Single query compound: summarize a vector of distances
            cpd_mol = Chem.MolFromSmiles(smiles_arr2[0])
            cpd_fprint = AllChem.GetMorganFingerprintAsBitVect(cpd_mol, 2, 1024)
            return calc_summary(dist_metrics.tanimoto_single(cpd_fprint, fprints1)[0], calc_type, num_nearest,
                                False)
        fprints2 = None
        if not single_dset:
            mols2 = [Chem.MolFromSmiles(s) for s in smiles_arr2]
            fprints2 = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024) for mol in mols2]
        return calc_summary(dist_metrics.tanimoto(fprints1, fprints2), calc_type, num_nearest, single_dset)

    elif dist_met == 'mcs':
        mols1 = [Chem.MolFromSmiles(s) for s in smiles_arr1]
        n_atms = [mol.GetNumAtoms() for mol in mols1]
        if not single_dset and len(smiles_arr2) == 1:
            # Single query compound: summarize a vector of distances
            cpd_mol = Chem.MolFromSmiles(smiles_arr2[0])
            return calc_summary(dist_metrics.mcs_single(cpd_mol, mols1, n_atms)[0], calc_type, num_nearest,
                                False)
        mols2 = None
        if not single_dset:
            mols2 = [Chem.MolFromSmiles(s) for s in smiles_arr2]
        return calc_summary(dist_metrics.mcs(mols1, mols2), calc_type, num_nearest, single_dset)

    elif feat_type in ['descriptors', 'moe']:
        feats1 = get_descriptors(smiles_arr1)
        if feats1 is None:
            return None
        if single_dset:
            return calc_summary(pdist(feats1, dist_met, **metric_kwargs), calc_type, num_nearest,
                                within_dset=True)
        feats2 = get_descriptors(smiles_arr2)
        if feats2 is None:
            return None
        # Forward the metric options to cdist as well, so one- and two-dataset
        # calls are computed with the same metric parameters.
        return calc_summary(cdist(feats1, feats2, dist_met, **metric_kwargs), calc_type, num_nearest, False)

    # Unsupported combination: keep the historical behavior of returning None.
    return None
def calc_dist_feat_array(feat_type, dist_met, feat1, feat2=None, calc_type='nearest', num_nearest=1,
                         **metric_kwargs):
    """Returns a vector or array of distances, either between all compounds in a single dataset or between two
    datasets, given the feature matrices for the dataset(s).

    Args:
        feat_type: How the data was featurized. Handled values are 'ECFP' (or 'ecfp') and 'descriptors'.

        dist_met: What distance metric to use. Current options include tanimoto, cosine, cityblock, euclidean,
            or any other metric supported by scipy.spatial.distance.pdist().

        feat1: feature matrix as a numpy array

        feat2: Optional, second feature matrix

        calc_type: Type of calculation to use to process distance matrix. Options: nearest, farthest,
            average, or all.

        num_nearest: Additional parameter for calc_types nearest, nth_nearest and avg_n_nearest.

        metric_kwargs: Additional arguments passed to the SciPy distance functions (pdist / cdist). Not used
            for the tanimoto metric.

    Returns:
        dists: vector or array of distances, or None when feat_type is not one of the handled values.

    """
    fingerprint_input = feat_type in ['ECFP', 'ecfp']
    if not fingerprint_input and feat_type != 'descriptors':
        # Unsupported feature type: keep the historical behavior of returning None.
        return None

    two_dsets = feat2 is not None

    if fingerprint_input and dist_met == 'tanimoto':
        if not two_dsets:
            dists = dist_metrics.tanimoto(feat1)
        elif feat2.shape[0] == 1:
            # Single query compound: a vector of distances
            dists = dist_metrics.tanimoto_single(feat2, feat1)[0]
        else:
            dists = dist_metrics.tanimoto(feat1, feat2)
    elif two_dsets:
        # Forward the metric options to cdist as well, so one- and two-dataset
        # calls are computed with the same metric parameters.
        dists = cdist(feat1, feat2, dist_met, **metric_kwargs)
    else:
        dists = pdist(feat1, dist_met, **metric_kwargs)

    if two_dsets:
        return calc_summary(dists, calc_type, num_nearest)
    return calc_summary(dists, calc_type, num_nearest, within_dset=True)