def calc_distance_matrix(mols):
    """
    Build the full pairwise SuCOS distance matrix for a collection of molecules.

    Each cell holds ``1.0 - sucos_score`` for the (row, column) pair, where the row molecule is passed
    to SuCOS as the reference and the column molecule as the query. Pairs whose molecules compare
    equal (``==``) are assigned 0.0 without calling SuCOS.

    :param mols: An iterable of molecules. It is iterated once; features are computed for every molecule
                 up front and reused for all comparisons.
    :return: A NxN list of lists of distance scores, with N being the number of molecules in the input
    """

    # TODO - do we need to calculate both sides of the matrix? Tanimoto is supposed to be a symmetric
    # distance measure, but the matrix that is generated does not seem to be symmetric.

    # Pair every molecule with its raw features so they are computed only once per molecule.
    prepared = [(m, sucos.getRawFeatures(m)) for m in mols]

    def distance(ref, query):
        ref_mol, ref_feats = ref
        query_mol, query_feats = query
        if ref_mol == query_mol:
            return 0.0
        # Only the combined SuCOS score is used; the other two values are unpacked and discarded.
        score, _fm_score, _tani_score = sucos.get_SucosScore(
            ref_mol, query_mol, tani=True, ref_features=ref_feats, query_features=query_feats)
        return 1.0 - score

    return [[distance(ref, query) for query in prepared] for ref in prepared]
def _score_molecule_by_mode(mol, query_features, all_clusters, mode):
    """Score one query molecule against every cluster entry, aggregating according to ``mode``.

    :param mol: The query molecule.
    :param query_features: Raw features previously generated for ``mol``.
    :param all_clusters: Dict mapping cluster file name to a list of ``(molecule, features)`` tuples.
    :param mode: Either 'max' (keep the scores of the best SuCOS hit) or 'cum' (sum all scores).
    :return: Tuple ``(scores, cluster_name, cluster_index, comparisons)``. ``scores`` is the
             ``[sucos, feature_map, volume]`` triple; ``cluster_name`` and ``cluster_index`` (1-based
             position within the cluster) identify the best hit in 'max' mode and stay ``None``
             otherwise; ``comparisons`` is the number of SuCOS evaluations performed.
    """
    scores = [0, 0, 0]
    cluster_name = None
    cluster_index = None
    comparisons = 0
    for clusterfilename, cluster in all_clusters.items():
        for index, (hit, ref_features) in enumerate(cluster, start=1):
            comparisons += 1
            sucos_score, fm_score, vol_score = sucos.get_SucosScore(
                hit, mol, tani=False, ref_features=ref_features, query_features=query_features)
            if mode == 'max':
                if sucos_score > scores[0]:
                    scores = [sucos_score, fm_score, vol_score]
                    cluster_name = clusterfilename
                    cluster_index = index
            else:
                # The caller has already validated mode, so anything that is not 'max' is 'cum'.
                scores[0] += sucos_score
                scores[1] += fm_score
                scores[2] += vol_score
    return scores, cluster_name, cluster_index, comparisons


def process(inputfilename, clusterfilenames, outputfilename, mode):
    """Score every input molecule against the cluster molecules and write the scored molecules out.

    Cluster files are read first and the features of each cluster molecule are cached. Each input
    molecule is then compared to every cluster molecule using SuCOS. Molecules whose resulting SuCOS
    score is greater than zero are annotated and written to the output; the others are logged and
    omitted.

    In 'max' mode the properties written are Max_SuCOS_Score, Max_SuCOS_FeatureMap_Score,
    Max_SuCOS_Protrude_Score, Max_SuCOS_Cluster (base name of the cluster file holding the best hit)
    and Max_SuCOS_Index (1-based position of the best hit in that file). In 'cum' mode they are
    Cum_SuCOS_Score, Cum_SuCOS_FeatureMap_Score and Cum_SuCOS_Protrude_Score.

    :param inputfilename: Name of the SD file with the molecules to score.
    :param clusterfilenames: Iterable of SD file names holding the cluster (reference) molecules.
    :param outputfilename: Name of the SD file to write.
    :param mode: 'max' to report the best hit, 'cum' to report the sum of all scores.
    :raises ValueError: If ``mode`` is neither 'max' nor 'cum'. This is checked before any file is
                        opened so that a bad mode never leaves a partial output behind.
    """
    if mode not in ('max', 'cum'):
        raise ValueError("Invalid mode: " + str(mode))

    all_clusters = {}
    for filename in clusterfilenames:
        cluster = []
        cluster_file = utils.open_file_for_reading(filename)
        try:
            suppl = Chem.ForwardSDMolSupplier(cluster_file)
            for i, mol in enumerate(suppl, start=1):
                if not mol:
                    utils.log("WARNING: failed to generate molecule", i, "in cluster", filename)
                    continue
                try:
                    features = sucos.getRawFeatures(mol)
                except Exception:
                    # Best effort: a molecule without features is skipped, not fatal.
                    utils.log("WARNING: failed to generate features for molecule", i, "in cluster", filename)
                    continue
                cluster.append((mol, features))
        finally:
            cluster_file.close()
        all_clusters[filename] = cluster

    comparisons = 0
    input_file = utils.open_file_for_reading(inputfilename)
    try:
        suppl = Chem.ForwardSDMolSupplier(input_file)
        output_file = utils.open_file_for_writing(outputfilename)
        try:
            writer = Chem.SDWriter(output_file)
            try:
                for mol_num, mol in enumerate(suppl, start=1):
                    if not mol:
                        utils.log("WARNING: failed to generate molecule", mol_num, "in input")
                        continue
                    try:
                        query_features = sucos.getRawFeatures(mol)
                    except Exception:
                        utils.log("WARNING: failed to generate features for molecule", mol_num, "in input")
                        continue

                    scores, cluster_name, cluster_index, done = _score_molecule_by_mode(
                        mol, query_features, all_clusters, mode)
                    comparisons += done

                    # Written as "not > 0" so that a NaN score is omitted as well.
                    if not scores[0] > 0:
                        utils.log("Molecule", mol_num, "did not overlay. Omitting from results")
                        continue

                    if mode == 'max':
                        mol.SetDoubleProp("Max_SuCOS_Score", scores[0])
                        mol.SetDoubleProp("Max_SuCOS_FeatureMap_Score", scores[1])
                        mol.SetDoubleProp("Max_SuCOS_Protrude_Score", scores[2])
                        mol.SetProp("Max_SuCOS_Cluster", os.path.basename(cluster_name))
                        mol.SetIntProp("Max_SuCOS_Index", cluster_index)
                    else:
                        mol.SetDoubleProp("Cum_SuCOS_Score", scores[0])
                        mol.SetDoubleProp("Cum_SuCOS_FeatureMap_Score", scores[1])
                        mol.SetDoubleProp("Cum_SuCOS_Protrude_Score", scores[2])
                    writer.write(mol)
                writer.flush()
            finally:
                writer.close()
        finally:
            output_file.close()
    finally:
        input_file.close()

    utils.log("Completed", comparisons, "comparisons")
def _zero_floor(value):
    """Return ``value`` when it is greater than zero, otherwise 0 (this also maps NaN to 0)."""
    return value if value > 0 else 0


def _score_molecule_max_and_cum(mol, query_features, all_clusters):
    """Score one query molecule against every cluster entry, tracking both best-hit and summed scores.

    :param mol: The query molecule.
    :param query_features: Raw features previously generated for ``mol``.
    :param all_clusters: Dict mapping cluster file name to a list of ``(molecule, features)`` tuples.
    :return: Tuple ``(scores_max, scores_cum, cluster_name, cluster_index, comparisons)``. Both score
             lists are ``[sucos, feature_map, volume]``. ``cluster_name`` and ``cluster_index``
             (1-based position within the cluster) identify the best hit and are ``None`` when no
             comparison produced a SuCOS score above zero. ``comparisons`` is the number of SuCOS
             evaluations performed.
    """
    scores_max = [0, 0, 0]
    scores_cum = [0, 0, 0]
    cluster_name = None
    cluster_index = None
    comparisons = 0
    for clusterfilename, cluster in all_clusters.items():
        for index, (hit, ref_features) in enumerate(cluster, start=1):
            comparisons += 1
            sucos_score, fm_score, vol_score = sucos.get_SucosScore(
                hit, mol, tani=False, ref_features=ref_features, query_features=query_features)
            if sucos_score > scores_max[0]:
                scores_max = [sucos_score, fm_score, vol_score]
                cluster_name = clusterfilename
                cluster_index = index
            scores_cum[0] += sucos_score
            scores_cum[1] += fm_score
            scores_cum[2] += vol_score
    return scores_max, scores_cum, cluster_name, cluster_index, comparisons


def process(inputfilename, clusterfilenames, outputfilename, filter_value, filter_field):
    """Annotate every input molecule with max and cumulative SuCOS scores and write (optionally filtered) output.

    Cluster files are read first and the features of each cluster molecule are cached. Each input
    molecule is then compared to every cluster molecule using SuCOS and annotated with:

    - Max_SuCOS_Score, Max_SuCOS_FeatureMap_Score, Max_SuCOS_Protrude_Score: scores of the best hit
    - Max_SuCOS_Cluster, Max_SuCOS_Index: base name of the cluster file holding the best hit and the
      hit's 1-based position in it (only set when a best hit exists)
    - Cum_SuCOS_Score, Cum_SuCOS_FeatureMap_Score, Cum_SuCOS_Protrude_Score: sums over all comparisons

    Score values that are not greater than zero are written as 0.

    :param inputfilename: Name of the SD file with the molecules to score.
    :param clusterfilenames: Iterable of SD file names holding the cluster (reference) molecules.
    :param outputfilename: Name of the SD file to write.
    :param filter_value: Threshold for filtering, or None to disable filtering. A value of 0 is a
                         valid threshold.
    :param filter_field: Name of the molecule property compared against ``filter_value``. When
                         filtering is active, a molecule is written only if it has this property and
                         its value is strictly greater than ``filter_value``. If empty or None, every
                         scored molecule is written.
    """
    all_clusters = {}
    for filename in clusterfilenames:
        cluster = []
        cluster_file = utils.open_file_for_reading(filename)
        try:
            suppl = Chem.ForwardSDMolSupplier(cluster_file)
            for i, mol in enumerate(suppl, start=1):
                if not mol:
                    utils.log("WARNING: failed to generate molecule", i, "in cluster", filename)
                    continue
                try:
                    features = sucos.getRawFeatures(mol)
                except Exception:
                    # Best effort: a molecule without features is skipped, not fatal.
                    utils.log("WARNING: failed to generate features for molecule", i, "in cluster", filename)
                    continue
                cluster.append((mol, features))
        finally:
            cluster_file.close()
        all_clusters[filename] = cluster

    # "is not None" so that a threshold of 0 still enables filtering.
    filtering = filter_value is not None and bool(filter_field)

    comparisons = 0
    input_file = utils.open_file_for_reading(inputfilename)
    try:
        suppl = Chem.ForwardSDMolSupplier(input_file)
        output_file = utils.open_file_for_writing(outputfilename)
        try:
            writer = Chem.SDWriter(output_file)
            try:
                for mol_num, mol in enumerate(suppl, start=1):
                    if not mol:
                        utils.log("WARNING: failed to generate molecule", mol_num, "in input")
                        continue
                    try:
                        query_features = sucos.getRawFeatures(mol)
                    except Exception:
                        utils.log("WARNING: failed to generate features for molecule", mol_num, "in input")
                        continue

                    scores_max, scores_cum, cluster_name, cluster_index, done = \
                        _score_molecule_max_and_cum(mol, query_features, all_clusters)
                    comparisons += done

                    mol.SetDoubleProp("Max_SuCOS_Score", _zero_floor(scores_max[0]))
                    mol.SetDoubleProp("Max_SuCOS_FeatureMap_Score", _zero_floor(scores_max[1]))
                    mol.SetDoubleProp("Max_SuCOS_Protrude_Score", _zero_floor(scores_max[2]))
                    if cluster_name:
                        mol.SetProp("Max_SuCOS_Cluster", os.path.basename(cluster_name))
                        mol.SetIntProp("Max_SuCOS_Index", cluster_index)

                    mol.SetDoubleProp("Cum_SuCOS_Score", _zero_floor(scores_cum[0]))
                    mol.SetDoubleProp("Cum_SuCOS_FeatureMap_Score", _zero_floor(scores_cum[1]))
                    mol.SetDoubleProp("Cum_SuCOS_Protrude_Score", _zero_floor(scores_cum[2]))

                    if not filtering:
                        writer.write(mol)
                    elif mol.HasProp(filter_field) and mol.GetDoubleProp(filter_field) > filter_value:
                        # Molecules lacking the filter field are dropped when filtering is active.
                        writer.write(mol)
                writer.flush()
            finally:
                writer.close()
        finally:
            output_file.close()
    finally:
        input_file.close()

    utils.log("Completed", comparisons, "comparisons")