def main():
    """Command-line entry point: cluster the molecules of an SDF file.

    Parses the command-line options, reads every record of the input SDF,
    builds a distance matrix with ``calc_distance_matrix``, clusters it with
    ``cluster`` at the requested threshold and hands the result to
    ``write_clusters_to_sdfs`` together with the output base name and the
    gzip flag.
    """
    parser = argparse.ArgumentParser(
        description='Clustering with SuCOS and RDKit')
    parser.add_argument(
        '-i', '--input',
        help='Input file in SDF format. Can be gzipped (*.gz).')
    parser.add_argument(
        '-o', '--output', default="cluster",
        help="Base name for output files in SDF format. "
        + "e.g. if value is 'output' then files like output1.sdf, output2.sdf will be created")
    parser.add_argument(
        '--gzip', action='store_true',
        help='Gzip the outputs generating files like output1.sdf.gz, output2.sdf.gz')
    parser.add_argument('-t', '--threshold', type=float, default=0.8,
                        help='Clustering threshold')
    args = parser.parse_args()
    utils.log("SuCOS Cluster Args: ", args)

    # Materialise all molecules first so the input handle can be released
    # before the clustering step, even if reading fails part-way through.
    input_file = utils.open_file_for_reading(args.input)
    try:
        mols = list(Chem.ForwardSDMolSupplier(input_file))
    finally:
        input_file.close()

    # NOTE(review): records the supplier cannot parse presumably come back as
    # None and are passed on unchanged here -- confirm the helpers below
    # tolerate that.
    matrix = calc_distance_matrix(mols)
    clusters = cluster(matrix, threshold=args.threshold)
    write_clusters_to_sdfs(mols, clusters, args.output, gzip=args.gzip)
def process(
    refmol_filename,
    inputs_filename,
    outputs_filename,
    refmol_index=None,
    refmol_format=None,
    tani=False,
    score_mode=FeatMaps.FeatMapScoreMode.All,
):
    """Score every molecule of an SDF file against one reference molecule.

    The reference is read with ``utils.read_single_molecule`` and its raw
    features are computed once. Every readable input molecule is scored with
    ``get_SucosScore``, annotated with the scores and written to the output
    SDF. Molecules whose scoring raises ``ValueError`` are logged, counted as
    errors and left out of the output.

    Args:
        refmol_filename: File holding the reference molecule.
        inputs_filename: SDF file with the molecules to score.
        outputs_filename: SDF file the scored molecules are written to.
        refmol_index: Passed to ``utils.read_single_molecule`` as ``index``.
        refmol_format: Passed to ``utils.read_single_molecule`` as ``format``.
        tani: Forwarded to ``get_SucosScore``. Also selects the property name
            of the third score: ``SuCOS_Tanimoto_Score`` when true,
            ``SuCOS_Protrude_Score`` otherwise.
        score_mode: Forwarded to ``get_SucosScore``.
    """
    ref_mol = utils.read_single_molecule(refmol_filename,
                                         index=refmol_index,
                                         format=refmol_format)
    ref_features = getRawFeatures(ref_mol)

    # Nested try/finally so each handle is released even when opening a later
    # one, or the scoring itself, raises.
    input_file = utils.open_file_for_reading(inputs_filename)
    try:
        suppl = Chem.ForwardSDMolSupplier(input_file)
        output_file = utils.open_file_for_writing(outputs_filename)
        try:
            writer = Chem.SDWriter(output_file)
            try:
                count, total, errors = _score_molecules(
                    suppl, writer, ref_mol, ref_features, tani, score_mode)
                writer.flush()
            finally:
                writer.close()
        finally:
            output_file.close()
    finally:
        input_file.close()

    # count = records read, total = molecules scored and written.
    utils.log("Completed.", count, "processed, ", total, "succeeded, ",
              errors, "errors")


def _score_molecules(suppl, writer, ref_mol, ref_features, tani, score_mode):
    """Score each molecule from *suppl* against the reference and write it.

    Returns a ``(count, total, errors)`` tuple: records read from the
    supplier, molecules scored and written, and molecules whose scoring
    raised ``ValueError``.
    """
    count = 0
    total = 0
    errors = 0
    for mol in suppl:
        count += 1
        if mol is None:
            # Unreadable record: counted as read, but neither scored nor
            # reported as an error.
            continue
        try:
            sucos_score, fm_score, val3 = get_SucosScore(
                ref_mol,
                mol,
                tani=tani,
                ref_features=ref_features,
                score_mode=score_mode,
            )
            mol.SetDoubleProp("SuCOS_Score", sucos_score)
            mol.SetDoubleProp("SuCOS_FeatureMap_Score", fm_score)
            if tani:
                mol.SetDoubleProp("SuCOS_Tanimoto_Score", val3)
            else:
                mol.SetDoubleProp("SuCOS_Protrude_Score", val3)
            utils.log("Scores:", sucos_score, fm_score, val3)
            writer.write(mol)
            total += 1
        except ValueError as e:
            errors += 1
            # str(e) rather than e.message: the attribute does not exist on
            # Python 3 exceptions.
            utils.log("Molecule", count, "failed to score:", str(e))
    return count, total, errors
def process(inputfilename, clusterfilenames, outputfilename, mode):
    """Score input molecules against clustered hits and write the overlays.

    Every molecule of each cluster file is loaded together with its raw
    SuCOS features. Each readable input molecule is then compared with every
    cluster member using ``sucos.get_SucosScore``. Molecules whose resulting
    SuCOS score is greater than zero are annotated and written to the output
    SDF; the others are logged and omitted.

    Args:
        inputfilename: SDF file with the molecules to score.
        clusterfilenames: Iterable of SDF file names, one per cluster.
        outputfilename: SDF file the annotated molecules are written to.
        mode: ``'max'`` keeps the scores of the best-scoring cluster member
            (plus its cluster file name and 1-based index within that file);
            ``'cum'`` sums the scores over all cluster members.

    Raises:
        ValueError: If ``mode`` is neither ``'max'`` nor ``'cum'``.
    """
    # Validate before touching any file so a bad mode fails fast, including
    # when there would be nothing to compare.
    if mode not in ('max', 'cum'):
        raise ValueError("Invalid mode: " + str(mode))

    all_clusters = _load_cluster_features(clusterfilenames)

    comparisons = 0
    input_file = utils.open_file_for_reading(inputfilename)
    try:
        suppl = Chem.ForwardSDMolSupplier(input_file)
        output_file = utils.open_file_for_writing(outputfilename)
        try:
            writer = Chem.SDWriter(output_file)
            try:
                for mol_num, mol in enumerate(suppl, start=1):
                    if not mol:
                        utils.log("WARNING: failed to generate molecule", mol_num, "in input")
                        continue
                    try:
                        query_features = sucos.getRawFeatures(mol)
                    except Exception:
                        utils.log("WARNING: failed to generate features for molecule", mol_num, "in input")
                        continue

                    scores, cluster_name, cluster_index, compared = _score_against_clusters(
                        mol, query_features, all_clusters, mode)
                    comparisons += compared

                    if scores[0] > 0:
                        if mode == 'max':
                            cluster_file_name_only = cluster_name.split(os.sep)[-1]
                            mol.SetDoubleProp("Max_SuCOS_Score", scores[0])
                            mol.SetDoubleProp("Max_SuCOS_FeatureMap_Score", scores[1])
                            mol.SetDoubleProp("Max_SuCOS_Protrude_Score", scores[2])
                            mol.SetProp("Max_SuCOS_Cluster", cluster_file_name_only)
                            mol.SetIntProp("Max_SuCOS_Index", cluster_index)
                        else:
                            mol.SetDoubleProp("Cum_SuCOS_Score", scores[0])
                            mol.SetDoubleProp("Cum_SuCOS_FeatureMap_Score", scores[1])
                            mol.SetDoubleProp("Cum_SuCOS_Protrude_Score", scores[2])
                        writer.write(mol)
                    else:
                        utils.log("Molecule", mol_num, "did not overlay. Omitting from results")
                writer.flush()
            finally:
                writer.close()
        finally:
            output_file.close()
    finally:
        input_file.close()

    utils.log("Completed", comparisons, "comparisons")


def _load_cluster_features(clusterfilenames):
    """Read each cluster SDF into a list of ``(molecule, raw features)`` pairs.

    Returns a dict keyed by cluster file name. Records that cannot be read,
    or whose feature generation fails, are logged and skipped.
    """
    all_clusters = {}
    for filename in clusterfilenames:
        members = []
        cluster_file = utils.open_file_for_reading(filename)
        try:
            suppl = Chem.ForwardSDMolSupplier(cluster_file)
            for i, mol in enumerate(suppl, start=1):
                if not mol:
                    utils.log("WARNING: failed to generate molecule", i, "in cluster", filename)
                    continue
                try:
                    features = sucos.getRawFeatures(mol)
                except Exception:
                    # Best effort: one bad cluster member must not abort the
                    # run, but Ctrl-C / SystemExit still propagate.
                    utils.log("WARNING: failed to generate features for molecule", i, "in cluster", filename)
                    continue
                members.append((mol, features))
        finally:
            cluster_file.close()
        all_clusters[filename] = members
    return all_clusters


def _score_against_clusters(mol, query_features, all_clusters, mode):
    """Compare *mol* with every cluster member and aggregate the scores.

    Returns ``(scores, cluster_name, cluster_index, comparisons)`` where
    ``scores`` is ``[sucos, feature_map, protrude]``. In ``'max'`` mode the
    scores belong to the best-scoring member and ``cluster_name`` /
    ``cluster_index`` identify it (both stay ``None`` when no member scored
    above zero). In ``'cum'`` mode the scores are sums over all members.
    """
    scores = [0, 0, 0]
    cluster_name = None
    cluster_index = None
    comparisons = 0
    for clusterfilename, members in all_clusters.items():
        for index, (hit, ref_features) in enumerate(members, start=1):
            comparisons += 1
            sucos_score, fm_score, vol_score = sucos.get_SucosScore(
                hit, mol, tani=False,
                ref_features=ref_features,
                query_features=query_features)
            if mode == 'max':
                if sucos_score > scores[0]:
                    scores = [sucos_score, fm_score, vol_score]
                    cluster_name = clusterfilename
                    cluster_index = index
            else:
                scores[0] += sucos_score
                scores[1] += fm_score
                scores[2] += vol_score
    return scores, cluster_name, cluster_index, comparisons
def process(inputfilename, clusterfilenames, outputfilename, filter_value, filter_field):
    """Annotate input molecules with max and cumulative SuCOS scores.

    Every molecule of each cluster file is loaded together with its raw
    SuCOS features. Each readable input molecule is compared with every
    cluster member using ``sucos.get_SucosScore`` and receives both the
    scores of its best-scoring member (``Max_SuCOS_*``) and the sums over all
    members (``Cum_SuCOS_*``). Score values that are not greater than zero
    are stored as 0.

    Args:
        inputfilename: SDF file with the molecules to score.
        clusterfilenames: Iterable of SDF file names, one per cluster.
        outputfilename: SDF file the annotated molecules are written to.
        filter_value: Threshold for the optional filter, or ``None`` for no
            filtering. A threshold of 0 is a valid filter.
        filter_field: Name of the molecule property compared with
            ``filter_value``. When both are given, only molecules that have
            this property with a value strictly greater than the threshold
            are written; otherwise every scored molecule is written.
    """
    all_clusters = _read_cluster_features(clusterfilenames)
    # "is not None" so that a threshold of 0 / 0.0 still activates the filter.
    filtering = filter_value is not None and bool(filter_field)

    comparisons = 0
    input_file = utils.open_file_for_reading(inputfilename)
    try:
        suppl = Chem.ForwardSDMolSupplier(input_file)
        output_file = utils.open_file_for_writing(outputfilename)
        try:
            writer = Chem.SDWriter(output_file)
            try:
                for mol_num, mol in enumerate(suppl, start=1):
                    if not mol:
                        utils.log("WARNING: failed to generate molecule", mol_num, "in input")
                        continue
                    try:
                        query_features = sucos.getRawFeatures(mol)
                    except Exception:
                        utils.log("WARNING: failed to generate features for molecule", mol_num, "in input")
                        continue

                    scores_max, scores_cum, cluster_name, cluster_index, compared = (
                        _max_and_cumulative_scores(mol, query_features, all_clusters))
                    comparisons += compared

                    mol.SetDoubleProp("Max_SuCOS_Score",
                                      scores_max[0] if scores_max[0] > 0 else 0)
                    mol.SetDoubleProp("Max_SuCOS_FeatureMap_Score",
                                      scores_max[1] if scores_max[1] > 0 else 0)
                    mol.SetDoubleProp("Max_SuCOS_Protrude_Score",
                                      scores_max[2] if scores_max[2] > 0 else 0)
                    if cluster_name:
                        cluster_file_name_only = cluster_name.split(os.sep)[-1]
                        mol.SetProp("Max_SuCOS_Cluster", cluster_file_name_only)
                        mol.SetIntProp("Max_SuCOS_Index", cluster_index)
                    mol.SetDoubleProp("Cum_SuCOS_Score",
                                      scores_cum[0] if scores_cum[0] > 0 else 0)
                    mol.SetDoubleProp("Cum_SuCOS_FeatureMap_Score",
                                      scores_cum[1] if scores_cum[1] > 0 else 0)
                    mol.SetDoubleProp("Cum_SuCOS_Protrude_Score",
                                      scores_cum[2] if scores_cum[2] > 0 else 0)

                    if not filtering:
                        writer.write(mol)
                    elif mol.HasProp(filter_field):
                        # Molecules lacking the filter property are dropped.
                        if mol.GetDoubleProp(filter_field) > filter_value:
                            writer.write(mol)
                writer.flush()
            finally:
                writer.close()
        finally:
            output_file.close()
    finally:
        input_file.close()

    utils.log("Completed", comparisons, "comparisons")


def _read_cluster_features(clusterfilenames):
    """Read each cluster SDF into a list of ``(molecule, raw features)`` pairs.

    Returns a dict keyed by cluster file name. Records that cannot be read,
    or whose feature generation fails, are logged and skipped.
    """
    all_clusters = {}
    for filename in clusterfilenames:
        members = []
        cluster_file = utils.open_file_for_reading(filename)
        try:
            suppl = Chem.ForwardSDMolSupplier(cluster_file)
            for i, mol in enumerate(suppl, start=1):
                if not mol:
                    utils.log("WARNING: failed to generate molecule", i, "in cluster", filename)
                    continue
                try:
                    features = sucos.getRawFeatures(mol)
                except Exception:
                    # Best effort: one bad cluster member must not abort the
                    # run, but Ctrl-C / SystemExit still propagate.
                    utils.log("WARNING: failed to generate features for molecule", i, "in cluster", filename)
                    continue
                members.append((mol, features))
        finally:
            cluster_file.close()
        all_clusters[filename] = members
    return all_clusters


def _max_and_cumulative_scores(mol, query_features, all_clusters):
    """Compare *mol* with every cluster member; track the best and the sum.

    Returns ``(scores_max, scores_cum, cluster_name, cluster_index,
    comparisons)``. Both score lists are ``[sucos, feature_map, protrude]``.
    ``cluster_name`` and ``cluster_index`` (1-based position within the
    cluster file) identify the best-scoring member and stay ``None`` when no
    member scored above zero.
    """
    scores_max = [0, 0, 0]
    scores_cum = [0, 0, 0]
    cluster_name = None
    cluster_index = None
    comparisons = 0
    for clusterfilename, members in all_clusters.items():
        for index, (hit, ref_features) in enumerate(members, start=1):
            comparisons += 1
            sucos_score, fm_score, vol_score = sucos.get_SucosScore(
                hit, mol, tani=False,
                ref_features=ref_features,
                query_features=query_features)
            if sucos_score > scores_max[0]:
                scores_max = [sucos_score, fm_score, vol_score]
                cluster_name = clusterfilename
                cluster_index = index
            scores_cum[0] += sucos_score
            scores_cum[1] += fm_score
            scores_cum[2] += vol_score
    return scores_max, scores_cum, cluster_name, cluster_index, comparisons