def main():
    """Command-line entry point: annotate molecules with plane-of-best-fit data.

    Reads molecules using the default I/O arguments, embeds each one with
    ``AllChem.EmbedMolecule`` and then stores the value returned by ``PBFRD``
    as the ``distance`` property and every element of the ``PBFev`` result as
    an ``angle_<n>`` property. Entries that are ``None`` and molecules for
    which ``PBFev`` returns ``None`` are skipped. All annotated molecules are
    handed to ``write_out`` in one call once the input is exhausted.
    """
    ### command line args definitions #########################################
    parser = argparse.ArgumentParser(
        description='Calculate plane of best fit for molecules')
    utils.add_default_io_args(parser)
    args = parser.parse_args()
    utils.log("PBFEV args: ", args)

    (input_handle, output_handle, suppl, writer,
     output_base) = utils.default_open_input_output(
         args.input, args.informat, args.output, 'PBFEV', args.outformat)

    i = 0
    count = 0
    out_results = []
    for mol in suppl:
        i += 1
        # Guard first: embedding a None entry would raise before the
        # original None check was ever reached.
        if mol is None:
            continue
        AllChem.EmbedMolecule(mol)
        out_vector = PBFev(mol)
        if out_vector is None:
            continue
        rd = PBFRD(mol)
        mol.SetDoubleProp("distance", rd)
        for j, angle in enumerate(out_vector):
            mol.SetDoubleProp("angle" + "_" + str(j), angle)
        out_results.append(mol)

    count = write_out(out_results, count, writer, args.outformat)
    utils.log("Handled " + str(i) + " molecules, resulting in " + str(count)
              + " outputs")

    writer.flush()
    writer.close()
    input_handle.close()
    output_handle.close()
def main():
    """Command-line entry point: convert an SD file to JSON.

    Molecules are read as SDF and written as JSON, optionally after clearing
    the properties named in ``--exclude``. The default output base name is the
    input file name with a trailing ``.sdf`` or ``.sdf.gz`` removed, falling
    back to ``"json"`` when no input file is given or it has another
    extension.

    Returns:
        The number of molecules written.
    """
    ### command line args definitions #########################################
    parser = argparse.ArgumentParser(description='RDKit Sdf2Json')
    parser.add_argument(
        '-i', '--input',
        help="Input SD file, if not defined the STDIN is used")
    parser.add_argument(
        '-o', '--output',
        help="Base name for output json file (no extension). If not defined then SDTOUT is used for the structures and output is used as base name of the other files.")
    parser.add_argument(
        '--exclude',
        help="Optional list of fields (comma separated) to exclude from the output.")

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    # Start from the fallback so that `base` is always bound, whatever the
    # input name looks like (previously some paths left it undefined).
    base = "json"
    if args.input:
        lowered = args.input.lower()
        if lowered.endswith(".sdf"):
            base = args.input[:-4]
        elif lowered.endswith(".sdf.gz"):
            base = args.input[:-7]
    utils.log("Base:", base)

    (input_handle, output_handle, suppl, writer,
     output_base) = utils.default_open_input_output(
         args.input, "sdf", args.output, base, "json")

    if args.exclude:
        excludes = args.exclude.split(",")
        utils.log("Excluding", excludes)
    else:
        excludes = None

    i = 0
    count = 0
    for mol in suppl:
        i += 1
        if mol is None:
            continue
        if excludes:
            for exclude in excludes:
                if mol.HasProp(exclude):
                    mol.ClearProp(exclude)
        writer.write(mol)
        count += 1

    utils.log("Converted", count, " molecules")

    writer.flush()
    writer.close()
    input_handle.close()
    output_handle.close()

    utils.write_metrics(output_base, {
        '__InputCount__': i,
        '__OutputCount__': count,
        'RDKitSdf2Json': count
    })

    return count
def main():
    """Command-line entry point: standardise molecules or enumerate variants.

    With ``--standardize`` each input molecule is passed through the selected
    ``STANDARD_MOL_METHODS`` function and written with a ``Standardised``
    property of ``"True"``, ``"False"`` (canonical SMILES unchanged) or
    ``"Error"`` (standardisation raised; the original molecule is written).
    Otherwise the molecule is optionally expanded into tautomers and/or
    stereoisomers, and every result is tagged with ``SourceMolNum`` and, when
    the parent had one, ``SourceMolUUID``.

    Returns:
        The running output count returned by ``write_out``.

    Raises:
        ValueError: if ``--standardize`` is combined with either enumeration
            option.
    """
    ### command line args definitions #########################################
    parser = argparse.ArgumentParser(
        description='RDKit molecule standardiser / enumerator')
    utils.add_default_io_args(parser)
    parser.add_argument('-et', '--enumerate_tauts', action='store_true',
                        help='Enumerate all tautomers')
    parser.add_argument('-es', '--enumerate_stereo', action='store_true',
                        help='Enumerate all stereoisomers')
    parser.add_argument(
        '-st', '--standardize', action='store_true',
        help='Standardize molecules. Cannot be true if enumerate is on.')
    parser.add_argument('-stm', '--standardize_method', default="molvs",
                        choices=STANDARD_MOL_METHODS.keys(),
                        help="Chose the method to standardize.")

    args = parser.parse_args()

    if args.standardize and args.enumerate_tauts:
        raise ValueError("Cannot Enumerate Tautomers and Standardise")
    if args.standardize and args.enumerate_stereo:
        raise ValueError("Cannot Enumerate Stereo and Standardise")

    if args.standardize:
        getStandardMolecule = STANDARD_MOL_METHODS[args.standardize_method]

    (input_handle, output_handle, suppl, writer,
     output_base) = utils.default_open_input_output(
         args.input, args.informat, args.output, 'sanify', args.outformat)

    i = 0
    count = 0
    errors = 0
    for mol in suppl:
        i += 1
        if mol is None:
            continue

        # The uuid is optional: look it up defensively so that molecules
        # without one are still processed (the code below already treats a
        # falsy uuid as "absent").
        uuid = mol.GetProp("uuid") if mol.HasProp("uuid") else None

        if args.standardize:
            # we keep the original UUID as there is still a 1-to-1
            # relationship between the input and outputs
            inputCanSmiles = Chem.MolToSmiles(mol, isomericSmiles=True,
                                              canonical=True)
            try:
                std = getStandardMolecule(mol)
                outputCanSmiles = Chem.MolToSmiles(std, isomericSmiles=True,
                                                   canonical=True)
                if uuid:
                    std.SetProp("uuid", uuid)
                if inputCanSmiles == outputCanSmiles:
                    std.SetProp("Standardised", "False")
                else:
                    std.SetProp("Standardised", "True")
            except Exception:
                # Best effort: record the failure and pass the input through
                # unchanged. Only Exception is caught so that Ctrl-C and
                # interpreter exit still propagate.
                errors += 1
                utils.log("Error standardizing", sys.exc_info()[0])
                std = mol
                std.SetProp("Standardised", "Error")
            count = write_out([std], count, writer)
        else:
            # we want a new UUID generating as we are generating new molecules
            results = [mol]
            if args.enumerate_tauts:
                utils.log("Enumerating tautomers")
                results = enumerateTautomers(mol)
            if args.enumerate_stereo:
                utils.log("Enumerating steroisomers")
                mols = results
                results = []
                for m in mols:
                    results.extend(enumerateStereoIsomers(m))

            for m in results:
                m.ClearProp("uuid")
                m.SetIntProp("SourceMolNum", i)
                if uuid:
                    m.SetProp("SourceMolUUID", uuid)

            count = write_out(results, count, writer)

    utils.log("Handled " + str(i) + " molecules, resulting in " + str(count)
              + " outputs")

    writer.flush()
    writer.close()
    input_handle.close()
    output_handle.close()

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': i,
                '__OutputCount__': count,
                '__ErrorCount__': errors,
                'RDKitSanify': count
            })

    return count
def main():
    """Command-line entry point: Open3DAlign of input molecules onto a query.

    The query molecule is read with ``utils.read_single_molecule`` and aligned
    to a copy of itself to obtain the reference ("perfect") score. Each input
    molecule is then aligned through ``doO3Dalign``; when ``--num`` is greater
    than zero its existing conformers are discarded and new ones are generated
    with ``conformers.process_mol_conformers`` first, and the energy fields
    are added to the output metadata.
    """
    parser = argparse.ArgumentParser(description='Open3DAlign with RDKit')
    parser.add_argument('query', help='query molfile')
    parser.add_argument(
        '--qmolidx',
        help="Query molecule index in SD file if not the first",
        type=int, default=1)
    parser.add_argument(
        '-t', '--threshold', type=float,
        help='score cuttoff relative to alignment of query to itself')
    parser.add_argument(
        '-n', '--num', default=0, type=int,
        help='number of conformers to generate, if None then input structures are assumed to already be 3D')
    parser.add_argument('-a', '--attempts', default=0, type=int,
                        help='number of attempts to generate conformers')
    parser.add_argument('-r', '--rmsd', type=float, default=1.0,
                        help='prune RMSD threshold for excluding conformers')
    parser.add_argument(
        '-e', '--emin', type=int, default=0,
        help='energy minimisation iterations for generated confomers (default of 0 means none)')
    utils.add_default_io_args(parser)

    args = parser.parse_args()
    utils.log("o3dAlign Args: ", args)

    qmol = utils.read_single_molecule(args.query, index=args.qmolidx)
    qmol = Chem.RemoveHs(qmol)
    qmol2 = Chem.Mol(qmol)

    source = "conformers.py"
    datasetMetaProps = {
        "source": source,
        "description": "Open3DAlign using RDKit " + rdBase.rdkitVersion
    }
    clsMappings = {"O3DAScore": "java.lang.Float"}
    fieldMetaProps = [{
        "fieldName": "O3DAScore",
        "values": {
            "source": source,
            "description": "Open3DAlign alignment score"
        }
    }]
    if args.num > 0:
        # we generate the conformers so will add energy info
        clsMappings["EnergyDelta"] = "java.lang.Float"
        clsMappings["EnergyAbs"] = "java.lang.Float"
        fieldMetaProps.append({
            "fieldName": "EnergyDelta",
            "values": {
                "source": source,
                "description": "Energy difference to lowest energy conformer"
            }
        })
        fieldMetaProps.append({
            "fieldName": "EnergyAbs",
            "values": {
                "source": source,
                "description": "Absolute energy"
            }
        })

    (input_handle, output_handle, suppl, writer,
     output_base) = utils.default_open_input_output(
         args.input, args.informat, args.output, 'o3dAlign', args.outformat,
         valueClassMappings=clsMappings,
         datasetMetaProps=datasetMetaProps,
         fieldMetaProps=fieldMetaProps)

    pyO3A = rdMolAlign.GetO3A(qmol2, qmol)
    perfect_align = pyO3A.Align()
    perfect_score = pyO3A.Score()
    utils.log('Perfect score:', perfect_align, perfect_score,
              Chem.MolToSmiles(qmol, isomericSmiles=True), qmol.GetNumAtoms())

    i = 0
    count = 0
    total = 0
    for mol in suppl:
        if mol is None:
            continue
        if args.num > 0:
            mol.RemoveAllConformers()
            # Honour --emin: the option was parsed and documented but a
            # literal 0 used to be passed here, so it had no effect. The
            # default of 0 keeps the previous behaviour.
            conformerProps, minEnergy = conformers.process_mol_conformers(
                mol, i, args.num, args.attempts, args.rmsd, None, None,
                args.emin)
            mol = Chem.RemoveHs(mol)
            count += doO3Dalign(i, mol, qmol, args.threshold, perfect_score,
                                writer, conformerProps=conformerProps,
                                minEnergy=minEnergy)
        else:
            mol = Chem.RemoveHs(mol)
            count += doO3Dalign(i, mol, qmol, args.threshold, perfect_score,
                                writer)
        i += 1
        total += mol.GetNumConformers()

    input_handle.close()
    writer.flush()
    writer.close()
    output_handle.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitO3DAlign': total
        })
def main():
    """Command-line entry point: generate (and optionally cluster) conformers.

    Molecules come either from ``--smiles`` (a single structure) or from the
    default input arguments. Each one has hydrogens added, is processed by
    ``process_mol_conformers``, has its hydrogens removed again and is written
    with ``write_conformers``. When ``--threshold`` is not given it defaults
    to 0.3 for TFD clustering and 2.0 otherwise.
    """
    ### command line args definitions #########################################
    parser = argparse.ArgumentParser(description='RDKit conformers')
    parser.add_argument('-n', '--num', type=int, default=1,
                        help='number of conformers to generate')
    parser.add_argument('-a', '--attempts', type=int, default=0,
                        help='number of attempts')
    parser.add_argument('-r', '--rmsd', type=float, default=1.0,
                        help='prune RMSD threshold')
    # 'tdf' was a typo for 'tfd' that made TFD clustering (and its 0.3 default
    # threshold below) unreachable. 'tfd' is now accepted; the misspelling is
    # kept as an alias so existing invocations keep parsing.
    parser.add_argument(
        '-c', '--cluster', type=str.lower, choices=['rmsd', 'tfd', 'tdf'],
        help='Cluster method (RMSD or TFD). If None then no clustering')
    parser.add_argument(
        '-t', '--threshold', type=float,
        help='cluster threshold (default of 2.0 for RMSD and 0.3 for TFD)')
    parser.add_argument(
        '-e', '--emin', type=int, default=0,
        help='energy minimisation iterations (default of 0 means none)')
    utils.add_default_io_args(parser)
    parser.add_argument(
        '--smiles',
        help='input structure as smiles (incompatible with using files or stdin for input)')

    args = parser.parse_args()

    if args.cluster == 'tdf':
        # Normalise the legacy misspelling to the canonical value.
        args.cluster = 'tfd'

    if not args.threshold:
        if args.cluster == 'tfd':
            args.threshold = 0.3
        else:
            args.threshold = 2.0

    utils.log("Conformers Args: ", args)

    source = "conformers.py"
    datasetMetaProps = {
        "source": source,
        "description": "Conformer generation using RDKit " + rdBase.rdkitVersion
    }
    clsMappings = {
        "RMSToCentroid": "java.lang.Float",
        "EnergyDelta": "java.lang.Float",
        "EnergyAbs": "java.lang.Float",
        "ConformerNum": "java.lang.Integer",
        "ClusterCentroid": "java.lang.Integer",
        "ClusterNum": "java.lang.Integer",
        "StructureNum": "java.lang.Integer"
    }
    fieldMetaProps = [{
        "fieldName": "RMSToCentroid",
        "values": {
            "source": source,
            "description": "RMS distance to the cluster centroid"
        }
    }, {
        "fieldName": "EnergyDelta",
        "values": {
            "source": source,
            "description": "Energy difference to lowest energy structure"
        }
    }, {
        "fieldName": "EnergyAbs",
        "values": {
            "source": source,
            "description": "Absolute energy"
        }
    }, {
        "fieldName": "ConformerNum",
        "values": {
            "source": source,
            "description": "Conformer number"
        }
    }, {
        "fieldName": "ClusterCentroid",
        "values": {
            "source": source,
            "description": "Conformer number of the cluster centroid"
        }
    }, {
        "fieldName": "ClusterNum",
        "values": {
            "source": source,
            "description": "Cluster number"
        }
    }, {
        "fieldName": "StructureNum",
        "values": {
            "source": source,
            "description": "Structure number this conformer was generated from"
        }
    }]

    if args.smiles:
        mol = Chem.MolFromSmiles(args.smiles)
        suppl = [mol]
        input_handle = None
        output_handle, writer, output_base = utils.default_open_output(
            args.output, 'conformers', args.outformat,
            valueClassMappings=clsMappings,
            datasetMetaProps=datasetMetaProps,
            fieldMetaProps=fieldMetaProps)
    else:
        (input_handle, output_handle, suppl, writer,
         output_base) = utils.default_open_input_output(
             args.input, args.informat, args.output, 'conformers',
             args.outformat,
             valueClassMappings=clsMappings,
             datasetMetaProps=datasetMetaProps,
             fieldMetaProps=fieldMetaProps)

    i = 0
    count = 0
    for mol in suppl:
        if mol is None:
            continue
        m = Chem.AddHs(mol)
        conformerPropsDict, minEnergy = process_mol_conformers(
            m, i, args.num, args.attempts, args.rmsd, args.cluster,
            args.threshold, args.emin)
        m = Chem.RemoveHs(m)
        write_conformers(m, i, conformerPropsDict, minEnergy, writer)
        count = count + m.GetNumConformers()
        i += 1

    # There is no input handle to close when the molecule came from --smiles.
    if input_handle:
        input_handle.close()
    writer.flush()
    writer.close()
    output_handle.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitConformer': count
        })
def main():
    """Command-line entry point: Butina clustering with optional diverse subset.

    Input molecules are fragmented and fingerprinted with
    ``mol_utils.fragmentAndFingerprint``, clustered with ``ClusterFps`` using
    ``1.0 - threshold`` as the cutoff and, when ``--num`` is given, reduced to
    a diverse subset with ``SelectDiverseSubset``. Every molecule whose index
    appears in the resulting cluster map is written with its cluster number
    in the ``field_Cluster`` property.

    Raises:
        ValueError: if the descriptor lookup yields ``None``, or if
            ``--field`` is given without ``--num`` or without one of
            ``--min``/``--max``.
    """
    ### command line args definitions #########################################
    parser = argparse.ArgumentParser(description='RDKit Butina Cluster')
    parser.add_argument(
        '-t', '--threshold', type=float, default=0.7,
        help='similarity clustering threshold (1.0 means identical)')
    parser.add_argument('-d', '--descriptor', type=str.lower,
                        choices=list(descriptors.keys()), default='rdkit',
                        help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m', '--metric', type=str.lower,
                        choices=list(metrics.keys()), default='tanimoto',
                        help='similarity metric (default tanimoto)')
    parser.add_argument(
        '-n', '--num', type=int,
        help='maximum number to pick for diverse subset selection')
    parser.add_argument(
        '-e', '--exclude', type=float, default=0.9,
        help='threshold for excluding structures in diverse subset selection (1.0 means identical)')
    parser.add_argument(
        '--fragment-method', choices=['hac', 'mw'], default='hac',
        help='Approach to find biggest fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)')
    parser.add_argument(
        '--output-fragment', action='store_true',
        help='Output the biggest fragment rather than the original molecule')
    parser.add_argument(
        '-f', '--field',
        help='field to use to optimise diverse subset selection')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--min', action='store_true',
        help='pick lowest value specified by the --field option')
    group.add_argument(
        '--max', action='store_true',
        help='pick highest value specified by the --field option')
    utils.add_default_io_args(parser)
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='Quiet mode')
    parser.add_argument('--thin', action='store_true',
                        help='Thin output mode')

    args = parser.parse_args()
    utils.log("Cluster Args: ", args)

    descriptor = descriptors[args.descriptor]
    if descriptor is None:
        raise ValueError('Invalid descriptor name ' + args.descriptor)

    if args.field and not args.num:
        raise ValueError(
            '--num argument must be specified for diverse subset selection')
    if args.field and not (args.min or args.max):
        raise ValueError(
            '--min or --max argument must be specified for diverse subset selection')

    # handle metadata
    source = "cluster_butina.py"
    datasetMetaProps = {
        "source": source,
        "description": "Butina clustering using RDKit " + rdBase.rdkitVersion
    }
    clsMappings = {"Cluster": "java.lang.Integer"}
    fieldMetaProps = [{
        "fieldName": "Cluster",
        "values": {
            "source": source,
            "description": "Cluster number"
        }
    }]

    (input_handle, output_handle, suppl, writer,
     output_base) = utils.default_open_input_output(
         args.input, args.informat, args.output, 'cluster_butina',
         args.outformat,
         thinOutput=args.thin,
         valueClassMappings=clsMappings,
         datasetMetaProps=datasetMetaProps,
         fieldMetaProps=fieldMetaProps)

    ### generate fingerprints
    mols = []
    fps = []
    mol_utils.fragmentAndFingerprint(
        suppl, mols, fps, descriptor,
        fragmentMethod=args.fragment_method,
        outputFragment=args.output_fragment,
        quiet=args.quiet)
    input_handle.close()

    ### do clustering
    utils.log("Clustering with descriptor", args.descriptor, "metric",
              args.metric, "and threshold", args.threshold)
    clusters, dists, matrix = ClusterFps(fps, args.metric,
                                         1.0 - args.threshold)
    utils.log("Found", len(clusters), "clusters")

    ### generate diverse subset if specified
    if args.num:
        utils.log("Generating diverse subset")
        # diverse subset selection is specified
        finalClusters = SelectDiverseSubset(mols, clusters, dists, args.num,
                                            args.field, args.max,
                                            args.exclude, args.quiet)
    else:
        finalClusters = clusters

    utils.log("Found", len(finalClusters), "clusters")
    lookup = ClustersToMap(finalClusters)

    if not args.quiet:
        utils.log("Final Clusters:", finalClusters)

    ### write the results
    i = 0
    result_count = 0
    for mol in mols:
        # `in` replaces dict.has_key(), which no longer exists on Python 3.
        if i in lookup:
            if args.thin:
                utils.clear_mol_props(mol, ["uuid"])
            cluster = lookup[i]
            mol.SetIntProp(field_Cluster, cluster)
            writer.write(mol)
            result_count += 1
        i += 1

    writer.flush()
    writer.close()
    output_handle.close()

    if args.meta:
        status_str = str(result_count) + ' results from ' + str(
            len(finalClusters)) + ' clusters'
        utils.write_metrics(
            output_base, {
                '__StatusMessage__': status_str,
                '__InputCount__': i,
                '__OutputCount__': result_count,
                'RDKitCluster': i
            })
def main():
    """Command-line entry point: similarity screen against a single query.

    The query comes from ``--qsmiles`` or ``--qmolfile``. Each input molecule
    is optionally reduced to one fragment, filtered by heavy-atom count and
    molecular weight via ``filter.filter`` and compared with the query using
    the chosen descriptor and metric. Molecules whose similarity lies within
    ``[simmin, simmax]`` are written with the score stored in the
    ``field_Similarity`` property.

    Returns:
        The number of molecules written.

    Raises:
        ValueError: if no query is supplied or the query cannot be parsed.
    """
    ### command line args definitions #########################################
    parser = argparse.ArgumentParser(description='RDKit screen')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--qsmiles',
        help='query structure as smiles (incompatible with -qmolfile arg)')
    group.add_argument(
        '--qmolfile',
        help='query structure as filename in molfile format (incompatible with -qsmiles arg)')
    parser.add_argument('--simmin', type=float, default=0.7,
                        help='similarity lower cutoff (1.0 means identical)')
    parser.add_argument('--simmax', type=float, default=1.0,
                        help='similarity upper cutoff (1.0 means identical)')
    parser.add_argument('-d', '--descriptor', type=str.lower,
                        choices=list(descriptors.keys()), default='rdkit',
                        help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m', '--metric', type=str.lower,
                        choices=list(metrics.keys()), default='tanimoto',
                        help='similarity metric (default tanimoto)')
    parser.add_argument(
        '-f', '--fragment', choices=['hac', 'mw'],
        help='Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight )')
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    utils.add_default_io_args(parser)
    parser.add_argument('--thin', action='store_true',
                        help='Thin output mode')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='Quiet mode')

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    descriptor = descriptors[args.descriptor.lower()]
    metric = metrics[args.metric.lower()]

    if args.qsmiles:
        query_rdkitmol = Chem.MolFromSmiles(args.qsmiles)
    elif args.qmolfile:
        query_rdkitmol = Chem.MolFromMolFile(args.qmolfile)
    else:
        raise ValueError('No query structure specified')

    # RDKit parsers return None for input they cannot parse; fail here with a
    # clear message rather than inside the descriptor function.
    if query_rdkitmol is None:
        raise ValueError('Could not parse the query structure')

    query_fp = descriptor(query_rdkitmol)

    (input_handle, output_handle, suppl, writer,
     output_base) = utils.default_open_input_output(
         args.input, args.informat, args.output, 'screen', args.outformat,
         thinOutput=args.thin)

    i = 0
    count = 0
    for mol in suppl:
        i += 1
        if mol is None:
            continue
        if args.fragment:
            mol = filter.fragment(mol, args.fragment, quiet=args.quiet)
        if not filter.filter(mol, minHac=args.hacmin, maxHac=args.hacmax,
                             minMw=args.mwmin, maxMw=args.mwmax,
                             quiet=args.quiet):
            continue
        target_fp = descriptor(mol)
        sim = metric(query_fp, target_fp)

        if args.simmin <= sim <= args.simmax:
            count += 1
            if not args.quiet:
                utils.log(i, sim)
            mol.SetDoubleProp(field_Similarity, sim)
            writer.write(mol)

    utils.log("Found", count, "similar molecules")

    writer.flush()
    writer.close()
    input_handle.close()
    output_handle.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitScreen': i
        })

    return count
def main():
    """Command-line entry point: similarity screen against multiple queries.

    Query structures are read from a SMILES, SDF or JSON file and
    fingerprinted once. Each input molecule is optionally fragmented,
    filtered via ``filter.filter`` and compared with every query. For each
    query whose similarity lies within ``[simmin, simmax]`` the score is
    stored on the molecule (keyed by the query's ``--qprop`` value when it
    has one, otherwise by the query's 1-based position). Molecules with at
    least one hit also receive best-score, best-name or best-index, and
    hit-count properties and are written out.

    Returns:
        The number of molecules written.

    Raises:
        ValueError: if none of ``--qsmiles``, ``--qsdf`` or ``--qjson`` is
            given.
    """
    ### command line args definitions #########################################
    parser = argparse.ArgumentParser(description='RDKit screen')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--qsmiles',
        help='filename of query structures as smiles (incompatible with --sdf and --qjson args)')
    group.add_argument(
        '--qsdf',
        help='filename of query structures as sdfile (incompatible with --smiles and --qjson args)')
    group.add_argument(
        '--qjson',
        help='filename of query structures as MoleculeObject JSON (incompatible with --qsmiles and --qsdf args)')
    parser.add_argument('--qsmilesTitleLine', action='store_true',
                        help='the smiles file has a title line')
    parser.add_argument('--qsmilesDelimiter', default='\t',
                        help='delimiter for smiles file (default is tab)')
    parser.add_argument(
        '--qsmilesColumn', type=int, default=0,
        help='column in smiles file with the smiles (default is first column)')
    parser.add_argument(
        '--qsmilesNameColumn', type=int, default=1,
        help='column in smiles file with ID (default is second column)')
    parser.add_argument(
        '--qprop',
        help='property name in query molecules to report. If not defined (or property is not present) '
        + 'then name property is not written. JSON format uses the UUID as default')
    parser.add_argument('--simmin', type=float, default=0.7,
                        help='similarity lower cutoff (1.0 means identical)')
    parser.add_argument('--simmax', type=float, default=1.0,
                        help='similarity upper cutoff (1.0 means identical)')
    parser.add_argument('-d', '--descriptor', type=str.lower,
                        choices=list(descriptors.keys()), default='rdkit',
                        help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m', '--metric', type=str.lower,
                        choices=list(metrics.keys()), default='tanimoto',
                        help='similarity metric (default tanimoto)')
    parser.add_argument(
        '-f', '--fragment', choices=['hac', 'mw'],
        help='Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight )')
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    utils.add_default_io_args(parser)
    parser.add_argument('--thin', action='store_true',
                        help='Thin output mode')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='Quiet mode')

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    descriptor = descriptors[args.descriptor.lower()]
    metric = metrics[args.metric.lower()]

    propName = args.qprop
    if args.qsmiles:
        queryMolsupplier = utils.default_open_input_smiles(
            args.qsmiles,
            delimiter=args.qsmilesDelimiter,
            smilesColumn=args.qsmilesColumn,
            nameColumn=args.qsmilesNameColumn,
            titleLine=args.qsmilesTitleLine)
        queryInput = None
    elif args.qsdf:
        queryInput, queryMolsupplier = utils.default_open_input_sdf(args.qsdf)
    elif args.qjson:
        queryInput, queryMolsupplier = utils.default_open_input_json(
            args.qjson, lazy=False)
        if not propName:
            propName = "uuid"
    else:
        raise ValueError('No query structure specified')

    # Fingerprint every query once and resolve its display name up front so
    # the per-target loop does no repeated property lookups. A query that
    # lacks the property simply has no name, as the --qprop help promises.
    queries = []
    utils.log("Preparing query fingerprints")
    count = 0
    for q in queryMolsupplier:
        count += 1
        if q:
            if propName and q.HasProp(propName):
                name = str(q.GetProp(propName))
            else:
                name = None
            queries.append((descriptor(q), name))
        else:
            utils.log("WARNING: Failed to parse Molecule", count)
    if queryInput:
        queryInput.close()

    (input_handle, output_handle, suppl, writer,
     output_base) = utils.default_open_input_output(
         args.input, args.informat, args.output, 'screen_multi',
         args.outformat)

    i = 0
    count = 0
    for mol in suppl:
        i += 1
        if mol is None:
            continue
        if args.fragment:
            mol = filter.fragment(mol, args.fragment, quiet=args.quiet)
        if not filter.filter(mol, minHac=args.hacmin, maxHac=args.hacmax,
                             minMw=args.mwmin, maxMw=args.mwmax,
                             quiet=args.quiet):
            continue
        targetFp = descriptor(mol)

        hits = 0
        # None (rather than 0) marks "no hit yet", so the first hit always
        # sets the best index, even when its score is exactly 0.
        bestScore = None
        bestIdx = None
        bestName = None
        for idx, (queryFp, name) in enumerate(queries, start=1):
            sim = metric(queryFp, targetFp)
            if args.simmin <= sim <= args.simmax:
                hits += 1
                if not args.quiet:
                    utils.log(i, idx, sim)
                if bestScore is None or sim > bestScore:
                    bestScore = sim
                    bestIdx = idx
                    # Track the name of the best hit itself, not of an
                    # earlier, lower-scoring hit.
                    bestName = name
                if name:
                    mol.SetDoubleProp(field_Similarity + "_" + name, sim)
                else:
                    mol.SetDoubleProp(
                        field_Similarity + "_" + str(idx) + "_Score", sim)

        if hits > 0:
            count += 1
            mol.SetDoubleProp(field_Similarity + "_BestScore", bestScore)
            if bestName:
                mol.SetProp(field_Similarity + "_BestName", bestName)
            else:
                mol.SetIntProp(field_Similarity + "_BestIndex", bestIdx)
            mol.SetIntProp(field_Similarity + "_Count", hits)
            writer.write(mol)

    utils.log("Found", count, "similar molecules")

    writer.flush()
    writer.close()
    input_handle.close()
    output_handle.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitScreen': count
        })

    return count
def main():
    """Command-line entry point: MaxMin diversity picking.

    Optional seed molecules and then the candidate molecules are fingerprinted
    with ``mol_utils.fragmentAndFingerprint`` into one shared fingerprint
    list, seeds first. ``performPick`` is asked for the seeds plus the
    requested number of candidates, and the picked candidates are written in
    input order with a 1-based ``PickIndex`` property recording the order in
    which they were picked.

    Raises:
        ValueError: if the descriptor lookup yields ``None`` or neither
            ``--num`` nor ``--threshold`` is given.
    """
    ### command line args definitions #########################################
    cli = argparse.ArgumentParser(description='RDKit Butina Cluster')
    cli.add_argument('-t', '--threshold', type=float, default=0.0,
                     help='similarity threshold (1.0 means identical)')
    cli.add_argument('-d', '--descriptor', type=str.lower,
                     choices=list(descriptors.keys()), default='morgan2',
                     help='descriptor or fingerprint type (default rdkit)')
    cli.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')
    cli.add_argument('-n', '--num', type=int,
                     help='maximum number to pick for diverse subset selection')
    cli.add_argument('-s', '--seed-molecules',
                     help='optional file containing any seed molecules that have already been picked')
    cli.add_argument('--fragment-method', choices=['hac', 'mw'], default='hac',
                     help='Approach to find biggest fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)')
    cli.add_argument('--output-fragment', action='store_true',
                     help='Output the biggest fragment rather than the original molecule')
    utils.add_default_io_args(cli)

    args = cli.parse_args()
    utils.log("MaxMinPicker Args: ", args)

    descriptor = descriptors[args.descriptor]
    if descriptor is None:
        raise ValueError('No descriptor specified')

    if not (args.num or args.threshold):
        raise ValueError('--num or --threshold arguments must be specified, or both')

    # handle metadata
    dataset_meta = {
        "source": "max_min_picker.py",
        "description": "MaxMinPicker using RDKit " + rdBase.rdkitVersion,
    }

    ### generate fingerprints
    fingerprints = []
    molecules = []
    error_total = 0

    # first the initial seeds, if specified
    seed_picks = []
    seed_count = 0
    if args.seed_molecules:
        seeds_handle, seeds_supplier = utils.default_open_input(
            args.seed_molecules, None)
        t0 = time.time()
        error_total += mol_utils.fragmentAndFingerprint(
            seeds_supplier, molecules, fingerprints, descriptor,
            fragmentMethod=args.fragment_method,
            outputFragment=args.output_fragment,
            quiet=args.quiet)
        t1 = time.time()
        seeds_handle.close()
        seed_count = len(fingerprints)
        utils.log("Read", len(fingerprints), "fingerprints for seeds in",
                  t1 - t0, "secs,", error_total, "errors")
        seed_picks = list(range(seed_count))

    # now the molecules to pick from
    # NOTE(review): the default base name 'cluster_butina' looks copied from
    # the clustering script - confirm whether that is intended.
    (in_handle, out_handle, supplier, writer,
     output_base) = utils.default_open_input_output(
         args.input, args.informat, args.output, 'cluster_butina',
         args.outformat, datasetMetaProps=dataset_meta)

    # reset the mols list as we don't need the seeds, only the candidates
    molecules = []
    t0 = time.time()
    candidate_errors = mol_utils.fragmentAndFingerprint(
        supplier, molecules, fingerprints, descriptor,
        fragmentMethod=args.fragment_method,
        outputFragment=args.output_fragment,
        quiet=args.quiet)
    t1 = time.time()
    error_total += candidate_errors

    in_handle.close()
    fp_count = len(fingerprints)
    candidate_count = fp_count - seed_count
    utils.log("Read", candidate_count, "fingerprints for candidates in",
              t1 - t0, "secs,", candidate_errors, "errors")

    # Pick every candidate unless --num asks for a smaller number.
    num_to_pick = candidate_count
    if args.num:
        if args.num > candidate_count:
            utils.log("WARNING: --num argument (", args.num,
                      ") is larger than the total number of candidates (",
                      candidate_count, ") - resetting to", candidate_count)
        else:
            num_to_pick = args.num

    ### do picking
    utils.log("MaxMinPicking with descriptor", args.descriptor,
              "and threshold", args.threshold, ",", seed_count, "seeds,",
              candidate_count, "candidates", fp_count, "total")
    t0 = time.time()
    picks, thresh = performPick(fingerprints, num_to_pick + seed_count,
                                args.threshold, seed_picks)
    t1 = time.time()
    num_picks = len(picks)

    utils.log("Found", num_picks, "molecules in", t1 - t0,
              "secs, final threshold", thresh)
    utils.log("Picks:", list(picks[seed_count:]))

    del fingerprints

    # we want to return the results in the order they were in the input so
    # first we record the order in the pick list
    pick_position = {}
    for position, picked in enumerate(picks[seed_count:]):
        pick_position[picked] = position

    # now write out the mols in input order, recording the position in the
    # pick list as the PickIndex property
    written = 0
    for picked in sorted(picks[seed_count:]):
        # the molecules list only contains the candidates
        mol = molecules[picked - seed_count]
        mol.SetIntProp("PickIndex", pick_position[picked] + 1)
        writer.write(mol)
        written += 1
    utils.log("Output", written, "molecules")

    writer.flush()
    writer.close()
    out_handle.close()

    if args.meta:
        report = {}
        status_str = "{} compounds picked. Final threshold was {}.".format(
            written, thresh)
        if error_total > 0:
            report['__ErrorCount__'] = error_total
            status_str = status_str + " {} errors.".format(error_total)

        report['__StatusMessage__'] = status_str
        report['__InputCount__'] = fp_count
        report['__OutputCount__'] = written
        report['RDKitMaxMinPicker'] = num_picks

        utils.write_metrics(output_base, report)
def main():
    """Command-line entry point: filter molecules with reaction SMARTS.

    Each input molecule is passed to the ``pass_filter`` method of the
    ``Filter`` class imported from ``poised_filter``. For every reaction that
    matches, the associated SMILES strings are stored on the molecule as a
    newline-separated property (named after the reaction) and also supplied
    to the writer as MoleculeObject dicts. With ``--multi`` the molecule is
    additionally written to a per-reaction writer.

    Raises:
        ValueError: if ``--multi`` is given without an output location.
    """
    ### command line args definitions #########################################
    cli = argparse.ArgumentParser(description='RDKit rxn smarts filter')
    utils.add_default_io_args(cli)
    cli.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')
    cli.add_argument('-m', '--multi', action='store_true',
                     help='Output one file for each reaction')
    cli.add_argument('--thin', action='store_true', help='Thin output mode')

    args = cli.parse_args()
    utils.log("Screen Args: ", args)

    if args.multi and not args.output:
        raise ValueError("Must specify output location when writing individual result files")

    ### Define the filter chooser - lots of logic possible
    # SMARTS patterns are defined in poised_filter.py. Currently this is
    # hardcoded. Should make this configurable so that this can be specified
    # by the user at some stage.
    use_poised_filter = True
    if use_poised_filter:
        from poised_filter import Filter
        filter_to_use = Filter()

    rxn_names = filter_to_use.get_rxn_names()
    utils.log("Using", len(rxn_names), "reaction filters")

    # handle metadata
    source = "rxn_smarts_filter.py"
    dataset_meta = {"source": source, "description": "Reaction SMARTS filter"}
    class_mappings = {}
    field_meta = []
    for rxn_name in rxn_names:
        # this is the Java class type for an array of MoleculeObjects
        class_mappings[rxn_name] = "[Lorg.squonk.types.MoleculeObject;"
        field_meta.append({
            "fieldName": rxn_name,
            "values": {
                "source": source,
                "description": "Sythons from " + rxn_name + " reaction"
            }
        })

    (in_handle, out_handle, supplier, writer,
     output_base) = utils.default_open_input_output(
         args.input, args.informat, args.output, 'rxn_smarts_filter',
         args.outformat,
         thinOutput=args.thin,
         valueClassMappings=class_mappings,
         datasetMetaProps=dataset_meta,
         fieldMetaProps=field_meta)

    total = 0
    matched = 0

    writer_dict = None
    dir_base = None
    if args.multi:
        dir_base = os.path.dirname(args.output)
        writer_dict = filter_to_use.get_writers(dir_base)

    for mol in supplier:
        total += 1
        if mol is None:
            continue
        # Return a dict/class here - indicating which filters passed
        filter_pass = filter_to_use.pass_filter(mol)
        utils.log("Found", str(len(filter_pass)), "matches")

        if not filter_pass:
            continue

        props = {}
        matched += 1
        for reaction in filter_pass:
            synthons = filter_pass[reaction]
            # Write the reaction name as a newline separated list of the
            # synthons to the mol object; this is used in SDF output
            mol.SetProp(reaction, "\n".join(synthons))
            # now write to the props in a way that can be used for the JSON
            # output: one MoleculeObject dict per synthon
            props[reaction] = [
                utils.generate_molecule_object_dict(smiles, "smiles", None)
                for smiles in synthons
            ]

            if args.multi:
                rxn_writer = writer_dict[reaction]
                rxn_writer.write(mol)
                rxn_writer.flush()

        # write the output.
        # In JSON format the props will override values set on the mol
        # In SDF format the props are ignored so the values in the mol are used
        writer.write(mol, props)
        writer.flush()

    utils.log("Matched", matched, "molecules from a total of", total)
    if dir_base:
        utils.log("Individual SD files found in: " + dir_base)

    writer.flush()
    writer.close()
    if in_handle:
        in_handle.close()
    if out_handle:
        out_handle.close()

    # close the individual writers
    if writer_dict:
        for key in writer_dict:
            writer_dict[key].close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': total,
            '__OutputCount__': matched,
            'RxnSmartsFilter': matched
        })