def main():
    """Command-line entry point: plane-of-best-fit descriptors for molecules.

    Reads molecules from the default input, embeds each one, computes the
    PBF vector (``PBFev``) and distance (``PBFRD``), stores them as the
    ``angle_<n>`` and ``distance`` properties and writes the annotated
    molecules with ``write_out``.
    """
    # command line args definitions
    parser = argparse.ArgumentParser(
        description='Calculate plane of best fit for molecules')
    parameter_utils.add_default_io_args(parser)
    args = parser.parse_args()
    utils.log("PBFEV args: ", args)

    in_handle, out_handle, suppl, writer, output_base = \
        rdkit_utils.default_open_input_output(
            args.input, args.informat, args.output, 'PBFEV', args.outformat)

    i = 0
    count = 0
    out_results = []
    for mol in suppl:
        i += 1
        # Guard BEFORE touching the molecule: the original embedded first and
        # only then checked for None, so an unreadable record crashed the run.
        if mol is None:
            continue
        AllChem.EmbedMolecule(mol)
        out_vector = PBFev(mol)
        if out_vector is None:
            continue
        rd = PBFRD(mol)
        mol.SetDoubleProp("distance", rd)
        for j, angle in enumerate(out_vector):
            mol.SetDoubleProp("angle" + "_" + str(j), angle)
        out_results.append(mol)

    # All results are collected first and written in one call.
    count = write_out(out_results, count, writer, args.outformat)
    utils.log("Handled " + str(i) + " molecules, resulting in " + str(count) + " outputs")

    writer.flush()
    writer.close()
    in_handle.close()
    out_handle.close()
def main():
    """Command-line entry point: constrained conformer generation.

    Picks the ``--refmolidx``-th (1-based) molecule from ``--refmol`` as the
    reference, then calls ``generate_conformers`` for every readable input
    molecule, accumulating the conformer and error counts it returns.

    Raises:
        ValueError: if ``--refmolidx`` is below 1, exceeds the number of
            molecules in the reference file, or points at a record that
            could not be read.
    """
    parser = argparse.ArgumentParser(description='RDKit constrained conformer generator')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-n', '--num', type=int, default=10, help='number of conformers to generate')
    parser.add_argument('-r', '--refmol', help="Reference molecule file")
    parser.add_argument('--refmolidx', help="Reference molecule index in file", type=int, default=1)
    parser.add_argument('-c', '--core_smi', help='Core substructure. If not specified - guessed using MCS', default='')
    args = parser.parse_args()

    # An index below 1 can never match the 1-based counter below; previously
    # this fell through and failed later with an unbound ref_mol.
    if args.refmolidx < 1:
        raise ValueError("Invalid refmolidx. " + str(args.refmolidx) +
                         " was specified but indexes start at 1.")

    # Get the reference molecule
    # NOTE(review): the file name is passed as the format argument too -
    # kept as in the original; confirm against default_open_input.
    ref_mol_input, ref_mol_suppl = rdkit_utils.default_open_input(args.refmol, args.refmol)
    ref_mol = None
    counter = 0
    try:
        # Get the specified reference molecule. Default is the first
        for mol in ref_mol_suppl:
            counter += 1
            if counter == args.refmolidx:
                ref_mol = mol
                break
    finally:
        ref_mol_input.close()

    if counter < args.refmolidx:
        raise ValueError("Invalid refmolidx. " + str(args.refmolidx) + " was specified but only " + str(counter) + " molecules were present in refmol.")
    if ref_mol is None:
        raise ValueError("Reference molecule " + str(args.refmolidx) +
                         " in refmol could not be read.")

    # handle metadata
    source = "constrained_conf_gen.py"
    datasetMetaProps = {"source": source,
                        "description": "Constrained conformer generation using RDKit " + rdBase.rdkitVersion}
    clsMappings = {"EmbedRMS": "java.lang.Float"}
    fieldMetaProps = [{"fieldName": "EmbedRMS",
                       "values": {"source": source, "description": "Embedding RMS value"}}]

    # Get the molecules
    in_handle, suppl = rdkit_utils.default_open_input(args.input, args.informat)
    out_handle, writer, output_base = rdkit_utils.default_open_output(
        args.output, "const_conf_gen", args.outformat,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps)

    inputs = 0
    totalCount = 0
    totalErrors = 0
    for mol in suppl:
        inputs += 1
        if mol:
            count, errors = generate_conformers(inputs, mol, args.num, ref_mol, writer, args.core_smi)
            totalCount += count
            totalErrors += errors

    in_handle.close()
    writer.close()
    # The output handle was previously left open; close it as the other
    # pipeline entry points do.
    out_handle.close()

    if totalErrors > 0:
        utils.log("WARNING:", totalErrors, "conformers failed to generate")

    # write metrics
    if args.meta:
        metrics = {'__InputCount__': inputs, '__OutputCount__': totalCount, 'RDKitConstrainedConformer': totalCount}
        if totalErrors > 0:
            metrics['__ErrorCount__'] = totalErrors
        utils.write_metrics(output_base, metrics)
def main():
    """Command-line entry point: FeatureStein scoring of input molecules.

    Loads a pickled feature map into the module-level ``fmaps``, registers
    metadata for the quality and quantity score fields, runs ``process``
    over the input molecules and optionally writes metrics.

    Example usage:
        python -m pipelines.xchem.featurestein_score -i ../../data/mpro/poses.sdf.gz -f mpro-fstein.p -o fstein
    """
    global fmaps

    parser = argparse.ArgumentParser(description='FeatureStein scoring with RDKit')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-f', '--feat-map', help='Feature Map pickle to score with')
    parser.add_argument('--no-gzip', action='store_true', help='Do not compress the output (STDOUT is never compressed')
    parser.add_argument('--metrics', action='store_true', help='Write metrics')

    args = parser.parse_args()
    utils.log("FeatureStein Args: ", args)

    source = "featurestein_score.py"
    datasetMetaProps = {"source": source, "description": "FeatureStein scoring using RDKit " + rdBase.rdkitVersion}

    clsMappings = {}
    fieldMetaProps = []
    clsMappings[field_FeatureSteinQualityScore] = "java.lang.Float"
    clsMappings[field_FeatureSteinQuantityScore] = "java.lang.Float"
    # One dict per field. The original put both fields in a single dict
    # literal, whose repeated "fieldName"/"values" keys meant only the
    # quantity-score entry survived.
    fieldMetaProps.append({"fieldName": field_FeatureSteinQualityScore,
                           "values": {"source": source, "description": "FeatureStein quality score"}})
    fieldMetaProps.append({"fieldName": field_FeatureSteinQuantityScore,
                           "values": {"source": source, "description": "FeatureStein quantity score"}})

    # NOTE: pickle.load executes arbitrary code from the file - only use
    # feature maps from a trusted source.
    with open(args.feat_map, 'rb') as pkl_file:
        fmaps = pickle.load(pkl_file)
    utils.log('FeatureMap has', fmaps.GetNumFeatures(), "features")

    inputs_file, inputs_supplr = rdkit_utils.default_open_input(args.input, args.informat)
    output, writer, output_base = rdkit_utils.default_open_output(
        args.output, 'featurestein', args.outformat,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps,
        compress=not args.no_gzip)

    # this does the processing
    total, success, errors = process(inputs_supplr, writer)

    inputs_file.close()
    writer.flush()
    writer.close()
    output.close()

    if args.metrics:
        utils.write_metrics(output_base, {'__InputCount__': total, '__OutputCount__': success,
                                          '__ErrorCount__': errors, 'RDKitFeatureMap': success})
def main():
    """Command-line entry point for the 3D clustering script.

    Parses the default I/O arguments, opens the input and output with
    dataset-level metadata only (no per-field class mappings or field
    metadata are registered), passes the molecule supplier to
    ``combine_conformers`` and closes all handles. The value returned by
    ``combine_conformers`` is not used here.
    """
    arg_parser = argparse.ArgumentParser(description='RDKit cluster 3D')
    parameter_utils.add_default_io_args(arg_parser)
    args = arg_parser.parse_args()
    utils.log("Cluster_3d Args: ", args)

    script_name = "cluster_3d.py"
    dataset_meta = dict(
        source=script_name,
        description="Cluster 3D using RDKit " + rdBase.rdkitVersion,
    )
    # Field-level metadata is intentionally empty for this script.
    value_classes = {}
    field_meta = []

    handles = rdkit_utils.default_open_input_output(
        args.input, args.informat, args.output, 'conformers', args.outformat,
        valueClassMappings=value_classes,
        datasetMetaProps=dataset_meta,
        fieldMetaProps=field_meta)
    in_handle, out_handle, suppl, writer, output_base = handles

    combine_conformers(suppl)

    if in_handle:
        in_handle.close()
    writer.flush()
    writer.close()
    out_handle.close()
def main():
    """Command-line entry point: XCos scoring of poses against fragments.

    Registers metadata for the XCos result fields, opens the fragment and
    pose inputs plus the output writer, and hands them to ``process``.
    All handles are closed afterwards, even if ``process`` raises.

    Example usage:
        python -m pipelines.xchem.xcos -f ../../data/mpro/hits-17.sdf.gz -i ../../data/mpro/poses.sdf.gz -o xcos
    """
    parser = argparse.ArgumentParser(description='XCos scoring with RDKit')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-f', '--fragments', required=True, help='Fragments to compare')
    parser.add_argument('-ff', '--fragments-format', help='Fragments format')
    parser.add_argument('-t', '--score-threshold', type=float, default=0.4,
                        help='Minimum shape overlay and feature map score required for scoring a bit to a fragment')
    parser.add_argument('--no-gzip', action='store_true', help='Do not compress the output (STDOUT is never compressed')
    # NOTE(review): --metrics is accepted but nothing in this function
    # writes metrics - confirm whether process() should report counts.
    parser.add_argument('--metrics', action='store_true', help='Write metrics')

    args = parser.parse_args()
    utils.log("XCos Args: ", args)

    source = "xcos.py"
    datasetMetaProps = {"source": source, "description": "XCos scoring using RDKit " + rdBase.rdkitVersion}

    clsMappings = {}
    fieldMetaProps = []
    clsMappings[field_XCosRefMols] = "java.lang.String"
    clsMappings[field_XCosNumHits] = "java.lang.Integer"
    clsMappings[field_XCosScore1] = "java.lang.Float"
    fieldMetaProps.append({"fieldName": field_XCosRefMols,
                           "values": {"source": source, "description": "XCos reference fragments"}})
    fieldMetaProps.append({"fieldName": field_XCosNumHits,
                           "values": {"source": source, "description": "XCos number of hits"}})
    fieldMetaProps.append({"fieldName": field_XCosScore1,
                           "values": {"source": source, "description": "XCos score 1"}})

    frags_input, frags_suppl = rdkit_utils.default_open_input(args.fragments, args.fragments_format)
    inputs_file, inputs_supplr = rdkit_utils.default_open_input(args.input, args.informat)
    output, writer, output_base = rdkit_utils.default_open_output(
        args.output, 'xcos', args.outformat,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps,
        compress=not args.no_gzip)

    try:
        # this does the processing
        process(inputs_supplr, frags_suppl, writer, threshold=args.score_threshold)
    finally:
        # Previously only the writer was closed; release the input and
        # output handles too, as the sibling scoring scripts do.
        writer.close()
        inputs_file.close()
        frags_input.close()
        output.close()
def main():
    """Command-line entry point: Open3DAlign of input molecules onto a query.

    Reads the query molecule, aligns it onto a copy of itself to obtain the
    "perfect" score, then aligns every readable input molecule with
    ``doO3Dalign``. When ``--num`` is greater than 0, conformers are
    generated first via ``conformers.process_mol_conformers`` and energy
    fields are added to the output metadata. Molecules that raise
    ``ValueError`` are logged and counted as errors.
    """
    parser = argparse.ArgumentParser(description='Open3DAlign with RDKit')
    parser.add_argument('query', help='query molfile')
    parser.add_argument('--qmolidx', help="Query molecule index in SD file if not the first",
                        type=int, default=1)
    parser.add_argument('--crippen', action='store_true', help='Use Crippen (logP) contributions')
    parser.add_argument('-t', '--threshold', type=float,
                        help='score cuttoff relative to alignment of query to itself')
    parser.add_argument(
        '-n', '--num', default=0, type=int,
        help='number of conformers to generate, if None then input structures are assumed to already be 3D')
    parser.add_argument('-a', '--attempts', default=0, type=int,
                        help='number of attempts to generate conformers')
    parser.add_argument('-r', '--rmsd', type=float, default=1.0,
                        help='prune RMSD threshold for excluding conformers')
    parser.add_argument(
        '-e', '--emin', type=int, default=0,
        help='energy minimisation iterations for generated conformers (default of 0 means none)')
    parameter_utils.add_default_io_args(parser)

    args = parser.parse_args()
    utils.log("o3dAlign Args: ", args)

    # TODO - handle molecules with multiple fragments
    # TODO - allow to specify threshold as fraction of perfect score?

    qmol = rdkit_utils.read_single_molecule(args.query, index=args.qmolidx)
    qmol = Chem.RemoveHs(qmol)
    qmol2 = Chem.Mol(qmol)

    # NOTE(review): source is reported as "conformers.py" although this is
    # the Open3DAlign entry point - kept unchanged, confirm intended value.
    source = "conformers.py"
    datasetMetaProps = {
        "source": source,
        "description": "Open3DAlign using RDKit " + rdBase.rdkitVersion
    }
    clsMappings = {"O3DAScore": "java.lang.Float"}
    fieldMetaProps = [{
        "fieldName": "O3DAScore",
        "values": {"source": source, "description": "Open3DAlign alignment score"}
    }]
    if args.num > 0:
        # we generate the conformers so will add energy info
        clsMappings["EnergyDelta"] = "java.lang.Float"
        clsMappings["EnergyAbs"] = "java.lang.Float"
        fieldMetaProps.append({
            "fieldName": "EnergyDelta",
            "values": {"source": source, "description": "Energy difference to lowest energy conformer"}
        })
        fieldMetaProps.append({
            "fieldName": "EnergyAbs",
            "values": {"source": source, "description": "Absolute energy"}
        })

    in_handle, out_handle, suppl, writer, output_base = \
        rdkit_utils.default_open_input_output(
            args.input, args.informat, args.output, 'o3dAlign', args.outformat,
            valueClassMappings=clsMappings,
            datasetMetaProps=datasetMetaProps,
            fieldMetaProps=fieldMetaProps)

    # Aligning the query onto its own copy gives the reference score.
    if args.crippen:
        pyO3A = rdMolAlign.GetCrippenO3A(qmol2, qmol)
    else:
        pyO3A = rdMolAlign.GetO3A(qmol2, qmol)

    perfect_align = pyO3A.Align()
    perfect_score = pyO3A.Score()
    utils.log('Perfect score:', perfect_align, perfect_score,
              Chem.MolToSmiles(qmol, isomericSmiles=True), qmol.GetNumAtoms())

    i = 0
    count = 0
    total = 0
    errors = 0
    for mol in suppl:
        if mol is None:
            i += 1
            continue
        try:
            if args.num > 0:
                mol.RemoveAllConformers()
                # Pass the user's --emin through; the original hard-coded 0
                # here, so the option was parsed but had no effect.
                conformerProps, minEnergy = conformers.process_mol_conformers(
                    mol, i, args.num, args.attempts, args.rmsd, None, None, args.emin)
                mol = Chem.RemoveHs(mol)
                count += doO3Dalign(i, mol, qmol, args.crippen, args.threshold,
                                    perfect_score, writer,
                                    conformerProps=conformerProps, minEnergy=minEnergy)
            else:
                mol = Chem.RemoveHs(mol)
                count += doO3Dalign(i, mol, qmol, args.crippen, args.threshold,
                                    perfect_score, writer)
            total += mol.GetNumConformers()
        except ValueError as e:
            errors += 1
            # str(e) instead of e.message: Python 3 exceptions have no
            # .message attribute, so the handler itself used to raise.
            utils.log("Molecule", i, "failed to align:", str(e))
        i += 1

    in_handle.close()
    writer.flush()
    writer.close()
    out_handle.close()

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': i,
                '__OutputCount__': count,
                '__ErrorCount__': errors,
                'RDKitO3DAlign': total
            })
def main():
    """Command-line entry point: SuCOS scoring against a target molecule.

    Reads the target molecule, registers Float metadata for the SuCOS
    score, the feature-map score and either the Tanimoto score (with
    ``--tanimoto``) or the Protrude score (without it), runs ``process``
    over the inputs and optionally writes metrics.
    """
    cli = argparse.ArgumentParser(description='SuCOS with RDKit')
    parameter_utils.add_default_io_args(cli)
    cli.add_argument(
        '-r', '--refmol',
        help='Molecule to compare against in Molfile (.mol) or SDF (.sdf) format')
    cli.add_argument('-tm', '--target', help='Target molecule to compare against')
    cli.add_argument('-tf', '--target-format', help='Target molecule format')
    cli.add_argument('-ti', '--targetidx', type=int, default=1,
                     help='Target molecule index in file if not the first')
    cli.add_argument('--tanimoto', action='store_true',
                     help='Include Tanimoto distance in score')
    cli.add_argument(
        '--score_mode', choices=['all', 'closest', 'best'],
        help="choose the scoring mode for the feature map, default is 'all'.")

    args = cli.parse_args()
    utils.log("SuCOS Args: ", args)

    score_mode = parse_score_mode(args.score_mode)

    target_mol = rdkit_utils.read_single_molecule(
        args.target, index=args.targetidx, format=args.target_format)
    utils.log("Target mol has", str(target_mol.GetNumHeavyAtoms()), "heavy atoms")

    source = "sucos.py"
    dataset_meta = {
        "source": source,
        "description": "SuCOS using RDKit " + rdBase.rdkitVersion
    }

    # (field name, description) for every score column, in output order.
    score_fields = [
        (field_SuCOS_Score, "SuCOS score"),
        (field_SuCOS_FMScore, "SuCOS Feature Map score"),
    ]
    if args.tanimoto:
        score_fields.append((field_SuCOS_TaniScore, "SuCOS Tanimoto score"))
    else:
        score_fields.append((field_SuCOS_ProtrudeScore, "SuCOS Protrude score"))

    value_classes = {}
    field_meta = []
    for field_name, description in score_fields:
        value_classes[field_name] = "java.lang.Float"
        field_meta.append({
            "fieldName": field_name,
            "values": {"source": source, "description": description}
        })

    opened = rdkit_utils.default_open_input_output(
        args.input, args.informat, args.output, 'sucos', args.outformat,
        valueClassMappings=value_classes,
        datasetMetaProps=dataset_meta,
        fieldMetaProps=field_meta)
    inputs_file, output, inputs_supplr, writer, output_base = opened

    # this does the processing
    count, total, errors = process(target_mol, inputs_supplr, writer,
                                   tani=args.tanimoto, score_mode=score_mode)

    inputs_file.close()
    writer.flush()
    writer.close()
    output.close()

    if not args.meta:
        return
    utils.write_metrics(
        output_base, {
            '__InputCount__': count,
            '__OutputCount__': total,
            '__ErrorCount__': errors,
            'RDKitSuCOS': total
        })
def main():
    """Command-line entry point: SuCOS scoring against a reference molecule.

    Reads the reference molecule from ``--target``, scores every readable
    input molecule with ``get_SucosScore`` and writes it out. Molecules
    that raise ``ValueError`` are logged and counted as errors.
    """
    parser = argparse.ArgumentParser(description='SuCOS with RDKit')
    parser.add_argument('--target', help='molecule to compare against')
    parser.add_argument('--targetidx', help="Target molecule index in SD file if not the first",
                        type=int, default=1)
    parameter_utils.add_default_io_args(parser)

    args = parser.parse_args()
    utils.log("SuCOS Args: ", args)

    # TODO - handle molecules with multiple fragments

    ref_mol = rdkit_utils.read_single_molecule(args.target, index=args.targetidx)
    utils.log("Reference mol has", str(ref_mol.GetNumHeavyAtoms()), "heavy atoms")

    source = "sucos.py"
    datasetMetaProps = {
        "source": source,
        "description": "SuCOS using RDKit " + rdBase.rdkitVersion
    }
    # NOTE(review): the class mapping uses the literal "SuCOS_score" while
    # the field metadata uses field_SuCOS_Score - confirm they agree.
    clsMappings = {"SuCOS_score": "java.lang.Float"}
    fieldMetaProps = [{
        "fieldName": field_SuCOS_Score,
        "values": {"source": source, "description": "SuCOS score"}
    }]

    in_handle, out_handle, suppl, writer, output_base = \
        rdkit_utils.default_open_input_output(
            args.input, args.informat, args.output, 'sucos', args.outformat,
            valueClassMappings=clsMappings,
            datasetMetaProps=datasetMetaProps,
            fieldMetaProps=fieldMetaProps)

    count = 0
    total = 0
    errors = 0
    for mol in suppl:
        count += 1
        if mol is None:
            continue
        try:
            fm_score = get_SucosScore(ref_mol, mol, field_SuCOS_Score)
            utils.log("Score:", str(fm_score))
            writer.write(mol)
            total += 1
        except ValueError as e:
            errors += 1
            # str(e) instead of e.message: Python 3 exceptions have no
            # .message attribute, so the handler itself used to raise.
            utils.log("Molecule", count, "failed to score:", str(e))

    in_handle.close()
    writer.flush()
    writer.close()
    out_handle.close()

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': count,
                '__OutputCount__': total,
                '__ErrorCount__': errors,
                'RDKitSuCOS': total
            })
def main():
    """Command-line entry point: similarity screen against a query structure.

    Builds the query fingerprint from ``--qsmiles`` or ``--qmolfile``, then
    for each readable input molecule optionally picks a fragment, applies
    the heavy-atom/mol-weight filter, computes the similarity and writes
    the molecule when the similarity lies within [simmin, simmax].

    Raises:
        ValueError: if no query is given or the query cannot be parsed.
    """
    # command line args definitions
    parser = argparse.ArgumentParser(description='RDKit screen')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--qsmiles',
                       help='query structure as smiles (incompatible with -qmolfile arg)')
    group.add_argument(
        '--qmolfile',
        help='query structure as filename in molfile format (incompatible with -qsmiles arg)')
    parser.add_argument('--simmin', type=float, default=0.7,
                        help='similarity lower cutoff (1.0 means identical)')
    parser.add_argument('--simmax', type=float, default=1.0,
                        help='similarity upper cutoff (1.0 means identical)')
    parser.add_argument('-d', '--descriptor', type=str.lower,
                        choices=list(descriptors.keys()), default='rdkit',
                        help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m', '--metric', type=str.lower,
                        choices=list(metrics.keys()), default='tanimoto',
                        help='similarity metric (default tanimoto)')
    parser.add_argument(
        '-f', '--fragment', choices=['hac', 'mw'],
        help='Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight )')
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    descriptor = descriptors[args.descriptor.lower()]
    metric = metrics[args.metric.lower()]

    if args.qsmiles:
        query_rdkitmol = Chem.MolFromSmiles(args.qsmiles)
    elif args.qmolfile:
        query_rdkitmol = Chem.MolFromMolFile(args.qmolfile)
    else:
        raise ValueError('No query structure specified')
    # RDKit returns None rather than raising when it cannot parse the
    # input; fail here with a clear message instead of inside the
    # fingerprint function.
    if query_rdkitmol is None:
        raise ValueError('Query structure could not be parsed')

    query_fp = descriptor(query_rdkitmol)

    in_handle, out_handle, suppl, writer, output_base = \
        rdkit_utils.default_open_input_output(
            args.input, args.informat, args.output, 'screen', args.outformat,
            thinOutput=args.thin)

    i = 0
    count = 0
    for mol in suppl:
        i += 1
        if mol is None:
            continue
        if args.fragment:
            mol = mol_utils.fragment(mol, args.fragment, quiet=args.quiet)
        if not filter.filter(mol, minHac=args.hacmin, maxHac=args.hacmax,
                             minMw=args.mwmin, maxMw=args.mwmax, quiet=args.quiet):
            continue
        target_fp = descriptor(mol)
        sim = metric(query_fp, target_fp)

        if args.simmin <= sim <= args.simmax:
            count += 1
            if not args.quiet:
                utils.log(i, sim)
            mol.SetDoubleProp(field_Similarity, sim)
            writer.write(mol)

    utils.log("Found", count, "similar molecules")

    writer.flush()
    writer.close()
    in_handle.close()
    out_handle.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitScreen': i
        })
def main():
    """Command-line entry point: conformer generation (optionally clustered).

    Molecules come either from ``--smiles`` (a single structure) or from
    the default input. For each readable molecule, hydrogens are added,
    ``process_mol_conformers`` is run, hydrogens are removed and the
    conformers are written to ``--outfile`` through a ``ThickSDWriter``.
    When no cluster threshold is given, 0.3 (TFD) or 2.0 (RMSD) is used.
    """
    # command line args definitions
    parser = argparse.ArgumentParser(description='RDKit conformers')
    parser.add_argument('-n', '--num', type=int, default=1,
                        help='number of conformers to generate')
    parser.add_argument('-a', '--attempts', type=int, default=0,
                        help='number of attempts')
    parser.add_argument('-r', '--rmsd', type=float, default=1.0,
                        help='prune RMSD threshold')
    parser.add_argument(
        '-c', '--cluster', type=str.lower, choices=['none', 'rmsd', 'tfd'],
        help='Cluster method (RMSD or TFD). If None then no clustering')
    parser.add_argument(
        '-t', '--threshold', type=float,
        help='cluster threshold (default of 2.0 for RMSD and 0.3 for TFD)')
    parser.add_argument(
        '-e', '--emin', type=int, default=0,
        help='energy minimisation iterations (default of 0 means none)')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument(
        '-s', '--smiles',
        help='input structure as smiles (incompatible with using files or stdin for input)')
    parser.add_argument('-f', '--outfile', type=argparse.FileType('w+'),
                        default=sys.stdout,
                        help="path to the result file, default it sdtout")

    args = parser.parse_args()

    if not args.threshold:
        if args.cluster == 'tfd':
            args.threshold = 0.3
        elif args.cluster == 'rmsd':
            args.threshold = 2.0

    utils.log("Conformers Args: ", args)

    source = "conformers.py"
    datasetMetaProps = {
        "source": source,
        "description": "Conformer generation using RDKit " + rdBase.rdkitVersion
    }
    clsMappings = {
        "RMSToCentroid": "java.lang.Float",
        "EnergyDelta": "java.lang.Float",
        "EnergyAbs": "java.lang.Float",
        "ConformerNum": "java.lang.Integer",
        "ClusterCentroid": "java.lang.Integer",
        "ClusterNum": "java.lang.Integer",
        "StructureNum": "java.lang.Integer"
    }
    fieldMetaProps = [{
        "fieldName": "RMSToCentroid",
        "values": {"source": source, "description": "RMS distance to the cluster centroid"}
    }, {
        "fieldName": "EnergyDelta",
        "values": {"source": source, "description": "Energy difference to lowest energy structure"}
    }, {
        "fieldName": "EnergyAbs",
        "values": {"source": source, "description": "Absolute energy"}
    }, {
        "fieldName": "ConformerNum",
        "values": {"source": source, "description": "Conformer number"}
    }, {
        "fieldName": "ClusterCentroid",
        "values": {"source": source, "description": "Conformer number of the cluster centroid"}
    }, {
        "fieldName": "ClusterNum",
        "values": {"source": source, "description": "Cluster number"}
    }, {
        "fieldName": "StructureNum",
        "values": {"source": source, "description": "Structure number this conformer was generated from"}
    }]

    if args.smiles:
        mol = Chem.MolFromSmiles(args.smiles)
        suppl = [mol]
        in_handle = None
        # Nothing is opened in this mode, so output_base was previously
        # unbound and --meta crashed with NameError. Fall back to the
        # requested output name, else the same default base name used in
        # file mode. NOTE(review): presumably this matches what
        # default_open_input_output would return - confirm.
        output_base = args.output or 'conformers'
    else:
        # NOTE(review): the writer (and output handle) returned here are
        # superseded by the ThickSDWriter on --outfile below and are never
        # closed explicitly - kept as in the original, confirm intent.
        in_handle, output, suppl, writer, output_base = \
            rdkit_utils.default_open_input_output(
                args.input, args.informat, args.output, 'conformers', args.outformat,
                valueClassMappings=clsMappings,
                datasetMetaProps=datasetMetaProps,
                fieldMetaProps=fieldMetaProps)

    # OK, all looks good so we can hope that things will run OK.
    # (The legacy code that wrote per-field types to
    # output_base + '_types.txt' is disabled.)

    i = 0
    count = 0
    writer = rdkit_utils.ThickSDWriter(args.outfile)
    for mol in suppl:
        if mol is None:
            continue
        m = Chem.AddHs(mol)
        conformerPropsDict, minEnergy = process_mol_conformers(
            m, i, args.num, args.attempts, args.rmsd, args.cluster,
            args.threshold, args.emin)
        m = Chem.RemoveHs(m)
        write_conformers(m, i, conformerPropsDict, minEnergy, writer)
        count = count + m.GetNumConformers()
        i += 1

    if in_handle:
        in_handle.close()
    writer.flush()
    writer.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitConformer': count
        })
def main():
    """Command-line entry point: PLI scoring of docked structures.

    Sets the module-level ``PDB_PATH``, ``WRITER`` and (when given)
    ``THRESHOLD`` used by ``run_dock``, then maps ``run_dock`` over the
    input molecules with a thread pool. Output order is not preserved
    unless ``--threads 1`` is used.
    """
    global PDB_PATH, WRITER, THRESHOLD

    parser = argparse.ArgumentParser(
        description='PLI scoring - Docking calculation.')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument(
        '--no-gzip', action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    parser.add_argument('-pdb', '--pdb_file', help="PDB file for scoring")
    parser.add_argument('-t', '--threshold', type=float,
                        help="The maximum score to allow", default=None)
    parser.add_argument(
        '--threads', type=int,
        help="Number of threads to used. Default is the number of cores",
        default=None)
    parser.add_argument('--thin', action='store_true', help='Thin output mode')

    args = parser.parse_args()
    utils.log("PLI Args: ", args)

    # Open up the input file
    in_handle, suppl = rdkit_utils.default_open_input(args.input, args.informat)

    # Open the output file
    s_now = datetime.datetime.utcnow().strftime("%d-%b-%Y %H:%M:%S UTC")
    source = 'pipelines/docking/plip.py'
    out_handle, WRITER, output_base = rdkit_utils.default_open_output(
        args.output, "plip", args.outformat,
        compress=not args.no_gzip,
        thinOutput=args.thin,
        valueClassMappings={'pliff_iscore': 'java.lang.Float',
                            'pliff_cscore': 'java.lang.Float',
                            'pliff_nb_score': 'java.lang.Float',
                            'pliff_gscore': 'java.lang.Float',
                            'pliff_score': 'java.lang.Float',
                            'pliff_tscore': 'java.lang.Float'},
        datasetMetaProps={'created': s_now,
                          'source': source,
                          'description': 'PLI scoring of docked structures'})

    PDB_PATH = args.pdb_file
    # Compare against None, not truthiness: an explicit "-t 0" is a valid
    # threshold and used to be silently ignored.
    if args.threshold is not None:
        THRESHOLD = args.threshold

    # Iterate over the molecules
    # WARNING - if using parallel processing the order of molecules is not
    # preserved. Set args.threads to 1 to ensure this.
    n_threads = args.threads if args.threads is not None else multiprocessing.cpu_count()
    pool = ThreadPool(n_threads)
    pool.map(run_dock, suppl)
    pool.close()
    pool.join()

    # Close the files (the input and output handles were previously left
    # open).
    WRITER.close()
    in_handle.close()
    out_handle.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': COUNTER,
            '__OutputCount__': SUCCESS,
            'PLI': COUNTER
        })
def _parse_field_renames(renames, deletes):
    """Map each source field name to its new name, or to None for deletion."""
    field_renames = {}
    for spec in renames or []:
        parts = spec.split(':')
        if len(parts) != 2:
            raise ValueError('Invalid field rename argument:', spec)
        field_renames[parts[0]] = parts[1]
    for name in deletes or []:
        field_renames[name] = None
    return field_renames


def _parse_field_transforms(transforms):
    """Parse fieldname:/regex/substitution/[:type] specs into three dicts."""
    field_regexes = {}
    field_replacements = {}
    field_types = {}
    for spec in transforms or []:
        parts = spec.split(':')
        if len(parts) < 2 or len(parts) > 3:
            raise ValueError('Invalid field transform argument:', spec)
        terms = parts[1].split('/')
        utils.log("|".join(terms) + str(len(terms)))
        # "/regex/sub/" splits into ['', regex, sub, '']; anything shorter
        # used to fail with an IndexError.
        if len(terms) < 3:
            raise ValueError('Invalid field transform argument:', spec)
        field_regexes[parts[0]] = re.compile(terms[1])
        field_replacements[parts[0]] = terms[2]
        type_name = parts[2] if len(parts) == 3 else 'string'
        field_types[parts[0]] = type_name
        utils.log("Created transform of " + terms[1] + " to " + terms[2] +
                  " using type of " + type_name)
    return field_regexes, field_replacements, field_types


def _text_to_bool(text):
    """Interpret a transformed field value as a boolean."""
    # bool("false") is True, so the text itself has to be examined.
    return text.strip().lower() in ('true', 't', 'yes', 'y', '1')


def main():
    """Command-line entry point: filter, rename and transform molecule fields.

    Reads molecules, optionally reduces each to a single fragment, applies
    the property filter, then renames/deletes fields and applies regex
    transforms before writing. With ``--chunksize`` the output is split
    into numbered files. ``--limit`` stops after that many written records.

    Raises:
        ValueError: for malformed ``--rename`` or ``--transform`` arguments.
    """
    # command line args definitions
    parser = argparse.ArgumentParser(description='RDKit filter')
    parser.add_argument(
        '-f', '--fragment', choices=['hac', 'mw'],
        help='Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)')
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    parser.add_argument('--rotbmin', type=float, help='Min rotatable bond count')
    parser.add_argument('--rotbmax', type=float, help='Max rotatable bond count')
    parser.add_argument('--logpmin', type=float, help='Min logP')
    parser.add_argument('--logpmax', type=float, help='Max logP')
    parser.add_argument('-l', '--limit', type=int,
                        help='Limit output to this many records')
    parser.add_argument(
        '-c', '--chunksize', type=int,
        help='Split output into chunks of size c. Output will always be files. Names like filter1.sdf.gz, filter2.sdf.gz ...')
    parser.add_argument(
        '-d', '--digits', type=int, default=0,
        help='When splitting zero pad the file name to this many digits so that they are in sorted order. Names like filter001.sdf.gz, filter002.sdf.gz ...')
    parser.add_argument('-r', '--rename', action='append',
                        help='Rename field (fromname:toname)')
    parser.add_argument(
        '-t', '--transform', action='append',
        help='Transform field value(fieldname:regex:type). ' +
        'Regex is in the form of /regex/substitution/ (the 3 slashes are required). ' +
        'Type is of int, float, boolean or string. The type is optional and if not specified then string is assumed. ' +
        'Transformation occurs after field renaming so specify the new name.')
    parser.add_argument('--delete', action='append', help='Delete field')
    parser.add_argument(
        '--no-gzip', action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    # WARNING: thin output is not appropriate when using --fragment
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument(
        '-q', '--quiet', action='store_true',
        help='Quiet mode - suppress reporting reason for filtering')
    parameter_utils.add_default_io_args(parser)

    args = parser.parse_args()
    utils.log("Filter Args: ", args)

    field_renames = _parse_field_renames(args.rename, args.delete)
    field_regexes, field_replacements, field_types = \
        _parse_field_transforms(args.transform)

    in_handle, suppl = rdkit_utils.default_open_input(args.input, args.informat)

    chunkNum = 1
    if args.chunksize:
        output_base = args.output if args.output else 'filter'
        output_base_chunk = output_base + str(chunkNum).zfill(args.digits)
        out_handle, writer, output_base_chunk = rdkit_utils.default_open_output(
            output_base_chunk, output_base_chunk, args.outformat,
            thinOutput=args.thin, compress=not args.no_gzip)
    else:
        out_handle, writer, output_base_chunk = rdkit_utils.default_open_output(
            args.output, "filter", args.outformat,
            thinOutput=args.thin, compress=not args.no_gzip)
        output_base = output_base_chunk

    utils.log("Writing to " + output_base_chunk)

    i = 0
    count = 0
    for mol in suppl:
        if args.limit and count >= args.limit:
            break
        i += 1
        if mol is None:
            continue
        if args.fragment:
            mol = mol_utils.fragment(mol, args.fragment, quiet=args.quiet)
        if not filter(mol,
                      minHac=args.hacmin, maxHac=args.hacmax,
                      minMw=args.mwmin, maxMw=args.mwmax,
                      minRotb=args.rotbmin, maxRotb=args.rotbmax,
                      minLogp=args.logpmin, maxLogp=args.logpmax,
                      quiet=args.quiet):
            continue

        if args.chunksize and count > 0 and count % args.chunksize == 0:
            # new chunk, so create new writer
            writer.close()
            out_handle.close()
            chunkNum += 1
            output_chunk_base = output_base + str(chunkNum).zfill(args.digits)
            utils.log("Writing to " + output_chunk_base)
            out_handle, writer, output_chunk_base = rdkit_utils.default_open_output(
                output_chunk_base, output_chunk_base, args.outformat,
                thinOutput=args.thin, compress=not args.no_gzip)

        # Renames and deletes (to_name of None means delete).
        for from_name, to_name in field_renames.items():
            if mol.HasProp(from_name):
                val = mol.GetProp(from_name)
                mol.ClearProp(from_name)
                if to_name:
                    mol.SetProp(to_name, val)

        # Transforms run after renaming, so they address the new names.
        for fieldname, regex in field_regexes.items():
            # GetProp raises for a missing property rather than returning
            # None, so test with HasProp and skip molecules lacking it.
            if not mol.HasProp(fieldname):
                continue
            new_value = regex.sub(field_replacements[fieldname],
                                  mol.GetProp(fieldname))
            type_name = field_types[fieldname]
            if type_name == 'int':
                mol.SetIntProp(fieldname, int(new_value))
            elif type_name == 'float':
                mol.SetDoubleProp(fieldname, float(new_value))
            elif type_name == 'boolean':
                mol.SetBoolProp(fieldname, _text_to_bool(new_value))
            else:
                mol.SetProp(fieldname, new_value)

        count += 1
        writer.write(mol)

    utils.log("Filtered", i, "down to", count, "molecules")
    if args.chunksize:
        utils.log("Wrote", chunkNum, "chunks")
        if args.digits > 0 and len(str(chunkNum)) > args.digits:
            utils.log("WARNING: not enough digits specified for the number of chunks")

    writer.flush()
    writer.close()
    in_handle.close()
    out_handle.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitFilter': i
        })
def main():
    """Command-line entry point: standardize every input molecule.

    Each readable molecule is passed through ``standardize`` with the
    ``--neutralize`` and ``--fragment-method`` settings and written out;
    unreadable records are counted as errors. Output is always opened with
    ``thinOutput=False``, regardless of ``--thin``.
    """
    cli = argparse.ArgumentParser(description='RDKit Standardize')
    cli.add_argument(
        '--fragment-method', choices=['hac', 'mw'],
        help='Approach to find biggest fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)')
    cli.add_argument('--neutralize', action='store_true',
                     help='Neutralize the molecule')
    parameter_utils.add_default_io_args(cli)
    cli.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')
    cli.add_argument('--thin', action='store_true', help='Thin output mode')
    args = cli.parse_args()
    utils.log("Standardize Args: ", args)

    # Dataset-level metadata only; no per-field metadata is registered.
    script_name = "standardize.py"
    dataset_meta = dict(
        source=script_name,
        description="Standardize using RDKit " + rdBase.rdkitVersion,
    )
    value_classes = {}
    field_meta = []

    opened = rdkit_utils.default_open_input_output(
        args.input, args.informat, args.output, 'standardize', args.outformat,
        thinOutput=False,
        valueClassMappings=value_classes,
        datasetMetaProps=dataset_meta,
        fieldMetaProps=field_meta)
    in_handle, out_handle, suppl, writer, output_base = opened

    n_read = 0
    n_written = 0
    n_failed = 0
    for mol in suppl:
        n_read += 1
        if mol is not None:
            writer.write(standardize(mol, args.neutralize, args.fragment_method))
            n_written += 1
        else:
            n_failed += 1

    in_handle.close()
    writer.flush()
    writer.close()
    out_handle.close()

    if not args.meta:
        return
    utils.write_metrics(
        output_base, {
            '__InputCount__': n_read,
            '__OutputCount__': n_written,
            '__ErrorCount__': n_failed,
            'RDKitStandardize': n_written
        })
def main():
    """Command-line entry point for the ODDT interaction calculator.

    Parses the command line, collects the user-specified "key" interactions
    per interaction type, prepares the dataset/field metadata, opens the
    input and output and delegates the actual work to ``process``. When
    ``--metrics`` is given the record and error counts are written with
    ``utils.write_metrics``.
    """
    # Example usage
    # python -m pipelines.xchem.calc_interactions -p ../../data/mpro/Mpro-x0387_0.pdb -i ../../data/mpro/hits-17.sdf.gz -o output

    parser = argparse.ArgumentParser(description='Calculate interactions')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-p', '--protein', nargs='*',
                        help="File with protein (PDB or MOL2 format")
    # NOTE reading mol2 format seems to be problematical.
    parser.add_argument('-pf', '--protein-format', choices=['pdb', 'mol2'],
                        help="Protein file format")
    parser.add_argument('--strict', action='store_true',
                        help='Strict filtering')
    parser.add_argument(
        '--exact-protein', action='store_true',
        help='Exact matching of hydrogens and charges for protein')
    parser.add_argument(
        '--exact-ligand', action='store_true',
        help='Exact matching of hydrogens and charges for ligand')
    parser.add_argument('--keep-hs-protein', action='store_true',
                        help='Keep hydrogens on the protein')
    parser.add_argument('--keep-hs-ligand', action='store_true',
                        help='Keep hydrogens on the ligand')
    parser.add_argument('--key-hbond', nargs='*',
                        help='List of canonical H-bond interactions to count')
    parser.add_argument(
        '--key-hydrophobic', nargs='*',
        help='List of canonical hydrophobic interactions to count')
    parser.add_argument(
        '--key-salt-bridge', nargs='*',
        help='List of canonical salt bridge interactions to count')
    parser.add_argument(
        '--key-pi-stacking', nargs='*',
        help='List of canonical pi stacking interactions to count')
    parser.add_argument(
        '--key-pi-cation', nargs='*',
        help='List of canonical pi cation interactions to count')
    parser.add_argument(
        '--key-halogen', nargs='*',
        help='List of canonical halogen bond interactions to count')
    parser.add_argument(
        '--rfscores', nargs='*',
        help="Pickle(s) for RFScore model e.g. RFScore_v1_pdbbind2016.pickle")
    parser.add_argument(
        '--nnscores', nargs='*',
        help="Pickle(s) for NNScore model e.g. NNScore_pdbbind2016.pickle")
    parser.add_argument(
        '--plecscores', nargs='*',
        help="Pickle(s) for PLECScore model e.g. PLEClinear_p5_l1_pdbbind2016_s65536.pickle")
    parser.add_argument('-r', '--report-file', help="File for the report")
    parser.add_argument('-c', '--compare',
                        help="Compare interactions with this report")
    parser.add_argument(
        '--no-gzip', action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    parser.add_argument('--metrics', action='store_true',
                        help='Write metrics')

    args = parser.parse_args()
    utils.log("Calculate interactions Args: ", args)

    # Only interaction types for which the user listed key interactions are
    # passed on to process().
    key_inters = {}
    if args.key_hbond:
        key_inters[interactions.I_TYPE_HBOND] = args.key_hbond
    if args.key_hydrophobic:
        key_inters[interactions.I_TYPE_HYDROPHOBIC] = args.key_hydrophobic
    if args.key_salt_bridge:
        key_inters[interactions.I_TYPE_SALT_BRIDGE] = args.key_salt_bridge
    if args.key_pi_stacking:
        key_inters[interactions.I_TYPE_PI_STACKING] = args.key_pi_stacking
    if args.key_pi_cation:
        key_inters[interactions.I_TYPE_PI_CATION] = args.key_pi_cation
    if args.key_halogen:
        key_inters[interactions.I_TYPE_HALOGEN] = args.key_halogen

    source = "calc_interactions.py using ODDT"
    datasetMetaProps = {
        "source": source,
        "description": "Calculate interactions using ODDT"
    }

    # One (field name, description) pair per interaction field. Each one gets
    # its own metadata entry; a single dict literal with repeated
    # "fieldName"/"values" keys would silently keep only the last pair.
    interaction_fields = [
        (interactions.I_NAME_HBOND, "Hydrogen bond interactions"),
        (interactions.I_NAME_HALOGEN, "Halogen bond interactions"),
        (interactions.I_NAME_HYDROPHOBIC, "Hydrophobic interactions"),
        (interactions.I_NAME_SALT_BRIDGE, "Salt bridge interactions"),
        (interactions.I_NAME_PI_STACKING, "Pi stacking interactions"),
        (interactions.I_NAME_PI_CATION, "Pi cation interactions"),
    ]

    clsMappings = {}
    fieldMetaProps = []
    for field_name, description in interaction_fields:
        clsMappings[field_name] = "java.lang.String"
        fieldMetaProps.append({
            "fieldName": field_name,
            "values": {
                "source": source,
                "description": description
            }
        })
    clsMappings['NumTotalInteractions'] = "java.lang.Integer"
    clsMappings['NumKeyInteractions'] = "java.lang.Integer"
    clsMappings['KeyInteractions'] = "java.lang.String"

    inputs_file, inputs_supplr = rdkit_utils.default_open_input(
        args.input, args.informat)
    output, writer, output_base = rdkit_utils.default_open_output(
        args.output, 'calc_interactions', args.outformat,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps,
        compress=not args.no_gzip)

    # this does the processing (process() is handed the input file name, not
    # the supplier opened above)
    count, errors = process(args.protein, args.input, writer, key_inters,
                            protein_format=args.protein_format,
                            filter_strict=args.strict,
                            exact_protein=args.exact_protein,
                            exact_ligand=args.exact_ligand,
                            keep_hs_protein=args.keep_hs_protein,
                            keep_hs_ligand=args.keep_hs_ligand,
                            report_file=args.report_file,
                            compare_file=args.compare,
                            rfscores=args.rfscores,
                            nnscores=args.nnscores,
                            plecscores=args.plecscores)
    utils.log('Processing complete.', count, 'records processed.', errors,
              'errors')

    inputs_file.close()
    writer.flush()
    writer.close()
    output.close()

    if args.metrics:
        # NOTE(review): the previous metrics call used an undefined name
        # `total` for the input count. process() only reports
        # (count, errors), so `count` is used for both input and output
        # counts here - confirm this is the intended figure.
        utils.write_metrics(
            output_base, {
                '__InputCount__': count,
                '__OutputCount__': count,
                '__ErrorCount__': errors,
                'ODDTInteraction': count
            })
def main():
    """Command-line entry point for the reaction maker.

    Parses the command line, opens the input molecules, the reagent library
    and the output, then calls ``Filter.perform_reaction`` for every readable
    input molecule with the chosen reaction. The running output count returned
    by ``perform_reaction`` is logged at the end and, when ``args.meta`` is
    set, written with ``utils.write_metrics``.

    Raises:
        ValueError: if ``--multi`` is requested without an output location.
    """
    ### command line args defintions #########################################

    ### Define the reactions available
    poised_filter = True
    if poised_filter:
        from poised_filter import Filter
        filter_to_use = Filter()

    parser = argparse.ArgumentParser(description='RDKit rxn process')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='Quiet mode')
    parser.add_argument('-m', '--multi', action='store_true',
                        help='Output one file for each reaction')
    parser.add_argument('-r', '--reaction',
                        choices=filter_to_use.poised_reactions.keys(),
                        help='Name of reaction to be run')
    parser.add_argument('-rl', '--reagent_lib',
                        help="Input SD file, if not defined the STDIN is used")
    parser.add_argument(
        '-rlf', '--reagent_lib_format', choices=['sdf', 'json'],
        help="Input format. When using STDIN this must be specified.")

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    if not args.output and args.multi:
        raise ValueError(
            "Must specify output location when writing individual result files"
        )

    input, suppl = rdkit_utils.default_open_input(args.input, args.informat)
    reagent_input, reagent_suppl = rdkit_utils.default_open_input(
        args.reagent_lib, args.reagent_lib_format)
    output, writer, output_base = rdkit_utils.default_open_output(
        args.output, "rxn_maker", args.outformat)

    i = 0
    count = 0

    # Per-reaction writers are only created in multi mode; they are closed at
    # the end of this function.
    if args.multi:
        dir_base = os.path.dirname(args.output)
        writer_dict = filter_to_use.get_writers(dir_base)
    else:
        writer_dict = None
        dir_base = None

    for mol in suppl:
        i += 1
        if mol is None:
            continue
        # Return a dict/class here - indicating which filters passed
        count = filter_to_use.perform_reaction(mol, args.reaction,
                                               reagent_suppl, writer, count)

    utils.log("Created", count, "molecules from a total of ", i,
              "input molecules")

    writer.flush()
    writer.close()
    if input:
        input.close()
    # The reagent library handle was previously left open.
    if reagent_input:
        reagent_input.close()
    if output:
        output.close()
    # close the individual writers
    if writer_dict:
        for key in writer_dict:
            writer_dict[key].close()

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': i,
                '__OutputCount__': count,
                'RxnSmartsFilter': count
            })
def main():
    """Command-line entry point for the molecule standardizer / enumerator.

    In standardize mode each molecule is passed through the selected
    standardizer, tagged with a ``Standardized`` property ("True", "False" or
    "Error") and written with its original ``uuid`` (if it had one). Otherwise
    tautomers and/or stereoisomers are enumerated, the source molecule's
    properties are copied onto each result, and the results are written
    without a ``uuid`` but with references back to the source molecule.

    Returns:
        The output count as returned by the last ``write_out`` call.

    Raises:
        ValueError: for incompatible argument combinations.
    """
    ### command line args defintions #########################################
    parser = argparse.ArgumentParser(
        description='RDKit molecule standardizer / enumerator')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-et', '--enumerate_tauts', action='store_true',
                        help='Enumerate all tautomers')
    parser.add_argument('-es', '--enumerate_stereo', action='store_true',
                        help='Enumerate all stereoisomers')
    parser.add_argument(
        '-st', '--standardize', action='store_true',
        help='Standardize molecules. Cannot be true if enumerate is on.')
    parser.add_argument('-stm', '--standardize_method', default="molvs",
                        choices=STANDARD_MOL_METHODS.keys(),
                        help="Choose the method to standardize.")
    parser.add_argument('-mf', '--mol_format',
                        choices=['smiles', 'mol_2d', 'mol_3d'],
                        help="Format for molecules.")

    args = parser.parse_args()
    utils.log("Sanifier Args: ", args)

    if args.standardize and args.enumerate_tauts:
        raise ValueError("Cannot Enumerate Tautomers and Standardize")
    if args.standardize and args.enumerate_stereo:
        raise ValueError("Cannot Enumerate Stereo and Standardize")
    if args.outformat == 'sdf' and args.mol_format == 'smiles':
        raise ValueError("Smiles cannot be used when outputting as SDF")

    if args.standardize:
        getStandardMolecule = STANDARD_MOL_METHODS[args.standardize_method]

    # handle metadata
    source = "sanifier.py"
    datasetMetaProps = {
        "source": source,
        "description": "Enumerate tautomers and stereoisomers"
    }
    clsMappings = {
        "EnumTautIsoSourceMolUUID": "java.lang.String",
        "EnumTautIsoSourceMolIdx": "java.lang.Integer"
    }
    fieldMetaProps = [{
        "fieldName": "EnumTautIsoSourceMolUUID",
        "values": {
            "source": source,
            "description": "UUID of source molecule"
        }
    }, {
        "fieldName": "EnumTautIsoSourceMolIdx",
        "values": {
            "source": source,
            "description": "Index of source molecule"
        }
    }]

    oformat = utils.determine_output_format(args.outformat)

    input, output, suppl, writer, output_base = \
        rdkit_utils.default_open_input_output(
            args.input, args.informat, args.output, 'sanifier',
            args.outformat, thinOutput=False,
            valueClassMappings=clsMappings,
            datasetMetaProps=datasetMetaProps,
            fieldMetaProps=fieldMetaProps)

    i = 0
    count = 0
    errors = 0
    for mol in suppl:
        i += 1
        if mol is None:
            continue

        if args.standardize:
            # we keep the original UUID as there is still a 1-to-1
            # relationship between the input and outputs.
            # Guard the lookup so a molecule without a uuid property is
            # handled by the `if oldUUID` check below instead of failing here.
            oldUUID = mol.GetProp("uuid") if mol.HasProp("uuid") else None
            inputCanSmiles = Chem.MolToSmiles(mol, isomericSmiles=True,
                                              canonical=True)
            try:
                std = getStandardMolecule(mol)
                outputCanSmiles = Chem.MolToSmiles(std, isomericSmiles=True,
                                                   canonical=True)
                if oldUUID:
                    std.SetProp("uuid", oldUUID)
                #utils.log("Standardized", i, inputCanSmiles, ">>", outputCanSmiles)
                if inputCanSmiles == outputCanSmiles:
                    std.SetProp("Standardized", "False")
                else:
                    std.SetProp("Standardized", "True")
            except Exception:
                # Best effort: fall back to the unmodified input molecule and
                # flag it. `Exception` (rather than a bare except) lets
                # KeyboardInterrupt / SystemExit propagate.
                errors += 1
                utils.log("Error standardizing", sys.exc_info()[0])
                std = mol
                std.SetProp("Standardized", "Error")

            count = write_out([std], count, writer, args.mol_format,
                              args.outformat)
        else:
            # we want a new UUID generating as we are generating new molecules
            if mol.HasProp('uuid'):
                parentUuid = mol.GetProp("uuid")
            else:
                parentUuid = None

            results = []
            if args.enumerate_tauts:
                utils.log("Enumerating tautomers")
                results = enumerateTautomers(mol)
            else:
                results.append(mol)

            if args.enumerate_stereo:
                utils.log("Enumerating steroisomers")
                mols = results
                results = []
                for m in mols:
                    enumerated = enumerateStereoIsomers(m)
                    results.extend(enumerated)

            for m in results:
                # copy the src mol props
                for name in mol.GetPropNames():
                    m.SetProp(name, mol.GetProp(name))
                # add our new props
                m.ClearProp("uuid")
                m.SetIntProp("EnumTautIsoSourceMolIdx", i)
                if parentUuid:
                    m.SetProp("EnumTautIsoSourceMolUUID", parentUuid)

            count = write_out(results, count, writer, args.mol_format,
                              args.outformat)

    utils.log("Handled " + str(i) + " molecules, resulting in " + str(count) +
              " outputs")

    writer.flush()
    writer.close()
    input.close()
    output.close()

    # re-write the metadata as we now know the size
    if oformat == 'json':
        utils.write_squonk_datasetmetadata(output_base, False, clsMappings,
                                           datasetMetaProps, fieldMetaProps,
                                           size=count)

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': i,
                '__OutputCount__': count,
                '__ErrorCount__': errors,
                'RDKitSanify': count
            })

    return count
def main():
    """Command-line entry point for the SuCOS max scorer.

    Parses the command line, registers the Java class mapping and field
    metadata for every SuCOS output field, opens the input molecules, the
    output and the target molecules, and hands them to ``process``. When
    ``args.meta`` is set the counts returned by ``process`` are written with
    ``utils.write_metrics``.
    """
    cli = argparse.ArgumentParser(description='Max SuCOS scores with RDKit')
    parameter_utils.add_default_io_args(cli)
    cli.add_argument('-tm', '--target-molecules',
                     help='Target molecules to compare against')
    cli.add_argument('-tf', '--targets-format',
                     help='Target molecules format')
    cli.add_argument('-n', '--name-field',
                     help='Name of field with molecule name')
    cli.add_argument(
        '--no-gzip', action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    cli.add_argument('--filter-value', type=float,
                     help='Filter out values with scores less than this.')
    cli.add_argument('--filter-field',
                     help='Field to use to filter values.')

    opts = cli.parse_args()
    utils.log("Max SuCOSMax Args: ", opts)

    origin = "sucos_max.py"
    dataset_meta = {
        "source": origin,
        "description": "SuCOSMax using RDKit " + rdBase.rdkitVersion,
    }

    # (field name, Java class, description) for every field always written.
    field_specs = [
        (field_SuCOSMax_Score, "java.lang.Float", "SuCOS Max score"),
        (field_SuCOSMax_FMScore, "java.lang.Float",
         "SuCOS Max Feature Map score"),
        (field_SuCOSMax_ProtrudeScore, "java.lang.Float",
         "SuCOS Max Protrude score"),
        (field_SuCOSMax_Index, "java.lang.Integer", "SuCOS Max target index"),
        (field_SuCOSCum_Score, "java.lang.Float", "SuCOS Cumulative score"),
        (field_SuCOSCum_FMScore, "java.lang.Float",
         "SuCOS Cumulative Feature Map score"),
        (field_SuCOSCum_ProtrudeScore, "java.lang.Float",
         "SuCOS Cumulative Protrude score"),
    ]
    # The target name field is only declared when a name field was requested.
    if opts.name_field:
        field_specs.append(
            (field_SuCOSMax_Target, "java.lang.String",
             "SuCOS Max target name"))

    class_map = {}
    field_meta = []
    for fname, java_type, text in field_specs:
        class_map[fname] = java_type
        field_meta.append({
            "fieldName": fname,
            "values": {"source": origin, "description": text},
        })

    query_file, query_supplier = rdkit_utils.default_open_input(
        opts.input, opts.informat)
    out_stream, out_writer, out_base = rdkit_utils.default_open_output(
        opts.output, 'sucos-max', opts.outformat,
        valueClassMappings=class_map,
        datasetMetaProps=dataset_meta,
        fieldMetaProps=field_meta,
        compress=not opts.no_gzip)
    target_file, target_supplier = rdkit_utils.default_open_input(
        opts.target_molecules, opts.targets_format)

    n_in, n_out, n_err = process(query_supplier, target_supplier, out_writer,
                                 opts.name_field, opts.filter_value,
                                 opts.filter_field)

    query_file.close()
    target_file.close()
    out_writer.flush()
    out_writer.close()
    out_stream.close()

    if opts.meta:
        report = {
            '__InputCount__': n_in,
            '__OutputCount__': n_out,
            '__ErrorCount__': n_err,
            'RDKitSuCOS': n_out,
        }
        utils.write_metrics(out_base, report)
def main():
    """Command-line entry point for the MaxMin diverse-subset picker.

    Fingerprints the optional seed molecules and then the candidate
    molecules, runs ``performPick`` with the seeds as pre-picked entries, and
    writes the picked candidates in input order. Each written molecule gets a
    1-based ``PickIndex`` property recording its position in the pick list.
    When ``args.meta`` is set a metrics summary is written.

    Raises:
        ValueError: if the descriptor lookup yields ``None`` or if neither
            ``--num`` nor ``--threshold`` is given.
    """
    ### command line args defintions #########################################
    cli = argparse.ArgumentParser(description='RDKit Butina Cluster')
    cli.add_argument('-t', '--threshold', type=float, default=0.0,
                     help='similarity threshold (1.0 means identical)')
    cli.add_argument('-d', '--descriptor', type=str.lower,
                     choices=list(descriptors.keys()), default='morgan2',
                     help='descriptor or fingerprint type (default rdkit)')
    cli.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')
    cli.add_argument('-n', '--num', type=int,
                     help='maximum number to pick for diverse subset selection')
    cli.add_argument('-s', '--seed-molecules',
                     help='optional file containing any seed molecules that have already been picked')
    cli.add_argument('--fragment-method', choices=['hac', 'mw'], default='hac',
                     help='Approach to find biggest fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)')
    cli.add_argument('--output-fragment', action='store_true',
                     help='Output the biggest fragment rather than the original molecule')
    parameter_utils.add_default_io_args(cli)

    opts = cli.parse_args()
    utils.log("MaxMinPicker Args: ", opts)

    fp_function = descriptors[opts.descriptor]
    if fp_function is None:
        raise ValueError('No descriptor specified')
    if not (opts.num or opts.threshold):
        raise ValueError('--num or --threshold arguments must be specified, or both')

    # handle metadata
    origin = "max_min_picker.py"
    dataset_meta = {
        "source": origin,
        "description": "MaxMinPicker using RDKit " + rdBase.rdkitVersion,
    }

    ### generate fingerprints
    fingerprints = []
    error_total = 0

    # first the initial seeds, if specified
    seed_indices = []
    seed_count = 0
    if opts.seed_molecules:
        seed_file, seed_supplier = rdkit_utils.default_open_input(
            opts.seed_molecules, None)
        seed_mols = []  # collected but never output - only candidates are
        t0 = time.time()
        error_total += mol_utils.fragmentAndFingerprint(
            seed_supplier, seed_mols, fingerprints, fp_function,
            fragmentMethod=opts.fragment_method,
            outputFragment=opts.output_fragment,
            quiet=opts.quiet)
        t1 = time.time()
        seed_file.close()
        seed_count = len(fingerprints)
        utils.log("Read", len(fingerprints), "fingerprints for seeds in",
                  t1 - t0, "secs,", error_total, "errors")
        seed_indices = list(range(seed_count))

    # now the molecules to pick from
    cand_file, out_stream, cand_supplier, out_writer, out_base = \
        rdkit_utils.default_open_input_output(
            opts.input, opts.informat, opts.output, 'max_min_picker',
            opts.outformat, datasetMetaProps=dataset_meta)

    # a fresh list: only the candidates are needed, not the seeds
    candidates = []
    t0 = time.time()
    cand_errors = mol_utils.fragmentAndFingerprint(
        cand_supplier, candidates, fingerprints, fp_function,
        fragmentMethod=opts.fragment_method,
        outputFragment=opts.output_fragment,
        quiet=opts.quiet)
    t1 = time.time()
    error_total += cand_errors
    cand_file.close()

    fp_total = len(fingerprints)
    cand_count = fp_total - seed_count
    utils.log("Read", cand_count, "fingerprints for candidates in", t1 - t0,
              "secs,", cand_errors, "errors")

    wanted = opts.num
    if wanted and wanted > cand_count:
        utils.log("WARNING: --num argument (", wanted,
                  ") is larger than the total number of candidates (",
                  cand_count, ") - resetting to", cand_count)
        pick_target = cand_count
    elif wanted:
        pick_target = wanted
    else:
        pick_target = cand_count

    ### do picking
    utils.log("MaxMinPicking with descriptor", opts.descriptor,
              "and threshold", opts.threshold, ",", seed_count, "seeds,",
              cand_count, "candidates", fp_total, "total")
    t0 = time.time()
    picked, final_threshold = performPick(fingerprints,
                                          pick_target + seed_count,
                                          opts.threshold, seed_indices)
    t1 = time.time()
    picked_total = len(picked)
    utils.log("Found", picked_total, "molecules in", t1 - t0,
              "secs, final threshold", final_threshold)
    utils.log("Picks:", list(picked[seed_count:]))

    del fingerprints

    # Results are returned in input order, so remember where each candidate
    # sat in the pick list before sorting by fingerprint index.
    pick_position = {
        fp_idx: pos for pos, fp_idx in enumerate(picked[seed_count:])
    }

    # write out the mols in input order, recording the pick-list position
    # (1-based) as the PickIndex property
    written = 0
    for fp_idx in sorted(picked[seed_count:]):
        # the candidates list does not contain the seeds, hence the offset
        chosen = candidates[fp_idx - seed_count]
        chosen.SetIntProp("PickIndex", pick_position[fp_idx] + 1)
        out_writer.write(chosen)
        written += 1

    utils.log("Output", written, "molecules")

    out_writer.flush()
    out_writer.close()
    out_stream.close()

    if opts.meta:
        summary = {}
        message = "{} compounds picked. Final threshold was {}.".format(
            written, final_threshold)
        if error_total > 0:
            summary['__ErrorCount__'] = error_total
            message = message + " {} errors.".format(error_total)
        summary['__StatusMessage__'] = message
        summary['__InputCount__'] = fp_total
        summary['__OutputCount__'] = written
        summary['RDKitMaxMinPicker'] = picked_total
        utils.write_metrics(out_base, summary)
def main():
    """Command-line entry point for the reaction SMARTS filter.

    Runs every readable input molecule through ``Filter.pass_filter``. For a
    molecule with matches, each matching reaction's synthons are stored on
    the molecule as a newline-separated property (used for SDF output) and
    as a list of MoleculeObject dicts in ``props`` (used for JSON output).
    In multi mode the molecule is also written to the per-reaction writer.
    When ``args.meta`` is set the counts are written as metrics.

    Raises:
        ValueError: if ``--multi`` is requested without an output location.
    """
    ### command line args defintions #########################################
    cli = argparse.ArgumentParser(description='RDKit rxn smarts filter')
    parameter_utils.add_default_io_args(cli)
    cli.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')
    cli.add_argument('-m', '--multi', action='store_true',
                     help='Output one file for each reaction')
    cli.add_argument('--thin', action='store_true', help='Thin output mode')

    opts = cli.parse_args()
    utils.log("Screen Args: ", opts)

    if opts.multi and not opts.output:
        raise ValueError(
            "Must specify output location when writing individual result files"
        )

    ### Define the filter chooser - lots of logic possible
    # SMARTS patterns are defined in poised_filter.py. Currently this is hardcoded.
    # Should make this configurable so that this can be specified by the user at some stage.
    use_poised_filter = True
    if use_poised_filter:
        from .poised_filter import Filter
        rxn_filter = Filter()
    reaction_names = rxn_filter.get_rxn_names()
    utils.log("Using", len(reaction_names), "reaction filters")

    # handle metadata
    origin = "rxn_smarts_filter.py"
    dataset_meta = {
        "source": origin,
        "description": "Reaction SMARTS filter",
    }
    class_map = {}
    field_meta = []
    for rxn_name in reaction_names:
        # this is the Java class type for an array of MoleculeObjects
        class_map[rxn_name] = "[Lorg.squonk.types.MoleculeObject;"
        field_meta.append({
            "fieldName": rxn_name,
            "values": {
                "source": origin,
                "description": "Sythons from " + rxn_name + " reaction",
            },
        })

    mol_source, mol_sink, supplier, main_writer, base_name = \
        rdkit_utils.default_open_input_output(
            opts.input, opts.informat, opts.output, 'rxn_smarts_filter',
            opts.outformat, thinOutput=opts.thin,
            valueClassMappings=class_map,
            datasetMetaProps=dataset_meta,
            fieldMetaProps=field_meta)

    # per-reaction writers only exist in multi mode
    per_rxn_dir = os.path.dirname(opts.output) if opts.multi else None
    per_rxn_writers = rxn_filter.get_writers(per_rxn_dir) if opts.multi else None

    seen = 0
    matched = 0
    for seen, candidate in enumerate(supplier, start=1):
        if candidate is None:
            continue
        # Return a dict/class here - indicating which filters passed
        hits = rxn_filter.pass_filter(candidate)
        utils.log("Found", str(len(hits)), "matches")
        if not hits:
            continue

        matched += 1
        json_props = {}
        for rxn_name in hits:
            synthons = hits[rxn_name]
            # Write the reaction name as a newline separated list of the
            # synthons to the mol object - this is used in SDF output
            candidate.SetProp(rxn_name, "\n".join(synthons))
            # now build the props in a way that can be used for the JSON
            # output: one MoleculeObject dict per synthon SMILES
            json_props[rxn_name] = [
                utils.generate_molecule_object_dict(smi, "smiles", None)
                for smi in synthons
            ]
            if opts.multi:
                rxn_writer = per_rxn_writers[rxn_name]
                rxn_writer.write(candidate)
                rxn_writer.flush()

        # write the output.
        # In JSON format the props will override values set on the mol
        # In SDF format the props are ignored so the values in the mol are used
        main_writer.write(candidate, json_props)
        main_writer.flush()

    utils.log("Matched", matched, "molecules from a total of", seen)
    if per_rxn_dir:
        utils.log("Individual SD files found in: " + per_rxn_dir)

    main_writer.flush()
    main_writer.close()
    if mol_source:
        mol_source.close()
    if mol_sink:
        mol_sink.close()
    # close the individual writers
    if per_rxn_writers:
        for rxn_name in per_rxn_writers:
            per_rxn_writers[rxn_name].close()

    if opts.meta:
        utils.write_metrics(
            base_name, {
                '__InputCount__': seen,
                '__OutputCount__': matched,
                'RxnSmartsFilter': matched,
            })
def main():
    """Command-line entry point for SMoG2016 docking/scoring.

    Parses the command line, sets the module-level ``WRITER``, ``THRESHOLD``
    and ``PDB_PATH`` used by ``run_dock``, copies the PDB file to a fixed
    location, changes into the SMoG2016 install directory and maps
    ``run_dock`` over the input molecules with a thread pool. The pool, the
    writer, the input/output handles and the working directory are always
    cleaned up, even if docking raises. When ``args.meta`` is set the
    module-level ``COUNTER`` and ``SUCCESS`` values are written as metrics.

    Note: ``--thin`` is accepted but not used by this function.
    """
    global WRITER, THRESHOLD
    global PDB_PATH

    parser = argparse.ArgumentParser(
        description='SMoG2016 - Docking calculation.')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument(
        '--no-gzip', action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    parser.add_argument('-pdb', '--pdb_file', help="PDB file for scoring")
    parser.add_argument('-t', '--threshold',
                        help="The maximum score to allow", default=None)
    parser.add_argument(
        '--threads', type=int,
        help="Number of threads to used. Default is the number of cores",
        default=None)
    parser.add_argument('--thin', action='store_true',
                        help='Thin output mode')

    args = parser.parse_args()
    utils.log("SMoG2016 Args: ", args)

    smog_path = "/usr/local/SMoG2016/"
    if args.threshold:
        THRESHOLD = float(args.threshold)
    else:
        THRESHOLD = None

    PDB_PATH = "/tmp/pdb_file.pdb"
    # Copy the PDB to a fixed path -> silly SMOG bug requires underscore in
    # the filename!
    shutil.copy(args.pdb_file, PDB_PATH)

    # Open up the input file
    input, suppl = rdkit_utils.default_open_input(args.input, args.informat)
    # Open the output file
    output, WRITER, output_base = rdkit_utils.default_open_output(
        args.output, "SMoG2016", args.outformat, compress=not args.no_gzip)

    # WARNING - if using parallel processing the order of molecules is not
    # preserved. Set args.threads to 1 to ensure this.
    if args.threads is None:
        threads = multiprocessing.cpu_count()
    else:
        threads = args.threads

    # Cd to the route of the action
    # TODO - can this be done without changing dir? It gives problems in
    # finding the input files and in writing the metrics
    cwd = os.getcwd()
    os.chdir(smog_path)
    pool = ThreadPool(threads)
    try:
        # Iterate over the molecules
        pool.map(run_dock, suppl)
    finally:
        # Release the worker threads and file handles, and restore the
        # working directory so the metrics are written relative to where the
        # program was started.
        pool.close()
        pool.join()
        # Close the file
        WRITER.close()
        if input:
            input.close()
        if output:
            output.close()
        os.chdir(cwd)

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': COUNTER,
                '__OutputCount__': SUCCESS,
                'SMoG2016': COUNTER
            })

    utils.log("SMoG2016 complete")
def main():
    """Command-line entry point for multi-query similarity screening.

    Fingerprints every query molecule, then compares each (optionally
    fragmented and property-filtered) target molecule against all queries.
    A target is written if at least one similarity falls within
    ``[--simmin, --simmax]``. Each hit's score is stored on the target under
    the query's name (``--qprop``) or, if the query has no such property,
    under its 1-based index. The best score, the best name or index, and the
    hit count are also stored.

    Returns:
        The number of target molecules written.

    Raises:
        ValueError: if no query structure file is specified.
    """
    ### command line args defintions #########################################
    parser = argparse.ArgumentParser(description='RDKit screen')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--qsmiles',
        help='filename of query structures as smiles (incompatible with --sdf and --qjson args)')
    group.add_argument(
        '--qsdf',
        help='filename of query structures as sdfile (incompatible with --smiles and --qjson args)')
    group.add_argument(
        '--qjson',
        help='filename of query structures as MoleculeObject JSON (incompatible with --qsmiles and --qsdf args)')
    parser.add_argument('--qsmilesTitleLine', action='store_true',
                        help='the smiles file has a title line')
    parser.add_argument('--qsmilesDelimiter', default='\t',
                        help='delimiter for smiles file (default is tab)')
    parser.add_argument(
        '--qsmilesColumn', type=int, default=0,
        help='column in smiles file with the smiles (default is first column)')
    parser.add_argument(
        '--qsmilesNameColumn', type=int, default=1,
        help='column in smiles file with ID (default is second column)')
    parser.add_argument(
        '--qprop',
        help='property name in query molecules to report. If not defined (or property is not present) '
        + 'then name property is not written. JSON format uses the UUID as default')

    parser.add_argument('--simmin', type=float, default=0.7,
                        help='similarity lower cutoff (1.0 means identical)')
    parser.add_argument('--simmax', type=float, default=1.0,
                        help='similarity upper cutoff (1.0 means identical)')
    parser.add_argument('-d', '--descriptor', type=str.lower,
                        choices=list(descriptors.keys()), default='rdkit',
                        help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m', '--metric', type=str.lower,
                        choices=list(metrics.keys()), default='tanimoto',
                        help='similarity metric (default tanimoto)')
    parser.add_argument(
        '-f', '--fragment', choices=['hac', 'mw'],
        help='Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight )')
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('--thin', action='store_true',
                        help='Thin output mode')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='Quiet mode')

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    descriptor = descriptors[args.descriptor.lower()]
    metric = metrics[args.metric.lower()]

    propName = args.qprop
    if args.qsmiles:
        queryMolsupplier = rdkit_utils.default_open_input_smiles(
            args.qsmiles,
            delimiter=args.qsmilesDelimiter,
            smilesColumn=args.qsmilesColumn,
            nameColumn=args.qsmilesNameColumn,
            titleLine=args.qsmilesTitleLine)
        queryInput = None
    elif args.qsdf:
        queryInput, queryMolsupplier = rdkit_utils.default_open_input_sdf(
            args.qsdf)
    elif args.qjson:
        queryInput, queryMolsupplier = rdkit_utils.default_open_input_json(
            args.qjson, lazy=False)
        if not propName:
            propName = "uuid"
    else:
        raise ValueError('No query structure specified')

    # query molecule -> its fingerprint
    queryFps = {}
    utils.log("Preparing query fingerprints")
    count = 0
    for q in queryMolsupplier:
        count += 1
        if q:
            queryFps[q] = descriptor(q)
        else:
            utils.log("WARNING: Failed to parse Molecule", count)
    if queryInput:
        queryInput.close()

    input, output, suppl, writer, output_base = \
        rdkit_utils.default_open_input_output(
            args.input, args.informat, args.output, 'screen_multi',
            args.outformat)

    # OK, all looks good so we can hope that things will run OK.
    # But before we start lets write the metadata so that the results can be handled.
    #if args.meta:
    #    t = open(output_base + '_types.txt', 'w')
    #    t.write(field_Similarity + '=integer\n')
    #    t.flush()
    #    t.close()

    i = 0
    count = 0
    for mol in suppl:
        i += 1
        if mol is None:
            continue
        if args.fragment:
            mol = mol_utils.fragment(mol, args.fragment, quiet=args.quiet)
        if not filter.filter(mol, minHac=args.hacmin, maxHac=args.hacmax,
                             minMw=args.mwmin, maxMw=args.mwmax,
                             quiet=args.quiet):
            continue
        targetFp = descriptor(mol)
        idx = 0
        hits = 0
        # Best-hit state is reset for every target so nothing leaks over from
        # the previously screened molecule.
        bestScore = 0
        bestName = None
        bestIdx = None
        for queryMol in queryFps:
            idx += 1
            sim = metric(queryFps[queryMol], targetFp)
            # Fall back to index-based naming when the query does not carry
            # the requested property, as promised by the --qprop help text.
            if propName and queryMol.HasProp(propName):
                name = str(queryMol.GetProp(propName))
            else:
                name = None
            if sim >= args.simmin and sim <= args.simmax:
                hits += 1
                if not args.quiet:
                    utils.log(i, idx, sim)
                # The first hit always becomes the best so far, so bestIdx is
                # defined whenever hits > 0 (even for a similarity of 0).
                if bestIdx is None or sim > bestScore:
                    bestScore = sim
                    bestIdx = idx
                    # keep the name in step with the best-scoring query
                    bestName = name
                if name:
                    mol.SetDoubleProp(field_Similarity + "_" + name, sim)
                else:
                    mol.SetDoubleProp(
                        field_Similarity + "_" + str(idx) + "_Score", sim)

        if hits > 0:
            count += 1
            mol.SetDoubleProp(field_Similarity + "_BestScore", bestScore)
            if bestName:
                mol.SetProp(field_Similarity + "_BestName", bestName)
            else:
                mol.SetIntProp(field_Similarity + "_BestIndex", bestIdx)
            mol.SetIntProp(field_Similarity + "_Count", hits)
            writer.write(mol)

    utils.log("Found", count, "similar molecules")

    writer.flush()
    writer.close()
    input.close()
    output.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitScreen': count
        })

    return count
if my_mol.HasProp("uuid"): cleaned.SetProp("SourceMolUUID", my_mol.GetProp("uuid")) cleaned.SetIntProp("SourceMolNum", molIdx) cleaned.SetIntProp("ConformerNum", count + 1) outputfile.write(cleaned) count += 1 except ValueError: errors += 1 logging.exception('') return count, errors if __name__ == '__main__': parser = argparse.ArgumentParser( description='RDKit constrained conformer generator') parameter_utils.add_default_io_args(parser) parser.add_argument('-n', '--num', type=int, default=10, help='number of conformers to generate') parser.add_argument('-r', '--refmol', help="Reference molecule file") parser.add_argument('--refmolidx', help="Reference molecule index in file", type=int, default=1) parser.add_argument( '-c', '--core_smi', help='Core substructure. If not specified - guessed using MCS', default='')
def main():
    """Command-line entry point for the interaction filter.

    Parses the command line, opens the input supplier and the output writer
    through ``rdkit_utils``, hands the records to ``execute`` together with a
    ``<output_base>.report`` text file, then closes everything and optionally
    writes dataset metadata (JSON output only) and metrics.
    """

    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='Filter interactions')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-f', '--group-by-field', required=True,
                        help='Field to group records by (must be sequential)')
    parser.add_argument('-s', '--score-field', required=True,
                        help='Field to use to rank records within a group')
    parser.add_argument('-d', '--score-descending', action='store_true',
                        help='Sort records in descending order')
    parser.add_argument('-x', '--stats-fields', nargs='*',
                        help='Field to use to for summary statistics')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='Quiet mode')
    # NOTE(review): --thin is parsed but thinOutput=False is hard-coded below;
    # confirm whether the flag is meant to be honoured.
    parser.add_argument('--thin', action='store_true',
                        help='Thin output mode')
    parser.add_argument(
        '--no-gzip', action='store_true',
        help='Do not compress the output (STDOUT is never compressed')

    args = parser.parse_args()
    utils.log("filter_interactions: ", args)

    # handle metadata (no per-field class mappings or field metadata are
    # declared for this tool)
    source = "filter_interactions.py"
    datasetMetaProps = {
        "source": source,
        "description": "Filter by interactions"
    }
    clsMappings = {}
    fieldMetaProps = []

    input, suppl = rdkit_utils.default_open_input(args.input, args.informat)
    output, writer, output_base = rdkit_utils.default_open_output(
        args.output, 'filter_interactions', args.outformat,
        thinOutput=False,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps,
        compress=not args.no_gzip)

    # The context manager guarantees the report file is flushed and closed
    # even when execute() raises part-way through the input.
    with open(output_base + '.report', 'wt') as report_file:
        count, total, errors = execute(suppl, writer, report_file,
                                       args.group_by_field, args.score_field,
                                       args.score_descending,
                                       args.stats_fields)

    utils.log(count, total, errors)

    if input:
        input.close()
    writer.flush()
    writer.close()
    output.close()

    # re-write the metadata as we now know the size
    if args.outformat == 'json':
        utils.write_squonk_datasetmetadata(output_base, False, clsMappings,
                                           datasetMetaProps, fieldMetaProps,
                                           size=total)

    # args.meta is not declared in this function; presumably it is registered
    # by parameter_utils.add_default_io_args.
    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': count,
                '__OutputCount__': total,
                '__ErrorCount__': errors,
                'FilterInteractions': total
            })
def main():
    """Command-line entry point for the molecule standardiser / enumerator.

    In standardise mode each input molecule yields exactly one output
    molecule tagged with a ``Standardised`` property (``True``, ``False`` or
    ``Error``). Otherwise the molecule is optionally expanded into tautomers
    and/or stereoisomers, each tagged with the index (and UUID, when present)
    of its source molecule.

    Returns:
        The output count as returned by the last ``write_out`` call.

    Raises:
        ValueError: if standardisation is combined with enumeration, or if
            SMILES molecule format is requested together with SDF output.
    """

    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(
        description='RDKit molecule standardiser / enumerator')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-et', '--enumerate_tauts', action='store_true',
                        help='Enumerate all tautomers')
    parser.add_argument('-es', '--enumerate_stereo', action='store_true',
                        help='Enumerate all stereoisomers')
    parser.add_argument(
        '-st', '--standardize', action='store_true',
        help='Standardize molecules. Cannot be true if enumerate is on.')
    parser.add_argument('-stm', '--standardize_method', default="molvs",
                        choices=STANDARD_MOL_METHODS.keys(),
                        help="Choose the method to standardize.")
    parser.add_argument('-mf', '--mol_format',
                        choices=['smiles', 'mol_2d', 'mol_3d'],
                        help="Format for molecules.")

    args = parser.parse_args()
    utils.log("Sanifier Args: ", args)

    if args.standardize and args.enumerate_tauts:
        raise ValueError("Cannot Enumerate Tautomers and Standardise")

    if args.standardize and args.enumerate_stereo:
        raise ValueError("Cannot Enumerate Stereo and Standardise")

    if args.outformat == 'sdf' and args.mol_format == 'smiles':
        raise ValueError("Smiles cannot be used when outputting as SDF")

    if args.standardize:
        getStandardMolecule = STANDARD_MOL_METHODS[args.standardize_method]

    input, output, suppl, writer, output_base = rdkit_utils.default_open_input_output(
        args.input, args.informat, args.output, 'sanify', args.outformat)

    i = 0
    count = 0
    errors = 0
    for mol in suppl:
        i += 1
        if mol is None:
            continue

        if args.standardize:
            # we keep the original UUID as there is still a 1-to-1
            # relationship between the input and outputs.
            # GetProp raises KeyError for a missing property, so probe with
            # HasProp first; inputs without a uuid are processed normally.
            oldUUID = mol.GetProp("uuid") if mol.HasProp("uuid") else None
            inputCanSmiles = Chem.MolToSmiles(mol, isomericSmiles=True,
                                              canonical=True)
            try:
                std = getStandardMolecule(mol)
                outputCanSmiles = Chem.MolToSmiles(std, isomericSmiles=True,
                                                   canonical=True)
                if oldUUID:
                    std.SetProp("uuid", oldUUID)
                if inputCanSmiles == outputCanSmiles:
                    std.SetProp("Standardised", "False")
                else:
                    std.SetProp("Standardised", "True")
            except Exception:
                # Best effort: fall back to the unmodified molecule and flag
                # it. Exception (not a bare except) lets KeyboardInterrupt
                # and SystemExit propagate.
                errors += 1
                utils.log("Error standardizing", sys.exc_info()[0])
                std = mol
                std.SetProp("Standardised", "Error")

            count = write_out([std], count, writer, args.mol_format,
                              args.outformat)
        else:
            # we want a new UUID generating as we are generating new molecules
            if mol.HasProp('uuid'):
                parentUuid = mol.GetProp("uuid")
            else:
                parentUuid = None

            results = [mol]

            if args.enumerate_tauts:
                utils.log("Enumerating tautomers")
                results = enumerateTautomers(mol)

            if args.enumerate_stereo:
                utils.log("Enumerating steroisomers")
                mols = results
                results = []
                for m in mols:
                    results.extend(enumerateStereoIsomers(m))

            for m in results:
                m.ClearProp("uuid")
                m.SetIntProp("SourceMolNum", i)
                if parentUuid:
                    m.SetProp("SourceMolUUID", parentUuid)

            count = write_out(results, count, writer, args.mol_format,
                              args.outformat)

    utils.log("Handled " + str(i) + " molecules, resulting in " + str(count) +
              " outputs")

    writer.flush()
    writer.close()
    input.close()
    output.close()

    # args.meta is not declared in this function; presumably it is registered
    # by parameter_utils.add_default_io_args.
    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': i,
                '__OutputCount__': count,
                '__ErrorCount__': errors,
                'RDKitSanify': count
            })

    return count
def main():
    """Command-line entry point for the RMSD filter.

    Parses the command line, opens the input supplier and output writer
    through ``rdkit_utils``, delegates the filtering to ``process`` and, when
    ``--metrics`` is given, writes the usage metrics.
    """

    # Example usage
    # python -m pipelines.xchem.rmsd_filter -i ../../data/mpro/poses.sdf.gz -o output -c 0.5

    parser = argparse.ArgumentParser(description='RSMD filter')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-c', '--cutoff-rmsd', type=float,
                        help='RMSD cutoff')
    parser.add_argument('-f', '--field', default='_Name',
                        help='Field to group records')
    parser.add_argument(
        '--no-gzip', action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    parser.add_argument('--metrics', action='store_true',
                        help='Write metrics')

    args = parser.parse_args()
    utils.log("RSMD filter Args: ", args)

    source = "rmsd_filter.py"
    datasetMetaProps = {
        "source": source,
        "description": "RMSD filter " + rdBase.rdkitVersion
    }

    # This tool declares no value-class mappings or field metadata.
    clsMappings = {}
    fieldMetaProps = []

    inputs_file, inputs_supplr = rdkit_utils.default_open_input(
        args.input, args.informat)
    output, writer, output_base = rdkit_utils.default_open_output(
        args.output, 'rmsd_filter', args.outformat,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps,
        compress=not args.no_gzip)

    # this does the processing
    count, groups, kept, errors = process(inputs_supplr, writer, args.field,
                                          args.cutoff_rmsd)
    utils.log('Processing complete.', count, 'records processed with', groups,
              'groups.', kept, 'records retained.', errors, 'errors')

    inputs_file.close()
    writer.flush()
    writer.close()
    output.close()

    if args.metrics:
        # Report the values returned by process(): records read and records
        # retained. The names previously used here were never defined in this
        # function, so --metrics failed with NameError.
        # NOTE(review): the 'RDKitFeatureMap' key looks copied from the
        # FeatureStein script; kept unchanged because downstream consumers of
        # the metrics file may depend on it - confirm the intended name.
        utils.write_metrics(
            output_base, {
                '__InputCount__': count,
                '__OutputCount__': kept,
                '__ErrorCount__': errors,
                'RDKitFeatureMap': kept
            })
def main():
    """Command-line entry point for charge enumeration.

    Reads molecules from the input, passes each non-``None`` molecule to
    ``enumerateMol`` and writes the results through ``writeEnumeratedMols``.
    Afterwards the dataset metadata is re-written with the final size (JSON
    output only) and, if requested, the metrics are written.
    """

    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='Enumerate charges')
    parser.add_argument(
        '--fragment-method', choices=['hac', 'mw'],
        help=
        'Approach to find biggest fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)'
    )
    parser.add_argument('--min-ph', help='The min pH to consider',
                        type=float, default=5.0)
    parser.add_argument('--max-ph', help='The max pH to consider',
                        type=float, default=9.0)

    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='Quiet mode')
    # NOTE(review): --thin is parsed but thinOutput=False is hard-coded below;
    # confirm whether the flag is meant to be honoured.
    parser.add_argument('--thin', action='store_true',
                        help='Thin output mode')

    args = parser.parse_args()
    utils.log("Enumerate charges: ", args)

    # handle metadata
    source = "enumerate_charges.py"
    datasetMetaProps = {
        "source": source,
        "description": "Enumerate charges using Dimorphite-dl"
    }
    clsMappings = {
        "EnumChargesSrcMolUUID": "java.lang.String",
        "EnumChargesSrcMolIdx": "java.lang.Integer"
    }
    fieldMetaProps = [{
        "fieldName": "EnumChargesSrcMolUUID",
        "values": {
            "source": source,
            "description": "UUID of source molecule"
        }
    }, {
        "fieldName": "EnumChargesSrcMolIdx",
        "values": {
            "source": source,
            "description": "Index of source molecule"
        }
    }]

    oformat = utils.determine_output_format(args.outformat)

    input, output, suppl, writer, output_base = rdkit_utils.default_open_input_output(
        args.input, args.informat, args.output, 'enumerateCharges',
        args.outformat,
        thinOutput=False,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps)

    count = 0
    total = 0
    errors = 0
    min_ph = args.min_ph
    max_ph = args.max_ph

    # this hacky bit is needed because the dimorphite entrypoint assumes its
    # args are passed using argparse but it doesn't understand our args, so we
    # need to switch between the two sets of args.
    dimorphite_sys_argv = sys.argv[:1] + [
        '--min_ph', str(min_ph),
        '--max_ph', str(max_ph),
    ]

    fragment = args.fragment_method
    for mol in suppl:
        if mol is None:
            continue
        count += 1
        orig_sys_argv = sys.argv[:]
        sys.argv = dimorphite_sys_argv
        try:
            enum_mols = enumerateMol(mol, fragment)
        finally:
            # Always restore our own argv, even if enumeration raises, so the
            # process is never left running with the substituted arguments.
            sys.argv = orig_sys_argv
        t, e = writeEnumeratedMols(mol, enum_mols, writer, count)
        total += t
        errors += e

    utils.log(count, total, errors)

    if input:
        input.close()
    writer.flush()
    writer.close()
    output.close()

    # re-write the metadata as we now know the size
    if oformat == 'json':
        utils.write_squonk_datasetmetadata(output_base, False, clsMappings,
                                           datasetMetaProps, fieldMetaProps,
                                           size=total)

    # args.meta is not declared in this function; presumably it is registered
    # by parameter_utils.add_default_io_args.
    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': count,
                '__OutputCount__': total,
                '__ErrorCount__': errors,
                'EnumerateChargesDimporphite': total
            })
def main():
    """Command-line entry point for Butina clustering.

    Fingerprints the input molecules, clusters them with ``ClusterFps`` and,
    when ``--num`` is given, reduces the clusters with
    ``SelectDiverseSubset``. Every molecule whose index appears in the final
    cluster lookup is written out with its cluster number stored under
    ``field_Cluster``.

    Raises:
        ValueError: if the descriptor lookup yields ``None``, or if
            ``--field`` is given without ``--num`` or without one of
            ``--min`` / ``--max``.
    """

    # ---- command line definition ----
    parser = argparse.ArgumentParser(description='RDKit Butina Cluster')
    parser.add_argument(
        '-t', '--threshold', type=float, default=0.7,
        help='similarity clustering threshold (1.0 means identical)')
    parser.add_argument(
        '-d', '--descriptor', type=str.lower,
        choices=list(descriptors.keys()), default='rdkit',
        help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument(
        '-m', '--metric', type=str.lower,
        choices=list(metrics.keys()), default='tanimoto',
        help='similarity metric (default tanimoto)')
    parser.add_argument(
        '-n', '--num', type=int,
        help='maximum number to pick for diverse subset selection')
    parser.add_argument(
        '-e', '--exclude', type=float, default=0.9,
        help='threshold for excluding structures in diverse subset selection (1.0 means identical)')
    parser.add_argument(
        '--fragment-method', choices=['hac', 'mw'], default='hac',
        help='Approach to find biggest fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)')
    parser.add_argument(
        '--output-fragment', action='store_true',
        help='Output the biggest fragment rather than the original molecule')
    parser.add_argument(
        '-f', '--field',
        help='field to use to optimise diverse subset selection')
    min_or_max = parser.add_mutually_exclusive_group()
    min_or_max.add_argument(
        '--min', action='store_true',
        help='pick lowest value specified by the --field option')
    min_or_max.add_argument(
        '--max', action='store_true',
        help='pick highest value specified by the --field option')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='Quiet mode')
    parser.add_argument('--thin', action='store_true',
                        help='Thin output mode')

    args = parser.parse_args()
    utils.log("Cluster Args: ", args)

    # ---- argument validation ----
    descriptor = descriptors[args.descriptor]
    if descriptor is None:
        raise ValueError('Invalid descriptor name ' + args.descriptor)

    if args.field:
        if not args.num:
            raise ValueError('--num argument must be specified for diverse subset selection')
        if not (args.min or args.max):
            raise ValueError('--min or --max argument must be specified for diverse subset selection')

    # ---- metadata describing the generated field ----
    source = "cluster_butina.py"
    meta_kwargs = dict(
        thinOutput=False,
        valueClassMappings={"Cluster": "java.lang.Integer"},
        datasetMetaProps={
            "source": source,
            "description": "Butina clustering using RDKit " + rdBase.rdkitVersion,
        },
        fieldMetaProps=[{
            "fieldName": "Cluster",
            "values": {"source": source, "description": "Cluster number"},
        }],
    )

    input, output, suppl, writer, output_base = rdkit_utils.default_open_input_output(
        args.input, args.informat, args.output, 'cluster_butina',
        args.outformat, **meta_kwargs)

    # ---- fragment and fingerprint (mols and fps are filled in place) ----
    mols = []
    fps = []
    errs = mol_utils.fragmentAndFingerprint(
        suppl, mols, fps, descriptor,
        fragmentMethod=args.fragment_method,
        outputFragment=args.output_fragment,
        quiet=args.quiet)

    input.close()

    # ---- clustering ----
    utils.log("Clustering with descriptor", args.descriptor, "metric",
              args.metric, "and threshold", args.threshold)
    clusters, dists, matrix = ClusterFps(fps, args.metric,
                                         1.0 - args.threshold)
    utils.log("Found", len(clusters), "clusters")

    # ---- optional diverse subset selection ----
    finalClusters = clusters
    if args.num:
        utils.log("Generating diverse subset")
        finalClusters = SelectDiverseSubset(mols, clusters, dists, args.num,
                                            args.field, args.max,
                                            args.exclude, args.quiet)

    utils.log("Found", len(finalClusters), "clusters")
    lookup = ClustersToMap(finalClusters)

    if not args.quiet:
        utils.log("Final Clusters:", finalClusters)

    # ---- write the results ----
    result_count = 0
    for position, mol in enumerate(mols):
        if position not in lookup:
            continue
        if args.thin:
            rdkit_utils.clear_mol_props(mol, ["uuid"])
        mol.SetIntProp(field_Cluster, lookup[position])
        writer.write(mol)
        result_count += 1
    # Every molecule in the list was visited, so this is the input count.
    input_count = len(mols)

    writer.flush()
    writer.close()
    output.close()

    if args.meta:
        status_str = '{} results from {} clusters'.format(
            result_count, len(finalClusters))
        utils.write_metrics(output_base, {
            '__StatusMessage__': status_str,
            '__InputCount__': input_count,
            '__OutputCount__': result_count,
            'RDKitCluster': input_count,
        })
def main():
    """Command-line entry point for FeatureStein scoring.

    Builds the module-level ``fmaps`` from the fragments file via
    ``create_featuremap``, then scores the input molecules with
    ``score_molecules`` and, when ``--metrics`` is given, writes the usage
    metrics.
    """

    # Example usage
    # python -m pipelines.xchem.featurestein_generate_and_score -i ../../data/mpro/poses.sdf.gz -f ../../data/mpro/hits-17.sdf.gz -o output_fs

    # fmaps is assigned below at module level (global statement).
    global fmaps

    parser = argparse.ArgumentParser(
        description='FeatureStein scoring with RDKit')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-f', '--fragments',
                        help='Fragments to use to generate the feature map')
    parser.add_argument('-ff', '--fragments-format',
                        help='Fragments format')
    parser.add_argument(
        '--no-gzip', action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    parser.add_argument('--metrics', action='store_true',
                        help='Write metrics')

    args = parser.parse_args()
    utils.log("FeatureStein Args: ", args)

    source = "featurestein_generate_and_score.py"
    datasetMetaProps = {
        "source": source,
        "description": "FeatureStein scoring using RDKit " + rdBase.rdkitVersion
    }

    clsMappings = {
        field_FeatureSteinQualityScore: "java.lang.Float",
        field_FeatureSteinQuantityScore: "java.lang.Float",
    }
    # One metadata entry per score field. These must be separate dicts: a
    # single literal with repeated "fieldName"/"values" keys keeps only the
    # last pair, which silently dropped the quality-score entry.
    fieldMetaProps = [
        {
            "fieldName": field_FeatureSteinQualityScore,
            "values": {
                "source": source,
                "description": "FeatureStein quality score"
            }
        },
        {
            "fieldName": field_FeatureSteinQuantityScore,
            "values": {
                "source": source,
                "description": "FeatureStein quantity score"
            }
        },
    ]

    # generate the feature maps
    frags_input, frags_suppl = rdkit_utils.default_open_input(
        args.fragments, args.fragments_format)
    fmaps = create_featuremap(frags_suppl)
    frags_input.close()

    # read the ligands to be scored
    inputs_file, inputs_supplr = rdkit_utils.default_open_input(
        args.input, args.informat)
    output, writer, output_base = rdkit_utils.default_open_output(
        args.output, 'featurestein', args.outformat,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps,
        compress=not args.no_gzip)

    # do the scoring
    total, success, errors = score_molecules(inputs_supplr, writer)
    utils.log('Scored', success, 'molecules.', errors, 'errors.')

    inputs_file.close()
    writer.flush()
    writer.close()
    output.close()

    if args.metrics:
        utils.write_metrics(
            output_base, {
                '__InputCount__': total,
                '__OutputCount__': success,
                '__ErrorCount__': errors,
                'RDKitFeatureMap': success
            })