def main():
    global PDB_PATH, WRITER, THRESHOLD

    parser = argparse.ArgumentParser(description='SMoG2016 - Docking calculation.')
    utils.add_default_io_args(parser)
    parser.add_argument('--no-gzip', action='store_true',
                        help='Do not compress the output (STDOUT is never compressed)')
    parser.add_argument('-pdb', '--pdb_file', help="PDB file for scoring")
    parser.add_argument('-t', '--threshold', type=float, default=None,
                        help="The maximum score to allow")
    parser.add_argument('--thin', action='store_true', help='Thin output mode')

    args = parser.parse_args()
    utils.log("SMoG2016 Args: ", args)

    smog_path = "/usr/local/SMoG2016_Rev1/"

    THRESHOLD = args.threshold

    # Copy the PDB to /tmp/pdb_file.pdb -> a silly SMoG bug requires an
    # underscore in the filename!
    PDB_PATH = "/tmp/pdb_file.pdb"
    shutil.copy(args.pdb_file, PDB_PATH)

    # Open the input file
    input, suppl = utils.default_open_input(args.input, args.informat)
    # Open the output file
    output, WRITER, output_base = utils.default_open_output(
        args.output, "SMoG2016", args.outformat, compress=not args.no_gzip)

    # cd to the root of the SMoG installation.
    # TODO - can this be done without changing dir? It gives problems in
    # finding the input files and in writing the metrics.
    cwd = os.getcwd()
    os.chdir(smog_path)

    # Iterate over the molecules.
    # TODO - restore parallel processing, but need to ensure the order of
    # molecules is preserved.
    pool = ThreadPool(1)
    pool.map(run_dock, suppl)
    pool.close()
    pool.join()

    # Close the output
    WRITER.close()
    os.chdir(cwd)

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': COUNTER,
            '__OutputCount__': SUCCESS,
            'SMoG2016': COUNTER})

    utils.log("SMoG2016 complete")
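# A minimal sketch of the module-level scaffolding main() above assumes. The
# imports and initial values are assumptions; only the names (COUNTER,
# SUCCESS, WRITER, THRESHOLD, PDB_PATH) come from the code itself. run_dock,
# which does the per-molecule scoring, is defined elsewhere in the module.

import argparse
import os
import shutil
from multiprocessing.pool import ThreadPool

import utils

COUNTER = 0       # molecules read, assumed to be incremented by run_dock
SUCCESS = 0       # molecules written, assumed to be incremented by run_dock
WRITER = None     # shared output writer, assigned in main()
THRESHOLD = None  # maximum score to allow, assigned in main()
PDB_PATH = None   # receptor PDB path, assigned in main()

if __name__ == "__main__":
    main()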
def main():
    global PDB_PATH, WRITER, THRESHOLD

    parser = argparse.ArgumentParser(description='PLI - Docking calculation.')
    utils.add_default_io_args(parser)
    parser.add_argument('--no-gzip', action='store_true',
                        help='Do not compress the output (STDOUT is never compressed)')
    parser.add_argument('-pdb', '--pdb_file', help="PDB file for scoring")
    parser.add_argument('-t', '--threshold', type=float, default=None,
                        help="The maximum score to allow")
    parser.add_argument('--threads', type=int, default=None,
                        help="Number of threads to use. Default is the number of cores")
    parser.add_argument('--thin', action='store_true', help='Thin output mode')

    args = parser.parse_args()
    utils.log("PLI Args: ", args)

    # Open the input file
    input, suppl = utils.default_open_input(args.input, args.informat)
    # Open the output file
    output, WRITER, output_base = utils.default_open_output(
        args.output, "plip", args.outformat,
        compress=not args.no_gzip, thinOutput=args.thin)

    PDB_PATH = args.pdb_file
    if args.threshold:
        THRESHOLD = args.threshold

    # Iterate over the molecules.
    # WARNING - if using parallel processing the order of molecules is not
    # preserved. Set --threads to 1 to ensure this.
    pool = ThreadPool(args.threads if args.threads is not None
                      else multiprocessing.cpu_count())
    pool.map(run_dock, suppl)
    pool.close()
    pool.join()

    # Close the output
    WRITER.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': COUNTER,
            '__OutputCount__': SUCCESS,
            'PLI': COUNTER})
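# Hypothetical invocation of the PLI variant (the script and file names are
# illustrative, not taken from the source). Note that with more than one
# thread the output order is not guaranteed to match the input order:
#
#   python pli.py -i ligands.sdf.gz -pdb receptor.pdb -t -5.0 --threads 1 -o scored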
def split(input, informat, fieldName, outputBase, writeMetrics):
    """Splits the input into separate files. The name of each file, and the
    file each record is written to, are determined by the fieldName parameter.
    """

    input, suppl = utils.default_open_input(input, informat)

    i = 0
    written = 0
    writers = {}
    outputs = []
    filenames = []
    for mol in suppl:
        i += 1
        if mol is None:
            continue
        if not mol.HasProp(fieldName):
            utils.log("Skipping molecule", i, "- did not contain field", fieldName)
            continue

        value = mol.GetProp(fieldName)
        if value:
            s = str(value)
            if s in writers:
                writer = writers[s]
            else:
                name = outputBase + s
                output, writer = utils.default_open_output_sdf(name, outputBase, False, False)
                filenames.append(name + '.sdf')
                outputs.append(output)
                writers[s] = writer
            writer.write(mol)
            written += 1

    utils.log("Generated", len(writers), "outputs from", i, "records")

    input.close()
    for k in writers:
        writers[k].close()
    for o in outputs:
        o.close()

    if writeMetrics:
        utils.write_metrics(outputBase, {
            '__InputCount__': i,
            '__OutputCount__': written,
            'Splitter': i})

    return filenames
def split(input, informat, fieldName, outputBase):
    """Splits the input into separate files. The name of each file, and the
    file each record is written to, are determined by the fieldName parameter.
    """

    input, suppl = utils.default_open_input(input, informat)

    i = 0
    writers = {}
    outputs = []
    filenames = []
    for mol in suppl:
        i += 1
        if mol is None:
            continue
        # GetProp raises KeyError for a missing field, so skip records that
        # do not carry the property at all.
        if not mol.HasProp(fieldName):
            continue
        value = mol.GetProp(fieldName)
        if value:
            s = str(value)
            if s in writers:
                writer = writers[s]
            else:
                name = outputBase + s
                output, writer = utils.default_open_output_sdf(name, outputBase, False, False)
                filenames.append(name + '.sdf')
                outputs.append(output)
                writers[s] = writer
            writer.write(mol)

    utils.log("Generated", len(writers), "outputs from", i, "records")

    input.close()
    for k in writers:
        writers[k].close()
    for o in outputs:
        o.close()

    return filenames
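# A minimal usage sketch for split(), assuming an SD file whose records carry
# a "Cluster" field (the file and field names here are illustrative):
#
#   filenames = split('input.sdf', 'sdf', 'Cluster', 'split_')
#   # -> e.g. ['split_1.sdf', 'split_2.sdf', ...], one file per distinct value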
def main():
    ### command line args definitions #########################################

    ### Define the reactions available
    poised_filter = True
    if poised_filter:
        from poised_filter import Filter
        filter_to_use = Filter()

    parser = argparse.ArgumentParser(description='RDKit rxn process')
    utils.add_default_io_args(parser)
    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')
    parser.add_argument('-m', '--multi', action='store_true',
                        help='Output one file for each reaction')
    parser.add_argument('-r', '--reaction', choices=filter_to_use.poised_reactions.keys(),
                        help='Name of reaction to be run')
    parser.add_argument('-rl', '--reagent_lib',
                        help="Input SD file, if not defined then STDIN is used")
    parser.add_argument('-rlf', '--reagent_lib_format', choices=['sdf', 'json'],
                        help="Input format. When using STDIN this must be specified.")

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    if not args.output and args.multi:
        raise ValueError("Must specify output location when writing individual result files")

    input, suppl = utils.default_open_input(args.input, args.informat)
    reagent_input, reagent_suppl = utils.default_open_input(
        args.reagent_lib, args.reagent_lib_format)
    output, writer, output_base = utils.default_open_output(
        args.output, "rxn_maker", args.outformat)

    i = 0
    count = 0

    if args.multi:
        dir_base = os.path.dirname(args.output)
        writer_dict = filter_to_use.get_writers(dir_base)
    else:
        writer_dict = None
        dir_base = None

    for mol in suppl:
        i += 1
        if mol is None:
            continue
        # Return a dict/class here - indicating which filters passed
        count = filter_to_use.perform_reaction(mol, args.reaction,
                                               reagent_suppl, writer, count)

    utils.log("Created", count, "molecules from a total of", i, "input molecules")

    writer.flush()
    writer.close()
    if input:
        input.close()
    if output:
        output.close()
    # close the individual writers
    if writer_dict:
        for key in writer_dict:
            writer_dict[key].close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RxnSmartsFilter': count})
def main():
    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='RDKit Butina Cluster Matrix')
    utils.add_default_input_args(parser)
    parser.add_argument('-o', '--output',
                        help="Base name for output file (no extension). If not defined then STDOUT is used for the structures and output is used as base name of the other files.")
    parser.add_argument('-of', '--outformat', choices=['tsv', 'json'], default='tsv',
                        help="Output format. Defaults to 'tsv'.")
    parser.add_argument('--meta', action='store_true',
                        help='Write metadata and metrics files')
    parser.add_argument('-t', '--threshold', type=float, default=0.7,
                        help='Similarity clustering threshold (1.0 means identical)')
    parser.add_argument('-mt', '--matrixThreshold', type=float, default=0.5,
                        help='Threshold for outputting values (1.0 means identical)')
    parser.add_argument('-d', '--descriptor', type=str.lower,
                        choices=list(cluster_butina.descriptors.keys()), default='rdkit',
                        help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m', '--metric', type=str.lower,
                        choices=list(cluster_butina.metrics.keys()), default='tanimoto',
                        help='similarity metric (default tanimoto)')
    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')

    args = parser.parse_args()
    utils.log("Cluster Matrix Args: ", args)

    descriptor = cluster_butina.descriptors[args.descriptor]
    if descriptor is None:
        raise ValueError('Invalid descriptor name ' + args.descriptor)

    input, suppl = utils.default_open_input(args.input, args.informat)

    # handle metadata
    source = "cluster_butina_matrix.py"
    datasetMetaProps = {"source": source,
                        "description": "Butina clustering using RDKit " + rdBase.rdkitVersion}
    clsMappings = {
        "Cluster1": "java.lang.Integer",
        "Cluster2": "java.lang.Integer",
        "ID1": "java.lang.String",
        "ID2": "java.lang.String",
        "M1": "java.lang.String",
        "M2": "java.lang.String",
        "Similarity": "java.lang.Float"}
    fieldMetaProps = [{"fieldName": "Cluster",
                       "values": {"source": source, "description": "Cluster number"}}]

    fieldNames = collections.OrderedDict()
    fieldNames['ID1'] = 'ID1'
    fieldNames['ID2'] = 'ID2'
    fieldNames['Cluster1'] = 'Cluster1'
    fieldNames['Cluster2'] = 'Cluster2'
    fieldNames['Similarity'] = 'Similarity'
    fieldNames['M1'] = 'M1'
    fieldNames['M2'] = 'M2'

    writer, output_base = utils.create_simple_writer(
        args.output, 'cluster_butina_matrix', args.outformat, fieldNames,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps)

    ### generate fingerprints
    mols = [x for x in suppl if x is not None]
    fps = [descriptor(x) for x in mols]
    input.close()

    ### do clustering
    utils.log("Clustering with descriptor", args.descriptor, "metric",
              args.metric, "and threshold", args.threshold)
    clusters, dists, matrix = cluster_butina.ClusterFps(fps, args.metric,
                                                        1.0 - args.threshold)
    utils.log("Found", len(clusters), "clusters")

    MapClusterToMols(clusters, mols)

    if not args.quiet:
        utils.log("Clusters:", clusters)

    writer.writeHeader()

    size = len(matrix)
    count = 0
    for i in range(size):
        writer.write(create_values(mols, i, i, 1.0))
        count += 1
        for j in range(len(matrix[i])):
            dist = matrix[i][j]
            if dist > args.matrixThreshold:
                # the matrix is the lower-left segment without the diagonal
                x = j
                y = i + 1
                writer.write(create_values(mols, x, y, dist))
                writer.write(create_values(mols, y, x, dist))
                count += 2
    writer.write(create_values(mols, size, size, 1.0))
    count += 1

    writer.writeFooter()
    writer.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': len(mols),
            '__OutputCount__': count,
            'RDKitCluster': len(mols)})
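# A hedged sketch of what create_values() presumably returns, inferred from
# the fieldNames/clsMappings declared above. Using the molecule title line for
# the IDs and SMILES for M1/M2 is an assumption, not taken from the source;
# the "Cluster" property is assumed to be set by MapClusterToMols().

from rdkit import Chem

def create_values(mols, x, y, sim):
    # One row of the similarity matrix, keyed by the declared column names.
    return {
        'ID1': mols[x].GetProp('_Name'),
        'ID2': mols[y].GetProp('_Name'),
        'Cluster1': mols[x].GetIntProp('Cluster'),
        'Cluster2': mols[y].GetIntProp('Cluster'),
        'Similarity': sim,
        'M1': Chem.MolToSmiles(mols[x]),
        'M2': Chem.MolToSmiles(mols[y]),
    }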
                        default=10, help='number of conformers to generate')
    parser.add_argument('-r', '--refmol', help="Reference molecule file")
    parser.add_argument('--refmolidx', type=int, default=1,
                        help="Reference molecule index in file")
    parser.add_argument('-c', '--core_smi', default='',
                        help='Core substructure. If not specified - guessed using MCS')

    args = parser.parse_args()

    # Get the reference molecule (format is inferred from the file name)
    ref_mol_input, ref_mol_suppl = utils.default_open_input(args.refmol, None)

    counter = 0
    # Get the specified reference molecule. Default is the first.
    for mol in ref_mol_suppl:
        counter += 1
        if counter == args.refmolidx:
            ref_mol = mol
            break
    ref_mol_input.close()

    if counter < args.refmolidx:
        raise ValueError("Invalid refmolidx. " + str(args.refmolidx) +
                         " was specified but only " + str(counter) +
                         " molecules were present in refmol.")

    # handle metadata
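# When --core_smi is empty the help text says the core is "guessed using MCS".
# A hedged sketch of how that guess could be made with RDKit's rdFMCS module
# (the function name guess_core is illustrative, not from the source):

from rdkit import Chem
from rdkit.Chem import rdFMCS

def guess_core(ref_mol, probe_mol):
    # Find the maximum common substructure of the reference and probe
    # molecules and return it as a query molecule usable as the core.
    mcs = rdFMCS.FindMCS([ref_mol, probe_mol])
    return Chem.MolFromSmarts(mcs.smartsString)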
def main():
    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='RDKit filter')
    parser.add_argument('-f', '--fragment', choices=['hac', 'mw'],
                        help='Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)')
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    parser.add_argument('-l', '--limit', type=int,
                        help='Limit output to this many records')
    parser.add_argument('-c', '--chunksize', type=int,
                        help='Split output into chunks of size c. Output will always be files. Names like filter1.sdf.gz, filter2.sdf.gz ...')
    parser.add_argument('-d', '--digits', type=int, default=0,
                        help='When splitting zero pad the file name to this many digits so that they are in sorted order. Names like filter001.sdf.gz, filter002.sdf.gz ...')
    parser.add_argument('--no-gzip', action='store_true',
                        help='Do not compress the output (STDOUT is never compressed)')
    # WARNING: thin output is not appropriate when using --fragment
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='Quiet mode - suppress reporting reason for filtering')
    utils.add_default_io_args(parser)

    args = parser.parse_args()
    utils.log("Filter Args: ", args)

    input, suppl = utils.default_open_input(args.input, args.informat)

    chunkNum = 1
    if args.chunksize:
        if args.output:
            output_base = args.output
        else:
            output_base = 'filter'
        output_base_chunk = output_base + str(chunkNum).zfill(args.digits)
        output, writer, output_base_chunk = utils.default_open_output(
            output_base_chunk, output_base_chunk, args.outformat,
            compress=not args.no_gzip)
    else:
        output, writer, output_base_chunk = utils.default_open_output(
            args.output, "filter", args.outformat, compress=not args.no_gzip)
        output_base = output_base_chunk

    utils.log("Writing to " + output_base_chunk)

    i = 0
    count = 0
    for mol in suppl:
        if args.limit and count >= args.limit:
            break
        i += 1
        if mol is None:
            continue
        if args.fragment:
            mol = fragment(mol, args.fragment, quiet=args.quiet)
        if not filter(mol, minHac=args.hacmin, maxHac=args.hacmax,
                      minMw=args.mwmin, maxMw=args.mwmax, quiet=args.quiet):
            continue
        if args.chunksize:
            if count > 0 and count % args.chunksize == 0:
                writer.close()
                output.close()
                chunkNum += 1
                output_base_chunk = output_base + str(chunkNum).zfill(args.digits)
                utils.log("Writing to " + output_base_chunk)
                output, writer, output_base_chunk = utils.default_open_output(
                    output_base_chunk, output_base_chunk, args.outformat,
                    compress=not args.no_gzip)
        count += 1
        writer.write(mol)

    utils.log("Filtered", i, "down to", count, "molecules")
    if args.chunksize:
        utils.log("Wrote", chunkNum, "chunks")
        if args.digits > 0 and len(str(chunkNum)) > args.digits:
            utils.log("WARNING: not enough digits specified for the number of chunks")

    writer.flush()
    writer.close()
    input.close()
    output.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitFilter': i})
def main():
    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='RDKit MaxMin Picker')
    parser.add_argument('-t', '--threshold', type=float, default=0.0,
                        help='similarity threshold (1.0 means identical)')
    parser.add_argument('-d', '--descriptor', type=str.lower,
                        choices=list(descriptors.keys()), default='morgan2',
                        help='descriptor or fingerprint type (default morgan2)')
    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')
    parser.add_argument('-n', '--num', type=int,
                        help='maximum number to pick for diverse subset selection')
    parser.add_argument('-s', '--seed-molecules',
                        help='optional file containing any seed molecules that have already been picked')
    parser.add_argument('--fragment-method', choices=['hac', 'mw'], default='hac',
                        help='Approach to find biggest fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)')
    parser.add_argument('--output-fragment', action='store_true',
                        help='Output the biggest fragment rather than the original molecule')
    utils.add_default_io_args(parser)

    args = parser.parse_args()
    utils.log("MaxMinPicker Args: ", args)

    descriptor = descriptors[args.descriptor]
    if descriptor is None:
        raise ValueError('No descriptor specified')
    if not args.num and not args.threshold:
        raise ValueError('--num or --threshold arguments must be specified, or both')

    # handle metadata
    source = "max_min_picker.py"
    datasetMetaProps = {"source": source,
                        "description": "MaxMinPicker using RDKit " + rdBase.rdkitVersion}

    ### generate fingerprints
    fps = []
    mols = []
    errors = 0

    # first the initial seeds, if specified
    firstPicks = []
    num_seeds = 0
    if args.seed_molecules:
        seedsInput, seedsSuppl = utils.default_open_input(args.seed_molecules, None)
        start = time.time()
        errors += mol_utils.fragmentAndFingerprint(
            seedsSuppl, mols, fps, descriptor,
            fragmentMethod=args.fragment_method,
            outputFragment=args.output_fragment,
            quiet=args.quiet)
        end = time.time()
        seedsInput.close()
        num_seeds = len(fps)
        utils.log("Read", len(fps), "fingerprints for seeds in", end - start,
                  "secs,", errors, "errors")
        firstPicks = list(range(num_seeds))

    # now the molecules to pick from
    input, output, suppl, writer, output_base = utils.default_open_input_output(
        args.input, args.informat, args.output, 'cluster_butina', args.outformat,
        datasetMetaProps=datasetMetaProps)

    # reset the mols list as we don't need the seeds, only the candidates
    mols = []
    start = time.time()
    errs = mol_utils.fragmentAndFingerprint(
        suppl, mols, fps, descriptor,
        fragmentMethod=args.fragment_method,
        outputFragment=args.output_fragment,
        quiet=args.quiet)
    end = time.time()
    errors += errs
    input.close()

    num_fps = len(fps)
    num_candidates = num_fps - num_seeds
    utils.log("Read", num_candidates, "fingerprints for candidates in",
              end - start, "secs,", errs, "errors")

    if not args.num:
        num_to_pick = num_candidates
    elif args.num > num_candidates:
        num_to_pick = num_candidates
        utils.log("WARNING: --num argument (", args.num,
                  ") is larger than the total number of candidates (",
                  num_candidates, ") - resetting to", num_candidates)
    else:
        num_to_pick = args.num

    ### do picking
    utils.log("MaxMinPicking with descriptor", args.descriptor, "and threshold",
              args.threshold, ",", num_seeds, "seeds,", num_candidates,
              "candidates,", num_fps, "total")
    start = time.time()
    picks, thresh = performPick(fps, num_to_pick + num_seeds, args.threshold, firstPicks)
    end = time.time()
    num_picks = len(picks)

    utils.log("Found", num_picks, "molecules in", end - start,
              "secs, final threshold", thresh)
    utils.log("Picks:", list(picks[num_seeds:]))
    del fps

    # We want to return the results in the order they were in the input, so
    # first record the position of each pick in the pick list ...
    indices = {}
    i = 0
    for idx in picks[num_seeds:]:
        indices[idx] = i
        i += 1
    # ... then sort the picks into input order ...
    sorted_picks = sorted(picks[num_seeds:])
    # ... and write the mols in that order, recording the position in the
    # pick list as the PickIndex property.
    i = 0
    for idx in sorted_picks:
        mol = mols[idx - num_seeds]  # mols array only contains the candidates
        mol.SetIntProp("PickIndex", indices[idx] + 1)
        writer.write(mol)
        i += 1

    utils.log("Output", i, "molecules")
    writer.flush()
    writer.close()
    output.close()

    if args.meta:
        metrics = {}
        status_str = "{} compounds picked. Final threshold was {}.".format(i, thresh)
        if errors > 0:
            metrics['__ErrorCount__'] = errors
            status_str = status_str + " {} errors.".format(errors)
        metrics['__StatusMessage__'] = status_str
        metrics['__InputCount__'] = num_fps
        metrics['__OutputCount__'] = i
        metrics['RDKitMaxMinPicker'] = num_picks
        utils.write_metrics(output_base, metrics)
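# A hedged sketch of performPick() built on RDKit's MaxMinPicker, matching the
# call signature and the (picks, threshold) return seen above. This is an
# assumption about the implementation, not taken from the source; the
# LazyBitVectorPick* methods are real RDKit APIs, and the conversion between
# the similarity threshold and the picker's distance threshold (1 - similarity)
# is also an assumption.

from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker

def performPick(fps, num_to_pick, threshold, firstPicks):
    picker = MaxMinPicker()
    if threshold:
        # Stop picking once the best remaining candidate is closer than the
        # distance threshold; returns the picks and the final distance.
        picks, dist = picker.LazyBitVectorPickWithThreshold(
            fps, len(fps), num_to_pick, 1.0 - threshold, firstPicks=firstPicks)
        return picks, 1.0 - dist
    else:
        picks = picker.LazyBitVectorPick(fps, len(fps), num_to_pick,
                                         firstPicks=firstPicks)
        return picks, None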
def main():
    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='RDKit filter')
    parser.add_argument('-f', '--fragment', choices=['hac', 'mw'],
                        help='Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)')
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    parser.add_argument('-l', '--limit', type=int,
                        help='Limit output to this many records')
    parser.add_argument('-c', '--chunksize', type=int,
                        help='Split output into chunks of size c. Output will always be files. Names like filter01.sdf.gz, filter02.sdf.gz ...')
    # WARNING: thin output is not appropriate when using --fragment
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='Quiet mode - suppress reporting reason for filtering')
    utils.add_default_io_args(parser)

    args = parser.parse_args()
    utils.log("Filter Args: ", args)

    input, suppl = utils.default_open_input(args.input, args.informat)

    if args.output:
        output_base = args.output
    else:
        output_base = 'filter'

    chunkNum = 1
    if args.chunksize:
        output = gzip.open(output_base + str(chunkNum) + '.sdf.gz', 'w+')
    elif args.output:
        output = gzip.open(output_base + '.sdf.gz', 'w+')
    else:
        output = sys.stdout
    writer = Chem.SDWriter(output)

    i = 0
    count = 0
    for mol in suppl:
        if args.limit and count >= args.limit:
            break
        i += 1
        if mol is None:
            continue
        if args.fragment:
            mol = fragment(mol, args.fragment, quiet=args.quiet)
        if not filter(mol, minHac=args.hacmin, maxHac=args.hacmax,
                      minMw=args.mwmin, maxMw=args.mwmax, quiet=args.quiet):
            continue
        if args.chunksize:
            if count > 0 and count % args.chunksize == 0:
                writer.close()
                output.close()
                chunkNum += 1
                output = gzip.open(output_base + str(chunkNum) + '.sdf.gz', 'w+')
                writer = Chem.SDWriter(output)
        count += 1
        writer.write(mol)

    utils.log("Filtered", i, "down to", count, "molecules")
    if args.chunksize:
        utils.log("Wrote", chunkNum, "chunks")

    writer.flush()
    writer.close()
    input.close()
    output.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitFilter': i})
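# Hypothetical invocation of the filter (the script and file names are
# illustrative, not taken from the source):
#
#   python filter.py -i input.sdf.gz --hacmin 10 --hacmax 50 --mwmax 500 \
#       -c 1000 -o filtered
#
# With -c/--chunksize the output is always written to files, named
# filtered1.sdf.gz, filtered2.sdf.gz, ... of at most 1000 records each.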