def enumerateMol(mol, fragment):
    """
    Enumerate a single molecule.

    :param mol: The molecule to enumerate
    :param fragment: The fragmentation method, 'hac' or 'mw'. If not specified the whole
        molecule is passed to Dimorphite
    :return: The result of run_with_mol_list() for a one-element list holding the
        (optionally fragmented) molecule
    """
    # Only reduce to a single fragment when a fragmentation method was requested.
    target = mol_utils.fragment(mol, fragment) if fragment else mol
    # run_with_mol_list() works on lists, so wrap the single molecule.
    return run_with_mol_list([target])
def standardize(mol, neutralize, fragment):
    """
    Standardize a molecule: clean it up, then optionally pick a single fragment and
    optionally neutralize it (in that order).

    :param mol: The molecule to standardize
    :param neutralize: Boolean for whether to neutralize the molecule
    :param fragment: The approach for choosing the largest fragment. Either 'hac' or 'mw'.
        If not specified the whole molecule is used.
    :return: The standardized molecule
    """
    cleaned = rdMolStandardize.Cleanup(mol)

    # We use our own largest fragment picker as the RDKit one behaves slightly differently
    if fragment:
        cleaned = mol_utils.fragment(cleaned, fragment)

    return uncharger.uncharge(cleaned) if neutralize else cleaned
def main():
    """
    Command line entry point: screen input molecules against multiple query structures.

    The query structures are read from a SMILES, SDF or MoleculeObject JSON file and a
    descriptor is generated for each one that could be parsed. Every input molecule is
    (optionally) reduced to a single fragment, filtered by heavy atom count / mol weight
    and then compared with every query. For each query whose similarity lies within
    [simmin, simmax] a per-query score property is set on the molecule. Molecules with at
    least one such hit also receive the "_BestScore", "_BestName" (or "_BestIndex" when
    the best query has no name) and "_Count" properties and are written to the output.

    :return: The number of molecules written
    :raises ValueError: If none of --qsmiles, --qsdf or --qjson is specified
    """

    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='RDKit screen')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--qsmiles',
        help='filename of query structures as smiles (incompatible with --qsdf and --qjson args)')
    group.add_argument(
        '--qsdf',
        help='filename of query structures as sdfile (incompatible with --qsmiles and --qjson args)')
    group.add_argument(
        '--qjson',
        help='filename of query structures as MoleculeObject JSON (incompatible with --qsmiles and --qsdf args)')
    parser.add_argument('--qsmilesTitleLine', action='store_true',
                        help='the smiles file has a title line')
    parser.add_argument('--qsmilesDelimiter', default='\t',
                        help='delimiter for smiles file (default is tab)')
    parser.add_argument('--qsmilesColumn', type=int, default=0,
                        help='column in smiles file with the smiles (default is first column)')
    parser.add_argument('--qsmilesNameColumn', type=int, default=1,
                        help='column in smiles file with ID (default is second column)')
    parser.add_argument(
        '--qprop',
        help='property name in query molecules to report. If not defined (or property is not present) ' +
             'then name property is not written. JSON format uses the UUID as default')

    parser.add_argument('--simmin', type=float, default=0.7,
                        help='similarity lower cutoff (1.0 means identical)')
    parser.add_argument('--simmax', type=float, default=1.0,
                        help='similarity upper cutoff (1.0 means identical)')
    parser.add_argument('-d', '--descriptor', type=str.lower, choices=list(descriptors.keys()),
                        default='rdkit', help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m', '--metric', type=str.lower, choices=list(metrics.keys()),
                        default='tanimoto', help='similarity metric (default tanimoto)')
    parser.add_argument(
        '-f', '--fragment', choices=['hac', 'mw'],
        help='Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight )')
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    descriptor = descriptors[args.descriptor.lower()]
    metric = metrics[args.metric.lower()]

    propName = args.qprop
    if args.qsmiles:
        queryMolsupplier = rdkit_utils.default_open_input_smiles(
            args.qsmiles,
            delimiter=args.qsmilesDelimiter,
            smilesColumn=args.qsmilesColumn,
            nameColumn=args.qsmilesNameColumn,
            titleLine=args.qsmilesTitleLine)
        queryInput = None
    elif args.qsdf:
        queryInput, queryMolsupplier = rdkit_utils.default_open_input_sdf(args.qsdf)
    elif args.qjson:
        queryInput, queryMolsupplier = rdkit_utils.default_open_input_json(args.qjson, lazy=False)
        if not propName:
            propName = "uuid"
    else:
        raise ValueError('No query structure specified')

    # One (index, fingerprint, name) entry per successfully parsed query. The index is
    # 1-based and counts only the parsed queries. The name is looked up once here rather
    # than once per target molecule, and is None when no property name is in use or the
    # query does not carry that property.
    queries = []
    utils.log("Preparing query fingerprints")
    count = 0
    try:
        for q in queryMolsupplier:
            count += 1
            if q:
                if propName and q.HasProp(propName):
                    name = str(q.GetProp(propName))
                else:
                    name = None
                queries.append((len(queries) + 1, descriptor(q), name))
            else:
                utils.log("WARNING: Failed to parse Molecule", count)
    finally:
        if queryInput:
            queryInput.close()

    source, output, suppl, writer, output_base = rdkit_utils.default_open_input_output(
        args.input, args.informat, args.output, 'screen_multi', args.outformat)

    i = 0
    count = 0
    try:
        for mol in suppl:
            i += 1
            if mol is None:
                continue
            if args.fragment:
                mol = mol_utils.fragment(mol, args.fragment, quiet=args.quiet)
            if not filter.filter(mol, minHac=args.hacmin, maxHac=args.hacmax,
                                 minMw=args.mwmin, maxMw=args.mwmax, quiet=args.quiet):
                continue
            targetFp = descriptor(mol)

            hits = 0
            # The best-hit trackers are reset for every target molecule. Starting the score
            # at None (rather than 0) guarantees that the first hit always becomes the best
            # one, so bestIdx is defined whenever hits > 0 - even for a similarity of 0.
            bestScore = None
            bestIdx = None
            bestName = None
            for idx, queryFp, name in queries:
                sim = metric(queryFp, targetFp)
                if args.simmin <= sim <= args.simmax:
                    hits += 1
                    if not args.quiet:
                        utils.log(i, idx, sim)
                    if bestScore is None or sim > bestScore:
                        bestScore = sim
                        bestIdx = idx
                        # Always track the name of the current best query so that the
                        # reported name cannot belong to an earlier, lower scoring query.
                        bestName = name
                    if name:
                        mol.SetDoubleProp(field_Similarity + "_" + name, sim)
                    else:
                        mol.SetDoubleProp(field_Similarity + "_" + str(idx) + "_Score", sim)

            if hits > 0:
                count += 1
                mol.SetDoubleProp(field_Similarity + "_BestScore", bestScore)
                if bestName:
                    mol.SetProp(field_Similarity + "_BestName", bestName)
                else:
                    mol.SetIntProp(field_Similarity + "_BestIndex", bestIdx)
                mol.SetIntProp(field_Similarity + "_Count", hits)
                writer.write(mol)

        utils.log("Found", count, "similar molecules")
    finally:
        # Release the files even if processing failed part-way through.
        writer.flush()
        writer.close()
        source.close()
        output.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitScreen': count
        })

    return count
def main():
    """
    Command line entry point: screen input molecules against a single query structure.

    The query is given either as a SMILES string (--qsmiles) or as the name of a molfile
    (--qmolfile). Every input molecule is (optionally) reduced to a single fragment,
    filtered by heavy atom count / mol weight and compared with the query. Molecules whose
    similarity lies within [simmin, simmax] get the similarity set as the field_Similarity
    property and are written to the output.

    :raises ValueError: If no query structure is specified or it cannot be parsed
    """

    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='RDKit screen')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--qsmiles',
        help='query structure as smiles (incompatible with --qmolfile arg)')
    group.add_argument(
        '--qmolfile',
        help='query structure as filename in molfile format (incompatible with --qsmiles arg)')
    parser.add_argument('--simmin', type=float, default=0.7,
                        help='similarity lower cutoff (1.0 means identical)')
    parser.add_argument('--simmax', type=float, default=1.0,
                        help='similarity upper cutoff (1.0 means identical)')
    parser.add_argument('-d', '--descriptor', type=str.lower, choices=list(descriptors.keys()),
                        default='rdkit', help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m', '--metric', type=str.lower, choices=list(metrics.keys()),
                        default='tanimoto', help='similarity metric (default tanimoto)')
    parser.add_argument(
        '-f', '--fragment', choices=['hac', 'mw'],
        help='Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight )')
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    descriptor = descriptors[args.descriptor.lower()]
    metric = metrics[args.metric.lower()]

    if args.qsmiles:
        query_rdkitmol = Chem.MolFromSmiles(args.qsmiles)
    elif args.qmolfile:
        query_rdkitmol = Chem.MolFromMolFile(args.qmolfile)
    else:
        raise ValueError('No query structure specified')

    # The RDKit parsers return None rather than raising when the structure is invalid.
    # Fail here with a clear message instead of inside the descriptor function.
    if query_rdkitmol is None:
        raise ValueError('Query structure could not be parsed')

    query_fp = descriptor(query_rdkitmol)

    source, output, suppl, writer, output_base = rdkit_utils.default_open_input_output(
        args.input, args.informat, args.output, 'screen', args.outformat,
        thinOutput=args.thin)

    i = 0
    count = 0
    try:
        for mol in suppl:
            i += 1
            if mol is None:
                continue
            if args.fragment:
                mol = mol_utils.fragment(mol, args.fragment, quiet=args.quiet)
            if not filter.filter(mol, minHac=args.hacmin, maxHac=args.hacmax,
                                 minMw=args.mwmin, maxMw=args.mwmax, quiet=args.quiet):
                continue
            target_fp = descriptor(mol)
            sim = metric(query_fp, target_fp)

            if args.simmin <= sim <= args.simmax:
                count += 1
                if not args.quiet:
                    utils.log(i, sim)
                mol.SetDoubleProp(field_Similarity, sim)
                writer.write(mol)

        utils.log("Found", count, "similar molecules")
    finally:
        # Release the files even if processing failed part-way through.
        writer.flush()
        writer.close()
        source.close()
        output.close()

    if args.meta:
        # NOTE(review): 'RDKitScreen' is reported as the number of molecules read (i),
        # not the number of hits (count) - confirm this is the intended usage metric.
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitScreen': i
        })
def _parse_boolean(text):
    """Interpret a transformed field value as a boolean: empty and false-like text is False."""
    # bool() on a string is True for anything non-empty, including "false" and "0".
    return text.strip().lower() not in ('', 'false', 'f', 'no', 'n', '0')


def main():
    """
    Command line entry point: filter molecules and optionally rename, delete or transform
    their fields.

    Every input molecule is (optionally) reduced to a single fragment and passed to
    filter() together with the heavy atom count, mol weight, rotatable bond and logP
    limits. Molecules that pass have their fields renamed / deleted, then transformed
    (in that order), and are written to the output. With --chunksize the output is split
    into numbered files; with --limit processing stops after that many molecules have
    been written.

    :raises ValueError: If a --rename or --transform argument is malformed
    """

    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='RDKit filter')
    parser.add_argument(
        '-f', '--fragment', choices=['hac', 'mw'],
        help='Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)')
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    parser.add_argument('--rotbmin', type=float, help='Min rotatable bond count')
    parser.add_argument('--rotbmax', type=float, help='Max rotatable bond count')
    parser.add_argument('--logpmin', type=float, help='Min logP')
    parser.add_argument('--logpmax', type=float, help='Max logP')
    parser.add_argument('-l', '--limit', type=int, help='Limit output to this many records')
    parser.add_argument(
        '-c', '--chunksize', type=int,
        help='Split output into chunks of size c. Output will always be files. Names like filter1.sdf.gz, filter2.sdf.gz ...')
    parser.add_argument(
        '-d', '--digits', type=int, default=0,
        help='When splitting zero pad the file name to this many digits so that they are in sorted order. Names like filter001.sdf.gz, filter002.sdf.gz ...')
    parser.add_argument('-r', '--rename', action='append',
                        help='Rename field (fromname:toname)')
    parser.add_argument(
        '-t', '--transform', action='append',
        help='Transform field value(fieldname:regex:type). ' +
             'Regex is in the form of /regex/substitution/ (the 3 slashes are required). ' +
             'Type is of int, float, boolean or string. The type is optional and if not specified then string is assumed. ' +
             'Transformation occurs after field renaming so specify the new name.')
    parser.add_argument('--delete', action='append', help='Delete field')
    parser.add_argument('--no-gzip', action='store_true',
                        help='Do not compress the output (STDOUT is never compressed)')
    # WARNING: thin output is not appropriate when using --fragment
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='Quiet mode - suppress reporting reason for filtering')
    parameter_utils.add_default_io_args(parser)

    args = parser.parse_args()
    utils.log("Filter Args: ", args)

    # Maps an existing field name to its new name. A value of None means "delete the field".
    field_renames = {}
    if args.rename:
        for rename in args.rename:
            parts = rename.split(':')
            if len(parts) != 2:
                raise ValueError('Invalid field rename argument:', rename)
            field_renames[parts[0]] = parts[1]
    if args.delete:
        for fieldname in args.delete:
            field_renames[fieldname] = None

    # Per-field transform definitions, all keyed by field name.
    field_regexes = {}
    field_replacements = {}
    field_types = {}
    if args.transform:
        for transform in args.transform:
            parts = transform.split(':')
            if len(parts) < 2 or len(parts) > 3:
                raise ValueError('Invalid field transform argument:', transform)
            # "/regex/substitution/" splits into ['', regex, substitution, ''].
            terms = parts[1].split('/')
            if len(terms) < 3:
                raise ValueError('Invalid field transform argument:', transform)
            utils.log("|".join(terms) + str(len(terms)))
            field_regexes[parts[0]] = re.compile(terms[1])
            field_replacements[parts[0]] = terms[2]
            field_type = parts[2] if len(parts) == 3 else 'string'
            field_types[parts[0]] = field_type
            utils.log("Created transform of " + terms[1] + " to " + terms[2] +
                      " using type of " + field_type)

    source, suppl = rdkit_utils.default_open_input(args.input, args.informat)

    if args.chunksize:
        chunkNum = 1
        if args.output:
            output_base = args.output
        else:
            output_base = 'filter'
        output_base_chunk = output_base + str(chunkNum).zfill(args.digits)
        output, writer, output_base_chunk = rdkit_utils.default_open_output(
            output_base_chunk, output_base_chunk, args.outformat,
            thinOutput=args.thin, compress=not args.no_gzip)
    else:
        output, writer, output_base_chunk = rdkit_utils.default_open_output(
            args.output, "filter", args.outformat,
            thinOutput=args.thin, compress=not args.no_gzip)
        output_base = output_base_chunk

    utils.log("Writing to " + output_base_chunk)

    i = 0
    count = 0
    chunkNum = 1
    try:
        for mol in suppl:
            if args.limit and count >= args.limit:
                break
            i += 1
            if mol is None:
                continue
            if args.fragment:
                mol = mol_utils.fragment(mol, args.fragment, quiet=args.quiet)
            if not filter(mol,
                          minHac=args.hacmin, maxHac=args.hacmax,
                          minMw=args.mwmin, maxMw=args.mwmax,
                          minRotb=args.rotbmin, maxRotb=args.rotbmax,
                          minLogp=args.logpmin, maxLogp=args.logpmax,
                          quiet=args.quiet):
                continue

            if args.chunksize:
                if count > 0 and count % args.chunksize == 0:
                    # new chunk, so create new writer
                    writer.close()
                    output.close()
                    chunkNum += 1
                    output_chunk_base = output_base + str(chunkNum).zfill(args.digits)
                    utils.log("Writing to " + output_chunk_base)
                    output, writer, output_chunk_base = rdkit_utils.default_open_output(
                        output_chunk_base, output_chunk_base, args.outformat,
                        thinOutput=args.thin, compress=not args.no_gzip)

            # Renames and deletions are applied first ...
            for from_name, to_name in field_renames.items():
                if mol.HasProp(from_name):
                    val = mol.GetProp(from_name)
                    mol.ClearProp(from_name)
                    if to_name:
                        mol.SetProp(to_name, val)

            # ... then the transforms, which therefore refer to the new field names.
            for fieldname, regex in field_regexes.items():
                # GetProp() fails for a field the molecule does not have, so molecules
                # without the field are left untouched.
                if not mol.HasProp(fieldname):
                    continue
                transformed = regex.sub(field_replacements[fieldname], mol.GetProp(fieldname))
                field_type = field_types[fieldname]
                if field_type == 'int':
                    mol.SetIntProp(fieldname, int(transformed))
                elif field_type == 'float':
                    mol.SetDoubleProp(fieldname, float(transformed))
                elif field_type == 'boolean':
                    mol.SetBoolProp(fieldname, _parse_boolean(transformed))
                else:
                    mol.SetProp(fieldname, transformed)

            count += 1
            writer.write(mol)

        utils.log("Filtered", i, "down to", count, "molecules")
        if args.chunksize:
            utils.log("Wrote", chunkNum, "chunks")
            if (args.digits > 0 and len(str(chunkNum)) > args.digits):
                utils.log("WARNING: not enough digits specified for the number of chunks")
    finally:
        # Release the files even if processing failed part-way through.
        writer.flush()
        writer.close()
        source.close()
        output.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitFilter': i
        })