def doO3Dalign(i, mol, qmol, threshold, perfect_score, writer, conformerProps=None, minEnergy=None):
    """Align each conformer of ``mol`` onto ``qmol`` with Open3DAlign and write the best one.

    :param i: molecule index, used only in the log line
    :param mol: probe molecule whose conformers are aligned
    :param qmol: query (reference) molecule
    :param threshold: maximum allowed ``perfect_score - best score``; a falsy value
        disables the filter
    :param perfect_score: score of the query aligned onto itself
    :param writer: object providing ``write(mol, confId=...)``
    :param conformerProps: optional mapping of conformer id to a dict of properties
    :param minEnergy: optional lowest conformer energy used to derive the energy delta
    :return: 1 if the molecule was written, otherwise 0
    """
    best_score = None
    best_align = None
    conf_id = -1
    for j, pyO3A in enumerate(rdMolAlign.GetO3AForProbeConfs(mol, qmol)):
        # Align() is called for every conformer (as before) so the conformer that is
        # eventually written has been aligned
        align = pyO3A.Align()
        score = pyO3A.Score()
        if best_score is None or score > best_score:
            best_score = score
            best_align = align
            conf_id = j

    if best_score is None:
        # no conformers were available, so there is nothing to score or write
        return 0

    if threshold and perfect_score - best_score >= threshold:
        return 0

    # report and store the values of the best conformer, not of the last one iterated
    utils.log(i, best_align, best_score, Chem.MolToSmiles(mol, isomericSmiles=True))
    mol.SetDoubleProp(field_O3DAScore, best_score)
    if conformerProps and minEnergy:
        eAbs = conformerProps[conf_id][(conformers.field_EnergyAbs)]
        if eAbs:
            mol.SetDoubleProp(conformers.field_EnergyAbs, eAbs)
        if eAbs is not None:
            # only compute the delta when an energy is actually present
            eDelta = eAbs - minEnergy
            if eDelta:
                mol.SetDoubleProp(conformers.field_EnergyDelta, eDelta)
    writer.write(mol, confId=conf_id)
    return 1
def fragment(mol, mode, quiet=False):
    """Return the biggest fragment of ``mol``, carrying over the parent's properties.

    :param mol: molecule that may consist of several disconnected fragments
    :param mode: 'hac' to pick by heavy atom count, 'mw' to pick by molecular weight
    :param quiet: when True the choice is not logged
    :return: ``mol`` itself if it does not have several fragments, otherwise the
        chosen fragment
    :raises ValueError: if ``mode`` is not 'hac' or 'mw' (only checked when there is
        more than one fragment)
    """
    frags = Chem.GetMolFrags(mol, asMols=True)
    if len(frags) <= 1:
        return mol

    # TODO - handle ties (max() keeps the first of equally sized fragments)
    if mode == 'hac':
        biggest_mol = max(frags, key=lambda frag: frag.GetNumHeavyAtoms())
        basis = "based on HAC"
    elif mode == 'mw':
        biggest_mol = max(frags, key=Descriptors.MolWt)
        basis = "based on MW"
    else:
        raise ValueError('Invalid fragment mode: ' + str(mode))
    if not quiet:
        utils.log("Chose fragment from ", len(frags), basis)

    # copy the properties across
    for name in mol.GetPropNames():
        biggest_mol.SetProp(name, mol.GetProp(name))
    return biggest_mol
def main():
    """Command line entry point of the splitter: parse the arguments and run split()."""
    cli = argparse.ArgumentParser(description='RDKit Input Splitter')
    utils.add_default_input_args(cli)
    cli.add_argument(
        '-o', '--output',
        required=True,
        help="Directory name for output files (no extension).")
    cli.add_argument(
        '-f', '--field',
        required=True,
        help="field to use to split input. Output files will have the name of this field's value")
    cli.add_argument(
        '--meta',
        action='store_true',
        help='Write metadata and metrics files')

    args = cli.parse_args()
    utils.log("Splitter Args: ", args)

    generated = split(args.input, args.informat, args.field, args.output, args.meta)
    utils.log("Files generated:", " ".join(generated))
def main():
    """Command line entry point: embed each input molecule and annotate it with PBF values.

    Each molecule that embeds successfully gets a "distance" property (from PBFRD)
    and one "angle_<n>" property per value returned by PBFev. All annotated
    molecules are written once the input has been consumed.
    """
    ### command line args definitions #########################################
    parser = argparse.ArgumentParser(
        description='Calculate plane of best fit for molecules')
    utils.add_default_io_args(parser)
    args = parser.parse_args()
    utils.log("PBFEV args: ", args)

    input, output, suppl, writer, output_base = utils.default_open_input_output(
        args.input, args.informat, args.output, 'PBFEV', args.outformat)

    i = 0
    count = 0
    errors = 0
    out_results = []
    for mol in suppl:
        i += 1
        # the None check must come first: an unparseable record cannot be embedded
        if mol is None:
            continue
        # EmbedMolecule returns -1 when no conformer could be generated
        if AllChem.EmbedMolecule(mol) < 0:
            errors += 1
            utils.log("Failed to embed molecule", i)
            continue
        out_vector = PBFev(mol)
        if out_vector is None:
            continue
        rd = PBFRD(mol)
        mol.SetDoubleProp("distance", rd)
        for j, angle in enumerate(out_vector):
            mol.SetDoubleProp("angle" + "_" + str(j), angle)
        out_results.append(mol)

    count = write_out(out_results, count, writer, args.outformat)
    utils.log("Handled " + str(i) + " molecules, resulting in " + str(count) +
              " outputs")
    writer.flush()
    writer.close()
    input.close()
    output.close()
def main():
    """Command line entry point of the Tmax/Cmax simulation: parse arguments and plot."""
    ### command line args definitions ##################################
    parser = argparse.ArgumentParser(description='Tmax/Cmax simulation')
    parser.add_argument('--half-life', type=float, required=True, help='half life (hours)')
    parser.add_argument('--absorption', type=float, required=True, help='half life absorption (hours)')
    parser.add_argument('--dose', type=float, required=True, help='initial dose (mg)')
    parser.add_argument('--auc', type=float, required=True, help='AUC (mg/L*hr)')
    parser.add_argument('--time', type=float, required=True, help='time (h)')
    parser.add_argument('--plot-height', type=int, default=4, help='plot height')
    parser.add_argument('--plot-width', type=int, default=10, help='plot width')
    parser.add_argument('--font-size', type=int, default=12, help='font size')
    parser.add_argument('-o', '--output', type=str, default='cmax.png', help='output file name')
    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')

    args = parser.parse_args()
    utils.log("Tmax/Cmax simulation Args: ", args)

    ### execute #######################################################
    # forward --quiet, which was previously parsed but silently ignored
    generatePlot(args.half_life, args.absorption, args.dose, args.auc, args.time,
                 quiet=args.quiet,
                 plot_width=args.plot_width,
                 plot_height=args.plot_height,
                 font_size=args.font_size,
                 filename=args.output)
def main():
    """Command line entry point of the SMoG2016 scoring run.

    Sets the module globals WRITER, THRESHOLD and PDB_PATH that run_dock reads,
    scores every input molecule and optionally writes the metrics file.
    """
    global WRITER, THRESHOLD
    global PDB_PATH
    parser = argparse.ArgumentParser(
        description='SMoG2016 - Docking calculation.')
    utils.add_default_io_args(parser)
    parser.add_argument(
        '--no-gzip',
        action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    parser.add_argument('-pdb', '--pdb_file', help="PDB file for scoring")
    parser.add_argument('-t', '--threshold',
                        help="The maximum score to allow",
                        default=None)
    parser.add_argument('--thin', action='store_true', help='Thin output mode')

    args = parser.parse_args()
    utils.log("SMoG2016 Args: ", args)

    if not args.pdb_file:
        # fail with a usage message instead of a TypeError from shutil.copy(None, ...)
        parser.error("a PDB file must be specified with -pdb/--pdb_file")

    smog_path = "/usr/local/SMoG2016_Rev1/"
    THRESHOLD = float(args.threshold) if args.threshold else None

    PDB_PATH = "/tmp/pdb_file.pdb"
    # Now copy it to prot_pdb.pdb -> silly SMOG bug requires underscore in the filename!
    shutil.copy(args.pdb_file, PDB_PATH)

    # Open up the input file
    input, suppl = utils.default_open_input(args.input, args.informat)
    # Open the output file
    output, WRITER, output_base = utils.default_open_output(
        args.output, "SMoG2016", args.outformat, compress=not args.no_gzip)

    # Cd to the route of the action
    # TODO - can this be done without changing dir? It gives problems in finding the input files and in writing the metrics
    cwd = os.getcwd()
    os.chdir(smog_path)
    # TODO - restore parallel processing, but need to ensure the order of molecules is preserved
    pool = ThreadPool(1)
    try:
        # Iterate over the molecules
        pool.map(run_dock, suppl)
    finally:
        # release the worker, the files and the working directory even if scoring fails
        pool.close()
        pool.join()
        WRITER.close()
        input.close()
        os.chdir(cwd)

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': COUNTER,
                '__OutputCount__': SUCCESS,
                'SMoG2016': COUNTER
            })

    utils.log("SMoG2016 complete")
def main():
    """Command line entry point of the PLI scoring run.

    Sets the module globals PDB_PATH, WRITER and THRESHOLD that run_dock reads,
    scores every input molecule on a thread pool and optionally writes metrics.
    """
    global PDB_PATH, WRITER, THRESHOLD
    parser = argparse.ArgumentParser(
        description='SMoG2016 - Docking calculation.')
    utils.add_default_io_args(parser)
    parser.add_argument(
        '--no-gzip',
        action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    parser.add_argument('-pdb', '--pdb_file', help="PDB file for scoring")
    parser.add_argument('-t', '--threshold',
                        type=float,
                        help="The maximum score to allow",
                        default=None)
    parser.add_argument(
        '--threads',
        type=int,
        help="Number of threads to used. Default is the number of cores",
        default=None)
    parser.add_argument('--thin', action='store_true', help='Thin output mode')

    args = parser.parse_args()
    utils.log("PLI Args: ", args)

    if not args.pdb_file:
        # without a protein file the scoring command cannot be built
        parser.error("a PDB file must be specified with -pdb/--pdb_file")

    # Open up the input file
    input, suppl = utils.default_open_input(args.input, args.informat)
    # Open the output file
    output, WRITER, output_base = utils.default_open_output(
        args.output, "plip", args.outformat,
        compress=not args.no_gzip, thinOutput=args.thin)

    PDB_PATH = args.pdb_file
    # compare against None so that an explicit threshold of 0.0 is honoured
    if args.threshold is not None:
        THRESHOLD = args.threshold

    # Iterate over the molecules
    # WARNING - if using parallel processing the order of molecules is not preserved. Set args.threads to 1 to ensure this.
    if args.threads is not None:
        num_threads = args.threads
    else:
        num_threads = multiprocessing.cpu_count()
    pool = ThreadPool(num_threads)
    try:
        pool.map(run_dock, suppl)
    finally:
        # release the workers and the files even if scoring fails
        pool.close()
        pool.join()
        WRITER.close()
        input.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': COUNTER,
            '__OutputCount__': SUCCESS,
            'PLI': COUNTER
        })
def filter_by_molwt(mol, minMw, maxMw, quiet=False):
    """Tell whether the molecular weight of ``mol`` lies within the given bounds.

    :param mol: molecule to test
    :param minMw: lower bound, or None for no lower bound
    :param maxMw: upper bound, or None for no upper bound
    :param quiet: when True a rejection is not logged
    :return: False if the weight is below ``minMw`` or above ``maxMw``, else True
    """
    weight = Descriptors.MolWt(mol)

    below_minimum = minMw is not None and weight < minMw
    if below_minimum:
        if not quiet:
            utils.log("MolWt", weight, "<", minMw)
        return False

    above_maximum = maxMw is not None and weight > maxMw
    if above_maximum:
        if not quiet:
            utils.log("MolWt", weight, ">", maxMw)
        return False

    return True
def filter_by_heavy_atom_count(mol, minCount, maxCount, quiet=False):
    """Tell whether the heavy atom count of ``mol`` lies within the given bounds.

    :param mol: molecule to test; must provide ``GetNumHeavyAtoms()``
    :param minCount: lower bound, or None for no lower bound
    :param maxCount: upper bound, or None for no upper bound
    :param quiet: when True a rejection is not logged
    :return: False if the count is below ``minCount`` or above ``maxCount``, else True
    """
    atom_count = mol.GetNumHeavyAtoms()

    below_minimum = minCount is not None and atom_count < minCount
    if below_minimum:
        if not quiet:
            utils.log("HAC", atom_count, "<", minCount)
        return False

    above_maximum = maxCount is not None and atom_count > maxCount
    if above_maximum:
        if not quiet:
            utils.log("HAC", atom_count, ">", maxCount)
        return False

    return True
def SelectDiverseSubset(mols, clusters, distances, count, field, maximise, score, quiet):
    """Pick up to ``count`` molecules by visiting the clusters round-robin.

    Within a cluster the first candidate is always picked; later candidates are
    picked only if GetClosestDistance to the cluster's existing picks is below
    ``score``. When ``field`` is given, only members having that property are
    candidates and they are ordered by FetchScore.

    :param mols: list of molecules; cluster members are indexes into it
    :param clusters: sequence of clusters, each a sequence of molecule indexes
    :param distances: distance data handed to GetClosestDistance
    :param count: maximum number of molecules to pick
    :param field: optional property name used to filter and order candidates
    :param maximise: passed through to FetchScore
    :param score: a candidate is picked only if its closest distance is below this
    :param quiet: when True the per-molecule decisions are not logged
    :return: list with one list of picked molecule indexes per cluster
    """
    from collections import deque  # O(1) removal from the front of each candidate queue

    total = len(mols)
    num_clusters = len(clusters)
    pickedList = [[] for _ in range(num_clusters)]
    clustersList = []
    for members in clusters:
        if field:
            candidates = sorted(
                (x for x in members if mols[x].HasProp(field)),
                key=lambda idx: FetchScore(idx, mols, field, maximise))
        else:
            candidates = members
        clustersList.append(deque(candidates))

    totalIter = 0
    pickedCount = 0
    # "num_clusters and" avoids a modulo-by-zero when there are no clusters
    while num_clusters and totalIter < total and pickedCount < count:
        clusterNum = totalIter % num_clusters
        clus = clustersList[clusterNum]
        pick = pickedList[clusterNum]
        # an empty queue means the cluster has been exhausted; the iteration still counts
        if clus:
            # remove that item from the cluster so that it's not tried again
            molIndex = clus.popleft()
            if not pick:
                # first time for this cluster
                pick.append(molIndex)
                pickedCount += 1
                if not quiet:
                    utils.log("Cluster", clusterNum, "initialised with", molIndex)
            else:
                closestDist = GetClosestDistance(distances, molIndex, pick)
                if closestDist < score:
                    pick.append(molIndex)
                    pickedCount += 1
                    if not quiet:
                        utils.log("Cluster", clusterNum, "added", molIndex,
                                  "with score", closestDist)
                elif not quiet:
                    utils.log("Cluster", clusterNum, "discarded", molIndex,
                              "with score", closestDist)
        totalIter += 1

    utils.log("Picked", pickedCount, "using", totalIter, "iterations")
    return pickedList
def split(input, informat, fieldName, outputBase, writeMetrics):
    """Split the input into separate SD files, one per distinct value of a field.

    The file each record is written to is named after ``outputBase`` followed by
    the record's value of ``fieldName``. Records that are unparseable, lack the
    field or have an empty value are not written.

    :param input: input file name, passed to utils.default_open_input
    :param informat: input format, passed to utils.default_open_input
    :param fieldName: name of the property whose value selects the output file
    :param outputBase: prefix of every output file name
    :param writeMetrics: when true a metrics file is written as well
    :return: list of the generated file names
    """
    input, suppl = utils.default_open_input(input, informat)

    i = 0
    written = 0
    writers = {}
    outputs = []
    filenames = []
    for mol in suppl:
        i += 1
        if mol is None:
            continue
        if not mol.HasProp(fieldName):
            utils.log("Skipping molecule", i, "- did not contain field", fieldName)
            continue
        value = mol.GetProp(fieldName)
        if not value:
            continue
        s = str(value)
        # "in" instead of dict.has_key(), which no longer exists on Python 3
        if s in writers:
            writer = writers[s]
        else:
            name = outputBase + s
            output, writer = utils.default_open_output_sdf(
                name, outputBase, False, False)
            filenames.append(name + '.sdf')
            outputs.append(output)
            writers[s] = writer
        writer.write(mol)
        written += 1

    utils.log("Generated", len(writers), "outputs from", i, "records")

    input.close()
    for k in writers:
        writers[k].close()
    for o in outputs:
        o.close()

    if writeMetrics:
        utils.write_metrics(outputBase, {
            '__InputCount__': i,
            '__OutputCount__': written,
            'Splitter': i
        })

    return filenames
def run_dock(mol):
    """Score one molecule with PLI and write it if it passes the threshold.

    Reads the module globals PDB_PATH, THRESHOLD, WRITER and lock; increments
    COUNTER for every molecule and SUCCESS for every molecule written. Every
    "pliff*" entry of the "system" section of the answer is stored on the
    molecule as a double property.

    :param mol: molecule to score
    """
    global COUNTER, SUCCESS
    answer_dict = run_and_get_ans(mol, PDB_PATH)
    # the counters are shared between worker threads, so update them under the lock
    with lock:
        COUNTER += 1
    if not answer_dict:
        utils.log("FAILED MOL", Chem.MolToSmiles(mol))
        return

    system_scores = answer_dict["system"]
    if THRESHOLD is not None and system_scores["pliff_score"] > THRESHOLD:
        return

    for ans in system_scores:
        if ans.startswith(u"pliff"):
            mol.SetDoubleProp(str(ans), system_scores[ans])
    utils.log("SCORED MOL:", Chem.MolToSmiles(mol), answer_dict)

    # "with" releases the lock even if the write raises
    with lock:
        WRITER.write(mol)
        SUCCESS += 1
        WRITER.flush()
def process_mol_conformers(mol, i, numConfs, maxAttempts, pruneRmsThresh,
                           clusterMethod, clusterThreshold, minimizeIterations):
    """Generate conformers for ``mol``, compute their energies and optionally cluster them.

    :param mol: molecule to generate conformers for
    :param i: molecule index, used only in log messages
    :param numConfs: passed to gen_conformers
    :param maxAttempts: passed to gen_conformers
    :param pruneRmsThresh: passed to gen_conformers
    :param clusterMethod: clustering method; a falsy value skips clustering
    :param clusterThreshold: passed to cluster_conformers
    :param minimizeIterations: passed to calc_energy
    :return: tuple of (dict of conformer id -> OrderedDict of properties, lowest
        energy found). If no truthy energy is returned the second item stays at
        the initial sentinel 9999999999999.
    """
    # generate the conformers
    conformerIds = gen_conformers(mol, numConfs, maxAttempts, pruneRmsThresh,
                                  True, True, True)
    conformerPropsDict = {}
    minEnergy = 9999999999999
    for conformerId in conformerIds:
        # energy minimise (optional) and energy calculation
        props = collections.OrderedDict()
        energy = calc_energy(mol, conformerId, minimizeIterations, props)
        if energy and energy < minEnergy:
            minEnergy = energy
        conformerPropsDict[conformerId] = props

    # cluster the conformers
    if clusterMethod:
        rmsClusters = cluster_conformers(mol, clusterMethod, clusterThreshold)
        utils.log("Molecule", i, "generated", len(conformerIds),
                  "conformers and", len(rmsClusters), "clusters")
        for clusterNumber, cluster in enumerate(rmsClusters, 1):
            rmsWithinCluster = align_conformers(mol, cluster)
            centroid = cluster[0] + 1
            # enumerate gives the position directly instead of an O(n) cluster.index()
            for idx, conformerId in enumerate(cluster):
                props = conformerPropsDict[conformerId]
                props[field_ClusterNum] = clusterNumber
                props[field_ClusterCentroid] = centroid
                if idx > 0:
                    # rmsWithinCluster is indexed as having no entry for the first member
                    props[field_RMSToCentroid] = rmsWithinCluster[idx - 1]
                else:
                    props[field_RMSToCentroid] = 0.0
    else:
        # NOTE(review): this branch logs i + 1 while the clustering branch logs i
        utils.log("Molecule", i + 1, "generated", len(conformerIds), "conformers")

    return conformerPropsDict, minEnergy
def main():
    """Command line entry point converting an SD file to JSON.

    Copies every parseable molecule from the input to the JSON writer, optionally
    dropping the properties listed in --exclude, and writes a metrics file.

    :return: number of molecules converted
    """
    ### command line args definitions #########################################
    parser = argparse.ArgumentParser(description='RDKit Sdf2Json')
    parser.add_argument('-i', '--input', help="Input SD file, if not defined the STDIN is used")
    parser.add_argument('-o', '--output', help="Base name for output json file (no extension). If not defined then SDTOUT is used for the structures and output is used as base name of the other files.")
    parser.add_argument('--exclude', help="Optional list of fields (comma separated) to exclude from the output.")

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    # default first, so base is always bound (no input given, or an unrecognised extension)
    base = "json"
    if args.input:
        lowered = args.input.lower()
        if lowered.endswith(".sdf"):
            base = args.input[:-4]
        elif lowered.endswith(".sdf.gz"):
            base = args.input[:-7]
    utils.log("Base:", base)

    input, output, suppl, writer, output_base = utils.default_open_input_output(
        args.input, "sdf", args.output, base, "json")

    if args.exclude:
        excludes = args.exclude.split(",")
        utils.log("Excluding", excludes)
    else:
        excludes = None

    i = 0
    count = 0
    for mol in suppl:
        i += 1
        if mol is None:
            continue
        if excludes:
            for exclude in excludes:
                if mol.HasProp(exclude):
                    mol.ClearProp(exclude)
        writer.write(mol)
        count += 1

    utils.log("Converted", count, " molecules")

    writer.flush()
    writer.close()
    input.close()
    output.close()

    utils.write_metrics(output_base, {'__InputCount__': i, '__OutputCount__': count, 'RDKitSdf2Json': count})

    return count
def enumerateStereoIsomers(mol):
    """Enumerate the stereoisomers of ``mol`` over all of its chiral centres.

    For every combination produced by _spam the chiral tag of each centre is set
    on ``mol`` itself (0 -> clockwise, 1 -> counter-clockwise) and a copy of the
    molecule is collected.

    :param mol: molecule to enumerate; its chiral tags are modified in place
    :return: ``[mol]`` if no chiral centre was found, otherwise the list of copies
    """
    centres = Chem.FindMolChiralCenters(mol, includeUnassigned=True)
    # return the molecule object when no chiral centres were identified
    if not centres:
        return [mol]

    tag_for_bit = {
        0: Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
        1: Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW,
    }

    isomers = []
    # all bit permutations with number of bits equal to the number of chiral centres
    for bits in _spam(len(centres)):
        for position, bit in enumerate(bits):
            atom_index = centres[position][0]
            if bit == 0 or bit == 1:
                mol.GetAtomWithIdx(atom_index).SetChiralTag(tag_for_bit[bit])
        snapshot = copy(mol)
        utils.log("Enumerated ", Chem.MolToSmiles(mol, isomericSmiles=True))
        isomers.append(snapshot)
    return isomers
def run_and_get_ans(mol, pdb_path):
    """Score ``mol`` against a protein with the PLI executable and return the parsed output.

    The molecule is written as a mol block to a temporary .sdf file, PLI is run in
    "score" mode with JSON output, and the temporary file is removed afterwards.

    :param mol: ligand molecule
    :param pdb_path: path of the protein file given to PLI
    :return: the decoded JSON document, or None if PLI wrote nothing to stdout
    """
    import os  # local import: only needed to remove the temporary ligand file

    # write the ligand through the handle that created the file, instead of reopening it
    with tempfile.NamedTemporaryFile("w", suffix=".sdf", delete=False) as out_f:
        out_f.write(Chem.MolToMolBlock(mol))
        smogmol = out_f.name

    try:
        utils.log("PDB: " + pdb_path + " ligand: " + smogmol)
        # Run command
        pli_path = "/usr/local/pli/bin/pli"
        cmd = [
            pli_path, "-protein", pdb_path, "-ligand", smogmol, "-mode", "score",
            "-output", "system,scores", "-exact_voronoi_areas", "0", "-selection",
            "ligand", "-oformat", "json", "-minimise", "1", "-warnings", "0",
            "-min_max_iter", "10"
        ]
        utils.log("PLI CMD: " + " ".join(cmd))
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        # communicate() reads all of stdout and waits for the process to exit
        me = proc.communicate()[0]
    finally:
        # the ligand file is only an input of this run, so do not leave it behind
        try:
            os.remove(smogmol)
        except OSError:
            pass

    # Parse the output
    if not me:
        return None
    return json.loads(me)
def split(input, informat, fieldName, outputBase):
    """Split the input into separate SD files, one per distinct value of a field.

    The file each record is written to is named after ``outputBase`` followed by
    the record's value of ``fieldName``. Records that are unparseable, lack the
    field or have an empty value are not written.

    :param input: input file name, passed to utils.default_open_input
    :param informat: input format, passed to utils.default_open_input
    :param fieldName: name of the property whose value selects the output file
    :param outputBase: prefix of every output file name
    :return: list of the generated file names
    """
    input, suppl = utils.default_open_input(input, informat)

    i = 0
    writers = {}
    outputs = []
    filenames = []
    for mol in suppl:
        i += 1
        if mol is None:
            continue
        # skip records without the field rather than failing in GetProp
        if not mol.HasProp(fieldName):
            utils.log("Skipping molecule", i, "- did not contain field", fieldName)
            continue
        value = mol.GetProp(fieldName)
        if not value:
            continue
        s = str(value)
        # "in" instead of dict.has_key(), which no longer exists on Python 3
        if s in writers:
            writer = writers[s]
        else:
            name = outputBase + s
            output, writer = utils.default_open_output_sdf(
                name, outputBase, False, False)
            filenames.append(name + '.sdf')
            outputs.append(output)
            writers[s] = writer
        writer.write(mol)

    utils.log("Generated", len(writers), "outputs from", i, "records")

    input.close()
    for k in writers:
        writers[k].close()
    for o in outputs:
        o.close()

    return filenames
def main():
    """Command line entry point: convert a PDB file to pdbqt and/or mol2 via execute().

    :raises ValueError: if neither --mol2 nor --pdbqt was requested
    """
    cli = argparse.ArgumentParser(description='Open babel PDB prepare')
    cli.add_argument('--no-gzip', action='store_true',
                     help='Do not compress the output')
    cli.add_argument('-i', '--input', help="PDB file for converting")
    cli.add_argument('-o', '--output',
                     help="Base name for output files (no extension).")
    cli.add_argument('-mol2', '--mol2', action='store_true',
                     help='Output as Mol2 format.')
    cli.add_argument('-pdbqt', '--pdbqt', action='store_true',
                     help='Output as pdbqt format.')
    cli.add_argument('--meta', action='store_true',
                     help='Write metrics files')
    cli.add_argument('-prot', '--protonate', type=float,
                     help="protonate at this pH (optional)")

    args = cli.parse_args()
    utils.log("Prepare Args: ", args)

    if not args.mol2 and not args.pdbqt:
        raise ValueError(
            "Must specify at least one output fromat: mol2 and/or pdbqt")

    # pdbqt is handled before mol2, each only when requested
    for requested, fmt in ((args.pdbqt, "pdbqt"), (args.mol2, "mol2")):
        if requested:
            utils.log("Preparing as " + fmt)
            execute(args.input, args.output, fmt, "-o" + fmt,
                    args.protonate, args.no_gzip)

    utils.log("Preparation complete")
def run_dock(mol):
    """Score one molecule with SMoG2016 and write it if it passes the threshold.

    Reads the module globals THRESHOLD, WRITER and lock; increments COUNTER for
    every molecule and SUCCESS for every molecule written. The score is stored
    on the molecule as the "SMoG2016_SCORE" property.

    :param mol: molecule to score
    """
    global COUNTER
    global SUCCESS
    answer = run_and_get_ans(mol)
    # keep the shared counter consistent if several workers are ever used
    with lock:
        COUNTER += 1
    if answer is None:
        utils.log("FAILED MOL", Chem.MolToSmiles(mol))
        return

    # the former debug print() of (answer, THRESHOLD) is removed: it wrote to
    # stdout, which may also carry the output records
    if THRESHOLD is not None and answer > THRESHOLD:
        utils.log("UNDER THRESHOLD", Chem.MolToSmiles(mol))
        return

    mol.SetDoubleProp("SMoG2016_SCORE", answer)
    utils.log("SCORED MOL:", Chem.MolToSmiles(mol), answer)

    # Write ligand; "with" releases the lock even if the write raises
    with lock:
        SUCCESS += 1
        WRITER.write(mol)
        WRITER.flush()
datasetMetaProps=datasetMetaProps, fieldMetaProps=fieldMetaProps) inputs = 0 totalCount = 0 totalErrors = 0 for mol in suppl: inputs += 1 if mol: count, errors = generate_conformers(inputs, mol, args.num, ref_mol, WRITER, args.core_smi) totalCount += count totalErrors += errors input.close() WRITER.close() if totalErrors > 0: utils.log("WARNING:", totalErrors, "conformers failed to generate") # write metrics if args.meta: metrics = { '__InputCount__': inputs, '__OutputCount__': totalCount, 'RDKitConstrainedConformer': totalCount } if totalErrors > 0: metrics['__ErrorCount__'] = totalErrors utils.write_metrics(output_base, metrics)
def main():
    """Command line entry point of the similarity screen against a single query.

    Every input molecule whose similarity to the query lies within
    [simmin, simmax] gets the similarity stored in the ``field_Similarity``
    property and is written to the output.

    :return: number of molecules written
    :raises ValueError: if no query was given or the query could not be parsed
    """
    ### command line args definitions #########################################
    parser = argparse.ArgumentParser(description='RDKit screen')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--qsmiles',
        help='query structure as smiles (incompatible with -qmolfile arg)')
    group.add_argument(
        '--qmolfile',
        help='query structure as filename in molfile format (incompatible with -qsmiles arg)')
    parser.add_argument('--simmin', type=float, default=0.7,
                        help='similarity lower cutoff (1.0 means identical)')
    parser.add_argument('--simmax', type=float, default=1.0,
                        help='similarity upper cutoff (1.0 means identical)')
    parser.add_argument('-d', '--descriptor', type=str.lower,
                        choices=list(descriptors.keys()), default='rdkit',
                        help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m', '--metric', type=str.lower,
                        choices=list(metrics.keys()), default='tanimoto',
                        help='similarity metric (default tanimoto)')
    parser.add_argument(
        '-f', '--fragment', choices=['hac', 'mw'],
        help='Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight )')
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    utils.add_default_io_args(parser)
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    descriptor = descriptors[args.descriptor.lower()]
    metric = metrics[args.metric.lower()]

    if args.qsmiles:
        query_rdkitmol = Chem.MolFromSmiles(args.qsmiles)
    elif args.qmolfile:
        query_rdkitmol = Chem.MolFromMolFile(args.qmolfile)
    else:
        raise ValueError('No query structure specified')
    if query_rdkitmol is None:
        # fail clearly here instead of handing None to the descriptor function
        raise ValueError('Query structure could not be parsed')

    query_fp = descriptor(query_rdkitmol)

    input, output, suppl, writer, output_base = utils.default_open_input_output(
        args.input, args.informat, args.output, 'screen', args.outformat,
        thinOutput=args.thin)

    i = 0
    count = 0
    for mol in suppl:
        i += 1
        if mol is None:
            continue
        if args.fragment:
            mol = filter.fragment(mol, args.fragment, quiet=args.quiet)
        if not filter.filter(mol, minHac=args.hacmin, maxHac=args.hacmax,
                             minMw=args.mwmin, maxMw=args.mwmax,
                             quiet=args.quiet):
            continue
        target_fp = descriptor(mol)
        sim = metric(query_fp, target_fp)

        if args.simmin <= sim <= args.simmax:
            count += 1
            if not args.quiet:
                utils.log(i, sim)
            mol.SetDoubleProp(field_Similarity, sim)
            writer.write(mol)

    utils.log("Found", count, "similar molecules")

    writer.flush()
    writer.close()
    input.close()
    output.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitScreen': i
        })

    return count
def main():
    """Command line entry point of the molecule standardiser / enumerator.

    With --standardize each molecule is standardised one-to-one, keeping its
    "uuid" property when it has one. Otherwise tautomers and/or stereoisomers
    are enumerated and each result records its source molecule number and,
    when available, the source uuid.

    :return: number of molecules written
    :raises ValueError: if standardisation is combined with an enumeration option
    """
    ### command line args definitions #########################################
    parser = argparse.ArgumentParser(
        description='RDKit molecule standardiser / enumerator')
    utils.add_default_io_args(parser)
    parser.add_argument('-et', '--enumerate_tauts', action='store_true',
                        help='Enumerate all tautomers')
    parser.add_argument('-es', '--enumerate_stereo', action='store_true',
                        help='Enumerate all stereoisomers')
    parser.add_argument(
        '-st', '--standardize', action='store_true',
        help='Standardize molecules. Cannot be true if enumerate is on.')
    parser.add_argument('-stm', '--standardize_method', default="molvs",
                        choices=STANDARD_MOL_METHODS.keys(),
                        help="Chose the method to standardize.")

    args = parser.parse_args()

    if args.standardize and args.enumerate_tauts:
        raise ValueError("Cannot Enumerate Tautomers and Standardise")

    if args.standardize and args.enumerate_stereo:
        raise ValueError("Cannot Enumerate Stereo and Standardise")

    if args.standardize:
        getStandardMolecule = STANDARD_MOL_METHODS[args.standardize_method]

    input, output, suppl, writer, output_base = utils.default_open_input_output(
        args.input, args.informat, args.output, 'sanify', args.outformat)

    i = 0
    count = 0
    errors = 0
    for mol in suppl:
        i += 1
        if mol is None:
            continue

        # read the uuid defensively: not every input record has one
        uuid = mol.GetProp("uuid") if mol.HasProp("uuid") else None

        if args.standardize:
            # we keep the original UUID as there is still a 1-to-1 relationship between the input and outputs
            inputCanSmiles = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
            try:
                std = getStandardMolecule(mol)
                outputCanSmiles = Chem.MolToSmiles(std, isomericSmiles=True, canonical=True)
                if uuid:
                    std.SetProp("uuid", uuid)
                if inputCanSmiles == outputCanSmiles:
                    std.SetProp("Standardised", "False")
                else:
                    std.SetProp("Standardised", "True")
            except Exception:
                # best effort: pass the input molecule through, flagged as an error
                errors += 1
                utils.log("Error standardizing", sys.exc_info()[0])
                std = mol
                std.SetProp("Standardised", "Error")

            count = write_out([std], count, writer)
        else:
            # we want a new UUID generating as we are generating new molecules
            results = [mol]

            if args.enumerate_tauts:
                utils.log("Enumerating tautomers")
                results = enumerateTautomers(mol)

            if args.enumerate_stereo:
                utils.log("Enumerating steroisomers")
                mols = results
                results = []
                for m in mols:
                    results.extend(enumerateStereoIsomers(m))

            for m in results:
                m.ClearProp("uuid")
                m.SetIntProp("SourceMolNum", i)
                if uuid:
                    m.SetProp("SourceMolUUID", uuid)

            count = write_out(results, count, writer)

    utils.log("Handled " + str(i) + " molecules, resulting in " + str(count) +
              " outputs")

    writer.flush()
    writer.close()
    input.close()
    output.close()

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': i,
                '__OutputCount__': count,
                '__ErrorCount__': errors,
                'RDKitSanify': count
            })

    return count
def generatePlot(t_hf, t_hf_a, D, AUC, tn, quiet=False, plot_height=4, plot_width=10, font_size=12, filename='cmax.png'):
    """Simulate a concentration/absorption time course and save it as a two-panel plot.

    :param t_hf: half life; the elimination rate is computed as ln(2) / t_hf
    :param t_hf_a: absorption half life; the absorption rate is ln(2) / t_hf_a
    :param D: dose
    :param AUC: area under the curve
    :param tn: end time of the simulation, which is sampled at 101 equally spaced points
    :param quiet: when True the derived parameters are not logged
    :param plot_height: figure height
    :param plot_width: figure width
    :param font_size: font size of labels and titles
    :param filename: file the figure is saved to
    :raises ZeroDivisionError: if both half lives are equal (ka - kel is zero)
    """
    kel = math.log(2) / t_hf
    ka = math.log(2) / t_hf_a
    Tmax = (math.log(ka) - math.log(kel)) / (ka - kel)
    Cmax = math.exp(-kel * Tmax) * kel * AUC
    V_F = D / kel / AUC

    if not quiet:
        utils.log('------------------------------------------------------------------------------------------')
        utils.log('kel \t', kel)
        utils.log('ka \t', ka)
        utils.log('Tmax \t', Tmax)
        utils.log('Cmax \t', Cmax)
        utils.log('V_F \t', V_F)
        utils.log('------------------------------------------------------------------------------------------')

    # time axis: 0, then 100 cumulative steps of tn/100
    step = tn / 100
    b_time = [0]
    for _ in range(100):
        b_time.append(b_time[-1] + step)

    # the factor in front of the exponentials does not depend on time
    scale = ka * D / V_F / (ka - kel)
    c_cp = [scale * (math.exp(-kel * t) - math.exp(-ka * t)) for t in b_time]
    d_perc = [100 - 100 * math.exp(-ka * t) for t in b_time]

    # Creating the visualisation
    fig = plt.figure(figsize=(plot_width, plot_height))

    plt.subplot(1, 2, 1)
    plt.plot(b_time, c_cp, linewidth=2, linestyle='dashed', color='coral')
    plt.xlabel('Time (h)', fontsize=font_size)
    plt.ylabel('Cp(mg/L', fontsize=font_size)
    plt.title('cp Vs Time', color='coral', fontsize=font_size)
    plt.grid(True)

    plt.subplot(1, 2, 2)
    plt.plot(b_time, d_perc, linewidth=2, linestyle='dashed')
    plt.xlabel('Time (h)', fontsize=font_size)
    plt.ylabel('% Absorbed', fontsize=font_size)
    plt.title('%Absorbed Vs Time', color='dodgerblue', fontsize=font_size)
    plt.grid(True)

    plt.savefig(filename)
    # release the figure so that repeated calls do not accumulate open figures
    plt.close(fig)
def main():
    """Command line entry point of the similarity screen against several queries.

    Fingerprints are prepared once for every parseable query. Each input
    molecule is compared with all of them; if at least one similarity lies
    within [simmin, simmax] the per-query scores, the best score, the best
    query name (or index) and the hit count are stored on the molecule and it
    is written to the output.

    :return: number of molecules written
    :raises ValueError: if no query source was specified
    """
    ### command line args definitions #########################################
    parser = argparse.ArgumentParser(description='RDKit screen')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--qsmiles',
        help='filename of query structures as smiles (incompatible with --sdf and --qjson args)')
    group.add_argument(
        '--qsdf',
        help='filename of query structures as sdfile (incompatible with --smiles and --qjson args)')
    group.add_argument(
        '--qjson',
        help='filename of query structures as MoleculeObject JSON (incompatible with --qsmiles and --qsdf args)')
    parser.add_argument('--qsmilesTitleLine', action='store_true',
                        help='the smiles file has a title line')
    parser.add_argument('--qsmilesDelimiter', default='\t',
                        help='delimiter for smiles file (default is tab)')
    parser.add_argument(
        '--qsmilesColumn', type=int, default=0,
        help='column in smiles file with the smiles (default is first column)')
    parser.add_argument(
        '--qsmilesNameColumn', type=int, default=1,
        help='column in smiles file with ID (default is second column)')
    parser.add_argument(
        '--qprop',
        help='property name in query molecules to report. If not defined (or property is not present) '
        + 'then name property is not written. JSON format uses the UUID as default')
    parser.add_argument('--simmin', type=float, default=0.7,
                        help='similarity lower cutoff (1.0 means identical)')
    parser.add_argument('--simmax', type=float, default=1.0,
                        help='similarity upper cutoff (1.0 means identical)')
    parser.add_argument('-d', '--descriptor', type=str.lower,
                        choices=list(descriptors.keys()), default='rdkit',
                        help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m', '--metric', type=str.lower,
                        choices=list(metrics.keys()), default='tanimoto',
                        help='similarity metric (default tanimoto)')
    parser.add_argument(
        '-f', '--fragment', choices=['hac', 'mw'],
        help='Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight )')
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    utils.add_default_io_args(parser)
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    descriptor = descriptors[args.descriptor.lower()]
    metric = metrics[args.metric.lower()]

    propName = args.qprop
    if args.qsmiles:
        queryMolsupplier = utils.default_open_input_smiles(
            args.qsmiles,
            delimiter=args.qsmilesDelimiter,
            smilesColumn=args.qsmilesColumn,
            nameColumn=args.qsmilesNameColumn,
            titleLine=args.qsmilesTitleLine)
        queryInput = None
    elif args.qsdf:
        queryInput, queryMolsupplier = utils.default_open_input_sdf(args.qsdf)
    elif args.qjson:
        queryInput, queryMolsupplier = utils.default_open_input_json(
            args.qjson, lazy=False)
        if not propName:
            propName = "uuid"
    else:
        raise ValueError('No query structure specified')

    # (name, fingerprint) per parseable query, in input order; the name is resolved
    # once here and is None when no property was requested or the query lacks it
    queries = []
    utils.log("Preparing query fingerprints")
    count = 0
    for q in queryMolsupplier:
        count += 1
        if q:
            if propName and q.HasProp(propName):
                name = str(q.GetProp(propName))
            else:
                name = None
            queries.append((name, descriptor(q)))
        else:
            utils.log("WARNING: Failed to parse Molecule", count)
    if queryInput:
        queryInput.close()

    input, output, suppl, writer, output_base = utils.default_open_input_output(
        args.input, args.informat, args.output, 'screen_multi', args.outformat)

    i = 0
    count = 0
    for mol in suppl:
        i += 1
        if mol is None:
            continue
        if args.fragment:
            mol = filter.fragment(mol, args.fragment, quiet=args.quiet)
        if not filter.filter(mol, minHac=args.hacmin, maxHac=args.hacmax,
                             minMw=args.mwmin, maxMw=args.mwmax,
                             quiet=args.quiet):
            continue
        targetFp = descriptor(mol)
        hits = 0
        bestScore = 0
        bestName = None
        bestIdx = None
        for idx, (name, queryFp) in enumerate(queries, 1):
            sim = metric(queryFp, targetFp)
            if args.simmin <= sim <= args.simmax:
                hits += 1
                if not args.quiet:
                    utils.log(i, idx, sim)
                # the first hit always becomes the best, so bestIdx is bound whenever
                # hits > 0, even for a similarity of 0
                if bestIdx is None or sim > bestScore:
                    bestScore = sim
                    bestIdx = idx
                    bestName = name
                if name:
                    mol.SetDoubleProp(field_Similarity + "_" + name, sim)
                else:
                    mol.SetDoubleProp(
                        field_Similarity + "_" + str(idx) + "_Score", sim)

        if hits > 0:
            count += 1
            mol.SetDoubleProp(field_Similarity + "_BestScore", bestScore)
            if bestName:
                mol.SetProp(field_Similarity + "_BestName", bestName)
            else:
                mol.SetIntProp(field_Similarity + "_BestIndex", bestIdx)
            mol.SetIntProp(field_Similarity + "_Count", hits)
            writer.write(mol)

    utils.log("Found", count, "similar molecules")

    writer.flush()
    writer.close()
    input.close()
    output.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitScreen': count
        })

    return count
def main():
    """Command line entry point: Open3DAlign input molecules onto a query.

    Parses the command line, reads the query molecule and computes the
    score of the query aligned onto a copy of itself (logged as the
    "perfect" score). Every readable input molecule is then passed to
    ``doO3Dalign``; when ``--num`` is greater than zero the molecule's
    existing conformers are discarded and new ones are generated first.
    If ``args.meta`` is set a metrics file is written at the end.
    """
    parser = argparse.ArgumentParser(description='Open3DAlign with RDKit')
    parser.add_argument('query', help='query molfile')
    parser.add_argument(
        '--qmolidx',
        help="Query molecule index in SD file if not the first",
        type=int,
        default=1)
    parser.add_argument(
        '-t', '--threshold',
        type=float,
        help='score cuttoff relative to alignment of query to itself')
    parser.add_argument(
        '-n', '--num',
        default=0,
        type=int,
        help='number of conformers to generate, if None then input structures are assumed to already be 3D')
    parser.add_argument(
        '-a', '--attempts',
        default=0,
        type=int,
        help='number of attempts to generate conformers')
    parser.add_argument(
        '-r', '--rmsd',
        type=float,
        default=1.0,
        help='prune RMSD threshold for excluding conformers')
    parser.add_argument(
        '-e', '--emin',
        type=int,
        default=0,
        help='energy minimisation iterations for generated confomers (default of 0 means none)')
    utils.add_default_io_args(parser)

    args = parser.parse_args()
    utils.log("o3dAlign Args: ", args)

    qmol = utils.read_single_molecule(args.query, index=args.qmolidx)
    qmol = Chem.RemoveHs(qmol)
    # Independent copy of the query; aligned onto the query below to obtain
    # the reference score that the threshold is compared against.
    qmol2 = Chem.Mol(qmol)

    # Metadata describing the fields this tool adds to the output.
    source = "conformers.py"
    datasetMetaProps = {
        "source": source,
        "description": "Open3DAlign using RDKit " + rdBase.rdkitVersion
    }
    clsMappings = {"O3DAScore": "java.lang.Float"}
    fieldMetaProps = [{
        "fieldName": "O3DAScore",
        "values": {
            "source": source,
            "description": "Open3DAlign alignment score"
        }
    }]
    if args.num > 0:
        # we generate the conformers so will add energy info
        clsMappings["EnergyDelta"] = "java.lang.Float"
        clsMappings["EnergyAbs"] = "java.lang.Float"
        fieldMetaProps.append({
            "fieldName": "EnergyDelta",
            "values": {
                "source": source,
                "description": "Energy difference to lowest energy conformer"
            }
        })
        fieldMetaProps.append({
            "fieldName": "EnergyAbs",
            "values": {
                "source": source,
                "description": "Absolute energy"
            }
        })

    input, output, suppl, writer, output_base = utils.default_open_input_output(
        args.input, args.informat, args.output, 'o3dAlign', args.outformat,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps)

    pyO3A = rdMolAlign.GetO3A(qmol2, qmol)
    perfect_align = pyO3A.Align()
    perfect_score = pyO3A.Score()
    utils.log('Perfect score:', perfect_align, perfect_score,
              Chem.MolToSmiles(qmol, isomericSmiles=True), qmol.GetNumAtoms())

    i = 0      # number of readable molecules processed (also used as their index)
    count = 0  # number of molecules written by doO3Dalign
    total = 0  # total number of conformers across processed molecules
    for mol in suppl:
        if mol is None:
            continue
        if args.num > 0:
            mol.RemoveAllConformers()
            # Pass the user's --emin value through; previously a literal 0
            # was passed here so the option was silently ignored.
            conformerProps, minEnergy = conformers.process_mol_conformers(
                mol, i, args.num, args.attempts, args.rmsd, None, None,
                args.emin)
            mol = Chem.RemoveHs(mol)
            count += doO3Dalign(i, mol, qmol, args.threshold, perfect_score,
                                writer, conformerProps=conformerProps,
                                minEnergy=minEnergy)
        else:
            mol = Chem.RemoveHs(mol)
            count += doO3Dalign(i, mol, qmol, args.threshold, perfect_score,
                                writer)
        i += 1
        total += mol.GetNumConformers()

    input.close()
    writer.flush()
    writer.close()
    output.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitO3DAlign': total
        })
def main():
    """Command line entry point for the RDKit molecule filter.

    Reads molecules from the configured input, optionally reduces each one
    to a single fragment, keeps those accepted by ``filter`` and writes
    them out. Output may be capped with ``--limit`` and split into numbered
    chunk files with ``--chunksize``. If ``args.meta`` is set a metrics
    file is written at the end.
    """
    # ---- command line argument definitions ----
    parser = argparse.ArgumentParser(description='RDKit filter')
    parser.add_argument(
        '-f', '--fragment',
        choices=['hac', 'mw'],
        help='Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight )')
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    parser.add_argument(
        '-l', '--limit',
        type=int,
        help='Limit output to this many records')
    parser.add_argument(
        '-c', '--chunksize',
        type=int,
        help='Split output into chunks of size c. Output will always be files. Names like filter1.sdf.gz, filter2.sdf.gz ...')
    parser.add_argument(
        '-d', '--digits',
        type=int,
        default=0,
        help='When splitting zero pad the file name to this many digits so that they are in sorted order. Names like filter001.sdf.gz, filter002.sdf.gz ...')
    parser.add_argument(
        '--no-gzip',
        action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    # WARNING: thin output is not appropriate when using --fragment
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument(
        '-q', '--quiet',
        action='store_true',
        help='Quiet mode - suppress reporting reason for filtering')
    utils.add_default_io_args(parser)

    args = parser.parse_args()
    utils.log("Filter Args: ", args)

    compress_output = not args.no_gzip
    source_handle, supplier = utils.default_open_input(args.input, args.informat)

    # ---- open the (first) output ----
    if args.chunksize:
        # Chunked output always goes to files named <base><chunk number>.
        output_base = args.output if args.output else 'filter'
        first_chunk_name = output_base + str(1).zfill(args.digits)
        sink, writer, opened_name = utils.default_open_output(
            first_chunk_name, first_chunk_name, args.outformat,
            compress=compress_output)
    else:
        sink, writer, opened_name = utils.default_open_output(
            args.output, "filter", args.outformat, compress=compress_output)
        output_base = opened_name
    utils.log("Writing to " + opened_name)

    # ---- filter loop ----
    records_read = 0
    records_kept = 0
    chunk_number = 1
    for mol in supplier:
        if args.limit and records_kept >= args.limit:
            break
        records_read += 1
        if mol is None:
            continue
        if args.fragment:
            mol = fragment(mol, args.fragment, quiet=args.quiet)
        accepted = filter(mol,
                          minHac=args.hacmin,
                          maxHac=args.hacmax,
                          minMw=args.mwmin,
                          maxMw=args.mwmax,
                          quiet=args.quiet)
        if not accepted:
            continue

        # Roll over to a new chunk file once the current one is full.
        chunk_is_full = (args.chunksize
                         and records_kept > 0
                         and records_kept % args.chunksize == 0)
        if chunk_is_full:
            writer.close()
            sink.close()
            chunk_number += 1
            next_chunk_name = output_base + str(chunk_number).zfill(args.digits)
            utils.log("Writing to " + next_chunk_name)
            sink, writer, next_chunk_name = utils.default_open_output(
                next_chunk_name, next_chunk_name, args.outformat,
                compress=compress_output)

        records_kept += 1
        writer.write(mol)

    # ---- reporting and clean-up ----
    utils.log("Filtered", records_read, "down to", records_kept, "molecules")
    if args.chunksize:
        utils.log("Wrote", chunk_number, "chunks")
        if args.digits > 0 and len(str(chunk_number)) > args.digits:
            utils.log(
                "WARNING: not enough digits specified for the number of chunks")

    writer.flush()
    writer.close()
    source_handle.close()
    sink.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': records_read,
            '__OutputCount__': records_kept,
            'RDKitFilter': records_read
        })
def main():
    """Command line entry point for RDKit conformer generation.

    Molecules come either from ``--smiles`` (a single structure) or from
    the configured input. For each readable molecule hydrogens are added,
    ``process_mol_conformers`` is called, hydrogens are removed again and
    the result is handed to ``write_conformers``. If ``args.meta`` is set
    a metrics file is written at the end.
    """
    # ---- command line argument definitions ----
    parser = argparse.ArgumentParser(description='RDKit conformers')
    parser.add_argument('-n', '--num', type=int, default=1,
                        help='number of conformers to generate')
    parser.add_argument('-a', '--attempts', type=int, default=0,
                        help='number of attempts')
    parser.add_argument('-r', '--rmsd', type=float, default=1.0,
                        help='prune RMSD threshold')
    # 'tfd' is the spelling used by the help text and by the threshold
    # default below. The original choices list only offered the misspelt
    # 'tdf', which made TFD unselectable; 'tdf' is still accepted so that
    # existing command lines keep working.
    parser.add_argument(
        '-c', '--cluster',
        type=str.lower,
        choices=['rmsd', 'tfd', 'tdf'],
        help='Cluster method (RMSD or TFD). If None then no clustering')
    parser.add_argument(
        '-t', '--threshold',
        type=float,
        help='cluster threshold (default of 2.0 for RMSD and 0.3 for TFD)')
    parser.add_argument(
        '-e', '--emin',
        type=int,
        default=0,
        help='energy minimisation iterations (default of 0 means none)')
    utils.add_default_io_args(parser)
    parser.add_argument(
        '--smiles',
        help='input structure as smiles (incompatible with using files or stdin for input)')

    args = parser.parse_args()

    # Normalise the legacy misspelling to the canonical name.
    if args.cluster == 'tdf':
        args.cluster = 'tfd'

    # Only apply a default when no threshold was given; an explicit 0.0 is
    # a value supplied by the user and must not be overwritten.
    if args.threshold is None:
        args.threshold = 0.3 if args.cluster == 'tfd' else 2.0

    utils.log("Conformers Args: ", args)

    # ---- metadata describing the fields written to the output ----
    source = "conformers.py"
    datasetMetaProps = {
        "source": source,
        "description": "Conformer generation using RDKit " + rdBase.rdkitVersion
    }
    clsMappings = {
        "RMSToCentroid": "java.lang.Float",
        "EnergyDelta": "java.lang.Float",
        "EnergyAbs": "java.lang.Float",
        "ConformerNum": "java.lang.Integer",
        "ClusterCentroid": "java.lang.Integer",
        "ClusterNum": "java.lang.Integer",
        "StructureNum": "java.lang.Integer"
    }
    fieldMetaProps = [{
        "fieldName": "RMSToCentroid",
        "values": {
            "source": source,
            "description": "RMS distance to the cluster centroid"
        }
    }, {
        "fieldName": "EnergyDelta",
        "values": {
            "source": source,
            "description": "Energy difference to lowest energy structure"
        }
    }, {
        "fieldName": "EnergyAbs",
        "values": {
            "source": source,
            "description": "Absolute energy"
        }
    }, {
        "fieldName": "ConformerNum",
        "values": {
            "source": source,
            "description": "Conformer number"
        }
    }, {
        "fieldName": "ClusterCentroid",
        "values": {
            "source": source,
            "description": "Conformer number of the cluster centroid"
        }
    }, {
        "fieldName": "ClusterNum",
        "values": {
            "source": source,
            "description": "Cluster number"
        }
    }, {
        "fieldName": "StructureNum",
        "values": {
            "source": source,
            "description": "Structure number this conformer was generated from"
        }
    }]

    # ---- open input and output ----
    if args.smiles:
        # Single structure given on the command line: there is no input
        # stream to open (or to close later).
        mol = Chem.MolFromSmiles(args.smiles)
        suppl = [mol]
        input = None
        output, writer, output_base = utils.default_open_output(
            args.output, 'conformers', args.outformat,
            valueClassMappings=clsMappings,
            datasetMetaProps=datasetMetaProps,
            fieldMetaProps=fieldMetaProps)
    else:
        input, output, suppl, writer, output_base = utils.default_open_input_output(
            args.input, args.informat, args.output, 'conformers',
            args.outformat,
            valueClassMappings=clsMappings,
            datasetMetaProps=datasetMetaProps,
            fieldMetaProps=fieldMetaProps)

    # ---- generate and write conformers ----
    i = 0      # number of readable structures processed (also their index)
    count = 0  # total number of conformers written
    for mol in suppl:
        if mol is None:
            continue
        m = Chem.AddHs(mol)
        conformerPropsDict, minEnergy = process_mol_conformers(
            m, i, args.num, args.attempts, args.rmsd, args.cluster,
            args.threshold, args.emin)
        m = Chem.RemoveHs(m)
        write_conformers(m, i, conformerPropsDict, minEnergy, writer)
        count += m.GetNumConformers()
        i += 1

    if input:
        input.close()
    writer.flush()
    writer.close()
    output.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitConformer': count
        })
def main():
    """Command line entry point for Butina clustering with RDKit.

    Fingerprints the input molecules, clusters them with ``ClusterFps``
    and, when ``--num`` is given, reduces the clusters with
    ``SelectDiverseSubset``. Every molecule whose index appears in the
    final cluster lookup is written out with its cluster number set. If
    ``args.meta`` is set a metrics file is written at the end.

    Raises:
        ValueError: if the descriptor lookup yields ``None``, or if
            ``--field`` is given without ``--num`` or without one of
            ``--min`` / ``--max``.
    """
    # ---- command line argument definitions ----
    parser = argparse.ArgumentParser(description='RDKit Butina Cluster')
    parser.add_argument(
        '-t', '--threshold',
        type=float,
        default=0.7,
        help='similarity clustering threshold (1.0 means identical)')
    parser.add_argument(
        '-d', '--descriptor',
        type=str.lower,
        choices=list(descriptors.keys()),
        default='rdkit',
        help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument(
        '-m', '--metric',
        type=str.lower,
        choices=list(metrics.keys()),
        default='tanimoto',
        help='similarity metric (default tanimoto)')
    parser.add_argument(
        '-n', '--num',
        type=int,
        help='maximum number to pick for diverse subset selection')
    parser.add_argument(
        '-e', '--exclude',
        type=float,
        default=0.9,
        help='threshold for excluding structures in diverse subset selection (1.0 means identical)')
    parser.add_argument(
        '--fragment-method',
        choices=['hac', 'mw'],
        default='hac',
        help='Approach to find biggest fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)')
    parser.add_argument(
        '--output-fragment',
        action='store_true',
        help='Output the biggest fragment rather than the original molecule')
    parser.add_argument(
        '-f', '--field',
        help='field to use to optimise diverse subset selection')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--min',
        action='store_true',
        help='pick lowest value specified by the --field option')
    group.add_argument(
        '--max',
        action='store_true',
        help='pick highest value specified by the --field option')
    utils.add_default_io_args(parser)
    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')
    parser.add_argument('--thin', action='store_true', help='Thin output mode')

    args = parser.parse_args()
    utils.log("Cluster Args: ", args)

    # ---- argument validation ----
    descriptor = descriptors[args.descriptor]
    if descriptor is None:
        raise ValueError('Invalid descriptor name ' + args.descriptor)

    if args.field and not args.num:
        raise ValueError(
            '--num argument must be specified for diverse subset selection')
    if args.field and not (args.min or args.max):
        raise ValueError(
            '--min or --max argument must be specified for diverse subset selection')

    # ---- metadata describing the field written to the output ----
    source = "cluster_butina.py"
    datasetMetaProps = {
        "source": source,
        "description": "Butina clustering using RDKit " + rdBase.rdkitVersion
    }
    clsMappings = {"Cluster": "java.lang.Integer"}
    fieldMetaProps = [{
        "fieldName": "Cluster",
        "values": {
            "source": source,
            "description": "Cluster number"
        }
    }]

    input, output, suppl, writer, output_base = utils.default_open_input_output(
        args.input, args.informat, args.output, 'cluster_butina',
        args.outformat,
        thinOutput=args.thin,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps)

    # ---- generate fingerprints ----
    # mols and fps are filled in place by the helper; its return value was
    # never used by this function.
    mols = []
    fps = []
    mol_utils.fragmentAndFingerprint(
        suppl, mols, fps, descriptor,
        fragmentMethod=args.fragment_method,
        outputFragment=args.output_fragment,
        quiet=args.quiet)
    input.close()

    # ---- do clustering ----
    utils.log("Clustering with descriptor", args.descriptor, "metric",
              args.metric, "and threshold", args.threshold)
    # The CLI threshold is a similarity; ClusterFps is given 1 - threshold.
    clusters, dists, matrix = ClusterFps(fps, args.metric, 1.0 - args.threshold)
    utils.log("Found", len(clusters), "clusters")

    # ---- generate diverse subset if specified ----
    if args.num:
        utils.log("Generating diverse subset")
        finalClusters = SelectDiverseSubset(mols, clusters, dists, args.num,
                                            args.field, args.max, args.exclude,
                                            args.quiet)
    else:
        finalClusters = clusters

    utils.log("Found", len(finalClusters), "clusters")
    lookup = ClustersToMap(finalClusters)

    if not args.quiet:
        utils.log("Final Clusters:", finalClusters)

    # ---- write the results ----
    result_count = 0
    for i, mol in enumerate(mols):
        # `in` replaces dict.has_key(), which does not exist in Python 3.
        if i in lookup:
            if args.thin:
                utils.clear_mol_props(mol, ["uuid"])
            cluster = lookup[i]
            mol.SetIntProp(field_Cluster, cluster)
            writer.write(mol)
            result_count += 1

    writer.flush()
    writer.close()
    output.close()

    if args.meta:
        input_count = len(mols)
        status_str = str(result_count) + ' results from ' + str(
            len(finalClusters)) + ' clusters'
        utils.write_metrics(
            output_base, {
                '__StatusMessage__': status_str,
                '__InputCount__': input_count,
                '__OutputCount__': result_count,
                'RDKitCluster': input_count
            })
def main():
    """Command line entry point for running a reaction over input molecules.

    Each readable input molecule is passed, together with the reagent
    supplier and the chosen reaction name, to the ``perform_reaction``
    method of a ``poised_filter.Filter`` instance, which returns the
    updated product count. If ``args.meta`` is set a metrics file is
    written at the end.

    Raises:
        ValueError: if ``--multi`` is requested without an output location.
    """
    # The set of available reactions is defined by the poised filter.
    from poised_filter import Filter
    filter_to_use = Filter()

    # ---- command line argument definitions ----
    parser = argparse.ArgumentParser(description='RDKit rxn process')
    utils.add_default_io_args(parser)
    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')
    parser.add_argument('-m', '--multi', action='store_true',
                        help='Output one file for each reaction')
    parser.add_argument('-r', '--reaction',
                        choices=list(filter_to_use.poised_reactions.keys()),
                        help='Name of reaction to be run')
    parser.add_argument('-rl', '--reagent_lib',
                        help="Input SD file, if not defined the STDIN is used")
    parser.add_argument(
        '-rlf', '--reagent_lib_format',
        choices=['sdf', 'json'],
        help="Input format. When using STDIN this must be specified.")

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    if not args.output and args.multi:
        raise ValueError(
            "Must specify output location when writing individual result files")

    input, suppl = utils.default_open_input(args.input, args.informat)
    reagent_input, reagent_suppl = utils.default_open_input(
        args.reagent_lib, args.reagent_lib_format)
    output, writer, output_base = utils.default_open_output(
        args.output, "rxn_maker", args.outformat)

    if args.multi:
        dir_base = os.path.dirname(args.output)
        writer_dict = filter_to_use.get_writers(dir_base)
    else:
        writer_dict = None

    i = 0      # number of input records seen (including unreadable ones)
    count = 0  # running product count returned by perform_reaction
    for mol in suppl:
        i += 1
        if mol is None:
            continue
        # NOTE(review): the same reagent supplier is reused for every input
        # molecule - confirm it can be iterated more than once.
        count = filter_to_use.perform_reaction(mol, args.reaction,
                                               reagent_suppl, writer, count)

    utils.log("Created", count, "molecules from a total of ", i,
              "input molecules")

    writer.flush()
    writer.close()
    if input:
        input.close()
    # The reagent library handle was previously left open.
    if reagent_input:
        reagent_input.close()
    if output:
        output.close()
    # close the individual writers
    if writer_dict:
        for key in writer_dict:
            writer_dict[key].close()

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': i,
                '__OutputCount__': count,
                'RxnSmartsFilter': count
            })
def main():
    """Command line entry point: write a Butina clustering similarity matrix.

    Fingerprints the readable input molecules, clusters them with
    ``cluster_butina.ClusterFps`` and writes one row per molecule for the
    diagonal (value 1.0) plus two mirrored rows for every off-diagonal
    matrix entry above ``--matrixThreshold``. If ``--meta`` is set a
    metrics file is written at the end.

    Raises:
        ValueError: if the descriptor lookup yields ``None``.
    """
    # ---- command line argument definitions ----
    parser = argparse.ArgumentParser(description='RDKit Butina Cluster Matrix')
    utils.add_default_input_args(parser)
    parser.add_argument(
        '-o', '--output',
        help="Base name for output file (no extension). If not defined then SDTOUT is used for the structures and output is used as base name of the other files.")
    parser.add_argument(
        '-of', '--outformat',
        choices=['tsv', 'json'],
        default='tsv',
        help="Output format. Defaults to 'tsv'.")
    parser.add_argument('--meta', action='store_true',
                        help='Write metadata and metrics files')
    parser.add_argument(
        '-t', '--threshold',
        type=float,
        default=0.7,
        help='Similarity clustering threshold (1.0 means identical)')
    parser.add_argument(
        '-mt', '--matrixThreshold',
        type=float,
        default=0.5,
        help='Threshold for outputting values (1.0 means identical)')
    parser.add_argument(
        '-d', '--descriptor',
        type=str.lower,
        choices=list(cluster_butina.descriptors.keys()),
        default='rdkit',
        help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument(
        '-m', '--metric',
        type=str.lower,
        choices=list(cluster_butina.metrics.keys()),
        default='tanimoto',
        help='similarity metric (default tanimoto)')
    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')

    args = parser.parse_args()
    utils.log("Cluster Matrix Args: ", args)

    descriptor = cluster_butina.descriptors[args.descriptor]
    if descriptor is None:
        raise ValueError('Invalid descriptor name ' + args.descriptor)

    input, suppl = utils.default_open_input(args.input, args.informat)

    # ---- metadata describing the fields written to the output ----
    source = "cluster_butina_matrix.py"
    datasetMetaProps = {
        "source": source,
        "description": "Butina clustering using RDKit " + rdBase.rdkitVersion
    }
    clsMappings = {
        "Cluster1": "java.lang.Integer",
        "Cluster2": "java.lang.Integer",
        "ID1": "java.lang.String",
        "ID2": "java.lang.String",
        "M1": "java.lang.String",
        "M2": "java.lang.String",
        "Similarity": "java.lang.Float"
    }
    fieldMetaProps = [{
        "fieldName": "Cluster",
        "values": {"source": source, "description": "Cluster number"}
    }]

    # Column order of the output; an OrderedDict keeps insertion order.
    fieldNames = collections.OrderedDict()
    fieldNames['ID1'] = 'ID1'
    fieldNames['ID2'] = 'ID2'
    fieldNames['Cluster1'] = 'Cluster1'
    fieldNames['Cluster2'] = 'Cluster2'
    fieldNames['Similarity'] = 'Similarity'
    fieldNames['M1'] = 'M1'
    fieldNames['M2'] = 'M2'

    writer, output_base = utils.create_simple_writer(
        args.output, 'cluster_butina_matrix', args.outformat, fieldNames,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps)

    # ---- generate fingerprints ----
    mols = [x for x in suppl if x is not None]
    fps = [descriptor(x) for x in mols]
    input.close()

    # ---- do clustering ----
    utils.log("Clustering with descriptor", args.descriptor, "metric",
              args.metric, "and threshold", args.threshold)
    # The CLI threshold is a similarity; ClusterFps is given 1 - threshold.
    clusters, dists, matrix = cluster_butina.ClusterFps(
        fps, args.metric, 1.0 - args.threshold)
    utils.log("Found", len(clusters), "clusters")

    MapClusterToMols(clusters, mols)

    if not args.quiet:
        utils.log("Clusters:", clusters)

    # ---- write the matrix ----
    writer.writeHeader()

    size = len(matrix)
    count = 0  # number of rows written
    for i, row in enumerate(matrix):
        # diagonal entry for molecule i
        writer.write(create_values(mols, i, i, 1.0))
        count += 1
        for j, dist in enumerate(row):
            if dist > args.matrixThreshold:
                # the matrix is the lower left segment without the diagonal,
                # so row i holds the values for molecule i + 1
                x = j
                y = i + 1
                writer.write(create_values(mols, x, y, dist))
                writer.write(create_values(mols, y, x, dist))
                count += 2

    # Diagonal entry at index `size`, which the loop above does not reach
    # (presumably the last molecule, the matrix having one row fewer than
    # there are molecules - TODO confirm). Skipped when there are no
    # molecules at all. This row was previously missing from the output count.
    if mols:
        writer.write(create_values(mols, size, size, 1.0))
        count += 1

    writer.writeFooter()
    writer.close()

    if args.meta:
        # Report the number of molecules processed. The leftover loop index
        # used previously was len(matrix) - 1 and unbound for an empty matrix.
        input_count = len(mols)
        utils.write_metrics(output_base, {
            '__InputCount__': input_count,
            '__OutputCount__': count,
            'RDKitCluster': input_count
        })