def convert_extension(infile, outfile, canonical=False):
    """
    Convert one molecule file format into another using OpenEye tools.

    Every conformer of every molecule read from `infile` is written to
    `outfile` through an oechem.oemolostream. The user may also assign
    canonical smiles as name before writing output.

    Parameters
    ----------
    infile : string
        Name of the input molecule file, read with reader.read_mols.
    outfile : string
        Name of the output molecule file. The output format is presumably
        chosen by OpenEye from this file's extension -- TODO confirm.
    canonical : Boolean
        If True, the title of each conformer is replaced by the string that
        oechem.OEMolToSmiles returns for its parent molecule.

    """
    # open input file
    mols = reader.read_mols(infile)

    # open output file
    ofs = oechem.oemolostream()
    if not ofs.open(outfile):
        oechem.OEThrow.Fatal("Unable to open %s for writing" % outfile)

    # write to output; try/finally guarantees the stream is closed (and its
    # buffer flushed) even if a conformer fails to write
    try:
        for mol in mols:
            # the SMILES is per molecule, so compute it once for all confs
            smi = oechem.OEMolToSmiles(mol) if canonical else None
            for conf in mol.GetConfs():
                if canonical:
                    conf.SetTitle(smi)
                oechem.OEWriteConstMolecule(ofs, conf)
    finally:
        # close filestreams
        ofs.close()
def basic_plot(infile, tag, style, molname=None, take_relative=False, har_to_kcal=False):
    """
    Plot the per-conformer values of one SD tag for molecules in a file.

    One figure is generated per plotted molecule, saved as 'output_{i}.png'
    (i is the index of the molecule in the input file) and then shown.
    Conformers whose value is nan are left out of the plot. Molecules with
    no usable value are skipped with a printed message.

    Parameters
    ----------
    infile : string
        Name of SDF file with information in SD tags.
    tag : string
        Full tag string directly as listed in the SD file.
    style : string
        plot style. can be 'scatter', 'line', or 'bar'.
        Any other value falls back to 'line'.
    molname : string
        If given, only molecules with this title are plotted.
    take_relative : Boolean
        subtract lowest value
    har_to_kcal : Boolean
        multiply data in Hartrees by 627.5095 to yield kcal/mol

    """
    # Open molecule file.
    mols = reader.read_mols(infile)

    for i, mol_i in enumerate(mols):
        if molname is not None and mol_i.GetTitle() != molname:
            continue

        # get array of all conformer data of this mol
        try:
            data_array = np.fromiter(
                pt.get_sd_list(mol_i, datum='', taglabel=tag),
                dtype=np.float64)
        except ValueError:
            # values could not be converted to float: treat every conformer
            # as unfinished (one nan per conformer)
            data_array = np.full(mol_i.NumConfs(), np.nan)

        # exclude conformers for which job did not finish (nan)
        data_array = data_array[~np.isnan(data_array)]
        if data_array.size == 0:
            # nothing to plot; np.amin would also fail on an empty array
            print(f"No numeric '{tag}' data for {mol_i.GetTitle()}; "
                  "skipping plot.")
            continue

        if take_relative:
            data_array = data_array - np.amin(data_array)
        if har_to_kcal:
            data_array = 627.5095 * data_array

        # generate plot on a fresh figure so that successive molecules are
        # never drawn on top of each other
        fig = plt.figure()
        xs = np.arange(data_array.size)
        if style == 'scatter':
            plt.scatter(xs, data_array)
        elif style == 'bar':
            plt.bar(xs, data_array)
        else:
            plt.plot(xs, data_array)
        plt.grid()
        plt.title(mol_i.GetTitle() + '\n' + tag, fontsize=14)
        plt.savefig(f'output_{i}.png', bbox_inches='tight')
        plt.show()
        # release the figure once it has been saved and shown
        plt.close(fig)
def filter_confs(infile, tag, outfile):
    """
    Read in OEMols (and each of their conformers) in 'infile'.
    For each molecule:
        rough filter conformers based on energy differences specified by 'tag',
        fine filter conformers based on RMSD values.

    The number of conformers kept for each molecule is appended to
    'numConfs.txt' in the current working directory.

    Parameters
    ----------
    infile : str
        Name of SDF file with conformers to be filtered
    tag : str
        SD tag name with the energy value to roughly screen conformers before RMSD
        Screening works by removing conformers of very similar energies, where
        "similar" is defined by thresE parameter. Examples:
        - "QM Psi4 Final Opt. Energy (Har) mp2/def-sv(p)"
        - "QM Psi4 Final Single Pt. Energy (Har) mp2/def-sv(p)"
    outfile : str
        Name of the output file with filtered conformers

    Raises
    ------
    FileExistsError
        If `outfile` already exists. Nothing is read or written in that case.

    """
    # Parameters for distinguishing cutoff of conformer similarity
    thresE = 5.E-4  # declare confs diff & skip RMSD comparison above this threshold
    thresRMSD = 0.2  # above this threshold (Angstrom), confs are "diff" minima

    # Refuse to overwrite results before any file is opened, so that a
    # rejected call leaves no stray header in numConfs.txt and no open handle.
    if os.path.exists(outfile):
        raise FileExistsError("Output file {} already exists in {}".format(
            outfile, os.getcwd()))

    fname = os.path.split(infile)[1]

    # open molecule file
    rmsd_molecules = reader.read_mols(infile)

    # Open outstream file.
    rmsd_ofs = oechem.oemolostream()
    if not rmsd_ofs.open(outfile):
        oechem.OEThrow.Fatal("Unable to open %s for writing" % outfile)

    # Identify minima and write output file.
    try:
        with open(os.path.join(os.getcwd(), "numConfs.txt"), 'a') as numConfsF:
            numConfsF.write("\n{}\n".format(tag))
            for mol in rmsd_molecules:
                # identify_minima presumably prunes the conformers of mol in
                # place and returns a falsy value on failure -- TODO confirm
                if identify_minima(mol, tag, thresE, thresRMSD):
                    numConfsF.write("%s\t%s\n" % (mol.GetTitle(), mol.NumConfs()))
                    oechem.OEWriteConstMolecule(rmsd_ofs, mol)
                else:
                    numConfsF.write("%s\t0\n" % (mol.GetTitle()))
    finally:
        # always release the output stream, even if filtering fails midway
        rmsd_ofs.close()

    print("Done filtering %s to %s.\n" % (fname, outfile))
def _open_oemol_output(fname):
    """Open `fname` as an OpenEye output stream; report failure via OEThrow.Fatal."""
    ofs = oechem.oemolostream()
    if not ofs.open(fname):
        oechem.OEThrow.Fatal("Unable to open %s for writing" % fname)
    return ofs


def convert_extension_separate(infile, presuffix, canonical=False, separate='mol'):
    """
    Convert one molecule file format into another using OpenEye tools.
    The user may also assign canonical smiles as name before writing output.
    Separate output into (each mol with all confs) or (each conf).

    Output files are named '{prefix}_{i}{ext}' when separating by molecule
    and '{prefix}_{i}_{j}{ext}' when separating by conformer, where i and j
    are the zero-based molecule and conformer indices.

    Parameters
    ----------
    infile : string
        Name of the input molecule file, read with reader.read_mols.
    presuffix : list
        first item contains prefix of output name, last item contains extension
        ex. ['alkyl', '.xyz']
    canonical : Boolean
        If True, the title of each conformer is replaced by the string that
        oechem.OEMolToSmiles returns for its parent molecule.
    separate : string
        'mol' or 'conf'

    Raises
    ------
    ValueError
        If `separate` is neither 'mol' nor 'conf'.

    """
    # fail early instead of hitting an unbound output stream later on
    if separate not in ('mol', 'conf'):
        raise ValueError(
            "separate must be 'mol' or 'conf', got {!r}".format(separate))

    # open input file
    mols = reader.read_mols(infile)

    # write to output
    for i, mol in enumerate(mols):
        smi = oechem.OEMolToSmiles(mol) if canonical else None

        if separate == 'mol':
            # one file per molecule holding all of its conformers
            ofs = _open_oemol_output(
                '{}_{}{}'.format(presuffix[0], str(i), presuffix[1]))
            try:
                for conf in mol.GetConfs():
                    if canonical:
                        conf.SetTitle(smi)
                    oechem.OEWriteConstMolecule(ofs, conf)
            finally:
                ofs.close()
            continue

        # separate == 'conf': one file per conformer
        for j, conf in enumerate(mol.GetConfs()):
            ofs = _open_oemol_output(
                '{}_{}_{}{}'.format(presuffix[0], str(i), str(j), presuffix[1]))
            try:
                if canonical:
                    conf.SetTitle(smi)
                oechem.OEWriteConstMolecule(ofs, conf)
            finally:
                ofs.close()
def confs_to_psi(insdf, method, basis, calctype='opt', memory=None, via_json=False):
    """
    Read in molecule(s) (and conformers, if present) in insdf file. Create
    Psi4 input calculations for each structure.

    The input for conformer number i (counted from 1) of a molecule is written
    to ./<molecule title>/<i>/ under the current working directory. An input
    file that already exists is never overwritten: that conformer is skipped
    with a printed message.

    Parameters
    ----------
    insdf: string
        Name of the molecule file for which to create Psi4 input file.
        SDF format can contain multiple molecules and multiple conformers per
        molecule in a single file.
    method: string
        Name of the method as understood by Psi4. Example: "mp2"
    basis : string
        Name of the basis set as understood by Psi4. Example: "def2-sv(p)"
    calctype : string
        What kind of Psi4 calculation to run. Supported inputs are:
        'opt' for geometry optimization,
        'spe' for single point energy calculation, and
        'hess' for Hessian calculation.
    memory : string
        How much memory each Psi4 job should take. If not specified, the
        default in Psi4 is 500 Mb. Examples: "2000 MB" "1.5 GB"
        http://www.psicode.org/psi4manual/master/psithoninput.html
    via_json : Boolean
        If True, use JSON wrapper for Psi4 input and output.
        - Psi4 input would be in "input.py", called with python
        - Psi4 output would be in "output.json"
        If False, use normal text files for Psi4 input and output.
        - Psi4 input would be in "input.dat"
        - Psi4 output would be in "output.dat"

    """
    wdir = os.getcwd()

    # the existence check must look at the file that will actually be written
    input_name = 'input.py' if via_json else 'input.dat'

    # open molecules
    molecules = reader.read_mols(insdf)

    ### For each molecule: for each conf, generate input
    for mol in molecules:
        title = mol.GetTitle()
        print(title, mol.NumConfs())
        if not title:
            sys.exit("ERROR: OEMol must have title assigned! Exiting.")

        for i, conf in enumerate(mol.GetConfs()):
            # subdirectory ./mol/conf/ (conformers are numbered from 1)
            subdir = os.path.join(wdir, "%s/%s" % (title, i + 1))
            os.makedirs(subdir, exist_ok=True)

            input_path = os.path.join(subdir, input_name)
            if os.path.exists(input_path):
                print("Input file already exists. Skipping.\n{}\n".format(
                    input_path))
                continue

            label = title + '_' + str(i + 1)

            # Build the whole file content before opening the file, so that
            # a failure while generating it cannot leave an empty input file
            # behind (which later runs would skip as "already exists").
            if via_json:
                json_body = json.dumps(
                    make_psi_json(conf, label, method, basis, calctype, memory),
                    indent=4, separators=(',', ': '))
                content = (
                    "# molecule {}\n\nimport numpy as np\nimport psi4"
                    "\nimport json\n\njson_data = ".format(label)
                    + json_body
                    + "\njson_ret = psi4.json_wrapper.run_json(json_data)\n\n"
                    + "with open(\"output.json\", \"w\") as ofile:\n\t"
                      "json.dump(json_ret, ofile, indent=2)\n\n")
            else:
                content = make_psi_input(conf, label, method, basis, calctype,
                                         memory)

            with open(input_path, 'w') as ofile:
                ofile.write(content)
def extract_enes(dict1, mol_slice=None):
    """
    From files in input dictionaries, read in molecules, extract information
    from SD tags for conformer energies and indices.

    Parameters
    ----------
    dict1 : dict
        dictionary of input files and information to extract from SD tags
        keys are: 'theory' 'fname' 'tagkey' 'label'
    mol_slice : list
        list of indices from which to slice mols generator for read_mols
        [start, stop, step]. Only used when it has exactly three items;
        otherwise (including the default of None) all molecules are read.

    Returns
    -------
    titleMols : list of strings
        names of all molecules in the SDF file
    confNums : list
        for each molecule, the values of the "original index" SD tag as
        returned by pt.get_sd_list (presumably one entry per conformer)
    enes : list of numpy arrays
        conformer energies of the compared file (kcal/mol)
    confNans : list of numpy arrays
        indices of enes where the values are nans

    """
    # Open molecule file.
    if mol_slice is not None and len(mol_slice) == 3:
        mols = reader.read_mols(dict1['fname'], mol_slice)
    else:
        mols = reader.read_mols(dict1['fname'])

    short_tag = dict1['tagkey']
    qmethod, qbasis = reader.separated_theory(dict1['theory'])

    titleMols = []
    confNums = []
    enes = []
    confNans = []

    for imol in mols:
        # Get absolute energies from the SD tags
        iabs = np.array([
            float(value)
            for value in pt.get_sd_list(imol, short_tag, 'Psi4', qmethod, qbasis)
        ])

        # Get omega conformer number of first, for reference info
        # whole list can be used for matching purposes
        indices_orig = pt.get_sd_list(imol, "original index")

        # find conformers for which job did not finish (nan)
        nanIndices = np.argwhere(np.isnan(iabs))

        # convert energies from Hartrees to kcal/mol
        iabs = 627.5095 * iabs

        titleMols.append(imol.GetTitle())
        confNans.append(nanIndices)
        confNums.append(indices_orig)
        enes.append(iabs)

    return titleMols, confNums, enes, confNans
def avg_mol_time(titles, infile, method, basis, tag, mol_slice=None):
    """
    For an SDF file with all confs of all mols, get the average runtime
    of all conformers for each molecule.

    The input dictionary may or not be empty. If it is not, the avg/stdev
    time of the calculation from this infile is appended to the existing
    molecule's value. The same numbers are also appended as text to
    'timeAvgs.txt' in the current working directory.

    Parameters
    ----------
    titles : dictionary
        Keys are molecule names.
        Values are [[qm1_avg, qm1_std], [qm2_avg, qm2_std], ... ]
        where the index refers to a particular level of theory.
        Dictionary may or may not be empty. It is modified in place.
    infile : string
        name of the SDF file from which to extract time data from SD tag
    method : string
        QM method
    basis : string
        QM basis set
    tag : string
        datum of interest, e.g., "QM opt energy"
        See keys in the define_tag function of proc_tags module.
    mol_slice : list
        list of indices from which to slice mols generator for read_mols
        [start, stop, step]. Only used when it has exactly three items;
        otherwise (including the default of None) all molecules are read.

    Returns
    -------
    titles : dictionary
        dictionary with extracted data from SDF file; keys are molnames,
        values are lists of list of [avg_time, stdev_time] for many QM methods.
        Both numbers are nan for a molecule without any usable time value.

    """
    # Open molecule file.
    if mol_slice is not None and len(mol_slice) == 3:
        mols = reader.read_mols(infile, mol_slice)
    else:
        mols = reader.read_mols(infile)

    # Prepare text file to write extracted data.
    # The context manager closes it even if a molecule fails midway.
    with open("timeAvgs.txt", 'a') as timeF:
        timeF.write("\nFile: {}\n".format(infile))
        timeF.write(
            "Average [{}/{}] [{}s] over all confs for each molecule\n".format(
                method, basis, tag))

        for mol_i in mols:
            # get array of all conformer data of this mol
            try:
                time_array = np.fromiter(
                    pt.get_sd_list(mol_i, tag, 'Psi4', method, basis),
                    dtype=np.float64)
            except ValueError:
                # values could not be converted to float: treat every
                # conformer as unfinished (one nan per conformer)
                time_array = np.full(mol_i.NumConfs(), np.nan)

            # exclude conformers for which job did not finish (nan)
            time_array = time_array[~np.isnan(time_array)]

            if time_array.size:
                meantime = np.mean(time_array)
                stdtime = np.std(time_array)
            else:
                # same nan result numpy would give for an empty array,
                # without the "mean of empty slice" RuntimeWarnings
                meantime = stdtime = np.nan

            # write out data to file and store in dictionary
            name = mol_i.GetTitle()
            timeF.write(" %s\t%d confs\t\t%.3f +- %.3f\n" %
                        (name, time_array.size, meantime, stdtime))
            titles.setdefault(name, []).append([meantime, stdtime])

    return titles
def _letter_labels(count):
    """Return `count` distinct letter labels of equal, minimal width (A.. or AA..)."""
    letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    # smallest label width that offers at least `count` combinations
    width = 1
    while len(letters) ** width < count:
        width += 1
    # islice stops after `count` labels instead of building all 26**width
    combos = itertools.islice(itertools.product(letters, repeat=width), count)
    return [''.join(combo) for combo in combos]


def combine_files_plot(infile, figname='combined.png', molname=None, verbose=False, take_relative=False, har_to_kcal=False):
    """
    Plot the conformer data of one molecule from several files on one figure.

    This only supports plotting of ONE specified molecule across different
    files.

    Note on take_relative: the value of the first conformer of each data
    series is subtracted from that series.
    [1] Subtracting global minimum (single value) from all energies doesn't
    work since everything is still on different scale. Other candidates:
    (1) first conformer of each?, (2) global minimum?, (3) minimum of each?

    Parameters
    ----------
    infile : str
        Filename with information on the files to read in, and the SDF tags
        to be extracted from each. Columns are: (1) QM method/basis,
        (2) sdf file, (3) tag key in sdf (like 'QM spe'), (4) arbitrary label
        for plotting. Separate columns by comma.
    figname : str
        Name of the image file to which the figure is saved.
    molname : str
        Title of the molecule to plot. Molecules with other titles are skipped.
    verbose : Boolean
        If True, also write the plotted data to 'combined.dat'.
    take_relative : Boolean
        subtract the first conformer's value from each data series
    har_to_kcal : Boolean
        multiply data in Hartrees by 627.5095 to yield kcal/mol

    """
    wholedict = reader.read_text_input(infile)

    xarray = []
    yarray = []
    labels = []

    for key in wholedict:
        entry = wholedict[key]
        print("Reading molecule(s) from file: ", entry['fname'])
        mols = reader.read_mols(entry['fname'])
        qmethod, qbasis = reader.separated_theory(entry['theory'])
        short_tag = entry['tagkey']

        for mol_j in mols:
            if molname is not None and mol_j.GetTitle() != molname:
                continue

            data_array = np.array([
                float(value)
                for value in pt.get_sd_list(mol_j, short_tag, 'Psi4', qmethod, qbasis)
            ])

            if take_relative:
                data_array = data_array - data_array[0]
            if har_to_kcal:
                data_array = 627.5095 * data_array

            labels.append(entry['label'])
            yarray.append(data_array)
            xarray.append(range(len(data_array)))

    # without any matching molecule there is nothing to index or plot
    if not yarray:
        print(f'No data found for mol {molname}; nothing to plot.')
        return

    if verbose:
        header = '{}\n'.format(molname)
        for label in labels:
            header += ("%s\n" % label)
        xydata = np.vstack((xarray[0], yarray)).T
        # one float column per collected data series
        np.savetxt('combined.dat',
                   xydata,
                   delimiter='\t',
                   header=header,
                   fmt=' '.join(['%i'] + ['%10.4f'] * len(yarray)))

    # letter labels for x-axis
    num_confs = len(xarray[0])
    xlabs = _letter_labels(num_confs)

    fig = plt.figure()
    ax = fig.add_subplot(111)

    xlabel = 'conformer'
    ylabel = "energy"

    # vtl print max range of relative energies
    conf_then_file = np.array(yarray).T
    ranges = [max(c) - min(c) for c in conf_then_file]
    if ranges:
        print(f'mol {molname} max range: {max(ranges)}')

    ax.set_prop_cycle(
        plt.cycler('color', plt.cm.rainbow(np.linspace(0, 1, len(yarray)))))

    for label, xs, ys in zip(labels, xarray, yarray):
        plt.plot(xs, ys, '-o', lw=0.8, label=label)

    # standard view
    plt.ylabel(ylabel, fontsize=14)
    plt.xlabel(xlabel, fontsize=14)
    plt.xticks(list(range(num_confs)), xlabs)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2)

    plt.title(molname)
    plt.grid()
    plt.savefig(figname, bbox_inches='tight', dpi=300)
    plt.show()
def getRMSD(sdfRef, theory, rmsdict, package='Psi4'):
    """
    Perform RMSD calculation from an SDF file for molecule and its conformers.

    For each molecule, the initial and final optimization energies of all
    conformers are taken relative to the first conformer, and the root mean
    square (over n-1 conformers) of the final-minus-initial differences is
    computed in kcal/mol. Results are appended to 'RMSD.txt',
    'energies_breakdown.txt' and 'maxenergies.txt' in the current directory.

    Parameters
    ----------
    sdfRef : string
        pathname of the SDF file with energies of opt 1 and opt 2
    theory : string
        level of theory in format of mp2/6-31G*
    rmsdict : dictionary
        (can be empty) which will be populated in form of
        rmsdict[theory][molName] = 0.000 if the RMSD of before/after
        energies are 0.000. It is modified in place.
    package : string
        name of software package used for QM calculation. only Psi4 currently
        supported. NOTE: it is only used to look up the "original index" tag;
        the energy tags are always read with 'Psi4'.

    Returns
    -------
    rmsdict : dictionary
        the input dictionary with the new RMSD values added. The value is nan
        when fewer than two conformers have usable energies.

    """
    hartree_to_kcal = 627.5095

    theory_parts = theory.split('/')
    method, basis = theory_parts[0].strip(), theory_parts[1].strip()

    # create a molecule read in stream
    print("Opening SDF file %s" % sdfRef)
    molsRef = reader.read_mols(sdfRef)

    # Output files for the RMSD values, the initial and final energies, and
    # the conformers with largest differences. The context managers close all
    # of them even when leaving through sys.exit or an exception.
    with open("RMSD.txt", 'a') as RMSD, \
            open("energies_breakdown.txt", 'a') as energies, \
            open("maxenergies.txt", "a") as maximum:

        RMSD.write("\nAnalyzing file: %s\n# Level of theory: %s\n" %
                   (sdfRef, theory))

        # Grab energies, perform RMSD calculation, write data to txt files.
        for rmol in molsRef:
            molName = rmol.GetTitle()
            final = np.asarray(pt.get_sd_list(rmol, 'QM opt energy', 'Psi4',
                                              method, basis),
                               dtype=float)
            initial = np.asarray(pt.get_sd_list(rmol, 'QM opt energy initial',
                                                'Psi4', method, basis),
                                 dtype=float)

            # subtract conformer[0] energies from all conformers
            try:
                rel_final = final - final[0]
                rel_initial = initial - initial[0]
            except IndexError:
                sys.exit("No energies found for {} {}/{}! Check that data is "
                         "stored in tags. Exiting.".format(
                             molName, method, basis))

            # subtracts initial minus final and squares all values,
            # leaving out conformers without data (nan)
            sq_diff = np.subtract(rel_final, rel_initial)
            sq_diff = np.square(sq_diff[~np.isnan(sq_diff)])

            # sums all energies of conformers for given rmol and then takes
            # average with respect to n-1 number of conformers (the first
            # conformer is the reference), converted from Hartree to kcal/mol
            if sq_diff.size > 1:
                average = math.sqrt(
                    float(np.sum(sq_diff)) / (sq_diff.size - 1))
                average = average * hartree_to_kcal
            else:
                # not enough conformers for an n-1 average
                average = float('nan')

            # puts RMSD values into .txt file, and store in dict for plotting.
            RMSD.write("#%s\t%.5f RMSD(Kcal/mol)\n" % (molName, average))
            # setdefault lets callers pass in an empty dictionary
            rmsdict.setdefault(theory, {})[molName] = average

            # store energies of initial and final for molecules conformers
            energies.write(
                "\n#%s\n#%s\n#RMSD = %.5f(y)\t\t(x=Hartree, y=kcal/mol)\n#conf. init. Energy(x) \t final Energy(x) \t diff.(x)\tdiff. (y) \n"
                % (theory, molName, average))

            # get list of conformer indices to identify high RMSD ones
            conflist = pt.get_sd_list(rmol, "original index", package, method,
                                      basis)
            # keep the part before the first comma (the original conformer)
            conformer = np.asarray(
                [item.split(',')[0] for item in conflist], dtype=int)

            diffs_kcal = []
            for i in range(len(final)):
                delta = final[i] - initial[i]
                # int() keeps the index printed as a plain number for any
                # numpy version
                energies.write(
                    "%r \t %5.9f \t %5.9f \t %5.9f\t%5.9f \n" %
                    (int(conformer[i]), initial[i], final[i], delta,
                     delta * hartree_to_kcal))
                diffs_kcal.append(delta * hartree_to_kcal)

            # find max 3 confs with highest absolute differences
            abs_diff = np.absolute(np.array(diffs_kcal, dtype=float))
            try:
                top_confs = []
                for _ in range(3):
                    idx = np.nanargmax(abs_diff)
                    top_confs.append(int(conformer[idx]))
                    # set max conf to zero to find next highest
                    abs_diff[idx] = 0
                max1, max2, max3 = top_confs
            except ValueError:
                # raised by nanargmax for an empty or all-nan array
                # TODO don't plot this mol for all nan's
                print("All RMSDs in list for file {} mol {} are nan!!!".format(
                    sdfRef, molName))
                max1 = max2 = max3 = -1

            energies.write(
                "#*** Max energy differences are conformers (hi-->low): %r, %r, %r ***\n\n"
                % (max1, max2, max3))
            maximum.write("%s, %s : %r, %r, %r\n" %
                          (theory, molName, max1, max2, max3))

    return rmsdict
def get_psi_results(origsdf, finsdf, calctype='opt', psiout="output.dat", timeout="timer.dat"):
    """
    Read in OEMols (and each of their conformers) in origsdf file,
        get results from Psi4 calculations in the same directory as origsdf,
        and write out results into finsdf file.
    Directory layout is .../maindir/molName/confNumber/outputfiles .
    Output files are looked up relative to the directory of origsdf, and
    finsdf is written relative to the current working directory.

    For Hessian calculations, the Hessians are additionally pickled to a
    file named after finsdf with the extension replaced by '.hess.pickle'.

    Parameters
    ----------
    origsdf:  string - original SDF file of input structures of QM calculation
    finsdf:   string - full name of final SDF file with optimized results.
    calctype: string; one of 'opt','spe','hess' for geometry optimization,
        single point energy calculation, or Hessian calculation
    psiout:   string - name of the Psi4 output files. Default is "output.dat"
    timeout: string - name of the Psi4 timer files. Default is "timer.dat"

    Returns
    -------
    method: string - QM method from Psi4 calculations
    basisset: string - QM basis set from Psi4 calculations

    Both values come from the last conformer processed. They are None if
    that conformer could not be read, or if there was no conformer at all.

    Raises
    ------
    ValueError
        If calctype is not one of 'opt', 'spe', 'hess'.
    FileExistsError
        If finsdf already exists.

    """
    hdir = os.path.split(origsdf)[0]
    wdir = os.getcwd()

    # check that specified calctype is valid
    if calctype not in {'opt', 'spe', 'hess'}:
        raise ValueError("Specify a valid calculation type.")

    # never overwrite earlier results; checked before any file is opened
    writeout = os.path.join(wdir, finsdf)
    if os.path.exists(writeout):
        raise FileExistsError(f"File already exists: {finsdf}\n")

    # read in molecules
    molecules = reader.read_mols(origsdf)

    # open outstream file
    write_ofs = oechem.oemolostream()
    if not write_ofs.open(writeout):
        oechem.OEThrow.Fatal("Unable to open %s for writing" % writeout)

    # Hessian dictionary, where hdict['molTitle']['confIndex'] has np array
    hdict = {}

    # defined up front so that the return is valid for an empty input file
    method = None
    basisset = None

    # for each conformer, process output file and write new data to SDF file
    try:
        for mol in molecules:
            title = mol.GetTitle()
            print("===== %s =====" % (title))
            if calctype == 'hess':
                hdict[title] = {}

            for j, conf in enumerate(mol.GetConfs()):
                props = initiate_dict()

                # set file locations (conformer directories count from 1)
                timef = os.path.join(hdir,
                                     "%s/%s/%s" % (title, j + 1, timeout))
                outf = os.path.join(hdir,
                                    "%s/%s/%s" % (title, j + 1, psiout))

                # process output and get dictionary results
                props = get_conf_data(props, calctype, timef, outf)

                # if output was missing or are missing calculation details
                # move on to next conformer
                if props['missing'] or (calctype == 'opt' and not all(
                        key in props
                        for key in ['numSteps', 'finalEnergy', 'coords'])):
                    print(f"ERROR reading {outf}\nEither Psi4 job was incomplete "
                          "or wrong calctype specified\n")
                    method = None
                    basisset = None
                    continue

                # add data to oemol
                conf = set_conf_data(conf, props, calctype)
                method = props['method']
                basisset = props['basis']

                # if hessian, append to dict bc does not go to SD tag
                if calctype == 'hess':
                    hdict[title][j + 1] = props['hessian']

                # check mol title
                conf = check_title(conf, origsdf)

                # write output file
                oechem.OEWriteConstMolecule(write_ofs, conf)
    finally:
        # close file streams, also when processing fails midway
        write_ofs.close()

    # if hessian, write hdict out to separate file
    if calctype == 'hess':
        hfile = os.path.join(wdir,
                             os.path.splitext(finsdf)[0] + '.hess.pickle')
        with open(hfile, 'wb') as hess_out:
            pickle.dump(hdict, hess_out)

    return method, basisset