"800-820", "820-840", "840-860", "880-900",\ "920-940", "940-960", "980-1000", "1000-1020"] range_1000 = range(0, 3800, 100) srt_numbers = ["%i-%i"%(j, j+100) for j in range_1000] print(srt_numbers) number_ends = [srt_numbers, unsrt_numbers] print("number of molecules total:", 20*len(unsrt_numbers)) for k in range(len(unsrt_numbers)):#range(0, 4000, 100): partialfilename = filename + number_ends[i-1][k] print("file: ", partialfilename) if os.path.isfile(partialfilename): results = datprep.read_compounds(partialfilename) else: print(partialfilename, "was not found") continue xdata, ydata, newresults = prepresults(results, rep = repro,\ dwhich = which_d, repno = 2,\ norm = xnorm, yval = yvalues,\ with_whichd = False) #datprep.store_compounds(newresults, partialfilename) for yd in ydata: ax.scatter(xdata, yd[0], c = colorlist[i], label = repro if j == 0 else "") #np.savez(outfile, xdata = x, yd[0] = y) del(xdata) del(ydata)
###A)
###store compounds to database_file
#datprep.store_compounds(compound_ls, database_file)

###A) supplement: you don't need to do this step
###take info from database_file and extract all molecules with less than 7 heavy atoms to dat_ha_file
#max_atoms = datprep.sortby_heavyatoms(database_file, dat_ha_file, 7)

###A) supplement: if in doubt, just choose 23 for QM9 dataset. max_atoms is just the maximal size of your representation
###max_atoms is maximal number of atoms in file. needed to set size of CM
#print("all CM should have size " , max_atoms)
#input("Press enter once you have made sure the size of the unsorted CM matrix has been adapted accordingly")

###B)
###read list of compounds from data file
full_compound_ls = datprep.read_compounds(data_file)
print(len(full_compound_ls), " compounds in full data file")

###B)
#If you want to plot only part of all compounds, use this code:
#Slicing never raises IndexError (out-of-range slice bounds are silently
#clamped), so the requested range has to be validated explicitly.
if not 0 <= init <= end <= len(full_compound_ls):
    print("Your indices were out of bound, restart. min: 0, max: ",
          len(full_compound_ls))
    #exit() is a helper of the site module meant for interactive use;
    #raising SystemExit works everywhere and reports failure to the shell
    raise SystemExit(1)
compound_ls = full_compound_ls[init:end]

print(len(compound_ls), " of which are being processed")
###B)
def cleanup_results(result_file, multiple_runs=False, Choose_Folder=False, rep_no=1):
    '''
    reads pickled results from result_file and returns plottable Curve objects

    Variables
    ---------
    result_file : string, path to (or base name of) the file containing the
                  pickled Result objects. Of every result the attributes
                  lamda, sigma, set_sizes and maes are used.
    multiple_runs : boolean, if True, all results sharing the same lamda and
                    sigma are averaged into a single curve.
                    Is forced to True if rep_no > 1.
    Choose_Folder : boolean, if True, result_file is used exactly as passed.
                    if False, the file
                    "./Pickled/Kernel_Results/<result_file>_<rep_no>reps"
                    is read instead.
    rep_no : int, number of repetitions. Part of the default file name and
             switches on averaging when larger than 1.

    Returns
    -------
    plottable_curves : list of CurveObj.
                       without averaging: one curve per stored result
                       with averaging: one curve per (lamda, sigma) pair that
                       occurs in the results, carrying the mean in ynparray
                       and its error in yerror
    '''
    if not Choose_Folder:
        result_file = "./Pickled/Kernel_Results/" + result_file + "_" + str(
            rep_no) + "reps"

    if rep_no > 1:
        multiple_runs = True

    results_list = datprep.read_compounds(result_file)
    plottable_curves = []

    if not multiple_runs:
        #every stored result becomes its own curve
        for result in results_list:
            curve = CurveObj(curve_name(result.sigma, result.lamda))
            curve.xnparray = result.set_sizes
            curve.ynparray = result.maes
            plottable_curves.append(curve)
        return plottable_curves

    #Group the runs by the (lamda, sigma) pairs that really occur.
    #Looping over all combinations of unique lamdas and unique sigmas would
    #also visit pairs without any result and then fail on the empty group.
    runs_by_parameters = {}
    for result in results_list:
        key = (result.lamda, result.sigma)
        runs_by_parameters.setdefault(key, []).append(result)

    for (lamda, sigma), runs in runs_by_parameters.items():
        #calculate average over all runs with these parameters
        av_ylist, yerror = jmath.calculate_mean([run.maes for run in runs])
        print("the calculated mean and it's error are:\n mean:", av_ylist,
              "\n error:", yerror)

        #add Curve object; x values are taken from the first run of the group
        curve = CurveObj(curve_name(sigma, lamda))
        curve.xnparray = runs[0].set_sizes
        curve.ynparray = av_ylist
        curve.yerror = yerror
        plottable_curves.append(curve)

    return plottable_curves
axb1 = fig.add_subplot(gs[1, 0:2]) axb2 = fig.add_subplot(gs[1, 2:4]) axb3 = fig.add_subplot(gs[1, 4:6]) axb4 = fig.add_subplot(gs[1, 6:8]) axb5 = fig.add_subplot(gs[1, 8:10]) plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=3, hspace=None) database = "../Databases/Pickled/qm7.pickle" compoundlist = datprep.read_compounds(database) #print("length of compoundlist:", len(compoundlist)) Zlist = [] atomlen = [] halist = [] atomtypes = [] atomfrequencylist = [[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []] for compound in compoundlist: Z = compound.Z atoms = len(Z) atomscount = Counter(list(Z)) atomscountdict = dict(atomscount)
#number of repetitions; only referenced by the disabled code further down
repno = 1

#short names of the representations, in the order used throughout
repnames = ["CM", "EVCM", "BOB", "OM", "EVOM"]

#one result file name per representation
#final_file_list = ["./Results/trial.obj"]
final_file_list = [name + "_QM7" for name in repnames]

max_no = 3993  #150

#for i in [0, 1, 2]:
#    results = kproc.kernel_learning(datapath, final_file_list[i], representation_no = i, maxnumber = max_no, repno = repno)

#load the curves that were pickled by an earlier run
final_file = "./tmp/Curves.pickle"
curve_list = datprep.read_compounds(final_file)

'''
#curves = kplot.cleanup_results(final_file, rep_no = repno)
for curve in curves:
    curve.name = repnames[i] + curve.name
    curve_list.append(curve)
    #print("curve", curve)
#kplot.plot_curves(curves, file_title = final_file[11:], plottitle = final_file + "Learning on 200 QM7 datapoints")
'''

#draw all loaded curves into one figure
kplot.plot_curves(
    curve_list,
    file_title="ML_trial",
    plottitle="Learning of Molecular Energies on QM7 Dataset",
)
for i in range(5): results, resultaddition = jader.calculate_num_der( replist[i], compounds) res_file = database + namelist[i] + "der_results.pickle" datprep.store_compounds(results, res_file) print("results were successfully stored to ", res_file) if do_plot_derivatives: yvals = [] norms_nuc = [] for i in range(5): resfile = database + namelist[i] + "der_results.pickle" res_list = datprep.read_compounds(resfile) xlist, ylist, results = \ pltder.prepresults(results = res_list,\ rep = namelist[i],\ repno = i,\ yval = "perc",\ with_whichd = True) yvals.extend(ylist) for i in ylist: norms_nuc.append(xlist) print(yvals) pltder.plot_percentage_zeroEV([1,2], yvals, title = "4 C Atoms",\
1e-5] #optimal lamdas for every representation #get maximum number of compounds for which representations need to be calculated final_file_list = ['CM_QM7', \ 'EVCM_QM7', \ 'BOB_QM7',\ 'OM_QM7',\ 'EVOM_QM7'] final_curve_file = "./tmp/Curves.pickle" filepath_thisjob = "./tmp/trial" if plot_scatter: #plot scatter plots results = datprep.read_compounds(filepath_thisjob) for i in range(len(training_no)): name = "%i Training Instances, OM representation" % training_no[i] y_test = results[i].y_test y_predicted = results[i].test_predicted_results pltker.plot_scatter(y_test, y_predicted, title=name, figuretitle="Scatterplot_OM_%i" % i) if plot_learning: results = datprep.read_compounds(filepath_thisjob) print("trying to plot learning from existing file")
import representation_ZRN as ZRNrep import jax_representation as jrep import database_preparation as datprep from time import time as tic import statistics import numpy as np data_file = "../Databases/Pickled/qm7.pickle" repros = [ZRNrep.Coulomb_Matrix, ZRNrep.Eigenvalue_Coulomb_Matrix, ZRNrep.Overlap_Matrix, \ ZRNrep.Eigenvalue_Overlap_Matrix, ZRNrep.Bag_of_Bonds] repronames = ["CM", "EVCM", "OM", "EVOM", "BOB"] ###read list of compounds from data file compounds = datprep.read_compounds(data_file) compounds = compounds[:1] print("number of compounds:", len(compounds)) #store times for every single and total calculation one_times = [[], [], [], [], []] total_times = [] for i in range(5): start = tic() for c in compounds: thisstart = tic() M = repros[i](c.Z, c.R) thisend = tic() one_times[i].append(thisend - thisstart)
import jax_representation as jrep
import jax.numpy as jnp
import plot_derivative as pltder

#define path to folder containing xyz files. All files are considered.
datapath = "../Database/QM9/"

#The dangling "compounds =" that used to stand here had no right-hand side
#(a SyntaxError); compounds is read from small_data_file below.

#where do you want these compounds to be saved to?
small_data_file = "../Database/Pickled/compounds.pickle"
CM_ev_result_file = "/home/linux-miriam/Uniqueness_QML/Pickled/fourcompounds_res.pickle"

compounds = datprep.read_compounds(small_data_file)

#set to True to additionally print the derivative for every single compound
single = False
if single:
    for c in compounds:
        ev, vectors = jrep.CM_ev(c.Z, c.R, c.N)
        print("name of compound:", c.filename)
        #print("eigenvalue repro:\n", ev)
        derivative = jader.sort_derivative('CM_EV', c.Z, c.R, c.N, 2, "R", "R")
        print(derivative)

#NOTE(review): the source formatting was lost; these statements are assumed
#to sit at top level (outside "if single:") - confirm against the original
results, fractions = jader.calculate_eigenvalues('CM_EV', compounds)
datprep.store_compounds(results, CM_ev_result_file)
print(type(results[0]))
def kernel_learning(datapath, final_file, representation_no=0, maxnumber=3993, repno=1):
    '''
    Metafunction to simplify a ML run: reads compounds, computes their
    fingerprints and hands everything to kler.full_kernel_ridge

    datapath : path to a file with a pickled list of compound instances
               (each needs the attributes energy, Z, R and N)
    final_file : passed on to kler.full_kernel_ridge
                 (where the results are dumped with pickle)
    representation_no : [0,1,2,3,4] stand for [CM, EVCM, BOB, OM, EVOM] respectively
    maxnumber : number of compounds from datapath file to be considered,
                also passed on as upperlimit
    repno : how many times the learning is done before averaging and plotting,
            passed on as rep_no

    returns:
    -------
    results : first element of what kler.full_kernel_ridge returns
    '''
    t_start = tic()  #only needed by the disabled timing print below

    #name of the chosen representation
    repro_name = ["CM", "EVCM", "BOB", "OM", "EVOM"][representation_no]

    #parameters for learning, one entry per representation
    #sigma: how tight is the fit? needs to be tested, depending on data varies widely
    sigmas = [[80], [120], [120], [15, 20, 120], [30, 150]][representation_no]
    #lambda: how much variation to the initial data is introduced? 1e-17 - 1e-13 good to try
    lambdas = [[1e-15], [1e-15], [1e-15], [1e-15, 1e-13],
               [1e-14, 1e-13]][representation_no]
    #not bigger than the total number of instances in set
    set_sizes = [5, 120, 600, 1500, 3000]

    #(hashed) representation function belonging to the chosen name
    repro = [
        ZRN_rep.Coulomb_Matrix_h,
        ZRN_rep.Eigenvalue_Coulomb_Matrix_h,
        ZRN_rep.Bag_of_Bonds_h,
        ZRN_rep.Overlap_Matrix_h,
        ZRN_rep.Eigenvalue_Overlap_Matrix_h,
    ][representation_no]

    #unpack pickled data and shorten it to make stuff faster
    compounds = datprep.read_compounds(datapath)[:maxnumber]

    #collect one energy and one fingerprint per compound
    energylist = []
    fingerprintlist = []
    for compound in compounds:
        energy = float(compound.energy)
        Z, R, N = compound.Z, compound.R, compound.N

        #NOTE(review): the atomization energy is calculated but never used,
        #the learning below runs on the unconverted energy - confirm which
        #of the two is intended
        atomization_energy = datprep.atomization_energy(
            potential_energy=energy, nuclear_charges=Z)

        fingerprint = repro(Z, R, N)
        energylist.append(energy)
        fingerprintlist.append(fingerprint)

    t_compounds = tic()
    #print("time start to compounds: ", t_compounds - t_start)

    #run learning; the second return value is not needed here
    results, _metadata = kler.full_kernel_ridge(
        fingerprintlist,
        energylist,
        final_file,
        set_sizes,
        sigmas,
        lambdas,
        rep_no=repno,
        upperlimit=maxnumber,
        representation=repro_name,
    )
    return results
# # results.extend(compoundlist[0]) #print("number of compounds: ", len(results)) #datprep.store_compounds(results, result_file) #data has now been stored to resultfile #C) #read data from result file #If you want to plot from multiple pickle results file, use this code: #result_file = resultfile results_EV = datprep.read_compounds(results_file) ''' results_EV = datprep.read_compounds(result_file_EV) results_CM = datprep.read_compounds(result_file_CM) ''' #C) #prepare plotting #y-axis information dZ_percentages_EV = [] dR_percentages_EV = [] dZdZ_percentages_EV = [] dRdR_percentages_EV = [] dZdR_percentages_EV = [] ''' dZ_percentages_CM = []
import qml
import numpy as np
import database_preparation as datprep
import kernel_learning as kler
import plot_kernel as pltker
import representation_ZRN as ZRN_rep
""""
this file creates pickled lists of Kernel_Result class objects with the represented information
"""

#sanity check: load an earlier result file and show its first entry
calculated = datprep.read_compounds("./tmp/BOB_raw_Kernel_Results")
print("len of list: ", len(calculated))

first_result = calculated[0]
print("first element:", first_result)
print("CM of first element:", first_result.representation_name)
print(first_result.x[0])
print("energy:", first_result.y[0])

#define datapath to a pickled list of compound instances
datapath = "./Pickled/qm7.pickle"

#list of representations to be considered, 0 = CM, 1 = EVCM, 2 = BOB, 3 = OM, 4 = EVOM
rep_names = ["CM", "EVCM", "BOB", "OM", "EVOM"]
rep = 4

#get maximum number of compounds for which representations need to be calculated
total = 3993

#unpickle list of compounds
compound_list = datprep.read_compounds(datapath)
ZRNrep.Bag_of_Bonds, ZRNrep.Overlap_Matrix, \ ZRNrep.Eigenvalue_Overlap_Matrix] #which representation should be computed? 0 = CM, 1 = EVCM, 2 = BOB, 3 = OM, 4 = EVOM which_rep = 0 try: init, end = int(sys.argv[1]), int(sys.argv[2]) except IndexError: init = int(input("starting point")) end = int(input("end point")) name = str(init) + "-" + str(end) ###read list of compounds from data file full_compound_ls = datprep.read_compounds(data_file) print(len(full_compound_ls), " compounds in full data file") ###B) #If you want to plot only part of all compounds, use this code: try: compound_ls = full_compound_ls[init : end] except IndexError: print("Your indices were out of bound, restart. min: 0, max: ", len(full_compound_ls)) exit() #print("you are going to calculate the repro on a list of compounds of length:") #print(len(compound_ls)) t1 = tic()