for d in diff5: EVOM_error += d * d print("Difference between Compounds by representation:") print("BOB", BOB_error) print("CM", CM_error) print("EVCM", EVCM_error) print("OM", OM_error) print("EVOM", EVOM_error) if do_derivative_calculation: #create results instances mols, compounds = datprep.read_xyz_energies(database) datprep.store_compounds(compounds, database + "compounds.pickle") #prepare derivatives of all representations replist = [ZRNrep.Coulomb_Matrix, ZRNrep.Eigenvalue_Coulomb_Matrix, ZRNrep.Bag_of_Bonds, ZRNrep.Overlap_Matrix, \ ZRNrep.Eigenvalue_Overlap_Matrix] for i in range(5): results, resultaddition = jader.calculate_num_der( replist[i], compounds) res_file = database + namelist[i] + "der_results.pickle" datprep.store_compounds(results, res_file) print("results were successfully stored to ", res_file) if do_plot_derivatives: yvals = [] norms_nuc = []
###analytical derivative: #results = jader.calculate_eigenvalues('CM_EV', compound_ls) ###numerical derivative: results, resultaddition = jader.calculate_num_der(numerical_representations[2], compound_ls) ###B) ###store list of results in result_file ''' #If you want to plot from multiple pickle results file, use this code: #result_file = result_folder + "results_%i-%i.pickle" %(init, end) ''' print("results:", results) print("len of results:", len(results)) datprep.store_compounds(results, results_file_OM) #C) #If you want to plot from multiple pickle results file, use this code: #these are used as file identifiers for the results_%i-%i.pickle files #numbers = ["0-200", "200-400", "400-600", "600-800", "800-1000", "1000-1200", "1200-1400", "1400-1600", "1600-1800", "1800-2000", "2000-2200", "2200-2400", "2400-2600", "2600-2800", "2800-3000", "3000-3200", "3200-3400", "3400-3600", "3600-3800", "3800-3993"] ''' #read list of compounds from data file full_compound_ls = datprep.read_compounds(results_file) #print(len(full_compound_ls), " compounds in full data file") ''' ###use if only part of dataset should be processed #try: # compound_ls = full_compound_ls[init : end]
K_test = m_c.laplacian_kernel_matrix( x_training=m_c.x_training, x_test=m_c.x_test) m_c.test_predicted_results = np.dot(K_test, alphas) mae = m_c.calculate_mae() mae_nmodels += mae print("mae_nmodels ", mae_nmodels) avg_mae = mae_nmodels / float(nModels) print("avg mae:", avg_mae) m_c.mae = avg_mae learning_list.append(m_c) mae_list.append(avg_mae) print("totally tested:", total_tested) name = rep_names[ rep] #"sigma: %.2e, lambda: %.2e" %(sigma, lamda) curve = datprep.CurveObj(name) curve.xnparray = training_no curve.ynparray = np.array(mae_list) curve_list.append(curve) final_file = "./tmp/Kernel_Results/" + final_file_list[rep] datprep.store_compounds(curve_list, final_curve_file)
# Load the compounds that the derivative calculation below works on.
compounds = datprep.read_compounds(small_data_file)

# Debug switch: when set to True, the 'CM_EV' derivative with respect to
# ("R", "R") is printed for every single compound before the batch run.
single = False
if single:
    for c in compounds:
        ev, vectors = jrep.CM_ev(c.Z, c.R, c.N)
        print("name of compound:", c.filename)
        # debugging aid: print("eigenvalue repro:\n", ev)
        derivative = jader.sort_derivative(
            'CM_EV', c.Z, c.R, c.N, 2, "R", "R")
        print(derivative)

# Batch run over all compounds; the results list is pickled for later plotting.
results, fractions = jader.calculate_eigenvalues('CM_EV', compounds)
datprep.store_compounds(results, CM_ev_result_file)
print(type(results[0]))

# Containers for the y-axis data (one list per derivative type).
dZ_percentages, dR_percentages = [], []
dZdZ_percentages, dRdR_percentages, dZdR_percentages = [], [], []

# Container for the x-axis data.
norms = []

#C)
#get all the data from our results list
def full_kernel_ridge(fingerprint_list,
                      property_list,
                      result_file,
                      set_sizes,
                      sigmas=None,
                      lambdas=None,
                      rep_no=1,
                      upperlimit=12,
                      Choose_Folder=False,
                      representation="CM"):
    '''
    Kernel ridge regression model
    y(X') = sum_i alpha_i K(X', X_i)

    For every repetition and every (sigma, lambda) pair, one model is trained
    and tested per entry of set_sizes; the mean absolute errors of these runs
    are collected in one LearningResults object.

    Input
    -----
    fingerprint_list : list of fingerprints
    property_list : list of learning data, e.g. energy values corresponding
                    to the fingerprints
    result_file : file where the learning list is stored with pickle
    set_sizes : iterable of set sizes; each entry is handed to
                make_training_test to draw one training/test split
    sigmas : iterable of fitting coefficients, passed to build_kernel_matrix
             and predict_new. None (default) is treated as empty.
    lambdas : iterable of fitting coefficients, passed to get_alphas.
              None (default) is treated as empty.
    rep_no : int, number of repetitions of the full sigma/lambda/set_sizes scan
    upperlimit : int, total of training + test set. Can be used if more data
                 is available than is being used or to bootstrap
    Choose_Folder : boolean, if True, data is stored directly to result_file.
                    If not, it is stored to
                    ./Pickled/Kernel_Results/<result_file>_<rep_no>reps
    representation : str, abbreviation for fingerprint used (prefix of the
                     raw data file names)

    Return
    ------
    learning_list : list of LearningResults objects
    raw_data_files : list of names of files where raw data was stored to

    Stored
    ------
    raw data is stored to the raw_data_files entries (below ./tmp/),
    learning_list is stored to result_file
    '''
    # Avoid mutable default arguments; a missing grid means "nothing to scan".
    sigmas = [] if sigmas is None else sigmas
    lambdas = [] if lambdas is None else lambdas

    learning_list = []
    raw_data_files = []

    if not Choose_Folder:
        print("your results are stored to ./Pickled/Kernel_Results/")
        result_file = "./Pickled/Kernel_Results/" + result_file + "_" + str(
            rep_no) + "reps"

    # loop over learning defined by number of repetitions, sigmas, and lambdas
    for rep in range(rep_no):
        for sigma in sigmas:
            for lamda in lambdas:
                # one LearningResults object per (rep, sigma, lamda)
                maes = []
                for set_size in set_sizes:
                    # draw the training and test indices for this set size
                    training_indices, test_indices = make_training_test(
                        len(fingerprint_list), set_size, upperlim=upperlimit)

                    # The comprehension index is deliberately not called
                    # `rep`: the repetition counter is needed for the raw
                    # data file name below.
                    tr_fingerprints = [
                        fingerprint_list[k] for k in training_indices
                    ]
                    tr_properties = [
                        property_list[k] for k in training_indices
                    ]
                    tr_size = len(training_indices)
                    tst_fingerprints = [
                        fingerprint_list[k] for k in test_indices
                    ]
                    tst_properties = [property_list[k] for k in test_indices]

                    # train: kernel matrix and alpha coefficients
                    K = build_kernel_matrix(tr_fingerprints, tr_size, sigma)
                    alphas = get_alphas(K, tr_properties, tr_size, lamda)

                    # predict properties of the test set
                    results, errors = predict_new(sigma, alphas,
                                                  tr_fingerprints,
                                                  tst_fingerprints,
                                                  tst_properties)
                    mae = sum(abs(errors)) / (len(errors))
                    maes.append(mae)

                    # save raw data
                    filename = './tmp/%srawdata_rep%i_sigma%s_lamda%f_set%i.dat' % (
                        representation, rep, str(sigma), lamda, set_size)
                    raw_data_files.append(filename)
                    save_raw_data(filename, tr_properties, training_indices,
                                  tst_properties, results, test_indices)

                # add learning result to list
                learning_list.append(
                    LearningResults(lamda, sigma, np.array(set_sizes),
                                    np.array(maes)))
        print("round %i successfully finished" % (rep + 1))

    # save maes with data so it can be plotted
    # NOTE(review): the flattened source does not show whether this call ran
    # once after all repetitions or after every repetition. The final file is
    # the same either way as long as store_compounds overwrites - confirm.
    datprep.store_compounds(learning_list, result_file)

    return (learning_list, raw_data_files)
with this file XYZ files can be converted to database_preparation.compounds class objects ''' #define path to folder containing xyz files database = "../Databases/QM9_XYZ/" #define path to where you want to store your data database_file = "../Databases/Pickled/qm9.pickle" #define path to where you want to store data of molecules with #less heavy atoms than in the database_file dat_ha_file = "../Databases/Pickled/qm7.pickle" #read all compounds in database file and convert to datprep class objects mol_ls, compound_ls = datprep.read_xyz_energies(database) #store compounds to database_file datprep.store_compounds(compound_ls, database_file) #store all compounds with less than 7 atoms with code below: datprep.sortby_heavyatoms(database_file, dat_ha_file, 7) ha_compounds = datprep.read_compounds(dat_ha_file) for c in compound_ls: print(c.heavy_atoms()) print("now all heavy atoms in sorted list") for i in ha_compounds: print(i.heavy_atoms())
if rep == 1: M = ZRN_rep.Eigenvalue_Coulomb_Matrix_h(compound.Z, compound.R) mol.representation = M.flatten() if rep == 2: mol.generate_bob(asize={'C': 7, 'H': 16, 'N': 6, 'O': 4, 'F': 4}) if rep == 3: M = ZRN_rep.Overlap_Matrix_h(compound.Z, compound.R) mol.representation = M.flatten() if rep == 4: M = ZRN_rep.Eigenvalue_Overlap_Matrix_h(compound.Z, compound.R) mol.representation = M.flatten() #add representation array to X_list X_list[rep].append(mol.representation) #prepare Kernel_Result raw instances (no training/test split, no sigma, no lamda) m = datprep.Kernel_Result() m.representation_name = rep_names[rep] m.x = X_list[rep] m.y = Y_energy_list CM_list.append(m) datprep.store_compounds(CM_list, "./tmp/%s_raw_Kernel_Results" % rep_names[rep])