def gen_representations(data):
    """Compute padded FCHL-ACSF representations and gradients for a dataset.

    ``data`` is indexed with the keys ``"Z"``, ``"R"`` and ``"E"``.
    ``data["Z"]`` must support ``reshape``/``tolist`` and ``data["E"]`` must
    support ``flatten`` (i.e. array-like objects).

    Returns:
        Tuple ``(reps, dreps, nuclear_charges, energies)`` where ``reps`` and
        ``dreps`` are arrays stacked over all entries, ``nuclear_charges`` is
        ``data["Z"]`` as nested lists and ``energies`` is the flattened
        ``data["E"]``.
    """
    # Pad every molecule to the largest one and use every element that occurs.
    max_atoms = max(len(charges) for charges in data["Z"])
    elements = sorted(set(data["Z"].reshape(-1).tolist()))

    print("max_atoms", max_atoms)
    print("elements", elements)

    reps = []
    dreps = []
    for idx in tqdm(range(len(data["E"]))):
        rep, drep = generate_fchl_acsf(
            data["Z"][idx],
            data["R"][idx],
            elements=elements,
            gradients=True,
            pad=max_atoms,
        )
        reps.append(rep)
        dreps.append(drep)

    energies = data["E"].flatten()
    nuclear_charges = data["Z"].tolist()

    return np.array(reps), np.array(dreps), nuclear_charges, energies
def csv_to_reps(csv_filename, n=32):
    """Read a random subset of a CSV dataset and build FCHL-ACSF representations.

    At most ``n`` rows are drawn without replacement. Each row must provide the
    columns ``coordinates``, ``nuclear_charges``, ``atomtypes``, ``forces``
    (Python literals parsed with ``ast.literal_eval``) and
    ``atomization_energy``.

    Returns:
        Tuple ``(X, dX, Q, E, F)``: representation array, gradient array, list
        of per-molecule charge arrays, flat energy array and list of
        per-molecule force arrays.
    """
    # NOTE(review): a multi-character ``sep`` is interpreted by pandas as a
    # regular expression; confirm that a plain ";" was not intended here.
    df = pandas.read_csv(csv_filename, sep=";|")

    max_n = len(df["atomization_energy"])
    n = min(max_n, n)
    index = np.random.choice(max_n, size=n, replace=False)

    print(csv_filename, max_n)

    # HACK: use a shortened cutoff for both the radial and angular terms.
    new_cut = 4.0
    cut_parameters = {"rcut": new_cut, "acut": new_cut}

    reps = []
    dreps = []
    charges = []
    energies = []
    forces = []

    for row in index:
        coordinates = np.array(ast.literal_eval(df["coordinates"][row]))
        nuclear_charges = np.array(
            ast.literal_eval(df["nuclear_charges"][row]), dtype=np.int32)
        # Parsed for validation only; the value itself is not used below.
        atomtypes = ast.literal_eval(df["atomtypes"][row])
        force = np.array(ast.literal_eval(df["forces"][row]))
        energy = float(df["atomization_energy"][row])

        rep, drep = generate_fchl_acsf(
            nuclear_charges,
            coordinates,
            gradients=True,
            pad=MAX_ATOMS,
            elements=[1, 6, 8],
            **cut_parameters,
        )

        reps.append(rep)
        dreps.append(drep)
        charges.append(nuclear_charges)
        energies.append(energy)
        forces.append(force)

    X = np.array(reps)
    dX = np.array(dreps)
    E = np.array(energies).flatten()

    # Forces are returned as a plain list of per-molecule arrays.
    return X, dX, charges, E, forces
def predict(nuclear_charges, coordinates):
    """Predict and print energy and forces for one query molecule.

    The trained model (regression coefficients, training representations and
    training charges) is loaded from ``FILENAME_ALPHAS``,
    ``FILENAME_REPRESENTATIONS`` and ``FILENAME_CHARGES`` on every call. The
    "true" reference values printed next to the predictions are hard-coded
    below and do not depend on the arguments.

    Returns ``None``; results are only printed.
    """
    # Load the trained model
    alpha = np.load(FILENAME_ALPHAS)
    X = np.load(FILENAME_REPRESENTATIONS)
    Q = np.load(FILENAME_CHARGES)

    # The query is padded to the same atom count as the training set
    max_atoms = X.shape[1]
    rep, drep = generate_fchl_acsf(
        nuclear_charges, coordinates, gradients=True, pad=max_atoms)

    query_charges = [nuclear_charges]
    query_reps = np.array([rep])
    query_dreps = np.array([drep])

    # Energy kernel and force (gradient) kernel against the training set
    kernel_energy = get_atomic_local_kernel(X, query_reps, Q, query_charges, SIGMA)
    kernel_force = get_atomic_local_gradient_kernel(
        X, query_reps, query_dreps, Q, query_charges, SIGMA)

    # Energy offset applied on top of the kernel prediction
    offset = -97084.83100465109

    energy_predicted = np.dot(kernel_energy, alpha)[0] + offset
    energy_true = -97086.55524903

    print("True energy %16.4f kcal/mol" % energy_true)
    print("Predicted energy %16.4f kcal/mol" % energy_predicted)

    n_atoms = len(nuclear_charges)
    forces_predicted = np.dot(kernel_force, alpha).reshape((n_atoms, 3))
    forces_true = np.array([
        [-66.66673100, 2.45752385, 49.92224945],
        [-17.98600137, 68.72856500, -28.82689294],
        [31.88432927, 8.98739402, -18.11946195],
        [4.19798833, -31.31692744, 8.12825145],
        [16.78395377, -24.76072606, -38.99054658],
        [6.03046276, -7.24928076, -3.88797517],
        [17.44954868, 0.21604968, 8.56118603],
        [11.73901551, -19.38200606, 13.26191987],
        [-3.43256595, 2.31940789, 9.95126984],
    ])

    print("True forces [kcal/mol]")
    print(forces_true)
    print("Predicted forces [kcal/mol]")
    print(forces_predicted)
def query(self, atoms=None, print_time=True):
    """Evaluate the model for ``atoms`` and store the energy and forces on ``self``.

    ``atoms`` must provide ``get_positions()`` and ``get_atomic_numbers()``.
    The results are written to ``self.energy`` and ``self.forces`` after
    multiplication with the conversion factors below. When ``print_time`` is
    true, the wall time and the unconverted energy are printed.
    """
    if print_time:
        start = time.time()

    # Conversion factors: kcal/mol -> eV and kcal/mol/Angstrom -> eV/Angstrom
    conv_energy = 0.0433635093659
    conv_force = 0.0433635093659

    coordinates = atoms.get_positions()
    nuclear_charges = atoms.get_atomic_numbers()
    n_atoms = coordinates.shape[0]

    # Shortened cutoff applied to both the radial and angular parts
    new_cut = 4.0
    rep, drep = generate_fchl_acsf(
        nuclear_charges,
        coordinates,
        gradients=True,
        elements=[1, 6, 8],
        pad=self.max_atoms,
        rcut=new_cut,
        acut=new_cut,
    )

    # Wrap the single query molecule in batch-shaped containers
    Qs = [nuclear_charges]
    Xs = np.array([rep], order="F")
    dXs = np.array([drep], order="F")

    kernel_energy = get_atomic_local_kernel(
        self.repr, Xs, self.charges, Qs, self.sigma)
    kernel_force = get_atomic_local_gradient_kernel(
        self.repr, Xs, dXs, self.charges, Qs, self.sigma)

    # Energy prediction
    energy_predicted = np.dot(kernel_energy, self.alphas)[0] + self.offset
    self.energy = energy_predicted * conv_energy

    # Force prediction, one (x, y, z) row per atom
    forces_predicted = np.dot(kernel_force, self.alphas).reshape((n_atoms, 3))
    self.forces = forces_predicted * conv_force

    if print_time:
        elapsed = time.time() - start
        print("qml query {:7.3f}s {:10.3f} ".format(elapsed, energy_predicted))
def _get_rep(self, confid):
    """Return the representation of conformer ``confid``, computing it on first use.

    Geometries are read from ``self._dataset['conformers'][confid]['geo']``
    and results are memoised in ``self._rep_cache``.
    """
    cache = self._rep_cache
    if confid in cache:
        return cache[confid]

    geometry = self._dataset['conformers'][confid]['geo']
    coords = np.array(geometry).reshape(-1, 3)
    cache[confid] = generate_fchl_acsf(
        self._charges, coords, pad=len(self._charges))
    return cache[confid]
def _is_duplicate(self, haystack, needle):
    """Compare the geometry ``needle`` against the conformers in ``haystack``.

    Accurate but expensive: a representation is built for ``needle`` and a
    global kernel is evaluated against the (cached) representations of all
    conformer ids in ``haystack``.

    Returns:
        ``False`` when ``haystack`` is empty, otherwise the kernel returned by
        ``get_global_kernel``.
        NOTE(review): the non-empty branch returns raw similarities rather
        than a boolean - callers have to apply a threshold themselves.
    """
    own_charges = self._charges
    needle_rep = generate_fchl_acsf(own_charges, needle, pad=len(own_charges))

    known_reps = [self._get_rep(confid) for confid in haystack]
    if not known_reps:
        return False

    # Every conformer shares the same charges, so replicate them per entry.
    haystack_charges = np.array([list(self._charges)] * len(known_reps))
    return get_global_kernel(
        np.array([needle_rep]),
        np.array(known_reps),
        np.array([self._charges]),
        haystack_charges,
        QML_FCHL_SIGMA,
    )
def get_representation(atoms, coordinates, **kwargs):
    """Return the FCHL-ACSF representation of a single molecule.

    Args:
        atoms: sequence of atoms passed through to ``generate_fchl_acsf``.
        coordinates: coordinates passed through to ``generate_fchl_acsf``.
        **kwargs: only ``max_atoms`` is read; it sets the padding size and
            defaults to ``len(atoms)``.
    """
    default_pad = len(atoms)
    pad_size = kwargs.get("max_atoms", default_pad)
    return generate_fchl_acsf(atoms, coordinates, pad=pad_size)
def query(self, atoms=None):
    """Evaluate the model for ``atoms`` and store the energy and forces on ``self``.

    ``atoms`` must provide ``get_positions()`` and ``get_atomic_numbers()``.
    The representation settings come from ``self.parameters``. Results are
    written to ``self.energy`` and ``self.forces`` after multiplication with
    the conversion factors below. With ``self.debug`` set, the wall time and
    the unconverted energy are printed.
    """
    if self.debug:
        start = time.time()

    # Conversion factors: kcal/mol -> eV and kcal/mol/Angstrom -> eV/Angstrom
    conv_energy = 0.0433635093659
    conv_force = 0.0433635093659

    coordinates = atoms.get_positions()
    nuclear_charges = atoms.get_atomic_numbers()
    n_atoms = coordinates.shape[0]

    # Representation and its gradient for the query molecule
    rep, drep = generate_fchl_acsf(
        nuclear_charges, coordinates, gradients=True, **self.parameters)

    # Wrap the single query molecule in batch-shaped containers
    Qs = [nuclear_charges]
    Xs = np.array([rep], order="F")
    dXs = np.array([drep], order="F")

    kernel_energy = get_atomic_local_kernel(
        self.repr, Xs, self.charges, Qs, self.sigma)
    kernel_force = get_atomic_local_gradient_kernel(
        self.repr, Xs, dXs, self.charges, Qs, self.sigma)

    # Energy prediction
    energy_predicted = np.dot(kernel_energy, self.alphas)[0] + self.offset
    self.energy = energy_predicted * conv_energy

    # Force prediction, one (x, y, z) row per atom
    forces_predicted = np.dot(kernel_force, self.alphas).reshape((n_atoms, 3))
    self.forces = forces_predicted * conv_force

    if self.debug:
        elapsed = time.time() - start
        print("fchl19 query {:7.3f}s {:10.3f} ".format(elapsed, energy_predicted))
def read_csv_file(filename, n=32, parameters=DEFAULT_PARAMETERS):
    """Read a random subset of a ``;``-separated dataset and build representations.

    At most ``n`` rows are drawn without replacement. Each row must provide the
    columns ``coordinates``, ``nuclear_charges``, ``atomtypes``, ``forces``
    (Python literals parsed with ``ast.literal_eval``) and
    ``atomization_energy``. ``parameters`` is forwarded as keyword arguments
    to ``generate_fchl_acsf``.

    Returns:
        Tuple of plain lists
        ``(representations, d_representations, charges, energies, forces)``.
    """
    df = pd.read_csv(filename, sep=";")

    max_n = len(df["atomization_energy"])
    n = min(max_n, n)
    random_indexes = np.random.choice(max_n, size=n, replace=False)

    representations, d_representations = [], []
    charges, energies, forces = [], [], []

    for row in random_indexes:
        # Atomistic data
        coordinates = np.array(ast.literal_eval(df["coordinates"][row]))
        nuclear_charges = np.array(
            ast.literal_eval(df["nuclear_charges"][row]), dtype=np.int32)
        # Parsed for validation only; the value itself is not used below.
        atomtypes = ast.literal_eval(df["atomtypes"][row])

        # Properties
        force = np.array(ast.literal_eval(df["forces"][row]))
        energy = float(df["atomization_energy"][row])

        # Representation and its gradient
        rep, drep = generate_fchl_acsf(
            nuclear_charges, coordinates, gradients=True, **parameters)

        representations.append(rep)
        d_representations.append(drep)
        charges.append(nuclear_charges)
        energies.append(energy)
        forces.append(force)

    return representations, d_representations, charges, energies, forces
def get_data_from_file(filename, n=100):
    """Load a random subset of a NumPy archive and build FCHL-ACSF representations.

    The archive must provide the keys ``"E"``, ``"R"``, ``"F"`` and ``"z"``.
    The same ``data["z"]`` charges are used for every sampled geometry.

    Args:
        filename: path handed to ``np.load``.
        n: number of geometries to draw without replacement. Values larger
            than the dataset are clamped to the dataset size instead of
            raising from ``np.random.choice``.

    Returns:
        Tuple ``(X, dX, Q, E, F)``: representation array, gradient array, list
        of charge arrays (one per sample), flat energy array and force array.
    """
    data = np.load(filename)

    # Fetch each array once: indexing the loaded archive by key inside the
    # loop would re-read the full array on every iteration.
    all_energies = data["E"]
    all_coordinates = data["R"]
    all_forces = data["F"]
    nuclear_charges = data["z"]

    max_n = len(all_energies)
    n = min(max_n, n)
    index = np.random.choice(max_n, size=n, replace=False)

    X = []
    dX = []
    Q = []
    E = []
    F = []

    for i in index:
        rep, drep = generate_fchl_acsf(
            nuclear_charges,
            all_coordinates[i],
            gradients=True,
            pad=MAX_ATOMS,
            elements=[1, 6, 8],
        )
        X.append(rep)
        dX.append(drep)
        Q.append(nuclear_charges)
        E.append(all_energies[i])
        F.append(all_forces[i])

    X = np.array(X)
    dX = np.array(dX)
    E = np.array(E).flatten()
    F = np.array(F)

    return X, dX, Q, E, F
def get_potential_energy(self, atoms=None, force_consistent=False):
    """Return the predicted energy of ``atoms`` scaled by ``convback_E``.

    Args:
        atoms: object providing ``get_atomic_numbers()`` and
            ``get_positions()``.
        force_consistent: accepted for interface compatibility; not used here.

    Returns:
        The kernel prediction ``Kse . alphas`` as a Python float multiplied by
        the module-level ``convback_E`` factor.
    """
    rep = generate_fchl_acsf(
        atoms.get_atomic_numbers(),
        atoms.get_positions(),
        gradients=False,
        pad=self.nAtoms,
    )

    # Batch of one query molecule
    Xs = np.array([rep])
    Qs = [atoms.get_atomic_numbers()]

    Kse = get_atomic_local_kernel(self.X, Xs, self.Q, Qs, self.sigmas)

    # ``.item()`` extracts the single predicted value; calling ``float()`` on
    # a one-element array is deprecated in recent NumPy releases.
    prediction = np.asarray(np.dot(Kse, self.alphas)).item()
    energy = float(prediction) * convback_E

    return energy
def _repr_wrapper(frame, elements, nRs2=24, nRs3=20, nFourier=1, eta2=0.32,
                  eta3=2.7, zeta=np.pi, rcut=8.0, acut=8.0,
                  two_body_decay=1.8, three_body_decay=0.57,
                  three_body_weight=13.4, stride=1):
    """Compute an element-resolved FCHL-ACSF representation for one frame.

    ``frame`` must provide ``get_atomic_numbers()`` and ``get_positions()``.
    All numeric keyword arguments except ``stride`` are forwarded unchanged to
    ``generate_fchl_acsf``; ``stride`` is not used in this function.

    Each atom's vector is written into the slot belonging to its own element
    (slots of the other elements stay zero), and the slots are then flattened,
    giving an array with ``len(elements) * rep.shape[1]`` columns per atom.
    """
    nuclear_charges = frame.get_atomic_numbers()
    coordinates = frame.get_positions()

    rep = generate_fchl_acsf(
        nuclear_charges,
        coordinates,
        elements,
        nRs2=nRs2,
        nRs3=nRs3,
        nFourier=nFourier,
        eta2=eta2,
        eta3=eta3,
        zeta=zeta,
        rcut=rcut,
        acut=acut,
        two_body_decay=two_body_decay,
        three_body_decay=three_body_decay,
        three_body_weight=three_body_weight,
        pad=False,
        gradients=False,
    )

    n_atoms, n_features = rep.shape[0], rep.shape[1]
    rep_out = np.zeros((n_atoms, len(elements), n_features))

    for atom_idx, charge in enumerate(nuclear_charges):
        # Position of this atom's element within ``elements``
        matches = np.where(np.equal(charge, elements))[0]
        element_idx = matches[0]
        rep_out[atom_idx, element_idx] = rep[atom_idx]

    return rep_out.reshape(len(rep_out), -1)
def get_forces(self, atoms=None):
    """Return predicted forces for ``atoms`` scaled by ``convback``.

    ``atoms`` must provide ``get_atomic_numbers()`` and ``get_positions()``.
    The raw kernel prediction is also stored on ``self.fYs``. The result is
    reshaped to ``(self.nAtoms, 3)``.
    """
    rep, drep = generate_fchl_acsf(
        atoms.get_atomic_numbers(),
        atoms.get_positions(),
        gradients=True,
        pad=self.nAtoms,
    )

    # Batch of one query molecule
    Qs = [atoms.get_atomic_numbers()]
    Xs = np.array([rep])
    dXs = np.array([drep])

    gradient_kernel = get_atomic_local_gradient_kernel(
        self.X, Xs, dXs, self.Q, Qs, self.sigmas)

    self.fYs = np.dot(gradient_kernel, self.alphas)
    return self.fYs.reshape((self.nAtoms, 3)) * convback
#from tutorial_data_2files import compounds #from tutorial_data_2files import energy_cc2 #from tutorial_data_2files import energy_delta if __name__ == "__main__": # For every compound generate a coulomb matrix Qall = [] print('Generating representations') #for mol in tqdm.tqdm(compounds): for mol in compounds: #mol.generate_coulomb_matrix(size=29, sorting="row-norm") mol.representation = generate_fchl_acsf(mol.nuclear_charges, mol.coordinates, gradients=False, pad=33, elements=[1, 6, 7, 8]) # mol.generate_bob(size=23, asize={"O":3, "C":7, "N":3, "H":16, "S":1}) Qall.append(mol.nuclear_charges) # Make a big 2D array with all the X = np.array([mol.representation for mol in compounds]) # X = np.array([mol.bob for mol in compounds]) #split into training/validation and final test N_train_val = len(compounds) N_final_test = len(compounds) X_train_val = X[:N_train_val] Q_train_val = Qall[:N_train_val] Y_train_val = energy_cc2[:N_train_val]
def _do_workpackage(self, molname, dihedrals, resolution):
    """Scan the given dihedrals and collect unique optimised conformers.

    For every combination of scan angles a constrained geometry is built and
    optimised. A candidate is kept only if the optimisation succeeds, the
    energy parses as a float, the SMILES still matches the input molecule,
    and no already accepted conformer of similar energy has a kernel
    similarity above ``QML_FCHL_THRESHOLD``.

    Args:
        molname: key of the molecule to fetch via
            ``_fetch_problem_description``.
        dihedrals: sequence of dihedrals to scan.
        resolution: value passed to ``self._clockwork`` to obtain the scan
            grid.

    Returns:
        Tuple ``(results, bytecost)`` where ``results`` is a dict with the
        keys ``mol``, ``dih``, ``res``, ``geo`` and ``ene``. ``bytecost`` is
        zeroed when the problem-description cache reports any hit.
    """
    ndih = len(dihedrals)
    start, step, n_steps = self._clockwork(resolution)
    scanangles = np.arange(start, start + step * n_steps, step)

    # fetch input
    (self._sdfstr, self._torsions, self._bonds, self._smiles,
     bytecost) = _fetch_problem_description(self._connection, molname)
    if _fetch_problem_description.cache_info().hits > 0:
        bytecost = 0

    element_to_charge = {"H": 1, "C": 6, "N": 7, "O": 8}

    accepted_geometries = []
    accepted_energies = []
    accepted_reps = []

    for angles in it.product(scanangles, repeat=ndih):
        # Failed geometry generation / optimisation just skips the candidate.
        # ``Exception`` (not a bare except) keeps Ctrl-C and SystemExit working.
        try:
            xyzfile, atoms, coordinates = self._get_classical_constrained_geometry(
                dihedrals, angles)
            geometry, energy = self._xtbgeoopt(xyzfile, 0)
        except Exception:
            continue

        try:
            energy = float(energy)
        except ValueError:
            continue

        # require same molecule
        try:
            newsmiles = self._get_smiles(geometry)
        except Exception:
            continue
        if newsmiles != self._smiles:
            continue

        # Only conformers with a similar energy need the expensive comparison
        compare_required = np.where(
            np.abs(np.array(accepted_energies) - energy) < ENERGY_THRESHOLD)[0]

        charges = [element_to_charge[symbol] for symbol in atoms]
        rep = generate_fchl_acsf(charges, coordinates, pad=len(atoms))

        include = True
        if len(compare_required) > 0:
            sim = get_global_kernel(
                np.array([rep]),
                np.array(accepted_reps)[compare_required],
                np.array([charges]),
                np.array([charges] * len(compare_required)),
                QML_FCHL_SIGMA,
            )
            if np.max(sim) > QML_FCHL_THRESHOLD:
                include = False

        if include:
            accepted_energies.append(energy)
            accepted_geometries.append(self._condense_geo(geometry))
            accepted_reps.append(rep)

    results = {
        "mol": molname,
        "dih": dihedrals,
        "res": resolution,
        "geo": accepted_geometries,
        "ene": accepted_energies,
    }
    return results, bytecost
def train():
    """Train an energy+force kernel model and save it to ``.npy`` files.

    Reads properties with ``get_properties("train")`` and geometries from
    ``xyz/<name>.xyz``, builds FCHL-ACSF representations with gradients,
    solves for the regression coefficients with ``svd_solve`` and writes
    ``X_active_learning.npy``, ``alphas_active_learning.npy`` and
    ``Q_active_learning.npy``. Training-set MAE and linear-regression
    statistics for energies and forces are printed at the end.
    """
    data = get_properties("train")

    mols = []
    SIGMA = 2.5

    for name in sorted(data.keys()):
        mol = qml.Compound()
        mol.read_xyz("xyz/" + name + ".xyz")

        # Associate the first property entry with the compound
        mol.properties = data[name][0]
        mols.append(mol)

    shuffle(mols)

    # REPRESENTATIONS
    print("\n -> calculate representations")
    start = time()

    x = []
    disp_x = []
    f = []
    e = []
    q = []

    for mol in mols:
        (x1, dx1) = generate_fchl_acsf(mol.nuclear_charges,
                                       mol.coordinates,
                                       gradients=True,
                                       pad=23,
                                       elements=[1, 6, 7, 8, 16, 17])

        e.append(mol.properties)
        # ``mol.name[4:-4]`` drops 4 leading and 4 trailing characters to
        # recover the key used in ``data`` (presumably the "xyz/" prefix and
        # ".xyz" suffix - confirm against ``read_xyz``).
        f.append(data[(mol.name)[4:-4]][1])
        x.append(x1)
        disp_x.append(dx1)
        q.append(mol.nuclear_charges)

    X_train = np.array(x)
    F_train = np.array(f)
    F_train *= -1
    E_train = np.array(e)
    dX_train = np.array(disp_x)
    Q_train = q

    # Energies are centred before fitting
    E_mean = np.mean(E_train)
    E_train -= E_mean

    F_train = np.concatenate(F_train)

    end = time()
    print(end - start)
    print("")

    print(" -> calculating Kernels")
    start = time()
    Kte = get_atomic_local_kernel(X_train, X_train, Q_train, Q_train, SIGMA)
    Kt = get_atomic_local_gradient_kernel(X_train, X_train, dX_train,
                                          Q_train, Q_train, SIGMA)

    # Stack energy rows on top of force rows for the joint fit
    C = np.concatenate((Kte, Kt))
    Y = np.concatenate((E_train, F_train.flatten()))
    end = time()
    print(end - start)
    print("")

    print("Alphas operator ...")
    start = time()
    alpha = svd_solve(C, Y, rcond=1e-12)
    end = time()
    print(end - start)
    print("")

    print("save X")
    np.save('X_active_learning.npy', X_train)

    print("save alphas")
    np.save('alphas_active_learning.npy', alpha)

    print("save Q")
    np.save('Q_active_learning.npy', Q_train)

    eYt = np.dot(Kte, alpha)
    fYt = np.dot(Kt, alpha)

    # linregress returns the Pearson r; square it so the printed value
    # matches its "r^2" label.
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        E_train, eYt)
    print("TRAINING ENERGY MAE = %10.4f slope = %10.4f intercept = %10.4f r^2 = %9.6f" % \
            (np.mean(np.abs(E_train - eYt)), slope, intercept, r_value ** 2))

    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        F_train.flatten(), fYt.flatten())
    print("TRAINING FORCE MAE = %10.4f slope = %10.4f intercept = %10.4f r^2 = %9.6f" % \
             (np.mean(np.abs(F_train.flatten() - fYt.flatten())), slope, intercept, r_value ** 2))
def predict_only():
    """Predict and print energy and forces for one hard-coded molecule.

    The trained model is loaded from ``data/training_alphas.npy``,
    ``data/training_X.npy`` and ``data/training_Q.npy``. Predictions are
    printed next to hard-coded reference values; nothing is returned.
    """
    # Load the trained model
    alpha = np.load("data/training_alphas.npy")
    X = np.load("data/training_X.npy")
    Q = np.load("data/training_Q.npy")

    # Query molecule: nuclear charges and Cartesian coordinates
    nuclear_charges = np.array([6, 6, 8, 1, 1, 1, 1, 1, 1])
    coordinates = np.array([
        [0.07230959, 0.61441211, -0.03115568],
        [-1.26644639, -0.27012846, -0.00720771],
        [1.11516977, -0.30732869, 0.06414394],
        [0.10673943, 1.44346835, -0.79573006],
        [-0.02687486, 1.19350887, 0.98075343],
        [-2.06614011, 0.38757505, 0.39276693],
        [-1.68213881, -0.60620688, -0.97804526],
        [-1.18668224, -1.07395366, 0.67075071],
        [1.37492532, -0.56618891, -0.83172035],
    ])

    # The query is padded to the same atom count as the training set
    max_atoms = X.shape[1]
    rep, drep = generate_fchl_acsf(
        nuclear_charges, coordinates, gradients=True, pad=max_atoms)

    query_charges = [nuclear_charges]
    query_reps = np.array([rep])
    query_dreps = np.array([drep])

    SIGMA = 10.0

    # Energy kernel and force (gradient) kernel against the training set
    kernel_energy = get_atomic_local_kernel(X, query_reps, Q, query_charges, SIGMA)
    kernel_force = get_atomic_local_gradient_kernel(
        X, query_reps, query_dreps, Q, query_charges, SIGMA)

    # Energy offset applied on top of the kernel prediction
    offset = -97084.83100465109

    energy_predicted = np.dot(kernel_energy, alpha)[0] + offset
    energy_true = -97086.55524903

    print("True energy %16.4f kcal/mol" % energy_true)
    print("Predicted energy %16.4f kcal/mol" % energy_predicted)

    n_atoms = len(nuclear_charges)
    forces_predicted = np.dot(kernel_force, alpha).reshape((n_atoms, 3))
    forces_true = np.array([
        [-66.66673100, 2.45752385, 49.92224945],
        [-17.98600137, 68.72856500, -28.82689294],
        [31.88432927, 8.98739402, -18.11946195],
        [4.19798833, -31.31692744, 8.12825145],
        [16.78395377, -24.76072606, -38.99054658],
        [6.03046276, -7.24928076, -3.88797517],
        [17.44954868, 0.21604968, 8.56118603],
        [11.73901551, -19.38200606, 13.26191987],
        [-3.43256595, 2.31940789, 9.95126984],
    ])

    print("True forces [kcal/mol]")
    print(forces_true)
    print("Predicted forces [kcal/mol]")
    print(forces_predicted)
for xyz_file in sorted(data.keys()): mol = qml.Compound() mol.read_xyz(xyz_file) mol.properties = data[xyz_file] mol.name = xyz_file mols.append(mol) x = [] q = [] list_of_elements = [1, 5, 6, 7, 8, 9, 17, 35] for mol in mols: x1 = generate_fchl_acsf(mol.nuclear_charges, mol.coordinates, gradients=False, pad=21, elements=list_of_elements) x.append(x1) q.append(mol.nuclear_charges) X = np.array(x) Q = q K = get_global_kernel(X, X, Q, Q, .64) Y = np.asarray([mol.properties for mol in mols]) lst = [mol.name for mol in mols] lst_old = len(lst) for i in range(len(Y)):
def repr_wrapper(frame, elements, is_periodic=False, nRs2=24, nRs3=20,
                 nFourier=1, eta2=0.32, eta3=2.7, zeta=np.pi, rcut=8.0,
                 acut=8.0, two_body_decay=1.8, three_body_decay=0.57,
                 three_body_weight=13.4, stride=1):
    '''
    Element-resolved FCHL19 representation of a single frame.

    Each atom's vector is written into the slot belonging to its own element
    (slots of all other elements stay zero) and the slots are flattened, so
    the result has ``len(elements) * rep.shape[1]`` columns per atom.

    Periodic systems are not implemented for FCHL19.

    :param frame: object providing ``get_atomic_numbers()`` and
        ``get_positions()`` (ase Atoms class)
    :param elements: list of unique nuclear charges (atom types)
    :type elements: numpy array
    :param is_periodic: whether the system is periodic; ``True`` raises
        ``NotImplementedError``
    :type is_periodic: bool
    :param nRs2: Number of gaussian basis functions in the two-body terms
    :type nRs2: integer
    :param nRs3: Number of gaussian basis functions in the three-body radial part
    :type nRs3: integer
    :param nFourier: Order of Fourier expansion
    :type nFourier: integer
    :param eta2: Precision in the gaussian basis functions in the two-body terms
    :type eta2: float
    :param eta3: Precision in the gaussian basis functions in the three-body radial part
    :type eta3: float
    :param zeta: Precision parameter of basis functions in the three-body angular part
    :type zeta: float
    :param rcut: forwarded to ``generate_fchl_acsf`` as ``rcut``
    :type rcut: float
    :param acut: forwarded to ``generate_fchl_acsf`` as ``acut``
    :type acut: float
    :param two_body_decay: exponential decay for the two body function
    :type two_body_decay: float
    :param three_body_decay: exponential decay for the three body function
    :type three_body_decay: float
    :param three_body_weight: relative weight of the three body function
    :type three_body_weight: float
    :param stride: not used in this function
    '''
    if is_periodic:
        raise NotImplementedError('Periodic system not implemented!')

    nuclear_charges = frame.get_atomic_numbers()
    coordinates = frame.get_positions()

    rep = generate_fchl_acsf(
        nuclear_charges,
        coordinates,
        elements,
        nRs2=nRs2,
        nRs3=nRs3,
        nFourier=nFourier,
        eta2=eta2,
        eta3=eta3,
        zeta=zeta,
        rcut=rcut,
        acut=acut,
        two_body_decay=two_body_decay,
        three_body_decay=three_body_decay,
        three_body_weight=three_body_weight,
        pad=False,
        gradients=False,
    )

    n_atoms, n_features = rep.shape[0], rep.shape[1]
    rep_out = np.zeros((n_atoms, len(elements), n_features))

    for atom_idx, charge in enumerate(nuclear_charges):
        # Position of this atom's element within ``elements``
        matches = np.where(np.equal(charge, elements))[0]
        element_idx = matches[0]
        rep_out[atom_idx, element_idx] = rep[atom_idx]

    return rep_out.reshape(len(rep_out), -1)
def overview_properties_pca():
    """Plot a 2-component kernel PCA of the subset structures, coloured by property.

    Reads properties from ``data/sdf/subset_properties.csv`` (one float per
    line) and structures from ``data/sdf/subset_structures.sdf``, uses at most
    the first 500 structures, builds FCHL-ACSF representations and a local
    kernel, and saves a scatter plot plus the kernel image to
    ``_tmp_pca_prop.png``.
    """
    n_items = 500

    with open('data/sdf/subset_properties.csv', 'r') as f:
        properties = np.array([float(line) for line in f.readlines()])

    molobjs = cheminfo.read_sdffile("data/sdf/subset_structures.sdf")

    elements = []
    mols_atoms = []
    mols_coord = []
    n_atoms = 0

    for count, molobj in enumerate(molobjs, start=1):
        atoms, coord = cheminfo.molobj_to_xyz(molobj)
        mols_atoms.append(atoms)
        mols_coord.append(coord)

        # Keep a running list of the unique elements seen so far
        elements = list(np.unique(elements + list(np.unique(atoms))))

        # Track the largest molecule for padding
        if len(atoms) > n_atoms:
            n_atoms = len(atoms)

        if count == n_items:
            break

    properties = properties[:n_items]

    print(elements)
    print(n_atoms)
    print(len(mols_atoms))

    distance_cut = 20.0
    parameters = {
        "pad": n_atoms,
        'nRs2': 22,
        'nRs3': 17,
        'eta2': 0.41,
        'eta3': 0.97,
        'three_body_weight': 45.83,
        'three_body_decay': 2.39,
        'two_body_decay': 2.39,
        "rcut": distance_cut,
        "acut": distance_cut,
        "elements": elements,
    }

    representations = np.array([
        generate_fchl_acsf(atoms, coord, **parameters)
        for atoms, coord in zip(mols_atoms, mols_coord)
    ])

    sigma = 10.
    kernel = qml.kernels.get_local_kernel(
        representations, representations, mols_atoms, mols_atoms, sigma)
    print(kernel.shape)

    pca = kpca(kernel, n=2)

    fig, axs = plt.subplots(2, 1, figsize=(5, 10))

    # Top panel: PCA components coloured by property
    scatter = axs[0].scatter(*pca, c=properties)
    fig.colorbar(scatter, ax=axs[0])

    # Bottom panel: the kernel matrix itself
    image = axs[1].imshow(kernel)
    fig.colorbar(image, ax=axs[1])

    fig.savefig("_tmp_pca_prop.png")
def query(self, atoms=None, print_time=True):
    """Evaluate the model for ``atoms`` and store the energy and forces on ``self``.

    ``atoms`` must provide ``get_positions()`` and ``get_atomic_numbers()``.
    When ``self.reducer`` is set, the representation and its gradient are
    projected through it before the kernels are evaluated. Results are written
    to ``self.energy`` and ``self.forces``. Timing values are collected but
    nothing is printed.
    """
    if print_time:
        start = time.time()

    # Conversion factors are currently disabled (identity); the kcal/mol -> eV
    # value would be 0.0433635093659.
    conv_energy = 1.0
    conv_force = 1.0

    coordinates = atoms.get_positions()
    nuclear_charges = atoms.get_atomic_numbers()
    n_atoms = coordinates.shape[0]

    # --- representation ---------------------------------------------------
    rep_start = time.time()

    rep, drep = generate_fchl_acsf(
        nuclear_charges,
        coordinates,
        gradients=True,
        elements=[1, 6, 8],
        pad=self.max_atoms,
    )

    Qs = [nuclear_charges]
    Xs = np.array([rep], order="F")
    dXs = np.array([drep], order="F")

    reducer = self.reducer
    if reducer is not None:
        # Project the feature axis of both arrays through the reducer
        Xs = np.einsum("ijk,kl->ijl", Xs, reducer)
        dXs = np.einsum("ijkmn,kl->ijlmn", dXs, reducer)

    rep_end = time.time()

    # --- kernels ----------------------------------------------------------
    kernel_start = time.time()
    kernel_energy = get_atomic_local_kernel(
        self.repr, Xs, self.charges, Qs, self.sigma)
    kernel_force = get_atomic_local_gradient_kernel(
        self.repr, Xs, dXs, self.charges, Qs, self.sigma)
    kernel_end = time.time()

    # --- predictions ------------------------------------------------------
    pred_start = time.time()

    energy_predicted = np.dot(kernel_energy, self.alphas)[0] + self.offset
    self.energy = energy_predicted * conv_energy

    forces_predicted = np.dot(kernel_force, self.alphas).reshape((n_atoms, 3))
    self.forces = forces_predicted * conv_force

    pred_end = time.time()

    if print_time:
        # Timing output is disabled; only the end time is recorded.
        end = time.time()
def prepare_training_data_qmepa890():
    """Build FCHL-ACSF representations for the QMePA890 neutral/protonated pairs.

    Reads ``data/qmepa890/data.csv`` (no header) and, for every row, the
    structures ``structures/<id>.xyz`` (neutral) and ``structures/<id>_+.xyz``
    (protonated). In the protonated structure the atom at the proton index is
    relabelled as pseudo-element ``12`` so it is distinguishable from ordinary
    hydrogens.

    Returns:
        Tuple ``(n_representations, p_representations, n_coord_list,
        p_coord_list, n_atoms_list, p_atoms_list, proton_idxs, energies,
        atomization_list)`` where ``atomization_list`` holds, per neutral
        molecule, the sum of the free-atom energies in kcal/mol.
    """
    # Table 5. Free atom energies from DFT/PBE0/def2TZVP.
    #                H          C         N         O         S
    # Multiplicity   2          3         4         3         3
    # Energy / Eh   -0.501036  -37.8054  -54.5438  -75.0186  -397.974
    au2kcal = 627.518135759111
    atom_energies = {
        "H": -0.501036 * au2kcal,
        "C": -37.8054 * au2kcal,
        "N": -54.5438 * au2kcal,
        "O": -75.0186 * au2kcal,
        "S": -397.974 * au2kcal,
    }

    distance_cut = 20.0
    parameters = {
        "pad": 25,
        'nRs2': 22,
        'nRs3': 17,
        'eta2': 0.41,
        'eta3': 0.97,
        'three_body_weight': 45.83,
        'three_body_decay': 2.39,
        'two_body_decay': 2.39,
        "rcut": distance_cut,
        "acut": distance_cut,
        "elements": [1, 6, 7, 8, 12]
    }

    dirprefix = "data/qmepa890/"
    filename = dirprefix + "data.csv"

    # Columns of data.csv:
    # 1. File ID (e.g. 0415 means the information pertains to the files `0415.xyz` and `0415_+.xyz`)
    # 2. Index of the proton (in the `XXXX_+.xyz` file listed in the same row)
    # 3. Gas-phase energy of neutral molecule plus thermal corrections from vibrational analysis
    # 4. Gas-phase energy of protonated molecule plus thermal corrections from vibrational analysis
    # 5. Gas-phase energy of neutral molecule
    # 6. Gas-phase energy of protonated molecule
    # 7. Energy of neutral molecule using SMD implicit solvent model
    # 8. Energy of protonated molecule using SMD implicit solvent model
    # 9. PM6 heat-of-formation of neutral molecule using COSMO implicit solvent model
    # 10. PM6 heat-of-formation of protonated molecule using COSMO implicit solvent model
    df = pd.read_csv(filename, sep=",", header=None)

    molecule_names = df.iloc[:, 0]
    proton_idxs = df.iloc[:, 1]
    energies = df.iloc[:, 2:]

    p_representations = []
    p_coord_list = []
    p_atoms_list = []

    n_representations = []
    n_coord_list = []
    n_atoms_list = []

    atomization_list = []

    for h_idx, name in zip(proton_idxs, molecule_names):

        # File IDs are zero-padded to four digits
        name = str(name).zfill(4)
        print(f"representing {name}")

        # Neutral state
        atoms, coord = rmsd.get_coordinates_xyz(dirprefix + "structures/" + name + ".xyz")

        atomization_list.append(sum(atom_energies[atom] for atom in atoms))

        atoms = [cheminfo.convert_atom(atom) for atom in atoms]

        n_representation = generate_fchl_acsf(atoms, coord, **parameters)
        n_representations.append(n_representation)
        n_coord_list.append(coord)
        n_atoms_list.append(atoms)

        # Protonated state
        atoms, coord = rmsd.get_coordinates_xyz(dirprefix + "structures/" + name + "_+.xyz")
        atoms = [cheminfo.convert_atom(atom) for atom in atoms]

        # Relabel the proton as pseudo-element 12. The ``- 1`` converts the CSV
        # index to a list index (presumably 1-based in the file - confirm).
        atoms[h_idx - 1] = 12

        p_representation = generate_fchl_acsf(atoms, coord, **parameters)
        # Store the protonated representation (previously the neutral one was
        # appended here by mistake).
        p_representations.append(p_representation)
        p_coord_list.append(coord)
        p_atoms_list.append(atoms)

    proton_idxs = np.array(proton_idxs)
    n_representations = np.array(n_representations)
    p_representations = np.array(p_representations)
    atomization_list = np.array(atomization_list)

    return n_representations, p_representations, n_coord_list, p_coord_list, n_atoms_list, p_atoms_list, proton_idxs, energies, atomization_list
def prepare_training_data_protonafinity():
    """Build FCHL-ACSF representations for the proton-affinity dataset.

    Reads ``data/dataset-proton-affinity/data/pm3_properties.csv`` and, for
    every row, the neutral structure ``xyz<MoleculeIdx>_n.xyz`` and the
    protonated structure ``xyz<MoleculeIdx>_<ProtonatedIdx>.xyz`` from
    ``pm3.cosmo.mop/``. In the protonated structure the atom carrying the
    positive formal charge (located via the protonated SMILES) is relabelled
    as pseudo-element ``12``.

    Returns:
        Tuple ``(n_representations, p_representations, n_coord_list,
        p_coord_list, n_atoms_list, p_atoms_list, energies)`` where
        ``energies`` is an array stacking the neutral and protonated energy
        columns.

    Raises:
        AssertionError: if a protonated SMILES cannot be parsed or does not
            contain exactly one positively charged atom.
    """
    distance_cut = 20.0
    parameters = {
        "pad": 25,
        'nRs2': 22,
        'nRs3': 17,
        'eta2': 0.41,
        'eta3': 0.97,
        'three_body_weight': 45.83,
        'three_body_decay': 2.39,
        'two_body_decay': 2.39,
        "rcut": distance_cut,
        "acut": distance_cut,
        "elements": [1, 6, 7, 8, 9, 12]
    }

    dirprefix = "data/dataset-proton-affinity/data/"
    filename = dirprefix + "pm3_properties.csv"

    df = pd.read_csv(filename, sep=",")
    n_rows = df.shape[0]

    # column names
    col_neuidx = "MoleculeIdx"
    col_proidx = "ProtonatedIdx"
    col_refsmi = "ReferenceSmiles"
    col_prosmi = "ProtonatedSmiles"
    col_neueng = "NeutralEnergy"
    col_proeng = "ProtonatedEnergy"

    # Collect energies
    energies_neutr = df[col_neueng]
    energies_proto = df[col_proeng]
    energies = np.array([energies_neutr, energies_proto])

    # Protonated representation
    p_representations = []
    p_coord_list = []
    p_atoms_list = []

    # Neutral representation
    n_representations = []
    n_coord_list = []
    n_atoms_list = []

    for _, row in tqdm.tqdm(df.iterrows(), desc="Preparing FCHL19", total=n_rows, **TQDM_OPTIONS):

        nidx = row[col_neuidx]
        pidx = row[col_proidx]

        nname = f"xyz{nidx}_n.xyz"
        pname = f"xyz{nidx}_{pidx}.xyz"

        # Neutral state
        atoms, coord = rmsd.get_coordinates_xyz(dirprefix + "pm3.cosmo.mop/" + nname)
        atoms = [cheminfo.convert_atom(atom) for atom in atoms]

        n_representation = generate_fchl_acsf(atoms, coord, **parameters)
        n_representations.append(n_representation)
        n_coord_list.append(coord)
        n_atoms_list.append(atoms)

        # Protonated state
        atoms, coord = rmsd.get_coordinates_xyz(dirprefix + "pm3.cosmo.mop/" + pname)
        atoms = [cheminfo.convert_atom(atom) for atom in atoms]

        # Find protonated atom: the one with a positive formal charge
        smiles = row[col_prosmi]
        molobj = cheminfo.smiles_to_molobj(smiles)
        assert molobj is not None, f"Molobj failed for {smiles}"

        atom_charges = np.array(
            [atom.GetFormalCharge() for atom in molobj.GetAtoms()])

        charged_idxs, = np.where(atom_charges > 0)
        assert len(charged_idxs) == 1, f"Should only be one charged atom in {pname}"
        charged_idx = charged_idxs[0]

        # Set the charged atom (nitrogen) to the heavy pseudo-element.
        # NOTE(review): this assumes the SMILES atom order matches the xyz
        # atom order - confirm.
        atoms[charged_idx] = 12

        p_representation = generate_fchl_acsf(atoms, coord, **parameters)
        # Store the protonated representation (previously the neutral one was
        # appended here by mistake).
        p_representations.append(p_representation)
        p_coord_list.append(coord)
        p_atoms_list.append(atoms)

    n_representations = np.array(n_representations)
    p_representations = np.array(p_representations)

    return n_representations, p_representations, n_coord_list, p_coord_list, n_atoms_list, p_atoms_list, energies