def add_mol_to_training(self, new_system, pun, atom=None, xyz=None):
    """Add a molecule and its reference multipoles to the training set.

    Parameters
    ----------
    new_system
        System object; its ``xyz[0]``, ``elements``, ``num_atoms``,
        ``multipoles`` and ``basis`` attributes are read or written here.
    pun
        File handed to ``new_system.load_mtp_from_hipart`` to read the
        reference multipole moments.
    atom : optional
        When given, only atoms whose element equals ``atom`` are added.
    xyz : optional
        Fallback geometry file used when ``new_system.xyz[0]`` is ``None``.

    Raises
    ------
    ValueError
        If neither ``new_system.xyz[0]`` nor ``xyz`` provides a geometry.
    Exception
        If the number of loaded multipoles differs from the atom count.

    Notes
    -----
    The per-element molecule counter is incremented once for every element
    that contributed at least one atom from this molecule.  The previous
    code incremented only the counter of the last element visited by the
    loop, which could hit an uninitialised key (or an unbound loop
    variable for an empty element list).
    """
    new_system.initialize_multipoles()
    # Don't build SLATM yet, only add information to mbtypes
    xyz_source = new_system.xyz[0]
    if xyz_source is None:
        if xyz is None:
            raise ValueError("Missing xyz file")
        xyz_source = xyz
    mol = qml.Compound(xyz_source)
    self.qml_mols.append(mol)
    # Per-atom 0/1 mask selecting the atoms that belong to the training set
    if atom is None:
        self.qml_filter_ele.append([1 for _ in range(mol.natoms)])
    else:
        self.qml_filter_ele.append([
            1 if (str(mol.atomtypes[i]) == atom) else 0
            for i in range(mol.natoms)
        ])
    new_system.multipoles = np.empty((new_system.num_atoms, 9))
    # Read in multipole moments from txt file
    new_system.load_mtp_from_hipart(pun, rotate=False)
    if len(new_system.multipoles) != new_system.num_atoms:
        raise Exception("Wrong number of charges in %s" % (pun))
    added_elements = set()
    for i, ele_i in enumerate(new_system.elements):
        if atom is not None and ele_i != atom:
            continue
        if ele_i not in self.target_train:
            self.target_train[ele_i] = []
            self.descr_train[ele_i] = []
            self.num_mols_train[ele_i] = 0
        # Rotate system until atom pairs point in all x,y,z directions.
        # NOTE(review): called once per atom as before; presumably it
        # refreshes new_system.basis -- confirm before hoisting.
        new_system.compute_basis()
        basis_i = new_system.basis[i]
        mtp_i = new_system.multipoles[i]
        new_target_train = [
            # charge
            [mtp_i[0]],
            # dipole, projected with the transposed local basis
            np.dot(mtp_i[1:4], basis_i.T),
            # quadrupole: spherical -> cartesian, transformed, flattened
            np.dot(np.dot(basis_i, utils.spher_to_cart(mtp_i[4:9])),
                   basis_i.T).reshape((9, )),
        ]
        self.target_train[ele_i].append(new_target_train)
        added_elements.add(ele_i)
    # One molecule more for every element that actually contributed atoms
    for ele in added_elements:
        self.num_mols_train[ele] += 1
    self.logger.info("Added file to training set: %s" % new_system)
    return None
def _to_qml(ds, nmol, sublist): """Returns a list of nmol qml.Molecule objects. * ds :: dataset objects * nmol :: number of molecules * sublist :: list of indices of molecules to be converted to ase Atoms object. """ list_of_mol = [] if nmol == None: nmol = ds.nmol if sublist == None: sublist = ds.list_of_mol[:nmol] else: sublist = np.array(ds.list_of_mol)[sublist] for m in sublist: qmlc = qml.Compound() qmlc.natoms = m.natm qmlc.atomtypes = m.symb qmlc.nuclear_charges = m.z qmlc.coordinates = m.R list_of_mol.append(qmlc) return list_of_mol
def find_similar_local_environments(filename, element=6):
    """Group atoms of one element by similarity of their Coulomb-matrix rows.

    Parameters
    ----------
    filename
        xyz file passed to ``qml.Compound``.
    element : int
        Nuclear charge of the atoms to compare (default 6).

    Returns
    -------
    list of list
        One list of zero-based atom indices per group of similar atoms.
        Atoms that are similar to no other atom are not reported; fewer
        than two atoms of ``element`` yields ``[]``.
    """
    c = qml.Compound(xyz=filename)

    # relevant atoms
    atoms = np.where(c.nuclear_charges == element)[0]
    if len(atoms) < 2:
        return []

    # get coulomb matrix (packed lower triangle)
    a = qml.representations.generate_coulomb_matrix(c.nuclear_charges,
                                                    c.coordinates,
                                                    size=c.natoms,
                                                    sorting='unsorted')

    # reconstruct full symmetric matrix
    s = np.zeros((c.natoms, c.natoms))
    s[np.tril_indices(c.natoms)] = a
    # np.diag hands back a view of ``s``; copy it so the in-place
    # symmetrisation below does not double the saved diagonal.
    d = np.diag(s).copy()
    s += s.T
    s[np.diag_indices(c.natoms)] = d

    # find similar sites: link two atoms when their sorted rows are closer
    # than 1 in Euclidean norm
    accepted = nx.Graph()
    sorted_elements = [np.sort(row) for row in s[atoms]]
    for i in range(len(atoms)):
        for j in range(i + 1, len(atoms)):
            dist = np.linalg.norm(sorted_elements[i] - sorted_elements[j])
            if dist < 1:
                accepted.add_edge(atoms[i], atoms[j])

    # connected_component_subgraphs is gone from current networkx;
    # connected_components yields the node sets directly.
    return [sorted(component) for component in nx.connected_components(accepted)]
def test_slatm_representation():
    """Compare SLATM representations of ten QM7 molecules with stored data.

    The molecules ``qm7/0001.xyz`` .. ``qm7/0010.xyz`` next to this file are
    loaded, the many-body types are derived from their nuclear charges, and
    the resulting representations must match
    ``data/slatm_representation.txt``.
    """
    path = os.path.dirname(os.path.realpath(__file__))
    print(path)

    files = ["qm7/%04d.xyz" % number for number in range(1, 11)]
    mols = [qml.Compound(xyz=path + "/" + xyz_file) for xyz_file in files]

    charges = np.array([mol.nuclear_charges for mol in mols])
    mbtypes = get_slatm_mbtypes(charges)

    for mol in mols:
        mol.generate_slatm(mbtypes)

    X_qml = np.array([mol.representation for mol in mols])
    X_ref = np.loadtxt(path + "/data/slatm_representation.txt")

    assert np.allclose(X_qml, X_ref), "Error in SLATM generation"
def read_xyz_qml(pathway):
    '''Read every file in a folder as an xyz compound.

    input
    -----
    pathway: string, path to the folder containing the '.xyz' files.
             A trailing '/' is accepted but no longer required.

    output
    ------
    compoundlist: list of the qml.Compound objects, one per file
    ZRN_data: list of (Z, R, N) tuples, one per compound, where
              Z is the nuclear charges cast to float,
              R is the compound's coordinates and
              N is the number of atoms as a float
    '''
    compoundlist = []
    ZRN_data = []

    print("iterate over all molecules")

    # Use the argument; the earlier code read an unrelated global name.
    for xyzfile in os.listdir(pathway):
        xyz_fullpath = os.path.join(pathway, xyzfile)
        compound = qml.Compound(xyz_fullpath)

        print("compound %s" % xyzfile)

        Z = compound.nuclear_charges.astype(float)
        R = compound.coordinates
        N = float(len(Z))

        compoundlist.append(compound)
        # list.append takes a single object, so store the triple as a tuple
        ZRN_data.append((Z, R, N))

    return (compoundlist, ZRN_data)
def test_krr_cmat():
    """Kernel ridge regression on row-norm sorted Coulomb matrices.

    Uses the first 1000 entries (sorted by file name) of the QM7 heats of
    formation, trains on 700 shuffled molecules with a Laplacian kernel and
    requires a mean absolute error below 6.0 on 300 test molecules.
    """
    test_dir = os.path.dirname(os.path.realpath(__file__))

    # Heats of formation keyed by xyz file name
    data = get_energies(test_dir + "/data/hof_qm7.txt")

    mols = []
    for xyz_file in sorted(data.keys())[:1000]:
        mol = qml.Compound(xyz=test_dir + "/qm7/" + xyz_file)
        mol.properties = data[xyz_file]
        # Molecular Coulomb matrix sorted by row norm
        mol.generate_coulomb_matrix(size=23, sorting="row-norm")
        mols.append(mol)

    # Fixed seed keeps the split reproducible
    np.random.seed(666)
    np.random.shuffle(mols)

    n_test = 300
    n_train = 700
    training, test = mols[:n_train], mols[-n_test:]

    def stack(attribute, subset):
        return np.array([getattr(mol, attribute) for mol in subset])

    X = stack("representation", training)
    Xs = stack("representation", test)
    Y = stack("properties", training)
    Ys = stack("properties", test)

    # Hyper-parameters
    sigma = 10**(4.2)
    llambda = 10**(-10.0)

    # Regularised training kernel and regression coefficients
    K = laplacian_kernel(X, X, sigma)
    K[np.diag_indices_from(K)] += llambda
    alpha = cho_solve(K, Y)

    # Predictions for the held-out molecules
    Ks = laplacian_kernel(X, Xs, sigma)
    Yss = np.dot(Ks.transpose(), alpha)

    mae = np.mean(np.abs(Ys - Yss))
    assert mae < 6.0, "ERROR: Too high MAE!"
def get_data():
    """Generate Coulomb matrices and heats of formation for QM7.

    Returns
    -------
    X : numpy array of the row-norm sorted Coulomb matrices (size 23) of the
        first 1000 molecules, ordered by file name
    Y : numpy array of the matching properties read from ``hof_qm7.txt``
    sigma : kernel width, fixed at ``10**4.2``
    """
    test_dir = os.path.dirname(os.path.realpath(__file__))

    # Heats of formation keyed by xyz file name
    data = get_energies(test_dir + "/data/hof_qm7.txt")

    mols = []
    selected_files = sorted(data.keys())[:1000]
    for xyz_file in selected_files:
        compound = qml.Compound(xyz=test_dir + "/qm7/" + xyz_file)
        compound.properties = data[xyz_file]
        # Molecular Coulomb matrix sorted by row norm
        compound.generate_coulomb_matrix(size=23, sorting="row-norm")
        mols.append(compound)

    X = np.array([compound.representation for compound in mols])
    Y = np.array([compound.properties for compound in mols])
    sigma = 10**(4.2)

    return X, Y, sigma
def calc_coloumb_matrices(size, save_path=None):
    '''
    Build the Coulomb matrix of every structure in ./data/structures.

    :param size: int
        Max num of atoms per molecule found in datasets
    :param save_path: str
        If provided, the resulting dict is pickled to save_path
    :return: Dict of 2D numpy arrays
        Keys are molecule names (file name up to the first '.'),
        values are the matrices returned by inv_tri
    '''
    CMs = {}
    for xyz_path in tqdm.tqdm(glob.glob('./data/structures/*.xyz')):
        base_name = xyz_path.split('/')[-1]
        mol_name = base_name.split('.')[0]

        mol = qml.Compound(xyz=xyz_path)
        # After experiments, seems upper triangle CM was concated in fortran-order
        mol.generate_coulomb_matrix(size=size, sorting='unsorted')

        # Expand the packed triangle and store it under the molecule name
        CMs[mol_name] = inv_tri(mol.representation, size=size)

    if save_path is None:
        return CMs

    with open(save_path, 'wb') as h:
        pickle.dump(CMs, h, protocol=pickle.HIGHEST_PROTOCOL)
    return CMs
def test_compound():
    """Check that qml.Compound parses plain and extended xyz files.

    Both ``data/compound_test.xyz`` and ``data/compound_test.exyz`` must
    yield the same atom types and nuclear charges.
    """
    test_dir = os.path.dirname(os.path.realpath(__file__))

    ref_atomtypes = ['C', 'Cl', 'Br', 'H', 'H']
    ref_charges = [6, 17, 35, 1, 1]

    c = qml.Compound(xyz=test_dir + "/data/compound_test.xyz")
    assert compare_lists(ref_atomtypes, c.atomtypes), "Failed parsing atomtypes"
    assert compare_lists(ref_charges, c.nuclear_charges), "Failed parsing nuclear_charges"

    # Test extended xyz -- assert on c2, the compound that was just parsed;
    # the earlier version re-checked ``c`` and never exercised this file.
    c2 = qml.Compound(xyz=test_dir + "/data/compound_test.exyz")
    assert compare_lists(ref_atomtypes, c2.atomtypes), "Failed parsing atomtypes"
    assert compare_lists(ref_charges, c2.nuclear_charges), "Failed parsing nuclear_charges"
def add_mol_to_training(self, new_system, ref_ratios, atom=None, xyz=None):
    """Add a molecule to the training set using local SLATM descriptors.

    Parameters
    ----------
    new_system
        System object; ``xyz[0]`` and ``elements`` are read.
    ref_ratios
        Per-atom reference values, indexed like ``new_system.elements``.
    atom : optional
        When given, only atoms whose element equals ``atom`` are added.
    xyz : optional
        Fallback geometry file used when ``new_system.xyz[0]`` is ``None``.
        (Previously the body referenced ``xyz`` without defining it, so this
        branch always failed with a NameError.)

    Returns
    -------
    int
        Number of atoms added to the training set.

    Raises
    ------
    ValueError
        If ``self.mbtypes`` is unset, no geometry is available, or the
        descriptor and target lists of an element get out of step.
    """
    if self.mbtypes is None:
        raise ValueError("Missing MBTypes")
    # Init the molecule in qml
    xyz_source = new_system.xyz[0]
    if xyz_source is None:
        if xyz is None:
            raise ValueError("Missing xyz file")
        xyz_source = xyz
    mol = qml.Compound(xyz_source)
    self.qml_mols.append(mol)
    # build slatm representation
    mol.generate_slatm(self.mbtypes, rcut=self.cutoff, local=True)
    natom = 0
    for i, ele in enumerate(new_system.elements):
        if atom is not None and ele != atom:
            continue
        natom += 1
        # reference pops/widths for element i
        hr = ref_ratios[i]
        self.target_train[ele].append(hr)
        self.descr_train[ele].append(mol.representation[i])
        # Descriptors and targets must stay aligned per element
        if len(self.descr_train[ele]) != len(self.target_train[ele]):
            print(len(self.descr_train[ele]))
            print(len(self.target_train[ele]))
            print(self.descr_train[ele])
            print(self.target_train[ele])
            print("Inconsistency in training data")
            raise ValueError("Inconsistency in training data")
    self.logger.info("Added file to training set: %s" % new_system)
    return natom
def test_arad():
    """Check ARAD local and atomic kernels for symmetry and NaN entries.

    The symmetric and general kernel routines must agree when both inputs
    are the same set, and no kernel element may be NaN.  The NaN checks now
    fail on *any* NaN; the previous ``np.all`` form only failed when every
    element was NaN, contradicting the assertion messages.
    """
    test_dir = os.path.dirname(os.path.realpath(__file__))

    # Parse file containing PBE0/def2-TZVP heats of formation and xyz filenames
    data = get_energies(test_dir + "/data/hof_qm7.txt")

    # Generate a list of qml.Compound() objects
    mols = []
    for xyz_file in sorted(data.keys())[:10]:
        mol = qml.Compound(xyz=test_dir + "/qm7/" + xyz_file)

        # Associate a property (heat of formation) with the object
        mol.properties = data[xyz_file]

        # ARAD representation built from coordinates and nuclear charges
        mol.representation = generate_arad_representation(
            mol.coordinates, mol.nuclear_charges)

        mols.append(mol)

    sigmas = [25.0]

    X1 = np.array([mol.representation for mol in mols])

    K_local_asymm = get_local_kernels_arad(X1, X1, sigmas)
    K_local_symm = get_local_symmetric_kernels_arad(X1, sigmas)

    assert np.allclose(K_local_symm,
                       K_local_asymm), "Symmetry error in local kernels"
    assert not np.any(np.isnan(
        K_local_asymm)), "ERROR: ARAD local symmetric kernel contains NaN"

    # Call with two different subsets; only checks that it runs
    K_local_asymm = get_local_kernels_arad(X1[-4:], X1[:6], sigmas)

    molid = 5
    X1 = generate_arad_representation(mols[molid].coordinates,
                                      mols[molid].nuclear_charges,
                                      size=mols[molid].natoms)
    XA = X1[:mols[molid].natoms]

    K_atomic_asymm = get_atomic_kernels_arad(XA, XA, sigmas)
    K_atomic_symm = get_atomic_symmetric_kernels_arad(XA, sigmas)

    assert np.allclose(K_atomic_symm,
                       K_atomic_asymm), "Symmetry error in atomic kernels"
    assert not np.any(np.isnan(
        K_atomic_asymm)), "ERROR: ARAD atomic symmetric kernel contains NaN"

    # Repeated call, result unused; kept from the original test
    K_atomic_asymm = get_atomic_kernels_arad(XA, XA, sigmas)
def test_krr_fchl_atomic():
    """Check FCHL atomic kernels against the local symmetric kernel.

    For every pair of molecules the atomic kernel is summed and compared
    with the corresponding element of the local symmetric kernel; on the
    diagonal the symmetric atomic kernel must match the general one.  NaN
    checks fail on *any* NaN (the previous ``np.all`` form only failed when
    every element was NaN).
    """
    test_dir = os.path.dirname(os.path.realpath(__file__))

    # Parse file containing PBE0/def2-TZVP heats of formation and xyz filenames
    data = get_energies(test_dir + "/data/hof_qm7.txt")

    # Generate a list of qml.Compound() objects
    mols = []
    for xyz_file in sorted(data.keys())[:10]:
        mol = qml.Compound(xyz=test_dir + "/qm7/" + xyz_file)

        # Associate a property (heat of formation) with the object
        mol.properties = data[xyz_file]

        # FCHL representation from coordinates and nuclear charges
        mol.representation = generate_representation(
            mol.coordinates, mol.nuclear_charges, cut_distance=1e6)
        mols.append(mol)

    X = np.array([mol.representation for mol in mols])

    # Set hyper-parameters
    sigma = 2.5

    K = get_local_symmetric_kernels(X, [sigma])[0]

    K_test = np.zeros((len(mols), len(mols)))

    for i, Xi in enumerate(X):
        for j, Xj in enumerate(X):
            # Only the first natoms rows are used for each molecule
            K_atomic = get_atomic_kernels(Xi[:mols[i].natoms],
                                          Xj[:mols[j].natoms], [sigma])[0]
            K_test[i, j] = np.sum(K_atomic)

            assert not np.any(
                np.isnan(K_atomic)), "FCHL atomic kernel contains NaN"

            if i == j:
                K_atomic_symmetric = get_atomic_symmetric_kernels(
                    Xi[:mols[i].natoms], [sigma])[0]
                assert np.allclose(
                    K_atomic, K_atomic_symmetric
                ), "Error in FCHL symmetric atomic kernels"
                assert not np.any(np.isnan(K_atomic_symmetric)
                                  ), "FCHL atomic symmetric kernel contains NaN"

    assert np.allclose(K, K_test), "Error in FCHL atomic kernels"
def test_arad_wrapper():
    """Check the ARAD kernel wrappers on 40 training and 10 test molecules.

    Both the symmetric (training/training) and the general (training/test)
    kernels must be strictly positive and free of NaN.  The NaN checks fail
    on *any* NaN (the previous ``np.all`` form only failed when every
    element was NaN), and the messages for the training/test kernel no
    longer call it "symmetric".
    """
    test_dir = os.path.dirname(os.path.realpath(__file__))

    # Parse file containing PBE0/def2-TZVP heats of formation and xyz filenames
    data = get_energies("%s/data/hof_qm7.txt" % test_dir)

    # Generate a list of qml.Compound() objects
    mols = []
    for xyz_file in sorted(data.keys())[:50]:
        mol = qml.Compound(xyz="%s/qm7/" % test_dir + xyz_file)

        # Associate a property (heat of formation) with the object
        mol.properties = data[xyz_file]

        # ARAD representation with size 23
        mol.generate_arad_representation(size=23)

        mols.append(mol)

    # Shuffle molecules
    np.random.seed(666)
    np.random.shuffle(mols)

    # Make training and test sets
    n_test = 10
    n_train = 40

    training = mols[:n_train]
    test = mols[-n_test:]

    sigmas = [10.0, 100.0]

    K1 = arad_local_symmetric_kernels(training, sigmas)
    assert np.all(K1 > 0.0), "ERROR: ARAD symmetric kernel negative"
    assert not np.any(
        np.isnan(K1)), "ERROR: ARAD symmetric kernel contains NaN"

    K2 = arad_local_kernels(training, test, sigmas)
    assert np.all(K2 > 0.0), "ERROR: ARAD kernel negative"
    assert not np.any(np.isnan(K2)), "ERROR: ARAD kernel contains NaN"
def run(self, commandstring):
    """Predict three properties for the geometry in a JSON command string.

    Returns ``"--"`` when the string contains ``"ERROR"``.  Otherwise the
    ``"neutralgeometry"`` entry is parsed as xyz lines, an FCHL-ACSF
    representation is built, and the kernel against the stored training
    representations is contracted with each of the three stored coefficient
    vectors.  The three predictions are returned comma-separated.
    """
    if "ERROR" in commandstring:
        return "--"

    payload = json.loads(commandstring)
    geometry_lines = payload["neutralgeometry"].split("\n")
    c = qml.Compound(xyz=MockXYZ(geometry_lines))

    rep = qml.representations.generate_fchl_acsf(
        c.nuclear_charges,
        c.coordinates,
        gradients=False,
        pad=31,
        elements=[1, 6, 8],
    )

    # Kernel between the training set and this single query molecule
    K = qml.kernels.get_local_kernel(self._reps, np.array([rep]), self._Qs,
                                     [c.nuclear_charges], 0.128)

    coefficient_sets = (self._alphas1, self._alphas2, self._alphas3)
    predictions = [np.dot(K, alphas)[0] for alphas in coefficient_sets]
    return ",".join(str(value) for value in predictions)
def get_descriptor_and_property(filenames, atype, cutoff):
    """Collect atomic Coulomb-matrix descriptors and matching properties.

    Parameters
    ----------
    filenames : iterable of files readable by ``qml.Compound``
    atype : ``"all"`` to use every atom, otherwise a key of
        ``qml.data.NUCLEAR_CHARGE`` selecting one element
    cutoff : value passed as ``central_cutoff`` to the descriptor

    Returns
    -------
    (X, Y) : numpy arrays of the descriptors and of the values returned by
        ``get_properties`` for every file, concatenated in input order
    """
    descriptors = []
    targets = []

    for filename in filenames:
        # Property to predict for this file
        file_targets = get_properties(filename, atype)

        compound = qml.Compound(filename)
        compound.generate_atomic_coulomb_matrix(central_cutoff=cutoff, size=30)

        if atype != "all":
            # Keep only the atoms of the requested element
            wanted = compound.nuclear_charges == qml.data.NUCLEAR_CHARGE[atype]
            file_descriptors = compound.representation[wanted]
        else:
            file_descriptors = compound.representation

        descriptors.extend(file_descriptors)
        targets.extend(file_targets)

    return np.asarray(descriptors), np.asarray(targets)
def test_representations():
    """Run the representation checks on QM7 molecules 0101 to 0110.

    The compounds are loaded from the ``qm7`` folder next to this file and
    handed to the Coulomb-matrix, atomic Coulomb-matrix, eigenvalue and
    bag-of-bonds checks.
    """
    path = os.path.dirname(os.path.realpath(__file__))

    mols = []
    for number in range(101, 111):
        xyz_file = "qm7/%04d.xyz" % number
        mols.append(qml.Compound(xyz=path + "/" + xyz_file))

    # One more than the largest atom count in the set
    size = max(mol.nuclear_charges.size for mol in mols) + 1
    asize = get_asize(mols, 1)

    coulomb_matrix(mols, size, path)
    atomic_coulomb_matrix(mols, size, path)
    eigenvalue_coulomb_matrix(mols, size, path)
    bob(mols, size, asize, path)
def __init__(self, connection):
    """Load training structures and regression coefficients from a store.

    ``connection.get`` is queried for the gzip-compressed ASCII entries
    ``qml-structures``, ``qml-alphas1``, ``qml-alphas2`` and ``qml-alphas3``.
    The structures entry is consumed in chunks of 33 lines, one chunk per
    coefficient in ``qml-alphas1``, and an FCHL-ACSF representation is
    generated for each chunk.
    """
    self.connection = connection

    def fetch_text(key):
        # Decompress one stored entry and decode it as ASCII text
        return gzip.decompress(self.connection.get(key)).decode("ascii")

    def fetch_floats(key):
        # One float per line, ignoring leading/trailing whitespace
        entries = fetch_text(key).strip().split("\n")
        return np.array([float(entry) for entry in entries])

    lines = fetch_text("qml-structures").split("\n")
    alphas1 = fetch_floats("qml-alphas1")
    alphas2 = fetch_floats("qml-alphas2")
    alphas3 = fetch_floats("qml-alphas3")

    reps = []
    Qs = []
    for geoidx in range(len(alphas1)):
        first, last = geoidx * 33, (geoidx + 1) * 33
        c = qml.Compound(xyz=MockXYZ(lines[first:last]))
        representation = qml.representations.generate_fchl_acsf(
            c.nuclear_charges,
            c.coordinates,
            gradients=False,
            pad=31,
            elements=[1, 6, 8],
        )
        reps.append(representation)
        Qs.append(c.nuclear_charges)

    self._reps = np.array(reps)
    self._Qs = np.array(Qs)
    self._alphas1 = alphas1
    self._alphas2 = alphas2
    self._alphas3 = alphas3
def test_engrad():
    """Compare the gradient from get_engrad with central finite differences.

    Every coordinate of ``water.xyz`` is displaced by +/- 1e-6 and the
    energy difference, divided by ``2 * dH / BOHR_TO_ANGS``, must match the
    corresponding gradient component.
    """
    dH = 1e-6

    comp = qml.Compound(xyz="water.xyz")
    charges = comp.nuclear_charges

    energy, grad = get_engrad(charges, comp.coordinates)

    grad_numm = np.zeros(grad.shape)
    for i, j in np.ndindex(len(charges), 3):
        # Forward displacement
        forward = deepcopy(comp.coordinates)
        forward[i, j] += dH
        e_plus = get_energy(charges, forward)

        # Backward displacement
        backward = deepcopy(comp.coordinates)
        backward[i, j] -= dH
        e_minus = get_energy(charges, backward)

        grad_numm[i, j] = (e_plus - e_minus) / (2 * dH / BOHR_TO_ANGS)

    assert np.allclose(grad, grad_numm)
def test_krr_fchl_global():
    """Kernel ridge regression with the global FCHL kernel on 100 molecules.

    Checks that the symmetric and general global kernels agree, that no
    kernel contains NaN, and that the test-set mean absolute error lies
    within 1.0 of 2.  NaN checks fail on *any* NaN (the previous ``np.all``
    form only failed when every element was NaN).
    """
    test_dir = os.path.dirname(os.path.realpath(__file__))

    # Parse file containing PBE0/def2-TZVP heats of formation and xyz filenames
    data = get_energies(test_dir + "/data/hof_qm7.txt")

    # Generate a list of qml.Compound() objects
    mols = []
    for xyz_file in sorted(data.keys())[:100]:
        mol = qml.Compound(xyz=test_dir + "/qm7/" + xyz_file)

        # Associate a property (heat of formation) with the object
        mol.properties = data[xyz_file]

        # FCHL representation from coordinates and nuclear charges
        mol.representation = generate_representation(
            mol.coordinates, mol.nuclear_charges, cut_distance=1e6)
        mols.append(mol)

    # Shuffle molecules
    np.random.seed(666)
    np.random.shuffle(mols)

    # Make training and test sets (one third held out)
    n_test = len(mols) // 3
    n_train = len(mols) - n_test

    training = mols[:n_train]
    test = mols[-n_test:]

    X = np.array([mol.representation for mol in training])
    Xs = np.array([mol.representation for mol in test])

    # List of properties
    Y = np.array([mol.properties for mol in training])
    Ys = np.array([mol.properties for mol in test])

    # Set hyper-parameters
    sigma = 100.0
    llambda = 1e-8

    K_symmetric = get_global_symmetric_kernels(X, [sigma])[0]
    K = get_global_kernels(X, X, [sigma])[0]

    assert np.allclose(K,
                       K_symmetric), "Error in FCHL symmetric global kernels"
    assert not np.any(
        np.isnan(K_symmetric)), "FCHL global symmetric kernel contains NaN"
    assert not np.any(np.isnan(K)), "FCHL global kernel contains NaN"

    # Solve alpha
    K[np.diag_indices_from(K)] += llambda
    alpha = cho_solve(K, Y)

    # Calculate prediction kernel
    Ks = get_global_kernels(Xs, X, [sigma])[0]
    assert not np.any(
        np.isnan(Ks)), "FCHL global testkernel contains NaN"

    Yss = np.dot(Ks, alpha)

    mae = np.mean(np.abs(Ys - Yss))
    assert abs(2 - mae) < 1.0, "Error in FCHL global kernel-ridge regression"
hof = float(tokens[0]) #hof is the to predicting value #dftb = float(tokens[2]) #print(i) if key == "dft": energies[xyz_name[i]] = hof #energies[xyz_name[i].split("/")[-1]] = hof #elif key=="delta": #energies[xyz_name] = hof - dftb else: energies[xyz_name[i]] = hof return energies qm7_dft_energy = get_energies("obabel_dG.txt", key="dft") #qm7_delta_energy = get_energies("hof_qm7.txt", key = "delta") compounds = [qml.Compound(xyz=path + f) for f in sorted(os.listdir(path))] for mol in compounds: mol.properties = qm7_dft_energy[mol.name] #mol.properties2 = qm7_delta_energy[mol.name] #with open('obabel.pkl', 'wb') as f: #pickle.dump(compounds, f) random.seed(666) random.shuffle(compounds) energy_pbe0 = np.array([mol.properties for mol in compounds]) #energy_delta = np.array([mol.properties2 for mol in compounds])
keys = sorted(data.keys()) # Shuffle molecules np.random.seed(666) np.random.shuffle(keys) n_test = 500 n_train = 1000 n_total = n_test+n_train for xyz_file in keys[:n_total]: mol = qml.Compound(xyz=test_dir + "/qm7/" + xyz_file) mol.properties = data[xyz_file] mol.representation = generate_input(mol.nuclear_charges, mol.coordinates) mols.append(mol) # Make training and test sets training = mols[:n_train] test = mols[-n_test:] # List of properties Y = np.array([mol.properties for mol in training]) Ys = np.array([mol.properties for mol in test])
import os
import jax_representation as jrep
import matplotlib.pyplot as plt
import qml
import itertools
import numpy as np

# path to xyz files
database = "../Databases/XYZ_diatom/"

# One entry per processed xyz file
distance_vector = []
OM_overlap_vector = []
CM_overlap_vector = []

for xyzfile in os.listdir(database):
    # Skip anything in the folder that is not an xyz file
    if xyzfile.endswith(".xyz"):
        xyz_fullpath = database + xyzfile
        compound = qml.Compound(xyz_fullpath)

        # distance is given in 'name...i.xyz', retrieve i here
        # (characters 10..12 of the file name)
        distance_vector.append( xyzfile[10:13] )
        print('file:', xyzfile, 'distance:', xyzfile[10:13])

        # Nuclear charges as floats, coordinates, and atom count as a float
        Z = compound.nuclear_charges.astype(float)
        R = compound.coordinates
        N = float(len(Z))

        # Calculate Overlap matrix and determine dimensionality dim
        # NOTE(review): `order` from the OM call is overwritten by the CM
        # call before `dim` is taken -- confirm both orders have equal length.
        OM, order = jrep.OM_full_sorted(Z, R, N)
        CM, order = jrep.CM_full_sorted(Z, R, N)
        dim = len(order)

        # loop over OM and add all off-diagonal elements
        OM_overlap = 0
return energies if __name__ == "__main__": # Parse file containing PBE0/def2-TZVP heats of formation and xyz filenames data = get_energies("hof_qm7.txt") # Generate a list of qml.Compound() objects mols = [] for xyz_file in sorted(data.keys()): # Initialize the qml.Compound() objects mol = qml.Compound(xyz="qm7/" + xyz_file) # Associate a property (heat of formation) with the object mol.properties = data[xyz_file] # This is a Molecular Coulomb matrix sorted by row norm mol.generate_coulomb_matrix(size=23, sorting="row-norm") mols.append(mol) # Shuffle molecules np.random.seed(666) np.random.shuffle(mols) # Make training and test sets n_test = 1000
def train():
    """Fit a combined energy/force kernel model and save it to disk.

    Steps:

    1. Read the records returned by ``get_properties("train")`` and load
       ``xyz/<name>.xyz`` for every key; entry 0 of a record is stored as
       the molecule's property (energy) and entry 1 is used as its forces.
    2. Generate FCHL-ACSF representations with gradients.
    3. Build the atomic local kernel and its gradient kernel, stack them,
       and solve for the regression coefficients with ``svd_solve``.
    4. Save representations, coefficients and nuclear charges as ``.npy``
       files in the working directory.
    5. Print timing and training-set statistics.

    The value printed as ``r^2`` is now the squared correlation
    coefficient; previously the unsquared ``r_value`` from
    ``scipy.stats.linregress`` was printed under that label.
    """
    data = get_properties("train")

    SIGMA = 2.5  # kernel width handed to both kernel routines

    mols = []
    for name in sorted(data.keys()):
        mol = qml.Compound()
        mol.read_xyz("xyz/" + name + ".xyz")

        # Associate a property (first entry of the record) with the object
        mol.properties = data[name][0]
        mols.append(mol)

    shuffle(mols)

    # REPRESENTATIONS
    print("\n -> calculate representations")
    start = time()
    x = []
    disp_x = []
    f = []
    e = []
    q = []

    for mol in mols:
        (x1, dx1) = generate_fchl_acsf(mol.nuclear_charges,
                                       mol.coordinates,
                                       gradients=True,
                                       pad=23,
                                       elements=[1, 6, 7, 8, 16, 17])

        e.append(mol.properties)
        # Drops 4 leading and 4 trailing characters of mol.name to get the
        # data key -- presumably the "xyz/" prefix and ".xyz" suffix; confirm.
        f.append(data[(mol.name)[4:-4]][1])
        x.append(x1)
        disp_x.append(dx1)
        q.append(mol.nuclear_charges)

    X_train = np.array(x)
    F_train = np.array(f)
    # Sign flip of the stored force-like quantity before fitting
    F_train *= -1
    E_train = np.array(e)
    dX_train = np.array(disp_x)
    Q_train = q

    # Centre the energies on their mean
    E_mean = np.mean(E_train)
    E_train -= E_mean

    F_train = np.concatenate(F_train)

    end = time()

    print(end - start)
    print("")
    print(" -> calculating Kernels")

    start = time()
    Kte = get_atomic_local_kernel(X_train, X_train, Q_train, Q_train, SIGMA)
    Kt = get_atomic_local_gradient_kernel(X_train, X_train, dX_train, Q_train,
                                          Q_train, SIGMA)

    # Energy rows first, then force rows; targets are stacked the same way
    C = np.concatenate((Kte, Kt))
    Y = np.concatenate((E_train, F_train.flatten()))

    end = time()
    print(end - start)
    print("")

    print("Alphas operator ...")
    start = time()
    alpha = svd_solve(C, Y, rcond=1e-12)
    end = time()
    print(end - start)
    print("")

    print("save X")
    np.save('X_active_learning.npy', X_train)

    print("save alphas")
    np.save('alphas_active_learning.npy', alpha)

    print("save Q")
    np.save('Q_active_learning.npy', Q_train)

    # Training-set predictions
    eYt = np.dot(Kte, alpha)
    fYt = np.dot(Kt, alpha)

    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        E_train, eYt)
    print("TRAINING ENERGY MAE = %10.4f slope = %10.4f intercept = %10.4f r^2 = %9.6f" % \
            (np.mean(np.abs(E_train - eYt)), slope, intercept, r_value ** 2))

    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        F_train.flatten(), fYt.flatten())
    print("TRAINING FORCE MAE = %10.4f slope = %10.4f intercept = %10.4f r^2 = %9.6f" % \
             (np.mean(np.abs(F_train.flatten() - fYt.flatten())), slope, intercept, r_value ** 2))
import jax_additional_derivative as jader #what to do? do_fingerprint_distance = False do_derivative_calculation = False do_plot_derivatives = True #path to xyz files database = "/home/linux-miriam/Databases/BOB/" '''define folder of .xyz files''' names = [database + "BOB1.xyz", database + "BOB2.xyz"] #which representations? namelist = ["CM", "EVCM", "BOB", "OM", "EVOM"] compound1 = qml.Compound(names[0]) compound2 = qml.Compound(names[1]) if do_fingerprint_distance: Z1 = compound1.nuclear_charges.astype(float) R1 = compound1.coordinates Z2 = compound2.nuclear_charges.astype(float) R2 = compound2.coordinates #calculate difference to reference constitution M_CM1 = jrep.CM_full_unsorted_matrix(Z1, R1, size=4) M_CM2 = jrep.CM_full_unsorted_matrix(Z2, R2, size=4) M_EVCM1 = jrep.CM_ev_unsrt(Z1, R1, N=0, size=4)
#!/usr/bin/env python
from __future__ import print_function

import qml

if __name__ == "__main__":

    # Load qm7/0001.xyz (which happens to be methane) into a compound object
    mol = qml.Compound(xyz="qm7/0001.xyz")

    # Coulomb matrix for a compound with 5 atoms, sorted by row norm
    mol.generate_coulomb_matrix(size=5, sorting="row-norm")
    print(mol.representation)

    # BoB bags for a compound containing C and H
    mol.generate_bob(size=5, asize={"C": 2, "H": 5})
    print(mol.representation)

    # Remaining properties stored on the object, printed one per line
    for attribute in ("coordinates", "atomtypes", "nuclear_charges", "name",
                      "unit_cell"):
        print(getattr(mol, attribute))
def test_krr_fchl_local():
    """Kernel ridge regression with the local FCHL kernel on 100 molecules.

    All supported kernel keyword arguments are passed explicitly.  Checks
    that the symmetric and general local kernels agree, that no kernel
    contains NaN, and that the test-set mean absolute error lies within 1.0
    of 2.  NaN checks fail on *any* NaN (the previous ``np.all`` form only
    failed when every element was NaN).
    """
    # Test that all kernel arguments work
    kernel_args = {
        "cut_distance": 1e6,
        "cut_start": 0.5,
        "two_body_width": 0.1,
        "two_body_scaling": 2.0,
        "two_body_power": 6.0,
        "three_body_width": 3.0,
        "three_body_scaling": 2.0,
        "three_body_power": 3.0,
        "alchemy": "periodic-table",
        "alchemy_period_width": 1.0,
        "alchemy_group_width": 1.0,
        "fourier_order": 2,
    }

    test_dir = os.path.dirname(os.path.realpath(__file__))

    # Parse file containing PBE0/def2-TZVP heats of formation and xyz filenames
    data = get_energies(test_dir + "/data/hof_qm7.txt")

    # Generate a list of qml.Compound() objects
    mols = []
    for xyz_file in sorted(data.keys())[:100]:
        mol = qml.Compound(xyz=test_dir + "/qm7/" + xyz_file)

        # Associate a property (heat of formation) with the object
        mol.properties = data[xyz_file]

        # FCHL representation stored on the compound
        mol.generate_fchl_representation(cut_distance=1e6)
        mols.append(mol)

    # Shuffle molecules
    np.random.seed(666)
    np.random.shuffle(mols)

    # Make training and test sets (one third held out)
    n_test = len(mols) // 3
    n_train = len(mols) - n_test

    training = mols[:n_train]
    test = mols[-n_test:]

    X = np.array([mol.representation for mol in training])
    Xs = np.array([mol.representation for mol in test])

    # List of properties
    Y = np.array([mol.properties for mol in training])
    Ys = np.array([mol.properties for mol in test])

    # Set hyper-parameters
    sigma = 2.5
    llambda = 1e-8

    K_symmetric = get_local_symmetric_kernels(X, [sigma], **kernel_args)[0]
    K = get_local_kernels(X, X, [sigma], **kernel_args)[0]

    assert np.allclose(K, K_symmetric), "Error in FCHL symmetric local kernels"
    assert not np.any(
        np.isnan(K_symmetric)), "FCHL local symmetric kernel contains NaN"
    assert not np.any(np.isnan(K)), "FCHL local kernel contains NaN"

    # Solve alpha
    K[np.diag_indices_from(K)] += llambda
    alpha = cho_solve(K, Y)

    # Calculate prediction kernel
    Ks = get_local_kernels(Xs, X, [sigma], **kernel_args)[0]
    assert not np.any(
        np.isnan(Ks)), "FCHL local testkernel contains NaN"

    Yss = np.dot(Ks, alpha)

    mae = np.mean(np.abs(Ys - Yss))
    assert abs(2 - mae) < 1.0, "Error in FCHL local kernel-ridge regression"
'''we start with a straight molecule stretching out on the x axis: H--C===C--H to then move both H simultaneously anti-clockwise by an angle phi: H / phi C===C....... / H ''' #calculate reference values compound = qml.Compound(reference) Z = compound.nuclear_charges.astype(float) R = compound.coordinates ref_M_EVCM = jrep.CM_ev_unsrt(Z, R, size = 4) dZ_eigenvalues = [] #list of eigenvalue vectors. length is same as len of name_vector dimZ_list = [] #dimension of files may vary. store all dimensions here dZ_slot1_list = [[],[],[],[]] dZ_slot2_list = [[],[],[],[]] dZ_slot3_list = [[],[],[],[]] dZ_slot4_list = [[],[],[],[]]
return energies """ Generating dict with binding energies and filename. """ if __name__ == "__main__": print("\n -> load binding energies") data = get_energies("data/trainUrt.txt") data2 = get_energies("data/testUrt.txt") mols = [] mols_test = [] for xyz_file in tqdm(sorted(data.keys())): mol = qml.Compound() mol.read_xyz("data/QM9Train/" + xyz_file) mol.properties = data[xyz_file] mols.append(mol) for xyz_file in tqdm(sorted(data2.keys())): mol = qml.Compound() mol.read_xyz("data/QM9Test/" + xyz_file) mol.properties = data2[xyz_file] mols_test.append(mol) mbtypes = get_slatm_mbtypes( [mol.nuclear_charges for mol in mols + mols_test]) print("\n -> generate representation") for mol in tqdm(mols): mol.generate_slatm(mbtypes, local=False)
def test_krr_gaussian_local_cmat():
    """Kernel ridge regression with the local Gaussian kernel on atomic CMs.

    Checks the kernel for symmetry, against stored reference kernels and
    against the wrapper ``get_atomic_kernels_gaussian``, then requires the
    test-set mean absolute error to lie within 1.0 of 19.0.

    The comparison of the prediction kernel with its reference now uses the
    absolute deviation; the previous signed difference silently ignored
    rows where the computed kernel was smaller than the reference.  Its
    failure message also said "Laplacian" in this Gaussian test.
    """
    test_dir = os.path.dirname(os.path.realpath(__file__))

    # Parse file containing PBE0/def2-TZVP heats of formation and xyz filenames
    data = get_energies(test_dir + "/data/hof_qm7.txt")

    # Generate a list of qml.Compound() objects
    mols = []
    for xyz_file in sorted(data.keys())[:1000]:
        mol = qml.Compound(xyz=test_dir + "/qm7/" + xyz_file)

        # Associate a property (heat of formation) with the object
        mol.properties = data[xyz_file]

        # Atomic Coulomb matrices sorted by row norm
        mol.generate_atomic_coulomb_matrix(size=23, sorting="row-norm")

        mols.append(mol)

    # Shuffle molecules
    np.random.seed(666)
    np.random.shuffle(mols)

    # Make training and test sets
    n_test = 100
    n_train = 200

    training = mols[:n_train]
    test = mols[-n_test:]

    # Atomic representations of all molecules stacked along the first axis;
    # N / Ns hold the atom count of each molecule
    X = np.concatenate([mol.representation for mol in training])
    Xs = np.concatenate([mol.representation for mol in test])

    N = np.array([mol.natoms for mol in training])
    Ns = np.array([mol.natoms for mol in test])

    # List of properties
    Y = np.array([mol.properties for mol in training])
    Ys = np.array([mol.properties for mol in test])

    # Set hyper-parameters
    sigma = 724.0
    llambda = 10**(-6.5)

    K = get_local_kernels_gaussian(X, X, N, N, [sigma])[0]
    assert np.allclose(K, K.T), "Error in local Gaussian kernel symmetry"

    K_test = np.loadtxt(test_dir + "/data/K_local_gaussian.txt")
    assert np.allclose(
        K, K_test), "Error in local Gaussian kernel (vs. reference)"

    K_test = get_atomic_kernels_gaussian(training, training, [sigma])[0]
    assert np.allclose(K,
                       K_test), "Error in local Gaussian kernel (vs. wrapper)"

    # Solve alpha
    K[np.diag_indices_from(K)] += llambda
    alpha = cho_solve(K, Y)

    # Calculate prediction kernel
    Ks = get_local_kernels_gaussian(Xs, X, Ns, N, [sigma])[0]

    Ks_test = np.loadtxt(test_dir + "/data/Ks_local_gaussian.txt")
    # Sometimes a few coulomb matrices differ because of parallel sorting and
    # numerical error. Allow up to 5 molecules (rows) to differ from the
    # supplied reference.
    differing_rows = set(np.where(np.abs(Ks - Ks_test) > 1e-7)[0])
    differences_count = len(differing_rows)
    assert differences_count < 5, "Error in local Gaussian kernel (vs. reference)"

    Ks_test = get_atomic_kernels_gaussian(test, training, [sigma])[0]
    assert np.allclose(Ks,
                       Ks_test), "Error in local Gaussian kernel (vs. wrapper)"

    Yss = np.dot(Ks, alpha)

    mae = np.mean(np.abs(Ys - Yss))
    assert abs(19.0 -
               mae) < 1.0, "Error in local Gaussian kernel-ridge regression"