def test_distribution(self):
    """Tests if the random sorting obeys a gaussian distribution.

    Can rarely fail when everything is OK, because the observed frequency
    is itself a random quantity.
    """
    # Get the mean values to compare to: the norms of the first two rows of
    # the noise-free matrix.
    sigma = 5
    desc = CoulombMatrix(n_atoms_max=5, permutation="sorted_l2", flatten=False)
    cm = desc.create(HHe)
    means = np.linalg.norm(cm, axis=1)
    mu2 = means[0]
    mu1 = means[1]

    # Measures how many times the two rows with biggest norm exchange place
    # when random noise is added. This should correspond to the probability
    # P(X > Y), where X = N(\mu_1, \sigma^2), Y = N(\mu_2, \sigma^2). This
    # probability can be reduced to P(X > Y) = P(X-Y > 0) = P(N(\mu_1 -
    # \mu_2, \sigma^2 + sigma^2) > 0). See e.g.
    # https://en.wikipedia.org/wiki/Sum_of_normally_distributed_random_variables
    desc = CoulombMatrix(n_atoms_max=5, permutation="random", sigma=sigma, flatten=False)
    count = 0
    rand_instances = 20000
    for _ in range(rand_instances):
        cm = desc.create(HHe)
        if np.linalg.norm(cm[0]) < np.linalg.norm(cm[1]):
            count += 1

    # The expected probability is calculated from the cumulative
    # distribution function.
    expected = 1 - scipy.stats.norm.cdf(0, mu1 - mu2, np.sqrt(sigma**2 + sigma**2))
    observed = count / rand_instances
    self.assertTrue(abs(expected - observed) <= 1e-2)
def test_exceptions(self):
    """Tests different invalid parameters that should raise an exception."""
    # Unknown permutation option
    with self.assertRaises(ValueError):
        CoulombMatrix(n_atoms_max=5, permutation="unknown")

    # Negative n_atoms_max
    with self.assertRaises(ValueError):
        CoulombMatrix(n_atoms_max=-1)

    # The descriptor is built outside the assertRaises block so that only
    # create() can satisfy the expectation; a ValueError raised by the
    # constructor would otherwise make this check pass for the wrong reason.
    # Presumably create() fails because a system exceeds n_atoms_max=2.
    cm = CoulombMatrix(n_atoms_max=2)
    with self.assertRaises(ValueError):
        cm.create([HHe, H2O])
def test_flatten(self):
    """Checks the output shape with and without flattening."""
    # (flatten option, expected output shape)
    expected_shapes = ((False, (5, 5)), (True, (25,)))
    for flatten, shape in expected_shapes:
        descriptor = CoulombMatrix(
            n_atoms_max=5, permutation="sorted_l2", flatten=flatten
        )
        output = descriptor.create(H2O)
        self.assertEqual(output.shape, shape)
def test_flatten(self):
    """Checks that the eigenspectrum output has shape (5,) for both flatten settings."""
    for flatten in (False, True):
        descriptor = CoulombMatrix(
            n_atoms_max=5, permutation="eigenspectrum", flatten=flatten
        )
        spectrum = descriptor.create(H2O)
        self.assertEqual(spectrum.shape, (5,))
def test_sparse(self):
    """Checks the container type returned for dense and sparse output."""
    # (constructor options, exact type expected from create())
    cases = (
        ({"flatten": False, "sparse": False}, np.ndarray),
        ({"flatten": True, "sparse": True}, scipy.sparse.coo_matrix),
    )
    for options, expected_type in cases:
        descriptor = CoulombMatrix(n_atoms_max=5, permutation="none", **options)
        result = descriptor.create(H2O)
        self.assertTrue(type(result) == expected_type)
def test_norm_vector(self):
    """Checks that the attribute _norm_vector is written and reused correctly."""
    descriptor = CoulombMatrix(
        n_atoms_max=5, permutation="random", sigma=100, flatten=False
    )

    first_output = descriptor.create(H2O)
    self.assertEqual(len(first_output), 5)

    # The norm vector is not zero padded in this implementation. All
    # zero-padding is done at the end, after the random sorting.
    self.assertEqual(len(descriptor._norm_vector), 3)

    # A second call on the same descriptor must still give a padded output.
    second_output = descriptor.create(H2O)
    self.assertEqual(len(second_output), 5)
def test_match_with_sorted(self):
    """Checks that sorting a random Coulomb matrix reproduces the sorted
    Coulomb matrix.
    """
    random_desc = CoulombMatrix(
        n_atoms_max=5, permutation="random", sigma=100, flatten=False
    )
    sorted_from_random = random_desc.sort(random_desc.create(H2O))

    sorted_desc = CoulombMatrix(n_atoms_max=5, permutation="sorted_l2", flatten=False)
    reference = sorted_desc.create(H2O)

    self.assertTrue(np.array_equal(reference, sorted_from_random))
def test_sparse(self):
    """Checks the container type of dense and sparse random-permutation output."""
    # (constructor options, exact type expected from create())
    cases = (
        ({"flatten": False, "sparse": False}, np.ndarray),
        ({"flatten": True, "sparse": True}, sparse.COO),
    )
    for options, expected_type in cases:
        descriptor = CoulombMatrix(
            n_atoms_max=5, permutation="random", sigma=100, **options
        )
        result = descriptor.create(H2O)
        self.assertTrue(type(result) == expected_type)
def test_features(self):
    """Compares the unpermuted Coulomb matrix of H2O with hand-computed values."""
    desc = CoulombMatrix(n_atoms_max=5, permutation="none", flatten=False)
    cm = desc.create(H2O)

    # Reference values: 0.5 * Z**2.4 on the diagonal and Z_i * Z_j / |R_i - R_j|
    # off the diagonal, written into the top-left 3x3 corner of a zero matrix.
    charges = H2O.get_atomic_numbers()
    coords = H2O.get_positions()
    expected = np.zeros((5, 5))
    for i in range(3):
        for j in range(3):
            if i == j:
                expected[i, j] = 0.5 * charges[i]**2.4
            else:
                expected[i, j] = charges[i] * charges[j] / (
                    np.linalg.norm(coords[i] - coords[j])
                )

    self.assertTrue(np.array_equal(cm, expected))
def test_parallel_dense(self):
    """Checks that dense output created for several systems, serially or in
    parallel, matches the output created one system at a time.
    """
    samples = [molecule("CO"), molecule("N2O")]
    desc = CoulombMatrix(n_atoms_max=5, permutation="none", flatten=True, sparse=False)
    n_features = desc.get_number_of_features()

    # Number of jobs determined from the amount of CPUs
    for physical_only in (False, True):
        desc.create(system=samples, n_jobs=-1, only_physical_cores=physical_only)

    # Multiple systems: serial job first, then parallel job
    for n_jobs in (1, 2):
        output = desc.create(system=samples, n_jobs=n_jobs)
        assumed = np.empty((2, n_features))
        for row, sample in enumerate(samples):
            assumed[row, :] = desc.create(sample)
        self.assertTrue(np.allclose(output, assumed))

    # Non-flattened output
    desc = CoulombMatrix(n_atoms_max=5, permutation="none", flatten=False, sparse=False)
    output = desc.create(system=samples, n_jobs=2)
    assumed = np.empty((2, 5, 5))
    for row, sample in enumerate(samples):
        assumed[row] = desc.create(sample)
    self.assertTrue(np.allclose(np.array(output), assumed))
def test_flatten(self):
    """Checks the output shape of the random permutation with and without
    flattening.
    """
    # (flatten option, expected output shape)
    expected_shapes = ((False, (5, 5)), (True, (1, 25)))
    for flatten, shape in expected_shapes:
        descriptor = CoulombMatrix(
            n_atoms_max=5, permutation="random", sigma=100, flatten=flatten
        )
        output = descriptor.create(H2O)
        self.assertEqual(output.shape, shape)
def test_features(self):
    """Checks that the row norms of the sorted_l2 output never increase."""
    desc = CoulombMatrix(n_atoms_max=5, permutation="sorted_l2", flatten=False)
    cm = desc.create(H2O)

    row_norms = np.linalg.norm(cm, axis=1)
    # Compare every row norm with the one directly before it.
    for previous, current in zip(row_norms[:-1], row_norms[1:]):
        self.assertTrue(current <= previous)
class Global_Descriptor_CM(Global_Descriptor_Base):
    """Global Coulomb Matrix descriptor that wraps DScribe's ``CoulombMatrix``."""

    def __init__(self, desc_spec):
        """
        make a DScribe CM object

        Parameters
        ----------
        desc_spec: dictionary-like specification. Must contain
            'type' equal to "CM" and 'max_atoms'; an optional 'periodic'
            entry must not be True.

        Raises
        ------
        ValueError
            If the type is missing or not "CM", if 'max_atoms' is missing,
            or if 'periodic' is True.
        """

        from dscribe.descriptors import CoulombMatrix

        if "type" not in desc_spec.keys() or desc_spec["type"] != "CM":
            raise ValueError(
                "Type is not CM or cannot find the type of the descriptor")

        # required
        try:
            self.max_atoms = desc_spec['max_atoms']
        except KeyError:
            # Only a missing key is translated; any other failure propagates.
            raise ValueError(
                "Not enough information to initialize the `Global_Descriptor_CM` object"
            )

        if 'periodic' in desc_spec.keys() and desc_spec['periodic'] == True:
            raise ValueError(
                "Coulomb Matrix cannot be used for periodic systems")

        self.cm = CoulombMatrix(self.max_atoms)
        print("Using CoulombMatrix ...")

        # make an acronym
        self.acronym = "CM" + "-" + str(self.max_atoms)

    def create(self, frame):
        """
        compute the CM descriptor vector for a frame

        Parameters
        ----------
        frame: ASE atom object. Coordinates of a frame.

        Returns
        -------
        desc_dict: a dictionary with the keys 'acronym' and 'descriptors',
                   the latter holding the output of the DScribe descriptor
                   for this frame.
        atomic_desc_dict : {}

        Raises
        ------
        ValueError
            If the frame has more atoms than `max_atoms`.
        """

        if len(frame.get_positions()) > self.max_atoms:
            raise ValueError(
                'the size of the system is larger than the max_atoms of the CM descriptor'
            )
        # notice that we return an empty dictionary for "atomic descriptors"
        return {
            'acronym': self.acronym,
            'descriptors': self.cm.create(frame, n_jobs=1)
        }, {}
def test_features(self):
    """Checks ordering and zero-padding of the eigenspectrum output."""
    desc = CoulombMatrix(n_atoms_max=5, permutation="eigenspectrum")
    spectrum = desc.create(H2O)
    self.assertEqual(spectrum.shape, (5,))

    n_atoms = len(H2O)

    # Eigenvalues must be in decreasing order of absolute value
    previous = float("Inf")
    for value in spectrum[:n_atoms]:
        self.assertTrue(abs(value) <= abs(previous))
        previous = value

    # The remaining entries must be the zero padding
    self.assertTrue(np.array_equal(spectrum[n_atoms:], [0, 0]))
def test_periodicity(self):
    """Checks that the Coulomb matrix ignores periodicity even when the
    system is flagged as periodic.
    """
    system = Atoms(
        symbols=["H", "H"],
        scaled_positions=[
            [0.1, 0, 0],
            [0.9, 0, 0],
        ],
        cell=[5, 5, 5],
        pbc=True,
    )
    desc = CoulombMatrix(n_atoms_max=5, permutation="none", flatten=False)
    cm = desc.create(system)

    # Expected off-diagonal element uses the plain, in-cell distance.
    pos = system.get_positions()
    separation = np.linalg.norm((pos[0] - pos[1]))
    assumed = 1 * 1 / separation
    self.assertEqual(cm[0, 1], assumed)
def ML_potential(config, data):
    """Predict the energy of an O2H5 configuration with the fitted model.

    The model is read from ``data['metadata'][3]['best_model_fitted']`` and
    the descriptor kind from ``data['metadata'][1]['descriptor_type']``.
    Returns ``model.predict(X)[0][0]`` for 'Coulomb_matrix' and 'PIV';
    returns None for any other descriptor type.
    """
    metadata = data['metadata']
    model = metadata[3]['best_model_fitted']
    descriptor_type = metadata[1]['descriptor_type']

    # Pick the callable that turns an Atoms object into model input.
    if descriptor_type == 'Coulomb_matrix':
        featurize = CoulombMatrix(
            n_atoms_max=7, flatten=True, permutation='sorted_l2').create
    elif descriptor_type == 'PIV':
        featurize = metadata[1]['descriptor']
    else:
        return None

    atoms = Atoms('O2H5', positions=config)
    return model.predict(featurize(atoms))[0][0]
def setupDescs(structs, indexs, level, descname, chemsyms_uniques, n_atoms,
               steve, v):
    """
    Setup descriptor and run it for ASE structures.

    Parameters
    ----------
    structs: structures passed to the descriptor's ``create``; ``.shape[0]``
        is read for the verbose message.
    indexs: index used for the returned DataFrame.
    level: label that is only used in the verbose message.
    descname: "CM", "MBTR" or "SOAP".
    chemsyms_uniques: species passed to MBTR and SOAP.
    n_atoms: ``n_atoms_max`` for the Coulomb matrix.
    steve: number of parallel jobs (``n_jobs``).
    v: print a summary when truthy.

    Returns
    -------
    (descs_df, n_feat): DataFrame with the given structures as descriptors
        and the number of features of the descriptor.

    Raises
    ------
    ValueError
        If `descname` is not one of the supported descriptors.
    """
    # choose the descriptor
    # (mk1/mk2/mk3 and srcut/snmax/slmax are not defined in this function and
    # are expected to exist at module level)
    if descname == "CM":
        desc = CoulombMatrix(n_atoms_max=n_atoms,
                             flatten=True)  # permutation = 'sorted_l2' is default
    elif descname == "MBTR":
        desc = MBTR(species=chemsyms_uniques,
                    k1=mk1,
                    k2=mk2,
                    k3=mk3,
                    periodic=False,
                    normalization="l2_each",
                    flatten=True)
    elif descname == "SOAP":
        desc = SOAP(species=chemsyms_uniques,
                    periodic=False,
                    rcut=srcut,
                    nmax=snmax,
                    lmax=slmax,
                    average=True)  # Averaging for global
    else:
        # Previously an unknown name fell through and failed later with an
        # unbound local variable.
        raise ValueError(
            "Unknown descriptor name: {!r} (expected 'CM', 'MBTR' or 'SOAP')".
            format(descname))
    n_feat = desc.get_number_of_features()

    # Create descriptors
    descs = desc.create(structs, n_jobs=steve)  # Parallel

    # Create a DF of returned `list` of `arrays` of descs
    descs_df = pd.DataFrame(descs, index=indexs)

    if v:
        print(("🔘 Created {}-descriptors for all {} {}-structures. "
               "Number of features in {}: {}").format(descname,
                                                      structs.shape[0], level,
                                                      descname, n_feat))

    return descs_df, n_feat
def test_parallel_sparse(self):
    """Checks that sparse output created for several systems, serially or in
    parallel, matches the output created one system at a time.
    """
    samples = [molecule("CO"), molecule("N2O")]
    desc = CoulombMatrix(n_atoms_max=5, permutation="none", flatten=True, sparse=True)
    n_features = desc.get_number_of_features()

    # Multiple systems: serial job first, then parallel job
    for n_jobs in (1, 2):
        output = desc.create(system=samples, n_jobs=n_jobs).toarray()
        assumed = np.empty((2, n_features))
        for row, sample in enumerate(samples):
            assumed[row, :] = desc.create(sample).toarray()
        self.assertTrue(np.allclose(output, assumed))

    # Non-flattened output: one sparse matrix per system
    desc = CoulombMatrix(n_atoms_max=5, permutation="none", flatten=False, sparse=True)
    per_system = desc.create(system=samples, n_jobs=2)
    output = [matrix.toarray() for matrix in per_system]
    assumed = np.empty((2, 5, 5))
    for row, sample in enumerate(samples):
        assumed[row] = desc.create(sample).toarray()
    self.assertTrue(np.allclose(np.array(output), assumed))
# Example script: Coulomb matrix output for single and multiple molecules.

# NOTE(review): the four values below are not used anywhere in this snippet;
# they look like leftovers from a SOAP example — confirm before removing.
atomic_numbers = [1, 8]
rcut = 6.0
nmax = 8
lmax = 6

# Setting up the CM descriptor
cm = CoulombMatrix(n_atoms_max=6, )

# Creation
from ase.build import molecule

# Molecule created as an ASE.Atoms
methanol = molecule("CH3OH")

# Create CM output for the system
cm_methanol = cm.create(methanol)

print(cm_methanol)
print("flattened", cm_methanol.shape)

# Create output for multiple system
samples = [molecule("H2O"), molecule("NO2"), molecule("CO2")]
coulomb_matrices = cm.create(samples)  # Serial
coulomb_matrices = cm.create(samples, n_jobs=2)  # Parallel

# No flattening
cm = CoulombMatrix(n_atoms_max=6, flatten=False)
cm_methanol = cm.create(methanol)

print(cm_methanol)
print("not flattened", cm_methanol.shape)
def create(system):
    """Return the flattened, unpermuted Coulomb matrix (n_atoms_max=3) of `system`."""
    descriptor = CoulombMatrix(flatten=True, permutation="none", n_atoms_max=3)
    output = descriptor.create(system)
    return output
def main(fxyz, dictxyz, prefix, output, max_atoms, stride):
    """
    Generate the Coulomb Matrix descriptors.

    Parameters
    ----------
    fxyz: string giving location of xyz file ('none' to skip)
    dictxyz: string giving location of xyz file that is used as a dictionary
        ('none' to skip)
    prefix: string giving the filename prefix
    output: 'xyz': append the representations to extended xyz file;
        'matrix': output as a standalone matrix
    max_atoms: int: Max number of atoms in the Coulomb Matrix
    stride: compute descriptor each X frames

    Raises
    ------
    ValueError
        If `output` is neither 'matrix' nor 'xyz'.
    """
    # Reject an unknown format before anything is read, computed or renamed.
    if output not in ('matrix', 'xyz'):
        raise ValueError('Cannot find the output format')

    fframes = []
    dictframes = []

    # read frames
    if fxyz != 'none':
        fframes = read(fxyz, slice(0, None, stride))
        nfframes = len(fframes)
        print("read xyz file:", fxyz, ", a total of", nfframes, "frames")
    # read frames in the dictionary
    if dictxyz != 'none':
        dictframes = read(dictxyz, ':')
        ndictframes = len(dictframes)
        print("read xyz file used for a dictionary:", dictxyz, ", a total of",
              ndictframes, "frames")

    frames = dictframes + fframes
    nframes = len(frames)

    # Collect the elements present and switch periodicity off on every frame.
    global_species = []
    for frame in frames:
        global_species.extend(frame.get_atomic_numbers())
        frame.set_pbc([False, False, False])
    global_species = np.unique(global_species)
    print("a total of", nframes, "frames, with elements: ", global_species)

    rep_atomic = CoulombMatrix(max_atoms)

    foutput = prefix + "-max_atoms" + str(max_atoms)
    desc_name = "CM" + "-max_atoms" + str(max_atoms)

    # prepare for the output: move existing result files out of the way
    if os.path.isfile(foutput + ".xyz"):
        os.rename(foutput + ".xyz", "bck." + foutput + ".xyz")
    if os.path.isfile(foutput + ".desc"):
        os.rename(foutput + ".desc", "bck." + foutput + ".desc")

    for frame in frames:
        fnow = rep_atomic.create(frame, n_jobs=8)
        frame.info[desc_name] = fnow

        # save
        if output == 'matrix':
            # One row per frame is appended. The earlier extra write to an
            # undefined `fatomic` handle raised NameError and was dropped.
            with open(foutput + ".desc", "ab") as f:
                np.savetxt(f, frame.info[desc_name][None])
        else:
            # output == 'xyz': append the frame, including the descriptor
            # stored in frame.info, to the extended xyz file
            write(foutput + ".xyz", frame, append=True)
def _record_feature_columns(descriptor_metadata, n_features):
    """Store the integer feature column names and the label list in the metadata."""
    features = list(range(n_features))
    labels = ['energy']
    descriptor_metadata['features'] = features
    descriptor_metadata['labels'] = labels
    descriptor_metadata['features_and_labels'] = labels + features


def select_descriptor(data, descriptor_metadata):
    """Compute descriptor features for every configuration and append them to `data`.

    Parameters
    ----------
    data: object supporting ``data['configuration']`` (with ``.tolist()``)
        and ``data['metadata'].at[...]``; it is passed to ``pd.concat``, so
        presumably a pandas DataFrame.
    descriptor_metadata: dict with at least 'descriptor_type'. For
        'Coulomb_matrix' the keys 'scaling' and 'permutation' are read too.
        The dict is updated in place with 'features', 'labels' and
        'features_and_labels', and for some types with the descriptor
        callables.

    Returns
    -------
    `data` with one column per feature appended, and `descriptor_metadata`
    written to ``data['metadata'].at[1]``. For an unknown descriptor type
    no features are added and 'descriptor_type' is set to
    'NO_DESCRIPTOR_SELECTED'.
    """
    print('creating descriptor....')
    descriptor_type = descriptor_metadata['descriptor_type']

    if descriptor_type == 'Coulomb_matrix':
        # Imported only here so the PIV variants do not require dscribe.
        from dscribe.descriptors import CoulombMatrix

        # Not used below; the lookup is kept so that a missing 'scaling' key
        # still raises KeyError as before.
        scaling = descriptor_metadata['scaling']
        descriptor = CoulombMatrix(
            # len() replaces the deprecated get_number_of_atoms()
            n_atoms_max=len(data['configuration'][0]),
            flatten=True,
            permutation=descriptor_metadata['permutation'])
        rounding = 5
        configurations = data['configuration'].tolist()
        features = [descriptor.create(c) for c in configurations]
        features = np.round(np.array(features), rounding)
        data = pd.concat([data, pd.DataFrame(features)], axis=1)
        descriptor_metadata['descriptor'] = descriptor
        _record_feature_columns(descriptor_metadata,
                                descriptor.get_number_of_features())

    elif descriptor_type == 'PIV':
        # All switching functions are currently the identity. The original
        # code defined n = 8 with d0 = 4.5 (OO) and d0 = 2.3 (OH_plus) for the
        # disabled form 1/(1+(x/d0)**n).
        def switching_OO(x):
            return x

        def switching_OH_plus(x):
            return x

        def switching_HH(x):
            return x

        def switching_HH_plus(x):
            return switching_HH(x)

        def switching_OH(x):
            return switching_HH(x)

        def PIV(configuration):
            # Keep only the upper triangle of the distance matrix.
            distances = configuration.get_all_distances()
            np.fill_diagonal(distances, 0)
            distances[np.tril_indices(distances.shape[0], -1)] = 0
            # Blocks are picked by fixed atom indices
            # (assumes atoms 0-1 are O, 2 is the extra H, 3-6 are H — TODO confirm)
            OO = distances[0, 1].flatten()
            OH_plus = distances[0:2, 2].flatten()
            OH = distances[0:2, 3:7].flatten()
            HH_plus = distances[2, 3:7].flatten()
            HH = distances[3:6, 4:7].flatten()
            # Drop the entries of the 3x3 block that lie below the diagonal.
            HH = np.delete(HH, (3, 6, 7))
            OO = switching_OO(OO)
            OH_plus = switching_OH_plus(OH_plus)
            OH = switching_OH(OH)
            HH_plus = switching_HH_plus(HH_plus)
            HH = switching_HH(HH)
            # Sorting inside each block gives the permutation invariance.
            OO = np.sort(OO)
            OH_plus = np.sort(OH_plus)
            OH = np.sort(OH)
            HH_plus = np.sort(HH_plus)
            HH = np.sort(HH)
            return np.concatenate((OO, OH_plus, OH, HH_plus, HH), axis=None)

        configurations = data['configuration'].tolist()
        features = [PIV(c) for c in configurations]
        data = pd.concat([data, pd.DataFrame(features)], axis=1)
        _record_feature_columns(descriptor_metadata, 21)
        descriptor_metadata['switching_OO'] = switching_OO
        descriptor_metadata['switching_OH_plus'] = switching_OH_plus
        descriptor_metadata['switching_OH'] = switching_OH
        descriptor_metadata['switching_HH_plus'] = switching_HH_plus
        descriptor_metadata['switching_HH'] = switching_HH
        descriptor_metadata['descriptor'] = PIV

    elif descriptor_type == 'CM_with_PIV_sorting':

        def PIV(configuration):
            def w_diag(x):
                return 0.5 * np.power(x, 2.4)

            def switching_function(x):
                return 1 / x

            distances = configuration.get_all_distances()
            # Diagonal set to 1 first so that the inversion does not divide by zero.
            np.fill_diagonal(distances, 1)
            distances = switching_function(distances)
            distances[0, 0] = w_diag(8)
            distances[1, 1] = w_diag(8)
            for i in range(2, 7):
                distances[i, i] = w_diag(1)
            # Weight the inverse distances with fixed factors: 64 for the 0-1
            # pair, 8 for pairs between atoms 0-1 and atoms 2-6.
            distances[0, 1] = distances[0, 1] * 64
            distances[1, 0] = distances[1, 0] * 64
            distances[0:2, 2:7] = distances[0:2, 2:7] * 8
            distances[2:7, 0:2] = distances[2:7, 0:2] * 8
            OO = distances[0:2, 0:2].flatten()
            HH1 = distances[0:7, 2:7].flatten()
            HH2 = distances[2:7, 0:2].flatten()
            HH = np.append(HH2, HH1)
            OO = np.sort(OO)
            HH = np.sort(HH)
            return np.concatenate((OO, HH), axis=None)

        configurations = data['configuration'].tolist()
        features = [PIV(c) for c in configurations]
        data = pd.concat([data, pd.DataFrame(features)], axis=1)
        _record_feature_columns(descriptor_metadata, 49)

    elif descriptor_type == 'PIV_without_H_plus':

        def PIV(configuration):
            distances = configuration.get_all_distances()
            distances[np.tril_indices(distances.shape[0], -1)] = 0
            OO = distances[0:2, 0:2].flatten()
            HH1 = distances[0:7, 2:7].flatten()
            HH2 = distances[2:7, 0:2].flatten()
            HH = np.append(HH2, HH1)
            OO = np.sort(OO)
            HH = np.sort(HH)
            # Remove the zeros left by the diagonal and the lower triangle.
            OO = OO[OO != 0]
            HH = HH[HH != 0]
            return np.concatenate((OO, HH), axis=None)

        configurations = data['configuration'].tolist()
        features = [PIV(c) for c in configurations]
        data = pd.concat([data, pd.DataFrame(features)], axis=1)
        _record_feature_columns(descriptor_metadata, 21)

    elif descriptor_type == 'PIV_with_CM_diagonal_and_weighting':

        def PIV(configuration):
            def w_diag(x):
                return 0.5 * np.power(x, 2.4)

            def switching_function(x):
                return 1 / x

            # This PIV has 28 elements
            distances = configuration.get_all_distances()
            # Diagonal set to 1 first so that the inversion does not divide by zero.
            np.fill_diagonal(distances, 1)
            distances = switching_function(distances)
            distances[0, 0] = w_diag(8)
            distances[1, 1] = w_diag(8)
            for i in range(2, 7):
                distances[i, i] = w_diag(1)
            distances[np.tril_indices(distances.shape[0], -1)] = 0
            OO = distances[0:2, 0:2].flatten()
            HH1 = distances[0:7, 2:7].flatten()
            HH2 = distances[2:7, 0:2].flatten()
            HH = np.append(HH2, HH1)
            OO = np.sort(OO)
            HH = np.sort(HH)
            # Remove the zeros left by the lower triangle.
            OO = OO[OO != 0]
            HH = HH[HH != 0]
            return np.concatenate((OO, HH), axis=None)

        configurations = data['configuration'].tolist()
        features = [PIV(c) for c in configurations]
        data = pd.concat([data, pd.DataFrame(features)], axis=1)
        _record_feature_columns(descriptor_metadata, 28)

    else:
        print('NO DESCRIPTOR SELECTED!')
        descriptor_metadata['descriptor_type'] = 'NO_DESCRIPTOR_SELECTED'

    # Common tail of every branch: store the (updated) metadata and return.
    data['metadata'].at[1] = descriptor_metadata
    return data
# Example script: build a Coulomb matrix and a SOAP descriptor for water.
from dscribe.descriptors import SOAP
from dscribe.descriptors import CoulombMatrix
from ase.build import molecule

# Define geometry
mol = molecule("H2O")

# Setup descriptors
cm_desc = CoulombMatrix(n_atoms_max=3, permutation="sorted_l2")
# NOTE(review): `atomic_numbers=` is passed here while other snippets in this
# collection pass `species=` — confirm which one the installed DScribe accepts.
soap_desc = SOAP(atomic_numbers=[1, 8], rcut=5, nmax=8, lmax=6, crossover=True)

# Create descriptors as numpy arrays or scipy sparse matrices
input_cm = cm_desc.create(mol)
# SOAP output is requested for the single entry given in `positions`
input_soap = soap_desc.create(mol, positions=[0])
# Fragment: builds Coulomb matrices for `ase_mol` and shuffles them together
# with the HOMO values. `stats`, `ase_mol` and `homo` are not defined here and
# must be provided by the preceding code.
atomic_numbers = stats["atomic_numbers"]
max_atomic_number = stats["max_atomic_number"]
min_atomic_number = stats["min_atomic_number"]
min_distance = stats["min_distance"]

cm_desc = CoulombMatrix(
    n_atoms_max=29,  # maximum number of atoms in a molecule that occurs in dataset
    permutation="sorted_l2",
    # sparse=True
)

time_start = time.time()
cm_start = time.time()

# create CM for data
cm = cm_desc.create(ase_mol)

cm_end = time.time()
cm_time = np.round(cm_end - cm_start, decimals=3)

# shuffle the CM rows and the HOMO array with one shared permutation
# define index
index = np.arange(np.shape(cm)[0])
# shuffle index
np.random.shuffle(index)
# return shuffled cm matrix
shuffled_cm = cm[index, :]
# return shuffled HOMO array (the identifier was garbled to `h**o`, which is
# not a valid assignment target)
homo = np.array(homo)
shuffled_homo = homo[index]
# shuffled_homo.tolist()
def create(system):
    """Return the flattened, randomly permuted Coulomb matrix (n_atoms_max=3,
    sigma=0.000001) of `system`.
    """
    descriptor = CoulombMatrix(
        flatten=True, sigma=0.000001, permutation="random", n_atoms_max=3
    )
    output = descriptor.create(system)
    return output
# Example script: Coulomb matrices for one molecule and for a list of molecules.
from ase.build import molecule
from dscribe.descriptors import CoulombMatrix

# Define atomic structures
samples_mol = [molecule("H2O"), molecule("NO2"), molecule("CO2")]

# Setup descriptor
cm_desc = CoulombMatrix(n_atoms_max=3, permutation="sorted_l2")

# Create descriptor
water = samples_mol[0]
coulomb_matrix = cm_desc.create(water)

print("Coulomb matrix for water:\n", coulomb_matrix)

# Create multiple descriptors (the whole list is passed in one call)
coulomb_matrices = cm_desc.create(samples_mol)
print("List of Coulomb matrices:\n", coulomb_matrices)
# Example script: Coulomb matrix and SOAP descriptors, parallel creation and
# SOAP derivatives.
import numpy as np
from ase.build import molecule
from dscribe.descriptors import SOAP
from dscribe.descriptors import CoulombMatrix

# Define atomic structures
samples = [molecule("H2O"), molecule("NO2"), molecule("CO2")]

# Setup descriptors
cm_desc = CoulombMatrix(n_atoms_max=3, permutation="sorted_l2")
soap_desc = SOAP(species=["C", "H", "O", "N"], rcut=5, nmax=8, lmax=6, crossover=True)

# Create descriptors as numpy arrays or sparse arrays
water = samples[0]
coulomb_matrix = cm_desc.create(water)
soap = soap_desc.create(water, positions=[0])

# Easy to use also on multiple systems, can be parallelized across processes
coulomb_matrices = cm_desc.create(samples)
coulomb_matrices = cm_desc.create(samples, n_jobs=3)
# For every sample, the indices of the atoms with atomic number 8 (oxygen)
oxygen_indices = [np.where(x.get_atomic_numbers() == 8)[0] for x in samples]
oxygen_soap = soap_desc.create(samples, oxygen_indices, n_jobs=3)

# Some descriptors also allow calculating derivatives with respect to atomic
# positions
der, des = soap_desc.derivatives(samples, method="auto", return_descriptor=True)
def plotDescs(structs,
              indexs,
              level,
              descname,
              chemsyms,
              n_atoms,
              steve,
              v,
              path_output,
              save=True):
    """
    Plot descriptors

    Parameters
    ----------
    structs: structure(s) passed to the descriptor's ``create``.
    indexs: name used in plot titles; ``indexs[:-4]`` is used in file names
        (presumably a file name with a 3-letter extension — confirm).
    level: not used by this function.
    descname: "CM", "MBTR" or "SOAP"; any other value plots nothing.
    chemsyms: chemical symbols; tick labels for CM, species for MBTR/SOAP.
    n_atoms: ``n_atoms_max`` for the Coulomb matrix.
    steve: number of parallel jobs (``n_jobs``).
    v: print a message when done if truthy.
    path_output: directory for the saved figures.
    save: save each figure as PNG when truthy.
    """
    # choose the descriptor
    # (mk1/mk2/mk3 and srcut/snmax/slmax are not defined in this function and
    # are expected to exist at module level)
    if descname == "CM":
        desc = CoulombMatrix(
            n_atoms_max=n_atoms, flatten=False,
            permutation='none')  # permutation = 'sorted_l2' is default

        # Create descriptors
        descs = desc.create(structs, n_jobs=steve)  # Parallel

        # Plot CM of zero_cluster and save it to outputs-folder
        sns.heatmap(descs,
                    cmap='Spectral',
                    robust=True,
                    xticklabels=chemsyms,
                    yticklabels=chemsyms)
        plt.title("CM of {}".format(indexs))
        if save:
            plt.savefig("{}/{}_CM.png".format(path_output, indexs[:-4]))

    elif descname == "MBTR":
        desc = MBTR(species=list(set(chemsyms)),
                    k1=mk1,
                    k2=mk2,
                    k3=mk3,
                    periodic=False,
                    normalization="l2_each",
                    flatten=False)

        descs = desc.create(structs, n_jobs=steve)  # Parallel

        # Create the mapping between an index in the output and the
        # corresponding chemical symbol
        n_elements = len(desc.species)
        # dict({index_of_atom_type:Z_of_atom_type})
        imap = desc.index_to_atomic_number
        # dict({index_of_atom_type:atom_type_symbol})
        smap = {
            index: ase.data.chemical_symbols[number]
            for index, number in imap.items()
        }

        # Plot k=1
        x1 = desc.get_k1_axis()  # from fullmetalfelix/ML-CSC-tutorial
        fig, ax = plt.subplots()
        for i in range(n_elements):
            plt.plot(x1, descs["k1"][i, :], label="{}".format(smap[i]))
        # An earlier "Charge" label was immediately overwritten by this one.
        ax.set_xlabel(
            "Atomic number")  #, size=20) # from fullmetalfelix/ML-CSC-tutorial
        ax.set_ylabel("k1 values (arbitrary units)")  #, size=20)
        plt.legend()
        plt.title("MBTR k1 of {}".format(indexs))
        if save:
            plt.savefig("{}/{}_MBTR_k1.png".format(path_output, indexs[:-4]))

        # Plot k=2: one curve per unordered element pair
        x2 = desc.get_k2_axis()  # from fullmetalfelix/ML-CSC-tutorial
        fig, ax = plt.subplots()
        for i in range(n_elements):
            for j in range(n_elements):
                if j >= i:
                    plt.plot(x2,
                             descs["k2"][i, j, :],
                             label="{}-{}".format(smap[i], smap[j]))
        ax.set_xlabel("Inverse distance (1/angstrom)"
                      )  #, size=20) # How to make not inverse?
        ax.set_ylabel("k2 values (arbitrary units)")  #, size=20)
        plt.legend()
        plt.title("MBTR k2 of {}".format(indexs))
        if save:
            plt.savefig("{}/{}_MBTR_k2.png".format(path_output, indexs[:-4]))

        # Plot k=3: only triplets whose last element is "S" are drawn
        x3 = desc.get_k3_axis()  # from fullmetalfelix/ML-CSC-tutorial
        fig, ax = plt.subplots()
        for i in range(n_elements):
            for j in range(n_elements):
                if j >= i:
                    for k in range(n_elements):
                        if k >= j and smap[k] == "S":
                            plt.plot(x3,
                                     descs["k3"][i, j, k, :],
                                     label="{}-{}-{}".format(
                                         smap[i], smap[j], smap[k]))
        ax.set_xlabel("cos(angle)")  #, size=20)
        ax.set_ylabel("k3 values (arbitrary units)")  #, size=20)
        plt.legend()
        plt.title("MBTR k3 of {}".format(indexs))
        if save:
            plt.savefig("{}/{}_MBTR_k3.png".format(path_output, indexs[:-4]))

    elif descname == "SOAP":
        desc = SOAP(species=list(set(chemsyms)),
                    periodic=False,
                    rcut=srcut,
                    nmax=snmax,
                    lmax=slmax,
                    average=False)  # Averaging for global

        descs = desc.create(structs, n_jobs=steve)

        # Plot SOAPs for all atom pairs
        chemsyms_combos = list(combinations_with_replacement(desc.species, 2))
        for combo in chemsyms_combos:
            # The locations of specific element combinations can be retrieved
            # like this.
            pairloc = desc.get_location(combo)
            # These locations can be directly used to slice the corresponding
            # part from an SOAP output for e.g. plotting.
            plt.plot(descs[0, pairloc],
                     label="{}-{}".format(combo[0], combo[1]))
        plt.legend()
        #plt.xlim(20,40)
        plt.xlabel("N of features for an atom pair")
        plt.ylabel("Output value of SOAPs")
        plt.title("SOAPs of {}".format(indexs))
        if save:
            plt.savefig("{}/{}_SOAP.png".format(path_output, indexs[:-4]))

    if v:
        print("🔘 Plotting {} done.".format(descname))
# Example script: flattened, non-flattened and zero-padded Coulomb matrices.

# NOTE(review): the four values below are not used anywhere in this snippet;
# they look like leftovers from a SOAP example — confirm before removing.
atomic_numbers = [1, 8]
rcut = 6.0
nmax = 8
lmax = 6

# Setting up the CM descriptor
cm = CoulombMatrix(n_atoms_max=6, )

# Creating an atomic system as an ase.Atoms-object
from ase.build import molecule

methanol = molecule("CH3OH")
print(methanol)

# Create CM output for the system
cm_methanol = cm.create(methanol)

print(cm_methanol)
print("flattened", cm_methanol.shape)

# No flattening
cm = CoulombMatrix(n_atoms_max=6, flatten=False)
cm_methanol = cm.create(methanol)

print(cm_methanol)
print("not flattened", cm_methanol.shape)

# Introduce zero-padding (n_atoms_max larger than the molecule)
cm = CoulombMatrix(n_atoms_max=10, flatten=False)
cm_methanol = cm.create(methanol)
def f(x):
    """Objective function: 5-fold cross-validated MAE of a KRR model on
    Coulomb matrices.

    Parameters
    ----------
    x: indexable such that ``x[0][0]`` and ``x[0][1]`` are the negated
        base-10 exponents of the KRR regularisation ``alpha`` and of the
        Laplacian kernel width ``gamma``.

    Returns
    -------
    The mean absolute error averaged over the five folds.

    Side effects
    ------------
    Writes alpha and gamma to 'variables.in', reads
    '../data/data_train_1k.json', and creates or appends a row to
    'results/df_results_cm.json'.
    """
    iteration_start = time.time()

    ## KRR parameters
    alpha_exp = -x[0][0]
    gamma_exp = -x[0][1]
    alpha = 10**alpha_exp
    gamma = 10**gamma_exp

    # write variables to file
    with open('variables.in', 'w') as variables_file:
        variables_file.write(str(alpha))
        variables_file.write("\n")
        variables_file.write(str(gamma))

    MAE_list = []
    cv_time_list = []

    #### Load data
    data = pd.read_json("../data/data_train_1k.json")

    ###### extract xyz coordinates and HOMOs from dataframe
    homo_array = []
    out_mol = StringIO()
    for _, row in data.iterrows():
        homo_array.append(row[0][1])
        out_mol.write("".join(row.molecule))
    # The identifier was garbled to `h**o`, which is not a valid assignment
    # target; restored to `homo`.
    homo = np.array([float(value) for value in homo_array])

    ase_mol = list(ase.io.iread(out_mol, format="xyz"))

    ## Load statistics from the dataset
    # The returned values are not used below; the call is kept in case
    # system_stats has side effects.
    stats = system_stats(ase_mol)

    cm_desc = CoulombMatrix(
        #n_atoms_max=max_atomic_number,
        n_atoms_max=29,
        permutation="sorted_l2",
        #sparse=True
    )

    ############# create CM for data
    cm_start = time.time()
    cm = cm_desc.create(ase_mol)
    cm_end = time.time()
    cm_time = np.round(cm_end - cm_start, decimals=3)

    ################# split CM and HOMO array into 5 different parts
    ### shuffle rows of cm and homo with one shared permutation
    index = np.arange(np.shape(cm)[0])
    np.random.shuffle(index)
    shuffled_cm = cm[index, :]
    shuffled_homo = homo[index]

    ### split data into 5 equal parts
    # Fixed folds of 200 samples each (rows 0-999). Index arrays are used, as
    # before, so that a data set with fewer rows raises instead of silently
    # giving short folds.
    n_folds = 5
    fold_size = 200
    fold_ids = [
        np.arange(k * fold_size, (k + 1) * fold_size, 1)
        for k in range(n_folds)
    ]
    cm_parts = [shuffled_cm[ids, :] for ids in fold_ids]
    homo_parts = [shuffled_homo[ids] for ids in fold_ids]

    ##### arrange data into training and validation sets
    # Fold k validates on part k and trains on the four parts that follow it
    # cyclically (k+1, k+2, k+3, k+4 modulo 5).
    cm_train = []
    homo_train = []
    for k in range(n_folds):
        train_parts = [(k + offset) % n_folds for offset in range(1, n_folds)]
        cm_train.append(np.concatenate([cm_parts[j] for j in train_parts]))
        homo_train.append(
            np.concatenate([homo_parts[j] for j in train_parts]))
    cm_val = cm_parts
    homo_val = homo_parts

    print("cm_train_1:", cm_train[0])
    print("Length cm_train:", len(cm_train[0]))
    print("Shape cm_train:", cm_train[0].shape)

    ### KRR ###############
    for cm_train_i, homo_train_i, cm_val_i, homo_val_i in zip(
            cm_train, homo_train, cm_val, homo_val):
        cv_start = time.time()
        model = KernelRidge(alpha=alpha, kernel='laplacian', gamma=gamma)
        model.fit(cm_train_i, homo_train_i)
        y_true = homo_val_i
        y_pred = model.predict(cm_val_i)
        MAE = mean_absolute_error(y_true, y_pred)
        cv_end = time.time()
        cv_time_list.append(np.round(cv_end - cv_start, decimals=3))
        MAE_list.append(MAE)
        print("MAE:", MAE)

    avg_MAE = np.mean(MAE_list)
    avg_cv_time = np.mean(cv_time_list)

    iteration_end = time.time()
    iteration_time = np.round(iteration_end - iteration_start, decimals=3)
    print("iteration time:", iteration_time)

    # Append this iteration to the results table, creating it on first use.
    if os.path.isfile('results/df_results_cm.json'):
        df_results = pd.read_json('results/df_results_cm.json', orient='split')
        iteration = len(df_results) + 1
        print("iteration:", iteration)
        row = [
            iteration, avg_MAE, iteration_time, cm_time, avg_cv_time, alpha,
            gamma
        ]
        df_results.loc[len(df_results)] = row
        df_results.to_json('results/df_results_cm.json', orient='split')
        print(df_results)
    else:
        df_results = pd.DataFrame(
            [[1, avg_MAE, iteration_time, cm_time, avg_cv_time, alpha, gamma]],
            columns=[
                'iteration', 'avg_MAE', 'iteration_time', 'cm_time',
                'avg_cv_time', 'alpha', 'gamma'
            ])
        df_results.to_json('results/df_results_cm.json', orient='split')

    return avg_MAE