def __init__(self, desc_spec):
    """Make a DScribe Coulomb Matrix (CM) descriptor object.

    Parameters
    ----------
    desc_spec: dict
        Descriptor specification. It must contain ``"type": "CM"`` and a
        ``"max_atoms"`` entry. An optional ``"periodic"`` entry must not
        compare equal to ``True``.

    Raises
    ------
    ValueError
        If the type is missing or is not ``"CM"``, if ``"max_atoms"`` is
        missing, or if a periodic system is requested.
    """
    if "type" not in desc_spec or desc_spec["type"] != "CM":
        raise ValueError(
            "Type is not CM or cannot find the type of the descriptor")

    # required
    try:
        self.max_atoms = desc_spec['max_atoms']
    except KeyError as err:
        # Only a missing key means "not enough information"; anything else
        # is a genuine bug and must not be hidden behind this message.
        raise ValueError(
            "Not enough information to intialize the `Atomic_Descriptor_CM` object"
        ) from err

    # `== True` (rather than `is True`) is kept deliberately so that values
    # comparing equal to True, such as 1, are rejected exactly as before.
    if 'periodic' in desc_spec and desc_spec['periodic'] == True:  # noqa: E712
        raise ValueError(
            "Coulomb Matrix cannot be used for periodic systems")

    # Imported lazily and only after the specification has been validated,
    # so an invalid spec is reported even when DScribe is not installed.
    from dscribe.descriptors import CoulombMatrix

    self.cm = CoulombMatrix(self.max_atoms)
    print("Using CoulombMatrix ...")

    # make an acronym
    self.acronym = "CM" + "-" + str(self.max_atoms)
def test_features(self):
    """Compare the unsorted, unflattened H2O matrix with hand-built values."""
    descriptor = CoulombMatrix(n_atoms_max=5, permutation="none", flatten=False)
    matrix = descriptor.create(H2O)

    # Reference entries: 0.5 * Z_i**2.4 on the diagonal and
    # Z_i * Z_j / |R_i - R_j| everywhere else, for the first three atoms.
    numbers = H2O.get_atomic_numbers()
    coords = H2O.get_positions()
    n_atoms = 3

    # Start from the zero-padded 5x5 frame and fill the occupied corner.
    expected = np.zeros((5, 5))
    for i in range(n_atoms):
        for j in range(n_atoms):
            if i == j:
                expected[i, j] = 0.5 * numbers[i]**2.4
            else:
                separation = np.linalg.norm(coords[i] - coords[j])
                expected[i, j] = numbers[i] * numbers[j] / separation

    self.assertTrue(np.array_equal(matrix, expected))
def test_constructor(self):
    """Check that invalid constructor arguments raise ``ValueError``."""
    invalid_settings = (
        # Unknown permutation name.
        {"n_atoms_max": 5, "permutation": "unknown"},
        # Negative maximum atom count.
        {"n_atoms_max": -1},
    )
    for settings in invalid_settings:
        with self.assertRaises(ValueError):
            CoulombMatrix(**settings)
def test_number_of_features(self):
    """Check that a 5-atom unflattened descriptor reports 25 features."""
    descriptor = CoulombMatrix(
        n_atoms_max=5,
        permutation="sorted_l2",
        flatten=False,
    )
    self.assertEqual(descriptor.get_number_of_features(), 25)
def test_features(self):
    """Check that ``sorted_l2`` yields rows with non-increasing L2 norm."""
    descriptor = CoulombMatrix(
        n_atoms_max=5,
        permutation="sorted_l2",
        flatten=False,
    )
    matrix = descriptor.create(H2O)
    row_norms = np.linalg.norm(matrix, axis=1)

    # Each row norm must not exceed the norm of the row before it.
    for previous, current in zip(row_norms[:-1], row_norms[1:]):
        self.assertTrue(current <= previous)
def test_constructor(self):
    """Check that inconsistent ``permutation``/``sigma`` pairs are rejected."""
    rejected_pairs = (
        ("random", None),
        ("sorted_l2", 3),
        ("none", 3),
        ("eigenspectrum", 3),
    )
    for permutation, sigma in rejected_pairs:
        with self.assertRaises(ValueError):
            CoulombMatrix(n_atoms_max=5, permutation=permutation, sigma=sigma)
def test_exceptions(self):
    """Check that invalid ``permutation``/``sigma`` pairs raise ``ValueError``."""
    rejected_pairs = (
        ("random", None),
        ("sorted_l2", 3),
        ("none", 3),
        ("eigenspectrum", 3),
    )
    for permutation, sigma in rejected_pairs:
        with self.assertRaises(ValueError):
            CoulombMatrix(n_atoms_max=5, permutation=permutation, sigma=sigma)
def test_norm_vector(self):
    """Check that ``_norm_vector`` is written and reused on repeated calls."""
    descriptor = CoulombMatrix(
        n_atoms_max=5,
        permutation="random",
        sigma=100,
        flatten=False,
    )

    first_matrix = descriptor.create(H2O)
    self.assertEqual(len(first_matrix), 5)

    # The norm vector is not zero-padded in this implementation: padding is
    # applied only at the end, after the random sorting.
    self.assertEqual(len(descriptor._norm_vector), 3)

    second_matrix = descriptor.create(H2O)
    self.assertEqual(len(second_matrix), 5)
def __init__(self, preprocessor=None, batch_size=None, filename="features.db", scheduler="distributed", save_preprocessor="ml4chem", overwrite=True, **kwargs):
    """Initialise the Coulomb-matrix featurizer and record its parameters.

    Parameters
    ----------
    preprocessor
        Stored as ``self.preprocessor``; also recorded in ``self.params``
        when it is not None.
    batch_size
        Stored as ``self.batch_size``; excluded from ``self.params``.
    filename
        Stored as ``self.filename``; recorded in ``self.params``.
    scheduler
        Stored as ``self.scheduler``; excluded from ``self.params``.
    save_preprocessor
        Stored as ``self.save_preprocessor``; recorded in ``self.params``.
    overwrite
        Stored as ``self.overwrite``; excluded from ``self.params``.
    **kwargs
        Forwarded unchanged to ``CoulombMatrixDscribe.__init__``. The whole
        mapping is also recorded in ``self.params`` under the key
        ``"kwargs"``.
    """
    super(CoulombMatrix, self).__init__()

    # The DScribe base is always initialised with permutation="none" and
    # flatten=False; any other option has to arrive through **kwargs.
    CoulombMatrixDscribe.__init__(self, permutation="none", flatten=False, **kwargs)

    self.batch_size = batch_size
    self.filename = filename
    self.preprocessor = preprocessor
    self.scheduler = scheduler
    self.overwrite = overwrite
    self.save_preprocessor = save_preprocessor

    # Let's add parameters that are going to be stored in the .params json
    # file.
    self.params = OrderedDict()
    self.params["name"] = self.name()

    # This is a very general way of not forgetting to save variables.
    # vars() with no argument returns the local namespace, so at this point
    # it holds the constructor arguments (including `self` and `kwargs`).
    # NOTE(review): the result depends on vars() being called before any
    # further local is assigned; keep this statement where it is.
    _params = vars()

    # Delete useless variables
    delete = [
        "self",
        "scheduler",
        "overwrite",
        "k",
        "v",
        "value",
        "keys",
        "batch_size",
        "__class__",
    ]

    for param in delete:
        try:
            del _params[param]
        except KeyError:
            # In case the variable does not exist we just pass.
            pass

    # Only entries that survived the deletion and are not None are saved.
    for k, v in _params.items():
        if v is not None:
            self.params[k] = v
def test_features(self):
    """Check the shape, ordering and zero-padding of the eigenspectrum."""
    descriptor = CoulombMatrix(n_atoms_max=5, permutation="eigenspectrum")
    spectrum = descriptor.create(H2O)
    self.assertEqual(spectrum.shape, (5,))

    n_atoms = len(H2O)

    # Absolute eigenvalues must be non-increasing over the occupied entries.
    magnitude_bound = float("Inf")
    for value in spectrum[:n_atoms]:
        magnitude = abs(value)
        self.assertTrue(magnitude <= magnitude_bound)
        magnitude_bound = magnitude

    # Everything after the occupied entries is zero-padding.
    self.assertTrue(np.array_equal(spectrum[n_atoms:], [0, 0]))
def test_periodicity(self):
    """Check that the Coulomb matrix ignores periodic boundary conditions.

    The off-diagonal element of a periodic H-H pair must equal the value
    computed from the plain (non-wrapped) distance between the two atoms.
    """
    hydrogen_pair = Atoms(
        symbols=["H", "H"],
        scaled_positions=[
            [0.1, 0, 0],
            [0.9, 0, 0],
        ],
        cell=[5, 5, 5],
        pbc=True,
    )
    descriptor = CoulombMatrix(n_atoms_max=5, permutation="none", flatten=False)
    matrix = descriptor.create(hydrogen_pair)

    first, second = hydrogen_pair.get_positions()
    expected = 1 * 1 / np.linalg.norm(first - second)
    self.assertEqual(matrix[0, 1], expected)
def ML_potential(config, data):
    """Predict the energy of an ``O2H5`` configuration with the stored model.

    Parameters
    ----------
    config
        Atomic positions passed to ``Atoms('O2H5', positions=config)``.
    data
        Mapping whose ``'metadata'`` sequence holds the descriptor settings
        at index 1 and the fitted model at index 3.

    Returns
    -------
    The first element of the first prediction row, or ``None`` when the
    descriptor type is neither ``'Coulomb_matrix'`` nor ``'PIV'``.
    """
    model = data['metadata'][3]['best_model_fitted']
    descriptor_type = data['metadata'][1]['descriptor_type']

    # Pick the callable that turns an Atoms object into model features.
    if descriptor_type == 'Coulomb_matrix':
        featurize = CoulombMatrix(
            n_atoms_max=7, flatten=True, permutation='sorted_l2').create
    elif descriptor_type == 'PIV':
        featurize = data['metadata'][1]['descriptor']
    else:
        return None

    atoms = Atoms('O2H5', positions=config)
    features = featurize(atoms)
    return model.predict(features)[0][0]
def test_distribution(self):
    """Tests if the random sorting obeys a gaussian distribution.

    This is a statistical test and can rarely fail when everything is OK.
    """
    # Get the mean values to compare to. The row norms are read in the row
    # order produced by "sorted_l2"; they must NOT be re-sorted here, as the
    # first two entries are the ones compared below.
    sigma = 5
    desc = CoulombMatrix(n_atoms_max=5, permutation="sorted_l2", flatten=False)
    cm = desc.create(HHe)
    row_norms = np.linalg.norm(cm, axis=1)
    mu2 = row_norms[0]
    mu1 = row_norms[1]

    # Measures how many times the two rows with biggest norm exchange place
    # when random noise is added. This should correspond to the probability
    # P(X > Y), where X = N(\mu_1, \sigma^2), Y = N(\mu_2, \sigma^2). This
    # probability can be reduced to P(X > Y) = P(X-Y > 0) = P(N(\mu_1 -
    # \mu_2, \sigma^2 + sigma^2) > 0). See e.g.
    # https://en.wikipedia.org/wiki/Sum_of_normally_distributed_random_variables
    desc = CoulombMatrix(n_atoms_max=5, permutation="random", sigma=sigma, flatten=False)
    count = 0
    rand_instances = 20000
    for _ in range(rand_instances):
        cm = desc.create(HHe)
        if np.linalg.norm(cm[0]) < np.linalg.norm(cm[1]):
            count += 1

    # The expected probability is calculated from the cumulative
    # distribution function.
    expected = 1 - scipy.stats.norm.cdf(0, mu1 - mu2, np.sqrt(sigma**2 + sigma**2))
    observed = count / rand_instances

    self.assertTrue(abs(expected - observed) <= 1e-2)
class Global_Descriptor_CM(Global_Descriptor_Base):
    """Global Coulomb Matrix (CM) descriptor built on DScribe."""

    def __init__(self, desc_spec):
        """Make a DScribe CM object.

        Parameters
        ----------
        desc_spec: dict
            Descriptor specification. It must contain ``"type": "CM"`` and a
            ``"max_atoms"`` entry. An optional ``"periodic"`` entry must not
            compare equal to ``True``.

        Raises
        ------
        ValueError
            If the type is missing or is not ``"CM"``, if ``"max_atoms"`` is
            missing, or if a periodic system is requested.
        """
        if "type" not in desc_spec or desc_spec["type"] != "CM":
            raise ValueError(
                "Type is not CM or cannot find the type of the descriptor")

        # required
        try:
            self.max_atoms = desc_spec['max_atoms']
        except KeyError as err:
            # Only a missing key means "not enough information"; any other
            # failure is a genuine bug and must not be masked.
            raise ValueError(
                "Not enough information to initialize the `Global_Descriptor_CM` object"
            ) from err

        # `== True` (rather than `is True`) is kept deliberately so that
        # values comparing equal to True, such as 1, are still rejected.
        if 'periodic' in desc_spec and desc_spec['periodic'] == True:  # noqa: E712
            raise ValueError(
                "Coulomb Matrix cannot be used for periodic systems")

        # Imported lazily and only after the specification is validated, so
        # an invalid spec is reported even when DScribe is not installed.
        from dscribe.descriptors import CoulombMatrix

        self.cm = CoulombMatrix(self.max_atoms)
        print("Using CoulombMatrix ...")

        # make an acronym
        self.acronym = "CM" + "-" + str(self.max_atoms)

    def create(self, frame):
        """Compute the CM descriptor vector for a frame.

        Parameters
        ----------
        frame: ASE atom object.
            Coordinates of a frame.

        Returns
        -------
        desc_dict: dict
            ``{'acronym': 'CM-*', 'descriptors': <output of the DScribe
            ``create`` call for this frame>}``.
        atomic_desc_dict: dict
            Always empty; this descriptor has no per-atom part.

        Raises
        ------
        ValueError
            If the frame holds more atoms than ``max_atoms``.
        """
        if len(frame.get_positions()) > self.max_atoms:
            raise ValueError(
                'the size of the system is larger than the max_atoms of the CM descriptor'
            )

        global_desc = {
            'acronym': self.acronym,
            'descriptors': self.cm.create(frame, n_jobs=1),
        }
        # notice that we return an empty dictionary for "atomic descriptors"
        return global_desc, {}
def test_match_with_sorted(self):
    """Check that sorting a random Coulomb matrix gives the sorted matrix."""
    random_desc = CoulombMatrix(
        n_atoms_max=5,
        permutation="random",
        sigma=100,
        flatten=False,
    )
    resorted = random_desc.sort(random_desc.create(H2O))

    sorted_desc = CoulombMatrix(
        n_atoms_max=5,
        permutation="sorted_l2",
        flatten=False,
    )
    reference = sorted_desc.create(H2O)

    self.assertTrue(np.array_equal(reference, resorted))
def test_exceptions(self):
    """Check that invalid parameters and inputs raise ``ValueError``."""
    # Invalid constructor arguments.
    for bad_kwargs in (
        {"n_atoms_max": 5, "permutation": "unknown"},
        {"n_atoms_max": -1},
    ):
        with self.assertRaises(ValueError):
            CoulombMatrix(**bad_kwargs)

    # Creating features for [HHe, H2O] with n_atoms_max=2 must fail.
    with self.assertRaises(ValueError):
        too_small = CoulombMatrix(n_atoms_max=2)
        too_small.create([HHe, H2O])
def test_flatten(self):
    """Check the output shape with and without flattening (``sorted_l2``)."""
    expected_shapes = (
        (False, (5, 5)),  # unflattened
        (True, (25,)),    # flattened
    )
    for flatten, shape in expected_shapes:
        descriptor = CoulombMatrix(
            n_atoms_max=5, permutation="sorted_l2", flatten=flatten)
        matrix = descriptor.create(H2O)
        self.assertEqual(matrix.shape, shape)
def setupDescs(structs, indexs, level, descname, chemsyms_uniques, n_atoms, steve, v):
    """Set up a descriptor and run it for ASE structures.

    Parameters
    ----------
    structs
        Collection of structures handed to the descriptor's ``create``.
    indexs
        Index used for the returned DataFrame.
    level
        Label that only appears in the verbose message.
    descname
        Descriptor to use: ``"CM"``, ``"MBTR"`` or ``"SOAP"``.
    chemsyms_uniques
        Species passed to MBTR and SOAP.
    n_atoms
        ``n_atoms_max`` for the Coulomb matrix.
    steve
        Passed to ``create`` as ``n_jobs``.
    v
        When truthy, print a summary message.

    Returns
    -------
    tuple
        ``(descs_df, n_feat)``: the descriptors as a DataFrame indexed by
        ``indexs``, and the number of features reported by the descriptor.

    Raises
    ------
    ValueError
        If ``descname`` is not one of the supported descriptor names.

    Notes
    -----
    The MBTR and SOAP settings (``mk1``, ``mk2``, ``mk3``, ``srcut``,
    ``snmax``, ``slmax``) are not parameters; they are expected to be
    defined at module level.
    """
    # choose the descriptor
    if descname == "CM":
        # permutation = 'sorted_l2' is default
        desc = CoulombMatrix(n_atoms_max=n_atoms, flatten=True)
    elif descname == "MBTR":
        desc = MBTR(species=chemsyms_uniques,
                    k1=mk1,
                    k2=mk2,
                    k3=mk3,
                    periodic=False,
                    normalization="l2_each",
                    flatten=True)
    elif descname == "SOAP":
        # Averaging for global
        desc = SOAP(species=chemsyms_uniques,
                    periodic=False,
                    rcut=srcut,
                    nmax=snmax,
                    lmax=slmax,
                    average=True)
    else:
        # Without this guard `desc` would be unbound further down and the
        # caller would only see an opaque UnboundLocalError.
        raise ValueError(
            "Unknown descriptor name {!r}; expected 'CM', 'MBTR' or 'SOAP'".format(descname))

    n_feat = desc.get_number_of_features()

    # Create descriptors
    descs = desc.create(structs, n_jobs=steve)  # Parallel

    # Create a DF of returned `list` of `arrays` of descs
    descs_df = pd.DataFrame(descs, index=indexs)

    if v:
        # len() works for plain lists as well as for array-like containers.
        print("🔘 Created {}-descriptors for all {} {}-structures. "
              "Number of features in {}: {}".format(
                  descname, len(structs), level, descname, n_feat))

    return descs_df, n_feat
def test_flatten(self):
    """Check that the eigenspectrum has shape ``(5,)`` for both flatten modes."""
    expected_shapes = (
        (False, (5,)),  # unflattened
        (True, (5,)),   # flattened
    )
    for flatten, shape in expected_shapes:
        descriptor = CoulombMatrix(
            n_atoms_max=5, permutation="eigenspectrum", flatten=flatten)
        spectrum = descriptor.create(H2O)
        self.assertEqual(spectrum.shape, shape)
def test_sparse(self):
    """Check the output container type for dense and sparse settings."""
    # Dense output must be exactly a numpy array.
    dense_desc = CoulombMatrix(
        n_atoms_max=5, permutation="none", flatten=False, sparse=False)
    dense_out = dense_desc.create(H2O)
    self.assertIs(type(dense_out), np.ndarray)

    # Sparse output must be exactly a scipy COO matrix.
    sparse_desc = CoulombMatrix(
        n_atoms_max=5, permutation="none", flatten=True, sparse=True)
    sparse_out = sparse_desc.create(H2O)
    self.assertIs(type(sparse_out), scipy.sparse.coo_matrix)
def test_sparse(self):
    """Check the output container type of the random Coulomb matrix."""
    shared = {"n_atoms_max": 5, "permutation": "random", "sigma": 100}

    # Dense output must be exactly a numpy array.
    dense_out = CoulombMatrix(flatten=False, sparse=False, **shared).create(H2O)
    self.assertIs(type(dense_out), np.ndarray)

    # Sparse output must be exactly a sparse.COO array.
    sparse_out = CoulombMatrix(flatten=True, sparse=True, **shared).create(H2O)
    self.assertIs(type(sparse_out), sparse.COO)
def test_batch_create(self):
    """Run ``batch_create`` once with a global and once with a local descriptor.

    Nothing is asserted on the results; the test only requires both calls
    to complete without raising.
    """
    structures = [molecule(formula) for formula in ("H2O", "C6H6")]

    # Global descriptor, process count given positionally.
    global_desc = CoulombMatrix(n_atoms_max=12, permutation="sorted_l2")
    batch_create(global_desc, structures, 2)

    # Local descriptor, one list of positions per structure.
    local_desc = SOAP(
        atomic_numbers=[1, 6, 8],
        rcut=5,
        nmax=3,
        lmax=3,
        sigma=1,
        periodic=False,
        crossover=True,
        average=False,
        sparse=True,
    )
    per_structure_positions = [[0], [1]]
    batch_create(local_desc, structures, positions=per_structure_positions, n_proc=2)
def test_flatten(self):
    """Check the output shape of the random matrix with and without flattening."""
    expected_shapes = (
        (False, (5, 5)),  # unflattened
        (True, (1, 25)),  # flattened
    )
    for flatten, shape in expected_shapes:
        descriptor = CoulombMatrix(
            n_atoms_max=5, permutation="random", sigma=100, flatten=flatten)
        matrix = descriptor.create(H2O)
        self.assertEqual(matrix.shape, shape)
# Example script: build Coulomb-matrix descriptors for methanol with DScribe,
# first with the default settings, then unflattened, then zero-padded.
from dscribe.descriptors import CoulombMatrix

# NOTE(review): these four names are not referenced anywhere in the visible
# snippet -- presumably left over from another example; confirm before
# removing them.
atomic_numbers = [1, 8]
rcut = 6.0
nmax = 8
lmax = 6

# Setting up the CM descriptor
cm = CoulombMatrix(n_atoms_max=6, )

# Creating an atomic system as an ase.Atoms-object
from ase.build import molecule

methanol = molecule("CH3OH")
print(methanol)

# Create CM output for the system.
# The script labels this output "flattened" -- presumably flattening is the
# default of this DScribe version; confirm against the installed release.
cm_methanol = cm.create(methanol)
print(cm_methanol)
print("flattened", cm_methanol.shape)

# No flattening
cm = CoulombMatrix(n_atoms_max=6, flatten=False)
cm_methanol = cm.create(methanol)
print(cm_methanol)
print("not flattened", cm_methanol.shape)

# Introduce zero-padding: n_atoms_max=10 exceeds methanol's six atoms.
cm = CoulombMatrix(n_atoms_max=10, flatten=False)
cm_methanol = cm.create(methanol)
# Module-level setup: imports, the Coulomb-matrix and SOAP descriptors that
# are shared by the code that follows, and a fixed seed value.
import torch
from ase.build import bulk
from ase import Atom, Atoms
import random, pickle
import numpy as np
from ase.formula import Formula
from dscribe.descriptors import CoulombMatrix

# Coulomb-matrix descriptor sized for systems of up to 50 atoms.
cm_dscrb = CoulombMatrix(n_atoms_max=50, )

from dscribe.descriptors import SOAP

# Settings handed to the SOAP constructor below.
species = ["H", "C", "O", "N", "F"]
rcut = 6.0
nmax = 8
lmax = 6

# Setting up the SOAP descriptor
soap = SOAP(
    species=species,
    periodic=False,
    rcut=rcut,
    nmax=nmax,
    lmax=lmax,
)

from sklearn.decomposition import PCA

# PCA is imported but its use is currently disabled:
# pca = PCA(n_components=600)
# pca = None

# NOTE(review): presumably consumed by random-number seeding further down
# the file; no use is visible in this snippet.
seed = 1234
PySpark API. """ from functional.streams import ParallelStream as pseq from collections import namedtuple import ase.build.bulk from dscribe.descriptors import CoulombMatrix from dscribe.descriptors import SineMatrix from dscribe.descriptors import EwaldMatrix # Setup the descriptors n_atoms_max = 4 n_proc = 4 coulombmatrix = CoulombMatrix(n_atoms_max=n_atoms_max) sinematrix = SineMatrix(n_atoms_max=n_atoms_max) ewaldmatrix = EwaldMatrix(n_atoms_max=n_atoms_max) # Define a dataset data = { "NaCl": ase.build.bulk("NaCl", "rocksalt", 5.64), "Diamond": ase.build.bulk("C", "diamond", 3.567), "Al": ase.build.bulk("Al", "fcc", 4.046), "GaAs": ase.build.bulk("GaAs", "zincblende", 5.653), } # Setup an iterable that runs through the samples. Result = namedtuple("Result", "cm sm em") Sample = namedtuple("Sample", "key value") samples = [Sample(key, value) for key, value in data.items()]
# Load configuration from an XYZ file with ASE. See # "https://wiki.fysik.dtu.dk/ase/ase/io/io.html" for a list of supported file # formats. atoms = ase.io.read("nacl.xyz") atoms.set_cell([5.640200, 5.640200, 5.640200]) atoms.set_initial_charges(atoms.get_atomic_numbers()) # There are utilities for automatically detecting statistics for ASE Atoms # objects. Typically some statistics are needed for the descriptors in order to # e.g. define a proper zero-padding stats = system_stats([atoms]) n_atoms_max = stats["n_atoms_max"] atomic_numbers = stats["atomic_numbers"] # Create descriptors for this system directly from the ASE atoms cm = CoulombMatrix(n_atoms_max, permutation="sorted_l2").create(atoms) sm = SineMatrix(n_atoms_max, permutation="sorted_l2").create(atoms) mbtr = MBTR(atomic_numbers, k=[1, 2, 3], periodic=True, weighting={ "k2": { "function": "exponential", "scale": 0.5, "cutoff": 1e-3 }, "k3": { "function": "exponential", "scale": 0.5, "cutoff": 1e-3 },
# Example script: create Coulomb-matrix and SOAP descriptors for a few small
# molecules, for a single system and for batches, plus SOAP derivatives.
import numpy as np
from ase.build import molecule
from dscribe.descriptors import SOAP
from dscribe.descriptors import CoulombMatrix

# Define atomic structures
samples = [molecule("H2O"), molecule("NO2"), molecule("CO2")]

# Setup descriptors
cm_desc = CoulombMatrix(n_atoms_max=3, permutation="sorted_l2")
soap_desc = SOAP(species=["C", "H", "O", "N"], rcut=5, nmax=8, lmax=6, crossover=True)

# Create descriptors as numpy arrays or sparse arrays
water = samples[0]
coulomb_matrix = cm_desc.create(water)
# SOAP is evaluated only at the listed position (index 0).
soap = soap_desc.create(water, positions=[0])

# Easy to use also on multiple systems, can be parallelized across processes
coulomb_matrices = cm_desc.create(samples)
# Same call again with n_jobs=3; the previous result is overwritten.
coulomb_matrices = cm_desc.create(samples, n_jobs=3)
# For every sample, the indices of atoms with atomic number 8 (oxygen).
oxygen_indices = [np.where(x.get_atomic_numbers() == 8)[0] for x in samples]
oxygen_soap = soap_desc.create(samples, oxygen_indices, n_jobs=3)

# Some descriptors also allow calculating derivatives with respect to atomic
# positions
der, des = soap_desc.derivatives(samples, method="auto", return_descriptor=True)
# HOMO values as plain Python floats. The identifier was garbled to an
# invalid assignment target in the original text and is restored here.
homo = np.array(homo_array)
homo = [float(x) for x in homo]
#print(homo_train)

# Read every structure from the XYZ file into a list of ASE objects.
ase_mol = list(ase.io.iread(out_mol, format="xyz"))

## Load statistics from the dataset
stats = system_stats(ase_mol)
atomic_numbers = stats["atomic_numbers"]
max_atomic_number = stats["max_atomic_number"]
min_atomic_number = stats["min_atomic_number"]
min_distance = stats["min_distance"]

cm_desc = CoulombMatrix(
    n_atoms_max=29,  ## maximum number of atoms in a molecule that occurs in dataset
    permutation="sorted_l2",
    #sparse=True
)

time_start = time.time()
cm_start = time.time()

############# create CM for data ##############################################################################
cm = cm_desc.create(ase_mol)
cm_end = time.time()
# Time spent creating the descriptors, rounded to 3 decimals.
cm_time = np.round(cm_end - cm_start, decimals=3)

################# split CM and homo array into 5 different parts
### define index
index = np.arange(np.shape(cm)[0])
### shuffle index
def create(system):
    """Return the flattened, unsorted Coulomb matrix of ``system``.

    A fresh descriptor with ``n_atoms_max=3`` is built on every call.
    """
    return CoulombMatrix(
        n_atoms_max=3,
        permutation="none",
        flatten=True,
    ).create(system)