def test_xy(self):
    """Check that a kernel can be evaluated between two different sets.

    Kernel-based models need such a cross-set kernel when making
    predictions, so the output shape must follow the sizes of the two
    input collections.
    """
    # A single SOAP descriptor produces the features for every molecule.
    soap = SOAP(
        species=[1, 8],
        rcut=5.0,
        nmax=2,
        lmax=2,
        sigma=0.2,
        periodic=False,
        crossover=True,
        sparse=False,
    )
    systems = [molecule(formula) for formula in ("H2O", "O2", "H2O2")]
    water_feat, oxygen_feat, peroxide_feat = [
        soap.create(system) for system in systems
    ]

    # Linear dot-product kernel: two structures on the rows, one on the columns.
    similarity = AverageKernel(metric="linear")
    gram = similarity.create([water_feat, oxygen_feat], [peroxide_feat])
    self.assertEqual(gram.shape, (2, 1))
def test_convergence_infinity(self):
    """Check that REMatch approaches the average kernel for very large alpha.

    Both kernels are evaluated on the same pair of SOAP feature sets with a
    linear metric and the resulting matrices are compared for approximate
    equality.
    """
    # SOAP features for the two molecules under comparison.
    soap = SOAP(
        species=[1, 8],
        rcut=5.0,
        nmax=2,
        lmax=2,
        sigma=0.2,
        periodic=False,
        crossover=True,
        sparse=False,
    )
    water = molecule("H2O")
    peroxide = molecule("H2O2")
    feature_sets = [soap.create(water), soap.create(peroxide)]

    # REMatch kernel with a very high alpha.
    rematch = REMatchKernel(metric="linear", alpha=1e20, threshold=1e-6)
    gram_rematch = rematch.create(feature_sets)

    # Reference: plain average kernel.
    average = AverageKernel(metric="linear")
    gram_average = average.create(feature_sets)

    # The two matrices should agree within floating-point tolerance.
    self.assertTrue(np.allclose(gram_rematch, gram_average))
def test_difference(self):
    """Check the kernel values for identical, unrelated and related molecules.

    Three cases are exercised with a linear average kernel: a molecule
    against itself (all entries close to 1), N2 against H2O (matrix close
    to the identity) and H2O against H2O2 (off-diagonal entry above 0.9).
    """
    soap = SOAP(
        species=[1, 6, 7, 8],
        rcut=5.0,
        nmax=2,
        lmax=2,
        sigma=0.2,
        periodic=False,
        crossover=True,
        sparse=False,
    )

    # Identical molecules: every entry of the kernel matrix is ~1.
    water_features = soap.create(molecule("H2O"))
    average = AverageKernel(metric="linear")
    gram = average.create([water_features, water_features])
    deviation_from_one = np.abs(gram - 1)
    self.assertTrue(np.all(deviation_from_one < 1e-3))

    # Completely different molecules: the kernel matrix is ~identity.
    nitrogen, water = molecule("N2"), molecule("H2O")
    nitrogen_features = soap.create(nitrogen)
    water_features = soap.create(water)
    gram = average.create([nitrogen_features, water_features])
    deviation_from_identity = np.abs(gram - np.eye(2))
    self.assertTrue(np.all(deviation_from_identity < 1e-3))

    # Somewhat similar molecules: the cross term is high but not checked for 1.
    water, peroxide = molecule("H2O"), molecule("H2O2")
    water_features = soap.create(water)
    peroxide_features = soap.create(peroxide)
    gram = average.create([water_features, peroxide_features])
    self.assertTrue(gram[0, 1] > 0.9)
def test_metrics(self):
    """Check that several scikit-learn metrics are accepted by the kernel.

    The test only verifies that building and evaluating the kernel does not
    raise; the resulting matrices are not inspected.
    """
    soap = SOAP(
        species=[1, 8],
        rcut=5.0,
        nmax=2,
        lmax=2,
        sigma=0.2,
        periodic=False,
        crossover=True,
        sparse=False,
    )
    water_features = soap.create(molecule("H2O"))

    # (metric name, extra keyword arguments): linear dot product, Gaussian
    # and Laplacian, evaluated in this order.
    metric_settings = (
        ("linear", {}),
        ("rbf", {"gamma": 1}),
        ("laplacian", {"gamma": 1}),
    )
    for metric_name, metric_kwargs in metric_settings:
        kernel = AverageKernel(metric=metric_name, **metric_kwargs)
        kernel.create([water_features, water_features])
def average_listcomp(desc_list):
    """Compare every descriptor array with its successor in ``desc_list``.

    For each consecutive pair, the second array is cut down to the number of
    columns found in the first row of the first array, the pair is passed to
    a linear ``AverageKernel``, and the ``[0][1]`` entry of the result is
    collected. Progress information is printed for every pair.

    :param desc_list: indexable sequence of 2D descriptor arrays
    :return: list with one kernel value per consecutive pair
    """
    similarity = AverageKernel(metric='linear')
    pair_scores = []
    for done, idx in enumerate(range(len(desc_list) - 1), start=1):
        current = desc_list[idx]
        following = desc_list[idx + 1]
        # Truncate the successor so both arrays have the same feature count.
        n_features = len(current[0])
        pair = [current, following[:, 0:n_features]]
        print([len(pair[0]), len(pair[1])])
        print([len(pair[0][0]), len(pair[1][0])])
        gram = similarity.create(pair)
        pair_scores.append(gram[0][1])
        print(f'done {done} comparisons')
    return pair_scores
def test_sparse(self):
    """Check that sparse SOAP output can be fed to the average kernel.

    The test passes as long as creating the kernel does not raise; the
    resulting matrix is not inspected.
    """
    # Same descriptor settings as the dense tests, but with sparse output.
    soap = SOAP(
        species=[1, 8],
        rcut=5.0,
        nmax=2,
        lmax=2,
        sigma=0.2,
        periodic=False,
        crossover=True,
        sparse=True,
    )
    water_features = soap.create(molecule("H2O"))

    average = AverageKernel(metric="linear")
    average.create([water_features])
# Spellings (case-insensitive) that are interpreted as a disabled flag.
_FALSE_FLAG_STRINGS = frozenset({"", "0", "f", "false", "n", "no", "none", "off"})


def _parse_bool_flag(value):
    """Interpret a command-line style flag as a boolean.

    ``bool("False")`` is ``True`` in Python, so string flags need explicit
    parsing. Strings are compared case-insensitively against a list of
    "false" spellings; every other string counts as true. Non-string values
    go through the regular ``bool()`` conversion.
    """
    if isinstance(value, str):
        return value.strip().lower() not in _FALSE_FLAG_STRINGS
    return bool(value)


def main(fxyz, dictxyz, prefix, soap_rcut, soap_g, soap_n, soap_l, soap_periodic, matrix_plot):
    """
    Generate the SOAP kernel matrix and write it to a ``.kmat`` text file.

    Parameters
    ----------
    fxyz: string giving location of xyz file, or 'none' to skip it
    dictxyz: string giving location of xyz file that is used as a dictionary,
        or 'none' to skip it
    prefix: string giving the filename prefix
    soap_rcut: float giving the cutoff radius
    soap_g: float giving the atom width
    soap_n: int giving the maximum radial label
    soap_l: int giving the maximum angular label (documented limit: at most 9)
    soap_periodic: bool or string ("True"/"False") indicating whether the
        system is periodic
    matrix_plot: bool or string ("True"/"False") indicating whether a plot of
        the kernel matrix is to be generated
    """
    # Both flags may arrive as strings; a plain bool() would turn "False"
    # into True.
    soap_periodic = _parse_bool_flag(soap_periodic)
    matrix_plot = _parse_bool_flag(matrix_plot)

    fframes = []
    dictframes = []

    # read frames
    if fxyz != 'none':
        fframes = read(fxyz, ':')
        print("read xyz file:", fxyz, ", a total of", len(fframes), "frames")
    # read frames in the dictionary
    if dictxyz != 'none':
        dictframes = read(dictxyz, ':')
        print("read xyz file used for a dictionary:", dictxyz, ", a total of", len(dictframes), "frames")

    # Dictionary frames come first, so they occupy the leading rows/columns.
    frames = dictframes + fframes
    nframes = len(frames)

    # Collect every atomic number that occurs and, for non-periodic runs,
    # switch off the periodic boundary conditions of each frame.
    global_species = []
    for frame in frames:
        global_species.extend(frame.get_atomic_numbers())
        if not soap_periodic:
            frame.set_pbc([False, False, False])
    global_species = np.unique(global_species)
    print("a total of", nframes, "frames, with elements: ", global_species)

    # With several frames the descriptor is created with average=True; with a
    # single frame it is created with average=False so that the kernel matrix
    # is computed between the atomic environments within this frame.
    soap_desc = SOAP(
        species=global_species,
        rcut=soap_rcut,
        nmax=soap_n,
        lmax=soap_l,
        sigma=soap_g,
        crossover=False,
        average=nframes > 1,
        periodic=soap_periodic,
    )

    # compute soap finger prints
    fall = soap_desc.create(frames, n_jobs=8)

    # compute kmat: every row of `fall` becomes its own single-row entry
    fshape = np.shape(fall)
    average_kernel = AverageKernel(metric="linear")
    kNN = average_kernel.create(fall.reshape((fshape[0], 1, fshape[1])))

    # save
    np.savetxt(f"{prefix}-n{soap_n}-l{soap_l}-c{soap_rcut}-g{soap_g}.kmat", kNN, fmt='%4.8f')

    # plot
    if matrix_plot:
        plt.matshow(kNN)
        plt.title('Kernel matrix: ' + prefix)
        plt.show()
from dscribe.descriptors import SOAP
from dscribe.kernels import AverageKernel

from ase.build import molecule

# Two similar molecules serve as the structures to compare.
a = molecule("H2O")
b = molecule("H2O2")

# Features for the atomic environments are needed first; SOAP is used here.
desc = SOAP(
    species=[1, 6, 7, 8],
    rcut=5.0,
    nmax=2,
    lmax=2,
    sigma=0.2,
    periodic=False,
    crossover=True,
    sparse=False,
)
a_features = desc.create(a)
b_features = desc.create(b)

# Similarity from an average kernel with a linear metric. The result is a
# full similarity matrix.
re = AverageKernel(metric="linear")
re_kernel = re.create([a_features, b_features])

# Any metric supported by scikit-learn can be used instead, e.g. a Gaussian.
re = AverageKernel(metric="rbf", gamma=1)
re_kernel = re.create([a_features, b_features])
def pop_fitness(population,
                rcut,
                sigma,
                kernel,
                tgt_atoms,
                tgt_species,
                tgt_atoms2=None,
                max_score=None):
    """
    Calculates the fitness (ie SOAP similarity score) of the population by generating conformers for each of the
    population molecules, then evaluating their SOAP descriptors and calculating its similarity score with the SOAP
    descriptor of the binding ligand 'field'.

    Molecules for which no conformer could be embedded are removed from ``population`` IN PLACE, so that on return
    ``population[i]`` corresponds to ``fitness[i]``.

    Conformer generation and similarity calculation are the computational bottlenecks - might be worth splitting the
    task up with MPI.

    :param population: list of RDKit molecule objects (filtered in place, see above)
    :param rcut, sigma: SOAP parameters
    :param kernel: 'rematch' or 'average', selects the similarity kernel
    :param tgt_atoms: list of ASE atom objects of the target ligand field - from read_xyz
    :param tgt_species: list of the atomic species present in the target ligand field - from read_xyz
    :param tgt_atoms2: optional second target field; if given, the fitness is the product of both similarities
    :param max_score: [score, SMILES] of the best molecule found so far; defaults to [-9999, '']
    :raises ValueError: if ``kernel`` is not one of the supported names
    :return: (fitness, max_score) - fitness normalised to sum to one, and the possibly updated champion
    """
    # A fresh list per call: the default must not be shared between calls because it is handed back to the caller.
    if max_score is None:
        max_score = [-9999, '']
    # Fail before the expensive conformer generation rather than after it.
    if kernel not in ('rematch', 'average'):
        raise ValueError("Unknown kernel {!r}; expected 'rematch' or 'average'".format(kernel))

    t0 = time.time()

    # loop over RDKit mols and turn them into lists of ASE atom objects for dscribe SOAP atomic feature generation
    population_ase = []
    num_list = []
    species = ['C']
    embedded = []  # original population entries whose conformer embedding succeeded
    for mol in population:
        mol_h = Chem.AddHs(mol)
        if AllChem.EmbedMolecule(mol_h, maxAttempts=1000) != 0:
            continue  # no conformer: the molecule is dropped from the population below
        heavy = Chem.RemoveHs(mol_h)
        embedded.append(mol)
        num_list.append(len(heavy.GetAtoms()))
        # Fetch the coordinates once per molecule instead of once per atom.
        positions = heavy.GetConformer().GetPositions()
        # NOTE(review): every atom becomes its own single-atom Atoms object, as in the original code.
        for i, atom in enumerate(heavy.GetAtoms()):
            symbol = atom.GetSymbol()
            population_ase.append(Atoms(symbol, [positions[i]]))
            if symbol not in species:  # find unique atomic species for SOAP generation
                species.append(symbol)

    # Filter out molecules which have no conformers. The original objects are kept (not AddHs/RemoveHs copies,
    # which would never be found in the list), so the remaining entries line up with the fitness array.
    if len(embedded) != len(population):
        population[:] = embedded

    # Check that we also include the atom types present in the ligand targets
    for atom in tgt_species:
        if atom not in species:
            species.append(atom)

    t1 = time.time()
    print('Time taken to generate conformers: {}'.format(t1 - t0))

    # Generate SOAP descriptors using dscribe
    soap_generator = SOAP(species=species, periodic=False, rcut=rcut, nmax=8, lmax=6, sigma=sigma, sparse=True)
    soap = soap_generator.create(population_ase)
    tgt_soap = soap_generator.create(tgt_atoms)
    if tgt_atoms2 is not None:
        tgt_soap2 = [normalize(soap_generator.create(tgt_atoms2), copy=False)]

    # normalize SOAP atom descriptors and group by molecule
    soap = normalize(soap, copy=False)
    tgt_soap = [normalize(tgt_soap, copy=False)]
    soap = split_by_lengths(soap, num_list)

    t2 = time.time()
    print('Time taken to generate SOAP descriptors: {}'.format(t2 - t1))

    # TODO make REMatch kernel args as input args
    if kernel == 'rematch':
        soap_similarity = REMatchKernel(metric="polynomial", degree=3, gamma=1, coef0=0, alpha=0.1, threshold=1e-3,
                                        normalize_kernel=True)
    else:  # 'average' (validated above)
        soap_similarity = AverageKernel(metric="polynomial", degree=3, gamma=1, coef0=0, normalize_kernel=True)

    if tgt_atoms2 is not None:
        fitness1 = soap_similarity.create(soap, tgt_soap)
        fitness2 = soap_similarity.create(soap, tgt_soap2)
        # calculate fitness score as product of the two fitnesses, keeping the first column
        fitness = np.multiply(fitness1, fitness2)[:, 0]
    else:
        fitness = soap_similarity.create(soap, tgt_soap).flatten()

    t3 = time.time()
    print('Time taken to calculate fitness: {}'.format(t3 - t2))

    # update max_score, include new champion
    best = np.argmax(fitness)
    if fitness[best] > max_score[0]:
        max_score = [fitness[best], Chem.MolToSmiles(population[best])]

    # Print the top 5 scores (or fewer, for small populations) and corresponding molecules for this generation
    ranking = np.argsort(fitness)[::-1]
    for i, idx in enumerate(ranking[:5]):
        print("Mol {}: {} (fitness = {:.3f})".format(i, Chem.MolToSmiles(population[idx]), fitness[idx]))

    fitness = fitness / np.sum(fitness)

    return fitness, max_score
def pop_fitness(mpi_comm,
                mpi_rank,
                mpi_size,
                population,
                rcut,
                sigma,
                kernel,
                tgt_atoms,
                tgt_species,
                tgt_atoms2=None,
                max_score=None):
    """
    Calculates the fitness (ie SOAP similarity score) of the population by generating conformers for each of the
    population molecules, then evaluating their SOAP descriptors and calculating its similarity score with the SOAP
    descriptor of the binding ligand 'field'.

    The population is partitioned between the MPI processes with ``return_borders``; each process scores its own
    slice, the partial results are gathered on rank 0 and then broadcast, so every process returns the full array.
    Molecules for which no conformer could be embedded get a fitness of 0.

    :param mpi_comm: MPI communicator used for gather / Gatherv / bcast
    :param mpi_rank: rank of the calling process; rank 0 does the printing
    :param mpi_size: number of processes the population is split across
    :param population: list of RDKit molecule objects
    :param rcut, sigma: SOAP parameters
    :param kernel: 'rematch' or 'average', selects the similarity kernel
    :param tgt_atoms: list of ASE atom objects of the target ligand field - from read_xyz
    :param tgt_species: list of the atomic species present in the target ligand field - from read_xyz
    :param tgt_atoms2: optional second target field; if given, the fitness is the product of both similarities
    :param max_score: [score, SMILES] of the best molecule found so far; defaults to [-9999, '']
    :raises ValueError: if ``kernel`` is not one of the supported names
    :return: (fitness, max_score) - fitness normalised to sum to one, and the possibly updated champion
    """
    # A fresh list per call: the default must not be shared between calls because it is handed back to the caller.
    if max_score is None:
        max_score = [-9999, '']
    # Fail before the expensive conformer generation rather than after it.
    if kernel not in ('rematch', 'average'):
        raise ValueError("Unknown kernel {!r}; expected 'rematch' or 'average'".format(kernel))

    t0 = time.time()

    # partition the population between the MPI cpus
    my_border_low, my_border_high = return_borders(mpi_rank, len(population), mpi_size)
    my_pop = population[my_border_low:my_border_high]

    # loop over RDKit mols and turn them into lists of ASE atom objects for dscribe SOAP atomic feature generation
    population_ase = []
    num_list = []
    species = ['C']
    bad_mols = []  # indices (within my_pop) of molecules without a conformer
    for ind, mol in enumerate(my_pop):
        mol_h = Chem.AddHs(mol)
        conf_result = AllChem.EmbedMolecule(mol_h, maxAttempts=1000)
        heavy = Chem.RemoveHs(mol_h)
        num_list.append(len(heavy.GetAtoms()))

        embedded_ok = conf_result == 0
        if embedded_ok:
            # Fetch the coordinates once per molecule instead of once per atom.
            positions = heavy.GetConformer().GetPositions()
        else:
            # Record the molecule once (not once per atom); its fitness is zeroed below.
            bad_mols.append(ind)

        # NOTE(review): flagged as wrong by the original author - there should be one Atoms object per
        # molecule rather than one per atom. Behaviour is left unchanged here.
        for i, atom in enumerate(heavy.GetAtoms()):
            symbol = atom.GetSymbol()
            # Failed embeddings get placeholder coordinates so the per-molecule atom counts stay consistent.
            coords = positions[i] if embedded_ok else [0, 0, 0]
            population_ase.append(Atoms(symbol, [coords]))
            if symbol not in species:  # find unique atomic species for SOAP generation
                species.append(symbol)

    # Check that we also include the atom types present in the ligand targets
    for atom in tgt_species:
        if atom not in species:
            species.append(atom)

    t1 = time.time()
    if mpi_rank == 0:
        print('Time taken to generate conformers: {}'.format(t1 - t0))

    # Generate SOAP descriptors using dscribe
    soap_generator = SOAP(species=species, periodic=False, rcut=rcut, nmax=8, lmax=6, sigma=sigma, sparse=True)
    soap = soap_generator.create(population_ase)
    tgt_soap = soap_generator.create(tgt_atoms)
    if tgt_atoms2 is not None:
        tgt_soap2 = [normalize(soap_generator.create(tgt_atoms2), copy=False)]

    # normalize SOAP atom descriptors and group by molecule
    soap = normalize(soap, copy=False)
    tgt_soap = [normalize(tgt_soap, copy=False)]
    soap = split_by_lengths(soap, num_list)

    t2 = time.time()
    if mpi_rank == 0:
        print('Time taken to generate SOAP descriptors: {}'.format(t2 - t1))

    # TODO make REMatch kernel args as input args
    if kernel == 'rematch':
        soap_similarity = REMatchKernel(metric="polynomial", degree=3, gamma=1, coef0=0, alpha=0.1, threshold=1e-3,
                                        normalize_kernel=True)
    else:  # 'average' (validated above)
        soap_similarity = AverageKernel(metric="polynomial", degree=3, gamma=1, coef0=0, normalize_kernel=True)

    if tgt_atoms2 is not None:
        fitness1 = soap_similarity.create(soap, tgt_soap)
        fitness2 = soap_similarity.create(soap, tgt_soap2)
        # calculate fitness score as product of the two fitnesses, keeping the first column
        fitness = np.multiply(fitness1, fitness2)[:, 0]
    else:
        fitness = soap_similarity.create(soap, tgt_soap).flatten()

    fitness[bad_mols] = 0  # set fitness of bad conformers to 0

    sendcounts = np.array(mpi_comm.gather(len(fitness), root=0))

    if mpi_rank == 0:
        fitness_full = np.empty(len(population))
    else:
        fitness_full = None

    # Gather fitness arrays from MPI cpus into the root cpu, then broadcast the gathered array to all cpus
    mpi_comm.Gatherv(sendbuf=fitness, recvbuf=(fitness_full, sendcounts), root=0)
    fitness = mpi_comm.bcast(fitness_full, root=0)

    t3 = time.time()
    if mpi_rank == 0:
        print('Time taken to calculate fitness: {}'.format(t3 - t2))

    # update max_score, include new champion
    best = np.argmax(fitness)
    if fitness[best] > max_score[0]:
        max_score = [fitness[best], Chem.MolToSmiles(population[best])]

    # Print the top 5 scores (or fewer, for small populations) and corresponding molecules for this generation
    if mpi_rank == 0:
        ranking = np.argsort(fitness)[::-1]
        for i, idx in enumerate(ranking[:5]):
            print("Mol {}: {} (fitness = {:.3f})".format(i, Chem.MolToSmiles(population[idx]), fitness[idx]))

    fitness = fitness / np.sum(fitness)

    return fitness, max_score
t2_per_soap = SOAP(species=species, rcut=r_cut, nmax=nmax, lmax=lmax, periodic=True, sparse=False) #--------------------------------------------------------------------- #RUN SOAP ACROSS n FILES IN LIST AND OUTPUT COMPARISON KERNEL #--------------------------------------------------------------------- tic_1 = time.perf_counter() comparisons = [t2_per_soap.create(i) for i in structures] metric = "linear" re = AverageKernel(metric=metric) kern = re.create(comparisons) toc_1 = time.perf_counter() comp_time = toc_1 - tic_1 print( f"Took {comp_time:.2} seconds to compare {ns} structures with r_cut = {r_cut:.2}, lmax = {lmax}, nmax = {nmax}" ) #--------------------------------------------------------------------- #OUTPUT COMPARISON AS CSV FILE #--------------------------------------------------------------------- soap_array = pd.DataFrame(kern, index=names, columns=names) soap_array.to_csv(outputdir + "/soap_comparison_rcut = %s.csv" % r_cut,
# Scan over the SOAP radial basis size and record, for each nmax, the kernel
# values and the wall-clock time of the average and REMatch kernels.
rectime = []
allkerndiffs = []
for nmax in range(1, 15):
    soapgen_rcut = SOAP(
        species=species,
        rcut=rcut,
        nmax=nmax,
        lmax=lmax,
        periodic=True,
        sparse=False,
        rbf='gto',
    )
    descriptors = [soapgen_rcut.create(i) for i in structures]
    # Difference of the very first descriptor component of the first two structures.
    descdiffs.append(descriptors[1][0][0] - descriptors[0][0][0])

    # Average kernel with a linear metric on the raw descriptors.
    tic_1 = time.perf_counter()
    re = AverageKernel(metric='linear')
    kern = re.create(descriptors)
    toc_1 = time.perf_counter()
    ctime.append(toc_1 - tic_1)
    kerndiffs.append(kern[0][1])

    # REMatch kernel. The normalized descriptors were previously computed
    # but never used; they are now what the kernel is evaluated on.
    tic_2 = time.perf_counter()
    normed = [normalize(i) for i in descriptors]
    rem = REMatchKernel(metric='rbf', gamma=1, alpha=1, threshold=1e-6)
    remkern = rem.create(normed)
    toc_2 = time.perf_counter()
    rectime.append(toc_2 - tic_2)
    remkerndiffs.append(remkern[0][1])

    # Absolute gap between the two similarity measures for the first pair.
    allkerndiffs.append(abs(remkern[0][1] - kern[0][1]))