Пример #1
0
    def test_xy(self):
        """Checks that a kernel can be evaluated between two different sets of
        structures, which is what kernel-based methods need when making
        predictions.
        """
        # One SOAP descriptor is shared by every molecule in this test
        soap_options = dict(
            species=[1, 8],
            rcut=5.0,
            nmax=2,
            lmax=2,
            sigma=0.2,
            periodic=False,
            crossover=True,
            sparse=False,
        )
        soap = SOAP(**soap_options)

        # First set: water and oxygen. Second set: hydrogen peroxide.
        water, oxygen, peroxide = (
            molecule("H2O"),
            molecule("O2"),
            molecule("H2O2"),
        )
        first_set = [soap.create(water), soap.create(oxygen)]
        second_set = [soap.create(peroxide)]

        # Linear dot-product kernel evaluated between the two sets
        gram = AverageKernel(metric="linear").create(first_set, second_set)

        # One row per structure of the first set, one column per structure of
        # the second set
        self.assertEqual(gram.shape, (2, 1))
Пример #2
0
    def test_convergence_infinity(self):
        """Checks that, in the limit of infinite alpha, the REMatch kernel
        converges to the average kernel.
        """
        # SOAP features for the two molecules being compared
        soap_options = dict(
            species=[1, 8],
            rcut=5.0,
            nmax=2,
            lmax=2,
            sigma=0.2,
            periodic=False,
            crossover=True,
            sparse=False,
        )
        soap = SOAP(**soap_options)
        water, peroxide = molecule("H2O"), molecule("H2O2")
        water_feat = soap.create(water)
        peroxide_feat = soap.create(peroxide)

        # A very large alpha stands in for the infinite limit
        rematch = REMatchKernel(metric="linear", alpha=1e20, threshold=1e-6)
        rematch_matrix = rematch.create([water_feat, peroxide_feat])

        # Reference: the plain average kernel with the same metric
        average = AverageKernel(metric="linear")
        average_matrix = average.create([water_feat, peroxide_feat])

        # The two kernel matrices must agree up to numerical tolerance
        self.assertTrue(np.allclose(rematch_matrix, average_matrix))
Пример #3
0
    def test_difference(self):
        """Checks that the kernel yields sensible similarities for identical,
        unrelated and related molecules.
        """
        soap = SOAP(
            species=[1, 6, 7, 8],
            rcut=5.0,
            nmax=2,
            lmax=2,
            sigma=0.2,
            periodic=False,
            crossover=True,
            sparse=False,
        )
        kernel = AverageKernel(metric="linear")

        def pair_kernel(first, second):
            """Kernel matrix for the two molecules with the given names."""
            first_atoms, second_atoms = molecule(first), molecule(second)
            first_feat = soap.create(first_atoms)
            second_feat = soap.create(second_atoms)
            return kernel.create([first_feat, second_feat])

        # A molecule compared with itself: every entry must be ~1
        water_feat = soap.create(molecule("H2O"))
        K = kernel.create([water_feat, water_feat])
        self.assertTrue((np.abs(K - 1) < 1e-3).all())

        # Molecules with nothing in common: the matrix must be ~identity
        K = pair_kernel("N2", "H2O")
        self.assertTrue((np.abs(K - np.eye(2)) < 1e-3).all())

        # Related molecules: the off-diagonal similarity must be high
        K = pair_kernel("H2O", "H2O2")
        self.assertTrue(K[0, 1] > 0.9)
Пример #4
0
    def test_metrics(self):
        """Checks that kernels can be built with several scikit-learn metrics."""
        soap_options = dict(
            species=[1, 8],
            rcut=5.0,
            nmax=2,
            lmax=2,
            sigma=0.2,
            periodic=False,
            crossover=True,
            sparse=False,
        )
        water_feat = SOAP(**soap_options).create(molecule("H2O"))

        # Linear dot-product, Gaussian and Laplacian kernels, in that order.
        # The test only requires that each of them can be evaluated.
        metric_options = (
            {"metric": "linear"},
            {"metric": "rbf", "gamma": 1},
            {"metric": "laplacian", "gamma": 1},
        )
        for options in metric_options:
            kernel = AverageKernel(**options)
            kernel.create([water_feat, water_feat])
Пример #5
0
def average_listcomp(desc_list):
    """Compare every descriptor in ``desc_list`` with its successor.

    Each consecutive pair is passed to a linear ``AverageKernel``. Before the
    comparison the columns of the second descriptor are cut to the row width
    of the first one. Progress and the compared sizes are printed for every
    pair.

    Returns a list holding the ``[0][1]`` entry of each pair's kernel matrix.
    """
    kernel = AverageKernel(metric='linear')
    similarities = []

    neighbours = zip(desc_list, desc_list[1:])
    for done, (current, following) in enumerate(neighbours, start=1):
        # Keep only as many columns as the current descriptor has
        width = len(current[0])
        trimmed = following[:, 0:width]

        print([len(current), len(trimmed)])
        print([len(current[0]), len(trimmed[0])])

        gram = kernel.create([current, trimmed])
        similarities.append(gram[0][1])
        print(f'done {done} comparisons')

    return similarities
Пример #6
0
 def test_sparse(self):
     """Checks that kernels can also be built from sparse features.
     """
     # Same SOAP setup as the dense tests, but with sparse output enabled
     soap_options = dict(
         species=[1, 8],
         rcut=5.0,
         nmax=2,
         lmax=2,
         sigma=0.2,
         periodic=False,
         crossover=True,
         sparse=True,
     )
     sparse_feat = SOAP(**soap_options).create(molecule('H2O'))

     # The test only requires that the kernel can be evaluated
     AverageKernel(metric="linear").create([sparse_feat])
Пример #7
0
def _str_to_bool(value):
    """Turn a flag given either as a bool or as text into a real bool."""
    if isinstance(value, str):
        # bool("False") is True, so textual flags must be parsed explicitly
        return value.strip().lower() not in ('', '0', 'false', 'f', 'no', 'n',
                                             'off', 'none')
    return bool(value)


def main(fxyz, dictxyz, prefix, soap_rcut, soap_g, soap_n, soap_l,
         soap_periodic, matrix_plot):
    """

    Generate the SOAP kernel matrix.

    The frames of ``dictxyz`` (if any) are placed before the frames of
    ``fxyz``. With more than one frame, averaged SOAP descriptors are used and
    the kernel matrix is computed between frames; with a single frame the
    kernel matrix is computed between the atomic environments of that frame.
    The matrix is written to
    ``<prefix>-n<soap_n>-l<soap_l>-c<soap_rcut>-g<soap_g>.kmat``.

    Parameters
    ----------
    fxyz: string giving location of xyz file ('none' to skip)
    dictxyz: string giving location of xyz file that is used as a dictionary
             ('none' to skip)
    prefix: string giving the filename prefix
    soap_rcut: float giving the cutoff radius
    soap_g: float giving the atom width
    soap_n: int giving the maximum radial label
    soap_l: int giving the maximum angular label. Must be less than or equal to 9
    soap_periodic: bool or string (True or False) indicating whether the system
                   is periodic
    matrix_plot: bool or string (True or False) indicating whether a plot of
                 the kernel matrix is to be generated

    Raises
    ------
    ValueError: if no frames were read at all
    """

    # Flags may arrive as text; a plain bool() would treat "False" as True
    soap_periodic = _str_to_bool(soap_periodic)
    matrix_plot = _str_to_bool(matrix_plot)
    fframes = []
    dictframes = []

    # read frames
    if fxyz != 'none':
        fframes = read(fxyz, ':')
        nfframes = len(fframes)
        print("read xyz file:", fxyz, ", a total of", nfframes, "frames")
    # read frames in the dictionary
    if dictxyz != 'none':
        dictframes = read(dictxyz, ':')
        ndictframes = len(dictframes)
        print("read xyz file used for a dictionary:", dictxyz, ", a total of",
              ndictframes, "frames")

    frames = dictframes + fframes
    nframes = len(frames)
    global_species = []
    for frame in frames:
        global_species.extend(frame.get_atomic_numbers())
        if not soap_periodic:
            frame.set_pbc([False, False, False])
    global_species = np.unique(global_species)
    print("a total of", nframes, "frames, with elements: ", global_species)

    if nframes == 0:
        raise ValueError(
            "no frames to process: fxyz and dictxyz provided no structures")

    # set up the soap descriptors; with a single frame the descriptors are not
    # averaged, so that the kernel matrix (kmat) is computed between the
    # atomic environments within this frame
    soap_desc = SOAP(species=global_species,
                     rcut=soap_rcut,
                     nmax=soap_n,
                     lmax=soap_l,
                     sigma=soap_g,
                     crossover=False,
                     average=nframes > 1,
                     periodic=soap_periodic)

    # compute soap finger prints
    fall = soap_desc.create(frames, n_jobs=8)

    # compute kmat; every row of `fall` is wrapped as its own one-row entry
    fshape = np.shape(fall)
    kernel = AverageKernel(metric="linear")
    kNN = kernel.create(fall.reshape((fshape[0], 1, fshape[1])))

    # save
    kmat_name = "{}-n{}-l{}-c{}-g{}.kmat".format(prefix, soap_n, soap_l,
                                                 soap_rcut, soap_g)
    np.savetxt(kmat_name, kNN, fmt='%4.8f')

    # plot
    if matrix_plot:
        plt.matshow(kNN)
        plt.title('Kernel matrix: ' + prefix)
        plt.show()
Пример #8
0
from dscribe.descriptors import SOAP
from dscribe.kernels import AverageKernel

from ase.build import molecule

# The two (similar) molecules whose similarity is measured
a = molecule("H2O")
b = molecule("H2O2")

# The kernel works on features of atomic environments, so these are created
# first. SOAP is used as the descriptor here.
desc = SOAP(
    species=[1, 6, 7, 8],
    rcut=5.0,
    nmax=2,
    lmax=2,
    sigma=0.2,
    periodic=False,
    crossover=True,
    sparse=False,
)
a_features, b_features = (desc.create(system) for system in (a, b))

# Similarity from an average kernel with a linear metric. The output is the
# full similarity matrix.
re = AverageKernel(metric="linear")
re_kernel = re.create([a_features, b_features])

# Every metric supported by scikit-learn is accepted, for instance a
# Gaussian one:
re = AverageKernel(metric="rbf", gamma=1)
re_kernel = re.create([a_features, b_features])
Пример #9
0
def pop_fitness(population,
                rcut,
                sigma,
                kernel,
                tgt_atoms,
                tgt_species,
                tgt_atoms2=None,
                max_score=None):
    """
    Calculates the fitness (ie SOAP similarity score) of the population by generating conformers for each of the
    population molecules, then evaluating their SOAP descriptors and calculating its similarity score with the SOAP
    descriptor of the binding ligand 'field'

    Conformer generation and similarity calculation are the computational bottlenecks  - might be worth splitting the
    task up with MPI. see return_borders.py in helper.py if you want to do that - make sure you only run the
    reproduction on the master node (since there is randomness), then broadcast to the other nodes

    :param population: list of RDKit molecule objects. Molecules for which no conformer could be embedded are removed
                       from this list IN PLACE, so that afterwards ``population[i]`` corresponds to ``fitness[i]``
    :param rcut, sigma: SOAP parameters
    :param kernel: 'rematch' or 'average', selects the similarity kernel
    :param tgt_atoms: list of ASE atom objects of the target ligand field - from read_xyz
    :param tgt_species: list of the atomic species present in the target ligand field - from read_xyz
    :param tgt_atoms2: optional second list of ASE atom objects, used if there are separate sites; the fitness is then
                       the product of the similarities to both targets
    :param max_score: Maximum SOAP similarity found so far as [score, smiles]; defaults to [-9999, '']

    :raises ValueError: if ``kernel`` is neither 'rematch' nor 'average'

    :return: fitness (normalised so that it sums to 1), max_score
    """
    # Fail fast instead of hitting an unbound variable after the expensive steps
    if kernel not in ('rematch', 'average'):
        raise ValueError(
            "kernel must be 'rematch' or 'average', got {!r}".format(kernel))
    if max_score is None:
        max_score = [-9999, '']

    t0 = time.time()

    # loop over RDKit mols and turn them into lists of ASE atom objects for dscribe SOAP atomic feature generation
    # NOTE(review): every atom becomes its own single-atom Atoms object, so the descriptor of an atom is computed
    # without the other atoms of its molecule - confirm that this is intended
    population_ase = []
    num_list = []
    species = ['C']
    embedded = []  # original molecules for which a conformer was generated
    for mol in population:
        mol_h = Chem.AddHs(mol)
        if AllChem.EmbedMolecule(mol_h, maxAttempts=1000) != 0:
            continue  # no conformer: the molecule is dropped below
        embedded.append(mol)
        heavy = Chem.RemoveHs(mol_h)
        num_list.append(len(heavy.GetAtoms()))
        positions = heavy.GetConformer().GetPositions()
        for i, atom in enumerate(heavy.GetAtoms()):
            symbol = atom.GetSymbol()
            population_ase.append(Atoms(symbol, [positions[i]]))
            if symbol not in species:  # find unique atomic species for SOAP generation
                species.append(symbol)

    # filter out molecules which have no conformers. The list is rewritten in place because list.remove() with a
    # freshly created RemoveHs() copy never matches the stored object, which left population and fitness misaligned
    if len(embedded) != len(population):
        population[:] = embedded

    # Check that we also include the atom types present in the ligand targets
    for atom in tgt_species:
        if atom not in species:
            species.append(atom)
    t1 = time.time()
    print('Time taken to generate conformers: {}'.format(t1 - t0))

    # Generate SOAP descriptors using dscribe
    soap_generator = SOAP(species=species,
                          periodic=False,
                          rcut=rcut,
                          nmax=8,
                          lmax=6,
                          sigma=sigma,
                          sparse=True)
    soap = soap_generator.create(population_ase)
    tgt_soap = soap_generator.create(tgt_atoms)
    if tgt_atoms2 is not None:
        tgt_soap2 = [normalize(soap_generator.create(tgt_atoms2), copy=False)]

    # normalize SOAP atom descriptors and group by molecule
    soap = normalize(soap, copy=False)
    tgt_soap = [normalize(tgt_soap, copy=False)]
    soap = split_by_lengths(soap, num_list)

    t2 = time.time()
    print('Time taken to generate SOAP descriptors: {}'.format(t2 - t1))

    # TODO make REMatch kernel args as input args
    if kernel == 'rematch':
        soap_similarity = REMatchKernel(metric="polynomial",
                                        degree=3,
                                        gamma=1,
                                        coef0=0,
                                        alpha=0.1,
                                        threshold=1e-3,
                                        normalize_kernel=True)
    else:  # 'average', validated at the top
        soap_similarity = AverageKernel(metric="polynomial",
                                        degree=3,
                                        gamma=1,
                                        coef0=0,
                                        normalize_kernel=True)

    if tgt_atoms2 is not None:
        fitness1 = soap_similarity.create(soap, tgt_soap)
        fitness2 = soap_similarity.create(soap, tgt_soap2)
        # calculate fitness score as product of the two fitnesses; each target is a single entry, so the first
        # column holds the score of every molecule
        fitness = np.multiply(fitness1, fitness2)[:, 0]
    else:
        fitness = soap_similarity.create(soap, tgt_soap)
        fitness = fitness.flatten()

    t3 = time.time()
    print('Time taken to calculate fitness: {}'.format(t3 - t2))

    # update max_score, include new champion
    best = np.argmax(fitness)
    if fitness[best] > max_score[0]:
        max_score = [fitness[best], Chem.MolToSmiles(population[best])]

    # Print the top 5 scores (or fewer for small populations) and corresponding molecules for this generation
    ranking = np.argsort(fitness)[::-1]
    for place, idx in enumerate(ranking[:5]):
        print("Mol {}: {} (fitness = {:.3f})".format(
            place, Chem.MolToSmiles(population[idx]), fitness[idx]))

    fitness = fitness / np.sum(fitness)

    return fitness, max_score
Пример #10
0
def pop_fitness(mpi_comm,
                mpi_rank,
                mpi_size,
                population,
                rcut,
                sigma,
                kernel,
                tgt_atoms,
                tgt_species,
                tgt_atoms2=None,
                max_score=None):
    """
    Calculates the fitness (ie SOAP similarity score) of the population by generating conformers for each of the
    population molecules, then evaluating their SOAP descriptors and calculating its similarity score with the SOAP
    descriptor of the binding ligand 'field'

    Every MPI process scores its own slice of the population; the slices are gathered on rank 0 and the full fitness
    array is then broadcast to all processes. Molecules for which no conformer could be embedded keep their place in
    the population and receive a fitness of 0.

    :param mpi_comm: MPI communicator used for gather / Gatherv / bcast
    :param mpi_rank: rank of this process; only rank 0 prints
    :param mpi_size: number of MPI processes the population is split between
    :param population: list of RDKit molecule objects
    :param rcut, sigma: SOAP parameters
    :param kernel: 'rematch' or 'average', selects the similarity kernel
    :param tgt_atoms: list of ASE atom objects of the target ligand field - from read_xyz
    :param tgt_species: list of the atomic species present in the target ligand field - from read_xyz
    :param tgt_atoms2: optional second list of ASE atom objects, used if there are separate sites; the fitness is then
                       the product of the similarities to both targets
    :param max_score: Maximum SOAP similarity found so far as [score, smiles]; defaults to [-9999, '']

    :raises ValueError: if ``kernel`` is neither 'rematch' nor 'average'

    :return: fitness (normalised so that it sums to 1), max_score
    """
    # Fail fast instead of hitting an unbound variable after the expensive steps
    if kernel not in ('rematch', 'average'):
        raise ValueError(
            "kernel must be 'rematch' or 'average', got {!r}".format(kernel))
    if max_score is None:
        max_score = [-9999, '']

    t0 = time.time()

    # partition the population between the MPI cpus
    my_border_low, my_border_high = return_borders(mpi_rank, len(population),
                                                   mpi_size)
    my_pop = population[my_border_low:my_border_high]

    # loop over RDKit mols and turn them into lists of ASE atom objects for dscribe SOAP atomic feature generation
    # this is actually wrong, should have an Atoms object for each mol...
    population_ase = []
    num_list = []
    species = ['C']
    bad_mols = []  # local indices of molecules without a conformer
    for ind, mol in enumerate(my_pop):
        mol_h = Chem.AddHs(mol)
        has_conformer = AllChem.EmbedMolecule(mol_h, maxAttempts=1000) == 0
        heavy = Chem.RemoveHs(mol_h)
        num_list.append(len(heavy.GetAtoms()))
        if has_conformer:
            positions = heavy.GetConformer().GetPositions()
        else:
            # recorded once per molecule (not once per atom)
            bad_mols.append(ind)
        for i, atom in enumerate(heavy.GetAtoms()):
            symbol = atom.GetSymbol()
            # failed molecules get placeholder atoms at the origin so that the bookkeeping in num_list stays valid
            position = positions[i] if has_conformer else [0, 0, 0]
            population_ase.append(Atoms(symbol, [position]))
            if symbol not in species:  # find unique atomic species for SOAP generation
                species.append(symbol)

    # Check that we also include the atom types present in the ligand targets
    for atom in tgt_species:
        if atom not in species:
            species.append(atom)

    t1 = time.time()
    if mpi_rank == 0:
        print('Time taken to generate conformers: {}'.format(t1 - t0))

    # Generate SOAP descriptors using dscribe
    soap_generator = SOAP(species=species,
                          periodic=False,
                          rcut=rcut,
                          nmax=8,
                          lmax=6,
                          sigma=sigma,
                          sparse=True)
    soap = soap_generator.create(population_ase)
    tgt_soap = soap_generator.create(tgt_atoms)
    if tgt_atoms2 is not None:
        tgt_soap2 = [normalize(soap_generator.create(tgt_atoms2), copy=False)]

    # normalize SOAP atom descriptors and group by molecule
    soap = normalize(soap, copy=False)
    tgt_soap = [normalize(tgt_soap, copy=False)]
    soap = split_by_lengths(soap, num_list)

    t2 = time.time()
    if mpi_rank == 0:
        print('Time taken to generate SOAP descriptors: {}'.format(t2 - t1))

    # TODO make REMatch kernel args as input args
    if kernel == 'rematch':
        soap_similarity = REMatchKernel(metric="polynomial",
                                        degree=3,
                                        gamma=1,
                                        coef0=0,
                                        alpha=0.1,
                                        threshold=1e-3,
                                        normalize_kernel=True)
    else:  # 'average', validated at the top
        soap_similarity = AverageKernel(metric="polynomial",
                                        degree=3,
                                        gamma=1,
                                        coef0=0,
                                        normalize_kernel=True)

    if tgt_atoms2 is not None:
        fitness1 = soap_similarity.create(soap, tgt_soap)
        fitness2 = soap_similarity.create(soap, tgt_soap2)
        # calculate fitness score as product of the two fitnesses; each target is a single entry, so the first
        # column holds the score of every molecule
        fitness = np.array([f[0] for f in np.multiply(fitness1, fitness2)])
    else:
        fitness = soap_similarity.create(soap, tgt_soap)
        fitness = fitness.flatten()

    if bad_mols:
        fitness[bad_mols] = 0  # set fitness of bad conformers to 0

    sendcounts = np.array(mpi_comm.gather(len(fitness), root=0))

    if mpi_rank == 0:
        fitness_full = np.empty(len(population))
    else:
        fitness_full = None

    # Gather fitness arrays from MPI cpus into the root cpu, then broadcast the gathered array to all cpus
    mpi_comm.Gatherv(sendbuf=fitness,
                     recvbuf=(fitness_full, sendcounts),
                     root=0)
    fitness = mpi_comm.bcast(fitness_full, root=0)

    t3 = time.time()
    if mpi_rank == 0:
        print('Time taken to calculate fitness: {}'.format(t3 - t2))

    # update max_score, include new champion
    best = np.argmax(fitness)
    if fitness[best] > max_score[0]:
        max_score = [fitness[best], Chem.MolToSmiles(population[best])]

    # Print the top 5 scores (or fewer for small populations) and corresponding molecules for this generation
    if mpi_rank == 0:
        ranking = np.argsort(fitness)[::-1]
        for place, idx in enumerate(ranking[:5]):
            print("Mol {}: {} (fitness = {:.3f})".format(
                place, Chem.MolToSmiles(population[idx]), fitness[idx]))

    fitness = fitness / np.sum(fitness)

    return fitness, max_score
Пример #11
0
# Periodic, dense SOAP descriptor used for every structure in the comparison
t2_per_soap = SOAP(species=species,
                   rcut=r_cut,
                   nmax=nmax,
                   lmax=lmax,
                   periodic=True,
                   sparse=False)

#---------------------------------------------------------------------
#RUN SOAP ACROSS n FILES IN LIST AND OUTPUT COMPARISON KERNEL
#---------------------------------------------------------------------
# The timed region covers descriptor generation as well as the kernel
tic_1 = time.perf_counter()
comparisons = [t2_per_soap.create(i) for i in structures]

metric = "linear"

# NOTE(review): this binds the name `re` to the kernel object; if the standard
# `re` module is imported in this file it is shadowed from here on
re = AverageKernel(metric=metric)
kern = re.create(comparisons)

toc_1 = time.perf_counter()

comp_time = toc_1 - tic_1

# Fixed-point formatting: a bare ".2" means two significant digits, which
# prints e.g. "1.2e+01" for 12.3 and raises ValueError for an integer r_cut
print(
    f"Took {comp_time:.2f} seconds to compare {ns} structures with r_cut = {r_cut:.2f}, lmax = {lmax}, nmax = {nmax}"
)

#---------------------------------------------------------------------
#OUTPUT COMPARISON AS CSV FILE
#---------------------------------------------------------------------
soap_array = pd.DataFrame(kern, index=names, columns=names)
soap_array.to_csv(outputdir + "/soap_comparison_rcut = %s.csv" % r_cut,
Пример #12
0
# Accumulators filled once per nmax value in the loop below:
# REMatch kernel timings and |REMatch - average| kernel differences.
# (descdiffs, ctime, kerndiffs and remkerndiffs are defined outside this view.)
rectime = []
allkerndiffs = []

# Scan the number of radial basis functions from 1 to 14
for nmax in range(1, 15):
    # Periodic, dense SOAP descriptor with GTO radial basis for this nmax
    soapgen_rcut = SOAP(species=species,
                        rcut=rcut,
                        nmax=nmax,
                        lmax=lmax,
                        periodic=True,
                        sparse=False,
                        rbf='gto')
    descriptors = [soapgen_rcut.create(i) for i in structures]
    # Difference of the very first feature value ([0][0]) between the second
    # and the first structure
    descdiffs.append(descriptors[1][0][0] - descriptors[0][0][0])

    # Time the linear average kernel and keep its [0][1] entry
    tic_1 = time.perf_counter()
    re = AverageKernel(metric='linear')
    kern = re.create(descriptors)
    toc_1 = time.perf_counter()
    ctime.append(toc_1 - tic_1)
    kerndiffs.append(kern[0][1])

    # Time the REMatch kernel (rbf metric) and keep its [0][1] entry.
    # NOTE(review): `normed` is computed inside the timed region but never
    # used - the kernel below receives the un-normalized `descriptors`.
    # Confirm whether `rem.create(normed)` was intended.
    tic_2 = time.perf_counter()
    normed = [normalize(i) for i in descriptors]
    rem = REMatchKernel(metric='rbf', gamma=1, alpha=1, threshold=1e-6)
    remkern = rem.create(descriptors)
    toc_2 = time.perf_counter()
    rectime.append(toc_2 - tic_2)
    remkerndiffs.append(remkern[0][1])

    # Absolute difference between the two kernels' [0][1] entries
    allkerndiffs.append(abs(remkern[0][1] - kern[0][1]))