示例#1
0
    def check_data(self, df):
        """
        Report problematic ion SMILES found in ``df.smiles``.

        Every entry is checked three ways, and a message is handed to
        ``display`` (together with the time this check started) for each
        problem found:

        * RDKit cannot parse or sanitize the SMILES,
        * the SMILES contains neither "-" nor "+" (no charge),
        * the SMILES contains "." (more than one molecular entity).

        Nothing is asserted or returned; problems are only reported.

        Parameters
        ----------
        df : object
            anything exposing an iterable ``smiles`` attribute of SMILES
            strings, e.g. a pandas DataFrame with a ``smiles`` column
        """
        startTime = datetime.datetime.now()

        def fnDisplay(message):
            display(message, startTime)

        # Iterate over the values themselves: indexing a pandas Series with
        # range(len(...)) is label based and breaks on a non-default index.
        for ion in df.smiles:
            # MolFromSmiles signals an unparsable SMILES by returning None
            # instead of raising, so that case is checked explicitly before
            # the molecule is handed to SanitizeMol.
            mol = Chem.MolFromSmiles(ion)
            interpretable = mol is not None
            if interpretable:
                try:
                    Chem.SanitizeMol(mol)
                except ValueError:
                    interpretable = False
            if not interpretable:
                name = salty.check_name(ion)
                message = "RDKit cannot interpret %s ion SMILES in datafile" \
                          % name
                fnDisplay(message)
            if "-" not in ion and "+" not in ion:
                name = salty.check_name(ion)
                message = "%s ion does not have a charge" % name
                fnDisplay(message)
            if "." in ion:
                name = salty.check_name(ion)
                message = "%s ion contains more than one molecular entity" \
                          % name
                fnDisplay(message)
示例#2
0
def _show_ion(genes,
              target,
              mutation_attempts,
              sim_score,
              molecular_relative,
              models,
              deslists,
              anion_smiles,
              exp_data=None):
    """
    Print a summary of a finished candidate to the screen.

    _show_ion is called when a candidate has achieved the desired fitness
    score and is returned by the engine. The model prediction is
    re-evaluated here through ``_get_fitness``; nothing is returned.

    Parameters
    ----------
    genes : str
        SMILES of the candidate; parsed with RDKit for the atom count
    target
        passed through to ``_get_fitness``
    mutation_attempts
        value printed as "Mutation Attempts"
    sim_score : float
        similarity score printed when ``exp_data`` is not supplied
    molecular_relative : str
        identifier handed to ``salty.check_name`` for display; replaced by
        the closest experimental parent when ``exp_data`` is supplied
    models, deslists
        passed through to ``_get_fitness``
    anion_smiles : str
        SMILES of the anion
    exp_data : object, optional
        object whose ``Data_summary.iloc[1][0]`` holds the textual form of
        the experimental parent candidates. When supplied, both the
        molecular relative and the similarity score are recomputed against
        those candidates.
    """
    mol = Chem.MolFromSmiles(genes)
    anion = Chem.MolFromSmiles(anion_smiles)
    fitness, mol_property = _get_fitness(anion, genes, target, models,
                                         deslists)
    anion_name = salty.check_name(anion_smiles)
    if exp_data:
        chrom = genetic.Chromosome(genes, fitness)
        # SECURITY: eval() executes whatever text is stored in the summary;
        # only use exp_data that comes from a trusted source.
        exp_parent_candidates = eval(exp_data.Data_summary.iloc[1][0])
        tan_sim_score, sim_index = \
            genetic.molecular_similarity(chrom, exp_parent_candidates)
        molecular_relative = exp_parent_candidates[sim_index]
        # Report the score that belongs to the relative printed below; the
        # incoming sim_score was computed against a different candidate set.
        sim_score = tan_sim_score
    print("{}\t{}".format("Salt Smiles: ", genes))
    print("{}\t{}".format("Cation Heavy Atoms: ", mol.GetNumAtoms()))
    print("Tanimoto Similarity Score: \t{0:10.3f}".format(sim_score))
    print("{}\t{}".format("Molecular Relative: ",
                          salty.check_name(molecular_relative)))
    print("{}\t{}".format("Anion: ", anion_name))
    print("{}\t{}".format("Model Prediction: ", mol_property))
    print("{}\t{}".format("Mutation Attempts: ", mutation_attempts))
示例#3
0
def _show_ion(genes, target, mutation_attempts, sim_score, molecular_relative,
              models, deslists, anion_smiles):
    """
    Print a summary of a finished candidate to the screen.

    _show_ion is called when a candidate has achieved the desired fitness
    score and is returned by the engine. The model prediction is
    re-evaluated here through ``_get_fitness``; nothing is returned.
    """
    def emit(label, value):
        # every line except the similarity score shares this layout
        print("{}\t{}".format(label, value))

    cation_mol = Chem.MolFromSmiles(genes)
    anion_mol = Chem.MolFromSmiles(anion_smiles)
    # only the predicted property is displayed; the fitness is not needed
    _, prediction = _get_fitness(anion_mol, genes, target, models, deslists)
    name_of_anion = salty.check_name(anion_smiles)

    emit("Salt Smiles: ", genes)
    emit("Cation Heavy Atoms: ", cation_mol.GetNumAtoms())
    print("Tanimoto Similarity Score: \t{0:10.3f}".format(sim_score))
    emit("Molecular Relative: ", salty.check_name(molecular_relative))
    emit("Anion: ", name_of_anion)
    emit("Model Prediction: ", prediction)
    emit("Mutation Attempts: ", mutation_attempts)
示例#4
0
def _show_ion(genes, target, mutation_attempts, sim_score, molecular_relative,
              model_ID, anion):
    """
    Print a summary of a finished candidate to the screen.

    _show_ion is called when a candidate has achieved the desired fitness
    score and is returned by the engine. The model prediction is
    re-evaluated here through ``_get_fitness``; nothing is returned.
    """
    candidate = Chem.MolFromSmiles(genes)
    # only the predicted property is displayed; the fitness is not needed
    _, prediction = _get_fitness(anion, genes, target, model_ID)

    atom_line = "{}\t{}".format("number of atoms: ", candidate.GetNumAtoms())
    print(atom_line)
    attempts_line = "{}\t{}".format("mutation attempts: ", mutation_attempts)
    print(attempts_line)
    print("with prediction: \t{}".format(prediction))
    print("similarity score:  {0:10.3f}".format(sim_score))
    relative_name = salty.check_name(molecular_relative)
    print("{}\t{}\n".format("molecular relative: ", relative_name))
示例#5
0
class iupac_smiles_tests(unittest.TestCase):
    """
    Checks of the ion SMILES stored in the salty data files.
    """
    data_files = ["cationInfo.csv", "anionInfo.csv"]
    # NOTE: the statements below run when the class body is executed, i.e.
    # at import time: the first data file is loaded and every SMILES in it
    # is passed through salty.check_name.
    df = salty.load_data(data_files[0])
    smiles = df.smiles
    for i in range(len(smiles)):
        ion = smiles[i]
        salty.check_name(ion)

    def test_1_check_data(self):
        """Run ``check_data`` on every file listed in ``data_files``."""
        for data_file in self.data_files:
            self.check_data(salty.load_data(data_file))

    def test_2_check_wrong_ion(self):
        """
        Pass a string that is not a SMILES to ``salty.check_name``.

        No assertion is made on the result; the test only fails if the
        call raises.
        """
        ion = 'stupid_nonsense_string'
        salty.check_name(ion)

    def test_benchmark(self):
        """Run the two tests above through ``salty.Benchmark.run``."""
        salty.Benchmark.run(self.test_1_check_data)
        salty.Benchmark.run(self.test_2_check_wrong_ion)

    def check_data(self, df):
        """
        Report problematic ion SMILES found in ``df.smiles``.

        Every entry is checked three ways, and a message is handed to
        ``display`` (together with the time this check started) for each
        problem found:

        * RDKit cannot parse or sanitize the SMILES,
        * the SMILES contains neither "-" nor "+" (no charge),
        * the SMILES contains "." (more than one molecular entity).

        Nothing is asserted or returned; problems are only reported.

        Parameters
        ----------
        df : object
            anything exposing an iterable ``smiles`` attribute of SMILES
            strings, e.g. a pandas DataFrame with a ``smiles`` column
        """
        startTime = datetime.datetime.now()

        def fnDisplay(message):
            display(message, startTime)

        # Iterate over the values themselves: indexing a pandas Series with
        # range(len(...)) is label based and breaks on a non-default index.
        for ion in df.smiles:
            # MolFromSmiles signals an unparsable SMILES by returning None
            # instead of raising, so that case is checked explicitly before
            # the molecule is handed to SanitizeMol.
            mol = Chem.MolFromSmiles(ion)
            interpretable = mol is not None
            if interpretable:
                try:
                    Chem.SanitizeMol(mol)
                except ValueError:
                    interpretable = False
            if not interpretable:
                name = salty.check_name(ion)
                message = "RDKit cannot interpret %s ion SMILES in datafile" \
                          % name
                fnDisplay(message)
            if "-" not in ion and "+" not in ion:
                name = salty.check_name(ion)
                message = "%s ion does not have a charge" % name
                fnDisplay(message)
            if "." in ion:
                name = salty.check_name(ion)
                message = "%s ion contains more than one molecular entity" \
                          % name
                fnDisplay(message)
示例#6
0
 def test_2_check_wrong_ion(self):
     """
     Pass a string that is not a SMILES to ``salty.check_name``.

     No assertion is made on the result; the test only fails if the call
     raises.
     """
     ion = 'stupid_nonsense_string'
     salty.check_name(ion)
示例#7
0
# NOTE(review): the first message reports len(four) but says "2 each" --
# confirm the wording against the code that builds `four` and `more`.
print('There are ' + str(len(four)) + ' salts of 2 each')
print('There are ' + str(len(more)) + ' salts of 2 or more each')

# Split every entry of `two` into its first (cation) and second (anion)
# component.
cation2 = [pair[0] for pair in two]
anion2 = [pair[1] for pair in two]
error2_anion = []
error2_cation = []

# Use check_name to find anions/cations that are missing from the database.
# check_name is presumably expected to raise UnboundLocalError for unknown
# ions (TODO confirm); Exception is caught so the scan stays best-effort
# without also swallowing KeyboardInterrupt / SystemExit.
for i in anion2:
    try:
        check_name(i)
    except Exception:
        error2_anion.append(i)
for i in cation2:
    try:
        check_name(i)
    except Exception:
        error2_cation.append(i)

print('There are ' + str(len(set(error2_anion))) +
      ' unique missing anions from the data base')
示例#8
0
def generate_solvent(target,
                     model_ID,
                     heavy_atom_limit=50,
                     sim_bounds=[0.4, 1.0],
                     hits=1,
                     write_file=False):
    """
    the primary public function of the salt_generator module

    Parameters
    ----------
    target : array, float, or int
        the desired property value to be achieved by the engine, if
        an array, a multi-output model must be supplied to the engine
    model_ID : str
        the name of the model to be used by the engine. Gains has
        several built-in models to choose from
    heavy_atom_limit : int, optional
        the upper (exclusive) value for allowable heavy atoms in the
        returned candidate
    sim_bounds : array, optional
        lower (inclusive) and upper (exclusive) bound on the tanimoto
        similarity score between the returned candidate and its closest
        molecular relative in parent_candidates. The default list is
        only read, never modified
    hits : int, optional
        the number of desired solutions
    write_file : boolean, optional
        defaults to False. if True the solutions are written to
        "salt_log.csv" and a pdb file is written for every cation and
        anion

    Returns
    -------
    new : pandas DataFrame or None
        when write_file is False, a log of the solution(s) (empty if
        hits is smaller than 1). When write_file is True nothing is
        returned; the results are written to disk instead
    """
    # The summary file is loaded once and used for both candidate lists.
    # SECURITY: eval() executes whatever text is stored in the summary
    # file; only use model files that come from a trusted source.
    summary = genetic.load_data("{}_summary.csv".format(model_ID))
    parent_candidates = eval(summary.loc[1][1])
    anion_candidates = eval(summary.loc[2][1])
    cols = [
        "Salt ID", "Salt Smiles", "Cation Heavy Atoms",
        "Tanimoto Similarity Score", "Molecular Relative", "Anion",
        "Model Prediction", "MD Calculation", "Error"
    ]
    salts = pd.DataFrame(columns=cols)
    # Bound up front so that hits < 1 yields the empty log instead of an
    # UnboundLocalError at the end of the function.
    new = salts
    for i in range(1, hits + 1):
        # IDs depend only on the hit number; zero-padded to two digits.
        CAT_ID = "C%02d" % i
        AN_ID = "A%02d" % i
        salt_ID = CAT_ID + "_" + AN_ID
        # Keep sampling until a candidate passes every filter and can be
        # embedded/optimized in 3D.
        while True:
            anion_smiles = random.sample(list(anion_candidates), 1)[0]
            anion = Chem.MolFromSmiles(anion_smiles)
            best = _guess_password(target, anion, parent_candidates, model_ID)
            tan_sim_score, sim_index = \
                genetic.molecular_similarity(best, parent_candidates)
            cation_heavy_atoms = best.Mol.GetNumAtoms()
            salt_smiles = best.Genes + "." + Chem.MolToSmiles(anion)
            # `in` on a pandas Series tests the index labels, so the
            # duplicate check has to look at the underlying values.
            accepted = (cation_heavy_atoms < heavy_atom_limit
                        and sim_bounds[0] <= tan_sim_score < sim_bounds[1]
                        and salt_smiles not in salts["Salt Smiles"].values)
            if not accepted:
                continue
            # only the prediction is logged; the fitness score is unused
            _, pre = _get_fitness(anion, best.Genes, target, model_ID)
            molecular_relative = salty.check_name(
                parent_candidates[sim_index])
            anion_name = salty.check_name(anion_smiles)
            # the last two columns ("MD Calculation", "Error") stay empty
            new_entry = pd.DataFrame([[
                salt_ID, salt_smiles, cation_heavy_atoms, tan_sim_score,
                molecular_relative, anion_name, pre
            ]], columns=cols[:-2])
            try:
                cation = Chem.AddHs(best.Mol)
                Chem.EmbedMolecule(cation, Chem.ETKDG())
                Chem.UFFOptimizeMolecule(cation)
                anion = Chem.AddHs(anion)
                Chem.EmbedMolecule(anion, Chem.ETKDG())
                Chem.UFFOptimizeMolecule(anion)
            except Exception:
                # Candidate could not be embedded/optimized: try another.
                # Exception (not BaseException) keeps Ctrl-C able to stop
                # this otherwise endless loop.
                continue
            new = pd.DataFrame(pd.concat([salts, new_entry]), columns=cols)
            if write_file:
                MolToPDBFile(cation, "{}.pdb".format(CAT_ID))
                MolToPDBFile(anion, "{}.pdb".format(AN_ID))
            break
        if write_file:
            # the log is rewritten after every hit
            new.to_csv(path_or_buf="salt_log.csv", index=False)
        salts = new
    if not write_file:
        return new
示例#9
0
def generate_solvent(target,
                     model_ID,
                     heavy_atom_limit=50,
                     sim_bounds=[0, 1.0],
                     hits=1,
                     write_file=False,
                     seed=None,
                     hull=None,
                     simplex=None,
                     path=None,
                     exp_data=None,
                     verbose=0,
                     gen_token=False,
                     hull_bounds=[0, 1],
                     inner_search=True,
                     parent_cap=25,
                     mutation_cap=1000):
    """
    the primary public function of the salt_generator module

    Parameters
    ----------
    target : array, float, or int
        the desired property value to be achieved by the engine, if
        an array, a multi-output model must be supplied to the engine
    model_ID : str or sequence of str
        the name(s) of the model(s) to be used by the engine. Gains has
        several built-in models to choose from. A single string is
        treated as a one-element list
    heavy_atom_limit : int, optional
        the upper (exclusive) value for allowable heavy atoms in the
        returned candidate
    sim_bounds : array, optional
        lower (inclusive) and upper (exclusive) bound on the tanimoto
        similarity score between the returned candidate and its closest
        molecular relative in parent_candidates
    hits : int, optional
        the number of desired solutions
    write_file : boolean, optional
        defaults to False. if True the solutions are written to a csv
        log file and a pdb file is written for every cation and anion
    seed : int, optional
        optional randint seed for unittest consistency. The random module
        is re-seeded before every sampling attempt
    hull : pandas DataFrame, optional
        nxm pandas DataFrame to use convex hull search strategy. hull
        columns should be the same properties used in the genetic algorithm
        fitness test
    simplex : array, optional
        array to access boundary datapoints in the convex hull. This is used
        during target resampling defined by the convex hull/simplex
    path : str, optional
        absolute path to the directory holding the qspr model files
        ("<name>_qspr.h5", "<name>_desc.csv", "<name>_summ.csv") used as
        the fitness function
    exp_data: salty devmodel obj, optional
        used during hull target reassignment search strategy. Salty devmodel
        object of the original experimental data. When given, its
        Data_summary supplies the anion candidates and the parent
        candidates used for the similarity score
    verbose : int, optional, default 0
        0 : most verbose. Best child, parent/target resampling,
            sanitization failure
        1 : parent/target resampling, solution metadata, sanitization failure
        2 : solution metadata, sanitization failure
        3 : target resampling, csv-formatted solution metadata
        4 : csv-formatted solution metadata
    gen_token : int, str, optional
        a string or integer to prepend to file outputs. Useful in the case
        of parallel searches.
    hull_bounds : array, optional
        if hull and simplex are not none, hull_bounds describes the
        proximity convex_search should be to the simplex
    inner_search : bool, optional
        if hull and simplex are not none, inner_search specifies if
        convex_search should return values only within the convex hull
    parent_cap : int, optional
        forwarded unchanged to ``_guess_password``
    mutation_cap : int, optional
        forwarded unchanged to ``_guess_password``

    Returns
    -------
    new : pandas DataFrame or None
        when write_file is False, a log of the solution(s) (empty if
        hits is smaller than 1). When write_file is True nothing is
        returned; the results are written to disk instead
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(model_ID, str):
        model_ID = [model_ID]
    parent_candidates = []
    anion_candidates = []
    models = []
    deslists = []
    for i, name in enumerate(model_ID):
        if path:
            model = np.array(
                [load_model(join(path, '{}_qspr.h5'.format(name)))])
            with open(join(path, '{}_desc.csv'.format(name)),
                      'rb') as csv_file:
                deslist = list([pd.read_csv(csv_file, encoding='latin1')])
            with open(join(path, '{}_summ.csv'.format(name)),
                      'rb') as csv_file:
                summary = pd.read_csv(csv_file, encoding='latin1')
        else:
            model = np.array(
                [genetic.load_data("{}_qspr.h5".format(name), h5File=True)])
            deslist = list([genetic.load_data("{}_desc.csv".format(name))])
            summary = genetic.load_data("{}_summ.csv".format(name))
        # SECURITY: eval() executes whatever text is stored in the summary
        # file; only use model files that come from a trusted source.
        parents = eval(summary.iloc[1][1])
        anions = eval(summary.iloc[2][1])
        if i > 0:
            parent_candidates = np.concatenate((parents, parent_candidates))
            anion_candidates = np.concatenate((anions, anion_candidates))
            models = np.concatenate((models, model))
            # NOTE(review): this nests one level deeper with every extra
            # model; confirm that is the shape _get_fitness expects.
            deslists = list([deslists, deslist])
        else:
            parent_candidates = parents
            anion_candidates = anions
            models = model
            deslists = deslist
    cols = [
        "Salt ID", "Salt Smiles", "Cation Heavy Atoms",
        "Tanimoto Similarity Score", "Molecular Relative", "Anion",
        "Model Prediction", "MD Calculation", "Error"
    ]
    salts = pd.DataFrame(columns=cols)
    # Bound up front so that hits < 1 yields the empty log instead of an
    # UnboundLocalError at the end of the function.
    new = salts
    # Candidates the similarity score / molecular relative refer to: the
    # experimental parents when exp_data is given, the model parents
    # otherwise. Evaluated once instead of on every sampling attempt.
    relative_pool = parent_candidates
    if exp_data:
        # SECURITY: same eval() caveat as above applies to exp_data.
        anion_candidates = eval(exp_data.Data_summary.iloc[2][0])
        relative_pool = eval(exp_data.Data_summary.iloc[1][0])
    # Optional prefix for every file written by this call.
    file_prefix = "{}_".format(gen_token) if gen_token else ""
    for i in range(1, hits + 1):
        # IDs depend only on the hit number; zero-padded to two digits.
        CAT_ID = "C%02d" % i
        AN_ID = "A%02d" % i
        salt_ID = CAT_ID + "_" + AN_ID
        # Keep sampling until a candidate passes every filter and can be
        # embedded/optimized in 3D.
        while True:
            if seed:
                random.seed(seed)
            anion_smiles = random.sample(list(anion_candidates), 1)[0]
            anion = Chem.MolFromSmiles(anion_smiles)
            best = _guess_password(target,
                                   anion_smiles,
                                   parent_candidates,
                                   models,
                                   deslists,
                                   seed=seed,
                                   hull=hull,
                                   simplex=simplex,
                                   exp_data=exp_data,
                                   verbose=verbose,
                                   hull_bounds=hull_bounds,
                                   inner_search=inner_search,
                                   parent_cap=parent_cap,
                                   mutation_cap=mutation_cap)
            tan_sim_score, sim_index = \
                genetic.molecular_similarity(best, relative_pool)
            cation_heavy_atoms = best.Mol.GetNumAtoms()
            salt_smiles = best.Genes + "." + Chem.MolToSmiles(anion)
            # `in` on a pandas Series tests the index labels, so the
            # duplicate check has to look at the underlying values.
            accepted = (cation_heavy_atoms < heavy_atom_limit
                        and sim_bounds[0] <= tan_sim_score < sim_bounds[1]
                        and salt_smiles not in salts["Salt Smiles"].values)
            if not accepted:
                continue
            # only the prediction is logged; the fitness score is unused
            _, pre = _get_fitness(anion, best.Genes, target, models,
                                  deslists)
            molecular_relative = salty.check_name(relative_pool[sim_index])
            anion_name = salty.check_name(anion_smiles)
            # the last two columns ("MD Calculation", "Error") stay empty
            new_entry = pd.DataFrame([[
                salt_ID, salt_smiles, cation_heavy_atoms, tan_sim_score,
                molecular_relative, anion_name, pre
            ]], columns=cols[:-2])
            try:
                cation = Chem.AddHs(best.Mol)
                Chem.EmbedMolecule(cation, Chem.ETKDG())
                Chem.UFFOptimizeMolecule(cation)
                anion = Chem.AddHs(anion)
                Chem.EmbedMolecule(anion, Chem.ETKDG())
                Chem.UFFOptimizeMolecule(anion)
            except Exception:
                # Candidate could not be embedded/optimized: try another.
                # Exception (not BaseException) keeps Ctrl-C able to stop
                # this otherwise endless loop.
                if verbose in (0, 1, 2):
                    print("molecule not sanitizable")
                continue
            new = pd.DataFrame(pd.concat([salts, new_entry]), columns=cols)
            if write_file:
                if verbose in (3, 4):
                    print(new)
                MolToPDBFile(cation,
                             "{}{}.pdb".format(file_prefix, CAT_ID))
                MolToPDBFile(anion,
                             "{}{}.pdb".format(file_prefix, AN_ID))
            break
        if write_file:
            # the log is rewritten after every hit
            new.to_csv(path_or_buf="{}salt_log.csv".format(file_prefix),
                       index=False)
        salts = new
    if not write_file:
        return new