Example #1
def test_ngram_3(data):
    ngram = NGram(sample_order=5)
    ngram.fit(data['pg'][0][:20], train_order=5)

    def on_errors(self, error):
        if isinstance(error, MolConvertError):
            raise error
        else:
            return error.old_smi

    np.random.seed(123456)
    ngram.on_errors = types.MethodType(on_errors, ngram)
    with pytest.raises(MolConvertError):
        old_smis = ['CC(=S)C([*])(C)=CCC([*])']
        ngram.proposal(old_smis)

    def on_errors(self, error):
        if isinstance(error, GetProbError):
            raise error
        else:
            return error.old_smi

    np.random.seed(654321)
    ngram.on_errors = types.MethodType(on_errors, ngram)
    with pytest.raises(GetProbError):
        old_smis = ['C([*])C([*])(C1=C(OCCC)C=CC(Br)C1)']
        ngram.proposal(old_smis)
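
The test above swaps in a custom `on_errors` handler with `types.MethodType` so that specific proposal errors propagate instead of being silently recovered. A minimal, self-contained sketch of that instance-binding pattern (the class and handler names here are illustrative, not taken from the library):

import types

class Modifier:
    def on_errors(self, error):
        # default behaviour: swallow the error and fall back to the previous value
        return getattr(error, 'old_smi', None)

def strict_on_errors(self, error):
    # replacement handler: re-raise instead of falling back
    raise error

mod = Modifier()
# bind the replacement to this instance only; other instances keep the default
mod.on_errors = types.MethodType(strict_on_errors, mod)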
Example #2
def learn_n_gram2(smiles):
    # Method 2: expand n-gram training set with randomly reordered SMILES
    # (we show one of the many possible ways of doing it)
    n_reorder = 10  # fixed number of random re-orderings per molecule

    # convert the SMILES to canonical SMILES in RDKit (not necessary in general)
    cans = []
    for smi in smiles:
        # skip molecules in the SMILES list that RDKit fails to parse or canonicalize
        try:
            cans.append(Chem.MolToSmiles(Chem.MolFromSmiles(smi)))
        except Exception:
            print(smi)

    mols = [Chem.MolFromSmiles(smi) for smi in cans]
    smi_reorder = []
    for mol in mols:
        idx = list(range(mol.GetNumAtoms()))
        np.random.shuffle(idx)
        # root the SMILES at up to n_reorder of the randomly shuffled atom indices
        tmp = [Chem.MolToSmiles(mol, rootedAtAtom=x) for x in idx[:n_reorder]]
        smi_reorder.append(list(set(tmp)))

    # flatten out the list and train the N-gram
    flat_list = [item for sublist in smi_reorder for item in sublist]
    n_gram_reorder = NGram(reorder_prob=0.5)
    n_gram_reorder.fit(flat_list)

    # save results
    # with open('ngram_reorder_full.obj', 'wb') as f:
    #     pk.dump(n_gram_reorder, f)
    return n_gram_reorder
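
A brief usage sketch for the helper above; the SMILES list is illustrative and a real run would pass a much larger training set:

# illustrative training data; real use would pass the full SMILES list
smiles = ['CCCc1ccccc1', 'CC(CCc1ccccc1)CC', 'Cc1ccccc1CC', 'CCCC']
n_gram_reorder = learn_n_gram2(smiles)

# the trained modifier can then propose new candidates from seed molecules
# (a tiny training set like this may trigger fallbacks with RuntimeWarnings)
candidates = n_gram_reorder.proposal(['CCCc1ccccc1'])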
Example #3
def learn_n_gram0(smiles):
    # initialize a new n-gram
    n_gram = NGram()

    # train the n-gram with SMILES of available molecules
    n_gram.fit(smiles, train_order=5)
    return n_gram
Example #4
def _learn_n_gram2(smiles, reorder_prob=0.5, paraphrased_smiles_number=10):
    # Method 2: expand n-gram training set with randomly reordered SMILES
    # (we show one of the many possible ways of doing it)

    mols = _smiles_to_mol(smiles)

    generated_smiles = []
    gen_smi_append = generated_smiles.append
    for mol in mols:
        number_of_atom = mol.GetNumAtoms()
        sample_number = min(number_of_atom, paraphrased_smiles_number)

        shuffled_index = np.random.permutation(number_of_atom)
        tmp = [
            Chem.MolToSmiles(mol, rootedAtAtom=int(x))
            for x in shuffled_index[:sample_number]
        ]
        gen_smi_append(list(set(tmp)))

    flat_list = [item for sublist in generated_smiles for item in sublist]

    n_gram = NGram(reorder_prob=reorder_prob)
    n_gram.fit(flat_list)

    return n_gram
Example #5
def _learn_n_gram1(smiles):
    # Method 1: use canonical SMILES in RDKit with no reordering
    cans = _canonicalize_smiles(smiles)
    n_gram = NGram(reorder_prob=0)
    n_gram.fit(cans)

    return n_gram
Example #6
def learn_n_gram1(smiles):
    # Method 1: use canonical SMILES in RDKit with no reordering
    cans = [Chem.MolToSmiles(Chem.MolFromSmiles(smi)) for smi in smiles]
    n_gram_cans = NGram(reorder_prob=0)
    n_gram_cans.fit(cans)

    # save results
    # with open('ngram_cans.obj', 'wb') as f:
    #     pk.dump(n_gram_cans, f)
    return n_gram_cans
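
For comparison, a short sketch showing how the two training strategies (learn_n_gram1 here and learn_n_gram2 from Example #2) would be applied to the same input; the data is illustrative:

smiles = ['CCCc1ccccc1', 'Cc1ccccc1CC', 'CCCC']  # illustrative input
n_gram_cans = learn_n_gram1(smiles)      # Method 1: canonical SMILES only, reorder_prob=0
n_gram_reorder = learn_n_gram2(smiles)   # Method 2: expanded with re-rooted SMILES, reorder_prob=0.5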
Example #7
def test_iqspr_1(data):
    np.random.seed(0)
    ecfp = ECFP(n_jobs=1, input_type='smiles')
    bre = BayesianRidgeEstimator(descriptor=ecfp)
    ngram = NGram()
    iqspr = IQSPR(estimator=bre, modifier=ngram)
    X, y = data['pg']
    bre.fit(X, y)
    ngram.fit(data['pg'][0][0:20], train_order=10)
    beta = np.linspace(0.05, 1, 10)
    for s, ll, p, f in iqspr(data['pg'][0][:5], beta, yield_lpf=True, bandgap=(0.1, 0.2), density=(0.9, 1.2)):
        assert np.abs(np.sum(p) - 1.0) < 1e-5
        assert np.sum(f) == 5, print(f)
Example #8
def test_iqspr_1(data):
    np.random.seed(0)
    ecfp = data['ecfp']
    bre = GaussianLogLikelihood(descriptor=ecfp)
    ngram = NGram()
    iqspr = IQSPR(estimator=bre, modifier=ngram)
    X, y = data['pg']
    bre.fit(X, y)
    bre.update_targets(reset=True, bandgap=(0.1, 0.2), density=(0.9, 1.2))
    ngram.fit(data['pg'][0][0:20], train_order=10)
    beta = np.linspace(0.05, 1, 10)
    for s, ll, p, f in iqspr(data['pg'][0][:5], beta, yield_lpf=True):
        assert np.abs(np.sum(p) - 1.0) < 1e-5
        assert np.sum(f) == 5
Example #9
def test_ngram_1(data):
    ngram = NGram()
    assert ngram.ngram_table is None
    assert ngram.max_len == 1000
    assert ngram.del_range == (1, 10)
    assert ngram.reorder_prob == 0
    assert ngram.sample_order == (1, 10)
    assert ngram._train_order is None

    ngram.set_params(max_len=500, reorder_prob=0.2)

    assert ngram.max_len == 500
    assert ngram.del_range == (1, 10)
    assert ngram.reorder_prob == 0.2
Example #10
def test_ngram_4():
    smis1 = ['CCCc1ccccc1', 'CC(CCc1ccccc1)CC', 'Cc1ccccc1CC', 'C(CC(C))CC', 'CCCC']
    n_gram1 = NGram()  # base case
    n_gram1.fit(smis1, train_order=3)
    assert n_gram1._train_order == (1, 3)

    tmp_tab = n_gram1._table
    assert (len(tmp_tab), len(tmp_tab[0][0])) == (3, 2)

    check_close = [[(4, 5), (2, 2)], [(6, 5), (3, 2)], [(8, 4), (4, 2)]]
    check_open = [[(5, 5), (2, 2)], [(6, 5), (3, 2)], [(6, 5), (4, 2)]]

    for ii, x in enumerate(tmp_tab):
        for i in range(len(x[0])):
            assert x[0][i].shape == check_close[ii][i]
            assert x[1][i].shape == check_open[ii][i]
Example #11
def data():
    # ignore numpy warning
    import warnings
    print('ignore NumPy RuntimeWarning\n')
    warnings.filterwarnings("ignore", message="numpy.dtype size changed")
    warnings.filterwarnings("ignore", message="numpy.ndarray size changed")

    pwd = Path(__file__).parent
    pg_data = pd.read_csv(str(pwd / 'polymer_test_data.csv'))

    X = pg_data['smiles']
    y = pg_data.drop(['smiles', 'Unnamed: 0'], axis=1)
    ecfp = ECFP(n_jobs=1, input_type='smiles', target_col=0)
    rdkitfp = RDKitFP(n_jobs=1, input_type='smiles', target_col=0)
    bre = GaussianLogLikelihood(descriptor=ecfp)
    bre2 = GaussianLogLikelihood(descriptor=rdkitfp)
    bre.fit(X, y[['bandgap', 'glass_transition_temperature']])
    bre2.fit(X, y[['density', 'refractive_index']])
    bre.update_targets(bandgap=(1, 2), glass_transition_temperature=(200, 300))
    bre2.update_targets(refractive_index=(2, 3), density=(0.9, 1.2))

    class MyLogLikelihood(BaseLogLikelihoodSet):
        def __init__(self):
            super().__init__()

            self.loglike = bre
            self.loglike = bre2

    like_mdl = MyLogLikelihood()
    ngram = NGram()
    ngram.fit(X[0:20], train_order=5)
    iqspr = IQSPR(estimator=bre, modifier=ngram)
    # prepare test data
    yield dict(ecfp=ecfp,
               rdkitfp=rdkitfp,
               bre=bre,
               bre2=bre2,
               like_mdl=like_mdl,
               ngram=ngram,
               iqspr=iqspr,
               pg=(X, y))

    print('test over')
Example #12
def test_ngram_6(data):
    smis0 = ['CCCc1ccccc1', 'CC(CCc1ccccc1)CC', 'Cc1ccccc1CC', 'C(CC(C))CC', 'CCCC']
    n_gram0 = NGram()  # base case
    n_gram0.fit(smis0, train_order=5)

    n_gram1, n_gram2 = n_gram0.split_table(cut_order=2)
    assert n_gram1._train_order == (1, 2)
    assert n_gram1.min_len == 1
    assert n_gram2._train_order == (3, 5)
    assert n_gram2.min_len == 3

    n_gram3 = n_gram2.merge_table(n_gram1, weight=1, overwrite=False)
    assert n_gram3._train_order == (1, 5)
    assert n_gram3.min_len == 3
    assert np.all(n_gram3._table[3][0][1] == n_gram0._table[3][0][1])
    assert np.all(n_gram3._table[2][1][0] == n_gram0._table[2][1][0])
    n_gram1.merge_table(n_gram2, weight=1)
    assert n_gram1._train_order == (1, 5)
    assert n_gram1.min_len == 1
    assert np.all(n_gram1._table[3][0][1] == n_gram0._table[3][0][1])
    assert np.all(n_gram1._table[2][1][0] == n_gram0._table[2][1][0])
Example #13
def data():
    # ignore numpy warning
    import warnings
    print('ignore NumPy RuntimeWarning\n')
    warnings.filterwarnings("ignore", message="numpy.dtype size changed")
    warnings.filterwarnings("ignore", message="numpy.ndarray size changed")

    pwd = Path(__file__).parent
    pg_data = pd.read_csv(str(pwd / 'polymer_test_data.csv'))

    X = pg_data['smiles']
    y = pg_data.drop(['smiles', 'Unnamed: 0'], axis=1)
    ecfp = ECFP(n_jobs=1, input_type='smiles')
    bre = BayesianRidgeEstimator(descriptor=ecfp)
    ngram = NGram()
    iqspr = IQSPR(estimator=bre, modifier=ngram)
    # prepare test data
    yield dict(ecfp=ecfp, bre=bre, ngram=ngram, iqspr=iqspr, pg=(X, y))

    print('test over')
Example #14
def test_ngram_2(data):
    ngram = NGram()

    with pytest.warns(RuntimeWarning, match='<sample_order>'):
        ngram.fit(data['pg'][0][:20], train_order=5)

    assert ngram._train_order == (1, 5)
    assert ngram.sample_order == (1, 5)
    assert ngram.ngram_table is not None

    np.random.seed(123456)
    with pytest.warns(RuntimeWarning, match='can not convert'):
        old_smis = ['CC(=S)C([*])(C)=CCC([*])']
        tmp = ngram.proposal(old_smis)
        assert tmp == old_smis

    np.random.seed(654321)
    with pytest.warns(RuntimeWarning, match='get_prob: '):
        old_smis = ['C([*])C([*])(C1=C(OCCC)C=CC(Br)C1)']
        tmp = ngram.proposal(old_smis)
        assert tmp == old_smis
Example #15
def test_ngram_5():
    smis1 = ['CCCc1ccccc1', 'CC(CCc1ccccc1)CC', 'Cc1ccccc1CC', 'C(CC(C))CC', 'CCCC']
    smis2 = ['C(F)(F)C', 'CCCF', 'C(F)C=C']
    smis3 = ['c123c(cc(ccc(N)ccc3)cccc2)cccc1', 'c1cncc1', 'CC(=O)CN']
    n_gram1 = NGram()  # base case
    n_gram1.fit(smis1, train_order=3)
    n_gram2 = NGram()  # higher order but lower num of rings
    n_gram2.fit(smis2, train_order=4)
    n_gram3 = NGram()  # lower order but higher num of rings
    n_gram3.fit(smis3, train_order=2)

    tmp_ngram = n_gram1.merge_table(n_gram2, weight=1, overwrite=False)
    assert tmp_ngram._train_order == (1, 4)
    tmp_tab = tmp_ngram._table
    assert (len(tmp_tab), len(tmp_tab[0][0])) == (4, 2)

    tmp_ngram = n_gram1.merge_table(n_gram3, weight=1, overwrite=False)
    assert tmp_ngram._train_order == (1, 3)
    tmp_tab = tmp_ngram._table
    assert (len(tmp_tab), len(tmp_tab[0][0])) == (3, 4)

    n_gram1.merge_table(n_gram2, n_gram3, weight=[0.5, 1])
    tmp_tab = n_gram1._table
    assert n_gram1._train_order == (1, 4)
    assert (len(tmp_tab), len(tmp_tab[0][0])) == (4, 4)
    assert tmp_tab[0][0][0].loc["['C']","C"] == 11.0
    assert tmp_tab[0][1][0].loc["['(']", "C"] == 3.0
Example #16
    def combine_fragments(self, smis_base, smis_frag):
        """
        combine two SMILES strings with '*' as connection points
        Parameters
        ----------
        smis_base: str
            SMILES for combining.
            If no '*', assume connection point at the end.
            If more than one '*', the first will be picked if it's not the 1st character.
        smis_frag: str
            SMILES for combining.
            If no '*', assume connection point at the front.
            If more than one '*', the first will be picked.
        """

        # prepare NGram object for use of ext. SMILES
        ngram = NGram()

        # check position of '*'
        mols_base = Chem.MolFromSmiles(smis_base)
        if mols_base is None:
            raise RuntimeError('Invalid base SMILES!')
        idx_base = [
            i for i in range(mols_base.GetNumAtoms())
            if mols_base.GetAtomWithIdx(i).GetSymbol() == '*'
        ]

        # rearrange base SMILES to avoid 1st char = '*'
        if len(idx_base) == 1 and idx_base[0] == 0:
            smis_base_head = Chem.MolToSmiles(mols_base, rootedAtAtom=1)
        elif len(idx_base) == 0:
            smis_base_head = smis_base + '*'
        else:
            smis_base_head = smis_base

        # convert base to ext. SMILES and pick insertion location
        esmi_base = ngram.smi2esmi(smis_base_head)
        esmi_base = esmi_base[:-1]
        idx_base = esmi_base.index[esmi_base['esmi'] == '*'].tolist()
        if idx_base[0] == 0:
            idx_base = idx_base[1]
        else:
            idx_base = idx_base[0]

        # rearrange fragment to have 1st char = '*' and convert to ext. SMILES
        mols_frag = Chem.MolFromSmiles(smis_frag)
        if mols_frag is None:
            raise RuntimeError('Invalid frag SMILES!')
        idx_frag = [
            i for i in range(mols_frag.GetNumAtoms())
            if mols_frag.GetAtomWithIdx(i).GetSymbol() == '*'
        ]
        if len(idx_frag) == 0:
            esmi_frag = ngram.smi2esmi(smis_frag)
            # remove last '!'
            esmi_frag = esmi_frag[:-1]
        else:
            esmi_frag = ngram.smi2esmi(
                Chem.MolToSmiles(mols_frag, rootedAtAtom=idx_frag[0]))
            # remove leading '*' and last '!'
            esmi_frag = esmi_frag[1:-1]

        # check open rings of base SMILES
        nRing_base = esmi_base['n_ring'].loc[idx_base]

        # re-number rings in fragment SMILES
        esmi_frag['n_ring'] = esmi_frag['n_ring'] + nRing_base

        # delete '*' at the insertion location
        esmi_base = esmi_base.drop(idx_base).reset_index(drop=True)

        # combine base with the fragment
        ext_smi = pd.concat(
            [esmi_base.iloc[:idx_base], esmi_frag,
             esmi_base.iloc[idx_base:]]).reset_index(drop=True)
        new_pd_row = {'esmi': '!', 'n_br': 0, 'n_ring': 0, 'substr': ['!']}
        # DataFrame.append returns a new frame (and is removed in pandas >= 2.0),
        # so build the terminal '!' row with pd.concat and reassign
        ext_smi = pd.concat([ext_smi, pd.DataFrame([new_pd_row])], ignore_index=True)

        return ngram.esmi2smi(ext_smi)
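
A hedged usage sketch for the method above, assuming it is available on an instance (called `modifier` here); both the input SMILES and the expected output are illustrative:

# base has one '*' connection point, fragment starts with '*'
combined = modifier.combine_fragments('CC(=O)C[*]', '[*]c1ccccc1')
# expected: a single SMILES with the fragment grafted at the '*' position,
# e.g. something equivalent to 'CC(=O)Cc1ccccc1'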