def test_ngram_3(data):
    """Check that a custom ``on_errors`` hook can re-raise proposal errors.

    For each case the hook re-raises one specific error type and otherwise
    returns ``error.old_smi``; ``proposal`` is then expected to raise that
    error type for the given SMILES under the given NumPy seed.
    """

    def make_handler(fatal_type):
        # Build a hook that re-raises only `fatal_type`; any other error
        # falls back to the SMILES carried by the error object.
        def on_errors(self, error):
            if not isinstance(error, fatal_type):
                return error.old_smi
            raise error

        return on_errors

    modifier = NGram(sample_order=5)
    modifier.fit(data['pg'][0][:20], train_order=5)

    # (seed, error type expected from proposal, input SMILES)
    cases = (
        (123456, MolConvertError, 'CC(=S)C([*])(C)=CCC([*])'),
        (654321, GetProbError, 'C([*])C([*])(C1=C(OCCC)C=CC(Br)C1)'),
    )
    for seed, expected_error, smiles in cases:
        np.random.seed(seed)
        modifier.on_errors = types.MethodType(
            make_handler(expected_error), modifier)
        with pytest.raises(expected_error):
            modifier.proposal([smiles])
def learn_n_gram2(smiles, n_reorder=10, reorder_prob=0.5):
    """Train an N-gram on SMILES augmented with randomly re-rooted variants.

    Method 2: expand the n-gram training set with randomly reordered SMILES
    (this is one of the many possible ways of doing it).

    Parameters
    ----------
    smiles : iterable of str
        Input SMILES. Entries that fail the RDKit canonicalization round trip
        are printed and skipped.
    n_reorder : int, optional
        Maximum number of re-rooted SMILES generated per molecule
        (default 10).
    reorder_prob : float, optional
        Forwarded to ``NGram(reorder_prob=...)`` (default 0.5).

    Returns
    -------
    NGram
        The N-gram object after ``fit`` has been called on the augmented
        SMILES list.
    """
    # convert the SMILES to canonical SMILES in RDKit (not necessary in
    # general); drop molecules that lead to an error
    cans = []
    for smi in smiles:
        try:
            cans.append(Chem.MolToSmiles(Chem.MolFromSmiles(smi)))
        except Exception:
            # best effort: report the offending SMILES and carry on
            print(smi)

    flat_list = []
    for can in cans:
        mol = Chem.MolFromSmiles(can)
        idx = list(range(mol.GetNumAtoms()))
        np.random.shuffle(idx)
        # Root the output at *randomly chosen* atoms. The shuffled indices
        # must be used here; iterating over range(...) would always pick the
        # first atoms and make the shuffle pointless.
        n_pick = max(0, min(len(idx), n_reorder))
        variants = {
            Chem.MolToSmiles(mol, rootedAtAtom=root) for root in idx[:n_pick]
        }
        # duplicates within one molecule are removed before training
        flat_list.extend(variants)

    n_gram_reorder = NGram(reorder_prob=reorder_prob)
    n_gram_reorder.fit(flat_list)

    # To keep the result, pickle the returned object
    # (e.g. to 'ngram_reorder_full.obj').
    return n_gram_reorder
def learn_n_gram0(smiles):
    """Fit a default-constructed ``NGram`` on ``smiles`` and return it.

    The model is trained with ``train_order=5`` on the SMILES of the
    available molecules.
    """
    model = NGram()
    model.fit(smiles, train_order=5)
    return model
def _learn_n_gram2(smiles, reorder_prob=0.5, paraphrased_smiles_number=10):
    """Train an ``NGram`` on SMILES expanded with randomly re-rooted variants.

    Method 2: expand the n-gram training set with randomly reordered SMILES
    (one of the many possible ways of doing it). For every molecule, up to
    ``paraphrased_smiles_number`` atoms are drawn from a random permutation
    and used as ``rootedAtAtom``; duplicate strings per molecule are dropped.
    """
    training_smiles = []
    for mol in _smiles_to_mol(smiles):
        atom_count = mol.GetNumAtoms()
        n_variants = min(atom_count, paraphrased_smiles_number)
        roots = np.random.permutation(atom_count)[:n_variants]
        # int(...) turns the NumPy integer into a plain Python int for RDKit
        unique_variants = set(
            Chem.MolToSmiles(mol, rootedAtAtom=int(root)) for root in roots
        )
        training_smiles.extend(unique_variants)

    model = NGram(reorder_prob=reorder_prob)
    model.fit(training_smiles)
    return model
def _learn_n_gram1(smiles):
    """Train an ``NGram`` without reordering on canonicalized SMILES.

    Method 1: the input is passed through ``_canonicalize_smiles`` and the
    model is built with ``reorder_prob=0``.
    """
    canonical = _canonicalize_smiles(smiles)
    model = NGram(reorder_prob=0)
    model.fit(canonical)
    return model
def learn_n_gram1(smiles):
    """Train an ``NGram`` on RDKit canonical SMILES with no reordering.

    Method 1: every input SMILES is round-tripped through RDKit
    (``MolFromSmiles`` then ``MolToSmiles``) and the resulting strings are
    used to fit an ``NGram(reorder_prob=0)``.
    """
    canonical = []
    for raw in smiles:
        mol = Chem.MolFromSmiles(raw)
        canonical.append(Chem.MolToSmiles(mol))

    model = NGram(reorder_prob=0)
    model.fit(canonical)

    # To keep the result, pickle the returned object
    # (e.g. to 'ngram_cans.obj').
    return model
def test_iqspr_1(data):
    """Run IQSPR with a Bayesian ridge estimator and check each step's output.

    For every step yielded with ``yield_lpf=True`` the third item must sum to
    1 (within 1e-5) and the fourth item must sum to 5, the number of input
    SMILES.
    """
    np.random.seed(0)
    ecfp = ECFP(n_jobs=1, input_type='smiles')
    bre = BayesianRidgeEstimator(descriptor=ecfp)
    ngram = NGram()
    iqspr = IQSPR(estimator=bre, modifier=ngram)

    X, y = data['pg']
    bre.fit(X, y)
    ngram.fit(X[0:20], train_order=10)

    beta = np.linspace(0.05, 1, 10)
    steps = iqspr(X[:5], beta, yield_lpf=True,
                  bandgap=(0.1, 0.2), density=(0.9, 1.2))
    # NOTE(review): p / f are presumably probabilities and frequencies
    # (from the `yield_lpf` flag name) — confirm against IQSPR.
    for _samples, _loglike, p, f in steps:
        assert np.abs(np.sum(p) - 1.0) < 1e-5
        # Give a real assertion message; `print(f)` evaluates to None.
        assert np.sum(f) == 5, f'expected the values to sum to 5, got {f}'
def test_iqspr_1(data):
    """Run IQSPR with a Gaussian log-likelihood and check each step's output.

    Targets are set on the likelihood object via ``update_targets`` before
    sampling. For every yielded step the third item must sum to 1 (within
    1e-5) and the fourth item must sum to 5.
    """
    np.random.seed(0)

    descriptor = data['ecfp']
    likelihood = GaussianLogLikelihood(descriptor=descriptor)
    modifier = NGram()
    sampler = IQSPR(estimator=likelihood, modifier=modifier)

    smiles, properties = data['pg']
    likelihood.fit(smiles, properties)
    likelihood.update_targets(reset=True, bandgap=(0.1, 0.2),
                              density=(0.9, 1.2))
    modifier.fit(data['pg'][0][0:20], train_order=10)

    schedule = np.linspace(0.05, 1, 10)
    for step in sampler(data['pg'][0][:5], schedule, yield_lpf=True):
        _samples, _loglike, prob, freq = step
        total_prob = np.sum(prob)
        assert np.abs(total_prob - 1.0) < 1e-5
        assert np.sum(freq) == 5
def test_ngram_1(data):
    """Check ``NGram`` default attributes and the effect of ``set_params``."""
    modifier = NGram()

    # state right after construction
    assert modifier.ngram_table is None
    initial = (
        ('max_len', 1000),
        ('del_range', (1, 10)),
        ('reorder_prob', 0),
        ('sample_order', (1, 10)),
    )
    for attr, expected in initial:
        assert getattr(modifier, attr) == expected
    assert modifier._train_order is None

    # only the parameters passed to set_params change
    modifier.set_params(max_len=500, reorder_prob=0.2)
    updated = (
        ('max_len', 500),
        ('del_range', (1, 10)),
        ('reorder_prob', 0.2),
    )
    for attr, expected in updated:
        assert getattr(modifier, attr) == expected
def test_ngram_4():
    """Check the layout and entry shapes of a table trained with order 3."""
    training = ['CCCc1ccccc1', 'CC(CCc1ccccc1)CC', 'Cc1ccccc1CC',
                'C(CC(C))CC', 'CCCC']

    model = NGram()  # base case
    model.fit(training, train_order=3)
    assert model._train_order == (1, 3)

    table = model._table
    assert (len(table), len(table[0][0])) == (3, 2)

    # expected shapes, indexed as [outer table index][inner index];
    # NOTE(review): "close"/"open" naming is kept from the expected-value
    # tables — confirm what the two slots of each entry represent.
    expected_close = [[(4, 5), (2, 2)], [(6, 5), (3, 2)], [(8, 4), (4, 2)]]
    expected_open = [[(5, 5), (2, 2)], [(6, 5), (3, 2)], [(6, 5), (4, 2)]]

    for outer, entry in enumerate(table):
        for inner, first_slot in enumerate(entry[0]):
            assert first_slot.shape == expected_close[outer][inner]
            assert entry[1][inner].shape == expected_open[outer][inner]
def data():
    """Build shared test objects, yield them as a dict, then print a footer.

    Generator (presumably used as a pytest fixture — the decorator is not
    visible here). Loads ``polymer_test_data.csv`` from the directory of this
    file, fits two Gaussian log-likelihood models on different property
    columns, trains an ``NGram`` on the first 20 SMILES and wires up an
    ``IQSPR`` instance.
    """
    import warnings

    # ignore numpy warning
    print('ignore NumPy RuntimeWarning\n')
    for pattern in ("numpy.dtype size changed", "numpy.ndarray size changed"):
        warnings.filterwarnings("ignore", message=pattern)

    csv_file = Path(__file__).parent / 'polymer_test_data.csv'
    table = pd.read_csv(str(csv_file))
    X = table['smiles']
    y = table.drop(['smiles', 'Unnamed: 0'], axis=1)

    # descriptors and the likelihood models built on top of them
    ecfp = ECFP(n_jobs=1, input_type='smiles', target_col=0)
    rdkitfp = RDKitFP(n_jobs=1, input_type='smiles', target_col=0)
    bre = GaussianLogLikelihood(descriptor=ecfp)
    bre2 = GaussianLogLikelihood(descriptor=rdkitfp)

    bre.fit(X, y[['bandgap', 'glass_transition_temperature']])
    bre2.fit(X, y[['density', 'refractive_index']])
    bre.update_targets(bandgap=(1, 2),
                       glass_transition_temperature=(200, 300))
    bre2.update_targets(refractive_index=(2, 3), density=(0.9, 1.2))

    class MyLogLikelihood(BaseLogLikelihoodSet):

        def __init__(self):
            super().__init__()
            # Both models are assigned to the same attribute name.
            # NOTE(review): presumably the base class collects each
            # assignment rather than overwriting — confirm.
            self.loglike = bre
            self.loglike = bre2

    like_mdl = MyLogLikelihood()

    ngram = NGram()
    ngram.fit(X[0:20], train_order=5)
    iqspr = IQSPR(estimator=bre, modifier=ngram)

    # prepare test data
    shared = dict(ecfp=ecfp,
                  rdkitfp=rdkitfp,
                  bre=bre,
                  bre2=bre2,
                  like_mdl=like_mdl,
                  ngram=ngram,
                  iqspr=iqspr,
                  pg=(X, y))
    yield shared

    print('test over')
def test_ngram_6(data):
    """Split a trained table at order 2, then merge the parts back together.

    Both merge directions are checked: a non-overwriting merge that returns a
    new object, and an in-place merge into the low-order part.
    """
    corpus = ['CCCc1ccccc1', 'CC(CCc1ccccc1)CC', 'Cc1ccccc1CC',
              'C(CC(C))CC', 'CCCC']
    full = NGram()  # base case
    full.fit(corpus, train_order=5)

    def assert_matches_full(candidate):
        # spot-check two table entries against the unsplit model
        for a, b, c in ((3, 0, 1), (2, 1, 0)):
            assert np.all(candidate._table[a][b][c] == full._table[a][b][c])

    low, high = full.split_table(cut_order=2)
    for part, orders, shortest in ((low, (1, 2), 1), (high, (3, 5), 3)):
        assert part._train_order == orders
        assert part.min_len == shortest

    # merge without overwriting: result keeps min_len of the receiver
    merged = high.merge_table(low, weight=1, overwrite=False)
    assert merged._train_order == (1, 5)
    assert merged.min_len == 3
    assert_matches_full(merged)

    # merge in place into the low-order part
    low.merge_table(high, weight=1)
    assert low._train_order == (1, 5)
    assert low.min_len == 1
    assert_matches_full(low)
def data():
    """Build shared test objects, yield them as a dict, then print a footer.

    Generator (presumably used as a pytest fixture — the decorator is not
    visible here). Loads ``polymer_test_data.csv`` from the directory of this
    file and creates unfitted ``ECFP``, ``BayesianRidgeEstimator``, ``NGram``
    and ``IQSPR`` objects.
    """
    import warnings

    # ignore numpy warning
    print('ignore NumPy RuntimeWarning\n')
    for pattern in ("numpy.dtype size changed", "numpy.ndarray size changed"):
        warnings.filterwarnings("ignore", message=pattern)

    csv_file = Path(__file__).parent / 'polymer_test_data.csv'
    table = pd.read_csv(str(csv_file))
    smiles = table['smiles']
    properties = table.drop(['smiles', 'Unnamed: 0'], axis=1)

    descriptor = ECFP(n_jobs=1, input_type='smiles')
    estimator = BayesianRidgeEstimator(descriptor=descriptor)
    modifier = NGram()
    sampler = IQSPR(estimator=estimator, modifier=modifier)

    # prepare test data
    shared = dict(ecfp=descriptor,
                  bre=estimator,
                  ngram=modifier,
                  iqspr=sampler,
                  pg=(smiles, properties))
    yield shared

    print('test over')
def test_ngram_2(data):
    """Check warnings emitted by ``fit`` and by failing ``proposal`` calls.

    Fitting with ``train_order=5`` must warn about ``<sample_order>`` and
    leave both order ranges at (1, 5). For the two seeded proposal cases a
    ``RuntimeWarning`` is expected and the input SMILES must come back
    unchanged.
    """
    modifier = NGram()

    with pytest.warns(RuntimeWarning, match='<sample_order>'):
        modifier.fit(data['pg'][0][:20], train_order=5)

    assert modifier._train_order == (1, 5)
    assert modifier.sample_order == (1, 5)
    assert modifier.ngram_table is not None

    # (seed, expected warning text, input SMILES)
    cases = (
        (123456, 'can not convert', 'CC(=S)C([*])(C)=CCC([*])'),
        (654321, 'get_prob: ', 'C([*])C([*])(C1=C(OCCC)C=CC(Br)C1)'),
    )
    for seed, warning_text, smiles in cases:
        np.random.seed(seed)
        with pytest.warns(RuntimeWarning, match=warning_text):
            old_smis = [smiles]
            proposed = modifier.proposal(old_smis)
            assert proposed == old_smis
def test_ngram_5():
    """Check ``merge_table`` across models with different orders and rings.

    Covers two non-overwriting pairwise merges and one in-place merge of two
    tables with per-table weights.
    """

    def trained(smiles, order):
        model = NGram()
        model.fit(smiles, train_order=order)
        return model

    def dims(model):
        table = model._table
        return len(table), len(table[0][0])

    # base case
    base = trained(['CCCc1ccccc1', 'CC(CCc1ccccc1)CC', 'Cc1ccccc1CC',
                    'C(CC(C))CC', 'CCCC'], 3)
    # higher order but lower num of rings
    higher_order = trained(['C(F)(F)C', 'CCCF', 'C(F)C=C'], 4)
    # lower order but higher num of rings
    more_rings = trained(['c123c(cc(ccc(N)ccc3)cccc2)cccc1', 'c1cncc1',
                          'CC(=O)CN'], 2)

    merged = base.merge_table(higher_order, weight=1, overwrite=False)
    assert merged._train_order == (1, 4)
    assert dims(merged) == (4, 2)

    merged = base.merge_table(more_rings, weight=1, overwrite=False)
    assert merged._train_order == (1, 3)
    assert dims(merged) == (3, 4)

    # in-place merge of both tables with individual weights
    base.merge_table(higher_order, more_rings, weight=[0.5, 1])
    table = base._table
    assert base._train_order == (1, 4)
    assert (len(table), len(table[0][0])) == (4, 4)
    assert table[0][0][0].loc["['C']", "C"] == 11.0
    assert table[0][1][0].loc["['(']", "C"] == 3.0
def combine_fragments(self, smis_base, smis_frag):
    """
    combine two SMILES strings with '*' as connection points

    Parameters
    ----------
    smis_base: str
        SMILES for combining.
        If no '*', assume connection point at the end.
        If more than one '*', the first will be picked if it's not the 1st character.
    smis_frag: str
        SMILES for combining.
        If no '*', assume connection point at the front.
        If more than one '*', the first will be picked.

    Returns
    -------
    The value returned by ``NGram.esmi2smi`` for the combined extended
    SMILES table (presumably the combined SMILES string).

    Raises
    ------
    RuntimeError
        If RDKit cannot parse ``smis_base`` or ``smis_frag``.
    """
    # prepare NGram object for use of ext. SMILES
    ngram = NGram()

    # check position of '*'
    mols_base = Chem.MolFromSmiles(smis_base)
    if mols_base is None:
        raise RuntimeError('Invalid base SMILES!')
    star_atoms_base = [
        i for i in range(mols_base.GetNumAtoms())
        if mols_base.GetAtomWithIdx(i).GetSymbol() == '*'
    ]

    # rearrange base SMILES to avoid 1st char = '*'
    if len(star_atoms_base) == 1 and star_atoms_base[0] == 0:
        smis_base_head = Chem.MolToSmiles(mols_base, rootedAtAtom=1)
    elif not star_atoms_base:
        smis_base_head = smis_base + '*'
    else:
        smis_base_head = smis_base

    # convert base to ext. SMILES (dropping the last row) and pick the
    # insertion location
    esmi_base = ngram.smi2esmi(smis_base_head)
    esmi_base = esmi_base[:-1]
    star_rows = esmi_base.index[esmi_base['esmi'] == '*'].tolist()
    # skip a '*' sitting in the very first row and use the next one instead
    idx_base = star_rows[1] if star_rows[0] == 0 else star_rows[0]

    # rearrange fragment to have 1st char = '*' and convert to ext. SMILES
    mols_frag = Chem.MolFromSmiles(smis_frag)
    if mols_frag is None:
        raise RuntimeError('Invalid frag SMILES!')
    idx_frag = [
        i for i in range(mols_frag.GetNumAtoms())
        if mols_frag.GetAtomWithIdx(i).GetSymbol() == '*'
    ]
    if not idx_frag:
        # remove last '!'
        esmi_frag = ngram.smi2esmi(smis_frag)[:-1]
    else:
        # remove leading '*' and last '!'
        esmi_frag = ngram.smi2esmi(
            Chem.MolToSmiles(mols_frag, rootedAtAtom=idx_frag[0]))[1:-1]
    # work on a copy so the column assignment below does not write into a
    # slice of the frame returned by smi2esmi
    esmi_frag = esmi_frag.copy()

    # check open rings of base SMILES
    nRing_base = esmi_base['n_ring'].loc[idx_base]

    # re-number rings in fragment SMILES
    esmi_frag['n_ring'] = esmi_frag['n_ring'] + nRing_base

    # delete '*' at the insertion location
    esmi_base = esmi_base.drop(idx_base).reset_index(drop=True)

    # combine base with the fragment
    ext_smi = pd.concat(
        [esmi_base.iloc[:idx_base], esmi_frag,
         esmi_base.iloc[idx_base:]]).reset_index(drop=True)

    # Append the terminating '!' row. The result has to be assigned back:
    # DataFrame.append never modified the frame in place (and is not
    # available in pandas >= 2.0), so pd.concat is used instead.
    new_pd_row = {'esmi': '!', 'n_br': 0, 'n_ring': 0, 'substr': ['!']}
    ext_smi = pd.concat([ext_smi, pd.DataFrame([new_pd_row])],
                        ignore_index=True)

    return ngram.esmi2smi(ext_smi)