Пример #1
0
 def testRingEncoding(self):
     """Check SMILES -> DeepSMILES encoding when ring notation is enabled.

     Every case is ``(smiles, rings_only[, rings_and_branches])``. When the
     third item is absent, the rings+branches converter is expected to
     produce the same string as the rings-only converter.
     """
     rings_only = ds.Converter(rings=True)
     rings_and_branches = ds.Converter(rings=True, branches=True)
     cases = [
         ("C1CCCC1", "CCCCC5"),
         ("C%10CCC%10", "CCCC4"),
         ("C1CCCCCCCCC1", "CCCCCCCCCC%10"),
         ("C1CC(OC)CC1", "CCC(OC)CC5", "CCCOC))CC5"),
         (r"N1CC=C/1\Br", r"NCC=C/4\Br"),
         (r"N\1CC=C1\Br", r"NCC=C/4\Br"),
         (r"C1=C/CCCCCC/1", "C=C/CCCCCC/8"),
         (r"C\1=C/CCCCCC1", "C=C/CCCCCC/8"),
         (r"C\1=C/CCCCCC/1", "C=C/CCCCCC/8"),
         ("C1N[C@@]12CO2", "CN[C@@]3CO3"),
         ("[C@@]12(NC1)CO2", "[C@@](NC3)CO3", "[C@@]NC3))CO3"),
         ("CC1CCCO[C@]21CCCCO2", "CCCCCO[C@@]6CCCCO6"),
         ("CC1CCCO[C@@]12CCCCO2", "CCCCCO[C@@]6CCCCO6"),
         ("NC[C@]12CCCC1C3CC2CC3", "NC[C@]CCCC5CCC8CC5"),
         ("NC[C@]12CCCC2C3CC1CC3", "NC[C@@]CCCC5CCC8CC5"),
         ("C2C1=C/CCCCCC/12", "CC=C/CCCCCC/89"),
         ("C1C2=C/CCCCCC1/2", "CC=C/CCCCCC9/8"),
     ]
     for case in cases:
         smiles, expected_rings = case[0], case[1]
         # Two-item cases reuse the rings-only expectation for both converters.
         expected_both = expected_rings if len(case) == 2 else case[2]
         self.assertEqual(expected_rings, rings_only.encode(smiles))
         self.assertEqual(expected_both, rings_and_branches.encode(smiles))
Пример #2
0
 def testRingDecoding(self):
     """Check DeepSMILES -> SMILES decoding when ring notation is enabled.

     Every case is ``(smiles, rings_input[, rings_and_branches_input])``.
     The rings+branches converter decodes the third item when present,
     otherwise the second; an empty third item skips that converter.
     """
     rings_only = ds.Converter(rings=True)
     rings_and_branches = ds.Converter(rings=True, branches=True)
     cases = [
         ("C1CCC1", "CCCC4"),
         ("C2CC1CCC1C2", "CCCCCC4C7"),
         ("c1c[nH]cc1", "cc[nH]cc5"),
         ("C1CCCCCCCCC1", "CCCCCCCCCC%10"),
         ("C1CCCCCCCCC1", "CCCCCCCCCC%(10)"),
         ("CCCCCCC1CCC1", "CCCCCCCCCC%(4)"),
         ("C1=C/CCCCCC/1", "C=C/CCCCCC/8"),
         ("C1C1C2C2C3C3C4C4C5C5C6C6C7C7C8C8C9C9C%10C%10",
          "CC2CC2CC2CC2CC2CC2CC2CC2CC2CC2"),
         ("C(CS)N", "C(CS)N", ""),
         ("C[C@@H]1CCCO[C@]12CCCCO2", "C[C@@H]CCCO[C@]6CCCCO6"),
         ("C2C1=C/CCCCCC/12", "CC=C/CCCCCC/89"),
         ("C1C2=C/CCCCCC1/2", "CC=C/CCCCCC9/8"),
     ]
     for case in cases:
         expected_smiles, rings_input = case[0], case[1]
         self.assertEqual(expected_smiles, rings_only.decode(rings_input))

         both_input = case[2] if len(case) == 3 else rings_input
         if not both_input:
             # An empty third item means "do not test rings+branches here".
             continue
         self.assertEqual(expected_smiles,
                          rings_and_branches.decode(both_input))
Пример #3
0
 def testStringRep(self):
     """Check ``str(Converter)`` for every rings/branches combination."""
     expectations = [
         ({}, "Converter(rings=False, branches=False)"),
         ({"rings": True}, "Converter(rings=True, branches=False)"),
         ({"branches": True}, "Converter(rings=False, branches=True)"),
         ({"rings": True, "branches": True},
          "Converter(rings=True, branches=True)"),
     ]
     for options, expected_text in expectations:
         self.assertEqual(str(ds.Converter(**options)), expected_text)
Пример #4
0
 def testDecodingExceptions(self):
     """Check that malformed DeepSMILES strings raise ``ds.DecodeError``.

     The first group is decoded with rings and branches enabled, the second
     with only the ring decoder enabled.
     """
     malformed_for_both = [
         "C8", "C))I", "%10C", "9C", "CCCCCC%(3CC", "C%(100)", "C[C@@CCl",
         "C%CC", "-5cc[nH]9"
     ]
     both = ds.Converter(rings=True, branches=True)
     for bad_input in malformed_for_both:
         with self.assertRaises(ds.DecodeError):
             both.decode(bad_input)

     # Test just the ring decoder
     malformed_for_rings = [
         "C8", "%10C", "9C", "C%(100)", "C[C@@CCl", "C%CC", "-5cc[nH]9"
     ]
     rings_only = ds.Converter(rings=True)
     for bad_input in malformed_for_rings:
         with self.assertRaises(ds.DecodeError):
             rings_only.decode(bad_input)
Пример #5
0
def main():
    """Store a DeepSMILES string next to every natural product in MongoDB.

    Reads ``coconut_id`` and ``unique_smiles`` of every document in the
    ``uniqueNaturalProduct`` collection of the ``COCONUT2020-07`` database,
    encodes the SMILES with rings and branches enabled, and writes the
    result to that document's ``deep_smiles`` field.
    """
    converter = deepsmiles.Converter(rings=True, branches=True)

    client = MongoClient("localhost:27018")
    db = client['COCONUT2020-07']

    # Only fetch the two fields that are needed for the conversion.
    projection = [{'$project': {'_id': 0, 'coconut_id': 1, 'unique_smiles': 1}}]
    records = db.uniqueNaturalProduct.aggregate(projection)
    allnp = pd.DataFrame(list(records))

    for _, record in allnp.iterrows():
        encoded = converter.encode(record["unique_smiles"])
        db.uniqueNaturalProduct.update_one(
            {'coconut_id': record['coconut_id']},
            {"$set": {"deep_smiles": encoded}},
        )

    print("done")
Пример #6
0
 def testRoundTripRingClosures(self):
     """Round-trip a SMILES string that uses ring closure labels 1..100.

     The input is ``C%(1)C%(2)...C%(100)`` followed by
     ``C%(100)C%(99)...C%(1)``. After encode + decode, the result must
     still contain the label ``%(100)``.
     """
     closure_ids = list(range(1, 101))
     # Labels are introduced in ascending order and reused in descending order.
     smi = "".join(
         "C%({})".format(label) for label in closure_ids + closure_ids[::-1]
     )
     for branches in (True, False):
         converter = ds.Converter(rings=True, branches=branches)
         roundtrip = converter.decode(converter.encode(smi))
         self.assertTrue("%(100)" in roundtrip)
Пример #7
0
 def testBranchEncoding(self):
     """Check SMILES -> DeepSMILES encoding when branch notation is enabled.

     Every case is ``(smiles, branches_only[, rings_and_branches])``. When
     the third item is absent, both converters are expected to produce the
     same string.
     """
     branches_only = ds.Converter(branches=True)
     rings_and_branches = ds.Converter(rings=True, branches=True)
     cases = [
         ("C(O)C", "CO)C"),
         ("C([O])C", "C[O])C"),
         ("C(OF)C", "COF))C"),
         ("C(F)(F)C", "CF)F)C"),
         ("C(Cl)(Cl)C", "CCl)Cl)C"),
         ("C(=O)Cl", "C=O)Cl"),
         ("C(OC(=O)Cl)I", "COC=O)Cl)))I"),
         ("[C@@H](Br)(Cl)I", "[C@@H]Br)Cl)I"),
         ("B(c1ccccc1)(O)O", "Bc1ccccc1))))))O)O", "Bcccccc6))))))O)O"),
         ("Cn1cccc-2nccc12", "Cn1cccc-2nccc12", "Cnccccnccc9-5"),
     ]
     for case in cases:
         smiles, expected_branches = case[0], case[1]
         # Two-item cases reuse the branches-only expectation for both.
         expected_both = expected_branches if len(case) == 2 else case[2]
         self.assertEqual(expected_branches, branches_only.encode(smiles))
         self.assertEqual(expected_both, rings_and_branches.encode(smiles))
Пример #8
0
def sm2ds(line):
    """Convert a line of '.'-separated SMILES into DeepSMILES.

    Spaces are removed first (the input is expected to be Schwaller-style
    preprocessed, space-tokenised SMILES). Each '.'-separated component is
    then encoded on its own and the results are re-joined with '.'.
    """
    converter = ds.Converter(rings=True, branches=True)
    components = line.replace(" ", "").split(".")
    return ".".join([converter.encode(component) for component in components])
Пример #9
0
def ds2sm(line):
    """Convert a line of '.'-separated DeepSMILES back into SMILES.

    Spaces are removed first. Each '.'-separated component is then decoded
    on its own and the results are re-joined with '.'.
    """
    converter = ds.Converter(rings=True, branches=True)
    components = line.replace(" ", "").split(".")
    return ".".join([converter.decode(component) for component in components])
Пример #10
0
 def testBranchDecoding(self):
     """Check DeepSMILES -> SMILES decoding with only branches enabled.

     Every case is ``(expected_smiles, deepsmiles_input)``.
     """
     branches_only = ds.Converter(branches=True)
     cases = [
         ("COC", "COC"),
         ("C(O)C", "CO)C"),
         ("C(=O)C", "C=O)C"),
         ("C[O]C", "C[O]C"),
         ("C(OC(=O)Cl)I", "COC=O)Cl)))I"),
         ("C(F)(F)C", "CF)F)C"),
         ("Cn1ccnc1", "Cn1ccnc1"),
         ("c1ccn(cc1)O", "c1ccncc1))O"),
         ("Cn1cccc-2nccc12", "Cn1cccc-2nccc12"),
     ]
     for expected_smiles, deep_input in cases:
         self.assertEqual(expected_smiles, branches_only.decode(deep_input))
Пример #11
0
def convert_smiles(smiles=False, deep=False):
    """Convert between SMILES and DeepSMILES, in whichever direction is given.

    Exactly one of the two arguments should be a non-empty string.

    Args:
        smiles: SMILES string to encode as DeepSMILES.
        deep: DeepSMILES string to decode into SMILES.

    Returns:
        The converted string. If both arguments are provided, a message is
        printed and an empty tuple is returned (kept for backward
        compatibility). If neither is provided, ``None`` is returned.
    """
    # Validate the arguments before doing any work: the converter is only
    # built once we know a conversion will actually happen.
    if smiles and deep:
        print('Only provide a string for one of smiles or deep')
        return ()
    if not smiles and not deep:
        return None

    converter = deepsmiles.Converter(rings=True, branches=True)
    if smiles:  # Convert from SMILES to DeepSMILES
        return converter.encode(smiles)
    # Convert from DeepSMILES to SMILES
    return converter.decode(deep)
Пример #12
0
def main():
    """Predict a SMILES string for one input file and write it to disk.

    Parses ``--input`` from the command line, loads the tokenizer from
    ``tokenizer.pkl``, builds the encoder/decoder models, runs ``predictor``
    and writes the DeepSMILES-decoded prediction to ``Predictions.txt``.
    """
    parser = argparse.ArgumentParser(description="Predicting test images")
    # Input Arguments
    parser.add_argument('--input',
                        help='Enter the input filename',
                        required=True)

    args = parser.parse_args()

    # Use a context manager so the pickle file handle is closed right away.
    # NOTE: pickle can execute arbitrary code; only load a trusted
    # tokenizer.pkl.
    with open("tokenizer.pkl", "rb") as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)

    # Prediction model parameters
    embedding_dim = 600
    units = 1024
    # +1 on top of the tokenizer's word count (presumably for the reserved
    # padding index 0 - confirm against the training script).
    vocab_size = len(tokenizer.word_index) + 1

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)

    encoder = I2S_Model.CNN_Enc(embedding_dim)
    decoder = I2S_Model.RNN_Dec(embedding_dim, units, vocab_size)

    # Initialize DeepSMILES
    converter = deepsmiles.Converter(rings=True, branches=True)

    with open('Predictions.txt', 'w') as f:
        print(datetime.now().strftime('%Y/%m/%d %H:%M:%S'),
              "Predictions\n\n",
              flush=True,
              file=f)

        result = predictor(args.input,
                           encoder=encoder,
                           decoder=decoder,
                           optimizer=optimizer,
                           tokenizer=tokenizer)

        # The predicted tokens are concatenated and the sequence markers
        # stripped before decoding DeepSMILES back to SMILES.
        predicted_deepsmiles = ''.join(result).replace("<start>", "").replace(
            "<end>", "")
        print(
            converter.decode(predicted_deepsmiles),
            '\tPredictedSmiles',
            flush=True,
            file=f,
        )
Пример #13
0
def convert_to_deepsmile(smiles_list):
    """Convert a SMILES list to DeepSMILES, keeping only round-trippable items.

    Each SMILES is encoded to DeepSMILES, decoded back, and canonicalised
    with RDKit. The DeepSMILES string is kept only when the canonical
    round-trip result equals the input string, so the input SMILES should
    already be canonical.

    Args:
        smiles_list: Sequence of (canonical) SMILES strings.

    Returns:
        List of DeepSMILES strings for the inputs that survived the round
        trip, in input order. Inputs that fail to decode, fail to parse, or
        do not round-trip are dropped.
    """
    converter = deepsmiles.Converter(rings=True, branches=True)
    deep_lst = [converter.encode(smi) for smi in smiles_list]

    final_deep_lst = []
    for smi, deep in zip(smiles_list, deep_lst):
        try:
            decoded = converter.decode(deep)
        except deepsmiles.DecodeError as e:
            print("DecodeError! Error message was {}".format(e.message))
            # Nothing to canonicalise; passing None to RDKit would raise.
            continue

        # Parse once and reuse the molecule for canonicalisation.
        mol = Chem.MolFromSmiles(decoded)
        if mol is None:
            continue
        if Chem.MolToSmiles(mol) == smi:
            final_deep_lst.append(deep)
    return final_deep_lst
Пример #14
0
import os, sys
# Extend the import search path with the VAE_dependencies directory (both
# relative locations are tried). Presumably the local modules imported below
# (data_loader, GPlus2S) live there - confirm.
sys.path.append('VAE_dependencies')
sys.path.append('../../VAE_dependencies')
# Echo the command-line arguments at import time.
print(sys.argv)

import numpy as np
import yaml
import torch
from torch import nn
from random import shuffle
from data_loader import multiple_smile_to_hot, grammar_one_hot_to_smile
import pandas as pd

from GPlus2S import GrammarPlusToSMILES, IncludeRingsForSMILES
import deepsmiles
# Module-level DeepSMILES converter with both ring and branch notation enabled.
converter = deepsmiles.Converter(
    rings=True, branches=True)  # Converter object, as described by the DeepSMILES authors

from rdkit.Chem import MolFromSmiles
from rdkit import rdBase
# Turn off the 'rdApp.error' RDKit log channel.
rdBase.DisableLog('rdApp.error')

import time


def _make_dir(directory):
    """Create ``directory`` (and any missing parents) if it does not exist.

    ``exist_ok=True`` makes the call idempotent, so repeating a run that
    targets an existing directory no longer fails with ``FileExistsError``.
    """
    os.makedirs(directory, exist_ok=True)


def save_models(encoder, decoder, epoch):
    """Create the per-epoch output directory ``./saved_models/<epoch>``.

    NOTE(review): ``encoder`` and ``decoder`` are not used in the visible
    body; the code that actually saves them is presumably cut off here.
    """
    out_dir = './saved_models/{}'.format(epoch)
    _make_dir(out_dir)
Пример #15
0
 def decode(generated, start='<M>', end='</M>'):
     """Turn generated text into a SMILES string.

     The DeepSMILES payload is pulled out of ``generated`` with
     ``DeepSMILESLanguageModelUtils.extract`` using the ``start``/``end``
     markers, then decoded with rings and branches enabled.
     """
     deep_smiles = DeepSMILESLanguageModelUtils.extract(generated, start, end)
     return deepsmiles.Converter(rings=True, branches=True).decode(deep_smiles)
Пример #16
0
def deepsml(x):
    """Encode every SMILES value of a mapping as DeepSMILES.

    Args:
        x: Object exposing ``items()`` that yields ``(key, smiles)`` pairs.

    Returns:
        A dict with the same keys, each mapped to the DeepSMILES encoding
        (rings and branches enabled) of its SMILES value.
    """
    # The converter does not depend on the item, so build it once instead of
    # once per entry.
    converter = deepsmiles.Converter(rings=True, branches=True)
    return {key: converter.encode(smiles) for key, smiles in x.items()}
Пример #17
0
 def testDecodingBasic(self):
     """A single atom decodes to itself with a default converter."""
     self.assertEqual("C", ds.Converter().decode("C"))
Пример #18
0
from rdkit import Chem
from rdkit.Chem import AllChem
import re
import deepsmiles
f = open('allspec_90')

ccc = 0
for line in f:
    if re.search('pred', line):
        try:
            smi = deepsmiles.Converter(rings=True, branches=True).decode(
                line.split()[1].split('<')[0].strip())
        except:
            smi = None
        if smi:
            try:
                csmi = Chem.MolToSmiles(Chem.MolFromSmiles(smi))
            except:
                pass
            if csmi:
                l = []
                mm = Chem.MolFromSmiles(csmi)
                mm = AllChem.AddHs(mm)
                for at in mm.GetAtoms():
                    l.append(at.GetSymbol())
                    cs = l.count('C')
                    hs = l.count('H')
                    os = l.count('O')
                print('pred', csmi, cs, hs, os)
    if re.search('real', line):
        try:
# Load the tokenizer from disk.
# NOTE(review): the file object passed to pickle.load is never closed
# explicitly; consider a `with open(...)` block. pickle can execute arbitrary
# code, so only load a trusted tokenizer.pkl.
tokenizer = pickle.load(open("tokenizer.pkl","rb"))


# Prediction model parameters
embedding_dim = 600
units = 1024
# +1 on top of the tokenizer's word count (presumably for a reserved
# padding index - confirm against the training script).
vocab_size = len(tokenizer.word_index) + 1

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)

encoder = I2S_Model.CNN_Encoder(embedding_dim)
decoder = I2S_Model.RNN_Decoder(embedding_dim, units, vocab_size)

# Initialize DeepSMILES with both ring and branch notation enabled
converter = deepsmiles.Converter(rings=True, branches=True)


#Evaluator
def evaluate(image):
	maxlength = 74 #should be determined from running the training script

	hidden = decoder.reset_state(batch_size=1)

	temp_input = tf.expand_dims(I2S_evalData.load_image(image)[0], 0)
	img_tensor_val = I2S_evalData.image_features_extract_model(temp_input)
	img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))

	features = encoder(img_tensor_val)

	dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
Пример #20
0
    """
    if not mol:
        return None

    if random_type == "unrestricted":
        return rkc.MolToSmiles(mol, canonical=False, doRandom=True, isomericSmiles=False)
    if random_type == "restricted":
        new_atom_order = list(range(mol.GetNumAtoms()))
        random.shuffle(new_atom_order)
        random_mol = rkc.RenumberAtoms(mol, newOrder=new_atom_order)
        return rkc.MolToSmiles(random_mol, canonical=False, isomericSmiles=False)
    raise ValueError("Type '{}' is not valid".format(random_type))


# Ready-made DeepSMILES converters keyed by which notations they enable.
DEEPSMI_CONVERTERS = {
    label: ds.Converter(**options)
    for label, options in (
        ("rings", {"rings": True}),
        ("branches", {"branches": True}),
        ("both", {"rings": True, "branches": True}),
    )
}


def to_deepsmiles(smi, converter="both"):
    """
    Encodes a SMILES string as its DeepSMILES alternative.
    :param smi: SMILES string.
    :param converter: Key of the converter to use in DEEPSMI_CONVERTERS
                      ("rings", "branches" or "both").
    :return : A DeepSMILES string.
    """
    selected = DEEPSMI_CONVERTERS[converter]
    return selected.encode(smi)


def from_deepsmiles(deepsmi, converter="both"):
Пример #21
0
    output = open(args.output_fn, 'w')
    rings = args.rings
    branches = args.branches
    do_encode = args.do_encode
    do_decode = args.do_decode
    if do_decode:
        do_encode = False
    assert(not (do_encode and do_decode))
    if not (rings or branches):
        print("use at least --no-rings or --no-branches",
              file=sys.stderr)
        sys.exit(1)
    count = 0
    # work ----------------------------------------------
    smi_supplier = RobustSmilesSupplier(input_fn)
    converter = deepsmiles.Converter(rings, branches)
    if do_encode:

        for smi, name in smi_supplier:
            deep_smi = encode(converter, smi)
            print("%s\t%s" % (deep_smi, name), file=output)
            count += 1
    else: # decode
        for deep_smi, name in smi_supplier:
            smi = decode(converter, deep_smi)
            if smi != None:
                print("%s\t%s" % (smi, name), file=output)
            count += 1
    after = time.time()
    dt = after - before
    print("%d molecules at %.2f mol/s" % (count, count / dt), file=sys.stderr)