예제 #1
0
def createSubSetZINC(n=5000):
    """Write a random subset of the ZINC entries to a ``.smi`` file.

    The sequence returned by ``read_zinc()`` is shuffled in place with
    ``np.random.shuffle`` and its first ``n`` entries are written, one per
    line, to ``250k_rndm_zinc_drugs_clean_sub<n>.smi`` in the current working
    directory.

    Args:
        n: Number of entries to keep. When fewer entries are available the
            slice simply yields all of them.
    """
    from read_dataset import read_zinc

    D = read_zinc()
    np.random.shuffle(D)
    subSet = D[:n]

    out_name = "250k_rndm_zinc_drugs_clean_sub" + str(n) + ".smi"
    # The context manager closes the file even when a write fails part-way,
    # which the previous explicit open()/close() pair did not guarantee.
    with open(out_name, "w") as out:
        for entry in subSet:
            out.write(entry)
            out.write("\n")
예제 #2
0
def main():
    """Run reconstruction and generation with the grammar model and save both.

    Output file names are built from the part of ``GRAMMAR_WEIGHTS`` that
    precedes its first ``.``.
    """
    prefix = GRAMMAR_WEIGHTS.split(".")[0]
    decoded_file = prefix + "_decRes.txt"
    # Only needed by the prior step, which is currently commented out below.
    priors_file = prefix + "_priorsRes.txt"
    generation_file = prefix + "_generationRes.txt"
    grammar_model = molecule_vae.ZincGrammarModel(GRAMMAR_WEIGHTS)

    # Only the first 5000 entries are used for reconstruction.
    XTE = read_zinc()[0:5000]

    # Remember to comment/uncomment the matching line in the molecule_vae
    # file before running this.
    decoded_result = reconstruction(grammar_model, XTE)
    save_decoded_results(XTE, decoded_result, decoded_file)

    # decoded_priors = prior(grammar_model)
    # save_decoded_priors(decoded_priors, priors_file)

    decoded_generation = generation(grammar_model)
    save_decoded_priors(decoded_generation, generation_file)
예제 #3
0
def translationZINC(file):
    """Re-save the decoded results stored in ``file`` next to the ZINC inputs.

    Every line of ``file`` is stripped of surrounding whitespace, then of any
    leading/trailing ``;`` and ``,`` characters, and finally split on the
    ``";,;"`` separator. The resulting lists are passed, together with the
    first 5000 entries of ``read_zinc()``, to ``save_decoded_results`` which
    writes to ``file + "_new"``.

    Args:
        file: Path of the text file to read.
    """
    from read_dataset import read_zinc

    XTE = read_zinc()
    XTE = XTE[0:5000]

    decoded_res = []
    # Use a context manager: the handle was previously never closed.
    with open(file, "r") as src:
        # Lines yielded by a file iterator are never the empty string (a blank
        # line is "\n"), so the former `l != ""` guard was always true and is
        # dropped; every line still produces exactly one entry.
        for line in src:
            cleaned = line.strip()
            # str.strip treats its argument as a set of characters, so this
            # removes any run of ';' and ',' at either end of the line.
            cleaned = cleaned.strip(";,;")
            decoded_res.append(cleaned.split(";,;"))

    save_decoded_results(XTE, decoded_res, file + "_new")
예제 #4
0
def main():
    """Reconstruct a slice of the selected SMILES set and save the results.

    The dataset is chosen by ``cmd_args.smiles_file`` (``'qm9'`` or
    ``'zinc'``); the first ``nb_smiles`` entries are passed to
    ``reconstruct`` and the output is written to
    ``<cmd_args.save_dir>/decoded_results.txt``.

    Raises:
        ValueError: If ``cmd_args.smiles_file`` is neither ``'qm9'`` nor
            ``'zinc'``.
    """
    from att_model_proxy import AttMolProxy as ProxyModel
    from att_model_proxy import cmd_args

    # Model used to compute the decoded results.
    model = ProxyModel()
    # Destination of the decoded results.
    decoded_file = cmd_args.save_dir + '/decoded_results.txt'

    # Read the SMILES test set.
    if cmd_args.smiles_file == 'qm9':
        smiles_list = readStr_qm9()
    elif cmd_args.smiles_file == 'zinc':
        smiles_list = read_zinc()
    else:
        # Previously this fell through and failed later with an unbound
        # local; fail here with a message that names the bad value instead.
        raise ValueError("unknown smiles_file %r: expected 'qm9' or 'zinc'" %
                         (cmd_args.smiles_file, ))

    XTE = smiles_list[0:nb_smiles]

    decoded_result = reconstruct(model, XTE)
    decoded_result = np.array(decoded_result)
    save_decoded_results(XTE, decoded_result, decoded_file)
예제 #5
0
def zincProve():
    """Print summary statistics about the SMILES returned by ``read_zinc()``.

    Counts strings longer than 120 characters, strings containing ``.`` or
    ``*``, molecules with at most 9 atoms, molecules with exactly one atom,
    and tracks the largest and smallest atom count seen. All figures are
    printed to stdout; nothing is returned.
    """
    L = read_zinc()
    MAX = 120
    count = 0
    countDot = 0
    countAst = 0
    nMolMIn9 = 0
    nMaxAtoms = 0
    nMinAtom = 100

    nMol1 = 0

    for s in L:
        if len(s) > MAX:
            count += 1
        if "." in s:
            countDot += 1
        if "*" in s:
            countAst += 1
        # NOTE(review): if `Chem` is RDKit, MolFromSmiles returns None for an
        # unparsable string and the next line would raise - confirm that all
        # inputs are valid SMILES.
        m = Chem.MolFromSmiles(s)
        atom = m.GetNumAtoms()
        if atom <= 9:
            nMolMIn9 += 1
        if atom > nMaxAtoms:
            nMaxAtoms = atom
        # Must be an independent test: with `elif`, a molecule that sets a new
        # maximum (e.g. the very first one) was never considered for the
        # minimum, so the reported minimum could be too large.
        if atom < nMinAtom:
            nMinAtom = atom

        if atom == 1:
            nMol1 += 1

    print("Numero molecole con num. atomi <= 9: {}".format(nMolMIn9))
    # NOTE(review): the next two labels say "molecole" but the values printed
    # are the maximum and minimum atom counts.
    print("Numero massimo di molecole: {}".format(nMaxAtoms))
    print("Numero minimo di molecole: {}".format(nMinAtom))
    print("Numero molecole con caratteri superiori a 120: {}".format(count))
    print("Numero molecole con carattere '.' : {}".format(countDot))
    print("Numero molecole con carattere '*' : {}".format(countAst))
    print("Numero di molecole lette: {}".format(len(L)))
    print(
        "Numero di molecole lette formate da un solo atomo: {}".format(nMol1))
    print("-------------- FINE ---------------")
예제 #6
0
    all_onehot = np.zeros((len(L), cmd_args.max_decode_steps, DECISION_DIM),
                          dtype=np.byte)
    all_masks = np.zeros((len(L), cmd_args.max_decode_steps, DECISION_DIM),
                         dtype=np.byte)

    for start, b_pair in zip(range(0, len(L), chunk_size), list_binary):
        all_onehot[start:start + chunk_size, :, :] = b_pair[0]
        all_masks[start:start + chunk_size, :, :] = b_pair[1]

    #f_smiles = '.'.join(cmd_args.smiles_file.split('/')[-1].split('.')[0:-1])
    f_smiles = cmd_args.smiles_file
    out_file = '%s/%s-%d.h5' % (cmd_args.save_dir, f_smiles,
                                cmd_args.skip_deter)
    h5f = h5py.File(out_file, 'w')
    h5f.create_dataset('x', data=all_onehot)
    h5f.create_dataset('masks', data=all_masks)
    h5f.close()


if __name__ == '__main__':
    # Pick the loader that matches cmd_args.smiles_file; any other value
    # leaves the list empty.
    dataset_key = cmd_args.smiles_file
    if dataset_key == 'qm9':
        smiles_list = readStr_qm9()
    elif dataset_key == 'zinc':
        smiles_list = read_zinc()
    else:
        smiles_list = []

    # Everything from index 5000 onwards is handed to run_job.
    train_dataset = smiles_list[5000:]
    run_job(train_dataset)
예제 #7
0
sys.path.append('../utils')
sys.path.append('../../_utils')
from plot_utils import *
from read_dataset import readStr_qm9, read_zinc
from smile_metrics import MolecularMetrics as mm
from utils import save_scores_bias, load_decoded_results, calc_perc

# Base directory; a sub-directory named after the run is created inside it.
folder = "bias/"

# Positional command-line parameters.
name = sys.argv[1]
file = sys.argv[2]
dataset = sys.argv[3]

# Any dataset value other than "zinc" falls back to the QM9 reader; the
# first 5000 entries are dropped in both cases.
trainingSet = (read_zinc() if dataset == "zinc" else readStr_qm9())[5000:]

# Create the output directory and report the outcome either way.
_target_dir = folder + name
try:
    os.makedirs(_target_dir)
except OSError:
    print("Creation of the directory %s failed" % _target_dir)
else:
    print("Successfully created the directory %s " % _target_dir)

# Load the input SMILES and their decoded counterparts from `file`.
smi = {}
smi['smiles'], smi['decoded'] = load_decoded_results(file)
예제 #8
0
    result_list = Parallel(n_jobs=-1)(delayed(parse_many)(chunk[i:i +
                                                                size], grammar)
                                      for i in range(0, len(chunk), size))
    return [_1 for _0 in result_list for _1 in _0]


import cPickle as cp

from tqdm import tqdm

if __name__ == '__main__':
    save_dir = cmd_args.save_dir
    # Output name: base name of smiles_file up to its first '.', plus
    # '.cfg_dump'. Plain concatenation is used, so save_dir has to end with a
    # path separator for the file to land inside that directory.
    fname = save_dir + (
        cmd_args.smiles_file.split('/')[-1]).split('.')[0] + '.cfg_dump'
    grammar = parser.Grammar(cmd_args.grammar_file)

    # Any smiles_file value other than 'qm9' / 'zinc' leaves the list empty,
    # in which case an empty dump file is written.
    smiles = []
    if cmd_args.smiles_file == 'qm9':
        smiles = readStr_qm9()
    elif cmd_args.smiles_file == 'zinc':
        smiles = read_zinc()

    # The context manager closes the dump file even if parsing or the
    # assertion below fails part-way through.
    with open(fname, 'wb') as fout:
        for smiles_str in tqdm(smiles):
            ts = parser.parse(smiles_str, grammar)
            # Exactly one parse tree is expected per SMILES string.
            assert isinstance(ts, list) and len(ts) == 1
            n = AnnotatedTree2MolTree(ts[0])
            # One pickle record per molecule, appended back to back.
            cp.dump(n, fout, cp.HIGHEST_PROTOCOL)
예제 #9
0
import numpy as np
from models.utils import many_one_hot
import h5py

import sys, os
sys.path.append('%s/../_utils' % os.path.dirname(os.path.realpath(__file__)))
from read_dataset import read_zinc

# Build a one-hot encoding of every string returned by read_zinc() and store
# it, together with the character table, in data/zinc_str_dataset.h5.
L = read_zinc()

# Character table; a character's position is its one-hot index. The last
# entry (' ') doubles as the padding symbol.
chars = [
    'C', '(', ')', 'c', '1', '2', 'o', '=', 'O', 'N', '3', 'F', '[', '@', 'H',
    ']', 'n', '-', '#', 'S', 'l', '+', 's', 'B', 'r', '/', '4', '\\', '5', '6',
    '7', 'I', 'P', '8', ' '
]
DIM = len(chars)

count = 0
MAX_LEN = 120
# Size the array from the data instead of a hard-coded row count, so the
# script keeps working (without trailing all-zero rows or an IndexError) when
# the dataset size changes.
OH = np.zeros((len(L), MAX_LEN, DIM))
for chem in L:
    # chars.index raises ValueError on a character missing from the table.
    indices = [chars.index(c) for c in chem]
    if len(indices) < MAX_LEN:
        # Right-pad with the index of the padding symbol.
        indices.extend((MAX_LEN - len(indices)) * [DIM - 1])
    OH[count, :, :] = many_one_hot(np.array(indices), DIM)
    count = count + 1

# h5py.File is a context manager; using it guarantees the file is flushed and
# closed, which the previous code never did explicitly.
with h5py.File('data/zinc_str_dataset.h5', 'w') as h5f:
    h5f.create_dataset('data', data=OH)
    h5f.create_dataset('chr', data=chars)
예제 #10
0
def main():
    torch.manual_seed(0)
    lg = rdkit.RDLogger.logger()
    lg.setLevel(rdkit.RDLogger.CRITICAL)

    parser = OptionParser()
    parser.add_option("-t", "--test", dest="test_path")
    parser.add_option("-v", "--vocab", dest="vocab_path")
    parser.add_option("-m", "--model", dest="model_path")
    parser.add_option("-w", "--hidden", dest="hidden_size", default=200)
    parser.add_option("-l", "--latent", dest="latent_size", default=56)
    parser.add_option("-d", "--depth", dest="depth", default=3)
    opts, args = parser.parse_args()

    vocab = [x.strip("\r\n ") for x in open(opts.vocab_path)]
    vocab = Vocab(vocab)

    hidden_size = int(opts.hidden_size)
    latent_size = int(opts.latent_size)
    depth = int(opts.depth)

    model = JTNNVAE(vocab, hidden_size, latent_size, depth)
    model.load_state_dict(torch.load(opts.model_path))
    model = model.cuda()

    dataset_name = opts.test_path
    result_file = dataset_name + "_decoded_results.txt"
    priors_file = dataset_name + "_decoded_priors.txt"
    generation_fie = dataset_name + "_generation.txt"

    # read dataset
    if dataset_name == "zinc":
        XTE = read_zinc()
    else:
        D = readStr_qm9()
        # fix problem about molecule with '.' inside
        XTE = []
        for mol in D:
            if "." not in mol:
                XTE.append(mol)

    # reconstruction
    XTE = XTE[0:5000]
    XTE = filter(lambda x: len(x) > 1,
                 XTE)  #needed for removing smiles with only a char.
    decoded_result = reconstruction(model, XTE, 20, 1)
    save_decoded_results(XTE, decoded_result, result_file)

    # prior
    # decoded_priors_witherrors = model.sample_prior_eval(True, 1000, 10)
    # decoded_priors = []
    # for i in decoded_priors_witherrors:
    #     decoded_priors.append(sanitize(i))
    # save_decoded_priors(decoded_priors, priors_file)

    # generation
    generation_witherrors = model.sample_prior_eval(True, 20000, 1)
    generation = []
    for i in generation_witherrors:
        generation.append(sanitize(i))
    save_decoded_priors(generation, generation_fie)