def createSubSetZINC(n=5000):
    import numpy as np
    from read_dataset import read_zinc

    D = read_zinc()
    np.random.shuffle(D)
    subSet = D[:n]
    f = open("250k_rndm_zinc_drugs_clean_sub" + str(n) + ".smi", "w")
    for l in subSet:
        f.write(l)
        f.write("\n")
    f.close()
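# Usage sketch (illustrative, assuming read_dataset.read_zinc() returns the
# full list of ZINC SMILES strings, one per entry):
#
#   createSubSetZINC(n=5000)
#   # -> writes 250k_rndm_zinc_drugs_clean_sub5000.smi, one SMILES per line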
def main():
    decoded_file = GRAMMAR_WEIGHTS.split(".")[0] + "_decRes.txt"
    priors_file = GRAMMAR_WEIGHTS.split(".")[0] + "_priorsRes.txt"
    generation_file = GRAMMAR_WEIGHTS.split(".")[0] + "_generationRes.txt"
    grammar_model = molecule_vae.ZincGrammarModel(GRAMMAR_WEIGHTS)

    XTE = read_zinc()
    XTE = XTE[0:5000]  # remember to comment/uncomment the matching line in the molecule_vae file

    decoded_result = reconstruction(grammar_model, XTE)
    save_decoded_results(XTE, decoded_result, decoded_file)

    # decoded_priors = prior(grammar_model)
    # save_decoded_priors(decoded_priors, priors_file)

    decoded_generation = generation(grammar_model)
    save_decoded_priors(decoded_generation, generation_file)
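# Hedged sketch of the reconstruction() helper used above, assuming the
# molecule_vae.ZincGrammarModel interface of the original grammar-VAE code
# (encode() maps a list of SMILES to latent vectors, decode() maps latent
# vectors back to SMILES). Names and batching here are illustrative, not the
# repository's actual implementation.
def reconstruction_sketch(model, smiles_list, batch_size=100):
    decoded = []
    for i in range(0, len(smiles_list), batch_size):
        batch = smiles_list[i:i + batch_size]
        z = model.encode(batch)          # latent representation of the batch
        decoded.extend(model.decode(z))  # decoded SMILES, one per input
    return decoded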
def translationZINC(file):
    from read_dataset import read_zinc

    XTE = read_zinc()
    XTE = XTE[0:5000]
    decoded_res = []
    with open(file, "r") as f:
        for l in f:
            if l != "":
                lsani = l.strip()
                lsani = lsani.strip(";,;")
                smiles = lsani.split(";,;")
                decoded_res.append(smiles)
    save_decoded_results(XTE, decoded_res, file + "_new")
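# The parsing above assumes each line of the decoded-results file packs the
# decoded attempts for one input molecule, joined by the ";,;" separator,
# e.g. (an illustrative line, not real output):
#
#   CCO ;,; CCO ;,; C=O
#
# strip(";,;") removes stray separator characters at the ends of the line,
# and split(";,;") recovers the individual SMILES strings.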
def main():
    from att_model_proxy import AttMolProxy as ProxyModel
    from att_model_proxy import cmd_args

    # take the model and compute the decoded results
    model = ProxyModel()

    # where to save the decoded results
    decoded_file = cmd_args.save_dir + '/decoded_results.txt'

    # read the SMILES test set
    if cmd_args.smiles_file == 'qm9':
        smiles_list = readStr_qm9()
    elif cmd_args.smiles_file == 'zinc':
        smiles_list = read_zinc()
    XTE = smiles_list[0:nb_smiles]  # nb_smiles is a module-level constant

    decoded_result = reconstruct(model, XTE)
    decoded_result = np.array(decoded_result)
    save_decoded_results(XTE, decoded_result, decoded_file)
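# Hedged sketch of the save_decoded_results helper these scripts share,
# assuming it writes one line per input molecule: the original SMILES first,
# then the decoded attempts, all joined by the ";,;" separator that
# translationZINC() splits on. The name, signature and format here are
# assumptions for illustration, not the repository's actual helper.
def save_decoded_results_sketch(originals, decoded, path):
    with open(path, "w") as f:
        for orig, attempts in zip(originals, decoded):
            if isinstance(attempts, str):
                attempts = [attempts]
            f.write(" ;,; ".join([orig] + list(attempts)))
            f.write("\n")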
def zincProve():
    L = read_zinc()
    MAX = 120
    count = 0
    countDot = 0
    countAst = 0
    nMolMIn9 = 0
    nMaxAtoms = 0
    nMinAtom = 100
    nMol1 = 0
    for s in L:
        if len(s) > MAX:
            count = count + 1
        if "." in s:
            countDot = countDot + 1
        if "*" in s:
            countAst = countAst + 1
        m = Chem.MolFromSmiles(s)
        atom = m.GetNumAtoms()
        if atom <= 9:
            nMolMIn9 = nMolMIn9 + 1
        if atom > nMaxAtoms:
            nMaxAtoms = atom
        if atom < nMinAtom:  # 'elif' here would skip the min update whenever a new max is found
            nMinAtom = atom
        if atom == 1:
            nMol1 = nMol1 + 1
    print("Number of molecules with <= 9 atoms: {}".format(nMolMIn9))
    print("Maximum number of atoms in a molecule: {}".format(nMaxAtoms))
    print("Minimum number of atoms in a molecule: {}".format(nMinAtom))
    print("Number of molecules longer than 120 characters: {}".format(count))
    print("Number of molecules containing '.': {}".format(countDot))
    print("Number of molecules containing '*': {}".format(countAst))
    print("Number of molecules read: {}".format(len(L)))
    print("Number of molecules consisting of a single atom: {}".format(nMol1))
    print("-------------- END ---------------")
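# Note: Chem.MolFromSmiles returns None for SMILES that RDKit cannot parse,
# which would make m.GetNumAtoms() above raise AttributeError. A defensive
# variant (a sketch, not part of the original script):
def safe_num_atoms(smiles):
    from rdkit import Chem
    m = Chem.MolFromSmiles(smiles)
    return m.GetNumAtoms() if m is not None else None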
    all_onehot = np.zeros((len(L), cmd_args.max_decode_steps, DECISION_DIM),
                          dtype=np.byte)
    all_masks = np.zeros((len(L), cmd_args.max_decode_steps, DECISION_DIM),
                         dtype=np.byte)

    for start, b_pair in zip(range(0, len(L), chunk_size), list_binary):
        all_onehot[start:start + chunk_size, :, :] = b_pair[0]
        all_masks[start:start + chunk_size, :, :] = b_pair[1]

    # f_smiles = '.'.join(cmd_args.smiles_file.split('/')[-1].split('.')[0:-1])
    f_smiles = cmd_args.smiles_file
    out_file = '%s/%s-%d.h5' % (cmd_args.save_dir, f_smiles, cmd_args.skip_deter)
    h5f = h5py.File(out_file, 'w')
    h5f.create_dataset('x', data=all_onehot)
    h5f.create_dataset('masks', data=all_masks)
    h5f.close()


if __name__ == '__main__':
    smiles_list = []
    if cmd_args.smiles_file == 'qm9':
        smiles_list = readStr_qm9()
    elif cmd_args.smiles_file == 'zinc':
        smiles_list = read_zinc()
    train_dataset = smiles_list[5000:]  # the first 5000 molecules are held out as the test set
    run_job(train_dataset)
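# Reading the dumped file back (a minimal sketch; the 'x' and 'masks' dataset
# names match the create_dataset calls above):
#
#   import h5py
#   h5f = h5py.File(out_file, 'r')
#   onehot = h5f['x'][:]     # shape (n_molecules, max_decode_steps, DECISION_DIM)
#   masks = h5f['masks'][:]
#   h5f.close()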
sys.path.append('../utils')
sys.path.append('../../_utils')
from plot_utils import *
from read_dataset import readStr_qm9, read_zinc
from smile_metrics import MolecularMetrics as mm
from utils import save_scores_bias, load_decoded_results, calc_perc

folder = "bias/"

# read command-line parameters
name = sys.argv[1]
file = sys.argv[2]
dataset = sys.argv[3]

if dataset == "zinc":
    trainingSet = read_zinc()
else:
    trainingSet = readStr_qm9()
trainingSet = trainingSet[5000:]

# make the output folder
try:
    os.makedirs(folder + name)
except OSError:
    print("Creation of the directory %s failed" % (folder + name))
else:
    print("Successfully created the directory %s " % (folder + name))

# read the SMILES
smi = dict()
smi['smiles'], smi['decoded'] = load_decoded_results(file)
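# Hedged sketch of load_decoded_results, assuming the decoded-results file
# stores one molecule per line with the original SMILES first and the decoded
# attempts after it, all joined by the ";,;" separator used elsewhere in this
# repository. The real helper lives in _utils/utils.py and may differ.
def load_decoded_results_sketch(path):
    originals, decoded = [], []
    with open(path) as f:
        for line in f:
            parts = [p.strip() for p in line.strip().split(";,;") if p.strip()]
            if parts:
                originals.append(parts[0])
                decoded.append(parts[1:])
    return originals, decoded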
    result_list = Parallel(n_jobs=-1)(
        delayed(parse_many)(chunk[i:i + size], grammar)
        for i in range(0, len(chunk), size))
    return [_1 for _0 in result_list for _1 in _0]


import cPickle as cp
from tqdm import tqdm

if __name__ == '__main__':
    save_dir = cmd_args.save_dir
    fname = save_dir + (cmd_args.smiles_file.split('/')[-1]).split('.')[0] + '.cfg_dump'
    fout = open(fname, 'wb')
    grammar = parser.Grammar(cmd_args.grammar_file)

    smiles = []
    if cmd_args.smiles_file == 'qm9':
        smiles = readStr_qm9()
    elif cmd_args.smiles_file == 'zinc':
        smiles = read_zinc()

    for i in tqdm(range(len(smiles))):
        ts = parser.parse(smiles[i], grammar)
        assert isinstance(ts, list) and len(ts) == 1
        n = AnnotatedTree2MolTree(ts[0])
        cp.dump(n, fout, cp.HIGHEST_PROTOCOL)
    fout.close()
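# The .cfg_dump file can be read back by unpickling until EOF (a sketch,
# assuming the same cPickle module used for writing):
def load_cfg_dump_sketch(fname):
    import cPickle as cp
    trees = []
    with open(fname, 'rb') as fin:
        while True:
            try:
                trees.append(cp.load(fin))
            except EOFError:
                break
    return trees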
import numpy as np
from models.utils import many_one_hot
import h5py
import sys, os
sys.path.append('%s/../_utils' % os.path.dirname(os.path.realpath(__file__)))
from read_dataset import read_zinc

L = read_zinc()
chars = [
    'C', '(', ')', 'c', '1', '2', 'o', '=', 'O', 'N', '3', 'F', '[', '@', 'H',
    ']', 'n', '-', '#', 'S', 'l', '+', 's', 'B', 'r', '/', '4', '\\', '5',
    '6', '7', 'I', 'P', '8', ' '
]
DIM = len(chars)
count = 0
MAX_LEN = 120
OH = np.zeros((len(L), MAX_LEN, DIM))  # was hard-coded to 249456; len(L) is safer
for chem in L:
    indices = []
    for c in chem:
        indices.append(chars.index(c))
    # pad with the index of ' ' (the last entry of chars) up to MAX_LEN
    if len(indices) < MAX_LEN:
        indices.extend((MAX_LEN - len(indices)) * [DIM - 1])
    OH[count, :, :] = many_one_hot(np.array(indices), DIM)
    count = count + 1

h5f = h5py.File('data/zinc_str_dataset.h5', 'w')
h5f.create_dataset('data', data=OH)
h5f.create_dataset('chr', data=chars)
h5f.close()
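# Hedged sketch of models.utils.many_one_hot as used above: for an index
# vector of length MAX_LEN it is assumed to return a (MAX_LEN, DIM) matrix
# with a single 1 per row. Illustrative only; the real helper may differ.
def many_one_hot_sketch(indices, dim):
    out = np.zeros((len(indices), dim))
    out[np.arange(len(indices)), indices] = 1.0
    return out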
def main():
    torch.manual_seed(0)
    lg = rdkit.RDLogger.logger()
    lg.setLevel(rdkit.RDLogger.CRITICAL)

    parser = OptionParser()
    parser.add_option("-t", "--test", dest="test_path")
    parser.add_option("-v", "--vocab", dest="vocab_path")
    parser.add_option("-m", "--model", dest="model_path")
    parser.add_option("-w", "--hidden", dest="hidden_size", default=200)
    parser.add_option("-l", "--latent", dest="latent_size", default=56)
    parser.add_option("-d", "--depth", dest="depth", default=3)
    opts, args = parser.parse_args()

    vocab = [x.strip("\r\n ") for x in open(opts.vocab_path)]
    vocab = Vocab(vocab)

    hidden_size = int(opts.hidden_size)
    latent_size = int(opts.latent_size)
    depth = int(opts.depth)

    model = JTNNVAE(vocab, hidden_size, latent_size, depth)
    model.load_state_dict(torch.load(opts.model_path))
    model = model.cuda()

    dataset_name = opts.test_path
    result_file = dataset_name + "_decoded_results.txt"
    priors_file = dataset_name + "_decoded_priors.txt"
    generation_file = dataset_name + "_generation.txt"

    # read the dataset
    if dataset_name == "zinc":
        XTE = read_zinc()
    else:
        D = readStr_qm9()
        # drop molecules containing '.' (disconnected fragments)
        XTE = []
        for mol in D:
            if "." not in mol:
                XTE.append(mol)

    # reconstruction
    XTE = XTE[0:5000]
    XTE = filter(lambda x: len(x) > 1, XTE)  # remove SMILES made of a single character
    decoded_result = reconstruction(model, XTE, 20, 1)
    save_decoded_results(XTE, decoded_result, result_file)

    # prior
    # decoded_priors_witherrors = model.sample_prior_eval(True, 1000, 10)
    # decoded_priors = []
    # for i in decoded_priors_witherrors:
    #     decoded_priors.append(sanitize(i))
    # save_decoded_priors(decoded_priors, priors_file)

    # generation
    generation_witherrors = model.sample_prior_eval(True, 20000, 1)
    generation = []
    for i in generation_witherrors:
        generation.append(sanitize(i))
    save_decoded_priors(generation, generation_file)
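# Hedged sketch of the sanitize() helper used above, assuming it canonicalizes
# a decoded SMILES with RDKit and maps unparseable output to None. This is an
# assumption about the repository's helper, not its actual code.
def sanitize_sketch(smiles):
    from rdkit import Chem
    if smiles is None:
        return None
    mol = Chem.MolFromSmiles(smiles)
    return Chem.MolToSmiles(mol) if mol is not None else None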