def check_drigoni(download_path):
    def conv(smile):
        return Chem.MolToSmiles(Chem.MolFromSmiles(smile))

    print('reading data...')
    raw_data = train_valid_split(download_path)
    data = [conv(data_item['smiles']) for data_item in raw_data['valid']]
    data.extend([conv(data_item['smiles']) for data_item in raw_data['train']])
    print('reading dataset')
    dataset = readStr_qm9()
    print('len dataset: ' + str(len(dataset)) + " len data: " + str(len(data)))

    # count how many canonicalized train/valid SMILES are missing from the dataset
    count = 0
    saw = 0
    for i, data_item in enumerate(data):
        if data_item not in dataset:
            count += 1
        saw += 1
    print('end: ' + str(count) + " saw = " + str(saw))

    # and, symmetrically, how many dataset SMILES are missing from train/valid
    count = 0
    saw = 0
    for i, data_item in enumerate(dataset):
        if data_item not in data:
            count += 1
        saw += 1
    print('end: ' + str(count) + " saw = " + str(saw))
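# Hedged performance note (not in the original file): both membership loops test
# list membership, which is O(len(data) * len(dataset)) overall; a set gives the
# same counts (assuming no duplicates), e.g.:
#
#   dataset_set = set(dataset)
#   count = sum(1 for d in data if d not in dataset_set)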
def createSubSetQM9(n=5000):
    from read_dataset import readStr_qm9
    D = readStr_qm9()
    np.random.shuffle(D)
    subSet = D[:n]
    # write the random subset out, one SMILES per line
    f = open("qm9_sub" + str(n) + ".smi", "w")
    for l in subSet:
        f.write(l)
        f.write("\n")
    f.close()
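# Hedged usage sketch (not in the original repo): read back the subset written
# by createSubSetQM9; the helper name is hypothetical, the file format matches
# the one-SMILES-per-line convention above.
def load_qm9_subset(n=5000):
    with open("qm9_sub" + str(n) + ".smi") as f:
        return [line.strip() for line in f if line.strip()]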
def main():
    decoded_file = GRAMMAR_WEIGHTS.split(".")[0] + "_decRes.txt"
    priors_file = GRAMMAR_WEIGHTS.split(".")[0] + "_priorsRes.txt"
    generation_file = GRAMMAR_WEIGHTS.split(".")[0] + "_generationRes.txt"

    grammar_model = molecule_vae.Qm9GrammarModel(GRAMMAR_WEIGHTS)
    XTE = readStr_qm9()
    XTE = XTE[0:5000]  # remember to comment/uncomment the corresponding line in molecule_vae

    decoded_result = reconstruction(grammar_model, XTE)
    save_decoded_results(XTE, decoded_result, decoded_file)

    # decoded_priors = prior(grammar_model)
    # save_decoded_priors(decoded_priors, priors_file)

    decoded_generation = generation(grammar_model)
    save_decoded_priors(decoded_generation, generation_file)
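# Hedged note (not in the original file): the three output names are derived
# from the weights filename, e.g. a hypothetical GRAMMAR_WEIGHTS of
# "qm9_weights.hdf5" yields "qm9_weights_decRes.txt",
# "qm9_weights_priorsRes.txt" and "qm9_weights_generationRes.txt".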
def main():
    from att_model_proxy import AttMolProxy as ProxyModel
    from att_model_proxy import cmd_args

    # load the model and compute the decoded results
    model = ProxyModel()

    # where to save the results
    decoded_file = cmd_args.save_dir + '/decoded_results.txt'

    # read the SMILES test set
    if cmd_args.smiles_file == 'qm9':
        smiles_list = readStr_qm9()
    elif cmd_args.smiles_file == 'zinc':
        smiles_list = read_zinc()
    XTE = smiles_list[0:nb_smiles]  # nb_smiles is defined elsewhere in the script

    decoded_result = reconstruct(model, XTE)
    decoded_result = np.array(decoded_result)
    save_decoded_results(XTE, decoded_result, decoded_file)
def train_valid_split_drigoni():
    print('reading dataset')
    dataset = readStr_qm9()[5000:]
    # sample 10% of the indices without replacement (np.random.randint could
    # draw duplicates, which would make the validation set smaller than 10%)
    valid_idx = np.random.choice(len(dataset), size=int(round(len(dataset) * 0.1)), replace=False)
    raw_data = {'train': [], 'valid': []}

    # split into train/valid, annotating each molecule with its QED score
    file_count = 0
    for i, smiles in enumerate(dataset):
        val = QED.qed(Chem.MolFromSmiles(smiles))
        if i not in valid_idx:
            raw_data['train'].append({'smiles': smiles, 'QED': val})
        else:
            raw_data['valid'].append({'smiles': smiles, 'QED': val})
        file_count += 1
        if file_count % 2000 == 0:
            print('finished reading: %d' % file_count, end='\r')
    return raw_data
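# Hedged sanity check (not in the original repo): report the sizes produced by
# train_valid_split_drigoni; the function name is hypothetical.
def report_split_sizes():
    raw_data = train_valid_split_drigoni()
    n_train = len(raw_data['train'])
    n_valid = len(raw_data['valid'])
    print('train: %d, valid: %d (%.1f%% valid)'
          % (n_train, n_valid, 100.0 * n_valid / (n_train + n_valid)))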
def qm9StrProve():
    L = readStr_qm9()
    MAX = 120
    count = 0
    countDot = 0
    countAst = 0
    nMolMIn9 = 0
    nMaxAtoms = 0
    nMinAtom = 100
    nMol1 = 0
    for s in L:
        if len(s) > MAX:
            count = count + 1
        if "." in s:
            countDot = countDot + 1
        if "*" in s:
            countAst = countAst + 1
        m = Chem.MolFromSmiles(s)
        atom = m.GetNumAtoms()
        if atom <= 9:
            nMolMIn9 = nMolMIn9 + 1
        # track max and min independently (an elif would skip the min update
        # whenever a new max is found)
        if atom > nMaxAtoms:
            nMaxAtoms = atom
        if atom < nMinAtom:
            nMinAtom = atom
        if atom == 1:
            nMol1 = nMol1 + 1
    print("Number of molecules with <= 9 atoms: {}".format(nMolMIn9))
    print("Maximum number of atoms: {}".format(nMaxAtoms))
    print("Minimum number of atoms: {}".format(nMinAtom))
    print("Number of molecules longer than 120 characters: {}".format(count))
    print("Number of molecules containing '.': {}".format(countDot))
    print("Number of molecules containing '*': {}".format(countAst))
    print("Number of molecules read: {}".format(len(L)))
    print("Number of molecules consisting of a single atom: {}".format(nMol1))
    print("-------------- END ---------------")
    all_onehot = np.zeros((len(L), cmd_args.max_decode_steps, DECISION_DIM), dtype=np.byte)
    all_masks = np.zeros((len(L), cmd_args.max_decode_steps, DECISION_DIM), dtype=np.byte)

    # stitch the per-chunk one-hot/mask pairs back into the full tensors
    for start, b_pair in zip(range(0, len(L), chunk_size), list_binary):
        all_onehot[start:start + chunk_size, :, :] = b_pair[0]
        all_masks[start:start + chunk_size, :, :] = b_pair[1]

    # f_smiles = '.'.join(cmd_args.smiles_file.split('/')[-1].split('.')[0:-1])
    f_smiles = cmd_args.smiles_file
    out_file = '%s/%s-%d.h5' % (cmd_args.save_dir, f_smiles, cmd_args.skip_deter)
    h5f = h5py.File(out_file, 'w')
    h5f.create_dataset('x', data=all_onehot)
    h5f.create_dataset('masks', data=all_masks)
    h5f.close()


if __name__ == '__main__':
    smiles_list = []
    if cmd_args.smiles_file == 'qm9':
        smiles_list = readStr_qm9()
    elif cmd_args.smiles_file == 'zinc':
        smiles_list = read_zinc()
    # the first 5000 molecules are held out as the test set
    train_dataset = smiles_list[5000:]
    run_job(train_dataset)
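# Hedged helper (not in the original script): reload a dump written by the code
# above and report tensor shapes; the function name and h5_path argument are
# hypothetical, the dataset names 'x' and 'masks' match the create_dataset calls.
def inspect_onehot_dump(h5_path):
    h5f = h5py.File(h5_path, 'r')
    x = h5f['x'][:]
    masks = h5f['masks'][:]
    h5f.close()
    print('x: %s, masks: %s' % (str(x.shape), str(masks.shape)))
    return x, masks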
from plot_utils import *
from read_dataset import readStr_qm9, read_zinc
from smile_metrics import MolecularMetrics as mm
from utils import save_scores_bias, load_decoded_results, calc_perc

folder = "bias/"

# read parameters
name = sys.argv[1]
file = sys.argv[2]
dataset = sys.argv[3]

if dataset == "zinc":
    trainingSet = read_zinc()
else:
    trainingSet = readStr_qm9()
    trainingSet = trainingSet[5000:]

# create the output folder
try:
    os.makedirs(folder + name)
except OSError:
    print("Creation of the directory %s failed" % (folder + name))
else:
    print("Successfully created the directory %s " % (folder + name))

# read SMILES
smi = dict()
smi['smiles'], smi['decoded'] = load_decoded_results(file)
smi['valid'] = []
for line in smi['decoded']:
from __future__ import print_function

import nltk
import qm9_grammar
import numpy as np
import h5py
import molecule_vae
import sys, os

sys.path.append('%s/../_utils' % os.path.dirname(os.path.realpath(__file__)))
from read_dataset import readStr_qm9

MAX_LEN = 277
L = readStr_qm9()
NCHARS = len(qm9_grammar.GCFG.productions())


def to_one_hot(smiles):
    """ Encode a list of smiles strings to one-hot vectors """
    assert type(smiles) == list
    prod_map = {}
    for ix, prod in enumerate(qm9_grammar.GCFG.productions()):
        prod_map[prod] = ix
    tokenize = molecule_vae.get_zinc_tokenizer(qm9_grammar.GCFG)
    tokens = map(tokenize, smiles)
    parser = nltk.ChartParser(qm9_grammar.GCFG)
    parse_trees = [parser.parse(t).next() for t in tokens]
    productions_seq = [tree.productions() for tree in parse_trees]
    indices = [np.array([prod_map[prod] for prod in entry], dtype=int)
               for entry in productions_seq]
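# Hedged illustration (not in the original file): map a single SMILES string to
# its grammar-production index sequence, the per-molecule intermediate that
# to_one_hot builds for a whole batch; the function name is hypothetical.
def smiles_to_indices(smile):
    prod_map = {prod: ix for ix, prod in enumerate(qm9_grammar.GCFG.productions())}
    tokenize = molecule_vae.get_zinc_tokenizer(qm9_grammar.GCFG)
    parse_tree = nltk.ChartParser(qm9_grammar.GCFG).parse(tokenize(smile)).next()  # Python 2 iterator, as above
    return np.array([prod_map[prod] for prod in parse_tree.productions()], dtype=int)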
    result_list = Parallel(n_jobs=-1)(
        delayed(parse_many)(chunk[i:i + size], grammar)
        for i in range(0, len(chunk), size))
    return [_1 for _0 in result_list for _1 in _0]


import cPickle as cp
from tqdm import tqdm

if __name__ == '__main__':
    save_dir = cmd_args.save_dir
    fname = save_dir + (cmd_args.smiles_file.split('/')[-1]).split('.')[0] + '.cfg_dump'
    fout = open(fname, 'wb')
    grammar = parser.Grammar(cmd_args.grammar_file)

    smiles = []
    if cmd_args.smiles_file == 'qm9':
        smiles = readStr_qm9()
    elif cmd_args.smiles_file == 'zinc':
        smiles = read_zinc()

    # parse each SMILES with the grammar and pickle the annotated parse trees
    for i in tqdm(range(len(smiles))):
        ts = parser.parse(smiles[i], grammar)
        assert isinstance(ts, list) and len(ts) == 1
        n = AnnotatedTree2MolTree(ts[0])
        cp.dump(n, fout, cp.HIGHEST_PROTOCOL)
    fout.close()
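# Hedged read-back sketch (not in the original script): the .cfg_dump file is a
# stream of pickled trees, one per molecule, so it can be re-read until EOF; the
# function name is hypothetical.
def load_cfg_dump(fname):
    trees = []
    with open(fname, 'rb') as fin:
        while True:
            try:
                trees.append(cp.load(fin))
            except EOFError:
                break
    return trees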
def main():
    torch.manual_seed(0)
    lg = rdkit.RDLogger.logger()
    lg.setLevel(rdkit.RDLogger.CRITICAL)

    parser = OptionParser()
    parser.add_option("-t", "--test", dest="test_path")
    parser.add_option("-v", "--vocab", dest="vocab_path")
    parser.add_option("-m", "--model", dest="model_path")
    parser.add_option("-w", "--hidden", dest="hidden_size", default=200)
    parser.add_option("-l", "--latent", dest="latent_size", default=56)
    parser.add_option("-d", "--depth", dest="depth", default=3)
    opts, args = parser.parse_args()

    vocab = [x.strip("\r\n ") for x in open(opts.vocab_path)]
    vocab = Vocab(vocab)

    hidden_size = int(opts.hidden_size)
    latent_size = int(opts.latent_size)
    depth = int(opts.depth)

    model = JTNNVAE(vocab, hidden_size, latent_size, depth)
    model.load_state_dict(torch.load(opts.model_path))
    model = model.cuda()

    dataset_name = opts.test_path
    result_file = dataset_name + "_decoded_results.txt"
    priors_file = dataset_name + "_decoded_priors.txt"
    generation_file = dataset_name + "_generation.txt"

    # read the dataset
    if dataset_name == "zinc":
        XTE = read_zinc()
    else:
        # drop molecules containing '.' (disconnected fragments)
        D = readStr_qm9()
        XTE = []
        for mol in D:
            if "." not in mol:
                XTE.append(mol)

    # reconstruction
    XTE = XTE[0:5000]
    XTE = filter(lambda x: len(x) > 1, XTE)  # remove SMILES consisting of a single character
    decoded_result = reconstruction(model, XTE, 20, 1)
    save_decoded_results(XTE, decoded_result, result_file)

    # prior
    # decoded_priors_witherrors = model.sample_prior_eval(True, 1000, 10)
    # decoded_priors = []
    # for i in decoded_priors_witherrors:
    #     decoded_priors.append(sanitize(i))
    # save_decoded_priors(decoded_priors, priors_file)

    # generation
    generation_witherrors = model.sample_prior_eval(True, 20000, 1)
    generation = []
    for i in generation_witherrors:
        generation.append(sanitize(i))
    save_decoded_priors(generation, generation_file)
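# Hedged usage note (not in the original file): given the options defined above,
# a typical invocation would look like the following; the script and file names
# are hypothetical.
#
#   python reconstruct.py -t qm9 -v vocab.txt -m model.iter-x -w 200 -l 56 -d 3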