def train_generator(self, moment_files):
    print('Training RNN')
    folder = 'vector_based/'
    # build latent descriptor vectors for the training molecules if absent
    if not os.path.exists(self.gen_data_dir + folder + 'vecs.csv'):
        print('Gen Vectors....')
        mols = pd.read_csv(self.gen_data_dir + 'mols.csv', header=0)
        mols = [x[0] for x in mols.values]
        get_latent_vecs(mols, self.gen_data_dir, folder + 'vecs.csv')
    # pretrain the RNN prior conditioned on the vectors
    mew, std = pretrain(self.gen_data_dir, 'Voc', folder + 'vecs.csv',
                        'input_mols_filtered.csv', folder + 'Prior.ckpt')
    ckpt_file = self.gen_data_dir + folder + 'Prior.ckpt'
    # save the normalization moments (mean and std) for later generation runs
    header = get_headings()
    np.savetxt(moment_files[0], np.array([mew]), header=','.join(header),
               delimiter=',', comments='', newline='\n')
    np.savetxt(moment_files[1], np.array([std]), header=','.join(header),
               delimiter=',', comments='', newline='\n')
    return ckpt_file
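# The moment files written above are single-row CSVs under the descriptor
# headings, so they can be read back and column-ordered exactly like the
# vectors themselves (generate() below does this inline). A minimal sketch
# of that round trip, relying on this module's existing pandas import and
# the repo's get_headings(); the helper name is ours, not the repo's:
def _load_moments_sketch(mew_file, std_file):
    mew = pd.read_csv(mew_file, header=0).reindex(columns=get_headings()).values
    std = pd.read_csv(std_file, header=0).reindex(columns=get_headings()).values
    return mew, std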
def __init__(self, voc, smi_file, vec_file):
    """
    Args:
        voc: Vocabulary used to encode the SMILES strings.
        smi_file (string): Path to a csv file of SMILES strings.
        vec_file (string): Path to a csv file of descriptor vectors,
            one row per SMILES string.
    """
    self.voc = voc
    self.smiles = pd.read_csv(smi_file, header=0, dtype=str).values
    self.smiles = [x[0] for x in self.smiles]
    # read descriptors
    data = pd.read_csv(vec_file, header=0)
    # look for missing data entries
    if data.isnull().values.any():
        raise ValueError(
            'Found nan in data, possible data missing in generation vectors.')
    # enforce the canonical heading order
    headings = get_headings()
    data = data.reindex(columns=headings)
    # reindexing fills any missing column with nan, so check again
    if data.isnull().values.any():
        raise ValueError(
            'Found nan in data, possible columns missing in generation vectors.')
    # calculate mew and std for normalization of the generation vectors
    self.vectors = data.values
    self.mew, self.std = get_moments(self.vectors)
    # catch any zeros which would give nan when normalizing
    self.std = np.array([x if x != 0 else 1.0 for x in self.std])
    self.vectors = (self.vectors - self.mew) / self.std
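# `get_moments` is imported from elsewhere in this repo. A minimal sketch of
# the behaviour its callers here rely on (per-column mean and standard
# deviation of a 2-D descriptor array), using the module's existing numpy
# import; the name `get_moments_sketch` is ours, not the repo's:
def get_moments_sketch(vectors):
    """Return the per-column mean and std of a 2-D array of descriptors."""
    vectors = np.asarray(vectors, dtype=np.float64)
    return np.mean(vectors, axis=0), np.std(vectors, axis=0)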
def discrim_to_gen(self):
    # float_bool marks which descriptor columns are real-valued (1) and
    # which must be rounded to integers (0)
    float_bool = get_float_bool(self.discrim_data_dir, 'float_bool.csv')
    all_rounded = []
    # prob_round is stochastic, so repeated passes over the same solution
    # can yield distinct integer-valued candidates; keep only unique ones
    for i in range(50):
        rounded = []
        for x, is_float in zip(self.x_solution, float_bool):
            if is_float == 1:
                rounded.append(x)
            else:
                rounded.append(prob_round(x))
        if rounded not in all_rounded:
            all_rounded.append(rounded)
    all_rounded = np.array(all_rounded)
    header = get_headings()
    np.savetxt(self.discrim_data_dir + 'rounded.csv', all_rounded,
               header=','.join(header), delimiter=',', comments='',
               newline='\n')
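# `prob_round` also comes from elsewhere in the repo. Its use above suggests
# stochastic rounding (round up with probability equal to the fractional
# part), which is why 50 passes over one solution vector sample different
# integer neighbours. A minimal sketch under that assumption; the name and
# implementation are ours:
def prob_round_sketch(x):
    """Round x up with probability equal to its fractional part."""
    floor = np.floor(x)
    return float(floor) + float(np.random.random() < (x - floor))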
def k_near_search(self, vector, lib_vec_file, num_neighbours=1):
    # vector is the vector of the compound whose neighbours we want
    # lib_vec_file is a library of compounds in vector form which will be
    # searched for neighbours, one chunk at a time
    all_neigh_dist = []
    all_neigh_index = []
    all_neigh_vec = []
    chunksize = 100000
    print('Looking for nearest neighbour:')
    for i, chunk in enumerate(
            pd.read_csv(lib_vec_file, chunksize=chunksize, header=0)):
        print('Evaluating chunk {} of length {}'.format(i, len(chunk)))
        # enforce the canonical heading order
        chunk = chunk.reindex(columns=get_headings())
        training_data = chunk.values
        tree = spatial.KDTree(training_data)
        ans = tree.query(np.array(vector), k=num_neighbours)
        all_neigh_dist.append(ans[0])
        # offset indices so they refer to rows of the full file, not the chunk
        all_neigh_index.append(ans[1] + (i * chunksize))
        all_neigh_vec.append(training_data[ans[1]])
    # sort the per-chunk candidates by distance and return the closest
    result = [[x, y] for _, x, y in sorted(
        zip(all_neigh_dist, all_neigh_vec, all_neigh_index),
        key=lambda pair: pair[0])]
    return result[0]
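# Standalone illustration of the chunked nearest-neighbour pattern used by
# k_near_search, on synthetic data (sizes and seed are arbitrary); it uses
# the module's existing numpy and scipy.spatial imports:
def _demo_chunked_knn(chunksize=250):
    rng = np.random.default_rng(0)
    library = rng.random((1000, 8))   # stand-in for the lib_vec_file rows
    query = rng.random(8)             # stand-in for the query vector
    best = None
    for i in range(0, len(library), chunksize):
        chunk = library[i:i + chunksize]
        dist, idx = spatial.KDTree(chunk).query(query, k=1)
        # offset the chunk-local index to a library-wide index
        if best is None or dist < best[0]:
            best = (dist, i + idx, chunk[idx])
    return best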
def generate(self, data_dir, ckpt_file, mode='vectors', batch_size=1,
             samples=50, moments=None):
    modes = ['reinvent', 'vectors']
    network_size = 398
    if mode == 'reinvent':
        # a single all-zero conditioning vector, i.e. unconditioned sampling
        data = torch.zeros(1, network_size)
    elif mode == 'vectors':
        vec_file = self.gen_data_dir + 'vector_based/vecs.csv'
        if moments is None:
            # calculate the mew and std used to normalize generation data
            data = pd.read_csv(vec_file, header=0)
            # enforce the canonical heading order
            data = data.reindex(columns=get_headings())
            data = data.values
            mew, std = get_moments(data)
            # catch any zeros which would give nan when normalizing
            std = np.array([x if x != 0 else 1.0 for x in std])
        else:
            # read mew and std from file, which saves some time and memory
            mew = pd.read_csv(moments[0], header=0)
            std = pd.read_csv(moments[1], header=0)
            mew = mew.reindex(columns=get_headings()).values
            std = std.reindex(columns=get_headings()).values
        vectors = pd.read_csv(self.discrim_data_dir + 'rounded.csv', header=0)
        vectors = vectors.reindex(columns=get_headings()).values
        vectors = (vectors - mew) / std
        # replace data with the normalized vectors
        data = torch.FloatTensor(vectors)
    else:
        raise ValueError('Supported generation modes are {}'.format(modes))

    voc = Vocabulary(init_from_file=self.gen_data_dir + 'Voc')
    Prior = RNN(voc, network_size)
    if torch.cuda.is_available():
        Prior.rnn.load_state_dict(torch.load(ckpt_file))
    else:
        Prior.rnn.load_state_dict(
            torch.load(ckpt_file, map_location=lambda storage, loc: storage))

    all_smi = set()
    valid = 0
    with open('./output_smi.csv', 'w') as file:
        for j, test_vec in enumerate(data):
            test_vec = test_vec.float()
            for i in range(samples):
                seqs, prior_likelihood, entropy = Prior.sample(
                    batch_size, test_vec)
                smiles = seq_to_smiles(seqs, voc)[0]
                # count and keep only SMILES that RDKit can parse
                if Chem.MolFromSmiles(smiles):
                    valid += 1
                    all_smi.add(smiles)
                    file.write(smiles + ',{}\n'.format(j))
    all_smi = list(all_smi)
    print("\n{:>4.1f}% valid SMILES".format(
        100 * (valid / (samples * len(data)))))
    return all_smi
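# The validity bookkeeping above hinges on RDKit's parser returning None for
# invalid SMILES. The same filter as a standalone helper (the name is ours),
# using the module's existing rdkit import:
def _valid_fraction(smiles_list):
    """Fraction of SMILES strings that RDKit can parse."""
    valid = [s for s in smiles_list if Chem.MolFromSmiles(s)]
    return len(valid) / len(smiles_list)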
def __init__(self):
    # define file locations
    self.discrim_data_dir = './discriminator/data/'
    self.gen_data_dir = './generator/data/'
    self.gen_ckpt_file = self.gen_data_dir + 'vector_based/Prior.ckpt'
    self.target = 6.1
    self.mixing = 0.0

    # build a discriminative model and use finite differences to solve it
    # for a set of inputs predicted to give the target property
    self.y_property, self.x_solution = self.discrim(
        target_property=self.target, train=False)
    print('Target activity {}, optimized solution achieved activity {}'.
          format(self.target, self.y_property[0]))

    # convert the solution of the discriminative model to a form that can
    # be fed into the generative model
    self.discrim_to_gen()

    mew = self.gen_data_dir + 'mew.dat'
    std = self.gen_data_dir + 'std.dat'
    moments = [mew, std]

    # train a generative model if needed
    train_RNN = False
    if train_RNN:
        self.gen_ckpt_file = self.train_generator(moments)

    # use the generative model to produce SMILES, taking the output of the
    # discriminative model as input
    print('Generating SMILES from proposed vectors using RNN weights at {}'.
          format(self.gen_ckpt_file))
    if not (os.path.exists(mew) and os.path.exists(std)):
        print('mew and std used to normalize generation input not found at '
              '{}, {}'.format(mew, std))
        moments = None
    generated_smis = self.generate(self.gen_data_dir, self.gen_ckpt_file,
                                   moments=moments)

    # convert the SMILES back into vectors to be scored by the
    # discriminative model
    get_latent_vecs(generated_smis, self.discrim_data_dir, 'output_vecs.csv')
    try:
        generated_vecs = pd.read_csv(self.discrim_data_dir +
                                     'output_vecs.csv',
                                     header=0, dtype=np.float64)
        generated_vecs = generated_vecs.reindex(
            columns=get_headings()).values
    except Exception:
        raise ValueError('Try deleting {}'.format(self.discrim_data_dir +
                                                  'output_vecs.csv'))

    # test whether the generated SMILES lie close to the generation vectors
    # self.test_vector_msd(generated_vecs)
    predict = self.predict_with_discrim(generated_vecs)
    np.savetxt('./a_{}_m_{}.dat'.format(self.target, self.mixing), predict)
    print('Average activity {} and std {}'.format(np.average(predict),
                                                  np.std(predict)))
    thresh_hold = 7.0
    thresh = [bool(x >= thresh_hold) for x in predict]
    smi_above_thresh = [[x, y] for x, y, z in
                        zip(generated_smis, predict, thresh) if z]
    print('Number of compounds created = {}'.format(len(thresh)))
    print('Percent of compounds with activity above {} = {}'.format(
        thresh_hold, 100 * (thresh.count(True) / len(thresh))))
    for x in generated_smis:
        print(x)
def test_vector_msd(self, generated_vectors):
    # mean squared deviation between the first proposed generation vector
    # and each vector recovered from the generated SMILES
    vectors = pd.read_csv(self.discrim_data_dir + 'rounded.csv', header=0)
    vectors = vectors.reindex(columns=get_headings()).values
    vec = vectors[0]
    msd = [np.average((vec - x)**2) for x in generated_vectors]
    print(msd)
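# Hypothetical entry point, assuming the methods above live on the ODO class
# referenced throughout, whose __init__ drives the whole
# discriminator-to-generator pipeline:
if __name__ == '__main__':
    ODO()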