def __init__(self, experiment_name):
    '''Read the experiment configuration and load the SMILES data set

    :param experiment_name: name of the experiment; the settings are read from
                            '../experiments/<experiment_name>.ini'
    :raises FileNotFoundError: if the configuration file cannot be read, or if neither
                               '../data/<data>.csv' nor '../data/<data>.tar.xz' exists
    '''
    # Read parameter used during training
    config_path = '../experiments/' + experiment_name + '.ini'
    self._config = configparser.ConfigParser()
    # ConfigParser.read() skips files it cannot open and returns the list of files it parsed,
    # so an empty result means the experiment file is missing
    if not self._config.read(config_path):
        raise FileNotFoundError('Experiment configuration not found: ' + config_path)

    self._model_type = self._config['MODEL']['model']
    self._experiment_name = experiment_name
    self._file_name = self._config['DATA']['data']
    self._encoding_size = int(self._config['DATA']['encoding_size'])
    self._molecular_size = int(self._config['DATA']['molecular_size'])
    self._epochs = int(self._config['TRAINING']['epochs'])
    self._n_folds = int(self._config['TRAINING']['n_folds'])
    self._learning_rate = float(self._config['TRAINING']['learning_rate'])
    self._batch_size = int(self._config['TRAINING']['batch_size'])
    self._samples = int(self._config['EVALUATION']['samples'])
    self._T = float(self._config['EVALUATION']['temp'])
    self._starting_token = self._config['EVALUATION']['starting_token']

    # Load the first column of the data file (plain CSV preferred, xz archive as fallback)
    data_path = '../data/' + self._file_name
    if os.path.isfile(data_path + '.csv'):
        self._data = pd.read_csv(data_path + '.csv', header=None).values[:, 0]
    elif os.path.isfile(data_path + '.tar.xz'):
        # Skip first line since empty and last line since nan
        self._data = pd.read_csv(data_path + '.tar.xz', compression='xz', header=None).values[1:-1, 0]
    else:
        # Without this branch self._data would stay undefined and the loop below would fail
        # with an unrelated AttributeError
        raise FileNotFoundError('No data file found: ' + data_path + '.csv or ' + data_path + '.tar.xz')

    # Clean data from start, end and padding token
    for i, mol_dat in enumerate(self._data):
        self._data[i] = clean_molecule(mol_dat, self._model_type)
def sample(self, N=100, stor_dir='../evaluation', T=0.7, fold=(1,), epoch=(9,), valid=True, novel=True, unique=True,
           write_csv=True):
    '''Sample from a model where the number of novel valid unique molecules is fixed

    For every (fold, epoch) pair the stored model is loaded and molecules are drawn until N of them
    passed all enabled filters. NOTE: there is no upper bound on the number of draws, so the call
    does not return if the model cannot produce N accepted molecules.

    :param N:           number of accepted samples per (fold, epoch) pair
    :param stor_dir:    directory where the models are read from and the generated SMILES are saved
    :param T:           Temperature
    :param fold:        Folds to use for sampling (any iterable)
    :param epoch:       Epochs to use for sampling (any iterable)
    :param valid:       If True, only accept valid SMILES
    :param novel:       If True, only accept novel SMILES (not contained in self._data)
    :param unique:      If True, only accept unique SMILES
    :param write_csv:   If True, the generated SMILES are written in stor_dir
    :return: res_molecules: list with one list of generated SMILES per (fold, epoch) pair
    '''
    res_molecules = []
    exp_dir = stor_dir + '/' + self._experiment_name

    # Hash the reference SMILES once so that every novelty check is O(1) instead of a full scan
    # (assumes self._data holds hashable SMILES strings - TODO confirm against the constructor)
    known = set(np.asarray(self._data).ravel().tolist()) if novel else set()

    print('Sampling: started')
    for f in fold:
        for e in epoch:
            self._model.build(exp_dir + '/models/model_fold_' + str(f) + '_epochs_' + str(e))

            new_molecules = []
            # Mirror of new_molecules used only for fast uniqueness checks
            seen = set()
            while len(new_molecules) < N:
                new_mol = self._encoder.decode(self._model.sample(self._starting_token, T))

                # Remove remains from generation
                new_mol = clean_molecule(new_mol[0], self._model_type)

                # If not valid, get new molecule
                if valid and not check_valid(new_mol):
                    continue

                # If not unique, get new molecule
                if unique and new_mol in seen:
                    continue

                # If not novel, get new molecule
                if novel and new_mol in known:
                    continue

                # If all conditions checked, add new molecule
                new_molecules.append(new_mol)
                seen.add(new_mol)

            # Prepare name for file
            name = 'molecules_fold_' + str(f) + '_epochs_' + str(e) + '_T_' + str(T) + '_N_' + str(N) + '.csv'
            if unique:
                name = 'unique_' + name
            if valid:
                name = 'valid_' + name
            if novel:
                name = 'novel_' + name

            # Store final molecules
            if write_csv:
                # exist_ok avoids the race between checking for and creating the directory
                os.makedirs(exp_dir + '/molecules/', exist_ok=True)
                mol = np.array(new_molecules).reshape(-1)
                pd.DataFrame(mol).to_csv(exp_dir + '/molecules/' + name, header=None)

            res_molecules.append(new_molecules)

    print('Sampling: done')
    return res_molecules
def fine_tuning(self, stor_dir='../evaluation/', restart=False):
    '''Perform fine-tuning and store statistic

    NOTE: Directory should be prepared with the correct name and model
    NOTE: No validation is performed here. After every epoch self._samples molecules are sampled
          and stored; to sample a fixed number of filtered molecules the sampler should be used
    NOTE: For NADE with random generation self._data is replaced by its augmented columns,
          i.e. the instance data is modified by this call

    :param stor_dir:    directory to store data
    :param restart:     If True, epochs for which model, molecules and statistic already exist are
                        skipped (the stored model is loaded instead). If the statistic file is
                        missing or empty, training starts from the first epoch
    :return:
    '''
    exp_dir = stor_dir + '/' + self._experiment_name

    # Create directories (exist_ok avoids the race between checking and creating)
    os.makedirs(exp_dir + '/models', exist_ok=True)
    os.makedirs(exp_dir + '/statistic', exist_ok=True)
    os.makedirs(exp_dir + '/molecules', exist_ok=True)

    # Compute labels
    label = np.argmax(self._data, axis=-1).astype(int)

    # Special preprocessing in the case of NADE random
    if self._model_type == 'NADE' and self._generation == 'random':
        # First column stores correct SMILES and second column stores SMILES with missing values
        label = np.argmax(self._data[:, 0], axis=-1).astype(int)
        aug = self._data.shape[1] - 1
        label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
        self._data = self._data[:, 1:]

    # Build model
    self._model.build(exp_dir + '/' + self._start_model)

    # Store total Statistics
    tot_stat = []

    # only single fold
    fold = 1
    stat_path = exp_dir + '/statistic/stat_fold_' + str(fold) + '.csv'

    # Read existing statistic once; it is not rewritten while epochs are being skipped
    prev_stat = None
    if restart:
        try:
            prev_stat = pd.read_csv(stat_path, header=None).to_numpy()
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # Nothing was stored by a previous run (e.g. interrupted during the first epoch),
            # so there is no completed epoch to resume from
            print('Restart: no stored statistic found, continue with normal training')
            restart = False

    for i in range(self._epochs):
        print('Fold:', fold)
        print('Epoch:', i)

        model_path = exp_dir + '/models/model_fold_' + str(fold) + '_epochs_' + str(i)

        if restart:
            # Check if current epoch is successfully completed else continue with normal training
            if check_model(self._model_type, self._experiment_name, stor_dir, fold, i) and check_molecules(
                    self._experiment_name, stor_dir, fold, i) and prev_stat.shape[0] > i:
                # Load model
                self._model.build(model_path)

                # Fill statistic list (first column of the file is the index written by to_csv)
                tot_stat.append(prev_stat[i, 1:].reshape(1, -1).tolist())

                # Skip this epoch
                continue

            # First incomplete epoch reached: train normally from here on
            restart = False

        # Train model (Data reshaped from (N_samples, N_augmentation, molecular_size, encoding_size)
        # to  (all_SMILES, molecular_size, encoding_size))
        statistic = self._model.train(self._data.reshape(-1, self._molecular_size, self._encoding_size),
                                      label.reshape(-1, self._molecular_size), epochs=1,
                                      batch_size=self._batch_size)
        tot_stat.append(statistic.tolist())

        # Store model
        self._model.save(model_path)

        # Sample new molecules
        new_molecules = np.array([
            clean_molecule(self._encoder.decode(self._model.sample(self._starting_token, self._T))[0],
                           self._model_type)
            for _ in range(self._samples)])

        # Store new molecules
        pd.DataFrame(new_molecules).to_csv(
            exp_dir + '/molecules/molecule_fold_' + str(fold) + '_epochs_' + str(i) + '.csv', header=None)

        # Store statistic (one row per epoch finished so far)
        store_stat = np.array(tot_stat).reshape(i + 1, -1)
        pd.DataFrame(store_stat).to_csv(stat_path, header=None)
def cross_validation(self, stor_dir='../evaluation/', restart=False):
    '''Perform cross-validation and store data

    For every fold a fresh model is built and trained epoch by epoch. After each epoch the model,
    the validation loss, self._samples sampled molecules and the training statistic are stored.
    NOTE: For NADE with random generation self._data is replaced by its augmented columns,
          i.e. the instance data is modified by this call

    :param stor_dir:    directory to store data
    :param restart:     If True, epochs for which model, molecules, validation and statistic already
                        exist are skipped (the stored model is loaded instead). If the files of a
                        fold are missing or empty, normal training continues from that fold on
    :return:
    '''
    exp_dir = stor_dir + '/' + self._experiment_name

    # Create directories (exist_ok avoids the race between checking and creating)
    os.makedirs(exp_dir + '/models', exist_ok=True)
    os.makedirs(exp_dir + '/molecules', exist_ok=True)
    os.makedirs(exp_dir + '/statistic', exist_ok=True)
    os.makedirs(exp_dir + '/validation', exist_ok=True)

    self._kf = KFold(n_splits=self._n_folds, shuffle=True, random_state=2)

    # Count iterations
    fold = 0

    # Compute labels
    label = np.argmax(self._data, axis=-1).astype(int)

    # Special preprocessing in the case of NADE
    if (self._model_type == 'NADE') and self._generation == 'random':
        # First column stores correct SMILES and second column stores SMILES with missing values
        label = np.argmax(self._data[:, 0], axis=-1).astype(int)
        aug = self._data.shape[1] - 1
        label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
        self._data = self._data[:, 1:]

    # Split data into train and test data
    for train, test in self._kf.split(self._data):

        # Shuffle index within test and train set
        np.random.shuffle(train)
        np.random.shuffle(test)

        fold += 1
        val_path = exp_dir + '/validation/val_fold_' + str(fold) + '.csv'
        stat_path = exp_dir + '/statistic/stat_fold_' + str(fold) + '.csv'

        self._model.build()

        # Store total statistics
        tot_stat = []

        # Store validation loss
        tot_loss = []

        # Read the stored results of this fold once; they are not rewritten while skipping epochs
        prev_val = None
        prev_stat = None
        if restart:
            try:
                prev_val = pd.read_csv(val_path, header=None).to_numpy()
                prev_stat = pd.read_csv(stat_path, header=None).to_numpy()
            except (FileNotFoundError, pd.errors.EmptyDataError):
                # The previous run never finished an epoch of this fold, so nothing can be resumed
                print('Restart: no stored results for fold ' + str(fold) + ', continue with normal training')
                restart = False

        for i in range(self._epochs):
            print('Fold:', fold)
            print('Epoch:', i)

            model_path = exp_dir + '/models/model_fold_' + str(fold) + '_epochs_' + str(i)

            if restart:
                # Check if current epoch is successfully completed else continue with normal training
                if check_model(self._model_type, self._experiment_name, stor_dir, fold, i) and check_molecules(
                        self._experiment_name, stor_dir, fold, i) and prev_val.shape[0] > i and \
                        prev_stat.shape[0] > i:
                    # Load model
                    self._model.build(model_path)

                    # Fill statistic and loss list (first column of the files is the to_csv index)
                    tot_stat.append(prev_stat[i, 1:].reshape(1, -1).tolist())
                    tot_loss.append(prev_val[i, 1])

                    # Skip this epoch
                    continue

                # First incomplete epoch reached: train normally from here on (also for later folds)
                restart = False

            # Train model (Data reshaped from (N_samples, N_augmentation, molecular_size, encoding_size)
            # to  (all_SMILES, molecular_size, encoding_size))
            statistic = self._model.train(
                self._data[train].reshape(-1, self._molecular_size, self._encoding_size),
                label[train].reshape(-1, self._molecular_size), epochs=1, batch_size=self._batch_size)
            tot_stat.append(statistic.tolist())

            # Store model
            self._model.save(model_path)

            # Test model on validation set
            tot_loss.append(
                self._model.validate(self._data[test].reshape(-1, self._molecular_size, self._encoding_size),
                                     label[test].reshape(-1, self._molecular_size)))

            # Sample new molecules
            new_molecules = np.array([
                clean_molecule(self._encoder.decode(self._model.sample(self._starting_token, self._T))[0],
                               self._model_type)
                for _ in range(self._samples)])

            # Store new molecules
            pd.DataFrame(new_molecules).to_csv(
                exp_dir + '/molecules/molecule_fold_' + str(fold) + '_epochs_' + str(i) + '.csv', header=None)

            # Store statistic (one row per epoch finished so far)
            store_stat = np.array(tot_stat).reshape(i + 1, -1)
            pd.DataFrame(store_stat).to_csv(stat_path, header=None)

            # Store validation data
            pd.DataFrame(np.array(tot_loss).reshape(-1, 1)).to_csv(val_path, header=None)
def eval_molecule(self, stor_dir='.'):
    '''Plot percentage of novel, valid and unique SMILES

    Reads the molecules sampled after every epoch of every fold, computes the fraction (relative to
    self._samples) of unique, of valid & unique and of novel, valid & unique SMILES, and plots mean
    and standard deviation over the folds. Plot and summary table are both written to
    '<stor_dir>/<experiment_name>/molecules/'.

    :param stor_dir:    directory containing the experiment folder
    :return:
    '''
    mol_dir = stor_dir + '/' + self._experiment_name + '/molecules/'

    valid = np.zeros((self._n_folds, self._epochs))
    unique = np.zeros((self._n_folds, self._epochs))
    novel = np.zeros((self._n_folds, self._epochs))

    for i in range(self._n_folds):
        for j in range(self._epochs):

            # Column 0 of the stored file is the index, column 1 holds the SMILES
            mol = pd.read_csv(mol_dir + 'molecule_fold_' + str(i + 1) + '_epochs_' + str(j) + '.csv',
                              header=None).values[:, 1].astype(str)

            # Remove padding
            for k, m in enumerate(mol):
                mol[k] = clean_molecule(m, self._model_type)

            # Remove duplicates
            distinct = set(mol)

            # Compute unique molecules
            unique[i, j] = len(distinct) / self._samples
            mol = np.array(list(distinct))

            # Check validity and remove non-valid molecules
            to_delete = [k for k, m in enumerate(mol) if not check_valid(m)]
            valid_mol = np.delete(mol, to_delete)
            valid[i, j] = len(valid_mol) / self._samples

            # Compute molecules unequal to training data (entry stays 0 if nothing is valid)
            if valid_mol.size != 0:
                new_m = self.check_with_training_data(list(valid_mol))
                novel[i, j] = len(new_m) / self._samples

    # Get percentage
    unique *= 100
    novel *= 100
    valid *= 100

    # Get mean values
    mean_unique = np.mean(unique, axis=0)
    mean_valid = np.mean(valid, axis=0)
    mean_novel = np.mean(novel, axis=0)

    # Get standard deviation
    std_unique = np.std(unique, axis=0)
    std_valid = np.std(valid, axis=0)
    std_novel = np.std(novel, axis=0)

    print(mean_unique)
    print(mean_valid)
    print(mean_novel)

    # Plot (epochs are shown 1-based on the x axis)
    epochs = np.arange(1, self._epochs + 1)
    plt.figure(1)
    plt.errorbar(epochs, mean_unique, yerr=std_unique, capsize=3, label='unique')
    plt.errorbar(epochs, mean_valid, yerr=std_valid, capsize=3, label='valid & unique')
    plt.errorbar(epochs, mean_novel, yerr=std_novel, capsize=3, label='novel, valid & unique', linestyle=':')
    plt.yticks(np.arange(0, 110, step=10))
    plt.legend()
    plt.ylim(0, 105)
    plt.title('SMILES T=' + str(self._T))
    plt.ylabel('% SMILES')
    plt.xlabel('Epoch')
    plt.savefig(mol_dir + 'novel_valid_unique_molecules.png')

    # Store data next to the plot; the path must include stor_dir like every other path used here,
    # otherwise a non-default stor_dir writes the table relative to the working directory
    data = np.vstack((mean_unique, std_unique, mean_valid, std_valid, mean_novel, std_novel))
    pd.DataFrame(data).to_csv(mol_dir + self._experiment_name + '_data.csv')

    plt.show()