# Example #1
class FineTuner():
    """Fine-tune a pre-trained generative model on a new dataset.

    All hyper-parameters are read from ``../experiments/<experiment_name>.ini``;
    the model named in its [MODEL] section is instantiated and later initialized
    from the checkpoint given by the [FINETUNING] ``start_model`` entry.
    """

    def __init__(self, experiment_name='ForwardRNN'):
        """Read the experiment configuration, build the model and load the data.

        :param experiment_name: name of the .ini file (without extension)
                                under ../experiments/
        :raises ValueError: if the configured model type is unknown
        """
        self._encoder = SMILESEncoder()

        # Read all parameter from the .ini file
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = '../data/' + self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode([self._config['EVALUATION']['starting_token']])

        # Checkpoint the fine-tuning starts from (path relative to the experiment directory)
        self._start_model = self._config['FINETUNING']['start_model']

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)
        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)
        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode([self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size, self._learning_rate,
                               self._hidden_units, self._generation, self._missing_token)
        else:
            # Fail fast: previously an unknown type left self._model undefined
            # and surfaced only later as an AttributeError.
            raise ValueError('Unknown model type: ' + str(self._model_type))

        self._data = self._encoder.encode_from_file(self._file_name)

    def fine_tuning(self, stor_dir='../evaluation/', restart=False):
        '''Perform fine-tuning and store statistic,
        NOTE: Directory should be prepared with the correct name and model
        NOTE: Molecules are not generated or validation is not performed. To sample molecules sampler should be used'
        :param stor_dir:    directory to store data
        :param restart:     if True, skip epochs whose model/molecule/statistic
                            files already exist and resume training after them
        :return:
        '''

        # Create directories (no-op when they already exist)
        os.makedirs(stor_dir + '/' + self._experiment_name + '/models', exist_ok=True)
        os.makedirs(stor_dir + '/' + self._experiment_name + '/statistic', exist_ok=True)
        os.makedirs(stor_dir + '/' + self._experiment_name + '/molecules', exist_ok=True)

        # Compute labels (index of the one-hot maximum per position)
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE random
        if self._model_type == 'NADE' and self._generation == 'random':
            # First column stores correct SMILES and second column stores SMILES with missing values
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]

        # Initialise weights from the pre-trained starting model
        self._model.build(stor_dir + '/' + self._experiment_name + '/' + self._start_model)

        # Store total Statistics
        tot_stat = []

        # only single fold
        fold = 1

        for i in range(self._epochs):
            print('Fold:', fold)
            print('Epoch:', i)

            if restart:
                # Read existing statistics file
                tmp_stat_file = pd.read_csv(
                    stor_dir + '/' + self._experiment_name + '/statistic/stat_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()

                # Check if current epoch is successfully completed else continue with normal training
                if check_model(self._model_type, self._experiment_name, stor_dir, fold, i) and check_molecules(
                        self._experiment_name, stor_dir, fold, i) and tmp_stat_file.shape[0] > i:
                    # Load model
                    self._model.build(
                        stor_dir + '/' + self._experiment_name + '/models/model_fold_' + str(fold) + '_epochs_' + str(i))

                    # Fill statistic and loss list
                    tot_stat.append(tmp_stat_file[i, 1:].reshape(1, -1).tolist())

                    # Skip this epoch
                    continue

                else:
                    restart = False

            # Train model (Data reshaped from (N_samples, N_augmentation, molecular_size, encoding_size)
            # to  (all_SMILES, molecular_size, encoding_size))
            statistic = self._model.train(self._data.reshape(-1, self._molecular_size, self._encoding_size),
                                          label.reshape(-1, self._molecular_size), epochs=1,
                                          batch_size=self._batch_size)
            tot_stat.append(statistic.tolist())

            # Store model
            self._model.save(
                stor_dir + '/' + self._experiment_name + '/models/model_fold_' + str(fold) + '_epochs_' + str(i))

            # Sample new molecules
            new_molecules = []
            for s in range(self._samples):
                mol = self._encoder.decode(self._model.sample(self._starting_token, self._T))
                new_molecules.append(clean_molecule(mol[0], self._model_type))

            # Store new molecules
            new_molecules = np.array(new_molecules)
            pd.DataFrame(new_molecules).to_csv(
                stor_dir + '/' + self._experiment_name + '/molecules/molecule_fold_' + str(fold) + '_epochs_' + str(
                    i) + '.csv', header=None)

            # Store statistic (rewritten each epoch so a restart can resume)
            store_stat = np.array(tot_stat).reshape(i + 1, -1)
            pd.DataFrame(np.array(store_stat)).to_csv(
                stor_dir + '/' + self._experiment_name + '/statistic/stat_fold_' + str(fold) + '.csv',
                header=None)
class Sampler():
    """Sample SMILES strings from a trained model.

    Restores the training configuration from
    ``../experiments/<experiment_name>.ini`` and loads the training data,
    which is used as the reference set for novelty checks.
    """

    def __init__(self, experiment_name):
        """Read the experiment configuration, build the model and load data.

        :param experiment_name: name of the .ini file (without extension)
        :raises ValueError: if the configured model type is unknown
        :raises FileNotFoundError: if neither the .csv nor the .tar.xz data
                                   file exists
        """
        self._encoder = SMILESEncoder()

        # Read parameter used during training
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode(
            [self._config['EVALUATION']['starting_token']])

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)
        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)
        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode(
                [self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size,
                               self._learning_rate, self._hidden_units,
                               self._generation, self._missing_token)
        else:
            # Fail fast: previously an unknown type left self._model undefined
            # and surfaced only later as an AttributeError.
            raise ValueError('Unknown model type: ' + str(self._model_type))

        # Read data (reference set for the novelty check in sample())
        if os.path.isfile('../data/' + self._file_name + '.csv'):
            self._data = pd.read_csv('../data/' + self._file_name + '.csv',
                                     header=None).values[:, 0]
        elif os.path.isfile('../data/' + self._file_name + '.tar.xz'):
            # Skip first line since empty and last line since nan
            self._data = pd.read_csv('../data/' + self._file_name + '.tar.xz',
                                     compression='xz',
                                     header=None).values[1:-1, 0]
        else:
            # Fail fast: previously self._data stayed undefined and the loop
            # below raised an opaque AttributeError.
            raise FileNotFoundError('Data file not found: ../data/' +
                                    self._file_name + '[.csv|.tar.xz]')

        # Clean data from start, end and padding token
        for i, mol_dat in enumerate(self._data):
            self._data[i] = clean_molecule(mol_dat, self._model_type)

    def sample(self,
               N=100,
               stor_dir='../evaluation',
               T=0.7,
               fold=[1],
               epoch=[9],
               valid=True,
               novel=True,
               unique=True,
               write_csv=True):
        '''Sample from a model where the number of novel valid unique molecules is fixed
        :param stor_dir:    directory where the generated SMILES are saved
        :param N:        number of samples
        :param T:        Temperature
        :param fold:     Folds to use for sampling
        :param epoch:    Epochs to use for sampling
        :param valid:    If True, only accept valid SMILES
        :param novel:    If True, only accept novel SMILES
        :param unique:   If True, only accept unique SMILES
        :param write_csv If True, the generated SMILES are written in stor_dir
        :return: res_molecules: list with all the generated SMILES
        '''

        res_molecules = []
        print('Sampling: started')
        for f in fold:
            for e in epoch:
                # Load the checkpoint for this fold/epoch combination
                self._model.build(stor_dir + '/' + self._experiment_name +
                                  '/models/model_fold_' + str(f) + '_epochs_' +
                                  str(e))

                new_molecules = []
                # Rejection-sample until N accepted molecules are collected
                while len(new_molecules) < N:
                    new_mol = self._encoder.decode(
                        self._model.sample(self._starting_token, T))

                    # Remove remains from generation
                    new_mol = clean_molecule(new_mol[0], self._model_type)

                    # If not valid, get new molecule
                    if valid and not check_valid(new_mol):
                        continue

                    # If not unique, get new molecule
                    if unique and (new_mol in new_molecules):
                        continue

                    # If not novel, get molecule
                    if novel and (new_mol in self._data):
                        continue

                    # If all conditions checked, add new molecule
                    new_molecules.append(new_mol)

                # Prepare name for file
                name = 'molecules_fold_' + str(f) + '_epochs_' + str(
                    e) + '_T_' + str(T) + '_N_' + str(N) + '.csv'
                if unique:
                    name = 'unique_' + name
                if valid:
                    name = 'valid_' + name
                if novel:
                    name = 'novel_' + name

                # Store final molecules
                if write_csv:
                    os.makedirs(stor_dir + '/' + self._experiment_name +
                                '/molecules/', exist_ok=True)
                    mol = np.array(new_molecules).reshape(-1)
                    pd.DataFrame(mol).to_csv(stor_dir + '/' +
                                             self._experiment_name +
                                             '/molecules/' + name,
                                             header=None)

            # NOTE(review): this append sits outside the epoch loop, so only
            # the LAST epoch's molecules of each fold are returned (every
            # epoch is still written to CSV above). Kept as-is to preserve
            # the return shape — confirm whether per-epoch results were
            # intended.
            res_molecules.append(new_molecules)

        print('Sampling: done')
        return res_molecules
class Trainer():
    """Train a generative SMILES model from scratch.

    Configuration is read from ``../experiments/<experiment_name>.ini``.
    Offers three training modes: full-data training (:meth:`complete_run`),
    a single train/validation split (:meth:`single_run`) and k-fold
    cross-validation (:meth:`cross_validation`).
    """

    def __init__(self, experiment_name='ForwardRNN'):
        """Read the experiment configuration, build the model and load data.

        :param experiment_name: name of the .ini file (without extension)
        :raises ValueError: if the configured model type is unknown
        """
        self._encoder = SMILESEncoder()

        # Read all parameter from the .ini file
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = '../data/' + self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode(
            [self._config['EVALUATION']['starting_token']])

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)
        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)
        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode(
                [self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size,
                               self._learning_rate, self._hidden_units,
                               self._generation, self._missing_token)
        else:
            # Fail fast: previously an unknown type left self._model undefined
            # and surfaced only later as an AttributeError.
            raise ValueError('Unknown model type: ' + str(self._model_type))

        self._data = self._encoder.encode_from_file(self._file_name)

    def complete_run(self, stor_dir='../evaluation/', restart=False):
        '''Training without validation on complete data
        :param stor_dir:    directory to store models, molecules and statistics
        :param restart:     if True, skip epochs whose output files already exist
        '''

        # Create directories (no-op when they already exist)
        os.makedirs(stor_dir + '/' + self._experiment_name + '/models', exist_ok=True)
        os.makedirs(stor_dir + '/' + self._experiment_name + '/molecules', exist_ok=True)
        os.makedirs(stor_dir + '/' + self._experiment_name + '/statistic', exist_ok=True)

        # Compute labels (index of the one-hot maximum per position)
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE
        if self._model_type == 'NADE' and self._generation == 'random':
            # First column stores correct SMILES and second column stores SMILES with missing values
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]

        # Build model with freshly initialised weights
        self._model.build()

        # Store total Statistics
        tot_stat = []

        # only single fold
        fold = 1

        # Shuffle data before training (Data reshaped from (N_samples, N_augmentation, molecular_size, encoding_size)
        # to  (all_SMILES, molecular_size, encoding_size))
        self._data, label = shuffle(
            self._data.reshape(-1, self._molecular_size, self._encoding_size),
            label.reshape(-1, self._molecular_size))

        for i in range(self._epochs):
            print('Fold:', fold)
            print('Epoch:', i)

            # With restart read existing files
            if restart:
                tmp_stat_file = pd.read_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/statistic/stat_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()

                # Check if current epoch is successfully completed else continue with normal training
                if check_model(self._model_type, self._experiment_name,
                               stor_dir, fold, i) and check_molecules(
                                   self._experiment_name, stor_dir, fold,
                                   i) and tmp_stat_file.shape[0] > i:

                    # Load model
                    self._model.build(stor_dir + '/' + self._experiment_name +
                                      '/models/model_fold_' + str(fold) +
                                      '_epochs_' + str(i))

                    # Fill statistic and loss list
                    tot_stat.append(tmp_stat_file[i, 1:].reshape(1,
                                                                 -1).tolist())
                    continue

                # Continue with normal training
                else:
                    restart = False

            # Train model
            statistic = self._model.train(self._data,
                                          label,
                                          epochs=1,
                                          batch_size=self._batch_size)
            tot_stat.append(statistic.tolist())

            # Store model
            self._model.save(stor_dir + '/' + self._experiment_name +
                             '/models/model_fold_' + str(fold) + '_epochs_' +
                             str(i))

            # Sample new molecules
            new_molecules = []
            for s in range(self._samples):
                mol = self._encoder.decode(
                    self._model.sample(self._starting_token, self._T))
                new_molecules.append(clean_molecule(mol[0], self._model_type))

            # Store new molecules
            new_molecules = np.array(new_molecules)
            pd.DataFrame(new_molecules).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/molecules/molecule_fold_' + str(fold) + '_epochs_' + str(i) +
                '.csv',
                header=None)

            # Store statistic (rewritten each epoch so a restart can resume)
            store_stat = np.array(tot_stat).reshape(i + 1, -1)
            pd.DataFrame(np.array(store_stat)).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/statistic/stat_fold_' + str(fold) + '.csv',
                header=None)

    def single_run(self, stor_dir='../evaluation/', restart=False):
        '''Training with validation and store data
        :param stor_dir:    directory to store models, molecules and statistics
        :param restart:     if True, skip epochs whose output files already exist
        '''

        # Create directories (no-op when they already exist)
        os.makedirs(stor_dir + '/' + self._experiment_name + '/models', exist_ok=True)
        os.makedirs(stor_dir + '/' + self._experiment_name + '/molecules', exist_ok=True)
        os.makedirs(stor_dir + '/' + self._experiment_name + '/statistic', exist_ok=True)
        os.makedirs(stor_dir + '/' + self._experiment_name + '/validation', exist_ok=True)

        # Compute labels (index of the one-hot maximum per position)
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE
        # NOTE(review): 'NADE_v2' is accepted here but never constructed in
        # __init__ — presumably a legacy variant; confirm whether it is
        # still needed.
        if (self._model_type == 'NADE' or self._model_type
                == 'NADE_v2') and self._generation == 'random':
            # First column stores correct SMILES and second column stores SMILES with missing values
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]

        # Split data into train and test data
        train_data, test_data, train_label, test_label = train_test_split(
            self._data, label, test_size=1. / 5, random_state=1, shuffle=True)
        # Build model with freshly initialised weights
        self._model.build()

        # Store total Statistics
        tot_stat = []

        # Store validation loss
        tot_loss = []

        # only single fold
        fold = 1

        for i in range(self._epochs):
            print('Fold:', fold)
            print('Epoch:', i)

            if restart:
                # Read existing files
                tmp_val_file = pd.read_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/validation/val_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()
                tmp_stat_file = pd.read_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/statistic/stat_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()

                # Check if current epoch is successfully completed else continue with normal training
                if check_model(self._model_type, self._experiment_name,
                               stor_dir, fold, i) and check_molecules(
                                   self._experiment_name, stor_dir, fold,
                                   i) and tmp_val_file.shape[
                                       0] > i and tmp_stat_file.shape[0] > i:

                    # Load model
                    self._model.build(stor_dir + '/' + self._experiment_name +
                                      '/models/model_fold_' + str(fold) +
                                      '_epochs_' + str(i))

                    # Fill statistic and loss list
                    tot_stat.append(tmp_stat_file[i, 1:].reshape(1,
                                                                 -1).tolist())
                    tot_loss.append(tmp_val_file[i, 1])

                    # Skip this epoch
                    continue

                # Continue with normal training
                else:
                    restart = False

            # Train model (Data reshaped from (N_samples, N_augmentation, molecular_size, encoding_size)
            # to  (all_SMILES, molecular_size, encoding_size))
            statistic = self._model.train(
                train_data.reshape(-1, self._molecular_size,
                                   self._encoding_size),
                train_label.reshape(-1, self._molecular_size),
                epochs=1,
                batch_size=self._batch_size)
            tot_stat.append(statistic.tolist())

            # Store model
            self._model.save(stor_dir + '/' + self._experiment_name +
                             '/models/model_fold_' + str(fold) + '_epochs_' +
                             str(i))

            # Test model on validation set
            tot_loss.append(
                self._model.validate(
                    test_data.reshape(-1, self._molecular_size,
                                      self._encoding_size),
                    test_label.reshape(-1, self._molecular_size)))

            # Sample new molecules
            new_molecules = []
            for s in range(self._samples):
                mol = self._encoder.decode(
                    self._model.sample(self._starting_token, self._T))
                new_molecules.append(clean_molecule(mol[0], self._model_type))

            # Store new molecules
            new_molecules = np.array(new_molecules)
            pd.DataFrame(new_molecules).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/molecules/molecule_fold_' + str(fold) + '_epochs_' + str(i) +
                '.csv',
                header=None)

            # Store statistic (rewritten each epoch so a restart can resume)
            store_stat = np.array(tot_stat).reshape(i + 1, -1)
            pd.DataFrame(np.array(store_stat)).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/statistic/stat_fold_' + str(fold) + '.csv',
                header=None)

            # Store validation data
            pd.DataFrame(np.array(tot_loss).reshape(
                -1, 1)).to_csv(stor_dir + '/' + self._experiment_name +
                               '/validation/val_fold_' + str(fold) + '.csv',
                               header=None)

    def cross_validation(self, stor_dir='../evaluation/', restart=False):
        '''Perform cross-validation and store data
        :param stor_dir:    directory to store models, molecules and statistics
        :param restart:     if True, skip epochs whose output files already exist
        '''

        # Create directories (no-op when they already exist)
        os.makedirs(stor_dir + '/' + self._experiment_name + '/models', exist_ok=True)
        os.makedirs(stor_dir + '/' + self._experiment_name + '/molecules', exist_ok=True)
        os.makedirs(stor_dir + '/' + self._experiment_name + '/statistic', exist_ok=True)
        os.makedirs(stor_dir + '/' + self._experiment_name + '/validation', exist_ok=True)

        self._kf = KFold(n_splits=self._n_folds, shuffle=True, random_state=2)

        # Count iterations
        fold = 0

        # Compute labels (index of the one-hot maximum per position)
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE
        if (self._model_type == 'NADE') and self._generation == 'random':
            # First column stores correct SMILES and second column stores SMILES with missing values
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]

        # Split data into train and test data
        for train, test in self._kf.split(self._data):

            # Shuffle index within test and train set
            np.random.shuffle(train)
            np.random.shuffle(test)

            fold += 1

            # Re-initialise the model weights for every fold
            self._model.build()

            # Store total statistics
            tot_stat = []

            # Store validation loss
            tot_loss = []

            for i in range(self._epochs):
                print('Fold:', fold)
                print('Epoch:', i)

                if restart:
                    tmp_val_file = pd.read_csv(
                        stor_dir + '/' + self._experiment_name +
                        '/validation/val_fold_' + str(fold) + '.csv',
                        header=None).to_numpy()

                    tmp_stat_file = pd.read_csv(
                        stor_dir + '/' + self._experiment_name +
                        '/statistic/stat_fold_' + str(fold) + '.csv',
                        header=None).to_numpy()

                    # Check if current epoch is successfully completed else continue with normal training
                    if check_model(
                            self._model_type, self._experiment_name, stor_dir,
                            fold, i) and check_molecules(
                                self._experiment_name, stor_dir, fold,
                                i) and tmp_val_file.shape[
                                    0] > i and tmp_stat_file.shape[0] > i:

                        # Load model
                        self._model.build(stor_dir + '/' +
                                          self._experiment_name +
                                          '/models/model_fold_' + str(fold) +
                                          '_epochs_' + str(i))

                        # Fill statistic and loss list
                        tot_stat.append(tmp_stat_file[i,
                                                      1:].reshape(1,
                                                                  -1).tolist())
                        tot_loss.append(tmp_val_file[i, 1])

                        # Skip this epoch
                        continue

                    else:
                        restart = False

                # Train model (Data reshaped from (N_samples, N_augmentation, molecular_size, encoding_size)
                # to  (all_SMILES, molecular_size, encoding_size))
                statistic = self._model.train(
                    self._data[train].reshape(-1, self._molecular_size,
                                              self._encoding_size),
                    label[train].reshape(-1, self._molecular_size),
                    epochs=1,
                    batch_size=self._batch_size)

                tot_stat.append(statistic.tolist())

                # Store model
                self._model.save(stor_dir + '/' + self._experiment_name +
                                 '/models/model_fold_' + str(fold) +
                                 '_epochs_' + str(i))

                # Test model on validation set
                tot_loss.append(
                    self._model.validate(
                        self._data[test].reshape(-1, self._molecular_size,
                                                 self._encoding_size),
                        label[test].reshape(-1, self._molecular_size)))

                # Sample new molecules
                new_molecules = []
                for s in range(self._samples):
                    mol = self._encoder.decode(
                        self._model.sample(self._starting_token, self._T))
                    new_molecules.append(
                        clean_molecule(mol[0], self._model_type))

                # Store new molecules
                new_molecules = np.array(new_molecules)
                pd.DataFrame(new_molecules).to_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/molecules/molecule_fold_' + str(fold) + '_epochs_' +
                    str(i) + '.csv',
                    header=None)

                # Store statistic (rewritten each epoch so a restart can resume)
                store_stat = np.array(tot_stat).reshape(i + 1, -1)
                pd.DataFrame(np.array(store_stat)).to_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/statistic/stat_fold_' + str(fold) + '.csv',
                    header=None)

                # Store validation data
                pd.DataFrame(np.array(tot_loss).reshape(-1, 1)).to_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/validation/val_fold_' + str(fold) + '.csv',
                    header=None)