Example #1
    def fit(self,
            X_train: ndarray,
            y_train: ndarray,
            X_test: ndarray,
            y_test: ndarray,
            epochs: int = 100,
            eval_every: int = 10,
            batch_size: int = 32,
            seed: int = 1,
            restart: bool = True) -> None:
        '''
        Fits the neural network on the training data for a certain number of epochs.
        Every 'eval_every' epochs, evaluates the network on testing data
        '''
        setattr(self.optim, 'max_epochs', epochs)
        self.optim._setup_decay()

        np.random.seed(seed)
        if restart:
            for layer in self.net.layers:
                layer.first = True
            self.best_loss = 1e9

        for epoch in range(epochs):

            if (epoch + 1) % eval_every == 0:
                # snapshot the model before this epoch's training, for early stopping
                last_model = deepcopy(self.net)

            X_train, y_train = permute_data(X_train, y_train)
            batch_generator = self.generate_batches(X_train, y_train,
                                                    batch_size)
            # len(list(batch_generator)) would exhaust the generator before training,
            # so derive the number of batches from the data size instead
            num_batches = int(np.ceil(len(X_train) / batch_size))
            pbar = tqdm(enumerate(batch_generator), total=num_batches)
            for i, (X_batch, y_batch) in pbar:
                pbar.set_postfix({"Epoch": epoch + 1, "Batch": i + 1})
                self.net.train_batch(X_batch, y_batch)
                self.optim.step()

            if (epoch + 1) % eval_every == 0:
                test_preds = self.net.forward(X_test, inference=True)
                loss = self.net.loss.forward(test_preds, y_test)

                if loss < self.best_loss:
                    print(
                        f'Validation loss after {epoch + 1} epochs is {loss:.3f}'
                    )
                    self.best_loss = loss
                else:
                    print(
                        f'Loss increased after epoch {epoch + 1}, '
                        f'best loss was {self.best_loss:.3f}, '
                        f'using the model saved before epoch {epoch + 1}'
                    )
                    self.net = last_model
                    setattr(self.optim, 'net', self.net)
                    break
            if self.optim.final_lr:
                self.optim._decay_lr()
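The loop above relies on a permute_data helper that is not shown in this example. A minimal sketch of what it is assumed to do: shuffle the rows of X and y with one shared permutation so that samples and targets stay aligned.

import numpy as np
from numpy import ndarray
from typing import Tuple

def permute_data(X: ndarray, y: ndarray) -> Tuple[ndarray, ndarray]:
    # draw a single permutation and apply it to both arrays
    perm = np.random.permutation(X.shape[0])
    return X[perm], y[perm]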
Example #2
def split_huuskonsen():
    train_file_path = 'ugrnn/data/huuskonsen/train.smi'
    test1_file_path = 'ugrnn/data/huuskonsen/test1.smi'
    test2_file_path = 'ugrnn/data/huuskonsen/test2.smi'

    smile_col_name = "smiles"
    target_col_name = "solubility"
    logp_col_name = "logp"

    dtype = [(smile_col_name, 'S200'), (target_col_name, 'f8'),
             (logp_col_name, 'f8')]

    data = np.genfromtxt(train_file_path,
                         usecols=(6, 3, 5),
                         dtype=dtype,
                         comments=None)
    data_perm = permute_data(data)

    n = len(data)
    train_end = int(n * .9)

    train_data = data_perm[:train_end]
    val_data = data_perm[train_end:]

    test1_data = np.genfromtxt(test1_file_path, usecols=(6, 3, 5), dtype=dtype)
    test2_data = np.genfromtxt(test2_file_path, usecols=(6, 3, 5), dtype=dtype)
    test_data = np.concatenate((test1_data, test2_data))

    train_file_path = 'ugrnn/data/huuskonsen/train_huuskonsen.csv'
    validate_file_path = 'ugrnn/data/huuskonsen/validate_huuskonsen.csv'
    test_file_path = 'ugrnn/data/huuskonsen/test_huuskonsen.csv'

    header = "{:},{:},{:}".format(smile_col_name, target_col_name,
                                  logp_col_name)
    fmt = ('%s', '%.4f', '%.4f')
    np.savetxt(train_file_path,
               train_data,
               header=header,
               fmt=fmt,
               comments='',
               delimiter=',')
    np.savetxt(validate_file_path,
               val_data,
               header=header,
               fmt=fmt,
               comments='',
               delimiter=',')
    np.savetxt(test_file_path,
               test_data,
               header=header,
               fmt=fmt,
               comments='',
               delimiter=',')
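split_delaney and split_karthikeyan below use a read_csv helper that is not shown here. A minimal sketch, assuming it mirrors the np.genfromtxt call above: it reads the named columns of a CSV with a header row into a structured array (SMILES as a string, the remaining columns as floats).

import numpy as np

def read_csv(file_path, smile_col_name, target_col_name, logp_col_name=None):
    # hypothetical helper: load the named columns into a structured array
    names = [smile_col_name, target_col_name]
    if logp_col_name is not None:
        names.append(logp_col_name)
    raw = np.genfromtxt(file_path, delimiter=',', names=True,
                        dtype=None, encoding=None, comments=None)
    dtype = [(names[0], 'U200')] + [(name, 'f8') for name in names[1:]]
    out = np.empty(len(raw), dtype=dtype)
    for name in names:
        out[name] = raw[name]
    return out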
def split_delaney():  # note: despite the name, this splits the DILI dataset
    csv_file_path = 'ugrnn/data/DILI/DILI.csv'
    smile_col_name = "smiles"
    target_col_name = "solubility"
    logp_col_name = "logp"

    data = read_csv(csv_file_path, smile_col_name, target_col_name, logp_col_name)
    data_perm = permute_data(data)

    traindata, valdata, testdata = cross_validation_split(data_perm, crossval_split_index=1, crossval_total_num_splits=10)

    train_file_path = 'ugrnn/data/DILI/train_DILI.csv'
    validate_file_path = 'ugrnn/data/DILI/validate_DILI.csv'
    test_file_path = 'ugrnn/data/DILI/test_DILI.csv'

    header = "{:},{:},{:}".format(smile_col_name, target_col_name, logp_col_name)
    fmt = ('%s', '%.4f', '%.4f')
    np.savetxt(train_file_path, traindata, header=header, fmt=fmt, comments='', delimiter=',')
    np.savetxt(validate_file_path, valdata, header=header, fmt=fmt, comments='', delimiter=',')
    np.savetxt(test_file_path, testdata, header=header, fmt=fmt, comments='', delimiter=',')
def split_karthikeyan():
    csv_file_path = 'ugrnn/data/karthikeyan/melting_points.csv'
    smile_col_name = "SMILES"
    target_col_name = "MTP"

    data = read_csv(csv_file_path, smile_col_name, target_col_name)
    bool_arr = np.array([valid_smile(row[0]) for row in data])
    print('{} of {} SMILES strings are valid'.format(bool_arr.sum(), len(bool_arr)))
    filtered_data = data[bool_arr]
    data_perm = permute_data(filtered_data)

    traindata, valdata, testdata = cross_validation_split(data_perm, crossval_split_index=1, crossval_total_num_splits=10)

    train_file_path = 'ugrnn/data/karthikeyan/train_karthikeyan.csv'
    validate_file_path = 'ugrnn/data/karthikeyan/validate_karthikeyan.csv'
    test_file_path = 'ugrnn/data/karthikeyan/test_karthikeyan.csv'

    header = "{:},{:}".format(smile_col_name, target_col_name)
    fmt = ('%s', '%.4f')
    np.savetxt(train_file_path, traindata, header=header, fmt=fmt, comments='', delimiter=',')
    np.savetxt(validate_file_path, valdata, header=header, fmt=fmt, comments='', delimiter=',')
    np.savetxt(test_file_path, testdata, header=header, fmt=fmt, comments='', delimiter=',')
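All three split functions above call a cross_validation_split helper that is not shown (main in the next example calls a two-array variant of it). A minimal sketch of the single-array form, under the assumption that the data is carved into crossval_total_num_splits equal folds, with one fold for testing, the following fold for validation, and the rest for training.

import numpy as np

def cross_validation_split(data, crossval_split_index=0,
                           crossval_total_num_splits=10):
    # hypothetical helper: fold `crossval_split_index` becomes the test set,
    # the following fold becomes the validation set, the rest is training data
    n = len(data)
    fold = n // crossval_total_num_splits
    test_start = crossval_split_index * fold
    val_start = ((crossval_split_index + 1) % crossval_total_num_splits) * fold
    test_idx = np.arange(test_start, test_start + fold)
    val_idx = np.arange(val_start, val_start + fold)
    train_idx = np.setdiff1d(np.arange(n), np.concatenate((test_idx, val_idx)))
    return data[train_idx], data[val_idx], data[test_idx]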
Example #5
def main(output_dir='output/',
         model_name='my_model',
         training_file='delaney_train.csv',
         validation_file='delaney_validate.csv',
         smile_col='smiles',
         target_col='solubility',
         crossval_total_num_splits=10,
         initial_crossvalidation_index=0,
         weight_decay_factor=0,
         *args,
         **kwargs):
    '''
    valid kwargs:

        experiment_name, regression,
        binary_classification, batch_size,
        clip_gradient, model_params,
        contract_rings, learning_rate,
        max_epochs, enable_plotting

    '''
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)
    logger = logging.getLogger(__name__)
    logger.info('output_dir: {:}'.format(output_dir))
    output_dir = os.path.join(output_dir, model_name)

    #    if tf.gfile.Exists(output_dir):
    #        tf.gfile.DeleteRecursively(output_dir)

    tf.gfile.MakeDirs(output_dir)

    with tf.Graph().as_default():
        # Create a session for running Ops on the Graph.
        # run on CPU only (for this model the CPU is faster than a GPU)
        config = tf.ConfigProto(device_count={'GPU': 0})
        session = tf.Session(config=config)

        logger.info('Loading data set from {:}'.format(training_file))
        csv_file_path = training_file
        smile_col_name = smile_col
        target_col_name = target_col
        data = utils.read_csv(csv_file_path, None, smile_col_name,
                              target_col_name)
        assert len(data[0]) > 0, 'no data loaded!'
        smiles, labels = utils.permute_data(data[0], data[1])

        if kwargs['regression']:
            # normalize regression targets to be in a reasonable value-range
            labels_mean = labels.mean()
            labels_range = np.max(labels) - np.min(labels)
            labels = (labels - labels_mean) / labels_range

            # this function is applied to model predictions and to targets
            # when computing metrics
            def Targets_UnNormalization_fn(targets):
                return targets * labels_range + labels_mean

            def Targets_Normalization_fn(targets):
                return (targets - labels_mean) / labels_range
        else:
            if labels.ndim == 1:
                labels = labels.reshape((len(labels), 1))
            Targets_UnNormalization_fn = lambda x: x
            Targets_Normalization_fn = lambda x: x

        if validation_file != '' and validation_file is not None:
            # train single model
            logger.info(
                'Loading validation dataset from {:}'.format(validation_file))
            valid_data = utils.read_csv(validation_file, None, smile_col_name,
                                        target_col_name)
            if kwargs['regression'] == 0 and labels.ndim == 1:
                labels = labels.reshape(
                    (len(labels), 1))  #binary classification
            train_data = (smiles, labels)
            valid_data = (valid_data[0],
                          Targets_Normalization_fn(valid_data[1]))

            training_scores_dict, validation_scores_dict = build_and_train(
                logger,
                session,
                output_dir,
                train_data,
                valid_data,
                model_name=model_name,
                Targets_UnNormalization_fn=Targets_UnNormalization_fn,
                weight_decay_factor=weight_decay_factor,
                **kwargs)

        else:
            # cross validation
            assert initial_crossvalidation_index < crossval_total_num_splits, 'INVALID VALUE GIVEN for initial_crossvalidation_index or crossval_total_num_splits!'
            training_scores_dict, validation_scores_dict = [], []  # one dict per CV split
            for crossval_split_index in range(initial_crossvalidation_index,
                                              crossval_total_num_splits):
                print('crossval_split: {} of {}'.format(
                    crossval_split_index + 1, crossval_total_num_splits))

                assert len(smiles) == len(labels)
                train_data, valid_data, testdata = utils.cross_validation_split(
                    smiles,
                    labels,
                    crossval_split_index,
                    crossval_total_num_splits=crossval_total_num_splits,
                    validation_data_ratio=1. / crossval_total_num_splits)
                #merge "test" and train -- validation part used for testing
                train_data = (np.concatenate((train_data[0], testdata[0])),
                              np.concatenate((train_data[1], testdata[1])))
                print('CV: # train samples:', len(train_data[0]),
                      '# validation samples:', len(valid_data[0]))

                td, vd = build_and_train(
                    logger,
                    session,
                    output_dir + '_CV_{}'.format(crossval_split_index),
                    train_data,
                    valid_data,
                    model_name=model_name,
                    Targets_UnNormalization_fn=Targets_UnNormalization_fn,
                    weight_decay_factor=weight_decay_factor,
                    **kwargs)
                training_scores_dict.append(td)
                validation_scores_dict.append(vd)
        if (isinstance(training_scores_dict, list)
                and len(training_scores_dict) == 1
                and len(validation_scores_dict) == 1):
            return training_scores_dict[0], validation_scores_dict[0]
        return training_scores_dict, validation_scores_dict
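A possible invocation of main with the kwargs from its docstring spelled out; the specific values are illustrative assumptions, not taken from the original project.

if __name__ == '__main__':
    train_scores, valid_scores = main(
        output_dir='output/',
        model_name='my_model',
        training_file='delaney_train.csv',
        validation_file='delaney_validate.csv',
        smile_col='smiles',
        target_col='solubility',
        weight_decay_factor=0,
        # the remaining kwargs are forwarded to build_and_train (see the docstring)
        experiment_name='delaney_baseline',  # assumed value
        regression=True,
        binary_classification=False,
        batch_size=20,                       # assumed value
        clip_gradient=False,
        model_params=None,                   # assumed value
        contract_rings=False,
        learning_rate=1e-3,                  # assumed value
        max_epochs=100,
        enable_plotting=False)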