Example #1
    def data_split(self, data):
        from skorch.dataset import Dataset

        X, y = data
        dataset_train = Dataset(X[:2], y[:2])
        dataset_valid = Dataset(X[2:], y[2:])
        return dataset_train, dataset_valid
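
For reference, a skorch Dataset simply wraps X and y together and behaves like a regular map-style torch dataset; a minimal sketch with illustrative toy data:

import numpy as np
from skorch.dataset import Dataset

X = np.arange(12, dtype=np.float32).reshape(4, 3)
y = np.array([0, 1, 0, 1])

ds = Dataset(X, y)
print(len(ds))    # 4
x0, y0 = ds[0]    # indexing returns an (X_i, y_i) pair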
Example #2
    def split_data(self, dataset=None):
        """
        Splits the dataset into a training and a validation set, optionally applying filtering and whitening.
        :param dataset: if None, self.datasets is split; otherwise the given dataset is split using the
        same training and validation indices. In that case only whitening is applied, not filtering.
        :return: the training and the validation set
        """
        length = len(self.datasets.X)
        if self.num_of_folds == -1:
            index = int((length / 100) * 80)

            if (self.train_indices is None) and self.random_valid:
                valid_indices = random.sample([x for x in range(length)], length - index)
                self.valid_indices = valid_indices
                train_indices = [x for x in range(length) if x not in valid_indices]
                self.train_indices = train_indices

            elif not self.random_valid:
                self.train_indices = [x for x in range(0, index)]
                self.valid_indices = [x for x in range(index, length)]

            if dataset is None:
                train_set = MyDataset([self.datasets.X[i] for i in self.train_indices],
                                      [self.datasets.y[i] for i in self.train_indices])
                test_set = MyDataset([self.datasets.X[i] for i in self.valid_indices],
                                     [self.datasets.y[i] for i in self.valid_indices])
                if self.random_valid:
                    print('Random sets')
            else:
                train_set = MyDataset([dataset.X[i] for i in self.train_indices],
                                      [dataset.y[i] for i in self.train_indices])
                test_set = MyDataset([dataset.X[i] for i in self.valid_indices],
                                     [dataset.y[i] for i in self.valid_indices])
            print('validation_indices:', self.valid_indices)
            print('train_indices:', self.train_indices)

            if self.pre_whiten and (dataset is None):
                train_set.X, channel_norms, iqr, median = whiten_data(train_set)
                test_set.X, _, _, _ = whiten_data(test_set, True, channel_normalizations=channel_norms, iqrs=iqr)
            if dataset is None:
                train_Xs, train_ys = train_set.X, train_set.y
                test_Xs, test_ys = test_set.X, test_set.y
                if self.low_pass or self.low_pass_training:
                    X, y = band_pass_data(train_Xs, train_ys, 15, 40, 'low')
                    self.low_pass_train = Dataset(X, y)
                    X, y = band_pass_data(test_Xs, test_ys, 15, 40, 'low')
                    self.low_pass_test = Dataset(X, y)

                elif self.high_pass or self.valid_high_pass:
                    X, y = band_pass_data(train_Xs, train_ys, 15, 60, 'hp')
                    self.high_pass_train = Dataset(X, y)
                    X, y = band_pass_data(test_Xs, test_ys, 15, 60, 'hp')
                    self.high_pass_test = Dataset(X, y)

            return train_set, None, test_set

        else:
            train_set = MyDataset(self.datasets.X[:], self.datasets.y[:])
            return train_set, None, None
    def cv_split(self, X, y):
        length = len(X.X)
        index = int((length / 100) * 20)
        # print(X.X.shape)
        # print(y.shape)
        self.train_set = Dataset(X.X[:-index], y[:-index])
        self.valid_set = Dataset(X.X[-index:], y[-index:])

        return self.train_set, self.valid_set
    def on_epoch_end(self, net, **kwargs):
        # writer = net.callbacks[2][1].writer
        train_X = kwargs['dataset_train'].X
        train_y = kwargs['dataset_train'].y
        valid_X = kwargs['dataset_valid'].X
        valid_y = kwargs['dataset_valid'].y

        train_preds = net.predict(train_X)
        valid_preds = net.predict(valid_X)

        train_corr = self.calculate_correlation(train_preds, train_y, train_X)
        valid_corr = self.calculate_correlation(valid_preds, valid_y, valid_X)
        names = ['train_correlation', 'validation_correlation']
        if 'test' in kwargs.keys():
        #     writer.add_scalar('test_correlation', train_corr, 0)
        #     writer.flush()
            print(f'test_correlation: {train_corr}')
        else:
            for name, value in zip(names, [train_corr, valid_corr]):
                # writer.add_scalar(name, value, self.step_number)
                # writer.flush()
                net.history.record(name, value)
            if net.max_correlation < valid_corr:
                net.max_correlation = valid_corr
                net.history.record('validation_correlation_best', True)
                # if self.output_dir is not None:
                    # torch.save(net.module,
                    #            home + f'/models/saved_models/{self.output_dir}/best_model_split_{self.split}')
                self.validation_set = Dataset(valid_X, valid_y)

            else:
                net.history.record('validation_correlation_best', False)

        self.step_number += 1
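
The method above belongs to a custom skorch callback; a minimal self-contained sketch of the same pattern, assuming an externally defined metric function my_metric (illustrative name, not from the original source):

from skorch.callbacks import Callback

class CorrelationLogger(Callback):
    """Record a custom validation metric in the net's history after every epoch."""

    def on_epoch_end(self, net, dataset_train=None, dataset_valid=None, **kwargs):
        if dataset_valid is None:
            return
        preds = net.predict(dataset_valid.X)
        net.history.record('valid_my_metric', my_metric(preds, dataset_valid.y))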
Example #5
    def test_fit_with_dataset_and_y_none(self, net_cls, module_cls, data):
        from skorch.dataset import Dataset

        # deactivate train split since it requires y
        net = net_cls(module_cls, train_split=False, max_epochs=1)
        X, y = data
        dataset = Dataset(X, y)
        assert net.fit(dataset, y=None)
Example #6
    def test_pickle(self, predefined_split, data):
        from skorch.dataset import Dataset

        valid_dataset = Dataset(*data)
        train_split = predefined_split(valid_dataset)

        # does not raise
        pickle.dumps(train_split)
Example #7
    def sk_dataset(self, context_dictionary):
        skdset = {}
        skdset["context"] = torch.Tensor(context_dictionary["context"])
        skdset["host"] = preprocessing.LabelEncoder().fit_transform(
            context_dictionary["host"])
        an_perc = context_dictionary["anomaly_perc"]
        Y = np.where(an_perc == 0, NORMAL_TRAFFIC, an_perc)
        Y = np.where(Y > 0, ATTACK_TRAFFIC, Y)
        Y = torch.Tensor(Y)

        return self.Dataset2GPU(Dataset(skdset, Y))
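
A skorch Dataset also accepts a dict as X, in which case each key is passed to the module's forward method as a keyword argument; a minimal sketch with an illustrative two-input module (not from the original source):

import torch
import torch.nn as nn
from skorch import NeuralNetClassifier

class TwoInputModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(20 + 5, 2)

    def forward(self, context, host):       # dict keys arrive as keyword arguments
        return self.fc(torch.cat([context, host], dim=1))

net = NeuralNetClassifier(
    TwoInputModule,
    criterion=nn.CrossEntropyLoss,
    max_epochs=1,
    train_split=None,                        # no validation split for this toy example
)
X = {"context": torch.randn(8, 20), "host": torch.randn(8, 5)}
y = torch.randint(0, 2, (8,))
net.fit(X, y)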
def main():
    sampling_rate = 360

    wavelet = "mexh"  # mexh, morl, gaus8, gaus4
    scales = pywt.central_frequency(wavelet) * sampling_rate / np.arange(
        1, 101, 1)

    (x1_train, x2_train, y_train,
     groups_train), (x1_test, x2_test, y_test,
                     groups_test) = load_data(wavelet=wavelet,
                                              scales=scales,
                                              sampling_rate=sampling_rate)
    print("Data loaded successfully!")

    log_dir = "./logs/{}".format(wavelet)
    shutil.rmtree(log_dir, ignore_errors=True)

    callbacks = [
        Initializer("[conv|fc]*.weight", fn=torch.nn.init.kaiming_normal_),
        Initializer("[conv|fc]*.bias",
                    fn=partial(torch.nn.init.constant_, val=0.0)),
        LRScheduler(policy=StepLR, step_size=5, gamma=0.1),
        EpochScoring(scoring=make_scorer(f1_score, average="macro"),
                     lower_is_better=False,
                     name="valid_f1"),
        TensorBoard(SummaryWriter(log_dir))
    ]
    net = NeuralNetClassifier(  # skorch wraps the PyTorch module in a scikit-learn-compatible estimator
        MyModule,
        criterion=torch.nn.CrossEntropyLoss,
        optimizer=torch.optim.Adam,
        lr=0.001,
        max_epochs=30,
        batch_size=1024,
        train_split=predefined_split(
            Dataset({
                "x1": x1_test,
                "x2": x2_test
            }, y_test)),
        verbose=1,
        device="cuda",
        callbacks=callbacks,
        iterator_train__shuffle=True,
        optimizer__weight_decay=0,
    )
    net.fit({"x1": x1_train, "x2": x2_train}, y_train)
    y_true, y_pred = y_test, net.predict({"x1": x1_test, "x2": x2_test})

    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred, digits=4))

    net.save_params(f_params="./models/model_{}.pkl".format(wavelet))
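
Parameters saved with save_params can later be restored into a freshly initialized estimator; a minimal sketch, assuming the same MyModule definition and wavelet value are in scope:

new_net = NeuralNetClassifier(
    MyModule,
    criterion=torch.nn.CrossEntropyLoss,
    device="cuda",
)
new_net.initialize()   # the net must be initialized before loading parameters
new_net.load_params(f_params="./models/model_{}.pkl".format(wavelet))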
Example #9
def concatenate_batches(dataset, iterator, shuffle):
    # Collect all crop batches produced by the iterator into a single skorch Dataset.
    complete_input = []
    complete_targets = []
    for batch in iterator.get_batches(dataset, shuffle=shuffle):
        complete_input.extend(batch[0])
        complete_targets.extend(batch[1])
    complete_input = np.array(complete_input)
    complete_targets = np.array(complete_targets)
    print(complete_input.shape)
    print(complete_targets.shape)
    return Dataset(complete_input, complete_targets)
Example #10
    def get_certain_channels(self, dataset, motor=True):
        """
        Zeroes out all channels except the selected group (motor or non-motor)
        :param dataset: one patient's data for which we want the channels
        :param motor: if True, only motor channels are kept; otherwise only
        non-motor channels are kept
        :return: Dataset object containing only the data of the selected channel group
        """
        if motor:
            self.motor_channels = self.motor_channels.astype(int)
            new_set = np.copy(dataset.X)
            mask = np.ones(self.in_channels, dtype=bool)
            mask[self.motor_channels] = 0
            new_set[:, mask, :, :] = 0
            return Dataset(new_set, dataset.y)

        else:
            self.non_motor_channels = self.non_motor_channels.astype(int)
            new_set = np.copy(dataset.X)
            mask = np.ones(self.in_channels, dtype=bool)
            mask[self.non_motor_channels] = 0
            new_set[:, mask, :, :] = 0
            return Dataset(new_set, dataset.y)
Example #11
    def create_datasets(self, trajectory_index=0):
        """
        Creates a Dataset object from the data read from the Matlab file.
        Also stores which electrodes belong to motor channels and which to non-motor channels.
        :param trajectory_index: 0 selects velocity, 1 selects absolute velocity
        :return: the assembled Dataset
        """
        if not self.dummy_dataset:
            sessions = self.data.D
            if self.shift_data:
                Xs = [session[0].ieeg[self.shift_by:] for session in sessions]
                ys = [session[0].traj[:-self.shift_by, trajectory_index] for session in sessions]
            else:
                Xs = [session[0].ieeg[:] for session in sessions]
                ys = [session[0].traj[:, trajectory_index] for session in sessions]
            # if self.absVel_from_vel:
            #     ys = np.abs(ys)
            print(len(Xs), len(ys))

            self.motor_channels = self.data.H.selCh_D_MTR - 1
            self.non_motor_channels = self.data.H.selCh_D_CTR - 1

            if self.num_of_folds == 0:
                self.num_of_folds = len(Xs)
        else:
            print('creating dummy dataset')
            Xs = []
            ys = []
            for i in range(32):
                ieeg = np.zeros([7500, 85])
                for row in range(85):
                    series = [gauss(0.0, 1.0) for i in range(7500)]
                    ieeg[:, row] = series
                traj = np.zeros([7500])
                series = [gauss(0.0, 1.0) for i in range(7500)]
                traj[:] = series
                Xs.append(ieeg)
                ys.append(traj)
            self.motor_channels = None
            self.non_motor_channels = None

        dataset = Dataset(Xs, ys)

        return dataset
Example #12
    def type_truth_table():
        from skorch.dataset import Dataset
        from skorch.helper import Subset

        numpy_data = np.array([1, 2, 3])
        tensor_data = torch.from_numpy(numpy_data)
        torch_dataset = torch.utils.data.TensorDataset(
            tensor_data, tensor_data)
        torch_subset = Subset(torch_dataset, [1, 2])
        skorch_dataset = Dataset(numpy_data)
        skorch_subset = Subset(skorch_dataset, [1, 2])

        return [
            (numpy_data, False),
            (torch_dataset, False),
            (torch_subset, False),
            (skorch_dataset, True),
            (skorch_subset, True),
        ]
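
This table presumably drives a check like skorch's is_skorch_dataset helper; a minimal sketch of how it might be consumed in a test, treating type_truth_table as a plain function (the original is likely a pytest fixture, and the import location is an assumption):

import pytest
from skorch.utils import is_skorch_dataset   # assumed location of the helper under test

@pytest.mark.parametrize('input_data, expected', type_truth_table())
def test_is_skorch_dataset(input_data, expected):
    assert is_skorch_dataset(input_data) == expected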
Example #13
def main():
    run = wandb.init()
    cfg = wandb.config
    filepath = './data/' + cfg.dataset

    device = get_device(cfg)
    feature_list = get_feature_list()

    X_train, X_valid, X_test, y_train, y_valid, y_test = load_data(
        filepath, cfg, feature_list)
    valid_ds = Dataset(X_valid, y_valid)

    model = mlp.MLPModule(input_units=cfg.n_features,
                          hidden_units=cfg.hidden_units,
                          num_hidden=cfg.layers,
                          dropout=cfg.dropout).to(device)

    class_weights = class_weight.compute_class_weight('balanced',
                                                      classes=np.unique(y_train),
                                                      y=y_train)
    net = NeuralNetClassifier(
        model,
        max_epochs=cfg.epochs,
        batch_size=cfg.batch_size,
        criterion=nn.CrossEntropyLoss,
        criterion__weight=torch.FloatTensor(class_weights).to(device),
        optimizer=torch.optim.SGD,
        optimizer__lr=cfg.learning_rate,
        optimizer__weight_decay=cfg.weight_decay,
        device=device,
        train_split=predefined_split(valid_ds),
        callbacks=[],
        iterator_train__shuffle=bool(cfg.shuffle),
        warm_start=False)

    net.initialize()
    net.fit(X_train, y_train)

    y_pred = net.predict(X_test)
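
The fragment stops right after computing y_pred; the held-out performance could then be summarized with scikit-learn's standard reports (sketch):

from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))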
Example #14
    def type_truth_table():
        """Return a table of (type, bool) tuples that describe what
        is_skorch_dataset should return when called with that type.
        """
        from skorch.dataset import Dataset
        from torch.utils.data.dataset import Subset

        numpy_data = np.array([1, 2, 3])
        tensor_data = torch.from_numpy(numpy_data)
        torch_dataset = torch.utils.data.TensorDataset(
            tensor_data, tensor_data)
        torch_subset = Subset(torch_dataset, [1, 2])
        skorch_dataset = Dataset(numpy_data)
        skorch_subset = Subset(skorch_dataset, [1, 2])

        return [
            (numpy_data, False),
            (torch_dataset, False),
            (torch_subset, False),
            (skorch_dataset, True),
            (skorch_subset, True),
        ]
Example #15
def main():
    """
    Run an active learning experiment.

    Sample command:
    ```
    python training/run_modAL_experiment.py --al_epochs_init=10 --al_epochs_incr=5 --al_n_iter=10 --al_samples_per_iter=100 --data_class=DroughtWatch --model_class=ResnetClassifier --batch_size=64 --n_train_images=1000 --n_validation_images=1000 --pretrained=True --wandb
    ```
    """

    # generic setup steps from run_experiment
    # ---------------------------------------

    parser = _setup_parser()
    args = parser.parse_args()
    data_class = _import_class(f"active_learning.data.{args.data_class}")
    model_class = _import_class(f"active_learning.models.{args.model_class}")
    data = data_class(args)
    model = model_class(data_config=data.config(), args=args)

    if args.loss not in ("ctc", "transformer"):
        lit_model_class = lit_models.BaseLitModel

    if args.loss == "ctc":
        lit_model_class = lit_models.CTCLitModel

    if args.loss == "transformer":
        lit_model_class = lit_models.TransformerLitModel

    if args.load_checkpoint is not None:
        lit_model = lit_model_class.load_from_checkpoint(args.load_checkpoint, args=args, model=model)
    else:
        lit_model = lit_model_class(args=args, model=model)

    # modAL specific experiment setup
    # -------------------------------

    # initialize wandb with pytorch model
    if args.wandb:
        wandb.init(config=args)
        wandb.watch(model, log_freq=100)

    # evaluate query strategy from args parameter
    if args.al_query_strategy in ["uncertainty_sampling", "margin_sampling", "entropy_sampling"]:
        query_strategy = _import_class(f"modAL.uncertainty.{args.al_query_strategy}")
    else:
        query_strategy = _import_class(f"active_learning.sampling.{args.al_query_strategy}")

    # cpu vs. gpu: ignore --gpu args param, instead just set gpu based on availability
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # initialize train, validation and pool datasets
    data.setup()

    X_initial = np.moveaxis(
        data.data_train.data, 3, 1
    )  # shape change: (i, channels, h, w) instead of (i, h, w, channels)
    y_initial = data.data_train.targets
    if args.reduced_develop_train_size:
        print("NOTE: Reduced initial train set size for development activated")
        X_initial = X_initial[:100, :, :, :]
        y_initial = y_initial[:100]

    X_val = np.moveaxis(data.data_val.data, 3, 1)  # shape change
    y_val = data.data_val.targets
    X_pool = np.moveaxis(data.data_unlabelled.data, 3, 1)  # shape change
    y_pool = data.data_unlabelled.targets

    # initialize skorch classifier
    classifier = NeuralNetClassifier(
        model,
        criterion=torch.nn.CrossEntropyLoss,
        optimizer=torch.optim.Adam,
        train_split=predefined_split(Dataset(X_val, y_val)),
        verbose=1,
        device=device,
    )

    lit_model.summarize(mode="full")

    # initialize modal active learner
    print("Initializing model with base training set")
    learner = ActiveLearner(
        estimator=classifier,
        X_training=X_initial,
        y_training=y_initial,
        epochs=args.al_epochs_init,
        query_strategy=query_strategy,
    )

    _log_skorch_history(
        history=learner.estimator.history,
        al_iter=0,
        epoch_start=0,
        train_acc=learner.score(learner.X_training, learner.y_training),
        train_size=len(learner.y_training),
        wandb_logging=args.wandb,
    )

    # active learning loop
    for idx in range(args.al_n_iter):

        print("Active learning query no. %d" % (idx + 1))
        query_idx, _ = learner.query(X_pool, n_instances=args.al_samples_per_iter)
        learner.teach(
            X=X_pool[query_idx], y=y_pool[query_idx], only_new=args.al_incr_onlynew, epochs=args.al_epochs_incr
        )

        _log_skorch_history(
            history=learner.estimator.history,
            al_iter=idx + 1,
            epoch_start=args.al_epochs_init + idx * args.al_epochs_incr,
            train_acc=learner.score(learner.X_training, learner.y_training),
            train_size=len(learner.y_training),
            wandb_logging=args.wandb,
        )

        # remove queried instances from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx, axis=0)
    print("-- Training: Simple POS Probe --")
    train_data_X, train_data_y, train_vocab = create_data(
        os.path.join('data', 'sample' if use_sample else '',
                     'en_ewt-ud-train.conllu'), model, w2i)

    valid_data_X, valid_data_y, _ = create_data(os.path.join(
        'data', 'sample' if use_sample else '', 'en_ewt-ud-dev.conllu'),
                                                model,
                                                w2i,
                                                pos_vocab=train_vocab)

    train_X, train_y = transform_XY_to_concat_tensors(train_data_X,
                                                      train_data_y)
    valid_X, valid_y = transform_XY_to_concat_tensors(valid_data_X,
                                                      valid_data_y)
    valid_ds = Dataset(valid_X, valid_y)

    print('Training POS probe')
    train_acc = EpochScoring(scoring='accuracy',
                             on_train=True,
                             name='train_acc',
                             lower_is_better=False)
    early_stopping = EarlyStopping(monitor='valid_acc',
                                   patience=config.pos_probe_train_patience,
                                   lower_is_better=False)
    callbacks = [train_acc, early_stopping]

    embedding_size = train_data_X[0].shape[
        1]  # size of the word embedding (either 650 or 768)
    vocab_size = len(train_vocab)  #17
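
The snippet breaks off before the probe is actually built and trained; a minimal sketch of how the pieces prepared above could be wired together with skorch, assuming a simple linear probe module (MLPProbe is an illustrative name, not from the original source):

import torch.nn as nn
from skorch import NeuralNetClassifier
from skorch.helper import predefined_split

class MLPProbe(nn.Module):
    def __init__(self, embedding_size, vocab_size):
        super().__init__()
        self.fc = nn.Linear(embedding_size, vocab_size)

    def forward(self, X):
        return self.fc(X)

probe = NeuralNetClassifier(
    MLPProbe,
    module__embedding_size=embedding_size,
    module__vocab_size=vocab_size,
    criterion=nn.CrossEntropyLoss,           # targets assumed to be integer POS tag indices
    callbacks=callbacks,                     # train_acc + early_stopping defined above
    train_split=predefined_split(valid_ds),  # fixed validation set defined above
    max_epochs=50,
)
probe.fit(train_X, train_y)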
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper'
    )
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--valid_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs',
                        type=int,
                        default=50,
                        help='Number of epochs')
    parser.add_argument('--hidden_units',
                        type=int,
                        default=32,
                        help='Number of hidden units')
    parser.add_argument('--hidden_layers',
                        type=int,
                        default=1,
                        help='Number of hidden layers')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0005,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout',
                        type=float,
                        default=0,
                        help='dropout for optimizer')
    parser.add_argument('--weight_decay',
                        type=float,
                        default=0.0001,
                        help='weight decay for optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--validation_size',
                        type=float,
                        default=0.15,
                        help='validation split size')
    parser.add_argument(
        '--is_data_simulated',
        type=bool,
        default=False,
        help='boolean to check if data is simulated or from mimic')
    parser.add_argument(
        '--output_dir',
        type=str,
        default=None,
        help=
        'directory where trained model and loss curves over epochs are saved')
    parser.add_argument(
        '--output_filename_prefix',
        type=str,
        default=None,
        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = 'cpu'

    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(
        ',')
    x_valid_csv_filename, y_valid_csv_filename = args.valid_csv_files.split(
        ',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    x_dict, y_dict = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)
    # extract data
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_tstep')

    valid_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_valid_csv_filename,
        y_csv_path=y_valid_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_tstep')

    test_vitals = TidySequentialDataCSVLoader(x_csv_path=x_test_csv_filename,
                                              y_csv_path=y_test_csv_filename,
                                              x_col_names=feature_cols,
                                              idx_col_names=id_cols,
                                              y_col_name=args.outcome_col_name,
                                              y_label_type='per_tstep')

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_valid, y_valid = valid_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    N, T, F = X_train.shape

    #     from IPython import embed; embed()
    #     X_train = (X_train - np.min(X_train))/(np.max(X_train)-np.min(X_train))
    #     X_valid = (X_valid - np.min(X_train))/(np.max(X_train)-np.min(X_train))
    #     X_test = (X_test - np.min(X_train))/(np.max(X_train)-np.min(X_train))

    valid_ds = Dataset(X_valid, y_valid)

    print('number of time points : %s\nnumber of features : %s\n' % (T, F))

    # set class weights as 1/(number of samples in class) for each class to handle class imbalance
    class_weights = torch.tensor(
        [1 / (y_train == 0).sum(), 1 / (y_train == 1).sum()]).float()

    print('Number of training sequences : %s' % N)
    print('Number of test sequences : %s' % X_test.shape[0])
    print('Ratio positive in train : %.2f' %
          ((y_train == 1).sum() / len(y_train)))
    print('Ratio positive in test : %.2f' %
          ((y_test == 1).sum() / len(y_test)))

    # callback to compute gradient norm
    compute_grad_norm = ComputeGradientNorm(norm_type=2)

    # LSTM
    if args.output_filename_prefix is None:
        output_filename_prefix = (
            'hiddens=%s-layers=%s-lr=%s-dropout=%s-weight_decay=%s' %
            (args.hidden_units, args.hidden_layers, args.lr, args.dropout,
             args.weight_decay))
    else:
        output_filename_prefix = args.output_filename_prefix

    print('RNN parameters : ' + output_filename_prefix)

    loss_early_stopping_cp = EarlyStopping(monitor='valid_loss',
                                           patience=15,
                                           threshold=0.002,
                                           threshold_mode='rel',
                                           lower_is_better=True)

    rnn = RNNPerTStepBinaryClassifier(
        max_epochs=250,
        batch_size=args.batch_size,
        device=device,
        lr=args.lr,
        callbacks=[
            EpochScoring(calc_auprc,
                         lower_is_better=False,
                         on_train=True,
                         name='auprc_train'),
            EpochScoring(calc_auprc,
                         lower_is_better=False,
                         on_train=False,
                         name='auprc_valid'),
            EpochScoring(calc_auroc,
                         lower_is_better=False,
                         on_train=True,
                         name='auroc_train'),
            EpochScoring(calc_auroc,
                         lower_is_better=False,
                         on_train=False,
                         name='auroc_valid'),
            #               EpochScoring(calc_precision, lower_is_better=False, on_train=True, name='precision_train'),
            #               EpochScoring(calc_precision, lower_is_better=False, on_train=False, name='precision_valid'),
            #               EpochScoring(calc_recall, lower_is_better=False, on_train=True, name='recall_train'),
            #               EpochScoring(calc_recall, lower_is_better=False, on_train=False, name='recall_valid'),
            #               EpochScoring('roc_auc', lower_is_better=False, on_train=True, name='aucroc_score_train'),
            #               EpochScoring('roc_auc', lower_is_better=False, on_train=False, name='aucroc_score_valid'),
            #                   EarlyStopping(monitor='auprc_valid', patience=5, threshold=0.002, threshold_mode='rel',
            #                                                  lower_is_better=False),
            #               LRScheduler(policy=ReduceLROnPlateau, mode='max', monitor='aucroc_score_valid', patience=10),
            #                   compute_grad_norm,
            #               GradientNormClipping(gradient_clip_value=0.5, gradient_clip_norm_type=2),
            loss_early_stopping_cp,
            Checkpoint(monitor='auprc_valid',
                       f_history=os.path.join(
                           args.output_dir, output_filename_prefix + '.json')),
            TrainEndCheckpoint(dirname=args.output_dir,
                               fn_prefix=output_filename_prefix),
        ],
        #               criterion=torch.nn.CrossEntropyLoss,
        #               criterion__weight=class_weights,
        train_split=predefined_split(valid_ds),
        module__rnn_type='GRU',
        module__n_layers=args.hidden_layers,
        module__n_hiddens=args.hidden_units,
        module__n_inputs=X_train.shape[-1],
        module__dropout_proba=args.dropout,
        optimizer=torch.optim.Adam,
        optimizer__weight_decay=args.weight_decay)

    #     N=len(X_train)
    #     X_train = X_train[:N]
    #     y_train = y_train[:N]

    clf = rnn.fit(X_train, y_train)

    # get threshold with max recall at fixed precision
    fixed_precision = 0.1

    # get predict probas for y=1 on validation set
    keep_inds_va = torch.logical_not(
        torch.all(torch.isnan(torch.FloatTensor(X_valid)), dim=-1))
    y_va_pred_proba = clf.predict_proba(
        X_valid)[keep_inds_va][:, 1].detach().numpy()

    unique_probas = np.unique(y_va_pred_proba)
    thr_grid_G = np.linspace(np.percentile(unique_probas, 1),
                             max(unique_probas), 100)

    precision_scores_G, recall_scores_G = [
        np.zeros(thr_grid_G.size),
        np.zeros(thr_grid_G.size)
    ]
    for gg, thr in enumerate(thr_grid_G):
        #             logistic_clf.module_.linear_transform_layer.bias.data = torch.tensor(thr_grid[gg]).double()
        curr_thr_y_preds = clf.predict_proba(
            torch.FloatTensor(X_valid))[keep_inds_va][:, 1] >= thr_grid_G[gg]
        precision_scores_G[gg] = precision_score(y_valid[keep_inds_va],
                                                 curr_thr_y_preds)
        recall_scores_G[gg] = recall_score(y_valid[keep_inds_va],
                                           curr_thr_y_preds)

    keep_inds = precision_scores_G >= fixed_precision

    if keep_inds.sum() > 0:
        print('Choosing threshold with precision >= %.3f' % fixed_precision)
    else:
        fixed_precision_old = fixed_precision
        fixed_precision = np.percentile(precision_scores_G, 99)
        keep_inds = precision_scores_G >= fixed_precision
        print(
            'Could not find threshold with precision >= %.3f \n Choosing threshold to maximize recall at precision %.3f'
            % (fixed_precision_old, fixed_precision))

    thr_grid_G = thr_grid_G[keep_inds]
    precision_scores_G = precision_scores_G[keep_inds]
    recall_scores_G = recall_scores_G[keep_inds]
    thr_perf_df = pd.DataFrame(
        np.vstack([
            thr_grid_G[np.newaxis, :], precision_scores_G[np.newaxis, :],
            recall_scores_G[np.newaxis, :]
        ]).T,
        columns=['thr', 'precision_score', 'recall_score'])

    print(thr_perf_df)
    best_ind = np.argmax(recall_scores_G)
    best_thr = thr_grid_G[best_ind]
    print('chosen threshold : %.3f' % best_thr)

    splits = ['train', 'valid', 'test']
    #     data_splits = ((x_tr, y_tr), (x_va, y_va), (X_test, y_test))
    auroc_per_split, auprc_per_split, precisions_per_split, recalls_per_split = [
        np.zeros(len(splits)),
        np.zeros(len(splits)),
        np.zeros(len(splits)),
        np.zeros(len(splits))
    ]

    for ii, (X, y) in enumerate([(X_train, y_train), (X_valid, y_valid),
                                 (X_test, y_test)]):
        keep_inds = torch.logical_not(
            torch.all(torch.isnan(torch.FloatTensor(X)), dim=-1))
        y_pred_proba_pos = clf.predict_proba(X)[keep_inds][:,
                                                           1].detach().numpy()
        #         y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
        auroc_per_split[ii] = roc_auc_score(y[keep_inds], y_pred_proba_pos)
        #         y_pred_proba_pos = np.asarray(y_pred_proba_pos)
        auprc_per_split[ii] = average_precision_score(y[keep_inds],
                                                      y_pred_proba_pos)
        y_pred = y_pred_proba_pos >= best_thr
        precisions_per_split[ii] = precision_score(y[keep_inds], y_pred)
        recalls_per_split[ii] = recall_score(y[keep_inds], y_pred)

    auroc_train, auroc_valid, auroc_test = auroc_per_split
    auprc_train, auprc_valid, auprc_test = auprc_per_split
    precision_train, precision_valid, precision_test = precisions_per_split
    recall_train, recall_valid, recall_test = recalls_per_split

    # save performance
    perf_dict = {
        'auroc_train': auroc_train,
        'auroc_valid': auroc_valid,
        'auroc_test': auroc_test,
        'auprc_train': auprc_train,
        'auprc_valid': auprc_valid,
        'auprc_test': auprc_test,
        'precision_train': precision_train,
        'precision_valid': precision_valid,
        'precision_test': precision_test,
        'recall_train': recall_train,
        'recall_valid': recall_valid,
        'recall_test': recall_test,
        'threshold': best_thr
    }

    perf_df = pd.DataFrame([perf_dict])
    perf_csv = os.path.join(args.output_dir, output_filename_prefix + '.csv')
    print('Final performance on train, valid and test :\n')
    print(perf_df)

    print('Final performance saved to %s' % perf_csv)
    perf_df.to_csv(perf_csv, index=False)
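
calc_auprc and calc_auroc above are project-specific scorers. With skorch's EpochScoring, a callable scorer receives the net plus the X and y of the split being scored; a minimal sketch of such scorers for a flat binary setup (the original per-timestep version additionally masks NaN timesteps, as in the evaluation code above):

from sklearn.metrics import average_precision_score, roc_auc_score

def calc_auprc(net, X, y):
    proba = net.predict_proba(X)[:, 1]       # probability of the positive class
    return average_precision_score(y, proba)

def calc_auroc(net, X, y):
    proba = net.predict_proba(X)[:, 1]
    return roc_auc_score(y, proba)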
Example #18
# CREATE CLF
split = skorch.dataset.CVSplit(cv=5, stratified=True)
checkpoint = skorch.callbacks.Checkpoint(
    monitor='valid_loss_best',
    f_params='params.pt',
    dirname='{c}/checkpoint/rnn/'.format(c=CHALLENGE)
)
early_stopping = skorch.callbacks.EarlyStopping(
    monitor='valid_loss',
    patience=1,
    threshold=0,
    threshold_mode='rel',
    lower_is_better=True,
)
val_dataset = Dataset(X=xs['val'], y=ys['val'])
clf = skorch.NeuralNetClassifier(
    module=RNN,
    device=device,
    callbacks=[('early_stopping', early_stopping), ('checkpoint', checkpoint)],
    criterion=nn.CrossEntropyLoss,
    optimizer=torch.optim.Adam,
    train_split=predefined_split(val_dataset),
    iterator_train__shuffle=True,
    iterator_train__drop_last=True,
    iterator_valid__drop_last=False,
    iterator_train__batch_size=128,
    iterator_valid__batch_size=-1,  # use all examples
    verbose=1
)
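
A minimal usage sketch for the classifier above, assuming xs and ys also contain a 'train' split shaped like the validation data:

clf.fit(xs['train'], ys['train'])   # validation comes from the predefined split above
y_pred = clf.predict(xs['val'])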
Example #19
    def cv_split(self, X, y):
        """
        Splits the data into a training and a validation set.
        For single-fold validation this is a plain split. For n-fold cross-validation, because filtered
        and non-filtered datasets are combined and the training set changes when pre-whitening, the
        filtering and whitening have to be redone for every fold, which is what this method implements.
        :param X: input signals
        :param y: gold labels
        :return: the training and validation set for this cross-validation fold
        """
        assert self.n_preds_per_input is not None, \
            "Needs to run cut_input first to assign n_preds_per_input and input_time_length"
        if isinstance(X, np.ndarray):
            length = len(X)
        else:
            X = X.X
            length = len(X)
        if self.num_of_folds == -1:
            # index = int((length/100)*10)
            index = -1
            if index > -1:
                train_set = Dataset(X[:-index], y[:-index])
                valid_set = Dataset(X[-index:], y[-index:])
            else:
                train_set = Dataset(X[:], y[:])
                valid_set = self.test_set
            if self.double_training:
                second_X, _ = band_pass_data(valid_set.X, valid_set.y, order=3, cut_off_frequency=60, btype='hp')
                second_test_set = np.stack(second_X)
                # second_test_set = np.zeros(second_test_set.shape)
                i = 0
                while i < valid_set.X.shape[0]:
                    if i == 0:
                        full_train_set = np.stack(
                            [valid_set.X[i:i + 32], second_test_set[i:i + 32]])
                        full_train_set = np.moveaxis(full_train_set, 0, 3)
                        full_train_set = full_train_set.reshape(
                            [full_train_set.shape[0], full_train_set.shape[1], full_train_set.shape[2], 2])
                    else:
                        new_stack = np.stack(
                            [valid_set.X[i:i + 32], second_test_set[i:i + 32]])
                        new_stack = np.moveaxis(new_stack, 0, 3)
                        new_stack = new_stack.reshape([new_stack.shape[0], new_stack.shape[1], new_stack.shape[2], 2])
                        full_train_set = np.concatenate([full_train_set, new_stack])
                    i += 32
                valid_set = Dataset(full_train_set, self.test_set.y)
            return train_set, valid_set

        # this code executes when self.num_of_folds is larger than -1
        self.valid_indices = self.indices[self.fold_number]
        self.train_indices = []
        for i in range(self.num_of_folds):
            if i != self.fold_number:
                self.train_indices += list(self.indices[i])
        print('train indices:', self.train_indices)
        print('valid indices:', self.valid_indices)

        train_set = MyDataset([self.train_set.X[i] for i in self.train_indices],
                              [self.train_set.y[i] for i in self.train_indices])

        validation_set = MyDataset([self.train_set.X[i] for i in self.valid_indices],
                                   [self.train_set.y[i] for i in self.valid_indices])
        # print('Attention! shuffled set')
        # random_indices = get_random_permutation_with_no_fixed_point(self.valid_indices)
        # validation_set = MyDataset([self.train_set.X[i] for i in self.valid_indices],
        #                            [self.train_set.y[self.valid_indices[index]] for index in random_indices])

        if self.pre_whiten:
            train_set.X, channel_norms, iqr, median = whiten_data(train_set)
            validation_set.X, _, _, _ = whiten_data(validation_set, True, channel_normalizations=channel_norms,
                                                    iqrs=iqr)

        train_Xs, train_ys = train_set.X, train_set.y
        validation_Xs, validation_ys = validation_set.X, validation_set.y

        # filters the training set and validation set for the current cv fold
        if self.low_pass or self.low_pass_training:
            X, y = band_pass_data(train_Xs, train_ys, 15, 40, 'low')
            self.low_pass_train = Dataset(X, y)
            X, y = band_pass_data(validation_Xs, validation_ys, 15, 40, 'low')
            self.low_pass_test = Dataset(X, y)
        if self.low_pass:
            validation_set = self.low_pass_test
        if self.low_pass_training:
            train_set = self.low_pass_train

        if self.high_pass or self.valid_high_pass:
            X, y = band_pass_data(train_Xs, train_ys, 15, 60, 'hp')
            self.high_pass_train = Dataset(X, y)
            X, y = band_pass_data(validation_Xs, validation_ys, 15, 60, 'hp')
            self.high_pass_test = Dataset(X, y)
        if self.high_pass:
            train_set = self.high_pass_train
            validation_set = self.high_pass_test
        if self.valid_high_pass:
            validation_set = self.high_pass_test

        # cuts the input similarly to Data.cut_input()
        iterator = CropsFromTrialsIterator(batch_size=32,
                                           input_time_length=self.input_time_length,
                                           n_preds_per_input=self.n_preds_per_input)

        validation_set = concatenate_batches(validation_set, iterator, False)
        train_set = concatenate_batches(train_set, iterator, False)

        self.fold_number += 1
        return train_set, validation_set
Example #20
    test = test.reshape((-1, 32, 32, 3)).transpose(
        (0, 3, 1, 2)).astype(np.float32)
    # train = train.reshape((-1, 3, 48, 48)).astype(np.float32)
    # test = test.reshape((-1, 3, 48, 48)).astype(np.float32)
    # train = train.reshape((-1, 3, 96, 96)).astype(np.float32)
    # test = test.reshape((-1, 3, 96, 96)).astype(np.float32)
    # train = train.reshape((-1, 3, 224, 224)).astype(np.float32)
    # test = test.reshape((-1, 3, 224, 224)).astype(np.float32)
    train_label = train_label.astype(np.int64)
    test_label = test_label.astype(np.int64)
    print('training data size: ')
    print(train.shape)
    print('testing data size: ')
    print(test.shape)

    valid_ds = Dataset(test, test_label)

    # scd = CNN(args.round, SimpleNet_gtsrb)
    # scd = CNN(args.round, LeNet_gtsrb)
    # scd = CNN(args.round, SimpleNet_celeba)
    # scd = CNN(args.round, LeNet_celeba)
    # scd = CNN(args.round, SimpleNet_cifar)
    scd = CNN(args.round, LeNet_cifar)
    # scd = CNN(args.round, ResNet18)
    # models = []
    # for i in range(args.round):
    #     model = resnet50()
    #     model._modules['conv1'] = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
    #     model._modules['fc'] = nn.Linear(2048, args.n_classes, bias=True)
    #
    #     models.append(model)
Example #21
        specificity = np.zeros((folds, reps))
        auc_scores = np.zeros((folds, reps))
        train_time = np.zeros((folds, reps))
        best_acc = 0
        for k in range(folds):
            progressBar(k + 1, reps)
            X_train, X_test, y_train, y_test = train_test_split(X,
                                                                y,
                                                                test_size=0.3,
                                                                random_state=k)
            fprs, tprs = [], []

            for r in range(reps):

                # Load Training Dataset
                trainset = Dataset(X_train, y_train)
                trainloader = torch.utils.data.DataLoader(trainset,
                                                          batch_size=100,
                                                          shuffle=True)

                # Load testing Dataset
                testset = Dataset(X_test, y_test)
                testloader = torch.utils.data.DataLoader(testset,
                                                         batch_size=100,
                                                         shuffle=True)

                trainLoss, validLoss, trainAcc, validAcc, trainTime = train(
                    trainloader, testloader, net, device, r, opt, model)

                trainloss.append(trainLoss)
                valloss.append(validLoss)
    transformer.model.idfs,
)
print(f"Computing representations took: {time.time() - start_time}s.")

print("Started computing representations for validation data")
start_time = time.time()
val_X = get_tfidf_repr(val_organelle + val_bacteria + val_archea + val_eukarya,
                       k, transformer.model.idfs)
print(f"Computing representations took: {time.time() - start_time}s.")

train_y = np.array([0] * len(train_organelle) + [1] * len(train_bacteria) +
                   [3] * len(train_archea) + [4] * len(train_eukarya))
val_y = np.array([0] * len(val_organelle) + [1] * len(val_bacteria) +
                 [3] * len(val_archea) + [4] * len(val_eukarya))

valid_dataset = Dataset(val_X, val_y)

train_X = train_X / np.linalg.norm(train_X, axis=1).reshape((-1, 1))
val_X = val_X / np.linalg.norm(val_X, axis=1).reshape((-1, 1))

dim_in = 4**k
dim_out = 5

with open(sys.argv[1], "a") as handle:
    handle.write(
        "organelle: 0, bacteria:1, unknown: 2, archea: 3, eukarya: 4\n")

for i, architecture in enumerate(architectures):
    hid1 = architecture["hid1"]
    try:
        hid2 = architecture["hid2"]
Example #23
    def skorch_ds(self, data):
        from skorch.dataset import Dataset
        return Dataset(*data)
Example #24
            module__hidden_dim=opt['hidden_layer_dim'],
            optimizer__weight_decay=opt['l2_weight'],
            module__dropout=opt['dropout'],
            device='cuda',
            # Training
            max_epochs=opt['max_epochs'],
            batch_size=opt['batch_size'],
            callbacks=[
                Checkpoint(dirname=save_dir,
                           f_params='params.pt',
                           f_optimizer=None,
                           f_history=None,
                           monitor='valid_loss_best')
            ],
            # train_split is validation data
            train_split=predefined_split(Dataset(X_val, y_val)),
            # Optimizer
            optimizer=optim.Adam,
            lr=opt['learning_rate'],
            # Data
            iterator_train__shuffle=True,
            verbose=(runs == 1))

        net.fit(X_train, y_train)

        # Reload best valid loss checkpoint
        net.load_params(save_dir.joinpath('params.pt'))

        # Evaluate
        preds = net.predict(X_train)
        train_acc = accuracy_score(y_train, preds)
Example #25
    print(split_dict)

    # get the feature columns
    class_weights_ = torch.tensor(
        np.asarray([1 / ((y_train == 0).sum()), 1 / ((y_train == 1).sum())]))

    # normalize data
    scaler = StandardScaler()
    scaler.fit(x_train)
    x_train_transformed = scaler.transform(x_train)
    x_valid_transformed = scaler.transform(x_valid)
    x_test_transformed = scaler.transform(x_test)

    # store the fixed validation set as a skorch dataset
    valid_ds = Dataset(x_valid_transformed, y_valid)

    # set random seed
    torch.manual_seed(args.seed)

    # set max_epochs
    max_epochs = 100

    # define callbacks
    epoch_scoring_precision_train = EpochScoring('precision',
                                                 lower_is_better=False,
                                                 on_train=True,
                                                 name='precision_train')

    epoch_scoring_precision_valid = EpochScoring('precision',
                                                 lower_is_better=False,
Example #26
        callbacks=[
            cp,
            # ('lr_scheduler', schedule),  # Use with SGD optimizer
        ],
        lr=setting['initial_lr'],
        module__input_size=x_train.shape[1],
        module__hidden_layer_sizes=hidden_layer_sizes,
        module__dropout=setting['dropout'],
        module__output_size=output_size,
        criterion=criterion,
        optimizer=torch.optim.Adam,  # torch.optim.SGD,
        batch_size=128,
        warm_start=False,
        verbose=2,
        device='cuda',
        train_split=predefined_split(Dataset(x_val, y_val)),  # holdout val set
        optimizer__weight_decay=setting['wd'],
        # optimizer__momentum=setting['momentum'],  # Use with SGD optimizer
        optimizer__amsgrad=setting['amsgrad'],
    )

    if use_crossval:
        # This script is no longer intended to use cross-validation. Please use
        # the provided val and test sets in EgoCom.
        pass
    else:
        model.fit(x_train, y_train, epochs=epochs)
    print(" * Test Acc (last epoch): {:.6f}".format(model.score(
        x_test, y_test)))
    model.load_params(checkpoint=cp)
    print(" ** Test Acc (best val): {:.6f}".format(model.score(x_test,