Example #1
def main():
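    # Load the pre-split train/valid/test sets and merge them into a single X and y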
    [[X_train, y_train], [X_valid, y_valid],
     [X_test, y_test]] = util.load_data(DATA_PATH)

    X = X_train.append(X_valid).append(X_test)
    y = y_train.append(y_valid).append(y_test)
    del X_train
    del X_valid
    del X_test
    del y_train
    del y_valid
    del y_test
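    # Add squared and interaction terms, keep a full copy, then drop columns excluded from the model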
    X = make_squared(X, sq_fields)
    X = make_interactions(X, interactions)
    full = X.copy()
    X = X.drop([
        'studyArea', 'x', 'y', 'elev_srtm30', 'year',
        'varPrecip_growingSeason', 'precip_OctSep:varPrecip_growingSeason'
    ],
               axis=1)
    predictors = list(X)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = y['beetle'].values.reshape(-1)
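    # Fit an L2-regularized logistic regression to the full data set and keep the class-1 probabilities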
    logistic_clf = LogisticRegression(C=0.001, penalty='l2')
    logistic_clf.fit(X, y)
    probs = logistic_clf.predict_proba(X)
    probs = [p[1] for p in probs]

    X_df = pd.DataFrame(data=X, index=full.index, columns=predictors)
    X_df['year'] = full['year']
    X_df['x'] = full['x']
    X_df['y'] = full['y']
    X_df['probs'] = probs
    X_df['preds'] = X_df['probs'].apply(lambda x: 1
                                        if x >= OPTIMAL_THRESHOLD else 0)
    out_data = X_df.loc[X_df.year == 2000, ['x', 'y', 'probs', 'preds']]
    out_data = out_data.rename(columns={
        'probs': 'probs_2000',
        'preds': 'preds_2000'
    })
    out_data.index = out_data.apply(lambda row: str(row['x']) + str(row['y']),
                                    axis=1)

    for year in range(2001, 2015):
        year_data = X_df.loc[X_df.year == year, ['x', 'y', 'probs', 'preds']]
        year_data.index = year_data.apply(
            lambda row: str(row['x']) + str(row['y']), axis=1)
        out_data['probs_%s' % year] = year_data['probs']
        out_data['preds_%s' % year] = year_data['preds']
    out_data.index = range(out_data.shape[0])
    print(out_data.head())
    out_data.to_csv(HISTORIC_DATA_PATH + 'recent_data_fitted_no_beetle.csv')
Example #2
def main():
    ignore = [
        'year', 'studyArea', 'elev_srtm30', 'x', 'y', 'varPrecip_growingSeason'
    ]
    if HISTORIC:
        X_test = pd.read_csv(DATA_DIR + 'X_test.csv')
        x_min = X_test.x.min()
        y_min = X_test.y.min()
        x_max = X_test.x.max()
        y_max = X_test.y.max()
        print('x range: %d - %d' % (x_min, x_max))
        print('y range: %d - %d' % (y_min, y_max))
        t0 = time.time()
        for i in HISTORIC_YEARS:
            t_iter = time.time()
            path = '%sclean_%d.csv' % (HISTORIC_DATA_DIR, i)
            print('Reading data from %s...' % path)
            X = pd.read_csv(path)
            print('Filtering x, y ranges...')
            X = X.loc[((X.x >= x_min)
                       & (X.x <= x_max)
                       & (X.y >= y_min)
                       & (X.y <= y_max)), :]
            fields = [col for col in list(X) if col not in ignore]
            make_and_save_tensor(X, fields, i)
            iter_time = (time.time() - t_iter) / 60
            elapsed = (time.time() - t0) / 60
            print(
                '  Iteration time: %.2f minutes\n  Elapsed time: %.2f minutes' %
                (iter_time, elapsed))
    else:
        [[X_train, y_train], [X_valid, y_valid],
         [X_test, y_test]] = util.load_data(DATA_DIR)
        X_train, y_train = util.drop_nans(X_train, y_train,
                                          'varPrecip_growingSeason')
        X_valid, y_valid = util.drop_nans(X_valid, y_valid,
                                          'varPrecip_growingSeason')
        X_test, y_test = util.drop_nans(X_test, y_test,
                                        'varPrecip_growingSeason')
        fields = [col for col in list(X_test) if col not in ignore]
        for i in range(2006, 2015):
            make_and_save_tensor(X_train, fields, i)
            make_and_save_y_matrix(y_train, X_train, i)
        for i in range(2003, 2006):
            make_and_save_tensor(X_valid, fields, i)
            make_and_save_y_matrix(y_valid, X_valid, i)
        for i in range(2000, 2003):
            make_and_save_tensor(X_test, fields, i)
            make_and_save_y_matrix(y_test, X_test, i)
Example #3
def main():
    print('Loading data...')
    [[X_train, y_train], [X_valid, y_valid],
     [X_test, y_test]] = util.load_data(DATA_DIR)
    print('Merging data....')
    data = X_train.append(X_valid).append(X_test)
    y = y_train.append(y_valid).append(y_test)
    data['beetle'] = y
    print('Adding the following year beetle data...')
    [[X_train, y_train], [X_valid, y_valid],
     [X_test, y_test]] = make_new_data_sets(data)
    print('X_train: %s\ty_train: %s\n'
          'X_valid: %s\ty_valid: %s\n'
          'X_test:  %s\ty_test:  %s' %
          (X_train.shape, y_train.shape, X_valid.shape, y_valid.shape,
           X_test.shape, y_test.shape))
    print('Saving files as "X_train_full.csv" etc....')
    save_xy([X_train, y_train], DATA_DIR, 'train_full')
    save_xy([X_valid, y_valid], DATA_DIR, 'valid_full')
    save_xy([X_test, y_test], DATA_DIR, 'test_full')
Example #4
def predict_on_image(model, image_path, intensity_correction=0.0):
    """
    Use the model to give a prediction on a image.
    :param model: A keras model.
    :param image_path: The path to a image in geotiff format. Image should be 512x512 and in black and white.
    :return: The prediction image as a TrainingImage object.
    """

    # Load image
    training_image = model_utils.load_data(image_path, image_path)
    data_X = model_utils.convert_training_images_to_numpy_arrays([training_image])[0]
    data_X += intensity_correction / (2**8 - 1)  # Adjust for differing light levels in training and this dataset
    data_X = model_utils.fake_colors(data_X)

    prediction = model.predict(data_X)
    prediction = np.argmax(prediction, axis=-1)
    prediction = np.squeeze(prediction)

    training_image.labels = prediction

    return training_image
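
# A minimal usage sketch for predict_on_image (not part of the original example);
# the Keras model file and the GeoTIFF tile path below are placeholder assumptions.
from tensorflow import keras

model = keras.models.load_model("unet_512.h5")  # assumed pre-trained segmentation model
pred_image = predict_on_image(model, "tiles/example_tile.tif", intensity_correction=10.0)
print(pred_image.labels.shape)  # per-pixel class labels produced by np.argmax
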
def run_nn_models(sp,
                  n_folds,
                  combined_sbjs,
                  lp,
                  roi_proj_loadpath,
                  pats_ids_in=[
                      'EC01', 'EC02', 'EC03', 'EC04', 'EC05', 'EC06', 'EC07',
                      'EC08', 'EC09', 'EC10', 'EC11', 'EC12'
                  ],
                  n_evs_per_sbj=500,
                  test_day=None,
                  tlim=[-1, 1],
                  n_chans_all=140,
                  dipole_dens_thresh=0.2,
                  rem_bad_chans=True,
                  models=['eegnet_hilb', 'eegnet', 'rf'],
                  save_suffix='',
                  n_estimators=150,
                  max_depth=8,
                  overwrite=True,
                  dropoutRate=0.25,
                  kernLength=32,
                  F1=8,
                  D=2,
                  F2=16,
                  dropoutType='Dropout',
                  kernLength_sep=16,
                  rand_seed=1337,
                  loss='categorical_crossentropy',
                  optimizer='adam',
                  patience=5,
                  early_stop_monitor='val_loss',
                  do_log=False,
                  n_test=1,
                  n_val=4,
                  custom_rois=True,
                  n_train=7,
                  epochs=20,
                  compute_val='power',
                  ecog_srate=500,
                  half_n_evs_test='nopad',
                  trim_n_chans=True):
    '''
    Main function that prepares the data and aggregates accuracy values from model fitting.
    Note that the overwrite variable no longer does anything.
    Also note that ecog_srate is only needed for the frequency-sliding computation in the neural nets (if compute_val == 'freqslide').
    '''
    # Ensure pats_ids_in and models variables are lists
    if not isinstance(pats_ids_in, list):
        pats_ids_in = [pats_ids_in]
    if not isinstance(models, list):
        models = [models]

    # Save pickle file with dictionary of input parameters (useful for reproducible dataset splits and model fitting)
    params_dict = {
        'sp': sp,
        'n_folds': n_folds,
        'combined_sbjs': combined_sbjs,
        'lp': lp,
        'pats_ids_in': pats_ids_in,
        'n_evs_per_sbj': n_evs_per_sbj,
        'test_day': test_day,
        'tlim': tlim,
        'n_chans_all': n_chans_all,
        'dipole_dens_thresh': dipole_dens_thresh,
        'rem_bad_chans': rem_bad_chans,
        'models': models,
        'save_suffix': save_suffix,
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'overwrite': overwrite,
        'dropoutRate': dropoutRate,
        'kernLength': kernLength,
        'F1': F1,
        'D': D,
        'F2': F2,
        'dropoutType': dropoutType,
        'kernLength_sep': kernLength_sep,
        'rand_seed': rand_seed,
        'loss': loss,
        'optimizer': optimizer,
        'patience': patience,
        'early_stop_monitor': early_stop_monitor,
        'do_log': do_log,
        'n_test': n_test,
        'n_val': n_val,
        'n_train': n_train,
        'epochs': epochs,
        'compute_val': compute_val,
        'ecog_srate': ecog_srate,
        'trim_n_chans': trim_n_chans
    }
    f = open(sp + 'param_file.pkl', 'wb')
    pickle.dump(params_dict, f)
    f.close()

    # Set random seed
    np.random.seed(rand_seed)

    # Perform different procedures depending on whether or not multiple subjects are being fit together
    if combined_sbjs:
        # For multi-subject fits, obtain projection matrix and good regions of interest for each subject
        if custom_rois:
            # Load custom ROIs from precentral, postcentral, and inferior parietal regions (AAL2)
            custom_roi_inds = get_custom_motor_rois()
        else:
            custom_roi_inds = None
        print("Determining ROIs")
        proj_mat_out, good_ROIs, chan_ind_vals_all = proj_mats_good_rois(
            pats_ids_in,
            n_chans_all=n_chans_all,
            rem_bad_chans=rem_bad_chans,
            dipole_dens_thresh=dipole_dens_thresh,
            custom_roi_inds=custom_roi_inds,
            chan_cut_thres=n_chans_all,
            roi_proj_loadpath=roi_proj_loadpath)
        nROIs = len(good_ROIs)
        print("ROIs found")

        # Retain only the electrodes with nonzero data (initially padded because number of electrodes varies across subjects)
        # proj_mat_out : (len(pats_ids_in) x len(good_ROIs) x n_chans_all)
        if trim_n_chans:
            n_chans_all = len(
                np.nonzero(
                    proj_mat_out.reshape(
                        -1, proj_mat_out.shape[-1]).mean(axis=0))[0])
            proj_mat_out = proj_mat_out[..., :n_chans_all]
        np.save(sp + "proj_mat_out", proj_mat_out)

        # Load ECoG data (if test_day is None, then X_test_orig, y_test_orig, and sbj_order_test_load will be empty)
        X, y, X_test_orig, y_test_orig, sbj_order, sbj_order_test_load = load_data(
            pats_ids_in,
            lp,
            n_chans_all=n_chans_all,
            test_day=test_day,
            tlim=tlim)
        X[np.isnan(X)] = 0  # set all NaN's to 0
        # Identify the number of unique labels (or classes) present
        nb_classes = len(np.unique(y))

        # Choose which subjects for training/validation/testing for every fold (splits are based on random seed)
        sbj_inds_all_train, sbj_inds_all_val, sbj_inds_all_test = folds_choose_subjects(
            n_folds, pats_ids_in, n_test=n_test, n_val=n_val, n_train=n_train)

        # Iterate across all model types specified
        labels_unique = np.unique(y)
        if isinstance(n_evs_per_sbj, str):
            half_n_evs = n_evs_per_sbj
        else:
            half_n_evs = n_evs_per_sbj // len(labels_unique)


        # half_n_evs_test = 'nopad'  # avoid duplicating events for the test set (okay for train/val sets, where balancing trials across subjects is more important)
        train_inds_folds, val_inds_folds, test_inds_folds = [], [], []
        for k, modeltype in enumerate(models):
            accs = np.zeros([n_folds, 3])  # accuracy table for all NN models
            last_epochs = np.zeros([n_folds, 2])

            # For the number of folds, pick the events to use
            for i in tqdm(range(n_folds)):
                test_sbj = sbj_inds_all_test[i]
                val_sbj = sbj_inds_all_val[i]
                train_sbj = sbj_inds_all_train[i]

                # Only need to determine train/val/test inds for first modeltype used
                if k == 0:
                    # Find train/val/test indices (test inds differ depending on if test_day is specified or not)
                    # Note that subject_data_inds will balance number of trials across classes
                    train_inds, val_inds, test_inds = [], [], []
                    if test_day is None:
                        test_inds = subject_data_inds(np.full(1, test_sbj),
                                                      sbj_order, labels_unique,
                                                      i, 'test_inds',
                                                      half_n_evs_test, y, sp,
                                                      n_folds, test_inds,
                                                      overwrite)
                    else:
                        test_inds = subject_data_inds(
                            np.full(1, test_sbj), sbj_order_test_load,
                            labels_unique, i, 'test_inds', half_n_evs_test,
                            y_test_orig, sp, n_folds, test_inds, overwrite)
                    val_inds = subject_data_inds(val_sbj, sbj_order,
                                                 labels_unique, i, 'val_inds',
                                                 half_n_evs, y, sp, n_folds,
                                                 val_inds, overwrite)
                    train_inds = subject_data_inds(train_sbj, sbj_order,
                                                   labels_unique, i,
                                                   'train_inds', half_n_evs, y,
                                                   sp, n_folds, train_inds,
                                                   overwrite)
                    train_inds_folds.append(train_inds)
                    val_inds_folds.append(val_inds)
                    test_inds_folds.append(test_inds)
                else:
                    train_inds = train_inds_folds[i]
                    val_inds = val_inds_folds[i]
                    test_inds = test_inds_folds[i]

                # Now that we have the train/val/test event indices, generate the data for the models
                X_train = X[train_inds, ...]
                Y_train = y[train_inds]
                sbj_order_train = sbj_order[train_inds]
                X_validate = X[val_inds, ...]
                Y_validate = y[val_inds]
                sbj_order_validate = sbj_order[val_inds]
                if test_day is None:
                    X_test = X[test_inds, ...]
                    Y_test = y[test_inds]
                    sbj_order_test = sbj_order[test_inds]
                else:
                    X_test = X_test_orig[test_inds, ...]
                    Y_test = y_test_orig[test_inds]
                    sbj_order_test = sbj_order_test_load[test_inds]

                if modeltype == 'rf':
                    # For random forest, project data from electrodes to ROIs in advance
                    X_train_proj = roi_proj_rf(X_train, sbj_order_train, nROIs,
                                               proj_mat_out)
                    X_validate_proj = roi_proj_rf(X_validate,
                                                  sbj_order_validate, nROIs,
                                                  proj_mat_out)
                    X_test_proj = roi_proj_rf(X_test, sbj_order_test, nROIs,
                                              proj_mat_out)

                    # Create Random Forest classifier model
                    model = RandomForestClassifier(n_estimators=n_estimators,
                                                   max_depth=max_depth,
                                                   class_weight="balanced",
                                                   random_state=rand_seed,
                                                   n_jobs=1,
                                                   oob_score=True)

                    # Fit model and store train/val/test accuracies
                    t_fit_start = time.time()
                    clf = model.fit(X_train_proj, Y_train.ravel())
                    last_epochs[i, 1] = time.time() - t_fit_start
                    accs[i, 0] = accuracy_score(Y_train.ravel(),
                                                clf.predict(X_train_proj))
                    accs[i, 1] = accuracy_score(Y_validate.ravel(),
                                                clf.predict(X_validate_proj))
                    accs[i, 2] = accuracy_score(Y_test.ravel(),
                                                clf.predict(X_test_proj))
                    del X_train_proj, X_validate_proj, X_test_proj

                    # Save model
                    chckpt_path = sp + modeltype + '_fold' + str(
                        i) + save_suffix + '.sav'
                    pickle.dump(clf, open(chckpt_path, 'wb'))
                elif modeltype == 'riemann':
                    # Project data from electrodes to ROIs in advance
                    X_train_proj = roi_proj_rf(X_train, sbj_order_train, nROIs,
                                               proj_mat_out)
                    X_validate_proj = roi_proj_rf(X_validate,
                                                  sbj_order_validate, nROIs,
                                                  proj_mat_out)
                    X_test_proj = roi_proj_rf(X_test, sbj_order_test, nROIs,
                                              proj_mat_out)

                    # Reshape into 3 dimensions
                    X_train_proj2 = X_train_proj.reshape(
                        (X_train.shape[0], -1, X_train.shape[-1]))
                    X_validate_proj2 = X_validate_proj.reshape(
                        (X_validate.shape[0], -1, X_validate.shape[-1]))
                    X_test_proj2 = X_test_proj.reshape(
                        (X_test.shape[0], -1, X_test.shape[-1]))

                    # Find any events where std is 0
                    train_inds_bad = np.nonzero(
                        X_train_proj2.std(axis=-1).max(axis=-1) == 0)[0]
                    val_inds_bad = np.nonzero(
                        X_validate_proj2.std(axis=-1).max(axis=-1) == 0)[0]
                    test_inds_bad = np.nonzero(
                        X_test_proj2.std(axis=-1).max(axis=-1) == 0)[0]
                    if train_inds_bad.size > 0:
                        first_good_ind = np.setdiff1d(
                            np.arange(X_train_proj2.shape[0]),
                            train_inds_bad)[0]
                        X_train_proj2[train_inds_bad,
                                      ...] = X_train_proj2[(train_inds_bad *
                                                            0) +
                                                           first_good_ind, ...]
                    if val_inds_bad.size > 0:
                        first_good_ind = np.setdiff1d(
                            np.arange(X_validate_proj2.shape[0]),
                            val_inds_bad)[0]
                        X_validate_proj2[val_inds_bad, ...] = X_validate_proj2[
                            (val_inds_bad * 0) + first_good_ind, ...]
                    if test_inds_bad.size > 0:
                        first_good_ind = np.setdiff1d(
                            np.arange(X_test_proj2.shape[0]), test_inds_bad)[0]
                        X_test_proj2[test_inds_bad,
                                     ...] = X_test_proj2[(test_inds_bad * 0) +
                                                         first_good_ind, ...]

                    # Estimate covariances matrices
                    cov_data_train = pyriemann.estimation.Covariances(
                        'lwf').fit_transform(X_train_proj2)
                    cov_data_val = pyriemann.estimation.Covariances(
                        'lwf').fit_transform(X_validate_proj2)
                    cov_data_test = pyriemann.estimation.Covariances(
                        'lwf').fit_transform(X_test_proj2)

                    # Create MDM model
                    mdm = pyriemann.classification.MDM()

                    # Fit model and store train/val/test accuracies
                    t_fit_start = time.time()
                    clf = mdm.fit(cov_data_train, Y_train.ravel())
                    last_epochs[i, 1] = time.time() - t_fit_start
                    accs[i, 0] = accuracy_score(Y_train.ravel(),
                                                clf.predict(cov_data_train))
                    accs[i, 1] = accuracy_score(Y_validate.ravel(),
                                                clf.predict(cov_data_val))
                    accs[i, 2] = accuracy_score(Y_test.ravel(),
                                                clf.predict(cov_data_test))
                    del X_train_proj, X_validate_proj, X_test_proj

                    # Save model
                    chckpt_path = sp + modeltype + '_fold' + str(
                        i) + save_suffix + '.sav'
                    pickle.dump(clf, open(chckpt_path, 'wb'))
                else:
                    # Reformat data size for NN fitting
                    Y_train = np_utils.to_categorical(Y_train - 1)
                    X_train = np.expand_dims(X_train, 1)
                    Y_validate = np_utils.to_categorical(Y_validate - 1)
                    X_validate = np.expand_dims(X_validate, 1)
                    Y_test = np_utils.to_categorical(Y_test - 1)
                    X_test = np.expand_dims(X_test, 1)
                    proj_mat_out2 = np.expand_dims(proj_mat_out, 1)

                    # Fit NN model using Keras
                    chckpt_path = sp + 'checkpoint_gen_' + modeltype + '_fold' + str(
                        i) + save_suffix + '.h5'
                    accs_lst, last_epoch_tmp = cnn_model(
                        X_train,
                        Y_train,
                        X_validate,
                        Y_validate,
                        X_test,
                        Y_test,
                        chckpt_path,
                        modeltype,
                        proj_mat_out2,
                        sbj_order_train,
                        sbj_order_validate,
                        sbj_order_test,
                        nROIs=nROIs,
                        nb_classes=nb_classes,
                        dropoutRate=dropoutRate,
                        kernLength=kernLength,
                        F1=F1,
                        D=D,
                        F2=F2,
                        dropoutType=dropoutType,
                        kernLength_sep=kernLength_sep,
                        loss=loss,
                        optimizer=optimizer,
                        patience=patience,
                        early_stop_monitor=early_stop_monitor,
                        do_log=do_log,
                        epochs=epochs,
                        compute_val=compute_val,
                        ecog_srate=ecog_srate)

                    # Store train/val/test accuracies, and last epoch
                    for ss in range(3):
                        accs[i, ss] = accs_lst[ss]

                    last_epochs[i, :] = last_epoch_tmp

            # Save accuracies for all folds for one type of model
            np.save(
                sp + 'acc_gen_' + modeltype + '_' + str(n_folds) +
                save_suffix + '.npy', accs)
            np.save(
                sp + 'last_training_epoch_gen_tf' + modeltype + '_' +
                str(n_folds) + save_suffix + '.npy', last_epochs)

        # Returns average validation accuracy for hyperparameter tuning (will be for last model_type only)
        return accs[:, 1].mean()
    else:
        # Single subject model fitting
        for pat_id_curr in pats_ids_in:
            # Load ECoG data
            X, y, X_test, y_test, sbj_order, sbj_order_test = load_data(
                pat_id_curr,
                lp,
                n_chans_all=n_chans_all,
                test_day=test_day,
                tlim=tlim)
            X[np.isnan(X)] = 0  # set all NaN's to 0
            # Identify the number of unique labels (or classes) present
            nb_classes = len(np.unique(y))

            # Randomize event order (random seed facilitates consistency)
            order_inds = np.arange(len(y))
            np.random.shuffle(order_inds)
            X = X[order_inds, ...]
            y = y[order_inds]
            order_inds_test = np.arange(len(y_test))
            np.random.shuffle(order_inds_test)
            X_test = X_test[order_inds_test, ...]
            y_test = y_test[order_inds_test]

            # Iterate across all model types specified
            for modeltype in models:
                # Reformat data based on model
                if modeltype == 'rf':
                    y2 = y.copy()
                    y_test2 = y_test.copy()
                    X2 = X.copy()
                    X_test2 = X_test.copy()
                elif modeltype == 'riemann':
                    y2 = y.copy()
                    y_test2 = y_test.copy()
                    X2 = X.copy()
                    X_test2 = X_test.copy()
                else:
                    y2 = np_utils.to_categorical(y - 1)
                    y_test2 = np_utils.to_categorical(y_test - 1)
                    X2 = np.expand_dims(X, 1)
                    X_test2 = np.expand_dims(X_test, 1)

                # Create splits for train/val and fit model
                split_len = X2.shape[0] // n_folds
                accs = np.zeros([n_folds, 3])
                last_epochs = np.zeros([n_folds, 2])
                for frodo in range(n_folds):
                    val_inds = np.arange(0, split_len) + (frodo * split_len)
                    train_inds = np.setdiff1d(
                        np.arange(X2.shape[0]),
                        val_inds)  #take all events not in val set

                    # Split data and labels into train/val sets
                    X_train = X2[train_inds, ...]
                    Y_train = y2[train_inds]
                    X_validate = X2[val_inds, ...]
                    Y_validate = y2[val_inds]

                    if modeltype == 'rf':
                        # For random forest, combine electrodes and time dimensions
                        X_train_rf = X_train.reshape(X_train.shape[0], -1)
                        X_validate_rf = X_validate.reshape(
                            X_validate.shape[0], -1)
                        X_test2_rf = X_test2.reshape(X_test2.shape[0], -1)

                        # Create random forest model
                        model = RandomForestClassifier(
                            n_estimators=n_estimators,
                            max_depth=max_depth,
                            class_weight="balanced",
                            random_state=rand_seed,
                            n_jobs=1,
                            oob_score=True)

                        # Fit model and store accuracies
                        t_fit_start = time.time()
                        clf = model.fit(X_train_rf, Y_train.ravel())
                        last_epochs[frodo, 1] = time.time() - t_fit_start
                        accs[frodo,
                             0] = accuracy_score(Y_train.ravel(),
                                                 clf.predict(X_train_rf))
                        accs[frodo,
                             1] = accuracy_score(Y_validate.ravel(),
                                                 clf.predict(X_validate_rf))
                        accs[frodo,
                             2] = accuracy_score(y_test2.ravel(),
                                                 clf.predict(X_test2_rf))

                        # Save model
                        chckpt_path = sp+modeltype+'_'+pat_id_curr+'_testday_'+\
                                      str(test_day)+'_fold'+str(frodo)+save_suffix+'.sav'
                        pickle.dump(clf, open(chckpt_path, 'wb'))
                    elif modeltype == 'riemann':
                        # Find any events where std is 0
                        train_inds_bad = np.nonzero(
                            X_train.std(axis=-1).max(axis=-1) == 0)[0]
                        val_inds_bad = np.nonzero(
                            X_validate.std(axis=-1).max(axis=-1) == 0)[0]
                        test_inds_bad = np.nonzero(
                            X_test2.std(axis=-1).max(axis=-1) == 0)[0]
                        if train_inds_bad.size > 0:
                            first_good_ind = np.setdiff1d(
                                np.arange(X_train.shape[0]), train_inds_bad)[0]
                            X_train[train_inds_bad,
                                    ...] = X_train[(train_inds_bad * 0) +
                                                   first_good_ind, ...]
                        if val_inds_bad.size > 0:
                            first_good_ind = np.setdiff1d(
                                np.arange(X_validate.shape[0]),
                                val_inds_bad)[0]
                            X_validate[val_inds_bad,
                                       ...] = X_validate[(val_inds_bad * 0) +
                                                         first_good_ind, ...]
                        if test_inds_bad.size > 0:
                            first_good_ind = np.setdiff1d(
                                np.arange(X_test2.shape[0]), test_inds_bad)[0]
                            X_test2[test_inds_bad,
                                    ...] = X_test2[(test_inds_bad * 0) +
                                                   first_good_ind, ...]

                        # Estimate covariances matrices
                        cov_data_train = pyriemann.estimation.Covariances(
                            'lwf').fit_transform(X_train)
                        cov_data_val = pyriemann.estimation.Covariances(
                            'lwf').fit_transform(X_validate)
                        cov_data_test = pyriemann.estimation.Covariances(
                            'lwf').fit_transform(X_test2)

                        # Create MDM model
                        mdm = pyriemann.classification.MDM()

                        # Fit model and store train/val/test accuracies
                        t_fit_start = time.time()
                        clf = mdm.fit(cov_data_train, Y_train.ravel())
                        last_epochs[frodo, 1] = time.time() - t_fit_start
                        accs[frodo,
                             0] = accuracy_score(Y_train.ravel(),
                                                 clf.predict(cov_data_train))
                        accs[frodo,
                             1] = accuracy_score(Y_validate.ravel(),
                                                 clf.predict(cov_data_val))
                        accs[frodo,
                             2] = accuracy_score(y_test2.ravel(),
                                                 clf.predict(cov_data_test))

                        # Save model
                        chckpt_path = sp+modeltype+'_'+pat_id_curr+'_testday_'+\
                                      str(test_day)+'_fold'+str(frodo)+save_suffix+'.sav'
                        pickle.dump(clf, open(chckpt_path, 'wb'))
                    else:
                        # Fit NN model and store accuracies
                        chckpt_path = sp+'checkpoint_'+modeltype+'_'+pat_id_curr+'_testday_'+\
                                      str(test_day)+'_fold'+str(frodo)+save_suffix+'.h5'
                        accs_lst, last_epoch_tmp = cnn_model(
                            X_train,
                            Y_train,
                            X_validate,
                            Y_validate,
                            X_test2,
                            y_test2,
                            chckpt_path,
                            modeltype,
                            nb_classes=nb_classes,
                            dropoutRate=dropoutRate,
                            kernLength=kernLength,
                            F1=F1,
                            D=D,
                            F2=F2,
                            dropoutType=dropoutType,
                            kernLength_sep=kernLength_sep,
                            loss=loss,
                            optimizer=optimizer,
                            patience=patience,
                            early_stop_monitor=early_stop_monitor,
                            do_log=do_log,
                            epochs=epochs,
                            compute_val=compute_val,
                            ecog_srate=ecog_srate)

                        for ss in range(3):
                            accs[frodo, ss] = accs_lst[ss]

                        last_epochs[frodo, :] = last_epoch_tmp

                # Save accuracies (train/val/test)
                np.save(
                    sp + 'acc_' + modeltype + '_' + pat_id_curr + '_testday_' +
                    str(test_day) + save_suffix + '.npy', accs)
                np.save(
                    sp + 'last_training_epoch_gen_tf' + modeltype + '_' +
                    pat_id_curr + '_testday_' + str(test_day) + save_suffix +
                    '.npy', last_epochs)

        # Return validation accuracy for hyperparameter tuning (assumes only 1 model and 1 subject)
        return accs[:, 1].mean()
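
A hedged call sketch for run_nn_models is shown below; the directory paths and the reduced set of keyword arguments are illustrative assumptions, not values taken from the original project.

# Hypothetical invocation of run_nn_models; sp, lp, and roi_proj_loadpath are placeholder directories.
if __name__ == '__main__':
    mean_val_acc = run_nn_models(
        sp='/data/results/',  # output path for param_file.pkl, accuracies, and model checkpoints
        n_folds=3,
        combined_sbjs=True,  # use the multi-subject fitting branch
        lp='/data/ecog/',  # ECoG load path passed through to load_data
        roi_proj_loadpath='/data/proj_mats/',
        models=['eegnet_hilb', 'rf'],
        test_day=None,
        epochs=20)
    print('Mean validation accuracy (last model type): %.3f' % mean_val_acc)
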
Example #6
    # very bad
    'lstm': lstm,
    # very bad
    'gru': gru,
    # very good
    'bidirectional_lstm': bidirectional_lstm,
    'bidirectional_gru': bidirectional_gru
}


def create_model():
    return models_available[MODEL_NAME]()


print('loading the data')
data_train = model_utils.load_data()

texts = []
labels = []

intents = model_utils.load_labels()
intents_lookup = model_utils.get_intents_lookup(intents)

inputs = np.zeros((len(data_train), MAX_SEQUENCE_LENGTH, EMBEDDING_DIM))
for idx, (text, intent) in enumerate(data_train):
    encoded = model_utils.encode_sentence(text)
    # copy the values, equivalent of padding
    inputs[idx, :encoded.shape[0], :encoded.
           shape[1]] = encoded[:MAX_SEQUENCE_LENGTH, :]
    # append the id of the intent
    labels.append(intents_lookup[intent])
Example #7
def main():
    parser = argparse.ArgumentParser()
    add_test_args(parser)
    add_common_args(parser)
    args = parser.parse_args()

    x_train, y_train_biden, y_train_trump, mask_train, x_dev, y_dev_biden, y_dev_trump, mask_dev, container = model_utils.load_data(
        args.dataset_dir, dev_frac=args.dev_frac, max_entries=args.dataset_cap)

    model = model_utils.load_model(
        models.get_model(args.model)(), args.load_path)
    model.eval()

    # Change this line to hear other kinds of samples.
    dataset = x_train

    for i in range(dataset.shape[0]):
        print(f"Playing Combined {i}...")
        container.data = dataset[i, 0].numpy()
        play(container.invert())

        y_b, y_t = model(dataset[i:i + 1].abs())
        container.data = (torch.clamp(y_b / dataset[i:i + 1].abs(), 0, 1) *
                          dataset[i:i + 1]).detach().numpy()[0, 0]
        print(f"Playing model output {i}...")
        play(container.invert())
Example #8
                        type=str,
                        help="Model Architecture")
    parser.add_argument("--learning_rate",
                        default=0.005,
                        type=float,
                        help="Learning Rate")
    parser.add_argument("--hidden_units",
                        default=512,
                        type=int,
                        help="Hidden units")
    parser.add_argument("--epochs", default=10, type=int, help="Epochs")
    parser.add_argument("--gpu",
                        action='store_true',
                        default=False,
                        help="GPU")

    args = parser.parse_args()

    print('---------Parameters----------')
    print('gpu              = {!r}'.format(args.gpu))
    print('epoch(s)         = {!r}'.format(args.epochs))
    print('arch             = {!r}'.format(args.arch))
    print('learning_rate    = {!r}'.format(args.learning_rate))
    print('hidden_units     = {!r}'.format(args.hidden_units))
    print('-----------------------------')

    train_loader, valid_loader, _, class_to_idx = load_data(args.data_dir)
    best_model = training(args.arch, args.hidden_units, args.learning_rate,
                          args.epochs, args.save_dir, train_loader,
                          valid_loader, class_to_idx)
Example #9
def main():
    [[X_train, y_train], [X_valid, y_valid],
     [X_test, y_test]] = util.load_data(DATA_PATH)
    print('Merging data...')
    X = X_train.append(X_valid).append(X_test)
    y = y_train.append(y_valid).append(y_test)
    del X_train
    del X_valid
    del X_test
    del y_train
    del y_valid
    del y_test
    X = make_squared(X, SQ_FIELDS)
    X = make_interactions(X, INTERACTIONS)
    full = X.copy()
    full['beetle'] = y['beetle']
    X = X.drop([
        'studyArea', 'x', 'y', 'elev_srtm30', 'year',
        'varPrecip_growingSeason', 'precip_OctSep:varPrecip_growingSeason'
    ],
               axis=1)
    predictors = list(X)

    print('Fitting model to full data set...')
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = y['beetle'].values.reshape(-1)
    logistic_clf = LogisticRegression(C=0.001, penalty='l2')
    logistic_clf.fit(X, y)

    coefs = pd.DataFrame(
        [[pred, coef]
         for pred, coef in zip(predictors, logistic_clf.coef_[0])],
        columns=['predictor', 'coef'])
    coefs['abs'] = np.abs(coefs.coef)
    coefs = coefs.sort_values('abs', ascending=False)
    coefs = coefs.drop(['abs'], axis=1)
    print('Model Coefficients:\n', coefs)

    x_range, y_range = get_ranges(full, verbose=True)
    historic_years = range(1903, 2000)
    year = 1999
    next_year_data = full.loc[full.year == (year + 1), :]

    while year >= historic_years[0]:
        hist_data = pd.read_csv(HISTORIC_DATA_PATH + 'clean_%d.csv' % year)
        hist_data = mask_data(hist_data, x_range, y_range, verbose=False)
        hist_data = make_squared(hist_data, SQ_FIELDS)
        hist_data = make_interactions(hist_data, INTERACTIONS)

        print('\nBeginning predictions for', year)
        xy = next_year_data.apply(lambda row: str(row['x']) + str(row['y']),
                                  axis=1)
        print('  Reducing %d data to study area...' % year)
        extras = find_extra_rows(hist_data, xy)
        hist_data = hist_data.drop(extras, axis=0)
        hist_data = hist_data.rename(
            columns={'precipPreious_OctSep': 'precipPrevious_OctSep'})
        if year == historic_years[-1]:
            hist_merge = hist_data[['x', 'y']]
        print('  Ascertaining rows are aligned...')
        assert list(hist_data.x) == list(next_year_data.x)
        assert list(hist_data.y) == list(next_year_data.y)

        hist_data.index = next_year_data.index
        hist_merge.index = hist_data.index
        hist_essentials = pd.DataFrame(hist_data[predictors[0]])
        print('  Keeping essentials...')
        for p in predictors[1:]:
            hist_essentials[p] = hist_data[p]

        hist_essentials = scaler.fit_transform(hist_essentials)
        print('  Predicting...')
        probs = logistic_clf.predict_proba(hist_essentials)
        probs = np.array([prob[1] for prob in probs])
        hist_merge.loc[:, 'probs_%d' % year] = probs
        hist_merge.loc[:, 'preds_%d' % year] = list(
            map(lambda x: 1 if x >= OPTIMAL_THRESHOLD else 0, probs))
        print('  Saving data so far....')
        hist_merge.to_csv(HISTORIC_DATA_PATH + 'predictions_no_beetle.csv')

        year -= 1
        next_year_data = hist_data
Example #10
File: run.py  Project: aghie/hpac
    if args.model == models.LSTM_NAME:
        config.set("LSTM", "timesteps", args.timesteps)
    if args.model == models.CNN_NAME:
        config.set("CNN", "timesteps", args.timesteps)
    if args.model == models.LG_NAME:
        config.set("MLR", "timesteps", args.timesteps)
    if args.model == models.MLP_NAME:
        config.set("MLP", "timesteps", args.timesteps)

    config.set(args.model, "external_embeddings", embeddings)

    train_conf = dict(config.items("Settings"))
    print 'Loading training data...',
    words, labels = model_utils.load_data(args.training,
                                          path_spells,
                                          train=True)
    print "[OK]"

    path_weights = args.model_weights
    for n in range(1, args.S + 1):

        args.model_weights = path_weights.replace(".hdf5",
                                                  "_" + str(n) + ".hdf5")

        # Instantiating the model
        print 'Initializing model', args.model
        if args.model == models.LG_NAME:
            m = models.LogisticRegressionHP(conf=dict(config.items('MLR')),
                                            forms=words,
                                            labels=labels,
Example #11
parser.add_argument("--learning_rate",
                    type=float,
                    default=0.001,
                    help="set learning rate")
# hidden_units is irrelevant
parser.add_argument("--hidden_units",
                    type=int,
                    default=1024,
                    help="set hidden units")
parser.add_argument("--epochs", type=int, default=1, help="set epochs")
parser.add_argument("--gpu",
                    action="store_const",
                    const="cuda",
                    default="cpu",
                    help="use gpu")
parser.add_argument("--save_dir", help="save model")

args = parser.parse_args()

cat_to_name = read_json(args.category_names)

trainloader, testloader, validloader, train_data = load_data(args.data_dir)

model, history = make_NN(n_hidden=[args.hidden_units], n_epoch=args.epochs, labelsdict=cat_to_name, lr=args.learning_rate, device=args.gpu, \
                model_name=args.arch, trainloader=trainloader, validloader=validloader, train_data=train_data)

with open('loss_history.pickle', 'wb') as f:
    pickle.dump(history, f)

if args.save_dir:
    save_whole_model(model, args.save_dir)
Example #12

"""
Augment and save the dataset. The augmentations are rotations and flips, and fake colors are added as well. Results are saved as .tif files.
"""


if __name__ == '__main__':

    source_path = sys.argv[1]
    dest_path = sys.argv[2]

    # Load dataset
    image_paths = glob.glob(os.path.join(source_path, "*.tif"))
    for image_path in image_paths:
        image = model_utils.load_data(image_path, image_path.replace("images", "labels"))
        # Do preprocessing and image augmentation
        train_x, train_y = model_utils.convert_training_images_to_numpy_arrays([image], normalize=False)
        train_y = model_utils.replace_class(train_y, class_id=5)
        train_x = model_utils.fake_colors(train_x)
        train_x = model_utils.image_augmentation(train_x)
        train_y = model_utils.image_augmentation(train_y)

        # Save the images
        for i in range(train_x.shape[0]):
            augmented_image_x = train_x[i, :, :, :]
            augmented_image_y = train_y[i, :, :, :]
            augmented_image = data_processing.TrainingImage(augmented_image_x, augmented_image_y,
                                                            geo_transform=image.geo_transform,
                                                            projection=image.projection)
            data_output_path = os.path.join(dest_path, "images",
Example #13
def main():
    parser = argparse.ArgumentParser()
    add_train_args(parser)
    add_common_args(parser)
    args = parser.parse_args()
    add_experiment(args)
    device = model_utils.get_device()

    # Load dataset from disk
    x_train, y_train_biden, y_train_trump, mask_train, x_dev, y_dev_biden, y_dev_trump, mask_dev, container = model_utils.load_data(
        args.dataset_dir, dev_frac=args.dev_frac, max_entries=args.dataset_cap)
    train_dl = data.DataLoader(
        data.TensorDataset(x_train, y_train_biden, y_train_trump, mask_train),
        batch_size=args.train_batch_size,
        shuffle=True,
    )
    dev_dl = data.DataLoader(
        data.TensorDataset(x_dev, y_dev_biden, y_dev_trump, mask_dev),
        batch_size=args.val_batch_size,
        shuffle=False,
    )

    # Initialize a model
    model = models.get_model(args.model)()

    # load from checkpoint if path specified
    if args.load_path is not None:
        model = model_utils.load_model(model, args.load_path)

    # Move model to GPU if necessary
    model.to(device)

    # Initialize optimizer
    optimizer = optim.Adam(
        model.parameters(),
        lr=args.learning_rate,
        weight_decay=args.weight_decay,
    )

    # Scheduler
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=0.5,
        patience=30,
        verbose=True,
    )

    os.makedirs(f'{args.save_path}/{args.experiment}')
    print(f'Created new experiment: {args.experiment}')
    save_arguments(args, f'{args.save_path}/{args.experiment}/args.txt')

    # Train!
    trained_model = train_model(
        train_dl,
        dev_dl,
        model,
        optimizer,
        scheduler,
        args,
    )

    # Save trained model
    filename = f'{args.save_path}/{args.experiment}/{model.__class__.__name__}_trained.checkpoint'
    model_utils.save_model(trained_model, filename)
Example #14
train_path = '/nethome/zyu336/dl_catdog/data/train/'

img_shape = (3, 224, 224)
batch_size = 16
n_epochs = 20
sample_interval = 400

cuda = torch.cuda.is_available()
latent_dim = 256
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
dataloader, _, n_classes, _, _, _, _ = load_data('breeds_cat', train_transform,
                                                 train_transform, batch_size)

n_classes = len(n_classes)


class Flatten(nn.Module):
    def forward(self, x):
        N, C, H, W = x.size()
        return x.view(N, -1)


class Unflatten(nn.Module):
    def __init__(self, N=-1, C=3, H=224, W=224):
        super(Unflatten, self).__init__()
        self.N = N
        self.C = C
def transfer_learn_nn(lp, sp, model_type='eegnet_hilb', layers_to_finetune=None,
                      n_train_trials=50, per_train_trials=0.6, n_val_trials=50, per_val_trials=0.3,
                      n_test_trials=50, use_per_vals=False, loss='categorical_crossentropy', optimizer='adam',
                      patience=5, early_stop_monitor='val_loss', norm_rate=0.25, use_prev_opt_early_params=True,
                      single_sub=False, compute_val='power', ecog_srate=500, epochs=20,
                      data_lp=None, pats_ids_in=None, test_day=None, n_train_sbj=None, n_folds=None,
                      proj_mat_lp=None):
    '''
    Main function for performing transfer learning across folds. Matches the code from run_nn_models.py.

    If test_day == 'last', only the train and val trial counts (or percentages) need to be specified, because the test set is already determined.
    '''
    # Ensure layers_to_finetune is a list
    if (layers_to_finetune is not None) and (not isinstance(layers_to_finetune, list)):
        layers_to_finetune = [layers_to_finetune]
    
    # Create suffix for saving files (so can save results from different train/val sizes to same folder)
    if use_per_vals:
        suffix_trials = '_ptra'+str(int(per_train_trials*100))+'_pval'+str(int(per_val_trials*100))
    else:
        suffix_trials = '_ntra'+str(n_train_trials)+'_nval'+str(n_val_trials)+'_ntes'+str(n_test_trials)
    
    # Load param file from pre-trained model
    file_pkl = open(lp+'param_file.pkl', 'rb')
    params_dict = pickle.load(file_pkl)
    file_pkl.close()
    
    # Extract appropriate parameters from param file
    tlim = params_dict['tlim']
    if test_day is None:
        test_day = params_dict['test_day']
    if pats_ids_in is None:
        pats_ids_in = params_dict['pats_ids_in']
    rand_seed = params_dict['rand_seed']
    n_test_sbj = params_dict['n_test']
    n_val_sbj = params_dict['n_val']
    if n_folds is None:
        n_folds = params_dict['n_folds']
    save_suffix = params_dict['save_suffix']
    do_log = params_dict['do_log']
    if data_lp is None:
        data_lp = params_dict['lp']
    if n_train_sbj is None:
        if 'n_train' in list(params_dict.keys()):
            n_train_sbj = params_dict['n_train']
        else:
            n_train_sbj = 7
    
    if 'epochs' in list(params_dict.keys()):
        epochs = params_dict['epochs']
        compute_val = params_dict['compute_val']
        ecog_srate = params_dict['ecog_srate']
    if use_prev_opt_early_params:
        # Use model fitting parameters from pre-trained model
        loss = params_dict['loss']
        optimizer = params_dict['optimizer']
        patience = params_dict['patience']
        early_stop_monitor = params_dict['early_stop_monitor']
    
    # Load in hyperparameters
    dropoutRate = params_dict['dropoutRate']
    kernLength = params_dict['kernLength']
    F1 = params_dict['F1']
    D = params_dict['D']
    F2 = params_dict['F2']
    dropoutType = params_dict['dropoutType']
    kernLength_sep = params_dict['kernLength_sep']
    
    # Find pathnames of models from all folds
    model_fnames = natsort.natsorted(glob.glob(lp + 'checkpoint_gen_'+model_type+'_fold*.h5'))
    
    # Set random seed
    np.random.seed(rand_seed)
    
    # Load projection matrix (electrodes to ROI's) from pre-trained model
    if proj_mat_lp is None:
        proj_mat_out = np.load(lp+'proj_mat_out.npy')
        n_chans_all = len(np.nonzero(proj_mat_out.reshape(-1,proj_mat_out.shape[-1]).mean(axis=0))[0])
    else:
        proj_mat_out = np.load(proj_mat_lp+'proj_mat_out.npy')
        n_chans_all = params_dict['n_chans_all']
        if proj_mat_out.shape[-1]>n_chans_all:
            proj_mat_out = proj_mat_out[...,:n_chans_all]
        elif proj_mat_out.shape[-1]<n_chans_all:
            proj_mat_out_tmp = proj_mat_out.copy()
            proj_sh = [val for val in proj_mat_out_tmp.shape]
            proj_sh[-1] = n_chans_all
            proj_mat_out = np.zeros(proj_sh)
            proj_mat_out[...,:proj_mat_out_tmp.shape[-1]] = proj_mat_out_tmp

    # Load ECoG data for all subjects
    if test_day == 'last':
        # If test day is 'last', load in last day's data for all subjects
        X_all,y_all,X_test_last,y_test_last,sbj_order_all,sbj_order_test_last = load_data(pats_ids_in, data_lp,
                                                                                          n_chans_all=n_chans_all,
                                                                                          test_day=test_day, tlim=tlim)
    else:
        X_all,y_all,_,_,sbj_order_all,_ = load_data(pats_ids_in, data_lp,
                                                    n_chans_all=n_chans_all,
                                                    test_day=None, tlim=tlim)
    
    # Identify the number of unique labels (or classes) present
    nb_classes = len(np.unique(y_all))

    # Choose subjects for training/validation/testing for every fold (random seed keeps this consistent to pre-trained data)
    sbj_inds_all_train, sbj_inds_all_val, sbj_inds_all_test = folds_choose_subjects(n_folds, pats_ids_in,
                                                                                    n_test=n_test_sbj, n_val=n_val_sbj,
                                                                                    n_train=n_train_sbj)
    
    print("Subject indices are: ", sbj_inds_all_test, len(sbj_inds_all_test))
    # Determine train/val/test inds for every fold
    labels_unique = np.unique(y_all)
    nb_classes = len(labels_unique)
    half_n_evs_test = 'nopad' #avoids duplicating events (will take all available events)

    acc_pretrain = np.zeros([n_folds,3])
    acc_trained = acc_pretrain.copy()
    acc_single_sub = acc_pretrain.copy()
    acc_single_sub_0 = acc_single_sub.copy()
    last_epochs_TL = np.zeros([n_folds,2])
    last_epochs_SS = np.zeros([n_folds,2])
    for i in range(n_folds):
        # Determine subjects in train/val/test sets for current fold
        test_sbj = sbj_inds_all_test[i]
        val_sbj = sbj_inds_all_val[i]
        train_sbj = sbj_inds_all_train[i]

        # First, find indices for all events associated with test subject
        other_inds = subject_data_inds(np.full(1, test_sbj), sbj_order_all, labels_unique, i, 
                                       'test_inds', half_n_evs_test, y_all, sp, n_folds, [])
        trainval_inds = np.asarray(list(set(other_inds)))
        
        if test_day == 'last':
            # Find all events for last day for test subject
            test_inds = subject_data_inds(np.full(1, test_sbj), sbj_order_test_last, labels_unique, i, 
                                          'test_inds', half_n_evs_test, y_test_last, sp, n_folds, [])
            
            # Determine number of train and val events (trials) to obtain
            if use_per_vals:
                n_train = int(len(trainval_inds) * per_train_trials)
                n_val = int(len(trainval_inds) * per_val_trials)
            else:
                n_train = int(n_train_trials)
                n_val = int(n_val_trials)
            
            # Find train event indices
            train_inds_tmp = subject_data_inds(np.full(1, test_sbj), sbj_order_all[trainval_inds], labels_unique, i, 
                                               'train_inds', n_train//nb_classes, y_all[trainval_inds], sp, n_folds, []) 
            #I think above is supposed to be 'train_inds'
            train_inds = trainval_inds[train_inds_tmp] #convert back to original inds
            
            # Remove train events and choose val inds from remaining events
            # Note: if n_train is larger than available events for training data, finding validation events
            # will throw an error because there are no remaining events to choose from
            remain_inds = np.asarray(list(set(trainval_inds) - set(train_inds))) # remove train inds
            if len(remain_inds) == 0:
                sys.exit("Error: No data to pick from for validation set!")
            val_inds_tmp = subject_data_inds(np.full(1, test_sbj), sbj_order_all[remain_inds], labels_unique, i, 
                                             'val_inds', n_val//nb_classes, y_all[remain_inds], sp, n_folds, [])
            val_inds = remain_inds[val_inds_tmp] # convert back to original inds
            
        else:
            # If test_day is not last, then determine number of train, val, and test events (trials) to obtain
            if use_per_vals:
                n_train = int(len(other_inds) * per_train_trials)
                n_val = int(len(other_inds) * per_val_trials)
                n_test = int(len(other_inds) * (1-per_train_trials-per_val_trials))
            else:
                n_train = int(n_train_trials)
                n_val = int(n_val_trials)
                n_test = int(n_test_trials)
            
            # Find train event indices
            train_inds_tmp = subject_data_inds(np.full(1, test_sbj), sbj_order_all[trainval_inds], labels_unique, i, 
                                               'train_inds', n_train//nb_classes, y_all[trainval_inds], sp, n_folds, [])
            train_inds = trainval_inds[train_inds_tmp] # convert back to original inds
            
            # Remove train events and choose val inds from remaining events
            # Note: if n_train is larger than available events for training data, finding validation events
            # will throw an error because there are no remaining events to choose from
            valtest_inds = np.asarray(list(set(other_inds) - set(train_inds))) #remove train inds
            if len(valtest_inds) == 0:
                sys.exit("Error: No data to pick from for validation and test sets!")
            val_inds_tmp = subject_data_inds(np.full(1, test_sbj), sbj_order_all[valtest_inds], labels_unique, i, 
                                             'val_inds', n_val//nb_classes, y_all[valtest_inds], sp, n_folds, [])
            val_inds = valtest_inds[val_inds_tmp] # convert back to original inds
            
            # Remove val events and choose test inds from remaining events
            # Note: if n_train+n_val is larger than available events for training data, finding test events
            # will throw an error because there are no remaining events to choose from
            remain_inds = np.asarray(list(set(valtest_inds) - set(val_inds))) # remove val inds
            if len(remain_inds) == 0:
                sys.exit("Error: No data to pick from for test set!")
            test_inds_tmp = subject_data_inds(np.full(1, test_sbj), sbj_order_all[remain_inds], labels_unique, i, 
                                             'test_inds', n_test//nb_classes, y_all[remain_inds], sp, n_folds, [])
            test_inds = remain_inds[test_inds_tmp] # convert back to original inds

        # Generate train/val/test data based on event indices for each fold
        X_train = X_all[train_inds,...]
        Y_train = y_all[train_inds]
        sbj_order_train = sbj_order_all[train_inds] # important for projection matrix input
        X_validate = X_all[val_inds,...]
        Y_validate = y_all[val_inds]
        sbj_order_validate = sbj_order_all[val_inds] # important for projection matrix input
        if test_day is None:
            X_test = X_all[test_inds,...]
            Y_test = y_all[test_inds]
            sbj_order_test = sbj_order_all[test_inds] # important for projection matrix input
        else:
            # If test_day is last, use loaded data from last days only
            X_test = X_test_last[test_inds,...]
            Y_test = y_test_last[test_inds]
            sbj_order_test = sbj_order_test_last[test_inds] # important for projection matrix input

        # Reformat data size for NN
        Y_train = np_utils.to_categorical(Y_train-1)
        X_train = np.expand_dims(X_train,1)
        Y_validate = np_utils.to_categorical(Y_validate-1)
        X_validate = np.expand_dims(X_validate,1)
        Y_test = np_utils.to_categorical(Y_test-1)
        X_test = np.expand_dims(X_test,1)
        proj_mat_out2 = np.expand_dims(proj_mat_out,1)
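        # Expected shapes at this point (a sketch, assuming EEGNet-style Conv2D inputs):
        #   X_train/X_validate/X_test: (n_events, 1, n_channels, n_timepoints)
        #   Y_train/Y_validate/Y_test: (n_events, nb_classes) one-hot
        #   proj_mat_out2: proj_mat_out with a singleton axis added at position 1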
        
        # Run transfer learning
        str_len = len('checkpoint_gen_')
        curr_mod_fname = model_fnames[i].split('/')[-1][:-3]
        chckpt_path = sp+'checkpoint_gen_tf_'+curr_mod_fname[str_len:]+suffix_trials+'.h5'
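        # For illustration: if model_fnames[i] ends in 'checkpoint_gen_eegnet_hilb_fold0.h5',
        # then curr_mod_fname[str_len:] is 'eegnet_hilb_fold0' and chckpt_path becomes
        # sp+'checkpoint_gen_tf_eegnet_hilb_fold0'+suffix_trials+'.h5'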
        acc_pretrain_tmp, acc_trained_tmp, last_epoch_tmp = run_transfer_learning(model_fnames[i], sbj_order_train,
                                                                  X_train, Y_train, sbj_order_validate, 
                                                                  X_validate, Y_validate, sbj_order_test,
                                                                  X_test, Y_test,proj_mat_out2, chckpt_path,
                                                                  layers_to_finetune = layers_to_finetune,
                                                                  norm_rate = norm_rate, loss=loss,
                                                                  optimizer=optimizer, patience = patience,
                                                                  early_stop_monitor = early_stop_monitor,
                                                                  do_log=do_log, nb_classes = nb_classes,
                                                                  epochs = epochs)
        
        # Also train a single-subject model on the same amount of training and val data, for comparison
        if single_sub:
            test_sbj_name = pats_ids_in[int(test_sbj)]
            chckpt_path = sp+'checkpoint_gen_tf_single_sub_'+test_sbj_name+suffix_trials+'.h5'
            acc_single_sub_tmp, last_epoch_single_tmp, acc_single_sub_tmp_0 = run_single_sub_percent_compare(sbj_order_train, X_train, Y_train, 
                                          sbj_order_validate, X_validate, Y_validate, sbj_order_test, 
                                          X_test, Y_test, chckpt_path, norm_rate = norm_rate,
                                          loss=loss, optimizer=optimizer, patience = patience, 
                                          early_stop_monitor = early_stop_monitor, do_log=do_log, nb_classes = nb_classes,
                                          compute_val=compute_val, ecog_srate=ecog_srate, epochs = epochs,
                                          dropoutRate = dropoutRate, kernLength = kernLength, F1 = F1, D = D, F2 = F2, 
                                          dropoutType = dropoutType,kernLength_sep = kernLength_sep)
            acc_single_sub[i,:] = acc_single_sub_tmp
            last_epochs_SS[i,:] = last_epoch_single_tmp
            acc_single_sub_0[i,:] = acc_single_sub_tmp_0
        
        # Save train/val/test accuracies for every fold
        acc_pretrain[i,:] = acc_pretrain_tmp
        acc_trained[i,:] = acc_trained_tmp    
        last_epochs_TL[i,:] = last_epoch_tmp
        
        
    # Save accuracies across all folds (adds suffix for number/percentage of trials)
    np.save(sp+'acc_gen_tf_pretrain_'+model_type+'_'+str(n_folds)+save_suffix+suffix_trials+'.npy',acc_pretrain)
    np.save(sp+'acc_gen_tf_trained_'+model_type+'_'+str(n_folds)+save_suffix+suffix_trials+'.npy',acc_trained)
    np.save(sp+'last_training_epoch_gen_tf_'+model_type+'_'+str(n_folds)+save_suffix+suffix_trials+'.npy', last_epochs_TL)
    if single_sub:
        np.save(sp+'acc_gen_tf_singlesub_'+model_type+'_'+str(n_folds)+save_suffix+suffix_trials+'.npy',acc_single_sub)
        np.save(sp+'acc_gen_tf_singlesub0_'+model_type+'_'+str(n_folds)+save_suffix+suffix_trials+'.npy',acc_single_sub_0)
        np.save(sp+'last_training_epoch_gen_tf_singlesub_'+model_type+'_'+str(n_folds)
                +save_suffix+suffix_trials+'.npy', last_epochs_SS)
Example #16
0
def transfer_learn_nn_eeg(lp, sp, eeg_data_lp,
                          model_type = 'eegnet_hilb', layers_to_finetune = None,
                          n_train_trials = 50, per_train_trials = 0.6, n_val_trials = 50, per_val_trials = 0.3,
                          n_test_trials = 50, use_per_vals = False,loss='categorical_crossentropy', optimizer='adam',
                          patience=5,early_stop_monitor='val_loss',norm_rate=0.25,use_prev_opt_early_params=True,
                          single_sub=False, compute_val='power', ecog_srate=500, epochs = 20):
    '''
    Main script for performing transfer learning across folds. Matches code from run_nn_models.py.
    
    If doing test_day = 'last', only need to specify train and val trials/percent because test set is known.
    '''
    # Parameters for projection matrix
    custom_rois = True
    n_chans_eeg = 61
    n_chans_ecog = 126 # number of channels in ecog data (expected by model)
    per_test_trials = 0.2 # percentage of EEG data to use for test set
    
    # Ensure layers_to_finetune is a list
    if (layers_to_finetune is not None) and (not isinstance(layers_to_finetune, list)):
        layers_to_finetune = [layers_to_finetune]
    
    # Create suffix for saving files (so can save results from different train/val sizes to same folder)
    if use_per_vals:
        suffix_trials = '_ptra'+str(int(per_train_trials*100))+'_pval'+str(int(per_val_trials*100))
    else:
        suffix_trials = '_ntra'+str(n_train_trials)+'_nval'+str(n_val_trials)+'_ntes'+str(n_test_trials)
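    # For example, use_per_vals=True with per_train_trials=0.6 and per_val_trials=0.3
    # yields suffix_trials='_ptra60_pval30'; with use_per_vals=False and the default
    # trial counts it would be '_ntra50_nval50_ntes50'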
    
    # Load param file from pre-trained model
    file_pkl = open(lp+'param_file.pkl', 'rb')
    params_dict = pickle.load(file_pkl)
    file_pkl.close()
    
    # Extract appropriate parameters from param file
    tlim = params_dict['tlim']
    test_day = params_dict['test_day']
#     pats_ids_in = params_dict['pats_ids_in']
    pats_ids_in = ['EE'+str(val).zfill(2) for val in np.arange(1,16).tolist()]
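    # i.e. pats_ids_in = ['EE01', 'EE02', ..., 'EE15'] (15 EEG subjects, hard-coded here
    # instead of reading params_dict['pats_ids_in'])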
    rand_seed = params_dict['rand_seed']
    n_test_sbj = params_dict['n_test']
    n_val_sbj = params_dict['n_val']
    n_folds = params_dict['n_folds']
    save_suffix = params_dict['save_suffix']
    do_log = params_dict['do_log']
#     data_lp = params_dict['lp']
    if 'n_train' in list(params_dict.keys()):
        n_train_sbj = params_dict['n_train']
    else:
        n_train_sbj = 7
    
    if 'epochs' in list(params_dict.keys()):
        epochs = params_dict['epochs']
        compute_val = params_dict['compute_val']
        ecog_srate = params_dict['ecog_srate']
    if use_prev_opt_early_params:
        # Use model fitting parameters from pre-trained model
        loss = params_dict['loss']
        optimizer = params_dict['optimizer']
        patience = params_dict['patience']
        early_stop_monitor = params_dict['early_stop_monitor']
    
    # Load in hyperparameters
    dropoutRate = params_dict['dropoutRate']
    kernLength = params_dict['kernLength']
    F1 = params_dict['F1']
    D = params_dict['D']
    F2 = params_dict['F2']
    dropoutType = params_dict['dropoutType']
    kernLength_sep = params_dict['kernLength_sep']
    
    # Find pathnames of models from all folds
    model_fnames = natsort.natsorted(glob.glob(lp + 'checkpoint_gen_'+model_type+'_fold*.h5'))
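    # e.g. lp+'checkpoint_gen_eegnet_hilb_fold0.h5', lp+'checkpoint_gen_eegnet_hilb_fold1.h5', ...
    # one pre-trained ECoG checkpoint per fold, in natural (fold-number) order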
    
    # Set random seed
    np.random.seed(rand_seed)
    
    # Load projection matrix (electrodes to ROI's) for EEG data
    if custom_rois:
        custom_roi_inds = get_custom_motor_rois() # load custom roi's from precentral, postcentral, and inf parietal (AAL2)
    else:
        custom_roi_inds = None
    print("Determining ROIs")
    proj_mat_out,good_ROIs,chan_ind_vals_all = proj_mats_good_rois(['EE01_bH'],
                                                                   n_chans_all = n_chans_eeg,
                                                                   rem_bad_chans=False,
                                                                   dipole_dens_thresh=None,
                                                                   custom_roi_inds=custom_roi_inds,
                                                                   chan_cut_thres=n_chans_eeg,
                                                                   roi_proj_loadpath= eeg_data_lp+'proj_mat/')
    nROIs = len(good_ROIs)
    print("ROIs found")
    n_chans_all = n_chans_eeg

    # Load EEG data for each subject and fit model
    X_all,y_all,_,_,sbj_order_all,_ = load_data(pats_ids_in, eeg_data_lp, test_day=None, tlim=tlim, n_chans_all=n_chans_eeg)
    X_all[np.isnan(X_all)] = 0 # set all NaN's to 0
    
    for pat_ind,curr_pat in enumerate(pats_ids_in):
        # Identify the unique labels (classes) present
        labels_unique = np.unique(y_all)
        nb_classes = len(labels_unique)

        # Determine train/val/test inds for every fold
        half_n_evs_test = 'nopad' # avoids duplicating events (will take all available events)

        acc_pretrain = np.zeros([n_folds,3])
        acc_trained = acc_pretrain.copy()
        acc_single_sub = acc_pretrain.copy()
        acc_single_sub_0 = acc_single_sub.copy()
        last_epochs_TL = np.zeros([n_folds,2])
        last_epochs_SS = np.zeros([n_folds,2])
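        # acc_* arrays hold [train, val, test] accuracy for each fold;
        # last_epochs_* record the final training epoch reached (two values per fold)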
        for i in range(n_folds):
            # First, find indices for all events associated with test subject
            other_inds = subject_data_inds(np.full(1, pat_ind), sbj_order_all, labels_unique, i, 
                                           'test_inds', half_n_evs_test, y_all, sp, n_folds, [])
            trainval_inds = np.asarray(list(set(other_inds)))

            # Determine number of train, val, and test events (trials) to obtain
            if use_per_vals:
                n_train = int(len(other_inds) * per_train_trials*(1-per_test_trials))
                n_val = int(len(other_inds) * per_val_trials*(1-per_test_trials))
                n_test = int(len(other_inds) * per_test_trials)
            else:
                n_train = int(n_train_trials)
                n_val = int(n_val_trials)
                n_test = int(n_test_trials)
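            # Unlike the ECoG case above, there is no held-out "last day" for the EEG data,
            # so a test set (per_test_trials of the events) is carved out first, and val/train
            # events are then drawn from what remains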

            # Find test event indices
            test_inds_tmp = subject_data_inds(np.full(1, pat_ind), sbj_order_all[trainval_inds], labels_unique, i, 
                                               'test_inds', n_test//nb_classes, y_all[trainval_inds], sp, n_folds, [])
            
            test_inds = trainval_inds[test_inds_tmp] # convert back to original inds

            # Remove test events and choose val inds from remaining events
            # Note: if n_test is larger than the number of available events, finding validation events
            # will throw an error because there are no remaining events to choose from
            valtest_inds = np.asarray(list(set(other_inds) - set(test_inds))) # remove test inds
            if len(valtest_inds) == 0:
                sys.exit("Error: No data to pick from for validation and training sets!")
            val_inds_tmp = subject_data_inds(np.full(1, pat_ind), sbj_order_all[valtest_inds], labels_unique, i, 
                                             'val_inds', n_val//nb_classes, y_all[valtest_inds], sp, n_folds, [])
            val_inds = valtest_inds[val_inds_tmp] # convert back to original inds

            # Remove val events and choose train inds from remaining events
            # Note: if n_test+n_val is larger than the number of available events, finding training events
            # will throw an error because there are no remaining events to choose from
            remain_inds = np.asarray(list(set(valtest_inds) - set(val_inds))) # remove val inds
            if len(remain_inds) == 0:
                sys.exit("Error: No data to pick from for training set!")
            train_inds_tmp = subject_data_inds(np.full(1, pat_ind), sbj_order_all[remain_inds], labels_unique, i, 
                                             'train_inds', n_train//nb_classes, y_all[remain_inds], sp, n_folds, [])
            train_inds = remain_inds[train_inds_tmp] # convert back to original inds


            # Reformat data size for NN
            Y_all = np_utils.to_categorical(y_all-1)
            X_all_tmp = np.expand_dims(X_all,1)
            proj_mat_out2 = np.tile(proj_mat_out,[X_all_tmp.shape[0],1,1])
            proj_mat_out2 = np.expand_dims(proj_mat_out2,1)
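            # All EEG subjects share the same montage, so one electrode-to-ROI projection
            # matrix is tiled across events and zero subject indices are used below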

            # Pad channel dimension to match ECoG data
            X_all_sh = list(X_all_tmp.shape)
            X_all_sh[2] = n_chans_ecog
            X_all_resh = np.zeros(X_all_sh)
            X_all_resh[...,:n_chans_eeg,:] = X_all_tmp
            proj_mat_out3 = np.zeros(list(proj_mat_out2.shape[:-1])+[n_chans_ecog])
            proj_mat_out3[...,:n_chans_eeg] = proj_mat_out2
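            # Note: the padded EEG channels and the padded projection-matrix columns are all
            # zeros, so the extra (ECoG-sized) channel slots contribute nothing after projection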

            # Generate train/val/test data based on event indices for each fold
            X_train = X_all_resh[train_inds,...]
            Y_train = Y_all[train_inds,...]
            sbj_order_train = np.zeros(len(train_inds)).astype('int') # important for projection matrix input
            X_validate = X_all_resh[val_inds,...]
            Y_validate = Y_all[val_inds,...]
            sbj_order_validate = np.zeros(len(val_inds)).astype('int')  # important for projection matrix input
            
            X_test = X_all_resh[test_inds,...]
            Y_test = Y_all[test_inds,...]
            sbj_order_test = np.zeros(len(test_inds)).astype('int')  # important for projection matrix input
            # Note that sbj_order doesn't matter here because all EEG subjects have same electrode locations
            
            # Run transfer learning
            str_len = len('checkpoint_gen_')
            curr_mod_fname = model_fnames[i].split('/')[-1][:-3]
            chckpt_path = sp+'checkpoint_gen_tf_'+curr_pat+'_'+curr_mod_fname[str_len:]+suffix_trials+'.h5'
            acc_pretrain_tmp, acc_trained_tmp, last_epoch_tmp = run_transfer_learning(model_fnames[i], sbj_order_train,
                                                                      X_train, Y_train, sbj_order_validate, 
                                                                      X_validate, Y_validate, sbj_order_test,
                                                                      X_test, Y_test,proj_mat_out3, chckpt_path,
                                                                      layers_to_finetune = layers_to_finetune,
                                                                      norm_rate = norm_rate, loss=loss,
                                                                      optimizer=optimizer, patience = patience,
                                                                      early_stop_monitor = early_stop_monitor,
                                                                      do_log=do_log, nb_classes = nb_classes,
                                                                      epochs = epochs)

            # Also train a single-subject model on the same amount of training and val data, for comparison
            if single_sub:
                chckpt_path = sp+'checkpoint_gen_tf_single_sub_'+curr_pat+'_'+suffix_trials+'.h5'
                acc_single_sub_tmp, last_epoch_single_tmp, acc_single_sub_tmp_0 = run_single_sub_percent_compare(sbj_order_train, X_train, Y_train, 
                                              sbj_order_validate, X_validate, Y_validate, sbj_order_test, 
                                              X_test, Y_test, chckpt_path, norm_rate = norm_rate,
                                              loss=loss, optimizer=optimizer, patience = patience, 
                                              early_stop_monitor = early_stop_monitor, do_log=do_log, nb_classes = nb_classes,
                                              compute_val=compute_val, ecog_srate=ecog_srate, epochs = epochs,
                                              dropoutRate = dropoutRate, kernLength = kernLength, F1 = F1, D = D, F2 = F2, 
                                          dropoutType = dropoutType,kernLength_sep = kernLength_sep)
                acc_single_sub[i,:] = acc_single_sub_tmp
                last_epochs_SS[i,:] = last_epoch_single_tmp
                acc_single_sub_0[i,:] = acc_single_sub_tmp_0

            # Save train/val/test accuracies for every fold
            acc_pretrain[i,:] = acc_pretrain_tmp
            acc_trained[i,:] = acc_trained_tmp    
            last_epochs_TL[i,:] = last_epoch_tmp
        
        
        # Save accuracies across all folds (adds suffix for number/percentage of trials)
        np.save(sp+'acc_gen_tf_pretrain_'+curr_pat+'_'+model_type+'_'+str(n_folds)+save_suffix+\
                suffix_trials+'.npy',acc_pretrain)
        np.save(sp+'acc_gen_tf_trained_'+curr_pat+'_'+model_type+'_'+str(n_folds)+\
                save_suffix+suffix_trials+'.npy',acc_trained)
        np.save(sp+'last_training_epoch_gen_tf_'+curr_pat+'_'+model_type+'_'+str(n_folds)+\
                save_suffix+suffix_trials+'.npy', last_epochs_TL)
        if single_sub:
            np.save(sp+'acc_gen_tf_singlesub_'+curr_pat+'_'+model_type+'_'+str(n_folds)+\
                    save_suffix+suffix_trials+'.npy',acc_single_sub)
            np.save(sp+'acc_gen_tf_singlesub0_'+curr_pat+'_'+model_type+'_'+str(n_folds)+\
                    save_suffix+suffix_trials+'.npy',acc_single_sub_0)
            np.save(sp+'last_training_epoch_gen_tf_singlesub_'+curr_pat+'_'+model_type+'_'+str(n_folds)
                    +save_suffix+suffix_trials+'.npy', last_epochs_SS)
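
# A minimal usage sketch (not part of the original example): the paths below are hypothetical
# and assume a folder of pre-trained ECoG fold checkpoints plus 'param_file.pkl' (lp), an
# output folder (sp), and EEG data with a 'proj_mat/' subfolder (eeg_data_lp).
# if __name__ == '__main__':
#     transfer_learn_nn_eeg(lp='/path/to/ecog_models/', sp='/path/to/tf_results/',
#                           eeg_data_lp='/path/to/eeg_data/',
#                           model_type='eegnet_hilb', layers_to_finetune=None,
#                           use_per_vals=True, per_train_trials=0.6, per_val_trials=0.3,
#                           single_sub=True, epochs=20)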