Example #1
def committee(solver, solver_name, intervals, reps):
    np.random.seed()  # reseed from OS entropy so repeated runs differ
    X, y = util.basic_data()
    polls = util.add_noise(y)

    # start from four randomly chosen labeled points
    curr_labels = np.random.choice(range(len(X)), size=4, replace=False)
    square_errors = np.zeros([2, len(intervals)])
    for i in range(len(intervals)):
        print("interval: ", intervals[i])
        for j in range(reps):
            while len(curr_labels) <= intervals[i]:
                next_points = next_countys(solver, curr_labels, X, polls)
                curr_labels = np.append(curr_labels, next_points)
            curr_labels = curr_labels[:intervals[i]]
            preds = solver(X, X[curr_labels], polls[curr_labels])
            square_errors[:, i] += util.square_error(y, preds)
        square_errors[:, i] /= reps
    square_errors = np.vstack(
        (square_errors.mean(axis=0), util.performance(solver, intervals,
                                                      reps).mean(axis=0)))
    util.plot("committe",
              intervals / len(X),
              square_errors,
              legend=[solver_name, "random"],
              x_label="% counties",
              y_label="MSE",
              title="Committe")
Example #2
File: roar.py Project: jjbrophy47/trex
def experiment(args, logger, out_dir):
    """
    Main method that removes training instances ordered by
    different methods and measures their impact on a random
    set of test instances.
    """

    # start timer
    begin = time.time()

    # create random number generator
    rng = np.random.default_rng(args.rs)

    # get data
    data = util.get_data(args.dataset,
                         data_dir=args.data_dir,
                         preprocessing=args.preprocessing)
    X_train, X_test, y_train, y_test, feature, cat_indices = data

    # get tree-ensemble
    clf = util.get_model(args.model,
                         n_estimators=args.n_estimators,
                         max_depth=args.max_depth,
                         random_state=args.rs,
                         cat_indices=cat_indices)

    # use a fraction of the train data
    if 0.0 < args.train_frac < 1.0:
        n_train_samples = int(X_train.shape[0] * args.train_frac)
        train_indices = rng.choice(X_train.shape[0], size=n_train_samples, replace=False)
        X_train, y_train = X_train[train_indices], y_train[train_indices]

    # select a (stratified) subset of test instances uniformly at random
    _, X_test_sub, _, y_test_sub = train_test_split(X_test, y_test,
                                                    test_size=args.n_test,
                                                    random_state=args.rs,
                                                    stratify=y_test)

    # display dataset statistics
    logger.info('\nno. train instances: {:,}'.format(X_train.shape[0]))
    logger.info('no. test instances: {:,}'.format(X_test_sub.shape[0]))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))
    logger.info('pos. label % (test): {:.1f}%\n'.format(np.sum(y_test) / y_test.shape[0] * 100))

    # train a tree ensemble
    model = clone(clf).fit(X_train, y_train)
    util.performance(model, X_train, y_train, logger=logger, name='Train')

    # select a subset of test instances, half predicted negative and half predicted positive (supersedes the stratified sample above)
    model_pred_test = model.predict(X_test)
    neg_test_indices = np.where(model_pred_test == 0)[0]
    pos_test_indices = np.where(model_pred_test == 1)[0]
    neg_test_indices = rng.choice(neg_test_indices, size=int(args.n_test / 2), replace=False)
    pos_test_indices = rng.choice(pos_test_indices, size=int(args.n_test / 2), replace=False)
    test_indices = np.concatenate([neg_test_indices, pos_test_indices])
    X_test_sub, y_test_sub = X_test[test_indices], y_test[test_indices]

    util.performance(model, X_test_sub, y_test_sub, logger=logger, name='Test')

    # compute how many samples to remove before a checkpoint
    if args.train_frac_to_remove >= 1.0:  # treated as an absolute count per checkpoint
        n_checkpoint = int(args.train_frac_to_remove)

    elif args.train_frac_to_remove > 0:
        n_checkpoint = int(args.train_frac_to_remove * X_train.shape[0] / args.n_checkpoints)

    else:
        raise ValueError('invalid train_frac_to_remove: {}'.format(args.train_frac_to_remove))

    # sort train instances, then remove, retrain, and re-evaluate
    train_indices = sort_train_instances(args, model, X_train, y_train, X_test_sub, y_test_sub, rng, logger=logger)
    result = measure_performance(train_indices, n_checkpoint, args.n_checkpoints,
                                 clf, X_train, y_train, X_test_sub, y_test_sub, logger=logger)

    # save results
    result['max_rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    result['total_time'] = time.time() - begin
    np.save(os.path.join(out_dir, 'results.npy'), result)
    plt.savefig(os.path.join(out_dir, 'special_ckpt.pdf'), bbox_inches='tight')

    # display results
    logger.info('\nResults:\n{}'.format(result))
    logger.info('\nsaving results to {}...'.format(os.path.join(out_dir, 'results.npy')))
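measure_performance is defined elsewhere in roar.py. A minimal sketch of its remove/retrain/re-evaluate checkpoint loop, assuming a scikit-learn-style classifier (the real function also tracks more metrics and produces the plot saved above):

# Hedged sketch of a checkpointed remove/retrain/re-evaluate loop; the actual
# measure_performance in roar.py may record different metrics.
import numpy as np
from sklearn.base import clone
from sklearn.metrics import accuracy_score

def measure_performance_sketch(train_indices, n_checkpoint, n_checkpoints,
                               clf, X_train, y_train, X_test, y_test, logger=None):
    accs, n_removed = [], []
    for i in range(n_checkpoints + 1):
        remove = train_indices[:i * n_checkpoint]  # highest-ranked instances first
        keep = np.setdiff1d(np.arange(X_train.shape[0]), remove)
        model = clone(clf).fit(X_train[keep], y_train[keep])
        accs.append(accuracy_score(y_test, model.predict(X_test)))
        n_removed.append(len(remove))
        if logger:
            logger.info('removed: {:,}, acc: {:.3f}'.format(len(remove), accs[-1]))
    return {'acc': np.array(accs), 'n_removed': np.array(n_removed)}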
Example #3
def experiment(args, logger, out_dir):
    """
    Main method that removes training instances ordered by
    different methods and measures their impact on a random
    set of test instances.
    """

    # start timer
    begin = time.time()

    # create random number generator
    rng = np.random.default_rng(args.rs)

    # get data
    data = util.get_data(args.dataset,
                         data_dir=args.data_dir,
                         preprocessing=args.preprocessing)
    X_train, X_test, y_train, y_test, feature, cat_indices = data

    # get tree-ensemble
    clf = util.get_model(args.model,
                         n_estimators=args.n_estimators,
                         max_depth=args.max_depth,
                         random_state=args.rs,
                         cat_indices=cat_indices)

    # train a tree ensemble
    model = clone(clf).fit(X_train, y_train)
    util.performance(model, X_train, y_train, logger=logger, name='Train')
    util.performance(model, X_test, y_test, logger=logger, name='Test')

    # select a subset of test instances uniformly at random
    if args.start_pred == -1:

        # select an instance at random
        if args.n_test == 1:
            test_indices = rng.choice(X_test.shape[0],
                                      size=args.n_test,
                                      replace=False)
            X_test_sub, y_test_sub = X_test[test_indices], y_test[test_indices]

        # use the entire test set
        elif args.n_test <= 0:
            X_test_sub, y_test_sub = X_test, y_test

        # use a stratified sample of the test set
        else:
            n_test = args.n_test  # fraction if < 1.0, absolute count otherwise (n_test <= 0 handled above)
            _, X_test_sub, _, y_test_sub = train_test_split(
                X_test,
                y_test,
                test_size=n_test,
                random_state=args.rs,
                stratify=y_test)

    # select a subset of test instances of the desired predicted label uniformly at random
    elif args.start_pred in [0, 1]:

        # use the entire test set
        if args.n_test <= 0:
            X_test_sub, y_test_sub = X_test, y_test

        # select a specified no. instances from the specified class
        else:
            model_pred = model.predict(X_test)
            label_indices = np.where(model_pred == args.start_pred)[0]
            test_indices = rng.choice(label_indices,
                                      size=args.n_test,
                                      replace=False)
            X_test_sub, y_test_sub = X_test[test_indices], y_test[test_indices]

    else:
        raise ValueError('unknown start_pred: {}'.format(args.start_pred))

    # display dataset statistics
    logger.info('\nno. train instances: {:,}'.format(X_train.shape[0]))
    logger.info('no. test instances: {:,}'.format(X_test_sub.shape[0]))
    logger.info('no. features: {:,}\n'.format(X_train.shape[1]))

    # sort train instances, then remove, retrain, and re-evaluate
    result = measure_performance(args,
                                 clf,
                                 X_train,
                                 y_train,
                                 X_test_sub,
                                 y_test_sub,
                                 rng,
                                 out_dir,
                                 logger=logger)

    # save results
    result['max_rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    result['total_time'] = time.time() - begin
    np.save(os.path.join(out_dir, 'results.npy'), result)

    # display results
    logger.info('\nResults:\n{}'.format(result))
    logger.info('\nsaving results to {}...'.format(
        os.path.join(out_dir, 'results.npy')))
Example #4
def experiment(args, logger, out_dir, seed):
    """
    Main method comparing performance of tree ensembles and svm models.
    """

    # start experiment timer
    begin = time.time()

    # get data
    data = util.get_data(args.dataset,
                         data_dir=args.data_dir,
                         preprocessing=args.preprocessing)
    X_train, X_test, y_train, y_test, feature, cat_indices = data

    logger.info('no. train: {:,}'.format(X_train.shape[0]))
    logger.info('no. test: {:,}'.format(X_test.shape[0]))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # tune on a fraction of the training data
    if not args.no_tune and args.tune_frac < 1.0:
        sss = StratifiedShuffleSplit(n_splits=1,
                                     test_size=2,  # only the train split is used
                                     train_size=args.tune_frac,
                                     random_state=args.rs)
        tune_indices, _ = next(sss.split(X_train, y_train))
        X_train_sub, y_train_sub = X_train[tune_indices], y_train[tune_indices]
        logger.info('tune instances: {:,}'.format(X_train_sub.shape[0]))
    else:
        X_train_sub, y_train_sub = X_train, y_train

    # get model
    model, param_grid = get_model(args, cat_indices=cat_indices)
    logger.info('\nmodel: {}, param_grid: {}'.format(args.model, param_grid))

    # tune the model
    start = time.time()
    if not args.no_tune:
        skf = StratifiedKFold(n_splits=args.cv,
                              shuffle=True,
                              random_state=args.rs)
        gs = GridSearchCV(model,
                          param_grid,
                          scoring=args.scoring,
                          cv=skf,
                          verbose=args.verbose)
        gs = gs.fit(X_train_sub, y_train_sub)

        cols = ['mean_fit_time', 'mean_test_score', 'rank_test_score']
        cols += ['param_{}'.format(param) for param in param_grid.keys()]

        df = pd.DataFrame(gs.cv_results_)
        logger.info('gridsearch results:')
        logger.info(df[cols].sort_values('rank_test_score'))

        model = clone(gs.best_estimator_)
        logger.info('best params: {}'.format(gs.best_params_))

    tune_time = time.time() - start
    logger.info('tune time: {:.3f}s'.format(tune_time))

    # train model
    start = time.time()
    model = model.fit(X_train, y_train)
    train_time = time.time() - start
    logger.info('train time: {:.3f}s'.format(train_time))

    # evaluate
    auc, acc, ap, ll = util.performance(model,
                                        X_test,
                                        y_test,
                                        logger,
                                        name=args.model)

    # save results
    result = {}
    result['model'] = args.model
    result['auc'] = auc
    result['acc'] = acc
    result['ap'] = ap
    result['ll'] = ll
    result['tune_time'] = tune_time
    result['train_time'] = train_time
    result['max_rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    result['tune_frac'] = args.tune_frac
    if args.model in ['cb', 'rf'] and not args.no_tune:  # gs only exists after tuning
        result['n_estimators'] = gs.best_params_['n_estimators']
        result['max_depth'] = gs.best_params_['max_depth']
    np.save(os.path.join(out_dir, 'results.npy'), result)

    # macOS reports ru_maxrss in bytes; Linux reports it in KB
    logger.info('max_rss: {:,}'.format(result['max_rss']))
    logger.info('total time: {:.3f}s'.format(time.time() - begin))
    logger.info('saving results to {}...'.format(
        os.path.join(out_dir, 'results.npy')))
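get_model here returns both an estimator and its tuning grid for GridSearchCV. A minimal sketch of that contract for one model type (grid values are illustrative, not the project's):

# Hypothetical sketch of the (model, param_grid) contract used by the
# GridSearchCV block above; the project's actual models and grids may differ.
from sklearn.ensemble import RandomForestClassifier

def get_model_sketch(args, cat_indices=None):
    if args.model == 'rf':
        model = RandomForestClassifier(random_state=args.rs)
        param_grid = {'n_estimators': [10, 100, 250],
                      'max_depth': [3, 5, None]}
        return model, param_grid
    raise ValueError('unknown model: {}'.format(args.model))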
Example #5
import os

from lda import lda
# imread, pca, show_eigenface, show_reconstruction and performance are
# project helpers whose imports are not shown in this excerpt.

if __name__ == '__main__':
    filepath = os.path.join('Yale_Face_Database', 'Training')
    H, W = 231, 195
    X, y = imread(filepath, H, W)

    eigenvalues_pca, eigenvectors_pca, X_mean = pca(X, num_dim=31)
    X_pca = eigenvectors_pca.T @ (X - X_mean)
    eigenvalues_lda, eigenvectors_lda = lda(X_pca, y)

    # Transform matrix
    U = eigenvectors_pca @ eigenvectors_lda
    print('U shape: {}'.format(U.shape))

    # show top 25 eigenfaces
    show_eigenface(U, 25, H, W)

    # reduce dimensionality (project onto the fisherface space)
    Z = U.T @ X

    # recover
    X_recover = U @ Z + X_mean
    show_reconstruction(X, X_recover, 10, H, W)

    # accuracy
    filepath = os.path.join('Yale_Face_Database', 'Testing')
    X_test, y_test = imread(filepath, H, W)
    acc = performance(X_test, y_test, Z, y, U, X_mean, 5)
    print('acc: {:.2f}%'.format(acc * 100))
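performance above classifies test faces in the reduced space. A hedged sketch, assuming images are stored one per column as in the projection Z = U.T @ X, is a k-nearest-neighbor vote over the projected training images (the real helper may differ, e.g. in whether it centers the test images):

# Hypothetical k-NN classifier in the fisherface space; shapes assume one
# image per column, matching Z = U.T @ X above.
import numpy as np

def performance_sketch(X_test, y_test, Z, y_train, U, X_mean, k):
    Z_test = U.T @ (X_test - X_mean)  # project test images (centering assumed)
    correct = 0
    for j in range(Z_test.shape[1]):
        dists = np.linalg.norm(Z - Z_test[:, [j]], axis=0)
        knn_labels = y_train[np.argsort(dists)[:k]]
        values, counts = np.unique(knn_labels, return_counts=True)
        correct += values[np.argmax(counts)] == y_test[j]  # majority vote
    return correct / Z_test.shape[1]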
Example #6
def experiment(args, logger, out_dir):

    logger.info('\nDATA')
    start = time.time()

    in_dir = os.path.join(args.data_dir, args.dataset,
                          'fold_{}'.format(args.fold))

    # read in feature data
    logger.info('reading in data...')
    X_train = load_npz(
        os.path.join(in_dir,
                     '{}_train.npz'.format(args.feature_type))).tocsr()
    X_val = load_npz(
        os.path.join(in_dir, '{}_val.npz'.format(args.feature_type))).tocsr()
    X_test = load_npz(
        os.path.join(in_dir, '{}_test.npz'.format(args.feature_type))).tocsr()

    # read in label data
    train_df = pd.read_csv(os.path.join(in_dir, 'y_train.csv'))
    val_df = pd.read_csv(os.path.join(in_dir, 'y_val.csv'))
    test_df = pd.read_csv(os.path.join(in_dir, 'y_test.csv'))

    # filter out transductive test indices
    if args.test_type == 'inductive':
        indices = np.load(os.path.join(in_dir, 'inductive_indices.npz'))

        val_df = val_df[val_df['com_id'].isin(indices['val'])]
        test_df = test_df[test_df['com_id'].isin(indices['test'])]

        X_val = X_val[val_df.index]
        X_test = X_test[test_df.index]

    # extract label data
    y_train = train_df['label'].to_numpy()
    y_val = val_df['label'].to_numpy()
    y_test = test_df['label'].to_numpy()

    # extract identifier data
    target_ids_train = train_df['com_id'].to_numpy()
    target_ids_val = val_df['com_id'].to_numpy()
    target_ids_test = test_df['com_id'].to_numpy()

    logger.info('\ntrain instances: X: {}, y: {}'.format(
        X_train.shape, y_train.shape))
    logger.info('val   instances: X: {}, y: {}'.format(X_val.shape,
                                                       y_val.shape))
    logger.info('test  instances: X: {}, y: {}'.format(X_test.shape,
                                                       y_test.shape))
    logger.info('total time: {:.3f}s'.format(time.time() - start))

    # train
    logger.info('\nTRAIN')
    start = time.time()

    # setup models
    model = _get_model(args, data_dir=in_dir, logger=logger)

    if args.eggs:
        model = model.fit(X_train, y_train, target_ids_train, X_val, y_val,
                          target_ids_val)
    else:
        model = model.fit(X_train, y_train)
    logger.info('total time: {:.3f}s'.format(time.time() - start))

    # predict
    logger.info('\nPREDICT')
    start = time.time()

    if args.eggs:
        proba = model.predict_proba(X_test, target_ids_test)[:, 1]
    else:
        proba = model.predict_proba(X_test)[:, 1]

    auc, ap = util.performance(y_test, proba, logger=logger, name='model')
    logger.info('total time: {:.3f}s'.format(time.time() - start))

    # save results
    result = {'auc': auc, 'ap': ap}
    result['target_id'] = target_ids_test
    result['label'] = y_test
    result['yhat'] = proba
    np.save(os.path.join(out_dir, 'result.npy'), result)
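util.performance is called here with labels and positive-class probabilities and unpacked into (auc, ap). A minimal sketch of such a utility (an assumption, not the project's exact code):

# Hedged sketch of a performance utility returning AUC and average precision.
from sklearn.metrics import roc_auc_score, average_precision_score

def performance_sketch(y_true, proba, logger=None, name=''):
    auc = roc_auc_score(y_true, proba)
    ap = average_precision_score(y_true, proba)
    if logger:
        logger.info('[{}] AUC: {:.3f}, AP: {:.3f}'.format(name, auc, ap))
    return auc, ap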
Example #7
def experiment(args, logger, out_dir):
    """
    Main method that trains a tree ensemble, then compares the
    runtime of different methods to explain a single test instance.
    """

    # start timer
    begin = time.time()

    # create random number generator
    rng = np.random.default_rng(args.rs)

    # get data
    data = util.get_data(args.dataset,
                         data_dir=args.data_dir,
                         preprocessing=args.preprocessing)
    X_train, X_test, y_train, y_test, feature, cat_indices = data

    # get tree-ensemble
    clf = util.get_model(args.model,
                         n_estimators=args.n_estimators,
                         max_depth=args.max_depth,
                         random_state=args.rs,
                         cat_indices=cat_indices)

    logger.info('\nno. train instances: {:,}'.format(X_train.shape[0]))
    logger.info('no. test instances: {:,}'.format(X_test.shape[0]))
    logger.info('no. features: {:,}\n'.format(X_train.shape[1]))

    # train a tree ensemble
    model = clone(clf).fit(X_train, y_train)
    util.performance(model, X_train, y_train, logger=logger, name='Train')
    util.performance(model, X_test, y_test, logger=logger, name='Test')

    # randomly pick a test instance to explain
    test_ndx = rng.choice(y_test.shape[0], size=1, replace=False)

    # TREX
    if 'klr' in args.method or 'svm' in args.method:
        result = trex_method(args,
                             model,
                             test_ndx,
                             X_train,
                             y_train,
                             X_test,
                             logger=logger)

    # Leaf Influence
    elif 'leaf_influence' in args.method and args.model == 'cb':
        result = leaf_influence_method(args,
                                       model,
                                       test_ndx,
                                       X_train,
                                       y_train,
                                       X_test,
                                       y_test,
                                       logger=logger)

    # MAPLE
    elif args.method == 'maple':
        result = maple_method(args,
                              model,
                              test_ndx,
                              X_train,
                              y_train,
                              X_test,
                              logger=logger)

    # TEKNN
    elif 'knn' in args.method:
        result = teknn_method(args,
                              model,
                              test_ndx,
                              X_train,
                              y_train,
                              X_test,
                              logger=logger)

    else:
        raise ValueError('method {} unknown!'.format(args.method))

    # save results
    result['max_rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    result['total_time'] = time.time() - begin
    np.save(os.path.join(out_dir, 'results.npy'), result)

    # display results
    logger.info('\nResults:\n{}'.format(result))
    logger.info('\nsaving results to {}...'.format(
        os.path.join(out_dir, 'results.npy')))
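Each *_method call above returns a result dict of timings for one explainer. Their shared shape, sketched under the assumption of a fit-then-explain API (fit_fn and explain_fn are hypothetical stand-ins for the real explainers):

# Hypothetical skeleton of a runtime-measuring method; fit_fn and explain_fn
# are stand-ins, not real project or library calls.
import time

def timing_method_sketch(fit_fn, explain_fn, test_ndx, X_train, y_train, X_test):
    result = {}
    start = time.time()
    explainer = fit_fn(X_train, y_train)      # build the explainer
    result['fit_time'] = time.time() - start
    start = time.time()
    explain_fn(explainer, X_test[test_ndx])   # explain one test instance
    result['test_time'] = time.time() - start
    return result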
Example #8
def experiment(args, logger, out_dir):
    """
    Cleaning Experiment:
      1) Train a tree ensemble.
      2) Flip a percentage of train labels.
      3) Prioritize train instances to be checked using various methods.
      4) Check and correct any flipped train labels.
      5) Compute how effective each method is at cleaning the data.
    """

    # start timer
    begin = time.time()

    # get data
    data = util.get_data(args.dataset,
                         data_dir=args.data_dir,
                         preprocessing=args.preprocessing)
    X_train, X_test, y_train, y_test, feature, cat_indices = data

    # get tree-ensemble
    clf = util.get_model(args.model,
                         n_estimators=args.n_estimators,
                         max_depth=args.max_depth,
                         random_state=args.rs,
                         cat_indices=cat_indices)

    # use a subset of the training data
    if 0.0 < args.train_frac < 1.0:
        n_train = int(X_train.shape[0] * args.train_frac)
        X_train, y_train = X_train[:n_train], y_train[:n_train]

    logger.info('\nno. train instances: {:,}'.format(len(X_train)))
    logger.info('no. test instances: {:,}'.format(len(X_test)))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # add noise
    y_train_noisy, noisy_indices = flip_labels(y_train, k=args.flip_frac, seed=args.rs, logger=logger)
    noisy_indices = np.array(sorted(noisy_indices))
    logger.info('no. noisy labels: {:,}'.format(noisy_indices.shape[0]))

    # train a tree ensemble on the clean and noisy labels
    model = clone(clf).fit(X_train, y_train)
    model_noisy = clone(clf).fit(X_train, y_train_noisy)

    # show model performance before and after noise
    logger.info('\nBefore noise:')
    util.performance(model, X_train, y_train, logger=logger, name='Before, Train')
    util.performance(model, X_test, y_test, logger=logger, name='Before, Test')

    logger.info('\nAfter noise:')
    util.performance(model_noisy, X_train, y_train_noisy, logger=logger, name='After, Noisy Train')
    util.performance(model_noisy, X_train, y_train, logger=logger, name='After, Clean Train')
    util.performance(model_noisy, X_test, y_test, logger=logger, name='After, Test')

    # check predictive performance before and after noise
    acc_clean, auc_clean = score(model, X_test, y_test)
    acc_noisy, auc_noisy = score(model_noisy, X_test, y_test)

    # find how many corrupted / non-corrupted labels were incorrectly predicted
    predicted_labels = model_noisy.predict(X_train).flatten()
    incorrect_indices = np.where(y_train_noisy != predicted_labels)[0]
    incorrect_noisy_indices = np.intersect1d(noisy_indices, incorrect_indices)
    logger.info('\nno. incorrectly predicted noisy train instances: {:,}'.format(incorrect_noisy_indices.shape[0]))
    logger.info('no. incorrectly predicted train instances: {:,}'.format(incorrect_indices.shape[0]))

    # total no. instances to check and no. instances to check between checkpoints
    n_check = int(y_train.shape[0] * args.check_pct)
    n_checkpoint = int(n_check / args.n_checkpoints)

    # random
    if args.method == 'random':
        result = random_method(args, noisy_indices, n_check, n_checkpoint,
                               clf, X_train, y_train, X_test, y_test,
                               acc_noisy, auc_noisy, logger=logger)

    # TREX
    elif 'klr' in args.method or 'svm' in args.method:
        result = trex_method(args, model_noisy, y_train_noisy,
                             noisy_indices, n_check, n_checkpoint,
                             clf, X_train, y_train, X_test, y_test,
                             acc_noisy, auc_noisy, logger=logger)

    # tree-ensemble loss
    elif args.method == 'tree_loss':
        result = tree_loss_method(args, model_noisy, y_train_noisy,
                                  noisy_indices, n_check, n_checkpoint,
                                  clf, X_train, y_train, X_test, y_test,
                                  acc_noisy, auc_noisy, logger=logger)

    # Leaf Influence
    elif 'leaf_influence' in args.method and args.model == 'cb':
        result = leaf_influence_method(args, model_noisy, y_train_noisy,
                                       noisy_indices, n_check, n_checkpoint,
                                       clf, X_train, y_train, X_test, y_test,
                                       acc_noisy, auc_noisy, logger=logger)

    # MAPLE
    elif args.method == 'maple':
        result = maple_method(args, model_noisy,
                              noisy_indices, n_check, n_checkpoint,
                              clf, X_train, y_train, X_test, y_test,
                              acc_noisy, auc_noisy, logger=logger)

    # TEKNN
    elif 'knn' in args.method:
        result = teknn_method(args, model_noisy, y_train_noisy,
                              noisy_indices, n_check, n_checkpoint,
                              clf, X_train, y_train, X_test, y_test,
                              acc_noisy, auc_noisy, logger=logger)

    # Tree Prototype
    elif args.method == 'tree_prototype':
        result = tree_prototype_method(args, model_noisy, y_train_noisy,
                                       noisy_indices, n_check, n_checkpoint,
                                       clf, X_train, y_train, X_test, y_test,
                                       acc_noisy, auc_noisy, logger=logger)

    # MMD Prototype
    elif args.method == 'mmd_prototype':
        result = mmd_prototype_method(args, model_noisy, y_train_noisy,
                                      noisy_indices, n_check, n_checkpoint,
                                      clf, X_train, y_train, X_test, y_test,
                                      acc_noisy, auc_noisy, logger=logger)

    else:
        raise ValueError('unknown method {}'.format(args.method))

    # save results
    result['max_rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    result['total_time'] = time.time() - begin
    result['acc_clean'] = acc_clean
    result['auc_clean'] = auc_clean
    np.save(os.path.join(out_dir, 'results.npy'), result)

    # display results
    logger.info('\nResults:\n{}'.format(result))
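flip_labels is the noise injector used above; its signature is visible at the call site. A minimal sketch consistent with that call (the project's helper may differ):

# Hedged sketch of a label-flipping helper for binary labels.
import numpy as np

def flip_labels_sketch(y, k=0.1, seed=1, logger=None):
    rng = np.random.default_rng(seed)
    flip_indices = rng.choice(len(y), size=int(len(y) * k), replace=False)
    y_noisy = y.copy()
    y_noisy[flip_indices] = 1 - y_noisy[flip_indices]  # flip 0 <-> 1
    if logger:
        logger.info('flipped {:,} labels'.format(flip_indices.shape[0]))
    return y_noisy, flip_indices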
Example #9
def experiment(args, logger, out_dir):
    """
    Main method that removes training instances ordered by
    different methods and measures their impact on a random
    set of test instances.
    """

    # start timer
    begin = time.time()

    # create random number generator
    rng = np.random.default_rng(args.rs)

    # get data
    data = util.get_data(args.dataset,
                         data_dir=args.data_dir,
                         preprocessing=args.preprocessing)
    X_train, X_test, y_train, y_test, feature, cat_indices = data

    # get tree-ensemble
    clf = util.get_model(args.model,
                         n_estimators=args.n_estimators,
                         max_depth=args.max_depth,
                         random_state=args.rs,
                         cat_indices=cat_indices)

    # use a fraction of the train data
    if 0.0 < args.train_frac < 1.0:
        n_train_samples = int(X_train.shape[0] * args.train_frac)
        train_indices = rng.choice(X_train.shape[0],
                                   size=n_train_samples,
                                   replace=False)
        X_train, y_train = X_train[train_indices], y_train[train_indices]

    # train a tree ensemble
    model = clone(clf).fit(X_train, y_train)
    util.performance(model, X_train, y_train, logger=logger, name='Train')

    # select an ambiguously predicted test instance
    proba = model.predict_proba(X_test)[:, 1]
    sorted_indices = np.argsort(np.abs(proba - 0.5))
    test_indices = sorted_indices[:1]  # shape=(1,)
    X_test_sub, y_test_sub = X_test[test_indices], y_test[test_indices]

    # display dataset statistics
    logger.info('\nno. train instances: {:,}'.format(X_train.shape[0]))
    logger.info('no. test instances: {:,}'.format(X_test_sub.shape[0]))
    logger.info('no. features: {:,}\n'.format(X_train.shape[1]))
    logger.info('pos. label % (test): {:.1f}%\n'.format(
        np.sum(y_test) / y_test.shape[0] * 100))

    # sort train instances
    exc_indices, inh_indices = trex_method(args,
                                           model,
                                           X_train,
                                           y_train,
                                           X_test_sub,
                                           logger=logger)
    ran_indices = rng.choice(np.arange(X_train.shape[0]),
                             size=X_train.shape[0],
                             replace=False)
    ran_pos_indices = np.where(y_train == 1)[0]
    ran_neg_indices = np.where(y_train == 0)[0]
    rng.shuffle(ran_pos_indices)
    rng.shuffle(ran_neg_indices)

    # remove, retrain, and re-evaluate
    logger.info('\nremoving most excitatory train instances...')
    exc_result = measure_performance(args,
                                     exc_indices,
                                     clf,
                                     X_train,
                                     y_train,
                                     X_test_sub,
                                     y_test_sub,
                                     logger=logger)

    logger.info('\nremoving most inhibitory train instances...')
    inh_result = measure_performance(args,
                                     inh_indices,
                                     clf,
                                     X_train,
                                     y_train,
                                     X_test_sub,
                                     y_test_sub,
                                     logger=logger)

    logger.info('\nremoving train instances uniformly at random...')
    ran_result = measure_performance(args,
                                     ran_indices,
                                     clf,
                                     X_train,
                                     y_train,
                                     X_test_sub,
                                     y_test_sub,
                                     logger=logger)

    if args.extra_methods:
        logger.info('\nremoving positive train instances at random...')
        ran_pos_result = measure_performance(args,
                                             ran_pos_indices,
                                             clf,
                                             X_train,
                                             y_train,
                                             X_test_sub,
                                             y_test_sub,
                                             logger=logger)

        logger.info('\nremoving negative train instances at random...')
        ran_neg_result = measure_performance(args,
                                             ran_neg_indices,
                                             clf,
                                             X_train,
                                             y_train,
                                             X_test_sub,
                                             y_test_sub,
                                             logger=logger)

    # matplotlib settings
    util.plot_settings(fontsize=13)

    # inches
    width = 4.8  # Machine Learning journal
    height = get_height(width=width, subplots=(1, 1))
    fig, ax = plt.subplots(figsize=(width * 1.65, height * 1.0))

    # plot results
    l1 = ax.errorbar(exc_result['remove_pct'],
                     exc_result['proba'],
                     color='blue',
                     linestyle='--',
                     marker='.',
                     label='Most excitatory')
    l2 = ax.errorbar(inh_result['remove_pct'],
                     inh_result['proba'],
                     color='green',
                     linestyle='--',
                     marker='+',
                     label='Most inhibitory')
    l3 = ax.errorbar(ran_result['remove_pct'],
                     ran_result['proba'],
                     color='red',
                     linestyle='-',
                     marker='*',
                     label='Random')
    lines = [l1, l2, l3]
    labels = ['Most excitatory', 'Most inhibitory', 'Random']

    if args.extra_methods:
        l4 = ax.errorbar(ran_pos_result['remove_pct'],
                         ran_pos_result['proba'],
                         color='cyan',
                         linestyle=':',
                         marker='1',
                         label='Pos. random')
        l5 = ax.errorbar(ran_neg_result['remove_pct'],
                         ran_neg_result['proba'],
                         color='orange',
                         linestyle=':',
                         marker='2',
                         label='Neg. random')
        lines += [l4, l5]
        labels += ['Random (pos. only)', 'Random (neg. only)']

    ax.set_xlabel('Train data removed (%)')
    ax.set_ylabel('Predicted probability')
    ax.set_ylim(0, 1)

    # adjust legend
    fig.legend(tuple(lines),
               tuple(labels),
               loc='center left',
               ncol=1,
               bbox_to_anchor=(1.0, 0.85),
               title='Removal Ordering')
    plt.tight_layout()
    fig.subplots_adjust(right=0.65)

    # save plot
    plt.savefig(os.path.join(out_dir, 'probas.pdf'), bbox_inches='tight')

    # display results
    logger.info('\nsaving results to {}/...'.format(out_dir))
    logger.info('total time: {:.3f}s'.format(time.time() - begin))
Example #10
def experiment(args, logger, out_dir):

    # start timer
    begin = time.time()

    # create random number generator
    rng = np.random.default_rng(args.rs)

    # get data
    data = util.get_data(args.dataset,
                         data_dir=args.data_dir,
                         preprocessing=args.preprocessing,
                         mismatch=True)
    X_train, X_test, y_train, y_test, feature, cat_indices = data

    # get tree-ensemble
    clf = util.get_model(args.model,
                         n_estimators=args.n_estimators,
                         max_depth=args.max_depth,
                         random_state=args.rs,
                         cat_indices=cat_indices)

    # display dataset statistics
    logger.info('\nno. train instances: {:,}'.format(X_train.shape[0]))
    logger.info('no. test instances: {:,}'.format(X_test.shape[0]))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))
    logger.info('\npos. label % (train): {:.1f}%'.format(
        np.sum(y_train) / y_train.shape[0] * 100))
    logger.info('pos. label % (test): {:.1f}%\n'.format(
        np.sum(y_test) / y_test.shape[0] * 100))

    # train tree ensemble
    model = clone(clf).fit(X_train, y_train)
    util.performance(model, X_train, y_train, logger=logger, name='Train')
    util.performance(model, X_test, y_test, logger=logger, name='Test')

    # train surrogate model
    params = {
        'C': args.C,
        'n_neighbors': args.n_neighbors,
        'tree_kernel': args.tree_kernel
    }
    surrogate = trex.train_surrogate(model=model,
                                     surrogate='klr',
                                     X_train=X_train,
                                     y_train=y_train,
                                     val_frac=args.tune_frac,
                                     metric=args.metric,
                                     seed=args.rs,
                                     params=params,
                                     logger=logger)

    # extract predictions
    start = time.time()
    model_pred = model.predict(X_test)
    model_proba = model.predict_proba(X_test)[:, 1]
    logger.info('predicting...{:.3f}s'.format(time.time() - start))

    # pick a test instance from the Adult dataset whose age is <= 17
    indices = np.where(X_test[:, args.age_ndx] <= 17)[0]
    test_ndx = rng.choice(indices)
    age_test_val = X_test[test_ndx][args.age_ndx]
    x_test = X_test[[test_ndx]]

    # show prediction for this test instance
    s = '\ntest: {}, actual: {}, proba.: {:.3f}, age: {:.0f}'
    logger.info(
        s.format(test_ndx, y_test[test_ndx], model_proba[test_ndx],
                 age_test_val))

    # sort based on similarity-influence
    if 'sim' in args.surrogate:

        # compute influence based on predicted labels
        attributions = surrogate.similarity(x_test)
        pred_label = model.predict(x_test)

        # weight positively when a similar instance's label matches the predicted test label, negatively otherwise
        for i in range(x_test.shape[0]):
            attributions[i] = np.where(y_train == pred_label[i],
                                       attributions[i], attributions[i] * -1)
        attributions = attributions.sum(axis=0)

        attribution_indices = np.argsort(attributions)[::-1]

    # sort training instances by most influential to the predicted label
    else:
        attributions = surrogate.pred_influence(x_test,
                                                model_pred[[test_ndx]])[0]
        attribution_indices = np.argsort(attributions)[::-1]

    # sort training instances by most similar to the test instance
    sim = surrogate.similarity(x_test)[0]
    sim_indices = np.argsort(sim)[::-1]

    # get instance weights
    alpha = surrogate.get_alpha()

    # 1. show most influential training instances
    logger.info(
        '\nTop {:,} most influential samples to the predicted label...'.format(
            args.topk_inf))
    show_instances(args, X_train, alpha, sim, attributions, y_train,
                   attribution_indices, args.topk_inf, logger)

    # 2a. compute aggregate surrogate contribution of most influential train instances
    attr_all = np.sum(np.abs(attributions))
    attr_pos = np.sum(np.where(attributions > 0, attributions,
                               0))  # sum of pos. attributions only
    attr_topk_inf = np.sum(
        np.abs(attributions[attribution_indices][:args.topk_inf]))
    attr_topk_inf_pct = attr_topk_inf / attr_all * 100
    attr_topk_inf_pos_pct = attr_topk_inf / attr_pos * 100

    # 2b. display aggregate attributions
    s1 = '\nattribution % of top {:,} influential instances: {:.2f}%'
    s2 = 'attribution % of top {:,} influential instances for pos. attributions: {:.2f}%'
    logger.info(s1.format(args.topk_inf, attr_topk_inf_pct))
    logger.info(s2.format(args.topk_inf, attr_topk_inf_pos_pct))

    # 3. compute change in predicted probability after REMOVING the most influential instances
    s = 'test: {}, actual: {}, proba.: {:.3f}, age: {:.0f}'
    logger.info('\nRemoving top {:,} influential instances..'.format(
        args.topk_inf))
    new_X_train = np.delete(X_train,
                            attribution_indices[:args.topk_inf],
                            axis=0)
    new_y_train = np.delete(y_train, attribution_indices[:args.topk_inf])
    new_model = clone(clf).fit(new_X_train, new_y_train)
    util.performance(new_model,
                     new_X_train,
                     new_y_train,
                     logger=logger,
                     name='Train')
    util.performance(new_model, X_test, y_test, logger=logger, name='Test')
    logger.info(
        s.format(test_ndx, y_test[test_ndx],
                 new_model.predict_proba(X_test)[:, 1][test_ndx],
                 age_test_val))

    # 4a. compute change in predicted probability after FLIPPING the labels of the most influential instances
    s1 = 'test: {}, actual: {}, proba.: {:.3f}, age: {:.0f}'
    s2 = '\n{:,} out of the top {:,} most influential instances have age <= 17'
    logger.info(
        '\nFixing ONLY corrupted labels of the top {:,} influential instances..'
        .format(args.topk_inf))
    new_X_train = X_train.copy()
    new_y_train = y_train.copy()

    # 4b. fix the corrupted labels among the top-k influential instances
    temp_indices = np.where(
        X_train[attribution_indices][:args.topk_inf][:, args.age_ndx] <= 17)[0]
    # temp_indices index into the top-k slice, so map back through that slice
    age17_topk_inf_indices = attribution_indices[:args.topk_inf][temp_indices]
    new_y_train[age17_topk_inf_indices] = 0

    # 4c. fit new model and re-evaluate
    new_model = clone(clf).fit(new_X_train, new_y_train)
    util.performance(new_model,
                     new_X_train,
                     new_y_train,
                     logger=logger,
                     name='Train')
    util.performance(new_model, X_test, y_test, logger=logger, name='Test')
    logger.info(
        s1.format(test_ndx, y_test[test_ndx],
                  new_model.predict_proba(X_test)[:, 1][test_ndx],
                  age_test_val))
    logger.info(s2.format(age17_topk_inf_indices.shape[0], args.topk_inf))

    # 5. show most similar training instances
    logger.info(
        '\nTop {:,} most similar samples to the predicted label...'.format(
            args.topk_sim))
    show_instances(args, X_train, alpha, sim, attributions, y_train,
                   sim_indices, args.topk_sim, logger)

    # 6. of the most similar train instances, compute how many have age <= 17
    num_age17_topk = np.where(
        X_train[sim_indices][:args.topk_sim][:, args.age_ndx] <= 17)[0].shape[0]
    logger.info(
        '\nTop {:,} most similar train instances with age <= 17: {:,}'.format(
            args.topk_sim, num_age17_topk))

    # 7. plot similarity of train instances against their instance weights
    logger.info('\nplotting similarity vs. weights...')
    plot_similarity(args, alpha, sim, out_dir, logger)

    # 8. no. train instances with age <= 17 and an alpha coefficient < 0
    neg_alpha_indices = np.where(attributions < 0)[0]
    num_age17_neg_alpha = np.where(
        X_train[neg_alpha_indices][:, args.age_ndx] <= 17)[0].shape[0]
    logger.info('\nno. instances with age <= 17 and alpha < 0: {:,}'.format(
        num_age17_neg_alpha))

    # 9. no. train instances with age <= 17 and an alpha coefficient >= 0
    pos_alpha_indices = np.where(attributions >= 0)[0]
    num_age17_pos_alpha = np.where(
        X_train[pos_alpha_indices][:, args.age_ndx] <= 17)[0].shape[0]
    logger.info('no. instances with age <= 17 and alpha >= 0: {:,}'.format(
        num_age17_pos_alpha))

    # 10. no. train instances with age <= 17, similarity > threshold and an alpha coefficient < 0
    s = 'no. instances with age <= 17, sim > {:.2f} and alpha < 0: {:,}'
    neg_alpha_indices = np.where((attributions < 0)
                                 & (sim > args.sim_thresh))[0]
    num_age17_sim_neg_alpha = np.where(
        X_train[neg_alpha_indices][:, args.age_ndx] <= 17)[0].shape[0]
    logger.info(s.format(args.sim_thresh, num_age17_sim_neg_alpha))

    # 11. no. train instances with age <= 17, similarity > threshold and an alpha coefficient >= 0
    s = 'no. instances with age <= 17, sim > {:.2f} and alpha >= 0: {:,}'
    pos_alpha_indices = np.where((attributions >= 0)
                                 & (sim > args.sim_thresh))[0]
    num_age17_sim_pos_alpha = np.where(
        X_train[pos_alpha_indices][:, args.age_ndx] <= 17)[0].shape[0]
    logger.info(s.format(args.sim_thresh, num_age17_sim_pos_alpha))

    # display total time
    logger.info('\ntotal time: {:.3f}s'.format(time.time() - begin))
Example #11
def experiment(args, logger, out_dir):
    """
    Main method that removes training instances ordered by
    different methods and measures their impact on a random
    set of test instances.
    """

    # start timer
    begin = time.time()

    # create random number generator
    rng = np.random.default_rng(args.rs)

    # get data
    data = util.get_data(args.dataset,
                         data_dir=args.data_dir,
                         preprocessing=args.preprocessing)
    X_train, X_test, y_train, y_test, feature, cat_indices = data

    # get tree-ensemble
    clf = util.get_model(args.model,
                         n_estimators=args.n_estimators,
                         max_depth=args.max_depth,
                         random_state=args.rs,
                         cat_indices=cat_indices)

    # train a tree ensemble
    model = clone(clf).fit(X_train, y_train)
    util.performance(model, X_train, y_train, logger=logger, name='Train')
    util.performance(model, X_test, y_test, logger=logger, name='Test')

    # compute loss of each test instance
    proba = model.predict_proba(X_test)
    losses = np.abs(proba[:, 1] - y_test)  # per-instance L1 loss

    # select instances with an L1 loss >= 0.9
    test_indices = np.where(losses >= 0.9)[0]
    n_pos = y_test[test_indices].sum()

    logger.info(
        '\nNo. test instances w/ L1 loss >= 0.9: {:,}, no. pos.: {:,}'.format(
            len(test_indices), n_pos))

    for i, ndx in enumerate(test_indices[:args.n_test]):
        logger.info('\n\n[#{:,}] Test {}, loss: {:.3f}'.format(
            i + 1, ndx, losses[ndx]))

        X_test_sub = X_test[[ndx]]
        y_test_sub = y_test[[ndx]]
        instance_dir = os.path.join(out_dir, 'test_{}'.format(i))

        os.makedirs(instance_dir, exist_ok=True)

        # display dataset statistics
        logger.info('\nno. train instances: {:,}'.format(X_train.shape[0]))
        logger.info('no. test instances: {:,}'.format(X_test_sub.shape[0]))
        logger.info('no. features: {:,}'.format(X_train.shape[1]))

        # sort train instances, then remove, retrain, and re-evaluate
        result = measure_performance(args,
                                     clf,
                                     X_train,
                                     y_train,
                                     X_test_sub,
                                     y_test_sub,
                                     rng,
                                     instance_dir,
                                     logger=logger)

        # save results
        result['max_rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        result['total_time'] = time.time() - begin
        np.save(os.path.join(instance_dir, 'results.npy'), result)

        # display results
        logger.info('\nResults:\n{}'.format(result))
        logger.info('\nsaving results to {}...'.format(
            os.path.join(instance_dir, 'results.npy')))
Example #12
def experiment(args, logger, out_dir):

    # start timer
    begin = time.time()

    # get data
    data = util.get_data(args.dataset,
                         data_dir=args.data_dir,
                         preprocessing=args.preprocessing)
    X_train, X_test, y_train, y_test, feature, cat_indices = data

    logger.info('\ntrain instances: {:,}'.format(X_train.shape[0]))
    logger.info('test instances: {:,}'.format(X_test.shape[0]))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # get tree-ensemble
    clf = util.get_model(args.model,
                         n_estimators=args.n_estimators,
                         max_depth=args.max_depth,
                         random_state=args.rs,
                         cat_indices=cat_indices)

    # train a tree ensemble
    model = clone(clf).fit(X_train, y_train)
    util.performance(model, X_train, y_train, logger=logger, name='Train')
    util.performance(model, X_test, y_test, logger=logger, name='Test')

    # store indices of different subgroups
    train_neg = np.where(y_train == 0)[0]
    train_pos = np.where(y_train == 1)[0]
    # test_neg = np.where(y_test == 0)[0]
    # test_pos = np.where(y_test == 1)[0]

    # transform features to tree kernel space
    logger.info('\ntransforming features into tree kernel space...')
    extractor = trex.TreeExtractor(model, tree_kernel=args.tree_kernel)

    start = time.time()
    X_train_alt = extractor.transform(X_train)
    logger.info('train transform time: {:.3f}s'.format(time.time() - start))

    start = time.time()
    X_test_alt = extractor.transform(X_test)
    logger.info('test transform time: {:.3f}s'.format(time.time() - start))

    # reduce dimensionality on original and tree feature spaces
    logger.info('\nembed original features into a lower dimensional space')
    X_train, X_test = reduce_and_embed(args, X_train, X_test, logger)

    logger.info('\nembed tree kernel features into a lower dimensional space')
    X_train_alt, X_test_alt = reduce_and_embed(args, X_train_alt, X_test_alt,
                                               logger)

    # separating embedded points into train and test
    # n_train = len(y_train)
    # train_neg_embed = X_embed[:n_train][train_neg]
    # train_pos_embed = X_embed[:n_train][train_pos]
    # test_neg_embed = X_embed[n_train:][test_neg]
    # test_pos_embed = X_embed[n_train:][test_pos]

    # save original feature space results
    np.save(os.path.join(out_dir, 'train_negative'), X_train[train_neg])
    np.save(os.path.join(out_dir, 'train_positive'), X_train[train_pos])

    # save tree kernel space results
    np.save(os.path.join(out_dir, 'train_tree_negative'),
            X_train_alt[train_neg])
    np.save(os.path.join(out_dir, 'train_tree_positive'),
            X_train_alt[train_pos])
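reduce_and_embed is not shown. A hedged sketch of a common two-stage recipe (PCA, then a 2-D t-SNE fit on train and test jointly) matching the call signature above; parameter choices are illustrative, and inputs are assumed dense (sparse tree-kernel features would need .toarray() first):

# Hypothetical two-stage embedding; the project's reduce_and_embed may differ.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def reduce_and_embed_sketch(args, X_train, X_test, logger=None):
    X_all = np.vstack([X_train, X_test])  # embed train and test jointly
    X_all = PCA(n_components=min(50, X_all.shape[1]),
                random_state=args.rs).fit_transform(X_all)
    X_all = TSNE(n_components=2, random_state=args.rs).fit_transform(X_all)
    if logger:
        logger.info('embedded shape: {}'.format(X_all.shape))
    n_train = X_train.shape[0]
    return X_all[:n_train], X_all[n_train:]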