Example 1
def load_data(dataset, data_dir):

    if dataset == 'iris':
        data = load_iris()
        X = data['data']
        y = data['target']

        # make into binary classification dataset
        indices = np.where(y != 2)[0]
        X = X[indices]
        y = y[indices]

        X_train, X_test, y_train, y_test = X, X, y, y

    elif dataset == 'boston':
        data = load_boston()
        X = data['data']
        y = data['target']

        # make into binary classification dataset
        y = np.where(y < np.mean(y), 0, 1)

        X_train, X_test, y_train, y_test = X, X, y, y

    else:
        X_train, X_test, y_train, y_test = data_util.get_data(
            dataset, data_dir)

        X_train = X_train[:, :50]
        X_test = X_test[:, :50]

    return X_train, X_test, y_train, y_test
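
A note on dependencies: `data_util` is a project-specific helper that is not shown, and `load_boston` has been removed from recent scikit-learn releases. A minimal self-contained sketch of the same two binarization steps, using only NumPy and scikit-learn and a synthetic regression target in place of the Boston data:

import numpy as np
from sklearn.datasets import load_iris

# iris -> binary classification: drop class 2, keep classes 0 and 1
data = load_iris()
X, y = data['data'], data['target']
keep = np.where(y != 2)[0]
X, y = X[keep], y[keep]
print(X.shape, np.unique(y))   # (100, 4) [0 1]

# regression target -> binary labels by thresholding at the mean
rng = np.random.default_rng(0)
y_reg = rng.normal(loc=22.0, scale=9.0, size=100)
y_bin = np.where(y_reg < np.mean(y_reg), 0, 1)
print(np.bincount(y_bin))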
Example 2
def main(args):

    # create output directory
    out_dir = os.path.join(args.out_dir, args.dataset)
    os.makedirs(out_dir, exist_ok=True)

    # create logger
    logger_fp = os.path.join(out_dir, 'log.txt')
    logger = print_util.get_logger(logger_fp)
    logger.info('{}'.format(args))
    logger.info('\ntimestamp: {}'.format(datetime.now()))

    # get dataset
    X_train, X_test, y_train, y_test = data_util.get_data(
        args.dataset, args.data_dir)
    logger.info('X_train.shape: {}'.format(X_train.shape))

    # collect top threshold scores
    top_scores = []

    # get best threshold(s) for each feature
    for i in range(X_train.shape[1]):
        vals = np.unique(X_train[:, i])
        C = get_thresholds(X_train[:, i], y_train)
        S = compute_scores(C)
        logger.info(
            '\n[FEATURE {}] no. unique: {:,}, no. valid thresholds: {:,}'.
            format(i, len(vals), len(C)))

        # sort thresholds based on score
        S = sorted(S, key=lambda x: x[1])

        # display split score for each threshold
        for T, s in S[:args.k]:
            logger.info('  threshold value: {:.5f}, score: {:.5f}'.format(
                T.v, s))
            top_scores.append(s)

    # plot distribution of top threshold scores
    ax = sns.distplot(top_scores, rug=True, hist=False)
    ax.set_title('{}: Scores for Top {} Threshold(s) / Feature'.format(
        args.dataset.title(), args.k))
    ax.set_xlabel('Gini index')
    ax.set_ylabel('Density')
    plt.savefig(os.path.join(out_dir, 'k_{}.pdf'.format(args.k)),
                bbox_inches='tight')
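
`get_thresholds` and `compute_scores` are not shown; since the plot's x-axis is labeled 'Gini index', a plausible minimal sketch of scoring one candidate threshold with the weighted Gini impurity (the names below are illustrative, not the project's API) looks like this:

import numpy as np

def gini(y):
    # Gini impurity of a binary 0/1 label array
    if len(y) == 0:
        return 0.0
    p = np.mean(y)
    return 2.0 * p * (1.0 - p)

def split_score(x, y, threshold):
    # weighted Gini impurity after splitting feature values x at `threshold`
    left, right = y[x <= threshold], y[x > threshold]
    n = len(y)
    return (len(left) / n) * gini(left) + (len(right) / n) * gini(right)

# candidate thresholds: midpoints between consecutive unique feature values
x = np.array([0.1, 0.35, 0.4, 0.8, 0.9])
y = np.array([0, 0, 0, 1, 1])
vals = np.unique(x)
candidates = (vals[:-1] + vals[1:]) / 2.0
scores = sorted(((t, split_score(x, y, t)) for t in candidates), key=lambda ts: ts[1])
print(scores[0])   # best (lowest-impurity) threshold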
Example 3
def experiment(args, logger, out_dir, seed):
    """
    Main method comparing performance of tree ensembles and svm models.
    """

    # get model and data
    clf, params = _get_classifier(args)
    data = data_util.get_data(args.dataset,
                              random_state=seed,
                              data_dir=args.data_dir)
    X_train, X_test, y_train, y_test, label = data

    logger.info('train instances: {:,}'.format(len(X_train)))
    logger.info('test instances: {:,}'.format(len(X_test)))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # train model
    logger.info('\nmodel: {}, params: {}'.format(args.model, params))

    if not args.no_tune:
        gs = GridSearchCV(clf, params, cv=args.cv, verbose=args.verbose).fit(X_train, y_train)

        cols = ['mean_fit_time', 'mean_test_score', 'rank_test_score']
        cols += ['param_{}'.format(param) for param in params.keys()]

        df = pd.DataFrame(gs.cv_results_)
        logger.info('gridsearch results:')
        logger.info(df[cols].sort_values('rank_test_score'))

        model = gs.best_estimator_
        logger.info('best params: {}'.format(gs.best_params_))

    else:
        model = clf.fit(X_train, y_train)

    model_util.performance(model, X_train, y_train, X_test=X_test, y_test=y_test, logger=logger)
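
`_get_classifier`, `data_util`, and `model_util` are project helpers that are not shown; the tuning pattern itself can be reproduced with stock scikit-learn. A small sketch, with a random forest standing in for the tree ensemble:

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

X_train, y_train = make_classification(n_samples=500, random_state=0)
clf = RandomForestClassifier(random_state=0)
params = {'n_estimators': [10, 50], 'max_depth': [3, 5]}

gs = GridSearchCV(clf, params, cv=3).fit(X_train, y_train)

cols = ['mean_fit_time', 'mean_test_score', 'rank_test_score']
cols += ['param_{}'.format(param) for param in params.keys()]
df = pd.DataFrame(gs.cv_results_)
print(df[cols].sort_values('rank_test_score'))
print('best params:', gs.best_params_)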
Example 4
def experiment(args, logger, out_dir):
    """
    Obtains data, trains model, and generates instance-attribution explanations.
    """

    # get data
    X_train, X_test, y_train, y_test = data_util.get_data(
        args.dataset, data_dir=args.data_dir)

    logger.info('\nno. train instances: {:,}'.format(len(X_train)))
    logger.info('no. test instances: {:,}'.format(len(X_test)))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # add noise
    y_train_noisy, noisy_indices = flip_labels(y_train,
                                               seed=args.rs,
                                               k=args.flip_frac)
    noisy_indices = np.array(sorted(noisy_indices))
    logger.info('no. noisy labels: {:,}'.format(len(noisy_indices)))

    # number of checkpoints to record
    n_check = int(len(y_train) * args.check_frac)
    snapshot_interval = n_check / args.n_snapshots
    logger.info('no. check: {:,}'.format(n_check))
    logger.info('no. snapshots: {:,}'.format(args.n_snapshots))

    # experiment settings
    logger.info('\nrandom state: {}'.format(args.rs))
    logger.info('criterion: {}'.format(args.criterion))
    logger.info('n_estimators: {}'.format(args.n_estimators))
    logger.info('max_depth: {}'.format(args.max_depth))
    logger.info('max_features: {}\n'.format(args.max_features))

    # clean model
    model = _get_model(args).fit(X_train, y_train)
    acc_clean, auc_clean, ap_clean = exp_util.performance(model,
                                                          X_test,
                                                          y_test,
                                                          logger=logger,
                                                          name='clean')

    # noisy model
    model = _get_model(args).fit(X_train, y_train_noisy)
    exp_util.performance(model, X_test, y_test, logger=logger, name='noisy')

    start = time.time()

    # random method
    if args.method == 'random':
        logger.info('\nOrdering by random...')

        # +1 to avoid choosing the same indices as the noisy labels
        np.random.seed(args.rs + 1)
        train_order = np.random.choice(len(y_train),
                                       size=n_check,
                                       replace=False)

    # D-DART: ordered from biggest change in prediction for each training sample on itself
    elif args.method == 'dart':
        logger.info('\nOrdering by D-DART...')
        start = time.time()

        initial_proba = model.predict_proba(X_train)[:, 1]
        explanation = np.zeros(shape=(X_train.shape[0], ))

        for i in range(X_train.shape[0]):
            model.delete(i)
            proba = model.predict_proba(X_train[[i]])[:, 1][0]
            explanation[i] = np.abs(proba - initial_proba[i])

            if i % PRINT_COUNTER == 0:
                elapsed = time.time() - start
                logger.info(
                    '[Influence on sample {}] cum time: {:.3f}s'.format(
                        i, elapsed))

            model.add(X_train[[i]], y_train_noisy[[i]])

        train_order = np.argsort(explanation)[::-1]

    # D-DART loss: ordered by largest loss on training samples
    elif args.method == 'dart_loss':
        logger.info('\nOrdering by D-DART loss...')
        proba = model.predict_proba(X_train)[:, 1]
        loss = np.abs(proba - y_train_noisy)
        train_order = np.argsort(loss)[::-1]

    # save results
    checkpoints, fixed_indices = record_fixes(train_order[:n_check],
                                              noisy_indices, snapshot_interval)
    results = measure_performance(args,
                                  checkpoints,
                                  fixed_indices,
                                  noisy_indices,
                                  model,
                                  X_train,
                                  y_train,
                                  X_test,
                                  y_test,
                                  logger=logger)
    results['acc_clean'] = acc_clean
    results['auc_clean'] = auc_clean
    results['ap_clean'] = ap_clean
    np.save(os.path.join(out_dir, 'results.npy'), results)

    logger.info('time: {:.3f}s'.format(time.time() - start))
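
`flip_labels` is not shown above; a minimal sketch, assuming it flips a fraction `k` of binary labels and returns the noisy labels together with the flipped indices:

import numpy as np

def flip_labels(y, k=0.1, seed=1):
    # sketch of a label-flipping helper, not the project's implementation
    rng = np.random.default_rng(seed)
    n_flip = int(len(y) * k)
    flip_indices = rng.choice(len(y), size=n_flip, replace=False)
    y_noisy = y.copy()
    y_noisy[flip_indices] = 1 - y_noisy[flip_indices]
    return y_noisy, flip_indices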
Example 5
def experiment(args, logger, out_dir, seed):

    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=args.rs)

    X_train, X_test, y_train, y_test, label = data_util.get_data(
        args.dataset, random_state=args.rs, data_dir=args.data_dir)

    # reduce train size
    if args.train_frac < 1.0 and args.train_frac > 0.0:
        n_train = int(X_train.shape[0] * args.train_frac)
        X_train, y_train = X_train[:n_train], y_train[:n_train]
    data = X_train, y_train, X_test, y_test

    logger.info('train instances: {}'.format(len(X_train)))
    logger.info('test instances: {}'.format(len(X_test)))
    logger.info('no. features: {}'.format(X_train.shape[1]))

    logger.info('no. trees: {:,}'.format(args.n_estimators))
    logger.info('max depth: {}'.format(args.max_depth))

    # train a tree ensemble
    logger.info('fitting tree ensemble...')
    tree = clf.fit(X_train, y_train)

    if args.teknn:

        # transform data
        extractor = trex.TreeExtractor(tree, tree_kernel=args.tree_kernel)

        logger.info('transforming training data...')
        X_train_alt = extractor.fit_transform(X_train)

        logger.info('transforming test data...')
        X_test_alt = extractor.transform(X_test)

        train_label = y_train if args.true_label else tree.predict(X_train)

        # tune and train teknn
        start = time.time()
        logger.info('TE-KNN...')
        if args.k:
            knn_clf = KNeighborsClassifier(n_neighbors=args.k,
                                           weights='uniform')
            knn_clf = knn_clf.fit(X_train_alt, y_train)
        else:
            knn_clf = exp_util.tune_knn(tree,
                                        X_train,
                                        X_train_alt,
                                        train_label,
                                        args.val_frac,
                                        seed=seed,
                                        logger=logger)

        start = time.time()
        logger.info('generating predictions...')
        results = _get_knn_predictions(tree,
                                       knn_clf,
                                       X_test,
                                       X_test_alt,
                                       y_train,
                                       pred_size=args.pred_size,
                                       out_dir=out_dir,
                                       logger=logger)
        logger.info('time: {:.3f}s'.format(time.time() - start))

        # save results
        if results:
            results['n_neighbors'] = knn_clf.get_params()['n_neighbors']
            np.save(os.path.join(out_dir, 'tree.npy'), results['tree'])
            np.save(os.path.join(out_dir, 'surrogate.npy'), results['teknn'])

    if args.trex:

        start = time.time()
        explainer = trex.TreeExplainer(tree,
                                       X_train,
                                       y_train,
                                       tree_kernel=args.tree_kernel,
                                       kernel_model=args.kernel_model,
                                       random_state=args.rs,
                                       logger=logger,
                                       true_label=not args.true_label,
                                       val_frac=args.val_frac)

        start = time.time()
        logger.info('generating predictions...')
        results = _get_trex_predictions(tree, explainer, data)
        logger.info('time: {:.3f}s'.format(time.time() - start))

        results['C'] = explainer.C

        # save data
        np.save(os.path.join(out_dir, 'tree.npy'), results['tree'])
        np.save(os.path.join(out_dir, 'surrogate.npy'), results['trex'])
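
`trex.TreeExtractor` is project code; the general idea of a tree-kernel feature space, representing each instance by the leaves it reaches in every tree, can be sketched with stock scikit-learn (this only approximates what the extractor may compute for `tree_kernel`):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=300, random_state=0)
forest = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)

# leaf index per tree -> sparse one-hot "leaf" representation
leaves = forest.apply(X)                          # shape: (n_samples, n_trees)
encoder = OneHotEncoder(handle_unknown='ignore').fit(leaves)
X_alt = encoder.transform(leaves)                 # one column per (tree, leaf) pair
print(X_alt.shape)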
Example 6
def experiment(args, logger, out_dir, seed):
    """
    Delete as many samples as possible in the time it takes the
    naive approach to delete one sample.
    """

    # random number generator
    rng = np.random.default_rng(args.rs)

    # get data
    X_train, X_test, y_train, y_test = data_util.get_data(args.dataset, data_dir=args.data_dir)

    # dataset statistics
    logger.info('\ntrain instances: {:,}'.format(X_train.shape[0]))
    logger.info('test instances: {:,}'.format(X_test.shape[0]))
    logger.info('features: {:,}'.format(X_train.shape[1]))

    # experiment settings
    logger.info('\nrandom state: {}'.format(seed))
    logger.info('criterion: {}'.format(args.criterion))
    logger.info('n_estimators: {}'.format(args.n_estimators))
    logger.info('max_depth: {}'.format(args.max_depth))
    logger.info('topd: {}'.format(args.topd))
    logger.info('k: {}'.format(args.k))
    logger.info('subsample_size: {}'.format(args.subsample_size))
    logger.info('n_delete: {}'.format(args.n_delete))

    # train a naive model, before and after deleting 1 sample
    naive_avg_delete_time, naive_utility = train_naive(args, X_train, y_train, X_test, y_test, rng, logger=logger)

    # begin experiment
    begin = time.time()

    # amount of time given to delete as many samples as possible
    allotted_time = naive_avg_delete_time

    # result containers
    total_delete_time = 0
    delete_types_list = []
    delete_depths_list = []
    delete_costs_list = []

    # train target model
    model = get_model(args)

    start = time.time()
    model = model.fit(X_train, y_train)
    train_time = time.time() - start

    logger.info('[{}] train time: {:.3f}s'.format('model', train_time))

    # evaluate predictive performance between naive and the model
    naive_auc, naive_acc, naive_ap = naive_utility
    model_auc, model_acc, model_ap = exp_util.performance(model, X_test, y_test, logger=logger, name='model')

    # available indices
    indices = np.arange(len(X_train))

    # find the most damaging samples heuristically
    progress_str = '[{}] sample {}, sample_cost: {:,}, search time: {:.3f}s, allotted: {:.3f}s, cum time: {:.3f}s'
    logger.info('\nDelete samples:')

    n_deleted = 0
    while allotted_time > 0 and time.time() - begin <= args.time_limit:

        # adversarially select a sample out of a subset of candidate samples
        delete_ndx, search_time = get_delete_index(model, X_train, y_train, indices, rng)

        # delete the adversarially selected sample
        start = time.time()
        model.delete(delete_ndx)
        delete_time = time.time() - start

        # get deletion statistics
        delete_types, delete_depths, delete_costs = model.get_delete_metrics()
        delete_types_list.append(delete_types)
        delete_depths_list.append(delete_depths)
        delete_costs_list.append(delete_costs)
        sample_cost = np.sum(delete_costs)  # sum over all trees
        model.clear_delete_metrics()

        # update counters
        allotted_time -= delete_time  # available time
        total_delete_time += delete_time  # total deletion time
        cum_time = time.time() - begin  # total time
        n_deleted += 1

        # progress update
        logger.info(progress_str.format(n_deleted, delete_ndx, sample_cost, search_time, allotted_time, cum_time))

        # remove the chosen ndx from the list of available indices
        indices = np.setdiff1d(indices, [delete_ndx])

    # estimate how many additional updates would finish in the remaining time
    if allotted_time > 0:
        average_delete_time = total_delete_time / n_deleted
        n_deleted += int(allotted_time / average_delete_time)

    # get model statistics
    n_nodes_avg, n_random_nodes_avg, n_greedy_nodes_avg = model.get_node_statistics()
    delete_types = np.concatenate(delete_types_list)
    delete_depths = np.concatenate(delete_depths_list)
    delete_costs = np.concatenate(delete_costs_list)

    # save model results
    result = model.get_params()
    result['naive_auc'] = naive_auc
    result['naive_acc'] = naive_acc
    result['naive_ap'] = naive_ap
    result['naive_avg_delete_time'] = naive_avg_delete_time
    result['naive_n_deleted'] = args.n_delete
    result['model_n_deleted'] = n_deleted
    result['model_train_%_deleted'] = n_deleted / len(X_train)
    result['model_delete_depths'] = count_depths(delete_types, delete_depths)
    result['model_delete_costs'] = count_costs(delete_types, delete_depths, delete_costs)
    result['model_auc'] = model_auc
    result['model_acc'] = model_acc
    result['model_ap'] = model_ap
    result['model_n_nodes_avg'] = n_nodes_avg
    result['model_n_random_nodes_avg'] = n_random_nodes_avg
    result['model_n_greedy_nodes_avg'] = n_greedy_nodes_avg
    result['max_rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

    logger.info('\nResults:\n{}'.format(result))
    np.save(os.path.join(out_dir, 'results.npy'), result)

    return result
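
`get_delete_index` is not shown. As a purely hypothetical placeholder, one could rank a random subset of candidates by how confidently the model predicts their own label and delete the most confident one, on the assumption that such samples tend to be expensive to unlearn; the project's actual heuristic may differ:

import time
import numpy as np

def get_delete_index(model, X_train, y_train, indices, rng, subsample_size=1000):
    # hypothetical sketch: from a random subset of available indices, pick the
    # candidate whose given label the model predicts most confidently
    start = time.time()
    n_cand = min(subsample_size, len(indices))
    candidates = rng.choice(indices, size=n_cand, replace=False)
    proba = model.predict_proba(X_train[candidates])[:, 1]
    proba_true = np.where(y_train[candidates] == 1, proba, 1.0 - proba)
    delete_ndx = candidates[np.argmax(proba_true)]
    return delete_ndx, time.time() - start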
Example 7
def performance(args, logger):

    begin = time.time()

    # obtain data
    X_train, X_test, y_train, y_test = data_util.get_data(
        args.dataset, data_dir=args.data_dir)

    # dataset statistics
    logger.info('train instances: {:,}'.format(X_train.shape[0]))
    logger.info('test instances: {:,}'.format(X_test.shape[0]))
    logger.info('attributes: {:,}'.format(X_train.shape[1]))

    # tune on a fraction of the training data
    if not args.no_tune:

        if args.tune_frac < 1.0:
            sss = StratifiedShuffleSplit(n_splits=1,
                                         test_size=2,
                                         train_size=args.tune_frac,
                                         random_state=args.rs)
            tune_indices, _ = list(sss.split(X_train, y_train))[0]
            X_train_sub, y_train_sub = X_train[tune_indices], y_train[
                tune_indices]
            logger.info('tune instances: {:,}'.format(X_train_sub.shape[0]))

        else:
            X_train_sub, y_train_sub = X_train, y_train

    # hyperparameter values
    n_estimators = [10, 50, 100, 250]
    max_depth = [1, 3, 5, 10, 20]

    param_grid = {'max_depth': max_depth, 'n_estimators': n_estimators}

    # test model
    logger.info('\n{}'.format(args.model_type.capitalize()))
    start = time.time()

    # get model
    model = lgb.LGBMClassifier(num_leaves=2**10)

    # tune model
    if args.no_tune:
        model = model.fit(X_train, y_train)

    else:
        logger.info('param_grid: {}'.format(param_grid))
        skf = StratifiedKFold(n_splits=args.cv,
                              shuffle=True,
                              random_state=args.rs)
        gs = GridSearchCV(model,
                          param_grid,
                          scoring=args.scoring,
                          cv=skf,
                          verbose=args.verbose,
                          refit=True)
        gs = gs.fit(X_train_sub, y_train_sub)
        model = gs.best_estimator_
        logger.info('best params: {}'.format(gs.best_params_))

    # test model
    start = time.time()
    model = model.fit(X_train, y_train)
    exp_util.performance(model,
                         X_test,
                         y_test,
                         name=args.model_type,
                         logger=logger)
    logger.info('train time: {:.3f}s'.format(time.time() - start))
    logger.info('total time: {:.3f}s'.format(time.time() - begin))
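
One detail of the tuning block above is worth calling out: `StratifiedShuffleSplit` is used only to carve out a stratified tuning subset via `train_size=args.tune_frac`, and `test_size=2` is a minimal throw-away split (the held-out side is unused, and scikit-learn does not accept an empty test split). A self-contained sketch:

from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedShuffleSplit

X_train, y_train = make_classification(n_samples=1000, random_state=0)
tune_frac = 0.25

# stratified subsample used only for hyperparameter tuning
sss = StratifiedShuffleSplit(n_splits=1, train_size=tune_frac, test_size=2, random_state=1)
tune_indices, _ = next(sss.split(X_train, y_train))
X_tune, y_tune = X_train[tune_indices], y_train[tune_indices]
print(X_tune.shape)   # (250, 20)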
Example 8
def experiment(args, logger, out_dir, seed):
    """
    Main method that trains a tree ensemble, then compares the
    runtime of different methods to explain a single test instance.
    """

    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=seed)

    data = data_util.get_data(args.dataset,
                              random_state=seed,
                              data_dir=args.data_dir)
    X_train, X_test, y_train, y_test, label = data

    logger.info('train instances: {:,}'.format(len(X_train)))
    logger.info('test instances: {:,}'.format(len(X_test)))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # train a tree ensemble
    model = clone(clf).fit(X_train, y_train)
    model_util.performance(model, X_train, y_train,
                           X_test=X_test, y_test=y_test,
                           logger=logger)

    # randomly pick test instances to explain
    np.random.seed(seed)
    test_ndx = np.random.choice(len(y_test), size=1, replace=False)

    # train on predicted labels
    train_label = y_train if args.true_label else model.predict(X_train)

    # TREX
    if args.trex:
        logger.info('\nTREX...')
        fine_tune, test_time = _trex_method(args, model, test_ndx, X_test, X_train, y_train,
                                            seed=seed, logger=logger)

        logger.info('fine tune: {:.3f}s'.format(fine_tune))
        logger.info('computation time: {:.3f}s'.format(test_time))
        r = {'fine_tune': fine_tune, 'test_time': test_time}
        np.save(os.path.join(out_dir, 'method.npy'), r)

    # Leaf Influence
    if args.tree_type == 'cb' and args.inf_k is not None:
        logger.info('\nleafinfluence...')
        fine_tune, test_time = _influence_method(model, test_ndx, X_train,
                                                 y_train, X_test, y_test, args.inf_k)

        if test_time is not None:
            logger.info('fine tune: {:.3f}s'.format(fine_tune))
            logger.info('computation time: {:.3f}s'.format(test_time))
            r = {'fine_tune': fine_tune, 'test_time': test_time}
            np.save(os.path.join(out_dir, 'method.npy'), r)
        else:
            logger.info('time limit reached!')

    if args.maple:
        logger.info('\nMAPLE...')
        fine_tune, test_time = _maple_method(model, test_ndx, X_train, train_label, X_test, y_test,
                                             dstump=args.dstump, logger=logger)

        if fine_tune is not None and test_time is not None:
            logger.info('fine tune: {:.3f}s'.format(fine_tune))
            logger.info('computation time: {:.3f}s'.format(test_time))
            r = {'fine_tune': fine_tune, 'test_time': test_time}
            np.save(os.path.join(out_dir, 'method.npy'), r)
        else:
            logger.info('time limit reached!')

    if args.teknn:
        logger.info('\nTEKNN...')
        fine_tune, test_time = _teknn_method(args, model, test_ndx, X_train, train_label,
                                             X_test, seed, logger=logger)
        if fine_tune is not None and test_time is not None:
            logger.info('fine tune: {:.3f}s'.format(fine_tune))
            logger.info('computation time: {:.3f}s'.format(test_time))
            r = {'fine_tune': fine_tune, 'test_time': test_time}
            np.save(os.path.join(out_dir, 'method.npy'), r)
        else:
            logger.info('time limit reached!')
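
Each branch above stores its timing dictionary with `np.save`. Reading such a file back requires `allow_pickle=True` plus `.item()`, since the dict is wrapped in a 0-d object array:

import numpy as np

r = {'fine_tune': 1.234, 'test_time': 0.056}
np.save('method.npy', r)                              # dict is pickled inside an object array
loaded = np.load('method.npy', allow_pickle=True).item()
print(loaded['test_time'])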
Example 9
def experiment(args, logger, out_dir, seed):
    """
    Main method that trains a tree ensemble, flips a percentage of train labels, prioritizes train
    instances using various methods, and computes how effective each method is at cleaning the data.
    """

    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=1)

    data = data_util.get_data(args.dataset,
                              random_state=1,
                              data_dir=args.data_dir)

    X_train, X_test, y_train, y_test, label = data

    # use part of the train data
    if args.train_frac < 1.0 and args.train_frac > 0.0:
        n_train_samples = int(X_train.shape[0] * args.train_frac)
        train_indices = np.random.choice(X_train.shape[0], size=n_train_samples, replace=False)
        X_train, y_train = X_train[train_indices], y_train[train_indices]

    # use part of the test data for evaluation
    n_test_samples = args.n_test if args.n_test is not None else int(X_test.shape[0] * args.test_frac)
    np.random.seed(seed)
    test_indices = np.random.choice(X_test.shape[0], size=n_test_samples, replace=False)
    X_test_sub, y_test_sub = X_test[test_indices], y_test[test_indices]

    # choose a new subset if the test subset contains only one label
    new_seed = seed
    while y_test_sub.sum() == len(y_test_sub) or y_test_sub.sum() == 0:
        np.random.seed(new_seed)
        new_seed += np.random.randint(MAX_SEED_INCREASE)
        np.random.seed(new_seed)
        test_indices = np.random.choice(X_test.shape[0], size=n_test_samples, replace=False)
        X_test_sub, y_test_sub = X_test[test_indices], y_test[test_indices]

    X_test = X_test_sub
    y_test = y_test_sub

    logger.info('no. train instances: {:,}'.format(len(X_train)))
    logger.info('no. test instances: {:,}'.format(len(X_test)))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # train a tree ensemble
    model = clone(clf).fit(X_train, y_train)
    model_util.performance(model, X_train, y_train, X_test=X_test, y_test=y_test, logger=logger)

    pcts = list(range(0, 100, 10))
    np.save(os.path.join(out_dir, 'percentages.npy'), pcts)

    # random method
    logger.info('\nordering by random...')
    start = time.time()
    np.random.seed(seed)
    train_order = np.random.choice(np.arange(X_train.shape[0]), size=X_train.shape[0], replace=False)
    random_res = _measure_performance(train_order, pcts, X_test, y_test, X_train, y_train, clf)
    logger.info('time: {:.3f}s'.format(time.time() - start))
    np.save(os.path.join(out_dir, 'random.npy'), random_res)

    # TREX method
    if args.trex:
        logger.info('\nordering by our method...')
        start = time.time()
        train_order = _trex_method(args, model, X_test, X_train, y_train, seed, logger)
        trex_res = _measure_performance(train_order, pcts, X_test, y_test, X_train, y_train, clf)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), trex_res)

    # MAPLE method
    if args.maple:
        logger.info('\nordering by MAPLE...')
        start = time.time()
        train_order = _maple_method(X_test, args, model, X_train, y_train, logger)
        maple_res = _measure_performance(train_order, pcts, X_test, y_test, X_train, y_train, clf)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), maple_res)

    # influence method
    if args.tree_type == 'cb' and args.inf_k is not None:
        logger.info('\nordering by LeafInfluence...')
        start = time.time()
        train_order = _influence_method(X_test, args, model, X_train, y_train, y_test, logger)
        leafinfluence_res = _measure_performance(train_order, pcts, X_test, y_test, X_train, y_train, clf)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), leafinfluence_res)

    # TEKNN method
    if args.teknn:
        logger.info('\nordering by teknn...')
        start = time.time()
        train_order = _teknn_method(args, model, X_test, X_train, y_train, y_test, seed, logger)
        knn_res = _measure_performance(train_order, pcts, X_test, y_test, X_train, y_train, clf)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), knn_res)
Example 10
def performance(args, out_dir, logger):

    begin = time.time()

    # obtain data
    X_train, X_test, y_train, y_test = data_util.get_data(
        args.dataset, data_dir=args.data_dir)

    # dataset statistics
    logger.info('train instances: {:,}'.format(X_train.shape[0]))
    logger.info('test instances: {:,}'.format(X_test.shape[0]))
    logger.info('attributes: {:,}'.format(X_train.shape[1]))
    logger.info('split criterion: {}'.format(args.criterion))

    # tune on a fraction of the training data
    if not args.no_tune:

        if args.tune_frac < 1.0:
            sss = StratifiedShuffleSplit(n_splits=1,
                                         test_size=2,
                                         train_size=args.tune_frac,
                                         random_state=args.rs)
            tune_indices, _ = list(sss.split(X_train, y_train))[0]
            X_train_sub, y_train_sub = X_train[tune_indices], y_train[
                tune_indices]
            logger.info('tune instances: {:,}'.format(X_train_sub.shape[0]))

        else:
            X_train_sub, y_train_sub = X_train, y_train
    else:
        X_train_sub, y_train_sub = X_train, y_train

    # hyperparameter values
    n_estimators = [10, 50, 100, 250]
    max_depth = [1, 3, 5, 10, 20]

    # set hyperparameter grid
    param_grid = {'max_depth': max_depth, 'n_estimators': n_estimators}

    # add additional parameter for DaRE
    if args.model == 'dare':
        param_grid['k'] = [5, 10, 25, 50]

    # get hyperparameter names
    keys = list(param_grid.keys())

    # test model
    logger.info('\n{}'.format(args.model.capitalize()))
    start = time.time()
    model = _get_model(args)

    # tune hyperparameters
    if not args.no_tune:
        logger.info('param_grid: {}'.format(param_grid))

        # cross-validation
        skf = StratifiedKFold(n_splits=args.cv,
                              shuffle=True,
                              random_state=args.rs)
        gs = GridSearchCV(model,
                          param_grid,
                          scoring=args.scoring,
                          cv=skf,
                          verbose=args.verbose,
                          refit=False)
        gs = gs.fit(X_train_sub, y_train_sub)

        best_params = _get_best_params(gs, param_grid, keys, logger, args.tol)
        model = _get_model_dict(args, best_params)

    # record time it takes to tune the model
    tune_time = time.time() - start

    # train best model
    start = time.time()
    model = model.fit(X_train, y_train)
    train_time = time.time() - start
    logger.info('train time: {:.3f}s'.format(train_time))

    n_nodes, n_random, n_greedy = model.trees_[0].get_node_statistics()
    logger.info('[Tree 0] no. nodes: {:,}, no. random: {:,}, no. greedy: {:,}'.format(
        n_nodes, n_random, n_greedy))
    logger.info('[Tree 0] memory usage: {:,} bytes'.format(
        model.trees_[0].get_memory_usage()))
    logger.info('[Forest] memory usage: {:,} bytes'.format(model.get_memory_usage()))
    logger.info('max_rss: {:,}'.format(
        resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))

    # evaluate
    auc, acc, ap = exp_util.performance(model,
                                        X_test,
                                        y_test,
                                        name=args.model,
                                        logger=logger)

    # save results
    result = model.get_params()
    result['model'] = args.model
    result['bootstrap'] = args.bootstrap
    result['auc'] = auc
    result['acc'] = acc
    result['ap'] = ap
    result['train_time'] = train_time
    result['tune_train_time'] = tune_time + train_time
    result['max_rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    np.save(os.path.join(out_dir, 'results.npy'), result)

    logger.info('total time: {:.3f}s'.format(time.time() - begin))
    logger.info('max_rss: {:,}'.format(result['max_rss']))
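
Because the grid search above runs with `refit=False`, nothing is refit automatically; `_get_best_params` (not shown, presumably a tolerance-based selection) picks the configuration that is then trained on the full data. A self-contained sketch of the same pattern that simply takes the top-ranked configuration from `cv_results_`, with a scikit-learn forest standing in for the DaRE model:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

X_train, y_train = make_classification(n_samples=400, random_state=0)
param_grid = {'n_estimators': [10, 50], 'max_depth': [3, 5]}

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
gs = GridSearchCV(RandomForestClassifier(random_state=1), param_grid,
                  scoring='roc_auc', cv=skf, refit=False).fit(X_train, y_train)

# refit=False: select the parameters from cv_results_ and fit the final model manually
best_ndx = int(np.argmin(gs.cv_results_['rank_test_score']))
best_params = gs.cv_results_['params'][best_ndx]
model = RandomForestClassifier(random_state=1, **best_params).fit(X_train, y_train)
print(best_params)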
Example 11
def experiment(args, logger, out_dir, seed):
    """
    Main method that trains a tree ensemble, flips a percentage of train labels, prioritizes train
    instances using various methods, and computes how effective each method is at cleaning the data.
    """

    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=seed)

    X_train, X_test, y_train, y_test, label = data_util.get_data(args.dataset,
                                                                 random_state=seed,
                                                                 data_dir=args.data_dir)

    # reduce train size
    if args.train_frac < 1.0 and args.train_frac > 0.0:
        n_train = int(X_train.shape[0] * args.train_frac)
        X_train, y_train = X_train[:n_train], y_train[:n_train]
    data = X_train, y_train, X_test, y_test

    logger.info('no. train instances: {:,}'.format(len(X_train)))
    logger.info('no. test instances: {:,}'.format(len(X_test)))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # add noise
    y_train_noisy, noisy_ndx = data_util.flip_labels(y_train, k=args.flip_frac, random_state=seed)
    noisy_ndx = np.array(sorted(noisy_ndx))
    logger.info('no. noisy labels: {:,}'.format(len(noisy_ndx)))

    # train a tree ensemble on the clean and noisy labels
    model = clone(clf).fit(X_train, y_train)
    model_noisy = clone(clf).fit(X_train, y_train_noisy)

    # show model performance before and after noise
    logger.info('\nBefore noise:')
    model_util.performance(model, X_train, y_train, X_test=X_test, y_test=y_test, logger=logger)
    logger.info('\nAfter noise:')
    model_util.performance(model_noisy, X_train, y_train_noisy, X_test=X_test, y_test=y_test, logger=logger)

    # check accuracy before and after noise
    acc_test_clean = accuracy_score(y_test, model.predict(X_test))
    acc_test_noisy = accuracy_score(y_test, model_noisy.predict(X_test))

    # find how many corrupted/non-corrupted labels were incorrectly predicted
    if not args.true_label:
        logger.info('\nUsing predicted labels:')
        predicted_labels = model_noisy.predict(X_train).flatten()
        incorrect_ndx = np.where(y_train_noisy != predicted_labels)[0]
        incorrect_corrupted_ndx = np.intersect1d(noisy_ndx, incorrect_ndx)
        logger.info('incorrectly predicted corrupted labels: {:,}'.format(incorrect_corrupted_ndx.shape[0]))
        logger.info('total number of incorrectly predicted labels: {:,}'.format(incorrect_ndx.shape[0]))

    # number of checkpoints to record
    n_check = int(len(y_train) * args.check_pct)
    interval = (n_check / len(y_train)) / args.n_plot_points

    # random method
    logger.info('\nordering by random...')
    start = time.time()
    ckpt_ndx, fix_ndx = _random_method(noisy_ndx, y_train, interval,
                                       to_check=n_check,
                                       random_state=seed)
    check_pct, random_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
    logger.info('time: {:.3f}s'.format(time.time() - start))
    np.save(os.path.join(out_dir, 'random.npy'), random_res)

    # save global lines
    np.save(os.path.join(out_dir, 'test_clean.npy'), acc_test_clean)
    np.save(os.path.join(out_dir, 'check_pct.npy'), check_pct)

    # tree loss method
    logger.info('\nordering by tree loss...')
    start = time.time()

    y_train_proba = model_noisy.predict_proba(X_train)
    ckpt_ndx, fix_ndx, _, _ = _loss_method(noisy_ndx, y_train_proba, y_train_noisy, interval, to_check=n_check)
    _, tree_loss_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

    logger.info('time: {:.3f}s'.format(time.time() - start))
    np.save(os.path.join(out_dir, 'tree.npy'), tree_loss_res)

    # trex method
    if args.trex:
        logger.info('\nordering by TREX...')
        start = time.time()
        explainer = trex.TreeExplainer(model_noisy, X_train, y_train_noisy,
                                       tree_kernel=args.tree_kernel,
                                       random_state=seed,
                                       true_label=args.true_label,
                                       kernel_model=args.kernel_model,
                                       verbose=args.verbose,
                                       val_frac=args.val_frac,
                                       logger=logger)

        ckpt_ndx, fix_ndx, _ = _our_method(explainer, noisy_ndx, y_train, n_check, interval)
        check_pct, trex_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), trex_res)

        # trex loss method
        logger.info('\nordering by TREX loss...')
        start = time.time()

        y_train_proba = explainer.predict_proba(X_train)
        ckpt_ndx, fix_ndx, _, _ = _loss_method(noisy_ndx, y_train_proba, y_train_noisy, interval, to_check=n_check)
        _, trex_loss_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method_loss.npy'), trex_loss_res)

    # influence method
    if args.tree_type == 'cb' and args.inf_k is not None:
        logger.info('\nordering by leafinfluence...')
        start = time.time()

        model_path = '.model.json'
        model_noisy.save_model(model_path, format='json')

        if args.inf_k == -1:
            update_set = 'AllPoints'
        elif args.inf_k == 0:
            update_set = 'SinglePoint'
        else:
            update_set = 'TopKLeaves'

        leaf_influence = CBLeafInfluenceEnsemble(model_path, X_train, y_train_noisy, k=args.inf_k,
                                                 learning_rate=model.learning_rate_, update_set=update_set)
        ckpt_ndx, fix_ndx, _, _ = _influence_method(leaf_influence, noisy_ndx, X_train, y_train, y_train_noisy,
                                                    interval, to_check=n_check)
        _, leafinfluence_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), leafinfluence_res)

    # MAPLE method
    if args.maple:
        logger.info('\nordering by MAPLE...')
        start = time.time()

        train_label = y_train_noisy if args.true_label else model_noisy.predict(X_train)
        maple_exp = MAPLE(X_train, train_label, X_train, train_label, verbose=args.verbose, dstump=False)
        ckpt_ndx, fix_ndx, map_scores, map_order = _maple_method(maple_exp, X_train, noisy_ndx, interval,
                                                                 to_check=n_check)
        _, maple_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), maple_res)

    # TEKNN method
    if args.teknn:
        logger.info('\nordering by teknn...')
        start = time.time()

        # transform the data
        extractor = trex.TreeExtractor(model_noisy, tree_kernel=args.tree_kernel)
        X_train_alt = extractor.fit_transform(X_train)
        train_label = y_train if args.true_label else model_noisy.predict(X_train)

        # tune and train teknn
        knn_clf = exp_util.tune_knn(model_noisy, X_train, X_train_alt, train_label, args.val_frac,
                                    seed=seed, logger=logger)

        ckpt_ndx, fix_ndx, _ = _knn_method(knn_clf, X_train_alt, noisy_ndx, interval, to_check=n_check)
        _, teknn_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), teknn_res)

        # TEKNN loss method
        logger.info('\nordering by teknn loss...')
        start = time.time()
        y_train_proba = knn_clf.predict_proba(X_train_alt)

        ckpt_ndx, fix_ndx, _, _ = _loss_method(noisy_ndx, y_train_proba, y_train_noisy, interval, to_check=n_check)
        _, teknn_loss_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method_loss.npy'), teknn_loss_res)

    # MMD-Critic method
    if args.mmd:
        logger.info('\nordering by mmd-critic...')
        start = time.time()
        ckpt_ndx, fix_ndx = _mmd_method(model_noisy, X_train, y_train_noisy, noisy_ndx, interval, n_check)
        _, mmd_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), mmd_res)

    # Prototype method
    if args.proto:
        logger.info('\nordering by proto...')
        start = time.time()
        ckpt_ndx, fix_ndx = _proto_method(model_noisy, X_train, y_train_noisy, noisy_ndx, interval, n_check)
        _, proto_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), proto_res)
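
`_loss_method` is not shown; the underlying idea of ranking training instances by the model's loss on their own (possibly noisy) labels can be sketched as follows, assuming binary 0/1 labels and the (n, 2) output of `predict_proba`:

import numpy as np

def rank_by_loss(y_proba, y_noisy):
    # order training indices from highest to lowest loss on the given labels
    proba_true = y_proba[np.arange(len(y_noisy)), y_noisy]   # prob. assigned to the given label
    loss = 1.0 - proba_true                                   # simple surrogate loss
    return np.argsort(loss)[::-1]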
Example 12
def experiment(args, logger, out_dir):
    """
    Obtains data, trains model, and generates instance-attribution explanations.
    """

    # get data
    X_train, X_test, y_train, y_test = data_util.get_data(args.dataset, data_dir=args.data_dir)

    # select a subset of the test data for evaluation
    n_test_samples = args.n_test if args.n_test is not None else int(X_test.shape[0] * args.test_frac)
    np.random.seed(args.rs)
    test_indices = np.random.choice(X_test.shape[0], size=n_test_samples, replace=False)
    X_test_sub, y_test_sub = X_test[test_indices], y_test[test_indices]

    # choose a new subset if the test subset contains only one label
    new_seed = args.rs
    while y_test_sub.sum() == len(y_test_sub) or y_test_sub.sum() == 0:
        np.random.seed(new_seed)
        new_seed += np.random.randint(MAX_SEED_INCREASE)
        np.random.seed(new_seed)
        test_indices = np.random.choice(X_test.shape[0], size=n_test_samples, replace=False)
        X_test_sub, y_test_sub = X_test[test_indices], y_test[test_indices]

    X_test = X_test_sub
    y_test = y_test_sub

    # dataset statistics
    logger.info('\ntrain instances: {:,}'.format(X_train.shape[0]))
    logger.info('test instances: {:,}'.format(X_test.shape[0]))
    logger.info('features: {:,}'.format(X_train.shape[1]))

    # experiment settings
    logger.info('\nrandom state: {}'.format(args.rs))
    logger.info('criterion: {}'.format(args.criterion))
    logger.info('n_estimators: {}'.format(args.n_estimators))
    logger.info('max_depth: {}'.format(args.max_depth))
    logger.info('k: {}'.format(args.k))
    logger.info('max_features: {}'.format(args.max_features))
    logger.info('n_test: {}\n'.format(args.n_test))

    # train target model
    model = _get_model(args)
    name = 'G-DaRE'

    start = time.time()
    model = model.fit(X_train, y_train)
    train_time = time.time() - start

    logger.info('[{}] train time: {:.3f}s'.format(name, train_time))
    exp_util.performance(model, X_test, y_test, logger=logger, name=name)

    percentages = list(range(0, 100, 1))
    start = time.time()

    # random method
    if args.method == 'random':
        logger.info('\nordering by random...')
        np.random.seed(args.rs)
        train_order = np.random.choice(np.arange(X_train.shape[0]), size=X_train.shape[0], replace=False)
        results = measure_performance(train_order, percentages, X_test, y_test, X_train, y_train, logger)

    # G-DaRE 1: ordered from biggest sum increase in positive label confidence to least
    elif args.method == 'dare1':
        logger.info('\nordering by G-DaRE...')
        explanation = exp_util.explain_lite(model, X_train, y_train, X_test)
        train_order = np.argsort(explanation)[::-1]
        results = measure_performance(train_order, percentages, X_test, y_test, X_train, y_train, logger)

    # G-DaRE 2: ordered by most positively influential to least positively influential
    elif args.method == 'dare2':
        logger.info('\nordering by G-DaRE 2...')
        explanation = exp_util.explain_lite(model, X_train, y_train, X_test, y_test=y_test)
        train_order = np.argsort(explanation)[::-1]
        results = measure_performance(train_order, percentages, X_test, y_test, X_train, y_train, logger)

    # G-DaRE 3: ordered by biggest sum of absolute change in predictions
    elif args.method == 'dart3':
        logger.info('\nordering by G-DaRE 3...')
        explanation = exp_util.explain_lite(model, X_train, y_train, X_test, use_abs=True)
        train_order = np.argsort(explanation)[::-1]
        results = measure_performance(train_order, percentages, X_test, y_test, X_train, y_train, logger)

    logger.info('time: {:.3f}s'.format(time.time() - start))

    results['percentage'] = percentages
    np.save(os.path.join(out_dir, 'results.npy'), results)
Example 13
def experiment(args, logger, out_dir, seed):

    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=seed)

    data = data_util.get_data(args.dataset,
                              random_state=seed,
                              data_dir=args.data_dir,
                              return_feature=True)
    X_train, X_test, y_train, y_test, label, feature = data

    logger.info('train instances: {:,}'.format(len(X_train)))
    logger.info('test instances: {:,}'.format(len(X_test)))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # train a tree ensemble and explainer
    tree = clone(clf).fit(X_train, y_train)
    model_util.performance(tree,
                           X_train,
                           y_train,
                           X_test,
                           y_test,
                           logger=logger)

    original_auc = roc_auc_score(y_test, tree.predict_proba(X_test)[:, 1])
    original_acc = accuracy_score(y_test, tree.predict(X_test))

    # train TREX
    explainer = trex.TreeExplainer(
        tree,
        X_train,
        y_train,
        tree_kernel=args.tree_kernel,
        random_state=seed,
        kernel_model=args.kernel_model,
        kernel_model_kernel=args.kernel_model_kernel,
        true_label=args.true_label)

    # get missed test instances
    missed_indices = np.where(tree.predict(X_test) != y_test)[0]

    np.random.seed(seed)
    explain_indices = np.random.choice(
        missed_indices,
        replace=False,
        size=int(len(missed_indices) * args.sample_frac))

    logger.info('no. incorrect instances: {:,}'.format(len(missed_indices)))
    logger.info('no. explain instances: {:,}'.format(len(explain_indices)))

    # compute total impact of train instances on test instances
    contributions = explainer.explain(X_test[explain_indices],
                                      y=y_test[explain_indices])
    impact_sum = np.sum(contributions, axis=0)

    # get train instances that impact the predictions
    neg_contributors = np.where(impact_sum < 0)[0]
    neg_impact = impact_sum[neg_contributors]
    neg_contributors = neg_contributors[np.argsort(neg_impact)]

    # remove offending train instances in segments and measure performance
    aucs = []
    accs = []
    n_removed = []
    for i in tqdm.tqdm(range(args.n_iterations + 1)):

        # remove these instances from the train data
        delete_ndx = neg_contributors[:args.n_remove * i]
        new_X_train = np.delete(X_train, delete_ndx, axis=0)
        new_y_train = np.delete(y_train, delete_ndx)

        tree = clone(clf).fit(new_X_train, new_y_train)

        aucs.append(roc_auc_score(y_test, tree.predict_proba(X_test)[:, 1]))
        accs.append(accuracy_score(y_test, tree.predict(X_test)))

        n_removed.append(args.n_remove * i)

    # save results
    result = tree.get_params()
    result['original_auc'] = original_auc
    result['original_acc'] = original_acc
    result['auc'] = aucs
    result['acc'] = accs
    result['n_remove'] = n_removed
    np.save(os.path.join(out_dir, 'results.npy'), result)
Example 14
def performance(args, out_dir, logger):

    begin = time.time()

    # obtain data
    X_train, X_test, y_train, y_test = data_util.get_data(
        args.dataset, data_dir=args.data_dir)

    # dataset statistics
    logger.info('\nno. train instances: {:,}'.format(X_train.shape[0]))
    logger.info('no. test instances: {:,}'.format(X_test.shape[0]))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))
    logger.info('split criterion: {}'.format(args.criterion))
    logger.info('scoring: {}'.format(args.scoring))

    # tune on a fraction of the training data
    if args.tune_frac < 1.0:
        sss = StratifiedShuffleSplit(n_splits=1,
                                     test_size=2,
                                     train_size=args.tune_frac,
                                     random_state=args.rs)
        tune_indices, _ = list(sss.split(X_train, y_train))[0]
        X_train_sub, y_train_sub = X_train[tune_indices], y_train[tune_indices]
        logger.info('tune instances: {:,}'.format(X_train_sub.shape[0]))

    else:
        X_train_sub, y_train_sub = X_train, y_train

    skf = StratifiedKFold(n_splits=args.cv, shuffle=True, random_state=args.rs)

    # train exact model
    start = time.time()
    model = _get_model(args, topd=0)
    exact_score = cross_val_score(model,
                                  X_train_sub,
                                  y_train_sub,
                                  scoring=args.scoring,
                                  cv=skf).mean()
    logger.info('\n[topd=0] CV score: {:.5f}, time: {:.3f}s'.format(
        exact_score,
        time.time() - start))

    # train models with increasing topd and compare to the exact (topd=0) model
    s = '[topd={}] CV score: {:.5f}, CV diff: {:.5f}, time: {:.3f}s'
    scores = {}
    best_scores = {tol: 0 for tol in args.tol}

    for topd in range(1, args.max_depth + 1):
        start = time.time()

        # obtain score for this topd
        model = _get_model(args, topd=topd)
        score = cross_val_score(model,
                                X_train_sub,
                                y_train_sub,
                                scoring=args.scoring,
                                cv=skf).mean()
        score_diff = exact_score - score
        scores[topd] = score
        end = time.time() - start

        logger.info(s.format(topd, score, score_diff, end))

        # update best score for each tolerance
        for tol in args.tol:
            if best_scores[tol] == topd - 1 and score_diff <= tol:
                best_scores[tol] = topd

        total_time = time.time() - begin

    logger.info('{}, total time: {:.3f}s'.format(best_scores, total_time))
    logger.info('max_rss: {:,}'.format(
        resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
    np.save(os.path.join(out_dir, 'results.npy'), best_scores)
Example 15
def experiment(args, logger, out_dir, seed):

    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=seed)

    data = data_util.get_data(args.dataset,
                              random_state=seed,
                              data_dir=args.data_dir,
                              return_image_id=True,
                              test_size=args.test_size)
    X_train, X_test, y_train, y_test, label = data

    logger.info('train instances: {}'.format(len(X_train)))
    logger.info('test instances: {}'.format(len(X_test)))
    logger.info('labels: {}'.format(label))

    if args.pca_components is not None:
        logger.info('reducing {} features to {} components using PCA...'.format(
            X_train.shape[1], args.pca_components))
        pca = PCA(args.pca_components, random_state=args.rs).fit(X_train)
        X_train_pca = pca.transform(X_train)
        X_test_pca = pca.transform(X_test)
    else:
        X_train_pca, X_test_pca = X_train, X_test

    # fit a tree ensemble and an explainer for that tree ensemble
    logger.info('fitting {}...'.format(args.tree_type))
    tree = clone(clf).fit(X_train_pca, y_train)

    # show GBDT performance
    model_util.performance(tree,
                           X_train_pca,
                           y_train,
                           X_test_pca,
                           y_test,
                           logger=logger)

    logger.info('fitting TREX...')
    explainer = trex.TreeExplainer(tree,
                                   X_train_pca,
                                   y_train,
                                   tree_kernel=args.tree_kernel,
                                   random_state=seed,
                                   kernel_model=args.kernel_model,
                                   val_frac=args.val_frac,
                                   verbose=args.verbose,
                                   true_label=args.true_label,
                                   cv=2,
                                   logger=logger)

    # pick a random test instance to explain
    if args.random_test:
        np.random.seed(seed)
        test_ndx = np.random.choice(len(y_test))

    # pick a random mispredicted test instance to explain
    else:
        # y_test_label = explainer.le_.transform(y_test)
        # test_dist = exp_util.instance_loss(tree.predict_proba(X_test_pca), y_test_label)
        test_dist = exp_util.instance_loss(tree.predict_proba(X_test_pca),
                                           y_test)
        test_dist_ndx = np.argsort(test_dist)[::-1]
        np.random.seed(seed)
        test_ndx = np.random.choice(test_dist_ndx[:50])

    x_test = X_test_pca[test_ndx].reshape(1, -1)
    test_pred = tree.predict(x_test)[0]
    test_actual = y_test[test_ndx]

    # compute the impact of each training instance
    impact = explainer.explain(x_test)[0]
    alpha = explainer.get_weight()[0]
    sim = explainer.similarity(x_test)[0]

    # sort the training instances by impact in descending order
    sort_ndx = np.argsort(impact)[::-1]

    # matplotlib settings
    plt.rc('font', family='serif')
    plt.rc('xtick', labelsize=13)
    plt.rc('ytick', labelsize=13)
    plt.rc('axes', labelsize=13)
    plt.rc('axes', titlesize=13)
    plt.rc('legend', fontsize=11)
    plt.rc('legend', title_fontsize=11)
    plt.rc('lines', linewidth=1)
    plt.rc('lines', markersize=6)

    # inches
    width = 5.5  # Neurips 2020
    width, height = set_size(width=width * 3, fraction=1, subplots=(1, 3))
    fig, axs = plt.subplots(2,
                            1 + args.topk_train * 2,
                            figsize=(width, height))

    print(axs.shape)

    # plot the test image
    identifier = 'test_id{}'.format(test_ndx)
    _display_image(args,
                   X_test[test_ndx],
                   identifier=identifier,
                   predicted=test_pred,
                   actual=test_actual,
                   ax=axs[0][0])
    plt.setp(axs[0][0].spines.values(), color='blue')

    topk_train = args.topk_train if args.show_negatives else args.topk_train * 2

    # show positive train images
    for i, train_ndx in enumerate(sort_ndx[:topk_train]):
        i += 1
        identifier = 'train_id{}'.format(train_ndx)
        train_pred = tree.predict(X_train_pca[train_ndx].reshape(1, -1))[0]
        similarity = sim[train_ndx] if args.show_similarity else None
        weight = alpha[train_ndx] if args.show_weight else None
        plt.setp(axs[0][i].spines.values(), color='green')
        _display_image(args,
                       X_train[train_ndx],
                       ax=axs[0][i],
                       identifier=identifier,
                       predicted=train_pred,
                       actual=y_train[train_ndx],
                       similarity=similarity,
                       weight=weight)

    # show negative train images
    if args.show_negatives:
        for i, train_ndx in enumerate(sort_ndx[::-1][:topk_train]):
            i += 1 + args.topk_train
            identifier = 'train_id{}'.format(train_ndx)
            train_pred = tree.predict(X_train_pca[train_ndx].reshape(1, -1))[0]
            similarity = sim[train_ndx] if args.show_similarity else None
            weight = alpha[train_ndx] if args.show_weight else None
            plt.setp(axs[0][i].spines.values(), color='red')
            _display_image(args,
                           X_train[train_ndx],
                           ax=axs[0][i],
                           identifier=identifier,
                           predicted=train_pred,
                           actual=y_train[train_ndx],
                           similarity=similarity,
                           weight=weight)

    plt.savefig(os.path.join(out_dir, 'plot.pdf'),
                format='pdf',
                bbox_inches='tight')
    plt.show()

    # show highest weighted and lowest weighted samples for each class
    alpha_indices = np.argsort(alpha)

    print(alpha_indices)

    # plot highest negative weighted samples
    for i, train_ndx in enumerate(alpha_indices[:topk_train]):
        i += 1
        identifier = 'train_id{}'.format(train_ndx)
        train_pred = tree.predict(X_train_pca[train_ndx].reshape(1, -1))[0]
        similarity = sim[train_ndx] if args.show_similarity else None
        weight = alpha[train_ndx] if args.show_weight else None
        plt.setp(axs[1][i].spines.values(), color='red')
        _display_image(args,
                       X_train[train_ndx],
                       ax=axs[1][i],
                       identifier=identifier,
                       predicted=train_pred,
                       actual=y_train[train_ndx],
                       similarity=similarity,
                       weight=weight)

    # plot highest positive weighted samples
    for i, train_ndx in enumerate(alpha_indices[::-1][:topk_train]):
        i += 1 + args.topk_train
        identifier = 'train_id{}'.format(train_ndx)
        train_pred = tree.predict(X_train_pca[train_ndx].reshape(1, -1))[0]
        similarity = sim[train_ndx] if args.show_similarity else None
        weight = alpha[train_ndx] if args.show_weight else None
        plt.setp(axs[1][i].spines.values(), color='green')
        _display_image(args,
                       X_train[train_ndx],
                       ax=axs[1][i],
                       identifier=identifier,
                       predicted=train_pred,
                       actual=y_train[train_ndx],
                       similarity=similarity,
                       weight=weight)
Example 16
def experiment(args, logger, out_dir, seed):

    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=seed)

    # get original feature space
    data = data_util.get_data(args.dataset,
                              random_state=seed,
                              data_dir=args.data_dir,
                              return_feature=True)
    X_train, X_test, y_train, y_test, label, feature = data

    logger.info('\ntrain instances: {}'.format(len(X_train)))
    logger.info('test instances: {}'.format(len(X_test)))
    logger.info('no. features: {}'.format(X_train.shape[1]))

    # filter the features to be the same as MFC18
    mapping = {'NC17_EvalPart1': 'nc17_mfc18',
               'MFC18_EvalPart1': 'mfc18_mfc19',
               'MFC19_EvalPart1': 'mfc19_mfc20'}

    if args.dataset in mapping:
        reduced_feature = data_util.get_data(mapping[args.dataset],
                                             random_state=seed,
                                             data_dir=args.data_dir,
                                             return_feature=True)[-1]

        keep_ndx = align_feature(feature, reduced_feature)
        feature = feature[keep_ndx]
        X_train = X_train[:, keep_ndx]
        X_test = X_test[:, keep_ndx]

    # train a tree ensemble and explainer
    tree = clone(clf).fit(X_train, y_train)
    model_util.performance(tree, X_train, y_train, X_test, y_test)

    # store indexes of different subgroups
    train_neg = np.where(y_train == 0)[0]
    train_pos = np.where(y_train == 1)[0]
    # test_neg = np.where(y_test == 0)[0]
    # test_pos = np.where(y_test == 1)[0]

    # transform features to tree kernel space
    logger.info('\ntransforming features into tree kernel space')
    start = time.time()
    extractor = TreeExtractor(tree, tree_kernel=args.tree_kernel)

    X_train_tree = extractor.fit_transform(X_train)
    logger.info('  train transform time: {:.3f}s'.format(time.time() - start))

    X_test_tree = extractor.transform(X_test)
    logger.info('  test transform time: {:.3f}s'.format(time.time() - start))

    # reduce dimensionality on original and tree feature spaces
    logger.info('\nembed original features into a lower dimensional space')
    X_train, X_test = reduce_and_embed(args, X_train, X_test, logger, init='random')

    logger.info('\nembed tree kernel features into a lower dimensional space')
    X_train_tree, X_test_tree = reduce_and_embed(args, X_train_tree, X_test_tree, logger, init='pca')

    # separating embedded points into train and test
    # n_train = len(y_train)
    # train_neg_embed = X_embed[:n_train][train_neg]
    # train_pos_embed = X_embed[:n_train][train_pos]
    # test_neg_embed = X_embed[n_train:][test_neg]
    # test_pos_embed = X_embed[n_train:][test_pos]

    # save original feature space results
    np.save(os.path.join(out_dir, 'train_negative'), X_train[train_neg])
    np.save(os.path.join(out_dir, 'train_positive'), X_train[train_pos])

    # save tree kernel space results
    np.save(os.path.join(out_dir, 'train_tree_negative'), X_train_tree[train_neg])
    np.save(os.path.join(out_dir, 'train_tree_positive'), X_train_tree[train_pos])
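
`reduce_and_embed` is not shown; given the `init='random'`/`init='pca'` arguments it presumably reduces dimensionality and then computes a low-dimensional embedding. A hypothetical sketch using TruncatedSVD followed by a joint 2-D t-SNE embedding of train and test:

import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

def reduce_and_embed(X_train, X_test, n_components=50, init='pca', random_state=1):
    # hypothetical sketch: SVD to `n_components`, then a joint 2-D t-SNE embedding
    X_all = np.vstack([X_train, X_test])
    if X_all.shape[1] > n_components:
        X_all = TruncatedSVD(n_components=n_components,
                             random_state=random_state).fit_transform(X_all)
    X_embed = TSNE(n_components=2, init=init,
                   random_state=random_state).fit_transform(X_all)
    return X_embed[:len(X_train)], X_embed[len(X_train):]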