def committee(solver, solver_name, intervals, reps):
    np.random.seed()  # reseed from system entropy
    X, y = util.basic_data()
    polls = util.add_noise(y)

    # start with 4 randomly labeled counties
    curr_labels = np.random.choice(range(len(X)), size=4, replace=False)
    X_train = X[curr_labels]  # note: unused below

    square_errors = np.zeros([2, len(intervals)])
    for i in range(len(intervals)):
        print("interval: ", intervals[i])
        for j in range(reps):

            # query new counties until the current labeling budget is reached
            while len(curr_labels) <= intervals[i]:
                next_points = next_countys(solver, curr_labels, X, polls)
                curr_labels = np.append(curr_labels, next_points)
            curr_labels = curr_labels[:intervals[i]]

            preds = solver(X, X[curr_labels], polls[curr_labels])
            square_errors[:, i] += util.square_error(y, preds)
        square_errors[:, i] /= reps

    square_errors = np.vstack((square_errors.mean(axis=0),
                               util.performance(solver, intervals, reps).mean(axis=0)))
    util.plot("committee", intervals / len(X), square_errors,
              legend=[solver_name, "random"], x_label="% counties",
              y_label="MSE", title="Committee")
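# For context, a minimal usage sketch (hypothetical, not part of the original
# script): `solver` is assumed to take the full feature matrix plus the labeled
# subset and return predictions for every row, matching the
# solver(X, X[curr_labels], polls[curr_labels]) call above.
def nn_solver(X, X_labeled, y_labeled):
    """Hypothetical 1-nearest-neighbor solver illustrating the expected signature."""
    # for each row of X, predict the label of its closest labeled point
    dists = np.linalg.norm(X[:, None, :] - X_labeled[None, :, :], axis=2)
    return y_labeled[np.argmin(dists, axis=1)]

# e.g., evaluate at budgets of 10..100 counties, averaged over 5 repetitions:
# committee(nn_solver, '1-NN', np.arange(10, 101, 10), reps=5)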
def experiment(args, logger, out_dir):
    """
    Main method that removes training instances ordered by different methods
    and measures their impact on a random set of test instances.
    """

    # start timer
    begin = time.time()

    # create random number generator
    rng = np.random.default_rng(args.rs)

    # get data
    data = util.get_data(args.dataset, data_dir=args.data_dir, preprocessing=args.preprocessing)
    X_train, X_test, y_train, y_test, feature, cat_indices = data

    # get tree-ensemble
    clf = util.get_model(args.model,
                         n_estimators=args.n_estimators,
                         max_depth=args.max_depth,
                         random_state=args.rs,
                         cat_indices=cat_indices)

    # use a fraction of the train data
    if 0.0 < args.train_frac < 1.0:
        n_train_samples = int(X_train.shape[0] * args.train_frac)
        train_indices = rng.choice(X_train.shape[0], size=n_train_samples, replace=False)
        X_train, y_train = X_train[train_indices], y_train[train_indices]

    # select a (stratified) subset of test instances uniformly at random;
    # this subset is replaced below by a class-balanced selection after training
    _, X_test_sub, _, y_test_sub = train_test_split(X_test, y_test,
                                                    test_size=args.n_test,
                                                    random_state=args.rs,
                                                    stratify=y_test)

    # display dataset statistics
    logger.info('\nno. train instances: {:,}'.format(X_train.shape[0]))
    logger.info('no. test instances: {:,}'.format(X_test_sub.shape[0]))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))
    logger.info('pos. label % (test): {:.1f}%\n'.format(np.sum(y_test) / y_test.shape[0] * 100))

    # train a tree ensemble
    model = clone(clf).fit(X_train, y_train)
    util.performance(model, X_train, y_train, logger=logger, name='Train')

    # select a subset of test instances, half from the neg. class and half from the pos. class
    model_pred_test = model.predict(X_test)
    neg_test_indices = np.where(model_pred_test == 0)[0]
    pos_test_indices = np.where(model_pred_test == 1)[0]
    neg_test_indices = rng.choice(neg_test_indices, size=int(args.n_test / 2), replace=False)
    pos_test_indices = rng.choice(pos_test_indices, size=int(args.n_test / 2), replace=False)
    test_indices = np.concatenate([neg_test_indices, pos_test_indices])
    X_test_sub, y_test_sub = X_test[test_indices], y_test[test_indices]
    util.performance(model, X_test_sub, y_test_sub, logger=logger, name='Test')

    # compute how many samples to remove before a checkpoint
    if args.train_frac_to_remove >= 1.0:  # interpreted as an absolute no. of samples
        n_checkpoint = int(args.train_frac_to_remove)
    elif args.train_frac_to_remove > 0:
        n_checkpoint = int(args.train_frac_to_remove * X_train.shape[0] / args.n_checkpoints)
    else:
        raise ValueError('invalid train_frac_to_remove: {}'.format(args.train_frac_to_remove))

    # sort train instances, then remove, retrain, and re-evaluate
    train_indices = sort_train_instances(args, model, X_train, y_train, X_test_sub, y_test_sub, rng, logger=logger)
    result = measure_performance(train_indices, n_checkpoint, args.n_checkpoints,
                                 clf, X_train, y_train, X_test_sub, y_test_sub, logger=logger)

    # save results
    result['max_rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    result['total_time'] = time.time() - begin
    np.save(os.path.join(out_dir, 'results.npy'), result)
    plt.savefig(os.path.join(out_dir, 'special_ckpt.pdf'), bbox_inches='tight')

    # display results
    logger.info('\nResults:\n{}'.format(result))
    logger.info('\nsaving results to {}...'.format(os.path.join(out_dir, 'results.npy')))
def experiment(args, logger, out_dir):
    """
    Main method that removes training instances ordered by different methods
    and measures their impact on a random set of test instances.
    """

    # start timer
    begin = time.time()

    # create random number generator
    rng = np.random.default_rng(args.rs)

    # get data
    data = util.get_data(args.dataset, data_dir=args.data_dir, preprocessing=args.preprocessing)
    X_train, X_test, y_train, y_test, feature, cat_indices = data

    # get tree-ensemble
    clf = util.get_model(args.model,
                         n_estimators=args.n_estimators,
                         max_depth=args.max_depth,
                         random_state=args.rs,
                         cat_indices=cat_indices)

    # train a tree ensemble
    model = clone(clf).fit(X_train, y_train)
    util.performance(model, X_train, y_train, logger=logger, name='Train')
    util.performance(model, X_test, y_test, logger=logger, name='Test')

    # select a subset of test instances uniformly at random
    if args.start_pred == -1:

        # select an instance at random
        if args.n_test == 1:
            test_indices = rng.choice(X_test.shape[0], size=args.n_test, replace=False)
            X_test_sub, y_test_sub = X_test[test_indices], y_test[test_indices]

        # use the entire test set
        elif args.n_test <= 0:
            X_test_sub, y_test_sub = X_test, y_test

        # use a stratified sample of the test set
        else:
            n_test = 1.0 if args.n_test == -1 else args.n_test
            _, X_test_sub, _, y_test_sub = train_test_split(X_test, y_test,
                                                            test_size=n_test,
                                                            random_state=args.rs,
                                                            stratify=y_test)

    # select a subset of test instances of the desired predicted label uniformly at random
    elif args.start_pred in [0, 1]:

        # use the entire test set
        if args.n_test <= 0:
            X_test_sub, y_test_sub = X_test, y_test

        # select a specified no. instances from the specified class
        else:
            model_pred = model.predict(X_test)
            label_indices = np.where(model_pred == args.start_pred)[0]
            test_indices = rng.choice(label_indices, size=args.n_test, replace=False)
            X_test_sub, y_test_sub = X_test[test_indices], y_test[test_indices]

    else:
        raise ValueError('unknown start_pred: {}'.format(args.start_pred))

    # display dataset statistics
    logger.info('\nno. train instances: {:,}'.format(X_train.shape[0]))
    logger.info('no. test instances: {:,}'.format(X_test_sub.shape[0]))
    logger.info('no. features: {:,}\n'.format(X_train.shape[1]))

    # sort train instances, then remove, retrain, and re-evaluate
    result = measure_performance(args, clf, X_train, y_train, X_test_sub, y_test_sub,
                                 rng, out_dir, logger=logger)

    # save results
    result['max_rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    result['total_time'] = time.time() - begin
    np.save(os.path.join(out_dir, 'results.npy'), result)

    # display results
    logger.info('\nResults:\n{}'.format(result))
    logger.info('\nsaving results to {}...'.format(os.path.join(out_dir, 'results.npy')))
def experiment(args, logger, out_dir, seed):
    """
    Main method comparing the performance of tree-ensemble and SVM models.
    """

    # start experiment timer
    begin = time.time()

    # get data
    data = util.get_data(args.dataset, data_dir=args.data_dir, preprocessing=args.preprocessing)
    X_train, X_test, y_train, y_test, feature, cat_indices = data
    logger.info('no. train: {:,}'.format(X_train.shape[0]))
    logger.info('no. test: {:,}'.format(X_test.shape[0]))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # tune on a fraction of the training data
    if not args.no_tune and args.tune_frac < 1.0:
        sss = StratifiedShuffleSplit(n_splits=1,
                                     test_size=2,  # placeholder; only train_size matters here
                                     train_size=args.tune_frac,
                                     random_state=args.rs)
        tune_indices, _ = list(sss.split(X_train, y_train))[0]
        X_train_sub, y_train_sub = X_train[tune_indices], y_train[tune_indices]
        logger.info('tune instances: {:,}'.format(X_train_sub.shape[0]))
    else:
        X_train_sub, y_train_sub = X_train, y_train

    # get model
    model, param_grid = get_model(args, cat_indices=cat_indices)
    logger.info('\nmodel: {}, param_grid: {}'.format(args.model, param_grid))

    # tune the model
    start = time.time()
    if not args.no_tune:
        skf = StratifiedKFold(n_splits=args.cv, shuffle=True, random_state=args.rs)
        gs = GridSearchCV(model, param_grid, scoring=args.scoring, cv=skf, verbose=args.verbose)
        gs = gs.fit(X_train_sub, y_train_sub)

        cols = ['mean_fit_time', 'mean_test_score', 'rank_test_score']
        cols += ['param_{}'.format(param) for param in param_grid.keys()]
        df = pd.DataFrame(gs.cv_results_)
        logger.info('gridsearch results:')
        logger.info(df[cols].sort_values('rank_test_score'))

        model = clone(gs.best_estimator_)
        logger.info('best params: {}'.format(gs.best_params_))
    tune_time = time.time() - start
    logger.info('tune time: {:.3f}s'.format(tune_time))

    # train model
    start = time.time()
    model = model.fit(X_train, y_train)
    train_time = time.time() - start
    logger.info('train time: {:.3f}s'.format(train_time))

    # evaluate
    auc, acc, ap, ll = util.performance(model, X_test, y_test, logger, name=args.model)

    # save results
    result = {}
    result['model'] = args.model
    result['auc'] = auc
    result['acc'] = acc
    result['ap'] = ap
    result['ll'] = ll
    result['tune_time'] = tune_time
    result['train_time'] = train_time
    result['max_rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    result['tune_frac'] = args.tune_frac
    if not args.no_tune and args.model in ['cb', 'rf']:  # best params only exist after tuning
        result['n_estimators'] = gs.best_params_['n_estimators']
        result['max_depth'] = gs.best_params_['max_depth']
    np.save(os.path.join(out_dir, 'results.npy'), result)

    # Macs report max_rss in bytes, other unix machines in KB
    logger.info('max_rss: {:,}'.format(result['max_rss']))
    logger.info('total time: {:.3f}s'.format(time.time() - begin))
    logger.info('saving results to {}...'.format(os.path.join(out_dir, 'results.npy')))
import os

from lda import lda

# pca, imread, show_eigenface, show_reconstruction, and performance are
# assumed to be project utilities imported elsewhere in this script


if __name__ == '__main__':
    filepath = os.path.join('Yale_Face_Database', 'Training')
    H, W = 231, 195
    X, y = imread(filepath, H, W)

    # PCA first to reduce dimensionality, then LDA in the PCA subspace
    eigenvalues_pca, eigenvectors_pca, X_mean = pca(X, num_dim=31)
    X_pca = eigenvectors_pca.T @ (X - X_mean)
    eigenvalues_lda, eigenvectors_lda = lda(X_pca, y)

    # transform matrix
    U = eigenvectors_pca @ eigenvectors_lda
    print('U shape: {}'.format(U.shape))

    # show top 25 eigenfaces
    show_eigenface(U, 25, H, W)

    # reduce dim (projection)
    Z = U.T @ X

    # recover
    X_recover = U @ Z + X_mean
    show_reconstruction(X, X_recover, 10, H, W)

    # accuracy
    filepath = os.path.join('Yale_Face_Database', 'Testing')
    X_test, y_test = imread(filepath, H, W)
    acc = performance(X_test, y_test, Z, y, U, X_mean, 5)
    print('acc: {:.2f}%'.format(acc * 100))
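# A plausible sketch of `performance` (hypothetical; the project's actual helper
# is defined elsewhere): the k=5 argument and the `acc * 100` usage above suggest
# k-NN classification in the projected space, returning an accuracy fraction.
# Faces are assumed to be stored column-wise, as in the code above.
import numpy as np

def knn_performance(X_test, y_test, Z, y, U, X_mean, k):
    """Hypothetical k-NN classifier in the projected (fisherface) space."""
    Z_test = U.T @ (X_test - X_mean)  # project test faces
    correct = 0
    for i in range(Z_test.shape[1]):  # each column is one face
        dists = np.linalg.norm(Z - Z_test[:, [i]], axis=0)
        nearest = y[np.argsort(dists)[:k]]  # labels of the k closest training faces
        labels, counts = np.unique(nearest, return_counts=True)
        if labels[np.argmax(counts)] == y_test[i]:
            correct += 1
    return correct / Z_test.shape[1]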
def experiment(args, logger, out_dir):
    logger.info('\nDATA')
    start = time.time()

    in_dir = os.path.join(args.data_dir, args.dataset, 'fold_{}'.format(args.fold))

    # read in feature data
    logger.info('reading in data...')
    X_train = load_npz(os.path.join(in_dir, '{}_train.npz'.format(args.feature_type))).tocsr()
    X_val = load_npz(os.path.join(in_dir, '{}_val.npz'.format(args.feature_type))).tocsr()
    X_test = load_npz(os.path.join(in_dir, '{}_test.npz'.format(args.feature_type))).tocsr()

    # read in label data
    train_df = pd.read_csv(os.path.join(in_dir, 'y_train.csv'))
    val_df = pd.read_csv(os.path.join(in_dir, 'y_val.csv'))
    test_df = pd.read_csv(os.path.join(in_dir, 'y_test.csv'))

    # filter out transductive test indices
    if args.test_type == 'inductive':
        indices = np.load(os.path.join(in_dir, 'inductive_indices.npz'))
        val_df = val_df[val_df['com_id'].isin(indices['val'])]
        test_df = test_df[test_df['com_id'].isin(indices['test'])]
        X_val = X_val[val_df.index]
        X_test = X_test[test_df.index]

    # extract label data
    y_train = train_df['label'].to_numpy()
    y_val = val_df['label'].to_numpy()
    y_test = test_df['label'].to_numpy()

    # extract identifier data
    target_ids_train = train_df['com_id'].to_numpy()
    target_ids_val = val_df['com_id'].to_numpy()
    target_ids_test = test_df['com_id'].to_numpy()

    logger.info('\ntrain instances: X: {}, y: {}'.format(X_train.shape, y_train.shape))
    logger.info('val instances: X: {}, y: {}'.format(X_val.shape, y_val.shape))
    logger.info('test instances: X: {}, y: {}'.format(X_test.shape, y_test.shape))
    logger.info('total time: {:.3f}s'.format(time.time() - start))

    # train
    logger.info('\nTRAIN')
    start = time.time()

    # setup models
    model = _get_model(args, data_dir=in_dir, logger=logger)

    if args.eggs:
        model = model.fit(X_train, y_train, target_ids_train, X_val, y_val, target_ids_val)
    else:
        model = model.fit(X_train, y_train)
    logger.info('total time: {:.3f}s'.format(time.time() - start))

    # predict
    logger.info('\nPREDICT')
    start = time.time()

    if args.eggs:
        proba = model.predict_proba(X_test, target_ids_test)[:, 1]
    else:
        proba = model.predict_proba(X_test)[:, 1]

    auc, ap = util.performance(y_test, proba, logger=logger, name='model')
    logger.info('total time: {:.3f}s'.format(time.time() - start))

    # save results
    result = {'auc': auc, 'ap': ap}
    result['target_id'] = target_ids_test
    result['label'] = y_test
    result['yhat'] = proba
    np.save(os.path.join(out_dir, 'result.npy'), result)
def experiment(args, logger, out_dir):
    """
    Main method that trains a tree ensemble, then compares the
    runtime of different methods to explain a single test instance.
    """

    # start timer
    begin = time.time()

    # create random number generator
    rng = np.random.default_rng(args.rs)

    # get data
    data = util.get_data(args.dataset, data_dir=args.data_dir, preprocessing=args.preprocessing)
    X_train, X_test, y_train, y_test, feature, cat_indices = data

    # get tree-ensemble
    clf = util.get_model(args.model,
                         n_estimators=args.n_estimators,
                         max_depth=args.max_depth,
                         random_state=args.rs,
                         cat_indices=cat_indices)

    logger.info('\nno. train instances: {:,}'.format(X_train.shape[0]))
    logger.info('no. test instances: {:,}'.format(X_test.shape[0]))
    logger.info('no. features: {:,}\n'.format(X_train.shape[1]))

    # train a tree ensemble
    model = clone(clf).fit(X_train, y_train)
    util.performance(model, X_train, y_train, logger=logger, name='Train')
    util.performance(model, X_test, y_test, logger=logger, name='Test')

    # randomly pick a test instance to explain
    test_ndx = rng.choice(y_test.shape[0], size=1, replace=False)

    # TREX
    if 'klr' in args.method or 'svm' in args.method:
        result = trex_method(args, model, test_ndx, X_train, y_train, X_test, logger=logger)

    # Leaf Influence
    elif 'leaf_influence' in args.method and args.model == 'cb':
        result = leaf_influence_method(args, model, test_ndx, X_train, y_train, X_test, y_test, logger=logger)

    # MAPLE
    elif args.method == 'maple':
        result = maple_method(args, model, test_ndx, X_train, y_train, X_test, logger=logger)

    # TEKNN
    elif 'knn' in args.method:
        result = teknn_method(args, model, test_ndx, X_train, y_train, X_test, logger=logger)

    else:
        raise ValueError('method {} unknown!'.format(args.method))

    # save results
    result['max_rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    result['total_time'] = time.time() - begin
    np.save(os.path.join(out_dir, 'results.npy'), result)

    # display results
    logger.info('\nResults:\n{}'.format(result))
    logger.info('\nsaving results to {}...'.format(os.path.join(out_dir, 'results.npy')))
def experiment(args, logger, out_dir):
    """
    Cleaning Experiment:
      1) Train a tree ensemble.
      2) Flip a percentage of train labels.
      3) Prioritize train instances to be checked using various methods.
      4) Check and correct any flipped train labels.
      5) Compute how effective each method is at cleaning the data.
    """

    # start timer
    begin = time.time()

    # get data
    data = util.get_data(args.dataset, data_dir=args.data_dir, preprocessing=args.preprocessing)
    X_train, X_test, y_train, y_test, feature, cat_indices = data

    # get tree-ensemble
    clf = util.get_model(args.model,
                         n_estimators=args.n_estimators,
                         max_depth=args.max_depth,
                         random_state=args.rs,
                         cat_indices=cat_indices)

    # use a subset of the training data
    if 0.0 < args.train_frac < 1.0:
        n_train = int(X_train.shape[0] * args.train_frac)
        X_train, y_train = X_train[:n_train], y_train[:n_train]

    logger.info('\nno. train instances: {:,}'.format(len(X_train)))
    logger.info('no. test instances: {:,}'.format(len(X_test)))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # add noise
    y_train_noisy, noisy_indices = flip_labels(y_train, k=args.flip_frac, seed=args.rs, logger=logger)
    noisy_indices = np.array(sorted(noisy_indices))
    logger.info('no. noisy labels: {:,}'.format(noisy_indices.shape[0]))

    # train a tree ensemble on the clean and noisy labels
    model = clone(clf).fit(X_train, y_train)
    model_noisy = clone(clf).fit(X_train, y_train_noisy)

    # show model performance before and after noise
    logger.info('\nBefore noise:')
    util.performance(model, X_train, y_train, logger=logger, name='Before, Train')
    util.performance(model, X_test, y_test, logger=logger, name='Before, Test')
    logger.info('\nAfter noise:')
    util.performance(model_noisy, X_train, y_train_noisy, logger=logger, name='After, Noisy Train')
    util.performance(model_noisy, X_train, y_train, logger=logger, name='After, Clean Train')
    util.performance(model_noisy, X_test, y_test, logger=logger, name='After, Test')

    # check predictive performance before and after noise
    acc_clean, auc_clean = score(model, X_test, y_test)
    acc_noisy, auc_noisy = score(model_noisy, X_test, y_test)

    # find how many corrupted / non-corrupted labels were incorrectly predicted
    predicted_labels = model_noisy.predict(X_train).flatten()
    incorrect_indices = np.where(y_train_noisy != predicted_labels)[0]
    incorrect_noisy_indices = np.intersect1d(noisy_indices, incorrect_indices)
    logger.info('\nno. incorrectly predicted noisy train instances: {:,}'.format(incorrect_noisy_indices.shape[0]))
    logger.info('no. incorrectly predicted train instances: {:,}'.format(incorrect_indices.shape[0]))

    # total no. instances to check and no. instances to check between checkpoints
    n_check = int(y_train.shape[0] * args.check_pct)
    n_checkpoint = int(n_check / args.n_checkpoints)

    # random
    if args.method == 'random':
        result = random_method(args, noisy_indices, n_check, n_checkpoint,
                               clf, X_train, y_train, X_test, y_test,
                               acc_noisy, auc_noisy, logger=logger)

    # TREX
    elif 'klr' in args.method or 'svm' in args.method:
        result = trex_method(args, model_noisy, y_train_noisy, noisy_indices, n_check, n_checkpoint,
                             clf, X_train, y_train, X_test, y_test,
                             acc_noisy, auc_noisy, logger=logger)

    # tree-ensemble loss
    elif args.method == 'tree_loss':
        result = tree_loss_method(args, model_noisy, y_train_noisy, noisy_indices, n_check, n_checkpoint,
                                  clf, X_train, y_train, X_test, y_test,
                                  acc_noisy, auc_noisy, logger=logger)

    # Leaf Influence
    elif 'leaf_influence' in args.method and args.model == 'cb':
        result = leaf_influence_method(args, model_noisy, y_train_noisy, noisy_indices, n_check, n_checkpoint,
                                       clf, X_train, y_train, X_test, y_test,
                                       acc_noisy, auc_noisy, logger=logger)

    # MAPLE
    elif args.method == 'maple':
        result = maple_method(args, model_noisy, noisy_indices, n_check, n_checkpoint,
                              clf, X_train, y_train, X_test, y_test,
                              acc_noisy, auc_noisy, logger=logger)

    # TEKNN
    elif 'knn' in args.method:
        result = teknn_method(args, model_noisy, y_train_noisy, noisy_indices, n_check, n_checkpoint,
                              clf, X_train, y_train, X_test, y_test,
                              acc_noisy, auc_noisy, logger=logger)

    # Tree Prototype
    elif args.method == 'tree_prototype':
        result = tree_prototype_method(args, model_noisy, y_train_noisy, noisy_indices, n_check, n_checkpoint,
                                       clf, X_train, y_train, X_test, y_test,
                                       acc_noisy, auc_noisy, logger=logger)

    # MMD Prototype
    elif args.method == 'mmd_prototype':
        result = mmd_prototype_method(args, model_noisy, y_train_noisy, noisy_indices, n_check, n_checkpoint,
                                      clf, X_train, y_train, X_test, y_test,
                                      acc_noisy, auc_noisy, logger=logger)

    else:
        raise ValueError('unknown method {}'.format(args.method))

    # save results
    result['max_rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    result['total_time'] = time.time() - begin
    result['acc_clean'] = acc_clean
    result['auc_clean'] = auc_clean
    np.save(os.path.join(out_dir, 'results.npy'), result)

    # display results
    logger.info('\nResults:\n{}'.format(result))
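# A minimal sketch of a label-flipping helper consistent with the
# flip_labels(y_train, k=args.flip_frac, seed=args.rs, logger=logger) call
# above (hypothetical; the project's actual helper may differ).
import numpy as np

def flip_labels_sketch(y, k=0.1, seed=1, logger=None):
    """Hypothetical helper: flip a fraction k of binary (0/1) labels at random."""
    rng = np.random.default_rng(seed)
    n_flip = int(len(y) * k)
    flip_indices = rng.choice(len(y), size=n_flip, replace=False)

    y_noisy = y.copy()
    y_noisy[flip_indices] = 1 - y_noisy[flip_indices]  # flip 0 <-> 1

    if logger:
        logger.info('no. labels flipped: {:,}'.format(n_flip))
    return y_noisy, flip_indices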
def experiment(args, logger, out_dir):
    """
    Main method that removes training instances ordered by different methods
    and measures their impact on a random set of test instances.
    """

    # start timer
    begin = time.time()

    # create random number generator
    rng = np.random.default_rng(args.rs)

    # get data
    data = util.get_data(args.dataset, data_dir=args.data_dir, preprocessing=args.preprocessing)
    X_train, X_test, y_train, y_test, feature, cat_indices = data

    # get tree-ensemble
    clf = util.get_model(args.model,
                         n_estimators=args.n_estimators,
                         max_depth=args.max_depth,
                         random_state=args.rs,
                         cat_indices=cat_indices)

    # use a fraction of the train data
    if 0.0 < args.train_frac < 1.0:
        n_train_samples = int(X_train.shape[0] * args.train_frac)
        train_indices = rng.choice(X_train.shape[0], size=n_train_samples, replace=False)
        X_train, y_train = X_train[train_indices], y_train[train_indices]

    # train a tree ensemble
    model = clone(clf).fit(X_train, y_train)
    util.performance(model, X_train, y_train, logger=logger, name='Train')

    # select an ambiguously predicted test instance
    proba = model.predict_proba(X_test)[:, 1]
    sorted_indices = np.argsort(np.abs(proba - 0.5))
    test_indices = sorted_indices[:1]  # shape=(1,)
    X_test_sub, y_test_sub = X_test[test_indices], y_test[test_indices]

    # display dataset statistics
    logger.info('\nno. train instances: {:,}'.format(X_train.shape[0]))
    logger.info('no. test instances: {:,}'.format(X_test_sub.shape[0]))
    logger.info('no. features: {:,}\n'.format(X_train.shape[1]))
    logger.info('pos. label % (test): {:.1f}%\n'.format(np.sum(y_test) / y_test.shape[0] * 100))

    # sort train instances
    exc_indices, inh_indices = trex_method(args, model, X_train, y_train, X_test_sub, logger=logger)
    ran_indices = rng.choice(np.arange(X_train.shape[0]), size=X_train.shape[0], replace=False)
    ran_pos_indices = np.where(y_train == 1)[0]
    ran_neg_indices = np.where(y_train == 0)[0]
    rng.shuffle(ran_pos_indices)
    rng.shuffle(ran_neg_indices)

    # remove, retrain, and re-evaluate
    logger.info('\nremoving most excitatory train instances...')
    exc_result = measure_performance(args, exc_indices, clf, X_train, y_train,
                                     X_test_sub, y_test_sub, logger=logger)

    logger.info('\nremoving most inhibitory train instances...')
    inh_result = measure_performance(args, inh_indices, clf, X_train, y_train,
                                     X_test_sub, y_test_sub, logger=logger)

    logger.info('\nremoving train instances uniformly at random...')
    ran_result = measure_performance(args, ran_indices, clf, X_train, y_train,
                                     X_test_sub, y_test_sub, logger=logger)

    if args.extra_methods:
        logger.info('\nremoving positive train instances at random...')
        ran_pos_result = measure_performance(args, ran_pos_indices, clf, X_train, y_train,
                                             X_test_sub, y_test_sub, logger=logger)

        logger.info('\nremoving negative train instances at random...')
        ran_neg_result = measure_performance(args, ran_neg_indices, clf, X_train, y_train,
                                             X_test_sub, y_test_sub, logger=logger)

    # matplotlib settings
    util.plot_settings(fontsize=13)

    # figure dimensions (inches)
    width = 4.8  # Machine Learning journal
    height = get_height(width=width, subplots=(1, 1))
    fig, ax = plt.subplots(figsize=(width * 1.65, height * 1.0))

    # plot results
    l1 = ax.errorbar(exc_result['remove_pct'], exc_result['proba'],
                     color='blue', linestyle='--', marker='.', label='Most excitatory')
    l2 = ax.errorbar(inh_result['remove_pct'], inh_result['proba'],
                     color='green', linestyle='--', marker='+', label='Most inhibitory')
    l3 = ax.errorbar(ran_result['remove_pct'], ran_result['proba'],
                     color='red', linestyle='-', marker='*', label='Random')
    lines = [l1, l2, l3]
    labels = ['Most excitatory', 'Most inhibitory', 'Random']

    if args.extra_methods:
        l4 = ax.errorbar(ran_pos_result['remove_pct'], ran_pos_result['proba'],
                         color='cyan', linestyle=':', marker='1', label='Pos. random')
        l5 = ax.errorbar(ran_neg_result['remove_pct'], ran_neg_result['proba'],
                         color='orange', linestyle=':', marker='2', label='Neg. random')
        lines += [l4, l5]
        labels += ['Random (pos. only)', 'Random (neg. only)']

    ax.set_xlabel('Train data removed (%)')
    ax.set_ylabel('Predicted probability')
    ax.set_ylim(0, 1)

    # adjust legend
    fig.legend(tuple(lines), tuple(labels), loc='center left', ncol=1,
               bbox_to_anchor=(1.0, 0.85), title='Removal Ordering')
    plt.tight_layout()
    fig.subplots_adjust(right=0.65)

    # save plot
    plt.savefig(os.path.join(out_dir, 'probas.pdf'), bbox_inches='tight')

    # display results
    logger.info('\nsaving results to {}/...'.format(os.path.join(out_dir)))
    logger.info('total time: {:.3f}s'.format(time.time() - begin))
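# `get_height` is a project helper that is not shown here; a plausible sketch,
# assuming it derives the figure height from the width via the golden ratio and
# the subplot grid (a common pattern for publication figures; hypothetical).
def get_height_sketch(width, subplots=(1, 1)):
    """Hypothetical helper: figure height in inches from width and subplot grid."""
    golden_ratio = (5 ** 0.5 - 1) / 2  # ~0.618
    n_rows, n_cols = subplots
    return width * golden_ratio * (n_rows / n_cols)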
def experiment(args, logger, out_dir):

    # start timer
    begin = time.time()

    # create random number generator
    rng = np.random.default_rng(args.rs)

    # get data
    data = util.get_data(args.dataset, data_dir=args.data_dir,
                         preprocessing=args.preprocessing, mismatch=True)
    X_train, X_test, y_train, y_test, feature, cat_indices = data

    # get tree-ensemble
    clf = util.get_model(args.model,
                         n_estimators=args.n_estimators,
                         max_depth=args.max_depth,
                         random_state=args.rs,
                         cat_indices=cat_indices)

    # display dataset statistics
    logger.info('\nno. train instances: {:,}'.format(X_train.shape[0]))
    logger.info('no. test instances: {:,}'.format(X_test.shape[0]))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))
    logger.info('\npos. label % (train): {:.1f}%'.format(np.sum(y_train) / y_train.shape[0] * 100))
    logger.info('pos. label % (test): {:.1f}%\n'.format(np.sum(y_test) / y_test.shape[0] * 100))

    # train tree ensemble
    model = clone(clf).fit(X_train, y_train)
    util.performance(model, X_train, y_train, logger=logger, name='Train')
    util.performance(model, X_test, y_test, logger=logger, name='Test')

    # train surrogate model
    params = {'C': args.C, 'n_neighbors': args.n_neighbors, 'tree_kernel': args.tree_kernel}
    surrogate = trex.train_surrogate(model=model,
                                     surrogate='klr',
                                     X_train=X_train,
                                     y_train=y_train,
                                     val_frac=args.tune_frac,
                                     metric=args.metric,
                                     seed=args.rs,
                                     params=params,
                                     logger=logger)

    # extract predictions
    start = time.time()
    model_pred = model.predict(X_test)
    model_proba = model.predict_proba(X_test)[:, 1]
    logger.info('predicting...{:.3f}s'.format(time.time() - start))

    # pick a test instance in which the person is <= 17 from the Adult dataset
    indices = np.where(X_test[:, args.age_ndx] <= 17)[0]
    test_ndx = rng.choice(indices)
    age_test_val = X_test[test_ndx][args.age_ndx]
    x_test = X_test[[test_ndx]]

    # show prediction for this test instance
    s = '\ntest: {}, actual: {}, proba.: {:.3f}, age: {:.0f}'
    logger.info(s.format(test_ndx, y_test[test_ndx], model_proba[test_ndx], age_test_val))

    # sort based on similarity-influence
    if 'sim' in args.surrogate:

        # compute influence based on predicted labels
        attributions = surrogate.similarity(x_test)
        pred_label = model.predict(x_test)

        # weight is positive if a similar instance shares the predicted test label, negative otherwise
        for i in range(x_test.shape[0]):
            attributions[i] = np.where(y_train == pred_label[i], attributions[i], attributions[i] * -1)
        attributions = attributions.sum(axis=0)
        attribution_indices = np.argsort(attributions)[::-1]

    # sort training instances by most influential to the predicted label
    else:
        attributions = surrogate.pred_influence(x_test, model_pred[[test_ndx]])[0]
        attribution_indices = np.argsort(attributions)[::-1]

    # sort training instances by most similar to the test instance
    sim = surrogate.similarity(x_test)[0]
    sim_indices = np.argsort(sim)[::-1]

    # get instance weights
    alpha = surrogate.get_alpha()

    # 1. show most influential training instances
    logger.info('\nTop {:,} most influential samples to the predicted label...'.format(args.topk_inf))
    show_instances(args, X_train, alpha, sim, attributions, y_train,
                   attribution_indices, args.topk_inf, logger)

    # 2a. compute aggregate surrogate contribution of most influential train instances
    attr_all = np.sum(np.abs(attributions))
    attr_pos = np.sum(np.where(attributions > 0, attributions, 0))  # sum of pos. attributions only
    attr_topk_inf = np.sum(np.abs(attributions[attribution_indices][:args.topk_inf]))
    attr_topk_inf_pct = attr_topk_inf / attr_all * 100
    attr_topk_inf_pos_pct = attr_topk_inf / attr_pos * 100

    # 2b. display aggregate attributions
    s1 = '\nattribution % of top {:,} influential instances: {:.2f}%'
    s2 = 'attribution % of top {:,} influential instances for pos. attributions: {:.2f}%'
    logger.info(s1.format(args.topk_inf, attr_topk_inf_pct))
    logger.info(s2.format(args.topk_inf, attr_topk_inf_pos_pct))

    # 3. compute change in predicted probability after REMOVING the most influential instances
    s = 'test: {}, actual: {}, proba.: {:.3f}, age: {:.0f}'
    logger.info('\nRemoving top {:,} influential instances..'.format(args.topk_inf))
    new_X_train = np.delete(X_train, attribution_indices[:args.topk_inf], axis=0)
    new_y_train = np.delete(y_train, attribution_indices[:args.topk_inf])
    new_model = clone(clf).fit(new_X_train, new_y_train)
    util.performance(new_model, new_X_train, new_y_train, logger=logger, name='Train')
    util.performance(new_model, X_test, y_test, logger=logger, name='Test')
    logger.info(s.format(test_ndx, y_test[test_ndx],
                         new_model.predict_proba(X_test)[:, 1][test_ndx], age_test_val))

    # 4a. compute change in predicted probability after FLIPPING the labels of the most influential instances
    s1 = 'test: {}, actual: {}, proba.: {:.3f}, age: {:.0f}'
    s2 = '\n{:,} out of the top {:,} most influential instances have age <= 17'
    logger.info('\nFixing ONLY corrupted labels of the top {:,} influential instances..'.format(args.topk_inf))
    new_X_train = X_train.copy()
    new_y_train = y_train.copy()

    # 4b. fix a portion of the corrupted training instances
    temp_indices = np.where(X_train[attribution_indices][:args.topk_inf][:, args.age_ndx] <= 17)[0]
    age17_topk_inf_indices = attribution_indices[temp_indices]
    new_y_train[age17_topk_inf_indices] = 0

    # 4c. fit new model and re-evaluate
    new_model = clone(clf).fit(new_X_train, new_y_train)
    util.performance(new_model, new_X_train, new_y_train, logger=logger, name='Train')
    util.performance(new_model, X_test, y_test, logger=logger, name='Test')
    logger.info(s1.format(test_ndx, y_test[test_ndx],
                          new_model.predict_proba(X_test)[:, 1][test_ndx], age_test_val))
    logger.info(s2.format(age17_topk_inf_indices.shape[0], args.topk_inf))

    # 5. show most similar training instances
    logger.info('\nTop {:,} most similar samples to the predicted label...'.format(args.topk_sim))
    show_instances(args, X_train, alpha, sim, attributions, y_train,
                   sim_indices, args.topk_sim, logger)

    # 6. of the most similar train instances, compute how many have age <= 17
    num_age17_topk = np.where(X_train[sim_indices][:args.topk_sim][:, args.age_ndx] <= 17)[0].shape[0]
    logger.info('\nTop {:,} most similar train instances with age <= 17: {:,}'.format(args.topk_sim, num_age17_topk))

    # 7. plot similarity of train instances against their instance weights
    logger.info('\nplotting similarity vs. weights...')
    plot_similarity(args, alpha, sim, out_dir, logger)

    # 8. no. train instances with age <= 17 and an alpha coefficient < 0
    neg_alpha_indices = np.where(attributions < 0)[0]
    num_age17_neg_alpha = np.where(X_train[neg_alpha_indices][:, args.age_ndx] <= 17)[0].shape[0]
    logger.info('\nno. instances with age <= 17 and alpha < 0: {:,}'.format(num_age17_neg_alpha))

    # 9. no. train instances with age <= 17 and an alpha coefficient >= 0
    pos_alpha_indices = np.where(attributions >= 0)[0]
    num_age17_pos_alpha = np.where(X_train[pos_alpha_indices][:, args.age_ndx] <= 17)[0].shape[0]
    logger.info('no. instances with age <= 17 and alpha >= 0: {:,}'.format(num_age17_pos_alpha))

    # 10. no. train instances with age <= 17, similarity > threshold, and an alpha coefficient < 0
    s = 'no. instances with age <= 17, sim > {:.2f} and alpha < 0: {:,}'
    neg_alpha_indices = np.where((attributions < 0) & (sim > args.sim_thresh))[0]
    num_age17_sim_neg_alpha = np.where(X_train[neg_alpha_indices][:, args.age_ndx] <= 17)[0].shape[0]
    logger.info(s.format(args.sim_thresh, num_age17_sim_neg_alpha))

    # 11. no. train instances with age <= 17, similarity > threshold, and an alpha coefficient >= 0
    s = 'no. instances with age <= 17, sim > {:.2f} and alpha >= 0: {:,}'
    pos_alpha_indices = np.where((attributions >= 0) & (sim > args.sim_thresh))[0]
    num_age17_sim_pos_alpha = np.where(X_train[pos_alpha_indices][:, args.age_ndx] <= 17)[0].shape[0]
    logger.info(s.format(args.sim_thresh, num_age17_sim_pos_alpha))

    # display total time
    logger.info('\ntotal time: {:.3f}s'.format(time.time() - begin))
def experiment(args, logger, out_dir):
    """
    Main method that removes training instances ordered by different methods
    and measures their impact on a random set of test instances.
    """

    # start timer
    begin = time.time()

    # create random number generator
    rng = np.random.default_rng(args.rs)

    # get data
    data = util.get_data(args.dataset, data_dir=args.data_dir, preprocessing=args.preprocessing)
    X_train, X_test, y_train, y_test, feature, cat_indices = data

    # get tree-ensemble
    clf = util.get_model(args.model,
                         n_estimators=args.n_estimators,
                         max_depth=args.max_depth,
                         random_state=args.rs,
                         cat_indices=cat_indices)

    # train a tree ensemble
    model = clone(clf).fit(X_train, y_train)
    util.performance(model, X_train, y_train, logger=logger, name='Train')
    util.performance(model, X_test, y_test, logger=logger, name='Test')

    # compute L1 loss of each test instance
    proba = model.predict_proba(X_test)
    losses = np.abs(proba[:, 1] - y_test)

    # select instances with an L1 loss >= 0.9
    test_indices = np.where(losses >= 0.9)[0]
    n_pos = y_test[test_indices].sum()
    logger.info('\nNo. test instances w/ L1 loss >= 0.9: {:,}, no. pos.: {:,}'.format(len(test_indices), n_pos))

    for i, ndx in enumerate(test_indices[:args.n_test]):
        logger.info('\n\n[#{:,}] Test {}, loss: {:.3f}'.format(i + 1, ndx, losses[ndx]))

        X_test_sub = X_test[[ndx]]
        y_test_sub = y_test[[ndx]]

        instance_dir = os.path.join(out_dir, 'test_{}'.format(i))
        os.makedirs(instance_dir, exist_ok=True)

        # display dataset statistics
        logger.info('\nno. train instances: {:,}'.format(X_train.shape[0]))
        logger.info('no. test instances: {:,}'.format(X_test_sub.shape[0]))
        logger.info('no. features: {:,}'.format(X_train.shape[1]))

        # sort train instances, then remove, retrain, and re-evaluate
        result = measure_performance(args, clf, X_train, y_train, X_test_sub, y_test_sub,
                                     rng, instance_dir, logger=logger)

        # save results
        result['max_rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        result['total_time'] = time.time() - begin
        np.save(os.path.join(instance_dir, 'results.npy'), result)

        # display results
        logger.info('\nResults:\n{}'.format(result))
        logger.info('\nsaving results to {}...'.format(os.path.join(instance_dir, 'results.npy')))
def experiment(args, logger, out_dir):

    # start timer
    begin = time.time()

    # get data
    data = util.get_data(args.dataset, data_dir=args.data_dir, preprocessing=args.preprocessing)
    X_train, X_test, y_train, y_test, feature, cat_indices = data
    logger.info('\ntrain instances: {:,}'.format(X_train.shape[0]))
    logger.info('test instances: {:,}'.format(X_test.shape[0]))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # get tree-ensemble
    clf = util.get_model(args.model,
                         n_estimators=args.n_estimators,
                         max_depth=args.max_depth,
                         random_state=args.rs,
                         cat_indices=cat_indices)

    # train a tree ensemble
    model = clone(clf).fit(X_train, y_train)
    util.performance(model, X_train, y_train, logger=logger, name='Train')
    util.performance(model, X_test, y_test, logger=logger, name='Test')

    # store indexes of different subgroups
    train_neg = np.where(y_train == 0)[0]
    train_pos = np.where(y_train == 1)[0]
    # test_neg = np.where(y_test == 0)[0]
    # test_pos = np.where(y_test == 1)[0]

    # transform features to tree kernel space
    logger.info('\ntransforming features into tree kernel space...')
    extractor = trex.TreeExtractor(model, tree_kernel=args.tree_kernel)

    start = time.time()
    X_train_alt = extractor.transform(X_train)
    logger.info('train transform time: {:.3f}s'.format(time.time() - start))

    start = time.time()
    X_test_alt = extractor.transform(X_test)
    logger.info('test transform time: {:.3f}s'.format(time.time() - start))

    # reduce dimensionality on original and tree feature spaces
    logger.info('\nembed original features into a lower dimensional space')
    X_train, X_test = reduce_and_embed(args, X_train, X_test, logger)

    logger.info('\nembed tree kernel features into a lower dimensional space')
    X_train_alt, X_test_alt = reduce_and_embed(args, X_train_alt, X_test_alt, logger)

    # separating embedded points into train and test
    # n_train = len(y_train)
    # train_neg_embed = X_embed[:n_train][train_neg]
    # train_pos_embed = X_embed[:n_train][train_pos]
    # test_neg_embed = X_embed[n_train:][test_neg]
    # test_pos_embed = X_embed[n_train:][test_pos]

    # save original feature space results
    np.save(os.path.join(out_dir, 'train_negative'), X_train[train_neg])
    np.save(os.path.join(out_dir, 'train_positive'), X_train[train_pos])

    # save tree kernel space results
    np.save(os.path.join(out_dir, 'train_tree_negative'), X_train_alt[train_neg])
    np.save(os.path.join(out_dir, 'train_tree_positive'), X_train_alt[train_pos])
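# `reduce_and_embed` is defined elsewhere; a sketch of what it plausibly does,
# assuming a linear reduction followed by a joint 2-D t-SNE embedding (the
# commented-out X_embed[:n_train] split above hints at a joint embedding).
# Entirely hypothetical; args.rs is reused as the random seed.
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

def reduce_and_embed_sketch(args, X_train, X_test, logger):
    """Hypothetical: reduce to <=50 dims with SVD, then embed jointly with t-SNE."""
    # linear reduction first; TruncatedSVD also handles sparse inputs
    svd = TruncatedSVD(n_components=min(50, X_train.shape[1] - 1), random_state=args.rs)
    X_train_r = svd.fit_transform(X_train)
    X_test_r = svd.transform(X_test)

    # t-SNE has no out-of-sample transform, so embed train and test together
    X_all = np.vstack([X_train_r, X_test_r])
    X_embed = TSNE(n_components=2, random_state=args.rs).fit_transform(X_all)

    n_train = X_train_r.shape[0]
    logger.info('embedded {:,} points into 2-D'.format(X_all.shape[0]))
    return X_embed[:n_train], X_embed[n_train:]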