def _teknn_method(args, tree, test_ndx, X_train, train_label, X_test, seed, logger=None):
    """
    TEKNN fine tuning and computation.
    """
    with timeout(seconds=MAX_TIME):
        try:
            start = time.time()
            extractor = trex.TreeExtractor(tree, tree_kernel=args.tree_kernel)
            X_train_alt = extractor.fit_transform(X_train)

            # tune and train teknn
            knn_clf = exp_util.tune_knn(tree, X_train, X_train_alt, train_label, args.val_frac,
                                        seed=seed, logger=logger)
            fine_tune = time.time() - start
        except:
            if logger:
                logger.info('TEKNN fine-tuning exceeded!')
            return None, None

    start = time.time()
    x_test_alt = extractor.transform(X_test[test_ndx])
    distances, neighbor_ids = knn_clf.kneighbors(x_test_alt)
    test_time = time.time() - start

    return fine_tune, test_time
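
# The `timeout` context manager and `MAX_TIME` constant used above are defined
# elsewhere in this repo. As a rough sketch only (assuming a Unix SIGALRM-based
# helper; the actual implementation may differ), it could look like this:
import signal
from contextlib import contextmanager

MAX_TIME = 86400  # assumed time budget in seconds; the real value is set elsewhere


@contextmanager
def timeout(seconds):
    """Raise an exception if the wrapped block runs longer than `seconds` (Unix only)."""

    def _handler(signum, frame):
        raise TimeoutError('block exceeded {}s'.format(seconds))

    signal.signal(signal.SIGALRM, _handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)  # cancel any pending alarm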
def _teknn_method(args, model, X_test, X_train, y_train, y_test, seed, logger):
    global teknn_explainer
    global teknn_extractor

    if teknn_explainer is None:

        # transform the data
        teknn_extractor = trex.TreeExtractor(model, tree_kernel=args.tree_kernel)
        X_train_alt = teknn_extractor.fit_transform(X_train)
        train_label = y_train if args.true_label else model.predict(X_train)

        # tune and train teknn
        teknn_explainer = exp_util.tune_knn(model, X_train, X_train_alt, train_label,
                                            args.val_frac, seed=seed, logger=logger)

    # results container
    contributions_sum = np.zeros(X_train.shape[0])

    # compute the contribution of all training samples on each test instance
    for i in tqdm.tqdm(range(X_test.shape[0])):
        x_test_alt = teknn_extractor.transform(X_test[[i]])
        pred_label = int(teknn_explainer.predict(x_test_alt)[0])
        distances, neighbor_ids = teknn_explainer.kneighbors(x_test_alt)

        for neighbor_id in neighbor_ids[0]:
            contribution = 1 if y_train[neighbor_id] == pred_label else -1
            contributions_sum[neighbor_id] += contribution

    train_order = np.argsort(contributions_sum)[::-1]
    return train_order
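
# Illustration (not part of the original script): a tiny worked example of how the
# +1/-1 votes in `_teknn_method` aggregate into `contributions_sum`, and how
# `np.argsort(...)[::-1]` puts the most supportive training instances first.
# All labels, predictions, and neighbor sets below are invented.
def _demo_contribution_ranking():
    import numpy as np

    y_train = np.array([0, 1, 1, 0])        # 4 hypothetical training labels
    contributions_sum = np.zeros(4)

    # (pred_label, neighbor_ids) per test instance -- invented values
    votes = [(1, [1, 2, 3]),                 # test 0: neighbors 1, 2 agree; 3 disagrees
             (0, [0, 3, 2])]                 # test 1: neighbors 0, 3 agree; 2 disagrees

    for pred_label, neighbor_ids in votes:
        for neighbor_id in neighbor_ids:
            contributions_sum[neighbor_id] += 1 if y_train[neighbor_id] == pred_label else -1

    print(contributions_sum)                     # -> [1. 1. 0. 0.]
    print(np.argsort(contributions_sum)[::-1])   # -> [1 0 3 2], most supportive first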
def _proto_method(model, X_train, y_train, noisy_ndx, interval, n_check):
    """
    Orders instances by using the GBT distance similarity formula in
    https://arxiv.org/pdf/1611.07115.pdf, then ranks training samples based on
    the proportion of labels from the k=10 nearest neighbors.
    """
    extractor = trex.TreeExtractor(model, tree_kernel='leaf_path')
    X_train_alt = extractor.fit_transform(X_train)

    # obtain weight of each tree: note, this code is specific to CatBoost
    temp_fp = '.{}_cb.json'.format(str(uuid.uuid4()))
    model.save_model(temp_fp, format='json')
    cb_dump = json.load(open(temp_fp, 'r'))

    # obtain weight of each tree: learning_rate^2 * var(predictions)
    tree_weights = []
    for tree in cb_dump['oblivious_trees']:
        predictions = []

        for val, weight in zip(tree['leaf_values'], tree['leaf_weights']):
            predictions += [val] * weight

        tree_weights.append(np.var(predictions) * (model.learning_rate_ ** 2))

    # weight leaf path feature representation by the tree weights
    for i in range(X_train_alt.shape[0]):
        weight_cnt = 0

        for j in range(X_train_alt.shape[1]):

            if X_train_alt[i][j] == 1:
                X_train_alt[i][j] *= tree_weights[weight_cnt]
                weight_cnt += 1

        assert weight_cnt == len(tree_weights)

    # build a KNN using this proximity measure using k=10
    knn = KNeighborsClassifier(n_neighbors=TREE_PROTO_K)
    knn = knn.fit(X_train_alt, y_train)

    # compute proportion of neighbors that share the same label
    train_impact = np.zeros(X_train_alt.shape[0])
    for i in tqdm.tqdm(range(X_train_alt.shape[0])):
        _, neighbor_ids = knn.kneighbors([X_train_alt[i]])
        train_impact[i] = len(np.where(y_train[i] == y_train[neighbor_ids[0]])[0]) / len(neighbor_ids[0])

    # rank training instances by low label agreement with their neighbors
    train_order = np.argsort(train_impact)[:n_check]
    ckpt_ndx, fix_ndx = _record_fixes(train_order, noisy_ndx, len(y_train), interval)

    # clean up
    os.remove(temp_fp)

    return ckpt_ndx, fix_ndx
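
# Illustration (not part of the original script): the tree-weight formula above,
# learning_rate^2 * var(predictions), applied to one hypothetical oblivious tree
# from a CatBoost JSON dump. All leaf values and weights below are invented.
def _demo_tree_weight():
    import numpy as np

    learning_rate = 0.1                    # hypothetical learning rate
    leaf_values = [0.5, -0.2, 0.1, -0.4]   # hypothetical leaf outputs of one tree
    leaf_weights = [3, 1, 2, 2]            # hypothetical no. of training samples per leaf

    # repeat each leaf value by its sample count, then take the weighted variance
    predictions = []
    for val, weight in zip(leaf_values, leaf_weights):
        predictions += [val] * weight

    tree_weight = np.var(predictions) * (learning_rate ** 2)
    print(tree_weight)  # ~= 0.00134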
def experiment(args, logger, out_dir, seed):

    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=args.rs)

    X_train, X_test, y_train, y_test, label = data_util.get_data(args.dataset,
                                                                 random_state=args.rs,
                                                                 data_dir=args.data_dir)

    # reduce train size
    if args.train_frac < 1.0 and args.train_frac > 0.0:
        n_train = int(X_train.shape[0] * args.train_frac)
        X_train, y_train = X_train[:n_train], y_train[:n_train]
    data = X_train, y_train, X_test, y_test

    logger.info('train instances: {}'.format(len(X_train)))
    logger.info('test instances: {}'.format(len(X_test)))
    logger.info('no. features: {}'.format(X_train.shape[1]))
    logger.info('no. trees: {:,}'.format(args.n_estimators))
    logger.info('max depth: {}'.format(args.max_depth))

    # train a tree ensemble
    logger.info('fitting tree ensemble...')
    tree = clf.fit(X_train, y_train)

    if args.teknn:

        # transform data
        extractor = trex.TreeExtractor(tree, tree_kernel=args.tree_kernel)

        logger.info('transforming training data...')
        X_train_alt = extractor.fit_transform(X_train)

        logger.info('transforming test data...')
        X_test_alt = extractor.transform(X_test)

        train_label = y_train if args.true_label else tree.predict(X_train)

        # tune and train teknn
        start = time.time()
        logger.info('TE-KNN...')
        if args.k:
            knn_clf = KNeighborsClassifier(n_neighbors=args.k, weights='uniform')
            knn_clf = knn_clf.fit(X_train_alt, train_label)
        else:
            knn_clf = exp_util.tune_knn(tree, X_train, X_train_alt, train_label,
                                        args.val_frac, seed=seed, logger=logger)

        start = time.time()
        logger.info('generating predictions...')
        results = _get_knn_predictions(tree, knn_clf, X_test, X_test_alt, y_train,
                                       pred_size=args.pred_size, out_dir=out_dir,
                                       logger=logger)
        logger.info('time: {:.3f}s'.format(time.time() - start))

        # save results
        if results:
            results['n_neighbors'] = knn_clf.get_params()['n_neighbors']
            np.save(os.path.join(out_dir, 'tree.npy'), results['tree'])
            np.save(os.path.join(out_dir, 'surrogate.npy'), results['teknn'])

    if args.trex:
        start = time.time()
        explainer = trex.TreeExplainer(tree, X_train, y_train,
                                       tree_kernel=args.tree_kernel,
                                       kernel_model=args.kernel_model,
                                       random_state=args.rs,
                                       logger=logger,
                                       true_label=not args.true_label,
                                       val_frac=args.val_frac)

        start = time.time()
        logger.info('generating predictions...')
        results = _get_trex_predictions(tree, explainer, data)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        results['C'] = explainer.C

        # save data
        np.save(os.path.join(out_dir, 'tree.npy'), results['tree'])
        np.save(os.path.join(out_dir, 'surrogate.npy'), results['trex'])
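
# Illustration (not part of the original script): one hypothetical way the saved
# 'tree.npy' / 'surrogate.npy' arrays could be compared downstream, assuming both
# hold 1-D predicted probabilities for the same test instances. The output
# directory name is made up.
def _demo_fidelity_check(out_dir='output/fidelity'):
    import os
    import numpy as np
    from scipy.stats import pearsonr

    tree_proba = np.load(os.path.join(out_dir, 'tree.npy'))
    surrogate_proba = np.load(os.path.join(out_dir, 'surrogate.npy'))

    # correlation between tree-ensemble and surrogate predictions
    corr, _ = pearsonr(tree_proba, surrogate_proba)
    print('fidelity (Pearson correlation): {:.3f}'.format(corr))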
def tree_prototype_method(args, model_noisy, y_train_noisy, noisy_indices, n_check, n_checkpoint,
                          clf, X_train, y_train, X_test, y_test, acc_noisy, auc_noisy,
                          logger=None, k=10, frac_progress_update=0.1):
    """
    Orders instances by using the GBT distance similarity formula,
    then ranks training samples based on the proportion of labels
    from the k=10 nearest neighbors.

    Reference: https://arxiv.org/pdf/1611.07115.pdf.
    """

    # get feature extractor
    extractor = trex.TreeExtractor(model_noisy, tree_kernel='leaf_path')
    X_train_alt = extractor.transform(X_train)

    # obtain weight of each tree: note, this code is specific to CatBoost
    if 'CatBoostClassifier' in str(model_noisy):
        temp_dir = os.path.join('.catboost_info', 'leaf_influence_{}'.format(str(uuid.uuid4())))
        temp_fp = os.path.join(temp_dir, 'cb.json')
        os.makedirs(temp_dir, exist_ok=True)
        model_noisy.save_model(temp_fp, format='json')
        cb_dump = json.load(open(temp_fp, 'r'))

        # obtain weight of each tree: learning_rate^2 * var(predictions)
        tree_weights = []
        for tree in cb_dump['oblivious_trees']:
            predictions = []

            for val, weight in zip(tree['leaf_values'], tree['leaf_weights']):
                predictions += [val] * weight

            tree_weights.append(np.var(predictions) * (model_noisy.learning_rate_ ** 2))

        # weight leaf path feature representation by the tree weights
        for i in range(X_train_alt.shape[0]):
            weight_cnt = 0

            for j in range(X_train_alt.shape[1]):

                if X_train_alt[i][j] == 1:
                    X_train_alt[i][j] *= tree_weights[weight_cnt]
                    weight_cnt += 1

            assert weight_cnt == len(tree_weights)

        # clean up
        shutil.rmtree(temp_dir)

    # build a KNN using this proximity measure using k
    knn = KNeighborsClassifier(n_neighbors=k)
    knn = knn.fit(X_train_alt, y_train_noisy)

    # display progress
    if logger:
        logger.info('\ncomputing similarity density...')

    # compute proportion of neighbors that share the same label
    start = time.time()
    train_weight = np.zeros(X_train_alt.shape[0])
    for i in range(X_train_alt.shape[0]):
        _, neighbor_ids = knn.kneighbors([X_train_alt[i]])
        train_weight[i] = len(np.where(y_train_noisy[i] == y_train_noisy[neighbor_ids[0]])[0]) / len(neighbor_ids[0])

        # display progress
        if logger and i % int(X_train.shape[0] * frac_progress_update) == 0:
            elapsed = time.time() - start
            logger.info('finished {:.1f}% train instances...{:.3f}s'.format((i / X_train.shape[0]) * 100, elapsed))

    # rank training instances by low label agreement with their neighbors
    train_indices = np.argsort(train_weight)

    result = fix_noisy_instances(train_indices, noisy_indices, n_check, n_checkpoint,
                                 clf, X_train, y_train, X_test, y_test,
                                 acc_noisy, auc_noisy, logger=logger)
    return result
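
# Illustration (not part of the original script): a vectorized sketch of the
# label-agreement loop in `tree_prototype_method`, assuming memory allows querying
# all training rows at once. Semantics match the per-row loop, since sklearn's
# kneighbors() on the training data includes each point as its own neighbor.
def _density_vectorized(knn, X_train_alt, y_train_noisy):
    _, all_neighbor_ids = knn.kneighbors(X_train_alt)  # shape: (n_train, k)
    agreement = (y_train_noisy[all_neighbor_ids] == y_train_noisy[:, None]).mean(axis=1)
    return agreement  # same values as `train_weight` above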
def experiment(args, logger, out_dir, seed):
    """
    Main method that trains a tree ensemble, flips a percentage of train labels,
    prioritizes train instances using various methods, and computes how effective
    each method is at cleaning the data.
    """

    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=seed)

    X_train, X_test, y_train, y_test, label = data_util.get_data(args.dataset,
                                                                 random_state=seed,
                                                                 data_dir=args.data_dir)

    # reduce train size
    if args.train_frac < 1.0 and args.train_frac > 0.0:
        n_train = int(X_train.shape[0] * args.train_frac)
        X_train, y_train = X_train[:n_train], y_train[:n_train]
    data = X_train, y_train, X_test, y_test

    logger.info('no. train instances: {:,}'.format(len(X_train)))
    logger.info('no. test instances: {:,}'.format(len(X_test)))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # add noise
    y_train_noisy, noisy_ndx = data_util.flip_labels(y_train, k=args.flip_frac, random_state=seed)
    noisy_ndx = np.array(sorted(noisy_ndx))
    logger.info('no. noisy labels: {:,}'.format(len(noisy_ndx)))

    # train a tree ensemble on the clean and noisy labels
    model = clone(clf).fit(X_train, y_train)
    model_noisy = clone(clf).fit(X_train, y_train_noisy)

    # show model performance before and after noise
    logger.info('\nBefore noise:')
    model_util.performance(model, X_train, y_train, X_test=X_test, y_test=y_test, logger=logger)
    logger.info('\nAfter noise:')
    model_util.performance(model_noisy, X_train, y_train_noisy, X_test=X_test, y_test=y_test, logger=logger)

    # check accuracy before and after noise
    acc_test_clean = accuracy_score(y_test, model.predict(X_test))
    acc_test_noisy = accuracy_score(y_test, model_noisy.predict(X_test))

    # find how many corrupted/non-corrupted labels were incorrectly predicted
    if not args.true_label:
        logger.info('\nUsing predicted labels:')
        predicted_labels = model_noisy.predict(X_train).flatten()
        incorrect_ndx = np.where(y_train_noisy != predicted_labels)[0]
        incorrect_corrupted_ndx = np.intersect1d(noisy_ndx, incorrect_ndx)
        logger.info('incorrectly predicted corrupted labels: {:,}'.format(incorrect_corrupted_ndx.shape[0]))
        logger.info('total number of incorrectly predicted labels: {:,}'.format(incorrect_ndx.shape[0]))

    # number of checkpoints to record
    n_check = int(len(y_train) * args.check_pct)
    interval = (n_check / len(y_train)) / args.n_plot_points

    # random method
    logger.info('\nordering by random...')
    start = time.time()
    ckpt_ndx, fix_ndx = _random_method(noisy_ndx, y_train, interval,
                                       to_check=n_check, random_state=seed)
    check_pct, random_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
    logger.info('time: {:.3f}s'.format(time.time() - start))
    np.save(os.path.join(out_dir, 'random.npy'), random_res)

    # save global lines
    np.save(os.path.join(out_dir, 'test_clean.npy'), acc_test_clean)
    np.save(os.path.join(out_dir, 'check_pct.npy'), check_pct)

    # tree loss method
    logger.info('\nordering by tree loss...')
    start = time.time()
    y_train_proba = model_noisy.predict_proba(X_train)
    ckpt_ndx, fix_ndx, _, _ = _loss_method(noisy_ndx, y_train_proba, y_train_noisy, interval, to_check=n_check)
    _, tree_loss_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
    logger.info('time: {:.3f}s'.format(time.time() - start))
    np.save(os.path.join(out_dir, 'tree.npy'), tree_loss_res)

    # trex method
    if args.trex:
        logger.info('\nordering by TREX...')
        start = time.time()
        explainer = trex.TreeExplainer(model_noisy, X_train, y_train_noisy,
                                       tree_kernel=args.tree_kernel,
                                       random_state=seed,
                                       true_label=args.true_label,
                                       kernel_model=args.kernel_model,
                                       verbose=args.verbose,
                                       val_frac=args.val_frac,
                                       logger=logger)
        ckpt_ndx, fix_ndx, _ = _our_method(explainer, noisy_ndx, y_train, n_check, interval)
        check_pct, trex_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), trex_res)

        # trex loss method
        logger.info('\nordering by TREX loss...')
        start = time.time()
        y_train_proba = explainer.predict_proba(X_train)
        ckpt_ndx, fix_ndx, _, _ = _loss_method(noisy_ndx, y_train_proba, y_train_noisy, interval, to_check=n_check)
        _, trex_loss_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method_loss.npy'), trex_loss_res)

    # influence method
    if args.tree_type == 'cb' and args.inf_k is not None:
        logger.info('\nordering by leafinfluence...')
        start = time.time()

        model_path = '.model.json'
        model_noisy.save_model(model_path, format='json')

        if args.inf_k == -1:
            update_set = 'AllPoints'
        elif args.inf_k == 0:
            update_set = 'SinglePoint'
        else:
            update_set = 'TopKLeaves'

        leaf_influence = CBLeafInfluenceEnsemble(model_path, X_train, y_train_noisy, k=args.inf_k,
                                                 learning_rate=model.learning_rate_,
                                                 update_set=update_set)
        ckpt_ndx, fix_ndx, _, _ = _influence_method(leaf_influence, noisy_ndx, X_train, y_train,
                                                    y_train_noisy, interval, to_check=n_check)
        _, leafinfluence_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), leafinfluence_res)

    # MAPLE method
    if args.maple:
        logger.info('\nordering by MAPLE...')
        start = time.time()
        train_label = y_train_noisy if args.true_label else model_noisy.predict(X_train)
        maple_exp = MAPLE(X_train, train_label, X_train, train_label,
                          verbose=args.verbose, dstump=False)
        ckpt_ndx, fix_ndx, map_scores, map_order = _maple_method(maple_exp, X_train, noisy_ndx,
                                                                 interval, to_check=n_check)
        _, maple_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), maple_res)

    # TEKNN method
    if args.teknn:
        logger.info('\nordering by teknn...')
        start = time.time()

        # transform the data
        extractor = trex.TreeExtractor(model_noisy, tree_kernel=args.tree_kernel)
        X_train_alt = extractor.fit_transform(X_train)
        train_label = y_train_noisy if args.true_label else model_noisy.predict(X_train)

        # tune and train teknn
        knn_clf = exp_util.tune_knn(model_noisy, X_train, X_train_alt, train_label,
                                    args.val_frac, seed=seed, logger=logger)

        ckpt_ndx, fix_ndx, _ = _knn_method(knn_clf, X_train_alt, noisy_ndx, interval, to_check=n_check)
        _, teknn_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), teknn_res)

        # TEKNN loss method
        logger.info('\nordering by teknn loss...')
        start = time.time()
        y_train_proba = knn_clf.predict_proba(X_train_alt)
        ckpt_ndx, fix_ndx, _, _ = _loss_method(noisy_ndx, y_train_proba, y_train_noisy, interval, to_check=n_check)
        _, teknn_loss_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method_loss.npy'), teknn_loss_res)

    # MMD-Critic method
    if args.mmd:
        logger.info('\nordering by mmd-critic...')
        start = time.time()
        ckpt_ndx, fix_ndx = _mmd_method(model_noisy, X_train, y_train_noisy, noisy_ndx, interval, n_check)
        _, mmd_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), mmd_res)

    # Prototype method
    if args.proto:
        logger.info('\nordering by proto...')
        start = time.time()
        ckpt_ndx, fix_ndx = _proto_method(model_noisy, X_train, y_train_noisy, noisy_ndx, interval, n_check)
        _, proto_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'method.npy'), proto_res)
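
# Illustration (not part of the original script): a hypothetical plotting sketch for
# the arrays this experiment saves. File names match the np.save calls above;
# 'method.npy' / 'method_loss.npy' hold whichever ordering method was enabled for
# that run, and the assumption here is that each array stores test accuracy at the
# recorded checkpoints. The output directory name is made up.
def _demo_plot_cleaning_results(out_dir='output/cleaning'):
    import os
    import numpy as np
    import matplotlib.pyplot as plt

    check_pct = np.load(os.path.join(out_dir, 'check_pct.npy'))   # fraction of train data checked
    acc_clean = np.load(os.path.join(out_dir, 'test_clean.npy'))  # accuracy with no label noise

    fig, ax = plt.subplots()
    for name in ['random', 'tree', 'method', 'method_loss']:
        fp = os.path.join(out_dir, '{}.npy'.format(name))
        if os.path.exists(fp):
            ax.plot(check_pct, np.load(fp), label=name)

    ax.axhline(float(acc_clean), linestyle='--', color='k', label='clean')
    ax.set_xlabel('fraction of train data checked')
    ax.set_ylabel('test accuracy')
    ax.legend()
    plt.show()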
def experiment(args, logger, out_dir):

    # start timer
    begin = time.time()

    # get data
    data = util.get_data(args.dataset,
                         data_dir=args.data_dir,
                         preprocessing=args.preprocessing)
    X_train, X_test, y_train, y_test, feature, cat_indices = data

    logger.info('\ntrain instances: {:,}'.format(X_train.shape[0]))
    logger.info('test instances: {:,}'.format(X_test.shape[0]))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # get tree-ensemble
    clf = util.get_model(args.model,
                         n_estimators=args.n_estimators,
                         max_depth=args.max_depth,
                         random_state=args.rs,
                         cat_indices=cat_indices)

    # train a tree ensemble
    model = clone(clf).fit(X_train, y_train)
    util.performance(model, X_train, y_train, logger=logger, name='Train')
    util.performance(model, X_test, y_test, logger=logger, name='Test')

    # store indexes of different subgroups
    train_neg = np.where(y_train == 0)[0]
    train_pos = np.where(y_train == 1)[0]
    # test_neg = np.where(y_test == 0)[0]
    # test_pos = np.where(y_test == 1)[0]

    # transform features to tree kernel space
    logger.info('\ntransforming features into tree kernel space...')
    extractor = trex.TreeExtractor(model, tree_kernel=args.tree_kernel)

    start = time.time()
    X_train_alt = extractor.transform(X_train)
    logger.info('train transform time: {:.3f}s'.format(time.time() - start))

    start = time.time()
    X_test_alt = extractor.transform(X_test)
    logger.info('test transform time: {:.3f}s'.format(time.time() - start))

    # reduce dimensionality on original and tree feature spaces
    logger.info('\nembed original features into a lower dimensional space')
    X_train, X_test = reduce_and_embed(args, X_train, X_test, logger)

    logger.info('\nembed tree kernel features into a lower dimensional space')
    X_train_alt, X_test_alt = reduce_and_embed(args, X_train_alt, X_test_alt, logger)

    # separating embedded points into train and test
    # n_train = len(y_train)
    # train_neg_embed = X_embed[:n_train][train_neg]
    # train_pos_embed = X_embed[:n_train][train_pos]
    # test_neg_embed = X_embed[n_train:][test_neg]
    # test_pos_embed = X_embed[n_train:][test_pos]

    # save original feature space results
    np.save(os.path.join(out_dir, 'train_negative'), X_train[train_neg])
    np.save(os.path.join(out_dir, 'train_positive'), X_train[train_pos])

    # save tree kernel space results
    np.save(os.path.join(out_dir, 'train_tree_negative'), X_train_alt[train_neg])
    np.save(os.path.join(out_dir, 'train_tree_positive'), X_train_alt[train_pos])
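
# Illustration (not part of the original script): `reduce_and_embed` is defined
# elsewhere; a minimal sketch consistent with how it is called above, assuming a
# PCA reduction followed by a joint 2-D t-SNE embedding (one common choice --
# the actual implementation may differ).
def _reduce_and_embed_sketch(args, X_train, X_test, logger):
    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.manifold import TSNE

    # optional PCA step to keep t-SNE tractable on high-dimensional inputs
    if X_train.shape[1] > 50:
        pca = PCA(n_components=50, random_state=args.rs)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)
        logger.info('PCA variance explained: {:.3f}'.format(pca.explained_variance_ratio_.sum()))

    # t-SNE has no transform() method, so embed train and test jointly, then split
    X_all = np.vstack([X_train, X_test])
    X_all_embed = TSNE(n_components=2, random_state=args.rs).fit_transform(X_all)
    return X_all_embed[:len(X_train)], X_all_embed[len(X_train):]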