Example #1
def _teknn_method(args, tree, test_ndx, X_train, train_label, X_test, seed, logger=None):
    """
    TEKNN fine tuning and computation.
    """

    with timeout(seconds=MAX_TIME):
        try:
            start = time.time()
            extractor = trex.TreeExtractor(tree, tree_kernel=args.tree_kernel)
            X_train_alt = extractor.fit_transform(X_train)

            # tune and train teknn
            knn_clf = exp_util.tune_knn(tree, X_train, X_train_alt, train_label,
                                        args.val_frac, seed=seed, logger=logger)
            fine_tune = time.time() - start

        except Exception:
            if logger:
                logger.info('TEKNN fine-tuning exceeded time limit!')

            return None, None

    start = time.time()
    x_test_alt = extractor.transform(X_test[test_ndx])
    distances, neighbor_ids = knn_clf.kneighbors(x_test_alt)
    test_time = time.time() - start

    return fine_tune, test_time
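A hedged usage sketch of how this timing helper might be driven. Everything here is assumed from the host module: args, a fitted tree, the data splits, and the MAX_TIME constant used by the timeout above.

import numpy as np

# hypothetical driver; assumes `args`, `tree`, `X_train`, `X_test` already exist
test_ndx = np.arange(min(10, X_test.shape[0]))   # time the first 10 test instances
train_label = tree.predict(X_train)              # surrogate fits the tree's own outputs
tune_s, test_s = _teknn_method(args, tree, test_ndx, X_train,
                               train_label, X_test, seed=1)
if tune_s is None:
    print('fine-tuning exceeded MAX_TIME')
else:
    print('tune: {:.3f}s  test: {:.3f}s'.format(tune_s, test_s))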
Example #2
def _teknn_method(args, model, X_test, X_train, y_train, y_test, seed, logger):
    """
    Ranks training instances by their summed TEKNN contributions
    across all test instances.
    """

    global teknn_explainer
    global teknn_extractor

    if teknn_explainer is None:

        # transform the data
        teknn_extractor = trex.TreeExtractor(model, tree_kernel=args.tree_kernel)
        X_train_alt = teknn_extractor.fit_transform(X_train)
        train_label = y_train if args.true_label else model.predict(X_train)

        # tune and train teknn
        teknn_explainer = exp_util.tune_knn(model, X_train, X_train_alt, train_label, args.val_frac,
                                            seed=seed, logger=logger)

    # results container
    contributions_sum = np.zeros(X_train.shape[0])

    # compute the contribution of all training samples on each test instance
    for i in tqdm.tqdm(range(X_test.shape[0])):
        x_test_alt = teknn_extractor.transform(X_test[[i]])
        pred_label = int(teknn_explainer.predict(x_test_alt)[0])
        distances, neighbor_ids = teknn_explainer.kneighbors(x_test_alt)

        for neighbor_id in neighbor_ids[0]:
            contribution = 1 if y_train[neighbor_id] == pred_label else -1
            contributions_sum[neighbor_id] += contribution

    train_order = np.argsort(contributions_sum)[::-1]
    return train_order
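The two module-level globals are a simple memoization so repeated calls reuse one tuned surrogate. The same idea as an explicit cache, reusing the source's trex and exp_util helpers (get_teknn_surrogate and _surrogate_cache are hypothetical names, not from the source):

_surrogate_cache = {}

def get_teknn_surrogate(model, X_train, train_label, tree_kernel,
                        val_frac, seed=1, logger=None):
    # fit the tree-kernel extractor and tune the KNN once per model, then reuse
    key = id(model)
    if key not in _surrogate_cache:
        extractor = trex.TreeExtractor(model, tree_kernel=tree_kernel)
        X_train_alt = extractor.fit_transform(X_train)
        knn = exp_util.tune_knn(model, X_train, X_train_alt, train_label,
                                val_frac, seed=seed, logger=logger)
        _surrogate_cache[key] = (extractor, knn)
    return _surrogate_cache[key]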
Example #3
def _proto_method(model, X_train, y_train, noisy_ndx, interval, n_check):
    """
    Orders instances by using the GBT distance similarity formula in
    https://arxiv.org/pdf/1611.07115.pdf, then ranks training samples
    based on the proportion of labels from the k = 10, nearest neighbors.
    """
    extractor = trex.TreeExtractor(model, tree_kernel='leaf_path')
    X_train_alt = extractor.fit_transform(X_train)

    # obtain weight of each tree: note, this code is specific to CatBoost
    temp_fp = '.{}_cb.json'.format(str(uuid.uuid4()))
    model.save_model(temp_fp, format='json')
    with open(temp_fp, 'r') as f:
        cb_dump = json.load(f)

    # obtain weight of each tree: learning_rate^2 * var(predictions)
    tree_weights = []
    for tree in cb_dump['oblivious_trees']:
        predictions = []

        for val, weight in zip(tree['leaf_values'], tree['leaf_weights']):
            predictions += [val] * weight

        tree_weights.append(np.var(predictions) * (model.learning_rate_ ** 2))

    # weight leaf path feature representation by the tree weights
    for i in range(X_train_alt.shape[0]):

        weight_cnt = 0
        for j in range(X_train_alt.shape[1]):

            if X_train_alt[i][j] == 1:
                X_train_alt[i][j] *= tree_weights[weight_cnt]
                weight_cnt += 1

        assert weight_cnt == len(tree_weights)

    # build a KNN using this proximity measure using k = 10
    knn = KNeighborsClassifier(n_neighbors=TREE_PROTO_K)
    knn = knn.fit(X_train_alt, y_train)

    # compute proportion of neighbors that share the same label
    train_impact = np.zeros(X_train_alt.shape[0])
    for i in tqdm.tqdm(range(X_train_alt.shape[0])):
        _, neighbor_ids = knn.kneighbors([X_train_alt[i]])
        train_impact[i] = np.mean(y_train[neighbor_ids[0]] == y_train[i])

    # rank training instances by low label agreement with their neighbors
    train_order = np.argsort(train_impact)[:n_check]
    ckpt_ndx, fix_ndx = _record_fixes(train_order, noisy_ndx, len(y_train), interval)

    os.remove(temp_fp)

    return ckpt_ndx, fix_ndx
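The per-tree weight implements learning_rate^2 * Var(leaf predictions), where each leaf value is repeated according to its leaf weight (its sample count). A self-contained toy computation of a single tree's weight, with made-up leaf values:

import numpy as np

learning_rate = 0.1
leaf_values = [0.5, -0.2, 0.3]   # toy leaf outputs
leaf_weights = [10, 5, 20]       # toy sample counts per leaf

# replicate each leaf value by its weight, mirroring the loop above
predictions = []
for val, weight in zip(leaf_values, leaf_weights):
    predictions += [val] * weight

tree_weight = np.var(predictions) * learning_rate ** 2
print(tree_weight)  # variance of leaf outputs, scaled by learning_rate^2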
Example #4
def experiment(args, logger, out_dir, seed):
    """
    Trains a tree ensemble, then generates and saves TEKNN and/or TREX
    surrogate predictions alongside the tree ensemble's predictions.
    """

    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=args.rs)

    X_train, X_test, y_train, y_test, label = data_util.get_data(
        args.dataset, random_state=args.rs, data_dir=args.data_dir)

    # reduce train size
    if args.train_frac < 1.0 and args.train_frac > 0.0:
        n_train = int(X_train.shape[0] * args.train_frac)
        X_train, y_train = X_train[:n_train], y_train[:n_train]
    data = X_train, y_train, X_test, y_test

    logger.info('train instances: {}'.format(len(X_train)))
    logger.info('test instances: {}'.format(len(X_test)))
    logger.info('no. features: {}'.format(X_train.shape[1]))

    logger.info('no. trees: {:,}'.format(args.n_estimators))
    logger.info('max depth: {}'.format(args.max_depth))

    # train a tree ensemble
    logger.info('fitting tree ensemble...')
    tree = clf.fit(X_train, y_train)

    if args.teknn:

        # transform data
        extractor = trex.TreeExtractor(tree, tree_kernel=args.tree_kernel)

        logger.info('transforming training data...')
        X_train_alt = extractor.fit_transform(X_train)

        logger.info('transforming test data...')
        X_test_alt = extractor.transform(X_test)

        train_label = y_train if args.true_label else tree.predict(X_train)

        # tune and train teknn
        start = time.time()
        logger.info('TE-KNN...')
        if args.k:
            knn_clf = KNeighborsClassifier(n_neighbors=args.k,
                                           weights='uniform')
            knn_clf = knn_clf.fit(X_train_alt, train_label)
        else:
            knn_clf = exp_util.tune_knn(tree,
                                        X_train,
                                        X_train_alt,
                                        train_label,
                                        args.val_frac,
                                        seed=seed,
                                        logger=logger)

        logger.info('tuning time: {:.3f}s'.format(time.time() - start))

        start = time.time()
        logger.info('generating predictions...')
        results = _get_knn_predictions(tree,
                                       knn_clf,
                                       X_test,
                                       X_test_alt,
                                       y_train,
                                       pred_size=args.pred_size,
                                       out_dir=out_dir,
                                       logger=logger)
        logger.info('time: {:.3f}s'.format(time.time() - start))

        # save results
        if results:
            results['n_neighbors'] = knn_clf.get_params()['n_neighbors']
            np.save(os.path.join(out_dir, 'tree.npy'), results['tree'])
            np.save(os.path.join(out_dir, 'surrogate.npy'), results['teknn'])

    if args.trex:

        start = time.time()
        explainer = trex.TreeExplainer(tree,
                                       X_train,
                                       y_train,
                                       tree_kernel=args.tree_kernel,
                                       kernel_model=args.kernel_model,
                                       random_state=args.rs,
                                       logger=logger,
                                       true_label=not args.true_label,
                                       val_frac=args.val_frac)

        start = time.time()
        logger.info('generating predictions...')
        results = _get_trex_predictions(tree, explainer, data)
        logger.info('time: {:.3f}s'.format(time.time() - start))

        results['C'] = explainer.C

        # save data
        np.save(os.path.join(out_dir, 'tree.npy'), results['tree'])
        np.save(os.path.join(out_dir, 'surrogate.npy'), results['trex'])
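For context, a sketch of the argparse setup this experiment appears to expect; only the flag names are inferred from the attributes the function reads (args.tree_type, args.teknn, and so on), and every default below is illustrative:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str, default='churn')
parser.add_argument('--data_dir', type=str, default='data')
parser.add_argument('--tree_type', type=str, default='cb')
parser.add_argument('--n_estimators', type=int, default=100)
parser.add_argument('--max_depth', type=int, default=None)
parser.add_argument('--train_frac', type=float, default=1.0)
parser.add_argument('--val_frac', type=float, default=0.1)
parser.add_argument('--tree_kernel', type=str, default='leaf_output')
parser.add_argument('--kernel_model', type=str, default='klr')
parser.add_argument('--true_label', action='store_true')
parser.add_argument('--teknn', action='store_true')
parser.add_argument('--trex', action='store_true')
parser.add_argument('--k', type=int, default=None)
parser.add_argument('--pred_size', type=int, default=None)
parser.add_argument('--rs', type=int, default=1)
args = parser.parse_args()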
Example #5
def tree_prototype_method(args, model_noisy, y_train_noisy,
                          noisy_indices, n_check, n_checkpoint,
                          clf, X_train, y_train, X_test, y_test,
                          acc_noisy, auc_noisy, logger=None,
                          k=10, frac_progress_update=0.1):
    """
    Orders instances by using the GBT distance similarity formula.
    It then ranks training samples based on the proportion of
    labels from the k = 10 nearest neighbors.

    Reference:
    https://arxiv.org/pdf/1611.07115.pdf.
    """

    # get feature extractor
    extractor = trex.TreeExtractor(model_noisy, tree_kernel='leaf_path')
    X_train_alt = extractor.fit_transform(X_train)

    # obtain weight of each tree: note, this code is specific to CatBoost
    if 'CatBoostClassifier' in str(model_noisy):
        temp_dir = os.path.join('.catboost_info', 'leaf_influence_{}'.format(str(uuid.uuid4())))
        temp_fp = os.path.join(temp_dir, 'cb.json')
        os.makedirs(temp_dir, exist_ok=True)
        model_noisy.save_model(temp_fp, format='json')
        with open(temp_fp, 'r') as f:
            cb_dump = json.load(f)

        # obtain weight of each tree: learning_rate^2 * var(predictions)
        tree_weights = []
        for tree in cb_dump['oblivious_trees']:
            predictions = []

            for val, weight in zip(tree['leaf_values'], tree['leaf_weights']):
                predictions += [val] * weight

            tree_weights.append(np.var(predictions) * (model_noisy.learning_rate_ ** 2))

        # weight leaf path feature representation by the tree weights
        for i in range(X_train_alt.shape[0]):

            weight_cnt = 0
            for j in range(X_train_alt.shape[1]):

                if X_train_alt[i][j] == 1:
                    X_train_alt[i][j] *= tree_weights[weight_cnt]
                    weight_cnt += 1

            assert weight_cnt == len(tree_weights)

        # clean up
        shutil.rmtree(temp_dir)

    # build a KNN using this proximity measure using k
    knn = KNeighborsClassifier(n_neighbors=k)
    knn = knn.fit(X_train_alt, y_train_noisy)

    # display progress
    if logger:
        logger.info('\ncomputing similarity density...')

    # compute proportion of neighbors that share the same label
    start = time.time()
    train_weight = np.zeros(X_train_alt.shape[0])
    for i in range(X_train_alt.shape[0]):
        _, neighbor_ids = knn.kneighbors([X_train_alt[i]])
        train_weight[i] = np.mean(y_train_noisy[neighbor_ids[0]] == y_train_noisy[i])

        # display progress
        if logger and i % int(X_train.shape[0] * frac_progress_update) == 0:
            elapsed = time.time() - start
            logger.info('finished {:.1f}% train instances...{:.3f}s'.format((i / X_train.shape[0]) * 100, elapsed))

    # rank training instances by low label agreement with their neighbors
    train_indices = np.argsort(train_weight)
    result = fix_noisy_instances(train_indices, noisy_indices, n_check, n_checkpoint,
                                 clf, X_train, y_train, X_test, y_test,
                                 acc_noisy, auc_noisy, logger=logger)

    return result
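The density score is the fraction of an instance's k nearest neighbors, in the weighted leaf-path space, that share its (possibly noisy) label; low agreement flags likely mislabeled points. The per-instance loop above can be restated compactly (a sketch; label_agreement is not a function from the source):

import numpy as np
from sklearn.neighbors import KNeighborsClassifier

def label_agreement(X_alt, y, k=10):
    # fraction of each instance's k nearest neighbors that share its label
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_alt, y)
    _, neighbor_ids = knn.kneighbors(X_alt)        # (n, k) neighbor indices
    return (y[neighbor_ids] == y[:, None]).mean(axis=1)

# lowest-agreement instances are inspected first:
# order = np.argsort(label_agreement(X_train_alt, y_train_noisy))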
Example #6
def experiment(args, logger, out_dir, seed):
    """
    Main method that trains a tree ensemble, flips a percentage of train labels, prioritizes train
    instances using various methods, and computes how effective each method is at cleaning the data.
    """

    # get model and data
    clf = model_util.get_classifier(args.tree_type,
                                    n_estimators=args.n_estimators,
                                    max_depth=args.max_depth,
                                    random_state=seed)

    X_train, X_test, y_train, y_test, label = data_util.get_data(args.dataset,
                                                                 random_state=seed,
                                                                 data_dir=args.data_dir)

    # reduce train size
    if args.train_frac < 1.0 and args.train_frac > 0.0:
        n_train = int(X_train.shape[0] * args.train_frac)
        X_train, y_train = X_train[:n_train], y_train[:n_train]
    data = X_train, y_train, X_test, y_test

    logger.info('no. train instances: {:,}'.format(len(X_train)))
    logger.info('no. test instances: {:,}'.format(len(X_test)))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # add noise
    y_train_noisy, noisy_ndx = data_util.flip_labels(y_train, k=args.flip_frac, random_state=seed)
    noisy_ndx = np.array(sorted(noisy_ndx))
    logger.info('no. noisy labels: {:,}'.format(len(noisy_ndx)))

    # train a tree ensemble on the clean and noisy labels
    model = clone(clf).fit(X_train, y_train)
    model_noisy = clone(clf).fit(X_train, y_train_noisy)

    # show model performance before and after noise
    logger.info('\nBefore noise:')
    model_util.performance(model, X_train, y_train, X_test=X_test, y_test=y_test, logger=logger)
    logger.info('\nAfter noise:')
    model_util.performance(model_noisy, X_train, y_train_noisy, X_test=X_test, y_test=y_test, logger=logger)

    # check accuracy before and after noise
    acc_test_clean = accuracy_score(y_test, model.predict(X_test))
    acc_test_noisy = accuracy_score(y_test, model_noisy.predict(X_test))

    # find how many corrupted/non-corrupted labels were incorrectly predicted
    if not args.true_label:
        logger.info('\nUsing predicted labels:')
        predicted_labels = model_noisy.predict(X_train).flatten()
        incorrect_ndx = np.where(y_train_noisy != predicted_labels)[0]
        incorrect_corrupted_ndx = np.intersect1d(noisy_ndx, incorrect_ndx)
        logger.info('incorrectly predicted corrupted labels: {:,}'.format(incorrect_corrupted_ndx.shape[0]))
        logger.info('total number of incorrectly predicted labels: {:,}'.format(incorrect_ndx.shape[0]))

    # number of checkpoints to record
    n_check = int(len(y_train) * args.check_pct)
    interval = (n_check / len(y_train)) / args.n_plot_points

    # random method
    logger.info('\nordering by random...')
    start = time.time()
    ckpt_ndx, fix_ndx = _random_method(noisy_ndx, y_train, interval,
                                       to_check=n_check,
                                       random_state=seed)
    check_pct, random_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)
    logger.info('time: {:.3f}s'.format(time.time() - start))
    np.save(os.path.join(out_dir, 'random.npy'), random_res)

    # save global lines
    np.save(os.path.join(out_dir, 'test_clean.npy'), acc_test_clean)
    np.save(os.path.join(out_dir, 'check_pct.npy'), check_pct)

    # tree loss method
    logger.info('\nordering by tree loss...')
    start = time.time()

    y_train_proba = model_noisy.predict_proba(X_train)
    ckpt_ndx, fix_ndx, _, _ = _loss_method(noisy_ndx, y_train_proba, y_train_noisy, interval, to_check=n_check)
    _, tree_loss_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

    logger.info('time: {:.3f}s'.format(time.time() - start))
    np.save(os.path.join(out_dir, 'tree.npy'), tree_loss_res)

    # trex method
    if args.trex:
        logger.info('\nordering by TREX...')
        start = time.time()
        explainer = trex.TreeExplainer(model_noisy, X_train, y_train_noisy,
                                       tree_kernel=args.tree_kernel,
                                       random_state=seed,
                                       true_label=args.true_label,
                                       kernel_model=args.kernel_model,
                                       verbose=args.verbose,
                                       val_frac=args.val_frac,
                                       logger=logger)

        ckpt_ndx, fix_ndx, _ = _our_method(explainer, noisy_ndx, y_train, n_check, interval)
        check_pct, trex_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'trex.npy'), trex_res)

        # trex loss method
        logger.info('\nordering by TREX loss...')
        start = time.time()

        y_train_proba = explainer.predict_proba(X_train)
        ckpt_ndx, fix_ndx, _, _ = _loss_method(noisy_ndx, y_train_proba, y_train_noisy, interval, to_check=n_check)
        _, trex_loss_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'trex_loss.npy'), trex_loss_res)

    # influence method
    if args.tree_type == 'cb' and args.inf_k is not None:
        logger.info('\nordering by leafinfluence...')
        start = time.time()

        model_path = '.model.json'
        model_noisy.save_model(model_path, format='json')

        if args.inf_k == -1:
            update_set = 'AllPoints'
        elif args.inf_k == 0:
            update_set = 'SinglePoint'
        else:
            update_set = 'TopKLeaves'

        leaf_influence = CBLeafInfluenceEnsemble(model_path, X_train, y_train_noisy, k=args.inf_k,
                                                 learning_rate=model.learning_rate_, update_set=update_set)
        ckpt_ndx, fix_ndx, _, _ = _influence_method(leaf_influence, noisy_ndx, X_train, y_train, y_train_noisy,
                                                    interval, to_check=n_check)
        _, leafinfluence_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'leaf_influence.npy'), leafinfluence_res)

    # MAPLE method
    if args.maple:
        logger.info('\nordering by MAPLE...')
        start = time.time()

        train_label = y_train_noisy if args.true_label else model_noisy.predict(X_train)
        maple_exp = MAPLE(X_train, train_label, X_train, train_label, verbose=args.verbose, dstump=False)
        ckpt_ndx, fix_ndx, map_scores, map_order = _maple_method(maple_exp, X_train, noisy_ndx, interval,
                                                                 to_check=n_check)
        _, maple_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'maple.npy'), maple_res)

    # TEKNN method
    if args.teknn:
        logger.info('\nordering by teknn...')
        start = time.time()

        # transform the data
        extractor = trex.TreeExtractor(model_noisy, tree_kernel=args.tree_kernel)
        X_train_alt = extractor.fit_transform(X_train)
        train_label = y_train_noisy if args.true_label else model_noisy.predict(X_train)

        # tune and train teknn
        knn_clf = exp_util.tune_knn(model_noisy, X_train, X_train_alt, train_label, args.val_frac,
                                    seed=seed, logger=logger)

        ckpt_ndx, fix_ndx, _ = _knn_method(knn_clf, X_train_alt, noisy_ndx, interval, to_check=n_check)
        _, teknn_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'teknn.npy'), teknn_res)

        # TEKNN loss method
        logger.info('\nordering by teknn loss...')
        start = time.time()
        y_train_proba = knn_clf.predict_proba(X_train_alt)

        ckpt_ndx, fix_ndx, _, _ = _loss_method(noisy_ndx, y_train_proba, y_train_noisy, interval, to_check=n_check)
        _, teknn_loss_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'teknn_loss.npy'), teknn_loss_res)

    # MMD-Critic method
    if args.mmd:
        logger.info('\nordering by mmd-critic...')
        start = time.time()
        ckpt_ndx, fix_ndx = _mmd_method(model_noisy, X_train, y_train_noisy, noisy_ndx, interval, n_check)
        _, mmd_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'mmd.npy'), mmd_res)

    # Prototype method
    if args.proto:
        logger.info('\nordering by proto...')
        start = time.time()
        ckpt_ndx, fix_ndx = _proto_method(model_noisy, X_train, y_train_noisy, noisy_ndx, interval, n_check)
        _, proto_res = _interval_performance(ckpt_ndx, fix_ndx, noisy_ndx, clf, data, acc_test_noisy)

        logger.info('time: {:.3f}s'.format(time.time() - start))
        np.save(os.path.join(out_dir, 'proto.npy'), proto_res)
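Every branch above follows the same protocol: a method ranks the training instances, the experiment walks down that ranking, restores any corrupted labels it encounters, and records performance at checkpoints. A toy sketch of the recovery-rate half of that bookkeeping (hypothetical, far simpler than the source's _record_fixes/_interval_performance):

import numpy as np

def noise_recovery_curve(train_order, noisy_ndx, n_points=10):
    # cumulative fraction of injected noise found after checking the top-i instances
    noisy = set(int(i) for i in noisy_ndx)
    found = np.cumsum([1 if i in noisy else 0 for i in train_order])
    ckpts = np.linspace(1, len(train_order), n_points, dtype=int)
    return ckpts, found[ckpts - 1] / max(len(noisy), 1)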
Example #7
def experiment(args, logger, out_dir):
    """
    Embeds the original and tree-kernel feature spaces into a lower
    dimension and saves the embedded points for each class.
    """

    # start timer
    begin = time.time()

    # get data
    data = util.get_data(args.dataset,
                         data_dir=args.data_dir,
                         preprocessing=args.preprocessing)
    X_train, X_test, y_train, y_test, feature, cat_indices = data

    logger.info('\ntrain instances: {:,}'.format(X_train.shape[0]))
    logger.info('test instances: {:,}'.format(X_test.shape[0]))
    logger.info('no. features: {:,}'.format(X_train.shape[1]))

    # get tree-ensemble
    clf = util.get_model(args.model,
                         n_estimators=args.n_estimators,
                         max_depth=args.max_depth,
                         random_state=args.rs,
                         cat_indices=cat_indices)

    # train a tree ensemble
    model = clone(clf).fit(X_train, y_train)
    util.performance(model, X_train, y_train, logger=logger, name='Train')
    util.performance(model, X_test, y_test, logger=logger, name='Test')

    # store indexes of different subgroups
    train_neg = np.where(y_train == 0)[0]
    train_pos = np.where(y_train == 1)[0]
    # test_neg = np.where(y_test == 0)[0]
    # test_pos = np.where(y_test == 1)[0]

    # transform features to tree kernel space
    logger.info('\ntransforming features into tree kernel space...')
    extractor = trex.TreeExtractor(model, tree_kernel=args.tree_kernel)

    start = time.time()
    X_train_alt = extractor.fit_transform(X_train)
    logger.info('train transform time: {:.3f}s'.format(time.time() - start))

    start = time.time()
    X_test_alt = extractor.transform(X_test)
    logger.info('test transform time: {:.3f}s'.format(time.time() - start))

    # reduce dimensionality on original and tree feature spaces
    logger.info('\nembed original features into a lower dimensional space')
    X_train, X_test = reduce_and_embed(args, X_train, X_test, logger)

    logger.info('\nembed tree kernel features into a lower dimensional space')
    X_train_alt, X_test_alt = reduce_and_embed(args, X_train_alt, X_test_alt,
                                               logger)

    # separating embedded points into train and test
    # n_train = len(y_train)
    # train_neg_embed = X_embed[:n_train][train_neg]
    # train_pos_embed = X_embed[:n_train][train_pos]
    # test_neg_embed = X_embed[n_train:][test_neg]
    # test_pos_embed = X_embed[n_train:][test_pos]

    # save original feature space results
    np.save(os.path.join(out_dir, 'train_negative'), X_train[train_neg])
    np.save(os.path.join(out_dir, 'train_positive'), X_train[train_pos])

    # save tree kernel space results
    np.save(os.path.join(out_dir, 'train_tree_negative'),
            X_train_alt[train_neg])
    np.save(os.path.join(out_dir, 'train_tree_positive'),
            X_train_alt[train_pos])

    logger.info('\ntotal time: {:.3f}s'.format(time.time() - begin))
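reduce_and_embed is not shown in this example; a plausible stand-in consistent with how it is called above (two arrays in, two 2-D embeddings out) is PCA followed by t-SNE. This is an assumption, not the source implementation:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def reduce_and_embed(args, X_train, X_test, logger, n_pca=50):
    # hypothetical: PCA to denoise and speed things up, then 2-D t-SNE
    # fit jointly on the stacked train/test data
    X = np.vstack([X_train, X_test]).astype(np.float32)
    if X.shape[1] > n_pca:
        X = PCA(n_components=n_pca, random_state=args.rs).fit_transform(X)
    X = TSNE(n_components=2, random_state=args.rs).fit_transform(X)
    logger.info('embedded shape: {}'.format(X.shape))
    return X[:X_train.shape[0]], X[X_train.shape[0]:]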