Exemplo n.º 1
0
def get_naive(args):
    """
    Build the naive forest.

    The forest is a `dare.Forest` whose `topd` is fixed at 0; every other
    hyperparameter is read from `args` (`max_depth`, `criterion`, `k`,
    `n_estimators`, `max_features`, `verbose`, and `rs` for the seed).
    """
    forest_kwargs = {
        'max_depth': args.max_depth,
        'criterion': args.criterion,
        'topd': 0,  # hard-coded for the naive variant
        'k': args.k,
        'n_estimators': args.n_estimators,
        'max_features': args.max_features,
        'verbose': args.verbose,
        'random_state': args.rs,
    }
    return dare.Forest(**forest_kwargs)
Exemplo n.º 2
0
def _get_model(args):
    """
    Build a `dare.Forest` from the command-line arguments.

    `topd` is always 0; `criterion`, `k`, `n_estimators`, `max_features`,
    `max_depth`, and the seed (`rs`) are taken from `args`.
    """
    return dare.Forest(
        criterion=args.criterion,
        topd=0,
        k=args.k,
        n_estimators=args.n_estimators,
        max_features=args.max_features,
        max_depth=args.max_depth,
        random_state=args.rs,
    )
Exemplo n.º 3
0
def _get_model(args, topd=0):
    """
    Build a `dare.Forest` that uses the given `topd`.

    All remaining hyperparameters (`max_depth`, `criterion`, `k`,
    `n_estimators`, `max_features`, `verbose`, and the seed `rs`) are
    read from `args`.
    """
    settings = dict(
        max_depth=args.max_depth,
        criterion=args.criterion,
        topd=topd,
        k=args.k,
        n_estimators=args.n_estimators,
        max_features=args.max_features,
        verbose=args.verbose,
        random_state=args.rs,
    )
    forest = dare.Forest(**settings)
    return forest
Exemplo n.º 4
0
def _get_model_dict(args, params):
    """
    Return the appropriate model.
    """

    if args.model == 'dare':
        model = dare.Forest(criterion=args.criterion,
                            max_depth=params['max_depth'],
                            n_estimators=params['n_estimators'],
                            max_features=args.max_features,
                            topd=args.topd,
                            k=params['k'],
                            verbose=args.verbose,
                            random_state=args.rs)

    elif args.model == 'extra_trees':
        model = ExtraTreesClassifier(n_estimators=params['n_estimators'],
                                     max_depth=params['max_depth'],
                                     max_features=args.max_features,
                                     criterion=args.criterion,
                                     random_state=args.rs)

    elif args.model == 'extra_trees_k1':
        model = ExtraTreesClassifier(n_estimators=params['n_estimators'],
                                     max_depth=params['max_depth'],
                                     max_features=1,
                                     criterion=args.criterion,
                                     random_state=args.rs)

    elif args.model == 'sklearn':
        model = RandomForestClassifier(n_estimators=params['n_estimators'],
                                       max_depth=params['max_depth'],
                                       max_features=args.max_features,
                                       criterion=args.criterion,
                                       random_state=args.rs,
                                       bootstrap=args.bootstrap)
    else:
        raise ValueError('model {} unknown!'.format(args.model))

    return model
Exemplo n.º 5
0
def main(args):
    """
    Train a forest, report test ACC/AUC, and optionally time deletions.

    Attributes read from `args`:
        model: 'dare' or 'sklearn'; selects the model that is trained.
        dataset, data_dir: forwarded to `load_data`.
        delete: if truthy, delete `n_delete` randomly chosen training
            instances one at a time and report timing and delete metrics.
        simulate: (only read when `delete` is truthy) if truthy, call
            `sim_delete` on each instance before actually deleting it.
        test_idempotency: (only read when simulating) if truthy, call
            `sim_delete` a second time for each instance.

    Raises:
        ValueError: if `args.model` is not 'dare' or 'sklearn', or if
            deletion is requested for a model other than 'dare'.

    NOTE(review): the AUC uses column 1 of `predict_proba`, so this
    presumably expects a binary classification dataset -- confirm.
    """

    # validate up front so a bad configuration fails before any data is
    # loaded or any model is trained
    if args.model not in ('dare', 'sklearn'):
        raise ValueError('model {} unknown!'.format(args.model))

    # `delete`, `sim_delete`, and `get_delete_metrics` below are only
    # available on the dare model
    if args.delete and args.model != 'dare':
        raise ValueError(
            'model {} does not support deletion!'.format(args.model))

    # get data
    X_train, X_test, y_train, y_test = load_data(args.dataset, args.data_dir)

    # fixed experiment settings
    topd = 0
    k = 100
    n_estimators = 100
    max_depth = 20
    seed = 1
    n_delete = 100

    # train the model selected by `args.model`; the timer covers both
    # construction and fitting
    start = time.time()
    if args.model == 'dare':
        model = dare.Forest(topd=topd,
                            k=k,
                            n_estimators=n_estimators,
                            max_depth=max_depth,
                            random_state=seed)
    else:  # 'sklearn' (validated above)
        model = RandomForestClassifier(n_estimators=n_estimators,
                                       max_depth=max_depth,
                                       random_state=seed)
    model = model.fit(X_train, y_train)
    train_time = time.time() - start
    print('train time: {:.3f}s'.format(train_time))

    # predict
    y_proba = model.predict_proba(X_test)
    y_pred = np.argmax(y_proba, axis=1)

    # evaluate
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba[:, 1])
    print('ACC: {:.3f}, AUC: {:.3f}'.format(acc, auc))

    if not args.delete:
        return

    # choose the training instances to delete (seeded, without replacement)
    delete_indices = np.random.default_rng(seed=seed).choice(
        X_train.shape[0], size=n_delete, replace=False)
    print('instances to delete: {}'.format(delete_indices))

    cum_delete_time = 0

    # delete training data
    if not args.simulate:

        # delete each sample
        for delete_ndx in delete_indices:
            start = time.time()
            model.delete(delete_ndx)
            delete_time = time.time() - start
            cum_delete_time += delete_time
            print('\ndeleted instance, {}: {:.3f}s'.format(
                delete_ndx, delete_time))

        types, depths, costs = model.get_delete_metrics()
        print('types: {}'.format(types))
        print('depths: {}'.format(depths))
        print('costs: {}'.format(costs))

        avg_delete_time = cum_delete_time / len(delete_indices)
        print('train time: {:.3f}s'.format(train_time))
        print('avg. delete time: {:.3f}s'.format(avg_delete_time))

    # simulate the deletion of each instance
    else:
        cum_sim_time = 0

        # simulate and delete each sample
        for delete_ndx in delete_indices:

            # simulate the deletion
            start = time.time()
            n_samples_to_retrain = model.sim_delete(delete_ndx)
            if args.test_idempotency:
                # a repeated simulation is included in the timed span
                n_samples_to_retrain = model.sim_delete(delete_ndx)
            sim_time = time.time() - start
            cum_sim_time += sim_time
            print(
                '\nsimulated instance, {}: {:.3f}s, no. samples: {:,}'.format(
                    delete_ndx, sim_time, n_samples_to_retrain))

            # delete
            start = time.time()
            model.delete(delete_ndx)
            delete_time = time.time() - start
            cum_delete_time += delete_time
            print('deleted instance, {}: {:.3f}s'.format(
                delete_ndx, delete_time))

        types, depths, costs = model.get_delete_metrics()
        print('types: {}'.format(types))
        print('depths: {}'.format(depths))
        print('costs: {}'.format(costs.shape))

        avg_sim_time = cum_sim_time / len(delete_indices)
        avg_delete_time = cum_delete_time / len(delete_indices)

        print('avg. sim. time: {:.5f}s'.format(avg_sim_time))
        print('avg. delete time: {:.5f}s'.format(avg_delete_time))