def get_naive(args):
    """Build the naive (no-retraining-budget) DARE forest.

    `topd` is pinned to 0 so every layer is trained exactly; all other
    hyperparameters come straight from the parsed command-line `args`.
    """
    naive_forest = dare.Forest(max_depth=args.max_depth,
                               criterion=args.criterion,
                               topd=0,
                               k=args.k,
                               n_estimators=args.n_estimators,
                               max_features=args.max_features,
                               verbose=args.verbose,
                               random_state=args.rs)
    return naive_forest
def _get_model(args):
    """Construct a DARE forest (topd fixed at 0) from command-line args.

    Note: unlike `get_naive`, this variant does not forward `verbose`.
    """
    forest = dare.Forest(criterion=args.criterion,
                         topd=0,
                         k=args.k,
                         n_estimators=args.n_estimators,
                         max_features=args.max_features,
                         max_depth=args.max_depth,
                         random_state=args.rs)
    return forest
def _get_model(args, topd=0):
    """Construct a DARE forest with the requested `topd` (default 0).

    All remaining hyperparameters are taken from the parsed `args`.
    """
    forest = dare.Forest(max_depth=args.max_depth,
                         criterion=args.criterion,
                         topd=topd,
                         k=args.k,
                         n_estimators=args.n_estimators,
                         max_features=args.max_features,
                         verbose=args.verbose,
                         random_state=args.rs)
    return forest
def _get_model_dict(args, params):
    """Build the model named by ``args.model`` using tuned ``params``.

    Supported names: 'dare', 'extra_trees', 'extra_trees_k1', 'sklearn'.

    Raises:
        ValueError: if ``args.model`` is not one of the supported names.
    """
    name = args.model

    if name == 'dare':
        return dare.Forest(criterion=args.criterion,
                           max_depth=params['max_depth'],
                           n_estimators=params['n_estimators'],
                           max_features=args.max_features,
                           topd=args.topd,
                           k=params['k'],
                           verbose=args.verbose,
                           random_state=args.rs)

    if name == 'extra_trees':
        return ExtraTreesClassifier(n_estimators=params['n_estimators'],
                                    max_depth=params['max_depth'],
                                    max_features=args.max_features,
                                    criterion=args.criterion,
                                    random_state=args.rs)

    # same as 'extra_trees' but each split considers exactly one feature
    if name == 'extra_trees_k1':
        return ExtraTreesClassifier(n_estimators=params['n_estimators'],
                                    max_depth=params['max_depth'],
                                    max_features=1,
                                    criterion=args.criterion,
                                    random_state=args.rs)

    if name == 'sklearn':
        return RandomForestClassifier(n_estimators=params['n_estimators'],
                                      max_depth=params['max_depth'],
                                      max_features=args.max_features,
                                      criterion=args.criterion,
                                      random_state=args.rs,
                                      bootstrap=args.bootstrap)

    raise ValueError('model {} unknown!'.format(name))
def main(args):
    """Train a forest, evaluate it, then optionally delete (or simulate
    deleting) a random subset of training instances, timing each operation.

    Expects `args` to provide: dataset, data_dir, model, delete, simulate,
    test_idempotency.
    """
    # get data
    X_train, X_test, y_train, y_test = load_data(args.dataset, args.data_dir)

    # experiment constants
    topd = 0
    k = 100
    n_estimators = 100
    max_depth = 20
    seed = 1
    n_delete = 100

    # select model
    # BUG FIX: the original unconditionally rebuilt a dare.Forest after this
    # if/elif, so the 'sklearn' branch was dead code and --model sklearn
    # silently trained a dare forest; the selected model is now actually used.
    if args.model == 'dare':
        model = dare.Forest(topd=topd, k=k, n_estimators=n_estimators,
                            max_depth=max_depth, random_state=seed)
    elif args.model == 'sklearn':
        model = RandomForestClassifier(n_estimators=n_estimators,
                                       max_depth=max_depth,
                                       random_state=seed)
    else:
        # previously `model` would have been unbound here
        raise ValueError('model {} unknown!'.format(args.model))

    # train
    start = time.time()
    model = model.fit(X_train, y_train)
    train_time = time.time() - start
    print('train time: {:.3f}s'.format(train_time))

    # predict
    y_proba = model.predict_proba(X_test)
    y_pred = np.argmax(y_proba, axis=1)

    # evaluate; AUC uses the positive-class column, so this assumes a
    # binary classification task
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba[:, 1])
    print('ACC: {:.3f}, AUC: {:.3f}'.format(acc, auc))

    # delete training data (only meaningful for the dare model, which
    # supports .delete() / .sim_delete())
    cum_delete_time = 0
    if args.delete and not args.simulate:
        delete_indices = np.random.default_rng(seed=seed).choice(
            X_train.shape[0], size=n_delete, replace=False)
        print('instances to delete: {}'.format(delete_indices))

        # delete each sample
        for delete_ndx in delete_indices:
            start = time.time()
            model.delete(delete_ndx)
            delete_time = time.time() - start
            cum_delete_time += delete_time
            print('\ndeleted instance, {}: {:.3f}s'.format(
                delete_ndx, delete_time))

            types, depths, costs = model.get_delete_metrics()
            print('types: {}'.format(types))
            print('depths: {}'.format(depths))
            print('costs: {}'.format(costs))

        avg_delete_time = cum_delete_time / len(delete_indices)
        print('train time: {:.3f}s'.format(train_time))
        # BUG FIX: this string literal was broken across a line in the
        # original ('avg. \n delete time: ...'); restored to one line.
        print('avg. delete time: {:.3f}s'.format(avg_delete_time))

    # simulate the deletion of each instance
    elif args.delete and args.simulate:
        delete_indices = np.random.default_rng(seed=seed).choice(
            X_train.shape[0], size=n_delete, replace=False)
        print('instances to delete: {}'.format(delete_indices))

        # cumulative time
        cum_delete_time = 0
        cum_sim_time = 0

        # simulate and delete each sample
        for delete_ndx in delete_indices:

            # simulate the deletion
            start = time.time()
            n_samples_to_retrain = model.sim_delete(delete_ndx)
            if args.test_idempotency:
                # a second simulation of the same index should be a no-op
                n_samples_to_retrain = model.sim_delete(delete_ndx)
            sim_time = time.time() - start
            cum_sim_time += sim_time
            print(
                '\nsimulated instance, {}: {:.3f}s, no. samples: {:,}'.format(
                    delete_ndx, sim_time, n_samples_to_retrain))

            # delete
            start = time.time()
            model.delete(delete_ndx)
            delete_time = time.time() - start
            cum_delete_time += delete_time
            print('deleted instance, {}: {:.3f}s'.format(
                delete_ndx, delete_time))

            types, depths, costs = model.get_delete_metrics()
            print('types: {}'.format(types))
            print('depths: {}'.format(depths))
            print('costs: {}'.format(costs.shape))

        avg_sim_time = cum_sim_time / len(delete_indices)
        avg_delete_time = cum_delete_time / len(delete_indices)
        print('avg. sim. time: {:.5f}s'.format(avg_sim_time))
        print('avg. delete time: {:.5f}s'.format(avg_delete_time))