def run_benchmark(config, classifiers, classifiers_gridparameters):
    """ Runs the benchmark code, see voya_config_example for argument explanation """

    default_config = {
        "data_file": None,  # input data
        "test_df": None,  # instead of data_file, give split data
        "train_df": None,
        "out_path": None,
        "num_folds": 5,
        "test_size": 0.2,
        "num_cores": 1,
        "pu_learning": False,
        "pu_rand_samp_frac": False,
        "verbosity": 0,
        "random_forest_tree_plot": False,
        "auc_folds": 1,
        'u_to_p_ratio': False,
        'ranking_Frac': None,
        'include_neg_inTrain': True,
        'gridsearch_metric': 'AUC',
    }

    default_config.update(config)
    config = default_config

    set_verbosity_level(config["verbosity"])
    voya_logger.info("Starting Benchmark")

    out_path = config['out_path']
    if out_path is not None:
        if not os.path.isdir(out_path):
            os.makedirs(out_path)

    # If we are given the test / train sets explicitly
    test_df = config["test_df"]
    train_df = config["train_df"]
    if test_df is not None and train_df is not None:
        y_test, X_test = datasetup.split_df_labels_features(test_df)
        y_train, X_train = datasetup.split_df_labels_features(train_df)
    elif config["data_file"] is not None:  # or load all the data and auto split
        voya_logger.info('loading data from: {}'.format(config['data_file']))
        try:
            df = datasetup.load_data(config['data_file'])
        except IOError:  # file doesn't exist, try treating it as a DataFrame instead
            df = config['data_file']

        voya_logger.info("Input data labels \n{}".format(df.label.value_counts()))

        try:
            datasetup.scale_dataframe_features(df)
        except TypeError:  # got a string as the DF (after the IOError above)
            raise VoyaConfigError('data_file is not a valid path to a file or a Pandas DF, got {}'.format(df))

        if config["pu_learning"]:  # input of positive, negative and unlabeled labels (1, -1, 0)
            voya_logger.info("PU Learning Mode On")

            if config["u_to_p_ratio"]:
                df = datasetup.downsample_pu_df(df, config["u_to_p_ratio"])

            df_test, df_train = datasetup.split_test_train_df_pu(df, config['test_size'])

            assert set(df_test['label'].unique()) == set((1, 0))

            y_test, X_test = datasetup.split_df_labels_features(df_test)
            y_train, X_train = datasetup.split_df_labels_features(df_train)
        else:  # input of positive and negative (i.e. 1, 0)
            X, y = datasetup.split_df_labels_features(df)
            X_train, y_train, X_test, y_test = datasetup.get_stratifed_data(y, X, config['test_size'])
    else:
        raise ValueError("You must give either `test_df` and `train_df` OR `data_file` in config")

    results_table_rows = {}  # each row is a dict with column_name: value

    for clf_name, clf_notoptimized in classifiers.iteritems():
        voya_logger.info("Running {}".format(clf_name))
        clf_results = {'clf_name': clf_name}

        param_grid = classifiers_gridparameters[clf_name]

        if param_grid is None:
            voya_logger.info('Skipping grid search for {}'.format(clf_name))
            voya_logger.debug("clf_notoptimized {}".format(clf_notoptimized))
            clf_fitted = clf_notoptimized.fit(X_train, y_train)
        else:
            voya_logger.info('Performing grid search for {}'.format(clf_name))
            skf = sklearn.cross_validation.StratifiedKFold(y_train, n_folds=config['num_folds'])

            # NOTE: 'desired_retention' has no entry in default_config and must be supplied in config
            ranking = voya_plotter.PrInRanking(config['ranking_Frac'], config['desired_retention'])

            if config['gridsearch_metric'] == 'PosRate':
                clf = GridSearchCV(estimator=clf_notoptimized, param_grid=param_grid, cv=skf,
                                   scoring=ranking.pr_in_ranking, n_jobs=config['num_cores'])
            elif config['gridsearch_metric'] == 'Frac':
                clf = GridSearchCV(estimator=clf_notoptimized, param_grid=param_grid, cv=skf,
                                   scoring=ranking.frac_to_Xpercent, n_jobs=config['num_cores'])
            else:
                clf = GridSearchCV(estimator=clf_notoptimized, param_grid=param_grid, cv=skf,
                                   scoring='roc_auc', n_jobs=config['num_cores'])

            clf_fitted = clf.fit(X_train, y_train).best_estimator_

            clf_optimal_parameters = clf.best_params_
            clf_results['clf_optimal_parameters'] = clf_optimal_parameters
            voya_logger.info('Optimal parameters are {}'.format(clf_optimal_parameters))

        voya_logger.debug('clf_fitted = {}'.format(clf_fitted))
        y_pred = clf_fitted.predict_proba(X_test)[:, 1]
        y_pred_label = clf_fitted.predict(X_test)

        clf_results.update({
            'y_pred': y_pred,
            'y_pred_label': y_pred_label,
            'clf': clf_fitted,
            'clf_notoptimized': clf_notoptimized,
            'X_train': X_train,
            'y_train': y_train,
            'X_test': X_test,
            'y_test': y_test,
            'param_grid': param_grid,
        })

        voya_logger.info("Benchmarking {}".format(clf_name))
        benchmarks.all_benchmarks(clf_results, out_path, config["auc_folds"],  # TODO (ryan) split this up now into benchmarks and plots?
                                  config['ranking_Frac'])

        if out_path is not None:
            # TODO (ryan) non-conforming plots, move to benchmarks
            if config["random_forest_tree_plot"] and isinstance(clf_fitted, sklearn.ensemble.RandomForestClassifier):
                voya_logger.debug('Generating random forest plot')
                # TODO (ryan) we've hardcoded '2' as where the features start several times, export to a var?
                feature_names = [colname.replace('url/tech/', '').replace('cid/tech/', '')
                                 for colname in df.columns[2:]]
                voya_plotter.plot_trees(clf_results['clf'], feature_names)

        results_table_rows[clf_name] = clf_results

    voya_logger.info("\n#######\nResults\n#######")

    num_positives_y_train = y_train.sum()
    voya_logger.info("Training: positives = {}, negatives/unlabelled = {}".format(
        num_positives_y_train, len(y_train) - num_positives_y_train))
    num_positives_y_test = y_test.sum()
    voya_logger.info("Testing: positives = {}, negatives = {}".format(
        num_positives_y_test, len(y_test) - num_positives_y_test))

    results_table = benchmarks.results_dict_to_data_frame(results_table_rows)
    voya_logger.info('\n{}'.format(results_table))

    return results_table_rows
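
# Illustrative usage sketch (an assumption-laden example, not part of the original
# benchmark code): drives run_benchmark with one classifier and a small parameter
# grid, letting the grid search fall through to the default 'roc_auc' scorer. The
# file path, classifier, grid values, and the ranking/retention fractions below
# are all hypothetical.
def _example_run_benchmark_usage():
    import sklearn.linear_model

    example_config = {
        'data_file': 'data/training_set.csv',  # hypothetical CSV with a 'label' column
        'out_path': 'output/benchmark_run',
        'num_folds': 5,
        'test_size': 0.2,
        'gridsearch_metric': 'AUC',  # anything other than 'PosRate'/'Frac' uses 'roc_auc'
        'ranking_Frac': 0.1,         # assumed value, used by the ranking benchmarks
        'desired_retention': 0.9,    # assumed value; has no default so must be supplied when grid searching
    }
    example_classifiers = {
        'logistic_regression': sklearn.linear_model.LogisticRegression(),
    }
    example_grids = {
        'logistic_regression': {'C': [0.01, 0.1, 1.0, 10.0]},
    }
    return run_benchmark(example_config, example_classifiers, example_grids)
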
def run_search_benchmark(config, classifiers, classifiers_gridparameters):
    """ Works like run_benchmark except it calls run_benchmark multiple times, varying the fraction of
    unlabelled examples in the sample.

    Currently PU learning only: the fraction of unlabelled examples used for training is varied as a
    ratio to the number of positives.

    Search parameters are set in the config dictionary. See the code for the required config keys
    (in addition to those used by run_benchmark).

    :param config:
    :param classifiers:
    :param classifiers_gridparameters:
    :return:
    """

    default_config = {
        "data_file": None,  # input data
        "test_df": None,  # instead of data_file, give split data
        "train_df": None,
        "out_path": None,
        "num_folds": 5,
        "test_size": 0.2,
        "num_cores": 3,
        "pu_learning": False,
        "pu_rand_samp_frac": False,
        "verbosity": 0,
        "random_forest_tree_plot": False,
        "auc_folds": 1,
        'u_to_p_ratio': False,
        'voya_mode': 'pusearch',
        'search_results_file': '',  # csv file that records the results of each run
        'soft_search_run': True,  # if True builds on the previous results, if False overwrites the results file
        'search_range': (0.5, 1, 2),  # range of values to run over
        'runs_per_search': 3,  # number of times to run the search per parameter per classifier
        'search_live_plot': False,
        'constant_test_train': True,  # otherwise will resplit every runs_per_search
        'test_neg_to_pos_ratio': None,
        'includes_neg_inTrain': False,
    }

    default_config.update(config)
    config = default_config

    out_path = config['out_path']
    if out_path is not None:
        if not os.path.isdir(out_path):
            os.makedirs(out_path)

    if config['constant_test_train']:
        # Split test / train once so we have a constant testing set
        try:
            df = datasetup.load_data(config['data_file'])
        except IOError:  # file doesn't exist, try treating it as a DataFrame instead
            df = config['data_file']

        df_test, df_train = datasetup.split_test_train_df_pu(
            df, config['test_size'],
            test_neg_to_pos_ratio=config['test_neg_to_pos_ratio'],
            includes_neg_inTrain=config['includes_neg_inTrain'])

        config["test_df"] = df_test
        config["train_df"] = df_train
        config["data_file"] = None

        if config['runs_per_search'] != 1:  # no point doing more if we have a constant test/train
            voya_logger.warning('Setting runs_per_search to 1 as constant_test_train is True, change auc_folds instead')
            config['runs_per_search'] = 1

    save_file = config['search_results_file']
    search_range = config['search_range']

    voya_logger.info('Starting search benchmark')

    if not os.path.exists(save_file) or not config['soft_search_run']:
        with open(save_file, 'wb') as f:
            if config['constant_test_train']:
                f.write('gamma, folds, clf, auc, auc_std, auc_stderr, ranking_Frac, local_auc, local_auc_std, '
                        'local_auc_stderr, local_pr, frac_to_desRet, frac_to_desRet_stderr\n')
            else:
                f.write('gamma, clf, auc, local_pr, frac_to_desRet\n')

    fig = None
    for gamma_num, gamma in enumerate(search_range):  # gamma is a single value in the search range
        voya_logger.info('Running classifiers for gamma={} ({}/{})'.format(gamma, gamma_num + 1, len(search_range)))
        config.update({"u_to_p_ratio": gamma})

        if config['constant_test_train']:
            config["train_df"] = datasetup.downsample_pu_df(df_train, config["u_to_p_ratio"])

        results_dict = run_benchmark(config, classifiers, classifiers_gridparameters)

        # Output
        csv_output = []
        for clf_name in classifiers.keys():
            if config['auc_folds'] > 1:
                csv_row = (gamma,
                           results_dict[clf_name]['auc_folds'],
                           clf_name,
                           results_dict[clf_name]['auc_score'],
                           results_dict[clf_name]['auc_std'],
                           results_dict[clf_name]['auc_std_err'],
                           results_dict[clf_name]['ranking_Frac'],
                           results_dict[clf_name]['local_auc_score'],
                           results_dict[clf_name]['local_auc_std'],
                           results_dict[clf_name]['local_auc_std_err'],
                           results_dict[clf_name]['local_pr'],
                           results_dict[clf_name]['frac_to_ret'],
                           results_dict[clf_name]['frac_to_ret_stderr'])
            else:
                csv_row = (gamma,
                           clf_name,
                           results_dict[clf_name]['auc_score'],
                           results_dict[clf_name]['local_pr'],
                           results_dict[clf_name]['frac_to_ret'])

            csv_output.append(csv_row)

        with open(save_file, 'ab') as f:
            csv_f = csv.writer(f)
            csv_f.writerows(csv_output)

        if config['search_live_plot']:
            plt.clf()
            fig = voya_plotter.pu_search_result(save_file, fig)
            plt.draw()

        if out_path is not None:
            voya_logger.info('Generating prVSranking all methods plot')
            voya_plotter.prVSranking_methodComparison(results_dict)
            plt.savefig(os.path.join(out_path, 'prVsRankComparison__Gamma__{}.png'.format(gamma)),
                        bbox_inches='tight')
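

# Illustrative configuration sketch (assumed values, not part of the original
# module): a minimal config to sweep run_search_benchmark over three
# unlabelled-to-positive ratios with a fixed test/train split, appending one CSV
# row per classifier per gamma to the results file. Paths and fractions below are
# hypothetical.
def _example_search_config():
    return {
        'data_file': 'data/pu_labelled_set.csv',  # hypothetical PU-labelled input (labels 1, -1, 0)
        'out_path': 'output/pu_search',
        'pu_learning': True,
        'search_results_file': 'output/pu_search/search_results.csv',
        'soft_search_run': False,      # start a fresh results file rather than building on an old one
        'search_range': (0.5, 1, 2),   # unlabelled-to-positive ratios (gamma) to sweep
        'constant_test_train': True,   # split once, then only resample the training set per gamma
        'search_live_plot': False,
        'ranking_Frac': 0.1,           # assumed value, passed through to run_benchmark
        'desired_retention': 0.9,      # assumed value, required when grid search is used
    }

# Usage would mirror run_benchmark, e.g.:
#     run_search_benchmark(_example_search_config(), classifiers, classifiers_gridparameters)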