def run_and_evaluate_dBoost(params):
    # Run one dBoost configuration and cache its detected-error matrix on disk,
    # skipping the run if the result file already exists.
    model = params['model']
    final_file = generate_dBoost_result_file_name(
        model, params['data'], params['parameter_grid_dict'], params['keys'])
    if not os.path.isfile(final_file):
        model(**params['parameter_grid_dict'])
        result_file = str(params['parameter_grid_dict']['result_file'])
        # The file paths are bookkeeping, not model parameters; drop them so
        # the grid dict only describes the model configuration.
        del params['parameter_grid_dict']['sample_file']
        del params['parameter_grid_dict']['result_file']
        run = DBoostMe(params['data'], result_file)
        run.write_detected_matrix(final_file)
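# A sketch of how run_and_evaluate_dBoost might be called (not from the
# original file). The grid values are illustrative assumptions, and the keyword
# names assume the run_* helpers accept 'sample_file'/'result_file' plus their
# model parameters as keyword arguments:
#
# example_params = {
#     'model': run_gaussian_stat,  # hypothetical choice of dBoost runner
#     'data': BlackOakDataSet(),
#     'keys': ['gaussian', 'statistical'],
#     'parameter_grid_dict': {
#         'gaussian': 1.5,
#         'statistical': 0.5,
#         'sample_file': "/tmp/data_sample.csv",
#         'result_file': "/tmp/dboostres.csv",
#     },
# }
# run_and_evaluate_dBoost(example_params)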
def search_mixture_stat(data, data_sample, data_sample_ground_truth,
                        sample_file, result_file, n_subpops_s, threshold_s,
                        statistical_range, write_out=False):
    # Grid search over dBoost's mixture-model parameters (number of
    # subpopulations, threshold, statistical epsilon); keeps the configuration
    # with the highest F-score on the labeled sample.
    best_params = {}
    best_fscore = 0.0
    precision = 0.0
    recall = 0.0
    for p in n_subpops_s:
        for t in threshold_s:
            for s in statistical_range:
                run_mixture_stat(p, t, s, sample_file, result_file)
                our_sample_data = DataSetBasic(
                    data.name + " random" + str(data_sample.shape[0]),
                    data_sample, data_sample_ground_truth)
                run = DBoostMe(our_sample_data, result_file)
                current_fscore = run.calculate_total_fscore()
                current_precision = run.calculate_total_precision()
                current_recall = run.calculate_total_recall()
                if write_out:
                    run.write_detected_matrix(
                        Config.get("logging.folder") + "/out/dboost" +
                        '/dboost_' + data.name + '_mixture_subpop' + str(p) +
                        '_threshold_' + str(t) + '_stat_' + str(s) + '.npy')
                print "n_subpops: " + str(p) + " threshold: " + str(t) + \
                    " --statistical " + str(s)
                print "Fscore: " + str(current_fscore)
                # Reuse the cached scores instead of recomputing them.
                print "Precision: " + str(current_precision)
                print "Recall: " + str(current_recall)
                if current_fscore >= best_fscore:
                    best_fscore = current_fscore
                    precision = current_precision
                    recall = current_recall
                    best_params['n_subpops'] = p
                    best_params['threshold'] = t
                    best_params['statistical'] = s
    return best_params, best_fscore, precision, recall
def search_histogram_stat(data, data_sample, data_sample_ground_truth,
                          sample_file, result_file, peak_s, outlier_s,
                          statistical_range, write_out=False):
    # Grid search over dBoost's histogram-model parameters (peak threshold,
    # outlier threshold, statistical epsilon); keeps the best-F-score setting.
    best_params = {}
    best_fscore = 0.0
    precision = 0.0
    recall = 0.0
    for p in peak_s:
        for o in outlier_s:
            for s in statistical_range:
                run_histogram_stat(p, o, s, sample_file, result_file)
                our_sample_data = DataSetBasic(
                    data.name + " random" + str(data_sample.shape[0]),
                    data_sample, data_sample_ground_truth)
                run = DBoostMe(our_sample_data, result_file)
                current_fscore = run.calculate_total_fscore()
                current_precision = run.calculate_total_precision()
                current_recall = run.calculate_total_recall()
                if write_out:
                    run.write_detected_matrix(
                        Config.get("logging.folder") + "/out/dboost" +
                        '/dboost_histogram_' + data.name + '_peak' + str(p) +
                        '_outlier_' + str(o) + '_stat_' + str(s) + '.npy')
                print "peak: " + str(p) + " outlier: " + str(o) + \
                    " --statistical " + str(s)
                print "Fscore: " + str(current_fscore)
                # Reuse the cached scores instead of recomputing them.
                print "Precision: " + str(current_precision)
                print "Recall: " + str(current_recall)
                if current_fscore >= best_fscore:
                    best_fscore = current_fscore
                    precision = current_precision
                    recall = current_recall
                    best_params['peak'] = p
                    best_params['outlier'] = o
                    best_params['statistical'] = s
    return best_params, best_fscore, precision, recall
def search_gaussian_stat(data, data_sample, data_sample_ground_truth,
                         sample_file, result_file, gaussian_range,
                         statistical_range, write_out=False):
    # Grid search over dBoost's Gaussian model (number of standard deviations)
    # combined with the statistical epsilon; keeps the best-F-score setting.
    best_params = {}
    best_fscore = 0.0
    precision = 0.0
    recall = 0.0
    for g in gaussian_range:
        for s in statistical_range:
            run_gaussian_stat(g, s, sample_file, result_file)
            our_sample_data = DataSetBasic(
                data.name + " random" + str(data_sample.shape[0]),
                data_sample, data_sample_ground_truth)
            run = DBoostMe(our_sample_data, result_file)
            current_fscore = run.calculate_total_fscore()
            current_precision = run.calculate_total_precision()
            current_recall = run.calculate_total_recall()
            if write_out:
                run.write_detected_matrix(
                    Config.get("logging.folder") + "/out/dboost" +
                    '/dboost_gaussian_' + data.name + '_gaussian' + str(g) +
                    '_stat_' + str(s) + '.npy')
            print "--gaussian " + str(g) + " --statistical " + str(s)
            print "Fscore: " + str(current_fscore)
            # Reuse the cached scores instead of recomputing them.
            print "Precision: " + str(current_precision)
            print "Recall: " + str(current_recall)
            if current_fscore >= best_fscore:
                best_fscore = current_fscore
                precision = current_precision
                recall = current_recall
                best_params['gaussian'] = g
                best_params['statistical'] = s
    return best_params, best_fscore, precision, recall
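# Usage sketch for the three search_* grid searches above (illustrative, not
# from the original file); search_mixture_stat and search_histogram_stat follow
# the same calling pattern with their own parameter ranges. The ranges below
# are assumptions, not tuned values:
#
# best_params, best_fscore, precision, recall = search_gaussian_stat(
#     data, data_sample, data_sample_ground_truth,
#     "/tmp/data_sample.csv", "/tmp/dboostres.csv",
#     gaussian_range=[1.0, 1.5, 2.0, 2.5, 3.0],
#     statistical_range=[0.5, 1.0])
# print "best params: " + str(best_params) + " -> Fscore: " + str(best_fscore)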
# Run dBoost once on the sample with fixed parameters and time the whole call.
data_sample_ground_truth = data.matrix_is_error[random_index]
sample_file = "/tmp/data_sample.csv"
result_file = "/tmp/dboostres.csv"
data_sample.to_csv(sample_file, index=False)

total_start_time = time.time()

gaus = 0.8
stat = 0.5
command = "python3 /home/felix/dBoost/dboost/dboost-stdin.py -F ',' " \
          "--gaussian " + str(gaus) + " --statistical " + str(stat) + \
          " " + sample_file + " > " + result_file
os.system(command)

our_sample_data = DataSetBasic(data.name + " random" + str(n),
                               data_sample, data_sample_ground_truth)
run = DBoostMe(our_sample_data, result_file)

print "--gaussian " + str(gaus) + " --statistical " + str(stat) + \
    " -> Fscore: " + str(run.calculate_total_fscore())
print "Precision: " + str(run.calculate_total_precision())
print "Recall: " + str(run.calculate_total_recall())

runtime = time.time() - total_start_time
print runtime
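# Side note (a sketch, not the original code): the shell command above could
# also be expressed with subprocess to avoid building the command line by
# string concatenation; same dboost-stdin.py flags as above:
#
# import subprocess
# with open(result_file, "w") as out:
#     subprocess.call(
#         ["python3", "/home/felix/dBoost/dboost/dboost-stdin.py",
#          "-F", ",", "--gaussian", str(gaus), "--statistical", str(stat),
#          sample_file],
#         stdout=out)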
from ml.datasets.blackOak.BlackOakDataSet import BlackOakDataSet
from ml.tools.dboost.DBoostMe import DBoostMe

tool = DBoostMe(BlackOakDataSet(), "/tmp/test_format.csv")

print "Fscore: " + str(tool.calculate_total_fscore())
print "Precision: " + str(tool.calculate_total_precision())
print "Recall: " + str(tool.calculate_total_recall())

#data = BlackOakDataSet()
#data.dirty_pd.to_csv("blackOak_clear.csv",index=False, quotechar="\"")