Example #1
0
def run_and_evaluate_dBoost(params):
    """Run one dBoost model configuration and persist its detection matrix.

    params: dict with keys
        'model'               -- callable that runs dBoost with the grid kwargs
        'data'                -- dataset object handed to DBoostMe
        'parameter_grid_dict' -- kwargs for the model; must contain
                                 'sample_file' and 'result_file' entries
        'keys'                -- parameter names used to build the result file name

    Side effect: removes 'sample_file' and 'result_file' from
    params['parameter_grid_dict'] after the run.
    """
    model = params['model']

    final_file = generate_dBoost_result_file_name(
        model, params['data'], params['parameter_grid_dict'], params['keys'])

    # Cheap resume: skip the run entirely if this configuration's result
    # file already exists on disk.
    if not os.path.isfile(final_file):
        model(**params['parameter_grid_dict'])

        result_file = str(params['parameter_grid_dict']['result_file'])
        # Drop the file-path entries so the remaining dict holds only
        # model parameters.
        del params['parameter_grid_dict']['sample_file']
        del params['parameter_grid_dict']['result_file']

        run = DBoostMe(params['data'], result_file)
        run.write_detected_matrix(final_file)
Example #2
0
def search_mixture_stat(data,
                        data_sample,
                        data_sample_ground_truth,
                        sample_file,
                        result_file,
                        n_subpops_s,
                        threshold_s,
                        statistical_range,
                        write_out=False):
    """Grid-search dBoost's Gaussian-mixture model on a data sample.

    Iterates the full cross product of n_subpops_s x threshold_s x
    statistical_range, runs dBoost for each combination and scores the
    detections against data_sample_ground_truth. Ties on F-score are
    resolved in favour of the later combination (>=).

    Returns (best_params, best_fscore, precision, recall), where precision
    and recall belong to the best-scoring combination.
    """
    best_params = {}
    best_fscore = 0.0
    precision = 0.0
    recall = 0.0

    for p in n_subpops_s:
        for t in threshold_s:
            for s in statistical_range:
                run_mixture_stat(p, t, s, sample_file, result_file)

                our_sample_data = DataSetBasic(
                    data.name + " random" + str(data_sample.shape[0]),
                    data_sample, data_sample_ground_truth)

                run = DBoostMe(our_sample_data, result_file)

                current_fscore = run.calculate_total_fscore()
                current_precision = run.calculate_total_precision()
                current_recall = run.calculate_total_recall()

                if write_out:
                    run.write_detected_matrix(
                        Config.get("logging.folder") + "/out/dboost" +
                        '/dboost_' + data.name + '_mixture_subpop' + str(p) +
                        '_threshold_' + str(t) + '_stat_' + str(s) + '.npy')

                # Single-argument print(...) behaves identically in
                # Python 2 and Python 3.
                print("n_subpops: " + str(p) + " threshold: " + str(
                    t) + " --statistical " + str(s))
                print("Fscore: " + str(current_fscore))
                # Reuse the cached metrics instead of recomputing them.
                print("Precision: " + str(current_precision))
                print("Recall: " + str(current_recall))

                if current_fscore >= best_fscore:
                    best_fscore = current_fscore
                    precision = current_precision
                    recall = current_recall
                    best_params['n_subpops'] = p
                    best_params['threshold'] = t
                    best_params['statistical'] = s

    return best_params, best_fscore, precision, recall
Example #3
0
def search_histogram_stat(data,
                          data_sample,
                          data_sample_ground_truth,
                          sample_file,
                          result_file,
                          peak_s,
                          outlier_s,
                          statistical_range,
                          write_out=False):
    """Grid-search dBoost's histogram model on a data sample.

    Iterates the full cross product of peak_s x outlier_s x
    statistical_range, runs dBoost for each combination and scores the
    detections against data_sample_ground_truth. Ties on F-score are
    resolved in favour of the later combination (>=).

    Returns (best_params, best_fscore, precision, recall), where precision
    and recall belong to the best-scoring combination.
    """
    best_params = {}
    best_fscore = 0.0
    precision = 0.0
    recall = 0.0

    for p in peak_s:
        for o in outlier_s:
            for s in statistical_range:
                run_histogram_stat(p, o, s, sample_file, result_file)

                our_sample_data = DataSetBasic(
                    data.name + " random" + str(data_sample.shape[0]),
                    data_sample, data_sample_ground_truth)

                run = DBoostMe(our_sample_data, result_file)

                current_fscore = run.calculate_total_fscore()
                current_precision = run.calculate_total_precision()
                current_recall = run.calculate_total_recall()

                if write_out:
                    run.write_detected_matrix(
                        Config.get("logging.folder") + "/out/dboost" +
                        '/dboost_histogram_' + data.name + '_peak' + str(p) +
                        '_outlier_' + str(o) + '_stat_' + str(s) + '.npy')

                # Single-argument print(...) behaves identically in
                # Python 2 and Python 3.
                print("peak: " + str(p) + " outlier: " + str(
                    o) + " --statistical " + str(s))
                print("Fscore: " + str(current_fscore))
                # Reuse the cached metrics instead of recomputing them.
                print("Precision: " + str(current_precision))
                print("Recall: " + str(current_recall))

                if current_fscore >= best_fscore:
                    best_fscore = current_fscore
                    precision = current_precision
                    recall = current_recall
                    best_params['peak'] = p
                    best_params['outlier'] = o
                    best_params['statistical'] = s

    return best_params, best_fscore, precision, recall
Example #4
0
def search_gaussian_stat(data,
                         data_sample,
                         data_sample_ground_truth,
                         sample_file,
                         result_file,
                         gaussian_range,
                         statistical_range,
                         write_out=False):
    """Grid-search dBoost's Gaussian model on a data sample.

    Iterates the full cross product of gaussian_range x statistical_range,
    runs dBoost for each combination and scores the detections against
    data_sample_ground_truth. Ties on F-score are resolved in favour of
    the later combination (>=).

    Returns (best_params, best_fscore, precision, recall), where precision
    and recall belong to the best-scoring combination.
    """
    best_params = {}
    best_fscore = 0.0
    precision = 0.0
    recall = 0.0

    for g in gaussian_range:
        for s in statistical_range:
            run_gaussian_stat(g, s, sample_file, result_file)

            our_sample_data = DataSetBasic(
                data.name + " random" + str(data_sample.shape[0]), data_sample,
                data_sample_ground_truth)

            run = DBoostMe(our_sample_data, result_file)

            current_fscore = run.calculate_total_fscore()
            current_precision = run.calculate_total_precision()
            current_recall = run.calculate_total_recall()

            if write_out:
                run.write_detected_matrix(
                    Config.get("logging.folder") + "/out/dboost" +
                    '/dboost_gausian_' + data.name + '_gausian' + str(g) +
                    '_stat_' + str(s) + '.npy')

            # Single-argument print(...) behaves identically in
            # Python 2 and Python 3.
            print("--gaussian " + str(g) + " --statistical " + str(s))
            print("Fscore: " + str(current_fscore))
            # Reuse the cached metrics instead of recomputing them.
            print("Precision: " + str(current_precision))
            print("Recall: " + str(current_recall))

            if current_fscore >= best_fscore:
                best_fscore = current_fscore
                precision = current_precision
                recall = current_recall
                best_params['gaussian'] = g
                best_params['statistical'] = s

    return best_params, best_fscore, precision, recall
# Single dBoost run: export the sample, invoke dboost-stdin.py with a fixed
# gaussian/statistical configuration, score and time the result.

# Ground-truth error mask for the sampled rows (data/random_index come from
# earlier in the script).
data_sample_ground_truth = data.matrix_is_error[random_index]

sample_file = "/tmp/data_sample.csv"
result_file = "/tmp/dboostres.csv"

data_sample.to_csv(sample_file, index=False)

total_start_time = time.time()

gaus = 0.8
stat = 0.5

# NOTE(review): shell command built by string concatenation; acceptable for
# these hard-coded values, but prefer subprocess.run([...], shell=False) if
# any path ever becomes user-supplied. The trailing redirect captures
# dBoost's stdout into result_file.
command = "python3 /home/felix/dBoost/dboost/dboost-stdin.py -F ','  --gaussian " + str(
    gaus) + " --statistical " + str(
        stat) + " " + sample_file + " > " + result_file

os.system(command)

our_sample_data = DataSetBasic(data.name + " random" + str(n), data_sample,
                               data_sample_ground_truth)

run = DBoostMe(our_sample_data, result_file)

# Single-argument print(...) behaves identically in Python 2 and Python 3.
print("--gaussian " + str(gaus) + " --statistical " + str(
    stat) + " -> Fscore: " + str(run.calculate_total_fscore()))
print("Precision: " + str(run.calculate_total_precision()))
print("Recall: " + str(run.calculate_total_recall()))

runtime = (time.time() - total_start_time)

print(runtime)
from ml.datasets.blackOak.BlackOakDataSet import BlackOakDataSet
from ml.tools.dboost.DBoostMe import DBoostMe

# Smoke test: score a pre-computed dBoost result file against the BlackOak
# dataset's ground truth.
tool = DBoostMe(BlackOakDataSet(), "/tmp/test_format.csv")

# Single-argument print(...) behaves identically in Python 2 and Python 3.
print("Fscore: " + str(tool.calculate_total_fscore()))
print("Precision: " + str(tool.calculate_total_precision()))
print("Recall: " + str(tool.calculate_total_recall()))

#data = BlackOakDataSet()

#data.dirty_pd.to_csv("blackOak_clear.csv",index=False, quotechar="\"")