# Assumed import (the module path mirrors the project's ml.datasets
# package layout used in the imports further below; adjust if it differs):
from ml.datasets.flights.FlightHoloClean import FlightHoloClean

# Select the experiment log folder:
# log_folder = "unigram_metadata_naivebayes"
# log_folder = "unigram_metadata_linearsvm"
# log_folder = "food"
log_folder = "deep_all"

# Select the dataset. Alternatives used in other experiments:
# FoodHoloClean(), BlackOakDataSetUppercase(), HospitalHoloClean()
dataset = FlightHoloClean()
# future_steps = 60  # BlackOak = 7, Flights = 9
'''
from ml.datasets.BartDataset.BartDataSet import BartDataset
dataset = BartDataset(BlackOakDataSetUppercase(), "CityFD_20percent")
'''
future_steps = 20  # previously 60

n = dataset.get_number_dirty_columns()  # number of columns containing errors

best_sum_total_f = {}
best_col_seq = {}

for d in range(10):  # one progress log per run (runs 0-9)
    file_path = ("/home/felix/ExampleDrivenErrorDetection/progress_log_data/"
                 + log_folder + "/log_progress_" + dataset.name
                 + "_" + str(d) + ".csv")
    x, fp, fn, tp = read_csv1(file_path, None)

    certainty_sum = get_all_certainty_sum(x, feature_names)  # feature_names: assumed defined by the surrounding script

    # print(certainty_sum)

    print("train: " + str(x.shape[0]))
    print("features: " + str(all_features))  # all_features: assumed defined elsewhere in the script
Example #2
import os
import time

import numpy as np  # np is used below but was not imported in the snippet

from ml.tools.dboost.TestDBoost import toLatex
from ml.configuration.Config import Config
# Assumed imports: test_multiple_sizes_mixture is taken to live in the same
# dboost test module as toLatex, and the dataset path mirrors the project's
# ml.datasets package layout; adjust if the code is organized differently.
from ml.tools.dboost.TestDBoost import test_multiple_sizes_mixture
from ml.datasets.flights.FlightHoloClean import FlightHoloClean

data = FlightHoloClean()

steps = 100  # grid resolution for the search
N = 1  # number of runs (previously 10)

defined_range_labeled_cells = [100]  # e.g. [20, 40, 60, 80, 100, 120]

sizes = np.array(defined_range_labeled_cells, dtype=float)  # in cells

print(sizes)
# Convert the labeled-cell budget into row counts: scale up by the fraction
# of dirty columns, then divide by the number of columns per row.
dirty_column_fraction = data.get_number_dirty_columns() / float(data.shape[1])
sizes /= dirty_column_fraction  # cells, rescaled to cover all columns
sizes /= float(data.shape[1])  # cells -> rows
row_sizes = np.array(sizes, dtype=int)  # in rows
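# Worked example (hypothetical numbers, for illustration only): with 6
# columns of which 3 are dirty, dirty_column_fraction = 0.5; a budget of
# 100 labeled cells then becomes 100 / 0.5 = 200 cells spread over all
# columns, i.e. 200 / 6 ≈ 33 labeled rows.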

path_folder = Config.get("logging.folder") + "/out/dboost"
log_file = path_folder + "/Flights_mix_new_" + str(time.time()) + ".txt"

if not os.path.exists(path_folder):
    os.makedirs(path_folder)

# Run the dBoost mixture-model detector for each labeled-row budget and
# collect averaged runtimes and quality metrics across the N runs:
avg_times, avg_fscores, avg_precision, avg_recall, std_fscores, std_precision, std_recall = test_multiple_sizes_mixture(
    data, steps, N, row_sizes, log_file)

toLatex(defined_range_labeled_cells, avg_times, avg_fscores, avg_precision,
        avg_recall, std_fscores, std_precision, std_recall, log_file)
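# toLatex is assumed to write a LaTeX-formatted summary table of the averaged
# metrics (runtime, F-score, precision, recall and their standard deviations)
# to log_file.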