import numpy as np

from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
from ml.tools.dboost.TestDBoost import test_multiple_sizes_mixture

# Benchmark script: run the dBoost mixture-model error-detection experiment on
# the BlackOak (uppercase) dataset for several labelling-budget sizes, logging
# results to `log_file`.
data = BlackOakDataSetUppercase()

steps = 100  # passed straight through to test_multiple_sizes_mixture
N = 10       # repetitions per budget size

# Labelling budgets expressed in *cells*; converted to row counts below.
sizes = np.array([50, 100, 150, 200], dtype=float)
print(sizes)

# Convert the cell budget into rows.  dirty_column_fraction is
# dirty_cols / total_cols, so the two in-place divisions together amount to
#     sizes = sizes / number_of_dirty_columns
# i.e. the cell budget is spread evenly over the columns that contain errors.
dirty_column_fraction = data.get_number_dirty_columns() / float(data.shape[1])
sizes /= dirty_column_fraction
sizes /= float(data.shape[1])
print(sizes)

row_sizes = np.array(sizes, dtype=int)  # budgets in rows (truncated to int)

log_file = "/home/felix/ExampleDrivenErrorDetection/log/dBoost/BlackOakUppercase_mix_new.txt"

test_multiple_sizes_mixture(data, steps, N, row_sizes, log_file)
print datasets[i] N_datasets = 7 ''' log_folder = "unique_batch" #"unique" #dataset = HospitalHoloClean() #BlackOakDataSetUppercase() #future_steps = 60 #BlackOak = 7, Flights = 9 dataset = BlackOakDataSetUppercase() future_steps = 7 #BlackOak = 7, Flights = 9 n = dataset.get_number_dirty_columns() best_sum_total_f = {} best_col_seq = {} for d in range(10): file_path = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/" + log_folder + "/log_progress_"+ dataset.name +"_" + str(d) +".csv" x, fp, fn, tp = read_csv1(file_path, None) estimated_scores = get_estimated_tp_fp_fn(x, n, dataset,feature_names, which_features_to_use) print "train: " + str(x.shape[0]) print "features: " + str(all_features) assert x.shape[1] == all_features
for key, value in sorted_x: labels.append(key) score.append(value) t += 1 if t == 25: break if enable_plotting: ind = np.arange(len(score)) plt.barh(ind, score, align='center', alpha=0.5) plt.yticks(ind, labels) plt.show() y_pred = final.predict(mat) nr_columns = model_for_dataset.get_number_dirty_columns() N_datasets_test = 7 X_test = [] y_test = [] pred_test = [] for ndata in range(N_datasets_test): file_path_test = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/7/log_progress_" + model_for_dataset.name + "_" + str( ndata) + ".csv" t_x, t_y = read_csv1(file_path_test, None) print t_x.shape endfnew = np.zeros(nr_columns)