def load_model(dataSet, classifier):
    """Load the four pre-trained "potential" models for a dataset/classifier pair.

    Parameters:
        dataSet: dataset object whose ``name`` selects the log-file stem.
        classifier: classifier class/object whose ``name`` is part of the file name.

    Returns:
        Tuple ``(tp_model, fpfn_model, delta_tp_model, delta_fpfn_model)``
        unpickled from the directory configured under "column.potential.models".

    Raises:
        KeyError: if the dataset has no registered log-file stem.
        IOError: if a model pickle file is missing.
    """
    # Map dataset names to the file-name stem used when the models were trained.
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # not yet
    dataset_log_files[Salary().name] = "hospital"  # be careful
    dataset_log_files[Book().name] = "hospital"  # be careful

    potential_model_dir = Config.get("column.potential.models")
    # Shared suffix, e.g. "hospital_XGBoost.p". Note: the historical naming
    # scheme has NO separator between the model prefix and the dataset stem.
    suffix = dataset_log_files[dataSet.name] + "_" + classifier.name + ".p"

    def _load(prefix):
        # Open in binary mode (required for pickle data) and close the handle
        # deterministically; the original left the file objects unclosed.
        with open(potential_model_dir + "/" + prefix + suffix, "rb") as model_file:
            return pickle.load(model_file)

    tp_model = _load("tp_model")
    fpfn_model = _load("fpfn_model")
    delta_tp_model = _load("delta_tp_model")
    delta_fpfn_model = _load("delta_fpfn_model")

    return tp_model, fpfn_model, delta_tp_model, delta_fpfn_model
def load_model(dataSet, classifier):
    """Load the single pre-trained potential model for a dataset/classifier pair.

    Parameters:
        dataSet: dataset object whose ``name`` selects the log-file stem.
        classifier: classifier class/object whose ``name`` is part of the file name.

    Returns:
        The unpickled model object from the "column.potential.models" directory.

    Raises:
        KeyError: if the dataset has no registered log-file stem.
        IOError: if the model pickle file is missing.
    """
    # Map dataset names to the file-name stem used when the model was trained.
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"

    potential_model_dir = Config.get("column.potential.models")
    # No separator after "model" — matches the historical file-naming scheme.
    model_path = (potential_model_dir + "/model"
                  + dataset_log_files[dataSet.name] + "_" + classifier.name + ".p")
    # Open in binary mode (required for pickle) and close the handle
    # deterministically; the original left the file object unclosed.
    with open(model_path, "rb") as model_file:
        return pickle.load(model_file)
def load_model(dataSet):
    """Load the pre-trained XGBoost potential model for the given dataset.

    Parameters:
        dataSet: dataset object whose ``name`` selects the log-file stem.

    Returns:
        The unpickled model from the hard-coded
        ``unique_false_current_hist`` model directory.

    Raises:
        KeyError: if the dataset has no registered log-file stem.
        IOError: if the model pickle file is missing.
    """
    # Map dataset names to the file-name stem used when the model was trained.
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # not yet
    #dataset_log_files[Salary().name] = "hospital" # be careful
    #dataset_log_files[Book().name] = "hospital" # be careful

    # NOTE(review): hard-coded path — presumably a local experiment directory;
    # consider moving to Config like the sibling loaders.
    potential_model_dir = '/home/felix/ExampleDrivenErrorDetection/potential models/unique_false_current_hist'

    # No separator after "model" — matches the historical file-naming scheme.
    model_path = (potential_model_dir + "/model"
                  + dataset_log_files[dataSet.name] + "_" + "XGBoost" + ".p")
    # Open in binary mode (required for pickle) and close the handle
    # deterministically; the original left the file object unclosed.
    with open(model_path, "rb") as model_file:
        return pickle.load(model_file)
def load_model(dataSet, classifier):
    """Load the pre-trained classification potential model for a dataset/classifier pair.

    Parameters:
        dataSet: dataset object whose ``name`` selects the log-file stem.
        classifier: classifier class/object whose ``name`` is part of the file name.

    Returns:
        The unpickled model from the hard-coded ``classification`` model directory.

    Raises:
        KeyError: if the dataset has no registered log-file stem.
        IOError: if the model pickle file is missing.
    """
    # Map dataset names to the file-name stem used when the model was trained.
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # not yet
    dataset_log_files[Salary().name] = "hospital"  # be careful
    dataset_log_files[Book().name] = "hospital"  # be careful

    #potential_model_dir = Config.get("column.potential.models")
    # NOTE(review): Config lookup is bypassed in favour of a local path;
    # restore the Config.get(...) line when running outside this machine.
    potential_model_dir = "/home/felix/ExampleDrivenErrorDetection/potential models/classification"

    # No separator after "model" — matches the historical file-naming scheme.
    model_path = (potential_model_dir + "/model"
                  + dataset_log_files[dataSet.name] + "_" + classifier.name + ".p")
    # Open in binary mode (required for pickle) and close the handle
    # deterministically; the original left the file object unclosed.
    with open(model_path, "rb") as model_file:
        return pickle.load(model_file)
def load_model(dataSet):
    """Load the XGBoost tp/fp/fn potential models from the simulation directory.

    Parameters:
        dataSet: dataset object. NOTE(review): the file names below do not
            depend on it — all callers receive the same three simulation
            models regardless of dataset; confirm this is intended.

    Returns:
        Tuple ``(tp_model, fp_model, fn_model)``.

    Raises:
        IOError: if a model pickle file is missing.
    """
    # Mapping retained from the sibling loaders; unused for file-name
    # construction here. Kept because instantiating the dataset classes may
    # have side effects the callers rely on — TODO confirm and remove.
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # not yet
    # dataset_log_files[Salary().name] = "hospital" # be careful
    # dataset_log_files[Book().name] = "hospital" # be careful

    #potential_model_dir = '/home/felix/ExampleDrivenErrorDetection/potential models/current_total_f'
    potential_model_dir = '/home/felix/ExampleDrivenErrorDetection/potential models/simulation100data'

    def _load(file_name):
        # Open in binary mode (required for pickle) and close the handle
        # deterministically; the original left the file objects unclosed.
        with open(potential_model_dir + "/" + file_name, "rb") as model_file:
            return pickle.load(model_file)

    tp_model = _load("tp_model_" + "XGBoost" + ".p")
    fp_model = _load("fp_model_" + "XGBoost" + ".p")
    fn_model = _load("fn_model_XGBoost.p")

    return tp_model, fp_model, fn_model
# NOTE(review): collapsed script fragment — begins with an orphan `break`
# (its enclosing loop is outside this view) and ends mid-list-literal.
# Visible intent: after a commented-out (''') block, configure 10 synthetic
# datasets derived from BlackOak with 4 columns each and error fractions
# from 0.01 to 0.9, all using ReplaceError. TODO: restore the original
# multi-line structure before editing logic.
break print "datasets used for training:" for i in range(len(datasets)): print datasets[i] N_datasets = 7 ''' log_folder = "synthetic_unique_batch" #"unique" from ml.datasets.synthetic.Synthetic import Synthetic from ml.datasets.synthetic.ReplaceError import ReplaceError rows = 2000 datasets = [ BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase() ] columns = [4, 4, 4, 4, 4, 4, 4, 4, 4, 4] error_fraction = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] error_types = [ ReplaceError, ReplaceError, ReplaceError, ReplaceError, ReplaceError, ReplaceError, ReplaceError, ReplaceError, ReplaceError, ReplaceError
# NOTE(review): collapsed script fragment — ends mid-loop and references
# names not visible here (`read_csv1`, `feature_names`,
# `which_features_to_use`, `get_estimated_tp_fp_fn`). Visible intent:
# iterate over 10 progress-log CSVs for the BlackOak dataset and compute
# estimated tp/fp/fn scores per run. `future_steps = 7` — per the inline
# comment, BlackOak uses 7, Flights 9. TODO: restore original line structure.
print "datasets used for training:" for i in range(len(datasets)): print datasets[i] N_datasets = 7 ''' log_folder = "unique_batch" #"unique" #dataset = HospitalHoloClean() #BlackOakDataSetUppercase() #future_steps = 60 #BlackOak = 7, Flights = 9 dataset = BlackOakDataSetUppercase() future_steps = 7 #BlackOak = 7, Flights = 9 n = dataset.get_number_dirty_columns() best_sum_total_f = {} best_col_seq = {} for d in range(10): file_path = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/" + log_folder + "/log_progress_"+ dataset.name +"_" + str(d) +".csv" x, fp, fn, tp = read_csv1(file_path, None) estimated_scores = get_estimated_tp_fp_fn(x, n, dataset,feature_names, which_features_to_use)
# NOTE(review): collapsed script fragment — ends mid-`for` statement.
# Visible intent: map classifier names to progress-log directories, map
# dataset names to log-file stems, then build the training-dataset list by
# removing the dataset the model is being evaluated on (leave-one-out over
# Hospital/BlackOak/Flight). TODO: restore original line structure.
enable_plotting = True classifier_log_paths = {} classifier_log_paths[ XGBoostClassifier. name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost" classifier_log_paths[ LinearSVMClassifier. name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm" classifier_log_paths[ NaiveBayesClassifier. name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes" dataset_log_files = {} dataset_log_files[HospitalHoloClean().name] = "hospital" dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak" dataset_log_files[FlightHoloClean().name] = "flight" #dataset_log_files[Salary().name] = "salary1" classifier_to_use = XGBoostClassifier model_for_dataset = HospitalHoloClean() datasets = [HospitalHoloClean(), BlackOakDataSetUppercase(), FlightHoloClean()] for i in range(len(datasets)): if datasets[i].name == model_for_dataset.name: datasets.pop(i) break print "datasets used for training:" for i in range(len(datasets)):
# NOTE(review): collapsed script fragment — ends mid-list-literal.
# Visible intent: like the sibling setup chunks, but with a six-dataset
# leave-one-out pool (Hospital/BlackOak/Flight/Book/Salary/Restaurant),
# extra flags `cutting` and `use_potential`, and all classifier log paths
# commented out. TODO: restore original line structure.
enable_plotting = True cutting = True use_potential = False classifier_log_paths = {} #classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost" #classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm" #classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes" #classifier_log_paths[XGBoostClassifier.name] = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/unique" dataset_log_files = {} dataset_log_files[HospitalHoloClean().name] = "hospital" dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak" dataset_log_files[FlightHoloClean().name] = "flight" dataset_log_files[Book().name] = "book" dataset_log_files[Salary().name] = "salaries" dataset_log_files[Restaurant().name] = "restaurant" classifier_to_use = XGBoostClassifier model_for_dataset = HospitalHoloClean() datasets = [ HospitalHoloClean(), BlackOakDataSetUppercase(), FlightHoloClean(), Book(), Salary(), Restaurant()
# NOTE(review): collapsed script fragment — starts mid-script and ends
# mid-statement. Visible intent: build a Synthetic dataset of 2000 rows from
# 10 BlackOak copies (column 4, ReplaceError, error fractions 0.01–0.9,
# seed 41) and set `future_steps = 2*2 + 6`. TODO: restore original line
# structure.
print "datasets used for training:" for i in range(len(datasets)): print datasets[i] N_datasets = 7 ''' log_folder = "synthetic_unique_batch" #"unique" from ml.datasets.synthetic.Synthetic import Synthetic from ml.datasets.synthetic.ReplaceError import ReplaceError rows = 2000 datasets =[BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase()] columns = [4,4,4,4,4,4,4,4,4,4] error_fraction = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] error_types = [ReplaceError, ReplaceError,ReplaceError, ReplaceError,ReplaceError, ReplaceError,ReplaceError, ReplaceError,ReplaceError, ReplaceError] seed_synth = 41 dataSet = Synthetic(rows, datasets, columns, error_fraction, error_types, seed_synth) dataset = dataSet #BlackOakDataSetUppercase() #future_steps = 8+9 #BlackOak = 7, Flights = 9 #future_steps = 14+7 #BlackOak = 7 future_steps = 2*2 + 6 n = dataset.get_number_dirty_columns() best_sum_total_f = {}
import numpy as np from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase from ml.tools.dboost.TestDBoost import test_multiple_sizes_mixture data = BlackOakDataSetUppercase() ''' steps = 100 sizes = [10, 20, 30, 40, 50] N = 5 ''' steps = 100 N = 10 labels = 378 nr_rows = int(float(labels) / data.shape[1]) sizes = np.array([50, 100, 150, 200], dtype=float) # in cells #sizes = np.array([200], dtype=float) # in cells print sizes dirty_column_fraction = data.get_number_dirty_columns() / float(data.shape[1]) sizes /= dirty_column_fraction sizes /= float(data.shape[1]) print sizes row_sizes = np.array(sizes, dtype=int) # in rows log_file = "/home/felix/ExampleDrivenErrorDetection/log/dBoost/BlackOakUppercase_mix_new.txt" test_multiple_sizes_mixture(data, steps, N, row_sizes, log_file)
# NOTE(review): collapsed script fragment — ends mid-loop body (the rest of
# the 1000-iteration generation loop is outside this view). Visible intent:
# collect clean versions of six datasets, then repeatedly pick one at random
# and cap the row count at 2000 — presumably feeding
# generate_bart_config; confirm against the continuation. TODO: restore
# original line structure.
import numpy as np from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase from ml.datasets.flights.FlightHoloClean import FlightHoloClean from ml.datasets.products.Products import Products from ml.datasets.luna.book.Book import Book from ml.datasets.electronics.Electronics import Electronics from ml.datasets.salary_data.Salary import Salary import pandas as pd import csv from ml.data_generator.generate_bart_config import generate_bart_config from shutil import copyfile datasets = [ BlackOakDataSetUppercase().clean_pd.values, FlightHoloClean().clean_pd.values, Salary().clean_pd.values, Electronics().clean_pd.values, Book().clean_pd.values, Products().clean_pd.values ] for n in range(1000): # select dataset dataset_id = np.random.randint(len(datasets)) dataset = datasets[dataset_id] # select number of rows max_rows = 2000 if datasets[dataset_id].shape[0] < max_rows: max_rows = datasets[dataset_id].shape[0]
# NOTE(review): collapsed script fragment — ends mid-list-literal. Visible
# intent: same leave-one-out setup as the sibling chunks but with BlackOak
# as the held-out dataset, an absolute-vs-squared difference flag, and the
# XGBoost log path pointing at progress_log_data/7. TODO: restore original
# line structure.
use_absolute_difference = True # False == Squared / True == Absolute enable_plotting = True classifier_log_paths = {} #classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost" #classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm" #classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes" classifier_log_paths[ XGBoostClassifier. name] = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/7" dataset_log_files = {} dataset_log_files[HospitalHoloClean().name] = "hospital" dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak" dataset_log_files[FlightHoloClean().name] = "flight" dataset_log_files[Book().name] = "book" dataset_log_files[Salary().name] = "salaries" dataset_log_files[Restaurant().name] = "restaurant" classifier_to_use = XGBoostClassifier model_for_dataset = BlackOakDataSetUppercase() datasets = [ HospitalHoloClean(), BlackOakDataSetUppercase(), FlightHoloClean(), Book(), Salary(), Restaurant()
# NOTE(review): collapsed fragment starting mid-class — the tail of what
# appears to be BartDataset.__init__ (assigning dirty/clean frames and
# calling the superclass constructor), a `validate` method, and a
# __main__ smoke test that loads the CityFD_10percent_Remove corruption of
# BlackOak and prints per-column error counts. The enclosing class header
# is outside this view. TODO: restore original line structure.
dirty_pd = dirty_pd_init clean_pd = dataset.clean_pd super(BartDataset, self).__init__(BartDataset.name, dirty_pd, clean_pd) def validate(self): print "validate" if __name__ == '__main__': from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase import numpy as np data = BartDataset(BlackOakDataSetUppercase(), "CityFD_10percent_Remove") ''' from ml.datasets.salary_data.Salary import Salary #outlier data datan = Salary() def convert_to_int(value): return str(int(float(value))) datan.clean_pd[datan.clean_pd.columns[8]] = datan.clean_pd[datan.clean_pd.columns[8]].apply(convert_to_int) data = BartDataset(datan, "Salary_outlier_5percent") ''' error_fractions = np.sum(data.matrix_is_error, axis=0) print data.clean_pd.columns print error_fractions
from sets import Set

from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
from ml.tools.nadeef_detect.FD import FD
from ml.tools.nadeef_detect.NadeefDetect import NadeefDetect
from ml.datasets.BartDataset.BartDataSet import BartDataset

# Run NADEEF error detection on the BART-corrupted BlackOak data using a
# single functional dependency: ZIP determines City.
data = BartDataset(BlackOakDataSetUppercase(), "CityFD_20percent")

rules = [FD(Set(["ZIP"]), "City")]

nadeef = NadeefDetect(
    data,
    rules,
    log_file="/home/felix/SequentialPatternErrorDetection/nadeef/log/Bart.txt")
# NOTE(review): collapsed script fragment (near-duplicate of an identical
# chunk elsewhere in this file) — begins with an orphan `break` and ends
# mid-list-literal. Visible intent: configure 10 synthetic BlackOak-based
# datasets, 4 columns each, error fractions 0.01–0.9, all ReplaceError.
# TODO: restore the original multi-line structure and deduplicate.
break print "datasets used for training:" for i in range(len(datasets)): print datasets[i] N_datasets = 7 ''' log_folder = "synthetic_unique_batch" #"unique" from ml.datasets.synthetic.Synthetic import Synthetic from ml.datasets.synthetic.ReplaceError import ReplaceError rows = 2000 datasets = [ BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase(), BlackOakDataSetUppercase() ] columns = [4, 4, 4, 4, 4, 4, 4, 4, 4, 4] error_fraction = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] error_types = [ ReplaceError, ReplaceError, ReplaceError, ReplaceError, ReplaceError, ReplaceError, ReplaceError, ReplaceError, ReplaceError, ReplaceError
import numpy as np from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase data = BlackOakDataSetUppercase() sample_size = 14 for c in range(data.shape[1]): error_ids = np.where(data.matrix_is_error[:,c])[0] print data.clean_pd.columns[c] print "number of errors: " + str(np.sum(data.matrix_is_error[:,c])) if (len(error_ids) >= sample_size): for i in range(sample_size): print "dirty: " + str(data.dirty_pd.values[error_ids[i],c]) + " -> clean: " + str(data.clean_pd.values[error_ids[i],c]) print ""
# NOTE(review): collapsed script fragment — ends at the `break` inside the
# leave-one-out loop; the script clearly continues past this view. Visible
# intent: same six-dataset setup as the sibling chunks with Hospital held
# out, a `use_change_features` flag, and the XGBoost log path pointing at
# the neweat_backup directory. TODO: restore original line structure.
use_change_features = True enable_plotting = True classifier_log_paths = {} #classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost" #classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm" #classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes" classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/neweat_backup"#"/home/felix/ExampleDrivenErrorDetection/progress_log_data/new_mean_certainty_change_all"#hist_change" dataset_log_files = {} dataset_log_files[HospitalHoloClean().name] = "hospital" dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak" dataset_log_files[FlightHoloClean().name] = "flight" dataset_log_files[Book().name] = "book" dataset_log_files[Salary().name] = "salaries" dataset_log_files[Restaurant().name] = "restaurant" classifier_to_use = XGBoostClassifier model_for_dataset = HospitalHoloClean() datasets = [HospitalHoloClean(), BlackOakDataSetUppercase(), FlightHoloClean(), Book(), Salary(), Restaurant()] for i in range(len(datasets)): if datasets[i].name == model_for_dataset.name: datasets.pop(i) break