# --- experiment configuration -------------------------------------------------

# Optional per-classifier progress-log directories; intentionally left empty
# in this run (no classifier logs are read back).
classifier_log_paths = {}

# Maps each dataset object's display name to the short tag used in log
# file names on disk.
dataset_log_files = {}
dataset_log_files[HospitalHoloClean().name] = "hospital"
dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
dataset_log_files[FlightHoloClean().name] = "flight"
dataset_log_files[Book().name] = "book"
dataset_log_files[Salary().name] = "salaries"
dataset_log_files[Restaurant().name] = "restaurant"

classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

# Dataset evaluated in this run (alternatives were BlackOakDataSetUppercase
# and HospitalHoloClean).
dataset = FlightHoloClean()


def getConfig(dataset):
    """Return the (log_path, future_steps) configuration for *dataset*.

    Only FlightHoloClean is configured here; any other dataset yields
    (None, -1), which callers must treat as "no configuration available".
    """
    path = None
    future_steps = -1
    # isinstance avoids constructing a throwaway FlightHoloClean instance
    # purely for an exact-type comparison, as the original
    # `type(dataset) == type(FlightHoloClean())` did.
    if isinstance(dataset, FlightHoloClean):
        path = '/home/felix/phd/round_robin_part/flights'
        future_steps = 20
    return path, future_steps


mypath, future_steps = getConfig(dataset)
# Plot-script data for FlightHoloClean: `label_potential` is the x-axis
# (number of user labels), and each appended list records one run's F1 scores
# for the "metadata, no SVD, more data" configuration at those label counts.
# NOTE(review): this chunk is cut off mid-statement — the final
# `fscore_metadata_no_svd_more_data.append([` has no closing bracket in this
# view, so the third run's values continue beyond what is shown here.
import numpy as np from ml.datasets.flights.FlightHoloClean import FlightHoloClean from ml.plot.old.user_effort_all_potential.PlotterLatex import PlotterLatex data = FlightHoloClean() label_potential = [ 4, 8, 12, 16, 26, 36, 46, 56, 66, 76, 86, 96, 106, 116, 126, 136, 146, 156, 166, 176, 186, 196, 206, 216, 226, 236, 246, 256, 266, 276, 286 ] fscore_metadata_no_svd_more_data = [] fscore_metadata_no_svd_more_data.append([ 0.0, 0.0, 0.0, 0.0, 0.3019844693700982, 0.5517146398566308, 0.6819393939392727, 0.8136335209507353, 0.8129854610634996, 0.8224170196000095, 0.8384676145338619, 0.8636178861788035, 0.8638716881534007, 0.8869425148494458, 0.893910608939216, 0.8990408385246227, 0.8884377220586137, 0.8956337602097001, 0.897860314896998, 0.9071071071070146, 0.910703607733707, 0.9108178364327478, 0.9166079871239963 ]) fscore_metadata_no_svd_more_data.append([ 0.0, 0.0, 0.0, 0.0, 0.30139823925407516, 0.515855039637531, 0.6529868868382043, 0.7706678118960443, 0.7891252006420769, 0.7630255697016273, 0.7966925064600301, 0.8155689892883575, 0.8688841419997421, 0.8693620479479327, 0.876720526630616, 0.8844040363671689, 0.893855848759437, 0.8996871333594727, 0.9012019935503657, 0.9018095520618088, 0.910505836575904, 0.9148914891490294, 0.9183389628453579 ]) fscore_metadata_no_svd_more_data.append([
# NOTE(review): this chunk begins mid-function — the enclosing `def` and its
# `if` branch are outside this view. The visible tail appears to pick a column:
# it builds a per-column array from the `certainty` dict (Python 2
# `.iteritems()`) and returns the index of the minimum value via np.argmin;
# the preceding `column_id = 0` fallback's condition cannot be seen here.
# After `#input` the script selects FlightHoloClean as the active dataSet
# (Hospital and BlackOak variants are commented out) and sets run parameters:
# full training fraction, unigrams, SVD off, metadata on, no simulated user
# error, and labeling step size 10.
column_id = 0 return column_id else: certainty_array = np.zeros(dataSet.shape[1]) for key, value in certainty.iteritems(): certainty_array[key] = value min_certainty_index = np.argmin(certainty_array) return min_certainty_index #input from ml.datasets.flights.FlightHoloClean import FlightHoloClean dataSet = FlightHoloClean() from ml.datasets.hospital.HospitalHoloClean import HospitalHoloClean #dataSet = HospitalHoloClean() from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase #dataSet = BlackOakDataSetUppercase() #print("read: %s seconds ---" % (time.time() - start_time)) start_time = time.time() train_fraction = 1.0 ngrams = 1 runSVD = False use_metadata = True user_error_probability = 0.0 step_size = 10
enable_plotting = True

# Progress-log locations per classifier; only XGBoost is configured in this
# run (the LinearSVM / NaiveBayes alternatives were dropped as dead code).
classifier_log_paths = {}
classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/neweat_backup"

# Maps each dataset object's display name to the short tag used in log
# file names on disk.
dataset_log_files = {}
dataset_log_files[HospitalHoloClean().name] = "hospital"
dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
dataset_log_files[FlightHoloClean().name] = "flight"
dataset_log_files[Book().name] = "book"
dataset_log_files[Salary().name] = "salaries"
dataset_log_files[Restaurant().name] = "restaurant"

classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

# Train on every dataset except the one the model is built for: remove the
# first (and presumably only) entry whose name matches model_for_dataset,
# exactly as the original index loop with pop/break did.
datasets = [HospitalHoloClean(), BlackOakDataSetUppercase(), FlightHoloClean(),
            Book(), Salary(), Restaurant()]
for index, candidate in enumerate(datasets):
    if candidate.name == model_for_dataset.name:
        datasets.pop(index)
        break
# NOTE(review): this chunk overlaps the previous one — it repeats the
# dataset-removal loop — and contains a lone `'''` whose matching quote is not
# visible here, so whether the statements after it (dataset selection,
# `get_number_dirty_columns`, the second getConfig with future_steps =
# 4 * 2 + 20) execute or sit inside a dead triple-quoted block cannot be
# determined from this view; left byte-identical for that reason.
# The prints use Python 2 statement syntax.
for i in range(len(datasets)): if datasets[i].name == model_for_dataset.name: datasets.pop(i) break print "datasets used for training:" for i in range(len(datasets)): print datasets[i] N_datasets = 7 ''' #dataset = HospitalHoloClean() #dataset = BlackOakDataSetUppercase() dataset = FlightHoloClean() n = dataset.get_number_dirty_columns() best_sum_total_f = {} best_col_seq = {} def getConfig(dataset): path = None future_steps = -1 if type(dataset) == type(FlightHoloClean()): path = '/home/felix/phd/round_robin_part/flights' future_steps = 4 * 2 + 20 return path, future_steps