import pickle

def load_model(dataSet, classifier):
    # maps each dataset name to the prefix of its pickled model files;
    # Config and the dataset classes come from the surrounding project
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # no dedicated models for these datasets yet; they fall back to the
    # hospital models, so treat their results with care
    dataset_log_files[Salary().name] = "hospital"
    dataset_log_files[Book().name] = "hospital"

    potential_model_dir = Config.get("column.potential.models")

    tp_model = pickle.load(
        open(potential_model_dir + "/tp_model" +
             dataset_log_files[dataSet.name] + "_" + classifier.name + ".p",
             "rb"))
    fpfn_model = pickle.load(
        open(potential_model_dir + "/fpfn_model" +
             dataset_log_files[dataSet.name] + "_" + classifier.name + ".p",
             "rb"))

    delta_tp_model = pickle.load(
        open(potential_model_dir + "/delta_tp_model" +
             dataset_log_files[dataSet.name] + "_" + classifier.name + ".p",
             "rb"))
    delta_fpfn_model = pickle.load(
        open(potential_model_dir + "/delta_fpfn_model" +
             dataset_log_files[dataSet.name] + "_" + classifier.name + ".p",
             "rb"))

    return tp_model, fpfn_model, delta_tp_model, delta_fpfn_model
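
A minimal usage sketch (hypothetical, not part of the original file; it assumes the pickled objects are scikit-learn-style estimators that expose predict):

# hypothetical call: fetch the four regressors for one dataset/classifier pair
tp_model, fpfn_model, delta_tp_model, delta_fpfn_model = load_model(
    HospitalHoloClean(), XGBoostClassifier)
# X_cols would be a feature matrix of per-column statistics (hypothetical name):
# estimated_tp = tp_model.predict(X_cols)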
def load_model(dataSet, classifier):

    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"

    potential_model_dir = Config.get("column.potential.models")

    return pickle.load(
        open(potential_model_dir + "/model" + dataset_log_files[dataSet.name] +
             "_" + classifier.name + ".p", "rb"))
Example No. 3
def load_model(dataSet):
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # not yet
    #dataset_log_files[Salary().name] = "hospital"  # be careful
    #dataset_log_files[Book().name] = "hospital"  # be careful

    potential_model_dir = '/home/felix/ExampleDrivenErrorDetection/potential models/unique_false_current_hist'

    return pickle.load(
        open(potential_model_dir + "/model" + dataset_log_files[dataSet.name] +
             "_XGBoost.p", "rb"))
def load_model(dataSet, classifier):

    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # not yet
    dataset_log_files[Salary().name] = "hospital"  # be careful
    dataset_log_files[Book().name] = "hospital"  # be careful

    #potential_model_dir = Config.get("column.potential.models")
    potential_model_dir = "/home/felix/ExampleDrivenErrorDetection/potential models/classification"

    return pickle.load(
        open(potential_model_dir + "/model" + dataset_log_files[dataSet.name] +
             "_" + classifier.name + ".p", "rb"))
def load_model(dataSet):
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # not yet
    # dataset_log_files[Salary().name] = "hospital"  # be careful
    # dataset_log_files[Book().name] = "hospital"  # be careful

    #potential_model_dir = '/home/felix/ExampleDrivenErrorDetection/potential models/current_total_f'
    potential_model_dir = '/home/felix/ExampleDrivenErrorDetection/potential models/simulation100data'

    tp_model = pickle.load(open(potential_model_dir + "/tp_model_XGBoost.p", "rb"))
    fp_model = pickle.load(open(potential_model_dir + "/fp_model_XGBoost.p", "rb"))
    fn_model = pickle.load(open(potential_model_dir + "/fn_model_XGBoost.p", "rb"))

    return tp_model, fp_model, fn_model
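
Downstream, the three per-outcome estimates are presumably combined into an F-score; with the standard formula that would be estimated_F1 = 2 * tp / (2 * tp + fp + fn) once each regressor has produced its tp, fp, and fn predictions (the exact combination step is not shown in this excerpt).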
for i in range(len(datasets)):
    if datasets[i].name == model_for_dataset.name:
        datasets.pop(i)
        break

print "datasets used for training:"
for i in range(len(datasets)):
    print datasets[i]

N_datasets = 7

log_folder = "synthetic_unique_batch"  #"unique"

from ml.datasets.synthetic.Synthetic import Synthetic
from ml.datasets.synthetic.ReplaceError import ReplaceError
rows = 2000
datasets = [
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase()
]
columns = [4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
error_fraction = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
error_types = [
    ReplaceError, ReplaceError, ReplaceError, ReplaceError, ReplaceError,
    ReplaceError, ReplaceError, ReplaceError, ReplaceError, ReplaceError
]
Example No. 7
print "datasets used for training:"
for i in range(len(datasets)):
    print datasets[i]

N_datasets = 7




log_folder = "unique_batch" #"unique"

#dataset = HospitalHoloClean() #BlackOakDataSetUppercase()
#future_steps = 60 #BlackOak = 7, Flights = 9
dataset = BlackOakDataSetUppercase()
future_steps = 7 #BlackOak = 7, Flights = 9

n = dataset.get_number_dirty_columns()

best_sum_total_f = {}
best_col_seq = {}



for d in range(10):
    file_path = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/" + log_folder + "/log_progress_"+ dataset.name +"_" + str(d)  +".csv"
    x, fp, fn, tp = read_csv1(file_path, None)

    estimated_scores = get_estimated_tp_fp_fn(x, n, dataset, feature_names, which_features_to_use)
Example No. 8
enable_plotting = True

classifier_log_paths = {}
classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"

dataset_log_files = {}
dataset_log_files[HospitalHoloClean().name] = "hospital"
dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
dataset_log_files[FlightHoloClean().name] = "flight"
#dataset_log_files[Salary().name] = "salary1"

classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

datasets = [HospitalHoloClean(), BlackOakDataSetUppercase(), FlightHoloClean()]

for i in range(len(datasets)):
    if datasets[i].name == model_for_dataset.name:
        datasets.pop(i)
        break

print "datasets used for training:"
for i in range(len(datasets)):
    print datasets[i]
Example No. 9
enable_plotting = True

cutting = True

use_potential = False

classifier_log_paths = {}
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
#classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
#classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"

#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/unique"

dataset_log_files = {}
dataset_log_files[HospitalHoloClean().name] = "hospital"
dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
dataset_log_files[FlightHoloClean().name] = "flight"
dataset_log_files[Book().name] = "book"
dataset_log_files[Salary().name] = "salaries"
dataset_log_files[Restaurant().name] = "restaurant"

classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

datasets = [
    HospitalHoloClean(),
    BlackOakDataSetUppercase(),
    FlightHoloClean(),
    Book(),
    Salary(),
    Restaurant()
]
Example No. 10
print "datasets used for training:"
for i in range(len(datasets)):
    print datasets[i]

N_datasets = 7




log_folder = "synthetic_unique_batch" #"unique"

from ml.datasets.synthetic.Synthetic import Synthetic
from ml.datasets.synthetic.ReplaceError import ReplaceError
rows = 2000
datasets = [
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase()
]
columns = [4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
error_fraction = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
error_types = [
    ReplaceError, ReplaceError, ReplaceError, ReplaceError, ReplaceError,
    ReplaceError, ReplaceError, ReplaceError, ReplaceError, ReplaceError
]
seed_synth = 41
dataSet = Synthetic(rows, datasets, columns, error_fraction, error_types, seed_synth)
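# Reading of the constructor call above (an assumption, not verified against
# the Synthetic implementation): each configuration pairs a 2000-row BlackOak
# sample with ReplaceError noise injected into 4 columns, at one of the error
# rates from 1% to 90% across the ten entries.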


dataset = dataSet #BlackOakDataSetUppercase()
#future_steps = 8+9 #BlackOak = 7, Flights = 9
#future_steps = 14+7 #BlackOak = 7
future_steps = 2*2 + 6

n = dataset.get_number_dirty_columns()

best_sum_total_f = {}
Example No. 11
import numpy as np

from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
from ml.tools.dboost.TestDBoost import test_multiple_sizes_mixture

data = BlackOakDataSetUppercase()
'''
steps = 100
sizes = [10, 20, 30, 40, 50]
N = 5
'''

steps = 100
N = 10
labels = 378

nr_rows = int(float(labels) / data.shape[1])
sizes = np.array([50, 100, 150, 200], dtype=float)  # in cells
#sizes = np.array([200], dtype=float) # in cells

print sizes
dirty_column_fraction = data.get_number_dirty_columns() / float(data.shape[1])
sizes /= dirty_column_fraction
sizes /= float(data.shape[1])
print sizes
row_sizes = np.array(sizes, dtype=int)  # in rows
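# Worked example with made-up numbers: a 50-cell budget, a dirty-column
# fraction of 0.5, and 10 total columns give 50 / 0.5 / 10 = 10 labeled rows.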

log_file = "/home/felix/ExampleDrivenErrorDetection/log/dBoost/BlackOakUppercase_mix_new.txt"

test_multiple_sizes_mixture(data, steps, N, row_sizes, log_file)
Example No. 12
import numpy as np
from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
from ml.datasets.flights.FlightHoloClean import FlightHoloClean
from ml.datasets.products.Products import Products
from ml.datasets.luna.book.Book import Book
from ml.datasets.electronics.Electronics import Electronics
from ml.datasets.salary_data.Salary import Salary
import pandas as pd
import csv
from ml.data_generator.generate_bart_config import generate_bart_config
from shutil import copyfile

datasets = [
    BlackOakDataSetUppercase().clean_pd.values,
    FlightHoloClean().clean_pd.values,
    Salary().clean_pd.values,
    Electronics().clean_pd.values,
    Book().clean_pd.values,
    Products().clean_pd.values
]

for n in range(1000):
    # select dataset
    dataset_id = np.random.randint(len(datasets))
    dataset = datasets[dataset_id]

    # select number of rows
    max_rows = 2000
    if datasets[dataset_id].shape[0] < max_rows:
        max_rows = datasets[dataset_id].shape[0]
use_absolute_difference = True  # False == Squared / True == Absolute

enable_plotting = True

classifier_log_paths = {}
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
#classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
#classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"

classifier_log_paths[XGBoostClassifier.name] = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/7"

dataset_log_files = {}
dataset_log_files[HospitalHoloClean().name] = "hospital"
dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
dataset_log_files[FlightHoloClean().name] = "flight"
dataset_log_files[Book().name] = "book"
dataset_log_files[Salary().name] = "salaries"
dataset_log_files[Restaurant().name] = "restaurant"

classifier_to_use = XGBoostClassifier
model_for_dataset = BlackOakDataSetUppercase()

datasets = [
    HospitalHoloClean(),
    BlackOakDataSetUppercase(),
    FlightHoloClean(),
    Book(),
    Salary(),
    Restaurant()
]

        dirty_pd = dirty_pd_init

        clean_pd = dataset.clean_pd

        super(BartDataset, self).__init__(BartDataset.name, dirty_pd, clean_pd)


    def validate(self):
        print "validate"


if __name__ == '__main__':
    from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
    import numpy as np

    data = BartDataset(BlackOakDataSetUppercase(), "CityFD_10percent_Remove")
    '''
	from ml.datasets.salary_data.Salary import Salary

	#outlier data
	datan = Salary()
	def convert_to_int(value):
		return str(int(float(value)))
	datan.clean_pd[datan.clean_pd.columns[8]] = datan.clean_pd[datan.clean_pd.columns[8]].apply(convert_to_int)
	data = BartDataset(datan, "Salary_outlier_5percent")
	'''

    error_fractions = np.sum(data.matrix_is_error, axis=0)

    print data.clean_pd.columns
    print error_fractions
from sets import Set

from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
from ml.tools.nadeef_detect.FD import FD
from ml.tools.nadeef_detect.NadeefDetect import NadeefDetect

from ml.datasets.BartDataset.BartDataSet import BartDataset
data = BartDataset(BlackOakDataSetUppercase(), "CityFD_20percent")

rules = []

rules.append(FD(Set(["ZIP"]), "City"))

nadeef = NadeefDetect(
    data,
    rules,
    log_file="/home/felix/SequentialPatternErrorDetection/nadeef/log/Bart.txt")
for i in range(len(datasets)):
    if datasets[i].name == model_for_dataset.name:
        datasets.pop(i)
        break

print "datasets used for training:"
for i in range(len(datasets)):
    print datasets[i]

N_datasets = 7

log_folder = "synthetic_unique_batch"  #"unique"

from ml.datasets.synthetic.Synthetic import Synthetic
from ml.datasets.synthetic.ReplaceError import ReplaceError
rows = 2000
datasets = [
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase(),
    BlackOakDataSetUppercase()
]
columns = [4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
error_fraction = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
error_types = [
    ReplaceError, ReplaceError, ReplaceError, ReplaceError, ReplaceError,
    ReplaceError, ReplaceError, ReplaceError, ReplaceError, ReplaceError
]
import numpy as np

from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase

data = BlackOakDataSetUppercase()

sample_size = 14

for c in range(data.shape[1]):
    error_ids = np.where(data.matrix_is_error[:, c])[0]
    print data.clean_pd.columns[c]
    print "number of errors: " + str(np.sum(data.matrix_is_error[:, c]))
    if len(error_ids) >= sample_size:
        for i in range(sample_size):
            print "dirty: " + str(data.dirty_pd.values[error_ids[i], c]) + \
                  " -> clean: " + str(data.clean_pd.values[error_ids[i], c])
    print ""
use_change_features = True

enable_plotting = True

classifier_log_paths = {}
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
#classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
#classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"

classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/neweat_backup"#"/home/felix/ExampleDrivenErrorDetection/progress_log_data/new_mean_certainty_change_all"#hist_change"



dataset_log_files = {}
dataset_log_files[HospitalHoloClean().name] = "hospital"
dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
dataset_log_files[FlightHoloClean().name] = "flight"
dataset_log_files[Book().name] = "book"
dataset_log_files[Salary().name] = "salaries"
dataset_log_files[Restaurant().name] = "restaurant"


classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

datasets = [
    HospitalHoloClean(),
    BlackOakDataSetUppercase(),
    FlightHoloClean(),
    Book(),
    Salary(),
    Restaurant()
]

for i in range(len(datasets)):
    if datasets[i].name == model_for_dataset.name:
        datasets.pop(i)
        break
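
(The pop/break idiom above removes model_for_dataset from the training pool, so the model evaluated on that dataset is trained only on the remaining datasets, i.e. a leave-one-dataset-out setup.)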