def load_model(dataSet, classifier):
    """Load the four pickled potential models for a dataset/classifier pair.

    The models live in the directory configured under
    ``column.potential.models`` and are named
    ``<kind><dataset key>_<classifier name>.p`` where ``<kind>`` is one of
    ``tp_model``, ``fpfn_model``, ``delta_tp_model`` or ``delta_fpfn_model``.

    Args:
        dataSet: object whose ``name`` selects the dataset key.
        classifier: object whose ``name`` is part of the file name.

    Returns:
        Tuple ``(tp_model, fpfn_model, delta_tp_model, delta_fpfn_model)``.

    Raises:
        KeyError: if ``dataSet.name`` is not one of the datasets listed below.
        IOError / OSError: if one of the model files cannot be opened.
    """
    dataset_log_files = {
        HospitalHoloClean().name: "hospital",
        BlackOakDataSetUppercase().name: "blackoak",
        FlightHoloClean().name: "flight",
        # No dedicated models yet: Salary and Book deliberately fall back to
        # the hospital model files -- be careful when interpreting results.
        Salary().name: "hospital",
        Book().name: "hospital",
    }

    potential_model_dir = Config.get("column.potential.models")

    # Every model file shares the same "<dataset>_<classifier>.p" suffix.
    file_suffix = (dataset_log_files[dataSet.name] + "_" +
                   classifier.name + ".p")

    def _load(kind):
        # Binary mode is what pickle expects (mandatory on Python 3), and the
        # context manager closes the handle that used to be leaked.
        # NOTE: unpickling can execute arbitrary code; only trusted model
        # files must be placed in the model directory.
        with open(potential_model_dir + "/" + kind + file_suffix,
                  "rb") as model_file:
            return pickle.load(model_file)

    tp_model = _load("tp_model")
    fpfn_model = _load("fpfn_model")

    delta_tp_model = _load("delta_tp_model")
    delta_fpfn_model = _load("delta_fpfn_model")

    return tp_model, fpfn_model, delta_tp_model, delta_fpfn_model
def load_model(dataSet, classifier):
    """Load the pickled classification model for a dataset/classifier pair.

    The file is read from
    ``<potential_model_dir>/model<dataset key>_<classifier name>.p``.

    Args:
        dataSet: object whose ``name`` selects the dataset key.
        classifier: object whose ``name`` is part of the file name.

    Returns:
        The unpickled model object.

    Raises:
        KeyError: if ``dataSet.name`` is not one of the datasets listed below.
        IOError / OSError: if the model file cannot be opened.
    """
    dataset_log_files = {
        HospitalHoloClean().name: "hospital",
        BlackOakDataSetUppercase().name: "blackoak",
        FlightHoloClean().name: "flight",
        # No dedicated models yet: Salary and Book deliberately fall back to
        # the hospital model file -- be careful when interpreting results.
        Salary().name: "hospital",
        Book().name: "hospital",
    }

    # NOTE(review): the directory is hard-coded to one machine; the
    # configuration lookup below is currently disabled.
    #potential_model_dir = Config.get("column.potential.models")
    potential_model_dir = "/home/felix/ExampleDrivenErrorDetection/potential models/classification"

    model_path = (potential_model_dir + "/model" +
                  dataset_log_files[dataSet.name] + "_" +
                  classifier.name + ".p")

    # Binary mode is what pickle expects (mandatory on Python 3), and the
    # context manager closes the handle that used to be leaked.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(model_path, "rb") as model_file:
        return pickle.load(model_file)
Exemplo n.º 3
0
# Run switches.
cutting = True
use_potential = False

# Log directory per classifier name. Every entry is currently commented out,
# so the mapping starts (and stays, within this section) empty.
classifier_log_paths = dict()
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
#classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
#classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"

#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/unique"

# Short key associated with each dataset name (presumably the stem of that
# dataset's log file -- confirm against the code that consumes this mapping).
dataset_log_files = {
    HospitalHoloClean().name: "hospital",
    BlackOakDataSetUppercase().name: "blackoak",
    FlightHoloClean().name: "flight",
    Book().name: "book",
    Salary().name: "salaries",
    Restaurant().name: "restaurant",
}

# Classifier class and reference dataset selected for this run.
classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

# Fresh instances of every known dataset, in a fixed order.
datasets = [
    HospitalHoloClean(), BlackOakDataSetUppercase(), FlightHoloClean(),
    Book(), Salary(), Restaurant(),
]
Exemplo n.º 4
0
from ml.datasets.flights.FlightHoloClean import FlightHoloClean
from ml.datasets.products.Products import Products
from ml.datasets.luna.book.Book import Book
from ml.datasets.electronics.Electronics import Electronics
from ml.datasets.salary_data.Salary import Salary
import pandas as pd
import csv
from ml.data_generator.generate_bart_config import generate_bart_config
from shutil import copyfile

# Value arrays taken from the ``clean_pd`` attribute of each dataset
# (presumably the clean pandas DataFrame -- confirm in the dataset classes).
datasets = [
    BlackOakDataSetUppercase().clean_pd.values,
    FlightHoloClean().clean_pd.values,
    Salary().clean_pd.values,
    Electronics().clean_pd.values,
    Book().clean_pd.values,
    Products().clean_pd.values,
]

for n in range(1000):
    # Pick one of the datasets uniformly at random.
    dataset_id = np.random.randint(len(datasets))
    dataset = datasets[dataset_id]

    # Never ask for more rows than the dataset has; the cap is 2000.
    max_rows = min(2000, dataset.shape[0])

    # Draw the row count from [500, max_rows).
    # NOTE(review): randint raises ValueError when max_rows <= 500, i.e. for
    # any dataset with 500 rows or fewer -- confirm none of them is that small.
    row_size = np.random.randint(low=500, high=max_rows)
# Run switch for plotting.
enable_plotting = True

# Log directory per classifier name; only the XGBoost entry is active.
classifier_log_paths = {}
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
#classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
#classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"

classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/neweat_backup"
# Previously used alternatives for the XGBoost log directory:
#"/home/felix/ExampleDrivenErrorDetection/progress_log_data/new_mean_certainty_change_all"
#hist_change"

# Short key associated with each dataset name (presumably the stem of that
# dataset's log file -- confirm against the code that consumes this mapping).
dataset_log_files = {
    HospitalHoloClean().name: "hospital",
    BlackOakDataSetUppercase().name: "blackoak",
    FlightHoloClean().name: "flight",
    Book().name: "book",
    Salary().name: "salaries",
    Restaurant().name: "restaurant",
}

# Classifier class and the dataset that is excluded from the list below.
classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

datasets = [HospitalHoloClean(), BlackOakDataSetUppercase(), FlightHoloClean(), Book(), Salary(), Restaurant()]

# Drop the first dataset whose name matches ``model_for_dataset``. Breaking
# immediately after the deletion keeps the in-place removal safe.
for i, candidate in enumerate(datasets):
    if candidate.name == model_for_dataset.name:
        del datasets[i]
        break

# The parenthesised single-argument form prints the same text on Python 2 and
# is also valid on Python 3, where the bare print statement is a SyntaxError.
print("datasets used for training:")