Example #1
    def __init__(self):
        path_to_dirty = Config.get(
            "datapool.folder") + "/HOSP_HoloClean/dirty/hospital_input.csv"
        path_to_clean = Config.get(
            "datapool.folder"
        ) + "/HOSP_HoloClean/ground-truth/hospital_clean.csv"

        dirty_wrong_format = pd.read_csv(path_to_dirty,
                                         header=0,
                                         dtype=object,
                                         na_filter=False)
        clean_wrong_format = pd.read_csv(path_to_clean,
                                         header=0,
                                         dtype=object,
                                         na_filter=False)

        columns = np.unique(dirty_wrong_format['attribute'].values)
        #print(len(columns))

        dirty_pd, mapColumns = self.to_matrix(dirty_wrong_format, columns)
        clean_pd = self.correct_dirty(mapColumns, dirty_pd, clean_wrong_format)

        #print(dirty_pd.head())

        super(HospitalMoreCol, self).__init__(HospitalMoreCol.name, dirty_pd,
                                              clean_pd)
Example #2
    def __init__(self):
        path_to_dirty = Config.get("blackoak.data") + "/inputDB.csv"
        path_to_clean = Config.get("blackoak.data") + "/groundDB.csv"

        dirty_pd = pd.read_csv(path_to_dirty, header=0, dtype=object)
        clean_pd = pd.read_csv(path_to_clean, header=0, dtype=object)

        super(BlackOakDataSet, self).__init__("BlackOak", dirty_pd, clean_pd)
Example #3
    def __init__(self):
        clean_df = pd.read_csv(Config.get("datapool.folder") +
                               '/movies/rotten_tomatoes.csv',
                               header=0,
                               dtype=object,
                               na_filter=False)
        dirty_df = pd.read_csv(Config.get("datapool.folder") +
                               '/movies/dirty.csv',
                               header=0,
                               dtype=object,
                               na_filter=False)

        super(Movies, self).__init__("Movies", dirty_df, clean_df)
Example #4
    def __init__(self):
        clean_df = pd.read_csv(
            Config.get("datapool.folder") +
            '/SALARIES/salaries_small/salaries-1_with_id.csv',
            header=0,
            dtype=object,
            na_filter=False)
        dirty_df = pd.read_csv(
            Config.get("datapool.folder") +
            '/SALARIES/salaries_small/dirty/dirty_salaries-1_with_id.csv',
            header=0,
            dtype=object,
            na_filter=False)

        super(Salaries, self).__init__("Movies", dirty_df, clean_df)
Example #5
def run_katara(data):
    ts = time.time()
    tmp_katara_out = path_folder_tmp + "/katara_time_" + str(ts) + "_" + str(
        random.randint(1, 100000)) + "_KATARA_" + ".txt"

    dirty_dataset = path_folder_tmp + '/dirty_dataset_' + str(ts) + '_' + str(
        random.randint(1, 100000)) + '.csv'
    dirty_df = data.dirty_pd.copy()

    for column_i in range(dirty_df.shape[1]):
        dirty_df[dirty_df.columns[column_i]] = dirty_df[
            dirty_df.columns[column_i]].apply(lambda x: x.upper())

    dirty_df.to_csv(dirty_dataset, index=False, encoding='utf-8')

    start_time = time.time()

    command = "cd " + Config.get(
        "abstractionlayer.folder"
    ) + "/\n" + "python2 cleaning_api.py " + dirty_dataset + " " + tmp_katara_out
    print command
    os.system(command)

    return_dict = {}
    return_dict['output'] = tmp_katara_out
    return_dict['time'] = time.time() - start_time

    return return_dict
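A minimal usage sketch for the wrapper above, assuming a dataset class such as FlightHoloClean (Example #13) and a configured abstraction layer; the names are illustrative, not part of this snippet:

data = FlightHoloClean()      # any dataset object exposing dirty_pd works here
result = run_katara(data)
print result['output']        # path to the KATARA detection output file
print result['time']          # wall-clock seconds spent in the cleaning call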
Example #6
    def __init__(self):
        path_to_dirty = Config.get("datapool.folder") + "/FOOD_HoloClean/dirty/food_input.csv"
        path_to_clean = Config.get("datapool.folder") + "/FOOD_HoloClean/corrected_values/labeled_food.csv"

        dirty_wrong_format = pd.read_csv(path_to_dirty, header=0, dtype=object)
        clean_wrong_format = pd.read_csv(path_to_clean, header=0, dtype=object)


        columns = np.unique(dirty_wrong_format['attribute'].values)

        dirty_pd, mapColumns = self.to_matrix(dirty_wrong_format, columns)
        clean_pd = self.correct_dirty(mapColumns, dirty_pd, clean_wrong_format)

        #print(dirty_pd.head())

        super(FoodsHoloClean, self).__init__(FoodsHoloClean.name, dirty_pd, clean_pd)
Example #7
    def __init__(self):
        clean_df = pd.read_csv(Config.get("datapool.folder") +
                               '/Beers_Mohammad/clean.csv',
                               header=0,
                               dtype=object,
                               na_filter=False)
        dirty_df = pd.read_csv(Config.get("datapool.folder") +
                               '/Beers_Mohammad/dirty.csv',
                               header=0,
                               dtype=object,
                               na_filter=False)

        clean_df = clean_df.drop('ounces', axis=1)
        dirty_df = dirty_df.drop('ounces', axis=1)

        super(Beers, self).__init__("Beers", dirty_df, clean_df)
Example #8
def run_gaussian_stat(gaussian,
                      statistical,
                      sample_file="/tmp/data_sample.csv",
                      result_file="/tmp/dboostres.csv"):
    command = "python3 " + Config.get(
        "dboost.py") + " -F ','  --gaussian " + str(
            gaussian) + " --statistical " + str(
                statistical) + " '" + sample_file + "' > '" + result_file + "'"
    os.system(command)
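For reference, a call with illustrative values relies on the default file paths and assembles roughly the shell command shown in the comment:

run_gaussian_stat(1.5, 0.5)
# executes, in effect:
#   python3 <dboost.py> -F ',' --gaussian 1.5 --statistical 0.5 '/tmp/data_sample.csv' > '/tmp/dboostres.csv'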
Example #9
    def __init__(self):
        path_to_dirty = Config.get(
            "datapool.folder"
        ) + "/SALARIES/salaries_full/dirty/dirty_salaries_full_with_id.csv"
        path_to_clean = Config.get(
            "datapool.folder") + "/SALARIES/salaries_full/salaries_with_id.csv"

        dirty_pd = pd.read_csv(path_to_dirty,
                               header=0,
                               dtype=object,
                               error_bad_lines=False,
                               na_filter=False)
        clean_pd = pd.read_csv(path_to_clean,
                               header=0,
                               dtype=object,
                               error_bad_lines=False,
                               na_filter=False)

        dirty_pd = dirty_pd.sort_values(['oid', 'id'], ascending=[1, 1])
        clean_pd = clean_pd.sort_values(['oid', 'id'], ascending=[1, 1])

        dirty_pd = dirty_pd[dirty_pd['oid'].isin(clean_pd['oid'].unique())]
        clean_pd = clean_pd[clean_pd['oid'].isin(dirty_pd['oid'].unique())]

        dirty_pd.drop('notes', axis=1, inplace=True)
        clean_pd.drop('notes', axis=1, inplace=True)

        dirty_pd = dirty_pd.reset_index(drop=True)
        clean_pd = clean_pd.reset_index(drop=True)

        assert np.all(dirty_pd['oid'] == clean_pd['oid'])
        assert np.all(dirty_pd['id'] == clean_pd['id'])
        assert np.all(dirty_pd['employeename'] == clean_pd['employeename'])
        assert np.all(dirty_pd['jobtitle'] == clean_pd['jobtitle'])
        assert np.all(dirty_pd['overtimepay'] == clean_pd['overtimepay'])
        assert np.all(dirty_pd['otherpay'] == clean_pd['otherpay'])
        assert np.all(dirty_pd['benefits'] == clean_pd['benefits'])
        assert np.all(
            dirty_pd['totalpaybenefits'] == clean_pd['totalpaybenefits'])
        assert np.all(dirty_pd['year'] == clean_pd['year'])
        assert np.all(dirty_pd['agency'] == clean_pd['agency'])
        assert np.all(dirty_pd['status'] == clean_pd['status'])

        super(Salary, self).__init__("Salary", dirty_pd, clean_pd)
Example #10
    def __init__(self):
        path_to_dirty = Config.get("datapool.folder") + "/HOSP_HoloClean/dirty/hospital_input.csv"
        path_to_clean = Config.get("datapool.folder") + "/HOSP_HoloClean/ground-truth/hospital_clean.csv"

        dirty_wrong_format = pd.read_csv(path_to_dirty, header=0, dtype=object)
        clean_wrong_format = pd.read_csv(path_to_clean, header=0, dtype=object)

        dirty_pd = self.to_matrix(dirty_wrong_format)
        clean_pd = self.to_matrix(clean_wrong_format)

        # remove empty columns
        dirty_pd = dirty_pd.drop(['address2', 'address3'], axis=1)
        clean_pd = clean_pd.drop(['address2', 'address3'], axis=1)


        #dirty_pd.to_csv('hospital.csv', index=False)
        #clean_pd.to_csv('hospital_clean.csv', index=False)

        super(HospitalHoloClean, self).__init__(HospitalHoloClean.name, dirty_pd, clean_pd)
Example #11
def run_histogram_stat(peak,
                       outlier,
                       statistical,
                       sample_file="/tmp/data_sample.csv",
                       result_file="/tmp/dboostres.csv"):
    command = "python3 " + Config.get(
        "dboost.py") + " -F ','  --histogram " + str(peak) + " " + str(
            outlier) + " --statistical " + str(
                statistical) + " '" + sample_file + "' > '" + result_file + "'"

    os.system(command)
Example #12
def run_mixture_stat(n_subpops,
                     threshold,
                     statistical,
                     sample_file="/tmp/data_sample.csv",
                     result_file="/tmp/dboostres.csv"):
    command = "python3 -W ignore " + Config.get(
        "dboost.py") + " -F ','  --mixture " + str(n_subpops) + " " + str(
            threshold) + " --statistical " + str(
                statistical) + " '" + sample_file + "' > '" + result_file + "'"

    os.system(command)
Example #13
    def __init__(self):
        path_to_dirty = Config.get(
            "datapool.folder") + "/FLIGHTS_HoloClean/dirty/flights_input.csv"
        path_to_clean = Config.get(
            "datapool.folder"
        ) + "/FLIGHTS_HoloClean/ground-truth/flights_clean.csv"

        dirty_wrong_format = pd.read_csv(path_to_dirty, header=0, dtype=object)
        clean_wrong_format = pd.read_csv(path_to_clean, header=0, dtype=object)

        dirty_pd = self.to_matrix(dirty_wrong_format)
        clean_pd = self.to_matrix(clean_wrong_format)

        dirty_pd = dirty_pd.sort_values(['flight', 'src'], ascending=[1, 1])
        clean_pd = clean_pd.sort_values(['flight', 'src'], ascending=[1, 1])

        assert np.all(dirty_pd['flight'] == clean_pd['flight'])
        assert np.all(dirty_pd['src'] == clean_pd['src'])

        super(FlightHoloClean, self).__init__(FlightHoloClean.name, dirty_pd,
                                              clean_pd)
Example #14
def install_tools():
    """
    This method installs and configures the data cleaning tools.
    """
    for tool in os.listdir(TOOLS_FOLDER):
        if tool == "NADEEF":
            p = subprocess.Popen(["ant", "all"], cwd="{}/NADEEF".format(TOOLS_FOLDER), stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            p.communicate()
            postgres_username = Config.get("nadeef.db.user")
            postgres_password = Config.get("nadeef.db.password")
            nadeef_configuration_file = open("{}/NADEEF/nadeef.conf".format(TOOLS_FOLDER), "r")
            nadeef_configuration = nadeef_configuration_file.read()
            nadeef_configuration_file.close()
            nadeef_configuration = re.sub(r"(database.username = )([\w\d]+)", r"\g<1>{}".format(postgres_username),
                                          nadeef_configuration, flags=re.IGNORECASE)
            nadeef_configuration = re.sub(r"(database.password = )([\w\d]+)", r"\g<1>{}".format(postgres_password),
                                          nadeef_configuration, flags=re.IGNORECASE)
            nadeef_configuration_file = open("{}/NADEEF/nadeef.conf".format(TOOLS_FOLDER), "w")
            nadeef_configuration_file.write(nadeef_configuration)
            nadeef_configuration_file.close()
        print "{} is installed.".format(tool)
Example #15
def search_histogram_stat(data,
                          data_sample,
                          data_sample_ground_truth,
                          sample_file,
                          result_file,
                          peak_s,
                          outlier_s,
                          statistical_range,
                          write_out=False):
    best_params = {}
    best_fscore = 0.0
    precision = 0.0
    recall = 0.0

    for p in peak_s:
        for o in outlier_s:
            for s in statistical_range:
                run_histogram_stat(p, o, s, sample_file, result_file)

                our_sample_data = DataSetBasic(
                    data.name + " random" + str(data_sample.shape[0]),
                    data_sample, data_sample_ground_truth)

                run = DBoostMe(our_sample_data, result_file)

                current_fscore = run.calculate_total_fscore()
                current_precision = run.calculate_total_precision()
                current_recall = run.calculate_total_recall()

                if write_out:
                    run.write_detected_matrix(
                        Config.get("logging.folder") + "/out/dboost" +
                        '/dboost_histogram_' + data.name + '_peak' + str(p) +
                        '_outlier_' + str(o) + '_stat_' + str(s) + '.npy')

                print "peak: " + str(p) + " outlier: " + str(
                    o) + " --statistical " + str(s)
                print "Fscore: " + str(current_fscore)
                print "Precision: " + str(run.calculate_total_precision())
                print "Recall: " + str(run.calculate_total_recall())

                if current_fscore >= best_fscore:
                    best_fscore = current_fscore
                    precision = current_precision
                    recall = current_recall
                    best_params['peak'] = p
                    best_params['outlier'] = o
                    best_params['statistical'] = s

    return best_params, best_fscore, precision, recall
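A hypothetical invocation of the grid search above; data, data_sample, and data_sample_ground_truth are assumed to be prepared by the surrounding experiment code, and the parameter ranges are illustrative:

best_params, fscore, precision, recall = search_histogram_stat(
    data, data_sample, data_sample_ground_truth,
    "/tmp/data_sample.csv", "/tmp/dboostres.csv",
    peak_s=[0.7, 0.8, 0.9],
    outlier_s=[0.1, 0.3, 0.5],
    statistical_range=[0.5])
print best_params   # e.g. {'peak': 0.8, 'outlier': 0.3, 'statistical': 0.5}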
Example #16
def generate_dBoost_result_file_name(model, data, parameter_grid_dict, keys):
    path = Config.get("logging.folder") + "/out/dboost_results"

    if not os.path.exists(path):
        os.makedirs(path)

    dBoost_result = path + "/dboost_" + str(model.__name__) + "_" + str(
        data.name)
    for p_i in range(len(keys)):
        dBoost_result += '_' + str(keys[p_i]) + '_' + str(
            parameter_grid_dict[keys[p_i]])
    dBoost_result += '.npy'

    return dBoost_result
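For example (SomeModel is a placeholder class, since the function reads model.__name__, and data stands for a dataset object), a call like this would yield the path shown in the comment:

keys = ['gaussian', 'statistical']
grid = {'gaussian': 1.5, 'statistical': 0.5}
out = generate_dBoost_result_file_name(SomeModel, data, grid, keys)
# -> <logging.folder>/out/dboost_results/dboost_SomeModel_<data.name>_gaussian_1.5_statistical_0.5.npy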
Example #17
def search_mixture_stat(data,
                        data_sample,
                        data_sample_ground_truth,
                        sample_file,
                        result_file,
                        n_subpops_s,
                        threshold_s,
                        statistical_range,
                        write_out=False):
    best_params = {}
    best_fscore = 0.0
    precision = 0.0
    recall = 0.0

    for p in n_subpops_s:
        for t in threshold_s:
            for s in statistical_range:
                run_mixture_stat(p, t, s, sample_file, result_file)

                our_sample_data = DataSetBasic(
                    data.name + " random" + str(data_sample.shape[0]),
                    data_sample, data_sample_ground_truth)

                run = DBoostMe(our_sample_data, result_file)

                current_fscore = run.calculate_total_fscore()
                current_precision = run.calculate_total_precision()
                current_recall = run.calculate_total_recall()

                if write_out:
                    run.write_detected_matrix(
                        Config.get("logging.folder") + "/out/dboost" +
                        '/dboost_' + data.name + '_mixture_subpop' + str(p) +
                        '_threshold_' + str(t) + '_stat_' + str(s) + '.npy')

                print "n_subpops: " + str(p) + " threshold: " + str(
                    t) + " --statistical " + str(s)
                print "Fscore: " + str(current_fscore)
                print "Precision: " + str(run.calculate_total_precision())
                print "Recall: " + str(run.calculate_total_recall())

                if current_fscore >= best_fscore:
                    best_fscore = current_fscore
                    precision = current_precision
                    recall = current_recall
                    best_params['n_subpops'] = p
                    best_params['threshold'] = t
                    best_params['statistical'] = s

    return best_params, best_fscore, precision, recall
Example #18
def visualize_model(dataSet, column_id, final_gb, feature_name_list, train,
                    target_run, res):
    try:
        column_name = dataSet.clean_pd.columns[column_id]

        feature_name_list_err_corr = list(feature_name_list)
        print "missing features: " + str(
            len(final_gb[column_id].feature_names) - len(feature_name_list))

        if len(final_gb[column_id].feature_names) - len(feature_name_list) > 0:
            for err_corr_id in range(dataSet.shape[1]):
                if dataSet.is_column_applicable(
                        err_corr_id) and err_corr_id != column_id:
                    feature_name_list_err_corr.append(
                        "error_corr_" +
                        str(dataSet.clean_pd.columns[err_corr_id]))

        directory = Config.get("logging.folder") + '/out/html/' + dataSet.name
        if not os.path.exists(directory):
            os.makedirs(directory)
        path = directory + '/' + str(column_name) + '_' + str(
            train[column_id].shape[0]) + '_' + str(time.time()) + '.html'

        table_content = show_weights(final_gb[column_id],
                                     feature_names=feature_name_list_err_corr,
                                     importance_type="gain").data

        # print table_content
        from ml.VisualizeSVD import replace_with_url

        table_content = replace_with_url(table_content, dataSet)

        url = 'file://' + path
        html = "<h1>" + str(column_name) + "</h1>"
        html += "<h2>number of labels: " + str(
            train[column_id].shape[0]) + "</h2>"
        html += "<h2>F-Score: " + str(f1_score(target_run,
                                               res[column_id])) + "</h2>"
        html += str(table_content)

        with open(path, 'w') as webf:
            webf.write(html)
        # webbrowser.open(url)
    except jinja2.exceptions.UndefinedError:
        print(
            format_as_text(
                explain_weights(final_gb[column_id],
                                feature_names=feature_name_list)))
Example #19
    def __init__(self, duplicate_factor=1):
        path_to_dirty = Config.get("blackoak.data") + "/inputDB.csv"
        path_to_clean = Config.get("blackoak.data") + "/groundDB.csv"

        dirty_pd_init = pd.read_csv(path_to_dirty,
                                    header=0,
                                    dtype=object,
                                    na_filter=False)
        clean_pd_init = pd.read_csv(path_to_clean,
                                    header=0,
                                    dtype=object,
                                    na_filter=False)

        #print dirty_pd_init.dtypes
        #print clean_pd_init.dtypes

        dirty_pd = self.uppercase(dirty_pd_init)
        clean_pd = self.uppercase(clean_pd_init)

        #dirty_pd.to_csv("BlackOakUppercase_dirty_new.csv", index=False)

        duplicated_clean = clean_pd.copy(deep=True)
        duplicated_dirty = dirty_pd.copy(deep=True)

        for i in range(duplicate_factor - 1):
            copy_dirty = dirty_pd.copy(deep=True)
            copy_clean = clean_pd.copy(deep=True)

            duplicated_dirty = duplicated_dirty.append(copy_dirty,
                                                       ignore_index=True)
            duplicated_clean = duplicated_clean.append(copy_clean,
                                                       ignore_index=True)

        super(BlackOakDataSetUppercase,
              self).__init__(BlackOakDataSetUppercase.name, duplicated_dirty,
                             duplicated_clean)
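Usage note: the loop above appends duplicate_factor - 1 extra copies onto the initial copy, so, for example:

data = BlackOakDataSetUppercase(duplicate_factor=3)
# the dirty and clean frames now hold 3x the original rows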
Example #20
def search_gaussian_stat(data,
                         data_sample,
                         data_sample_ground_truth,
                         sample_file,
                         result_file,
                         gaussian_range,
                         statistical_range,
                         write_out=False):
    best_params = {}
    best_fscore = 0.0
    precision = 0.0
    recall = 0.0

    for g in gaussian_range:
        for s in statistical_range:
            run_gaussian_stat(g, s, sample_file, result_file)

            our_sample_data = DataSetBasic(
                data.name + " random" + str(data_sample.shape[0]), data_sample,
                data_sample_ground_truth)

            run = DBoostMe(our_sample_data, result_file)

            current_fscore = run.calculate_total_fscore()
            current_precision = run.calculate_total_precision()
            current_recall = run.calculate_total_recall()

            if write_out:
                run.write_detected_matrix(
                    Config.get("logging.folder") + "/out/dboost" +
                    '/dboost_gaussian_' + data.name + '_gaussian' + str(g) +
                    '_stat_' + str(s) + '.npy')

            print "--gaussian " + str(g) + " --statistical " + str(s)
            print "Fscore: " + str(current_fscore)
            print "Precision: " + str(run.calculate_total_precision())
            print "Recall: " + str(run.calculate_total_recall())

            if current_fscore >= best_fscore:
                best_fscore = current_fscore
                precision = current_precision
                recall = current_recall
                best_params['gaussian'] = g
                best_params['statistical'] = s

    return best_params, best_fscore, precision, recall
Example #21
	def __init__(self, data):
		new_columns = []
		for col_i in range(len(data.dirty_pd.columns)):
			new_columns.append(data.dirty_pd.columns[col_i].replace(" ", "_"))

		data.dirty_pd.columns = new_columns

		print data.dirty_pd.columns

		data.dirty_pd.to_csv('/tmp/data.csv', index=False)

		'''
		data.dirty_pd.to_csv('/tmp/data.csv',
					 index=False,
					 quoting=csv.QUOTE_ALL,
					 escapechar='\\',
					 quotechar="'",
					 na_rep="")
		'''
		#/home/felix/abstractionlayer/datasets
		#/tmp/data.csv
		run_input = {
			"dataset": {
				"type": "csv",
				"param": ["/home/felix/abstractionlayer/datasets/hosp_holoclean.csv"]
			},
			"tool": {
				"name": "katara",
				"param": [Config.get("abstractionlayer.tools") + "/KATARA/dominSpecific"]
			}
		}

		matrix_detected = np.zeros(data.shape)

		results_list = run_data_cleaning_job(run_input)
		for x in results_list:
			print x
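			# x[0] appears to be a 1-based row index, hence the -1 below (inferred from the offset, not verified)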
			matrix_detected[x[0]-1, x[1]] = True



		super(KATARA, self).__init__("KATARA_me", data, matrix_detected)
Example #22
def run_dboost(dBoost, data, defined_range_labeled_cells, steps, N):
    ts = time.time()

    path_folder = Config.get("logging.folder") + "/out/dboost_interval"

    if not os.path.exists(path_folder):
        os.makedirs(path_folder)

    log_file = path_folder + "/" + str(data.name) + "_time_" + str(ts) + "_dBoost_" + dBoost.func_name + ".txt"

    sizes = np.array(defined_range_labeled_cells, dtype=float)  # in cells

    dirty_column_fraction = data.get_number_dirty_columns() / float(data.shape[1])
    sizes /= dirty_column_fraction  # cells converted
    sizes /= float(data.shape[1])  # cells to rows
    row_sizes = np.array(sizes, dtype=int)  # in rows
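    # Worked example with made-up numbers: for 10 columns of which 4 are dirty
    # (dirty_column_fraction = 0.4), a budget of 100 labeled cells becomes
    # 100 / 0.4 = 250 cells, i.e. 250 / 10 = 25 rows to sample.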

    avg_times, avg_fscores, avg_precision, avg_recall, std_fscores, std_precision, std_recall = dBoost(
        data, steps, N, row_sizes, log_file)

    toLatex(defined_range_labeled_cells, avg_times, avg_fscores, avg_precision, avg_recall, std_fscores,
            std_precision, std_recall, log_file)
Example #23
from ml.datasets.flights.FlightHoloClean import FlightHoloClean  # module path assumed; the snippet omits this import
from ml.datasets.MoviesMohammad.Movies import Movies
from ml.datasets.RestaurantMohammad.Restaurant import Restaurant
from ml.datasets.BeersMohammad.Beers import Beers
from ml.datasets.Citations.Citation import Citation
from ml.datasets.salary_data.Salary import Salary

import time
from ml.tools.dboost.TestDBoost import test_multiple_sizes_hist
from ml.tools.dboost.TestDBoost import test_multiple_sizes_gaussian
from ml.tools.dboost.TestDBoost import test_multiple_sizes_mixture

from ml.configuration.Config import Config
import numpy as np
import os

path_folder = Config.get("logging.folder") + "/out/dboost"

if not os.path.exists(path_folder):
    os.makedirs(path_folder)

#data_list = [FlightHoloClean, BlackOakDataSetUppercase, HospitalHoloClean, Restaurant, Movies, Beers, Citation]
data_list = [FlightHoloClean]

steps = 100
N = 1

dBoost_methods = [test_multiple_sizes_mixture]

for dataset in data_list:
    data = dataset()
    rows_number = data.shape[0]
Example #24
from ml.datasets.flights.FlightHoloClean import FlightHoloClean  # module path assumed; the snippet omits these imports
from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
from ml.datasets.hospital.HospitalHoloClean import HospitalHoloClean
from ml.datasets.MoviesMohammad.Movies import Movies
from ml.datasets.RestaurantMohammad.Restaurant import Restaurant
from ml.datasets.BeersMohammad.Beers import Beers
from ml.datasets.Citations.Citation import Citation

from ml.active_learning.classifier.XGBoostClassifier import XGBoostClassifier
from ml.active_learning.classifier.LinearSVMClassifier import LinearSVMClassifier
from ml.active_learning.classifier.NaiveBayesClassifier import NaiveBayesClassifier

from ml.datasets.salary_data.Salary import Salary
import numpy as np

from ml.configuration.Config import Config
import os
import time

path_folder = Config.get("logging.folder") + "/out/model"
if not os.path.exists(path_folder):
    os.makedirs(path_folder)

data_list = [
    FlightHoloClean, BlackOakDataSetUppercase, HospitalHoloClean, Movies,
    Restaurant, Citation, Beers, Salary
]
classifiers = [XGBoostClassifier, LinearSVMClassifier, NaiveBayesClassifier]

parameters = []

my_array = []
for dataset in data_list:
    data = dataset()
    for classifier in classifiers:
Example #25
from sets import Set

from ml.datasets.BeersMohammad.Beers import Beers
from ml.tools.nadeef_detect.FD import FD
from ml.tools.nadeef_detect.UDF import UDF
from ml.tools.nadeef_detect.NadeefDetect import NadeefDetect

from ml.configuration.Config import Config
import os
import time

path_folder = Config.get("logging.folder") + "/out/nadeef"

if not os.path.exists(path_folder):
    os.makedirs(path_folder)

# according to FUN and fdmine, there are no perfect FDs
# according to HyFD, only ID columns are involved in FDs  #check
data = Beers()

my_list = list(data.clean_pd.columns)
my_list[0] = 'anid'
data.clean_pd.columns = my_list
data.dirty_pd.columns = my_list

rules = []

#rules.append(UDF('ounces', 'value.length() > 4'))

rules.append(UDF('ibu', 'value.equals("N/A")'))
rules.append(UDF('abv', '(value != null && !isNumeric(value))'))
Example #26
from ml.active_learning.classifier.XGBoostClassifier import XGBoostClassifier
from ml.active_learning.classifier.LinearSVMClassifier import LinearSVMClassifier
from ml.active_learning.classifier.NaiveBayesClassifier import NaiveBayesClassifier

import numpy as np

from ml.configuration.Config import Config
import os
import time

from ml.datasets.food.FoodsHoloClean import FoodsHoloClean
from ml.datasets.adult.Adult import Adult
from ml.datasets.soccer.Soccer import Soccer
from ml.datasets.hospital.HospitalMoreCol import HospitalMoreCol

path_folder = Config.get("logging.folder") + "/out/holodetect"
if not os.path.exists(path_folder):
    os.makedirs(path_folder)

#data_list = [FlightHoloClean, BlackOakDataSetUppercase, HospitalHoloClean, Movies, Restaurant, Citation, Beers, Salary]
data_list = [HospitalMoreCol]

parameters = []
#parameters.append({'use_metadata': False, 'correlationFeatures': False}) #char unigrams
#parameters.append({'use_metadata': False, 'correlationFeatures': False, 'is_word': True}) #word unigrams
#parameters.append({'use_metadata_only': True, 'correlationFeatures': False}) #metadata
#parameters.append({'use_metadata': False, 'ngrams': 2, 'correlationFeatures': False}) #char unigrams + bigrams
#parameters.append({'correlationFeatures': False}) #char unigrams + meta data
#parameters.append({}) #char unigrams + meta data + correlation

#ed
Example #27
    def __init__(self):
        clean_df = pd.read_csv(Config.get("datapool.folder") + '/Citations/citation.csv', header=0, dtype=object, na_filter=False, encoding="utf8")
        dirty_df = pd.read_csv(Config.get("datapool.folder") + '/Citations/dirty.csv', header=0, dtype=object, na_filter=False, encoding="utf8")

        super(Citation, self).__init__("Citation", dirty_df, clean_df)
Example #28
# All Rights Reserved
########################################


########################################
import os
import json
import re
import subprocess
import pandas
from ml.configuration.Config import Config
########################################


########################################
TOOLS_FOLDER = Config.get("abstractionlayer.tools")
########################################


########################################
def install_tools():
    """
    This method installs and configures the data cleaning tools.
    """
    for tool in os.listdir(TOOLS_FOLDER):
        if tool == "NADEEF":
            p = subprocess.Popen(["ant", "all"], cwd="{}/NADEEF".format(TOOLS_FOLDER), stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            p.communicate()
            postgres_username = Config.get("nadeef.db.user")
            postgres_password = Config.get("nadeef.db.password")
Example #29
    def __init__(self):
        clean_df = pd.read_csv(Config.get("datapool.folder") + '/restaurants/yellow_pages.csv', header=0, dtype=object, na_filter=False)
        dirty_df = pd.read_csv(Config.get("datapool.folder") + '/restaurants/dirty.csv', header=0, dtype=object, na_filter=False)

        super(Restaurant, self).__init__("Restaurant", dirty_df, clean_df)
Example #30
from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
from ml.datasets.hospital.HospitalHoloClean import HospitalHoloClean
from ml.datasets.MoviesMohammad.Movies import Movies
from ml.datasets.RestaurantMohammad.Restaurant import Restaurant
from ml.datasets.BeersMohammad.Beers import Beers
from ml.datasets.Citations.Citation import Citation
from ml.datasets.salary_data.Salary import Salary

from ml.active_learning.classifier.XGBoostClassifier import XGBoostClassifier
import numpy as np

from ml.configuration.Config import Config
import os
import time

path_folder = Config.get("logging.folder") + "/out/features"
if not os.path.exists(path_folder):
    os.makedirs(path_folder)

data_list = [FlightHoloClean]

classifier = XGBoostClassifier

parameters = []
#parameters.append({'correlationFeatures': False, 'use_metadata': False, 'use_cond_prob': True, 'use_cond_prob_only': True}) #word2vec
#feature_names = ['conditional probability']
#LSTM
parameters.append({
    'correlationFeatures': False,
    'use_metadata': False,
    'use_word2vec': True,