    def __init__(self):
        self.holo_obj = HoloClean(
            holoclean_path="..",  # path to holoclean package
            verbose=False,
            # to limit possible values for training data
            pruning_threshold1=0.1,
            # to limit possible values for training data to less than k values
            pruning_clean_breakoff=6,
            # to limit possible values for dirty data (applied after Threshold 1)
            pruning_threshold2=0,
            # to limit possible values for dirty data to less than k values
            pruning_dk_breakoff=6,
            # learning parameters
            learning_iterations=30,
            learning_rate=0.001,
            batch_size=5)
        self.session = Session(self.holo_obj)
class Testing:

    def __init__(self):
        self.holo_obj = HoloClean(
            mysql_driver="../holoclean/lib/mysql-connector-java-5.1.44-bin.jar",
            verbose=True,
            timing_file='execution_time.txt')
        self.session = Session(self.holo_obj)

    def test(self):
        # dataset = "../tutorial/data/hospital_dataset.csv"
        # dataset = "../datasets/flights/flight_input_holo.csv"
        # dataset = "../datasets/food/food_input_holo.csv"
        dataset = "../datasets/unit_test/unit_test_dataset.csv"

        # denial_constraints = "../tutorial/data/hospital_constraints.txt"
        # denial_constraints = "../datasets/flights/flight_constraints.txt"
        # denial_constraints = "../datasets/food/food_constraints1.txt"
        denial_constraints = "../datasets/unit_test/unit_test_constraints.txt"

        flattening = 0
        # flattening = 1

        # ground_truth = "../tutorial/data/groundtruth.csv"
        # ground_truth = "../datasets/flights/flights_clean.csv"
        # ground_truth = "../datasets/food/food_clean.csv"
        ground_truth = 0

        # Ingesting Dataset and Denial Constraints
        self.session.load_data(dataset)
        self.session.load_denial_constraints(denial_constraints)

        # Error Detector
        detector = Mysql_DCErrorDetection(self.session.Denial_constraints,
                                          self.holo_obj,
                                          self.session.dataset)
        self.session.detect_errors(detector)

        self.session.repair()

        if ground_truth:
            self.session.compare_to_truth(ground_truth)
class TestMysqlErrordetector(unittest.TestCase):

    def setUp(self):
        self.session = Session(holo_obj)
        self.dataset = "../data/unit_test/unit_test_dataset.csv"
        self.session.load_data(self.dataset)
        self.session.load_denial_constraints(
            "../data/unit_test/unit_test_constraints.txt")
        self.detector = SqlDCErrorDetection(self.session)
        self.session.detect_errors([self.detector])

    def tearDown(self):
        del self.session

    def test_number_of_dk_cells(self):
        dataframe_C_dk = holo_obj.dataengine.get_table_to_dataframe(
            'C_dk', self.session.dataset)
        self.assertEquals(dataframe_C_dk.count(), 10)

    def test_number_of_clean_cells(self):
        dataframe_C_clean = holo_obj.dataengine.get_table_to_dataframe(
            'C_clean', self.session.dataset)
        self.assertEquals(dataframe_C_clean.count(), 5)

    def test_correction_of_clean_cells(self):
        dataframe_C_clean = holo_obj.dataengine.get_table_to_dataframe(
            'C_clean', self.session.dataset)
        anticipated_C_clean_cells = [["3", "D"], ["1", "D"], ["2", "D"],
                                     ["3", "A"], ["3", "B"]]
        anticipated_dataframe = holo_obj.spark_session.createDataFrame(
            anticipated_C_clean_cells,
            StructType([
                StructField("ind", StringType(), False),
                StructField("attr", StringType(), False),
            ]))
        incorrect = anticipated_dataframe.subtract(dataframe_C_clean)
        self.assertEquals(incorrect.count(), 0)

    def test_correction_of_dk_cells(self):
        dataframe_C_dk = holo_obj.dataengine.get_table_to_dataframe(
            'C_dk', self.session.dataset)
        anticipated_dataframe_C_dk_cells = [["3", "C"], ["2", "C"], ["2", "A"],
                                            ["2", "E"], ["3", "E"], ["2", "B"],
                                            ["1", "A"], ["1", "C"], ["1", "B"],
                                            ["1", "E"]]
        anticipated_dataframe = holo_obj.spark_session.createDataFrame(
            anticipated_dataframe_C_dk_cells,
            StructType([
                StructField("ind", StringType(), False),
                StructField("attr", StringType(), False),
            ]))
        incorrect = anticipated_dataframe.subtract(dataframe_C_dk)
        self.assertEquals(incorrect.count(), 0)
index_attribute = "index" holo = HoloClean( holoclean_path="..", # path to holoclean package verbose=False, pruning_threshold1=0.1, # to limit possible values for training data pruning_clean_breakoff= 6, # to limit possible values for training data to less than k values pruning_threshold2= 0, # to limit possible values for dirty data (applied after Threshold 1) pruning_dk_breakoff= 6, # to limit possible values for dirty data to less than k values learning_iterations=30, # learning parameters learning_rate=0.001, batch_size=5) session = Session(holo) data = session.load_data(data_path) dcs = session.load_denial_constraints(dc_path) #data.select('City').show(15) detector = SqlDCErrorDetection(session) error_detector_list = [] error_detector_list.append(detector) clean, dirty = session.detect_errors(error_detector_list) #clean.head(5) #dirty.head(5) repaired = session.repair() repaired = repaired.withColumn(index_attribute, repaired[index_attribute].cast("int")) repaired.sort(index_attribute) shutil.rmtree("repaired") # repaired.repartition(1).write.format('com.databricks.spark.csv').option("header", 'true').save('repaired')
# In this tutorial, we will walk step-by-step through the process of repairing a dataset in `HoloClean`.

# The dataset in question contains information about hospitals and is commonly used for research
# purposes. Errors are present in ~5% of the cells and there is significant duplicate information -
# the ideal environment for `HoloClean`.

# ### Step 1: Data Loading

# We begin by instantiating the `HoloClean` and `Session` objects needed to run the repairs.
# For a more detailed overview of these objects and the rest of our infrastructure, please see Tutorial 1.

# In[1]:

from holoclean.holoclean import HoloClean, Session

holo = HoloClean(
    mysql_driver="../holoclean/lib/mysql-connector-java-5.1.44-bin.jar")
session = Session(holo)

# Next, we load in the data and denial constraints needed for this dataset.
# Both pieces of information are stored in the MySQL database.

# In[2]:

data_path = "data/hospital_dataset.csv"

# loads data into our database and returns pyspark dataframe of initial data
data = session.load_data(data_path)

dc_path = "data/hospital_constraints.txt"

# loads denial constraints into our database and returns a simple list of dcs as strings
dcs = session.load_denial_constraints(dc_path)
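# The remaining steps mirror the test driver shown earlier: detect errors with the
# MySQL-backed denial-constraint detector, then repair. A minimal sketch follows;
# only the constructor call appears verbatim earlier in this document, so the
# import path for `Mysql_DCErrorDetection` below is an assumption - adjust it to
# wherever the class lives in your checkout.

# In[3]:

# assumed module path (hypothetical)
from holoclean.errordetection.mysql_dcerrordetector import Mysql_DCErrorDetection

detector = Mysql_DCErrorDetection(session.Denial_constraints, holo, session.dataset)
session.detect_errors(detector)
repaired = session.repair()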
import unittest

from holoclean.holoclean import HoloClean, Session
from holoclean.errordetection.sql_dcerrordetector import SqlDCErrorDetection
from holoclean.errordetection.sql_nullerrordetector import SqlnullErrorDetection

k_inferred = 2

holo_obj = HoloClean(
    holoclean_path="../..",
    verbose=False,
    pruning_threshold1=0.001,
    pruning_clean_breakoff=6,
    pruning_threshold2=0.0,
    pruning_dk_breakoff=6,
    learning_iterations=30,
    learning_rate=0.001,
    batch_size=5,
    k_inferred=k_inferred)
session = Session(holo_obj)

dataset = "../data/hospital.csv"
session.load_data(dataset)
session.load_denial_constraints("../data/hospital_constraints.txt")

detector_list = []
Dcdetector = SqlDCErrorDetection(session)
Nulldetector = SqlnullErrorDetection(session)
detector_list.append(Dcdetector)
detector_list.append(Nulldetector)
session.detect_errors(detector_list)

session.repair()


class UnitTestPredictions(unittest.TestCase):

    def setUp(self):
class Testing:

    def __init__(self):
        self.holo_obj = HoloClean(
            holoclean_path="..",  # path to holoclean package
            verbose=True,
            # to limit possible values for training data
            pruning_threshold1=0.0,
            # to limit possible values for training data to less than k values
            pruning_clean_breakoff=6,
            # to limit possible values for dirty data (applied after Threshold 1)
            pruning_threshold2=0.0,
            # to limit possible values for dirty data to less than k values
            pruning_dk_breakoff=6,
            # learning parameters
            learning_iterations=30,
            learning_rate=0.001,
            batch_size=5,
            # number of inferred values
            k_inferred=2)
        self.session = Session(self.holo_obj)

    def test(self):
        t1 = time.time()

        dataset = "data/hospital.csv"
        print("using dataset: {}".format(dataset))
        denial_constraints = "data/hospital_constraints.txt"
        print("using denial_constraints: {}".format(denial_constraints))
        ground_truth = "data/hospital_clean.csv"
        print("using ground_truth: {}".format(ground_truth))
        # uncomment this if you don't have ground truth
        # ground_truth = 0

        # Ingesting Dataset and Denial Constraints
        self.session.load_data(dataset)
        self.session.load_denial_constraints(denial_constraints)

        # Error Detectors: We have two, dc violations and null values
        t3 = time.time()
        detector_list = []
        Dcdetector = SqlDCErrorDetection(self.session)
        Nulldetector = SqlnullErrorDetection(self.session)
        detector_list.append(Dcdetector)
        detector_list.append(Nulldetector)
        self.session.detect_errors(detector_list)
        t4 = time.time()
        if self.holo_obj.verbose:
            self.holo_obj.logger.info("Error detection time:")
            self.holo_obj.logger.info("Error detection time:" + str(t4 - t3))

        self.session.repair()

        if ground_truth:
            self.session.compare_to_truth(ground_truth)

        t2 = time.time()
        if self.holo_obj.verbose:
            self.holo_obj.logger.info("Total time:" + str(t2 - t1))
        print("Execution finished")
        exit(0)
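# A minimal sketch of a driver for the class above, assuming this file is run
# directly as a script; the class itself never instantiates or invokes test():

if __name__ == "__main__":
    Testing().test()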
class TestPruning(unittest.TestCase):

    def setUp(self):
        self.session = Session(holo_obj)
        self.dataset = "../data/unit_test/unit_test_dataset.csv"
        self.session.load_data(self.dataset)
        self.session.load_denial_constraints(
            "../data/unit_test/unit_test_constraints.txt")
        detector = SqlDCErrorDetection(self.session)
        self.session.detect_errors([detector])
        self.session._ds_domain_pruning(holo_obj.pruning_threshold1,
                                        holo_obj.pruning_threshold2,
                                        holo_obj.pruning_dk_breakoff,
                                        holo_obj.pruning_clean_breakoff)

    def test_possible_values_clean(self):
        possible_values_clean = holo_obj.dataengine.get_table_to_dataframe(
            "Possible_values_clean", self.session.dataset)
        anticipated_possible_values_clean = [["1", "3", "A", "p", "0", "1"],
                                             ["1", "3", "A", "u", "1", "2"],
                                             ["2", "3", "B", "y", "1", "1"],
                                             ["2", "3", "B", "z", "0", "2"],
                                             ["2", "3", "B", "w", "0", "3"]]
        anticipated_dataframe = holo_obj.spark_session.createDataFrame(
            anticipated_possible_values_clean,
            StructType([
                StructField("vid", StringType(), False),
                StructField("tid", StringType(), False),
                StructField("attr_name", StringType(), False),
                StructField("attr_val", StringType(), False),
                StructField("observed", StringType(), False),
                StructField("domain_id", StringType(), False),
            ]))
        incorrect = anticipated_dataframe.subtract(possible_values_clean)
        self.assertEquals(incorrect.count(), 0)

    def test_possible_values_dk(self):
        possible_values_dk = holo_obj.dataengine.get_table_to_dataframe(
            "Possible_values_dk", self.session.dataset)
        anticipated_possible_values_dk = [["1", "1", "A", "p", "1", "1"],
                                          ["1", "1", "A", "u", "0", "2"],
                                          ["2", "1", "B", "y", "0", "1"],
                                          ["2", "1", "B", "z", "0", "2"],
                                          ["2", "1", "B", "w", "1", "3"],
                                          ["3", "1", "C", "m", "0", "1"],
                                          ["3", "1", "C", "f", "1", "2"],
                                          ["4", "1", "E", "r", "1", "1"],
                                          ["5", "2", "A", "p", "1", "1"],
                                          ["5", "2", "A", "u", "0", "2"],
                                          ["6", "2", "B", "y", "0", "1"],
                                          ["6", "2", "B", "z", "1", "2"],
                                          ["6", "2", "B", "w", "0", "3"],
                                          ["7", "2", "C", "m", "0", "1"],
                                          ["7", "2", "C", "f", "1", "2"],
                                          ["8", "2", "E", "r", "1", "1"],
                                          ["9", "3", "C", "m", "1", "1"],
                                          ["9", "3", "C", "f", "0", "2"],
                                          ["10", "3", "E", "r", "1", "1"]]
        anticipated_dataframe = holo_obj.spark_session.createDataFrame(
            anticipated_possible_values_dk,
            StructType([
                StructField("vid", StringType(), False),
                StructField("tid", StringType(), False),
                StructField("attr_name", StringType(), False),
                StructField("attr_val", StringType(), False),
                StructField("observed", StringType(), False),
                StructField("domain_id", StringType(), False),
            ]))
        incorrect = anticipated_dataframe.subtract(possible_values_dk)
        self.assertEquals(incorrect.count(), 0)

    def test_kij_dk(self):
        kij_dk = holo_obj.dataengine.get_table_to_dataframe(
            "Kij_lookup_dk", self.session.dataset)
        anticipated_kij_dk = [["1", "1", "A", "2"], ["2", "1", "B", "3"],
                              ["3", "1", "C", "2"], ["4", "1", "E", "1"],
                              ["5", "2", "A", "2"], ["6", "2", "B", "3"],
                              ["7", "2", "C", "2"], ["8", "2", "E", "1"],
                              ["9", "3", "C", "2"], ["10", "3", "E", "1"]]
        anticipated_dataframe = holo_obj.spark_session.createDataFrame(
            anticipated_kij_dk,
            StructType([
                StructField("vid", StringType(), False),
                StructField("tid", StringType(), False),
                StructField("attr_name", StringType(), False),
                StructField("k_ij", StringType(), False),
            ]))
        incorrect = anticipated_dataframe.subtract(kij_dk)
        self.assertEquals(incorrect.count(), 0)

    def test_kij_clean(self):
        kij_clean = holo_obj.dataengine.get_table_to_dataframe(
            "Kij_lookup_clean", self.session.dataset)
        anticipated_kij_clean = [["1", "3", "A", "2"], ["2", "3", "B", "3"]]
        anticipated_dataframe = holo_obj.spark_session.createDataFrame(
            anticipated_kij_clean,
            StructType([
                StructField("vid", StringType(), False),
                StructField("tid", StringType(), False),
                StructField("attr_name", StringType(), False),
                StructField("k_ij", StringType(), False),
            ]))
        incorrect = anticipated_dataframe.subtract(kij_clean)
        self.assertEquals(incorrect.count(), 0)
class TestDCFeaturizer(unittest.TestCase):

    def setUp(self):
        self.session = Session(holo_obj)
        self.dataset = "../data/unit_test/unit_test_dataset.csv"
        self.session.load_data(self.dataset)
        self.session.load_denial_constraints(
            "../data/unit_test/unit_test_non_symmetric_constraints.txt")
        detector = SqlDCErrorDetection(self.session)
        self.session.detect_errors([detector])
        self.session._ds_domain_pruning(holo_obj.pruning_threshold1,
                                        holo_obj.pruning_threshold2,
                                        holo_obj.pruning_dk_breakoff,
                                        holo_obj.pruning_clean_breakoff)

    def test_DC_query_for_clean(self):
        dc_signal = SignalDC(self.session.Denial_constraints, self.session)
        self.session._add_featurizer(dc_signal)
        temp_list = dc_signal._create_all_relaxed_dc()
        relaxed_dcs = []
        for relaxed_dc in temp_list:
            relaxed_dcs.append(relaxed_dc[0])
        expected_r_dcs = [
            "postab.tid = t1." + GlobalVariables.index_name +
            " AND postab.attr_name = 'A' AND postab.attr_val=t2.A AND t1." +
            GlobalVariables.index_name + " < t2." + GlobalVariables.index_name +
            " AND t1.B>t2.B",
            "postab.tid = t2." + GlobalVariables.index_name +
            " AND postab.attr_name ='A' AND t1.A=postab.attr_val AND t1." +
            GlobalVariables.index_name + " < t2." + GlobalVariables.index_name +
            " AND t1.B>t2.B",
            "postab.tid = t1." + GlobalVariables.index_name +
            " AND postab.attr_name = 'B' AND postab.attr_val>t2.B AND t1." +
            GlobalVariables.index_name + " < t2." + GlobalVariables.index_name +
            " AND t1.A=t2.A",
            "postab.tid = t2." + GlobalVariables.index_name +
            " AND postab.attr_name ='B' AND t1.B>postab.attr_val AND t1." +
            GlobalVariables.index_name + " < t2." + GlobalVariables.index_name +
            " AND t1.A=t2.A",
            "postab.tid = t1." + GlobalVariables.index_name +
            " AND postab.attr_name = 'C' AND postab.attr_val>='f' AND t1." +
            GlobalVariables.index_name + " < t2." + GlobalVariables.index_name +
            " AND t2.C<='m' AND t1.E=t2.E",
            "postab.tid = t2." + GlobalVariables.index_name +
            " AND postab.attr_name = 'C' AND postab.attr_val<='m' AND t1." +
            GlobalVariables.index_name + " < t2." + GlobalVariables.index_name +
            " AND t1.C>='f' AND t1.E=t2.E",
            "postab.tid = t1." + GlobalVariables.index_name +
            " AND postab.attr_name = 'E' AND postab.attr_val=t2.E AND t1." +
            GlobalVariables.index_name + " < t2." + GlobalVariables.index_name +
            " AND t1.C>='f' AND t2.C<='m'",
            "postab.tid = t2." + GlobalVariables.index_name +
            " AND postab.attr_name ='E' AND t1.E=postab.attr_val AND t1." +
            GlobalVariables.index_name + " < t2." + GlobalVariables.index_name +
            " AND t1.C>='f' AND t2.C<='m'"
        ]
        self.assertEquals(relaxed_dcs, expected_r_dcs)
class TestInitFeaturizer(unittest.TestCase):

    def setUp(self):
        self.session = Session(holo_obj)
        self.dataset = "../data/unit_test/unit_test_dataset.csv"
        self.session.load_data(self.dataset)
        self.session.load_denial_constraints(
            "../data/unit_test/unit_test_constraints.txt")
        detector = SqlDCErrorDetection(self.session)
        self.session.detect_errors([detector])
        self.session._ds_domain_pruning(holo_obj.pruning_threshold1,
                                        holo_obj.pruning_threshold2,
                                        holo_obj.pruning_dk_breakoff,
                                        holo_obj.pruning_clean_breakoff)
        self.init_signal = SignalInit(self.session)

    def tearDown(self):
        del self.session

    def test_Init_query_for_clean(self):
        query = self.init_signal.get_query()[0]
        Init_feature_dataframe = holo_obj.dataengine.query(query, 1)
        anticipated_Init_feature_C_clean_cells = [["1", "2", "1", "1"],
                                                  ["2", "1", "1", "1"]]
        anticipated_dataframe = holo_obj.spark_session.createDataFrame(
            anticipated_Init_feature_C_clean_cells,
            StructType([
                StructField("vid", StringType(), False),
                StructField("assigned_val", StringType(), False),
                StructField("feature", StringType(), False),
                StructField("count", StringType(), False),
            ]))
        incorrect = anticipated_dataframe.subtract(Init_feature_dataframe)
        self.assertEquals(incorrect.count(), 0)

    def test_Init_query_for_dk(self):
        query = self.init_signal.get_query(0)[0]
        Init_feature_dataframe = holo_obj.dataengine.query(query, 1)
        anticipated_Init_feature_C_dk_cells = [["1", "1", "1", "1"],
                                               ["2", "3", "1", "1"],
                                               ["3", "2", "1", "1"],
                                               ["4", "1", "1", "1"],
                                               ["5", "1", "1", "1"],
                                               ["6", "2", "1", "1"],
                                               ["7", "2", "1", "1"],
                                               ["8", "1", "1", "1"],
                                               ["9", "1", "1", "1"],
                                               ["10", "1", "1", "1"]]
        anticipated_dataframe = holo_obj.spark_session.createDataFrame(
            anticipated_Init_feature_C_dk_cells,
            StructType([
                StructField("vid", StringType(), False),
                StructField("assigned_val", StringType(), False),
                StructField("feature", StringType(), False),
                StructField("count", StringType(), False),
            ]))
        incorrect = anticipated_dataframe.subtract(Init_feature_dataframe)
        self.assertEquals(incorrect.count(), 0)
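# A minimal sketch of a runner for the test classes above, assuming they live in
# a single module that is executed directly; unittest's default discovery picks
# up every TestCase subclass defined in the file:

if __name__ == "__main__":
    unittest.main()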