def __init__(self):
    """Create a HoloClean backend and an attached Session.

    Configures domain pruning and the learning hyper-parameters used
    during repair.  NOTE(review): ``holoclean_path`` is relative —
    assumes the process is launched one level inside the repo; confirm
    the working directory.
    """
    self.holo_obj = HoloClean(
        holoclean_path="..",     # path to the holoclean package
        verbose=False,
        # Domain pruning: threshold1 / clean_breakoff limit candidate
        # values for training (clean) data to fewer than k values;
        # threshold2 / dk_breakoff do the same for dirty data and are
        # applied after threshold 1.
        pruning_threshold1=0.1,
        pruning_clean_breakoff=6,
        pruning_threshold2=0,
        pruning_dk_breakoff=6,
        # Learning parameters.
        learning_iterations=30,
        learning_rate=0.001,
        batch_size=5,
    )
    self.session = Session(self.holo_obj)
import unittest import sys sys.path.append("../..") from holoclean.holoclean import HoloClean, Session from holoclean.errordetection.sql_dcerrordetector import SqlDCErrorDetection from pyspark.sql.types import * holo_obj = HoloClean(holoclean_path="../..", verbose=True, timing_file='execution_time.txt', learning_iterations=50, learning_rate=0.001, batch_size=20) class TestMysqlErrordetector(unittest.TestCase): def setUp(self): self.session = Session(holo_obj) self.dataset = "../data/unit_test/unit_test_dataset.csv" self.session.load_data(self.dataset) self.session.load_denial_constraints( "../data/unit_test/unit_test_constraints.txt") self.detector = SqlDCErrorDetection(self.session) self.session.detect_errors([self.detector]) def tearDown(self): del self.session def test_number_of_dk_cells(self):
def __init__(self):
    """Create a HoloClean backend wired to the bundled MySQL JDBC driver
    plus an attached Session.

    NOTE(review): the driver path is relative to the caller's working
    directory — confirm where this is launched from.
    """
    self.holo_obj = HoloClean(
        mysql_driver="../holoclean/lib/mysql-connector-java-5.1.44-bin.jar",
        verbose=True,
        timing_file='execution_time.txt',
    )
    self.session = Session(self.holo_obj)
# ## Tutorial 2: A Complete HoloClean Pipeline
#
# This tutorial walks step-by-step through repairing a dataset with
# `HoloClean`.  The dataset holds hospital information commonly used for
# research purposes; errors are present in ~5% of the cells and there is
# significant duplicate information — an ideal environment for
# `HoloClean`.

# ### Step 1: Data Loading
#
# Instantiate the `HoloClean` and `Session` objects needed to run the
# repairs.  For a detailed overview of these objects and the rest of the
# infrastructure, see Tutorial 1.

# In[1]:

from holoclean.holoclean import HoloClean, Session

holo = HoloClean(
    mysql_driver="../holoclean/lib/mysql-connector-java-5.1.44-bin.jar")
session = Session(holo)

# Load the data and denial constraints for this dataset; both pieces of
# information are stored in the MySQL database.

# In[2]:

data_path = "data/hospital_dataset.csv"

# Loads the data into the database and returns a pyspark dataframe of
# the initial data.
data = session.load_data(data_path)

dc_path = "data/hospital_constraints.txt"

# Loads the denial constraints into the database and returns a simple
# list of DCs as strings.
dcs = session.load_denial_constraints(dc_path)
import unittest
import sys

sys.path.append("../..")

from holoclean.holoclean import HoloClean, Session
from holoclean.errordetection.sql_dcerrordetector import SqlDCErrorDetection
from holoclean.errordetection.sql_nullerrordetector import SqlnullErrorDetection

# Number of candidate repairs to infer per cell.
k_inferred = 2

# Backend configured for the hospital dataset: aggressive domain pruning
# plus the learning hyper-parameters used during repair.
holo_obj = HoloClean(holoclean_path="../..",
                     verbose=False,
                     pruning_threshold1=0.001,
                     pruning_clean_breakoff=6,
                     pruning_threshold2=0.0,
                     pruning_dk_breakoff=6,
                     learning_iterations=30,
                     learning_rate=0.001,
                     batch_size=5,
                     k_inferred=k_inferred)
session = Session(holo_obj)

# Load the hospital dataset and its denial constraints.
dataset = "../data/hospital.csv"
session.load_data(dataset)
session.load_denial_constraints("../data/hospital_constraints.txt")

# Detect errors with both the denial-constraint and NULL detectors,
# then run the repair step.
Dcdetector = SqlDCErrorDetection(session)
Nulldetector = SqlnullErrorDetection(session)
detector_list = [Dcdetector, Nulldetector]
session.detect_errors(detector_list)
session.repair()
import unittest import sys sys.path.append("../..") from holoclean.holoclean import HoloClean, Session from holoclean.errordetection.sql_dcerrordetector import SqlDCErrorDetection from pyspark.sql.types import * holo_obj = HoloClean( holoclean_path="../..", verbose=True, timing_file='execution_time.txt') class TestPruning(unittest.TestCase): def setUp(self): self.session = Session(holo_obj) self.dataset = "../data/unit_test/unit_test_dataset.csv" self.session.load_data(self.dataset) self.session.load_denial_constraints( "../data/unit_test/unit_test_constraints.txt") detector = SqlDCErrorDetection(self.session) self.session.detect_errors([detector]) self.session._ds_domain_pruning(holo_obj.pruning_threshold1, holo_obj.pruning_threshold2, holo_obj.pruning_dk_breakoff, holo_obj.pruning_clean_breakoff) def test_possible_values_clean(self):