# Example 1
    def setUp(self):
        """Build a fresh Session over the unit-test fixtures and run
        denial-constraint error detection so each test starts from a
        detected-errors state.
        """
        self.session = Session(holo_obj)
        # Keep the fixture path on the instance so tests can reference it.
        dataset_path = "../data/unit_test/unit_test_dataset.csv"
        self.dataset = dataset_path
        self.session.load_data(dataset_path)

        constraints_path = "../data/unit_test/unit_test_constraints.txt"
        self.session.load_denial_constraints(constraints_path)

        # The detector is kept on self so individual tests can inspect it.
        self.detector = SqlDCErrorDetection(self.session)
        self.session.detect_errors([self.detector])
# Example 2
    def setUp(self):
        """Prepare a Session on the unit-test fixtures, run DC error
        detection, then prune the candidate-value domain with the
        thresholds configured on the global ``holo_obj``.
        """
        self.session = Session(holo_obj)
        self.dataset = "../data/unit_test/unit_test_dataset.csv"
        self.session.load_data(self.dataset)
        self.session.load_denial_constraints(
            "../data/unit_test/unit_test_constraints.txt")

        # Local detector only; these tests don't need it after detection.
        dc_detector = SqlDCErrorDetection(self.session)
        self.session.detect_errors([dc_detector])

        # Prune the domain of candidate repair values using the
        # thresholds/break-offs carried by the shared holo_obj config.
        pruning_args = (holo_obj.pruning_threshold1,
                        holo_obj.pruning_threshold2,
                        holo_obj.pruning_dk_breakoff,
                        holo_obj.pruning_clean_breakoff)
        self.session._ds_domain_pruning(*pruning_args)
# Example 3
    def test(self):
        """End-to-end repair run on the hospital dataset.

        Loads the data and denial constraints, detects errors with the
        DC-violation and null-value detectors, repairs the dataset, and
        (optionally) compares the result against ground truth, logging
        timing information when ``self.holo_obj.verbose`` is set.
        """
        t1 = time.time()

        dataset = "data/hospital.csv"
        print("using dataset: {}".format(dataset))
        denial_constraints = "data/hospital_constraints.txt"
        print("using denial_constraints: {}".format(denial_constraints))
        ground_truth = "data/hospital_clean.csv"
        print("using ground_truth: {}".format(ground_truth))

        # Uncomment this if you don't have ground truth.
        # ground_truth = 0

        # Ingest dataset and denial constraints.
        self.session.load_data(dataset)
        self.session.load_denial_constraints(denial_constraints)

        # Error detectors: DC violations and null values.
        t3 = time.time()
        detector_list = [
            SqlDCErrorDetection(self.session),
            SqlnullErrorDetection(self.session),
        ]
        self.session.detect_errors(detector_list)
        t4 = time.time()

        if self.holo_obj.verbose:
            # FIX: the original logged the bare label "Error detection time:"
            # on its own line immediately before logging it with the value.
            self.holo_obj.logger.info("Error detection time:" + str(t4 - t3))

        self.session.repair()

        # Skipped when ground_truth was disabled (set falsy) above.
        if ground_truth:
            self.session.compare_to_truth(ground_truth)

        t2 = time.time()
        if self.holo_obj.verbose:
            self.holo_obj.logger.info("Total time:" + str(t2 - t1))
            # FIX: was a Python 2 print statement (SyntaxError under Python 3).
            print("Execution finished")

        # NOTE(review): exit(0) inside a test method terminates the whole
        # test runner — presumably intentional here; confirm before reuse.
        exit(0)
# Example 4
    verbose=False,
    pruning_threshold1=0.1,  # to limit possible values for training data
    pruning_clean_breakoff=
    6,  # to limit possible values for training data to less than k values
    pruning_threshold2=
    0,  # to limit possible values for dirty data (applied after Threshold 1)
    pruning_dk_breakoff=
    6,  # to limit possible values for dirty data to less than k values
    learning_iterations=30,  # learning parameters
    learning_rate=0.001,
    batch_size=5)
# Driver script: load data + denial constraints, detect errors, repair,
# and write the repaired table out as a single CSV under "repaired".
session = Session(holo)
data = session.load_data(data_path)
dcs = session.load_denial_constraints(dc_path)
# data.select('City').show(15)
detector = SqlDCErrorDetection(session)
error_detector_list = []
error_detector_list.append(detector)
clean, dirty = session.detect_errors(error_detector_list)
# clean.head(5)
# dirty.head(5)
repaired = session.repair()
# Cast the index back to int so the output sorts numerically, not lexically.
repaired = repaired.withColumn(index_attribute,
                               repaired[index_attribute].cast("int"))
# FIX: DataFrame.sort returns a new DataFrame; the original discarded the
# result, leaving the output unsorted.
repaired = repaired.sort(index_attribute)
# Clear any previous output; ignore_errors avoids failing on a fresh run
# where the "repaired" directory does not exist yet.
shutil.rmtree("repaired", ignore_errors=True)
# repaired.repartition(1).write.format('com.databricks.spark.csv').option("header", 'true').save('repaired')
repaired.coalesce(1).write.format('com.databricks.spark.csv').option(
    "header", 'true').save('repaired')
# session.compare_to_truth(gt_path)