Exemplo n.º 1
0
def repair_main(schema,
                sigma_k=1.0,
                outliers=None,
                tau=None,
                filenames=None,
                neighbor_k=3,
                data_size=None,
                used_attrs=None,
                comma=None):
    """Run the DQERepair pipeline on the error file and return the repaired instance.

    Args:
        schema: Schema forwarded to ``Instance``.
        sigma_k: Value assigned to the class attribute ``DQERepair.sigma_k``.
        outliers: Records to repair. When ``None``, the result of
            ``repair.filter()`` is used instead.
        tau: Optional override applied through ``repair.set_epsilon`` after
            ``repair.calculate_epsilon()`` has run.
        filenames: Mapping whose ``'error'`` entry names the file to load.
            Defaults to the restaurant dataset paths.
        neighbor_k: Value handed to ``repair.set_k``.
        data_size: Forwarded to ``Instance``.
        used_attrs: Forwarded to ``Instance``.
        comma: Accepted but not used by this function.

    Returns:
        The loaded ``Instance`` with every record listed in the repair
        solutions replaced by its solution.
    """
    if filenames is None:
        filenames = {
            'error': '../dataset/restaurant/data_error',
            'origin': '../dataset/restaurant/data_origin',
        }
    DQERepair.sigma_k = sigma_k
    instance = Instance(schema,
                        filenames['error'],
                        data_size=data_size,
                        used_attrs=used_attrs)
    repair = DQERepair(instance)
    repair.set_k(neighbor_k)
    repair.calculate_epsilon()
    if tau is not None:
        repair.set_epsilon(tau)
    print('tau: %f' % repair.epsilon)
    print(outliers)
    # Run the outlier filter once and reuse the result for both the
    # diagnostic print and the default outlier set, instead of filtering twice.
    detected = repair.filter()
    print(detected)
    if outliers is None:
        outliers = detected
        print('Detected outliers: %s' % outliers)
    solutions = repair.repair_pruning(outliers)
    for record_id, solution in solutions.items():
        instance.data[record_id] = solution
    return instance
Exemplo n.º 2
0
def main(schema, sigma_k=1.0, outliers=None, epsilon=None, filenames=None, neighbor_k=3, data_size=None,
         used_attrs=None):
    """Repair the error table with HolisticRepair and score it against the origin table.

    Args:
        schema: Attribute list used to read the tables. Replaced by
            ``used_attrs + ['class']`` when ``used_attrs`` is given.
        filenames: Optional mapping with ``'error'`` and ``'origin'`` paths.
            Missing entries (or ``None``) fall back to the wisconsin dataset.
        used_attrs: Optional attribute subset, see ``schema``.
        sigma_k, outliers, epsilon, neighbor_k, data_size: Accepted but not
            used by this function.

    Returns:
        Tuple ``(rms, precision, recall, accuracy, repair_distance, size)``
        where the first five come from ``repair_accuracy`` and ``size`` is
        ``repaired_instance.size()``.
    """
    # The default is None, but the lookups below need a mapping.
    if filenames is None:
        filenames = {}
    # Pickle data is binary; only load pickle files from a trusted source.
    with open('result/DCs-gps.pkl', 'rb') as f:
        dcs = pickle.load(f)
    if used_attrs is not None:
        schema = used_attrs + ['class']
    error_table_filename = filenames.get('error', '../dataset/wisconsin/data_error')
    error_table = read_table(error_table_filename, schema)
    error_instance = Instance(schema)
    error_instance.data = error_table.instance
    repair = HolisticRepair(error_table, dcs, threshold=float('inf'))
    try:
        repair.repair()
    except Exception:
        # Best effort: report the failure and evaluate whatever state the
        # table is in afterwards.
        import traceback
        traceback.print_exc()
    repaired_instance = Instance(schema)
    repaired_instance.data = error_table.get_data()

    origin_table_filename = filenames.get('origin', '../dataset/wisconsin/data_origin')
    origin_table = read_table(origin_table_filename, schema)
    origin_instance = Instance(schema)
    origin_instance.data = origin_table.instance

    rms, precision, recall, accuracy, repair_distance = repair_accuracy(origin_instance,
                                                                        error_instance, repaired_instance)
    return rms, precision, recall, accuracy, repair_distance, repaired_instance.size()
Exemplo n.º 3
0
def subspace_main(schema, sigma_k=1.0, outliers=None, epsilon=None, filenames=None, neighbor_k=3, data_size=None,
                  used_attrs=None, alpha=0.2, trainRatio=0.2, axis='epsilon'):
    """Repair the error table with HolisticRepair and compute subspace accuracy metrics.

    Args:
        schema: Attribute list used to read the tables. Replaced by
            ``used_attrs + ['class']`` when ``used_attrs`` is given.
        filenames: Optional mapping with ``'error'`` and ``'origin'`` paths.
            Missing entries (or ``None``) fall back to the letter dataset.
        used_attrs: Optional attribute subset, see ``schema``.
        sigma_k, outliers, epsilon, neighbor_k, data_size, alpha, trainRatio,
        axis: Accepted but not used by this function.

    Returns:
        Tuple ``(jaccard, precision, recall, f1, accuracy, error_count, size)``
        where the first six come from ``repair_accuracy_for_subspace`` and
        ``size`` is ``origin_instance.size()``.
    """
    # The default is None, but the lookups below need a mapping.
    if filenames is None:
        filenames = {}
    # Pickle data is binary; only load pickle files from a trusted source.
    with open('result/DCs.pkl', 'rb') as f:
        dcs = pickle.load(f)
    if used_attrs is not None:
        schema = used_attrs + ['class']
    error_table_filename = filenames.get('error', '../dataset/letter/data_error')
    error_table = read_table(error_table_filename, schema)
    error_instance = Instance(schema)
    error_instance.data = error_table.instance
    repair = HolisticRepair(error_table, dcs, threshold=float('inf'))
    try:
        repair.repair()
    except Exception:
        # Best effort: report the failure and evaluate whatever state the
        # table is in afterwards.
        import traceback
        traceback.print_exc()
    repaired_instance = Instance(schema)
    repaired_instance.data = error_table.get_data()

    origin_table_filename = filenames.get('origin', '../dataset/letter/data_origin')
    origin_table = read_table(origin_table_filename, schema)
    origin_instance = Instance(schema)
    origin_instance.data = origin_table.instance

    # Distinct first-tuple ids across all recorded violations (diagnostic only).
    violated_tuples = {violation.tid1 for violation in repair.violations}
    print(len(violated_tuples))

    jaccard, precision, recall, f1, accuracy, error_count = repair_accuracy_for_subspace(origin_instance, error_instance,
                                                                                        repaired_instance)
    return jaccard, precision, recall, f1, accuracy, error_count, origin_instance.size()
Exemplo n.º 4
0
def repair_main(schema,
                sigma_k=1.0,
                outliers=None,
                epsilon=None,
                filenames=None,
                neighbor_k=3,
                data_size=None,
                used_attrs=None,
                comma=None):
    """Run DORCRepair on the error file and return the repaired instance.

    Each ``(record_id, solution_id)`` pair produced by ``repair.repair()``
    replaces the record with a clone of the solution record, while keeping the
    original record's ``'class'`` value when one could be read.

    Args:
        schema: Schema forwarded to ``Instance``.
        sigma_k: Value assigned to the class attribute ``DORCRepair.sigma_k``.
        outliers: Only printed; not used by the repair itself.
        epsilon: Forwarded to the ``DORCRepair`` constructor.
        filenames: Mapping whose ``'error'`` entry names the file to load.
            Defaults to the restaurant dataset paths.
        neighbor_k: Forwarded to the ``DORCRepair`` constructor.
        data_size: Forwarded to ``Instance``.
        used_attrs: Forwarded to ``Instance``.
        comma: Accepted but not used by this function.

    Returns:
        The repaired ``Instance``.
    """
    if filenames is None:
        filenames = {
            'error': '../dataset/restaurant/data_error',
            'origin': '../dataset/restaurant/data_origin',
        }
    DORCRepair.sigma_k = sigma_k
    instance = Instance(schema,
                        filenames['error'],
                        data_size=data_size,
                        used_attrs=used_attrs)
    repair = DORCRepair(instance, neighbor_k, epsilon)
    print('epsilon: %f' % repair.epsilon)
    print(outliers)
    solutions = repair.repair()
    for record_id, solution_id in solutions:
        try:
            class_id = instance.get(record_id).get('class')
        except Exception:
            # Reading the class label stays best effort, but interpreter-level
            # signals (KeyboardInterrupt, SystemExit) are no longer swallowed.
            class_id = None
        instance.data[record_id] = instance.get(solution_id).clone()
        if class_id is not None:
            instance.data[record_id].set('class', class_id)
    return instance
Exemplo n.º 5
0
def avg_num_main(schema,
                 sigma_k=1.0,
                 outliers=None,
                 epsilon=None,
                 filenames=None,
                 neighbor_k=3,
                 data_size=None,
                 used_attrs=None,
                 early_terminate=None,
                 alpha=0.2,
                 trainRatio=0.2,
                 axis='epsilon'):
    """Repair via ``repair_main_for_avg_attr`` and evaluate with ``repair_accuracy_for_avg_attr``.

    ``filenames`` is subscripted with ``'origin'`` and ``'error'`` here, so a
    mapping must be supplied despite the ``None`` default. ``early_terminate``,
    ``alpha``, ``trainRatio`` and ``axis`` are not used in the visible body.

    NOTE(review): the six metrics are unpacked into locals but the visible
    code does not return them -- confirm whether the function is complete.
    """
    repair_options = dict(
        sigma_k=sigma_k,
        outliers=outliers,
        epsilon=epsilon,
        filenames=filenames,
        neighbor_k=neighbor_k,
        data_size=data_size,
        used_attrs=used_attrs,
    )
    repaired = repair_main_for_avg_attr(schema, **repair_options)
    truth = Instance(schema, filenames['origin'], used_attrs=used_attrs)
    dirty = Instance(schema, filenames['error'], used_attrs=used_attrs)
    metrics = repair_accuracy_for_avg_attr(truth, dirty, repaired)
    jaccard, precision, recall, f1, accuracy, error_count = metrics
Exemplo n.º 6
0
def repair_main_for_avg_attr(schema, sigma_k=1.0, outliers=None, epsilon=None, filenames=None, neighbor_k=3,
                data_size=None, used_attrs=None, comma='\",\"'):
    """Run the DQERepair pruning repair and return ``(instance, outlier_count)``.

    The instance is loaded from ``filenames['origin']`` (restaurant dataset
    paths when ``filenames`` is ``None``). ``epsilon`` overrides the computed
    epsilon when given; ``outliers`` defaults to ``repair.filter()``.

    NOTE(review): the data is read from the 'origin' entry, not 'error' --
    confirm this is intended.
    """
    paths = filenames
    if paths is None:
        paths = {
            'error': '../dataset/restaurant/data_error',
            'origin': '../dataset/restaurant/data_origin',
        }
    DQERepair.sigma_k = sigma_k
    instance = Instance(schema,
                        paths['origin'],
                        data_size=data_size,
                        used_attrs=used_attrs,
                        comma=comma)
    repair = DQERepair(instance)
    repair.set_k(neighbor_k)
    repair.calculate_epsilon()
    if epsilon is not None:
        repair.set_epsilon(epsilon)
        print('epsilon: %f' % repair.epsilon)
    if outliers is None:
        outliers = repair.filter()
        print('Detected outliers: %s' % outliers)
        print('Detected outliers len: %s' % len(outliers))
    # Pruning-based solver; write each solution back over its record.
    fixes = repair.repair_pruning(outliers)
    for rid, fixed_record in fixes.items():
        instance.data[rid] = fixed_record
    return instance, len(outliers)
Exemplo n.º 7
0
def repair_main_for_avg_attr(schema,
                             sigma_k=1.0,
                             outliers=None,
                             epsilon=None,
                             filenames=None,
                             neighbor_k=3,
                             data_size=None,
                             used_attrs=None):
    """Run DORCRepair on the error file and return ``(instance, outlier_count)``.

    When ``used_attrs`` is given the schema becomes ``used_attrs + ['class']``.
    Each ``(record_id, solution_id)`` pair from ``repair.repair()`` makes the
    record slot refer to the solution record.

    NOTE(review): the ``outliers`` argument is always overwritten by
    ``repair.filter()``, so a caller-supplied value is ignored.
    """
    paths = filenames
    if paths is None:
        paths = {
            'error': '../dataset/restaurant/data_error',
            'origin': '../dataset/restaurant/data_origin',
        }
    if used_attrs is not None:
        schema = used_attrs + ['class']
    DORCRepair.sigma_k = sigma_k
    instance = Instance(schema, paths['error'], data_size=data_size, used_attrs=used_attrs)
    repair = DORCRepair(instance, neighbor_k, epsilon)
    repair.calculate_epsilon()
    outliers = repair.filter()
    print('epsilon: %f' % repair.epsilon)
    print(outliers)
    for rid, sid in repair.repair():
        instance.data[rid] = instance.get(sid)
    return instance, len(outliers)
Exemplo n.º 8
0
def main(schema, sigma_k=0, outliers=None, epsilon=None, filenames=None, neighbor_k=3, data_size=None,
         used_attrs=None):
    """Repair the error data with ``repair_main`` and score it against the origin data.

    Args:
        schema: Schema forwarded to ``repair_main`` and ``Instance``.
        sigma_k, outliers, epsilon, neighbor_k, data_size, used_attrs:
            Forwarded to ``repair_main``; ``used_attrs`` is also passed to
            the evaluation instances.
        filenames: Mapping with ``'error'`` and ``'origin'`` paths. Defaults
            to the restaurant dataset paths.

    Returns:
        Tuple ``(rms, precision, recall, accuracy, repair_distance, size)``
        where the first five come from ``repair_accuracy`` and ``size`` is
        ``instance.size()`` of the repaired instance.
    """
    if filenames is None:
        # Resolve the default here so the repair step and the evaluation read
        # the same files; subscripting the None default raised TypeError.
        filenames = {
            'error': '../dataset/restaurant/data_error',
            'origin': '../dataset/restaurant/data_origin',
        }
    instance = repair_main(schema, sigma_k=sigma_k, outliers=outliers, epsilon=epsilon, filenames=filenames,
                           neighbor_k=neighbor_k, data_size=data_size, used_attrs=used_attrs)
    ground_truth = Instance(schema, filenames['origin'], used_attrs=used_attrs)
    error_instance = Instance(schema, filenames['error'], used_attrs=used_attrs)
    rms, precision, recall, accuracy, repair_distance = repair_accuracy(ground_truth, error_instance, instance)
    return rms, precision, recall, accuracy, repair_distance, instance.size()
Exemplo n.º 9
0
def subspace_main(schema, sigma_k=1.0, outliers=None, epsilon=4, filenames=None, neighbor_k=20, data_size=None,
                  used_attrs=None, alpha=0.2, trainRatio=0.2, axis='epsilon'):
    """Repair the error data with ``repair_main`` and compute subspace accuracy metrics.

    Args:
        schema: Schema forwarded to ``repair_main`` and ``Instance``.
        sigma_k, outliers, epsilon, neighbor_k, data_size, used_attrs:
            Forwarded to ``repair_main``; ``used_attrs`` is also passed to
            the evaluation instances.
        filenames: Mapping with ``'error'`` and ``'origin'`` paths. Defaults
            to the restaurant dataset paths.
        alpha, trainRatio, axis: Accepted but not used by this function.

    Returns:
        Tuple ``(jaccard, precision, recall, f1, accuracy, error_count, size)``
        where the first six come from ``repair_accuracy_for_subspace`` and
        ``size`` is ``instance.size()`` of the repaired instance.
    """
    if filenames is None:
        # Resolve the default here so the repair step and the evaluation read
        # the same files; subscripting the None default raised TypeError.
        filenames = {
            'error': '../dataset/restaurant/data_error',
            'origin': '../dataset/restaurant/data_origin',
        }
    instance = repair_main(schema, sigma_k=sigma_k, outliers=outliers, epsilon=epsilon, filenames=filenames,
                           neighbor_k=neighbor_k, data_size=data_size, used_attrs=used_attrs)
    ground_truth = Instance(schema, filenames['origin'], used_attrs=used_attrs)
    error_instance = Instance(schema, filenames['error'], used_attrs=used_attrs)
    jaccard, precision, recall, f1, accuracy, error_count = repair_accuracy_for_subspace(ground_truth, error_instance, instance)
    return jaccard, precision, recall, f1, accuracy, error_count, instance.size()
Exemplo n.º 10
0
def avg_attr_main(schema, sigma_k=1.0, outliers=None, epsilon=4, filenames=None, neighbor_k=20, data_size=None,
         used_attrs=None, early_terminate=None, alpha = 0.2, trainRatio = 0.2, axis = 'epsilon'):
    """Repair via ``repair_main_for_avg_attr`` and return the error count and instance size.

    Args:
        schema: Schema forwarded to the repair function and ``Instance``.
        sigma_k, outliers, epsilon, neighbor_k, data_size, used_attrs:
            Forwarded to ``repair_main_for_avg_attr``; ``used_attrs`` is also
            passed to the evaluation instances.
        filenames: Mapping with ``'error'`` and ``'origin'`` paths. Defaults
            to the restaurant dataset paths.
        early_terminate, alpha, trainRatio, axis: Accepted but not used by
            this function.

    Returns:
        Tuple ``(error_count, size)`` where ``error_count`` is the result of
        ``repair_accuracy_for_avg_attr`` and ``size`` is ``instance.size()``
        of the repaired instance.
    """
    if filenames is None:
        # Resolve the default here so the repair step and the evaluation read
        # the same files; subscripting the None default raised TypeError.
        filenames = {
            'error': '../dataset/restaurant/data_error',
            'origin': '../dataset/restaurant/data_origin',
        }
    instance, outlier_num = repair_main_for_avg_attr(schema, sigma_k=sigma_k, outliers=outliers, epsilon=epsilon, filenames=filenames,
                           neighbor_k=neighbor_k, data_size=data_size, used_attrs=used_attrs)
    ground_truth = Instance(schema, filenames['origin'], used_attrs=used_attrs)
    error_instance = Instance(schema, filenames['error'], used_attrs=used_attrs)
    error_count = repair_accuracy_for_avg_attr(ground_truth, error_instance, instance, outlier_num)
    return error_count, instance.size()
Exemplo n.º 11
0
 def create_instance(self, status_, wed_flow_id, finalized_at=None):
     """Create an ``Instance`` row, add it to the session, commit, and return it.

     Args:
         status_: Value stored in the ``status`` field.
         wed_flow_id: Value stored in the ``wed_flow_id`` field.
         finalized_at: Optional value for the ``finalized_at`` field; the
             field is only passed to ``Instance`` when this is not ``None``.

     Returns:
         The newly created ``Instance`` after ``self.session.commit()``.
     """
     fields = dict(status=status_,
                   create_at=datetime.datetime.now(),
                   wed_flow_id=wed_flow_id)
     if finalized_at is not None:
         # This branch previously read the undefined name ``wed_flow.id``,
         # raising NameError; both paths now use the ``wed_flow_id`` argument.
         fields['finalized_at'] = finalized_at
     instance = Instance(**fields)
     self.session.add(instance)
     self.session.commit()
     return instance
Exemplo n.º 12
0
def main(epsilons, schema, filenames):
    """Collect matching f-measures for the unrepaired data and for each repair method.

    For every epsilon, the error file is loaded and scored with
    ``matching_accuracy``; then each ``(method_name, run_func)`` pair in the
    module-level ``exp_methods`` with a non-``None`` function is run and
    scored. The first f-measure seen for a given method and epsilon is kept.

    Returns:
        Dict keyed by ``'origin'``, ``'DQE'``, ``'DC'`` and ``'DORC'``, each
        mapping epsilon to an f-measure.
    """
    result_dict = dict((label, dict()) for label in ('origin', 'DQE', 'DC', 'DORC'))
    for epsilon in epsilons:
        baseline = Instance(schema, filenames['error'])
        _, _, origin_fmeasure = matching_accuracy(baseline)
        result_dict['origin'][epsilon] = origin_fmeasure
        print('Origin accuracy: %s' % origin_fmeasure)

        for method_name, run_func in exp_methods:
            if run_func is None:
                continue
            repaired_instance = run_func(schema,
                                         epsilon=epsilon,
                                         filenames=filenames,
                                         neighbor_k=3)
            _, _, fmeasure = matching_accuracy(repaired_instance)
            # NOTE(review): ``e`` is not defined in this function; presumably
            # a module-level mapping of per-method offsets -- confirm.
            fmeasure += e[method_name]
            print('%s(%s) fmeasure: %s' % (method_name, epsilon, fmeasure))
            result_dict[method_name].setdefault(epsilon, fmeasure)
    return result_dict