Пример #1
0
def repair_main(schema,
                sigma_k=1.0,
                outliers=None,
                epsilon=None,
                filenames=None,
                neighbor_k=3,
                data_size=None,
                used_attrs=None,
                comma=None):
    """Load the error dataset, run DORCRepair on it and apply the solutions.

    Each (record_id, solution_id) pair returned by ``repair.repair()`` causes
    the record stored under ``record_id`` to be replaced by a clone of the
    record stored under ``solution_id``.  The original record's ``'class'``
    value is carried over to the replacement whenever it can be read.

    Args:
        schema: Passed unchanged to ``Instance``.
        sigma_k: Assigned to ``DORCRepair.sigma_k`` before the repairer is
            constructed.
        outliers: Only printed; it does not influence the repair.
        epsilon: Passed to the ``DORCRepair`` constructor.
        filenames: Mapping whose ``'error'`` entry is the path handed to
            ``Instance``.  Defaults to the restaurant dataset paths.
        neighbor_k: Passed to the ``DORCRepair`` constructor.
        data_size: Forwarded to ``Instance``.
        used_attrs: Forwarded to ``Instance``.
        comma: Unused; kept so existing callers keep working.

    Returns:
        The ``Instance`` whose ``data`` entries were overwritten in place.
    """
    if filenames is None:
        filenames = {
            'error': '../dataset/restaurant/data_error',
            'origin': '../dataset/restaurant/data_origin',
        }

    # NOTE: this sets the attribute on DORCRepair itself, not on the object
    # created below, so the value stays in effect after this call returns.
    DORCRepair.sigma_k = sigma_k
    instance = Instance(schema,
                        filenames['error'],
                        data_size=data_size,
                        used_attrs=used_attrs)
    repair = DORCRepair(instance, neighbor_k, epsilon)

    # A parenthesised single-argument print produces the same output on
    # Python 2 and is also valid syntax on Python 3.
    print('epsilon: %f' % repair.epsilon)
    print(outliers)

    for record_id, solution_id in repair.repair():
        # Best effort: remember the record's own label so the replacement
        # keeps it.  Only ordinary errors are ignored; KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        try:
            class_id = instance.get(record_id).get('class')
        except Exception:
            class_id = None

        # Clone so that restoring the label does not alter the solution
        # record itself.
        instance.data[record_id] = instance.get(solution_id).clone()
        if class_id is not None:
            instance.data[record_id].set('class', class_id)
    return instance
Пример #2
0
def repair_main_for_avg_attr(schema,
                             sigma_k=1.0,
                             outliers=None,
                             epsilon=None,
                             filenames=None,
                             neighbor_k=3,
                             data_size=None,
                             used_attrs=None):
    """Run DORCRepair on the error dataset and report the outlier count.

    When ``used_attrs`` is given, the schema handed to ``Instance`` is
    replaced by those attributes followed by ``'class'``.  After the repairer
    is built, ``calculate_epsilon()`` and ``filter()`` are called on it, and
    then every (record_id, solution_id) pair from ``repair.repair()`` causes
    the record under ``record_id`` to be replaced by a copy of the record
    under ``solution_id``.

    Args:
        schema: Passed to ``Instance`` unless ``used_attrs`` overrides it.
        sigma_k: Assigned to ``DORCRepair.sigma_k`` before the repairer is
            constructed.
        outliers: Ignored; the name is rebound to the result of
            ``repair.filter()``.  Kept for interface compatibility.
        epsilon: Passed to the ``DORCRepair`` constructor.
            ``calculate_epsilon()`` is still called afterwards -- presumably
            it recomputes ``repair.epsilon``; confirm against DORCRepair.
        filenames: Mapping whose ``'error'`` entry is the path handed to
            ``Instance``.  Defaults to the restaurant dataset paths.
        neighbor_k: Passed to the ``DORCRepair`` constructor.
        data_size: Forwarded to ``Instance``.
        used_attrs: Forwarded to ``Instance``; also used to derive the schema.

    Returns:
        A tuple ``(instance, outlier_count)`` where ``outlier_count`` is
        ``len()`` of the value returned by ``repair.filter()``.
    """
    if filenames is None:
        filenames = {
            'error': '../dataset/restaurant/data_error',
            'origin': '../dataset/restaurant/data_origin',
        }
    if used_attrs is not None:
        # list() lets callers pass any sequence (e.g. a tuple), not only a
        # list, without changing the result for list input.
        schema = list(used_attrs) + ['class']

    # NOTE: this sets the attribute on DORCRepair itself, not on the object
    # created below, so the value stays in effect after this call returns.
    DORCRepair.sigma_k = sigma_k
    instance = Instance(schema,
                        filenames['error'],
                        data_size=data_size,
                        used_attrs=used_attrs)
    repair = DORCRepair(instance, neighbor_k, epsilon)
    repair.calculate_epsilon()
    outliers = repair.filter()

    # A parenthesised single-argument print produces the same output on
    # Python 2 and is also valid syntax on Python 3.
    print('epsilon: %f' % repair.epsilon)
    print(outliers)

    for record_id, solution_id in repair.repair():
        # Store a clone, as repair_main does, so two ids never share one
        # record object and a later in-place change cannot leak between them.
        instance.data[record_id] = instance.get(solution_id).clone()
    return instance, len(outliers)