def repair_main(schema, sigma_k=1.0, outliers=None, epsilon=None, filenames=None,
                neighbor_k=3, data_size=None, used_attrs=None, comma=None):
    """Run DORCRepair on the error dataset and substitute the repaired records.

    The file at ``filenames['error']`` is loaded into an ``Instance``,
    ``DORCRepair.repair()`` is run on it, and every repaired record is
    replaced by a clone of the record chosen as its solution.  The repaired
    record's own ``'class'`` value is carried over when it can be read.

    Args:
        schema: Passed through to ``Instance``.
        sigma_k: Assigned to the class attribute ``DORCRepair.sigma_k``, so
            it stays set on the class after this call returns.
        outliers: Only printed; not otherwise used by this function.
        epsilon: Passed to the ``DORCRepair`` constructor.
        filenames: Dict whose ``'error'`` entry is the path to load.  When
            ``None``, the restaurant dataset paths are used.  The
            ``'origin'`` entry is not read here.
        neighbor_k: Passed to the ``DORCRepair`` constructor.
        data_size: Passed through to ``Instance``.
        used_attrs: Passed through to ``Instance``.
        comma: Accepted but not used by this function.

    Returns:
        The ``Instance`` with repaired records substituted in ``instance.data``.
    """
    if filenames is None:
        filenames = {
            'error': '../dataset/restaurant/data_error',
            'origin': '../dataset/restaurant/data_origin',
        }

    DORCRepair.sigma_k = sigma_k
    instance = Instance(schema, filenames['error'], data_size=data_size,
                        used_attrs=used_attrs)
    repair = DORCRepair(instance, neighbor_k, epsilon)

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print('epsilon: %f' % repair.epsilon)
    print(outliers)

    solutions = repair.repair()
    for record_id, solution_id in solutions:
        # Best-effort read of the record's own label before it is replaced.
        # Catch Exception rather than everything so Ctrl-C / SystemExit still
        # propagate.
        try:
            class_id = instance.get(record_id).get('class')
        except Exception:
            class_id = None

        # Clone so the repaired record does not share an object with its donor.
        instance.data[record_id] = instance.get(solution_id).clone()
        if class_id is not None:
            instance.data[record_id].set('class', class_id)

    return instance
def repair_main_for_avg_attr(schema, sigma_k=1.0, outliers=None, epsilon=None,
                             filenames=None, neighbor_k=3, data_size=None,
                             used_attrs=None):
    """Run DORCRepair on a subset of attributes and report the outlier count.

    The file at ``filenames['error']`` is loaded into an ``Instance``,
    epsilon is computed via ``repair.calculate_epsilon()``, outliers are
    obtained from ``repair.filter()``, and every record repaired by
    ``repair.repair()`` is replaced by a clone of its solution record.

    Args:
        schema: Passed through to ``Instance``; replaced by
            ``used_attrs + ['class']`` when ``used_attrs`` is given.
        sigma_k: Assigned to the class attribute ``DORCRepair.sigma_k``, so
            it stays set on the class after this call returns.
        outliers: Ignored; overwritten with the result of ``repair.filter()``.
        epsilon: Passed to the ``DORCRepair`` constructor;
            ``calculate_epsilon()`` is still called afterwards.
        filenames: Dict whose ``'error'`` entry is the path to load.  When
            ``None``, the restaurant dataset paths are used.  The
            ``'origin'`` entry is not read here.
        neighbor_k: Passed to the ``DORCRepair`` constructor.
        data_size: Passed through to ``Instance``.
        used_attrs: Passed through to ``Instance``; when not ``None`` it must
            support ``+ ['class']`` (i.e. be a list).

    Returns:
        A tuple ``(instance, outlier_count)`` where ``outlier_count`` is
        ``len()`` of the value returned by ``repair.filter()``.
    """
    if filenames is None:
        filenames = {
            'error': '../dataset/restaurant/data_error',
            'origin': '../dataset/restaurant/data_origin',
        }
    if used_attrs is not None:
        schema = used_attrs + ['class']

    DORCRepair.sigma_k = sigma_k
    instance = Instance(schema, filenames['error'], data_size=data_size,
                        used_attrs=used_attrs)
    repair = DORCRepair(instance, neighbor_k, epsilon)
    repair.calculate_epsilon()
    outliers = repair.filter()

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print('epsilon: %f' % repair.epsilon)
    print(outliers)

    solutions = repair.repair()
    for record_id, solution_id in solutions:
        # Store a clone (as repair_main does) so the repaired record and its
        # donor are separate objects; a later mutation of one must not leak
        # into the other.
        # NOTE(review): unlike repair_main, the repaired record's own 'class'
        # value is not preserved here - confirm whether that is intentional.
        instance.data[record_id] = instance.get(solution_id).clone()

    return instance, len(outliers)