def activeLearning(data_d, data_model, labelPairFunction, num_questions) :
    """
    Ask the labeling function about the `num_questions` pairs the model is
    most uncertain of, retraining the model after each answer.

    Arguments:
    data_d            -- dict of records keyed by record id
    data_model        -- current field-weight model, retrained each iteration
    labelPairFunction -- callable(pairs, data_d, data_model) returning a
                         (nonduplicates, duplicates) indexable of labeled pairs
    num_questions     -- number of uncertain pairs to ask about

    Returns (training_data, training_pairs, data_model) where training_pairs
    is {0: nonduplicates, 1: duplicates}.
    """
    training_data = []
    duplicates = []
    nonduplicates = []
    # fixed number of training iterations passed to core.trainModel
    num_iterations = 100
    # consider every candidate pair produced by blocking
    pairs = blocking.allCandidates(data_d)
    record_distances = core.recordDistances(pairs, data_d, data_model)
    for _ in range(num_questions) :
        print "finding the next uncertain pair ..."
        uncertain_indices = findUncertainPairs(record_distances, data_model)
        # Reorder record_distances by uncertainty, then pop the most
        # uncertain pair off the front.
        # NOTE(review): 2-D style indexing `[:, indices]` on an array whose
        # 'pairs' field is read below as if 1-D — confirm the shape/dtype
        # that core.recordDistances actually returns.
        record_distances = record_distances[: , uncertain_indices]
        uncertain_pairs = record_distances['pairs'][0:1]
        record_distances = record_distances[1:]
        labeled_pairs = labelPairFunction(uncertain_pairs, data_d, data_model)
        # labeled_pairs[0] are confirmed distinct, labeled_pairs[1] duplicates
        nonduplicates.extend(labeled_pairs[0])
        duplicates.extend(labeled_pairs[1])
        training_data = addTrainingData(labeled_pairs, training_data, data_model)
        # retrain on the accumulated labels before the next question
        data_model = core.trainModel(training_data, num_iterations, data_model)
    training_pairs = {0 : nonduplicates, 1 : duplicates}
    return(training_data, training_pairs, data_model)
def semiSupervisedNonDuplicates(data_d, data_model,
                                nonduplicate_confidence_threshold=.7,
                                sample_size = 2000):
    """
    Sample up to `sample_size` record pairs that the model scores as
    confident non-duplicates.

    Arguments:
    data_d     -- dict of records keyed by record id
    data_model -- field-weight model used to score pairs
    nonduplicate_confidence_threshold -- pairs scoring below
                  (1 - threshold) are accepted as distinct
    sample_size -- maximum number of pairs to collect

    Returns a list of record-value pairs. If there are no more than
    `sample_size` total combinations, all combinations are returned
    unscored.
    """
    pair_combinations = list(combinations(data_d.iteritems(), 2))

    if len(pair_combinations) <= sample_size :
        return pair_combinations

    # examine pairs in random order so the sample is unbiased
    shuffle(pair_combinations)

    confident_distinct_pairs = []
    n_distinct_pairs = 0
    for pair in pair_combinations :
        pair_distance = core.recordDistances([pair], data_model)
        score = core.scorePairs(pair_distance, data_model)

        if score < (1 - nonduplicate_confidence_threshold):
            # drop the record ids, keep only the record values
            key_pair, value_pair = zip(*pair)
            confident_distinct_pairs.append(value_pair)
            n_distinct_pairs += 1
            if n_distinct_pairs == sample_size :
                break

    # BUG FIX: previously the function only returned when exactly
    # sample_size pairs were found; if fewer confident pairs existed it
    # fell off the end and implicitly returned None. Always return the
    # collected pairs.
    return confident_distinct_pairs
def activeLearning(data_d, data_model, labelPairFunction, training_data, training_pairs = None, key_groups = [] ): duplicates = [] nonduplicates = [] if training_pairs : nonduplicates.extend(training_pairs[0]) duplicates.extend(training_pairs[1]) finished = False candidates = blocking.allCandidates(data_d, key_groups) import time t_train = time.time() record_distances = core.recordDistances(candidates, data_model) print 'calculated recordDistances in ', time.time() - t_train, 'seconds' while finished == False : print 'finding the next uncertain pair ...' uncertain_indices = findUncertainPairs(record_distances, data_model) # pop the next most uncertain pair off of record distances record_distances = record_distances[:, uncertain_indices] uncertain_pair_ids = (record_distances['pairs'])[0:1] record_distances = record_distances[1:] uncertain_pairs = [] for pair in uncertain_pair_ids : record_pair = [data_d[instance] for instance in pair] record_pair = tuple(record_pair) uncertain_pairs.append(record_pair) labeled_pairs, finished = labelPairFunction(uncertain_pairs, data_model) nonduplicates.extend(labeled_pairs[0]) duplicates.extend(labeled_pairs[1]) training_data = addTrainingData(labeled_pairs, data_model, training_data) if len(training_data) > 0 : data_model = core.trainModel(training_data, data_model, 1) else : raise ValueError("No training pairs given") training_pairs = {0: nonduplicates, 1: duplicates} return (training_data, training_pairs, data_model)
def semiSupervisedNonDuplicates(data_d, data_model,
                                nonduplicate_confidence_threshold=.7):
    """
    Return the record pairs that the current model scores as confident
    non-duplicates.

    Arguments:
    data_d     -- dict of records keyed by record id
    data_model -- field-weight model used to score candidate pairs
    nonduplicate_confidence_threshold -- pairs scoring below
                  (1 - threshold) are treated as distinct

    Returns a list of (record, record) value pairs.
    """
    # this is an expensive call and we're making it multiple times
    # NOTE(review): calls bare `allCandidates`, not blocking.allCandidates
    # as the sibling functions do — presumably imported directly; confirm.
    pairs = allCandidates(data_d)
    record_distances = core.recordDistances(pairs, data_d, data_model)
    scored_pairs = core.scorePairs(record_distances, data_model)

    cutoff = 1 - nonduplicate_confidence_threshold
    confident_nondupes_ids = [record_distances['pairs'][i]
                              for i, score in enumerate(scored_pairs)
                              if score < cutoff]

    # map the surviving id pairs back to the records themselves
    return [(data_d[pair[0]], data_d[pair[1]])
            for pair in confident_nondupes_ids]
def goodThreshold(self, blocks, recall_weight=1.5):
    """
    Returns the threshold that maximizes the expected F score, a weighted
    average of precision and recall for a sample of blocked data.

    Keyword arguments:
    blocks -- Sequence of tuples of records, where each tuple is a set of
              records covered by a blocking predicate
    recall_weight -- Sets the tradeoff between precision and recall. I.e.
                     if you care twice as much about recall as you do
                     precision, set recall_weight to 2.
    """
    # every within-block pair is a candidate duplicate
    candidates = (pair
                  for block in blocks
                  for pair in itertools.combinations(block, 2))

    record_distances = core.recordDistances(candidates, self.data_model)
    probability = core.scorePairs(record_distances, self.data_model)

    # scores in descending order of duplicate probability
    probability = numpy.sort(probability)[::-1]

    # treating each score as an expected duplicate count, the running sum
    # gives expected true positives at each candidate threshold
    expected_dupes = numpy.cumsum(probability)
    recall = expected_dupes / expected_dupes[-1]
    precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)

    # weighted F score; the constant (1 + beta^2) factor is omitted since
    # it does not affect the argmax
    score = recall * precision / (recall + recall_weight ** 2 * precision)

    best = numpy.argmax(score)

    logging.info("Maximum expected recall and precision")
    logging.info("recall: %2.3f" % recall[best])
    logging.info("precision: %2.3f" % precision[best])
    logging.info("With threshold: %2.3f" % probability[best])

    return probability[best]
def semiSupervisedNonDuplicates(data_sample, data_model,
                                nonduplicate_confidence_threshold=.7,
                                sample_size = 2000):
    """
    From a pre-drawn sample of record pairs, collect up to `sample_size`
    pairs that the model scores as confident non-duplicates.

    Arguments:
    data_sample -- sequence of ((id, record), (id, record)) pairs
    data_model  -- field-weight model used to score pairs
    nonduplicate_confidence_threshold -- pairs scoring below
                  (1 - threshold) are accepted as distinct
    sample_size -- maximum number of pairs to collect

    Returns a list of record-value pairs. If the sample is no larger than
    `sample_size`, it is returned unscored.
    """
    if len(data_sample) <= sample_size :
        return data_sample

    confident_distinct_pairs = []
    n_distinct_pairs = 0
    for pair in data_sample :
        pair_distance = core.recordDistances([pair], data_model)
        score = core.scorePairs(pair_distance, data_model)

        if score < (1 - nonduplicate_confidence_threshold):
            # drop the record ids, keep only the record values
            key_pair, value_pair = zip(*pair)
            confident_distinct_pairs.append(value_pair)
            n_distinct_pairs += 1
            if n_distinct_pairs == sample_size :
                break

    # BUG FIX: previously the function only returned when exactly
    # sample_size pairs were found; if fewer confident pairs existed it
    # fell off the end and implicitly returned None. Always return the
    # collected pairs.
    return confident_distinct_pairs
def activeLearning(candidates, data_model, labelPairFunction, training_data,
                   training_pairs=None, key_groups=None):
    """
    Ask the user to label the record pair we are most uncertain of. Train
    the data model, and update our uncertainty. Repeat until user tells
    us she is finished.

    Arguments:
    candidates        -- sequence of ((id, record), (id, record)) pairs
    data_model        -- current field-weight model, retrained each round
    labelPairFunction -- callable(pairs, data_model) returning
                         ((nonduplicates, duplicates), finished)
    training_data     -- existing training data to extend
    training_pairs    -- optional {0: nonduplicates, 1: duplicates} seed
    key_groups        -- unused here; kept for interface compatibility

    Returns (training_data, training_pairs, data_model).
    """
    # BUG FIX: key_groups previously defaulted to a mutable list ([]),
    # shared across calls; use None as the sentinel. The parameter is not
    # referenced in this body.
    if key_groups is None:
        key_groups = []

    duplicates = []
    nonduplicates = []

    if training_pairs:
        nonduplicates.extend(training_pairs[0])
        duplicates.extend(training_pairs[1])

    finished = False

    import time
    t_train = time.time()
    record_distances = core.recordDistances(candidates, data_model)
    logging.info('calculated recordDistances in %s seconds'
                 % str(time.time() - t_train))

    seen_indices = set()

    while finished == False :
        logging.info('finding the next uncertain pair ...')
        uncertain_indices = findUncertainPairs(record_distances, data_model)

        # take the most uncertain candidate we have not yet shown the user
        # NOTE(review): if every index has been seen, uncertain_index keeps
        # its last value and the same pair is asked again — confirm whether
        # findUncertainPairs can exhaust; an explicit stop may be wanted.
        for uncertain_index in uncertain_indices:
            if uncertain_index not in seen_indices:
                seen_indices.add(uncertain_index)
                break

        uncertain_pairs = [(candidates[uncertain_index][0][1],
                            candidates[uncertain_index][1][1])]

        labeled_pairs, finished = labelPairFunction(uncertain_pairs, data_model)

        nonduplicates.extend(labeled_pairs[0])
        duplicates.extend(labeled_pairs[1])

        training_data = addTrainingData(labeled_pairs, data_model, training_data)
        if len(training_data) > 0:
            data_model = core.trainModel(training_data, data_model, 1)
        else:
            raise ValueError("No training pairs given")

    training_pairs = {0: nonduplicates, 1: duplicates}

    return (training_data, training_pairs, data_model)