def _addTrainingData(self, labeled_pairs):
    """
    Appends training data to the training data collection.
    """
    for label, examples in labeled_pairs.items():
        n_examples = len(examples)
        labels = [label] * n_examples

        # Build a structured array with one row per example pair,
        # matching the dtype of the existing training data.
        new_data = numpy.empty(n_examples,
                               dtype=self.training_data.dtype)
        new_data['label'] = labels
        new_data['distances'] = core.fieldDistances(examples,
                                                    self.data_model)

        self.training_data = numpy.append(self.training_data, new_data)
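
# Minimal sketch of the inputs _addTrainingData expects. The field names,
# dtype, and labels below are illustrative assumptions, not the library's
# API: labeled_pairs maps a label (1 = match, 0 = distinct) to a list of
# record pairs, and training_data is a structured array pairing each
# label with a field-distance vector.
import numpy

labeled_pairs = {
    1: [({'name': 'Bob Smith'}, {'name': 'Robert Smith'})],
    0: [({'name': 'Bob Smith'}, {'name': 'Alice Jones'})],
}

n_fields = 1  # assumed number of comparison fields
training_data = numpy.empty(0, dtype=[('label', 'i4'),
                                      ('distances', 'f4', (n_fields,))])
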
def goodThreshold(self, blocks, constrained_matching=False,
                  recall_weight=1.5):
    """
    Returns the threshold that maximizes the expected F score, a
    weighted average of precision and recall for a sample of
    blocked data.

    Keyword arguments:
    blocks -- Sequence of tuples of records, where each tuple is a
              set of records covered by a blocking predicate

    constrained_matching -- Boolean flag passed through to
                            core.blockedPairs to restrict how
                            candidate pairs are generated
                            (default False)

    recall_weight -- Sets the tradeoff between precision and
                     recall. I.e. if you care twice as much about
                     recall as you do precision, set recall_weight
                     to 2.
    """
    blocked_records = (block.values() for block in blocks)
    candidates = core.blockedPairs(blocked_records, constrained_matching)

    field_distances = core.fieldDistances(candidates, self.data_model)
    probability = core.scorePairs(field_distances, self.data_model)

    # Sort scores in descending order so that the cumulative sum at
    # position i is the expected number of true duplicates among the
    # i highest-scoring pairs.
    probability.sort()
    probability = probability[::-1]

    expected_dupes = numpy.cumsum(probability)

    recall = expected_dupes / expected_dupes[-1]
    precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)

    # F-beta score with beta = recall_weight, up to a constant factor
    # that does not affect the argmax.
    score = recall * precision / (recall + recall_weight ** 2 * precision)

    i = numpy.argmax(score)

    logging.info('Maximum expected recall and precision')
    logging.info('recall: %2.3f', recall[i])
    logging.info('precision: %2.3f', precision[i])
    logging.info('With threshold: %2.3f', probability[i])

    return probability[i]
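
# Standalone sketch of the threshold selection above, assuming only an
# array of match probabilities for candidate pairs (here simulated with
# random scores). It reproduces the F-score maximization without the
# blocking and scoring machinery.
import numpy

probability = numpy.sort(numpy.random.uniform(size=1000))[::-1]
recall_weight = 1.5  # care 1.5x as much about recall as precision

expected_dupes = numpy.cumsum(probability)
recall = expected_dupes / expected_dupes[-1]
precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)

score = recall * precision / (recall + recall_weight ** 2 * precision)
threshold = probability[numpy.argmax(score)]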