def matchBlocks(self, blocks, threshold=.5):
    """
    Partitions blocked data and returns a list of clusters, where
    each cluster is a tuple of record ids

    Keyword arguments:
    blocks -- Sequence of tuples of records, where each tuple is a
              set of records covered by a blocking predicate

    threshold -- Number between 0 and 1 (default is .5). We will
                 only consider as duplicates record pairs whose
                 estimated duplicate likelihood is greater than the
                 threshold.

                 Lowering the number will increase recall, raising it
                 will increase precision
    """
    # Setting the cluster threshold this way is not principled,
    # but seems to reliably help performance
    cluster_threshold = threshold * 0.7

    candidate_records = self._blockedPairs(blocks)

    self.matches = core.scoreDuplicates(candidate_records,
                                        self.data_model,
                                        self.num_processes,
                                        threshold)

    clusters = self._cluster(self.matches, cluster_threshold)

    return clusters
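# Illustrative only: a minimal sketch of what the `blocks` argument described
# in the docstrings above and below might look like. The record ids, field
# names, and values here are hypothetical, and the exact record format varies
# between versions and depends on how the data was prepared and blocked
# upstream.
example_blocks = [
    # one block per blocking predicate match: a tuple of (record_id, record) pairs
    ((1, {'name': 'ACME Corp', 'city': 'Chicago'}),
     (2, {'name': 'Acme Corporation', 'city': 'Chicago'})),
    ((2, {'name': 'Acme Corporation', 'city': 'Chicago'}),
     (3, {'name': 'Apex Corp', 'city': 'Chicago'})),
]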
def duplicateClusters(self, blocks, threshold=.5):
    """
    Partitions blocked data and returns a list of clusters, where
    each cluster is a tuple of record ids

    Keyword arguments:
    blocks -- Sequence of tuples of records, where each tuple is a
              set of records covered by a blocking predicate

    threshold -- Number between 0 and 1 (default is .5). We will
                 only consider as duplicates record pairs whose
                 estimated duplicate likelihood is greater than the
                 threshold.

                 Lowering the number will increase recall, raising it
                 will increase precision
    """
    # Setting the cluster threshold this way is not principled,
    # but seems to reliably help performance
    cluster_threshold = threshold * 0.7

    candidates = (pair
                  for block in blocks
                  for pair in itertools.combinations(block, 2))

    self.dupes = core.scoreDuplicates(candidates,
                                      self.data_model,
                                      threshold)

    clusters = clustering.cluster(self.dupes, cluster_threshold)

    return clusters
def matchBlocks(self, blocks, threshold=0.5, *args, **kwargs):
    """
    Partitions blocked data and returns a list of clusters, where
    each cluster is a tuple of record ids

    Keyword arguments:
    blocks -- Sequence of tuples of records, where each tuple is a
              set of records covered by a blocking predicate

    threshold -- Number between 0 and 1 (default is .5). We will
                 only consider as duplicates record pairs whose
                 estimated duplicate likelihood is greater than the
                 threshold.

                 Lowering the number will increase recall, raising it
                 will increase precision
    """
    candidate_records = self._blockedPairs(blocks)

    matches = core.scoreDuplicates(candidate_records,
                                   self.data_model,
                                   self.classifier,
                                   self.num_cores,
                                   threshold)

    logger.debug("matching done, begin clustering")

    clusters = self._cluster(matches, threshold, *args, **kwargs)

    try:
        match_file = matches.filename
        del matches
        os.remove(match_file)
    except AttributeError:
        pass

    return clusters
def duplicateClusters(self, blocks, threshold=0.5, parallel=False):
    """
    Partitions blocked data and returns a list of clusters, where
    each cluster is a tuple of record ids

    Keyword arguments:
    blocks -- Sequence of tuples of records, where each tuple is a
              set of records covered by a blocking predicate

    threshold -- Number between 0 and 1 (default is .5). We will
                 only consider as duplicates record pairs whose
                 estimated duplicate likelihood is greater than the
                 threshold.

                 Lowering the number will increase recall, raising it
                 will increase precision
    """
    # Setting the cluster threshold this way is not principled,
    # but seems to reliably help performance
    cluster_threshold = threshold * 0.7

    candidates = (pair
                  for block in blocks
                  for pair in itertools.combinations(block, 2))

    if parallel:
        # These module-level globals are read by _mapScoreDuplicates,
        # which has to live at module scope so multiprocessing.Pool
        # can pickle it
        global globalThreshold
        globalThreshold = threshold
        global globalDataModel
        globalDataModel = self.data_model

        pool = Pool(processes=self.processes)

        start = time.time()
        self.dupes = itertools.chain.from_iterable(
            pool.imap(_mapScoreDuplicates,
                      self._splitEvery(100, candidates)))
        elapsed = time.time() - start
        print "Parallel scoreDuplicates with", self.processes, "processes takes :", elapsed
    else:
        start = time.time()
        self.dupes = core.scoreDuplicates(candidates,
                                          self.data_model,
                                          threshold)
        elapsed = time.time() - start
        print "Serial scoreDuplicates takes : ", elapsed

    clusters = clustering.cluster(self.dupes, cluster_threshold)

    return clusters
def duplicateClusters(self, blocks, data, constrained_matching=False, threshold=.5):
    """
    Partitions blocked data and returns a list of clusters, where
    each cluster is a tuple of record ids

    Keyword arguments:
    blocks -- Sequence of tuples of records, where each tuple is a
              set of records covered by a blocking predicate

    threshold -- Number between 0 and 1 (default is .5). We will
                 only consider as duplicates record pairs whose
                 estimated duplicate likelihood is greater than the
                 threshold.

                 Lowering the number will increase recall, raising it
                 will increase precision
    """
    # Setting the cluster threshold this way is not principled,
    # but seems to reliably help performance
    cluster_threshold = threshold * 0.7

    blocked_keys, blocked_records = core.split(
        (block.keys(), block.values()) for block in blocks)

    candidate_keys = core.blockedPairs(blocked_keys,
                                       constrained_matching,
                                       data)
    candidate_records = core.blockedPairs(blocked_records,
                                          constrained_matching)

    self.dupes = core.scoreDuplicates(candidate_keys,
                                      candidate_records,
                                      self.data_model,
                                      threshold)

    if constrained_matching:
        clusters = clustering.clusterConstrained(self.dupes,
                                                 cluster_threshold)
    else:
        clusters = clustering.cluster(self.dupes, cluster_threshold)

    return clusters
def duplicateClusters(self, blocks, threshold=.5):
    """
    Partitions blocked data and returns a list of clusters, where
    each cluster is a tuple of record ids

    Keyword arguments:
    blocks -- Sequence of tuples of records, where each tuple is a
              set of records covered by a blocking predicate

    threshold -- Number between 0 and 1 (default is .5). We will
                 only consider as duplicates record pairs whose
                 estimated duplicate likelihood is greater than the
                 threshold.

                 Lowering the number will increase recall, raising it
                 will increase precision
    """
    # Setting the cluster threshold this way is not principled,
    # but seems to reliably help performance
    cluster_threshold = threshold * 0.7

    blocked_keys, blocked_records = core.split(
        (block.keys(), block.values()) for block in blocks)

    candidate_keys = core.blockedPairs(blocked_keys)
    candidate_records = core.blockedPairs(blocked_records)

    # Peek at the first pair of ids to learn the id type, then
    # reattach the peeked element so the iterator is left intact
    candidate_keys, ids = itertools.tee(candidate_keys)
    peek = ids.next()
    id_type = type(peek[0])
    ids = itertools.chain([peek], ids)

    self.dupes = core.scoreDuplicates(candidate_keys,
                                      candidate_records,
                                      id_type,
                                      self.data_model,
                                      threshold)

    clusters = clustering.cluster(self.dupes, id_type, cluster_threshold)

    return clusters
def matchBlocks(self, blocks, threshold=.5, *args, **kwargs):  # pragma: no cover
    """
    Partitions blocked data and returns a list of clusters, where
    each cluster is a tuple of record ids

    Keyword arguments:
    blocks -- Sequence of tuples of records, where each tuple is a
              set of records covered by a blocking predicate

    threshold -- Number between 0 and 1 (default is .5). We will
                 only consider as duplicates record pairs whose
                 estimated duplicate likelihood is greater than the
                 threshold.

                 Lowering the number will increase recall, raising it
                 will increase precision
    """
    # Setting the cluster threshold this way is not principled,
    # but seems to reliably help performance
    cluster_threshold = threshold * 0.7

    candidate_records = self._blockedPairs(blocks)

    matches = core.scoreDuplicates(candidate_records,
                                   self.data_model,
                                   self.classifier,
                                   self.num_cores,
                                   threshold)

    logger.debug("matching done, begin clustering")

    clusters = self._cluster(matches, cluster_threshold, *args, **kwargs)

    try:
        match_file = matches.filename
        del matches
        os.remove(match_file)
    except AttributeError:
        pass

    return clusters
def thresholdBlocks(self, blocks, recall_weight=1.5):  # pragma: nocover
    """
    Returns the threshold that maximizes the expected F score, a
    weighted average of precision and recall for a sample of
    blocked data.

    Arguments:
    blocks -- Sequence of tuples of records, where each tuple is a
              set of records covered by a blocking predicate

    recall_weight -- Sets the tradeoff between precision and
                     recall. I.e. if you care twice as much about
                     recall as you do precision, set recall_weight
                     to 2.
    """
    candidate_records = itertools.chain.from_iterable(
        self._blockedPairs(blocks))

    probability = core.scoreDuplicates(candidate_records,
                                       self.data_model,
                                       self.classifier,
                                       self.num_cores)['score']

    probability = probability.copy()
    probability.sort()
    probability = probability[::-1]

    expected_dupes = numpy.cumsum(probability)

    recall = expected_dupes / expected_dupes[-1]
    precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)

    score = recall * precision / (recall + recall_weight**2 * precision)

    i = numpy.argmax(score)

    logger.info('Maximum expected recall and precision')
    logger.info('recall: %2.3f', recall[i])
    logger.info('precision: %2.3f', precision[i])
    logger.info('With threshold: %2.3f', probability[i])

    return probability[i]
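# A small self-contained sketch (not part of the library) of the expected
# precision/recall bookkeeping used in thresholdBlocks, on a made-up score
# array. Treating each score as a calibrated duplicate probability, the sum
# of the top-k scores estimates how many true duplicates are among the k
# highest-scoring pairs, which gives an expected precision and recall for
# every possible cutoff; the returned threshold is the score at the cutoff
# that maximizes the weighted F score.
import numpy

scores = numpy.array([0.95, 0.9, 0.8, 0.4, 0.1])  # hypothetical, sorted descending
expected_dupes = numpy.cumsum(scores)             # expected true dupes among the top-k pairs
recall = expected_dupes / expected_dupes[-1]      # share of all expected dupes recovered
precision = expected_dupes / numpy.arange(1, len(scores) + 1)
recall_weight = 1.5
f_score = recall * precision / (recall + recall_weight ** 2 * precision)
best = numpy.argmax(f_score)
print(scores[best])                               # the score at the best cutoff is the threshold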
def thresholdBlocks(self, blocks, recall_weight=1.5):  # pragma: nocover
    """
    Returns the threshold that maximizes the expected F score, a
    weighted average of precision and recall for a sample of
    blocked data.

    Arguments:
    blocks -- Sequence of tuples of records, where each tuple is a
              set of records covered by a blocking predicate

    recall_weight -- Sets the tradeoff between precision and
                     recall. I.e. if you care twice as much about
                     recall as you do precision, set recall_weight
                     to 2.
    """
    candidate_records = itertools.chain.from_iterable(self._blockedPairs(blocks))

    probability = core.scoreDuplicates(candidate_records,
                                       self.data_model,
                                       self.classifier,
                                       self.num_cores)['score']

    probability = probability.copy()
    probability.sort()
    probability = probability[::-1]

    expected_dupes = numpy.cumsum(probability)

    recall = expected_dupes / expected_dupes[-1]
    precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)

    score = recall * precision / (recall + recall_weight ** 2 * precision)

    i = numpy.argmax(score)

    logger.info('Maximum expected recall and precision')
    logger.info('recall: %2.3f', recall[i])
    logger.info('precision: %2.3f', precision[i])
    logger.info('With threshold: %2.3f', probability[i])

    return probability[i]
def matchBlocks(self, blocks, threshold=.5, *args, **kwargs):
    """
    Partitions blocked data and generates a sequence of clusters,
    where each cluster is a tuple of record ids

    Keyword arguments:
    blocks -- Sequence of tuples of records, where each tuple is a
              set of records covered by a blocking predicate

    threshold -- Number between 0 and 1 (default is .5). We will
                 only consider as duplicates record pairs whose
                 estimated duplicate likelihood is greater than the
                 threshold.

                 Lowering the number will increase recall, raising it
                 will increase precision
    """
    candidate_records = itertools.chain.from_iterable(
        self._blockedPairs(blocks))

    matches = core.scoreDuplicates(candidate_records,
                                   self.data_model,
                                   self.classifier,
                                   self.num_cores,
                                   threshold=0)

    logger.debug("matching done, begin clustering")

    for cluster in self._cluster(matches, threshold, *args, **kwargs):
        yield cluster

    try:
        match_file = matches.filename
        del matches
        os.remove(match_file)
    except AttributeError:
        pass
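# Hypothetical usage sketch, not taken from the library source: pick a
# threshold from a sample of blocked pairs with thresholdBlocks, then stream
# clusters from the generator version of matchBlocks. `deduper` and the two
# block sequences are assumed to have been produced by the usual training and
# blocking steps; if the blocks are a one-shot generator, they would need to
# be regenerated between the two calls.
threshold = deduper.thresholdBlocks(sample_blocks, recall_weight=2)
for cluster in deduper.matchBlocks(blocked_data, threshold):
    print(cluster)  # a tuple of record ids judged to refer to the same entity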
def thresholdBlocks(self, blocks, recall_weight=1.5):
    """
    Returns the threshold that maximizes the expected F score, a
    weighted average of precision and recall for a sample of
    blocked data.

    Keyword arguments:
    blocks -- Sequence of tuples of records, where each tuple is a
              set of records covered by a blocking predicate

    recall_weight -- Sets the tradeoff between precision and
                     recall. I.e. if you care twice as much about
                     recall as you do precision, set recall_weight
                     to 2.
    """
    probability = core.scoreDuplicates(self._blockedPairs(blocks),
                                       self.data_model,
                                       self.num_processes)["score"]

    probability.sort()
    probability = probability[::-1]

    expected_dupes = numpy.cumsum(probability)

    recall = expected_dupes / expected_dupes[-1]
    precision = expected_dupes / numpy.arange(1, len(expected_dupes) + 1)

    score = recall * precision / (recall + recall_weight ** 2 * precision)

    i = numpy.argmax(score)

    logger.info("Maximum expected recall and precision")
    logger.info("recall: %2.3f", recall[i])
    logger.info("precision: %2.3f", precision[i])
    logger.info("With threshold: %2.3f", probability[i])

    return probability[i]
def matchBlocks(self, blocks, threshold=.5):
    """
    Partitions blocked data and returns a list of clusters, where
    each cluster is a tuple of record ids

    Keyword arguments:
    blocks -- Sequence of tuples of records, where each tuple is a
              set of records covered by a blocking predicate

    threshold -- Number between 0 and 1 (default is .5). We will
                 only consider as duplicates record pairs whose
                 estimated duplicate likelihood is greater than the
                 threshold.

                 Lowering the number will increase recall, raising it
                 will increase precision
    """
    # Setting the cluster threshold this way is not principled,
    # but seems to reliably help performance
    cluster_threshold = threshold * 0.7

    candidate_records = self._blockedPairs(blocks)

    self.matches = core.scoreDuplicates(candidate_records,
                                        self.data_model,
                                        self.num_processes,
                                        threshold)

    logger.info("matching done, begin clustering")

    clusters = self._cluster(self.matches, cluster_threshold)

    return clusters
def _mapScoreDuplicates(candidates):
    # Module-level helper so multiprocessing.Pool can pickle it; it reads the
    # data model and threshold from module-level globals set by
    # duplicateClusters(parallel=True)
    return core.scoreDuplicates(candidates, globalDataModel, globalThreshold)
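# Why a module-level helper plus globals: callables handed to Pool.imap are
# pickled per task, so they must be importable at module top level, and on
# fork-based platforms globals set in the parent before the Pool is created
# are inherited by the workers, so large shared state (here, the data model)
# does not have to be re-serialized with every chunk. A minimal standalone
# sketch of the same pattern with made-up names; it assumes a fork start
# method (on Windows/spawn the global would not be inherited):
import itertools
from multiprocessing import Pool

globalState = None

def _work(chunk):
    # reads the shared state set up by the parent process before forking
    return [x * globalState for x in chunk]

if __name__ == '__main__':
    globalState = 10
    pool = Pool(processes=2)
    chunks = [[1, 2], [3, 4]]
    results = list(itertools.chain.from_iterable(pool.imap(_work, chunks)))
    # results == [10, 20, 30, 40]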
# Python 2 script fragment: report the number of comparisons and the learned
# weights, score and cluster the candidate pairs, then read the original rows
# back in before writing the results
print len(candidates),
print "comparisons."

print "Learned Weights"
for k1, v1 in data_model.items():
    try:
        for k2, v2 in v1.items():
            print (k2, v2['weight'])
    except:
        print (k1, v1)

print ""
print "finding duplicates ..."
print ""

dupes = core.scoreDuplicates(candidates, data_d, data_model, .5)
clustered_dupes = clustering.cluster(dupes, estimated_dupe_fraction=0.4)

print "# duplicate sets"
print len(clustered_dupes)

orig_data = {}
with open(inputFile) as f:
    reader = csv.reader(f)
    reader.next()
    for row_id, row in enumerate(reader):
        orig_data[row_id] = row

with open("output/TL_dupes_list_" + str(time.time()) + ".csv", "w") as f:
    writer = csv.writer(f)