def duplicateClusters(self, blocks, threshold=.5):
    """
    Partitions blocked data and returns a list of clusters, where
    each cluster is a tuple of record ids

    Keyword arguments:
    blocks -- Sequence of tuples of records, where each tuple is a
              set of records covered by a blocking predicate
    threshold -- Number between 0 and 1 (default is .5). We will
                 only consider record pairs as duplicates if their
                 estimated duplicate likelihood is greater than the
                 threshold. Lowering the number will increase
                 recall; raising it will increase precision.
    """

    # Setting the cluster threshold this way is not principled,
    # but seems to reliably help performance
    cluster_threshold = threshold * 0.7

    # Every pair of records within a block is a candidate duplicate
    candidates = (pair
                  for block in blocks
                  for pair in itertools.combinations(block, 2))

    self.dupes = core.scoreDuplicates(candidates,
                                      self.data_model,
                                      threshold)
    clusters = clustering.cluster(self.dupes, cluster_threshold)

    return clusters
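# A hedged usage sketch of the method above, not from the source: assumes
# `deduper` is an instance of the surrounding class with a trained
# data_model, and that `blocks` comes from an earlier blocking step.
clusters = deduper.duplicateClusters(blocks, threshold=0.5)
for cluster in clusters:
    print cluster  # each cluster is a tuple of record ids

# Per the docstring's tradeoff, a lower threshold (e.g. 0.3) favors recall
# and a higher one (e.g. 0.7) favors precision.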
def duplicateClusters(self, blocks, threshold=0.5, parallel=False):
    """
    Partitions blocked data and returns a list of clusters, where
    each cluster is a tuple of record ids

    Keyword arguments:
    blocks -- Sequence of tuples of records, where each tuple is a
              set of records covered by a blocking predicate
    threshold -- Number between 0 and 1 (default is .5). We will
                 only consider record pairs as duplicates if their
                 estimated duplicate likelihood is greater than the
                 threshold. Lowering the number will increase
                 recall; raising it will increase precision.
    parallel -- If True, score candidate pairs across multiple
                worker processes (default is False)
    """

    # Setting the cluster threshold this way is not principled,
    # but seems to reliably help performance
    cluster_threshold = threshold * 0.7

    candidates = (pair
                  for block in blocks
                  for pair in itertools.combinations(block, 2))

    if parallel:
        # Worker processes read these module-level globals, which they
        # inherit when the Pool forks
        global globalThreshold
        globalThreshold = threshold
        global globalDataModel
        globalDataModel = self.data_model

        pool = Pool(processes=self.processes)

        start = time.time()
        self.dupes = itertools.chain.from_iterable(
            pool.imap(_mapScoreDuplicates,
                      self._splitEvery(100, candidates)))
        elapsed = time.time() - start
        print "Parallel scoreDuplicates with", self.processes, \
            "processes took:", elapsed
    else:
        start = time.time()
        self.dupes = core.scoreDuplicates(candidates,
                                          self.data_model,
                                          threshold)
        elapsed = time.time() - start
        print "Serial scoreDuplicates took:", elapsed

    clusters = clustering.cluster(self.dupes, cluster_threshold)

    return clusters
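# The parallel branch above depends on two helpers that this excerpt does
# not define: `_mapScoreDuplicates` (module-level, so multiprocessing can
# pickle it) and `self._splitEvery`. The sketches below are illustrative
# reconstructions only, assuming the worker mirrors the serial branch's
# core.scoreDuplicates call.
def _mapScoreDuplicates(chunk):
    # Score one chunk of candidate pairs using the module-level globals
    # set before the Pool forked; materialize the result so it can be
    # pickled back to the parent process
    return list(core.scoreDuplicates(chunk, globalDataModel, globalThreshold))

def _splitEvery(self, n, iterable):
    # Yield successive lists of at most n items, so the candidate stream
    # can be fanned out to worker processes in bounded chunks
    iterator = iter(iterable)
    while True:
        chunk = list(itertools.islice(iterator, n))
        if not chunk:
            return
        yield chunk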
def duplicateClusters(self,
                      blocks,
                      data,
                      constrained_matching=False,
                      threshold=.5):
    """
    Partitions blocked data and returns a list of clusters, where
    each cluster is a tuple of record ids

    Keyword arguments:
    blocks -- Sequence of blocks, where each block is a dict mapping
              record ids to the records covered by a blocking
              predicate
    data -- The original data dictionary, passed through to
            core.blockedPairs when generating candidate pairs
    constrained_matching -- If True, cluster under one-to-one
                            matching constraints using
                            clustering.clusterConstrained
                            (default is False)
    threshold -- Number between 0 and 1 (default is .5). We will
                 only consider record pairs as duplicates if their
                 estimated duplicate likelihood is greater than the
                 threshold. Lowering the number will increase
                 recall; raising it will increase precision.
    """

    # Setting the cluster threshold this way is not principled,
    # but seems to reliably help performance
    cluster_threshold = threshold * 0.7

    # Separate each block into parallel streams of record ids and records
    blocked_keys, blocked_records = core.split(
        (block.keys(), block.values()) for block in blocks)

    candidate_keys = core.blockedPairs(blocked_keys,
                                       constrained_matching,
                                       data)
    candidate_records = core.blockedPairs(blocked_records,
                                          constrained_matching)

    self.dupes = core.scoreDuplicates(candidate_keys,
                                      candidate_records,
                                      self.data_model,
                                      threshold)

    if constrained_matching:
        clusters = clustering.clusterConstrained(self.dupes,
                                                 cluster_threshold)
    else:
        clusters = clustering.cluster(self.dupes, cluster_threshold)

    return clusters
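# This variant reads block.keys() and block.values(), so each block is
# assumed to be a dict keyed by record id rather than a plain tuple of
# records. A minimal sketch of that expected shape (ids and field names
# are illustrative, not from the source):
blocks = [
    {1: {"name": "Acme Corp", "city": "Chicago"},
     2: {"name": "ACME Corporation", "city": "Chicago"}},
    {2: {"name": "ACME Corporation", "city": "Chicago"},
     3: {"name": "Apex Ltd", "city": "Boston"}},
]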
def duplicateClusters(self, blocks, threshold=.5):
    """
    Partitions blocked data and returns a list of clusters, where
    each cluster is a tuple of record ids

    Keyword arguments:
    blocks -- Sequence of blocks, where each block is a dict mapping
              record ids to the records covered by a blocking
              predicate
    threshold -- Number between 0 and 1 (default is .5). We will
                 only consider record pairs as duplicates if their
                 estimated duplicate likelihood is greater than the
                 threshold. Lowering the number will increase
                 recall; raising it will increase precision.
    """

    # Setting the cluster threshold this way is not principled,
    # but seems to reliably help performance
    cluster_threshold = threshold * 0.7

    # Separate each block into parallel streams of record ids and records
    blocked_keys, blocked_records = core.split(
        (block.keys(), block.values()) for block in blocks)

    candidate_keys = core.blockedPairs(blocked_keys)
    candidate_records = core.blockedPairs(blocked_records)

    # Peek at the first pair of ids to learn the id type, then push the
    # peeked pair back so the stream is left intact
    candidate_keys, ids = itertools.tee(candidate_keys)
    peek = ids.next()
    id_type = type(peek[0])
    ids = itertools.chain([peek], ids)

    self.dupes = core.scoreDuplicates(candidate_keys,
                                      candidate_records,
                                      id_type,
                                      self.data_model,
                                      threshold)
    clusters = clustering.cluster(self.dupes, id_type, cluster_threshold)

    return clusters
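# The tee-and-peek dance above is a general trick for inspecting the first
# element of a generator without consuming it. A self-contained sketch of
# the same pattern (Python 2, matching the .next() idiom above):
import itertools

pairs = ((i, i + 1) for i in xrange(3))

pairs, ids = itertools.tee(pairs)
peek = ids.next()                   # look at the first pair
id_type = type(peek[0])             # <type 'int'> here
ids = itertools.chain([peek], ids)  # push the peeked pair back

assert list(ids) == [(0, 1), (1, 2), (2, 3)]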
print "comparisons." print "Learned Weights" for k1, v1 in data_model.items() : try: for k2, v2 in v1.items() : print (k2, v2['weight']) except : print (k1, v1) print "" print "finding duplicates ..." print "" dupes = core.scoreDuplicates(candidates, data_d, data_model, .5) clustered_dupes = clustering.cluster(dupes, estimated_dupe_fraction = 0.4) print "# duplicate sets" print len(clustered_dupes) orig_data = {} with open(inputFile) as f : reader = csv.reader(f) reader.next() for row_id, row in enumerate(reader) : orig_data[row_id] = row with open("output/TL_dupes_list_" + str(time.time()) + ".csv","w") as f : writer = csv.writer(f) heading_row = header
print "Learned Weights" for k1, v1 in data_model.items() : try: for k2, v2 in v1.items() : print (k2, v2['weight']) except : print (k1, v1) print "" print "finding duplicates ..." print "" dupes = scoreDuplicates(candidates, data_d, data_model) clustered_dupes = cluster(dupes, .2) # dupe_ids = set([frozenset(dupe_pair[0]) for dupe_pair in dupes]) # true_positives = dupe_ids & duplicates_s # false_positives = dupe_ids - duplicates_s # uncovered_dupes = duplicates_s - dupe_ids # # print "False negatives" # for pair in uncovered_dupes : # print "" # for instance in tuple(pair) : # print data_d[instance].values() # # print "____________________________________________" # print "False positives" #