def rswoosh(self, I):
    """
    RSwoosh - Benjelloun et al. 2009
    Performs entity resolution on any set of records using merge and match functions

    The input set is consumed: records are popped from I until it is empty,
    and a record that has absorbed a match is put back into I so that it is
    compared against the resolved set again.

    :param I: Set of input records (emptied by this call)
    :return Inew: Set of resolved entities (records)
    """
    Inew = set()  # resolved entities; a record joins only after matching no current member
    while I:  # until entity resolution is complete
        currentrecord = I.pop()  # an arbitrary record
        # None (not False) marks "no match yet", so that a record which
        # happens to evaluate as falsy is still recognised as a match.
        buddy = None
        prob = None
        for rnew in Inew:
            is_match, prob = self._match_function.match(currentrecord, rnew)
            if is_match:
                buddy = rnew
                break  # Found a match!
        if buddy is None:
            Inew.add(currentrecord)
            continue
        # Single-argument print calls behave the same as a statement (Python 2)
        # and as a function (Python 3). The doubled space reproduces the
        # separator the original comma-separated print statement emitted.
        print('Merging records with P(match) =  %s' % (prob,))
        print(' x2:  %s' % (get_weak_pairwise_features(currentrecord, buddy),))
        currentrecord.display(indent=' ')
        print(' ----')
        buddy.display(indent=' ')
        # merge() is used for its effect on currentrecord (its return value is
        # ignored); the merged record goes back to I for another pass.
        currentrecord.merge(buddy)
        I.add(currentrecord)
        Inew.discard(buddy)
    return Inew
def rswoosh(self, I):
    """
    RSwoosh - Benjelloun et al. 2009
    Performs entity resolution on any set of records using merge and match functions

    I is drained by this call: each iteration pops one record, and a record
    that was merged with a match is added back to I to be re-examined.

    :param I: Set of input records (emptied by this call)
    :return Inew: Set of resolved entities (records)
    """
    Inew = set()  # initialize the resolved entities
    while I:  # until entity resolution is complete
        currentrecord = I.pop()  # an arbitrary record
        # Use None as the "nothing matched" marker; testing the record's own
        # truthiness would skip the merge for any record that is falsy.
        buddy = None
        prob = None
        for rnew in Inew:  # iterate over Inew
            is_match, prob = self._match_function.match(currentrecord, rnew)
            if is_match:
                buddy = rnew
                break  # Found a match!
        if buddy is not None:
            # One pre-formatted argument per print keeps the output identical
            # under both the Python 2 statement and the Python 3 function; the
            # two spaces mirror the separator of the original print statement.
            print('Merging records with P(match) =  %s' % (prob,))
            print(' x2:  %s' % (
                get_weak_pairwise_features(currentrecord, buddy),))
            currentrecord.display(indent=' ')
            print(' ----')
            buddy.display(indent=' ')
            # The merged record replaces both originals: it returns to I and
            # the absorbed buddy leaves the resolved set.
            currentrecord.merge(buddy)
            I.add(currentrecord)
            Inew.discard(buddy)
        else:
            Inew.add(currentrecord)
    return Inew
def match(self, r1, r2):
    """
    Determines if two records match

    :param r1: Record object
    :param r2: Record object
    :return x1_hat: False or True, whether r1 and r2 match
    :return p_x1: Probability of weak match
    """
    x = get_weak_pairwise_features(r1, r2)
    np.copyto(x, self._x_mean, where=np.isnan(x))  # mean imputation
    # The result is indexed as [sample, class], so pass the features as a
    # one-row batch. NOTE(review): x looks like a single 1-D feature vector
    # here - confirm; atleast_2d leaves an already 2-D array untouched.
    prob = self._classifier.predict_proba(np.atleast_2d(x))[0, 1]
    is_match = prob >= self._decision_threshold
    if r1 == r2:
        # if records are the same, to satisfy Idempotence property
        is_match = True
    return is_match, prob
def match(self, r1, r2):
    """
    Determines if two records match

    :param r1: Record object
    :param r2: Record object
    :return x1_hat: False or True, whether r1 and r2 match
    :return p_x1: Probability of weak match
    """
    x2 = get_weak_pairwise_features(r1, r2)
    np.copyto(x2, self._x_mean, where=np.isnan(x2))  # mean imputation
    # predict_proba's output is read at row 0, column 1, so the features are
    # handed over as a batch containing one row. NOTE(review): x2 appears to
    # be one 1-D feature vector - confirm against get_weak_pairwise_features.
    p_x1 = self._logreg.predict_proba(np.atleast_2d(x2))[0, 1]
    # Strictly greater than the threshold counts as a match.
    x1_hat = p_x1 > self._decision_threshold
    if r1 == r2:
        # if records are the same, to satisfy Idempotence property
        x1_hat = True
    return x1_hat, p_x1
def batch_match(self, records):
    """
    Batch mode of match

    :param records: List of pairs of records
    :return match: List of booleans, whether the corresponding record tuple matches.
                   A pair whose two records compare equal is always a match.
    :return prob: Probability of weak match (an empty list when records is empty)
    """
    n = len(records)
    if n == 0:
        return [], []
    # The feature count is read from the first record's feature descriptor.
    m = records[0][0].feature_descriptor.number_weak
    X = np.empty([n, m])
    idempotence = list()
    for i, pair in enumerate(records):
        r1 = pair[0]
        r2 = pair[1]
        idempotence.append(r1 == r2)
        X[i, :] = get_weak_pairwise_features(r1, r2)
    np.copyto(X, self._x_mean, where=np.isnan(X))  # mean imputation
    prob = self._classifier.predict_proba(X)[:, 1]
    above_threshold = prob >= self._decision_threshold
    # Element-wise OR. `list(...) or idempotence` would apply `or` to the two
    # lists as wholes and always return the first (non-empty) one, so the
    # identical-record override was never applied.
    match = list(np.logical_or(above_threshold,
                               np.asarray(idempotence, dtype=bool)))
    return match, prob
def test_get_x2(self):
    """
    Checks the weak pairwise features of the first database record paired
    with itself: six leading zeros, thirteen NaNs, then exp(-3).
    """
    first = self._database.records[0]
    features = get_weak_pairwise_features(first, first)

    # Annotations carried over from the original per-line comments
    # (position in the feature vector -> bracketed label, feature kind):
    #    0 -> [1]  binary match     1 -> [2]  date diff
    #    2 -> [3]  bin              3 -> [4]  bin
    #    4 -> [7]  bin              5 -> [8]  num diff
    #    6 -> [9]  bin              7 -> [10] num diff
    #    8 -> [11] num diff         9 -> [12] bin
    #   10 -> [13] num diff        11 -> [14] num diff
    #   12 -> [15] num diff        13 -> [16] bin
    #   14 -> [17] bin             15 -> [18] bin
    #   16 -> [19] bin             17 -> [24] bin
    #   18 -> [25] bin             19 -> [26] number matches

    # Positions 0-5 are expected to be exactly zero.
    for position in range(6):
        self.assertEqual(features[position], 0)

    # Positions 6-18 are expected to be NaN.
    for position in range(6, 19):
        self.assertTrue(isnan(features[position]))

    # Position 19: number of matches.
    self.assertEqual(features[19], np.exp(-3))