def string(self, s1, s2, method='levenshtein', threshold=None, *args, **kwargs):
    """Add a string comparison to this object and return ``self``.

    Shortcut of :class:`recordlinkage.compare.String`::

        from recordlinkage.compare import String

        comparer = recordlinkage.Compare()
        comparer.add(String())

    Parameters
    ----------
    s1, s2 :
        Forwarded to ``String`` as its first and second positional
        arguments.
    method :
        Forwarded to ``String``. Defaults to ``'levenshtein'``.
    threshold :
        Forwarded to ``String``. Defaults to ``None``.
    *args, **kwargs :
        Any further arguments, forwarded to ``String`` unchanged.

    Returns
    -------
    self
        The same object, so calls can be chained.
    """
    # ``method`` and ``threshold`` are forwarded positionally on purpose.
    # In a call such as ``f(a, b, method=m, *rest)`` Python unpacks ``rest``
    # before binding keywords, so the first extra positional value would land
    # in the third slot and clash with ``method=`` (TypeError: multiple
    # values). Positional forwarding keeps the caller's argument order.
    # NOTE(review): assumes ``String`` takes method and threshold as its 3rd
    # and 4th positional parameters -- confirm against its signature.
    compare = String(s1, s2, method, threshold, *args, **kwargs)
    self.add(compare)
    return self
def compute_compare(self, pairs, df_a, df_b=None):
    """Register the configured comparisons and compute them for ``pairs``.

    Every entry of ``self.compare_on`` is expanded through
    ``self._unpack_dict(**entry)`` into
    ``(vartype, field, method, threshold, label)``. A ``'string'`` entry adds
    a ``String`` comparison and an ``'exact'`` entry adds an ``Exact``
    comparison, both comparing ``field`` against ``field``. Entries with any
    other variable type are ignored.

    Parameters
    ----------
    pairs, df_a, df_b :
        Passed through unchanged to ``self.comparator.compute``.

    Returns
    -------
    Whatever ``self.comparator.compute(pairs, df_a, df_b)`` returns.
    """
    # NOTE(review): comparisons are added on every call; whether repeated
    # calls pile up duplicates depends on the comparator -- confirm.
    for spec in self.compare_on:
        kind, column, algorithm, cutoff, name = self._unpack_dict(**spec)

        if kind == 'string':
            feature = String(column, column, algorithm, cutoff, 0, name)
        elif kind == 'exact':
            feature = Exact(column, column, 1, 0, 0, name)
        else:
            # Unsupported variable types contribute no comparison.
            continue

        self.comparator.add(feature)

    return self.comparator.compute(pairs, df_a, df_b)
def string(self, *args, **kwargs):
    """Add a string comparison built from the given arguments.

    Shortcut of :class:`recordlinkage.compare.String`::

        from recordlinkage.compare import String

        indexer = recordlinkage.Compare()
        indexer.add(String())

    All positional and keyword arguments are handed to ``String`` unchanged.
    The resulting object is passed to ``self.add`` and ``self`` is returned,
    so calls can be chained.
    """
    self.add(String(*args, **kwargs))
    return self
print(len(dfA), 'records in dataset A')
print(len(true_links), 'links in dataset A')

# start indexing
print('Build index...')

indexer = rl.Index()
# One blocking rule per key, added in this order.
for block_key in ('given_name', 'surname', 'soc_sec_id'):
    indexer.add(Block(block_key))
candidate_links = indexer.index(dfA)

# start comparing
print('Start comparing...')

comparer = rl.Compare()
# The order of this tuple is the order in which comparisons are added.
comparison_steps = (
    Exact('given_name', 'given_name', label='given_name'),
    String('surname', 'surname', method='jarowinkler', threshold=0.85,
           label='surname'),
    Exact('date_of_birth', 'date_of_birth', label='date_of_birth'),
    Exact('suburb', 'suburb', label='suburb'),
    Exact('state', 'state', label='state'),
    String('address_1', 'address_1', threshold=0.85, label='address_1'),
    String('address_2', 'address_2', threshold=0.85, label='address_2'),
)
for step in comparison_steps:
    comparer.add(step)

features = comparer.compute(candidate_links, dfA)
print('feature shape', features.shape)

# use the Logistic Regression Classifier
# this classifier is equivalent to the deterministic record linkage approach
# Seven weights for the seven comparisons added above; presumably one per
# feature, in the order added -- confirm where the classifier is configured.
intercept = -9.5
coefficients = [2.0, 3.0, 7.0, 6.0, 2.5, 5.0, 5.5]
'Jaro', 'Jaro-Winkler', 'Levenshtein', 'Damerau-Levenshtein', 'Smith-Waterman', 'LCS' ] i = 0 for method in methods: print(method) thresholds = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] precisions = [] recalls = [] fmeasures = [] times = [] for threshold in thresholds: compare = rl.Compare([ String('Security', 'Company', method=method, label="Company_score"), String('GICS Sector', 'Sector', method=method, label="Sector_score") ]) start = time.time() features = compare.compute(candidates, sp500_pre, forbes_pre) end = time.time() times.append(end - start) matches = merge_results(features, forbes, sp500, threshold) precisions.append( metrics.precision_score(true['Score'], matches['Score'])) recalls.append( metrics.recall_score(true['Score'], matches['Score']))