Пример #1
0
    def string(self,
               s1,
               s2,
               method='levenshtein',
               threshold=None,
               *args,
               **kwargs):
        """Add a string-similarity comparison to this object.

        Shortcut for building a :class:`recordlinkage.compare.String`
        and registering it through :meth:`add`::

            from recordlinkage.compare import String

            comparer = recordlinkage.Compare()
            comparer.add(String('surname', 'surname'))

        Parameters
        ----------
        s1, s2 :
            First and second positional argument of ``String``
            (presumably the labels of the two columns to compare --
            confirm against the ``String`` signature).
        method : str, default 'levenshtein'
            Forwarded to ``String`` as its third positional argument.
        threshold : optional, default None
            Forwarded to ``String`` as its fourth positional argument.
        *args, **kwargs :
            Any further arguments, forwarded unchanged to ``String``.

        Returns
        -------
        self
            The object the method was called on, so calls can be chained.
        """
        # ``method`` and ``threshold`` are forwarded positionally on purpose.
        # Passing them as keywords *and* unpacking ``*args`` would place the
        # extra positionals into the same slots and raise
        # ``TypeError: got multiple values for argument 'method'``.
        # NOTE(review): relies on ``String`` accepting ``method`` and
        # ``threshold`` as its third and fourth positional parameters.
        compare = String(s1, s2, method, threshold, *args, **kwargs)
        self.add(compare)

        return self
    def compute_compare(self, pairs, df_a, df_b=None):
        """Register the configured comparisons and compute them for ``pairs``.

        Every entry of ``self.compare_on`` is unpacked with
        ``self._unpack_dict`` into ``(vartype, field, method, threshold,
        label)``.  A ``'string'`` entry adds a ``String`` comparison and an
        ``'exact'`` entry adds an ``Exact`` comparison to
        ``self.comparator``; in both cases ``field`` is used for both
        column arguments.  Entries with any other ``vartype`` are skipped.

        Note that the comparisons are added on every call, so calling this
        method repeatedly adds them to ``self.comparator`` again.

        Returns whatever ``self.comparator.compute(pairs, df_a, df_b)``
        returns.
        """
        for spec in self.compare_on:
            kind, column, algorithm, cutoff, name = self._unpack_dict(**spec)

            if kind == 'string':
                comparison = String(column, column, algorithm, cutoff, 0, name)
            elif kind == 'exact':
                comparison = Exact(column, column, 1, 0, 0, name)
            else:
                # Unknown variable types contribute no comparison.
                continue

            self.comparator.add(comparison)

        return self.comparator.compute(pairs, df_a, df_b)
Пример #3
0
    def string(self, *args, **kwargs):
        """Add a string comparison built from the given arguments.

        Convenience wrapper around
        :class:`recordlinkage.compare.String`: all positional and keyword
        arguments are handed to ``String`` unchanged, and the resulting
        object is registered through :meth:`add`.  Equivalent to::

            from recordlinkage.compare import String

            comparer = recordlinkage.Compare()
            comparer.add(String(*args, **kwargs))

        Returns ``self`` so that calls can be chained.
        """
        self.add(String(*args, **kwargs))
        return self
Пример #4
0
# Report the size of the inputs.
print(len(dfA), 'records in dataset A')
print(len(true_links), 'links in dataset A')

# Indexing: register one Block rule per key, then build the candidate
# pairs from dfA alone.
print('Build index...')
indexer = rl.Index()
for blocking_key in ('given_name', 'surname', 'soc_sec_id'):
    indexer.add(Block(blocking_key))
candidate_links = indexer.index(dfA)

# Comparing: the comparisons are added in the order listed here.
print('Start comparing...')
comparer = rl.Compare()
comparisons = [
    Exact('given_name', 'given_name', label='given_name'),
    String('surname', 'surname', method='jarowinkler',
           threshold=0.85, label='surname'),
    Exact('date_of_birth', 'date_of_birth', label='date_of_birth'),
    Exact('suburb', 'suburb', label='suburb'),
    Exact('state', 'state', label='state'),
    String('address_1', 'address_1', threshold=0.85, label='address_1'),
    String('address_2', 'address_2', threshold=0.85, label='address_2'),
]
for comparison in comparisons:
    comparer.add(comparison)
features = comparer.compute(candidate_links, dfA)

print('feature shape', features.shape)

# Parameters for the Logistic Regression Classifier; this classifier is
# equivalent to the deterministic record linkage approach.
# Seven coefficients -- presumably one per comparison above, in the order
# they were added; confirm where the classifier is configured.
intercept = -9.5
coefficients = [2.0, 3.0, 7.0, 6.0, 2.5, 5.0, 5.5]
Пример #5
0
        'Jaro', 'Jaro-Winkler', 'Levenshtein', 'Damerau-Levenshtein',
        'Smith-Waterman', 'LCS'
    ]

    i = 0
    for method in methods:
        print(method)
        thresholds = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        precisions = []
        recalls = []
        fmeasures = []
        times = []
        for threshold in thresholds:
            compare = rl.Compare([
                String('Security',
                       'Company',
                       method=method,
                       label="Company_score"),
                String('GICS Sector',
                       'Sector',
                       method=method,
                       label="Sector_score")
            ])
            start = time.time()
            features = compare.compute(candidates, sp500_pre, forbes_pre)
            end = time.time()
            times.append(end - start)
            matches = merge_results(features, forbes, sp500, threshold)
            precisions.append(
                metrics.precision_score(true['Score'], matches['Score']))
            recalls.append(
                metrics.recall_score(true['Score'], matches['Score']))