예제 #1
0
    def test_datasets_existance(self):
        """Smoke test: call every FEBRL loader once.

        No return value is inspected; the test passes as long as none of
        the loader calls raises.
        """
        febrl_loaders = (load_febrl1, load_febrl2, load_febrl3, load_febrl4)

        # Invoke the loaders in order, discarding whatever they return.
        for load_dataset in febrl_loaders:
            load_dataset()
예제 #2
0
    def test_febrl3(self):
        """Check ``load_febrl3`` with and without ``return_links=True``.

        Both call forms must produce a 5000-row ``pandas.DataFrame``; the
        ``return_links=True`` form must additionally produce a
        ``pandas.MultiIndex`` as its second item.
        """
        expected_rows = 5000
        frame_type = pandas.DataFrame

        # Plain call: the result itself is checked as the records frame.
        records = load_febrl3()
        self.assertIsInstance(records, frame_type)
        self.assertEqual(len(records), expected_rows)

        # Call with return_links=True: the result unpacks into two items.
        loaded = load_febrl3(return_links=True)
        records, known_links = loaded
        self.assertIsInstance(records, frame_type)
        self.assertEqual(len(records), expected_rows)
        self.assertIsInstance(known_links, pandas.MultiIndex)
예제 #3
0
'''

from __future__ import print_function

import recordlinkage as rl
from recordlinkage.index import Block
from recordlinkage.compare import Exact, String
from recordlinkage.datasets import load_febrl3

# Set the recordlinkage logging verbosity to INFO.
rl.logging.set_verbosity(rl.logging.INFO)

# Load the FEBRL3 data; with return_links=True the loader's result unpacks
# into the records (dfA) and the links (true_links).
print('Loading data...')
dfA, true_links = load_febrl3(return_links=True)
print(len(dfA), 'records in dataset A')
print(len(true_links), 'links in dataset A')

# Build the candidate links: register one Block rule per column
# (given_name, surname, soc_sec_id) and run the indexer on dfA alone.
# NOTE(review): presumably the rules are combined as a union and a single
# frame means deduplication within dfA -- confirm against the library docs.
print('Build index...')
indexer = rl.Index()
indexer.add(Block('given_name'))
indexer.add(Block('surname'))
indexer.add(Block('soc_sec_id'))
candidate_links = indexer.index(dfA)

# Set up the comparison step: create a Compare object and register an Exact
# comparison of 'given_name' against 'given_name', labelled 'given_name'.
print('Start comparing...')
comparer = rl.Compare()
comparer.add(Exact('given_name', 'given_name', label='given_name'))