def test_datasets_existance(self):
    """Smoke-test the four FEBRL dataset loaders.

    Each loader is called once with its default arguments. Nothing is
    asserted about the returned data; the test passes as long as no
    loader raises.
    """
    for loader in (load_febrl1, load_febrl2, load_febrl3, load_febrl4):
        loader()
def test_febrl3(self):
    """Check the types and sizes of what ``load_febrl3`` returns.

    The loader is exercised twice: once with its default arguments and
    once with ``return_links=True``.
    """
    # Default call: the result itself is expected to be the records frame.
    records = load_febrl3()
    self.assertIsInstance(records, pandas.DataFrame)
    self.assertEqual(len(records), 5000)

    # With return_links=True the result is unpacked as (records, links).
    result = load_febrl3(return_links=True)
    records, true_links = result
    self.assertIsInstance(records, pandas.DataFrame)
    self.assertEqual(len(records), 5000)
    self.assertIsInstance(true_links, pandas.MultiIndex)
'''
# NOTE(review): the quotes above presumably close a module docstring that
# opens before this chunk -- confirm against the full file.

from __future__ import print_function

import recordlinkage as rl
from recordlinkage.index import Block
from recordlinkage.compare import Exact, String
from recordlinkage.datasets import load_febrl3

# Set recordlinkage's logging verbosity to INFO for this script.
rl.logging.set_verbosity(rl.logging.INFO)

# Load the FEBRL3 records together with the known links (return_links=True)
# and report how many of each were loaded.
print('Loading data...')
dfA, true_links = load_febrl3(return_links=True)
print(len(dfA), 'records in dataset A')
print(len(true_links), 'links in dataset A')

# Build the candidate record pairs. Three Block indexers are registered, one
# each for given_name, surname and soc_sec_id. Only dfA is passed to index(),
# so the pairs are presumably drawn from within that single frame
# (deduplication) -- confirm against the recordlinkage documentation.
print('Build index...')
indexer = rl.Index()
indexer.add(Block('given_name'))
indexer.add(Block('surname'))
indexer.add(Block('soc_sec_id'))
candidate_links = indexer.index(dfA)

# Configure the comparison step, starting with an Exact comparison of
# given_name against given_name, labelled 'given_name'.
print('Start comparing...')
comparer = rl.Compare()
comparer.add(Exact('given_name', 'given_name', label='given_name'))