예제 #1
0
    def time_random_index(self):
        """Construct a random indexer and run it once on ``self.A``.

        The resulting pairs are discarded; only the work of building the
        indexer and calling ``index`` is exercised.
        """
        # NOTE(review): 2500 is passed positionally; presumably the number of
        # pairs to draw (other call sites use ``n=``) -- confirm.
        indexer = rl.RandomIndex(2500)
        indexer.index(self.A)
예제 #2
0
    def test_random_seed(self):
        """Random: test seeding random algorithm

        Indexers created with the same ``random_state`` are expected to
        return equal pairs; an indexer created with a different
        ``random_state`` is expected to return different pairs.
        """
        frames = (self.a, self.b)

        # seeds 100, 100 and 101: only the first two indexers share a seed
        same_seed_1, same_seed_2, other_seed = (
            recordlinkage.RandomIndex(n=1000, random_state=seed)
            for seed in (100, 100, 101)
        )

        pairs_same_1 = same_seed_1.index(frames)
        pairs_same_2 = same_seed_2.index(frames)
        pairs_other = other_seed.index(frames)

        # identical seeds must give identical pairs
        ptm.assert_index_equal(pairs_same_1, pairs_same_2)

        # different seeds must not give identical pairs (compared through
        # numpy arrays as a workaround)
        differs = not np.array_equal(pairs_same_1.values, pairs_other.values)
        self.assertFalse(not differs)
예제 #3
0
    def test_random_with_replace(self):
        """Random: test random indexing with replacement

        For both a linking input (two frames) and a dedup input (one frame)
        the test expects exactly 1000 pairs, of which at least one is
        repeated.
        """
        scenarios = [
            ((self.a, self.b), 100),  # situation 1: linking
            (self.a, 101),            # situation 2: dedup
        ]

        for data, seed in scenarios:
            indexer = recordlinkage.RandomIndex(
                n=1000, replace=True, random_state=seed)

            pairs = indexer.index(data)
            self.assertEqual(len(pairs), 1000)
            self.assertFalse(pairs.is_unique)
예제 #4
0
import tempfile
import shutil
import pickle

import numpy as np
import pandas as pd
import pandas.util.testing as ptm
from parameterized import parameterized, param

import recordlinkage

# Indexer instances used by the parameterized tests; each instance is wrapped
# in ``param`` for use with ``parameterized.expand``.
TEST_INDEXATION_OBJECTS = list(map(param, (
    recordlinkage.FullIndex(),
    recordlinkage.BlockIndex(on='var_arange'),
    recordlinkage.SortedNeighbourhoodIndex(on='var_arange'),
    recordlinkage.RandomIndex(10, random_state=100, replace=True),
    recordlinkage.RandomIndex(10, random_state=100, replace=False),
)))


class TestData(unittest.TestCase):
    """Unittest object to setup test data.

    Base ``TestCase`` whose ``setUpClass`` stores fixtures on the class so
    that deriving test classes can use them.
    """
    @classmethod
    def setUpClass(cls):
        """Create the record labels stored on the class as fixtures."""

        # number of labels generated for each of the two sets
        n_a = 100
        n_b = 150

        # labels 'rec_a_0' .. 'rec_a_99' and 'rec_b_0' .. 'rec_b_149'
        cls.index_a = ['rec_a_%s' % i for i in range(0, n_a)]
        cls.index_b = ['rec_b_%s' % i for i in range(0, n_b)]
예제 #5
0
class TestIndexApi(TestData):
    """General unittest for the indexing API."""

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_repr(self, index_class):
        """Check that ``str`` and ``repr`` agree and start with '<ClassName'."""
        as_str, as_repr = str(index_class), repr(index_class)
        self.assertEqual(as_str, as_repr)

        # the textual form is expected to open with '<' + the class name
        expected_prefix = '<%s' % index_class.__class__.__name__
        self.assertTrue(as_str.startswith(expected_prefix))

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_arguments(self, index_class):
        """Test the index method arguments

        Every call form below is expected to be accepted without raising;
        the returned pairs are not inspected.
        """
        build = index_class.index
        frame_a, frame_b = self.a, self.b

        build(frame_a)
        build(frame_a, frame_b)
        # NOTE: ``(frame_a)`` is only a parenthesised expression, not a
        # 1-tuple, so this call is the same as ``build(frame_a)``.
        build((frame_a))
        build([frame_a])
        build((frame_a, frame_b))
        build([frame_a, frame_b])
        build(x=(frame_a, frame_b))

    def test_iterative(self):
        """Test the iterative behaviour.

        Indexing ``self.a`` in two slices and appending the results is
        expected to give the same pairs as indexing it in one step.
        """

        def as_sorted_frame(multi_index):
            # It is not possible to sort the MultiIndex directly here, so a
            # frame is built on it and the frame is sorted instead.
            return pd.DataFrame(index=multi_index).sort_index()

        # single step
        single_step = recordlinkage.FullIndex().index((self.a, self.b))
        expected = as_sorted_frame(single_step)

        # multi step: one indexer, two consecutive slices of ``self.a``
        chunked_indexer = recordlinkage.FullIndex()
        first_part = chunked_indexer.index((self.a[0:50], self.b))
        second_part = chunked_indexer.index((self.a[50:100], self.b))
        combined = as_sorted_frame(first_part.append(second_part))

        ptm.assert_frame_equal(expected, combined)

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_empty_imput_dataframes(self, index_class):
        """Empty DataFrames

        ``RandomIndex`` is expected to raise ``ValueError`` on empty input;
        every other indexer is expected to return an empty ``MultiIndex``.
        """
        # empty frames that keep the column layout of self.a and self.b
        empty_a = pd.DataFrame(columns=self.a.columns.tolist())
        empty_b = pd.DataFrame(columns=self.b.columns.tolist())

        if isinstance(index_class, recordlinkage.RandomIndex):
            with self.assertRaises(ValueError):
                index_class.index((empty_a, empty_b))
            return

        pairs = index_class.index((empty_a, empty_b))

        self.assertIsInstance(pairs, pd.MultiIndex)
        self.assertEqual(len(pairs), 0)

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_error_handling(self, index_class):
        """Test error handling on non-unique index."""
        first_label, second_label = self.a.index[0], self.a.index[1]

        # relabel the second record with the first record's label so that
        # the frame's index contains a duplicate
        duplicated = self.a.rename(
            index={second_label: first_label}, inplace=False)

        with self.assertRaises(ValueError):
            index_class.index(duplicated)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_index_names_dedup(self, index_class):
        """Check the level names of the pairs when deduplicating one frame.

        Each case pairs the name given to the input index with the level
        names the test expects on the resulting pairs.
        """
        cases = [
            ('dedup', ['dedup_1', 'dedup_2']),
            (None, [None, None]),
            ('index', ['index_1', 'index_2']),
            (1, ['1_1', '1_2']),
        ]

        for name, expected_names in cases:
            renamed_index = pd.Index(self.a.index).rename(name)
            frame = pd.DataFrame(self.a, index=renamed_index)

            pairs = index_class.index(frame)

            self.assertEqual(pairs.names, expected_names)
            # the name of the input index must be left untouched
            self.assertEqual(frame.index.name, name)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_duplicated_index_names_dedup(self, index_class):
        """Check pair level names for one frame whose index is named 'index'.

        The names are checked once before and once after assigning custom
        ``suffixes`` on the indexer.
        """
        named_index = pd.Index(self.a.index, name='index')
        frame = pd.DataFrame(self.a, index=named_index)

        def assert_pair_names(expected_names):
            pairs = index_class.index(frame)
            self.assertEqual(pairs.names, expected_names)
            # indexing must not rename the input index in place
            self.assertEqual(frame.index.name, 'index')

        # suffixes as configured on the incoming indexer
        assert_pair_names(['index_1', 'index_2'])

        # custom suffixes assigned on the indexer
        index_class.suffixes = ['_a', '_b']
        assert_pair_names(['index_a', 'index_b'])

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_index_names_link(self, index_class):
        """Check the level names of the pairs when linking two frames.

        For every combination below the pairs are expected to carry the
        names of the two input indexes, in order.
        """
        # (name of the first index, name of the second index)
        name_combinations = (
            ('index1', 'index2'),
            ('index1', None),
            (None, 'index2'),
            (None, None),
            (10, 'index2'),
            (10, 11),
        )

        for name_a, name_b in name_combinations:
            # rebuild both frames on an index carrying the requested name
            frame_a = pd.DataFrame(
                self.a, index=pd.Index(self.a.index, name=name_a))
            frame_b = pd.DataFrame(
                self.b, index=pd.Index(self.b.index, name=name_b))

            pairs = index_class.index((frame_a, frame_b))
            self.assertEqual(pairs.names, [name_a, name_b])

            # indexing must not rename the input indexes in place
            self.assertEqual(frame_a.index.name, name_a)
            self.assertEqual(frame_b.index.name, name_b)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_duplicated_index_names_link(self, index_class):
        """Check pair level names when both input indexes are named 'index'.

        The names are checked once before and once after assigning custom
        ``suffixes`` on the indexer.
        """
        frame_a = pd.DataFrame(
            self.a, index=pd.Index(self.a.index, name='index'))
        frame_b = pd.DataFrame(
            self.b, index=pd.Index(self.b.index, name='index'))

        def assert_pair_names(expected_names):
            pairs = index_class.index((frame_a, frame_b))
            self.assertEqual(pairs.names, expected_names)
            # indexing must not rename the input indexes in place
            self.assertEqual(frame_a.index.name, 'index')
            self.assertEqual(frame_b.index.name, 'index')

        # suffixes as configured on the incoming indexer
        assert_pair_names(['index_1', 'index_2'])

        # custom suffixes assigned on the indexer
        index_class.suffixes = ['_a', '_b']
        assert_pair_names(['index_a', 'index_b'])

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_pickle(self, index_class):
        """Test if it is possible to pickle the class.

        The indexer is pickled twice to the same file: once before and once
        after it has been used to compute record pairs. The test passes
        when neither dump raises.
        """
        # NOTE(review): ``self.test_dir`` is not set in the visible code;
        # presumably a temporary directory created in ``setUp`` -- confirm.
        pickle_path = os.path.join(self.test_dir, 'pickle_compare_obj.pickle')

        # pickle before indexing; the ``with`` block closes (and flushes)
        # the file instead of leaking the handle
        with open(pickle_path, 'wb') as handle:
            pickle.dump(index_class, handle)

        # compute the record pairs
        index_class.index(self.a, self.b)

        # pickle after indexing, overwriting the first dump
        with open(pickle_path, 'wb') as handle:
            pickle.dump(index_class, handle)