def test_binarize_input(self):
    """Fit and predict with ``binarize=True`` on non-binary comparison data.

    The simulated binary vectors are multiplied element-wise by uniform
    noise, so the classifier receives continuous values instead of 0/1.
    The test passes when ``fit`` and ``predict`` complete without raising.
    """
    m = np.array([1, .81, .85, .81, .85, .81])
    u = np.array([1, .23, .50, .23, .30, 0.13])

    # A seeded, local generator keeps the noise reproducible between runs
    # and leaves the global NumPy random state untouched.
    rng = np.random.RandomState(535)

    # Create the train dataset.
    X_train, _ = binary_vectors(
        1000, 500, m=m, u=u, random_state=535, return_links=True)
    X_train = X_train * rng.rand(*X_train.shape)

    # Create the test dataset.
    X_test, _ = binary_vectors(
        1000, 500, m=m, u=u, random_state=535, return_links=True)
    X_test = X_test * rng.rand(*X_test.shape)

    ecm = rl.ECMClassifier(binarize=True)
    ecm.fit(X_train)
    ecm.predict(X_test)
def test_ecm_atol_none(self):
    """Check the ECM estimates for a never-agreeing column with ``atol=None``.

    The first column is simulated with ``u = 0``; the fitted ``u_probs``
    for ``'c_1'`` are expected to be ~0 for level 1 and ~1 for level 0.
    """
    match_probs = np.array([0.95, .81, .85, .81, .85, .81])
    nonmatch_probs = np.array([0, .23, .50, .23, .30, 0.13])
    simulation = dict(
        m=match_probs, u=nonmatch_probs, random_state=535, return_links=True)

    # Training data: 10000 pairs, 500 of them matches.
    X_train, _ = binary_vectors(10000, 500, **simulation)

    # Test data: 1000 pairs, 500 of them matches.
    X_test, _ = binary_vectors(1000, 500, **simulation)

    classifier = rl.ECMClassifier(atol=None)
    classifier.fit(X_train)
    classifier.predict(X_test)

    assert math.isclose(classifier.u_probs['c_1'][1], 0.0, abs_tol=1e-3)
    assert math.isclose(classifier.u_probs['c_1'][0], 1.0, abs_tol=1e-3)
def render_bin_test_data(cls,
                         n_pairs_train=5000,
                         n_matches_train=1000,
                         n_pairs_test=50000,
                         n_matches_test=10000):
    """Attach simulated binary train/test comparison data to ``cls``.

    Sets ``cls.m``, ``cls.u``, ``cls.labels``, ``cls.X_train``,
    ``cls.y_train``, ``cls.X_test`` and ``cls.y_test``. Both feature
    frames get ``cls.labels`` as their column names.

    Parameters
    ----------
    cls : object
        Object that receives the attributes (presumably a test class;
        the decorator/caller is not visible here).
    n_pairs_train, n_matches_train : int
        Number of record pairs and true matches in the train set.
    n_pairs_test, n_matches_test : int
        Number of record pairs and true matches in the test set.
    """
    cls.m = np.array([.92, .81, .85, .90, .99, .70, .56])
    cls.u = np.array([.19, .23, .50, .11, .20, .14, .50])

    cls.labels = [
        'name', 'second_name', 'surname', 'dob', 'street', 'state', 'zipcode'
    ]

    # Create the train dataset.
    cls.X_train, cls.y_train = binary_vectors(
        n_pairs_train,
        n_matches_train,
        m=cls.m,
        u=cls.u,
        random_state=535,
        return_links=True)
    cls.X_train.columns = cls.labels

    # Create the test dataset.
    cls.X_test, cls.y_test = binary_vectors(
        n_pairs_test,
        n_matches_test,
        m=cls.m,
        u=cls.u,
        random_state=535,
        return_links=True)
    # Label the feature frame (not the returned links) so the test columns
    # match the train columns.
    cls.X_test.columns = cls.labels
def test_ecm_init_jaro_1value(self):
    """ECM with ``init='jaro'`` when one column is simulated with m = u = 1.

    For column ``'c_1'`` the test expects no entry for level 0 (a
    ``KeyError``) and estimates close to 1.0 for level 1. The estimates
    for ``'c_2'`` and the match prevalence ``p`` are checked against the
    simulation settings.
    """
    match_probs = np.array([1.0, 0.85, .85, .81, .85, .81])
    nonmatch_probs = np.array([1.0, .10, .50, .23, .30, 0.13])

    # Simulate 1000 pairs of which 500 are matches.
    X_train, _ = binary_vectors(
        1000,
        500,
        m=match_probs,
        u=nonmatch_probs,
        random_state=535,
        return_links=True)

    classifier = rl.ECMClassifier(init='jaro')
    classifier.fit(X_train)
    classifier.predict(X_train)

    # Level 0 is expected to be absent for 'c_1'.
    with pytest.raises(KeyError):
        classifier.m_probs['c_1'][0]

    assert math.isclose(classifier.m_probs['c_1'][1], 1.0, abs_tol=0.01)
    assert math.isclose(classifier.m_probs['c_2'][1], 0.85, abs_tol=0.08)
    assert math.isclose(classifier.u_probs['c_1'][1], 1.0, abs_tol=0.01)
    assert math.isclose(classifier.u_probs['c_2'][1], 0.1, abs_tol=0.05)
    assert math.isclose(classifier.p, 0.5, abs_tol=0.05)
def make_fake_data(n1, n2, pM, pML, pUL, randState=113):
    """Simulate labelled comparison vectors for ``n1 * n2`` pairs and save them.

    Parameters
    ----------
    n1, n2 : int
        Sizes used to enumerate the pairs; ``j`` cycles through
        ``range(n1)`` and ``i`` through ``range(n2)``.
    pM : float
        Fraction of the ``n1 * n2`` pairs that are matches.
    pML, pUL : sequence of float
        Passed to ``datasets.binary_vectors`` as ``m`` and ``u``.
    randState : int, optional
        Passed to ``datasets.binary_vectors`` as ``random_state``.

    Returns
    -------
    pandas.DataFrame
        Columns ``gamma`` (per-pair array of the ``c_1``..``c_3`` values),
        ``i``, ``j`` and ``match``, with a fresh integer index.

    Notes
    -----
    Side effect: writes the frame to ``Gamma_nMatch<k>_L<len(pML)>.csv``
    in the current working directory.
    """
    nPair = n1 * n2
    nMatch = int(pM * nPair)

    # Keep the (DataFrame, links) pair as returned: the label-based
    # operations below need the DataFrame, so the tuple must not be
    # wrapped in np.array.
    gamma, links = datasets.binary_vectors(
        nPair, nMatch, m=pML, u=pUL, random_state=randState,
        return_links=True)

    # Flag the true links.
    gamma['match'] = False
    gamma.loc[links, 'match'] = True

    # Pair identifiers: each i is repeated n1 times while j cycles.
    iVals = [i for i in range(n2) for _ in range(n1)]
    jVals = list(range(n1)) * n2

    # NOTE(review): only c_1..c_3 are stored, regardless of len(pML) --
    # confirm whether all comparison columns should be kept.
    Gamma = pd.DataFrame({
        'gamma': list(gamma[['c_1', 'c_2', 'c_3']].values),
        'i': iVals,
        'j': jVals,
        'match': gamma['match'],
    })
    Gamma = Gamma.reset_index(drop=True)

    Gamma.to_csv(f'Gamma_nMatch{nMatch}_L{len(pML)}.csv', mode='w')
    return Gamma
def test_sklearn_preinit(self):
    """A ``LabelBinarizer`` with preset ``classes_`` keeps both classes.

    ``classes_`` is assigned by hand (no ``fit`` call) and the binarizer
    then transforms the second simulated column; afterwards it must still
    report exactly two classes.
    """
    match_probs = np.array([1.0, .81, .85, .81, .85, .81])
    nonmatch_probs = np.array([1.0, .23, .50, .23, .30, 0.13])

    # Simulate 1000 pairs of which 500 are matches.
    X_train, _ = binary_vectors(
        1000,
        500,
        m=match_probs,
        u=nonmatch_probs,
        random_state=535,
        return_links=True)

    second_column = X_train.iloc[:, 1]

    label_binarizer = LabelBinarizer()
    label_binarizer.classes_ = np.array([0, 1])
    label_binarizer.transform(second_column)

    assert len(label_binarizer.classes_) == 2
def test_random_comparison_vectors(self):
    """``binary_vectors`` yields a MultiIndex-ed DataFrame of the right size."""
    n_record_pairs = 10000
    n_matches = 500
    n_columns = 8

    # Generate a random dataset with eight comparison columns.
    vectors = binary_vectors(
        n_record_pairs,
        n_matches,
        m=[0.8] * n_columns,
        u=[0.2] * n_columns,
        random_state=535)

    # The result must be a DataFrame indexed by a MultiIndex ...
    self.assertIsInstance(vectors, pandas.DataFrame)
    self.assertIsInstance(vectors.index, pandas.MultiIndex)

    # ... with one row per requested record pair.
    self.assertEqual(len(vectors), n_record_pairs)
def make_Gamma(n1, n2, pM, pML, pUL):
    """Simulate comparison vectors for ``n1 * n2`` pairs.

    Parameters
    ----------
    n1, n2 : int
        Sizes used to enumerate the pairs; ``j`` cycles through
        ``range(n1)`` and ``i`` through ``range(n2)``.
    pM : float
        Fraction of the ``n1 * n2`` pairs that are matches.
    pML, pUL : sequence of float
        Passed to ``datasets.binary_vectors`` as ``m`` and ``u``.

    Returns
    -------
    pandas.DataFrame
        Columns ``gamma`` (one list of comparison values per pair),
        ``i`` and ``j``.
    """
    nPair = n1 * n2

    # The seed is fixed at 113, so repeated calls give the same vectors.
    gamma = np.array(
        datasets.binary_vectors(
            nPair, int(pM * nPair), m=pML, u=pUL, random_state=113))

    # Pair identifiers: each i is repeated n1 times while j cycles.
    iVals = [i for i in range(n2) for _ in range(n1)]
    jVals = list(range(n1)) * n2

    return pd.DataFrame({
        'gamma': [list(row) for row in gamma],
        'i': iVals,
        'j': jVals,
    })
def test_random_comparison_vectors_1value_col():
    """Columns simulated with m = u = 1 or m = u = 0 hold a single value.

    Column 0 (probabilities 1) must contain only 1, column 3
    (probabilities 0) only 0, and the two columns in between must each
    contain two distinct values.
    """
    m = numpy.array([1, .81, .85, 0])
    u = numpy.array([1, .23, .50, 0])

    # Simulate 1000 pairs of which 500 are matches.
    X_train, y_train = binary_vectors(
        1000, 500, m=m, u=u, random_state=535, return_links=True)

    always_one = X_train.iloc[:, 0].unique()
    assert len(always_one) == 1
    assert always_one[0] == 1

    always_zero = X_train.iloc[:, 3].unique()
    assert len(always_zero) == 1
    assert always_zero[0] == 0

    for position in (1, 2):
        assert len(X_train.iloc[:, position].unique()) == 2
def test_fs_column_labels(self, classifier):
    """Fitted probability tables are keyed by the training column labels.

    Parameters
    ----------
    classifier : callable
        Zero-argument factory returning the classifier under test.
        Instances of ``UNSUPERVISED_CLASSIFIERS`` are fitted without
        labels; all others are fitted with the true links.
    """
    m = np.array([0.95, .81, .85, .81, .85, .81])
    u = np.array([0, .23, .50, .23, .30, 0.13])

    # Create the train dataset.
    X_train, true_links = binary_vectors(
        1000, 500, m=m, u=u, random_state=535, return_links=True)

    cl = classifier()
    if isinstance(cl, tuple(UNSUPERVISED_CLASSIFIERS)):
        cl.fit(X_train)
    else:
        cl.fit(X_train, true_links)

    expected_labels = set(X_train)

    # Check all four tables, including log_u_probs.
    assert set(cl.m_probs) == expected_labels
    assert set(cl.u_probs) == expected_labels
    assert set(cl.log_m_probs) == expected_labels
    assert set(cl.log_u_probs) == expected_labels
def test_ecm_init(self):
    """ECM with ``init='random'`` recovers the second column's m probability.

    The fitted parameter tables are printed, then the level-1 estimate
    for ``'c_2'`` is compared with the simulated value 0.85.
    """
    match_probs = np.array([0.23, .81, .85, .81, .85, .81])
    nonmatch_probs = np.array([0.34, .23, .50, .23, .30, 0.13])

    # Simulate 1000 pairs of which 500 are matches.
    X_train, _ = binary_vectors(
        1000,
        500,
        m=match_probs,
        u=nonmatch_probs,
        random_state=535,
        return_links=True)

    classifier = rl.ECMClassifier(init='random')
    classifier.fit(X_train)
    classifier.predict(X_train)

    # Print the fitted tables one at a time, in the original order.
    for table in ('m_probs', 'log_m_probs', 'u_probs', 'log_u_probs'):
        print(getattr(classifier, table))

    assert math.isclose(classifier.m_probs['c_2'][1], 0.85, abs_tol=0.08)
import numpy as np

import recordlinkage as rl
from recordlinkage.datasets import binary_vectors

# Simulation settings: 50000 candidate links, 7000 of them true links,
# with one m and one u probability per comparison column.
n_pairs = 50000
n_matches = 7000
m_simulate = np.array([.94, .81, .85, .90, .99, .70, .56, .92])
u_simulate = np.array([.19, .23, .50, .11, .20, .14, .50, .09])

# Simulate the comparison vectors; return_links=True also returns the
# true links, which serve as training labels below.
X_data, links_true = binary_vectors(
    n_pairs,
    n_matches,
    m=m_simulate,
    u=u_simulate,
    random_state=535,  # fixed seed
    return_links=True)

# Train a supervised NaiveBayesClassifier on the labelled data.
cl = rl.NaiveBayesClassifier()
cl.fit(X_data, links_true)

# Report the trained parameters (p, m and u, plus the log tables).
# Each attribute is read right before it is printed.
for caption, attribute in (
        ("p probability P(Match):", 'p'),
        ("m probabilities P(x_i=1|Match):", 'm_probs'),
        ("u probabilities P(x_i=1|Non-Match):", 'u_probs'),
        ("log m probabilities P(x_i=1|Match):", 'log_m_probs'),
        ("log u probabilities P(x_i=1|Non-Match):", 'log_u_probs'),
):
    print(caption, getattr(cl, attribute))