def test_n_grams_ignore_case_to_lower(self):
    """n_grams() is expected to emit lower-cased 3-character grams when
    ignore_case=True is passed explicitly."""
    # Placeholder series: n_grams() is exercised directly on the literal below.
    grouper = StringGrouper(pd.Series(['aa']), ignore_case=True)
    actual = grouper.n_grams('McDonalds')
    self.assertListEqual(
        ['mcd', 'cdo', 'don', 'ona', 'nal', 'ald', 'lds'],
        actual,
    )
def test_get_matches_raises_exception_if_unexpected_options_given(self):
    """The StringGrouper constructor is expected to raise for every
    id-series combination listed below."""
    strings_a = pd.Series(['foo', 'bar', 'baz'])
    strings_b = pd.Series(['foo', 'bar', 'bop'])

    # Id series that are shorter than the string series they accompany.
    short_ids_a = pd.Series(['A0', 'A1'])
    short_ids_b = pd.Series(['B0', 'B1'])
    # Id series whose length agrees with the string series.
    full_ids_a = pd.Series(['A0', 'A1', 'A2'])
    full_ids_b = pd.Series(['B0', 'B1', 'B2'])

    # Each entry is (positional args, keyword args) for one constructor call.
    rejected_calls = [
        # --- id data does not correspond with its string data ---
        ((strings_a,), {'master_id': short_ids_a}),
        ((strings_a,), {'duplicates': strings_b,
                        'duplicates_id': short_ids_b,
                        'master_id': short_ids_a}),
        ((strings_a,), {'duplicates': strings_b, 'master_id': short_ids_a}),
        ((strings_a, strings_b), {'duplicates_id': short_ids_b}),
        ((strings_a,), {'duplicates_id': short_ids_b}),
        ((strings_a,), {'duplicates_id': short_ids_b,
                        'master_id': short_ids_a}),
        # --- data is well-formed but the option combination is invalid ---
        ((strings_a, strings_b), {'master_id': full_ids_a}),
        ((strings_a, strings_b), {'duplicates_id': full_ids_b}),
        ((strings_a,), {'duplicates_id': full_ids_b}),
        ((strings_a,), {'master_id': full_ids_a,
                        'duplicates_id': full_ids_b}),
    ]
    for positional, keywords in rejected_calls:
        with self.assertRaises(Exception):
            StringGrouper(*positional, **keywords)
def test_n_grams_ignore_case_to_lower_with_defaults(self):
    """n_grams() is expected to emit lower-cased 3-character grams when
    ignore_case is not passed at all (default behaviour)."""
    # Placeholder series: n_grams() is exercised directly on the literal below.
    grouper = StringGrouper(pd.Series(['aa']))
    actual = grouper.n_grams('McDonalds')
    self.assertListEqual(
        ['mcd', 'cdo', 'don', 'ona', 'nal', 'ald', 'lds'],
        actual,
    )
def test_get_groups_raises_exception(self):
    """get_groups() on a grouper that has not been fit is expected to
    raise StringGrouperNotFitException."""
    master = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
    duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
    # Deliberately no .fit() call here.
    unfitted = StringGrouper(master, duplicates)
    self.assertRaises(StringGrouperNotFitException, unfitted.get_groups)
def test_n_grams_case_unchanged(self):
    """n_grams() is expected to keep the original casing when
    ignore_case=False is passed explicitly."""
    # Placeholder series: n_grams() is exercised directly on the literal below.
    grouper = StringGrouper(pd.Series(['aa']), ignore_case=False)
    actual = grouper.n_grams('McDonalds')
    self.assertListEqual(
        ['McD', 'cDo', 'Don', 'ona', 'nal', 'ald', 'lds'],
        actual,
    )
def test_get_matches_2_series_2_id_series(self):
    """get_matches() with master and duplicate strings plus both id series
    is expected to return index-, string- and id-labelled pairs."""
    test_series_1 = pd.Series(['foo', 'bar', 'baz'])
    test_series_id_1 = pd.Series(['A0', 'A1', 'A2'])
    test_series_2 = pd.Series(['foo', 'bar', 'bop'])
    test_series_id_2 = pd.Series(['B0', 'B1', 'B2'])
    sg = StringGrouper(test_series_1,
                       test_series_2,
                       duplicates_id=test_series_id_2,
                       master_id=test_series_id_1).fit()
    expected_df = pd.DataFrame({
        'left_index': [0, 1],
        'left_side': ['foo', 'bar'],
        'left_id': ['A0', 'A1'],
        'similarity': [1.0, 1.0],
        'right_id': ['B0', 'B1'],
        'right_side': ['foo', 'bar'],
        'right_index': [0, 1],
    })
    # Assign the whole column rather than going through ``.loc[:, col] = ...``:
    # on pandas >= 2.0 the ``.loc`` form writes into the existing float64
    # array in place, so the cast to the configured dtype would be lost.
    expected_df['similarity'] = expected_df['similarity'].astype(
        sg._config.tfidf_matrix_dtype)
    pd.testing.assert_frame_equal(expected_df, sg.get_matches())
def test_build_matrix(self):
    """With only a master series, _get_tf_idf_matrices() is expected to
    return the same matrix for both the master and the duplicate side."""
    grouper = StringGrouper(pd.Series(['foo', 'bar', 'baz']))
    master_matrix, dupe_matrix = grouper._get_tf_idf_matrices()
    expected = csr_matrix([[0., 0., 1.],
                           [1., 0., 0.],
                           [0., 1., 0.]])
    for actual in (master_matrix, dupe_matrix):
        np.testing.assert_array_equal(expected.toarray(), actual.toarray())
def test_string_grouper_type_error(self):
    """StringGrouper is expected to raise TypeError when master or
    duplicates is not a Series of strings."""
    invalid_inputs = [
        # Plain strings instead of Series.
        ('foo', 'bar'),
        # A non-string element in the duplicates series.
        (pd.Series(['foo', 'bar']), pd.Series(['foo', 1])),
        # A NaN element in the master series.
        (pd.Series(['foo', np.nan]), pd.Series(['foo', 'j'])),
    ]
    for master, duplicates in invalid_inputs:
        with self.assertRaises(TypeError):
            StringGrouper(master, duplicates)
def test_get_matches_two_dataframes(self):
    """get_matches() on a master/duplicates pair is expected to list the
    two identical strings with similarity 1.0."""
    master = pd.Series(['foo', 'bar', 'baz'])
    duplicates = pd.Series(['foo', 'bar', 'bop'])
    grouper = StringGrouper(master, duplicates).fit()
    expected = pd.DataFrame({
        'left_side': ['foo', 'bar'],
        'right_side': ['foo', 'bar'],
        'similarity': [1.0, 1.0],
    })
    pd.testing.assert_frame_equal(expected, grouper.get_matches())
def test_get_groups_single_df(self):
    """get_groups() on a single series is expected to return a Series of
    the same length in which 'foooob' is replaced by 'foooo'."""
    grouper = StringGrouper(
        pd.Series(['foooo', 'bar', 'baz', 'foooob'])).fit()
    expected = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
    pd.testing.assert_series_equal(expected, grouper.get_groups())
def test_add_match_multiple_occurences(self):
    """add_match() is expected to add one match per exact duplicate of the
    master string ('foooo' occurs twice in the master series)."""
    master = pd.Series(['foooo', 'no match', 'baz', 'foooo'])
    duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
    grouper = StringGrouper(master, duplicates).fit()
    grouper.add_match('foooo', 'baz')

    all_matches = grouper.get_matches()
    is_added_pair = ((all_matches['left_side'] == 'foooo')
                     & (all_matches['right_side'] == 'baz'))
    self.assertEqual(2, len(all_matches[is_added_pair]))
def test_get_groups_two_df(self):
    """get_groups() with a master and a duplicates series is expected to
    return a Series with the length of the duplicates, holding for each
    duplicate the master string it matches with the highest similarity."""
    test_series_1 = pd.Series(['foooo', 'bar', 'baz'])
    test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
    sg = StringGrouper(test_series_1, test_series_2)
    sg = sg.fit()
    result = sg.get_groups()
    expected_result = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
    # The original had the identifier split as ``expect ed_result``, which
    # is a SyntaxError and prevented the whole module from importing.
    pd.testing.assert_series_equal(expected_result, result)
def test_get_non_matches_empty_case(self):
    """_get_non_matches_list() is expected to return an empty DataFrame
    when every pair of strings matches."""
    example = SimpleExample()
    master = example.a_few_strings
    # min_similarity=0 together with max_n_matches=len(master) is intended
    # to let every master string count as a match.
    grouper = StringGrouper(master,
                            example.one_string,
                            max_n_matches=len(master),
                            min_similarity=0).fit()
    non_matches = grouper._get_non_matches_list()
    self.assertTrue(non_matches.empty)
def test_get_groups_1_string_series_1_id_series(self):
    """get_groups() with one string series and its id series is expected
    to return a two-column frame of (id, string) group representatives."""
    strings = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
    ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
    grouper = StringGrouper(strings, master_id=ids).fit()
    expected = pd.DataFrame([
        ('A0', 'foooo'),
        ('A1', 'bar'),
        ('A2', 'baz'),
        ('A0', 'foooo'),
    ])
    pd.testing.assert_frame_equal(expected, grouper.get_groups())
def test_add_match_single_group_matches_symmetric(self):
    """A match added to a master-only grouper is expected to show up once
    in each direction."""
    grouper = StringGrouper(
        pd.Series(['foooo', 'no match', 'baz', 'foooo'])).fit()
    grouper.add_match('no match', 'baz')
    all_matches = grouper.get_matches()

    # Check the pair as added, then its mirror image.
    for left, right in (('no match', 'baz'), ('baz', 'no match')):
        selected = all_matches[(all_matches['left_side'] == left)
                               & (all_matches['right_side'] == right)]
        self.assertEqual(1, len(selected))
def test_get_groups_two_df_same_similarity(self):
    """get_groups() is expected to return a Series with the length of the
    duplicates even when the master contains the same string twice."""
    master = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
    duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
    grouper = StringGrouper(master, duplicates).fit()
    expected = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
    pd.testing.assert_series_equal(expected, grouper.get_groups())
def test_get_groups_two_df_no_match(self):
    """get_groups() is expected to return the duplicate itself ('dooz')
    when no master string matches it."""
    master = pd.Series(['foooo', 'bar', 'baz'])
    duplicates = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooob'])
    grouper = StringGrouper(master, duplicates).fit()
    expected = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooo'])
    pd.testing.assert_series_equal(expected, grouper.get_groups())
def test_build_matches(self):
    """_build_matches() is expected to produce the similarity matrix of
    the master and duplicate tf-idf matrices."""
    master = pd.Series(['foo', 'bar', 'baz'])
    duplicates = pd.Series(['foo', 'bar', 'bop'])
    grouper = StringGrouper(master, duplicates)
    master_matrix, dupe_matrix = grouper._get_tf_idf_matrices()

    # Only 'foo' and 'bar' are shared, so only the first two diagonal
    # entries are non-zero.
    expected = np.array([[1., 0., 0.],
                         [0., 1., 0.],
                         [0., 0., 0.]])
    actual = grouper._build_matches(master_matrix, dupe_matrix).toarray()
    np.testing.assert_array_equal(expected, actual)
def test_build_matches_list(self):
    """After fit(), _matches_list is expected to hold the positional
    (master_side, dupe_side, similarity) rows of the matching pairs."""
    master = pd.Series(['foo', 'bar', 'baz'])
    duplicates = pd.Series(['foo', 'bar', 'bop'])
    grouper = StringGrouper(master, duplicates).fit()
    expected = pd.DataFrame({
        'master_side': [0, 1],
        'dupe_side': [0, 1],
        'similarity': [1.0, 1.0],
    })
    pd.testing.assert_frame_equal(expected, grouper._matches_list)
def test_get_groups_4_df_same_similarity(self):
    """get_groups() with strings and ids on both sides is expected to map
    each duplicate to the first ('A0') of two identical master strings."""
    master = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
    duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
    master_ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
    duplicate_ids = pd.Series(['B0', 'B1', 'B2', 'B3'])
    grouper = StringGrouper(master,
                            duplicates,
                            master_id=master_ids,
                            duplicates_id=duplicate_ids).fit()
    expected = pd.DataFrame([
        ('A0', 'foooo'),
        ('A1', 'bar'),
        ('A2', 'baz'),
        ('A0', 'foooo'),
    ])
    pd.testing.assert_frame_equal(expected, grouper.get_groups())
def test_get_groups_4_df_no_match(self):
    """get_groups() with strings and ids on both sides is expected to
    return the duplicate's own id and string ('B1', 'dooz') when no
    master string matches it."""
    master = pd.Series(['foooo', 'bar', 'baz'])
    duplicates = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooob'])
    master_ids = pd.Series(['A0', 'A1', 'A2'])
    duplicate_ids = pd.Series(['B0', 'B1', 'B2', 'B3', 'B4'])
    grouper = StringGrouper(master,
                            duplicates,
                            master_id=master_ids,
                            duplicates_id=duplicate_ids).fit()
    expected = pd.DataFrame([
        ('A0', 'foooo'),
        ('B1', 'dooz'),
        ('A1', 'bar'),
        ('A2', 'baz'),
        ('A0', 'foooo'),
    ])
    pd.testing.assert_frame_equal(expected, grouper.get_groups())
def test_get_groups_2_string_series_2_id_series(self):
    """get_groups() with strings and ids on both sides is expected to
    return, per duplicate, the id and string of its matching master."""
    master = pd.Series(['foooo', 'bar', 'baz'])
    duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
    master_ids = pd.Series(['A0', 'A1', 'A2'])
    duplicate_ids = pd.Series(['B0', 'B1', 'B2', 'B3'])
    grouper = StringGrouper(master,
                            duplicates,
                            master_id=master_ids,
                            duplicates_id=duplicate_ids).fit()
    expected = pd.DataFrame([
        ('A0', 'foooo'),
        ('A1', 'bar'),
        ('A2', 'baz'),
        ('A0', 'foooo'),
    ])
    pd.testing.assert_frame_equal(expected, grouper.get_groups())
def test_get_matches_1_series_1_id_series(self):
    """get_matches() on one string series with ids is expected to list
    every matching pair, including self-matches, with both ids."""
    strings = pd.Series(['foo', 'bar', 'baz', 'foo'])
    ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
    grouper = StringGrouper(strings, master_id=ids).fit()

    # 'foo' occurs at A0 and A3, so it contributes four of the six rows.
    expected = pd.DataFrame({
        'left_side_id': ['A0', 'A0', 'A1', 'A2', 'A3', 'A3'],
        'left_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
        'right_side_id': ['A3', 'A0', 'A1', 'A2', 'A3', 'A0'],
        'right_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
        'similarity': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
    })
    pd.testing.assert_frame_equal(expected, grouper.get_matches())
def test_build_matrix_master_and_duplicates(self):
    """_get_tf_idf_matrices() is expected to return one csr matrix for the
    master and one for the duplicates, with matching column counts."""
    master = pd.Series(['foo', 'bar', 'baz'])
    duplicates = pd.Series(['foo', 'bar', 'bop'])
    grouper = StringGrouper(master, duplicates)
    master_matrix, dupe_matrix = grouper._get_tf_idf_matrices()

    expected_master = csr_matrix([[0., 0., 0., 1.],
                                  [1., 0., 0., 0.],
                                  [0., 1., 0., 0.]])
    expected_dupes = csr_matrix([[0., 0., 0., 1.],
                                 [1., 0., 0., 0.],
                                 [0., 0., 1., 0.]])
    comparisons = (
        (expected_master, master_matrix),
        (expected_dupes, dupe_matrix),
    )
    for expected, actual in comparisons:
        np.testing.assert_array_equal(expected.toarray(), actual.toarray())
def test_get_groups_2_string_series_with_numeric_indexes_and_missing_master_value(
        self):
    """get_groups() with replace_na=True is expected to return a DataFrame
    indexed like the duplicates; an unmatched duplicate ('baz', index 102)
    is reported with its own index and string."""
    master = pd.Series(['foooo', 'bar', 'foooo'], index=[0, 1, 2])
    duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'],
                           index=[100, 101, 102, 103])
    grouper = StringGrouper(master, duplicates, replace_na=True).fit()

    expected = pd.DataFrame(
        [(0, 'foooo'), (1, 'bar'), (102, 'baz'), (0, 'foooo')],
        columns=['most_similar_index', 'most_similar_master'],
        index=duplicates.index)
    pd.testing.assert_frame_equal(expected, grouper.get_groups())
def test_match_list_symmetry_without_symmetrize_function(
        self, mock_symmetrize_matrix_param):
    """With the symmetrizing step mocked out, _matches_list is expected to
    be only partially symmetric for a small max_n_matches.

    NOTE(review): mock_symmetrize_matrix_param is presumably injected by a
    mock.patch decorator targeting StringGrouper._symmetrize_matches_list;
    the decorator is not visible here - confirm.
    """
    example = SimpleExample()
    names = example.customers_df2['Customer Name']
    grouper = StringGrouper(names, max_n_matches=2).fit()
    mock_symmetrize_matrix_param.assert_called_once()

    pairs = grouper._matches_list
    # Split the matches into the parts above and below the diagonal.
    above = pairs[pairs['master_side'] < pairs['dupe_side']]
    below = pairs[pairs['master_side'] > pairs['dupe_side']]
    # Swapping the two column names transposes the lower part so it can be
    # compared against the upper part.
    mirrored = below.rename(columns={
        'master_side': 'dupe_side',
        'dupe_side': 'master_side'
    })
    shared = mirrored.merge(above,
                            how='inner',
                            on=['master_side', 'dupe_side'])

    # An empty intersection means no pair is mirrored at all; equal lengths
    # mean every pair is mirrored. Both count as acceptable symmetry, and
    # this test requires that neither holds.
    fully_mirrored = len(above) == len(mirrored) == len(shared)
    self.assertFalse(shared.empty or fully_mirrored)
def test_match_list_symmetry_with_symmetrize_function(self):
    """_matches_list is expected to be either completely non-symmetric or
    completely symmetric after fit(), never partially symmetric."""
    example = SimpleExample()
    names = example.customers_df2['Customer Name']
    grouper = StringGrouper(names, max_n_matches=2).fit()

    pairs = grouper._matches_list
    # Split the matches into the parts above and below the diagonal.
    above = pairs[pairs['master_side'] < pairs['dupe_side']]
    below = pairs[pairs['master_side'] > pairs['dupe_side']]
    # Swapping the two column names transposes the lower part so it can be
    # compared against the upper part.
    mirrored = below.rename(columns={
        'master_side': 'dupe_side',
        'dupe_side': 'master_side'
    })
    shared = mirrored.merge(above,
                            how='inner',
                            on=['master_side', 'dupe_side'])

    # An empty intersection means no pair is mirrored (acceptable). If it
    # is non-empty, all three lengths must agree so that every pair, not
    # just some, is mirrored.
    fully_mirrored = len(above) == len(mirrored) == len(shared)
    self.assertTrue(shared.empty or fully_mirrored)
def test_prior_matches_added(self):
    """Adding matches by hand is expected to merge them with pre-existing
    matches, so that all three strings end up in a single group."""
    names = pd.DataFrame(
        [
            'microsoftoffice 365 home',
            'microsoftoffice 365 pers',
            'microsoft office',
        ],
        columns=['name'])
    grouper = StringGrouper(names['name'], ignore_index=True).fit()
    grouper = grouper.add_match('microsoft office',
                                'microsoftoffice 365 home')
    grouper = grouper.add_match('microsoftoffice 365 pers',
                                'microsoft office')
    names['deduped'] = grouper.get_groups()

    # Every row should now share one group representative.
    distinct_groups = names['deduped'].unique()
    self.assertEqual(1, len(distinct_groups))
def test_get_matches_single(self):
    """get_matches() on a single series is expected to list every
    matching pair, including self-matches, with both positional indexes."""
    grouper = StringGrouper(pd.Series(['foo', 'bar', 'baz', 'foo'])).fit()

    # 'foo' occurs at positions 0 and 3, so it contributes four rows.
    expected = pd.DataFrame({
        'left_index': [0, 0, 1, 2, 3, 3],
        'left_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
        'similarity': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        'right_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
        'right_index': [0, 3, 1, 2, 0, 3],
    })
    pd.testing.assert_frame_equal(expected, grouper.get_matches())
def test_remove_match(self):
    """remove_match() is expected to drop the given pair; for a
    master-only grouper the mirrored pair must disappear as well."""

    def count_pairs(frame, left, right):
        # Number of rows in ``frame`` matching ``left`` -> ``right``.
        selected = frame[(frame['left_side'] == left)
                         & (frame['right_side'] == right)]
        return len(selected)

    master = pd.Series(['foooo', 'no match', 'baz', 'foooob'])
    duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])

    # Master only: both orientations of the pair are to be removed.
    single = StringGrouper(master).fit()
    single.remove_match('foooo', 'foooob')
    single_matches = single.get_matches()
    self.assertEqual(0, count_pairs(single_matches, 'foooo', 'foooob'))
    self.assertEqual(0, count_pairs(single_matches, 'foooob', 'foooo'))

    # Master plus duplicates: only the requested orientation is checked.
    paired = StringGrouper(master, duplicates).fit()
    paired.remove_match('foooo', 'foooob')
    self.assertEqual(
        0, count_pairs(paired.get_matches(), 'foooo', 'foooob'))