def test_build_matches(self):
    """Should build the expected similarity matrix from the tf-idf matrices of two series"""
    master_strings = pd.Series(['foo', 'bar', 'baz'])
    duplicate_strings = pd.Series(['foo', 'bar', 'bop'])
    grouper = StringGrouper(master_strings, duplicate_strings)
    master_matrix, dupe_matrix = grouper._get_tf_idf_matrices()

    # Only the identical strings ('foo' and 'bar') are expected to score 1;
    # every other cell, including the 'baz'/'bop' pair, is expected to be 0.
    expected_matches = np.array([
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 0.],
    ])
    actual_matches = grouper._build_matches(master_matrix, dupe_matrix).toarray()
    np.testing.assert_array_equal(expected_matches, actual_matches)
def test_add_match_single_group_matches_symmetric(self):
    """A match added to a master-only StringGrouper should show up in both directions"""
    strings = pd.Series(['foooo', 'no match', 'baz', 'foooo'])
    grouper = StringGrouper(strings).fit()
    grouper.add_match('no match', 'baz')
    matches = grouper.get_matches()

    # The added pair must be present exactly once as (left, right) and once
    # as (right, left).
    for left, right in (('no match', 'baz'), ('baz', 'no match')):
        pair_rows = matches[(matches.left_side == left) & (matches.right_side == right)]
        self.assertEqual(1, pair_rows.shape[0])
def test_get_groups_4_df_no_match(self):
    """Should return one row per duplicate. A duplicate with no match in
    master ('dooz') is expected to keep its own id and string"""
    master_strings = pd.Series(['foooo', 'bar', 'baz'])
    duplicate_strings = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooob'])
    master_ids = pd.Series(['A0', 'A1', 'A2'])
    duplicate_ids = pd.Series(['B0', 'B1', 'B2', 'B3', 'B4'])

    grouper = StringGrouper(
        master_strings,
        duplicate_strings,
        master_id=master_ids,
        duplicates_id=duplicate_ids,
    ).fit()
    result = grouper.get_groups()

    # One (id, string) pair per duplicate, in duplicate order.
    expected_ids = ['A0', 'B1', 'A1', 'A2', 'A0']
    expected_strings = ['foooo', 'dooz', 'bar', 'baz', 'foooo']
    expected_result = pd.DataFrame(list(zip(expected_ids, expected_strings)))
    pd.testing.assert_frame_equal(expected_result, result)
def test_get_groups_4_df_same_similarity(self):
    """Should return one row per duplicate. When two master entries are the
    same string ('foooo' at A0 and A3), the first one (A0) is expected"""
    master_strings = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
    duplicate_strings = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
    master_ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
    duplicate_ids = pd.Series(['B0', 'B1', 'B2', 'B3'])

    grouper = StringGrouper(
        master_strings,
        duplicate_strings,
        master_id=master_ids,
        duplicates_id=duplicate_ids,
    ).fit()
    result = grouper.get_groups()

    # One (id, string) pair per duplicate, in duplicate order.
    expected_ids = ['A0', 'A1', 'A2', 'A0']
    expected_strings = ['foooo', 'bar', 'baz', 'foooo']
    expected_result = pd.DataFrame(list(zip(expected_ids, expected_strings)))
    pd.testing.assert_frame_equal(expected_result, result)
def test_get_groups_2_string_series_2_id_series(self):
    """Should return one row per duplicate, holding the id and string of the
    master entry expected to be its best match"""
    master_strings = pd.Series(['foooo', 'bar', 'baz'])
    duplicate_strings = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
    master_ids = pd.Series(['A0', 'A1', 'A2'])
    duplicate_ids = pd.Series(['B0', 'B1', 'B2', 'B3'])

    grouper = StringGrouper(
        master_strings,
        duplicate_strings,
        master_id=master_ids,
        duplicates_id=duplicate_ids,
    ).fit()
    result = grouper.get_groups()

    # 'foooob' has no identical master string; it is expected to resolve to
    # 'foooo' (A0).
    expected_ids = ['A0', 'A1', 'A2', 'A0']
    expected_strings = ['foooo', 'bar', 'baz', 'foooo']
    expected_result = pd.DataFrame(list(zip(expected_ids, expected_strings)))
    pd.testing.assert_frame_equal(expected_result, result)
def test_remove_match(self):
    """Should remove a match, for both master-only and master/duplicates groupers"""
    master_strings = pd.Series(['foooo', 'no match', 'baz', 'foooob'])
    duplicate_strings = pd.Series(['foooo', 'bar', 'baz', 'foooob'])

    # Master-only grouper: matches are recorded in both directions, so both
    # variants of the removed pair must be gone.
    single_grouper = StringGrouper(master_strings).fit()
    single_grouper.remove_match('foooo', 'foooob')
    single_matches = single_grouper.get_matches()
    for left, right in (('foooo', 'foooob'), ('foooob', 'foooo')):
        remaining = single_matches[
            (single_matches.left_side == left) & (single_matches.right_side == right)
        ]
        self.assertEqual(0, remaining.shape[0])

    # Master/duplicates grouper: only the removed direction is checked.
    paired_grouper = StringGrouper(master_strings, duplicate_strings).fit()
    paired_grouper.remove_match('foooo', 'foooob')
    paired_matches = paired_grouper.get_matches()
    remaining = paired_matches[
        (paired_matches.left_side == 'foooo') & (paired_matches.right_side == 'foooob')
    ]
    self.assertEqual(0, remaining.shape[0])
def test_get_matches_1_series_1_id_series(self):
    """Should return the matches of a single series together with the ids of both sides"""
    strings = pd.Series(['foo', 'bar', 'baz', 'foo'])
    ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
    grouper = StringGrouper(strings, master_id=ids).fit()

    # The two 'foo' entries (A0 and A3) are expected to match themselves and
    # each other; 'bar' and 'baz' are expected to match only themselves.
    expected_df = pd.DataFrame({
        'left_side_id': ['A0', 'A0', 'A1', 'A2', 'A3', 'A3'],
        'left_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
        'right_side_id': ['A3', 'A0', 'A1', 'A2', 'A3', 'A0'],
        'right_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
        'similarity': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
    })
    pd.testing.assert_frame_equal(expected_df, grouper.get_matches())
def test_get_matches_single(self):
    """Should return the left side, right side and similarity of every match in a single series"""
    # NOTE(review): another method named ``test_get_matches_single`` appears
    # later in this file. If both belong to the same class, the later
    # definition replaces this one and this test never runs - confirm.
    strings = pd.Series(['foo', 'bar', 'baz', 'foo'])
    grouper = StringGrouper(strings).fit()

    # The two 'foo' entries are expected to match themselves and each other;
    # 'bar' and 'baz' are expected to match only themselves.
    matched_strings = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
    expected_df = pd.DataFrame({
        'left_side': matched_strings,
        'right_side': list(matched_strings),
        'similarity': [1.0] * 6,
    })
    pd.testing.assert_frame_equal(expected_df, grouper.get_matches())
def test_get_groups_1_string_series_1_id_series(self):
    """Should return a pd.DataFrame with one row per input string, holding
    the id and string of the representative of each string's group"""
    strings = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
    ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
    grouper = StringGrouper(strings, master_id=ids, ignore_index=True).fit()
    result = grouper.get_groups()

    # 'foooob' is expected to be grouped under 'foooo' (A0); the other
    # strings are expected to represent themselves.
    representative_ids = ['A0', 'A1', 'A2', 'A0']
    representative_strings = ['foooo', 'bar', 'baz', 'foooo']
    expected_result = pd.DataFrame(
        list(zip(representative_ids, representative_strings)),
        columns=['group_rep_id', 'group_rep'],
    )
    pd.testing.assert_frame_equal(expected_result, result)
def test_build_matrix_master_and_duplicates(self):
    """Should create a tf-idf matrix for master and one for duplicates"""
    master_strings = pd.Series(['foo', 'bar', 'baz'])
    duplicate_strings = pd.Series(['foo', 'bar', 'bop'])
    grouper = StringGrouper(master_strings, duplicate_strings)
    master_matrix, dupe_matrix = grouper._get_tf_idf_matrices()

    # One row per string and one column per distinct term: the two series
    # share the columns for 'foo' and 'bar' and differ only in the last row.
    expected_master = csr_matrix([
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
    ])
    expected_dupes = csr_matrix([
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
    ])

    np.testing.assert_array_equal(expected_master.toarray(), master_matrix.toarray())
    np.testing.assert_array_equal(expected_dupes.toarray(), dupe_matrix.toarray())
def test_get_groups_2_string_series_with_numeric_indexes_and_missing_master_value(
        self):
    """Should return a pd.DataFrame with one row per duplicate, holding the
    index and string of the master entry expected to be its best match. With
    replace_na=True a duplicate without a master match ('baz') is expected to
    fall back to its own index and string"""
    master_strings = pd.Series(['foooo', 'bar', 'foooo'], index=[0, 1, 2])
    duplicate_strings = pd.Series(
        ['foooo', 'bar', 'baz', 'foooob'],
        index=[100, 101, 102, 103],
    )
    grouper = StringGrouper(master_strings, duplicate_strings, replace_na=True).fit()
    result = grouper.get_groups()

    # 102 is the duplicate's own index: 'baz' does not occur in master.
    expected_indexes = [0, 1, 102, 0]
    expected_strings = ['foooo', 'bar', 'baz', 'foooo']
    expected_result = pd.DataFrame(
        list(zip(expected_indexes, expected_strings)),
        columns=['most_similar_index', 'most_similar_master'],
        index=duplicate_strings.index,
    )
    pd.testing.assert_frame_equal(expected_result, result)
def test_build_matches_list(self):
    """Should build the list of matches between two series after fitting"""
    master_strings = pd.Series(['foo', 'bar', 'baz'])
    duplicate_strings = pd.Series(['foo', 'bar', 'bop'])
    grouper = StringGrouper(master_strings, duplicate_strings).fit()

    # Only positions 0 ('foo') and 1 ('bar') are expected to match, each with
    # similarity 1.0.
    expected_df = pd.DataFrame({
        'master_side': [0, 1],
        'dupe_side': [0, 1],
        'similarity': [1.0, 1.0],
    })
    pd.testing.assert_frame_equal(expected_df, grouper._matches_list)
def test_get_matches_raises_exception_if_unexpected_options_given(self):
    """Should raise when the id series or the option combinations given to StringGrouper are invalid"""
    master_strings = pd.Series(['foo', 'bar', 'baz'])
    short_master_ids = pd.Series(['A0', 'A1'])
    master_ids = pd.Series(['A0', 'A1', 'A2'])
    duplicate_strings = pd.Series(['foo', 'bar', 'bop'])
    short_duplicate_ids = pd.Series(['B0', 'B1'])
    duplicate_ids = pd.Series(['B0', 'B1', 'B2'])

    # Each entry is (positional args, keyword args) for one constructor call
    # that is expected to raise. The calls run in the listed order.
    invalid_calls = [
        # When the input id data does not correspond with its string data:
        ((master_strings,), {'master_id': short_master_ids}),
        ((master_strings,), {'duplicates': duplicate_strings,
                             'duplicates_id': short_duplicate_ids,
                             'master_id': master_ids}),
        # When the input data is ok but the option combinations are invalid:
        ((master_strings, duplicate_strings), {'master_id': master_ids}),
        ((master_strings, duplicate_strings), {'duplicates_id': duplicate_ids}),
        ((master_strings,), {'duplicates_id': duplicate_ids}),
        ((master_strings,), {'master_id': master_ids,
                             'duplicates_id': duplicate_ids}),
        ((master_strings,), {'master_id': master_ids,
                             'ignore_index': True,
                             'replace_na': True}),
    ]
    for positional, keywords in invalid_calls:
        with self.assertRaises(Exception):
            StringGrouper(*positional, **keywords)

    # Here we force an exception by making the number of index-levels of
    # duplicates different from master and setting replace_na=True.
    two_level_index = pd.MultiIndex.from_tuples(list(zip(list('ABC'), [0, 1, 2])))
    duplicate_strings.index = two_level_index
    with self.assertRaises(Exception):
        StringGrouper(master_strings, duplicates=duplicate_strings, replace_na=True)
def test_prior_matches_added(self):
    """Adding a new match should also update the groups of matches that already exist"""
    names = [
        'microsoftoffice 365 home',
        'microsoftoffice 365 pers',
        'microsoft office',
    ]
    df = pd.DataFrame(names, columns=['name'])

    # Link all three names together through two manually added matches.
    grouper = (
        StringGrouper(df['name'], ignore_index=True)
        .fit()
        .add_match('microsoft office', 'microsoftoffice 365 home')
        .add_match('microsoftoffice 365 pers', 'microsoft office')
    )
    df['deduped'] = grouper.get_groups()

    # All strings should now match to the same "master" string
    distinct_groups = df['deduped'].unique()
    self.assertEqual(1, len(distinct_groups))
def test_get_matches_two_dataframes(self):
    """Should return the matches between a master and a duplicates series, with the index of both sides"""
    master_strings = pd.Series(['foo', 'bar', 'baz'])
    duplicate_strings = pd.Series(['foo', 'bar', 'bop'])
    grouper = StringGrouper(master_strings, duplicate_strings).fit()

    # Only 'foo' and 'bar' occur in both series, so two matches are expected.
    expected_df = pd.DataFrame({
        'left_index': [0, 1],
        'left_side': ['foo', 'bar'],
        'similarity': [1.0, 1.0],
        'right_side': ['foo', 'bar'],
        'right_index': [0, 1],
    })
    pd.testing.assert_frame_equal(expected_df, grouper.get_matches())
def test_case_insensitive_build_matches_list(self):
    """Should build the list of matches between two series regardless of letter case"""
    test_series_1 = pd.Series(['foo', 'BAR', 'baz'])
    test_series_2 = pd.Series(['FOO', 'bar', 'bop'])
    sg = StringGrouper(test_series_1, test_series_2)
    sg = sg.fit()

    # 'foo'/'FOO' and 'BAR'/'bar' are expected to match with similarity 1.0
    # despite differing in case.
    expected_df = pd.DataFrame({
        'master_side': [0, 1],
        'dupe_side': [0, 1],
        'similarity': [1.0, 1.0],
    })
    # Cast the similarity column to the dtype the grouper is configured with.
    # DataFrame.astype returns a new frame with the requested column dtype.
    # Assigning through ``.loc[:, 'similarity']`` writes into the existing
    # column in place on pandas >= 2.0 and keeps its float64 dtype, which
    # makes the dtype-checking comparison below fail for any other dtype.
    expected_df = expected_df.astype(
        {'similarity': sg._config.tfidf_matrix_dtype})
    pd.testing.assert_frame_equal(expected_df, sg._matches_list)
def test_get_matches_single(self):
    """Should return every match in a single series, with the index of both sides"""
    # NOTE(review): an earlier method in this file is also named
    # ``test_get_matches_single``. If both belong to the same class, this
    # definition replaces the earlier one - confirm which is intended.
    test_series_1 = pd.Series(['foo', 'bar', 'baz', 'foo'])
    sg = StringGrouper(test_series_1)
    sg = sg.fit()

    # The two 'foo' entries (index 0 and 3) are expected to match themselves
    # and each other; 'bar' and 'baz' are expected to match only themselves.
    expected_df = pd.DataFrame({
        'left_index': [0, 0, 1, 2, 3, 3],
        'left_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
        'similarity': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        'right_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
        'right_index': [0, 3, 1, 2, 0, 3],
    })
    # Cast the similarity column to the dtype the grouper is configured with.
    # DataFrame.astype returns a new frame with the requested column dtype.
    # Assigning through ``.loc[:, 'similarity']`` writes into the existing
    # column in place on pandas >= 2.0 and keeps its float64 dtype, which
    # makes the dtype-checking comparison below fail for any other dtype.
    expected_df = expected_df.astype(
        {'similarity': sg._config.tfidf_matrix_dtype})
    pd.testing.assert_frame_equal(expected_df, sg.get_matches())
def test_n_grams(self):
    """Should return all ngrams in a string"""
    # The series content is only needed to construct a StringGrouper. A
    # single pd.Series is enough: wrapping it in a second pd.Series, as this
    # test previously did, only rebuilds the same series.
    test_series = pd.Series(['aa'])
    sg = StringGrouper(test_series)
    expected_result = ['McD', 'cDo', 'Don', 'ona', 'nal', 'ald', 'lds']
    self.assertListEqual(expected_result, sg.n_grams('McDonalds'))