def test_add_match_multiple_occurences(self):
    """A manually added match should appear once per exact duplicate of the string."""
    left_strings = pd.Series(['foooo', 'no match', 'baz', 'foooo'])
    right_strings = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
    grouper = StringGrouper(left_strings, right_strings).fit()
    grouper.add_match('foooo', 'baz')

    all_matches = grouper.get_matches()
    # Keep only the rows that pair the manually added strings.
    is_added_pair = (all_matches.left_side == 'foooo') & (all_matches.right_side == 'baz')
    added_rows = all_matches[is_added_pair]

    # 'foooo' occurs twice in the first series, so two rows are expected.
    self.assertEqual(2, added_rows.shape[0])
def test_add_match_single_group_matches_symmetric(self):
    """With only a master series, an added match must show up in both directions."""
    master_strings = pd.Series(['foooo', 'no match', 'baz', 'foooo'])
    grouper = StringGrouper(master_strings).fit()
    grouper.add_match('no match', 'baz')

    all_matches = grouper.get_matches()
    # Check the pair as added first, then its mirror image.
    orientations = (('no match', 'baz'), ('baz', 'no match'))
    for left_value, right_value in orientations:
        on_left = all_matches.left_side == left_value
        on_right = all_matches.right_side == right_value
        pair_rows = all_matches[on_left & on_right]
        self.assertEqual(1, pair_rows.shape[0])
def test_add_match_raises_exception_if_string_not_present(self):
    """add_match should raise ValueError when either argument is an unknown string.

    Checked for a grouper built from one series and for a grouper built
    from two series, with the unknown string in each argument position.
    """
    first_series = pd.Series(['foooo', 'no match', 'baz', 'foooo'])
    second_series = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
    single_series_grouper = StringGrouper(first_series).fit()
    two_series_grouper = StringGrouper(first_series, second_series).fit()

    # Unknown string first as the left argument, then as the right one.
    invalid_pairs = (('doesnt exist', 'baz'), ('baz', 'doesnt exist'))
    for grouper in (single_series_grouper, two_series_grouper):
        for left_value, right_value in invalid_pairs:
            with self.assertRaises(ValueError):
                grouper.add_match(left_value, right_value)
def test_prior_matches_added(self):
    """Adding a new match should also update matches that already exist.

    After both manual matches are added, all three sample strings are
    expected to share a single group label.
    """
    product_names = [
        'microsoftoffice 365 home',
        'microsoftoffice 365 pers',
        'microsoft office',
    ]
    frame = pd.DataFrame(product_names, columns=['name'])

    grouper = StringGrouper(frame['name'], ignore_index=True).fit()
    # Each add_match call is made on the value returned by the previous step.
    grouper = grouper.add_match(
        'microsoft office', 'microsoftoffice 365 home'
    ).add_match(
        'microsoftoffice 365 pers', 'microsoft office'
    )
    frame['deduped'] = grouper.get_groups()

    # All strings should now match to the same "master" string
    distinct_groups = frame.deduped.unique()
    self.assertEqual(1, len(distinct_groups))