Пример #1
0
 def test_add_match_multiple_occurences(self):
     """A manually added match is expected once per exact duplicate string."""
     master = pd.Series(['foooo', 'no match', 'baz', 'foooo'])
     duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     grouper = StringGrouper(master, duplicates).fit()
     grouper.add_match('foooo', 'baz')
     all_matches = grouper.get_matches()
     # Keep only the rows describing the pair that was just added.
     is_added_pair = (
         (all_matches.left_side == 'foooo')
         & (all_matches.right_side == 'baz')
     )
     added_rows = all_matches[is_added_pair]
     # 'foooo' appears twice in the first series, hence two expected rows.
     self.assertEqual(2, len(added_rows))
Пример #2
0
 def test_add_match_single_group_matches_symmetric(self):
     """With only a master series, an added match must show up in both directions."""
     master = pd.Series(['foooo', 'no match', 'baz', 'foooo'])
     grouper = StringGrouper(master).fit()
     grouper.add_match('no match', 'baz')
     all_matches = grouper.get_matches()
     # Check the pair as added first, then its mirrored orientation.
     for left, right in (('no match', 'baz'), ('baz', 'no match')):
         pair_rows = all_matches[
             (all_matches.left_side == left) & (all_matches.right_side == right)
         ]
         self.assertEqual(1, pair_rows.shape[0])
Пример #3
0
 def test_add_match_raises_exception_if_string_not_present(self):
     """add_match should raise ValueError when either argument is an unknown string.

     Exercised for a grouper fitted on a single series and for one fitted
     on two series, with the unknown string in each argument position.
     """
     master = pd.Series(['foooo', 'no match', 'baz', 'foooo'])
     duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     # Both groupers are built up front, before any assertion runs.
     single_series_grouper = StringGrouper(master).fit()
     two_series_grouper = StringGrouper(master, duplicates).fit()
     unknown_pairs = (('doesnt exist', 'baz'), ('baz', 'doesnt exist'))
     for grouper in (single_series_grouper, two_series_grouper):
         for left, right in unknown_pairs:
             with self.assertRaises(ValueError):
                 grouper.add_match(left, right)
    def test_prior_matches_added(self):
        """Adding a new match should also update the matches that already existed."""
        names = pd.DataFrame(
            [
                'microsoftoffice 365 home',
                'microsoftoffice 365 pers',
                'microsoft office',
            ],
            columns=['name'],
        )
        grouper = StringGrouper(names['name'], ignore_index=True).fit()

        # Link the third string to the first, then the second to the third;
        # each add_match result is used as the grouper for the next step.
        grouper = grouper.add_match(
            'microsoft office', 'microsoftoffice 365 home'
        ).add_match(
            'microsoftoffice 365 pers', 'microsoft office'
        )
        names['deduped'] = grouper.get_groups()

        # Every row is expected to resolve to one shared group value.
        distinct_groups = names['deduped'].unique()
        self.assertEqual(1, len(distinct_groups))