Пример #1
0
 def test_add_match_single_occurence(self):
     """A manually added match must show up exactly once in get_matches().

     Checked for a grouper fitted on a master series only, and for one
     fitted on a master series together with a duplicates series.
     """
     master = pd.Series(['foooo', 'no match', 'baz', 'foooo'])
     duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])

     def pair_count(frame, left, right):
         # Number of rows whose (left_side, right_side) equals the given pair.
         selected = frame[(frame.left_side == left) & (frame.right_side == right)]
         return selected.shape[0]

     # Master series only.
     single_grouper = StringGrouper(master).fit()
     single_grouper.add_match('no match', 'baz')
     self.assertEqual(1, pair_count(single_grouper.get_matches(), 'no match', 'baz'))

     # Master series plus duplicates series.
     double_grouper = StringGrouper(master, duplicates).fit()
     double_grouper.add_match('no match', 'bar')
     self.assertEqual(1, pair_count(double_grouper.get_matches(), 'no match', 'bar'))
 def test_get_matches_2_series_2_id_series(self):
     """get_matches() should pair equal strings across two series and carry their IDs.

     Only 'foo' and 'bar' occur in both input series, so the expected frame
     holds those two rows with similarity 1.0, together with the positional
     index and the ID of each side.
     """
     test_series_1 = pd.Series(['foo', 'bar', 'baz'])
     test_series_id_1 = pd.Series(['A0', 'A1', 'A2'])
     test_series_2 = pd.Series(['foo', 'bar', 'bop'])
     test_series_id_2 = pd.Series(['B0', 'B1', 'B2'])
     sg = StringGrouper(test_series_1,
                        test_series_2,
                        duplicates_id=test_series_id_2,
                        master_id=test_series_id_1).fit()
     expected_df = pd.DataFrame({
         'left_index': [0, 1],
         'left_side': ['foo', 'bar'],
         'left_id': ['A0', 'A1'],
         'similarity': [1.0, 1.0],
         'right_id': ['B0', 'B1'],
         'right_side': ['foo', 'bar'],
         'right_index': [0, 1],
     })
     # Replace the whole column so the dtype cast sticks. Assigning through
     # ``.loc[:, 'similarity']`` writes into the existing float64 column in
     # place on pandas >= 2.0, which would silently discard the cast.
     expected_df['similarity'] = expected_df['similarity'].astype(
         sg._config.tfidf_matrix_dtype)
     pd.testing.assert_frame_equal(expected_df, sg.get_matches())
Пример #3
0
 def test_get_matches_two_dataframes(self):
     """get_matches() on two series should list the strings found in both.

     'foo' and 'bar' appear in both inputs and are expected as the only
     rows, each with similarity 1.0.
     """
     master = pd.Series(['foo', 'bar', 'baz'])
     duplicates = pd.Series(['foo', 'bar', 'bop'])
     grouper = StringGrouper(master, duplicates).fit()

     expected = pd.DataFrame({
         'left_side': ['foo', 'bar'],
         'right_side': ['foo', 'bar'],
         'similarity': [1.0, 1.0],
     })
     actual = grouper.get_matches()
     pd.testing.assert_frame_equal(expected, actual)
Пример #4
0
    def test_remove_match(self):
        """After remove_match(), get_matches() must hold no row for that pair."""
        master = pd.Series(['foooo', 'no match', 'baz', 'foooob'])
        duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])

        def pair_count(frame, left, right):
            # Number of rows whose (left_side, right_side) equals the given pair.
            selected = frame[(frame.left_side == left) & (frame.right_side == right)]
            return selected.shape[0]

        single_grouper = StringGrouper(master).fit()
        single_grouper.remove_match('foooo', 'foooob')
        remaining = single_grouper.get_matches()
        forward_rows = pair_count(remaining, 'foooo', 'foooob')
        # With only a master series the matches exist in both directions,
        # so the reversed pair has to be gone as well.
        backward_rows = pair_count(remaining, 'foooob', 'foooo')
        self.assertEqual(0, forward_rows)
        self.assertEqual(0, backward_rows)

        double_grouper = StringGrouper(master, duplicates).fit()
        double_grouper.remove_match('foooo', 'foooob')
        self.assertEqual(0, pair_count(double_grouper.get_matches(), 'foooo', 'foooob'))
Пример #5
0
 def test_add_match_single_group_matches_symmetric(self):
     """A match added to a master-only grouper must appear in both directions."""
     master = pd.Series(['foooo', 'no match', 'baz', 'foooo'])
     grouper = StringGrouper(master).fit()
     grouper.add_match('no match', 'baz')
     found = grouper.get_matches()

     # The pair as it was added ...
     forward = (found.left_side == 'no match') & (found.right_side == 'baz')
     self.assertEqual(1, found[forward].shape[0])
     # ... and its mirror image.
     backward = (found.left_side == 'baz') & (found.right_side == 'no match')
     self.assertEqual(1, found[backward].shape[0])
 def test_get_matches_1_series_1_id_series(self):
     """get_matches() on one series with IDs should report the IDs on both sides.

     'foo' occurs twice (IDs A0 and A3), so it is expected to match itself
     and its twin in both directions; 'bar' and 'baz' only match themselves.
     """
     strings = pd.Series(['foo', 'bar', 'baz', 'foo'])
     ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
     grouper = StringGrouper(strings, master_id=ids).fit()

     expected = pd.DataFrame({
         'left_side_id': ['A0', 'A0', 'A1', 'A2', 'A3', 'A3'],
         'left_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
         'right_side_id': ['A3', 'A0', 'A1', 'A2', 'A3', 'A0'],
         'right_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
         'similarity': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
     })
     pd.testing.assert_frame_equal(expected, grouper.get_matches())
Пример #7
0
 def test_get_matches_single(self):
     """get_matches() on a single series should pair every string with its equals.

     'foo' sits at positions 0 and 3, so four of the six expected rows
     involve it; 'bar' and 'baz' each match only themselves.
     """
     strings = pd.Series(['foo', 'bar', 'baz', 'foo'])
     grouper = StringGrouper(strings).fit()

     expected = pd.DataFrame({
         'left_index': [0, 0, 1, 2, 3, 3],
         'left_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
         'similarity': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
         'right_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
         'right_index': [0, 3, 1, 2, 0, 3],
     })
     pd.testing.assert_frame_equal(expected, grouper.get_matches())
 def test_get_matches_two_dataframes(self):
     """get_matches() on two series should list the strings found in both.

     'foo' and 'bar' appear in both inputs at positions 0 and 1, and are
     expected as the only rows, each with similarity 1.0.
     """
     test_series_1 = pd.Series(['foo', 'bar', 'baz'])
     test_series_2 = pd.Series(['foo', 'bar', 'bop'])
     sg = StringGrouper(test_series_1, test_series_2).fit()
     expected_df = pd.DataFrame({
         'left_index': [0, 1],
         'left_side': ['foo', 'bar'],
         'similarity': [1.0, 1.0],
         'right_side': ['foo', 'bar'],
         'right_index': [0, 1],
     })
     # Replace the whole column so the dtype cast sticks. Assigning through
     # ``.loc[:, 'similarity']`` writes into the existing float64 column in
     # place on pandas >= 2.0, which would silently discard the cast.
     expected_df['similarity'] = expected_df['similarity'].astype(
         sg._config.tfidf_matrix_dtype)
     pd.testing.assert_frame_equal(expected_df, sg.get_matches())