Пример #1
0
    def test_build_matches(self):
        """Check the similarity matrix built from a master and a duplicates series.

        'foo' and 'bar' occur in both series and are expected to match with
        similarity 1; the third row ('baz' vs. the duplicates) is expected to
        be all zeros.
        """
        master_strings = pd.Series(['foo', 'bar', 'baz'])
        duplicate_strings = pd.Series(['foo', 'bar', 'bop'])
        grouper = StringGrouper(master_strings, duplicate_strings)
        master_matrix, duplicate_matrix = grouper._get_tf_idf_matrices()
        actual = grouper._build_matches(master_matrix, duplicate_matrix).toarray()

        expected = np.array([
            [1., 0., 0.],
            [0., 1., 0.],
            [0., 0., 0.],
        ])
        np.testing.assert_array_equal(expected, actual)
 def test_add_match_single_group_matches_symmetric(self):
     """A match added to a master-only grouper must show up in both directions."""
     strings = pd.Series(['foooo', 'no match', 'baz', 'foooo'])
     grouper = StringGrouper(strings).fit()
     grouper.add_match('no match', 'baz')
     matches = grouper.get_matches()

     # Exactly one row is expected for each ordering of the added pair.
     for left, right in (('no match', 'baz'), ('baz', 'no match')):
         hits = matches[(matches.left_side == left)
                        & (matches.right_side == right)]
         self.assertEqual(1, hits.shape[0])
 def test_get_groups_4_df_no_match(self):
     """get_groups() should yield one row per duplicate, as a DataFrame.

     A duplicate with no counterpart in the master series ('dooz') is
     expected to keep its own id and string.
     """
     master = pd.Series(['foooo', 'bar', 'baz'])
     duplicates = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooob'])
     master_ids = pd.Series(['A0', 'A1', 'A2'])
     duplicate_ids = pd.Series(['B0', 'B1', 'B2', 'B3', 'B4'])
     grouper = StringGrouper(
         master,
         duplicates,
         master_id=master_ids,
         duplicates_id=duplicate_ids,
     ).fit()

     expected_rows = [
         ('A0', 'foooo'),
         ('B1', 'dooz'),
         ('A1', 'bar'),
         ('A2', 'baz'),
         ('A0', 'foooo'),
     ]
     pd.testing.assert_frame_equal(pd.DataFrame(expected_rows),
                                   grouper.get_groups())
 def test_get_groups_4_df_same_similarity(self):
     """get_groups() should yield one row per duplicate, as a DataFrame.

     The master series contains 'foooo' twice (ids 'A0' and 'A3'); when two
     candidates tie on similarity the first one ('A0') is expected.
     """
     master = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
     duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     master_ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
     duplicate_ids = pd.Series(['B0', 'B1', 'B2', 'B3'])
     grouper = StringGrouper(
         master,
         duplicates,
         master_id=master_ids,
         duplicates_id=duplicate_ids,
     ).fit()

     expected_rows = [
         ('A0', 'foooo'),
         ('A1', 'bar'),
         ('A2', 'baz'),
         ('A0', 'foooo'),
     ]
     pd.testing.assert_frame_equal(pd.DataFrame(expected_rows),
                                   grouper.get_groups())
 def test_get_groups_2_string_series_2_id_series(self):
     """get_groups() should yield one row per duplicate, as a DataFrame.

     Each duplicate is expected to be paired with the id and string of the
     master entry it is most similar to ('foooob' -> 'A0' / 'foooo').
     """
     master = pd.Series(['foooo', 'bar', 'baz'])
     duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     master_ids = pd.Series(['A0', 'A1', 'A2'])
     duplicate_ids = pd.Series(['B0', 'B1', 'B2', 'B3'])
     grouper = StringGrouper(
         master,
         duplicates,
         master_id=master_ids,
         duplicates_id=duplicate_ids,
     ).fit()

     expected_rows = [
         ('A0', 'foooo'),
         ('A1', 'bar'),
         ('A2', 'baz'),
         ('A0', 'foooo'),
     ]
     pd.testing.assert_frame_equal(pd.DataFrame(expected_rows),
                                   grouper.get_groups())
    def test_remove_match(self):
        """remove_match() should drop the pair for master-only and master/duplicates groupers."""
        master = pd.Series(['foooo', 'no match', 'baz', 'foooob'])
        duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])

        def count_pairs(frame, left, right):
            """Return the number of rows of *frame* pairing *left* with *right*."""
            selected = frame[(frame.left_side == left)
                             & (frame.right_side == right)]
            return selected.shape[0]

        single = StringGrouper(master).fit()
        single.remove_match('foooo', 'foooob')
        single_matches = single.get_matches()
        # With only a master series the matches go both ways, so both
        # orderings of the pair must be gone.
        self.assertEqual(0, count_pairs(single_matches, 'foooo', 'foooob'))
        self.assertEqual(0, count_pairs(single_matches, 'foooob', 'foooo'))

        paired = StringGrouper(master, duplicates).fit()
        paired.remove_match('foooo', 'foooob')
        self.assertEqual(
            0, count_pairs(paired.get_matches(), 'foooo', 'foooob'))
 def test_get_matches_1_series_1_id_series(self):
     """get_matches() on one series with ids should report ids and strings for both sides.

     'foo' occurs twice (ids 'A0' and 'A3'), so each occurrence is expected
     to match itself and the other one.
     """
     strings = pd.Series(['foo', 'bar', 'baz', 'foo'])
     ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
     grouper = StringGrouper(strings, master_id=ids).fit()

     expected = pd.DataFrame({
         'left_side_id': ['A0', 'A0', 'A1', 'A2', 'A3', 'A3'],
         'left_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
         'right_side_id': ['A3', 'A0', 'A1', 'A2', 'A3', 'A0'],
         'right_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
         'similarity': [1.0] * 6,
     })
     pd.testing.assert_frame_equal(expected, grouper.get_matches())
 def test_get_matches_single(self):
     """get_matches() on a single series should list every matching pair with its similarity."""
     strings = pd.Series(['foo', 'bar', 'baz', 'foo'])
     grouper = StringGrouper(strings).fit()

     # Both sides carry the same strings because only exact repeats match here.
     paired_strings = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
     expected = pd.DataFrame({
         'left_side': paired_strings,
         'right_side': list(paired_strings),
         'similarity': [1.0] * 6,
     })
     pd.testing.assert_frame_equal(expected, grouper.get_matches())
 def test_get_groups_1_string_series_1_id_series(self):
     """get_groups() on one series with ids should return one row per input string.

     The expected frame has the columns 'group_rep_id' and 'group_rep';
     'foooob' is expected to be represented by 'foooo' (id 'A0').
     """
     strings = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
     grouper = StringGrouper(strings, master_id=ids, ignore_index=True).fit()

     expected_rows = [
         ('A0', 'foooo'),
         ('A1', 'bar'),
         ('A2', 'baz'),
         ('A0', 'foooo'),
     ]
     expected = pd.DataFrame(expected_rows,
                             columns=['group_rep_id', 'group_rep'])
     pd.testing.assert_frame_equal(expected, grouper.get_groups())
    def test_build_matrix_master_and_duplicates(self):
        """_get_tf_idf_matrices() should return one sparse matrix each for master and duplicates."""
        master_strings = pd.Series(['foo', 'bar', 'baz'])
        duplicate_strings = pd.Series(['foo', 'bar', 'bop'])
        grouper = StringGrouper(master_strings, duplicate_strings)
        master_matrix, duplicate_matrix = grouper._get_tf_idf_matrices()

        expected_master = csr_matrix([
            [0., 0., 0., 1.],
            [1., 0., 0., 0.],
            [0., 1., 0., 0.],
        ])
        expected_duplicates = csr_matrix([
            [0., 0., 0., 1.],
            [1., 0., 0., 0.],
            [0., 0., 1., 0.],
        ])

        # Compare densified arrays, master first, then duplicates.
        comparisons = (
            (expected_master, master_matrix),
            (expected_duplicates, duplicate_matrix),
        )
        for expected, actual in comparisons:
            np.testing.assert_array_equal(expected.toarray(), actual.toarray())
 def test_get_groups_2_string_series_with_numeric_indexes_and_missing_master_value(
         self):
     """get_groups() with replace_na=True should return one row per duplicate.

     The result is expected to be indexed like the duplicates series. 'baz'
     has no counterpart in master, so its own index (102) and string are
     expected in its row.
     """
     master = pd.Series(['foooo', 'bar', 'foooo'], index=[0, 1, 2])
     duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'],
                            index=[100, 101, 102, 103])
     grouper = StringGrouper(master, duplicates, replace_na=True).fit()

     expected_rows = [
         (0, 'foooo'),
         (1, 'bar'),
         (102, 'baz'),
         (0, 'foooo'),
     ]
     expected = pd.DataFrame(
         expected_rows,
         columns=['most_similar_index', 'most_similar_master'],
         index=duplicates.index)
     pd.testing.assert_frame_equal(expected, grouper.get_groups())
 def test_build_matches_list(self):
     """fit() should populate _matches_list with the positions of matching master/duplicate pairs."""
     master_strings = pd.Series(['foo', 'bar', 'baz'])
     duplicate_strings = pd.Series(['foo', 'bar', 'bop'])
     grouper = StringGrouper(master_strings, duplicate_strings).fit()

     # Only 'foo' and 'bar' (positions 0 and 1) occur on both sides.
     expected = pd.DataFrame({
         'master_side': [0, 1],
         'dupe_side': [0, 1],
         'similarity': [1.0, 1.0],
     })
     pd.testing.assert_frame_equal(expected, grouper._matches_list)
    def test_get_matches_raises_exception_if_unexpected_options_given(self):
        """Constructing a StringGrouper must raise for mismatched id data or invalid option combinations."""
        master = pd.Series(['foo', 'bar', 'baz'])
        short_master_ids = pd.Series(['A0', 'A1'])
        master_ids = pd.Series(['A0', 'A1', 'A2'])
        duplicates = pd.Series(['foo', 'bar', 'bop'])
        short_duplicate_ids = pd.Series(['B0', 'B1'])
        duplicate_ids = pd.Series(['B0', 'B1', 'B2'])

        invalid_calls = [
            # The id data does not correspond with its string data.
            ((master,), {'master_id': short_master_ids}),
            ((master,), {'duplicates': duplicates,
                         'duplicates_id': short_duplicate_ids,
                         'master_id': master_ids}),
            # The input data is fine but the option combination is invalid.
            ((master, duplicates), {'master_id': master_ids}),
            ((master, duplicates), {'duplicates_id': duplicate_ids}),
            ((master,), {'duplicates_id': duplicate_ids}),
            ((master,), {'master_id': master_ids,
                         'duplicates_id': duplicate_ids}),
            ((master,), {'master_id': master_ids,
                         'ignore_index': True,
                         'replace_na': True}),
        ]
        for positional, keywords in invalid_calls:
            with self.assertRaises(Exception):
                StringGrouper(*positional, **keywords)

        # Give duplicates a two-level index so its number of index levels
        # differs from master's, then request replace_na=True.
        duplicates.index = pd.MultiIndex.from_tuples(
            list(zip('ABC', [0, 1, 2])))
        with self.assertRaises(Exception):
            StringGrouper(master, duplicates=duplicates, replace_na=True)
    def test_prior_matches_added(self):
        """Adding a new match should also carry over matches that were added earlier."""
        names = [
            'microsoftoffice 365 home',
            'microsoftoffice 365 pers',
            'microsoft office',
        ]
        frame = pd.DataFrame(names, columns=['name'])

        grouper = StringGrouper(frame['name'], ignore_index=True).fit()
        grouper = grouper.add_match('microsoft office',
                                    'microsoftoffice 365 home')
        grouper = grouper.add_match('microsoftoffice 365 pers',
                                    'microsoft office')

        frame['deduped'] = grouper.get_groups()
        # After both additions every string should share one representative.
        distinct_groups = frame.deduped.unique()
        self.assertEqual(1, len(distinct_groups))
Пример #15
0
 def test_get_matches_two_dataframes(self):
     """get_matches() on master and duplicates should report index and string for both sides.

     Only 'foo' and 'bar' (positions 0 and 1) occur in both series.
     """
     master = pd.Series(['foo', 'bar', 'baz'])
     duplicates = pd.Series(['foo', 'bar', 'bop'])
     grouper = StringGrouper(master, duplicates).fit()

     # Column order is part of the expectation.
     expected = pd.DataFrame({
         'left_index': [0, 1],
         'left_side': ['foo', 'bar'],
         'similarity': [1.0, 1.0],
         'right_side': ['foo', 'bar'],
         'right_index': [0, 1],
     })
     pd.testing.assert_frame_equal(expected, grouper.get_matches())
 def test_case_insensitive_build_matches_list(self):
     """fit() should match strings that differ only in case.

     'foo'/'FOO' and 'BAR'/'bar' (positions 0 and 1) are expected to appear
     in _matches_list with similarity 1; the expected similarity column is
     cast to the grouper's configured tf-idf dtype before comparing.
     """
     test_series_1 = pd.Series(['foo', 'BAR', 'baz'])
     test_series_2 = pd.Series(['FOO', 'bar', 'bop'])
     sg = StringGrouper(test_series_1, test_series_2)
     sg = sg.fit()
     expected_df = pd.DataFrame({
         'master_side': [0, 1],
         'dupe_side': [0, 1],
         'similarity': [1.0, 1.0],
     })
     # Replace the whole column rather than writing through `.loc[:, ...]`:
     # pandas >= 2.0 performs `.loc[:, col] = values` in place, which keeps
     # the existing float64 dtype and silently discards the cast.
     expected_df['similarity'] = expected_df['similarity'].astype(
         sg._config.tfidf_matrix_dtype)
     pd.testing.assert_frame_equal(expected_df, sg._matches_list)
 def test_get_matches_single(self):
     """get_matches() on a single series should list every matching pair.

     'foo' occurs at positions 0 and 3, so each occurrence is expected to
     match itself and the other one. The expected similarity column is cast
     to the grouper's configured tf-idf dtype before comparing.
     """
     test_series_1 = pd.Series(['foo', 'bar', 'baz', 'foo'])
     sg = StringGrouper(test_series_1)
     sg = sg.fit()
     # Column order is part of the expectation.
     expected_df = pd.DataFrame({
         'left_index': [0, 0, 1, 2, 3, 3],
         'left_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
         'similarity': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
         'right_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
         'right_index': [0, 3, 1, 2, 0, 3],
     })
     # Replace the whole column rather than writing through `.loc[:, ...]`:
     # pandas >= 2.0 performs `.loc[:, col] = values` in place, which keeps
     # the existing float64 dtype and silently discards the cast.
     expected_df['similarity'] = expected_df['similarity'].astype(
         sg._config.tfidf_matrix_dtype)
     pd.testing.assert_frame_equal(expected_df, sg.get_matches())
Пример #18
0
 def test_n_grams(self):
     """n_grams() should return every consecutive three-character slice of a string."""
     # The series is only needed to construct the grouper; n_grams() is
     # called on an explicit string below. A single pd.Series() call is
     # enough -- wrapping a Series in another Series adds nothing.
     test_series = pd.Series(['aa'])
     sg = StringGrouper(test_series)
     expected_result = ['McD', 'cDo', 'Don', 'ona', 'nal', 'ald', 'lds']
     self.assertListEqual(expected_result, sg.n_grams('McDonalds'))