def test_get_groups_single_valued_series(self):
    """Verify one-element inputs produce one-element results.

    Exercises group_similar_strings() and match_most_similar() on a Series
    holding only "hello", both with the default index handling (a DataFrame
    is expected) and with ignore_index=True (a Series is expected).

    Regression test for a bug discovered by George Walker.
    """
    # group_similar_strings, index columns kept -> DataFrame
    expected_groups = pd.DataFrame(
        [(0, "hello")],
        columns=['group_rep_index', 'group_rep'],
    )
    actual_groups = group_similar_strings(
        pd.Series(["hello"]),
        min_similarity=0.6,
    )
    pd.testing.assert_frame_equal(expected_groups, actual_groups)

    # group_similar_strings, index ignored -> Series
    expected_groups_no_index = pd.Series(["hello"], name='group_rep')
    actual_groups_no_index = group_similar_strings(
        pd.Series(["hello"]),
        min_similarity=0.6,
        ignore_index=True,
    )
    pd.testing.assert_series_equal(
        expected_groups_no_index,
        actual_groups_no_index,
    )

    # match_most_similar, index columns kept -> DataFrame
    expected_matches = pd.DataFrame(
        [(0, "hello")],
        columns=['most_similar_index', 'most_similar_master'],
    )
    actual_matches = match_most_similar(
        pd.Series(["hello"]),
        pd.Series(["hello"]),
        min_similarity=0.6,
    )
    pd.testing.assert_frame_equal(expected_matches, actual_matches)

    # match_most_similar, index ignored -> Series
    expected_matches_no_index = pd.Series(
        ["hello"],
        name='most_similar_master',
    )
    actual_matches_no_index = match_most_similar(
        pd.Series(["hello"]),
        pd.Series(["hello"]),
        min_similarity=0.6,
        ignore_index=True,
    )
    pd.testing.assert_series_equal(
        expected_matches_no_index,
        actual_matches_no_index,
    )
def test_get_groups_single_df_group_rep_bad_option_value(self):
    """Expect an exception when group_rep is not a recognised option.

    Passes group_rep='nonsense' (i.e. neither 'centroid' nor 'first') to
    group_similar_strings() and asserts that some Exception is raised.
    """
    customers = SimpleExample().customers_df
    with self.assertRaises(Exception):
        # The return value is irrelevant; only the raise matters.
        group_similar_strings(
            customers['Customer Name'],
            group_rep='nonsense',
            min_similarity=0.6,
        )
def test_get_groups_single_df_keep_index(self):
    """Check the grouped output when index columns are kept.

    Calls group_similar_strings() on the 'Customer Name' column of the
    SimpleExample fixture with ignore_index=False and compares the result,
    as a DataFrame, against the fixture's
    expected_result_centroid_with_index_col.
    """
    fixture = SimpleExample()
    customer_names = fixture.customers_df['Customer Name']
    expected = fixture.expected_result_centroid_with_index_col
    actual = group_similar_strings(
        customer_names,
        min_similarity=0.6,
        ignore_index=False,
    )
    pd.testing.assert_frame_equal(expected, actual)
def test_get_groups_single_df_group_rep_default(self):
    """Check the grouped output when group_rep is left at its default.

    Calls group_similar_strings() on the 'Customer Name' column of the
    SimpleExample fixture with ignore_index=True and no group_rep argument,
    then compares the result, as a Series, against the fixture's
    expected_result_centroid.
    """
    fixture = SimpleExample()
    customer_names = fixture.customers_df['Customer Name']
    expected = fixture.expected_result_centroid
    actual = group_similar_strings(
        customer_names,
        min_similarity=0.6,
        ignore_index=True,
    )
    pd.testing.assert_series_equal(expected, actual)
def test_group_similar_strings(self, mock_StringGouper):
    """Check that group_similar_strings() drives StringGrouper as expected.

    The supplied mock stands in for the StringGrouper class. Its instance
    is configured so that fit() returns the instance itself and
    get_groups() returns a sentinel value. The test then asserts that each
    method was called exactly once and that the sentinel is handed back to
    the caller.
    """
    grouper = mock_StringGouper.return_value
    grouper.fit.return_value = grouper  # lets fit() be chained
    grouper.get_groups.return_value = 'whatever'

    # The inputs are never inspected by the mock, so None suffices.
    result = group_similar_strings(None, string_ids=None)

    grouper.fit.assert_called_once()
    grouper.get_groups.assert_called_once()
    self.assertEqual(result, 'whatever')