def test_add_match_single_occurence(self):
    """Should add the match exactly once if there are no exact duplicates.

    The added pair is checked both on a grouper built from a master series
    only and on a grouper built from a master and a duplicates series.
    """
    master = pd.Series(['foooo', 'no match', 'baz', 'foooo'])
    duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])

    # Case 1: master series only.
    master_only = StringGrouper(master).fit()
    master_only.add_match('no match', 'baz')
    found = master_only.get_matches()
    is_added_pair = (found.left_side == 'no match') & (found.right_side == 'baz')
    self.assertEqual(1, found[is_added_pair].shape[0])

    # Case 2: master and duplicates series.
    two_sided = StringGrouper(master, duplicates).fit()
    two_sided.add_match('no match', 'bar')
    found = two_sided.get_matches()
    is_added_pair = (found.left_side == 'no match') & (found.right_side == 'bar')
    self.assertEqual(1, found[is_added_pair].shape[0])
def test_get_matches_2_series_2_id_series(self):
    """get_matches() on master/duplicates series that both carry id series.

    The expected frame holds the two strings present in both series
    ('foo' and 'bar') with their index, id and a similarity of 1.0 per side.
    """
    test_series_1 = pd.Series(['foo', 'bar', 'baz'])
    test_series_id_1 = pd.Series(['A0', 'A1', 'A2'])
    test_series_2 = pd.Series(['foo', 'bar', 'bop'])
    test_series_id_2 = pd.Series(['B0', 'B1', 'B2'])
    sg = StringGrouper(
        test_series_1,
        test_series_2,
        duplicates_id=test_series_id_2,
        master_id=test_series_id_1,
    ).fit()

    # Column order matters: assert_frame_equal compares columns positionally.
    expected_df = pd.DataFrame({
        'left_index': [0, 1],
        'left_side': ['foo', 'bar'],
        'left_id': ['A0', 'A1'],
        'similarity': [1.0, 1.0],
        'right_id': ['B0', 'B1'],
        'right_side': ['foo', 'bar'],
        'right_index': [0, 1],
    })
    # Replace the column by label instead of `.loc[:, 'similarity'] = ...`:
    # since pandas 2.0 a full-column `.loc` assignment writes the values into
    # the existing float64 column, so the dtype cast would silently be lost.
    expected_df['similarity'] = expected_df['similarity'].astype(
        sg._config.tfidf_matrix_dtype
    )
    pd.testing.assert_frame_equal(expected_df, sg.get_matches())
def test_get_matches_two_dataframes(self):
    """get_matches() on two series should report 'foo' and 'bar' with similarity 1.0.

    NOTE(review): a second method with this exact name appears later in this
    source; if both live in the same class, the later definition replaces
    this one and this test never runs -- confirm which one should stay.
    """
    master = pd.Series(['foo', 'bar', 'baz'])
    duplicates = pd.Series(['foo', 'bar', 'bop'])
    grouper = StringGrouper(master, duplicates).fit()

    shared_strings = ['foo', 'bar']
    expected = pd.DataFrame({
        'left_side': shared_strings,
        'right_side': shared_strings,
        'similarity': [1.0, 1.0],
    })
    pd.testing.assert_frame_equal(expected, grouper.get_matches())
def test_remove_match(self):
    """Should remove a match, so get_matches() no longer reports the pair."""
    master = pd.Series(['foooo', 'no match', 'baz', 'foooob'])
    duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])

    # Case 1: master series only.
    master_only = StringGrouper(master).fit()
    master_only.remove_match('foooo', 'foooob')
    remaining = master_only.get_matches()
    forward = remaining[
        (remaining.left_side == 'foooo') & (remaining.right_side == 'foooob')
    ]
    # In the case of only a master series, the matches are recursive,
    # so both variants are to be removed.
    backward = remaining[
        (remaining.left_side == 'foooob') & (remaining.right_side == 'foooo')
    ]
    self.assertEqual(0, forward.shape[0])
    self.assertEqual(0, backward.shape[0])

    # Case 2: master and duplicates series; only the removed direction is checked.
    two_sided = StringGrouper(master, duplicates).fit()
    two_sided.remove_match('foooo', 'foooob')
    remaining = two_sided.get_matches()
    is_removed_pair = (remaining.left_side == 'foooo') & (remaining.right_side == 'foooob')
    self.assertEqual(0, remaining[is_removed_pair].shape[0])
def test_add_match_single_group_matches_symmetric(self):
    """New matches that are added to a SG with only a master series should be symmetric.

    After add_match('no match', 'baz') the pair must appear exactly once in
    each direction in get_matches().
    """
    master = pd.Series(['foooo', 'no match', 'baz', 'foooo'])
    grouper = StringGrouper(master).fit()
    grouper.add_match('no match', 'baz')
    found = grouper.get_matches()

    # Direction in which the match was added.
    forward = (found.left_side == 'no match') & (found.right_side == 'baz')
    self.assertEqual(1, found[forward].shape[0])

    # Mirrored direction.
    backward = (found.left_side == 'baz') & (found.right_side == 'no match')
    self.assertEqual(1, found[backward].shape[0])
def test_get_matches_1_series_1_id_series(self):
    """get_matches() on a single series with a master id series.

    The duplicated string 'foo' (ids A0 and A3) is expected to match itself
    and its twin; 'bar' and 'baz' are expected to match only themselves.
    """
    strings = pd.Series(['foo', 'bar', 'baz', 'foo'])
    string_ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
    grouper = StringGrouper(strings, master_id=string_ids).fit()

    matched_strings = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
    # Column order matters: assert_frame_equal compares columns positionally.
    expected = pd.DataFrame({
        'left_side_id': ['A0', 'A0', 'A1', 'A2', 'A3', 'A3'],
        'left_side': matched_strings,
        'right_side_id': ['A3', 'A0', 'A1', 'A2', 'A3', 'A0'],
        'right_side': matched_strings,
        'similarity': [1.0] * 6,
    })
    pd.testing.assert_frame_equal(expected, grouper.get_matches())
def test_get_matches_single(self):
    """get_matches() on a single series, including index columns.

    The duplicated string 'foo' (positions 0 and 3) is expected to match
    itself and its twin; 'bar' and 'baz' are expected to match only themselves.
    """
    strings = pd.Series(['foo', 'bar', 'baz', 'foo'])
    grouper = StringGrouper(strings).fit()

    matched_strings = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
    # NOTE(review): 'similarity' is left at pandas' default float dtype here,
    # whereas some sibling tests cast it to sg._config.tfidf_matrix_dtype --
    # confirm this is intended.
    expected = pd.DataFrame({
        'left_index': [0, 0, 1, 2, 3, 3],
        'left_side': matched_strings,
        'similarity': [1.0] * 6,
        'right_side': matched_strings,
        'right_index': [0, 3, 1, 2, 0, 3],
    })
    pd.testing.assert_frame_equal(expected, grouper.get_matches())
def test_get_matches_two_dataframes(self):
    """get_matches() on a master and a duplicates series, including index columns.

    The expected frame holds the two strings present in both series
    ('foo' and 'bar') at positions 0 and 1 on each side, with similarity 1.0.
    """
    test_series_1 = pd.Series(['foo', 'bar', 'baz'])
    test_series_2 = pd.Series(['foo', 'bar', 'bop'])
    sg = StringGrouper(test_series_1, test_series_2).fit()

    # Column order matters: assert_frame_equal compares columns positionally.
    expected_df = pd.DataFrame({
        'left_index': [0, 1],
        'left_side': ['foo', 'bar'],
        'similarity': [1.0, 1.0],
        'right_side': ['foo', 'bar'],
        'right_index': [0, 1],
    })
    # Replace the column by label instead of `.loc[:, 'similarity'] = ...`:
    # since pandas 2.0 a full-column `.loc` assignment writes the values into
    # the existing float64 column, so the dtype cast would silently be lost.
    expected_df['similarity'] = expected_df['similarity'].astype(
        sg._config.tfidf_matrix_dtype
    )
    pd.testing.assert_frame_equal(expected_df, sg.get_matches())