def test2_5_hits(self):
    """Linker output with ``max_hits=5`` must equal the stored CSV fixture.

    The reference frame is read from
    ``test/data/linking/5hits_expected.csv``; the path is relative, so it
    is resolved against the current working directory.
    """
    sentences = [
        'berlin and paris are both capitals',
        'Angela M. plans to buy a flat in Hamburg',
        'The BASF is located in Ludwigshafen, Rhineland-Palatine',
        'Crist. Ronaldo plays in Madrid for Real Madrid',
    ]
    source_df = pd.DataFrame({'test_col': sentences})
    reference_df = pd.read_csv("test/data/linking/5hits_expected.csv")

    linked_df = dbpedia_spotlight_linker(source_df, 'test_col', max_hits=5)

    # check_like=True: column/index order is ignored in the comparison.
    pd.testing.assert_frame_equal(linked_df, reference_df, check_like=True)
def test8_all_nan(self):
    """A column consisting only of NaN must yield an all-NaN ``new_link``."""
    missing = [np.nan] * 4
    source_df = pd.DataFrame({'test_col': missing})
    reference_df = pd.DataFrame({'test_col': missing, 'new_link': missing})

    linked_df = dbpedia_spotlight_linker(source_df, 'test_col')

    # check_like=True: column/index order is ignored in the comparison.
    pd.testing.assert_frame_equal(linked_df, reference_df, check_like=True)
def transform(self, X, y=None):
    """Run ``dbpedia_spotlight_linker`` on ``X`` with this object's settings.

    Args:
        X: Data handed to the linker as its first positional argument.
        y: Not used by this method (presumably accepted for
            scikit-learn-style API compatibility -- confirm against the
            enclosing class).

    Returns:
        The value returned by ``dbpedia_spotlight_linker``.
    """
    # Collect the configured options in the same order the linker call
    # originally listed them.
    linker_options = {
        'column': self.column,
        'new_attribute_name': self.new_attribute_name,
        'progress': self.progress,
        'max_hits': self.max_hits,
        'language': self.language,
        'selection': self.selection,
        'confidence': self.confidence,
        'support': self.support,
        'min_similarity_score': self.min_similarity_score,
        'caching': self.caching,
    }
    return dbpedia_spotlight_linker(X, **linker_options)
def test3_nan(self):
    """A NaN cell among text rows must stay NaN in ``new_link``.

    The three text rows are expected to receive a DBpedia resource URI
    each, while the missing value in the third row stays unlinked.
    """
    texts = ['Dakar', 'Horse Racing in Uruguay', np.nan, 'Ukulele']
    links = [
        'http://dbpedia.org/resource/Dakar',
        'http://dbpedia.org/resource/Horse_racing',
        np.nan,
        'http://dbpedia.org/resource/Ukulele',
    ]
    source_df = pd.DataFrame({'test_col': texts})
    reference_df = pd.DataFrame({'test_col': texts, 'new_link': links})

    linked_df = dbpedia_spotlight_linker(source_df, 'test_col')

    # check_like=True: column/index order is ignored in the comparison.
    pd.testing.assert_frame_equal(linked_df, reference_df, check_like=True)
def test1_default(self):
    """Default settings: each lower-case city name gets its DBpedia URI."""
    cities = ['berlin', 'darmstadt', 'london', 'munich']
    links = [
        'http://dbpedia.org/resource/Berlin',
        'http://dbpedia.org/resource/Darmstadt',
        'http://dbpedia.org/resource/London',
        'http://dbpedia.org/resource/Munich',
    ]
    source_df = pd.DataFrame({'test_col': cities})
    reference_df = pd.DataFrame({'test_col': cities, 'new_link': links})

    linked_df = dbpedia_spotlight_linker(source_df, 'test_col')

    # check_like=True: column/index order is ignored in the comparison.
    pd.testing.assert_frame_equal(linked_df, reference_df, check_like=True)
def test7_no_match(self):
    """With ``confidence=0.99`` none of these inputs may produce a link."""
    texts = ['B. Obama', 'Barrrack Obbama', 'President B. Obama', 'B. Obama']
    links = [np.nan, np.nan, np.nan, np.nan]
    source_df = pd.DataFrame({'test_col': texts})
    reference_df = pd.DataFrame({'test_col': texts, 'new_link': links})

    linked_df = dbpedia_spotlight_linker(
        source_df, 'test_col', confidence=0.99)

    # check_like=True: column/index order is ignored in the comparison.
    pd.testing.assert_frame_equal(linked_df, reference_df, check_like=True)
def test6_high_confidence(self):
    """With ``confidence=0.99`` only the full, correct name is linked.

    The first row is expected to resolve to the Barack_Obama resource;
    the misspelled and abbreviated variants are expected to stay NaN.
    """
    texts = [
        'Barack Hussein Obama',
        'Barrrack Obbama',
        'President B. Obama',
        'B. Obama',
    ]
    links = [
        'http://dbpedia.org/resource/Barack_Obama',
        np.nan,
        np.nan,
        np.nan,
    ]
    source_df = pd.DataFrame({'test_col': texts})
    reference_df = pd.DataFrame({'test_col': texts, 'new_link': links})

    linked_df = dbpedia_spotlight_linker(
        source_df, 'test_col', confidence=0.99)

    # check_like=True: column/index order is ignored in the comparison.
    pd.testing.assert_frame_equal(linked_df, reference_df, check_like=True)
def test9_wrong_input_type(self):
    """A non-string cell (the integer 666) must yield NaN, not an error.

    The surrounding string rows are still expected to be linked.
    """
    cells = ['B. Obama', 666, 'President B. Obama', 'B. Obama']
    links = [
        'http://dbpedia.org/resource/Barack_Obama',
        np.nan,
        'http://dbpedia.org/resource/President_of_the_United_States',
        'http://dbpedia.org/resource/Barack_Obama',
    ]
    source_df = pd.DataFrame({'test_col': cells})
    reference_df = pd.DataFrame({'test_col': cells, 'new_link': links})

    linked_df = dbpedia_spotlight_linker(source_df, 'test_col')

    # check_like=True: column/index order is ignored in the comparison.
    pd.testing.assert_frame_equal(linked_df, reference_df, check_like=True)
def test4_selection_support(self):
    """Check the single link chosen per row when ``selection='support'``.

    Each sentence mentions several entities; the expected frame lists the
    one resource URI that must be returned for each row.
    """
    sentences = [
        'Berlin and Paris are both capitals but Paris is rich and Paris is in France',
        'Rather Rome or Madrid? Definitely Rome!',
        'Hamburg? Hamburg is not a city in Rhineland-Palatine',
        "If I had to choose between Spaghetti or Linguine I'd always choose Spaghetti",
    ]
    links = [
        'http://dbpedia.org/resource/France',
        'http://dbpedia.org/resource/Rome',
        'http://dbpedia.org/resource/Hamburg',
        'http://dbpedia.org/resource/Spaghetti',
    ]
    source_df = pd.DataFrame({'test_col': sentences})
    reference_df = pd.DataFrame({'test_col': sentences, 'new_link': links})

    linked_df = dbpedia_spotlight_linker(
        source_df, 'test_col', selection='support')

    # check_like=True: column/index order is ignored in the comparison.
    pd.testing.assert_frame_equal(linked_df, reference_df, check_like=True)
def test5_selection_similarityScore(self):
    """Check the link chosen per row when ``selection='similarityScore'``.

    The inputs contain deliberately misspelled entity names; the second
    row is expected to produce no link at all (NaN).
    """
    sentences = [
        'Berlin and Paaris? Berlin and Pariss!',
        'Foooootball or Rugbyy?',
        'How to write New Yorrk? or is it Neew York',
        "Hopelessly trying to spell Emannuel Makrone",
    ]
    links = [
        'http://dbpedia.org/resource/Berlin',
        np.nan,
        'http://dbpedia.org/resource/How_(TV_series)',
        'http://dbpedia.org/resource/Hopelessly',
    ]
    source_df = pd.DataFrame({'test_col': sentences})
    reference_df = pd.DataFrame({'test_col': sentences, 'new_link': links})

    linked_df = dbpedia_spotlight_linker(
        source_df, 'test_col', selection='similarityScore')

    # check_like=True: column/index order is ignored in the comparison.
    pd.testing.assert_frame_equal(linked_df, reference_df, check_like=True)