示例#1
0
    def test2_5_hits(self):
        """Check the linker output for ``max_hits=5`` against a stored CSV.

        The expected frame is loaded from
        ``test/data/linking/5hits_expected.csv``; the comparison uses
        ``check_like=True``.
        """
        sentences = [
            'berlin and paris are both capitals',
            'Angela M. plans to buy a flat in Hamburg',
            'The BASF is located in Ludwigshafen, Rhineland-Palatine',
            'Crist. Ronaldo plays in Madrid for Real Madrid',
        ]
        source_frame = pd.DataFrame({'test_col': sentences})

        reference = pd.read_csv("test/data/linking/5hits_expected.csv")
        linked = dbpedia_spotlight_linker(source_frame, 'test_col', max_hits=5)

        pd.testing.assert_frame_equal(linked, reference, check_like=True)
示例#2
0
    def test8_all_nan(self):
        """An all-NaN input column is expected to yield an all-NaN link column."""
        # Four missing values; reused for both the input and the expected link.
        missing = [np.nan] * 4

        source_frame = pd.DataFrame({'test_col': missing})
        reference = pd.DataFrame({'test_col': missing, 'new_link': missing})

        linked = dbpedia_spotlight_linker(source_frame, 'test_col')

        pd.testing.assert_frame_equal(linked, reference, check_like=True)
 def transform(self, X, y=None):
     """Run ``dbpedia_spotlight_linker`` on ``X`` and return its result.

     All linker options are taken from attributes stored on this instance.
     ``y`` is accepted but not used.
     """
     linker_options = {
         'column': self.column,
         'new_attribute_name': self.new_attribute_name,
         'progress': self.progress,
         'max_hits': self.max_hits,
         'language': self.language,
         'selection': self.selection,
         'confidence': self.confidence,
         'support': self.support,
         'min_similarity_score': self.min_similarity_score,
         'caching': self.caching,
     }
     return dbpedia_spotlight_linker(X, **linker_options)
示例#4
0
    def test3_nan(self):
        """A NaN row among text rows is expected to get a NaN link.

        The remaining rows are expected to receive one DBpedia URI each.
        """
        texts = ['Dakar', 'Horse Racing in Uruguay', np.nan, 'Ukulele']
        links = [
            'http://dbpedia.org/resource/Dakar',
            'http://dbpedia.org/resource/Horse_racing',
            np.nan,
            'http://dbpedia.org/resource/Ukulele',
        ]

        source_frame = pd.DataFrame({'test_col': texts})
        reference = pd.DataFrame({'test_col': texts, 'new_link': links})

        linked = dbpedia_spotlight_linker(source_frame, 'test_col')

        pd.testing.assert_frame_equal(linked, reference, check_like=True)
示例#5
0
    def test1_default(self):
        """With only the column given, each city name is expected to get one link."""
        cities = ['berlin', 'darmstadt', 'london', 'munich']
        links = [
            'http://dbpedia.org/resource/Berlin',
            'http://dbpedia.org/resource/Darmstadt',
            'http://dbpedia.org/resource/London',
            'http://dbpedia.org/resource/Munich',
        ]

        source_frame = pd.DataFrame({'test_col': cities})
        reference = pd.DataFrame({'test_col': cities, 'new_link': links})

        linked = dbpedia_spotlight_linker(source_frame, 'test_col')

        pd.testing.assert_frame_equal(linked, reference, check_like=True)
示例#6
0
    def test7_no_match(self):
        """With ``confidence=0.99`` no row is expected to be linked.

        Every entry of the expected ``new_link`` column is NaN.
        """
        names = ['B. Obama', 'Barrrack Obbama',
                 'President B. Obama', 'B. Obama']
        no_links = [np.nan] * 4

        source_frame = pd.DataFrame({'test_col': names})
        reference = pd.DataFrame({'test_col': names, 'new_link': no_links})

        linked = dbpedia_spotlight_linker(
            source_frame, 'test_col', confidence=0.99)

        pd.testing.assert_frame_equal(linked, reference, check_like=True)
示例#7
0
    def test6_high_confidence(self):
        """With ``confidence=0.99`` only the full name is expected to be linked.

        The abbreviated and misspelled rows are expected to get NaN.
        """
        names = ['Barack Hussein Obama', 'Barrrack Obbama',
                 'President B. Obama', 'B. Obama']
        links = ['http://dbpedia.org/resource/Barack_Obama',
                 np.nan, np.nan, np.nan]

        source_frame = pd.DataFrame({'test_col': names})
        reference = pd.DataFrame({'test_col': names, 'new_link': links})

        linked = dbpedia_spotlight_linker(
            source_frame, 'test_col', confidence=0.99)

        pd.testing.assert_frame_equal(linked, reference, check_like=True)
示例#8
0
    def test9_wrong_input_type(self):
        """A non-string cell (the integer 666) is expected to get a NaN link.

        The string rows in the same column are still expected to be linked.
        """
        values = ['B. Obama', 666, 'President B. Obama', 'B. Obama']
        links = [
            'http://dbpedia.org/resource/Barack_Obama',
            np.nan,
            'http://dbpedia.org/resource/President_of_the_United_States',
            'http://dbpedia.org/resource/Barack_Obama',
        ]

        source_frame = pd.DataFrame({'test_col': values})
        reference = pd.DataFrame({'test_col': values, 'new_link': links})

        linked = dbpedia_spotlight_linker(source_frame, 'test_col')

        pd.testing.assert_frame_equal(linked, reference, check_like=True)
示例#9
0
    def test4_selection_support(self):
        """Check the single link expected per row when ``selection='support'``."""
        texts = [
            'Berlin and Paris are both capitals but Paris is rich and Paris is in France',
            'Rather Rome or Madrid? Definitely Rome!',
            'Hamburg? Hamburg is not a city in Rhineland-Palatine',
            "If I had to choose between Spaghetti or Linguine I'd always choose Spaghetti",
        ]
        links = [
            'http://dbpedia.org/resource/France',
            'http://dbpedia.org/resource/Rome',
            'http://dbpedia.org/resource/Hamburg',
            'http://dbpedia.org/resource/Spaghetti',
        ]

        source_frame = pd.DataFrame({'test_col': texts})
        reference = pd.DataFrame({'test_col': texts, 'new_link': links})

        linked = dbpedia_spotlight_linker(
            source_frame, 'test_col', selection='support')

        pd.testing.assert_frame_equal(linked, reference, check_like=True)
示例#10
0
    def test5_selection_similarityScore(self):
        """Check the links expected when ``selection='similarityScore'``.

        The inputs are deliberately misspelled; the second row is expected
        to get no link (NaN).
        """
        texts = [
            'Berlin and Paaris? Berlin and Pariss!',
            'Foooootball or Rugbyy?',
            'How to write New Yorrk? or is it Neew York',
            "Hopelessly trying to spell Emannuel Makrone",
        ]
        links = [
            'http://dbpedia.org/resource/Berlin',
            np.nan,
            'http://dbpedia.org/resource/How_(TV_series)',
            'http://dbpedia.org/resource/Hopelessly',
        ]

        source_frame = pd.DataFrame({'test_col': texts})
        reference = pd.DataFrame({'test_col': texts, 'new_link': links})

        linked = dbpedia_spotlight_linker(
            source_frame, 'test_col', selection='similarityScore')

        pd.testing.assert_frame_equal(linked, reference, check_like=True)