示例#1
0
 def test_sorting(self):
     """
     Test that the resolver sorts the tokens in descending order of score.
     """

     # Build a small corpus of tokenized posts.
     tok = Tokenizer(min_length=3, stem=False, case_fold=True)
     posts = [
         "Manchester United falter against Tottenham Hotspur",
         "Manchester United unable to avoid defeat to Tottenham",
         "Tottenham lose again",
     ]
     corpus = []
     for post in posts:
         corpus.append(Document(post, tok.tokenize(post)))

     # The most frequent candidate should be ranked first after resolution.
     scores = TFScorer().score(TokenExtractor().extract(corpus))
     scores = ThresholdFilter(0).filter(scores)
     self.assertTrue(scores)
     resolved, unresolved = Resolver().resolve(scores)
     self.assertEqual(set(scores.keys()), set(resolved))
     self.assertEqual([], unresolved)
     self.assertEqual('tottenham', resolved[0])
     self.assertEqual({'manchester', 'united'}, set(resolved[1:3]))
示例#2
0
    def test_high_threshold(self):
        """
        Test that when the threshold is high, it excludes all candidates.
        """

        # Assemble a two-post corpus using stemming and English stopwords.
        stoplist = list(stopwords.words("english"))
        tokenizer = Tokenizer(min_length=2,
                              stem=True,
                              stopwords=stoplist)
        posts = [
            "Ronaldo, speaking after Juventus' victory, says league is still wide open, but his team is favorite",
            "Ronaldo's goal voted goal of the year by football fans appreciative of the striker",
        ]
        corpus = [ Document(text, tokenizer.tokenize(text)) for text in posts ]

        # Score the extracted entities, then resolve with a threshold of 1,
        # which no candidate can attain.
        scores = TFScorer().score(EntityExtractor().extract(corpus, binary=True))

        resolver = WikipediaSearchResolver(TF(), tokenizer, 1, corpus)
        resolved, unresolved = resolver.resolve(scores)
        self.assertEqual(0, len(resolved))
        self.assertEqual(set(scores.keys()), set(unresolved))
示例#3
0
 def test_resolve_all(self):
     """
     Test that when resolving candidates, all of them are returned.
     """

     # Tokenize a handful of posts into a corpus.
     tokenizer = Tokenizer(min_length=3, stem=False, case_fold=True)
     posts = [
         "Manchester United falter against Tottenham Hotspur",
         "Manchester United unable to avoid defeat to Tottenham",
         "Tottenham lose again",
     ]
     documents = []
     for text in posts:
         documents.append(Document(text, tokenizer.tokenize(text)))

     # Every scored candidate should appear among the resolved tokens.
     candidates = TokenExtractor().extract(documents)
     scores = ThresholdFilter(0).filter(TFScorer().score(candidates))
     self.assertTrue(scores)
     resolved, unresolved = Resolver().resolve(scores)
     self.assertEqual(set(resolved), set(scores.keys()))
     self.assertEqual(unresolved, [])