def test_sorting(self):
    """
    Test that the resolver returns the resolved tokens sorted in
    descending order of score: the most frequent candidate first.
    """

    # Build a small corpus where 'tottenham' appears in every post,
    # and 'manchester'/'united' appear in two of the three.
    tok = Tokenizer(min_length=3, stem=False, case_fold=True)
    tweets = [
        "Manchester United falter against Tottenham Hotspur",
        "Manchester United unable to avoid defeat to Tottenham",
        "Tottenham lose again",
    ]
    documents = []
    for tweet in tweets:
        documents.append(Document(tweet, tok.tokenize(tweet)))

    # Score the extracted tokens and keep everything (threshold of 0).
    extracted = TokenExtractor().extract(documents)
    scored = TFScorer().score(extracted)
    scored = ThresholdFilter(0).filter(scored)
    self.assertTrue(scored)

    # The base resolver accepts every candidate; the most common ones
    # must be ranked towards the beginning of the resolved list.
    resolved, unresolved = Resolver().resolve(scored)
    self.assertEqual(set(scored.keys()), set(resolved))
    self.assertEqual([], unresolved)
    self.assertEqual('tottenham', resolved[0])
    self.assertEqual({'manchester', 'united'}, set(resolved[1:3]))
def test_high_threshold(self):
    """
    Test that when the resolver's threshold is set to its maximum (1),
    no candidate passes and all of them end up unresolved.
    """

    # Build a two-post corpus mentioning the same entity ('Ronaldo').
    tok = Tokenizer(min_length=2, stem=True, stopwords=list(stopwords.words("english")))
    tweets = [
        "Ronaldo, speaking after Juventus' victory, says league is still wide open, but his team is favorite",
        "Ronaldo's goal voted goal of the year by football fans appreciative of the striker",
    ]
    documents = [Document(tweet, tok.tokenize(tweet)) for tweet in tweets]

    # Extract named entities and score them by term frequency.
    scored = TFScorer().score(EntityExtractor().extract(documents, binary=True))

    # With a threshold of 1, the Wikipedia search resolver should
    # reject every candidate: nothing resolved, everything unresolved.
    search_resolver = WikipediaSearchResolver(TF(), tok, 1, documents)
    resolved, unresolved = search_resolver.resolve(scored)
    self.assertFalse(len(resolved))
    self.assertEqual(set(scored.keys()), set(unresolved))
def test_resolve_all(self):
    """
    Test that the base resolver accepts every scored candidate:
    all of them are returned as resolved and none as unresolved.
    """

    # Create a small corpus of three tokenized posts.
    tok = Tokenizer(min_length=3, stem=False, case_fold=True)
    tweets = [
        "Manchester United falter against Tottenham Hotspur",
        "Manchester United unable to avoid defeat to Tottenham",
        "Tottenham lose again",
    ]
    documents = [Document(tweet, tok.tokenize(tweet)) for tweet in tweets]

    # Extract and score the tokens; a threshold of 0 keeps everything.
    scored = ThresholdFilter(0).filter(
        TFScorer().score(TokenExtractor().extract(documents)))
    self.assertTrue(scored)

    # Every candidate must be resolved; nothing should be left over.
    resolved, unresolved = Resolver().resolve(scored)
    self.assertEqual(set(scored.keys()), set(resolved))
    self.assertEqual([], unresolved)