Example #1
    def test_threshold_filter(self):
        """
        Test the basic functionality of the threshold filter.
        """

        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = EntityExtractor()
        scorer = TFScorer()
        filter = ThresholdFilter(0.75)

        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates)

        self.assertEqual(1, scores.get('erdogan', 0))
        self.assertEqual(0.5, scores.get('damascus', 0))

        scores = filter.filter(scores)
        self.assertTrue('erdogan' in scores)
        self.assertFalse('damascus' in scores)
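From these assertions, the threshold filter's contract is simple: drop any candidate scoring below the threshold. A minimal sketch of that behaviour (a hypothetical illustration, not the project's actual ThresholdFilter, which may differ in details such as boundary handling):

class SimpleThresholdFilter:
    """
    Hypothetical filter: retain candidates whose score meets the threshold.
    """

    def __init__(self, threshold):
        self.threshold = threshold

    def filter(self, scores):
        # Keep candidates scoring at or above the threshold; a threshold of
        # zero therefore retains everything (see the zero-threshold test below).
        return {candidate: score for candidate, score in scores.items()
                if score >= self.threshold}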
Example #2
    def test_sorting(self):
        """
        Test that the resolver sorts the tokens in descending order of score.
        """
        """
        Create the test data
        """
        tokenizer = Tokenizer(min_length=3, stem=False, case_fold=True)
        posts = [
            "Manchester United falter against Tottenham Hotspur",
            "Manchester United unable to avoid defeat to Tottenham",
            "Tottenham lose again",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]
        """
        Ensure that the more common candidates are ranked towards the beginning.
        """
        candidates = TokenExtractor().extract(corpus)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)
        self.assertTrue(scores)
        resolved, unresolved = Resolver().resolve(scores)
        self.assertEqual(set(scores.keys()), set(resolved))
        self.assertEqual([], unresolved)
        self.assertEqual('tottenham', resolved[0])
        self.assertEqual(set(['manchester', 'united']), set(resolved[1:3]))
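The base Resolver exercised here appears to accept every candidate and only order them by score. A minimal sketch consistent with these assertions (hypothetical, not the project's implementation):

class SimpleResolver:
    """
    Hypothetical base resolver: accept every candidate, sorted by score.
    """

    def resolve(self, scores):
        # Sort candidates in descending order of score; nothing is left unresolved.
        resolved = sorted(scores, key=scores.get, reverse=True)
        return resolved, []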
Example #3
    def test_zero_threshold(self):
        """
        Test that when a threshold of zero is given, all candidate participants are retained.
        """

        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = EntityExtractor()
        scorer = TFScorer()
        filter = ThresholdFilter(0)

        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates)

        self.assertEqual(1, scores.get('erdogan', 0))
        self.assertEqual(0.5, scores.get('damascus', 0))

        scores = filter.filter(scores)
        self.assertTrue('erdogan' in scores)
        self.assertTrue('damascus' in scores)
Example #4
    def test_sorting(self):
        """
        Test that the resolver sorts the tokens in descending order of score.
        """
        """
        Create the test data
        """
        tokenizer = Tokenizer(min_length=3, stem=False, case_fold=True)
        posts = [
            "Manchester United falter against Tottenham Hotspur",
            "Manchester United unable to avoid defeat to Tottenham",
            "Tottenham lose again",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = TokenExtractor().extract(corpus)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)
        resolved, unresolved = TokenResolver(tokenizer, corpus).resolve(scores)
        self.assertEqual('tottenham', resolved[0])
        self.assertEqual(set(['manchester', 'united']), set(resolved[1:3]))
        self.assertEqual(
            set([
                'falter', 'against', 'hotspur', 'unable', 'avoid', 'defeat',
                'lose', 'again'
            ]), set(resolved[3:]))
Example #5
    def test_repeated_tokens(self):
        """
        Test that when tokens are repeated, the frequency that is returned is the term frequency.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "After Erdogan's statement, Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = TokenExtractor(tokenizer=tokenizer)
        scorer = TFScorer()
        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates, normalize_scores=False)
        self.assertEqual(2, scores.get('erdogan'))
Example #6
    def test_normalization(self):
        """
        Test that when normalization is disabled, the returned scores are integers.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "After Erdogan's statement, Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = TokenExtractor()
        scorer = TFScorer()
        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates, normalize_scores=False)
        self.assertEqual(2, scores.get('erdogan'))
Example #7
    def test_score_across_multiple_documents(self):
        """
        Test that the term frequency score accumulates across all the documents in the corpus.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "After Erdogan's statement, Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = TokenExtractor(tokenizer=tokenizer)
        scorer = TFScorer()
        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates, normalize_scores=False)
        self.assertEqual(3, scores.get('erdogan'))
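One reading of the scorer tests so far: the raw score is a candidate's term frequency summed over all documents, and normalization rescales so that the top candidate scores exactly 1 (erdogan's 2 becomes 1 and damascus' 1 becomes 0.5 in the two-document examples). A rough sketch under that assumption (not necessarily how the project's TFScorer normalizes):

from collections import Counter

class SimpleTFScorer:
    """
    Hypothetical scorer: a candidate's score is its term frequency.
    """

    def score(self, candidates, normalize_scores=True):
        # 'candidates' is assumed to be one list of candidate tokens per
        # document, which is what the extractors above appear to return.
        counts = Counter()
        for document_candidates in candidates:
            counts.update(document_candidates)

        if normalize_scores and counts:
            # Rescale so the highest-scoring candidate gets exactly 1,
            # matching the max-score test below.
            highest = max(counts.values())
            return {candidate: count / highest for candidate, count in counts.items()}

        return dict(counts)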
Example #8
    def test_max_score(self):
        """
        Test that the maximum score is 1 when normalization is enabled.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = TokenExtractor()
        scorer = TFScorer()
        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates)
        self.assertTrue(all(score <= 1 for score in scores.values()))
Example #9
    def test_score_of_unknown_token(self):
        """
        Test that the score of an unknown token is 0.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = TokenExtractor()
        scorer = TFScorer()
        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates)
        self.assertFalse(scores.get('unknown'))
Example #10
    def test_high_threshold(self):
        """
        Test that when the threshold is high, it excludes all candidates.
        """
        """
        Create the test data
        """
        tokenizer = Tokenizer(min_length=2,
                              stem=True,
                              stopwords=list(stopwords.words("english")))
        posts = [
            "Ronaldo, speaking after Juventus' victory, says league is still wide open, but his team is favorite",
            "Ronaldo's goal voted goal of the year by football fans appreciative of the striker",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = EntityExtractor().extract(corpus, binary=True)
        scores = TFScorer().score(candidates)

        resolver = WikipediaSearchResolver(TF(), tokenizer, 1, corpus)
        resolved, unresolved = resolver.resolve(scores)
        self.assertFalse(len(resolved))
        self.assertEqual(set(scores.keys()), set(unresolved))
Example #11
    def test_resolve_all(self):
        """
        Test that when resolving candidates, all of them are returned.
        """
        """
        Create the test data
        """
        tokenizer = Tokenizer(min_length=3, stem=False, case_fold=True)
        posts = [
            "Manchester United falter against Tottenham Hotspur",
            "Manchester United unable to avoid defeat to Tottenham",
            "Tottenham lose again",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]
        """
        Ensure that all candidates are resolved.
        """
        candidates = TokenExtractor().extract(corpus)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)
        self.assertTrue(scores)
        resolved, unresolved = Resolver().resolve(scores)
        self.assertEqual(set(scores.keys()), set(resolved))
        self.assertEqual([], unresolved)
Example #12
    def test_empty_corpus(self):
        """
        Test that when an empty corpus is given, all candidates are unresolved.
        """
        """
        Create the test data
        """
        tokenizer = Tokenizer(min_length=1, stem=False)
        posts = [
            "Manchester United falter against Tottenham Hotspur",
            "Manchester United unable to avoid defeat to Tottenham",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = TokenExtractor().extract(corpus)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)
        resolved, unresolved = TokenResolver(tokenizer, []).resolve(scores)
        self.assertEqual(len(scores), len(unresolved))
Example #13
    def test_unknown_token(self):
        """
        Test that when an unknown candidate is given, it is unresolved.
        """
        """
        Create the test data
        """
        tokenizer = Tokenizer(min_length=1, stem=False)
        posts = [
            "Manchester United falter against Tottenham Hotspur",
            "Manchester United unable to avoid defeat to Tottenham",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = TokenExtractor().extract(corpus)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)
        resolved, unresolved = TokenResolver(tokenizer,
                                             corpus).resolve({'unknown': 1})
        self.assertTrue('unknown' in unresolved)
Example #14
    def test_low_threshold(self):
        """
        Test that when the threshold is not zero, it excludes some ambiguous candidates.
        """

        """
        Create the test data
        """
        tokenizer = Tokenizer(min_length=2, stem=True, stopwords=list(stopwords.words("english")))
        posts = [
            "Memphis mum about his future at Lyon after the Dutch footballer wins it for the Ligue 1 team",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = EntityExtractor().extract(corpus, binary=True)
        scores = TFScorer().score(candidates)

        resolver = WikipediaNameResolver(TF(), tokenizer, 0.4, corpus)
        resolved, unresolved = resolver.resolve(scores)
        self.assertTrue('Memphis' in unresolved)
Example #15
    def test_all_resolved_or_unresolved(self):
        """
        Test that each named entity ends up either resolved or unresolved, so that no candidate is lost.
        """

        """
        Create the test data
        """
        tokenizer = Tokenizer(min_length=1, stem=False)
        posts = [
            "Manchester United falter against Burnley",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = EntityExtractor().extract(corpus, binary=True)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)

        resolver = WikipediaNameResolver(TF(), tokenizer, 0, corpus)
        resolved, unresolved = resolver.resolve(scores)
        self.assertEqual(len(scores), len(resolved + unresolved))
Example #16
    def test_sorting(self):
        """
        Test that the resolver sorts the named entities in descending order of score.
        """
        """
        Create the test data
        """
        tokenizer = Tokenizer(min_length=3, stem=True, case_fold=True)
        posts = [
            "In the most heated football match of the season, Liverpool falter against Manchester City",
            "Liverpool unable to avoid defeat to Watford, Manchester City close in on football title"
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = EntityExtractor().extract(corpus, binary=True)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)
        resolved, unresolved = WikipediaSearchResolver(TF(), tokenizer, 0,
                                                       corpus).resolve(scores)
        self.assertEqual('Liverpool F.C.', resolved[0])
        self.assertEqual('Manchester City F.C.', resolved[1])
        self.assertEqual('Watford F.C.', resolved[2])
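The Wikipedia-based resolvers map surface names such as 'Liverpool' to canonical page titles such as 'Liverpool F.C.'. A lookup against the public MediaWiki search API could plausibly underlie that mapping; the helper below is a hypothetical sketch (the actual resolvers also score page content against the corpus and apply the threshold):

import requests

API = 'https://en.wikipedia.org/w/api.php'

def wikipedia_title(candidate):
    """
    Return the title of the top Wikipedia search result for a candidate,
    or None if the search returns nothing (hypothetical helper).
    """
    response = requests.get(API, params={
        'action': 'query', 'list': 'search',
        'srsearch': candidate, 'srlimit': 1, 'format': 'json',
    })
    results = response.json()['query']['search']
    return results[0]['title'] if results else None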
Example #17
    def test_all_resolved_or_unresolved(self):
        """
        Test that each named entity ends up either resolved or unresolved, so that no candidate is lost.
        """
        """
        Create the test data
        """
        tokenizer = Tokenizer(min_length=2,
                              stem=True,
                              stopwords=list(stopwords.words("english")))
        posts = [
            "Ronaldo, speaking after Juventus' victory, says league is still wide open, but his team is favorite",
            "Ronaldo's goal voted goal of the year by football fans appreciative of the striker",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = EntityExtractor().extract(corpus, binary=True)
        scores = TFScorer().score(candidates)

        resolver = WikipediaSearchResolver(TF(), tokenizer, 0, corpus)
        resolved, unresolved = resolver.resolve(scores)
        self.assertEqual(len(scores), len(resolved + unresolved))
Example #18
    def test_token_resolver(self):
        """
        Test the token resolver.
        """
        """
        Create the test data
        """
        tokenizer = Tokenizer(min_length=1, stem=False)
        posts = [
            "Manchester United falter against Tottenham Hotspur",
            "Manchester United unable to avoid defeat to Tottenham",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = TokenExtractor().extract(corpus)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)
        resolved, unresolved = TokenResolver(tokenizer, corpus).resolve(scores)

        self.assertTrue('manchester' in resolved)
        self.assertTrue('united' in resolved)
        self.assertTrue('tottenham' in resolved)
        self.assertTrue('hotspur' in resolved)
Example #19
    def test_different_tokenizer(self):
        """
        Test that when a tokenizer different from the one used in extraction is given, the resolver uses it.
        """
        """
        Create the test data
        """
        tokenizer = Tokenizer(min_length=1, stem=False)
        posts = [
            "Manchester United falter against Tottenham Hotspur",
            "Manchester United unable to avoid defeat to Tottenham",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = TokenExtractor().extract(corpus)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)
        resolved, unresolved = TokenResolver(tokenizer, corpus).resolve(scores)
        self.assertTrue('to' in resolved)

        resolved, unresolved = TokenResolver(
            Tokenizer(min_length=3, stem=False), corpus).resolve(scores)
        self.assertTrue('to' in unresolved)
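Taken together, the TokenResolver tests suggest a resolver that re-tokenizes the corpus with its own tokenizer and accepts a candidate only if it appears there. A minimal sketch along those lines (hypothetical; it assumes documents expose a text attribute, and it ignores the surface-form selection exercised in the case-folding test below):

class SimpleTokenResolver:
    """
    Hypothetical resolver: a candidate resolves if its token appears in the corpus.
    """

    def __init__(self, tokenizer, corpus):
        self.tokenizer = tokenizer
        self.corpus = corpus

    def resolve(self, scores):
        # Re-tokenize the corpus with this resolver's own tokenizer; a stricter
        # tokenizer (e.g. min_length=3) can therefore leave tokens like 'to'
        # unresolved even though they were extracted earlier.
        vocabulary = set()
        for document in self.corpus:
            vocabulary.update(self.tokenizer.tokenize(document.text))

        resolved = [candidate for candidate in sorted(scores, key=scores.get, reverse=True)
                    if candidate in vocabulary]
        unresolved = [candidate for candidate in scores if candidate not in vocabulary]
        return resolved, unresolved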
Example #20
    def test_wikipedia_name_resolver(self):
        """
        Test the Wikipedia name resolver.
        """

        """
        Create the test data
        """
        tokenizer = Tokenizer(min_length=1, stem=False)
        posts = [
            "Manchester United falter against Burnley",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = EntityExtractor().extract(corpus, binary=True)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)

        resolver = WikipediaNameResolver(TF(), tokenizer, 0, corpus)
        resolved, unresolved = resolver.resolve(scores)

        self.assertTrue('manchester united' in resolved)
        self.assertTrue('burnley' in resolved)
Example #21
    def test_sorting_ambiguous(self):
        """
        Test that the resolver sorts the named entities in descending order of score, but ambiguous candidates are at the end.
        """

        """
        Create the test data
        """
        tokenizer = Tokenizer(min_length=3, stem=False, case_fold=True)
        posts = [
            "Manchester United falter against Manchester City",
            "Manchester United unable to avoid defeat to Tottenham",
            "Tottenham lose again",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = EntityExtractor().extract(corpus, binary=True)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)
        resolved, unresolved = WikipediaNameResolver(TF(), tokenizer, 0, corpus).resolve(scores)
        self.assertEqual('manchester united', resolved[0])
        self.assertEqual('manchester city', resolved[1])
        self.assertEqual('tottenham', resolved[2])
Example #22
    def test_case_folding(self):
        """
        Test that when case-folding is set, the case does not matter.
        In this test, the stem 'report' can be formed by:

            #. Reporters - appears twice
            #. reporters - appears twice
            #. reports - appears three times

        Without case-folding, 'reports' would be chosen to represent the stem 'report':
        'reports' appears three times, while 'Reporters' and 'reporters' each appear twice.
        With case-folding, 'reports' still appears three times, but 'reporters' now appears four times.
        """
        """
        Create the test data
        """
        tokenizer = Tokenizer(min_length=1, stem=True)
        posts = [
            "Reporters Without Borders issue statement after reporters are harrassed",
            "Reporters left waiting all night long: reports",
            "Two reporters injured before gala: reports",
            "Queen reacts: reports of her falling ill exaggerated"
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = TokenExtractor().extract(corpus)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)
        resolved, unresolved = TokenResolver(tokenizer,
                                             corpus,
                                             case_fold=False).resolve(scores)
        self.assertTrue('reports' in resolved)

        resolved, unresolved = TokenResolver(tokenizer, corpus,
                                             case_fold=True).resolve(scores)
        self.assertTrue('reporters' in resolved)
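The counting argument in the docstring can be checked directly with collections.Counter (a standalone illustration, separate from the test suite):

from collections import Counter

tokens = ['Reporters', 'reporters', 'Reporters', 'reports',
          'reporters', 'reports', 'reports']

# Without case-folding, 'reports' (3) beats 'Reporters' (2) and 'reporters' (2).
print(Counter(tokens).most_common(1))                        # [('reports', 3)]

# With case-folding, 'reporters' pools to four and overtakes 'reports'.
print(Counter(t.lower() for t in tokens).most_common(1))     # [('reporters', 4)]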