def test_empty_corpus(self):
    """
    Test that extracting candidates from an empty corpus yields nothing.
    """

    ext = TokenExtractor()
    extracted = ext.extract([])
    self.assertFalse(len(extracted))
def test_repeated_tokens_with_custom_tokenizer(self):
    """
    Test that when a custom tokenizer is given, repeated tokens appear multiple times.
    """

    # a post where 'manchester' occurs twice
    tok = Tokenizer(stopwords=stopwords.words("english"), stem=False)
    texts = [ "Manchester United back to winning ways after defeating Manchester City." ]
    documents = [ Document(text, tok.tokenize(text)) for text in texts ]

    # without a custom tokenizer, the repeated token appears only once
    tokens = TokenExtractor().extract(documents)
    self.assertEqual(1, tokens[0].count('manchester'))

    # with the custom tokenizer, both occurrences are retained
    tokens = TokenExtractor(tokenizer=tok).extract(documents)
    self.assertEqual(2, tokens[0].count('manchester'))
def test_extract_with_custom_tokenizer(self):
    """
    Test that when a custom tokenizer is given, it is used instead of the document dimensions.
    """

    # the corpus is tokenized with stemming enabled
    stemming_tok = Tokenizer(stopwords=stopwords.words("english"), stem=True)
    texts = [ "Manchester United back to winning ways" ]
    documents = [ Document(text, stemming_tok.tokenize(text)) for text in texts ]

    # by default, the stemmed dimensions are extracted
    tokens = TokenExtractor().extract(documents)
    self.assertEqual(set(["manchest", "unit", "back", "win", "way"]), set(tokens[0]))

    # a non-stemming tokenizer re-tokenizes the raw text instead
    ext = TokenExtractor(tokenizer=Tokenizer(stopwords=stopwords.words('english'), stem=False))
    tokens = ext.extract(documents)
    self.assertEqual(set(["manchester", "united", "back", "winning", "ways"]), set(tokens[0]))
def test_repeated_tokens(self):
    """
    Test that when tokens are repeated, the frequency that is returned is the term frequency.
    """

    # 'erdogan' appears twice in the single post
    tok = Tokenizer(stem=False)
    texts = [ "After Erdogan's statement, Damascus says Erdogan 'disconnected from reality' after threats" ]
    documents = [ Document(text, tok.tokenize(text)) for text in texts ]

    ext = TokenExtractor(tokenizer=tok)
    tokens = ext.extract(documents)
    # without normalization, the raw term frequency is returned
    scores = TFScorer().score(tokens, normalize_scores=False)
    self.assertEqual(2, scores.get('erdogan'))
def test_score_of_unknown_token(self):
    """
    Test that the score of an unknown token is 0.
    """

    tok = Tokenizer(stem=False)
    texts = [
        "Erdogan with threats to attack regime forces 'everywhere' in Syria",
        "Damascus says Erdogan 'disconnected from reality' after threats",
    ]
    documents = [ Document(text, tok.tokenize(text)) for text in texts ]

    tokens = TokenExtractor().extract(documents)
    scores = TFScorer().score(tokens)
    # a token that never appeared has no score
    self.assertFalse(scores.get('unknown'))
def test_max_score(self):
    """
    Test that the maximum score is 1 when normalization is enabled.
    """

    tok = Tokenizer(stem=False)
    texts = [
        "Erdogan with threats to attack regime forces 'everywhere' in Syria",
        "Damascus says Erdogan 'disconnected from reality' after threats",
    ]
    documents = [ Document(text, tok.tokenize(text)) for text in texts ]

    tokens = TokenExtractor().extract(documents)
    # scoring normalizes by default, so no score may exceed 1
    scores = TFScorer().score(tokens)
    self.assertTrue(all( score <= 1 for score in scores.values() ))
def test_normalization(self):
    """
    Test that when normalization is disabled, the returned scores are integers.
    """

    tok = Tokenizer(stem=False)
    texts = [
        "Erdogan with threats to attack regime forces 'everywhere' in Syria",
        "After Erdogan's statement, Damascus says Erdogan 'disconnected from reality' after threats",
    ]
    documents = [ Document(text, tok.tokenize(text)) for text in texts ]

    tokens = TokenExtractor().extract(documents)
    # disabling normalization yields raw counts
    scores = TFScorer().score(tokens, normalize_scores=False)
    self.assertEqual(2, scores.get('erdogan'))
def test_score_across_multiple_documents(self):
    """
    Test that the score is based on term frequency.
    """

    # 'erdogan' appears once in the first post and twice in the second
    tok = Tokenizer(stem=False)
    texts = [
        "Erdogan with threats to attack regime forces 'everywhere' in Syria",
        "After Erdogan's statement, Damascus says Erdogan 'disconnected from reality' after threats",
    ]
    documents = [ Document(text, tok.tokenize(text)) for text in texts ]

    # the custom tokenizer keeps repeated tokens, so all three occurrences count
    ext = TokenExtractor(tokenizer=tok)
    tokens = ext.extract(documents)
    scores = TFScorer().score(tokens, normalize_scores=False)
    self.assertEqual(3, scores.get('erdogan'))
def test_logarithm_base(self):
    """
    Test that when a logarithmic base is provided, it is used instead of the default base.
    """

    tok = Tokenizer(stem=False)
    texts = [
        "Erdogan with threats to attack regime forces 'everywhere' in Syria",
        "After Erdogan's statement, Damascus says Erdogan 'disconnected from reality' after threats",
    ]
    documents = [ Document(text, tok.tokenize(text)) for text in texts ]

    tokens = TokenExtractor().extract(documents)
    scores = LogDFScorer(base=2).score(tokens, normalize_scores=False)
    # 'erdogan' appears in both documents; +1 is Laplace smoothing
    self.assertEqual(math.log(2 + 1, 2), scores.get('erdogan'))
def test_return_length(self):
    """
    Test that the token extractor returns as many token sets as the number of documents given.
    """

    # include an empty post to check that it still yields a (empty) token set
    tok = Tokenizer(stopwords=stopwords.words("english"), stem=False)
    texts = [
        "Manchester United falter against Tottenham Hotspur",
        "",
    ]
    documents = [ Document(text, tok.tokenize(text)) for text in texts ]

    tokens = TokenExtractor().extract(documents)
    self.assertEqual(2, len(tokens))
    self.assertEqual(set(["manchester", "united", "falter", "tottenham", "hotspur"]), set(tokens[0]))
    self.assertEqual(set([]), set(tokens[1]))
def test_tf_scorer(self):
    """
    Test the basic functionality of the TF scorer.
    """

    tok = Tokenizer(stem=False)
    texts = [
        "Erdogan with threats to attack regime forces 'everywhere' in Syria",
        "Damascus says Erdogan 'disconnected from reality' after threats",
    ]
    documents = [ Document(text, tok.tokenize(text)) for text in texts ]

    tokens = TokenExtractor().extract(documents)
    scores = TFScorer().score(tokens)
    # tokens in both posts score 1; tokens in one of the two posts score 0.5
    self.assertEqual(1, scores.get('erdogan', 0))
    self.assertEqual(0.5, scores.get('damascus', 0))
    self.assertEqual(1, scores.get('threats', 0))
def test_token_extractor(self):
    """
    Test the token extractor with normal input.
    """

    tok = Tokenizer(stopwords=stopwords.words("english"), stem=False)
    texts = [
        "Manchester United falter against Tottenham Hotspur",
        "Mourinho under pressure as Manchester United follow with a loss",
    ]
    documents = [ Document(text, tok.tokenize(text)) for text in texts ]

    # each document yields its own token set, with stopwords removed
    tokens = TokenExtractor().extract(documents)
    self.assertEqual(set(["manchester", "united", "falter", "tottenham", "hotspur"]), set(tokens[0]))
    self.assertEqual(set(["mourinho", "pressure", "manchester", "united", "follow", "loss"]), set(tokens[1]))