def test_supported_langs(self):
    with self.assertRaises(littlebird.tweet_tokenizer.LanguageNotSupportedError):
        tokenizer = TweetTokenizer(language="zxx")
    with self.assertRaises(littlebird.tweet_tokenizer.LanguageNotSupportedError):
        tokenizer = TweetTokenizer(language="es")
    with self.assertRaises(littlebird.tweet_tokenizer.LanguageNotSupportedError):
        tokenizer = TweetTokenizer(language="english")
def test_url_titles(self):
    tokenizer = TweetTokenizer()
    tweet = {
        "entities": {
            "urls": [
                {
                    "url": "http://alexandradelucia.com",
                    "expanded_url": "http://alexandradelucia.com",
                },
                {
                    "url": "https://www.washingtonpost.com/news/voraciously/wp/2020/07/13/welcome-to-the-new-buffet-which-isnt-a-buffet-anymore/?utm_campaign=wp_post_most&utm_medium=email&utm_source=newsletter&wpisrc=nl_most",
                    "expanded_url": "https://www.washingtonpost.com/news/voraciously/wp/2020/07/13/welcome-to-the-new-buffet-which-isnt-a-buffet-anymore/?utm_campaign=wp_post_most&utm_medium=email&utm_source=newsletter&wpisrc=nl_most",
                },
            ]
        }
    }
    right_answer = [
        "About · Alexandra DeLucia",
        "Welcome to the new buffet, which isn’t a buffet anymore - The Washington Post",
    ]
    parsed_titles = tokenizer.add_url_titles(tweet).get("url_titles")
    self.assertEqual(right_answer, parsed_titles)
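
# The test above relies on add_url_titles resolving each URL in the tweet's
# entities and extracting the page's <title>. The sketch below only illustrates
# that idea; it is an assumption, not littlebird's actual implementation, and
# fetch_title is a hypothetical helper name.
import re
import requests

def fetch_title(url: str) -> str:
    """Return the HTML <title> of a page, or an empty string (sketch only)."""
    html = requests.get(url, timeout=10).text
    match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
    return match.group(1).strip() if match else ""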
def test_get_tweet_text(self):
    tokenizer = TweetTokenizer()
    tweet: Dict[str, Any] = {
        "text": "sample text",
        "truncated": True,
        "extended_tweet": {"full_text": "sample text plus more text"},
        "quoted_status": {
            "text": "quoted text",
            "extended_tweet": {"full_text": "quoted text and more text"},
        },
        "retweeted_status": {
            "text": "retweeted text",
            "extended_tweet": {"full_text": "retweeted text and more text"},
        },
    }
    right_answer = (
        "sample text plus more text quoted text and more text "
        "retweeted text and more text"
    )
    all_text = tokenizer.get_tweet_text(tweet)
    self.assertEqual(all_text, right_answer)
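
# For reference, the behavior exercised above amounts to collecting the
# untruncated text of the tweet plus any quoted and retweeted statuses.
# This is a minimal sketch of that logic, given here as an illustration only;
# it is not necessarily how littlebird's get_tweet_text is written.
from typing import Any, Dict, List

def gather_text(tweet: Dict[str, Any]) -> str:
    parts: List[str] = []
    for status in (tweet, tweet.get("quoted_status"), tweet.get("retweeted_status")):
        if not status:
            continue
        # Prefer extended_tweet.full_text when the status was truncated
        extended = status.get("extended_tweet")
        parts.append(extended["full_text"] if extended else status["text"])
    return " ".join(parts)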
def test_apostrophe_preservation(self):
    tokenizer = TweetTokenizer(token_pattern=r"\b[\w']+\b")
    tweet: str = "Why can't I ' #twerk '' :'( :')"
    right_answer = ["why", "can't", "i", "twerk"]
    tokenized = tokenizer.tokenize(tweet)
    self.assertListEqual(tokenized, right_answer)

    tweet = "She just wanted to say 'hello'"
    right_answer = ["she", "just", "wanted", "to", "say", "hello"]
    tokenized = tokenizer.tokenize(tweet)
    self.assertListEqual(tokenized, right_answer)

    tweet = (
        'If people who are in love together are called "Love Birds" then '
        'people who always argue together should be called "Angry Birds"'
        "-happy nw yr"
    )
    right_answer = [
        "if", "people", "who", "are", "in", "love", "together", "are",
        "called", "love", "birds", "then", "people", "who", "always",
        "argue", "together", "should", "be", "called", "angry", "birds",
        "happy", "nw", "yr",
    ]
    tokenized = tokenizer.tokenize(tweet)
    self.assertListEqual(tokenized, right_answer)
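
# The effect of the custom token_pattern above can be checked directly with
# Python's re module, independent of littlebird: a plain \w+ pattern splits
# "can't" into two tokens, while \b[\w']+\b keeps the apostrophe inside the word.
import re

assert re.findall(r"\w+", "why can't i") == ["why", "can", "t", "i"]
assert re.findall(r"\b[\w']+\b", "why can't i") == ["why", "can't", "i"]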
def test_default_tokenize(self):
    tokenizer = TweetTokenizer()
    tweet: str = (
        "Me: I think I have Ebola Doctor: when did you start feeling "
        "symptoms Me: bout a week ago Everyone in hospital: "
        "http://t.co/LoIPKzvOmT"
    )
    tokenized = tokenizer.tokenize(tweet)
    right_answer: List[str] = [
        "me", "i", "think", "i", "have", "ebola", "doctor", "when", "did",
        "you", "start", "feeling", "symptoms", "me", "bout", "a", "week",
        "ago", "everyone", "in", "hospital",
    ]
    self.assertListEqual(tokenized, right_answer)

    # Quote handling
    tweet = (
        'If people who are in love together are called "Love Birds" then '
        'people who always argue together should be called "Angry Birds"'
        "-happy nw yr"
    )
    right_answer = [
        "if", "people", "who", "are", "in", "love", "together", "are",
        "called", "love", "birds", "then", "people", "who", "always",
        "argue", "together", "should", "be", "called", "angry", "birds",
        "happy", "nw", "yr",
    ]
    tokenized = tokenizer.tokenize(tweet)
    self.assertListEqual(tokenized, right_answer)
parser.add_argument("--output-dir", type=str, required=True) parser.add_argument( "--include-retweeted-content", action="store_true", help= "Search entities in retweeted and quoted statuses in addition to the original tweet" ) return parser.parse_args() if __name__ == "__main__": # Parse command-line arguments args = parse_args() # Initialize tokenizer tokenizer = TweetTokenizer( include_retweeted_and_quoted_content=args.include_retweeted_content) # Loop through files for input_file in args.input_files: modified_tweets = [] reader = TweetReader(input_file) for tweet in reader.read_tweets(): temp = tokenizer.add_url_titles(tweet) modified_tweets.append(temp) # Write out tweets output_file = os.path.join(args.output_dir, input_file) writer = TweetWriter(output_file) writer.write(modified_tweets)
def test_remove_lone_digits(self):
    tokenizer = TweetTokenizer(remove_lone_digits=True)
    tweet: str = "luv 4 u"
    right_answer = ["luv", "u"]
    tokenized = tokenizer.tokenize(tweet)
    self.assertListEqual(tokenized, right_answer)
def test_remove_ampersand(self):
    tokenizer = TweetTokenizer()
    tweet: str = "@dr_m_badawy tnx u so much , the same for u & all the best"
    right_answer = [
        "tnx", "u", "so", "much", "the", "same", "for", "u", "all", "the", "best",
    ]
    tokenized = tokenizer.tokenize(tweet)
    self.assertListEqual(tokenized, right_answer)
def test_contraction_expansion(self):
    tokenizer = TweetTokenizer(expand_contractions=True)
    tweet: str = "Why can't I #twerk"
    right_answer = ["why", "can", "not", "i", "twerk"]
    tokenized = tokenizer.tokenize(tweet)
    self.assertListEqual(tokenized, right_answer)
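
# Contraction expansion as tested above can be thought of as a lookup-table
# substitution applied before tokenization. A minimal sketch follows; the
# table and flow here are assumptions for illustration, not littlebird's
# actual mapping.
CONTRACTIONS = {"can't": "can not", "won't": "will not", "i'm": "i am"}

expanded = []
for tok in "why can't i twerk".split():
    expanded.extend(CONTRACTIONS.get(tok, tok).split())
# expanded == ["why", "can", "not", "i", "twerk"]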
def test_tokenize_file(self):
    # Smoke test: make sure a file of tweets tokenizes without raising
    tokenizer = TweetTokenizer()
    text = tokenizer.tokenize_tweet_file("demo_tweets.json")
    return False


def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #{}:".format(topic_idx))
        print(" ".join(
            [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))


# Load data
# num_docs = 0
doc_num = 0
filtered_tweets = []
tokenizer = TweetTokenizer(stopwords=stopwords.words("english"))
for filename in os.listdir(DOCS_DIR):
    if not filename.endswith(".json.gz"):
        continue
    reader = TweetReader(os.path.join(DOCS_DIR, filename))
    for tweet in reader.read_tweets():
        # Skip tweets that do not pass the _confirm_org filter
        if not _confirm_org(tweet):
            continue
        # Keep every other qualifying tweet
        doc_num += 1
        if doc_num % 2 != 0:
            continue
        # Use the full (untruncated) text when available
        if tweet.get("truncated", False):
            text = tweet["extended_tweet"]["full_text"]
        else:
            text = tweet["text"]
        tokens = tokenizer.tokenize(text)
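
# The fragment above ends right after tokenization; presumably the token lists
# are appended to filtered_tweets and then fed to a topic model. Below is a
# sketch of those downstream steps (assumed, not shown in the original script)
# using scikit-learn. Note that print_topics above calls get_feature_names(),
# which newer scikit-learn versions rename to get_feature_names_out().
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
doc_term_matrix = count_vectorizer.fit_transform(
    [" ".join(tokens) for tokens in filtered_tweets]
)
lda = LatentDirichletAllocation(n_components=10, random_state=0)
lda.fit(doc_term_matrix)
print_topics(lda, count_vectorizer, n_top_words=10)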