def test_default_tokenize(self):
     tokenizer = TweetTokenizer()
     tweet: str = (
         "Me: I think I have Ebola       "
         "Doctor: when did you start feel"
         "ing symptoms       Me: bout a w"
         "eek ago       Everyone in hospi"
         "tal: http://t.co/LoIPKzvOmT"
     )
     tokenized = tokenizer.tokenize(tweet)
     right_answer: List[str] = [
         "me",
         "i",
         "think",
         "i",
         "have",
         "ebola",
         "doctor",
         "when",
         "did",
         "you",
         "start",
         "feeling",
         "symptoms",
         "me",
         "bout",
         "a",
         "week",
         "ago",
         "everyone",
         "in",
         "hospital",
     ]
     self.assertListEqual(tokenized, right_answer)
Example #2
def test_supported_langs(self):
    with self.assertRaises(littlebird.tweet_tokenizer.LanguageNotSupportedError):
        tokenizer = TweetTokenizer(language="zxx")
    with self.assertRaises(littlebird.tweet_tokenizer.LanguageNotSupportedError):
        tokenizer = TweetTokenizer(language="es")
    with self.assertRaises(littlebird.tweet_tokenizer.LanguageNotSupportedError):
        tokenizer = TweetTokenizer(language="english")
Example #3
def test_url_titles(self):
    tokenizer = TweetTokenizer()
    tweet = {
        "entities": {
            "urls": [
                {
                    "url": "http://alexandradelucia.com",
                    "expanded_url": "http://alexandradelucia.com",
                },
                {
                    "url": "https://www.washingtonpost.com/news/voraciously/wp/2020/07/13/welcome-to-the-new-buffet-which-isnt-a-buffet-anymore/?utm_campaign=wp_post_most&utm_medium=email&utm_source=newsletter&wpisrc=nl_most",
                    "expanded_url": "https://www.washingtonpost.com/news/voraciously/wp/2020/07/13/welcome-to-the-new-buffet-which-isnt-a-buffet-anymore/?utm_campaign=wp_post_most&utm_medium=email&utm_source=newsletter&wpisrc=nl_most",
                },
            ]
        }
    }
    right_answer = [
        "About · Alexandra DeLucia",
        "Welcome to the new buffet, which isn’t a buffet anymore - The Washington Post",
    ]
    parsed_titles = tokenizer.add_url_titles(tweet).get("url_titles")
    self.assertEqual(right_answer, parsed_titles)
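
The expected titles above come from resolving each URL entity and reading the page's <title> text. Below is a minimal sketch of that idea, using the third-party requests and BeautifulSoup libraries and a hypothetical fetch_page_title helper; it is not necessarily how littlebird's add_url_titles is implemented.

import requests
from bs4 import BeautifulSoup


def fetch_page_title(url: str, timeout: float = 5.0) -> str:
    """Hypothetical helper: return a page's <title> text, or "" on failure."""
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        return ""
    soup = BeautifulSoup(response.text, "html.parser")
    # Pages without a <title> element fall back to the empty string
    return soup.title.string.strip() if soup.title and soup.title.string else ""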
Example #4
def test_get_tweet_text(self):
    tokenizer = TweetTokenizer()
    tweet: Dict[str, Any] = {
        "text": "sample text",
        "truncated": True,
        "extended_tweet": {"full_text": "sample text plus more text"},
        "quoted_status": {
            "text": "quoted text",
            "extended_tweet": {"full_text": "quoted text and more text"},
        },
        "retweeted_status": {
            "text": "retweeted text",
            "extended_tweet": {"full_text": "retweeted text and more text"},
        },
    }
    right_answer = "sample text plus more text quoted text and more text retweeted text and more text"
    all_text = tokenizer.get_tweet_text(tweet)
    self.assertEqual(all_text, right_answer)
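
The fixture above implies the assembly rule: prefer a status's extended_tweet full_text over its truncated text, and append the text of quoted and retweeted statuses. A minimal sketch of that logic follows, written as a hypothetical collect_tweet_text helper for illustration rather than as littlebird's actual get_tweet_text.

from typing import Any, Dict


def collect_tweet_text(tweet: Dict[str, Any]) -> str:
    """Hypothetical helper: gather text from a tweet and its quoted/retweeted statuses."""
    def full_text(status: Dict[str, Any]) -> str:
        # Prefer the untruncated extended text when the API provides it
        if "extended_tweet" in status:
            return status["extended_tweet"]["full_text"]
        return status.get("text", "")

    parts = [full_text(tweet)]
    for key in ("quoted_status", "retweeted_status"):
        if key in tweet:
            parts.append(full_text(tweet[key]))
    return " ".join(parts)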
Example #5
    def test_apostraphe_preservation(self):
        tokenizer = TweetTokenizer(token_pattern=r"\b[\w']+\b")
        tweet: str = "Why can't I ' #twerk '' :'( :')"
        right_answer = ["why", "can't", "i", "twerk"]
        tokenized = tokenizer.tokenize(tweet)
        self.assertListEqual(tokenized, right_answer)

        tweet: str = "She just wanted to say 'hello'"
        right_answer = ["she", "just", "wanted", "to", "say", "hello"]
        tokenized = tokenizer.tokenize(tweet)
        self.assertListEqual(tokenized, right_answer)
        
        tweet: str = "If people who are in love together are called \"Love Birds\" then people who always argue together should be called \"Angry Birds\"-happy nw yr"
        right_answer = ["if", "people", "who", "are", "in", "love", "together", "are", "called", "love", "birds", "then", "people", "who", "always", "argue", "together", "should", "be", "called", "angry", "birds", "happy", "nw", "yr"]
        tokenized = tokenizer.tokenize(tweet)
        self.assertListEqual(tokenized, right_answer)
Example #6
    def test_default_tokenize(self):
        tokenizer = TweetTokenizer()
        tweet: str = (
            "Me: I think I have Ebola       "
            "Doctor: when did you start feel"
            "ing symptoms       Me: bout a w"
            "eek ago       Everyone in hospi"
            "tal: http://t.co/LoIPKzvOmT"
        )
        tokenized = tokenizer.tokenize(tweet)
        right_answer: List[str] = [
            "me",
            "i",
            "think",
            "i",
            "have",
            "ebola",
            "doctor",
            "when",
            "did",
            "you",
            "start",
            "feeling",
            "symptoms",
            "me",
            "bout",
            "a",
            "week",
            "ago",
            "everyone",
            "in",
            "hospital",
        ]
        self.assertListEqual(tokenized, right_answer)

        # Quote handling
        tweet: str = "If people who are in love together are called \"Love Birds\" then people who always argue together should be called \"Angry Birds\"-happy nw yr"
        right_answer = ["if", "people", "who", "are", "in", "love", "together", "are", "called", "love", "birds", "then", "people", "who", "always", "argue", "together", "should", "be", "called", "angry", "birds", "happy", "nw", "yr"]
        tokenized = tokenizer.tokenize(tweet)
        self.assertListEqual(tokenized, right_answer)
Example #7
    parser.add_argument("--output-dir", type=str, required=True)
    parser.add_argument(
        "--include-retweeted-content",
        action="store_true",
        help=
        "Search entities in retweeted and quoted statuses in addition to the original tweet"
    )
    return parser.parse_args()


if __name__ == "__main__":
    # Parse command-line arguments
    args = parse_args()

    # Initialize tokenizer
    tokenizer = TweetTokenizer(
        include_retweeted_and_quoted_content=args.include_retweeted_content)

    # Loop through files
    for input_file in args.input_files:
        modified_tweets = []
        reader = TweetReader(input_file)
        for tweet in reader.read_tweets():
            temp = tokenizer.add_url_titles(tweet)
            modified_tweets.append(temp)

        # Write out tweets
        output_file = os.path.join(args.output_dir, input_file)
        writer = TweetWriter(output_file)
        writer.write(modified_tweets)
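
Assuming the script is saved under a hypothetical name such as add_url_titles.py and that parse_args defines the --input-files argument referenced as args.input_files, it could be invoked roughly as:

python add_url_titles.py --input-files tweets.json.gz --output-dir output --include-retweeted-content

Because output_file is built with os.path.join(args.output_dir, input_file), the input path is reused verbatim; passing bare file names (rather than nested paths) keeps the results directly inside --output-dir.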
Example #8
def test_remove_lone_digits(self):
    tokenizer = TweetTokenizer(remove_lone_digits=True)
    tweet: str = "luv 4 u"
    right_answer = ["luv", "u"]
    tokenized = tokenizer.tokenize(tweet)
    self.assertListEqual(tokenized, right_answer)
Example #9
def test_remove_ampersand(self):
    tokenizer = TweetTokenizer()
    tweet: str = "@dr_m_badawy tnx u so much , the same for u & all the best"
    right_answer = ["tnx", "u", "so", "much", "the", "same", "for", "u", "all", "the", "best"]
    tokenized = tokenizer.tokenize(tweet)
    self.assertListEqual(tokenized, right_answer)
Example #10
def test_contraction_expansion(self):
    tokenizer = TweetTokenizer(expand_contractions=True)
    tweet: str = "Why can't I #twerk"
    right_answer = ["why", "can", "not", "i", "twerk"]
    tokenized = tokenizer.tokenize(tweet)
    self.assertListEqual(tokenized, right_answer)
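
Contraction expansion of the kind this test checks can be done with a lookup table applied before tokenization. A minimal sketch follows, using a hypothetical, abbreviated table rather than littlebird's actual expansion rules.

import re

# Hypothetical, abbreviated contraction table for illustration only
CONTRACTIONS = {
    "can't": "can not",
    "won't": "will not",
    "n't": " not",
    "'re": " are",
    "'ll": " will",
}


def expand_contractions(text: str) -> str:
    """Replace known contractions with their expanded forms (longest keys first)."""
    pattern = re.compile(
        "|".join(re.escape(k) for k in sorted(CONTRACTIONS, key=len, reverse=True)),
        flags=re.IGNORECASE,
    )
    return pattern.sub(lambda m: CONTRACTIONS[m.group(0).lower()], text)

# expand_contractions("Why can't I #twerk") -> "Why can not I #twerk"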
Example #11
def test_tokenize_file(self):
    tokenizer = TweetTokenizer()
    text = tokenizer.tokenize_tweet_file("demo_tweets.json")


def _confirm_org(tweet):
    # The body of this helper (used below to filter tweets) is not shown;
    # only its fall-through return survives in the snippet.
    return False


def print_topics(model, count_vectorizer, n_top_words):
    # Print the n_top_words highest-weighted terms for each topic of a fitted model
    words = count_vectorizer.get_feature_names()  # get_feature_names_out() on scikit-learn >= 1.0
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #{}:".format(topic_idx))
        print(" ".join(
            [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))


# load data
# num_docs = 0
doc_num = 0
filtered_tweets = []
tokenizer = TweetTokenizer(stopwords=stopwords.words('english'))
for filename in os.listdir(DOCS_DIR):
    if not filename.endswith(".json.gz"):
        continue
    reader = TweetReader(os.path.join(DOCS_DIR, filename))
    for tweet in reader.read_tweets():
        if not _confirm_org(tweet):
            continue
        doc_num += 1
        if doc_num % 2 != 0:
            continue
        if tweet.get("truncated", False):
            text = tweet["extended_tweet"]["full_text"]
        else:
            text = tweet["text"]
        tokens = tokenizer.tokenize(text)