Example #1
def tweets_by_user_demo(user="******", count=200):
    """
    Use the REST API to search for past tweets by a given user.
    """
    oauth = credsfromfile()
    client = Query(**oauth)
    client.register(TweetWriter())
    client.user_tweets(user, count)
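
# Hedged usage sketch: per the NLTK twitter HOWTO, credsfromfile() reads credentials.txt
# from the directory named by the TWITTER environment variable. The username here is a
# placeholder, not the one redacted above.
#
#   import os
#   os.environ['TWITTER'] = '/path/to/twitter-files'  # directory containing credentials.txt
#   tweets_by_user_demo(user='some_username', count=50)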
Example #2
from nltk.twitter.common import json2csv
from nltk.twitter.common import json2csv_entities
from nltk.corpus import twitter_samples
from nltk.twitter import Query, Streamer, Twitter, TweetViewer, TweetWriter, credsfromfile
import pandas as pd

oauth = credsfromfile()
n = 10  # number of tweets to fetch
username = '******'

# Query
client = Query(**oauth)  # REST API client for historical tweets
client.register(TweetWriter())  # register a writer that saves the tweets to file
client.user_tweets(username, n)  # fetch the user's n most recent tweets
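
# Note (assumption from NLTK's defaults, not stated in the original): TweetWriter() writes
# the fetched tweets to a timestamped file such as tweets.YYYYMMDD-HHMMSS.json under a
# twitter-files directory in the home folder; that file is what input_file points to below.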

'''
Use json2csv to extract the text field from the saved tweets.
The abspath passed as input_file must be changed to match the path
where the Query above wrote its data.
'''

input_file = twitter_samples.abspath('/Users/youngmihuang/twitter-files/tweets.20180726-155316.json')
with open(input_file) as fp:
    json2csv(fp, 'tweets_text.csv', ['text'])

# Read the CSV back
data = pd.read_csv('tweets_text.csv')
print('Trump tweets content: ')
for line in data.text:
    print(line)
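
# Hedged sketch: json2csv_entities is imported above but never used. Assuming the signature
# shown in the NLTK twitter HOWTO, json2csv_entities(fp, outfile, main_fields, entity_type,
# entity_fields), hashtags could be pulled from the same JSON file like this:
with open(input_file) as fp:
    json2csv_entities(fp, 'tweets_hashtags.csv', ['id', 'text'], 'hashtags', ['text'])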

# Tokenization
from nltk.twitter import Query, credsfromfile, TweetViewer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import sys

if len(sys.argv) < 4:
    print('Usage:', sys.argv[0], 'twitter_username max_tweets_to_search max_top_words_to_print lemmatize(optional)')
    sys.exit(1)
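
# Example invocation (hypothetical script name and username):
#   python top_tweet_words.py some_username 100 20 lemmatize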

# Capture the output of TweetViewer to a file for processing
sys.stdout = open('tweets.txt', 'w')

oauth = credsfromfile()
client = Query(**oauth)
client.register(TweetViewer(limit=int(sys.argv[2])))  # limit must be an int, not the raw argv string
client.user_tweets(sys.argv[1], int(sys.argv[2]))


# Close the capture file and give control back to stdout
sys.stdout.close()
sys.stdout = sys.__stdout__
lemmatizer = WordNetLemmatizer()

lemmatize = len(sys.argv) > 4 and sys.argv[4].lower() == 'lemmatize'
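
# Hedged note: WordNetLemmatizer maps an inflected form to its lemma, e.g.
# lemmatizer.lemmatize('tweets') -> 'tweet' (default noun POS) and
# lemmatizer.lemmatize('running', pos='v') -> 'run'; presumably text_cleaner below
# applies this per token when lemmatize is True.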


def text_cleaner(documents):
    text_cleaned = []
    for document in documents: