def scrape_twitter(google_client):
    """Search for JetBlue tweets and score each one with the Google NL API."""
    tw = Twitter()
    # tweets = tw.tweets(keywords='JetBlue', stream=False, limit=10)  # sample from the public stream
    # print(tweets)
    oauth = credsfromfile()
    client = Query(**oauth)
    tweets = client.search_tweets(
        keywords='JetBlue OR #JetBlue -filter:retweets', limit=10000)
    topics_dict = {"tweet_texts": [],
                   "ent_score": [],
                   "ent_magn": [],
                   "overall_score": [],
                   "overall_magn": []}
    for tweet in tqdm(tweets):
        topics_dict["tweet_texts"].append(tweet['text'])
        ent_score, ent_magnitude, doc_score, doc_magnitude = analyze_text(
            google_client, text=tweet['text'])
        topics_dict["ent_score"].append(ent_score)
        topics_dict["ent_magn"].append(ent_magnitude)
        topics_dict["overall_score"].append(doc_score)
        topics_dict["overall_magn"].append(doc_magnitude)
        # pprint(tweet, depth=1)
        # print('\n\n')
    print('Total Count:', len(topics_dict["tweet_texts"]))
    metrics = ["ent_score", "ent_magn", "overall_score", "overall_magn"]
    for metric in metrics:
        metric_score = np.asarray(topics_dict[metric])
        print(metric, "Mean:", np.mean(metric_score),
              "St Dev:", np.std(metric_score))
    with open('./csvs/twitter-jetblue-sentiment.json', 'w') as fp:
        json.dump(topics_dict, fp)
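# analyze_text is assumed above but not defined in this file. A hypothetical
# sketch using the google.cloud.language_v1 client, returning the four values
# the caller unpacks (mean entity sentiment score/magnitude, then the
# document-level score/magnitude); the real helper may differ:
from google.cloud import language_v1

def analyze_text(google_client, text):
    document = language_v1.Document(
        content=text, type_=language_v1.Document.Type.PLAIN_TEXT)
    doc_sentiment = google_client.analyze_sentiment(
        request={"document": document}).document_sentiment
    entities = google_client.analyze_entity_sentiment(
        request={"document": document}).entities
    # Average entity-level sentiment; fall back to 0.0 when no entities.
    n = max(len(entities), 1)
    ent_score = sum(e.sentiment.score for e in entities) / n
    ent_magn = sum(e.sentiment.magnitude for e in entities) / n
    return ent_score, ent_magn, doc_sentiment.score, doc_sentiment.magnitude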
def expand_tweetids_demo():
    """
    Given a file object containing a list of Tweet IDs, fetch the
    corresponding full Tweets, if available.
    """
    ids_f = StringIO(
        """\
588665495492124672
588665495487909888
588665495508766721
588665495513006080
588665495517200384
588665495487811584
588665495525588992
588665495487844352
588665495492014081
588665495512948737"""
    )
    oauth = credsfromfile()
    client = Query(**oauth)
    hydrated = client.expand_tweetids(ids_f)

    for tweet in hydrated:
        id_str = tweet["id_str"]
        print(f"id: {id_str}")
        text = tweet["text"]
        if text.startswith("@null"):
            text = "[Tweet not available]"
        print(text + "\n")
def expand_tweetids_demo():
    """
    Given a file object containing a list of Tweet IDs, fetch the
    corresponding full Tweets.
    """
    ids_f = io.StringIO(
        """\
588665495492124672
588665495487909888
588665495508766721
588665495513006080
588665495517200384
588665495487811584
588665495525588992
588665495487844352
588665495492014081
588665495512948737"""
    )
    oauth = credsfromfile()
    client = Query(**oauth)
    hydrated = client.expand_tweetids(ids_f)

    for tweet in hydrated:
        try:
            id_str = tweet['id_str']
            print('id: {}\ntext: {}\n'.format(id_str, tweet['text']))
        except KeyError:  # dict lookups raise KeyError, not IndexError
            pass
def collect_tweets(my_keyword, json_writer, stop_num):
    my_keyword = my_keyword.strip()
    print('finding tweets with {} keyword'.format(my_keyword))
    oauth = credsfromfile()
    client = Query(**oauth)
    tweets = client.search_tweets(keywords=my_keyword, limit=stop_num)
    dump_tweets(tweets, json_writer)
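# dump_tweets is not defined in this file. A minimal sketch, assuming the
# intent is one JSON object per line written to an already-open file handle:
import json

def dump_tweets(tweets, json_writer):
    """Write each tweet dict as a single JSON line."""
    for tweet in tweets:
        json_writer.write(json.dumps(tweet) + '\n')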
def search_demo(keywords="nltk"):
    """
    Use the REST API to search for past tweets containing a given keyword.
    """
    oauth = credsfromfile()
    client = Query(**oauth)
    for tweet in client.search_tweets(keywords=keywords, limit=10):
        print(tweet["text"])
def tweets_by_user_demo(user="******", count=200):
    """
    Use the REST API to search for past tweets by a given user.
    """
    oauth = credsfromfile()
    client = Query(**oauth)
    client.register(TweetWriter())
    client.user_tweets(user, count)
def get_tweet_by_id(self, filepath, tw_id):
    ids_f = StringIO(str(tw_id))  # expand_tweetids expects a file-like object
    oauth = credsfromfile()       # oauth was previously undefined
    client = Query(**oauth)
    hydrated = client.expand_tweetids(ids_f)
    tw = read_csv_tweets(filepath)
    for tweet in hydrated:
        # Match the CSV's 'user.id' column against the hydrated tweet's
        # author id (the original compared against the whole tweet dict).
        yield tw.loc[tw['user.id'] == tweet['user']['id']]['text']
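# read_csv_tweets is not defined in this file. A minimal sketch, assuming it
# loads a CSV produced by nltk.twitter.common.json2csv (which flattens user
# fields into dotted column names such as 'user.id'):
import pandas as pd

def read_csv_tweets(filepath):
    """Load a tweets CSV into a DataFrame."""
    return pd.read_csv(filepath, encoding='utf-8')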
def get_users(self, *args):  # look up users by ID
    oauth = credsfromfile()  # oauth was previously undefined
    client = Query(**oauth)  # fixed typo: 'clinet'
    user_info = client.user_info_from_id(*args)
    users = []
    for user in user_info:
        # Index the per-user dict, not the whole user_info list.
        name = user['name']
        followers = user['followers_count']
        following = user['friends_count']
        users.append(user)
        print(f'{name} {followers} {following}\n')
    return users
def lookup_by_userid_demo():
    """
    Use the REST API to convert a userID to a screen name.
    """
    oauth = credsfromfile()
    client = Query(**oauth)
    user_info = client.user_info_from_id(USERIDS)
    for info in user_info:
        name = info['screen_name']
        followers = info['followers_count']
        following = info['friends_count']
        print("{0}, followers: {1}, following: {2}".format(name, followers,
                                                           following))
def lookup_by_userid_demo():
    """
    Use the REST API to convert a userID to a screen name.
    """
    oauth = credsfromfile()
    client = Query(**oauth)
    user_info = client.user_info_from_id(USERIDS)
    for info in user_info:
        name = info["screen_name"]
        followers = info["followers_count"]
        following = info["friends_count"]
        print(f"{name}, followers: {followers}, following: {following}")
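# USERIDS is defined elsewhere in the demo module these functions come from.
# A plausible stand-in is a list of numeric user-ID strings; the values below
# are illustrative, not taken from this file:
USERIDS = ['759251', '612473', '15108702', '6017542', '2673523800']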
def obtener_Twits(listaPalabras, DicPalabras):
    listaPalabrasConsulta = []
    # This could be improved
    # size = len(listaPalabras) / 2
    for x in list(DicPalabras)[0:4]:
        listaPalabrasConsulta.append(x)
    print("Word list for the query: ", listaPalabrasConsulta)
    # Query Twitter with an AND of the most important words
    # (a space is a logical AND, a comma is a logical OR)
    txt = ' '.join(listaPalabrasConsulta)
    oauth = credsfromfile()
    client = Query(**oauth)
    tweets = client.search_tweets(keywords=txt, limit=10)
    arrTweets = []
    for tweet in tweets:
        arrTweets.append(Standardizer.standardize(tweet['text']))
    return arrTweets
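# Standardizer is assumed above but not defined here. A hypothetical sketch
# of one common standardize step: lower-case the text and strip URLs,
# @-handles, and surplus whitespace; the real class may do more:
import re

class Standardizer:
    @staticmethod
    def standardize(text):
        text = re.sub(r'https?://\S+|@\w+', '', text.lower())
        return re.sub(r'\s+', ' ', text).strip()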
def add_tweets(self, user, party):
    """
    Downloads the most recent tweets (up to 200) from a single Twitter user.

    :param user: the Twitter handle of the user.
    :param party: the political party to which `user` belongs.
    :return: the list of downloaded tweets.
    """
    query = Query(**self.oauth)
    tweets = query.get_user_timeline(screen_name=user, count=200,
                                     exclude_replies='false',
                                     include_rts='true')
    self.tweets[user] = tweets
    self.save()
    self.users[user] = party
    return tweets
def limit_by_time_demo(keywords="nltk"):
    """
    Query the REST API for Tweets about NLTK since yesterday and send
    the output to terminal.

    This example makes the assumption that there are sufficient Tweets
    since yesterday for the date to be an effective cut-off.
    """
    date = yesterday()
    dt_date = datetime.datetime(*date)
    oauth = credsfromfile()
    client = Query(**oauth)
    client.register(TweetViewer(limit=100, lower_date_limit=date))

    print(f"Cutoff date: {dt_date}\n")

    for tweet in client.search_tweets(keywords=keywords):
        print("{} ".format(tweet["created_at"]), end="")
        client.handler.handle(tweet)
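# yesterday() is assumed above but not defined here. A minimal sketch: it
# must return a tuple acceptable to datetime.datetime(*date), e.g. the first
# six fields of yesterday's timetuple.
import datetime

def yesterday():
    """Return yesterday's date/time as a (y, m, d, h, min, s) tuple."""
    date = datetime.datetime.now() - datetime.timedelta(days=1)
    return date.timetuple()[:6]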
def getOpinionsOfTopic(topic, oauth, num_of_tweets):
    raw_tweets = []
    client = Query(**oauth)
    tweets = client.search_tweets(keywords=topic, limit=num_of_tweets)
    for tweet in tweets:
        raw_tweets.append(tweet)
    tweets, retweet_counts, fave_counts, followers_count = preprocess_tweet(raw_tweets)
    sentiments, totals = getOpinionTotals(tweets, retweet_counts, fave_counts,
                                          followers_count)

    adjustedTotal = totals['Positive'] + totals['Negative'] + totals['Neutral']
    posPercent = totals['Positive'] / adjustedTotal
    negPercent = totals['Negative'] / adjustedTotal
    neuPercent = totals['Neutral'] / adjustedTotal
    print('Opinions for the topic "{}":\nPositive: {:.0%}, Negative: {:.0%}, '
          'Neutral: {:.0%} out of {} tweets.\n'
          .format(topic, posPercent, negPercent, neuPercent, num_of_tweets))

    # Identify the dominant sentiment.
    greatestTotal = float(max(totals.values()))
    opinion = ""
    for key in totals.keys():
        if totals[key] == greatestTotal:
            opinion = key.lower()

    if opinion != 'neutral':
        print("The topic was mostly {}. Finding the most {} tweet.".format(
            opinion, opinion))
    else:
        print("The topic was mostly neutral. Unable to find the most neutral tweet.")

    # Track the strongest tweet for the dominant sentiment.
    sent = {'pos': 0, 'neg': 0, 'neu': 0, 'compound': 0}
    sentTweet = ""
    for i in range(len(tweets)):
        if opinion == 'positive':
            if (sentiments[i]['compound'] >= sent['compound']
                    and sentiments[i]['pos'] > sent['pos']):
                sent = sentiments[i]
                sentTweet = raw_tweets[i]
        elif opinion == 'negative':
            if (sentiments[i]['compound'] <= sent['compound']
                    and sentiments[i]['neg'] > sent['neg']):
                sent = sentiments[i]
                sentTweet = raw_tweets[i]

    if opinion != 'neutral':
        print("Most {} tweet: {}".format(opinion, sentTweet['text']))
        print("URL: https://twitter.com/statuses/{}".format(sentTweet['id']))
    print("------------------------------------")
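# preprocess_tweet and getOpinionTotals are assumed above but not defined
# here. The 'pos'/'neg'/'neu'/'compound' keys suggest NLTK's VADER analyzer,
# so a minimal getOpinionTotals might look like the sketch below; the real
# version presumably also weights by the retweet, favourite, and follower
# counts it receives (requires nltk.download('vader_lexicon')).
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def getOpinionTotals(tweets, retweet_counts, fave_counts, followers_count):
    sia = SentimentIntensityAnalyzer()
    sentiments = [sia.polarity_scores(text) for text in tweets]
    totals = {'Positive': 0, 'Negative': 0, 'Neutral': 0}
    for s in sentiments:
        # Standard VADER thresholds for classifying the compound score.
        if s['compound'] >= 0.05:
            totals['Positive'] += 1
        elif s['compound'] <= -0.05:
            totals['Negative'] += 1
        else:
            totals['Neutral'] += 1
    return sentiments, totals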
# export TWITTER="twitter.txt"
from nltk.twitter import Twitter, Query, Streamer, credsfromfile
import json
import pickle
from pprint import pprint

__author__ = 'kongaloosh'

# with open('data.json') as data_file:
with open('data/investments.json') as data_file:
    data = json.load(data_file)

oauth = credsfromfile()
tw = Twitter()
client = Query(**oauth)

for i in range(len(data['investments'])):
    # The original tested type(dict(...)), which is always truthy;
    # an isinstance check is presumably what was intended.
    if isinstance(data['investments'][i], dict):
        tweets = client.search_tweets(
            keywords=data['investments'][i]['name'], limit=100)
        data['investments'][i]['tweets'] = list(tweets)

# pickle requires binary file modes ('wb'/'rb'), not text modes.
with open('data_pickle.pkl', 'wb') as outfile:
    pickle.dump(data, outfile)

with open('data_pickle.pkl', 'rb') as infile:
    f = pickle.load(infile)
print(f)
def get_twiter(self, keywords, limit=10):
    oauth = credsfromfile()  # oauth and limit were previously undefined
    client = Query(**oauth)
    tweets = client.search_tweets(keywords=keywords, limit=limit)
    return next(tweets)  # return the first matching tweet
from nltk.twitter.common import json2csv
from nltk.twitter.common import json2csv_entities
from nltk.corpus import twitter_samples
from nltk.twitter import Query, Streamer, Twitter, TweetViewer, TweetWriter, credsfromfile
import pandas as pd

oauth = credsfromfile()
n = 10  # number of tweets to fetch
username = '******'

# Query
client = Query(**oauth)          # historical data
client.register(TweetWriter())   # write to file
client.user_tweets(username, n)  # fetch n tweets

'''
Use json2csv to extract the text field of the stored tweets.
The abspath for input_file must be adjusted to match the path
the Query above wrote its data to.
'''
input_file = twitter_samples.abspath('/Users/youngmihuang/twitter-files/tweets.20180726-155316.json')
with open(input_file) as fp:
    json2csv(fp, 'tweets_text.csv', ['text'])

# Read back
data = pd.read_csv('tweets_text.csv')
for line in data.text:
    print('Trump tweets content: ')
    print(line)

# Tokenization
def search():
    oauth = credsfromfile()
    client = Query(**oauth)
    df = pd.read_csv('twitter_users.csv')
    df = df[df['Flag'] == 'Use']
    terms = set('@' + u.replace('https://twitter.com/', '')
                for u in df['URL'].values)
    with open('terms.pkl', 'rb') as f:
        terms = terms.union(pickle.load(f))
    searches = 0
    li_html = ('<li>name={0} created={1} favorited={2} retweeted={3} '
               '{4} query={5}</li>')
    for term in terms:
        searches += 1
        row = twitter_searches.find_one(query=term)
        if row is not None:
            if hours_from_now(row['search_date']) < 24:
                continue
        tweets = client.search_tweets(keywords=term + ' python http -RT',
                                      lang='en', limit=5)
        for t in tweets:
            if int(t['favorite_count']) == 0:
                log.debug('No favorites')
                continue
            text = t['text']
            dt = datetime.strptime(t['created_at'],
                                   '%a %b %d %H:%M:%S %z %Y')
            if hours_from_now(dt) > 24:
                continue
            if core.not_english(text):
                log.debug('Not english: {}'.format(text))
                continue
            log.debug('Searching for {}'.format(term))
            uname = t['user']['screen_name']
            uname_html = '<a href="https://twitter.com/{0}">{0}</a>'
            users = [v.replace('https://twitter.com/', '')
                     for v in pd.read_csv('twitter_users.csv')['URL'].values]
            with open('twitter_users.csv', 'a') as users_csv:
                if uname not in set(users):
                    users_csv.write('{0},{1},Recommended\n'.format(
                        datetime.now(), 'https://twitter.com/' + uname))
            html = li_html.format(uname_html.format(uname), t['created_at'],
                                  t['favorite_count'], t['retweet_count'],
                                  hrefs_from_text(text), term)
            twitter_searches.upsert(
                dict(query=term, search_date=datetime.now(), html=html),
                ['query', 'html'])
        if searches == 150:
            break
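# hours_from_now and hrefs_from_text are assumed above but not defined here.
# A minimal sketch of hours_from_now, assuming it returns the hours elapsed
# since the given datetime (handling both naive and timezone-aware values):
from datetime import datetime, timezone

def hours_from_now(dt):
    """Hours elapsed between `dt` and the current time."""
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)  # assume naive values are UTC
    return (datetime.now(timezone.utc) - dt).total_seconds() / 3600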