def scrape_twitter(google_client):
    tw = Twitter()
    # tweets = tw.tweets(keywords='JetBlue', stream=False, limit=10)  # sample from the public stream
    # print(tweets)
    oauth = credsfromfile()
    client = Query(**oauth)
    tweets = client.search_tweets(
        keywords='JetBlue OR #JetBlue -filter:retweets', limit=10000)
    topics_dict = {
        "tweet_texts": [],
        "ent_score": [],
        "ent_magn": [],
        "overall_score": [],
        "overall_magn": []}
    for tweet in tqdm(tweets):
        topics_dict["tweet_texts"].append(tweet['text'])
        ent_score, ent_magnitude, doc_score, doc_magnitude = analyze_text(
            google_client, text=tweet['text'])
        topics_dict["ent_score"].append(ent_score)
        topics_dict["ent_magn"].append(ent_magnitude)
        topics_dict["overall_score"].append(doc_score)
        topics_dict["overall_magn"].append(doc_magnitude)
        # pprint(tweet, depth=1)
        # print('\n\n')
    print('Total Count:', len(topics_dict["tweet_texts"]))
    metrics = ["ent_score", "ent_magn", "overall_score", "overall_magn"]
    for metric in metrics:
        metric_score = np.asarray(topics_dict[metric])
        print(metric, "Mean:", np.mean(metric_score),
              "St Dev:", np.std(metric_score))
    with open('./csvs/twitter-jetblue-sentiment.json', 'w') as fp:
        json.dump(topics_dict, fp)

def collect_tweets(my_keyword, json_writer, stop_num):
    my_keyword = my_keyword.strip()
    print('finding tweets with {} keyword'.format(my_keyword))
    oauth = credsfromfile()
    client = Query(**oauth)
    tweets = client.search_tweets(keywords=my_keyword, limit=stop_num)
    dump_tweets(tweets, json_writer)

def search_demo(keywords='nltk'):
    """
    Use the REST API to search for past tweets containing a given keyword.
    """
    oauth = credsfromfile()
    client = Query(**oauth)
    for tweet in client.search_tweets(keywords=keywords, limit=10):
        print(tweet['text'])

def search_demo(keywords="nltk"): """ Use the REST API to search for past tweets containing a given keyword. """ oauth = credsfromfile() client = Query(**oauth) for tweet in client.search_tweets(keywords=keywords, limit=10): print(tweet["text"])
def obtener_Twits(listaPalabras, DicPalabras):
    listaPalabrasConsulta = []
    # This could be improved
    # size = len(listaPalabras) / 2
    for x in list(DicPalabras)[0:4]:
        listaPalabrasConsulta.append(x)
    print("Word list for the query: ", listaPalabrasConsulta)
    # Query Twitter, ANDing the most important words
    # (a space is a logical AND, a comma is a logical OR)
    txt = ' '.join(listaPalabrasConsulta)
    oauth = credsfromfile()
    client = Query(**oauth)
    tweets = client.search_tweets(keywords=txt, limit=10)
    arrTweets = []
    for tweet in tweets:
        arrTweets.append(Standardizer.standardize(tweet['text']))
    return arrTweets

def limit_by_time_demo(keywords="nltk"):
    """
    Query the REST API for Tweets about NLTK since yesterday and send
    the output to terminal.

    This example makes the assumption that there are sufficient Tweets
    since yesterday for the date to be an effective cut-off.
    """
    date = yesterday()
    dt_date = datetime.datetime(*date)
    oauth = credsfromfile()
    client = Query(**oauth)
    client.register(TweetViewer(limit=100, lower_date_limit=date))

    print(f"Cutoff date: {dt_date}\n")

    for tweet in client.search_tweets(keywords=keywords):
        print("{} ".format(tweet["created_at"]), end="")
        client.handler.handle(tweet)

def getOpinionsOfTopic(topic, oauth, num_of_tweets):
    raw_tweets = []
    client = Query(**oauth)
    tweets = client.search_tweets(keywords=topic, limit=num_of_tweets)
    for tweet in tweets:
        raw_tweets.append(tweet)
    tweets, retweet_counts, fave_counts, followers_count = preprocess_tweet(raw_tweets)
    sentiments, totals = getOpinionTotals(tweets, retweet_counts, fave_counts, followers_count)

    adjustedTotal = totals['Positive'] + totals['Negative'] + totals['Neutral']
    posPercent = totals['Positive'] / adjustedTotal
    negPercent = totals['Negative'] / adjustedTotal
    neuPercent = totals['Neutral'] / adjustedTotal
    print("Opinions for the topic \"{}\":\nPositive: {:.0%}, Negative: {:.0%}, Neutral: {:.0%} out of {} tweets.\n"
          .format(topic, posPercent, negPercent, neuPercent, num_of_tweets))

    greatestTotal = float(max(totals.values()))
    opinion = ""
    for key in totals.keys():
        if totals[key] == greatestTotal:
            opinion = key.lower()
    if opinion != 'neutral':
        print("The topic was mostly {}. Finding the most {} tweet.".format(opinion, opinion))
    else:
        print("The topic was mostly neutral. Unable to find the most neutral tweet.")

    sent = {'pos': 0, 'neg': 0, 'neu': 0, 'compound': 0}
    sentTweet = ""
    for i in range(len(tweets)):
        if opinion == 'positive':
            if (sentiments[i]['compound'] >= sent['compound']
                    and sentiments[i]['pos'] > sent['pos']):
                sent = sentiments[i]
                sentTweet = raw_tweets[i]
        elif opinion == 'negative':
            if (sentiments[i]['compound'] <= sent['compound']
                    and sentiments[i]['neg'] > sent['neg']):
                sent = sentiments[i]
                sentTweet = raw_tweets[i]
    if opinion != 'neutral':
        print("Most {} tweet: {}".format(opinion, sentTweet['text']))
        print("URL: https://twitter.com/statuses/{}".format(sentTweet['id']))
    print("------------------------------------")

def search():
    oauth = credsfromfile()
    client = Query(**oauth)
    df = pd.read_csv('twitter_users.csv')
    df = df[df['Flag'] == 'Use']
    terms = set(['@' + u.replace('https://twitter.com/', '')
                 for u in df['URL'].values])

    with open('terms.pkl', 'rb') as f:
        terms = terms.union(pickle.load(f))

    searches = 0
    li_html = '<li>name={0} created={1} favorited={2} retweeted={3} \
        {4} query={5}</li>'

    for term in terms:
        searches += 1
        row = twitter_searches.find_one(query=term)
        if row is not None:
            if hours_from_now(row['search_date']) < 24:
                continue

        tweets = client.search_tweets(keywords=term + ' python http -RT',
                                      lang='en', limit=5)
        for t in tweets:
            if int(t['favorite_count']) == 0:
                log.debug('No favorites')
                continue

            text = t['text']
            dt = datetime.strptime(t['created_at'],
                                   '%a %b %d %H:%M:%S %z %Y')
            if hours_from_now(dt) > 24:
                continue
            if core.not_english(text):
                log.debug('Not english: {}'.format(text))
                continue

            log.debug('Searching for {}'.format(term))
            uname = t['user']['screen_name']
            uname_html = '<a href="https://twitter.com/{0}">{0}</a>'

            users = [v.replace('https://twitter.com/', '')
                     for v in pd.read_csv('twitter_users.csv')['URL'].values]
            with open('twitter_users.csv', 'a') as users_csv:
                if uname not in set(users):
                    users_csv.write('{0},{1},Recommended\n'.format(
                        datetime.now(), 'https://twitter.com/' + uname))

            html = li_html.format(uname_html.format(uname),
                                  t['created_at'],
                                  t['favorite_count'],
                                  t['retweet_count'],
                                  hrefs_from_text(text), term)
            twitter_searches.upsert(dict(query=term,
                                         search_date=datetime.now(),
                                         html=html),
                                    ['query', 'html'])

        if searches == 150:
            break

import json
import re
import csv

from nltk.twitter import Query, Streamer, Twitter, TweetViewer, TweetWriter, credsfromfile
from nltk.sentiment.vader import SentimentIntensityAnalyzer

tw = Twitter()
sid = SentimentIntensityAnalyzer()

# Grab credentials from file
oauth = credsfromfile()

# Search API
client = Query(**oauth)
tweets = client.search_tweets(keywords='Bitcoin, #cryptocurrency', limit=10000)
tweet = next(tweets)

# Open data file
outfile = open("bitcoin_auto.csv", "a")
writer = csv.writer(outfile)
mydata = [
    'DATE', 'TWEET', 'COMPOUND', 'NEGATIVE', 'NEUTRAL', 'POSITIVE',
    'LATITUDE', 'LONGITUDE'
]
# writer.writerow(mydata)


def pre_process_text(tweet):
    text = []
    words_list = []

if __name__ == '__main__':
    oauth = credsfromfile()
    client = Query(**oauth)

    with open(searchfile, 'rb') as f_search:
        search_terms = [
            term.strip() for term in f_search.readlines() if term.strip()
        ]

    # Get tweets for specific search terms
    for term in search_terms:
        print("Collecting {term}".format(term=term))
        search_data = []
        tweets = client.search_tweets(
            keywords="{term} -filter:retweets".format(term=term),
            limit=float('inf'))
        while True:
            tweet = next(tweets, None)
            if tweet is None:
                break
            elif tweet['id_str'] in tweet_ids:
                continue
            author_names.add(tweet['user']['screen_name'])
            tweet_ids.add(tweet['id_str'])
            search_data.append(tweet)
            # normalise to ASCII, dropping emoji and other non-ASCII characters
            tweet['text'] = unicodedata.normalize(
                'NFKD', strip_emoji(tweet['text'])).encode(
                    'ascii', 'ignore').decode('ascii')

import json
import re
import csv

from nltk.twitter import Query, Streamer, Twitter, TweetViewer, TweetWriter, credsfromfile
from nltk.sentiment.vader import SentimentIntensityAnalyzer

tw = Twitter()
sid = SentimentIntensityAnalyzer()

# Grab credentials from file
oauth = credsfromfile()

# Search API
client = Query(**oauth)
tweets = client.search_tweets(keywords='Korean, summit', limit=10000)
tweet = next(tweets)

# Open data file
outfile = open("korean_summit_auto.csv", "a")
writer = csv.writer(outfile)
mydata = [
    'DATE', 'TWEET', 'COMPOUND', 'NEGATIVE', 'NEUTRAL', 'POSITIVE',
    'LATITUDE', 'LONGITUDE'
]
# writer.writerow(mydata)


def pre_process_text(tweet):
    text = []
    words_list = []

import re
import csv

from nltk.twitter import Query, Streamer, Twitter, TweetViewer, TweetWriter, credsfromfile
from nltk.sentiment.vader import SentimentIntensityAnalyzer

tw = Twitter()
sid = SentimentIntensityAnalyzer()

# Grab credentials from file
oauth = credsfromfile()

# Search API
client = Query(**oauth)
tweets = client.search_tweets(keywords='Syria', limit=10000)
tweet = next(tweets)

# Open data file
outfile = open("syria_auto.csv", "a")
writer = csv.writer(outfile)
mydata = ['DATE', 'TWEET', 'COMPOUND', 'NEGATIVE', 'NEUTRAL', 'POSITIVE',
          'LATITUDE', 'LONGITUDE']
# writer.writerow(mydata)


def pre_process_text(tweet):
    text = []
    words_list = []
    clean_list = []
    # Get all tweet text in English
    if tweet['lang'] == "en":

import process_twt
from NBClassifier import NBClassifier
from SCClassifier import SCClassifier
from BGClassifier import BGClassifier
from nltk.corpus import twitter_samples, TwitterCorpusReader
from nltk.twitter import Query, TweetViewer, credsfromfile
import os
import pickle
import matplotlib.pyplot as plt
import numpy as np

# settings
oauth = credsfromfile()
client = Query(**oauth)
twtNum = 10
client.register(TweetViewer(limit=twtNum))
tweets_gen = client.search_tweets(keywords='hearthstone', lang='en')
tweets = []
slangdict = process_twt.get_slang_dict()

twt_list = []
for t in tweets_gen:
    twt_list.append(process_twt.preprocess(t['text'], slangdict=slangdict))
twt_list = list(set(twt_list))
for t in twt_list[:twtNum]:
    print(t)

fileIds = twitter_samples.fileids()
root = twitter_samples.root

# read tweet data from corpus
negReader = TwitterCorpusReader(root, fileIds[0])

# export TWITTER="twitter.txt"
from nltk.twitter import Twitter, Query, Streamer, credsfromfile
import json
import pickle
from pprint import pprint

__author__ = 'kongaloosh'

with open('data/investments.json') as data_file:
    # with open('data.json') as data_file:
    oauth = credsfromfile()
    data = json.load(data_file)

tw = Twitter()
client = Query(**oauth)

for i in range(len(data['investments'])):
    if type(dict(data['investments'][i])):
        tweets = client.search_tweets(
            keywords=data['investments'][i]['name'], limit=100)
        tweets = list(tweets)
        data['investments'][i]['tweets'] = tweets

# pickle files must be opened in binary mode
with open('data_pickle.pkl', 'wb') as outfile:
    pickle.dump(data, outfile)

f = pickle.load(open('data_pickle.pkl', 'rb'))
print(f)

        luhn.display_comparison()
        sumbasic.display_comparison()
        res = input("Press 'r' to restart\n")
        if res != 'r':
            restart = False
    elif choice == '2':
        # summarize a twitter topic
        tweet_topic = input("Enter the topic you want a summary for\n")
        # Authenticate and retrieve tweets based on user entered topic
        oauth = credsfromfile()
        client = Query(**oauth)
        client.register(TweetWriter())
        tweets = client.search_tweets(keywords=tweet_topic, limit=100, lang='en')
        tweetSummarizer = TweetSummarizer()
        # clean tweets and store in tweets.csv
        rows = []
        usable_rows = []
        for tweet in tweets:
            rows.append(str(tweet['text']))
        if len(rows) > 0:
            usable_rows = rows.copy()
        for i in range(0, len(rows)):
            rows[i] = clean_tweet(rows[i])
            tweetSummarizer.store_full_tweets(rows[i])
        with open('tweets.csv', 'w', encoding='utf-8') as csvfile:

def get_twiter(self, keywords, limit=10):
    # load OAuth credentials from file and query the REST search API
    oauth = credsfromfile()
    client = Query(**oauth)
    tweets = client.search_tweets(keywords=keywords, limit=limit)
    tweet = next(tweets)
    return tweet

def search():
    oauth = credsfromfile()
    client = Query(**oauth)
    df = pd.read_sql('SELECT URL FROM twitter_users',
                     db.executable.raw_connection())
    users = set([u.replace('https://twitter.com/', '')
                 for u in df['URL'].values])
    terms = set(['@' + u for u in users])

    with open('terms.pkl', 'rb') as f:
        terms = terms.union(pickle.load(f))

    searches = 0
    li_html = 'name={0} created={1} favorited={2} retweeted={3} \
        {4} query={5}'

    for term in terms:
        searches += 1
        row = twitter_searches.find_one(query=term)
        if row is not None:
            if hours_from_now(row['search_date']) < 24:
                continue

        tweets = client.search_tweets(keywords=term + ' python http -RT',
                                      lang='en')
        for t in tweets:
            if int(t['favorite_count']) == 0:
                log.debug('No favorites')
                continue

            text = t['text']
            dt = datetime.strptime(t['created_at'],
                                   '%a %b %d %H:%M:%S %z %Y')
            if hours_from_now(dt) > 24:
                continue
            if core.not_english(text):
                log.debug('Not english: {}'.format(text))
                continue

            log.debug('Searching for {}'.format(term))
            uname = t['user']['screen_name']
            uname_html = '<a href="https://twitter.com/{0}">{0}</a>'

            if uname not in set(users):
                db['twitter_users'].insert(
                    dict(Flag='Recommended',
                         Date=datetime.now(),
                         URL='https://twitter.com/' + uname))

            html = li_html.format(uname_html.format(uname),
                                  t['created_at'],
                                  t['favorite_count'],
                                  t['retweet_count'],
                                  hrefs_from_text(text), term)
            twitter_searches.upsert(dict(query=term,
                                         search_date=datetime.now(),
                                         html=html),
                                    ['query', 'html'])

        if searches == 150:
            break

    print('\n' + candidate['name'])
    print("\n" + subject + " :")
    print("Words counted: " + str(candidate['pro'] + candidate['cons']))
    print("Opinions in favour: " + str(candidate['pro']))
    print("Opinions against: " + str(candidate['cons']))
    print("No opinion: " + str(nb_tweets - (candidate['pro'] + candidate['cons'])))
    print("Favourable ratio: " + str(candidate['pro'] / nb_tweets))
    print("Unfavourable ratio: " + str(candidate['cons'] / nb_tweets))
    if (candidate['pro'] > candidate['cons']):
        print("People are in favour of this candidate")
    elif (candidate['cons'] > candidate['pro']):
        print("People are against this candidate")
    else:
        print("People are divided")
    print('\n\n')


if __name__ == '__main__':
    print("Loading tweets & analyzing ...")
    subject = 'people_tweets'
    for candidate in candidates_tweets:
        tw = Twitter()
        oauth = credsfromfile()
        client = Query(**oauth)
        tweets = client.search_tweets(keywords=candidate['name'], limit=nb_tweets)
        analyze_tweets(candidate, tweets, subject)
        print_results(candidate, subject)

import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import seaborn as sns

# Rest API
from nltk.twitter import Twitter, Query, credsfromfile

tw = Twitter()
# tw.tweets(keywords='LokSabhaElection2019', limit=2)
tw.tweets(keywords='LokSabhaElection2019', stream=False, limit=20)

## Read tweets
totaltweets = 0
oauth = credsfromfile()
client = Query(**oauth)

f = open('E:/temp/twitter.txt', 'w')
tweets = client.search_tweets(keywords='LokSabhaElection2019', limit=10000)
for tweet in tweets:
    print(tweet['text'])
    try:
        f.write(tweet['text'])
        totaltweets += 1
    except Exception:
        pass
f.close()

f = open('E:/temp/twitter.txt', 'a')
oauth = credsfromfile()
client = Query(**oauth)
tweets = client.search_tweets(keywords='Elections2019', limit=10000)
for tweet in tweets:
    print(tweet['text'])

    return {i: data.count(i) for i in data}


# LIVE twitter feed
# ------------------
# get 10 twitter messages with #whatdoyouwant
tw = Twitter()
tw.tweets(keywords='nationalgriduk', stream=False, limit=10)
brand = 'nationalgriduk'

# API keys
# ------------------------
oauth = credsfromfile()
client = Query(**oauth)
tweets = client.search_tweets(keywords=brand, limit=20000)
tweet = next(tweets)
pprint(tweet, depth=1)

# make sure tweets can be encoded
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
# print(x.translate(non_bmp_map))

# Sentiment analysis
# -------------------------------
analyzer = SentimentIntensityAnalyzer()  # vadersentiment object
Data = []
Words = []
Label = []
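
All of the snippets above share the same setup: load OAuth credentials with credsfromfile(), build a Query client, and iterate over search_tweets(). Below is a minimal sketch of that shared pattern, assuming credentials are kept in the credentials.txt format described in NLTK's twitter how-to (app_key, app_secret, oauth_token, oauth_token_secret) inside the directory named by the TWITTER environment variable; adjust the keyword and limit to taste.

# Minimal sketch of the common pattern, not a drop-in for any snippet above.
# Assumes: TWITTER env var points at a directory containing credentials.txt
# with app_key / app_secret / oauth_token / oauth_token_secret entries.
from nltk.twitter import Query, credsfromfile

oauth = credsfromfile()      # read OAuth credentials from credentials.txt
client = Query(**oauth)      # client for the REST (search) API
for tweet in client.search_tweets(keywords='nltk', limit=5):
    # each tweet is a dict in the Twitter JSON format
    print(tweet['text'])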