def get_tweets(word, begin_date, end_date='today', limit=None):
    """:return: (DataFrame of tweet data, raw list of tweet objects)"""
    if end_date == 'today':
        list_of_tweets = twitterscraper.query_tweets(word, limit=limit,
                                                     begindate=begin_date)
    else:
        list_of_tweets = twitterscraper.query_tweets(word, begindate=begin_date,
                                                     enddate=end_date)
    tweets_data = []
    for tweet in list_of_tweets:
        tweet_data = {}
        tweet_data['tweetID'] = tweet.tweet_id
        tweet_data['time'] = tweet.timestamp.strftime("%d-%b-%Y (%H:%M)")
        tweet_data['text'] = tweet.text
        tweet_data['likes'] = tweet.likes
        tweet_data['userID'] = tweet.username
        tweet_data['URL'] = tweet.tweet_url
        tweet_data['timestamp'] = tweet.timestamp
        tweets_data.append(tweet_data)
    tweets_data = pd.DataFrame(tweets_data)
    return tweets_data, list_of_tweets
def main(event, context):
    print("Access twitter data")
    # Query once and reuse the result instead of scraping twice.
    list_of_tweets = query_tweets("Modi", 5)
    # Print the retrieved tweets to the screen:
    for tweet in list_of_tweets:
        print("**************************")
        print(tweet)
def get_tweets(user):
    ed = dt.date.today()
    bd = ed - dt.timedelta(days=1)
    tweets = query_tweets(user, limit=20, begindate=bd)
    if not tweets:
        # Widen the window to the last 20 days if nothing was found yesterday.
        bd = ed - dt.timedelta(days=20)
        tweets = query_tweets(user, limit=20, begindate=bd)
    if not tweets:
        return ["\n"]
    # Keep only whitelisted characters; the brackets must be escaped inside the class.
    return [
        re.sub(r'[^A-Za-z0-9#@ .,$&(){}\[\]]+', ' ', str(i.text))
        for i in tweets
    ]
def load_tweets(d, m, y, country, limit=10):
    flat_tweets1 = query_tweets(country + " coronavirus", limit, dt.date(y, m, d))
    flat_tweets2 = query_tweets(country + " covid", limit, dt.date(y, m, d))
    flat_tweets3 = query_tweets(country + " covid19", limit, dt.date(y, m, d))
    tweets = []
    load_in_tweets(tweets, flat_tweets1)
    load_in_tweets(tweets, flat_tweets2)
    load_in_tweets(tweets, flat_tweets3)
    return tweets
def tw_scraper(hashtag, start_date, end_date):
    '''
    The maximum search rate for the Twitter API is 72,000 tweets per hour.
    The twitterscraper module can scrape without this limitation, but the
    maximum number of tweets per search on my machine is around 13,000.
    This function recursively scrapes tweets for a given hashtag with
    twitterscraper over smaller time periods to avoid those constraints.

    Inputs:
        hashtag: (str) a hashtag keyword
        start_date: (str) start date ("year-month-day")
        end_date: (str) end date ("year-month-day")
    Output:
        df: (pandas DataFrame) tweets in the time period
    '''
    assert len(start_date) == 10, "Wrong start date format, (yyyy-mm-dd)"
    assert len(end_date) == 10, "Wrong end date format, (yyyy-mm-dd)"
    start_ls = list(map(int, start_date.split("-")))
    end_ls = list(map(int, end_date.split("-")))
    assert 1989 < start_ls[0] < 2021, "Wrong start year, range: 1990~2020"
    assert 1989 < end_ls[0] < 2021, "Wrong end year, range: 1990~2020"
    assert 0 < start_ls[1] < 13, "Wrong start month, range: 1~12"
    assert 0 < end_ls[1] < 13, "Wrong end month, range: 1~12"
    assert 0 < start_ls[2] < 32, "Wrong start day, range: 1~31"
    assert 0 < end_ls[2] < 32, "Wrong end day, range: 1~31"
    start_date = dt.date(*start_ls)
    end_date = dt.date(*end_ls)
    query = "#" + hashtag
    if end_date > start_date + dt.timedelta(days=5):
        # Scrape a five-day slice, then recurse on the remaining range
        # (the recursive call expects yyyy-mm-dd strings again).
        med_date = start_date + dt.timedelta(days=5)
        tweets = twitterscraper.query_tweets(query, begindate=start_date,
                                             enddate=med_date)
        df = pd.DataFrame(t.__dict__ for t in tweets)
        rest = tw_scraper(hashtag,
                          (med_date + dt.timedelta(days=1)).isoformat(),
                          end_date.isoformat())
        return pd.concat([df, rest])
    else:
        tweets = twitterscraper.query_tweets(query, begindate=start_date,
                                             enddate=end_date)
        df = pd.DataFrame(t.__dict__ for t in tweets)
        return df
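# Usage sketch for tw_scraper above; the hashtag and the date range are
# illustrative assumptions, not values from the original code. Both dates are
# "yyyy-mm-dd" strings and the hashtag is passed without the leading "#".
example_df = tw_scraper("bitcoin", "2020-01-01", "2020-01-20")
print(example_df.shape)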
def return_data():
    if request.method == 'POST':
        today = dt.date.today()
        yesterday = today - dt.timedelta(days=1)
        begin_date = yesterday
        end_date = today
        limit = 100
        lang = 'en'
        tweets = query_tweets("corona", begindate=begin_date, enddate=end_date,
                              limit=limit, lang=lang)
        df = pd.DataFrame(t.__dict__ for t in tweets)
        texts = df.text.str.strip()
        timestamps = df.timestamp
        # Pick a random tweet; randint is inclusive, so stop at len - 1.
        idx = random.randint(0, len(texts) - 1)
        tweet = texts[idx]
        tweet_vector = vectorize.transform([tweet])
        sent = classifier.predict(tweet_vector).tolist()
        mytext = tweet
        language = 'en'
        myobj = gTTS(text=mytext, lang=language, slow=False)
        myobj.save("audio.mp3")
        return render_template('index.html', tw=tweet, time=timestamps[idx],
                               sent=sent[0])
    else:
        return render_template('index.html')
def harvest_words(keyword_to_harvest, stopwords, limit=2000):
    words = []
    # Encode the keyword so Swedish characters survive the query
    keyword_to_harvest = keyword_to_harvest.encode('utf-8')
    # Query twitter with the "keyword"
    for query in query_tweets(keyword_to_harvest,
                              lang='sv',
                              limit=limit,
                              poolsize=20,
                              begindate=datetime.date(2014, 1, 1)):
        # Split the result to get the words
        new_list = query.text.split(u' ')
        for word in new_list:
            try:
                # Skip stopwords --> not interesting! Removing items from
                # new_list while iterating over it would skip words, so only
                # append the keepers here.
                if word.encode('utf-8').lower() not in stopwords:
                    words.append(word)
            except UnicodeError:
                print(word, " is not unicode")
    return words
def retrieve_tweets():
    retrieved = 0
    for tweet in query_tweets('from:python_tip exclude:replies'):
        res = process_tweet(tweet)
        if res:
            retrieved += 1
    print(f'Stored {retrieved} new tweets')
def tweets_scraper_inner(begin_date, end_date, keyword, limit, lang="en"):
    """
    Using the twitterscraper API.
    Github source: https://github.com/taspinar/twitterscraper
    return: dataframe.
    """
    tweets = query_tweets(keyword,
                          begindate=begin_date,
                          enddate=end_date,
                          limit=limit,
                          lang=lang)
    text, timestamp, likes, retweets, replies = [], [], [], [], []
    for tweet in tweets:
        text.append(tweet.text)
        timestamp.append(tweet.timestamp)
        likes.append(tweet.likes)
        retweets.append(tweet.retweets)
        replies.append(tweet.replies)
    tweets = pd.DataFrame({
        "text": text,
        "timestamp": timestamp,
        "likes": likes,
        "retweets": retweets,
        "replies": replies
    })
    # Don't need the exact h-m-s, cast it to date object.
    tweets['timestamp'] = tweets['timestamp'].apply(lambda x: str(x.date()))
    return tweets
def scrapyTweets(keywords, book_name_xls, sheet_name_xls, datanumber, startdate):
    start = time()
    totalNum = 0
    data = []
    value_title = [
        [
            "username", "fullname", "user_id", "tweet_id", "tweet_url", "text",
            "timestamp", "replies", "retweets", "is_retweet",
            "retweeter_username", "retweet_id"
        ],
    ]
    write_excel_xls(book_name_xls, sheet_name_xls, value_title)
    for tweet in query_tweets(keywords, datanumber)[:datanumber]:
        tempList = []
        if checkTime(str(tweet.timestamp), startdate):
            totalNum = totalNum + 1
            appendToList(tweet, tempList)
            data.append(tempList)
            # Flush to the workbook in batches to keep memory use bounded.
            if len(data) > 5000:
                write_excel_xls_append(book_name_xls, data)
                data = []
    write_excel_xls_append(book_name_xls, data)
    end = time()
    totalTime = end - start
    return totalNum, totalTime
def twscrape_search(query, *, count=None, begindate=datetime.date(2006, 3, 21),
                    enddate=datetime.date.today(), poolsize=20, lang=''):
    result = query_tweets(query,
                          limit=count,
                          begindate=begindate,
                          enddate=enddate,
                          poolsize=poolsize,
                          lang=lang)
    data = ({
        'id': item.id,
        'text': merge_whitespaces(item.text),
        'timestamp': item.timestamp,
        'likes': item.likes,
        'retweets': item.retweets,
        'replies': item.replies,
        'url': item.url,
        'html': merge_whitespaces(item.html),
        'user': item.user,
        'fullname': merge_whitespaces(item.fullname)
    } for item in result)
    return sorted(data, key=lambda x: int(x['id']), reverse=True)
def main():
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
    try:
        parser = ArgumentParser(description=__doc__)
        parser.add_argument("query", type=str, help="Advanced twitter query")
        parser.add_argument("-o", "--output", type=str, default="tweets.json",
                            help="Path to a JSON file to store the gathered "
                                 "tweets to.")
        parser.add_argument("-l", "--limit", type=int, default=None,
                            help="Number of minimum tweets to gather.")
        parser.add_argument("-a", "--all", action='store_true',
                            help="Set this flag if you want to get all tweets "
                                 "in the history of twitter. This may take a "
                                 "while but also activates parallel tweet "
                                 "gathering. The number of tweets however, "
                                 "will be capped at around 100000 per 10 "
                                 "days.")
        args = parser.parse_args()

        if isfile(args.output):
            logging.error("Output file already exists! Aborting.")
            exit(-1)

        if args.all:
            tweets = query_all_tweets(args.query)
        else:
            tweets = query_tweets(args.query, args.limit)

        with open(args.output, "w") as output:
            dump(tweets, output, cls=JSONEncoder)
    except KeyboardInterrupt:
        logging.info("Program interrupted by user. Quitting...")
def run():
    data = load_data()
    if not data:
        download_s3()
        data = load_data()
    if not data:
        raise Exception('something went horribly wrong')

    print('fetching tweets from Twitter')
    scraper_tweets = query_tweets(TWITTER_QUERY, 100)
    new_tweets_by_id = {
        st.id: st
        for st in [SimpleTweet.from_scraper(rt) for rt in scraper_tweets]
    }
    old_count = len(data['tweets_by_id'])
    data['tweets_by_id'].update(new_tweets_by_id)
    new_count = len(data['tweets_by_id'])
    print('updates: %d' % (new_count - old_count))
    if new_count == old_count:
        print('nothing new, skipping save')
    else:
        print('saving tweets locally')
        out = {
            'blacklist': data['blacklist'],
            # list() keeps this JSON-serialisable on Python 3 as well.
            'tweets': list(data['tweets_by_id'].values()),
        }
        with open(DATA_PATH, 'w') as output:
            json.dump(out, output)
        upload_s3()
def twitterscraperSearch():
    with open("Alost_user.ndjson", "a", newline='') as output:
        list_of_tweets = ts.query_tweets(
            "Aalst (from:HLN_BE) until:2020-03-01 since:2020-02-14", 30)
        for tweet in list_of_tweets:
            json_data = json.dumps({
                "tweet_id": int(tweet.tweet_id),
                "screen_name": tweet.screen_name,
                "user_id": int(tweet.user_id),
                # str(): otherwise this stays a datetime, which json cannot serialise
                "created_at": str(tweet.timestamp),
                "entities.hashtags": [{"text": x} for x in tweet.hashtags],
                "entities.user_mentions": [],
                "lang": getLang(tweet.text_html),
                "full_text": remove_html_tags(tweet.text_html)
            })
            output.write(json_data + "\n")
def top_results(ipstr):
    two_months = dt.timedelta(days=60)
    end_date = dt.date.today()
    begin_date = end_date - two_months
    limit = 400
    lang = "en"  # twitterscraper expects an ISO language code
    tweets = query_tweets(ipstr,
                          begindate=begin_date,
                          enddate=end_date,
                          limit=limit,
                          lang=lang)
    df = pd.DataFrame(t.__dict__ for t in tweets)
    # removing unwanted columns
    df.drop(df.columns[[1, 2, 3, 4, 6, 8, 9, 10, 11, 12, 13, 17, 18, 19, 20]],
            axis=1, inplace=True)
    return df
def scrape_twitter(track_list, limit, poolsize, begindate, enddate, loc_near, radius):
    """
    Parameters
    ----------
    track_list : list of strings
    limit : integer
    poolsize : integer
        Number of parallel processes TwitterScraper should initiate while
        scraping for your query
    begindate : date object
    enddate : date object
    loc_near : string
        The location from which the tweets should come. This has to be a city
        name followed by the abbreviated state, of the form "The City,ST"
        ex) "Grand Junction,CO" or "New York,NY" or "Colorado Springs,CO"
    radius : int
        The radius (in miles) around the location from which the tweets
        should come

    Returns
    -------
    tweets : list of twitterscraper.tweet.Tweet objects
    """
    tweets = []
    string_query = make_string_query(track_list=track_list,
                                     loc_near=loc_near,
                                     loc_within_mi=radius)
    for tweet in query_tweets(query=string_query,
                              limit=limit,
                              poolsize=poolsize,
                              begindate=begindate,
                              enddate=enddate,
                              lang="en"):
        tweets.append(tweet)
    print("Number of Tweets found for " + loc_near + ": ", len(tweets))
    return tweets
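# Usage sketch for scrape_twitter above; every value here is illustrative.
# loc_near must follow the "City,ST" form described in the docstring, and
# radius is given in miles.
import datetime as dt

example_tweets = scrape_twitter(track_list=["flu", "cough"],
                                limit=200,
                                poolsize=20,
                                begindate=dt.date(2020, 3, 1),
                                enddate=dt.date(2020, 3, 7),
                                loc_near="Grand Junction,CO",
                                radius=50)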
def get_tweet(app):
    return query_tweets(app,
                        limit=None,
                        begindate=dt.date.today() - dt.timedelta(days=1),
                        enddate=dt.date.today(),
                        poolsize=20,
                        lang='en')
def collectData(self):
    from twitterscraper import query_tweets
    import datetime as dt
    import json

    # Build query string with all marker variations specified
    query_str = ""
    for marker in self.markers:
        query_str += marker + " "

    # Collect data
    tweets = query_tweets(query_str,
                          begindate=self.start_time,
                          enddate=self.end_time,
                          limit=self.limit,
                          lang=self.lang)

    # Convert data into a format that is JSON-serialisable
    list_of_json_tweets = []
    for tweet in tweets:
        tweet_time = tweet.timestamp
        tweet.timestamp = tweet_time.strftime('%Y-%m-%d %H:%M:%S')
        list_of_json_tweets.append(vars(tweet))

    # Write output as comma-separated JSON objects
    file_name = self.start_time.strftime("%Y-%m-%d")
    last_index = len(list_of_json_tweets) - 1
    with open(self.destination + file_name + ".json", "w") as json_file:
        for index, item in enumerate(list_of_json_tweets):
            json.dump(item, json_file)
            if index != last_index:
                json_file.write(",")
def scrapingTweets(since, until):
    """
    Scrapes Tweets within a date range and writes them to a csv file.
    Scrapes Tweet text, Tweet ID, and Timestamp.

    :param since: When you want to start. Should be in YYYY-MM-DD format.
    :param until: When you want to stop. Should be in YYYY-MM-DD format.
    :return: Filename of csv with tweets scraped
    """
    # Upper bound and lower bound of queries. Sometimes it is necessary to
    # rescrape certain dates because twitter will start blocking your queries.
    startDate = dt.datetime.strptime(since, '%Y-%m-%d').date()
    endDate = dt.datetime.strptime(until, '%Y-%m-%d').date()
    scraped_tweets_filename = ("Coachella" + startDate.strftime("%Y-%m-%d") +
                               "_" + endDate.strftime("%Y-%m-%d"))

    # Queries tweets and writes each tweet to file
    collected_tweets = twitterscraper.query_tweets("Coachella",
                                                   limit=None,
                                                   begindate=startDate,
                                                   enddate=endDate)
    with open(scraped_tweets_filename, 'w') as file:
        tweet_writer = csv.writer(file, delimiter=';', quoting=csv.QUOTE_ALL)
        for tweet in collected_tweets:
            tweet_writer.writerow([
                tweet.id,
                tweet.timestamp.strftime("%m/%d/%Y %H:%M:%S"),
                tweet.text.replace('\n', ' '),
                tweet.user
            ])
    return scraped_tweets_filename
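# Usage sketch for scrapingTweets above; the one-week range is illustrative.
# Both arguments are "YYYY-MM-DD" strings, as the docstring requires.
csv_name = scrapingTweets("2017-04-10", "2017-04-17")
print(csv_name)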
def scrape(start_date, end_date, keyword="Bitcoin"):
    for day in rrule(DAILY, dtstart=start_date, until=end_date):
        df = pd.DataFrame(columns=[
            'ID', 'Tweet', 'Time', 'User', 'Likes', 'Replies', 'Retweet'
        ])
        yesterday = day - timedelta(days=1)
        tomorrow = day + timedelta(days=1)
        begin = yesterday.date()
        end = tomorrow.date()
        print("{} {}".format(begin, end))
        list_of_tweets = query_tweets(keyword, 1000, begindate=begin, enddate=end)
        for tweet in list_of_tweets:
            df = df.append(
                {
                    'ID': tweet.id,
                    'Tweet': tweet.text,
                    'Time': tweet.timestamp,
                    'User': tweet.user,
                    'Likes': tweet.likes,
                    'Replies': tweet.replies,
                    'Retweet': tweet.retweets
                },
                ignore_index=True)
        # index=False belongs to to_csv, not to str.format.
        df.to_csv("{}\\extracted\\{}.csv".format(keyword.lower(),
                                                 day.strftime('%Y-%m-%d')),
                  index=False)
def recoleccion(self):
    from twitterscraper import query_tweets

    # Start collection with the language selected by the user
    print("=== TWEETSCRAPING COLLECTION STARTED")
    list_of_tweets = query_tweets(query=self.palabra,
                                  limit=self.limite,
                                  begindate=self.fecha_inicio,
                                  enddate=self.fecha_final,
                                  lang=self.idioma,
                                  poolsize=self.bins)
    # Turn the collected tweets into a list of dictionaries
    print("Converting collected tweets to dictionaries...")
    tweets = self.get_tweets_info(list_of_tweets)
    # Build the DataFrame
    dataframe = pd.DataFrame(tweets)
    # Keep only the columns of interest
    tweets_df = dataframe[["username", "timestamp", "text"]]
    # Standardise the column names
    tweets_df = tweets_df.rename(columns={
        "username": "******",
        "timestamp": "Date",
        "text": "Text"
    })
    # Delete the previous file so it is rewritten
    if self.ruta_extraccion.exists():
        os.remove(self.ruta_extraccion)
    # Export the extraction to CSV
    print("Exporting CSV: extraccion_tweets.csv")
    tweets_df.to_csv(self.ruta_extraccion, index=None, header=True)
def collect_data(self):
    start_time = time.time()
    print("INFO: starting to gather tweets...")
    # get list of tweets from twitterscraper API
    list_of_tweets = query_tweets(query='',
                                  limit=2000,
                                  begindate=datetime.date(2017, 2, 1),
                                  lang='en')
    print("INFO: -----TOTAL AMOUNT OF TWEETS: {length}-----".format(
        length=len(list_of_tweets)))
    print("INFO: execution time for gathering of tweets: {time} seconds".format(
        time=time.time() - start_time))
    print('')
    count = 0
    print('INFO: *****ADDING NEW TWEETS*****')
    for tweet in list_of_tweets:
        if tweet.id not in self.idtotweet:
            self.idtotweet[tweet.id] = tweet.text
            count += 1
    print('INFO: {cnt} new tweets added to the corpus!'.format(cnt=count))
    with open('read_from.json', 'w') as fw:
        json.dump(self.idtotweet, fw)
    for key in self.idtotweet:
        self.corpus.append(self.idtotweet[key])
    print('INFO: new length of permanent corpus: {twts}'.format(
        twts=len(self.idtotweet)))
def get_name(request):
    tweetList = []
    # if this is a POST request we need to process the form data
    if request.method == 'POST':
        # create a form instance and populate it with data from the request:
        form = NameForm(request.POST)
        # check whether it's valid:
        if form.is_valid():
            # process the data in form.cleaned_data as required
            y_name = form.cleaned_data['your_name']
            for tweet in query_tweets("from:%s" % y_name, 7)[:7]:
                print(tweet.text.encode('utf-8'))
                tweetList.append(unicode(tweet.text).replace("'", ""))
            scoreboard(tweetList)
            with open('test_sentences.csv', 'rb') as f:
                reader = csv.reader(f)
                your_list = list(reader)
            deepmojiTokenizer()
            hateSpeechTokenizer()
            average = wrap()
            # return HttpResponseRedirect('/thanks/')
            return render(request, 'name.html', {'form': average})
    # if a GET (or any other method) we'll create a blank form
    else:
        form = NameForm()
    return render(request, 'name.html', {'form': form})
def twload(ipstr):
    ytload(ipstr)
    conn = create_connection()
    create_tabletw(conn)
    two_months = dt.timedelta(days=60)
    end_date = dt.date.today()
    begin_date = end_date - two_months
    limit = 500
    lang = "en"  # twitterscraper expects an ISO language code
    tweets = query_tweets(ipstr,
                          begindate=begin_date,
                          enddate=end_date,
                          limit=limit,
                          lang=lang)
    df = pd.DataFrame(t.__dict__ for t in tweets)
    df.sort_values(by=['likes', 'retweets', 'replies'],
                   inplace=True, ascending=False)
    # removing unwanted columns
    df.drop(df.columns[[3, 4, 6, 8, 9, 10, 11, 12, 13, 17, 18, 19, 20]],
            axis=1, inplace=True)
    df = df[df.likes >= 0]
    df.set_index('username', inplace=True)
    df.insert(0, 'id', range(0, len(df)))
    df = analyse_sentiment(df, "text")
    df.to_sql('twsentiment', conn, if_exists='replace')
def find_bazarki():
    """
    Scrapes Twitter for bazarki-like tweets and returns a list of them
    as dicts with the tweet text and URL.
    """
    list_of_tweets = query_tweets(
        "#wybory2019 OR #WyboryParlamentarne2019 #bazarek",
        1,
        begindate=date(2019, 10, 11),
        enddate=date(2019, 10, 14),
    )
    bazarki = []
    # Keep only tweets that mention "pis" followed by a number.
    for tweet in list_of_tweets:
        regex_tester = r"pis.+\s[0-9]+"
        match = re.search(regex_tester, tweet.text, flags=re.I)
        if match:
            bazarki.append({
                "text": tweet.text,
                "url": "https://twitter.com" + tweet.tweet_url
            })
    print(f"found {len(bazarki)} bazarki")
    return bazarki
def query_and_output(query, output_name, limit=None, begindate=dt.date(2006, 3, 21),
                     enddate=dt.date.today(), poolsize=20, lang=''):
    # perform query
    tweets = query_tweets(query, limit, begindate, enddate, poolsize, lang)

    # construct csv string
    output = None
    if tweets:
        si = StringIO()
        cw = csv.writer(si)
        cw.writerow([
            "timestamp", "user", "fullname", "text", "hashtags", "id", "url",
            "retweets", "favorites", "replies"
        ])
        for x in tweets:
            # parse text for hashtags
            tag_set = set(re.findall(r'\#\w+', x.text))
            tag_values = " ".join(tag_set)
            # add row for tweet in csv
            cw.writerow([
                x.timestamp, x.user, x.fullname, x.text, tag_values, x.id,
                x.url, x.retweets, x.likes, x.replies
            ])
        output = make_response(si.getvalue())
        output.headers['Content-Disposition'] = 'attachment; filename=export.csv'
        output.headers['Content-type'] = 'text/csv'
    return output
def tweets_search(request):
    if request.method == 'POST':
        key_groups = []
        username = request.POST.get('username')
        word_group = request.POST.get('group').split('&')
        for group in word_group:
            key_groups.append(group.split(','))
        start_date = request.POST.get('startDate').split('-')
        end_date = request.POST.get('endDate').split('-')
        keywords = request.POST.get('words').split(',')
        # Reverse the date components from the form into ISO YYYY-MM-DD strings
        begin_date = f'{start_date[2]}-{start_date[1]}-{start_date[0]}'
        end_date = f'{end_date[2]}-{end_date[1]}-{end_date[0]}'
        tweets = []
        for tweet in query_tweets(
                query=f"{query_constructor(keywords, key_groups)} from:{username}",
                limit=40,
                poolsize=1,
                begindate=date.fromisoformat(begin_date),
                enddate=date.fromisoformat(end_date)):
            tweets.append(tweet.text)
        return render(request, 'portal/tweets_list.html', {'tweets': tweets})
    else:
        return render(request, 'portal/tweets_search.html')
def getTweets(self, query, begin_date, end_date, limit=None, n_jobs=2):
    """A function that takes a query and two dates and scrapes all tweets
    from Twitter that satisfy the given criteria.

    Args:
        query (str): The search string.
        begin_date (str): The start date from which tweets are scraped.
        end_date (str): The end date up to which tweets are scraped.
        limit (int): A threshold on the number of tweets downloaded.
        n_jobs (int): The degree of parallelism.

    Returns:
        text_list (list): A list of (date, text) tuples for the scraped tweets.
    """
    # Date format is DD-MM-YYYY (string)
    twitterscraper.query.HEADER = {
        'User-Agent': random.choice(twitterscraper.query.HEADERS_LIST)
    }
    b_day, b_month, b_year = begin_date.split('-')
    e_day, e_month, e_year = end_date.split('-')
    tweets = query_tweets(query,
                          begindate=dt.date(int(b_year), int(b_month), int(b_day)),
                          enddate=dt.date(int(e_year), int(e_month), int(e_day)),
                          limit=limit,
                          lang='en',
                          poolsize=n_jobs)
    text_list = [(str(t.timestamp.strftime('%d-%m-%Y')),
                  t.text.strip().replace('\n', ' ')) for t in tweets]
    return text_list
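# Usage sketch for getTweets above; since the method body never touches `self`,
# it is called here as a plain function with None in its place. The query and
# the DD-MM-YYYY dates are illustrative assumptions.
rows = getTweets(None, "bitcoin", "01-03-2020", "07-03-2020", limit=500, n_jobs=4)
print(rows[:3])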
def twitter_scraper(query, quantity=10, outfile='twitter.json'):
    '''
    @query, may contain the following patterns:

        'happy hour'                 -> tweets containing the exact phrase 'happy hour'.
        'love OR hate'               -> tweets containing either 'love' or 'hate' (or both).
        'beer -root'                 -> tweets containing 'beer' but not 'root'.
        '#haiku'                     -> tweets containing the hashtag 'haiku'.
        'from:alexiskold'            -> tweets sent from person 'alexiskold'.
        'to:techcrunch'              -> tweets sent to person 'techcrunch'.
        '@mashable'                  -> tweets referencing person 'mashable'.
        '"happy hour" near:"san francisco"' -> tweets containing the exact phrase 'happy hour' and sent near 'san francisco'.
        'near:NYC within:15mi'       -> tweets sent within 15 miles of 'NYC'.
        'superhero since:2010-12-27' -> tweets containing 'superhero' and sent since date '2010-12-27' (year-month-day).
        'ftw until:2010-12-27'       -> tweets containing 'ftw' and sent up to date '2010-12-27'.
        'movie -scary :)'            -> tweets containing 'movie', but not 'scary', and with a positive attitude.
        'flight :('                  -> tweets containing 'flight' and with a negative attitude.
        'traffic ?'                  -> tweets containing 'traffic' and asking a question.
        'hilarious filter:links'     -> tweets containing 'hilarious' and linking to URLs.
        'news source:twitterfeed'    -> tweets containing 'news' and entered via TwitterFeed.

    Note: any advanced query supported by
    https://twitter.com/search-advanced?lang=en can be passed to the
    'query_tweets' function.
    '''
    # local variables
    tweets = []

    # structure tweets
    for tweet in query_tweets(query, int(quantity)):
        tweets.append({
            'text': tweet.text,
            'likes': tweet.likes,
            'retweets': tweet.retweets,
            'replies': tweet.replies,
            'user': tweet.user,
            'timestamp': str(tweet.timestamp)
        })

    # write to file
    if len(tweets):
        with open(outfile, 'w') as file:
            json.dump(tweets, file, indent=4)
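# Usage sketch for twitter_scraper above; the query reuses one of the patterns
# documented in its docstring, and the output filename is an arbitrary choice.
twitter_scraper("superhero since:2010-12-27", quantity=25,
                outfile="superhero_tweets.json")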
def request_twitter(company, start_date, end_date):
    query = company + ' -filter:retweets -filter:replies'
    delta = dt.timedelta(days=7)
    twts_raw = pd.DataFrame()
    twts_df = pd.DataFrame()
    EndDT = start_date + delta
    BeginDT = start_date
    while EndDT <= end_date:
        tweets = query_tweets(query, limit=None, begindate=BeginDT,
                              enddate=EndDT, lang='en')
        tweets_df = pd.DataFrame(t.__dict__ for t in tweets)
        twts_df = twts_df.append(tweets_df)
        # Advance the one-week window until the requested end date is reached.
        if (end_date - EndDT) > delta:
            BeginDT = BeginDT + delta
            EndDT = EndDT + delta
        else:
            BeginDT = BeginDT + delta
            EndDT = end_date
            if BeginDT > EndDT:
                break
    twts_raw = twts_df
    twts_df = twts_df.drop_duplicates(subset=['timestamp'])
    twts_df = twts_df.set_index('timestamp')
    twts_df = twts_df.sort_index()
    return twts_df, twts_raw
def top_results(ipstr):
    two_months = dt.timedelta(days=60)
    end_date = dt.date.today()
    begin_date = end_date - two_months
    limit = 100
    lang = "en"  # twitterscraper expects an ISO language code
    tweets = query_tweets(ipstr,
                          begindate=begin_date,
                          enddate=end_date,
                          limit=limit,
                          lang=lang)
    df = pd.DataFrame(t.__dict__ for t in tweets)
    df.sort_values(by=['likes', 'retweets', 'replies'],
                   inplace=True, ascending=False)
    df.drop(df.columns[[2, 3, 5, 6, 8, 11, 13, 17, 18, 19, 20]],
            axis=1, inplace=True)
    df = df[df.likes >= 30]
    df.set_index('username', inplace=True)
    return df
# In[2]:
from twitterscraper import query_tweets
import re
import nltk
from string import digits
from datetime import datetime

# In[146]:
time = []
# docker stores the results of query_tweets for one search
docker = []
for tweet in query_tweets("bitcoin", 20)[:40]:
    docker.append(tweet)
    text = tweet.text.encode('utf-8').decode('utf-8')
    # Strip http links (\S+ so the whole URL is removed, not just whitespace)
    text = re.sub(r"http\S+", "", text)
    testdata = text.encode('utf-8')
    time.append(tweet.timestamp)

# In[3]:
docker[0].timestamp

# In[4]:
time
# docker[0].timestamp = time[0] ?
# In[251]:
from twitterscraper import query_tweets
import re
import nltk
from nltk.corpus import stopwords
from string import digits
import string
import snowballstemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer, TfidfTransformer

# In[247]:
textData = []
for tweet in query_tweets("bitcoin%20since%3A2017-09-01%20until%3A2017-09-13", 10)[:40]:
    text = tweet.text.encode('utf-8').decode('utf-8')
    # remove all http links included in the tweet
    text = re.sub(r"http\S+", "", text)
    # remove all digits included in the tweet
    remove_digits = str.maketrans('', '', digits)
    text = text.translate(remove_digits)
    textData.append(text)
    # print(tweet.text.encode('utf-8'))

# In[248]:
def preprocess_document(data):
##### Scrape tweets from Twitter to classify
## Example:
# 1) Scrape tweets from Twitter that have #bucs, #buccaneers, or #siegetheday in their text and are in English
# 2) Save these tweets as a row to a .csv file
import csv
import twitterscraper

with open('nflbucs.csv', 'a', newline='', encoding='utf-8') as fil:
    writer = csv.writer(fil)
    for tweet in twitterscraper.query_tweets(
            "%23bucs%20OR%20%23buccaneers%20OR%20%23siegetheday%20lang%3Aen%20include%3Aretweets",
            1000):
        writer.writerow(tweet)

#### Train classifier based on tweet data
# 0) Load data and setup
from nltk.corpus import twitter_samples

## take a sample of data
twitter_samples.strings('positive_tweets.json')[1]
twitter_samples.strings('negative_tweets.json')[1]

## create function word_feats() to turn a list of words into a feature dictionary
def word_feats(words):
    return dict([(word, True) for word in words])

# 1) a) Tokenize tweets from sample data
#    b) Use word_feats() to create a dictionary out of the tokenized words
#    c) Create list variables of positive and negative features using the dictionary from (b) and append 'pos' or 'neg'
import nltk

posfeats = [(word_feats(nltk.TweetTokenizer(preserve_case=False).tokenize(row)), 'pos')
            for row in twitter_samples.strings('positive_tweets.json')]
len(posfeats)  # check length - equivalent to number of tweets
negfeats = [(word_feats(nltk.TweetTokenizer(preserve_case=False).tokenize(row)), 'neg')
            for row in twitter_samples.strings('negative_tweets.json')]