def obtain_tweet_by_user(username, NUM_TWEETS, label, training=1, NUM_TWEETS_TRAINING=0):
    # Collect tweets from a single user and attach a label for classification.
    tweets_list1 = []
    if training:
        # Training mode: take the first NUM_TWEETS tweets.
        for i, tweet in enumerate(
                sntwitter.TwitterSearchScraper('from:' + username).get_items()):
            if i >= NUM_TWEETS:  # was `>`, which collected one extra tweet
                break
            tweets_list1.append([tweet.content, label])
    else:
        # Evaluation mode: skip the tweets already used for training, then
        # take the next NUM_TWEETS. The original had no upper bound here and
        # would scrape the user's entire remaining history.
        for i, tweet in enumerate(
                sntwitter.TwitterSearchScraper('from:' + username).get_items()):
            if i < NUM_TWEETS_TRAINING:
                continue
            if i >= NUM_TWEETS_TRAINING + NUM_TWEETS:
                break
            tweets_list1.append([tweet.content, label])
    # Creating a dataframe from the tweets list above
    tweets_df1 = pd.DataFrame(tweets_list1, columns=['Text', 'Label'])
    tweets_df1.to_csv(username + str(training) + ".csv")
    return tweets_df1
def scrape_tweets(*, start: str, end: str, num_tweets: int, terms: list[str], target: tuple):
    # snscrape takes the whole query as a single string, so the language and
    # date filters must be concatenated into it rather than passed as a
    # second argument (which TwitterSearchScraper does not accept).
    query = " ".join(terms) + f" lang:en since:{start} until:{end} -filter:replies"
    scraper = twt.TwitterSearchScraper(query)
    Row = namedtuple("Row", [
        "tweetId", "date", "user", "url", "contents", "weight", "pos", "neu", "neg"
    ])
    # Pick the output sink we're writing to.
    if target[0] == "psql":
        output = writer.PSQLWriter(db_config=target[1])
    else:
        output = writer.CSVWriter(file_name=target[1],
                                  out_directory=target[2],
                                  column_names=Row._fields)
    for i, tweet in enumerate(scraper.get_items()):
        if i >= num_tweets:  # respect the caller's limit, not a global MAX_TWEETS
            break
        weight = classify.naive_weight(tweet.content)
        pos, neu, neg = classify.vader(tweet.content)
        output.append(
            Row(tweet.id, tweet.date, tweet.user, tweet.url, tweet.content,
                weight, pos, neu, neg))
    output.stop()
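# The `classify` module used above is external to this snippet. Below is a
# minimal sketch of what it might look like, assuming `vader` wraps the
# vaderSentiment package (whose polarity_scores dict does expose the
# pos/neu/neg keys used above); `naive_weight` is a hypothetical keyword
# heuristic, not the original implementation.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

_analyzer = SentimentIntensityAnalyzer()

def vader(text: str) -> tuple[float, float, float]:
    # polarity_scores returns a dict with 'neg', 'neu', 'pos', 'compound'.
    scores = _analyzer.polarity_scores(text)
    return scores["pos"], scores["neu"], scores["neg"]

def naive_weight(text: str) -> float:
    # Placeholder heuristic: net count of positive vs. negative cue words.
    positive, negative = {"good", "great", "bull"}, {"bad", "terrible", "bear"}
    words = text.lower().split()
    return sum(w in positive for w in words) - sum(w in negative for w in words)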
def cmd_twitter(phrase, metadata, session):
    # phrase has two comma-separated parts: the search mode
    # ("keyword" or "username") and the search argument.
    try:
        search_by = phrase.split(", ")[0]
        search_arg = phrase.split(", ")[1]
        todays_date_str = datetime.today().strftime('%Y-%m-%d')
        yesterdays_date = datetime.now() - timedelta(1)
        yesterdays_date_str = datetime.strftime(yesterdays_date, '%Y-%m-%d')
        if search_by == "keyword":
            scrape_crit = f"{search_arg} since:{yesterdays_date_str} until:{todays_date_str}"
            body_text = f"Tweets about {search_arg}:\n\n"
        elif search_by == "username":
            scrape_crit = f"from:{search_arg}"
            body_text = f"Tweets by {search_arg}:\n\n"
        else:
            raise ValueError(f"unknown search mode: {search_by}")
        # Using TwitterSearchScraper to scrape up to ten tweets and append
        # them to the reply body.
        for i, tweet in enumerate(
                sntwitter.TwitterSearchScraper(scrape_crit).get_items()):
            if i > 9:
                break
            # Was "%Y-%d-%m", which swapped day and month.
            date_str = tweet.date.strftime("%Y-%m-%d")
            body_text += f"Name: {tweet.user.displayname} ({tweet.user.username})\n"
            body_text += f"Date: {date_str}\n"
            body_text += f"Tweet: {tweet.content}\n\n"
    except Exception as e:
        body_text = "Error: " + str(e)
    print(body_text)
    return body_text
def get_tweets(keywords, save_dir=SAVEDIR_NEW, maxTweets=100):
    # Note: the `keywords` parameter is overwritten per label below; it is
    # kept only for signature compatibility.
    df_keywords = pd.read_csv('keywords.csv')
    # 53 boundaries yield 52 full weekly [start, stop) windows; the original
    # periods=52 raised an IndexError on the final week.
    dates = pd.date_range('1/1/2020', periods=53, freq='W')
    for week_idx in range(len(dates) - 1):
        date = dates[week_idx]
        for label in df_keywords.columns:
            keywords = df_keywords[label]
            ss = label.replace('/', '-')
            savestr = os.path.join(save_dir, f'{ss}-{date}.json')
            start = f'{dates[week_idx]}'[:10]
            stop = f'{dates[week_idx + 1]}'[:10]
            # `datestr` was undefined in the original print.
            print(f'{start}: Fetching tweets in range {start} - {stop} '
                  f'for keywords: {keywords}')
            tweets = []
            query = " OR ".join(keywords) + " lang:de" + ' since:' + start + " until:" + stop
            # Using TwitterSearchScraper to scrape data and append tweets to list
            for i, tweet in enumerate(
                    sntwitter.TwitterSearchScraper(query).get_items()):
                if i >= maxTweets:
                    break
                tweets.append(tweet.__dict__)
            print(f'Found {len(tweets)} tweets')
            if len(tweets) > 0:
                pd.DataFrame(tweets).to_json(savestr, orient='records', lines=True)
def twitter_scraper(tw_id, since="2020-01-01", to="2020-02-01"):
    """
    Scrape tweets from a given account.

    Args:
        tw_id (str): the twitter account id
        since (str, optional): from date. Defaults to "2020-01-01".
        to (str, optional): to date. Defaults to "2020-02-01".

    Returns:
        pd.DataFrame: tweet contents and dates, indexed by timestamp,
        in chronological order.
    """
    # initialize
    tweet_time = []
    tweet_dates = []
    tweets_content = []
    # scrape from twitter
    the_query = "from:" + tw_id + " since:" + since + " until:" + to
    for tweet in tw.TwitterSearchScraper(query=the_query).get_items():
        tweet_dates.append(tweet.date.strftime("%Y-%m-%d"))
        tweet_time.append(tweet.date.strftime("%Y-%m-%d %H:%M:%S"))
        tweets_content.append(tweet.content)
    # convert to dataframe; results arrive newest-first, so reverse them into
    # chronological order (cf. https://stackoverflow.com/questions/20444087/right-way-to-reverse-pandas-dataframe)
    tweets = pd.DataFrame(
        {"Time": tweet_time, "Content": tweets_content, "Date": tweet_dates}
    ).set_index("Time")
    tweets = tweets.iloc[::-1]
    return tweets
def scrape(query, date):
    date = pd.Timestamp(date)
    # Note: `until` is exclusive and this adds two days, so each file in fact
    # covers a two-day window; use days=1 for a strict single-day scrape.
    next_day = date + pd.Timedelta(days=2)
    scraper = sntwitter.TwitterSearchScraper(
        f'{query} since:{date.strftime("%Y-%m-%d")} until:{next_day.strftime("%Y-%m-%d")}'
    )
    fname = f'./tweets{query}/{date.year}/{date.month:02}/{date.strftime("%Y-%m-%d")}-tweets-{query}-compressed.csv'
    start = pd.Timestamp.now()
    print(
        f'Date: {date.strftime("%Y-%m-%d")} | Beg: {start.strftime("%Y-%m-%d %H:%M:%S")}'
    )
    # Flush to disk in chunks of 1000 tweets; only the first chunk writes the
    # header, and later chunks append.
    tweets, header, mode = [], True, 'w'
    for i, tweet in enumerate(scraper.get_items()):
        tweets.append(tweet)
        if i % 1000 == 999:
            to_df(tweets, date).to_csv(fname, index=False, header=header,
                                       mode=mode, compression='gzip')
            tweets, header, mode = [], False, 'a'
    # Write the final partial chunk, if any.
    if tweets:
        to_df(tweets, date).to_csv(fname, index=False, header=header,
                                   mode=mode, compression='gzip')
    final = pd.Timestamp.now()
    print(
        f'Date: {date.strftime("%Y-%m-%d")} | End: {final.strftime("%Y-%m-%d %H:%M:%S")}'
    )
    print(
        f'Date: {date.strftime("%Y-%m-%d")} | Dur: {(final - start).total_seconds()}s'
    )
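# scrape() relies on a `to_df` helper that is not shown. A minimal sketch,
# assuming it flattens snscrape Tweet objects into a dataframe; the column
# choice here is an assumption, not the original implementation.
import pandas as pd

def to_df(tweets, date):
    # Hypothetical: one row per tweet, tagged with the scrape day.
    rows = [{
        "id": t.id,
        "date": t.date,
        "user": t.user.username,
        "content": t.content,
        "scrape_day": date.strftime("%Y-%m-%d"),
    } for t in tweets]
    return pd.DataFrame(rows)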
def authenticate():
    # Despite its name, this function scrapes tweets: the original tweepy
    # authentication code was commented out, and snscrape needs no auth.
    maxTweets = 1000  # the number of tweets you require
    # Create a file in which to store the data; newline='' avoids blank rows
    # on Windows, and the context manager closes the file.
    with open('15-10-2020.csv', 'a', newline='') as csvFile:
        csvWriter = csv.writer(csvFile)
        for i, tweet in enumerate(
                sntwitter.TwitterSearchScraper('from:AOC').get_items()):
            if i >= maxTweets:  # was `>`, which collected one extra tweet
                break
            print(tweet.date)
            csvWriter.writerow([tweet.date, tweet.content])
def load_tweets(q_str, max_tweets=100):
    # Creating list to append tweet data to
    tweets_list = []
    # Using TwitterSearchScraper to scrape data and append tweets to list
    for idx, tweet in enumerate(
            sntwitter.TwitterSearchScraper(q_str).get_items()):
        if idx >= max_tweets:
            break
        source = get_source(tweet.source)
        text = tweet.content.lower()
        # Skip tweets from third-party clients (likely bots) and giveaways.
        if "Twit" in source and "away" not in text and "give" not in text:
            tweets_list.append([
                tweet.date, tweet.id, text, tweet.user.username,
                tweet.replyCount, tweet.retweetCount, tweet.likeCount,
                tweet.quoteCount, source, tweet.url, tweet.user.id,
                tweet.user.description, tweet.user.followersCount,
                tweet.user.friendsCount
            ])
    # Creating a dataframe from the tweets list above
    df_tweets = pd.DataFrame(tweets_list, columns=[
        'created_at', 'id', 'text', 'username', 'replyCount', 'retweetCount',
        'likeCount', 'quoteCount', 'source', 'tweet_url', 'user_id',
        'user_bio', 'followers_count', 'friends_count'
    ])
    return df_tweets
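# load_tweets() relies on a `get_source` helper that is not shown. A minimal
# sketch, assuming it extracts the client label from snscrape's tweet.source,
# which is an HTML anchor such as '<a href="...">Twitter for iPhone</a>'.
import re

def get_source(source_html: str) -> str:
    # Hypothetical: pull the text between the anchor tags.
    match = re.search(r">([^<]+)<", source_html or "")
    return match.group(1) if match else ""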
def get_tweets(maxTweets, trend, startdate, enddate):
    # Build the query (the original bound it to `input`, shadowing the builtin).
    query = '%s since:%s until:%s' % (trend, startdate, enddate)
    tweets_list2 = []
    # Using TwitterSearchScraper to scrape data and append tweets to list
    for i, tweet in enumerate(
            sntwitter.TwitterSearchScraper(query).get_items()):
        if i >= maxTweets:
            break
        tweets_list2.append([tweet.content])
    tweets = pd.DataFrame(tweets_list2, columns=['Text'])

    def processed_tweet(tweet):
        # Strip @mentions, URLs and non-alphanumeric characters.
        return ' '.join(
            re.sub(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)', ' ',
                   tweet).split())

    tweets['processed_tweet'] = tweets['Text'].apply(processed_tweet)
    # Build a word cloud from the cleaned tweets and return it as a
    # base64-encoded JPEG string.
    all_tweets = ' '.join(tweet for tweet in tweets['processed_tweet'])
    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color="black").generate(all_tweets)
    image = wordcloud.to_image()
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    img_str = base64.b64encode(buffered.getvalue())
    return img_str.decode('utf-8')
def crawler(source_file_dir: str) -> None:
    with open(source_file_dir, 'r') as stock_profile_list:
        data_folder_dir = os.path.dirname(source_file_dir)
        with open(os.path.join(data_folder_dir, 'tweet.csv'), 'w') as tweet_list:
            fields = ['Stock', 'Date', 'Content', 'Author']
            tweet_writer = csv.DictWriter(tweet_list, fields)
            tweet_writer.writeheader()
            # DictReader already consumes the header row; the original called
            # next() afterwards, silently skipping the first stock.
            rstocklist = csv.DictReader(stock_profile_list)
            for row in rstocklist:
                max_tweets = 10
                print(f"searching for ticker ${row['ticker']}")
                cashtag = f"${row['ticker']}"
                try:
                    for i, tweet in enumerate(
                            sntwitter.TwitterSearchScraper(cashtag).get_items()):
                        if i >= max_tweets:
                            break
                        tweet_writer.writerow({
                            'Stock': row['ticker'],
                            'Date': str(tweet.date),
                            'Content': tweet.content,
                            'Author': tweet.user.username,
                        })
                except Exception as e:
                    print(str(e))
def get_tweets_on_date(start_day):
    keyword = 'dogecoin OR Dogecoin OR DogeCoin'
    maxTweets = 2000
    end_day = start_day + timedelta(days=1)
    # Open/create a file to write data to ('w' truncates any existing file)
    csvFile = open('dogecoin-sentiment-' + start_day.strftime('%Y-%m-%d') + '.csv',
                   'w', newline='', encoding='utf8')
    # Use csv writer
    csvWriter = csv.writer(csvFile)
    csvWriter.writerow([
        'id', 'date', 'text', 'user', 'replyCount', 'retweetCount',
        'likeCount', 'quoteCount'
    ])
    for i, tweet in enumerate(
            sntwitter.TwitterSearchScraper(
                keyword + ' lang:en since:' + start_day.strftime('%Y-%m-%d') +
                ' until:' + end_day.strftime('%Y-%m-%d') +
                ' -filter:links -filter:replies').get_items()):
        if i >= maxTweets:
            break
        # The row must match the header: the original omitted the user column,
        # shifting every field after 'text' one column to the left.
        csvWriter.writerow([
            tweet.id, tweet.date, tweet.content, tweet.user.username,
            tweet.replyCount, tweet.retweetCount, tweet.likeCount,
            tweet.quoteCount
        ])
    csvFile.close()
def process_twitter_details(keyword):
    # Scrape a one-week window ending a week ago.
    start_date = str(date.today() + timedelta(days=-14))
    end_date = str(date.today() + timedelta(days=-7))
    usernames, tweet_ids, contents, dates = [], [], [], []
    medium_links, intext_links, tweet_url = [], [], []
    for i, tweet in enumerate(
            sntwitter.TwitterSearchScraper(keyword + " since:" + start_date +
                                           ' until:' + end_date).get_items()):
        if i >= 10:  # was `>`, which collected eleven tweets
            break
        usernames.append(tweet.user.username)
        tweet_ids.append(tweet.id)
        dates.append(tweet.date)
        contents.append(tweet.content)
        medium_links.append(tweet.outlinks)
        intext_links.append(tweet.tcooutlinks)
        tweet_url.append(tweet.url)
    return usernames, tweet_ids, contents, dates, medium_links, intext_links, tweet_url
def getFirstAppearance(keywords, startDate, endDate, location, printAllTweets):
    # Build the query: keywords OR-ed together, restricted by date and
    # optionally by location.
    query = " OR ".join(keywords) + " since:" + startDate + " until:" + endDate
    if location is not None:
        query += ' near:"' + location + '" within:30mi'
    tweets = []
    for tweet in sntwitter.TwitterSearchScraper(query).get_items():
        t1 = Tweet(tweet.id, tweet.date, tweet.content, tweet.user.username)
        tweets.append(t1)
        if printAllTweets:
            print(t1)
    if len(tweets) == 0:
        return "No tweets found between " + str(startDate) + " and " + str(endDate)
    # Results arrive newest-first, so the last element is the earliest tweet.
    return tweets[-1]
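# getFirstAppearance, getAuthorityData and getEarliestTweets all construct a
# Tweet object that is defined elsewhere. A minimal sketch consistent with
# the four positional arguments at the call sites; the field names are
# inferred, not taken from the original class.
from dataclasses import dataclass
from datetime import datetime

@dataclass
class Tweet:
    id: int
    date: datetime
    content: str
    username: str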
def submit():
    countvar = int(x1.get())
    wordsvar = x2.get()
    sincevar = x3.get()
    untilvar = x4.get()
    csvnamevar = x5.get()
    personvar = x6.get()
    locationvar = x7.get()
    global disglobvar
    # Build the query only from the fields the user actually filled in. The
    # original duplicated this logic four times and wrapped the query in
    # literal quote characters, which would have been searched for verbatim.
    parts = [wordsvar]
    if len(personvar) > 0:
        parts.append("from:" + personvar)
    if len(locationvar) > 0:
        parts.append("near:" + locationvar)
    if disglobvar == "yes":
        parts.append("include:nativeretweets")
    parts.append("since:" + sincevar)
    parts.append("until:" + untilvar)
    params = " ".join(parts)
    tweetslist = []
    for i, tweet in enumerate(
            sntwitter.TwitterSearchScraper(params).get_items()):
        if i >= countvar:
            break
        if i % 100 == 0:
            print("Progress:", i, "/", countvar)
        tweetslist.append([
            tweet.content, tweet.date, tweet.user.username, tweet.lang,
            tweet.user.location, tweet.id
        ])
    tweetslistdataframe = pd.DataFrame(tweetslist, columns=[
        "Tweet Content", "Tweet Date", "Username", "Language", "Location",
        'Tweet ID'
    ])
    tweetslistdataframe.to_csv(csvnamevar + ".csv")
    print("Finished Downloading Tweets")
def create_corpus(company):
    name = company[0]
    keyword = company[1]
    start_date = date(2018, 12, 31)
    next_date = start_date + dt.timedelta(days=1)
    end_date = date(2020, 1, 1)
    max_tweets = 10
    tweet_content = []
    tweet_dates = []
    filename = name + "_tweets.csv"
    # Walk day by day through 2019, collecting up to max_tweets per day.
    while next_date < end_date:
        start = start_date.strftime("%Y-%m-%d")
        until = next_date.strftime("%Y-%m-%d")  # renamed from `next`, which shadows the builtin
        search = keyword + ' since:' + start + ' until:' + until + ' lang:en'
        for i, t in enumerate(
                sntwitter.TwitterSearchScraper(search).get_items()):
            if i >= max_tweets:
                break
            tweet_content.append(clean_text(t.content))
            tweet_dates.append(t.date)
        start_date = next_date
        next_date = next_date + dt.timedelta(days=1)
        print("finished a batch!")
    tweet_data = {'Time': tweet_dates, 'Text': tweet_content}
    tweet_df = pd.DataFrame(tweet_data)
    tweet_df.to_csv("./tweets/" + filename)
def sns(query, num_tweets=10):
    output = []
    # Decode the parameter: it arrives binary-encoded because the function is
    # called from a separate server.
    engine_query = '#' + query.decode('utf-8') + ' -filter:retweets'
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(engine_query).get_items()):
        # Limit the number of tweets manually: the snscrape Python API has no
        # built-in cap.
        if i >= num_tweets:
            break
        try:
            translated = GoogleTranslator(source='auto',
                                          target='en').translate(tweet.content)
            polarity = TextBlob(translated).sentiment.polarity
        except Exception:
            # Fall back to a neutral score if translation or analysis fails;
            # the original assigned 0.0 here and then crashed on `.polarity`.
            polarity = 0.0
        output.append({
            'Tweet_No': str(i + 1),
            'Date': str(tweet.date),
            'ID': tweet.id,
            'Content': tweet.content,
            'Username': tweet.user.username,
            'Sentiment': round(polarity, 2)
        })
    return json.dumps(output, indent=3)
def get_tweets(self, keywords, tweets_per_week, weeks, lang='pl'):
    """
    :param keywords: keywords separated by a +, e.g. "korona+szczepienie"
    :param tweets_per_week: maximum number of tweets to collect per week
    :param weeks: how many weeks to look back from today
    :param lang: tweet language filter (ISO 639-1 code)
    :return: None; writes the collected tweets to a CSV file
    """
    now = datetime.now()
    now_str = now.strftime('%Y-%m-%d')
    since = now
    # Open/create a file to append data to
    csvFile = open(os.path.join('en', (keywords + '-sentiment-' + now_str + '.csv')),
                   'a', newline='', encoding='utf8')
    # Use csv writer
    csvWriter = csv.writer(csvFile)
    csvWriter.writerow(['id', 'date', 'tweet', 'retweet_count', 'like_count'])
    # Step back one week at a time, scraping each window separately.
    for _ in range(weeks):
        until = since
        until_str = until.strftime('%Y-%m-%d')
        since = until - timedelta(days=7)
        since_str = since.strftime('%Y-%m-%d')
        for i, tweet in enumerate(sntwitter.TwitterSearchScraper(
                keywords + ' lang:' + lang + ' since:' + since_str + ' until:' + until_str +
                ' -filter:links -filter:replies').get_items()):
            if i >= tweets_per_week:
                break
            csvWriter.writerow([tweet.id, tweet.date, tweet.content,
                                tweet.retweetCount, tweet.likeCount])
    csvFile.close()
def graphs():
    tweets_list2 = []
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(
            raw_text + ' since:' + since_date + ' until:' + until_date).get_items()):
        if i >= count:
            break
        tweets_list2.append([tweet.content])
    # Create Dataframe with just tweets
    df = pd.DataFrame(tweets_list2, columns=['Tweet'])
    # Removing URLs and lowercasing the text
    df['cleanLinks'] = df['Tweet'].apply(lambda x: re.split(r'https:\/\/.*', str(x))[0])
    df['cleanLinks'] = df['cleanLinks'].apply(lambda x: x.lower())
    # Special Characters list; regex=False so "(", "*" etc. are treated
    # literally instead of raising a regex error.
    spec_chars = ["!", '"', "#", "%", "&", "'", "(", ")", "*", "+", ",", "-",
                  ".", "/", ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]",
                  "^", "_", "`", "{", "|", "}", "~", "–", '$']
    for char in spec_chars:
        df['cleanLinks'] = df['cleanLinks'].str.replace(char, ' ', regex=False)

    # Classify each tweet by the first position keyword it mentions.
    # "buy"/"sell" also match "buying"/"selling", so the original's separate
    # gerund checks were redundant and are folded in here.
    def Wordcount(cleanLinks):
        text = cleanLinks.lower()
        if 'buy' in text:
            return 'buy positions mentioned'
        if 'sell' in text:
            return 'sell positions mentioned'
        if 'short' in text:
            return 'short positions mentioned'
        if 'long' in text:
            return 'long positions mentioned'
        if 'put' in text:
            return 'puts mentioned'
        if 'call' in text:
            return 'calls mentioned'
        return None

    df['Market Polar Position'] = df['cleanLinks'].apply(Wordcount)
    # Show the counts
    st.markdown('**Visualization of investor positions:**')
    position_A = df['Market Polar Position'].value_counts()
    st.write(position_A)
    # Pie chart of the position counts
    plt.axis('off')
    df['Market Polar Position'].value_counts().plot(kind='pie', autopct='%1.1f%%',
                                                    figsize=(10, 5))
    plt.savefig('buyers.png')
    buy = Image.open("buyers.png")
    return buy
def main():
    keyword = input('Enter a user or topic to search for: ')
    maxTweets = int(input("Choose how many tweets to fetch: "))
    # Start an empty csv file to work with
    csvFile = open(keyword + '-sentiment-' + now + '.csv', 'a', newline='',
                   encoding='utf8')
    # Use csv writer to open the file for writing and define its columns
    csvWriter = csv.writer(csvFile)
    csvWriter.writerow(['id', 'date', 'tweet'])
    for i, tweet in enumerate(
            sntwitter.TwitterSearchScraper(
                keyword + ' since:' + yesterday + ' until:' + now +
                ' -filter:links -filter:replies').get_items()):
        if i >= maxTweets:
            break
        csvWriter.writerow([tweet.id, tweet.date, tweet.content])
    csvFile.close()
    # Start the sentiment analysis
    analyzer = SentimentIntensityAnalyzer()
    # Read the CSV back into the program
    df = pd.read_csv('~/Documents/PycharmProjects/Diversos/webscrap/' + keyword +
                     '-sentiment-' + now + '.csv',
                     parse_dates=True, index_col=0)
    # Create the sentiment columns
    df['compound'] = [
        analyzer.polarity_scores(x)['compound'] for x in df['tweet']
    ]
    df['neg'] = [analyzer.polarity_scores(x)['neg'] for x in df['tweet']]
    df['neu'] = [analyzer.polarity_scores(x)['neu'] for x in df['tweet']]
    df['pos'] = [analyzer.polarity_scores(x)['pos'] for x in df['tweet']]
    # Average the sentiment of each column
    avg_compound = np.average(df['compound'])
    avg_neg = np.average(df['neg']) * -1
    avg_neu = np.average(df['neu'])
    avg_pos = np.average(df['pos'])
    # Count the tweets
    count = len(df.index)
    # Print the analysis
    print("Found", count, "tweets about " + keyword, end='\n*')
    print("Positive sentiment:", '%.2f' % avg_pos, end='\n*')
    print("Neutral sentiment:", '%.2f' % avg_neu, end='\n*')
    print("Negative sentiment:", '%.2f' % avg_neg, end='\n*')
    print("Compound sentiment:", '%.2f' % avg_compound, end='\n')
def tweets_search(request):
    if request.method == 'POST':
        # Read the data submitted by the user
        username = request.POST.get('username')
        start_date = request.POST.get('startDate').split('-')
        end_date = request.POST.get('endDate').split('-')
        type_search = request.POST.get('search')
        num_search = request.POST.get('num')
        keywords = request.POST.get('words').split(',')
        # Guard against fields the user left empty
        if keywords == [''] or start_date == [''] or end_date == ['']:
            return render(request, 'portal/tweets_search.html')
        # Convert dates from DD-MM-YYYY to YYYY-MM-DD
        begin_date = f'{start_date[2]}-{start_date[1]}-{start_date[0]}'
        end_date = f'{end_date[2]}-{end_date[1]}-{end_date[0]}'
        # Build the query string that drives the scrape
        search = ''
        # If a username was given, restrict the search to that account
        if username != '':
            search += f'from:{username}'
        for j, keyword in enumerate(keywords):
            if j == 0:
                # Append the first keyword as-is
                search += f' {keyword}'
            elif type_search == 'all-kw':
                # AND when the user wants tweets containing every keyword
                search += f' AND {keyword}'
            else:
                # OR when any single keyword is enough
                search += f' OR {keyword}'
        # Append the start and end dates of the search window
        search += f' since:{begin_date} until:{end_date}'
        tweets = []
        # Run the search, stopping at the requested maximum number of tweets
        for i, tweet in enumerate(sntwitter.TwitterSearchScraper(search).get_items()):
            if num_search != 'ilimitado' and i >= int(num_search):
                break
            # Reformat the tweet date as DD-MM-YYYY
            data = str(tweet.date).split()[0].split('-')
            data = f'{data[2]}-{data[1]}-{data[0]}'
            tweets.append([data, tweet.content])
        return render(request, 'portal/tweets_list.html', {'tweets': tweets})
    else:
        return render(request, 'portal/tweets_search.html')
def gen_wordcloud():
    tweets_list2 = []
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(
            raw_text + ' since:' + since_date + ' until:' + until_date).get_items()):
        if i >= count:
            break
        tweets_list2.append([tweet.content])
    df = pd.DataFrame(tweets_list2, columns=['Tweet'])
    # Removing URLs and lowercasing the text
    df['cleanLinks'] = df['Tweet'].apply(lambda x: re.split(r'https:\/\/.*', str(x))[0])
    df['cleanLinks'] = df['cleanLinks'].apply(lambda x: x.lower())
    # Special Character list; regex=False so "(", "*" etc. are treated
    # literally instead of raising a regex error.
    spec_chars = ["!", '"', "#", "%", "&", "'", "(", ")", "*", "+", ",", "-",
                  ".", "/", ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]",
                  "^", "_", "`", "{", "|", "}", "~", "–", '$']
    for char in spec_chars:
        df['cleanLinks'] = df['cleanLinks'].str.replace(char, ' ', regex=False)
    # WC generation
    words = " ".join(df['cleanLinks'])

    # remove punctuation and stop words
    def punctuation_stop(text):
        filtered = []
        stop_words = set(stopwords.words('english'))
        word_tokens = word_tokenize(text)
        for w in word_tokens:
            if w not in stop_words and w.isalpha():
                filtered.append(w.lower())
        return filtered

    # Terms excluded from the cloud (duplicates in the original list removed)
    unwanted = [raw_text, raw_text_U, 'market', 'moving', 'average', 'economy',
                'stockmarket', 'stocks', 'stock', 'people', 'money', 'markets',
                'today', 'http', 'the', 'to', 'and', 'is', 'of', 'in', 'it',
                'you', 'for', 'on', 'this', 'will', 'are', 'price', 'dow',
                'jones', 'robinhood', 'link', 'order', '//', 'sign', 'join',
                'claim']
    try:
        words_filtered = punctuation_stop(words)
        text = " ".join([ele for ele in words_filtered if ele not in unwanted])
        wc = WordCloud(background_color="gray", stopwords=STOPWORDS,
                       max_words=500, width=2000, height=2000)
        wc.generate(text)
        plt.imshow(wc, interpolation="bilinear")
        plt.axis('off')
        plt.savefig('WC.png')
        gen = Image.open("WC.png")
        plt.show()
        return gen
    except ValueError:
        st.error('**Not enough tweets found to build wordcloud**')
def extract(line, max_lim, log, repeatinfo):
    global username, date, lang, text, likes, sharedata, url, media, repeated
    start_hash_time = time.time()
    # Strip newlines; the original character class contained a stray modifier
    # circumflex ('ˆ') rather than matching only '\n'.
    line = re.sub(r'\n', '', line)
    tweetdata = line.split(';')
    if log:
        print("Extracting " + tweetdata[0] + " in " + str(tweetdata[1]) +
              " >> " + str(tweetdata[2]) + " ...")
    results = 0
    # Extract data
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(
            tweetdata[0] + " since:" + tweetdata[1] + " until:" + tweetdata[2]).get_items()):
        if (i > max_lim) and (max_lim > 0):
            # Max limit of results
            if log:
                print("Maximum Limit of Extraction! Extraction stopped!")
            break
        # Append only if the content or the author is new (loose duplicate check)
        if (text.count(tweet.content) == 0) or (username.count(tweet.user.username) == 0):
            username.append(tweet.user.username)
            date.append(tweet.date)
            lang.append(tweet.lang)
            text.append(tweet.content)
            likes.append(tweet.likeCount)
            sharedata.append("likes=" + str(tweet.likeCount) +
                             ";retweets=" + str(tweet.retweetCount) +
                             ";replies=" + str(tweet.replyCount) +
                             ";quotes=" + str(tweet.quoteCount))
            url.append(tweet.url)
            if repeatinfo != "":
                repeated.append(repeatinfo)
            # Collect media URLs: full-size photo URLs and video variants
            if tweet.media:
                mediaurl = []
                for medium in tweet.media:
                    if medium.type == "photo":
                        mediaurl.append(medium.fullUrl)
                    elif medium.type == "video":
                        for v in medium.variants:
                            mediaurl.append(v.url.replace("?tag=13", "").replace("?tag=10", ""))
                media.append(mediaurl)
            else:
                media.append([])
        results = i
    end_hash_time = formatTime(time.time() - start_hash_time)
    if log:
        print(str(results), tweetdata[0],
              "tweet(s) extracted in {:0>2}:{:0>2}:{:05.2f}".format(
                  int(end_hash_time[0]), int(end_hash_time[1]), end_hash_time[2]),
              "\n")
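# extract() relies on a `formatTime` helper that is not shown. A minimal
# sketch, assuming it splits elapsed seconds into the (hours, minutes,
# seconds) triple the format string above expects.
def formatTime(elapsed_seconds: float):
    # Hypothetical: hours and minutes as whole numbers, seconds as a float.
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return hours, minutes, seconds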
def getAuthorityData(startDate, endDate, user):
    query = "from:" + user + " since:" + startDate + " until:" + endDate
    # Collect every tweet from the user in the window; `tweets` replaces the
    # original name `list`, which shadowed the builtin.
    tweets = []
    for tweet in sntwitter.TwitterSearchScraper(query).get_items():
        tweets.append(Tweet(tweet.id, tweet.date, tweet.content, tweet.user.username))
    return tweets
async def retrieve_trends(self):
    while True:
        today = datetime.date.today()
        query: str = self._hashtag + ' since:' + str(today)
        print(f'query is {query}')
        totaltweets = sntwitter.TwitterSearchScraper(query)
        for i, tweet in enumerate(totaltweets.get_items()):
            if tweet is None:
                continue
            inf: List[float] = [tweet.retweetCount, tweet.likeCount, tweet.replyCount]
            if not self.my_criteria(inf):
                continue
            thisuser: str = tweet.user.username
            print(f'influencer {thisuser}')
            untilday = tweet.date.date()
            last_N_tweets = sntwitter.TwitterSearchScraper(
                'from:' + thisuser + ' until:' + str(untilday)).get_items()
            # Average the engagement of the user's previous _PastN tweets,
            # using the cache in self._userinf when available.
            if thisuser in self._userinf:
                userinf = self._userinf[thisuser]
            else:
                inf_this_user: List[List[float]] = []
                for N, tweet_user in enumerate(last_N_tweets):
                    if tweet_user is None:
                        continue
                    if N > self._PastN:
                        break
                    inf_this_user.append([tweet_user.retweetCount,
                                          tweet_user.likeCount,
                                          tweet_user.replyCount])
                if len(inf_this_user) == 0:
                    continue
                avg = np.array(inf_this_user, dtype=np.float32).mean(0)
                userinf = [avg[0], avg[1], avg[2]]
                # Store the result so repeat influencers are not re-scraped
                # (the original computed but never populated the cache).
                self._userinf[thisuser] = userinf
            if self.alert_criteria(userinf, inf):
                content = tweet.content.replace('\n', '').replace('\r', '')
                userinf_str: str = ",".join(str(x) for x in userinf)
                tweet_inf: str = ",".join(str(x) for x in inf)
                alert_content = ('\n' + tweet.url + ' / ' + str(tweet.date) +
                                 " / user inf " + userinf_str +
                                 " / tweet inf " + tweet_inf + " / " +
                                 tweet.user.username + " / " + content + '\n')
                self.make_alert(alert_content)
        await asyncio.sleep(1800)
def getEarliestTweets(hashtags, startDate, endDate, location):
    # TODO: start is always the current date
    # convert strings to actual date elements
    start = datetime.datetime.strptime(startDate, "%Y-%m-%d")
    end = datetime.datetime.strptime(endDate, "%Y-%m-%d")
    # build query: hashtags OR-ed together
    hashString = " OR ".join(hashtags)
    ids = []
    results = []
    counts = {}  # renamed from `dict`, which shadowed the builtin
    # Scrape one day at a time and record the number of matches per day
    while start <= end:
        query = (hashString + " since:" + start.strftime("%Y-%m-%d") +
                 " until:" + (start + datetime.timedelta(days=1)).strftime("%Y-%m-%d"))
        if location is not None:
            query += ' near:"' + location + '" within:50mi'
        frequencycounter = 0
        for tweet in sntwitter.TwitterSearchScraper(query).get_items():
            frequencycounter += 1
            results.append(Tweet(tweet.id, tweet.date, tweet.content, tweet.user.username))
            ids.append(tweet.id)
        counts[start.strftime("%Y-%m-%d")] = frequencycounter
        start = start + datetime.timedelta(days=1)
    print(counts)
    return ids
def getPlayerTweets(handle: str, since: str, until: str) -> dict:
    """
    Retrieves player tweets within the specified time period.
    :param handle: handle of player whose tweets are being retrieved
    :param since: string indicating the lower bound date
    :param until: string indicating the upper bound date
    :return: dict with tweet id as key, tweet object as value
    """
    scraper = sntwitter.TwitterSearchScraper(
        'from:%s since:%s until:%s' % (handle, since, until))
    return {tweet.id: tweet for tweet in scraper.get_items()}
def twitter_scrape(handle, num_tweet):
    """
    Input:
        1) handle: Twitter handle of a person without '@' (e.g. JoeBiden)
        2) num_tweet: The number of past tweets the analysis will be based on (e.g. 100)
    Output:
        1) Dataframe containing scraped tweets
    """
    # The query must be 'from:handle' with no space after the colon; the
    # original 'from: handle' disabled the from: operator.
    raw_tweet_df = pd.DataFrame(
        itertools.islice(
            sntwitter.TwitterSearchScraper('from:' + handle).get_items(),
            num_tweet))
    return raw_tweet_df
def scrape(date):
    # creating generator for scraping one day of English bitcoin tweets
    # that have engagement
    tweets = twitter.TwitterSearchScraper(
        f"bitcoin since:{yest2str(date)} until:{date} filter:has_engagement lang:en"
    ).get_items()
    # take at most n_iter tweets from the generator
    tweets = itertools.islice(tweets, n_iter)
    # storing tweets in pandas dataframe
    df = pd.DataFrame(tweets)
    # returning the necessary columns of the dataframe
    return df[['date', 'content']]
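# scrape() relies on a `yest2str` helper and an `n_iter` constant that are
# not shown. A minimal sketch, assuming `date` is a "YYYY-MM-DD" string and
# `n_iter` is a module-level tweet cap; both are assumptions.
from datetime import datetime, timedelta

n_iter = 1000  # assumed per-day tweet cap

def yest2str(date_str: str) -> str:
    # Hypothetical: return the previous day in the same "YYYY-MM-DD" format.
    day = datetime.strptime(date_str, "%Y-%m-%d")
    return (day - timedelta(days=1)).strftime("%Y-%m-%d")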
def search_tweets_sn(q, since=None, until=None, username=None, near=None,
                     radius=None, lang=None, max_tweets=-1, quiet=False):
    """
    Search tweets according to the keyword arguments specified, using snscrape.

    Parameters
    ----------
    q (str): A query text to be matched.
    since (str, "yyyy-mm-dd"): A lower bound date (UTC) to restrict search.
        Default is 7 days before today.
    until (str, "yyyy-mm-dd"): An upper bound date (not included) to restrict
        search. Default is today.
    username (str or iterable): An optional specific username(s) from a
        twitter account (with or without "@"). Default is no username
        restriction.
    near (str): A reference location area (e.g. Milan) from where tweets were
        generated. Default is no reference area.
    radius (str): A distance radius (e.g. 15km) from the location specified by
        "near". Meaningful only if "near" is set.
    lang (str): Restrict the language of the tweets retrieved. Must be an ISO
        639-1 code (e.g. en, it, etc.). Default is no language restriction.
    max_tweets (int): The maximum number of tweets to be retrieved. If this
        number is unset or lower than 1, all possible tweets will be
        retrieved. Default is -1.

    Returns
    -------
    tweets (NLPTweetList): list of tweets resulting from the search and
        amenable to analysis.
    """
    if until is None:
        until = datetime.datetime.strftime(datetime.date.today(), '%Y-%m-%d')
    if since is None:
        since = datetime.datetime.strftime(
            datetime.datetime.strptime(until, '%Y-%m-%d') -
            datetime.timedelta(days=7), '%Y-%m-%d')
    if max_tweets == -1:
        max_tweets = sys.maxsize
    criteria = f"{q} since:{since} until:{until} exclude:retweets exclude:replies"
    if username is not None:
        criteria += f" from:{username}"
    if near is not None:
        criteria += f" near:{near.replace(' ', '&')}"
    if radius is not None:
        criteria += f" within:{radius}"
    if lang is not None:
        criteria += f" lang:{lang}"
    tweets = NLPTweetList(islice(
        sntwitter.TwitterSearchScraper(criteria).get_items(), max_tweets),
        tqdm_total=max_tweets, quiet=quiet)
    return tweets
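# Example call for search_tweets_sn; the query text, dates and cap below are
# illustrative placeholders, and NLPTweetList (the return type) is the custom
# container defined elsewhere in this codebase.
tweets = search_tweets_sn(
    "climate change",
    since="2021-01-01",
    until="2021-01-08",
    lang="en",
    max_tweets=200,
)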
def snl():
    # Seed tweet id for pagination; max_id: restricts results to tweets with
    # an id at or below this value.
    last = 1373977398891450374
    # The original query repeated "OR vacinacao"; the duplicate is dropped.
    for i, tweet in enumerate(
            sntwitter.TwitterSearchScraper(
                'vacinas OR vacina OR vacinacao (@FlavioDino OR @GovernoMA) '
                'max_id:{} since:2021-01-17 until:2021-05-01'
                .format(last)).get_items()):
        if i > 5:
            break
        # Track the lowest id seen so a follow-up call can page further back
        # (the query itself is built once and does not update mid-loop).
        last = tweet.id
        print("\n")
        print("tweet id: {}".format(tweet.id))
        print("tweet text: {}".format(tweet.content))
        print("tweet date: {}".format(tweet.date))