def run_keyword(keyword, older):
    """Iteratively search over a given keyword."""
    lg.info(f"Searching tweets for keyword {keyword.valeur}")
    keyword.nb_rech += 1
    min_id = keyword.plus_ancien_tweet
    max_id = keyword.plus_recent_tweet
    keep_looking = True
    i = 0
    while keep_looking:
        i += 1
        try:
            lg.info(
                f"Searching for keyword {keyword.valeur} ; loop number {i}")
            if older:
                lg.debug("Older is true")
                res = scrap_older_tweets(keyword, min_id)
            else:
                lg.debug("Older is false")
                res = scrap_newer_tweets(keyword, max_id)
            # Then we search tweets older than those just collected
            older = True
            min_id = manage_result(res, keyword)
            if not min_id:
                keep_looking = False
        except TwythonRateLimitError:
            lg.warning(
                "Twitter limit reached. "
                "Wait 15 minutes before moving to next keyword")
            # time.sleep(900)
            break
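# The scrap_* helpers called above are defined elsewhere in the project.
# A minimal sketch of compatible implementations, assuming a module-level
# Twython client named `twitter` (hypothetical name) and the standard
# max_id / since_id parameters of the Twitter search endpoint:
def scrap_older_tweets(keyword, min_id):
    """Fetch tweets older than min_id for the given keyword (sketch)."""
    return twitter.search(q=keyword.valeur, count=100, max_id=min_id)


def scrap_newer_tweets(keyword, max_id):
    """Fetch tweets newer than max_id for the given keyword (sketch)."""
    return twitter.search(q=keyword.valeur, count=100, since_id=max_id)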
def main():
    tweets_to_process = (
        session.query(TWEET).filter(TWEET.texte_retraite == None))  # noqa
    lg.info(f"Tweets to process : {tweets_to_process.count()}")
    while tweets_to_process.count() > 0:
        for tweet in tweets_to_process.all()[0:1000]:
            tweet.texte_retraite = processing(tweet.texte)
        session.commit()
        session.close()
        # Re-run the same filter so the loop sees the remaining tweets
        tweets_to_process = (
            session.query(TWEET).filter(TWEET.texte_retraite == None))  # noqa
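# `processing` is defined elsewhere; a minimal sketch of a typical
# text-cleaning step it could perform (lowercasing, stripping URLs and
# @mentions). The exact rules are assumptions, not the real implementation:
import re


def processing(text):
    """Return a cleaned-up copy of the raw tweet text (sketch)."""
    text = text.lower()
    text = re.sub(r"https?://\S+", "", text)   # drop URLs
    text = re.sub(r"@\w+", "", text)           # drop @mentions
    return text.strip()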
def main(older=True):
    keywords = (
        session.query(KEYWORD)
        .filter(KEYWORD.active == True)  # noqa
        .order_by(KEYWORD.nb_rech)
        .all()
    )
    lg.debug(keywords)
    for keyword in keywords:
        run_keyword(keyword, older)
    lg.info("Program over")
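# The module-level `session` and `lg` objects are created elsewhere in the
# project; a minimal sketch of a compatible setup, assuming a local SQLite
# file named tweets.db (hypothetical path):
import logging

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

lg = logging.getLogger(__name__)
engine = create_engine("sqlite:///tweets.db")
Session = sessionmaker(bind=engine)
session = Session()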
def common_hashtags(export=True, n=1000):
    freq = nltk.FreqDist(hashtag_generator())
    if export:
        with open(
                "outputs/hashtags.csv", "w",
                newline='', encoding="utf-8") as csvfile:
            writer = csv.writer(
                csvfile, delimiter=',', quotechar='"',
                quoting=csv.QUOTE_MINIMAL)
            writer.writerows(freq.most_common(n))
        lg.info("Hashtag file exported")
    return freq.most_common(n)
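# `hashtag_generator` is defined elsewhere; a sketch of what it might do,
# assuming hashtags can be re-extracted from the stored tweet text with a
# simple regex (an assumption, not the project's actual extraction logic):
import re


def hashtag_generator():
    """Yield every hashtag found in the stored tweets (sketch)."""
    for tweet in session.query(TWEET).all():
        for hashtag in re.findall(r"#\w+", tweet.texte):
            yield hashtag.lower()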
def manage_result(res, keyword):
    """Manage the Twitter response according to whether or not it is empty."""
    if res['statuses']:
        # Case where there are some results to write in database
        lg.info(f"Number of tweets : {len(res['statuses'])}")
        min_id = min(tweet['id'] for tweet in res['statuses']) - 1
        lg.debug(f"min_id : {min_id}")
        # Formatting and writing data
        write_data(res, keyword)
        return min_id
    else:
        lg.info(
            "No more results for keyword '%s' ; moving to next keyword",
            keyword.valeur)
        return None
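# For reference, the search response unpacked above has this shape; only the
# fields this code actually reads are shown (standard Twitter search API):
#
#     {
#         "statuses": [
#             {"id": 1234567890, "text": "...", "user": {"id": 42, ...}},
#             ...
#         ]
#     }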
def write_data(res, keyword):
    """Reorganize result data and write them to the sqlite database."""
    for tw in res['statuses']:
        tweet_id = tw['id']
        user_id = tw['user']['id']
        # Writing user data:
        compte = session.query(COMPTE).filter(COMPTE.user_id == user_id)
        if not compte.all():
            # Create the account if it does not exist yet
            compte = COMPTE(tw['user'])
            session.add(compte)
        else:
            compte = compte.one()
        # Writing tweet data:
        tweet = session.query(TWEET).filter(TWEET.tweet_id == tweet_id)
        if not tweet.all():
            tweet = TWEET(tw)
            tweet.compte = compte
            session.add(tweet)
            lg.info(f"adding tweet {tweet_id}")
        # Updating keyword data:
        if not keyword.plus_ancien_tweet or tweet_id < keyword.plus_ancien_tweet:
            keyword.plus_ancien_tweet = tweet_id
        if not keyword.plus_recent_tweet or tweet_id > keyword.plus_recent_tweet:
            keyword.plus_recent_tweet = tweet_id
    session.commit()
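# COMPTE, TWEET and KEYWORD are SQLAlchemy models defined elsewhere in the
# project. A minimal sketch of the columns this code relies on -- the names
# come from the code above, but the types and constructors are assumptions:
from sqlalchemy import Boolean, Column, ForeignKey, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship

Base = declarative_base()


class COMPTE(Base):
    __tablename__ = "compte"
    user_id = Column(Integer, primary_key=True)

    def __init__(self, user):
        self.user_id = user["id"]


class TWEET(Base):
    __tablename__ = "tweet"
    tweet_id = Column(Integer, primary_key=True)
    texte = Column(String)
    texte_retraite = Column(String)
    nb_rt = Column(Integer)
    nb_favori = Column(Integer)
    date_influence = Column(String)
    user_id = Column(Integer, ForeignKey("compte.user_id"))
    compte = relationship("COMPTE")

    def __init__(self, tw):
        self.tweet_id = tw["id"]
        self.texte = tw["text"]


class KEYWORD(Base):
    __tablename__ = "keyword"
    valeur = Column(String, primary_key=True)
    active = Column(Boolean)
    nb_rech = Column(Integer)
    plus_ancien_tweet = Column(Integer)
    plus_recent_tweet = Column(Integer)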
        results[tweet_id] = {"nb_rt": nb_rt, "nb_favori": nb_favori}
    return results


def main():
    tweets_to_process = session.query(TWEET).filter(
        TWEET.nb_rt == None)  # noqa
    while tweets_to_process.count() > 0:
        lg.info("Searching influence data. "
                f"Tweets remaining : {tweets_to_process.count()}")
        tweet_list = tweets_to_process.all()[0:100]
        ids = [tweet.tweet_id for tweet in tweet_list]
        results = api_query(ids)
        for tweet in tweet_list:
            tweet.nb_favori = results[tweet.tweet_id]["nb_favori"]
            tweet.nb_rt = results[tweet.tweet_id]["nb_rt"]
            tweet.date_influence = auj
        session.commit()
        session.close()
        tweets_to_process = session.query(TWEET).filter(
            TWEET.nb_rt == None)  # noqa


if __name__ == '__main__':
    for annee in range(2006, 2018):
        lg.info("Collecting influence information")
        main()
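# Sketch of the full api_query helper whose tail appears above -- a minimal
# reconstruction, assuming a module-level Twython client named `twitter`
# (hypothetical name). statuses/lookup accepts up to 100 comma-separated ids
# per call, and retweet_count / favorite_count are the fields that feed
# nb_rt / nb_favori. (`auj`, used in main above, is assumed to hold today's
# date, e.g. datetime.date.today().)
def api_query(ids):
    """Return {tweet_id: {"nb_rt": ..., "nb_favori": ...}} for up to 100 ids."""
    statuses = twitter.lookup_status(id=",".join(str(i) for i in ids))
    results = {}
    for status in statuses:
        results[status["id"]] = {
            "nb_rt": status["retweet_count"],
            "nb_favori": status["favorite_count"],
        }
    return results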