def rules_setup(self):
    """Replace whatever filter rules the stream currently has with ours.

    Fetches the existing rules, deletes them all, installs the new rule
    set via self.set_rules(), then re-fetches and logs the active rules.
    """
    print("Get original rules...")
    active_rules = self.get_all_rules()
    print("Delete original rules...")
    self.delete_all_rules(active_rules)
    print("Set new rules...")
    self.set_rules()
    print("Current rules:")
    active_rules = self.get_all_rules()
    write_to_log(self.log_file, f"Streaming with filter rules: {str(active_rules)}...")
def __init__(self, query, start_time, end_time, bearer_token, db_conn, db_cur):
    """Set up a recent-search session and open a fresh timestamped log file.

    start_time/end_time are datetime objects; they are stored as ISO-8601
    strings as required by the recent-search endpoint.
    """
    iso_fmt = "%Y-%m-%dT%H:%M:%SZ"
    self.query = query
    self.auth = bearer_token
    self.db_conn = db_conn
    self.db_cur = db_cur
    self.next_token = None
    self.start_time = start_time.strftime(iso_fmt)
    self.end_time = end_time.strftime(iso_fmt)
    stamp = datetime.now().strftime("%Y%m%d_%H:%M:%S")
    self.log_file = f"../logs/recent_search_{LOGFILE_NAME}_{stamp}.txt"
    write_to_log(
        self.log_file,
        f"Started to search from {self.start_time} to {self.end_time} with query {self.query}..."
    )
def save_next_batch(self):
    """Fetch one page of premium retweet-search results and persist them.

    Pulls the next page (via self.next_token), then inserts every retweet
    into the RETWEETS table, committing after each row so progress survives
    a crash. Duplicate rows (IntegrityError) are skipped.

    Raises:
        Exception: "Rate limit exceeded!" when the API answers 429; any
            other search failure is re-raised unchanged.
    """
    print("Started new batch!")
    try:
        tweets_details, next_token = premium_search_retweet(
            product=self.product,
            label=self.label,
            query=self.query,
            from_date=self.start_time,
            to_date=self.end_time,
            bearer_token=self.auth,
            next_token=self.next_token)
    except Exception as e:
        if str(e).startswith("429"):
            raise Exception("Rate limit exceeded!")
        # BUG FIX: non-429 errors used to be silently swallowed here,
        # leaving tweets_details/next_token unbound and crashing with
        # UnboundLocalError just below. Re-raise the original error.
        raise
    r_obj_list = tweets_details
    self.next_token = next_token
    for r_obj in r_obj_list:
        # save only retweets
        if r_obj["is_retweet"]:
            try:
                todb_values = [
                    r_obj["tweet_id"],
                    r_obj["author_id"],
                    r_obj["created_at"],
                    r_obj["parent_tweet_id"],
                    r_obj["parent_tweet_author_id"],
                    r_obj["like_count"],
                    r_obj["quote_count"],
                    r_obj["reply_count"],
                    r_obj["retweet_count"],
                    r_obj["text"],
                ]
                batch_insert(RETWEETS.name, RETWEETS.cols, [todb_values], self.db_cur)
                self.db_conn.commit()
                write_to_log(
                    self.log_file,
                    f'Saved to retweets! tweet_id: {r_obj["tweet_id"]}')
            except sqlite3.IntegrityError:
                # Primary-key clash: this retweet is already stored.
                print("Retweet already saved!")
    write_to_log(self.log_file, f"Saved one batch!")
    print("Saved one batch!")
def __init__(self, product, label, query, start_time, end_time, bearer_token, db_conn, db_cur):
    """Set up a premium retweet-search session and open a fresh log file.

    start_time/end_time are datetime objects; the premium endpoints take
    compact YYYYMMDDhhmm timestamps, so they are formatted accordingly.
    """
    compact_fmt = "%Y%m%d%H%M"
    self.product = product
    self.label = label
    self.query = query
    self.auth = bearer_token
    self.db_conn = db_conn
    self.db_cur = db_cur
    self.next_token = None
    self.start_time = start_time.strftime(compact_fmt)
    self.end_time = end_time.strftime(compact_fmt)
    stamp = datetime.now().strftime("%Y%m%d_%H:%M:%S")
    self.log_file = f"../logs/july_retweet_search_{LOGFILE_NAME}_{stamp}.txt"
    write_to_log(
        self.log_file,
        f"Started to search from {self.start_time} to {self.end_time} with query {self.query}..."
    )
def save_next_batch(self):
    """Refresh engagement counts for the next slice of cached tweet ids.

    Walks self.tweets_id_list starting at self.next_id_ptr, re-fetching each
    tweet from the Labs API and updating its count columns in
    self.table_name. On a 429 rate-limit error the resume index is stored in
    self.next_id_ptr and "[Self Defined]Limit Exceeded!" is raised so the
    caller can back off and call again later.
    """
    if self.next_id_ptr == 0:
        # First batch of a run: write a session header to the log.
        write_to_log(
            self.log_file,
            f'-------[{datetime.now().strftime("%H:%M:%S")}] Starting to update retweets-------'
        )
    print(f"Processing next batch...")
    limit_exceeded = False
    trange = tqdm(range(self.next_id_ptr, len(self.tweets_id_list)))
    for idx in trange:
        tweet_id = self.tweets_id_list[idx]
        try:
            r_obj = get_single_tweet_by_id_labs(tweet_id, self.auth)
            # Count columns first, then tweet_id as the WHERE-key value.
            values = [
                r_obj["like_count"], r_obj["quote_count"],
                r_obj["reply_count"], r_obj["retweet_count"], tweet_id
            ]
            update_row(self.table_name, "tweet_id", [
                "like_count", "quote_count", "reply_count", "retweet_count"
            ], [values], self.db_cur)
            # Commit per row so completed updates survive an interruption.
            self.db_conn.commit()
            write_to_log(self.log_file, f"Updated stats for tweet: {tweet_id}")
        except Exception as e:
            error_msg = str(e)
            if error_msg == "[Self Defined]Error in response!":
                # Tweet no longer retrievable (presumably deleted or
                # protected — can't tell from here): skip it.
                pass
            elif error_msg.startswith("429"):
                # Rate limited: remember where to resume, close the bar,
                # and stop this batch; the raise happens after the loop.
                limit_exceeded = True
                self.next_id_ptr = idx
                write_to_log(
                    self.log_file,
                    f'**[{datetime.now().strftime("%H:%M:%S")}]** Finished {idx} tweets! Next tweet idx: {self.next_id_ptr}'
                )
                trange.close()
                break
            else:
                # print(traceback.format_exc())
                raise Exception("error!")
    write_to_log(
        self.log_file,
        f'**{datetime.now().strftime("%H:%M:%S")}**Finished one batch!')
    if limit_exceeded:
        raise Exception("[Self Defined]Limit Exceeded!")
    else:
        # Whole list processed: mark the pointer past the end.
        self.next_id_ptr = len(self.tweets_id_list)
def next_batch(self):
    """Collect up to self.tweets_per_day tweets for the current day.

    Searches the premium archive for the 24-hour window starting at
    self.next_datetime, paginating with next_token until either the daily
    quota is reached or pagination is exhausted (the search helper signals
    the end with the sentinel string "Not existed!"). Advances
    self.next_datetime by one day before returning.

    Returns:
        list: rows of tweet values ready for DB insertion.
    """
    tweets_todb = []
    from_date_str = self.next_datetime.strftime("%Y%m%d%H%M")
    to_date_str = (self.next_datetime + timedelta(days=1)).strftime("%Y%m%d%H%M")
    print(f"Searching for date {from_date_str}...")
    write_to_log(
        self.log_file,
        f'-------[{datetime.now().strftime("%H:%M:%S")}] Start searching for date {from_date_str}-------'
    )
    tweets_count = 0
    next_token = None
    while tweets_count < self.tweets_per_day and next_token != "Not existed!":
        try:
            tweets_list, next_token = premium_search(
                product=self.product,
                label=self.label,
                query=self.query,
                from_date=from_date_str,
                to_date=to_date_str,
                next_token=next_token)
            tweets_count += len(tweets_list)
            for obj in tweets_list:
                tweets_todb.append([
                    obj["tweet_id"], obj["full_text"], obj["created_at"],
                    obj["language"], obj["hashtags_str"], obj["mentions_str"],
                    obj["favorite_count"], obj["retweet_count"]
                ])
        except Exception as e:
            # Error messages are expected to start with "<code>:".
            error_code = str(e).split(":")[0]
            if error_code == "88" or error_code == "429":
                # Rate limited: wait out the window and retry the same page
                # (next_token was not advanced, so nothing is lost).
                print("Rate limit exceeded!")
                write_to_log(
                    self.log_file,
                    f'**[{datetime.now().strftime("%H:%M:%S")}]** Rate limit exceeded! Next date to search: {from_date_str}'
                )
                program_sleep(61)
            else:
                # Any other failure: log it and give up on this day.
                write_to_log(self.log_file, f"{e}")
                break
    # Move the cursor to the next day regardless of how this one ended.
    self.next_datetime += timedelta(days=1)
    write_to_log(
        self.log_file,
        f'-------[{datetime.now().strftime("%H:%M:%S")}] Finished searching for date {from_date_str}-------\n'
    )
    return tweets_todb
def save_next_batch(self):
    """Fetch one page of Labs recent-search results and save the non-replies.

    Advances self.next_token for pagination and inserts each non-reply tweet
    into the REGULAR_TWEETS table, committing per row. Duplicates
    (IntegrityError) are skipped; OperationalErrors are logged with the full
    row values for later inspection.
    """
    # try:
    results = recent_search_labs(self.query, self.start_time, self.end_time,
                                 self.next_token, self.auth)
    # except Exception as e:
    #     write_to_log(self.log_file, f'[ERROR]{datetime.now().strftime("%Y%m%d_%H:%M:%S")}: {e}')
    #     return
    r_obj_list = results["r_obj_list"]
    self.next_token = results["next_token"]
    for r_obj in r_obj_list:
        # # find original tweet if is reply
        # while r_obj['is_reply'] and r_obj["tweet_id"] not in VISITED_REPLIES:
        #     try:
        #         write_to_log(self.log_file, f'Reply tweet! tweet_id: {r_obj["tweet_id"]}. Looking for parent tweet! tweet_id: {r_obj["parent_tweet_id"]}')
        #         VISITED_REPLIES.add(r_obj["tweet_id"])
        #         r_obj = get_single_tweet_by_id_labs(r_obj['parent_tweet_id'], self.auth)
        #     except Exception as e:
        #         print_dict(r_obj)
        #         write_to_log(self.log_file, e)
        #         break
        if not r_obj["is_reply"]:
            try:
                todb_values = [
                    r_obj["tweet_id"],
                    r_obj["author_id"],
                    r_obj["created_at"],
                    r_obj["text"],
                    r_obj["expanded_urls"],
                    r_obj["hashtags_str"],
                    r_obj["mentions_str"],
                    r_obj["like_count"],
                    r_obj["quote_count"],
                    r_obj["reply_count"],
                    r_obj["retweet_count"],
                ]
                batch_insert(REGULAR_TWEETS.name, REGULAR_TWEETS.cols,
                             [todb_values], self.db_cur)
                # Commit per row so saved tweets survive an interruption.
                self.db_conn.commit()
                write_to_log(
                    self.log_file,
                    f'Saved to regular_tweets! tweet_id: {r_obj["tweet_id"]}'
                )
            except sqlite3.IntegrityError:
                # Primary-key clash: tweet already stored.
                print("Tweet already saved!")
            except sqlite3.OperationalError:
                # Insert itself failed (not a duplicate): dump the row
                # values to the log so the bad record can be inspected.
                write_to_log(
                    self.log_file,
                    f'-------[ERROR] Cannot save to regular_tweets! tweet_id: {r_obj["tweet_id"]}-------\n'
                    + f"Tweet values: {str(todb_values)}\n"
                    + f"----------------------------------------------------------------------------------"
                )
    write_to_log(
        self.log_file,
        f'**{datetime.now().strftime("%H:%M:%S")}**Finished one batch! Next next_token: {self.next_token}'
    )
def next_batch(self):
    """Hydrate the next slice of tweet ids into full tweet rows plus media.

    Walks self.tweets_id_list from self.next_id_ptr, fetching each tweet and
    sorting its media URLs into photo/video/gif/external buckets. On API
    error code 88 (rate limit) the resume index is recorded and the batch
    stops early with limit_exceeded=True in the returned dict.

    Returns:
        dict: limit_exceeded flag plus tweets_todb/images/videos/gifs/
            externals row lists ready for DB insertion.
    """
    if self.next_id_ptr == 0:
        # First batch of a run: write a session header to the log.
        write_to_log(
            self.log_file,
            f'-------[{datetime.now().strftime("%H:%M:%S")}] Starting getting full tweets for {self.table_name}-------'
        )
    print(f"Getting next batch for {self.table_name}...")
    limit_exceeded = False
    tweets_todb = []
    images_todb = []
    videos_todb = []
    gifs_todb = []
    externals_todb = []
    trange = tqdm(range(self.next_id_ptr, len(self.tweets_id_list)))
    for idx in trange:
        tweet_id = self.tweets_id_list[idx]
        try:
            obj = get_single_tweet_by_id(tweet_id)
            if obj:
                tweets_todb.append([
                    obj["tweet_id"], obj["full_text"], obj["created_at"],
                    obj["language"], obj["hashtags_str"], obj["mentions_str"],
                    obj["favorite_count"], obj["retweet_count"]
                ])
                for media in obj["all_media_urls"]:
                    if media["media_type"] == "photo":
                        images_todb.append([tweet_id, media["media_url"]])
                    elif media["media_type"] == "video":
                        videos_todb.append([tweet_id, media["media_url"]])
                    elif media["media_type"] == "animated_gif":
                        gifs_todb.append([tweet_id, media["media_url"]])
                    elif media["media_type"] == "other":
                        externals_todb.append([tweet_id, media["media_url"]])
        except Exception as e:
            # BUG FIX: the error-code parsing below can itself raise
            # (IndexError when the message has no "'code': " fragment), and
            # the original had a second, unreachable `except Exception`
            # clause on the same try. Guard the parsing with a nested try,
            # matching the sibling file-based next_batch.
            try:
                # Messages are assumed shaped like "...'code': 88}]:..." —
                # TODO confirm against the API helper.
                error_code = str(e).split("}]:")[0]
                error_code = error_code.split("'code': ")[1]
                if error_code == "88":
                    # Rate limited: remember where to resume and stop.
                    limit_exceeded = True
                    self.next_id_ptr = idx
                    write_to_log(
                        self.log_file,
                        f'**[{datetime.now().strftime("%H:%M:%S")}]** Finished {idx} tweets! Next tweet idx: {self.next_id_ptr}'
                    )
                    trange.close()
                    break
                else:
                    write_to_log(self.log_file, f"{e}")
            except Exception:
                # Unparseable error message: just log the original error.
                write_to_log(self.log_file, f"{e}")
                trange.close()
        # Throttle: one request per second.
        time.sleep(1)
    if not limit_exceeded:
        # Whole list processed: mark the pointer past the end.
        self.next_id_ptr = len(self.tweets_id_list)
        write_to_log(
            self.log_file,
            f'-------[{datetime.now().strftime("%H:%M:%S")}] Finished getting full tweets for {self.table_name}!-------\n'
        )
    return {
        "limit_exceeded": limit_exceeded,
        "tweets_todb": tweets_todb,
        "images": images_todb,
        "videos": videos_todb,
        "gifs": gifs_todb,
        "externals": externals_todb
    }
def next_batch(self):
    """Hydrate the next chunk of cached COVID-19 tweet ids into full rows.

    Ids come from a sequence of id files (self.id_files). When the in-memory
    cache is exhausted, the next file is loaded and self.next_file_ptr is
    advanced. On API error code 88 (rate limit) the resume index is stored
    in self.next_cached_id_ptr and the batch stops early.

    Returns:
        dict: limit_exceeded flag plus covid19_tweets/images/videos/gifs/
            externals row lists ready for DB insertion.
    """
    if self.next_cached_id_ptr >= len(self.cached_ids):
        # Cache exhausted: load the next id file. On the very first batch
        # the pointer is preserved so a restart can resume mid-file.
        self.cached_ids = []
        self.next_cached_id_ptr = (self.next_cached_id_ptr
                                   if self.is_first_batch else 0)
        self.current_id_file = self.id_files[self.next_file_ptr]
        with open(self.current_id_file, "r") as infile:
            for line in infile:
                tweet_id = line.strip()
                self.cached_ids.append(tweet_id)
        write_to_log(
            self.log_file,
            f'-------[{datetime.now().strftime("%H:%M:%S")}] Starting processing file with idx {self.next_file_ptr}: {self.current_id_file}-------',
        )
        self.next_file_ptr += 1
        self.is_first_batch = False
    print(f"Getting Tweets in {self.current_id_file}...")
    limit_exceeded = False
    covid19_tweets_todb = []
    images_todb = []
    videos_todb = []
    gifs_todb = []
    externals_todb = []
    trange = tqdm(range(self.next_cached_id_ptr, len(self.cached_ids)))
    for idx in trange:
        tweet_id = self.cached_ids[idx]
        try:
            obj = get_single_tweet_by_id(tweet_id)
            if obj:
                covid19_tweets_todb.append([
                    obj["tweet_id"],
                    obj["full_text"],
                    obj["created_at"],
                    obj["language"],
                    obj["hashtags_str"],
                    obj["mentions_str"],
                    obj["favorite_count"],
                    obj["retweet_count"],
                ])
                # Route each media URL into its type-specific bucket.
                for media in obj["all_media_urls"]:
                    if media["media_type"] == "photo":
                        images_todb.append([tweet_id, media["media_url"]])
                    elif media["media_type"] == "video":
                        videos_todb.append([tweet_id, media["media_url"]])
                    elif media["media_type"] == "animated_gif":
                        gifs_todb.append([tweet_id, media["media_url"]])
                    elif media["media_type"] == "other":
                        externals_todb.append(
                            [tweet_id, media["media_url"]])
        except Exception as e:
            # The error-code parsing can itself raise on unexpected message
            # shapes, hence the nested try.
            try:
                # Messages are assumed shaped like "...'code': 88}]:..." —
                # TODO confirm against the API helper.
                error_code = str(e).split("}]:")[0]
                error_code = error_code.split("'code': ")[1]
                if error_code == "88":
                    # Rate limited: remember where to resume and stop.
                    limit_exceeded = True
                    self.next_cached_id_ptr = idx
                    write_to_log(
                        self.log_file,
                        f'**[{datetime.now().strftime("%H:%M:%S")}]** Finished {idx} tweets in file {self.next_file_ptr - 1}! Next tweet idx: {self.next_cached_id_ptr}',
                    )
                    trange.close()
                    break
                else:
                    write_to_log(self.log_file, f"{e}")
            except Exception as e:
                # Unparseable error message: log the original error.
                write_to_log(self.log_file, f"{e}")
                trange.close()
        # Throttle: one request per second.
        time.sleep(1)
    if not limit_exceeded:
        # Whole cache processed: mark the pointer past the end.
        self.next_cached_id_ptr = len(self.cached_ids)
        write_to_log(
            self.log_file,
            f'-------[{datetime.now().strftime("%H:%M:%S")}] Finished processing file with idx {self.next_file_ptr - 1}: {self.current_id_file}-------\n',
        )
    return {
        "limit_exceeded": limit_exceeded,
        "covid19_tweets": covid19_tweets_todb,
        "images": images_todb,
        "videos": videos_todb,
        "gifs": gifs_todb,
        "externals": externals_todb,
    }
def add_text_column(self):
    """One-off schema migration: add a TEXT column named "text" to retweets."""
    new_column, column_type = "text", "TEXT"
    add_column(RETWEETS.name, new_column, column_type, self.db_cur)
    self.db_conn.commit()
    write_to_log(self.log_file, "[TABLE CHANGED] Added column text")
def stream_connect(self):
    """Consume the filtered stream forever, saving retweets and originals.

    Opens a long-lived streaming GET to self.stream_url. For every line:
    - If the tweet is a retweet, the retweet chain is walked upward: each
      retweet is saved to RETWEETS and its parent fetched, until a
      non-retweet is reached (or a save/fetch error breaks the walk).
    - Reply chains are similarly walked up to the original tweet.
    - The final non-retweet tweet is saved to REGULAR_TWEETS.
    Duplicates (IntegrityError) are skipped; OperationalErrors are logged
    with full row values.

    Raises:
        Exception: "<status>: <body>" if the stream request is rejected.
    """
    response = requests.get(self.stream_url, auth=self.auth, stream=True,
                            params={"tweet.fields": "author_id,created_at,entities,text,public_metrics",
                                    "user.fields": "id",
                                    "expansions": "referenced_tweets.id,referenced_tweets.id.author_id"})
    if response.status_code > 201:
        raise Exception(f"{response.status_code}: {response.text}")
    # window_count = 0
    # checkpoint_timestamp = datetime.now()
    for response_line in response.iter_lines():
        # Keep-alive heartbeats arrive as empty lines; skip them.
        if response_line:
            tweet_dict = json.loads(response_line)
            r_obj = get_tweet_details_labs(tweet_dict,
                                           metric_fieldname="public_metrics")
            # find original tweet if is retweet
            while r_obj["is_retweet"]:
                try:
                    todb_values = [
                        r_obj["tweet_id"],
                        r_obj["author_id"],
                        r_obj["created_at"],
                        r_obj["parent_tweet_id"],
                        r_obj["parent_tweet_author_id"],
                        r_obj["like_count"],
                        r_obj["quote_count"],
                        r_obj["reply_count"],
                        r_obj["retweet_count"],
                        r_obj["text"],
                    ]
                    batch_insert(RETWEETS.name, RETWEETS.cols, [todb_values],
                                 self.db_cur)
                    self.db_conn.commit()
                    write_to_log(self.log_file, f'Saved to retweets! tweet_id: {r_obj["tweet_id"]}. Looking for parent tweet! tweet_id: {r_obj["parent_tweet_id"]}')
                    # Climb one level up the retweet chain.
                    r_obj = get_single_tweet_by_id_labs(r_obj["parent_tweet_id"], self.auth)
                except sqlite3.IntegrityError:
                    # Already saved, so the parent chain was presumably
                    # handled before too: stop climbing.
                    print("Retweet already saved!")
                    break
                except sqlite3.OperationalError:
                    # Save failed, but still continue up the chain so the
                    # parent/original tweet isn't lost.
                    write_to_log(
                        self.log_file,
                        f'-------[ERROR] Cannot save to retweets! tweet_id: {r_obj["tweet_id"]}-------\n'
                        + f"Tweet values: {str(todb_values)}\n"
                        + f'Looking for parent tweet! tweet_id: {r_obj["parent_tweet_id"]}\n'
                        + f"----------------------------------------------------------------------------",
                    )
                    r_obj = get_single_tweet_by_id_labs(r_obj["parent_tweet_id"], self.auth)
                except Exception as e:
                    # Parent fetch (or anything else) failed: give up on
                    # this chain. NOTE(review): r_obj may still be a
                    # retweet here, so the block below is skipped.
                    write_to_log(self.log_file, e)
                    break
            if not r_obj["is_retweet"]:
                # find original tweet if is reply
                while r_obj["is_reply"]:
                    try:
                        write_to_log(self.log_file, f'Reply tweet! tweet_id: {r_obj["tweet_id"]}. Looking for parent tweet! tweet_id: {r_obj["parent_tweet_id"]}')
                        r_obj = get_single_tweet_by_id_labs(r_obj["parent_tweet_id"], self.auth)
                    except Exception as e:
                        write_to_log(self.log_file, e)
                        break
                try:
                    todb_values = [
                        r_obj["tweet_id"],
                        r_obj["author_id"],
                        r_obj["created_at"],
                        r_obj["text"],
                        r_obj["expanded_urls"],
                        r_obj["hashtags_str"],
                        r_obj["mentions_str"],
                        r_obj["like_count"],
                        r_obj["quote_count"],
                        r_obj["reply_count"],
                        r_obj["retweet_count"],
                    ]
                    batch_insert(REGULAR_TWEETS.name, REGULAR_TWEETS.cols,
                                 [todb_values], self.db_cur)
                    self.db_conn.commit()
                    write_to_log(self.log_file, f'Saved to regular_tweets! tweet_id: {r_obj["tweet_id"]}')
                except sqlite3.IntegrityError:
                    print("Original tweet already saved!")
                except sqlite3.OperationalError:
                    # Dump the row values so the bad record can be inspected.
                    write_to_log(
                        self.log_file,
                        f'-------[ERROR] Cannot save to regular_tweets! tweet_id: {r_obj["tweet_id"]}-------\n'
                        + f"Tweet values: {str(todb_values)}\n"
                        + f"----------------------------------------------------------------------------------",
                    )
                except Exception as e:
                    write_to_log(self.log_file, e)