from functools import lru_cache

# NOTE: TwitterService, BigQueryService, STATUS_LIMIT, BATCH_SIZE, split_into_batches,
# generate_timestamp, parse_full_text, and server_sleep are assumed to be imported from
# elsewhere in this app; their definitions are not shown in this excerpt.


class Collector:
    def __init__(self):
        self.twitter_api = TwitterService().api
        self.bq_service = BigQueryService()
        self.limit = STATUS_LIMIT
        self.batch_size = BATCH_SIZE

    def fetch_remaining_status_ids(self):
        """Returns status ids that have not yet been recollected."""
        sql = f"""
            SELECT DISTINCT a.status_id
            FROM `{self.bq_service.dataset_address}.all_status_ids` a
            LEFT JOIN `{self.bq_service.dataset_address}.recollected_statuses` completed
                ON completed.status_id = a.status_id
            WHERE completed.status_id IS NULL
            LIMIT {self.limit}
        """
        return [row["status_id"] for row in list(self.bq_service.execute_query(sql))]

    def perform(self):
        remaining_status_ids = self.fetch_remaining_status_ids()
        if any(remaining_status_ids):
            for batch_of_ids in split_into_batches(remaining_status_ids, batch_size=self.batch_size):
                self.process_batch(batch_of_ids)
        else:
            print("OH ALL DONE! SLEEPING...")
            server_sleep(10 * 60 * 60)

    def lookup_statuses(self, status_ids):
        """Fetch full status info, including urls and full text.

        The API allows a max of 100 ids per request, so the batch size must not exceed 100.

        See:
            https://docs.tweepy.org/en/stable/api.html#API.statuses_lookup
            https://developer.twitter.com/en/docs/twitter-api/v1/tweets/post-and-engage/api-reference/get-statuses-lookup
        """
        return self.twitter_api.statuses_lookup(
            id_=status_ids,
            include_entities=True,  # this is where the full urls are
            trim_user=True,  # we already have this info
            include_ext_alt_text=True,  # returns an ext_alt_text value for media entities that have alt text (null otherwise)
            include_card_uri=False,
            map_=True,  # "Tweets that do not exist or cannot be viewed by the current user will still have their key represented but with an explicitly null value paired with it"
            tweet_mode="extended"
        )

    def process_batch(self, status_ids):
        recollected_statuses = []
        recollected_urls = []
        success_counter = 0
        for status in self.lookup_statuses(status_ids):
            # because we pass map_=True to the Twitter API, unavailable statuses are still returned,
            # but with only an id field
            status_id = status.id  # all statuses will have an id
            recollected_status = {
                "status_id": status_id,
                "user_id": None,
                "full_text": None,
                "created_at": None,
                "lookup_at": generate_timestamp()
            }  # represent failed lookups with null text values
            if list(status._json.keys()) != ["id"]:  # id is the only field for empty statuses; otherwise try to parse them
                success_counter += 1
                recollected_status["user_id"] = status.user.id
                recollected_status["full_text"] = parse_full_text(status)  # update the full text if possible
                recollected_status["created_at"] = generate_timestamp(status.created_at)
                for url in status.entities["urls"]:
                    recollected_urls.append({
                        "status_id": status_id,
                        "expanded_url": url["expanded_url"]
                    })
            recollected_statuses.append(recollected_status)

        print(generate_timestamp(), f"| SAVING BATCH OF {len(status_ids)}",
              "| STATUSES:", success_counter, "| URLS:", len(recollected_urls))
        self.save_statuses(recollected_statuses)
        self.save_urls(recollected_urls)

    def save_statuses(self, recollected_statuses):
        self.bq_service.insert_records_in_batches(self.recollected_statuses_table, recollected_statuses)

    def save_urls(self, recollected_urls):
        self.bq_service.insert_records_in_batches(self.recollected_urls_table, recollected_urls)

    @property
    @lru_cache(maxsize=None)
    def recollected_statuses_table(self):
        return self.bq_service.client.get_table(f"{self.bq_service.dataset_address}.recollected_statuses")

    @property
    @lru_cache(maxsize=None)
    def recollected_urls_table(self):
        return self.bq_service.client.get_table(f"{self.bq_service.dataset_address}.recollected_status_urls")
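
# A minimal usage sketch (an assumption; this excerpt does not show the script's actual
# entrypoint): construct the collector and run it, so perform() processes one batch of
# un-recollected status ids at a time, then sleeps once nothing remains.
if __name__ == "__main__":
    collector = Collector()
    collector.perform()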
        ,b.bot_id
        -- ,b.bot_screen_name
        --,b.day_count
        --,b.avg_daily_score
        ,count(distinct t.status_id) as tweet_count
        ,COALESCE(STRING_AGG(DISTINCT upper(t.user_screen_name), ' | '), "") as screen_names
        ,COALESCE(STRING_AGG(DISTINCT upper(t.user_name), ' | '), "") as user_names
        ,COALESCE(STRING_AGG(DISTINCT upper(t.user_description), ' | '), "") as user_descriptions
    FROM impeachment_production.bots_above_80 b
    JOIN impeachment_production.2_bot_communities c ON c.user_id = b.bot_id
    JOIN impeachment_production.tweets t ON cast(t.user_id as int64) = b.bot_id
    GROUP BY 1,2
    ORDER BY 1,2
"""  # TODO: move me into the BQ service

results = [dict(row) for row in list(bq_service.execute_query(sql))]
print("PROCESSING", len(results), "RECORDS...")

for i, row in enumerate(results):
    row["profile_tokens"] = []
    row["profile_lemmas"] = []
    row["profile_tags"] = []
    row["profile_handles"] = []

    if row["user_descriptions"]:
        #print("--------------")
        #print("COMMUNITY", row["community_id"], i, row["bot_id"], row["screen_names"])
        #print(row["user_descriptions"])

        # we want unique tokens here, because otherwise someone changing their screen name
        # would have a greater influence over the counts
        tokens = list(set(tokenizer.custom_stems(