def __init__(self, bq_service=None, twitter_service=None, user_limit=USER_LIMIT, friend_limit=FRIEND_LIMIT):
    self.bq_service = bq_service or BigQueryService()
    self.twitter_service = twitter_service or TwitterService()
    self.dataset_address = self.bq_service.dataset_address
    self.user_limit = int(user_limit)
    self.friend_limit = int(friend_limit)

    print("---------------------------")
    print("JOB: FRIEND LOOKUPS")
    print("DATASET:", self.dataset_address.upper())
    print("USER LIMIT:", self.user_limit)
    print("FRIEND LIMIT:", self.friend_limit)
    print("---------------------------")
def __init__(self, bq_service=None, twitter_service=None, user_limit=USER_LIMIT, status_limit=STATUS_LIMIT):
    self.bq_service = bq_service or BigQueryService()
    self.twitter_service = twitter_service or TwitterService()
    self.dataset_address = self.bq_service.dataset_address
    self.user_limit = int(user_limit)
    self.status_limit = int(status_limit)
    self.parse_status = parse_timeline_status

    print("---------------------------")
    print("JOB: TIMELINE LOOKUPS")
    print("DATASET:", self.dataset_address.upper())
    print("USER LIMIT:", self.user_limit)
    print("STATUS LIMIT:", self.status_limit)
    print("---------------------------")
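# For context, the module-level limits referenced by the two jobs above are
# presumably parsed from env vars, following the BATCH_SIZE pattern shown later
# in this section. The default values here are illustrative assumptions, not
# taken from the source:
import os

USER_LIMIT = int(os.getenv("USER_LIMIT", default="1000"))
FRIEND_LIMIT = int(os.getenv("FRIEND_LIMIT", default="2000"))
STATUS_LIMIT = int(os.getenv("STATUS_LIMIT", default="10000"))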
def __init__(self, twitter_service=None, storage_env=STORAGE_ENV, bq_service=None, csv_service=None, batch_size=BATCH_SIZE):
    self.twitter_service = twitter_service or TwitterService()
    self.api = self.twitter_service.api
    self.auth = self.api.auth
    self.parse_status = parse_status

    self.storage_env = storage_env
    if self.storage_env == "local":
        self.storage_service = csv_service or LocalStorageService()
    elif self.storage_env == "remote":
        self.storage_service = bq_service or BigQueryService()
    else:
        raise ValueError("Expecting the STORAGE_ENV to be 'local' or 'remote'. Please try again...")

    self.batch_size = batch_size
    self.batch = []
    self.counter = 0

    print("-------------------------------")
    print("STREAM LISTENER...")
    print("  STORAGE ENV:", self.storage_env.upper())
    print("  STORAGE SERVICE:", type(self.storage_service))
    print("  BATCH SIZE:", self.batch_size)
    print("-------------------------------")
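# A minimal sketch of how the batch attributes above are typically consumed by
# a tweepy StreamListener callback. The flush method on the storage service is
# a hypothetical placeholder, not confirmed by the source:
def on_status(self, status):
    """Buffers parsed statuses and flushes them to storage in batches."""
    self.counter += 1
    self.batch.append(self.parse_status(status))
    if len(self.batch) >= self.batch_size:
        self.storage_service.append_parsed_statuses(self.batch) # hypothetical storage method
        self.batch = []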
import os

from pandas import DataFrame
from dotenv import load_dotenv

from app import DATA_DIR, seek_confirmation
from app.decorators.datetime_decorators import logstamp
from app.bq_service import BigQueryService
from app.twitter_service import TwitterService

load_dotenv()

# the max number of processed users to store in BQ at once (with a single insert API call).
# must be less than 10,000 to avoid error.
BATCH_SIZE = int(os.getenv("BATCH_SIZE", default=100))

if __name__ == "__main__":

    bq_service = BigQueryService()
    twitter_service = TwitterService()

    rows = list(bq_service.fetch_idless_screen_names())
    row_count = len(rows)

    print("-------------------------")
    print(f"FETCHED {row_count} SCREEN NAMES")
    print("BATCH SIZE:", BATCH_SIZE)
    print("-------------------------")
    seek_confirmation()

    bq_service.migrate_user_id_lookups_table()

    batch = []
    for index, row in enumerate(rows):
        counter = index + 1
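        # The loop body is truncated above. A hedged sketch of how such a
        # batch-and-flush loop typically continues; the two methods marked
        # "hypothetical" are illustrative names, not confirmed by the source:
        user_id = twitter_service.get_user_id(row["screen_name"]) # hypothetical lookup method
        batch.append({"lookup_at": logstamp(), "screen_name": row["screen_name"], "user_id": user_id})
        if len(batch) >= BATCH_SIZE or counter == row_count:
            print(logstamp(), f"| SAVING BATCH OF {len(batch)}...")
            bq_service.insert_user_id_lookups(batch) # hypothetical insert method
            batch = []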
from functools import lru_cache

from app.bq_service import BigQueryService
from app.twitter_service import TwitterService
# STATUS_LIMIT, BATCH_SIZE, and the split_into_batches, server_sleep, parse_full_text,
# and generate_timestamp helpers are defined elsewhere in the app package


class Collector:
    def __init__(self):
        self.twitter_api = TwitterService().api
        self.bq_service = BigQueryService()
        self.limit = STATUS_LIMIT
        self.batch_size = BATCH_SIZE

    def fetch_remaining_status_ids(self):
        sql = f"""
            SELECT DISTINCT a.status_id
            FROM `{self.bq_service.dataset_address}.all_status_ids` a
            LEFT JOIN `{self.bq_service.dataset_address}.recollected_statuses` completed
                ON completed.status_id = a.status_id
            WHERE completed.status_id IS NULL
            LIMIT {self.limit}
        """
        return [row["status_id"] for row in list(self.bq_service.execute_query(sql))]

    def perform(self):
        remaining_status_ids = self.fetch_remaining_status_ids()
        if any(remaining_status_ids):
            for batch_of_ids in split_into_batches(remaining_status_ids, batch_size=self.batch_size):
                self.process_batch(batch_of_ids)
        else:
            print("OH ALL DONE! SLEEPING...")
            server_sleep(10 * 60 * 60)

    def lookup_statuses(self, status_ids):
        """Fetch full status info, including urls and full text.

        The API max per request is 100, so the batch size must be no greater than 100.

        See:
            https://docs.tweepy.org/en/stable/api.html#API.statuses_lookup
            https://developer.twitter.com/en/docs/twitter-api/v1/tweets/post-and-engage/api-reference/get-statuses-lookup
        """
        return self.twitter_api.statuses_lookup(
            id_=status_ids,
            include_entities=True, # this is where the full urls are
            trim_user=True, # we already have this info
            include_ext_alt_text=True, # if alt text has been added to any attached media entities, returns an ext_alt_text value in the top-level key for the media entity (null if no value has been set)
            include_card_uri=False,
            map_=True, # "Tweets that do not exist or cannot be viewed by the current user will still have their key represented but with an explicitly null value paired with it"
            tweet_mode="extended"
        )

    def process_batch(self, status_ids):
        recollected_statuses = []
        recollected_urls = []
        success_counter = 0
        for status in self.lookup_statuses(status_ids):
            # when passing map_=True to the Twitter API, unavailable statuses are
            # still present, but have only an id field
            status_id = status.id # all statuses will have an id

            # represent failed lookups with null text values
            recollected_status = {
                "status_id": status_id,
                "user_id": None,
                "full_text": None,
                "created_at": None,
                "lookup_at": generate_timestamp()
            }

            # the id will be the only field for empty statuses; otherwise try to parse them
            if list(status._json.keys()) != ["id"]:
                success_counter += 1
                recollected_status["user_id"] = status.user.id
                recollected_status["full_text"] = parse_full_text(status) # update the full text if possible
                recollected_status["created_at"] = generate_timestamp(status.created_at)
                for url in status.entities["urls"]:
                    recollected_urls.append({
                        "status_id": status_id,
                        "expanded_url": url["expanded_url"]
                    })

            recollected_statuses.append(recollected_status)

        print(generate_timestamp(), f"| SAVING BATCH OF {len(status_ids)}",
              "| STATUSES:", success_counter, "| URLS:", len(recollected_urls))
        self.save_statuses(recollected_statuses)
        self.save_urls(recollected_urls)

    def save_statuses(self, recollected_statuses):
        self.bq_service.insert_records_in_batches(self.recollected_statuses_table, recollected_statuses)

    def save_urls(self, recollected_urls):
        self.bq_service.insert_records_in_batches(self.recollected_urls_table, recollected_urls)

    @property
    @lru_cache(maxsize=None)
    def recollected_statuses_table(self):
        return self.bq_service.client.get_table(f"{self.bq_service.dataset_address}.recollected_statuses")

    @property
    @lru_cache(maxsize=None)
    def recollected_urls_table(self):
        return self.bq_service.client.get_table(f"{self.bq_service.dataset_address}.recollected_status_urls")
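# A minimal sketch of the split_into_batches() helper called in perform() above,
# assuming it simply chunks a list into consecutive slices (the real helper
# lives elsewhere in the app package and may differ):
def split_into_batches(all_items, batch_size):
    """Yields consecutive batch_size-sized chunks from all_items."""
    for i in range(0, len(all_items), batch_size):
        yield all_items[i:i + batch_size]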
import os

from dotenv import load_dotenv

from app import seek_confirmation
from app.bq_service import BigQueryService
from app.twitter_service import TwitterService

load_dotenv()

DATASET_ADDRESS = os.getenv("DATASET_ADDRESS", default="tweet-collector-py.disinfo_2021_development")
SEARCH_TERM = os.getenv("SEARCH_TERM", default="#WWG1WGA")
LIMIT = os.getenv("LIMIT") # None is OK

#class UserLookupJob:
#    def __init__(self):
#        pass

if __name__ == '__main__':

    bq_service = BigQueryService()
    twitter_service = TwitterService()

    print("SEARCH_TERM:", SEARCH_TERM)
    print("LIMIT:", LIMIT)
    print(bq_service.query_to_df(f"SELECT count(distinct user_id) FROM `{DATASET_ADDRESS}.user_lookups`"))
    seek_confirmation()

    sql = f"""
        SELECT DISTINCT u.user_id
        FROM (
            SELECT DISTINCT cast(user_id as INT64) as user_id
            FROM `{DATASET_ADDRESS}.tweets`
            WHERE REGEXP_CONTAINS(upper(status_text), '{SEARCH_TERM}')
        ) u
        LEFT JOIN (
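    # The query is truncated above. Based on the anti-join pattern used by
    # fetch_remaining_status_ids() earlier in this section, and the user_lookups
    # count printed before the confirmation prompt, the remainder presumably
    # selects users not yet present in the user_lookups table, along the lines of:
    #
    #         SELECT DISTINCT user_id FROM `{DATASET_ADDRESS}.user_lookups`
    #     ) lu ON lu.user_id = u.user_id
    #     WHERE lu.user_id IS NULL
    # """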