def metadata(self):
    """Extends the parent job metadata with the retweet-graph parameters."""
    retweet_graph_meta = {
        "topic": None,
        "week_id": self.week_id,
        "tweets_start_at": dt_to_s(self.tweets_start_at),
        "tweets_end_at": dt_to_s(self.tweets_end_at),
    }
    return {**super().metadata, "retweet_graph": retweet_graph_meta}
def load_retweets(self):
    """Loads or downloads bot community retweets to/from CSV.

    Prefers the local CSV cache; otherwise downloads from BigQuery in
    batches and writes the result back to the cache file.
    """
    if os.path.isfile(self.retweets_filepath):
        print("READING BOT COMMUNITY RETWEETS FROM CSV...")
        # NOTE(review): may emit DtypeWarning ("Columns (6) have mixed types") —
        # consider specifying dtype= or low_memory=False if that becomes a problem.
        self.retweets_df = read_csv(self.retweets_filepath)
        return

    print("DOWNLOADING BOT COMMUNITY RETWEETS...")
    records = []
    batches = self.bq_service.download_n_bot_community_retweets_in_batches(self.n_clusters)
    for counter, row in enumerate(batches, start=1):
        records.append({
            "community_id": row.community_id,
            "user_id": row.user_id,
            "user_screen_name_count": row.user_screen_name_count,
            "user_screen_names": row.user_screen_names,
            "user_created_at": dt_to_s(row.user_created_at),
            "retweeted_user_id": row.retweeted_user_id,
            "retweeted_user_screen_name": row.retweeted_user_screen_name,
            "status_id": row.status_id,
            "status_text": row.status_text,
            "status_created_at": dt_to_s(row.status_created_at),
        })
        if counter % BATCH_SIZE == 0:
            # periodic progress log
            print(logstamp(), fmt_n(counter))

    df = DataFrame(records)
    df.index.name = "row_id"
    df.index += 1  # 1-based row ids
    self.retweets_df = df
    print("WRITING TO FILE...")
    df.to_csv(self.retweets_filepath)
def test_inverse_conversions():
    """Converting a value to another representation and back should be lossless."""
    roundtrips = [
        (dt_to_ts(dt), ts),        # datetime -> timestamp
        (ts_to_dt(ts), dt),        # timestamp -> datetime
        (dt_to_s(s_to_dt(s)), s),  # string -> datetime -> string
    ]
    for actual, expected in roundtrips:
        assert actual == expected
def load_tweets(self):
    """Loads or downloads bot community tweets to/from CSV.

    Prefers the local CSV cache; otherwise downloads from BigQuery in
    batches and writes the result back to the cache file.
    """
    if os.path.isfile(self.tweets_filepath):
        print("READING BOT COMMUNITY TWEETS FROM CSV...")
        # NOTE(review): may emit DtypeWarning ("Columns (6) have mixed types") —
        # consider specifying dtype= or low_memory=False if that becomes a problem.
        self.tweets_df = read_csv(self.tweets_filepath)
        return

    print("DOWNLOADING BOT COMMUNITY TWEETS...")
    records = []
    batches = self.bq_service.download_n_bot_community_tweets_in_batches(self.n_clusters)
    for counter, row in enumerate(batches, start=1):
        records.append({
            "community_id": row.community_id,
            "user_id": row.user_id,
            "user_name": row.user_name,
            "user_screen_name": row.user_screen_name,
            "user_description": row.user_description,
            "user_location": row.user_location,
            "user_verified": row.user_verified,
            "user_created_at": dt_to_s(row.user_created_at),
            "status_id": row.status_id,
            "status_text": row.status_text,
            "reply_user_id": row.reply_user_id,
            "retweet_status_id": row.retweet_status_id,
            "status_is_quote": row.status_is_quote,
            "status_geo": row.status_geo,
            "status_created_at": dt_to_s(row.status_created_at),
        })
        if counter % BATCH_SIZE == 0:
            # periodic progress log
            print(logstamp(), fmt_n(counter))

    df = DataFrame(records)
    df.index.name = "row_id"
    df.index = df.index + 1  # 1-based row ids
    self.tweets_df = df
    print("WRITING TO FILE...")
    df.to_csv(self.tweets_filepath)
def parse_status(status):
    """
    Param status (tweepy.models.Status)

    Converts a nested status structure into a flat row of non-normalized
    status and user attributes.
    """
    # Retweet attributes are only present when this status is itself a retweet.
    retweeted = getattr(status, "retweeted_status", None)
    if retweeted:
        retweeted_status_id = retweeted.id_str
        retweeted_user_id = retweeted.user.id
        retweeted_user_screen_name = retweeted.user.screen_name
    else:
        retweeted_status_id = retweeted_user_id = retweeted_user_screen_name = None

    user = status.user
    return {
        "status_id": status.id_str,
        "status_text": parse_string(parse_full_text(status)),
        "truncated": status.truncated,
        "retweeted_status_id": retweeted_status_id,
        "retweeted_user_id": retweeted_user_id,
        "retweeted_user_screen_name": retweeted_user_screen_name,
        "reply_status_id": status.in_reply_to_status_id_str,
        "reply_user_id": status.in_reply_to_user_id_str,
        "is_quote": status.is_quote_status,
        "geo": status.geo,
        "created_at": dt_to_s(status.created_at),
        "user_id": user.id_str,
        "user_name": user.name,
        "user_screen_name": user.screen_name,
        "user_description": parse_string(user.description),
        "user_location": user.location,
        "user_verified": user.verified,
        "user_created_at": dt_to_s(user.created_at),
    }
def perform(self):
    """Builds the retweet graph from BigQuery retweet counts, logging and
    recording progress every batch, then reports and saves the results."""
    self.save_metadata()
    self.start()
    self.results = []
    self.graph = DiGraph()

    batches = self.bq_service.fetch_retweet_counts_in_batches(
        start_at=dt_to_s(self.tweets_start_at),
        end_at=dt_to_s(self.tweets_end_at))
    for row in batches:
        # edge: retweeter -> retweeted user, weighted by retweet count
        self.graph.add_edge(
            row["user_screen_name"],  # todo: user_id
            row["retweet_user_screen_name"],  # todo: retweet_user_id
            weight=row["retweet_count"])

        self.counter += 1
        if self.counter % self.batch_size == 0:
            progress = {
                "ts": logstamp(),
                "counter": self.counter,
                "nodes": self.graph.number_of_nodes(),
                "edges": self.graph.number_of_edges(),
            }
            print(progress["ts"], "|", fmt_n(progress["counter"]), "|",
                  fmt_n(progress["nodes"]), "|", fmt_n(progress["edges"]))
            self.results.append(progress)

        # gets us an approximate users limit but reached a fraction of the time (perhaps more performant when there are millions of rows)
        if self.users_limit and self.counter >= self.users_limit:
            break

    self.end()
    self.report()
    self.save_results()
    self.save_graph()
def test_datetime_decorators():
    """Each dt_to_* helper should produce its expected representation of dt."""
    cases = [
        (dt_to_date, '2020-07-26'),
        (dt_to_s, s),
        (dt_to_ts, ts),
    ]
    for convert, expected in cases:
        assert convert(dt) == expected
-- WHERE t.retweet_status_id IS NULL -- ORDER BY 1,2 """ counter = 0 records = [] for row in grapher.bq_service.execute_query_in_batches(sql): records.append({ "community_id": row.community_id, "user_id": row.user_id, "user_name": row.user_name, "user_screen_name": row.user_screen_name, "user_description": row.user_description, "user_location": row.user_location, "user_verified": row.user_verified, "user_created_at": dt_to_s(row.user_created_at), "status_id": row.status_id, "status_text": row.status_text, "reply_user_id": row.reply_user_id, "retweet_status_id": row.retweet_status_id, "status_is_quote": row.status_is_quote, "status_geo": row.status_geo, "status_created_at": dt_to_s(row.status_created_at) }) counter += 1 if counter % BATCH_SIZE == 0: print(logstamp(), fmt_n(counter)) df = DataFrame(records) print(df.head()) print("WRITING TO FILE...")
from app import seek_confirmation
from app.decorators.datetime_decorators import dt_to_s
from app.bq_service import BigQueryService
from app.tweet_collection_v2.csv_storage import LocalStorageService

if __name__ == "__main__":

    bq_service = BigQueryService()

    # TOPICS
    # NOTE(review): presumably (re)creates the topics table — confirm in BigQueryService
    bq_service.migrate_topics_table()

    print("--------------------")
    print("SEEDING TOPICS...")
    # seed the topics table from local CSV storage, then echo what was stored
    local_storage = LocalStorageService()
    topics = local_storage.fetch_topic_names()
    bq_service.append_topics(topics)
    for row in bq_service.fetch_topics():
        print(row.topic, "|", dt_to_s(row.created_at))

    # TWEETS
    seek_confirmation()  # interactive guard before touching the tweets table
    if bq_service.destructive:
        # extra interactive guard when running in destructive mode;
        # the return value is ignored — any keypress proceeds
        input(f"THIS WILL DESTROY THE TWEETS TABLE ON '{bq_service.dataset_address.upper()}'. ARE YOU REALLY SURE YOU WANT TO DO THIS?")
    bq_service.migrate_tweets_table()