@property  # super().metadata is read as a property, so this override is presumably one too
def metadata(self):
    return {
        **super().metadata,
        "retweet_graph": {
            "topic": None,
            "week_id": self.week_id,
            "tweets_start_at": dt_to_s(self.tweets_start_at),
            "tweets_end_at": dt_to_s(self.tweets_end_at),
        }
    }
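The parent class exposes metadata as a property, and this override just layers a "retweet_graph" entry on top of whatever the parent returns. A toy illustration of the dict-merge semantics (the parent keys here are made up):

parent_metadata = {"app_env": "development", "job_id": 123}  # hypothetical parent values
merged = {**parent_metadata, "retweet_graph": {"week_id": "2020-30"}}
print(merged)
# {'app_env': 'development', 'job_id': 123, 'retweet_graph': {'week_id': '2020-30'}}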
Example #2
    def load_retweets(self):
        """
        Loads or downloads bot community retweets to/from CSV.
        """
        if os.path.isfile(self.retweets_filepath):
            print("READING BOT COMMUNITY RETWEETS FROM CSV...")
            # DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False
            self.retweets_df = read_csv(self.retweets_filepath)
        else:
            print("DOWNLOADING BOT COMMUNITY RETWEETS...")
            counter = 0
            records = []
            for row in self.bq_service.download_n_bot_community_retweets_in_batches(self.n_clusters):
                records.append({
                    "community_id": row.community_id,
                    "user_id": row.user_id,
                    "user_screen_name_count": row.user_screen_name_count,
                    "user_screen_names": row.user_screen_names,
                    "user_created_at": dt_to_s(row.user_created_at),
                    "retweeted_user_id": row.retweeted_user_id,
                    "retweeted_user_screen_name": row.retweeted_user_screen_name,
                    "status_id": row.status_id,
                    "status_text": row.status_text,
                    "status_created_at": dt_to_s(row.status_created_at)
                })
                counter += 1
                if counter % BATCH_SIZE == 0:
                    print(logstamp(), fmt_n(counter))

            self.retweets_df = DataFrame(records)
            self.retweets_df.index.name = "row_id"
            self.retweets_df.index += 1
            print("WRITING TO FILE...")
            self.retweets_df.to_csv(self.retweets_filepath)
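Every example on this page depends on dt_to_s from app.decorators.datetime_decorators, which is not shown. A minimal sketch consistent with the round-trip test in the next example, assuming a "%Y-%m-%d %H:%M:%S" format (the real format string may differ):

from datetime import datetime

DATE_FORMAT = "%Y-%m-%d %H:%M:%S"  # assumed; the actual format is not shown in this excerpt

def dt_to_s(dt):
    """Converts a datetime object to a string."""
    return dt.strftime(DATE_FORMAT)

def s_to_dt(s):
    """Inverse of dt_to_s, so dt_to_s(s_to_dt(s)) == s can hold."""
    return datetime.strptime(s, DATE_FORMAT)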
Example #3

def test_inverse_conversions():
    # round-tripping a datetime through a timestamp (and back) should yield the same value
    assert dt_to_ts(dt) == ts
    assert ts_to_dt(ts) == dt

    # strings and datetime objects
    assert dt_to_s(s_to_dt(s)) == s
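The dt, ts, and s fixtures are module-level values omitted from this excerpt; the assertions only pass if all three describe the same moment. A hypothetical set, together with the timestamp converters the test exercises (values are illustrative, matching the '2020-07-26' date asserted in Example #7):

from datetime import datetime, timezone

dt = datetime(2020, 7, 26, 10, 0, 0, tzinfo=timezone.utc)  # hypothetical fixture
ts = dt.timestamp()                                        # matching unix timestamp
s = "2020-07-26 10:00:00"                                  # matching string (format assumed)

def dt_to_ts(dt):
    return dt.timestamp()

def ts_to_dt(ts):
    return datetime.fromtimestamp(ts, tz=timezone.utc)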
Example #4
    def load_tweets(self):
        """
        Loads or downloads bot community tweets to/from CSV.
        """

        if os.path.isfile(self.tweets_filepath):
            print("READING BOT COMMUNITY TWEETS FROM CSV...")
            # DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False
            self.tweets_df = read_csv(self.tweets_filepath)
        else:
            print("DOWNLOADING BOT COMMUNITY TWEETS...")
            counter = 0
            records = []
            for row in self.bq_service.download_n_bot_community_tweets_in_batches(self.n_clusters):
                records.append({
                    "community_id": row.community_id,

                    "user_id": row.user_id,
                    "user_name": row.user_name,
                    "user_screen_name": row.user_screen_name,
                    "user_description": row.user_description,
                    "user_location": row.user_location,
                    "user_verified": row.user_verified,
                    "user_created_at": dt_to_s(row.user_created_at),

                    "status_id": row.status_id,
                    "status_text": row.status_text,
                    "reply_user_id": row.reply_user_id,
                    "retweet_status_id": row.retweet_status_id,
                    "status_is_quote": row.status_is_quote,
                    "status_geo": row.status_geo,
                    "status_created_at": dt_to_s(row.status_created_at)
                })
                counter += 1
                if counter % BATCH_SIZE == 0:
                    print(logstamp(), fmt_n(counter))

            self.tweets_df = DataFrame(records)
            self.tweets_df.index.name = "row_id"
            self.tweets_df.index += 1
            print("WRITING TO FILE...")
            self.tweets_df.to_csv(self.tweets_filepath)
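Both loaders trip the same pandas DtypeWarning when reading the CSV back. The warning's own advice applies: either disable chunked type inference with low_memory=False, or pin the mixed-type column's dtype up front (the column name below is a guess):

from pandas import read_csv

tweets_df = read_csv("tweets.csv", low_memory=False)              # option 1: infer types from the whole file
tweets_df = read_csv("tweets.csv", dtype={"reply_user_id": str})  # option 2: pin the dtype (column name is hypothetical)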
Example #5

def parse_status(status):
    """
    Param status (tweepy.models.Status)

    Converts a nested status structure into a flat row of non-normalized status and user attributes.
    """

    if hasattr(status, "retweeted_status") and status.retweeted_status:
        retweeted_status_id = status.retweeted_status.id_str
        retweeted_user_id = status.retweeted_status.user.id
        retweeted_user_screen_name = status.retweeted_status.user.screen_name
    else:
        retweeted_status_id = None
        retweeted_user_id = None
        retweeted_user_screen_name = None

    user = status.user
    row = {
        "status_id": status.id_str,
        "status_text": parse_string(parse_full_text(status)),
        "truncated": status.truncated,
        "retweeted_status_id": retweeted_status_id,
        "retweeted_user_id": retweeted_user_id,
        "retweeted_user_screen_name": retweeted_user_screen_name,
        "reply_status_id": status.in_reply_to_status_id_str,
        "reply_user_id": status.in_reply_to_user_id_str,
        "is_quote": status.is_quote_status,
        "geo": status.geo,
        "created_at": dt_to_s(status.created_at),
        "user_id": user.id_str,
        "user_name": user.name,
        "user_screen_name": user.screen_name,
        "user_description": parse_string(user.description),
        "user_location": user.location,
        "user_verified": user.verified,
        "user_created_at": dt_to_s(user.created_at),
    }
    return row
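A usage sketch, assuming tweepy v3 credentials are already configured (the keys and status id are placeholders, and parse_string / parse_full_text come from the same module as parse_status):

import tweepy

auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")  # placeholder credentials
auth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
api = tweepy.API(auth)

status = api.get_status(1287000000000000000, tweet_mode="extended")  # placeholder status id
row = parse_status(status)
print(row["user_screen_name"], "|", row["status_text"])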
Example #6

    def perform(self):
        self.save_metadata()

        self.start()
        self.results = []
        self.graph = DiGraph()

        for row in self.bq_service.fetch_retweet_counts_in_batches(
                start_at=dt_to_s(self.tweets_start_at),
                end_at=dt_to_s(self.tweets_end_at)):

            self.graph.add_edge(
                row["user_screen_name"],  # todo: user_id
                row["retweet_user_screen_name"],  # todo: retweet_user_id
                weight=row["retweet_count"])

            self.counter += 1
            if self.counter % self.batch_size == 0:
                rr = {
                    "ts": logstamp(),
                    "counter": self.counter,
                    "nodes": self.graph.number_of_nodes(),
                    "edges": self.graph.number_of_edges()
                }
                print(rr["ts"], "|", fmt_n(rr["counter"]), "|",
                      fmt_n(rr["nodes"]), "|", fmt_n(rr["edges"]))
                self.results.append(rr)

                # approximate users limit: only checked once per batch rather than per row (perhaps more performant when there are millions of rows)
                if self.users_limit and self.counter >= self.users_limit:
                    break

        self.end()
        self.report()
        self.save_results()
        self.save_graph()
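The start / end / report / save_results / save_graph helpers live elsewhere in the class; a minimal sketch of what save_graph might do with networkx 2.x (the filepath attribute name is an assumption):

import networkx as nx

def save_graph(self):
    """Sketch: persists the DiGraph to disk; the attribute name is hypothetical."""
    print("SAVING GRAPH...")
    nx.write_gpickle(self.graph, self.graph_filepath)  # write_gpickle is available in networkx 2.x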
Example #7

def test_datetime_decorators():
    assert dt_to_date(dt) == '2020-07-26'
    assert dt_to_s(dt) == s
    assert dt_to_ts(dt) == ts
Example #8
            -- WHERE t.retweet_status_id IS NULL
            -- ORDER BY 1,2

        """
        counter = 0
        records = []
        for row in grapher.bq_service.execute_query_in_batches(sql):
            records.append({
                "community_id": row.community_id,
                "user_id": row.user_id,
                "user_name": row.user_name,
                "user_screen_name": row.user_screen_name,
                "user_description": row.user_description,
                "user_location": row.user_location,
                "user_verified": row.user_verified,
                "user_created_at": dt_to_s(row.user_created_at),
                "status_id": row.status_id,
                "status_text": row.status_text,
                "reply_user_id": row.reply_user_id,
                "retweet_status_id": row.retweet_status_id,
                "status_is_quote": row.status_is_quote,
                "status_geo": row.status_geo,
                "status_created_at": dt_to_s(row.status_created_at)
            })
            counter += 1
            if counter % BATCH_SIZE == 0:
                print(logstamp(), fmt_n(counter))

        df = DataFrame(records)
        print(df.head())
        print("WRITING TO FILE...")
Example #9

from app import seek_confirmation
from app.decorators.datetime_decorators import dt_to_s
from app.bq_service import BigQueryService
from app.tweet_collection_v2.csv_storage import LocalStorageService

if __name__ == "__main__":

    bq_service = BigQueryService()

    # TOPICS

    bq_service.migrate_topics_table()

    print("--------------------")
    print("SEEDING TOPICS...")
    local_storage = LocalStorageService()
    topics = local_storage.fetch_topic_names()
    bq_service.append_topics(topics)
    for row in bq_service.fetch_topics():
        print(row.topic, "|", dt_to_s(row.created_at))

    # TWEETS

    seek_confirmation()
    if bq_service.destructive:
        input(
            f"THIS WILL DESTROY THE TWEETS TABLE ON '{bq_service.dataset_address.upper()}'. ARE YOU REALLY SURE YOU WANT TO DO THIS?"
        )

    bq_service.migrate_tweets_table()
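migrate_tweets_table itself is not shown on this page. A rough sketch of such a migration with the google-cloud-bigquery client (the schema and the self.client attribute are assumptions, not the project's actual code):

    def migrate_tweets_table(self):
        """Drops (if destructive) and recreates the tweets table. Schema is illustrative only."""
        if self.destructive:
            self.client.query(f"DROP TABLE IF EXISTS `{self.dataset_address}.tweets`").result()
        self.client.query(f"""
            CREATE TABLE IF NOT EXISTS `{self.dataset_address}.tweets` (
                status_id INT64,
                status_text STRING,
                user_id INT64,
                created_at TIMESTAMP
            )
        """).result()  # assumes the service wraps a google.cloud.bigquery.Client as self.client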