Example #1
    def download_user_friends(self):
        self.start_at = time.perf_counter()
        self.batch = []
        self.counter = 0

        if self.pg_destructive and UserFriend.__table__.exists():
            print("DROPPING THE USER FRIENDS TABLE!")
            UserFriend.__table__.drop(self.pg_engine)
            self.pg_session.commit()

        if not UserFriend.__table__.exists():
            print("CREATING THE USER FRIENDS TABLE!")
            UserFriend.__table__.create(self.pg_engine)
            self.pg_session.commit()

        print(logstamp(), "DATA FLOWING...")
        for row in self.bq_service.fetch_user_friends_in_batches(
                limit=self.users_limit):
            self.batch.append({
                "user_id": row["user_id"],
                "screen_name": row["screen_name"],
                "friend_count": row["friend_count"],
                "friend_names": row["friend_names"]
            })
            self.counter += 1

            if len(self.batch) >= self.batch_size:
                print(logstamp(), fmt_n(self.counter), "SAVING BATCH...")
                self.pg_session.bulk_insert_mappings(UserFriend, self.batch)
                self.pg_session.commit()
                self.batch = []

        print("ETL COMPLETE!")
        self.end_at = time.perf_counter()
        self.pg_session.close()
Example #2
 def upload_results(self):
     print(logstamp(), "UPLOADING RESULTS...", self.gcs_results_filepath)
     blob = self.gcs_service.upload(self.local_results_filepath,
                                    self.gcs_results_filepath)
     print(
         logstamp(), blob
     )  #> <Blob: impeachment-analysis-2020, storage/data/2020-05-26-0002/metadata.json, 1590465770194318>
Example #3
    def perform(self):
        self.start()
        self.write_metadata_to_file()
        self.upload_metadata()

        print(logstamp(), "CONSTRUCTING GRAPH OBJECT...")
        self.graph = DiGraph()
        self.running_results = []
        self.cursor.execute(self.sql)
        while True:
            batch = self.cursor.fetchmany(size=self.batch_size)
            if not batch: break
            self.counter += len(batch)

            if not self.dry_run:
                for row in batch:
                    self.graph.add_edges_from([(row["screen_name"], friend) for friend in row["friend_names"]])

            rr = {"ts": logstamp(), "counter": self.counter, "nodes": len(self.graph.nodes), "edges": len(self.graph.edges)}
            print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["nodes"]), "|", fmt_n(rr["edges"]))
            self.running_results.append(rr)

        self.cursor.close()
        self.connection.close()
        print(logstamp(), "GRAPH CONSTRUCTED!")
        self.report()

        self.write_results_to_file()
        self.upload_results()

        self.write_graph_to_file()
        self.upload_graph()

        self.end()
Example #4
    def download_retweeter_details(self):
        self.start_job()
        self.destructively_migrate(RetweeterDetail)

        print(logstamp(), "DATA FLOWING...")
        for row in self.bq_service.fetch_retweeter_details_in_batches(
                limit=self.users_limit):
            item = {
                "user_id": row['user_id'],
                "verified": row["verified"],
                "created_at": row["created_at"],
                "screen_name_count": row["screen_name_count"],
                "name_count": row["name_count"],
                "retweet_count": row["retweet_count"],
            }
            self.batch.append(item)
            self.counter += 1

            # temporarily testing individual inserts...
            #record = RetweeterDetail(**item)
            #self.pg_session.add(record)
            #self.pg_session.commit()

            if len(self.batch) >= self.batch_size:
                print(logstamp(), fmt_n(self.counter), "SAVING BATCH...")
                self.pg_session.bulk_insert_mappings(RetweeterDetail,
                                                     self.batch)
                self.pg_session.commit()
                self.batch = []

        self.end_job()
Example #5
    def download_retweeter_details(self):
        self.start_at = time.perf_counter()
        self.batch = []
        self.counter = 0

        if self.pg_destructive and RetweeterDetail.__table__.exists():
            print("DROPPING THE RETWEETER DETAILS TABLE!")
            RetweeterDetail.__table__.drop(self.pg_engine)
            self.pg_session.commit()

        if not RetweeterDetail.__table__.exists():
            print("CREATING THE RETWEETER DETAILS TABLE!")
            RetweeterDetail.__table__.create(self.pg_engine)
            self.pg_session.commit()

        print(logstamp(), "DATA FLOWING LIKE WATER...")
        for row in self.bq_service.fetch_retweeter_details_in_batches(
                limit=self.users_limit):
            item = {
                "user_id": row['user_id'],
                "verified": row["verified"],
                "created_at": row["created_at"],
                "screen_name_count": row["screen_name_count"],
                "name_count": row["name_count"],
                "retweet_count": row["retweet_count"],
                # # todo: these topics are specific to the impeachment dataset, so will need to generalize if/when working with another topic (leave for future concern)
                # "ig_report":           row["ig_report"],
                # "ig_hearing":          row["ig_hearing"],
                # "senate_hearing":      row["senate_hearing"],
                # "not_above_the_law":   row["not_above_the_law"],
                # "impeach_and_convict": row["impeach_and_convict"],
                # "impeach_and_remove":  row["impeach_and_remove"],
                # "facts_matter":        row["facts_matter"],
                # "sham_trial":          row["sham_trial"],
                # "maga":                row["maga"],
                # "acquitted_forever":   row["acquitted_forever"],
                # "country_over_party":  row["country_over_party"],
            }
            self.batch.append(item)
            self.counter += 1

            # temporarily testing individual inserts...
            #record = RetweeterDetail(**item)
            #self.pg_session.add(record)
            #self.pg_session.commit()

            if len(self.batch) >= self.batch_size:
                print(logstamp(), fmt_n(self.counter), "SAVING BATCH...")
                self.pg_session.bulk_insert_mappings(RetweeterDetail,
                                                     self.batch)
                self.pg_session.commit()
                self.batch = []

        print("ETL COMPLETE!")
        self.end_at = time.perf_counter()
        self.pg_session.close()
Example #6
    def download_user_details(self):
        self.start_job()
        self.destructively_migrate(UserDetail)

        print(logstamp(), "DATA FLOWING...")
        for row in self.bq_service.fetch_user_details_in_batches(
                limit=self.users_limit):
            item = {
                "user_id": row['user_id'],
                "screen_name": clean_string(row['screen_name']),
                "name": clean_string(row['name']),
                "description": clean_string(row['description']),
                "location": clean_string(row['location']),
                "verified": row['verified'],
                "created_at":
                row['created_at'],  #.strftime("%Y-%m-%d %H:%M:%S"),
                "screen_name_count": row['screen_name_count'],
                "name_count": row['name_count'],
                "description_count": row['description_count'],
                "location_count": row['location_count'],
                "verified_count": row['verified_count'],
                "created_count": row['created_at_count'],
                "screen_names": [clean_string(s) for s in row['screen_names']],
                "names": [clean_string(s) for s in row['names']],
                "descriptions": [clean_string(s) for s in row['descriptions']],
                "locations": [clean_string(s) for s in row['locations']],
                "verifieds": row['verifieds'],
                "created_ats":
                row['created_ats'],  #[dt.strftime("%Y-%m-%d %H:%M:%S") for dt in row['_created_ats']]
                "friend_count": row["friend_count"],
                "status_count": row["status_count"],
                "retweet_count": row["retweet_count"],
            }
            self.batch.append(item)
            self.counter += 1

            # temporarily testing individual inserts...
            #record = UserDetail(**item)
            #self.pg_session.add(record)
            #self.pg_session.commit()

            if len(self.batch) >= self.batch_size:
                print(logstamp(), fmt_n(self.counter), "SAVING BATCH...")
                self.pg_session.bulk_insert_mappings(UserDetail, self.batch)
                self.pg_session.commit()
                self.batch = []

        self.end_job()
Example #7
    def perform(self):
        self.write_metadata_to_file()
        self.upload_metadata()

        self.start()
        self.graph = DiGraph()
        self.running_results = []

        users = list(self.bq_service.fetch_random_users(limit=self.users_limit, topic=self.topic,
                                                        start_at=self.convo_start_at, end_at=self.convo_end_at))
        print("FETCHED", len(users), "USERS")
        screen_names = sorted([row["user_screen_name"] for row in users])

        for row in self.bq_service.fetch_specific_user_friends(screen_names=screen_names):
            self.counter += 1

            if not self.dry_run:
                self.graph.add_edges_from([(row["screen_name"], friend) for friend in row["friend_names"]])

            if self.counter % self.batch_size == 0:
                rr = {"ts": logstamp(), "counter": self.counter, "nodes": len(self.graph.nodes), "edges": len(self.graph.edges)}
                print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["nodes"]), "|", fmt_n(rr["edges"]))
                self.running_results.append(rr)

        self.end()
        self.report()
        self.write_results_to_file()
        self.upload_results()
        self.write_graph_to_file()
        self.upload_graph()
Example #8
 def write_metadata_to_file(self, metadata):
     """
     Params: metadata (dict)
     """
     print(logstamp(), "WRITING METADATA...")
     with open(self.local_metadata_filepath, "w") as f:
         json.dump(metadata, f)
Example #9
 def running_results(self):
     rr = {"ts": logstamp(),
         "counter": self.counter,
         "nodes": self.graph.number_of_nodes(),
         "edges": self.graph.number_of_edges()
     }
     print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["nodes"]), "|", fmt_n(rr["edges"]))
     return rr
Example #10
    def download_community_predictions(self, start_at=None, end_at=None):
        self.start_job()
        self.destructively_migrate(CommunityPrediction)

        print(logstamp(), "DATA FLOWING...")
        for row in self.bq_service.fetch_predictions(limit=self.tweets_limit):
            self.batch.append(dict(row))

            self.counter += 1
            if len(self.batch) >= self.batch_size:
                print(logstamp(), fmt_n(self.counter), "SAVING BATCH...")
                self.pg_session.bulk_insert_mappings(CommunityPrediction,
                                                     self.batch)
                self.pg_session.commit()
                self.batch = []

        self.end_job()
Example #11
def test_logstamp():
    # this test might fail if run right when the day changes.
    # the point is it contains the current datetime info...
    logstr = logstamp()
    assert isinstance(logstr, str)
    assert str(datetime.now().year) in logstr
    assert str(datetime.now().month) in logstr
    assert str(datetime.now().day) in logstr
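
Every example on this page calls logstamp() to prefix its console output with a timestamp, and the test above only asserts that the returned string contains the current year, month, and day. A minimal sketch of such a helper, assuming a strftime-based implementation (the project's actual format string is not shown here):

from datetime import datetime

def logstamp():
    """Return the current date and time as a string (hypothetical format)."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")  #> "2020-05-26 00:02:00"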
Example #12
 def write_results_to_file(self, results):
     """
     Params: results (list of dict)
     """
     print(logstamp(), "WRITING RESULTS...")
     df = DataFrame(results)
     df.index.name = "row_id"
     df.index = df.index + 1
     df.to_csv(self.local_results_filepath)
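
For reference, write_results_to_file turns a list of running-results dicts (like the ones built in the perform methods elsewhere on this page) into a CSV with a 1-based row_id index. A minimal sketch with illustrative values only:

from pandas import DataFrame

results = [{"ts": "2020-05-26 00:02:00", "counter": 100, "nodes": 5000, "edges": 12000}]  # illustrative values
df = DataFrame(results)
df.index.name = "row_id"
df.index = df.index + 1  # start row ids at 1 instead of 0
df.to_csv("results.csv")  # header row: row_id,ts,counter,nodes,edges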
Example #13
 def write_edges_to_file(self):
     """
     override the parent method because we need self.edges vs self.graph.edges
     todo: inherit / mix-in
     """
     print(logstamp(), "WRITING EDGES...:")
     with open(self.local_edges_filepath, "wb") as pickle_file:
         pickle.dump(self.edges,
                     pickle_file)  # write edges before graph is constructed
Example #14
    def perform(self):
        self.start()
        self.write_metadata_to_file()
        self.upload_metadata()

        self.edges = []
        self.running_results = []
        for row in self.bq_service.fetch_user_friends_in_batches(
                limit=self.users_limit):
            self.counter += 1

            if not self.dry_run:
                self.edges += [(row["screen_name"], friend)
                               for friend in row["friend_names"]]

            if self.counter % self.batch_size == 0:
                rr = {
                    "ts": logstamp(),
                    "counter": self.counter,
                    "edges": len(self.edges)
                }
                print(rr["ts"], "|", fmt_n(rr["counter"]), "|",
                      fmt_n(rr["edges"]))
                self.running_results.append(rr)

        self.write_results_to_file()
        self.upload_results()

        self.write_edges_to_file()
        self.upload_edges()

        print(logstamp(), "CONSTRUCTING GRAPH OBJECT...")
        self.graph = DiGraph(self.edges)
        print(logstamp(), "GRAPH CONSTRUCTED!")
        self.report()

        del self.running_results  # remove in hopes of freeing up some memory
        del self.edges  # remove in hopes of freeing up some memory

        self.write_graph_to_file()
        #del self.graph # remove in hopes of freeing up some memory
        self.upload_graph()

        self.end()
Example #15
    def perform(self):
        """
        Given:
            bot_ids (list) a unique list of bot ids, all of which should appear as nodes in the bot retweet graph.
                The retweet graph also contains retweeted users, which is why we need this separate list.
                The bot ids will be used as nodes in the similarity graph.

            bot_retweet_graph (networkx.DiGraph) a retweet graph generated from the bot list

        Returns: a similarity graph (networkx.Graph), where the similarity is based on the Jaccard index.
            For each pair of bots we calculate the Jaccard index based on the sets of people they retweet.
            If two bots retweet exactly the same users, their Jaccard index is one.
            If they don't retweet anyone in common, their Jaccard index is zero.
        """

        grapher.retweet_graph_report()

        bot_ids = [
            row.user_id
            for row in self.bq_service.fetch_bot_ids(bot_min=self.bot_min)
        ]
        print("FETCHED", fmt_n(len(bot_ids)), "BOT IDS")

        node_pairs = []
        for i, bot_id in enumerate(bot_ids):
            for other_bot_id in bot_ids[i + 1:]:
                if self.retweet_graph.has_node(
                        other_bot_id) and self.retweet_graph.has_node(bot_id):
                    node_pairs.append((bot_id, other_bot_id))
        # we could probably just take combinations of all nodes in the retweet graph,
        # since it was assembled from the same bot ids as the ones here,
        # but checking membership explicitly is methodologically sound and doesn't take long
        print("NODE PAIRS:", fmt_n(len(node_pairs)))

        results = jaccard_coefficient(self.retweet_graph.to_undirected(),
                                      node_pairs)
        #> returns an iterator of 3-tuples in the form (u, v, p)
        #> where (u, v) is a pair of nodes and p is their Jaccard coefficient.
        print("JACCARD COEFFICIENTS BETWEEN EACH NODE PAIR - COMPLETE!"
              )  #, fmt_n(len(list(results))))

        print("CONSTRUCTING SIMILARITY GRAPH...")
        self.similarity_graph = Graph()
        edge_count = 0
        #positive_results = [r for r in results if r[2] > 0] # this takes a while, maybe let's just stick with the original iterator approach
        for bot_id, other_bot_id, similarity_score in results:
            if similarity_score > 0:
                self.similarity_graph.add_edge(bot_id,
                                               other_bot_id,
                                               weight=similarity_score)
                edge_count += 1

            self.counter += 1
            if self.counter % self.batch_size == 0:
                print(logstamp(), "|", fmt_n(self.counter), "|",
                      fmt_n(edge_count), "EDGES")
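
The docstring above defines bot similarity as the Jaccard index of the sets of users each bot retweets. A toy sketch of the same networkx call used in that method, with made-up node names, just to illustrate the (u, v, p) tuples it yields:

import networkx as nx

toy = nx.Graph()
toy.add_edges_from([("bot_a", "user_1"), ("bot_a", "user_2"),
                    ("bot_b", "user_1"), ("bot_b", "user_3")])

# yields (u, v, p) tuples, where p is the Jaccard index of the two nodes' neighbor sets
for u, v, p in nx.jaccard_coefficient(toy, [("bot_a", "bot_b")]):
    print(u, v, round(p, 3))  #> bot_a bot_b 0.333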
Example #16
    def perform(self):
        self.edges = []
        self.running_results = []
        self.start()

        self.cursor.execute(self.sql)
        while True:
            batch = self.cursor.fetchmany(size=self.batch_size)
            if not batch: break
            self.counter += len(batch)

            if not self.dry_run:
                for row in batch:
                    self.edges += [(row["screen_name"], friend)
                                   for friend in row["friend_names"]]

            rr = {
                "ts": logstamp(),
                "counter": self.counter,
                "edges": len(self.edges)
            }
            print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["edges"]))
            self.running_results.append(rr)

        self.write_results_to_file()
        self.upload_results()

        self.write_edges_to_file()
        self.upload_edges()

        print(logstamp(), "CONSTRUCTING GRAPH OBJECT...")
        self.graph = DiGraph(self.edges)
        print(logstamp(), "GRAPH CONSTRUCTED!")
        del self.edges  # try to free up some memory maybe, before writing to file
        self.report()

        self.write_graph_to_file()
        self.upload_graph()

        self.end()
Example #17
    def download_user_friends(self):
        self.start_job()
        self.destructively_migrate(UserFriend)

        print(logstamp(), "DATA FLOWING...")
        for row in self.bq_service.fetch_user_friends_in_batches(
                limit=self.users_limit):
            self.batch.append({
                "user_id": row["user_id"],
                "screen_name": row["screen_name"],
                "friend_count": row["friend_count"],
                "friend_names": row["friend_names"]
            })
            self.counter += 1

            if len(self.batch) >= self.batch_size:
                print(logstamp(), fmt_n(self.counter), "SAVING BATCH...")
                self.pg_session.bulk_insert_mappings(UserFriend, self.batch)
                self.pg_session.commit()
                self.batch = []

        self.end_job()
Example #18
def perform(batch, bq_service, bas_service):
    thread_name = current_thread().name
    print(logstamp(), thread_name, "...")

    try:
        embeddings = list(
            bas_service.embed_tweets([row["status_text"] for row in batch],
                                     timeout=100))
        print(logstamp(), thread_name, "EMBEDDINGS COMPLETE!")
    except Exception as err:
        print(logstamp(), thread_name, "OOPS", err, "SKIPPING...")
        return 0

    batch = [dict(row) for row in batch]  # copy the rows so the modifications below aren't lost
    for i, row in enumerate(batch):
        row["embedding"] = embeddings[i]
        del row["status_text"]
    #print(logstamp(), thread_name, "PROCESSING COMPLETE!")

    bq_service.upload_basilica_embeddings(batch)
    #print(logstamp(), thread_name, "UPLOAD COMPLETE!")

    return len(batch)
Example #19
    def perform(self):
        self.graph = DiGraph()

        print("FETCHING BOT FOLLOWERS...")

        for row in self.bq_service.fetch_bot_follower_lists(
                bot_min=self.bot_min):
            bot_id = row["bot_id"]
            self.graph.add_edges_from([(follower_id, bot_id)
                                       for follower_id in row["follower_ids"]])

            self.counter += 1
            if self.counter % self.batch_size == 0:
                print("  ", logstamp(), "| BOTS:", fmt_n(self.counter))
Example #20
    def perform(self):
        self.graph = DiGraph()
        self.running_results = []

        for row in self.bq_service.fetch_user_friends_in_batches():
            self.counter += 1

            if not self.dry_run:
                self.graph.add_edges_from([(row["screen_name"], friend) for friend in row["friend_names"]])

            if self.counter % self.batch_size == 0:
                rr = {"ts": logstamp(), "counter": self.counter, "nodes": len(self.graph.nodes), "edges": len(self.graph.edges)}
                print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["nodes"]), "|", fmt_n(rr["edges"]))
                self.running_results.append(rr)
Example #21
    def load_retweets(self):
        """
        Loads or downloads bot community retweets to/from CSV.
        """
        if os.path.isfile(self.retweets_filepath):
            print("READING BOT COMMUNITY RETWEETS FROM CSV...")
            self.retweets_df = read_csv(
                self.retweets_filepath
            )  # DtypeWarning: Columns (6) have mixed types.Specify dtype option on import or set low_memory=False
            #if ROWS_LIMIT:
            #    self.retweets_df = read_csv(local_csv_filepath, nrows=int(ROWS_LIMIT))
            #else:
            #    self.retweets_df = read_csv(local_csv_filepath)
        else:
            print("DOWNLOADING BOT COMMUNITY RETWEETS...")
            counter = 0
            records = []
            for row in self.bq_service.download_n_bot_community_retweets_in_batches(
                    self.n_clusters):
                records.append({
                    "community_id": row.community_id,
                    "user_id": row.user_id,
                    "user_screen_name_count": row.user_screen_name_count,
                    "user_screen_names": row.user_screen_names,
                    "user_created_at": dt_to_s(row.user_created_at),
                    "retweeted_user_id": row.retweeted_user_id,
                    "retweeted_user_screen_name": row.retweeted_user_screen_name,
                    "status_id": row.status_id,
                    "status_text": row.status_text,
                    "status_created_at": dt_to_s(row.status_created_at)
                })
                counter += 1
                if counter % BATCH_SIZE == 0: print(logstamp(), fmt_n(counter))

            self.retweets_df = DataFrame(records)
            self.retweets_df.index.name = "row_id"
            self.retweets_df.index += 1
            print("WRITING TO FILE...")
            self.retweets_df.to_csv(self.retweets_filepath)
Example #22
def perform(group_name, filtered_df, parent_dirpath, tokenize):
    community_id = group_name[0]
    date = group_name[1]
    parent_dirpath = os.path.join(parent_dirpath, f"community-{community_id}")

    print("----------------")
    #print(logstamp(), "COMMUNITY", community_id, "| DATE:", date, "|", "| RETWEETS:", fmt_n(len(filtered_df)))
    print(logstamp(), "COMMUNITY", community_id, "| DATE:", date, "|",
          current_thread().name, "| RETWEETS:", fmt_n(len(filtered_df)))

    analyzer = DailyRetweetsAnalyzer(community_id, filtered_df, parent_dirpath,
                                     date, tokenize)

    analyzer.generate_most_retweets_chart()
    analyzer.generate_most_retweeters_chart()

    analyzer.top_tokens_df
    analyzer.save_top_tokens()
    analyzer.generate_top_tokens_wordcloud()
Example #23
    def load_tweets(self):
        """
        Loads or downloads bot community tweets to/from CSV.
        """

        if os.path.isfile(self.tweets_filepath):
            print("READING BOT COMMUNITY TWEETS FROM CSV...")
            self.tweets_df = read_csv(self.tweets_filepath) # DtypeWarning: Columns (6) have mixed types.Specify dtype option on import or set low_memory=False
        else:
            print("DOWNLOADING BOT COMMUNITY TWEETS...")
            counter = 0
            records = []
            for row in self.bq_service.download_n_bot_community_tweets_in_batches(self.n_clusters):
                records.append({
                    "community_id": row.community_id,

                    "user_id": row.user_id,
                    "user_name": row.user_name,
                    "user_screen_name": row.user_screen_name,
                    "user_description": row.user_description,
                    "user_location": row.user_location,
                    "user_verified": row.user_verified,
                    "user_created_at": dt_to_s(row.user_created_at),

                    "status_id": row.status_id,
                    "status_text": row.status_text,
                    "reply_user_id": row.reply_user_id,
                    "retweet_status_id": row.retweet_status_id,
                    "status_is_quote": row.status_is_quote,
                    "status_geo": row.status_geo,
                    "status_created_at": dt_to_s(row.status_created_at)
                })
                counter+=1
                if counter % BATCH_SIZE == 0: print(logstamp(), fmt_n(counter))

            self.tweets_df = DataFrame(records)
            self.tweets_df.index.name = "row_id"
            self.tweets_df.index = self.tweets_df.index + 1
            print("WRITING TO FILE...")
            self.tweets_df.to_csv(self.tweets_filepath)
Example #24
    def perform(self):
        self.write_metadata_to_file()
        self.upload_metadata()

        self.start()
        self.graph = DiGraph()
        self.running_results = []

        for row in self.bq_service.fetch_retweet_counts_in_batches(topic=self.topic, start_at=self.convo_start_at, end_at=self.convo_end_at):
            # see: https://networkx.github.io/documentation/stable/reference/classes/generated/networkx.DiGraph.add_edge.html#networkx.DiGraph.add_edge
            self.graph.add_edge(row["user_screen_name"], row["retweet_user_screen_name"], rt_count=row["retweet_count"])

            self.counter += 1
            if self.counter % self.batch_size == 0:
                rr = {"ts": logstamp(), "counter": self.counter, "nodes": len(self.graph.nodes), "edges": len(self.graph.edges)}
                print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["nodes"]), "|", fmt_n(rr["edges"]))
                self.running_results.append(rr)

        self.end()
        self.report()
        self.write_results_to_file()
        self.upload_results()
        self.write_graph_to_file()
        self.upload_graph()
Example #25
    def perform(self):
        self.save_metadata()

        self.start()
        self.results = []
        self.graph = DiGraph()

        for row in self.bq_service.fetch_retweet_counts_in_batches(
                start_at=dt_to_s(self.tweets_start_at),
                end_at=dt_to_s(self.tweets_end_at)):

            self.graph.add_edge(
                row["user_screen_name"],  # todo: user_id
                row["retweet_user_screen_name"],  # todo: retweet_user_id
                weight=row["retweet_count"])

            self.counter += 1
            if self.counter % self.batch_size == 0:
                rr = {
                    "ts": logstamp(),
                    "counter": self.counter,
                    "nodes": self.graph.number_of_nodes(),
                    "edges": self.graph.number_of_edges()
                }
                print(rr["ts"], "|", fmt_n(rr["counter"]), "|",
                      fmt_n(rr["nodes"]), "|", fmt_n(rr["edges"]))
                self.results.append(rr)

                # approximate users limit: this check only runs once per batch, so it is hit a fraction of the time (perhaps more performant when there are millions of rows)
                if self.users_limit and self.counter >= self.users_limit:
                    break

        self.end()
        self.report()
        self.save_results()
        self.save_graph()
Example #26
    batch = []
    for index, row in enumerate(rows):
        counter = index + 1

        try:
            user_id = twitter_service.get_user_id(row.screen_name)
            message = None
        except TweepError as err:
            #print(err)
            #> [{'code': 50, 'message': 'User not found.'}]
            #> [{'code': 63, 'message': 'User has been suspended.'}]
            user_id = None
            message = json.loads(err.reason.replace("'", '"'))[0]["message"]

        lookup = {"lookup_at": logstamp(), "counter": counter, "screen_name": row.screen_name.upper(), "user_id": user_id, "message": message}
        print(lookup)
        batch.append(lookup)

        if (len(batch) >= BATCH_SIZE) or (counter >= row_count): # if the batch is full or the row is last
            print("SAVING BATCH...", len(batch))
            bq_service.upload_user_id_lookups(batch)
            batch = [] # clear the batch

    print("-------------")
    print("LOOKUPS COMPLETE!")

    #print("WRITING TO CSV...")
    #df = DataFrame(lookups)
    #print(df.head())
    #csv_filepath = os.path.join(DATA_DIR, "user_id_lookups.csv")
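
The except branch above recovers a human-readable message from Tweepy's err.reason string, whose shape is shown in the inline comments. A standalone sketch of just that parsing step, using one of the example payloads from those comments:

import json

reason = "[{'code': 50, 'message': 'User not found.'}]"  # example payload from the comments above
message = json.loads(reason.replace("'", '"'))[0]["message"]
print(message)  #> User not found.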
Example #27
    #def get_bot_follower_lists(self, limit=None, bot_min=0.8):
    #    bot_min_str = str(int(bot_min * 100)) #> "80"
    #    sql = f"""
    #        SELECT bot_id, ARRAY_AGG(distinct follower_id) as follower_ids
    #        FROM bot_followers_above_{bot_min_str}
    #        GROUP BY 1
    #    """ # takes 90 seconds for ~25K rows
    #    if limit:
    #        sql += f" LIMIT {int(limit)};"
    #    self.cursor.execute(sql)


if __name__ == "__main__":

    LIMIT = 100_000
    BATCH_SIZE = 10_000

    pg_service = PgService()

    counter = 0
    pg_service.get_user_friends(limit=LIMIT)
    while True:
        batch = pg_service.cursor.fetchmany(size=BATCH_SIZE)
        if not batch: break
        counter += len(batch)
        print(logstamp(), fmt_n(counter))

    pg_service.close()
    print("COMPLETE!")
Example #28
def perform():
    generator = execute_query(
        "SELECT user_id, screen_name, friend_count FROM user_friends LIMIT 100;"
    )
    results = list(generator)
    print(logstamp(), fmt_n(len(results)))
Example #29
def batch_perform():
    counter = 0
    for batch in fetch_in_batches():
        counter += len(batch)
        print(logstamp(), fmt_n(counter))
Example #30
    def download_user_details(self):
        self.start_at = time.perf_counter()
        self.batch = []
        self.counter = 0

        if self.pg_destructive and UserDetail.__table__.exists():
            print("DROPPING THE USER DETAILS TABLE!")
            UserDetail.__table__.drop(self.pg_engine)
            self.pg_session.commit()

        if not UserDetail.__table__.exists():
            print("CREATING THE USER DETAILS TABLE!")
            UserDetail.__table__.create(self.pg_engine)
            self.pg_session.commit()

        print(logstamp(), "DATA FLOWING LIKE WATER...")
        for row in self.bq_service.fetch_user_details_in_batches(
                limit=self.users_limit):
            item = {
                "user_id": row['user_id'],
                "screen_name": clean_string(row['screen_name']),
                "name": clean_string(row['name']),
                "description": clean_string(row['description']),
                "location": clean_string(row['location']),
                "verified": row['verified'],
                "created_at":
                row['created_at'],  #.strftime("%Y-%m-%d %H:%M:%S"),
                "screen_name_count": row['screen_name_count'],
                "name_count": row['name_count'],
                "description_count": row['description_count'],
                "location_count": row['location_count'],
                "verified_count": row['verified_count'],
                "created_count": row['created_at_count'],
                "screen_names": [clean_string(s) for s in row['screen_names']],
                "names": [clean_string(s) for s in row['names']],
                "descriptions": [clean_string(s) for s in row['descriptions']],
                "locations": [clean_string(s) for s in row['locations']],
                "verifieds": row['verifieds'],
                "created_ats":
                row['created_ats'],  #[dt.strftime("%Y-%m-%d %H:%M:%S") for dt in row['_created_ats']]
                "friend_count": row["friend_count"],
                "status_count": row["status_count"],
                "retweet_count": row["retweet_count"],

                # # todo: these topics are specific to the impeachment dataset, so will need to generalize if/when working with another topic (leave for future concern)
                # "impeach_and_convict": row["impeach_and_convict"],
                # "senate_hearing":      row["senate_hearing"],
                # "ig_hearing":          row["ig_hearing"],
                # "facts_matter":        row["facts_matter"],
                # "sham_trial":          row["sham_trial"],
                # "maga":                row["maga"],
                # "acquitted_forever":   row["acquitted_forever"],
            }
            self.batch.append(item)
            self.counter += 1

            # temporarily testing individual inserts...
            #record = UserDetail(**item)
            #self.pg_session.add(record)
            #self.pg_session.commit()

            if len(self.batch) >= self.batch_size:
                print(logstamp(), fmt_n(self.counter), "SAVING BATCH...")
                self.pg_session.bulk_insert_mappings(UserDetail, self.batch)
                self.pg_session.commit()
                self.batch = []

        print("ETL COMPLETE!")
        self.end_at = time.perf_counter()
        self.pg_session.close()