def collect_retweets(news_list, news_source, label, config: Config):
    """Queue retweet collection for news stories with non-empty tweet CSVs.

    Stories whose tweet CSV is empty are counted and skipped; stories whose
    "complete" output file already exists are skipped silently. The rest are
    dispatched to ``dump_retweets_job`` via the multiprocess collector.

    Args:
        news_list: iterable of news objects exposing ``news_id``.
        news_source: dataset name used in the dump directory layout.
        label: class label ("fake"/"real") used in the dump directory layout.
        config: project Config providing ``dump_location`` and the connector.
    """
    create_dir("{}/{}/raw".format(config.dump_location, news_source))
    news_list_to_process = []
    empty_data_objects = 0
    for news in news_list:
        news_dir = "{}/{}/{}/tweets/{}.csv".format(
            config.dump_location, news_source, label, news.news_id)
        data = pd.read_csv(news_dir)
        raw_dir = "{}/{}/complete/{}.csv".format(
            config.dump_location, news_source, news.news_id)
        if data.empty:
            empty_data_objects += 1
            continue
        # Already collected on a previous run — don't redo the work.
        if path.exists(raw_dir):
            continue
        news_list_to_process.append(NewsItem(data, raw_dir))
    # Message fixed: original said "news storie retweets".
    print('Collecting for ' + str(len(news_list_to_process)) + ' news stories retweets.')
    print(
        str(empty_data_objects) + '/' + str(len(news_list)) +
        ' datasets were skipped, as they were empty. ')
    multiprocess_data_collection(dump_retweets_job, news_list_to_process,
                                 (config, config.twython_connector), config)
def collect_retweets(news_list, news_source, label, config: Config):
    """Collect retweets for every already-downloaded tweet of each news item.

    News items without a downloaded "news content.json" are skipped, as are
    tweet IDs whose tweet JSON was never fetched.
    """
    base = config.dump_location
    create_dir(base)
    create_dir(f"{base}/{news_source}")
    create_dir(f"{base}/{news_source}/{label}")
    pending_tweets = []
    for news in news_list:
        news_content = f"{base}/{news_source}/{label}/{news.news_id}/news content.json"
        # Only process news items whose content was actually downloaded.
        if not os.path.exists(news_content):
            continue
        for tweet_id in news.tweet_ids:
            tweet_json = f"{base}/{news_source}/{label}/{news.news_id}/tweets/{tweet_id}.json"
            # Only fetch retweets for tweets that exist on disk.
            if os.path.exists(tweet_json):
                pending_tweets.append(
                    Tweet(tweet_id, news.news_id, news_source, label))
    multiprocess_data_collection(dump_retweets_job, pending_tweets,
                                 (config, config.twython_connector), config)
def collect_retweets(news_list, news_source, label, config: Config):
    """Collect retweets for all tweets of the given news list.

    Builds one Tweet record per (news, tweet_id) pair, drops those that
    ``_should_skip_retweets`` marks as already dumped, and dispatches the
    remainder to ``dump_retweets_job``.
    """
    create_dir(config.dump_location)
    create_dir("{}/{}".format(config.dump_location, news_source))
    create_dir("{}/{}/{}".format(config.dump_location, news_source, label))
    # NOTE: removed unused local `save_dir` (assigned but never read).
    tweet_id_list = [
        Tweet(tweet_id, news.news_id, news_source, label)
        for news in news_list
        for tweet_id in news.tweet_ids
    ]
    # Keep only tweets whose retweet dump does not already exist.
    filtered_tweet_id_list = [
        tweet for tweet in tweet_id_list
        if not _should_skip_retweets(tweet, get_dump_dir(config, tweet))
    ]
    multiprocess_data_collection(
        dump_retweets_job,
        filtered_tweet_id_list,
        (config, config.twython_connector),
        config,
    )
def collect_tweets(news_list, news_source, label, config: Config):
    """Download tweets (in chunks of 100) for news items whose content exists."""
    root = config.dump_location
    create_dir(root)
    # dir for tweets
    create_dir("{}/{}".format(root, news_source))
    create_dir("{}/{}/{}".format(root, news_source, label))
    # dir for user profiles
    create_dir(f"{root}/user_profiles")
    tweet_list = []
    for news in news_list:
        content_path = f"{root}/{news_source}/{label}/{news.news_id}/news content.json"
        # Skip news items whose content was never downloaded.
        if not os.path.exists(content_path):
            continue
        for tweet_id in news.tweet_ids:
            tweet_list.append(Tweet(tweet_id, news.news_id, news_source, label))
    print(f"Total tweets to be downloaded: {len(tweet_list)}")
    # Batch size of 100 matches the tweet-lookup request granularity.
    tweet_chunks = equal_chunks(tweet_list, 100)
    multiprocess_data_collection(dump_tweet_information, tweet_chunks,
                                 (config, config.twython_connector), config)
def collect_user_profiles(config: Config, twython_connector: TwythonConnector):
    """Gather user IDs from both datasets, then fetch profiles and recent tweets."""
    dump_location = config.dump_location
    all_user_ids = set()
    # Same four folders as before, visited in the same order.
    for source in ("politifact", "gossipcop"):
        for lbl in ("fake", "real"):
            all_user_ids.update(
                get_user_ids_in_folder("{}/{}/{}".format(dump_location, source, lbl)))
    user_profiles_folder = "{}/{}".format(dump_location, "user_profiles")
    user_timeline_tweets_folder = "{}/{}".format(dump_location,
                                                 "user_timeline_tweets")
    create_dir(user_profiles_folder)
    create_dir(user_timeline_tweets_folder)
    multiprocess_data_collection(dump_user_profile_job, all_user_ids,
                                 (user_profiles_folder, twython_connector),
                                 config)
    multiprocess_data_collection(dump_user_recent_tweets_job, all_user_ids,
                                 (user_timeline_tweets_folder, twython_connector),
                                 config)
def collect_data(self, choices, use_id_from_profile=True):
    """Collect followers for users of the selected datasets.

    Args:
        choices: dataset selections, each a dict with "news_source" and
            "label" keys; only consulted when ``use_id_from_profile`` is False.
        use_id_from_profile: when True (the default, matching the previously
            hard-coded behavior), user IDs come from the saved profile dump;
            otherwise they are scanned from the chosen dataset folders.
    """
    if use_id_from_profile:
        # List object returned
        all_user_ids = get_user_ids_from_profile(self.config.dump_location)
    else:
        all_user_ids = set()
        for choice in choices:
            choice_dir = f"{self.config.dump_location}/{choice['news_source']}/{choice['label']}"
            # Set object returned
            all_user_ids.update(get_user_ids_in_folder(choice_dir))
        all_user_ids = list(all_user_ids)
    # create dir to store user followers
    user_followers_folder = f"{self.config.dump_location}/user_followers"
    create_dir(user_followers_folder)
    multiprocess_data_collection(
        dump_user_followers, all_user_ids,
        (user_followers_folder, self.config.twython_connector), self.config)
def collect_data(self, choices):
    """Fetch the "following" lists for every user seen in the selected datasets."""
    user_ids = set()
    for choice in choices:
        folder = "{}/{}/{}".format(self.config.dump_location,
                                   choice["news_source"], choice["label"])
        user_ids.update(get_user_ids_in_folder(folder))
    user_friends_folder = "{}/{}".format(self.config.dump_location,
                                         "user_following")
    create_dir(user_friends_folder)
    multiprocess_data_collection(dump_user_following, list(user_ids),
                                 (user_friends_folder,
                                  self.config.twython_connector),
                                 self.config)
def collect_data(self, choices):
    """Fetch recent timeline tweets for every user in the selected datasets.

    User IDs are de-duplicated with a set (matching the sibling collectors)
    so each user's timeline is downloaded at most once; the original used a
    plain list, so IDs appearing in several datasets were fetched repeatedly.
    """
    all_user_ids = set()
    for choice in choices:
        all_user_ids.update(get_user_ids_in_folder(
            "{}/{}/{}".format(self.config.dump_location,
                              choice["news_source"], choice["label"])))
    user_timeline_tweets_folder = "{}/{}".format(self.config.dump_location,
                                                 "user_timeline_tweets")
    create_dir(user_timeline_tweets_folder)
    multiprocess_data_collection(dump_user_recent_tweets_job,
                                 list(all_user_ids),
                                 (user_timeline_tweets_folder,
                                  self.config.twython_connector),
                                 self.config)
def collect_retweets(news_list, news_source, label, config: Config):
    """Collect retweets for every tweet ID attached to the given news list."""
    create_dir(config.dump_location)
    create_dir("{}/{}".format(config.dump_location, news_source))
    create_dir("{}/{}/{}".format(config.dump_location, news_source, label))
    # NOTE: removed unused local `save_dir` (assigned but never read).
    tweet_id_list = [
        Tweet(tweet_id, news.news_id, news_source, label)
        for news in news_list
        for tweet_id in news.tweet_ids
    ]
    multiprocess_data_collection(dump_retweets_job, tweet_id_list,
                                 (config, config.twython_connector), config)
def collect_data(self, choices):
    """Fetch recent timeline tweets for the IDs listed in all_user_id.json.

    Returns early (with a message) when the ID file does not exist.
    """
    id_file_path = f"{self.config.dump_location}/all_user_id.json"
    if not os.path.exists(id_file_path):
        print("all_user_id.json not found")
        return
    print("loads IDs to be fetched from all_user_id.json")
    with open(id_file_path, "r") as id_list_file:
        # json.load replaces the json.loads(f.read()) round-trip.
        all_user_ids = json.load(id_list_file)
    # set and create dest dir
    timeline_folder = f"{self.config.dump_location}/user_timeline_tweets"
    create_dir(timeline_folder)
    multiprocess_data_collection(
        dump_user_recent_tweets_job, all_user_ids,
        (timeline_folder, self.config.twython_connector), self.config)
def collect_data(self, choices):
    """Fetch followers for retweeting users listed in rt_user_ids_1.json.

    Users whose follower dump already exists are skipped so the job can be
    resumed after an interruption.
    """
    # create dir to store user followers
    user_followers_folder = f"{self.config.dump_location}/rt_user_followers"
    create_dir(user_followers_folder)
    # NOTE(review): the "_1" shard index is hard-coded — confirm which shard
    # should be processed before running.
    user_id_list_path = f"{self.config.dump_location}/rt_user_ids_1.json"
    with open(user_id_list_path, "r") as id_file:
        # json.load replaces the json.loads(f.read()) round-trip.
        id_list = json.load(id_file)['users']
    # Skip users whose follower dump already exists from a previous run.
    final_user_id_list = [
        int(uid) for uid in id_list
        if not os.path.exists(f"{user_followers_folder}/{uid}.json")
    ]
    multiprocess_data_collection(
        dump_user_followers, final_user_id_list,
        (user_followers_folder, self.config.twython_connector), self.config)
def collect_retweets(news_list, news_source, label, config: Config, hop_index):
    """Collect hop-``hop_index`` retweets (hop >= 3) for the given news list.

    For each news item with downloaded content, determines the cascade start
    date (cached in tweets/first_date.json, else computed from the earliest
    first-hop tweet), then queues retweet collection for every previous-hop
    retweet whose date falls within the configured time range.

    Returns:
        True if any parent tweets were in range and collection was dispatched;
        False if there was nothing to collect.
    """
    assert hop_index >= 3
    create_dir(config.dump_location)
    create_dir(f"{config.dump_location}/{news_source}")
    create_dir(f"{config.dump_location}/{news_source}/{label}")
    tweet_id_list = []
    for news in news_list:
        news_dir = f"{config.dump_location}/{news_source}/{label}/{news.news_id}"
        # Skip news whose content was never downloaded.
        news_path = f"{news_dir}/news content.json"
        if not os.path.exists(news_path):
            continue
        # Determine the start date of the tweet cascade.
        cascade_start_date = None
        first_tweets_dir = f"{news_dir}/tweets"
        if os.path.exists(first_tweets_dir):
            first_date_path = f"{first_tweets_dir}/first_date.json"
            if os.path.exists(first_date_path):
                # Reuse the date cached by a previous execution.
                with open(first_date_path) as date_file:
                    date_dict = json.load(date_file)
                    cascade_start_date = date(date_dict['year'],
                                              date_dict['month'],
                                              date_dict['day'])
            else:
                # Scan all first-hop tweets for the earliest creation date.
                for tweet in os.listdir(first_tweets_dir):
                    with open(f"{first_tweets_dir}/{tweet}", "r") as tweet_file:
                        tweet_dict = json.load(tweet_file)
                        tweet_date = parse_tweet_date(tweet_dict['created_at'])
                        if cascade_start_date is None or tweet_date < cascade_start_date:
                            cascade_start_date = tweet_date
                # Cache the date back to the news dir — only when at least one
                # tweet existed. (The original wrote unconditionally and
                # crashed with AttributeError on an empty tweets dir.)
                if cascade_start_date is not None:
                    with open(first_date_path, "w") as date_file:
                        date_file.write(json.dumps({
                            "year": cascade_start_date.year,
                            "month": cascade_start_date.month,
                            "day": cascade_start_date.day
                        }))
        print(cascade_start_date)
        if cascade_start_date is None:
            continue
        # Read retweets of the previous hop.
        if hop_index == 3:
            previous_hop_dir = f"{news_dir}/retweets"
        else:
            previous_hop_dir = f"{news_dir}/retweets_{hop_index - 1}"
        for rt_collection in os.listdir(previous_hop_dir):
            with open(f"{previous_hop_dir}/{rt_collection}", "r") as rt_collection_file:
                rt_collection_dict = json.load(rt_collection_file)
                for rt in rt_collection_dict['retweets']:
                    rt_date = parse_tweet_date(rt['created_at'])
                    # Is the date of the parent tweet in the range limitation?
                    if is_date_in_range(rt_date, cascade_start_date, config):
                        tweet_id_list.append(Tweet(rt['id'], news.news_id,
                                                   news_source, label,
                                                   hop_index))
    if not tweet_id_list:
        print(f"There's no parent tweet in the time range limitation, hop {hop_index} retweet crawling stopped")
        return False
    multiprocess_data_collection(dump_retweets_job, tweet_id_list,
                                 (config, config.twython_connector), config)
    return True
def collect_data(self, choices):
    """Fetch profiles for sampled followers, reusing profiles already on disk.

    If follower_profile_ids.json exists, its IDs are fetched directly.
    Otherwise IDs come from follower_sample_map.json; profiles already
    present in user_profiles/ or rt_user_profiles/ are copied into place
    instead of re-fetched, and the remaining download list is saved to
    follower_download_ids.json before dispatching.
    """
    dump = self.config.dump_location
    follower_profiles_folder = f"{dump}/follower_profiles"
    id_list_path = f"{dump}/follower_profile_ids.json"
    if os.path.exists(id_list_path):
        print("loads IDs to be fetched from follower_profile_ids.json")
        with open(id_list_path, "r") as id_list_file:
            final_follower_ids = json.load(id_list_file)
        # set and create dest dir
        create_dir(follower_profiles_folder)
    else:
        sample_map_path = f"{dump}/follower_sample_map.json"
        if not os.path.exists(sample_map_path):
            # Message fixed: original said "can find file".
            print(f"can't find file {sample_map_path}, please do sampling first")
            return
        print("loads sampled follower IDs from file follower_sample_map.json")
        with open(sample_map_path, "r") as map_file:
            all_sampled_followers = json.load(map_file)
        # set and create dest dir
        print("set and create dest dir")
        create_dir(follower_profiles_folder)
        # check profile duplication
        print("check profile duplication")
        all_follower_ids = []
        for id_group in all_sampled_followers.values():
            all_follower_ids += id_group
        final_follower_ids = []
        skipped_count = 0
        all_follower_length = len(all_follower_ids)
        for idx, uid in enumerate(all_follower_ids):  # renamed: `id` shadowed the builtin
            if idx % 100 == 0:
                print(f"{idx} / {all_follower_length}")
            # Reuse a profile already downloaded into either profile store.
            for existing_store in ("user_profiles", "rt_user_profiles"):
                existing_path = f"{dump}/{existing_store}/{uid}.json"
                if os.path.exists(existing_path):
                    skipped_count += 1
                    shutil.copyfile(existing_path,
                                    f"{follower_profiles_folder}/{uid}.json")
                    break
            else:
                final_follower_ids.append(uid)
        print(f"Total follower profiles to be fetched: {len(final_follower_ids)}, skipped: {skipped_count}")
        # save download list back to file for later resumption
        with open(f"{dump}/follower_download_ids.json", "w") as id_list_file:
            id_list_file.write(json.dumps(final_follower_ids))
    multiprocess_data_collection(
        dump_user_profile_job, final_follower_ids,
        (follower_profiles_folder, self.config.twython_connector), self.config)