def collect_retweets(news_list, news_source, label, config: Config):
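    # Read each news item's previously dumped tweet CSV, skip items whose CSV
    # is empty or whose completed dump already exists, then collect retweets
    # for the remaining items in parallel.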
    create_dir("{}/{}/raw".format(config.dump_location, news_source))
    news_list_to_process = []
    empty_data_objects = 0
    for news in news_list:
        news_dir = "{}/{}/{}/tweets/{}.csv".format(config.dump_location,
                                                   news_source, label,
                                                   news.news_id)
        data = pd.read_csv(news_dir)
        raw_dir = "{}/{}/complete/{}.csv".format(config.dump_location,
                                                 news_source, news.news_id)
        if data.empty:
            empty_data_objects += 1
            continue
        if path.exists(raw_dir):
            continue
        news_list_to_process.append(NewsItem(data, raw_dir))
    print('Collecting retweets for ' + str(len(news_list_to_process)) +
          ' news stories.')
    print(
        str(empty_data_objects) + '/' + str(len(news_list)) +
        ' datasets were skipped because they were empty.')
    multiprocess_data_collection(dump_retweets_job, news_list_to_process,
                                 (config, config.twython_connector), config)
Example No. 2
def collect_retweets(news_list, news_source, label, config: Config):
    create_dir(config.dump_location)
    create_dir(f"{config.dump_location}/{news_source}")
    create_dir(f"{config.dump_location}/{news_source}/{label}")

    tweet_id_list = []

    for news in news_list:

        # check whether the news content file exists
        news_path = f"{config.dump_location}/{news_source}/{label}/{news.news_id}/news content.json"

        if not os.path.exists(news_path):
            # print(f"News {news.news_id} is not existed, skip downloading retweets")
            continue

        for tweet_id in news.tweet_ids:

            # check whether the tweet file exists
            tweet_path = f"{config.dump_location}/{news_source}/{label}/{news.news_id}/tweets/{tweet_id}.json"

            if not os.path.exists(tweet_path):
                # print(f"Tweet {tweet_id} is not existed, skip downloading retweets")
                continue

            tweet_id_list.append(
                Tweet(tweet_id, news.news_id, news_source, label))

    multiprocess_data_collection(dump_retweets_job, tweet_id_list,
                                 (config, config.twython_connector), config)
Example No. 3
def collect_retweets(news_list, news_source, label, config: Config):
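    # Variant that builds the full tweet list first and then filters out
    # tweets flagged by _should_skip_retweets (presumably those whose
    # retweets were already dumped) before dispatching the parallel jobs.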
    create_dir(config.dump_location)
    create_dir("{}/{}".format(config.dump_location, news_source))
    create_dir("{}/{}/{}".format(config.dump_location, news_source, label))

    save_dir = "{}/{}/{}".format(config.dump_location, news_source, label)

    tweet_id_list = []

    for news in news_list:
        for tweet_id in news.tweet_ids:
            tweet_id_list.append(Tweet(tweet_id, news.news_id, news_source, label))

    filtered_tweet_id_list = [
        tweet
        for tweet in tweet_id_list
        if not _should_skip_retweets(tweet, get_dump_dir(config, tweet),)
    ]

    multiprocess_data_collection(
        dump_retweets_job,
        filtered_tweet_id_list,
        (config, config.twython_connector),
        config,
    )
Example No. 4
def collect_tweets(news_list, news_source, label, config: Config):
    create_dir(config.dump_location)
    # create dir for tweets
    create_dir("{}/{}".format(config.dump_location, news_source))
    create_dir("{}/{}/{}".format(config.dump_location, news_source, label))
    # create dir for user profiles
    create_dir(f"{config.dump_location}/user_profiles")

    tweet_list = []

    for news in news_list:

        # check whether the news content file exists
        news_path = f"{config.dump_location}/{news_source}/{label}/{news.news_id}/news content.json"

        if not os.path.exists(news_path):
            # print(f"News {news.news_id} is not existed, skip downloading tweets")
            continue

        for tweet_id in news.tweet_ids:
            tweet_list.append(Tweet(tweet_id, news.news_id, news_source, label))

    print(f"Total tweets to be downloaded: {len(tweet_list)}")
    tweet_chunks = equal_chunks(tweet_list, 100)
    multiprocess_data_collection(dump_tweet_information, tweet_chunks, (config, config.twython_connector), config)
Example No. 5
def collect_user_profiles(config: Config, twython_connector: TwythonConnector):
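    # Aggregate the user IDs found under all four dataset folders
    # (politifact/gossipcop x fake/real), then fetch their profiles and
    # recent timeline tweets in parallel.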
    dump_location = config.dump_location

    all_user_ids = set()

    all_user_ids.update(
        get_user_ids_in_folder("{}/politifact/fake".format(dump_location)))
    all_user_ids.update(
        get_user_ids_in_folder("{}/politifact/real".format(dump_location)))
    all_user_ids.update(
        get_user_ids_in_folder("{}/gossipcop/fake".format(dump_location)))
    all_user_ids.update(
        get_user_ids_in_folder("{}/gossipcop/real".format(dump_location)))

    user_profiles_folder = "{}/{}".format(dump_location, "user_profiles")
    user_timeline_tweets_folder = "{}/{}".format(dump_location,
                                                 "user_timeline_tweets")

    create_dir(user_profiles_folder)
    create_dir(user_timeline_tweets_folder)

    multiprocess_data_collection(dump_user_profile_job, all_user_ids,
                                 (user_profiles_folder, twython_connector),
                                 config)
    multiprocess_data_collection(
        dump_user_recent_tweets_job, all_user_ids,
        (user_timeline_tweets_folder, twython_connector), config)
    def collect_data(self, choices):

        use_id_from_profile = True

        # collect user IDs
        if use_id_from_profile:
            all_user_ids = get_user_ids_from_profile(
                self.config.dump_location)  # List object returned

        else:
            all_user_ids = set()
            for choice in choices:
                choice_dir = f"{self.config.dump_location}/{choice['news_source']}/{choice['label']}"
                all_user_ids.update(
                    get_user_ids_in_folder(choice_dir))  # Set object returned

            all_user_ids = list(all_user_ids)

        # create dir to store user followers
        user_followers_folder = f"{self.config.dump_location}/user_followers"
        create_dir(user_followers_folder)

        multiprocess_data_collection(
            dump_user_followers, all_user_ids,
            (user_followers_folder, self.config.twython_connector),
            self.config)
    def collect_data(self, choices):
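        # Collect the accounts each user follows ("following") for every
        # selected news_source/label combination.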
        all_user_ids = set()

        for choice in choices:
            all_user_ids.update(get_user_ids_in_folder(
                "{}/{}/{}".format(self.config.dump_location, choice["news_source"], choice["label"])))

        user_friends_folder = "{}/{}".format(self.config.dump_location, "user_following")
        create_dir(user_friends_folder)

        multiprocess_data_collection(dump_user_following, list(all_user_ids), (user_friends_folder,
                                                                                       self.config.twython_connector),
                                     self.config)
    def collect_data(self, choices):
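        # Collect recent timeline tweets for every user found under the
        # selected news_source/label combinations.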
        all_user_ids = []

        for choice in choices:
            all_user_ids.extend(get_user_ids_in_folder(
                "{}/{}/{}".format(self.config.dump_location, choice["news_source"], choice["label"])))

        user_timeline_tweets_folder = "{}/{}".format(self.config.dump_location, "user_timeline_tweets")
        create_dir(user_timeline_tweets_folder)

        multiprocess_data_collection(dump_user_recent_tweets_job, list(all_user_ids), (user_timeline_tweets_folder,
                                                                                       self.config.twython_connector),
                                     self.config)
Example No. 9
def collect_retweets(news_list, news_source, label, config: Config):
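    # Basic variant: collect retweets for every tweet of every news item,
    # with no existence or duplicate checks.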
    create_dir(config.dump_location)
    create_dir("{}/{}".format(config.dump_location, news_source))
    create_dir("{}/{}/{}".format(config.dump_location, news_source, label))

    save_dir = "{}/{}/{}".format(config.dump_location, news_source, label)

    tweet_id_list = []

    for news in news_list:
        for tweet_id in news.tweet_ids:
            tweet_id_list.append(
                Tweet(tweet_id, news.news_id, news_source, label))

    multiprocess_data_collection(dump_retweets_job, tweet_id_list,
                                 (config, config.twython_connector), config)
    def collect_data(self, choices):
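        # Fetch recent timeline tweets for the user IDs listed in all_user_id.json.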

        if not os.path.exists(f"{self.config.dump_location}/all_user_id.json"):

            print(f"all_user_id.json not found")
            return

        print(f"loads IDs to be fetched from all_user_id.json")

        with open(f"{self.config.dump_location}/all_user_id.json",
                  "r") as id_list_file:
            all_user_ids = json.loads(id_list_file.read())

        # set and create dest dir
        timeline_folder = f"{self.config.dump_location}/user_timeline_tweets"
        create_dir(timeline_folder)

        multiprocess_data_collection(
            dump_user_recent_tweets_job, all_user_ids,
            (timeline_folder, self.config.twython_connector), self.config)
    def collect_data(self, choices):

        # create dir to store user followers
        user_followers_folder = f"{self.config.dump_location}/rt_user_followers"
        create_dir(user_followers_folder)

        user_id_list_path = f"{self.config.dump_location}/rt_user_ids_1.json"  # the file number needs to be set manually

        final_user_id_list = []

        with open(user_id_list_path, "r") as id_file:
            id_list = json.loads(id_file.read())['users']

            for uid in id_list:
                if not os.path.exists(f"{user_followers_folder}/{uid}.json"):
                    final_user_id_list.append(int(uid))

            multiprocess_data_collection(
                dump_user_followers, final_user_id_list,
                (user_followers_folder, self.config.twython_connector),
                self.config)
Example No. 12
def collect_retweets(news_list, news_source, label, config: Config, hop_index):
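    # Collect hop-N retweets (hop_index >= 3): parent tweets are the retweets
    # gathered at the previous hop, restricted to a time window that starts
    # at the cascade's earliest tweet (cached in first_date.json).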

    assert hop_index >= 3

    create_dir(config.dump_location)
    create_dir(f"{config.dump_location}/{news_source}")
    create_dir(f"{config.dump_location}/{news_source}/{label}")

    tweet_id_list = []

    for news in news_list:

        news_dir = f"{config.dump_location}/{news_source}/{label}/{news.news_id}"

        # check whether the news content file exists
        news_path = f"{news_dir}/news content.json"

        if not os.path.exists(news_path):
            # print(f"News {news.news_id} is not existed, skip downloading retweets")
            continue

        # get start date of Tweet cascade
        cascade_start_date = None

        first_tweets_dir = f"{news_dir}/tweets"
        if os.path.exists(first_tweets_dir):

            # look for the start-date record saved by a previous execution
            if os.path.exists(f"{first_tweets_dir}/first_date.json"):
                with open(f"{first_tweets_dir}/first_date.json") as date_file:
                    date_dict = json.loads(date_file.read())
                    cascade_start_date = date(date_dict['year'], date_dict['month'], date_dict['day'])

            # iterate through all tweets to find the date of the earliest tweet
            else:
                for tweet in os.listdir(first_tweets_dir):
                    with open(f"{first_tweets_dir}/{tweet}", "r") as tweet_file:
                        tweet_dict = json.loads(tweet_file.read())
                        tweet_date = parse_tweet_date(tweet_dict['created_at'])

                        if cascade_start_date is None:
                            cascade_start_date = tweet_date
                        elif tweet_date < cascade_start_date:
                            cascade_start_date = tweet_date
                # save start date back to news dir
                with open(f"{first_tweets_dir}/first_date.json", "w") as date_file:
                    date_file.write(json.dumps({
                        "year": cascade_start_date.year,
                        "month": cascade_start_date.month,
                        "day": cascade_start_date.day
                    }))
        
        print(cascade_start_date)
        if cascade_start_date is None:
            continue

        # read RTs of previous hop
        if hop_index == 3:
            previous_hop_dir = f"{news_dir}/retweets"
        else:
            previous_hop_dir = f"{news_dir}/retweets_{hop_index - 1}"

        for rt_collection in os.listdir(previous_hop_dir):
            
            with open(f"{previous_hop_dir}/{rt_collection}", "r") as rt_collection_file:
                rt_collection_dict = json.loads(rt_collection_file.read())

                for rt in rt_collection_dict['retweets']:
                    rt_date = parse_tweet_date(rt['created_at'])

                    # is the parent tweet's date within the allowed time range?
                    if is_date_in_range(rt_date, cascade_start_date, config):
                        tweet_id_list.append(Tweet(rt['id'], news.news_id, news_source, label, hop_index))

    if len(tweet_id_list) == 0:
        print(f"There's no parent tweet in the time range limitation, hop {hop_index} retweet crawling stopped")
        return False
    else:
        multiprocess_data_collection(dump_retweets_job, tweet_id_list, (config, config.twython_connector), config)
        return True
    def collect_data(self, choices):

        if os.path.exists(
                f"{self.config.dump_location}/follower_profile_ids.json"):

            print(f"loads IDs to be fetched from follower_profile_ids.json")

            with open(f"{self.config.dump_location}/follower_profile_ids.json",
                      "r") as id_list_file:
                final_follower_ids = json.loads(id_list_file.read())

            # set and create dest dir
            follower_profiles_folder = f"{self.config.dump_location}/follower_profiles"
            create_dir(follower_profiles_folder)

        else:

            if not os.path.exists(
                    f"{self.config.dump_location}/follower_sample_map.json"):

                print(
                    f"cannot find file {self.config.dump_location}/follower_sample_map.json, please run sampling first"
                )
                return

            print(
                "Loading sampled follower IDs from follower_sample_map.json"
            )

            with open(f"{self.config.dump_location}/follower_sample_map.json",
                      "r") as map_file:
                all_sampled_followers = json.loads(map_file.read())

            # set and create dest dir
            print("set and create dest dir")
            follower_profiles_folder = f"{self.config.dump_location}/follower_profiles"
            create_dir(follower_profiles_folder)

            # check profile duplication
            print("check profile duplication")
            all_follower_ids = []
            for li in all_sampled_followers.values():
                all_follower_ids += li
            final_follower_ids = []
            skipped_count = 0

            all_follower_length = len(all_follower_ids)
            for (idx, id) in enumerate(all_follower_ids):

                if idx % 100 == 0:
                    print(f"{idx} / {all_follower_length}")

                if os.path.exists(
                        f"{self.config.dump_location}/user_profiles/{id}.json"
                ):

                    skipped_count += 1
                    shutil.copyfile(
                        f"{self.config.dump_location}/user_profiles/{id}.json",
                        f"{follower_profiles_folder}/{id}.json")
                    continue

                if os.path.exists(
                        f"{self.config.dump_location}/rt_user_profiles/{id}.json"
                ):

                    skipped_count += 1
                    shutil.copyfile(
                        f"{self.config.dump_location}/rt_user_profiles/{id}.json",
                        f"{follower_profiles_folder}/{id}.json")
                    continue

                final_follower_ids.append(id)

            print(
                f"Total follower profiles to be fetched: {len(final_follower_ids)}, skipped: {skipped_count}"
            )

            # save download list back to file
            with open(
                    f"{self.config.dump_location}/follower_download_ids.json",
                    "w") as id_list_file:
                id_list_file.write(json.dumps(final_follower_ids))

        multiprocess_data_collection(
            dump_user_profile_job, final_follower_ids,
            (follower_profiles_folder, self.config.twython_connector),
            self.config)