Example #1
def collect_user_profiles(config: Config, twython_connector: TwythonConnector):
    dump_location = config.dump_location

    all_user_ids = set()

    all_user_ids.update(
        get_user_ids_in_folder("{}/politifact/fake".format(dump_location)))
    all_user_ids.update(
        get_user_ids_in_folder("{}/politifact/real".format(dump_location)))
    all_user_ids.update(
        get_user_ids_in_folder("{}/gossipcop/fake".format(dump_location)))
    all_user_ids.update(
        get_user_ids_in_folder("{}/gossipcop/real".format(dump_location)))

    user_profiles_folder = "{}/{}".format(dump_location, "user_profiles")
    user_timeline_tweets_folder = "{}/{}".format(dump_location,
                                                 "user_timeline_tweets")

    create_dir(user_profiles_folder)
    create_dir(user_timeline_tweets_folder)

    multiprocess_data_collection(dump_user_profile_job, all_user_ids,
                                 (user_profiles_folder, twython_connector),
                                 config)
    multiprocess_data_collection(
        dump_user_recent_tweets_job, all_user_ids,
        (user_timeline_tweets_folder, twython_connector), config)
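These examples all lean on helpers from the surrounding project (`create_dir`, `get_user_ids_in_folder`, `multiprocess_data_collection`) whose definitions are not shown. A minimal sketch of `create_dir`, assuming it is simply an idempotent wrapper around `os.makedirs`:

import os

def create_dir(dir_name):
    # Sketch of the helper used throughout these examples: create the
    # directory if it is missing, do nothing if it already exists.
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)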
Example #2
    def collect_user_followers(self, users):
        """
        users should be a set that you want to crawl
        """
        create_dir(self.user_followers_dir)
        # users = self.get_own_user_id(self.user_profiles_dir)
        existed_id_set = self.get_own_user_id(self.user_followers_dir)

        new_users_set = users - existed_id_set
        print("We are adding {}/{} to {}".format(len(new_users_set),
                                                 len(users),
                                                 self.user_followers_dir))

        for i, user_id in enumerate(new_users_set):
            try:
                save_path = "{}/{}.json".format(self.user_followers_dir,
                                                user_id)
                followers = self.twython_connector.get_twython_connection(
                    Constants.GET_FOLLOWERS_ID).get_followers_ids(
                        user_id=user_id, count=200)
                json.dump(followers, open(save_path, 'w'))
            except TwythonRateLimitError:
                print("Twython API rate limit exception")
            except TwythonAuthError:
                print("Authorization exception while fetching followers"
                      " of {}".format(user_id))
            except Exception:
                print("Other exception")
            if i % 15 == 0:
                print("{}/{} follower lists fetched".format(
                    i, len(new_users_set)))
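Note that `get_followers_ids` returns a single cursored page, so the snippet above keeps at most `count` follower IDs per user. A sketch of paging through the full list, assuming the standard cursored followers/ids response (`fetch_all_follower_ids` is a hypothetical helper, not part of the project):

def fetch_all_follower_ids(connection, user_id):
    # Hypothetical helper: follow next_cursor until Twitter returns 0,
    # accumulating every follower ID instead of only the first page.
    ids, cursor = [], -1
    while cursor != 0:
        page = connection.get_followers_ids(user_id=user_id,
                                            count=5000, cursor=cursor)
        ids.extend(page["ids"])
        cursor = page["next_cursor"]
    return ids

The same pattern applies to `get_friends_ids` in the user-following example further down.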
Example #3
def dump_tweet_information(tweet_chunk: list, config: Config,
                           twython_connector: TwythonConnector):
    """Collect info and dump info of tweet chunk containing atmost 100 tweets"""

    tweet_list = []
    for tweet in tweet_chunk:
        tweet_list.append(tweet.tweet_id)

    try:
        tweet_objects_map = twython_connector.get_twython_connection(
            Constants.GET_TWEET).lookup_status(id=tweet_list,
                                               include_entities=True,
                                               map=True)['id']
        for tweet in tweet_chunk:
            tweet_object = tweet_objects_map[str(tweet.tweet_id)]
            if tweet_object:
                dump_dir = "{}/{}/{}/{}".format(config.dump_location,
                                                tweet.news_source, tweet.label,
                                                tweet.news_id)
                tweet_dir = "{}/tweets".format(dump_dir)
                create_dir(dump_dir)
                create_dir(tweet_dir)

                json.dump(
                    tweet_object,
                    open("{}/{}.json".format(tweet_dir, tweet.tweet_id), "w"))

    except TwythonRateLimitError:
        print("Twython API rate limit exception")
        logging.exception("Twython API rate limit exception")

    except Exception as ex:
        logging.exception("exception in collecting tweet objects")
        print("exception in collecting tweet objects:", str(ex))
    return None
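For reference, with `map=True` the statuses/lookup response nests the results under an `id` key, mapping each requested tweet ID to its object, or to null when the tweet is deleted or protected; that is why the code indexes `['id']` and then checks `if tweet_object:`. An illustrative shape (an assumption based on the API documentation, not output captured from this code):

response = {
    "id": {
        "1001": {"id": 1001, "text": "an available tweet"},  # found
        "1002": None,  # deleted, protected, or otherwise unavailable
    }
}
tweet_objects_map = response["id"]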
Example #4
def dump_retweets_job(
    tweet: Tweet, config: Config, twython_connector: TwythonConnector
):
    retweets = []
    connection = None

    dump_dir = get_dump_dir(config, tweet)

    if _should_fetch_retweets(tweet, dump_dir):
        try:
            connection = twython_connector.get_twython_connection(Constants.GET_RETWEET)
            retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)

        except TwythonRateLimitError:
            logging.exception(
                "Twython API rate limit exception - tweet id : {}".format(
                    tweet.tweet_id
                )
            )

        except Exception:
            logging.exception(
                "Exception in getting retweets for tweet id %d using connection %s"
                % (tweet.tweet_id, connection)
            )

    retweet_obj = {"retweets": retweets}

    retweet_dir = "{}/retweets".format(dump_dir)
    create_dir(dump_dir)
    create_dir(retweet_dir)
    json.dump(retweet_obj, open("{}/{}.json".format(retweet_dir, tweet.tweet_id), "w"))
Example #5
def dump_retweets_job(tweet: Tweet, config: Config, twython_connector: TwythonConnector):

    hop_index = tweet.hop_index

    news_dir = f"{config.dump_location}/{tweet.news_source}/{tweet.label}/{tweet.news_id}"
    retweet_dir = f"{news_dir}/retweets_{hop_index}"
    retweet_path = f"{retweet_dir}/{tweet.tweet_id}.json"

    if os.path.exists(retweet_path):
        print("[PASSED] news:{}, hop index: {}".format(tweet.news_id, hop_index))
        return
    else:
        print("[NEW] news:{}, hop index: {}".format(tweet.news_id, hop_index))

    retweets = []
    connection = None
    try:
        connection = twython_connector.get_twython_connection("get_retweet")
        retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)

    except TwythonRateLimitError:
        logging.exception(f"Twython API rate limit exception - tweet id : {tweet.tweet_id}")

    except Exception:
        logging.exception(
            "Exception in getting retweets for tweet id %d using connection %s" % (tweet.tweet_id, connection))

    retweet_obj = {"retweets": retweets}

    create_dir(news_dir)
    create_dir(retweet_dir)
    json.dump(retweet_obj, open(retweet_path, "w"))
Example #6
def collect_retweets(news_list, news_source, label, config: Config):
    create_dir("{}/{}/raw".format(config.dump_location, news_source))
    news_list_to_process = []
    empty_data_objects = 0
    for news in news_list:
        news_dir = "{}/{}/{}/tweets/{}.csv".format(config.dump_location,
                                                   news_source, label,
                                                   news.news_id)
        data = pd.read_csv(news_dir)
        raw_dir = "{}/{}/complete/{}.csv".format(config.dump_location,
                                                 news_source, news.news_id)
        if data.empty:
            empty_data_objects += 1
            continue
        if path.exists(raw_dir):
            continue
        else:
            news_list_to_process.append(NewsItem(data, raw_dir))
    print('Collecting retweets for {} news stories.'.format(
        len(news_list_to_process)))
    print('{}/{} datasets were skipped because they were empty.'.format(
        empty_data_objects, len(news_list)))
    multiprocess_data_collection(dump_retweets_job, news_list_to_process,
                                 (config, config.twython_connector), config)
Example #7
    def collect_data(self, choices):

        use_id_from_profile = True

        # collect user IDs
        if use_id_from_profile:
            all_user_ids = get_user_ids_from_profile(
                self.config.dump_location)  # List object returned

        else:
            all_user_ids = set()
            for choice in choices:
                choice_dir = f"{self.config.dump_location}/{choice['news_source']}/{choice['label']}"
                all_user_ids.update(
                    get_user_ids_in_folder(choice_dir))  # Set object returned

            all_user_ids = list(all_user_ids)

        # create dir to store user followers
        user_followers_folder = f"{self.config.dump_location}/user_followers"
        create_dir(user_followers_folder)

        multiprocess_data_collection(
            dump_user_followers, all_user_ids,
            (user_followers_folder, self.config.twython_connector),
            self.config)
Example #8
    def collect_user_recent_tweets(self, users):
        """
        users should be a set that you want to crawl
        """
        create_dir(self.user_timelines_dir)
        # users = self.get_own_user_id(self.user_profiles_dir)
        existed_id_set = self.get_own_user_id(self.user_timelines_dir)

        new_users_set = users - existed_id_set
        print("We are adding {}/{} to user_timelines".format(
            len(new_users_set), len(users)))

        for i, user_id in enumerate(new_users_set):
            try:
                time_lines = self.twython_connector.get_twython_connection(
                    Constants.GET_USER_TWEETS).get_user_timeline(
                        user_id=user_id, count=200)
                json.dump(
                    time_lines,
                    open("{}/{}.json".format(self.user_timelines_dir,
                                             user_id), 'w'))
            except TwythonRateLimitError:
                print("Twython API rate limit exception")
            except TwythonAuthError:
                print("Authorization exception while fetching the timeline"
                      " of {}".format(user_id))
            except Exception:
                print("Other exception")
            if i % 100 == 0:
                print("{} user timelines fetched".format(i))
Example #9
def dump_retweets_job(tweet: Tweet, config: Config,
                      twython_connector: TwythonConnector):
    retweets = []
    connection = None
    try:
        connection = twython_connector.get_twython_connection("get_retweet")
        retweets = connection.get_retweets(id=tweet.tweet_id,
                                           count=100,
                                           cursor=-1)

    except TwythonRateLimitError:
        logging.exception(
            "Twython API rate limit exception - tweet id : {}".format(
                tweet.tweet_id))

    except Exception:
        logging.exception(
            "Exception in getting retweets for tweet id %d using connection %s"
            % (tweet.tweet_id, connection))

    retweet_obj = {"retweets": retweets}

    dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source,
                                    tweet.label, tweet.news_id)
    retweet_dir = "{}/retweets".format(dump_dir)
    create_dir(dump_dir)
    create_dir(retweet_dir)
    json.dump(retweet_obj,
              open("{}/{}.json".format(retweet_dir, tweet.tweet_id), "w"))
Example #10
    def collect_user_followings(self, users):
        create_dir(self.user_following_dir)
        # users = self.get_own_user_id(self.user_profiles_dir)
        existed_id_set = self.get_own_user_id(self.user_following_dir)

        new_users_set = users - existed_id_set
        print("We are adding {}/{} to {}".format(len(new_users_set),
                                                 len(users),
                                                 self.user_following_dir))

        for i, user_id in enumerate(new_users_set):
            try:
                save_path = "{}/{}.json".format(self.user_following_dir,
                                                user_id)
                friends = self.twython_connector.get_twython_connection(
                    Constants.GET_FRIENDS_ID).get_friends_ids(
                        user_id=user_id, count=5000)
                json.dump(friends, open(save_path, 'w'))
            except TwythonRateLimitError:
                print("Twython API rate limit exception")
            except TwythonAuthError:
                print("Authorization exception while fetching the friends"
                      " of {}".format(user_id))
            except Exception:
                print("Other exception")
            if i % 15 == 0:
                print("{}/{} following lists fetched".format(
                    i, len(new_users_set)))
Example #11
def dump_tweet_information(tweet: Tweet, config: Config,
                           twython_connector: TwythonConnector):
    try:
        tweet_object = twython_connector.get_twython_connection(
            Constants.GET_TWEET).show_status(id=tweet.tweet_id)

        if tweet_object:
            dump_dir = "{}/{}/{}/{}".format(config.dump_location,
                                            tweet.news_source, tweet.label,
                                            tweet.news_id)
            tweet_dir = "{}/tweets".format(dump_dir)
            create_dir(dump_dir)
            create_dir(tweet_dir)

            json.dump(
                tweet_object,
                open("{}/{}.json".format(tweet_dir, tweet.tweet_id), "w"))

    except TwythonRateLimitError:
        logging.exception("Twython API rate limit exception")

    except Exception as ex:
        logging.exception("exception in collecting tweet objects")

    return None
Example #12
    def init(self):
        # Create output folder
        create_dir(self.output_path)

        # Save settings
        with open(os.path.join(self.output_path, 'args.txt'), 'w') as outfile:
            json.dump(vars(self.args), outfile, sort_keys=True, indent=4)

        # Copy label map
        copyfile(self.label_map, os.path.join(self.output_path,
                                              self.label_map))

        self.categories = json.load(open(self.label_map, 'r')).get('classes')
        self.categories = [{
            'id': cat['id'] + 1,
            'name': cat['name']
        } for cat in self.categories]
        self.org_categories = copy.deepcopy(self.categories)

        if self.included_classes is None:
            self._check_for_excluded_classes()
        else:
            self._check_for_included_classes()

        # Get included ids
        self.included_ids = [cat['id'] for cat in self.categories]

        if self.remap_labels:
            self._remap_labels()

        if self.args.rearrange_ids:
            self._rearrange_ids()

        self._write_label_map()

        if check_label_names_for_duplicates(self.categories):
            print('\nExiting! Please fix label map.')
            sys.exit(-1)

        self.cat2id = {cat['name']: cat['id'] for cat in self.categories}
        self.id2cat = {cat['id']: cat['name'] for cat in self.categories}

        self.gt_boxes = {
            cat['id']: {
                'name': cat['name'],
                'num_gt_boxes': {}
            }
            for cat in self.categories
        }

        self._fill_lists()

        assert validate_match(
            self.image_sets, self.images,
            self.label), 'Image and label files do not match.'
Example #13
    def convert(self):
        # Get supercategories
        if 'supercategory' not in self.categories[0]:
            for item in self.categories:
                name = item.get('name')
                idx = name.rfind('(')
                item['supercategory'] = name[idx + 1:-1]

        time.sleep(0.1)
        print("\nCreating dataset...")

        # Make annotations output dir
        annotations_dir = os.path.join(self.output_path, "annotations")
        create_dir(annotations_dir)

        for image_set in self.image_sets:
            time.sleep(0.1)
            print("\tCreating {} set...".format(image_set))
            time.sleep(0.1)

            # Make image_set output dir
            image_set_dir = os.path.join(self.output_path, image_set)
            create_dir(image_set_dir)

            images, annotations = self._get_images_and_annotations(image_set)

            json_data = {
                "info": self.info,
                "licenses": self.licenses,
                "images": images,
                "annotations": annotations,
                "categories": self.categories
            }

            time.sleep(0.1)
            print('\tWriting annotations to disk...\n')
            time.sleep(0.1)

            annotation_file = os.path.join(self.output_path, "annotations",
                                           "instances_" + image_set + ".json")
            with open(annotation_file, "w") as jsonfile:
                json.dump(json_data, jsonfile, indent=4)

        for image_set in self.image_sets:
            print('\nTesting dataset {} ...'.format(image_set))

            annotation_file = os.path.join(self.output_path, "annotations",
                                           "instances_" + image_set + ".json")
            self._test_dataset(annotation_file)

        if self.args.show_not_verified:
            warning_not_verified_label_files(self.not_verified_label_files)
Example #14
    def collect_data(self, choices):
        all_user_ids = []

        for choice in choices:
            all_user_ids.extend(get_user_ids_in_folder(
                "{}/{}/{}".format(self.config.dump_location, choice["news_source"], choice["label"])))

        user_timeline_tweets_folder = "{}/{}".format(self.config.dump_location, "user_timeline_tweets")
        create_dir(user_timeline_tweets_folder)

        multiprocess_data_collection(
            dump_user_recent_tweets_job, list(all_user_ids),
            (user_timeline_tweets_folder, self.config.twython_connector),
            self.config)
Example #15
    def __init__(self, root_location, twython_connector):
        self.root_location = root_location
        create_dir(root_location)

        self.twython_connector = twython_connector
        self.user_profiles_dir = os.path.join(self.root_location,
                                              "user_profiles")
        self.user_followers_dir = os.path.join(self.root_location,
                                               "user_followers")
        self.user_following_dir = os.path.join(self.root_location,
                                               "user_following")
        self.user_timelines_dir = os.path.join(self.root_location,
                                               "user_timeline_tweets")
Example #16
    def collect_data(self, choices):
        all_user_ids = set()

        for choice in choices:
            all_user_ids.update(get_user_ids_in_folder(
                "{}/{}/{}".format(self.config.dump_location, choice["news_source"], choice["label"])))

        user_friends_folder = "{}/{}".format(self.config.dump_location, "user_following")
        create_dir(user_friends_folder)

        multiprocess_data_collection(
            dump_user_following, list(all_user_ids),
            (user_friends_folder, self.config.twython_connector),
            self.config)
Example #17
def collect_tweets(news_list, news_source, label, config: Config):
    create_dir(config.dump_location)
    # create dir for tweets
    create_dir("{}/{}".format(config.dump_location, news_source))
    create_dir("{}/{}/{}".format(config.dump_location, news_source, label))
    # create dir for user profiles
    create_dir(f"{config.dump_location}/user_profiles")

    tweet_list = []

    for news in news_list:

        # check whether the news content exists
        news_path = f"{config.dump_location}/{news_source}/{label}/{news.news_id}/news content.json"

        if not os.path.exists(news_path):
            # print(f"News {news.news_id} does not exist; skip downloading tweets")
            continue

        for tweet_id in news.tweet_ids:
            tweet_list.append(Tweet(tweet_id, news.news_id, news_source, label))

    print(f"Total tweets to be downloaded: {len(tweet_list)}")
    tweet_chunks = equal_chunks(tweet_list, 100)
    multiprocess_data_collection(dump_tweet_information, tweet_chunks, (config, config.twython_connector), config)
Example #18
    def collect_tweets2dir(self, dump_dir, tweet_ids):
        search_tweet_set = set(tweet_ids)
        if os.path.exists(dump_dir):
            existed_tweet_set = self.get_tweetID_from_dir(dump_dir)
            search_tweet_set = search_tweet_set - existed_tweet_set
        else:
            existed_tweet_set = set()
            create_dir(dump_dir)
        print("{} tweets will be added, {} searched, {} existed".format(
            len(search_tweet_set), len(tweet_ids), len(existed_tweet_set)))
        from util.util import equal_chunks
        chunks = equal_chunks(list(search_tweet_set), chunk_size=100)
        for tweet_chunk in chunks:
            self.dump_tweet_information(tweet_chunk, dump_dir)
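`equal_chunks` comes from the project's util module and is not shown in these examples. A plausible sketch, assuming it just slices the input into consecutive pieces of at most `chunk_size` items (100 matches the statuses/lookup batch limit):

def equal_chunks(items, chunk_size=100):
    # Sketch: consecutive chunks of at most chunk_size elements;
    # the last chunk may be shorter.
    return [items[i:i + chunk_size]
            for i in range(0, len(items), chunk_size)]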
Example #19
    def _check_for_excluded_classes(self):
        create_dir(self.output_path)

        if self.excluded_classes:
            with open(os.path.join(self.output_path, "excluded_classes.txt"),
                      'w') as file:
                remaining_categories = []

                for cat in self.categories:
                    if not cat['id'] in self.excluded_classes:
                        remaining_categories.append(cat)
                    else:
                        file.write("{}\t: {}\n".format(cat['id'], cat['name']))

            self.categories = remaining_categories

        with open(os.path.join(self.output_path, "included_classes.txt"),
                  'w') as file:
            for cat in self.categories:
                file.write("{}\t: {}\n".format(cat['id'], cat['name']))
Example #20
    def collect_data(self, choices):

        if not os.path.exists(f"{self.config.dump_location}/all_user_id.json"):
            print("all_user_id.json not found")
            return

        print("Loading IDs to be fetched from all_user_id.json")

        with open(f"{self.config.dump_location}/all_user_id.json",
                  "r") as id_list_file:
            all_user_ids = json.loads(id_list_file.read())

        # set and create dest dir
        timeline_folder = f"{self.config.dump_location}/user_timeline_tweets"
        create_dir(timeline_folder)

        multiprocess_data_collection(
            dump_user_recent_tweets_job, all_user_ids,
            (timeline_folder, self.config.twython_connector), self.config)
Example #21
def collect_news_articles(news_list, news_source, label, config: Config):
    create_dir(config.dump_location)
    create_dir("{}/{}".format(config.dump_location, news_source))
    create_dir("{}/{}/{}".format(config.dump_location, news_source, label))

    save_dir = "{}/{}/{}".format(config.dump_location, news_source, label)

    for news in tqdm(news_list):
        create_dir("{}/{}".format(save_dir, news.news_id))
        news_article = crawl_news_article(news.news_url)
        if news_article:
            json.dump(news_article,
                      open("{}/{}/news content.json".format(save_dir, news.news_id), "w", encoding="UTF-8"))
Example #22
    def collect_data(self, choices):

        # create dir to store user followers
        user_followers_folder = f"{self.config.dump_location}/rt_user_followers"
        create_dir(user_followers_folder)

        user_id_list_path = f"{self.config.dump_location}/rt_user_ids_1.json"  # the chunk number in the file name needs to be set manually

        final_user_id_list = []

        with open(user_id_list_path, "r") as id_file:
            id_list = json.loads(id_file.read())['users']

            for uid in id_list:
                if not os.path.exists(f"{user_followers_folder}/{uid}.json"):
                    final_user_id_list.append(int(uid))

            multiprocess_data_collection(
                dump_user_followers, final_user_id_list,
                (user_followers_folder, self.config.twython_connector),
                self.config)
Example #23
def collect_retweets(news_list, news_source, label, config: Config):
    create_dir(config.dump_location)
    create_dir("{}/{}".format(config.dump_location, news_source))
    create_dir("{}/{}/{}".format(config.dump_location, news_source, label))

    save_dir = "{}/{}/{}".format(config.dump_location, news_source, label)

    tweet_id_list = []

    for news in news_list:
        for tweet_id in news.tweet_ids:
            tweet_id_list.append(Tweet(tweet_id, news.news_id, news_source, label))

    filtered_tweet_id_list = [
        tweet
        for tweet in tweet_id_list
        if not _should_skip_retweets(tweet, get_dump_dir(config, tweet),)
    ]

    multiprocess_data_collection(
        dump_retweets_job,
        filtered_tweet_id_list,
        (config, config.twython_connector),
        config,
    )
Example #24
def collect_retweets(news_list, news_source, label, config: Config):
    create_dir(config.dump_location)
    create_dir(f"{config.dump_location}/{news_source}")
    create_dir(f"{config.dump_location}/{news_source}/{label}")

    tweet_id_list = []

    for news in news_list:

        # check whether the news content exists
        news_path = f"{config.dump_location}/{news_source}/{label}/{news.news_id}/news content.json"

        if not os.path.exists(news_path):
            # print(f"News {news.news_id} does not exist; skip downloading retweets")
            continue

        for tweet_id in news.tweet_ids:

            # check whether the tweet exists
            tweet_path = f"{config.dump_location}/{news_source}/{label}/{news.news_id}/tweets/{tweet_id}.json"

            if not os.path.exists(tweet_path):
                # print(f"Tweet {tweet_id} does not exist; skip downloading retweets")
                continue

            tweet_id_list.append(
                Tweet(tweet_id, news.news_id, news_source, label))

    multiprocess_data_collection(dump_retweets_job, tweet_id_list,
                                 (config, config.twython_connector), config)
Example #25
def dump_retweets_job(tweet: Tweet, config: Config,
                      twython_connector: TwythonConnector):

    dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source,
                                    tweet.label, tweet.news_id)
    retweet_dir = "{}/retweets".format(dump_dir)
    retweet_path = "{}/{}.json".format(retweet_dir, tweet.tweet_id)

    if os.path.exists(retweet_path):
        print("[PASSED] source:{}, label:{}, news:{}, retweet: tweet{}".format(
            tweet.news_source, tweet.label, tweet.news_id, tweet.tweet_id))
        return
    else:
        print("[NEW] source:{}, label:{}, news:{}, retweet: tweet{}".format(
            tweet.news_source, tweet.label, tweet.news_id, tweet.tweet_id))

    retweets = []
    connection = None
    try:
        connection = twython_connector.get_twython_connection("get_retweet")
        retweets = connection.get_retweets(id=tweet.tweet_id,
                                           count=100,
                                           cursor=-1)

    except TwythonRateLimitError:
        logging.exception(
            "Twython API rate limit exception - tweet id : {}".format(
                tweet.tweet_id))

    except Exception:
        logging.exception(
            "Exception in getting retweets for tweet id %d using connection %s"
            % (tweet.tweet_id, connection))

    retweet_obj = {"retweets": retweets}

    create_dir(dump_dir)
    create_dir(retweet_dir)
    json.dump(retweet_obj, open(retweet_path, "w"))
Example #26
    def calc_img_statistics(self):
        print("Calculating image statistics ...")

        mean_per_image = []

        for s in self.image_sets:
            time.sleep(0.1)
            print(
                "\tCalculating mean and variance for images in {} ...".format(
                    s))
            time.sleep(0.1)

            for i, image_filename in enumerate(
                    tqdm(self.images[s], desc="\tProgress:", unit="files")):
                image_path = os.path.join(self.image_path, image_filename)
                image_arr = cv2.cvtColor(cv2.imread(image_path),
                                         cv2.COLOR_BGR2RGB)

                mean_per_image.append(np.mean(image_arr, axis=(0, 1)))
                self.img_var.append((np.var(image_arr, axis=(0, 1))))

        self.img_mean = np.mean(mean_per_image, axis=0)
        # Divide by the total number of images; the loop index `i` resets
        # for every image set and is off by one.
        self.img_std = np.sqrt(
            np.divide(np.sum(self.img_var, axis=0), len(self.img_var)))
        print(
            "Mean (RGB) = {}, {}, {}\nStandard deviation = {}, {}, {}".format(
                self.img_mean[0], self.img_mean[1], self.img_mean[2],
                self.img_std[0], self.img_std[1], self.img_std[2]))

        create_dir(self.output_path)
        with open(os.path.join(self.output_path, "image_stats.txt"),
                  'w') as file:
            file.write("Image statistics per RGB Channel")
            file.write("mean = [{}, {}, {}]\n".format(self.img_mean[0],
                                                      self.img_mean[1],
                                                      self.img_mean[2]))
            file.write("std = [{}, {}, {}]\n".format(self.img_std[0],
                                                     self.img_std[1],
                                                     self.img_std[2]))
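A caveat on the statistics above: averaging per-image variances yields the mean within-image variance, which understates the dataset-wide standard deviation whenever per-image means differ. A sketch of an exact, streaming dataset-level computation over the same RGB arrays (a hypothetical alternative, not the project's method):

import numpy as np

def dataset_mean_std(image_arrays):
    # Exact per-channel mean/std over all pixels of all images,
    # accumulated incrementally so only one image is in memory at a time.
    n = 0
    s = np.zeros(3)   # per-channel sum of pixel values
    sq = np.zeros(3)  # per-channel sum of squared pixel values
    for arr in image_arrays:
        pixels = arr.reshape(-1, 3).astype(np.float64)
        n += pixels.shape[0]
        s += pixels.sum(axis=0)
        sq += (pixels ** 2).sum(axis=0)
    mean = s / n
    std = np.sqrt(sq / n - mean ** 2)  # Var[x] = E[x^2] - E[x]^2
    return mean, std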
Example #27
    def dump_tweet_information(self, tweet_chunk, dump_dir):
        """Collect info and dump info of tweet chunk containing atmost 100 tweets"""
        try:
            tweet_objects_map = self.twython_connector.get_twython_connection(
                Constants.GET_TWEET).lookup_status(id=tweet_chunk,
                                                   include_entities=True,
                                                   map=True)['id']
            for tweet_id in tweet_chunk:
                tweet_object = tweet_objects_map[str(tweet_id)]
                if tweet_object:
                    create_dir(dump_dir)

                    json.dump(
                        tweet_object,
                        open("{}/{}.json".format(dump_dir, tweet_id), "w"))

        except TwythonRateLimitError:
            logging.exception("Twython API rate limit exception")

        except Exception as ex:
            logging.exception("exception in collecting tweet objects")

        return None
Example #28
def dump_user_recent_tweets_job(user_id, save_location, twython_connector: TwythonConnector):
    profile_info = None

    # Fetch and save user information if the file is not already present
    if not Path("{}/{}/{}.json".format(save_location, user_id[2], user_id[0])).is_file():
        create_dir("{}/{}".format(save_location, user_id[2]))
        try:
            profile_info = twython_connector.get_twython_connection(
                GET_USER_TWEETS).get_user_timeline(user_id=user_id[0],
                                                   count=200,
                                                   exclude_replies=False,
                                                   include_rts=True,
                                                   max_id=user_id[1])
        except TwythonRateLimitError:
            logging.exception("Twython API rate limit exception")

        finally:
            # profile_info is still None when the request failed, so guard
            # against len(None) before dumping.
            if profile_info:
                logging.info("found {} tweets in timeline for user {}".format(
                    len(profile_info), user_id[0]))
                json.dump(profile_info,
                          open("{}/{}/{}.json".format(save_location,
                                                      user_id[2],
                                                      user_id[0]), "w"))
            else:
                logging.warning(
                    "couldn't retrieve the timeline of user {}".format(
                        user_id[0]))
    else:
        logging.info("file for user and story already exists")
Example #29
    def collect_user_profiles(self, users):
        dump_location = self.user_profiles_dir
        create_dir(dump_location)

        existed_id_set = self.get_own_user_id(dump_location)
        new_users_set = users - existed_id_set
        print("existed: {}, found: {}, add: {}".format(len(existed_id_set),
                                                       len(users),
                                                       len(new_users_set)))
        print("We are adding {} user profiles to {}".format(
            len(new_users_set), dump_location))

        user_chunks = equal_chunks(list(new_users_set), 100)

        number = 0
        for chunk in user_chunks:
            try:
                user_objects_map = self.twython_connector.get_twython_connection(
                    Constants.GET_USER).lookup_user(user_id=chunk,
                                                    include_entities=True)
                for user_object in user_objects_map:
                    json.dump(
                        user_object,
                        open(
                            "{}/{}.json".format(dump_location,
                                                user_object["id"]), "w"))

                    number += 1
                print("{} has been added".format(number))
            # TwythonRateLimitError subclasses TwythonError, so it must be
            # caught first; otherwise its handler is unreachable.
            except TwythonRateLimitError:
                print("Twython API rate limit exception")
            except TwythonError:
                print("TwythonError")
            except Exception:
                print("Exception")
        print("Finished")
Example #30
def collect_tweets(news_list, news_source, label, config: Config):
    create_dir(config.dump_location)
    create_dir("{}/{}".format(config.dump_location, news_source))
    create_dir("{}/{}/tweets".format(config.dump_location, news_source))

    for news in news_list:
        print('Downloading ' + news_source + ' ' + label + ' ' + news.news_id +
              ' tweets')
        create_dir("{}/{}/{}/{}".format(config.dump_location, news_source,
                                        label, news.news_id))
        data = pd.DataFrame(columns=features)
        news_dir = "{}/{}/tweets/{}.csv".format(config.dump_location,
                                                news_source, news.news_id)
        if path.exists(news_dir):
            continue
        else:
            for tweet in t.hydrate(news.tweet_ids):
                # DataFrame.append was removed in pandas 2.0; build a
                # one-row frame per tweet and concatenate instead.
                row = extract_tweet_features(tweet, label)
                data = pd.concat([data, pd.DataFrame([row])],
                                 ignore_index=True)
            data.to_csv(news_dir, index=False)
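In this last example, `t`, `features`, and `extract_tweet_features` are defined elsewhere in the module; the `t.hydrate(...)` call matches the twarc (v1) client API, which is an assumption. A minimal setup sketch with placeholder credentials:

from twarc import Twarc

# Assumption: `t` is a twarc v1 client created at module level;
# replace the placeholder strings with real API credentials.
t = Twarc("consumer_key", "consumer_secret",
          "access_token", "access_token_secret")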