Example #1
def get_latest_fetched_articles(db_root):
    providers = get_subdirectories(db_root)

    last_articles = {}
    last_errors = {}

    # todo: fix that shit
    fetched_date = datetime.today().date()

    for p in providers:
        provider_dir = os.path.join(db_root, p)
        all_days = get_subdirectories(provider_dir)
        last_day = get_latest_day(all_days)

        last_day_dir = os.path.join(provider_dir, last_day)
        all_hours = get_subdirectories(last_day_dir)
        last_hour = get_latest_hour(all_hours)

        fetched_date = make_date_from_string(last_day, last_hour)

        filename = os.path.join(last_day_dir, last_hour, 'articles.json')

        dump = get_provider_dump(filename)

        articles, errors = [], []
        for article in dump['articles']:
            articles.append(ArticleData.from_json(article))

        for error in dump['errors']:
            errors.append(error)

        last_articles[p] = articles
        last_errors[p] = errors

    return fetched_date, last_articles, last_errors
Example #2
def get_latest_fetched_articles(db_root):
    providers = get_subdirectories(db_root)

    last_articles = {}
    last_errors = {}

    # todo: fix that shit
    fetched_date = datetime.today().date()

    for p in providers:
        provider_dir = os.path.join(db_root, p)
        all_days = get_subdirectories(provider_dir)
        last_day = get_latest_day(all_days)

        last_day_dir = os.path.join(provider_dir, last_day)
        all_hours = get_subdirectories(last_day_dir)
        last_hour = get_latest_hour(all_hours)

        fetched_date = make_date_from_string(last_day, last_hour)

        filename = os.path.join(last_day_dir, last_hour, 'articles.json')

        dump = get_provider_dump(filename)

        articles, errors = [], []
        for article in dump['articles']:
            articles.append(ArticleData.from_json(article))

        for error in dump['errors']:
            errors.append(error)

        last_articles[p] = articles
        last_errors[p] = errors

    return fetched_date, last_articles, last_errors
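
Examples #1 and #2 are byte-for-byte identical, and the same load-and-parse step (open articles.json, turn each 'articles' entry into an ArticleData, collect the 'errors' entries) reappears in every snippet below. A minimal sketch of a shared helper that could absorb that step follows; load_batch_dump is a hypothetical name, and ArticleData.from_json, get_subdirectories and the articles.json layout are assumed from the snippets themselves, not defined here.

import json
import os

ARTICLES_FILENAME = 'articles.json'  # same literal the snippets hard-code

def load_batch_dump(batch_dir):
    # Hypothetical helper: read one batch directory's articles.json and return
    # the parsed ArticleData instances together with the raw error entries.
    # ArticleData is assumed to be importable from the project being quoted.
    json_filepath = os.path.join(batch_dir, ARTICLES_FILENAME)
    with open(json_filepath, 'r') as f:
        json_content = json.load(f)
    articles = [ArticleData.from_json(item) for item in json_content['articles']]
    errors = list(json_content.get('errors', []))
    return articles, errors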
Example #3
def get_articles_from_batch(db_root, source_name, date_string, batch_time):
    json_file = os.path.join(db_root, source_name, date_string, batch_time,
                             'articles.json')
    with open(json_file, 'r') as f:
        json_content = json.load(f)
        articles = [
            ArticleData.from_json(json_string)
            for json_string in json_content['articles']
        ]
        return articles
Example #4
def get_articles_per_batch(db_root, source_name, date_string):
    path = os.path.join(db_root, source_name, date_string)

    all_batch_times = get_subdirectories(path)
    all_batches = []
    for batch_time in all_batch_times:
        json_file = os.path.join(path, batch_time, 'articles.json')
        with open(json_file, 'r') as f:
            json_content = json.load(f)
            articles = [ArticleData.from_json(json_string) for json_string in json_content['articles']]
            all_batches.append((batch_time, articles))

    all_batches.sort(key=lambda x: x[0])
    return all_batches
Example #5
    def get_batch_content(self, date_string, batch_time_string):
        """
        Returns the data saved for a specific batch
        """
        batch_dir = os.path.join(self.directory, date_string, batch_time_string)
        if os.path.exists(batch_dir):
            json_filepath = os.path.join(batch_dir, ARTICLES_FILENAME)
            with open(json_filepath, 'r') as f:
                json_content = json.load(f)
                articles = [ArticleData.from_json(json_string) for json_string in json_content['articles']]
                articles.sort(key=lambda art: art.url)
                n_errors = len(json_content['errors'])
                return articles, n_errors
        else:
            raise NonExistentBatchError(self.name, date_string, batch_time_string)
Example #6
def get_articles_per_batch(db_root, source_name, date_string):
    path = os.path.join(db_root, source_name, date_string)

    all_batch_times = get_subdirectories(path)
    all_batches = []
    for batch_time in all_batch_times:
        json_file = os.path.join(path, batch_time, 'articles.json')
        with open(json_file, 'r') as f:
            json_content = json.load(f)
            articles = [
                ArticleData.from_json(json_string)
                for json_string in json_content['articles']
            ]
            all_batches.append((batch_time, articles))

    all_batches.sort(key=lambda x: x[0])
    return all_batches
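
Examples #4 and #6 differ only in how the list comprehension is wrapped. Expressed against the hypothetical load_batch_dump sketch above, the per-batch reader could shrink to the following; this is an illustration under the same assumptions, not the project's actual code.

def get_articles_per_batch(db_root, source_name, date_string):
    # Same behaviour as examples #4/#6, assuming every batch subdirectory
    # holds a well-formed articles.json; error entries are ignored here,
    # as in the originals.
    path = os.path.join(db_root, source_name, date_string)
    all_batches = []
    for batch_time in get_subdirectories(path):
        articles, _ = load_batch_dump(os.path.join(path, batch_time))
        all_batches.append((batch_time, articles))
    all_batches.sort(key=lambda batch: batch[0])
    return all_batches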
Example #7
    def get_batch_articles(self, date_string, batch_time_string):
        """
        Returns the articles saved for a specific batch as it was first fetched.
        This function does not return the articles which might have been reprocessed
        after a (manual) error handling session. You should use the
        get_reprocessed_articles() function for that.

        The function returns a sorted list of ArticleData instances.
        The list is sorted using the article url as key.
        """
        batch_dir = os.path.join(self.directory, date_string, batch_time_string)
        if os.path.exists(batch_dir):
            json_filepath = os.path.join(batch_dir, ARTICLES_FILENAME)
            with open(json_filepath, 'r') as f:
                json_content = json.load(f)
                articles = [ArticleData.from_json(json_string) for json_string in json_content['articles']]
                articles.sort(key=lambda art: art.url)
                return articles
        else:
            raise NonExistentBatchError(self.name, date_string, batch_time_string)
Example #8
    def get_reprocessed_batch_articles(self, date_string, batch_time_string):
        """
        Returns articles fetched during an error handling session.

        Each item in the returned list has the form:
        ((date_string, hour_string), articles)
        """
        batch_dir = os.path.join(self.directory, date_string, batch_time_string)
        if os.path.exists(batch_dir):
            reprocessed_articles = list()
            for reprocessed_data_dir in [i for i in utils.get_subdirectories(batch_dir) if i.startswith(REPROCESSED_DIR_PREFIX)]:
                reprocessed_date, reprocessed_time = reprocessed_data_dir.split("_")[1:]

                json_filepath = os.path.join(batch_dir, reprocessed_data_dir, ARTICLES_FILENAME)
                with open(json_filepath, 'r') as f:
                    json_content = json.load(f)
                    articles = [ArticleData.from_json(json_string) for json_string in json_content['articles']]
                    articles.sort(key=lambda art: art.url)
                    reprocessed_articles.append(((reprocessed_date, reprocessed_time), articles))
            return reprocessed_articles
        else:
            raise NonExistentBatchError(self.name, date_string, batch_time_string)
Example #9
def get_articles_from_batch(db_root, source_name, date_string, batch_time):
    json_file = os.path.join(db_root, source_name, date_string, batch_time, 'articles.json')
    with open(json_file, 'r') as f:
        json_content = json.load(f)
        articles = [ArticleData.from_json(json_string) for json_string in json_content['articles']]
        return articles
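
Examples #3 and #9 are the same single-batch reader with different wrapping, and examples #5, #7 and #8 repeat the open/parse/sort block inside their methods. With the same hypothetical helper, the single-batch variant reduces to a few lines; the directory layout (db_root/source_name/date_string/batch_time/articles.json) is taken from the snippets above.

def get_articles_from_batch(db_root, source_name, date_string, batch_time):
    # Single-batch reader from examples #3/#9, expressed via the shared helper sketch.
    batch_dir = os.path.join(db_root, source_name, date_string, batch_time)
    articles, _ = load_batch_dump(batch_dir)
    return articles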