Example No. 1
def parts(feed):
    """Returns tuple of feed parts"""

    # url_to_file_name can also be used on strings
    feed_name = fu.baseurl_to_file_name(feed[1])
    region = fu.url_to_file_name(feed[2])
    topic = fu.url_to_file_name(feed[3])

    return (feed_name, region, topic)
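
All of the examples lean on fu.url_to_file_name and fu.baseurl_to_file_name, whose implementations are not shown on this page. Purely as an assumption, slug-style helpers like these could look something like the hypothetical sketch below (not the project's actual code):

import re
from urllib.parse import urlparse


def url_to_file_name(url):
    """Turn a full URL into a filesystem-safe name (assumed behaviour)."""
    return re.sub(r'[^A-Za-z0-9]+', '_', url).strip('_')


def baseurl_to_file_name(url):
    """Turn only the host part of a URL into a filesystem-safe name (assumed behaviour)."""
    return re.sub(r'[^A-Za-z0-9]+', '_', urlparse(url).netloc).strip('_')


print(url_to_file_name('https://example.com/rss/world'))      # https_example_com_rss_world
print(baseurl_to_file_name('https://example.com/rss/world'))  # example_com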
Example No. 2
    def _process(feed):
        hour = tu.ts_to_hour(datetime.now().isoformat())
        output_dir = os.path.join(
            DATA_PATH,
            fu.datestamp_to_path(hour),
        )
        os.makedirs(output_dir, exist_ok=True)

        output_file = fu.url_to_file_name(feed.url) + fu.RAW_FEED
        output_path = os.path.join(output_dir, output_file)
        
        # Fetch the feed before opening the file so a failed request
        # does not leave an empty file behind
        r = requests.get(feed.url, timeout=15)
        r.raise_for_status()

        with open(output_path, 'w') as out:
            out.write(r.text)

        output = {
            'name': fu.baseurl_to_file_name(feed.url),
            'path': output_path,
            'region': feed.region,
            'topic': feed.topic,
            'url': feed.url,
        }

        PRODUCER.poll(0)
        PRODUCER.produce(
            'raw_rss',
            json.dumps(output).encode('utf-8'),
            callback=delivery_report,
        )
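
The PRODUCER.poll(0) / produce(..., callback=delivery_report) pattern matches the confluent-kafka Python client: poll(0) serves delivery callbacks from earlier produce() calls. The delivery_report callback itself is not part of the example; a minimal sketch of a typical callback (an assumption, not the project's actual code) is:

def delivery_report(err, msg):
    """Report per-message delivery success or failure (called by the Kafka producer)."""
    if err is not None:
        print('Delivery failed: {}'.format(err))
    else:
        print('Delivered to {} [{}] at offset {}'.format(
            msg.topic(), msg.partition(), msg.offset()))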
Example No. 3
    def _parse_article(article_path, article):
        data_path = os.path.join(fu.get_mount_folder(), fu.FEED_BASE)
        output_dir = os.path.join(
            data_path,
            article['region'],
            article['topic'],
            tu.ts_to_month(datetime.now().isoformat()),
            article['name'],
        )
        os.makedirs(output_dir, exist_ok=True)

        output_file = fu.url_to_file_name(article['url']) + fu.PARSED_ARTICLE
        output_path = os.path.join(output_dir, output_file)

        parser = ArticleParser()

        title, _, authors, publish_date = parser.parse(
            article_path,
            output_path,
            article['url'],
        )

        return {
            'title': title,
            'authors': authors,
            'publish_date': publish_date or datetime.now().isoformat(),
            'path': output_path,
            'article': article,
        }
Example No. 4
def process_feed(feed):

    feed_name = '{}_{}_{}'.format(
        feed.region,
        feed.topic,
        fu.url_to_file_name(feed.url),
    )
    feed_name = feed_name.replace(' ', '_')

    summary_name = '{}_feed_processing_seconds'.format(feed_name)
    process_feed_time = REGISTRY.get_summary(summary_name, 'Time spent processing feed')

    errors_name = '{}_feed_processing_exceptions'.format(feed_name)
    count_errors = REGISTRY.get_counter(errors_name, 'Exceptions processing feed')

    @process_feed_time.time()
    def _process(feed):
        hour = tu.ts_to_hour(datetime.now().isoformat())
        output_dir = os.path.join(
            DATA_PATH,
            fu.datestamp_to_path(hour),
        )
        os.makedirs(output_dir, exist_ok=True)

        output_file = fu.url_to_file_name(feed.url) + fu.RAW_FEED
        output_path = os.path.join(output_dir, output_file)
        
        # Fetch the feed before opening the file so a failed request
        # does not leave an empty file behind
        r = requests.get(feed.url, timeout=15)
        r.raise_for_status()

        with open(output_path, 'w') as out:
            out.write(r.text)

        output = {
            'name': fu.baseurl_to_file_name(feed.url),
            'path': output_path,
            'region': feed.region,
            'topic': feed.topic,
            'url': feed.url,
        }

        PRODUCER.poll(0)
        PRODUCER.produce(
            'raw_rss',
            json.dumps(output).encode('utf-8'),
            callback=delivery_report,
        )

    try:
        _process(feed)
        return 0
    except Exception as e:
        print(repr(e))
        count_errors.inc()
        return 1
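
REGISTRY.get_summary and REGISTRY.get_counter are project helpers that are not shown here. Judging by the .time() decorator and .inc() calls, they appear to hand back Prometheus-style Summary and Counter metrics; a minimal sketch of such a caching wrapper, assuming prometheus_client and a hypothetical MetricsRegistry class, might look like:

from prometheus_client import Counter, Summary


class MetricsRegistry:
    """Caches metrics by name so repeated lookups return the same collector."""

    def __init__(self):
        self._metrics = {}

    def get_summary(self, name, description):
        if name not in self._metrics:
            self._metrics[name] = Summary(name, description)
        return self._metrics[name]

    def get_counter(self, name, description):
        if name not in self._metrics:
            self._metrics[name] = Counter(name, description)
        return self._metrics[name]


REGISTRY = MetricsRegistry()

The cache matters because prometheus_client refuses to register two collectors under the same name, and process_feed can be called repeatedly for the same feed.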
Example No. 5
    def _get_article(article):
        data_path = os.path.join(fu.get_mount_folder(), fu.FEED_BASE)
        output_dir = os.path.join(
            data_path,
            article['region'],
            article['topic'],
            tu.ts_to_month(datetime.now().isoformat()),
            article['name'],
        )
        os.makedirs(output_dir, exist_ok=True)

        output_file = fu.url_to_file_name(article['url']) + fu.RAW_ARTICLE
        output_path = os.path.join(output_dir, output_file)

        # Short circuit: skip articles that have already been downloaded
        if os.path.exists(output_path):
            return None

        # Fetch the article before opening the file so a failed request does not
        # leave an empty file behind (which would trip the short circuit above on retry)
        res = requests.get(article['url'], timeout=15)
        res.raise_for_status()

        with open(output_path, 'w') as out:
            out.write(res.text)

        return output_path
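
The snippets also depend on the time helpers tu.ts_to_hour and tu.ts_to_month plus fu.datestamp_to_path, none of which appear on this page. The sketch below is purely an assumed interpretation (ISO timestamps truncated to the hour or month, and the hour stamp expanded into nested directories), not the project's real implementation:

import os
from datetime import datetime


def ts_to_hour(ts):
    """Truncate an ISO-8601 timestamp to the hour (assumed behaviour)."""
    return ts[:13]   # e.g. '2024-05-01T17'


def ts_to_month(ts):
    """Truncate an ISO-8601 timestamp to the month (assumed behaviour)."""
    return ts[:7]    # e.g. '2024-05'


def datestamp_to_path(hour_stamp):
    """Expand an hour stamp into nested date directories (assumed behaviour)."""
    date_part, hour_part = hour_stamp.split('T')
    return os.path.join(date_part.replace('-', os.sep), hour_part)


print(datestamp_to_path(ts_to_hour(datetime.now().isoformat())))  # e.g. 2024/05/01/17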