Example #1
def task_compile_extended_asset_info():
    assets = list(config.mongo_db.asset_extended_info.find({'info_status': 'needfetch'}))
    asset_info_urls = []

    def asset_fetch_complete_hook(urls_data):
        logger.info("Enhanced asset info fetching complete. %s unique URLs fetched. Processing..." % len(urls_data))
        for asset in assets:
            logger.debug("Looking at asset %s: %s" % (asset, asset['info_url']))
            if asset['info_url']:
                info_url = ('http://' + asset['info_url']) \
                    if not asset['info_url'].startswith('http://') and not asset['info_url'].startswith('https://') else asset['info_url']
                assert info_url in urls_data
                if not urls_data[info_url][0]:  # request was not successful
                    inc_fetch_retry(asset, max_retry=ASSET_MAX_RETRY, errors=[urls_data[info_url][1]])
                    logger.warn("Fetch for asset at %s not successful: %s (try %i of %i)" % (
                        info_url, urls_data[info_url][1], asset['fetch_info_retry'], ASSET_MAX_RETRY))
                else:
                    result = process_asset_info(asset, urls_data[info_url][1])
                    if not result[0]:
                        logger.info("Processing for asset %s at %s not successful: %s" % (asset['asset'], info_url, result[1]))
                    else:
                        logger.debug("Processing for asset %s at %s successful" % (asset['asset'], info_url))

    # compose the list of info URLs across all assets, then fetch them
    for asset in assets:
        if not asset['info_url']:
            continue

        if asset.get('disabled', False):
            logger.info("ExtendedAssetInfo: Skipping disabled asset %s" % asset['asset'])
            continue

        # may or may not end with .json. may or may not start with http:// or https://
        asset_info_urls.append((
            ('http://' + asset['info_url'])
            if not asset['info_url'].startswith('http://') and not asset['info_url'].startswith('https://')
            else asset['info_url']))

    asset_info_urls_str = ', '.join(asset_info_urls)
    asset_info_urls_str = (
        (asset_info_urls_str[:2000] + ' ...')
        if len(asset_info_urls_str) > 2000
        else asset_info_urls_str)  # truncate if necessary
    if len(asset_info_urls):
        logger.info('Fetching enhanced asset info for %i assets: %s' % (len(asset_info_urls), asset_info_urls_str))
        util.stream_fetch(
            asset_info_urls, asset_fetch_complete_hook,
            fetch_timeout=10, max_fetch_size=4 * 1024, urls_group_size=20, urls_group_time_spacing=20,
            per_request_complete_callback=lambda url, data: logger.debug("Asset info URL %s retrieved, result: %s" % (url, data)))

    start_task(task_compile_extended_asset_info, delay=60 * 60)  # call again in 60 minutes
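Both the completion hook and the URL-composition loop above repeat the same protocol check on the stored info_url. A small helper such as the one below could factor that out; normalize_info_url is a hypothetical name, not something defined in the original module, and it simply mirrors the inline conditional expressions used above.

def normalize_info_url(info_url):
    # Prepend 'http://' only when the stored URL carries no protocol prefix,
    # exactly as the inline conditionals above do.
    if not info_url.startswith('http://') and not info_url.startswith('https://'):
        return 'http://' + info_url
    return info_url

With such a helper, both asset_fetch_complete_hook and the composition loop would call normalize_info_url(asset['info_url']) instead of repeating the conditional expression.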
Example #2
def task_compile_extended_feed_info():
    feeds = list(config.mongo_db.feeds.find({'info_status': 'needfetch'}))
    feed_info_urls = []

    def inc_fetch_retry(feed,
                        max_retry=FEED_MAX_RETRY,
                        new_status='error',
                        errors=[]):
        feed['fetch_info_retry'] += 1
        feed['errors'] = errors
        if feed['fetch_info_retry'] == max_retry:
            feed['info_status'] = new_status
        config.mongo_db.feeds.save(feed)

    def process_feed_info(feed, info_data):
        # sanity check
        assert feed['info_status'] == 'needfetch'
        assert 'info_url' in feed
        assert util.is_valid_url(
            feed['info_url'],
            allow_no_protocol=True)  # already validated in the fetch

        errors = util.is_valid_json(info_data, config.FEED_SCHEMA)

        if not isinstance(info_data, dict) or 'address' not in info_data:
            errors.append('Invalid data format')
        elif feed['source'] != info_data['address']:
            errors.append('Invalid address')

        if len(errors) > 0:
            inc_fetch_retry(feed, new_status='invalid', errors=errors)
            return (False, errors)

        feed['info_status'] = 'valid'

        # fetch any associated images...
        # TODO: parallelize this 2nd-level feed image fetching ... (e.g. just compose a list here and process it later on)
        if 'image' in info_data:
            info_data['valid_image'] = util.fetch_image(
                info_data['image'],
                config.SUBDIR_FEED_IMAGES,
                feed['source'] + '_topic',
                fetch_timeout=5)
        if 'operator' in info_data and 'image' in info_data['operator']:
            info_data['operator']['valid_image'] = util.fetch_image(
                info_data['operator']['image'],
                config.SUBDIR_FEED_IMAGES,
                feed['source'] + '_owner',
                fetch_timeout=5)
        if 'targets' in info_data:
            for i in range(len(info_data['targets'])):
                if 'image' in info_data['targets'][i]:
                    image_name = feed['source'] + '_tv_' + str(
                        info_data['targets'][i]['value'])
                    info_data['targets'][i]['valid_image'] = util.fetch_image(
                        info_data['targets'][i]['image'],
                        config.SUBDIR_FEED_IMAGES,
                        image_name,
                        fetch_timeout=5)

        feed['info_data'] = sanitize_json_data(info_data)
        config.mongo_db.feeds.save(feed)
        return (True, None)

    def feed_fetch_complete_hook(urls_data):
        logger.info(
            "Enhanced feed info fetching complete. %s unique URLs fetched. Processing..."
            % len(urls_data))
        feeds = config.mongo_db.feeds.find({'info_status': 'needfetch'})
        for feed in feeds:
            #logger.debug("Looking at feed %s: %s" % (feed, feed['info_url']))
            if feed['info_url']:
                info_url = ('http://' + feed['info_url']) \
                    if not feed['info_url'].startswith('http://') and not feed['info_url'].startswith('https://') else feed['info_url']
                if info_url not in urls_data:
                    logger.warn(
                        "URL %s not properly fetched (not one of %i entries in urls_data), skipping..."
                        % (info_url, len(urls_data)))
                    continue
                assert info_url in urls_data
                if not urls_data[info_url][0]:  # request was not successful
                    inc_fetch_retry(feed,
                                    max_retry=FEED_MAX_RETRY,
                                    errors=[urls_data[info_url][1]])
                    logger.warn(
                        "Fetch for feed at %s not successful: %s (try %i of %i)"
                        % (info_url, urls_data[info_url][1],
                           feed['fetch_info_retry'], FEED_MAX_RETRY))
                else:
                    result = process_feed_info(feed, urls_data[info_url][1])
                    if not result[0]:
                        logger.info(
                            "Processing for feed at %s not successful: %s" %
                            (info_url, result[1]))
                    else:
                        logger.info("Processing for feed at %s successful" %
                                    info_url)

    # compose the list of info URLs across all feeds, then fetch them
    for feed in feeds:
        assert feed['info_url']
        feed_info_urls.append(
            ('http://' + feed['info_url'])
            if not feed['info_url'].startswith('http://')
            and not feed['info_url'].startswith('https://')
            else feed['info_url'])
    feed_info_urls_str = ', '.join(feed_info_urls)
    feed_info_urls_str = (
        (feed_info_urls_str[:2000] + ' ...')
        if len(feed_info_urls_str) > 2000
        else feed_info_urls_str)  # truncate if necessary
    if len(feed_info_urls):
        logger.info('Fetching enhanced feed info for %i feeds: %s' %
                    (len(feed_info_urls), feed_info_urls_str))
        util.stream_fetch(feed_info_urls,
                          feed_fetch_complete_hook,
                          fetch_timeout=10,
                          max_fetch_size=4 * 1024,
                          urls_group_size=20,
                          urls_group_time_spacing=20,
                          per_request_complete_callback=lambda url, data:
                          logger.debug("Feed at %s retrieved, result: %s" %
                                       (url, data)))

    start_task(task_compile_extended_feed_info,
               delay=60 * 5)  # call again in 5 minutes
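Judging from how feed_fetch_complete_hook indexes urls_data, util.stream_fetch appears to hand the completion hook a dict keyed by normalized URL whose values are (success, payload_or_error) pairs: the parsed JSON body on success, or an error message on failure. The snippet below only illustrates that assumed shape; the URLs and payloads are fabricated, and the exact contract of util.stream_fetch is not shown in these examples.

# Fabricated illustration of the assumed urls_data mapping:
# normalized URL -> (success flag, parsed JSON body or error message).
urls_data = {
    'http://example.com/feed.json': (True, {'address': '1ExampleFeedAddress', 'title': 'Example feed'}),
    'http://example.org/broken.json': (False, 'timed out after 10 seconds'),
}

for url, (success, payload_or_error) in urls_data.items():
    if success:
        print("fetched %s -> %r" % (url, payload_or_error))
    else:
        print("fetch of %s failed: %s" % (url, payload_or_error))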
Example #3
def task_compile_extended_feed_info():
    feeds = list(config.mongo_db.feeds.find({"info_status": "needfetch"}))
    feed_info_urls = []

    def inc_fetch_retry(feed, max_retry=FEED_MAX_RETRY, new_status="error", errors=[]):
        feed["fetch_info_retry"] += 1
        feed["errors"] = errors
        if feed["fetch_info_retry"] == max_retry:
            feed["info_status"] = new_status
        config.mongo_db.feeds.save(feed)

    def process_feed_info(feed, info_data):
        # sanity check
        assert feed["info_status"] == "needfetch"
        assert "info_url" in feed
        assert util.is_valid_url(feed["info_url"], allow_no_protocol=True)  # already validated in the fetch

        errors = util.is_valid_json(info_data, config.FEED_SCHEMA)

        if not isinstance(info_data, dict) or "address" not in info_data:
            errors.append("Invalid data format")
        elif feed["source"] != info_data["address"]:
            errors.append("Invalid address")

        if len(errors) > 0:
            inc_fetch_retry(feed, new_status="invalid", errors=errors)
            return (False, errors)

        feed["info_status"] = "valid"

        # fetch any associated images...
        # TODO: parallelize this 2nd-level feed image fetching ... (e.g. just compose a list here and process it later on)
        if "image" in info_data:
            info_data["valid_image"] = util.fetch_image(
                info_data["image"], config.SUBDIR_FEED_IMAGES, feed["source"] + "_topic", fetch_timeout=5
            )
        if "operator" in info_data and "image" in info_data["operator"]:
            info_data["operator"]["valid_image"] = util.fetch_image(
                info_data["operator"]["image"], config.SUBDIR_FEED_IMAGES, feed["source"] + "_owner", fetch_timeout=5
            )
        if "targets" in info_data:
            for i in range(len(info_data["targets"])):
                if "image" in info_data["targets"][i]:
                    image_name = feed["source"] + "_tv_" + str(info_data["targets"][i]["value"])
                    info_data["targets"][i]["valid_image"] = util.fetch_image(
                        info_data["targets"][i]["image"], config.SUBDIR_FEED_IMAGES, image_name, fetch_timeout=5
                    )

        feed["info_data"] = sanitize_json_data(info_data)
        config.mongo_db.feeds.save(feed)
        return (True, None)

    def feed_fetch_complete_hook(urls_data):
        logger.info("Enhanced feed info fetching complete. %s unique URLs fetched. Processing..." % len(urls_data))
        feeds = config.mongo_db.feeds.find({"info_status": "needfetch"})
        for feed in feeds:
            # logger.debug("Looking at feed %s: %s" % (feed, feed['info_url']))
            if feed["info_url"]:
                info_url = (
                    ("http://" + feed["info_url"])
                    if not feed["info_url"].startswith("http://") and not feed["info_url"].startswith("https://")
                    else feed["info_url"]
                )
                if info_url not in urls_data:
                    logger.warn(
                        "URL %s not properly fetched (not one of %i entries in urls_data), skipping..."
                        % (info_url, len(urls_data))
                    )
                    continue
                assert info_url in urls_data
                if not urls_data[info_url][0]:  # request was not successful
                    inc_fetch_retry(feed, max_retry=FEED_MAX_RETRY, errors=[urls_data[info_url][1]])
                    logger.warn(
                        "Fetch for feed at %s not successful: %s (try %i of %i)"
                        % (info_url, urls_data[info_url][1], feed["fetch_info_retry"], FEED_MAX_RETRY)
                    )
                else:
                    result = process_feed_info(feed, urls_data[info_url][1])
                    if not result[0]:
                        logger.info("Processing for feed at %s not successful: %s" % (info_url, result[1]))
                    else:
                        logger.info("Processing for feed at %s successful" % info_url)

    # compose the list of info URLs across all feeds, then fetch them
    for feed in feeds:
        assert feed["info_url"]
        feed_info_urls.append(
            ("http://" + feed["info_url"])
            if not feed["info_url"].startswith("http://") and not feed["info_url"].startswith("https://")
            else feed["info_url"]
        )
    feed_info_urls_str = ", ".join(feed_info_urls)
    feed_info_urls_str = (
        (feed_info_urls_str[:2000] + " ...") if len(feed_info_urls_str) > 2000 else feed_info_urls_str
    )  # truncate if necessary
    if len(feed_info_urls):
        logger.info("Fetching enhanced feed info for %i feeds: %s" % (len(feed_info_urls), feed_info_urls_str))
        util.stream_fetch(
            feed_info_urls,
            feed_fetch_complete_hook,
            fetch_timeout=10,
            max_fetch_size=4 * 1024,
            urls_group_size=20,
            urls_group_time_spacing=20,
            per_request_complete_callback=lambda url, data: logger.debug(
                "Feed at %s retrieved, result: %s" % (url, data)
            ),
        )

    start_task(task_compile_extended_feed_info, delay=60 * 5)  # call again in 5 minutes
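Each task above reschedules itself through start_task, whose definition lies outside these snippets; the inline comments ("call again in 60 minutes" next to delay=60 * 60, "call again in 5 minutes" next to delay=60 * 5) suggest the delay is given in seconds. A minimal stand-in under that assumption, using a one-shot daemon timer, is sketched below; the real scheduler in the surrounding application may work quite differently.

import threading

def start_task(func, delay=0):
    # Hypothetical stand-in: run func once after `delay` seconds on a daemon
    # timer thread, so a task can reschedule itself by calling start_task again.
    timer = threading.Timer(delay, func)
    timer.daemon = True
    timer.start()
    return timer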
Example #4
def fetch_all_feed_info(db):
    feeds = list(db.feeds.find({'info_status': 'needfetch'}))
    feed_info_urls = []

    def feed_fetch_complete_hook(urls_data):
        logger.info(
            "Enhanced feed info fetching complete. %s unique URLs fetched. Processing..."
            % len(urls_data))
        feeds = db.feeds.find({'info_status': 'needfetch'})
        for feed in feeds:
            #logger.debug("Looking at feed %s: %s" % (feed, feed['info_url']))
            if feed['info_url']:
                info_url = ('http://' + feed['info_url']) \
                    if not feed['info_url'].startswith('http://') and not feed['info_url'].startswith('https://') else feed['info_url']
                if info_url not in urls_data:
                    logger.warn(
                        "URL %s not properly fetched (not one of %i entries in urls_data), skipping..."
                        % (info_url, len(urls_data)))
                    continue
                assert info_url in urls_data
                if not urls_data[info_url][0]:  #request was not successful
                    inc_fetch_retry(db,
                                    feed,
                                    max_retry=FEED_MAX_RETRY,
                                    errors=[urls_data[info_url][1]])
                    logger.warn(
                        "Fetch for feed at %s not successful: %s (try %i of %i)"
                        % (info_url, urls_data[info_url][1],
                           feed['fetch_info_retry'], FEED_MAX_RETRY))
                else:
                    result = process_feed_info(db, feed,
                                               urls_data[info_url][1])
                    if not result[0]:
                        logger.info(
                            "Processing for feed at %s not successful: %s" %
                            (info_url, result[1]))
                    else:
                        logger.info("Processing for feed at %s successful" %
                                    info_url)

    # compose the list of info URLs across all feeds, then fetch them
    for feed in feeds:
        assert feed['info_url']
        feed_info_urls.append(
            ('http://' + feed['info_url'])
            if not feed['info_url'].startswith('http://')
            and not feed['info_url'].startswith('https://')
            else feed['info_url'])
    feed_info_urls_str = ', '.join(feed_info_urls)
    feed_info_urls_str = (
        (feed_info_urls_str[:2000] + ' ...')
        if len(feed_info_urls_str) > 2000
        else feed_info_urls_str)  # truncate if necessary
    if len(feed_info_urls):
        logger.info('Fetching enhanced feed info for %i feeds: %s' %
                    (len(feed_info_urls), feed_info_urls_str))
        util.stream_fetch(feed_info_urls,
                          feed_fetch_complete_hook,
                          fetch_timeout=10,
                          max_fetch_size=4 * 1024,
                          urls_group_size=20,
                          urls_group_time_spacing=20,
                          per_request_complete_callback=lambda url, data:
                          logger.debug("Feed at %s retrieved, result: %s" %
                                       (url, data)))