def task_compile_extended_asset_info():
    """Fetch extended asset info for all assets marked 'needfetch', process the results, and reschedule itself."""
    assets = list(config.mongo_db.asset_extended_info.find({'info_status': 'needfetch'}))
    asset_info_urls = []

    def asset_fetch_complete_hook(urls_data):
        logger.info("Enhanced asset info fetching complete. %s unique URLs fetched. Processing..." % len(urls_data))
        for asset in assets:
            logger.debug("Looking at asset %s: %s" % (asset, asset['info_url']))
            if asset['info_url']:
                info_url = ('http://' + asset['info_url']) \
                    if not asset['info_url'].startswith('http://') and not asset['info_url'].startswith('https://') \
                    else asset['info_url']
                assert info_url in urls_data
                if not urls_data[info_url][0]:  # request was not successful
                    inc_fetch_retry(asset, max_retry=ASSET_MAX_RETRY, errors=[urls_data[info_url][1]])
                    logger.warn("Fetch for asset at %s not successful: %s (try %i of %i)" % (
                        info_url, urls_data[info_url][1], asset['fetch_info_retry'], ASSET_MAX_RETRY))
                else:
                    result = process_asset_info(asset, urls_data[info_url][1])
                    if not result[0]:
                        logger.info("Processing for asset %s at %s not successful: %s" % (asset['asset'], info_url, result[1]))
                    else:
                        logger.debug("Processing for asset %s at %s successful" % (asset['asset'], info_url))

    # compose and fetch all info URLs for all assets that have them
    for asset in assets:
        if not asset['info_url']:
            continue
        if asset.get('disabled', False):
            logger.info("ExtendedAssetInfo: Skipping disabled asset %s" % asset['asset'])
            continue
        # may or may not end with .json; may or may not start with http:// or https://
        asset_info_urls.append(
            ('http://' + asset['info_url'])
            if not asset['info_url'].startswith('http://') and not asset['info_url'].startswith('https://')
            else asset['info_url'])

    asset_info_urls_str = ', '.join(asset_info_urls)
    asset_info_urls_str = (asset_info_urls_str[:2000] + ' ...') \
        if len(asset_info_urls_str) > 2000 else asset_info_urls_str  # truncate if necessary
    if len(asset_info_urls):
        logger.info('Fetching enhanced asset info for %i assets: %s' % (len(asset_info_urls), asset_info_urls_str))
        util.stream_fetch(
            asset_info_urls, asset_fetch_complete_hook,
            fetch_timeout=10, max_fetch_size=4 * 1024, urls_group_size=20, urls_group_time_spacing=20,
            per_request_complete_callback=lambda url, data: logger.debug("Asset info URL %s retrieved, result: %s" % (url, data)))

    start_task(task_compile_extended_asset_info, delay=60 * 60)  # call again in 60 minutes
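# The protocol-prefixing conditional above ("http://" + url unless it already starts with
# http:// or https://) is repeated for every asset URL here and for every feed URL below.
# The helper that follows is a hypothetical sketch (it is not defined or used in the original
# module) showing the same normalization in one place, should the duplication ever be factored out.
def _normalize_info_url(info_url):
    """Prefix 'http://' onto an info URL that carries no protocol; leave http/https URLs untouched."""
    if info_url.startswith('http://') or info_url.startswith('https://'):
        return info_url
    return 'http://' + info_url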
def task_compile_extended_feed_info():
    """Fetch extended feed info for all feeds marked 'needfetch', validate and process the results, and reschedule itself."""
    feeds = list(config.mongo_db.feeds.find({'info_status': 'needfetch'}))
    feed_info_urls = []

    def inc_fetch_retry(feed, max_retry=FEED_MAX_RETRY, new_status='error', errors=[]):
        feed['fetch_info_retry'] += 1
        feed['errors'] = errors
        if feed['fetch_info_retry'] == max_retry:
            feed['info_status'] = new_status
        config.mongo_db.feeds.save(feed)

    def process_feed_info(feed, info_data):
        # sanity check
        assert feed['info_status'] == 'needfetch'
        assert 'info_url' in feed
        assert util.is_valid_url(feed['info_url'], allow_no_protocol=True)  # already validated in the fetch

        errors = util.is_valid_json(info_data, config.FEED_SCHEMA)
        if not isinstance(info_data, dict) or 'address' not in info_data:
            errors.append('Invalid data format')
        elif feed['source'] != info_data['address']:
            errors.append('Invalid address')
        if len(errors) > 0:
            inc_fetch_retry(feed, new_status='invalid', errors=errors)
            return (False, errors)

        feed['info_status'] = 'valid'

        # fetch any associated images...
        # TODO: parallelize this 2nd-level feed image fetching (e.g. just compose a list here and process it later on)
        if 'image' in info_data:
            info_data['valid_image'] = util.fetch_image(
                info_data['image'], config.SUBDIR_FEED_IMAGES, feed['source'] + '_topic', fetch_timeout=5)
        if 'operator' in info_data and 'image' in info_data['operator']:
            info_data['operator']['valid_image'] = util.fetch_image(
                info_data['operator']['image'], config.SUBDIR_FEED_IMAGES, feed['source'] + '_owner', fetch_timeout=5)
        if 'targets' in info_data:
            for i in range(len(info_data['targets'])):
                if 'image' in info_data['targets'][i]:
                    image_name = feed['source'] + '_tv_' + str(info_data['targets'][i]['value'])
                    info_data['targets'][i]['valid_image'] = util.fetch_image(
                        info_data['targets'][i]['image'], config.SUBDIR_FEED_IMAGES, image_name, fetch_timeout=5)

        feed['info_data'] = sanitize_json_data(info_data)
        config.mongo_db.feeds.save(feed)
        return (True, None)

    def feed_fetch_complete_hook(urls_data):
        logger.info("Enhanced feed info fetching complete. %s unique URLs fetched. Processing..." % len(urls_data))
        feeds = config.mongo_db.feeds.find({'info_status': 'needfetch'})
        for feed in feeds:
            # logger.debug("Looking at feed %s: %s" % (feed, feed['info_url']))
            if feed['info_url']:
                info_url = ('http://' + feed['info_url']) \
                    if not feed['info_url'].startswith('http://') and not feed['info_url'].startswith('https://') \
                    else feed['info_url']
                if info_url not in urls_data:
                    logger.warn("URL %s not properly fetched (not one of %i entries in urls_data), skipping..." % (
                        info_url, len(urls_data)))
                    continue
                assert info_url in urls_data
                if not urls_data[info_url][0]:  # request was not successful
                    inc_fetch_retry(feed, max_retry=FEED_MAX_RETRY, errors=[urls_data[info_url][1]])
                    logger.warn("Fetch for feed at %s not successful: %s (try %i of %i)" % (
                        info_url, urls_data[info_url][1], feed['fetch_info_retry'], FEED_MAX_RETRY))
                else:
                    result = process_feed_info(feed, urls_data[info_url][1])
                    if not result[0]:
                        logger.info("Processing for feed at %s not successful: %s" % (info_url, result[1]))
                    else:
                        logger.info("Processing for feed at %s successful" % info_url)

    # compose and fetch all info URLs for all feeds that have them
    for feed in feeds:
        assert feed['info_url']
        feed_info_urls.append(
            ('http://' + feed['info_url'])
            if not feed['info_url'].startswith('http://') and not feed['info_url'].startswith('https://')
            else feed['info_url'])

    feed_info_urls_str = ', '.join(feed_info_urls)
    feed_info_urls_str = (feed_info_urls_str[:2000] + ' ...') \
        if len(feed_info_urls_str) > 2000 else feed_info_urls_str  # truncate if necessary
    if len(feed_info_urls):
        logger.info('Fetching enhanced feed info for %i feeds: %s' % (len(feed_info_urls), feed_info_urls_str))
        util.stream_fetch(
            feed_info_urls, feed_fetch_complete_hook,
            fetch_timeout=10, max_fetch_size=4 * 1024, urls_group_size=20, urls_group_time_spacing=20,
            per_request_complete_callback=lambda url, data: logger.debug("Feed at %s retrieved, result: %s" % (url, data)))

    start_task(task_compile_extended_feed_info, delay=60 * 5)  # call again in 5 minutes
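# Both completion hooks above index the urls_data argument as urls_data[url][0] (success flag)
# and urls_data[url][1] (parsed payload on success, error message on failure). The mapping below
# is a hypothetical illustration of that assumed util.stream_fetch() result shape, for
# documentation purposes only; the URLs and values are made up.
_EXAMPLE_URLS_DATA = {
    'http://example.com/feed.json': (True, {'address': '<feed source address>', 'title': 'Example feed'}),
    'http://example.com/broken.json': (False, 'fetch timed out'),
}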
def fetch_all_feed_info(db):
    feeds = list(db.feeds.find({'info_status': 'needfetch'}))
    feed_info_urls = []

    def feed_fetch_complete_hook(urls_data):
        logger.info("Enhanced feed info fetching complete. %s unique URLs fetched. Processing..." % len(urls_data))
        feeds = db.feeds.find({'info_status': 'needfetch'})
        for feed in feeds:
            # logger.debug("Looking at feed %s: %s" % (feed, feed['info_url']))
            if feed['info_url']:
                info_url = ('http://' + feed['info_url']) \
                    if not feed['info_url'].startswith('http://') and not feed['info_url'].startswith('https://') \
                    else feed['info_url']
                if info_url not in urls_data:
                    logger.warn("URL %s not properly fetched (not one of %i entries in urls_data), skipping..." % (
                        info_url, len(urls_data)))
                    continue
                assert info_url in urls_data
                if not urls_data[info_url][0]:  # request was not successful
                    inc_fetch_retry(db, feed, max_retry=FEED_MAX_RETRY, errors=[urls_data[info_url][1]])
                    logger.warn("Fetch for feed at %s not successful: %s (try %i of %i)" % (
                        info_url, urls_data[info_url][1], feed['fetch_info_retry'], FEED_MAX_RETRY))
                else:
                    result = process_feed_info(db, feed, urls_data[info_url][1])
                    if not result[0]:
                        logger.info("Processing for feed at %s not successful: %s" % (info_url, result[1]))
                    else:
                        logger.info("Processing for feed at %s successful" % info_url)

    # compose and fetch all info URLs in all feeds with them
    for feed in feeds:
        assert feed['info_url']
        feed_info_urls.append(
            ('http://' + feed['info_url'])
            if not feed['info_url'].startswith('http://') and not feed['info_url'].startswith('https://')
            else feed['info_url'])

    feed_info_urls_str = ', '.join(feed_info_urls)
    feed_info_urls_str = (feed_info_urls_str[:2000] + ' ...') \
        if len(feed_info_urls_str) > 2000 else feed_info_urls_str  # truncate if necessary
    if len(feed_info_urls):
        logger.info('Fetching enhanced feed info for %i feeds: %s' % (len(feed_info_urls), feed_info_urls_str))
        util.stream_fetch(
            feed_info_urls, feed_fetch_complete_hook,
            fetch_timeout=10, max_fetch_size=4 * 1024, urls_group_size=20, urls_group_time_spacing=20,
            per_request_complete_callback=lambda url, data: logger.debug("Feed at %s retrieved, result: %s" % (url, data)))
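# Hypothetical usage sketch (not part of the original module): fetch_all_feed_info() expects a
# MongoDB database handle and is presumably invoked periodically by its caller, e.g.:
#
#   import pymongo
#   db = pymongo.MongoClient()['counterblockd']  # database name is an assumption, for illustration
#   fetch_all_feed_info(db)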