def fetch(context, data):
    """Do an HTTP GET on the ``url`` specified in the inbound data.

    URLs whose scheme is anything other than ``http``, ``https`` or empty
    are logged and skipped. Otherwise the response is checked against the
    ``rules`` spec read from the context (default ``match_all``); a
    response that does not match is logged and dropped. A response that is
    not ``ok`` produces a warning and is only passed on when the
    ``emit_errors`` param is truthy. Whatever is passed on has
    ``result.serialize()`` merged into ``data`` before ``data`` is emitted.

    If a ``RequestException`` is raised anywhere in that sequence, the
    stage re-queues itself through ``context.recurse`` with a delay of
    ``2 ** attempt`` as long as the configured ``retry`` count (default 3)
    is not smaller than the current attempt number; otherwise a warning is
    emitted instead.

    :param context: stage context; this function uses its ``http``, ``log``,
        ``params``, ``run_id``, ``get``, ``set_tag``, ``emit``,
        ``emit_warning`` and ``recurse`` members.
    :param data: inbound dict. ``url`` is read, ``retry_attempt`` is popped,
        and the dict is updated in place with the serialized response.
    :returns: ``None`` in every case.
    """
    url = data.get("url")
    # The scheme check runs before ``retry_attempt`` is popped, so a skipped
    # item leaves ``data`` untouched.
    if urlparse(url).scheme not in ("http", "https", ""):
        context.log.info("Fetch skipped. Unsupported scheme: %r", url)
        return

    attempt = data.pop("retry_attempt", 1)
    try:
        result = context.http.get(url, lazy=True)
        rules = context.get("rules", {"match_all": {}})
        if not Rule.get_rule(rules).apply(result):
            context.log.info("Fetch skip: %r", result.url)
            return

        if not result.ok:
            err = (result.url, result.status_code)
            context.emit_warning("Fetch fail [%s]: HTTP %s" % err)
            # Failed responses are only forwarded when explicitly requested.
            if not context.params.get("emit_errors", False):
                return
        else:
            context.log.info("Fetched [%s]: %r", result.status_code, result.url)

        data.update(result.serialize())
        if url != result.url:
            # The final URL differs from the requested one; the tag keyed on
            # the requested URL is set to None.
            # NOTE(review): presumably this handles redirects - confirm.
            tag = make_key(context.run_id, url)
            context.set_tag(tag, None)
        context.emit(data=data)
    except RequestException as ce:
        retries = int(context.get("retry", 3))
        if retries >= attempt:
            # ``warning`` instead of the deprecated ``warn`` alias.
            context.log.warning("Retry: %s (error: %s)", url, ce)
            data["retry_attempt"] = attempt + 1
            context.recurse(data=data, delay=2 ** attempt)
        else:
            context.emit_warning("Fetch fail [%s]: %s" % (url, ce))
def fetch(context, data):
    """Do an HTTP GET on the ``url`` specified in the inbound data.

    The response is checked against the ``rules`` spec read from the
    context (default ``match_all``); a response that does not match is
    logged and dropped. A response that is not ``ok`` produces a warning
    and is only passed on when the ``emit_errors`` param is truthy.
    Whatever is passed on has ``result.serialize()`` merged into ``data``
    before ``data`` is emitted.

    If a ``RequestException`` is raised anywhere in that sequence, the
    stage re-queues itself through ``context.recurse`` with a delay of
    ``2 ** attempt`` as long as the configured ``retry`` count (default 3)
    is not smaller than the current attempt number; otherwise a warning is
    emitted instead.

    :param context: stage context; this function uses its ``http``, ``log``,
        ``params``, ``run_id``, ``get``, ``set_tag``, ``emit``,
        ``emit_warning`` and ``recurse`` members.
    :param data: inbound dict. ``url`` is read, ``retry_attempt`` is popped,
        and the dict is updated in place with the serialized response.
    :returns: ``None`` in every case.
    """
    url = data.get('url')
    attempt = data.pop('retry_attempt', 1)
    try:
        result = context.http.get(url, lazy=True)
        rules = context.get('rules', {'match_all': {}})
        if not Rule.get_rule(rules).apply(result):
            context.log.info('Fetch skip: %r', result.url)
            return

        if not result.ok:
            err = (result.url, result.status_code)
            context.emit_warning("Fetch fail [%s]: HTTP %s" % err)
            # Failed responses are only forwarded when explicitly requested.
            if not context.params.get('emit_errors', False):
                return
        else:
            context.log.info("Fetched [%s]: %r", result.status_code, result.url)

        data.update(result.serialize())
        if url != result.url:
            # The final URL differs from the requested one; the tag keyed on
            # the requested URL is set to None.
            # NOTE(review): presumably this handles redirects - confirm.
            tag = make_key(context.run_id, url)
            context.set_tag(tag, None)
        context.emit(data=data)
    except RequestException as ce:
        retries = int(context.get('retry', 3))
        if retries >= attempt:
            # ``warning`` instead of the deprecated ``warn`` alias.
            context.log.warning("Retry: %s (error: %s)", url, ce)
            data['retry_attempt'] = attempt + 1
            context.recurse(data=data, delay=2**attempt)
        else:
            context.emit_warning("Fetch fail [%s]: %s" % (url, ce))
def parse(context, data):
    """Parse the response for ``data`` and forward it to the store rule.

    The response comes from ``context.http.rehash(data)``. When it carries
    HTML it is handed to ``parse_html``. Independently of that, the
    response is tested against the ``store`` param (falling back to a
    ``match_all`` spec when the param is missing or falsy) and, on a
    match, ``data`` is emitted under the ``store`` rule.

    :param context: stage context providing ``http``, ``params`` and ``emit``.
    :param data: inbound dict describing the response to parse.
    :returns: ``None``.
    """
    with context.http.rehash(data) as response:
        if response.html is not None:
            parse_html(context, data, response)

        store_spec = context.params.get('store')
        if not store_spec:
            # No (or empty) store configuration: accept everything.
            store_spec = {'match_all': {}}

        if not Rule.get_rule(store_spec).apply(response):
            return
        context.emit(rule='store', data=data)
def parse(context, data):
    """Parse the response for ``data`` and forward it to the store rule.

    The response comes from ``context.http.rehash(data)``. When it carries
    HTML, ``parse_for_metadata`` is run on the DOM first and the response
    is then handed to ``parse_html``. Independently of that, the response
    is tested against the ``store`` param (falling back to a ``match_all``
    spec when the param is missing or falsy) and, on a match, ``data`` is
    emitted under the ``store`` rule.

    :param context: stage context providing ``http``, ``params`` and ``emit``.
    :param data: inbound dict describing the response to parse.
    :returns: ``None``.
    """
    with context.http.rehash(data) as response:
        if response.html is not None:
            # Get extra metadata from the DOM
            parse_for_metadata(context, data, response.html)
            parse_html(context, data, response)

        store_spec = context.params.get("store")
        if not store_spec:
            # No (or empty) store configuration: accept everything.
            store_spec = {"match_all": {}}

        if not Rule.get_rule(store_spec).apply(response):
            return
        context.emit(rule="store", data=data)
def parse(context, data):
    """Parse an article page and forward it to the store rule.

    An ``Article`` is built for ``data["url"]``, downloaded, parsed and
    passed to ``parse_article``. When the response obtained from
    ``context.http.rehash(data)`` carries HTML, the memorious
    ``parse_for_metadata`` and ``parse_html`` operations are run on it as
    well. Finally the response is tested against the ``match`` param
    (falling back to a ``match_all`` spec when the param is missing or
    falsy) and, on a match, ``data`` is emitted under the ``store`` rule.

    :param context: stage context providing ``http``, ``params`` and ``emit``.
    :param data: inbound dict; must contain ``url``.
    :returns: ``None``.
    """
    with context.http.rehash(data) as response:
        article = Article(url=data["url"])
        article.download()
        article.parse()
        parse_article(context, data, article)

        if response.html is not None:
            memorious.operations.parse.parse_for_metadata(context, data, response.html)
            memorious.operations.parse.parse_html(context, data, response)

        match_spec = context.params.get("match")
        if not match_spec:
            # No (or empty) match configuration: accept everything.
            match_spec = {"match_all": {}}

        if not Rule.get_rule(match_spec).apply(response):
            return
        context.emit(rule="store", data=data)
def fetch(context, data):
    """Do an HTTP GET on the ``url`` specified in the inbound data.

    The response is checked against the ``rules`` spec read from the
    context (default ``match_all``); a non-matching response is logged and
    dropped, and a response that is not ``ok`` produces a warning and is
    dropped as well. Otherwise ``result.serialize()`` is merged into
    ``data`` and ``data`` is emitted.

    :param context: stage context providing ``http``, ``log``, ``run_id``,
        ``get``, ``set_tag``, ``emit`` and ``emit_warning``.
    :param data: inbound dict; ``url`` is read and the dict is updated in
        place with the serialized response.
    :returns: ``None``.
    """
    requested_url = data.get('url')
    response = context.http.get(requested_url, lazy=True)

    rule_spec = context.get('rules', {'match_all': {}})
    matcher = Rule.get_rule(rule_spec)
    if not matcher.apply(response):
        context.log.info('Fetch skip: %r', response.url)
        return

    if not response.ok:
        context.emit_warning("Fetch fail [%s]: %s", response.status_code, response.url)
        return

    context.log.info("Fetched [%s]: %r", response.status_code, response.url)
    data.update(response.serialize())

    if requested_url != response.url:
        # The final URL differs from the requested one; the tag keyed on the
        # requested URL is set to None.
        stale_tag = make_key((context.run_id, requested_url))
        context.set_tag(stale_tag, None)

    context.emit(data=data)
def test_get_rule(self):
    """``Rule.get_rule`` rejects ``invalid_spec`` and maps ``spec`` to the ``and`` rule class."""
    # An invalid specification must raise.
    with pytest.raises(Exception):
        Rule.get_rule(invalid_spec)

    # The valid specification resolves to an instance of RULES["and"].
    rule = Rule.get_rule(spec)
    expected_cls = RULES["and"]
    assert isinstance(rule, expected_cls)