Exemplo n.º 1
0
def fetch(context, data):
    """Do an HTTP GET on the ``url`` specified in the inbound data.

    Flow, as implemented below:

    * URLs whose scheme is not ``http``, ``https`` or empty are logged and
      skipped.
    * The response is checked against the rule spec read from
      ``context.get("rules")`` (``match_all`` by default); a response that
      does not match is logged and skipped.
    * A non-OK response emits a warning and is only passed on when the
      ``emit_errors`` param is truthy.
    * Otherwise the serialized response is merged into ``data`` and emitted.

    If a ``RequestException`` is raised, ``context.recurse`` is called with
    an incremented ``retry_attempt`` and a delay of ``2 ** attempt`` while the
    attempt number does not exceed the ``retry`` setting (default 3); after
    that a warning is emitted instead.

    :param context: crawler context providing ``http``, ``log``, ``params``,
        ``get``, ``emit``, ``emit_warning``, ``recurse``, ``set_tag`` and
        ``run_id``.
    :param data: mutable mapping with the ``url`` to fetch and, on retries,
        a ``retry_attempt`` counter.
    :return: ``None``.
    """
    url = data.get("url")
    if urlparse(url).scheme not in ("http", "https", ""):
        context.log.info("Fetch skipped. Unsupported scheme: %r", url)
        return
    # Take the attempt counter out of ``data`` so it is not emitted
    # downstream; it is written back only when another retry is scheduled.
    attempt = data.pop("retry_attempt", 1)
    try:
        # NOTE(review): with ``lazy=True`` the request is presumably performed
        # only once the result is inspected, which would be why the whole
        # flow sits inside the ``try`` -- confirm against the http client.
        result = context.http.get(url, lazy=True)
        rules = context.get("rules", {"match_all": {}})
        if not Rule.get_rule(rules).apply(result):
            context.log.info("Fetch skip: %r", result.url)
            return

        if not result.ok:
            err = (result.url, result.status_code)
            context.emit_warning("Fetch fail [%s]: HTTP %s" % err)
            # Failed responses are only passed on when explicitly requested.
            if not context.params.get("emit_errors", False):
                return
        else:
            context.log.info("Fetched [%s]: %r", result.status_code, result.url)

        data.update(result.serialize())
        if url != result.url:
            # The final URL differs from the requested one (presumably a
            # redirect): set a tag keyed on the run and the requested URL.
            tag = make_key(context.run_id, url)
            context.set_tag(tag, None)
        context.emit(data=data)
    except RequestException as ce:
        retries = int(context.get("retry", 3))
        if retries >= attempt:
            # ``warn`` is a deprecated alias of ``warning`` on stdlib loggers.
            context.log.warning("Retry: %s (error: %s)", url, ce)
            data["retry_attempt"] = attempt + 1
            context.recurse(data=data, delay=2 ** attempt)
        else:
            context.emit_warning("Fetch fail [%s]: %s" % (url, ce))
Exemplo n.º 2
0
def fetch(context, data):
    """Do an HTTP GET on the ``url`` specified in the inbound data.

    The response is checked against the rule spec read from
    ``context.get('rules')`` (``match_all`` by default); a response that does
    not match is logged and skipped. A non-OK response emits a warning and is
    only passed on when the ``emit_errors`` param is truthy. Otherwise the
    serialized response is merged into ``data`` and emitted.

    If a ``RequestException`` is raised, ``context.recurse`` is called with
    an incremented ``retry_attempt`` and a delay of ``2 ** attempt`` while the
    attempt number does not exceed the ``retry`` setting (default 3); after
    that a warning is emitted instead.

    :param context: crawler context providing ``http``, ``log``, ``params``,
        ``get``, ``emit``, ``emit_warning``, ``recurse``, ``set_tag`` and
        ``run_id``.
    :param data: mutable mapping with the ``url`` to fetch and, on retries,
        a ``retry_attempt`` counter.
    :return: ``None``.
    """
    url = data.get('url')
    # Take the attempt counter out of ``data`` so it is not emitted
    # downstream; it is written back only when another retry is scheduled.
    attempt = data.pop('retry_attempt', 1)
    try:
        result = context.http.get(url, lazy=True)
        rules = context.get('rules', {'match_all': {}})
        if not Rule.get_rule(rules).apply(result):
            context.log.info('Fetch skip: %r', result.url)
            return

        if not result.ok:
            err = (result.url, result.status_code)
            context.emit_warning("Fetch fail [%s]: HTTP %s" % err)
            # Failed responses are only passed on when explicitly requested.
            if not context.params.get('emit_errors', False):
                return
        else:
            context.log.info("Fetched [%s]: %r", result.status_code,
                             result.url)

        data.update(result.serialize())
        if url != result.url:
            # The final URL differs from the requested one (presumably a
            # redirect): set a tag keyed on the run and the requested URL.
            tag = make_key(context.run_id, url)
            context.set_tag(tag, None)
        context.emit(data=data)
    except RequestException as ce:
        retries = int(context.get('retry', 3))
        if retries >= attempt:
            # ``warn`` is a deprecated alias of ``warning`` on stdlib loggers.
            context.log.warning("Retry: %s (error: %s)", url, ce)
            data['retry_attempt'] = attempt + 1
            context.recurse(data=data, delay=2**attempt)
        else:
            context.emit_warning("Fetch fail [%s]: %s" % (url, ce))
Exemplo n.º 3
0
def parse(context, data):
    """Run HTML parsing on the rehashed result and emit it for storage.

    The result object comes from ``context.http.rehash(data)``. When it has
    HTML, ``parse_html`` is invoked on it. The data is emitted under the
    ``store`` rule if the result satisfies the ``store`` param rule spec
    (``match_all`` when that param is missing or falsy).
    """
    with context.http.rehash(data) as response:
        has_markup = response.html is not None
        if has_markup:
            parse_html(context, data, response)

        # Fall back to a catch-all spec when no ``store`` rules are set.
        store_spec = context.params.get('store')
        if not store_spec:
            store_spec = {'match_all': {}}

        if not Rule.get_rule(store_spec).apply(response):
            return
        context.emit(rule='store', data=data)
Exemplo n.º 4
0
def parse(context, data):
    """Extract metadata and parse HTML from the rehashed result, then emit it.

    The result object comes from ``context.http.rehash(data)``. When it has
    HTML, ``parse_for_metadata`` runs on the DOM first, followed by
    ``parse_html``. The data is emitted under the ``store`` rule if the
    result satisfies the ``store`` param rule spec (``match_all`` when that
    param is missing or falsy).
    """
    with context.http.rehash(data) as response:
        dom = response.html
        if dom is not None:
            # Pull extra metadata out of the DOM before the regular parse.
            parse_for_metadata(context, data, dom)
            parse_html(context, data, response)

        # Fall back to a catch-all spec when no ``store`` rules are set.
        store_spec = context.params.get("store")
        if not store_spec:
            store_spec = {"match_all": {}}

        if not Rule.get_rule(store_spec).apply(response):
            return
        context.emit(rule="store", data=data)
Exemplo n.º 5
0
def parse(context, data):
    """Parse a news article for ``data["url"]`` and emit the data for storage.

    Inside the ``context.http.rehash(data)`` block an ``Article`` is built
    for the URL, downloaded, parsed and handed to ``parse_article``. When the
    rehashed result has HTML, the memorious ``parse_for_metadata`` and
    ``parse_html`` operations are applied as well. The data is emitted under
    the ``store`` rule if the result satisfies the ``match`` param rule spec
    (``match_all`` when that param is missing or falsy).
    """
    with context.http.rehash(data) as response:
        article = Article(url=data["url"])
        article.download()
        article.parse()
        parse_article(context, data, article)

        dom = response.html
        if dom is not None:
            memorious_parse = memorious.operations.parse
            memorious_parse.parse_for_metadata(context, data, dom)
            memorious_parse.parse_html(context, data, response)

        # Fall back to a catch-all spec when no ``match`` rules are set.
        match_spec = context.params.get("match")
        if not match_spec:
            match_spec = {"match_all": {}}

        if not Rule.get_rule(match_spec).apply(response):
            return
        context.emit(rule="store", data=data)
Exemplo n.º 6
0
def fetch(context, data):
    """Issue an HTTP GET for ``data['url']`` and emit the serialized result.

    A result that does not satisfy the rule spec from ``context.get('rules')``
    (``match_all`` by default) is logged and dropped. A non-OK result emits a
    warning and is dropped. Otherwise the serialized result is merged into
    ``data``, a tag is set when the result URL differs from the requested
    one, and ``data`` is emitted.
    """
    requested_url = data.get('url')
    response = context.http.get(requested_url, lazy=True)

    rule_spec = context.get('rules', {'match_all': {}})
    accepted = Rule.get_rule(rule_spec).apply(response)
    if not accepted:
        context.log.info('Fetch skip: %r', response.url)
        return

    if response.ok:
        context.log.info("Fetched [%s]: %r", response.status_code,
                         response.url)
        data.update(response.serialize())
        if requested_url != response.url:
            # Tag keyed on the run and the URL that was originally requested.
            context.set_tag(make_key((context.run_id, requested_url)), None)
        context.emit(data=data)
    else:
        context.emit_warning("Fetch fail [%s]: %s", response.status_code,
                             response.url)
Exemplo n.º 7
0
 def test_get_rule(self):
     """``Rule.get_rule`` raises on ``invalid_spec`` and maps ``spec`` to ``RULES["and"]``."""
     with pytest.raises(Exception):
         Rule.get_rule(invalid_spec)

     built_rule = Rule.get_rule(spec)
     expected_cls = RULES["and"]
     assert isinstance(built_rule, expected_cls)