Exemplo n.º 1
0
def httptasker(spider_q, retry=5):
    """Worker loop: pull (request-kwargs, callback) pairs from *spider_q*,
    fetch each URL with up to *retry* attempts, then feed the decoded
    response to the callback.

    Runs forever.  *spider_q* is expected to provide ``get``/``ack``/``fail``
    coroutines keyed by URL.  NOTE(review): uses what appear to be Tornado
    primitives (``gen.Task``, ``httpclient``, ``io_loop``, ``httpcli``) defined
    elsewhere in the file — confirm against the imports.
    """
    while True:
        req, callback = yield from spider_q.get(interval=1.0)
        if req is None:
            # Queue yielded nothing: back off for a random sub-millisecond
            # delay before polling again.
            tl = random.random() * 0.001
            yield gen.Task(io_loop.add_timeout, tl)
            continue

        def get_response():
            # Generator sub-task: try the fetch up to `retry` times.
            # Returns the response (annotated with .ourl) or None if
            # every attempt failed.
            for i in range(retry):
                try:
                    rq = httpclient.HTTPRequest(**req)
                    response = yield httpcli.fetch(rq)
                    response.ourl = req['url']  # keep the originally requested URL
                    return response
                except httpclient.HTTPError as e:
                    print('@HTTPError: %s with URL: %s Retry: %s' %
                          (e.code, req['url'], i + 1),
                          file=sys.stderr)
                    # 403/404 are treated as permanent failures: stop retrying.
                    if e.code in (403, 404):
                        break
                    continue
                except Exception as e:
                    # Unexpected error: log the traceback and give up on this URL.
                    print('@ErrorProcess: %s\nURL: %s' % (e, req['url']),
                          file=sys.stderr)
                    print('@error_trace_back', format_exc(), file=sys.stderr)
                    break
            return None

        response = yield from get_response()
        try:
            if response is None:
                # All attempts failed: tell the queue so it can reschedule
                # or drop this URL, then move on.
                yield from spider_q.fail(req['url'])
                print('@Failed: %s' % req['url'], file=sys.stderr)
                continue
            else:
                yield from spider_q.ack(req['url'])
            # Decode the raw body using the detected charset; the gb18030
            # default suggests Chinese-language targets — TODO confirm.
            c = get_charset(response, default='gb18030')
            response.ubody = response.body.decode(c, 'ignore')
            response.charset = c
            g = callback(response)
            # A callback may itself be a generator coroutine; drive it to
            # completion if so.
            if isinstance(g, types.GeneratorType):
                yield from g
        except Exception as e:
            print('@ErrorProcess: %s\nURL: %s' % (e, req['url']),
                  file=sys.stderr)
            print('@error_trace_back', format_exc(), file=sys.stderr)
Exemplo n.º 2
0
def httptasker(spider_q, retry=5):
    """Worker loop: pull (request-kwargs, callback) pairs from *spider_q*,
    fetch each URL with up to *retry* attempts, then feed the decoded
    response to the callback.  Runs forever.

    NOTE(review): relies on ``gen``, ``io_loop``, ``httpclient`` and
    ``httpcli`` defined elsewhere in the file — these look like Tornado
    primitives; confirm against the imports.
    """
    while True:
        req, callback = yield from spider_q.get(interval=1.0)
        if req is None:
            # Nothing queued: wait a random sub-millisecond delay, then poll again.
            tl = random.random() * 0.001
            yield gen.Task(io_loop.add_timeout, tl)
            continue

        def get_response():
            # Generator sub-task: try the fetch up to `retry` times; return
            # the response (annotated with .ourl) or None on total failure.
            for i in range(retry):
                try:
                    rq = httpclient.HTTPRequest(**req)
                    response = yield httpcli.fetch(rq)
                    response.ourl = req["url"]  # keep the originally requested URL
                    return response
                except httpclient.HTTPError as e:
                    print("@HTTPError: %s with URL: %s Retry: %s" % (e.code, req["url"], i + 1), file=sys.stderr)
                    # 403/404 are treated as permanent failures: stop retrying.
                    if e.code in (403, 404):
                        break
                    continue
                except Exception as e:
                    # Unexpected error: log the traceback and give up on this URL.
                    print("@ErrorProcess: %s\nURL: %s" % (e, req["url"]), file=sys.stderr)
                    print("@error_trace_back", format_exc(), file=sys.stderr)
                    break
            return None

        response = yield from get_response()
        try:
            if response is None:
                # All attempts failed: notify the queue, then move on.
                yield from spider_q.fail(req["url"])
                print("@Failed: %s" % req["url"], file=sys.stderr)
                continue
            else:
                yield from spider_q.ack(req["url"])
            # Decode the raw body using the detected charset (gb18030 default).
            c = get_charset(response, default="gb18030")
            response.ubody = response.body.decode(c, "ignore")
            response.charset = c
            g = callback(response)
            # A callback may itself be a generator coroutine; drive it if so.
            if isinstance(g, types.GeneratorType):
                yield from g
        except Exception as e:
            print("@ErrorProcess: %s\nURL: %s" % (e, req["url"]), file=sys.stderr)
            print("@error_trace_back", format_exc(), file=sys.stderr)
Exemplo n.º 3
0
def httptasker(reqs, retry=3):
    """Drive a stream of (request-kwargs, callback) pairs through aiohttp.

    For each pair, fetch the URL with up to *retry* attempts (5 s timeout
    per attempt), decode the body, and delegate to the callback coroutine.
    A ``None`` request means "nothing to do right now": sleep briefly and
    continue.  Failures are reported on stderr and the item is skipped.
    """
    for request, handler in reqs:
        if request is None:
            # Idle tick: back off for a random sliver of time before the
            # next item.
            yield from asyncio.sleep(random.random() * 0.01)
            continue

        def fetch_with_retries():
            # Attempt the request up to `retry` times; return the response
            # (with .body and .ourl attached) or None if all attempts fail.
            resp = None
            for attempt in range(retry):
                try:
                    try:
                        resp = yield from asyncio.wait_for(
                            aiohttp.request(**request), 5.0)
                        resp.body = yield from resp.read()
                        resp.ourl = request['url']
                    finally:
                        # Release the connection whether or not the read
                        # succeeded.
                        if resp is not None:
                            resp.close()
                    return resp
                except aiohttp_errors as e:
                    print('@FetchError: %s, %s, retry: %d' % (type(e), request['url'], attempt), file=sys.stderr)
                except http.cookies.CookieError as e:
                    # Cookie parse errors are logged without a traceback.
                    print('@CookieError: %s, retry: %d' % (request['url'], attempt), file=sys.stderr)
                except Exception as e:
                    print('@ErrorFetching: %s\nURL: %s' % (e, request['url']), file=sys.stderr)
                    print('@error_trace_back', format_exc(), file=sys.stderr)
            return None

        response = yield from fetch_with_retries()
        try:
            if response is None:
                print('@Failed: %s' % request['url'], file=sys.stderr)
                continue
            # Decode in place: .body becomes str from here on.
            enc = get_charset(response)
            response.body = response.body.decode(enc, 'ignore')
            response.charset = enc
            yield from handler(response)
        except Exception as e:
            print('@ErrorProcess: %s\nURL: %s' % (e, request['url']), file=sys.stderr)
            print('@error_trace_back', format_exc(), file=sys.stderr)