Пример #1
0
def test_spider_with_error_middleware():
    """Start a spider whose middleware hooks are faulty.

    No explicit assertion is made: the test passes when
    ``SpiderDemo.start`` returns without an exception propagating out.
    """
    error_middleware = Middleware()

    # NOTE(review): this request hook is a plain ``def`` (the response hook
    # below is ``async def``) and declares a ``response`` parameter.
    # Presumably both are deliberate, to exercise the framework's handling
    # of malformed hooks - confirm before "fixing" the signature.
    @error_middleware.request
    def error_request(spider_ins, request, response):
        pass

    # Response hook that unconditionally raises.
    @error_middleware.response
    async def error_response(spider_ins, request, response):
        raise TypeError('error')

    class SpiderDemo(Spider):
        # Single start URL for the spider to request.
        start_urls = ['https://httpbin.org/get?p=0']

        async def parse(self, response):
            pass

    SpiderDemo.start(middleware=error_middleware)
Пример #2
0
#!/usr/bin/env python

import asyncio
import os

from ruia import Item, Middleware, Response, Request, Spider, TextField

# Path to the HTML fixture, resolved relative to this file's real
# (symlink-free) location rather than the current working directory.
html_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', 'for_spider_testing.html')
# Read the whole fixture into memory once, at import time.
with open(html_path, mode='r', encoding='utf-8') as file:
    HTML = file.read()

# Middleware instance on which the hooks defined below are registered.
middleware = Middleware()


async def retry_func(request):
    """Set the ``TIMEOUT`` entry of the request's config to 10.

    The request is modified in place; nothing is returned.
    """
    config = request.request_config
    config['TIMEOUT'] = 10


@middleware.request
async def print_on_request(spider_ins, request):
    """Replace the request's headers with a single fixed ``User-Agent``."""
    ua_headers = {'User-Agent': 'ruia ua'}
    request.headers = ua_headers


@middleware.response
async def print_on_response(spider_ins, request, response):
    """Check the response body type and the headers set by the request hook.

    Asserts that ``response.html`` is a ``str`` and that the request still
    carries exactly the fixed ``User-Agent`` header.
    """
    expected_headers = {'User-Agent': 'ruia ua'}
    body = response.html
    assert isinstance(body, str)
    assert request.headers == expected_headers
Пример #3
0
    async def parse(self, res):
        """Yield one follow-up ``Request`` per target URL.

        Each request is routed to ``parse_item``, carries its position in
        the URL sequence under the ``index`` metadata key, and reuses this
        spider's ``request_config``.
        """
        target_urls = ('http://www.httpbin.org/get', 'http://www.httpbin.org/get')
        for position, url in enumerate(target_urls):
            follow_up = Request(
                url,
                callback=self.parse_item,
                metadata={'index': position},
                request_config=self.request_config,
            )
            yield follow_up

    async def parse_item(self, res):
        """Return the ``html`` attribute of ``res`` unchanged."""
        return res.html


# Middleware instance on which the request/response hooks below are registered.
middleware = Middleware()
# Second, separate Middleware instance; no hooks are registered on it in the
# visible code.
res_type_middleware = Middleware()


@middleware.request
async def print_on_request(request):
    """Replace the request's headers with a single fixed ``User-Agent``."""
    ua_headers = {'User-Agent': 'ruia ua'}
    request.headers = ua_headers


@middleware.response
async def print_on_response(request, response):
    """Assert that the response body has been decoded into a ``dict``.

    Uses ``isinstance`` rather than an exact ``type(...) ==`` comparison,
    so ``dict`` subclasses are accepted as well.
    """
    assert isinstance(response.html, dict)

Пример #4
0
#!/usr/bin/env python

from ruia import Middleware

# First Middleware instance; its hooks set a User-Agent header and check the
# response text type.
middleware01 = Middleware()

# Second Middleware instance; its hooks are no-ops.
middleware02 = Middleware()


@middleware01.request
async def print_on_request01(spider_ins, request):
    """Replace the request's headers with a single fixed ``User-Agent``."""
    ua_headers = {"User-Agent": "ruia ua"}
    request.headers = ua_headers


@middleware01.response
async def print_on_response01(spider_ins, request, response):
    """Assert that the awaited ``response.text()`` result is a ``str``."""
    body = await response.text()
    assert isinstance(body, str)


@middleware02.request
async def print_on_request02(spider_ins, request):
    """Request hook that intentionally does nothing."""


@middleware02.response
async def print_on_response02(spider_ins, request, response):
    """Response hook that intentionally does nothing."""


# Combine both instances with Middleware's ``+`` operator.
# NOTE(review): presumably the result carries the hooks of both operands, in
# this order - confirm against the Middleware implementation.
all_middleware = middleware01 + middleware02
Пример #5
0
#!/usr/bin/env python
"""
 Created by howie.hu at 2018/10/17.
"""

from ruia import Middleware

from owllook.spiders.spider_tools import get_proxy_ip

# Middleware instance on which the proxy-assigning request hook below is
# registered.
owl_middleware = Middleware()


@owl_middleware.request
async def add_random_proxy(request):
    """Attach a freshly fetched proxy and the retry callback to the request.

    The proxy returned by ``update_proxy`` is stored under the ``proxy`` key
    of ``request.kwargs``, and ``retry_func`` is stored under the
    ``RETRY_FUNC`` key of ``request.request_config``.
    """
    call_kwargs = request.kwargs
    call_kwargs.update({'proxy': await update_proxy()})
    config = request.request_config
    config.update({'RETRY_FUNC': retry_func})


async def update_proxy():
    """Fetch a proxy address and return it as an ``http://`` URL.

    Returns ``None`` when ``get_proxy_ip`` yields a falsy value.
    """
    address = await get_proxy_ip()
    if not address:
        return None
    return 'http://' + address


async def retry_func(request):
    """Swap in a newly fetched proxy and hand the same request back.

    The value from ``update_proxy`` (possibly ``None``) is stored under the
    ``proxy`` key of ``request.kwargs``.
    """
    fresh_proxy = await update_proxy()
    request.kwargs.update({'proxy': fresh_proxy})
    return request
Пример #6
0
# Default option values; 'payloads' defaults to the accurate-basic payload.
# NOTE(review): 'fail_images' and 'image_hook_kwargs' are mutable objects
# shared at module level - presumably callers should copy before mutating;
# confirm against the code that consumes these defaults.
DEFAULT_OPTIONS = {
    'payloads': BAIDU_ACCURATEBASIC_PAYLOAD,
    'fail_images': [],
    'image_hook_kwargs': {},
    'region': ''
}

# Lookup from each Baidu service-type constant to its matching payload constant.
service_type_dic = {
    BAIDU_ACCURATEBASIC_TYPE: BAIDU_ACCURATEBASIC_PAYLOAD,
    BAIDU_ACCURATE_TYPE: BAIDU_ACCURATE_PAYLOAD,
    BAIDU_GENERALBASIC_TYPE: BAIDU_GENERALBASIC_PAYLOAD,
    BAIDU_GENERAL_TYPE: BAIDU_GENERAL_PAYLOAD
}

# Middleware instance; no hooks are registered on it in the visible code.
ocr_middle = Middleware()

# Two named loggers, obtained under the names 'Spider' and 'Ocr'.
logger = get_logger('Spider')
logger_ocr = get_logger('Ocr')


@classmethod
def extension_start(cls,
                    middleware: typing.Union[typing.Iterable,
                                             Middleware] = None,
                    loop=None,
                    after_start=None,
                    before_stop=None,
                    close_event_loop=True,
                    **kwargs) -> Spider:
    try: