示例#1
0
文件: request.py 项目: hubitor/ruia
    def __init__(self,
                 url: str,
                 method: str = 'GET',
                 *,
                 callback=None,
                 encoding: Optional[str] = None,
                 headers: dict = None,
                 metadata: dict = None,
                 request_config: dict = None,
                 request_session=None,
                 **kwargs):
        """
        Set up a request object.

        :param url: target url
        :param method: HTTP method name; upper-cased and checked against ``self.METHOD``
        :param callback: stored as ``self.callback``
        :param encoding: stored as ``self.encoding``
        :param headers: request headers; a falsy value becomes a new empty dict
        :param metadata: request metadata; a falsy value becomes a new empty dict
        :param request_config: falls back to ``self.REQUEST_CONFIG`` only when ``None``
        :param request_session: stored as ``self.request_session``
        :param kwargs: remaining keyword arguments; ``ssl`` is popped out
            (default ``False``), the rest is kept as ``self.kwargs``
        :raises InvalidRequestMethod: if the upper-cased method is not in ``self.METHOD``
        """
        self.url = url
        self.method = method.upper()
        if self.method not in self.METHOD:
            raise InvalidRequestMethod(
                f'{self.method} method is not supported')

        self.callback = callback
        self.encoding = encoding
        # Falsy headers / metadata (None, empty dict) are replaced by fresh dicts.
        self.headers = headers if headers else {}
        self.metadata = metadata if metadata else {}
        self.request_session = request_session
        # An explicitly passed empty config is respected; only None falls back.
        if request_config is None:
            request_config = self.REQUEST_CONFIG
        self.request_config = request_config
        # ``ssl`` is pulled out of kwargs so it does not stay in self.kwargs.
        self.ssl = kwargs.pop('ssl', False)
        self.kwargs = kwargs

        self.close_request_session = False
        self.logger = get_logger(name=self.name)
        self.retry_times = self.request_config.get('RETRIES', 3)
示例#2
0
    def __init__(self,
                 url: str,
                 method: str = 'GET',
                 *,
                 callback=None,
                 encoding: Optional[str] = None,
                 headers: Optional[dict] = None,
                 metadata: Optional[dict] = None,
                 request_config: Optional[dict] = None,
                 request_session=None,
                 res_type: str = 'text',
                 **kwargs):
        """
        Initialization parameters

        :param url: target url
        :param method: HTTP method name; upper-cased and checked against ``self.METHOD``
        :param callback: stored as ``self.callback``
        :param encoding: stored as ``self.encoding``
        :param headers: request headers; ``None`` becomes a new empty dict
        :param metadata: request metadata; ``None`` becomes a new empty dict
        :param request_config: a falsy value falls back to ``self.REQUEST_CONFIG``
        :param request_session: stored as ``self.request_session``
        :param res_type: stored as ``self.res_type``
        :param kwargs: extra keyword arguments, kept as ``self.kwargs``
        :raises ValueError: if the upper-cased method is not in ``self.METHOD``
        """
        self.url = url
        self.method = method.upper()
        if self.method not in self.METHOD:
            raise ValueError('%s method is not supported' % self.method)

        self.callback = callback
        self.encoding = encoding
        # Defaults are None rather than ``{}``: a dict literal in the signature
        # is created once and would be shared (and mutated) by every instance
        # built without explicit headers / metadata.
        self.headers = headers if headers is not None else {}
        self.metadata = metadata if metadata is not None else {}
        self.request_session = request_session
        self.request_config = request_config or self.REQUEST_CONFIG
        self.res_type = res_type
        self.kwargs = kwargs

        self.close_request_session = False
        self.logger = get_logger(name=self.name)
        self.retry_times = self.request_config.get('RETRIES', 3)
示例#3
0
    def __init__(
        self,
        middleware: typing.Union[typing.Iterable, Middleware] = None,
        loop=None,
        is_async_start: bool = False,
        cancel_tasks: bool = True,
        **spider_kwargs,
    ):
        """
        Init spider object.
        :param middleware: a list of Middleware objects or a single Middleware;
            ``None`` or an empty list falls back to a fresh ``Middleware()``
        :param loop: asyncio event loop, installed with ``asyncio.set_event_loop``
        :param is_async_start: start spider by using async
        :param cancel_tasks: flag stored as ``self.cancel_tasks``
            (presumably controls task cancellation on shutdown -- confirm)
        :param spider_kwargs: extra keyword arguments, kept as ``self.spider_kwargs``
        :raises ValueError: if ``start_urls`` is empty or not an iterable
        """
        if not self.start_urls or not isinstance(
            self.start_urls, collectionsAbc.Iterable
        ):
            raise ValueError(
                "Ruia spider must have a param named start_urls, eg: start_urls = ['https://www.github.com']"
            )

        self.loop = loop
        asyncio.set_event_loop(self.loop)

        # Init object-level properties: falsy values are replaced by a fresh
        # dict set on the instance.
        self.callback_result_map = self.callback_result_map or {}

        self.request_config = self.request_config or {}
        self.headers = self.headers or {}
        self.metadata = self.metadata or {}
        self.aiohttp_kwargs = self.aiohttp_kwargs or {}
        self.spider_kwargs = spider_kwargs
        self.request_session = ClientSession()

        self.cancel_tasks = cancel_tasks
        self.is_async_start = is_async_start

        # set logger
        self.logger = get_logger(name=self.name)

        # customize middleware
        # reduce() without an initial value raises TypeError on an empty
        # sequence, so an empty list is treated like "no middleware given".
        if isinstance(middleware, list) and middleware:
            self.middleware = reduce(lambda x, y: x + y, middleware)
        else:
            self.middleware = middleware or Middleware()

        # async queue as a producer
        self.request_queue = asyncio.Queue()

        # semaphore, used for concurrency control
        self.sem = asyncio.Semaphore(self.concurrency)
示例#4
0
文件: spider.py 项目: acracker/ruia
 def __init__(self, middleware=None, loop=None, is_async_start=False):
     """
     Init spider object.

     :param middleware: a list of Middleware objects or a single Middleware;
         ``None`` or an empty list falls back to a fresh ``Middleware()``
     :param loop: event loop handed to ``asyncio.set_event_loop`` as given
         (NOTE(review): may be ``None`` here -- confirm callers always pass one)
     :param is_async_start: stored as ``self.is_async_start``
     """
     self.is_async_start = is_async_start
     self.logger = get_logger(name=self.name)
     self.loop = loop
     asyncio.set_event_loop(self.loop)
     # customize middleware
     # reduce() without an initial value raises TypeError on an empty
     # sequence, so an empty list is treated like "no middleware given".
     if isinstance(middleware, list) and middleware:
         self.middleware = reduce(lambda x, y: x + y, middleware)
     else:
         self.middleware = middleware or Middleware()
     # async queue
     self.request_queue = asyncio.Queue()
     # semaphore; size comes from the ``concurrency`` attribute, 3 when absent
     self.sem = asyncio.Semaphore(getattr(self, 'concurrency', 3))
示例#5
0
 def __init__(self, middleware=None, loop=None):
     """
     Init spider object.

     :param middleware: a list of Middleware objects or a single Middleware;
         ``None`` or an empty list falls back to a fresh ``Middleware()``
     :param loop: event loop to use; a new one is created when falsy
     :raises ValueError: if ``start_urls`` is empty or not a list
     """
     if not self.start_urls or not isinstance(self.start_urls, list):
         raise ValueError(
             "Spider must have a param named start_urls, eg: start_urls = ['https://www.github.com']"
         )
     self.logger = get_logger(name=self.name)
     self.loop = loop or asyncio.new_event_loop()
     asyncio.set_event_loop(self.loop)
     # customize middleware
     # reduce() without an initial value raises TypeError on an empty
     # sequence, so an empty list is treated like "no middleware given".
     if isinstance(middleware, list) and middleware:
         self.middleware = reduce(lambda x, y: x + y, middleware)
     else:
         self.middleware = middleware or Middleware()
     # async queue
     self.request_queue = asyncio.Queue()
     # semaphore; size comes from the ``concurrency`` attribute, 3 when absent
     self.sem = asyncio.Semaphore(getattr(self, 'concurrency', 3))
示例#6
0
    def __init__(
        self,
        url: str,
        method: str = "GET",
        *,
        callback=None,
        encoding: Optional[str] = None,
        headers: dict = None,
        metadata: dict = None,
        request_config: dict = None,
        request_session=None,
        **aiohttp_kwargs,
    ):
        """
        Build a request.

        :param url: Target url
        :param method: HTTP method, upper-cased and validated against ``self.METHOD``
        :param callback: Callback func
        :param encoding: Html encoding
        :param headers: Request headers (a falsy value becomes a new empty dict)
        :param metadata: Data sent to the callback func (a falsy value becomes
            a new empty dict)
        :param request_config: Manage the target request; ``None`` selects
            ``self.REQUEST_CONFIG``
        :param request_session: aiohttp.ClientSession
        :param aiohttp_kwargs: remaining keyword arguments; ``ssl`` is popped
            out (default ``False``) and the rest kept as ``self.aiohttp_kwargs``
        :raises InvalidRequestMethod: if the method is not in ``self.METHOD``
        """
        self.url = url
        self.method = method.upper()

        method_supported = self.method in self.METHOD
        if not method_supported:
            raise InvalidRequestMethod(
                f"{self.method} method is not supported")

        self.callback = callback
        self.encoding = encoding

        # Replace falsy headers / metadata with fresh dicts.
        if not headers:
            headers = {}
        self.headers = headers
        if not metadata:
            metadata = {}
        self.metadata = metadata

        self.request_session = request_session

        # Only None triggers the class-level fallback config.
        if request_config is not None:
            self.request_config = request_config
        else:
            self.request_config = self.REQUEST_CONFIG

        # ``ssl`` is removed from the kwargs before they are stored.
        self.ssl = aiohttp_kwargs.pop("ssl", False)
        self.aiohttp_kwargs = aiohttp_kwargs

        self.close_request_session = False
        self.logger = get_logger(name=self.name)
        self.retry_times = self.request_config.get("RETRIES", 3)
示例#7
0
    "BAIDU_BUSINESSLICENSE_TYPE", "BAIDU_RECEIPT_TYPE",
    "BAIDU_TRAINTICKET_TYPE", "BAIDU_TAXIRECEIPT_TYPE", "BAIDU_FORM_TYPE",
    "BAIDU_TABLERECOGNIZE_TYPE", "BAIDU_TABLERESULTGET_TYPE",
    "BAIDU_VINCODE_TYPE", "BAIDU_QUOTAINVOICE_TYPE",
    "BAIDU_HOUSEHOLDREGISTER_TYPE", "BAIDU_HKMACAUEXITENTRYPERMIT_TYPE",
    "BAIDU_TAIWANEXITENTRYPERMIT_TYPE", "BAIDU_BIRTHCERTIFICATE_TYPE",
    "BAIDU_VEHICLEINVOICE_TYPE", "BAIDU_VEHICLECERTIFICATE_TYPE",
    "BAIDU_INVOICE_TYPE", "BAIDU_AIRTICKET_TYPE",
    "BAIDU_INSURANCEDOCUMENTS_TYPE", "BAIDU_VATINVOICE_TYPE",
    "BAIDU_QRCODE_TYPE", "BAIDU_NUMBERS_TYPE", "BAIDU_LOTTERY_TYPE",
    "BAIDU_PASSPORT_TYPE", "BAIDU_BUSINESSCARD_TYPE", "BAIDU_HANDWRITING_TYPE",
    "BAIDU_CUSTOM_TYPE", "BAIDU_GENERALBASIC_PAYLOAD", "BAIDU_GENERAL_PAYLOAD",
    "BAIDU_ACCURATE_PAYLOAD", "BAIDU_ACCURATEBASIC_PAYLOAD"
]

# Module-level logger named 'Ocr'.
logger = get_logger('Ocr')
# Module-level dicts, empty at import time.
# NOTE(review): presumably filled through register_baidu_service(...) -- confirm against its call sites.
baidu_ocr_urls = {}
baidu_ocr_payloads = {}
baidu_ocr_types = {}


def register_baidu_service(global_obj):
    """
    Build a decorator that merges a function's result into ``global_obj``.

    The decorated function is called immediately, at decoration time, and the
    value it returns is passed to ``global_obj.update``. The decorator itself
    returns ``None``, so the decorated name ends up bound to ``None``.

    :param global_obj: object exposing ``update`` (e.g. a dict) to merge into
    :return: the decorator
    """
    def registrar(func):
        global_obj.update(func())

    return registrar


try:
    # Adaptive interface changes. It's recommended to do this via installed aip
示例#8
0
class SpiderHook:
    """
    SpiderHook is used to extend a spider: it runs start/stop hooks and
    provides overridable processing methods.
    """

    # Maps a callback result's type name to the name of the method that
    # processes it; None until a mapping is assigned.
    callback_result_map: dict = None
    logger = get_logger(name='Spider')

    async def _run_spider_hook(self, hook_func):
        """
        Run hook before/after spider start crawling
        :param hook_func: callable taking the spider instance; its result is
            awaited when awaitable. Non-callables are ignored.
        :return: None; exceptions raised by the hook are logged, not re-raised
        """
        if callable(hook_func):
            try:
                aws_hook_func = hook_func(self)
                if isawaitable(aws_hook_func):
                    await aws_hook_func
            except Exception as e:
                # Not every callable has __name__ (e.g. functools.partial);
                # fall back to repr so logging the failure cannot itself fail.
                hook_name = getattr(hook_func, '__name__', repr(hook_func))
                self.logger.error(f'<Hook {hook_name}: {e}')

    async def process_failed_response(self, request, response):
        """
        Corresponding processing for the failed response
        (no-op here; meant to be overridden)
        :param request: Request
        :param response: Response
        :return:
        """
        pass

    async def process_succeed_response(self, request, response):
        """
        Corresponding processing for the succeed response
        (no-op here; meant to be overridden)
        :param request: Request
        :param response: Response
        :return:
        """
        pass

    async def process_item(self, item):
        """
        Corresponding processing for the Item type
        (no-op here; meant to be overridden)
        :param item: Item
        :return:
        """
        pass

    async def process_callback_result(self, callback_result):
        """
        Dispatch a callback result to the method registered for its type name
        in ``callback_result_map``.
        :param callback_result: value produced by a callback
        :return:
        :raises InvalidCallbackResult: if no processing method is found for
            the result's type name
        """
        callback_result_name = type(callback_result).__name__
        # The class-level default is None; treat that as an empty mapping so
        # the caller gets InvalidCallbackResult instead of AttributeError.
        result_map = self.callback_result_map or {}
        process_func_name = result_map.get(callback_result_name, '')
        process_func = getattr(self, process_func_name, None)
        if process_func is not None:
            await process_func(callback_result)
        else:
            raise InvalidCallbackResult(
                f'<Parse invalid callback result type: {callback_result_name}>'
            )
示例#9
0
def make(settings, images, out, header, type, pattern, pattern_clean, number, debug, coordinate, lag):
    """
    Run OCR over one image or a directory of images, print the results and
    optionally save them.

    :param settings: optional ``key=value`` pairs separated by ``;``. When
        given they are written to ``~/ruia_ocr.cfg`` under the ``baiduocr``
        section; otherwise that file is read.
    :param images: path of an image or a directory (resolved with ``parse_path``)
    :param out: optional output path; ``txt`` and ``xlsx`` suffixes are handled
    :param header: optional comma separated column names for the xlsx output
    :param type: key looked up in ``type_dic`` for the OCR service
        (shadows the builtin; the name is kept for callers)
    :param pattern: forwarded to ``adapter_item``
    :param pattern_clean: forwarded to ``adapter_item``
    :param number: ``-1`` calls ``get_file_paths`` without ``num``; any other
        value is passed as ``num=int(number)``
    :param debug: when true, results go to the logger instead of
        ``click.echo``; when false, root logging is set to FATAL while the
        spider runs
    :param coordinate: forwarded to ``adapter_spider`` as ``range``
    :param lag: used as the ``DELAY`` entry of the spider request config
    """
    cp = ConfigParser()
    logger = get_logger('Results')
    config_path = os.path.join(os.path.expanduser('~'), 'ruia_ocr.cfg')

    if settings:
        cp.add_section('baiduocr')
        # Parse everything first so a malformed entry does not leave a
        # truncated config file behind.
        for setting in settings.split(';'):
            if not setting.strip():
                # tolerate a trailing or doubled ';'
                continue
            # split once: values may themselves contain '='
            key, v = setting.split('=', 1)
            cp.set('baiduocr', key.strip(), v.strip())
        with open(config_path, 'w+') as file:
            cp.write(file)
    else:
        cp.read(config_path)

    service = BaiduOcrService(cp.get('baiduocr', 'app_id'),
                              cp.get('baiduocr', 'api_key'),
                              cp.get('baiduocr', 'secret_key'),
                              type_dic.get(type),
                              seq='\n')

    imgs = parse_path(images)

    if not debug:
        logging.root.setLevel(logging.FATAL)
    if not os.path.isdir(imgs):
        urls = [imgs]
    elif number == -1:
        urls = get_file_paths(imgs)
    else:
        urls = get_file_paths(imgs, num=int(number))

    @adapter_item(pattern=pattern, pattern_clean=pattern_clean)
    class OcrItem(Item):
        pass

    @adapter_spider(range=coordinate)
    class OcrSpider(Spider):
        ocr_service = service
        ocr_options = {'region': ''}

        start_urls = urls
        concurrency = 1
        request_config = {'TIMEOUT': 40, 'DELAY': lag}
        # One row per processed item: [file name, field values...]
        dump_res: List[List[str]] = []

        async def parse(self, response):
            item = await OcrItem.get_item(html=response.ocr_html)
            item.path = os.path.basename(response.metadata.get('image'))
            yield item

        async def process_item(self, item: Item):
            dumps = [getattr(item, attr) for attr in OcrItem.collect]
            dumps.insert(0, item.path)
            self.dump_res.append(dumps)

    spider = OcrSpider.start(middleware=ocr_middle)
    logging.root.setLevel(logging.DEBUG)

    # Single output channel: logger in debug mode, plain echo otherwise.
    emit = logger.info if debug else click.echo

    for res in spider.dump_res:
        emit(res[0] + ': ' + ' || '.join(res[1:]))

    if out:
        parsed_out = parse_path(out)
        if out.endswith('txt'):
            with open(parsed_out, 'w+', encoding='utf8') as file:
                # the first column (picture name) is left out of the text dump
                for pic_contents in spider.dump_res:
                    file.write('\n'.join(pic_contents[1:]) + '\n')

        elif out.endswith('xlsx'):
            wk = Workbook()
            sh = wk.active
            if header:
                spider.dump_res.insert(0, ['picture'] + header.split(','))
            for res in spider.dump_res:
                sh.append(res)
            wk.save(parsed_out)
        emit('save as %s' % os.path.abspath(out))
示例#10
0
from asyncio import Semaphore
from inspect import isawaitable
from urllib.parse import urlparse, quote, urlencode

from ruia import Request
from ruia.utils import get_logger
from .exceptions import ServicePayloadsError, ImageTypeError
from .configs import *

# Maps each Baidu OCR service type constant to its payload constant.
# NOTE(review): the BAIDU_* names presumably come from the wildcard import of .configs -- confirm.
service_type_dic = {
    BAIDU_ACCURATEBASIC_TYPE: BAIDU_ACCURATEBASIC_PAYLOAD,
    BAIDU_ACCURATE_TYPE: BAIDU_ACCURATE_PAYLOAD,
    BAIDU_GENERALBASIC_TYPE: BAIDU_GENERALBASIC_PAYLOAD,
    BAIDU_GENERAL_TYPE: BAIDU_GENERAL_PAYLOAD
}
# Module-level logger named 'Spider'.
logger = get_logger('Spider')

try:
    # Adaptive interface changes: prefer the token URL carried by the
    # installed ``aip`` SDK (read from its name-mangled private attribute).
    from aip.base import AipBase

    _access_token_url = AipBase._AipBase__accessTokenUrl
except Exception:
    # Best-effort fallback when the SDK is missing or laid out differently.
    # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt and
    # SystemExit are not swallowed.
    # Fixed api implementation, not recommended
    _access_token_url = 'https://aip.baidubce.com/oauth/2.0/token'

__all__ = ['BaiduOcrService', 'BaseOcrService']


def getAuthrHeaders(method,
                    url,
示例#11
0
#!/usr/bin/env python

import os

from importlib import util

from ruia.utils import get_logger

logger = get_logger('settings')


class SettingsWrapper(object):
    """
    SettingsWrapper returns a spider config
    """

    def __init__(self, settings_name='settings.py'):
        """
        Create the wrapper and load settings right away.

        :param settings_name: stored as ``self.settings_name``
            (presumably the settings file name -- confirm against ``_load_settings``)
        """
        # Start from an empty mapping, then call _load_settings(), which is
        # defined outside this view.
        self.my_settings = {}
        self.settings_name = settings_name
        self._load_settings()

    def __call__(self):
        """Return the ``my_settings`` dict (the same object, not a copy)."""
        return self.my_settings

    def settings(self):
        """Return the ``my_settings`` dict (the same object, not a copy)."""
        return self.my_settings

    def load_with_file(self, file_path):
        file_name = os.path.basename(file_path)
        if file_name[-3:] != '.py':
            logger.error("module name must be python file, such as : example.py")
示例#12
0
    'payloads': BAIDU_ACCURATEBASIC_PAYLOAD,
    'fail_images': [],
    'image_hook_kwargs': {},
    'region': ''
}

# Maps each Baidu OCR service type constant to its payload constant.
service_type_dic = {
    BAIDU_ACCURATEBASIC_TYPE: BAIDU_ACCURATEBASIC_PAYLOAD,
    BAIDU_ACCURATE_TYPE: BAIDU_ACCURATE_PAYLOAD,
    BAIDU_GENERALBASIC_TYPE: BAIDU_GENERALBASIC_PAYLOAD,
    BAIDU_GENERAL_TYPE: BAIDU_GENERAL_PAYLOAD
}

# Module-level Middleware instance.
# NOTE(review): presumably the OCR request/response hooks are registered on it elsewhere -- confirm.
ocr_middle = Middleware()

# Two module-level loggers, named 'Spider' and 'Ocr'.
logger = get_logger('Spider')
logger_ocr = get_logger('Ocr')


@classmethod
def extension_start(cls,
                    middleware: typing.Union[typing.Iterable,
                                             Middleware] = None,
                    loop=None,
                    after_start=None,
                    before_stop=None,
                    close_event_loop=True,
                    **kwargs) -> Spider:
    try:
        loop = loop or asyncio.new_event_loop()
        spider_ins = cls(middleware=middleware, loop=loop, **kwargs)