Example No. 1
class ServerCtrlMiddleware(Middleware):

    logger = getLogger(__name__)

    def process_request(self, request):
        spider = request.spider
        if spider.status == 'PAUSE':
            self.logger.debug(f'PAUSE {spider.name} {request}')
            spider._hanged.append(request)
            if spider.urlfilter:
                spider.urlfilter.delete(_to_feature(request))
            raise DropRequest
        elif spider.status == 'RUNNING':
            return request
        elif spider.status in ('STOP', 'CLOSE'):
            if spider.urlfilter:
                spider.urlfilter.delete(_to_feature(request))
            self.logger.debug(f'STOP/CLOSE {spider.name} {request}')
            raise DropRequest

    def process_response(self, response):
        spider = response.spider
        if spider.status == 'STOP':
            raise DropResponse
        return response
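
All of the examples on this page obtain a module-level logger via getLogger(__name__). A minimal sketch of how a host application might configure the standard logging module so that the debug/info calls above actually produce output is shown below; the level and format string are illustrative assumptions, not settings taken from the project.

import logging

# Illustrative configuration only: the level and format are assumptions,
# not values taken from the project above.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s',
)

# Any logger created with logging.getLogger(__name__) now propagates
# records to the handler installed by basicConfig().
logger = logging.getLogger(__name__)
logger.debug('Logging configured.')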
Example No. 2
def __init__(self, settings):
    self.req_limits = settings.gets('CONCURRENCY')
    self.recv_req = []
    self.waiting = False
    self.spiders = None
    self.logger = getLogger(__name__)
    self.logger.debug('Loaded scheduler.')
Example No. 3
def __init__(self, settings):
    super(WebCrawler, self).__init__()
    self.settings = settings
    self.logger = getLogger(__name__)
    self.semaphore = asyncio.Semaphore(
        self.settings['project'].CONCURRENCY)
    self._install_requester()
Example No. 4
def __init__(self, settings):
    self.settings = settings
    self._spiders = {}
    self.project_path = settings['project'].PROJECT_NAME
    self._found = defaultdict(list)
    self.warn_only = True
    self.logger = getLogger(__name__)
Example No. 5
def __init__(self, settings, spiders):
    self._settings = settings
    self.spiders = spiders
    self._attrs = ('mw', 'resp_mw', 'req_mw')
    self.logger = getLogger(__name__)
    MiddleWareManager.logger = self.logger
    self.load_middlewares()
Example No. 6
def __init__(self, settings):
    self.settings = settings
    self.spider_loader = SpiderLoader(settings)
    self.spiders = self.spider_loader.load_all_spiders()
    self.crawler = WebCrawler(settings)
    self.scheduler = Scheduler(settings)
    self.looper = Looper()
    self.spider_hub = SpiderHub(settings, self.crawler)
    self.logger = getLogger(__name__)
Example No. 7
def __init__(self, *args, **kwargs):
    super(BaseSpider, self).__init__(*args, **kwargs)
    self.status = 'CREATED'
    self.requests = []
    self.session = None
    self.logger = getLogger(__name__)
    self._load_settings()
    self._load_filters()
    self._init_session()
Example No. 8
def __init__(self, settings, crawler):
    super(SpiderHub, self).__init__()
    self.settings = settings
    self._success_counter = 0
    self._failed_counter = 0
    self._exception_counter = 0
    self.active = False
    self.looper = None
    self._crawler = crawler
    self.logger = getLogger(__name__)
    self._set_queue()
Example No. 9
class MediaRequester(CrawlRequester):

    _down_type = 'media'

    logger = getLogger(__name__)

    async def crawl(self, request):
        delay = request.delay
        url = request.url
        session = request.spider.session
        proxy = request.proxy
        buffer = request.spider.settings.DEFAULT_DOWNLOAD_BUFFER
        path = os.path.normpath(request.save_path)
        if not os.path.exists(os.path.dirname(path)):
            self.logger.error(f'Path does not exist: {os.path.dirname(path)}.')
            return
        name = os.path.basename(path)
        try:
            self.logger.info(f'Downloading {name}.')
            async with self._crawler.semaphore:
                resp = await send_async_http(
                    session,
                    request.method,
                    url,
                    path=path,
                    retries=request.retry,
                    timeout=request.timeout,
                    proxies=proxy,
                    buffer=buffer,
                )
                if resp is None:
                    return
                body = resp['body']
                exception = resp['exception']
                if exception and body != True:
                    return Response(url, status=-1, request=request, exc=exception)
            await asyncio.sleep(delay)
            size = get_file_size(size=int(resp['size']))
            self.logger.info(f'Finished downloading:[{name} {size}]')
            return
        except asyncio.CancelledError:
            self.logger.info(f'Task "{request}" canceled.')
            return Response(url, status=0, request=request)
        except Exception as e:
            return Response(url, status=-1, request=request, exc=e.__class__())
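
The snippet above reports the downloaded size through a get_file_size helper that is not shown on this page. A hypothetical stand-in with the same call shape (a byte count in, a human-readable string out) might look like the following; the name, keyword argument, and output format mirror the call above but are assumptions, not the project's actual implementation.

def get_file_size(size):
    # Hypothetical helper: turn a byte count into a human-readable string.
    # The real project's version may format the value differently.
    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
        if size < 1024 or unit == 'TB':
            return f'{size:.2f}{unit}'
        size /= 1024

print(get_file_size(size=1536))  # -> '1.50KB'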
Example No. 10
class RetryPagesMiddleware(Middleware):

    logger = getLogger(__name__)

    def process_response(self, response):
        spider = response.spider
        codes = spider.settings.REQUESTS_ERROR_RETRY_STATUS
        if spider.settings.REQUESTS_FAIL_RETRY_ENABLE:
            if response.status != 200:
                if ((response.status in codes and response.status != -1)
                        or response.exception.__class__ in exceptions):
                    _tried = response.request._tried
                    if _tried > spider.settings.REQUESTS_FAIL_RETRY_DEPTH:
                        return response
                    response.request._tried = _tried + 1
                    response.request.proxy = None
                    self.logger.debug(
                        f'{response.request} scheduled to retry. Tried: {_tried}'
                    )
                    spider._retries.append(response.request)
        return response
Example No. 11
def __init__(self):
    self.rparser = {}
    self.rubbish = set()
    self.logger = getLogger(__name__)
Example No. 12
def __init__(self):
    self.loop = asyncio.get_event_loop()
    self.logger = getLogger(__name__)
Example No. 13
class HttpProxyMiddleware(Middleware):

    inited = False
    invalid_pool = {}
    proxy_pool = set()
    logger = getLogger(__name__)

    def _proxy_invalid(self, proxy, url):
        domain = parse_url(url).netloc
        if proxy in self.invalid_pool:
            return domain in self.invalid_pool[proxy]
        return False

    def process_request(self, request):
        if not request.spider.settings.HTTP_PROXY_ENABLE:
            request.proxy = None
            return request
        _type = request.down_type
        proxy = request.proxy
        url = request.url
        if proxy:
            if not is_proxy_valid(proxy):
                if request.spider.settings.HTTP_PROXY_FILL_ENABLE:
                    request.proxy = self.get_proxy(request)
                    if request.proxy:
                        self.logger.warning(
                            f'Filling a new proxy {request.proxy} for {url}.')
                else:
                    self.logger.error(f'Not a valid HTTP proxy: {proxy}')
                    request.proxy = None
                return request
            elif self._proxy_invalid(proxy, url):
                self.logger.warning(f'Proxy {proxy} was marked invalid for {url} before.')
                if request.spider.settings.HTTP_PROXY_FILL_ENABLE:
                    request.proxy = self.get_proxy(request)
                    if request.proxy:
                        self.logger.warning(
                            f'Filling a new proxy {request.proxy} for {url}.')
                else:
                    self.logger.warning(f'Dropped proxy {proxy} for {url}.')
                    request.proxy = None
                return request
            request.proxy = gen_proxy(proxy, _type)
            self.logger.debug(
                f'[{request.spider.name}] Using proxy {request.proxy} '
                f'for {request.method}-{request.url}')
        else:
            _proxy = None
            while 1:
                _proxy = self.get_proxy(request)
                if _proxy is None:
                    break
                proxy = extract_ip_port(_proxy)
                if self._proxy_invalid(proxy, url):
                    continue
                break
            request.proxy = _proxy
        return request

    def process_response(self, response):
        settings = response.spider.settings
        fakes = settings.HTTP_PROXY_FAKE_STATUS
        domain = parse_url(response.url).netloc
        if not response.spider.settings.HTTP_PROXY_ENABLE:
            return response
        if response.request.proxy and response.status != 200 \
                and response.status not in fakes:
            proxy = extract_ip_port(response.request.proxy)
            if proxy not in self.invalid_pool:
                self.invalid_pool[proxy] = set()
            self.logger.debug(f'Proxy {proxy} is invalid for {domain}.')
            self.invalid_pool[proxy].add(domain)
        elif response.request.proxy and (response.status == 200
                                         or response.status in fakes):
            proxy = extract_ip_port(response.request.proxy)
            if proxy in self.invalid_pool:
                self.invalid_pool[proxy].discard(domain)
            self.proxy_pool.add(proxy)
        return response

    def get_proxy(self, req):
        http_proxy = req.spider.settings.HTTP_PROXY
        if http_proxy:
            if is_proxy_valid(http_proxy):
                proxy = gen_proxy(http_proxy, req.down_type)
                return proxy
            elif is_url(http_proxy):
                return http_proxy
            else:
                if not req.spider.settings.HTTP_PROXY_FILL_ENABLE:
                    self.logger.debug(f'Invalid proxy format:{http_proxy}')
                    return
        _proxy = self.get_proxy_by_api(req)
        proxy = gen_proxy(_proxy, req.down_type)
        return proxy

    def get_proxy_by_api(self, request):
        domain = parse_url(request.url).netloc

        def _get_from_pool():
            while self.proxy_pool:
                proxy = self.proxy_pool.pop()
                if proxy not in self.invalid_pool or\
                        (domain not in self.invalid_pool.get(proxy)):
                    return proxy
                else:
                    continue

        proxy = _get_from_pool()
        if not proxy:
            self.logger.debug('No proxy in the proxy pool. Fetching more.')
            while 1:
                spider = request.spider
                req = amipy.Request(spider,
                                    spider.settings.HTTP_PROXY_API,
                                    delay=0,
                                    ignore=True)
                crawler = spider.binding_hub._crawler
                looper = spider.binding_hub.looper
                coro = crawler.requesters[req.down_type].crawl(req)
                resp = looper.run_coroutine(coro)
                if not resp:
                    self.logger.error(
                        'Getting an HTTP proxy from HTTP_PROXY_API failed.')
                    continue
                _results = [i.strip() for i in resp.text().split('\n')]
                results = [
                    is_proxy_valid(i)[0] for i in _results if is_proxy_valid(i)
                ]
                self.proxy_pool.update(results)
                self.logger.debug(
                    f'Got {len(results)} http proxies from HTTP_PROXY_API.')
                proxy = _get_from_pool()
                if not proxy:
                    continue
                break
        return proxy
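
Stripped of the framework plumbing, the proxy bookkeeping above reduces to two structures: invalid_pool maps a proxy to the set of domains it has failed on, and proxy_pool holds proxies that returned a good response at least once. A standalone sketch of that pattern, outside the middleware, follows; the helper names are illustrative only.

# Standalone sketch of the invalid-pool bookkeeping used by HttpProxyMiddleware.
invalid_pool = {}   # proxy -> set of domains the proxy failed on
proxy_pool = set()  # proxies that produced a good response at least once

def mark_invalid(proxy, domain):
    invalid_pool.setdefault(proxy, set()).add(domain)

def mark_valid(proxy, domain):
    invalid_pool.get(proxy, set()).discard(domain)
    proxy_pool.add(proxy)

def usable(proxy, domain):
    # A proxy is usable for a domain unless it already failed there.
    return domain not in invalid_pool.get(proxy, set())

mark_invalid('1.2.3.4:8080', 'example.com')
print(usable('1.2.3.4:8080', 'example.com'))  # False
print(usable('1.2.3.4:8080', 'example.org'))  # True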