def wrapper(*args, **kwargs):
    # Inner function of an exception-logging decorator: run the wrapped
    # function and log the full traceback instead of letting it propagate.
    try:
        return func(*args, **kwargs)
    except Exception:
        logger.info('【%s】error:%s' % (func.__name__, traceback.format_exc()))
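# Usage sketch (not from the original source): a wrapper like the one above is
# normally produced by a decorator factory. The names `catch_exceptions` and
# `risky` below are illustrative assumptions.
import functools
import logging
import traceback

logger = logging.getLogger(__name__)

def catch_exceptions(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            logger.info('【%s】error:%s' % (func.__name__, traceback.format_exc()))
    return wrapper

@catch_exceptions
def risky():
    raise ValueError('boom')

risky()  # the traceback is logged and swallowed instead of propagating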
def wrapper(*args, **kwargs):
    # Inner function of a generator-timing decorator: calling func() only
    # creates the generator object, so the clock starts just before iteration
    # and stops once the caller has exhausted every yielded item.
    rets = func(*args, **kwargs)
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    for ret in rets:
        yield ret
    logger.info(func.__name__ + ' run time: ' + '{:.9f}'.format(time.perf_counter() - start))
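# Quick check of the subtlety above: calling a generator function does not run
# its body, so work inside it is only measured during iteration. The function
# name below is an illustrative assumption.
import time

def slow_items():
    time.sleep(0.2)   # runs on the first next(), not at call time
    yield 1

gen = slow_items()    # returns immediately, nothing measured yet
start = time.perf_counter()
list(gen)             # the generator body actually executes here
print('{:.3f}'.format(time.perf_counter() - start))  # ~0.200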
def wrapper(self, response):
    # Inner function of a response-checking decorator for processor callbacks.
    # A requests.Response is falsy for 4xx/5xx status codes, so this branch
    # catches downloads that completed with an error status as well as None.
    if not response.m_response:
        if response.m_response is None:
            logger.error('response.m_response is None and url : ' + response.request.url
                         + ' and request has been pushed to the queue again!')
        else:
            # Use .text (str) rather than .content (bytes) so the
            # concatenation also works on Python 3.
            logger.error('response.m_response failed 【' + str(response.m_response.status_code)
                         + '】 and url : ' + response.request.url
                         + ' content:' + response.m_response.text
                         + ' and request has been pushed to the queue again!')
        # Yield the original request so the scheduler can retry it.
        yield response.request
    else:
        process = func(self, response)
        if process is not None:
            try:
                start = time.perf_counter()  # time.clock() was removed in Python 3.8
                for callback in process:
                    yield callback
                logger.info(func.__name__ + ' run time: ' + '{:.9f}'.format(time.perf_counter() - start))
            except Exception:
                logger.error('process error: ' + response.request.url + '\r\n'
                             + response.m_response.text + '\r\n' + traceback.format_exc())
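# Self-contained sketch of the retry behaviour above. Every name here
# (check_response, FakeRequest, FakeResponse, Processor) is an illustrative
# assumption, not the framework's actual API.
import logging

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger(__name__)

class FakeRequest(object):
    def __init__(self, url):
        self.url = url

class FakeResponse(object):
    def __init__(self, request, m_response=None):
        self.request = request
        self.m_response = m_response  # None simulates a failed download

def check_response(func):
    def wrapper(self, response):
        if not response.m_response:
            logger.error('download failed, re-queueing: ' + response.request.url)
            yield response.request  # the scheduler pushes this back onto the queue
        else:
            for callback in func(self, response):
                yield callback
    return wrapper

class Processor(object):
    @check_response
    def process(self, response):
        yield 'parsed: ' + response.request.url

failed = FakeResponse(FakeRequest('http://example.com'))
print(list(Processor().process(failed)))  # [the original FakeRequest], ready to retry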
def download(self, request):
    # Borrow a driver from the pool, render the page, and always return the
    # driver so a navigation or script error cannot leak a browser instance.
    web = self.web_driver_pool.get()  # type: WebDriver
    try:
        web.get(request.url)
        response = Response(content=web.execute_script("return document.documentElement.outerHTML"),
                            request=request)
    finally:
        self.web_driver_pool.put(web)
    logger.info("selenium download success:" + request.url)
    return response
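# get_web_driver_pool presumably returns a queue of pre-started drivers. A
# minimal sketch of that idea with the standard library and Selenium; this
# helper is an assumption, not the framework's implementation.
from queue import Queue

from selenium import webdriver

def get_web_driver_pool(size):
    # Pre-start `size` headless browsers and park them in a thread-safe queue,
    # bounding how many browser instances can run concurrently.
    pool = Queue()
    for _ in range(size):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        pool.put(webdriver.Chrome(options=options))
    return pool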
def stop(self):
    # Three cases: already stopped (nothing to do), another caller is already
    # stopping (wait for it to finish), or running (request a stop, then wait).
    if self._spider_status == 'stopped':
        logger.info("STOP %s SUCCESS" % self._spider_id)
        return
    elif self._spider_status == 'stopping':
        while self._spider_status == 'stopping':
            pass  # spin until the crawl loop flips the status to 'stopped'
    elif self._spider_status == 'start':
        self._spider_status = 'stopping'
        while self._spider_status == 'stopping':
            pass  # spin until the crawl loop flips the status to 'stopped'
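# The spin loops above burn a CPU core while waiting. A threading.Event is the
# idiomatic primitive for the same handshake; this is a hypothetical sketch,
# not the original implementation.
import threading

class Spider(object):
    def __init__(self):
        self._stop_requested = False
        self._stopped = threading.Event()

    def stop(self):
        self._stop_requested = True
        self._stopped.wait()  # sleeps until the crawl loop acknowledges

    def _crawl_loop(self, batches):
        for batch in batches:
            if self._stop_requested:
                break
            # ... download and process the batch ...
        self._stopped.set()  # acknowledge: the spider is now fully stopped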
def start(self):
    try:
        logger.info("START %s SUCCESS" % self._spider_id)
        self._spider_status = 'start'
        self._queue = PriorityQueue(self._processor)
        # Seed the queue with the processor's start requests; these skip
        # duplicate filtering so a restart always re-crawls the entry pages.
        if len(self._processor.start_requests) > 0:
            for start_request in self._processor.start_requests:
                if self._should_follow(start_request):
                    start_request.duplicate_remove = False
                    self._queue.push(start_request)
                    logger.info("start request:" + str(start_request))
        # Pull requests in batches until the queue drains or a stop is requested.
        for batch in self._batch_requests():
            if len(batch) > 0:
                self._crawl(batch)
            if self._spider_status == 'stopping':
                break
        self._spider_status = 'stopped'
        logger.info("STOP %s SUCCESS" % self._spider_id)
    except Exception:
        logger.info("%s -- Exception -- Stopped -- %s" % (self._spider_id, traceback.format_exc()))
        self._spider_status = 'stopped'
def init_pool(self):
    logger.info('init web driver pool...')
    self.web_driver_pool = get_web_driver_pool(1)
    logger.info('init web driver pool success')
def __init__(self):
    logger.info("init web driver pool...")
    self.web_driver_pool = get_web_driver_pool(default_settings.DRIVER_POOL_SIZE)
    logger.info("init web driver pool success")
def download(self, batch):
    batch_requests = []
    sent_requests = []  # kept parallel to batch_requests so responses pair correctly
    for request in batch:
        session = requests.session()
        # Mount the retry adapter for both schemes.
        session.mount('https://', self._request_retry)
        session.mount('http://', self._request_retry)
        if not request.headers:
            request.headers = self._headers
        session.headers = self._headers
        # Keyword arguments shared by GET and POST.
        kwargs = dict(session=session, url=request.url, headers=request.headers,
                      cookies=self._cookies, verify=False,
                      allow_redirects=request.allow_redirects, timeout=request.timeout)
        if self.use_proxy:
            kwargs['proxies'] = self.proxy_pool.getProxy()
        if request.method.upper() == "GET":
            batch_requests.append(grequests.get(**kwargs))
        elif request.method.upper() == "POST":
            batch_requests.append(grequests.post(data=request.data, json=request.json, **kwargs))
        else:
            # Unsupported HTTP method: skip this request. Tracking sent_requests
            # keeps the pairing below aligned (the original index-based pairing
            # could associate responses with the wrong requests here).
            continue
        sent_requests.append(request)
    # map() preserves input order; a failed download comes back as None.
    rets = grequests.map(batch_requests, exception_handler=exception_handler)
    true_responses = []
    for ret, request in zip(rets, sent_requests):
        true_response = Response(m_response=ret, request=request)
        true_responses.append(true_response)
        logger.info(true_response)
    return true_responses
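# Minimal demonstration of the grequests contract relied on above: map()
# returns responses in input order, and a failed download comes back as None
# (or whatever the exception_handler returns). Assumes network access; the
# URLs are illustrative.
import grequests

reqs = [
    grequests.get('http://example.com', timeout=10),
    grequests.get('http://no-such-host.invalid', timeout=10),
]
for ret in grequests.map(reqs):
    print(ret)  # <Response [200]>, then None for the unreachable host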
def wrapper(*args, **kwargs):
    # Inner function of a timing decorator for plain (non-generator) functions.
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    ret = func(*args, **kwargs)
    logger.info(func.__name__ + ' run time: ' + '{:.9f}'.format(time.perf_counter() - start))
    return ret
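# Usage sketch with assumed names (`timed`, `work`): the decorator factory
# that would produce the wrapper above, applied to an ordinary function.
import functools
import logging
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def timed(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        ret = func(*args, **kwargs)
        logger.info(func.__name__ + ' run time: ' + '{:.9f}'.format(time.perf_counter() - start))
        return ret
    return wrapper

@timed
def work():
    return sum(range(100000))

work()  # logs e.g. "work run time: 0.001..."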