def main(self, url, response, referer=None, tag=0):
    try:
        # Refuse to crawl when the worker has been stopped.
        if not self.status:
            logging.info('status stopped...')
            err = {
                'code': 500,
                'url': url,
                'msg': 'status stopped...',
                'at': datetime.utcnow()
            }
            raise SpiderError(err)

        self.referer = referer
        logging.info('referer... %s' % referer)

        proxies = select_proxy()
        logging.info('proxy...')
        logging.info(proxies)

        h = self.headers
        logging.info(h)
        logging.info('cookies...')
        c = self.cookies
        logging.info(c)

        # Fetch the page, then feed the response cookies/headers back to the worker.
        resp = requests.get(url, headers=h, cookies=c, proxies=proxies)
        resp.encoding = self.encoding
        self.worker_cookies(resp.cookies)
        resp.headers.update({'Referer': url})
        self.worker_headers(resp.headers)
        logging.info('resp.headers...')
        logging.info(resp.headers)

        if resp.status_code == 404:
            # do nothing
            logging.info('Not found... %d' % resp.status_code)
        elif resp.status_code != 200:
            logging.info('Not 200... %d' % resp.status_code)
            err = {
                'code': resp.status_code,
                'url': url,
                'msg': 'not 200',
                'at': datetime.utcnow()
            }
            raise SpiderError(err)
        else:
            parsed = urlparse(url)
            logging.info(parsed)
            kwargs = dict(
                pname=self.pname,
                url=url,
                text=resp.text if SAVE_TEXT else '',
                headers=resp.headers if SAVE_HEADERS else {},
                encoding=self.encoding,
                response=response,
                scheme=parsed.scheme,
                host=parsed.netloc,
                query=parsed.query,
                path=parsed.path
            )
            logging.info('response tag... %d' % tag)

            # Parse the page and let the response callback extract follow-up URLs.
            soup = BeautifulSoup(resp.text, "lxml")
            tag, urls, opt = response(self, soup, tag, **kwargs)
            kwargs.update({'tag': tag})
            kwargs.update({'option': opt})
            self.visited(url, kwargs)

            logging.info('urls... %d', len(urls))
            if not urls:
                logging.info('No urls...')
            else:
                # Back off (at most 10 times) while the job queue is saturated.
                i = 0
                while len(self.jobs) > self.max_job_count:
                    if i == 10:
                        break
                    msg = 'busy... %d > %d' % \
                        (len(self.jobs), self.max_job_count)
                    i += 1
                    self.sleep(msg)

                urls = self.clean_urls(urls)
                logging.info('enqueue urls... %d', len(urls))
                if i == 0:
                    projects.enqueue(self.redis, self.pname, self.db,
                                     self.max_job_count, self.interval,
                                     self.wait, urls, response,
                                     referer=self.referer, qname=self.qname,
                                     tag=tag, debug=self.debug)

        logging.info('done!')
    except Exception as e:
        logging.warning(str(e))
        importlib.import_module('mylib.logger').sentry()
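# NOTE: a minimal sketch of the module-level imports main() relies on, inferred
# from the calls above. The project-level names (SpiderError, select_proxy,
# projects, SAVE_TEXT, SAVE_HEADERS) are defined elsewhere in this repo; the
# commented module paths below are assumptions, not taken from this excerpt.
import importlib
import logging
from datetime import datetime

import requests
from bs4 import BeautifulSoup

try:
    from urllib.parse import urlparse   # Python 3
except ImportError:
    from urlparse import urlparse       # Python 2

# Project-level names used above (locations assumed):
# from mylib.errors import SpiderError
# from mylib.proxy import select_proxy
# from mylib import projects
# SAVE_TEXT / SAVE_HEADERS: module-level flags controlling what gets persisted.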
try:
    # Resolve debug early so the except handler below can always reference it.
    debug = args.debug

    if args.p and args.stop:
        update_worker(args.p, 'status', 0)
    if args.p and args.start:
        update_worker(args.p, 'status', 1)
    elif args.p and args.max_job_count:
        update_worker(args.p, 'max_job_count', int(args.max_job_count))
    elif args.p and args.wait:
        update_worker(args.p, 'wait', int(args.wait))
    else:
        # Fresh run: pull crawl settings from the project module and enqueue
        # the base URL.
        max_job_count = getattr(m, 'MAX_JOB_COUNT')
        wait = getattr(m, 'WAIT')
        url = getattr(m, 'BASE_URL')
        interval = getattr(m, 'INTERVAL')
        redis = {
            'HOST': rq.REDIS_HOST,
            'PORT': rq.REDIS_PORT,
            'DB': rq.REDIS_DB
        }
        projects.enqueue(redis, args.p, DB, max_job_count, interval, wait,
                         [url], response, qname=args.q, debug=args.debug)
except:
    importlib.import_module('mylib.logger').sentry(debug=debug)
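# NOTE: the branch above reads args.p, args.stop, args.start, args.max_job_count,
# args.wait, args.q, and args.debug. A hedged sketch of an argparse parser that
# would produce those attributes follows; the actual flag spellings and help
# text live elsewhere in the repo and are assumptions here.
import argparse

parser = argparse.ArgumentParser(description='spider worker control')
parser.add_argument('-p', help='project name')
parser.add_argument('-q', help='queue name (qname)')
parser.add_argument('--stop', action='store_true', help='set worker status to 0')
parser.add_argument('--start', action='store_true', help='set worker status to 1')
parser.add_argument('--max_job_count', help='update the worker max_job_count')
parser.add_argument('--wait', help='update the worker wait seconds')
parser.add_argument('--debug', action='store_true', help='enable debug mode')
args = parser.parse_args()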