def process_request_exception(self, request, exception):
    # Count the failed request.
    self.request_failed_counter.inc()
    try:
        req_item = {
            'url': request.url,
            'method': request.method,
            'cookies': request.cookies,
            'header': {i.decode(): request.headers[i].decode() for i in request.headers},
            'encoding': request.encoding,
            'body': str(request.body),
        }
    except Exception:
        req_item = {}
    self.request_db.conn.insert({
        'target_url': request.url,
        'task_code': self.spider.task_code,
        'rule_code': self.spider.rule_code,
        'job_code': self.spider.job_code,
        'request_code': request.meta['request_code'],
        'status': str(exception),
        'request_at': getFormatTime(request.meta['request_at']),
        'response_at': getFormatTime(getNowTime()),
        'cost_time': getDifferTime(getNowTime(), request.meta['request_at']),
        'request': str(req_item),
    })
    self.spider.logger.error('Request exception: %s' % exception)
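# The helpers getFormatTime / getNowTime / getDifferTime are used throughout this
# module but defined elsewhere in the project. A minimal sketch of the assumed
# behaviour, inferred only from how they are called above (timestamps in,
# formatted strings / elapsed seconds out); the real implementations may differ:
import time
from datetime import datetime


def getNowTime():
    # Current time as a UNIX timestamp (seconds).
    return time.time()


def getFormatTime(timestamp):
    # Render a UNIX timestamp as a human-readable datetime string.
    return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')


def getDifferTime(end_timestamp, start_timestamp):
    # Elapsed seconds between two UNIX timestamps.
    return round(end_timestamp - start_timestamp, 3)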
def close_spider(self, spider):
    # spider (Spider object) - the spider being closed.
    # Optional hook, called when the spider is closed.
    # import requests as req
    # url = 'https://oapi.dingtalk.com/robot/send?access_token='
    # req.api.post(url + '9dc23fc9f30248e54bcf36ca66d4cbc664660a51e791d282dbe9f97177c55ae6',
    #              json=json.dumps({"msgtype": 'markdown',
    #                               'markdown': {"title": 'Spider finished',
    #                                            "text": 'Spider task finished, rule_code of this run: %s' % spider.rule_code},
    #                               "at": {"isAtAll": True}}))
    spider.logger.info('Spider finished')
    _req_counter = 0
    _req_suc_counter = 0
    _req_fai_counter = 0
    try:
        _req_counter = int(spider.request_part.request_counter.counter)
        _req_suc_counter = int(spider.request_part.request_success_counter.counter)
        _req_fai_counter = int(spider.request_part.request_failed_counter.counter)
    except Exception as e:
        spider.logger.error('Failed to collect spider request statistics: %s' % e)
    # Update the job record on close.
    spider.task_part.task_job_db.conn.find_one_and_update(
        filter=dict(_id=spider.task_part.job.get('_id')),
        update={
            '$set': dict(status=2,
                         request_cursor=_req_counter,
                         request_total=int(len(spider.task_urls)),
                         request_success=_req_suc_counter,
                         request_failed=_req_fai_counter,
                         finished_at=getFormatTime(getNowTime()),
                         cost_time=getDifferTime(getNowTime(), spider.spider_start_at))
        })
    # Update the task record on close.
    spider.task_part.task_db.conn.find_one_and_update(
        filter=dict(_id=spider.task_part.task.get('_id')),
        update={
            '$set': dict(
                # request_total=int(spider.request_part.request_counter.counter) + _req_counter,
                request_success=int(spider.request_part.request_success_counter.counter) + _req_suc_counter,
                request_failed=int(spider.request_part.request_failed_counter.counter) + _req_fai_counter,
                updated_at=getFormatTime(getNowTime()),
            )
        })
def process_sync_proxy(self, data, db):
    """
    Store synced proxy IPs into the proxy pool.
    :param data:
    :param db:
    """
    try:
        for response_item in data:
            response_item.update({'sync_at': getFormatTime(getNowTime())})
            db.conn.find_one_and_update(
                {'proxy_ip': response_item['proxy_ip']},
                {'$set': dict({'weight': 0, 'use_count': 0}, **response_item)},
                projection={'proxy_ip': True, '_id': True},
                upsert=True,
                return_document=ReturnDocument.AFTER)
            logging.info(msg='Synced proxy IP [%s] OK!' % response_item['proxy_ip'])
    except Exception as e:
        # logging.error() does not accept a `data` keyword; include the payload in the message instead.
        logging.error('Failed to sync proxy IPs: %s, data=%s' % (e, data))
def process_build_url_collection(self, rule_url_dict: dict = None):
    # TODO: these configuration values should be moved into the config file.
    if rule_url_dict is None:
        rule_url_dict = {}
    _task_redo = 0
    try:
        _task_redo = int(self.task_part.task.get('redo', 0))
    except Exception as e:
        self.logger.error('Failed to read the task redo setting: %s' % e)
    if _task_redo:
        self.logger.info('Spider task is in redo mode')
        # Force a rebuild of the URL collection.
        result = self.url_part.build_url_from_rule(rule_url_dict, _task_redo)
        if result:
            self.task_part.task.update(dict(
                redo=_task_redo,
                request_total=result,
                updated_at=getFormatTime(getNowTime())
            ))
        else:
            raise Exception('Failed to build URLs in bulk')
    else:
        self.logger.info('Spider task is in resume mode')
        _remain_url = self.url_part.url_db.conn.find({
            'rule_code': self.rule_code,
            'task_code': self.task_code,
            'status': 1
        }).count()
        if _remain_url < 1:
            self.logger.info('This spider task is already complete; the spider will stop')
            raise Exception('This spider task is already complete')
def process_request_response(self, request, response):
    try:
        req_item = {
            'url': request.url,
            'method': request.method,
            'cookies': request.cookies,
            'header': {i.decode(): request.headers[i].decode() for i in request.headers},
            'encoding': request.encoding,
            'body': str(request.body),
        }
        res_item = {
            'url': response.url,
            'header': {i.decode(): response.headers[i].decode() for i in response.headers},
            'encoding': response.encoding,
            'body': str(response.body),
        }
        # Record the request/response pair.
        new_request = self.request_db.get_new_row()
        new_request.update({
            'target_url': response.url,
            'task_code': self.spider.task_code,
            'rule_code': self.spider.rule_code,
            'job_code': self.spider.job_code,
            'request_code': request.meta['request_code'],
            'status': response.status,
            'request_at': getFormatTime(request.meta['request_at']),
            'response_at': getFormatTime(getNowTime()),
            'cost_time': getDifferTime(getNowTime(), request.meta['request_at']),
            'request_data': str(req_item),
            'response_data': str(res_item)
        })
        self.request_db.conn.insert(new_request)
        if response.status == 200:
            self.request_success_counter.inc()
            self.spider.logger.info('Request %s succeeded and was stored' % request.url)
        else:
            self.spider.logger.error('Request %s failed' % request.url)
            raise Exception('Request %s failed' % request.url)
    except Exception as e:
        self.spider.logger.error(e)
class CaptureUrlMode(MongoDB):
    class Meta:
        db_table = 'pub_capture_url'
        fields = dict(
            url=tableField(),
            variants=tableField(),
            url_template=tableField(),
            job_code=tableField(),
            rule_code=tableField(),
            status=tableField(default=1),
            used_at=tableField(default=''),
            # Note: this default is evaluated once at class-definition time, not per insert.
            created_at=tableField(default=getFormatTime(getNowTime())),
        )
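# tableField and the MongoDB base class are defined elsewhere in the project.
# A minimal sketch of the assumed contract, inferred only from the model classes
# and from get_new_row() usage in this module (each field carries an optional
# default that seeds new rows); the real implementation may differ:
class tableField:
    def __init__(self, default=None):
        self.default = default


# Hypothetical usage: build a fresh row dict from the declared field defaults.
# new_row = {name: field.default for name, field in CaptureUrlMode.Meta.fields.items()}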
def update_job(self, task_code, job_code, job_dict: dict = None):
    if job_dict is None:
        job_dict = {}
    job_dict.update({'updated_at': getFormatTime(getNowTime())})
    self.job = self.task_job_db.conn.find_one_and_update(
        filter=dict(
            task_code=task_code,
            job_code=job_code,
        ),
        update={'$set': job_dict},
        return_document=ReturnDocument.AFTER)
    return self.job
def update_task(self, task_code, task_dict: dict = None):
    """
    Update a task record.
    :param task_code:
    :param task_dict:
    :return:
    """
    if task_dict is None:
        task_dict = {}
    task_dict.update({'updated_at': getFormatTime(getNowTime())})
    self.task = self.task_db.conn.find_one_and_update(
        filter={'task_code': task_code},
        update={'$set': task_dict},
        return_document=ReturnDocument.AFTER)
    return self.task
def load_job(self, task_code=None, rule_code=None, job_code=None):
    # Create the job record.
    new_task_detail = self.task_job_db.get_new_row()
    new_task_detail.update(dict(
        task_code=task_code,
        rule_code=rule_code,
        job_code=job_code,
        status=1,
        created_at=getFormatTime(getNowTime()),
        scrapyd_job_id=os.environ.get('SCRAPY_JOB', 'error'),
    ))
    # Return the job record.
    _job_id = self.task_job_db.conn.insert_one(new_task_detail).inserted_id
    self.job = self.task_job_db.conn.find_one(filter={'_id': _job_id})
    return self.job
def process_sync_keyword(self, data, db):
    try:
        for response_item in data:
            response_item.update({'sync_at': getFormatTime(getNowTime())})
            res = db.conn.find_one_and_update(
                {'id': response_item['id']},
                {'$set': dict({}, **response_item)},
                projection={'id': True, 'title': True, '_id': False},
                upsert=True,
                return_document=ReturnDocument.AFTER)
            logging.info(msg='Synced keyword [%s] OK!' % response_item['title'])
    except Exception as e:
        # logging.error() does not accept a `data` keyword; include the payload in the message instead.
        logging.error('Failed to sync keywords: %s, data=%s' % (e, data))
class ProxyIpPoolModel(MongoDB):
    class Meta:
        db_table = 'pub_proxy_ip'
        fields = dict(
            status=tableField(default=1),
            proxy_ip=tableField(),
            proxy_port=tableField(),
            agent_type=tableField(default=1),
            country=tableField(),
            regoin=tableField(),  # field name kept as stored in the collection
            city=tableField(),
            created_at=tableField(),
            priority=tableField(default=100),
            # Note: this default is evaluated once at class-definition time, not per insert.
            sync_at=tableField(default=getFormatTime(getNowTime())),
        )
def load_url_by_slice(self, url_slice=20):
    # Atomically claim up to `url_slice` pending URLs (status=1) for this job,
    # marking each claimed document as in use (status=2).
    _urls = [
        self.url_db.conn.find_one_and_update(
            filter=dict(
                rule_code=self.spider.rule_code,
                job_code=self.spider.job_code,
                status=1,
            ),
            update={
                '$set': dict(
                    used_at=getFormatTime(getNowTime()),
                    status=2,
                )
            },
            return_document=ReturnDocument.BEFORE,
            sort=[('sort', pymongo.ASCENDING)])
        for i in range(url_slice)
    ]
    _urls = [i for i in _urls if i is not None]
    self.spider.logger.info('Slice loaded %s URLs' % len(_urls))
    return _urls
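# A minimal usage sketch (not part of the original code) of how a spider's
# start_requests() might consume load_url_by_slice(); it assumes `import scrapy`
# at module level. The `url_part` attribute and the `request_code` / `request_at`
# meta keys mirror the ones used above; the `url` and `request_code` document
# field names are assumptions.
def start_requests(self):
    for url_doc in self.url_part.load_url_by_slice(url_slice=20):
        yield scrapy.Request(
            url=url_doc['url'],
            meta={
                'request_code': url_doc.get('request_code'),  # assumed field name
                'request_at': getNowTime(),
            },
            callback=self.parse,
        )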
def process_item(self, item, spider):
    spider.logger.info('CaptureHTMLBodyPipeline process item')
    capture_html_db = MongoDB('pub_capture_html')
    counters_db = MongoDB('counters')
    insert_item = dict(item)
    # Fetch the next value from the auto-increment counter document.
    _res = counters_db.conn.find_one_and_update(
        filter={'_id': 'auto_res_id'},
        update={'$inc': {'sequence_value': 1}})
    insert_item.update({
        # 'auto_id': int(getNowTime() * 1000),
        'auto_id': int(_res['sequence_value']),
        'capture_ip': spider.task_code.split('_')[0],
        # Time the content was captured.
        'capture_at': getFormatTime(getNowTime()),
        'capture_timezone': getNowTime(),
    })
    capture_html_db.conn.insert(insert_item)
    return item
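# process_item() above assumes the counter document already exists; if it does not,
# find_one_and_update() returns None and the auto_id lookup raises. A one-off seed
# along these lines would be needed (a sketch, not part of the original code):
# MongoDB('counters').conn.insert({'_id': 'auto_res_id', 'sequence_value': 0})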
def process_sync_prodcut(self, data, db):
    """
    Store synced product data (SKUs).
    :param data:
    :param db:
    """
    try:
        for response_item in data:
            response_item.update({'sync_at': getFormatTime(getNowTime())})
            res = db.conn.find_one_and_update(
                {'id': response_item['id']},
                {'$set': dict({}, **response_item)},
                projection={'sku': True, '_id': False},
                upsert=True,
                return_document=ReturnDocument.AFTER)
            logging.info(msg='Synced product SKU [%s] OK!' % response_item['sku'])
    except Exception as e:
        # logging.error() does not accept a `data` keyword; include the payload in the message instead.
        logging.error('Failed to sync product SKUs: %s, data=%s' % (e, data))
def process_response(self, request, response, spider):
    """
    Handle the response.
    :param request:
    :param response:
    :param spider:
    :return:
    """
    # TODO: handling for 404 / 500 pages could be added here.
    super(SyncDataSpiderMiddlewares, self).process_response(request, response, spider)
    try:
        # Check the result returned by the API call.
        response_data = json.loads(response.body)
        if response_data.get('status') == 200:
            sync_task_db = SyncTaskModel()
            # Update the sync-task record.
            sync_task_db.conn.find_one_and_update(
                filter={
                    'task_code': spider.task_code,
                    'rule_code': spider.rule_code
                },
                update={
                    '$set': {
                        'end_page': response_data['tp'],
                        'cur_page': response_data['page'],
                        'total_row': response_data['tr'],
                        'updated_at': getFormatTime(getNowTime())
                    }
                })
        else:
            spider.logger.error('Post-request %s failed!' % spider.task_api)
            raise Exception('REST API request error')
    except Exception as e:
        spider.logger.error(e)
    finally:
        return response
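# For reference, the API payload assumed by the middleware above looks roughly
# like this; field meanings are inferred only from how they are stored
# (tp = total pages, page = current page, tr = total rows):
# {
#     "status": 200,
#     "tp": 120,
#     "page": 3,
#     "tr": 2390
# }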
def load_task(self, task_code, rule_code, task_dict=None):
    """
    Load a task.
    :param task_dict:
    :param rule_code:
    :param task_code:
    """
    if task_code is None:
        raise Exception('A task code is required to load a spider task')
    # Avoid a mutable default argument.
    if task_dict is None:
        task_dict = {}
    _new_task = self.task_db.get_new_row()
    _new_task.update(task_dict)
    # New tasks default to redo=1 so that URLs get generated.
    _new_task.update(dict(
        task_code=task_code,
        rule_code=rule_code,
        status=1,
        redo=1,
        created_at=getFormatTime(getNowTime()),
    ))
    # Look up an existing task.
    self.task = self.task_db.conn.find_one(filter=dict(
        task_code=task_code,
        rule_code=rule_code,
    ))
    # Create a new task if none exists.
    if self.task is None:
        _task_id = self.task_db.conn.insert_one(_new_task).inserted_id
        self.task = self.task_db.conn.find_one(filter={'_id': _task_id})
        self.spider.logger.info('Spider created task %s' % self.task.get('_id'))
    self.spider.logger.info('Spider loaded task %s' % self.task.get('_id'))
    # TODO: add a task-redo workflow.
    return self.task
def open_spider(self, spider):
    # spider (Spider object) - the spider being opened.
    # Optional hook, called when the spider is opened.
    # Refresh the task information this spider received.
    print('Load in SyncSystemDataPipeline')
    sync_task_db = SyncTaskModel()
    spider.task = sync_task_db.conn.find_one(filter={
        'task_code': spider.task_code,
        'rule_code': spider.rule_code
    })
    if spider.task is None:
        # Create a new sync-task record.
        task_id = sync_task_db.conn.insert(
            dict({
                'updated_at': getFormatTime(getNowTime()),
                'cur_page': 1,
                'end_page': spider.request_end_page,
                'task_code': spider.task_code,
                'rule_code': spider.rule_code
            }))
        spider.task = sync_task_db.conn.find_one({'_id': task_id})