Example #1
    def process_request_exception(self, request, exception):
        self.request_failed_counter.inc()
        # Request failed; snapshot the request for the failure record
        try:
            req_item = {
                'url': request.url,
                'method': request.method,
                'cookies': request.cookies,
                'header': {i.decode(): request.headers[i].decode() for i in request.headers},
                'encoding': request.encoding,
                'body': str(request.body),
            }
        except Exception:
            # Fall back to an empty snapshot if the request cannot be serialized
            req_item = {}

        self.request_db.conn.insert({
            'target_url': request.url,
            'task_code': self.spider.task_code,
            'rule_code': self.spider.rule_code,
            'job_code': self.spider.job_code,
            'request_code': request.meta['request_code'],
            'status': str(exception),
            'request_at': getFormatTime(request.meta['request_at']),
            'response_at': getFormatTime(getNowTime()),
            'cost_time': getDifferTime(getNowTime(), request.meta['request_at']),
            'request': str(req_item),
        })
        self.spider.logger.error('Request exception: %s' % exception)
Example #2
 def process_spider_start(self, **kwargs):
     # Prepare the spider startup data
     self.process_ready_start(**kwargs)
     # Create the spider job code
     self.job_code = str(uuid.uuid4())
     # Record the start time
     self.spider_start_at = getNowTime()
     self.auto_load_parts.update(self.load_parts)
     self.load_parts = self.auto_load_parts
     # Load the spider components
     self.process_load_parts(self.load_parts)
     # Parse the spider rule
     self.process_analysis_rule(rule_xml_str=self.rule)
     # Process the current task data
     self.process_spider_task(rule_task_dict=self.rule_part.get_rule_part('task'))
     # Build the default request headers
     self.request_header = self.process_build_request_header(self.rule_part.get_rule_part('requests'))
     # Build the capture item
     self.process_build_capture_item(self.rule_part.get_rule_part('capture'))
     # Build the URL collection and store it
     self.process_build_url_collection(self.rule_part.get_rule_part('urls'))
     # Fetch URLs in slices
     self.process_build_url_slices(self.rule_part.get_rule_part('task'))
     # Fetch request proxies
     # self.process_build_url_proxy(self.rule_part.get_rule_part('proxy'))
     self.logger.info('Spider is ready, entering the crawl phase')
     self.spider_start_capture_at = getNowTime()
     pass
    def close_spider(self, spider):
        # spider (Spider object) – the spider being closed
        # Optional hook; called when the spider is closed
        # import requests as req
        # url = 'https://oapi.dingtalk.com/robot/send?access_token='
        # req.api.post(url + '9dc23fc9f30248e54bcf36ca66d4cbc664660a51e791d282dbe9f97177c55ae6',
        #              json=json.dumps({"msgtype": 'markdown',
        #                    'markdown': {"title": 'Spider finished', "text": 'Spider task completed, rule_code: %s' % spider.rule_code},
        #                    "at": {"isAtAll": True}}))
        spider.logger.info('Spider finished')
        _req_counter = 0
        _req_suc_counter = 0
        _req_fai_counter = 0
        try:
            _req_counter = int(spider.request_part.request_counter.counter)
            _req_suc_counter = int(
                spider.request_part.request_success_counter.counter)
            _req_fai_counter = int(
                spider.request_part.request_failed_counter.counter)
        except Exception as e:
            spider.logger.error('Spider request statistics failed: %s' % e)

        # On close, update the job record
        spider.task_part.task_job_db.conn.find_one_and_update(
            filter=dict(_id=spider.task_part.job.get('_id')),
            update={
                '$set':
                dict(status=2,
                     request_cursor=_req_counter,
                     request_total=int(len(spider.task_urls)),
                     request_success=_req_suc_counter,
                     request_failed=_req_fai_counter,
                     finished_at=getFormatTime(getNowTime()),
                     cost_time=getDifferTime(getNowTime(),
                                             spider.spider_start_at))
            })
        # On close, update the task record
        spider.task_part.task_db.conn.find_one_and_update(
            filter=dict(_id=spider.task_part.task.get('_id')),
            update={
                '$set':
                dict(
                    # request_total=int(spider.request_part.request_counter.counter) + _req_counter,
                    request_success=int(
                        spider.request_part.request_success_counter.counter) +
                    _req_suc_counter,
                    request_failed=int(
                        spider.request_part.request_failed_counter.counter) +
                    _req_fai_counter,
                    updated_at=getFormatTime(getNowTime()),
                )
            })
        pass
Example #4
    def process_build_url_collection(self, rule_url_dict: dict = None):
        # todo: move these configuration values into the config file
        _task_redo = 0
        try:
            _task_redo = int(self.task_part.task.get('redo', 0))
        except Exception as e:
            self.logger.error('Spider failed to read the slice setting: %s' % e)

        if _task_redo:
            self.logger.info('Spider task is in redo mode')
            # Force a full redo
            result = self.url_part.build_url_from_rule(rule_url_dict, _task_redo)
            if result:
                self.task_part.task.update(dict(
                    redo=_task_redo,
                    request_total=result,
                    updated_at=getFormatTime(getNowTime())
                ))
            else:
                raise Exception('Spider failed to batch-generate URLs')
        else:
            self.logger.info('Spider task is in resume mode')
            _remain_url = self.url_part.url_db.conn.find({
                'rule_code': self.rule_code,
                'task_code': self.task_code,
                'status': 1
            }).count()
            if _remain_url < 1:
                self.logger.info('This spider task is already complete; the spider will stop')
                raise Exception('Spider task already complete')
 def process_sync_proxy(self, data, db):
     """
     Store synced proxy IPs into the proxy pool
     :param data:
     :param db:
     """
     try:
         for response_item in data:
             response_item.update({'sync_at': getFormatTime(getNowTime())})
             db.conn.find_one_and_update(
                 {'proxy_ip': response_item['proxy_ip']}, {
                     '$set':
                     dict({
                         'weight': 0,
                         'use_count': 0
                     }, **response_item)
                 },
                 projection={
                     'proxy_ip': True,
                     '_id': True
                 },
                 upsert=True,
                 return_document=ReturnDocument.AFTER)
             logging.info(msg='Synced proxy IP [%s] OK!' % response_item['proxy_ip'])
     except Exception as e:
         logging.error('Failed to sync proxy IPs: %s; data=%s' % (e, data))
 def process_request(self, request, spider):
     """
     Request handling
     :param request:
     :param spider:
     """
     # self.crawler.engine.close_spider(self, 'Spider request counter gt limit by %s ' % self.request_spider_max)
     request.meta.update({'request_at': getNowTime()})
     spider.request_part.request_counter.inc()
     pass
Example #7
 def process_spider_start(self, **kwargs):
     self.logger.info(kwargs)
     # Create the spider job code
     self.job_code = str(uuid.uuid4())
     # Record the start time
     self.spider_start_at = getNowTime()
     self.auto_load_parts.update(self.load_parts)
     self.load_parts = self.auto_load_parts
     self.process_load_parts(self.load_parts)
     self.process_ready_start(**kwargs)
     # self.process_analysis_rule(rule_xml_str=self.rule)
     self.process_spider_task()
     # self.process_build_capture_item(self.rule_part.get_rule_part('capture'))
     # self.process_build_url_collection(self.rule_part.get_rule_part('urls'), self.rule_part.get_rule_part('task'))
     # self.process_build_url_slices(self.rule_part.get_rule_part('slice'))
     # self.process_build_url_proxy(self.rule_part.get_rule_part('proxy'))
     self.logger.info('Spider is ready, entering the sync phase')
     self.spider_start_capture_at = getNowTime()
     pass
 def process_item(self, item, spider):
     spider.logger.info('CaptureHTMLBodyPipeline process item')
     capture_html_db = MongoDB('pub_capture_html')
     counters_db = MongoDB('counters')
     insert_item = dict(item)
     _res = counters_db.conn.find_one_and_update(
         filter={'_id': 'auto_res_id'},
         update={'$inc': {
             'sequence_value': 1
         }})
     insert_item.update({
         # 'auto_id': int(getNowTime() * 1000),
         'auto_id': int(_res['sequence_value']),
         'capture_ip': spider.task_code.split('_')[0],
         # Time the content was captured
         'capture_at': getFormatTime(getNowTime()),
         'capture_timezone': getNowTime(),
     })
     capture_html_db.conn.insert(insert_item)
     return item
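The auto_id above comes from an atomic $inc on a shared document in the counters collection; because the update is issued without upsert=True, find_one_and_update returns None unless that document already exists (and, with the default return_document, it returns the pre-increment value). A minimal seeding sketch, assuming MongoDB('counters').conn is a plain pymongo collection as the other examples suggest:

# One-time seeding sketch for the 'auto_res_id' counter used above.
# Assumption: MongoDB('counters').conn exposes a pymongo collection.
counters_db = MongoDB('counters')
if counters_db.conn.find_one({'_id': 'auto_res_id'}) is None:
    counters_db.conn.insert_one({'_id': 'auto_res_id', 'sequence_value': 0})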
Example #9
 def process_request_response(self, request, response):
     try:
         req_item = {
             'url': request.url,
             'method': request.method,
             'cookies': request.cookies,
             'header': {i.decode(): request.headers[i].decode() for i in request.headers},
             'encoding': request.encoding,
             'body': str(request.body),
         }
         res_item = {
             'url': response.url,
             'header': {i.decode(): response.headers[i].decode() for i in response.headers},
             'encoding': response.encoding,
             'body': str(response.body),
         }
         # Record the response details
         new_request = self.request_db.get_new_row()
         new_request.update({
             'target_url': response.url,
             'task_code': self.spider.task_code,
             'rule_code': self.spider.rule_code,
             'job_code': self.spider.job_code,
             'request_code': request.meta['request_code'],
             'status': response.status,
             'request_at': getFormatTime(request.meta['request_at']),
             'response_at': getFormatTime(getNowTime()),
             'cost_time': getDifferTime(getNowTime(), request.meta['request_at']),
             'request_data': str(req_item),
             'response_data': str(res_item)
         })
         self.request_db.conn.insert(new_request)
         if response.status == 200:
             self.request_success_counter.inc()
             self.spider.logger.info('Request %s succeeded and was stored' % request.url)
         else:
             self.spider.logger.error('Request %s failed' % request.url)
             raise Exception('Request %s failed' % request.url)
     except Exception as e:
         self.spider.logger.error(e)
Example #10
class CaptureUrlMode(MongoDB):
    class Meta:
        db_table = 'pub_capture_url'

    fields = dict(
        url=tableField(),
        variants=tableField(),
        url_template=tableField(),
        job_code=tableField(),
        rule_code=tableField(),
        status=tableField(default=1),
        used_at=tableField(default=''),
        # Note: this default is evaluated once at import time, not per insert.
        created_at=tableField(default=getFormatTime(getNowTime())),
    )
Example #11
    def update_job(self, task_code, job_code, job_dict: dict = None):
        if job_dict is None:
            job_dict = {}

        job_dict.update({'updated_at': getFormatTime(getNowTime())})
        self.job = self.task_job_db.conn.find_one_and_update(
            filter=dict(
                task_code=task_code,
                job_code=job_code,
            ), update={
                '$set': job_dict
            },
            return_document=ReturnDocument.AFTER)
        return self.job
Example #12
 def update_task(self, task_code, task_dict: dict = None):
     """
     Update the task
     :param task_code:
     :param task_dict:
     :return:
     """
     if task_dict is None:
         task_dict = {}
     task_dict.update({'updated_at': getFormatTime(getNowTime()), })
     self.task = self.task_db.conn.find_one_and_update(filter={'task_code': task_code},
                                                       update={'$set': task_dict},
                                                       return_document=ReturnDocument.AFTER)
     return self.task
Example #13
    def load_job(self, task_code=None, rule_code=None, job_code=None):

        # Create the job record
        new_task_detail = self.task_job_db.get_new_row()
        new_task_detail.update(dict(
            task_code=task_code,
            rule_code=rule_code,
            job_code=job_code,
            status=1,
            created_at=getFormatTime(getNowTime()),
            scrapyd_job_id=os.environ.get('SCRAPY_JOB', 'error'),
        ))

        # Return the job info
        _job_id = self.task_job_db.conn.insert_one(new_task_detail).inserted_id
        self.job = self.task_job_db.conn.find_one(filter={'_id': _job_id})
        return self.job
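The scrapyd_job_id field above is read from the SCRAPY_JOB environment variable, which scrapyd sets when it launches a crawl. A hedged sketch of scheduling such a job through scrapyd's schedule.json endpoint; the project, spider, and argument names are assumptions:

import requests

# Hedged sketch: schedule a crawl via scrapyd, which sets SCRAPY_JOB for load_job above.
# 'capture_project' and 'capture_spider' are hypothetical names; the extra form fields
# (task_code, rule_code) are forwarded to the spider as arguments.
requests.post('http://localhost:6800/schedule.json', data={
    'project': 'capture_project',
    'spider': 'capture_spider',
    'task_code': 'demo-task',
    'rule_code': 'demo-rule',
})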
Example #14
 def process_sync_keyword(self, data, db):
     try:
         for response_item in data:
             response_item.update({'sync_at': getFormatTime(getNowTime())})
             res = db.conn.find_one_and_update(
                 {'id': response_item['id']},
                 {'$set': dict({}, **response_item)},
                 projection={
                     'id': True,
                     'title': True,
                     '_id': False
                 },
                 upsert=True,
                 return_document=ReturnDocument.AFTER)
             logging.info(msg='Synced keyword [%s] OK!' % response_item['title'])
     except Exception as e:
         logging.error('Failed to sync keywords: %s; data=%s' % (e, data))
     pass
Example #15
class ProxyIpPoolModel(MongoDB):
    class Meta:
        db_table = 'pub_proxy_ip'

    fields = dict(
        status=tableField(default=1),
        proxy_ip=tableField(),
        proxy_port=tableField(),
        agent_type=tableField(default=1),
        country=tableField(),
        regoin=tableField(),
        city=tableField(),
        created_at=tableField(),
        priority=tableField(default=100),
        # Note: this default is evaluated once at import time, not per insert.
        sync_at=tableField(default=getFormatTime(getNowTime())),
    )

    pass
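A hedged sketch of checking a proxy out of this pool, assuming .conn is a pymongo collection; the selection policy (lowest weight first, atomically bumping weight and use_count, which the sync code in Example #4 initializes) is an illustrative assumption, not part of the original code:

import pymongo
from pymongo import ReturnDocument

proxy_db = ProxyIpPoolModel()
# Pick an active proxy and bump its usage counters in one atomic operation.
proxy = proxy_db.conn.find_one_and_update(
    filter={'status': 1},
    update={'$inc': {'use_count': 1, 'weight': 1}},
    sort=[('weight', pymongo.ASCENDING), ('priority', pymongo.ASCENDING)],
    return_document=ReturnDocument.AFTER)
if proxy is not None:
    proxy_url = 'http://%s:%s' % (proxy['proxy_ip'], proxy['proxy_port'])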
Example #16
 def load_url_by_slice(self, url_slice=20):
     _urls = [
         self.url_db.conn.find_one_and_update(
             filter=dict(
                 rule_code=self.spider.rule_code,
                 job_code=self.spider.job_code,
                 status=1,
             ),
             update={
                 '$set': dict(
                     used_at=getFormatTime(getNowTime()),
                     status=2,
                 )
             },
             return_document=ReturnDocument.BEFORE,
             sort=[('sort', pymongo.ASCENDING)]) for i in range(url_slice)
     ]
     _urls = [i for i in _urls if i is not None]
     self.spider.logger.info('Spider slice fetched %s URLs' % len(_urls))
     return _urls
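A sketch of how a spider's start_requests method might consume the slice returned above, assuming standard scrapy.Request objects; the parse callback, the headers argument, and the reuse of the row _id as request_code are illustrative assumptions:

import scrapy

def start_requests(self):
    # Pull one slice of pending URLs and turn each row into a request
    for row in self.url_part.load_url_by_slice(url_slice=20):
        yield scrapy.Request(
            url=row['url'],
            headers=self.request_header,
            meta={'request_code': str(row.get('_id'))},  # assumption: row _id reused as request_code
            callback=self.parse)  # hypothetical parse callback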
Example #17
 def process_sync_prodcut(self, data, db):
     """
     Store synced product category data
     :param data:
     :param db:
     """
     try:
         for response_item in data:
             response_item.update({'sync_at': getFormatTime(getNowTime())})
             res = db.conn.find_one_and_update(
                 {'id': response_item['id']},
                 {'$set': dict({}, **response_item)},
                 projection={
                     'sku': True,
                     '_id': False
                 },
                 upsert=True,
                 return_document=ReturnDocument.AFTER)
             logging.info(msg='Synced product SKU [%s] OK!' % response_item['sku'])
     except Exception as e:
         logging.error('Failed to sync product SKUs: %s; data=%s' % (e, data))
 def process_response(self, request, response, spider):
     """
     Response handling
     :param request:
     :param response:
     :param spider:
     :return:
     """
     # todo: add handling for 404 or 500 pages here
     super(SyncDataSpiderMiddlewares,
           self).process_response(request, response, spider)
     try:
         # Check the result of the API call
         response_data = json.loads(response.body)
         if response_data.get('status') == 200:
             sync_task_db = SyncTaskModel()
             # Update the sync task record
             sync_task_db.conn.find_one_and_update(
                 filter={
                     'task_code': spider.task_code,
                     'rule_code': spider.rule_code
                 },
                 update={
                     '$set': {
                         'end_page': response_data['tp'],
                         'cur_page': response_data['page'],
                         'total_row': response_data['tr'],
                         'updated_at': getFormatTime(getNowTime())
                     }
                 })
         else:
             spider.logger.error('Post-request %s failed!' %
                                 spider.task_api)
             raise Exception('REST API request error')
     except Exception as e:
         spider.logger.error(e)
     finally:
         return response
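For reference, the middleware above assumes the sync API responds with JSON shaped roughly like this; the field meanings are inferred from how they are stored into the sync task record and are therefore assumptions:

example_response_body = {
    'status': 200,  # API-level status checked by the middleware
    'page': 3,      # current page, stored as cur_page
    'tp': 120,      # total pages, stored as end_page
    'tr': 2395,     # total rows, stored as total_row
}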
Example #19
    def load_task(self, task_code, rule_code, task_dict: dict = None):
        """
        Load the task
        :param task_dict:
        :param rule_code:
        :param task_code:
        """
        if task_code is None:
            raise Exception('A task code is required to load a task')
        if task_dict is None:
            task_dict = {}
        _new_task = self.task_db.get_new_row()

        _new_task.update(task_dict)

        # New tasks default to redo=1 so that URLs get generated
        _new_task.update(dict(
            task_code=task_code,
            rule_code=rule_code,
            status=1,
            redo=1,
            created_at=getFormatTime(getNowTime()),
        ))

        # Look up an existing task
        self.task = self.task_db.conn.find_one(filter=dict(
            task_code=task_code,
            rule_code=rule_code,
        ))

        # Create a new task if none exists
        if self.task is None:
            _task_id = self.task_db.conn.insert_one(_new_task).inserted_id
            self.task = self.task_db.conn.find_one(filter={'_id': _task_id})
            self.spider.logger.info('Spider created task %s ' % self.task.get('_id'))
            pass

        self.spider.logger.info('Spider loaded task %s ' % self.task.get('_id'))
        # todo: add a task-redo workflow
        return self.task
Example #20
    def open_spider(self, spider):
        # spider (Spider object) – the spider being opened
        # Optional hook; called when the spider is opened.

        # Refresh the task info received by the spider
        print('Load in SyncSystemDataPipeline')
        sync_task_db = SyncTaskModel()
        spider.task = sync_task_db.conn.find_one(filter={
            'task_code': spider.task_code,
            'rule_code': spider.rule_code
        })
        if spider.task is None:
            # Create a new sync task record
            task_id = sync_task_db.conn.insert(
                dict({
                    'updated_at': getFormatTime(getNowTime()),
                    'cur_page': 1,
                    'end_page': spider.request_end_page,
                    'task_code': spider.task_code,
                    'rule_code': spider.rule_code
                }))
            spider.task = sync_task_db.conn.find_one({'_id': task_id})
        pass