示例#1
0
    def get_response(self, request):
        """Send ``request`` over HTTP and wrap the reply in a ``Response``.

        GET requests are sent with headers and query params; POST requests
        additionally carry ``request.data``. Any other method raises
        ``Exception``. The status code and final URL are logged before the
        ``Response`` is returned.
        """
        verb = request.method.upper()

        # Reject unsupported verbs up front so the happy path stays flat.
        if verb not in ("GET", "POST"):
            raise Exception("ERROR : 不支持该请求方法")

        if verb == "GET":
            res = requests.get(
                request.url,
                headers=request.headers,
                params=request.params,
            )
        else:
            res = requests.post(
                request.url,
                headers=request.headers,
                params=request.params,
                data=request.data,
            )

        logger.info(u"[{}] <{}>".format(res.status_code, res.url))
        return Response(res.url, res.status_code, res.headers, res.content)
示例#2
0
 def start(self):
     """Run the engine and log the total wall-clock time in seconds."""
     began = datetime.now()
     self._start_engine()
     elapsed = datetime.now() - began
     logger.info('总时间{}'.format(elapsed.total_seconds()))
示例#3
0
 def get_request(self):
     """Take one request from ``self.queue`` without blocking.

     Returns the dequeued request. If ``queue.get`` raises, the exception
     is logged at INFO level and ``None`` is returned instead.
     """
     try:
         return self.queue.get(False)
     except Exception as err:
         logger.info(err)
     return None
示例#4
0
    def get_response(self, request):
        """Download ``request`` and return the result as a ``Response``.

        Args:
            request: object exposing ``method``, ``url``, ``headers``,
                ``cookies``, ``params`` and ``meta``; ``data`` is also read
                for POST requests.

        Returns:
            A ``Response`` built from the HTTP reply's url, body, headers
            and status code, carrying the originating request and its meta.

        Raises:
            Exception: if ``request.method`` is neither GET nor POST.
        """
        method = request.method.upper()

        if method == "GET":
            response = requests.get(
                url=request.url,
                headers=request.headers,
                cookies=request.cookies,
                params=request.params,
            )
        elif method == "POST":
            # Send through the ``requests`` library, mirroring the GET branch
            # (not through the framework request object being downloaded).
            response = requests.post(
                url=request.url,
                headers=request.headers,
                cookies=request.cookies,
                params=request.params,
                data=request.data,
            )
        else:
            raise Exception('框架不支持的请求类型 {}'.format(request.method))

        logger.info("下载器成功获取<{}>对应的响应".format(request.url))

        # Build the framework-level response object from the HTTP reply.
        res = Response(
            url=response.url,
            body=response.content,
            headers=response.headers,
            code=response.status_code,
            request=request,
            meta=request.meta,
        )
        return res
示例#5
0
 def _filter_request(self, request):
     """Deduplicate ``request`` by fingerprint.

     Returns True after recording the fingerprint when it has not been
     seen before. For a duplicate, increments ``total_repeat_nums``, logs
     the request and returns False.
     """
     fingerprint = self._gen_fp(request)
     if fingerprint in self.fp_set:
         self.total_repeat_nums += 1
         logger.info("发现重复的请求:<{} {}>".format(request.method, request.url))
         return False
     self.fp_set.add(fingerprint)
     return True
示例#6
0
 def _filter_request(self, request):
     """Return True for a first-seen request, False for a repeat.

     A first-seen fingerprint is added to ``self.fp_set``. A repeat bumps
     ``self.total_repeat_nums`` and is logged.
     """
     digest = self._gen_fp(request)
     already_seen = digest in self.fp_set
     if not already_seen:
         self.fp_set.add(digest)
         return True

     # Duplicate: count it and report it, then tell the caller to drop it.
     self.total_repeat_nums += 1
     logger.info("发现重复的请求:<{} {}>".format(request.method, request.url))
     return False
 def _filter_request(self, request):
     """Fingerprint-based duplicate check.

     Returns True (the request may be queued) when the fingerprint is new,
     storing it in ``self.fp_set``. Otherwise logs the URL, increments
     ``self.repeat_request_nums`` and returns False.
     """
     fingerprint = self._gen_fp(request)
     if fingerprint in self.fp_set:
         logger.info('发现重复的请求:<{}>'.format(request.url))
         self.repeat_request_nums += 1
         return False
     self.fp_set.add(fingerprint)
     return True
示例#8
0
    def add_request(self, request):
        """Put ``request`` on the queue, deduplicating unless opted out.

        When ``request.filter`` is truthy the request is queued only if
        ``_filter_request`` accepts it. Otherwise the fingerprint is still
        attached to the request, and it is queued unconditionally and logged.
        """
        if request.filter:
            if self._filter_request(request):
                self.queue.put(request)
            return

        # Dedup disabled for this request: enqueue it regardless.
        request.fp = self._gen_fp(request)
        self.queue.put(request)
        logger.info("添加不去重的请求<{} {}>".format(request.method, request.url))
示例#9
0
 def _filter_request(self, fp, request):
     """Tell whether ``fp`` is already known to ``self._filter_set``.

     Returns True (and logs the request) for a duplicate, False otherwise.
     """
     if not self._filter_set.is_filter(fp):
         return False

     logger.info(u"发现重复请求 : [{}] <{}>".format(request.method, request.url))
     return True
示例#10
0
    def start(self):
        """Entry point: run the engine and log timing and request totals."""
        began = datetime.now()
        logger.info("开始运行时间:%s" % began)

        self._start_engine()

        finished = datetime.now()
        logger.info("结束运行时间:%s" % finished)

        elapsed_seconds = (finished - began).total_seconds()
        logger.info("耗时:%.2f" % elapsed_seconds)

        # Totals accumulated on the engine during the run.
        logger.info("总的请求数量:{}".format(self.total_request_nums))
        logger.info("总的响应数量:{}".format(self.total_response_nums))
示例#11
0
 def add_request(self, request):
     """Fingerprint ``request`` and enqueue it on ``self.q``.

     With ``request.filter`` truthy, the request is queued only when
     ``_filter_request`` accepts it. With it falsy, the fingerprint is
     still recorded in ``self.fp_container`` and the request is queued
     unconditionally and logged.
     """
     request.fp = self._gen_fp(request)

     if request.filter:
         if self._filter_request(request):
             self.q.put(request)
         return

     # Dedup disabled: record the fingerprint anyway, then enqueue.
     self.fp_container.add_fp(request.fp)
     self.q.put(request)
     logger.info("添加不去重的请求<{} {}>".format(request.method, request.url))
示例#12
0
    def _filter_request(self, request):
        """Attach a fingerprint to ``request`` and check it for duplication.

        Returns True when the fingerprint is new (it is then stored in
        ``self._filter_container``). Returns False for a duplicate, after
        incrementing ``self.request_repeat_nums`` and logging the request.
        """
        request.fp = self._gen_fp(request)

        if request.fp in self._filter_container:
            self.request_repeat_nums += 1
            logger.info("发现重复的请求:<{} {}>".format(request.method, request.url))
            return False

        self._filter_container.add(request.fp)
        return True
示例#13
0
文件: engine.py 项目: 18565372664/-
    def start(self):
        """Run the engine and log start/end times, duration and totals.

        Logs the start time, runs ``_start_engine``, then logs the end time,
        the elapsed seconds (two decimals), the scheduler's total request
        count and the engine's total response count.
        """
        start_time = datetime.now()
        logger.info("开始运行时间:%s" % start_time)

        self._start_engine()

        # Take a single end timestamp so the logged end time and the
        # reported duration refer to the same instant.
        end_time = datetime.now()
        logger.info("结束运行时间:%s" % end_time)
        logger.info("耗时:%.2f" % (end_time - start_time).total_seconds())
        logger.info("总的请求数量:{}".format(self.scheduler.total_request_number))
        logger.info("总的响应数量:{}".format(self.total_response_nums))
示例#14
0
    def start_engine(self):
        """Log the concurrency settings, run the engine, then log timing.

        After the run finishes, ``self.collector.clear()`` is called.
        """
        began = datetime.now()
        logger.info("框架启动的时间为:[{}]".format(began))
        logger.info("并发类型为{}".format(settings.ASYNC_TYPE))
        logger.info("并发数量为{}".format(settings.MAX_ASYNC_THREAD_NUMBER))

        self._start_engine()

        finished = datetime.now()
        logger.info("框架停止的时间为:[{}]".format(finished))
        run_seconds = (finished - began).total_seconds()
        logger.info("框架运行的时间为:[{}]".format(run_seconds))

        # NOTE(review): per the original note this resets the counters kept
        # in redis but leaves the fingerprint set alone -- confirm against
        # the collector implementation.
        self.collector.clear()
示例#15
0
 def _filter_request(self, request):
     """Check ``request.fp`` against the fingerprint container.

     The fingerprint is read from ``request.fp``; it is not generated here.
     Returns True after storing a new fingerprint. For a duplicate,
     increments the collector's repeat-request counter, logs the request
     and returns False.
     """
     if self.fp_container.exists(request.fp):
         self.collector.incr(self.collector.repeat_request_nums_key)
         logger.info("发现重复的请求:<{} {}>".format(request.method, request.url))
         return False

     self.fp_container.add_fp(request.fp)
     return True
示例#16
0
    def _filter_request(self, request):
        """Fingerprint ``request`` and report whether it is new.

        Returns True when the fingerprint was absent from
        ``self._filter_container`` (it is added). Otherwise increments the
        collector's repeat-request counter, logs, and returns False.
        """
        request.fp = self._gen_fp(request)

        if self._filter_container.exists(request.fp):
            self.collector.incr(self.collector.repeat_request_nums_key)
            logger.info("发现重复的请求:<{} {}>".format(request.method, request.url))
            return False

        self._filter_container.add_fp(request.fp)
        return True
示例#17
0
    def start(self):
        """Program entry point: start the whole engine and report timing.

        The start time goes to the logger; the end time and total runtime
        in seconds are printed to stdout.
        """
        began = datetime.now()
        logger.info('爬虫启动:{}'.format(began))

        self._start_engine()

        finished = datetime.now()
        print("爬虫结束:{}".format(finished))

        run_seconds = (finished - began).total_seconds()
        print('爬虫共运行:{}秒'.format(run_seconds))
示例#18
0
    def add_request(self, request):
        """Add ``request`` to the request queue unless it is a filtered repeat.

        The duplicate check runs only when ``request.dont_filter`` is falsy.
        A filtered duplicate is logged and counted under the repeat-request
        key; every queued request is counted under the request key.
        """
        # Short-circuit keeps the filter call from running for dont_filter requests.
        is_duplicate = (not request.dont_filter) and self.__filter_request(request)

        if not is_duplicate:
            self.queue.put(request)
            self.stats_collector.incr(self.stats_collector.request_nums_key)
            return

        logger.info('过滤掉了重复的请求:%s' % request.url)
        self.stats_collector.incr(
            self.stats_collector.repeat_request_nums_key)
示例#19
0
 def _filter_request(self, request):
     """Deduplicate a request object by fingerprint.

     The generated fingerprint is stored on the request as ``request.fp``.

     :param request: request object to check
     :return: True if the fingerprint is new (it is added to the
         fingerprint container); False if the request is a duplicate
     """
     request.fp = self._gen_fp(request)

     if not self._filter_container.exists(request.fp):
         self._filter_container.add_fp(request.fp)
         return True

     logger.info("发现重复的请求:<{} {}>".format(request.method, request.url))
     self.collector.incr(self.collector.repeat_request_nums_key)
     # Return an explicit bool so the duplicate branch honours the
     # documented return type instead of falling through with None.
     return False
示例#20
0
    def add_request(self, request):
        """Add ``request`` to the queue.

        Requests with a truthy ``filter`` flag are queued only when
        ``_filter_request`` accepts them. Requests with a falsy flag get a
        fingerprint attached and are queued directly, with a log entry.

        :param request: request object to enqueue
        :return: None
        """
        if request.filter:
            if self._filter_request(request):
                self.queue.put(request)
            return

        # No dedup requested: enqueue straight away.
        request.fp = self._gen_fp(request)
        self.queue.put(request)
        logger.info("添加不去重的请求<{} {}>".format(request.method, request.url))
示例#21
0
    def add_request(self, request):
        """Deduplicate ``request`` and queue it when it is not a repeat.

        Requests with a falsy ``filter`` flag skip the check entirely. For
        the rest, a fingerprint is generated; non-duplicates are logged,
        queued, counted in ``self.total_request`` and their fingerprint is
        stored in ``self._filter_set``.
        """
        if request.filter:
            fp = self._gen_fingerprint(request)
            if self._filter_request(fp, request):
                # Duplicate: nothing is queued or counted.
                return

            logger.info(u"添加请求成功: [{}] <{}>".format(request.method, request.url))
            self.queue.put(request)
            self.total_request += 1
            self._filter_set.add_fp(fp)
            return

        logger.info(u"添加请求(dont filter) 成功: [{}] <{}>".format(request.method, request.url))
        self.queue.put(request)
        self.total_request += 1
示例#22
0
    def put_request(self, request):
        """Place ``request`` on the to-crawl queue, dropping duplicates.

        When ``request.filter == False`` the request is queued without any
        check and a log entry is written. Otherwise its fingerprint is
        generated: a new fingerprint is queued and stored, a known one is
        counted as a repeat and logged.

        :param request: request object to enqueue
        :return: None
        """
        if request.filter == False:
            self.queue.put(request)
            logger.info("重复的请求<{}>被设置为不过滤".format(request.url))
            return

        fp = self._gen_fp(request)
        if self._filter_request(fp):
            self.collector.incr(self.collector.repeat_request_nums_key)
            logger.info("重复的请求<{}>已经被过滤掉了,hash值为<{}>".format(request.url, fp))
            return

        self.queue.put(request)
        self._filter_container.add_fp(fp)
示例#23
0
 def start(self):
     """Run the engine, logging start time, stop time and run duration."""
     began = datetime.now()
     logger.info("引擎启动时间{}".format(began))

     self._start_engine()

     finished = datetime.now()
     logger.info("引擎停止时间{}".format(finished))

     run_seconds = (finished - began).total_seconds()
     logger.info("爬虫运行时间{}".format(run_seconds))
示例#24
0
 def get_response(self, request):
     """Send the request described by ``request`` and wrap the reply.

     GET sends headers and query params; POST additionally sends
     ``request.data``. Any other method raises ``Exception``. The status
     code and final URL are logged before returning.

     :param request: request object to download
     :return: ``Response`` built from the reply's url, body, headers and
         status code
     """
     verb = request.method.upper()

     if verb == "GET":
         resp = requests.get(
             request.url,
             headers=request.headers,
             params=request.params,
         )
     elif verb == "POST":
         resp = requests.post(
             request.url,
             headers=request.headers,
             params=request.params,
             data=request.data,
         )
     else:
         raise Exception("不支持的请求方法:<{}>".format(request.method))

     logger.info("<{}    {}> ".format(resp.status_code, resp.url))

     return Response(
         url=resp.url,
         body=resp.content,
         headers=resp.headers,
         status_code=resp.status_code,
     )
示例#25
0
 def start(self):
     """Run the engine and log the start time, stop time and duration."""
     began = datetime.now()
     logger.info("start time: {}".format(began))

     self._start_engine()

     finished = datetime.now()
     logger.info("stop time: {}".format(finished))

     elapsed_seconds = (finished - began).total_seconds()
     logger.info("useing time: {}".format(elapsed_seconds))
示例#26
0
    def add_request(self, request):
        """Queue ``request``, deduplicating by fingerprint when asked to.

        The fingerprint is always generated first. With ``request.filter``
        falsy the request is queued without a check and logged. Otherwise a
        non-duplicate is queued and its fingerprint stored, while a
        duplicate is logged and counted as a repeat.
        """
        fp = self._create_fp(request)

        if not request.filter:
            # Dedup disabled for this request: straight into the queue.
            self.queue.put(request)
            logger.info('a repetitive request is added in queue:{}'.format(
                request.url))
            return

        if self.filter_request(fp):
            logger.info('this is a repetitive request:{}'.format(
                request.url))
            self.collector.incr(self.collector.repeat_request_nums_key)
            return

        # New fingerprint: queue the request and remember the fingerprint.
        self.queue.put(request)
        self.filter_container.add_fp(fp)
示例#27
0
    def start(self):
        """Entry point that runs the engine and logs start, end and duration."""
        began = datetime.now()
        logger.info("开始运行时间:%s" % began)

        self._start_engine()

        finished = datetime.now()
        logger.info("结束运行时间:%s" % finished)

        elapsed_seconds = (finished - began).total_seconds()
        logger.info("耗时:%.2f" % elapsed_seconds)
示例#28
0
文件: engine.py 项目: lf0711/day07
 def start(self):
     """Start the whole engine; the main logic lives in ``_start_engine``.

     Logs the start time, the end time and the total run time in seconds.
     """
     began = datetime.now()
     logger.info('开始运行时间:%s' % began)

     self._start_engine()

     finished = datetime.now()
     logger.info('运行结束时间:%s' % finished)

     elapsed_seconds = (finished - began).total_seconds()
     logger.info('耗时: %.2f' % elapsed_seconds)
示例#29
0
    def _start_engine(self):
        """Submit the engine's work to ``self.pool`` and wait for completion.

        Depending on ``ROLE``, submits the start-request producer (master or
        unset) and/or ``ASYNC_MAX_COUNT`` request-processing tasks (slave or
        unset). Then polls until the response counter equals the scheduler's
        request counter and is non-zero, clears ``self.is_running``, and
        closes and joins the pool.
        """
        if ROLE == "master" or ROLE is None:
            # Submit the task that handles the start_urls requests.
            if ASYNC_TYPE == "coroutine":
                logger.info(u"协程正在执行...")
            else:
                logger.info(u"子线程正在执行...")

            self.pool.apply_async(self._start_requests)


        if ROLE == "slave" or ROLE is None:
            # Concurrency level is ASYNC_MAX_COUNT (presumably from settings
            # -- TODO confirm where it is defined).
            for i in range(ASYNC_MAX_COUNT):
                if ASYNC_TYPE == "coroutine":
                    logger.info(u"协程正在执行...")
                else:
                    logger.info(u"子线程正在执行...")

                self.pool.apply_async(self._execute_request_response_item, callback = self._callback)

        # Wait for the scheduler's requests to be processed.
        while True:
            # Sleep briefly so the polling loop does not spin the CPU.
            time.sleep(0.01)
            # Equal request and response counters mean every request has
            # been handled; requiring a non-zero response count keeps the
            # loop from exiting before any work has been done.
            if self.total_response == self.scheduler.total_request and self.total_response != 0:
                self.is_running = False
                break

        self.pool.close() # no more tasks are submitted; NOTE(review): original note says this is a no-op for the coroutine pool -- confirm
        self.pool.join()  # block until the pool's outstanding tasks finish

        logger.info(u"主线程执行结束")
示例#30
0
 def start(self):
     """Framework entry point: run the engine, then log timing and stats.

     After logging the request, response and repeat-request counts read
     from ``self.collector``, the collector is cleared.
     """
     began = datetime.now()
     logger.info("开始运行时间:%s" % began)

     self._start_engine()

     finished = datetime.now()
     logger.info("结束运行时间:%s" % finished)

     elapsed_seconds = (finished - began).total_seconds()
     logger.info("耗时:%.2f" % elapsed_seconds)

     logger.info("总的请求数量:{}".format(self.collector.request_nums))
     logger.info("总的响应数量:{}".format(self.collector.response_nums))
     logger.info("重复请求数量:{}".format(self.collector.repeat_request_nums))

     # Reset the collector's counters once they have been reported.
     self.collector.clear()