Example #1
    def _init_start_requests(self, start_evt):
        """
        Feed the start requests into the crawler, then signal that
        crawling has begun.
        :param start_evt: threading.Event set once all start requests are queued
        :return:
        """
        logger.info("start crawling!")
        for req in self.start_requests:
            self.crawl(req)
        # give the scheduler a moment to pick the requests up
        time.sleep(1)
        start_evt.set()
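
This method looks like it is meant to run in its own thread, with the event telling the rest of the engine when seeding has finished. A minimal runnable sketch of that handshake, using a toy Spider stand-in; everything below is illustrative, not the project's real classes:

import logging
import threading
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class Spider:
    """Toy stand-in for the crawler class the method above belongs to."""

    def __init__(self):
        self.start_requests = ["http://example.com/a", "http://example.com/b"]

    def crawl(self, req):
        logger.info("queued %s", req)

    def _init_start_requests(self, start_evt):
        logger.info("start crawling!")
        for req in self.start_requests:
            self.crawl(req)
        time.sleep(1)
        start_evt.set()

start_evt = threading.Event()
t = threading.Thread(target=Spider()._init_start_requests, args=(start_evt,))
t.start()
start_evt.wait()   # blocks here until the seeding thread calls set()
print("start requests queued; downstream workers may proceed")
t.join()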
Example #2
    def process_item(self, item, spider):
        """
        Handle one item produced by the spider.
        :param item: the scraped item
        :param spider: the spider that yielded the item
        :return:
        """
        # TODO: write the item into the scheduler and hand it over
        # to the collector thread
        logger.info("Process_item:\n \t {item}".format(item=item))
Example #3
    def fetch(self, request):
        """Send ``request`` over HTTP and wrap the result in a ``Response``."""
        url = request.url
        meta = request.meta
        session = self._get_session(url)

        if request.method == 'POST':
            response = session.post(url,
                                    data=request.data,
                                    headers=request.headers,
                                    proxies=meta.get('proxy'),
                                    timeout=meta['timeout'])
        else:
            # GET requests carry no body; query parameters belong in the URL
            response = session.get(url,
                                   headers=request.headers,
                                   proxies=meta.get('proxy'),
                                   timeout=meta['timeout'])

        r = Response(response.url, response.status_code, response.headers,
                     response.content, response.text)

        logger.info("Downloaded ({status}) {request}".format(
            request=str(request), status=r.status))
        return r
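
fetch depends on framework-level Request and Response containers that are not shown here. Judging only from the attributes the method reads and writes, they might be shaped roughly like the sketch below; this is a guess, not the project's real definitions:

class Request:
    """Bare-bones request container matching the attributes fetch() reads."""

    def __init__(self, url, method='GET', data=None, headers=None, meta=None):
        self.url = url
        self.method = method
        self.data = data
        self.headers = headers or {}
        # fetch() indexes meta['timeout'] directly, so supply a default
        self.meta = meta or {'timeout': 10}

    def __str__(self):
        return "<{method} {url}>".format(method=self.method, url=self.url)

class Response:
    """Bare-bones response container matching how fetch() constructs it."""

    def __init__(self, url, status, headers, content, text):
        self.url = url
        self.status = status
        self.headers = headers
        self.content = content
        self.text = text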
Example #4
    def close_spider(self, start_evt, close_evt):
        """
        Close the spider.
        Keep polling the crawl queue; the idea is that once the queue is
        empty we set a flag (the close event) and the worker threads use
        that flag to shut themselves down.
        :param start_evt: event set once the start requests are queued
        :param close_evt: event set to tell the worker threads to stop
        :return:
        """
        # block until start_evt.set() has been called
        start_evt.wait()
        while self.running:
            time.sleep(.1)
            if len(self.scheduler) == 0:
                close_evt.set()
                self.running = False
        logger.info("close spider!")
Example #5
    def get_blog_list(self, response):
        response_dict = json.loads(response.text)
        mainodors = response_dict['mainodor']
        for mainodor in mainodors:
            # parameterized query: the driver handles quoting, which also
            # closes the SQL-injection hole in the old string formatting
            sql = "INSERT INTO raw_item_smell_rank VALUES (%s, %s, %s)"
            try:
                conn = self.get_conn()
                conn.cursor().execute(sql, (response.meta['item_id'],
                                            mainodor['uoodor'],
                                            mainodor['cnt']))
                conn.commit()
                logger.info("insert OK! %s", mainodor['uoodor'])
            except Exception as e:
                logger.error("Error: %s", str(e), exc_info=True)