Example #1
    def distribute_task(self, *args, **kws):
        """
        @summary: 分发任务 并将返回的request入库
        ---------
        @param tasks:
        ---------
        @result:
        """
        self._is_distributed_task = False

        for parser in self._parsers:
            requests = parser.start_requests(*args, **kws)
            if requests and not isinstance(requests, Iterable):
                raise Exception("%s.%s must return an iterable" %
                                (parser.name, "start_requests"))

            result_type = 1  # 1: last yield was a Request, 2: an Item; routes callables below
            for request in requests or []:
                if isinstance(request, Request):
                    request.parser_name = request.parser_name or parser.name
                    self._request_buffer.put_request(request)

                    self._is_distributed_task = True
                    result_type = 1

                elif isinstance(request, Item):
                    self._item_buffer.put_item(request)
                    result_type = 2

                elif callable(request):  # a callable result may be a function that e.g. updates the database
                    if result_type == 1:
                        self._request_buffer.put_request(request)
                    else:
                        self._item_buffer.put_item(request)

            self._request_buffer.flush()
            self._item_buffer.flush()

        if self._is_distributed_task:  # only announce spider start when there are tasks
            # begin
            self.spider_begin()
            self.record_spider_state(
                spider_type=1,
                state=0,
                batch_date=tools.get_current_date(),
                spider_start_time=tools.get_current_date(),
                batch_interval=self._batch_interval,
            )

            # reset the "no task" notice flag
            self._is_show_not_task = False

        elif not self._is_show_not_task:  # no tasks, and the no-task notice has not been sent yet
            # send the no-task message
            msg = "<%s> start_requests added no tasks" % self._spider_name
            log.info(msg)

            # self.send_msg(msg)

            self._is_show_not_task = True
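
The `result_type` toggle above routes bare callables into whichever buffer the previous yield used, so a database-update function yielded right after an Item is flushed together with that Item, and one yielded after a Request travels with the request batch. A hypothetical parser illustrating the intended yield order (the URL, the Request constructor arguments, and the lambda are illustrative only):

    # Hypothetical parser method showing the yield order the dispatcher expects.
    def start_requests(self):
        yield Request("https://example.com/page/1")  # dispatcher sets result_type = 1
        # a callable yielded now follows the Request into the request buffer
        yield lambda: log.info("requests enqueued")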
Example #2
    def record_batch(self):
        """
        @summary: 记录批次信息(初始化)
        ---------
        ---------
        @result:
        """

        # 查询总任务数
        sql = "select count(1) from %s%s" % (
            self._task_table,
            self._task_condition_prefix_where,
        )
        total_task_count = self._mysqldb.find(sql)[0][0]

        batch_date = tools.get_current_date(self._date_format)

        sql = (
            "insert into %s (batch_date, done_count, total_count, `interval`, interval_unit, create_time) values ('%s', %s, %s, %s, '%s', CURRENT_TIME)"
            % (
                self._batch_record_table,
                batch_date,
                0,
                total_task_count,
                self._batch_interval
                if self._batch_interval >= 1
                else self._batch_interval * 24,
                "day" if self._batch_interval >= 1 else "hour",
            )
        )

        affect_count = self._mysqldb.add(sql)  # None / 0 / 1 (1 means success)
        if affect_count:
            # reset the batch date
            self._batch_date_cache = batch_date
            # refresh the os.environ.get('batch_date') read by self.batch_date, otherwise the date stays on the previous batch
            os.environ["batch_date"] = self._batch_date_cache

            # spider start
            self.spider_begin()
            self.record_spider_state(
                spider_type=2,
                state=0,
                batch_date=batch_date,
                spider_start_time=tools.get_current_date(),
                batch_interval=self._batch_interval,
            )
        else:
            log.error("failed to insert new batch record")

        return affect_count
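
Example #2 builds its insert statement with `%` string formatting. If the values ever came from untrusted input, bound parameters would be safer; the framework's `self._mysqldb.add` is only shown taking a raw SQL string, so the sketch below uses PyMySQL directly as an assumed stand-in:

    import pymysql  # assumed driver; any DB-API connection works similarly

    # Minimal sketch with bound parameters. The table name still has to be
    # interpolated, since identifiers cannot be bound as parameters.
    def insert_batch_record(conn, table, batch_date, total_count, batch_interval):
        interval = batch_interval if batch_interval >= 1 else batch_interval * 24
        unit = "day" if batch_interval >= 1 else "hour"
        sql = (
            "insert into {} (batch_date, done_count, total_count, `interval`, "
            "interval_unit, create_time) values (%s, %s, %s, %s, %s, CURRENT_TIME)"
        ).format(table)
        with conn.cursor() as cursor:
            affected = cursor.execute(sql, (batch_date, 0, total_count, interval, unit))
        conn.commit()
        return affected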
Example #3
    def run(self):
        if not self.is_reach_next_spider_time():
            return

        self._start()

        while True:
            if self.all_thread_is_done():
                if not self._is_notify_end:
                    self.spider_end()  # finished one round
                    self.record_spider_state(
                        spider_type=1,
                        state=1,
                        spider_end_time=tools.get_current_date(),
                        batch_interval=self._batch_interval,
                    )

                    self._is_notify_end = True

                if self._auto_stop_when_spider_done:
                    self._stop_all_thread()
                    break

            else:
                self._is_notify_end = False

            self.check_task_status()

            tools.delay_time(1)  # check spider status once per second
Example #4
    def __add_task(self):
        # run each parser's start_requests
        self.spider_begin()  # for spiders that do not stop automatically, this runs only once
        self.record_spider_state(
            spider_type=1,
            state=0,
            batch_date=tools.get_current_date(),
            spider_start_time=tools.get_current_date(),
            batch_interval=self._batch_interval,
        )

        # check whether the task pool still holds tasks; if so, resume crawling
        todo_task_count = self._collector.get_requests_count()
        if todo_task_count:
            log.info("found %s pending tasks; not dispatching new ones, resuming from the last abnormal exit" % todo_task_count)
        else:
            for parser in self._parsers:
                results = parser.start_requests(*self._parser_args,
                                                **self._parser_kwargs)
                # add requests to the request queue, which persists them centrally
                if results and not isinstance(results, Iterable):
                    raise Exception("%s.%s must return an iterable" %
                                    (parser.name, "start_requests"))

                result_type = 1  # 1: last yield was a Request, 2: an Item; routes callables below
                for result in results or []:
                    if isinstance(result, Request):
                        result.parser_name = result.parser_name or parser.name
                        self._request_buffer.put_request(result)
                        result_type = 1

                    elif isinstance(result, Item):
                        self._item_buffer.put_item(result)
                        result_type = 2

                    elif callable(result):  # a callable result may be a function that e.g. updates the database
                        if result_type == 1:
                            self._request_buffer.put_request(result)
                        else:
                            self._item_buffer.put_item(result)
                    else:
                        raise TypeError(
                            "start_requests yielded a result of the wrong type; expected Request, Item, or a callable, but got: {}"
                            .format(type(result)))

                self._request_buffer.flush()
                self._item_buffer.flush()
Example #5
    def detail_content(self, response):
        request = response.request
        data = request.meta["data"]
        _response = response.response
        s = Selector(response=_response)
        data["content"] = s.xpath('.//div[@style="width: 1105px;margin:0 auto"]').extract_first()
        ctime = s.xpath('.//span[@class="datetime"]/text()').extract_first()
        data["ctime"] = parser.parse(ctime).strftime("%Y-%m-%d %H:%M:%S")
        data["gtime"] = tools.get_current_date()
        if data:
            # self.store_data(data, table_name="test", mysql=True, )
            print("stored to mysql")  # note: the store_data call above is commented out here
        print(data)
        yield data
Example #6
    def detail_content(self, response):
        request = response.request
        data = request.meta["data"]
        _response = response.response
        soup = BeautifulSoup(_response.text, "html.parser")
        data["content"] = soup.select_one(
            'div[style="width: 1105px;margin:0 auto"]').decode()
        ctime = soup.select_one('span.datetime').text
        data["ctime"] = parser.parse(ctime).strftime("%Y-%m-%d %H:%M:%S")
        data["gtime"] = tools.get_current_date()
        if data:
            self.store_data(data, table_name="test", mysql=True, oss=False)
            print("stored to mysql")
        print(data)
        yield data
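
Both variants normalize the scraped timestamp with `parser.parse`, which from the flexible call style is presumably `dateutil.parser` (an assumption; the import is not shown in these snippets). A standalone check:

    # Assumes `parser` is dateutil.parser, as the snippets' usage suggests.
    from dateutil import parser

    ctime = parser.parse("2021-03-05 09:30").strftime("%Y-%m-%d %H:%M:%S")
    print(ctime)  # 2021-03-05 09:30:00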
Example #7
File: ccgp_list.py  Project: sigai/ccgp
    def parse(self, response: Response):
        request = response.request
        task_obj = request.meta["task"]
        url = task_obj.get("url")
        _response = response.response
        try:
            if _response:
                # todo: parsing code
                soup = BeautifulSoup(_response.text, "html.parser")
                rows = soup.select('ul.xinxi_ul li')
                dataList = []
                for item in rows:
                    data = {}
                    data['url'] = urljoin(_response.url,
                                          item.select_one("a").attrs["href"])
                    data['title'] = item.select_one("a").text
                    data['gtime'] = tools.get_current_date()
                    data['ctime'] = parser.parse(item.select_one(
                        "span").text).strftime("%Y-%m-%d %H:%M:%S")
                    data["batch_date"] = self.local_batch_data
                    dataList.append(data)
                    print(data)
                if len(dataList) > 0:
                    if not self.debug:
                        self.store_data(dataList,
                                        table_name=self.task_data_table,
                                        oss=False)
                        print("数据表存储成功")
                if not self.debug:
                    # 更新完成标志
                    self.set_task_state(setting.TASK_FINISH,
                                        condition={"url": url})
                logger.debug("任务完成 {}".format(task_obj))

            else:
                if _response is not None:
                    if _response.status_code == 404:
                        url = _response.url
                        # todo: parsing code
                        # update the completion flag
                        self.set_task_state(state=-1, condition={"url": url})
                        return
                raise Exception("abnormal response for task: {}".format(url))
        except Exception as e:
            logger.exception(e)
            self.put_task(task_obj)
        return
Example #8
    def parse(self, response: Response):
        request = response.request
        task_obj = request.meta["task"]
        url = task_obj.get("url")
        _response = response.response

        try:
            if _response:
                # todo: parsing code
                soup = BeautifulSoup(_response.text, "html.parser")
                data = {}
                data['url'] = _response.url
                data['title'] = soup.select_one(
                    'span[style="font-size: 20px;font-weight: bold"]').text
                data['ctime'] = parser.parse(
                    soup.select_one("span.datetime").text).strftime(
                        "%Y-%m-%d %H:%M:%S")
                data["gtime"] = tools.get_current_date()
                data['content'] = soup.select_one(
                    'div[style="width: 1105px;margin:0 auto"]').decode()
                data["batch_date"] = self.local_batch_data
                print(data)

                if not self.debug:
                    self.store_data([data],
                                    table_name=self.task_data_table,
                                    oss=False)
                    print("数据表存储成功")
                if not self.debug:
                    # 更新完成标志
                    self.set_task_state(setting.TASK_FINISH,
                                        condition={"url": url})
                logger.debug("任务完成 {}".format(task_obj))

            else:
                if _response is not None:
                    if _response.status_code == 404:
                        url = _response.url
                        # todo: parsing code
                        # update the completion flag
                        self.set_task_state(state=-1, condition={"url": url})
                        return
                raise Exception("abnormal response for task: {}".format(url))
        except Exception as e:
            logger.exception(e)
            self.put_task(task_obj)
        return
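
In both parse methods, any exception re-queues the task via `put_task`, so a permanently broken page would cycle forever. A hedged sketch of one way to cap retries, assuming the task dict tolerates an extra bookkeeping key (`_retries` and the limit of 3 are illustrative, not framework fields):

    # Drop-in replacement for the except branch above (illustrative only).
    # `_retries` is a hypothetical key added to the task dict.
    except Exception as e:
        logger.exception(e)
        task_obj["_retries"] = task_obj.get("_retries", 0) + 1
        if task_obj["_retries"] <= 3:
            self.put_task(task_obj)  # re-queue for another attempt
        else:
            logger.error("dropping task after 3 failed attempts: {}".format(task_obj))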
Example #9
    def run(self):
        """
        @summary: 重写run方法 检查mysql中的任务是否做完, 做完停止
        ---------
        ---------
        @result:
        """
        try:
            self.create_batch_record_table()

            if not self._parsers:  # 不是add_parser 模式
                self._parsers.append(self)

            self._start()

            while True:
                if (
                        self.task_is_done() and self.all_thread_is_done()
                ):  # all redis tasks and all MySQL tasks are done (all_thread_is_done guards every thread, so task state is not updated before the work truly finishes, which would end the program early)
                    if not self._is_notify_end:
                        self.spider_end()
                        self.record_spider_state(
                            spider_type=2,
                            state=1,
                            batch_date=self._batch_date_cache,
                            spider_end_time=tools.get_current_date(),
                            batch_interval=self._batch_interval,
                        )

                        self._is_notify_end = True

                    if self._auto_stop_when_spider_done:
                        self._stop_all_thread()
                        break
                else:
                    self._is_notify_end = False

                self.check_task_status()
                tools.delay_time(10)  # check spider status every 10 seconds

        except Exception as e:
            msg = "<%s> main thread exception, spider exiting. exception: %s" % (self._batch_name, e)
            log.error(msg)
            self.send_msg(msg)

            os._exit(137)  # wait status reads as 35072, so the spider manager can spot it and restart
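
The comment on the last line refers to how a parent process sees the child's exit: `os._exit(137)` sets exit code 137, and the 16-bit wait status returned by `os.wait()` carries it in the high byte, i.e. 137 << 8 == 35072. A quick Unix-only check:

    import os

    status = 137 << 8              # wait status as a parent would see it
    print(status)                  # 35072
    print(os.WEXITSTATUS(status))  # 137 (Unix-only helper)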
Example #10
    def _start(self):
        if self._auto_start_requests:
            # lock the task-dispatch section so multiple processes cannot add duplicate tasks
            with RedisLock(
                key=self._spider_name,
                timeout=3600,
                wait_timeout=60,
                redis_uri="redis://:{password}@{host_post}/{db}".format(
                    password=setting.REDISDB_USER_PASS,
                    host_post=setting.REDISDB_IP_PORTS,
                    db=setting.REDISDB_DB,
                ),
            ) as lock:
                if lock.locked:

                    # run each parser's start_requests
                    self.spider_begin()  # for spiders that do not stop automatically, this runs only once
                    self.record_spider_state(
                        spider_type=1,
                        state=0,
                        batch_date=tools.get_current_date(),
                        spider_start_time=tools.get_current_date(),
                        batch_interval=self._batch_interval,
                    )

                    # check whether the task pool still holds tasks; if so, resume crawling
                    todo_task_count = self._collector.get_requests_count()
                    if todo_task_count:
                        log.info(
                            "found %s pending tasks; not dispatching new ones, resuming from the last abnormal exit" % todo_task_count
                        )
                    else:
                        for parser in self._parsers:
                            results = parser.start_requests(
                                *self._parser_args, **self._parser_kwargs
                            )
                            # add requests to the request queue, which persists them centrally
                            if results and not isinstance(results, Iterable):
                                raise Exception(
                                    "%s.%s must return an iterable" % (parser.name, "start_requests")
                                )

                            result_type = 1  # 1: last yield was a Request, 2: an Item; routes callables below
                            for result in results or []:
                                if isinstance(result, Request):
                                    result.parser_name = (
                                        result.parser_name or parser.name
                                    )
                                    self._request_buffer.put_request(result)
                                    result_type = 1

                                elif isinstance(result, Item):
                                    self._item_buffer.put_item(result)
                                    result_type = 2

                                elif callable(result):  # a callable result may be a function that e.g. updates the database
                                    if result_type == 1:
                                        self._request_buffer.put_request(result)
                                    else:
                                        self._item_buffer.put_item(result)
                                else:
                                    raise TypeError(
                                        "start_requests yielded a result of the wrong type; expected Request, Item, or a callable, but got: {}".format(
                                            type(result)
                                        )
                                    )

                            self._request_buffer.flush()
                            self._item_buffer.flush()

        # start the collector
        self._collector.start()

        # start the parser controls
        for i in range(self._parser_count):
            parser_control = self._parser_control_obj(
                self._collector,
                self._table_folder,
                self._request_buffer,
                self._item_buffer,
            )

            for parser in self._parsers:
                parser_control.add_parser(parser)

            parser_control.start()
            self._parser_controls.append(parser_control)

        # start the request_buffer
        self._request_buffer.start()

        # start the item_buffer
        self._item_buffer.start()
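
`RedisLock` here is framework-provided; for reference, a minimal sketch of the same idea with plain redis-py (acquire via SET NX EX, poll until `wait_timeout`, release only a lock you own) could look like the class below. The name and semantics are assumptions modeled on the usage above, not the framework's implementation:

    import time
    import uuid

    import redis

    class SimpleRedisLock:
        """Minimal sketch of a RedisLock-style context manager (assumed semantics)."""

        def __init__(self, key, timeout=3600, wait_timeout=60, redis_uri=None):
            self._redis = redis.from_url(redis_uri)
            self._key = "lock:%s" % key
            self._token = str(uuid.uuid4())  # identifies this holder
            self._timeout = timeout          # TTL so a crashed holder cannot block forever
            self._wait_timeout = wait_timeout
            self.locked = False

        def __enter__(self):
            deadline = time.time() + self._wait_timeout
            while time.time() < deadline:
                # SET key value NX EX: succeeds only if the key does not exist yet
                if self._redis.set(self._key, self._token, nx=True, ex=self._timeout):
                    self.locked = True
                    break
                time.sleep(0.5)
            return self

        def __exit__(self, *exc):
            # release only if we still own the lock (it may have expired)
            if self.locked and self._redis.get(self._key) == self._token.encode():
                self._redis.delete(self._key)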
Example #11
def deal_file_info(file):
    file = file.replace("{DATE}", tools.get_current_date())
    file = file.replace("{USER}", os.getenv("USER"))  # may be None (e.g. on Windows), which makes replace() raise

    return file
Example #12
def deal_file_info(file):
    file = file.replace("{DATE}", tools.get_current_date())
    file = file.replace("{USER}", getpass.getuser())

    return file
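
Both variants substitute the `{DATE}` and `{USER}` placeholders in a file template; the `getpass.getuser()` version is the more portable one, since `os.getenv("USER")` can be unset. A small usage sketch with `tools.get_current_date` stubbed (its real implementation is not shown; the stub assumes it returns a formatted current-date string):

    import datetime
    import getpass

    def get_current_date(fmt="%Y-%m-%d %H:%M:%S"):
        # stand-in for tools.get_current_date (assumed behavior)
        return datetime.datetime.now().strftime(fmt)

    template = "# created by {USER} on {DATE}\n"
    header = template.replace("{DATE}", get_current_date())
    header = header.replace("{USER}", getpass.getuser())
    print(header)  # e.g. "# created by alice on 2024-05-01 12:00:00"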