예제 #1
0
 def parser(self, request, response):
     # 解析列表
     article_list_url = response.xpath(
         '//div[@class="kjxw_tit"]/a/@href').extract()
     for url in article_list_url:
         log.debug("下发文章任务 url = {}".format(url))
         yield spider.Request(url, callback=self.parser_artile)
예제 #2
0
    def start_monitor_task(self):
        """
        @summary: 监控任务状态
        ---------
        ---------
        @result:
        """
        if not self._parsers:  # 不是多模版模式, 将自己注入到parsers,自己为模版
            self._is_more_parsers = False
            self._parsers.append(self)

        elif len(self._parsers) <= 1:
            self._is_more_parsers = False

        if self._task:
            self.distribute_task([self._task])
        else:
            tasks = self.get_todo_task_from_mysql()
            if not tasks:
                raise Exception("未获取到任务 请检查 task_id: {} 是否存在".format(
                    self._task_id))
            self.distribute_task(tasks)

        os.environ.setdefault("batch_date", "1970-00-00")
        log.debug("下发任务完毕")
예제 #3
0
    def __add_item_to_db(self, items, update_items, requests, callbacks,
                         items_fingerprints):
        export_success = False
        self._is_adding_to_db = True

        # 去重
        if setting.ITEM_FILTER_ENABLE:
            items, items_fingerprints = self.__dedup_items(
                items, items_fingerprints)

        # 分捡
        items_dict = self.__pick_items(items)
        update_items_dict = self.__pick_items(update_items,
                                              is_update_item=True)

        # item批量入库
        while items_dict:
            tab_item, datas = items_dict.popitem()

            log.debug("""
                -------------- item 批量入库 --------------
                表名: %s
                datas: %s
                    """ % (tab_item, tools.dumps_json(datas, indent=16)))

            export_success = self.__export_to_db(tab_item, datas)

        # 执行批量update
        while update_items_dict:
            tab_item, datas = update_items_dict.popitem()
            log.debug("""
                -------------- item 批量更新 --------------
                表名: %s
                datas: %s
                    """ % (tab_item, tools.dumps_json(datas, indent=16)))

            update_keys = self._item_update_keys.get(tab_item)
            export_success = self.__export_to_db(tab_item,
                                                 datas,
                                                 is_update=True,
                                                 update_keys=update_keys)

        # 执行回调
        while callbacks:
            try:
                callback = callbacks.pop(0)
                callback()
            except Exception as e:
                log.exception(e)

        # 删除做过的request
        if requests:
            self._db.zrem(self._table_request, requests)

        # 去重入库
        if export_success and setting.ITEM_FILTER_ENABLE:
            if items_fingerprints:
                self.__class__.dedup.add(items_fingerprints, skip_check=True)

        self._is_adding_to_db = False
예제 #4
0
    def __add_request_to_db(self):
        request_list = []
        prioritys = []
        callbacks = []

        while self._requests_deque:
            request = self._requests_deque.popleft()
            self._is_adding_to_db = True

            if callable(request):
                # 函数
                # 注意:应该考虑闭包情况。闭包情况可写成
                # def test(xxx = xxx):
                #     # TODO 业务逻辑 使用 xxx
                # 这么写不会导致xxx为循环结束后的最后一个值
                callbacks.append(request)
                continue

            priority = request.priority

            # 如果需要去重并且库中已重复 则continue
            if (request.filter_repeat and setting.REQUEST_FILTER_ENABLE
                    and not self.__class__.dedup.add(request.fingerprint)):
                log.debug("request已存在  url = %s" % request.url)
                continue
            else:
                request_list.append(str(request.to_dict))
                prioritys.append(priority)

            if len(request_list) > MAX_URL_COUNT:
                self._db.zadd(self._table_request, request_list, prioritys)
                request_list = []
                prioritys = []

        # 入库
        if request_list:
            self._db.zadd(self._table_request, request_list, prioritys)

        # 执行回调
        for callback in callbacks:
            try:
                callback()
            except Exception as e:
                log.exception(e)

        # 删除已做任务
        if self._del_requests_deque:
            request_done_list = []
            while self._del_requests_deque:
                request_done_list.append(self._del_requests_deque.popleft())

            # 去掉request_list中的requests, 否则可能会将刚添加的request删除
            request_done_list = list(
                set(request_done_list) - set(request_list))

            if request_done_list:
                self._db.zrem(self._table_request, request_done_list)

        self._is_adding_to_db = False
예제 #5
0
    def parser_artile(self, request, response):
        log.debug("解析文章 url = {}".format(request.url))
        article_extractor = ArticleExtractor(request.url, response.text)

        item = Item()
        item.url = request.url
        item.content = article_extractor.get_content()
        item.title = article_extractor.get_title()
        item.release_time = article_extractor.get_release_time()
        item.author = article_extractor.get_author()

        log.debug(item)
예제 #6
0
파일: mysqldb.py 프로젝트: weilan157/spider
    def __init__(self,
                 ip=None,
                 port=None,
                 db=None,
                 user_name=None,
                 user_pass=None,
                 **kwargs):
        # 可能会改setting中的值,所以此处不能直接赋值为默认值,需要后加载赋值
        if not ip:
            ip = setting.MYSQL_IP
        if not port:
            port = setting.MYSQL_PORT
        if not db:
            db = setting.MYSQL_DB
        if not user_name:
            user_name = setting.MYSQL_USER_NAME
        if not user_pass:
            user_pass = setting.MYSQL_USER_PASS

        try:

            self.connect_pool = PooledDB(
                creator=pymysql,
                mincached=1,
                maxcached=100,
                maxconnections=100,
                blocking=True,
                ping=7,
                host=ip,
                port=port,
                user=user_name,
                passwd=user_pass,
                db=db,
                charset="utf8mb4",
                cursorclass=cursors.SSCursor,
            )  # cursorclass 使用服务的游标,默认的在多线程下大批量插入数据会使内存递增

        except Exception as e:
            log.error("""
            连接数据失败:
            ip: {}
            port: {}
            db: {}
            user_name: {}
            user_pass: {}
            exception: {}
            """.format(ip, port, db, user_name, user_pass, e))
        else:
            log.debug("连接到mysql数据库 %s : %s" % (ip, db))
예제 #7
0
파일: mysqldb.py 프로젝트: weilan157/spider
    def set_unique_key(self, table, key):
        try:
            sql = "alter table %s add unique (%s)" % (table, key)

            conn, cursor = self.get_connection()
            cursor.execute(sql)
            conn.commit()

        except Exception as e:
            log.error(table + " " + str(e) + " key = " + key)
            return False
        else:
            log.debug("%s表创建唯一索引成功 索引为 %s" % (table, key))
            return True
        finally:
            self.close_connection(conn, cursor)
예제 #8
0
    def run(self):
        self.distribute_task()

        for i in range(self._parser_count):
            parser_control = SingleSpiderParserControl(self._memory_db)
            parser_control.add_parser(self)
            parser_control.start()
            self._parser_controls.append(parser_control)

        while True:
            if self.all_thread_is_done():
                # 停止 parser_controls
                for parser_control in self._parser_controls:
                    parser_control.stop()

                log.debug("无任务,爬虫结束")
                break
    def reput_failed_requests_to_requests(self):
        log.debug("正在重置失败的requests...")
        total_count = 0
        while True:
            failed_requests = self.get_failed_requests()
            if not failed_requests:
                break

            for request in failed_requests:
                request["retry_times"] = 0
                request_obj = Request.from_dict(request)
                self._request_buffer.put_request(request_obj)

                total_count += 1

        self._request_buffer.flush()

        log.debug("重置%s条失败requests为待抓取requests" % total_count)
예제 #10
0
    def update_task_state(self, task_id, state=1, **kwargs):
        """
        @summary: 更新任务表中任务状态,做完每个任务时代码逻辑中要主动调用。可能会重写
        调用方法为 yield lambda : self.update_task_state(task_id, state)
        ---------
        @param task_id:
        @param state:
        ---------
        @result:
        """

        kwargs["id"] = task_id
        kwargs[self._task_state] = state

        sql = tools.make_update_sql(
            self._task_table,
            kwargs,
            condition="id = {task_id}".format(task_id=task_id))

        if self._mysqldb.update(sql):
            log.debug("置任务%s状态成功" % task_id)
        else:
            log.error("置任务%s状态失败  sql=%s" % (task_id, sql))
예제 #11
0
    def __init__(self,
                 ip_ports=None,
                 db=None,
                 user_pass=None,
                 url=None,
                 decode_responses=True,
                 service_name=None,
                 max_connections=32,
                 **kwargs):
        """
        redis的封装
        Args:
            ip_ports: ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"]
            db:
            user_pass:
            url:
            decode_responses:
            service_name: 适用于redis哨兵模式
        """

        # 可能会改setting中的值,所以此处不能直接赋值为默认值,需要后加载赋值
        if ip_ports is None:
            ip_ports = setting.REDISDB_IP_PORTS
        if db is None:
            db = setting.REDISDB_DB
        if user_pass is None:
            user_pass = setting.REDISDB_USER_PASS
        if service_name is None:
            service_name = setting.REDISDB_SERVICE_NAME

        self._is_redis_cluster = False

        try:
            if not url:
                ip_ports = (ip_ports if isinstance(ip_ports, list) else
                            ip_ports.split(","))
                if len(ip_ports) > 1:
                    startup_nodes = []
                    for ip_port in ip_ports:
                        ip, port = ip_port.split(":")
                        startup_nodes.append({"host": ip, "port": port})

                    if service_name:
                        log.debug("使用redis哨兵模式")
                        hosts = [(node["host"], node["port"])
                                 for node in startup_nodes]
                        sentinel = Sentinel(hosts, socket_timeout=3, **kwargs)
                        self._redis = sentinel.master_for(
                            service_name,
                            password=user_pass,
                            db=db,
                            redis_class=redis.StrictRedis,
                            decode_responses=decode_responses,
                            max_connections=max_connections,
                            **kwargs)

                    else:
                        log.debug("使用redis集群模式")
                        self._redis = StrictRedisCluster(
                            startup_nodes=startup_nodes,
                            decode_responses=decode_responses,
                            password=user_pass,
                            max_connections=max_connections,
                            **kwargs)

                    self._is_redis_cluster = True
                else:
                    ip, port = ip_ports[0].split(":")
                    self._redis = redis.StrictRedis(
                        host=ip,
                        port=port,
                        db=db,
                        password=user_pass,
                        decode_responses=decode_responses,
                        max_connections=max_connections,
                        **kwargs)
            else:
                self._redis = redis.StrictRedis.from_url(
                    url, decode_responses=decode_responses)

        except Exception as e:
            raise
        else:
            if not url:
                log.debug("连接到redis数据库 %s db%s" % (ip_ports, db))
            else:
                log.debug("连接到redis数据库 %s" % (url))

        self._ip_ports = ip_ports
        self._db = db
        self._user_pass = user_pass
        self._url = url
예제 #12
0
파일: request.py 프로젝트: weilan157/spider
    def get_response(self, save_cached=False):
        """
        获取带有selector功能的response
        @param save_cached: 保存缓存 方便调试时不用每次都重新下载
        @return:
        """
        # 设置超时默认时间
        self.requests_kwargs.setdefault("timeout", 22)  # connect=22 read=22

        # 设置stream
        self.requests_kwargs.setdefault(
            "stream", True
        )  # 默认情况下,当你进行网络请求后,响应体会立即被下载。你可以通过 stream 参数覆盖这个行为,推迟下载响应体直到访问 Response.content 属性。此时仅有响应头被下载下来了。缺点: stream 设为 True,Requests 无法将连接释放回连接池,除非你 消耗了所有的数据,或者调用了 Response.close。 这样会带来连接效率低下的问题。

        # 关闭证书验证
        self.requests_kwargs.setdefault("verify", False)

        # 设置请求方法
        method = self.__dict__.get("method")
        if not method:
            if "data" in self.requests_kwargs:
                method = "POST"
            else:
                method = "GET"

        # 随机user—agent
        headers = self.requests_kwargs.get("headers", {})
        if "user-agent" not in headers and "User-Agent" not in headers:
            if self.random_user_agent and setting.RANDOM_HEADERS:
                headers.update(
                    {"User-Agent": self.__class__.user_agent_pool.get()})
                self.requests_kwargs.update(headers=headers)
        else:
            self.requests_kwargs.setdefault(
                "headers",
                {
                    "User-Agent":
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
                },
            )

        # 代理
        proxies = self.requests_kwargs.get("proxies", -1)
        if proxies == -1 and setting.PROXY_ENABLE and self.__class__.proxies_pool:
            while True:
                proxies = self.__class__.proxies_pool.get()
                if proxies:
                    break
                else:
                    log.debug("暂无可用代理 ...")
            proxies and self.requests_kwargs.update(proxies=proxies)

        log.debug("""
                -------------- %s.%s request for ----------------
                url  = %s
                method = %s
                body = %s
                """ % (
            self.parser_name,
            (self.callback and callable(self.callback) and getattr(
                self.callback, "__name__") or self.callback) or "parser",
            self.url,
            method,
            self.requests_kwargs,
        ))

        # def hooks(response, *args, **kwargs):
        #     print(response.url)
        #
        # self.requests_kwargs.update(hooks={'response': hooks})

        use_session = (setting.USE_SESSION if self.use_session is None else
                       self.use_session)  # self.use_session 优先级高

        if use_session:
            response = self._session.request(method, self.url,
                                             **self.requests_kwargs)
        else:
            response = requests.request(method, self.url,
                                        **self.requests_kwargs)

        response = Response(response)
        if save_cached:
            self.save_cached(response,
                             expire_time=self.__class__.cached_expire_time)

        return response