def parser(self, request, response):
    # parse the article list page and dispatch one task per article url
    article_list_url = response.xpath(
        '//div[@class="kjxw_tit"]/a/@href').extract()
    for url in article_list_url:
        log.debug("dispatching article task url = {}".format(url))
        yield spider.Request(url, callback=self.parser_artile)
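# A standalone sanity check of the XPath used above, runnable outside the
# spider; lxml and the sample HTML here are illustrative assumptions, not
# part of this project.
from lxml import etree

_doc = etree.HTML('<div class="kjxw_tit"><a href="/news/1.html">demo</a></div>')
assert _doc.xpath('//div[@class="kjxw_tit"]/a/@href') == ["/news/1.html"]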
def start_monitor_task(self):
    """
    @summary: monitor task states
    ---------
    ---------
    @result:
    """
    if not self._parsers:  # not multi-template mode; register self as the template
        self._is_more_parsers = False
        self._parsers.append(self)

    elif len(self._parsers) <= 1:
        self._is_more_parsers = False

    if self._task:
        self.distribute_task([self._task])
    else:
        tasks = self.get_todo_task_from_mysql()
        if not tasks:
            raise Exception("no task fetched; check whether task_id: {} exists".format(
                self._task_id))
        self.distribute_task(tasks)

    os.environ.setdefault("batch_date", "1970-00-00")  # sentinel batch date
    log.debug("task distribution finished")
def __add_item_to_db(self, items, update_items, requests, callbacks,
                     items_fingerprints):
    export_success = False
    self._is_adding_to_db = True

    # deduplicate
    if setting.ITEM_FILTER_ENABLE:
        items, items_fingerprints = self.__dedup_items(
            items, items_fingerprints)

    # sort items by target table
    items_dict = self.__pick_items(items)
    update_items_dict = self.__pick_items(update_items,
                                          is_update_item=True)

    # batch-insert items
    while items_dict:
        tab_item, datas = items_dict.popitem()
        log.debug("""
            -------------- batch item insert --------------
            table: %s
            datas: %s
            """ % (tab_item, tools.dumps_json(datas, indent=16)))
        export_success = self.__export_to_db(tab_item, datas)

    # batch update
    while update_items_dict:
        tab_item, datas = update_items_dict.popitem()
        log.debug("""
            -------------- batch item update --------------
            table: %s
            datas: %s
            """ % (tab_item, tools.dumps_json(datas, indent=16)))
        update_keys = self._item_update_keys.get(tab_item)
        export_success = self.__export_to_db(tab_item, datas,
                                             is_update=True,
                                             update_keys=update_keys)

    # run callbacks
    while callbacks:
        try:
            callback = callbacks.pop(0)
            callback()
        except Exception as e:
            log.exception(e)

    # remove finished requests
    if requests:
        self._db.zrem(self._table_request, requests)

    # persist the dedup fingerprints
    if export_success and setting.ITEM_FILTER_ENABLE:
        if items_fingerprints:
            self.__class__.dedup.add(items_fingerprints, skip_check=True)

    self._is_adding_to_db = False
def __add_request_to_db(self):
    request_list = []
    prioritys = []
    callbacks = []

    while self._requests_deque:
        request = self._requests_deque.popleft()
        self._is_adding_to_db = True

        if callable(request):  # a callback function
            # Note: beware of closures over loop variables. Bind the
            # current value as a default argument so the callback does not
            # see the loop variable's final value:
            #     def test(xxx=xxx):
            #         ...  # business logic using xxx
            callbacks.append(request)
            continue

        priority = request.priority

        # skip the request if dedup is enabled and it is already recorded
        if (request.filter_repeat and setting.REQUEST_FILTER_ENABLE
                and not self.__class__.dedup.add(request.fingerprint)):
            log.debug("request already exists url = %s" % request.url)
            continue
        else:
            request_list.append(str(request.to_dict))
            prioritys.append(priority)

        if len(request_list) > MAX_URL_COUNT:
            self._db.zadd(self._table_request, request_list, prioritys)
            request_list = []
            prioritys = []

    # flush the remainder
    if request_list:
        self._db.zadd(self._table_request, request_list, prioritys)

    # run callbacks
    for callback in callbacks:
        try:
            callback()
        except Exception as e:
            log.exception(e)

    # remove finished requests
    if self._del_requests_deque:
        request_done_list = []
        while self._del_requests_deque:
            request_done_list.append(self._del_requests_deque.popleft())

        # exclude requests still in request_list, otherwise a request that
        # was just added could be deleted
        request_done_list = list(
            set(request_done_list) - set(request_list))
        if request_done_list:
            self._db.zrem(self._table_request, request_done_list)

    self._is_adding_to_db = False
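# A minimal standalone demonstration of the closure pitfall noted above
# (not part of the buffer): a lambda closes over the loop variable by
# reference, so deferred callbacks all see its final value; binding it as
# a default argument freezes the value current at definition time.
def _closure_pitfall_demo():
    late, bound = [], []
    for i in range(3):
        late.append(lambda: i)        # every callback returns 2
        bound.append(lambda i=i: i)   # callbacks return 0, 1, 2
    assert [f() for f in late] == [2, 2, 2]
    assert [f() for f in bound] == [0, 1, 2]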
def parser_artile(self, request, response):
    log.debug("parsing article url = {}".format(request.url))

    article_extractor = ArticleExtractor(request.url, response.text)

    item = Item()
    item.url = request.url
    item.content = article_extractor.get_content()
    item.title = article_extractor.get_title()
    item.release_time = article_extractor.get_release_time()
    item.author = article_extractor.get_author()
    log.debug(item)
def __init__(self, ip=None, port=None, db=None, user_name=None,
             user_pass=None, **kwargs):
    # setting values may be modified at runtime, so defaults are loaded
    # here rather than in the signature
    if not ip:
        ip = setting.MYSQL_IP
    if not port:
        port = setting.MYSQL_PORT
    if not db:
        db = setting.MYSQL_DB
    if not user_name:
        user_name = setting.MYSQL_USER_NAME
    if not user_pass:
        user_pass = setting.MYSQL_USER_PASS

    try:
        self.connect_pool = PooledDB(
            creator=pymysql,
            mincached=1,
            maxcached=100,
            maxconnections=100,
            blocking=True,
            ping=7,
            host=ip,
            port=port,
            user=user_name,
            passwd=user_pass,
            db=db,
            charset="utf8mb4",
            # use a server-side cursor; the default client-side cursor
            # keeps growing memory under multithreaded bulk inserts
            cursorclass=cursors.SSCursor,
        )
    except Exception as e:
        log.error("""
        failed to connect to the database:
        ip: {}
        port: {}
        db: {}
        user_name: {}
        user_pass: {}
        exception: {}
        """.format(ip, port, db, user_name, user_pass, e))
    else:
        log.debug("connected to mysql database %s : %s" % (ip, db))
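# A minimal standalone sketch of how a DBUtils pool like the one above is
# consumed; the credentials are placeholder assumptions. The import path
# is `from dbutils.pooled_db import PooledDB` on DBUtils >= 2.0
# (`from DBUtils.PooledDB import PooledDB` on older releases).
from dbutils.pooled_db import PooledDB
import pymysql

_pool = PooledDB(creator=pymysql, mincached=1, host="127.0.0.1", port=3306,
                 user="root", passwd="root", db="test", charset="utf8mb4")
_conn = _pool.connection()   # borrow a connection from the pool
_cursor = _conn.cursor()
_cursor.execute("SELECT 1")
print(_cursor.fetchone())    # (1,)
_cursor.close()
_conn.close()                # close() returns the connection to the pool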
def set_unique_key(self, table, key):
    conn = cursor = None  # keep finally safe if get_connection() fails
    try:
        sql = "alter table %s add unique (%s)" % (table, key)
        conn, cursor = self.get_connection()
        cursor.execute(sql)
        conn.commit()
    except Exception as e:
        log.error(table + " " + str(e) + " key = " + key)
        return False
    else:
        log.debug("created unique index on table %s, index: %s" % (table, key))
        return True
    finally:
        if conn:
            self.close_connection(conn, cursor)
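# For reference, with table="news" and key="url" (hypothetical names) the
# method above issues:
#     alter table news add unique (url)
# If the column already holds duplicate values, MySQL rejects the ALTER
# with a "Duplicate entry" error, which lands in the except branch and
# returns False.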
def run(self):
    self.distribute_task()

    for i in range(self._parser_count):
        parser_control = SingleSpiderParserControl(self._memory_db)
        parser_control.add_parser(self)
        parser_control.start()
        self._parser_controls.append(parser_control)

    while True:
        if self.all_thread_is_done():
            # stop the parser_controls
            for parser_control in self._parser_controls:
                parser_control.stop()

            log.debug("no tasks left, spider finished")
            break
def reput_failed_requests_to_requests(self):
    log.debug("resetting failed requests...")
    total_count = 0
    while True:
        failed_requests = self.get_failed_requests()
        if not failed_requests:
            break

        for request in failed_requests:
            request["retry_times"] = 0
            request_obj = Request.from_dict(request)
            self._request_buffer.put_request(request_obj)
            total_count += 1

    self._request_buffer.flush()

    log.debug("reset %s failed requests back to pending requests" % total_count)
def update_task_state(self, task_id, state=1, **kwargs):
    """
    @summary: update the task's state in the task table; business code
              must call this after finishing each task. May be overridden.
              Call it as: yield lambda: self.update_task_state(task_id, state)
    ---------
    @param task_id:
    @param state:
    ---------
    @result:
    """
    kwargs["id"] = task_id
    kwargs[self._task_state] = state

    sql = tools.make_update_sql(
        self._task_table, kwargs,
        condition="id = {task_id}".format(task_id=task_id))

    if self._mysqldb.update(sql):
        log.debug("set state of task %s successfully" % task_id)
    else:
        log.error("failed to set state of task %s sql=%s" % (task_id, sql))
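# A minimal sketch of the calling convention described in the docstring;
# the parser name and the way task_id travels on the request are
# assumptions for illustration.
def parse(self, request, response):
    task_id = request.task_id  # assumed to be carried on the request
    ...  # extract and yield items here
    # defer the state update so it runs after the task's data is flushed
    yield lambda: self.update_task_state(task_id, state=1)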
def __init__(self, ip_ports=None, db=None, user_pass=None, url=None,
             decode_responses=True, service_name=None, max_connections=32,
             **kwargs):
    """
    redis wrapper
    Args:
        ip_ports: ip:port; multiple nodes as a list or comma separated,
                  e.g. ip1:port1,ip2:port2 or ["ip1:port1", "ip2:port2"]
        db:
        user_pass:
        url:
        decode_responses:
        service_name: used for redis sentinel mode
    """
    # setting values may be modified at runtime, so defaults are loaded
    # here rather than in the signature
    if ip_ports is None:
        ip_ports = setting.REDISDB_IP_PORTS
    if db is None:
        db = setting.REDISDB_DB
    if user_pass is None:
        user_pass = setting.REDISDB_USER_PASS
    if service_name is None:
        service_name = setting.REDISDB_SERVICE_NAME

    self._is_redis_cluster = False

    try:
        if not url:
            ip_ports = (ip_ports if isinstance(ip_ports, list)
                        else ip_ports.split(","))
            if len(ip_ports) > 1:
                startup_nodes = []
                for ip_port in ip_ports:
                    ip, port = ip_port.split(":")
                    startup_nodes.append({"host": ip, "port": port})

                if service_name:
                    log.debug("using redis sentinel mode")
                    hosts = [(node["host"], node["port"])
                             for node in startup_nodes]
                    sentinel = Sentinel(hosts, socket_timeout=3, **kwargs)
                    self._redis = sentinel.master_for(
                        service_name,
                        password=user_pass,
                        db=db,
                        redis_class=redis.StrictRedis,
                        decode_responses=decode_responses,
                        max_connections=max_connections,
                        **kwargs)
                else:
                    log.debug("using redis cluster mode")
                    self._redis = StrictRedisCluster(
                        startup_nodes=startup_nodes,
                        decode_responses=decode_responses,
                        password=user_pass,
                        max_connections=max_connections,
                        **kwargs)

                self._is_redis_cluster = True
            else:
                ip, port = ip_ports[0].split(":")
                self._redis = redis.StrictRedis(
                    host=ip,
                    port=port,
                    db=db,
                    password=user_pass,
                    decode_responses=decode_responses,
                    max_connections=max_connections,
                    **kwargs)
        else:
            self._redis = redis.StrictRedis.from_url(
                url, decode_responses=decode_responses)

    except Exception:
        raise
    else:
        if not url:
            log.debug("connected to redis database %s db%s" % (ip_ports, db))
        else:
            log.debug("connected to redis database %s" % url)

    self._ip_ports = ip_ports
    self._db = db
    self._user_pass = user_pass
    self._url = url
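# Hypothetical examples of the three address forms the constructor above
# dispatches on (the class name RedisDB and all addresses are assumptions):
#     RedisDB(ip_ports="127.0.0.1:6379", db=0)              # single node
#     RedisDB(ip_ports="10.0.0.1:7000,10.0.0.2:7000")       # cluster
#     RedisDB(ip_ports="10.0.0.1:26379,10.0.0.2:26379",
#             service_name="mymaster")                      # sentinel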
def get_response(self, save_cached=False):
    """
    Fetch a response wrapped with selector support
    @param save_cached: cache the response so debugging does not
                        re-download every time
    @return:
    """
    # default timeout
    self.requests_kwargs.setdefault("timeout", 22)  # connect=22 read=22

    # enable stream mode. By default the response body is downloaded as
    # soon as the request returns; with stream=True only the headers are
    # fetched and the body download is deferred until Response.content is
    # accessed. Drawback: with stream=True, requests cannot release the
    # connection back to the pool unless all data is consumed or
    # Response.close is called, which hurts connection efficiency.
    self.requests_kwargs.setdefault("stream", True)

    # disable certificate verification
    self.requests_kwargs.setdefault("verify", False)

    # resolve the request method
    method = self.__dict__.get("method")
    if not method:
        if "data" in self.requests_kwargs:
            method = "POST"
        else:
            method = "GET"

    # random User-Agent
    headers = self.requests_kwargs.get("headers", {})
    if "user-agent" not in headers and "User-Agent" not in headers:
        if self.random_user_agent and setting.RANDOM_HEADERS:
            headers.update(
                {"User-Agent": self.__class__.user_agent_pool.get()})
            self.requests_kwargs.update(headers=headers)
        else:
            self.requests_kwargs.setdefault(
                "headers",
                {
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
                },
            )

    # proxies
    proxies = self.requests_kwargs.get("proxies", -1)
    if proxies == -1 and setting.PROXY_ENABLE and self.__class__.proxies_pool:
        while True:
            proxies = self.__class__.proxies_pool.get()
            if proxies:
                break
            else:
                log.debug("no proxy available yet ...")

    proxies and self.requests_kwargs.update(proxies=proxies)

    log.debug("""
    -------------- %s.%s request for ----------------
    url  = %s
    method = %s
    body = %s
    """ % (
        self.parser_name,
        (self.callback and callable(self.callback) and getattr(
            self.callback, "__name__") or self.callback) or "parser",
        self.url,
        method,
        self.requests_kwargs,
    ))

    # def hooks(response, *args, **kwargs):
    #     print(response.url)
    #
    # self.requests_kwargs.update(hooks={'response': hooks})

    # self.use_session takes precedence over the setting
    use_session = (setting.USE_SESSION
                   if self.use_session is None else self.use_session)

    if use_session:
        response = self._session.request(method, self.url,
                                         **self.requests_kwargs)
    else:
        response = requests.request(method, self.url,
                                    **self.requests_kwargs)

    response = Response(response)
    if save_cached:
        self.save_cached(response,
                         expire_time=self.__class__.cached_expire_time)

    return response
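# A standalone illustration, using plain requests against a placeholder
# URL, of the stream=True behavior the comment above describes: only the
# headers are fetched up front, the body downloads on first access to
# .content, and close() releases the connection back to the pool.
import requests

_resp = requests.get("https://example.com", stream=True, timeout=22)
print(_resp.status_code)   # headers are already available here
_body = _resp.content      # the body is downloaded at this point
_resp.close()              # release the connection back to the pool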