def parse_index(self, response):
    """
    Parse an index (search result list) page.

    Yields one WeixinRequest per article link found on the page, using
    parse_detail as the callback, followed by one request for the next
    index page when the page carries a '#sogou_next' link.

    :param response: response whose ``text`` holds the index page markup
    :return: generator of WeixinRequest objects
    """
    page = pq(response.text)
    for anchor in page('.news-box .news-list li .txt-box h3 a').items():
        yield WeixinRequest(url=anchor.attr('href'), callback=self.parse_detail)

    next_href = page('#sogou_next').attr('href')
    if not next_href:
        # No pagination link on this page: nothing more to schedule.
        return
    # The href is appended to base_url as-is to form the next page URL.
    next_url = self.base_url + str(next_href)
    yield WeixinRequest(url=next_url, callback=self.parse_index, need_proxy=True)
def start(self):
    """
    Prepare the session and schedule the first index-page request.

    Merges self.headers into the session headers, builds the start URL
    from base_url plus the encoded query, and adds the resulting
    WeixinRequest to the queue.
    """
    self.session.headers.update(self.headers)
    query = {'query': self.keyword, 'type': 2}
    first_url = self.base_url + '?' + urlencode(query)
    first_request = WeixinRequest(
        url=first_url,
        callback=self.parse_index,
        need_proxy=True,
    )
    self.queue.add(first_request)
def start(self):
    """
    Prepare the session and schedule the first index-page request.

    Merges self.headers into the session headers, builds the start URL
    from base_url plus the encoded search parameters, and adds the
    resulting WeixinRequest (callback parse_index, need_proxy=True) to
    the queue.
    """
    # Apply the configured headers to every request made via the session.
    self.session.headers.update(self.headers)
    params = {
        'query': self.keyword,
        'type': 2,
        'sut': 7956,
        # Pass the raw comma-separated value: urlencode() percent-encodes
        # it to '1%2C...'. Supplying the already-encoded form would be
        # encoded a second time ('%' -> '%25').
        'lkt': '1,1553052272863,1553052272863',
        's_from': 'input',
        '_sug_': 'y',
        'sst0': '1553052272967',
        'ie': 'utf8',
        'w': '01019900',
        'dr': '1',
    }
    start_url = self.base_url + '?' + urlencode(params)
    weixin_request = WeixinRequest(
        url=start_url,
        callback=self.parse_index,
        need_proxy=True,
    )
    # Schedule the first request.
    self.queue.add(weixin_request)
def parse_index(self, response):
    """
    Parse an index (search result list) page.

    :param response: response whose ``text`` holds the index page markup
    :return: generator yielding a WeixinRequest per article link
             (callback parse_detail) and, when a next-page link exists,
             a WeixinRequest for the next index page (callback parse_index)
    """
    article_selector = '.news-box .news-list li .txt-box h3 a'
    next_selector = '#sogou_next'

    # Parse the response body so it can be queried with CSS selectors.
    parsed = pq(response.text)

    for link in parsed(article_selector).items():
        # Each matched anchor's href is used as the detail page URL.
        detail_url = link.attr('href')
        yield WeixinRequest(url=detail_url, callback=self.parse_detail)

    # Look for the link to the next index page.
    next_href = parsed(next_selector).attr('href')
    if next_href:
        # The href is appended to base_url to form the next index URL.
        following_url = self.base_url + str(next_href)
        yield WeixinRequest(
            url=following_url,
            callback=self.parse_index,
            need_proxy=True,
        )
def start(self):
    """
    Prepare the session and schedule the first index-page request.

    This variant creates the first request with need_proxy=False and a
    timeout of 15.
    """
    # Apply the configured headers to the shared session.
    self.session.headers.update(self.headers)

    search_params = {'query': self.keyword, 'type': 2}
    entry_url = self.base_url + '?' + urlencode(search_params)

    entry_request = WeixinRequest(
        url=entry_url,
        callback=self.parse_index,
        need_proxy=False,
        timeout=15,
    )
    # Schedule the first request.
    self.queue.add(entry_request)
def start(self):
    """
    Prepare the session and schedule the first index-page request.
    """
    # Apply the configured headers to the shared session.
    self.session.headers.update(self.headers)

    # Encode the search parameters and append them to the base URL.
    encoded_query = parse.urlencode({'query': self.keyword, 'type': 2})
    seed_url = self.base_url + '?' + encoded_query

    # Build the request object; parse_index handles its response.
    seed_request = WeixinRequest(
        url=seed_url,
        callback=self.parse_index,
        need_proxy=True,
    )

    # Put the request on the queue.
    self.queue.add(seed_request)
def start(self):
    """
    Prepare the session and schedule the first index-page request.
    """
    # Merge self.headers into the session headers so that every request
    # sent through the session carries them.
    self.session.headers.update(self.headers)

    # Build the starting URL from the base URL and the encoded query.
    query_string = urlencode({'query': self.keyword, 'type': 2})
    initial_url = self.base_url + '?' + query_string

    # parse_index is registered as the callback for this request, and the
    # request is flagged with need_proxy=True.
    initial_request = WeixinRequest(
        url=initial_url,
        callback=self.parse_index,
        need_proxy=True,
    )

    # Add the request to the queue so it is the first one scheduled.
    self.queue.add(initial_request)
:return: result of the add
        """
        # Only WeixinRequest instances are stored; anything else is rejected
        # with False instead of being pushed.
        if isinstance(request, WeixinRequest):
            return self.db.rpush(REDIS_KEY, dumps(request))
        return False

    def pop(self):
        """
        Take the next request off the head of the list and deserialize it.

        :return: the deserialized request, or False when the list is empty
        """
        # NOTE(review): llen and lpop are two separate calls; if another
        # consumer empties the list in between, lpop presumably returns
        # None and loads would fail - confirm whether that can happen.
        # NOTE(review): loads/dumps are presumably pickle's - confirm; that
        # is only safe while the stored data is trusted.
        if self.db.llen(REDIS_KEY):
            return loads(self.db.lpop(REDIS_KEY))
        else:
            return False

    def clear(self):
        # Remove the whole list stored under REDIS_KEY.
        self.db.delete(REDIS_KEY)

    def empty(self):
        # True when the list stored under REDIS_KEY has no entries.
        return self.db.llen(REDIS_KEY) == 0


if __name__ == '__main__':
    # Manual round trip: add one request, pop it back, and print the
    # result together with two of its attributes.
    db = RedisQueue()
    start_url = 'http://www.baidu.com'
    weixin_request = WeixinRequest(url=start_url, callback='hello', need_proxy=True)
    db.add(weixin_request)
    request = db.pop()
    print(request)
    print(request.callback, request.need_proxy)