def process_request(self, request, spider):
    """Rewrite ``request`` in place using a stored 'load_more' request template.

    The template is taken from ``self.req_data_list`` at index
    ``self.counter % self.wx_num``; ``self.counter`` is incremented at the
    end, so successive calls step through the stored entries.

    :param request: request object to rewrite; ``request.meta['list_offset']``
        supplies the ``offset`` URL parameter.
    :param spider: not used by this method.
    :return: always ``None``.
    """
    # NOTE(review): presumably wx_num == len(req_data_list) -- confirm.
    slot = self.counter % self.wx_num
    template = TidyReqData.req_to_dict(
        self.req_data_list[slot]['load_more']['req_data'])

    request.set_method(template['method'])

    # Overwrite the paging offset in the template's URL parameters with the
    # one carried by this request.
    query = template['url_param_dict']
    query['offset'] = request.meta['list_offset']
    request._set_url(template['url'] + dict_to_str(query))

    request.set_headers(template['headers'])

    self.counter += 1
    return None
def process_request(self, request, spider):
    """Rewrite ``request`` in place using a stored 'content' request template.

    The request URL is forced to the ``https://`` scheme, while the
    ``http://`` form of the same URL is stored on the request via
    ``request.set_ext_data({"raw_url": ...})``. Method and headers come from
    the template selected at index ``self.counter % self.wx_num``; the
    ``Cookie`` header, if present, is removed before the headers are applied.
    ``self.counter`` is incremented at the end.

    Only the leading scheme of the URL is rewritten. Occurrences of
    "http"/"https" elsewhere in the URL (path, query string) are left
    untouched, and a URL with neither scheme prefix is used unchanged.

    :param request: request object to rewrite.
    :param spider: not used by this method.
    :return: always ``None``.
    """
    http_prefix = "http://"
    https_prefix = "https://"

    current_req_data = self.req_data_list[self.counter % self.wx_num]
    req_data = TidyReqData.req_to_dict(
        current_req_data['content']['req_data'])

    url = request._get_url()
    # Anchor the scheme swap to the URL prefix: a plain str.replace() would
    # also rewrite any "http"/"https" substring inside the path or query.
    if url.startswith(https_prefix):
        raw_url = http_prefix + url[len(https_prefix):]
    elif url.startswith(http_prefix):
        raw_url = url
        url = https_prefix + url[len(http_prefix):]
    else:
        raw_url = url

    request.set_ext_data({"raw_url": raw_url})
    request._set_url(url)
    request.set_method(req_data['method'])

    # Drop the template's Cookie header (if any) before applying the headers.
    req_data['headers'].pop("Cookie", None)
    request.set_headers(req_data['headers'])

    self.counter += 1
    return None
def process_request(self, request, spider):
    """Rewrite ``request`` in place using a stored 'getappmsgext' template.

    The query string of the request's current URL is parsed and merged into
    the template's body parameters, ``comment_id`` and ``is_need_reward`` are
    set, and the request's URL, method, headers and body are then replaced
    with values built from the template. The template is selected at index
    ``self.counter % self.wx_num``; ``self.counter`` is incremented at the
    end.

    :param request: request object to rewrite.
    :param spider: not used by this method.
    :return: always ``None``.
    """
    slot = self.counter % self.wx_num
    template = TidyReqData.req_to_dict(
        self.req_data_list[slot]['getappmsgext']['req_data'])

    # Everything after the last '?' of the current URL (the whole URL when
    # there is no '?') is parsed as '&'-separated 'key=value' pairs.
    article_url = request._get_url()
    article_params = str_to_dict(article_url.rpartition('?')[2], '&', '=')

    # The template's body dict is updated in place (no copy is made).
    form = template['body_dict']
    form.update(article_params)
    # NOTE(review): get_ext_data is subscripted without being called, so it
    # is presumably a property on the project's Request class -- confirm.
    form['comment_id'] = request.get_ext_data['comment_id']
    form['is_need_reward'] = 1

    request._set_url(template['url'] + template['url_param_str'])
    request.set_method(template['method'])
    request.set_headers(template['headers'])
    request._set_body(dict_to_str(form))

    self.counter += 1
    return None
def prepare_req_data(self, current_req_data, request, _type):
    """
    Build the URL, headers and body for a read-data or comment request.

    :param current_req_data: request parameters to use for this round;
        indexed as ``current_req_data[_type]['req_data']``
    :param request: Request object; its current URL and
        ``request.meta['comment_id']`` are read
    :param _type: ``'getappmsgext'`` or ``'appmsg_comment'``; any other value
        yields an empty dict
    :return: dict with keys ``'url_str'``, ``'header_dict'`` and
        ``'body_dict'``, or ``{}`` for an unsupported ``_type``
    """
    if _type not in ('getappmsgext', 'appmsg_comment'):
        return {}

    template = TidyReqData.req_to_dict(current_req_data[_type]['req_data'])

    # Build the body parameters from the original article URL: everything
    # after the last '?' is parsed as '&'-separated 'key=value' pairs.
    article_url = request._get_url()
    article_params = str_to_dict(article_url.rpartition('?')[2], '&', '=')

    body = copy(template['body_dict'])
    from tools.utils import update_dict_by_dict
    # presumably copies the listed keys from the article URL parameters into
    # the body -- verify against tools.utils.update_dict_by_dict
    update_dict_by_dict(body, article_params, ['mid', 'sn', 'idx', 'scene'])
    body['comment_id'] = request.meta['comment_id']
    body['is_need_reward'] = 1

    if "comment_id" in template['url_param_dict']:
        # Comment-content request: rebuild the query string with this
        # article's comment_id and idx.
        query = copy(template['url_param_dict'])
        query['comment_id'] = request.meta['comment_id']
        query['idx'] = article_params['idx']
        from tools.utils import dict_to_str
        url_str = template['url'] + dict_to_str(query)
    else:
        # Read-count request: reuse the template's query string as is.
        url_str = template['url'] + template['url_param_str']

    return {
        'url_str': url_str,
        'header_dict': template['headers'],
        'body_dict': body,
    }