def req_to_dict(raw_req_data):
    """Convert one captured request record into the dict used to build requests.

    :param raw_req_data: a single captured request record (the original note
        describes it as ``_type['req_data']``, one of five kinds of request
        data captured by anyproxy). This function reads
        ``raw_req_data['url']``, ``raw_req_data['requestData']`` and the
        ``'method'`` / ``'headers'`` entries of
        ``raw_req_data['requestOptions']``.
    :return: dict with the keys ``'url'`` (the part of the URL before the
        first ``?``, with a ``?`` appended), ``'method'``, ``'headers'``,
        ``'body_dict'`` (parsed ``requestData``), ``'url_param_dict'``
        (parsed query string) and ``'url_param_str'`` (raw query string).
    """
    # partition() splits on the first '?' only: a URL without a query string
    # yields an empty query instead of raising IndexError, and a query that
    # itself contains '?' is kept whole instead of being truncated.
    base_url, _, url_param_str = raw_req_data['url'].partition('?')
    request_options = raw_req_data['requestOptions']
    return {
        'url': base_url + '?',
        'method': request_options['method'],
        'headers': request_options['headers'],
        'body_dict': str_to_dict(raw_req_data['requestData'], "&", "="),
        # An empty query string is handed to str_to_dict unchanged.
        'url_param_dict': str_to_dict(url_param_str, "&", "="),
        # Extra field kept for testing (per the original author's note).
        'url_param_str': url_param_str,
    }
def process_request(self, request, spider):
    """Rewrite ``request`` in place using one captured parameter set.

    The parameter set is picked from ``self.req_data_list`` by
    ``self.counter % self.wx_num``; its ``'getappmsgext'`` entry supplies the
    target URL, method, headers and base body. The query parameters of the
    request's current URL are merged into that body, together with
    ``comment_id`` (read from ``request.get_ext_data``) and
    ``is_need_reward``.

    :param request: request object to modify; presumably a project-specific
        request type exposing ``_get_url``/``_set_url``/``set_method``/
        ``set_headers``/``_set_body`` -- confirm against the caller.
    :param spider: not used by this method.
    :return: always ``None``.
    """
    # Choose which captured parameter set this call uses.
    slot = self.counter % self.wx_num
    captured = self.req_data_list[slot]
    tidy = TidyReqData.req_to_dict(captured['getappmsgext']['req_data'])

    # Parse the query string of the URL currently set on the request.
    article_url = request._get_url()
    article_query = article_url.split('?')[-1]
    article_params = str_to_dict(article_query, '&', '=')

    # Captured body first, then the article's own parameters on top.
    payload = tidy['body_dict']
    payload.update(article_params)
    payload['comment_id'] = request.get_ext_data['comment_id']
    payload['is_need_reward'] = 1

    # Point the request at the captured endpoint.
    target_url = tidy['url'] + tidy['url_param_str']
    request._set_url(target_url)
    request.set_method(tidy['method'])
    request.set_headers(tidy['headers'])
    request._set_body(dict_to_str(payload))

    self.counter += 1
    return None
def prepare_req_data(self, current_req_data, request, _type):
    """Assemble the URL, headers and body for a follow-up request.

    :param current_req_data: request parameters to use for this round;
        indexed as ``current_req_data[_type]['req_data']``.
    :param request: Request object; its URL (via ``_get_url()``) and
        ``meta['comment_id']`` are read.
    :param _type: kind of request to prepare; only ``'getappmsgext'`` and
        ``'appmsg_comment'`` are handled.
    :return: dict with the keys ``'url_str'``, ``'header_dict'`` and
        ``'body_dict'``, or an empty dict for any other ``_type``.
    """
    if _type not in ('getappmsgext', 'appmsg_comment'):
        return {}
    tidy = TidyReqData.req_to_dict(current_req_data[_type]['req_data'])

    # Build the body parameters from the original article URL.
    article_query = request._get_url().split('?')[-1]
    article_params = str_to_dict(article_query, '&', '=')
    payload = copy(tidy['body_dict'])
    from tools.utils import update_dict_by_dict
    update_dict_by_dict(
        payload, article_params, ['mid', 'sn', 'idx', 'scene'])
    payload['comment_id'] = request.meta['comment_id']
    payload['is_need_reward'] = 1

    if "comment_id" in tidy['url_param_dict']:
        # Comment-content request: rebuild the query string with this
        # article's comment_id and idx.
        query = copy(tidy['url_param_dict'])
        query['comment_id'] = request.meta['comment_id']
        query['idx'] = article_params['idx']
        from tools.utils import dict_to_str
        target_url = tidy['url'] + dict_to_str(query)
    else:
        # Read-count request: reuse the captured query string as-is.
        target_url = tidy['url'] + tidy['url_param_str']

    return {
        'url_str': target_url,
        'header_dict': tidy['headers'],
        'body_dict': payload,
    }
def on_phone_crawler_add(data):
    """Register a phone crawler and broadcast the updated crawler report.

    :param data: ``&``-separated ``key=value`` string describing the crawler;
        it is parsed with ``str_to_dict`` before being passed to
        ``gc.add_crawler``.
    """
    # NOTE(review): ``gc`` is presumably a project-level manager object, not
    # the standard-library ``gc`` module -- confirm at the import site.
    crawler_fields = str_to_dict(data, '&', '=')
    gc.add_crawler(crawler_fields)
    crawler_report = gc.report_crawler()
    socketio.emit('phone_crawler_data', crawler_report)
def on_gzhs_todolist_add(data):
    """Add an entry via ``gc.add_gzh`` and broadcast the in-progress report.

    :param data: ``&``-separated ``key=value`` string describing the entry;
        it is parsed with ``str_to_dict`` before being passed to
        ``gc.add_gzh``.
    """
    # NOTE(review): ``gc`` is presumably a project-level manager object, not
    # the standard-library ``gc`` module -- confirm at the import site.
    gzh_fields = str_to_dict(data, '&', '=')
    gc.add_gzh(gzh_fields)
    doing_report = gc.report_gzh_doing()
    socketio.emit('gzhs_todolist_data', doing_report)