# Module-level imports these snippets assume (the methods below live on a
# Scrapy spider class):
import json
import time

import scrapy


def parse_article_list(self, response):
    """
    Article list.
    :param response:
    :return:
    """
    body = response.body_as_unicode()
    jsonp_text = 'jsonp%d' % response.meta.get('jsonp_index', 0)
    # str.lstrip() strips a character set, not a prefix, so slice the
    # JSONP wrapper (e.g. "jsonp0(...)") off explicitly before parsing.
    prefix = '%s(' % jsonp_text
    if body.startswith(prefix):
        body = body[len(prefix):]
    result = json.loads(body.rstrip(')'))

    # Pagination
    has_more = result.get('has_more')
    if has_more:
        max_behot_time = result['next']['max_behot_time']
        AS, CP = get_as_cp()
        jsonp_index = response.meta.get('jsonp_index', 0) + 1
        url_params_next = {
            'max_behot_time': max_behot_time,
            'as': AS,
            'cp': CP,
            'callback': 'jsonp%d' % jsonp_index,
        }
        url_article_list_next = get_update_url(response.url, url_params_next)
        meta = dict(response.meta, jsonp_index=jsonp_index)
        yield scrapy.Request(url=url_article_list_next,
                             callback=self.parse_article_list,
                             meta=meta)

    # Details
    data_list = result.get('data', [])
    for data_item in data_list:
        detail_url = data_item.get('source_url')
        meta = dict(response.meta, detail_url=detail_url)
        yield scrapy.Request(url=detail_url,
                             callback=self.parse_article_detail,
                             meta=meta)
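# Both parse_article_list() and the jssdk_signature() variants below call a
# project helper, get_update_url(url, params), that is not shown in this
# section. A minimal sketch, assuming it merges `params` into (and overrides)
# the URL's existing query string; the name and behaviour are inferred from
# the call sites, not copied from the original source:
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse


def get_update_url(url, params):
    """Return `url` with `params` merged into its query string (overriding)."""
    parts = urlparse(url)
    query = {k: v[0] for k, v in
             parse_qs(parts.query, keep_blank_values=True).items()}
    query.update({k: str(v) for k, v in params.items()})
    return urlunparse(parts._replace(query=urlencode(query)))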
def jssdk_signature(self, response):
    AS, CP = get_as_cp()
    jsonp_index = 3
    url = 'https://www.toutiao.com/pgc/ma/'
    url_params = {
        'page_type': 1,
        'max_behot_time': '',
        'uid': response.meta['userid'],
        'media_id': response.meta['mediaid'],
        'output': 'json',
        'is_json': 1,
        'count': 20,
        'from': 'user_profile_app',
        'version': 2,
        'as': AS,
        'cp': CP,
        'callback': 'jsonp%d' % jsonp_index,
    }
    url_article_list = get_update_url(url, url_params)
    # Debugging override: a captured request URL with a known-good as/cp
    # pair, used to rule out the signature generation itself.
    url_article_list = "https://www.toutiao.com/pgc/ma/?page_type=1&max_behot_time=&uid=6555293927&media_id=6555293927&output=json&is_json=1&count=20&from=user_profile_app&version=2&as=A1D53BF94259E77&cp=5B92C98E87777E1&callback=jsonp3"
    print("===url_article_list:", url_article_list)
    meta = dict(response.meta, jsonp_index=jsonp_index)
    print("===meta:", meta)
    # print("===headers:", response.headers)
    yield scrapy.Request(url=url_article_list,
                         callback=self.parse_article_list,
                         meta=meta)
def jssdk_signature(self, response):
    AS, CP = get_as_cp()
    jsonp_index = 3
    url = 'https://www.toutiao.com/pgc/ma/'
    url_params = {
        'page_type': 1,
        'max_behot_time': '',
        'uid': response.meta['userid'],
        'media_id': response.meta['mediaid'],
        'output': 'json',
        'is_json': 1,
        'count': 20,
        'from': 'user_profile_app',
        'version': 2,
        'as': AS,
        'cp': CP,
        'callback': 'jsonp%d' % jsonp_index,
    }
    url_article_list = get_update_url(url, url_params)
    meta = dict(response.meta, jsonp_index=jsonp_index)
    yield scrapy.Request(url=url_article_list,
                         callback=self.parse_article_list,
                         meta=meta)
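# get_as_cp() supplies the `as`/`cp` signing pair that Toutiao's list
# endpoints expect. The project's implementation is not shown in this
# section; the sketch below follows a widely circulated reconstruction that
# derives the pair from the current Unix timestamp and its MD5 digest.
# Treat it as an assumption about what the helper does, not a verified copy.
import hashlib


def get_as_cp():
    """Derive the `as`/`cp` pair from the current timestamp (assumed scheme)."""
    now = int(time.time())
    e = hex(now).upper()[2:]  # timestamp as upper-case hex, normally 8 chars
    a = hashlib.md5(str(now).encode()).hexdigest().upper()
    if len(e) != 8:
        # Fixed fallback pair for an unexpected hex width.
        return '479BB4B7254C150', '7E0AC8874BB0985'
    s, r = '', ''
    for i in range(5):  # interleave digest and timestamp characters
        s += a[i] + e[i]
        r += e[i + 3] + a[-5:][i]
    return 'A1' + s + e[-3:], e[0:3] + r + 'E1'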
def jssdk_signature(self, response):
    AS, CP = get_as_cp()
    jsonp_index = 3
    url = 'https://m.toutiao.com/list/'
    # Earlier per-author params (the /pgc/ma/ endpoint), kept for reference:
    # url_params = {
    #     'page_type': 1,
    #     'max_behot_time': '',
    #     'uid': response.meta['userid'],
    #     'media_id': response.meta['mediaid'],
    #     'output': 'json',
    #     'is_json': 1,
    #     'count': 20,
    #     'from': 'user_profile_app',
    #     'version': 2,
    #     'as': AS,
    #     'cp': CP,
    #     'callback': 'jsonp%d' % jsonp_index,
    # }
    url_params = {
        'tag': 'news_hot',
        'max_behot_time': '%10d' % time.time(),  # current timestamp as cursor
        'format': 'json_raw',
        'output': 'json',
        'is_json': 1,
        'count': 20,
        'version': 2,
        'as': AS,
        'cp': CP,
        'callback': 'jsonp%d' % jsonp_index,
    }
    url_article_list = get_update_url(url, url_params)
    # url_article_list = "https://m.toutiao.com/list/?tag=__all__&ac=wap&count=20&format=json_raw&as=A1755BB952EA81E&cp=5B922AF8A19E2E1&max_behot_time=1536333324"
    print("===url_article_list:", url_article_list)
    meta = dict(response.meta, jsonp_index=jsonp_index)
    # print("===meta:", meta)
    # print("===headers:", response.headers)
    yield scrapy.Request(url=url_article_list,
                         callback=self.parse_article_list,
                         meta=meta)
def parse_article_list(self, response):
    """
    Article list.
    :param response:
    :return:
    """
    body = response.body_as_unicode()
    # print("headers:===\n", response.request.headers)
    # print("body:====\n", body)
    jsonp_text = 'jsonp%d' % response.meta.get('jsonp_index', 0)
    # str.lstrip() strips a character set, not a prefix, so slice the
    # JSONP wrapper off explicitly before parsing.
    prefix = '%s(' % jsonp_text
    if body.startswith(prefix):
        body = body[len(prefix):]
    result = json.loads(body.rstrip(')'))

    # Details
    data_list = result.get('data', [])
    print("\n====data_list len:", len(data_list))
    for data_item in data_list:
        detail_url = self.web_host_url + data_item.get('source_url') + 'info/'
        print("****detail_url:", detail_url)
        article_url = self.web_host_url + data_item.get('source_url')
        article_id = data_item['item_id']
        article_title = data_item['title']
        pub_time = data_item['behot_time']
        keywords = data_item['keywords'] if 'keywords' in data_item else ''
        meta = dict(
            response.meta,
            detail_url=detail_url,
            article_url=article_url,
            item_id=article_id,
            article_title=article_title,
            article_pub_time=pub_time,
            keywords=keywords,
        )
        yield scrapy.Request(url=detail_url,
                             callback=self.parse_article_detail,
                             meta=meta)

    # Pagination
    has_more = result.get('has_more')
    if has_more:
        max_behot_time = ''
        if 'next' in result and 'max_behot_time' in result['next']:
            max_behot_time = result['next']['max_behot_time']
        AS, CP = get_as_cp()
        jsonp_index = response.meta.get('jsonp_index', 0) + 1
        url_params_next = {
            # Fall back to "now" when the response carries no cursor.
            'max_behot_time': max_behot_time or '%10d' % time.time(),
            'as': AS,
            'cp': CP,
            'callback': 'jsonp%d' % jsonp_index,
        }
        print("max_behot_time:", url_params_next['max_behot_time'])
        url_article_list_next = get_update_url(response.url, url_params_next)
        meta = dict(response.meta, jsonp_index=jsonp_index)
        # Throttle the refresh loop; note this blocks Scrapy's reactor,
        # DOWNLOAD_DELAY would be the idiomatic alternative.
        time.sleep(self.FRESH_DELAY)
        yield scrapy.Request(url=url_article_list_next,
                             callback=self.parse_article_list,
                             meta=meta)
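# Both versions of parse_article_list() unwrap the JSONP callback by hand.
# A sturdier alternative (a sketch, not part of the original spider) is a
# single regex that tolerates any callback index and a trailing semicolon:
import re

JSONP_RE = re.compile(r'^\s*jsonp\d+\((?P<payload>.*)\)\s*;?\s*$', re.S)


def unwrap_jsonp(body):
    """Return the parsed JSON payload inside a `jsonpN(...)` wrapper."""
    m = JSONP_RE.match(body)
    return json.loads(m.group('payload') if m else body)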