def after_login(self, response):
    """Handle the SSO login response.

    Reads the cross-domain URL list from the login JSON, remembers the
    logged-in uid, and kicks off the weibo.com cross-domain request chain.
    """
    payload = json.loads(response.body_as_unicode())
    crossdomainlist = payload['data']['crossdomainlist']
    self.uid = payload['data']['uid']

    params = {
        'savestate': '1',
        'callback': 'jsonpcallback%13d' % (time.time() * 1000),
    }
    url_items = {
        'url_weibo_com': get_update_url(crossdomainlist['weibo.com'], params),
        'url_sina_com_cn': get_update_url(crossdomainlist['sina.com.cn'], params),
        'url_weibo_cn': get_update_url(crossdomainlist['weibo.cn'], params),
    }
    meta = dict(response.meta, **url_items)

    # Cross-domain handling for weibo.com
    yield scrapy.Request(url=url_items['url_weibo_com'],
                         callback=self.crossdomain_weibo_com,
                         meta=meta)
def start_requests(self):
    """Entry point: pop one fetch task and request its Toutiao profile page.

    :return: generator yielding the profile-page request
    """
    share_params = {
        'version_code': '6.4.2',
        'version_name': '',
        'device_platform': 'iphone',
        'tt_from': 'weixin',
        'utm_source': 'weixin',
        'utm_medium': 'toutiao_ios',
        'utm_campaign': 'client_share',
        'wxshare_count': '1',
    }

    task_id = pop_task(self.name)
    if not task_id:
        print('%s task is empty' % self.name)
        return
    print('%s task id: %s' % (self.name, task_id))

    task_item = get_item(FetchTask, task_id)
    fetch_url = 'http://m.toutiao.com/profile/%s/' % task_item.follow_id
    url_profile = get_update_url(fetch_url, share_params)

    # Carry the task context through the whole request chain.
    meta = {
        key: getattr(task_item, key)
        for key in ('platform_id', 'channel_id', 'follow_id', 'follow_name')
    }
    meta['task_id'] = task_item.id

    yield scrapy.Request(url=url_profile,
                         callback=self.get_profile,
                         meta=meta)
def parse_article_list(self, response):
    """Parse an article-list JSONP response: paginate and fetch details.

    Fix: ``str.lstrip``/``str.rstrip`` treat their argument as a character
    SET, not an exact prefix/suffix, so ``lstrip('jsonp3(')`` could also
    strip leading payload characters (and fails entirely when the body has
    leading whitespace).  The JSONP wrapper is now removed explicitly.

    :param response: JSONP article-list response
    :return: generator of next-page / detail requests
    """
    body = response.body_as_unicode()
    jsonp_index = response.meta.get('jsonp_index', 0)

    # Unwrap "jsonpN( ... )" exactly, not as a character set.
    payload = body.strip()
    prefix = 'jsonp%d(' % jsonp_index
    if payload.startswith(prefix):
        payload = payload[len(prefix):]
    if payload.endswith(')'):
        payload = payload[:-1]
    result = json.loads(payload)

    # Pagination
    if result.get('has_more'):
        max_behot_time = result['next']['max_behot_time']
        AS, CP = get_as_cp()
        next_index = jsonp_index + 1
        url_params_next = {
            'max_behot_time': max_behot_time,
            'as': AS,
            'cp': CP,
            'callback': 'jsonp%d' % next_index,
        }
        url_article_list_next = get_update_url(response.url, url_params_next)
        meta = dict(response.meta, jsonp_index=next_index)
        yield scrapy.Request(url=url_article_list_next,
                             callback=self.parse_article_list,
                             meta=meta)

    # Detail pages
    for data_item in result.get('data', []):
        detail_url = data_item.get('source_url')
        meta = dict(response.meta, detail_url=detail_url)
        yield scrapy.Request(url=detail_url,
                             callback=self.parse_article_detail,
                             meta=meta)
def start_requests(self):
    """Entry point: pop one task and search Sogou Weixin for the account.

    :return: generator yielding the account-search request
    """
    boot_url = 'http://weixin.sogou.com/weixin'

    task_id = pop_task(self.name)
    if not task_id:
        print('%s task is empty' % self.name)
        return

    task_item = get_item(FetchTask, task_id)
    cookies_id, cookies = get_cookies(self.name)

    # Search by account display name (not follow_id).
    search_params = {
        'type': 1,
        'query': task_item.follow_name.encode('utf-8'),
    }
    url_profile = get_update_url(boot_url, search_params)

    # Carry the task context through the whole request chain.
    meta = {
        key: getattr(task_item, key)
        for key in ('platform_id', 'channel_id', 'follow_id', 'follow_name')
    }
    meta['task_id'] = task_item.id
    # Bind this request to the cookie jar the cookies were loaded from.
    meta['cookiejar'] = cookies_id

    yield scrapy.Request(url=url_profile,
                         cookies=cookies,
                         callback=self.parse_account_search_list,
                         meta=meta)
def jssdk_signature(self, response):
    """Handle the JSSDK signature response and request the article list."""
    as_token, cp_token = get_as_cp()
    jsonp_index = 3
    list_params = {
        'page_type': 1,
        'max_behot_time': '',
        'uid': response.meta['userid'],
        'media_id': response.meta['mediaid'],
        'output': 'json',
        'is_json': 1,
        'count': 20,
        'from': 'user_profile_app',
        'version': 2,
        'as': as_token,
        'cp': cp_token,
        'callback': 'jsonp%d' % jsonp_index,
    }
    url_article_list = get_update_url('https://www.toutiao.com/pgc/ma/',
                                      list_params)
    yield scrapy.Request(
        url=url_article_list,
        callback=self.parse_article_list,
        meta=dict(response.meta, jsonp_index=jsonp_index))
def jssdk_signature(self, response):
    """Handle the JSSDK signature response and request the article list.

    Fix: the freshly computed URL was being discarded and replaced by a
    hard-coded debug URL whose ``as``/``cp`` signature tokens and uid are
    frozen (signatures go stale quickly); that override and the leftover
    debug ``print`` calls have been removed.
    """
    AS, CP = get_as_cp()
    jsonp_index = 3
    url = 'https://www.toutiao.com/pgc/ma/'
    url_params = {
        'page_type': 1,
        'max_behot_time': '',
        'uid': response.meta['userid'],
        'media_id': response.meta['mediaid'],
        'output': 'json',
        'is_json': 1,
        'count': 20,
        'from': 'user_profile_app',
        'version': 2,
        'as': AS,
        'cp': CP,
        'callback': 'jsonp%d' % jsonp_index,
    }
    url_article_list = get_update_url(url, url_params)
    meta = dict(response.meta, jsonp_index=jsonp_index)
    yield scrapy.Request(url=url_article_list,
                         callback=self.parse_article_list,
                         meta=meta)
def start_requests(self):
    """Entry point: request a Toutiao JSSDK signature.

    Fix: the first ``url_params`` dict (iphone share parameters) was dead
    code — it was immediately overwritten by the signature parameters —
    and has been removed.

    :return: generator yielding the signature request
    """
    url = 'http://open.snssdk.com/jssdk_signature/'
    url_params = {
        'appid': 'wxe8b89be1715734a0',
        'noncestr': 'Wm3WZYTPz0wzccnW',
        'timestamp': '%13d' % (time.time() * 1000),
        'callback': 'jsonp2',
    }
    url_jssdk_signature = get_update_url(url, url_params)
    yield scrapy.Request(
        url=url_jssdk_signature,
        callback=self.jssdk_signature,
        headers=self.custom_settings['DEFAULT_REQUEST_HEADERS'],
        cookies=None)
def get_profile(self, response):
    """Extract the Toutiao user/media ids and request a JSSDK signature."""
    share_btn = response.xpath('//button[@itemid="topsharebtn"]')
    userid = share_btn.xpath('@data-userid').extract_first(default='')
    mediaid = share_btn.xpath('@data-mediaid').extract_first(default='')

    # NOTE(review): this appid ends in '...a6' while other requests in
    # this project use '...a0' — confirm which one is valid.
    sign_params = {
        'appid': 'wxe8b89be1715734a6',
        'noncestr': 'Wm3WZYTPz0wzccnW',
        'timestamp': '%13d' % (time.time() * 1000),
        'callback': 'jsonp2',
    }
    url_jssdk_signature = get_update_url(
        'http://open.snssdk.com/jssdk_signature/', sign_params)

    yield scrapy.Request(
        url=url_jssdk_signature,
        callback=self.jssdk_signature,
        meta=dict(response.meta, userid=userid, mediaid=mediaid))
def login_sina_sso_prelogin(self, response):
    """Start the Sina SSO prelogin step with freshly loaded credentials."""
    credentials = get_login_data()
    self.login_form_data.update(credentials)

    prelogin_params = {
        'checkpin': '1',
        'entry': 'mweibo',
        'su': get_su(credentials.get('username', '')),
        'callback': 'jsonpcallback%13d' % (time.time() * 1000),
    }
    request_url = get_update_url(
        'https://login.sina.com.cn/sso/prelogin.php', prelogin_params)

    yield scrapy.Request(url=request_url,
                         callback=self.passport_weibo_sso_login)
def fetch_proxy(country='China', scheme='http'):
    """Fetch a list of proxies from the proxy service.

    :param country: country filter, skipped when falsy
    :param scheme: proxy type filter, skipped when falsy
    :return: list of '<type>://<ip:port>' strings
    """
    query = {}
    if country:
        query['country'] = country
    if scheme:
        query['type'] = scheme

    api_url = get_update_url('http://proxy.nghuyong.top/', query)
    payload = requests.get(api_url, timeout=REQUESTS_TIME_OUT).json()
    return ['%s://%s' % (item['type'], item['ip_and_port'])
            for item in payload.get('data', [])]
def jssdk_signature(self, response):
    """Handle the JSSDK signature response and request the hot-news list.

    Fix: removed the large block of commented-out (dead) user-profile
    parameters, the commented-out hard-coded URL, and the leftover debug
    ``print`` calls.
    """
    AS, CP = get_as_cp()
    jsonp_index = 3
    url = 'https://m.toutiao.com/list/'
    url_params = {
        'tag': 'news_hot',
        'max_behot_time': '%10d' % time.time(),
        'format': 'json_raw',
        'output': 'json',
        'is_json': 1,
        'count': 20,
        'version': 2,
        'as': AS,
        'cp': CP,
        'callback': 'jsonp%d' % jsonp_index,
    }
    url_article_list = get_update_url(url, url_params)
    meta = dict(response.meta, jsonp_index=jsonp_index)
    yield scrapy.Request(url=url_article_list,
                         callback=self.parse_article_list,
                         meta=meta)
def parse_article_list(self, response):
    """Parse an article-list JSONP response.

    Yields one detail request per article, then one request for the next
    page when the service reports more data.

    Fixes:
    - ``str.lstrip``/``str.rstrip`` treat their argument as a character
      SET, not an exact prefix/suffix, so the JSONP wrapper is now removed
      explicitly.
    - leftover debug ``print`` calls removed.
    - ``data_item['keywords'] if 'keywords' in data_item else ''`` replaced
      with ``dict.get``.

    :param response: JSONP article-list response
    :return: generator of detail / next-page requests
    """
    body = response.body_as_unicode()
    jsonp_index = response.meta.get('jsonp_index', 0)

    # Unwrap "jsonpN( ... )" exactly, not as a character set.
    payload = body.strip()
    prefix = 'jsonp%d(' % jsonp_index
    if payload.startswith(prefix):
        payload = payload[len(prefix):]
    if payload.endswith(')'):
        payload = payload[:-1]
    result = json.loads(payload)

    # Detail pages
    for data_item in result.get('data', []):
        article_url = self.web_host_url + data_item.get('source_url')
        detail_url = article_url + 'info/'
        meta = dict(
            response.meta,
            detail_url=detail_url,
            article_url=article_url,
            item_id=data_item['item_id'],
            article_title=data_item['title'],
            article_pub_time=data_item['behot_time'],
            keywords=data_item.get('keywords', ''),
        )
        yield scrapy.Request(url=detail_url,
                             callback=self.parse_article_detail,
                             meta=meta)

    # Pagination
    if result.get('has_more'):
        max_behot_time = ''
        if 'next' in result and 'max_behot_time' in result['next']:
            max_behot_time = result['next']['max_behot_time']
        AS, CP = get_as_cp()
        next_index = jsonp_index + 1
        url_params_next = {
            'max_behot_time': max_behot_time or '%10d' % time.time(),
            'as': AS,
            'cp': CP,
            'callback': 'jsonp%d' % next_index,
        }
        url_article_list_next = get_update_url(response.url, url_params_next)
        meta = dict(response.meta, jsonp_index=next_index)
        # NOTE(review): time.sleep blocks the whole Scrapy reactor; prefer
        # DOWNLOAD_DELAY / AutoThrottle.  Kept to preserve throttling.
        time.sleep(self.FRESH_DELAY)
        yield scrapy.Request(url=url_article_list_next,
                             callback=self.parse_article_list,
                             meta=meta)