def get_hot_videos(self, url="", max_page=10, **kwargs):
    data_list = []
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-encoding": "gzip, deflate",
        "accept-language": "zh,zh-CN;q=0.9",
        "cache-control": "max-age=0",
        # "cookie": "pgv_pvi=3517925376; pgv_pvid=3591400976; RK=sDRQYhGkF/; ptcz=8100687e80e810853d573a8a9ced1155a9a9683321075161f61b773de19ff4c5; pac_uid=0_bf3968e8e3157; ts_uid=1260359885; tvfe_boss_uuid=082fecb8ba01b06d; QQLivePCVer=50181223; video_guid=ce0aa0f8275ad435; video_platform=2; bucket_id=9231001; mobileUV=1_1707c108811_53c13; tvfe_search_uid=3c2fd48b-03f8-4f63-af8c-bb2bd367af2b; ts_refer=www.baidu.com/link; pgv_info=ssid=s7741803072; ad_play_index=80",
        # "if-modified-since": "Fri, 28 Feb 2020 08:00:00 GMT",
        "referer": "https://v.qq.com/biu/ranks/?t=hotsearch&channel=hot",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",
        "sec-fetch-user": "******",
        "upgrade-insecure-requests": "1",
    }
    res = retry_get_url(url, headers=headers, timeout=10, proxies=3)
    page_text = res.content.decode("utf-8")
    html = etree.HTML(page_text)
    # Each entry on the hot-search ranks page links to one video page; crawl each link.
    xpath_list = html.xpath("//body[@class='page_search']/div[@class='search_container']/div[@class='wrapper']/div[@class='wrapper_main']/div")
    for li in xpath_list:
        title_url = li.xpath("./a/@href")
        if title_url:
            data = crawler_qq_video_page(title_url[0])
            if not data:
                continue
            data["is_hot"] = 1
            data_list.append(data)
    output_result(result_Lst=data_list, platform=self.platform, output_to_es_raw=True)
    data_list.clear()
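# NOTE: the methods in this module lean on a shared `retry_get_url` helper imported
# from elsewhere in the crawler package; its real implementation is not shown here.
# The sketch below is a minimal, assumed version (the function name with the
# `_sketch` suffix is hypothetical, and the `proxies` argument -- an integer count
# at the call sites above -- is treated as a number of proxy-rotation attempts,
# which is an inference, not confirmed by the source).
import time

import requests


def _retry_get_url_sketch(url, headers=None, timeout=10, proxies=0, max_retry=3, **kwargs):
    """Assumed contract: retry a GET up to max_retry times and return the first
    successful response, or None if every attempt fails."""
    for _ in range(max_retry):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout, **kwargs)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            time.sleep(1)  # brief back-off before the next attempt
    return None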
def search_page(self, title=None, search_json=None, **kwargs):
    data_list = []
    title = urllib.parse.quote(title)
    headers = {
        "Accept-Encoding": "gzip",
        # "X-SS-REQ-TICKET": "1587102750860",
        "passport-sdk-version": "14",
        "sdk-version": "2",
        # "Cookie": "odin_tt=d5d96b2812637e9d20681530fbbe4d52e8f76ae1b6afa8c0a173260321611c507ac6eca10991b21fc4f023e94371d457df784f959e94db673ef29a5bd2137091; qh[360]=1; history=alrvlFic6pJZXJCTWBmSmZt6KW6mevZSz5LU3OJ7DEKX42Zw%2Bc84wMR3iYGBweFy3EzZsPcNTLyXWN1AvLYP8%2BQPMLFfEpUA8bo%2F7nNtYOK7xNwC4k3XmMHe5MtzSTiM48DluNr01dkNTDyXuHrApsi4ejkwsV%2BSmAPmSeXoMzDxXhKcAuIVrRfWAJnJJwA25fG1DoezvFBTZrzZeg6kT%2BwWSG7Gx3UJB5h4L%2FH4gXlVn%2BtAtkvFMQRcjpv%2B%2Be9TBib2S%2BwcYBuUn8xsYGK%2FJKMAkptgfXrDASaOS4yHQHJVPy6UOjDxXuI4BeJN26Fs6MDEcYn%2FEoMDAAAA%2F%2F8%3D; install_id=112651077855; ttreq=1$0b37d53ca5c301ce96959dc97a67886da420b294",
        # "X-Gorgon": "0401007140017aae019cc2020b1c48dbab0ba42839014487648a",
        # "X-Khronos": "1587102750",
        "Host": "is.snssdk.com",
        "Connection": "Keep-Alive",
        "User-Agent": "okhttp/3.10.0.1",
    }
    url = "https://is.snssdk.com/api/search/content/?os_api=23&device_type=oneplus+a5010&from_search_subtab=synthesis&manifest_version_code=7690&source=search_subtab_switch&offset=0&is_ttwebview=0&action_type&is_incognito=0&keyword_type&rom_version=23&app_name=news_article&format=json&version_name=7.6.9&ac=wifi&host_abi=armeabi-v7a&update_version_code=76909&channel=baidu_0411&is_native_req=1&loadId=1&longitude=116.40717530841052&isIncognito=0&plugin=2050&forum=1&latitude=39.904680919672145&language=zh&pd=video&cur_tab_title=search_tab&aid=13&dpi=270&qrecImprId&fetch_by_ttnet=1&count=10&plugin_enable=3&search_position&ab_group=100167%2C94569%2C102754&keyword={0}&scm_version=1.0.2.830&search_json=%7B%22comment_ids%22%3A%5B%5D%2C%22event_discussion%22%3A74123%2C%22event_impression%22%3A17270790%2C%22forum_id%22%3A1664181806902302%2C%22forum_recall_wtt%22%3A%5B1664190666034183%2C1664192273575943%2C1664184430218253%2C1664185769175051%2C1664184985139212%2C1664196237152267%2C1664186792648732%2C1664188755414019%2C1664187055838215%2C1664184182571022%2C1664185938950148%2C1664188041995268%2C1664188322863172%2C1664190185024520%2C1664185602828300%2C1664184276484099%2C1664188211399684%2C1664187870713868%2C1664184484958211%2C1664183864289288%2C1664186825487371%2C1664195548700686%2C1664186585780228%2C1664197296210947%2C1664188146725901%2C1664191748459523%5D%2C%22group_source%22%3Anull%2C%22hot_gid%22%3A6816255461172445703%2C%22log_pb%22%3A%7B%22cluster_type%22%3A%220%22%2C%22entrance_hotspot%22%3A%22channel%22%2C%22hot_board_cluster_id%22%3A%226816091697949180424%22%2C%22hot_board_impr_id%22%3A%22202004171352010100140411610B1A7741%22%2C%22location%22%3A%22hot_board%22%2C%22rank%22%3A%225%22%2C%22source%22%3A%22trending_tab%22%2C%22style_id%22%3A%2210005%22%7D%2C%22mix_stick_ids%22%3A%5B1664190666034183%2C1664192273575943%2C1664184430218253%2C1664185769175051%2C1664184985139212%2C1664196237152267%2C1664186792648732%2C1664188755414019%2C1664187055838215%2C1664184182571022%2C1664185938950148%2C1664188041995268%2C1664188322863172%2C1664190185024520%2C1664185602828300%2C1664184276484099%2C1664188211399684%2C1664187870713868%2C1664184484958211%2C1664183864289288%2C1664186825487371%2C1664195548700686%2C1664186585780228%2C1664197296210947%2C1664188146725901%2C1664191748459523%5D%2C%22stick_group_ids%22%3A%5B%5D%7D&device_platform=android&search_id&has_count=0&version_code=769&from=video&device_id={1}&resolution=1080*1920&os_version=6.0.1&device_brand=Oneplus&search_sug=1&qc_query".format(title, random.randint(69418800000, 69418899999))
    res = retry_get_url(url, headers=headers, timeout=5, proxies=3)
    page_text = res.json()
    for one_video in page_text["data"]:
        video_dic = {}
        try:
            video_dic['title'] = one_video.get('title')
            video_dic['url'] = one_video.get('display').get("info").get("url")
            releaser_id = re.findall(r"user_id=(\d+)", one_video.get('user_source_url'))[0]
            video_dic['releaser'] = one_video.get('media_name')
            video_dic['releaserUrl'] = "https://www.toutiao.com/c/user/%s/" % releaser_id
            release_time = int(one_video.get('create_time'))
            video_dic['release_time'] = int(release_time * 1e3)
            video_dic['duration'] = int(one_video.get('video_duration'))
            video_dic['play_count'] = trans_play_count(one_video.get('play_effective_count'))
            video_dic['repost_count'] = 0
            video_dic['comment_count'] = one_video.get('comment_count')
            video_dic['favorite_count'] = one_video.get('digg_count')
            video_dic['fetch_time'] = int(datetime.datetime.now().timestamp() * 1e3)
            video_dic['releaser_id_str'] = "toutiao_%s" % releaser_id
            video_dic['video_img'] = one_video.get('display').get('self_info').get('image_url')
            video_dic['platform'] = "toutiao"
            video_dic["is_hot"] = 1
            video_dic["data_provider"] = "CCR"
        except Exception as e:
            print(e)
            continue
        data_list.append(video_dic)
    output_result(result_Lst=data_list, platform=self.platform, output_to_es_raw=True)
    data_list.clear()
def search_page(self, keyword, search_pages_max=30, output_to_es_raw=False,
                output_to_es_register=False, es_index=None, doc_type=None):
    search_data_Lst = []
    page_urls = ('http://www.soku.com/nt/search/q_' + keyword +
                 '_orderby_2_limitdate_0?spm=a2h0k.8191414.0.0&site=14&page={}'.format(str(i))
                 for i in range(1, search_pages_max + 1))
    for page_url in page_urls:
        print(page_url)
        get_page = requests.get(page_url)
        page = get_page.text
        soup = BeautifulSoup(page, 'html.parser')
        potato = soup.find_all("div", {"class": "v"})
        for data_line in potato:
            duration_str = data_line.find("span", {"class": "v-time"}).text
            dl_int = [int(v) for v in duration_str.split(':')]
            if len(dl_int) == 2:
                duration = dl_int[0] * 60 + dl_int[1]
            else:
                # fixed: an hour is 3600 seconds, not 3660
                duration = dl_int[0] * 3600 + dl_int[1] * 60 + dl_int[2]
            url = data_line.find('div', {'class': 'v-meta-title'}).a['href']
            url = 'http:' + url
            one_video_dic = self.video_page(url)
            one_video_dic['url'] = url
            one_video_dic['duration'] = duration
            search_data_Lst.append(one_video_dic)
        print('get page done')
        if len(search_data_Lst) >= 100:
            output_result(result_Lst=search_data_Lst, platform=self.platform,
                          output_to_es_raw=output_to_es_raw,
                          output_to_es_register=output_to_es_register,
                          es_index=es_index, doc_type=doc_type)
            search_data_Lst.clear()
    if search_data_Lst != []:
        output_result(result_Lst=search_data_Lst, platform=self.platform,
                      output_to_es_raw=output_to_es_raw,
                      output_to_es_register=output_to_es_register,
                      es_index=es_index, doc_type=doc_type)
    return search_data_Lst
def search_page(self, title=None, **kwargs):
    data_list = []
    headers = {
        "Host": "aweme.snssdk.com",
        "Connection": "keep-alive",
        # "Cookie": "d_ticket=38c841789e38ea43c6338910dac65ffe192e3; odin_tt=82086544bb9028f027b5aea78724ccf512dead26658f45321be33bade615793782bf6ac7fe0c18b73b9592f4284413d5300974810d439b42ef0b3eaa761b1640; msh=cakLg8lvbK5CxiSWkIbD2UInwAI; sid_guard=09fe3dfd89dfbc79f081fb2db9dd81ee%7C1581832192%7C5184000%7CThu%2C+16-Apr-2020+05%3A49%3A52+GMT; uid_tt=da0b53b7563eca87c47da41f5f17c30f; uid_tt_ss=da0b53b7563eca87c47da41f5f17c30f; sid_tt=09fe3dfd89dfbc79f081fb2db9dd81ee; sessionid=09fe3dfd89dfbc79f081fb2db9dd81ee; sessionid_ss=09fe3dfd89dfbc79f081fb2db9dd81ee; install_id=104847319549; ttreq=1$51e484720311469c4b70f4754d730d538a074c4b",
        # "X-SS-REQ-TICKET": "1583139618192",
        # "X-Tt-Token": "0009fe3dfd89dfbc79f081fb2db9dd81ee013243f7134b3eb37249cc729a5276172df69a4391b56ae4bf253c3c6352322611",
        "sdk-version": "1",
        # "x-tt-trace-id": "00-9a797f160a107b431078db3e93480468-9a797f160a107b43-01",
        "User-Agent": "com.ss.android.ugc.aweme/990 (Linux; U; Android 5.1.1; zh_CN; OPPO R11; Build/NMF26X; Cronet/77.0.3844.0)",
        "Accept-Encoding": "gzip, deflate",
        # "X-Gorgon": "0401a0514001f64964a8ebef9f4305ccbef2df1aa3c92fdf955a",
        # "X-Khronos": "1583139618",
    }
    url = "https://aweme.snssdk.com/aweme/v1/hot/search/video/list/?hotword={0}&offset=0&count=12&source=trending_page&is_ad=0&item_id_list&is_trending=0&os_api=22&device_type=OPPO%20R11&ssmix=a&manifest_version_code=990&dpi=320&uuid=866174725888628&app_name=aweme&version_name=9.9.0&ts=1583139619&app_type=normal&ac=wifi&update_version_code=9902&channel=tengxun_new&_rticket=1583139618192&device_platform=android&iid=104847319549&version_code=990&cdid=fce00742-ccef-4b14-943d-1f62b6d637b0&openudid=48a4725886d57203&device_id=70787469432&resolution=900*1600&os_version=5.1.1&language=zh&device_brand=OPPO&aid=1128&mcc_mnc=46007".format(title)
    res = retry_get_url(url, headers=headers, timeout=5, proxies=3)
    page_text = res.json()
    for one_video in page_text["aweme_list"]:
        video_dic = {}
        video_dic['title'] = one_video.get('desc')
        video_dic['url'] = one_video.get('share_url')
        releaser_id = one_video.get('author_user_id')
        video_dic['releaser'] = one_video.get('author').get("nickname")
        video_dic['releaserUrl'] = "https://www.iesdouyin.com/share/user/%s" % releaser_id
        release_time = one_video.get('create_time')
        video_dic['release_time'] = int(release_time * 1e3)
        video_dic['duration'] = int(one_video.get('duration') / 1000)
        video_dic['play_count'] = 0
        video_dic['repost_count'] = one_video.get('statistics').get('share_count')
        video_dic['comment_count'] = one_video.get('statistics').get('comment_count')
        video_dic['favorite_count'] = one_video.get('statistics').get('digg_count')
        video_dic['video_id'] = one_video.get('aweme_id')
        video_dic['fetch_time'] = int(datetime.datetime.now().timestamp() * 1e3)
        video_dic['releaser_id_str'] = "抖音_%s" % releaser_id
        video_dic['platform'] = "抖音"
        video_dic['video_img'] = one_video.get('video').get('cover').get('url_list')[0]
        video_dic["is_hot"] = 1
        video_dic["data_provider"] = "CCR"
        data_list.append(video_dic)
    output_result(result_Lst=data_list, platform=self.platform, output_to_es_raw=True)
    data_list.clear()
def parse_video_page_single_process(self, output_to_file=False, filepath=None,
                                    push_to_redis=False, output_to_es_raw=False,
                                    es_index=None, doc_type=None,
                                    output_to_es_register=False):
    error_url_list = []
    pid = os.getpid()
    key = 'new_tudou_video_page_html'
    result_lst = []
    while connect_with_redis.length_of_lst(key) > 0:
        video_page_html = connect_with_redis.retrieve_video_page_html_from_redis(platform=self.platform)
        video_info = self.parse_video_page_html(video_page_html)
        try:
            video_info['title']
        except (TypeError, KeyError):  # parse failed or returned None
            continue
        if video_info['title'] is not None:
            result_lst.append(video_info)
            print("platform: %s, action: parse video page, process_id: %s, "
                  "count number: %s" % (self.platform, pid, len(result_lst)))
            if len(result_lst) >= 1000:
                output_result(result_Lst=result_lst, platform=self.platform,
                              output_to_file=output_to_file, filepath=filepath,
                              push_to_redis=push_to_redis,
                              output_to_es_raw=output_to_es_raw,
                              es_index=es_index, doc_type=doc_type,
                              output_to_es_register=output_to_es_register)
                result_lst.clear()
        else:
            try:
                error_url_list.append(video_info['url'])
            except KeyError:
                pass
    if result_lst != []:
        output_result(result_Lst=result_lst, platform=self.platform,
                      output_to_file=output_to_file, filepath=filepath,
                      push_to_redis=push_to_redis,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index, doc_type=doc_type,
                      output_to_es_register=output_to_es_register)
    if error_url_list != []:
        # requeue failed pages for another pass
        connect_with_redis.push_video_url_to_redis_set(platform=self.platform,
                                                       url_lst=error_url_list)
def list_page_sync(self, list_page_url, channel=None, output_to_file=False,
                   filepath=None, output_to_es_raw=False,
                   output_to_es_register=False, push_to_redis=False,
                   page_num_max=30):
    if list_page_url not in self.legal_list_page_urls:
        print('Wrong list page url, must be one of %s' % self.list_page_url_dict)
        return None
    list_page_data = []
    first_page = self.list_page_single(list_page_url, channel)
    list_page_data.extend(first_page)
    page_num = 2
    while page_num <= page_num_max:
        paged_url = self.get_paged_url_for_list_page(list_page_url, page_num)
        if paged_url is not None:
            paged_result = self.list_page_single(paged_url, channel)
            list_page_data.extend(paged_result)
            page_num += 1
        else:
            print('Failed to form paged url for original list page url: %s' % list_page_url)
            return None
        if len(list_page_data) >= 100:
            output_result(list_page_data, self.platform,
                          output_to_file=output_to_file, filepath=filepath,
                          output_to_es_raw=output_to_es_raw,
                          output_to_es_register=output_to_es_register,
                          push_to_redis=push_to_redis)
            list_page_data.clear()
    # if list_page_data != []:
    output_result(list_page_data, self.platform,
                  output_to_file=output_to_file, filepath=filepath,
                  output_to_es_raw=output_to_es_raw,
                  output_to_es_register=output_to_es_register,
                  push_to_redis=push_to_redis)
    list_page_data.clear()
    return list_page_data
def releaser_page_by_time(self, start_time, end_time, url, **kwargs):
    data_lis = []
    count_false = 0
    output_to_file = kwargs.get("output_to_file")
    filepath = kwargs.get("filepath")
    push_to_redis = kwargs.get("push_to_redis")
    output_to_es_register = kwargs.get("output_to_es_register")
    output_to_es_raw = kwargs.get("output_to_es_raw")
    es_index = kwargs.get("es_index")
    doc_type = kwargs.get("doc_type")
    for res in self.releaser_page(url, proxies_num=kwargs.get("proxies_num")):
        video_time = res["release_time"]
        if video_time:
            if start_time < video_time:
                if video_time < end_time:
                    data_lis.append(res)
                    if len(data_lis) >= 100:
                        output_result(result_Lst=data_lis,
                                      platform=self.platform,
                                      output_to_file=output_to_file,
                                      filepath=filepath,
                                      push_to_redis=push_to_redis,
                                      output_to_es_register=output_to_es_register,
                                      output_to_es_raw=output_to_es_raw,
                                      es_index=es_index,
                                      doc_type=doc_type)
                        data_lis.clear()
            else:
                # video is older than start_time; tolerate a few pinned/out-of-order
                # posts before giving up on the feed
                count_false += 1
                if count_false > 10:
                    break
        else:
            continue
    if data_lis != []:
        output_result(result_Lst=data_lis, platform=self.platform,
                      output_to_file=output_to_file, filepath=filepath,
                      push_to_redis=push_to_redis,
                      output_to_es_register=output_to_es_register,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index, doc_type=doc_type)
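# NOTE: every method in this module funnels its batches through a shared
# `output_result` sink (flush at ~100 items, 1000 for the HTML parsers, then
# clear the list). The real function lives elsewhere in the package; the sketch
# below is only an assumed outline of its contract, with a hypothetical name,
# and implements just the file sink -- the ES and redis branches are
# project-specific and left as comments.
import json


def _output_result_sketch(result_Lst, platform, output_to_file=False, filepath=None,
                          push_to_redis=False, output_to_es_raw=False,
                          output_to_es_register=False, es_index=None, doc_type=None):
    # Each boolean flag routes the same batch to a different sink; caller-side
    # batching (len(result_Lst) >= 100) keeps ES bulk writes reasonably sized.
    if output_to_file and filepath:
        with open(filepath, "a", encoding="utf-8") as f:
            for row in result_Lst:
                row.setdefault("platform", platform)
                f.write(json.dumps(row, ensure_ascii=False) + "\n")
    # if output_to_es_raw: bulk-index result_Lst into es_index/doc_type
    # if output_to_es_register: register ids for dedup
    # if push_to_redis: enqueue for downstream workers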
def search_page(self, title):
    data_list = []
    # the endpoint expects the keyword base64-encoded inside the URL path
    encodestr = base64.b64encode(title.encode('utf-8'))
    encodestr = str(encodestr, 'utf-8')
    url = "http://c.m.163.com/search/comp2/Kg%3D%3D/20/{0}.html?".format(encodestr)
    para = "deviceId=2zx5YfHmoBb72ayxYpQVUg%3D%3D&version=newsclient.32.1.android&channel=VDEzNDg2NDc5MDkxMDc%3D&canal=bmV3c19sZl9jcGFfMg%3D%3D&dtype=0&tabname=shipin&position=5YiX6KGo6aG26YOo&ts={0}&sign=Di3opZw%2FFIPDdgreSK4VCKlnMSpm6FPoel5LeY88RgZ48ErR02zJ6%2FKXOnxX046I&spever=FALSE&open=scheme_%E9%BB%98%E8%AE%A4&openpath=/video/VT5O1KVCO".format(str(int(datetime.datetime.now().timestamp())))
    res = retry_get_url(url + para, headers=self.headers, timeout=5, proxies=3)
    page_text = res.json()
    for data in page_text["doc"]["result"]:
        print(data)
        data_list.append(data)
    output_result(result_Lst=data_list, platform=self.platform, output_to_es_raw=True)
    data_list.clear()
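# The NetEase search path above embeds the keyword as base64: the fixed
# "Kg%3D%3D" segment is the percent-encoded form of "Kg==", which is base64
# for "*" (a wildcard). A quick round-trip demo of the encoding used by
# search_page (the variable names here are illustrative only; search_page
# itself inserts the padded base64 value into the URL directly, which is
# equivalent once the '=' padding is percent-encoded):
import base64
import urllib.parse

keyword = "短视频"
encoded = base64.b64encode(keyword.encode("utf-8")).decode("utf-8")  # "55+t6KeG6aKR"
path_piece = urllib.parse.quote(encoded)  # percent-encodes any '=' padding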
def search_page(self, keyword, search_pages_max=30, output_to_es_raw=False,
                output_to_es_register=False, es_index=None, doc_type=None):
    search_data_Lst = []
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh,zh-CN;q=0.9",
        "cookie": "SOKUSESSID=1553755022290AhL; cna=2U8aFb1yaVcCAdr3nSX3f47K; _uab_collina=155799409092003127272852; __ayft=1579161875253; __aysid=1579161875253Zl2; __ayscnt=1; P_ck_ctl=5A18564922916D48652E056B1E408EDE; SK_QUERY=%5B%7B%22q%22%3A%22%25E6%2587%2582%25E8%25BD%25A6%25E8%2580%2581%25E5%25BC%25A0%22%7D%2C%7B%22q%22%3A%22%25E5%25B0%258F%25E9%25B9%258F%22%7D%5D; JSESSIONID=57F80DCADD926955C8B91971B5334C42; __arpvid=1579162484198LNMIn3-1579162484216; __aypstp=8; __ayspstp=8; isg=BBQUwi2xNucxPqDWRI5_X4Eo5VKGbThX3kr36q707x8ombXj13kz5f1fnZkBYXCv",
        "referer": "https://www.soku.com/nt/search/q_%E6%87%82%E8%BD%A6%E8%80%81%E5%BC%A0_orderby_2_limitdate_0?spm=a2h0k.8191414.0.0&site=14&_lg=10&page=2",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "******",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
    }
    page_urls = ('https://www.soku.com/nt/search/q_' + keyword +
                 '_orderby_2_limitdate_0?&spm=a2h0k.8191414.0.0&site=14&_lg=10&page={}'.format(str(i))
                 for i in range(1, search_pages_max + 1))
    for page_url in page_urls:
        print(page_url)
        get_page = requests.get(page_url, headers=headers)
        page = get_page.text
        soup = BeautifulSoup(page, 'html.parser')
        potato = soup.find_all("div", {"class": "v"})
        for data_line in potato:
            duration_str = data_line.find("span", {"class": "v-time"}).text
            dl_int = [int(v) for v in duration_str.split(':')]
            if len(dl_int) == 2:
                duration = dl_int[0] * 60 + dl_int[1]
            else:
                # fixed: an hour is 3600 seconds, not 3660
                duration = dl_int[0] * 3600 + dl_int[1] * 60 + dl_int[2]
            url = data_line.find('div', {'class': 'v-meta-title'}).a['href']
            url = 'http:' + url
            one_video_dic = self.video_page(url)
            one_video_dic['url'] = url
            one_video_dic['duration'] = duration
            one_video_dic['search_word'] = keyword
            search_data_Lst.append(one_video_dic)
        print('get page done')
        if len(search_data_Lst) >= 100:
            output_result(result_Lst=search_data_Lst, platform=self.platform,
                          output_to_es_raw=output_to_es_raw,
                          output_to_es_register=output_to_es_register,
                          es_index=es_index, doc_type=doc_type)
            search_data_Lst.clear()
    if search_data_Lst != []:
        output_result(result_Lst=search_data_Lst, platform=self.platform,
                      output_to_es_raw=output_to_es_raw,
                      output_to_es_register=output_to_es_register,
                      es_index=es_index, doc_type=doc_type)
    return search_data_Lst
def list_page(self, rid, page_num=1, channel=None, output_to_file=False,
              filepath=None, output_to_es_raw=False, output_to_es_register=False,
              push_to_redis=False, page_num_max=34, output_es_index=None,
              output_doc_type=None, proxy_dic=None):
    result_lst = []
    fail_time = 0
    record_id = None
    while page_num <= page_num_max and fail_time < 5:
        lst_url = ('https://api.bilibili.com/x/web-interface/newlist?rid=' + rid +
                   '&type=0&pn=' + str(page_num) + '&ps=20')
        if proxy_dic is not None:
            raw_proxy_dic = extract_data_to_use()
            record_id = raw_proxy_dic['id']
            proxy_dic = build_proxy_dic(raw_proxy_dic=raw_proxy_dic)
            print('get proxy_dic %s' % proxy_dic)
        try:
            get_page = retry_get_url(lst_url, proxies=proxy_dic, timeout=15)
            fail_time = 0
            current_page = page_num  # remember the page just fetched before advancing
            page_num += 1
        except:
            if record_id is not None:  # only flag a proxy if one was actually used
                update_status(record_id=record_id, availability=0)
            fail_time += 1
            print('%s has failed %s times' % (lst_url, fail_time))
            continue
        print('get page at %s' % current_page)
        page_dic = get_page.json()
        total_video = int(page_dic['data']['page']['count'])
        # derive the real page count from the first page; the original compared the
        # already-incremented counter, so this branch could never run
        if current_page == 1:
            if total_video % 20 == 0:
                total_page_num = total_video // 20
            else:
                total_page_num = total_video // 20 + 1
            if total_page_num <= page_num_max:
                page_num_max = total_page_num
        archives = page_dic['data']['archives']
        for one_video in archives:
            video_dic = copy.deepcopy(self.video_data)
            video_dic['title'] = one_video['title']
            aid = one_video['aid']
            video_dic['aid'] = aid
            try:
                attribute = one_video['attribute']
            except KeyError:
                attribute = 0
            video_dic['attribute'] = attribute
            video_dic['url'] = 'https://www.bilibili.com/video/av' + str(aid)
            video_dic['releaser'] = one_video['owner']['name']
            video_dic['releaser_id'] = one_video['owner']['mid']
            video_dic['video_intro'] = one_video['desc']
            video_dic['duration'] = one_video['duration']
            video_dic['play_count'] = one_video['stat']['view']
            video_dic['danmuku'] = one_video['stat']['danmaku']
            video_dic['release_time'] = one_video['pubdate'] * 1e3
            video_dic['fetch_time'] = int(datetime.datetime.timestamp(datetime.datetime.now()) * 1e3)
            result_lst.append(video_dic)
            if len(result_lst) >= 100:
                if output_es_index is None and output_doc_type is None:
                    output_result(result_Lst=result_lst, platform=self.platform,
                                  output_to_file=output_to_file, filepath=filepath,
                                  output_to_es_raw=output_to_es_raw,
                                  output_to_es_register=output_to_es_register,
                                  push_to_redis=push_to_redis)
                    result_lst.clear()
                elif output_es_index is not None and output_doc_type is not None:
                    output_result(result_Lst=result_lst, platform=self.platform,
                                  output_to_file=output_to_file, filepath=filepath,
                                  output_to_es_raw=output_to_es_raw,
                                  output_to_es_register=output_to_es_register,
                                  push_to_redis=push_to_redis,
                                  es_index=output_es_index,
                                  doc_type=output_doc_type)
                    result_lst.clear()
    if result_lst != []:
        if output_es_index is None and output_doc_type is None:
            output_result(result_Lst=result_lst, platform=self.platform,
                          output_to_file=output_to_file, filepath=filepath,
                          output_to_es_raw=output_to_es_raw,
                          output_to_es_register=output_to_es_register,
                          push_to_redis=push_to_redis)
        elif output_es_index is not None and output_doc_type is not None:
            output_result(result_Lst=result_lst, platform=self.platform,
                          output_to_file=output_to_file, filepath=filepath,
                          output_to_es_raw=output_to_es_raw,
                          output_to_es_register=output_to_es_register,
                          push_to_redis=push_to_redis,
                          es_index=output_es_index, doc_type=output_doc_type)
    return result_lst
def search_page(self, keyword, search_pages_max=30, output_to_es_raw=False,
                output_to_es_register=False, es_index=None, doc_type=None):
    search_lst = []

    def get_one_video(video_dict):
        title = video_dict['title']
        title = title.replace('<em class="keyword">', '')
        title = title.replace('</em>', '')
        aid = video_dict['aid']
        url = video_dict['arcurl']
        releaser = video_dict['author']
        video_intro = video_dict['description']
        dura = video_dict['duration']
        dura_lst = dura.split(':')
        duration = int(dura_lst[0]) * 60 + int(dura_lst[1])
        play_count = video_dict['play']
        comment_count = video_dict['video_review']
        favorite_count = video_dict['favorites']
        release_time = int(video_dict['pubdate'] * 1e3)
        tag = video_dict['tag']
        D0 = copy.deepcopy(self.video_data)
        D0['title'] = title
        D0['play_count'] = play_count
        D0['favorite_count'] = favorite_count
        D0['comment_count'] = comment_count
        D0['releaser'] = releaser
        D0['describe'] = video_intro
        D0['release_time'] = release_time
        D0['duration'] = duration
        D0['url'] = url
        return D0

    first_url = ('https://api.bilibili.com/x/web-interface/search/type?'
                 'jsonp=jsonp&search_type=video'
                 '&keyword=%s' % keyword)
    if search_pages_max == 1:
        search_urls = [first_url]
    else:
        search_gen = ('https://api.bilibili.com/x/web-interface/search/type?'
                      'jsonp=jsonp&search_type=video&keyword=' + keyword +
                      '&page={}'.format(str(i))
                      for i in range(2, search_pages_max + 1))
        search_urls = [first_url]
        search_urls.extend(list(search_gen))
    for s_url in search_urls:
        print(s_url)
        get_page = requests.get(s_url)
        page_dict = get_page.json()
        video_dicts = page_dict['data']['result']
        for video_dict in video_dicts:
            one_video_dict = get_one_video(video_dict)
            search_lst.append(one_video_dict)
        if len(search_lst) >= 100:
            output_result(result_Lst=search_lst, platform=self.platform,
                          output_to_es_raw=output_to_es_raw,
                          output_to_es_register=output_to_es_register,
                          es_index=es_index, doc_type=doc_type)
            search_lst.clear()
    if search_lst != []:
        output_result(result_Lst=search_lst, platform=self.platform,
                      output_to_es_raw=output_to_es_raw,
                      output_to_es_register=output_to_es_register,
                      es_index=es_index, doc_type=doc_type)
    return search_lst
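# Duration strings from the Soku and Bilibili search results above arrive as
# "mm:ss" or "hh:mm:ss". The assumed helper below (hypothetical name, not part
# of the original module) is a compact equivalent of that inline parsing, using
# one Horner-style pass so both formats fall out of the same loop:
def _parse_duration_sketch(duration_str):
    """Convert 'mm:ss' or 'hh:mm:ss' to total seconds."""
    seconds = 0
    for part in duration_str.split(":"):
        seconds = seconds * 60 + int(part)  # each colon shifts by a factor of 60
    return seconds

# e.g. _parse_duration_sketch("1:02:03") -> 3723, _parse_duration_sketch("4:05") -> 245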
def parse_video_page_single_process(self, output_to_file=False, filepath=None,
                                    push_to_redis=False, output_to_es_raw=True,
                                    es_index="crawler-data-raw", doc_type="doc",
                                    output_to_es_register=False):
    key = 'iqiyi_video_page_html'
    result_list = []
    pid = os.getpid()
    while connect_with_redis.length_of_lst(key) > 0:
        video_page_html = connect_with_redis.retrieve_video_page_html_from_redis(platform=self.platform)
        soup = BeautifulSoup(video_page_html, 'html.parser')
        try:
            page_info = soup.find("div", {"is": "i71-play"})[":page-info"]
            page_info = page_info.replace("'", '"')
            page_dic = json.loads(page_info)
        except:
            page_dic = None
        if page_dic is not None:
            title = page_dic["tvName"]
            url = page_dic["pageUrl"]
            dura_str = page_dic["duration"]
            duration = trans_duration(dura_str)
            try:
                releaser = page_dic["user"]["name"]
                releaserUrl = page_dic["user"]["profileUrl"]
            except:
                releaser = None
                releaserUrl = None
        else:
            title = None
            url = None
            duration = None
            releaser = None
            releaserUrl = None
        # keep these defined even when the :video-info block is missing
        # (the original raised NameError in that case)
        release_time = None
        tvId = None
        hot_idx = None
        try:
            video_info = soup.find("div", {"is": "i71-play"})[":video-info"]
            video_dic = json.loads(video_info)
        except:
            video_dic = None
        if video_dic is not None:
            if title is None:
                title = video_dic['name']
            if url is None:
                url = video_dic['url']
            if releaser is None:
                try:
                    releaser = video_dic["user"]["name"]
                    releaserUrl = video_dic["user"]["profileUrl"]
                except:
                    releaser = None
                    releaserUrl = None
            release_time = video_dic["firstPublishTime"]
            tvId = video_dic["tvId"]
            hot_idx_url = "https://pub.m.iqiyi.com/jp/h5/count/hotDisplay/?qipuId=%s" % tvId
            get_hot_idx = retry_get_url(hot_idx_url)
            hot_idx_str = get_hot_idx.text
            hot_idx = int(re.findall(r"\d+", ' '.join(re.findall(r'"count":\d+', hot_idx_str)))[0])
        fetch_time = int(datetime.datetime.timestamp(datetime.datetime.now()) * 1e3)
        if releaser is None:
            try:
                releaser = soup.find('span', {'class': 'intro-iterm__txt'}).text
            except:
                releaser = None
        video_page_dict = copy.deepcopy(self.video_data)
        video_page_dict["title"] = title
        video_page_dict["url"] = url
        video_page_dict["duration"] = duration
        video_page_dict["releaser"] = releaser
        video_page_dict["releaserUrl"] = releaserUrl
        video_page_dict["release_time"] = release_time
        video_page_dict["hot_idx"] = hot_idx
        video_page_dict["fetch_time"] = fetch_time
        video_page_dict["tvId"] = tvId
        result_list.append(video_page_dict)
        print("platform: %s, action: parse video page, process_id: %s, has done: %s"
              % (self.platform, pid, len(result_list)))
        if len(result_list) >= 1000:
            output_result(result_Lst=result_list, platform=self.platform,
                          output_to_file=output_to_file, filepath=filepath,
                          push_to_redis=push_to_redis,
                          output_to_es_raw=output_to_es_raw,
                          es_index=es_index, doc_type=doc_type,
                          output_to_es_register=output_to_es_register)
            result_list.clear()
    if result_list != []:
        output_result(result_Lst=result_list, platform=self.platform,
                      output_to_file=output_to_file, filepath=filepath,
                      push_to_redis=push_to_redis,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index, doc_type=doc_type,
                      output_to_es_register=output_to_es_register)
        result_list.clear()
def video_page(self, url, output_to_file=False, filepath=None,
               releaser_page_num_max=30, output_to_es_raw=False, es_index=None,
               doc_type=None, output_to_es_register=False, push_to_redis=False,
               *args, **kwargs):
    """Get video info from the api instead of the web page html.
    The deepest scroll depth is 1000 items."""
    releaser = ""
    count = 1
    result_list = []
    page_count = 0
    size_num = 0
    # releaser_id = self.get_releaser_id(url)
    while count < releaser_page_num_max:
        if size_num > 1000:
            size_num = 0
        size_num += 20
        count += 1
        url_dic = {
            'channel': 'T1457068979049',
            'subtab': 'Video_Recom',
            'size': "10",
            'offset': size_num,
            'fn': '3',
            'devId': 'sklfRdL61S9GUQ4M7DSzdvA6U6LFEZr0pAEonUVTJrYHNFmgkLuyUgNU6zUV7MVx',
            'version': '33.2.1',
            'net': 'wifi',
            'ts': '1557126556',
            'sign': 'YTk73p++NeCfCJRpZkThWxGYX0gVcFWjUVLCRIRwftV48ErR02zJ6/KXOnxX046I',
            'encryption': '1',
            'canal': 'lite_wifi_cpa10',
            'mac': 'racUMC0A9havm+He6jH3YAvVdjgSXYDtwEDZ03eH1l8=',
        }
        releaserUrl = 'https://c.m.163.com/recommend/getChanListNews?%s' % urllib.parse.urlencode(url_dic)
        print(releaserUrl)
        page_count += 20
        get_page = requests.get(releaserUrl, headers=self.headers)
        page_dic = get_page.json()
        data_list = page_dic.get("视频")
        if not data_list:  # also covers a missing key, which the original crashed on
            print("no more data at releaser: %s page: %s " % (releaser, count))
            continue
        print("get data at page: %s" % count)
        for info_dic in data_list:
            skipID = info_dic.get("vid")
            video_dic = copy.deepcopy(self.video_data)
            video_dic['title'] = info_dic.get('title')
            video_dic['url'] = "https://c.m.163.com/news/v/%s.html" % skipID
            video_dic['releaser'] = info_dic.get('topicName')
            video_dic['releaserUrl'] = "https://c.m.163.com/news/sub/%s.html" % info_dic.get("videoTopic").get("tid")
            video_dic['releaser_id_str'] = "网易新闻_%s" % self.get_releaser_id(video_dic['releaserUrl'])
            try:
                video_dic['release_time'] = int(info_dic.get('ptime'))
            except:
                video_dic['release_time'] = trans_strtime_to_timestamp(info_dic.get('ptime'))
            video_dic['play_count'] = info_dic.get("playCount")
            video_dic['comment_count'] = info_dic.get('replyCount')
            video_dic['favorite_count'] = info_dic.get('voteCount')
            if not video_dic['play_count']:
                video_dic['play_count'] = 0
            if not video_dic['favorite_count']:
                video_dic['favorite_count'] = 0
            video_dic['video_id'] = skipID
            video_dic['fetch_time'] = int(time.time() * 1e3)
            video_dic['duration'] = info_dic.get("length")
            video_dic['video_img'] = self.get_video_image(info_dic)
            result_list.append(video_dic)
            if len(result_list) >= 100:
                output_result(result_Lst=result_list, platform=self.platform,
                              output_to_file=output_to_file, filepath=filepath,
                              output_to_es_raw=output_to_es_raw,
                              es_index=es_index, doc_type=doc_type,
                              output_to_es_register=output_to_es_register)
                result_list.clear()
    if result_list != []:
        output_result(result_Lst=result_list, platform=self.platform,
                      output_to_file=output_to_file, filepath=filepath,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index, doc_type=doc_type,
                      output_to_es_register=output_to_es_register)
        result_list.clear()
    return result_list
def releaser_page_by_time(self, start_time, end_time, url, **kwargs):
    data_lis = []
    count_false = 0
    output_to_file = kwargs.get("output_to_file")
    filepath = kwargs.get("filepath")
    push_to_redis = kwargs.get("push_to_redis")
    output_to_es_register = kwargs.get("output_to_es_register")
    output_to_es_raw = kwargs.get("output_to_es_raw")
    es_index = kwargs.get("es_index")
    doc_type = kwargs.get("doc_type")
    for res in self.baijiahao.releaser_page_web_by_time(url):
        video_time = res["release_time"]
        if video_time:
            if start_time < video_time:
                if video_time < end_time:
                    data_lis.append(res)
                    print(res["releaser"],
                          datetime.datetime.fromtimestamp(res["release_time"] / 1000))
                    if len(data_lis) >= 100:
                        output_result(result_Lst=data_lis,
                                      platform=self.platform,
                                      output_to_file=output_to_file,
                                      filepath=filepath,
                                      push_to_redis=push_to_redis,
                                      output_to_es_register=output_to_es_register,
                                      output_to_es_raw=output_to_es_raw,
                                      es_index=es_index,
                                      doc_type=doc_type)
                        data_lis.clear()
            else:
                count_false += 1
                if count_false > 50:
                    break
        else:
            continue
    count_false = 0
    for res in self.baijiahao.releaser_dynamic_page_web_by_time(url):
        video_time = res["release_time"]
        if video_time:
            if start_time < video_time:
                if video_time < end_time:
                    data_lis.append(res)
                    print(res["releaser"],
                          datetime.datetime.fromtimestamp(res["release_time"] / 1000))
                    if len(data_lis) >= 100:
                        output_result(result_Lst=data_lis,
                                      platform=self.platform,
                                      output_to_file=output_to_file,
                                      filepath=filepath,
                                      push_to_redis=push_to_redis,
                                      output_to_es_register=output_to_es_register,
                                      output_to_es_raw=output_to_es_raw,
                                      es_index=es_index,
                                      doc_type=doc_type)
                        data_lis.clear()
            else:
                count_false += 1
                if count_false > 50:
                    break
        else:
            continue
    if data_lis != []:
        output_result(result_Lst=data_lis, platform=self.platform,
                      output_to_file=output_to_file, filepath=filepath,
                      push_to_redis=push_to_redis,
                      output_to_es_register=output_to_es_register,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index, doc_type=doc_type)
def releaser_page_pc(self, releaserUrl, output_to_file=False, filepath=None,
                     releaser_page_num_max=30, output_to_es_raw=False,
                     es_index=None, doc_type=None, output_to_es_register=False,
                     push_to_redis=False, proxies_num=None, **kwargs):
    """Get video info from the api instead of the web page html.
    The deepest scroll depth is 1000 pages."""
    # self.get_cookies_and_font(releaserUrl)
    user_id = "153512{0}".format(random.randint(1000, 9000))
    releaser = ""
    headers = {
        "accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "content-type": "application/json",
        "Host": "live.kuaishou.com",
        "Origin": "https://live.kuaishou.com",
        "Referer": releaserUrl,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "Cookie": "clientid=3; did=web_504e72386a69c6d6172f1457b591%sc; client_key=65890b29; userId=%s" % (random.randint(300, 800), user_id),
    }
    count = 1
    retry_time = 0
    result_list = []
    releaser_id = self.get_releaser_id(releaserUrl)
    releaserUrl = 'https://live.kuaishou.com/profile/%s' % releaser_id
    pcursor = None
    principalId = releaser_id
    self.video_data['releaserUrl'] = releaserUrl
    # firset_page = requests.get(releaserUrl, headers=self.first_page_headers)
    # cookie = firset_page.cookies
    # firset_page = requests.get(releaserUrl, headers=self.first_page_headers, cookies=cookie)
    # cookie = firset_page.cookies
    # fetch an initial proxy up front; the original left `proxies` unbound until
    # the first request raised a NameError that the bare except happened to catch
    proxies = get_proxy(proxies_num)
    while count <= releaser_page_num_max and count <= 1000 and pcursor != "no_more":
        variables = {"principalId": principalId, "pcursor": pcursor, "count": 100}
        url_dic = {
            "operationName": "publicFeedsQuery",
            "variables": variables,
            "query": "query publicFeedsQuery($principalId: String, $pcursor: String, $count: Int) {\n publicFeeds(principalId: $principalId, pcursor: $pcursor, count: $count) {\n pcursor\n live {\n user {\n id\n avatar\n name\n __typename\n }\n watchingCount\n poster\n coverUrl\n caption\n id\n playUrls {\n quality\n url\n __typename\n }\n quality\n gameInfo {\n category\n name\n pubgSurvival\n type\n kingHero\n __typename\n }\n hasRedPack\n liveGuess\n expTag\n __typename\n }\n list {\n id\n thumbnailUrl\n poster\n workType\n type\n useVideoPlayer\n imgUrls\n imgSizes\n magicFace\n musicName\n caption\n location\n liked\n onlyFollowerCanComment\n relativeHeight\n timestamp\n width\n height\n counts {\n displayView\n displayLike\n displayComment\n __typename\n }\n user {\n id\n eid\n name\n avatar\n __typename\n }\n expTag\n __typename\n }\n __typename\n }\n}\n",
        }
        api_url = 'https://live.kuaishou.com/m_graphql'
        try:
            if proxies:
                get_page = requests.post(api_url, headers=headers, json=url_dic,
                                         timeout=5, proxies=proxies)
            else:
                get_page = requests.post(api_url, headers=headers, json=url_dic,
                                         timeout=5)
        except:
            proxies = get_proxy(proxies_num)
            continue
        time.sleep(0.5)
        page_dic = get_page.json()
        data_list = page_dic.get("data").get("publicFeeds").get("list")
        if data_list == []:
            print("no more data at releaser: %s page: %s " % (releaser, count))
            retry_time += 1
            if retry_time > 3:
                proxies_num = 0
            if retry_time > 5:
                pcursor = "no_more"
            # if not pcursor:
            #     self.loginObj.delete_cookies(self.cookie_dic)
            continue
        pcursor = page_dic.get("data").get("publicFeeds").get("pcursor")
        print("get data at releaser: %s page: %s" % (releaser, count))
        count += 1
        for info_dic in data_list:
            video_dic = copy.deepcopy(self.video_data)
            video_dic['title'] = info_dic.get('caption')
            releaser_id_ = info_dic.get('user').get("eid")
            video_dic['url'] = "https://live.kuaishou.com/u/%s/%s" % (releaser_id_, info_dic.get('id'))
            video_dic['release_time'] = info_dic.get('timestamp')
            video_dic['releaser'] = info_dic.get('user').get("name")
            video_dic['play_count'] = trans_play_count(info_dic.get('counts').get("displayView"))
            video_dic['comment_count'] = trans_play_count(info_dic.get('counts').get("displayComment"))
            video_dic['favorite_count'] = trans_play_count(info_dic.get('counts').get("displayLike"))
            video_dic['video_id'] = info_dic.get('id')
            video_dic['fetch_time'] = int(time.time() * 1e3)
            video_dic['releaser_id_str'] = "kwai_%s" % releaser_id_
            video_dic['releaserUrl'] = 'https://live.kuaishou.com/profile/%s' % releaser_id_
            video_dic['video_img'] = self.get_video_image(info_dic)
            if (video_dic['play_count'] is False or video_dic['comment_count'] is False
                    or video_dic['favorite_count'] is False):
                print(info_dic)
                continue
            result_list.append(video_dic)
            if len(result_list) >= 100:
                output_result(result_Lst=result_list, platform=self.platform,
                              output_to_file=output_to_file, filepath=filepath,
                              output_to_es_raw=output_to_es_raw,
                              es_index=es_index, doc_type=doc_type,
                              output_to_es_register=output_to_es_register)
                print(len(result_list))
                result_list.clear()
    if result_list != []:
        output_result(result_Lst=result_list, platform=self.platform,
                      output_to_file=output_to_file, filepath=filepath,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index, doc_type=doc_type,
                      output_to_es_register=output_to_es_register)
        print(len(result_list))
        result_list.clear()
    return result_list
def releaser_page_web(self, releaserUrl, output_to_file=False, filepath=None,
                      releaser_page_num_max=30, output_to_es_raw=False,
                      es_index=None, doc_type=None, output_to_es_register=False,
                      push_to_redis=False, proxies_num=None, **kwargs):
    """Get video info from the api instead of the web page html.
    The deepest scroll depth is 1000 pages."""
    count = 1
    retry_time = 0
    result_list = []
    releaser_id = self.get_releaser_id(releaserUrl)
    releaserUrl = 'https://live.kuaishou.com/profile/%s' % releaser_id
    self.video_data['releaserUrl'] = releaserUrl
    pcursor = 0
    headers = {
        "Accept": "application/json",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh,zh-CN;q=0.9",
        "Connection": "keep-alive",
        "Content-Type": "application/json; charset=UTF-8",
        # "Cookie": "did=web_c7c42d62cbb24{0}4d1ca5ffca052e3; didv=1582271776000; sid=e12d2ec74ec7af3a24d{1}cd6;pua5rv=1".format(random.randint(6000, 8000), random.randint(20, 99)),
        "Cookie": "did=web_790b7bcefe7347c5937a39d34c49f7ed; didv=1583150714000; sid=ab0c3a5497ab3c8fb73c8bef",
        "Host": "kpfshanghai.m.chenzhongtech.com",
        "kpf": "H5",
        "kpn": "KUAISHOU",
        # "Origin": "https://v.kuaishou.com",
        "Origin": "https://kpfshanghai.m.chenzhongtech.com",
        "Referer": "https://kpfshanghai.m.chenzhongtech.com/fw/user/%s?fid=1535125322&cc=share_copylink&shareMethod=TOKEN&docId=0&kpn=KUAISHOU&subBiz=PROFILE&shareId=14810686%s&docABKey=share_textid_profile&shareToken=X-7AeJHKdHOc_-392ps0aWP381Bs&shareResourceType=PROFILE_OTHER&groupABKey=share_group_profile&groupName=&expTag=null&shareObjectId=916251992&shareUrlOpened=0" % (releaser_id, random.randint(1000, 9800)),
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
    }
    proxies = get_proxy(proxies_num)
    # proxies = {'http': 'http://*****:*****@58.55.159.141:16085/', 'https': 'http://*****:*****@58.55.159.141:16085/'}
    while count <= releaser_page_num_max and count <= 1000 and pcursor != "no_more":
        try:
            if proxies_num:
                get_page = requests.post(
                    "https://kpfshanghai.m.chenzhongtech.com/rest/kd/feed/profile",
                    json={"eid": releaser_id, "count": 100, "pcursor": pcursor},
                    headers=headers, timeout=10, proxies=proxies)
            else:
                get_page = requests.post(
                    "https://kpfshanghai.m.chenzhongtech.com/rest/kd/feed/profile",
                    json={"eid": releaser_id, "count": 100, "pcursor": pcursor},
                    headers=headers, timeout=10)
        except:
            proxies = get_proxy(proxies_num)
            continue
        time.sleep(random.randint(3, 5))
        page_dic = get_page.json()
        data_list = page_dic.get("feeds")
        # if not data_list:
        #     get_page = requests.post("https://kpfshanghai.m.chenzhongtech.com/rest/kd/feed/profile",
        #                              json={"eid": releaser_id, "count": 18, "pcursor": pcursor},
        #                              headers=headers, timeout=10)
        #     page_dic = get_page.json()
        #     data_list = page_dic.get("feeds")
        #     time.sleep(1)
        if not data_list:
            print("no more data at releaser: %s page: %s " % (releaser_id, count))
            proxies = get_proxy(proxies_num)
            retry_time += 1
            if retry_time > 3:
                proxies_num = 0
                print("no proxies")
            if retry_time > 5:
                pcursor = "no_more"
            continue
        pcursor = page_dic.get("pcursor")
        print("get data at releaser: %s page: %s" % (releaser_id, count))
        count += 1
        for info_dic in data_list:
            video_dic = copy.deepcopy(self.video_data)
            try:
                video_dic['title'] = info_dic.get('caption')
                releaser_id = info_dic.get("userEid")
                # the photo id is buried in the share link's query string
                photoId_list = info_dic.get('share_info').split("&")
                for photoid in photoId_list:
                    if "photoId=" in photoid:
                        photoid = photoid.replace("photoId=", "")
                        break
                video_dic['video_id'] = photoid
                video_dic['url'] = "https://live.kuaishou.com/u/%s/%s" % (releaser_id, photoid)
                video_dic['release_time'] = info_dic.get('timestamp')
                video_dic['releaser'] = info_dic.get("userName")
                video_dic['play_count'] = trans_play_count(info_dic.get("viewCount"))
                video_dic['comment_count'] = trans_play_count(info_dic.get("commentCount"))
                video_dic['favorite_count'] = trans_play_count(info_dic.get('likeCount'))
                video_dic['repost_count'] = trans_play_count(info_dic.get('forwardCount'))
                video_dic['fetch_time'] = int(time.time() * 1e3)
                try:
                    video_dic['duration'] = int(info_dic.get("ext_params").get("video") / 1000)
                except:
                    video_dic['duration'] = 0
                    print("duration error")
                video_dic['releaser_id_str'] = "kwai_%s" % releaser_id
                video_dic['releaserUrl'] = 'https://live.kuaishou.com/profile/%s' % releaser_id
                video_dic['video_img'] = info_dic.get("coverUrls")[0].get("url")
            except Exception as e:
                print(e)
                continue
            if (video_dic['play_count'] is False or video_dic['comment_count'] is False
                    or video_dic['favorite_count'] is False):
                print(info_dic)
                continue
            result_list.append(video_dic)
            if len(result_list) >= 100:
                output_result(result_Lst=result_list, platform=self.platform,
                              output_to_file=output_to_file, filepath=filepath,
                              output_to_es_raw=output_to_es_raw,
                              es_index=es_index, doc_type=doc_type,
                              output_to_es_register=output_to_es_register)
                print(len(result_list))
                result_list.clear()
    if result_list != []:
        output_result(result_Lst=result_list, platform=self.platform,
                      output_to_file=output_to_file, filepath=filepath,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index, doc_type=doc_type,
                      output_to_es_register=output_to_es_register)
        print(len(result_list))
        result_list.clear()
    return result_list
def releaser_page(self, releaserUrl, output_to_file=False, filepath=None,
                  output_to_es_raw=False, output_to_es_register=False,
                  push_to_redis=False, releaser_page_num_max=30,
                  es_index=None, doc_type=None):
    videos_per_page = 42
    releaser_page_Lst = []
    releaser_id = self.rebuild_releaserUrl(releaserUrl)
    if releaser_id == '' or releaser_id is None:
        print('Failed to get releaser id: %s' % releaserUrl)
        return None
    real_releaserUrl = 'https://www.iqiyi.com/u/' + releaser_id + '/v'
    get_page = retry_get_url(real_releaserUrl)
    if get_page is None:
        print('Failed to get releaser page: %s' % releaserUrl)
        return None
    get_page.encoding = 'utf-8'
    page = get_page.text
    soup = BeautifulSoup(page, 'html.parser')
    try:
        videonum_str = soup.find('span', {'class': 'icon-num'}).text
        videonum_f = re.findall('[0-9]+', videonum_str)
    except:
        print('Failed to get total video number: %s' % releaserUrl)
        videonum_f = []
    if videonum_f != []:
        videonum = int(videonum_f[0])
        # ceiling division; the original unconditional +1 requested one empty
        # extra page whenever videonum was an exact multiple of the page size
        totalpage = (videonum + videos_per_page - 1) // videos_per_page
    else:
        videonum = None
        totalpage = 1000  # assign an arbitrary upper bound

    def process_one_line(data_line):
        url = data_line.find('p', {'class': 'site-piclist_info_title_twoline'}).a['href']
        # hrefs on the page are protocol-relative; the original or-condition was
        # always true, so the scheme was prepended even to absolute URLs
        if not url.startswith(('http:', 'https:')):
            url = 'https:' + url
        get_video_dict = self.video_page(url)
        if get_video_dict is None:
            return None
        return get_video_dict

    releaser_url_body_f = re.findall('https://www.iqiyi.com/u/[0-9]+/v', releaserUrl)
    if releaser_url_body_f != []:
        releaser_url_body = releaser_url_body_f[0]
    else:
        releaser_url_body_f = re.findall('http://www.iqiyi.com/u/[0-9]+/v', releaserUrl)
        if releaser_url_body_f != []:
            releaser_url_body = releaser_url_body_f[0]
        else:
            return None
    if releaser_page_num_max > totalpage:
        releaser_page_num_max = totalpage
    video_page_url = [releaser_url_body + '?page={}&video_type=1'.format(str(i))
                      for i in range(1, releaser_page_num_max + 1)]
    for urls in video_page_url:
        get_page = retry_get_url(urls)
        if get_page is None:
            continue
        print("get %s successfully" % urls)
        page = get_page.text
        soup = BeautifulSoup(page, 'html.parser')
        iqiyi = soup.find_all('li', {'j-delegate': 'colitem'})
        for data_line in iqiyi:
            one_video_dic = process_one_line(data_line)
            releaser_page_Lst.append(one_video_dic)
            if len(releaser_page_Lst) >= 100:
                output_result(releaser_page_Lst, self.platform,
                              output_to_file=output_to_file, filepath=filepath,
                              output_to_es_raw=output_to_es_raw,
                              output_to_es_register=output_to_es_register,
                              push_to_redis=push_to_redis,
                              es_index=es_index, doc_type=doc_type)
                releaser_page_Lst.clear()
    if releaser_page_Lst != []:
        output_result(releaser_page_Lst, self.platform,
                      output_to_file=output_to_file, filepath=filepath,
                      output_to_es_raw=output_to_es_raw,
                      output_to_es_register=output_to_es_register,
                      push_to_redis=push_to_redis,
                      es_index=es_index, doc_type=doc_type)
def search_short_video_page(self, title=None, search_json=None, **kwargs):
    data_list = []
    title = urllib.parse.quote(title)
    headers = {
        "Accept-Encoding": "gzip",
        # "X-SS-REQ-TICKET": "1587103224961",
        "passport-sdk-version": "14",
        "sdk-version": "2",
        # "Cookie": "odin_tt=d5d96b2812637e9d20681530fbbe4d52e8f76ae1b6afa8c0a173260321611c507ac6eca10991b21fc4f023e94371d457df784f959e94db673ef29a5bd2137091; qh[360]=1; history=alrvlFic6pJZXJCTWBmSmZt6KW6mevZSz5LU3OJ7DEKX42Zw%2Bc84wMR3iYGBweFy3EzZsPcNTLyXWN1AvLYP8%2BQPMLFfEpUA8bo%2F7nNtYOK7xNwC4k3XmMHe5MtzSTiM48DluNr01dkNTDyXuHrApsi4ejkwsV%2BSmAPmSeXoMzDxXhKcAuIVrRfWAJnJJwA25fG1DoezvFBTZrzZeg6kT%2BwWSG7Gx3UJB5h4L%2FH4gXlVn%2BtAtkvFMQRcjpv%2B%2Be9TBib2S%2BwcYBuUn8xsYGK%2FJKMAkptgfXrDASaOS4yHQHJVPy6UOjDxXuI4BeJN26Fs6MDEcYn%2FEoMDAAAA%2F%2F8%3D; install_id=112651077855; ttreq=1$0b37d53ca5c301ce96959dc97a67886da420b294",
        # "X-Gorgon": "0401e08b4001a628dcf96b16d01278ad842e915d905b213dc48f",
        # "X-Khronos": "1587103224",
        "Host": "is.snssdk.com",
        "Connection": "Keep-Alive",
        "User-Agent": "okhttp/3.10.0.1",
    }
    url = "https://is.snssdk.com/api/search/content/?os_api=23&device_type=oneplus%2Ba5010&from_search_subtab=video&manifest_version_code=7690&source=search_subtab_switch&offset=0&is_ttwebview=0&uuid=440000000189785&action_type&is_incognito=0&keyword_type&rom_version=23&app_name=news_article&format=json&version_name=7.6.9&ac=wifi&host_abi=armeabi-v7a&update_version_code=76909&channel=baidu_0411&is_native_req=1&loadId=1&longitude=113.40717530841052&isIncognito=0&plugin=2050&openudid=e44cc0264b92bcbf&forum=1&latitude=39.904680919672145&search_start_time=1587102733626&language=zh&pd=xiaoshipin&cur_tab_title=search_tab&aid=13&pos=5r_-9Onkv6e_eBEKeScxeCUfv7G_8fLz-vTp6Pn4v6esrKuzqa2qrKqorq2lqaytqK-xv_H86fTp6Pn4v6eupLOkramrpa2krKSrqq-sqaixv_zw_O3e9Onkv6e_eBEKeScxeCUfv7G__PD87dHy8_r06ej5-L-nrKyrs6mtqqyqqK6tpamsraivsb_88Pzt0fzp9Ono-fi_p66ks6StqaulraSspKuqr6ypqOA%253D&dpi=270&qrecImprId&fetch_by_ttnet=1&count=10&plugin_enable=3&search_position&ab_group=100167%252C94569%252C102754&keyword={0}&scm_version=1.0.2.830&search_json=%257B%2522comment_ids%2522%253A%255B%255D%252C%2522event_discussion%2522%253A74123%252C%2522event_impression%2522%253A17270790%252C%2522forum_id%2522%253A1664181806902302%252C%2522forum_recall_wtt%2522%253A%255B1664190666034183%252C1664192273575943%252C1664184430218253%252C1664185769175051%252C1664184985139212%252C1664196237152267%252C1664186792648732%252C1664188755414019%252C1664187055838215%252C1664184182571022%252C1664185938950148%252C1664188041995268%252C1664188322863172%252C1664190185024520%252C1664185602828300%252C1664184276484099%252C1664188211399684%252C1664187870713868%252C1664184484958211%252C1664183864289288%252C1664186825487371%252C1664195548700686%252C1664186585780228%252C1664197296210947%252C1664188146725901%252C1664191748459523%255D%252C%2522group_source%2522%253Anull%252C%2522hot_gid%2522%253A6816255461172445703%252C%2522log_pb%2522%253A%257B%2522cluster_type%2522%253A%25220%2522%252C%2522entrance_hotspot%2522%253A%2522channel%2522%252C%2522hot_board_cluster_id%2522%253A%25226816091697949180424%2522%252C%2522hot_board_impr_id%2522%253A%2522202004171352010100140411610B1A7741%2522%252C%2522location%2522%253A%2522hot_board%2522%252C%2522rank%2522%253A%25225%2522%252C%2522source%2522%253A%2522trending_tab%2522%252C%2522style_id%2522%253A%252210005%2522%257D%252C%2522mix_stick_ids%2522%253A%255B1664190666034183%252C1664192273575943%252C1664184430218253%252C1664185769175051%252C1664184985139212%252C1664196237152267%252C1664186792648732%252C1664188755414019%252C1664187055838215%252C1664184182571022%252C1664185938950148%252C1664188041995268%252C1664188322863172%252C1664190185024520%252C1664185602828300%252C1664184276484099%252C1664188211399684%252C1664187870713868%252C1664184484958211%252C1664183864289288%252C1664186825487371%252C1664195548700686%252C1664186585780228%252C1664197296210947%252C1664188146725901%252C1664191748459523%255D%252C%2522stick_group_ids%2522%253A%255B%255D%257D&device_platform=android&search_id&has_count=0&version_code=769&mac_address=08%253A00%253A27%253A1F%253A7E%253AA0&from=xiaoshipin&device_id={1}&resolution=810*1440&os_version=6.0.1&device_brand=Oneplus&search_sug=1&qc_query".format(title, random.randint(69418800000, 69418899999))
    res = retry_get_url(url, headers=headers, timeout=5, proxies=3)
    page_text = res.json()
    for one_video in page_text["data"]:
        video_dic = {}
        try:
            one_video = one_video["raw_data"]
            video_dic['title'] = one_video.get('title')
            video_dic['url'] = one_video.get('share').get("share_url")
            releaser_id = one_video.get('user').get("info").get("user_id")
            video_dic['releaser'] = one_video.get('user').get("info").get("name")
            video_dic['releaserUrl'] = "https://www.toutiao.com/c/user/%s/" % releaser_id
            release_time = int(one_video.get('create_time'))
            video_dic['release_time'] = int(release_time * 1e3)
            video_dic['duration'] = int(one_video.get('video').get("duration"))
            video_dic['play_count'] = one_video.get('action').get("play_count")
            video_dic['repost_count'] = one_video.get('action').get("share_count")
            video_dic['comment_count'] = one_video.get('action').get("comment_count")
            video_dic['favorite_count'] = one_video.get('action').get("digg_count")
            video_dic['fetch_time'] = int(datetime.datetime.now().timestamp() * 1e3)
            video_dic['releaser_id_str'] = "toutiao_%s" % releaser_id
            video_dic['video_img'] = one_video.get('video').get('origin_cover').get('url_list')[0]
            video_dic['platform'] = "toutiao"
            if "iesdouyin" in video_dic['url']:
                # result actually comes from Douyin; relabel accordingly
                video_dic['releaserUrl'] = "https://www.douyin.com/share/user/%s/" % releaser_id
                video_dic['platform'] = "抖音"
                video_dic['releaser_id_str'] = "抖音_%s" % releaser_id
                video_dic['play_count'] = 0
            video_dic["is_hot"] = 1
            video_dic["data_provider"] = "CCR"
        except:
            continue
        data_list.append(video_dic)
    output_result(result_Lst=data_list, platform=self.platform, output_to_es_raw=True)
    data_list.clear()
def key_customer(self, releaserUrl, releaser_page_num_max=1000,
                 output_to_es_raw=False, es_index='crawler-data-raw',
                 doc_type='doc'):
    """The input releaserUrl must be strictly of the form
    https://id.tudou.com/i/UMjc5MzI5NDA==/videos
    i.e. end with /videos; otherwise scrolling will go wrong."""
    releaser_id = self.get_releaser_id(releaserUrl)
    print("working on releaser: %s" % releaser_id)
    releaserUrl = 'https://id.tudou.com/i/%s/videos' % releaser_id
    result_lst = []
    get_page = retry_get_url(releaserUrl)
    get_page.encoding = 'utf-8'
    page = get_page.text
    soup = BeautifulSoup(page, 'html.parser')
    try:
        releaser = soup.find('div', {'class': 'user-name'}).a.text
    except:
        releaser = None
    try:
        total_video_num_str = soup.find('div', {'class': 'title'}).span.text
        total_video_num = total_video_num_str.replace('(', '').replace(')', '').replace(',', '')
        total_video_num = trans_play_count(total_video_num)
    except:
        print(releaserUrl)
        total_video_num = None  # the original left this unbound and crashed below
    if total_video_num:
        if total_video_num % 50 == 0:
            total_page_num = total_video_num // 50
        else:
            total_page_num = total_video_num // 50 + 1
        if releaser_page_num_max > total_page_num:
            releaser_page_num_max = total_page_num
    print("releaser page num max is %s" % releaser_page_num_max)
    video_lst = soup.find_all('div', {'class': 'v'})
    for line in video_lst:
        video_info = self.process_one_video(line)
        video_info['releaserUrl'] = releaserUrl
        video_info['releaser'] = releaser
        result_lst.append(video_info)
    if releaser_page_num_max >= 2:
        page_num = 2
        try:
            partial_releaserUrl = soup.find('li', {'class': 'next'}).a['href']
            new_releaserUrl = 'https://id.tudou.com%s' % partial_releaserUrl
        except:
            # no "next" link found; fall back to the paged URL pattern used below
            # (the original printed the still-unbound new_releaserUrl here)
            print(releaserUrl)
            new_releaserUrl = 'https://id.tudou.com/i/%s/videos?order=1&page=2' % releaser_id
        while page_num <= releaser_page_num_max:
            get_page = retry_get_url(new_releaserUrl)
            get_page.encoding = 'utf-8'
            page = get_page.text
            soup = BeautifulSoup(page, 'html.parser')
            if page_num != releaser_page_num_max:
                try:
                    new_releaserUrl = 'https://id.tudou.com' + soup.find('li', {'class': 'next'}).a['href']
                except:
                    new_releaserUrl = ('https://id.tudou.com/i/%s/videos?order=1&page=%s'
                                       % (releaser_id, page_num))
            video_lst = soup.find_all('div', {'class': 'v'})
            for line in video_lst:
                video_info = self.process_one_video(line)
                video_info['releaserUrl'] = releaserUrl
                video_info['releaser'] = releaser
                result_lst.append(video_info)
            print('get page %s list length is %s' % (page_num, len(result_lst)))
            page_num += 1
            output_result(result_Lst=result_lst, platform=self.platform,
                          output_to_es_raw=output_to_es_raw,
                          es_index=es_index, doc_type=doc_type)
            result_lst.clear()
    if result_lst != []:
        output_result(result_Lst=result_lst, platform=self.platform,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index, doc_type=doc_type)
        result_lst.clear()
def releaser_page_web(self, releaserUrl,
                      output_to_file=False, filepath=None,
                      releaser_page_num_max=30,
                      output_to_es_raw=False,
                      es_index=None, doc_type=None,
                      output_to_es_register=False,
                      push_to_redis=False,
                      proxies_num=None):
    releaser_id = self.get_releaser_id(releaserUrl)
    # releaser = self.get_releaser_name(releaserUrl)
    releaserUrl = 'https://id.tudou.com/i/%s/videos' % releaser_id
    json_headers = {
        "accept": "application/json, text/javascript, */*; q=0.01",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh,zh-CN;q=0.9",
        # "cookie": "cna=W99aFOvX+QACAXL4fBJI3rAw; __ysuid=1541219939103JPW; ykss=e93bad5ef9c26af71c8e7ee5; P_ck_ctl=47F163FE35A5B1B2E479B158A12376A7; __ayvstp=16; __aysvstp=16; _zpdtk=ecd18a6d5d86a28b786b653356133cfb606dd1dc; isg=BOzsOnpUnhIGhYq8YxHgZ36EvcoepZBPH_JJJ0Yt-Rc6UY5bbrVJ3rr3dxdpWcin",
        "referer": releaserUrl,
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
        "x-csrf-token": "ecd18a6d5d86a28b786b653356133cfb606dd1dc",
        "x-requested-with": "XMLHttpRequest",
    }
    json_cookies = {
        "cna": "W99aFOvX+QACAXL4fBJI3rAw",
        "__ysuid": "1541219939103JPW",
        "ykss": "e93bad5ef9c26af71c8e7ee5",
        "P_ck_ctl": "47F163FE35A5B1B2E479B158A12376A7",
        "__ayvstp": "16",
        "__aysvstp": "16",
        "_zpdtk": "ecd18a6d5d86a28b786b653356133cfb606dd1dc",
        "isg": "BOzsOnpUnhIGhYq8YxHgZ36EvcoepZBPH_JJJ0Yt-Rc6UY5bbrVJ3rr3dxdpWcin",
    }
    first_page_headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh,zh-CN;q=0.9",
        # "cookie": "cna=W99aFOvX+QACAXL4fBJI3rAw; __ysuid=1541219939103JPW; ykss=e93bad5ef9c26af71c8e7ee5; P_ck_ctl=47F163FE35A5B1B2E479B158A12376A7; __ayvstp=16; __aysvstp=16; _zpdtk=9053e5d58ee0c51b1f3da8008dd4bda164ecd846; isg=BHl5FRo0A8WDkd_DnlItMBsXiOVThm042sF8-Juu9KAfIpu049ZUCb80oCjUmgVw",
        "referer": releaserUrl,
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "******",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
    }
    first_page_res = retry_get_url(releaserUrl,
                                   headers=first_page_headers,
                                   proxies=proxies_num)
    json_cookies.update(dict(first_page_res.cookies))
    user_id = re.findall(r'uid="(\d+)"', first_page_res.text)[0]
    zptk_url = "https://id.tudou.com/i/h5/id_%s/playlisttab?uid=%s" % (
        user_id, user_id)
    playlisttab_res = retry_get_url(zptk_url, headers=json_headers,
                                    proxies=proxies_num,
                                    cookies=json_cookies)
    json_cookies.update(dict(playlisttab_res.cookies))
    json_headers["x-csrf-token"] = dict(playlisttab_res.cookies)["_zpdtk"]
    count = 1
    retry_time = 0
    result_list = []
    self.video_data['releaserUrl'] = releaserUrl
    print("working on releaser_id: %s" % releaser_id)
    while count <= releaser_page_num_max and retry_time < 5:
        proxies = get_proxy(proxies_num)
        api_url = ('https://id.tudou.com/i/h5/id_%s/videos?ajax=1&pn=%s&pl=20'
                   % (user_id, count))
        print(api_url)
        if proxies:
            get_page = requests.get(api_url, headers=json_headers,
                                    proxies=proxies, timeout=3,
                                    cookies=json_cookies)
        else:
            get_page = requests.get(api_url, headers=json_headers,
                                    timeout=3, cookies=json_cookies)
        # every response rotates the anti-crawl token; carry it forward
        _zpdtk = dict(get_page.cookies)
        json_cookies.update(_zpdtk)
        if "_zpdtk" in _zpdtk:
            json_headers["x-csrf-token"] = _zpdtk["_zpdtk"]
        page_dic = get_page.json()
        # releaser_page_num_max = page_dic["page"]["pz"]
        releaser = page_dic['channelOwnerInfo']["data"]["nickname"]
        try:
            data_list = page_dic['data']["data"]
            time.sleep(0.25)
        except (KeyError, TypeError):
            retry_time += 1
            time.sleep(0.25)
            print("no more data at page: %s try_time: %s" % (count, retry_time))
            continue
        if data_list == []:
            retry_time += 1
            time.sleep(0.25)
            print("no more data at page: %s try_time: %s" % (count, retry_time))
            continue
        else:
            retry_time = 0
            print("get data at page: %s" % count)
            count += 1
        for info_dic in data_list:
            video_info = copy.deepcopy(self.video_data)
            # the API sometimes returns a dict keyed by video id
            if isinstance(info_dic, str):
                info_dic = data_list[info_dic]
            video_info['video_id'] = info_dic["videoid"]
            video_info['title'] = info_dic["title"]
            video_info['releaser'] = releaser
            video_info['url'] = 'https://video.tudou.com/v/%s.html' % info_dic["videoid"]
            video_info['duration'] = int(info_dic.get('seconds') / 1e3)
            video_info['releaser_id_str'] = "new_tudou_%s" % releaser_id
            video_info['comment_count'] = int(info_dic.get('total_comment'))
            video_info['favorite_count'] = int(info_dic.get('total_up'))
            # favorite_count in the database means like count, while on the
            # web page that field is named praiseNumber; on the web page
            # favorite_count means bookmark count
            video_info['video_img'] = info_dic.get('thumburl')
            video_info['play_count'] = info_dic.get('total_vv')
            video_info['release_time'] = int(info_dic.get('publishtime') * 1e3)
            video_info['fetch_time'] = int(time.time() * 1e3)
            result_list.append(video_info)
            if len(result_list) >= 100:
                output_result(result_Lst=result_list,
                              platform=self.platform,
                              output_to_file=output_to_file,
                              filepath=filepath,
                              output_to_es_raw=output_to_es_raw,
                              es_index=es_index,
                              doc_type=doc_type,
                              output_to_es_register=output_to_es_register)
                result_list.clear()
    if result_list != []:
        output_result(result_Lst=result_list,
                      platform=self.platform,
                      output_to_file=output_to_file,
                      filepath=filepath,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index,
                      doc_type=doc_type,
                      output_to_es_register=output_to_es_register)
    return result_list
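# Every Tudou JSON response rotates the _zpdtk anti-crawl cookie, which must
# be echoed back as the x-csrf-token header on the next request. A minimal
# sketch of that carry-forward, assuming a requests.Response-like object
# (refresh_zpdtk is a hypothetical helper, not part of this crawler):
def refresh_zpdtk(response, json_headers, json_cookies):
    new_cookies = dict(response.cookies)
    json_cookies.update(new_cookies)
    if "_zpdtk" in new_cookies:
        # the freshest token goes back out with the next request
        json_headers["x-csrf-token"] = new_cookies["_zpdtk"]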
def releaser_page_app(self, releaserUrl,
                      output_to_file=False, filepath=None,
                      releaser_page_num_max=30,
                      output_to_es_raw=False,
                      es_index=None, doc_type=None,
                      output_to_es_register=False,
                      push_to_redis=False,
                      proxies_num=None):
    """
    Get video info from the app API instead of the web page HTML.
    The API can be scrolled to at most 1000 pages.
    """
    headers = {
        'Host': 'apis.tudou.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Cookie': ('isg=BIeH6gcJlwZw_xQESm9jlG-vFTuRJGXxikf0g1l0mJY9yKeKYVuAvzKJbkgzOzPm;'
                   'cna=XA2EFIGslWoCAWp4y3KXcZh7; ykss=cdbd115c102a68710215ad93;'
                   '__ysuid=1543316262167mjE; P_ck_ctl=62DE1D55DFE1C0F4F27A8662E6575F08;'
                   '__ayvstp=32'),
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0'
    }
    count = 1
    retry_time = 0
    result_list = []
    releaser_id = self.get_releaser_id(releaserUrl)
    releaser = self.get_releaser_name(releaserUrl)
    releaserUrl = 'https://id.tudou.com/i/%s/videos' % releaser_id
    self.video_data['releaser'] = releaser
    self.video_data['releaserUrl'] = releaserUrl
    url_dic = {"uid": releaser_id, "pL": "20"}
    print("working on releaser: %s releaser_id: %s" % (releaser, releaser_id))
    while count <= releaser_page_num_max and retry_time < 5:
        proxies = get_proxy(proxies_num)
        url_dic['pg'] = str(count)
        url_dic['pn'] = str(count)
        api_url = ('http://apis.tudou.com/subscribe/v1/video?%s'
                   % urllib.parse.urlencode(url_dic))
        print(api_url)
        if proxies:
            get_page = requests.get(api_url, headers=headers,
                                    proxies=proxies, timeout=3)
        else:
            get_page = requests.get(api_url, headers=headers, timeout=3)
        page_dic = get_page.json()
        # has_more = page_dic.get('has_more')
        try:
            data_list = page_dic['entity']
        except KeyError:
            retry_time += 1
            time.sleep(0.25)
            print("no more data at releaser: %s page: %s try_time: %s"
                  % (releaser, count, retry_time))
            continue
        if data_list == []:
            retry_time += 1
            time.sleep(0.25)
            print("no more data at releaser: %s page: %s try_time: %s"
                  % (releaser, count, retry_time))
            continue
        else:
            retry_time = 0
            print("get data at releaser: %s page: %s" % (releaser, count))
            count += 1
        for info_dic in data_list:
            video_info = copy.deepcopy(self.video_data)
            one_video = info_dic.get('detail')
            if one_video is not None:
                get_title = one_video.get('base_detail')
                if get_title is not None:
                    video_info['title'] = get_title.get('title')
                detail_info = one_video.get('video_detail')
                if detail_info is not None:
                    video_id = detail_info.get('video_id')
                    if video_id is not None:
                        video_info['video_id'] = video_id
                        video_info['url'] = 'https://video.tudou.com/v/%s.html' % video_id
                        video_info['duration'] = detail_info.get('duration')
                        video_info['releaser_id_str'] = "new_tudou_%s" % releaser_id
                        video_info['comment_count'] = int(detail_info.get('comment_count'))
                        video_info['favorite_count'] = int(detail_info.get('praiseNumber'))
                        # favorite_count in the database means like count,
                        # while on the web page that field is named
                        # praiseNumber; on the web page favorite_count means
                        # bookmark count
                        video_info['shoucang_count'] = detail_info.get('favorite_count')
                        video_info['video_img'] = self.get_video_image(detail_info)
                        print('play_count', detail_info.get('vv_count'))
                        video_info['play_count'] = detail_info.get('vv_count')
                        release_time_str = detail_info.get('publish_time')
                        if '天前' in release_time_str:
                            # relative times ("N days ago") need the video page
                            video_info['release_time'] = self.video_page(
                                video_info['url'])['release_time']
                        else:
                            video_info['release_time'] = trans_strtime_to_timestamp(
                                input_time=release_time_str, missing_year=True)
                        video_info['fetch_time'] = int(time.time() * 1e3)
                        result_list.append(video_info)
                        if len(result_list) >= 100:
                            output_result(result_Lst=result_list,
                                          platform=self.platform,
                                          output_to_file=output_to_file,
                                          filepath=filepath,
                                          output_to_es_raw=output_to_es_raw,
                                          es_index=es_index,
                                          doc_type=doc_type,
                                          output_to_es_register=output_to_es_register)
                            result_list.clear()
    if result_list != []:
        output_result(result_Lst=result_list,
                      platform=self.platform,
                      output_to_file=output_to_file,
                      filepath=filepath,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index,
                      doc_type=doc_type,
                      output_to_es_register=output_to_es_register)
    return result_list
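# The nested detail -> base_detail / video_detail unpacking above guards each
# level by hand; a hedged sketch of the same idea as a reusable walker (dig is
# a hypothetical helper, not part of this crawler):
def dig(dic, *keys, default=None):
    # walk nested dicts without raising on a missing level
    for key in keys:
        if not isinstance(dic, dict):
            return default
        dic = dic.get(key)
    return dic if dic is not None else default

# e.g. dig(info_dic, 'detail', 'video_detail', 'video_id')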
def search_page(self, title=None, *args, **kwargs):
    data_list = []
    timestamp = int(datetime.datetime.now().timestamp() * 1e3)
    url = "https://r.inews.qq.com/search?chlid=_qqnews_custom_search_all&search_from=&needSearchTabs=1&needSpreadAds=1&rtAd=1&new_user=0&uid=48a4725886d57203&omgid=&trueVersion=6.0.40&qimei=866174725888628&devid=866174725888628&appver=22_android_6.0.40&Cookie=lskey%3D;skey%3D;uin%3D;%20luin%3D;logintype%3D0;%20main_login%3D;%20&qn-sig=07db3b98ab9133d39b8b053fa1c51bd9&qn-rid=1002_2f55f6ab-2eb6-45e5-a4df-6dd9778c8b9d&qn-newsig=39b264b07173439d052ff2d6875cb7bc6aa47770dea55c7b64addee42138715a"
    post_json = {
        "search_type": "all",
        "query": title,
        "cp_type": "0",
        "disable_qc": "0",
        "searchStartFrom": "header",
        "launchSearchFrom": "billboard",
        "isDefault": "0",
        "searchTag": title,
        "adReqData": '{"adtype":0,"pf":"aphone","app_channel":"17","ext":{"mob":{"mobstr":"AdiIlDlcnKXLQu1Gx+HOa9fvgiA9BRLUAJ+RowxbYWkHaon9eDa0Qwt66FFNIY+xQHqSdGqfLc6p9ylswsJt1g4qWDeFDIxT6590GrPXznUizTPR0SutVVVQrHa1pbvX4WGx3yOrDNHGJCSrP38Gxej3\/ixgaVTB84d6i7sXgUhFCzcs3pS+DNShM79K7bIwO5U38eccvqle6nYKvELivuDIVr46chKdSokttQzbmf7OUSutGSHdn1+pihXvbFDkzgD+ut6PT\/G1E+O8eHwjZBf7K4Y8tpPABOH182j7JA6xpvoAP8r1WaHh73EtA5+T1M2dU3LtOMC0Sv\/Ngcf6btjefIkMDVoY+hWb8yKKd65UHSYvzpzLEdFNuEV8Sm33B789P9fCqLbnjf11OokPFjtC\/ORvR0dHItka56fkSNAZ2D+rmH8PPbMhZxSa\/bgOZywy2i8yu\/JRg8Rv8zRu4FkB6\/jIXkGCoWI1S7jUfnTIxCHu8iFOGo+Jr4VzMzqbnsi7XWhvKBye\/hPJkrISvw0wg5kg\/TPoj5Yu7aHH2pk31+uIbFRMFIzyj3p0I+yNmvpJECr4MuQmIXf8OP5OUlNVcDuZoXkyR4xy8ON1ou2Vtx+LQ\/x9xK2\/VR7up5apAPQMzmuzTOMcizdpO3FkrcXh0baOYJ7drGJWx4EO\/6nP9Y6J3GAU+YZsc+hCE3XHJpuZsfRsM2i7M4FnrZGz948VfFhY50Zk09eqK7y\/QsS++6su71tzvghFW0u3FOe1WMDvu3c4mMyYKIHkPQtGd5paAR81Xr6\/tGrhjh6CMcoHdppa9BV\/yM2s+NCTnxaZXoyuzljspI8x\/LjHLJuCLchAoPdOoND6mfoE7HGAajgdoFwR06I6zxN3RNQpB1RHIpmJCt+GcmAI4qld6qooO3lb\/8jkO8CBb69wapSAmvyzRvNVNPRa91ubAARkhW5DM62NjIDLN6COAWNEPZs6SfMbQ4jXNsIdXSR8ZZ8NuhO2uS9hU4+EadRYqVgn4yg1Z23d0HwQd0t0Gnw1X\/sAEIrR4sHyW0cVNMoWXkcfmM7UEq4oSCjLm6KTEhFuIR8EDm2HUEcUvcL+y0xr3Rr2YBuTVRR+bpnqffhYvyqRJILXaP2ddNrPt+a1Cl2sbL0INHVxfymPabok4Us8+jgbseBAf3iy8yOLDAQjG4z3iYVcLtgnoJnTLzTtAMC+wPYCbzoGi+hlXlBEF6FcxpU569ZT4YSIFI0xV8RXia+p7CnkaUWwmoKLBEwIG58rjqWO3+uyhvF0o\/\/RFi7QSF4U1DFy7qNQBPyoOiwEyKYZlbq4pQ6DjMYPWjBboU8NjY3qyoE\/CzwwSE75Gwk7w5DwYLs="}},"ver":"6.0.40","appversion":"200302","chid":2,"slot":[{"loid":"40,39","loid_watch_count":",0","channel":"_qqnews_custom_search_all","refresh_type":0,"recent_rot":["1,2,3"],"orders_info":["215508757,9693616,1554848392,1000,2801,110,2,CKsEMMrx8JcOOKTExfqioLSXG1AAYO32wqqapY+yEg==","215016046,9899501,1204054842,1000,4109,110,2,CKsEMMez\/wE4+fad\/47u\/sJLUNvVyZUNYKTQneXuxPaYngFyDAgBEI\/dwd33hde\/WXIECAIQAA==","214804999,14224364,2744407378,1000,606,110,2,CNkDMLuQydYFOJzXk9iVub73ZlC7rffuBGAAcgwIARDVn9eQtc2S6yNyBAgCEAA="]}],"launch":"0","wxversion":"0"}',
        "lon": "121.321859",
        "cityList": "news_news_sh",
        "loc_street": "申兰路",
        "village_name": "Unknown",
        "lastLocatingTime": str(int(timestamp / 1e3)),
        "provinceId": "12",
        "loc_city_name": "上海市",
        "loc_catalog": "基础设施:交通设施:火车站",
        "loc_province_name": "上海市",
        "loc_name": "上海虹桥站",
        "town_name": "新虹街道",
        "loc_district_name": "闵行区",
        "loc_addr": "上海市闵行区申贵路1500号",
        "lat": "31.194424",
        "cityId": "12",
        "adcode": "310112",
        "is_special_device": "0",
        "mid": "0",
        "dpi": "320",
        "qqnetwork": "wifi",
        "rom_type": "R11-user 5.1.1 NMF26X 500200210 release-keys",
        "isColdLaunch": "1",
        "real_device_width": "2.81",
        "net_proxy": "DIRECT@",
        "net_bssid": "48:A4:72:58:86:D5",
        "isMainUserLogin": "******",
        "currentChannelId": "_qqnews_custom_search_all",
        "isElderMode": "0",
        "apptype": "android",
        "islite": "0",
        "hw": "OPPO_OPPOR11",
        "global_session_id": str(timestamp),
        "screen_width": "900",
        "isClosePersonalized": "0",
        "videoAutoPlay": "1",
        "imsi": "460077203886213",
        "cpuabi": "armeabi-v7a",
        "isoem": "0",
        "currentTabId": "news_news",
        "startTimestamp": str(int(timestamp / 1e3)),
        "net_slot": "0",
        "qn-time": str(timestamp),
        "pagestartfrom": "icon",
        "mac": "48:A4:72:58:86:D5",
        "activefrom": "icon",
        "net_ssid": "R1148a4725886d57203",
        "store": "17",
        "screen_height": "1600",
        "top_activity": "NewsSearchResultListActivity",
        "real_device_height": "5",
        "origin_imei": "866174725888628",
        "network_type": "wifi",
        "origCurrentTab": "top",
        "global_info": "1|1|1|1|1|14|4|1|0|6|1|1|1||0|J309P000000000:J902P000000000:J601P900000000:A601P800217702:A601P700321102:B601P600286205:A601P500154501:A601P400161601:J601P300000000:B601P200096102:A601P100272502:A601P000261102:J601P904000000:J601P903000000:A601P902266601:A601P901291001:J601P811000000:A601P701226201:A601P622269601:A601P621294101:A601P620269601:J601P111000000:J601P110000000:A601P109107102:A601P105118803:A601P019237403:A601P016212405:J601P006000000:J603P000000000:J401P100000000:A401P000050901:J602P900000000:J602P800000000:J602P700000000:J602P600000000:A602P500267502:B602P400286004:J602P300000000:J602P200000000:J602P100000000:B602P000315504:A602P901257901:J602P616000000:A602P615304801:A602P613271701:A602P611253801:A602P516234601:A602P414259901:A602P307160708:J602P302000000:A602P208205801:J602P117000000:A602P007272801:A602P003136401:J304P000000000:J310P700000000:A310P200210802:J310P100000000:B310P020314103:A310P010301701:B310P000267107:B701P000323002:A703P000322204:A704P000309801:J702P000000000:J405P000000000:J064P400000000:J064P300000000:B064P100243802:B064P020290902:J064P010000000:J064P000000000:A085P000087701:B074P200238202:J074P040000000:B074P030315703:A074P020315602:A074P010315401:B074P000142402:J903P000000000:A267P300215801:A267P200263601:A267P100299801:B267P000300102:A073P040317201:B073P030314503:A073P020313801:J073P010000000:B073P000313603:J060P700000000:J060P300000000:J060P200000000:B060P100299703:A060P090287301:J060P020000000:J060P010000000:B060P000311102:J060P099000000:J060P016000000:A406P000313203:J403P700000000:J403P600000000:A403P200206702:B403P100246105:J403P010000000:A403P000310401:A403P602218702:B404P200262402:A404P000263407:J055P200000000:J055P090000000:J055P080000000:J055P070000000:J055P060000000:J055P050000000:J055P010000000:A055P000265801:J402P100000000:J402P090000000:J402P080000000:J402P060000000:J402P020000000:A402P000301403:J054P400000000:J054P300000000:J054P200000000:A054P100269701:B054P090289604:A054P080289702:J054P050000000:J054P040000000:A054P030288501:J054P010000000:A054P000319901:J056P000000000:A901P200252304:B901P100226405:B901P000232405:J407P000000000|1402|0|1|25|25|0|0|0||3|3|1|1|1|1|1|1|-1|0|0|5|2|0|0|0|3|0|0|1|3|0|2|0|0|2|0|0|1|0|1|1|0|0|1|0|4|0|1|1|11|20|1|0|1|1|0|0|1|4|0|1|1|41|2|51|60|0|1|0|0|1|5|1|0|0|71|0|0|1|71",
        "imsi_history": "460077203886213",
        "net_apn": "0",
    }
    res = requests.post(url, headers=self.headers, data=post_json)
    page_text = res.json()
    for one_video in page_text["secList"]:
        video_dic = {}
        try:
            one_video = one_video["newsList"][0]
            video_dic['title'] = one_video.get('title')
            video_dic['url'] = one_video.get("url")
            releaser_id = one_video.get('media_id')
            video_dic['releaser'] = one_video.get('chlname')
            video_dic['releaserUrl'] = "https://view.inews.qq.com/media/%s" % releaser_id
            release_time = int(one_video.get('timestamp'))
            video_dic['release_time'] = int(release_time * 1e3)
            video_dic['video_id'] = one_video.get('video_channel').get("video").get("vid")
            video_dic['duration'] = trans_duration(
                one_video.get('video_channel').get("video").get("duration"))
            video_dic['play_count'] = one_video.get('readCount')
            video_dic['repost_count'] = one_video.get('shareCount')
            video_dic['comment_count'] = one_video.get('comments')
            video_dic['favorite_count'] = one_video.get('likeInfo')
            video_dic['fetch_time'] = int(datetime.datetime.now().timestamp() * 1e3)
            video_dic['releaser_id_str'] = "腾讯新闻_%s" % releaser_id
            video_dic['video_img'] = one_video.get('miniProShareImage')
            video_dic['platform'] = self.platform
            video_dic["is_hot"] = 1
            video_dic["data_provider"] = "CCR"
        except Exception as e:
            print(e)
            continue
        data_list.append(video_dic)
    output_result(result_Lst=data_list,
                  platform=self.platform,
                  output_to_es_raw=True,
                  )
    data_list.clear()
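# The payload above mixes second- and millisecond-precision timestamps; a
# short illustrative sketch of the convention (_timestamp_pair is a
# hypothetical helper; field names in the comments come from the request):
def _timestamp_pair():
    now_ms = int(datetime.datetime.now().timestamp() * 1e3)  # qn-time, global_session_id
    now_s = int(now_ms / 1e3)  # lastLocatingTime, startTimestamp
    return now_ms, now_s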
def releaser_page(self, releaserUrl,
                  output_to_file=False, filepath=None,
                  releaser_page_num_max=30,
                  output_to_es_raw=False,
                  es_index=None, doc_type=None,
                  output_to_es_register=False,
                  push_to_redis=False,
                  proxies_num=None):
    """
    Get video info from the API instead of the web page HTML.
    The API can be scrolled to at most 1000 pages.
    """
    proxies = get_proxy(proxies_num)
    releaser = ""
    count = 1
    count_false = 0
    result_list = []
    releaser_id = self.get_releaser_id(releaserUrl)
    page_count = 0
    pcursor = None
    self.video_data['releaserUrl'] = releaserUrl
    while count <= releaser_page_num_max and count <= 1000 and pcursor != "no_more":
        proxies = get_proxy(proxies_num)
        # the list endpoint pages by offset: 0-20.html, 20-20.html, ...
        releaserUrl = 'http://c.m.163.com/nc/subscribe/list/%s/video/%s-20.html' % (
            releaser_id, page_count)
        try:
            if proxies:
                get_page = requests.get(releaserUrl, headers=self.headers,
                                        timeout=5, proxies=proxies)
            else:
                get_page = requests.get(releaserUrl, headers=self.headers,
                                        timeout=5)
            page_dic = get_page.json()
            data_list = page_dic.get("tab_list")
        except Exception:
            proxies = get_proxy(1)
            count_false += 1
            if count_false <= 5:
                continue
            else:
                break
        page_count += 20
        if data_list == []:
            print("no more data at releaser: %s page: %s " % (releaser, count))
            pcursor = "no_more"
            continue
        else:
            print("get data at releaser: %s page: %s" % (releaser, count))
            count += 1
        for info_dic in data_list:
            skipID = info_dic.get("skipID")
            page_data, release_url = self.one_video_page(skipID)
            video_dic = copy.deepcopy(self.video_data)
            video_dic['title'] = page_data.get('title')
            video_dic['url'] = release_url
            video_dic['releaser'] = page_data.get('topicName')
            video_dic['releaserUrl'] = "https://c.m.163.com/news/sub/%s.html" % releaser_id
            video_dic['release_time'] = int(datetime.datetime.strptime(
                info_dic.get('ptime'), "%Y-%m-%d %H:%M:%S").timestamp() * 1e3)
            video_dic['play_count'] = page_data.get("playCount") or 0
            video_dic['favorite_count'] = page_data.get('voteCount') or 0
            video_dic['comment_count'] = page_data.get('replyCount')
            video_dic['video_id'] = skipID
            video_dic['fetch_time'] = int(time.time() * 1e3)
            video_dic['duration'] = page_data.get("length")
            video_dic['releaser_id_str'] = "网易新闻_%s" % releaser_id
            video_dic['video_img'] = self.get_video_image(info_dic)
            result_list.append(video_dic)
            time.sleep(0.5)
            if len(result_list) >= 100:
                output_result(result_Lst=result_list,
                              platform=self.platform,
                              output_to_file=output_to_file,
                              filepath=filepath,
                              output_to_es_raw=output_to_es_raw,
                              es_index=es_index,
                              doc_type=doc_type,
                              output_to_es_register=output_to_es_register)
                result_list.clear()
    if result_list != []:
        output_result(result_Lst=result_list,
                      platform=self.platform,
                      output_to_file=output_to_file,
                      filepath=filepath,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index,
                      doc_type=doc_type,
                      output_to_es_register=output_to_es_register)
        result_list.clear()
    return result_list
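# A minimal sketch of the offset pagination the loop above relies on,
# assuming the observed 20-items-per-request convention (netease_list_urls
# is a hypothetical helper, not part of this crawler):
def netease_list_urls(releaser_id, pages, step=20):
    # offsets grow by 20 per request: 0-20.html, 20-20.html, 40-20.html, ...
    for offset in range(0, pages * step, step):
        yield ('http://c.m.163.com/nc/subscribe/list/%s/video/%s-20.html'
               % (releaser_id, offset))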
def releaser_page_via_m(self, releaserUrl,
                        output_to_file=False, filepath=None,
                        releaser_page_num_max=30,
                        output_to_es_raw=False,
                        output_to_es_register=False,
                        push_to_redis=False,
                        es_index=None, doc_type=None):
    releaser_id = self.get_releaser_id(releaserUrl)
    uk = self.releaser_id_to_uk(releaser_id)
    print("platform: %s releaser_id: %s uk: %s" % (self.platform, releaser_id, uk))
    result_lst = []
    video_info = self.video_data_template
    page_count = 1
    headers = {
        'Charset': 'UTF-8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
        'Content-Type': 'application/x-javascript; charset=utf-8',
        'Host': 'author.baidu.com',
        'Connection': 'Keep-Alive',
        'Accept-Encoding': 'gzip',
        'Cookie': 'BAIDUID=5B4BD931D455EA625D8B5E20BD348270:FG=1; BIDUPSID=5B4BD931D455EA625D8B5E20BD348270; PSTM=1540776027; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; H_PS_PSSID=1423_27211_21123_28131_27750_28139_20718; BDSFRCVID=Y2PsJeCCxG37oNO9K0MmeTd-epk7qPMdDVTa3J; H_BDCLCKID_SF=tR333R7oKRu_HRjYbb__-P4DHUjHfRO2X5REVMTHBPOkeqOJ2Mt5jP4NXNriJnOCfgjtXxcc5q_MoCDzbpnp05tpeGLsaPoy2K6XsJoq2RbhKROvhjntK6uQ-nnjhjnWLbneaJ5n0-nnhI3vXxPByTODyfQwXpoO0KcG_UFhHR3rsftRy6CaePk_hURK2D6aKC5bL6rJabCQe4_ZK-brKbTM0tvrbMT-027OKK85ahrcbqkxXtvI5lRBKtOh3j3zt4jMMh5xthF0hDvd-tnO-t6H-xQ0KnLXKKOLVMI-LPOkeqOJ2Mt5jP4NXNriJUrL5GnbsR5M2K3aVh6gQhjx-jtpexbH55utfnID3J; delPer=0; PSINO=2'
    }
    params1 = {
        'type': 'video',
        'tab': '9',
        'uk': uk,
        # 'ctime': '15448673604154',
        # '_': '1545633915094',
        'callback': 'jsonp5'
    }
    rq_get1 = requests.get('https://author.baidu.com/list',
                           params=params1, headers=headers)
    # responses are JSONP: strip the "jsonp5(" prefix and ");" suffix
    page_info1 = json.loads(rq_get1.text[7:-2])
    releaser = page_info1['user']['display_name']

    def handle_one_video(one, video_info, releaser, releaserUrl, platform):
        video_data = copy.deepcopy(video_info)
        video_itemid = one['attr']['itemId']
        find_asyncData = one['asyncData']
        video_data['platform'] = platform
        video_data['releaser'] = releaser
        video_data['releaserUrl'] = releaserUrl
        video_data['title'] = one['title']
        video_data['url'] = (r'https://sv.baidu.com/videoui/page/videoland?context='
                             + parse.quote('{"nid":"sv_%s"}' % one['id'][3:]))
        video_data['duration'] = trans_duration(one['timeLong'])
        video_data['video_id'] = one['article_id']
        video_data['release_time'] = int(one['publish_at']) * 1000
        video_data['fetch_time'] = int(time.time() * 1e3)
        params2 = {
            'params': json.dumps([find_asyncData]),
            'uk': uk,
            '_': str(int(time.time()) * 1000)
        }
        rq_get2 = requests.get(
            'https://mbd.baidu.com/webpage?type=homepage&action=interact&format=jsonp&callback=jsonp2',
            params=params2)
        # this callback has no trailing semicolon, hence [7:-1]
        page_info2 = json.loads(rq_get2.text[7:-1])
        try:
            video_data['play_count'] = int(
                page_info2['data']['user_list'][video_itemid]['read_num'])
        except Exception:
            video_data['play_count'] = 0
        try:
            video_data['favorite_count'] = int(
                page_info2['data']['user_list'][video_itemid]['praise_num'])
        except Exception:
            video_data['favorite_count'] = 0
        try:
            video_data['comment_count'] = int(
                page_info2['data']['user_list'][video_itemid]['comment_num'])
        except Exception:
            video_data['comment_count'] = 0
        return video_data

    while (page_info1['data']['has_more'] == 1
           and page_count < releaser_page_num_max):
        time.sleep(random.randint(4, 6))
        print("get data at page: %s" % str(page_count))
        ctime = page_info1['data']['ctime']
        for one in page_info1['data']['list']:
            one_result = handle_one_video(one, video_info, releaser,
                                          releaserUrl, self.platform)
            result_lst.append(one_result)
            if len(result_lst) >= 100:
                output_result(result_Lst=result_lst,
                              platform=self.platform,
                              output_to_file=output_to_file,
                              filepath=filepath,
                              push_to_redis=push_to_redis,
                              output_to_es_register=output_to_es_register,
                              output_to_es_raw=output_to_es_raw,
                              es_index=es_index,
                              doc_type=doc_type)
                result_lst.clear()
        params1['ctime'] = ctime
        rq_next_page = requests.get('https://author.baidu.com/list',
                                    params=params1, headers=headers)
        page_info1 = json.loads(rq_next_page.text[7:-2])
        page_count += 1
    # the loop exits before handling the final page; process it here
    for one in page_info1['data']['list']:
        one_result = handle_one_video(one, video_info, releaser,
                                      releaserUrl, self.platform)
        result_lst.append(one_result)
    output_result(result_Lst=result_lst,
                  platform=self.platform,
                  output_to_file=output_to_file,
                  filepath=filepath,
                  push_to_redis=push_to_redis,
                  output_to_es_register=output_to_es_register,
                  output_to_es_raw=output_to_es_raw,
                  es_index=es_index,
                  doc_type=doc_type)
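# The text[7:-2] / text[7:-1] slicing above is tied to the exact callback
# names (jsonp5, jsonp2). A more general unwrap, assuming the standard
# callback(payload) shape (strip_jsonp is a hypothetical helper, not part of
# this crawler):
import json
import re

def strip_jsonp(text):
    # pull the JSON payload out of "callback(...)" or "callback(...);"
    match = re.search(r'^[\w$.]+\((.*)\);?\s*$', text, re.S)
    return json.loads(match.group(1)) if match else json.loads(text)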
def releaser_page(self, releaserUrl,
                  output_to_file=False, filepath=None,
                  releaser_page_num_max=30,
                  output_to_es_raw=False,
                  es_index=None, doc_type=None,
                  output_to_es_register=False,
                  push_to_redis=False,
                  proxies_num=None, **kwargs):
    """
    Get video info from the API instead of the web page HTML.
    The API can be scrolled to at most 1000 pages.
    """
    result_list = []
    has_more = True
    count = 1
    count_false = 0
    releaser_id = self.find_releaser_id(releaserUrl)
    offset = "0"
    # vid = "AB5483CA-FCDC-42F1-AFB1-077A1%sDA" % random.randint(100000, 999999)
    ccid = "F153594D-1310-4984-A4C3-A679D4D%s" % random.randint(10000, 99999)
    openudid = "5d44f2ea1b74e3731b27e5ed8039ac29f%s" % random.randint(1000000, 9999999)
    idfa = "E3FC9054-384B-485F-9B4C-936F33D7D%s" % random.randint(100, 999)
    iid = str(random.randint(100000000000, 103000000000))
    device_id = str(random.randint(66800000000, 66990000000))
    proxies = get_proxy(proxies_num)
    while has_more and count <= releaser_page_num_max:
        print("get %s video on page %s" % (releaser_id, count))
        time.sleep(random.randint(1, 2))
        url_dic = {
            "source": "0",
            "max_cursor": offset,
            "user_id": releaser_id,
            "count": "21",
            "os_api": "23",
            "device_type": "Huawei P20",
            "ssmix": "a",
            "manifest_version_code": "985",
            "dpi": "429",
            # "uuid": "440000000189785",
            "app_name": "douyin",
            "version_name": "9.8.5",
            "ts": "1585532172",
            "app_type": "normal",
            "ac": "wifi",
            "update_version_code": "9852",
            "channel": "baidu",
            "_rticket": str(int(datetime.datetime.now().timestamp() * 1e3)),
            "device_platform": "android",
            # "iid": iid,
            "version_code": "985",
            # "cdid": "87cc1c77-cc3c-41a1-8df6-1e060b9c510b",
            # "openudid": "e44cc0264b92bcbf",
            "device_id": device_id,
            "resolution": "1080*2244",
            "os_version": "9.0.1",
            "language": "zh",
            "device_brand": "Huawei",
            "aid": "2329",
            "mcc_mnc": "46005",
        }
        # an alternative iOS parameter set, kept for reference:
        # url_dic = {
        #     "ac": "WIFI", "device_id": device_id, "os_api": "18",
        #     "app_name": "aweme", "channel": "App Store",
        #     "device_platform": "iphone", "build_number": "92113",
        #     "device_type": "iPhone9,4", "app_version": "9.2.1",
        #     "js_sdk_version": "1.43.0.1", "version_code": "9.2.1",
        #     "os_version": "13.3", "screen_width": "1242", "aid": "1128",
        #     "mcc_mnc": "", "user_id": releaser_id, "max_cursor": offset,
        #     "count": "21", "source": "0",
        # }
        # aid may be the key field for this API; if it stops working, try aid + 1
        url = "https://{1}/aweme/v1/aweme/post/?{0}".format(
            urllib.parse.urlencode(url_dic), random.choice(self.api_list))
        try:
            if proxies_num:
                get_page = requests.get(url, headers=self.headers,
                                        proxies=proxies, timeout=10)
                # get_page = retry_get_url(url, headers=self.headers,
                #                          proxies=proxies_num, timeout=10)
            else:
                get_page = requests.get(url, headers=self.headers, timeout=10)
        except Exception as e:
            proxies = get_proxy(1)
            print(e)
            continue
        page_dic = {}
        data_list = None
        try:
            page_dic = get_page.json()
            data_list = page_dic.get('aweme_list')
            if not data_list:
                # one plain retry before treating the page as a failure
                get_page = requests.get(url, headers=self.headers, timeout=10)
                page_dic = get_page.json()
                data_list = page_dic.get('aweme_list')
                if not data_list:
                    raise ValueError
            has_more = page_dic.get('has_more')
            offset = str(page_dic.get("max_cursor"))
        except Exception:
            if not data_list:
                proxies = get_proxy(1)
                count_false += 1
                if count_false >= 5:
                    break
                else:
                    continue
        if has_more is None:
            has_more = False
        if data_list == []:
            print("no data in releaser %s page %s" % (releaser_id, count))
            proxies = get_proxy(1)
            count_false += 1
            if count_false >= 5:
                has_more = False
            continue
        else:
            count_false = 0
            count += 1
        for one_video in data_list:
            video_dic = copy.deepcopy(self.video_data)
            video_dic['title'] = one_video.get('desc')
            video_dic['url'] = one_video.get('share_url')
            video_dic['releaser'] = one_video.get('author').get("nickname")
            video_dic['releaserUrl'] = releaserUrl
            release_time = one_video.get('create_time')
            video_dic['release_time'] = int(release_time * 1e3)
            video_dic['duration'] = int(one_video.get('duration') / 1000)
            video_dic['play_count'] = 0
            video_dic['repost_count'] = one_video.get('statistics').get('share_count')
            video_dic['comment_count'] = one_video.get('statistics').get('comment_count')
            video_dic['favorite_count'] = one_video.get('statistics').get('digg_count')
            video_dic['video_id'] = one_video.get('aweme_id')
            video_dic['fetch_time'] = int(datetime.datetime.now().timestamp() * 1e3)
            video_dic['releaser_id_str'] = "抖音_%s" % releaser_id
            video_dic['video_img'] = one_video.get('video').get('cover').get('url_list')[0]
            result_list.append(video_dic)
            if len(result_list) >= 100:
                output_result(result_Lst=result_list,
                              platform=self.platform,
                              output_to_file=output_to_file,
                              filepath=filepath,
                              output_to_es_raw=output_to_es_raw,
                              es_index=es_index,
                              doc_type=doc_type,
                              output_to_es_register=output_to_es_register)
                result_list.clear()
    if result_list != []:
        output_result(result_Lst=result_list,
                      platform=self.platform,
                      output_to_file=output_to_file,
                      filepath=filepath,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index,
                      doc_type=doc_type,
                      output_to_es_register=output_to_es_register)
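# The failure bookkeeping above (rotate the proxy, bump a counter, give up
# after five misses) recurs throughout this module; a condensed sketch under
# the same assumptions (fetch_json is a hypothetical helper, not part of
# this crawler):
def fetch_json(url, headers, proxies_num=None, max_failures=5):
    failures = 0
    while failures < max_failures:
        proxies = get_proxy(proxies_num)
        try:
            resp = requests.get(url, headers=headers,
                                proxies=proxies, timeout=10)
            return resp.json()
        except Exception:
            failures += 1
            proxies_num = 1  # fall back to the rotating pool, as the loop does
    return None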
def search_page(self, title, *args):
    data_list = []
    timestamp = int(datetime.datetime.now().timestamp() * 1e8)
    headers = {
        "Host": "apissl.ksapisrv.com",
        "Accept": "application/json",
        "Content-Type": "application/x-www-form-urlencoded",
        "Connection": "keep-alive",
        "X-REQUESTID": "158332601966266545",
        "Cookie": "region_ticket=RT_A900E3AE922501DB6DED77DD16B0143B1099175ABCF9B34E20A04FED32A04",
        "User-Agent": "kwai-ios",
        "Accept-Language": "zh-Hans-CN;q=1",
        "Accept-Encoding": "gzip, deflate",
    }
    url = "https://apissl.ksapisrv.com/rest/n/search/feed?kcv=188&kpf=IPAD&net=_5&appver=7.1.2.1527&kpn=KUAISHOU&mod=iPad6%2C11&c=a&sys=ios13.3.1&sh=2048&ver=7.1&isp=&did=8DE774CA-8F57-488F-8889-79D60AEDE388&ud=1815100604&browseType=1&sw=1536&egid=DFPF406D78B30ED8A082E3613DB9F5E4EA35664F0527B2BE54F7719A7A61C90B"
    post_json = {
        "__NS_sig3": "2207493808d8530faf9fb99c7f0aa3339c090c83fa",
        "__NStokensig": "e54c8fcb82174761862807ce5ffbe8d18af9898fc5464105a71d4a58cf53a5aa",
        "client_key": "56c3713c",
        "country_code": "cn",
        "global_id": "DFPF406D78B30ED8A082E3613DB9F5E4EA35664F0527B2BE54F7719A7A61C90B",
        "isRecoRequest": "0",
        "keyword": title,
        "kuaishou.api_st": "Cg9rdWFpc2hvdS5hcGkuc3QSoAGx9VGkywy5Mvl97AA_lk1kfdCLTA0X61478qJ0Kb3fKEpZBlNM-jYcJ52ydyEaBpzbWAy7QEed0FcZ5t9zUltbXHaviXuUq0FiZkbuJXUs3md9v3Iro_NQ9NYWn7pzyc59CERXsLzM6G3N3DH-vUt8qT9Mg-HAAaLZngM_7UlQxkrlNmb4k9AJnz0BjxcXSK7FbQgde-U1B6Gt1VQFTLZHGhJrNN-fL8lNH4AKLNFPdkh-L7kiIAXhY7DWXa-tqY6EzPQmA3ZVd0M8b60rSf2LVxxji-LqKAUwAQ",
        "language": "zh-Hans-CN;q=1",
        # "sig": "f67fd34ff32ad20b8648f8013ba8cf14",
        "token": "dd79d261eeec4ef1b0370d07691a8122-1815100604",
    }
    # sig is a signature derived from the keyword; we have not been able to
    # reproduce it, so the request is sent without it
    res = requests.post(url, headers=headers, data=post_json)
    page_text = res.json()
    for one_video in page_text["mixFeeds"]:
        video_dic = {}
        try:
            one_video = one_video["feed"]
            # share_info looks like "...photoId=XXX&userId=YYY..."
            photoid = None
            releaser_id = None
            for fragment in one_video.get('share_info').split("&"):
                if "photoId=" in fragment:
                    photoid = fragment.replace("photoId=", "")
                elif "userId=" in fragment:
                    releaser_id = fragment.replace("userId=", "")
            video_dic['video_id'] = photoid
            video_dic['title'] = one_video.get('caption')
            video_dic['url'] = "https://live.kuaishou.com/u/%s/%s" % (releaser_id, photoid)
            video_dic['releaser'] = one_video.get('user_name')
            video_dic['releaserUrl'] = 'https://live.kuaishou.com/profile/%s' % releaser_id
            release_time = int(one_video.get('timestamp'))
            video_dic['release_time'] = int(release_time)
            video_dic['duration'] = int(one_video.get("duration") / 1000)
            video_dic['play_count'] = one_video.get('view_count')
            video_dic['repost_count'] = one_video.get('share_count')
            video_dic['comment_count'] = one_video.get('comment_count')
            video_dic['favorite_count'] = one_video.get('like_count')
            video_dic['fetch_time'] = int(datetime.datetime.now().timestamp() * 1e3)
            video_dic['releaser_id_str'] = "kwai_%s" % releaser_id
            video_dic['platform'] = self.platform
            video_dic["is_hot"] = 1
            video_dic["data_provider"] = "CCR"
        except Exception as e:
            print(e)
            continue
        data_list.append(video_dic)
    output_result(result_Lst=data_list,
                  platform=self.platform,
                  output_to_es_raw=True,
                  )
    data_list.clear()
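# The manual split("&") scan above can be expressed with the standard
# library; a sketch assuming share_info is an ordinary query string
# (ids_from_share_info is a hypothetical helper, not part of this crawler):
from urllib.parse import parse_qs

def ids_from_share_info(share_info):
    params = parse_qs(share_info)
    photo_id = params.get('photoId', [None])[0]
    user_id = params.get('userId', [None])[0]
    return photo_id, user_id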