def process_one_video(self, line):
    video_info = copy.deepcopy(self.video_data)
    try:
        video_info['title'] = line.find('a', {'target': 'video'})['title']
    except Exception:
        video_info['title'] = None
    try:
        url = line.find('a', {'target': 'video'})['href']
        video_info['url'] = 'https:' + url
    except Exception:
        video_info['url'] = None
    try:
        play_count_str = line.find('span', {'class': 'v-num'}).text
        video_info['play_count'] = trans_play_count(play_count_str)
    except Exception:
        video_info['play_count'] = 0
        # logging.warning("can't get play_count at page %s" % video_info['url'])
    try:
        release_time_str = line.find('span', {'class': 'v-publishtime'}).text
        video_info['release_time'] = trans_strtime_to_timestamp(
            input_time=release_time_str, missing_year=True)
    except Exception:
        # Bug fix: the original assigned 0 to the local release_time_str
        # instead of video_info['release_time'].
        video_info['release_time'] = 0
        # logging.warning("can't get release_time at page %s" % video_info['url'])
    try:
        dura_str = line.find('span', {'class': 'v-time'}).text
        video_info['duration'] = trans_duration(dura_str)
    except Exception:
        video_info['duration'] = 0
        # logging.warning("can't get duration at page %s" % video_info['url'])
    video_info['fetch_time'] = int(time.time() * 1e3)
    return video_info
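
# trans_duration (like trans_play_count and trans_strtime_to_timestamp above)
# is imported from a shared helpers module that is not part of this file.
# A minimal sketch of what it likely does, assuming "HH:MM:SS" / "MM:SS"
# input -- a hypothetical reimplementation, not the project's actual helper:
def _trans_duration_sketch(dura_str):
    """Convert a 'HH:MM:SS' or 'MM:SS' string to total seconds."""
    seconds = 0
    for part in dura_str.strip().split(':'):
        seconds = seconds * 60 + int(part)  # shift previous units up
    return seconds

# Example: _trans_duration_sketch("01:02:03") -> 3723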
def parse_video_page_html(self, html):
    # The queued payload is the page URL and the raw HTML joined with the
    # literal marker 'fangyuchenggoalkeeper'.
    page_lst = html.split('fangyuchenggoalkeeper')
    url = page_lst[0]
    page = page_lst[1]
    soup = BeautifulSoup(page, 'html.parser')
    try:
        title = soup.find('h1', {'class': 'td-playbase__title'}).span.text
    except Exception:
        title = None
    try:
        releaser = soup.find('a', {'class': 'td-play__userinfo__name'}).text
    except Exception:
        releaser = None
    try:
        midsteptime = soup.find(
            'div', {'class': 'td-play__videoinfo__details-box__time'}).text[:-2]
        release_time = int(
            datetime.datetime.strptime(
                midsteptime, '%Y-%m-%d %H:%M:%S').timestamp() * 1e3)
    except Exception:
        release_time = None
    try:
        releaserUrl = soup.find('a', {'class': 'td-play__userinfo__name'})['href']
    except Exception:
        releaserUrl = None
    try:
        find_play_count = ' '.join(re.findall('total_vv.*stripe_bottom', page))
        replace_comma = find_play_count.replace(',', '')
        play_count_str = ' '.join(re.findall(r'total_vv":"\d+', replace_comma))
        play_count = int(' '.join(re.findall(r'\d+', play_count_str)))
    except Exception:
        play_count = 0
    try:
        find_dura = re.findall(r'stripe_bottom":"\d+:\d+', page)
        dura_str = ' '.join(find_dura).split('":"')[-1]
        duration = trans_duration(dura_str)
    except Exception:
        duration = 0
    fetch_time = int(time.time() * 1e3)
    info_dic = {
        'platform': self.platform,
        'title': title,
        'url': url,
        'duration': duration,
        'releaser': releaser,
        'release_time': release_time,
        'releaserUrl': releaserUrl,
        'play_count': play_count,
        'fetch_time': fetch_time,
    }
    return info_dic
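
# parse_video_page_html expects the caller to glue the page URL and the raw
# HTML together with the literal marker 'fangyuchenggoalkeeper' before
# queueing. Hedged usage sketch (the producer side is not shown here):
#
#     payload = page_url + 'fangyuchenggoalkeeper' + raw_html
#     info_dic = crawler.parse_video_page_html(payload)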
def video_page(self, url, channel=None):
    """
    iQiyi reports a hot index instead of a play count, so this crawler
    was updated accordingly on 2018-11-23.
    """
    url = self.rebuild_video_url(url)
    start = time.time()
    get_page = retry_get_url(url)
    print("first request costs %s seconds" % (time.time() - start))
    if get_page is None:
        print('Failed to get html page for url: %s' % url)
        return None
    get_page.encoding = 'utf-8'
    page = get_page.text
    soup = BeautifulSoup(page, 'html.parser')
    page_info = soup.find("div", {"is": "i71-play"})[":page-info"]
    page_dic = json.loads(page_info)
    title = page_dic["tvName"]
    url = page_dic["pageUrl"]
    dura_str = page_dic["duration"]
    duration = trans_duration(dura_str)
    try:
        releaser = page_dic["user"]["name"]
        releaserUrl = page_dic["user"]["profileUrl"]
    except Exception:
        releaser = None
        releaserUrl = None
    video_info = soup.find("div", {"is": "i71-play"})[":video-info"]
    video_dic = json.loads(video_info)
    release_time = video_dic["firstPublishTime"]
    tvId = video_dic["tvId"]
    start1 = time.time()
    hot_idx_url = "https://pub.m.iqiyi.com/jp/h5/count/hotDisplay/?qipuId=%s" % tvId
    get_hot_idx = retry_get_url(hot_idx_url)
    print("second request costs %s seconds" % (time.time() - start1))
    hot_idx_str = get_hot_idx.text
    hot_idx = int(
        re.findall(r"\d+", ' '.join(re.findall(r'"count":\d+', hot_idx_str)))[0])
    fetch_time = int(datetime.datetime.timestamp(datetime.datetime.now()) * 1e3)
    video_page_dict = copy.deepcopy(self.video_data)
    video_page_dict["title"] = title
    video_page_dict["url"] = url
    video_page_dict["duration"] = duration
    video_page_dict["releaser"] = releaser
    video_page_dict["releaserUrl"] = releaserUrl
    video_page_dict["release_time"] = release_time
    video_page_dict["hot_idx"] = hot_idx
    video_page_dict["fetch_time"] = fetch_time
    video_page_dict["tvId"] = tvId
    if channel is not None:
        video_page_dict["channel"] = channel
    return video_page_dict
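
# retry_get_url comes from a shared requests helper that is not defined in
# this file. A minimal sketch under the assumption that it retries
# requests.get a few times and returns None on persistent failure, which is
# consistent with the None check above (hypothetical, not the project's
# actual implementation):
def _retry_get_url_sketch(url, retries=3, timeout=5):
    import requests
    for _ in range(retries):
        try:
            resp = requests.get(url, timeout=timeout)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            continue  # network hiccup; try again
    return None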
def video_page(self, url):
    video_info = copy.deepcopy(self.video_data)
    get_page = requests.get(url)
    page = get_page.text
    soup = BeautifulSoup(page, 'html.parser')
    try:
        video_info['title'] = soup.find('h1', {'class': 'td-playbase__title'}).span.text
    except Exception:
        video_info['title'] = None
    try:
        video_info['releaser'] = soup.find('a', {'class': 'td-play__userinfo__name'}).text
    except Exception:
        video_info['releaser'] = None
    try:
        midsteptime = soup.find(
            'div', {'class': 'td-play__videoinfo__details-box__time'}).text[:-2]
        video_info['release_time'] = int(
            datetime.datetime.strptime(
                midsteptime, '%Y-%m-%d %H:%M:%S').timestamp() * 1e3)
    except Exception:
        video_info['release_time'] = None
    try:
        video_info['releaserUrl'] = soup.find(
            'a', {'class': 'td-play__userinfo__name'})['href']
    except Exception:
        video_info['releaserUrl'] = None
    try:
        find_play_count = ' '.join(re.findall('total_vv.*stripe_bottom', page))
        replace_comma_pcnt = find_play_count.replace(',', '')
        play_count_str = ' '.join(re.findall(r'total_vv":"\d+', replace_comma_pcnt))
        video_info['play_count'] = int(' '.join(re.findall(r'\d+', play_count_str)))
    except Exception:
        video_info['play_count'] = 0
    try:
        find_comment_count = ' '.join(re.findall('total_comment.*recommend', page))
        replace_comma_ccnt = find_comment_count.replace(',', '')
        comment_count_str = ' '.join(
            re.findall(r'total_comment":"\d+', replace_comma_ccnt))
        video_info['comment_count'] = int(
            ' '.join(re.findall(r'\d+', comment_count_str)))
    except Exception:
        video_info['comment_count'] = 0
    try:
        find_dura = re.findall(r'stripe_bottom":"\d+:\d+', page)
        dura_str = ' '.join(find_dura).split('":"')[-1]
        video_info['duration'] = trans_duration(dura_str)
    except Exception:
        video_info['duration'] = 0
    video_info['fetch_time'] = int(time.time() * 1e3)
    video_info['url'] = url
    print("get video data at %s" % url)
    return video_info
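
# The play-count extraction above digs a 'total_vv' field out of inline JSON
# with three regex passes. A walk-through on a made-up fragment (illustrative
# data, not a real page):
#
#     _frag = '"total_vv":"1,234,567","stripe_bottom":"12:34"'
#     ' '.join(re.findall('total_vv.*stripe_bottom', _frag))
#         # -> 'total_vv":"1,234,567","stripe_bottom'
#     # after .replace(',', ''):  'total_vv":"1234567""stripe_bottom'
#     re.findall(r'total_vv":"\d+', 'total_vv":"1234567""stripe_bottom')
#         # -> ['total_vv":"1234567']
#     int(' '.join(re.findall(r'\d+', 'total_vv":"1234567')))  # -> 1234567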
def handle_one_video(one, video_info, releaser, releaserUrl, platform):
    video_data = copy.deepcopy(video_info)
    video_itemid = one['attr']['itemId']
    find_asyncData = one['asyncData']
    video_data['platform'] = platform
    video_data['releaser'] = releaser
    video_data['releaserUrl'] = releaserUrl
    video_data['title'] = one['title']
    video_data['url'] = ('https://sv.baidu.com/videoui/page/videoland?context='
                         + parse.quote('{"nid":"sv_%s"}' % one['id'][3:]))
    video_data['duration'] = trans_duration(one['timeLong'])
    video_data['video_id'] = one['article_id']
    video_data['release_time'] = int(one['publish_at']) * 1000
    video_data['fetch_time'] = int(time.time() * 1e3)
    params2 = {
        'params': json.dumps([find_asyncData]),
        'uk': uk,
        '_': str(int(time.time()) * 1000),
    }
    rq_get2 = requests.get(
        'https://mbd.baidu.com/webpage?type=homepage&action=interact'
        '&format=jsonp&callback=jsonp2',
        params=params2)
    # The response is jsonp; strip the 'jsonp2(' prefix and trailing ')'.
    page_info2 = json.loads(rq_get2.text[7:-1])
    try:
        video_data['play_count'] = int(
            page_info2['data']['user_list'][video_itemid]['read_num'])
    except Exception:
        video_data['play_count'] = 0
    try:
        video_data['favorite_count'] = int(
            page_info2['data']['user_list'][video_itemid]['praise_num'])
    except Exception:
        video_data['favorite_count'] = 0
    try:
        video_data['comment_count'] = int(
            page_info2['data']['user_list'][video_itemid]['comment_num'])
    except Exception:
        video_data['comment_count'] = 0
    return video_data
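
# The interact endpoint answers in jsonp: the body is wrapped as
# jsonp2({...}), so text[7:-1] drops the 7-character 'jsonp2(' prefix and
# the trailing ')'. Illustrative (made-up payload):
#
#     _body = 'jsonp2({"data": {"user_list": {}}})'
#     json.loads(_body[7:-1])  # -> {'data': {'user_list': {}}}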
def video_page_selenium(self, task=0):
    # Fetch single-episode videos with Selenium; vid is not available here.
    self.driver = webdriver.Chrome(options=self.chrome_options)
    self.driver.maximize_window()
    has_data = rds_get.dbsize()
    while has_data:
        keys = rds_get.randomkey()
        res = rds_get.hgetall(keys)
        has_data = rds_get.dbsize()
        try:
            self.driver.get(res["url"])
            time.sleep(5)
            video_list = self.driver.find_elements_by_xpath(
                "//div[@id='rightPlayList']//li")
            for count, video_obj in enumerate(video_list):
                try:
                    ActionChains(self.driver).click(video_obj).perform()
                    time.sleep(2)
                except Exception:
                    continue
                self.driver.implicitly_wait(10)
                title = self.driver.find_element_by_xpath(
                    "//h1[@class='player-title']").text
                duration_str = self.driver.find_element_by_xpath(
                    "//iqpspan[@class='iqp-time-dur']").text
                duration = trans_duration(duration_str)
                # Abandoned scraping paths, removed for clarity: reading the
                # play-count index (指数) from the hot-chart overlay via
                # ActionChains mouse moves, and the language / style /
                # project tags and favorite count from the intro panel.
                print("task ", task)
                one_video_dic = {
                    "url": res["url"],
                    "video_url": self.driver.current_url,
                    "video_title": title,
                    "album": res["title"],
                    "duration": duration,
                }
                self.parse_single_data(one_video_dic,
                                       one_video_dic["video_url"])
            rds_get.delete(keys)
        except Exception as e:
            print(e, res["url"])
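
# self.chrome_options is configured elsewhere in this class. A plausible
# minimal setup, assuming a headless crawl is acceptable (hypothetical
# values, not the project's actual configuration):
def _build_chrome_options_sketch():
    from selenium import webdriver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')            # no visible browser window
    options.add_argument('--disable-gpu')
    options.add_argument('--window-size=1920,1080')
    return options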
def one_video_page(self, title, url):
    video_obj_list = self.driver.find_elements_by_xpath(
        "//div[@id='eplist_module']//li")
    if video_obj_list:
        action = ActionChains(self.driver)
        video_name_tags = self.driver.find_elements_by_xpath(
            "//i[@class='mode-change iconfont icon-ep-list-simple']")
        if video_name_tags:
            action.move_to_element(video_name_tags[0]).click().perform()
        del action
        time.sleep(0.1)
        video_obj_list = self.driver.find_elements_by_xpath(
            "//div[@id='eplist_module']//li")
        for video_count, video_obj in enumerate(video_obj_list):
            self.driver.implicitly_wait(10)
            action = ActionChains(self.driver)
            action.click(video_obj).perform()
            del action
            self.driver.execute_script("window.scrollBy(0,1000)")
            time.sleep(0.2)
            video_title = video_obj.text
            if_pay = ""
            if "\n" in video_title:
                # Bug fix: maxsplit must be 1, not -1, or a title containing
                # several newlines raises ValueError on unpacking.
                video_title, if_pay = video_title.split("\n", 1)
            comment_count_list = self.driver.find_elements_by_xpath(
                "//span[@class='results']")
            if comment_count_list:
                comment_count = comment_count_list[0].text
            else:
                comment_count = 0
            video_id = self.driver.find_element_by_xpath("//a[@class='av-link']")
            video_url = video_id.get_attribute("href")
            barrage_count_list = self.driver.find_elements_by_xpath(
                "//span[@class='bilibili-player-video-info-danmaku-number']")
            if barrage_count_list:
                barrage_count = barrage_count_list[0].text
            else:
                barrage_count = "-"
            duration = self.driver.find_elements_by_xpath(
                '//*[@id="bilibiliPlayer"]/div[1]/div[1]/div[10]/div[2]/div[2]/div[1]/div[3]/div/span[3]')
            try:
                duration = trans_duration(duration[0].text)
                print(video_count, duration)
            except Exception:
                duration = ""
            project_name = "bilibili_%s_%s" % (title, video_title)
            dic = {
                "title": title,
                "video_title": video_title,
                "if_pay": if_pay,
                "comment_count": comment_count,
                "url": url,
                "video_url": video_url,
                "video_id": video_id.text,
                "barrage_count": barrage_count,
                "duration": duration,
                "video_count": video_count + 1,
            }
            self.parse_single_data(dic, project_name)
    else:
        self.driver.execute_script("window.scrollBy(0,1000)")
        video_title = self.driver.find_element_by_xpath(
            '//*[@id="bilibiliPlayer"]/div[1]/div[1]/div[1]/div[1]').text
        if_pay = ""
        if "\n" in video_title:
            video_title, if_pay = video_title.split("\n", 1)
        comment_count = self.driver.find_element_by_xpath(
            "//span[@class='results']").text
        video_id = self.driver.find_element_by_xpath("//a[@class='av-link']")
        video_url = video_id.get_attribute("href")
        barrage_count = self.driver.find_element_by_xpath(
            "//span[@class='bilibili-player-video-info-danmaku-number']").text
        duration = self.driver.find_elements_by_xpath(
            '//*[@id="bilibiliPlayer"]/div[1]/div[1]/div[10]/div[2]/div[2]/div[1]/div[3]/div/span[3]')
        try:
            duration = trans_duration(duration[0].text)
            print(duration)
        except Exception:
            duration = 0
        project_name = "bilibili_%s_%s" % (title, video_title)
        dic = {
            "title": title,
            "video_title": video_title,
            "if_pay": if_pay,
            "comment_count": comment_count,
            "url": url,
            "video_url": video_url,
            "video_id": video_id.text,
            "barrage_count": barrage_count,
            "duration": duration,
            "video_count": 1,
        }
        self.parse_single_data(dic, project_name)
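
# Episode entries render the title and a pay badge on separate lines, so the
# maxsplit=1 split above peels the badge off. Illustrative (made-up element
# text):
#
#     "第1话 标题\n会员专享".split("\n", 1)  # -> ['第1话 标题', '会员专享']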
def releaser_page_web(self, releaserUrl,
                      output_to_file=False,
                      filepath=None,
                      releaser_page_num_max=30,
                      output_to_es_raw=False,
                      output_to_es_register=False,
                      push_to_redis=False,
                      es_index=None,
                      doc_type=None,
                      fetchFavoriteCommnt=True):
    pid = os.getpid()
    releaser_id = self.get_releaser_id(releaserUrl)
    print('releaser_id is %s' % releaser_id)
    page_num = 0
    has_more = True
    ctime = ""
    count_false = 0
    proxies = get_proxy_dic()
    while page_num <= releaser_page_num_max and has_more:
        post_url = ('https://haokan.baidu.com/haokan/wiseauthor'
                    '?app_id={0}&_api=1&_skip={1}&ctime={2}&_limit=10'
                    '&video_type=media&sort_type=sort_by_time').format(
                        releaser_id, page_num, ctime)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
            'referer': 'https://haokan.baidu.com/haokan/wiseauthor?app_id=1564003728536358',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'accept': '*/*',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh,zh-CN;q=0.9',
            'content-type': 'application/x-www-form-urlencoded',
        }
        try:
            if page_num == 0:
                for loop in range(5):
                    get_page = requests.get(releaserUrl, headers=headers,
                                            timeout=3, proxies=proxies)
                    page_dic, fans_num = self.web_first_pag(get_page.text)
                    if page_dic['apiData']['video']['results']:
                        page_num += 1
                        break
            else:
                get_page = requests.get(post_url, headers=headers, timeout=3)
                page_dic = get_page.json()
                page_num += 1
        except Exception:
            continue
        try:
            info_lst = page_dic['apiData']['video']['results']
        except Exception:
            info_lst = []
        try:
            ctime = page_dic['apiData']['video']['ctime']
            has_more = bool(page_dic['apiData']['video']['has_more'])
        except Exception:
            has_more = False
        if info_lst != []:
            count_false = 0
            print("Process %s is processing %s at page %s"
                  % (pid, releaser_id, page_num))
            time.sleep(int(random.uniform(1, 2)))
            for line in info_lst:
                video_data = copy.deepcopy(self.video_data_template)
                video_data['title'] = line['content']['title']
                video_id = line['content']['vid']
                video_data['video_id'] = video_id
                video_data['url'] = line['content']['video_short_url']
                video_data['play_count'] = line['content']['playcnt']
                video_data['favorite_count'] = int(line['content']['praiseNum'])
                try:
                    video_data['comment_count'] = int(line['content']['commentNum'])
                except Exception:
                    video_data['comment_count'] = 0
                video_data['releaser_followers_count'] = int(fans_num)
                try:
                    video_data['duration'] = trans_duration(
                        line['content']['duration'])
                except Exception:
                    video_data['duration'] = 0
                video_data['releaser'] = line['content']['author']
                video_data['releaser_id_str'] = "haokan_%s" % line['content']['authorid']
                video_data['releaserUrl'] = (
                    'https://haokan.baidu.com/haokan/wiseauthor?app_id='
                    + line['content']['authorid'])
                video_data['fetch_time'] = int(time.time() * 1e3)
                releaser_time_str = line['content']['publish_time']
                video_data['release_time'] = trans_strtime_to_timestamp(
                    input_time=releaser_time_str)
                print(video_id, releaser_time_str,
                      datetime.datetime.fromtimestamp(
                          video_data['release_time'] / 1000),
                      page_num)
                yield video_data
        else:
            count_false += 1
            if count_false < 5:
                continue
            else:
                break
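
# releaser_page_web is a generator: each yielded dict is one video record
# for the releaser. Hedged usage sketch (the class name HaokanCrawler is
# illustrative, not necessarily the real one):
#
#     crawler = HaokanCrawler()
#     url = 'https://haokan.baidu.com/haokan/wiseauthor?app_id=1564003728536358'
#     for video in crawler.releaser_page_web(url, releaser_page_num_max=3):
#         print(video['title'], video['play_count'])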
def search_page(self, title=None, *args, **kwargs):
    data_list = []
    timestamp = int(datetime.datetime.now().timestamp() * 1e3)
    url = "https://r.inews.qq.com/search?chlid=_qqnews_custom_search_all&search_from=&needSearchTabs=1&needSpreadAds=1&rtAd=1&new_user=0&uid=48a4725886d57203&omgid=&trueVersion=6.0.40&qimei=866174725888628&devid=866174725888628&appver=22_android_6.0.40&Cookie=lskey%3D;skey%3D;uin%3D;%20luin%3D;logintype%3D0;%20main_login%3D;%20&qn-sig=07db3b98ab9133d39b8b053fa1c51bd9&qn-rid=1002_2f55f6ab-2eb6-45e5-a4df-6dd9778c8b9d&qn-newsig=39b264b07173439d052ff2d6875cb7bc6aa47770dea55c7b64addee42138715a"
    post_json = {
        "search_type": "all",
        "query": title,
        "cp_type": "0",
        "disable_qc": "0",
        "searchStartFrom": "header",
        "launchSearchFrom": "billboard",
        "isDefault": "0",
        "searchTag": title,
        "adReqData": '{"adtype":0,"pf":"aphone","app_channel":"17","ext":{"mob":{"mobstr":"AdiIlDlcnKXLQu1Gx+HOa9fvgiA9BRLUAJ+RowxbYWkHaon9eDa0Qwt66FFNIY+xQHqSdGqfLc6p9ylswsJt1g4qWDeFDIxT6590GrPXznUizTPR0SutVVVQrHa1pbvX4WGx3yOrDNHGJCSrP38Gxej3\/ixgaVTB84d6i7sXgUhFCzcs3pS+DNShM79K7bIwO5U38eccvqle6nYKvELivuDIVr46chKdSokttQzbmf7OUSutGSHdn1+pihXvbFDkzgD+ut6PT\/G1E+O8eHwjZBf7K4Y8tpPABOH182j7JA6xpvoAP8r1WaHh73EtA5+T1M2dU3LtOMC0Sv\/Ngcf6btjefIkMDVoY+hWb8yKKd65UHSYvzpzLEdFNuEV8Sm33B789P9fCqLbnjf11OokPFjtC\/ORvR0dHItka56fkSNAZ2D+rmH8PPbMhZxSa\/bgOZywy2i8yu\/JRg8Rv8zRu4FkB6\/jIXkGCoWI1S7jUfnTIxCHu8iFOGo+Jr4VzMzqbnsi7XWhvKBye\/hPJkrISvw0wg5kg\/TPoj5Yu7aHH2pk31+uIbFRMFIzyj3p0I+yNmvpJECr4MuQmIXf8OP5OUlNVcDuZoXkyR4xy8ON1ou2Vtx+LQ\/x9xK2\/VR7up5apAPQMzmuzTOMcizdpO3FkrcXh0baOYJ7drGJWx4EO\/6nP9Y6J3GAU+YZsc+hCE3XHJpuZsfRsM2i7M4FnrZGz948VfFhY50Zk09eqK7y\/QsS++6su71tzvghFW0u3FOe1WMDvu3c4mMyYKIHkPQtGd5paAR81Xr6\/tGrhjh6CMcoHdppa9BV\/yM2s+NCTnxaZXoyuzljspI8x\/LjHLJuCLchAoPdOoND6mfoE7HGAajgdoFwR06I6zxN3RNQpB1RHIpmJCt+GcmAI4qld6qooO3lb\/8jkO8CBb69wapSAmvyzRvNVNPRa91ubAARkhW5DM62NjIDLN6COAWNEPZs6SfMbQ4jXNsIdXSR8ZZ8NuhO2uS9hU4+EadRYqVgn4yg1Z23d0HwQd0t0Gnw1X\/sAEIrR4sHyW0cVNMoWXkcfmM7UEq4oSCjLm6KTEhFuIR8EDm2HUEcUvcL+y0xr3Rr2YBuTVRR+bpnqffhYvyqRJILXaP2ddNrPt+a1Cl2sbL0INHVxfymPabok4Us8+jgbseBAf3iy8yOLDAQjG4z3iYVcLtgnoJnTLzTtAMC+wPYCbzoGi+hlXlBEF6FcxpU569ZT4YSIFI0xV8RXia+p7CnkaUWwmoKLBEwIG58rjqWO3+uyhvF0o\/\/RFi7QSF4U1DFy7qNQBPyoOiwEyKYZlbq4pQ6DjMYPWjBboU8NjY3qyoE\/CzwwSE75Gwk7w5DwYLs="}},"ver":"6.0.40","appversion":"200302","chid":2,"slot":[{"loid":"40,39","loid_watch_count":",0","channel":"_qqnews_custom_search_all","refresh_type":0,"recent_rot":["1,2,3"],"orders_info":["215508757,9693616,1554848392,1000,2801,110,2,CKsEMMrx8JcOOKTExfqioLSXG1AAYO32wqqapY+yEg==","215016046,9899501,1204054842,1000,4109,110,2,CKsEMMez\/wE4+fad\/47u\/sJLUNvVyZUNYKTQneXuxPaYngFyDAgBEI\/dwd33hde\/WXIECAIQAA==","214804999,14224364,2744407378,1000,606,110,2,CNkDMLuQydYFOJzXk9iVub73ZlC7rffuBGAAcgwIARDVn9eQtc2S6yNyBAgCEAA="]}],"launch":"0","wxversion":"0"}',
        "lon": "121.321859",
        "cityList": "news_news_sh",
        "loc_street": "申兰路",
        "village_name": "Unknown",
        "lastLocatingTime": str(int(timestamp / 1e3)),
        "provinceId": "12",
        "loc_city_name": "上海市",
        "loc_catalog": "基础设施:交通设施:火车站",
        "loc_province_name": "上海市",
        "loc_name": "上海虹桥站",
        "town_name": "新虹街道",
        "loc_district_name": "闵行区",
        "loc_addr": "上海市闵行区申贵路1500号",
        "lat": "31.194424",
        "cityId": "12",
        "adcode": "310112",
        "is_special_device": "0",
        "mid": "0",
        "dpi": "320",
        "qqnetwork": "wifi",
        "rom_type": "R11-user 5.1.1 NMF26X 500200210 release-keys",
        "isColdLaunch": "1",
        "real_device_width": "2.81",
        "net_proxy": "DIRECT@",
        "net_bssid": "48:A4:72:58:86:D5",
        "isMainUserLogin": "******",
        "currentChannelId": "_qqnews_custom_search_all",
        "isElderMode": "0",
        "apptype": "android",
        "islite": "0",
        "hw": "OPPO_OPPOR11",
        "global_session_id": str(timestamp),
        "screen_width": "900",
        "isClosePersonalized": "0",
        "videoAutoPlay": "1",
        "imsi": "460077203886213",
        "cpuabi": "armeabi-v7a",
        "isoem": "0",
        "currentTabId": "news_news",
        "startTimestamp": str(int(timestamp / 1e3)),
        "net_slot": "0",
        "qn-time": str(timestamp),
        "pagestartfrom": "icon",
        "mac": "48:A4:72:58:86:D5",
        "activefrom": "icon",
        "net_ssid": "R1148a4725886d57203",
        "store": "17",
        "screen_height": "1600",
        "top_activity": "NewsSearchResultListActivity",
        "real_device_height": "5",
        "origin_imei": "866174725888628",
        "network_type": "wifi",
        "origCurrentTab": "top",
        "global_info": "1|1|1|1|1|14|4|1|0|6|1|1|1||0|J309P000000000:J902P000000000:J601P900000000:A601P800217702:A601P700321102:B601P600286205:A601P500154501:A601P400161601:J601P300000000:B601P200096102:A601P100272502:A601P000261102:J601P904000000:J601P903000000:A601P902266601:A601P901291001:J601P811000000:A601P701226201:A601P622269601:A601P621294101:A601P620269601:J601P111000000:J601P110000000:A601P109107102:A601P105118803:A601P019237403:A601P016212405:J601P006000000:J603P000000000:J401P100000000:A401P000050901:J602P900000000:J602P800000000:J602P700000000:J602P600000000:A602P500267502:B602P400286004:J602P300000000:J602P200000000:J602P100000000:B602P000315504:A602P901257901:J602P616000000:A602P615304801:A602P613271701:A602P611253801:A602P516234601:A602P414259901:A602P307160708:J602P302000000:A602P208205801:J602P117000000:A602P007272801:A602P003136401:J304P000000000:J310P700000000:A310P200210802:J310P100000000:B310P020314103:A310P010301701:B310P000267107:B701P000323002:A703P000322204:A704P000309801:J702P000000000:J405P000000000:J064P400000000:J064P300000000:B064P100243802:B064P020290902:J064P010000000:J064P000000000:A085P000087701:B074P200238202:J074P040000000:B074P030315703:A074P020315602:A074P010315401:B074P000142402:J903P000000000:A267P300215801:A267P200263601:A267P100299801:B267P000300102:A073P040317201:B073P030314503:A073P020313801:J073P010000000:B073P000313603:J060P700000000:J060P300000000:J060P200000000:B060P100299703:A060P090287301:J060P020000000:J060P010000000:B060P000311102:J060P099000000:J060P016000000:A406P000313203:J403P700000000:J403P600000000:A403P200206702:B403P100246105:J403P010000000:A403P000310401:A403P602218702:B404P200262402:A404P000263407:J055P200000000:J055P090000000:J055P080000000:J055P070000000:J055P060000000:J055P050000000:J055P010000000:A055P000265801:J402P100000000:J402P090000000:J402P080000000:J402P060000000:J402P020000000:A402P000301403:J054P400000000:J054P300000000:J054P200000000:A054P100269701:B054P090289604:A054P080289702:J054P050000000:J054P040000000:A054P030288501:J054P010000000:A054P000319901:J056P000000000:A901P200252304:B901P100226405:B901P000232405:J407P000000000|1402|0|1|25|25|0|0|0||3|3|1|1|1|1|1|1|-1|0|0|5|2|0|0|0|3|0|0|1|3|0|2|0|0|2|0|0|1|0|1|1|0|0|1|0|4|0|1|1|11|20|1|0|1|1|0|0|1|4|0|1|1|41|2|51|60|0|1|0|0|1|5|1|0|0|71|0|0|1|71",
        "imsi_history": "460077203886213",
        "net_apn": "0",
    }
    res = requests.post(url, headers=self.headers, data=post_json)
    page_text = res.json()
    for one_video in page_text["secList"]:
        video_dic = {}
        try:
            one_video = one_video["newsList"][0]
            video_dic['title'] = one_video.get('title')
            video_dic['url'] = one_video.get("url")
            releaser_id = one_video.get('media_id')
            video_dic['releaser'] = one_video.get('chlname')
            video_dic['releaserUrl'] = "https://view.inews.qq.com/media/%s" % releaser_id
            release_time = int(one_video.get('timestamp'))
            video_dic['release_time'] = int(release_time * 1e3)
            video_dic['video_id'] = one_video.get('video_channel').get("video").get("vid")
            video_dic['duration'] = trans_duration(
                one_video.get('video_channel').get("video").get("duration"))
            video_dic['play_count'] = one_video.get('readCount')
            video_dic['repost_count'] = one_video.get('shareCount')
            video_dic['comment_count'] = one_video.get('comments')
            video_dic['favorite_count'] = one_video.get('likeInfo')
            video_dic['fetch_time'] = int(datetime.datetime.now().timestamp() * 1e3)
            video_dic['releaser_id_str'] = "腾讯新闻_%s" % releaser_id
            video_dic['video_img'] = one_video.get('miniProShareImage')
            video_dic['platform'] = self.platform
            video_dic["is_hot"] = 1
            video_dic["data_provider"] = "CCR"
        except Exception as e:
            print(e)
            continue
        data_list.append(video_dic)
    output_result(
        result_Lst=data_list,
        platform=self.platform,
        output_to_es_raw=True,
    )
    data_list.clear()
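
# search_page posts the keyword to the inews search API and writes whatever
# it parses straight to ES through output_result. Hedged usage sketch (the
# class name QQNewsSearch is illustrative):
#
#     crawler = QQNewsSearch()
#     crawler.search_page(title='example keyword')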
def get_page_list(self, data):
    headers = {
        "accept": "*/*",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh,zh-CN;q=0.9",
        "cookie": "__ysuid=1553755022290AhL; juid=01d9bjgm0l2ngc; cna=2U8aFb1yaVcCAdr3nSX3f47K; __aryft=1557994090; __artft=1557994090; UM_distinctid=16eea2ee55739c-08e02b1a26b73c-2393f61-161012-16eea2ee55881b; ykss=f0cdf05d77e5a6dcebeb4c1c; __ayft=1576549468599; __aysid=1577241653062aoh; __ayscnt=5; yseid=1577241727003hUSqrH; yseidcount=4; ycid=0; __arycid=dz-3-00; __arcms=dz-3-00; referhost=https%3A%2F%2Flist.youku.com; _m_h5_c=60b9e2b4228097503d3975caca016d24_1577269476232%3B6030e92d9f896f1b7024ac8e5df7c81a; P_ck_ctl=70E1D32F5B5E92006640274BCF8D7371; _m_h5_tk=92bfeed90e6fedabcac24cf2fbc211de_1577268775512; _m_h5_tk_enc=9c860e1c7cdd927ab133515c7922f98e; CNZZDATA1277955961=1269611647-1575885450-https%253A%252F%252Flist.youku.com%252F%7C1577259749; seid=01dsu50o381d4f; __arpvid=15772649493186fuodZ-1577264949336; __aypstp=141; __ayspstp=80; seidtimeout=1577266751889; ypvid=1577264954182j6rlMt; ysestep=32; yseidtimeout=1577272154186; ystep=41; __ayvstp=95; __aysvstp=95; isg=BE1Nn-BVLy9TNInF8eGG1rJNXGkHgr-WrpQ5v4_I4OQxhm44V3_MzRVQ9FJFRpm0",
        "referer": "https://v.youku.com/",
        "sec-fetch-mode": "no-cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    }
    page = 1
    while True:
        page_info_text = requests.get("http:" + data["url"], headers=headers).text
        vid = re.findall(r"videoId: '(\d*)'", page_info_text)[0]
        showid = re.findall(r"showid: '(\d*)'", page_info_text)[0]
        encode_id = data["id"]
        pm = re.findall(r"playmode: '(\d*)'", page_info_text)[0]
        cat_id = re.findall(r"catId: '(\d*)'", page_info_text)[0]
        componentid = re.findall(r'"componentId":(\d*)', page_info_text)[0]
        isSimple = re.findall(r"isSimple: '(.*)'", page_info_text)[0]
        parser_dic = {
            "l": "debug",
            "pm": pm,
            "vid": vid,
            "fid": "0",
            "showid": showid,
            "sid": "0",
            "componentid": componentid,
            "videoCategoryId": cat_id,
            "isSimple": isSimple,
            "videoEncodeId": encode_id,
            "page": page,
        }
        page_html = requests.get(
            "https://v.youku.com/page/playlist?%s" % urllib.parse.urlencode(parser_dic),
            headers=headers)
        page += 1
        page_json = page_html.json()
        # An exhausted playlist comes back as a bare newline.
        if page_json["html"] == "\n":
            break
        soup = BeautifulSoup(page_json["html"], 'lxml')
        dev_list = soup.find_all(attrs={"class": "item item-cover"})
        for dev in dev_list:
            video_title = dev.get("title")
            vid = dev.get("item-id")
            video_url = dev.a.get("href")
            dev_text = dev.text
            if "VIP" in dev_text:
                if_pay = "VIP"
            else:
                if_pay = ""
            play_count = re.findall(r"热度 (\d+)", dev_text)[0]
            try:
                duration = re.findall(r'(\d+:\d+:\d+)', dev_text)[0]
            except IndexError:
                duration = re.findall(r'(\d+:\d+)', dev_text)[0]
            dic = {
                "video_title": video_title,
                "duration": trans_duration(duration),
                "play_count": play_count,
                "if_pay": if_pay,
                "video_url": "https:" + video_url,
                "url": "https:" + data["url"],
                "vid": vid,
                "album": data["title"],
            }
            self.parse_single_data(dic, video_url)
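
# The duration fallback above tries 'H:MM:SS' first and only then 'MM:SS'.
# Illustrative regex behaviour on made-up item text:
#
#     re.findall(r'(\d+:\d+:\d+)', '热度 321 1:02:03')  # -> ['1:02:03']
#     re.findall(r'(\d+:\d+)', '热度 321 05:42')        # -> ['05:42']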
def parse_video_page_single_process(self,
                                    output_to_file=False,
                                    filepath=None,
                                    push_to_redis=False,
                                    output_to_es_raw=True,
                                    es_index="crawler-data-raw",
                                    doc_type="doc",
                                    output_to_es_register=False):
    key = 'iqiyi_video_page_html'
    result_list = []
    pid = os.getpid()
    while connect_with_redis.length_of_lst(key) > 0:
        video_page_html = connect_with_redis.retrieve_video_page_html_from_redis(
            platform=self.platform)
        soup = BeautifulSoup(video_page_html, 'html.parser')
        try:
            page_info = soup.find("div", {"is": "i71-play"})[":page-info"]
            page_info = page_info.replace("'", '"')
            page_dic = json.loads(page_info)
        except Exception:
            page_dic = None
        if page_dic is not None:
            title = page_dic["tvName"]
            url = page_dic["pageUrl"]
            dura_str = page_dic["duration"]
            duration = trans_duration(dura_str)
            try:
                releaser = page_dic["user"]["name"]
                releaserUrl = page_dic["user"]["profileUrl"]
            except Exception:
                releaser = None
                releaserUrl = None
        else:
            title = None
            url = None
            duration = None
            releaser = None
            releaserUrl = None
        try:
            video_info = soup.find("div", {"is": "i71-play"})[":video-info"]
            video_dic = json.loads(video_info)
        except Exception:
            video_dic = None
        if video_dic is not None:
            if title is None:
                title = video_dic['name']
            if url is None:
                url = video_dic['url']
            if releaser is None:
                try:
                    releaser = video_dic["user"]["name"]
                    releaserUrl = video_dic["user"]["profileUrl"]
                except Exception:
                    releaser = None
                    releaserUrl = None
            release_time = video_dic["firstPublishTime"]
            tvId = video_dic["tvId"]
            hot_idx_url = "https://pub.m.iqiyi.com/jp/h5/count/hotDisplay/?qipuId=%s" % tvId
            get_hot_idx = retry_get_url(hot_idx_url)
            hot_idx_str = get_hot_idx.text
            hot_idx = int(
                re.findall(r"\d+",
                           ' '.join(re.findall(r'"count":\d+', hot_idx_str)))[0])
            fetch_time = int(
                datetime.datetime.timestamp(datetime.datetime.now()) * 1e3)
            if releaser is None:
                try:
                    releaser = soup.find('span', {'class': 'intro-iterm__txt'}).text
                except Exception:
                    releaser = None
            video_page_dict = copy.deepcopy(self.video_data)
            video_page_dict["title"] = title
            video_page_dict["url"] = url
            video_page_dict["duration"] = duration
            video_page_dict["releaser"] = releaser
            video_page_dict["releaserUrl"] = releaserUrl
            video_page_dict["release_time"] = release_time
            video_page_dict["hot_idx"] = hot_idx
            video_page_dict["fetch_time"] = fetch_time
            video_page_dict["tvId"] = tvId
            result_list.append(video_page_dict)
            print("platform: %s, action: parse video page, process_id: %s, has done: %s"
                  % (self.platform, pid, len(result_list)))
            if len(result_list) >= 1000:
                output_result(result_Lst=result_list,
                              platform=self.platform,
                              output_to_file=output_to_file,
                              filepath=filepath,
                              push_to_redis=push_to_redis,
                              output_to_es_raw=output_to_es_raw,
                              es_index=es_index,
                              doc_type=doc_type,
                              output_to_es_register=output_to_es_register)
                result_list.clear()
    if result_list != []:
        output_result(result_Lst=result_list,
                      platform=self.platform,
                      output_to_file=output_to_file,
                      filepath=filepath,
                      push_to_redis=push_to_redis,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index,
                      doc_type=doc_type,
                      output_to_es_register=output_to_es_register)
        result_list.clear()
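
# connect_with_redis is a shared module; retrieve_video_page_html_from_redis
# presumably pops one cached page from the per-platform Redis list named
# '<platform>_video_page_html' (consistent with the key above). A minimal
# sketch under that assumption (hypothetical, not the project's actual
# helper):
def _retrieve_video_page_html_sketch(redis_client, platform):
    """Pop one raw video-page HTML string from the platform's Redis list."""
    raw = redis_client.lpop('%s_video_page_html' % platform)
    return raw.decode('utf-8') if isinstance(raw, bytes) else raw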