def process_one_video(self, line):
    """Extract one video's metadata from a single search-result row.

    Args:
        line: BeautifulSoup Tag for one result row (assumes the row
            contains an ``<a target="video">`` plus ``v-num`` /
            ``v-publishtime`` / ``v-time`` spans -- TODO confirm against
            the calling page parser).

    Returns:
        dict: a deep copy of ``self.video_data`` with ``title``, ``url``,
        ``play_count``, ``release_time``, ``duration`` and ``fetch_time``
        filled in; missing nodes fall back to ``None`` / ``0``.
    """
    video_info = copy.deepcopy(self.video_data)

    # Each field is scraped independently so one missing node does not
    # abort the whole record.  Only the errors a missing/odd tag can
    # raise are caught (narrowed from the original bare excepts).
    _scrape_errors = (AttributeError, TypeError, KeyError, ValueError)

    try:
        video_info['title'] = line.find('a', {'target': 'video'})['title']
    except _scrape_errors:
        video_info['title'] = None

    try:
        url = line.find('a', {'target': 'video'})['href']
        # hrefs on the page are protocol-relative
        video_info['url'] = 'https:' + url
    except _scrape_errors:
        video_info['url'] = None

    try:
        play_count_str = line.find('span', {'class': 'v-num'}).text
        video_info['play_count'] = trans_play_count(play_count_str)
    except _scrape_errors:
        video_info['play_count'] = 0

    try:
        release_time_str = line.find('span', {'class': 'v-publishtime'}).text
        video_info['release_time'] = trans_strtime_to_timestamp(
            input_time=release_time_str, missing_year=True)
    except _scrape_errors:
        # BUG FIX: the original assigned the fallback to the local
        # variable ``release_time_str`` instead of the output dict, so
        # ``video_info['release_time']`` was never set on failure.
        video_info['release_time'] = 0

    try:
        dura_str = line.find('span', {'class': 'v-time'}).text
        video_info['duration'] = trans_duration(dura_str)
    except _scrape_errors:
        video_info['duration'] = 0

    # fetch timestamp in milliseconds
    video_info['fetch_time'] = int(time.time() * 1e3)
    return video_info
def sogou_info_page(keyword):
    """Scrape the first 10 result pages of Sogou News for *keyword*.

    Args:
        keyword (str): the search query.

    Returns:
        list[dict]: one dict per news item with title, url, source,
        release_time (ms timestamp), fetch_time (ms timestamp), content,
        similar_news and the originating keyword; items that fail to
        parse are skipped with a diagnostic message.
    """
    result_lst = []
    for page_num in range(1, 11):
        search_url = ('http://news.sogou.com/news?&query=' + keyword
                      + '&page=' + str(page_num))
        get_page = requests.get(search_url, headers=headers)
        soup = BeautifulSoup(get_page.text, 'html.parser')
        news_lst = soup.find_all('div', {'class': 'vrwrap'})
        for line in news_lst:
            try:
                title = line.div.h3.a.text
                url = line.div.h3.a['href']
                source_and_release_time = line.find(
                    'p', {'class': 'news-from'}).text
                # the field is "source<nbsp>...<nbsp>time"; split on the
                # non-breaking-space separator
                parts = source_and_release_time.split('\xa0')
                source = parts[0]
                release_time = trans_strtime_to_timestamp(parts[-1])
                try:
                    content = line.find('span').text
                except AttributeError:
                    print('no content at %s' % title)
                    content = 'missing'
                fetch_time = int(time.time() * 1000)
                try:
                    similar_news = line.find('a', {'id': 'news_similar'}).text
                except AttributeError:
                    print('no similar news at %s' % title)
                    similar_news = 'missing'
                result_lst.append({
                    'title': title,
                    'url': url,
                    'source': source,
                    'release_time': release_time,
                    'fetch_time': fetch_time,
                    'content': content,
                    'similar_news': similar_news,
                    'keyword': keyword
                })
                print('get data at page %s' % page_num)
            except Exception:
                # BUG FIX: the original built this message as a bare
                # expression and never printed it (missing print call).
                print('the error occured at position %s'
                      % news_lst.index(line))
    return result_lst
def video_page(self, url, output_to_file=False, filepath=None,
               releaser_page_num_max=30, output_to_es_raw=False,
               es_index=None, doc_type=None, output_to_es_register=False,
               push_to_redis=False, *args, **kwargs):
    """Fetch video info from the NetEase news app API instead of HTML.

    Pages through the recommendation feed (the '视频' channel).  The API
    only accepts offsets up to about 1000, so the offset wraps around.
    Collected dicts are flushed through ``output_result`` in batches of
    100, and any remainder is flushed at the end.

    Returns:
        list: the (cleared) result buffer -- normally empty, since every
        batch is flushed and the list is cleared afterwards.
    """
    releaser = ""
    count = 1
    result_list = []
    size_num = 0
    while count < releaser_page_num_max:
        # offsets beyond ~1000 are rejected by the API -- wrap around
        if size_num > 1000:
            size_num = 0
        size_num += 20
        count += 1
        url_dic = {
            'channel': 'T1457068979049',
            'subtab': 'Video_Recom',
            'size': "10",
            'offset': size_num,
            'fn': '3',
            'devId': 'sklfRdL61S9GUQ4M7DSzdvA6U6LFEZr0pAEonUVTJrYHNFmgkLuyUgNU6zUV7MVx',
            'version': '33.2.1',
            'net': 'wifi',
            'ts': '1557126556',
            'sign': 'YTk73p++NeCfCJRpZkThWxGYX0gVcFWjUVLCRIRwftV48ErR02zJ6/KXOnxX046I',
            'encryption': '1',
            'canal': 'lite_wifi_cpa10',
            'mac': 'racUMC0A9havm+He6jH3YAvVdjgSXYDtwEDZ03eH1l8='
        }
        releaserUrl = ('https://c.m.163.com/recommend/getChanListNews?%s'
                       % urllib.parse.urlencode(url_dic))
        print(releaserUrl)
        get_page = requests.get(releaserUrl, headers=self.headers)
        page_dic = get_page.json()
        data_list = page_dic.get("视频")
        # BUG FIX: the original tested `data_list == []`, which misses
        # the None returned when the key is absent and would then crash
        # the for-loop below with a TypeError.
        if not data_list:
            print("no more data at releaser: %s page: %s " % (releaser, count))
            continue
        print("get data at page: %s" % (count))
        for info_dic in data_list:
            skipID = info_dic.get("vid")
            video_dic = copy.deepcopy(self.video_data)
            video_dic['title'] = info_dic.get('title')
            video_dic['url'] = "https://c.m.163.com/news/v/%s.html" % skipID
            video_dic['releaser'] = info_dic.get('topicName')
            video_dic['releaserUrl'] = (
                "https://c.m.163.com/news/sub/%s.html"
                % info_dic.get("videoTopic").get("tid"))
            video_dic['releaser_id_str'] = (
                "网易新闻_%s" % self.get_releaser_id(video_dic['releaserUrl']))
            # ptime is usually an epoch value, but sometimes a date string
            try:
                video_dic['release_time'] = int(info_dic.get('ptime'))
            except (TypeError, ValueError):
                video_dic['release_time'] = trans_strtime_to_timestamp(
                    info_dic.get('ptime'))
            # normalize missing counters to 0 (same effect as the
            # original's post-assignment `if not ...` fixups)
            video_dic['play_count'] = info_dic.get("playCount") or 0
            video_dic['comment_count'] = info_dic.get('replyCount')
            video_dic['favorite_count'] = info_dic.get('voteCount') or 0
            video_dic['video_id'] = skipID
            video_dic['fetch_time'] = int(time.time() * 1e3)
            video_dic['duration'] = info_dic.get("length")
            video_dic['video_img'] = self.get_video_image(info_dic)
            result_list.append(video_dic)
            # flush in batches so memory stays bounded
            if len(result_list) >= 100:
                output_result(result_Lst=result_list,
                              platform=self.platform,
                              output_to_file=output_to_file,
                              filepath=filepath,
                              output_to_es_raw=output_to_es_raw,
                              es_index=es_index,
                              doc_type=doc_type,
                              output_to_es_register=output_to_es_register)
                result_list.clear()
    # flush whatever is left after the last page
    if result_list != []:
        output_result(result_Lst=result_list,
                      platform=self.platform,
                      output_to_file=output_to_file,
                      filepath=filepath,
                      output_to_es_raw=output_to_es_raw,
                      es_index=es_index,
                      doc_type=doc_type,
                      output_to_es_register=output_to_es_register)
        result_list.clear()
    return result_list
def releaser_page_app(self, releaserUrl, output_to_file=False, filepath=None,
                      releaser_page_num_max=4000, output_to_es_raw=False,
                      es_index=None, doc_type=None,
                      output_to_es_register=False, push_to_redis=False,
                      proxies_num=None):
    """Yield video dicts for one Tudou releaser via the subscribe API
    instead of the web page HTML.

    Pages through ``apis.tudou.com/subscribe/v1/video`` and stops after
    ``releaser_page_num_max`` pages or 5 consecutive empty/failed
    responses.

    Yields:
        dict: one populated copy of ``self.video_data`` per video.
    """
    headers = {
        'Host': 'apis.tudou.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Cookie': ('isg=BIeH6gcJlwZw_xQESm9jlG-vFTuRJGXxikf0g1l0mJY9yKeKYVuAvzKJbkgzOzPm;'
                   'cna=XA2EFIGslWoCAWp4y3KXcZh7; ykss=cdbd115c102a68710215ad93;'
                   '__ysuid=1543316262167mjE; P_ck_ctl=62DE1D55DFE1C0F4F27A8662E6575F08;'
                   '__ayvstp=32'),
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0'
    }
    count = 1
    retry_time = 0
    releaser_id = self.get_releaser_id(releaserUrl)
    releaser = self.get_releaser_name(releaserUrl)
    releaserUrl = 'https://id.tudou.com/i/%s/videos' % releaser_id
    self.video_data['releaser'] = releaser
    self.video_data['releaserUrl'] = releaserUrl
    url_dic = {"uid": releaser_id, "pL": "20"}
    print("working on releaser: %s releaser_id: %s" % (releaser, releaser_id))
    while count <= releaser_page_num_max and retry_time < 5:
        proxies = get_proxy(proxies_num)
        url_dic['pg'] = str(count)
        url_dic['pn'] = str(count)
        api_url = ('http://apis.tudou.com/subscribe/v1/video?%s'
                   % urllib.parse.urlencode(url_dic))
        if proxies:
            get_page = requests.get(api_url, headers=headers,
                                    proxies=proxies, timeout=5)
        else:
            get_page = requests.get(api_url, headers=headers, timeout=5)
        page_dic = get_page.json()
        try:
            data_list = page_dic['entity']
        except KeyError:
            # narrowed from a bare except; the original also reassigned
            # ``proxies`` here, which was dead code (overwritten at the
            # top of the loop before use)
            retry_time += 1
            time.sleep(0.25)
            print("no more data at releaser: %s page: %s try_time: %s"
                  % (releaser, count, retry_time))
            continue
        if not data_list:
            retry_time += 1
            time.sleep(0.25)
            print("no more data at releaser: %s page: %s try_time: %s"
                  % (releaser, count, retry_time))
            continue
        retry_time = 0
        print("get data at releaser: %s page: %s" % (releaser, count))
        count += 1
        for info_dic in data_list:
            video_info = copy.deepcopy(self.video_data)
            one_video = info_dic.get('detail')
            if one_video is None:
                continue
            base_detail = one_video.get('base_detail')
            if base_detail is not None:
                video_info['title'] = base_detail.get('title')
            detail_info = one_video.get('video_detail')
            if detail_info is None:
                continue
            video_id = detail_info.get('video_id')
            if video_id is None:
                continue
            video_info['video_id'] = video_id
            video_info['url'] = 'https://video.tudou.com/v/%s.html' % video_id
            video_info['duration'] = detail_info.get('duration')
            video_info['releaser_id_str'] = "new_tudou_%s" % (releaser_id)
            # ROBUSTNESS: ``or 0`` so a missing counter no longer raises
            # TypeError inside int()
            video_info['comment_count'] = int(
                detail_info.get('comment_count') or 0)
            # favorite_count in the database means 点赞数 (likes), which the
            # API calls praiseNumber; the API's own favorite_count means
            # 收藏数 (bookmarks) and is stored as shoucang_count
            video_info['favorite_count'] = int(
                detail_info.get('praiseNumber') or 0)
            video_info['shoucang_count'] = detail_info.get('favorite_count')
            video_info['play_count'] = detail_info.get('vv_count')
            video_info['video_img'] = self.get_video_image(detail_info)
            release_time_str = detail_info.get('publish_time')
            print(release_time_str)
            # relative dates ("N天前") carry no absolute timestamp, so
            # fall back to scraping the video's own page
            if '天前' in release_time_str:
                video_info['release_time'] = self.video_page(
                    video_info['url'])['release_time']
            else:
                video_info['release_time'] = trans_strtime_to_timestamp(
                    input_time=release_time_str, missing_year=True)
            video_info['fetch_time'] = int(time.time() * 1e3)
            yield video_info
def releaser_page_web(self, releaserUrl, output_to_file=False, filepath=None,
                      releaser_page_num_max=30, output_to_es_raw=False,
                      output_to_es_register=False, push_to_redis=False,
                      es_index=None, doc_type=None, fetchFavoriteCommnt=True):
    """Yield video dicts for one Haokan (haokan.baidu.com) releaser.

    Page 0 is parsed from the releaser's HTML page via
    ``self.web_first_pag`` (which also returns the follower count);
    later pages use the cursor-style ``ctime`` from the JSON API.
    Stops after ``releaser_page_num_max`` pages, when the API reports
    no more data, after 5 consecutive empty pages, or after 10
    consecutive request failures.

    Yields:
        dict: one populated copy of ``self.video_data_template`` per video.
    """
    pid = os.getpid()
    releaser_id = self.get_releaser_id(releaserUrl)
    print('releaser_id is %s' % releaser_id)
    page_num = 0
    has_more = True
    ctime = ""
    count_false = 0
    request_fail = 0
    # follower count is filled by the page-0 parse; default keeps the
    # int() below safe even if that never happens
    fans_num = 0
    proxies = get_proxy_dic()
    while page_num <= releaser_page_num_max and has_more:
        post_url = 'https://haokan.baidu.com/haokan/wiseauthor?app_id={0}&_api=1&_skip={1}&ctime={2}&_limit=10&video_type=media&sort_type=sort_by_time'.format(
            releaser_id, page_num, ctime)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
            "referer": "https://haokan.baidu.com/haokan/wiseauthor?app_id=1564003728536358",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "accept": "*/*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh,zh-CN;q=0.9",
            "content-type": "application/x-www-form-urlencoded"
        }
        try:
            if page_num == 0:
                # first page comes from the HTML page; retry up to 5
                # times until it actually contains results
                for loop in range(5):
                    get_page = requests.get(releaserUrl, headers=headers,
                                            timeout=3, proxies=proxies)
                    page_dic, fans_num = self.web_first_pag(get_page.text)
                    if page_dic['apiData']['video']['results']:
                        page_num += 1
                        break
            else:
                get_page = requests.get(post_url, headers=headers, timeout=3)
                page_dic = get_page.json()
                page_num += 1
            request_fail = 0
        except Exception:
            # BUG FIX: the original bare `except: continue` never
            # advanced page_num, so a persistently failing request
            # retried forever; bound the retries.
            request_fail += 1
            if request_fail >= 10:
                break
            continue
        try:
            info_lst = page_dic['apiData']['video']['results']
        except (KeyError, TypeError):
            info_lst = []
        try:
            ctime = page_dic['apiData']['video']['ctime']
            has_more = page_dic['apiData']['video']['has_more']
        except (KeyError, TypeError):
            has_more = False
        if info_lst != []:
            count_false = 0
            print("Process %s is processing %s at page %s"
                  % (pid, releaser_id, page_num))
            # polite crawl delay
            time.sleep(int(random.uniform(1, 2)))
            for line in info_lst:
                video_data = copy.deepcopy(self.video_data_template)
                content = line['content']
                video_data['title'] = content['title']
                video_id = content['vid']
                video_data['video_id'] = video_id
                video_data['url'] = content["video_short_url"]
                video_data['play_count'] = content['playcnt']
                video_data['favorite_count'] = int(content['praiseNum'])
                try:
                    video_data['comment_count'] = int(content['commentNum'])
                except (KeyError, TypeError, ValueError):
                    video_data['comment_count'] = 0
                video_data['releaser_followers_count'] = int(fans_num)
                try:
                    video_data['duration'] = trans_duration(
                        content['duration'])
                except Exception:
                    video_data['duration'] = 0
                video_data['releaser'] = content['author']
                video_data['releaser_id_str'] = "haokan_%s" % (
                    content['authorid'])
                video_data['releaserUrl'] = (
                    'https://haokan.baidu.com/haokan/wiseauthor?app_id='
                    + content['authorid'])
                video_data['fetch_time'] = int(time.time() * 1e3)
                releaser_time_str = content['publish_time']
                video_data['release_time'] = trans_strtime_to_timestamp(
                    input_time=releaser_time_str)
                print(video_id, releaser_time_str,
                      datetime.datetime.fromtimestamp(
                          video_data['release_time'] / 1000),
                      page_num)
                yield video_data
        else:
            count_false += 1
            if count_false < 5:
                continue
            else:
                break