def get_video_attrs(html, removeLongResult=True):
    """
    Get video attributes from an HTML snippet describing one video.

    Args:
        html: markup to scan; the ``yt-lockup-*`` / ``video-time`` class
            names are used as anchors for the individual fields.
        removeLongResult: when true, a result for which
            ``extends_length(length, 20 * 60)`` is truthy is rejected
            (presumably "longer than 20 minutes" -- confirm against
            ``extends_length``).

    Returns:
        A dict with the keys ``id``, ``title``, ``length``, ``uploader``,
        ``time``, ``views``, ``thumb``, ``description``, ``get_url`` and
        ``stream_url``, or ``None`` when the id/title, length, uploader or
        time/views fields cannot be parsed, or the result is rejected by
        the length check.
    """
    # Patterns are raw strings: the same expressions as before, minus the
    # redundant backslashes that are invalid escapes in a plain literal.
    title_regex = (
        r'yt-lockup-title.*?href.*?watch\?v=(.*?[^"]+)'
        r'.*? title="(.*?[^"]+)'
    )
    length_regex = r'video-time.*?>([^<]+)'
    uploader_regex = r'yt-lockup-byline.*?>.*?>([^<]+)'
    meta_regex = r'yt-lockup-meta-info.*?>.*?>([^<]+).*?([0-9,]+)'
    desc_regex = r'yt-lockup-description.*?>(.*?)<'

    # video id and title -- nothing else is usable without them
    title_match = re.search(title_regex, html)
    if title_match is None:
        return None
    video_id, title = title_match.groups()
    # Only byte strings need decoding; already-decoded text is passed through
    # unchanged (calling .decode on it either fails or round-trips via ASCII).
    if isinstance(title, bytes):
        title = title.decode('utf-8')

    # A missing mandatory field usually means the markup could not be parsed.
    length_match = re.search(length_regex, html)
    uploader_match = re.search(uploader_regex, html)
    meta_match = re.search(meta_regex, html)
    if length_match is None or uploader_match is None or meta_match is None:
        return None

    # The description is optional and defaults to an empty string.
    desc_match = re.search(desc_regex, html)

    result = {
        'id': video_id,
        'title': html_unescape(title),
        'length': length_match.group(1).strip(),
        'uploader': uploader_match.group(1).strip(),
        'time': meta_match.group(1),
        'views': meta_match.group(2),
        'thumb': 'http://img.youtube.com/vi/%s/0.jpg' % video_id,
        'description': desc_match.group(1) if desc_match else '',
    }

    # check length
    if removeLongResult and extends_length(result['length'], 20 * 60):
        return None

    result['get_url'] = '/g?url=' + encode_data(
        get_key(),
        id=result['id'],
        title=result['title'],
        length=result['length']
    )
    # The stream URL is the same URL with the endpoint swapped.
    result['stream_url'] = result['get_url'].replace('/g?', '/stream?', 1)
    return result
def get_suggestions(vid_id, get_url_prefix='/api/v1'):
    """
    Get the videos suggested in the sidebar of a video's watch page.

    NOTE(review): this module defines ``get_suggestions`` a second time
    further down; the later definition is the one left bound to the name.

    Args:
        vid_id: video id, appended to the watch-page URL that is fetched
            with ``open_page``.
        get_url_prefix: prefix put in front of the generated ``/g?url=...``
            URL.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``length``,
        ``uploader``, ``thumb``, ``get_url``, ``stream_url``, ``views`` and
        ``suggest_url``. Playlist entries and entries for which
        ``extends_length(duration, 20 * 60)`` is truthy are left out.
    """
    url = "https://www.youtube.com/watch?v=" + vid_id
    raw_html = open_page(url)

    # Narrow the page down to the sidebar section(s) before extracting
    # the individual entries.
    area_of_concern_regex = r'<div class=\"watch-sidebar-section\"(.*?)<div id=\"watch7-hidden-extras\"'
    area_of_concern = ' '.join(
        re.findall(area_of_concern_regex, raw_html, re.DOTALL))

    # Groups: id, title, duration, uploader, views.
    videos_html_regex = (
        r'class=\"video-list-item.*?a href=\"/watch\?v=(.*?)\" class.*? class=\"title.*?>(.*?)</span>'
        r'.*?Duration: (.*?)\..*?<span class=\"g-hovercard.*?>(.*?)</span>.*?view-count\">(.*?) '
        r'views.*?<li '
    )
    videos_html = re.findall(videos_html_regex, area_of_concern, re.DOTALL)

    ret_list = []
    for _id, raw_title, duration, uploader, views in videos_html:
        # Skip playlist links.
        if '&list=' in _id:
            continue
        # Filter before building any URLs so rejected entries cost nothing.
        if extends_length(duration, 20 * 60):
            continue

        title = raw_title.strip('\n\t ')
        get_url = get_url_prefix + '/g?url=' + encode_data(
            get_key(), id=_id, title=title, length=duration)
        stream_url = get_url.replace('/g?', '/stream?', 1)
        suggest_url = get_url.replace('/g?', '/suggest?', 1)

        # Only byte strings need decoding; decoded text is used as it is.
        display_title = title.decode('utf-8') if isinstance(title, bytes) else title

        ret_list.append({
            "id": _id,
            "title": html_unescape(display_title),
            "length": duration,
            "uploader": uploader,
            "thumb": 'http://img.youtube.com/vi/%s/0.jpg' % _id,
            "get_url": get_url,
            "stream_url": stream_url,
            "views": views,
            "suggest_url": suggest_url
        })
    return ret_list
def get_trending_videos(html):
    """
    Get trending youtube videos from html

    Args:
        html: markup containing one ``<tr ... data-video-id="...">`` row per
            video.

    Returns:
        A list of dicts with the keys ``id``, ``thumb``, ``title``,
        ``uploader``, ``length``, ``views``, ``get_url`` and
        ``description``. At most ``PLAYLIST_VIDEOS_LIMIT`` rows (environment
        variable, default 100) are processed. A row whose processing raises
        is logged and skipped, so the list may be shorter than the number of
        rows.
    """
    # Groups: video id, thumbnail src (unused), title, uploader, length.
    regex = r'<tr.*?data-video-id="(.*?)".*?src="(.*?)".*?<a cl.*?>(.*?)</a>.*?by.*?>(.*?)</a>.*?<span .*?>(.*?)</'
    limit = int(environ.get('PLAYLIST_VIDEOS_LIMIT', 100))
    raw_results = re.findall(regex, html, re.DOTALL)[:limit]

    def as_text(value):
        # Only byte strings need decoding; decoded text is returned as it is.
        return value.decode('utf-8') if isinstance(value, bytes) else value

    vids = []
    for video_id, _thumb_src, raw_title, uploader, length in raw_results:
        title = raw_title.strip()
        try:
            # Views and description are taken from the video's own page,
            # kept in a separate name so the `html` argument is not clobbered.
            url = 'https://www.youtube.com/watch?v=' + video_id
            page_html = open_page(url)
            vids.append({
                'id': video_id,
                'thumb': 'https://img.youtube.com/vi/{0}/0.jpg'.format(video_id),
                'title': html_unescape(as_text(title)),
                'uploader': as_text(uploader),
                'length': length,
                'views': get_views(page_html),
                'get_url': encode_data(
                    get_key(), id=video_id, title=title, length=length),
                'description': html_unescape(get_description(page_html))
            })
        except Exception as e:
            # Best effort: one bad video must not abort the whole listing.
            logger.info(
                'Getting trending video failed. Message: %s, Video: %s',
                e, video_id)
    return vids
def get_suggestions(vid_id, get_url_prefix='/api/v1'):
    """
    Get the videos suggested on a video's watch page.

    Args:
        vid_id: video id, appended to the watch-page URL that is fetched
            with ``open_page``.
        get_url_prefix: prefix put in front of the generated ``/g?url=...``
            URL.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``length``,
        ``uploader``, ``thumb``, ``get_url``, ``stream_url``, ``views`` and
        ``suggest_url``. Playlist entries are left out; an entry whose
        parsing raises is reported on stdout with its traceback and skipped.
    """
    url = "https://www.youtube.com/watch?v=" + vid_id
    raw_html = open_page(url)

    # A compiled pattern's findall() takes (string, pos, endpos) -- there is
    # no flags parameter. Passing re.DOTALL (== 16) here used to be read as
    # `pos`, silently skipping the first 16 characters of the input.
    # NOTE(review): if DOTALL matching is required it has to be given to
    # re.compile() where these patterns are defined -- not visible here.
    area_of_concern = ' '.join(area_of_concern_regex.findall(raw_html))
    videos_html = videos_html_regex.findall(area_of_concern)

    ret_list = []
    for video in videos_html:
        try:
            # Each lookup raises IndexError when its pattern finds nothing;
            # that is handled below like any other parse failure.
            _id = single_video_regex['id'].findall(video)[0]
            if '&list=' in _id:
                # Skip playlist links.
                continue
            title = single_video_regex['title'].findall(video)[0]
            duration = single_video_regex['duration'].findall(video)[0]
            uploader = single_video_regex['uploader'].findall(video)[0]
            views = single_video_regex['views'].findall(video)[0]

            get_url = get_url_prefix + '/g?url=' + encode_data(
                get_key(), id=_id, title=title, length=duration)
            stream_url = get_url.replace('/g?', '/stream?', 1)
            suggest_url = get_url.replace('/g?', '/suggest?', 1)

            # Only byte strings need decoding; decoded text is used as it is.
            display_title = (
                title.decode('utf-8') if isinstance(title, bytes) else title)

            ret_list.append({
                "id": _id,
                "title": html_unescape(display_title),
                "length": duration,
                "uploader": uploader,
                "thumb": 'https://img.youtube.com/vi/%s/0.jpg' % _id,
                "get_url": get_url,
                "stream_url": stream_url,
                "views": views,
                "suggest_url": suggest_url
            })
        except Exception:
            # Best effort: report the offending entry and carry on.
            print('Error while getting suggestion at video \n' + video)
            traceback.print_exc()
    return ret_list