예제 #1
0
def get_video_attrs(html, removeLongResult=True):
    """
    Get video attributes from an HTML fragment describing one video.

    The id, title, length, uploader, upload time, view count and description
    are pulled out of the fragment with regular expressions; the thumbnail
    URL is built from the id.

    :param html: HTML fragment to parse.
    :param removeLongResult: when true, return ``None`` for a video for which
        ``extends_length(length, 20 * 60)`` is true (presumably videos longer
        than 20 minutes -- confirm against ``extends_length``).
    :return: dict with the keys ``id``, ``title``, ``length``, ``uploader``,
        ``time``, ``views``, ``thumb``, ``description``, ``get_url`` and
        ``stream_url``, or ``None`` when an attribute could not be parsed or
        the video was filtered out.
    """
    result = {}

    # video id and title; raw strings keep the regex escapes literal
    regex = r'yt\-lockup\-title.*?href.*?watch\?v\=(.*?[^"]+)'
    regex += r'.*? title\="(.*?[^"]+)'
    temp = re.findall(regex, html)
    if not temp or len(temp[0]) != 2:
        # every later step needs the id, so stop right away
        return None
    video_id, title = temp[0]
    # only byte strings need decoding; text is passed through untouched
    if isinstance(title, bytes):
        title = title.decode('utf-8')
    result['id'] = video_id
    result['title'] = html_unescape(title)

    # length
    temp = re.findall(r'video\-time.*?\>([^\<]+)', html)
    if temp:
        result['length'] = temp[0].strip()

    # uploader
    temp = re.findall(r'yt\-lockup\-byline.*?\>.*?\>([^\<]+)', html)
    if temp:
        result['uploader'] = temp[0].strip()

    # time ago and view count
    time_regex = r'yt\-lockup\-meta\-info.*?\>.*?\>([^\<]+).*?([0-9\,]+)'
    temp = re.findall(time_regex, html)
    if temp and len(temp[0]) == 2:
        result['time'], result['views'] = temp[0]

    # thumbnail
    result['thumb'] = 'http://img.youtube.com/vi/%s/0.jpg' % video_id

    # description is optional and defaults to an empty string
    temp = re.findall('yt-lockup-description.*?>(.*?)<', html)
    result['description'] = temp[0] if temp else ''

    # all eight attributes must be present; a missing one usually means
    # that the parsing went wrong
    if len(result) != 8:
        return None

    # check length
    if removeLongResult and extends_length(result['length'], 20 * 60):
        return None

    result['get_url'] = '/g?url=' + encode_data(
        get_key(), id=result['id'],
        title=result['title'], length=result['length']
    )
    result['stream_url'] = result['get_url'].replace('/g?', '/stream?', 1)
    return result
예제 #2
0
def get_suggestions(vid_id, get_url_prefix='/api/v1'):
    """
    Return the suggested videos scraped from a video's watch page.

    The page ``https://www.youtube.com/watch?v=<vid_id>`` is fetched through
    ``open_page``, the sidebar section is isolated and one entry is extracted
    per listed video. Entries whose id contains ``&amp;list=`` and entries for
    which ``extends_length(duration, 20 * 60)`` is true are skipped.

    :param vid_id: id of the video whose suggestions are wanted.
    :param get_url_prefix: prefix put in front of the generated URLs.
    :return: list of dicts with the keys ``id``, ``title``, ``length``,
        ``uploader``, ``thumb``, ``get_url``, ``stream_url``, ``views`` and
        ``suggest_url``.
    """
    url = "https://www.youtube.com/watch?v=" + vid_id
    raw_html = open_page(url)

    area_of_concern_regex = r'<div class=\"watch-sidebar-section\"(.*?)<div id=\"watch7-hidden-extras\"'
    area_of_concern = ' '.join(re.findall(area_of_concern_regex, raw_html, re.DOTALL))

    # groups: id, title, duration, uploader, views
    videos_html_regex = (
        r'class=\"video-list-item.*?a href=\"/watch\?v=(.*?)\" class.*? class=\"title.*?>(.*?)</span>'
        r'.*?Duration: (.*?)\..*?<span class=\"g-hovercard.*?>(.*?)</span>.*?view-count\">(.*?) '
        r'views.*?<li '
    )
    videos_html = re.findall(videos_html_regex, area_of_concern, re.DOTALL)

    ret_list = []
    for _id, title, duration, uploader, views in videos_html:
        if '&amp;list=' in _id:
            continue
        # filter before encoding so no URL is built for a dropped video
        if extends_length(duration, 20 * 60):
            continue

        title = title.strip('\n\t ')
        get_url = get_url_prefix + '/g?url=' + encode_data(
            get_key(), id=_id, title=title, length=duration)
        stream_url = get_url.replace('/g?', '/stream?', 1)
        suggest_url = get_url.replace('/g?', '/suggest?', 1)

        # only byte strings need decoding; text is passed through untouched
        display_title = title
        if isinstance(display_title, bytes):
            display_title = display_title.decode('utf-8')

        ret_list.append(
            {
                "id": _id,
                "title": html_unescape(display_title),
                "length": duration,
                "uploader": uploader,
                "thumb": 'http://img.youtube.com/vi/%s/0.jpg' % _id,
                "get_url": get_url,
                "stream_url": stream_url,
                "views": views,
                "suggest_url": suggest_url
            }
        )

    return ret_list
예제 #3
0
def get_trending_videos(html):
    """
    Get trending youtube videos from html.

    Every table row matched in ``html`` yields one video. For each of them the
    watch page is fetched through ``open_page`` to obtain the view count and
    the description. A video whose processing raises is logged and skipped.

    The number of rows taken into account is capped by the
    ``PLAYLIST_VIDEOS_LIMIT`` environment variable (default 100).

    :param html: HTML of the page listing the videos.
    :return: list of dicts with the keys ``id``, ``thumb``, ``title``,
        ``uploader``, ``length``, ``views``, ``get_url`` and ``description``.
    """
    # groups: video id, image src (unused), title, uploader, length
    regex = '<tr.*?data-video-id="(.*?)".*?src="(.*?)".*?<a cl.*?>(.*?)</a>.*?by.*?>(.*?)</a>.*?<span .*?>(.*?)</'

    limit = int(environ.get('PLAYLIST_VIDEOS_LIMIT', 100))
    raw_results = re.findall(regex, html, re.DOTALL)[:limit]

    vids = []
    for raw_result in raw_results:
        vid_id = raw_result[0]
        try:
            # separate name: the ``html`` argument is not overwritten
            video_page = open_page('https://www.youtube.com/watch?v=' + vid_id)

            raw_title = raw_result[2].strip()
            # only byte strings need decoding; text is passed through untouched
            title = raw_title
            if isinstance(title, bytes):
                title = title.decode('utf-8')
            uploader = raw_result[3]
            if isinstance(uploader, bytes):
                uploader = uploader.decode('utf8')
            length = raw_result[4]

            vids.append({
                'id': vid_id,
                'thumb': 'https://img.youtube.com/vi/{0}/0.jpg'.format(vid_id),
                'title': html_unescape(title),
                'uploader': uploader,
                'length': length,
                'views': get_views(video_page),
                'get_url': encode_data(get_key(),
                                       id=vid_id,
                                       title=raw_title,
                                       length=length),
                'description': html_unescape(get_description(video_page))
            })
        except Exception as e:
            # best effort: one broken video must not lose the whole list
            logger.info(
                'Getting trending video failed. Message: %s, Video: %s',
                e, vid_id)
    return vids
예제 #4
0
def get_suggestions(vid_id, get_url_prefix='/api/v1'):
    """
    Return the suggested videos scraped from a video's watch page.

    The page ``https://www.youtube.com/watch?v=<vid_id>`` is fetched through
    ``open_page``. ``area_of_concern_regex`` narrows it down,
    ``videos_html_regex`` splits the result into one chunk per video and the
    patterns in ``single_video_regex`` read the individual fields. These
    patterns are defined outside this function and are used through their
    ``findall`` method, i.e. as compiled patterns.

    A chunk that cannot be parsed is reported on stdout together with the
    traceback and skipped. Entries whose id contains ``&amp;list=`` are
    skipped silently.

    :param vid_id: id of the video whose suggestions are wanted.
    :param get_url_prefix: prefix put in front of the generated URLs.
    :return: list of dicts with the keys ``id``, ``title``, ``length``,
        ``uploader``, ``thumb``, ``get_url``, ``stream_url``, ``views`` and
        ``suggest_url``.
    """
    url = "https://www.youtube.com/watch?v=" + vid_id
    raw_html = open_page(url)

    # A compiled pattern's findall() takes (string, pos, endpos): a flag given
    # as second argument is read as a start offset. Flags belong to the
    # re.compile() call, so nothing but the string is passed here.
    area_of_concern = ' '.join(area_of_concern_regex.findall(raw_html))

    videos_html = videos_html_regex.findall(area_of_concern)

    ret_list = []
    for video in videos_html:
        try:
            _id = single_video_regex['id'].findall(video)[0]
            if '&amp;list=' in _id:
                continue
            title = single_video_regex['title'].findall(video)[0]
            duration = single_video_regex['duration'].findall(video)[0]
            uploader = single_video_regex['uploader'].findall(video)[0]
            views = single_video_regex['views'].findall(video)[0]
            get_url = get_url_prefix + '/g?url=' + encode_data(
                get_key(), id=_id, title=title, length=duration)
            stream_url = get_url.replace('/g?', '/stream?', 1)
            suggest_url = get_url.replace('/g?', '/suggest?', 1)

            # only byte strings need decoding; text is passed through untouched
            display_title = title
            if isinstance(display_title, bytes):
                display_title = display_title.decode('utf-8')

            ret_list.append({
                "id": _id,
                "title": html_unescape(display_title),
                "length": duration,
                "uploader": uploader,
                "thumb": 'https://img.youtube.com/vi/%s/0.jpg' % _id,
                "get_url": get_url,
                "stream_url": stream_url,
                "views": views,
                "suggest_url": suggest_url
            })
        except Exception:
            # best effort: report the broken chunk and carry on with the rest
            print('Error while getting suggestion at video \n' + video)
            traceback.print_exc()

    return ret_list