Example #1
def get_videos(url, cw=None):
    print_ = get_print(cw)
    info = {}
    user_id = re.find(r'twitch.tv/([^/?]+)', url, err='no user_id')
    print(user_id)
    session = Session()
    r = session.get(url)
    # pull the GQL request headers that are embedded in the page's JavaScript
    s = cut_pair(re.find(r'headers *: *({.*)', r.text, err='no headers'))
    print(s)
    headers = json_loads(s)

    payload = [{
        'operationName': 'ClipsCards__User',
        'variables': {
            'login': user_id,
            'limit': 20,
            'criteria': {
                'filter': 'ALL_TIME'
            }
        },
        'extensions': {
            'persistedQuery': {
                'version': 1,
                'sha256Hash': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777'
            }
        },
    }]
    videos = []
    cursor = None
    cursor_new = None
    # paginate with the GraphQL cursor until it stops advancing
    while True:
        if cursor:
            payload[0]['variables']['cursor'] = cursor
        r = session.post('https://gql.twitch.tv/gql',
                         json=payload,
                         headers=headers)
        #print(r)
        data = r.json()
        for edge in data[0]['data']['user']['clips']['edges']:
            url_video = edge['node']['url']
            info['name'] = edge['node']['broadcaster']['displayName']
            video = Video(url_video)
            video.id = int(edge['node']['id'])
            videos.append(video)
            cursor_new = edge['cursor']
        print_('videos: {} / cursor: {}'.format(len(videos), cursor))
        if cursor == cursor_new:
            print_('same cursor')
            break
        if cursor_new is None:
            break
        cursor = cursor_new
    if not videos:
        raise Exception('no videos')
    info['videos'] = sorted(videos, key=lambda video: video.id, reverse=True)
    return info
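All of these examples lean on the project's cut_pair helper to slice a balanced {...} or [...] literal out of surrounding JavaScript. Its implementation is not included here, so the sketch below is only an assumption of how such a bracket-matching helper might behave (the optional pair argument mirrors the cut_pair(..., '[]') call in Example #8); it is not the project's actual code.

def cut_pair_sketch(text, pair='{}'):
    # Minimal sketch (assumption): return the substring from the first opening
    # bracket up to and including its matching closing bracket. Handling of
    # brackets inside string literals is omitted for brevity.
    open_, close = pair[0], pair[1]
    start = text.index(open_)
    depth = 0
    for i, c in enumerate(text[start:], start):
        if c == open_:
            depth += 1
        elif c == close:
            depth -= 1
            if depth == 0:
                return text[start:i + 1]
    raise ValueError('unbalanced brackets')

# cut_pair_sketch('{"a": {"b": 1}}; var x = 2;')     -> '{"a": {"b": 1}}'
# cut_pair_sketch('["x", ["y"]] + rest', pair='[]')  -> '["x", ["y"]]'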
Example #2
def get_sd(url, session=None, html=None, cw=None, wait=True):
    print_ = get_print(cw)

    if html:
        soup = Soup(html)
        check_error(soup, cw, wait)
        for script in soup.findAll('script'):
            j = get_j(script)
            if j:
                break
        else:
            raise Exception('no _sharedData!!')
    else:
        for try_ in range(4):
            _wait(cw)
            html = read_html(url, session, cw)
            soup = Soup(html)
            check_error(soup, cw, wait)
            for script in soup.findAll('script'):
                j = get_j(script)
                if j:
                    break
            else:
                continue
            break
        else:
            raise Exception('no _sharedData')
    # merge any window.__additionalDataLoaded(...) payload into the entry data
    for script in soup.findAll('script'):
        s = script.string
        if s and 'window.__additionalDataLoaded(' in s:
            s = cut_pair(s)
            j_add = json.loads(s)
            try:
                j['entry_data']['PostPage'][0].update(j_add)
            except:
                j['entry_data']['ProfilePage'][0].update(j_add)  #2900

    # Challenge
    challenge = j['entry_data'].get('Challenge')
    if challenge:
        try:
            for cont in challenge[0]['extraData']['content']:
                title = cont.get('title')
                if title:
                    break
            else:
                raise Exception('no title')
        except:
            title = 'Err'
        raise errors.LoginRequired(title)

    # LoginAndSignupPage
    login = j['entry_data'].get('LoginAndSignupPage')
    if login:
        raise errors.LoginRequired()

    return j
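The get_j helper used above is not shown either. Judging from the error messages, it extracts Instagram's window._sharedData object from an inline script; the snippet below is a simplified, self-contained sketch of that idea using only the standard library, and the exact assignment pattern it matches is an assumption.

import json
import re

def get_j_sketch(script_text):
    # Assumed behaviour of get_j: return the parsed window._sharedData object
    # if this script defines it, otherwise None.
    m = re.search(r'window\._sharedData\s*=\s*(\{.*\})\s*;', script_text or '', re.DOTALL)
    if not m:
        return None
    return json.loads(m.group(1))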
Example #3
def get_info(url, html):
    soup = Soup(html)
    info = {}

    uname = soup.find('div', class_='user-name') or soup.find('p', class_='uname') or soup.find('div', class_='user-info-name')

    info['artist'] = uname.text.strip()
    
    s = cut_pair(html.split('window.__ssr_data = JSON.parse("')[1])
    j = json.loads(json.loads(u'"{}"'.format(s)))
    
    if '/detail/' in url:
        info['uid'] = j['detail']['detail_user']['uid']
        info['id'] = j['detail']['post_data']['item_id']
    else:
        info['uid'] = j['homeInfo']['uid']

    return info
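The window.__ssr_data payload is embedded as the string argument of JSON.parse("..."), so what cut_pair returns is still a backslash-escaped JavaScript string literal. Wrapping it in quotes and calling json.loads once restores the plain JSON text, and a second json.loads parses the object itself. A tiny self-contained illustration with a made-up payload:

import json

escaped = '{\\"uid\\": 42}'                  # what the page embeds inside JSON.parse("...")
inner = json.loads('"{}"'.format(escaped))   # first pass un-escapes -> '{"uid": 42}'
data = json.loads(inner)                     # second pass parses the object
print(data['uid'])                           # 42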
Example #4
def get_info(url):
    url = url.replace('/gallery/', '/a/')
    if '/r/' in url and url.split('/r/')[1].strip('/').count('/') == 0:
        title = re.find(r'/r/([^/]+)', url)
        info = {}
        info['title'] = title
        info['type'] = 'r'
    else:
        try: # legacy
            html = downloader.read_html(url, cookies={'over18':'1'})
            s = re.find('image *: *({.+)', html)
            info_raw = cut_pair(s)
        except Exception as e: # new
            print(e)
            id_ = re.find(r'/a/([0-9a-zA-Z_]+)', url) or re.find(r'/r/[0-9a-zA-Z_]+/([0-9a-zA-Z_]+)', url, err='no id')
            url_api = 'https://api.imgur.com/post/v1/albums/{}?client_id=546c25a59c58ad7&include=media%2Cadconfig%2Caccount'.format(id_)
            info_raw = downloader.read_html(url_api, cookies={'over18':'1'})
        info = json.loads(info_raw)
        info['type'] = 'a'
    return info
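The branch above treats a URL with exactly one path segment after /r/ as a plain subreddit gallery and routes everything else (albums and individual posts) through the JSON path. A quick illustration of that check with made-up URLs:

# Hypothetical URLs, only to show which branch each shape falls into.
for u in ['https://imgur.com/r/pics',          # subreddit gallery -> type 'r'
          'https://imgur.com/r/pics/AbCdEf1',  # post inside a subreddit -> JSON path
          'https://imgur.com/a/AbCdEf1']:      # album -> JSON path
    is_subreddit = '/r/' in u and u.split('/r/')[1].strip('/').count('/') == 0
    print(u, '->', 'r' if is_subreddit else 'a')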
Example #5
def get_imgs(url, html=None, cw=None):
    if '/detail/' not in url:
        return get_imgs_channel(url, html, cw)

    if html is None:
        html = downloader.read_html(url)

    s = cut_pair(html.split('window.__ssr_data = JSON.parse("')[1])
    s = json.loads(u'"{}"'.format(s))

    data = json.loads(s)

    multi = data['detail']['post_data']['multi']

    imgs = []

    for m in multi:
        path = m['original_path']
        img = json.loads(u'"{}"'.format(path))
        img = Image_single(img, url, len(imgs))
        imgs.append(img)

    return imgs
Example #6
def get_data(html):
    data_raw = cut_pair(re.find('window.initials *= *(.+)', html))
    return json.loads(data_raw)
Example #7
def get_pages(html):
    s = re.find(r'__INITIAL_STATE__=(.+)', html)
    data_raw = cut_pair(s)
    data = json.loads(data_raw)
    pages = data['videoData']['pages']
    return pages
Example #8
def get_imgs_page_legacy(page, session, cw=None, depth=0):
    if cw is not None and not cw.alive:
        return
    print_ = get_print(cw)

    try:
        html = read_html(page.url, session)
    except Exception as e:
        print_('get_imgs_page_legacy error: {}'.format(e))
        if e.args and e.args[0] == 502:
            return []
        raise

    if isProtected(html):
        data = get_soup(page.url, cw=cw, session=session)
        page.url = data['url']
        html = data['html']

    soup = Soup(html, 'html5lib')  # 1653

    # skip empty pages
    if not html:
        print_(u'empty page: {}'.format(page.title))
        return []

    # skip invalid pages
    err = soup.find('span', class_='cf-error-code')
    if err:
        print_(u'cf-error-code: {} ({})'.format(err.text.strip(), page.title))
        if depth > 0:
            return []
        else:
            return get_imgs_page_legacy(page, session, cw, depth + 1)

    #page.title = get_title_page(soup)
    matches = re.findall('var img_list *= *(.+?]);', html.replace('\n', ''))
    matches1 = re.findall('var img_list1 *= *(.+?]);', html.replace('\n', ''))
    img_list = json.loads(matches[0]) if matches else []
    img_list1 = json.loads(matches1[0]) if matches1 else []

    # 1780
    img_list = [img for img in img_list if img]
    img_list1 = [img for img in img_list1 if img]

    # 1589
    '''
    if not img_list and not img_list1:
        print_((u'no imgs; retry... {}').format(page.title))
        raise Exception('No images')
    '''

    for script in soup.findAll('script'):
        script = script.text
        if 'var img_list =' in script:
            break
    else:
        raise Exception('No script')

    seed = int(re.find('view_cnt *= *([0-9]+)', script))
    chapter = int(re.find('var +chapter *= *([0-9]+)', script))
    try:
        cdn_domains = cut_pair(re.find('var +cdn_domains *= *(.+)', script),
                               '[]')
        cdn_domains = json.loads(cdn_domains)
    except Exception as e:
        print(e)
        cdn_domains = []

    # pad both lists to the same length so zip() below doesn't drop entries
    n = max(len(img_list), len(img_list1))
    img_list += [''] * (n - len(img_list))
    img_list1 += [''] * (n - len(img_list1))

    print_(u'{}    chapter:{}    seed:{}    domains:{}'.format(
        page.title, chapter, seed, len(cdn_domains)))
    # a non-zero seed means the image order is scrambled; let the caller handle it
    if seed != 0:
        return 'seed'
    imgs = []
    for p, (img, img1) in enumerate(zip(img_list, img_list1)):

        # fix img url
        img = fix_img_url(img, cdn_domains, chapter, p)
        img1 = fix_img_url(img1, cdn_domains, chapter, p)

        img = urljoin(page.url, img) if img else ''
        img1 = urljoin(page.url, img1) if img1 else ''  # most likely googledrive
        if img.strip('/').count('/') == 2:  #1425
            continue
        img = Image(img, page, p, img1)
        imgs.append(img)

    return imgs
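Note that this function can return three different things: a list of Image objects, an empty list for pages that should simply be skipped, and the string 'seed' when the image order is scrambled and has to be handled elsewhere. A hypothetical caller therefore needs to check for that sentinel before treating the result as a list:

# Hypothetical caller; page, session, cw and the scrambled-page handler are assumptions.
result = get_imgs_page_legacy(page, session, cw=cw)
if result == 'seed':
    imgs = get_imgs_page_scrambled(page, session, cw=cw)  # hypothetical fallback
else:
    imgs = result or []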
Example #9
def get_ssr_data(html):
    s = html.split('window.__ssr_data = JSON.parse("')[1].replace('\\"', '"')
    s = cut_pair(s).replace('"', '\\"')
    data = json.loads(json.loads('"{}"'.format(s)))
    return data
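Here the order of operations matters: the escaped quotes are un-escaped first so that cut_pair sees the raw {...} payload and can find the matching closing brace, then re-escaped so the result is once again a valid JSON string literal for the outer json.loads. A small walk-through of that round trip with a made-up payload (the naive index('}') below stands in for cut_pair and only works because the toy object has no nesting):

import json

raw = 'window.__ssr_data = JSON.parse("{\\"a\\": 1}"); more js...'
s = raw.split('window.__ssr_data = JSON.parse("')[1].replace('\\"', '"')
s = s[:s.index('}') + 1]                   # stand-in for cut_pair(s)
s = s.replace('"', '\\"')                  # re-escape for the outer json.loads
data = json.loads(json.loads('"{}"'.format(s)))
print(data)                                # {'a': 1}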