def fix_url(cls, url):
    if re.search(r'/(videos|clips)\?filter=', url):
        return url.strip('/')
    return url.split('?')[0].strip('/')
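# A minimal usage sketch with hypothetical URLs (passing None for the unused cls
# and assuming `import re`): filtered /videos or /clips listings keep their query
# string, while any other query string is stripped.
print(fix_url(None, 'https://example.com/someuser/videos?filter=archives'))
# -> https://example.com/someuser/videos?filter=archives
print(fix_url(None, 'https://example.com/someuser/videos?page=2'))
# -> https://example.com/someuser/videos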
def get_info(url, cw=None, depth=0, tags_add=None):
    print_ = get_print(cw)
    api = PixivAPI()
    info = {}
    imgs = []

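    # Pull the ugoira conversion format and the filename format from the UI
    # settings when running in the GUI; otherwise fall back to defaults.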
    if utils.ui_setting:
        ugoira_ext = [None, '.gif', '.webp', '.png'][utils.ui_setting.ugoira_convert.currentIndex()]
        format_ = compatstr(utils.ui_setting.pixivFormat.currentText())
    else:
        ugoira_ext = None
        format_ = 'id_ppage'

    max_pid = get_max_range(cw)

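    # Dispatch on the URL type: single illust, user bookmarks, tag search,
    # "newest works: following" feed, or a user's illust/manga listing.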
    if api.illust_id(url):  # Single post
        id_ = api.illust_id(url)
        data = api.illust(id_)
        login = '******' not in data
        if FORCE_LOGIN and not login:  #
            raise errors.LoginRequired()
        if data['xRestrict'] and not login:
            raise errors.LoginRequired('R-18')
        info['artist'] = data['userName']
        info['artist_id'] = data['userId']
        info['raw_title'] = data['illustTitle']
        info['title'] = '{} (pixiv_illust_{})'.format(info['raw_title'], id_)
        info['create_date'] = parse_time(data['createDate'])
        tags_illust = set(tag['tag'] for tag in data['tags']['tags'])

        if tags_matched(tags_illust, tags_add, cw):
            if data['illustType'] == 2:  # ugoira
                data = api.ugoira_meta(id_)
                ugoira = {
                    'ext': ugoira_ext,
                    'delay': [frame['delay'] for frame in data['frames']],
                }
                img = Image(data['originalSrc'],
                            url,
                            id_,
                            0,
                            format_,
                            info,
                            cw,
                            ugoira=ugoira)
                imgs.append(img)
            else:
                data = api.pages(id_)
                for img in data:
                    img = Image(img['urls']['original'], url, id_, len(imgs),
                                format_, info, cw)
                    imgs.append(img)
        else:
            print('tags mismatched')
    elif '/bookmarks/' in url or 'bookmark.php' in url:  # User bookmarks
        id_ = api.user_id(url)
        if id_ is None:  #
            id_ = my_id()
        if id_ == my_id():
            rests = ['show', 'hide']
        else:
            rests = ['show']
        process_user(id_, info, api)
        info['title'] = '{} (pixiv_bmk_{})'.format(info['artist'],
                                                   info['artist_id'])
        ids = []
        ids_set = set()
        for rest in rests:
            offset = 0
            while len(ids) < max_pid:
                data = api.bookmarks(id_, offset, rest=rest)
                c = 0
                for id in [work['id'] for work in data['works']]:
                    if id in ids_set:
                        continue
                    ids_set.add(id)
                    ids.append(id)
                    c += 1
                if not c:
                    break
                offset += LIMIT
                if depth == 0:
                    check_alive(cw)
        process_ids(ids, info, imgs, cw, depth)
    elif '/tags/' in url or 'search.php' in url:  # Search
        q = unquote(
            re.find(r'/tags/([^/]+)', url)
            or re.find('[?&]word=([^&]*)', url, err='no tags'))
        info['title'] = '{} (pixiv_search_{})'.format(q, q.replace(' ', '+'))
        qs = query_url(url)
        order = qs.get('order', ['date_d'])[0]
        mode = qs.get('mode', ['all'])[0]
        s_mode = qs.get('s_mode', ['s_tag_full'])[0]
        scd = qs.get('scd', [None])[0]
        ecd = qs.get('ecd', [None])[0]
        type_ = qs.get('type', ['all'])[0]
        wlt = qs.get('wlt', [None])[0]
        wgt = qs.get('wgt', [None])[0]
        hlt = qs.get('hlt', [None])[0]
        hgt = qs.get('hgt', [None])[0]
        blt = qs.get('blt', [None])[0]
        bgt = qs.get('bgt', [None])[0]
        ratio = qs.get('ratio', [None])[0]
        tool = qs.get('tool', [None])[0]
        logs = [
            'order: {}'.format(order),
            'mode: {}'.format(mode),
            's_mode: {}'.format(s_mode),
            'scd / ecd: {} / {}'.format(scd, ecd),
            'type: {}'.format(type_),
            'wlt / wgt: {} / {}'.format(wlt, wgt),
            'hlt / hgt: {} / {}'.format(hlt, hgt),
            'blt / bgt: {} / {}'.format(blt, bgt),
            'ratio: {}'.format(ratio),
            'tool: {}'.format(tool),
        ]
        print_('\n'.join(logs))
        ids = []
        ids_set = set()
        p = 1
        while len(ids) < max_pid:
            data = api.search(q,
                              order,
                              mode,
                              p=p,
                              s_mode=s_mode,
                              scd=scd,
                              ecd=ecd,
                              type_=type_,
                              wlt=wlt,
                              wgt=wgt,
                              hlt=hlt,
                              hgt=hgt,
                              blt=blt,
                              bgt=bgt,
                              ratio=ratio,
                              tool=tool)
            c = 0
            for id in [
                    illust['id'] for illust in data['illustManga']['data']
                    if 'id' in illust
            ]:
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            p += 1
        process_ids(ids, info, imgs, cw, depth)
    elif 'bookmark_new_illust.php' in url or 'bookmark_new_illust_r18.php' in url:  # Newest works: Following
        r18 = 'bookmark_new_illust_r18.php' in url
        id_ = my_id()
        process_user(id_, info, api)
        info['title'] = '{} (pixiv_following_{}{})'.format(
            info['artist'], 'r18_' if r18 else '', info['artist_id'])
        ids = []
        ids_set = set()
        p = 1
        while len(ids) < max_pid:
            data = api.following(p, r18=r18)
            c = 0
            for id in data['page']['ids']:
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            p += 1
        process_ids(ids, info, imgs, cw, depth)
    elif api.user_id(url):  # User illusts
        m = re.search(r'/users/[0-9]+/([\w]+)/?([^\?#/]*)', url)
        type_ = {
            'illustrations': 'illusts',
            'manga': 'manga'
        }.get(m and m.groups()[0])
        if type_:
            types = [type_]
        else:
            types = ['illusts', 'manga']
        if m:
            tag = unquote(m.groups()[1]) or None
        else:
            tag = None
        print_('types: {}, tag: {}'.format(types, tag))

        id_ = api.user_id(url)
        process_user(id_, info, api)
        data = api.profile(id_)
        info['title'] = '{} (pixiv_{})'.format(info['artist'],
                                               info['artist_id'])

        ids = []
        for type_ in types:
            illusts = data[type_]
            if not illusts:
                continue
            ids += list(illusts.keys())
        ids = sorted(ids, key=int, reverse=True)
        if not ids:
            raise Exception('no imgs')
        process_ids(ids,
                    info,
                    imgs,
                    cw,
                    depth,
                    tags_add=[tag] if tag else None)
    else:
        raise NotImplementedError()
    info['imgs'] = imgs[:max_pid]

    return info
Example #3
def isVisible(tag):
    while tag:
        if re.search('display: *none', tag.get('style', ''), re.IGNORECASE):
            return False
        tag = tag.parent
    return True
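# A minimal usage sketch, assuming `import re` and that the tags come from
# BeautifulSoup (bs4); the project's Soup wrapper is assumed to behave the same.
from bs4 import BeautifulSoup

_html = '<div style="display: none"><p id="a">hidden</p></div><p id="b">shown</p>'
_soup = BeautifulSoup(_html, 'html.parser')
print(isVisible(_soup.find(id='a')))  # False: an ancestor has display: none
print(isVisible(_soup.find(id='b')))  # True: no hidden ancestor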
Example #4
def fix_url(cls, url):
    m = re.search(r'xhamster(?P<number>[0-9]*)\.(?P<top>[a-z0-9]+)/', url)
    if m is None:
        return url
    number, top = m.groups()
    return url.replace('xhamster{}.{}/'.format(number, top), 'xhamster.com/')
def _get_page_id(html):
    m = re.search(r"CONFIG\['page_id'\]='([0-9]+?)'", html)
    return m.group(1) if m else None  # page id as a string, or None if absent
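# A minimal usage sketch against an inline, hypothetical HTML fragment.
_sample = "<script>var CONFIG = {}; CONFIG['page_id']='1005051234567890';</script>"
print(_get_page_id(_sample))  # -> 1005051234567890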
    def read(self):
        print_ = get_print(self.customWidget)
        for try_ in range(8):
            print_('get_session')
            try:
                session = get_session()
                html = downloader.read_html(self.url, session=session)
                soup = Soup(html)
                get_title_artist(soup)  # sanity check; raises if the page is incomplete
                break
            except Exception as e:
                e_ = e
                print(e)
        else:
            raise e_

        title, self.artist = get_title_artist(soup)
        self.__title = title
        title_dir = clean_title(u'[{}] {}'.format(self.artist, title))
        ex = soup.find('div', id='novel_ex')
        self.novel_ex = ex.text.strip() if ex else None
        texts = []
        subtitles = soup.findAll('dd', class_='subtitle')
        if subtitles:
            for subtitle in subtitles:
                update = subtitle.parent.find('dt', class_='long_update')
                update2 = None
                if update:
                    for span in update.findAll('span'):
                        update2 = span.attrs['title']
                        span.decompose()

                    update = update.text.strip()
                if update2:
                    update += u'  ({})'.format(update2)
                a = subtitle.find('a')
                subtitle = a.text.strip()
                href = urljoin(self.url, a.attrs['href'])
                if not re.search(r'ncode\.syosetu\.com/{}/[0-9]+'.format(self.id_), href):
                    print_(u'skip: {}'.format(href))
                    continue
                text = Text(subtitle, update, href, session, False)
                texts.append(text)

        else:
            self.single = True
            text = Text(title_dir, None, self.url, session, True)
            texts.append(text)
        self.print_(u'single: {}'.format(self.single))
        outdir = get_outdir('syosetu')
        for text in texts:
            if self.single:
                file = os.path.join(outdir, text.filename)
            else:
                file = os.path.join(outdir, title_dir, text.filename)
            if os.path.isfile(file):
                self.urls.append(file)
            else:
                self.urls.append(text.url)

        self.title = title_dir
Example #7
def init(self):
    self.url = self.url.replace('xhamster_', '')
    if not re.search(r'xhamster[0-9]*\.', self.url):
        self.url = 'https://xhamster.com/videos/{}'.format(self.url)
def f(url):
    if re.search(PATTERN_ID, url):
        raise Exception(tr_(u'목록 주소를 입력해주세요'))  # "Please enter a list URL"
    session = Session()
    pages = get_pages(url, session=session)
    return pages
Example #9
def init(self):
    if re.search(r'xhamsterlive[0-9]*\.', self.url):
        raise Exception('xHamsterLive')
    if not re.search(r'xhamster[0-9]*\.', self.url):
        self.url = 'https://xhamster.com/videos/{}'.format(self.url)
Example #10
    def f(html, browser=None):
        soup = Soup(html)
        if is_captcha(soup):
            print('captcha')
            browser.show()
            sd['shown'] = True
        elif sd['shown'] and not SHOW:
            browser.hide()
            sd['shown'] = False
        try:
            st = soup.find('h2', class_='share-title')
            if st is None:
                st = soup.find('h2', class_=lambda c: c and 'ShareTitle' in c)
            info['uid'] = st.text.strip()
            st = soup.find('h1', class_='share-sub-title')
            if st is None:
                st = soup.find('h1',
                               class_=lambda c: c and 'ShareSubTitle' in c)
            info['nickname'] = st.text.strip()
        except Exception as e:
            print_(print_error(e)[0])
        c = 0
        ids_now = set()
        items = soup.findAll('div', class_='video-feed-item') + soup.findAll(
            'div', class_=lambda c: c and 'DivItemContainer' in c)
        for div in items:
            a = div.find('a')
            if a is None:
                continue
            href = a['href']
            if not href:
                continue
            m = re.search(PATTERN_VID, href)
            if m is None:
                continue
            id_video = int(m.group('id'))
            ids_now.add(id_video)
            if id_video in ids:
                continue
            ids.add(id_video)
            info['items'].append({'id': id_video})
            c += 1

        print_('items: {}'.format(len(info['items'])))
        if len(info['items']) >= max_pid:
            info['items'] = info['items'][:max_pid]
            return True

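        # Scroll the embedded browser to the bottom to trigger lazy-loading of
        # more feed items, then give the page time to append them.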
        browser.runJavaScript(
            'window.scrollTo(0, document.body.scrollHeight);')
        sleep(15, cw)

        if c or (ids_now and min(ids_now) > min(ids)):
            sd['count_empty'] = 0
        else:
            print_('empty')
            sd['count_empty'] += 1
        msg = '{}  {} (tiktok_{}) - {}'.format(tr_('읽는 중...'),  # '읽는 중...' = "Reading..."
                                               info.get('nickname'),
                                               info.get('uid'),
                                               len(info['items']))
        if cw:
            if not cw.alive:
                raise Exception('cw dead')
            cw.setTitle(msg)
        else:
            print(msg)
        return sd['count_empty'] > 4
Example #11
    def get(self, url):
        '''
        Resolve the direct media URL for a video or gif page and cache it.
        '''
        cw = self.cw
        session = self.session
        print_ = get_print(cw)
        if self._url:
            return self._url

        id_ = re.find(r'viewkey=(\w+)', url, re.IGNORECASE) or \
              re.find(r'/embed/(\w+)', url, re.IGNORECASE, err='no id')
        print_('id: {}'.format(id_))
        if 'viewkey=' not in url.lower() and '/gif/' not in url.lower():
            url = urljoin(url, '/view_video.php?viewkey={}'.format(id_))

        url_test = url.replace('pornhubpremium.com', 'pornhub.com')
        try:
            html = downloader.read_html(url_test, session=session)
            soup = Soup(html)
            if soup.find('div', id='lockedPlayer'):
                print_('Locked player')
                raise Exception('Locked player')
            url = url_test
        except: #3511
            url = url.replace('pornhub.com', 'pornhubpremium.com')
            html = downloader.read_html(url, session=session)
            
        soup = Soup(html)
        soup = fix_soup(soup, url, session, cw)
        html = soup.html

        # removed
        if soup.find('div', class_='removed'):
            raise Exception('removed')

        gif = soup.find('div', {'id': 'gifImageSection'})
        if gif:
            print_('GIF')
            id_ = url.split('/gif/')[1]
            id_ = re.findall('[0-9a-zA-Z]+', id_)[0]
            
            jss = list(gif.children)
            for js in jss:
                if 'data-mp4' in getattr(js, 'attrs', {}):
                    break
            else:
                raise Exception('gif mp4 url not found')

            title = js['data-gif-title']
            url = js['data-mp4']
            url_thumb = re.find(r'https?://.+?\.phncdn\.com/pics/gifs/.+?\.jpg', html, err='no thumb')
            file = File('gif_{}'.format(id_), title, url, url_thumb)
        else:
            if id_ is None:
                raise Exception('no id')

            print_('Video')

            # 1968
            #title = j['video_title']
            title = soup.find('h1', class_='title').text.strip()

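            # Collect candidate (video_url, height) pairs from several sources on
            # the page: the flashvars_* JSON (mediaDefinitions), media_*/quality_*
            # JS variables, and download-button links; video_urls_set deduplicates.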
            video_urls = []
            video_urls_set = set()

            def int_or_none(s):
                try:
                    return int(s)
                except:
                    return None

            def url_or_none(url):
                if not url or not isinstance(url, str):
                    return None
                url = url.strip()
                return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
            
            flashvars = json.loads(re.find(r'var\s+flashvars_\d+\s*=\s*({.+?});', html, err='no flashvars'))
            url_thumb = flashvars.get('image_url')
            media_definitions = flashvars.get('mediaDefinitions')
            if isinstance(media_definitions, list):
                for definition in media_definitions:
                    if not isinstance(definition, dict):
                        continue
                    video_url = definition.get('videoUrl')
                    if not video_url or not isinstance(video_url, str):
                        continue
                    if video_url in video_urls_set:
                        continue
                    video_urls_set.add(video_url)
                    video_urls.append(
                        (video_url, int_or_none(definition.get('quality'))))

            def extract_js_vars(webpage, pattern, default=object()):
                assignments = re.find(pattern, webpage, default=default)
                if not assignments:
                    return {}

                assignments = assignments.split(';')

                js_vars = {}

                def remove_quotes(s):
                    if s is None or len(s) < 2:
                        return s
                    for quote in ('"', "'", ):
                        if s[0] == quote and s[-1] == quote:
                            return s[1:-1]
                    return s

                def parse_js_value(inp):
                    inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
                    if '+' in inp:
                        inps = inp.split('+')
                        return functools.reduce(
                            operator.concat, map(parse_js_value, inps))
                    inp = inp.strip()
                    if inp in js_vars:
                        return js_vars[inp]
                    return remove_quotes(inp)

                for assn in assignments:
                    assn = assn.strip()
                    if not assn:
                        continue
                    assn = re.sub(r'var\s+', '', assn)
                    vname, value = assn.split('=', 1)
                    js_vars[vname] = parse_js_value(value)
                return js_vars

            def add_video_url(video_url):
                v_url = url_or_none(video_url)
                if not v_url:
                    return
                if v_url in video_urls_set:
                    return
                video_urls.append((v_url, None))
                video_urls_set.add(v_url)

            def parse_quality_items(quality_items):
                q_items = json.loads(quality_items)
                if not isinstance(q_items, list):
                    return
                for item in q_items:
                    if isinstance(item, dict):
                        add_video_url(item.get('url'))

            if not video_urls:
                print_('# extract video_urls 2')
                FORMAT_PREFIXES = ('media', 'quality', 'qualityItems')
                js_vars = extract_js_vars(
                    html, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES),
                    default=None)
                if js_vars:
                    for key, format_url in js_vars.items():
                        if key.startswith(FORMAT_PREFIXES[-1]):
                            parse_quality_items(format_url)
                        elif any(key.startswith(p) for p in FORMAT_PREFIXES[:2]):
                            add_video_url(format_url)
                if not video_urls and re.search(
                        r'<[^>]+\bid=["\']lockedPlayer', html):
                    raise Exception('Video is locked')

##            if not video_urls:
##                print_('# extract video_urls 3')
##                js_vars = extract_js_vars(
##                    dl_webpage('tv'), r'(var.+?mediastring.+?)</script>')
##                add_video_url(js_vars['mediastring'])

            for mobj in re.finditer(
                    r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
                    html):
                video_url = mobj.group('url')
                if video_url not in video_urls_set:
                    video_urls.append((video_url, None))
                    video_urls_set.add(video_url)

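            # Some entries point to a /video/get_media JSON endpoint instead of a
            # direct file; fetch it and expand it into concrete (url, height) pairs.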
            video_urls_ = video_urls
            video_urls = []
            for video_url, height in video_urls_:
                if '/video/get_media' in video_url:
                    print_(video_url)
                    medias = downloader.read_json(video_url, session=session)
                    if isinstance(medias, list):
                        for media in medias:
                            if not isinstance(media, dict):
                                continue
                            video_url = url_or_none(media.get('videoUrl'))
                            if not video_url:
                                continue
                            height = int_or_none(media.get('quality'))
                            video_urls.append((video_url, height))
                    continue
                video_urls.append((video_url, height))
                

            videos = []
            for video_url, height in video_urls:
                video = {}
                video['height'] = height or int_or_none(re.find(r'(?P<height>\d+)[pP]?_\d+[kK]', video_url))
                video['quality'] = video['height'] or 0
                video['videoUrl'] = video_url
                ext = get_ext(video_url)
                video['ext'] = ext
                if ext.lower() == '.m3u8':
                    video['quality'] -= 1
                print_('[{}p] {} {}'.format(video['height'], video['ext'], video['videoUrl']))
                videos.append(video)

            if not videos:
                raise Exception('No videos')

            videos = sorted(videos, key=lambda video: video['quality'])

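            # Prefer the best quality that does not exceed the configured
            # resolution limit; otherwise fall back to the lowest available.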
            res = get_resolution()

            videos_good = [video for video in videos if video['quality'] <= res]
            if videos_good:
                video = videos_good[-1]
            else:
                video = videos[0]
            print_('\n[{}p] {} {}'.format(video['height'], video['ext'], video['videoUrl']))

            file = File(id_, title, video['videoUrl'].strip(), url_thumb)
        
        self._url = file.url
        self.title = file.title
        self.filename = file.filename
        self.thumb = file.thumb
        return self._url
Example #12
def get_imgs(url, n_max=2000, title=None, cw=None, session=None):
    print_ = get_print(cw)

    for try_ in range(4):
        try:
            html = read_html(url, session, cw)
            m = re.search('"edge_owner_to_timeline_media":{"count":([0-9]+)',
                          html)
            if m is None:
                raise Exception('Invalid page')
            break
        except Exception as e:
            e_ = e
            print_(print_error(e)[0])
    else:
        raise e_
    n = int(m.groups()[0])
    n = min(n, n_max)

    data = get_sd(url, html=html, cw=cw)

    uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
    csrf_token = data['config']['csrf_token']  #
    session.cookies.set(name='ig_pr',
                        value='1',
                        path='/',
                        domain='.instagram.com')

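    # Page through the user's timeline via the GraphQL media query, following
    # page_info.end_cursor until enough edges are collected or no page remains.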
    cursor = ''
    edges = []
    bad = 0
    while True:
        check_alive(cw)

        variables = {
            'id': uploader_id,
            'first': 12,
        }
        if cursor:
            variables['after'] = cursor
        #print_(variables)#

        media = None
        try:
            j = get_query('003056d32c2554def87228bc3fd9668a', variables,
                          session, cw)
            media = j['data']['user']['edge_owner_to_timeline_media']
            sleep(2)  #
        except Exception as e:
            if bad > 10:
                raise Exception('no media')
            else:
                print_(u'no media.. retry... ({}) {}'.format(
                    bad + 1,
                    print_error(e)[0]))
                sleep(12 * bad, cw)
                bad += 1
                continue
        bad = 0

        edges_new = media.get('edges')
        if not edges_new or not isinstance(edges_new, list):
            print('no edges_new')
            break

        edges += edges_new

        s = u'{} {}  ({}/{})'.format(tr_(u'읽는 중...'), title, len(edges), n)  # '읽는 중...' = "Reading..."
        if cw is not None:
            cw.setTitle(s)
            if not cw.alive:
                return []
        else:
            print(s)

        if len(edges) >= n:
            break

        page_info = media.get('page_info')
        if not page_info:
            break
        if not page_info.get('has_next_page'):
            break
        cursor = page_info.get('end_cursor')
        if not cursor:
            break

    if len(edges) <= n / 2:
        raise Exception(u'Too short: {} / {}'.format(len(edges), n))

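    # Expand each collected edge (post) into its individual images/videos.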
    imgs = []
    for edge in edges:
        node = edge['node']
        type = node['__typename']
        id = node['shortcode']
        url = u'https://www.instagram.com/p/{}/'.format(id)
        ##        if type in ['GraphVideo', 'GraphImage']:
        ##            single = True
        ##        else:
        ##            single = False
        for img in Node(url, session=session, cw=cw, media=node).imgs:
            imgs.append(img)
        if len(imgs) >= n_max:
            break

    return imgs