def get(self, url, force=False):
        if self._url:
            return self._url

        type = self.type
        only_mp4 = self.only_mp4
        audio_included = self.audio_included
        max_res = self.max_res
        max_abr = self.max_abr
        cw = self.cw
        print_ = get_print(cw)

        if force:
            max_abr = 0

        print('max_res: {}'.format(max_res))
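        # Retry extraction up to 8 times, backing off by try_/2 seconds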
        for try_ in range(8):
            try:
                yt = ytdl.YouTube(url)
                break
            except Exception as e:
                e_ = e
                s = print_error(e)[-1]
                print_('### youtube retry...\n{}'.format(s))
                sleep(try_ / 2, cw)
        else:
            raise e_

        streams = yt.streams.all()
        print_streams(streams, cw)

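        # Narrow the stream list by type, container, audio, and resolution/abr caps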
        if type == 'video':
            streams[:] = [
                stream for stream in streams if stream.video_codec is not None
            ]
            # Only mp4
            if only_mp4:
                streams_ = list(streams)
                streams[:] = []
                for stream in streams_:
                    if stream.subtype == 'mp4':
                        streams.append(stream)

            # Audio included; Non-HD
            if audio_included:
                streams_ = list(streams)
                streams[:] = []
                for stream in streams_:
                    if stream.audio_codec is not None:
                        streams.append(stream)

            # Maximum resolution
            streams_ = list(streams)
            streams[:] = []
            for stream in streams_:
                if stream.resolution is None:
                    continue
                res = int(stream.resolution.replace('p', ''))
                if max_res is None or res <= max_res:
                    streams.append(stream)
            print_('')
        elif type == 'audio':
            streams[:] = [stream for stream in streams if stream.abr]
            # Maximum abr
            abrs = [stream.abr for stream in streams]
            if max_abr is not None:
                max_abr = min(max(abrs), max_abr)
            streams_ = list(streams)
            streams[:] = []
            for stream in streams_:
                if stream.abr is None:
                    continue
                abr = stream.abr
                if max_abr is None or abr >= max_abr:
                    streams.append(stream)
        else:
            raise Exception(u'type "{}" is not supported'.format(type))

        # Pick the best
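        # (highest resolution for video; lowest abr above the cap for audio),
        # then verify the stream URL and drop unreachable streams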
        while streams:
            if type == 'video':
                ress = [
                    int_(stream.resolution.replace('p', ''))
                    for stream in streams
                ]
                m = max(ress)
                prefer_format = 'mp4'
            elif type == 'audio':
                ress = [stream.abr for stream in streams]
                m = min(ress)
                prefer_format = 'webm'
            print('Resolutions:', ress)
            stream_final = None
            for stream, res in zip(streams, ress):
                if res == m:
                    if type == 'video':
                        # Prefer a stream that adds audio at the same resolution
                        audio_gain = (stream_final is not None) and (
                            stream_final.audio_codec is None) and bool(
                                stream.audio_codec)
                    elif type == 'audio':
                        audio_gain = False
                    # Upgrade stream_final if this stream adds audio, the
                    # preferred container, or a higher fps (never a lower fps)
                    if stream_final is None or (
                            stream_final.fps <= stream.fps and
                            (audio_gain or
                             (stream_final.subtype.lower() != prefer_format and
                              stream.subtype.lower() == prefer_format) or
                             stream_final.fps < stream.fps)):
                        print_(u'# stream_final {} {} {} {} {} {}fps'.format(
                            stream, stream.format, stream.resolution,
                            stream.subtype, stream.audio_codec, stream.fps))
                        stream_final = stream

            ok = downloader.ok_url(stream_final.url,
                                   referer=url) if isinstance(
                                       stream_final.url, str) else True
            if ok:
                break
            else:
                print_('stream is not valid')
                streams.remove(stream_final)
        else:
            if type == 'audio' and not force:
                return self.get(url, force=True)  # 1776
            raise Exception('No videos')

        stream = stream_final

        self.yt = yt
        self.id = yt.video_id
        self.stream = stream
        self.username = yt.info['uploader']
        self.stream_audio = None
        self.audio = None
        self.thumb = None
        self.thumb_url = None
        self.subtitles = yt.subtitles

        if type == 'audio' and 'DASH' in self.stream.format:
            self.stream.setDashType('audio')

        # Audio
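        # The chosen video stream may be video-only; pick the best separate
        # audio stream (preferring mp4 audio, then audio-only, then highest abr)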
        if type == 'video' and stream.audio_codec is None:
            print('audio required')
            streams = [stream for stream in yt.streams.all() if stream.abr]
            print_streams(streams, cw)

            # only mp4; https://github.com/KurtBestor/Hitomi-Downloader-issues/issues/480
            def isGood(stream):
                return stream.audio_codec.lower().startswith('mp4')

            streams_good = [stream for stream in streams if isGood(stream)]
            if streams_good:
                streams = streams_good
                print_streams(streams, cw)
            # only audio?
            if any(stream.resolution is None for stream in streams):
                streams = [
                    stream for stream in streams if stream.resolution is None
                ]
                print_streams(streams, cw)
            best_audio = None
            best_abr = 0
            for stream in streams:
                abr = stream.abr
                if abr > best_abr:
                    best_abr = abr
                    best_audio = stream
            if best_audio is None:
                raise Exception('No audio')
            print(best_audio)
            self.stream_audio = best_audio
            if 'DASH' in self.stream_audio.format:
                self.stream_audio.setDashType('audio')
            self.audio = best_audio.url
            if callable(self.audio):
                self.audio = self.audio()

        # Thumbnail
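        # Try thumbnail qualities from highest to lowest, rejecting zero-length
        # or placeholder images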
        for quality in ['sddefault', 'hqdefault', 'mqdefault', 'default']:
            print('####', yt.thumbnail_url)
            self.thumb_url = yt.thumbnail_url.replace('default', quality)
            f = BytesIO()
            try:
                downloader.download(self.thumb_url, buffer=f)
                data = f.read()
                if len(data) == 0:
                    raise AssertionError('Zero thumbnail')
                if data == empty_thumbnail:
                    raise AssertionError('Empty thumbnail')
                f.seek(0)
                break
            except Exception as e:
                print(print_error(e)[-1])
        self.thumb = f

        #
        _url = self.stream.url
        if callable(_url):
            _url = _url()
        self._url = _url
        title = yt.title
        #soup = Soup(yt.watch_html)
        #title =  soup.title.text.replace('- YouTube', '').strip()
        self.title = title
        ext = u'.' + self.stream.subtype
        self.filename = format_filename(title, self.id, ext)

        print_(u'Resolution: {}'.format(stream.resolution))
        print_(u'Codec: {} / {}'.format(stream.video_codec,
                                        stream.audio_codec))
        print_(u'Abr: {}'.format(stream.abr))
        print_(u'Subtype: {}'.format(stream.subtype))
        print_(u'FPS: {}\n'.format(stream.fps))

        return self._url
    def _pagination(self,
                    url_api,
                    params=None,
                    entry_tweet="tweet-",
                    entry_cursor="cursor-bottom-"):
        if params is None:
            params = self.params.copy()

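        # Fetch timeline pages, yielding tweets and following the bottom
        # cursor until it stops advancing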
        while True:
            cursor = None
            self.print_('cursor: {}'.format(params.get("cursor")))

            # 2303
            n_try = 20
            for try_ in range(n_try):
                try:
                    data = self._call(url_api, params=params)
                    tweets = data["globalObjects"]["tweets"]
                    break
                except Exception as e:
                    e_ = e
                    e_msg = print_error(e)[0]
                    if try_ < n_try - 1:
                        self.print_('retry... _pagination ({})\n{}'.format(
                            try_ + 1, e_msg))
                        sleep(30)
            else:
                raise e_

            users = data["globalObjects"]["users"]
            for instr in data["timeline"]["instructions"]:
                for entry in instr.get("addEntries", {}).get("entries", []):
                    if entry["entryId"].startswith(entry_tweet):
                        tid = entry["content"]["item"]["content"]["tweet"][
                            "id"]
                        if tid not in tweets:
                            self.print_(
                                "Skipping unavailable Tweet {}".format(tid))
                            continue
                        tweet = tweets[tid]
                        tweet["user"] = users[tweet["user_id_str"]]

                        yield tweet

                    elif entry["entryId"].startswith(entry_cursor):
                        cursor = entry["content"]["operation"]["cursor"][
                            "value"]

                if not cursor or params.get('cursor') == cursor:
                    print('same cursor')
                    return
                params["cursor"] = cursor
            if params.get("cursor") is None:  # nothing
                break
def get_imgs_legacy(username,
                    session,
                    title,
                    types,
                    n=None,
                    format='[%y-%m-%d] id_ppage',
                    cw=None,
                    mode='media',
                    method='tab',
                    imgs=None):
    print_ = get_print(cw)
    print_('types: {}'.format(', '.join(types)))

    artist, username = get_artist_username(username, session)  #

    # Range
    n = max(n or 0, get_max_range(cw))  # n may be None

    max_pos = None
    ids_set = set()
    if imgs:
        for img in imgs:
            ids_set.add(img.id)
    else:
        imgs = []
    n_fails = 0  # consecutive empty responses
    min_position = None
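    # Page through the timeline, falling back from 'tab' to 'search' to
    # 'search2' when a method keeps returning empty responses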
    while len(imgs) < n:
        if mode == 'media':
            if method == 'tab':
                pos_param = '&max_position={}'.format(
                    max_pos) if max_pos is not None else ''
                url = 'https://twitter.com/i/profiles/show/{}/media_timeline?include_available_features=1&include_entities=1{}&reset_error_state=false'.format(
                    username, pos_param)
                print_('max_pos={},  imgs={}'.format(max_pos, len(imgs)))
            elif method == 'search':  # 1028
                max_id = min(ids_set) - 1 if ids_set else None
                if ids_set:
                    q = 'from:{} max_id:{} exclude:retweets filter:media -filter:periscope'.format(
                        username, max_id)
                else:
                    q = 'from:{} exclude:retweets filter:media -filter:periscope'.format(
                        username)
                q = quote(q, '')
                url = 'https://twitter.com/i/search/timeline?f=tweets&vertical=default&q={}&src=typd&include_available_features=1&include_entities=1&reset_error_state=false'.format(
                    q)
                print_('max_id={},  imgs={}'.format(max_id, len(imgs)))
            elif method == 'search2':  # 1028
                max_id = min(ids_set) - 1 if ids_set else None
                if ids_set:
                    q = 'from:{} max_id:{} exclude:retweets filter:media -filter:periscope'.format(
                        username, max_id)
                else:
                    q = 'from:{} exclude:retweets filter:media -filter:periscope'.format(
                        username)
                q = quote(q, '')
                pos_param = '&max_position={}'.format(
                    max_pos) if max_pos is not None else ''
                url = 'https://twitter.com/i/search/timeline?f=tweets&vertical=default&q={}&src=typd&include_available_features=1&include_entities=1{}&reset_error_state=false'.format(
                    q, pos_param)
                print_('max_pos={},  max_id={},  imgs={}'.format(
                    max_pos, max_id, len(imgs)))
            else:
                raise Exception('Invalid method: {}'.format(method))
        elif mode == 'likes':
            pos_param = '&max_position={}'.format(
                max_pos) if max_pos is not None else ''
            url = 'https://twitter.com/{}/likes/timeline?include_available_features=1&include_entities=1{}&reset_error_state=false'.format(
                username, pos_param)
        print(url)

        hdr = {
            "X-Requested-With": "XMLHttpRequest",
            "X-Twitter-Active-User": "******",
        }

        for try_ in range(16):
            if cw and not cw.alive:
                return
            try:
                html = downloader.read_html(
                    url,
                    session=session,
                    referer='https://twitter.com/{}'.format(username),
                    headers=hdr)  #err
            except Exception as e:
                e_msg = print_error(e)[-1]
                print_('retry... ({}) {}\n{}'.format(try_, url, e_msg))
                change_ua(session)
                continue
            try:
                data = json.loads(html)
            except Exception as e:
                change_ua(session)
                soup = Soup(html)
                login = soup.find('div', class_='LoginForm-input')
                if login and method == 'tab':
                    raise Exception('Login required!')
                print_('can not load json: {}'.format(e))
                sleep(1)
                continue
            break
        else:
            print_('over try')
            if not imgs:
                raise Exception('No imgs')
            break

        if 'items_html' in data:
            html = data['items_html']
        else:
            print_('no items_html')
            session.cookies.clear()  # ???
            #break

        soup = Soup(html)
        tweets = soup.findAll('div', class_='tweet') + soup.findAll(
            'span', class_='grid-tweet')

        ids = []
        for tweet in tweets:
            id = int(tweet.attrs['data-tweet-id'])
            if id in ids_set:
                print('duplicate')
                continue
            ids.append(id)
            ids_set.add(id)
            tweet = Tweet(tweet, format, types, session, cw)
            for img in tweet.imgs:
                imgs.append(img)

        if n is not None and len(imgs) >= n:
            break

        if not ids:
            max_fails = 4 if method != 'search2' else 16
            if len(imgs) == 0:
                raise Exception('No Image')
            elif n_fails > max_fails:
                if method == 'tab':  ### search
                    method = 'search'
                    n_fails = 0
                    continue
                elif method == 'search' and not ids and min_position is not None:  ### search2
                    method = 'search2'
                    max_pos = min_position
                    #min_position = None
                    n_fails = 0
                    continue
                else:
                    print('too many empty responses')
                    break
            else:
                print('empty response; retry with new user agent')
                change_ua(session)
                n_fails += 1
        elif n_fails:
            print('reset empty-response count')
            n_fails = 0

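        # Advance the pagination position: prefer the server-provided
        # min_position, else fall back to the smallest tweet id seen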
        max_pos_new = data.get('min_position')  # 1028
        if max_pos_new is None:
            if ids:
                max_pos_new = min(ids)
            else:
                max_pos_new = max_pos  #
        max_pos = max_pos_new

        if data.get('min_position'):
            min_position = data['min_position']
            print('min_position:', min_position)

        try:
            if cw is not None:
                if not cw.alive:
                    break
                cw.setTitle('{}  {} (@{}) - {}'.format(tr_('읽는 중...'), artist,
                                                       username, len(imgs)))
        except Exception as e:
            print(e)
            raise

    return imgs
Example #4
def get_imgs_from_illust(illust,
                         api=None,
                         types={'illust', 'manga', 'ugoira'},
                         format=None,
                         format_name=None,
                         dir='',
                         print_=None,
                         cw=None):
    print_ = get_print(cw)
    if api is None:
        api = pixiv_auth.get_api()
    if types is not None and illust.get('type', 'illust') not in types:
        return []
    imgs = []
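    # Ugoira posts need extra metadata (frame delays and the frames zip URL)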
    if illust.type == 'ugoira':
        sleep(0.2)
        error = None
        for try_ in range(N_TRY):
            print_(('read ugoira... {}').format(illust.id))
            try:
                ugoira_data = api.ugoira_metadata(illust.id, req_auth=True)
                error = ugoira_data.get('error')
                if error:
                    raise PixivError(error)
                break
            except PixivError as e:
                api = e.api
                print_(e)
                e_ = e
                msg = error.get('user_message', '') if error else ''
                if u'公開制限エラー' in msg:  # "publication restricted" error
                    print_('invalid ugoira; ignore')
                    return []
                if u'該当作品の公開レベルにより閲覧できません' in msg:  # "not viewable at this publication level"
                    print_('invalid ugoira (2); ignore')
                    return []
                if try_ < N_TRY - 1:
                    print_('retry...')
                sleep(SLEEP)
        else:
            raise e_

        ugoira_data = ugoira_data.ugoira_metadata
        url = ugoira_data.zip_urls.medium.replace('600x600', '1920x1080')
        img = Img(illust,
                  url,
                  ugoira_data=ugoira_data,
                  format_name=format_name)
        if format is not None:
            filename = os.path.join(dir, img.filename)
            filename = os.path.splitext(filename)[0] + '.' + format
            filename_old = os.path.join(dir, ('{}_ugoira1920x1080.{}').format(
                img.id, format))
            if os.path.isfile(filename_old) and not os.path.isfile(filename):
                print_(
                    (u'rename: {} -> {}').format(os.path.basename(filename_old),
                                                 os.path.basename(filename)))
                os.rename(filename_old, filename)
            if os.path.isfile(filename):
                print_((u'skip ugoira: {}').format(filename))
                img = Img(illust,
                          filename,
                          ugoira_data=ugoira_data,
                          format_name=format_name)
        imgs.append(img)
    elif illust.page_count == 1:
        img = Img(illust,
                  illust.meta_single_page.original_image_url,
                  format_name=format_name)
        imgs.append(img)
    else:
        pages = illust.meta_pages
        for page in pages:
            img = Img(illust,
                      page.image_urls.original,
                      format_name=format_name)
            imgs.append(img)

    return imgs
Example #5
def get_imgs(url,
             title=None,
             customWidget=None,
             d=None,
             types=['img', 'gif', 'video'],
             session=None):
    print_ = get_print(customWidget)
    print_(u'types: {}'.format(', '.join(types)))

    # Range
    max_pid = get_max_range(customWidget, 2000)

    local_ids = {}
    if customWidget is not None:
        dir = customWidget.downloader.dir
        try:
            names = os.listdir(dir)
        except Exception as e:
            print(e)
            names = []
        for name in names:
            id = os.path.splitext(name)[0]
            local_ids[id] = os.path.join(dir, name)

    imgs = []
    page = 1
    url_imgs = set()
    if 'chan.sankakucomplex' in url:
        type = 'chan'
    elif 'idol.sankakucomplex' in url:
        type = 'idol'
    else:
        raise Exception('Not supported subdomain')
    url_old = 'https://{}.sankakucomplex.com'.format(type)
    if customWidget is not None:
        customWidget.exec_queue.put(
            (customWidget, u"customWidget.setTitle(u'{}  {}')".format(
                tr_(u'읽는 중...'), title)))
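    # Page through the thumbnail listing until max_pid images are collected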
    while len(imgs) < max_pid:
        #if page > 25: # Anonymous users can only view 25 pages of results
        #    break
        sleep(1)  #
        #url = setPage(url, page)
        print_(url)
        html = downloader.read_html(url, referer=url_old, session=session)
        if '429 Too many requests'.lower() in html.lower():
            print_('429 Too many requests... wait 120 secs')
            for i in range(120):
                sleep(1)
                if customWidget and not customWidget.alive:
                    return []
            continue
        page += 1
        url_old = url
        soup = Soup(html)
        articles = soup.findAll('span', {'class': 'thumb'})

        if not articles:
            break

        for article in articles:
            # 1183
            tags = article.find('img', class_='preview').attrs['title'].split()
            if 'animated_gif' in tags:
                type_ = 'gif'
            elif 'animated' in tags or 'webm' in tags or 'video' in tags or 'mp4' in tags:  # 1697
                type_ = 'video'
            else:
                type_ = 'img'
            if type_ not in types:
                continue

            url_img = article.a.attrs['href']
            if not url_img.startswith('http'):
                url_img = urljoin('https://{}.sankakucomplex.com'.format(type),
                                  url_img)
            id = re.findall('show/([0-9]+)', url_img)[0]
            if id in local_ids:
                #print('skip', id)
                local = True
            else:
                local = False
            #print(url_img)
            if url_img not in url_imgs:
                url_imgs.add(url_img)
                if local:
                    url_img = local_ids[id]
                img = Image(type,
                            id,
                            url_img,
                            url,
                            local=local,
                            cw=customWidget,
                            d=d)
                imgs.append(img)
                if len(imgs) >= max_pid:
                    break
        if customWidget and not customWidget.alive:
            break

        try:
            # For page > 50
            pagination = soup.find('div', class_='pagination')
            url = urljoin('https://{}.sankakucomplex.com'.format(type),
                          pagination.attrs['next-page-url'])
        except Exception as e:
            print_(print_error(e)[-1])
            #url = setPage(url, page)
            break

        if customWidget is not None:
            customWidget.setTitle(u'{}  {} - {}'.format(
                tr_(u'읽는 중...'), title, len(imgs)))
        else:
            print(len(imgs), 'imgs')

    if not imgs:
        raise Exception('no images')

    return imgs
Example #6
    def read(self):
        type = self.pixiv_type
        cw = self.customWidget
        print_ = cw.print_
        ui_setting = self.ui_setting

        if type == 'following':
            raise NotImplementedError('following')

        self._format = [None, 'gif', 'webp',
                        'png'][ui_setting.ugoira_convert.currentIndex()]
        self._format_name = compatstr(ui_setting.pixivFormat.currentText())
        types = [t.lower() for t in query_url(self.url).get('type', [])]
        if types:
            s = (u', ').join(sorted(types))
            types = set(types)
        else:
            s = 'all'
            types = None
        print_((u'Type: {}').format(s))
        print_((u'info: {}').format(self.info))
        api = self.api
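        # Strip internal prefixes from the downloader id to get the raw query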
        query = self.id.replace('_bmk', '').replace('_illust', '').replace(
            'pixiv_', '').replace('search_', '')
        if type != 'search':
            query = int(query)
        print('pixiv_query:', query)
        try:
            if type in ('user', 'bookmark', 'search'):
                max_pid = get_max_range(cw, 2000)
                if ui_setting.groupBox_tag.isChecked():
                    tags = [
                        compatstr(ui_setting.tagList.item(i).text())
                        for i in range(ui_setting.tagList.count())
                    ]
                else:
                    tags = []
                if type == 'search':
                    query = query.replace('+', ' ')
                    name = query
                else:
                    id = self.id.replace('_bmk', '').replace('pixiv_',
                                                             '').replace(
                                                                 'search_', '')
                    print('name', id)
                    name = get_name(id, self.api, cw=cw)
                    cw.artist = name
                title = u'{} ({})'.format(name, self.id)
                print_(title)
                dir = os.path.join(get_outdir('pixiv'), clean_title(title))
                imgs = get_imgs(query,
                                type=type,
                                api=api,
                                n=max_pid,
                                tags=tags,
                                types=types,
                                format=self._format,
                                format_name=self._format_name,
                                dir=dir,
                                cw=cw,
                                title=title,
                                info=self.info)
            elif type == 'illust':
                for try_ in range(N_TRY):
                    try:
                        detail = api.illust_detail(query, req_auth=True)
                        error = detail.get('error')
                        if error:
                            raise PixivError(error)
                        break
                    except PixivError as e:
                        api = e.api
                        print_(e)
                        e_ = e
                        if try_ < N_TRY - 1:
                            print_('retry...')
                        sleep(SLEEP)
                else:
                    raise e_

                illust = detail.illust
                name = illust.title
                title = (u'{} ({})').format(name, self.id)
                dir = os.path.join(get_outdir('pixiv'), clean_title(title))
                imgs = get_imgs_from_illust(illust,
                                            api=api,
                                            format=self._format,
                                            dir=dir,
                                            cw=cw,
                                            format_name=self._format_name)
        except PixivError as e:
            msg = (u'PixivError: {}').format(e.message)
            return self.Invalid(msg)

        self.imgs = imgs
        for img in imgs:
            self.urls.append(img.url)
            self.filenames[img.url] = img.filename

        self.title = clean_title(title)  # 1390
Example #7
def get_imgs(user_id,
             type='user',
             n=None,
             api=None,
             tags=[],
             types={'illust', 'manga', 'ugoira'},
             format=None,
             format_name=None,
             dir='',
             cw=None,
             title=None,
             info=None):
    print('get_imgs', user_id, type, dir)
    if api is None:
        api = pixiv_auth.get_api()
    print_ = get_print(cw)
    imgs = []
    offset = 0
    bad = 0
    error = None
    tags_ = tags
    tags = set()
    tags_ex = set()
    for tag in tags_:
        tag = tag.strip().replace(' ', '').lower()
        if tag.startswith('-'):
            tags_ex.add(tag[1:].strip())
        else:
            tags.add(tag)

    print_((u'tags: [{}]').format((u', ').join(tags)))
    print_((u'tags_ex: [{}]').format((u', ').join(tags_ex)))
    max_id = None
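    # Page by offset (user/search) or max_bookmark_id (bookmark), keeping only
    # illusts that match the include tags and avoid the exclude tags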
    while True:
        if bad >= N_TRY:
            raise PixivError(error)
        if type == 'user':
            json_result = api.user_illusts(user_id,
                                           type=None,
                                           req_auth=True,
                                           filter=None,
                                           offset=offset)
        elif type == 'search':
            order = info['order']
            sorts = {
                'date_d': 'date_desc',
                'date': 'date_asc',
                'popular_d': 'popular_desc',
                'popular': 'popular_asc',
                'popular_female_d': 'popular_female_desc',
                'popular_female': 'popular_female_asc',
                'popular_male_d': 'popular_male_desc',
                'popular_male': 'popular_male_asc',
            }
            sort = sorts.get(order, 'date_desc')
            params = {
                'word': user_id,
                'search_target': 'partial_match_for_tags',
                'sort': sort,
                'filter': 'for_ios'
            }
            if offset:
                params['offset'] = offset
            if info.get('blt') is not None:
                params['bookmark_num_min'] = info['blt']
            if info.get('bgt') is not None:
                params['bookmark_num_max'] = info['bgt']
            if info.get('scd') is not None:
                params['start_date'] = info['scd']
            if info.get('ecd') is not None:
                params['end_date'] = info['ecd']
            print(params)
            #r = api.no_auth_requests_call('GET', '%s/v1/search/illust' % api.hosts, params=params, req_auth=True)
            #json_result = api.parse_result(r)
            method, url = api.api.search_illust
            r = api.requests_(method, url, params=params, auth=True)
            json_result = api.parse_json(r)
        elif type == 'bookmark':
            print('max_id:', max_id)
            json_result = api.user_bookmarks_illust(user_id,
                                                    filter=None,
                                                    max_bookmark_id=max_id,
                                                    req_auth=True)
        else:
            raise Exception(('type "{}" is not supported').format(type))
        error = json_result.get('error')
        if error:
            print_(error)
            message = error.get('message', '')
            if 'Offset must be no more than' in message:
                break
            print_('retry...')
            sleep(SLEEP)
            bad += 1
            continue
        bad = 0
        illusts = json_result.illusts
        if len(illusts) == 0:
            break
        for p, illust in enumerate(illusts):
            print('illust: {}'.format(illust.id))
            tags_illust = set(tag['name'].strip().replace(' ', '').lower()
                              for tag in illust.tags)
            if not tags or tags & tags_illust:
                if tags_ex.isdisjoint(tags_illust):
                    imgs += get_imgs_from_illust(illust,
                                                 api=api,
                                                 types=types,
                                                 format=format,
                                                 format_name=format_name,
                                                 dir=dir,
                                                 cw=cw)
            if cw is not None and (illust.type == 'ugoira'
                                   or p == len(illusts) - 1):
                cw.setTitle(
                    (u'{} {} ({})').format(tr_(u'읽는 중...'),
                                           title, len(imgs)))
            offset += 1
            if n is not None and len(imgs) >= n:
                break

        if type == 'bookmark':
            if json_result.next_url is None:
                break
            else:
                max_id = api.parse_qs(json_result.next_url)['max_bookmark_id']
        if n is not None and len(imgs) >= n:
            break
        if cw is not None and not cw.alive:
            break

    if not imgs:
        raise Exception('no imgs')
    return imgs[:n]
Example #8
def get_imgs(uid, oid, title, session, cw=None, d=None, parent=None):
    print_ = get_print(cw)
    print_('uid: {}, oid:{}'.format(uid, oid))

    max_pid = get_max_range(cw)

    @try_n(4)
    def get_album_imgs(album, page):
        url = 'https://photo.weibo.com/photos/get_all?uid={}&album_id={}&count=30&page={}&type={}&__rnd={}'.format(
            uid, album.id, page, album.type, int(time() * 1000))
        referer = 'https://photo.weibo.com/{}/talbum/index'.format(uid)
        html = downloader.read_html(url, referer, session=session, timeout=30)
        j = json.loads(html)
        data = j['data']
        imgs = []
        for photo in data['photo_list']:
            host = photo['pic_host']
            name = photo['pic_name']
            id = photo['photo_id']
            timestamp = photo['timestamp']
            date = datetime.fromtimestamp(timestamp)
            t = '{:02}-{:02}-{:02}'.format(date.year % 100, date.month,
                                           date.day)
            url = '{}/large/{}'.format(host, name)
            ext = os.path.splitext(name)[1]
            filename = '[{}] {}{}'.format(t, id, ext)
            img = Image(url, filename, timestamp)
            imgs.append(img)

        return imgs

    @try_n(2)
    def get_albums(page):
        url = 'https://photo.weibo.com/albums/get_all?uid={}&page={}&count=20&__rnd={}'.format(
            uid, page, int(time() * 1000))
        referer = 'https://photo.weibo.com/{}/albums?rd=1'.format(uid)
        html = downloader.read_html(url, referer, session=session)
        if '<title>新浪通行证</title>' in html:
            raise errors.LoginRequired()
        j = json.loads(html)
        data = j['data']
        albums = []
        for album in data['album_list']:
            id = album['album_id']
            type = album['type']
            album = Album(id, type)
            albums.append(album)

        return albums

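    # Collect every album first, then page through each album's photos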
    albums = []
    for p in range(1, 101):
        albums_new = get_albums(p)
        albums += albums_new
        print_('p:{}, albums:{}'.format(p, len(albums)))
        if not albums_new:
            break

    imgs = []
    for album in albums:
        print('Album:', album.id, album.type)
        imgs_album = []
        for p in range(1, 101):
            imgs_new = get_album_imgs(album, p)
            imgs_album += imgs_new
            s = u'{} {}  -  {}'.format(tr_(u'읽는 중...'), title,
                                       len(imgs) + len(imgs_album))
            if cw:
                cw.setTitle(s)
            else:
                print(s)
            if len(imgs_album) >= max_pid:
                break
            if not imgs_new:
                break
            sleep(1)
        imgs += imgs_album

    imgs = sorted(imgs, key=lambda img: img.timestamp, reverse=True)
    return imgs[:max_pid]
Example #9
def process_ids(ids, info, imgs, cw, depth=0, tags_add=None):
    print_ = get_print(cw)
    max_pid = get_max_range(cw)

    class Thread(threading.Thread):
        alive = True
        rem = 0

        def __init__(self, queue):
            super().__init__(daemon=True)
            self.queue = queue

        @classmethod
        @lock
        def add_rem(cls, x):
            cls.rem += x

        def run(self):
            while self.alive:
                try:
                    id_, res, i = self.queue.popleft()
                except IndexError:  # queue drained; wait for more work
                    sleep(.1)
                    continue
                try:
                    info_illust = get_info(
                        'https://www.pixiv.net/en/artworks/{}'.format(id_),
                        cw,
                        depth=depth + 1,
                        tags_add=tags_add)
                    res[i] = info_illust['imgs']
                except Exception as e:
                    if depth == 0 and (e.args and e.args[0] == '不明なエラーが発生しました'
                                       or type(e) == errors.LoginRequired
                                       ):  # logout during extraction
                        res[i] = e
                    print_('process_ids error (id: {}, d:{}):\n{}'.format(
                        id_, depth,
                        print_error(e)[0]))
                finally:
                    Thread.add_rem(-1)

    queue = deque()
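    # Start n worker threads that resolve illust ids from the queue in
    # chunks of step ids at a time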
    n, step = Downloader_pixiv.STEP
    print_('{} / {}'.format(n, step))
    ts = []
    for i in range(n):
        t = Thread(queue)
        t.start()
        ts.append(t)
    for i in range(0, len(ids), step):
        res = [[] for _ in range(step)]
        for j, id_illust in enumerate(ids[i:i + step]):
            queue.append((id_illust, res, j))
            Thread.add_rem(1)
        while Thread.rem:
            sleep(.001, cw)
        for imgs_ in res:
            if isinstance(imgs_, Exception):
                raise imgs_
            imgs += imgs_
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['title'], len(imgs))
        if cw:
            cw.setTitle(s)
        else:
            print(s)
        if len(imgs) >= max_pid:
            break
        if depth == 0:
            check_alive(cw)
    for t in ts:
        t.alive = False
Example #10
    def f(html, browser=None):
        soup = Soup(html)
        if is_captcha(soup):
            print('captcha')
            browser.show()
            sd['shown'] = True
        elif sd['shown'] and not SHOW:
            browser.hide()
            sd['shown'] = False
        try:
            st = soup.find('h2', class_='share-title')
            if st is None:
                st = soup.find('h2', class_=lambda c: c and 'ShareTitle' in c)
            info['uid'] = st.text.strip()
            st = soup.find('h1', class_='share-sub-title')
            if st is None:
                st = soup.find('h1',
                               class_=lambda c: c and 'ShareSubTitle' in c)
            info['nickname'] = st.text.strip()
        except Exception as e:
            print_(print_error(e)[0])
        c = 0
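        # Collect new video ids from both the legacy and current DOM layouts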
        ids_now = set()
        items = soup.findAll('div', class_='video-feed-item') + soup.findAll(
            'div', class_=lambda c: c and 'DivItemContainer' in c)
        for div in items:
            a = div.find('a')
            if a is None:
                continue
            href = a['href']
            if not href:
                continue
            m = re.search(PATTERN_VID, href)
            if m is None:
                continue
            id_video = int(m.group('id'))
            ids_now.add(id_video)
            if id_video in ids:
                continue
            ids.add(id_video)
            info['items'].append({'id': id_video})
            c += 1

        print_('items: {}'.format(len(info['items'])))
        if len(info['items']) >= max_pid:
            info['items'] = info['items'][:max_pid]
            return True

        browser.runJavaScript(
            'window.scrollTo(0, document.body.scrollHeight);')
        sleep(15, cw)

        if c or (ids_now and min(ids_now) > min(ids)):
            sd['count_empty'] = 0
        else:
            print_('empty')
            sd['count_empty'] += 1
        msg = '{}  {} (tiktok_{}) - {}'.format(tr_('읽는 중...'),
                                               info.get('nickname'),
                                               info.get('uid'),
                                               len(info['items']))
        if cw:
            if not cw.alive:
                raise Exception('cw dead')
            cw.setTitle(msg)
        else:
            print(msg)
        return sd['count_empty'] > 4
Example #11
def get_imgs(url, title=None, cw=None, d=None, types=['img', 'gif', 'video'], session=None):
    print_ = get_print(cw)
    print_('types: {}'.format(', '.join(types)))
    if 'chan.sankakucomplex' in url:
        type = 'chan'
    elif 'idol.sankakucomplex' in url:
        type = 'idol'
    else:
        raise Exception('Not supported subdomain')

    info = {}
    info['single'] = False

    if '/post/show/' in url:
        info['single'] = True
        id = get_id(url)
        info['imgs'] = [Image(type, id, url, None, cw=cw, d=d)]
        return info
    
    # Range
    max_pid = get_max_range(cw)

    local_ids = {}
    if cw is not None:
        dir = cw.downloader.dir
        try:
            names = os.listdir(dir)
        except Exception as e:
            print(e)
            names = []
        for name in names:
            id = os.path.splitext(name)[0]
            local_ids[id] = os.path.join(dir, name)
        
    imgs = []
    page = 1
    url_imgs = set()
    url_old = 'https://{}.sankakucomplex.com'.format(type)
    if cw is not None:
        cw.setTitle('{}  {}'.format(tr_('읽는 중...'), title))
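    # Page through the thumbnail listing; back off for 120 secs on HTTP 429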
    while len(imgs) < max_pid:
        #if page > 25: # Anonymous users can only view 25 pages of results
        #    break
        wait(cw)
        #url = setPage(url, page)
        print_(url)
        try:
            html = downloader.read_html(url, referer=url_old, session=session)
        except Exception as e: #3366
            print_(print_error(e)[0])
            break
        if '429 Too many requests'.lower() in html.lower():
            print_('429 Too many requests... wait 120 secs')
            sleep(120, cw)
            continue
        page += 1
        url_old = url
        soup = Soup(html)
        articles = soup.findAll('span', {'class': 'thumb'})
        
        if not articles:
            break
            
        for article in articles:
            # 1183
            tags = article.find('img', class_='preview').attrs['title'].split()
            if 'animated_gif' in tags:
                type_ = 'gif'
            elif 'animated' in tags or 'webm' in tags or 'video' in tags or 'mp4' in tags: # 1697
                type_ = 'video'
            else:
                type_ = 'img'
            if type_ not in types:
                continue
            
            url_img = article.a.attrs['href']
            if not url_img.startswith('http'):
                url_img = urljoin('https://{}.sankakucomplex.com'.format(type), url_img)
            id = get_id(url_img)
            #print_(article)
            if id is None: # sankaku plus
                continue
            if id in local_ids:
                #print('skip', id)
                local = True
            else:
                local = False
            #print(url_img)
            if url_img not in url_imgs:
                url_imgs.add(url_img)
                if local:
                    url_img = local_ids[id]
                img = Image(type, id, url_img, url, local=local, cw=cw, d=d)
                imgs.append(img)
                if len(imgs) >= max_pid:
                    break

        try:
            # For page > 50
            pagination = soup.find('div', class_='pagination')
            url = urljoin('https://{}.sankakucomplex.com'.format(type), pagination.attrs['next-page-url'])
            #3366
            p = int(re.find(r'[?&]page=([0-9]+)', url, default=1))
            if p > 100:
                url = setPage(url, 100)
        except Exception as e:
            print_(print_error(e)[-1])
            #url = setPage(url, page)
            break
        
        if cw is not None:
            cw.setTitle('{}  {} - {}'.format(tr_('읽는 중...'), title, len(imgs)))
        else:
            print(len(imgs), 'imgs')

    if not imgs:
        raise Exception('no images')

    info['imgs'] = imgs
    
    return info
Example #12
def get_imgs(url, n_max=2000, title=None, cw=None, session=None):
    print_ = get_print(cw)

    for try_ in range(4):
        try:
            html = read_html(url, session, cw)
            m = re.search('"edge_owner_to_timeline_media":{"count":([0-9]+)',
                          html)
            if m is None:
                raise Exception('Invalid page')
            break
        except Exception as e:
            e_ = e
            print_(print_error(e)[0])
    else:
        raise e_
    n = int(m.groups()[0])
    n = min(n, n_max)

    data = get_sd(url, html=html, cw=cw)

    uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
    csrf_token = data['config']['csrf_token']  #
    session.cookies.set(name='ig_pr',
                        value='1',
                        path='/',
                        domain='.instagram.com')

    cursor = ''
    edges = []
    bad = 0
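    # Page through the GraphQL media timeline via end_cursor until n posts
    # have been collected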
    while True:
        check_alive(cw)

        variables = {
            'id': uploader_id,
            'first': 12,
        }
        if cursor:
            variables['after'] = cursor
        #print_(variables)#

        media = None
        try:
            j = get_query('003056d32c2554def87228bc3fd9668a', variables,
                          session, cw)
            media = j['data']['user']['edge_owner_to_timeline_media']
            sleep(2)  #
        except Exception as e:
            if bad > 10:
                raise Exception('no media')
            else:
                print_(u'no media.. retry... ({}) {}'.format(
                    bad + 1,
                    print_error(e)[0]))
                sleep(12 * bad, cw)
                bad += 1
                continue
        bad = 0

        edges_new = media.get('edges')
        if not edges_new or not isinstance(edges_new, list):
            print('no edges_new')
            break

        edges += edges_new

        s = u'{} {}  ({}/{})'.format(tr_(u'읽는 중...'), title, len(edges), n)
        if cw is not None:
            cw.setTitle(s)
            if not cw.alive:
                return []
        else:
            print(s)

        if len(edges) >= n:
            break

        page_info = media.get('page_info')
        if not page_info:
            break
        if not page_info.get('has_next_page'):
            break
        cursor = page_info.get('end_cursor')
        if not cursor:
            break

    if len(edges) <= n / 2:
        raise Exception(u'Too short: {} / {}'.format(len(edges), n))

    imgs = []
    for edge in edges:
        node = edge['node']
        type = node['__typename']
        id = node['shortcode']
        url = u'https://www.instagram.com/p/{}/'.format(id)
        for img in Node(url, session=session, cw=cw, media=node).imgs:
            imgs.append(img)
        if len(imgs) >= n_max:
            break

    return imgs