Example No. 1
def process_ids(ids, info, imgs, cw, depth=0):
    print_ = get_print(cw)
    max_pid = get_max_range(cw)
    for i, id_illust in enumerate(ids):
        try:
            info_illust = get_info(
                'https://www.pixiv.net/en/artworks/{}'.format(id_illust),
                cw,
                depth=depth + 1)
        except Exception as e:
            if depth == 0 and (e.args and e.args[0] == '不明なエラーが発生しました'
                               or type(e) == errors.LoginRequired
                               ):  # logout during extraction
                raise e
            print_('process_ids error ({}):\n{}'.format(
                depth,
                print_error(e)[0]))
            continue
        imgs += info_illust['imgs']
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['title'], len(imgs))
        if cw:
            cw.setTitle(s)
        else:
            print(s)
        if len(imgs) >= max_pid:
            break
        if depth == 0:
            check_alive(cw)
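
Every snippet in this listing leans on a few shared helpers (get_print, get_max_range, check_alive) whose definitions are not included. A minimal sketch of how they might look, assuming cw is a download-widget object exposing print_ and alive and that the range limit is a fixed default; this is an assumption, not the project's actual implementation:

def get_print(cw):
    # Prefer the widget's logger when one is attached; otherwise fall back to print.
    return cw.print_ if cw is not None else print

def get_max_range(cw, default=2000):
    # Hypothetical: the real project presumably reads this limit from user settings.
    return default

def check_alive(cw):
    # Abort extraction as soon as the user stops the task.
    if cw is not None and not cw.alive:
        raise Exception('stopped')
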
def get_videos(url,
               type='video',
               only_mp4=False,
               audio_included=False,
               max_res=None,
               max_abr=None,
               cw=None):
    info = {}

    n = get_max_range(cw)

    if '/channel/' in url or '/user/' in url or '/c/' in url:
        info = read_channel(url, n=n, cw=cw)
        info['type'] = 'channel'
        info['title'] = u'[Channel] {}'.format(info['uploader'])
        if cw:
            info['urls'] = filter_range(info['urls'], cw.range)
    elif '/playlist' in url:
        info = read_playlist(url, n=n, cw=cw)
        info['type'] = 'playlist'
        info['title'] = u'[Playlist] {}'.format(info['title'])
        if cw:
            info['urls'] = filter_range(info['urls'], cw.range)
    else:
        info['type'] = 'single'
        info['urls'] = [url]

    info['videos'] = [
        Video(url, type, only_mp4, audio_included, max_res, max_abr, cw)
        for url in info['urls']
    ]

    return info
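
A hypothetical call, only to show the shape of the returned info dict; the playlist URL is a placeholder, and read_playlist / Video come from the surrounding module:

info = get_videos('https://www.youtube.com/playlist?list=PLxxxxxxxx',
                  only_mp4=True, max_res=1080)
print(info['type'], info['title'])
for video in info['videos']:
    print(video)  # one Video object per entry URL
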
Example No. 3
def get_ids_multi(q, popular, cw=None):
    print_ = get_print(cw)
    max_pid = get_max_range(cw)
    qs = q.split(' ')
    qs_pos = [q for q in qs if not q.startswith('-')]
    qs_neg = [q[1:] for q in qs if q.startswith('-')]
    q = qs_pos[0] if qs_pos else None
    ids = get_ids(q, popular, cw)
    print_('{}: {}'.format(q, len(ids)))

    # Positive
    for q in qs_pos[1:]:
        ids_ = get_ids(q, popular, cw)
        set_ids_ = set(ids_)
        ids_old = ids
        ids = []
        for id in ids_old:
            if id in set_ids_:
                ids.append(id)
        print_('{}: {} ({})'.format(q, len(ids_), len(ids)))

    # Negative
    for q in qs_neg:
        ids_ = get_ids(q, popular, cw)
        set_ids_ = set(ids_)
        ids_old = ids
        ids = []
        for id in ids_old:
            if id not in set_ids_:
                ids.append(id)
        print_('-{}: {} ({})'.format(q, len(ids_), len(ids)))
    return ids[:max_pid]
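
get_ids_multi treats space-separated terms as an intersection and a leading '-' as an exclusion. A hypothetical query (the tags are placeholders):

# ids tagged both "landscape" and "night", minus anything tagged "sketch"
ids = get_ids_multi('landscape night -sketch', popular=False)
print(len(ids))
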
def get_imgs(id, api, cw=None, title=None, type='board'):
    n = get_max_range(cw)
    imgs = []
    ids = set()
    print('get_imgs: type={}'.format(type))
    if type == 'board':
        gen = api.board_pins(id)
    elif type == 'section':
        gen = api.board_section_pins(id)
    elif type == 'pin':
        gen = [api.pin(id)]
    else:
        raise Exception((u'Type "{}" is not supported').format(type))
    for img in gen:
        if 'images' not in img:
            print('skip img:', img['id'])
            continue
        img = Image(img)
        if img.id in ids:
            print('duplicate:', img.id)
            continue
        ids.add(img.id)
        print(img.url)
        print(img.filename)
        print()
        imgs.append(img)
        if len(imgs) >= n:
            break
        if cw is not None:
            if not cw.alive:
                return []
            cw.setTitle((u'{} {}  ({})').format(tr_(u'\uc77d\ub294 \uc911...'),
                                                title, len(imgs)))

    return imgs
def get_imgs(url, title=None, cw=None):
    url = clean_url(url)
    if 's=view' in url and 'page=favorites' not in url:
        raise NotImplementedError('Not Implemented')

    tags = get_tags(url)
    tags = quote(tags, safe='/')
    tags = tags.replace('%20', '+')
    url = 'https://gelbooru.com/index.php?page=post&s=list&tags={}'.format(
        tags)

    print_ = get_print(cw)

    # Range
    max_pid = get_max_range(cw, 2000)

    imgs = []
    ids = set()
    count_no_imgs = 0
    for p in range(500):  #1017
        url = setPage(url, len(ids))
        print_(url)
        html = downloader.read_html(url)

        soup = Soup(html)
        posts = soup.findAll('div', class_='thumbnail-preview')
        imgs_new = []
        for post in posts:
            id_ = int(re.find('[0-9]+', post.find('a')['id'], err='no id'))
            if id_ in ids:
                print('duplicate:', id_)
                continue
            ids.add(id_)
            url_img = urljoin(url, post.find('a')['href'])
            img = Image(id_, url_img)
            imgs_new.append(img)
        if imgs_new:
            imgs += imgs_new
            count_no_imgs = 0
        else:
            print('no imgs')
            count_no_imgs += 1
            if count_no_imgs > 1:
                print('break')
                break

        if len(imgs) >= max_pid:
            break

        if cw is not None:
            if not cw.alive:
                break
            cw.setTitle(u'{}  {} - {}'.format(tr_(u'읽는 중...'), title,
                                              len(imgs)))

    if not imgs:
        raise Exception('no imgs')

    return imgs
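
setPage is not defined in this snippet; a plausible sketch, assuming it rewrites the pid query parameter of the Gelbooru list URL (an assumption about the helper, not its actual code):

import re

def setPage(url, pid):
    # Replace an existing pid parameter, or append one when the URL has none.
    if re.search(r'[?&]pid=[0-9]*', url):
        return re.sub(r'pid=[0-9]*', 'pid={}'.format(pid), url)
    return url + '&pid={}'.format(pid)
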
def get_imgs_more(username, session, title, types, n=None, format='[%y-%m-%d] id_ppage', cw=None, mode='media', method='tab', imgs=None):
    print_ = get_print(cw)
    imgs = imgs or []
    print_('imgs: {}, types: {}'.format(len(imgs), ', '.join(types)))

    artist, username = get_artist_username(username, session)#
    
    # Range
    n = max(n or 0, get_max_range(cw))

    ids_set = set(img.id for img in imgs)

    count_no_imgs = 0

    filter_ = '' if options.get('experimental') else ' filter:media' #2687

    while len(imgs) < n:
        if ids_set:
            max_id = min(ids_set) - 1
            q = 'from:{} max_id:{} exclude:retweets{} -filter:periscope'.format(username, max_id, filter_)
        else:
            q = 'from:{} exclude:retweets{} -filter:periscope'.format(username, filter_)
        print(q)

        tweets = []
        for tweet in list(TwitterAPI(session, cw).search(q)):
            id = int(tweet['id'])
            if id in ids_set:
                print_('duplicate: {}'.format(id))
                continue
            ids_set.add(id)
            tweets.append(tweet)
            
        if tweets:
            count_no_imgs = 0
        else:
            count_no_imgs += 1
            change_ua(session)
            if count_no_imgs >= 3:
                break
            print_('retry...')
            continue
        
        for tweet in tweets:
            imgs += get_imgs_from_tweet(tweet, session, types, format, cw)

        msg = '{}  {} (@{}) - {}'.format(tr_('읽는 중...'), artist, username, len(imgs))
        if cw and not cw.alive:
            break
        if cw:
            cw.setTitle(msg)
        else:
            print(msg)

    return imgs
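
change_ua(session) is called whenever a search comes back empty; a minimal sketch under the assumption that it simply rotates the requests session's User-Agent header (the strings below are placeholders):

import random

_USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (X11; Linux x86_64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
]  # placeholder values

def change_ua(session):
    # Make subsequent requests look like a different client.
    session.headers['User-Agent'] = random.choice(_USER_AGENTS)
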
Example No. 7
def get_imgs_all(url, title=None, cw=None, d=None, session=None, stories=True):
    max_pid = get_max_range(cw)
    url = clean_url(url)
    if stories:
        imgs_str = get_stories(url, title, cw=cw, session=session)
    else:
        imgs_str = []
    max_pid = max(0, max_pid - len(imgs_str))
    imgs = get_imgs(url, max_pid, title=title, cw=cw, session=session)

    return imgs_str + imgs[:max_pid]
def get_imgs(url, title=None, cw=None):
    print_ = get_print(cw)
    url = clean_url(url)

    id = get_id(url)
    url = u'https://nijie.info/members_illust.php?id={}'.format(id)

    # Range
    max_pid = get_max_range(cw)

    imgs = []
    url_imgs = set()
    for p in range(1, 1 + 100):
        url = setPage(url, p)
        print_(url)
        html = downloader.read_html(url)

        soup = Soup(html)
        posts = soup.findAll('div', class_='nijie')
        if not posts:
            print('no posts')
            break
        c = 0
        for post in posts:
            url_img = urljoin(url, post.a.attrs['href'])
            if url_img in url_imgs:
                print('duplicate:', url_img)
                continue
            url_imgs.add(url_img)
            id = int(re.find('[?&]id=([0-9]+)', url_img))
            multi = post.find('div', class_='thumbnail-icon')
            if multi:
                imgs_ = get_imgs_post(id, url_img)  #
            else:
                imgs_ = [Image(id, url_img, 0)]

            imgs += imgs_
            c += 1

            if len(imgs) >= max_pid:
                break

            msg = u'{}  {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))
            if cw:
                if not cw.alive:
                    return
                cw.setTitle(msg)
            else:
                print(msg)

        if len(imgs) >= max_pid or c == 0:
            break
    return imgs
Example No. 9
def get_imgs_channel(url, html=None, cw=None):
    print_ = get_print(cw)
    if html is None:
        html = downloader.read_html(url)
    info = get_info(url, html)

    # Range
    max_pid = get_max_range(cw)

    ids = set()
    imgs = []
    for p in range(1000):
        url_api = 'https://bcy.net/apiv3/user/selfPosts?uid={}'.format(
            info['uid'])
        if imgs:
            url_api += '&since={}'.format(imgs[-1].id)
        data_raw = downloader.read_html(url_api, url)
        data = json.loads(data_raw)['data']
        items = data['items']
        if not items:
            print('no items')
            break
        c = 0
        for item in items:
            check_alive(cw)
            id = item['item_detail']['item_id']
            if id in ids:
                print('duplicate')
                continue
            c += 1
            ids.add(id)
            url_single = u'https://bcy.net/item/detail/{}'.format(id)
            imgs_single = get_imgs(url_single, cw=cw)
            print_(str(id))
            for p, img in enumerate(imgs_single):
                img = Image(img._url, url_single, id, p)
                imgs.append(img)
            s = u'{} {} - {}'.format(tr_(u'읽는 중...'), info['artist'],
                                     min(len(imgs), max_pid))
            if cw:
                cw.setTitle(s)
            else:
                print(s)

            if len(imgs) >= max_pid:
                break
        if not c:
            print('not c')
            break
        if len(imgs) >= max_pid:
            print('over max_pid:', max_pid)
            break
    return imgs[:max_pid]
def read_channel(url_page, cw=None):
    print_ = get_print(cw)
    res = re.find(CHANNEL_PATTERN, url_page)
    if res is None:
        raise Exception('Not channel')
    header, username = res
    print(header, username)
    max_pid = get_max_range(cw)
    info = {}
    info['header'] = header
    info['username'] = username
    session = Session()
    urls = []
    ids = set()
    for p in range(100):
        url_api = urljoin(url_page,
                          '/{}/{}/videos/best/{}'.format(header, username, p))
        print_(url_api)
        r = session.post(url_api)
        data = json.loads(r.text)

        videos = data.get('videos')  #4530
        if not videos:
            print_('empty')
            break

        for video in videos:
            id_ = video['id']
            if id_ in ids:
                print_('duplicate: {}'.format(id_))
                continue
            ids.add(id_)
            info['name'] = video['pn']
            urls.append(urljoin(url_page, video['u']))

        if len(urls) >= max_pid:
            break

        n = data['nb_videos']

        s = '{} {} - {}'.format(tr_('읽는 중...'), info['name'], len(urls))
        if cw:
            cw.setTitle(s)
        else:
            print(s)
        if len(ids) >= n:
            break
        sleep(1, cw)
    if not urls:
        raise Exception('no videos')
    info['urls'] = urls[:max_pid]
    return info
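
sleep(1, cw) above reads like an interruptible sleep; a sketch assuming it polls the widget's alive flag while waiting (an assumption):

import time

def sleep(t, cw=None):
    # Wait up to t seconds, returning early if the task was stopped.
    end = time.time() + t
    while time.time() < end:
        if cw is not None and not cw.alive:
            break
        time.sleep(0.1)
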
Example No. 11
File: test.py Project: ciring/EOJ
def get_imgs(url, title=None, customWidget=None):
    url = clean_url(url)
    if 's=view' in url and 'page=favorites' not in url:
        raise NotImplementedError('Not Implemented')

    if 'page=dapi' not in url.lower():
        tags = get_tags(url).replace(' ', '+')
        print(tags)
        tags = urllib.quote(tags, safe='/+')
        url = "https://gelbooru.com/index.php?page=dapi&s=post&q=index&tags={}&pid={}&limit={}".format(
            tags, 0, LIMIT)

    if customWidget is not None:
        print_ = customWidget.print_
    else:

        def print_(*values):
            sys.stdout.writelines(values + ('\n', ))

    # Range
    max_pid = get_max_range(customWidget, 2000)

    imgs = []
    url_imgs = set()
    for p in range(500):  #1017
        url = setPage(url, p)
        print_(url)
        html = downloader.read_html(url)

        soup = Soup(html)
        posts = soup.findAll('post')
        if not posts:
            break
        for post in posts:
            url_img = post.attrs['file_url']
            if url_img in url_imgs:
                print('already exists', url_img)
            else:
                url_imgs.add(url_img)
                id = post.attrs['id']
                img = Image(id, url_img)
                imgs.append(img)
        if len(imgs) >= max_pid:
            break

        if customWidget is not None:
            if not customWidget.alive:
                break
            customWidget.setTitle(u'{}  {} - {}'.format(
                tr_(u'읽는 중...'), title, len(imgs)))
    return imgs
Example No. 12
def read_channel(url_page, cw=None):
    print_ = get_print(cw)
    res = re.find(CHANNEL_PATTERN, url_page)
    if res is None:
        raise Exception('Not channel')
    header, username = res
    print(header, username)
    max_pid = get_max_range(cw, 2000)
    info = {}
    info['header'] = header
    info['username'] = username
    session = Session()
    urls = []
    urls_set = set()
    for p in range(100):
        url_api = urljoin(url_page, '/{}/{}/videos/best/{}'.format(header, username, p))
        print(url_api)
        r = session.post(url_api, data='main_cats=false')
        soup = Soup(r.text)
        thumbs = soup.findAll('div', class_='thumb-block')
        if not thumbs:
            print_('empty')
            break
        for thumb in thumbs:
            info['name'] = thumb.find('span', class_='name').text.strip()
            href = thumb.find('a')['href']
            href = urljoin(url_page, href)
            if href in urls_set:
                print_('duplicate: {}'.format(href))
                continue
            urls_set.add(href)
            urls.append(href)
        
        if len(urls) >= max_pid:
            break
        
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['name'], len(urls))
        if cw:
            if not cw.alive:
                return
            cw.setTitle(s)
        else:
            print(s)
    if not urls:
        raise Exception('no videos')
    info['urls'] = urls[:max_pid]
    return info
        
Example No. 13
def get_imgs(url, title=None, cw=None):
    url = clean_url(url)
    if 's=view' in url and 'page=favorites' not in url:
        raise NotImplementedError('Not Implemented')

    if 'page=dapi' not in url.lower():
        tags = get_tags(url)
        tags = quote(tags, safe='/')
        tags = tags.replace('%20', '+')
        url = "https://rule34.xxx/index.php?page=dapi&s=post&q=index&tags={}&pid={}&limit={}".format(
            tags, 0, LIMIT)

    print_ = get_print(cw)

    # Range
    max_pid = get_max_range(cw)

    imgs = []
    ids = set()
    for p in range(500):  #1017
        url = setPage(url, p)
        print_(url)
        html = try_n(4, sleep=30)(downloader.read_html)(url)  #3340

        soup = Soup(html)
        posts = soup.findAll('post')
        if not posts:
            break
        for post in posts:
            id_ = post.attrs['id']
            if id_ in ids:
                print('duplicate:', id_)
                continue
            ids.add(id_)
            url_img = post.attrs['file_url']
            img = Image(id_, url_img)
            imgs.append(img)
        if len(imgs) >= max_pid:
            break

        if cw is not None:
            if not cw.alive:
                break
            cw.setTitle(u'{}  {} - {}'.format(tr_(u'읽는 중...'), title,
                                              len(imgs)))
    return imgs
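
try_n(4, sleep=30) above wraps downloader.read_html in retries. A sketch of such a retry decorator, assuming the sleep argument is the pause between attempts; the helper's real behavior may differ:

import functools
import time

def try_n(n, sleep=0):
    # Build a decorator that retries the wrapped call up to n times.
    def deco(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for i in range(n):
                try:
                    return func(*args, **kwargs)
                except Exception:
                    if i == n - 1:
                        raise
                    time.sleep(sleep)
        return wrapper
    return deco
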
def get_imgs(url, soup=None, cw=None):
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)
    title = get_title(soup)

    n = get_max_range(cw)

    imgs = []
    p = 1
    while True:
        imgs_new = get_imgs_p(url, p)
        if not imgs_new:
            break
        imgs += imgs_new
        update(cw, title, imgs)
        p += 1
        if len(imgs) >= n:
            break
    return imgs[:n]
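
The second get_imgs above reports progress through an update helper that is not shown; it presumably mirrors the inline pattern used by the other examples (tr_ and the widget interface are assumed to come from the surrounding module):

def update(cw, title, imgs):
    # Show progress on the widget title, or on stdout when running headless.
    s = '{}  {} - {}'.format(tr_('읽는 중...'), title, len(imgs))
    if cw:
        cw.setTitle(s)
    else:
        print(s)
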
Example No. 15
def get_videos(url, cw=None):
    print_ = get_print(cw)
    print_(f'get_videos: {url}')
    info = {}
    options = {
        'extract_flat': True,
        'playlistend': get_max_range(cw),
    }
    videos = []
    ydl = ytdl.YoutubeDL(options, cw=cw)
    info = ydl.extract_info(url)
    for e in info['entries']:
        video = Video(e['url'], cw)
        video.id = int(e['id'])
        videos.append(video)
        if 'name' not in info:
            info['name'] = ydl.extract_info(e['url'])['creator']
    if not videos:
        raise Exception('no videos')
    info['videos'] = sorted(videos, key=lambda video: video.id, reverse=True)
    return info
def get_imgs(username, session, cw=None):
    print_ = get_print(cw)
    artist = get_name(username, session)
    imgs = []
    error_count = 0
    max_pid = get_max_range(cw)
    api = TumblrAPI(session, cw)
    for post in api.posts(username):
        imgs += post.imgs

        s = '{}  {} (tumblr_{}) - {}'.format(tr_(u'\uc77d\ub294 \uc911...'), artist, username, len(imgs))
        if cw:
            if not cw.alive:
                return
            cw.setTitle(s)
        else:
            print(s)
            
        if len(imgs) > max_pid:
            break

    return imgs[:max_pid]
def get_imgs(url, title=None, range_=None, cw=None):
    if 'donmai.us/artists' in url:
        raise NotImplementedError('Not Implemented')
    if 'donmai.us/posts/' in url:
        raise NotImplementedError('Not Implemented')

    print_ = get_print(cw)

    # Range
    max_pid = get_max_range(cw)

    if range_ is None:
        range_ = range(1, 101)
    print(range_)
    imgs = []
    i = 0
    empty_count = 0
    empty_count_global = 0
    url_imgs = set()
    while i < len(range_):
        p = range_[i]
        url = setPage(url, p)
        print_(url)
        html = downloader.read_html(url)
        soup = Soup(html)
        articles = soup.findAll('article')
        if articles:
            empty_count_global = 0
        else:
            empty_count += 1
            if empty_count < 4:
                s = u'empty page; retry... {}'.format(p)
                print_(s)
                continue
            else:
                empty_count = 0
                empty_count_global += 1

        if empty_count_global >= 6:
            break

        for article in articles:
            id = article.attrs['data-id']
            url_img = article.attrs['data-file-url'].strip()
            if url_img.startswith('http://') or url_img.startswith('https://'):
                pass
            else:
                url_img = 'https://{}donmai.us'.format(
                    'danbooru.' if 'danbooru.' in url else '') + url_img
            #print(url_img)
            if url_img not in url_imgs:
                url_imgs.add(url_img)
                img = Image(id, url_img)
                imgs.append(img)

        if len(imgs) >= max_pid:
            break

        if cw is not None:
            if not cw.alive:
                break
            cw.setTitle(u'{}  {} - {}'.format(tr_(u'읽는 중...'), title,
                                              len(imgs)))
        i += 1
    return imgs
Example No. 18
def get_imgs(url,
             title=None,
             customWidget=None,
             d=None,
             types=['img', 'gif', 'video'],
             session=None):
    if False:  #
        raise NotImplementedError('Not Implemented')
    print_ = get_print(customWidget)
    print_(u'types: {}'.format(', '.join(types)))

    # Range
    max_pid = get_max_range(customWidget, 2000)

    local_ids = {}
    if customWidget is not None:
        dir = customWidget.downloader.dir
        try:
            names = os.listdir(dir)
        except Exception as e:
            print(e)
            names = []
        for name in names:
            id = os.path.splitext(name)[0]
            local_ids[id] = os.path.join(dir, name)

    imgs = []
    page = 1
    url_imgs = set()
    if 'chan.sankakucomplex' in url:
        type = 'chan'
    elif 'idol.sankakucomplex' in url:
        type = 'idol'
    else:
        raise Exception('Not supported subdomain')
    url_old = 'https://{}.sankakucomplex.com'.format(type)
    if customWidget is not None:
        customWidget.exec_queue.put(
            (customWidget, u"customWidget.setTitle(u'{}  {}')".format(
                tr_(u'읽는 중...'), title)))
    while len(imgs) < max_pid:
        #if page > 25: # Anonymous users can only view 25 pages of results
        #    break
        sleep(1)  #
        #url = setPage(url, page)
        print_(url)
        html = downloader.read_html(url, referer=url_old, session=session)
        if '429 Too many requests'.lower() in html.lower():
            print_('429 Too many requests... wait 120 secs')
            for i in range(120):
                sleep(1)
                if customWidget and not customWidget.alive:
                    return []
            continue
        page += 1
        url_old = url
        soup = Soup(html)
        articles = soup.findAll('span', {'class': 'thumb'})

        if not articles:
            break

        for article in articles:
            # 1183
            tags = article.find('img', class_='preview').attrs['title'].split()
            if 'animated_gif' in tags:
                type_ = 'gif'
            elif 'animated' in tags or 'webm' in tags or 'video' in tags or 'mp4' in tags:  # 1697
                type_ = 'video'
            else:
                type_ = 'img'
            if type_ not in types:
                continue

            url_img = article.a.attrs['href']
            if not url_img.startswith('http'):
                url_img = urljoin('https://{}.sankakucomplex.com'.format(type),
                                  url_img)
            id = re.find('show/([0-9]+)', url_img)
            print_(article)
            if id is None:  # sankaku plus
                continue
            if id in local_ids:
                #print('skip', id)
                local = True
            else:
                local = False
            #print(url_img)
            if url_img not in url_imgs:
                url_imgs.add(url_img)
                if local:
                    url_img = local_ids[id]
                img = Image(type,
                            id,
                            url_img,
                            url,
                            local=local,
                            cw=customWidget,
                            d=d)
                imgs.append(img)
                if len(imgs) >= max_pid:
                    break
        if customWidget and not customWidget.alive:
            break

        try:
            # For page > 50
            pagination = soup.find('div', class_='pagination')
            url = urljoin('https://{}.sankakucomplex.com'.format(type),
                          pagination.attrs['next-page-url'])
        except Exception as e:
            print_(print_error(e)[-1])
            #url = setPage(url, page)
            break

        if customWidget is not None:
            customWidget.setTitle(u'{}  {} - {}'.format(
                tr_(u'읽는 중...'), title, len(imgs)))
        else:
            print(len(imgs), 'imgs')

    if not imgs:
        raise Exception('no images')

    return imgs
Example No. 19
def get_imgs(url, info=None, cw=None):
    print('get_imgs', url)
    if info is None:
        info = get_info(url)
    imgs = []

    # Range
    max_pid = get_max_range(cw)

    if info['type'] == 'a':
        if 'album_images' in info: # legacy
            imgs_ = info['album_images']['images']
        elif 'media' in info: # new
            imgs_ = info['media']
        else: # legacy
            imgs_ = [info]
        
        for img in imgs_:
            img_url = img.get('url') # new
            if not img_url: # legacy
                hash = img['hash']
                ext = img['ext']
                img_url = 'https://i.imgur.com/{}{}'.format(hash, ext)
            if img_url in imgs:
                continue
            imgs.append(img_url)
            
    elif info['type'] == 'r':
        urls = set()
        for p in range(100):
            url_api = 'https://imgur.com/r/{}/new/page/{}/hit?scrolled'.format(info['title'], p)
            print(url_api)
            html = downloader.read_html(url_api, referer=url)
            soup = Soup(html)
            
            c = 0
            for post in soup.findAll('div', class_='post'):
                a = post.find('a', class_='image-list-link')
                url_post = urljoin(url, a.attrs['href'])
                if url_post in urls:
                    continue
                urls.add(url_post)
                c += 1

                try: # for r18 images
                    imgs += get_imgs(url_post)
                except Exception as e:
                    print(e)

                s = (u'{} {}  ({})').format(tr_(u'\uc77d\ub294 \uc911...'), info['title'], len(imgs))
                if cw is not None:
                    if cw.alive:
                        cw.setTitle(s)
                    else:
                        return []
                else:
                    print(s)
                
            if c == 0:
                print('same; break')
                break

    return imgs
Example No. 20
def get_imgs(uid, oid, title, session, cw=None, d=None, parent=None):
    print_ = get_print(cw)
    print_('uid: {}, oid:{}'.format(uid, oid))

    max_pid = get_max_range(cw)

    @try_n(4)
    def get_album_imgs(album, page):
        url = 'https://photo.weibo.com/photos/get_all?uid={}&album_id={}&count=30&page={}&type={}&__rnd={}'.format(
            uid, album.id, page, album.type, int(time() * 1000))
        referer = 'https://photo.weibo.com/{}/talbum/index'.format(uid)
        html = downloader.read_html(url, referer, session=session, timeout=30)
        j = json.loads(html)
        data = j['data']
        imgs = []
        for photo in data['photo_list']:
            host = photo['pic_host']
            name = photo['pic_name']
            id = photo['photo_id']
            timestamp = photo['timestamp']
            date = datetime.fromtimestamp(timestamp)
            t = '{:02}-{:02}-{:02}'.format(date.year % 100, date.month,
                                           date.day)
            url = '{}/large/{}'.format(host, name)
            ext = os.path.splitext(name)[1]
            filename = '[{}] {}{}'.format(t, id, ext)
            img = Image(url, filename, timestamp)
            imgs.append(img)

        return imgs

    def get_albums(page):
        url = 'https://photo.weibo.com/albums/get_all?uid={}&page={}&count=20&__rnd={}'.format(
            uid, page, int(time() * 1000))
        referer = 'https://photo.weibo.com/{}/albums?rd=1'.format(uid)
        html = downloader.read_html(url, referer, session=session)
        j = json.loads(html)
        data = j['data']
        albums = []
        for album in data['album_list']:
            id = album['album_id']
            type = album['type']
            album = Album(id, type)
            albums.append(album)

        return albums

    albums = []
    for p in range(1, 101):
        albums_new = get_albums(p)
        albums += albums_new
        print_('p:{}, albums:{}'.format(p, len(albums)))
        if not albums_new:
            break

    imgs = []
    for album in albums:
        print('Album:', album.id, album.type)
        imgs_album = []
        for p in range(1, 101):
            imgs_new = get_album_imgs(album, p)
            imgs_album += imgs_new
            s = u'{} {}  -  {}'.format(tr_(u'읽는 중...'), title, len(imgs))
            if cw:
                cw.setTitle(s)
            else:
                print(s)
            if len(imgs_album) >= max_pid:
                break
            if not imgs_new:
                break
            sleep(1)
        imgs += imgs_album

    imgs = sorted(imgs, key=lambda img: img.timestamp, reverse=True)
    return imgs[:max_pid]
Example No. 21
def get_info(url, cw=None, depth=0):
    print_ = get_print(cw)
    api = PixivAPI()
    info = {}
    imgs = []

    if utils.ui_setting:
        ugoira_ext = [None, '.gif', '.webp',
                      '.png'][utils.ui_setting.ugoira_convert.currentIndex()]
    else:
        ugoira_ext = None
    if utils.ui_setting:
        format_ = compatstr(utils.ui_setting.pixivFormat.currentText())
    else:
        format_ = 'id_ppage'

    max_pid = get_max_range(cw)

    if api.illust_id(url):  # Single post
        id_ = api.illust_id(url)
        data = api.illust(id_)
        login = '******' not in data
        if FORCE_LOGIN and not login:  #
            raise errors.LoginRequired()
        if data['xRestrict'] and not login:
            raise errors.LoginRequired('R-18')
        info['artist'] = data['userName']
        info['artist_id'] = data['userId']
        info['raw_title'] = data['illustTitle']
        info['title'] = '{} (pixiv_illust_{})'.format(info['raw_title'], id_)
        info['create_date'] = parse_time(data['createDate'])
        tags_illust = set(tag['tag'] for tag in data['tags']['tags'])

        if tags_matched(tags_illust, cw):
            if data['illustType'] == 2:  # ugoira
                data = api.ugoira_meta(id_)
                ugoira = {
                    'ext': ugoira_ext,
                    'delay': [frame['delay'] for frame in data['frames']],
                }
                img = Image(data['originalSrc'],
                            url,
                            id_,
                            0,
                            format_,
                            info,
                            cw,
                            ugoira=ugoira)
                imgs.append(img)
            else:
                data = api.pages(id_)
                for img in data:
                    img = Image(img['urls']['original'], url, id_, len(imgs),
                                format_, info, cw)
                    imgs.append(img)
        else:
            print('tags mismatched')
    elif '/bookmarks/' in url or 'bookmark.php' in url:  # User bookmarks
        id_ = api.user_id(url)
        if id_ is None:  #
            id_ = my_id()
        process_user(id_, info, api)
        info['title'] = '{} (pixiv_bmk_{})'.format(info['artist'],
                                                   info['artist_id'])
        ids = []
        ids_set = set()
        offset = 0
        while len(ids) < max_pid:
            data = api.bookmarks(id_, offset)
            c = 0
            for id in [work['id'] for work in data['works']]:
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            offset += LIMIT
            if depth == 0:
                check_alive(cw)
        process_ids(ids[:max_pid], info, imgs, cw, depth)
    elif '/tags/' in url or 'search.php' in url:  # Search
        q = unquote(
            re.find(r'/tags/([^/]+)', url)
            or re.find('[?&]word=([^&]*)', url, err='no tags'))
        info['title'] = '{} (pixiv_search_{})'.format(q, q.replace(' ', '+'))
        qs = query_url(url)
        order = qs.get('order', ['date_d'])[0]
        mode = qs.get('mode', ['all'])[0]
        ids = []
        ids_set = set()
        p = 1
        while len(ids) < max_pid:
            data = api.search(q, order, mode, p=p)
            c = 0
            for id in [
                    illust['id'] for illust in data['illustManga']['data']
                    if 'id' in illust
            ]:
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            p += 1
        process_ids(ids[:max_pid], info, imgs, cw, depth)
    elif 'bookmark_new_illust.php' in url or 'bookmark_new_illust_r18.php' in url:  # Newest works: Following
        r18 = 'bookmark_new_illust_r18.php' in url
        id_ = my_id()
        process_user(id_, info, api)
        info['title'] = '{} (pixiv_following_{}{})'.format(
            info['artist'], 'r18_' if r18 else '', info['artist_id'])
        ids = []
        ids_set = set()
        p = 1
        while len(ids) < max_pid:
            c = 0
            for id in api.following(p, r18=r18):
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            p += 1
        process_ids(ids[:max_pid], info, imgs, cw, depth)
    elif api.user_id(url):  # User illusts
        id_ = api.user_id(url)
        process_user(id_, info, api)
        data = api.profile(id_)
        info['title'] = '{} (pixiv_{})'.format(info['artist'],
                                               info['artist_id'])
        ids = []
        for illusts in [data['illusts'], data['manga']]:
            if not illusts:
                continue
            ids += list(illusts.keys())
        ids = sorted(ids, key=int, reverse=True)
        process_ids(ids[:max_pid], info, imgs, cw, depth)
    else:
        raise NotImplementedError()
    info['imgs'] = imgs[:max_pid]

    return info
def get_imgs(username,
             session,
             title,
             types,
             n=0,
             format='[%y-%m-%d] id_ppage',
             cw=None):
    print_ = get_print(cw)
    ##    try:
    ##        return get_imgs_legacy(username, session, title, types, n, format, cw)
    ##    except Exception as e:
    ##        print_(print_error(e)[-1])

    # Range
    n = max(n, get_max_range(cw))

    # 2303
    ids = set()
    names = dict()
    dir_ = os.path.join(get_outdir('twitter'), title)
    if os.path.isdir(dir_):
        for name in os.listdir(dir_):
            id_ = re.find('([0-9]+)_p', name)
            if id_ is None:
                continue
            if get_ext(name).lower() == '.mp4':
                type_ = 'video'
            else:
                type_ = 'img'
            if type_ not in types:
                continue
            id_ = int(id_)
            ids.add(id_)
            if id_ in names:
                names[id_].append(name)
            else:
                names[id_] = [name]
    max_id = max(ids) if ids else 0

    imgs = []
    enough = False
    for tweet in TwitterAPI(session, cw).timeline_media(username):
        imgs += get_imgs_from_tweet(tweet, session, types, format, cw)
        if n is not None and len(imgs) >= n:
            break

        id_ = int(tweet['id_str'])
        if id_ < max_id:
            print_('enough')
            enough = True
            break

        msg = '{}  {} - {}'.format(tr_('읽는 중...'), title, len(imgs))
        if cw:
            if not cw.alive:
                break
            cw.setTitle(msg)
        else:
            print(msg)

    if not imgs:
        raise Exception('no imgs')

    if not enough and len(imgs) < n:
        imgs = get_imgs_legacy(username,
                               session,
                               title,
                               types,
                               n,
                               format,
                               cw,
                               method='search',
                               imgs=imgs)

    # 2303
    ids_new = set()
    for img in imgs:
        ids_new.add(img.id)
    for id_ in sorted(ids, reverse=True):
        if id_ in ids_new:
            continue
        imgs += sorted(os.path.join(dir_, name) for name in names[id_])

    return imgs
def get_imgs_legacy(username,
                    session,
                    title,
                    types,
                    n=None,
                    format='[%y-%m-%d] id_ppage',
                    cw=None,
                    mode='media',
                    method='tab',
                    imgs=None):
    print_ = get_print(cw)
    print_('types: {}'.format(', '.join(types)))

    artist, username = get_artist_username(username, session)  #

    # Range
    n = max(n or 0, get_max_range(cw))  # n defaults to None here

    max_pos = None
    ids_set = set()
    if imgs:
        for img in imgs:
            ids_set.add(img.id)
    else:
        imgs = []
    fail_count = 0  # consecutive empty responses
    min_position = None
    while len(imgs) < n:
        if mode == 'media':
            if method == 'tab':
                foo = '&max_position={}'.format(
                    max_pos) if max_pos is not None else ''
                url = 'https://twitter.com/i/profiles/show/{}/media_timeline?include_available_features=1&include_entities=1{}&reset_error_state=false'.format(
                    username, foo)
                print_('max_pos={},  imgs={}'.format(max_pos, len(imgs)))
            elif method == 'search':  # 1028
                max_id = min(ids_set) - 1 if ids_set else None
                if ids_set:
                    q = 'from:{} max_id:{} exclude:retweets filter:media -filter:periscope'.format(
                        username, max_id)
                else:
                    q = 'from:{} exclude:retweets filter:media -filter:periscope'.format(
                        username)
                q = quote(q, '')
                url = 'https://twitter.com/i/search/timeline?f=tweets&vertical=default&q={}&src=typd&include_available_features=1&include_entities=1&reset_error_state=false'.format(
                    q)
                print_('max_id={},  imgs={}'.format(max_id, len(imgs)))
            elif method == 'search2':  # 1028
                max_id = min(ids_set) - 1 if ids_set else None
                if ids_set:
                    q = 'from:{} max_id:{} exclude:retweets filter:media -filter:periscope'.format(
                        username, max_id)
                else:
                    q = 'from:{} exclude:retweets filter:media -filter:periscope'.format(
                        username)
                q = quote(q, '')
                foo = '&max_position={}'.format(
                    max_pos) if max_pos is not None else ''
                url = 'https://twitter.com/i/search/timeline?f=tweets&vertical=default&q={}&src=typd&include_available_features=1&include_entities=1{}&reset_error_state=false'.format(
                    q, foo)
                print_('max_pos={},  max_id={},  imgs={}'.format(
                    max_pos, max_id, len(imgs)))
            else:
                raise Exception('Invalid method: {}'.format(method))
        elif mode == 'likes':
            foo = '&max_position={}'.format(
                max_pos) if max_pos is not None else ''
            url = 'https://twitter.com/{}/likes/timeline?include_available_features=1&include_entities=1{}&reset_error_state=false'.format(
                username, foo)
        print(url)

        hdr = {
            "X-Requested-With": "XMLHttpRequest",
            "X-Twitter-Active-User": "******",
        }

        for try_ in range(16):
            if cw and not cw.alive:
                return
            try:
                html = downloader.read_html(
                    url,
                    session=session,
                    referer='https://twitter.com/{}'.format(username),
                    headers=hdr)  #err
            except Exception as e:
                e_msg = print_error(e)[-1]
                print_('retry... ({}) {}\n{}'.format(try_, url, e_msg))
                change_ua(session)
                continue
            try:
                data = json.loads(html)
            except Exception as e:
                change_ua(session)
                soup = Soup(html)
                login = soup.find('div', class_='LoginForm-input')
                if login and method == 'tab':
                    raise Exception('Login required!')
                print_('can not load json: {}'.format(e))
                sleep(1)
                continue
            break
        else:
            print_('over try')
            if not imgs:
                raise Exception('No imgs')
            break

        if 'items_html' in data:
            html = data['items_html']
        else:
            print_('no items_html')
            session.cookies.clear()  # ???
            #break

        soup = Soup(html)
        tweets = soup.findAll('div', class_='tweet') + soup.findAll(
            'span', class_='grid-tweet')

        ids = []
        for tweet in tweets:
            id = int(tweet.attrs['data-tweet-id'])
            if id in ids_set:
                print('duplicate')
                continue
            ids.append(id)
            ids_set.add(id)
            tweet = Tweet(tweet, format, types, session, cw)
            for img in tweet.imgs:
                imgs.append(img)

        if n is not None and len(imgs) >= n:
            break

        if not ids:
            foo = 4 if method != 'search2' else 16
            if len(imgs) == 0:
                raise Exception('No Image')
            elif fail_count > foo:
                if method == 'tab':  ### search
                    method = 'search'
                    fail_count = 0
                    continue
                elif method == 'search' and not ids and min_position is not None:  ### search2
                    method = 'search2'
                    max_pos = min_position
                    #min_position = None
                    fail_count = 0
                    continue
                else:
                    print('too many failures; give up')
                    break
            else:
                print('empty result; retry')
                change_ua(session)
                fail_count += 1
        elif fail_count:
            print('reset fail_count')
            fail_count = 0

        max_pos_new = data.get('min_position')  # 1028
        if max_pos_new is None:
            if ids:
                max_pos_new = min(ids)
            else:
                max_pos_new = max_pos  #
        max_pos = max_pos_new

        if data.get('min_position'):
            min_position = data['min_position']
            print('min_position:', min_position)

        try:
            if cw is not None:
                if not cw.alive:
                    break
                cw.setTitle('{}  {} (@{}) - {}'.format(tr_('읽는 중...'), artist,
                                                       username, len(imgs)))
        except Exception as e:
            print(e)
            raise

    return imgs
Example No. 24
    def read(self):
        type = self.pixiv_type
        cw = self.cw
        print_ = cw.print_
        ui_setting = self.ui_setting

        if type == 'following':
            raise NotImplementedError('following')
        
        self._format = [None, 'gif', 'webp', 'png'][ui_setting.ugoira_convert.currentIndex()]
        self._format_name = compatstr(ui_setting.pixivFormat.currentText())
        types = self.get_types()
        if types:
            s = ', '.join(sorted(types))
        else:
            s = 'all'
            types = None
        print_((u'Type: {}').format(s))
        print_((u'info: {}').format(self.info))
        api = self.api
        query = self.id.replace('_bmk', '').replace('_illust', '').replace('pixiv_', '').replace('search_', '')
        if type != 'search':
            query = int(query)
        print('pixiv_query:', query)
        try:
            if type in ('user', 'bookmark', 'search'):
                max_pid = get_max_range(cw)
                if ui_setting.groupBox_tag.isChecked():
                    tags = [ compatstr(ui_setting.tagList.item(i).text()) for i in range(ui_setting.tagList.count()) ]
                else:
                    tags = []
                if type == 'search':
                    query = query.replace('+', ' ')
                    name = query
                else:
                    id = self.id.replace('_bmk', '').replace('pixiv_', '').replace('search_', '')
                    print('name', id)
                    name = get_name(id, self.api, cw=cw)
                    self.artist = name
                title = u'{} ({})'.format(name, self.id)
                print_(title)
                dir = os.path.join(get_outdir('pixiv'), clean_title(title))
                imgs = get_imgs(query, type=type, api=api, n=max_pid, tags=tags, types=types, format=self._format, format_name=self._format_name, dir=dir, cw=cw, title=title, info=self.info)
            elif type == 'illust':
                for try_ in range(N_TRY):
                    try:
                        detail = api.illust_detail(query, req_auth=True)
                        error = detail.get('error')
                        if error:
                            raise PixivError(error)
                        break
                    except PixivError as e:
                        api = e.api
                        print_(e)
                        if try_ < N_TRY - 1:
                            print_('retry...')
                        sleep(SLEEP, cw)
                else:
                    raise

                illust = detail.illust
                name = illust.title
                title = (u'{} ({})').format(name, self.id)
                dir = os.path.join(get_outdir('pixiv'), clean_title(title))
                imgs = get_imgs_from_illust(illust, api=api, format=self._format, dir=dir, cw=cw, format_name=self._format_name)
        except PixivError as e:
            msg = (u'PixivError: {}').format(e.message)
            return self.Invalid(msg)

        self.imgs = []
        for img in imgs:
            d = {'type': img.type, 'url': img.url()}
            if img.type == 'ugoira':
                d['filename'] = img.filename
                d['frames'] = img.ugoira_data.frames
            self.imgs.append(d)
        for img in imgs:
            self.urls.append(img.url)

        self.title = clean_title(title) # 1390
Example No. 25
def process_ids(ids, info, imgs, cw, depth=0, tags_add=None):
    print_ = get_print(cw)
    max_pid = get_max_range(cw)

    class Thread(threading.Thread):
        alive = True
        rem = 0

        def __init__(self, queue):
            super().__init__(daemon=True)
            self.queue = queue

        @classmethod
        @lock
        def add_rem(cls, x):
            cls.rem += x

        def run(self):
            while self.alive:
                try:
                    id_, res, i = self.queue.popleft()
                except Exception as e:
                    sleep(.1)
                    continue
                try:
                    info_illust = get_info(
                        'https://www.pixiv.net/en/artworks/{}'.format(id_),
                        cw,
                        depth=depth + 1,
                        tags_add=tags_add)
                    res[i] = info_illust['imgs']
                except Exception as e:
                    if depth == 0 and (e.args and e.args[0] == '不明なエラーが発生しました'
                                       or type(e) == errors.LoginRequired
                                       ):  # logout during extraction
                        res[i] = e
                    print_('process_ids error (id: {}, d:{}):\n{}'.format(
                        id_, depth,
                        print_error(e)[0]))
                finally:
                    Thread.add_rem(-1)

    queue = deque()
    n, step = Downloader_pixiv.STEP
    print_('{} / {}'.format(n, step))
    ts = []
    for i in range(n):
        t = Thread(queue)
        t.start()
        ts.append(t)
    for i in range(0, len(ids), step):
        res = [[]] * step
        for j, id_illust in enumerate(ids[i:i + step]):
            queue.append((id_illust, res, j))
            Thread.add_rem(1)
        while Thread.rem:
            sleep(.001, cw)
        for imgs_ in res:
            if isinstance(imgs_, Exception):
                raise imgs_
            imgs += imgs_
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['title'], len(imgs))
        if cw:
            cw.setTitle(s)
        else:
            print(s)
        if len(imgs) >= max_pid:
            break
        if depth == 0:
            check_alive(cw)
    for t in ts:
        t.alive = False
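
The @lock decorator on Thread.add_rem is not defined in this snippet; a minimal sketch, assuming it serializes callers with a module-level threading.Lock:

import functools
import threading

_lock = threading.Lock()

def lock(func):
    # Run the wrapped function while holding the shared lock.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        with _lock:
            return func(*args, **kwargs)
    return wrapper
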
Example No. 26
def get_info(url, cw=None, depth=0, tags_add=None):
    print_ = get_print(cw)
    api = PixivAPI()
    info = {}
    imgs = []

    ugoira_ext = [None, '.gif', '.webp', '.png'
                  ][utils.ui_setting.ugoira_convert.currentIndex(
                  )] if utils.ui_setting else None
    format_ = compatstr(utils.ui_setting.pixivFormat.currentText()
                        ) if utils.ui_setting else 'id_ppage'

    max_pid = get_max_range(cw)

    if api.illust_id(url):  # Single post
        id_ = api.illust_id(url)
        data = api.illust(id_)
        login = '******' not in data
        if FORCE_LOGIN and not login:  #
            raise errors.LoginRequired()
        if data['xRestrict'] and not login:
            raise errors.LoginRequired('R-18')
        info['artist'] = data['userName']
        info['artist_id'] = data['userId']
        info['raw_title'] = data['illustTitle']
        info['title'] = '{} (pixiv_illust_{})'.format(info['raw_title'], id_)
        info['create_date'] = parse_time(data['createDate'])
        tags_illust = set(tag['tag'] for tag in data['tags']['tags'])

        if tags_matched(tags_illust, tags_add, cw):
            if data['illustType'] == 2:  # ugoira
                data = api.ugoira_meta(id_)
                ugoira = {
                    'ext': ugoira_ext,
                    'delay': [frame['delay'] for frame in data['frames']],
                }
                img = Image(data['originalSrc'],
                            url,
                            id_,
                            0,
                            format_,
                            info,
                            cw,
                            ugoira=ugoira)
                imgs.append(img)
            else:
                data = api.pages(id_)
                for img in data:
                    img = Image(img['urls']['original'], url, id_, len(imgs),
                                format_, info, cw)
                    imgs.append(img)
        else:
            print('tags mismatched')
    elif '/bookmarks/' in url or 'bookmark.php' in url:  # User bookmarks
        id_ = api.user_id(url)
        if id_ is None:  #
            id_ = my_id()
        if id_ == my_id():
            rests = ['show', 'hide']
        else:
            rests = ['show']
        process_user(id_, info, api)
        info['title'] = '{} (pixiv_bmk_{})'.format(info['artist'],
                                                   info['artist_id'])
        ids = []
        ids_set = set()
        for rest in rests:
            offset = 0
            while len(ids) < max_pid:
                data = api.bookmarks(id_, offset, rest=rest)
                c = 0
                for id in [work['id'] for work in data['works']]:
                    if id in ids_set:
                        continue
                    ids_set.add(id)
                    ids.append(id)
                    c += 1
                if not c:
                    break
                offset += LIMIT
                if depth == 0:
                    check_alive(cw)
        process_ids(ids, info, imgs, cw, depth)
    elif '/tags/' in url or 'search.php' in url:  # Search
        q = unquote(
            re.find(r'/tags/([^/]+)', url)
            or re.find('[?&]word=([^&]*)', url, err='no tags'))
        info['title'] = '{} (pixiv_search_{})'.format(q, q.replace(' ', '+'))
        qs = query_url(url)
        order = qs.get('order', ['date_d'])[0]
        mode = qs.get('mode', ['all'])[0]
        s_mode = qs.get('s_mode', ['s_tag_full'])[0]
        scd = qs.get('scd', [None])[0]
        ecd = qs.get('ecd', [None])[0]
        type_ = qs.get('type', ['all'])[0]
        wlt = qs.get('wlt', [None])[0]
        wgt = qs.get('wgt', [None])[0]
        hlt = qs.get('hlt', [None])[0]
        hgt = qs.get('hgt', [None])[0]
        blt = qs.get('blt', [None])[0]
        bgt = qs.get('bgt', [None])[0]
        ratio = qs.get('ratio', [None])[0]
        tool = qs.get('tool', [None])[0]
        logs = [
            'order: {}'.format(order),
            'mode: {}'.format(mode),
            's_mode: {}'.format(s_mode),
            'scd / ecd: {} / {}'.format(scd, ecd),
            'type: {}'.format(type_),
            'wlt /  wgt: {} / {}'.format(wlt, wgt),
            'hlt / hgt: {} / {}'.format(hlt, hgt),
            'blt / bgt: {} / {}'.format(blt, bgt),
            'ratio: {}'.format(ratio),
            'tool: {}'.format(tool),
        ]
        print_('\n'.join(logs))
        ids = []
        ids_set = set()
        p = 1
        while len(ids) < max_pid:
            data = api.search(q,
                              order,
                              mode,
                              p=p,
                              s_mode=s_mode,
                              scd=scd,
                              ecd=ecd,
                              type_=type_,
                              wlt=wlt,
                              wgt=wgt,
                              hlt=hlt,
                              hgt=hgt,
                              blt=blt,
                              bgt=bgt,
                              ratio=ratio,
                              tool=tool)
            c = 0
            for id in [
                    illust['id'] for illust in data['illustManga']['data']
                    if 'id' in illust
            ]:
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            p += 1
        process_ids(ids, info, imgs, cw, depth)
    elif 'bookmark_new_illust.php' in url or 'bookmark_new_illust_r18.php' in url:  # Newest works: Following
        r18 = 'bookmark_new_illust_r18.php' in url
        id_ = my_id()
        process_user(id_, info, api)
        info['title'] = '{} (pixiv_following_{}{})'.format(
            info['artist'], 'r18_' if r18 else '', info['artist_id'])
        ids = []
        ids_set = set()
        p = 1
        while len(ids) < max_pid:
            data = api.following(p, r18=r18)
            c = 0
            for id in data['page']['ids']:
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            p += 1
        process_ids(ids, info, imgs, cw, depth)
    elif api.user_id(url):  # User illusts
        m = re.search(r'/users/[0-9]+/([\w]+)/?([^\?#/]*)', url)
        type_ = {
            'illustrations': 'illusts',
            'manga': 'manga'
        }.get(m and m.groups()[0])
        if type_:
            types = [type_]
        else:
            types = ['illusts', 'manga']
        if m:
            tag = unquote(m.groups()[1]) or None
        else:
            tag = None
        print_('types: {}, tag: {}'.format(types, tag))

        id_ = api.user_id(url)
        process_user(id_, info, api)
        data = api.profile(id_)
        info['title'] = '{} (pixiv_{})'.format(info['artist'],
                                               info['artist_id'])

        ids = []
        for type_ in types:
            illusts = data[type_]
            if not illusts:
                continue
            ids += list(illusts.keys())
        ids = sorted(ids, key=int, reverse=True)
        if not ids:
            raise Exception('no imgs')
        process_ids(ids,
                    info,
                    imgs,
                    cw,
                    depth,
                    tags_add=[tag] if tag else None)
    else:
        raise NotImplementedError()
    info['imgs'] = imgs[:max_pid]

    return info
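
A quick aside on the pattern shared by the search and following branches above: request pages in a loop, keep only IDs not seen before, and stop once a page contributes nothing new or max_pid is reached. Below is a minimal, self-contained sketch of that idiom; fetch_page and the pages dict are hypothetical stand-ins for api.search / api.following.

def collect_ids(fetch_page, max_pid):
    # Paginate until a page yields no new IDs or we have enough.
    ids = []
    ids_set = set()
    p = 1
    while len(ids) < max_pid:
        new_on_page = 0
        for id_ in fetch_page(p):
            if id_ in ids_set:
                continue  # duplicate across pages
            ids_set.add(id_)
            ids.append(id_)
            new_on_page += 1
        if not new_on_page:
            break  # page added nothing new: end of results
        p += 1
    return ids[:max_pid]

# Hypothetical usage with a fake paged source:
pages = {1: [10, 11, 12], 2: [12, 13], 3: [13]}
print(collect_ids(lambda p: pages.get(p, []), 10))  # [10, 11, 12, 13]
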
Exemplo n.º 27
0
def get_imgs_more(username,
                  session,
                  title,
                  types,
                  n=None,
                  format='[%y-%m-%d] id_ppage',
                  cw=None,
                  mode='media',
                  method='tab',
                  imgs=None):
    print_ = get_print(cw)
    imgs = imgs or []
    print_('imgs: {}, types: {}'.format(len(imgs), ', '.join(types)))

    artist, username = get_artist_username(username, session, cw)  #

    # Range
    n = max(n or 0, get_max_range(cw))

    ids_set = set(img.id for img in imgs)

    count_no_tweets = 0
    count_no_imgs = 0

    while check_alive(cw) or len(imgs) < n:
        if options.get('experimental') or count_no_tweets:  #2687, #3392
            filter_ = ''
        else:
            filter_ = ' filter:media'
        cache_guest_token = bool(count_no_tweets)

        if ids_set:
            max_id = min(ids_set) - 1
            q = 'from:{} max_id:{} exclude:retweets{} -filter:periscope'.format(
                username, max_id, filter_)
        else:
            q = 'from:{} exclude:retweets{} -filter:periscope'.format(
                username, filter_)
        print(q)

        tweets = []
        for tweet in list(
                TwitterAPI(session, cw, cache_guest_token).search(q)):
            id = int(tweet['id'])
            if id in ids_set:
                print_('duplicate: {}'.format(id))
                continue
            ids_set.add(id)
            tweets.append(tweet)

        if tweets:
            exists_more_imgs = False
            for tweet in tweets:
                imgs_tweet = get_imgs_from_tweet(tweet, session, types, format,
                                                 cw)
                if imgs_tweet:
                    imgs += imgs_tweet
                    exists_more_imgs = True
            if exists_more_imgs:
                count_no_imgs = 0
            else:
                count_no_imgs += 1
                if count_no_imgs >= RETRY_MORE_IMGS:  #4130
                    break
            count_no_tweets = 0
        else:
            count_no_tweets += 1
            change_ua(session)
            if count_no_tweets >= RETRY_MORE:
                break
            print_('retry... {}'.format(count_no_tweets))
            continue

        msg = '{}  {} (@{}) - {}'.format(tr_('읽는 중...'), artist, username,
                                         len(imgs))
        if cw:
            cw.setTitle(msg)
        else:
            print(msg)

    return imgs
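
get_imgs_more pages backwards through the timeline by feeding min(seen IDs) - 1 into the max_id: search operator, so each round only returns tweets older than anything already collected. Here is a rough, self-contained sketch of that cursor pattern; search_before is a hypothetical stand-in for TwitterAPI.search, and the tiny integer IDs are only for illustration.

def walk_backwards(search_before, want):
    # Cursor-style paging: each round asks only for items older than
    # the oldest ID already held, like the max_id query above.
    ids_seen = set()
    items = []
    while len(items) < want:
        max_id = min(ids_seen) - 1 if ids_seen else None
        batch = search_before(max_id)
        if not batch:
            break  # nothing older is left
        for item_id in batch:
            if item_id in ids_seen:
                continue
            ids_seen.add(item_id)
            items.append(item_id)
    return items[:want]

# Hypothetical usage: a "timeline" of IDs 9..1, served 3 at a time.
data = list(range(9, 0, -1))
def search_before(max_id):
    older = [i for i in data if max_id is None or i <= max_id]
    return older[:3]
print(walk_backwards(search_before, 7))  # [9, 8, 7, 6, 5, 4, 3]
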
Exemplo n.º 28
0
def get_videos(url, cw=None):
    '''
    get_videos
    '''
    print_ = get_print(cw)

    if '/users/' in url:
        mode = 'users'
        username = url.split('/users/')[1].split('/')[0]
    elif '/pornstar/' in url:
        mode = 'pornstar'
        username = url.split('/pornstar/')[1].split('/')[0]
    elif '/model/' in url:
        mode = 'model'
        username = url.split('/model/')[1].split('/')[0]
    elif '/channels/' in url:
        mode = 'channels'
        username = url.split('/channels/')[1].split('/')[0]
    elif '/playlist/' in url:
        mode = 'playlist'
        username = url.split('/playlist/')[1].split('/')[0]
    else:
        raise Exception('Not supported url')
    username = username.split('?')[0].split('#')[0]

    session = Session()

    domain = utils.domain(url)

    if mode in ['pornstar']:
        url_main = 'https://{}/{}/{}'.format(domain, mode, username)
        html = downloader.read_html(url_main, session=session)
        soup = Soup(html)
        soup = fix_soup(soup, url_main, session, cw)
        for a in soup.findAll('a'):
            if '/{}/{}/videos/upload'.format(mode, username) in a.attrs.get('href', ''):
                free = True
                break
        else:
            free = False
        print_('free: {}'.format(free))

    # Range
    max_pid = get_max_range(cw, 500)
    max_pid = min(max_pid, 2000)#

    html = downloader.read_html(url, session=session)
    soup = fix_soup(Soup(html), url, session, cw)

    info = {}

    # get title
    h1 = soup.find('h1')
    if h1:
        header = 'Playlist'
        title = h1.find(id='watchPlaylist')
    else:
        title = None
    if not title:
        header = 'Channel'
        profile = soup.find('div', class_='profileUserName')
        wrapper = soup.find('div', class_='titleWrapper')
        bio = soup.find('div', class_='withBio')
        title = soup.find('h1', {'itemprop':'name'})
        if not title and profile:
            title = profile.a
        if not title and wrapper:
            title = wrapper.h1
        if not title and bio:
            title = bio.h1
    if not title:
        raise Exception('No title')
    #print(title)
    info['title'] = '[{}] {}'.format(header, title.text.strip())
    token = re.find('''token *= *['"](.*?)['"]''', html)
    print_('token: {}'.format(token))

    # get links
    hrefs = []
    fail = 0
    for p in range(1, 1+100):
        try:
            if mode in ['users', 'model']:
                if mode == 'users':
                    url_api = 'https://{}/users/{}/videos/public/'\
                              'ajax?o=mr&page={}'.format(domain, username, p)
                elif mode == 'model':
                    url_api = 'https://{}/model/{}/videos/upload/'\
                              'ajax?o=mr&page={}'.format(domain, username, p)
                r = session.post(url_api)
                soup = Soup(r.text)
                if soup.find('h1'):
                    print('break: h1')
                    break
            elif mode in ['pornstar']:
                if free:
                    url_api = 'https://{}/{}/{}/videos/upload'\
                              '?page={}'.format(domain, mode, username, p)
                    soup = downloader.read_soup(url_api, session=session)
                    soup = fix_soup(soup, url_api, session, cw)
                    soup = soup.find('div', class_='videoUList')
                else:
                    url_api = 'https://{}/{}/{}?page={}'.format(domain, mode, username, p)
                    soup = downloader.read_soup(url_api, session=session)
                    soup = fix_soup(soup, url_api, session, cw)
                    soup = soup.find('ul', class_='pornstarsVideos')
            elif mode in ['channels']:
                url_api = 'https://{}/{}/{}/videos?page={}'.format(domain, mode, username, p)
                soup = downloader.read_soup(url_api, session=session)
                soup = fix_soup(soup, url_api, session, cw)
                try:
                    soup = soup.find('div', {'id': 'channelsBody'}).find('div', class_='rightSide')
                except:
                    break
            elif mode in ['playlist']:
                #url_api = 'https://{}/playlist/viewChunked?id={}&offset={}&itemsPerPage=40'.format(domain, username, len(hrefs))
                if token is None:
                    raise Exception('no token')
                url_api = 'https://{}/playlist/viewChunked?id={}&token={}&page={}'.format(domain, username, token, p)
                soup = downloader.read_soup(url_api, session=session)
            else:
                raise NotImplementedError(mode)
            fail = 0
        except Exception as e:
            print_(e)
            fail += 1
            if fail < 2:
                continue
            else:
                break
        finally:
            print_('{}  ({})'.format(url_api, len(hrefs)))

        if cw and not cw.alive:
            return

        lis = soup.findAll('li', class_='videoblock')
        if not lis:
            print_('break: no lis')
            break

        if getattr(soup.find('title'), 'text', '').strip() == 'Page Not Found':
            print_('Page Not Found')
            break

        c = 0
        for li in lis:
            a = li.find('a')
            href = a.attrs['href']
            href = urljoin(url, href)
            if href in hrefs:
                continue
            c += 1
            if href.startswith('javascript:'): # Remove Pornhub Premium
                print(href)
                continue
            hrefs.append(href)
        if c == 0:
            print('c==0')
            break
        print(c) # 1320

        if len(hrefs) >= max_pid:
            break

    if cw:
        hrefs = filter_range(hrefs, cw.range)

    info['hrefs'] = hrefs

    return info
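
The page loop above tolerates a single transient failure per page and only bails out after two failures in a row, resetting the counter after every successful request. A small, self-contained sketch of the same retry/bail-out shape follows; fetch is a hypothetical stand-in for the downloader/session calls.

def collect_pages(fetch, max_pages=100, max_consecutive_fails=2):
    # Reset the fail counter on success, skip the page on a first
    # failure, and stop once failures happen back to back.
    results = []
    fail = 0
    for p in range(1, max_pages + 1):
        try:
            page = fetch(p)
            fail = 0
        except Exception as e:
            print(e)
            fail += 1
            if fail < max_consecutive_fails:
                continue  # one-off error: move on to the next page
            break  # repeated failures: give up
        if not page:
            break  # empty page: end of the listing
        results.extend(page)
    return results

# Hypothetical usage: page 2 errors once, page 4 is empty.
def fetch(p):
    if p == 2:
        raise RuntimeError('transient error on page 2')
    return ['item-{}-{}'.format(p, i) for i in range(2)] if p < 4 else []
print(collect_pages(fetch))  # items from pages 1 and 3
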
Exemplo n.º 29
0
def get_imgs(user_id, session, cw=None):
    print_ = get_print(cw)
    url = 'https://{}.bdsmlr.com/'.format(user_id)
    info = {'c': 0, 'posts': [], 'ids': set()}

    html = downloader.read_html(url, session=session)
    soup = Soup(html)

    sorry = soup.find('div', class_='sorry')
    if sorry:
        raise Exception(sorry.text.strip())

    username = soup.find('title').text.strip()  ###
    print('username:', username)
    info['username'] = username

    token = soup.find('meta', {'name': 'csrf-token'}).attrs['content']
    print_(u'token: {}'.format(token))
    foo(url, soup, info)

    max_pid = get_max_range(cw)  #, 2000)

    n = len(info['ids'])
    for p in range(1000):
        url_api = 'https://{}.bdsmlr.com/infinitepb2/{}'.format(
            user_id, user_id)
        data = {
            'scroll': str(info['c']),
            'timenow': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'last': str(info['last']),
        }
        print_(u'n:{}, scroll:{}, last:{}'.format(len(info['posts']),
                                                  data['scroll'],
                                                  data['last']))
        headers = {
            'Referer': url,
            'X-CSRF-TOKEN': token,
        }
        for try_ in range(4):
            try:
                r = session.post(url_api, data=data, headers=headers)
                if p == 0:
                    r.raise_for_status()
                break
            except Exception as e:
                print(e)
        else:
            raise
        soup = Soup(r.text)
        foo(url, soup, info)
        if len(info['ids']) == n:
            print('same; break')
            break
        n = len(info['ids'])

        s = u'{}  {} (tumblr_{}) - {}'.format(tr_(u'읽는 중...'), username,
                                              user_id, len(info['posts']))
        if cw is not None:
            if not cw.valid or not cw.alive:
                return []
            cw.setTitle(s)
        else:
            print(s)

        if len(info['posts']) > max_pid:
            break

    return info
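
The inner for try_ in range(4) block above is the usual for/else retry idiom: break on the first successful POST, and raise only when every attempt has failed. A tiny, self-contained sketch of that idiom; flaky_post is a hypothetical stand-in for session.post, and an explicit RuntimeError replaces the bare raise.

def post_with_retries(flaky_post, attempts=4):
    # for/else retry: `break` on success skips the `else` clause,
    # so the raise below only runs when every attempt failed.
    for try_ in range(attempts):
        try:
            r = flaky_post()
            break
        except Exception as e:
            print('attempt {} failed: {}'.format(try_ + 1, e))
    else:
        raise RuntimeError('all {} attempts failed'.format(attempts))
    return r

# Hypothetical usage: fails twice, then succeeds on the third attempt.
calls = {'n': 0}
def flaky_post():
    calls['n'] += 1
    if calls['n'] < 3:
        raise ConnectionError('temporary failure')
    return 'ok'
print(post_with_retries(flaky_post))  # ok
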
Exemplo n.º 30
0
def get_imgs(username,
             session,
             title,
             types,
             n=0,
             format='[%y-%m-%d] id_ppage',
             cw=None):
    print_ = get_print(cw)

    # Range
    n = max(n, get_max_range(cw))

    # 2303
    ids = set()
    names = dict()
    dir_ = os.path.join(get_outdir('twitter'), title)
    if os.path.isdir(dir_) and cw:
        for name in cw.names_old:
            name = os.path.basename(name)
            id_ = re.find('([0-9]+)_p', name)
            if id_ is None:
                continue
            if get_ext(name).lower() == '.mp4':
                type_ = 'video'
            else:
                type_ = 'img'
            if type_ not in types:
                continue
            id_ = int(id_)
            ids.add(id_)
            if id_ in names:
                names[id_].append(name)
            else:
                names[id_] = [name]
    ids_sure = sorted(ids)[:-100]
    max_id = max(ids_sure) if ids_sure else 0  #3201

    # 2303
    imgs_old = []
    for id_ in sorted(ids, reverse=True):
        for p, file in enumerate(
                sorted(os.path.join(dir_, name) for name in names[id_])):
            img = Image(file, '', id_, 0, p, format, cw, False)
            img.url = LazyUrl_twitter(None, lambda _: file, img)
            img.filename = os.path.basename(file)
            imgs_old.append(img)

    imgs_new = []
    enough = False
    c_old = 0
    counter = SkipCounter(1)
    msg = None
    for tweet in check_alive_iter(
            cw,
            TwitterAPI(session, cw).timeline_media(username)):
        id_ = int(tweet['id_str'])
        if id_ < max_id:
            print_('enough')
            enough = True
            break

        if id_ in ids:
            print_('duplicate: {}'.format(id_))
            c_old += 1
            continue
        ids.add(id_)

        imgs_new += get_imgs_from_tweet(tweet, session, types, format, cw)

        if len(imgs_new) + c_old >= n:  #3201
            break

        if counter.next():
            msg = '{}  {} - {}'.format(tr_('읽는 중...'), title, len(imgs_new))
            if cw:
                cw.setTitle(msg)
            else:
                print(msg)
    if msg:
        if cw:
            cw.setTitle(msg)
        else:
            print(msg)

    if not enough and not imgs_new and c_old == 0:
        raise Exception('no imgs')

    imgs = sorted(imgs_old + imgs_new, key=lambda img: img.id, reverse=True)

    if len(imgs) < n:
        imgs = get_imgs_more(username,
                             session,
                             title,
                             types,
                             n,
                             format,
                             cw,
                             imgs=imgs)

    return imgs[:n]
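
The last step above merges images rebuilt from files already on disk (imgs_old) with freshly fetched ones (imgs_new), keyed by ID and ordered newest-first. A minimal, self-contained sketch of that merge, assuming plain (id, name) records instead of the project's Image objects.

from collections import namedtuple

Item = namedtuple('Item', 'id name')

def merge_newest_first(cached, fetched, limit):
    # One record per ID; the cached copy wins, mirroring the example
    # above where already-seen IDs are skipped during fetching.
    by_id = {}
    for item in fetched + cached:  # cached last so it overwrites fetched
        by_id[item.id] = item
    merged = sorted(by_id.values(), key=lambda i: i.id, reverse=True)
    return merged[:limit]

# Hypothetical usage:
cached = [Item(3, 'old_3.jpg'), Item(1, 'old_1.jpg')]
fetched = [Item(4, 'new_4.jpg'), Item(3, 'new_3.jpg')]
print(merge_newest_first(cached, fetched, 10))
# [Item(id=4, name='new_4.jpg'), Item(id=3, name='old_3.jpg'), Item(id=1, name='old_1.jpg')]
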