def get_info(url, cw=None):
    print_ = get_print(cw)
    info = {}
    html = downloader.read_html(url)
    if '"cafe_cautionpage"' in html:
        raise errors.LoginRequired()

    url_article = re.find(r'''//cafe\.naver\.com/ArticleRead\.nhn\?articleid=[0-9]+&clubid=[0-9]+''', html, err='no iframe')
    url_article = urljoin(url, url_article)
    print_(url_article)

    articleid = re.find(r'articleid=([0-9]+)', url_article)
    clubid = re.find(r'clubid=([0-9]+)', url_article)
    url_api = f'https://apis.naver.com/cafe-web/cafe-articleapi/v2/cafes/{clubid}/articles/{articleid}?query=&useCafeId=true&requestFrom=A'
    j = downloader.read_json(url_api, url)

    info['title'] = j['result']['article']['subject']
    info['cafename'] = j['result']['cafe']['url']
    info['cafeid'] = clubid
    info['id'] = articleid

    html_content = j['result']['article']['contentHtml']
    soup = Soup(html_content)

    imgs = []

    pairs = []
    for video in soup.findAll('span', class_='_naverVideo'):
        vid = video.attrs['vid']
        key = video.attrs['key']
        pairs.append((vid, key))

    for script in soup.findAll('script', class_='__se_module_data'):
        data_raw = script['data-module']
        data = json.loads(data_raw)['data']
        vid = data.get('vid')
        if not vid:
            continue
        key = data['inkey']
        pairs.append((vid, key))

    for vid, key in pairs:
        url_api = 'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{}?key={}'.format(vid, key)
        data_raw = downloader.read_html(url_api)
        data = json.loads(data_raw)
        fs = data['videos']['list']
        fs = sorted(fs, key=lambda f: f['size'], reverse=True)
        video = Image(fs[0]['source'], url_article, len(imgs))
        imgs.append(video)

    for img in soup.findAll('img'):
        img = Image(urljoin(url_article, img['src']), url, len(imgs))
        imgs.append(img)

    info['imgs'] = imgs

    return info

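# NOTE (sketch, not part of the original modules): the extractors in this file call a
# project-specific `re.find(pattern, s, err=...)` helper instead of the standard library
# directly, e.g. re.find(r'articleid=([0-9]+)', url_article) above. The assumed behavior
# is roughly the following; the name `_re_find` is hypothetical.
import re as _stdlib_re

def _re_find(pattern, s, err=None, flags=0):
    '''Return the first capture group (or the whole match) of pattern in s.
    Return None on no match, unless `err` is given, in which case an Exception
    with that message is raised (cf. err='no iframe' above).'''
    m = _stdlib_re.search(pattern, s, flags)
    if m is None:
        if err is not None:
            raise Exception(err)
        return None
    return m.group(1) if m.groups() else m.group(0)
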
def get_img(url):
    #print('get_img', url)
    html = read_html(url)
    soup = Soup(html)
    for div in soup.findAll('div'):
        href = div.attrs.get('data-full-size-href')
        if href:
            img = href
            break
    else:
        img = None

    if img is None: # 1869
        for code in soup.findAll('code'):
            code = code.string
            hidden = Soup(code)
            soup.append(hidden)
        for a in soup.findAll('a'):
            target = a.attrs.get('target')
            if target == '_blank':
                img = a.attrs['href']
                break
        else:
            raise Exception('No img')

    return img

def get_sd(url, session=None, html=None, cw=None, wait=True):
    print_ = get_print(cw)

    if html:
        soup = Soup(html)
        check_error(soup, cw, wait)
        for script in soup.findAll('script'):
            j = get_j(script)
            if j:
                break
        else:
            raise Exception('no _sharedData!!')
    else:
        for try_ in range(4):
            _wait(cw)
            html = read_html(url, session, cw)
            soup = Soup(html)
            check_error(soup, cw, wait)
            for script in soup.findAll('script'):
                j = get_j(script)
                if j:
                    break
            else:
                continue
            break
        else:
            raise Exception('no _sharedData')

    for script in soup.findAll('script'):
        s = script.string
        if s and 'window.__additionalDataLoaded(' in s:
            s = cut_pair(s)
            j_add = json.loads(s)
            try:
                j['entry_data']['PostPage'][0].update(j_add)
            except:
                j['entry_data']['ProfilePage'][0].update(j_add) #2900

    # Challenge
    challenge = j['entry_data'].get('Challenge')
    if challenge:
        try:
            for cont in challenge[0]['extraData']['content']:
                title = cont.get('title')
                if title:
                    break
            else:
                raise Exception('no title')
        except:
            title = 'Err'
        raise errors.LoginRequired(title)

    # LoginAndSignupPage
    login = j['entry_data'].get('LoginAndSignupPage')
    if login:
        raise errors.LoginRequired()

    return j

def get_pages(url, cw=None):
    print_ = get_print(cw)
    url = get_main(url).replace('comic.naver.', 'm.comic.naver.')
    id = get_id(url)
    print('id:', id)
    print(url)
    html = downloader.read_html(url)
    soup = Soup(html)

    try:
        info = soup.find('div', class_='area_info')
        artist = info.find('span', class_='author').text.strip()
    except Exception as e:
        print(e)
        try:
            title = ('\n').join(soup.find('div', class_='title').text.strip().split('\n')[:-1]).strip()
        except:
            title = 'artist not found'
        raise Exception(title)
    print('artist:', artist)

    title = soup.find('meta', {'property': 'og:title'}).attrs['content']

    pages = []
    nos = set()
    for p in range(1, 100):
        if p == 1:
            url_page = url
        else:
            url_page = set_page(url, p)
        html = downloader.read_html(url_page)
        print('read page:', url_page)
        soup = Soup(html)

        view = soup.findAll('ul', class_='section_episode_list')[-1]
        for lst in view.findAll('li'):
            url_page = urljoin(url, lst.find('a').attrs['href'])
            if 'detail.nhn' not in url_page.lower():
                continue
            print_('url_page: {}'.format(url_page))
            text = lst.find('strong', class_='title').find('span', class_='name').text.strip()
            no = int(re.findall('[?&]no=([0-9]+)', url_page)[0])
            if no in nos:
                print('duplicate no: {}'.format(no))
                continue
            nos.add(no)
            text = '{:04} - {}'.format(no, text)
            page = Page(url_page, text, p)
            pages.append(page)

        btn_next = soup.find('a', class_='btn_next')
        if btn_next is None or btn_next.attrs['href'] == '#':
            print('end of page')
            break

    info = Info(id, title, artist)

    return (info, pages)

def get_info(url, soup=None):
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)

    info = {}
    info['title'] = soup.find('h1', id='workTitle').text.strip()
    info['artist'] = soup.find('span', id='workAuthor-activityName').text.strip()

    desc = soup.find('section', id='description')
    button = desc.find('span', class_='ui-truncateTextButton-expandButton')
    if button:
        print('decompose button')
        button.decompose()
    catch = desc.find('span', id='catchphrase-body').text.strip()
    intro = desc.find('p', id='introduction').text.strip()
    desc = u' {}\n\n\n{}'.format(catch, intro)
    info['description'] = desc

    pages = []
    for a in soup.findAll('a', class_='widget-toc-episode-episodeTitle'):
        href = urljoin(url, a.attrs['href'])
        subtitle = a.find('span', class_='widget-toc-episode-titleLabel').text.strip()
        date = a.find('time', class_='widget-toc-episode-datePublished').text.strip()
        page = Page(href, subtitle, date, len(pages) + 1)
        pages.append(page)
    info['pages'] = pages

    return info

def fix_url(url, session=None, cw=None):
    print_ = get_print(cw)
    if '&manga_name=' not in url:
        return url
    print_('fix url')
    qs = query_url(url)
    name = qs['manga_name'][0].replace('+', ' ')
    url_search = urljoin(url, '/bbs/search.php')
    url_search = update_url_query(url_search, {'stx': [name]})
    print(url_search)
    html = read_html(url_search, session=session)
    soup = Soup(html)
    posts = soup.findAll('div', class_='post-row')
    print_('posts: {}'.format(len(posts)))
    if len(posts) != 1:
        return url
    for a in posts[0].findAll('a'):
        href = urljoin(url, a.attrs['href'])
        if 'manga_detail' in href:
            break
    else:
        raise Exception('Failed to find link')
    if cw is not None:
        cw.gal_num = href
    return href

def read_channel(url, cw=None):
    print_ = get_print(cw)
    username = url.split('/users/')[1].split('/')[0]

    info = {}

    html = downloader.read_html(url)
    soup = Soup(html)
    title = soup.find('div', class_='user-name').text.strip()
    info['title'] = u'[Channel] {}'.format(title)

    items = []
    for p in range(1, 21):
        url = 'https://xhamster.com/users/{}/videos/{}'.format(username, p)
        print_(url)
        html = downloader.read_html(url)
        soup = Soup(html)
        items_ = soup.findAll('div', class_='thumb-list__item')
        if not items_:
            print('no items')
            break
        for item in items_:
            items.append(item)

    urls = []
    for item in items:
        url = item.a.attrs['href']
        if url in urls:
            print('duplicate:', url)
            continue
        urls.append(url)

    info['urls'] = urls

    return info

def f(html, browser=None):
    # closure: relies on sd, info, ids, max_pid, cw, print_ and PATTERN_VID
    # from the enclosing scope
    soup = Soup(html)
    if is_captcha(soup):
        print('captcha')
        browser.show()
        sd['shown'] = True
    elif sd['shown']:
        browser.hide()
        sd['shown'] = False
    try:
        info['uid'] = soup.find('h2', class_='share-title').text.strip()
        info['nickname'] = soup.find('h1', class_='share-sub-title').text.strip()
    except Exception as e:
        print_(print_error(e)[0])
    c = 0
    ids_now = set()
    for div in soup.findAll('div', class_='video-feed-item'):
        a = div.find('a')
        if a is None:
            continue
        href = a['href']
        if not href:
            continue
        m = re.search(PATTERN_VID, href)
        if m is None:
            continue
        id_video = int(m.group('id'))
        ids_now.add(id_video)
        if id_video in ids:
            continue
        ids.add(id_video)
        info['items'].append({'id': id_video})
        c += 1
    print_('items: {}'.format(len(info['items'])))
    if len(info['items']) >= max_pid:
        info['items'] = info['items'][:max_pid]
        return True
    browser.runJavaScript('window.scrollTo(0, document.body.scrollHeight);')
    sleep(15, cw)
    if c or (ids_now and min(ids_now) > min(ids)):
        sd['count_empty'] = 0
    else:
        print_('empty')
        sd['count_empty'] += 1
    msg = '{} {} (tiktok_{}) - {}'.format(tr_('읽는 중...'), info.get('nickname'), info.get('uid'), len(info['items']))
    if cw:
        if not cw.alive:
            raise Exception('cw dead')
        cw.setTitle(msg)
    else:
        print(msg)
    return sd['count_empty'] > 4

def get_imgs_page(page, referer, session, cw):
    #sleep(2)
    #html = downloader.read_html(page.url, referer, session=session)
    #soup = Soup(html)

    # 2183
    res = clf2.solve(page.url, session=session)
    soup = Soup(res['html'])

    views = soup.findAll('div', class_='view-content')

    imgs = []
    for view in views:
        if view is None:
            continue
        for img in view.findAll('img'):
            img = img.attrs.get('data-original') or img.attrs.get('content')
            if not img:
                continue
            img = urljoin(page.url, img)
            if '/img/cang' in img:
                continue
            if '/img/blank.gif' in img:
                continue
            img = Image(img, page, len(imgs))
            imgs.append(img)

    if not imgs:
        raise Exception('no imgs')

    return imgs

def get_imgs(url, soup=None, name=None):
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)

    view = soup.findAll('div', class_='rootContant')[:2][-1]

    v = view.find('video')
    if v:
        img = v.find('source').attrs['src']
        img = urljoin(url, img)
        img = Image(img, url, 0, 'video')
        ext = os.path.splitext(img.url().split('?')[0])[1]
        img.filename = u'{}{}'.format(name, ext)
        return [img]

    imgs = []
    for img in view.findAll('img'):
        img = img.attrs['dataurl']
        img = urljoin(url, img)
        img = re.sub('/[a-z]+images/', '/images/', img).replace('_t.', '.')
        img = Image(img, url, len(imgs))
        imgs.append(img)

    return imgs

def get_imgs_page(page, session, cw=None):
    print_ = get_print(cw)
    print_(page.title)

    html = downloader.read_html(page.url, session=session)
    soup = Soup(html)

    view = soup.find('div', class_='chapter-content')
    if not view:
        raise Exception('no chapter-content')

    imgs = []
    for img in soup.findAll('img', class_='chapter-img'):
        src = img.get('data-pagespeed-lazy-src') or img.get('data-src') or img.get('data-srcset') or img.get('data-aload') or img['src']
        try:
            src = base64.b64decode(src).strip().decode('utf8')
        except:
            pass
        src = urljoin(page.url, src)
        if 'Credit_LHScan_' in src or '5e1ad960d67b2_5e1ad962338c7' in src:
            continue
        if 'fe132b3d32acc39f5adcea9075bedad4LoveHeaven' in src:
            continue
        if 'LoveHug_600cfd96e98ff.jpg' in src:
            continue
        img = Image(src.strip(), page, len(imgs))
        imgs.append(img)

    return imgs

def get_info(url, soup=None):
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)

    info = {}
    info['id'] = get_id(url)
    title = soup.find('h1').text.strip()
    info['title'] = title

    for tag in soup.findAll('span', class_='tag'):
        href = tag.parent.attrs['href']
        href = urljoin(url, href).strip('/')
        key = href.split('/')[3]
        value = href.split('/')[-1]
        if key == 'language' and value == 'translated':
            continue
        if key in info:
            info[key].append(value)
        else:
            info[key] = [value]

    for key in ['artists', 'groups', 'parodies', 'tags', 'characters']:
        if key not in info:
            info[key] = []

    return info

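# For reference, a hypothetical result of get_info() above (values invented for
# illustration): the tag loop groups values by URL segment, and the final loop
# guarantees the list-valued keys exist even when the gallery has none of that
# category.
#
# {
#     'id': 12345,
#     'title': '...',
#     'artists': ['...'],
#     'groups': [],
#     'parodies': [],
#     'tags': ['...', '...'],
#     'characters': [],
#     'language': ['english'],
# }
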
def get_imgs(url, title=None, cw=None):
    url = clean_url(url)
    if 's=view' in url and 'page=favorites' not in url:
        raise NotImplementedError('Not Implemented')

    tags = get_tags(url)
    tags = quote(tags, safe='/')
    tags = tags.replace('%20', '+')
    url = 'https://gelbooru.com/index.php?page=post&s=list&tags={}'.format(tags)

    print_ = get_print(cw)

    # Range
    max_pid = get_max_range(cw, 2000)

    imgs = []
    ids = set()
    count_no_imgs = 0
    for p in range(500): #1017
        url = setPage(url, len(ids))
        print_(url)
        html = downloader.read_html(url)

        soup = Soup(html)
        posts = soup.findAll('div', class_='thumbnail-preview')
        imgs_new = []
        for post in posts:
            id_ = int(re.find('[0-9]+', post.find('a')['id'], err='no id'))
            if id_ in ids:
                print('duplicate:', id_)
                continue
            ids.add(id_)
            url_img = urljoin(url, post.find('a')['href'])
            img = Image(id_, url_img)
            imgs_new.append(img)
        if imgs_new:
            imgs += imgs_new
            count_no_imgs = 0
        else:
            print('no imgs')
            count_no_imgs += 1
            if count_no_imgs > 1:
                print('break')
                break

        if len(imgs) >= max_pid:
            break

        if cw is not None:
            if not cw.alive:
                break
            cw.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))

    if not imgs:
        raise Exception('no imgs')

    return imgs

def get_imgs(url, title=None, customWidget=None):
    url = clean_url(url)
    if 's=view' in url and 'page=favorites' not in url:
        raise NotImplementedError('Not Implemented')

    if 'page=dapi' not in url.lower():
        tags = get_tags(url).replace(' ', '+')
        tags = urllib.parse.quote(tags)
        url = "https://gelbooru.com/index.php?page=dapi&s=post&q=index&tags={}&pid={}&limit={}".format(tags, 0, LIMIT)

    if customWidget is not None:
        print_ = customWidget.print_
    else:
        def print_(*values):
            sys.stdout.writelines(values + ('\n', ))

    # Range
    if customWidget is not None:
        range_pid = customWidget.range
    else:
        range_pid = None
    if range_pid is not None:
        max_pid = max(parse_range(range_pid, max=100000))
    else:
        max_pid = 2000

    imgs = []
    url_imgs = set()
    for p in range(100):
        url = setPage(url, p)
        print_(url)
        html = downloader.read_html(url)
        soup = Soup(html)
        posts = soup.findAll('post')
        if not posts:
            break
        for post in posts:
            url_img = post.attrs['file_url']
            if url_img in url_imgs:
                print('already exists', url_img)
            else:
                url_imgs.add(url_img)
                id = post.attrs['id']
                img = Image(id, url_img)
                imgs.append(img)
        if len(imgs) >= max_pid:
            break

        if customWidget is not None:
            if not customWidget.alive:
                break
            customWidget.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))

    return imgs

def get_video(url, session=None):
    if session is None:
        session = Session()
        session.headers['User-Agent'] = downloader.hdr['User-Agent']
        session.headers['X-Directive'] = 'api'
    html = downloader.read_html(url, session=session)
    soup = Soup(html)
    for script in soup.findAll('script'):
        script = script.text or script.string or ''
        data = re.find('window.__NUXT__=(.+)', script)
        if data is not None:
            data = data.strip()
            if data.endswith(';'):
                data = data[:-1]
            data = json.loads(data)
            break
    else:
        raise Exception('No __NUXT__')

    info = data['state']['data']['video']['hentai_video']
    query = info['slug']
    #url_api = 'https://members.hanime.tv/api/v3/videos_manifests/{}?'.format(query) # old
    url_api = 'https://hanime.tv/rapi/v7/videos_manifests/{}?'.format(query) # new
    print(url_api)
    hdr = {
        'x-signature': ''.join('{:x}'.format(randrange(16)) for i in range(32)),
        'x-signature-version': 'web2',
        'x-time': str(int(time())),
    }
    r = session.get(url_api, headers=hdr)
    print(r)
    data = json.loads(r.text)
    streams = []
    for server in data['videos_manifest']['servers']:
        streams += server['streams']

    streams_good = []
    for stream in streams:
        url_video = stream['url']
        if not url_video or 'deprecated.' in url_video:
            continue
        streams_good.append(stream)

    if not streams_good:
        raise Exception('No video available')
    print('len(streams_good):', len(streams_good))
    for stream in streams_good:
        print(stream['extension'], stream['width'], stream['filesize_mbs'], stream['url'])

    stream = streams_good[0]
    return Video(info, stream), session

def get(self, url):
    html = downloader.read_html(url)
    soup = Soup(html)
    for li in soup.findAll('li'):
        if li.text.strip() == 'Original image':
            break
    else:
        raise Exception('no Original image')
    url = li.find('a')['href']
    ext = get_ext(url)
    self.filename = u'{}{}'.format(self.id_, ext)
    return url

def get_imgs(url, title=None, cw=None):
    print_ = get_print(cw)
    url = clean_url(url)
    id = get_id(url)
    url = u'https://nijie.info/members_illust.php?id={}'.format(id)

    # Range
    max_pid = get_max_range(cw)

    imgs = []
    url_imgs = set()
    for p in range(1, 1 + 100):
        url = setPage(url, p)
        print_(url)
        html = downloader.read_html(url)

        soup = Soup(html)
        posts = soup.findAll('div', class_='nijie')
        if not posts:
            print('no posts')
            break

        c = 0
        for post in posts:
            url_img = urljoin(url, post.a.attrs['href'])
            if url_img in url_imgs:
                print('duplicate:', url_img)
                continue
            url_imgs.add(url_img)
            id = int(re.find('[?&]id=([0-9]+)', url_img))
            multi = post.find('div', class_='thumbnail-icon')
            if multi:
                imgs_ = get_imgs_post(id, url_img) #
            else:
                imgs_ = [Image(id, url_img, 0)]

            imgs += imgs_
            c += 1

            if len(imgs) >= max_pid:
                break

        msg = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))
        if cw:
            if not cw.alive:
                return
            cw.setTitle(msg)
        else:
            print(msg)

        if len(imgs) >= max_pid or c == 0:
            break

    return imgs

def read_channel(url_page, cw=None):
    print_ = get_print(cw)

    res = re.find(CHANNEL_PATTERN, url_page)
    if res is None:
        raise Exception('Not channel')
    header, username = res
    print(header, username)
    max_pid = get_max_range(cw, 2000)
    info = {}
    info['header'] = header
    info['username'] = username

    session = Session()
    urls = []
    urls_set = set()
    for p in range(100):
        url_api = urljoin(url_page, '/{}/{}/videos/best/{}'.format(header, username, p))
        print(url_api)
        r = session.post(url_api, data='main_cats=false')
        soup = Soup(r.text)
        thumbs = soup.findAll('div', class_='thumb-block')
        if not thumbs:
            print_('empty')
            break
        for thumb in thumbs:
            info['name'] = thumb.find('span', class_='name').text.strip()
            href = thumb.find('a')['href']
            href = urljoin(url_page, href)
            if href in urls_set:
                print_('duplicate: {}'.format(href))
                continue
            urls_set.add(href)
            urls.append(href)
        if len(urls) >= max_pid:
            break
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['name'], len(urls))
        if cw:
            if not cw.alive:
                return
            cw.setTitle(s)
        else:
            print(s)

    if not urls:
        raise Exception('no videos')

    info['urls'] = urls[:max_pid]

    return info

def get_pages(url, soup=None):
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)
    pages = []
    for item in soup.findAll('div', class_='acBacknumber-item-leftbox'):
        item = item.parent
        a = item.find('a')
        title = a.attrs['title']
        href = a.attrs['href']
        href = urljoin(url, href)
        page = Page(href, title)
        pages.append(page)
    return pages

def get_imgs(url):
    id = re.find('album/.*?([0-9]+)', url)
    print('id:', id)
    url = 'https://www.tokyomotion.net/album/slideshow/{}'.format(id)
    html = downloader.read_html(url)
    soup = Soup(html)

    imgs = []
    for a in soup.findAll('a', {'data-lightbox': 'slideshow-{}'.format(id)}):
        img = a.find('img').attrs['src']
        img = img.replace('/tmb/', '/')
        img = Image(img, url)
        imgs.append(img)

    return imgs

def get_imgs_page(page, referer, session, cw=None):
    print_ = get_print(cw)
    print_(page.title)

    html = downloader.read_html(page.url, referer, session=session)
    if clf2._is_captcha(Soup(html)): #4124
        html = clf2.solve(page.url, session, cw)['html']
    if not html:
        raise Exception('empty html')
    # the page names its lazy-load attribute dynamically via $(this).attr('...');
    # extract that attribute name and rewrite it to a plain data-src= before parsing
    html = html.replace('{}='.format(re.find(r"\$\(this\)\.attr\('(.+?)'", html, err='no cn')), 'data-src=')
    soup = Soup(html)

    view = soup.find('div', class_='chapter-content')
    if not view:
        raise Exception('no chapter-content')

    imgs = []
    for img in soup.findAll('img', class_='chapter-img'):
        src = img.get('data-pagespeed-lazy-src') or img.get('data-src') or img.get('data-srcset') or img.get('data-aload') or img['src']
        try:
            src = base64.b64decode(src).strip().decode('utf8')
        except:
            pass
        src0 = src
        src = src.replace('welovemanga.one', '1') #
        src = urljoin(page.url, src).strip()
        if 'Credit_LHScan_' in src or '5e1ad960d67b2_5e1ad962338c7' in src:
            continue
        if 'fe132b3d32acc39f5adcea9075bedad4LoveHeaven' in src:
            continue
        if 'LoveHug_600cfd96e98ff.jpg' in src:
            continue
        if 'image_5f0ecf23aed2e.png' in src:
            continue
        if '/uploads/lazy_loading.gif' in src:
            continue
        if not imgs:
            print_(src0)
        img = Image(src, page, len(imgs))
        imgs.append(img)

    return imgs

def get_imgs(url, title=None, cw=None):
    url = clean_url(url)
    if 's=view' in url and 'page=favorites' not in url:
        raise NotImplementedError('Not Implemented')

    if 'page=dapi' not in url.lower():
        tags = get_tags(url)
        tags = quote(tags, safe='/')
        tags = tags.replace('%20', '+')
        url = "https://rule34.xxx/index.php?page=dapi&s=post&q=index&tags={}&pid={}&limit={}".format(tags, 0, LIMIT)

    print_ = get_print(cw)

    # Range
    max_pid = get_max_range(cw)

    imgs = []
    ids = set()
    for p in range(500): #1017
        url = setPage(url, p)
        print_(url)
        html = try_n(4, sleep=30)(downloader.read_html)(url) #3340
        soup = Soup(html)

        posts = soup.findAll('post')
        if not posts:
            break
        for post in posts:
            id_ = post.attrs['id']
            if id_ in ids:
                print('duplicate:', id_)
                continue
            ids.add(id_)
            url_img = post.attrs['file_url']
            img = Image(id_, url_img)
            imgs.append(img)
        if len(imgs) >= max_pid:
            break

        if cw is not None:
            if not cw.alive:
                break
            cw.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))

    return imgs

def get_video(url):
    html = downloader.read_html(url)
    soup = Soup(html)

    for script in soup.findAll('script'):
        script = script.text or script.string or ''
        hls = re.find(r'''html5player\.setVideoHLS\(['"](.+?)['"]''', script)
        if hls:
            break
    else:
        raise Exception('No VideoHLS')

    video = playlist2stream(hls)

    title = get_title(soup)

    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content'].strip()

    video = Video(video, url, title, url_thumb)

    return video

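# Illustrative only: the regex above matches an inline player call such as
#   html5player.setVideoHLS('https://example.invalid/videos/hls/xxx/hls.m3u8');
# in which case `hls` becomes the quoted m3u8 URL that playlist2stream() consumes.
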
def get_pages(url, session=None, soup=None):
    if soup is None:
        html = downloader.read_html(url, session=session)
        soup = Soup(html)
    pages = []
    for inner in soup.findAll('div', class_='inner'):
        a = inner.find('a')
        if not a:
            continue
        href = a.attrs.get('href', '')
        if not re.search(PATTERN_ID, href):
            continue
        if a.find('img'):
            print('skip img', a.attrs.get('href'))
            continue
        href = urljoin(url, href)
        title_page = a.text
        page = Page(title_page, href)
        pages.append(page)
    pages = list(reversed(pages))
    return pages

def decode(html, cw=None):
    ''' decode '''
    print_ = get_print(cw)
    print_('decode')
    soup = Soup(html)
    for script in soup.findAll('script'):
        script = script.text or script.string or ''
        script = script.strip()
        if 'videoUrl' in script:
            break
    else:
        raise Exception('No script')

    flashvars = script.split()[1]

    script = 'playerObjList={};' + script
    context = js2py.EvalJs()
    context.execute(script)

    return context.eval(flashvars).to_dict()

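# Hypothetical usage sketch of decode() above (names assumed, not from the original
# module): the returned dict is whatever object the page script assigns to its second
# token (the flashvars variable); the 'videoUrl' check suggests the playable URL is
# one of its fields.
#
# html = downloader.read_html(url)
# flashvars = decode(html, cw)
# video_url = flashvars.get('videoUrl')
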
def get_info(url, cw=None):
    info = _get_info(url, cw)

    imgs = []

    html = info['content']
    soup = Soup(html)

    for img in soup.findAll('img'):
        src = img.attrs.get('src')
        if not src:
            continue
        # 1696
        if not isinstance(src, bytes):
            src = src.encode('utf8')
        t = base64.b64encode(src)
        if isinstance(t, bytes):
            t = t.decode('utf8')
        src = 'https://cdn1-images.epio.app/image/download/{}'.format(t)
        img = Image(src, url, len(imgs))
        imgs.append(img)

    info['imgs'] = imgs

    return info

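# Worked example of the CDN rewrite in get_info() above (input URL is illustrative):
# an original <img src="https://example.com/a.jpg"> is base64-encoded to
# 'aHR0cHM6Ly9leGFtcGxlLmNvbS9hLmpwZw==', so the downloaded URL becomes
# 'https://cdn1-images.epio.app/image/download/aHR0cHM6Ly9leGFtcGxlLmNvbS9hLmpwZw=='.
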
def get_imgs(url, title=None, range_=None, cw=None):
    if 'donmai.us/artists' in url:
        raise NotImplementedError('Not Implemented')
    if 'donmai.us/posts/' in url:
        raise NotImplementedError('Not Implemented')

    print_ = get_print(cw)

    # Range
    max_pid = get_max_range(cw)

    if range_ is None:
        range_ = range(1, 101)
    print(range_)

    imgs = []
    i = 0
    empty_count = 0
    empty_count_global = 0
    url_imgs = set()
    while i < len(range_):
        p = range_[i]
        url = setPage(url, p)
        print_(url)
        html = downloader.read_html(url)
        soup = Soup(html)

        articles = soup.findAll('article')
        if articles:
            empty_count_global = 0
        else:
            empty_count += 1
            if empty_count < 4:
                s = u'empty page; retry... {}'.format(p)
                print_(s)
                continue
            else:
                empty_count = 0
                empty_count_global += 1

        if empty_count_global >= 6:
            break

        for article in articles:
            id = article.attrs['data-id']
            url_img = article.attrs['data-file-url'].strip()
            if url_img.startswith('http://') or url_img.startswith('https://'):
                pass
            else:
                url_img = 'https://{}donmai.us'.format('danbooru.' if 'danbooru.' in url else '') + url_img
            #print(url_img)
            if url_img not in url_imgs:
                url_imgs.add(url_img)
                img = Image(id, url_img)
                imgs.append(img)

        if len(imgs) >= max_pid:
            break

        if cw is not None:
            if not cw.alive:
                break
            cw.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))

        i += 1

    return imgs

def get_videos(url, cw=None):
    ''' get_videos '''
    print_ = get_print(cw)

    if '/users/' in url:
        mode = 'users'
        username = url.split('/users/')[1].split('/')[0]
    elif '/pornstar/' in url:
        mode = 'pornstar'
        username = url.split('/pornstar/')[1].split('/')[0]
    elif '/model/' in url:
        mode = 'model'
        username = url.split('/model/')[1].split('/')[0]
    elif '/channels/' in url:
        mode = 'channels'
        username = url.split('/channels/')[1].split('/')[0]
    elif '/playlist/' in url:
        mode = 'playlist'
        username = url.split('/playlist/')[1].split('/')[0]
    else:
        raise Exception('Not supported url')
    username = username.split('?')[0].split('#')[0]

    session = Session()

    domain = utils.domain(url)

    if mode in ['pornstar']:
        url_main = 'https://{}/{}/{}'.format(domain, mode, username)
        html = downloader.read_html(url_main, session=session)
        soup = Soup(html)
        soup = fix_soup(soup, url_main, session, cw)
        for a in soup.findAll('a'):
            if '/{}/{}/videos/upload'.format(mode, username) in a.attrs.get('href', ''):
                free = True
                break
        else:
            free = False
        print_('free: {}'.format(free))

    # Range
    max_pid = get_max_range(cw, 500)
    max_pid = min(max_pid, 2000) #

    html = downloader.read_html(url, session=session)
    soup = fix_soup(Soup(html), url, session, cw)

    info = {}

    # get title
    h1 = soup.find('h1')
    if h1:
        header = 'Playlist'
        title = h1.find(id='watchPlaylist')
    else:
        title = None
    if not title:
        header = 'Channel'
        profile = soup.find('div', class_='profileUserName')
        wrapper = soup.find('div', class_='titleWrapper')
        bio = soup.find('div', class_='withBio')
        title = soup.find('h1', {'itemprop': 'name'})
        if not title and profile:
            title = profile.a
        if not title and wrapper:
            title = wrapper.h1
        if not title and bio:
            title = bio.h1
    if not title:
        raise Exception('No title')
    #print(title)
    info['title'] = '[{}] {}'.format(header, title.text.strip())
    token = re.find('''token *= *['"](.*?)['"]''', html)
    print_('token: {}'.format(token))

    # get links
    hrefs = []
    fail = 0
    for p in range(1, 1 + 100):
        try:
            if mode in ['users', 'model']:
                if mode == 'users':
                    url_api = 'https://{}/users/{}/videos/public/ajax?o=mr&page={}'.format(domain, username, p)
                elif mode == 'model':
                    url_api = 'https://{}/model/{}/videos/upload/ajax?o=mr&page={}'.format(domain, username, p)
                r = session.post(url_api)
                soup = Soup(r.text)
                if soup.find('h1'):
                    print('break: h1')
                    break
            elif mode in ['pornstar']:
                if free:
                    url_api = 'https://{}/{}/{}/videos/upload?page={}'.format(domain, mode, username, p)
                    soup = downloader.read_soup(url_api, session=session)
                    soup = fix_soup(soup, url_api, session, cw)
                    soup = soup.find('div', class_='videoUList')
                else:
                    url_api = 'https://{}/{}/{}?page={}'.format(domain, mode, username, p)
                    soup = downloader.read_soup(url_api, session=session)
                    soup = fix_soup(soup, url_api, session, cw)
                    soup = soup.find('ul', class_='pornstarsVideos')
            elif mode in ['channels']:
                url_api = 'https://{}/{}/{}/videos?page={}'.format(domain, mode, username, p)
                soup = downloader.read_soup(url_api, session=session)
                soup = fix_soup(soup, url_api, session, cw)
                try:
                    soup = soup.find('div', {'id': 'channelsBody'}).find('div', class_='rightSide')
                except:
                    break
            elif mode in ['playlist']:
                #url_api = 'https://{}/playlist/viewChunked?id={}&offset={}&itemsPerPage=40'.format(domain, username, len(hrefs))
                if token is None:
                    raise Exception('no token')
                url_api = 'https://{}/playlist/viewChunked?id={}&token={}&page={}'.format(domain, username, token, p)
                soup = downloader.read_soup(url_api, session=session)
            else:
                raise NotImplementedError(mode)
            fail = 0
        except Exception as e:
            print_(e)
            fail += 1
            if fail < 2:
                continue
            else:
                break
        finally:
            print_('{} ({})'.format(url_api, len(hrefs)))

        if cw and not cw.alive:
            return

        lis = soup.findAll('li', class_='videoblock')
        if not lis:
            print_('break: no lis')
            break

        if getattr(soup.find('title'), 'text', '').strip() == 'Page Not Found':
            print_('Page Not Found')
            break

        c = 0
        for li in lis:
            a = li.find('a')
            href = a.attrs['href']
            href = urljoin(url, href)
            if href in hrefs:
                continue
            c += 1
            if href.startswith('javascript:'): # Remove Pornhub Premium
                print(href)
                continue
            hrefs.append(href)
        if c == 0:
            print('c==0')
            break
        print(c) # 1320

        if len(hrefs) >= max_pid:
            break

    if cw:
        hrefs = filter_range(hrefs, cw.range)

    info['hrefs'] = hrefs

    return info

def get_imgs(url, title=None, customWidget=None, d=None, types=['img', 'gif', 'video'], session=None):
    if False: #
        raise NotImplementedError('Not Implemented')
    print_ = get_print(customWidget)
    print_(u'types: {}'.format(', '.join(types)))

    # Range
    max_pid = get_max_range(customWidget, 2000)

    local_ids = {}
    if customWidget is not None:
        dir = customWidget.downloader.dir
        try:
            names = os.listdir(dir)
        except Exception as e:
            print(e)
            names = []
        for name in names:
            id = os.path.splitext(name)[0]
            local_ids[id] = os.path.join(dir, name)

    imgs = []
    page = 1
    url_imgs = set()
    if 'chan.sankakucomplex' in url:
        type = 'chan'
    elif 'idol.sankakucomplex' in url:
        type = 'idol'
    else:
        raise Exception('Not supported subdomain')
    url_old = 'https://{}.sankakucomplex.com'.format(type)
    if customWidget is not None:
        customWidget.exec_queue.put(
            (customWidget, u"customWidget.setTitle(u'{} {}')".format(
                tr_(u'읽는 중...'), title)))
    while len(imgs) < max_pid:
        #if page > 25: # Anonymous users can only view 25 pages of results
        #    break
        sleep(1) #

        #url = setPage(url, page)
        print_(url)
        html = downloader.read_html(url, referer=url_old, session=session)
        if '429 Too many requests'.lower() in html.lower():
            print_('429 Too many requests... wait 120 secs')
            for i in range(120):
                sleep(1)
                if customWidget and not customWidget.alive:
                    return []
            continue
        page += 1
        url_old = url
        soup = Soup(html)
        articles = soup.findAll('span', {'class': 'thumb'})

        if not articles:
            break

        for article in articles:
            # 1183
            tags = article.find('img', class_='preview').attrs['title'].split()
            if 'animated_gif' in tags:
                type_ = 'gif'
            elif 'animated' in tags or 'webm' in tags or 'video' in tags or 'mp4' in tags: # 1697
                type_ = 'video'
            else:
                type_ = 'img'
            if type_ not in types:
                continue

            url_img = article.a.attrs['href']
            if not url_img.startswith('http'):
                url_img = urljoin('https://{}.sankakucomplex.com'.format(type), url_img)
            id = re.find('show/([0-9]+)', url_img)
            print_(article)
            if id is None: # sankaku plus
                continue
            if id in local_ids:
                #print('skip', id)
                local = True
            else:
                local = False
            #print(url_img)
            if url_img not in url_imgs:
                url_imgs.add(url_img)
                if local:
                    url_img = local_ids[id]
                img = Image(type, id, url_img, url, local=local, cw=customWidget, d=d)
                imgs.append(img)
                if len(imgs) >= max_pid:
                    break
        if customWidget and not customWidget.alive:
            break

        try:
            # For page > 50
            pagination = soup.find('div', class_='pagination')
            url = urljoin('https://{}.sankakucomplex.com'.format(type), pagination.attrs['next-page-url'])
        except Exception as e:
            print_(print_error(e)[-1])
            #url = setPage(url, page)
            break

        if customWidget is not None:
            customWidget.setTitle(u'{} {} - {}'.format(
                tr_(u'읽는 중...'), title, len(imgs)))
        else:
            print(len(imgs), 'imgs')

    if not imgs:
        raise Exception('no images')

    return imgs

def get_imgs_page_legacy(page, session, cw=None, depth=0):
    if cw is not None and not cw.alive:
        return
    print_ = get_print(cw)

    try:
        html = read_html(page.url, session)
    except Exception as e:
        print_('get_imgs_page_legacy error: {}'.format(e))
        if e.args and e.args[0] == 502:
            return []
        raise

    if isProtected(html):
        data = get_soup(page.url, cw=cw, session=session)
        page.url = data['url']
        html = data['html']
    soup = Soup(html, 'html5lib') # 1653

    # skip empty pages
    if not html:
        print_(u'empty page: {}'.format(page.title))
        return []

    # skip invalid pages
    err = soup.find('span', class_='cf-error-code')
    if err:
        print_(u'cf-error-code: {} ({})'.format(err.text.strip(), page.title))
        if depth > 0:
            return []
        else:
            return get_imgs_page_legacy(page, session, cw, depth + 1)

    #page.title = get_title_page(soup)
    matches = re.findall('var img_list *= *(.+?]);', html.replace('\n', ''))
    matches1 = re.findall('var img_list1 *= *(.+?]);', html.replace('\n', ''))
    img_list = json.loads(matches[0]) if matches else []
    img_list1 = json.loads(matches1[0]) if matches1 else []

    # 1780
    img_list = [img for img in img_list if img]
    img_list1 = [img for img in img_list1 if img]

    # 1589
    '''
    if not img_list and not img_list1:
        print_((u'no imgs; retry... {}').format(page.title))
        raise Exception('No images')
    '''

    for script in soup.findAll('script'):
        script = script.text
        if 'var img_list =' in script:
            break
    else:
        raise Exception('No script')
    seed = int(re.find('view_cnt *= *([0-9]+)', script))
    chapter = int(re.find('var +chapter *= *([0-9]+)', script))
    try:
        cdn_domains = cut_pair(re.find('var +cdn_domains *= *(.+)', script), '[]')
        cdn_domains = json.loads(cdn_domains)
    except Exception as e:
        print(e)
        cdn_domains = []

    n = max(len(img_list), len(img_list1))
    img_list += [''] * (n - len(img_list))
    img_list1 += [''] * (n - len(img_list1))

    print_(u'{} chapter:{} seed:{} domains:{}'.format(page.title, chapter, seed, len(cdn_domains)))
    if seed != 0:
        return 'seed'

    imgs = []
    for p, (img, img1) in enumerate(zip(img_list, img_list1)):
        # fix img url
        img = fix_img_url(img, cdn_domains, chapter, p)
        img1 = fix_img_url(img1, cdn_domains, chapter, p)

        img = urljoin(page.url, img) if img else ''
        img1 = urljoin(page.url, img1) if img1 else ''

        # most likely googledrive
        if img.strip('/').count('/') == 2: #1425
            continue

        img = Image(img, page, p, img1)
        imgs.append(img)

    return imgs
