def get(self, referer):
    soup = downloader.read_soup(self._url, referer, session=self.session)
    div = soup.find('div', id='display_image_detail')
    url = urljoin(self._url, div.find('img').parent['href'])
    ext = get_ext(url)
    self.filename = '{:04}{}'.format(self._p, ext)
    return url, self._url
def read_channel(url, cw=None):
    print_ = get_print(cw)
    username = url.split('/users/')[1].split('/')[0]
    info = {}
    soup = downloader.read_soup(url)
    title = soup.find('div', class_='user-name').text.strip()
    info['title'] = u'[Channel] {}'.format(title)
    urls = []
    urls_set = set()
    for p in range(1, 101):
        items = read_page(username, p, cw)
        if not items:
            print('no items')
            break
        for item in items:
            if item.find('span', class_='thumb-image-container__status-text'): #2858
                continue
            url = item.a.attrs['href']
            if url in urls_set:
                print('duplicate:', url)
                continue
            urls_set.add(url)
            urls.append(url)
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['title'], len(urls))
        if cw:
            cw.setTitle(s)
        else:
            print(s)
    info['urls'] = urls
    return info
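# Usage sketch (hypothetical URL): read_channel() pages through read_page()
# (defined further below), skips placeholder thumbnails, dedupes hrefs, and
# reports progress via the optional widget. Illustration only:
# info = read_channel('https://xhamster.com/users/example')
# print(info['title'], len(info['urls']))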
def read_album(url, session=None):
    ''' read_album '''
    soup = downloader.read_soup(url, session=session)
    id_album = re.find('/album/([0-9]+)', url, err='no album id')
    url_json = 'https://www.pornhub.com/album/show_album_json?album={}'.format(id_album)
    data = downloader.read_json(url_json, url, session=session)

    block = soup.find('div', class_='photoAlbumListBlock')
    href = block.a.attrs['href']
    id_ = re.find('/photo/([0-9]+)', href, err='no photo id')
    ids = [id_]
    while True:
        item = data[id_]
        id_ = item['next']
        if id_ in ids:
            break
        ids.append(id_)

    photos = []
    for id_ in ids:
        item = data[id_]
        img = item['img_large']
        referer = 'https://www.pornhub.com/photo/{}'.format(id_)
        photo = Photo(id_, img, referer)
        photos.append(photo)

    info = {}
    title = clean_title(soup.find('h1', class_='photoAlbumTitleV2').text)
    info['title'] = format_filename(title, 'album_{}'.format(id_album))
    info['photos'] = photos
    return info
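# The album JSON above behaves as a circular linked list keyed by photo id:
# each item's 'next' field points at the following photo and eventually wraps
# back to the start. A minimal self-contained sketch of the walk read_album()
# performs (toy data, hypothetical helper name):
def _walk_circular(data, start):
    ids = [start]
    id_ = data[start]['next']
    while id_ not in ids:
        ids.append(id_)
        id_ = data[id_]['next']
    return ids

assert _walk_circular({'a': {'next': 'b'}, 'b': {'next': 'c'}, 'c': {'next': 'a'}}, 'a') == ['a', 'b', 'c']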
def get_pages(url):
    pages = []
    urls = set()
    for p in range(1, 101):
        url_page = set_page(url, p)
        print(url_page)
        for try_ in range(4):
            try:
                soup = downloader.read_soup(url_page)
                view = soup.find('ul', id='_listUl')
                if view is None:
                    raise Exception('no view')
                break
            except Exception as e:
                e_ = e
                print(e)
        else:
            raise e_
        pages_new = []
        for li in view.findAll('li', recursive=False):
            href = urljoin(url, li.find('a')['href'])
            title = li.find('span', class_='subj').text.strip()
            if href in urls:
                continue
            urls.add(href)
            no = int(li['data-episode-no'])
            title = '{:04} - {}'.format(no, title)
            page = Page(href, title)
            pages_new.append(page)
        if not pages_new:
            break
        pages += pages_new
    return pages[::-1]
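# The for/else retry idiom used in get_pages(), isolated: the else clause runs
# only when the loop finished without a break, i.e. every attempt failed, and
# it re-raises the last exception. Hypothetical helper, a minimal sketch:
def _retry(fn, n=4):
    for _ in range(n):
        try:
            result = fn()
            break
        except Exception as e:
            e_ = e
            print(e)
    else:
        raise e_  # all n attempts raised; surface the last error
    return result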
def get_video(url, session, cw):
    print_ = get_print(cw)
    soup = downloader.read_soup(url, session=session)
    view = soup.find('div', id='player-container-fluid')
    fs = []
    for source in view.findAll('source'):
        src = urljoin(url, source.attrs['src'])
        res = re.find('([0-9]+)p', source.attrs['title'])
        res = int(res) if res else 0
        f = {'res': res, 'src': src}
        fs.append(f)
        print_(f)
    if not fs:
        raise Exception('No source') #4773
    res = max(get_resolution(), min(f['res'] for f in fs))
    print_(f'res: {res}')
    fs = sorted([f for f in fs if f['res'] <= res], key=lambda f: f['res'])
    f = fs[-1]
    print_(f'best: {f}')
    src_best = f['src']
    title = soup.find('h1').text.strip()
    id = soup.find('div', id='video').attrs['data-id']
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    #src_best = downloader.real_url(src_best)
    video = Video(src_best, url_thumb, url, title, id, session)
    return video
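# The resolution pick in get_video(), isolated: cap at the user's preferred
# resolution but never below the lowest available stream, then take the best
# stream under that cap. Toy numbers, assuming a 720p preference:
_fs = [{'res': 1080}, {'res': 480}, {'res': 240}]
_res = max(720, min(f['res'] for f in _fs))  # -> 720, so the cap stays usable
_best = sorted([f for f in _fs if f['res'] <= _res], key=lambda f: f['res'])[-1]
assert _best['res'] == 480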
def read(self):
    soup = downloader.read_soup(self.url)
    for div in soup.findAll('div', class_='fileText'):
        href = urljoin(self.url, div.a['href'])
        img = Image(href, self.url, len(self.urls))
        self.urls.append(img.url)
    board = self.url.split('/')[3]
    title = soup.find('span', class_='subject').text
    id_ = int(self.url.split('/thread/')[1].split('/')[0])
    self.title = clean_title(f'[{board}] {title} ({id_})')
def read(self):
    if '/video/' in self.url:
        res = clf2.solve(self.url, session=self.session, cw=self.cw)
        soup = Soup(res['html'])
        title = soup.find('h1', id='post_title').text.strip()
        self.title = title
        view = soup.find('div', id='post')
        video = view.find('video')
        src = video.find('source')['src']
        src = urljoin(self.url, src)
        video = Video(src, self.url, title, self.session)
        self.urls.append(video.url)
        self.single = True
        return
    if '/image/' not in self.url:
        raise NotImplementedError('Not a post')
    res = clf2.solve(self.url, session=self.session, cw=self.cw)
    soup = Soup(res['html'])
    title = soup.find('h2').text
    paginator = soup.find('div', id='paginator')
    pages = [self.url]
    for a in paginator.findAll('a'):
        href = a.get('href')
        if not href:
            continue
        href = urljoin(self.url, href)
        if href not in pages:
            pages.append(href)
    imgs = []
    for i, page in enumerate(pages):
        if page == self.url:
            soup_page = soup
        else:
            soup_page = downloader.read_soup(page, session=self.session)
        view = soup_page.find('div', id='post')
        for img in view.findAll('img'):
            href = img.parent['href']
            href = urljoin(page, href)
            img = Image(href, page, len(imgs), self.session)
            imgs.append(img)
        self.cw.setTitle('{} {} ({} / {})'.format(tr_('읽는 중...'), title, i+1, len(pages)))
    for img in imgs:
        self.urls.append(img.url)
    self.title = clean_title(title)
def is_login(session, cw=None, n=2):
    ''' is_login '''
    print_ = get_print(cw)
    print_('is_login {}'.format(n))
    if n <= 0:
        return False
    url = 'https://www.pornhubpremium.com'
    soup = downloader.read_soup(url, session=session)
    soup = fix_soup(soup, url, session, cw)
    html = str(soup)
    if soup.find('ul', id='profileMenuDropdown'):
        return True
    return is_login(session, cw, n-1)
def read_photo(url, session=None):
    ''' read_photo '''
    id_ = re.find('/photo/([0-9]+)', url, err='no photo id')
    soup = downloader.read_soup(url, session=session)
    div = soup.find('div', id='thumbSlider')
    href = urljoin(url, div.find('a').attrs['href'])
    info = read_album(href)
    photos = []
    for photo in info['photos']:
        if str(photo.id_) == id_:
            photos.append(photo)
    info['photos'] = photos
    info['title'] = '{} - {}'.format(info['title'], photos[0].filename)
    return info
def read_page(username, p, cw):
    print_ = get_print(cw)
    url = 'https://xhamster.com/users/{}/videos/{}'.format(username, p)
    print_(url)
    n = 4
    for try_ in range(n):
        try:
            soup = downloader.read_soup(url)
            items = soup.findAll('div', class_='thumb-list__item')
            if not items and try_ < n - 1:
                continue
            break
        except Exception as e:
            e_ = e
            print(e)
    else:
        raise e_
    return items
def read_gallery(url, cw=None):
    print_ = get_print(cw)
    info = {}
    soup = downloader.read_soup(url)
    h1 = soup.find('h1')
    if h1.find('a'):
        url = h1.find('a')['href']
        return read_gallery(url, cw)
    info['title'] = h1.text.strip()
    info['url'] = setPage(url, 1)
    imgs = []
    ids = set()
    for p in range(1, 101):
        print_('p: {}'.format(p))
        url = setPage(url, p)
        html = downloader.read_html(url)
        data = get_data(html)
        photos = data['photosGalleryModel']['photos']
        if not photos:
            print('no photos')
            break
        for photo in photos:
            img = photo['imageURL']
            id = photo['id']
            referer = photo['pageURL']
            if id in ids:
                print('duplicate:', id)
                continue
            ids.add(id)
            img = Image(img, id, referer)
            imgs.append(img)
    info['imgs'] = imgs
    return info
def read_soup(url):
    return downloader.read_soup(url, user_agent=UA)
def get_videos(url, cw=None):
    ''' get_videos '''
    print_ = get_print(cw)
    if '/users/' in url:
        mode = 'users'
        username = url.split('/users/')[1].split('/')[0]
    elif '/pornstar/' in url:
        mode = 'pornstar'
        username = url.split('/pornstar/')[1].split('/')[0]
    elif '/model/' in url:
        mode = 'model'
        username = url.split('/model/')[1].split('/')[0]
    elif '/channels/' in url:
        mode = 'channels'
        username = url.split('/channels/')[1].split('/')[0]
    elif '/playlist/' in url:
        mode = 'playlist'
        username = url.split('/playlist/')[1].split('/')[0]
    else:
        raise Exception('Not supported url')
    username = username.split('?')[0].split('#')[0]

    session = Session()
    domain = utils.domain(url)

    if mode in ['pornstar']:
        url_main = 'https://{}/{}/{}'.format(domain, mode, username)
        html = downloader.read_html(url_main, session=session)
        soup = Soup(html)
        soup = fix_soup(soup, url_main, session, cw)
        for a in soup.findAll('a'):
            if '/{}/{}/videos/upload'.format(mode, username) in a.attrs.get('href', ''):
                free = True
                break
        else:
            free = False
        print_('free: {}'.format(free))

    # Range
    max_pid = get_max_range(cw, 500)
    max_pid = min(max_pid, 2000) #

    html = downloader.read_html(url, session=session)
    soup = fix_soup(Soup(html), url, session, cw)
    info = {}

    # get title
    h1 = soup.find('h1')
    if h1:
        header = 'Playlist'
        title = h1.find(id='watchPlaylist')
    else:
        title = None
    if not title:
        header = 'Channel'
        profile = soup.find('div', class_='profileUserName')
        wrapper = soup.find('div', class_='titleWrapper')
        bio = soup.find('div', class_='withBio')
        title = soup.find('h1', {'itemprop': 'name'})
        if not title and profile:
            title = profile.a
        if not title and wrapper:
            title = wrapper.h1
        if not title and bio:
            title = bio.h1
    if not title:
        raise Exception('No title')
    #print(title)
    info['title'] = '[{}] {}'.format(header, title.text.strip())

    token = re.find('''token *= *['"](.*?)['"]''', html)
    print_('token: {}'.format(token))

    # get links
    hrefs = []
    fail = 0
    for p in range(1, 1+100):
        try:
            if mode in ['users', 'model']:
                if mode == 'users':
                    url_api = 'https://{}/users/{}/videos/public/'\
                              'ajax?o=mr&page={}'.format(domain, username, p)
                elif mode == 'model':
                    url_api = 'https://{}/model/{}/videos/upload/'\
                              'ajax?o=mr&page={}'.format(domain, username, p)
                r = session.post(url_api)
                soup = Soup(r.text)
                if soup.find('h1'):
                    print('break: h1')
                    break
            elif mode in ['pornstar']:
                if free:
                    url_api = 'https://{}/{}/{}/videos/upload'\
                              '?page={}'.format(domain, mode, username, p)
                    soup = downloader.read_soup(url_api, session=session)
                    soup = fix_soup(soup, url_api, session, cw)
                    soup = soup.find('div', class_='videoUList')
                else:
                    url_api = 'https://{}/{}/{}?page={}'.format(domain, mode, username, p)
                    soup = downloader.read_soup(url_api, session=session)
                    soup = fix_soup(soup, url_api, session, cw)
                    soup = soup.find('ul', class_='pornstarsVideos')
            elif mode in ['channels']:
                url_api = 'https://{}/{}/{}/videos?page={}'.format(domain, mode, username, p)
                soup = downloader.read_soup(url_api, session=session)
                soup = fix_soup(soup, url_api, session, cw)
                try:
                    soup = soup.find('div', {'id': 'channelsBody'}).find('div', class_='rightSide')
                except:
                    break
            elif mode in ['playlist']:
                #url_api = 'https://{}/playlist/viewChunked?id={}&offset={}&itemsPerPage=40'.format(domain, username, len(hrefs))
                if token is None:
                    raise Exception('no token')
                url_api = 'https://{}/playlist/viewChunked?id={}&token={}&page={}'.format(domain, username, token, p)
                soup = downloader.read_soup(url_api, session=session)
            else:
                raise NotImplementedError(mode)
            fail = 0
        except Exception as e:
            print_(e)
            fail += 1
            if fail < 2:
                continue
            else:
                break
        finally:
            print_('{} ({})'.format(url_api, len(hrefs)))
        if cw and not cw.alive:
            return
        lis = soup.findAll('li', class_='videoblock')
        if not lis:
            print_('break: no lis')
            break
        if getattr(soup.find('title'), 'text', '').strip() == 'Page Not Found':
            print_('Page Not Found')
            break
        c = 0
        for li in lis:
            a = li.find('a')
            href = a.attrs['href']
            href = urljoin(url, href)
            if href in hrefs:
                continue
            c += 1
            if href.startswith('javascript:'): # Remove Pornhub Premium
                print(href)
                continue
            hrefs.append(href)
        if c == 0:
            print('c==0')
            break
        print(c) # 1320
        if len(hrefs) >= max_pid:
            break

    if cw:
        hrefs = filter_range(hrefs, cw.range)

    info['hrefs'] = hrefs
    return info
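# Usage sketch (hypothetical URL): get_videos() infers the mode from the URL
# path, pages through the matching ajax/HTML listing endpoint, dedupes hrefs,
# and stops at max_pid or on an empty/not-found page. Illustration only:
# info = get_videos('https://www.pornhub.com/model/example', cw=None)
# print(info['title'], len(info['hrefs']))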
def read_soup(url):
    return downloader.read_soup(url)
def get_soup():
    try:
        return downloader.read_soup(f'https://web.archive.org/web/{snapshot[0]}id_/{snapshot[1]}')
    except Exception as exception:
        print_(print_error(exception)[0])
        return None
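# The 'id_' suffix in the snapshot URL asks the Wayback Machine for the
# resource exactly as archived, without the injected toolbar or URL rewriting,
# e.g. (timestamp and target illustrative):
# https://web.archive.org/web/20200101000000id_/https://example.com/page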
def soup(self):
    return downloader.read_soup(self.url)
def read_soup(url, cw):
    check_alive(cw)
    wait(cw)
    return downloader.read_soup(url)
def init(self):
    self.url = get_main(self.url)
    self.soup = downloader.read_soup(self.url)
def get_main(url):
    if 'episode_no=' in url:
        soup = downloader.read_soup(url)
        url = urljoin(url, soup.find('div', class_='subj_info').find('a')['href'])
    return url
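# Usage sketch (URL shape illustrative, assuming the subj_info anchor links to
# the series list page): get_main() normalizes an episode viewer URL to its
# series root so listing starts from the title page.
# get_main('https://www.webtoons.com/en/genre/title/viewer?title_no=1&episode_no=3')
# -> 'https://www.webtoons.com/en/genre/title/list?title_no=1'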