def get_id(url):
    return re.find('baraag.net/([^/]+)', url.lower())
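# Note: the snippets in this file call a project-specific `re.find` helper rather than
# the standard `re` module directly. Below is a minimal sketch of the assumed behavior,
# inferred only from the call sites (the real helper may differ): it returns the first
# capture group of the first match, falls back to `default`, and raises with `err` as
# the message when no match is found and `err` is given. Name `_find_sketch` is
# hypothetical, for illustration only.
import re as _re

def _find_sketch(pattern, string, flags=0, default=None, err=None):
    m = _re.search(pattern, string, flags)
    if m:
        groups = m.groups()
        # a single capture group comes back as a string, several as a tuple
        # (mirrors call sites like key_id, which joins the result with '_')
        return groups[0] if len(groups) == 1 else groups
    if err is not None:
        raise Exception(err)  # e.g. err='no id' in the constructors below
    return default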
def __init__(self, url, session, data=None):
    self.id_ = re.find('/video/([0-9]+)', url, err='no id')
    self._session = session
    self._data = data
    self.url = LazyUrl(url, self.get, self)
def read_channel_legacy(url, session, cw=None):
    print_ = get_print(cw)
    html = downloader.read_html(url, session=session, headers=HDR)
    uid = re.find('//user/profile/([0-9]+)', html, err='no uid')
    secUid = re.find('"secUid" *: *"([^"]+?)"', html, err='no secUid')
    verifyFp = ''.join(
        random.choice(
            'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
        for i in range(16))
    maxCursor = 0

    info = {}
    info['items'] = []
    ids = set()
    for i in range(100):
        url_api = 'https://t.tiktok.com/api/item_list/?count=30&id={uid}&type=1&secUid={secUid}&maxCursor={maxCursor}&minCursor=0&sourceType=8&appId=1180&region=US&language=en&verifyFp={verifyFp}'.format(
            uid=uid, secUid=secUid, verifyFp=verifyFp, maxCursor=maxCursor)
        js = 'window.byted_acrawler.sign({url:"{}"});'.replace('{}', url_api)
        print(js)
        for try_ in range(4):
            try:
                sign = devtools.eval_js(url, js, session)['output']
                break
            except Exception as e:
                print(e)
                e_ = e
        else:
            raise e_
        url_api += '&_signature=' + sign
        print_(url_api)
        data_raw = downloader.read_html(url_api, url, session=session, headers=HDR)
        data = json.loads(data_raw)
        items = []
        for item in data.get('items', []):
            id_video = item['id']
            if id_video in ids:
                print('duplicate:', id_video)
                continue
            ids.add(id_video)
            items.append(item)
        if not items:
            print('no items')
            break
        info['items'] += items
        if i == 0:
            info['uid'] = items[0]['author']['uniqueId']
            info['nickname'] = items[0]['author']['nickname']
        msg = '{} {} (tiktok_{}) - {}'.format(tr_('읽는 중...'), info['nickname'], info['uid'], len(info['items']))  # tr_('읽는 중...') == 'Reading...'
        if cw:
            if not cw.alive:
                break
            cw.setTitle(msg)
        else:
            print(msg)
        if not data['hasMore']:
            break
        maxCursor = data['maxCursor']

    if not info['items']:
        raise Exception('no items')

    return info
def get_imgs_page_legacy(page, session, cw=None, depth=0):
    if cw is not None and not cw.alive:
        return
    print_ = get_print(cw)

    try:
        html = read_html(page.url, session)
    except Exception as e:
        print_('get_imgs_page_legacy error: {}'.format(e))
        if e.args and e.args[0] == 502:
            return []
        raise

    if isProtected(html):
        data = get_soup(page.url, cw=cw, session=session)
        page.url = data['url']
        html = data['html']
    soup = Soup(html, 'html5lib') # 1653

    # skip empty pages
    if not html:
        print_(u'empty page: {}'.format(page.title))
        return []

    # skip invalid pages
    err = soup.find('span', class_='cf-error-code')
    if err:
        print_(u'cf-error-code: {} ({})'.format(err.text.strip(), page.title))
        if depth > 0:
            return []
        else:
            return get_imgs_page_legacy(page, session, cw, depth + 1)

    #page.title = get_title_page(soup)
    matches = re.findall('var img_list *= *(.+?]);', html.replace('\n', ''))
    matches1 = re.findall('var img_list1 *= *(.+?]);', html.replace('\n', ''))
    img_list = json.loads(matches[0]) if matches else []
    img_list1 = json.loads(matches1[0]) if matches1 else []

    # 1780
    img_list = [img for img in img_list if img]
    img_list1 = [img for img in img_list1 if img]

    # 1589
    '''
    if not img_list and not img_list1:
        print_((u'no imgs; retry... {}').format(page.title))
        raise Exception('No images')
    '''

    for script in soup.findAll('script'):
        script = script.text
        if 'var img_list =' in script:
            break
    else:
        raise Exception('No script')
    seed = int(re.find('view_cnt *= *([0-9]+)', script))
    chapter = int(re.find('var +chapter *= *([0-9]+)', script))
    try:
        cdn_domains = cut_pair(re.find('var +cdn_domains *= *(.+)', script), '[]')
        cdn_domains = json.loads(cdn_domains)
    except Exception as e:
        print(e)
        cdn_domains = []

    n = max(len(img_list), len(img_list1))
    img_list += [''] * (n - len(img_list))
    img_list1 += [''] * (n - len(img_list1))

    print_(u'{} chapter:{} seed:{} domains:{}'.format(
        page.title, chapter, seed, len(cdn_domains)))
    if seed != 0:
        return 'seed'

    imgs = []
    for p, (img, img1) in enumerate(zip(img_list, img_list1)):
        # fix img url
        img = fix_img_url(img, cdn_domains, chapter, p)
        img1 = fix_img_url(img1, cdn_domains, chapter, p)

        img = urljoin(page.url, img) if img else ''
        img1 = urljoin(page.url, img1) if img1 else ''

        # most likely googledrive
        if img.strip('/').count('/') == 2: #1425
            continue

        img = Image(img, page, p, img1)
        imgs.append(img)

    return imgs
def __init__(self, title, url):
    self.title = clean_title(title)
    self.url = url
    self.id = int(re.find(PATTERN_ID, url))
def get_id(url):
    id_ = re.find('seriesId=([0-9]+)', url, err='No seriesId')
    return id_
def init(self):
    id_ = re.find('/novel/([^/]+)', self.url)
    if id_ is not None:
        self.url = 'https://syosetu.org/novel/{}/'.format(id_)
def key_id(cls, url):
    res = re.find(CHANNEL_PATTERN, url)
    if res:
        return '_'.join(res)
    return url
def getPage(url):
    p = re.find('page=([0-9]+)', url)
    return int(p or 1)
def user_id(self, url):
    return re.find('/users/([0-9]+)', url) or re.find('[?&]id=([0-9]+)', url)
def get_id(url):
    url = url.lower()
    if '/prof-video-click/' in url:
        return url.split('/prof-video-click/')[1].split('/')[2]
    return re.find(r'xvideos[0-9]*\.[^/]+/video([0-9]+)', url, err='no id')
def illust_id(self, url):
    return re.find('/artworks/([0-9]+)', url) or re.find('[?&]illust_id=([0-9]+)', url)
def my_id():
    sid = Session().cookies.get('PHPSESSID', domain='.pixiv.net')
    if not sid:
        raise errors.LoginRequired()
    return re.find(r'^([0-9]+)', sid, err='no userid')
def get_info(url, cw=None, depth=0):
    print_ = get_print(cw)
    api = PixivAPI()
    info = {}
    imgs = []

    if utils.ui_setting:
        ugoira_ext = [None, '.gif', '.webp', '.png'][utils.ui_setting.ugoira_convert.currentIndex()]
    else:
        ugoira_ext = None
    if utils.ui_setting:
        format_ = compatstr(utils.ui_setting.pixivFormat.currentText())
    else:
        format_ = 'id_ppage'

    max_pid = get_max_range(cw)

    if api.illust_id(url): # Single post
        id_ = api.illust_id(url)
        data = api.illust(id_)
        login = '******' not in data
        if FORCE_LOGIN and not login: #
            raise errors.LoginRequired()
        if data['xRestrict'] and not login:
            raise errors.LoginRequired('R-18')

        info['artist'] = data['userName']
        info['artist_id'] = data['userId']
        info['raw_title'] = data['illustTitle']
        info['title'] = '{} (pixiv_illust_{})'.format(info['raw_title'], id_)
        info['create_date'] = parse_time(data['createDate'])
        tags_illust = set(tag['tag'] for tag in data['tags']['tags'])

        if tags_matched(tags_illust, cw):
            if data['illustType'] == 2: # ugoira
                data = api.ugoira_meta(id_)
                ugoira = {
                    'ext': ugoira_ext,
                    'delay': [frame['delay'] for frame in data['frames']],
                }
                img = Image(data['originalSrc'], url, id_, 0, format_, info, cw, ugoira=ugoira)
                imgs.append(img)
            else:
                data = api.pages(id_)
                for img in data:
                    img = Image(img['urls']['original'], url, id_, len(imgs), format_, info, cw)
                    imgs.append(img)
        else:
            print('tags mismatched')
    elif '/bookmarks/' in url or 'bookmark.php' in url: # User bookmarks
        id_ = api.user_id(url)
        if id_ is None: #
            id_ = my_id()
        if id_ == my_id():
            rest = 'all'
        else:
            rest = 'show'
        process_user(id_, info, api)
        info['title'] = '{} (pixiv_bmk_{})'.format(info['artist'], info['artist_id'])
        ids = []
        ids_set = set()
        offset = 0
        while len(ids) < max_pid:
            data = api.bookmarks(id_, offset, rest=rest)
            c = 0
            for id in [work['id'] for work in data['works']]:
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            offset += LIMIT
            if depth == 0:
                check_alive(cw)
        process_ids(ids[:max_pid], info, imgs, cw, depth)
    elif '/tags/' in url or 'search.php' in url: # Search
        q = unquote(
            re.find(r'/tags/([^/]+)', url) or
            re.find('[?&]word=([^&]*)', url, err='no tags'))
        info['title'] = '{} (pixiv_search_{})'.format(q, q.replace(' ', '+'))
        qs = query_url(url)
        order = qs.get('order', ['date_d'])[0]
        mode = qs.get('mode', ['all'])[0]
        ids = []
        ids_set = set()
        p = 1
        while len(ids) < max_pid:
            data = api.search(q, order, mode, p=p)
            c = 0
            for id in [
                    illust['id'] for illust in data['illustManga']['data']
                    if 'id' in illust
            ]:
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            p += 1
        process_ids(ids[:max_pid], info, imgs, cw, depth)
    elif 'bookmark_new_illust.php' in url or 'bookmark_new_illust_r18.php' in url: # Newest works: Following
        r18 = 'bookmark_new_illust_r18.php' in url
        id_ = my_id()
        process_user(id_, info, api)
        info['title'] = '{} (pixiv_following_{}{})'.format(
            info['artist'], 'r18_' if r18 else '', info['artist_id'])
        ids = []
        ids_set = set()
        p = 1
        while len(ids) < max_pid:
            c = 0
            for id in api.following(p, r18=r18):
                if id in ids_set:
                    continue
                ids_set.add(id)
                ids.append(id)
                c += 1
            if not c:
                break
            p += 1
        process_ids(ids[:max_pid], info, imgs, cw, depth)
    elif api.user_id(url): # User illusts
        id_ = api.user_id(url)
        process_user(id_, info, api)
        data = api.profile(id_)
        info['title'] = '{} (pixiv_{})'.format(info['artist'], info['artist_id'])
        ids = []
        for illusts in [data['illusts'], data['manga']]:
            if not illusts:
                continue
            ids += list(illusts.keys())
        ids = sorted(ids, key=int, reverse=True)
        process_ids(ids[:max_pid], info, imgs, cw, depth)
    else:
        raise NotImplementedError()

    info['imgs'] = imgs[:max_pid]

    return info
def get_videos(url, cw=None):
    '''
    get_videos
    '''
    print_ = get_print(cw)

    if '/users/' in url:
        mode = 'users'
        username = url.split('/users/')[1].split('/')[0]
    elif '/pornstar/' in url:
        mode = 'pornstar'
        username = url.split('/pornstar/')[1].split('/')[0]
    elif '/model/' in url:
        mode = 'model'
        username = url.split('/model/')[1].split('/')[0]
    elif '/channels/' in url:
        mode = 'channels'
        username = url.split('/channels/')[1].split('/')[0]
    elif '/playlist/' in url:
        mode = 'playlist'
        username = url.split('/playlist/')[1].split('/')[0]
    else:
        raise Exception('Not supported url')
    username = username.split('?')[0].split('#')[0]

    session = Session()

    if mode in ['pornstar']:
        url_main = 'https://www.pornhub.com/{}/{}'.format(mode, username)
        html = downloader.read_html(url_main, session=session)
        soup = Soup(html)
        soup = fix_soup(soup, url_main, session, cw)
        for a in soup.findAll('a'):
            if '/{}/{}/videos/upload'.format(mode, username) in a.attrs.get('href', ''):
                free = True
                break
        else:
            free = False
        print_('free: {}'.format(free))

    # Range
    max_pid = get_max_range(cw, 500)
    max_pid = min(max_pid, 2000)#

    html = downloader.read_html(url, session=session)
    soup = fix_soup(Soup(html), url, session, cw)

    info = {}

    # get title
    h1 = soup.find('h1')
    if h1:
        header = 'Playlist'
        title = h1.find(id='watchPlaylist')
    else:
        title = None
    if not title:
        header = 'Channel'
        profile = soup.find('div', class_='profileUserName')
        wrapper = soup.find('div', class_='titleWrapper')
        bio = soup.find('div', class_='withBio')
        title = soup.find('h1', {'itemprop': 'name'})
        if not title and profile:
            title = profile.a
        if not title and wrapper:
            title = wrapper.h1
        if not title and bio:
            title = bio.h1
    if not title:
        raise Exception('No title')
    #print(title)
    info['title'] = '[{}] {}'.format(header, title.text.strip())

    token = re.find('''token *= *['"](.*?)['"]''', html)
    print_('token: {}'.format(token))

    # get links
    hrefs = []
    fail = 0
    for p in range(1, 1+100):
        try:
            if mode in ['users', 'model']:
                if mode == 'users':
                    url_api = 'https://www.pornhub.com/users/{}/videos/public/'\
                              'ajax?o=mr&page={}'.format(username, p)
                elif mode == 'model':
                    url_api = 'https://www.pornhub.com/model/{}/videos/upload/'\
                              'ajax?o=mr&page={}'.format(username, p)
                r = session.post(url_api)
                soup = Soup(r.text)
                if soup.find('h1'):
                    print('break: h1')
                    break
            elif mode in ['pornstar']:
                if free:
                    url_api = 'https://www.pornhub.com/{}/{}/videos/upload'\
                              '?page={}'.format(mode, username, p)
                    soup = downloader.read_soup(url_api, session=session)
                    soup = fix_soup(soup, url_api, session, cw)
                    soup = soup.find('div', class_='videoUList')
                else:
                    url_api = 'https://www.pornhub.com/{}/{}?page={}'.format(mode, username, p)
                    soup = downloader.read_soup(url_api, session=session)
                    soup = fix_soup(soup, url_api, session, cw)
                    soup = soup.find('ul', class_='pornstarsVideos')
            elif mode in ['channels']:
                url_api = 'https://www.pornhub.com/{}/{}/videos?page={}'.format(mode, username, p)
                soup = downloader.read_soup(url_api, session=session)
                soup = fix_soup(soup, url_api, session, cw)
                try:
                    soup = soup.find('div', {'id': 'channelsBody'}).find('div', class_='rightSide')
                except:
                    break
            elif mode in ['playlist']:
                #url_api = 'https://www.pornhub.com/playlist/viewChunked?id={}&offset={}&itemsPerPage=40'.format(username, len(hrefs))
                if token is None:
                    raise Exception('no token')
                url_api = 'https://www.pornhub.com/playlist/viewChunked?id={}&token={}&page={}'.format(username, token, p)
                soup = downloader.read_soup(url_api, session=session)
            else:
                raise NotImplementedError(mode)
            fail = 0
        except Exception as e:
            print_(e)
            fail += 1
            if fail < 2:
                continue
            else:
                break
        finally:
            print_('{} ({})'.format(url_api, len(hrefs)))

        if cw and not cw.alive:
            return

        lis = soup.findAll('li', class_='videoblock')
        if not lis:
            print_('break: no lis')
            break

        if getattr(soup.find('title'), 'text', '').strip() == 'Page Not Found':
            print_('Page Not Found')
            break

        c = 0
        for li in lis:
            a = li.find('a')
            href = a.attrs['href']
            href = urljoin(url, href)
            if href in hrefs:
                continue
            c += 1
            if href.startswith('javascript:'): # Remove Pornhub Premium
                print(href)
                continue
            hrefs.append(href)
        if c == 0:
            print('c==0')
            break
        print(c) # 1320

        if len(hrefs) >= max_pid:
            break

    if cw:
        hrefs = filter_range(hrefs, cw.range)

    info['hrefs'] = hrefs

    return info
def get_artist(soup):
    view = soup.find('div', class_='view-title')
    text = view.text.replace('\n', '#')
    # '작가' is the Korean label for 'Artist' on the page
    artist = re.find('작가[ #]*:[ #]*(.+?)#', text, default='N/A').strip()
    return artist
def get(self, url):
    '''
    get
    '''
    cw = self.cw
    session = self.session
    print_ = get_print(cw)
    if self._url:
        return self._url

    id_ = re.find(r'viewkey=(\w+)', url, re.IGNORECASE) or \
          re.find(r'/embed/(\w+)', url, re.IGNORECASE)
    print('id: {}'.format(id_))
    if 'viewkey=' not in url.lower() and '/gif/' not in url.lower():
        url = urljoin(url, '/view_video.php?viewkey={}'.format(id_))

    html = downloader.read_html(url, session=session)
    soup = Soup(html)
    soup = fix_soup(soup, url, session, cw)
    html = str(soup)

    # removed
    if soup.find('div', class_='removed'):
        raise Exception('removed')

    gif = soup.find('div', {'id': 'gifImageSection'})
    if gif:
        print_('GIF')
        id_ = url.split('/gif/')[1]
        id_ = re.findall('[0-9a-zA-Z]+', id_)[0]

        jss = list(gif.children)
        for js in jss:
            if 'data-mp4' in getattr(js, 'attrs', {}):
                break
        else:
            raise Exception('gif mp4 url not found')

        title = js['data-gif-title']
        url = js['data-mp4']
        url_thumb = re.find(r'https?://.+?.phncdn.com/pics/gifs/.+?\.jpg', html, err='no thumb')
        file = File('gif_{}'.format(id_), title, url, url_thumb)
    else:
        if id_ is None:
            raise Exception('no id')
        print_('Video')

        # 1968
        #title = j['video_title']
        title = soup.find('h1', class_='title').text.strip()

        ydl = ytdl.YoutubeDL()
        info = ydl.extract_info(url)

        url_thumb = info['thumbnail']
        videos = []
        for f in info['formats']:
            video = {}
            video['height'] = f['height']
            video['quality'] = f['height']
            video['protocol'] = f['protocol']
            video['videoUrl'] = f['url']
            if f['protocol'] == 'm3u8':
                video['quality'] -= 1
            print_('[{}p] {} {}'.format(video['height'], video['protocol'], video['videoUrl']))
            videos.append(video)

        if not videos:
            raise Exception('No videos')

        videos = sorted(videos, key=lambda video: video['quality'])

        res = get_resolution()

        videos_good = [video for video in videos if video['quality'] <= res]
        if videos_good:
            video = videos_good[-1]
        else:
            video = videos[0]
        print_('\n[{}p] {} {}'.format(video['height'], video['protocol'], video['videoUrl']))

        file = File(id_, title, video['videoUrl'].strip(), url_thumb)

    self._url = file.url
    self.title = file.title
    self.filename = file.filename
    self.thumb = file.thumb

    return self._url
def _get_video(url, session, cw, ie_key=None, allow_m3u8=True):
    print_ = get_print(cw)
    print_('get_video: {}, {}'.format(allow_m3u8, url))
    options = {
        'noplaylist': True,
        #'extract_flat': True,
        'playlistend': 1,
    }
    ydl = ytdl.YoutubeDL(options, cw=cw)
    info = ydl.extract_info(url)
    if not ie_key:
        ie_key = ytdl.get_extractor_name(url)
    info['ie_key'] = ie_key
    url_new = info.get('url')
    print('url: {} -> {}'.format(url, url_new))
    formats = info.get('formats', [])
    print(info.keys())

    if not formats and (info.get('entries') or 'title' not in info):
        if 'entries' in info:
            entry = info['entries'][0]
            url_new = entry.get('url') or entry['webpage_url']
        if url_new != url:
            return get_video(url_new, session, cw, ie_key=get_ie_key(info))

    session.headers.update(info.get('http_headers', {}))
    #session.cookies.update(ydl.cookiejar)

    if not formats:
        print('no formats')
        if url_new:
            f = {'url': url_new, 'format': ''}
            formats.append(f)

    fs = []
    for i, f in enumerate(formats):
        f['_index'] = i
        f['_resolution'] = f.get('vbr') or int_or_none(re.find('([0-9]+)p', f['format'], re.IGNORECASE)) or f.get('height') or f.get('width') or int(f.get('vcodec', 'none') != 'none')
        f['_audio'] = f.get('abr') or f.get('asr') or int(f.get('acodec', 'none') != 'none')
        print_(format_(f))
        fs.append(f)

    if not fs:
        raise Exception('No videos')

    def filter_f(fs):
        for f in fs:
            if allow_m3u8:
                return f
            ext = get_ext_(f['url'], session, url)
            if ext.lower() != '.m3u8':
                return f
            print_('invalid url: {}'.format(f['url']))
        return list(fs)[0]#

    f_video = filter_f(reversed(sorted(fs, key=lambda f: (f['_resolution'], f['_index']))))
    print_('video0: {}'.format(format_(f_video)))

    if f_video['_audio']:
        f_audio = None
    else:
        fs_audio = sorted([f_audio for f_audio in fs if (not f_audio['_resolution'] and f_audio['_audio'])], key=lambda f: (f['_audio'], f['_index']))
        if fs_audio:
            f_audio = fs_audio[-1]
        else:
            try:
                print_('trying to get f_video with audio')
                f_video = filter_f(reversed(sorted([f for f in fs if f['_audio']], key=lambda f: (f['_resolution'], f['_index']))))
            except Exception as e:
                print_('failed to get f_video with audio: {}'.format(e))
            f_audio = None
        print_('video: {}'.format(format_(f_video)))
    print_('audio: {}'.format(format_(f_audio)))

    video = Video(f_video, f_audio, info, session, url, cw=cw)

    return video
def get_data(html):
    data_raw = cut_pair(re.find('window.initials *= *(.+)', html))
    return json.loads(data_raw)
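# `cut_pair` is another project helper (also used in get_imgs_page_legacy above with
# '[]'). A minimal sketch of the assumed behavior, inferred from the call sites and not
# confirmed against the real implementation: keep the text from the first opening
# bracket up to its balanced closing bracket, so trailing JavaScript after a JSON
# object/array is dropped before json.loads. Name `_cut_pair_sketch` is hypothetical;
# it does not handle brackets inside string literals.
def _cut_pair_sketch(text, pair='{}'):
    open_ch, close_ch = pair[0], pair[1]
    start = text.index(open_ch)
    depth = 0
    for i, ch in enumerate(text[start:], start):
        if ch == open_ch:
            depth += 1
        elif ch == close_ch:
            depth -= 1
            if depth == 0:
                return text[start:i + 1]  # include the matching closing bracket
    raise ValueError('unbalanced pair: {}'.format(pair))

# Usage sketch: _cut_pair_sketch('{"a": [1, 2]}; var x = 1;') -> '{"a": [1, 2]}'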
def get_artist(soup):
    # '작가' is the Korean label for 'Artist' on the page
    return re.find(r'작가 *: *(.+)', soup.text, default='').strip() or 'N/A'
def id_(self):
    try:
        return int(self.url)
    except:
        return int(re.find('/g/([0-9]+)', self.url))
def get_imgs(username, session, title, types, n=0, format='[%y-%m-%d] id_ppage', cw=None):
    print_ = get_print(cw)

    # Range
    n = max(n, get_max_range(cw))

    # 2303
    ids = set()
    names = dict()
    dir_ = os.path.join(get_outdir('twitter'), title)
    if os.path.isdir(dir_) and cw:
        for name in cw.names_old:
            name = os.path.basename(name)
            id_ = re.find('([0-9]+)_p', name)
            if id_ is None:
                continue
            if get_ext(name).lower() == '.mp4':
                type_ = 'video'
            else:
                type_ = 'img'
            if type_ not in types:
                continue
            id_ = int(id_)
            ids.add(id_)
            if id_ in names:
                names[id_].append(name)
            else:
                names[id_] = [name]
    ids_sure = sorted(ids)[:-100]
    max_id = max(ids_sure) if ids_sure else 0 #3201

    # 2303
    imgs_old = []
    for id_ in sorted(ids, reverse=True):
        for p, file in enumerate(
                sorted(os.path.join(dir_, name) for name in names[id_])):
            img = Image(file, '', id_, 0, p, format, cw, False)
            # bind the current file path; a bare `lambda _: file` would late-bind and
            # make every old image resolve to the last file in the loop
            img.url = LazyUrl_twitter(None, lambda _, file=file: file, img)
            img.filename = os.path.basename(file)
            imgs_old.append(img)

    imgs_new = []
    enough = False
    c_old = 0
    for tweet in TwitterAPI(session, cw).timeline_media(username):
        id_ = int(tweet['id_str'])
        if id_ < max_id:
            print_('enough')
            enough = True
            break

        if id_ in ids:
            print_('duplicate: {}'.format(id_))
            c_old += 1
            continue
        ids.add(id_)

        imgs_new += get_imgs_from_tweet(tweet, session, types, format, cw)

        if len(imgs_new) + c_old >= n: #3201
            break

        msg = '{} {} - {}'.format(tr_('읽는 중...'), title, len(imgs_new))  # tr_('읽는 중...') == 'Reading...'
        if cw:
            if not cw.alive:
                break
            cw.setTitle(msg)
        else:
            print(msg)

    if not enough and not imgs_new and c_old == 0:
        raise Exception('no imgs')

    imgs = sorted(imgs_old + imgs_new, key=lambda img: img.id, reverse=True)

    if len(imgs) < n:
        imgs = get_imgs_more(username, session, title, types, n, format, cw, imgs=imgs)

    return imgs[:n]