def get_video(url, cw=None):
    print_ = get_print(cw)
    check_alive(cw)
    data = cw.data_
    version = data['version']
    print_('version: {}'.format(version))
    if version == '0.1':
        raise errors.OutdatedExtension()
    data = data['data']
    if not isinstance(data, bytes):
        data = data.encode('utf8')
    s = base64.b64decode(data).decode('utf8')
    urls = json.loads(s)
    print_(u'\n'.join(urls[:4]))

    referer_seg = 'auto' if 'referer=force' in urls[0] else None # 1718
    stream = M3u8_stream(url, urls=urls, n_thread=4, referer_seg=referer_seg)

    html = downloader.read_html(url)
    soup = Soup(html)
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    title = soup.find('meta', {'property': 'og:title'}).attrs['content'].strip()
    video = Video(stream, url_thumb, url, title)
    return video
def read_channel(url, type_, cw=None):
    print_ = get_print(cw)
    username = re.find(r'/users/([^/]+)', url, err='no username')
    info = {}
    urls = []
    urls_set = set()
    for p in range(50):
        url = 'https://ecchi.iwara.tv/users/{}/{}?page={}'.format(username, type_, p)
        print_(url)
        html = downloader.read_html(url)
        soup = Soup(html)
        if p == 0:
            title = soup.find('h1', class_='page-title').text
            info['title'] = title.replace("'s videos", '').strip()
        view = soup.find('div', class_='view-content')
        if view is None:
            break
        urls_new = []
        for div in view.findAll('div', class_='views-column'):
            href = div.find('a')['href']
            url_video = urljoin(url, href)
            if url_video in urls_set:
                continue
            urls_set.add(url_video)
            urls_new.append(url_video)
        if not urls_new:
            break
        urls += urls_new
    info['urls'] = urls
    return info
def read_page(url, depth=0):
    print('read_page', url, depth)
    if depth > 10:
        raise Exception('Too deep')
    html = downloader.read_html(url, header=HDR)
    if len(html) < 5000:
        id = re.findall('logNo=([0-9]+)', html)[0]
        usernames = re.findall('blog.naver.com/([0-9a-zA-Z]+)', url)
        if not usernames:
            usernames = re.findall('blogId=([0-9a-zA-Z]+)', url)
        username = usernames[0]
        url = 'https://m.blog.naver.com/PostView.nhn?blogId={}&logNo={}&proxyReferer='.format(username, id)
        print('###', username, id, url)
    soup = Soup(html)
    if soup.find('div', {'id': 'viewTypeSelector'}):
        return url, soup
    frame = soup.find('frame')
    if frame is None:
        print('frame is None')
        return read_page(url, depth + 1)
    return read_page(urljoin('https://blog.naver.com', frame.attrs['src']), depth + 1)
def get(self, url):
    cw = self.cw
    d = self.d
    print_ = get_print(cw)
    # retry a few times; the for-else below raises if every attempt fails
    for try_ in range(4):
        wait(cw)
        html = ''
        try:
            html = downloader.read_html(url, referer=self.referer, session=self.session)
            #url = 'https:' + re.findall('[Oo]riginal:? ?<a href="(//[0-9a-zA-Z_-]{2,2}.sankakucomplex.com/data/.{0,320}?)"', html)[0]
            soup = Soup(html)
            highres = soup.find(id='highres')
            url = urljoin(url, highres['href'] if highres else soup.find(id='image')['src'])
            break
        except Exception as e:
            e_msg = print_error(e)[0]
            if '429 Too many requests'.lower() in html.lower():
                t_sleep = 120 * min(try_ + 1, 2)
                e = '429 Too many requests... wait {} secs'.format(t_sleep)
            elif 'post-content-notification' in html: # sankaku plus
                print_('Sankaku plus: {}'.format(self.id))
                return ''
            else:
                t_sleep = 5
            s = '[Sankaku] failed to read image (id:{}): {}'.format(self.id, e)
            print_(s)
            sleep(t_sleep, cw)
    else:
        raise Exception('can not find image (id:{})\n{}'.format(self.id, e_msg))
    # unescape HTML entities in the extracted URL
    soup = Soup('<p>{}</p>'.format(url))
    url = soup.string
    ext = os.path.splitext(url)[1].split('?')[0]
    self.filename = '{}{}'.format(self.id, ext)
    return url
def get_text(url, subtitle, update, session):
    html = downloader.read_html(url, session=session)
    soup = Soup(html)

    if update:
        update = u' ' + update
    else:
        update = ''

    story = soup.find('div', id='novel_honbun').text.strip()

    p = soup.find('div', id='novel_p')
    p = '' if p is None else p.text.strip()
    if p:
        story = '{}\n\n════════════════════════════════\n\n{}'.format(p, story) #2888

    a = soup.find('div', id='novel_a')
    a = '' if a is None else a.text.strip()
    if a:
        story = '{}\n\n════════════════════════════════\n\n{}'.format(story, a)

    text = u'''────────────────────────────────

  ◆  {}{}

────────────────────────────────


{}'''.format(subtitle, update, story)
    return text
def get_score(self):
    score_url = 'http://219.242.68.33/xuesheng/cjcx.aspx'
    soup = Soup(self.http_request.session, score_url)
    all_scoreifo = [item.text.strip() for item in soup.find_all('td')]
    indexs = all_scoreifo[0::10]
    years = all_scoreifo[2::10]
    terms = all_scoreifo[3::10]
    units = all_scoreifo[5::10]
    natures = all_scoreifo[7::10]
    courses = all_scoreifo[8::10]
    scores = map(lambda x: ' / '.join(x), [item.split('\n') for item in all_scoreifo[9::10]])
    average = soup.find(id="ctl00_ContentPlaceHolder1_lblpjcj").text
    total = soup.find(id="ctl00_ContentPlaceHolder1_lblKcms").text
    credit = soup.find(id="ctl00_ContentPlaceHolder1_lblXfs").text
    tabletitle = ['序号', '课程', '成绩', '学分', '学年', '学期', '性质']
    conts = []
    for index, year, term, unit, nature, course, score in \
            zip(indexs, years, terms, units, natures, courses, scores):
        temp = [index, course.strip(), score.replace('\n', ''), unit, year, term, nature]
        conts.append(temp)
    if self.display:
        table_print(tabletitle, conts)
        table_print(['平均成绩', '课程门数', '已获得学分'], [[average, total, credit]])
    return conts
def f(html, browser=None):
    soup = Soup(html)
    if is_captcha(soup):
        print('captcha')
        browser.show()
        sd['shown'] = True
    elif sd['shown']:
        browser.hide()
        sd['shown'] = False
    try:
        info['uid'] = soup.find('h2', class_='share-title').text.strip()
        info['nickname'] = soup.find('h1', class_='share-sub-title').text.strip()
    except Exception as e:
        print_(print_error(e)[0])
    c = 0
    ids_now = set()
    for div in soup.findAll('div', class_='video-feed-item'):
        a = div.find('a')
        if a is None:
            continue
        href = a['href']
        if not href:
            continue
        m = re.search(PATTERN_VID, href)
        if m is None:
            continue
        id_video = int(m.group('id'))
        ids_now.add(id_video)
        if id_video in ids:
            continue
        ids.add(id_video)
        info['items'].append({'id': id_video})
        c += 1
    print_('items: {}'.format(len(info['items'])))
    if len(info['items']) >= max_pid:
        info['items'] = info['items'][:max_pid]
        return True
    # scroll to the bottom to trigger loading of more items
    browser.runJavaScript('window.scrollTo(0, document.body.scrollHeight);')
    sleep(15, cw)
    if c or (ids_now and min(ids_now) > min(ids)):
        sd['count_empty'] = 0
    else:
        print_('empty')
        sd['count_empty'] += 1
    msg = '{} {} (tiktok_{}) - {}'.format(tr_('읽는 중...'), info.get('nickname'), info.get('uid'), len(info['items']))
    if cw:
        if not cw.alive:
            raise Exception('cw dead')
        cw.setTitle(msg)
    else:
        print(msg)
    # stop after several consecutive empty scrolls
    return sd['count_empty'] > 4
def get_info(url, cw=None):
    print_ = get_print(cw)
    info = {'videos': []}
    html = downloader.read_html(url)
    soup = Soup(html)
    info['title'] = soup.find('h2', class_='videoCnt_title').text.strip()
    id_ = re.find(PATTERN_ID, url, err='no id')
    print_('id: {}'.format(id_))
    token = re.find(r'''window.FC2VideoObject.push\(\[['"]ae['"], *['"](.+?)['"]''', html, err='no token')
    print_('token: {}'.format(token))
    url_api = 'https://video.fc2.com/api/v3/videoplaylist/{}?sh=1&fs=0'.format(id_)
    hdr = {
        'X-FC2-Video-Access-Token': token,
    }
    data = downloader.read_json(url_api, url, headers=hdr)
    url_video = urljoin(url, data['playlist'].get('nq') or data['playlist']['sample'])
    url_thumb = soup.find('meta', {'property': 'og:image'})['content']
    video = Video(url_video, url_thumb, url, info['title'], id_)
    info['videos'].append(video)
    return info
def get_video(url, cw=None):
    print_ = get_print(cw)
    if cw and not cw.alive:
        return
    html = downloader.read_html(url)
    soup = Soup(html)
    data = cw.data
    if not isinstance(data, bytes):
        data = data.encode('utf8')
    s = base64.b64decode(data).decode('utf8')
    urls = json.loads(s)
    print_(u'\n'.join(urls[:4]))

    referer_seg = 'auto' if 'referer=force' in urls[0] else None # 1718
    stream = M3u8_stream(url, urls=urls, n_thread=4, referer_seg=referer_seg)

    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    title = soup.find('meta', {'property': 'og:title'}).attrs['content'].strip()
    video = Video(stream, url_thumb, url, title)
    return video
def get_id(url, cw=None):
    for try_ in range(2):
        try:
            res = clf2.solve(url, cw=cw, f=_get_page_id)
            html = res['html']
            soup = Soup(html)
            if soup.find('div', class_='gn_login'):
                raise errors.LoginRequired()
            oid = _get_page_id(html)
            if not oid:
                raise Exception('no page_id')
            uids = re.findall(r'uid=([0-9]+)', html)
            uid = max(set(uids), key=uids.count) # most frequent uid
            name = re.find(r"CONFIG\['onick'\]='(.+?)'", html) or soup.find('div', class_=lambda c: c and c.startswith('ProfileHeader_name')).text.strip()
            if not name:
                raise Exception('no name')
            break
        except errors.LoginRequired as e:
            raise
        except Exception as e:
            e_ = e
            print(e)
    else:
        # both attempts failed; re-raise the last error
        raise e_
    return uid, oid, name
def get_video(url):
    html = downloader.read_html(url)
    soup = Soup(html)

    view = soup.find('div', id='player-container-fluid')
    src_best = None
    res_best = -1
    for source in view.findAll('source'):
        src = urljoin(url, source.attrs['src'])
        res = re.find('([0-9]+)p', source.attrs['title'])
        res = int(res) if res else 0
        if res > res_best:
            src_best = src
            res_best = res
    if src_best is None:
        raise Exception('No source')

    title = soup.find('h1').text.strip()
    id = soup.find('div', id='video').attrs['data-id']
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    #src_best = downloader.real_url(src_best)
    video = Video(src_best, url_thumb, url, title, id)
    return video
def read_page(page):
    html = read_html(page.url)
    soup = Soup(html)

    text_top = get_text(soup.find('div', id='maegaki'))
    print(text_top.count('\n'))
    text_mid = get_text(soup.find('div', id='honbun'))
    text_bot = get_text(soup.find('div', id='atogaki'))

    texts = [text for text in (text_top, text_mid, text_bot) if text]
    story = '''

────────────────────────────────

'''.join(texts)

    text = '''────────────────────────────────

  ◆  {}

────────────────────────────────


{}'''.format(page.title, story)
    return text
def get_pages(url, cw=None):
    print_ = get_print(cw)
    url = get_main(url).replace('comic.naver.', 'm.comic.naver.')
    id = get_id(url)
    print('id:', id)
    print(url)
    html = downloader.read_html(url)
    soup = Soup(html)
    try:
        info = soup.find('div', class_='area_info')
        artist = info.find('span', class_='author').text.strip()
    except Exception as e:
        print(e)
        try:
            title = '\n'.join(soup.find('div', class_='title').text.strip().split('\n')[:-1]).strip()
        except:
            title = 'artist not found'
        raise Exception(title)
    print('artist:', artist)
    title = soup.find('meta', {'property': 'og:title'}).attrs['content']
    pages = []
    nos = set()
    for p in range(1, 100):
        if p == 1:
            url_page = url
        else:
            url_page = set_page(url, p)
            html = downloader.read_html(url_page)
            print('read page:', url_page)
        soup = Soup(html)
        view = soup.findAll('ul', class_='section_episode_list')[-1]
        for lst in view.findAll('li'):
            url_page = urljoin(url, lst.find('a').attrs['href'])
            if 'detail.nhn' not in url_page.lower():
                continue
            print_('url_page: {}'.format(url_page))
            text = lst.find('strong', class_='title').find('span', class_='name').text.strip()
            no = int(re.findall('[?&]no=([0-9]+)', url_page)[0])
            if no in nos:
                print('duplicate no: {}'.format(no))
                continue
            nos.add(no)
            text = '{:04} - {}'.format(no, text)
            page = Page(url_page, text, p)
            pages.append(page)
        btn_next = soup.find('a', class_='btn_next')
        if btn_next is None or btn_next.attrs['href'] == '#':
            print('end of page')
            break
    info = Info(id, title, artist)
    return (info, pages)
def get_video(url, session, cw):
    print_ = get_print(cw)
    html = downloader.read_html(url, session=session)
    if "document.location.href='https://login." in html:
        raise errors.LoginRequired()
    soup = Soup(html)
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    print_('url_thumb: {}'.format(url_thumb))
    params = re.find('VodParameter *= *[\'"]([^\'"]+)[\'"]', html, err='No VodParameter')
    params += '&adultView=ADULT_VIEW&_={}'.format(int(time()*1000))
    url_xml = 'http://stbbs.afreecatv.com:8080/api/video/get_video_info.php?' + params
    print(url_xml)
    html = downloader.read_html(url_xml, session=session, referer=url)
    soup = Soup(html)
    if '<flag>PARTIAL_ADULT</flag>' in html:
        raise errors.LoginRequired()
    title = soup.find('title').string.strip()
    urls_m3u8 = re.findall('https?://[^>]+playlist.m3u8', html)
    if not urls_m3u8:
        raise Exception('no m3u8')
    streams = []
    for url_m3u8 in urls_m3u8:
        try:
            stream = _get_stream(url_m3u8)
        except Exception as e:
            print(e)
            continue #2193
        streams.append(stream)
    for stream in streams[1:]:
        streams[0] += stream
    stream = streams[0]
    id = url.split('/')[-1].split('?')[0].split('#')[0]
    video = Video(stream, url, id, title, url_thumb)
    return video
def get_video(url, session):
    while url.strip().endswith('/'):
        url = url[:-1]
    html = downloader.read_html(url, session=session)
    soup = Soup(html)
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    params = re.findall('VodParameter *= *[\'"]([^\'"]+)[\'"]', html)[0]
    params += '&adultView=ADULT_VIEW&_={}'.format(int(time() * 1000))
    url_xml = 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php?' + params
    print(url_xml)
    html = downloader.read_html(url_xml, session=session, referer=url)
    soup = Soup(html)
    title = soup.find('title').string.strip()
    urls_m3u8 = re.findall('https?://[^>]+playlist.m3u8', html)
    if not urls_m3u8:
        raise Exception('no m3u8')
    streams = []
    for url_m3u8 in urls_m3u8:
        try:
            stream = _get_stream(url_m3u8)
        except Exception as e:
            print(e)
            continue #2193
        streams.append(stream)
    for stream in streams[1:]:
        streams[0] += stream
    stream = streams[0]
    id = url.split('/')[-1].split('?')[0].split('#')[0]
    video = Video(stream, url, id, title, url_thumb)
    return video
def get_info(url, soup=None):
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)

    info = {}
    info['title'] = soup.find('h1', id='workTitle').text.strip()
    info['artist'] = soup.find('span', id='workAuthor-activityName').text.strip()

    desc = soup.find('section', id='description')
    button = desc.find('span', class_='ui-truncateTextButton-expandButton')
    if button:
        print('decompose button')
        button.decompose()
    catch = desc.find('span', id='catchphrase-body').text.strip()
    intro = desc.find('p', id='introduction').text.strip()
    desc = u' {}\n\n\n{}'.format(catch, intro)
    info['description'] = desc

    pages = []
    for a in soup.findAll('a', class_='widget-toc-episode-episodeTitle'):
        href = urljoin(url, a.attrs['href'])
        subtitle = a.find('span', class_='widget-toc-episode-titleLabel').text.strip()
        date = a.find('time', class_='widget-toc-episode-datePublished').text.strip()
        page = Page(href, subtitle, date, len(pages) + 1)
        pages.append(page)
    info['pages'] = pages
    return info
def get_imgs_page(id_art, session, date=None, cw=None):
    print_ = get_print(cw)
    url_json = 'https://www.artstation.com/projects/{}.json'.format(id_art)
    post_url = 'https://www.artstation.com/artwork/{}'.format(id_art)
    try:
        html = downloader.read_html(url_json, session=session, referer=post_url)
        data = json.loads(html)
        imgs_ = data['assets']
    except Exception as e:
        print_(print_error(e)[-1])
        return []

    if date is None:
        date = data['created_at'][2:10]

    imgs = []
    for page, img in enumerate(imgs_):
        if not img['has_image']:
            print('no img')
            continue
        url = None
        video = None
        embed = img.get('player_embedded')
        if embed:
            soup = Soup(embed)
            url_embed = soup.find('iframe').attrs['src']
            print_('embed: {}'.format(url_embed))
            try:
                html = downloader.read_html(url_embed, session=session, referer=post_url)
                soup = Soup(html)
                url = soup.find('video').find('source').attrs['src']
            except Exception as e:
                pass
            if not url:
                try:
                    url = soup.find('link', {'rel': 'canonical'}).attrs['href']
                    print_('YouTube: {}'.format(url))
                    raise Exception('YouTube')
##                    from extractor import youtube_downloader
##                    video = youtube_downloader.Video(url, cw=cw)
                except Exception as e:
                    print(e)
                    url = None
        if not url:
            url = img['image_url']

        if video:
            img = video
        else:
            img = Image(post_url, date, url, page)
        img.data = data #
        imgs.append(img)

    return imgs
def get_imgs(page, cw=None):
    print_ = get_print(cw)
    html = downloader.read_html(page.url)
    soup = Soup(html)

    type_ = re.find('''webtoonType *: *['"](.+?)['"]''', html)
    print_('type: {}'.format(type_))

    imgs = []
    if type_ == 'DEFAULT': # https://m.comic.naver.com/webtoon/detail.nhn?titleId=715772
        view = soup.find('div', class_='toon_view_lst')
        for img in view.findAll('img'):
            img = img.attrs.get('data-src')
            if not img:
                continue
            img = urljoin(page.url, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)
    elif type_ == 'CUTTOON': # https://m.comic.naver.com/webtoon/detail.nhn?titleId=752803
        view = soup.find('div', class_='swiper-wrapper')
        for div in view.findAll('div', class_='swiper-slide'):
            if div.parent != view:
                continue
            if div.find('div', class_='cut_viewer_last'):
                print('cut_viewer_last')
                continue
            if div.find('div', class_='cut_viewer_recomm'):
                print('cut_viewer_recomm')
                continue
            img = div.find('img')
            img = img.attrs['data-src']
            img = urljoin(page.url, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)
    elif type_ == 'EFFECTTOON': #2313; https://m.comic.naver.com/webtoon/detail.nhn?titleId=670144
        img_base = re.find('''imageUrl *: *['"](.+?)['"]''', html) + '/'
        print('img_base:', img_base)
        url_api = re.find('''documentUrl *: *['"](.+?)['"]''', html)
        data_raw = downloader.read_html(url_api, page.url)
        data = json.loads(data_raw)
        for img in data['assets']['stillcut'].values(): # ordered in python3.7+
            img = urljoin(img_base, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)
    else:
        _imgs = re.findall('sImageUrl *: *[\'"](.+?)[\'"]', html)
        if not _imgs:
            raise Exception('no imgs')
        for img in _imgs:
            img = urljoin(page.url, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)

    return imgs
def get_video(url, soup=None):
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)

    video = soup.find('video', id='vjsplayer').find('source').attrs['src']
    url_thumb = soup.find('video', id='vjsplayer').attrs['poster']
    title = get_title(soup)
    filename = u'{}.mp4'.format(clean_title(title))
    video = Video(video, url_thumb, url, filename)
    return video
def get_info(url, soup=None):
    if soup is None:
        html = read_html(url)
        soup = Soup(html)

    info = {}
    info['artist'] = soup.find('span', {'itemprop': 'author'}).text.strip()
    info['title'] = soup.find('span', {'itemprop': 'name'}).text.strip()
    sss = get_sss(soup)
    info['novel_ex'] = get_text(sss[-2], '')
    return info
def read(self) -> None:
    response = requests.get(self.url)
    soup = Soup(response.text)

    self.title = soup.find("title").text
    image_element_list = soup.find("div", class_="article-content").findAll("img")
    for image_element in image_element_list:
        self.urls.append(image_element["src"])
def get_video(url_page):
    id = get_id(url_page)
    html = downloader.read_html(url_page)
    soup = Soup(html, unescape=True)
    name = soup.find('title').text.replace('- XVIDEOS.COM', '').strip()
    print('name:', name)
    url = re.find('.setVideoHLS\\([\'"](.+?)[\'"]\\)', html)
    print(url)
    ext = os.path.splitext(url.split('?')[0])[1]
    if ext.lower() == '.m3u8':
        url = playlist2stream(url, n_thread=5)
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    video = Video(url, url_page, id, name, url_thumb)
    return video
def get(self, url):
    if self._url_video:
        return self._url_video
    cw = self.cw
    print_ = get_print(cw)

    html = downloader.read_html(url)
    soup = Soup(html)

    embedUrl = extract('embedUrl', html, cw)
    if embedUrl:
        raise EmbedUrlError('[pandoratv] EmbedUrl: {}'.format(embedUrl))

    uid = extract('strLocalChUserId', html, cw)
    pid = extract('nLocalPrgId', html, cw)
    fid = extract('strFid', html, cw)
    resolType = extract('strResolType', html, cw)
    resolArr = extract('strResolArr', html, cw)
    vodSvr = extract('nVodSvr', html, cw)
    resols = extract('nInfo', html, cw)
    runtime = extract('runtime', html, cw)

    url_api = 'http://www.pandora.tv/external/getExternalApi/getVodUrl/'
    data = {
        'userId': uid,
        'prgId': pid,
        'fid': fid,
        'resolType': resolType,
        'resolArr': ','.join(map(str, resolArr)),
        'vodSvr': vodSvr,
        'resol': max(resols),
        'runtime': runtime,
        'tvbox': 'false',
        'defResol': 'true',
        'embed': 'false',
    }
    session = Session()
    r = session.post(url_api, headers={'Referer': url}, data=data)
    data = json.loads(r.text)
    self._url_video = data['src']

    self.title = soup.find('meta', {'property': 'og:description'})['content']
    ext = get_ext(self._url_video)
    self.filename = format_filename(self.title, pid, ext)

    self.url_thumb = soup.find('meta', {'property': 'og:image'})['content']
    self.thumb = BytesIO()
    downloader.download(self.url_thumb, buffer=self.thumb)

    return self._url_video
def read(self):
    if '/video/' in self.url:
        res = clf2.solve(self.url, session=self.session, cw=self.cw)
        soup = Soup(res['html'])
        title = soup.find('h1', id='post_title').text.strip()
        self.title = title
        view = soup.find('div', id='post')
        video = view.find('video')
        src = video.find('source')['src']
        src = urljoin(self.url, src)
        video = Video(src, self.url, title, self.session)
        self.urls.append(video.url)
        self.single = True
        return
    if '/image/' not in self.url:
        raise NotImplementedError('Not a post')

    res = clf2.solve(self.url, session=self.session, cw=self.cw)
    soup = Soup(res['html'])
    title = soup.find('h2').text
    paginator = soup.find('div', id='paginator')
    pages = [self.url]
    for a in paginator.findAll('a'):
        href = a.get('href')
        if not href:
            continue
        href = urljoin(self.url, href)
        if href not in pages:
            pages.append(href)

    imgs = []
    for i, page in enumerate(pages):
        if page == self.url:
            soup_page = soup
        else:
            soup_page = downloader.read_soup(page, session=self.session)
        view = soup_page.find('div', id='post')
        for img in view.findAll('img'):
            href = img.parent['href']
            href = urljoin(page, href)
            img = Image(href, page, len(imgs), self.session)
            imgs.append(img)
        self.cw.setTitle('{} {} ({} / {})'.format(tr_('읽는 중...'), title, i+1, len(pages)))

    for img in imgs:
        self.urls.append(img.url)
    self.title = clean_title(title)
class Downloader_asiansister(Downloader):
    type = 'asiansister'
    URLS = ['asiansister.com']

    @try_n(4)
    def init(self):
        self.url = self.url.replace('asiansister_', '')
        html = downloader.read_html(self.url)
        self.soup = Soup(html)

    @property
    def name(self):
        return clean_title(self.soup.find('title').text.replace('- ASIANSISTER.COM', '').strip())

    def read(self):
        imgs = get_imgs(self.url, self.soup, self.name)

        for img in imgs:
            if img.type == 'video':
                self.single = True
            self.urls.append(img.url)

        self.title = self.name
def get_info(url, soup=None):
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)

    info = {}
    info['id'] = get_id(url)
    title = soup.find('h1').text.strip()
    info['title'] = title
    for tag in soup.findAll('span', class_='tag'):
        href = tag.parent.attrs['href']
        href = urljoin(url, href).strip('/')
        key = href.split('/')[3]
        value = href.split('/')[-1]
        if key == 'language' and value == 'translated':
            continue
        if key in info:
            info[key].append(value)
        else:
            info[key] = [value]
    for key in ['artists', 'groups', 'parodies', 'tags', 'characters']:
        if key not in info:
            info[key] = []
    return info
def read(self):
    session = self.__get_cookie()
    f = BytesIO()

    title_element = self.soup.find("b", {"class": "cut_line_one"})
    # Maybe NavigableString?
    assert isinstance(title_element, Tag)
    self.title = title_element.text

    # css selecter is not working :(
    ep_num = self.soup.find(
        "span",
        {
            "style": "background-color:rgba(155,155,155,0.5);padding: 1px 6px;border-radius: 3px;font-size: 11px; margin-right: 3px;"
        },
    )
    assert isinstance(ep_num, Tag)

    ep_name = self.soup.find("span", {"class": "cut_line_one"})
    assert isinstance(ep_name, Tag)

    # Dirty but for clean filename
    replaced_name = ep_name.text.replace(ep_num.text, "", 1)
    self.filenames[f] = clean_title(f"[{ep_num.text}] {replaced_name}.txt", "safe")

    # https://novelpia.com/viewer/:number:
    numbers: List[str] = []
    numbers.append(self.__get_number(self.parsed_url[2]))

    # Get real contents
    # https://novelpia.com/proc/viewer_data/:number:
    # {"s": [{"text": ""}]}
    viewer_datas = map(lambda number: f"https://novelpia.com/proc/viewer_data/{number}", numbers)
    for viewer_data in viewer_datas:
        response = session.get(viewer_data)
        if response.text:
            response = response.json()
            for text_dict in response["s"]:
                text = text_dict["text"]
                if "img" in text:
                    soup = Soup(text)
                    img = soup.find("img")
                    # Maybe NavigableString here too?
                    assert isinstance(img, Tag)
                    src = img.attrs["src"]
                    filename = img.attrs["data-filename"]
                    f.write(f"[{filename}]".encode("UTF-8"))
                    self.urls.append(f"https:{src}")
                    self.filenames[f"https:{src}"] = filename
                else:
                    f.write(text_dict["text"].encode("UTF-8"))
            f.seek(0)
            self.urls.append(f)
        else:
            self.print_(f"{viewer_data} 해당 작품은 로그인이 필요합니다.")
def read_channel(url, cw=None):
    print_ = get_print(cw)
    username = url.split('/users/')[1].split('/')[0]

    info = {}
    html = downloader.read_html(url)
    soup = Soup(html)
    title = soup.find('div', class_='user-name').text.strip()
    info['title'] = u'[Channel] {}'.format(title)

    items = []
    for p in range(1, 21):
        url = 'https://xhamster.com/users/{}/videos/{}'.format(username, p)
        print_(url)
        html = downloader.read_html(url)
        soup = Soup(html)
        items_ = soup.findAll('div', class_='thumb-list__item')
        if not items_:
            print('no items')
            break
        for item in items_:
            items.append(item)

    urls = []
    for item in items:
        url = item.a.attrs['href']
        if url in urls:
            print('duplicate:', url)
            continue
        urls.append(url)

    info['urls'] = urls
    return info
def get_content(self):
    if self.content != "" and self.type == "blog":
        return self.content
    soup = Soup(self.url)
    # extract blog content
    self.content += soup.find("div", {"class": "content_body"}).get_text()
    return self.content
def get_artist(userid, soup=None):
    if soup is None:
        url = u'https://www.wikiart.org/en/{}'.format(userid)
        html = downloader.read_html(url)
        soup = Soup(html)
    return soup.find('h3').text.strip()
def get(self, url_page):
    if not self._url:
        id = get_id(url_page)
        html = downloader.read_html(url_page)
        soup = Soup(html, unescape=True)
        self.title = soup.find('title').text.replace('- XVIDEOS.COM', '').strip()
        url = re.find(r'''.setVideoHLS\(['"](.+?)['"]\)''', html)
        ext = get_ext(url)
        if ext.lower() == '.m3u8':
            url = playlist2stream(url, n_thread=5)
        url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
        self.thumb = BytesIO()
        downloader.download(url_thumb, buffer=self.thumb)
        self.filename = format_filename(self.title, id, '.mp4')
        self._url = url
    return self._url
def get_info(self):
    '''
    Fetch the student's basic information via the logged-in session.
    :param sess:
    :return: student information
    '''
    ifo_url = 'http://219.242.68.33/xuesheng/xsxx.aspx'
    soup = Soup(self.http_request.session, ifo_url)
    data = {}
    data['a.姓名'] = soup.find(id="ctl00_ContentPlaceHolder1_lblXm").text
    data['b.身份证号'] = soup.find(id="ctl00_ContentPlaceHolder1_lblSfz").text
    data['c.学号'] = soup.find(id="ctl00_ContentPlaceHolder1_lblXh").text
    data['d.班级'] = soup.find(id="ctl00_ContentPlaceHolder1_className").text
    data['e.院系'] = soup.find(id="ctl00_ContentPlaceHolder1_collegeName").text
    if self.display is True:
        tabletitle = [item[2:] for item in sorted(data.keys())]
        cont = [data[item] for item in sorted(data.keys())]
        table_print(tabletitle, cont)
    return data
def trending(self):
    all_trending = []
    # get soup
    soup = Soup(self.BASE_URL + "/blog")
    # locate the html tags
    for a in soup.find("nav", {"class": "blog-trending"}).findAll("a"):
        # construct blog object
        i = Item(self.BASE_URL + a.get("href"), a.get_text())
        i.type = "blog"
        all_trending.append(i)
    return all_trending
def newest(self):
    newest_posts = []
    # compose url
    url = self.BASE_URL + BLOG
    soup = Soup(url)
    a_tags = soup.find("div", {"class": "blog_posts-list"}).findAll("a")
    i = 0
    for a_tag in a_tags:
        url = self.BASE_URL + a_tag.get("href")
        title = a_tag.get_text()
        if i % 10 == 1:
            item = Item(url, title)
            item.type = "blog"
            newest_posts.append(item)
        i += 1
    return newest_posts
def topic(self, topic):
    if topic not in topics:
        return "Topic not Found"
    posts = []
    url = topics[topic]
    soup = Soup(url)
    a_tags = soup.find("div", {"class": "blog_posts-list"}).findAll("a")
    i = 0
    for a_tag in a_tags:
        url = self.BASE_URL + a_tag.get("href")
        title = a_tag.get_text()
        if i % 10 == 1:
            item = Item(url, title)
            item.type = "blog"
            posts.append(item)
        i += 1
    return posts