def get_latest_build_date(self):
    """Returns date of latest available nightly build."""
    if self.application not in ('fennec',):
        url = urljoin(self.base_url, 'nightly', 'latest-%s/' % self.branch)
    else:
        url = urljoin(self.base_url, 'nightly',
                      'latest-%s-%s/' % (self.branch, self.platform))

    self.logger.info('Retrieving the build status file from %s' % url)
    parser = DirectoryParser(url, authentication=self.authentication,
                             timeout=self.timeout_network)
    parser.entries = parser.filter(r'.*%s\.txt' % self.platform_regex)
    if not parser.entries:
        message = 'Status file for %s build cannot be found' % \
            self.platform_regex
        raise errors.NotFoundError(message, url)

    # Read status file for the platform, retrieve build id,
    # and convert to a date
    headers = {'Cache-Control': 'max-age=0'}

    r = requests.get(url + parser.entries[-1],
                     auth=self.authentication, headers=headers)
    try:
        r.raise_for_status()
        return datetime.strptime(r.text.split('\n')[0], '%Y%m%d%H%M%S')
    finally:
        r.close()
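# Note: several snippets in this collection call urljoin with more than two
# arguments (e.g. urljoin(self.base_url, 'nightly', 'latest-%s/' % self.branch)
# above, or utils.urljoin(self.baseurl, self.docid + "/", 'bulkupsert') below),
# which the standard-library urllib.parse.urljoin does not support. They rely
# on a project-local helper. A minimal sketch of such a multi-segment helper,
# assuming it only needs to join path fragments with single slashes (the real
# implementations may handle more edge cases):
def urljoin(*fragments):
    """Join URL fragments with exactly one slash at each seam (sketch)."""
    # Drop trailing slashes on all but the last fragment so that
    # '/'.join() does not produce double slashes.
    parts = [f.rstrip('/') for f in fragments[:-1]]
    parts.append(fragments[-1])
    return '/'.join(parts)

# e.g. urljoin('https://example.org/pub/', 'nightly', 'latest-central/')
#      -> 'https://example.org/pub/nightly/latest-central/'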
def get_info(url, cw=None):
    print_ = get_print(cw)
    info = {}

    html = downloader.read_html(url)
    if '"cafe_cautionpage"' in html:
        raise errors.LoginRequired()
    url_article = re.find(r'''//cafe\.naver\.com/ArticleRead\.nhn\?articleid=[0-9]+&clubid=[0-9]+''', html, err='no iframe')
    url_article = urljoin(url, url_article)
    print_(url_article)

    articleid = re.find(r'articleid=([0-9]+)', url_article)
    clubid = re.find(r'clubid=([0-9]+)', url_article)
    url_api = f'https://apis.naver.com/cafe-web/cafe-articleapi/v2/cafes/{clubid}/articles/{articleid}?query=&useCafeId=true&requestFrom=A'
    j = downloader.read_json(url_api, url)

    info['title'] = j['result']['article']['subject']
    info['cafename'] = j['result']['cafe']['url']
    info['cafeid'] = clubid
    info['id'] = articleid

    html_content = j['result']['article']['contentHtml']
    soup = Soup(html_content)

    imgs = []

    pairs = []
    for video in soup.findAll('span', class_='_naverVideo'):
        vid = video.attrs['vid']
        key = video.attrs['key']
        pairs.append((vid, key))

    for script in soup.findAll('script', class_='__se_module_data'):
        data_raw = script['data-module']
        data = json.loads(data_raw)['data']
        vid = data.get('vid')
        if not vid:
            continue
        key = data['inkey']
        pairs.append((vid, key))

    for vid, key in pairs:
        url_api = 'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{}?key={}'.format(vid, key)
        data_raw = downloader.read_html(url_api)
        data = json.loads(data_raw)
        fs = data['videos']['list']
        fs = sorted(fs, key=lambda f: f['size'], reverse=True)
        video = Image(fs[0]['source'], url_article, len(imgs))
        imgs.append(video)

    for img in soup.findAll('img'):
        img = Image(urljoin(url_article, img['src']), url, len(imgs))
        imgs.append(img)

    info['imgs'] = imgs

    return info
def fix_url(url, session=None, cw=None):
    print_ = get_print(cw)
    if '&manga_name=' not in url:
        return url
    print_('fix url')
    qs = query_url(url)
    name = qs['manga_name'][0].replace('+', ' ')
    url_search = urljoin(url, '/bbs/search.php')
    url_search = update_url_query(url_search, {'stx': [name]})
    print(url_search)
    html = read_html(url_search, session=session)
    soup = Soup(html)
    posts = soup.findAll('div', class_='post-row')
    print_('posts: {}'.format(len(posts)))
    if len(posts) != 1:
        return url
    for a in posts[0].findAll('a'):
        href = urljoin(url, a.attrs['href'])
        if 'manga_detail' in href:
            break
    else:
        raise Exception('Failed to find link')
    if cw is not None:
        cw.gal_num = href
    return href
def get_imgs(url, soup=None, name=None):
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)

    view = soup.findAll('div', class_='rootContant')[:2][-1]

    v = view.find('video')
    if v:
        img = v.find('source').attrs['src']
        img = urljoin(url, img)
        img = Image(img, url, 0, 'video')
        ext = os.path.splitext(img.url().split('?')[0])[1]
        img.filename = u'{}{}'.format(name, ext)
        return [img]

    imgs = []
    for img in view.findAll('img'):
        img = img.attrs['dataurl']
        img = urljoin(url, img)
        img = re.sub('/[a-z]+images/', '/images/', img).replace('_t.', '.')
        img = Image(img, url, len(imgs))
        imgs.append(img)

    return imgs
def get_files(url, session, multi_post=False, cw=None):
    print_ = get_print(cw)
    html = read_html(url, session=session)
    soup = Soup(html)
    h = soup.find('h1', class_='title')
    content = h.parent.parent.parent
    title = h.text.strip()
    youtube = content.find('div', class_='embedded-video')
    video = content.find('video')
    if youtube:
        type = 'youtube'
    elif video:
        type = 'video'
    else:
        type = 'image'
    print_('type: {}'.format(type))
    files = []
    if type == 'image':
        urls = set()
        for img in content.findAll('img'):
            img = urljoin(url, img.parent.attrs['href'])
            if '/files/' not in img:
                continue
            if img in urls:
                print('duplicate')
                continue
            urls.add(img)
            file = File(type, img, title, url, len(files), multi_post=multi_post)
            files.append(file)
    elif type == 'youtube':
        src = urljoin(url, youtube.find('iframe').attrs['src'])
        file = File(type, src, title, url)
        files.append(file)
    elif type == 'video':
        url_thumb = urljoin(url, video.attrs['poster'])
        print('url_thumb:', url_thumb)
        id = re.find('videos/([0-9a-zA-Z_-]+)', url, err='no video id')
        url_data = urljoin(url, '/api/video/{}'.format(id))
        s_json = read_html(url_data, url, session=session)
        data = json.loads(s_json)
        video = data[0]
        url_video = urljoin(url, video['uri'])
        file = File(type, url_video, title, url)
        file.url_thumb = url_thumb
        file.thumb = BytesIO()
        downloader.download(url_thumb, buffer=file.thumb, referer=url)
        files.append(file)
    else:
        raise NotImplementedError(type)
    return files
def get_imgs(page, cw=None):
    print_ = get_print(cw)
    html = downloader.read_html(page.url)
    soup = Soup(html)

    type_ = re.find('''webtoonType *: *['"](.+?)['"]''', html)
    print_('type: {}'.format(type_))

    imgs = []
    if type_ == 'DEFAULT':  # https://m.comic.naver.com/webtoon/detail.nhn?titleId=715772
        view = soup.find('div', class_='toon_view_lst')
        for img in view.findAll('img'):
            img = img.attrs.get('data-src')
            if not img:
                continue
            img = urljoin(page.url, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)
    elif type_ == 'CUTTOON':  # https://m.comic.naver.com/webtoon/detail.nhn?titleId=752803
        view = soup.find('div', class_='swiper-wrapper')
        for div in view.findAll('div', class_='swiper-slide'):
            if div.parent != view:
                continue
            if div.find('div', class_='cut_viewer_last'):
                print('cut_viewer_last')
                continue
            if div.find('div', class_='cut_viewer_recomm'):
                print('cut_viewer_recomm')
                continue
            img = div.find('img')
            img = img.attrs['data-src']
            img = urljoin(page.url, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)
    elif type_ == 'EFFECTTOON':  # 2313; https://m.comic.naver.com/webtoon/detail.nhn?titleId=670144
        img_base = re.find('''imageUrl *: *['"](.+?)['"]''', html) + '/'
        print('img_base:', img_base)
        url_api = re.find('''documentUrl *: *['"](.+?)['"]''', html)
        data_raw = downloader.read_html(url_api, page.url)
        data = json.loads(data_raw)
        for img in data['assets']['stillcut'].values():  # ordered in python3.7+
            img = urljoin(img_base, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)
    else:
        _imgs = re.findall('sImageUrl *: *[\'"](.+?)[\'"]', html)
        if not _imgs:
            raise Exception('no imgs')
        for img in _imgs:
            img = urljoin(page.url, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)

    return imgs
def path_regex(self):
    """Return the regex for the path"""

    try:
        path = urljoin(self.monthly_build_list_regex,
                       self.builds[self.build_index])
        return path
    except:
        folder = urljoin(self.base_url, self.monthly_build_list_regex)
        raise NotFoundError("Specified sub folder cannot be found",
                            folder)
def read_channel(url_page, cw=None):
    print_ = get_print(cw)
    res = re.find(CHANNEL_PATTERN, url_page)
    if res is None:
        raise Exception('Not channel')
    header, username = res
    print(header, username)
    max_pid = get_max_range(cw)
    info = {}
    info['header'] = header
    info['username'] = username

    session = Session()
    urls = []
    ids = set()
    for p in range(100):
        url_api = urljoin(url_page, '/{}/{}/videos/best/{}'.format(header, username, p))
        print_(url_api)
        r = session.post(url_api)
        data = json.loads(r.text)
        videos = data.get('videos')  # 4530
        if not videos:
            print_('empty')
            break
        for video in videos:
            id_ = video['id']
            if id_ in ids:
                print_('duplicate: {}'.format(id_))
                continue
            ids.add(id_)
            info['name'] = video['pn']
            urls.append(urljoin(url_page, video['u']))
        if len(urls) >= max_pid:
            break
        n = data['nb_videos']
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['name'], len(urls))
        if cw:
            cw.setTitle(s)
        else:
            print(s)
        if len(ids) >= n:
            break
        sleep(1, cw)
    if not urls:
        raise Exception('no videos')
    info['urls'] = urls[:max_pid]
    return info
def path_regex(self):
    """Return the regex for the path"""

    try:
        path = urljoin(self.monthly_build_list_regex,
                       self.builds[self.build_index])
        if self.application in MULTI_LOCALE_APPLICATIONS \
                and self.locale != 'multi':
            path = urljoin(path, self.locale)
        return path
    except:
        folder = urljoin(self.base_url, self.monthly_build_list_regex)
        raise NotFoundError("Specified sub folder cannot be found",
                            folder)
def path_regex(self):
    """Return the regex for the path"""

    try:
        path = urljoin(self.monthly_build_list_regex,
                       self.builds[self.build_index])
        if self.application in APPLICATIONS_MULTI_LOCALE \
                and self.locale != 'multi':
            path = urljoin(path, self.locale)
        return path
    except:
        folder = urljoin(self.base_url, self.monthly_build_list_regex)
        raise errors.NotFoundError("Specified sub folder cannot be found",
                                   folder)
def read(self):
    if '/video/' in self.url:
        res = clf2.solve(self.url, session=self.session, cw=self.cw)
        soup = Soup(res['html'])
        title = soup.find('h1', id='post_title').text.strip()
        self.title = title
        view = soup.find('div', id='post')
        video = view.find('video')
        src = video.find('source')['src']
        src = urljoin(self.url, src)
        video = Video(src, self.url, title, self.session)
        self.urls.append(video.url)
        self.single = True
        return

    if '/image/' not in self.url:
        raise NotImplementedError('Not a post')

    res = clf2.solve(self.url, session=self.session, cw=self.cw)
    soup = Soup(res['html'])
    title = soup.find('h2').text
    paginator = soup.find('div', id='paginator')
    pages = [self.url]
    for a in paginator.findAll('a'):
        href = a.get('href')
        if not href:
            continue
        href = urljoin(self.url, href)
        if href not in pages:
            pages.append(href)

    imgs = []
    for i, page in enumerate(pages):
        if page == self.url:
            soup_page = soup
        else:
            soup_page = downloader.read_soup(page, session=self.session)
        view = soup_page.find('div', id='post')
        for img in view.findAll('img'):
            href = img.parent['href']
            href = urljoin(page, href)
            img = Image(href, page, len(imgs), self.session)
            imgs.append(img)
        self.cw.setTitle('{} {} ({} / {})'.format(tr_('읽는 중...'), title, i + 1, len(pages)))

    for img in imgs:
        self.urls.append(img.url)

    self.title = clean_title(title)
def read_channel(url_page, cw=None):
    print_ = get_print(cw)
    res = re.find(CHANNEL_PATTERN, url_page)
    if res is None:
        raise Exception('Not channel')
    header, username = res
    print(header, username)
    max_pid = get_max_range(cw, 2000)
    info = {}
    info['header'] = header
    info['username'] = username

    session = Session()
    urls = []
    urls_set = set()
    for p in range(100):
        url_api = urljoin(url_page, '/{}/{}/videos/best/{}'.format(header, username, p))
        print(url_api)
        r = session.post(url_api, data='main_cats=false')
        soup = Soup(r.text)
        thumbs = soup.findAll('div', class_='thumb-block')
        if not thumbs:
            print_('empty')
            break
        for thumb in thumbs:
            info['name'] = thumb.find('span', class_='name').text.strip()
            href = thumb.find('a')['href']
            href = urljoin(url_page, href)
            if href in urls_set:
                print_('duplicate: {}'.format(href))
                continue
            urls_set.add(href)
            urls.append(href)
        if len(urls) >= max_pid:
            break
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['name'], len(urls))
        if cw:
            if not cw.alive:
                return
            cw.setTitle(s)
        else:
            print(s)
    if not urls:
        raise Exception('no videos')
    info['urls'] = urls[:max_pid]
    return info
def get_pages(soup, url):
    pages = []
    hrefs = set()
    titles = set()
    for a in soup.findAll(lambda tag: tag.name == 'a' and '/viewer/stories/' in tag.get('href', ''))[::-1]:
        href = urljoin(url, a.attrs['href'])
        if href in hrefs:
            continue
        hrefs.add(href)
        divs = a.findAll('div', recursive=False)
        if len(divs) < 2:
            continue
        right = divs[1]
        number = right.findAll('span')[0].text.strip()
        title = right.findAll('span')[1].text.strip()
        title = ' - '.join(x for x in [number, title] if x)
        if title in titles:
            title0 = title
            i = 2
            while title in titles:
                title = title0 + ' ({})'.format(i)
                i += 1
        titles.add(title)
        page = Page(href, title)
        pages.append(page)
    if not pages:
        raise Exception('no pages')
    return pages
def get_video(url):
    html = downloader.read_html(url)
    soup = Soup(html)

    view = soup.find('div', id='player-container-fluid')
    src_best = None
    res_best = -1
    for source in view.findAll('source'):
        src = urljoin(url, source.attrs['src'])
        res = re.find('([0-9]+)p', source.attrs['title'])
        res = int(res) if res else 0
        if res > res_best:
            src_best = src
            res_best = res

    if src_best is None:
        raise Exception('No source')

    title = soup.find('h1').text.strip()
    id = soup.find('div', id='video').attrs['data-id']
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    #src_best = downloader.real_url(src_best)
    video = Video(src_best, url_thumb, url, title, id)
    return video
def generic_iter_pages(start_url, page_parser_cls, request_fn: RequestFunction) -> Iterator[Page]:
    next_url = start_url

    while next_url:
        logger.debug("Requesting page from: %s", next_url)
        response = request_fn(next_url)

        logger.debug("Parsing page response")
        parser = page_parser_cls(response)

        page = parser.get_page()

        # TODO: If page is actually an iterable calling len(page) might consume it
        logger.debug("Got %s raw posts from page", len(page))
        yield page

        logger.debug("Looking for next page URL")
        next_page = parser.get_next_page()
        if next_page:
            next_url = utils.urljoin(FB_MOBILE_BASE_URL, next_page)
            next_url = next_url.replace("&num_to_fetch=4", "&num_to_fetch=20")
        else:
            logger.info("Page parser did not find next page URL")
            next_url = None
def get(self, referer):
    soup = downloader.read_soup(self._url, referer, session=self.session)
    div = soup.find('div', id='display_image_detail')
    url = urljoin(self._url, div.find('img').parent['href'])
    ext = get_ext(url)
    self.filename = '{:04}{}'.format(self._p, ext)
    return url, self._url
def get_build_info_for_index(self, build_index=None):
    url = urljoin(self.base_url, self.build_list_regex)

    self.logger.info('Retrieving list of builds from %s' % url)
    parser = DirectoryParser(url, authentication=self.authentication,
                             timeout=self.timeout_network)
    parser.entries = parser.filter(r'^\d+$')

    if self.timestamp:
        # If a timestamp is given, retrieve the folder with the timestamp
        # as name
        parser.entries = self.timestamp in parser.entries and \
            [self.timestamp]

    elif self.date:
        # If date is given, retrieve the subset of builds on that date
        parser.entries = filter(self.date_matches, parser.entries)

    if not parser.entries:
        message = 'No builds have been found'
        raise NotFoundError(message, url)

    self.show_matching_builds(parser.entries)

    # If no index has been given, set it to the last build of the day.
    if build_index is None:
        build_index = len(parser.entries) - 1

    return (parser.entries, build_index)
def init(self):
    self.url = clean_url(self.url)
    self.session = Session()

    if re.search(PATTERN_ID, self.url):  # 1799
        select = self.soup.find('select', class_='bookselect')
        for i, op in enumerate(select.findAll('option')[::-1]):
            if 'selected' in op.attrs:
                break
        else:
            raise Exception('no selected option')
        for a in self.soup.findAll('a'):
            url = urljoin(self.url, a.get('href') or '')
            if re.search(PATTERN, url):
                break
        else:
            raise Exception('list not found')
        self.url = self.fix_url(url)
        self._soup = None

        for i, page in enumerate(
                get_pages(self.url, self.session, self.soup)):
            if page.id == int(op['value']):
                break
        else:
            raise Exception('can not find page')
        self.cw.range_p = [i]
def path_regex(self):
    """Return the regex for the path to the build folder"""
    if self.locale_build:
        return self.build_list_regex

    return '%s/' % urljoin(self.build_list_regex,
                           self.builds[self.build_index])
def get_info(url, soup=None):
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)

    info = {}

    info['title'] = soup.find('h1', id='workTitle').text.strip()
    info['artist'] = soup.find('span', id='workAuthor-activityName').text.strip()

    desc = soup.find('section', id='description')
    button = desc.find('span', class_='ui-truncateTextButton-expandButton')
    if button:
        print('decompose button')
        button.decompose()
    catch = desc.find('span', id='catchphrase-body').text.strip()
    intro = desc.find('p', id='introduction').text.strip()
    desc = u' {}\n\n\n{}'.format(catch, intro)
    info['description'] = desc

    pages = []
    for a in soup.findAll('a', class_='widget-toc-episode-episodeTitle'):
        href = urljoin(url, a.attrs['href'])
        subtitle = a.find('span', class_='widget-toc-episode-titleLabel').text.strip()
        date = a.find('time', class_='widget-toc-episode-datePublished').text.strip()
        page = Page(href, subtitle, date, len(pages) + 1)
        pages.append(page)

    info['pages'] = pages

    return info
def get_imgs_page(page, session, cw=None):
    print_ = get_print(cw)
    print_(page.title)

    html = downloader.read_html(page.url, session=session)
    soup = Soup(html)

    view = soup.find('div', class_='chapter-content')
    if not view:
        raise Exception('no chapter-content')

    imgs = []
    for img in soup.findAll('img', class_='chapter-img'):
        src = img.get('data-pagespeed-lazy-src') or img.get(
            'data-src') or img.get('data-srcset') or img.get(
            'data-aload') or img['src']
        try:
            src = base64.b64decode(src).strip().decode('utf8')
        except:
            pass
        src = urljoin(page.url, src)
        if 'Credit_LHScan_' in src or '5e1ad960d67b2_5e1ad962338c7' in src:
            continue
        if 'fe132b3d32acc39f5adcea9075bedad4LoveHeaven' in src:
            continue
        if 'LoveHug_600cfd96e98ff.jpg' in src:
            continue
        img = Image(src.strip(), page, len(imgs))
        imgs.append(img)

    return imgs
def get_pages(url, soup=None, session=None):
    if soup is None:
        html = read_html(url, session=session, cw=None)
        soup = Soup(html)
    pagination = soup.find('div', class_='pagination')
    pages = []
    hrefs = set()
    for a in pagination.findAll('a'):
        href = a.attrs.get('href', '')
        href = urljoin(url, href)
        if not href.startswith(url):
            print('not match', href)
            continue
        while href.endswith('/'):
            href = href[:-1]
        if href in hrefs:
            print('duplicate', href)
            continue
        hrefs.add(href)
        text = a.text.strip()
        page = Page(text, href)
        pages.append(page)
    if url not in hrefs:
        page = Page('1', url, soup)
        pages.insert(0, page)
    return pages
def _call(self, url_api, referer='https://twitter.com', params=None):
    url_api = urljoin('https://api.twitter.com', url_api)
    if params:
        url_api = update_url_query(url_api, params)
    #print('call:', url_api)
    data = downloader.read_json(url_api, referer, session=self.session)
    return data
def __init__(self, destination=None, platform=None,
             application='firefox', locale=None, extension=None,
             username=None, password=None,
             retry_attempts=0, retry_delay=10.,
             is_stub_installer=False, timeout=None,
             log_level='INFO', base_url=BASE_URL):
    # Private properties for caching
    self._filename = None
    self._binary = None

    self.destination = destination or os.getcwd()

    if not locale:
        if application in APPLICATIONS_MULTI_LOCALE:
            self.locale = 'multi'
        else:
            self.locale = 'en-US'
    else:
        self.locale = locale

    self.platform = platform or self.detect_platform()

    self.session = requests.Session()
    if (username, password) != (None, None):
        self.session.auth = (username, password)

    self.retry_attempts = retry_attempts
    self.retry_delay = retry_delay
    self.is_stub_installer = is_stub_installer
    self.timeout_download = timeout
    # this is the timeout used in requests.get. Unlike "auth",
    # it does not work if we attach it on the session, so we handle
    # it independently.
    self.timeout_network = 60.

    logging.basicConfig(format=' %(levelname)s | %(message)s')
    self.logger = logging.getLogger(self.__module__)
    self.logger.setLevel(log_level)
    logging.getLogger('redo').setLevel(logging.INFO)

    # build the base URL
    self.application = application
    self.base_url = '%s/' % urljoin(
        base_url,
        APPLICATIONS_TO_FTP_DIRECTORY.get(self.application, self.application)
    )

    if extension:
        self.extension = extension
    else:
        if self.application in APPLICATIONS_MULTI_LOCALE and \
                self.platform in ('win32', 'win64'):
            # builds for APPLICATIONS_MULTI_LOCALE only exist in zip
            self.extension = 'zip'
        else:
            self.extension = DEFAULT_FILE_EXTENSIONS[self.platform]

    self._retry_check_404(self.get_build_info)
def get_info(url):
    url = downloader.real_url(url)
    q = re.find(r'/comic/([^/?]+)', url)

    url_api = 'https://nhentai.com/api/comics/{}'.format(q)
    data_raw = downloader.read_html(url_api, url)
    data = json.loads(data_raw)

    url_api = 'https://nhentai.com/api/comics/{}/images'.format(q)
    data_raw = downloader.read_html(url_api, url)
    data_images = json.loads(data_raw)

    info = {}
    info['url'] = url
    info['id'] = int(data['id'])
    info['type'] = data['category']['name']
    info['title'] = data['title']
    info['artists'] = [x['name'] for x in data['artists']]
    info['groups'] = [x['name'] for x in data['groups']]
    info['seriess'] = [x['name'] for x in data['parodies']]
    info['lang'] = data['language']['name']

    imgs = []
    for img in data_images['images']:
        img = urljoin(url, img['source_url'])
        img = Image(url, img, len(imgs))
        imgs.append(img)
    info['imgs'] = imgs

    return info
def read_channel(url, type_, cw=None):
    print_ = get_print(cw)
    username = re.find(r'/users/([^/]+)', url, err='no username')

    info = {}
    urls = []
    urls_set = set()
    for p in range(50):
        url = 'https://ecchi.iwara.tv/users/{}/{}?page={}'.format(
            username, type_, p)
        print_(url)
        html = downloader.read_html(url)
        soup = Soup(html)
        if p == 0:
            title = soup.find('h1', class_='page-title').text
            info['title'] = title.replace("'s videos", '').strip()
        view = soup.find('div', class_='view-content')
        if view is None:
            break
        urls_new = []
        for div in view.findAll('div', class_='views-column'):
            href = div.find('a')['href']
            url_video = urljoin(url, href)
            if url_video in urls_set:
                continue
            urls_set.add(url_video)
            urls_new.append(url_video)
        if not urls_new:
            break
        urls += urls_new

    info['urls'] = urls

    return info
def get_build_info_for_date(self, date, has_time=False, build_index=None):
    url = urljoin(self.base_url, self.monthly_build_list_regex)

    self.logger.info('Retrieving list of builds from %s' % url)
    parser = DirectoryParser(url, authentication=self.authentication,
                             timeout=self.timeout_network)
    regex = r'%(DATE)s-(\d+-)+%(BRANCH)s%(L10N)s$' % {
        'DATE': date.strftime('%Y-%m-%d'),
        'BRANCH': self.branch,
        'L10N': '' if self.locale == 'en-US' else '(-l10n)?'}
    parser.entries = parser.filter(regex)
    parser.entries = parser.filter(self.is_build_dir)

    if has_time:
        # If a time is included in the date, use it to determine the
        # build's index
        regex = r'.*%s.*' % date.strftime('%H-%M-%S')
        parser.entries = parser.filter(regex)

    if not parser.entries:
        date_format = '%Y-%m-%d-%H-%M-%S' if has_time else '%Y-%m-%d'
        message = 'Folder for builds on %s has not been found' % \
            self.date.strftime(date_format)
        raise NotFoundError(message, url)

    # If no index has been given, set it to the last build of the day.
    self.show_matching_builds(parser.entries)
    if build_index is None:
        build_index = len(parser.entries) - 1

    return (parser.entries, build_index)
def get_pages(url):
    pages = []
    urls = set()
    for p in range(1, 101):
        url_page = set_page(url, p)
        print(url_page)
        for try_ in range(4):
            try:
                soup = downloader.read_soup(url_page)
                view = soup.find('ul', id='_listUl')
                if view is None:
                    raise Exception('no view')
                break
            except Exception as e:
                e_ = e
                print(e)
        else:
            raise e_
        pages_new = []
        for li in view.findAll('li', recursive=False):
            href = urljoin(url, li.find('a')['href'])
            title = li.find('span', class_='subj').text.strip()
            if href in urls:
                continue
            urls.add(href)
            no = int(li['data-episode-no'])
            title = '{:04} - {}'.format(no, title)
            page = Page(href, title)
            pages_new.append(page)
        if not pages_new:
            break
        pages += pages_new
    return pages[::-1]
def get_build_info_for_index(self, build_index=None):
    url = urljoin(self.base_url, self.build_list_regex)

    self.logger.info('Retrieving list of builds from %s' % url)
    parser = self._create_directory_parser(url)
    parser.entries = parser.filter(r'^\d+$')

    if self.timestamp:
        # If a timestamp is given, retrieve the folder with the timestamp
        # as name
        parser.entries = self.timestamp in parser.entries and \
            [self.timestamp]

    elif self.date:
        # If date is given, retrieve the subset of builds on that date
        parser.entries = filter(self.date_matches, parser.entries)

    if not parser.entries:
        message = 'No builds have been found'
        raise errors.NotFoundError(message, url)

    self.show_matching_builds(parser.entries)

    # If no index has been given, set it to the last build of the day.
    if build_index is None:
        # Find the most recent non-empty entry.
        build_index = len(parser.entries)
        for build in reversed(parser.entries):
            build_index -= 1
            if not build_index or self.is_build_dir(build):
                break

    self.logger.info('Selected build: %s' % parser.entries[build_index])

    return (parser.entries, build_index)
def get_info(url, soup=None):
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)

    info = {}

    info['id'] = get_id(url)
    title = soup.find('h1').text.strip()
    info['title'] = title

    for tag in soup.findAll('span', class_='tag'):
        href = tag.parent.attrs['href']
        href = urljoin(url, href).strip('/')
        key = href.split('/')[3]
        value = href.split('/')[-1]
        if key == 'language' and value == 'translated':
            continue
        if key in info:
            info[key].append(value)
        else:
            info[key] = [value]

    for key in ['artists', 'groups', 'parodies', 'tags', 'characters']:
        if key not in info:
            info[key] = []

    return info
def f(_):
    html = downloader.read_html(url, session=session)
    soup = Soup(html)

    box = soup.find('section', id='picBox')
    img = box.find('img')
    if img is None:
        raise Exception('No img')

    onclick = img.attrs.get('onclick', '')
    if onclick and '.src' in onclick:
        print('onclick', onclick)
        img = re.find('''.src *= *['"](.+?)['"]''', onclick)
    else:
        img = img.attrs['src']
    img = urljoin(url, img)

    filename = clean_title(os.path.basename(img.split('?')[0]))
    name, ext = os.path.splitext(filename)

    # https://www.hentai-foundry.com/pictures/user/DrGraevling/74069/Eversong-Interrogation-pg.-13
    if ext.lower() not in ['.bmp', '.png', '.gif', '.jpg', '.jpeg',
                           '.webp', '.webm', '.avi', '.mp4', '.mkv', '.wmv']:
        filename = u'{}.jpg'.format(name)

    self.filename = filename
    return img
def downloadRawFile ( self , remote , local=None ) :
    """Downloads a remote file to the local system.

    remote - path relative to repository base
    local - Optional local name for the file

    Returns the local file name or False if errors"""

    remote = utils.urljoin( self.base_url() , remote )

    if not local :
        (handle, fname) = tempfile.mkstemp()
    else :
        fname = local
        handle = os.open( fname , os.O_WRONLY | os.O_TRUNC | os.O_CREAT )
    try:
        response = urllib2.urlopen( remote )
        data = response.read(256)
        while data :
            os.write(handle, data)
            data = response.read(256)
        os.close(handle)
    except Exception , ex :
        repolib.logger.error( "Exception : %s" % ex )
        os.close(handle)
        os.unlink(fname)
        return False
    # Download succeeded; hand back the local file name as documented above.
    return fname
def fix_url(cls, url):  # 2377
    m = re.find(r'/board.php\?bo_table=([0-9a-zA-Z_]+)&wr_id=([0-9]+)', url)
    if m:
        return urljoin(url, '/{}/{}'.format(*m))
    return url.split('?')[0]
def init(self):
    self.url = self.url.replace('manatoki_', '')
    self.session, self.soup, url = get_soup(self.url)
    self.url = self.fix_url(url)

    # 2377
    list = self.soup.find(attrs={'data-original-title': '목록'})
    if list:
        url = urljoin(self.url, list.parent['href'])
        nav = self.soup.find('div', class_='toon-nav')
        select = nav.find('select', {'name': 'wr_id'})
        for i, op in enumerate(select.findAll('option')[::-1]):
            if 'selected' in op.attrs:
                break
        else:
            raise Exception('no selected option')
        self.session, self.soup, url = get_soup(url)
        self.url = self.fix_url(url)

        for i, page in enumerate(get_pages(self.url, self.soup)):
            if page.id == int(op['value']):
                break
        else:
            raise Exception('can not find page')
        self.customWidget.range_p = [i]

    self.name
def read_playlist(url, n, cw=None):
    print_ = get_print(cw)
    for header in ['channel', 'user', 'c']:
        if '/{}/'.format(header) in url.lower():
            username = re.find(r'/{}/([^/\?]+)'.format(header), url, re.IGNORECASE)
            url = urljoin(url, '/{}/{}/videos'.format(header, username))

    options = {
        'extract_flat': True,
        'playlistend': n,
    }
    ydl = ytdl.YoutubeDL(options)
    info = ydl.extract_info(url)

    es = info['entries']
    urls = []
    for e in es:
        href = 'https://www.youtube.com/watch?v={}'.format(e['id'])
        urls.append(href)
    info['urls'] = urls

    if 'uploader' not in info:
        title = info['title']
        if title.lower().endswith(' - videos'):
            title = title[:-len(' - videos')]
        info['uploader'] = title
        print_('⚠️ Fix uploader: None -> {}'.format(title))

    return info
def read(self):
    cw = self.customWidget
    title = self.get_title(self.url)

    ids = set()
    url = self.url
    while True:
        html = urlopen(url)
        soup = BeautifulSoup(html, "html.parser")
        tmp = soup.find_all(attrs={'class': 'directlink'}, href=True)
        for image_html in tmp:
            image_url = image_html['href']
            id_ = self.get_id(image_url)
            if id_ in ids:
                self.print_('duplicate: {}'.format(id_))
                continue
            ids.add(id_)
            self.urls.append(image_url)
            self.filenames[image_url] = self.get_filename(image_url)
        if not cw.alive:
            break
        cw.setTitle('{} {} - {}'.format(tr_('읽는 중...'), title, len(self.urls)))
        next_page = soup.find('a', attrs={'rel': 'next'}, href=True)
        if not next_page:
            break
        else:
            url = urljoin(self.url, next_page['href'])

    self.title = title
def path_regex(self):
    """Return the regex for the path"""
    if self.locale_build:
        return self.build_list_regex

    return urljoin(self.build_list_regex, self.builds[self.build_index])
def get_video(url, session, cw):
    print_ = get_print(cw)
    soup = downloader.read_soup(url, session=session)

    view = soup.find('div', id='player-container-fluid')
    fs = []
    for source in view.findAll('source'):
        src = urljoin(url, source.attrs['src'])
        res = re.find('([0-9]+)p', source.attrs['title'])
        res = int(res) if res else 0
        f = {'res': res, 'src': src}
        fs.append(f)
        print_(f)

    if not fs:
        raise Exception('No source')

    #4773
    res = max(get_resolution(), min(f['res'] for f in fs))
    print_(f'res: {res}')
    fs = sorted([f for f in fs if f['res'] <= res], key=lambda f: f['res'])
    f = fs[-1]
    print_(f'best: {f}')
    src_best = f['src']

    title = soup.find('h1').text.strip()
    id = soup.find('div', id='video').attrs['data-id']
    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
    #src_best = downloader.real_url(src_best)
    video = Video(src_best, url_thumb, url, title, id, session)
    return video
def get_info(url, cw=None):
    print_ = get_print(cw)

    info = {'videos': []}

    html = downloader.read_html(url)
    soup = Soup(html)

    info['title'] = soup.find('h2', class_='videoCnt_title').text.strip()

    id_ = re.find(PATTERN_ID, url, err='no id')
    print_('id: {}'.format(id_))

    token = re.find(
        r'''window.FC2VideoObject.push\(\[['"]ae['"], *['"](.+?)['"]''',
        html, err='no token')
    print_('token: {}'.format(token))

    url_api = 'https://video.fc2.com/api/v3/videoplaylist/{}?sh=1&fs=0'.format(id_)
    hdr = {
        'X-FC2-Video-Access-Token': token,
    }
    data = downloader.read_json(url_api, url, headers=hdr)

    url_video = urljoin(
        url, data['playlist'].get('nq') or data['playlist']['sample'])

    url_thumb = soup.find('meta', {'property': 'og:image'})['content']

    video = Video(url_video, url_thumb, url, info['title'], id_)
    info['videos'].append(video)

    return info
def fix_url(cls, url):
    url = re.sub(r'\?page=[0-9]+&', '?', url)
    url = re.sub(r'&page=[0-9]+', '', url)
    pool = re.find('/pool/show/([0-9]+)', url)
    if pool is not None:
        url = urljoin(url, '/post?tags=pool%3A{}'.format(pool))
    return url
def get_build_info(self):
    """Defines additional build information"""

    # Internally we access builds via index
    url = urljoin(self.base_url, self.candidate_build_list_regex)
    self.logger.info('Retrieving list of candidate builds from %s' % url)

    parser = DirectoryParser(url, authentication=self.authentication,
                             timeout=self.timeout_network)
    if not parser.entries:
        message = 'Folder for specific candidate builds at %s has not ' \
                  'been found' % url
        raise errors.NotFoundError(message, url)

    self.show_matching_builds(parser.entries)
    self.builds = parser.entries
    self.build_index = len(parser.entries) - 1

    if self.build_number and \
            ('build%s' % self.build_number) in self.builds:
        self.builds = ['build%s' % self.build_number]
        self.build_index = 0
        self.logger.info('Selected build: build%s' % self.build_number)
    else:
        self.logger.info('Selected build: build%d' % (self.build_index + 1))
def path_regex(self):
    """Return the regex for the path"""

    build_dir = 'try-%(PLATFORM)s%(DEBUG)s' % {
        'PLATFORM': self.platform_regex,
        'DEBUG': '-debug' if self.debug_build else ''}
    return urljoin(self.build_list_regex,
                   self.builds[self.build_index],
                   build_dir)
def buffer_sync(self):
    """bulk upsert of everything in self.buffer"""
    data = protocol.serialize_web(
        [x.to_broadcast_json(include_hidden=True) for x in self.buffer])
    url = utils.urljoin(self.baseurl, self.docid + "/", 'bulkupsert')
    self.s.post(url, data=data)
    for m in self.buffer:
        m.set('created', True)
    self.buffer = []
def fetch(self, typename=None, id=None):
    if typename is None:
        url = utils.urljoin(self.baseurl, self.docid)
        data = self.s.get(url).content
        specs = self.ph.deserialize_web(data)
        models = [ContinuumModel(
            x['type'], **x['attributes']) for x in specs]
        return models
    elif typename is not None and id is None:
        url = utils.urljoin(self.baseurl, self.docid + "/", typename)
        attrs = self.ph.deserialize_web(self.s.get(url).content)
        models = [ContinuumModel(typename, **x) for x in attrs]
        return models
    elif typename is not None and id is not None:
        url = utils.urljoin(self.baseurl, self.docid + "/",
                            typename + "/", id)
        attr = self.ph.deserialize_web(self.s.get(url).content)
        if attr is None:
            return None
        model = ContinuumModel(typename, **attr)
        return model
def create(self, model, defer=False):
    if not model.get('docs'):
        model.set('docs', [self.docid])
    if defer:
        self.buffer.append(model)
    else:
        url = utils.urljoin(self.baseurl, self.docid + "/", model.typename)
        log.debug("create %s", url)
        self.s.post(url, data=self.ph.serialize_msg(model.to_json()))
    return model
def is_build_dir(self, dir):
    """Return whether or not the given dir contains a build."""

    url = urljoin(self.base_url, self.monthly_build_list_regex, dir)

    if self.application in MULTI_LOCALE_APPLICATIONS \
            and self.locale != 'multi':
        url = urljoin(url, self.locale)

    parser = DirectoryParser(url, authentication=self.authentication,
                             timeout=self.timeout_network)

    pattern = re.compile(self.binary_regex, re.IGNORECASE)
    for entry in parser.entries:
        try:
            pattern.match(entry).group()
            return True
        except:
            # No match, continue with next entry
            continue
    return False
def update(self, model, defer=False):
    model.set('doc', self.docid)
    if defer:
        self.buffer.append(model)
    else:
        url = utils.urljoin(self.baseurl, self.docid + "/",
                            model.typename + "/", model.id + "/")
        log.debug("create %s", url)
        self.s.put(url, data=protocol.serialize_web(
            model.to_json(include_hidden=True)))
    return model
def is_build_dir(self, folder_name):
    """Return whether or not the given dir contains a build."""
    # Cannot move up to base scraper due to parser.entries call in
    # get_build_info_for_index (see below)
    url = '%s/' % urljoin(self.base_url, self.build_list_regex, folder_name)

    if self.application in APPLICATIONS_MULTI_LOCALE \
            and self.locale != 'multi':
        url = '%s/' % urljoin(url, self.locale)

    parser = self._create_directory_parser(url)

    pattern = re.compile(self.binary_regex, re.IGNORECASE)
    for entry in parser.entries:
        try:
            pattern.match(entry).group()
            return True
        except:
            # No match, continue with next entry
            continue
    return False
def create(self, model, defer=False):
    model.set('doc', self.docid)
    if defer:
        self.buffer.append(model)
    else:
        url = utils.urljoin(self.baseurl, self.docid + "/",
                            model.typename + "/")
        log.debug("create %s", url)
        self.s.post(url, data=self.ph.serialize_msg(
            model.to_json(include_hidden=True)))
        model.set('created', True)
    return model
def __init__(self, directory, version, platform=None,
             application='firefox', locale='en-US', extension=None,
             authentication=None, retry_attempts=0, retry_delay=10.,
             is_stub_installer=False, timeout=None, log_level='INFO',
             base_url=BASE_URL):
    # Private properties for caching
    self._target = None
    self._binary = None

    self.directory = directory
    self.locale = locale
    self.platform = platform or self.detect_platform()
    self.version = version
    self.extension = extension or DEFAULT_FILE_EXTENSIONS[self.platform]
    self.authentication = authentication
    self.retry_attempts = retry_attempts
    self.retry_delay = retry_delay
    self.is_stub_installer = is_stub_installer
    self.timeout_download = timeout
    self.timeout_network = 60.

    self.logger = mozlog.getLogger(' ')
    self.logger.setLevel(getattr(mozlog, log_level.upper()))

    # build the base URL
    self.application = application
    self.base_url = urljoin(base_url, self.application)

    attempt = 0
    while True:
        attempt += 1
        try:
            self.get_build_info()
            break
        except (NotFoundError, requests.exceptions.RequestException), e:
            if self.retry_attempts > 0:
                # Log only if multiple attempts are requested
                self.logger.warning("Build not found: '%s'" % e.message)
                self.logger.info('Will retry in %s seconds...' %
                                 (self.retry_delay))
                time.sleep(self.retry_delay)
                self.logger.info("Retrying... (attempt %s)" % attempt)

            if attempt >= self.retry_attempts:
                if hasattr(e, 'response') and \
                        e.response.status_code == 404:
                    message = "Specified build has not been found"
                    raise NotFoundError(message, e.response.url)
                else:
                    raise
def get_build_info_for_index(self, build_index=None):
    url = urljoin(self.base_url, self.build_list_regex)

    self.logger.info('Retrieving list of builds from %s' % url)
    parser = self._create_directory_parser(url)
    parser.entries = parser.filter('.*-%s$' % self.changeset)

    if not parser.entries:
        raise errors.NotFoundError('No builds have been found', url)

    self.show_matching_builds(parser.entries)

    self.logger.info('Selected build: %s' % parser.entries[0])

    return (parser.entries, 0)
def fetch(self, typename=None, id=None, include_hidden=False):
    query = urllib.urlencode({'include_hidden': include_hidden})
    if typename is None:
        url = utils.urljoin(self.baseurl, self.docid + "/") + "?" + query
        data = self.s.get(url).content
        specs = protocol.deserialize_web(data)
        models = [make_model(x['type'], client=self, **x['attributes'])
                  for x in specs]
        return models
    elif typename is not None and id is None:
        url = utils.urljoin(self.baseurl, self.docid + "/", typename + "/")
        url += "?" + query
        attrs = protocol.deserialize_web(self.s.get(url).content)
        models = [make_model(typename, client=self, **x) for x in attrs]
        return models
    elif typename is not None and id is not None:
        url = utils.urljoin(self.baseurl, self.docid + "/",
                            typename + "/", id + "/")
        url += "?" + query
        attr = protocol.deserialize_web(self.s.get(url).content)
        if attr is None:
            return None
        model = make_model(typename, client=self, **attr)
        return model
def get_build_info_for_index(self, build_index=None):
    url = urljoin(self.base_url, self.build_list_regex)

    self.logger.info('Retrieving list of builds from %s' % url)
    parser = DirectoryParser(url, authentication=self.authentication,
                             timeout=self.timeout_network)
    parser.entries = parser.filter('.*-%s$' % self.changeset)

    if not parser.entries:
        raise NotFoundError('No builds have been found', url)

    self.show_matching_builds(parser.entries)

    self.logger.info('Selected build: %s' % parser.entries[0])

    return (parser.entries, 0)
def get_build_info_for_date(self, date, build_index=None):
    url = urljoin(self.base_url, self.monthly_build_list_regex)
    has_time = date and date.time()

    self.logger.info('Retrieving list of builds from %s' % url)
    parser = DirectoryParser(url, authentication=self.authentication,
                             timeout=self.timeout_network)
    regex = r'%(DATE)s-(\d+-)+%(BRANCH)s%(L10N)s%(PLATFORM)s$' % {
        'DATE': date.strftime('%Y-%m-%d'),
        'BRANCH': self.branch,
        # ensure to select the correct subfolder for localized builds
        'L10N': '' if self.locale in ('en-US', 'multi') else '(-l10n)?',
        'PLATFORM': '' if self.application not in ('fennec',) else '-' + self.platform
    }
    parser.entries = parser.filter(regex)
    parser.entries = parser.filter(self.is_build_dir)

    if has_time:
        # If a time is included in the date, use it to determine the
        # build's index
        regex = r'.*%s.*' % date.strftime('%H-%M-%S')
        parser.entries = parser.filter(regex)

    if not parser.entries:
        date_format = '%Y-%m-%d-%H-%M-%S' if has_time else '%Y-%m-%d'
        message = 'Folder for builds on %s has not been found' % \
            self.date.strftime(date_format)
        raise errors.NotFoundError(message, url)

    self.show_matching_builds(parser.entries)

    # If no index has been given, set it to the last build of the day.
    if build_index is None:
        # Find the most recent non-empty entry.
        build_index = len(parser.entries)
        for build in reversed(parser.entries):
            build_index -= 1
            if not build_index or self.is_build_dir(build):
                break

    self.logger.info('Selected build: %s' % parser.entries[build_index])

    return (parser.entries, build_index)
def get_build_info_for_version(self, version, build_index=None):
    url = urljoin(self.base_url, self.candidate_build_list_regex)

    self.logger.info('Retrieving list of candidate builds from %s' % url)
    parser = DirectoryParser(url, authentication=self.authentication,
                             timeout=self.timeout_network)
    if not parser.entries:
        message = 'Folder for specific candidate builds at %s has not ' \
                  'been found' % url
        raise NotFoundError(message, url)

    self.show_matching_builds(parser.entries)

    # If no index has been given, set it to the last build of the given
    # version.
    if build_index is None:
        build_index = len(parser.entries) - 1

    return (parser.entries, build_index)
def url(self):
    """Return the URL of the build"""
    return urljoin(self.path, self.binary)
def path(self):
    """Return the path to the build"""
    return urljoin(self.base_url, self.path_regex)
def buffer_sync(self):
    data = self.ph.serialize_web([x.to_broadcast_json()
                                  for x in self.buffer])
    url = utils.urljoin(self.baseurl, self.docid + "/", 'bulkupsert')
    self.s.post(url, data=data)
    self.buffer = []
def delete(self, typename, id):
    url = utils.urljoin(self.baseurl, self.docid + "/", typename + "/", id)
    self.s.delete(url)
def __init__(self, destination=None, platform=None,
             application='firefox', locale=None, extension=None,
             username=None, password=None,
             retry_attempts=0, retry_delay=10.,
             is_stub_installer=False, timeout=None,
             log_level='INFO', base_url=BASE_URL):
    # Private properties for caching
    self._filename = None
    self._binary = None

    self.destination = destination or os.getcwd()

    if not locale:
        if application in APPLICATIONS_MULTI_LOCALE:
            self.locale = 'multi'
        else:
            self.locale = 'en-US'
    else:
        self.locale = locale

    self.platform = platform or self.detect_platform()

    if (username, password) == (None, None):
        self.authentication = None
    else:
        self.authentication = (username, password)

    self.retry_attempts = retry_attempts
    self.retry_delay = retry_delay
    self.is_stub_installer = is_stub_installer
    self.timeout_download = timeout
    self.timeout_network = 60.

    logging.basicConfig(format=' %(levelname)s | %(message)s')
    self.logger = logging.getLogger(self.__module__)
    self.logger.setLevel(log_level)

    # build the base URL
    self.application = application
    self.base_url = urljoin(base_url, APPLICATIONS_TO_FTP_DIRECTORY.get(
        self.application, self.application))

    if extension:
        self.extension = extension
    else:
        if self.application in APPLICATIONS_MULTI_LOCALE and \
                self.platform in ('win32', 'win64'):
            # builds for APPLICATIONS_MULTI_LOCALE only exist in zip
            self.extension = 'zip'
        else:
            self.extension = DEFAULT_FILE_EXTENSIONS[self.platform]

    attempt = 0
    while True:
        attempt += 1
        try:
            self.get_build_info()
            break
        except (errors.NotFoundError, requests.exceptions.RequestException), e:
            if self.retry_attempts > 0:
                # Log only if multiple attempts are requested
                self.logger.warning("Build not found: '%s'" % e.message)
                self.logger.info('Will retry in %s seconds...' %
                                 (self.retry_delay))
                time.sleep(self.retry_delay)
                self.logger.info("Retrying... (attempt %s)" % attempt)

            if attempt >= self.retry_attempts:
                if hasattr(e, 'response') and \
                        e.response.status_code == 404:
                    message = "Specified build has not been found"
                    raise errors.NotFoundError(message, e.response.url)
                else:
                    raise