def test_download_google_results_file(self):
    query = 'Well of Ascension'
    file_name = os.path.join('temp', 'google_results.html')
    # Start from a clean slate so the assertion checks this run's download.
    if os.path.exists(file_name):
        os.remove(file_name)
    d = Downloader()
    url = d.goodreads_id_query(query)
    d.download_file(url, file_name)
    self.assertTrue(os.path.exists(file_name))
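A minimal sketch of the Downloader pieces this test exercises, assuming a requests-based implementation; goodreads_id_query and download_file here are reconstructions for illustration, not the project's actual code.

import os
import requests

class Downloader:
    def goodreads_id_query(self, query: str) -> str:
        # Assumed behavior: build a Google search URL scoped to Goodreads.
        return ('https://www.google.com/search?q='
                + requests.utils.quote(f'site:goodreads.com {query}'))

    def download_file(self, url: str, file_name: str) -> bool:
        # Fetch the URL and write the body to disk; report success.
        resp = requests.get(url, timeout=30)
        if resp.status_code != 200:
            return False
        os.makedirs(os.path.dirname(file_name) or '.', exist_ok=True)
        with open(file_name, 'wb') as fh:
            fh.write(resp.content)
        return True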
def kntu_download(user_name, password, pasted_urls):
    kntu_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36',
    }
    kntu_login_data = {
        'anchor': '',
        'username': user_name,
        'password': password,
        'rememberusername': '******',
    }
    kntu_downloader = Downloader('https://vc.kntu.ac.ir/login/index.php',
                                 'https://connect.kntu.ac.ir/',
                                 kntu_login_data, kntu_headers, kntu_headers)
    if not kntu_downloader.login({'logintoken'}):
        return
    for url in pasted_urls:
        # Only Adobe Connect recording URLs on vc*.kntu.ac.ir are supported.
        if re.match(r'https://vc\d*\.kntu\.ac\.ir/mod/adobeconnect/joinrecording\.php.*', url):
            filename = re.findall(r'recording=(\d+)&', url)[0]
            print('Downloading ' + filename + '...')
            kntu_downloader.set_name_to_save(filename)
            kntu_downloader.set_pasted_url(url)
            kntu_downloader.set_cookies()
            if not kntu_downloader.create_downlaod_link():  # (sic) spelled as on Downloader
                continue
            kntu_downloader.download_file()
            kntu_downloader.save_file()
            kntu_downloader.extract_zip_file()
            kntu_downloader.convert_media()
            kntu_downloader.download_other_files()
            print(filename + ' downloaded!')
        else:
            print('Wrong URL format')
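A hypothetical driver for kntu_download, assuming it is run as a script; the prompts and getpass usage are illustrative, not taken from the original project.

import getpass

if __name__ == '__main__':
    user = input('KNTU username: ')
    pw = getpass.getpass('KNTU password: ')
    urls = []
    print('Paste recording URLs, one per line (blank line to finish):')
    while True:
        line = input().strip()
        if not line:
            break
        urls.append(line)
    kntu_download(user, pw, urls)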
def download(self, sub, path):
    r = requests.get(sub.page_url)
    if r.status_code == 200:
        soup = self._get_soup(r.text)
        # Resolve the relative href on the download button to a full URL.
        sub.url = self._get_full_url(soup.find('a', id='downloadButton')['href'])
        dl = Downloader()
        zip_path = os.path.splitext(path)[0] + '.zip'
        if dl.download_file(sub.url, zip_path):
            is_extraction_success = self._extract_sub_zip(zip_path, path)
            try:
                os.remove(zip_path)
            except OSError:
                # The zip is only a temporary artifact; ignore cleanup failures.
                pass
            if is_extraction_success:
                return True
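A plausible sketch of the _extract_sub_zip helper used above, assuming the archive holds a single subtitle file; the name comes from the call site, but the body is inferred, not taken from the original class.

import zipfile

def _extract_sub_zip(self, zip_path, out_path):
    # Pull the first subtitle-looking member out of the archive, write it
    # to out_path, and report whether anything was extracted.
    try:
        with zipfile.ZipFile(zip_path) as zf:
            for name in zf.namelist():
                if name.lower().endswith(('.srt', '.sub', '.ass')):
                    with zf.open(name) as src, open(out_path, 'wb') as dst:
                        dst.write(src.read())
                    return True
    except zipfile.BadZipFile:
        pass
    return False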
class WearCollector(Collector):
    def __init__(self, reporter: Reporter, waiter: Waiter, outdir: str, useragent: str = ''):
        super(WearCollector, self).__init__()
        self.reporter: Reporter = reporter
        self.waiter = waiter
        self.outdir = outdir
        self.useragent = useragent
        self.cacher = Cacher(self.outdir)
        # Limit the number of concurrent async connections.
        self.semaphore = Semaphore(2)
        # File downloader.
        self.downloader = Downloader(self.waiter, self.semaphore, self.reporter)

    async def download_user_page(self, url: str, page_num: int):
        url = url + f'?pageno={page_num}'
        # Use the on-disk cache if this page was fetched before.
        filename = urllib.parse.quote(url, safe='') + '.html'
        content, info = self.cacher.get(filename)
        if content and info:
            html = content
            realurl = info.get('realurl')
            self.reporter.report(INFO, f'use cache {url}')
        else:
            await self.waiter.wait(url)
            async with self.semaphore:
                self.reporter.report(INFO, f'fetching {url}', type=NETWORK)
                async with aiohttp.request('get', url, headers={'user-agent': self.useragent}) as res:
                    html = await res.text()
                    realurl = str(res.url)
                    self.cacher.set(filename, html, {'status': res.status, 'realurl': realurl})
        # Termination condition: past page 1, a redirect that drops the
        # ?pageno parameter means we have paged past the last page.
        if page_num >= 2 and realurl.count('?pageno') == 0:
            return False
        for gallery_url, data in await self.run_in_executor(parse_user, html):
            await self.add_future('gallery', self.gallery_collector(gallery_url, 1, 501, userdata=data))
        return True

    async def user_collector(self, url: str, pagestart: int, pageend: int):
        await self.queued_paging(pagestart, pageend, lambda page: self.download_user_page(url, page))

    async def download_gallery_page(self, url: str, page_num: int, userdata=None):
        url = url + f'?pageno={page_num}'
        # Use the on-disk cache if this page was fetched before.
        filename = urllib.parse.quote(url, safe='') + '.html'
        content, info = self.cacher.get(filename)
        if content and info:
            html = content
            realurl = info.get('realurl')
            self.reporter.report(INFO, f'use cache {url}')
        else:
            await self.waiter.wait(url)
            async with self.semaphore:
                self.reporter.report(INFO, f'fetching {url}', type=NETWORK)
                async with aiohttp.request('get', url, headers={'user-agent': self.useragent}) as res:
                    html = await res.text()
                    realurl = str(res.url)
                    self.cacher.set(filename, html, {'status': res.status, 'realurl': realurl})
        # Termination condition, as in download_user_page.
        if page_num >= 2 and realurl.count('?pageno') == 0:
            return False
        for image_url, data in await self.run_in_executor(parse_gallely, html, userdata):  # (sic) parse_gallely as defined in the project
            imagefile = urllib.parse.quote(image_url, safe='')
            tmp_save(os.path.join(self.outdir, imagefile + '.json'), json.dumps(data))
            imagepath = os.path.join(self.outdir, imagefile)
            # Skip images that are already on disk.
            if not os.path.exists(imagepath):
                await self.add_future('image', self.downloader.download_file(image_url, imagepath, headers={'user-agent': self.useragent}))
        return True

    async def gallery_collector(self, url: str, pagestart: int, pageend: int, userdata=None):
        await self.queued_paging(pagestart, pageend, lambda page: self.download_gallery_page(url, page, userdata=userdata))
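A hypothetical entry point showing how WearCollector might be driven; the Reporter and Waiter constructors and the run/collect plumbing are assumed to come from the surrounding project, and the user URL is illustrative only.

import asyncio

async def main():
    reporter = Reporter()          # assumed project class
    waiter = Waiter(interval=1.0)  # assumed: per-request politeness delay
    collector = WearCollector(reporter, waiter, outdir='out',
                              useragent='Mozilla/5.0 (compatible; wear-collector)')
    # Crawl pages 1-10 of one user's listing; gallery and image tasks
    # fan out from there via add_future.
    await collector.user_collector('https://wear.jp/exampleuser/', 1, 11)

if __name__ == '__main__':
    asyncio.run(main())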