def download_file(url, fname=None):
    """Download `url` into `fname`, creating parent directories as needed.

    Args:
        url: source URL to fetch.
        fname: destination path (pathlib.Path). A real path is required:
            the previous `None` default crashed with an opaque
            AttributeError; we now raise a clear ValueError instead.

    Returns:
        True on success, False on any download error (error is logged).
    """
    if fname is None:
        raise ValueError("fname (destination path) is required")
    fname.parent.mkdir(parents=True, exist_ok=True)
    try:
        save_large_file(url, fname)
        return True
    except Exception as exc:
        logger.error(f"Error while downloading from {url}: {exc}")
        # drop any partially-written file so callers never see a
        # truncated download (matches the other download_file variant)
        if fname.exists():
            fname.unlink()
        return False
def download_file(url, fpath):
    """Fetch `url` and store it at `fpath`.

    Parent directories are created if missing. On failure the error is
    logged, any partially written file is removed, and False is returned;
    True is returned on success.
    """
    fpath.parent.mkdir(parents=True, exist_ok=True)
    try:
        save_large_file(url, fpath)
    except Exception as exc:
        logger.error(f"Error while downloading from {url}: {exc}")
        # never leave a truncated file behind
        if fpath.exists():
            os.unlink(fpath)
        return False
    return True
def test_urlitem_html(tmp_path, gzip_html_url):
    """URLItem on a gzip-encoded HTML URL stores the decoded payload."""
    # fetch a reference copy directly, for byte-level comparison
    reference = tmp_path / "file.html"
    save_large_file(gzip_html_url, reference)
    expected = reference.read_bytes()

    zim_path = tmp_path / "test.zim"
    with Creator(zim_path) as creator:
        creator.add_item(URLItem(url=gzip_html_url))

    archive = Archive(zim_path)
    assert bytes(archive.get_item("wiki/Main_Page").content) == expected
def test_urlprovider(tmp_path, png_image_url):
    """SpecialURLProviderItem stores the URL's bytes at the given path."""
    # fetch a reference copy directly, for byte-level comparison
    reference = tmp_path / "file.png"
    save_large_file(png_image_url, reference)
    expected = reference.read_bytes()

    zim_path = tmp_path / "test.zim"
    with Creator(zim_path) as creator:
        creator.add_item(
            SpecialURLProviderItem(url=png_image_url, path="one.png"))

    archive = Archive(zim_path)
    assert bytes(archive.get_item("one.png").content) == expected
def test_urlitem_nonhtmlgzip(tmp_path, gzip_nonhtml_url):
    """URLItem handles gzip-encoded non-HTML content, in-memory and via disk."""
    # fetch a reference copy directly, for byte-level comparison
    reference = tmp_path / "file.txt"
    save_large_file(gzip_nonhtml_url, reference)
    expected = reference.read_bytes()

    zim_path = tmp_path / "test.zim"
    with Creator(zim_path) as creator:
        # same URL added twice: once buffered in memory, once spooled to disk
        creator.add_item(URLItem(url=gzip_nonhtml_url))
        creator.add_item(URLItem(url=gzip_nonhtml_url, use_disk=True))

    archive = Archive(zim_path)
    assert bytes(archive.get_item("robots.txt").content) == expected
def test_urlitem_binary(tmp_path, png_image_url):
    """URLItem on a binary (PNG) URL stores the exact original bytes."""
    # fetch a reference copy directly, for byte-level comparison
    reference = tmp_path / "file.png"
    save_large_file(png_image_url, reference)
    expected = reference.read_bytes()

    zim_path = tmp_path / "test.zim"
    with Creator(zim_path) as creator:
        creator.add_item(URLItem(url=png_image_url))

    archive = Archive(zim_path)
    item = archive.get_item("static/images/project-logos/commonswiki.png")
    assert bytes(item.content) == expected
def download_jpeg_image_and_convert(
    self, url, fpath, preset_options=None, resize=None
):
    """Download a JPEG image and convert/optimize it into the format
    implied by `fpath`'s extension.

    Args:
        url: JPEG source URL.
        fpath: destination path; its suffix selects the output format.
        preset_options: optional mapping of extra keyword options forwarded
            to optimize_image. Defaults to None (treated as {}) — avoids
            the shared-mutable-default pitfall the previous `{}` default had.
        resize: optional (width, height) to resize to (method="cover")
            before optimizing.
    """
    if preset_options is None:
        preset_options = {}
    # reserve a temp file and close the handle immediately (the original
    # leaked the open NamedTemporaryFile handle); delete=False keeps the
    # file around for save_large_file, and optimize_image(delete_src=True)
    # removes it afterwards
    with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
        org_jpeg_path = pathlib.Path(tmp.name)
    save_large_file(url, org_jpeg_path)
    if resize is not None:
        resize_image(
            org_jpeg_path,
            width=resize[0],
            height=resize[1],
            method="cover",
        )
    optimize_image(
        org_jpeg_path, fpath, convert=True, delete_src=True, **preset_options
    )
    logger.debug(f"Converted {org_jpeg_path} to {fpath} and optimized ")
def download_and_extract_archive(self):
    """Fetch the archive when it is a URL, then unzip it into files_path."""
    # remote archives are downloaded first; local paths are used as-is
    if self.archive.startswith("http"):
        logger.info(f"Downloading archive at {self.archive}")
        save_large_file(self.archive, self.archive_path)

    logger.info(
        f"Extracting ZIP archive {self.archive_path} to {self.files_path}")
    unzip_cmd = [
        "unzip",
        "-u",  # only extract files that are newer or missing
        "-q",  # quiet
        "-D",  # presumably to skip restoring directory timestamps — confirm against unzip manual
        str(self.archive_path),
        "-d",
        str(self.files_path),
    ]
    logger.debug(nicer_args_join(unzip_cmd))
    subprocess.run(unzip_cmd, check=True)
def prepare_ogvjs_folder(tmp_path, videojs_url, ogvjs_url, videojs_ogvjs_url):
    """Download and lay out video.js, ogv.js and the videojs-ogvjs plugin
    under tmp_path.

    Resulting layout:
        tmp_path/videojs/...          (video.js 7.6.4, extracted)
        tmp_path/ogvjs/...            (ogv.js 1.6.1, extracted, top dir stripped)
        tmp_path/videojs-ogvjs.js     (plugin script, moved to root)

    Zips already present in tmp_path are reused (not re-downloaded);
    extraction directories are wiped and rebuilt every call.
    """
    # --- video.js: download once, extract into a fresh videojs/ dir
    videojs_zip = tmp_path / "video-js-7.6.4.zip"
    if not videojs_zip.exists():
        save_large_file(videojs_url, videojs_zip)
    videojs_dir = tmp_path.joinpath("videojs")
    shutil.rmtree(videojs_dir, ignore_errors=True)
    videojs_dir.mkdir()
    with zipfile.ZipFile(videojs_zip) as zipf:
        zipf.extractall(videojs_dir)

    # --- ogv.js: extract into a temp dir first because the zip wraps
    # everything in an "ogvjs-1.6.1/" top-level folder
    ogvjs_zip = tmp_path / "ogvjs-1.6.1.zip"
    if not ogvjs_zip.exists():
        save_large_file(ogvjs_url, ogvjs_zip)
    ogvjs_dir = tmp_path.joinpath("ogvjs")
    ogvjs_dir_tmp = tmp_path.joinpath("ogvjs_tmp")
    shutil.rmtree(ogvjs_dir, ignore_errors=True)
    shutil.rmtree(ogvjs_dir_tmp, ignore_errors=True)
    ogvjs_dir.mkdir()
    with zipfile.ZipFile(ogvjs_zip) as zipf:
        zipf.extractall(ogvjs_dir_tmp)
    # move back one dir: rename the wrapped folder onto the (empty) target
    # NOTE(review): rename onto an existing dir requires it to be empty —
    # holds here since ogvjs_dir was just created
    ogvjs_dir_tmp.joinpath("ogvjs-1.6.1").rename(ogvjs_dir)

    # --- videojs-ogvjs plugin: extract only the dist script
    videojs_ogvjs_zip = tmp_path / "v1.3.1.zip"
    if not videojs_ogvjs_zip.exists():
        save_large_file(videojs_ogvjs_url, videojs_ogvjs_zip)
    member = "videojs-ogvjs-1.3.1/dist/videojs-ogvjs.js"
    with zipfile.ZipFile(videojs_ogvjs_zip) as zipf:
        zipf.extract(member, tmp_path)
    # move script to root
    tmp_path.joinpath(member).rename(tmp_path.joinpath("videojs-ogvjs.js"))
def download_video_data(self):
    """Download every TED talk video plus its thumbnail and speaker image.

    Layout under the videos dir:
        {video id}/video.mp4
        {video id}/thumbnail.jpg
        {video id}/speaker.jpg

    Files already present are kept (their download is skipped). Each
    download is individually guarded so one bad URL no longer aborts the
    whole batch (previously only the video download was guarded). Every
    video dir is then post-processed (recompressed) regardless.
    """
    # load the dumped metadata
    self.load_meta_from_file()
    for video in self.videos:
        # set up variables
        video_id = str(video["id"])
        video_title = video["title"]
        video_link = video["video_link"]
        video_speaker = video["speaker_picture"]
        video_thumbnail = video["thumbnail"]
        video_dir = self.videos_dir.joinpath(video_id)
        video_file_path = video_dir.joinpath("video.mp4")
        speaker_path = video_dir.joinpath("speaker.jpg")
        thumbnail_path = video_dir.joinpath("thumbnail.jpg")

        # ensure that video directory exists
        video_dir.mkdir(parents=True, exist_ok=True)

        # download video
        if not video_file_path.exists():
            logger.debug(f"Downloading {video_title}")
            try:
                save_large_file(video_link, video_file_path)
            except Exception:
                logger.error(f"Could not download {video_file_path}")
        else:
            logger.debug(f"video.mp4 already exists. Skipping video {video_title}")

        # download an image of the speaker (metadata may say there is none)
        if not speaker_path.exists():
            if video_speaker == "None" or video_speaker == "":
                logger.debug("Speaker doesn't have an image")
            else:
                logger.debug(f"Downloading Speaker image for {video_title}")
                try:
                    save_large_file(video_speaker, speaker_path)
                except Exception:
                    logger.error(f"Could not download {speaker_path}")
        else:
            logger.debug(f"speaker.jpg already exists for {video_title}")

        # download the thumbnail of the video
        if not thumbnail_path.exists():
            logger.debug(f"Downloading thumbnail for {video_title}")
            try:
                save_large_file(video_thumbnail, thumbnail_path)
            except Exception:
                logger.error(f"Could not download {thumbnail_path}")
        else:
            logger.debug(f"Thumbnail already exists for {video_title}")

        # recompress if necessary
        post_process_video(
            video_dir,
            video_id,
            self.video_format,
            self.low_quality,
        )
def download_video_files(self, video):
    """download all video files (video, thumbnail, speaker)"""
    # Download all the TED talk videos and the meta-data for it.
    # Save the videos in build_dir/{video id}/video.mp4.
    # Save the thumbnail for the video in build_dir/{video id}/thumbnail.jpg.
    # Save the image of the speaker in build_dir/{video id}/speaker.jpg.

    # set up variables
    video_id = str(video["id"])
    # Take the english version of title or else whatever language it's available in
    video_title = video["title"][0]["text"]
    video_link = video["video_link"]
    video_speaker = video["speaker_picture"]
    video_thumbnail = video["thumbnail"]
    video_dir = self.videos_dir.joinpath(video_id)
    org_video_file_path = video_dir.joinpath("video.mp4")
    req_video_file_path = video_dir.joinpath(f"video.{self.video_format}")
    speaker_path = video_dir.joinpath("speaker.webp")
    thumbnail_path = video_dir.joinpath("thumbnail.webp")

    # ensure that video directory exists
    if not video_dir.exists():
        video_dir.mkdir(parents=True)

    # set preset: mp4 gets the low-mp4 preset, anything else falls back to webm-low
    preset = {"mp4": VideoMp4Low}.get(self.video_format, VideoWebmLow)()

    # download video: try the S3 cache first (keyed on format/quality/id and
    # preset VERSION), fall back to a fresh download on cache miss
    downloaded_from_cache = False
    logger.debug(f"Downloading {video_title}")
    if self.s3_storage:
        # NOTE(review): s3_key is only bound when s3_storage is set; the
        # upload below is guarded by the same condition, so this is safe
        s3_key = f"{self.video_format}/{self.video_quality}/{video_id}"
        downloaded_from_cache = self.download_from_cache(
            s3_key, req_video_file_path, preset.VERSION
        )
    if not downloaded_from_cache:
        try:
            # NOTE(review): substring (not startswith) test — any link
            # without "https://" is handed to yt-dlp; confirm intent
            if "https://" not in video_link:
                options = (
                    BestWebm if self.video_format == "webm" else BestMp4
                ).get_options(
                    target_dir=video_dir, filepath=pathlib.Path("video.%(ext)s")
                )
                self.yt_downloader.download(video_link, options)
            else:
                save_large_file(video_link, org_video_file_path)
        except Exception:
            logger.error(f"Could not download {org_video_file_path}")

    # download speaker and thumbnail images
    self.download_speaker_image(video_id, video_title, video_speaker, speaker_path)
    self.download_thumbnail(video_id, video_title, video_thumbnail, thumbnail_path)

    # recompress if necessary (cached videos are already in the right format)
    try:
        if not downloaded_from_cache:
            post_process_video(
                video_dir,
                video_id,
                preset,
                self.video_format,
                self.low_quality,
            )
    except Exception as e:
        logger.error(f"Failed to post process video {video_id}")
        logger.debug(e)
    else:
        # upload to cache only if recompress was successful
        if self.s3_storage and not downloaded_from_cache:
            self.upload_to_cache(s3_key, req_video_file_path, preset.VERSION)
def test_large_download_https(tmp_path, valid_https_url):
    """save_large_file must fetch an HTTPS URL into the given destination."""
    target = tmp_path / "favicon.ico"
    save_large_file(valid_https_url, target)
    assert_downloaded_file(valid_https_url, target)
def test_urlprovider_nolength(tmp_path, png_image_url, png_image):
    """URLItem/URLProvider must cope with a server that sends no
    Content-Length header (and optionally gzip Content-Encoding)."""
    # save url's content locally using external tool
    # NOTE(review): this reassignment shadows the png_image fixture argument,
    # which is therefore never used — confirm whether the fixture is needed
    png_image = tmp_path / "original.png"
    save_large_file(png_image_url, png_image)
    with open(png_image, "rb") as fh:
        png_image_bytes = fh.read()

    # create and start an http server without Content-Length support
    server_fpath = tmp_path / "httpd.py"
    port = random.randint(10000, 20000)
    # template executed as a standalone script; {port} and {src} are
    # substituted below via str.replace (not str.format)
    server_code = """
from http.server import BaseHTTPRequestHandler, HTTPServer

class handler(BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.send_header("Content-type", "image/png")
        if "gzip" in self.path:
            self.send_header("Content-Encoding", "gzip")
        self.end_headers()
        with open("{src}", "rb") as fh:
            self.wfile.write(fh.read())

with HTTPServer(('', {port}), handler) as server:
    server.serve_forever()
"""
    with open(server_fpath, "w") as fh:
        fh.write(
            server_code.replace("{port}", str(port)).replace("{src}", str(png_image)))
    httpd = subprocess.Popen([sys.executable, server_fpath])
    time.sleep(2)  # allow http server to start

    fpath = tmp_path / "test.zim"
    try:
        with tempfile.TemporaryDirectory() as tmp_dir, Creator(
                fpath) as creator:
            tmp_dir = pathlib.Path(tmp_dir)
            # no Content-Length: spooled to disk
            creator.add_item(
                URLItem(
                    url=f"http://localhost:{port}/hoho.png",
                    path="B",
                    tmp_dir=tmp_dir,
                    use_disk=True,
                ))
            # no Content-Length: buffered in memory
            creator.add_item(
                URLItem(url=f"http://localhost:{port}/home.png", tmp_dir=tmp_dir))
            creator.add_item(
                SpecialURLProviderItem(url=f"http://localhost:{port}/home.png",
                                       mimetype="image/png"))
    finally:
        # always stop the helper server, even if ZIM creation failed
        httpd.terminate()

    zim = Archive(fpath)
    assert bytes(zim.get_item("home.png").content) == png_image_bytes
    assert bytes(zim.get_item("B").content) == png_image_bytes