예제 #1
0
파일: utils.py 프로젝트: spenfraz/gutenberg
def download_file(url, fname=None):
    """Download ``url`` to ``fname`` and report success as a boolean.

    The parent directory of ``fname`` is created when missing. Download
    errors are logged rather than raised.

    Args:
        url: Address of the resource to fetch.
        fname: Destination path (``pathlib.Path`` or ``str``). Despite the
            ``None`` default kept for signature compatibility, a destination
            is required.

    Returns:
        True when the download succeeded, False when it failed.

    Raises:
        ValueError: If ``fname`` is None.
    """
    import pathlib  # local import so the block does not rely on file-level imports

    if fname is None:
        # previously failed with an opaque AttributeError on `None.parent`
        raise ValueError("download_file() requires a destination path (fname)")

    fname = pathlib.Path(fname)
    fname.parent.mkdir(parents=True, exist_ok=True)
    try:
        save_large_file(url, fname)
    except Exception as exc:
        logger.error(f"Error while downloading from {url}: {exc}")
        # do not leave a partial file that could be mistaken for a valid one
        if fname.exists():
            fname.unlink()
        return False
    return True
예제 #2
0
def download_file(url, fpath):
    """Fetch ``url`` into ``fpath``; return True on success, False on failure.

    The destination's parent directory is created if needed. When the
    download raises, the error is logged, any file present at ``fpath``
    is removed and False is returned instead of propagating the exception.
    """
    fpath.parent.mkdir(parents=True, exist_ok=True)
    try:
        save_large_file(url, fpath)
    except Exception as exc:
        logger.error(f"Error while downloading from {url}: {exc}")
        # discard whatever was written before the failure
        if fpath.exists():
            os.unlink(fpath)
        return False
    return True
예제 #3
0
def test_urlitem_html(tmp_path, gzip_html_url):
    """A URLItem added from ``gzip_html_url`` stores the same bytes as a direct download."""
    # reference copy, fetched independently of the ZIM creator
    reference_path = tmp_path / "file.html"
    save_large_file(gzip_html_url, reference_path)
    expected_bytes = reference_path.read_bytes()

    zim_path = tmp_path / "test.zim"
    with Creator(zim_path) as creator:
        creator.add_item(URLItem(url=gzip_html_url))

    archive = Archive(zim_path)
    stored_bytes = bytes(archive.get_item("wiki/Main_Page").content)
    assert stored_bytes == expected_bytes
예제 #4
0
def test_urlprovider(tmp_path, png_image_url):
    """A SpecialURLProviderItem stores, at its explicit path, the bytes served by the URL."""
    # reference copy, fetched independently of the ZIM creator
    reference_path = tmp_path / "file.png"
    save_large_file(png_image_url, reference_path)
    expected_bytes = reference_path.read_bytes()

    zim_path = tmp_path / "test.zim"
    with Creator(zim_path) as creator:
        item = SpecialURLProviderItem(url=png_image_url, path="one.png")
        creator.add_item(item)

    archive = Archive(zim_path)
    stored_bytes = bytes(archive.get_item("one.png").content)
    assert stored_bytes == expected_bytes
예제 #5
0
def test_urlitem_nonhtmlgzip(tmp_path, gzip_nonhtml_url):
    """URLItems built from ``gzip_nonhtml_url`` store the same bytes as a direct download.

    The item is added twice: once with default settings and once with
    ``use_disk=True``.
    """
    # reference copy, fetched independently of the ZIM creator
    reference_path = tmp_path / "file.txt"
    save_large_file(gzip_nonhtml_url, reference_path)
    expected_bytes = reference_path.read_bytes()

    zim_path = tmp_path / "test.zim"
    with Creator(zim_path) as creator:
        in_memory_item = URLItem(url=gzip_nonhtml_url)
        creator.add_item(in_memory_item)
        on_disk_item = URLItem(url=gzip_nonhtml_url, use_disk=True)
        creator.add_item(on_disk_item)

    archive = Archive(zim_path)
    stored_bytes = bytes(archive.get_item("robots.txt").content)
    assert stored_bytes == expected_bytes
예제 #6
0
def test_urlitem_binary(tmp_path, png_image_url):
    """A URLItem for a PNG URL stores the same bytes as a direct download."""
    # reference copy, fetched independently of the ZIM creator
    reference_path = tmp_path / "file.png"
    save_large_file(png_image_url, reference_path)
    expected_bytes = reference_path.read_bytes()

    zim_path = tmp_path / "test.zim"
    with Creator(zim_path) as creator:
        creator.add_item(URLItem(url=png_image_url))

    archive = Archive(zim_path)
    # no explicit path was given, so the entry is looked up under this path
    entry = archive.get_item("static/images/project-logos/commonswiki.png")
    assert bytes(entry.content) == expected_bytes
예제 #7
0
    def download_jpeg_image_and_convert(
        self, url, fpath, preset_options=None, resize=None
    ):
        """Download a JPEG image, then convert and optimize it into ``fpath``.

        The output format is the one ``optimize_image`` derives from ``fpath``
        (called with ``convert=True``).

        Args:
            url: Address of the JPEG image to download.
            fpath: Destination path of the converted image.
            preset_options: Optional mapping of extra keyword arguments
                forwarded to ``optimize_image``. Defaults to no extra options.
            resize: Optional ``(width, height)`` pair; when given the image is
                resized with the ``"cover"`` method before optimization.
        """
        # avoid a shared mutable default argument
        if preset_options is None:
            preset_options = {}

        # only the reserved file name is needed: close the handle right away
        with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp_fh:
            org_jpeg_path = pathlib.Path(tmp_fh.name)

        try:
            save_large_file(url, org_jpeg_path)
            if resize is not None:
                resize_image(
                    org_jpeg_path,
                    width=resize[0],
                    height=resize[1],
                    method="cover",
                )
            optimize_image(
                org_jpeg_path, fpath, convert=True, delete_src=True, **preset_options
            )
        finally:
            # presumably removed by optimize_image (delete_src=True) on success;
            # make sure it also goes away when any step above fails
            if org_jpeg_path.exists():
                org_jpeg_path.unlink()
        logger.debug(f"Converted {org_jpeg_path} to {fpath} and optimized ")
예제 #8
0
    def download_and_extract_archive(self):
        """Fetch the archive when it is remote, then unzip it into ``self.files_path``.

        ``self.archive`` is treated as a URL when it starts with ``"http"``;
        in that case it is first saved to ``self.archive_path``. Extraction
        is delegated to the external ``unzip`` program and raises
        ``subprocess.CalledProcessError`` on a non-zero exit status.
        """
        source = self.archive
        if source.startswith("http"):
            # remote archive: fetch it to its local path first
            logger.info(f"Downloading archive at {source}")
            save_large_file(source, self.archive_path)

        logger.info(
            f"Extracting ZIP archive {self.archive_path} to {self.files_path}")

        # -u: update/create files, -q: quiet, -D: don't restore directory
        # timestamps, -d: extraction directory
        unzip_cmd = ["unzip", "-u", "-q", "-D"]
        unzip_cmd.append(str(self.archive_path))
        unzip_cmd.extend(["-d", str(self.files_path)])
        logger.debug(nicer_args_join(unzip_cmd))
        subprocess.run(unzip_cmd, check=True)
예제 #9
0
def prepare_ogvjs_folder(tmp_path, videojs_url, ogvjs_url, videojs_ogvjs_url):
    """Download and unpack the video.js, ogv.js and videojs-ogvjs assets into ``tmp_path``.

    Each ZIP is downloaded only if it is not already present in ``tmp_path``.
    Results:
      - ``tmp_path/videojs``: content of the video.js archive
      - ``tmp_path/ogvjs``: the ``ogvjs-1.6.1`` folder of the ogv.js archive
      - ``tmp_path/videojs-ogvjs.js``: the plugin script from its archive
    """
    # --- video.js: extracted as-is into a fresh folder
    videojs_archive = tmp_path / "video-js-7.6.4.zip"
    if not videojs_archive.exists():
        save_large_file(videojs_url, videojs_archive)
    videojs_target = tmp_path / "videojs"
    shutil.rmtree(videojs_target, ignore_errors=True)
    videojs_target.mkdir()
    with zipfile.ZipFile(videojs_archive) as archive:
        archive.extractall(videojs_target)

    # --- ogv.js: extracted to a scratch folder, then its inner folder is
    # renamed onto the final location
    ogvjs_archive = tmp_path / "ogvjs-1.6.1.zip"
    if not ogvjs_archive.exists():
        save_large_file(ogvjs_url, ogvjs_archive)
    ogvjs_target = tmp_path / "ogvjs"
    ogvjs_scratch = tmp_path / "ogvjs_tmp"
    shutil.rmtree(ogvjs_target, ignore_errors=True)
    shutil.rmtree(ogvjs_scratch, ignore_errors=True)
    ogvjs_target.mkdir()
    with zipfile.ZipFile(ogvjs_archive) as archive:
        archive.extractall(ogvjs_scratch)
    inner_folder = ogvjs_scratch / "ogvjs-1.6.1"
    inner_folder.rename(ogvjs_target)

    # --- videojs-ogvjs: only the built script is needed, moved to the root
    plugin_archive = tmp_path / "v1.3.1.zip"
    if not plugin_archive.exists():
        save_large_file(videojs_ogvjs_url, plugin_archive)
    script_member = "videojs-ogvjs-1.3.1/dist/videojs-ogvjs.js"
    with zipfile.ZipFile(plugin_archive) as archive:
        archive.extract(script_member, tmp_path)
    extracted_script = tmp_path / script_member
    extracted_script.rename(tmp_path / "videojs-ogvjs.js")
예제 #10
0
파일: scraper.py 프로젝트: satyamtg/ted
    def download_video_data(self):
        """Download video, speaker image and thumbnail for every loaded video.

        Metadata is loaded through ``self.load_meta_from_file()``; then, for
        each entry of ``self.videos``, files are stored under
        ``self.videos_dir/{video id}/`` as ``video.mp4``, ``speaker.jpg`` and
        ``thumbnail.jpg``. Files that already exist are not downloaded again.
        Finally ``post_process_video`` is called on the video folder.

        A failing video download is logged and does not stop the loop;
        failures of the image downloads propagate to the caller.
        """
        # load the dumped metadata
        self.load_meta_from_file()
        for video in self.videos:
            # set up variables
            video_id = str(video["id"])
            video_title = video["title"]
            video_link = video["video_link"]
            video_speaker = video["speaker_picture"]
            video_thumbnail = video["thumbnail"]
            video_dir = self.videos_dir.joinpath(video_id)
            video_file_path = video_dir.joinpath("video.mp4")
            speaker_path = video_dir.joinpath("speaker.jpg")
            thumbnail_path = video_dir.joinpath("thumbnail.jpg")

            # ensure that video directory exists (exist_ok avoids the race
            # between a separate existence check and the creation)
            video_dir.mkdir(parents=True, exist_ok=True)

            # download video
            if not video_file_path.exists():
                logger.debug(f"Downloading {video_title}")
                try:
                    save_large_file(video_link, video_file_path)
                except Exception as exc:
                    logger.error(f"Could not download {video_file_path}")
                    # keep the failure reason available for troubleshooting
                    logger.debug(exc)
            else:
                logger.debug(f"video.mp4 already exists. Skipping video {video_title}")

            # download an image of the speaker
            if not speaker_path.exists():
                # the metadata uses the string "None" or "" when there is no picture
                if video_speaker == "None" or video_speaker == "":
                    logger.debug("Speaker doesn't have an image")
                else:
                    logger.debug(f"Downloading Speaker image for {video_title}")
                    save_large_file(video_speaker, speaker_path)
            else:
                logger.debug(f"speaker.jpg already exists for {video_title}")

            # download the thumbnail of the video
            if not thumbnail_path.exists():
                logger.debug(f"Downloading thumbnail for {video_title}")
                save_large_file(video_thumbnail, thumbnail_path)
            else:
                logger.debug(f"Thumbnail already exists for {video_title}")

            # recompress if necessary
            post_process_video(
                video_dir, video_id, self.video_format, self.low_quality,
            )
예제 #11
0
    def download_video_files(self, video):
        """download all video files (video, thumbnail, speaker)

        Files go to ``self.videos_dir/{video id}/``: the source video as
        ``video.mp4``, the final video as ``video.{self.video_format}``, and
        the images as ``speaker.webp`` and ``thumbnail.webp``.

        When ``self.s3_storage`` is set, the final video is first looked up in
        the cache; a video that had to be downloaded is post-processed and,
        if that succeeded, uploaded to the cache.
        """
        video_id = str(video["id"])
        # first entry of the title list (presumably English when available)
        video_title = video["title"][0]["text"]
        video_link = video["video_link"]
        video_speaker = video["speaker_picture"]
        video_thumbnail = video["thumbnail"]

        video_dir = self.videos_dir.joinpath(video_id)
        source_video_path = video_dir.joinpath("video.mp4")
        final_video_path = video_dir.joinpath(f"video.{self.video_format}")
        speaker_path = video_dir.joinpath("speaker.webp")
        thumbnail_path = video_dir.joinpath("thumbnail.webp")

        # ensure that video directory exists
        if not video_dir.exists():
            video_dir.mkdir(parents=True)

        # mp4 gets its own preset, every other format falls back to webm
        preset_cls = VideoMp4Low if self.video_format == "mp4" else VideoWebmLow
        preset = preset_cls()

        # try the cache first, then fall back to an actual download
        cache_hit = False
        logger.debug(f"Downloading {video_title}")
        if self.s3_storage:
            s3_key = f"{self.video_format}/{self.video_quality}/{video_id}"
            cache_hit = self.download_from_cache(
                s3_key, final_video_path, preset.VERSION
            )
        if not cache_hit:
            try:
                if "https://" in video_link:
                    # direct file link
                    save_large_file(video_link, source_video_path)
                else:
                    # anything else is handed to the youtube downloader
                    quality = BestWebm if self.video_format == "webm" else BestMp4
                    options = quality.get_options(
                        target_dir=video_dir, filepath=pathlib.Path("video.%(ext)s")
                    )
                    self.yt_downloader.download(video_link, options)
            except Exception:
                logger.error(f"Could not download {source_video_path}")

        # download speaker and thumbnail images
        self.download_speaker_image(video_id, video_title, video_speaker, speaker_path)
        self.download_thumbnail(video_id, video_title, video_thumbnail, thumbnail_path)

        # a cached video is already in its final form: nothing left to do
        if cache_hit:
            return

        # recompress if necessary
        try:
            post_process_video(
                video_dir,
                video_id,
                preset,
                self.video_format,
                self.low_quality,
            )
        except Exception as e:
            logger.error(f"Failed to post process video {video_id}")
            logger.debug(e)
        else:
            # upload to cache only if recompress was successful
            if self.s3_storage:
                self.upload_to_cache(s3_key, final_video_path, preset.VERSION)
예제 #12
0
def test_large_download_https(tmp_path, valid_https_url):
    """Download ``valid_https_url`` with save_large_file and validate the result."""
    target = tmp_path.joinpath("favicon.ico")
    save_large_file(valid_https_url, target)
    assert_downloaded_file(valid_https_url, target)
예제 #13
0
def test_urlprovider_nolength(tmp_path, png_image_url, png_image):
    """URL-based items work against a server that sends no Content-Length header.

    A PNG is downloaded locally, then served by a throw-away HTTP server
    (started as a subprocess) whose handler never sends ``Content-Length``.
    Items are added from that server using ``URLItem`` (with and without
    ``use_disk``) and ``SpecialURLProviderItem``; the stored content must
    equal the original PNG bytes.
    """

    # save url's content locally using external tool
    # NOTE: this rebinds the `png_image` argument to a fresh local path
    png_image = tmp_path / "original.png"
    save_large_file(png_image_url, png_image)
    with open(png_image, "rb") as fh:
        png_image_bytes = fh.read()

    # create and start an http server without Content-Length support
    server_fpath = tmp_path / "httpd.py"
    port = random.randint(10000, 20000)
    server_code = """
from http.server import BaseHTTPRequestHandler, HTTPServer

class handler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_response(200)
        self.send_header("Content-type", "image/png")
        if "gzip" in self.path:
            self.send_header("Content-Encoding", "gzip")
        self.end_headers()
        with open("{src}", "rb") as fh:
            self.wfile.write(fh.read())


with HTTPServer(('', {port}), handler) as server:
    server.serve_forever()

"""
    with open(server_fpath, "w") as fh:
        fh.write(
            server_code.replace("{port}",
                                str(port)).replace("{src}", str(png_image)))

    httpd = subprocess.Popen([sys.executable, server_fpath])
    time.sleep(2)  # allow http server to start

    fpath = tmp_path / "test.zim"
    try:
        with tempfile.TemporaryDirectory() as tmp_dir, Creator(
                fpath) as creator:
            tmp_dir = pathlib.Path(tmp_dir)
            creator.add_item(
                URLItem(
                    url=f"http://localhost:{port}/hoho.png",
                    path="B",
                    tmp_dir=tmp_dir,
                    use_disk=True,
                ))
            creator.add_item(
                URLItem(url=f"http://localhost:{port}/home.png",
                        tmp_dir=tmp_dir))

            creator.add_item(
                SpecialURLProviderItem(url=f"http://localhost:{port}/home.png",
                                       mimetype="image/png"))
    finally:
        httpd.terminate()
        # reap the child so no server process outlives the test
        try:
            httpd.wait(timeout=5)
        except subprocess.TimeoutExpired:
            httpd.kill()
            httpd.wait()

    zim = Archive(fpath)
    assert bytes(zim.get_item("home.png").content) == png_image_bytes
    assert bytes(zim.get_item("B").content) == png_image_bytes