def save_channel_branding(channels_dir, channel_id, save_banner=False):
    """download, save and resize profile [and banner] of a channel"""
    channel_json = get_channel_json(channel_id)

    thumbnails = channel_json["snippet"]["thumbnails"]
    for quality in ("medium", "default"):  # high:800px, medium:240px, default:88px
        if quality in thumbnails:
            thumbnail_url = thumbnails[quality]["url"]
            break

    channel_dir = channels_dir.joinpath(channel_id)
    channel_dir.mkdir(exist_ok=True)

    profile_path = channel_dir.joinpath("profile.jpg")
    if not profile_path.exists():
        stream_file(thumbnail_url, profile_path)
        # resize profile as we only use it at 100px/80px square
        resize_image(profile_path, width=100, height=100)

    # currently disabled as the bannerImageUrl property was deprecated
    # without an alternative way to retrieve it (using the API)
    # See: https://developers.google.com/youtube/v3/revision_history#september-9,-2020
    if save_banner and False:
        banner = channel_json["brandingSettings"]["image"]["bannerImageUrl"]
        banner_path = channel_dir.joinpath("banner.jpg")
        if not banner_path.exists():
            stream_file(banner, banner_path)
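# A minimal usage sketch for save_channel_branding, assuming a pathlib-based
# layout. "UCxxxx" is a hypothetical channel ID, and get_channel_json /
# stream_file / resize_image must be importable as in the function above.
from pathlib import Path

channels_dir = Path("build/channels")
channels_dir.mkdir(parents=True, exist_ok=True)
save_channel_branding(channels_dir, "UCxxxx", save_banner=False)
# -> build/channels/UCxxxx/profile.jpg, resized to 100x100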
def add_html5_node(self, node_id):
    """Add content from this `html5` node to zim

    An html5 node is a single ZIP file containing a standalone HTML app
    whose entrypoint is a file named index.html

    We extract and add each file from the ZIP to /{node_id}/

    Note: Studio doesn't enforce the mandatory index.html, thus allowing
    invalid html5 apps (unreachable)"""
    file = self.db.get_node_file(node_id, thumbnail=False)
    if not file:
        return

    # download ZIP file to memory
    ark_url, ark_name = get_kolibri_url_for(file["id"], file["ext"])
    ark_data = io.BytesIO()
    stream_file(url=ark_url, byte_stream=ark_data)

    # loop over zip members and create an entry for each
    zip_ark = zipfile.ZipFile(ark_data)
    for ark_member in zip_ark.namelist():
        with self.creator_lock:
            self.creator.add_item_for(
                path=f"{node_id}/{ark_member}",
                content=zip_ark.open(ark_member).read(),
            )
def get_all_sites() -> List[dict]:
    """List of all StackExchange Sites with basic details"""
    url = f"{DOWNLOAD_ROOT}/Sites.xml"
    buf = io.BytesIO()
    stream_file(url, byte_stream=buf)
    parser = XMLtoDict()
    return parser.parse(buf.getvalue()).get("sites", {}).get("row", [])
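# A usage sketch for get_all_sites. The "@Name"/"@Url" keys follow the usual
# xmltodict convention for XML attributes; that is an assumption about
# XMLtoDict's output, not confirmed by the original snippet.
for site in get_all_sites():
    print(site.get("@Name"), site.get("@Url"))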
def real_zim_file(tmpdir_factory):
    from zimscraperlib.download import stream_file

    dst = tmpdir_factory.mktemp("data").join("small.zim")
    stream_file(
        "https://github.com/openzim/zim-testing-suite/raw/v0.3/data/withns/"
        "wikipedia_en_climate_change_nopic_2020-01.zim",
        dst,
    )
    return dst
def test_filelikeprovider_nosize(tmp_path, png_image_url):
    fileobj = io.BytesIO()
    stream_file(png_image_url, byte_stream=fileobj)

    fpath = tmp_path / "test.zim"
    with Creator(fpath) as creator:
        creator.add_item(FileLikeProviderItem(fileobj=fileobj, path="one.png"))

    zim = Archive(fpath)
    assert bytes(zim.get_item("one.png").content) == fileobj.getvalue()
def small_zim_file(tmpdir_factory):
    from zimscraperlib.download import stream_file

    dst = tmpdir_factory.mktemp("data").join("small.zim")
    stream_file(
        "https://github.com/openzim/zim-testing-suite/raw/v0.3/data/nons/"
        "small.zim",
        dst,
    )
    return dst
def get_all_assets(cache):
    for path, source in ASSETS:
        target = cache.joinpath(path)
        if target.exists():
            continue
        if not target.parent.exists():
            target.parent.mkdir(exist_ok=True, parents=True)
        print(f"Downloading {source} into {target}")
        stream_file(url=source, fpath=target)
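# get_all_assets expects ASSETS to be an iterable of (relative path, URL)
# pairs, as read from the loop above. A hypothetical example entry; the
# actual asset list and CDN are not part of the original snippet.
ASSETS = [
    (
        "vendors/jquery.min.js",
        "https://cdn.jsdelivr.net/npm/jquery@3.6.0/dist/jquery.min.js",
    ),
]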
def check_branding_values(self):
    """checks that user-supplied images and colors are valid (so as to fail early)

    Images are checked for existence or downloaded then resized
    Colors are checked for validity"""

    # skip this step if none of the related values were supplied
    if not any(
        (
            self.profile_image,
            self.banner_image,
            self.main_color,
            self.secondary_color,
        )
    ):
        return

    logger.info("checking your branding files and values")
    if self.profile_image:
        if self.profile_image.startswith("http"):
            stream_file(self.profile_image, self.profile_path)
        else:
            if not self.profile_image.exists():
                raise IOError(
                    f"--profile image could not be found: {self.profile_image}"
                )
            shutil.move(self.profile_image, self.profile_path)
        resize_image(self.profile_path, width=100, height=100, method="thumbnail")
    if self.banner_image:
        if self.banner_image.startswith("http"):
            stream_file(self.banner_image, self.banner_path)
        else:
            if not self.banner_image.exists():
                raise IOError(
                    f"--banner image could not be found: {self.banner_image}"
                )
            shutil.move(self.banner_image, self.banner_path)
        resize_image(self.banner_path, width=1060, height=175, method="thumbnail")
    if self.main_color and not is_hex_color(self.main_color):
        raise ValueError(
            f"--main-color is not a valid hex color: {self.main_color}"
        )
    if self.secondary_color and not is_hex_color(self.secondary_color):
        raise ValueError(
            f"--secondary-color is not a valid hex color: {self.secondary_color}"
        )
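# is_hex_color above comes from the scraper's own utils. A minimal sketch of
# what such a validator could look like; the regex and signature are
# assumptions, not the original implementation.
import re

def is_hex_color(text: str) -> bool:
    """whether text is a valid #RGB or #RRGGBB hex color code"""
    return bool(re.match(r"^#(?:[0-9a-fA-F]{3}){1,2}$", text))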
def download_db(self):
    """download channel DB from kolibri and initialize DB

    Also sets the root_id with the DB-computed value"""
    # download database
    fpath = self.build_dir.joinpath("db.sqlite3")
    logger.debug(f"Downloading database into {fpath.name}…")
    stream_file(
        f"{STUDIO_URL}/content/databases/{self.channel_id}.sqlite3",
        fpath,
    )
    self.db = KolibriDB(fpath, self.root_id)
    self.root_id = self.db.root_id
def add_html5_node(self, node_id):
    """Add content from this `html5` node to zim

    An html5 node is a single ZIP file containing a standalone HTML app
    whose entrypoint is a file named index.html

    We extract and add each file from the ZIP to /{node_id}/

    Note: Studio doesn't enforce the mandatory index.html, thus allowing
    invalid html5 apps (unreachable)"""
    file = self.db.get_node_file(node_id, thumbnail=False)
    if not file:
        return

    # download ZIP file to memory
    ark_url, ark_name = get_kolibri_url_for(file["id"], file["ext"])
    ark_data = io.BytesIO()
    stream_file(url=ark_url, byte_stream=ark_data)

    # loop over zip members and create an entry (or redirect, if using dedup) for each
    zip_ark = zipfile.ZipFile(ark_data)
    for ark_member in zip_ark.namelist():
        if not self.dedup_html_files:
            with self.creator_lock:
                self.creator.add_item_for(
                    path=f"{node_id}/{ark_member}",
                    content=zip_ark.open(ark_member).read(),
                )
            continue

        # compute hash of the file and add an entry if not already in the zim
        content = zip_ark.open(ark_member).read()
        content_hash = hashlib.md5(content).hexdigest()  # nosec
        if content_hash not in self.html_files_cache:
            self.html_files_cache.append(content_hash)
            with self.creator_lock:
                self.creator.add_item_for(
                    path=f"html5_files/{content_hash}",
                    content=content,
                )

        # add a redirect to the unique hash-based entry for that file's path
        with self.creator_lock:
            self.creator.add_redirect(
                path=f"{node_id}/{ark_member}",
                target_path=f"html5_files/{content_hash}",
            )

    logger.debug(f"Added HTML5 node #{node_id}")
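# The dedup path above hashes each ZIP member, stores one copy per unique
# hash, and redirects every duplicate path to it. A standalone sketch of that
# pattern (names and sample data are illustrative, not from the original
# code); note that a set keeps the membership test O(1), whereas the list
# used above degrades to O(n) on large channels.
import hashlib

seen = set()
entries = {}    # unique content, keyed by hash-based path
redirects = {}  # duplicate member path -> hash-based path

for path, content in [("a/app.js", b"x"), ("b/app.js", b"x")]:
    digest = hashlib.md5(content).hexdigest()  # nosec: identity only, not security
    if digest not in seen:
        seen.add(digest)
        entries[f"html5_files/{digest}"] = content
    redirects[path] = f"html5_files/{digest}"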
def test_first_block_download(valid_http_url):
    byte_stream = io.BytesIO()
    size, ret = stream_file(
        url=valid_http_url, byte_stream=byte_stream, only_first_block=True
    )
    assert_headers(ret)
    assert len(byte_stream.read()) == 3062
def test_first_block_download(valid_http_url):
    byte_stream = io.BytesIO()
    size, ret = stream_file(
        url=valid_http_url, byte_stream=byte_stream, only_first_block=True
    )
    assert_headers(ret)
    # valid_http_url randomly returns gzip-encoded content;
    # otherwise, the expected size is the default block size
    expected = 3062 if ret.get("Content-Encoding") == "gzip" else 1024
    assert len(byte_stream.read()) <= expected
def get_image_data(self, url: str, **resize_args: dict) -> io.BytesIO:
    """Bytes stream of an optimized, resized WebP of the source image"""
    src, webp = io.BytesIO(), io.BytesIO()
    stream_file(url=url, byte_stream=src)
    with Image.open(src) as img:
        img.save(webp, format="WEBP")
    del src
    resize_args = resize_args or {}
    try:
        resize_image(
            src=webp,
            **resize_args,
            allow_upscaling=False,
        )
    except ImageSizeError as exc:
        logger.debug(f"Resize Error for {url}: {exc}")
    return optimize_webp(
        src=webp,
        lossless=False,
        quality=60,
        method=6,
    )
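# A usage sketch for get_image_data. The resize keywords match
# zimscraperlib's resize_image (width/height/method); "scraper" and the URL
# are hypothetical stand-ins for the host object and a real image.
data = scraper.get_image_data(
    "https://example.com/logo.png", width=48, height=48, method="thumbnail"
)
# data is an io.BytesIO holding an optimized WebP, ready for creator.add_item_for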
def get_version_ident_for(self, url: str) -> str:
    """~version~ of the URL data to use for comparisons. Built from headers"""
    try:
        resp = requests.head(url, headers={"User-Agent": USER_AGENT})
        headers = resp.headers
    except Exception:
        logger.warning(f"Unable to HEAD {url}")
        try:
            _, headers = stream_file(
                url=url,
                headers={"User-Agent": USER_AGENT},
                byte_stream=io.BytesIO(),
                block_size=1,
                only_first_block=True,
            )
        except Exception:
            logger.warning(f"Unable to query image at {url}")
            return
    for header in ("ETag", "Last-Modified", "Content-Length"):
        if headers.get(header):
            return headers.get(header)
    return "-1"
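# A usage sketch: comparing idents across runs to decide whether a cached
# copy is stale. "scraper", "cached_ident" and refresh_cached_copy() are
# hypothetical; only get_version_ident_for comes from the code above.
ident = scraper.get_version_ident_for("https://example.com/logo.png")
if ident is not None and ident != cached_ident:
    refresh_cached_copy()  # hypothetical helper: re-download and re-cache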
def test_missing_dest(tmp_path):
    with pytest.raises(requests.exceptions.ConnectionError):
        stream_file(url="http://some_url", byte_stream=io.BytesIO())
def test_save_https(tmp_path, valid_https_url):
    dest_file = tmp_path / "favicon.ico"
    size, ret = stream_file(url=valid_https_url, fpath=dest_file)
    assert_headers(ret)
    assert_downloaded_file(valid_https_url, dest_file)
def test_no_output_supplied(valid_http_url):
    with pytest.raises(
        ValueError, match="Either file path or a bytesIO object is needed"
    ):
        stream_file(url=valid_http_url)
def test_invalid_url(tmp_path, invalid_url):
    dest_file = tmp_path / "favicon.ico"
    with pytest.raises(requests.exceptions.ConnectionError):
        stream_file(url=invalid_url, fpath=dest_file)
def test_stream_to_bytes(valid_https_url):
    byte_stream = io.BytesIO()
    size, ret = stream_file(url=valid_https_url, byte_stream=byte_stream)
    assert_headers(ret)
    assert byte_stream.read() == requests.get(valid_https_url).content
def download_to_disk(self, file_id, ext):
    """download a Kolibri file to the build-dir using its filename"""
    url, fname = get_kolibri_url_for(file_id, ext)
    fpath = self.build_dir / fname
    stream_file(url, fpath)
    return fpath
def test_save_parent_folder_missing(tmp_path, valid_http_url):
    dest_file = tmp_path / "some-folder" / "favicon.ico"
    with pytest.raises(IOError):
        stream_file(url=valid_http_url, fpath=dest_file)
def add_exercise_node(self, node_id):
    """Add content from this `exercise` node to zim

    An exercise node is composed of a single perseus file.
    A perseus file is a ZIP containing an exercise.json entrypoint and
    other files.

    We extract and add the individual exercises as standalone HTML files
    dependent on a standalone version of the perseus reader from
    https://github.com/Khan/perseus"""

    # find perseus file (should be a single one)
    files = self.db.get_node_files(node_id, thumbnail=False)
    if not files:
        return
    files = sorted(files, key=lambda f: f["prio"])
    perseus_file = next(filter(lambda f: f["supp"] == 0, files))

    # download perseus file
    perseus_url, perseus_name = get_kolibri_url_for(
        perseus_file["id"], perseus_file["ext"]
    )
    perseus_data = io.BytesIO()
    stream_file(url=perseus_url, byte_stream=perseus_data)

    # read JSON manifest from perseus file
    zip_ark = zipfile.ZipFile(perseus_data)
    manifest_name = "exercise.json"
    if manifest_name not in zip_ark.namelist():
        logger.error(f"Exercise node without {manifest_name}")
        return
    manifest = json.loads(read_from_zip(zip_ark, manifest_name))

    # copy exercise content, rewriting internal paths
    # all internal resources to be stored under the {node_id}/ prefix
    assessment_items = []
    for assessment_item in manifest.get("all_assessment_items", []):
        item_path = f"{assessment_item}.json"
        if item_path in zip_ark.namelist():
            perseus_content = read_from_zip(zip_ark, item_path)
            perseus_content = perseus_content.replace(
                r"web+graphie:${☣ LOCALPATH}", f"web+graphie:./{node_id}"
            )
            perseus_content = perseus_content.replace(
                r"${☣ LOCALPATH}", f"./{node_id}"
            )
            assessment_items.append(perseus_content)

    # add all support files to ZIM
    for ark_member in zip_ark.namelist():
        if ark_member == manifest_name:
            continue
        path = f"{node_id}/{ark_member}"
        with self.creator_lock:
            self.creator.add_item_for(
                path=path,
                title="",
                content=read_from_zip(zip_ark, ark_member, as_text=False),
            )
        logger.debug(f"Added exercise support file {path}")

    # prepare and add exercise HTML article
    node = self.db.get_node(node_id, with_parents=True)
    html = self.jinja2_env.get_template("perseus_exercise.html").render(
        node_id=node_id,
        perseus_content=f"[{', '.join(assessment_items)}]",
        questions_count=str(len(assessment_items)),
        **node,
    )
    with self.creator_lock:
        self.creator.add_item_for(
            path=node_id, title=node["title"], content=html, mimetype="text/html"
        )
    logger.debug(f"Added exercise node #{node_id}")
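# What the ${☣ LOCALPATH} rewriting above produces, demonstrated on a tiny
# made-up payload ("deadbeef" is a hypothetical node_id):
content = r'{"url": "web+graphie:${☣ LOCALPATH}/images/a1b2"}'
node_id = "deadbeef"
content = content.replace(r"web+graphie:${☣ LOCALPATH}", f"web+graphie:./{node_id}")
content = content.replace(r"${☣ LOCALPATH}", f"./{node_id}")
print(content)  # {"url": "web+graphie:./deadbeef/images/a1b2"}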
def test_save_http_error(tmp_path, http_error_url):
    dest_file = tmp_path / "favicon.ico"
    with pytest.raises(requests.exceptions.HTTPError):
        stream_file(url=http_error_url, fpath=dest_file)