def save_channel_branding(channels_dir, channel_id, save_banner=False):
    """download, save and resize profile [and banner] of a channel"""
    channel_json = get_channel_json(channel_id)

    thumbnails = channel_json["snippet"]["thumbnails"]
    for quality in ("medium", "default"):  # high:800px, medium:240px, default:88px
        if quality in thumbnails:
            thumbnail_url = thumbnails[quality]["url"]
            break

    channel_dir = channels_dir.joinpath(channel_id)
    channel_dir.mkdir(exist_ok=True)

    profile_path = channel_dir.joinpath("profile.jpg")
    if not profile_path.exists():
        stream_file(thumbnail_url, profile_path)
        # resize profile as we only use it at 100px/80px square
        resize_image(profile_path, width=100, height=100)

    # currently disabled as the bannerImageUrl property was deprecated
    # without an alternative way to retrieve it (using the API)
    # See: https://developers.google.com/youtube/v3/revision_history#september-9,-2020
    if save_banner and False:
        banner = channel_json["brandingSettings"]["image"]["bannerImageUrl"]
        banner_path = channel_dir.joinpath("banner.jpg")
        if not banner_path.exists():
            stream_file(banner, banner_path)
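# A minimal usage sketch for save_channel_branding, assuming a pathlib-based
# layout. "UCxxxx" is a hypothetical channel ID, and get_channel_json /
# stream_file / resize_image must be importable as in the function above.
from pathlib import Path

channels_dir = Path("build/channels")
channels_dir.mkdir(parents=True, exist_ok=True)
save_channel_branding(channels_dir, "UCxxxx", save_banner=False)
# -> build/channels/UCxxxx/profile.jpg, resized to 100x100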
def add_html5_node(self, node_id):
    """Add content from this `html5` node to zim

    An html5 node is a single ZIP file containing a standalone HTML app
    whose entrypoint is a file named index.html

    We extract and add each file from the ZIP to /{node_id}/

    Note: Studio doesn't enforce the mandatory index.html, thus allowing
    invalid html5 apps (unreachable)"""
    file = self.db.get_node_file(node_id, thumbnail=False)
    if not file:
        return

    # download ZIP file to memory
    ark_url, ark_name = get_kolibri_url_for(file["id"], file["ext"])
    ark_data = io.BytesIO()
    stream_file(url=ark_url, byte_stream=ark_data)

    # loop over zip members and create an entry for each
    zip_ark = zipfile.ZipFile(ark_data)
    for ark_member in zip_ark.namelist():
        with self.creator_lock:
            self.creator.add_item_for(
                path=f"{node_id}/{ark_member}",
                content=zip_ark.open(ark_member).read(),
            )
def get_all_sites() -> List[dict]:
    """List of all StackExchange Sites with basic details"""
    url = f"{DOWNLOAD_ROOT}/Sites.xml"
    buf = io.BytesIO()
    stream_file(url, byte_stream=buf)
    parser = XMLtoDict()
    return parser.parse(buf.getvalue()).get("sites", {}).get("row", [])
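# A usage sketch for get_all_sites. The "@Name"/"@Url" keys follow the usual
# xmltodict convention for XML attributes; that is an assumption about
# XMLtoDict's output, not confirmed by the original snippet.
for site in get_all_sites():
    print(site.get("@Name"), site.get("@Url"))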
def real_zim_file(tmpdir_factory):
    from zimscraperlib.download import stream_file

    dst = tmpdir_factory.mktemp("data").join("small.zim")
    stream_file(
        "https://github.com/openzim/zim-testing-suite/raw/v0.3/data/withns/"
        "wikipedia_en_climate_change_nopic_2020-01.zim",
        dst,
    )
    return dst
def test_filelikeprovider_nosize(tmp_path, png_image_url):
    fileobj = io.BytesIO()
    stream_file(png_image_url, byte_stream=fileobj)

    fpath = tmp_path / "test.zim"
    with Creator(fpath) as creator:
        creator.add_item(FileLikeProviderItem(fileobj=fileobj, path="one.png"))

    zim = Archive(fpath)
    assert bytes(zim.get_item("one.png").content) == fileobj.getvalue()
def small_zim_file(tmpdir_factory):
    from zimscraperlib.download import stream_file

    dst = tmpdir_factory.mktemp("data").join("small.zim")
    stream_file(
        "https://github.com/openzim/zim-testing-suite/raw/v0.3/data/nons/"
        "small.zim",
        dst,
    )
    return dst
def get_all_assets(cache):
    for path, source in ASSETS:
        target = cache.joinpath(path)
        if target.exists():
            continue
        if not target.parent.exists():
            target.parent.mkdir(exist_ok=True, parents=True)
        print(f"Downloading {source} into {target}")
        stream_file(url=source, fpath=target)
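# get_all_assets expects ASSETS to be an iterable of (relative path, URL)
# pairs, as read from the loop above. A hypothetical example entry; the
# actual asset list and CDN are not part of the original snippet.
ASSETS = [
    (
        "vendors/jquery.min.js",
        "https://cdn.jsdelivr.net/npm/jquery@3.6.0/dist/jquery.min.js",
    ),
]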
def check_branding_values(self):
    """checks that user-supplied images and colors are valid (so as to fail early)

    Images are checked for existence or downloaded then resized
    Colors are checked for validity"""

    # skip this step if none of the related values were supplied
    if not any(
        (
            self.profile_image,
            self.banner_image,
            self.main_color,
            self.secondary_color,
        )
    ):
        return

    logger.info("checking your branding files and values")
    if self.profile_image:
        if self.profile_image.startswith("http"):
            stream_file(self.profile_image, self.profile_path)
        else:
            if not self.profile_image.exists():
                raise IOError(
                    f"--profile image could not be found: {self.profile_image}"
                )
            shutil.move(self.profile_image, self.profile_path)
        resize_image(self.profile_path, width=100, height=100, method="thumbnail")
    if self.banner_image:
        if self.banner_image.startswith("http"):
            stream_file(self.banner_image, self.banner_path)
        else:
            if not self.banner_image.exists():
                raise IOError(
                    f"--banner image could not be found: {self.banner_image}"
                )
            shutil.move(self.banner_image, self.banner_path)
        resize_image(self.banner_path, width=1060, height=175, method="thumbnail")
    if self.main_color and not is_hex_color(self.main_color):
        raise ValueError(
            f"--main-color is not a valid hex color: {self.main_color}"
        )
    if self.secondary_color and not is_hex_color(self.secondary_color):
        raise ValueError(
            f"--secondary-color is not a valid hex color: {self.secondary_color}"
        )
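# is_hex_color above comes from the scraper's own utils. A minimal sketch of
# what such a validator could look like; the regex and signature are
# assumptions, not the original implementation.
import re

def is_hex_color(text: str) -> bool:
    """whether text is a valid #RGB or #RRGGBB hex color code"""
    return bool(re.match(r"^#(?:[0-9a-fA-F]{3}){1,2}$", text))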
def download_db(self):
    """download channel DB from kolibri and initialize DB

    Also sets the root_id with the DB-computed value"""
    # download database
    fpath = self.build_dir.joinpath("db.sqlite3")
    logger.debug(f"Downloading database into {fpath.name}…")
    stream_file(
        f"{STUDIO_URL}/content/databases/{self.channel_id}.sqlite3",
        fpath,
    )
    self.db = KolibriDB(fpath, self.root_id)
    self.root_id = self.db.root_id
def add_html5_node(self, node_id):
    """Add content from this `html5` node to zim

    An html5 node is a single ZIP file containing a standalone HTML app
    whose entrypoint is a file named index.html

    We extract and add each file from the ZIP to /{node_id}/

    Note: Studio doesn't enforce the mandatory index.html, thus allowing
    invalid html5 apps (unreachable)"""
    file = self.db.get_node_file(node_id, thumbnail=False)
    if not file:
        return

    # download ZIP file to memory
    ark_url, ark_name = get_kolibri_url_for(file["id"], file["ext"])
    ark_data = io.BytesIO()
    stream_file(url=ark_url, byte_stream=ark_data)

    # loop over zip members and create an entry (or redirect, if using dedup) for each
    zip_ark = zipfile.ZipFile(ark_data)
    for ark_member in zip_ark.namelist():
        if not self.dedup_html_files:
            with self.creator_lock:
                self.creator.add_item_for(
                    path=f"{node_id}/{ark_member}",
                    content=zip_ark.open(ark_member).read(),
                )
            continue

        # compute hash of the file and add an entry if not already in the zim
        content = zip_ark.open(ark_member).read()
        content_hash = hashlib.md5(content).hexdigest()  # nosec
        if content_hash not in self.html_files_cache:
            self.html_files_cache.append(content_hash)
            with self.creator_lock:
                self.creator.add_item_for(
                    path=f"html5_files/{content_hash}",
                    content=content,
                )

        # add a redirect to the unique hash-based entry for that file's path
        with self.creator_lock:
            self.creator.add_redirect(
                path=f"{node_id}/{ark_member}",
                target_path=f"html5_files/{content_hash}",
            )

    logger.debug(f"Added HTML5 node #{node_id}")
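# The dedup path above hashes each ZIP member, stores one copy per unique
# hash, and redirects every duplicate path to it. A standalone sketch of that
# pattern (names and sample data are illustrative, not from the original
# code); note that a set keeps the membership test O(1), whereas the list
# used above degrades to O(n) on large channels.
import hashlib

seen = set()
entries = {}    # unique content, keyed by hash-based path
redirects = {}  # duplicate member path -> hash-based path

for path, content in [("a/app.js", b"x"), ("b/app.js", b"x")]:
    digest = hashlib.md5(content).hexdigest()  # nosec: identity only, not security
    if digest not in seen:
        seen.add(digest)
        entries[f"html5_files/{digest}"] = content
    redirects[path] = f"html5_files/{digest}"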
def test_first_block_download(valid_http_url):
    byte_stream = io.BytesIO()
    size, ret = stream_file(
        url=valid_http_url, byte_stream=byte_stream, only_first_block=True
    )
    assert_headers(ret)
    assert len(byte_stream.read()) == 3062
def test_first_block_download(valid_http_url):
    byte_stream = io.BytesIO()
    size, ret = stream_file(
        url=valid_http_url, byte_stream=byte_stream, only_first_block=True
    )
    assert_headers(ret)
    # valid_http_url randomly returns gzip-encoded content;
    # otherwise, the expected size is the default block size
    expected = 3062 if ret.get("Content-Encoding") == "gzip" else 1024
    assert len(byte_stream.read()) <= expected
def get_image_data(self, url: str, **resize_args: dict) -> io.BytesIO:
    """Bytes stream of an optimized, resized WebP of the source image"""
    src, webp = io.BytesIO(), io.BytesIO()
    stream_file(url=url, byte_stream=src)
    with Image.open(src) as img:
        img.save(webp, format="WEBP")
    del src
    resize_args = resize_args or {}
    try:
        resize_image(
            src=webp,
            **resize_args,
            allow_upscaling=False,
        )
    except ImageSizeError as exc:
        logger.debug(f"Resize Error for {url}: {exc}")
    return optimize_webp(
        src=webp,
        lossless=False,
        quality=60,
        method=6,
    )
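# A usage sketch for get_image_data. The resize keywords match
# zimscraperlib's resize_image (width/height/method); "scraper" and the URL
# are hypothetical stand-ins for the host object and a real image.
data = scraper.get_image_data(
    "https://example.com/logo.png", width=48, height=48, method="thumbnail"
)
# data is an io.BytesIO holding an optimized WebP, ready for creator.add_item_for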
def get_version_ident_for(self, url: str) -> str:
    """~version~ of the URL data to use for comparisons. Built from headers"""
    try:
        resp = requests.head(url, headers={"User-Agent": USER_AGENT})
        headers = resp.headers
    except Exception:
        logger.warning(f"Unable to HEAD {url}")
        try:
            _, headers = stream_file(
                url=url,
                headers={"User-Agent": USER_AGENT},
                byte_stream=io.BytesIO(),
                block_size=1,
                only_first_block=True,
            )
        except Exception:
            logger.warning(f"Unable to query image at {url}")
            return
    for header in ("ETag", "Last-Modified", "Content-Length"):
        if headers.get(header):
            return headers.get(header)
    return "-1"
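# A usage sketch: comparing idents across runs to decide whether a cached
# copy is stale. "scraper", "cached_ident" and refresh_cached_copy() are
# hypothetical; only get_version_ident_for comes from the code above.
ident = scraper.get_version_ident_for("https://example.com/logo.png")
if ident is not None and ident != cached_ident:
    refresh_cached_copy()  # hypothetical helper: re-download and re-cache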
def test_missing_dest(tmp_path):
    with pytest.raises(requests.exceptions.ConnectionError):
        stream_file(url="http://some_url", byte_stream=io.BytesIO())
def test_save_https(tmp_path, valid_https_url):
    dest_file = tmp_path / "favicon.ico"
    size, ret = stream_file(url=valid_https_url, fpath=dest_file)
    assert_headers(ret)
    assert_downloaded_file(valid_https_url, dest_file)
def test_no_output_supplied(valid_http_url):
    with pytest.raises(
        ValueError, match="Either file path or a bytesIO object is needed"
    ):
        stream_file(url=valid_http_url)
def test_invalid_url(tmp_path, invalid_url):
    dest_file = tmp_path / "favicon.ico"
    with pytest.raises(requests.exceptions.ConnectionError):
        stream_file(url=invalid_url, fpath=dest_file)
def test_stream_to_bytes(valid_https_url):
    byte_stream = io.BytesIO()
    size, ret = stream_file(url=valid_https_url, byte_stream=byte_stream)
    assert_headers(ret)
    assert byte_stream.read() == requests.get(valid_https_url).content
def download_to_disk(self, file_id, ext):
    """download a Kolibri file to the build-dir using its filename"""
    url, fname = get_kolibri_url_for(file_id, ext)
    fpath = self.build_dir / fname
    stream_file(url, fpath)
    return fpath
def test_save_parent_folder_missing(tmp_path, valid_http_url):
    dest_file = tmp_path / "some-folder" / "favicon.ico"
    with pytest.raises(IOError):
        stream_file(url=valid_http_url, fpath=dest_file)
def add_exercise_node(self, node_id):
    """Add content from this `exercise` node to zim

    An exercise node is composed of a single perseus file.
    A perseus file is a ZIP containing an exercise.json entrypoint and
    other files.

    We extract and add the individual exercises as standalone HTML files
    dependent on a standalone version of the perseus reader from
    https://github.com/Khan/perseus"""

    # find perseus file (should be a single one)
    files = self.db.get_node_files(node_id, thumbnail=False)
    if not files:
        return
    files = sorted(files, key=lambda f: f["prio"])
    perseus_file = next(filter(lambda f: f["supp"] == 0, files))

    # download perseus file
    perseus_url, perseus_name = get_kolibri_url_for(
        perseus_file["id"], perseus_file["ext"]
    )
    perseus_data = io.BytesIO()
    stream_file(url=perseus_url, byte_stream=perseus_data)

    # read JSON manifest from perseus file
    zip_ark = zipfile.ZipFile(perseus_data)
    manifest_name = "exercise.json"
    if manifest_name not in zip_ark.namelist():
        logger.error(f"Exercise node without {manifest_name}")
        return
    manifest = json.loads(read_from_zip(zip_ark, manifest_name))

    # copy exercise content, rewriting internal paths
    # all internal resources to be stored under the {node_id}/ prefix
    assessment_items = []
    for assessment_item in manifest.get("all_assessment_items", []):
        item_path = f"{assessment_item}.json"
        if item_path in zip_ark.namelist():
            perseus_content = read_from_zip(zip_ark, item_path)
            perseus_content = perseus_content.replace(
                r"web+graphie:${☣ LOCALPATH}", f"web+graphie:./{node_id}"
            )
            perseus_content = perseus_content.replace(
                r"${☣ LOCALPATH}", f"./{node_id}"
            )
            assessment_items.append(perseus_content)

    # add all support files to ZIM
    for ark_member in zip_ark.namelist():
        if ark_member == manifest_name:
            continue
        path = f"{node_id}/{ark_member}"
        with self.creator_lock:
            self.creator.add_item_for(
                path=path,
                title="",
                content=read_from_zip(zip_ark, ark_member, as_text=False),
            )
        logger.debug(f"Added exercise support file {path}")

    # prepare and add exercise HTML article
    node = self.db.get_node(node_id, with_parents=True)
    html = self.jinja2_env.get_template("perseus_exercise.html").render(
        node_id=node_id,
        perseus_content=f"[{', '.join(assessment_items)}]",
        questions_count=str(len(assessment_items)),
        **node,
    )
    with self.creator_lock:
        self.creator.add_item_for(
            path=node_id, title=node["title"], content=html, mimetype="text/html"
        )
    logger.debug(f"Added exercise node #{node_id}")
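# What the ${☣ LOCALPATH} rewriting above produces, demonstrated on a tiny
# made-up payload ("deadbeef" is a hypothetical node_id):
content = r'{"url": "web+graphie:${☣ LOCALPATH}/images/a1b2"}'
node_id = "deadbeef"
content = content.replace(r"web+graphie:${☣ LOCALPATH}", f"web+graphie:./{node_id}")
content = content.replace(r"${☣ LOCALPATH}", f"./{node_id}")
print(content)  # {"url": "web+graphie:./deadbeef/images/a1b2"}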
def test_save_http_error(tmp_path, http_error_url):
    dest_file = tmp_path / "favicon.ico"
    with pytest.raises(requests.exceptions.HTTPError):
        stream_file(url=http_error_url, fpath=dest_file)