def download_wet_file(path, dl_dir):
    """Fetch `path` from `_DOWNLOAD_HOST` into `dl_dir` unless already present.

    The file is first downloaded into a uniquely named scratch directory
    (`<out_path>.incomplete<hex>`) and then renamed onto the final location,
    so a partially written download never sits at the output path. The
    scratch directory is removed whether or not the download succeeded.

    Args:
      path: Path of the file relative to `_DOWNLOAD_HOST`; also used as the
        path relative to `dl_dir` for the local copy.
      dl_dir: Directory under which the file is stored.

    Returns:
      The local output path, `f"{dl_dir}/{path}"`.
    """
    source_url = f"{_DOWNLOAD_HOST}/{path}"
    out_path = f"{dl_dir}/{path}"

    # Nothing to do when a previous run already produced the file.
    if tf.io.gfile.exists(out_path):
        record_event = c4_utils.get_counter_inc_fn("download_wet_url")
        record_event("exists")
        return out_path

    # Random suffix keeps concurrent attempts from sharing a scratch dir.
    scratch_dir = f"{out_path}.incomplete{uuid.uuid4().hex}"
    try:
        tf.io.gfile.makedirs(scratch_dir)
        fetcher = tfds.download.downloader.get_downloader()
        with fetcher.tqdm():
            pending = fetcher.download(source_url, scratch_dir)
            # TODO(slebedev): Investigate why pytype infers Promise[Future[...]].
            fetched_path = pending.get().path  # type: ignore
        tf.io.gfile.rename(os.fspath(fetched_path), out_path, overwrite=True)
    finally:
        # Always clean up, including when the download or rename raised.
        if tf.io.gfile.exists(scratch_dir):
            tf.io.gfile.rmtree(scratch_dir)

    # NOTE(review): counted after cleanup and outside `finally`, i.e. only for
    # successful downloads -- confirm against the intended layout.
    record_event = c4_utils.get_counter_inc_fn("download_wet_url")
    record_event("downloaded")
    return out_path
def _emit_examples(el):
    """Turn a `(key, features)` pair into a `(url, example)` pair.

    Bumps the "examples" counter obtained for the enclosing scope's `split`
    before reading the element.

    Args:
      el: A 2-element iterable whose second item is a mapping providing the
        "url", "text", "content-type", "content-length" and "timestamp" keys.

    Returns:
      A tuple of the URL and a dict holding exactly those five fields.
    """
    c4_utils.get_counter_inc_fn(split)("examples")
    _unused_key, record = el
    # Field order here fixes the key order of the emitted example dict.
    field_names = (
        "url",
        "text",
        "content-type",
        "content-length",
        "timestamp",
    )
    example = {name: record[name] for name in field_names}
    return example["url"], example
def download_wet_file(path, dl_dir):
    """Download `path` from `_DOWNLOAD_HOST` to `dl_dir`, skipping existing files.

    The download lands in a uniquely named `.incomplete<hex>` directory beside
    the target and is renamed into place afterwards; that directory is deleted
    on both success and failure.

    Args:
      path: Remote path relative to `_DOWNLOAD_HOST`, reused as the local
        path relative to `dl_dir`.
      dl_dir: Local directory that receives the file.

    Returns:
      The local output path, `f"{dl_dir}/{path}"`.
    """
    remote_url = f"{_DOWNLOAD_HOST}/{path}"
    out_path = f"{dl_dir}/{path}"

    already_there = tf.io.gfile.exists(out_path)
    if already_there:
        c4_utils.get_counter_inc_fn("download_wet_url")("exists")
        return out_path

    # Unique suffix so parallel workers never collide on the staging dir.
    staging_dir = f"{out_path}.incomplete{uuid.uuid4().hex}"
    try:
        tf.io.gfile.makedirs(staging_dir)
        dl_manager = tfds.download.downloader.get_downloader()
        with dl_manager.tqdm():
            result = dl_manager.download(remote_url, staging_dir).get()
            staged_file = result.path
        tf.io.gfile.rename(os.fspath(staged_file), out_path, overwrite=True)
    finally:
        # Remove the staging dir even if the download or rename failed.
        if tf.io.gfile.exists(staging_dir):
            tf.io.gfile.rmtree(staging_dir)

    # NOTE(review): placed after the try/finally so only completed downloads
    # are counted -- confirm this matches the intended layout.
    c4_utils.get_counter_inc_fn("download_wet_url")("downloaded")
    return out_path
def _emit_examples(el):
    """Convert a `(url, text)` pair into an example dict.

    Bumps the "emitted" value on the "emit-examples" counter function before
    unpacking the element.

    Args:
      el: A 2-element iterable of `(url, text)`.

    Returns:
      A dict with the keys "url" and "text".
    """
    bump = c4_utils.get_counter_inc_fn("emit-examples")
    bump("emitted")
    page_url, page_text = el
    return dict(url=page_url, text=page_text)