def run(self):
    if self.use_mlflow:
        LOGGER.info(f"Saving MLFlow info to {self.path_mlflow}")

        # Check arguments are not None
        if self.path_mlflow is None:
            raise ValueError("'path_mlflow' should not be None")
        if self.run_id is None:
            raise ValueError("'run_id' should not be None")
        if self.run_uuid is None:
            raise ValueError("'run_uuid' should not be None")
        if self.experiment_id is None:
            raise ValueError("'experiment_id' should not be None")

        # Save info to path
        info = {
            "run_id": self.run_id,
            "run_uuid": self.run_uuid,
            "experiment_id": self.experiment_id,
        }

        # Need to create the parent directory if not HDFS
        if not Path(self.path_mlflow).is_hdfs:
            Path(self.path_mlflow).parent.mkdir(parents=True, exist_ok=True)

        # Dump MLFlow information to path
        with Path(self.path_mlflow).open("w") as file:
            json.dump(info, file, indent=4)
def write(self, dataset: tf.data.Dataset):
    def _write_chunk(data, path_data):
        if self.compression_type == "GZIP" and not path_data.endswith(".gz"):
            path_data += ".gz"
        elif self.compression_type == "ZLIB" and not path_data.endswith(".zlib"):
            path_data += ".zlib"
        LOGGER.info(f"Writing tf record dataset to {path_data}")
        with tf.io.TFRecordWriter(
            path_data,
            options=tf.io.TFRecordOptions(compression_type=self.compression_type),
        ) as writer:
            for ex in data:
                writer.write(ex)

    if self.chunk_size is None:
        _write_chunk(progress(from_dataset(dataset), secs=self.secs), self.path)
    else:
        if not Path(self.path).is_hdfs:
            Path(self.path).mkdir(parents=True, exist_ok=True)
        for idx, chunk in enumerate(
            chunks(progress(from_dataset(dataset), secs=self.secs), self.chunk_size)
        ):
            path_chunk = f"{self.path}/part-{idx}.tfrecord"
            _write_chunk(chunk, path_chunk)
def read_json(path: Union[str, Path]) -> Dict:
    """Read json or jsonnet file into dictionary"""
    if Path(path).suffix == ".jsonnet":
        LOGGER.info(f"Parsing jsonnet file '{path}'")
        json_str = _jsonnet.evaluate_file(str(path))
        data = json.loads(json_str)
    else:
        with Path(path).open() as file:
            data = json.load(file)
    return data
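# Usage sketch (hypothetical file names; assumes read_json as defined above is
# importable): ".jsonnet" files are evaluated with _jsonnet before parsing, plain
# ".json" files are loaded directly.
config = read_json("configs/training.jsonnet")
defaults = read_json("configs/defaults.json")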
def run(self):
    if self.skip_copy:
        LOGGER.info(f"NOT COPYING {self.source} to {self.target} (skip_copy=True)")
        return
    if self.overwrite and Path(self.target).is_dir():
        Path(self.target).delete_dir()
    LOGGER.info(f"Copying {self.source} to {self.target}")
    Path(self.source).copy_dir(self.target)
def filenames(self):
    """Get filenames in path."""
    if isinstance(self.path, list):
        return sorted([str(path) for path in self.path])
    else:
        if Path(self.path).is_dir():
            paths = Path(self.path).glob("*")
            return sorted([
                str(path)
                for path in paths
                if path.is_file() and not path.name.startswith("_")
            ])
        else:
            return [str(self.path)]
def filenames(self): """Get filenames in path.""" pattern = "*" if not self.recursive else "**/*" if isinstance(self.path, list): return sorted([str(path) for path in self.path]) else: if Path(self.path).is_dir(): paths = Path(self.path).glob(pattern) return sorted([ str(path) for path in paths if path.is_file() and not path.name.startswith("_") ]) else: return [str(self.path)]
def write(path: str, vocab: Iterable[str]):
    """Write vocabulary to file.

    Parameters
    ----------
    path : str
        Path to .txt file with one item per line
    vocab : Iterable[str]
        Iterable of lexemes (strings). Lexemes should not have newline
        characters.
    """
    # Check that vocab is not a string
    if isinstance(vocab, str):
        msg = f"Expected iterable of strings, but got a string ({vocab})"
        raise TypeError(msg)

    # Materialize the iterable so that generators are not exhausted by the
    # validation pass below
    vocab = list(vocab)

    # Check that every item is a string with no newline character
    for item in vocab:
        if not isinstance(item, str):
            msg = f"Expected item of type str, but got {type(item)} for item {item}"
            raise TypeError(msg)
        if "\n" in item:
            msg = f"Found newline character in item {item} (forbidden)."
            raise ValueError(msg)

    # Write each item on a new line
    with Path(path).open("w") as file:
        file.write("\n".join(vocab))
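# Usage sketch (hypothetical path; assumes write as defined above): the vocabulary is
# written one lexeme per line, so it can be reloaded with a line-based reader.
write("vocab.txt", ["the", "quick", "brown", "fox"])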
def read_eval_metrics(eval_dir: str) -> Dict:
    """Reload summaries from eval_dir"""
    if not Path(eval_dir).is_dir():
        return dict()
    summaries = defaultdict(dict)  # type: Dict[int, Dict[str, float]]
    for path in Path(eval_dir).glob(_SUMMARY_PATTERN):
        for event in tf.train.summary_iterator(str(path)):
            if not event.HasField("summary"):
                continue
            metrics = {}
            for value in event.summary.value:
                if value.HasField("simple_value"):
                    metrics[value.tag] = value.simple_value
            if metrics:
                summaries[event.step].update(metrics)
    return summaries
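# Usage sketch (hypothetical eval directory and metric name; assumes read_eval_metrics
# as defined above): the returned dict maps global step to a dict of metric values, so
# selecting the best step is a sort over its items.
metrics_by_step = read_eval_metrics("model_dir/eval")
if metrics_by_step:
    best_step, best_metrics = min(metrics_by_step.items(), key=lambda item: item[1]["loss"])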
def write(self, table: pyarrow.Table, compression="snappy"):
    if not isinstance(self.path_or_paths, str):
        msg = f"Cannot write table to {self.path_or_paths} (expected string)"
        raise TypeError(msg)
    LOGGER.info(f"Writing table to {self.path_or_paths}")
    with Path(self.path_or_paths).open("wb", filesystem=self.filesystem) as file:
        pq.write_table(table, file, compression=compression)
def run(self):
    LOGGER.info(f"Cleanup checkpoints in {self.path_model}/{self.path_checkpoints}")
    checkpoint_files = Path(self.path_model, self.path_checkpoints).glob("model.ckpt-*")
    for checkpoint_file in checkpoint_files:
        LOGGER.info(f"- Deleting {checkpoint_file}")
        checkpoint_file.delete()
def read(path: str) -> List[str]:
    """Read vocabulary from file.

    Parameters
    ----------
    path : str
        Path to .txt file with one item per line
    """
    with Path(path).open() as file:
        return [line.strip() for line in file if line.strip()]
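# Usage sketch (hypothetical path; assumes read as defined above): empty lines are
# skipped, so the result only contains lexemes, which makes building an index
# straightforward.
vocab = read("vocab.txt")
word_to_id = {word: idx for idx, word in enumerate(vocab)}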
def export(self, estimator: tf.estimator.Estimator):
    for variable_name in self.variable_names:
        variable_export_dir = Path(self.path_variables, variable_name)
        LOGGER.info(f"Saving variable {variable_name} to {variable_export_dir}")
        with ParquetDataset(variable_export_dir).open() as ds:
            variable_value = estimator.get_variable_value(variable_name)
            ds.write_pandas(
                pd.DataFrame(variable_value),
                compression=self.compression,
                chunk_size=self.chunk_size,
            )
def export(self, estimator: tf.estimator.Estimator):
    # Reload summaries and select best step
    LOGGER.info(f"Reloading summaries from {estimator.model_dir}")
    summaries = read_eval_metrics(estimator.eval_dir()).items()
    for step, metrics in sorted(summaries):
        LOGGER.info(f"- {step}: {metrics}")
    sorted_summaries = sorted(summaries, key=lambda t: t[1][self.metric])
    if self.mode == BestMode.INCREASE:
        best_step, best_metrics = sorted_summaries[-1]
    elif self.mode == BestMode.DECREASE:
        best_step, best_metrics = sorted_summaries[0]
    else:
        raise ValueError(f"Mode {self.mode} not recognized.")
    LOGGER.info(f"Best summary at step {best_step}: {best_metrics}")

    # List available checkpoints and select the one closest to best_step
    checkpoints = Path(estimator.model_dir).glob(_CHEKPOINT_PATTERN)
    checkpoint_steps = [
        int(re.findall(r"-(\d+).index", str(path))[0]) for path in checkpoints
    ]
    selected_step = sorted(checkpoint_steps, key=lambda step: abs(step - best_step))[0]
    LOGGER.info(f"Selected checkpoint {selected_step}")

    # Change checkpoint information
    with Path(estimator.model_dir, "checkpoint").open("r") as file:
        lines = file.read().split("\n")
        lines[0] = f'model_checkpoint_path: "model.ckpt-{selected_step}"'
    with Path(estimator.model_dir, "checkpoint").open("w") as file:
        file.write("\n".join(lines))

    # Check that change is effective
    global_step = estimator.get_variable_value("global_step")
    if global_step != selected_step:
        msg = f"Changed checkpoint file to use step {selected_step}, but estimator uses {global_step}"
        raise ValueError(msg)

    # Log to MLFlow
    if self.use_mlflow:
        mlflow.log_metric(key=self.tag, value=global_step)
def write_pandas(
    self,
    df: pd.DataFrame,
    compression="snappy",
    num_chunks: int = None,
    chunk_size: int = None,
    schema: pyarrow.Schema = None,
):
    """Write DataFrame as Parquet Dataset"""
    # Check arguments
    if not isinstance(self.path_or_paths, str):
        msg = f"Cannot write table to {self.path_or_paths} (expected string)"
        raise TypeError(msg)
    if num_chunks is not None and chunk_size is not None:
        msg = "Both num_chunks and chunk_size are given, not allowed"
        raise ValueError(msg)
    if chunk_size is not None:
        num_chunks = max(len(df) // chunk_size, 1)

    # Write DataFrame to parquet
    if num_chunks is None:
        table = pyarrow.Table.from_pandas(df, schema=schema, preserve_index=False)
        self.write(table, compression=compression)
    else:
        Path(self.path_or_paths).mkdir(parents=True, exist_ok=True, filesystem=self.filesystem)
        chunks = np.array_split(df, num_chunks)
        for idx, chunk in enumerate(chunks):
            filename = f"part-{idx:05d}.parquet.{compression}"
            chunk_path = Path(self.path_or_paths, filename)
            LOGGER.info(f"Writing chunk:{idx} to {chunk_path}")
            with chunk_path.open("wb", filesystem=self.filesystem) as file:
                table = pyarrow.Table.from_pandas(chunk, schema=schema, preserve_index=False)
                pq.write_table(table, file, compression=compression)
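# Usage sketch (hypothetical path and data; assumes write_pandas above is a method of
# the ParquetDataset used in the variable export snippet, constructed from a single
# path and opened the same way): chunk_size splits the DataFrame into several
# part-*.parquet files, while leaving both chunk arguments unset writes a single table.
df = pd.DataFrame({"user": [0, 1, 2], "score": [0.1, 0.5, 0.9]})
with ParquetDataset("viewfs://root/user/foo/dataset").open() as ds:
    ds.write_pandas(df, chunk_size=2)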
def get_latest_saved_model(saved_model_dir: str) -> str:
    """Get latest sub directory in saved_model_dir.

    Parameters
    ----------
    saved_model_dir : str
        Path to directory containing saved model exports.

    Returns
    -------
    str
    """
    subdirs = [
        str(path)
        for path in Path(saved_model_dir).iterdir()
        if path.is_dir() and "temp" not in str(path)
    ]
    return str(sorted(subdirs)[-1])
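# Usage sketch (hypothetical export directory; assumes get_latest_saved_model as
# defined above): exports are timestamped sub-directories, so the lexicographically
# largest one is the most recent.
latest_export = get_latest_saved_model("model_dir/export/saved_model")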
def upload_pex(
    path_pex: str,
    path_pex_existing: str = None,
    additional_packages: Dict = None,
    ignored_packages: List = None,
) -> str:
    """Upload current environment and return path to PEX on HDFS"""
    if path_pex_existing is None:
        LOGGER.info(f"Uploading env to {path_pex}")
        packaging.upload_env_to_hdfs(
            archive_on_hdfs=path_pex,
            additional_packages=additional_packages if additional_packages else {},
            ignored_packages=ignored_packages if ignored_packages else [],
            packer=cluster_pack.packaging.PEX_PACKER,
        )
    elif not Path(path_pex_existing).is_hdfs:
        LOGGER.info(f"Uploading env to {path_pex}")
        packaging.upload_zip_to_hdfs(path_pex_existing, archive_on_hdfs=path_pex)
    else:
        LOGGER.info(f"Skipping upload, PEX {path_pex_existing} already exists")
        path_pex = path_pex_existing
    return path_pex
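# Usage sketch (hypothetical HDFS path; assumes upload_pex as defined above): with no
# path_pex_existing the current environment is packaged and uploaded; if
# path_pex_existing already points to a PEX on HDFS, the upload is skipped and that
# path is returned instead.
path_pex = upload_pex("viewfs://root/user/foo/envs/env.pex")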
def run(self): if self.skip_copy: LOGGER.info(f"NOT COPYING {self.source} to {self.target} (skip_copy=True)") return LOGGER.info(f"Copying {self.source} to {self.target}") Path(self.source).copy_dir(self.target)
def write_json(data: Dict, path: Union[str, Path]):
    """Write data to path"""
    with Path(path).open("w") as file:
        json.dump(data, file, indent=4)
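# Usage sketch (hypothetical path; assumes write_json and read_json as defined above):
# the two functions round-trip plain dictionaries.
write_json({"learning_rate": 0.01, "epochs": 10}, "config.json")
assert read_json("config.json") == {"learning_rate": 0.01, "epochs": 10}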
def is_hdfs(self) -> bool:
    """True if the path (or first path in the list) is on HDFS."""
    if isinstance(self.path_or_paths, str):
        return Path(self.path_or_paths).is_hdfs  # type: ignore
    else:
        return Path(self.path_or_paths[0]).is_hdfs