Example #1
    def run(self):
        if self.use_mlflow:
            LOGGER.info(f"Saving MLFlow info to {self.path_mlflow}")
            # Check arguments are not None
            if self.path_mlflow is None:
                raise ValueError("'path_mlflow' should not be None")
            if self.run_id is None:
                raise ValueError("'run_id' should not be None")
            if self.run_uuid is None:
                raise ValueError("'run_uuid' should not be None")
            if self.experiment_id is None:
                raise ValueError("'experiment_id' should not be None")

            # Save info to path
            info = {
                "run_id": self.run_id,
                "run_uuid": self.run_uuid,
                "experiment_id": self.experiment_id
            }

            # Need to create directory if not HDFS
            if not Path(self.path_mlflow).is_hdfs:
                Path(self.path_mlflow).parent.mkdir(parents=True,
                                                    exist_ok=True)

            # Dump MLFlow information to path
            with Path(self.path_mlflow).open("w") as file:
                json.dump(info, file, indent=4)
Example #2
    def write(self, dataset: tf.data.Dataset):
        def _write_chunk(data, path_data):
            if self.compression_type == "GZIP" and not path_data.endswith(
                    ".gz"):
                path_data += ".gz"
            elif self.compression_type == "ZLIB" and not path_data.endswith(
                    ".zlib"):
                path_data += ".zlib"
            LOGGER.info(f"Writing tf record dataset to {path_data}")
            with tf.io.TFRecordWriter(
                    path_data,
                    options=tf.io.TFRecordOptions(
                        compression_type=self.compression_type)) as writer:
                for ex in data:
                    writer.write(ex)

        if self.chunk_size is None:
            _write_chunk(progress(from_dataset(dataset), secs=self.secs),
                         self.path)
        else:
            if not Path(self.path).is_hdfs:
                Path(self.path).mkdir(parents=True, exist_ok=True)
            for idx, chunk in enumerate(
                    chunks(progress(from_dataset(dataset), secs=self.secs),
                           self.chunk_size)):
                path_chunk = f"{self.path}/part-{idx}.tfrecord"
                _write_chunk(chunk, path_chunk)
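
For reference, a minimal sketch of reading such a file back with the standard TensorFlow API, assuming eager execution; the path is hypothetical and compression_type must match the writer.

# Hypothetical path; compression_type must match what was written ("GZIP", "ZLIB" or "")
dataset = tf.data.TFRecordDataset("path/to/part-0.tfrecord.gz", compression_type="GZIP")
for raw_example in dataset.take(1):
    example = tf.train.Example.FromString(raw_example.numpy())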
Example #3
def read_json(path: Union[str, Path]) -> Dict:
    """Read json or jsonnet file into dictionary"""
    if Path(path).suffix == ".jsonnet":
        LOGGER.info(f"Parsing jsonnet file '{path}'")
        json_str = _jsonnet.evaluate_file(str(path))
        data = json.loads(json_str)
    else:
        with Path(path).open() as file:
            data = json.load(file)
    return data
Example #4
    def run(self):
        if self.skip_copy:
            LOGGER.info(
                f"NOT COPYING {self.source} to {self.target} (skip_copy=True)")
            return
        if self.overwrite and Path(self.target).is_dir():
            Path(self.target).delete_dir()

        LOGGER.info(f"Copying {self.source} to {self.target}")
        Path(self.source).copy_dir(self.target)
Example #5
 def filenames(self):
     if isinstance(self.path, list):
         return sorted([str(path) for path in self.path])
     else:
         if Path(self.path).is_dir():
             paths = Path(self.path).glob("*")
             return sorted([
                 str(path) for path in paths
                 if path.is_file() and not path.name.startswith("_")
             ])
         else:
             return [str(self.path)]
Example #6
 def filenames(self):
     """Get filenames in path."""
     pattern = "*" if not self.recursive else "**/*"
     if isinstance(self.path, list):
         return sorted([str(path) for path in self.path])
     else:
         if Path(self.path).is_dir():
             paths = Path(self.path).glob(pattern)
             return sorted([
                 str(path) for path in paths
                 if path.is_file() and not path.name.startswith("_")
             ])
         else:
             return [str(self.path)]
Example #7
def write(path: str, vocab: Iterable[str]):
    """Write vocabulary to file.

    Parameters
    ----------
    path : str
        Path to .txt file with one item per line
    vocab : Iterable[str]
        Iterable of lexemes (strings)
        Lexemes should not have newline characters.
    """
    # Check that vocab is not a string
    if isinstance(vocab, str):
        msg = f"Expected iterable of strings, but got a string ({vocab})"
        raise TypeError(msg)

    # Check that no item in vocab has a newline character
    for item in vocab:
        if not isinstance(item, str):
            msg = f"Expected item of type str, but got {type(item)} for item {item}"
            raise TypeError(msg)
        if "\n" in item:
            msg = f"Found newline character in item {item} (forbidden)."
            raise ValueError(msg)

    # Write each item on a new line
    with Path(path).open("w") as file:
        file.write("\n".join(vocab))
Example #8
def read_eval_metrics(eval_dir: str) -> Dict:
    """Reload summaries from model_dir"""
    if not Path(eval_dir).is_dir():
        return dict()
    summaries = defaultdict(dict)  # type: Dict[int, Dict[str, float]]
    for path in Path(eval_dir).glob(_SUMMARY_PATTERN):
        for event in tf.train.summary_iterator(str(path)):
            if not event.HasField("summary"):
                continue
            metrics = {}
            for value in event.summary.value:
                if value.HasField("simple_value"):
                    metrics[value.tag] = value.simple_value
            if metrics:
                summaries[event.step].update(metrics)
    return summaries
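
A small usage sketch of picking the best step from the returned mapping, assuming it is non-empty; the eval directory and the metric name "loss" are hypothetical.

# Step whose "loss" summary is lowest (hypothetical metric name and path)
summaries = read_eval_metrics("path/to/model_dir/eval")
best_step = min(summaries, key=lambda step: summaries[step]["loss"])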
Example #9
 def write(self, table: pyarrow.Table, compression="snappy"):
     if not isinstance(self.path_or_paths, str):
         msg = f"Cannot write table to {self.path_or_paths} (expected string)"
         raise TypeError(msg)
     LOGGER.info(f"Writing table to {self.path_or_paths}")
     with Path(self.path_or_paths).open("wb",
                                        filesystem=self.filesystem) as file:
         pq.write_table(table, file, compression=compression)
Example #10
 def run(self):
     LOGGER.info(
         f"Cleanup checkpoints in {self.path_model}/{self.path_checkpoints}"
     )
     checkpoint_files = Path(self.path_model,
                             self.path_checkpoints).glob("model.ckpt-*")
     for checkpoint_file in checkpoint_files:
         LOGGER.info(f"- Deleting {checkpoint_file}")
         checkpoint_file.delete()
Example #11
def read(path: str) -> List[str]:
    """Read vocabulary from file.

    Parameters
    ----------
    path : str
        Path to .txt file with one item per line
    """
    with Path(path).open() as file:
        return [line.strip() for line in file if line.strip()]
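
A small round-trip sketch combining this reader with the write function from Example #7; the filename is hypothetical.

# Write then reload a tiny vocabulary (hypothetical local path)
write("vocab.txt", ["cat", "dog", "fish"])
assert read("vocab.txt") == ["cat", "dog", "fish"]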
Example #12
 def export(self, estimator: tf.estimator.Estimator):
     for variable_name in self.variable_names:
         variable_export_dir = Path(self.path_variables, variable_name)
         LOGGER.info(
             f"Saving variable {variable_name} to {variable_export_dir}")
         with ParquetDataset(variable_export_dir).open() as ds:
             variable_value = estimator.get_variable_value(variable_name)
             ds.write_pandas(pd.DataFrame(variable_value),
                             compression=self.compression,
                             chunk_size=self.chunk_size)
Example #13
    def export(self, estimator: tf.estimator.Estimator):
        # Reload summaries and select best step
        LOGGER.info(f"Reloading summaries from {estimator.model_dir}")
        summaries = read_eval_metrics(estimator.eval_dir()).items()
        for step, metrics in sorted(summaries):
            LOGGER.info(f"- {step}: {metrics}")
        sorted_summaries = sorted(summaries, key=lambda t: t[1][self.metric])
        if self.mode == BestMode.INCREASE:
            best_step, best_metrics = sorted_summaries[-1]
        elif self.mode == BestMode.DECREASE:
            best_step, best_metrics = sorted_summaries[0]
        else:
            raise ValueError(f"Mode {self.mode} not recognized.")
        LOGGER.info(f"Best summary at step {best_step}: {best_metrics}")

        # List available checkpoints and select the closest to best_step
        checkpoints = Path(estimator.model_dir).glob(_CHEKPOINT_PATTERN)
        checkpoint_steps = [
            int(re.findall(r"-(\d+)\.index", str(path))[0])
            for path in checkpoints
        ]
        selected_step = sorted(checkpoint_steps,
                               key=lambda step: abs(step - best_step))[0]
        LOGGER.info(f"Selected checkpoint {selected_step}")

        # Change checkpoint information
        with Path(estimator.model_dir, "checkpoint").open("r") as file:
            lines = file.read().split("\n")
            lines[0] = f'model_checkpoint_path: "model.ckpt-{selected_step}"'

        with Path(estimator.model_dir, "checkpoint").open("w") as file:
            file.write("\n".join(lines))

        # Check that change is effective
        global_step = estimator.get_variable_value("global_step")
        if global_step != selected_step:
            msg = f"Changed checkpoint file to use step {selected_step}, but estimator uses {global_step}"
            raise ValueError(msg)

        # Log to MLFlow
        if self.use_mlflow:
            mlflow.log_metric(key=self.tag, value=global_step)
Example #14
    def write_pandas(
        self,
        df: pd.DataFrame,
        compression="snappy",
        num_chunks: int = None,
        chunk_size: int = None,
        schema: pyarrow.Schema = None,
    ):
        """Write DataFrame as Parquet Dataset"""
        # Check arguments
        if not isinstance(self.path_or_paths, str):
            msg = f"Cannot write table to {self.path_or_paths} (expected string)"
            raise TypeError(msg)
        if num_chunks is not None and chunk_size is not None:
            msg = "Only one of num_chunks and chunk_size may be given"
            raise ValueError(msg)
        if chunk_size is not None:
            num_chunks = max(len(df) // chunk_size, 1)

        # Write DataFrame to parquet
        if num_chunks is None:
            table = pyarrow.Table.from_pandas(df,
                                              schema=schema,
                                              preserve_index=False)
            self.write(table, compression=compression)
        else:
            Path(self.path_or_paths).mkdir(parents=True,
                                           exist_ok=True,
                                           filesystem=self.filesystem)
            chunks = np.array_split(df, num_chunks)
            for idx, chunk in enumerate(chunks):
                filename = f"part-{idx:05d}.parquet.{compression}"
                chunk_path = Path(self.path_or_paths, filename)
                LOGGER.info(f"Writing chunk:{idx} to {chunk_path}")
                with chunk_path.open("wb", filesystem=self.filesystem) as file:
                    table = pyarrow.Table.from_pandas(chunk,
                                                      schema=schema,
                                                      preserve_index=False)
                    pq.write_table(table, file, compression=compression)
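
A minimal sketch of reading one written chunk back with pyarrow, assuming a local filesystem; the file name is hypothetical and follows the part-{idx:05d}.parquet.{compression} pattern used above.

# Reload a single chunk into a DataFrame (hypothetical path)
table = pq.read_table("path/to/dataset/part-00000.parquet.snappy")
df = table.to_pandas()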
Example #15
def get_latest_saved_model(saved_model_dir: str) -> str:
    """Get latest sub directory in saved_model_dir.

    Parameters
    ----------
    saved_model_dir : str
        Path to directory containing saved model exports.

    Returns
    -------
    str
        Path to the latest saved model export directory.
    """
    subdirs = [
        str(path) for path in Path(saved_model_dir).iterdir()
        if path.is_dir() and "temp" not in str(path)
    ]
    return str(sorted(subdirs)[-1])
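
A usage sketch loading the returned export, assuming TensorFlow 2.x and a hypothetical export directory; under TensorFlow 1.x the SavedModel loading call differs.

# Load the most recent export (hypothetical directory)
latest = get_latest_saved_model("path/to/saved_model")
model = tf.saved_model.load(latest)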
Example #16
def upload_pex(
    path_pex: str,
    path_pex_existing: str = None,
    additional_packages: Dict = None,
    ignored_packages: List = None,
) -> str:
    """Upload the current environment and return the path to the PEX on HDFS"""
    if path_pex_existing is None:
        LOGGER.info(f"Uploading env to {path_pex}")
        packaging.upload_env_to_hdfs(
            archive_on_hdfs=path_pex,
            additional_packages=additional_packages if additional_packages else {},
            ignored_packages=ignored_packages if ignored_packages else [],
            packer=cluster_pack.packaging.PEX_PACKER,
        )
    elif not Path(path_pex_existing).is_hdfs:
        LOGGER.info(f"Uploading env to {path_pex}")
        packaging.upload_zip_to_hdfs(path_pex_existing, archive_on_hdfs=path_pex)
    else:
        LOGGER.info(f"Skipping upload, PEX {path_pex_existing} already exists")
        path_pex = path_pex_existing
    return path_pex
Example #17
 def run(self):
     if self.skip_copy:
         LOGGER.info(f"NOT COPYING {self.source} to {self.target} (skip_copy=True)")
         return
     LOGGER.info(f"Copying {self.source} to {self.target}")
     Path(self.source).copy_dir(self.target)
Example #18
def write_json(data: Dict, path: Union[str, Path]):
    """Write data to path"""
    with Path(path).open("w") as file:
        json.dump(data, file, indent=4)
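
A round-trip sketch pairing this with read_json from Example #3; the path is hypothetical.

# Write a config then read it back (hypothetical local path)
config = {"learning_rate": 0.01, "batch_size": 32}
write_json(config, "config.json")
assert read_json("config.json") == config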
Example #19
 def is_hdfs(self) -> bool:
     if isinstance(self.path_or_paths, str):
         return Path(self.path_or_paths).is_hdfs  # type: ignore
     else:
         return Path(self.path_or_paths[0]).is_hdfs