Example #1
def latest_model_checkpoint(cfg: LatestCheckpointConfig):
    r"""Select the latest model checkpoint and write path to JSON.

    Find the latest model checkpoint written by the machine learning
    stage and write the path into a JSON file to be consumed by the
    agent stage.

    Parameters
    ----------
    cfg : LatestCheckpointConfig
        pydantic YAML configuration for the model selection task.
    """
    api = DeepDriveMD_API(cfg.experiment_directory)

    # Check if there is a new model
    if cfg.stage_idx % cfg.retrain_freq == 0:
        # Select latest model checkpoint.
        model_checkpoint = latest_checkpoint(api, cfg.checkpoint_dir,
                                             cfg.checkpoint_suffix)
        # Get latest model YAML configuration.
        model_config = api.machine_learning_stage.config_path(
            cfg.stage_idx, cfg.task_idx)
    else:  # Use old model
        token = get_model_path(cfg.stage_idx - 1, cfg.task_idx, api)
        assert token is not None, f"{cfg.stage_idx - 1}, {cfg.task_idx}"
        model_config, model_checkpoint = token

    # Format data into JSON serializable list of dictionaries
    data = [{
        "model_checkpoint": str(model_checkpoint),
        "model_config": str(model_config)
    }]
    # Dump metadata to disk for MD stage
    api.model_selection_stage.write_task_json(data, cfg.stage_idx,
                                              cfg.task_idx)
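
A minimal driver sketch for the function above, assuming the pydantic config exposes a from_yaml loader as elsewhere in DeepDriveMD (the YAML filename is illustrative):

# Hypothetical driver: run model selection, then read the result back the
# way a downstream stage would (see get_model_path in Example #4 below).
cfg = LatestCheckpointConfig.from_yaml("model_selection.yaml")  # illustrative path
latest_model_checkpoint(cfg)
token = get_model_path(cfg.stage_idx, cfg.task_idx,
                       experiment_dir=cfg.experiment_directory)
if token is not None:
    model_config, model_checkpoint = token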
Example #2
    def __init__(self, cfg: ExperimentConfig):
        self.cfg = cfg
        self.stage_idx = 0

        self.api = DeepDriveMD_API(cfg.experiment_directory)
        self.pipeline = Pipeline()
        self.pipeline.name = self.PIPELINE_NAME

        self._init_experiment_dir()
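
For context, a sketch of how stages are typically attached to such a RADICAL-EnTK pipeline (the stage and task contents below are illustrative, not from the source):

from radical.entk import Pipeline, Stage, Task

pipeline = Pipeline()
pipeline.name = "DeepDriveMD"  # the source uses self.PIPELINE_NAME

stage = Stage()
stage.name = "MolecularDynamics"  # illustrative stage name
task = Task()
task.executable = "python"
task.arguments = ["-m", "deepdrivemd.sim.openmm.run"]  # illustrative module path
stage.add_tasks(task)
pipeline.add_stages(stage)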
Example #3
    def __init__(self, cfg: OpenMMConfig):
        self.cfg = cfg
        self.api = DeepDriveMD_API(cfg.experiment_directory)
        self._prefix = self.api.molecular_dynamics_stage.unique_name(
            cfg.output_path)

        # Use node local storage if available. Otherwise, write to output directory.
        if cfg.node_local_path is not None:
            self.workdir = cfg.node_local_path.joinpath(self._prefix)
        else:
            self.workdir = cfg.output_path

        self._init_workdir()
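
The body of _init_workdir is not shown in this snippet; a plausible minimal implementation (hypothetical, not from the source) would simply create the selected directory:

    def _init_workdir(self) -> None:
        # Hypothetical sketch: ensure the working directory exists on
        # whichever filesystem (node-local or shared) __init__ selected.
        self.workdir.mkdir(parents=True, exist_ok=True)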
Example #4
def get_model_path(
    stage_idx: int = -1,
    task_idx: int = 0,
    api: Optional[DeepDriveMD_API] = None,
    experiment_dir: Optional[PathLike] = None,
) -> Optional[Tuple[Path, Path]]:
    r"""Get the current best model.

    Should be imported by other stages to retrieve the best model path.

    Parameters
    ----------
    stage_idx : int, optional
        Stage index of the model selection run. Defaults to -1,
        the most recent stage.
    task_idx : int, optional
        Task index within the model selection stage. Defaults to 0.
    api : DeepDriveMD_API, optional
        API to DeepDriveMD to access the machine learning model path.
    experiment_dir : Union[str, Path], optional
        Experiment directory to initialize DeepDriveMD_API.

    Returns
    -------
    None
        If model selection has not run before.
    model_config : Path, optional
        Path to the most recent model YAML configuration file
        selected by the model selection stage. Contains hyperparameters.
    model_checkpoint : Path, optional
        Path to the most recent model weights selected by the model
        selection stage.


    Raises
    ------
    ValueError
        If both `api` and `experiment_dir` are None.
    """
    if api is None and experiment_dir is None:
        raise ValueError("Both `api` and `experiment_dir` are None")

    if api is None:
        assert experiment_dir is not None
        api = DeepDriveMD_API(experiment_dir)

    data = api.model_selection_stage.read_task_json(stage_idx, task_idx)
    if data is None:
        return None

    model_config = Path(data[0]["model_config"])
    model_checkpoint = Path(data[0]["model_checkpoint"])

    return model_config, model_checkpoint
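
A typical call-site sketch for get_model_path (the experiment path is illustrative):

# Illustrative usage: fetch the most recent model selection result.
token = get_model_path(experiment_dir="/path/to/experiment")
if token is None:
    print("Model selection has not run yet.")
else:
    model_config, model_checkpoint = token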
Example #5
def get_h5_training_file(cfg: KerasCVAEModelConfig) -> Tuple[Path, List[str]]:
    """Build a virtual HDF5 training file from the most recent MD runs."""
    # Collect training data
    api = DeepDriveMD_API(cfg.experiment_directory)
    md_data = api.get_last_n_md_runs()
    all_h5_files = md_data["data_files"]

    virtual_h5_path, h5_files = get_virtual_h5_file(
        output_path=cfg.output_path,
        all_h5_files=all_h5_files,
        last_n=cfg.last_n_h5_files,
        k_random_old=cfg.k_random_old_h5_files,
        virtual_name=f"virtual_{cfg.model_tag}",
        node_local_path=cfg.node_local_path,
    )

    return virtual_h5_path, h5_files
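
get_virtual_h5_file is not shown in this snippet; conceptually it exposes many per-run HDF5 files as a single virtual dataset. A simplified sketch of that idea using h5py's virtual dataset API (the helper name, field name, and homogeneous trailing dimensions are all assumptions):

import h5py

def make_virtual_h5(output_path, h5_files, field="contact_map"):
    # Collect per-file shapes; assumes all files share trailing dimensions.
    shapes = []
    for fname in h5_files:
        with h5py.File(fname, "r") as f:
            shapes.append(f[field].shape)
            dtype = f[field].dtype

    total = sum(s[0] for s in shapes)
    layout = h5py.VirtualLayout(shape=(total,) + shapes[0][1:], dtype=dtype)

    # Map each source file into its slice of the virtual dataset.
    offset = 0
    for fname, shape in zip(h5_files, shapes):
        layout[offset:offset + shape[0]] = h5py.VirtualSource(fname, field, shape=shape)
        offset += shape[0]

    with h5py.File(output_path, "w", libver="latest") as fout:
        fout.create_virtual_dataset(field, layout)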
Example #6
    def __init__(self, experiment_directory: PathLike):
        self.api = DeepDriveMD_API(experiment_directory)
Example #7
def concatenate_last_n_h5(cfg: BasicAggegation):
    """Concatenate selected fields from the last N MD HDF5 files into one file."""
    fields = []
    if cfg.rmsd:
        fields.append("rmsd")
    if cfg.fnc:
        fields.append("fnc")
    if cfg.contact_map:
        fields.append("contact_map")
    if cfg.point_cloud:
        fields.append("point_cloud")

    # Get list of input h5 files
    api = DeepDriveMD_API(cfg.experiment_directory)
    md_data = api.get_last_n_md_runs(n=cfg.last_n_h5_files)
    files = md_data["data_files"]

    if cfg.verbose:
        print(f"Collected {len(files)} h5 files.")

    # Open output file
    fout = h5py.File(cfg.output_path, "w", libver="latest")

    # Initialize data buffers
    data = {x: [] for x in fields}

    for in_file in files:

        if cfg.verbose:
            print("Reading", in_file)

        with h5py.File(in_file, "r") as fin:
            for field in fields:
                data[field].append(fin[field][...])

    # Concatenate data
    for field in data:
        data[field] = np.concatenate(data[field])

    # Center of mass (CMS) subtraction
    if "point_cloud" in data:
        if cfg.verbose:
            print("Subtract center of mass (CMS) from point cloud")
        # Accumulate the mean in float64 (np.float128 is not available on all
        # platforms), then cast the result back to float32.
        cms = np.mean(data["point_cloud"][:, 0:3, :],
                      axis=2,
                      keepdims=True,
                      dtype=np.float64).astype(np.float32)
        data["point_cloud"][:, 0:3, :] -= cms

    # Create new dsets from concatenated dataset
    for field, concat_dset in data.items():
        if field == "traj_file":
            utf8_type = h5py.string_dtype("utf-8")
            fout.create_dataset("traj_file", data=concat_dset, dtype=utf8_type)
            continue

        shape = concat_dset.shape
        chunkshape = (1,) + shape[1:]
        # Create dataset (np.object was removed in NumPy 1.24; compare
        # against the builtin type instead)
        if concat_dset.dtype != object:
            if np.any(np.isnan(concat_dset)):
                raise ValueError("NaN detected in concat_dset.")
            dset = fout.create_dataset(field,
                                       shape,
                                       chunks=chunkshape,
                                       dtype=concat_dset.dtype)
        else:
            dset = fout.create_dataset(field,
                                       shape,
                                       chunks=chunkshape,
                                       dtype=h5py.vlen_dtype(np.int16))
        # write data
        dset[...] = concat_dset[...]

    # Clean up
    fout.flush()
    fout.close()
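
A quick sanity check on the resulting file (the output path is illustrative):

# Illustrative: list the datasets written by concatenate_last_n_h5.
with h5py.File("aggregated.h5", "r") as f:
    for name, dset in f.items():
        print(name, dset.shape, dset.dtype)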