def local_files(ctx, role, name, compute_hash: bool, export: bool, imported: bool, path: str):
    """Add a local file directory (not managed by git) to the workspace. Subcommand of ``add``"""
    ns = ctx.obj
    if role is None:
        if imported:
            role = ResourceRoles.SOURCE_DATA_SET
        elif ns.batch:
            raise BatchModeError("--role")
        else:
            role = click.prompt(
                "Please enter a role for this resource, one of [s]ource-data, [i]ntermediate-data, [c]ode, or [r]esults",
                type=ROLE_PARAM,
            )
    path = abspath(expanduser(path))
    workspace = find_and_load_workspace(ns.batch, ns.verbose, ns.workspace_dir)
    if export and role in (ResourceRoles.SOURCE_DATA_SET, ResourceRoles.CODE):
        raise click.BadOptionUsage(
            message="Cannot export a source data or code resource", option_name="export"
        )
    if export and imported:
        raise click.BadOptionUsage(
            message="Cannot specify both --export and --imported", option_name="imported"
        )
    if imported and role != ResourceRoles.SOURCE_DATA_SET:
        raise click.BadOptionUsage(
            message="--imported only for source-data roles", option_name="imported"
        )
    add_command("file", role, name, workspace, path, compute_hash, export, imported)
def _init_dws_state(self):
    workspace = find_and_load_workspace(
        batch=True, verbose=self.verbose, uri_or_local_path=self.workspace_dir
    )
    self._dws_state = _DwsModelState(workspace, self.input_resource, self.results_resource)
def pull(ctx, workspace_dir: str, only: Optional[str], skip: Optional[str], only_workspace: bool):
    """Pull the latest state of the workspace and its resources from their origins."""
    ns = ctx.obj
    option_cnt = (
        (1 if only is not None else 0)
        + (1 if skip is not None else 0)
        + (1 if only_workspace else 0)
    )
    if option_cnt > 1:
        raise click.BadOptionUsage(
            message="Please specify at most one of --only, --skip, or --only-workspace",
            option_name="--only",
        )  # type: ignore
    if workspace_dir is None:
        if ns.batch:
            raise BatchModeError("--workspace-dir")
        else:
            workspace_dir = click.prompt("Please enter the workspace root dir", type=WORKSPACE_PARAM)
    workspace = find_and_load_workspace(ns.batch, ns.verbose, workspace_dir)
    pull_command(
        workspace,
        only=only.split(",") if only else None,
        skip=skip.split(",") if skip else None,
        only_workspace=only_workspace,
    )
def restore(
    ctx,
    workspace_dir: str,
    only: Optional[str],
    leave: Optional[str],
    strict: bool,
    tag_or_hash: str,
):
    """Restore the workspace to a prior state"""
    ns = ctx.obj
    if (only is not None) and (leave is not None):
        raise click.BadOptionUsage(
            option_name="--only",
            message="Please specify either --only or --leave, but not both",
        )  # type: ignore
    if workspace_dir is None:
        if ns.batch:
            raise BatchModeError("--workspace-dir")
        else:
            workspace_dir = click.prompt("Please enter the workspace root dir", type=WORKSPACE_PARAM)
    workspace = find_and_load_workspace(ns.batch, ns.verbose, workspace_dir)
    restore_command(
        workspace,
        tag_or_hash,
        only=only.split(",") if only else None,
        leave=leave.split(",") if leave else None,
        strict=strict,
    )
def get_snapshot_history(
    workspace_uri_or_path: Optional[str] = None,
    reverse: bool = False,
    max_count: Optional[int] = None,
    verbose: bool = False,
) -> Iterable[SnapshotInfo]:
    """Get the history of snapshots, starting with the oldest first (unless
    :reverse: is True). Returns a list of SnapshotInfo instances, containing
    the snapshot number, hash, tag, timestamp, and message. If :max_count: is
    specified, returns at most that many snapshots.
    """
    workspace = find_and_load_workspace(True, verbose, workspace_uri_or_path)
    assert isinstance(workspace, SnapshotWorkspaceMixin)
    if not reverse:
        return [
            SnapshotInfo(
                snapshot_idx + 1, md.hashval, md.tags, md.timestamp, md.message, md.metrics
            )
            for (snapshot_idx, md) in enumerate(
                workspace.list_snapshots(reverse=False, max_count=max_count)
            )
        ]
    else:
        last_snapshot_no = workspace.get_next_snapshot_number() - 1
        return [
            SnapshotInfo(
                last_snapshot_no - i, md.hashval, md.tags, md.timestamp, md.message, md.metrics
            )
            for (i, md) in enumerate(workspace.list_snapshots(reverse=True, max_count=max_count))
        ]
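
# Usage sketch (hypothetical; the workspace path is illustrative, and the
# SnapshotInfo field names are assumed to match the constructor arguments
# used above):
#
#     for info in get_snapshot_history("./my-workspace", max_count=5):
#         print(info.snapshot_number, info.hashval, info.tags, info.message)
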
def __init__(
    self,
    model_name: str,
    monitor: str = "val_loss",
    save_best_only: bool = False,
    mode: str = "auto",
    save_freq: Union[str, int] = "epoch",
    results_resource: Optional[Union[str, ResourceRef]] = None,
    workspace_dir: Optional[str] = None,
    verbose: Union[int, bool] = 0,
):
    """model_name is used to create the checkpoint filenames. The checkpoints
    will be saved as MODEL_NAME_{epoch}.

    Currently, only the save_weights_only behavior is supported.

    verbose can be either 0 or 1, in the style of TensorFlow, or True or False,
    in the style of Data Workspaces.
    """
    self.dws_model_name = model_name
    if verbose == 0 or verbose is False:
        tf_verbose = 0
        dws_verbose = False
    else:
        tf_verbose = 1
        dws_verbose = True
    self.workspace = find_and_load_workspace(
        batch=True, verbose=dws_verbose, uri_or_local_path=workspace_dir
    )
    results_ref = _find_resource(self.workspace, ResourceRoles.RESULTS, results_resource)
    self.results_resource = self.workspace.get_resource(results_ref.name)
    if not isinstance(self.results_resource, FileResourceMixin):
        raise ConfigurationError("Resource %s is not a file-based resource" % results_ref.name)
    self.results_subdir = results_ref.subpath  # type: Optional[str]
    scratch_dir = self.workspace.get_scratch_directory()
    assert isdir(scratch_dir), "missing scratch directory %s" % scratch_dir
    self.dws_checkpoint_path = join(scratch_dir, "checkpoints")  # type: str
    if not isdir(self.dws_checkpoint_path):
        os.mkdir(self.dws_checkpoint_path)
    self.checkpoint_filepath_template = join(self.dws_checkpoint_path, model_name + "_{epoch}")
    super().__init__(
        filepath=self.checkpoint_filepath_template,
        monitor=monitor,
        save_best_only=save_best_only,
        mode=mode,
        save_freq=save_freq,
        save_weights_only=True,
        verbose=tf_verbose,
    )
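
# Usage sketch (hypothetical; the model, data, and model name are illustrative):
#
#     checkpoint_cb = DwsModelCheckpoint("fashion_mnist", monitor="val_loss",
#                                        save_best_only=True)
#     model.fit(x_train, y_train, epochs=10, callbacks=[checkpoint_cb])
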
def snapshot(ctx, workspace_dir, message, tag):
    """Take a snapshot of the current workspace's state"""
    ns = ctx.obj
    if workspace_dir is None:
        if ns.batch:
            raise BatchModeError("--workspace-dir")
        else:
            workspace_dir = click.prompt("Please enter the workspace root dir", type=WORKSPACE_PARAM)
    workspace = find_and_load_workspace(ns.batch, ns.verbose, workspace_dir)
    snapshot_command(workspace, tag, message)
def status(ctx, workspace_dir, history, limit):
    """NOTE: this command is DEPRECATED. Please use ``dws report status`` and
    ``dws report history`` instead."""
    ns = ctx.obj
    if workspace_dir is None:
        if ns.batch:
            raise BatchModeError("--workspace-dir")
        else:
            workspace_dir = click.prompt("Please enter the workspace root dir", type=WORKSPACE_PARAM)
    workspace = find_and_load_workspace(ns.batch, ns.verbose, workspace_dir)
    status_command(workspace, history, limit)
def diff(ctx, workspace_dir, snapshot_or_tag1, snapshot_or_tag2):
    """List differences between two snapshots"""
    ns = ctx.obj
    if workspace_dir is None:
        if ns.batch:
            raise BatchModeError("--workspace-dir")
        else:
            workspace_dir = click.prompt("Please enter the workspace root dir", type=WORKSPACE_PARAM)
    workspace = find_and_load_workspace(ns.batch, ns.verbose, workspace_dir)
    diff_command(workspace, snapshot_or_tag1, snapshot_or_tag2)
def take_snapshot(
    workspace_uri_or_path: Optional[str] = None,
    tag: Optional[str] = None,
    message: str = "",
    verbose: bool = False,
) -> str:
    """Take a snapshot of the workspace, using the tag and message, if provided.
    Returns the snapshot hash (which can be used to restore to this point).
    """
    workspace = find_and_load_workspace(True, verbose, workspace_uri_or_path)
    return snapshot_command(workspace, tag=tag, message=message)
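
# Usage sketch (hypothetical; tag and message are illustrative):
#
#     hashval = take_snapshot(tag="v0.1", message="baseline results")
#     print("Took snapshot %s" % hashval)
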
def get_local_path_for_resource(
    name: str, workspace_uri_or_path: Optional[str] = None, verbose: bool = False
) -> Optional[str]:
    """If a local path is available for this resource, return it. Otherwise, return None."""
    workspace = find_and_load_workspace(True, verbose, workspace_uri_or_path)
    r = workspace.get_resource(name)
    return (
        cast(LocalStateResourceMixin, r).get_local_path_if_any()
        if isinstance(r, LocalStateResourceMixin)
        else None
    )
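
# Usage sketch (hypothetical; the resource name is illustrative):
#
#     path = get_local_path_for_resource("source-data")
#     if path is not None:
#         print("Resource files live under %s" % path)
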
def s3(ctx, role, name, bucket_name: str):
    """Add an S3 resource to the workspace. Subcommand of ``add``"""
    ns = ctx.obj
    if role is None:
        if ns.batch:
            raise BatchModeError("--role")
        else:
            role = click.prompt(
                "Please enter a role for this resource, one of [s]ource-data, [i]ntermediate-data, [c]ode, or [r]esults",
                type=ROLE_PARAM,
            )
    workspace = find_and_load_workspace(ns.batch, ns.verbose, ns.workspace_dir)
    add_command("s3", role, name, workspace, bucket_name)
def deploy_build(
    ctx,
    image_name: Optional[str],
    force_rebuild: bool,
    git_user_email: Optional[str],
    git_user_name: Optional[str],
):
    """Build a docker image containing this workspace. This command is
    EXPERIMENTAL and subject to change."""
    ns = ctx.obj
    workspace = find_and_load_workspace(ns.batch, ns.verbose, ns.workspace_dir)
    deploy_build_command(workspace, image_name, force_rebuild, git_user_email, git_user_name)
def get_filesystem_for_resource(
    name: str, workspace_uri_or_path: Optional[str] = None, verbose: bool = False
) -> Optional[ResourceFileSystem]:
    """Get a filesystem-like object for the named resource.
    If it isn't a FileResource, returns None.
    """
    workspace = find_and_load_workspace(True, verbose, workspace_uri_or_path)
    resource = workspace.get_resource(name)
    if isinstance(resource, FileResourceMixin):
        return ResourceFileSystem(resource)
    else:
        return None
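
# Usage sketch (hypothetical; the resource name is illustrative, and the ls()
# call is an assumption about ResourceFileSystem's filesystem-like interface):
#
#     fs = get_filesystem_for_resource("results")
#     if fs is not None:
#         print(fs.ls("/"))
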
def graph(
    ctx,
    resource: Optional[str],
    snapshot: Optional[str],
    format: str,
    width: int,
    height: int,
    output_file: str,
):
    """Graph the lineage of a resource, writing the graph to an HTML file.
    Subcommand of ``lineage``"""
    ns = ctx.obj
    workspace = find_and_load_workspace(ns.batch, ns.verbose, ns.workspace_dir)
    lineage_graph_command(workspace, output_file, resource, snapshot, format, width, height)
def api_resource(ctx, role, name):
    """Resource to represent data obtained via an API. Use this when there is
    no file-based representation of your data that can be versioned and
    captured more directly. Subcommand of ``add``"""
    ns = ctx.obj
    if role is None:
        if ns.batch:
            raise BatchModeError("--role")
        else:
            role = click.prompt(
                "Please enter a role for this resource, either [s]ource-data or [i]ntermediate-data",
                type=DATA_ROLE_PARAM,
            )
    workspace = find_and_load_workspace(ns.batch, ns.verbose, ns.workspace_dir)
    add_command("api-resource", role, name, workspace)
def delete_snapshot(ctx, workspace_dir: str, no_include_resources: bool, tag_or_hash: str):
    """Delete the specified snapshot. This includes the metadata and lineage
    data for the snapshot. Unless --no-include-resources is specified, this
    also deletes any results data saved for the snapshot (under the snapshots
    subdirectory of a results resource)."""
    ns = ctx.obj
    if workspace_dir is None:
        if ns.batch:
            raise BatchModeError("--workspace-dir")
        else:
            workspace_dir = click.prompt("Please enter the workspace root dir", type=WORKSPACE_PARAM)
    workspace = find_and_load_workspace(ns.batch, ns.verbose, workspace_dir)
    delete_snapshot_command(workspace, tag_or_hash, no_include_resources)
def publish(ctx, workspace_dir, skip: str, remote_repository):
    """Add a remote Git repository as the origin for the workspace and do the
    initial push of the workspace and any other resources.
    """
    ns = ctx.obj
    if workspace_dir is None:
        if ns.batch:
            raise BatchModeError("--workspace-dir")
        else:
            workspace_dir = click.prompt("Please enter the workspace root dir", type=WORKSPACE_PARAM)
    workspace = find_and_load_workspace(ns.batch, ns.verbose, workspace_dir)
    publish_command(workspace, remote_repository)
    push_command(workspace, only=None, skip=skip.split(",") if skip else None, only_workspace=False)
def get_resource_info(workspace_uri_or_path: Optional[str] = None, verbose: bool = False):
    """Returns a list of ResourceInfo instances, describing the resources
    defined for this workspace.
    """
    workspace = find_and_load_workspace(True, verbose, workspace_uri_or_path)
    return [
        ResourceInfo(
            r.name,
            r.role,
            r.resource_type,
            cast(LocalStateResourceMixin, r).get_local_path_if_any()
            if isinstance(r, LocalStateResourceMixin)
            else None,
        )
        for r in workspace.get_resources()
    ]
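
# Usage sketch (hypothetical; assumes ResourceInfo exposes its constructor
# arguments as the attributes name, role, resource_type, and local_path):
#
#     for info in get_resource_info():
#         print(info.name, info.role, info.resource_type, info.local_path)
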
def rclone(
    ctx,
    role,
    name,
    config: str,
    compute_hash: bool,
    export: bool,
    imported: bool,
    source: str,
    dest: str,
):
    """Add an rclone-d repository as a resource to the workspace. Subcommand of ``add``"""
    ns = ctx.obj
    if role is None:
        if imported:
            role = ResourceRoles.SOURCE_DATA_SET
        elif ns.batch:
            raise BatchModeError("--role")
        else:
            role = click.prompt(
                "Please enter a role for this resource, one of [s]ource-data, [i]ntermediate-data, [c]ode, or [r]esults",
                type=ROLE_PARAM,
            )
    rclone_re = r".*:.*"
    if re.match(rclone_re, source) is None:
        raise click.BadOptionUsage(
            message="Source in rclone should be specified as remotename:filepath",
            option_name="source",
        )
    if export and role in (ResourceRoles.SOURCE_DATA_SET, ResourceRoles.CODE):
        raise click.BadOptionUsage(
            message="Cannot export a source data or code resource", option_name="export"
        )
    if export and imported:
        raise click.BadOptionUsage(
            message="Cannot specify both --export and --imported", option_name="imported"
        )
    if imported and role != ResourceRoles.SOURCE_DATA_SET:
        raise click.BadOptionUsage(
            message="--imported only for source-data roles", option_name="imported"
        )
    dest = abspath(expanduser(dest))
    workspace = find_and_load_workspace(ns.batch, ns.verbose, ns.workspace_dir)
    add_command("rclone", role, name, workspace, source, dest, config, compute_hash, export, imported)
def get_results(
    workspace_uri_or_path: Optional[str] = None,
    tag_or_hash: Optional[str] = None,
    resource_name: Optional[str] = None,
    verbose: bool = False,
) -> Optional[Tuple[JSONDict, str]]:
    """Get a results file as a parsed json dict. If no resource or snapshot
    is specified, searches all the results resources for a file. If a snapshot
    is specified, we look in the subdirectory where the results have been
    moved. If no snapshot is specified, and we don't find a file, we look in
    the most recent snapshot.

    Returns a tuple with the results and the logical path (resource:/subpath)
    to the results. If nothing is found, returns None.
    """
    workspace = find_and_load_workspace(True, verbose, workspace_uri_or_path)
    return _get_results(workspace, tag_or_hash, resource_name)
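
# Usage sketch (hypothetical; the tag is illustrative):
#
#     found = get_results(tag_or_hash="v0.1")
#     if found is not None:
#         results, logical_path = found
#         print("Results from %s: %s" % (logical_path, results))
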
def restore(
    tag_or_hash: str,
    workspace_uri_or_path: Optional[str] = None,
    only: Optional[List[str]] = None,
    leave: Optional[List[str]] = None,
    verbose: bool = False,
) -> int:
    """Restore to a previous snapshot, identified by either its hash
    or its tag (if one was specified).

    Parameters:

    * ``only`` - an optional list of resources to restore. If specified,
      all other resources will be left as-is.
    * ``leave`` - an optional list of resources to leave as-is.

    ``only`` and ``leave`` cannot both be specified.

    Returns the number of resources changed.
    """
    workspace = find_and_load_workspace(True, verbose, workspace_uri_or_path)
    return restore_command(workspace, tag_or_hash=tag_or_hash, only=only, leave=leave)
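
# Usage sketch (hypothetical; the tag and resource name are illustrative):
#
#     # Roll back everything except the results resource.
#     num_changed = restore("v0.1", leave=["results"])
#     print("%d resource(s) restored" % num_changed)
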
def config(ctx, workspace_dir, resource, param_name, param_value):
    """Get or set configuration parameters. Local parameters are only for this
    copy of the workspace, while global parameters are stored centrally and
    affect all copies.

    If neither PARAMETER_NAME nor PARAMETER_VALUE are specified, this command
    prints a table of all parameters and their information (scope, value,
    default or not, and help text). If just PARAMETER_NAME is specified, it
    prints the specified parameter's information. Finally, if both the
    parameter name and value are specified, the parameter is set to the
    specified value."""
    ns = ctx.obj
    if workspace_dir is None:
        if ns.batch:
            raise BatchModeError("--workspace-dir")
        else:
            workspace_dir = click.prompt("Please enter the workspace root dir", type=WORKSPACE_PARAM)
    workspace = find_and_load_workspace(ns.batch, ns.verbose, workspace_dir)
    config_command(workspace, param_name, param_value, resource)
def make_lineage_graph(
    output_file: str,
    workspace_uri_or_path: Optional[str] = None,
    resource_name: Optional[str] = None,
    tag_or_hash: Optional[str] = None,
    width: int = 1024,
    height: int = 800,
    verbose: bool = False,
) -> None:
    """Write a lineage graph as an html/javascript page to the specified file."""
    workspace = find_and_load_workspace(True, verbose, workspace_uri_or_path)
    lineage_graph_command(
        workspace,
        output_file,
        resource_name=resource_name,
        snapshot=tag_or_hash,
        width=width,
        height=height,
    )
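
# Usage sketch (hypothetical; the output path and tag are illustrative):
#
#     make_lineage_graph("/tmp/lineage.html", tag_or_hash="v0.1")
#     # open /tmp/lineage.html in a browser to view the graph
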
def make_lineage_table(
    workspace_uri_or_path: Optional[str] = None,
    tag_or_hash: Optional[str] = None,
    verbose: bool = False,
) -> Iterable[Tuple[str, str, str, Optional[List[str]]]]:
    """Make a table of the lineage for each resource. The columns are:
    ref, lineage type, details, inputs
    """
    workspace = find_and_load_workspace(True, verbose, workspace_uri_or_path)
    if not isinstance(workspace, SnapshotWorkspaceMixin):
        raise ConfigurationError("Workspace %s does not support lineage" % workspace.name)
    if not workspace.supports_lineage():
        raise ConfigurationError("Workspace %s does not support lineage" % workspace.name)
    snapshot_hash = None  # type: Optional[str]
    if tag_or_hash is not None:
        md = workspace.get_snapshot_by_tag_or_hash(tag_or_hash)
        snapshot_hash = md.hashval
    return lu.make_lineage_table(
        workspace.get_instance(), workspace.get_lineage_store(), snapshot_hash
    )
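
# Usage sketch (hypothetical), using the column layout from the docstring:
#
#     for (ref, lineage_type, details, inputs) in make_lineage_table():
#         print("%s | %s | %s | %s" % (ref, lineage_type, details, inputs))
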
def git(ctx, role, name, branch, read_only, export, imported, path):
    """Add a local git repository as a resource. Subcommand of ``add``"""
    ns = ctx.obj
    if role is None:
        if imported:
            role = ResourceRoles.SOURCE_DATA_SET
        elif ns.batch:
            raise BatchModeError("--role")
        else:
            role = click.prompt(
                "Please enter a role for this resource, one of [s]ource-data, [i]ntermediate-data, [c]ode, or [r]esults",
                type=ROLE_PARAM,
            )
    if export and role in (ResourceRoles.SOURCE_DATA_SET, ResourceRoles.CODE):
        raise click.BadOptionUsage(
            message="Cannot export a source data or code resource", option_name="export"
        )
    if export and imported:
        raise click.BadOptionUsage(
            message="Cannot specify both --export and --imported", option_name="imported"
        )
    if imported and role != ResourceRoles.SOURCE_DATA_SET:
        raise click.BadOptionUsage(
            message="--imported only for source-data roles", option_name="imported"
        )
    if imported:
        read_only = True
    if path.startswith("git@") or path.startswith("https://"):
        raise click.BadOptionUsage(
            message="It looks like you tried to specify a git URL (%s)." % path
            + " Currently, git resources only accept a local path."
            + " Try cloning your repository and then passing the local path to that repository.",
            option_name="path",
        )
    path = abspath(expanduser(path))
    workspace = find_and_load_workspace(ns.batch, ns.verbose, ns.workspace_dir)
    add_command("git", role, name, workspace, path, branch, read_only, export, imported)
def load_dataset_from_resource(
    resource_name: str, subpath: Optional[str] = None, workspace_dir: Optional[str] = None
) -> Bunch:
    """
    Load a dataset (data and targets) from the specified resource, and returns
    an sklearn-style Bunch (a dictionary-like object). The bunch will include
    at least three attributes:

    * ``data`` - a NumPy array of shape number_samples * number_features
    * ``target`` - a NumPy array of length number_samples
    * ``resource`` - a :class:`~dataworkspaces.workspace.ResourceRef` that
      provides the resource name and subpath (if any) for the data

    Some other attributes that may also be present, depending on the data set:

    * ``DESCR`` - text containing a full description of the data set (for humans)
    * ``feature_names`` - an array of length number_features containing the
      name of each feature.
    * ``target_names`` - an array containing the name of each target class

    Data sets may define their own attributes as well (see below).

    **Parameters**

    resource_name
      The name of the resource containing the dataset.

    subpath
      Optional subpath within the resource where this specific dataset is
      located. If not specified, the root of the resource is used.

    workspace_dir
      The root directory of your workspace in the local file system. Usually,
      this can be left unspecified and inferred by DWS, which will search up
      from the current working directory.

    **Creating a Dataset**

    To create a dataset in your resource that is suitable for importing by
    this function, you simply need to create a file for each attribute you
    want in the bunch and place all these files in the same directory within
    your resource. The names of the files should be ``ATTRIBUTE.extn`` where
    ``ATTRIBUTE`` is the attribute name (e.g. ``data`` or ``DESCR``) and
    ``.extn`` is a file extension indicating the format. Supported file
    extensions are:

    * ``.txt`` or ``.rst`` - text files
    * ``.csv`` - csv files. These are read in using ``numpy.loadtxt()``. If
      this fails because the csv does not contain all numeric data, pandas is
      used to read in the file. It is then converted back to a numpy array.
    * ``.csv.gz`` or ``.csv.bz2`` - these are compressed csv files which are
      treated the same way as csv files (numpy and pandas will automatically
      uncompress before parsing).
    * ``.npy`` - this is a file containing a serialized NumPy array saved via
      ``numpy.save()``. It is loaded using ``numpy.load()``.
    """
    workspace = find_and_load_workspace(True, False, workspace_dir)
    workspace.validate_resource_name(resource_name, subpath)
    dataset_name = (
        "Resource " + resource_name + " subpath " + subpath
        if subpath is not None
        else "Resource " + resource_name
    )
    r = workspace.get_resource(resource_name)
    if not isinstance(r, LocalStateResourceMixin) or (r.get_local_path_if_any() is None):
        # TODO: Support a data access api
        raise ConfigurationError(
            "Unable to instantiate a data set for resource '%s': currently not supported for non-local resources"
            % resource_name
        )
    local_path = r.get_local_path_if_any()
    assert local_path is not None
    dataset_path = join(local_path, subpath) if subpath is not None else local_path
    result = {}  # this will be the args to the result Bunch
    # First load the data and target files, which are required
    data_file = join(dataset_path, "data.csv")
    if exists(data_file):
        pass
    elif exists(data_file + ".gz"):
        data_file += ".gz"
    elif exists(data_file + ".bz2"):
        data_file += ".bz2"
    else:
        raise ConfigurationError(
            "Did not find data file for %s at '%s'" % (dataset_name, data_file)
        )
    result["data"] = np.loadtxt(data_file, delimiter=",")
    target_file = join(dataset_path, "target.csv")
    if exists(target_file):
        pass
    elif exists(target_file + ".gz"):
        target_file += ".gz"
    elif exists(target_file + ".bz2"):
        target_file += ".bz2"
    else:
        raise ConfigurationError(
            "Did not find target file for %s at '%s'" % (dataset_name, target_file)
        )
    result["target"] = np.loadtxt(target_file, delimiter=",")
    if result["data"].shape[0] != result["target"].shape[0]:
        raise ConfigurationError(
            "Data matrix at '%s' has %d rows, but target at '%s' has %d rows"
            % (data_file, result["data"].shape[0], target_file, result["target"].shape[0])
        )
    result["resource"] = ResourceRef(resource_name, subpath)
    # check for and load any other attributes
    for fname in os.listdir(dataset_path):
        if fname.endswith(".txt"):
            result[fname[:-4]] = _load_dataset_file(dataset_path, fname)
        elif fname.endswith(".rst"):
            result[fname[:-4]] = _load_dataset_file(dataset_path, fname)
        elif fname.endswith(".csv"):
            result[fname[:-4]] = _load_dataset_file(dataset_path, fname)
        elif fname.endswith(".csv.gz"):
            result[fname[:-7]] = _load_dataset_file(dataset_path, fname)
        elif fname.endswith(".csv.bz2"):
            result[fname[:-8]] = _load_dataset_file(dataset_path, fname)
        elif fname.endswith(".npy"):
            result[fname[:-4]] = _load_dataset_file(dataset_path, fname)
    return Bunch(**result)
def deploy_run(ctx, image_name: Optional[str], no_mount_ssh_keys: bool):
    """Run this workspace inside a docker container. This command is
    EXPERIMENTAL and subject to change."""
    ns = ctx.obj
    workspace = find_and_load_workspace(ns.batch, ns.verbose, ns.workspace_dir)
    deploy_run_command(workspace, image_name, no_mount_ssh_keys)
def add_lineage_to_keras_model_class(
    Cls: type,
    input_resource: Optional[Union[str, ResourceRef]] = None,
    results_resource: Optional[Union[str, ResourceRef]] = None,
    workspace_dir: Optional[str] = None,
    checkpoint_config: Optional[CheckpointConfig] = None,
    verbose: bool = False,
) -> type:
    """This function wraps a Keras model class with a subclass that overrides
    key methods to make calls to the data lineage API.

    **Parameters:**

    * ``Cls`` -- the class being wrapped
    * ``input_resource`` -- optional input resource to this model. The
      resource may be specified by name, by a local file path, or via a
      ``ResourceRef``. If no input is specified, will try to infer from the
      workspace.
    * ``results_resource`` -- optional resource where the results are to be
      stored. May be specified by name, by a local file path, or via a
      ``ResourceRef``. If not specified, will try to infer from the workspace.
    * ``workspace_dir`` -- optional directory specifying the workspace.
      Usually can be inferred from the current directory.
    * ``checkpoint_config`` -- optional instance of :class:`~CheckpointConfig`,
      which is used to enable checkpointing on fit() and fit_generator()
    * ``verbose`` -- if True, print extra debugging information.

    The following methods are wrapped:

    * :func:`~__init__` - loads the workspace and adds dws-specific class members
    * :func:`~compile` - captures the ``optimizer`` and ``loss_function``
      parameter values
    * :func:`~fit` - captures the ``epochs`` and ``batch_size`` parameter
      values; if the input is an API resource, captures hash values of the
      training data, otherwise captures the input resource name. If the input
      is an API resource, and it is either a Keras Sequence or a generator,
      wraps the generator and captures the hashes of the returned values as it
      is iterated through.
    * :func:`~evaluate` - captures the ``batch_size`` parameter value; if the
      input is an API resource, captures hash values of the test data,
      otherwise captures the input resource name; captures the metrics and
      writes them to the results resource. If the input is an API resource,
      and it is either a Keras Sequence or a generator, wraps the generator
      and captures the hashes of the returned values as it is iterated through.
    """
    if hasattr(Cls, "_dws_model_wrap") and Cls._dws_model_wrap is True:  # type: ignore
        print("dws>> %s or a superclass is already wrapped" % Cls.__name__)
        return Cls  # already wrapped
    workspace = find_and_load_workspace(
        batch=True, verbose=verbose, uri_or_local_path=workspace_dir
    )

    class WrappedModel(Cls):  # type: ignore
        _dws_model_wrap = True

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self._dws_state = _DwsModelState(workspace, input_resource, results_resource)
            if checkpoint_config is not None:
                self.checkpoint_cb = DwsModelCheckpoint(
                    checkpoint_config.model_name,
                    monitor=checkpoint_config.monitor,
                    save_best_only=checkpoint_config.save_best_only,
                    mode=checkpoint_config.mode,
                    save_freq=checkpoint_config.save_freq,
                    results_resource=results_resource,
                    workspace_dir=workspace_dir,
                    verbose=verbose,
                )  # type: Optional[DwsModelCheckpoint]
            else:
                self.checkpoint_cb = None

        def compile(
            self,
            optimizer,
            loss=None,
            metrics=None,
            loss_weights=None,
            sample_weight_mode=None,
            weighted_metrics=None,
            target_tensors=None,
            distribute=None,
            **kwargs,
        ):
            if isinstance(optimizer, str):
                self._dws_state.lineage.add_param("optimizer", optimizer)
            elif isinstance(optimizer, optimizers.Optimizer):
                self._dws_state.lineage.add_param("optimizer", optimizer.__class__.__name__)
            if isinstance(loss, str):
                self._dws_state.lineage.add_param("loss_function", loss)
            elif isinstance(loss, losses.Loss):
                self._dws_state.lineage.add_param("loss_function", loss.__class__.__name__)
            if tensorflow.__version__ < "2.2.":  # type: ignore
                return super().compile(
                    optimizer,
                    loss,
                    metrics,
                    loss_weights,
                    sample_weight_mode,
                    weighted_metrics,
                    target_tensors,
                    distribute,
                    **kwargs,
                )
            else:
                # starting in 2.2, tensorflow removed the target_tensors and distribute args
                return super().compile(
                    optimizer,
                    loss,
                    metrics,
                    loss_weights,
                    sample_weight_mode,
                    weighted_metrics,
                    **kwargs,
                )

        def fit(self, x, y=None, **kwargs):
            """x, y can be arrays, or x can be a generator."""
            if "epochs" in kwargs:
                self._dws_state.lineage.add_param("fit.epochs", kwargs["epochs"])
            else:
                self._dws_state.lineage.add_param("fit.epochs", 1)
            if "batch_size" in kwargs:
                self._dws_state.lineage.add_param("fit.batch_size", kwargs["batch_size"])
            else:
                self._dws_state.lineage.add_param("fit.batch_size", None)
            api_resource = self._dws_state.find_input_resources_and_return_if_api(x, y)
            if api_resource is not None:
                _verify_eager_if_dataset(x, y, api_resource)
                api_resource.init_hash_state()
                hash_state = api_resource.get_hash_state()
                if isinstance(x, kerasutils.Sequence):
                    if y is not None:
                        raise NotSupportedError(
                            "fit() method does not support a generator for x AND a y value"
                        )
                    x = _TfKerasSequenceWrapper(x, hash_state)
                elif isinstance(x, GeneratorType):
                    if y is not None:
                        raise NotSupportedError(
                            "fit() method does not support a generator for x AND a y value"
                        )
                    x = _wrap_generator(x, hash_state)
                else:  # x and y are provided as full arrays
                    _add_to_hash(x, hash_state)
                    if y is not None:
                        _add_to_hash(y, hash_state)
                    api_resource.save_current_hash()  # in case we evaluate in a separate process
            if self.checkpoint_cb:
                if "callbacks" in kwargs:
                    kwargs["callbacks"].append(self.checkpoint_cb)
                else:
                    kwargs["callbacks"] = [
                        self.checkpoint_cb,
                    ]
            return super().fit(x, y, **kwargs)

        def fit_generator(
            self,
            generator,
            steps_per_epoch=None,
            epochs=1,
            verbose=1,
            callbacks=None,
            validation_data=None,
            validation_steps=None,
            validation_freq=1,
            class_weight=None,
            max_queue_size=10,
            workers=1,
            use_multiprocessing=False,
            shuffle=True,
            initial_epoch=0,
        ):
            self._dws_state.lineage.add_param("fit_generator.epochs", epochs)
            self._dws_state.lineage.add_param("fit_generator.steps_per_epoch", steps_per_epoch)
            api_resource = self._dws_state.find_input_resources_and_return_if_api(generator)
            if api_resource is not None:
                # wrap the generator to capture each entry as it is returned
                api_resource.init_hash_state()
                hash_state = api_resource.get_hash_state()
                if isinstance(generator, kerasutils.Sequence):
                    generator = _TfKerasSequenceWrapper(generator, hash_state)
                else:
                    generator = _wrap_generator(generator, hash_state)
            if self.checkpoint_cb:
                if callbacks is not None:
                    callbacks.append(self.checkpoint_cb)
                else:
                    callbacks = [
                        self.checkpoint_cb,
                    ]
            results = super().fit_generator(
                generator,
                steps_per_epoch,
                epochs,
                verbose,
                callbacks,
                validation_data,
                validation_steps,
                validation_freq,
                class_weight,
                max_queue_size,
                workers,
                use_multiprocessing,
                shuffle,
                initial_epoch,
            )
            if api_resource is not None:
                api_resource.save_current_hash()
            return results

        def evaluate(self, x, y=None, **kwargs):
            if "batch_size" in kwargs:
                self._dws_state.lineage.add_param("evaluate.batch_size", kwargs["batch_size"])
            else:
                self._dws_state.lineage.add_param("evaluate.batch_size", None)
            api_resource = self._dws_state.find_input_resources_and_return_if_api(x, y)
            if api_resource is not None:
                _verify_eager_if_dataset(x, y, api_resource)
                api_resource.dup_hash_state()
                hash_state = api_resource.get_hash_state()
                if isinstance(x, kerasutils.Sequence):
                    if y is not None:
                        raise NotSupportedError(
                            "evaluate() method does not support a generator for x AND a y value"
                        )
                    x = _TfKerasSequenceWrapper(x, hash_state)
                elif isinstance(x, GeneratorType):
                    if y is not None:
                        raise NotSupportedError(
                            "evaluate() method does not support a generator for x AND a y value"
                        )
                    x = _wrap_generator(x, hash_state)
                else:
                    _add_to_hash(x, hash_state)
                    if y is not None:
                        _add_to_hash(y, hash_state)
            results = super().evaluate(x, y, **kwargs)
            assert len(results) == len(self.metrics_names)
            if api_resource is not None:
                api_resource.save_current_hash()
                api_resource.pop_hash_state()
            self._dws_state.write_metrics_and_complete(
                {n: v for (n, v) in zip(self.metrics_names, results)}
            )
            return results

        def evaluate_generator(
            self,
            generator,
            steps=None,
            callbacks=None,
            max_queue_size=10,
            workers=1,
            use_multiprocessing=False,
            verbose=0,
        ):
            self._dws_state.lineage.add_param("evaluate_generator.steps", steps)
            api_resource = self._dws_state.find_input_resources_and_return_if_api(generator)
            if api_resource is not None:
                # wrap the generator to capture each entry as it is returned
                api_resource.dup_hash_state()
                hash_state = api_resource.get_hash_state()
                if isinstance(generator, kerasutils.Sequence):
                    generator = _TfKerasSequenceWrapper(generator, hash_state)
                else:
                    generator = _wrap_generator(generator, hash_state)
            results = super().evaluate_generator(
                generator, steps, callbacks, max_queue_size, workers, use_multiprocessing, verbose
            )
            if api_resource is not None:
                api_resource.save_current_hash()
                api_resource.pop_hash_state()
            assert len(results) == len(self.metrics_names)
            self._dws_state.write_metrics_and_complete(
                {n: v for (n, v) in zip(self.metrics_names, results)}
            )
            return results

    WrappedModel.__name__ = Cls.__name__  # this is to fake things out for the reporting
    if workspace.verbose:
        print("dws>> Wrapped model class %s" % Cls.__name__)
    return WrappedModel
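
# Usage sketch (hypothetical; the model architecture is illustrative and
# assumes a workspace with suitable input and results resources):
#
#     import tensorflow.keras as keras
#     Model = add_lineage_to_keras_model_class(keras.Model)
#     inputs = keras.Input(shape=(784,))
#     outputs = keras.layers.Dense(10, activation="softmax")(inputs)
#     model = Model(inputs, outputs)  # compile/fit/evaluate now record lineage
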
def rclone(
    ctx,
    role,
    name,
    config: str,
    compute_hash: bool,
    export: bool,
    imported: bool,
    master: str,
    sync_mode: str,
    size_only: bool,
    remote: str,
    local_path: str,
):
    """Add an rclone-d repository as a resource to the workspace. Subcommand
    of ``add``. This is designed for uni-directional synchronization between
    a remote and a local_path. The remote has the form
    remote_name:remote_path, where remote_name is an entry in your rclone
    config file.
    """
    ns = ctx.obj
    if role is None:
        if imported:
            role = ResourceRoles.SOURCE_DATA_SET
        elif ns.batch:
            raise BatchModeError("--role")
        else:
            role = click.prompt(
                "Please enter a role for this resource, one of [s]ource-data, [i]ntermediate-data, [c]ode, or [r]esults",
                type=ROLE_PARAM,
            )
    rclone_re = r".*:.*"
    if re.match(rclone_re, remote) is None:
        raise click.BadOptionUsage(
            message="Remote in rclone should be specified as remotename:filepath",
            option_name="remote",
        )
    if export and role in (ResourceRoles.SOURCE_DATA_SET, ResourceRoles.CODE):
        raise click.BadOptionUsage(
            message="Cannot export a source data or code resource", option_name="export"
        )
    if export and imported:
        raise click.BadOptionUsage(
            message="Cannot specify both --export and --imported", option_name="imported"
        )
    if imported and role != ResourceRoles.SOURCE_DATA_SET:
        raise click.BadOptionUsage(
            message="--imported only for source-data roles", option_name="imported"
        )
    local_path = abspath(expanduser(local_path))
    workspace = find_and_load_workspace(ns.batch, ns.verbose, ns.workspace_dir)
    add_command(
        "rclone",
        role,
        name,
        workspace,
        remote,
        local_path,
        config,
        compute_hash,
        export,
        imported,
        master,
        sync_mode,
        size_only,
    )