def load_model(experiment_id: Optional[str] = None,
               operator_id: Optional[str] = None) -> Dict[str, object]:
    """Retrieves a model from object storage.

    Args:
        experiment_id (str, optional): the experiment uuid. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.

    Returns:
        dict: A dictionary of models.

    Raises:
        TypeError: when experiment_id is undefined in args and env.
        TypeError: when operator_id is undefined in args and env.
    """
    if experiment_id is None:
        experiment_id = get_experiment_id()

    if operator_id is None:
        operator_id = get_operator_id()

    try:
        object_name = f"{PREFIX_1}/{experiment_id}/{PREFIX_2}/{operator_id}/{MODEL_FILE}"
        data = MINIO_CLIENT.get_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
        )
    except (NoSuchBucket, NoSuchKey):
        return {}

    buffer = BytesIO(data.read())
    return load(buffer)
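
# Usage sketch (illustrative only, kept as a comment so it does not run at import
# time): loading a model saved by a previous training step. The uuids below are
# hypothetical placeholders; in a real run both ids can also come from env variables.
#
#     model = load_model(
#         experiment_id="00000000-0000-0000-0000-000000000000",  # hypothetical uuid
#         operator_id="11111111-1111-1111-1111-111111111111",    # hypothetical uuid
#     )
#     # an empty dict means no model artifact was found in object storage
#     if not model:
#         print("no model found for this operator")
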
def list_metrics(experiment_id: Optional[str] = None,
                 operator_id: Optional[str] = None,
                 run_id: Optional[str] = None) -> List[Dict[str, object]]:
    """Lists metrics from object storage.

    Args:
        experiment_id (str, optional): the experiment uuid. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.
        run_id (str, optional): the run id. Defaults to None.

    Returns:
        list: A list of metrics.

    Raises:
        TypeError: when experiment_id is undefined in args and env.
        TypeError: when operator_id is undefined in args and env.
    """
    if experiment_id is None:
        experiment_id = get_experiment_id()

    if operator_id is None:
        operator_id = get_operator_id()

    # ensures MinIO bucket exists
    make_bucket(BUCKET_NAME)

    if run_id is None:
        # gets run_id from env variable
        # Attention: returns None if env is unset
        run_id = get_run_id()
    elif run_id == "latest":
        try:
            metadata = stat_metadata(experiment_id, operator_id)
            run_id = metadata.get("run_id")
        except FileNotFoundError:
            return []

    try:
        object_name = operator_filepath(METRICS_FILE, experiment_id, operator_id, run_id)
        data = MINIO_CLIENT.get_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
        )
    except (NoSuchBucket, NoSuchKey):
        raise FileNotFoundError(f"No such file or directory: '{experiment_id}'")

    return load(data)
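
# Usage sketch (illustrative only): listing the metrics logged for the latest run
# of an operator. Passing run_id="latest" resolves the run id from the operator's
# metadata; the uuids below are hypothetical placeholders.
#
#     try:
#         metrics = list_metrics(
#             experiment_id="00000000-0000-0000-0000-000000000000",  # hypothetical uuid
#             operator_id="11111111-1111-1111-1111-111111111111",    # hypothetical uuid
#             run_id="latest",
#         )
#     except FileNotFoundError:
#         metrics = []  # no metrics file stored yet for this operator
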
def save_metrics(experiment_id: Optional[str] = None,
                 operator_id: Optional[str] = None,
                 run_id: Optional[str] = None,
                 **kwargs):
    """Saves metrics of an experiment to the object storage.

    Args:
        experiment_id (str, optional): the experiment uuid. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.
        run_id (str, optional): the run id. Defaults to None.
        **kwargs: the metrics dict.

    Raises:
        TypeError: when experiment_id is undefined in args and env.
        TypeError: when operator_id is undefined in args and env.
    """
    if experiment_id is None:
        experiment_id = get_experiment_id()

    if operator_id is None:
        operator_id = get_operator_id()

    if run_id is None:
        # gets run_id from env variables
        # Attention: returns None if env is unset
        run_id = get_run_id()

    # ensures MinIO bucket exists
    make_bucket(BUCKET_NAME)

    if run_id:
        metadata = {}
        try:
            metadata = stat_metadata(experiment_id, operator_id)
            if run_id == "latest":
                run_id = metadata.get("run_id")
        except FileNotFoundError:
            pass
        metadata["run_id"] = run_id

        # encodes metadata to JSON format and uploads to MinIO
        buffer = BytesIO(dumps(metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=f'experiments/{experiment_id}/operators/{operator_id}/.metadata',
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

    object_name = operator_filepath(METRICS_FILE, experiment_id, operator_id, run_id)

    encoded_metrics = []

    # retrieves the metrics saved previously
    try:
        data = MINIO_CLIENT.get_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
        )
        encoded_metrics = loads(data.read())
    except NoSuchKey:
        pass

    # appends new metrics
    encoded_metrics.extend(_encode_metrics(kwargs))

    # puts metrics into buffer
    buffer = BytesIO(dumps(encoded_metrics).encode())

    # uploads metrics to MinIO
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=object_name,
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
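
# Usage sketch (illustrative only): persisting metrics as keyword arguments.
# Each keyword becomes one metric entry appended to the stored list; the metric
# names and values shown here are made-up examples.
#
#     save_metrics(
#         experiment_id="00000000-0000-0000-0000-000000000000",  # hypothetical uuid
#         operator_id="11111111-1111-1111-1111-111111111111",    # hypothetical uuid
#         accuracy=0.87,
#         f1_score=0.83,
#     )
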
def load_dataset(name: str,
                 run_id: Optional[str] = None,
                 operator_id: Optional[str] = None,
                 page: Optional[int] = None,
                 page_size: Optional[int] = None) -> Union[pd.DataFrame, BinaryIO]:
    """Retrieves the contents of a dataset.

    If run_id exists, loads the dataset from the specified run.
    If the dataset does not exist for the given run_id/operator_id,
    returns the 'original' dataset.

    Args:
        name (str): the dataset name.
        run_id (str, optional): the run id of the training pipeline. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.
        page (int, optional): the page number (1-based) used for pagination. Defaults to None.
        page_size (int, optional): the number of rows per page. Defaults to None.

    Returns:
        The contents of a dataset. Either a `pandas.DataFrame` or a `BinaryIO` buffer.

    Raises:
        FileNotFoundError: If the dataset does not exist in the object storage.
    """
    if run_id is None:
        # gets run_id from env variable
        # Attention: returns None if env is unset
        run_id = get_run_id()
    elif run_id == "latest":
        metadata = stat_dataset(name)
        run_id = metadata.get("run_id")

    # when the dataset does not exist for the given run_id/operator_id,
    # the 'original' dataset must be returned:
    # unset run_id so data_filepath points to the 'original' dataset
    if run_id and operator_id:
        try:
            metadata = stat_dataset(name, run_id, operator_id)
        except FileNotFoundError:
            run_id = None
    elif run_id:
        try:
            run_metadata = stat_dataset(name, run_id)
            operator_id = run_metadata.get("operator_id")
        except FileNotFoundError:
            run_id = None

    # builds the path to the dataset file
    path = _data_filepath(name, run_id, operator_id)

    if page_size and page_size > 0:
        nrows = page_size
    else:
        nrows = None

    if page and page > 0:
        skiprows = (page - 1) * page_size
    else:
        skiprows = None

    try:
        metadata = stat_dataset(name, run_id, operator_id)
        dataset = pd.read_csv(S3FS.open(path),
                              header=0,
                              index_col=False,
                              nrows=nrows,
                              skiprows=skiprows)
        dtypes = dict((column, "object")
                      for column, ftype in zip(metadata["columns"], metadata["featuretypes"])
                      if ftype in [CATEGORICAL, DATETIME])
        dataset = dataset.astype(dtypes)
    except (UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError):
        # reads the raw file
        data = MINIO_CLIENT.get_object(
            bucket_name=BUCKET_NAME,
            object_name=path.lstrip(f"{BUCKET_NAME}/"),
        )
        return BytesIO(data.read())
    except KeyError:
        # metadata file does not contain "columns" or "featuretypes";
        # ignore this error and return the dataset without casting its types
        pass
    except FileNotFoundError:
        raise FileNotFoundError("The specified dataset does not exist")

    return dataset
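
# Usage sketch (illustrative only): reading a dataset page by page. With page and
# page_size set, only page_size rows of the requested page are parsed; outside a
# run context the 'original' dataset is read. "iris" is a hypothetical dataset name.
#
#     df_or_buffer = load_dataset(name="iris", page=1, page_size=100)
#     if isinstance(df_or_buffer, pd.DataFrame):
#         print(df_or_buffer.shape)
#     else:
#         raw_bytes = df_or_buffer.read()  # non-CSV datasets come back as a binary buffer
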
def stat_dataset(name: str,
                 run_id: Optional[str] = None,
                 operator_id: Optional[str] = None) -> Dict[str, str]:
    """Retrieves the metadata of a dataset.

    Args:
        name (str): the dataset name.
        run_id (str, optional): the run id of the training pipeline. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.

    Returns:
        dict: The metadata.

    Raises:
        FileNotFoundError: If the dataset does not exist in the object storage.
    """
    metadata = {}

    # removes /tmp/data/ from the dataset name,
    # because in Jupyter the dataset is referenced by its full path
    name = name.replace("/tmp/data/", "")

    if run_id == "latest":
        metadata = stat_dataset(name)
        run_id = metadata.get("run_id")

    if run_id is None:
        # gets run_id from env variables
        # Attention: returns None if env is unset
        run_id = get_run_id()

        if run_id and operator_id:
            # gets metadata for a specific operator of a run, if it exists
            object_name = _metadata_filepath(name, run_id, operator_id)
        elif run_id:
            # if no metadata was generated by the operator,
            # gets the last one generated by the pipeline flow
            object_name = _metadata_filepath(name, run_id)
            if not metadata_exists(name, run_id):
                # if it is at the beginning of a run,
                # there will be no metadata generated by run_id
                object_name = _metadata_filepath(name)
        else:
            # unable to get run_id automatically,
            # this function is probably being called outside of a run
            object_name = _metadata_filepath(name)
    else:
        # gets the path according to the received parameters
        run_id = None if run_id == "root" else run_id
        object_name = _metadata_filepath(name, run_id, operator_id)

    try:
        # reads the .metadata file
        data = MINIO_CLIENT.get_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
        )
        # decodes the metadata (which is in JSON format)
        metadata = loads(data.read())
    except (NoSuchBucket, NoSuchKey):
        raise FileNotFoundError("The specified dataset does not exist")

    return metadata
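
# Usage sketch (illustrative only): inspecting dataset metadata before loading it.
# "iris" is a hypothetical dataset name; the keys shown ("columns", "featuretypes",
# "run_id") are the ones this module reads from the metadata elsewhere.
#
#     try:
#         meta = stat_dataset("iris")
#         print(meta.get("columns"), meta.get("featuretypes"))
#     except FileNotFoundError:
#         print("The specified dataset does not exist")
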