def load_model(experiment_id: Optional[str] = None,
               operator_id: Optional[str] = None) -> Dict[str, object]:
    """Retrieves a model from object storage.

    Args:
        experiment_id (str, optional): the experiment uuid. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.

    Returns:
        dict: A dictionary of models.

    Raises:
        TypeError: when experiment_id is undefined in args and env.
        TypeError: when operator_id is undefined in args and env.
    """
    if experiment_id is None:
        experiment_id = get_experiment_id()

    if operator_id is None:
        operator_id = get_operator_id()

    try:
        object_name = f"{PREFIX_1}/{experiment_id}/{PREFIX_2}/{operator_id}/{MODEL_FILE}"
        data = MINIO_CLIENT.get_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
        )
    except (NoSuchBucket, NoSuchKey):
        return {}

    buffer = BytesIO(data.read())
    return load(buffer)
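# Usage sketch (illustrative, not part of the module): assumes a model was
# stored earlier with save_model under the keyword "model"; the key name and
# the error handling below are hypothetical, only the call itself is real.
#
#   artifacts = load_model()  # ids resolved from env variables
#   model = artifacts.get("model")
#   if model is None:
#       raise RuntimeError("no model stored for this operator yet")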
def list_metrics(experiment_id: Optional[str] = None,
                 operator_id: Optional[str] = None,
                 run_id: Optional[str] = None) -> List[Dict[str, object]]:
    """Lists metrics from object storage.

    Args:
        experiment_id (str, optional): the experiment uuid. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.
        run_id (str, optional): the run id. Defaults to None.

    Returns:
        list: A list of metrics.

    Raises:
        TypeError: when experiment_id is undefined in args and env.
        TypeError: when operator_id is undefined in args and env.
        FileNotFoundError: when the metrics file does not exist.
    """
    if experiment_id is None:
        experiment_id = get_experiment_id()

    if operator_id is None:
        operator_id = get_operator_id()

    # ensures MinIO bucket exists
    make_bucket(BUCKET_NAME)

    if run_id is None:
        # gets run_id from env variable
        # Attention: returns None if env is unset
        run_id = get_run_id()
    elif run_id == "latest":
        try:
            metadata = stat_metadata(experiment_id, operator_id)
            run_id = metadata.get("run_id")
        except FileNotFoundError:
            return []

    try:
        object_name = operator_filepath(METRICS_FILE, experiment_id, operator_id, run_id)
        data = MINIO_CLIENT.get_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
        )
    except (NoSuchBucket, NoSuchKey):
        raise FileNotFoundError(f"No such file or directory: '{experiment_id}'")

    # metrics are stored as JSON (see save_metrics), so they are decoded with
    # loads instead of the pickle-based load used for models
    return loads(data.read())
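# Usage sketch (illustrative; the metric names shown are hypothetical and
# depend on what save_metrics stored for the run):
#
#   for metric in list_metrics(run_id="latest"):
#       print(metric)  # e.g. {"accuracy": 0.87}
#
# run_id="latest" resolves the most recent run through the stored .metadata.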
def save_model(**kwargs):
    """Serializes and saves models.

    Args:
        **kwargs: the models as keyword arguments.

    Raises:
        TypeError: when experiment_id is undefined in args and env.
        TypeError: when operator_id is undefined in args and env.
    """
    # pops the ids so they are not serialized together with the models
    experiment_id = kwargs.pop("experiment_id", None)
    if experiment_id is None:
        experiment_id = get_experiment_id()

    operator_id = kwargs.pop("operator_id", None)
    if operator_id is None:
        operator_id = get_operator_id()

    object_name = f"{PREFIX_1}/{experiment_id}/{PREFIX_2}/{operator_id}/{MODEL_FILE}"

    model_buffer = BytesIO()
    dump(kwargs, model_buffer)
    model_buffer.seek(0, SEEK_SET)

    # ensures MinIO bucket exists
    make_bucket(BUCKET_NAME)

    # uploads file to MinIO
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=object_name,
        data=model_buffer,
        length=model_buffer.getbuffer().nbytes,
    )
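# Usage sketch (illustrative; scikit-learn is only an example of a picklable
# object and is not required by this module):
#
#   from sklearn.datasets import load_iris
#   from sklearn.linear_model import LogisticRegression
#
#   X, y = load_iris(return_X_y=True)
#   save_model(model=LogisticRegression(max_iter=200).fit(X, y))
#
# Each keyword becomes a key in the dict later returned by load_model.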
def save_metrics(experiment_id: Optional[str] = None,
                 operator_id: Optional[str] = None,
                 run_id: Optional[str] = None,
                 **kwargs):
    """Saves metrics of an experiment to the object storage.

    Args:
        experiment_id (str, optional): the experiment uuid. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.
        run_id (str, optional): the run id. Defaults to None.
        **kwargs: the metrics as keyword arguments.

    Raises:
        TypeError: when experiment_id is undefined in args and env.
        TypeError: when operator_id is undefined in args and env.
    """
    if experiment_id is None:
        experiment_id = get_experiment_id()

    if operator_id is None:
        operator_id = get_operator_id()

    if run_id is None:
        # gets run_id from env variables
        # Attention: returns None if env is unset
        run_id = get_run_id()

    # ensures MinIO bucket exists
    make_bucket(BUCKET_NAME)

    if run_id:
        metadata = {}
        try:
            metadata = stat_metadata(experiment_id, operator_id)
            if run_id == "latest":
                run_id = metadata.get("run_id")
        except FileNotFoundError:
            pass
        metadata["run_id"] = run_id

        # encodes metadata to JSON format and uploads it to MinIO
        # (uses PREFIX_1/PREFIX_2 for consistency with the other object
        # paths in this module)
        buffer = BytesIO(dumps(metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=f"{PREFIX_1}/{experiment_id}/{PREFIX_2}/{operator_id}/.metadata",
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

    object_name = operator_filepath(METRICS_FILE, experiment_id, operator_id, run_id)

    encoded_metrics = []

    # retrieves the metrics saved previously
    try:
        data = MINIO_CLIENT.get_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
        )
        encoded_metrics = loads(data.read())
    except NoSuchKey:
        pass

    # appends new metrics
    encoded_metrics.extend(_encode_metrics(kwargs))

    # puts metrics into a buffer and uploads it to MinIO
    buffer = BytesIO(dumps(encoded_metrics).encode())
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=object_name,
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
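# Usage sketch (illustrative metric names and values):
#
#   save_metrics(accuracy=0.87, f1_score=0.92)
#
# Repeated calls append to the run's metrics file rather than overwriting it,
# so list_metrics returns the full history for the run.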
def save_dataset(name: str,
                 data: Optional[Union[pd.DataFrame, BinaryIO]] = None,
                 df: Optional[pd.DataFrame] = None,
                 metadata: Optional[Dict[str, str]] = None,
                 run_id: Optional[str] = None,
                 operator_id: Optional[str] = None):
    """Saves a dataset and its metadata to the object storage.

    Args:
        name (str): the dataset name.
        data (pandas.DataFrame, BinaryIO, optional): the dataset contents as a
            `pandas.DataFrame` or a `BinaryIO` buffer. Defaults to None.
        df (pandas.DataFrame, optional): the dataset contents as a
            `pandas.DataFrame`. df exists only for compatibility with existing
            components. Use "data" for all types of datasets. Defaults to None.
        metadata (dict, optional): metadata about the dataset. Defaults to None.
        run_id (str, optional): the run id. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.

    Raises:
        PermissionError: when the dataset is read-only.
    """
    # ensures MinIO bucket exists
    make_bucket(BUCKET_NAME)

    if run_id is None:
        # gets run_id from env variables
        # Attention: returns None if env is unset
        run_id = get_run_id()

    if operator_id is None:
        # gets operator_id from env variables
        # Attention: returns None if env is unset
        operator_id = get_operator_id(raise_for_none=False)

    # df exists only for compatibility with existing components;
    # from now on one must use "data" for all types of datasets
    if df is not None:
        data = df

    try:
        # gets metadata (if dataset exists)
        stored_metadata = stat_dataset(name, run_id)
        metadata_should_be_updated = False

        # updates stored metadata values
        if metadata:
            stored_metadata.update(metadata)
        elif isinstance(data, pd.DataFrame):
            metadata_should_be_updated = True

        metadata = stored_metadata
    except FileNotFoundError:
        metadata_should_be_updated = False

    # builds metadata dict:
    # sets filename and run_id
    if metadata is None:
        metadata = {}

    metadata["filename"] = name

    if isinstance(data, pd.DataFrame):
        # sets metadata specific to pandas.DataFrame:
        # columns, featuretypes
        metadata["columns"] = data.columns.tolist()
        metadata["total"] = len(data.index)
        if "featuretypes" not in metadata:
            metadata["featuretypes"] = infer_featuretypes(data)

    # if the metadata was given (set manually), ignores updates; otherwise,
    # searches for changes and updates the featuretypes to match the columns
    if metadata_should_be_updated:
        previous_metadata = stat_dataset(name, run_id)
        previous_columns = previous_metadata["columns"]
        previous_featuretypes = previous_metadata["featuretypes"]
        column_to_type = dict(zip(previous_columns, previous_featuretypes))

        new_featuretypes = []
        for new_column in metadata["columns"]:
            if new_column in column_to_type:
                new_featuretypes.append(column_to_type[new_column])
            else:
                new_featuretypes.append(
                    infer_featuretypes(pd.DataFrame(data[new_column]))[0])

        metadata["featuretypes"] = new_featuretypes

    if run_id:
        metadata["run_id"] = run_id

        # when saving a dataset of a run, also sets the run_id in
        # datasets/<name>.metadata; this enables load_dataset with run="latest"
        try:
            root_metadata = stat_dataset(name, "root")
        except FileNotFoundError:
            root_metadata = {}

        root_metadata["run_id"] = run_id
        object_name = _metadata_filepath(name)

        # encodes metadata to JSON format
        buffer = BytesIO(dumps(root_metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

        # creates a run metadata that records the last operator_id, so the
        # dataset can be loaded by the next step of the pipeline flow
        metadata["operator_id"] = operator_id
        object_name = _metadata_filepath(name, run_id=run_id)
        buffer = BytesIO(dumps(metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

    path = _data_filepath(name, run_id, operator_id)
    # str.lstrip removes a *set of characters*, not a prefix, so the leading
    # "<bucket>/" is stripped by slicing instead
    data_object_name = path[len(f"{BUCKET_NAME}/"):]

    if isinstance(data, pd.DataFrame):
        # uploads dataframe to MinIO as a .csv file
        temp_file = tempfile.NamedTemporaryFile(dir=".", delete=False)
        temp_file.close()
        data.to_csv(temp_file.name, header=True, index=False)
        MINIO_CLIENT.fput_object(
            bucket_name=BUCKET_NAME,
            object_name=data_object_name,
            file_path=temp_file.name,
        )
        os.remove(temp_file.name)
    else:
        # uploads raw data to MinIO
        buffer = BytesIO(data.read())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=data_object_name,
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

    object_name = _metadata_filepath(name, run_id, operator_id)

    # encodes metadata to JSON format
    buffer = BytesIO(dumps(metadata).encode())
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=object_name,
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
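# Usage sketch (illustrative; the dataset name, columns, and metadata key
# are hypothetical):
#
#   df = pd.DataFrame({"sepal_length": [5.1, 4.9],
#                      "species": ["setosa", "setosa"]})
#   save_dataset(name="iris", data=df, metadata={"source": "example"})
#
# DataFrames are uploaded as .csv files with columns/featuretypes recorded in
# the metadata; any other BinaryIO payload is uploaded verbatim.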