def setUp(self):
    self.make_bucket()

    buffer = io.BytesIO(b"mock")
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name="artifacts/mock.txt",
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
def create_mock_figure(self):
    file = BytesIO(
        b'<svg viewBox=\'0 0 125 80\' xmlns=\'http://www.w3.org/2000/svg\'>\n'
        b'<text y="75" font-size="100" font-family="serif"><![CDATA[10]]></text>\n'
        b'</svg>\n')
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name="experiments/test/operators/test/figure-123456.svg",
        data=file,
        length=file.getbuffer().nbytes,
    )
def create_mock_model(self):
    model = {"model": MockModel()}
    buffer = BytesIO()
    dump(model, buffer)
    buffer.seek(0, SEEK_SET)
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=f"experiments/{EXPERIMENT_ID}/operators/{OPERATOR_ID}/model.joblib",
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
def load_model(experiment_id: Optional[str] = None,
               operator_id: Optional[str] = None) -> Dict[str, object]:
    """Retrieves a model from object storage.

    Args:
        experiment_id (str, optional): the experiment uuid. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.

    Returns:
        dict: A dictionary of models.

    Raises:
        TypeError: when experiment_id is undefined in args and env.
        TypeError: when operator_id is undefined in args and env.
    """
    if experiment_id is None:
        experiment_id = get_experiment_id()

    if operator_id is None:
        operator_id = get_operator_id()

    try:
        object_name = f"{PREFIX_1}/{experiment_id}/{PREFIX_2}/{operator_id}/{MODEL_FILE}"
        data = MINIO_CLIENT.get_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
        )
    except (NoSuchBucket, NoSuchKey):
        return {}

    buffer = BytesIO(data.read())

    return load(buffer)
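# Illustrative usage sketch (not part of the original module): loading back a
# previously saved model. The uuid strings are placeholders; in a pipeline run
# they usually come from the EXPERIMENT_ID and OPERATOR_ID environment variables,
# in which case load_model() can be called without arguments.
def _example_load_model():
    artifacts = load_model(experiment_id="<experiment-uuid>",
                           operator_id="<operator-uuid>")
    # load_model returns an empty dict when no model.joblib exists yet
    return artifacts.get("model")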
def download_artifact(name: str, path: str):
    """Downloads the given artifact to the path.

    Args:
        name (str): the artifact name.
        path (str): destination path.

    Raises:
        FileNotFoundError: If the artifact does not exist in the object storage.
    """
    try:
        MINIO_CLIENT.fget_object(
            bucket_name=BUCKET_NAME,
            object_name=f"{PREFIX}/{name}",
            file_path=path,
        )
    except (NoSuchBucket, NoSuchKey):
        raise FileNotFoundError("The specified artifact does not exist")
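# Illustrative usage sketch (assumption): fetching an artifact stored under the
# artifacts prefix to a local file. "mock.txt" and the destination path are
# placeholders; FileNotFoundError is raised when the object does not exist.
def _example_download_artifact():
    download_artifact(name="mock.txt", path="/tmp/data/mock.txt")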
def create_mock_dataset3(self):
    with open("mock.jpg", 'wb') as imagef:
        imagef.write(MOCK_IMAGE)
    MINIO_CLIENT.fput_object(
        bucket_name=BUCKET_NAME,
        object_name="datasets/mock.jpg/mock.jpg",
        file_path="mock.jpg",
    )
    metadata = {
        "filename": "mock.jpg",
    }
    buffer = BytesIO(dumps(metadata).encode())
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name="datasets/mock.jpg/mock.jpg.metadata",
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
def create_mock_dataset2(self):
    with ZipFile("mock.zip", "w") as zipf:
        zipf.writestr("mock.gif", MOCK_IMAGE)
    MINIO_CLIENT.fput_object(
        bucket_name=BUCKET_NAME,
        object_name="datasets/mock.zip/mock.zip",
        file_path="mock.zip",
    )
    metadata = {
        "filename": "mock.zip",
    }
    buffer = BytesIO(dumps(metadata).encode())
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name="datasets/mock.zip/mock.zip.metadata",
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
def update_dataset_metadata(name: str,
                            metadata: Dict[str, str],
                            run_id: Optional[str] = None,
                            operator_id: Optional[str] = None):
    """Updates the metadata of a dataset.

    Args:
        name (str): the dataset name.
        metadata (dict): metadata about the dataset.
        run_id (str, optional): the run id of the training pipeline. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.
    """
    object_name = _metadata_filepath(name, run_id, operator_id)

    # encodes metadata to JSON format
    buffer = BytesIO(dumps(metadata).encode())

    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=object_name,
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
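# Illustrative usage sketch (assumption): replacing the stored metadata of a
# dataset. The dataset name and metadata values are placeholders; the dict is
# JSON-encoded and written to the dataset's .metadata object.
def _example_update_dataset_metadata():
    update_dataset_metadata(name="iris.csv", metadata={"filename": "iris.csv"})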
def save_model(**kwargs):
    """Serializes and saves models.

    Args:
        **kwargs: the models as keyword arguments.

    Raises:
        TypeError: when experiment_id is undefined in args and env.
        TypeError: when operator_id is undefined in args and env.
    """
    experiment_id = kwargs.get("experiment_id")
    if experiment_id is None:
        experiment_id = get_experiment_id()

    operator_id = kwargs.get("operator_id")
    if operator_id is None:
        operator_id = get_operator_id()

    object_name = f"{PREFIX_1}/{experiment_id}/{PREFIX_2}/{operator_id}/{MODEL_FILE}"

    model_buffer = BytesIO()
    dump(kwargs, model_buffer)
    model_buffer.seek(0, SEEK_SET)

    # ensures MinIO bucket exists
    make_bucket(BUCKET_NAME)

    # uploads file to MinIO
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=object_name,
        data=model_buffer,
        length=model_buffer.getbuffer().nbytes,
    )
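# Illustrative usage sketch (assumption): persisting a fitted estimator. Every
# keyword argument is serialized with joblib, so load_model() later returns it
# under the same key ("model" here). Assumes the EXPERIMENT_ID and OPERATOR_ID
# environment variables are set; scikit-learn is used only as an example.
def _example_save_model():
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression

    X, y = load_iris(return_X_y=True)
    estimator = LogisticRegression(max_iter=1000).fit(X, y)
    save_model(model=estimator)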
def list_datasets() -> List[str]:
    """Lists dataset names from object storage.

    Returns:
        list: A list of all dataset names.
    """
    datasets = []

    # ensures MinIO bucket exists
    make_bucket(BUCKET_NAME)

    objects = MINIO_CLIENT.list_objects_v2(BUCKET_NAME, PREFIX + "/")
    for obj in objects:
        name = obj.object_name[len(PREFIX) + 1:-1]
        datasets.append(name)

    return datasets
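# Illustrative usage sketch (assumption): enumerating stored datasets. Each entry
# is a dataset name (the folder under the datasets prefix), e.g. "iris.csv".
def _example_list_datasets():
    for dataset_name in list_datasets():
        print(dataset_name)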
def list_metrics(experiment_id: Optional[str] = None,
                 operator_id: Optional[str] = None,
                 run_id: Optional[str] = None) -> List[Dict[str, object]]:
    """Lists metrics from object storage.

    Args:
        experiment_id (str, optional): the experiment uuid. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.
        run_id (str, optional): the run id. Defaults to None.

    Returns:
        list: A list of metrics.

    Raises:
        TypeError: when experiment_id is undefined in args and env.
        TypeError: when operator_id is undefined in args and env.
    """
    if experiment_id is None:
        experiment_id = get_experiment_id()

    if operator_id is None:
        operator_id = get_operator_id()

    # ensures MinIO bucket exists
    make_bucket(BUCKET_NAME)

    if run_id is None:
        # gets run_id from env variable
        # Attention: returns None if env is unset
        run_id = get_run_id()
    elif run_id == "latest":
        try:
            metadata = stat_metadata(experiment_id, operator_id)
            run_id = metadata.get("run_id")
        except FileNotFoundError:
            return []

    try:
        object_name = operator_filepath(METRICS_FILE, experiment_id, operator_id, run_id)
        data = MINIO_CLIENT.get_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
        )
    except (NoSuchBucket, NoSuchKey):
        raise FileNotFoundError(f"No such file or directory: '{experiment_id}'")

    # metrics are stored as JSON (see save_metrics), so decode them here
    return loads(data.read())
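# Illustrative usage sketch (assumption): reading back the metrics of the most
# recent run of an operator. run_id="latest" resolves the run id from the
# operator's .metadata object; an empty list is returned when it does not exist.
def _example_list_metrics():
    return list_metrics(experiment_id="<experiment-uuid>",
                        operator_id="<operator-uuid>",
                        run_id="latest")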
def create_mock_dataset1(self, size=1e2):
    header = ",".join(self.mock_columns()) + "\n"
    rows = "\n".join([
        ",".join([str(v) for v in self.mock_values()])
        for x in range(int(size))
    ])
    buffer = BytesIO((header + rows).encode())
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name="datasets/mock.csv/mock.csv",
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
    metadata = {
        "columns": self.mock_columns(),
        "featuretypes": self.mock_featuretypes(),
        "filename": "mock.csv",
        "run_id": RUN_ID,
    }
    buffer = BytesIO(dumps(metadata).encode())
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name="datasets/mock.csv/mock.csv.metadata",
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
    MINIO_CLIENT.copy_object(
        bucket_name=BUCKET_NAME,
        object_name=f"datasets/mock.csv/runs/{RUN_ID}/operators/{OPERATOR_ID}/mock.csv/mock.csv",
        object_source=f"/{BUCKET_NAME}/datasets/mock.csv/mock.csv",
    )
    MINIO_CLIENT.copy_object(
        bucket_name=BUCKET_NAME,
        object_name=f"datasets/mock.csv/runs/{RUN_ID}/operators/{OPERATOR_ID}/mock.csv/mock.csv.metadata",
        object_source=f"/{BUCKET_NAME}/datasets/mock.csv/mock.csv.metadata",
    )
def make_bucket(self):
    try:
        MINIO_CLIENT.make_bucket(BUCKET_NAME)
    except BucketAlreadyOwnedByYou:
        pass
def empty_bucket(self):
    for obj in MINIO_CLIENT.list_objects(BUCKET_NAME, prefix="", recursive=True):
        MINIO_CLIENT.remove_object(BUCKET_NAME, obj.object_name)
def stat_dataset(name: str,
                 run_id: Optional[str] = None,
                 operator_id: Optional[str] = None) -> Dict[str, str]:
    """Retrieves the metadata of a dataset.

    Args:
        name (str): the dataset name.
        run_id (str, optional): the run id of the training pipeline. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.

    Returns:
        dict: The metadata.

    Raises:
        FileNotFoundError: If dataset does not exist in the object storage.
    """
    metadata = {}

    # remove /tmp/data/ from dataset name
    # because in jupyter we use dataset with full path
    name = name.replace("/tmp/data/", "")

    if run_id == "latest":
        metadata = stat_dataset(name)
        run_id = metadata.get("run_id")

    if run_id is None:
        # gets run_id from env variables
        # Attention: returns None if env is unset
        run_id = get_run_id()

        if run_id and operator_id:
            # get metadata for a specific operator of a run, if exists
            object_name = _metadata_filepath(name, run_id, operator_id)
        elif run_id:
            # if no metadata was generated by the operator,
            # get the last one generated by the pipeline flow
            object_name = _metadata_filepath(name, run_id)

            if not metadata_exists(name, run_id):
                # if it is at the beginning of a run,
                # there will be no metadata generated by run_id
                object_name = _metadata_filepath(name)
        else:
            # unable to get run_id automatically,
            # this function is probably being called out of a run
            object_name = _metadata_filepath(name)
    else:
        # get path according to received parameters
        run_id = None if run_id == "root" else run_id
        object_name = _metadata_filepath(name, run_id, operator_id)

    try:
        # reads the .metadata file
        data = MINIO_CLIENT.get_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
        )
        # decodes the metadata (which is in JSON format)
        metadata = loads(data.read())
    except (NoSuchBucket, NoSuchKey):
        raise FileNotFoundError("The specified dataset does not exist")

    return metadata
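# Illustrative usage sketch (assumption): inspecting dataset metadata, e.g. to
# read columns and featuretypes before loading the data. run_id="latest" resolves
# to the run id stored in the root .metadata object.
def _example_stat_dataset():
    metadata = stat_dataset(name="iris.csv", run_id="latest")
    return metadata.get("columns"), metadata.get("featuretypes")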
def tearDown(self):
    MINIO_CLIENT.remove_object(
        bucket_name=BUCKET_NAME,
        object_name="artifacts/mock.txt",
    )
def save_metrics(experiment_id: Optional[str] = None,
                 operator_id: Optional[str] = None,
                 run_id: Optional[str] = None,
                 **kwargs):
    """Saves metrics of an experiment to the object storage.

    Args:
        experiment_id (str, optional): the experiment uuid. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.
        run_id (str, optional): the run id. Defaults to None.
        **kwargs: the metrics dict.

    Raises:
        TypeError: when experiment_id is undefined in args and env.
        TypeError: when operator_id is undefined in args and env.
    """
    if experiment_id is None:
        experiment_id = get_experiment_id()

    if operator_id is None:
        operator_id = get_operator_id()

    if run_id is None:
        # gets run_id from env variables
        # Attention: returns None if env is unset
        run_id = get_run_id()

    # ensures MinIO bucket exists
    make_bucket(BUCKET_NAME)

    if run_id:
        metadata = {}
        try:
            metadata = stat_metadata(experiment_id, operator_id)
            if run_id == "latest":
                run_id = metadata.get("run_id")
        except FileNotFoundError:
            pass
        metadata["run_id"] = run_id

        # encodes metadata to JSON format and uploads to MinIO
        buffer = BytesIO(dumps(metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=f'experiments/{experiment_id}/operators/{operator_id}/.metadata',
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

    object_name = operator_filepath(METRICS_FILE, experiment_id, operator_id, run_id)

    encoded_metrics = []

    # retrieves the metrics saved previously
    try:
        data = MINIO_CLIENT.get_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
        )
        encoded_metrics = loads(data.read())
    except NoSuchKey:
        pass

    # appends new metrics
    encoded_metrics.extend(_encode_metrics(kwargs))

    # puts metrics into buffer
    buffer = BytesIO(dumps(encoded_metrics).encode())

    # uploads metrics to MinIO
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=object_name,
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
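# Illustrative usage sketch (assumption): recording scalar metrics for the current
# run. Each keyword argument becomes one metric entry, and repeated calls append
# to the existing metrics object instead of replacing it. Assumes the
# EXPERIMENT_ID and OPERATOR_ID environment variables are set.
def _example_save_metrics():
    save_metrics(accuracy=0.87, f1_score=0.82)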
def save_dataset(name: str,
                 data: Union[pd.DataFrame, BinaryIO] = None,
                 df: pd.DataFrame = None,
                 metadata: Optional[Dict[str, str]] = None,
                 run_id: Optional[str] = None,
                 operator_id: Optional[str] = None):
    """Saves a dataset and its metadata to the object storage.

    Args:
        name (str): the dataset name.
        data (pandas.DataFrame, BinaryIO, optional): the dataset contents as a
            `pandas.DataFrame` or a `BinaryIO` buffer. Defaults to None.
        df (pandas.DataFrame, optional): the dataset contents as a `pandas.DataFrame`.
            df exists only for compatibility with existing components.
            Use "data" for all types of datasets. Defaults to None.
        metadata (dict, optional): metadata about the dataset. Defaults to None.
        run_id (str, optional): the run id. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.

    Raises:
        PermissionError: If the dataset is read-only.
    """
    # ensures MinIO bucket exists
    make_bucket(BUCKET_NAME)

    if run_id is None:
        # gets run_id from env variables
        # Attention: returns None if env is unset
        run_id = get_run_id()

    if operator_id is None:
        # gets operator_id from env variables
        # Attention: returns None if env is unset
        operator_id = get_operator_id(raise_for_none=False)

    # df exists only for compatibility with existing components
    # from now on one must use "data" for all types of datasets
    if df is not None:
        data = df

    try:
        # gets metadata (if dataset exists)
        stored_metadata = stat_dataset(name, run_id)
        metadata_should_be_updated = False

        # update stored metadata values
        if metadata:
            stored_metadata.update(metadata)
        elif isinstance(data, pd.DataFrame):
            metadata_should_be_updated = True

        metadata = stored_metadata
    except FileNotFoundError:
        metadata_should_be_updated = False

    # builds metadata dict:
    # sets filename and run_id
    if metadata is None:
        metadata = {}

    metadata["filename"] = name

    if isinstance(data, pd.DataFrame):
        # sets metadata specific for pandas.DataFrame:
        # columns, featuretypes
        metadata["columns"] = data.columns.tolist()
        metadata["total"] = len(data.index)

        if "featuretypes" not in metadata:
            metadata["featuretypes"] = infer_featuretypes(data)

        # if the metadata was given (set manually), ignore updates, otherwise
        # search for changes and then update current featuretypes to be even with columns
        if metadata_should_be_updated:
            previous_metadata = stat_dataset(name, run_id)
            previous_columns = previous_metadata["columns"]
            previous_featuretypes = previous_metadata["featuretypes"]
            column_to_type = dict(zip(previous_columns, previous_featuretypes))

            new_featuretypes = []
            for new_column in metadata["columns"]:
                if new_column in column_to_type:
                    new_featuretypes.append(column_to_type[new_column])
                else:
                    new_featuretypes.append(
                        infer_featuretypes(pd.DataFrame(data[new_column]))[0])

            metadata["featuretypes"] = new_featuretypes

    if run_id:
        metadata["run_id"] = run_id

        # When saving a dataset of a run, also
        # set the run_id in datasets/<name>.metadata
        # This enables load_dataset by run="latest"
        try:
            root_metadata = stat_dataset(name, "root")
        except FileNotFoundError:
            root_metadata = {}

        root_metadata["run_id"] = run_id
        object_name = _metadata_filepath(name)

        # encodes metadata to JSON format
        buffer = BytesIO(dumps(root_metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

        # create a run metadata to save the last operator id
        # so the dataset gets loaded on the next step of the pipeline flow
        metadata["operator_id"] = operator_id
        object_name = _metadata_filepath(name, run_id=run_id)
        buffer = BytesIO(dumps(metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

    path = _data_filepath(name, run_id, operator_id)

    if isinstance(data, pd.DataFrame):
        # uploads dataframe to MinIO as a .csv file
        temp_file = tempfile.NamedTemporaryFile(dir='.', delete=False)
        data.to_csv(temp_file.name, header=True, index=False)
        MINIO_CLIENT.fput_object(bucket_name=BUCKET_NAME,
                                 object_name=path.lstrip(f"{BUCKET_NAME}/"),
                                 file_path=temp_file.name)
        temp_file.close()
        os.remove(temp_file.name)
    else:
        # uploads raw data to MinIO
        buffer = BytesIO(data.read())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=path.lstrip(f"{BUCKET_NAME}/"),
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

    object_name = _metadata_filepath(name, run_id, operator_id)

    # encodes metadata to JSON format
    buffer = BytesIO(dumps(metadata).encode())
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=object_name,
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
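# Illustrative usage sketch (assumption): saving a pandas.DataFrame as a dataset.
# Column featuretypes are inferred automatically unless given in the metadata;
# the dataset name and contents are placeholders.
def _example_save_dataset():
    df = pd.DataFrame({"sepal_length": [5.1, 4.9], "species": ["setosa", "setosa"]})
    save_dataset(name="iris_sample.csv", data=df)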
def load_dataset(name: str,
                 run_id: Optional[str] = None,
                 operator_id: Optional[str] = None,
                 page: Optional[int] = None,
                 page_size: Optional[int] = None) -> Union[pd.DataFrame, BinaryIO]:
    """Retrieves the contents of a dataset.

    If run_id exists, then loads the dataset from the specified run.
    If the dataset does not exist for the given run_id/operator_id,
    returns the 'original' dataset.

    Args:
        name (str): the dataset name.
        run_id (str, optional): the run id of the training pipeline. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.
        page (int, optional): the page number. Defaults to None.
        page_size (int, optional): the number of rows per page. Defaults to None.

    Returns:
        The contents of a dataset. Either a `pandas.DataFrame` or a `BinaryIO` buffer.

    Raises:
        FileNotFoundError: If dataset does not exist in the object storage.
    """
    if run_id is None:
        # gets run_id from env variable
        # Attention: returns None if env is unset
        run_id = get_run_id()
    elif run_id == "latest":
        metadata = stat_dataset(name)
        run_id = metadata.get("run_id")

    # when the dataset does not exist for given run_id/operator_id
    # must return the 'original' dataset
    # unset run_id so data_filepath points to the 'original' dataset
    if run_id and operator_id:
        try:
            metadata = stat_dataset(name, run_id, operator_id)
        except FileNotFoundError:
            run_id = None
    elif run_id:
        try:
            run_metadata = stat_dataset(name, run_id)
            operator_id = run_metadata.get("operator_id")
        except FileNotFoundError:
            run_id = None

    # builds the path to the dataset file
    path = _data_filepath(name, run_id, operator_id)

    if page_size and page_size > 0:
        nrows = page_size
    else:
        nrows = None

    if page and page > 0:
        skiprows = (page - 1) * page_size
    else:
        skiprows = None

    try:
        metadata = stat_dataset(name, run_id, operator_id)
        dataset = pd.read_csv(S3FS.open(path),
                              header=0,
                              index_col=False,
                              nrows=nrows,
                              skiprows=skiprows)
        dtypes = dict((column, "object") for column, ftype in zip(
            metadata["columns"], metadata["featuretypes"])
            if ftype in [CATEGORICAL, DATETIME])
        dataset = dataset.astype(dtypes)
    except (UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError):
        # reads the raw file
        data = MINIO_CLIENT.get_object(
            bucket_name=BUCKET_NAME,
            object_name=path.lstrip(f"{BUCKET_NAME}/"),
        )
        return BytesIO(data.read())
    except KeyError:
        # metadata file does not contain "columns" or "featuretypes"
        # ignore this error and return the dataset without casting its types
        pass
    except FileNotFoundError:
        raise FileNotFoundError("The specified dataset does not exist")

    return dataset
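# Illustrative usage sketch (assumption): loading a dataset one page at a time.
# A pandas.DataFrame is returned for CSV contents; a BytesIO buffer is returned
# for binary datasets (e.g. images or zip files).
def _example_load_dataset():
    return load_dataset(name="iris_sample.csv", page=1, page_size=100)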