Example #1
 def setUp(self):
     self.make_bucket()
     buffer = io.BytesIO(b"mock")
     MINIO_CLIENT.put_object(
         bucket_name=BUCKET_NAME,
         object_name="artifacts/mock.txt",
         data=buffer,
         length=buffer.getbuffer().nbytes,
     )
Example #2
 def create_mock_figure(self):
     file = BytesIO(
         b'<svg viewBox=\'0 0 125 80\' xmlns=\'http://www.w3.org/2000/svg\'>\n'
         b'<text y="75" font-size="100" font-family="serif"><![CDATA[10]]></text>\n'
         b'</svg>\n')
     MINIO_CLIENT.put_object(
         bucket_name=BUCKET_NAME,
         object_name="experiments/test/operators/test/figure-123456.svg",
         data=file,
         length=file.getbuffer().nbytes,
     )
Example #3
 def create_mock_model(self):
     model = {"model": MockModel()}
     buffer = BytesIO()
     dump(model, buffer)
     buffer.seek(0, SEEK_SET)
     MINIO_CLIENT.put_object(
         bucket_name=BUCKET_NAME,
         object_name=
         f"experiments/{EXPERIMENT_ID}/operators/{OPERATOR_ID}/model.joblib",
         data=buffer,
         length=buffer.getbuffer().nbytes,
     )
Example #4
def load_model(experiment_id: Optional[str] = None,
               operator_id: Optional[str] = None) -> Dict[str, object]:
    """Retrieves a model from object storage.

    Args:
        experiment_id (str, optional): the experiment uuid. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.

    Returns:
        dict: A dictionary of models.

    Raises:
        TypeError: when experiment_id is undefined in args and env.
        TypeError: when operator_id is undefined in args and env.
    """
    if experiment_id is None:
        experiment_id = get_experiment_id()

    if operator_id is None:
        operator_id = get_operator_id()

    try:
        object_name = f"{PREFIX_1}/{experiment_id}/{PREFIX_2}/{operator_id}/{MODEL_FILE}"
        data = MINIO_CLIENT.get_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
        )
    except (NoSuchBucket, NoSuchKey):
        return {}

    buffer = BytesIO(data.read())

    return load(buffer)
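A minimal usage sketch for load_model (the ids are hypothetical; it assumes a model was previously stored with save_model as in Example #9, and an empty dict is returned when nothing is found):

models = load_model(experiment_id="ex-uuid", operator_id="op-uuid")
model = models.get("model")  # None if no model was saved for this operator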
Example #5
def download_artifact(name: str, path: str):
    """Downloads the given artifact to the path.

    Args:
        name (str): the artifact name.
        path (str): destination path.

    Raises:
        FileNotFoundError: If the specified artifact does not exist.
    """
    try:
        MINIO_CLIENT.fget_object(
            bucket_name=BUCKET_NAME,
            object_name=f"{PREFIX}/{name}",
            file_path=path,
        )
    except (NoSuchBucket, NoSuchKey):
        raise FileNotFoundError("The specified artifact does not exist")
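A short, hedged usage sketch for download_artifact (artifact name and destination path are hypothetical):

try:
    download_artifact("mock.txt", "/tmp/data/mock.txt")
except FileNotFoundError:
    print("The artifact has not been uploaded yet")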
Example #6
    def create_mock_dataset3(self):
        with open("mock.jpg", 'wb') as imagef:
            imagef.write(MOCK_IMAGE)

        MINIO_CLIENT.fput_object(
            bucket_name=BUCKET_NAME,
            object_name="datasets/mock.jpg/mock.jpg",
            file_path="mock.jpg",
        )
        metadata = {
            "filename": "mock.jpg",
        }
        buffer = BytesIO(dumps(metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name="datasets/mock.jpg/mock.jpg.metadata",
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )
Example #7
    def create_mock_dataset2(self):
        with ZipFile("mock.zip", "w") as zipf:
            zipf.writestr("mock.gif", MOCK_IMAGE)

        MINIO_CLIENT.fput_object(
            bucket_name=BUCKET_NAME,
            object_name="datasets/mock.zip/mock.zip",
            file_path="mock.zip",
        )
        metadata = {
            "filename": "mock.zip",
        }
        buffer = BytesIO(dumps(metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name="datasets/mock.zip/mock.zip.metadata",
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )
Example #8
def update_dataset_metadata(name: str,
                            metadata: Dict[str, str],
                            run_id: Optional[str] = None,
                            operator_id: Optional[str] = None):
    """Update the metadata of a dataset.
    Args:
        name (str): the dataset name.
        metadata (dict): metadata about the dataset.
        run_id (str, optional): the run id of trainning pipeline. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.
    """
    object_name = _metadata_filepath(name, run_id, operator_id)

    # encodes metadata to JSON format
    buffer = BytesIO(dumps(metadata).encode())

    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=object_name,
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
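A minimal sketch of a call to update_dataset_metadata (the dataset name and metadata values are hypothetical; the whole .metadata object is rewritten with the given dict):

update_dataset_metadata(
    name="mock.csv",
    metadata={"filename": "mock.csv"},
)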
Example #9
def save_model(**kwargs):
    """Serializes and saves models.

    Args:
        **kwargs: the models as keyword arguments.

    Raises:
        TypeError: when experiment_id is undefined in args and env.
        TypeError: when operator_id is undefined in args and env.
    """
    experiment_id = kwargs.get("experiment_id")
    if experiment_id is None:
        experiment_id = get_experiment_id()

    operator_id = kwargs.get("operator_id")
    if operator_id is None:
        operator_id = get_operator_id()

    object_name = f"{PREFIX_1}/{experiment_id}/{PREFIX_2}/{operator_id}/{MODEL_FILE}"

    model_buffer = BytesIO()
    dump(kwargs, model_buffer)
    model_buffer.seek(0, SEEK_SET)

    # ensures MinIO bucket exists
    make_bucket(BUCKET_NAME)

    # uploads file to MinIO
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=object_name,
        data=model_buffer,
        length=model_buffer.getbuffer().nbytes,
    )
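A minimal sketch of saving a model with save_model (the object passed is hypothetical; any joblib-serializable object given as a keyword argument ends up in model.joblib):

# hypothetical "model": anything joblib can serialize works here
save_model(model={"weights": [0.1, 0.2, 0.3]})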
Example #10
def list_datasets() -> List[str]:
    """Lists dataset names from object storage.

    Returns:
        list: A list of all dataset names.
    """
    datasets = []

    # ensures MinIO bucket exists
    make_bucket(BUCKET_NAME)

    objects = MINIO_CLIENT.list_objects_v2(BUCKET_NAME, PREFIX + "/")

    for obj in objects:
        name = obj.object_name[len(PREFIX) + 1:-1]
        datasets.append(name)

    return datasets
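A small usage sketch for list_datasets:

for dataset_name in list_datasets():
    print(dataset_name)  # e.g. "mock.csv" from the fixtures above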
Example #11
def list_metrics(experiment_id: Optional[str] = None,
                 operator_id: Optional[str] = None,
                 run_id: Optional[str] = None) -> List[Dict[str, object]]:
    """Lists metrics from object storage.
    Args:
        experiment_id (str, optional): the experiment uuid. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.
        run_id (str, optional): the run id. Defaults to None.
    Returns:
        list: A list of metrics.
    Raises:
        TypeError: when experiment_id is undefined in args and env.
        TypeError: when operator_id is undefined in args and env.
    """
    if experiment_id is None:
        experiment_id = get_experiment_id()

    if operator_id is None:
        operator_id = get_operator_id()

    # ensures MinIO bucket exists
    make_bucket(BUCKET_NAME)

    if run_id is None:
        # gets run_id from env variable
        # Attention: returns None if env is unset
        run_id = get_run_id()
    elif run_id == "latest":
        try:
            metadata = stat_metadata(experiment_id, operator_id)
            run_id = metadata.get("run_id")
        except FileNotFoundError:
            return []

    try:
        object_name = operator_filepath(METRICS_FILE, experiment_id, operator_id, run_id)
        data = MINIO_CLIENT.get_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
        )
    except (NoSuchBucket, NoSuchKey):
        raise FileNotFoundError(f"No such file or directory: '{experiment_id}'")

    return load(data)
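A hedged usage sketch for list_metrics (the ids are hypothetical; passing run_id="latest" resolves to the run recorded in the operator metadata):

metrics = list_metrics(experiment_id="ex-uuid",
                       operator_id="op-uuid",
                       run_id="latest")
for metric in metrics:
    print(metric)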
Example #12
 def create_mock_dataset1(self, size=1e2):
     header = ",".join(self.mock_columns()) + "\n"
     rows = "\n".join([
         ",".join([str(v) for v in self.mock_values()])
         for x in range(int(size))
     ])
     buffer = BytesIO((header + rows).encode())
     MINIO_CLIENT.put_object(
         bucket_name=BUCKET_NAME,
         object_name="datasets/mock.csv/mock.csv",
         data=buffer,
         length=buffer.getbuffer().nbytes,
     )
     metadata = {
         "columns": self.mock_columns(),
         "featuretypes": self.mock_featuretypes(),
         "filename": "mock.csv",
         "run_id": RUN_ID,
     }
     buffer = BytesIO(dumps(metadata).encode())
     MINIO_CLIENT.put_object(
         bucket_name=BUCKET_NAME,
         object_name="datasets/mock.csv/mock.csv.metadata",
         data=buffer,
         length=buffer.getbuffer().nbytes,
     )
     MINIO_CLIENT.copy_object(
         bucket_name=BUCKET_NAME,
         object_name=
         f"datasets/mock.csv/runs/{RUN_ID}/operators/{OPERATOR_ID}/mock.csv/mock.csv",
         object_source=f"/{BUCKET_NAME}/datasets/mock.csv/mock.csv",
     )
     MINIO_CLIENT.copy_object(
         bucket_name=BUCKET_NAME,
         object_name=
         f"datasets/mock.csv/runs/{RUN_ID}/operators/{OPERATOR_ID}/mock.csv/mock.csv.metadata",
         object_source=f"/{BUCKET_NAME}/datasets/mock.csv/mock.csv.metadata",
     )
Example #13
 def make_bucket(self):
     try:
         MINIO_CLIENT.make_bucket(BUCKET_NAME)
     except BucketAlreadyOwnedByYou:
         pass
Example #14
 def empty_bucket(self):
     for obj in MINIO_CLIENT.list_objects(BUCKET_NAME,
                                          prefix="",
                                          recursive=True):
         MINIO_CLIENT.remove_object(BUCKET_NAME, obj.object_name)
Example #15
def stat_dataset(name: str,
                 run_id: Optional[str] = None,
                 operator_id: Optional[str] = None) -> Dict[str, str]:
    """Retrieves the metadata of a dataset.

    Args:
        name (str): the dataset name.
        run_id (str, optional): the run id of the training pipeline. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.

    Returns:
        dict: The metadata.

    Raises:
        FileNotFoundError: If dataset does not exist in the object storage.
    """
    metadata = {}

    # remove /tmp/data/ from the dataset name,
    # because in Jupyter datasets are referenced by their full path
    name = name.replace("/tmp/data/", "")

    if run_id == "latest":
        metadata = stat_dataset(name)
        run_id = metadata.get("run_id")

    if run_id is None:
        # gets run_id from env variables
        # Attention: returns None if env is unset
        run_id = get_run_id()

        if run_id and operator_id:
            # get metadata for a specific operator of a run, if exists
            object_name = _metadata_filepath(name, run_id, operator_id)
        elif run_id:
            # if no metadata was generated by the operator,
            # get the last one generated by the pipeline flow
            object_name = _metadata_filepath(name, run_id)
            if not metadata_exists(name, run_id):
                # if it is at the beginning of a run,
                # there will be no metadata generated by run_id
                object_name = _metadata_filepath(name)
        else:
            # unable to get run_id automatically,
            # this function is probably being called out of a run
            object_name = _metadata_filepath(name)
    else:
        # get path according to received parameters
        run_id = None if run_id == "root" else run_id
        object_name = _metadata_filepath(name, run_id, operator_id)

    try:
        # reads the .metadata file
        data = MINIO_CLIENT.get_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
        )
        # decodes the metadata (which is in JSON format)
        metadata = loads(data.read())

    except (NoSuchBucket, NoSuchKey):
        raise FileNotFoundError("The specified dataset does not exist")

    return metadata
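A short usage sketch for stat_dataset (the dataset name matches the mock fixtures above):

metadata = stat_dataset("mock.csv")
print(metadata.get("columns"), metadata.get("featuretypes"))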
Example #16
 def tearDown(self):
     MINIO_CLIENT.remove_object(
         bucket_name=BUCKET_NAME,
         object_name="artifacts/mock.txt",
     )
Example #17
def save_metrics(experiment_id: Optional[str] = None,
                 operator_id: Optional[str] = None,
                 run_id: Optional[str] = None,
                 **kwargs):
    """Saves metrics of an experiment to the object storage.
    Args:
        experiment_id (str, optional): the experiment uuid. Defaults to None
        operator_id (str, optional): the operator uuid. Defaults to None
        run_id (str, optional): the run id. Defaults to None.
        **kwargs: the metrics dict.
    Raises:
        TypeError: when experiment_id is undefined in args and env.
        TypeError: when operator_id is undefined in args and env.
    """
    if experiment_id is None:
        experiment_id = get_experiment_id()

    if operator_id is None:
        operator_id = get_operator_id()

    if run_id is None:
        # gets run_id from env variables
        # Attention: returns None if env is unset
        run_id = get_run_id()

    # ensures MinIO bucket exists
    make_bucket(BUCKET_NAME)

    if run_id:
        metadata = {}
        try:
            metadata = stat_metadata(experiment_id, operator_id)
            if run_id == "latest":
                run_id = metadata.get("run_id")
        except FileNotFoundError:
            pass
        metadata["run_id"] = run_id

        # encodes metadata to JSON format and uploads to MinIO
        buffer = BytesIO(dumps(metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=f'experiments/{experiment_id}/operators/{operator_id}/.metadata',
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

    object_name = operator_filepath(METRICS_FILE, experiment_id, operator_id, run_id)

    encoded_metrics = []

    # retrieves the metrics saved previously
    try:
        data = MINIO_CLIENT.get_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
        )
        encoded_metrics = loads(data.read())
    except NoSuchKey:
        pass

    # appends new metrics
    encoded_metrics.extend(_encode_metrics(kwargs))

    # puts metrics into buffer
    buffer = BytesIO(dumps(encoded_metrics).encode())

    # uploads metrics to MinIO
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=object_name,
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
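A minimal sketch of recording metrics (metric names and values are hypothetical; they are passed as keyword arguments and appended to any metrics saved earlier for the same operator):

save_metrics(accuracy=0.87, f1_score=0.84)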
Example #18
def save_dataset(name: str,
                 data: Union[pd.DataFrame, BinaryIO] = None,
                 df: pd.DataFrame = None,
                 metadata: Optional[Dict[str, str]] = None,
                 run_id: Optional[str] = None,
                 operator_id: Optional[str] = None):
    """Saves a dataset and its metadata to the object storage.

    Args:
        name (str): the dataset name.
        data (pandas.DataFrame, BinaryIO, optional): the dataset contents as a
            `pandas.DataFrame` or a `BinaryIO` buffer. Defaults to None.
        df (pandas.DataFrame, optional): the dataset contents as a `pandas.DataFrame`.
            df exists only for compatibility with existing components.
            Use "data" for all types of datasets. Defaults to None.
        metadata (dict, optional): metadata about the dataset. Defaults to None.
        run_id (str, optional): the run id. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.

    Raises:
        PermissionError: If the dataset is read-only.
    """
    # ensures MinIO bucket exists
    make_bucket(BUCKET_NAME)

    if run_id is None:
        # gets run_id from env variables
        # Attention: returns None if env is unset
        run_id = get_run_id()

    if operator_id is None:
        # gets operator_id from env variables
        # Attention: returns None if env is unset
        operator_id = get_operator_id(raise_for_none=False)

    # df exists only for compatibility with existing components
    # from now on one must use "data" for all types of datasets
    if df is not None:
        data = df

    try:
        # gets metadata (if dataset exists)
        stored_metadata = stat_dataset(name, run_id)
        metadata_should_be_updated = False

        # update stored metadata values
        if metadata:
            stored_metadata.update(metadata)
        elif isinstance(data, pd.DataFrame):
            metadata_should_be_updated = True

        metadata = stored_metadata
    except FileNotFoundError:
        metadata_should_be_updated = False

    # builds metadata dict:
    # sets filename and run_id
    if metadata is None:
        metadata = {}

    metadata["filename"] = name

    if isinstance(data, pd.DataFrame):
        # sets metadata specific for pandas.DataFrame:
        # columns, featuretypes
        metadata["columns"] = data.columns.tolist()
        metadata["total"] = len(data.index)

        if "featuretypes" not in metadata:
            metadata["featuretypes"] = infer_featuretypes(data)

    # if the metadata was given (set manually), ignore updates; otherwise,
    # look for changes and update the current featuretypes to match the columns
    if metadata_should_be_updated:
        previous_metadata = stat_dataset(name, run_id)
        previous_columns = previous_metadata["columns"]
        previous_featuretypes = previous_metadata["featuretypes"]
        column_to_type = dict(zip(previous_columns, previous_featuretypes))

        new_featuretypes = []
        for new_column in metadata["columns"]:
            if new_column in column_to_type:
                new_featuretypes.append(column_to_type[new_column])
            else:
                new_featuretypes.append(
                    infer_featuretypes(pd.DataFrame(data[new_column]))[0])

        metadata["featuretypes"] = new_featuretypes

    if run_id:
        metadata["run_id"] = run_id

        # When saving a dataset of a run, also
        # set the run_id in datasets/<name>.metadata
        # This enables load_dataset with run_id="latest"
        try:
            root_metadata = stat_dataset(name, "root")
        except FileNotFoundError:
            root_metadata = {}

        root_metadata["run_id"] = run_id
        object_name = _metadata_filepath(name)
        # encodes metadata to JSON format
        buffer = BytesIO(dumps(root_metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

        # create run metadata that stores the last operator id,
        # so the dataset gets loaded on the next step of the pipeline flow
        metadata["operator_id"] = operator_id
        object_name = _metadata_filepath(name, run_id=run_id)
        buffer = BytesIO(dumps(metadata).encode())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=object_name,
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

    path = _data_filepath(name, run_id, operator_id)

    if isinstance(data, pd.DataFrame):
        # uploads dataframe to MinIO as a .csv file
        temp_file = tempfile.NamedTemporaryFile(dir='.', delete=False)
        data.to_csv(temp_file.name, header=True, index=False)
        MINIO_CLIENT.fput_object(bucket_name=BUCKET_NAME,
                                 object_name=path.lstrip(f"{BUCKET_NAME}/"),
                                 file_path=temp_file.name)
        temp_file.close()
        os.remove(temp_file.name)
    else:
        # uploads raw data to MinIO
        buffer = BytesIO(data.read())
        MINIO_CLIENT.put_object(
            bucket_name=BUCKET_NAME,
            object_name=path.lstrip(f"{BUCKET_NAME}/"),
            data=buffer,
            length=buffer.getbuffer().nbytes,
        )

    object_name = _metadata_filepath(name, run_id, operator_id)
    # encodes metadata to JSON format
    buffer = BytesIO(dumps(metadata).encode())
    MINIO_CLIENT.put_object(
        bucket_name=BUCKET_NAME,
        object_name=object_name,
        data=buffer,
        length=buffer.getbuffer().nbytes,
    )
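A hedged usage sketch for save_dataset with a toy DataFrame (column names and values are hypothetical; featuretypes are inferred when not supplied in metadata):

import pandas as pd

df = pd.DataFrame({"col0": [1, 2, 3], "col1": ["a", "b", "c"]})
save_dataset(name="mock.csv", data=df)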
Example #19
def load_dataset(
        name: str,
        run_id: Optional[str] = None,
        operator_id: Optional[str] = None,
        page: Optional[int] = None,
        page_size: Optional[int] = None) -> Union[pd.DataFrame, BinaryIO]:
    """Retrieves the contents of a dataset.

    If run_id is given, loads the dataset from the specified run.
    If the dataset does not exist for the given run_id/operator_id, returns
    the 'original' dataset.

    Args:
        name (str): the dataset name.
        run_id (str, optional): the run id of the training pipeline. Defaults to None.
        operator_id (str, optional): the operator uuid. Defaults to None.
        page (int, optional): the page number used for pagination. Defaults to None.
        page_size (int, optional): the number of rows per page. Defaults to None.

    Returns:
        The contents of a dataset. Either a `pandas.DataFrame` or a `BinaryIO` buffer.

    Raises:
        FileNotFoundError: If dataset does not exist in the object storage.
    """
    if run_id is None:
        # gets run_id from env variable
        # Attention: returns None if env is unset
        run_id = get_run_id()
    elif run_id == "latest":
        metadata = stat_dataset(name)
        run_id = metadata.get("run_id")

    # when the dataset does not exist for the given run_id/operator_id,
    # the 'original' dataset must be returned:
    # unset run_id so _data_filepath points to the 'original' dataset
    if run_id and operator_id:
        try:
            metadata = stat_dataset(name, run_id, operator_id)
        except FileNotFoundError:
            run_id = None
    elif run_id:
        try:
            run_metadata = stat_dataset(name, run_id)
            operator_id = run_metadata.get("operator_id")
        except FileNotFoundError:
            run_id = None

    # builds the path to the dataset file
    path = _data_filepath(name, run_id, operator_id)

    if page_size and page_size > 0:
        nrows = page_size
    else:
        nrows = None

    if page and page > 0:
        skiprows = (page - 1) * page_size
    else:
        skiprows = None

    try:
        metadata = stat_dataset(name, run_id, operator_id)
        dataset = pd.read_csv(S3FS.open(path),
                              header=0,
                              index_col=False,
                              nrows=nrows,
                              skiprows=skiprows)

        dtypes = dict((column, "object") for column, ftype in zip(
            metadata["columns"], metadata["featuretypes"])
                      if ftype in [CATEGORICAL, DATETIME])
        dataset = dataset.astype(dtypes)
    except (UnicodeDecodeError, pd.errors.EmptyDataError,
            pd.errors.ParserError):
        # reads the raw file
        data = MINIO_CLIENT.get_object(
            bucket_name=BUCKET_NAME,
            object_name=path.lstrip(f"{BUCKET_NAME}/"),
        )
        return BytesIO(data.read())
    except KeyError:
        # the metadata file does not contain "columns" or "featuretypes";
        # ignore this error and return the dataset without casting its types
        pass
    except FileNotFoundError:
        raise FileNotFoundError("The specified dataset does not exist")

    return dataset
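A short usage sketch for load_dataset with pagination (values are hypothetical; a dataset that cannot be parsed as CSV would be returned as a raw BytesIO buffer instead):

df = load_dataset(name="mock.csv", page=1, page_size=100)
print(df.head())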