Пример #1
0
def generate_name(filename: str, attempt: int = 1) -> str:
    """Generates a dataset name from a given filename.

    Normalizes the filename to ASCII, replaces spaces with dashes, and, when
    the name is already in use, appends a ``-NUMBER`` suffix before the file
    extension until a free name is found.

    Args:
        filename (str): source filename.
        attempt (int): the current attempt of generating a new name.

    Returns:
        str: new generated dataset name.
    """
    # normalize filename to ASCII characters
    # replace spaces by dashes
    base = normalize('NFKD', filename) \
        .encode('ASCII', 'ignore') \
        .replace(b' ', b'-') \
        .decode()

    # iterate instead of recursing: avoids a RecursionError when a very large
    # number of similarly-named datasets already exists
    while True:
        name = base
        if attempt > 1:
            # adds a suffix '-NUMBER' to filename, before the extension
            stem, extension = splitext(base)
            name = f"{stem}-{attempt}{extension}"

        try:
            # check if the candidate name is already in use
            stat_dataset(name)
        except FileNotFoundError:
            # not found means the name is free to use
            return name

        # if it is already in use, try the next suffix
        attempt += 1
Пример #2
0
def get_dataset(project_id, experiment_id, operator_id):
    """Retrieves a dataset as json.

    Args:
        project_id (str): the project uuid.
        experiment_id (str): the experiment uuid.
        operator_id (str): the operator uuid.
    """
    raise_if_project_does_not_exist(project_id)

    experiment = Experiment.query.get(experiment_id)
    if experiment is None:
        raise NotFound("The specified experiment does not exist")

    raise_if_operator_does_not_exist(operator_id)

    try:
        # the dataset only has contents once a run has produced them
        metadata = platiagro.stat_dataset(name=experiment.dataset,
                                          operator_id=operator_id)
        if "run_id" not in metadata:
            raise FileNotFoundError()

        content = platiagro.load_dataset(name=experiment.dataset,
                                         run_id="latest",
                                         operator_id=operator_id)
    except FileNotFoundError as e:
        raise NotFound(str(e))

    # serialize the dataframe, dropping the index entries
    serialized = content.to_dict(orient="split")
    del serialized["index"]
    return serialized
Пример #3
0
def list_columns(dataset: str) -> List[Dict[str, str]]:
    """Lists all columns from a dataset.

    Args:
        dataset (str): the dataset name.

    Returns:
        A list of columns names and featuretypes.

    Raises:
        NotFound: when the dataset does not exist.
    """
    try:
        metadata = stat_dataset(dataset)
    except FileNotFoundError:
        raise NotFound("The specified dataset does not exist")

    names = metadata.get("columns", [])
    ftypes = metadata.get("featuretypes", [])

    # pair each column name with its feature type
    return [
        {"name": name, "featuretype": ftype}
        for name, ftype in zip(names, ftypes)
    ]
Пример #4
0
def list_columns(dataset):
    """
    Lists all columns from a dataset.

    Parameters
    ----------
    dataset : str
        The dataset name.

    Returns
    -------
    list
        A list of columns names and featuretypes.

    Raises
    ------
    NotFound
        When the dataset does not exist.
    """
    try:
        metadata = stat_dataset(dataset)
    except FileNotFoundError:
        raise DATASET_NOT_FOUND

    # zip stops at the shorter sequence, so extra entries on either side
    # are silently ignored
    pairs = zip(metadata.get("columns", []), metadata.get("featuretypes", []))
    return [{"name": name, "featuretype": ftype} for name, ftype in pairs]
Пример #5
0
def get_dataset_pagination(project_id, experiment_id, operator_id, page, page_size):
    """Retrieves a dataset as json.

    Args:
        project_id (str): the project uuid.
        experiment_id (str): the experiment uuid.
        operator_id (str): the operator uuid.
        page (int): the page number.
        page_size (int): the page size.
    """
    raise_if_project_does_not_exist(project_id)
    raise_if_experiment_does_not_exist(experiment_id)

    operator = Operator.query.get(operator_id)
    if operator is None:
        raise NotFound("The specified operator does not exist")

    # the dataset name is stored among the operator parameters
    name = operator.parameters.get('dataset')
    if name is None:
        raise NotFound()

    try:
        metadata = platiagro.stat_dataset(name=name, operator_id=operator_id)
        # a dataset without a run id has no materialized contents yet
        if "run_id" not in metadata:
            raise FileNotFoundError()

        content = platiagro.load_dataset(name=name,
                                         run_id="latest",
                                         operator_id=operator_id)
    except FileNotFoundError as e:
        raise NotFound(str(e))

    records = content.to_dict(orient="split")
    del records["index"]
    return pagination_datasets(page=page, page_size=page_size, elements=records)
Пример #6
0
 def test_update_dataset_metadata(self):
     """Overwrites the featuretypes of "mock.csv" and checks they persist."""
     # replace every feature type with 'Categorical'
     featuretypes = [
         'Categorical', 'Categorical', 'Categorical', 'Categorical',
         'Categorical'
     ]
     metadata = stat_dataset("mock.csv")
     metadata["featuretypes"] = featuretypes
     update_dataset_metadata("mock.csv", metadata)
     # re-read the metadata to verify the update was stored
     result = stat_dataset("mock.csv")
     expected = {
         "columns": self.mock_columns(),
         "featuretypes": featuretypes,
         "filename": "mock.csv",
         "run_id": RUN_ID,
     }
     self.assertDictEqual(result, expected)
Пример #7
0
def get_dataset(name: str) -> Dict[str, Any]:
    """Details a dataset from our object storage.

    Args:
        name (str): the dataset name to look for in our object storage.

    Returns:
        The dataset details: name, columns, and filename.

    Raises:
        NotFound: when the dataset does not exist.
    """
    try:
        metadata = stat_dataset(name)
    except FileNotFoundError:
        raise NotFound("The specified dataset does not exist")

    filename = metadata.get("original-filename")

    # column info is present only when the dataset has tabular metadata
    if "columns" in metadata and "featuretypes" in metadata:
        columns = [
            {"name": col, "featuretype": ftype}
            for col, ftype in zip(metadata["columns"],
                                  metadata["featuretypes"])
        ]
        return {"name": name, "columns": columns, "filename": filename}

    return {"name": name, "filename": filename}
Пример #8
0
def get_featuretypes(name):
    """
    Get the dataset featuretypes.

    Parameters
    ----------
    name : str
        The dataset name to look for in our object storage.

    Returns
    -------
    bytes
        The dataset featuretypes encoded. Empty when the dataset has no
        featuretypes metadata.

    Raises
    ------
    NotFound
        When the dataset does not exist.
    """
    try:
        metadata = stat_dataset(name)
    except FileNotFoundError:
        raise NOT_FOUND

    # non-tabular datasets have no "featuretypes" entry in their metadata;
    # default to an empty list so join() does not fail with TypeError on None
    metadata_featuretypes = metadata.get("featuretypes", [])
    featuretypes = "\n".join(metadata_featuretypes)
    return featuretypes.encode()
Пример #9
0
def get_dataset(name, page=1, page_size=10):
    """
    Details a dataset from our object storage.

    Parameters
    ----------
    name : str
        The dataset name to look for in our object storage.
    page : int or str
        The page number. First page is 1. Default to 1.
    page_size : int or str
        The page size. Use -1 to return all records. Default value is 10.

    Returns
    -------
    dict
        The dataset details: name, columns, and filename.

    Raises
    ------
    NotFound
        When the dataset does not exist.
    BadRequest
        When page or page_size is not a valid integer.
    """
    try:
        # may raise ValueError when page/page_size are not valid integers
        page, page_size = int(page), int(page_size)
        metadata = stat_dataset(name)
        filename = metadata.get("original-filename")
        dataset = {"name": name, "filename": filename}

        # column/data info is present only for datasets with tabular metadata
        if "columns" in metadata and "featuretypes" in metadata:
            columns = metadata["columns"]
            featuretypes = metadata["featuretypes"]
            columns = [
                {"name": col, "featuretype": ftype}
                for col, ftype in zip(columns, featuretypes)
            ]
            content = load_dataset(name)
            # Replaces NaN value by a text "NaN" so JSON encode doesn't fail
            content.replace(np.nan, "NaN", inplace=True, regex=True)
            content.replace(np.inf, "Inf", inplace=True, regex=True)
            content.replace(-np.inf, "-Inf", inplace=True, regex=True)
            data = content.values.tolist()

            # page_size == -1 disables pagination and returns every record
            if page_size != -1:
                data = data_pagination(content=data, page=page, page_size=page_size)

            # "total" is the unpaginated row count
            dataset.update(
                {"columns": columns, "data": data, "total": len(content.index)}
            )
        return dataset
    except FileNotFoundError:
        raise NOT_FOUND
    except ValueError:
        raise BadRequest("ValueError", VALUE_ERROR_MESSAGE)
Пример #10
0
def update_column(dataset, column, featuretype):
    """
    Updates a column from a dataset.

    Parameters
    ----------
    dataset : str
        The dataset name.
    column : str
        The column name.
    featuretype : str
        The feature type (Numerical, Categorical, or DateTime).

    Returns
    -------
    dict
        The column info.

    Raises
    ------
    NotFound
        When the dataset or column does not exist.

    BadRequest
        When the featuretype is invalid.
    """
    try:
        metadata = stat_dataset(dataset)

        # only datasets with tabular metadata have updatable columns
        has_tabular_metadata = (
            "columns" in metadata and "featuretypes" in metadata
        )
        if not has_tabular_metadata:
            raise COLUMN_NOT_FOUND

        columns = metadata["columns"]
        if column not in columns:
            raise COLUMN_NOT_FOUND

        # overwrite the feature type at the column's position
        metadata["featuretypes"][columns.index(column)] = featuretype

        # may raise ValueError for an unknown feature type
        validate_featuretypes(metadata["featuretypes"])

        # re-save the dataset through the PlatIAgro SDK with the new metadata
        df = load_dataset(dataset)
        save_dataset(dataset, df, metadata=metadata)
    except FileNotFoundError:
        raise DATASET_NOT_FOUND
    except ValueError as e:
        raise BadRequest("ValueError", str(e))

    return {"name": column, "featuretype": featuretype}
Пример #11
0
def patch_dataset(name, file_object):
    """
    Update the dataset metadata in our object storage.

    Parameters
    ----------
    name : str
        The dataset name to look for in our object storage.

    file_object : dict
        File object.

    Returns
    -------
    dict
        The dataset details: name, columns, and filename.

    Raises
    ------
    BadRequest
        When incoming files are missing or invalid.

    NotFound
        When the dataset does not exist
    """
    if not file_object.file:
        raise BadRequest("NoFeatureTypes", "No featuretypes part")

    try:
        metadata = stat_dataset(name)
    except FileNotFoundError:
        raise NOT_FOUND

    try:
        # the uploaded file carries one feature type per line, utf8-encoded
        raw_lines = file_object.file.readlines()
        featuretypes = [line.strip().decode("utf8") for line in raw_lines]
        validate_featuretypes(featuretypes)
    except ValueError as e:
        raise BadRequest("ValueError", str(e))

    columns = metadata["columns"]
    if len(featuretypes) != len(columns):
        raise BadRequest(
            "DifferentLengths",
            "featuretypes must be the same length as the DataFrame columns"
        )

    # uses PlatIAgro SDK to update the dataset metadata
    metadata["featuretypes"] = featuretypes
    update_dataset_metadata(name=name, metadata=metadata)
    return get_dataset(name)
Пример #12
0
def generate_name(filename, attempt=1):
    """Generates a dataset name from a given filename.

    Parameters
    ----------
    filename : str
        Source filename.
    attempt : int
        The current attempt of generating a new name. Default to 1.

    Returns
    -------
    str
        New generated dataset name.
    """
    # normalize filename to ASCII characters and replace spaces by dashes
    ascii_bytes = normalize("NFKD", filename).encode("ASCII", "ignore")
    base = ascii_bytes.replace(b" ", b"-").decode()

    if attempt <= 1:
        candidate = base
    else:
        # adds a suffix '-NUMBER' to the filename, before the extension
        stem, extension = splitext(base)
        candidate = f"{stem}-{attempt}{extension}"

    try:
        # check whether the candidate name is already taken
        stat_dataset(candidate)
    except FileNotFoundError:
        return candidate

    # name already in use: retry with the next suffix number
    return generate_name(filename, attempt + 1)
Пример #13
0
def update_column(dataset: str, column: str,
                  featuretype: str) -> Dict[str, str]:
    """Updates a column from a dataset.

    Args:
        dataset (str): the dataset name.
        column (str): the column name.
        featuretype (str): the feature type (Numerical, Categorical, or DateTime).

    Returns:
        The column info.

    Raises:
        NotFound: when the dataset or column does not exist.
        BadRequest: when the featuretype is invalid.
    """
    try:
        metadata = stat_dataset(dataset)

        # only datasets with tabular metadata have columns that can be updated
        if "columns" not in metadata or "featuretypes" not in metadata:
            raise NotFound("The specified column does not exist")

        columns = metadata["columns"]

        if column not in columns:
            raise NotFound("The specified column does not exist")

        # sets new metadata: overwrite the feature type at the column's position
        index = columns.index(column)
        metadata["featuretypes"][index] = featuretype

        # may raise ValueError for an unknown feature type (caught below)
        validate_featuretypes(metadata["featuretypes"])

        df = load_dataset(dataset)

        # uses PlatIAgro SDK to save the dataset
        save_dataset(dataset, df, metadata=metadata)
    except FileNotFoundError:
        raise NotFound("The specified dataset does not exist")
    except ValueError as e:
        raise BadRequest(str(e))

    return {"name": column, "featuretype": featuretype}
Пример #14
0
def get_dataset_pagination(application_csv,
                           name,
                           operator_id,
                           page,
                           page_size,
                           run_id):
    """Retrieves a dataset, paginated and optionally rendered as csv.

    Args:
        application_csv (bool): whether to return the dataset as csv.
        name (str): the dataset name.
        operator_id (str): the operator uuid.
        page (int): page number.
        page_size (int): page size; -1 returns all records unpaginated.
        run_id (str): the run id.

    Returns:
        The dataset contents: a csv string when application_csv is true,
        otherwise a dict with "columns" and "data".

    Raises:
        NotFound: when the dataset (or its run) does not exist.
    """
    try:
        metadata = platiagro.stat_dataset(name=name, operator_id=operator_id)
        # a dataset without a run id has no materialized contents yet
        if "run_id" not in metadata:
            raise FileNotFoundError()
        dataset = platiagro.load_dataset(name=name, operator_id=operator_id, run_id=run_id)
    except FileNotFoundError as e:
        raise NotFound(str(e))

    if page_size == -1:
        # no pagination: serialize the whole dataset
        if application_csv:
            return dataset.to_csv(index=False)
        dataset = dataset.to_dict(orient="split")
        del dataset["index"]
        return dataset
    else:
        # paginate first, then optionally rebuild a dataframe for csv output
        dataset = dataset.to_dict(orient="split")
        del dataset["index"]
        pdataset = pagination_datasets(page=page, page_size=page_size, dataset=dataset)
        if application_csv:
            df = pd.DataFrame(columns=pdataset['columns'], data=pdataset['data'])
            return df.to_csv(index=False)
        return pdataset
Пример #15
0
    def get_dataset(
        self,
        project_id: str,
        experiment_id: str,
        run_id: str,
        operator_id: str,
        page: Optional[int] = 1,
        page_size: Optional[int] = 10,
        accept: Optional[str] = None,
    ):
        """
        Get dataset records from a run. Supports pagination.

        Parameters
        ----------
        project_id : str
        experiment_id : str
        run_id : str
            The run_id. If `run_id=latest`, then returns datasets from the latest run_id.
        operator_id : str
        page : int
            The page number. First page is 1.
        page_size : int
            The page size. Default value is 10.
        accept : str
            Whether dataset should be returned as csv file. Default to None.

        Returns
        -------
        list
            A list of dataset records.

        Raises
        ------
        NotFound
            When any of project_id, experiment_id, run_id, or operator_id does not exist.
        """
        # NOTE(review): `accept` is never read in this body — confirm whether
        # it should be forwarded to load_dataset or removed.
        if run_id == "latest":
            run_id = get_latest_run_id(experiment_id)

        name = self.get_dataset_name(operator_id, experiment_id)

        try:
            metadata = stat_dataset(name=name, operator_id=operator_id, run_id=run_id)
        except FileNotFoundError:
            raise NotFound(
                code="DatasetNotFound",
                message="The specified run does not contain dataset",
            )

        dataset = load_dataset(
            name=name,
            run_id=run_id,
            operator_id=operator_id,
            page=page,
            page_size=page_size,
        )
        if isinstance(dataset, pd.DataFrame):
            # Replaces NaN value by a text "NaN" so JSON encode doesn't fail
            dataset.replace(np.nan, "NaN", inplace=True, regex=True)
            data = dataset.to_dict(orient="split")
            # prefer the metadata's total; fall back to the page's row count
            total = metadata.get("total", len(dataset.index))
            return {"columns": data["columns"], "data": data["data"], "total": total}

        # non-DataFrame result — presumably a file-like/byte stream from
        # load_dataset (TODO confirm); stream it back unmodified
        return StreamingResponse(
            dataset,
            media_type="application/octet-stream",
        )
Пример #16
0
    def test_stat_dataset(self):
        """Exercises stat_dataset across names, run ids, and operator ids."""
        # unknown dataset names must raise FileNotFoundError
        with self.assertRaises(FileNotFoundError):
            stat_dataset("UNK")

        # a non-tabular dataset only reports its filename
        result = stat_dataset("mock.zip")
        expected = {
            "filename": "mock.zip",
        }
        self.assertDictEqual(result, expected)

        # a path-like name resolves to the same dataset as its basename
        result = stat_dataset("/tmp/data/mock.zip")
        expected = {
            "filename": "mock.zip",
        }
        self.assertDictEqual(result, expected)

        # a tabular dataset reports columns, featuretypes, and run_id
        result = stat_dataset("mock.csv")
        expected = {
            "columns": self.mock_columns(),
            "featuretypes": self.mock_featuretypes(),
            "filename": "mock.csv",
            "run_id": RUN_ID,
        }
        self.assertDictEqual(result, expected)

        # run_id="latest" resolves to the most recent run
        result = stat_dataset("mock.csv",
                              run_id="latest",
                              operator_id=OPERATOR_ID)
        expected = {
            "columns": self.mock_columns(),
            "featuretypes": self.mock_featuretypes(),
            "filename": "mock.csv",
            "run_id": RUN_ID,
        }
        self.assertDictEqual(result, expected)

        # an explicit run_id returns the same metadata
        result = stat_dataset("mock.csv",
                              run_id=RUN_ID,
                              operator_id=OPERATOR_ID)
        expected = {
            "columns": self.mock_columns(),
            "featuretypes": self.mock_featuretypes(),
            "filename": "mock.csv",
            "run_id": RUN_ID,
        }
        self.assertDictEqual(result, expected)

        # the RUN_ID environment variable acts as an implicit default run id
        os.environ["RUN_ID"] = RUN_ID

        result = stat_dataset("mock.csv")
        expected = {
            "columns": self.mock_columns(),
            "featuretypes": self.mock_featuretypes(),
            "filename": "mock.csv",
            "run_id": RUN_ID,
        }
        self.assertDictEqual(result, expected)

        result = stat_dataset("mock.csv", operator_id=OPERATOR_ID)
        expected = {
            "columns": self.mock_columns(),
            "featuretypes": self.mock_featuretypes(),
            "filename": "mock.csv",
            "run_id": RUN_ID,
        }
        self.assertDictEqual(result, expected)

        # a run id that does not exist must raise FileNotFoundError
        run_id = "THIS_RUN_ID_DOES_NOT_EXIST"
        with self.assertRaises(FileNotFoundError):
            stat_dataset("mock.csv", run_id=run_id)