def get_dataset_pagination(project_id, experiment_id, operator_id, page, page_size):
    """Retrieves a dataset as json.

    Args:
        project_id (str): the project uuid.
        experiment_id (str): the experiment uuid.
        operator_id (str): the operator uuid.
        page (int): the page number.
        page_size (int): the page size.

    Returns:
        dict: the paginated dataset.
    """
    raise_if_project_does_not_exist(project_id)
    raise_if_experiment_does_not_exist(experiment_id)

    operator = Operator.query.get(operator_id)
    if operator is None:
        raise NotFound("The specified operator does not exist")

    # get dataset name
    dataset = operator.parameters.get("dataset")
    if dataset is None:
        raise NotFound("The specified operator does not have a dataset")

    try:
        metadata = platiagro.stat_dataset(name=dataset, operator_id=operator_id)
        if "run_id" not in metadata:
            raise FileNotFoundError()

        dataset = platiagro.load_dataset(name=dataset, run_id="latest", operator_id=operator_id)
        dataset = dataset.to_dict(orient="split")
        del dataset["index"]
    except FileNotFoundError as e:
        raise NotFound(str(e))

    return pagination_datasets(page=page, page_size=page_size, elements=dataset)
def get_dataset(project_id, experiment_id, operator_id):
    """Retrieves a dataset as json.

    Args:
        project_id (str): the project uuid.
        experiment_id (str): the experiment uuid.
        operator_id (str): the operator uuid.

    Returns:
        dict: the dataset.
    """
    raise_if_project_does_not_exist(project_id)

    experiment = Experiment.query.get(experiment_id)
    if experiment is None:
        raise NotFound("The specified experiment does not exist")

    raise_if_operator_does_not_exist(operator_id)

    try:
        metadata = platiagro.stat_dataset(name=experiment.dataset, operator_id=operator_id)
        if "run_id" not in metadata:
            raise FileNotFoundError()

        dataset = platiagro.load_dataset(name=experiment.dataset, run_id="latest", operator_id=operator_id)
        dataset = dataset.to_dict(orient="split")
        del dataset["index"]
    except FileNotFoundError as e:
        raise NotFound(str(e))

    return dataset
def get_dataset(name, page=1, page_size=10):
    """
    Details a dataset from our object storage.

    Parameters
    ----------
    name : str
        The dataset name to look for in our object storage.
    page : int or str
        The page number. First page is 1. Defaults to 1.
    page_size : int or str
        The page size. Defaults to 10. Use -1 to return all records.

    Returns
    -------
    dict
        The dataset details: name, columns, and filename.

    Raises
    ------
    NotFound
        When the dataset does not exist.
    BadRequest
        When page or page_size is not a valid integer.
    """
    try:
        page, page_size = int(page), int(page_size)
        metadata = stat_dataset(name)

        filename = metadata.get("original-filename")
        dataset = {"name": name, "filename": filename}

        if "columns" in metadata and "featuretypes" in metadata:
            columns = metadata["columns"]
            featuretypes = metadata["featuretypes"]
            columns = [
                {"name": col, "featuretype": ftype}
                for col, ftype in zip(columns, featuretypes)
            ]

            content = load_dataset(name)
            # Replaces NaN and infinity values by text so JSON encode doesn't fail
            content.replace(np.nan, "NaN", inplace=True, regex=True)
            content.replace(np.inf, "Inf", inplace=True, regex=True)
            content.replace(-np.inf, "-Inf", inplace=True, regex=True)

            data = content.values.tolist()
            if page_size != -1:
                data = data_pagination(content=data, page=page, page_size=page_size)

            dataset.update(
                {"columns": columns, "data": data, "total": len(content.index)}
            )

        return dataset
    except FileNotFoundError:
        raise NOT_FOUND
    except ValueError:
        raise BadRequest("ValueError", VALUE_ERROR_MESSAGE)
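# `data_pagination` is called above but not defined in this section. A minimal
# sketch of what such a helper might look like, assuming 1-based page numbers
# and a plain slice over a list of records; the name matches the call site, but
# the body below is an assumption, not the project's actual implementation.
def data_pagination(content, page, page_size):
    """Returns a single page from a list of records (hypothetical sketch)."""
    start = (page - 1) * page_size
    # slicing past the end yields an empty list rather than raising
    return content[start:start + page_size]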
def update_column(dataset, column, featuretype):
    """
    Updates a column from a dataset.

    Parameters
    ----------
    dataset : str
        The dataset name.
    column : str
        The column name.
    featuretype : str
        The feature type (Numerical, Categorical, or DateTime).

    Returns
    -------
    dict
        The column info.

    Raises
    ------
    NotFound
        When the dataset or column does not exist.
    BadRequest
        When the featuretype is invalid.
    """
    try:
        metadata = stat_dataset(dataset)

        if "columns" not in metadata or "featuretypes" not in metadata:
            raise COLUMN_NOT_FOUND

        columns = metadata["columns"]
        if column not in columns:
            raise COLUMN_NOT_FOUND

        # sets new metadata
        index = columns.index(column)
        metadata["featuretypes"][index] = featuretype
        validate_featuretypes(metadata["featuretypes"])

        df = load_dataset(dataset)

        # uses PlatIAgro SDK to save the dataset
        save_dataset(dataset, df, metadata=metadata)
    except FileNotFoundError:
        raise DATASET_NOT_FOUND
    except ValueError as e:
        raise BadRequest("ValueError", str(e))

    return {"name": column, "featuretype": featuretype}
def update_column(dataset: str, column: str, featuretype: str) -> Dict[str, str]:
    """Updates a column from a dataset.

    Args:
        dataset (str): the dataset name.
        column (str): the column name.
        featuretype (str): the feature type (Numerical, Categorical, or DateTime).

    Returns:
        The column info.

    Raises:
        NotFound: when the dataset or column does not exist.
        BadRequest: when the featuretype is invalid.
    """
    try:
        metadata = stat_dataset(dataset)

        if "columns" not in metadata or "featuretypes" not in metadata:
            raise NotFound("The specified column does not exist")

        columns = metadata["columns"]
        if column not in columns:
            raise NotFound("The specified column does not exist")

        # sets new metadata
        index = columns.index(column)
        metadata["featuretypes"][index] = featuretype
        validate_featuretypes(metadata["featuretypes"])

        df = load_dataset(dataset)

        # uses PlatIAgro SDK to save the dataset
        save_dataset(dataset, df, metadata=metadata)
    except FileNotFoundError:
        raise NotFound("The specified dataset does not exist")
    except ValueError as e:
        raise BadRequest(str(e))

    return {"name": column, "featuretype": featuretype}
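# Usage sketch for update_column. The dataset and column names below are
# hypothetical, chosen only to illustrate the call; an invalid featuretype
# would surface as a BadRequest raised by validate_featuretypes.
result = update_column(dataset="iris", column="sepal_length", featuretype="Numerical")
assert result == {"name": "sepal_length", "featuretype": "Numerical"}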
def get_dataset_pagination(application_csv, name, operator_id, page, page_size, run_id):
    """Retrieves a dataset.

    Args:
        application_csv (bool): whether to return the dataset as csv.
        name (str): the dataset name.
        operator_id (str): the operator uuid.
        page (int): the page number.
        page_size (int): the number of records per page. Use -1 to return all records.
        run_id (str): the run id.

    Returns:
        Dataset.
    """
    try:
        metadata = platiagro.stat_dataset(name=name, operator_id=operator_id)
        if "run_id" not in metadata:
            raise FileNotFoundError()

        dataset = platiagro.load_dataset(name=name, operator_id=operator_id, run_id=run_id)
    except FileNotFoundError as e:
        raise NotFound(str(e))

    if page_size == -1:
        if application_csv:
            return dataset.to_csv(index=False)
        dataset = dataset.to_dict(orient="split")
        del dataset["index"]
        return dataset
    else:
        dataset = dataset.to_dict(orient="split")
        del dataset["index"]
        pdataset = pagination_datasets(page=page, page_size=page_size, dataset=dataset)
        if application_csv:
            df = pd.DataFrame(columns=pdataset["columns"], data=pdataset["data"])
            return df.to_csv(index=False)
        return pdataset
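# `pagination_datasets` is called here (keyword `dataset=`) and in the first
# `get_dataset_pagination` above (keyword `elements=`), but its definition is
# not shown in this section. A minimal sketch, assuming it pages the "data" key
# of a split-oriented dict; the name and behavior below are assumptions.
def pagination_datasets(page, page_size, dataset):
    """Returns one page of a split-oriented dataset dict (hypothetical sketch)."""
    data = dataset["data"]
    start = (page - 1) * page_size
    return {
        "columns": dataset["columns"],
        "data": data[start:start + page_size],
        "total": len(data),
    }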
def test_load_dataset(self):
    with self.assertRaises(FileNotFoundError):
        load_dataset("UNK")

    # a file that is not decodable text (UnicodeDecodeError) is returned as raw bytes
    result = load_dataset("mock.zip")
    self.assertIsInstance(result, BytesIO)

    # a file with no tabular data (EmptyDataError) is also returned as raw bytes
    result = load_dataset("mock.jpg")
    self.assertIsInstance(result, BytesIO)

    result = load_dataset("mock.csv")
    expected = pd.DataFrame(
        data=[self.mock_values() for x in range(int(1e2))],
        columns=self.mock_columns(),
    )
    self.assertTrue(result.equals(expected))

    result = load_dataset("mock.csv", run_id=RUN_ID, operator_id=OPERATOR_ID)
    expected = pd.DataFrame(
        data=[self.mock_values() for x in range(int(1e2))],
        columns=self.mock_columns(),
    )
    self.assertTrue(result.equals(expected))

    result = load_dataset("mock.csv", run_id="latest", operator_id=OPERATOR_ID)
    expected = pd.DataFrame(
        data=[self.mock_values() for x in range(int(1e2))],
        columns=self.mock_columns(),
    )
    self.assertTrue(result.equals(expected))
def get_dataset(
    self,
    project_id: str,
    experiment_id: str,
    run_id: str,
    operator_id: str,
    page: Optional[int] = 1,
    page_size: Optional[int] = 10,
    accept: Optional[str] = None,
):
    """
    Get dataset records from a run. Supports pagination.

    Parameters
    ----------
    project_id : str
        The project uuid.
    experiment_id : str
        The experiment uuid.
    run_id : str
        The run id. If `run_id=latest`, returns the dataset from the latest run.
    operator_id : str
        The operator uuid.
    page : int
        The page number. First page is 1.
    page_size : int
        The page size. Defaults to 10.
    accept : str
        Whether the dataset should be returned as a csv file. Defaults to None.

    Returns
    -------
    dict or StreamingResponse
        The paginated dataset records, or a file stream.

    Raises
    ------
    NotFound
        When any of project_id, experiment_id, run_id, or operator_id does not exist.
    """
    if run_id == "latest":
        run_id = get_latest_run_id(experiment_id)

    name = self.get_dataset_name(operator_id, experiment_id)

    try:
        metadata = stat_dataset(name=name, operator_id=operator_id, run_id=run_id)
    except FileNotFoundError:
        raise NotFound(
            code="DatasetNotFound",
            message="The specified run does not contain dataset",
        )

    dataset = load_dataset(
        name=name,
        run_id=run_id,
        operator_id=operator_id,
        page=page,
        page_size=page_size,
    )

    if isinstance(dataset, pd.DataFrame):
        # Replaces NaN value by a text "NaN" so JSON encode doesn't fail
        dataset.replace(np.nan, "NaN", inplace=True, regex=True)
        data = dataset.to_dict(orient="split")
        total = metadata.get("total", len(dataset.index))
        return {"columns": data["columns"], "data": data["data"], "total": total}

    return StreamingResponse(
        dataset,
        media_type="application/octet-stream",
    )
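# For reference, the dataframe branch above returns a split-oriented payload
# with the keys built in the return statement; the values in this example are
# made up for illustration.
#
#     {
#         "columns": ["sepal_length", "species"],
#         "data": [[5.1, "setosa"], [4.9, "setosa"]],
#         "total": 150,
#     }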
def create_prediction(
    self,
    deployment_id: str,
    upload_file: Optional[UploadFile] = None,
    dataset: Optional[str] = None,
):
    """
    POST a prediction file to seldon deployment.

    Parameters
    ----------
    deployment_id : str
        The deployment uuid.
    upload_file : starlette.datastructures.UploadFile
        File buffer.
    dataset : str
        Dataset name.

    Returns
    -------
    prediction_as_schema : schemas.prediction.PredictionBase

    Raises
    ------
    BadRequest
        When neither a dataset name nor a file is given, or the dataset is invalid.
    """
    if upload_file is not None:
        file = upload_file.file
        request = parse_file_buffer_to_seldon_request(file=file._file)
    elif dataset is not None:
        try:
            dataset = load_dataset(dataset)
            request = parse_dataframe_to_seldon_request(dataframe=dataset)
        except AttributeError:
            # load_dataset returned a raw file buffer instead of a dataframe
            request = parse_file_buffer_to_seldon_request(file=dataset)
        except FileNotFoundError:
            raise BadRequest(code="InvalidDataset", message="a valid dataset is required")
    else:
        raise BadRequest(
            code="MissingRequiredDatasetOrFile",
            message="either dataset name or file is required",
        )

    prediction_object = self.create_prediction_database_object(
        prediction_id=str(uuid_alpha()),
        deployment_id=deployment_id,
        request_body=request,
        response_body=None,
        status="started",
    )
    prediction_as_schema = schemas.PredictionBase.from_orm(prediction_object)

    url = get_seldon_deployment_url(deployment_id=deployment_id, external_url=False)
    self.background_tasks.add_task(
        self.start_and_save_seldon_prediction,
        request_body=request,
        prediction_object=prediction_object,
        url=url,
    )
    return prediction_as_schema
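# `parse_dataframe_to_seldon_request` is not shown in this section. A minimal
# sketch, assuming the deployment speaks Seldon Core's default prediction
# protocol (column names under data.names, row values under data.ndarray); the
# body below is an assumption, not the project's actual parser.
def parse_dataframe_to_seldon_request(dataframe):
    """Builds a Seldon Core prediction payload from a dataframe (hypothetical sketch)."""
    return {
        "data": {
            "names": dataframe.columns.tolist(),
            "ndarray": dataframe.values.tolist(),
        }
    }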