def update_dataset(self, data_frame, dataset_id, table_name, update_policy):
    """Update a previously created dataset with a Pandas DataFrame.

    :param data_frame: Pandas DataFrame to use to update an in-memory dataset
    :param dataset_id: Identifier of the dataset to update, provided by create_dataset()
    :param table_name: Name of the table to update within the dataset
    :param update_policy: Update operation to perform. One of 'add' (inserts new,
        unique rows), 'update' (updates data in existing rows and columns),
        'upsert' (updates existing data and inserts new rows), 'replace'
        (similar to truncate and load, replaces the existing data with new data)
    """
    # warning for future deprecation / replacement by Datasets class
    warnings.warn(
        "This method will be deprecated. The Dataset constructor is preferred and supports multi-table data.",
        DeprecationWarning)

    # Replace '.' with '_' in df column names, then strip leading/trailing
    # whitespace. regex=False is essential: with pandas' historical default
    # (regex=True) the pattern "." matches *every* character, which would
    # turn each column name into a string of underscores.
    _df = data_frame.copy()
    _df.columns = _df.columns.str.replace(".", "_", regex=False)
    _df.columns = _df.columns.str.strip()

    # create dataset instance, add table, then publish the updates to the dataset
    ds = Dataset(connection=self, dataset_id=dataset_id)
    ds.add_table(name=table_name, data_frame=_df, update_policy=update_policy)
    ds.update()
    ds.publish()
def save_as(self, name, description=None, folder_id=None, table_name=None):
    """Persist the cube's DataFrame (cube.dataframe) as a brand-new
    single-table dataset. Make sure the data exists before calling.

    Args:
        name(str): Name of cube.
        description(str): Description of the cube.
        folder_id (str, optional): ID of the shared folder that the dataset
            should be created within. If `None`, defaults to the user's
            My Reports folder.
        table_name (str, optional): Name of the table. If None (default),
            the first table name of the original cube will be used.
    """
    if len(self._tables) > 1:
        helper.exception_handler(
            msg="""This feature works only for the single-table cubes. \rTo export multi-table cube use Dataset class."""
        )
    else:
        # fall back to the cube's own first table name when none was given
        chosen_table = table_name if table_name is not None else self._tables[0]["name"]
        new_dataset = Dataset(self._connection, name=name, description=description)
        new_dataset.add_table(name=chosen_table,
                              data_frame=self.dataframe,
                              update_policy="add")
        new_dataset.create(folder_id=folder_id)
def test_add_table(self):
    # Each call to add_table should grow the internal table list by one.
    dataset = Dataset(connection={}, name="test_name")
    for expected_count, label in enumerate(("TEST1", "TEST2"), start=1):
        dataset.add_table(name=label, data_frame=make_df(), update_policy="add")
        self.assertEqual(len(dataset._tables), expected_count)
def create_dataset(self, data_frame, dataset_name, table_name, to_metric=None,
                   to_attribute=None, folder_id=None):
    """Create an in-memory MicroStrategy dataset from a Pandas DataFrame.

    :param data_frame: A Pandas DataFrame from which an in-memory dataset
        will be created
    :param dataset_name: Name of the in-memory dataset
    :param table_name: Name of the table to create within the dataset
    :param to_metric: (optional) A vector of column names from the DataFrame
        to format as metrics in the dataset. By default, numeric types are
        formatted as metrics while character and date types are formatted as
        attributes. For example, a column of integer-like strings
        ("1", "2", "3") would appear as an attribute in the newly created
        dataset. If the intent is to format this data as a metric, provide
        the corresponding column name as \\code{to_metric=c('myStringIntegers')}
    :param to_attribute: (optional) Logical opposite of to_metric. Helpful for
        formatting an integer-based row identifier as a primary key in the
        dataset
    :param folder_id: (optional) ID of the shared folder that the dataset
        should be created within. If `None`, defaults to the user's
        My Reports folder.
    :return: Unique identifier of the newly created dataset. Required for
        update_dataset()
    """
    # warning for future deprecation / replacement by Datasets class
    warnings.warn(
        "This method will be deprecated. The Dataset constructor is preferred and supports multi-table data.",
        DeprecationWarning)

    # Replace '.' with '_' in df column names, then strip leading/trailing
    # whitespace. regex=False is essential: with pandas' historical default
    # (regex=True) the pattern "." matches *every* character, which would
    # turn each column name into a string of underscores.
    _df = data_frame.copy()
    _df.columns = _df.columns.str.replace(".", "_", regex=False)
    _df.columns = _df.columns.str.strip()

    # an empty folder id selects the user's My Reports folder server-side
    if folder_id is None:
        folder_id = ""

    # create dataset instance
    ds = Dataset(connection=self, name=dataset_name)

    # add table to the dataset
    ds.add_table(name=table_name, data_frame=_df, update_policy='add',
                 to_metric=to_metric, to_attribute=to_attribute)

    # publish the dataset
    ds.create(folder_id=folder_id)

    return ds.dataset_id
def test_init_null_values(self):
    # A Dataset built with only a name should leave all optional state unset.
    dataset = Dataset(connection={}, name="__test_name")
    for attr in ("description", "dataset_id", "_definition",
                 "_session_id", "_folder_id", "upload_body"):
        self.assertIsNone(getattr(dataset, attr))
    self.assertEqual(len(dataset._tables), 0)
def test_init_non_null(self, mock_definition):
    # Constructor arguments should be stored on the instance; passing a
    # dataset_id should fetch the dataset definition via the mocked call.
    expected_name = "TEST"
    expected_desc = "TEST DESCRIPTION"
    expected_id = "id1234567890"
    definition_payload = {'name': 'test_name', 'id': expected_id}
    mock_definition.return_value = Mock(ok=True)
    mock_definition.return_value.json.return_value = definition_payload

    named_ds = Dataset(connection={}, name=expected_name)
    self.assertEqual(named_ds._name, expected_name)

    described_ds = Dataset(connection={}, name=expected_name,
                           description=expected_desc)
    self.assertEqual(described_ds._desc, expected_desc)

    loaded_ds = Dataset(connection={}, dataset_id=expected_id)
    self.assertTrue(mock_definition.called)
    self.assertEqual(loaded_ds._definition, definition_payload)
    self.assertEqual(loaded_ds._dataset_id, expected_id)
    self.assertEqual(loaded_ds._name, definition_payload['name'])
def update(self, update_policy='upsert'):
    """Push the cube's DataFrame (cube.dataframe) back to the server for a
    single-table cube. Modify the data frame before calling this.

    Args:
        update_policy(str): Update operation to perform. One of 'add'
            (inserts new, unique rows), 'update' (updates data in existing
            rows and columns), 'upsert' (updates existing data and inserts
            new rows), or 'replace' (replaces the existing data with new
            data).
    """
    if len(self._tables) > 1:
        helper.exception_handler(
            msg="""This feature works only for the single-table cubes. \rTo update multi-table cube use Dataset class."""
        )
    else:
        target_table = self._tables[0]["name"]
        target_dataset = Dataset(self._connection, dataset_id=self._cube_id)
        target_dataset.add_table(name=target_table,
                                 data_frame=self.dataframe,
                                 update_policy=update_policy)
        target_dataset.update()
# prepare Pandas DataFrames for the dataset's tables
stores_df = pd.DataFrame(
    {"store_id": [1, 2, 3],
     "location": ["New York", "Seattle", "Los Angeles"]},
    columns=["store_id", "location"])
sales_df = pd.DataFrame(
    {"store_id": [1, 2, 3],
     "category": ["TV", "Books", "Accessories"],
     "sales": [400, 200, 100],
     "sales_fmt": ["$400", "$200", "$100"]},
    columns=["store_id", "category", "sales", "sales_fmt"])

# register both tables on the dataset, then create it
# by default 'create()' also uploads the data to the I-Server and publishes it;
# control that behaviour via the `auto_upload` and `auto_publish` parameters
ds = Dataset(connection=connection, name="Store Analysis")
ds.add_table(name="Stores", data_frame=stores_df, update_policy="add")
ds.add_table(name="Sales", data_frame=sales_df, update_policy="add")
ds.create()

# `Dataset.add_table()` maps Pandas data types onto MicroStrategy ones:
# numeric columns become metrics and non-numeric columns become attributes.
# Override that per column with the `to_attribute` / `to_metric` arguments.
ds.add_table(name="Stores", data_frame=stores_df, update_policy="add",
             to_attribute=["store_id"])
ds.add_table(name="Sales", data_frame=sales_df, update_policy="add",
             to_attribute=["store_id"], to_metric=["sales_fmt"])

# updating an existing dataset works very much like creating one
def test_duplicate_col_attr_metr_override(self):
    # A column listed in both to_attribute and to_metric must be rejected.
    dataset = Dataset(connection={}, name="test_name")
    with self.assertRaises(ValueError):
        dataset.add_table(name="TEST", data_frame=make_df(),
                          update_policy="add",
                          to_attribute=["age"], to_metric=["age"])
def test_invalid_metr_override(self):
    # A metric-override column absent from the source table must be rejected.
    dataset = Dataset(connection={}, name="test_name")
    with self.assertRaises(ValueError):
        dataset.add_table(name="TEST", data_frame=make_df(),
                          update_policy="add", to_metric=["invalid"])
def test_invalid_update_policy(self):
    # Any update_policy outside the supported set must be rejected.
    dataset = Dataset(connection={}, name="test_name")
    with self.assertRaises(ValueError):
        dataset.add_table(name="TEST", data_frame=make_df(),
                          update_policy="invalid")