def bind_tuple_value_expr(expr: 'AbstractExpression'):
    """Resolve a tuple-value expression against the catalog.

    Looks up the table/column bindings for the expression's table and
    column names and stores the resulting ids on the expression.
    """
    # NOTE(review): no guard for an empty binding result — pop() would
    # raise IndexError; confirm the catalog always returns a match here.
    bindings = CatalogManager().get_table_bindings(
        None, expr.table_name, expr.col_name)
    expr.table_metadata_id, col_ids = bindings
    expr.col_metadata_id = col_ids.pop()
def create_video_metadata(name: str) -> DataFrameMetadata:
    """Create video metadata object.

    We have predefined columns for such an object:
        id: the frame id
        data: the frame data

    Arguments:
        name (str): name of the metadata to be added to the catalog

    Returns:
        DataFrameMetadata: corresponding metadata for the input table info
    """
    # 'id' uniquely identifies each frame of the video.
    frame_id_col = ColumnDefinition(
        'id', ColumnType.INTEGER, None, [], ColConstraintInfo(unique=True))
    # The ndarray dimensions are set as None. We need to fix this as we
    # cannot assume. Either ask the user to provide this with load or
    # we infer this from the provided video.
    frame_data_col = ColumnDefinition(
        'data', ColumnType.NDARRAY, NdArrayType.UINT8, [None, None, None])
    col_metadata = create_column_metadata([frame_id_col, frame_data_col])
    uri = str(generate_file_path(name))
    return CatalogManager().create_metadata(
        name, uri, col_metadata, identifier_column='id')
def test_delete_metadata(self, dcs_mock, ds_mock, initdb_mock):
    """delete_metadata should look up the dataset id and delete by it."""
    dataset_name = "name"
    CatalogManager().delete_metadata(dataset_name)
    lookup_mock = ds_mock.return_value.dataset_by_name
    lookup_mock.assert_called_with(dataset_name)
    delete_mock = ds_mock.return_value.delete_dataset_by_id
    delete_mock.assert_called_with(lookup_mock.return_value)
def test_create_udf_io_object(self, udfio_mock):
    """udf_io should forward its arguments to the UdfIO constructor."""
    created = CatalogManager().udf_io('name', ColumnType.TEXT, [100], True)
    udfio_mock.assert_called_with(
        'name', ColumnType.TEXT, array_dimensions=[100], is_input=True)
    self.assertEqual(created, udfio_mock.return_value)
def test_catalog_manager_reset(self, mock_bootstrap, mock_shutdown):
    """reset() should re-init the manager and bootstrap/shutdown once."""
    manager = CatalogManager()
    init_stub = MagicMock()
    with mock.patch.object(CatalogManager, '__init__', init_stub):
        manager.reset()
        init_stub.assert_called_once_with()
        mock_bootstrap.assert_called_once_with()
        mock_shutdown.assert_called_once_with()
def test_create_udf(self, udfio_mock, udf_mock):
    """create_udf should register the io objects and create the udf row."""
    io_objects = [MagicMock()]
    created = CatalogManager().create_udf(
        'udf', 'sample.py', 'classification', io_objects)
    udfio_mock.return_value.add_udf_io.assert_called_with(io_objects)
    create_mock = udf_mock.return_value.create_udf
    create_mock.assert_called_with('udf', 'sample.py', 'classification')
    self.assertEqual(created, create_mock.return_value)
def test_create_udf_io_object(self, udfio_mock):
    """udf_io should forward ndarray type/dimensions to the constructor."""
    created = CatalogManager().udf_io(
        'name', ColumnType.NDARRAY, NdArrayType.UINT8, [2, 3, 4], True)
    udfio_mock.assert_called_with(
        'name', ColumnType.NDARRAY, array_type=NdArrayType.UINT8,
        array_dimensions=[2, 3, 4], is_input=True)
    self.assertEqual(created, udfio_mock.return_value)
def bind_function_expr(expr: FunctionExpression, column_mapping):
    """Bind a function expression to its catalog UDF entry.

    Looks up the UDF by name, resolves the selected output object (if
    any), and instantiates the UDF implementation class from its
    registered implementation file.

    Arguments:
        expr (FunctionExpression): expression to bind in place
        column_mapping: currently unused; kept for interface compatibility
    """
    catalog = CatalogManager()
    udf_obj = catalog.get_udf_by_name(expr.name)
    if expr.output:
        expr.output_obj = catalog.get_udf_io_by_name(expr.output)
        if expr.output_obj is None:
            # Fix: use the class attribute LoggingLevel.ERROR, matching
            # every other call site in this codebase — instantiating
            # LoggingLevel here was inconsistent (and fails for enums).
            LoggingManager().log(
                'Invalid output {} selected for UDF {}'.format(
                    expr.output, expr.name),
                LoggingLevel.ERROR)
    expr.function = path_to_class(udf_obj.impl_file_path, udf_obj.name)()
def bind_dataset(video_info: TableInfo) -> DataFrameMetadata:
    """Use the catalog to bind dataset information for a given video.

    Arguments:
        video_info (TableInfo): video information obtained in SQL query

    Returns:
        DataFrameMetadata: corresponding metadata for the input table info
    """
    return CatalogManager().get_dataset_metadata(
        video_info.database_name, video_info.table_name)
def test_table_binding_without_columns_returns_no_column_ids(
        self, dcs_mock, ds_mock, initdb_mock):
    """get_dataset_metadata should return the looked-up dataset object."""
    database_name = "database"
    dataset_name = "name"
    result = CatalogManager().get_dataset_metadata(
        database_name, dataset_name)
    lookup_mock = ds_mock.return_value.dataset_object_by_name
    lookup_mock.assert_called_with(database_name, dataset_name)
    self.assertEqual(result, lookup_mock.return_value)
def _old_bind_tuple_value_expr(expr):
    """Bind a tuple-value expression (deprecated interface).

    NOTE: No tests for this; should be combined with latest interface.
    """
    catalog = CatalogManager()
    table_id, column_ids = catalog.get_table_bindings(
        None, expr.table_name, [expr.col_name])
    expr.table_metadata_id = table_id
    if not isinstance(column_ids, list) or len(column_ids) == 0:
        LoggingManager().log(
            "Optimizer Utils:: bind_tuple_expr: \
            Cannot bind column name provided",
            LoggingLevel.ERROR)
        # Fix: bail out after reporting the failure — falling through to
        # pop() raised (IndexError/AttributeError) on exactly the
        # condition this branch just detected.
        return
    expr.col_metadata_id = column_ids.pop()
def bind_table_ref(video_info: TableInfo) -> int:
    """Grab the metadata id from the catalog for the input video.

    Arguments:
        video_info {TableInfo} -- input parsed video info

    Return:
        catalog entry id for the input table
    """
    bindings = CatalogManager().get_table_bindings(
        video_info.database_name, video_info.table_name, None)
    catalog_entry_id, _ = bindings
    return catalog_entry_id
def test_table_binding_without_columns_returns_no_column_ids(
        self, dcs_mock, ds_mock, initdb_mock):
    """Binding with no columns must not query the column service."""
    database_name = "database"
    dataset_name = "name"
    result = CatalogManager().get_table_bindings(
        database_name, dataset_name)
    name_lookup = ds_mock.return_value.dataset_by_name
    name_lookup.assert_called_with(dataset_name)
    column_lookup = dcs_mock.return_value.columns_by_dataset_id_and_names
    column_lookup.assert_not_called()
    self.assertEqual(result, (name_lookup.return_value, []))
def test_get_dataset_metadata_when_table_doesnot_exists(
        self, dcs_mock, ds_mock, initdb_mock):
    """A missing table should return None without fetching columns."""
    database_name = "database"
    dataset_name = "name"
    metadata_obj = None
    ds_mock.return_value.dataset_object_by_name.return_value = metadata_obj
    result = CatalogManager().get_dataset_metadata(
        database_name, dataset_name)
    ds_mock.return_value.dataset_object_by_name.assert_called_with(
        database_name, dataset_name)
    dcs_mock.return_value.columns_by_id_and_dataset_id.assert_not_called()
    self.assertEqual(result, metadata_obj)
def create_column_metadata(col_list: List[ColumnDefinition]):
    """Create column metadata for the input parsed column list.

    This function will not commit the provided columns into the catalog
    table; it only returns an in-memory list of ColumnDataframe objects.

    Arguments:
        col_list {List[ColumnDefinition]} -- parsed col list to be created
    """
    # Accept a single definition as well as a list of them.
    if isinstance(col_list, ColumnDefinition):
        col_list = [col_list]
    result_list = []
    for col in col_list:
        if col is None:
            LoggingManager().log(
                "Empty column while creating column metadata",
                LoggingLevel.ERROR)
            result_list.append(col)
            # Fix: skip to the next column — without this, the code below
            # dereferenced col.name on None and raised AttributeError.
            continue
        result_list.append(
            CatalogManager().create_column_metadata(
                col.name, col.type, col.array_type, col.dimension))
    return result_list
def column_definition_to_udf_io(
        col_list: List[ColumnDefinition], is_input: bool):
    """Create the UdfIO object for each column definition provided.

    Arguments:
        col_list(List[ColumnDefinition]): parsed input/output definitions
        is_input(bool): true if input else false
    """
    # Accept a single definition as well as a list of them.
    if isinstance(col_list, ColumnDefinition):
        col_list = [col_list]
    result_list = []
    for col in col_list:
        if col is None:
            LoggingManager().log(
                "Empty column definition while creating udf io",
                LoggingLevel.ERROR)
            result_list.append(col)
            # Fix: skip to the next column — without this, the code below
            # dereferenced col.name on None and raised AttributeError.
            continue
        result_list.append(
            CatalogManager().udf_io(
                col.name, col.type,
                array_type=col.array_type,
                dimensions=col.dimension,
                is_input=is_input))
    return result_list
def test_should_load_video_in_table(self):
    """Loading a video should make its frames readable from storage."""
    load_query = """LOAD DATA INFILE 'dummy.avi' INTO MyVideo;"""
    perform_query(load_query)
    metadata = CatalogManager().get_dataset_metadata("", "MyVideo")
    loaded = list(StorageEngine.read(metadata))
    expected = list(create_dummy_batches())
    self.assertEqual(loaded, expected)
def test_should_load_video_in_table(self):
    """Loading a video should yield exactly the dummy frames, as one batch."""
    query = """LOAD DATA INFILE 'dummy.avi' INTO MyVideo;"""
    execute_query_fetch_all(query)
    metadata = CatalogManager().get_dataset_metadata("", "MyVideo")
    # Fix: drop the throwaway empty Batch(pd.DataFrame()) that was
    # constructed and then immediately overwritten by this concat.
    actual_batch = Batch.concat(StorageEngine.read(metadata), copy=False)
    actual_batch.sort()
    expected_batch = list(create_dummy_batches())
    self.assertEqual([actual_batch], expected_batch)
def test_delete_column_metadata(self, dcs_mock, ds_mock, initdb_mock):
    """delete_column_metadata should resolve ids/objects then delete."""
    dataset_name = "name"
    columns = ["column1", "column2"]
    CatalogManager().delete_column_metadata(dataset_name, columns)
    dataset_lookup = ds_mock.return_value.dataset_by_name
    dataset_lookup.assert_called_with(dataset_name)
    id_lookup = dcs_mock.return_value.columns_by_dataset_id_and_names
    id_lookup.assert_called_with(dataset_lookup.return_value, columns)
    object_lookup = dcs_mock.return_value.columns_by_id_and_dataset_id
    object_lookup.assert_called_with(
        dataset_lookup.return_value, id_lookup.return_value)
    dcs_mock.return_value.delete_column.assert_called_with(
        object_lookup.return_value)
def test_table_binding_returns_metadata_and_column_ids(
        self, dcs_mock, ds_mock, initdb_mock):
    """Binding with columns returns the dataset and its column ids."""
    database_name = "database"
    dataset_name = "name"
    columns = ["column1", "column2"]
    result = CatalogManager().get_table_bindings(
        database_name, dataset_name, columns)
    dataset_lookup = ds_mock.return_value.dataset_by_name
    dataset_lookup.assert_called_with(dataset_name)
    column_lookup = dcs_mock.return_value.columns_by_dataset_id_and_names
    column_lookup.assert_called_with(dataset_lookup.return_value, columns)
    self.assertEqual(
        result,
        (dataset_lookup.return_value, column_lookup.return_value))
def test_dataset_by_name_should_return_name_of_model(
        self, dcs_mock, ds_mock, initdb_mock):
    # Exercises dataset_by_name in df_service.py.
    catalog = CatalogManager()
    file_url = "file1"
    set_name = "test_name"
    columns = [DataFrameColumn("column", ColumnType.INTEGER)]
    catalog.create_metadata(set_name, file_url, columns)
    created_id = ds_mock.return_value.create_dataset.return_value.id
    for column in columns:
        column.metadata_id = created_id
    real = catalog._dataset_service.dataset_by_name(set_name)
    name_lookup = ds_mock.return_value.dataset_by_name
    name_lookup.assert_called_with(set_name)
    self.assertEqual(name_lookup.return_value, real)
def test_get_dataset_metadata_when_table_exists(self, dcs_mock, ds_mock,
                                                initdb_mock):
    """An existing table should come back with its schema attached."""
    database_name = "database"
    dataset_name = "name"
    schema = [1, 2, 3]
    dataset_id = 1  # renamed from 'id' to avoid shadowing the builtin
    metadata_obj = MagicMock(id=dataset_id, schema=None)
    ds_mock.return_value.dataset_object_by_name.return_value = metadata_obj
    dcs_mock.return_value.columns_by_id_and_dataset_id.return_value = schema
    result = CatalogManager().get_dataset_metadata(
        database_name, dataset_name)
    ds_mock.return_value.dataset_object_by_name.assert_called_with(
        database_name, dataset_name)
    dcs_mock.return_value.columns_by_id_and_dataset_id.assert_called_with(
        dataset_id, None)
    self.assertEqual(result.id, dataset_id)
    self.assertEqual(result.schema, schema)
def test_create_metadata_should_create_dataset_and_columns(
        self, dcs_mock, ds_mock, initdb_mock):
    """create_metadata should create the dataset row and its columns."""
    file_url = "file1"
    dataset_name = "name"
    columns = [DataFrameColumn("c1", ColumnType.INTEGER)]
    actual = CatalogManager().create_metadata(
        dataset_name, file_url, columns)
    create_dataset_mock = ds_mock.return_value.create_dataset
    create_dataset_mock.assert_called_with(
        dataset_name, file_url, identifier_id='id')
    for column in columns:
        column.metadata_id = create_dataset_mock.return_value.id
    dcs_mock.return_value.create_column.assert_called_with(columns)
    expected = create_dataset_mock.return_value
    expected.schema = dcs_mock.return_value.create_column.return_value
    self.assertEqual(actual, expected)
def test_should_load_video_in_table(self):
    """Loading a video should yield exactly the dummy frames."""
    load_query = """LOAD DATA INFILE 'dummy.avi' INTO MyVideo;"""
    perform_query(load_query)
    metadata = CatalogManager().get_dataset_metadata("", "MyVideo")
    # Accumulate everything the storage engine returns into one batch.
    merged = Batch(pd.DataFrame())
    for chunk in StorageEngine.read(metadata):
        merged += chunk
    merged.sort()
    expected = list(create_dummy_batches())
    self.assertEqual([merged], expected)
def setUp(self):
    """Prepare a fresh catalog, a sample video table, and a dummy UDF."""
    # Start every test from a clean catalog state.
    CatalogManager().reset()
    # Materialize a dummy video file (NUM_FRAMES frames) on disk.
    create_sample_video(NUM_FRAMES)
    # Register the sample video in the catalog as table MyVideo.
    load_query = """LOAD DATA INFILE 'dummy.avi' INTO MyVideo;"""
    execute_query_fetch_all(load_query)
    # Register the DummyObjectDetector UDF used by the tests;
    # its implementation lives in test/util.py.
    create_udf_query = """CREATE UDF DummyObjectDetector
        INPUT (Frame_Array NDARRAY UINT8(3, 256, 256))
        OUTPUT (label NDARRAY STR(10))
        TYPE Classification
        IMPL 'test/util.py';
        """
    execute_query_fetch_all(create_udf_query)
def exec(self):
    """Create udf executor.

    Calls the catalog to create udf metadata.
    """
    # check catalog if it already has this udf entry
    if self.node.if_not_exists:
        return
    io_list = list(self.node.inputs) + list(self.node.outputs)
    impl_path = self.node.impl_path.absolute().as_posix()
    CatalogManager().create_udf(
        self.node.name, impl_path, self.node.udf_type, io_list)
def test_create_plan(self):
    """CreatePlan should expose its operator type, flag, and columns."""
    dummy_table = TableRef(TableInfo('dummy'))
    CatalogManager().reset()
    columns = [
        DataFrameColumn('id', ColumnType.INTEGER),
        DataFrameColumn('name', ColumnType.TEXT, array_dimensions=50),
    ]
    plan = CreatePlan(dummy_table, columns, False)
    self.assertEqual(plan.opr_type, PlanOprType.CREATE)
    self.assertEqual(plan.if_not_exists, False)
    self.assertEqual(plan.video_ref.table.table_name, "dummy")
    self.assertEqual(plan.column_list[0].name, "id")
    self.assertEqual(plan.column_list[1].name, "name")
def exec(self):
    """Create table executor.

    Calls the catalog to create metadata corresponding to the table,
    then asks the storage engine to create a dataframe from it.
    """
    # check catalog if we already have this table
    if self.node.if_not_exists:
        return
    table_name = self.node.video_ref.table_info.table_name
    file_url = str(generate_file_path(table_name))
    metadata = CatalogManager().create_metadata(
        table_name, file_url, self.node.column_list)
    StorageEngine.create(table=metadata)
def exec(self):
    """Create table executor.

    Calls the catalog to create metadata corresponding to the table,
    then creates a spark dataframe from the metadata object.
    """
    # check catalog if we already have this table
    if self.node.if_not_exists:
        return
    # Generate a file_url to be used for the table.
    # Hard coding a path right now; should write an auto-generator.
    table_name = self.node.video_ref.table_info.table_name
    file_url = 'file://' + os.path.join(tempfile.gettempdir(), table_name)
    metadata = CatalogManager().create_metadata(
        table_name, file_url, self.node.column_list)
    create_dataframe(metadata)
    return file_url
def exec(self):
    """Construct a valid tuple for the target table from the given values.

    Right now we assume there are no missing values.
    """
    table_id = self.node.video_id
    evaluated_columns = []
    for col, val in zip(self.node.column_list, self.node.value_list):
        evaluated = val.evaluate()
        # Label the evaluated frame with its destination column name.
        evaluated.frames.columns = [col.col_name]
        evaluated_columns.append(evaluated)
    batch = Batch.merge_column_wise(evaluated_columns)
    metadata = CatalogManager().get_metadata(table_id)
    # verify value types are consistent
    batch.frames = SchemaUtils.petastorm_type_cast(
        metadata.schema.petastorm_schema, batch.frames)
    StorageEngine.write(metadata, batch)