def create_video_metadata(name: str) -> DataFrameMetadata:
    """Create video metadata object.

    We have predefined columns for such an object:
      id: the frame id
      data: the frame data

    Arguments:
        name (str): name of the metadata to be added to the catalog

    Returns:
        DataFrameMetadata: corresponding metadata for the input table info
    """
    catalog = CatalogManager()
    id_column = ColumnDefinition(
        'id', ColumnType.INTEGER, None, [], ColConstraintInfo(unique=True))
    # the ndarray dimensions are set as None. We need to fix this as we
    # cannot assume. Either ask the user to provide this with load or
    # we infer this from the provided video.
    data_column = ColumnDefinition(
        'data', ColumnType.NDARRAY, NdArrayType.UINT8, [None, None, None])
    col_metadata = create_column_metadata([id_column, data_column])
    uri = str(generate_file_path(name))
    return catalog.create_metadata(
        name, uri, col_metadata, identifier_column='id', is_video=True)
def exec(self):
    """Drop table executor.

    Drops the table's backing storage (video or structured) and removes
    its catalog metadata, yielding a one-row status batch.

    Raises:
        RuntimeError: if the table does not exist (without IF EXISTS) or
        the catalog fails to drop the metadata.
    """
    catalog_manager = CatalogManager()
    if len(self.node.table_refs) > 1:
        # fix: not inside an except block, so logger.exception was a
        # misuse; report the unsupported form as an error
        logger.error('Drop supports only single table')
    table_ref = self.node.table_refs[0]
    if not catalog_manager.check_table_exists(
            table_ref.table.database_name, table_ref.table.table_name):
        # fix: 'exsits' typo in the user-facing message
        err_msg = "Table: {} does not exist".format(table_ref)
        if self.node.if_exists:
            # DROP TABLE IF EXISTS on a missing table is a no-op.
            # fix: logger.warn is deprecated; also the original fell
            # through and crashed dereferencing table_obj below.
            logger.warning(err_msg)
            return
        logger.error(err_msg)
        raise RuntimeError(err_msg)

    # Dispatch to the storage engine that owns the table's data.
    if table_ref.table.table_obj.is_video:
        VideoStorageEngine.drop(table=table_ref.table.table_obj)
    else:
        StorageEngine.drop(table=table_ref.table.table_obj)

    success = catalog_manager.drop_dataset_metadata(
        table_ref.table.database_name, table_ref.table.table_name)
    if not success:
        err_msg = "Failed to drop {}".format(table_ref)
        # fix: the original logged a failure yet still yielded a
        # success message; surface the failure to the caller instead
        logger.error(err_msg)
        raise RuntimeError(err_msg)

    yield Batch(
        pd.DataFrame(
            {
                "Table Successfully dropped: {}".format(
                    table_ref.table.table_name)
            },
            index=[0],
        ))
def test_catalog_manager_reset(self, mock_bootstrap, mock_shutdown):
    """reset() must re-run __init__, bootstrap and shutdown exactly once."""
    manager = CatalogManager()
    init_mock = MagicMock()
    with mock.patch.object(CatalogManager, '__init__', init_mock):
        manager.reset()
        init_mock.assert_called_once_with()
        mock_bootstrap.assert_called_once_with()
        mock_shutdown.assert_called_once_with()
def test_create_udf(self, udfio_mock, udf_mock):
    # create_udf should forward the io list and the udf attributes to
    # the underlying services and return the created entry.
    manager = CatalogManager()
    io_list = [MagicMock()]
    result = manager.create_udf(
        'udf', 'sample.py', 'classification', io_list)
    udfio_mock.return_value.add_udf_io.assert_called_with(io_list)
    udf_mock.return_value.create_udf.assert_called_with(
        'udf', 'sample.py', 'classification')
    self.assertEqual(result, udf_mock.return_value.create_udf.return_value)
def test_get_udf_inputs(self, udf_mock):
    # A valid UdfMetadata object is resolved through its id.
    inputs_func = udf_mock.return_value.get_inputs_by_udf_id
    metadata = MagicMock(spec=UdfMetadata)
    CatalogManager().get_udf_inputs(metadata)
    inputs_func.assert_called_once_with(metadata.id)
    # Anything that is not a UdfMetadata must be rejected.
    with self.assertRaises(ValueError):
        CatalogManager().get_udf_inputs(MagicMock())
def test_create_udf_io_object(self, udfio_mock):
    # udf_io should construct a UdfIO with keyword arguments and
    # return the constructed object unchanged.
    result = CatalogManager().udf_io(
        'name', ColumnType.NDARRAY, NdArrayType.UINT8, [2, 3, 4], True)
    udfio_mock.assert_called_with(
        'name', ColumnType.NDARRAY,
        array_type=NdArrayType.UINT8,
        array_dimensions=[2, 3, 4],
        is_input=True)
    self.assertEqual(result, udfio_mock.return_value)
def exec(self):
    """Show executor.

    Fetches all UDF entries from the catalog (for SHOW UDFS) and yields
    them as a single batch in display format.
    """
    catalog_manager = CatalogManager()
    show_entries = []
    if self.node.show_type is ShowType.UDFS:
        show_entries = [udf.display_format()
                        for udf in catalog_manager.get_all_udf_entries()]
    yield Batch(pd.DataFrame(show_entries))
def test_get_dataset_metadata_when_table_doesnot_exists(
        self, dcs_mock, ds_mock, initdb_mock):
    # A missing dataset is returned as-is (None) and the column lookup
    # must never run.
    catalog = CatalogManager()
    missing = None
    ds_mock.return_value.dataset_object_by_name.return_value = missing
    result = catalog.get_dataset_metadata("database", "name")
    ds_mock.return_value.dataset_object_by_name.assert_called_with(
        "database", "name")
    dcs_mock.return_value.columns_by_id_and_dataset_id.assert_not_called()
    self.assertEqual(result, missing)
def exec(self):
    """Rename table executor.

    Asks the catalog to update the metadata of the target table with
    its new name.
    """
    catalog = CatalogManager()
    catalog.rename_table(self.node.new_name, self.node.old_table.table)
def setUpClass(cls):
    # Start from a clean catalog, stage the sample video file, load it
    # as a table and register the inbuilt UDFs.
    CatalogManager().reset()
    copy_sample_video_to_prefix()
    load_query = """LOAD DATA INFILE 'ua_detrac.mp4' INTO MyVideo;"""
    execute_query_fetch_all(load_query)
    load_inbuilt_udfs()
def exec(self):
    """Create udf executor.

    Registers the UDF (name, implementation path, type and input/output
    definitions) with the catalog. With IF NOT EXISTS, an existing entry
    makes this a no-op.
    """
    catalog_manager = CatalogManager()
    if self.node.if_not_exists:
        # Skip creation when an entry with this name already exists.
        if catalog_manager.get_udf_by_name(self.node.name):
            return
    io_list = list(self.node.inputs) + list(self.node.outputs)
    impl_path = self.node.impl_path.absolute().as_posix()
    catalog_manager.create_udf(
        self.node.name, impl_path, self.node.udf_type, io_list)
def setUpClass(cls):
    # Fresh catalog, a sample video, inbuilt UDFs, and three numeric
    # tables of increasing size for the tests in this class.
    CatalogManager().reset()
    create_sample_video(NUM_FRAMES)
    execute_query_fetch_all(
        """LOAD DATA INFILE 'dummy.avi' INTO MyVideo;""")
    load_inbuilt_udfs()
    cls.table1 = create_table("table1", 100, 3)
    cls.table2 = create_table("table2", 500, 3)
    cls.table3 = create_table("table3", 1000, 3)
def test_drop_plan(self):
    # A DropPlan should record the DROP operator type and its targets.
    CatalogManager().reset()
    target = TableRef(TableInfo('dummy'))
    plan = DropPlan([target], False)
    self.assertEqual(plan.opr_type, PlanOprType.DROP)
    self.assertEqual(plan.table_refs[0].table.table_name, "dummy")
def test_rename_plan(self):
    # A RenamePlan should capture both the old table and the new name.
    CatalogManager().reset()
    old_table = TableRef(TableInfo("old"))
    new_info = TableInfo("new")
    plan = RenamePlan(old_table, new_info)
    self.assertEqual(plan.opr_type, PlanOprType.RENAME)
    self.assertEqual(plan.old_table.table.table_name, "old")
    self.assertEqual(plan.new_name.table_name, "new")
def create_table_metadata(
        table_ref: TableRef,
        columns: List[ColumnDefinition]) -> DataFrameMetadata:
    """Create and persist catalog metadata for a new table.

    Arguments:
        table_ref (TableRef): reference carrying the target table name
        columns (List[ColumnDefinition]): parsed column definitions

    Returns:
        DataFrameMetadata: the catalog entry created for the table
    """
    name = table_ref.table.table_name
    column_metadata = create_column_metadata(columns)
    url = str(generate_file_path(name))
    return CatalogManager().create_metadata(name, url, column_metadata)
def bind_table_info(table_info: TableInfo) -> DataFrameMetadata:
    """Resolve *table_info* against the catalog.

    Looks up the dataset metadata for the given table and attaches it
    to ``table_info.table_obj``.

    Arguments:
        table_info (TableInfo): table information obtained from the SQL query

    Raises:
        RuntimeError: if the table is not present in the catalog
    """
    obj = CatalogManager().get_dataset_metadata(
        table_info.database_name, table_info.table_name)
    if not obj:
        error = '{} does not exists. Create the table using \
CREATE TABLE.'.format(table_info.table_name)
        logger.error(error)
        raise RuntimeError(error)
    table_info.table_obj = obj
def test_get_dataset_metadata_when_table_exists(self, dcs_mock, ds_mock,
                                                initdb_mock):
    # When the dataset exists, its schema should be filled in from the
    # column service and the enriched object returned.
    catalog = CatalogManager()
    dataset_id = 1
    schema = [1, 2, 3]
    metadata = MagicMock(id=dataset_id, schema=None)
    ds_mock.return_value.dataset_object_by_name.return_value = metadata
    dcs_mock.return_value.columns_by_id_and_dataset_id.return_value = schema
    result = catalog.get_dataset_metadata("database", "name")
    ds_mock.return_value.dataset_object_by_name.assert_called_with(
        "database", "name")
    dcs_mock.return_value.columns_by_id_and_dataset_id.assert_called_with(
        dataset_id, None)
    self.assertEqual(result.id, dataset_id)
    self.assertEqual(result.schema, schema)
def setUpClass(cls):
    # Reset the catalog, then stage and load both sample videos and
    # register the inbuilt UDFs.
    CatalogManager().reset()
    create_sample_video()
    copy_sample_video_to_prefix()
    execute_query_fetch_all(
        """LOAD DATA INFILE 'dummy.avi' INTO MyVideo;""")
    execute_query_fetch_all(
        """LOAD DATA INFILE 'ua_detrac.mp4' INTO UATRAC;""")
    load_inbuilt_udfs()
def handle_if_not_exists(table_ref: TableRef, if_not_exist=False):
    """Check whether the table in *table_ref* already exists.

    Arguments:
        table_ref (TableRef): table to look up in the catalog
        if_not_exist (bool): True when the statement carries IF NOT EXISTS

    Returns:
        bool: True if the table already exists and IF NOT EXISTS was
        given (caller should skip creation); False if the table does
        not exist.

    Raises:
        RuntimeError: if the table exists and IF NOT EXISTS was not given
    """
    if not CatalogManager().check_table_exists(
            table_ref.table.database_name, table_ref.table.table_name):
        return False
    # fix: 'exsits' typo in the user-facing message
    err_msg = 'Table: {} already exists'.format(table_ref)
    if if_not_exist:
        # fix: logger.warn is deprecated in favor of logger.warning
        logger.warning(err_msg)
        return True
    logger.error(err_msg)
    raise RuntimeError(err_msg)
def test_create_metadata_should_create_dataset_and_columns(
        self, dcs_mock, ds_mock, initdb_mock):
    # create_metadata must create the dataset, attach the dataset id to
    # every column, persist the columns, and return the dataset whose
    # schema is the created column set.
    catalog = CatalogManager()
    columns = [DataFrameColumn("c1", ColumnType.INTEGER)]
    result = catalog.create_metadata("name", "file1", columns)
    ds_mock.return_value.create_dataset.assert_called_with(
        "name", "file1", identifier_id='id', is_video=False)
    created_dataset = ds_mock.return_value.create_dataset.return_value
    for column in columns:
        column.metadata_id = created_dataset.id
    dcs_mock.return_value.create_column.assert_called_with(columns)
    created_dataset.schema = \
        dcs_mock.return_value.create_column.return_value
    self.assertEqual(result, created_dataset)
def setUp(self):
    # Fresh catalog plus a sample video and a dummy detector UDF for
    # each test in this class.
    CatalogManager().reset()
    create_sample_video(NUM_FRAMES)
    execute_query_fetch_all(
        """LOAD DATA INFILE 'dummy.avi' INTO MyVideo;""")
    create_udf_query = """CREATE UDF DummyObjectDetector
        INPUT (Frame_Array NDARRAY UINT8(3, 256, 256))
        OUTPUT (label NDARRAY STR(10))
        TYPE Classification
        IMPL 'test/util.py';
    """
    execute_query_fetch_all(create_udf_query)
def test_create_plan(self):
    # A CreatePlan should retain the target table, the column list and
    # the if-not-exists flag.
    CatalogManager().reset()
    target = TableRef(TableInfo('dummy'))
    column_list = [
        DataFrameColumn('id', ColumnType.INTEGER),
        DataFrameColumn('name', ColumnType.TEXT, array_dimensions=[50]),
    ]
    plan = CreatePlan(target, column_list, False)
    self.assertEqual(plan.opr_type, PlanOprType.CREATE)
    self.assertEqual(plan.if_not_exists, False)
    self.assertEqual(plan.table_ref.table.table_name, "dummy")
    self.assertEqual(plan.column_list[0].name, "id")
    self.assertEqual(plan.column_list[1].name, "name")
def column_definition_to_udf_io(
        col_list: List[ColumnDefinition], is_input: bool):
    """Create a UdfIO object for each column definition provided.

    Arguments:
        col_list (List[ColumnDefinition]): parsed input/output definitions
        is_input (bool): True for input definitions, False for outputs

    Returns:
        list: UdfIO objects (a None entry is kept for an empty definition)
    """
    if isinstance(col_list, ColumnDefinition):
        col_list = [col_list]
    result_list = []
    for col in col_list:
        if col is None:
            # fix: the original fell through after logging and crashed
            # on ``col.name`` below; keep the None placeholder and skip
            logger.error("Empty column definition while creating udf io")
            result_list.append(col)
            continue
        result_list.append(
            CatalogManager().udf_io(col.name, col.type,
                                    array_type=col.array_type,
                                    dimensions=col.dimension,
                                    is_input=is_input))
    return result_list
def test_should_drop_table(self):
    # Load a video, then drop it and verify that metadata, columns and
    # the on-disk data are all removed.
    catalog_manager = CatalogManager()
    execute_query_fetch_all(
        """LOAD DATA INFILE 'dummy.avi' INTO MyVideo;""")
    metadata_obj = catalog_manager.get_dataset_metadata(None, "MyVideo")
    video_dir = metadata_obj.file_url
    self.assertFalse(metadata_obj is None)
    self.assertEqual(
        len(catalog_manager.get_all_column_objects(metadata_obj)), 2)
    self.assertTrue(Path(video_dir).exists())

    execute_query_fetch_all("""DROP TABLE MyVideo;""")
    self.assertTrue(
        catalog_manager.get_dataset_metadata(None, "MyVideo") is None)
    self.assertEqual(
        len(catalog_manager.get_all_column_objects(metadata_obj)), 0)
    self.assertFalse(Path(video_dir).exists())
def create_column_metadata(col_list: List[ColumnDefinition]):
    """Create column metadata for the input parsed column list.

    This function will not commit the provided columns into the catalog
    table; it only returns an in-memory list of column objects.

    Arguments:
        col_list {List[ColumnDefinition]} -- parsed col list to be created

    Returns:
        list: column metadata objects (a None entry is kept for an
        empty definition)
    """
    if isinstance(col_list, ColumnDefinition):
        col_list = [col_list]
    result_list = []
    for col in col_list:
        if col is None:
            # fix: the original fell through after logging and crashed
            # on ``col.name`` below; keep the None placeholder and skip
            logger.error(
                "Empty column while creating column metadata")
            result_list.append(col)
            continue
        result_list.append(
            CatalogManager().create_column_metadata(
                col.name, col.type, col.array_type, col.dimension))
    return result_list
def test_should_rename_table(self):
    # After RENAME, the metadata must be reachable only under the new
    # table name.
    catalog_manager = CatalogManager()
    execute_query_fetch_all(
        """LOAD DATA INFILE 'dummy.avi' INTO MyVideo;""")
    self.assertTrue(
        catalog_manager.get_dataset_metadata(None, "MyVideo") is not None
    )
    self.assertTrue(
        catalog_manager.get_dataset_metadata(None, "MyVideo1") is None
    )

    execute_query_fetch_all("""RENAME TABLE MyVideo TO MyVideo1;""")
    self.assertTrue(
        catalog_manager.get_dataset_metadata(None, "MyVideo") is None
    )
    self.assertTrue(
        catalog_manager.get_dataset_metadata(None, "MyVideo1") is not None
    )
def setUp(self):
    # Register the UDFs every test in this class depends on.
    CatalogManager().reset()
    for udf_query in (Fastrcnn_udf_query, ArrayCount_udf_query):
        execute_query_fetch_all(udf_query)
def __init__(self, binder_context: StatementBinderContext):
    """Initialize the binder with a statement-scoped binding context."""
    # Context tracking the table aliases/columns visible while binding.
    self._binder_context = binder_context
    # Catalog handle used for metadata lookups during binding.
    self._catalog = CatalogManager()
class StatementBinder:
    """Binds parsed statement/expression nodes against the catalog.

    Dispatches on node type via ``singledispatchmethod`` and annotates
    nodes in place (aliases, column objects, UDF callables).
    """

    def __init__(self, binder_context: StatementBinderContext):
        # Context tracking the table aliases/columns visible while binding.
        self._binder_context = binder_context
        # Catalog handle used for metadata lookups during binding.
        self._catalog = CatalogManager()

    @singledispatchmethod
    def bind(self, node):
        # Fallback for node types without a registered handler.
        raise NotImplementedError(f'Cannot bind {type(node)}')

    @bind.register(AbstractStatement)
    def _bind_abstract_statement(self, node: AbstractStatement):
        # Statements with no registered handler need no binding work.
        pass

    @bind.register(AbstractExpression)
    def _bind_abstract_expr(self, node: AbstractExpression):
        # Generic expressions: recursively bind every child.
        for child in node.children:
            self.bind(child)

    @bind.register(SelectStatement)
    def _bind_select_statement(self, node: SelectStatement):
        # Bind FROM first so the target list can resolve its columns.
        self.bind(node.from_table)
        if node.where_clause:
            self.bind(node.where_clause)
        if node.target_list:
            # SELECT * support
            if len(node.target_list) == 1 and \
                    isinstance(node.target_list[0], TupleValueExpression) and \
                    node.target_list[0].col_name == '*':
                node.target_list = extend_star(self._binder_context)
            for expr in node.target_list:
                self.bind(expr)
        if node.orderby_list:
            # Each orderby entry is a (expression, sort-order) pair;
            # only the expression needs binding.
            for expr in node.orderby_list:
                self.bind(expr[0])
        if node.union_link:
            # The union side binds in its own fresh context; restore
            # the current one afterwards.
            current_context = self._binder_context
            self._binder_context = StatementBinderContext()
            self.bind(node.union_link)
            self._binder_context = current_context

    @bind.register(CreateMaterializedViewStatement)
    def _bind_create_mat_statement(self,
                                   node: CreateMaterializedViewStatement):
        self.bind(node.query)
        # Todo Verify if the number projected columns matches table

    @bind.register(LoadDataStatement)
    def _bind_load_data_statement(self, node: LoadDataStatement):
        table_ref = node.table_ref
        if node.file_options['file_format'] == FileFormatType.VIDEO:
            # Create a new metadata object
            create_video_metadata(table_ref.table.table_name)
        self.bind(table_ref)
        table_ref_obj = table_ref.table.table_obj
        if table_ref_obj is None:
            error = '{} does not exists. Create the table using \
CREATE TABLE.'.format(table_ref.table.table_name)
            logger.error(error)
            raise RuntimeError(error)
        # if query had columns specified, we just copy them
        if node.column_list is not None:
            column_list = node.column_list
        # else we curate the column list from the metadata
        else:
            column_list = []
            for column in table_ref_obj.columns:
                column_list.append(
                    TupleValueExpression(
                        col_name=column.name,
                        table_alias=table_ref_obj.name.lower(),
                        col_object=column))
        # bind the columns
        for expr in column_list:
            self.bind(expr)
        node.column_list = column_list

    @bind.register(DropTableStatement)
    def _bind_drop_table_statement(self, node: DropTableStatement):
        # Resolve every target table against the catalog.
        for table in node.table_refs:
            self.bind(table)

    @bind.register(TableRef)
    def _bind_tableref(self, node: TableRef):
        if node.is_table_atom():
            # Table
            self._binder_context.add_table_alias(
                node.alias, node.table.table_name)
            bind_table_info(node.table)
        elif node.is_select():
            # A subquery binds in its own context; its target list then
            # becomes a derived-table alias in the outer context.
            current_context = self._binder_context
            self._binder_context = StatementBinderContext()
            self.bind(node.select_statement)
            self._binder_context = current_context
            self._binder_context.add_derived_table_alias(
                node.alias, node.select_statement.target_list)
        elif node.is_join():
            self.bind(node.join_node.left)
            self.bind(node.join_node.right)
            if node.join_node.predicate:
                self.bind(node.join_node.predicate)
        elif node.is_func_expr():
            # A table-valued function expression acts as a derived table.
            self.bind(node.func_expr)
            self._binder_context.add_derived_table_alias(
                node.func_expr.alias, [node.func_expr])
        else:
            raise ValueError(f'Unsupported node {type(node)}')

    @bind.register(TupleValueExpression)
    def _bind_tuple_expr(self, node: TupleValueExpression):
        # Resolve the column against the visible aliases and attach the
        # column object plus a qualified alias.
        table_alias, col_obj = self._binder_context.get_binded_column(
            node.col_name, node.table_alias)
        node.col_alias = '{}.{}'.format(table_alias, node.col_name.lower())
        node.col_object = col_obj

    @bind.register(FunctionExpression)
    def _bind_func_expr(self, node: FunctionExpression):
        # bind all the children
        for child in node.children:
            self.bind(child)
        node.alias = node.alias or node.name.lower()
        udf_obj = self._catalog.get_udf_by_name(node.name)
        assert udf_obj is not None, (
            'UDF with name {} does not exist in the catalog. Please '
            'create the UDF using CREATE UDF command'.format(node.name))
        output_objs = self._catalog.get_udf_outputs(udf_obj)
        if node.output:
            # A specific output column was requested; find it among the
            # UDF's declared outputs.
            for obj in output_objs:
                if obj.name.lower() == node.output:
                    node.output_col_aliases.append('{}.{}'.format(
                        node.alias, obj.name.lower()))
                    node.output_objs = [obj]
            assert len(node.output_col_aliases) == 1, (
                'Duplicate columns {} in UDF {}'.format(
                    node.output, udf_obj.name))
        else:
            # No explicit output: expose every declared output column.
            node.output_col_aliases = [
                '{}.{}'.format(node.alias, obj.name.lower())
                for obj in output_objs
            ]
            node.output_objs = output_objs
        # Instantiate the UDF implementation class from its file path.
        node.function = path_to_class(udf_obj.impl_file_path,
                                      udf_obj.name)()
def setUp(self):
    # reset the catalog manager before running each test
    # and regenerate the sample video fixture.
    CatalogManager().reset()
    create_sample_video()