def get_if_schema_and_table_exists(metastore_id, schema_name, table_name) -> Tuple[bool, bool]:
    """Check if the table name / schema name exists in cache, then check
    the actual metastore if they don't exist.

    Args:
        metastore_id: id of the metastore to check against
        schema_name: name of the schema the table belongs to
        table_name: name of the table to look up

    Returns:
        Tuple[bool, bool]: (schema_exists, table_exists)
    """
    verify_metastore_permission(metastore_id)

    with DataTableFinder(metastore_id) as t_finder:
        # Cheapest check first: a cached table implies the schema exists too
        table_exists_in_cache = t_finder.get_table_by_name(
            schema_name, table_name)
        if table_exists_in_cache:
            return (True, True)

        # Not cached yet — ask the live metastore directly
        metastore_loader = get_metastore_loader(metastore_id)
        table_exists = metastore_loader.check_if_table_exists(
            schema_name, table_name)
        if table_exists:
            return (True, True)

        # Table missing; fall back to schema-only existence checks
        schema_exists_in_cache = t_finder.get_schema_by_name(schema_name)
        if schema_exists_in_cache:
            return (True, False)

        schema_exists = metastore_loader.check_if_schema_exists(schema_name)
        if schema_exists:
            return (True, False)

        return (False, False)
def get_metastore(): """Lazily initialize metastore_loader from DB. Use outer-scope variable to memoized initialization Raises: LatestPartitionException: If the metastore does not exist for engine_id, throw error Returns: BaseMetastoreLoader: metastore loader to fetch table/schema info """ nonlocal _metastore_loader if _metastore_loader is not None: return _metastore_loader with DBSession() as session: engine = admin_logic.get_query_engine_by_id(engine_id, session=session) metastore_id = engine.metastore_id if engine else None _metastore_loader = (metastore.get_metastore_loader( metastore_id, session=session) if metastore_id is not None else None) if _metastore_loader is None: raise LatestPartitionException( f"Unable to load metastore for engine id {engine_id}") return _metastore_loader
def destination_s3_root(self, session=None) -> str:
    """Generate the bucket name + prefix before the table specific folder.

    Returns:
        str: s3 path consisting of bucket + prefix + schema name

    Raises:
        Exception: if neither "s3_path" nor "use_schema_location" is
            configured, or the metastore cannot serve schema locations
    """
    exporter_config = self._exporter_config

    # Option 1: an explicit s3_path in the exporter config wins
    if "s3_path" in exporter_config:
        schema_name, _ = self._fq_table_name
        configured_path: str = exporter_config["s3_path"]
        return (
            sanitize_s3_url_with_trailing_slash(configured_path)
            + schema_name
            + "/"
        )

    # Option 2: derive the location from the Hive metastore's schema URI
    if exporter_config.get("use_schema_location", False):
        # Defer import since this is only needed for this option
        from lib.metastore.loaders.hive_metastore_loader import HMSMetastoreLoader

        query_engine = get_query_engine_by_id(self._engine_id, session=session)
        loader: HMSMetastoreLoader = get_metastore_loader(
            query_engine.metastore_id, session=session)
        # Only HMS-backed metastores expose a per-schema location URI
        if loader is None or not isinstance(loader, HMSMetastoreLoader):
            raise Exception(
                "Invalid metastore to use use_schema_location option")

        schema_location_uri = loader.hmc.get_database(
            self._table_config["schema_name"]).locationUri
        return sanitize_s3_url_with_trailing_slash(schema_location_uri)

    raise Exception("Must specify s3_path or set use_schema_location=True")
def _get_metastore_loader(self, session=None):
    """Return the metastore loader for this engine, or None if the
    engine has no metastore configured."""
    query_engine = get_query_engine_by_id(self._engine_id, session=session)
    if query_engine.metastore_id is None:
        return None
    return get_metastore_loader(query_engine.metastore_id, session=session)
def refresh_table_from_metastore(table_id):
    """Refetch table info from metastore and return the refreshed table."""
    with DBSession() as session:
        verify_data_table_permission(table_id, session=session)

        table = logic.get_table_by_id(table_id, session=session)
        schema = table.data_schema

        loader = get_metastore_loader(schema.metastore_id, session=session)
        loader.sync_create_or_update_table(
            schema.name, table.name, session=session)

        # Reload the ORM object so callers see the freshly-synced state
        session.refresh(table)
        return table
def sync_table_to_metastore(table_per_statement, statement_types, metastore_id, session=None):
    """Reconcile the metastore cache with tables touched by a query.

    Walks the statements in order so a later DROP cancels an earlier
    create/update (and vice versa), then applies the resulting deletes
    and upserts through the metastore loader.

    Args:
        table_per_statement: per-statement iterables of "schema.table" names
        statement_types: statement type string per statement (same order)
        metastore_id: id of the metastore to sync against
        session: optional DB session passed through to logic/loader calls
    """
    metastore_loader = get_metastore_loader(metastore_id, session=session)
    assert metastore_loader is not None

    tables_to_add = set()
    tables_to_remove = set()

    for tables, statement_type in zip(table_per_statement, statement_types):
        if statement_type == "DROP":
            # Dropped tables must only be removed, never upserted
            for full_name in tables:
                tables_to_add.discard(full_name)
                tables_to_remove.add(full_name)
        elif statement_type is not None:  # Any other DML/DDL
            for full_name in tables:
                tables_to_remove.discard(full_name)
                if full_name in tables_to_add:
                    # Already scheduled — skip the extra checks below
                    continue
                if statement_type in ("CREATE", "ALTER"):
                    # Create or alter always requires a metastore update
                    tables_to_add.add(full_name)
                else:
                    # For insert/select etc. only update the metastore if
                    # the table isn't already known
                    schema_name, table_name = full_name.split(".")
                    existing_table = m_logic.get_table_by_name(
                        schema_name,
                        table_name,
                        metastore_id=metastore_id,
                        session=session,
                    )
                    if not existing_table:
                        tables_to_add.add(full_name)

    for full_name in tables_to_remove:
        schema_name, table_name = full_name.split(".")
        metastore_loader.sync_delete_table(
            schema_name, table_name, session=session)

    for full_name in tables_to_add:
        schema_name, table_name = full_name.split(".")
        metastore_loader.sync_create_or_update_table(
            schema_name, table_name, session=session)
def log_table_per_statement(
    table_per_statement,
    statement_types,
    query_execution_id,
    metastore_id,
    cell_id,
    session=None,
):
    """Log which known tables a query execution referenced.

    Only SELECT and INSERT statements are recorded as example queries;
    tables not found in the metastore cache are skipped.
    """
    metastore_loader = get_metastore_loader(metastore_id, session=session)
    assert metastore_loader is not None

    # Gather the distinct tables referenced by loggable statements
    referenced_tables = set()
    for tables, statement_type in zip(table_per_statement, statement_types):
        if statement_type in ("SELECT", "INSERT"):
            referenced_tables.update(tables)

    for full_name in referenced_tables:
        schema_name, table_name = full_name.split(".")
        query_table = m_logic.get_table_by_name(
            schema_name,
            table_name,
            metastore_id=metastore_id,
            session=session,
        )
        if not query_table:
            # Sanity check: only log tables the metastore knows about
            continue
        m_logic.delete_old_able_query_execution_log(
            cell_id=cell_id,
            query_execution_id=query_execution_id,
            commit=False,
            session=session,
        )
        m_logic.create_table_query_execution_log(
            table_id=query_table.id,
            cell_id=cell_id,
            query_execution_id=query_execution_id,
            session=session,
        )