def load(self, record) -> None:
    """
    Create the table's markdown stub if it is missing, then update it with `record`.

    The stub path is built by `get_table_file_path_base` from the database,
    cluster, schema and table names, under the configured `base_directory`,
    with a ``.md`` suffix. When `self.tmp_manifest_path` is set, the table is
    also appended to the temporary manifest.

    :param record: The record to load. `MetricValue` and `Watermark` records
        supply the table name through `record.table`; any other record
        supplies it through `record.name`. `schema`, `cluster` and `database`
        are read from every record. Falsy records are ignored.
    :return: None
    """
    if not record:
        return

    if type(record) in [MetricValue, Watermark]:
        table = record.table
    else:
        table = record.name

    schema = record.schema
    cluster = record.cluster
    record_database = record.database

    # Guard against a missing database on the record: `"/" in None` would
    # raise TypeError even though `self.database_name` could still be used.
    if record_database and "/" in record_database:
        # TODO: In general, we should always use self.database_name, unless
        # we override the amundsen extractor and add subdirectories ...
        database = record_database
    else:
        # ... so we have to do this.
        database = self.database_name or record_database

    if cluster == "None":  # edge case for Hive Metastore
        cluster = None

    table_file_path_base = get_table_file_path_base(
        database=database,
        cluster=cluster,
        schema=schema,
        table=table,
        base_directory=self.conf.get("base_directory"),
    )
    file_path = table_file_path_base + ".md"

    # Make sure the directory that will hold the stub exists.
    Path(file_path).parent.mkdir(parents=True, exist_ok=True)

    if not os.path.exists(file_path):
        create_base_table_stub(
            file_path=file_path,
            database=database,
            cluster=cluster,
            schema=schema,
            table=table,
        )

    update_markdown(file_path, record)

    if self.tmp_manifest_path is not None:
        _append_to_temp_manifest(
            database=database,
            cluster=cluster,
            schema=schema,
            table=table,
            tmp_manifest_path=self.tmp_manifest_path,
        )
def _get_extract_iter(self):
    """
    Yield metadata (and optionally stats) for tables that already have a stub.

    Runs `self._sql_stmt_schemas` to list schemas, keeps those that pass the
    exclude/include filters, lists the tables of each one with
    ``show tables in ...`` and, for every table whose ``.md`` stub exists:

    * yields the result of `get_table_metadata` when table metadata is enabled,
    * calls `get_analyze` when analyze is enabled,
    * yields every item produced by `get_stats` when stats are enabled.

    Tables without a stub are skipped and logged.

    :return: A generator over the values produced by `get_table_metadata`
        and `get_stats`.
    """
    schemas = self.execute(self._sql_stmt_schemas)
    for schema_row in schemas:
        schema = schema_row[0]
        LOGGER.info('Fetching all tables in {}.'.format(schema))

        if schema in self._excluded_schemas:
            continue
        # An empty include list places no restriction on the schema.
        if self._included_schemas and schema not in self._included_schemas:
            continue

        full_schema_address = '.'.join(filter(None, [self._cluster, schema]))
        # Materialized so the table count is known for progress logging.
        tables = list(
            self.execute('show tables in {}'.format(full_schema_address)))
        n_tables = len(tables)
        LOGGER.info('There are {} tables in {}.'.format(n_tables, schema))

        for i, table_row in enumerate(tables):
            # Log progress on every tenth table and on the last one.
            if (i % 10 == 0) or (i == n_tables - 1):
                LOGGER.info('On table {} of {}'.format(i + 1, n_tables))

            table = table_row[0]
            file_name = get_table_file_path_base(
                database=self._database,
                cluster=self._cluster,
                schema=schema,
                table=table,
            )

            # Only update if the stub already exists
            if not os.path.exists(file_name + '.md'):
                # The previous message claimed the file "already exists",
                # which is the opposite of the condition that leads here.
                LOGGER.info(
                    'Skipping {}.{} because the stub file does not exist.'.
                    format(schema, table))
                continue

            if self._is_table_metadata_enabled:
                table_metadata = self.get_table_metadata(
                    schema,
                    table,
                    cluster=self._cluster,
                    is_view_query_enabled=self._is_view_query_enabled)
                yield table_metadata

            if self._is_analyze_enabled:
                self.get_analyze(schema, table, self._cluster)

            if self._is_stats_enabled:
                stats_generator = self.get_stats(schema, table, self._cluster)
                yield from stats_generator
def load(self, record) -> None:
    """
    Ensure a markdown stub exists for the record's table and update it.

    The stub is created when no file exists at the computed path; the file is
    then updated with `record`. If `self.tmp_manifest_path` is set, the table
    is additionally appended to the temporary manifest.

    :param record: The record to load; falsy records are ignored.
    :return: None
    """
    if not record:
        return

    is_table_scoped = type(record) in (MetricValue, Watermark)
    table_name = record.table if is_table_scoped else record.name
    schema_name = record.schema
    cluster_name = record.cluster
    database_name = self.database_name or record.database

    # edge case for Hive Metastore
    cluster_name = None if cluster_name == "None" else cluster_name

    # Identifiers shared by every helper call below.
    identifiers = dict(
        database=database_name,
        cluster=cluster_name,
        schema=schema_name,
        table=table_name,
    )

    stub_path = get_table_file_path_base(
        **identifiers,
        base_directory=self.conf.get('base_directory'),
    ) + '.md'

    # Everything before the last '/' is the directory holding the stub.
    parent_dir = stub_path.rpartition('/')[0]
    Path(parent_dir).mkdir(parents=True, exist_ok=True)

    if not os.path.exists(stub_path):
        create_base_table_stub(file_path=stub_path, **identifiers)

    update_markdown(stub_path, record)

    if self.tmp_manifest_path is None:
        return
    _append_to_temp_manifest(
        **identifiers,
        tmp_manifest_path=self.tmp_manifest_path,
    )