def build_and_run_analysis_command(table_name, schema,
                                   partition_clause='', columns_clause=''):
    """
    Base function for ANALYZE TABLE commands. Builds the command based off
    of the parameters provided by the ANALYZE TABLE convenience functions.

    The Hive commands used for the different types of ANALYZE TABLE commands
    are similar enough to be built off of one template, but semantically
    different enough that having separate convenience functions will help
    clarify the intent of the user.

    Args:
        table_name (str): The name of the table to analyze
        schema (str): The schema that contains the table
        partition_clause (str): Clause of the command designating which
            partitions are to be analyzed
        columns_clause (str): Clause of the command designating which
            columns are to be analyzed
    """
    analyze_command = ('ANALYZE TABLE {}.{} {}COMPUTE STATISTICS{}').format(
        schema, table_name, partition_clause, columns_clause)
    inform(analyze_command)
    hive.run_lake_query(analyze_command)
def add_partition(table_name, schema, partition_values, partition_path=None):
    """
    Adds a new partition to an existing table in the lake.

    Args:
        table_name (str): The name of the table to add the partition to
        schema (str): The schema the table is in
        partition_values (dict<str:str>): Mapping from partition name to
            partition value, identifying the partition to be added
        partition_path (str, optional): Location to store the partition's
            underlying files at. If not provided, one is generated from
            the partition values

    Returns:
        partition_path (str): The path the partition's files are stored at
    """
    table_name, schema = meta.prep_schema_and_table(table_name, schema)
    partition_strings = build_partition_strings(partition_values)

    if partition_path is None:
        # Datetimes cast to str will by default provide an invalid path
        partition_path = '/'.join([
            val if not isinstance(val, datetime) else str(val.date())
            for val in partition_values.values()
        ]) + '/'
    else:
        partition_path = meta.validate_table_path(partition_path, table_name)

    if not check.partition_existence(table_name, schema, partition_values):
        add_partition_query = ('ALTER TABLE {}.{} ADD IF NOT EXISTS '
                               'PARTITION ({}) LOCATION \'{}\''.format(
                                   schema, table_name,
                                   partition_strings, partition_path))
        inform(add_partition_query)
        hive.run_lake_query(add_partition_query, engine='hive')
    else:
        # 'logging.warn' is deprecated; use 'logging.warning'
        logging.warning('Partition ({}) already exists in table.'.format(
            partition_strings))

    return partition_path
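
# Example (hypothetical usage sketch, not part of the original module; the
# schema, table, and partition values are illustrative only):
#
#   from datetime import datetime
#
#   # Register a new partition. With partition_path=None, the location is
#   # derived from the partition values; datetime values are reduced to
#   # their date, e.g. '2021-01-01/'.
#   partition_path = add_partition(
#       'events', 'experimental',
#       partition_values={'event_date': datetime(2021, 1, 1)})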
def describe_table(table_name, schema=None, include_metadata=False):
    """
    Retrieves the description of a specific table in Hive

    Args:
        table_name (str): The name of the table to be queried
        schema (str): The name of the schema to search for the table in
        include_metadata (bool): Whether the returned DataFrame should
            contain just column names, types, and comments, or more
            detailed information such as storage location and type,
            partitioning metadata, etc.

    Returns:
        desc (pd.DataFrame): A DataFrame containing descriptive information
            on the specified table
    """
    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    # Presto does not support the 'FORMATTED' keyword, so
    # we're locking the engine for 'DESCRIBE' queries to Hive
    desc_query = 'DESCRIBE {formatted}{schema}.{table_name}'.format(
        formatted=('FORMATTED ' if include_metadata else ''),
        schema=schema,
        table_name=table_name)
    desc = hive.run_lake_query(desc_query, engine='hive')

    if include_metadata:
        desc = desc.loc[1:].reset_index(drop=True)
    return desc
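
# Example (hypothetical usage sketch; the table name is illustrative only):
#
#   # Column names, types, and comments
#   cols = describe_table('events', schema='experimental')
#
#   # Full 'DESCRIBE FORMATTED' output, including storage location and
#   # partitioning metadata
#   full_desc = describe_table('events', schema='experimental',
#                              include_metadata=True)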
def __nuke_table(table_name, schema):
    """
    USE AT YOUR OWN RISK. THIS OPERATION IS NOT REVERSIBLE.

    Drop a table from the lake metastore and completely remove all of
    its underlying files from S3.

    Args:
        table_name (str): Name of the table to drop
        schema (str): Schema the table is in
    """
    table_metadata = meta.get_table_metadata(table_name, schema)
    bucket = table_metadata['bucket']
    path = meta.ensure_path_ends_w_slash(table_metadata['path'])

    hive.run_lake_query('DROP TABLE IF EXISTS {}.{}'.format(
        schema, table_name),
        engine='hive')
    rv.delete(path, bucket, recursive=True)
def check_schema_existence(schema):
    """
    Checks if a given schema exists in the lake

    Args:
        schema (str): The name of the schema to check for

    Returns:
        bool: Whether or not the specified schema exists
    """
    show_schemas_query = ('SHOW SCHEMAS LIKE \'{schema}\''.format(
        schema=schema))

    similar_schemas = hive.run_lake_query(show_schemas_query)
    if similar_schemas is not None:
        # NOTE: 'database' and 'schema' are interchangeable terms in Hive
        # Membership must be checked against the column's values; checking
        # against the Series itself would test the index instead
        if schema in similar_schemas['database_name'].values:
            return True
    return False
def __nuke_partition(table_name, schema, partition_values):
    """
    USE AT YOUR OWN RISK. THIS OPERATION IS NOT REVERSIBLE.

    Drop a partition from a table and completely remove all of
    its underlying files from S3.

    Args:
        table_name (str): Name of the table to drop
        schema (str): Schema the table is in
        partition_values (dict<str:str>): Mapping from partition name to
            partition value, identifying the partition to be nuked
    """
    partition_string = ', '.join([
        '{}=\'{}\''.format(partition_key, partition_value)
        for partition_key, partition_value in partition_values.items()
    ])

    partition_metadata = hive.run_lake_query(
        'DESCRIBE FORMATTED {}.{} PARTITION ({})'.format(
            schema, table_name, partition_string),
        engine='hive')

    # The DataFrame returned by DESCRIBE queries is not organized like a
    # normal DataFrame, hence the inaccurate column names
    partition_location = partition_metadata.loc[
        partition_metadata['col_name'].str.strip() == 'Location:',
        'data_type'].values[0]

    uri_prefix = 's3://'
    bucket, path = partition_location[len(uri_prefix):].split('/', 1)
    path = meta.ensure_path_ends_w_slash(path)

    hive.run_lake_query(
        'ALTER TABLE {}.{} DROP IF EXISTS PARTITION ({})'.format(
            schema, table_name, partition_string),
        engine='hive')
    rv.delete(path, bucket, recursive=True)
def check_table_existence(table_name, schema):
    """
    Checks if a specific table exists in a specific schema

    Args:
        schema (str): Which schema to check for the table in
        table_name (str): The name of the table to check for

    Returns:
        bool: Whether or not the specified table exists
    """
    show_tables_query = (
        'SHOW TABLES IN {schema} LIKE \'{table_name}\''.format(
            schema=schema,
            table_name=table_name))

    similar_tables = hive.run_lake_query(show_tables_query, engine='hive')
    if table_name in similar_tables['tab_name'].values:
        return True
    return False
def get_table_storage_type(table_name, schema):
    """
    Identifies the format a table's underlying files are stored in
    using the table's metadata.

    Args:
        table_name (str): The name of the table being examined
        schema (str): The schema the table is in

    Returns:
        storage_format (str): One of 'avro', 'parquet', 'orc', 'csv',
            or 'json'
    """
    create_stmt_query = create_stmt_query_template.format(
        schema=schema,
        table_name=table_name)
    table_metadata = hive.run_lake_query(create_stmt_query)

    hive_input_format_to_storage_type = {
        'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat': 'avro',
        'org.apache.hadoop.mapred.TextInputFormat': 'text',
        'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat':
            'parquet',
        'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat': 'orc'
    }

    format_label_idx = table_metadata.index[
        table_metadata['createtab_stmt'].str.strip() ==
        "STORED AS INPUTFORMAT"].values[0]
    input_format = table_metadata.loc[format_label_idx + 1,
                                      'createtab_stmt'].strip()[1:-1]

    storage_format = hive_input_format_to_storage_type[input_format]
    if storage_format == 'text':
        # Both CSV and JSON tables will have a storage format of 'text',
        # so we must further differentiate them by checking the
        # serde type
        serde_label_idx = table_metadata.index[
            table_metadata['createtab_stmt'].str.strip() ==
            "ROW FORMAT SERDE"].values[0]
        serde_type = table_metadata.loc[serde_label_idx + 1,
                                        'createtab_stmt'].strip()[1:-1]
        if serde_type == 'org.apache.hadoop.hive.serde2.JsonSerDe':
            storage_format = 'json'
        else:
            storage_format = 'csv'

    return storage_format
def check_partition_existence(table_name, schema, partition_values):
    """
    Checks if a specific partition exists in a specific table

    Args:
        schema (str): Which schema the table is in
        table_name (str): The name of the table to check in
        partition_values (dict<str:str>): A mapping from partition keys to
            the values being checked for

    Returns:
        bool: Whether or not the specified partition exists in the table
    """
    partition_value_strings = ', '.join([
        '{}=\'{}\''.format(partition_name, partition_value)
        for partition_name, partition_value in partition_values.items()
    ])
    partition_spec = ' PARTITION({})'.format(partition_value_strings)

    show_partitions_query = (
        'SHOW PARTITIONS {schema}.{table_name}{partition_spec}'.format(
            schema=schema,
            table_name=table_name,
            partition_spec=partition_spec))

    similar_partitions = hive.run_lake_query(show_partitions_query,
                                             engine='hive')['partition']
    if len(similar_partitions):
        for partition in similar_partitions.str.split('/'):
            existing_partition_values = {
                partition_key: partition_value
                for partition_key, partition_value in [
                    partition_value_str.split('=')
                    for partition_value_str in partition
                ]
            }
            if partition_values == existing_partition_values:
                return True
    return False
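
# Example (hypothetical usage sketch): 'SHOW PARTITIONS' returns strings of
# the form 'key1=value1/key2=value2', which the function splits on '/' and
# '=' to rebuild a dict for comparison. The names/values below are
# illustrative only.
#
#   exists = check_partition_existence(
#       'events', 'experimental',
#       partition_values={'event_date': '2021-01-01', 'region': 'us'})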
def get_table_s3_location(table_name, schema):
    """
    Extracts the underlying S3 location a table uses from its metadata

    Args:
        table_name (str): The name of the table being examined
        schema (str): The schema the table is in

    Returns:
        bucket (str), path (str): The S3 bucket the table's files are
            stored in and the path within that bucket
    """
    create_stmt_query = create_stmt_query_template.format(
        schema=schema,
        table_name=table_name)
    table_metadata = hive.run_lake_query(create_stmt_query)

    loc_label_idx = table_metadata.index[
        table_metadata['createtab_stmt'].str.strip() == "LOCATION"].values[0]
    location = table_metadata.loc[loc_label_idx + 1,
                                  'createtab_stmt'].strip()[1:-1]

    prefix = 's3://'
    full_path = location[len(prefix):]
    bucket, path = full_path.split('/', 1)
    return bucket, path
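
# Example (hypothetical sketch; the bucket and path are illustrative only):
# for a table whose LOCATION is 's3://my-lake-bucket/experimental/events/',
# this returns ('my-lake-bucket', 'experimental/events/').
#
#   bucket, path = get_table_s3_location('events', 'experimental')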
def ctas(select_stmt, table_name, schema=None, path=None,
         table_comment=None, col_comments=None, storage_type='parquet',
         overwrite=False):
    """
    Emulates the standard SQL 'CREATE TABLE AS SELECT' syntax.

    Under the hood, this function creates a view using the provided SELECT
    statement, and then performs an INSERT OVERWRITE from that view into
    the new table.

    Because this function uses INSERT OVERWRITE, there are considerable
    protections within this function to prevent accidental data loss. When
    an INSERT OVERWRITE command is done on an external table, all of the
    files in S3 at that table's path are deleted. If the table's path is,
    for example, the root of a bucket, there could be substantial data
    loss. As a result, we do our best to smartly assign table paths and
    prevent large-scale object deletion.

    Args:
        select_stmt (str): The SELECT statement to build a new table from
        table_name (str): The name of the table to be created
        schema (str): The schema the new table should be created in
        path (str): The path that the new table's underlying files will be
            stored at. If left unset, it will be set to a folder with the
            same name as the table, which is generally recommended
        table_comment (str, optional): Documentation on the table's purpose
        col_comments (dict<str:str>, optional): Dictionary from column name
            keys to column descriptions
        storage_type (str): The desired storage type of the new table
        overwrite (bool): Whether to overwrite or fail if a table already
            exists with the intended name of the new table in the selected
            schema
    """
    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    if schema == 'curated':
        check_for_allowed_overwrite(overwrite)
        if not os.getenv('HC_PROD_ENV'):
            raise ValueError(
                'Non-production CTAS functionality is currently disabled in '
                'the curated zone. Contact Data Engineering for '
                'further information.')

    bucket = schema_to_zone_bucket_map[schema]
    path = meta.validate_table_path(path, table_name)
    full_path = '/'.join([bucket, path])

    # If this function is used to overwrite a table that is being selected
    # from, we need to make sure that the original table is not dropped
    # before selecting from it (which happens at execution time of the
    # INSERT). In this case, we will temporarily rename the table. If any
    # section of the remainder of this function fails before the INSERT,
    # the table will be restored to its original name
    table_rename_template = 'ALTER TABLE {}.{} RENAME TO {}.{}'
    if '{}.{}'.format(schema, table_name) in select_stmt:
        if overwrite:
            source_table_name = table_name + '_temp_ctas_rename'
            select_stmt = re.sub(
                r'{}\.{}([\s,.]|$)'.format(schema, table_name),
                r'{}.{}\1'.format(schema, source_table_name),
                select_stmt)
            hive.run_lake_query(
                table_rename_template.format(schema, table_name,
                                             schema, source_table_name))
            table_renamed = True
        else:
            raise ValueError(
                'CTAS functionality must have \'overwrite\' set to True '
                'in order to overwrite one of the source tables of the '
                'SELECT statement.')
    # No rename needed
    else:
        source_table_name = table_name
        table_renamed = False

    try:
        temp_schema = 'experimental'
        view_name = '{}_temp_ctas_view'.format(table_name)
        create_view_stmt = 'CREATE VIEW {}.{} AS {}'.format(
            temp_schema, view_name, select_stmt)
        hive.run_lake_query(create_view_stmt)

        # If we DESCRIBE the view, we can get a list of all the columns
        # in the new table for building DDL and adding comments.
        # Useful in queries that involve JOINing, so you don't have to
        # build that column list yourself.
        col_defs = describe_table(view_name, schema=temp_schema)
        if schema == 'curated':
            check_for_comments(table_comment, col_defs['col_name'],
                               col_comments)

        create_table_ddl = build_create_table_ddl(table_name, schema,
                                                  col_defs, col_comments,
                                                  table_comment,
                                                  storage_type,
                                                  partitioned_by=None,
                                                  full_path=full_path)
        handle_existing_table(table_name, schema, overwrite)
        hive.run_lake_query(create_table_ddl)

        insert_overwrite_command = (
            'INSERT OVERWRITE TABLE {}.{} SELECT * FROM {}.{}').format(
                schema, table_name, temp_schema, view_name)
        hive.run_lake_query(insert_overwrite_command, complex_join=True)
    except Exception as e:
        # If an error occurred at any point in the above and a source table
        # was renamed, restore its original name
        if table_renamed:
            hive.run_lake_query(
                table_rename_template.format(schema, source_table_name,
                                             schema, table_name))
        raise e
    finally:
        # Regardless of success or failure of the above, we want to
        # drop the temporary view if it was created
        hive.run_lake_query('DROP VIEW IF EXISTS {}.{}'.format(
            temp_schema, view_name))

    # If the source table had to be renamed, it would not have been dropped
    # by the call to 'handle_existing_table', so we have to handle it here.
    # If it still shares a storage location with the new table, we just
    # drop it. Otherwise, we nuke it.
    if table_renamed:
        source_metadata = meta.get_table_metadata(source_table_name, schema)
        source_path = meta.ensure_path_ends_w_slash(source_metadata['path'])
        if source_path == path:
            hive.run_lake_query('DROP TABLE {}.{}'.format(
                schema, source_table_name))
        else:
            __nuke_table(source_table_name, schema)
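
# Example (hypothetical usage sketch; the schema, table, and column names
# are illustrative only):
#
#   ctas(
#       select_stmt='SELECT user_id, COUNT(*) AS n '
#                   'FROM experimental.events GROUP BY user_id',
#       table_name='event_counts',
#       schema='experimental',
#       table_comment='Event counts per user',
#       col_comments={'user_id': 'User identifier',
#                     'n': 'Number of events'},
#       overwrite=True)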
def build_and_run_ddl_stmt(df, table_name, schema, col_defs, storage_type,
                           bucket, path, filename, col_comments=None,
                           table_comment=None, partitioned_by=None,
                           partition_values=None, auto_upload_df=True,
                           avro_schema=None):
    """
    After preparation is performed in other calling functions, this function
    actually generates a CREATE TABLE command and runs it, optionally
    automatically uploading the DataFrame to the table as well

    Args:
        df (pd.DataFrame): The DataFrame to create the table from
        table_name (str): The name of the table to be created
        schema (str): The name of the schema to create the table in
        col_defs (pd.DataFrame): A DataFrame with two columns, 'names',
            containing column names, and 'dtypes', containing a string
            representation of each column's dtype
        storage_type (str): The desired storage type of the new table
        bucket (str): Bucket containing the table's files
        path (str): Path within bucket containing the table's files
        filename (str, optional): Name to store the file under. Used to
            determine storage format. Can be left blank if writing to the
            experimental zone, in which case a name will be generated and
            storage format will default to Parquet
        col_comments (dict<str:str>, optional): Dictionary from column name
            keys to column descriptions
        table_comment (str, optional): Documentation on the table's purpose
        partitioned_by (dict<str:str>, collections.OrderedDict<str:str>, or
                        list<tuple<str:str>>, optional):
            Dictionary or list of tuples containing a partition name and
            type. Cannot be a vanilla dictionary if using Python
            version < 3.6
        partition_values (dict<str:str>): Required if 'partitioned_by' is
            used and 'auto_upload_df' is True. Mapping from partition name
            to the value to store the dataframe under
        auto_upload_df (bool, default True): Whether the df that the table's
            structure will be based off of should be automatically uploaded
            to the table
        avro_schema (dict, optional): Schema to use when writing a DataFrame
            to an Avro file. If not provided, one will be auto-generated
    """
    # Gets settings to pass to rivet on how to write the files in a
    # Hive-readable format
    storage_settings = meta.storage_type_specs[storage_type]['settings']

    # tblproperties is for additional metadata to be provided to Hive
    # for the table. Generally, it is not needed
    tblproperties = {}

    if storage_type == 'avro':
        storage_settings, tblproperties = handle_avro_filetype(
            df, storage_settings, tblproperties, avro_schema, col_comments)

    full_path = '/'.join([bucket, path])
    create_table_ddl = build_create_table_ddl(table_name, schema, col_defs,
                                              col_comments, table_comment,
                                              storage_type, partitioned_by,
                                              full_path, tblproperties)
    inform(create_table_ddl)

    hive.run_lake_query(create_table_ddl, engine='hive')

    if partitioned_by and partition_values:
        path += add_partition(table_name, schema, partition_values)
    path += filename

    if auto_upload_df:
        # Creating the table doesn't populate it with data. Unless
        # auto_upload_df == False, we now need to write the DataFrame to a
        # file and upload it to S3
        _ = rv.write(df, path, bucket,
                     show_progressbar=False, **storage_settings)
def insert_into_orc_table(table_name, schema, source_table_name,
                          source_schema, partition_values=None,
                          hive_functions=None, matching_partitions=False,
                          allow_hive_reserved_words=False, overwrite=False):
    """
    Inserts all the values in a particular table into its corresponding ORC
    table. We can't simply do a SELECT *, because that would include
    partition columns, which cannot be included in an INSERT statement
    (since they're technically metadata, rather than part of the dataset
    itself)

    Args:
        table_name (str): The ORC table to be inserted into
        schema (str): The schema that the destination table is stored in
        source_table_name (str): The table to insert from
        source_schema (str): The schema that the source table is stored in
        partition_values (dict<str:str>, optional): The partition in the
            destination table to insert into
        hive_functions (optional): Specification of Hive functions to apply
            to specific columns during the insert; passed through to
            'insert_hive_fns_into_col_names'
        matching_partitions (bool, default False): Whether the partition
            being inserted to has a matching partition in the source table.
            Used for inserting subsets of a source table rather than the
            entire thing
        allow_hive_reserved_words (bool, default False): By default, Hive
            will not allow column names to use reserved words. This can be
            bypassed by wrapping the column name in backticks (`). Doing so
            is discouraged, but if the source table makes use of such
            columns and cannot be easily changed, setting this to True will
            allow the table to be inserted from
        overwrite (bool, default False): Whether the insert type should be
            'INTO' or 'OVERWRITE'
    """
    # List of reserved words in Hive that could reasonably be used as column
    # names. This list may expand with time
    hive_reserved_words = ['date', 'time', 'timestamp', 'order', 'primary']

    # This excludes partition columns, which is the desired behavior
    col_names = meta.get_table_column_order(table_name, schema)
    partition_strings = (' PARTITION ({})'.format(
        build_partition_strings(partition_values))
        if partition_values else '')

    for i in range(len(col_names)):
        if col_names[i] in hive_reserved_words:
            if allow_hive_reserved_words:
                col_names[i] = '`{}`'.format(col_names[i])
            else:
                raise ValueError(
                    'Source table has columns named with Hive reserved '
                    'words. If this is unavoidable, set '
                    '"allow_hive_reserved_words" to True.')

    if overwrite:
        insert_type = 'OVERWRITE'
    else:
        insert_type = 'INTO'

    if hive_functions is not None:
        col_names = insert_hive_fns_into_col_names(col_names, hive_functions)

    where_clause = ''
    if matching_partitions:
        where_clause = '\nWHERE ' + ' AND '.join([
            'source_table.{}="{}"'.format(partition_key, partition_value)
            for partition_key, partition_value in partition_values.items()
        ])

    insert_command = (
        'INSERT {} TABLE {}.{}{}\n'.format(insert_type, schema, table_name,
                                           partition_strings) +
        'SELECT\n'
        '    {}\n'.format(',\n    '.join(col_names)) +
        'FROM {}.{} source_table'.format(source_schema, source_table_name) +
        '{}'.format(where_clause))

    inform(insert_command)
    hive.run_lake_query(insert_command)
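
# Example (hypothetical usage sketch; the schemas, tables, and partition
# values are illustrative only):
#
#   # Copy one day of data from a staging table into the matching partition
#   # of its ORC counterpart
#   insert_into_orc_table(
#       'events', 'curated',
#       source_table_name='events_staging', source_schema='staging',
#       partition_values={'event_date': '2021-01-01'},
#       matching_partitions=True,
#       overwrite=True)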
def flash_update_table_from_df(df, table_name, schema=None, dtypes=None,
                               table_comment=None, col_comments=None,
                               timezones=None, copy_df=True):
    """
    Overwrites a single-file table with minimal table downtime.

    Similar to 'create_table_from_df' with overwrite=True, but only usable
    when the table consists of a single underlying file

    Args:
        df (pd.DataFrame): The DataFrame to create the table from
        table_name (str): The name of the table to be created
        schema (str): The name of the schema to create the table in
        dtypes (dict<str:str>, optional): A dictionary specifying dtypes
            for specific columns to be cast to prior to uploading
        table_comment (str, optional): Documentation on the table's purpose
        col_comments (dict<str:str>, optional): Dictionary from column name
            keys to column descriptions
        timezones (dict<str:str>): Dictionary from datetime columns to the
            timezone they represent. If the column is timezone-naive, it
            will have the timezone added to its metadata, leaving the times
            themselves unmodified. If the column is timezone-aware and is
            in a different timezone than the one that is specified, the
            column's timezone will be converted, modifying the original
            times
        copy_df (bool): Whether the operations performed on df should be
            performed on the original or a copy. Keep in mind that if this
            is set to False, the original df passed in will be modified as
            well - twice as memory efficient, but may be undesirable if the
            df is needed again later
    """
    # Less memory efficient, but prevents modification of original df
    if copy_df:
        df = df.copy()

    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    if schema == 'curated':
        check_for_comments(table_comment, df.columns, col_comments)
        if not os.getenv('HC_PROD_ENV'):
            raise ValueError(
                'Flash update functionality is only available in '
                'the experimental zone. Contact a lake administrator if '
                'modification of a non-experimental table is needed.')

    table_exists = check.table_existence(table_name, schema)
    if not table_exists:
        raise ValueError('Table {}.{} does not exist.'.format(
            schema, table_name))

    table_metadata = meta.get_table_metadata(table_name, schema)
    bucket = table_metadata['bucket']
    path = meta.ensure_path_ends_w_slash(table_metadata['path'])

    objects_present = rv.list_objects(path, bucket)
    if len(objects_present) > 1:
        # Flash updates are supposed to feel as close to atomic as possible.
        # Multi-file operations interfere with this.
        raise ValueError('Flash update functionality is only available on '
                         'tables that only consist of one underlying file.')
    if meta.is_partitioned_table(table_name, schema):
        # Difficult to deterministically restore partitions based on new data
        raise ValueError('Flash update functionality is not available on '
                         'partitioned tables.')

    if objects_present:
        filename = objects_present[0]
    else:
        filename = meta.gen_filename_if_allowed(schema)
    path += filename

    storage_type = get_storage_type_from_filename(filename)
    df, col_defs = prep_df_and_col_defs(df, dtypes, timezones, schema,
                                        storage_type)

    # Gets settings to pass to rivet on how to write the files in a
    # Hive-readable format
    storage_settings = meta.storage_type_specs[storage_type]['settings']

    # tblproperties is for additional metadata to be provided to Hive
    # for the table. Generally, it is not needed
    tblproperties = {}

    if storage_type == 'avro':
        storage_settings, tblproperties = handle_avro_filetype(
            df, storage_settings, tblproperties, col_comments)

    full_path = '/'.join([bucket, path])
    create_table_ddl = build_create_table_ddl(table_name, schema, col_defs,
                                              col_comments, table_comment,
                                              storage_type,
                                              partitioned_by=None,
                                              full_path=full_path,
                                              tblproperties=tblproperties)
    inform(create_table_ddl)

    drop_table_stmt = 'DROP TABLE IF EXISTS {}.{}'.format(schema, table_name)

    # Creating the table doesn't populate it with data. We now need to write
    # the DataFrame to a file and upload it to S3
    _ = rv.write(df, path, bucket,
                 show_progressbar=False, **storage_settings)
    hive.run_lake_query(drop_table_stmt, engine='hive')
    hive.run_lake_query(create_table_ddl, engine='hive')
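
# Example (hypothetical usage sketch; the DataFrame and table are
# illustrative only). The new file is written before the old table is
# dropped and recreated, so the window in which the table is unavailable
# is limited to the DROP/CREATE statements.
#
#   flash_update_table_from_df(
#       lookup_df, 'country_codes', schema='experimental',
#       table_comment='ISO country code lookup')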