def add_partition(table_name, schema, partition_values, partition_path=None):
    """
    Adds a partition to an existing table and returns the partition's
    subpath. If the partition already exists, a warning is logged instead.

    Args:
        table_name (str): The name of the table to add the partition to
        schema (str): The schema that contains the table
        partition_values (dict<str:str>): Dictionary from partition column
            name to partition value
        partition_path (str, optional): Subpath to store the partition's
            files under. If left unset, it is generated from the
            partition values

    Returns:
        partition_path (str): The subpath of the new (or pre-existing)
            partition
    """
    table_name, schema = meta.prep_schema_and_table(table_name, schema)
    partition_strings = build_partition_strings(partition_values)

    if partition_path is None:
        # Datetimes cast to str will by default produce an invalid path
        partition_path = '/'.join([
            val if not isinstance(val, datetime) else str(val.date())
            for val in partition_values.values()
        ]) + '/'
    else:
        partition_path = meta.validate_table_path(partition_path, table_name)

    if not check.partition_existence(table_name, schema, partition_values):
        add_partition_query = ('ALTER TABLE {}.{} ADD IF NOT EXISTS '
                               'PARTITION ({}) LOCATION \'{}\''.format(
                                   schema, table_name,
                                   partition_strings, partition_path))
        inform(add_partition_query)
        hive.run_lake_query(add_partition_query, engine='hive')
    else:
        logging.warning('Partition ({}) already exists in table.'.format(
            partition_strings))

    return partition_path

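# Illustrative sketch (an addition, not part of the original module): how
# add_partition might be called directly to register a date-based partition.
# The table/schema names and partition column below are hypothetical, and the
# example assumes the table was created with a matching partition column.
def _example_add_partition():
    from datetime import datetime  # may duplicate the module-level import

    # Datetime values are truncated to their date when building the subpath,
    # so this partition's files would live under a '2021-01-01/' subfolder
    # of the table's path
    partition_values = {'run_date': datetime(2021, 1, 1)}
    subpath = add_partition('my_table', 'experimental', partition_values)
    return subpath
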
def describe_table(table_name, schema=None, include_metadata=False):
    """
    Retrieves the description of a specific table in hive

    Args:
        table_name (str): The name of the table to be queried
        schema (str): The name of the schema to search for the table in
        include_metadata (bool): Whether the returned DataFrame should contain
            just column names, types, and comments, or more detailed
            information such as storage location and type, partitioning
            metadata, etc.

    Returns:
        desc (pd.DataFrame): A dataframe containing descriptive information
            on the specified table
    """
    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    # Presto does not support the 'FORMATTED' keyword, so
    # we're locking the engine for 'DESCRIBE' queries to Hive
    desc_query = 'DESCRIBE {formatted}{schema}.{table_name}'.format(
        formatted=('FORMATTED ' if include_metadata else ''),
        schema=schema,
        table_name=table_name)
    desc = hive.run_lake_query(desc_query, engine='hive')

    if include_metadata:
        desc = desc.loc[1:].reset_index(drop=True)
    return desc

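# Illustrative usage sketch (an addition, not original code): the two output
# modes of describe_table. The table name 'experimental.my_table' is
# hypothetical.
def _example_describe_table():
    # Just column names, types, and comments
    cols = describe_table('my_table', schema='experimental')

    # Full 'DESCRIBE FORMATTED' output, including storage location/type
    # and partitioning metadata
    detailed = describe_table('my_table', schema='experimental',
                              include_metadata=True)
    return cols, detailed
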
def analyze_partitions(table_name, schema=None, partition_values=None):
    """
    Convenience function for doing partition-level analysis on a
    partitioned table

    Args:
        table_name (str): The name of the table to analyze
        schema (str): The schema that contains the table
        partition_values (dict<str:str>): Dictionary from partition column
            name to partition value, used to filter partitions. See
            documentation at top of file for assembly instructions
    """
    table_name, schema = meta.prep_schema_and_table(table_name, schema)
    if partition_values is None:
        partition_values = {}

    if not meta.is_partitioned_table(table_name, schema):
        raise TypeError(
            ('The table {}.{} is not partitioned. Use the '
             '"analyze_table" function instead.').format(schema, table_name))

    partition_clause = get_partition_clause(table_name,
                                            schema,
                                            partition_values)

    build_and_run_analysis_command(table_name, schema,
                                   partition_clause=partition_clause)

def analyze_columns(table_name, schema=None, columns=None):
    """
    Convenience function for doing column-level analysis on a
    non-partitioned table

    Args:
        table_name (str): The name of the table to analyze
        schema (str): The schema that contains the table
        columns (list<str>): The columns the user wants statistics to be
            computed for. See documentation at top of file for
            assembly instructions
    """
    table_name, schema = meta.prep_schema_and_table(table_name, schema)
    if columns is None:
        columns = []

    if meta.is_partitioned_table(table_name, schema):
        raise TypeError(
            ('The table {}.{} is partitioned. Use the '
             '"analyze_partition_columns" function instead.').format(
                 schema, table_name))

    columns_clause = get_columns_clause(columns)
    build_and_run_analysis_command(table_name, schema,
                                   columns_clause=columns_clause)

def analyze_table(table_name, schema=None):
    """
    Convenience function for doing table-level analysis for a non-partitioned
    table. Cannot be used on a partitioned table, which must have each
    partition analyzed individually

    Args:
        table_name (str): The name of the table to analyze
        schema (str): The schema that contains the table
    """
    table_name, schema = meta.prep_schema_and_table(table_name, schema)
    if meta.is_partitioned_table(table_name, schema):
        raise TypeError(('The table {}.{} is partitioned. Use the '
                         '"analyze_partitions" function instead.').format(
                             schema, table_name))

    build_and_run_analysis_command(table_name, schema)

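# Illustrative usage sketch (an addition, not original code): choosing the
# right analysis helper based on whether a table is partitioned. Table,
# column, and partition names are hypothetical.
def _example_analyze_helpers():
    # Non-partitioned table: table-level stats, optionally column-level stats
    analyze_table('my_flat_table', schema='experimental')
    analyze_columns('my_flat_table', schema='experimental',
                    columns=['user_id', 'amount'])

    # Partitioned table: stats must be gathered partition-by-partition
    analyze_partitions('my_partitioned_table', schema='experimental',
                       partition_values={'run_date': '2021-01-01'})
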
def ctas(select_stmt, table_name, schema=None, path=None, table_comment=None,
         col_comments=None, storage_type='parquet', overwrite=False):
    """
    Emulates the standard SQL 'CREATE TABLE AS SELECT' syntax.

    Under the hood, this function creates a view using the provided SELECT
    statement, and then performs an INSERT OVERWRITE from that view into
    the new table.

    Because this function uses INSERT OVERWRITE, there are considerable
    protections within this function to prevent accidental data loss. When an
    INSERT OVERWRITE command is done on an external table, all of the files
    in S3 at that table's path are deleted. If the table's path is, for
    example, the root of a bucket, there could be substantial data loss. As a
    result, we do our best to smartly assign table paths and prevent
    large-scale object deletion.

    Args:
        select_stmt (str): The select statement to build a new table from
        table_name (str): The name of the table to be created
        schema (str): The schema the new table should be created in
        path (str): The path that the new table's underlying files will be
            stored at. If left unset, it will be set to a folder with the
            same name as the table, which is generally recommended
        table_comment (str, optional): Documentation on the table's purpose
        col_comments (dict<str:str>, optional):
            Dictionary from column name keys to column descriptions.
        storage_type (str): The desired storage type of the new table
        overwrite (bool): Whether to overwrite or fail if a table already
            exists with the intended name of the new table in the
            selected schema
    """
    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    if schema == 'curated':
        check_for_allowed_overwrite(overwrite)
        if not os.getenv('HC_PROD_ENV'):
            raise ValueError(
                'Non-production CTAS functionality is currently disabled in '
                'the curated zone. Contact Data Engineering for '
                'further information.')

    bucket = schema_to_zone_bucket_map[schema]
    path = meta.validate_table_path(path, table_name)
    full_path = '/'.join([bucket, path])

    # If this function is used to overwrite a table that is being selected
    # from, we need to make sure that the original table is not dropped before
    # selecting from it (which happens at execution time of the INSERT).
    # In this case, we will temporarily rename the table. If any section of
    # the remainder of this function fails before the INSERT, the table
    # will be restored to its original name
    table_rename_template = 'ALTER TABLE {}.{} RENAME TO {}.{}'
    if '{}.{}'.format(schema, table_name) in select_stmt:
        if overwrite:
            source_table_name = table_name + '_temp_ctas_rename'
            select_stmt = re.sub(
                r'{}\.{}([\s,.]|$)'.format(schema, table_name),
                r'{}.{}\1'.format(schema, source_table_name),
                select_stmt)
            hive.run_lake_query(
                table_rename_template.format(schema, table_name,
                                             schema, source_table_name))
            table_renamed = True
        else:
            raise ValueError(
                'CTAS functionality must have \'overwrite\' set to True '
                'in order to overwrite one of the source tables of the '
                'SELECT statement.')
    # No rename needed
    else:
        source_table_name = table_name
        table_renamed = False

    try:
        temp_schema = 'experimental'
        view_name = '{}_temp_ctas_view'.format(table_name)
        create_view_stmt = 'CREATE VIEW {}.{} AS {}'.format(
            temp_schema, view_name, select_stmt)
        hive.run_lake_query(create_view_stmt)

        # If we DESCRIBE the view, we can get a list of all the columns
        # in the new table for building DDL and adding comments.
        # Useful in queries that involve JOINing, so you don't have to build
        # that column list yourself.
        col_defs = describe_table(view_name, schema=temp_schema)

        if schema == 'curated':
            check_for_comments(table_comment, col_defs['col_name'],
                               col_comments)

        create_table_ddl = build_create_table_ddl(table_name, schema,
                                                  col_defs,
                                                  col_comments,
                                                  table_comment,
                                                  storage_type,
                                                  partitioned_by=None,
                                                  full_path=full_path)
        handle_existing_table(table_name, schema, overwrite)
        hive.run_lake_query(create_table_ddl)

        insert_overwrite_command = (
            'INSERT OVERWRITE TABLE {}.{} SELECT * FROM {}.{}').format(
                schema, table_name, temp_schema, view_name)
        hive.run_lake_query(insert_overwrite_command, complex_join=True)
    except Exception as e:
        # If an error occurred at any point in the above and a source table
        # was renamed, restore its original name
        if table_renamed:
            hive.run_lake_query(
                table_rename_template.format(schema, source_table_name,
                                             schema, table_name))
        raise e
    finally:
        # Regardless of success or failure of the above, we want to
        # drop the temporary view if it was created
        hive.run_lake_query('DROP VIEW IF EXISTS {}.{}'.format(
            temp_schema, view_name))

    # If the source table had to be renamed, it would not have been dropped
    # by the call to 'handle_existing_table', so we have to handle it here.
    # If it still shares a storage location with the new table, we just
    # drop it. Otherwise, we nuke it.
    if table_renamed:
        source_metadata = meta.get_table_metadata(source_table_name, schema)
        source_path = meta.ensure_path_ends_w_slash(source_metadata['path'])
        if source_path == path:
            hive.run_lake_query('DROP TABLE {}.{}'.format(
                schema, source_table_name))
        else:
            __nuke_table(source_table_name, schema)

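# Illustrative usage sketch (an addition, not original code): a typical CTAS
# call. The schema/table names and SELECT statement are hypothetical; with
# 'path' left unset, the new table's files land in a folder named after
# the table.
def _example_ctas():
    select_stmt = (
        'SELECT user_id, COUNT(*) AS order_count '
        'FROM experimental.orders GROUP BY user_id'
    )
    ctas(select_stmt,
         table_name='order_counts',
         schema='experimental',
         table_comment='Orders per user',
         storage_type='parquet',
         overwrite=False)
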
def append_df_to_table(df, table_name, schema=None, dtypes=None,
                       filename=None, overwrite_file=False, timezones=None,
                       copy_df=True, partition_values=None,
                       require_identical_columns=True, avro_schema=None,
                       hive_functions=None):
    """
    Uploads a dataframe to S3 and appends it to an already existing table.
    Queries existing table metadata to determine where the new data should
    be written and in what storage format.

    Args:
        df (pd.DataFrame): The DataFrame to append to the table
        table_name (str): The name of the table to append to
        schema (str, optional): Name of the schema containing the table
        dtypes (dict<str:str>, optional): A dictionary specifying dtypes for
            specific columns to be cast to prior to uploading.
        filename (str, optional): Name to store the file under. Can be left
            blank if writing to the experimental zone, in which case a name
            will be generated.
        overwrite_file (bool): Whether to overwrite the file if a file with
            a matching name to "filename" is already present in S3.
        timezones (dict<str, str>): Dictionary from datetime columns to
            the timezone they represent. If the column is timezone-naive,
            it will have the timezone added to its metadata, leaving
            the times themselves unmodified. If the column is timezone-aware
            and is in a different timezone than the one that is specified,
            the column's timezone will be converted, modifying the
            original times.
        copy_df (bool): Whether the operations performed on df should be
            performed on the original or a copy. Keep in mind that if this
            is set to False, the original df passed in will be modified as
            well - twice as memory efficient, but may be undesirable if
            the df is needed again later
        partition_values (dict<str:str>, optional): Dictionary from partition
            key to the value to store the dataframe under. If there is no
            partition at those values, it will be created.
        require_identical_columns (bool, default True): Whether extra/missing
            columns should be allowed and handled, or if they should lead
            to an error being raised.
        avro_schema (dict, optional): Schema to use when writing a DataFrame
            to an Avro file. If not provided, one will be auto-generated.
        hive_functions (dict<str:str> or dict<str:dict>): Specifications on
            what hive functions to apply to which columns. Only usable when
            working with ORC tables. See 'orc.py' for additional
            documentation
    """
    # Less memory efficient, but prevents original DataFrame from modification
    if copy_df:
        df = df.copy()

    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    table_exists = check.table_existence(table_name, schema)
    if not table_exists:
        raise ValueError(
            'Table \'{schema}.{table_name}\' does not exist. '.format(
                schema=schema, table_name=table_name))

    # Gets the table's S3 location and storage type from metadata.
    # We need to know where to write the data to be appended, and
    # the format to write it in
    table_metadata = meta.get_table_metadata(table_name, schema)
    bucket = table_metadata['bucket']
    path = table_metadata['path']
    storage_type = table_metadata['storage_type']

    if filename is None:
        filename = meta.gen_filename_if_allowed(schema, storage_type)
    if not filename.endswith(storage_type):
        raise ValueError(
            'The type specified in the filename does not match the '
            'filetype of the table.')
    path = meta.ensure_path_ends_w_slash(path)

    df = dtype_mapping.special_dtype_handling(df, spec_dtypes=dtypes,
                                              spec_timezones=timezones,
                                              schema=schema)

    # Columns being in the same order as the table is either
    # mandatory or highly advisable, depending on storage format.
    df = reorder_columns_for_appending(df, table_name, schema,
                                       partition_values, storage_type,
                                       require_identical_columns)

    # If the data is to be appended into a partition, we must get the
    # subpath of the partition if it exists, or create
    # the partition if it doesn't
    if partition_values:
        path += add_partition(table_name, schema, partition_values)

    if storage_type == 'orc':
        append_df_to_orc_table(df, table_name, schema, bucket, path,
                               filename, partition_values, hive_functions)
    else:
        path += filename
        if rv.exists(path, bucket) and not overwrite_file:
            raise KeyError('A file already exists at s3://{}/{}, '
                           'which will be overwritten by this operation. '
                           'Specify a different filename to proceed.'.format(
                               bucket, path))

        storage_settings = meta.storage_type_specs[storage_type]['settings']
        if avro_schema is not None:
            storage_settings['schema'] = avro_schema
        rv.write(df, path, bucket, show_progressbar=False, **storage_settings)

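# Illustrative usage sketch (an addition, not original code): appending a
# DataFrame into a (possibly new) partition of an existing partitioned table.
# The table, columns, and partition values are hypothetical.
def _example_append_df_to_table():
    import pandas as pd

    df = pd.DataFrame({'user_id': [1, 2], 'amount': [9.99, 4.50]})
    # If the 'run_date=2021-01-01' partition does not exist yet,
    # add_partition is invoked under the hood to create it
    append_df_to_table(df,
                       table_name='orders',
                       schema='experimental',
                       partition_values={'run_date': '2021-01-01'})
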
def flash_update_table_from_df(df, table_name, schema=None, dtypes=None,
                               table_comment=None, col_comments=None,
                               timezones=None, copy_df=True):
    """
    Overwrites a single-file table with minimal table downtime.
    Similar to 'create_table_from_df' with overwrite=True, but only usable
    when the table consists of just one underlying file

    Args:
        df (pd.DataFrame): The DataFrame to create the table from.
        table_name (str): The name of the table to be created
        schema (str): The name of the schema to create the table in
        dtypes (dict<str:str>, optional): A dictionary specifying dtypes for
            specific columns to be cast to prior to uploading.
        table_comment (str, optional): Documentation on the table's purpose
        col_comments (dict<str:str>, optional):
            Dictionary from column name keys to column descriptions.
        timezones (dict<str, str>): Dictionary from datetime columns to
            the timezone they represent. If the column is timezone-naive,
            it will have the timezone added to its metadata, leaving
            the times themselves unmodified. If the column is timezone-aware
            and is in a different timezone than the one that is specified,
            the column's timezone will be converted, modifying the
            original times.
        copy_df (bool): Whether the operations performed on df should be
            performed on the original or a copy. Keep in mind that if this
            is set to False, the original df passed in will be modified as
            well - twice as memory efficient, but may be undesirable if
            the df is needed again later
    """
    # Less memory efficient, but prevents modification of original df
    if copy_df:
        df = df.copy()

    table_name, schema = meta.prep_schema_and_table(table_name, schema)
    if schema == 'curated':
        check_for_comments(table_comment, df.columns, col_comments)
        if not os.getenv('HC_PROD_ENV'):
            raise ValueError(
                'Flash update functionality is only available in '
                'the experimental zone. Contact a lake administrator if '
                'modification of a non-experimental table is needed.')

    table_exists = check.table_existence(table_name, schema)
    if not table_exists:
        raise ValueError('Table {}.{} does not exist.'.format(
            schema, table_name))

    table_metadata = meta.get_table_metadata(table_name, schema)
    bucket = table_metadata['bucket']
    path = meta.ensure_path_ends_w_slash(table_metadata['path'])

    objects_present = rv.list_objects(path, bucket)
    if len(objects_present) > 1:
        # Flash updates are supposed to feel as close to atomic as possible.
        # Multi-file operations interfere with this.
        raise ValueError('Flash update functionality is only available on '
                         'tables that only consist of one underlying file.')
    if meta.is_partitioned_table(table_name, schema):
        # Difficult to deterministically restore partitions based on new data
        raise ValueError('Flash update functionality is not available on '
                         'partitioned tables.')

    if objects_present:
        filename = objects_present[0]
    else:
        filename = meta.gen_filename_if_allowed(schema)
    path += filename

    storage_type = get_storage_type_from_filename(filename)
    df, col_defs = prep_df_and_col_defs(
        df, dtypes, timezones, schema, storage_type)

    # Gets settings to pass to rivet on how to write the files in a
    # Hive-readable format
    storage_settings = meta.storage_type_specs[storage_type]['settings']

    # tblproperties is for additional metadata to be provided to Hive
    # for the table. Generally, it is not needed
    tblproperties = {}

    if storage_type == 'avro':
        storage_settings, tblproperties = handle_avro_filetype(
            df, storage_settings, tblproperties, col_comments)

    full_path = '/'.join([bucket, path])
    create_table_ddl = build_create_table_ddl(table_name, schema, col_defs,
                                              col_comments, table_comment,
                                              storage_type,
                                              partitioned_by=None,
                                              full_path=full_path,
                                              tblproperties=tblproperties)
    inform(create_table_ddl)

    drop_table_stmt = 'DROP TABLE IF EXISTS {}.{}'.format(schema, table_name)

    # Creating the table doesn't populate it with data. We now need to write
    # the DataFrame to a file and upload it to S3
    _ = rv.write(df, path, bucket, show_progressbar=False, **storage_settings)
    hive.run_lake_query(drop_table_stmt, engine='hive')
    hive.run_lake_query(create_table_ddl, engine='hive')

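# Illustrative usage sketch (an addition, not original code): flash-updating
# a small, single-file, non-partitioned lookup table. All names are
# hypothetical.
def _example_flash_update_table_from_df():
    import pandas as pd

    df = pd.DataFrame({'code': ['A', 'B'], 'description': ['Alpha', 'Beta']})
    # Rewrites the table's single underlying file and recreates the table
    # DDL, keeping downtime close to the duration of the DROP/CREATE pair
    flash_update_table_from_df(df, 'code_lookup', schema='experimental')
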
def create_table_from_df(df, table_name, schema=None, dtypes=None,
                         path=None, filename=None, table_comment=None,
                         col_comments=None, timezones=None, copy_df=True,
                         partitioned_by=None, partition_values=None,
                         overwrite=False, auto_upload_df=True,
                         avro_schema=None, hive_functions=None):
    """
    Uploads a dataframe to S3 and establishes it as a new table in Hive.

    Args:
        df (pd.DataFrame): The DataFrame to create the table from.
        table_name (str): The name of the table to be created
        schema (str): The name of the schema to create the table in
        dtypes (dict<str:str>, optional): A dictionary specifying dtypes for
            specific columns to be cast to prior to uploading.
        path (str, optional): Folder in S3 to store all files for this
            table in
        filename (str, optional): Name to store the file under. Used to
            determine storage format. Can be left blank if writing to the
            experimental zone, in which case a name will be generated and
            storage format will default to Parquet
        table_comment (str, optional): Documentation on the table's purpose
        col_comments (dict<str:str>, optional):
            Dictionary from column name keys to column descriptions.
        timezones (dict<str, str>): Dictionary from datetime columns to
            the timezone they represent. If the column is timezone-naive,
            it will have the timezone added to its metadata, leaving
            the times themselves unmodified. If the column is timezone-aware
            and is in a different timezone than the one that is specified,
            the column's timezone will be converted, modifying the
            original times.
        copy_df (bool): Whether the operations performed on df should be
            performed on the original or a copy. Keep in mind that if this
            is set to False, the original df passed in will be modified as
            well - twice as memory efficient, but may be undesirable if
            the df is needed again later
        partitioned_by (dict<str:str>, collections.OrderedDict<str:str>, or
                        list<tuple<str:str>>, optional):
            Dictionary or list of tuples containing a partition name and
            type. Cannot be a vanilla dictionary if using Python
            version < 3.6
        partition_values (dict<str:str>): Required if 'partitioned_by' is
            used and 'auto_upload_df' is True. Dictionary from partition
            name to the value to store the dataframe under
        overwrite (bool, default False): Whether to overwrite the current
            table if one is already present at the specified name
        auto_upload_df (bool, default True): Whether the df that the table's
            structure will be based off of should be automatically uploaded
            to the table
        avro_schema (dict, optional): Schema to use when writing a DataFrame
            to an Avro file. If not provided, one will be auto-generated.
        hive_functions (dict<str:str> or dict<str:dict>): Specifications on
            what hive functions to apply to which columns. Only usable when
            working with ORC tables. See 'orc.py' for additional
            documentation
    """
    # Less memory efficient, but prevents modification of original df
    if copy_df:
        df = df.copy()

    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    if partitioned_by:
        if isinstance(partitioned_by, dict) and not confirm_ordered_dicts():
            raise TypeError(
                'The order of "partitioned_by" must be preserved, and '
                'dictionaries are not guaranteed to be order-preserving '
                'in Python versions < 3.7. Use a list of tuples or an '
                'OrderedDict, or upgrade your Python version.')
        elif isinstance(partitioned_by, list):
            partitioned_by = OrderedDict(partitioned_by)
        if auto_upload_df and not partition_values:
            raise ValueError(
                'If using "partitioned_by" and "auto_upload_df" is True, '
                'values must be passed to "partition_values" as well.')

    if schema == 'curated':
        check_for_comments(table_comment, df.columns, col_comments)
        check_for_allowed_overwrite(overwrite)

    handle_existing_table(table_name, schema, overwrite)

    if filename is None:
        filename = meta.gen_filename_if_allowed(schema)

    path = meta.validate_table_path(path, table_name)
    bucket = schema_to_zone_bucket_map[schema]

    if rv.list_objects(path, bucket):
        raise KeyError((
            'Files are already present in s3://{}/{}. Creation of a new '
            'table requires a dedicated, empty folder. Either specify a '
            'different path for the table or ensure the directory is empty '
            'before attempting table creation.').format(bucket, path))

    storage_type = get_storage_type_from_filename(filename)
    df, col_defs = prep_df_and_col_defs(
        df, dtypes, timezones, schema, storage_type)

    if storage_type == 'orc' and auto_upload_df:
        create_orc_table_from_df(df, table_name, schema, col_defs, bucket,
                                 path, filename, col_comments, table_comment,
                                 partitioned_by, partition_values,
                                 hive_functions)
    else:
        build_and_run_ddl_stmt(df, table_name, schema, col_defs,
                               storage_type, bucket, path, filename,
                               col_comments, table_comment, partitioned_by,
                               partition_values, auto_upload_df, avro_schema)

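# Illustrative usage sketch (an addition, not original code): creating a new
# partitioned table from a DataFrame. Names and values are hypothetical; an
# OrderedDict is used for 'partitioned_by' so partition order is preserved
# even on older Python versions.
def _example_create_table_from_df():
    import pandas as pd
    from collections import OrderedDict  # may duplicate module-level import

    df = pd.DataFrame({'user_id': [1, 2], 'amount': [9.99, 4.50]})
    create_table_from_df(
        df,
        table_name='orders',
        schema='experimental',
        partitioned_by=OrderedDict([('run_date', 'string')]),
        partition_values={'run_date': '2021-01-01'},
        table_comment='Example orders table',
        col_comments={'user_id': 'ID of purchasing user',
                      'amount': 'Order total in USD'})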