Example #1
def __nuke_table(table_name, schema):
    """
    USE AT YOUR OWN RISK. THIS OPERATION IS NOT REVERSIBLE.

    Drop a table from the lake metastore and completely remove all of its
    underlying files from S3.

    Args:
        table_name (str): Name of the table to drop
        schema (str): Schema the table is in
    """
    table_metadata = meta.get_table_metadata(table_name, schema)
    bucket = table_metadata['bucket']
    path = meta.ensure_path_ends_w_slash(table_metadata['path'])

    hive.run_lake_query('DROP TABLE IF EXISTS {}.{}'.format(
        schema, table_name),
                        engine='hive')
    rv.delete(path, bucket, recursive=True)
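
# A minimal usage sketch (hypothetical table and schema names): drop
# 'experimental.stale_events' from the metastore and recursively delete its
# files from S3. The double-underscore prefix marks the helper as
# module-private, so calls like this would come from within this module.
__nuke_table('stale_events', 'experimental')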
Example #2
def __nuke_partition(table_name, schema, partition_values):
    """
    USE AT YOUR OWN RISK. THIS OPERATION IS NOT REVERSIBLE.

    Drop a partition from a table and completely remove all of its
    underlying files from S3.

    Args:
        table_name (str): Name of the table to drop
        schema (str): Schema the table is in
        partition_values (dict<str:str>):
            Mapping from partition name to partition value, identifying the
            partition to be nuked
    """
    partition_string = ', '.join([
        '{}=\'{}\''.format(partition_key, partition_value)
        for partition_key, partition_value in partition_values.items()
    ])
    partition_metadata = hive.run_lake_query(
        'DESCRIBE FORMATTED {}.{} PARTITION ({})'.format(
            schema, table_name, partition_string),
        engine='hive')

    # The DataFrame returned by DESCRIBE queries is not organized like a
    # normal DataFrame, hence the inaccurate column names
    partition_location = partition_metadata.loc[
        partition_metadata['col_name'].str.strip() == 'Location:',
        'data_type'].values[0]

    uri_prefix = 's3://'
    bucket, path = partition_location[len(uri_prefix):].split('/', 1)
    path = meta.ensure_path_ends_w_slash(path)

    hive.run_lake_query(
        'ALTER TABLE {}.{} DROP IF EXISTS PARTITION ({})'.format(
            schema, table_name, partition_string),
        engine='hive')
    rv.delete(path, bucket, recursive=True)
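
# A minimal usage sketch (hypothetical names and partition values): drop the
# May 2021 partition of 'experimental.page_views' and delete its files. The
# partition spec built above would render as "year='2021', month='05'".
__nuke_partition('page_views', 'experimental',
                 partition_values={'year': '2021', 'month': '05'})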
Example #3
def ctas(select_stmt,
         table_name,
         schema=None,
         path=None,
         table_comment=None,
         col_comments=None,
         storage_type='parquet',
         overwrite=False):
    """
    Emulates the standard SQL 'CREATE TABLE AS SELECT' syntax.

    Under the hood, this function creates a view using the provided SELECT
    statement, and then performs an INSERT OVERWRITE from that view into the
    new table.

    Because this function uses INSERT OVERWRITE, there are considerable
    protections within this function to prevent accidental data loss.
    When an INSERT OVERWRITE command is done on an external table, all of the
    files in S3 at that table's path are deleted. If the table's path is,
    for example, the root of a bucket, there could be substantial data loss.
    As a result, we do our best to smartly assign table paths and prevent
    large-scale object deletion.

    Args:
        select_stmt (str):
            The select statement to build a new table from
        table_name (str):
            The name of the table to be created
        schema (str):
            The schema the new table should be created in
        path (str):
            The path that the new table's underlying files will be stored at.
            If left unset, it will be set to a folder with the same name
            as the table, which is generally recommended
        table_comment (str, optional): Documentation on the table's purpose
        col_comments (dict<str:str>, optional):
            Dictionary from column name keys to column descriptions.
        storage_type (str):
            The desired storage type of the new table
        overwrite (bool):
            Whether to overwrite or fail if a table already exists with
            the intended name of the new table in the selected schema
    """
    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    if schema == 'curated':
        check_for_allowed_overwrite(overwrite)
        if not os.getenv('HC_PROD_ENV'):
            raise ValueError(
                'Non-production CTAS functionality is currently disabled in '
                'the curated zone. Contact Data Engineering for '
                'further information.')

    bucket = schema_to_zone_bucket_map[schema]
    path = meta.validate_table_path(path, table_name)

    full_path = '/'.join([bucket, path])

    # If this function is used to overwrite a table that is being selected
    # from, we need to make sure that the original table is not dropped before
    # selecting from it (which happens at execution time of the INSERT)
    # In this case, we will temporarily rename the table. If any section of
    # the remainder of this function fails before the INSERT, the table
    # will be restored to its original name
    table_rename_template = 'ALTER TABLE {}.{} RENAME TO {}.{}'
    if '{}.{}'.format(schema, table_name) in select_stmt:
        if overwrite:
            source_table_name = table_name + '_temp_ctas_rename'
            select_stmt = re.sub(
                r'{}\.{}([\s,.]|$)'.format(schema, table_name),
                r'{}.{}\1'.format(schema, source_table_name), select_stmt)
            hive.run_lake_query(
                table_rename_template.format(schema, table_name, schema,
                                             source_table_name))
            table_renamed = True
        else:
            raise ValueError(
                'CTAS functionality must have \'overwrite\' set to True '
                'in order to overwrite one of the source tables of the '
                'SELECT statement.')
    # No rename needed
    else:
        source_table_name = table_name
        table_renamed = False

    try:
        temp_schema = 'experimental'
        view_name = '{}_temp_ctas_view'.format(table_name)
        create_view_stmt = 'CREATE VIEW {}.{} AS {}'.format(
            temp_schema, view_name, select_stmt)
        hive.run_lake_query(create_view_stmt)

        # If we DESCRIBE the view, we can get a list of all the columns
        # in the new table for building DDL and adding comments.
        # Useful in queries that involve JOINing, so you don't have to build
        # that column list yourself.
        col_defs = describe_table(view_name, schema=temp_schema)

        if schema == 'curated':
            check_for_comments(table_comment, col_defs['col_name'],
                               col_comments)

        create_table_ddl = build_create_table_ddl(table_name,
                                                  schema,
                                                  col_defs,
                                                  col_comments,
                                                  table_comment,
                                                  storage_type,
                                                  partitioned_by=None,
                                                  full_path=full_path)
        handle_existing_table(table_name, schema, overwrite)
        hive.run_lake_query(create_table_ddl)
        insert_overwrite_command = (
            'INSERT OVERWRITE TABLE {}.{} SELECT * FROM {}.{}').format(
                schema, table_name, temp_schema, view_name)
        hive.run_lake_query(insert_overwrite_command, complex_join=True)
    except Exception as e:
        # If an error occurred at any point in the above and a source table
        # was renamed, restore its original name
        if table_renamed:
            hive.run_lake_query(
                table_rename_template.format(schema, source_table_name, schema,
                                             table_name))
        raise e

    finally:
        # Regardless of success or failure of the above, we want to
        # drop the temporary view if it was created
        hive.run_lake_query('DROP VIEW IF EXISTS {}.{}'.format(
            temp_schema, view_name))

    # If the source table had to be renamed, it would not have been dropped
    # by the call to 'handle_existing_table', so we have to handle it here.
    # If it still shares a storage location with the new table, we just
    # drop it. Otherwise, we nuke it.
    if table_renamed:
        source_metadata = meta.get_table_metadata(source_table_name, schema)
        source_path = meta.ensure_path_ends_w_slash(source_metadata['path'])
        if source_path == path:
            hive.run_lake_query('DROP TABLE {}.{}'.format(
                schema, source_table_name))
        else:
            __nuke_table(source_table_name, schema)
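
# A minimal usage sketch, assuming a 'raw.orders' source table exists; the
# table, schema, and comment values here are all hypothetical.
ctas(select_stmt='SELECT order_id, order_total FROM raw.orders',
     table_name='order_totals',
     schema='experimental',
     table_comment='Per-order totals derived from raw.orders',
     col_comments={'order_id': 'Unique order identifier',
                   'order_total': 'Total order value in USD'},
     storage_type='parquet',
     overwrite=False)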
Example #4
def append_df_to_table(df,
                       table_name,
                       schema=None,
                       dtypes=None,
                       filename=None,
                       overwrite_file=False,
                       timezones=None,
                       copy_df=True,
                       partition_values=None,
                       require_identical_columns=True,
                       avro_schema=None,
                       hive_functions=None):
    """
    Uploads a DataFrame to S3 and appends it to an already existing table.
    Queries existing table metadata to determine where the appended file
    should be written and what format it must be written in.

    Args:
        df (pd.DataFrame): The DataFrame to append to the table
        table_name (str): The name of the table to append to
        schema (str, optional): Name of the schema the table is in
        dtypes (dict<str:str>, optional): A dictionary specifying dtypes for
            specific columns to be cast to prior to uploading.
        filename (str, optional):
            Name to store the file under. Can be left blank if writing to the
            experimental zone, in which case a name will be generated.
        overwrite_file (bool):
            Whether to overwrite the file if a file with a matching name
            to "filename" is already present in S3.
        timezones (dict<str, str>):
            Dictionary from datetime columns to the timezone they
            represent. If the column is timezone-naive, it will have the
            timezone added to its metadata, leaving the times themselves
            unmodified. If the column is timezone-aware and is in a different
            timezone than the one that is specified, the column's timezone
            will be converted, modifying the original times.
        copy_df (bool):
            Whether the operations performed on df should be performed on the
            original or a copy. Keep in mind that if this is set to False,
            the original df passed in will be modified as well - twice as
            memory efficient, but may be undesirable if the df is needed
            again later
        partition_values (dict<str:str>, optional):
            Mapping from partition keys to partition values, identifying the
            partition to store the dataframe under. If there is no partition
            at those values, it will be created.
        require_identical_columns (bool, default True):
            Whether extra/missing columns should be allowed and handled, or
            if they should lead to an error being raised.
        avro_schema (dict, optional):
            Schema to use when writing a DataFrame to an Avro file. If not
            provided, one will be auto-generated.
        hive_functions (dict<str:str> or dict<str:dict>):
            Specifications on what hive functions to apply to which columns.
            Only usable when working with ORC tables. See 'orc.py'
            for additional documentation
    """
    # Less memory efficient, but prevents original DataFrame from modification
    if copy_df:
        df = df.copy()

    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    table_exists = check.table_existence(table_name, schema)
    if not table_exists:
        raise ValueError(
            'Table \'{schema}.{table_name}\' does not exist. '.format(
                schema=schema, table_name=table_name))

    # Gets the table's S3 location and storage type from metadata
    # We need to know where to write the data to be appended, and
    # the format to write it in
    table_metadata = meta.get_table_metadata(table_name, schema)

    bucket = table_metadata['bucket']
    path = table_metadata['path']
    storage_type = table_metadata['storage_type']
    if filename is None:
        filename = meta.gen_filename_if_allowed(schema, storage_type)
    if not filename.endswith(storage_type):
        raise ValueError(
            'The type specified in the filename does not match the '
            'filetype of the table.')
    path = meta.ensure_path_ends_w_slash(path)

    df = dtype_mapping.special_dtype_handling(df,
                                              spec_dtypes=dtypes,
                                              spec_timezones=timezones,
                                              schema=schema)

    # Columns being in the same order as the table is either
    # mandatory or highly advisable, depending on storage format.
    df = reorder_columns_for_appending(df, table_name, schema,
                                       partition_values, storage_type,
                                       require_identical_columns)

    # If the data is to be appended into a partition, we must get the
    # subpath of the partition if it exists, or create
    # the partition if it doesn't
    if partition_values:
        path += add_partition(table_name, schema, partition_values)

    if storage_type == 'orc':
        append_df_to_orc_table(df, table_name, schema, bucket, path, filename,
                               partition_values, hive_functions)

    else:
        path += filename

        if rv.exists(path, bucket) and not overwrite_file:
            raise KeyError('A file already exists at s3://{}/{} and '
                           'would be overwritten by this operation. '
                           'Specify a different filename to proceed.'.format(
                               bucket, path))

        storage_settings = meta.storage_type_specs[storage_type]['settings']
        if avro_schema is not None:
            storage_settings['schema'] = avro_schema
        rv.write(df, path, bucket, show_progressbar=False, **storage_settings)
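
# A minimal usage sketch, assuming an existing partitioned table
# 'experimental.page_views'; the DataFrame contents, table name, and
# partition values are all hypothetical.
import pandas as pd

views_df = pd.DataFrame({'user_id': [1, 2], 'page': ['home', 'checkout']})
append_df_to_table(views_df,
                   table_name='page_views',
                   schema='experimental',
                   partition_values={'view_date': '2021-05-01'})
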
def flash_update_table_from_df(df,
                               table_name,
                               schema=None,
                               dtypes=None,
                               table_comment=None,
                               col_comments=None,
                               timezones=None,
                               copy_df=True):
    """
    Overwrites a single-file table with minimal table downtime.
    Similar to 'create_table_from_df' with overwrite=True, but only usable
    when the table consists of a single underlying file.

    Args:
        df (pd.DataFrame): The DataFrame to overwrite the table's contents with
        table_name (str): The name of the table to be updated
        schema (str): The name of the schema the table is in
        dtypes (dict<str:str>, optional): A dictionary specifying dtypes for
            specific columns to be cast to prior to uploading.
        table_comment (str, optional): Documentation on the table's purpose
        col_comments (dict<str:str>, optional):
            Dictionary from column name keys to column descriptions.
        timezones (dict<str, str>):
            Dictionary from datetime columns to the timezone they
            represent. If the column is timezone-naive, it will have the
            timezone added to its metadata, leaving the times themselves
            unmodified. If the column is timezone-aware and is in a different
            timezone than the one that is specified, the column's timezone
            will be converted, modifying the original times.
        copy_df (bool):
            Whether the operations performed on df should be performed on the
            original or a copy. Keep in mind that if this is set to False,
            the original df passed in will be modified as well - twice as
            memory efficient, but may be undesirable if the df is needed
            again later
    """
    # Less memory efficient, but prevents modification of original df
    if copy_df:
        df = df.copy()

    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    if schema == 'curated':
        check_for_comments(table_comment, df.columns, col_comments)
        if not os.getenv('HC_PROD_ENV'):
            raise ValueError(
                'Flash update functionality is only available in '
                'the experimental zone. Contact a lake administrator if '
                'modification of a non-experimental table is needed.')

    table_exists = check.table_existence(table_name, schema)
    if not table_exists:
        raise ValueError('Table {}.{} does not exist.'.format(
            schema, table_name))

    table_metadata = meta.get_table_metadata(table_name, schema)
    bucket = table_metadata['bucket']
    path = meta.ensure_path_ends_w_slash(table_metadata['path'])

    objects_present = rv.list_objects(path, bucket)

    if len(objects_present) > 1:
        # Flash updates are supposed to feel as close to atomic as possible.
        # Multi-file operations interfere with this.
        raise ValueError('Flash update functionality is only available on '
                         'tables that only consist of one underlying file.')
    if meta.is_partitioned_table(table_name, schema):
        # Difficult to deterministically restore partitions based on new data
        raise ValueError('Flash update functionality is not available on '
                         'partitioned tables.')

    if objects_present:
        filename = objects_present[0]
    else:
        filename = meta.gen_filename_if_allowed(schema)
    path += filename

    storage_type = get_storage_type_from_filename(filename)
    df, col_defs = prep_df_and_col_defs(df, dtypes, timezones, schema,
                                        storage_type)

    # Gets settings to pass to rivet on how to write the files in a
    # Hive-readable format
    storage_settings = meta.storage_type_specs[storage_type]['settings']

    # tblproperties is for additional metadata to be provided to Hive
    # for the table. Generally, it is not needed
    tblproperties = {}

    if storage_type == 'avro':
        storage_settings, tblproperties = handle_avro_filetype(
            df, storage_settings, tblproperties, col_comments)

    full_path = '/'.join([bucket, path])
    create_table_ddl = build_create_table_ddl(table_name,
                                              schema,
                                              col_defs,
                                              col_comments,
                                              table_comment,
                                              storage_type,
                                              partitioned_by=None,
                                              full_path=full_path,
                                              tblproperties=tblproperties)
    inform(create_table_ddl)
    drop_table_stmt = 'DROP TABLE IF EXISTS {}.{}'.format(schema, table_name)

    # The new file is written to S3 first, and only then is the table dropped
    # and recreated over it, keeping the window in which the table is
    # unavailable as short as possible
    _ = rv.write(df, path, bucket, show_progressbar=False, **storage_settings)
    hive.run_lake_query(drop_table_stmt, engine='hive')
    hive.run_lake_query(create_table_ddl, engine='hive')
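
# A minimal usage sketch, assuming 'experimental.daily_summary' is an existing,
# unpartitioned table backed by a single file; names, values, and the comment
# are hypothetical.
import pandas as pd

summary_df = pd.DataFrame({'metric': ['revenue'], 'value': [1234.5]})
flash_update_table_from_df(summary_df,
                           table_name='daily_summary',
                           schema='experimental',
                           table_comment='One-row summary refreshed daily')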