Example #1
def build_and_run_analysis_command(table_name,
                                   schema,
                                   partition_clause='',
                                   columns_clause=''):
    """
    Base function for ANALYZE TABLE commands. Builds the command
    from the parameters provided by the ANALYZE TABLE convenience
    functions.

    The Hive commands used for the different types of ANALYZE TABLE
    statements are similar enough to be built from one template, but
    semantically different enough that separate convenience functions
    help clarify the user's intent.

    Args:
        table_name (str): The name of the table to analyze
        schema (str): The schema that contains the table
        partition_clause (str):
            Clause of the command designating which partitions are
            to be analyzed
        columns_clause (str):
            Clause of the command designating which columns are
            to be analyzed
    """
    analyze_command = ('ANALYZE TABLE {}.{} {}COMPUTE STATISTICS{}').format(
        schema, table_name, partition_clause, columns_clause)

    inform(analyze_command)
    hive.run_lake_query(analyze_command)
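A standalone sketch of how the template above composes into the different ANALYZE TABLE variants. The clause strings passed in are illustrative assumptions, not necessarily the exact strings the convenience functions produce.

def _build_analyze_command(table_name, schema,
                           partition_clause='', columns_clause=''):
    # Same template as above, minus the query execution
    return 'ANALYZE TABLE {}.{} {}COMPUTE STATISTICS{}'.format(
        schema, table_name, partition_clause, columns_clause)

# Table-level statistics
print(_build_analyze_command('orders', 'experimental'))
# Partition-level statistics (hypothetical clause format)
print(_build_analyze_command('orders', 'experimental',
                             partition_clause="PARTITION (dt='2021-01-01') "))
# Column-level statistics (hypothetical clause format)
print(_build_analyze_command('orders', 'experimental',
                             columns_clause=' FOR COLUMNS'))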
Example #2
def add_partition(table_name, schema, partition_values, partition_path=None):
    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    partition_strings = build_partition_strings(partition_values)
    if partition_path is None:
        # Datetimes cast to str will by default provide an invalid path
        partition_path = '/'.join([
            val if not isinstance(val, datetime) else str(val.date())
            for val in partition_values.values()
        ]) + '/'
    else:
        partition_path = meta.validate_table_path(partition_path, table_name)

    if not check.partition_existence(table_name, schema, partition_values):
        add_partition_query = ('ALTER TABLE {}.{} ADD IF NOT EXISTS '
                               'PARTITION ({}) LOCATION \'{}\''.format(
                                   schema, table_name, partition_strings,
                                   partition_path))
        inform(add_partition_query)

        hive.run_lake_query(add_partition_query, engine='hive')
    else:
        logging.warning('Partition ({}) already exists in table.'.format(
            partition_strings))

    return partition_path
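A standalone sketch of the default partition-path branch above: datetime values are cast to dates so the resulting S3 path stays valid, while other values are used as-is. The partition values are illustrative, and 'build_partition_strings' itself is not reproduced here.

from datetime import datetime

partition_values = {'dt': datetime(2021, 1, 1, 12, 30), 'region': 'us-east-1'}
partition_path = '/'.join([
    val if not isinstance(val, datetime) else str(val.date())
    for val in partition_values.values()
]) + '/'
print(partition_path)  # 2021-01-01/us-east-1/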
Example #3
def describe_table(table_name, schema=None,
                   include_metadata=False):
    """
    Retrieves the description of a specific table in Hive

    Args:
        table_name (str): The name of the table to be queried
        schema (str): The name of the schema to search for the table in
        include_metadata (bool):
            Whether the returned DataFrame should contain just column names,
            types, and comments, or more detailed information such as
            storage location and type, partitioning metadata, etc.

    Returns:
        desc (pd.DataFrame): A dataframe containing descriptive information
            on the specified table
    """
    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    # Presto does not support the 'FORMATTED' keyword, so
    # we're locking the engine for 'DESCRIBE' queries to Hive
    desc_query = 'DESCRIBE {formatted}{schema}.{table_name}'.format(
        formatted=('FORMATTED ' if include_metadata else ''),
        schema=schema,
        table_name=table_name)
    desc = hive.run_lake_query(desc_query, engine='hive')

    if include_metadata:
        desc = desc.loc[1:].reset_index(drop=True)
    return desc
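A quick illustration of the two query shapes the function can issue, using a hypothetical schema and table name (the queries are only printed here, not run against the lake):

for include_metadata in (False, True):
    desc_query = 'DESCRIBE {formatted}{schema}.{table_name}'.format(
        formatted=('FORMATTED ' if include_metadata else ''),
        schema='experimental',
        table_name='orders')
    print(desc_query)
# DESCRIBE experimental.orders
# DESCRIBE FORMATTED experimental.orders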
Example #4
def __nuke_table(table_name, schema):
    """
    USE AT YOUR OWN RISK. THIS OPERATION IS NOT REVERSIBLE.

    Drop a table from the lake metastore and completely remove all of its
    underlying files from S3.

    Args:
        table_name (str): Name of the table to drop
        schema (str): Schema the table is in
    """
    table_metadata = meta.get_table_metadata(table_name, schema)
    bucket = table_metadata['bucket']
    path = meta.ensure_path_ends_w_slash(table_metadata['path'])

    hive.run_lake_query(
        'DROP TABLE IF EXISTS {}.{}'.format(schema, table_name),
        engine='hive')
    rv.delete(path, bucket, recursive=True)
Example #5
def check_schema_existence(schema):
    """
    Checks if a given schema exists in the lake
    """
    show_schemas_query = ('SHOW SCHEMAS LIKE \'{schema}\''.format(
        schema=schema))

    similar_schemas = hive.run_lake_query(show_schemas_query)
    if similar_schemas is not None:
        # NOTE: 'database' and 'schema' are interchangeable terms in Hive.
        # Compare against the Series values explicitly; 'in' on a bare
        # Series checks the index labels, not the stored values.
        if schema in similar_schemas['database_name'].values:
            return True
    return False
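The '.values' in the membership check matters because of a pandas subtlety: 'in' on a bare Series tests the index labels, not the stored values. A minimal demonstration with made-up schema names:

import pandas as pd

s = pd.Series(['analytics', 'curated'], name='database_name')
print('analytics' in s)         # False -- tests the index (0, 1)
print('analytics' in s.values)  # True  -- tests the schema names themselves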
Example #6
def __nuke_partition(table_name, schema, partition_values):
    """
    USE AT YOUR OWN RISK. THIS OPERATION IS NOT REVERSIBLE.

    Drop a partition from a table and completely remove all of its
    underlying files from S3.

    Args:
        table_name (str): Name of the table to drop
        schema (str): Schema the table is in
        partition_values (dict<str:str>):
            Mapping from partition name to partition value, identifying the
            partition to be nuked
    """
    partition_string = ', '.join([
        '{}=\'{}\''.format(partition_key, partition_value)
        for partition_key, partition_value in partition_values.items()
    ])
    partition_metadata = hive.run_lake_query(
        'DESCRIBE FORMATTED {}.{} PARTITION ({})'.format(
            schema, table_name, partition_string),
        engine='hive')

    # The DataFrame returned by a DESCRIBE query is not organized like a
    # normal DataFrame, hence the inaccurate column names
    partition_location = partition_metadata.loc[
        partition_metadata['col_name'].str.strip() == 'Location:',
        'data_type'].values[0]

    uri_prefix = 's3://'
    bucket, path = partition_location[len(uri_prefix):].split('/', 1)
    path = meta.ensure_path_ends_w_slash(path)

    hive.run_lake_query(
        'ALTER TABLE {}.{} DROP IF EXISTS PARTITION ({})'.format(
            schema, table_name, partition_string),
        engine='hive')
    rv.delete(path, bucket, recursive=True)
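A standalone sketch of the 'Location:' lookup above, run against a mocked DESCRIBE FORMATTED result. Real output has many more rows and trailing padding; only the shape of the lookup and the URI split matter here, and the bucket/path values are hypothetical.

import pandas as pd

partition_metadata = pd.DataFrame({
    'col_name': ['# Detailed Partition Information', 'Location:           '],
    'data_type': ['', 's3://my-bucket/tables/orders/dt=2021-01-01'],
})
partition_location = partition_metadata.loc[
    partition_metadata['col_name'].str.strip() == 'Location:',
    'data_type'].values[0]
bucket, path = partition_location[len('s3://'):].split('/', 1)
print(bucket)  # my-bucket
print(path)    # tables/orders/dt=2021-01-01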
Example #7
def check_table_existence(table_name, schema):
    """
    Checks if a specific table exists in a specific schema

    Args:
        schema (str): Which schema to check for the table in
        table_name (str): The name of the table to check for

    Returns:
        bool: Whether or not the specified table exists
    """
    show_tables_query = (
        'SHOW TABLES IN {schema} LIKE \'{table_name}\''.format(
            schema=schema, table_name=table_name))

    similar_tables = hive.run_lake_query(show_tables_query, engine='hive')
    if table_name in similar_tables['tab_name'].values:
        return True
    return False
Example #8
def get_table_storage_type(table_name, schema):
    """
    Identifies the format a table's underlying files are stored in using
    the table's metadata.

    Args:
        table_name (str): The name of the table being examined
        schema (str): The schema that contains the table

    Returns:
        str: The storage format of the table's underlying files
            ('avro', 'parquet', 'orc', 'json', or 'csv')
    """
    create_stmt_query = create_stmt_query_template.format(
        schema=schema, table_name=table_name)
    table_metadata = hive.run_lake_query(create_stmt_query)

    hive_input_format_to_storage_type = {
        'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat': 'avro',
        'org.apache.hadoop.mapred.TextInputFormat': 'text',
        'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat':
        'parquet',
        'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat': 'orc'
    }
    format_label_idx = table_metadata.index[
        table_metadata['createtab_stmt'].str.strip() ==
        "STORED AS INPUTFORMAT"].values[0]
    input_format = table_metadata.loc[format_label_idx + 1,
                                      'createtab_stmt'].strip()[1:-1]

    storage_format = hive_input_format_to_storage_type[input_format]
    if storage_format == 'text':
        # Both CSV and JSON tables will have a storage format of 'text',
        # so we must further differentiate them by checking the
        # serde type
        serde_label_idx = table_metadata.index[
            table_metadata['createtab_stmt'].str.strip() ==
            "ROW FORMAT SERDE"].values[0]
        serde_type = table_metadata.loc[serde_label_idx + 1,
                                        'createtab_stmt'].strip()[1:-1]
        if serde_type == 'org.apache.hadoop.hive.serde2.JsonSerDe':
            storage_format = 'json'
        else:
            storage_format = 'csv'

    return storage_format
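A standalone sketch of how the 'STORED AS INPUTFORMAT' row is located in a mocked SHOW CREATE TABLE result (one line of DDL per row) and how the quoted class name on the following row is unquoted. The DDL shown is an illustrative assumption, not real lake output.

import pandas as pd

table_metadata = pd.DataFrame({'createtab_stmt': [
    'CREATE EXTERNAL TABLE `experimental.orders`(',
    '  `id` bigint)',
    'STORED AS INPUTFORMAT',
    "  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'",
    'OUTPUTFORMAT',
    "  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'",
]})
format_label_idx = table_metadata.index[
    table_metadata['createtab_stmt'].str.strip() ==
    'STORED AS INPUTFORMAT'].values[0]
input_format = table_metadata.loc[format_label_idx + 1,
                                  'createtab_stmt'].strip()[1:-1]
print(input_format)
# org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat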
Example #9
def check_partition_existence(table_name, schema, partition_values):
    """
    Checks if a specific partition exists in a specific table

    Args:
        schema (str): Which schema the table is in
        table_name (str): The name of the table to check in
        partition_values (dict<str:str>):
            A mapping from partition keys to the values being checked for

    Returns:
        bool: Whether or not the specified partition exists in the table
    """
    partition_value_strings = ', '.join([
        '{}=\'{}\''.format(partition_name, partition_value)
        for partition_name, partition_value in partition_values.items()
    ])
    partition_spec = ' PARTITION({})'.format(partition_value_strings)

    show_partitions_query = (
        'SHOW PARTITIONS {schema}.{table_name}{partition_spec}'.format(
            schema=schema,
            table_name=table_name,
            partition_spec=partition_spec))

    similar_partitions = hive.run_lake_query(show_partitions_query,
                                             engine='hive')['partition']
    if len(similar_partitions):
        for partition in similar_partitions.str.split('/'):
            existing_partition_values = {
                partition_key: partition_value
                for partition_key, partition_value in [
                    partition_value_str.split('=')
                    for partition_value_str in partition
                ]
            }
            if partition_values == existing_partition_values:
                return True
    return False
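A standalone sketch of the comparison above: SHOW PARTITIONS returns strings like 'dt=2021-01-01/region=us-east-1', which are split back into key/value pairs and compared against the requested partition_values. The values here are illustrative.

partition_values = {'dt': '2021-01-01', 'region': 'us-east-1'}
shown_partition = 'dt=2021-01-01/region=us-east-1'
existing_partition_values = dict(
    part.split('=') for part in shown_partition.split('/'))
print(existing_partition_values == partition_values)  # True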
Example #10
def get_table_s3_location(table_name, schema):
    """
    Extracts the underlying S3 location a table uses from its metadata

    Args:
        table_name (str): The name of the table being examined
        schema (str): The schema that contains the table

    Returns:
        tuple<str, str>: The S3 bucket and the path within it where the
            table's underlying files are stored
    """
    create_stmt_query = create_stmt_query_template.format(
        schema=schema, table_name=table_name)
    table_metadata = hive.run_lake_query(create_stmt_query)

    loc_label_idx = table_metadata.index[
        table_metadata['createtab_stmt'].str.strip() == "LOCATION"].values[0]
    location = table_metadata.loc[loc_label_idx + 1,
                                  'createtab_stmt'].strip()[1:-1]

    prefix = 's3://'
    full_path = location[len(prefix):]

    bucket, path = full_path.split('/', 1)
    return bucket, path
Example #11
def ctas(select_stmt,
         table_name,
         schema=None,
         path=None,
         table_comment=None,
         col_comments=None,
         storage_type='parquet',
         overwrite=False):
    """
    Emulates the standard SQL 'CREATE TABLE AS SELECT' syntax.

    Under the hood, this function creates a view using the provided SELECT
    statement, and then performs an INSERT OVERWRITE from that view into the
    new table.

    Because this function uses INSERT OVERWRITE, there are considerable
    protections within this function to prevent accidental data loss.
    When an INSERT OVERWRITE command is done on an external table, all of the
    files in S3 at that table's path are deleted. If the table's path is,
    for example, the root of a bucket, there could be substantial data loss.
    As a result, we do our best to smartly assign table paths and prevent
    large-scale object deletion.

    Args:
        select_stmt (str):
            The select statement to build a new table from
        table_name (str):
            The name of the table to be created
        schema (str):
            The schema the new table should be created in
        path (str):
            The path that the new table's underlying files will be stored at.
            If left unset, it will be set to a folder with the same name
            as the table, which is generally recommended
        table_comment (str, optional): Documentation on the table's purpose
        col_comments (dict<str:str>, optional):
            Dictionary from column name keys to column descriptions.
        storage_type (str):
            The desired storage type of the new table
        overwrite (bool):
            Whether to overwrite or fail if a table already exists with
            the intended name of the new table in the selected schema
    """
    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    if schema == 'curated':
        check_for_allowed_overwrite(overwrite)
        if not os.getenv('HC_PROD_ENV'):
            raise ValueError(
                'Non-production CTAS functionality is currently disabled in '
                'the curated zone. Contact Data Engineering for '
                'further information.')

    bucket = schema_to_zone_bucket_map[schema]
    path = meta.validate_table_path(path, table_name)

    full_path = '/'.join([bucket, path])

    # If this function is used to overwrite a table that is being selected
    # from, we need to make sure that the original table is not dropped before
    # selecting from it (which happens at execution time of the INSERT).
    # In this case, we will temporarily rename the table. If any section of
    # the remainder of this function fails before the INSERT, the table
    # will be restored to its original name.
    table_rename_template = 'ALTER TABLE {}.{} RENAME TO {}.{}'
    if '{}.{}'.format(schema, table_name) in select_stmt:
        if overwrite:
            source_table_name = table_name + '_temp_ctas_rename'
            select_stmt = re.sub(
                r'{}\.{}([\s,.]|$)'.format(schema, table_name),
                r'{}.{}\1'.format(schema, source_table_name), select_stmt)
            hive.run_lake_query(
                table_rename_template.format(schema, table_name, schema,
                                             source_table_name))
            table_renamed = True
        else:
            raise ValueError(
                'CTAS functionality must have \'overwrite\' set to True '
                'in order to overwrite one of the source tables of the '
                'SELECT statement.')
    # No rename needed
    else:
        source_table_name = table_name
        table_renamed = False

    try:
        temp_schema = 'experimental'
        view_name = '{}_temp_ctas_view'.format(table_name)
        create_view_stmt = 'CREATE VIEW {}.{} AS {}'.format(
            temp_schema, view_name, select_stmt)
        hive.run_lake_query(create_view_stmt)

        # If we DESCRIBE the view, we can get a list of all the columns
        # in the new table for building DDL and adding comments.
        # Useful in queries that involve JOINing, so you don't have to build
        # that column list yourself.
        col_defs = describe_table(view_name, schema=temp_schema)

        if schema == 'curated':
            check_for_comments(table_comment, col_defs['col_name'],
                               col_comments)

        create_table_ddl = build_create_table_ddl(table_name,
                                                  schema,
                                                  col_defs,
                                                  col_comments,
                                                  table_comment,
                                                  storage_type,
                                                  partitioned_by=None,
                                                  full_path=full_path)
        handle_existing_table(table_name, schema, overwrite)
        hive.run_lake_query(create_table_ddl)
        insert_overwrite_command = (
            'INSERT OVERWRITE TABLE {}.{} SELECT * FROM {}.{}').format(
                schema, table_name, temp_schema, view_name)
        hive.run_lake_query(insert_overwrite_command, complex_join=True)
    except Exception as e:
        # If an error occurred at any point in the above and a source table
        # was renamed, restore its original name
        if table_renamed:
            hive.run_lake_query(
                table_rename_template.format(schema, source_table_name, schema,
                                             table_name))
        raise e

    finally:
        # Regardless of success or failure of the above, we want to
        # drop the temporary view if it was created
        hive.run_lake_query('DROP VIEW IF EXISTS {}.{}'.format(
            temp_schema, view_name))

    # If the source table had to be renamed, it would not have been dropped
    # by the call to 'handle_existing_table', so we have to handle it here.
    # If it still shares a storage location with the new table, we just
    # drop it. Otherwise, we nuke it.
    if table_renamed:
        source_metadata = meta.get_table_metadata(source_table_name, schema)
        source_path = meta.ensure_path_ends_w_slash(source_metadata['path'])
        if source_path == path:
            hive.run_lake_query('DROP TABLE {}.{}'.format(
                schema, source_table_name))
        else:
            __nuke_table(source_table_name, schema)
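A standalone sketch of the self-referencing rename performed above: occurrences of 'schema.table' in the SELECT statement are redirected to the temporarily renamed source table, while the captured delimiter keeps similarly named tables and trailing aliases intact. The SELECT statement is a made-up example.

import re

schema, table_name = 'experimental', 'orders'
source_table_name = table_name + '_temp_ctas_rename'
select_stmt = ('SELECT o.id FROM experimental.orders o '
               'JOIN experimental.orders_archive a ON o.id = a.id')
rewritten = re.sub(
    r'{}\.{}([\s,.]|$)'.format(schema, table_name),
    r'{}.{}\1'.format(schema, source_table_name), select_stmt)
print(rewritten)
# The source table reference becomes experimental.orders_temp_ctas_rename,
# while experimental.orders_archive is left untouched.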
Example #12
def build_and_run_ddl_stmt(df, table_name, schema, col_defs,
                           storage_type, bucket, path, filename,
                           col_comments=None, table_comment=None,
                           partitioned_by=None, partition_values=None,
                           auto_upload_df=True, avro_schema=None):
    """
    After preparation is performed in other calling functions,
    this function actually generates a CREATE TABLE command and runs it,
    optionally automatically uploading the DataFrame to the table as well

    Args:
        df (pd.DataFrame): The DataFrame to create the table from.
        table_name (str): The name of the table to be created
        schema (str): The name of the schema to create the table in
        col_defs (pd.DataFrame):
            A DataFrame with two columns, 'names' containing column names,
            and 'dtypes', containing a string representation of
            the column's dtype
        storage_type (str): The desired storage type of the new table
        bucket (str): Bucket containing the table's files
        path (str): Path within bucket containing the table's files
        filename (str, optional):
            Name to store the file under. Used to determine storage format.
            Can be left blank if writing to the experimental zone,
            in which case a name will be generated and storage format will
            default to Parquet
        col_comments (dict<str:str>, optional):
            Dictionary from column name keys to column descriptions.
        table_comment (str, optional): Documentation on the table's purpose
        partitioned_by (dict<str:str>,
                        collections.OrderedDict<str:str>, or
                        list<tuple<str:str>>, optional):
            Dictionary or list of tuples containing a partition name and type.
            Cannot be a vanilla dictionary if using Python version < 3.6
        partition_values (dict<str:str>, optional):
            Required if 'partitioned_by' is used and 'auto_upload_df' is True.
            Mapping from partition name to the value the DataFrame should be
            stored under
        auto_upload_df (bool, default True):
            Whether the DataFrame that the table's structure is based on
            should be automatically uploaded to the table
        avro_schema (dict, optional):
            Schema to use when writing a DataFrame to an Avro file. If not
            provided, one will be auto-generated.
    """
    # Gets settings to pass to rivet on how to write the files in a
    # Hive-readable format
    storage_settings = meta.storage_type_specs[storage_type]['settings']

    # tblproperties is for additional metadata to be provided to Hive
    # for the table. Generally, it is not needed
    tblproperties = {}

    if storage_type == 'avro':
        storage_settings, tblproperties = handle_avro_filetype(
            df, storage_settings, tblproperties, avro_schema, col_comments)

    full_path = '/'.join([bucket, path])
    create_table_ddl = build_create_table_ddl(table_name, schema, col_defs,
                                              col_comments, table_comment,
                                              storage_type, partitioned_by,
                                              full_path, tblproperties)
    inform(create_table_ddl)
    hive.run_lake_query(create_table_ddl, engine='hive')

    if partitioned_by and partition_values:
        path += add_partition(table_name, schema, partition_values)
    path += filename

    if auto_upload_df:
        # Creating the table doesn't populate it with data. Unless
        # auto_upload_df == False, we now need to write the DataFrame to a
        # file and upload it to S3
        _ = rv.write(df, path, bucket,
                     show_progressbar=False, **storage_settings)
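A small sketch of how the final object key is assembled above when the table is partitioned: the table path, the relative partition path returned by add_partition, and the filename are simply concatenated. All values shown are hypothetical.

path = 'tables/orders/'
partition_path = '2021-01-01/us-east-1/'  # default path built by add_partition
filename = 'part-0000.parquet'            # hypothetical filename
print(path + partition_path + filename)
# tables/orders/2021-01-01/us-east-1/part-0000.parquet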
Example #13
def insert_into_orc_table(table_name,
                          schema,
                          source_table_name,
                          source_schema,
                          partition_values=None,
                          hive_functions=None,
                          matching_partitions=False,
                          allow_hive_reserved_words=False,
                          overwrite=False):
    """
    Inserts all the values in a particular table into its corresponding ORC
    table. We can't simply do a SELECT *, because that will include partition
    columns, which cannot be included in an INSERT statement (since they're
    technically metadata, rather than part of the dataset itself)

    Args:
        table_name (str): The ORC table to be inserted into
        schema (str): The schema that the destination table is stored in
        source_table_name (str): The table to insert from
        source_schema (str): The schema that the source table is stored in
        partition_values (dict<str:str>, Optional):
            The partition in the destination table to insert into
        hive_functions (optional):
            Hive functions to apply to columns when selecting from the source
            table; passed through to 'insert_hive_fns_into_col_names'
        matching_partitions (bool, default False):
            Whether the partition being inserted to has a matching partition
            in the source table. Used for inserting subsets of a source table
            rather than the entire thing.
        allow_hive_reserved_words (bool, default False):
            By default, Hive will not allow column names to use reserved
            words. This can be bypassed by wrapping the column name in
            backticks (`). Doing so is discouraged, but if the source table
            makes use of such columns and cannot easily be changed, setting
            this to True will allow inserting from it anyway
        overwrite (bool, default False):
            Whether the insert type should be 'INTO' or 'OVERWRITE'
    """
    # List of reserved words in Hive that could reasonably be used as column
    # names. This list may expand with time
    hive_reserved_words = ['date', 'time', 'timestamp', 'order', 'primary']

    # This excludes partition columns, which is the desired behavior
    col_names = meta.get_table_column_order(table_name, schema)
    partition_strings = (' PARTITION ({})'.format(
        build_partition_strings(partition_values)) if partition_values else '')

    for i in range(len(col_names)):
        if col_names[i] in hive_reserved_words:
            if allow_hive_reserved_words:
                col_names[i] = '`{}`'.format(col_names[i])
            else:
                raise ValueError(
                    'Source table has columns named with Hive reserved words. '
                    'If this is unavoidable, set "allow_hive_reserved_words" '
                    'to True.')

    if overwrite:
        insert_type = 'OVERWRITE'
    else:
        insert_type = 'INTO'

    if hive_functions is not None:
        col_names = insert_hive_fns_into_col_names(col_names, hive_functions)

    where_clause = ''
    if matching_partitions:
        where_clause = '\nWHERE ' + ' AND '.join([
            'source_table.{}="{}"'.format(partition_key, partition_value)
            for partition_key, partition_value in partition_values.items()
        ])
    insert_command = (
        'INSERT {} TABLE {}.{}{}\n'.format(insert_type, schema, table_name,
                                           partition_strings) + 'SELECT\n'
        '    {}\n'.format(',\n    '.join(col_names)) +
        'FROM {}.{} source_table'.format(source_schema, source_table_name) +
        '{}'.format(where_clause))

    inform(insert_command)

    hive.run_lake_query(insert_command)
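A standalone sketch of the reserved-word handling and the assembled INSERT command, using hypothetical column and table names:

hive_reserved_words = ['date', 'time', 'timestamp', 'order', 'primary']
col_names = ['id', 'order', 'amount']

# With allow_hive_reserved_words=True, reserved names are backtick-quoted
col_names = ['`{}`'.format(col) if col in hive_reserved_words else col
             for col in col_names]

insert_command = (
    'INSERT OVERWRITE TABLE experimental.orders_orc\n' + 'SELECT\n'
    '    {}\n'.format(',\n    '.join(col_names)) +
    'FROM experimental.orders source_table')
print(insert_command)
# INSERT OVERWRITE TABLE experimental.orders_orc
# SELECT
#     id,
#     `order`,
#     amount
# FROM experimental.orders source_table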
def flash_update_table_from_df(df,
                               table_name,
                               schema=None,
                               dtypes=None,
                               table_comment=None,
                               col_comments=None,
                               timezones=None,
                               copy_df=True):
    """
    Overwrites a single-file table with minimal downtime.
    Similar to 'create_table_from_df' with overwrite=True, but only usable
    when the table consists of a single underlying file

    Args:
        df (pd.DataFrame): The DataFrame to create the table from.
        table_name (str): The name of the table to be created
        schema (str): The name of the schema to create the table in
        dtypes (dict<str:str>, optional): A dictionary specifying dtypes for
            specific columns to be cast to prior to uploading.
        table_comment (str, optional): Documentation on the table's purpose
        col_comments (dict<str:str>, optional):
            Dictionary from column name keys to column descriptions.
        timezones (dict<str, str>):
            Dictionary from datetime columns to the timezone they
            represent. If the column is timezone-naive, it will have the
            timezone added to its metadata, leaving the times themselves
            unmodified. If the column is timezone-aware and is in a different
            timezone than the one that is specified, the column's timezone
            will be converted, modifying the original times.
        copy_df (bool):
            Whether the operations performed on df should be performed on the
            original or a copy. Keep in mind that if this is set to False,
            the original df passed in will be modified as well - twice as
            memory efficient, but may be undesirable if the df is needed
            again later
    """
    # Less memory efficient, but prevents modification of original df
    if copy_df:
        df = df.copy()

    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    if schema == 'curated':
        check_for_comments(table_comment, df.columns, col_comments)
        if not os.getenv('HC_PROD_ENV'):
            raise ValueError(
                'Flash update functionality is only available in '
                'the experimental zone. Contact a lake administrator if '
                'modification of a non-experimental table is needed.')

    table_exists = check.table_existence(table_name, schema)
    if not table_exists:
        raise ValueError('Table {}.{} does not exist.'.format(
            schema, table_name))

    table_metadata = meta.get_table_metadata(table_name, schema)
    bucket = table_metadata['bucket']
    path = meta.ensure_path_ends_w_slash(table_metadata['path'])

    objects_present = rv.list_objects(path, bucket)

    if len(objects_present) > 1:
        # Flash updates are supposed to feel as close to atomic as possible.
        # Multi-file operations interfere with this.
        raise ValueError('Flash update functionality is only available on '
                         'tables that only consist of one underlying file.')
    if meta.is_partitioned_table(table_name, schema):
        # Difficult to deterministically restore partitions based on new data
        raise ValueError('Flash update functionality is not available on '
                         'partitioned tables.')

    if objects_present:
        filename = objects_present[0]
    else:
        filename = meta.gen_filename_if_allowed(schema)
    path += filename

    storage_type = get_storage_type_from_filename(filename)
    df, col_defs = prep_df_and_col_defs(df, dtypes, timezones, schema,
                                        storage_type)

    # Gets settings to pass to rivet on how to write the files in a
    # Hive-readable format
    storage_settings = meta.storage_type_specs[storage_type]['settings']

    # tblproperties is for additional metadata to be provided to Hive
    # for the table. Generally, it is not needed
    tblproperties = {}

    if storage_type == 'avro':
        storage_settings, tblproperties = handle_avro_filetype(
            df, storage_settings, tblproperties, col_comments)

    full_path = '/'.join([bucket, path])
    create_table_ddl = build_create_table_ddl(table_name,
                                              schema,
                                              col_defs,
                                              col_comments,
                                              table_comment,
                                              storage_type,
                                              partitioned_by=None,
                                              full_path=full_path,
                                              tblproperties=tblproperties)
    inform(create_table_ddl)
    drop_table_stmt = 'DROP TABLE IF EXISTS {}.{}'.format(schema, table_name)

    # Creating the table doesn't populate it with data. We now need to write
    # the DataFrame to a file and upload it to S3
    _ = rv.write(df, path, bucket, show_progressbar=False, **storage_settings)
    hive.run_lake_query(drop_table_stmt, engine='hive')
    hive.run_lake_query(create_table_ddl, engine='hive')