Example #1
def add_partition(table_name, schema, partition_values, partition_path=None):
    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    partition_strings = build_partition_strings(partition_values)
    if partition_path is None:
        # Datetimes cast to str will by default provide an invalid path
        partition_path = '/'.join([
            val if not isinstance(val, datetime) else str(val.date())
            for val in partition_values.values()
        ]) + '/'
    else:
        partition_path = meta.validate_table_path(partition_path, table_name)

    if not check.partition_existence(table_name, schema, partition_values):
        add_partition_query = ('ALTER TABLE {}.{} ADD IF NOT EXISTS '
                               'PARTITION ({}) LOCATION \'{}\''.format(
                                   schema, table_name, partition_strings,
                                   partition_path))
        inform(add_partition_query)

        hive.run_lake_query(add_partition_query, engine='hive')
    else:
        logging.warning('Partition ({}) already exists in table.'.format(
            partition_strings))

    return partition_path
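
build_partition_strings is not shown above; a minimal, self-contained sketch of the clause and path assembly this function relies on might look like the following. The helper's exact quoting behavior is an assumption, and the names are placeholders.

from datetime import datetime

def build_partition_strings(partition_values):
    # Assumed behavior: render {'region': 'us', ...} as
    # "region='us', ..." for use inside PARTITION (...)
    return ', '.join("{}='{}'".format(col, val)
                     for col, val in partition_values.items())

partition_values = {'region': 'us',
                    'report_date': datetime(2021, 1, 15)}
print(build_partition_strings(partition_values))
# region='us', report_date='2021-01-15 00:00:00'

# Path assembly mirroring the default branch above: datetimes are
# truncated to their date so the resulting S3 path stays valid
print('/'.join(
    val if not isinstance(val, datetime) else str(val.date())
    for val in partition_values.values()) + '/')
# us/2021-01-15/
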
Example #2
def describe_table(table_name, schema=None,
                   include_metadata=False):
    """
    Retrieves the description of a specific table in Hive

    Args:
        table_name (str): The name of the table to be queried
        schema (str): The name of the schema to search for the table in
        include_metadata (bool):
            Whether the returned DataFrame should contain just column names,
            types, and comments, or more detailed information such as
            storage location and type, partitioning metadata, etc.

    Returns:
        desc (pd.DataFrame): A dataframe containing descriptive information
            on the specified table
    """
    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    # Presto does not support the 'FORMATTED' keyword, so
    # we're locking the engine for 'DESCRIBE' queries to Hive
    desc_query = 'DESCRIBE {formatted}{schema}.{table_name}'.format(
        formatted=('FORMATTED ' if include_metadata else ''),
        schema=schema,
        table_name=table_name)
    desc = hive.run_lake_query(desc_query, engine='hive')

    if include_metadata:
        desc = desc.loc[1:].reset_index(drop=True)
    return desc
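
The statements this function sends to Hive are easy to preview; the schema and table names below are placeholders.

# Preview of the DESCRIBE statements describe_table assembles, with and
# without the metadata-bearing 'FORMATTED' keyword
for include_metadata in (False, True):
    print('DESCRIBE {formatted}{schema}.{table_name}'.format(
        formatted=('FORMATTED ' if include_metadata else ''),
        schema='experimental',
        table_name='my_table'))
# DESCRIBE experimental.my_table
# DESCRIBE FORMATTED experimental.my_table
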
Example #3
def analyze_partitions(table_name, schema=None, partition_values=None):
    """
    Convenience function for doing partition-level analysis for a
    partitioned table

    Args:
        table_name (str): The name of the table to analyze
        schema (str): The schema that contains the table
        partition_values (dict<str:str>):
            Dictionary from partition colname to partition value, used
            to filter partitions. See documentation at top of file for
            assembly instructions
    """
    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    if partition_values is None:
        partition_values = {}

    if not meta.is_partitioned_table(table_name, schema):
        raise TypeError(
            ('The table {}.{} is not partitioned. Use the '
             '"analyze_table" function instead.').format(schema, table_name))

    partition_clause = get_partition_clause(table_name, schema,
                                            partition_values)

    build_and_run_analysis_command(table_name,
                                   schema,
                                   partition_clause=partition_clause)
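
get_partition_clause and build_and_run_analysis_command are not shown here; the statement they presumably assemble follows standard Hive syntax, sketched below with placeholder names.

# Hedged sketch of the Hive command the helpers above presumably build;
# the exact clause-assembly logic is an assumption
schema, table_name = 'experimental', 'events'
partition_clause = " PARTITION (event_date='2021-01-15')"
print('ANALYZE TABLE {}.{}{} COMPUTE STATISTICS'.format(
    schema, table_name, partition_clause))
# ANALYZE TABLE experimental.events PARTITION (event_date='2021-01-15') COMPUTE STATISTICS
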
Example #4
def analyze_columns(table_name, schema=None, columns=None):
    """
    Convenience function for doing column-level analysis for a
    non-partitioned table

    Args:
        table_name (str): The name of the table to analyze
        schema (str): The schema that contains the table
        columns (list<str>):
            The columns the user wants statistics to be computed for.
            See documentation at top of file for assembly instructions
    """
    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    if columns is None:
        columns = []

    if meta.is_partitioned_table(table_name, schema):
        raise TypeError(
            ('The table {}.{} is partitioned. Use the '
             '"analyze_partition_columns" function instead.').format(
                 schema, table_name))
    columns_clause = get_columns_clause(columns)

    build_and_run_analysis_command(table_name,
                                   schema,
                                   columns_clause=columns_clause)
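
Likewise, get_columns_clause presumably produces a 'FOR COLUMNS' suffix, which in standard Hive syntax switches the statement to column-level statistics. A hedged, self-contained sketch with placeholder names:

# The clause wording below is an assumption based on standard
# Hive ANALYZE syntax
schema, table_name = 'experimental', 'users'
columns_clause = ' FOR COLUMNS user_id, signup_date'
print('ANALYZE TABLE {}.{} COMPUTE STATISTICS{}'.format(
    schema, table_name, columns_clause))
# ANALYZE TABLE experimental.users COMPUTE STATISTICS FOR COLUMNS user_id, signup_date
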
Example #5
def analyze_table(table_name, schema=None):
    """
    Convenience function for doing table-level analysis for a
    non-partitioned table. Cannot be used on a partitioned table, which
    must have each partition analyzed individually

    Args:
        table_name (str): The name of the table to analyze
        schema (str): The schema that contains the table
    """
    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    if meta.is_partitioned_table(table_name, schema):
        raise TypeError(('The table {}.{} is partitioned. Use the '
                         '"analyze_partitions" function instead.').format(
                             schema, table_name))
    build_and_run_analysis_command(table_name, schema)
Example #6
def ctas(select_stmt,
         table_name,
         schema=None,
         path=None,
         table_comment=None,
         col_comments=None,
         storage_type='parquet',
         overwrite=False):
    """
    Emulates the standard SQL 'CREATE TABLE AS SELECT' syntax.

    Under the hood, this function creates a view using the provided SELECT
    statement, and then performs an INSERT OVERWRITE from that view into the
    new table.

    Because this function uses INSERT OVERWRITE, there are considerable
    protections within this function to prevent accidental data loss.
    When an INSERT OVERWRITE command is done on an external table, all of the
    files in S3 at that table's path are deleted. If the table's path is,
    for example, the root of a bucket, there could be substantial data loss.
    As a result, we do our best to smartly assign table paths and prevent
    large-scale object deletion.

    Args:
        select_stmt (str):
            The select statement to build a new table from
        table_name (str):
            The name of the table to be created
        schema (str):
            The schema the new table should be created in
        path (str):
            The path that the new table's underlying files will be stored at.
            If left unset, it will be set to a folder with the same name
            as the table, which is generally recommended
        table_comment (str, optional): Documentation on the table's purpose
        col_comments (dict<str:str>, optional):
            Dictionary from column name keys to column descriptions.
        storage_type (str):
            The desired storage type of the new table
        overwrite (bool):
            Whether to overwrite or fail if a table already exists with
            the intended name of the new table in the selected schema
    """
    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    if schema == 'curated':
        check_for_allowed_overwrite(overwrite)
        if not os.getenv('HC_PROD_ENV'):
            raise ValueError(
                'Non-production CTAS functionality is currently disabled in '
                'the curated zone. Contact Data Engineering for '
                'further information.')

    bucket = schema_to_zone_bucket_map[schema]
    path = meta.validate_table_path(path, table_name)

    full_path = '/'.join([bucket, path])

    # If this function is used to overwrite a table that is being selected
    # from, we need to make sure that the original table is not dropped before
    # selecting from it (which happens at execution time of the INSERT).
    # In this case, we will temporarily rename the table. If any section of
    # the remainder of this function fails before the INSERT, the table
    # will be restored to its original name
    table_rename_template = 'ALTER TABLE {}.{} RENAME TO {}.{}'
    if '{}.{}'.format(schema, table_name) in select_stmt:
        if overwrite:
            source_table_name = table_name + '_temp_ctas_rename'
            select_stmt = re.sub(
                r'{}\.{}([\s,.]|$)'.format(schema, table_name),
                r'{}.{}\1'.format(schema, source_table_name), select_stmt)
            hive.run_lake_query(
                table_rename_template.format(schema, table_name, schema,
                                             source_table_name))
            table_renamed = True
        else:
            raise ValueError(
                'CTAS functionality must have \'overwrite\' set to True '
                'in order to overwrite one of the source tables of the '
                'SELECT statement.')
    # No rename needed
    else:
        source_table_name = table_name
        table_renamed = False

    try:
        temp_schema = 'experimental'
        view_name = '{}_temp_ctas_view'.format(table_name)
        create_view_stmt = 'CREATE VIEW {}.{} AS {}'.format(
            temp_schema, view_name, select_stmt)
        hive.run_lake_query(create_view_stmt)

        # If we DESCRIBE the view, we can get a list of all the columns
        # in the new table for building DDL and adding comments.
        # Useful in queries that involve JOINing, so you don't have to build
        # that column list yourself.
        col_defs = describe_table(view_name, schema=temp_schema)

        if schema == 'curated':
            check_for_comments(table_comment, col_defs['col_name'],
                               col_comments)

        create_table_ddl = build_create_table_ddl(table_name,
                                                  schema,
                                                  col_defs,
                                                  col_comments,
                                                  table_comment,
                                                  storage_type,
                                                  partitioned_by=None,
                                                  full_path=full_path)
        handle_existing_table(table_name, schema, overwrite)
        hive.run_lake_query(create_table_ddl)
        insert_overwrite_command = (
            'INSERT OVERWRITE TABLE {}.{} SELECT * FROM {}.{}').format(
                schema, table_name, temp_schema, view_name)
        hive.run_lake_query(insert_overwrite_command, complex_join=True)
    except Exception as e:
        # If an error occurred at any point in the above and a source table
        # was renamed, restore its original name
        if table_renamed:
            hive.run_lake_query(
                table_rename_template.format(schema, source_table_name, schema,
                                             table_name))
        raise e

    finally:
        # Regardless of success or failure of the above, we want to
        # drop the temporary view if it was created
        hive.run_lake_query('DROP VIEW IF EXISTS {}.{}'.format(
            temp_schema, view_name))

    # If the source table had to be renamed, it would not have been dropped
    # by the call to 'handle_existing_table', so we have to handle it here.
    # If it still shares a storage location with the new table, we just
    # drop it. Otherwise, we nuke it.
    if table_renamed:
        source_metadata = meta.get_table_metadata(source_table_name, schema)
        source_path = meta.ensure_path_ends_w_slash(source_metadata['path'])
        if source_path == path:
            hive.run_lake_query('DROP TABLE {}.{}'.format(
                schema, source_table_name))
        else:
            __nuke_table(source_table_name, schema)
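
The temporary-rename rewrite near the top of ctas can be exercised in isolation; the schema, table, and query below are placeholders.

import re

# Rewrites references to the CTAS target so the SELECT reads from the
# temporarily renamed source table instead
schema, table_name = 'experimental', 'events'
source_table_name = table_name + '_temp_ctas_rename'
select_stmt = ('SELECT user_id, COUNT(*) AS n '
               'FROM experimental.events GROUP BY user_id')
print(re.sub(
    r'{}\.{}([\s,.]|$)'.format(schema, table_name),
    r'{}.{}\1'.format(schema, source_table_name), select_stmt))
# SELECT user_id, COUNT(*) AS n FROM experimental.events_temp_ctas_rename GROUP BY user_id
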
Example #7
def append_df_to_table(df,
                       table_name,
                       schema=None,
                       dtypes=None,
                       filename=None,
                       overwrite_file=False,
                       timezones=None,
                       copy_df=True,
                       partition_values=None,
                       require_identical_columns=True,
                       avro_schema=None,
                       hive_functions=None):
    """
    Uploads a dataframe to S3 and appends it to an already existing table.
    Queries existing table metadata to determine where the appended file
    should be written and which storage format it should be written in.

    Args:
        df (pd.DataFrame): The DataFrame to append to the table
        table_name (str): The name of the table to append to
        schema (str, optional): Name of the schema containing the table
        dtypes (dict<str:str>, optional): A dictionary specifying dtypes for
            specific columns to be cast to prior to uploading.
        filename (str, optional):
            Name to store the file under. Can be left blank if writing to the
            experimental zone, in which case a name will be generated.
        overwrite_file (bool):
            Whether to overwrite the file if a file with a matching name
            to "filename" is already present in S3.
        timezones (dict<str, str>):
            Dictionary from datetime columns to the timezone they
            represent. If the column is timezone-naive, it will have the
            timezone added to its metadata, leaving the times themselves
            unmodified. If the column is timezone-aware and is in a different
            timezone than the one that is specified, the column's timezone
            will be converted, modifying the original times.
        copy_df (bool):
            Whether the operations performed on df should be performed on the
            original or a copy. Keep in mind that if this is set to False,
            the original df passed in will be modified as well - twice as
            memory efficient, but may be undesirable if the df is needed
            again later
        partition_values (dict<str:str>, optional):
            Dictionary from partition keys to the values to store the
            dataframe under. If no partition exists at those values,
            it will be created.
        require_identical_columns (bool, default True):
            Whether extra/missing columns should be allowed and handled, or
            if they should lead to an error being raised.
        avro_schema (dict, optional):
            Schema to use when writing a DataFrame to an Avro file. If not
            provided, one will be auto-generated.
        hive_functions (dict<str:str> or dict<str:dict>):
            Specifications on what hive functions to apply to which columns.
            Only usable when working with ORC tables. See 'orc.py'
            for additional documentation
    """
    # Less memory efficient, but prevents original DataFrame from modification
    if copy_df:
        df = df.copy()

    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    table_exists = check.table_existence(table_name, schema)
    if not table_exists:
        raise ValueError(
            'Table \'{schema}.{table_name}\' does not exist. '.format(
                schema=schema, table_name=table_name))

    # Gets the table's S3 location and storage type from metadata
    # We need to know where to write the data to be appended, and
    # the format to write it in
    table_metadata = meta.get_table_metadata(table_name, schema)

    bucket = table_metadata['bucket']
    path = table_metadata['path']
    storage_type = table_metadata['storage_type']
    if filename is None:
        filename = meta.gen_filename_if_allowed(schema, storage_type)
    if not filename.endswith(storage_type):
        raise ValueError(
            'The type specified in the filename does not match the '
            'filetype of the table.')
    path = meta.ensure_path_ends_w_slash(path)

    df = dtype_mapping.special_dtype_handling(df,
                                              spec_dtypes=dtypes,
                                              spec_timezones=timezones,
                                              schema=schema)

    # Columns being in the same order as the table is either
    # mandatory or highly advisable, depending on storage format.
    df = reorder_columns_for_appending(df, table_name, schema,
                                       partition_values, storage_type,
                                       require_identical_columns)

    # If the data is to be appended into a partition, we must get the
    # subpath of the partition if it exists, or create
    # the partition if it doesn't
    if partition_values:
        path += add_partition(table_name, schema, partition_values)

    if storage_type == 'orc':
        append_df_to_orc_table(df, table_name, schema, bucket, path, filename,
                               partition_values, hive_functions)

    else:
        path += filename

        if rv.exists(path, bucket) and not overwrite_file:
            raise KeyError('A file already exists at s3://{}/{} and would '
                           'be overwritten by this operation. Specify a '
                           'different filename to proceed.'.format(
                               bucket, path))

        storage_settings = meta.storage_type_specs[storage_type]['settings']
        if avro_schema is not None:
            storage_settings['schema'] = avro_schema
        rv.write(df, path, bucket, show_progressbar=False, **storage_settings)
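
A hedged usage sketch for append_df_to_table; the table, schema, and partition names are placeholders, and the call naturally requires the Hive/S3 environment this module is built around.

import pandas as pd

# Appends the dataframe as a new file under the table's existing S3
# path, creating the 2021-01-15 partition if it does not already exist
df = pd.DataFrame({'user_id': [1, 2], 'clicks': [10, 3]})
append_df_to_table(df,
                   table_name='click_counts',
                   schema='experimental',
                   partition_values={'event_date': '2021-01-15'})
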
Example #8

def flash_update_table_from_df(df,
                               table_name,
                               schema=None,
                               dtypes=None,
                               table_comment=None,
                               col_comments=None,
                               timezones=None,
                               copy_df=True):
    """
    Overwrites a single-file table with minimal table downtime.
    Similar to 'create_table_from_df' with overwrite=True, but only usable
    when the table consists of a single underlying file

    Args:
        df (pd.DataFrame): The DataFrame to create the table from.
        table_name (str): The name of the table to be created
        schema (str): The name of the schema to create the table in
        dtypes (dict<str:str>, optional): A dictionary specifying dtypes for
            specific columns to be cast to prior to uploading.
        table_comment (str, optional): Documentation on the table's purpose
        col_comments (dict<str:str>, optional):
            Dictionary from column name keys to column descriptions.
        timezones (dict<str, str>):
            Dictionary from datetime columns to the timezone they
            represent. If the column is timezone-naive, it will have the
            timezone added to its metadata, leaving the times themselves
            unmodified. If the column is timezone-aware and is in a different
            timezone than the one that is specified, the column's timezone
            will be converted, modifying the original times.
        copy_df (bool):
            Whether the operations performed on df should be performed on the
            original or a copy. Keep in mind that if this is set to False,
            the original df passed in will be modified as well - twice as
            memory efficient, but may be undesirable if the df is needed
            again later
    """
    # Less memory efficient, but prevents modification of original df
    if copy_df:
        df = df.copy()

    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    if schema == 'curated':
        check_for_comments(table_comment, df.columns, col_comments)
        if not os.getenv('HC_PROD_ENV'):
            raise ValueError(
                'Flash update functionality is only available in '
                'the experimental zone. Contact a lake administrator if '
                'modification of a non-experimental table is needed.')

    table_exists = check.table_existence(table_name, schema)
    if not table_exists:
        raise ValueError('Table {}.{} does not exist.'.format(
            schema, table_name))

    table_metadata = meta.get_table_metadata(table_name, schema)
    bucket = table_metadata['bucket']
    path = meta.ensure_path_ends_w_slash(table_metadata['path'])

    objects_present = rv.list_objects(path, bucket)

    if len(objects_present) > 1:
        # Flash updates are supposed to feel as close to atomic as possible.
        # Multi-file operations interfere with this.
        raise ValueError('Flash update functionality is only available on '
                         'tables that only consist of one underlying file.')
    if meta.is_partitioned_table(table_name, schema):
        # Difficult to deterministically restore partitions based on new data
        raise ValueError('Flash update functionality is not available on '
                         'partitioned tables.')

    if objects_present:
        filename = objects_present[0]
    else:
        filename = meta.gen_filename_if_allowed(schema)
    path += filename

    storage_type = get_storage_type_from_filename(filename)
    df, col_defs = prep_df_and_col_defs(df, dtypes, timezones, schema,
                                        storage_type)

    # Gets settings to pass to rivet on how to write the files in a
    # Hive-readable format
    storage_settings = meta.storage_type_specs[storage_type]['settings']

    # tblproperties is for additional metadata to be provided to Hive
    # for the table. Generally, it is not needed
    tblproperties = {}

    if storage_type == 'avro':
        storage_settings, tblproperties = handle_avro_filetype(
            df, storage_settings, tblproperties, col_comments)

    full_path = '/'.join([bucket, path])
    create_table_ddl = build_create_table_ddl(table_name,
                                              schema,
                                              col_defs,
                                              col_comments,
                                              table_comment,
                                              storage_type,
                                              partitioned_by=None,
                                              full_path=full_path,
                                              tblproperties=tblproperties)
    inform(create_table_ddl)
    drop_table_stmt = 'DROP TABLE IF EXISTS {}.{}'.format(schema, table_name)

    # Creating the table doesn't populate it with data. We now need to write
    # the DataFrame to a file and upload it to S3
    _ = rv.write(df, path, bucket, show_progressbar=False, **storage_settings)
    hive.run_lake_query(drop_table_stmt, engine='hive')
    hive.run_lake_query(create_table_ddl, engine='hive')
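
A hedged usage sketch for flash_update_table_from_df; it assumes the target table already exists, is unpartitioned, and is backed by exactly one file. Names and values are placeholders.

import pandas as pd

# Replaces the table's single underlying file with a fresh snapshot,
# then recreates the table definition around it
refreshed = pd.DataFrame({'metric': ['daily_active_users'],
                          'value': [1234]})
flash_update_table_from_df(refreshed,
                           table_name='latest_metrics',
                           schema='experimental',
                           table_comment='Latest headline metric snapshot')
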
Example #9

def create_table_from_df(df,
                         table_name,
                         schema=None,
                         dtypes=None,
                         path=None,
                         filename=None,
                         table_comment=None,
                         col_comments=None,
                         timezones=None,
                         copy_df=True,
                         partitioned_by=None,
                         partition_values=None,
                         overwrite=False,
                         auto_upload_df=True,
                         avro_schema=None,
                         hive_functions=None):
    """
    Uploads a dataframe to S3 and establishes it as a new table in Hive.

    Args:
        df (pd.DataFrame): The DataFrame to create the table from.
        table_name (str): The name of the table to be created
        schema (str): The name of the schema to create the table in
        dtypes (dict<str:str>, optional): A dictionary specifying dtypes for
            specific columns to be cast to prior to uploading.
        path (str, optional): Folder in S3 to store all files for this table in
        filename (str, optional):
            Name to store the file under. Used to determine storage format.
            Can be left blank if writing to the experimental zone,
            in which case a name will be generated and storage format will
            default to Parquet
        table_comment (str, optional): Documentation on the table's purpose
        col_comments (dict<str:str>, optional):
            Dictionary from column name keys to column descriptions.
        timezones (dict<str, str>):
            Dictionary from datetime columns to the timezone they
            represent. If the column is timezone-naive, it will have the
            timezone added to its metadata, leaving the times themselves
            unmodified. If the column is timezone-aware and is in a different
            timezone than the one that is specified, the column's timezone
            will be converted, modifying the original times.
        copy_df (bool):
            Whether the operations performed on df should be performed on the
            original or a copy. Keep in mind that if this is set to False,
            the original df passed in will be modified as well - twice as
            memory efficient, but may be undesirable if the df is needed
            again later
        partitioned_by (dict<str:str>,
                        collections.OrderedDict<str:str>, or
                        list<tuple<str:str>>, optional):
            Dictionary or list of tuples containing a partition name and type.
            Cannot be a vanilla dictionary if using Python version < 3.7
        partition_values (dict<str:str>):
            Required if 'partitioned_by' is used and 'auto_upload_df' is True.
            Dictionary from partition names to the values to store
            the dataframe under
        overwrite (bool, default False):
            Whether to overwrite the current table if one is already present
            at the specified name
        auto_upload_df (bool, default True):
            Whether the df that the table's structure will be based off of
            should be automatically uploaded to the table
        avro_schema (dict, optional):
            Schema to use when writing a DataFrame to an Avro file. If not
            provided, one will be auto-generated.
        hive_functions (dict<str:str> or dict<str:dict>):
            Specifications on what hive functions to apply to which columns.
            Only usable when working with ORC tables. See 'orc.py'
            for additional documentation
    """
    # Less memory efficient, but prevents modification of original df
    if copy_df:
        df = df.copy()

    table_name, schema = meta.prep_schema_and_table(table_name, schema)

    if partitioned_by:
        if isinstance(partitioned_by, dict) and not confirm_ordered_dicts():
            raise TypeError(
                'The order of "partitioned_by" must be preserved, and '
                'dictionaries are not guaranteed to be order-preserving '
                'in Python versions < 3.7. Use a list of tuples or an '
                'OrderedDict, or upgrade your Python version.')
        elif isinstance(partitioned_by, list):
            partitioned_by = OrderedDict(partitioned_by)
        if auto_upload_df and not partition_values:
            raise ValueError(
                'If using "partitioned_by" and "auto_upload_df" is True, '
                'values must be passed to "partition_values" as well.')

    if schema == 'curated':
        check_for_comments(table_comment, df.columns, col_comments)
        check_for_allowed_overwrite(overwrite)

    handle_existing_table(table_name, schema, overwrite)

    if filename is None:
        filename = meta.gen_filename_if_allowed(schema)
    path = meta.validate_table_path(path, table_name)

    bucket = schema_to_zone_bucket_map[schema]

    if rv.list_objects(path, bucket):
        raise KeyError((
            'Files are already present in s3://{}/{}. Creation of a new table '
            'requires a dedicated, empty folder. Either specify a different '
            'path for the table or ensure the directory is empty before '
            'attempting table creation.').format(bucket, path))

    storage_type = get_storage_type_from_filename(filename)
    df, col_defs = prep_df_and_col_defs(df, dtypes, timezones, schema,
                                        storage_type)

    if storage_type == 'orc' and auto_upload_df:
        create_orc_table_from_df(df, table_name, schema, col_defs, bucket,
                                 path, filename, col_comments, table_comment,
                                 partitioned_by, partition_values,
                                 hive_functions)
    else:
        build_and_run_ddl_stmt(df, table_name, schema, col_defs, storage_type,
                               bucket, path, filename, col_comments,
                               table_comment, partitioned_by, partition_values,
                               auto_upload_df, avro_schema)
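
Finally, a hedged usage sketch for create_table_from_df with partitioning; every name and value is a placeholder, and partitioned_by is passed as a list of tuples so that partition order is preserved on any Python version.

import pandas as pd

# Creates a new partitioned Parquet table from the dataframe and uploads
# the data under the specified partition values
df = pd.DataFrame({'user_id': [1, 2],
                   'signup_source': ['web', 'mobile']})
create_table_from_df(df,
                     table_name='signups',
                     schema='experimental',
                     filename='signups_initial.parquet',
                     table_comment='User signups by source',
                     partitioned_by=[('signup_year', 'string'),
                                     ('signup_month', 'string')],
                     partition_values={'signup_year': '2021',
                                       'signup_month': '01'})
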