def test_iam_user(self):
    '''Verify that iam_user will assume the value of ec2_user when run on an ec2 server'''
    sh = SessionHelper(region='cheeseburger',
                       cluster_id='cheeseburger_id',
                       host='cheeseburger_host',
                       port='cheeseburger_port',
                       db_name='cheeseburger_params',
                       ec2_user='******')
    # Fake is_ec2_flag to be True
    sh.is_ec2_flag = True
    sh.set_iam_user()
    # Fixed: use == for string comparison. `is` tests object identity and
    # only ever passed via CPython string interning; it raises a
    # SyntaxWarning on modern Python.
    assert sh.iam_user == 'cheeseburger_user'
def test_no_ec2(self):
    '''Verify that False is returned if not run on an ec2 server'''
    is_ec2 = SessionHelper._is_ec2(self)
    assert is_ec2 is False
    # Also verify the ec2 flag evaluates to False on a freshly
    # constructed helper when not running on an ec2 server.
    helper = SessionHelper(region='cheeseburger',
                           cluster_id='cheeseburger_id',
                           host='cheeseburger_host',
                           port='cheeseburger_port',
                           db_name='cheeseburger_params',
                           ec2_user='******')
    assert helper.is_ec2_flag is False
def create_table(table_name: str, schema_name: str, columns: dict, partitions: dict, path: str, session_helper: SessionHelper):
    """ Creates an external table in AWS Redshift Spectrum. The table will be
    named schema_name.table_name and belong to the (existing) Redshift db db_name.

    Args:
        table_name: name of created table.
            NOTE: if a table of this name already exists in the schema,
            this function is a no-op and returns silently.
        schema_name: name of schema in redshift. Schema must be external and already exist!
        columns: Dictionary with keys corresponding to column names and values
            corresponding to pandas dtypes, excluding partition columns.
        partitions: Dict similar to columns, except ONLY with partition columns
        path: Path to published contract in s3 (excluding partitions)
        session_helper: Instance of Redshift s3parq.session_helper

    Returns:
        None
    """
    _redshift_name_validator(table_name)
    redshift_columns = _datatype_mapper(columns)
    redshift_partitions = _datatype_mapper(partitions)
    with session_helper.db_session_scope() as scope:
        if_exists_query = f'SELECT EXISTS(SELECT schemaname, tablename FROM SVV_EXTERNAL_TABLES WHERE tablename=\'{table_name}\' AND schemaname=\'{schema_name}\');'
        table_exists = scope.execute(if_exists_query).first()[0]
        if table_exists:
            # Table is already registered in Spectrum; nothing to create.
            return
        # PARTITIONED BY is only legal when partition columns exist.
        if not partitions:
            new_schema_query = (
                f'CREATE EXTERNAL TABLE {schema_name}.{table_name} {redshift_columns} '
                f'STORED AS PARQUET '
                f'LOCATION \'{path}\';')
        else:
            new_schema_query = (
                f'CREATE EXTERNAL TABLE {schema_name}.{table_name} {redshift_columns} '
                f'PARTITIONED BY {redshift_partitions} STORED AS PARQUET '
                f'LOCATION \'{path}\';')
        logger.info(f'Running query to create table: {new_schema_query}')
        scope.execute(new_schema_query)
def create_partitions(bucket: str, schema: str, table: str, filepath: str, session_helper: SessionHelper) -> None:
    ''' Generates and executes the partitioning SQL for a single file

    Args:
        bucket (str): S3 bucket where data is stored
        schema (str): name of redshift schema (must already exist)
        table (str): name of table in schema.
            Must have partitions scoped out in `CREATE TABLE ...`
        filepath (str): path to data in S3 that will be queryable by its partitions
        session_helper (SessionHelper): configured s3parq session helper
    ----
    Returns:
        None
        (Fixed: the previous annotation/docstring claimed a str query was
        returned, but the query is executed in-session and nothing is returned.)
    --------
    Example:
        Args:
            bucket = 'MyBucket'
            schema = 'MySchema'
            table = 'MyTable'
            filepath = 'path/to/data/apple=abcd/banana=1234/abcd1234.parquet'
        Executes:
            "ALTER TABLE MySchema.MyTable ADD IF NOT EXISTS PARTITION (apple='abcd' ,banana='1234') LOCATION 's3://MyBucket/path/to/data/apple=abcd';"
    '''
    partitions = _get_partitions_for_spectrum(filepath)
    formatted_partitions = _format_partition_strings_for_sql(partitions)
    path_to_data = _get_partition_location(filepath)
    with session_helper.db_session_scope() as scope:
        partitions_query = f"ALTER TABLE {schema}.{table} \
            ADD IF NOT EXISTS PARTITION ({', '.join(formatted_partitions)}) \
            LOCATION 's3://{bucket}/{path_to_data}';"
        logger.info(f'Running query to create: {partitions_query}')
        scope.execute(partitions_query)
def create_partitions(bucket: str, schema: str, table: str, filepath: str, session_helper: SessionHelper) -> None:
    ''' Executes the SQL that creates partitions on the given table for an
    individual file

    Args:
        bucket (str): S3 bucket where data is stored
        schema (str): name of redshift schema (must already exist)
        table (str): name of table in schema.
            Must have partitions scoped out in `CREATE TABLE ...`
        filepath (str): path to data in S3 that will be queryable by it's partitions
            NOTE: This is to the single parquet file, including the partitions
        session_helper (SessionHelper): a configured
            s3parq.session_helper.SessionHelper session

    Returns:
        None

    Example:
        Args:
            bucket = 'MyBucket'
            schema = 'MySchema'
            table = 'MyTable'
            filepath = 'path/to/data/apple=abcd/banana=1234/abcd1234.parquet'
            session_helper = some_configured_session
    '''
    # Derive the partition key/value pairs and their s3 prefix from the file path.
    spectrum_partitions = _get_partitions_for_spectrum(filepath)
    sql_partition_strings = _format_partition_strings_for_sql(spectrum_partitions)
    partition_prefix = _get_partition_location(filepath)
    joined_partitions = ', '.join(sql_partition_strings)
    with session_helper.db_session_scope() as scope:
        partitions_query = f"ALTER TABLE {schema}.{table} \
            ADD IF NOT EXISTS PARTITION ({joined_partitions}) \
            LOCATION 's3://{bucket}/{partition_prefix}';"
        logger.info(f'Running query to create: {partitions_query}')
        scope.execute(partitions_query)
def test_ec2_user(self):
    '''Verify that SessionHelper can accept a string as an argument for ec2 user'''
    sh = SessionHelper(region='cheeseburger',
                       cluster_id='cheeseburger_id',
                       host='cheeseburger_host',
                       port='cheeseburger_port',
                       db_name='cheeseburger_params',
                       ec2_user='******')
    # Fixed: assert the boolean directly instead of `... is True`.
    assert isinstance(sh.ec2_user, str)
def test_no_ec2_user(self):
    '''Verify that SessionHelper can accept None as an argument for ec2 user'''
    sh = SessionHelper(region='cheeseburger',
                       cluster_id='cheeseburger_id',
                       host='cheeseburger_host',
                       port='cheeseburger_port',
                       db_name='cheeseburger_params',
                       ec2_user=None)
    # Fixed: idiomatic None check instead of `type(x) is type(None)`.
    assert sh.ec2_user is None
def create_custom_table(table_name: str, schema_name: str, partitions: dict, path: str, custom_redshift_columns: dict, session_helper: SessionHelper) -> None:
    """ Creates a table in AWS redshift. The table will be named
    schema_name.table_name and belong to the (existing) Redshift db db_name.
    The created table will use the CUSTOM redshift column data types defined
    in custom_redshift_columns.

    Args:
        table_name (str): name of created table
            NOTE: if a table of this name already exists in the schema,
            this function is a no-op and returns silently.
        schema_name (str): name of schema in redshift;
            Schema must be external and already exist!
        partitions (dict): Dict similar to columns, except ONLY with partition columns
        path (str): Path to published dataset in s3 (excluding partitions)
        custom_redshift_columns (dict):
            This dictionary contains custom column data type definitions for redshift.
            The params should be formatted as follows:
                - column name (str)
                - data type (str)
        session_helper (SessionHelper): Instance of Redshift s3parq.session_helper

    Returns:
        None
    """
    logger.info("Running create_custom_table...")
    _redshift_name_validator(table_name)
    logger.info("Generating create columns sql statement with custom redshift columns...")
    logger.info("Generating create partitions sql statement with custom redshift columns...")
    # Split the custom column definitions into partition vs. data columns,
    # then render each group as "(name type, name type, ...)".
    column_defs = []
    partition_defs = []
    for col_name, col_type in custom_redshift_columns.items():
        if col_name in partitions:
            partition_defs.append(f'{col_name} {col_type}')
        else:
            column_defs.append(f'{col_name} {col_type}')
    redshift_columns = f"({', '.join(column_defs)})"
    redshift_partitions = f"({', '.join(partition_defs)})"
    with session_helper.db_session_scope() as scope:
        if_exists_query = f'SELECT EXISTS(SELECT schemaname, tablename FROM SVV_EXTERNAL_TABLES WHERE tablename=\'{table_name}\' AND schemaname=\'{schema_name}\');'
        table_exists = scope.execute(if_exists_query).first()[0]
        if table_exists:
            # Table is already registered in Spectrum; nothing to create.
            return
        # PARTITIONED BY is only legal when partition columns exist.
        if not partitions:
            new_schema_query = (
                f'CREATE EXTERNAL TABLE {schema_name}.{table_name} {redshift_columns} '
                f'STORED AS PARQUET '
                f'LOCATION \'{path}\';')
        else:
            new_schema_query = (
                f'CREATE EXTERNAL TABLE {schema_name}.{table_name} {redshift_columns} '
                f'PARTITIONED BY {redshift_partitions} STORED AS PARQUET '
                f'LOCATION \'{path}\';')
        logger.info(f'Running query to create table: {new_schema_query}')
        scope.execute(new_schema_query)
def create_schema(schema_name: str, db_name: str, iam_role: str, session_helper: SessionHelper):
    """Creates a schema in AWS redshift using a given iam_role.

    The schema is named schema_name and belongs to the (existing) Redshift
    db db_name. iam_role is a link to an existing AWS IAM Role with
    Redshift Spectrum write permissions.
    """
    # Validate identifiers before interpolating them into SQL.
    _redshift_name_validator(schema_name, db_name)
    new_schema_query = f"CREATE EXTERNAL SCHEMA IF NOT EXISTS {schema_name} \
        FROM DATA CATALOG \
        database '{db_name}' \
        iam_role '{iam_role}';"
    with session_helper.db_session_scope() as scope:
        logger.info(f'Running query to create schema: {new_schema_query}')
        scope.execute(new_schema_query)
def create_schema(schema_name: str, db_name: str, iam_role: str, session_helper: SessionHelper) -> None:
    """ Creates a schema in AWS redshift using a given iam_role

    Args:
        schema_name (str): Name of the schema to create in Redshift Spectrum
        db_name (str): (Existing) database that the schema should belong to
        iam_role (str): link to an existing AWS IAM Role with Redshift
            Spectrum write permissions
        session_helper (str): Active and configured session_helper session to use

    Returns:
        None
    """
    # Validate identifiers before interpolating them into SQL.
    _redshift_name_validator(schema_name, db_name)
    create_query = f"CREATE EXTERNAL SCHEMA IF NOT EXISTS {schema_name} \
        FROM DATA CATALOG \
        database '{db_name}' \
        iam_role '{iam_role}';"
    with session_helper.db_session_scope() as scope:
        logger.info(f'Running query to create schema: {create_query}')
        scope.execute(create_query)
def custom_publish(bucket: str, key: str, partitions: List[str], dataframe: pd.DataFrame, custom_redshift_columns: dict, redshift_params: dict = None) -> List[str]:
    """ Dataframe to S3 Parquet Publisher with a CUSTOM redshift column definition.
    Custom publish allows custom defined redshift column definitions to be used and
    enables support for Redshift's decimal data type.
    This function handles the portion of work that will see a dataframe converted
    to parquet and then published to the given S3 location. It supports partitions
    and will use the custom redshift columns defined in the custom_redshift_columns
    dictionary when creating the table schema for the parquet file.
    View the Custom Publishes section of s3parq's readme file for more guidance
    on formatting the custom_redshift_columns dictionary.
    It also has the option to automatically publish up to Redshift Spectrum for
    the newly published parquet files.

    Args:
        bucket (str): S3 Bucket name
        key (str): S3 key to lead to the desired dataset
        partitions (List[str]): List of columns that should be partitioned on
        dataframe (pd.DataFrame): Dataframe to be published
        custom_redshift_columns (dict):
            This dictionary contains custom column data type definitions for redshift.
            The params should be formatted as follows:
                - column name (str)
                - data type (str)
        redshift_params (dict, Optional):
            This dictionary should be provided in the following format in order
            for data to be published to Spectrum. Leave out entirely to avoid
            publishing to Spectrum.
            The params should be formatted as follows:
                - schema_name (str): Name of the Spectrum schema to publish to
                - table_name (str): Name of the table to write the dataset as
                - iam_role (str): Role to take while writing data to Spectrum
                - region (str): AWS region for Spectrum
                - cluster_id (str): Spectrum cluster id
                - host (str): Redshift Spectrum host name
                - port (str): Redshift Spectrum port to use
                - db_name (str): Redshift Spectrum database name to use
                - ec2_user (str): If on ec2, the user that should be used

    Returns:
        A str list of all the newly published object keys
    """
    logger.debug("Running custom publish...")
    # Spectrum setup is only performed when redshift_params are supplied;
    # session_helper stays None for a plain S3 publish.
    session_helper = None
    if redshift_params:
        # 'index' would collide with a Redshift reserved word, so fail early.
        if "index" in dataframe.columns:
            raise ValueError(
                "'index' is a reserved keyword in Redshift. Please remove or rename your DataFrame's 'index' column."
            )
        logger.debug(
            "Found redshift parameters. Checking validity of params...")
        redshift_params = validate_redshift_params(redshift_params)
        logger.debug("Redshift parameters valid. Opening Session helper.")
        session_helper = SessionHelper(
            region=redshift_params['region'],
            cluster_id=redshift_params['cluster_id'],
            host=redshift_params['host'],
            port=redshift_params['port'],
            db_name=redshift_params['db_name'],
            ec2_user=redshift_params['ec2_user'])
        session_helper.configure_session_helper()
        # Schema must exist before the table can be created in it.
        publish_redshift.create_schema(
            redshift_params['schema_name'], redshift_params['db_name'],
            redshift_params['iam_role'], session_helper)
        logger.debug(
            f"Schema {redshift_params['schema_name']} created. Creating table {redshift_params['table_name']}..."
        )
        publish_redshift.create_custom_table(
            redshift_params['table_name'], redshift_params['schema_name'],
            partitions, s3_url(bucket, key), custom_redshift_columns,
            session_helper)
        logger.debug(f"Custom table {redshift_params['table_name']} created.")
    logger.debug("Checking publish params...")
    check_empty_dataframe(dataframe)
    check_dataframe_for_timedelta(dataframe)
    check_partitions(partitions, dataframe)
    logger.debug("Publish params valid.")
    logger.debug("Begin writing to S3..")
    files = []
    # The dataframe is published in row-range chunks; _sized_dataframes
    # yields dicts with 'lower'/'upper' row bounds for each chunk.
    for frame_params in _sized_dataframes(dataframe):
        logger.info(
            f"Publishing dataframe chunk : {frame_params['lower']} to {frame_params['upper']}"
        )
        frame = pd.DataFrame(
            dataframe[frame_params['lower']:frame_params['upper']])
        _gen_parquet_to_s3(bucket=bucket,
                           key=key,
                           dataframe=frame,
                           partitions=partitions,
                           custom_redshift_columns=custom_redshift_columns)
        # Registers partition metadata (and Spectrum partitions when
        # session_helper is set) and returns the keys written for this chunk.
        published_files = _assign_partition_meta(
            bucket=bucket,
            key=key,
            dataframe=frame,
            partitions=partitions,
            session_helper=session_helper,
            redshift_params=redshift_params,
            custom_redshift_columns=custom_redshift_columns)
        files = files + published_files
    logger.info("Done writing to S3.")
    return files
def publish(bucket: str, key: str, partitions: List[str], dataframe: pd.DataFrame, redshift_params=None) -> List[str]:
    """ Dataframe to S3 Parquet Publisher.

    Converts the dataframe to parquet in sized chunks and publishes it to the
    given S3 location, partitioned on the given columns. Optionally also
    registers the dataset in Redshift Spectrum when redshift_params is given.
    (Fixed: the return annotation previously said None, but the function
    returns the list of published object keys.)

    Args:
        bucket (str): S3 Bucket name
        key (str): S3 key to lead to the desired dataset
        partitions (List[str]): List of columns that should be partitioned on
        dataframe (pd.DataFrame): Dataframe to be published
        redshift_params (dict, Optional): Leave out entirely to avoid
            publishing to Spectrum. The params should be formatted as follows:
                - schema_name (str)
                - table_name (str)
                - iam_role (str)
                - region (str)
                - cluster_id (str)
                - host (str)
                - port (str)
                - db_name (str)

    Returns:
        A str list of all the newly published object keys
    """
    # Spectrum setup is only performed when redshift_params are supplied;
    # session_helper stays None for a plain S3 publish.
    session_helper = None
    if redshift_params:
        # 'index' would collide with a Redshift reserved word, so fail early.
        if "index" in dataframe.columns:
            raise ValueError(
                "'index' is a reserved keyword in Redshift. Please remove or rename your DataFrame's 'index' column."
            )
        logger.debug(
            "Found redshift parameters. Checking validity of params...")
        check_redshift_params(redshift_params)
        logger.debug("Redshift parameters valid. Opening Session helper.")
        session_helper = SessionHelper(
            region=redshift_params['region'],
            cluster_id=redshift_params['cluster_id'],
            host=redshift_params['host'],
            port=redshift_params['port'],
            db_name=redshift_params['db_name'])
        session_helper.configure_session_helper()
        # Schema must exist before the table can be created in it.
        publish_redshift.create_schema(
            redshift_params['schema_name'], redshift_params['db_name'],
            redshift_params['iam_role'], session_helper)
        logger.debug(
            f"Schema {redshift_params['schema_name']} created. Creating table {redshift_params['table_name']}..."
        )
        # Derive redshift column types from the dataframe's pandas dtypes.
        df_types = _get_dataframe_datatypes(dataframe, partitions)
        partition_types = _get_dataframe_datatypes(dataframe, partitions, True)
        publish_redshift.create_table(
            redshift_params['table_name'], redshift_params['schema_name'],
            df_types, partition_types, s3_url(bucket, key), session_helper)
        logger.debug(f"Table {redshift_params['table_name']} created.")
    logger.info("Checking params...")
    check_empty_dataframe(dataframe)
    check_dataframe_for_timedelta(dataframe)
    check_partitions(partitions, dataframe)
    logger.info("Params valid.")
    logger.debug("Begin writing to S3..")
    files = []
    for frame in _sized_dataframes(dataframe):
        _gen_parquet_to_s3(bucket=bucket,
                           key=key,
                           dataframe=frame,
                           partitions=partitions)
        # Registers partition metadata (and Spectrum partitions when
        # session_helper is set) and returns the keys written for this chunk.
        published_files = _assign_partition_meta(
            bucket=bucket,
            key=key,
            dataframe=frame,
            partitions=partitions,
            session_helper=session_helper,
            redshift_params=redshift_params)
        files = files + published_files
    logger.debug("Done writing to S3.")
    return files
def create_schema(schema_name: str, session_helper: SessionHelper):
    """Create the schema in Redshift if it does not already exist."""
    # Validate the identifier before interpolating it into SQL.
    schema_name_validator(schema_name)
    create_query = f'CREATE SCHEMA IF NOT EXISTS {schema_name}'
    with session_helper.db_session_scope() as scope:
        scope.execute(create_query)