def mapper_init(self):
    """mrjob initialization hook.

    Loads the extraction schema from ``self.options.extractions`` and
    builds the per-table lookups used by the mapper.  Should you decide
    to override, you should call 'super' to invoke this method.
    """
    # NOTE(review): yaml.load without an explicit Loader is unsafe on
    # untrusted input; other code in this project uses safe_load — confirm
    # whether the extractions file is trusted.
    raw_yaml = load_from_file(self.options.extractions)
    schema = RedShiftLogSchema(yaml.load(raw_yaml))
    self.schema = schema

    # table name -> list of Column objects, built from the schema.
    self.table_name_to_columns = {}
    for table_name, table in schema.tables().iteritems():
        self.table_name_to_columns[table_name] = [
            Column.create_from_table(table, column)
            for column in table['columns']
        ]

    # table name -> Table object wrapping those columns.
    self.table_name_to_table = {}
    for table_name, table in schema.tables().iteritems():
        self.table_name_to_table[table_name] = Table.create(
            table, columns=self.table_name_to_columns[table_name])

    # table name -> ordered list of emitted column names (no-ops skipped).
    self.table_name_to_output_order = {}
    for table_name, columns in self.table_name_to_columns.iteritems():
        self.table_name_to_output_order[table_name] = [
            column.name for column in columns if not column.is_noop
        ]

    self.redshift_export = RedshiftExportProtocol(
        delimiter=self.options.column_delimiter)

    # Error table is tracked separately from the regular tables.
    error_table_name, error_table = self.schema.get_error_table()
    self.error_tbl_name = error_table_name
    self.error_tbl_output_order = [
        c['log_key'] for c in error_table['columns']
    ]
def mapper_init(self):
    """mrjob initialization.

    Should you decide to override, you should call 'super' to invoke
    this method.  Builds the schema-driven lookup tables the mapper
    needs to emit rows.
    """
    schema = RedShiftLogSchema(
        yaml.load(load_from_file(self.options.extractions)))
    self.schema = schema
    tables = schema.tables()

    # Map each table to its Column objects.
    self.table_name_to_columns = {
        name: [Column.create_from_table(tbl, col) for col in tbl['columns']]
        for name, tbl in tables.iteritems()
    }
    # Map each table to a Table object built over those columns.
    self.table_name_to_table = {
        name: Table.create(tbl, columns=self.table_name_to_columns[name])
        for name, tbl in tables.iteritems()
    }
    # Output ordering per table; no-op columns are not emitted.
    self.table_name_to_output_order = {
        name: [col.name for col in cols if not col.is_noop]
        for name, cols in self.table_name_to_columns.iteritems()
    }

    self.redshift_export = RedshiftExportProtocol(
        delimiter=self.options.column_delimiter)

    err_name, err_tbl = self.schema.get_error_table()
    self.error_tbl_name = err_name
    self.error_tbl_output_order = [c['log_key'] for c in err_tbl['columns']]
def create_schema(file_path):
    """Load the schema at *file_path* and sanity-check that every table
    yields at least one Column and a Table object can be built for it.

    Returns None; failures surface as AssertionError.
    """
    schema = RedShiftLogSchema(yaml.load(load_from_file(file_path)))
    tables = schema.tables()

    name_to_columns = {
        name: [Column.create_from_table(tbl, col) for col in tbl['columns']]
        for name, tbl in tables.iteritems()
    }
    # Every table must produce a non-empty column list.
    for __, columns in name_to_columns.iteritems():
        assert columns

    name_to_table = {
        name: Table.create(tbl, columns=name_to_columns[name])
        for name, tbl in tables.iteritems()
    }
    # At least one table must exist overall.
    assert name_to_table
def create_schema(file_path):
    """Validate the YAML schema at *file_path*.

    Builds Column and Table objects for every table and asserts the
    results are non-empty.  Returns None.
    """
    yaml_text = load_from_file(file_path)
    schema = RedShiftLogSchema(yaml.load(yaml_text))

    name_to_columns = {}
    for name, table in schema.tables().iteritems():
        name_to_columns[name] = [
            Column.create_from_table(table, column)
            for column in table['columns']
        ]
    for __, columns in name_to_columns.iteritems():
        # a table with zero columns indicates a broken schema
        assert columns

    name_to_table = {}
    for name, table in schema.tables().iteritems():
        name_to_table[name] = Table.create(
            table, columns=name_to_columns[name])
    assert name_to_table
def create_new_yaml(suffix, input_file=sys.stdin):
    """Append *suffix* to every table name in the schema read from
    *input_file* and return the updated schema as a YAML string.

    Args:
        suffix -- string appended to each table name
        input_file -- file-like object with the schema YAML (default stdin)
    Return:
        the renamed schema serialized via yaml.dump
    """
    # NOTE(review): yaml.load without an explicit Loader can run arbitrary
    # constructors on untrusted input; consider yaml.safe_load if this
    # stream is not trusted.
    schema = RedShiftLogSchema(yaml.load(input_file))
    # Snapshot the key list before iterating: table_rename mutates the
    # underlying tables dict, so iterating a live view would be unsafe.
    for table_name in list(schema.tables().keys()):
        schema.table_rename(
            table_name,
            '{original_name}{suffix}'.format(
                original_name=table_name, suffix=suffix))
    return yaml.dump(schema.schema(), default_flow_style=False)
def update_database_schema(psql, db, ddate, s3_logdir, schema_file, logstream): """ Check new schema against what exists in the database such as 1. create new tables if missing 2. compare table definitions 3. add new columns Args: psql -- handle to talk to redshift db -- redshift database containing table ddate -- the date string of the data to be copied formatted YYYY/MM/DD s3_logdir -- path to location of tables in PSV format schema_file -- the name of the schema file with the create table command logstream -- a PipelineStreamLogger Return: None """ # TODO: db.yaml as SPOT fname = schema_file.replace('.sql', '.yaml') yaml_dict = load_from_file(fname) rs_log_schema = RedShiftLogSchema(safe_load(yaml_dict)) err_tbl_name, err_tbl = rs_log_schema.get_error_table() rs_log_schema.table_add(err_tbl_name, err_tbl) tables = rs_log_schema.tables() # create tables if missing for schema create_tuples = get_table_creates(schema_file, logstream) create_tables(psql, db, create_tuples) # check for schema changes for table in tables.keys(): tmp_tbl_name = "tmp_{0}".format(table) namespaced_tmp_table = get_namespaced_tablename(tmp_tbl_name) # create temp tables create_table_cmd = mk_create_table_sql_cmd(namespaced_tmp_table, tables[table]) psql.run_sql(create_table_cmd, db, create_table_cmd) try: # fetch table definition cur_tbl_def = get_table_def(psql, db, table) tmp_tbl_def = get_table_def(psql, db, tmp_tbl_name) compare_table_defs(psql, db, table, cur_tbl_def, tmp_tbl_def) tbl_tuple = (join(s3_logdir, ddate, table), tmp_tbl_name) to_add = tmp_tbl_def[len(cur_tbl_def):] defaults = get_column_defaults(tables[table]) add_columns(psql, db, ddate, table, to_add, tbl_tuple, defaults, logstream) finally: if tmp_tbl_name != table: delete_table_cmd = 'drop table {0}'.format( namespaced_tmp_table) psql.run_sql(delete_table_cmd, db, delete_table_cmd)
def create_new_yaml(suffix, input_file=sys.stdin):
    """Rename every table in the schema read from *input_file* by
    appending *suffix*; return the result as a YAML string.

    Args:
        suffix -- string appended to each table name
        input_file -- file-like object with the schema YAML (default stdin)
    Return:
        the renamed schema serialized via yaml.dump
    """
    # NOTE(review): yaml.load without an explicit Loader can run arbitrary
    # constructors on untrusted input; consider yaml.safe_load if this
    # stream is not trusted.
    schema = RedShiftLogSchema(yaml.load(input_file))
    # Materialize the key list first: table_rename mutates the tables
    # dict, and renaming while iterating a live view is unsafe.
    for table_name in list(schema.tables().keys()):
        schema.table_rename(
            table_name,
            '{original_name}{suffix}'.format(
                original_name=table_name,
                suffix=suffix
            )
        )
    return yaml.dump(schema.schema(), default_flow_style=False)
def update_database_schema(psql, db, ddate, s3_logdir, schema_file, logstream): """ Check new schema against what exists in the database such as 1. create new tables if missing 2. compare table definitions 3. add new columns Args: psql -- handle to talk to redshift db -- redshift database containing table ddate -- the date string of the data to be copied formatted YYYY/MM/DD s3_logdir -- path to location of tables in PSV format schema_file -- the name of the schema file with the create table command logstream -- a PipelineStreamLogger Return: None """ # TODO: db.yaml as SPOT fname = schema_file.replace('.sql', '.yaml') yaml_dict = load_from_file(fname) rs_log_schema = RedShiftLogSchema(safe_load(yaml_dict)) err_tbl_name, err_tbl = rs_log_schema.get_error_table() rs_log_schema.table_add(err_tbl_name, err_tbl) tables = rs_log_schema.tables() # create tables if missing for schema create_tuples = get_table_creates(schema_file, logstream) create_tables(psql, db, create_tuples) # check for schema changes for table in tables.keys(): tmp_tbl_name = "tmp_{0}".format(table) namespaced_tmp_table = get_namespaced_tablename(tmp_tbl_name) # create temp tables create_table_cmd = mk_create_table_sql_cmd(namespaced_tmp_table, tables[table]) psql.run_sql(create_table_cmd, db, create_table_cmd) try: # fetch table definition cur_tbl_def = get_table_def(psql, db, table) tmp_tbl_def = get_table_def(psql, db, tmp_tbl_name) compare_table_defs(psql, db, table, cur_tbl_def, tmp_tbl_def) tbl_tuple = (join(s3_logdir, ddate, table), tmp_tbl_name) to_add = tmp_tbl_def[len(cur_tbl_def):] defaults = get_column_defaults(tables[table]) add_columns(psql, db, ddate, table, to_add, tbl_tuple, defaults, logstream) finally: if tmp_tbl_name != table: delete_table_cmd = 'drop table {0}'.format(namespaced_tmp_table) psql.run_sql(delete_table_cmd, db, delete_table_cmd)
def mk_table(db, table_name, src='', src_type='dict', sortkeys=None,
             flattenNestedKeys=True, add_source_filename=False):
    """Create a table definition from db

    table_name: see RedshiftSchema.table_create
    src: see RedshiftSchema.table_create
    src_type: see RedshiftSchema.table_create
    sortkeys: see RedshiftSchema.table_create
    flattenNestedKeys: Allow column names of the form 'location.bounds.max'
        or change to location_bounds_max instead.
    add_source_filename: see RedshiftSchema.table_create

    Returns the schema's tables dict with the new table added.
    """
    rs_schema = RedShiftLogSchema()
    rs_schema.table_create(
        table_name, src, src_type, sortkeys, add_source_filename
    )

    deferred_columns = []
    # sort for consistent output
    for key in sorted(db.keys()):
        val = db[key]
        log_key = key
        sql_attr = RedShiftSchemaMaker.type_to_sqlattr(
            val['type'], val['max_len']
        )
        if sql_attr is None:
            # no SQL mapping for this type; drop the column
            continue
        is_json = val.get('is_json', False)
        is_foreign = val.get('is_foreign', False)

        # nested log_key are a pain in RedShift.
        # Ex. select results."location.bounds"
        # replace '.' with '_' instead.
        # str.replace is the literal substitution; avoids the non-raw
        # regex escape '\.' that re.sub used (SyntaxWarning on Python 3.12+)
        name = key.replace('.', '_') if flattenNestedKeys is True else key
        if val.get('is_mandatory', False) is True:
            sql_attr += ' not null'
        if is_foreign:
            # foreign columns are added immediately so they come first
            rs_schema.column_add(
                table_name, name, sql_attr, log_key, is_json, is_foreign
            )
        else:
            # non-foreign columns are buffered and appended afterwards
            deferred_columns.append(
                [table_name, name, sql_attr, log_key, is_json, False]
            )
    for args in deferred_columns:
        rs_schema.column_add(*args)
    return rs_schema.tables()
def mk_table(db, table_name, src='', src_type='dict', sortkeys=None,
             flattenNestedKeys=True, add_source_filename=False):
    """Create a table definition from db

    table_name: see RedshiftSchema.table_create
    src: see RedshiftSchema.table_create
    src_type: see RedshiftSchema.table_create
    sortkeys: see RedshiftSchema.table_create
    flattenNestedKeys: Allow column names of the form 'location.bounds.max'
        or change to location_bounds_max instead.
    add_source_filename: see RedshiftSchema.table_create

    Returns the schema's tables dict with the new table added.
    """
    rs_schema = RedShiftLogSchema()
    rs_schema.table_create(table_name, src, src_type, sortkeys,
                           add_source_filename)
    columns = []
    # sort for consistent output
    for key in sorted(db.keys()):
        val = db[key]
        log_key = key
        sql_attr = RedShiftSchemaMaker.type_to_sqlattr(
            val['type'], val['max_len'])
        if sql_attr is None:
            # type has no SQL representation; skip this column
            continue
        is_json = val.get('is_json', False)
        is_foreign = val.get('is_foreign', False)

        # nested log_key are a pain in RedShift.
        # Ex. select results."location.bounds"
        # replace '.' with '_' instead.
        # plain str.replace replaces the old re.sub('\.', ...), whose
        # non-raw '\.' escape warns on modern Python
        name = key.replace('.', '_') if flattenNestedKeys is True else key
        if val.get('is_mandatory', False) is True:
            sql_attr += ' not null'
        if is_foreign:
            # foreign columns go in right away, ahead of the buffered rest
            rs_schema.column_add(table_name, name, sql_attr, log_key,
                                 is_json, is_foreign)
        else:
            columns.append(
                [table_name, name, sql_attr, log_key, is_json, False])
    for args in columns:
        rs_schema.column_add(*args)
    return rs_schema.tables()
def get_create_commands(input_file, add_error_table=True):
    """ get_create_command takes an input file and reads the create table
    sql command from it.

    Args:
    input_file -- the full path of the input file
    add_error_table -- boolean flag to add error table to schema

    Returns:
    a list of (table_name, command) tuples where the command is a SQL
    command for creating the table, and the table_name is the name of the
    table to be created.  Important because we don't want to create a
    table that already exists
    """
    # in regex \S is all non-whitespace and \s is whitespace only
    table_regex = re.compile(r'[\s]*(?P<tablename>[\S]+[\s]*)\(')
    command = load_from_file(input_file)
    if input_file.endswith(".yaml"):
        # YAML input: convert the schema to CREATE TABLE SQL first
        rs_log_schema = RedShiftLogSchema(safe_load(command))
        if add_error_table:
            err_tbl_name, err_tbl = rs_log_schema.get_error_table()
            rs_log_schema.table_add(err_tbl_name, err_tbl)
        command = tables_to_sql(rs_log_schema.tables())

    commands = command.split('CREATE TABLE')
    table_create_tuples = []
    for cmd in commands[1:]:
        match = table_regex.search(cmd)
        if match is None:
            # NOTE(review): unmatched commands are passed through with a
            # None table name -- confirm downstream handles that case
            table_name = None
        else:
            table_name = match.group('tablename')
            # namespace only the first occurrence, i.e. the table name
            table_to_create = get_namespaced_tablename(table_name)
            cmd = cmd.replace(table_name, table_to_create, 1)
        table_create_tuples.append((table_name, "create table " + cmd))
    return table_create_tuples
def get_create_commands(input_file, add_error_table=True):
    """ get_create_command takes an input file and reads the create table
    sql command from it.

    Args:
    input_file -- the full path of the input file
    add_error_table -- boolean flag to add error table to schema

    Returns:
    a list of (table_name, command) tuples where the command is a SQL
    command for creating the table, and the table_name is the name of the
    table to be created.  Important because we don't want to create a
    table that already exists
    """
    # in regex \S is all non-whitespace and \s is whitespace only
    table_regex = re.compile(r'[\s]*(?P<tablename>[\S]+[\s]*)\(')
    command = load_from_file(input_file)
    if input_file.endswith(".yaml"):
        # YAML schema input is first rendered down to CREATE TABLE SQL
        rs_log_schema = RedShiftLogSchema(safe_load(command))
        if add_error_table:
            err_tbl_name, err_tbl = rs_log_schema.get_error_table()
            rs_log_schema.table_add(err_tbl_name, err_tbl)
        command = tables_to_sql(rs_log_schema.tables())

    commands = command.split('CREATE TABLE')
    table_create_tuples = []
    for cmd in commands[1:]:
        match = table_regex.search(cmd)
        if match is None:
            # NOTE(review): a non-matching statement yields table_name of
            # None -- verify callers tolerate that
            table_name = None
        else:
            table_name = match.group('tablename')
            # replace only the first occurrence (the table name itself)
            table_to_create = get_namespaced_tablename(table_name)
            cmd = cmd.replace(table_name, table_to_create, 1)
        table_create_tuples.append((table_name, "create table " + cmd))
    return table_create_tuples
def main():
    """Read a YAML schema from stdin; write CREATE TABLE SQL to stdout."""
    log_schema = RedShiftLogSchema(safe_load(sys.stdin))
    sys.stdout.write(tables_to_sql(log_schema.tables()))
def analyze_tables(psql, db, tables, schemaname=DEFAULT_NAMESPACE):
    """Run ANALYZE on every table, best-effort.

    Args:
        psql -- handle to talk to redshift
        db -- redshift database containing the tables
        tables -- iterable of table names
        schemaname -- namespace to qualify table names with
    Raises:
        RuntimeError if any table failed to analyze (after trying all).
    """
    num_failures = 0
    for tbl_name in tables:
        namespaced_name = get_namespaced_tablename(tbl_name, schemaname)
        try:
            analyze_table(psql, db, namespaced_name)
        except Exception:
            # was a bare 'except:': narrowed so KeyboardInterrupt and
            # SystemExit are no longer swallowed; failures are still
            # counted and reported in aggregate below
            num_failures += 1
    if num_failures:
        raise RuntimeError(
            'failed to analyze {0} tables, see log'.format(num_failures)
        )


if __name__ == "__main__":
    args = get_cmd_line_args()
    run_local = args.run_local
    merge_configs(args.config)
    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    logstream = PipelineStreamLogger(log_stream, run_local, 'redshift_maint')
    psql = RedshiftPostgres(logstream, args.credentials, run_local=run_local)

    # renamed from 'yaml' to avoid shadowing the PyYAML module name
    yaml_data = load_from_file(args.schema)
    schema = RedShiftLogSchema(safe_load(yaml_data))
    if args.compact:
        compact_tables(psql, db, schema.tables(), args.redshift_schema)
    analyze_tables(psql, db, schema.tables(), args.redshift_schema)
def main():
    """Convert the YAML schema on stdin into CREATE TABLE SQL on stdout."""
    schema = RedShiftLogSchema(safe_load(sys.stdin))
    sql = tables_to_sql(schema.tables())
    sys.stdout.write(sql)
def analyze_tables(psql, db, tables, schemaname=DEFAULT_NAMESPACE):
    """ANALYZE each table in *tables*, continuing past individual failures.

    Args:
        psql -- handle to talk to redshift
        db -- redshift database containing the tables
        tables -- iterable of table names
        schemaname -- namespace to qualify table names with
    Raises:
        RuntimeError at the end if any table could not be analyzed.
    """
    num_failures = 0
    for tbl_name in tables:
        qualified = get_namespaced_tablename(tbl_name, schemaname)
        try:
            analyze_table(psql, db, qualified)
        except Exception:
            # narrowed from a bare 'except:' so KeyboardInterrupt and
            # SystemExit propagate; per-table errors are tallied and
            # reported together below
            num_failures += 1
    if num_failures:
        raise RuntimeError(
            'failed to analyze {0} tables, see log'.format(num_failures))


if __name__ == "__main__":
    args = get_cmd_line_args()
    run_local = args.run_local
    merge_configs(args.config)
    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    logstream = PipelineStreamLogger(log_stream, run_local, 'redshift_maint')
    psql = RedshiftPostgres(logstream, args.credentials, run_local=run_local)

    # renamed from 'yaml' to avoid shadowing the PyYAML module name
    yaml_data = load_from_file(args.schema)
    schema = RedShiftLogSchema(safe_load(yaml_data))
    if args.compact:
        compact_tables(psql, db, schema.tables(), args.redshift_schema)
    analyze_tables(psql, db, schema.tables(), args.redshift_schema)