def update_database_schema(psql, db, ddate, s3_logdir, schema_file, logstream):
    """Reconcile the schema file against what already exists in the database.

    Performs three steps:
      1. create any tables that are missing
      2. compare current vs. new table definitions
      3. add any new columns

    Args:
        psql -- handle to talk to redshift
        db -- redshift database containing the tables
        ddate -- date string of the data to be copied, formatted YYYY/MM/DD
        s3_logdir -- path to location of tables in PSV format
        schema_file -- name of the schema file with the create table command
        logstream -- a PipelineStreamLogger

    Return:
        None
    """
    # TODO: db.yaml as SPOT
    yaml_path = schema_file.replace('.sql', '.yaml')
    schema = RedShiftLogSchema(safe_load(load_from_file(yaml_path)))
    error_name, error_table = schema.get_error_table()
    schema.table_add(error_name, error_table)
    table_map = schema.tables()

    # create tables if missing for schema
    create_tables(psql, db, get_table_creates(schema_file, logstream))

    # check each table for schema changes
    for tbl_name, tbl_schema in table_map.items():
        scratch_name = "tmp_{0}".format(tbl_name)
        scratch_namespaced = get_namespaced_tablename(scratch_name)

        # build a scratch copy reflecting the new definition
        create_cmd = mk_create_table_sql_cmd(scratch_namespaced, tbl_schema)
        psql.run_sql(create_cmd, db, create_cmd)
        try:
            # compare the live definition with the scratch one
            live_def = get_table_def(psql, db, tbl_name)
            scratch_def = get_table_def(psql, db, scratch_name)
            compare_table_defs(psql, db, tbl_name, live_def, scratch_def)

            tbl_tuple = (join(s3_logdir, ddate, tbl_name), scratch_name)
            # columns beyond the live definition are the ones to add
            new_columns = scratch_def[len(live_def):]
            column_defaults = get_column_defaults(tbl_schema)
            add_columns(psql, db, ddate, tbl_name, new_columns,
                        tbl_tuple, column_defaults, logstream)
        finally:
            # guard against ever dropping the real table on a name collision
            if scratch_name != tbl_name:
                drop_cmd = 'drop table {0}'.format(scratch_namespaced)
                psql.run_sql(drop_cmd, db, drop_cmd)
def update_database_schema(psql, db, ddate, s3_logdir, schema_file, logstream):
    """Check the new schema against what exists in the database:
    create missing tables, compare table definitions, and add new columns.

    Args:
        psql -- handle to talk to redshift
        db -- redshift database containing table
        ddate -- the date string of the data to be copied, formatted YYYY/MM/DD
        s3_logdir -- path to location of tables in PSV format
        schema_file -- the name of the schema file with the create table command
        logstream -- a PipelineStreamLogger

    Return:
        None
    """
    # TODO: db.yaml as SPOT
    fname = schema_file.replace('.sql', '.yaml')
    rs_log_schema = RedShiftLogSchema(safe_load(load_from_file(fname)))
    err_tbl_name, err_tbl = rs_log_schema.get_error_table()
    rs_log_schema.table_add(err_tbl_name, err_tbl)
    tables = rs_log_schema.tables()

    # make sure every table in the schema exists before diffing definitions
    create_tables(psql, db, get_table_creates(schema_file, logstream))

    for table in tables:
        tmp_tbl_name = "tmp_{0}".format(table)
        namespaced_tmp_table = get_namespaced_tablename(tmp_tbl_name)

        # a temp table built from the new schema acts as the reference
        create_table_cmd = mk_create_table_sql_cmd(
            namespaced_tmp_table, tables[table]
        )
        psql.run_sql(create_table_cmd, db, create_table_cmd)
        try:
            cur_tbl_def = get_table_def(psql, db, table)
            tmp_tbl_def = get_table_def(psql, db, tmp_tbl_name)
            compare_table_defs(psql, db, table, cur_tbl_def, tmp_tbl_def)
            add_columns(
                psql, db, ddate, table,
                # trailing columns absent from the live table
                tmp_tbl_def[len(cur_tbl_def):],
                (join(s3_logdir, ddate, table), tmp_tbl_name),
                get_column_defaults(tables[table]),
                logstream,
            )
        finally:
            # always clean up the temp table, but never drop the real one
            if tmp_tbl_name != table:
                delete_table_cmd = 'drop table {0}'.format(namespaced_tmp_table)
                psql.run_sql(delete_table_cmd, db, delete_table_cmd)
def get_create_commands(input_file, add_error_table=True):
    """Read the create-table SQL commands from *input_file*.

    The file may be raw SQL or a ``.yaml`` schema; a YAML file is first
    converted to SQL via RedShiftLogSchema / tables_to_sql.

    Args:
        input_file -- the full path of the input file
        add_error_table -- boolean flag to add error table to schema

    Returns:
        a list of (table_name, command) tuples where command is a SQL
        command for creating the table, and table_name is the name of
        the table to be created.  Important because we don't want to
        create a table that already exists.

    Raises:
        ValueError -- if a CREATE TABLE clause has no parsable table name
    """
    # in regex \S is all non-whitespace and \s is whitespace only.
    # NOTE: the trailing [\s]* sits OUTSIDE the capture group so the
    # extracted table name carries no stray whitespace (a name like
    # "foo " would defeat the existing-table comparison downstream).
    table_regex = re.compile(r'[\s]*(?P<tablename>[\S]+)[\s]*\(')
    command = load_from_file(input_file)
    if input_file[-5:] == ".yaml":
        rs_log_schema = RedShiftLogSchema(safe_load(command))
        if add_error_table:
            err_tbl_name, err_tbl = rs_log_schema.get_error_table()
            rs_log_schema.table_add(err_tbl_name, err_tbl)
        command = tables_to_sql(rs_log_schema.tables())

    commands = command.split('CREATE TABLE')
    table_create_tuples = []
    for cmd in commands[1:]:
        match = table_regex.search(cmd)
        if match is None:
            # previously table_name stayed None and crashed later with
            # an opaque TypeError inside str.replace; fail loudly here
            raise ValueError(
                "could not parse table name from: {0}".format(cmd)
            )
        table_name = match.group('tablename')
        table_to_create = get_namespaced_tablename(table_name)
        # namespace only the first occurrence: the table being created
        cmd = cmd.replace(table_name, table_to_create, 1)
        table_create_tuples.append((table_name, "create table " + cmd))
    return table_create_tuples