Example #1
    def mapper_init(self):
        """ mrjob initialization.
        If you override this method, call 'super' to invoke it.
        """
        yaml_data = load_from_file(self.options.extractions)
        schema = RedShiftLogSchema(yaml.load(yaml_data))
        self.schema = schema

        self.table_name_to_columns = dict((table_name, [
            Column.create_from_table(table, column)
            for column in table['columns']
        ]) for table_name, table in schema.tables().iteritems())
        self.table_name_to_table = dict(
            (table_name,
             Table.create(table,
                          columns=self.table_name_to_columns[table_name]))
            for table_name, table in schema.tables().iteritems())
        self.table_name_to_output_order = dict(
            (table_name,
             [column.name for column in columns if not column.is_noop])
            for table_name, columns in self.table_name_to_columns.iteritems())
        self.redshift_export = RedshiftExportProtocol(
            delimiter=self.options.column_delimiter)

        error_table_name, error_table = self.schema.get_error_table()
        self.error_tbl_name = error_table_name
        self.error_tbl_output_order = [
            c['log_key'] for c in error_table['columns']
        ]
Example #2
    def mapper_init(self):
        """ mrjob initialization.
        If you override this method, call 'super' to invoke it.
        """
        yaml_data = load_from_file(self.options.extractions)
        schema = RedShiftLogSchema(yaml.load(yaml_data))
        self.schema = schema

        self.table_name_to_columns = dict(
            (table_name,
                [Column.create_from_table(table, column)
                    for column in table['columns']])
            for table_name, table in schema.tables().iteritems()
        )
        self.table_name_to_table = dict(
            (table_name,
                Table.create(table,
                             columns=self.table_name_to_columns[table_name]))
            for table_name, table in schema.tables().iteritems()
        )
        self.table_name_to_output_order = dict(
            (table_name,
                [column.name for column in columns if not column.is_noop])
            for table_name, columns in self.table_name_to_columns.iteritems()
        )
        self.redshift_export = RedshiftExportProtocol(
            delimiter=self.options.column_delimiter
        )

        error_table_name, error_table = self.schema.get_error_table()
        self.error_tbl_name = error_table_name
        self.error_tbl_output_order = [c['log_key'] for c in error_table['columns']]
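
Both variants above build the same lookup dicts from the parsed schema. The sketch below shows that dict-building pattern in isolation (written for Python 3, unlike the Python 2 snippets above), using a small hand-written dict in place of the YAML file and plain column dicts in place of the Column/Table helpers; the schema layout and field names are assumptions inferred from how the code reads them, not the library's actual format.

# Hypothetical schema layout: table name -> table dict with a 'columns' list.
schema_tables = {
    'review': {'columns': [
        {'name': 'id', 'is_noop': False},
        {'name': 'raw_blob', 'is_noop': True},
    ]},
}

# table name -> column dicts (stand-in for Column.create_from_table)
table_name_to_columns = {
    name: list(table['columns'])
    for name, table in schema_tables.items()
}

# table name -> names of the columns that should actually be emitted
table_name_to_output_order = {
    name: [col['name'] for col in columns if not col['is_noop']]
    for name, columns in table_name_to_columns.items()
}

print(table_name_to_output_order)  # {'review': ['id']}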
Example #3
def create_schema(file_path):
    yaml_data = load_from_file(file_path)
    schema = RedShiftLogSchema(yaml.load(yaml_data))

    name_to_columns = dict((name, [
        Column.create_from_table(table, column) for column in table['columns']
    ]) for name, table in schema.tables().iteritems())
    for __, columns in name_to_columns.iteritems():
        assert columns
    name_to_table = dict(
        (name, Table.create(table, columns=name_to_columns[name]))
        for name, table in schema.tables().iteritems())
    assert name_to_table
Example #4
def create_schema(file_path):
    yaml_data = load_from_file(file_path)
    schema = RedShiftLogSchema(yaml.load(yaml_data))

    name_to_columns = dict((name, [Column.create_from_table(table, column)
                                   for column in table['columns']])
                           for name, table
                           in schema.tables().iteritems())
    for __, columns in name_to_columns.iteritems():
        assert columns
    name_to_table = dict((name,
                          Table.create(table,
                                       columns=name_to_columns[name]))
                         for name, table in schema.tables().iteritems())
    assert name_to_table
Example #5
def create_new_yaml(suffix, input_file=sys.stdin):

    schema = RedShiftLogSchema(yaml.load(input_file))
    for table_name in schema.tables().keys():
        schema.table_rename(
            table_name,
            '{original_name}{suffix}'.format(original_name=table_name,
                                             suffix=suffix))
    return yaml.dump(schema.schema(), default_flow_style=False)
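
create_new_yaml simply appends a suffix to every table name and dumps the schema back to YAML. Below is a self-contained sketch of that rename-by-suffix idea on a plain dict, without the RedShiftLogSchema wrapper (the table names and schema layout here are made up):

import yaml

tables = {'business': {'columns': []}, 'review': {'columns': []}}
suffix = '_v2'
renamed = {name + suffix: table for name, table in tables.items()}
print(yaml.dump(renamed, default_flow_style=False))
# business_v2:
#   columns: []
# review_v2:
#   columns: []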
Example #6
def update_database_schema(psql, db, ddate, s3_logdir, schema_file, logstream):
    """
    Check the new schema against what exists in the database:
        1.  create new tables if missing
        2.  compare table definitions
        3.  add new columns

    Args:
    psql -- handle to talk to redshift
    db -- redshift database containing table
    ddate -- the date string of the data to be copied, formatted as YYYY/MM/DD
    s3_logdir -- path to location of tables in PSV format
    schema_file -- the name of the schema file with the create table command
    logstream -- a PipelineStreamLogger

    Return: None
    """
    # TODO: db.yaml as SPOT
    fname = schema_file.replace('.sql', '.yaml')
    yaml_dict = load_from_file(fname)
    rs_log_schema = RedShiftLogSchema(safe_load(yaml_dict))
    err_tbl_name, err_tbl = rs_log_schema.get_error_table()
    rs_log_schema.table_add(err_tbl_name, err_tbl)
    tables = rs_log_schema.tables()

    # create tables if missing for schema
    create_tuples = get_table_creates(schema_file, logstream)
    create_tables(psql, db, create_tuples)

    # check for schema changes
    for table in tables.keys():
        tmp_tbl_name = "tmp_{0}".format(table)
        namespaced_tmp_table = get_namespaced_tablename(tmp_tbl_name)

        # create temp tables
        create_table_cmd = mk_create_table_sql_cmd(namespaced_tmp_table,
                                                   tables[table])
        psql.run_sql(create_table_cmd, db, create_table_cmd)

        try:
            # fetch table definition
            cur_tbl_def = get_table_def(psql, db, table)
            tmp_tbl_def = get_table_def(psql, db, tmp_tbl_name)
            compare_table_defs(psql, db, table, cur_tbl_def, tmp_tbl_def)

            tbl_tuple = (join(s3_logdir, ddate, table), tmp_tbl_name)
            to_add = tmp_tbl_def[len(cur_tbl_def):]
            defaults = get_column_defaults(tables[table])
            add_columns(psql, db, ddate, table, to_add, tbl_tuple, defaults,
                        logstream)
        finally:
            if tmp_tbl_name != table:
                delete_table_cmd = 'drop table {0}'.format(
                    namespaced_tmp_table)
                psql.run_sql(delete_table_cmd, db, delete_table_cmd)
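
Note the slice tmp_tbl_def[len(cur_tbl_def):] above: it assumes the new table definition is the current definition with any new columns appended at the end. A tiny illustration with hypothetical column tuples:

cur_tbl_def = [('id', 'integer'), ('name', 'varchar(64)')]
tmp_tbl_def = [('id', 'integer'), ('name', 'varchar(64)'), ('created_at', 'timestamp')]
to_add = tmp_tbl_def[len(cur_tbl_def):]  # only the newly appended columns
print(to_add)  # [('created_at', 'timestamp')]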
Example #7
def create_new_yaml(suffix, input_file=sys.stdin):

    schema = RedShiftLogSchema(yaml.load(input_file))
    for table_name in schema.tables().keys():
        schema.table_rename(
            table_name, '{original_name}{suffix}'.format(
                original_name=table_name,
                suffix=suffix
            )
        )
    return yaml.dump(schema.schema(), default_flow_style=False)
Example #8
def update_database_schema(psql, db, ddate, s3_logdir, schema_file, logstream):
    """
    Check the new schema against what exists in the database:
        1.  create new tables if missing
        2.  compare table definitions
        3.  add new columns

    Args:
    psql -- handle to talk to redshift
    db -- redshift database containing table
    ddate -- the date string of the data to be copied, formatted as YYYY/MM/DD
    s3_logdir -- path to location of tables in PSV format
    schema_file -- the name of the schema file with the create table command
    logstream -- a PipelineStreamLogger

    Return: None
    """
    # TODO: db.yaml as SPOT
    fname = schema_file.replace('.sql', '.yaml')
    yaml_dict = load_from_file(fname)
    rs_log_schema = RedShiftLogSchema(safe_load(yaml_dict))
    err_tbl_name, err_tbl = rs_log_schema.get_error_table()
    rs_log_schema.table_add(err_tbl_name, err_tbl)
    tables = rs_log_schema.tables()

    # create tables if missing for schema
    create_tuples = get_table_creates(schema_file, logstream)
    create_tables(psql, db, create_tuples)

    # check for schema changes
    for table in tables.keys():
        tmp_tbl_name = "tmp_{0}".format(table)
        namespaced_tmp_table = get_namespaced_tablename(tmp_tbl_name)

        # create temp tables
        create_table_cmd = mk_create_table_sql_cmd(namespaced_tmp_table, tables[table])
        psql.run_sql(create_table_cmd, db, create_table_cmd)

        try:
            # fetch table definition
            cur_tbl_def = get_table_def(psql, db, table)
            tmp_tbl_def = get_table_def(psql, db, tmp_tbl_name)
            compare_table_defs(psql, db, table, cur_tbl_def, tmp_tbl_def)

            tbl_tuple = (join(s3_logdir, ddate, table), tmp_tbl_name)
            to_add = tmp_tbl_def[len(cur_tbl_def):]
            defaults = get_column_defaults(tables[table])
            add_columns(psql, db, ddate, table, to_add,
                        tbl_tuple, defaults, logstream)
        finally:
            if tmp_tbl_name != table:
                delete_table_cmd = 'drop table {0}'.format(namespaced_tmp_table)
                psql.run_sql(delete_table_cmd, db, delete_table_cmd)
Example #9
    def mk_table(db, table_name, src='', src_type='dict', sortkeys=None,
                 flattenNestedKeys=True, add_source_filename=False):
        """Create a table definition from db
        table_name: see RedshiftSchema.table_create
        src:        see RedshiftSchema.table_create
        src_type:   see RedshiftSchema.table_create
        sortkeys:   see RedshiftSchema.table_create
        flattenNestedKeys: If False, keep column names of the form
            'location.bounds.max'; if True, rewrite them as location_bounds_max.
        add_source_filename:  see RedshiftSchema.table_create
        """
        rs_schema = RedShiftLogSchema()
        rs_schema.table_create(
            table_name,
            src,
            src_type,
            sortkeys,
            add_source_filename
        )
        columns = []

        # sort for consistent output
        for key in sorted(db.keys()):
            val = db[key]
            log_key = key
            sql_attr = RedShiftSchemaMaker.type_to_sqlattr(
                val['type'],
                val['max_len']
            )
            if sql_attr is None:
                continue

            is_json = val.get('is_json', False)
            is_foreign = val.get('is_foreign', False)
            # nested log_keys are a pain in RedShift,
            # e.g. select results."location.bounds",
            # so replace '.' with '_' instead.
            name = re.sub(r'\.', '_', key) if flattenNestedKeys is True else key

            if val.get('is_mandatory', False) is True:
                sql_attr += ' not null'

            if is_foreign:
                rs_schema.column_add(
                    table_name, name, sql_attr, log_key, is_json, is_foreign
                )
            else:
                columns.append(
                    [table_name, name, sql_attr, log_key, is_json, False]
                )
        for args in columns:
            rs_schema.column_add(*args)
        return rs_schema.tables()
Example #10
    def mk_table(db,
                 table_name,
                 src='',
                 src_type='dict',
                 sortkeys=None,
                 flattenNestedKeys=True,
                 add_source_filename=False):
        """Create a table definition from db
        table_name: see RedshiftSchema.table_create
        src:        see RedshiftSchema.table_create
        src_type:   see RedshiftSchema.table_create
        sortkeys:   see RedshiftSchema.table_create
        flattenNestedKeys: If False, keep column names of the form
            'location.bounds.max'; if True, rewrite them as location_bounds_max.
        add_source_filename:  see RedshiftSchema.table_create
        """
        rs_schema = RedShiftLogSchema()
        rs_schema.table_create(table_name, src, src_type, sortkeys,
                               add_source_filename)
        columns = []

        # sort for consistent output
        for key in sorted(db.keys()):
            val = db[key]
            log_key = key
            sql_attr = RedShiftSchemaMaker.type_to_sqlattr(
                val['type'], val['max_len'])
            if sql_attr is None:
                continue

            is_json = val.get('is_json', False)
            is_foreign = val.get('is_foreign', False)
            # nested log_keys are a pain in RedShift,
            # e.g. select results."location.bounds",
            # so replace '.' with '_' instead.
            name = re.sub(r'\.', '_', key) if flattenNestedKeys is True else key

            if val.get('is_mandatory', False) is True:
                sql_attr += ' not null'

            if is_foreign:
                rs_schema.column_add(table_name, name, sql_attr, log_key,
                                     is_json, is_foreign)
            else:
                columns.append(
                    [table_name, name, sql_attr, log_key, is_json, False])
        for args in columns:
            rs_schema.column_add(*args)
        return rs_schema.tables()
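
mk_table only reads a few fields from each entry of db. Below is a hypothetical db dict with those fields (inferred from the loop above; the real upstream structure and type names may differ), plus the name-flattening rule applied on its own:

import re

db = {
    'location.bounds.max': {'type': 'float', 'max_len': None, 'is_mandatory': True},
    'user_id': {'type': 'str', 'max_len': 32, 'is_foreign': True},
}

for key in sorted(db):
    val = db[key]
    name = re.sub(r'\.', '_', key)  # what flattenNestedKeys=True does to nested keys
    print(name, val.get('is_foreign', False), val.get('is_mandatory', False))
# location_bounds_max False True
# user_id True False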
Example #11
def get_create_commands(input_file, add_error_table=True):
    """
    get_create_commands takes an input file and reads the CREATE TABLE SQL
    commands from it.

    Args:
    input_file -- the full path of the input file
    add_error_table -- boolean flag to add error table to schema

    Returns:
    a list of (table_name, command) tuples where table_name is the name of the
    table to be created and command is the SQL command that creates it.  The
    table name matters because we don't want to create a table that already
    exists.
    """

    # in regex \S is all non-whitespace and \s is whitespace only
    table_regex = re.compile(r'[\s]*(?P<tablename>[\S]+[\s]*)\(')
    command = load_from_file(input_file)

    if input_file[-5:] == ".yaml":
        rs_log_schema = RedShiftLogSchema(safe_load(command))
        if add_error_table:
            err_tbl_name, err_tbl = rs_log_schema.get_error_table()
            rs_log_schema.table_add(err_tbl_name, err_tbl)
        command = tables_to_sql(rs_log_schema.tables())

    commands = command.split('CREATE TABLE')
    table_create_tuples = []
    for cmd in commands[1:]:
        match = table_regex.search(cmd)
        if match is None:
            table_name = None
        else:
            table_name = match.group('tablename')
            table_to_create = get_namespaced_tablename(table_name)
            cmd = cmd.replace(table_name, table_to_create, 1)
        table_create_tuples.append((table_name, "create table " + cmd))
    return table_create_tuples
Example #12
def get_create_commands(input_file, add_error_table=True):
    """
    get_create_commands takes an input file and reads the CREATE TABLE SQL
    commands from it.

    Args:
    input_file -- the full path of the input file
    add_error_table -- boolean flag to add error table to schema

    Returns:
    a list of (table_name, command) tuples where table_name is the name of the
    table to be created and command is the SQL command that creates it.  The
    table name matters because we don't want to create a table that already
    exists.
    """

    # in regex \S is all non-whitespace and \s is whitespace only
    table_regex = re.compile(r'[\s]*(?P<tablename>[\S]+[\s]*)\(')
    command = load_from_file(input_file)

    if input_file[-5:] == ".yaml":
        rs_log_schema = RedShiftLogSchema(safe_load(command))
        if add_error_table:
            err_tbl_name, err_tbl = rs_log_schema.get_error_table()
            rs_log_schema.table_add(err_tbl_name, err_tbl)
        command = tables_to_sql(rs_log_schema.tables())

    commands = command.split('CREATE TABLE')
    table_create_tuples = []
    for cmd in commands[1:]:
        match = table_regex.search(cmd)
        if match is None:
            table_name = None
        else:
            table_name = match.group('tablename')
            table_to_create = get_namespaced_tablename(table_name)
            cmd = cmd.replace(table_name, table_to_create, 1)
        table_create_tuples.append((table_name, "create table " + cmd))
    return table_create_tuples
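
A self-contained sketch of the split-and-match step above on a literal SQL string (the table and column names are invented); it shows how the table name is pulled out of each CREATE TABLE statement:

import re

table_regex = re.compile(r'[\s]*(?P<tablename>[\S]+[\s]*)\(')
command = 'CREATE TABLE events (id integer);\nCREATE TABLE errors (msg varchar(256));'
for cmd in command.split('CREATE TABLE')[1:]:
    match = table_regex.search(cmd)
    print(match.group('tablename').strip() if match else None)
# events
# errors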
Example #13
def main():
    schema = RedShiftLogSchema(safe_load(sys.stdin))
    sql_str = tables_to_sql(schema.tables())
    sys.stdout.write(sql_str)
Example #14
def analyze_tables(psql, db, tables, schemaname=DEFAULT_NAMESPACE):
    num_failures = 0
    for tbl_name in tables:
        tbl_name = get_namespaced_tablename(tbl_name, schemaname)
        try:
            analyze_table(psql, db, tbl_name)
        except:
            num_failures += 1
    if num_failures:
        raise RuntimeError(
            'failed to analyze {0} tables, see log'.format(num_failures)
        )


if __name__ == "__main__":
    args = get_cmd_line_args()
    run_local = args.run_local
    merge_configs(args.config)
    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    logstream = PipelineStreamLogger(log_stream, run_local, 'redshift_maint')
    psql = RedshiftPostgres(logstream, args.credentials, run_local=run_local)

    yaml = load_from_file(args.schema)
    schema = RedShiftLogSchema(safe_load(yaml))

    if args.compact:
        compact_tables(psql, db, schema.tables(), args.redshift_schema)
    analyze_tables(psql, db, schema.tables(), args.redshift_schema)
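
analyze_tables swallows per-table errors and raises a single error at the end. A stripped-down sketch of that count-and-raise pattern, with a stub standing in for analyze_table (the stub and table names are hypothetical):

def analyze_table_stub(tbl_name):
    if tbl_name == 'broken_table':
        raise RuntimeError('analyze failed')

num_failures = 0
for tbl_name in ['good_table', 'broken_table']:
    try:
        analyze_table_stub(tbl_name)
    except Exception:
        num_failures += 1
if num_failures:
    # raises RuntimeError: failed to analyze 1 tables, see log
    raise RuntimeError('failed to analyze {0} tables, see log'.format(num_failures))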
Example #15
def main():
    schema = RedShiftLogSchema(safe_load(sys.stdin))
    sql_str = tables_to_sql(schema.tables())
    sys.stdout.write(sql_str)
Example #16

def analyze_tables(psql, db, tables, schemaname=DEFAULT_NAMESPACE):
    num_failures = 0
    for tbl_name in tables:
        tbl_name = get_namespaced_tablename(tbl_name, schemaname)
        try:
            analyze_table(psql, db, tbl_name)
        except:
            num_failures += 1
    if num_failures:
        raise RuntimeError(
            'failed to analyze {0} tables, see log'.format(num_failures))


if __name__ == "__main__":
    args = get_cmd_line_args()
    run_local = args.run_local
    merge_configs(args.config)
    db = read_string('pipeline.redshift_database')
    log_stream = read_string('pipeline.load_step.s3_to_redshift_stream')
    logstream = PipelineStreamLogger(log_stream, run_local, 'redshift_maint')
    psql = RedshiftPostgres(logstream, args.credentials, run_local=run_local)

    yaml = load_from_file(args.schema)
    schema = RedShiftLogSchema(safe_load(yaml))

    if args.compact:
        compact_tables(psql, db, schema.tables(), args.redshift_schema)
    analyze_tables(psql, db, schema.tables(), args.redshift_schema)