示例#1
0
    def mapper_init(self):
        """ mrjob mapper initialization: load the schema, build lookups.

        If you override this method, call it through 'super' so the
        attributes below are still populated.

        Sets on self: schema, table_name_to_columns, table_name_to_table,
        table_name_to_output_order, redshift_export, error_tbl_name and
        error_tbl_output_order.
        """
        # NOTE(review): yaml.load is called without an explicit Loader --
        # confirm the extractions file is always trusted input.
        raw_yaml = load_from_file(self.options.extractions)
        log_schema = RedShiftLogSchema(yaml.load(raw_yaml))
        self.schema = log_schema

        # table name -> list of Column objects, one per schema column entry.
        columns_by_table = {}
        for table_name, table in log_schema.tables().iteritems():
            columns_by_table[table_name] = [
                Column.create_from_table(table, column_def)
                for column_def in table['columns']
            ]
        self.table_name_to_columns = columns_by_table

        # table name -> Table object built from the columns created above.
        tables_by_name = {}
        for table_name, table in log_schema.tables().iteritems():
            tables_by_name[table_name] = Table.create(
                table, columns=self.table_name_to_columns[table_name])
        self.table_name_to_table = tables_by_name

        # table name -> names of the columns not flagged is_noop.
        output_order_by_table = {}
        for table_name, columns in self.table_name_to_columns.iteritems():
            output_order_by_table[table_name] = [
                col.name for col in columns if not col.is_noop
            ]
        self.table_name_to_output_order = output_order_by_table

        self.redshift_export = RedshiftExportProtocol(
            delimiter=self.options.column_delimiter)

        # The error table's output order is the 'log_key' of each column.
        err_name, err_table = self.schema.get_error_table()
        self.error_tbl_name = err_name
        self.error_tbl_output_order = [
            col_def['log_key'] for col_def in err_table['columns']
        ]
示例#2
0
    def mapper_init(self):
        """ mrjob mapper initialization.

        Reads the YAML file named by self.options.extractions, wraps it in a
        RedShiftLogSchema and derives the per-table lookup dicts, the export
        protocol and the error-table settings stored on self.

        Overriding subclasses should invoke this method via 'super'.
        """
        def build_columns(table):
            # One Column per entry in the table's 'columns' list.
            return [Column.create_from_table(table, column_def)
                    for column_def in table['columns']]

        def build_table(table_name, table):
            # Reuses the Column list already stored for this table name.
            return Table.create(
                table, columns=self.table_name_to_columns[table_name])

        def output_names(columns):
            # Columns flagged is_noop are left out of the output order.
            return [col.name for col in columns if not col.is_noop]

        # NOTE(review): yaml.load is called without an explicit Loader --
        # confirm the extractions file is always trusted input.
        extraction_yaml = load_from_file(self.options.extractions)
        parsed_schema = RedShiftLogSchema(yaml.load(extraction_yaml))
        self.schema = parsed_schema

        self.table_name_to_columns = dict(
            (table_name, build_columns(table))
            for table_name, table in parsed_schema.tables().iteritems())
        self.table_name_to_table = dict(
            (table_name, build_table(table_name, table))
            for table_name, table in parsed_schema.tables().iteritems())
        self.table_name_to_output_order = dict(
            (table_name, output_names(columns))
            for table_name, columns
            in self.table_name_to_columns.iteritems())

        self.redshift_export = RedshiftExportProtocol(
            delimiter=self.options.column_delimiter)

        error_name, error_def = self.schema.get_error_table()
        self.error_tbl_name = error_name
        # Error rows are emitted in the order of each column's 'log_key'.
        self.error_tbl_output_order = [
            col_def['log_key'] for col_def in error_def['columns']
        ]
示例#3
0
def create_schema(file_path):
    """Load the schema YAML at file_path and sanity-check its tables.

    Builds a Column list for every table in the schema and a Table object
    per table, asserting that each table has a non-empty column list and
    that the schema yields at least one table. As written here, nothing is
    returned.
    """
    # NOTE(review): yaml.load is called without an explicit Loader --
    # confirm file_path always points at trusted input.
    raw_yaml = load_from_file(file_path)
    schema = RedShiftLogSchema(yaml.load(raw_yaml))

    # Build every column list first; the checks below run only afterwards.
    columns_by_name = {}
    for name, table in schema.tables().iteritems():
        columns_by_name[name] = [
            Column.create_from_table(table, column_def)
            for column_def in table['columns']
        ]
    for column_list in columns_by_name.itervalues():
        assert column_list

    tables_by_name = {}
    for name, table in schema.tables().iteritems():
        tables_by_name[name] = Table.create(
            table, columns=columns_by_name[name])
    assert tables_by_name
示例#4
0
def create_schema(file_path):
    """Parse the schema file at file_path and verify it can be built.

    Creates Column objects for each table's 'columns' entries and a Table
    for each table, asserting that no table has an empty column list and
    that at least one table was created. No value is returned in the code
    shown here.
    """
    def build_columns(table):
        # One Column per entry in the table's 'columns' list.
        return [Column.create_from_table(table, column_def)
                for column_def in table['columns']]

    # NOTE(review): yaml.load is called without an explicit Loader --
    # confirm file_path always points at trusted input.
    schema_yaml = load_from_file(file_path)
    schema = RedShiftLogSchema(yaml.load(schema_yaml))

    name_to_columns = dict(
        (name, build_columns(table))
        for name, table in schema.tables().iteritems())
    # Every table must have produced at least one column.
    for _name, table_columns in name_to_columns.iteritems():
        assert table_columns

    name_to_table = dict(
        (name, Table.create(table, columns=name_to_columns[name]))
        for name, table in schema.tables().iteritems())
    # The schema must define at least one table.
    assert name_to_table