Example #1
def test_make_nullable():
    assert {'type': ['boolean', 'null']} \
           == json_schema.make_nullable({'type': 'boolean'})
    assert {'type': ['null', 'boolean']} \
           == json_schema.make_nullable({'type': ['null', 'boolean']})
    assert {'type': ['null', 'string']} \
           == json_schema.make_nullable({'type': ['null', 'string']})

    ## Make sure we're not modifying the original
    schema = {'type': ['string']}
    assert json_schema.get_type(schema) == ['string']
    assert {'type': ['string', 'null']} \
           == json_schema.make_nullable(schema)
    assert json_schema.get_type(schema) == ['string']

    assert {
               'definitions': {
                   'address': {
                       'type': 'object',
                       'properties': {
                           'street_address': {'type': 'string'},
                           'city': {'type': 'string'},
                           'state': {'type': 'string'}
                       },
                       'required': ['street_address', 'city', 'state']
                   }
               },
               'type': ['object', 'null'],
               'properties': {
                   'billing_address': {'$ref': '#/definitions/address'},
                   'shipping_address': {'$ref': '#/definitions/address'}}} \
           == json_schema.make_nullable(
        {
            'definitions': {
                'address': {
                    'type': 'object',
                    'properties': {
                        'street_address': {'type': 'string'},
                        'city': {'type': 'string'},
                        'state': {'type': 'string'}
                    },
                    'required': ['street_address', 'city', 'state']
                }
            },
            'type': 'object',
            'properties': {
                'billing_address': {'$ref': '#/definitions/address'},
                'shipping_address': {'$ref': '#/definitions/address'}}})
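
For readers skimming the assertions above, here is a minimal, self-contained sketch of the behaviour being tested, assuming `make_nullable` only needs to add 'null' to the schema's type list and must not mutate its argument (the function and variable names below are illustrative, not the project's API):

from copy import deepcopy

def make_nullable_sketch(schema):
    """Return a copy of `schema` whose 'type' also accepts 'null'."""
    out = deepcopy(schema)              # never mutate the caller's schema
    types = out.get('type', [])
    if isinstance(types, str):          # 'boolean' -> ['boolean']
        types = [types]
    if 'null' not in types:
        types = types + ['null']
    out['type'] = types
    return out

print(make_nullable_sketch({'type': 'boolean'}))           # {'type': ['boolean', 'null']}
print(make_nullable_sketch({'type': ['null', 'string']}))  # already nullable, unchanged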
Example #2
    def _serialize_table_record_field_name(self, remote_schema,
                                           streamed_schema, path,
                                           value_json_schema):
        """
        Returns the appropriate remote field (column) name for `field`.

        :param remote_schema: TABLE_SCHEMA(remote)
        :param streamed_schema: TABLE_SCHEMA(local)
        :param path: (string, ...)
        :value_json_schema: dict, JSON Schema
        :return: string
        """

        simple_json_schema = json_schema.simple_type(value_json_schema)

        mapping = self._get_mapping(remote_schema, path, simple_json_schema)

        if mapping is not None:
            return mapping

        ## Numbers are valid as `float` OR `int`
        ##  ie, 123.0 and 456 are valid 'number's
        if json_schema.INTEGER in json_schema.get_type(simple_json_schema):
            mapping = self._get_mapping(remote_schema, path,
                                        {'type': json_schema.NUMBER})

            if mapping is not None:
                return mapping

        raise Exception('Unknown column path: {} for table: {}'.format(
            path, remote_schema['path']))
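
The interesting behaviour above is the integer-to-number fallback: a value typed `integer` may have been persisted into a `number` column. Below is a minimal sketch of that lookup, assuming the mapping cache is just a dict keyed by (path, type); the helper and its names are hypothetical, not the project's API.

def resolve_column(mappings, path, json_type):
    """Return the remote column for (path, json_type), falling back from integer to number."""
    column = mappings.get((path, json_type))
    if column is not None:
        return column
    if json_type == 'integer':                    # 123 is also a valid 'number'
        return mappings.get((path, 'number'))
    return None

mappings = {(('amount',), 'number'): 'amount__f'}
print(resolve_column(mappings, ('amount',), 'integer'))  # 'amount__f'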
Example #3
    def json_schema_to_sql_type(self, schema):
        _type = json_schema.get_type(schema)
        not_null = True
        ln = len(_type)
        if ln == 1:
            _type = _type[0]
        if ln == 2 and json_schema.NULL in _type:
            not_null = False
            if _type.index(json_schema.NULL) == 0:
                _type = _type[1]
            else:
                _type = _type[0]
        elif ln > 2:
            raise PostgresError('Multiple types per column not supported')

        sql_type = 'text'

        if 'format' in schema and \
                schema['format'] == 'date-time' and \
                _type == 'string':
            sql_type = 'timestamp with time zone'
        elif _type == 'boolean':
            sql_type = 'boolean'
        elif _type == 'integer':
            sql_type = 'bigint'
        elif _type == 'number':
            sql_type = 'double precision'

        if not_null:
            sql_type += ' NOT NULL'

        return sql_type
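
Read end to end, the method above boils down to a small lookup table plus a nullability suffix. The pairs below are inferred from the code and shown only as an illustration; they are not taken from the project's test suite.

inferred_postgres_types = [
    ({'type': ['string'], 'format': 'date-time'}, 'timestamp with time zone NOT NULL'),
    ({'type': ['integer', 'null']},               'bigint'),
    ({'type': ['number']},                        'double precision NOT NULL'),
    ({'type': ['boolean', 'null']},               'boolean'),
    ({'type': ['string', 'null']},                'text'),
]

for schema, sql in inferred_postgres_types:
    print('{} -> {}'.format(schema, sql))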
Example #4
    def json_schema_to_sql_type(self, schema):
        _type = json_schema.get_type(schema)
        not_null = True
        ln = len(_type)
        if ln == 1:
            _type = _type[0]
        if ln == 2 and json_schema.NULL in _type:
            not_null = False
            if _type.index(json_schema.NULL) == 0:
                _type = _type[1]
            else:
                _type = _type[0]
        elif ln > 2:
            raise SnowflakeError('Multiple types per column not supported')

        sql_type = 'text'

        if 'format' in schema and \
                schema['format'] == 'date-time' and \
                _type == 'string':
            sql_type = 'TIMESTAMP_TZ'
        elif _type == 'boolean':
            sql_type = 'BOOLEAN'
        elif _type == 'integer':
            sql_type = 'NUMBER'
        elif _type == 'number':
            sql_type = 'FLOAT'

        if not_null:
            sql_type += ' NOT NULL'

        return sql_type
Example #5
    def _serialize_table_record_field_name(self, remote_schema, path,
                                           value_json_schema):
        """
        Returns the appropriate remote field (column) name for `path`.

        :param remote_schema: TABLE_SCHEMA(remote)
        :param path: (string, ...)
        :param value_json_schema: dict, JSON Schema
        :return: string
        """

        simple_json_schema = json_schema.simple_type(value_json_schema)

        mapping = self._get_mapping(remote_schema, path, simple_json_schema)

        if mapping is not None:
            return mapping

        ## Numbers are valid as `float` OR `int`
        ##  ie, 123.0 and 456 are valid 'number's
        if json_schema.INTEGER in json_schema.get_type(simple_json_schema):
            mapping = self._get_mapping(remote_schema, path,
                                        {'type': json_schema.NUMBER})

            if mapping is not None:
                return mapping

        raise Exception(
            "A compatible column for path {} and JSONSchema {} in table {} cannot be found."
            .format(path, simple_json_schema, remote_schema['path']))
Example #6
    def merge_put_schemas(self, cur, table_schema, table_name, existing_schema,
                          new_schema):
        new_properties = new_schema['properties']
        existing_properties = existing_schema['schema']['properties']
        for name, schema in new_properties.items():
            ## Mapping exists
            if self.get_mapping(existing_schema, name, schema) is not None:
                pass

            ## New column
            elif name not in existing_properties:

                existing_properties[name] = schema
                self.add_column(cur, table_schema, table_name, name, schema)

            ## Existing column non-nullable, new column is nullable
            elif not json_schema.is_nullable(existing_properties[name]) \
                    and json_schema.get_type(schema) \
                    == json_schema.get_type(json_schema.make_nullable(existing_properties[name])):

                existing_properties[name] = json_schema.make_nullable(
                    existing_properties[name])
                self.make_column_nullable(cur, table_schema, table_name, name)

            ## Existing column, types compatible
            elif json_schema.to_sql(json_schema.make_nullable(schema)) \
                    == json_schema.to_sql(json_schema.make_nullable(existing_properties[name])):
                pass

            ## Column type change
            elif self.mapping_name(name, schema) not in existing_properties \
                and self.mapping_name(name, existing_properties[name]) not in existing_properties:

                self.split_column(cur, table_schema, table_name, name, schema,
                                  existing_properties)

            ## Error
            else:
                raise PostgresError(
                    'Cannot handle column type change for: {}.{} columns {} and {}. Name collision likely.'
                    .format(table_schema, table_name, name,
                            self.mapping_name(name, schema)))
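
The if/elif ladder in merge_put_schemas is easier to audit when the branch order is restated on its own. The sketch below is a simplification using plain type-list comparisons in place of the real `json_schema` helpers, so it is illustrative only.

def classify_property(name, new_schema, existing_properties, mapped_names):
    """Restate merge_put_schemas' branch order for a single property.

    `mapped_names` stands in for the column-mapping bookkeeping; everything here
    is a simplification of the helpers in the real json_schema module.
    """
    def nullable(types):
        return sorted(set(types) | {'null'})

    if name in mapped_names:
        return 'mapping exists: nothing to do'
    if name not in existing_properties:
        return 'new column: add it'
    existing = existing_properties[name]['type']
    new = new_schema['type']
    if 'null' not in existing and sorted(new) == nullable(existing):
        return 'widen existing column to nullable'
    if nullable(new) == nullable(existing):
        return 'types compatible: nothing to do'
    return 'incompatible type: split the column (or fail on name collision)'

props = {'age': {'type': ['integer']}}
print(classify_property('age', {'type': ['integer', 'null']}, props, set()))
# -> 'widen existing column to nullable'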
Example #7
def _literal_only_schema(schema):
    ret = deepcopy(schema)

    ret_type = json_schema.get_type(ret)

    if json_schema.is_object(ret):
        ret_type.remove(json_schema.OBJECT)
    if json_schema.is_iterable(ret):
        ret_type.remove(json_schema.ARRAY)

    ret['type'] = ret_type

    return ret
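
A self-contained sketch of what this helper leaves behind, assuming 'type' is already a list and that object/array are the only non-literal types involved (illustrative names, not the project's API):

from copy import deepcopy

def literal_only_sketch(schema):
    """Drop 'object' and 'array' from the schema's type list, keeping literal types."""
    out = deepcopy(schema)
    out['type'] = [t for t in out.get('type', []) if t not in ('object', 'array')]
    return out

print(literal_only_sketch({'type': ['object', 'string', 'null']}))
# {'type': ['string', 'null']}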
Example #8
    def add_column_mapping(self, cur, table_name, from_path, to_name,
                           mapped_schema):
        metadata = self._get_table_metadata(cur, table_name)

        mapping = {
            'type': json_schema.get_type(mapped_schema),
            'from': from_path
        }

        if 't' == json_schema.shorthand(mapped_schema):
            mapping['format'] = 'date-time'

        metadata['mappings'][to_name] = mapping

        self._set_table_metadata(cur, table_name, metadata)
Example #9
    def add_column_mapping(self, cur, table_name, from_path, to_name,
                           mapped_schema):
        metadata = self._get_table_metadata(cur, table_name)

        if not metadata:
            metadata = {}

        if 'mappings' not in metadata:
            metadata['mappings'] = {}

        metadata['mappings'][to_name] = {
            'type': json_schema.get_type(mapped_schema),
            'from': from_path
        }

        self._set_table_metadata(cur, table_name, metadata)
Example #10
    def add_column_mapping(self, cur, table_schema, table_name, column_name,
                           mapped_name, mapped_schema):
        metadata = self.get_table_metadata(cur, table_schema, table_name)

        if not metadata:
            metadata = {}

        if 'mappings' not in metadata:
            metadata['mappings'] = {}

        metadata['mappings'][mapped_name] = {
            'type': json_schema.get_type(mapped_schema),
            'from': column_name
        }

        self.set_table_metadata(cur, table_schema, table_name, metadata)
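
All three add_column_mapping variants above converge on the same metadata shape: a 'mappings' dict keyed by the remote column name, recording the original name (or path) and the JSON Schema type. A hedged illustration of that shape, with the storage call left out:

metadata = {}
metadata.setdefault('mappings', {})['amount__f'] = {
    'type': ['number', 'null'],   # json_schema.get_type(mapped_schema)
    'from': 'amount',             # column_name here, a path tuple in the other variants
}
print(metadata)
# {'mappings': {'amount__f': {'type': ['number', 'null'], 'from': 'amount'}}}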
Example #11
    def _serialize_table_record_field_name(self, remote_schema, path, value_json_schema_tuple):
        """
        Returns the appropriate remote field (column) name for `path`.

        :param remote_schema: TABLE_SCHEMA(remote)
        :param path: (string, ...)
        :param value_json_schema_tuple: tuple, JSON Schema as (type,) or (type, format)
        :return: string
        """

        # Rebuild the dict that needs to be passed further down the call stack
        if len(value_json_schema_tuple) == 1:
            value_json_schema = {'type': value_json_schema_tuple[0]}
        else:
            value_json_schema = {'type': value_json_schema_tuple[0],
                                 'format': value_json_schema_tuple[1]}

        simple_json_schema = json_schema.simple_type(value_json_schema)

        mapping = self._get_mapping(remote_schema,
                                    path,
                                    simple_json_schema)

        if mapping is not None:
            return mapping

        ## Numbers are valid as `float` OR `int`
        ##  ie, 123.0 and 456 are valid 'number's
        if json_schema.INTEGER in json_schema.get_type(simple_json_schema):
            mapping = self._get_mapping(remote_schema,
                                        path,
                                        {'type': json_schema.NUMBER})

            if mapping is not None:
                return mapping

        raise Exception("A compatible column for path {} and JSONSchema {} in table {} cannot be found.".format(
            path,
            simple_json_schema,
            remote_schema['path']
        ))
Example #12
def _literal_only_schema(schema):

    ret_types = json_schema.get_type(schema)

    if json_schema.is_object(schema):
        ret_types.remove(json_schema.OBJECT)
    if json_schema.is_iterable(schema):
        ret_types.remove(json_schema.ARRAY)
    if json_schema.is_nullable(schema):
        ret_types.remove(json_schema.NULL)

    ret_schemas = []
    for t in ret_types:
        s = deepcopy(schema)
        s['type'] = [t]

        if json_schema.is_nullable(schema):
            s = json_schema.make_nullable(s)

        ret_schemas.append(s)

    return {'anyOf': ret_schemas}
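
Compared with Example #7, this version splits the remaining literal types into separate single-type schemas under `anyOf`, re-applying nullability to each branch. A minimal sketch of the same decomposition, assuming only the plain 'type'/'anyOf' keywords (illustrative, not the project's code):

from copy import deepcopy

def literal_only_any_of_sketch(schema):
    """One single-type schema per literal type, each nullable if the original was."""
    nullable = 'null' in schema['type']
    branches = []
    for t in schema['type']:
        if t in ('object', 'array', 'null'):
            continue
        branch = deepcopy(schema)
        branch['type'] = [t, 'null'] if nullable else [t]
        branches.append(branch)
    return {'anyOf': branches}

print(literal_only_any_of_sketch({'type': ['string', 'integer', 'null']}))
# {'anyOf': [{'type': ['string', 'null']}, {'type': ['integer', 'null']}]}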
Example #13
    def write_batch(self, stream_buffer):
        if not self.persist_empty_tables and stream_buffer.count == 0:
            return None

        with self.conn.cursor() as cur:
            try:
                cur.execute('BEGIN;')

                self.setup_table_mapping_cache(cur)

                root_table_name = self.add_table_mapping_helper(
                    (stream_buffer.stream, ), self.table_mapping_cache)['to']
                current_table_schema = self.get_table_schema(
                    cur, root_table_name)

                current_table_version = None

                if current_table_schema:
                    current_table_version = current_table_schema.get(
                        'version', None)

                    if set(stream_buffer.key_properties) \
                            != set(current_table_schema.get('key_properties')):
                        raise PostgresError(
                            '`key_properties` change detected. Existing values are: {}. Streamed values are: {}'
                            .format(current_table_schema.get('key_properties'),
                                    stream_buffer.key_properties))

                    for key_property in stream_buffer.key_properties:
                        canonicalized_key, remote_column_schema = self.fetch_column_from_path(
                            (key_property, ), current_table_schema)
                        if self.json_schema_to_sql_type(remote_column_schema) \
                                != self.json_schema_to_sql_type(stream_buffer.schema['properties'][key_property]):
                            raise PostgresError((
                                '`key_properties` type change detected for "{}". '
                                + 'Existing values are: {}. ' +
                                'Streamed values are: {}, {}, {}').format(
                                    key_property,
                                    json_schema.get_type(
                                        current_table_schema['schema']
                                        ['properties'][key_property]),
                                    json_schema.get_type(
                                        stream_buffer.schema['properties']
                                        [key_property]),
                                    self.json_schema_to_sql_type(
                                        current_table_schema['schema']
                                        ['properties'][key_property]),
                                    self.json_schema_to_sql_type(
                                        stream_buffer.schema['properties']
                                        [key_property])))

                target_table_version = current_table_version or stream_buffer.max_version

                self.LOGGER.info(
                    'Stream {} ({}) with max_version {} targeting {}'.format(
                        stream_buffer.stream, root_table_name,
                        stream_buffer.max_version, target_table_version))

                root_table_name = stream_buffer.stream
                if current_table_version is not None and \
                        stream_buffer.max_version is not None:
                    if stream_buffer.max_version < current_table_version:
                        self.LOGGER.warning(
                            '{} - Records from an earlier table version detected.'
                            .format(stream_buffer.stream))
                        cur.execute('ROLLBACK;')
                        return None

                    elif stream_buffer.max_version > current_table_version:
                        root_table_name += SEPARATOR + str(
                            stream_buffer.max_version)
                        target_table_version = stream_buffer.max_version

                self.LOGGER.info('Root table name {}'.format(root_table_name))

                written_batches_details = self.write_batch_helper(
                    cur, root_table_name, stream_buffer.schema,
                    stream_buffer.key_properties, stream_buffer.get_batch(),
                    {'version': target_table_version})

                cur.execute('COMMIT;')

                return written_batches_details
            except Exception as ex:
                cur.execute('ROLLBACK;')
                message = 'Exception writing records'
                self.LOGGER.exception(message)
                raise PostgresError(message, ex)
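
The version handling in the middle of this method decides whether the batch is stale, targets the current table, or targets a new versioned table. A condensed, self-contained sketch of just that decision (the SEPARATOR constant and the names are assumptions):

SEPARATOR = '__'  # assumed to match the project's table-name separator

def resolve_root_table(stream, current_version, max_version):
    """Return (root_table_name, target_version), or None when the batch is stale."""
    target_version = current_version if current_version is not None else max_version
    root = stream
    if current_version is not None and max_version is not None:
        if max_version < current_version:
            return None                                   # earlier table version: skip batch
        if max_version > current_version:
            root = stream + SEPARATOR + str(max_version)  # write into a new versioned table
            target_version = max_version
    return root, target_version

print(resolve_root_table('users', 3, 5))  # ('users__5', 5)
print(resolve_root_table('users', 5, 3))  # None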
Example #14
    def upsert_table_helper(self, connection, schema, metadata):
        """
        Upserts the `schema` to remote by:
        - creating table if necessary
        - adding columns
        - adding column mappings
        - migrating data from old columns to new, etc.

        :param connection: remote connection, type left to be determined by implementing class
        :param schema: TABLE_SCHEMA(local)
        :param metadata: additional information necessary for downstream operations
        :return: TABLE_SCHEMA(remote)
        """
        table_path = schema['path']

        table_name = self.add_table_mapping(connection, table_path, metadata)

        existing_schema = self.get_table_schema(connection, table_path,
                                                table_name)

        if existing_schema is None:
            self.add_table(connection, table_name, metadata)
            existing_schema = self.get_table_schema(connection, table_path,
                                                    table_name)

        self.add_key_properties(connection, table_name,
                                schema.get('key_properties', None))

        ## Only process columns which have single, nullable types
        single_type_columns = []
        for column_name__or__path, column_schema in schema['schema'][
                'properties'].items():
            column_path = column_name__or__path
            if isinstance(column_name__or__path, str):
                column_path = (column_name__or__path, )

            single_type_column_schema = deepcopy(column_schema)
            column_types = json_schema.get_type(single_type_column_schema)
            make_nullable = json_schema.is_nullable(column_schema)

            for type in column_types:
                if type == json_schema.NULL:
                    continue

                single_type_column_schema['type'] = [type]

                if make_nullable:
                    single_type_columns.append(
                        (column_path,
                         json_schema.make_nullable(single_type_column_schema)))
                else:
                    single_type_columns.append(
                        (column_path, single_type_column_schema))

        ## Process new columns against existing
        raw_mappings = existing_schema.get('mappings', {})

        mappings = []

        for to, m in raw_mappings.items():
            mappings.append({
                'from': tuple(m['from']),
                'to': to,
                'type': m['type']
            })

        table_empty = self.is_table_empty(connection, table_name)

        for column_path, column_schema in single_type_columns:
            canonicalized_column_name = self._canonicalize_column_identifier(
                column_path, column_schema, mappings)
            nullable_column_schema = json_schema.make_nullable(column_schema)

            ## NEW COLUMN
            if column_path not in [m['from'] for m in mappings]:
                ### NON EMPTY TABLE
                if not table_empty:
                    self.LOGGER.warning(
                        'NOT EMPTY: Forcing new column `{}` in table `{}` to be nullable due to table not empty.'
                        .format(column_path, table_name))
                    column_schema = nullable_column_schema

                self.add_column(connection, table_name,
                                canonicalized_column_name, column_schema)
                self.add_column_mapping(connection, table_name, column_path,
                                        canonicalized_column_name,
                                        column_schema)
                mappings.append({
                    'from': column_path,
                    'to': canonicalized_column_name,
                    'type': json_schema.get_type(column_schema)
                })

                continue

            ## EXISTING COLUMNS
            ### SCHEMAS MATCH
            if [
                    True for m in mappings if m['from'] == column_path and
                    json_schema.to_sql(m) == json_schema.to_sql(column_schema)
            ]:
                continue
            ### NULLABLE SCHEMAS MATCH
            ###  New column _is not_ nullable, existing column _is_
            if [
                    True for m in mappings
                    if m['from'] == column_path and json_schema.to_sql(m) ==
                    json_schema.to_sql(nullable_column_schema)
            ]:
                continue

            ### NULL COMPATIBILITY
            ###  New column _is_ nullable, existing column is _not_
            non_null_original_column = [
                m for m in mappings
                if m['from'] == column_path and json_schema.sql_shorthand(m) ==
                json_schema.sql_shorthand(column_schema)
            ]
            if non_null_original_column:
                ## MAKE NULLABLE
                self.make_column_nullable(connection, table_name,
                                          canonicalized_column_name)
                self.drop_column_mapping(connection, table_name,
                                         canonicalized_column_name)
                self.add_column_mapping(connection, table_name, column_path,
                                        canonicalized_column_name,
                                        nullable_column_schema)

                mappings = [
                    m for m in mappings if not (
                        m['from'] == column_path and json_schema.sql_shorthand(
                            m) == json_schema.sql_shorthand(column_schema))
                ]
                mappings.append({
                    'from': column_path,
                    'to': canonicalized_column_name,
                    'type': json_schema.get_type(nullable_column_schema)
                })

                continue

            ### FIRST MULTI TYPE
            ###  New column matches existing column path, but the types are incompatible
            duplicate_paths = [m for m in mappings if m['from'] == column_path]

            if 1 == len(duplicate_paths):

                existing_mapping = duplicate_paths[0]
                existing_column_name = existing_mapping['to']

                if existing_column_name:
                    self.drop_column_mapping(connection, table_name,
                                             existing_column_name)

                ## Update existing properties
                mappings = [m for m in mappings if m['from'] != column_path]
                mappings.append({
                    'from': column_path,
                    'to': canonicalized_column_name,
                    'type': json_schema.get_type(nullable_column_schema)
                })

                existing_column_new_normalized_name = self._canonicalize_column_identifier(
                    column_path, existing_mapping, mappings)
                mappings.append({
                    'from': column_path,
                    'to': existing_column_new_normalized_name,
                    'type': json_schema.get_type(
                        json_schema.make_nullable(existing_mapping))
                })

                ## Add new columns
                ### NOTE: all migrated columns will be nullable and remain that way

                #### Table Metadata
                self.add_column_mapping(
                    connection, table_name, column_path,
                    existing_column_new_normalized_name,
                    json_schema.make_nullable(existing_mapping))
                self.add_column_mapping(connection, table_name, column_path,
                                        canonicalized_column_name,
                                        nullable_column_schema)

                #### Columns
                self.add_column(connection, table_name,
                                existing_column_new_normalized_name,
                                json_schema.make_nullable(existing_mapping))

                self.add_column(connection, table_name,
                                canonicalized_column_name,
                                nullable_column_schema)

                ## Migrate existing data
                self.migrate_column(connection, table_name,
                                    existing_mapping['to'],
                                    existing_column_new_normalized_name)

                ## Drop existing column
                self.drop_column(connection, table_name,
                                 existing_mapping['to'])

            ## REST MULTI TYPE
            elif 1 < len(duplicate_paths):

                ## Add new column
                self.add_column_mapping(connection, table_name, column_path,
                                        canonicalized_column_name,
                                        nullable_column_schema)
                self.add_column(connection, table_name,
                                canonicalized_column_name,
                                nullable_column_schema)

                mappings.append({
                    'from': column_path,
                    'to': canonicalized_column_name,
                    'type': json_schema.get_type(nullable_column_schema)
                })

            ## UNKNOWN
            else:
                raise Exception(
                    'UNKNOWN: Cannot handle merging column `{}` (canonicalized as: `{}`) in table `{}`.'
                    .format(column_path, canonicalized_column_name,
                            table_name))

        return self.get_table_schema(connection, table_path, table_name)
Example #15
    def upsert_table_helper(self,
                            connection,
                            schema,
                            metadata,
                            log_schema_changes=True):
        """
        Upserts the `schema` to remote by:
        - creating table if necessary
        - adding columns
        - adding column mappings
        - migrating data from old columns to new, etc.

        :param connection: remote connection, type left to be determined by implementing class
        :param schema: TABLE_SCHEMA(local)
        :param metadata: additional information necessary for downstream operations
        :param log_schema_changes: defaults to True; set to False to disable logging of table-level schema changes
        :return: TABLE_SCHEMA(remote)
        """
        table_path = schema['path']

        _metadata = deepcopy(metadata)
        _metadata['schema_version'] = CURRENT_SCHEMA_VERSION

        table_name = self.add_table_mapping(connection, table_path, _metadata)

        existing_schema = self._get_table_schema(connection, table_path,
                                                 table_name)

        if existing_schema is None:
            self.add_table(connection, table_name, _metadata)
            existing_schema = self._get_table_schema(connection, table_path,
                                                     table_name)

        self.add_key_properties(connection, table_name,
                                schema.get('key_properties', None))

        ## Only process columns which have single, nullable types
        single_type_columns = []
        for column_name__or__path, column_schema in schema['schema'][
                'properties'].items():
            column_path = column_name__or__path
            if isinstance(column_name__or__path, str):
                column_path = (column_name__or__path, )

            single_type_column_schema = deepcopy(column_schema)
            column_types = json_schema.get_type(single_type_column_schema)
            make_nullable = json_schema.is_nullable(column_schema)

            for type in column_types:
                if type == json_schema.NULL:
                    continue

                single_type_column_schema['type'] = [type]

                if make_nullable:
                    single_type_columns.append(
                        (column_path,
                         json_schema.make_nullable(single_type_column_schema)))
                else:
                    single_type_columns.append(
                        (column_path, deepcopy(single_type_column_schema)))

        ## Process new columns against existing
        raw_mappings = existing_schema.get('mappings', {})

        mappings = []

        for to, m in raw_mappings.items():
            mapping = json_schema.simple_type(m)
            mapping['from'] = tuple(m['from'])
            mapping['to'] = to
            mappings.append(mapping)

        table_empty = self.is_table_empty(connection, table_name)

        for column_path, column_schema in single_type_columns:
            upsert_table_helper__start__column = time.monotonic()

            canonicalized_column_name = self._canonicalize_column_identifier(
                column_path, column_schema, mappings)
            nullable_column_schema = json_schema.make_nullable(column_schema)

            def log_message(msg):
                if log_schema_changes:
                    self.LOGGER.info(
                        'Table Schema Change [`{}`.`{}`:`{}`] {} (took {} millis)'
                        .format(
                            table_name, column_path, canonicalized_column_name,
                            msg,
                            _duration_millis(
                                upsert_table_helper__start__column)))

            ## NEW COLUMN
            if column_path not in [m['from'] for m in mappings]:
                upsert_table_helper__column = "New column"
                ### NON EMPTY TABLE
                if not table_empty:
                    upsert_table_helper__column += ", non empty table"
                    self.LOGGER.warning(
                        'NOT EMPTY: Forcing new column `{}` in table `{}` to be nullable due to table not empty.'
                        .format(column_path, table_name))
                    column_schema = nullable_column_schema

                self.add_column(connection, table_name,
                                canonicalized_column_name, column_schema)
                self.add_column_mapping(connection, table_name, column_path,
                                        canonicalized_column_name,
                                        column_schema)

                mapping = json_schema.simple_type(column_schema)
                mapping['from'] = column_path
                mapping['to'] = canonicalized_column_name
                mappings.append(mapping)

                log_message(upsert_table_helper__column)

                continue

            ## EXISTING COLUMNS
            ### SCHEMAS MATCH
            if [
                    True for m in mappings if m['from'] == column_path
                    and self.json_schema_to_sql_type(
                        m) == self.json_schema_to_sql_type(column_schema)
            ]:
                continue
            ### NULLABLE SCHEMAS MATCH
            ###  New column _is not_ nullable, existing column _is_
            if [
                    True for m in mappings if m['from'] == column_path
                    and self.json_schema_to_sql_type(m) ==
                    self.json_schema_to_sql_type(nullable_column_schema)
            ]:
                continue

            ### NULL COMPATIBILITY
            ###  New column _is_ nullable, existing column is _not_
            non_null_original_column = [
                m for m in mappings
                if m['from'] == column_path and json_schema.shorthand(m) ==
                json_schema.shorthand(column_schema)
            ]
            if non_null_original_column:
                ## MAKE NULLABLE
                self.make_column_nullable(connection, table_name,
                                          canonicalized_column_name)
                self.drop_column_mapping(connection, table_name,
                                         canonicalized_column_name)
                self.add_column_mapping(connection, table_name, column_path,
                                        canonicalized_column_name,
                                        nullable_column_schema)

                mappings = [
                    m for m in mappings
                    if not (m['from'] == column_path and json_schema.shorthand(
                        m) == json_schema.shorthand(column_schema))
                ]

                mapping = json_schema.simple_type(nullable_column_schema)
                mapping['from'] = column_path
                mapping['to'] = canonicalized_column_name
                mappings.append(mapping)

                log_message(
                    "Made existing column nullable. New column is nullable, existing column is not"
                )

                continue

            ### FIRST MULTI TYPE
            ###  New column matches existing column path, but the types are incompatible
            duplicate_paths = [m for m in mappings if m['from'] == column_path]

            if 1 == len(duplicate_paths):
                existing_mapping = duplicate_paths[0]
                existing_column_name = existing_mapping['to']

                if existing_column_name:
                    self.drop_column_mapping(connection, table_name,
                                             existing_column_name)

                ## Update existing properties
                mappings = [m for m in mappings if m['from'] != column_path]

                mapping = json_schema.simple_type(nullable_column_schema)
                mapping['from'] = column_path
                mapping['to'] = canonicalized_column_name
                mappings.append(mapping)

                existing_column_new_normalized_name = self._canonicalize_column_identifier(
                    column_path, existing_mapping, mappings)

                mapping = json_schema.simple_type(
                    json_schema.make_nullable(existing_mapping))
                mapping['from'] = column_path
                mapping['to'] = existing_column_new_normalized_name
                mappings.append(mapping)

                ## Add new columns
                ### NOTE: all migrated columns will be nullable and remain that way

                #### Table Metadata
                self.add_column_mapping(
                    connection, table_name, column_path,
                    existing_column_new_normalized_name,
                    json_schema.make_nullable(existing_mapping))
                self.add_column_mapping(connection, table_name, column_path,
                                        canonicalized_column_name,
                                        nullable_column_schema)

                #### Columns
                self.add_column(connection, table_name,
                                existing_column_new_normalized_name,
                                json_schema.make_nullable(existing_mapping))

                self.add_column(connection, table_name,
                                canonicalized_column_name,
                                nullable_column_schema)

                ## Migrate existing data
                self.migrate_column(connection, table_name,
                                    existing_mapping['to'],
                                    existing_column_new_normalized_name)

                ## Drop existing column
                self.drop_column(connection, table_name,
                                 existing_mapping['to'])

                upsert_table_helper__column = "Splitting `{}` into `{}` and `{}`. New column matches existing column path, but the types are incompatible.".format(
                    existing_column_name, existing_column_new_normalized_name,
                    canonicalized_column_name)

            ## REST MULTI TYPE
            elif 1 < len(duplicate_paths):
                ## Add new column
                self.add_column_mapping(connection, table_name, column_path,
                                        canonicalized_column_name,
                                        nullable_column_schema)
                self.add_column(connection, table_name,
                                canonicalized_column_name,
                                nullable_column_schema)

                mapping = json_schema.simple_type(nullable_column_schema)
                mapping['from'] = column_path
                mapping['to'] = canonicalized_column_name
                mappings.append(mapping)

                upsert_table_helper__column = "Adding new column to split column `{}`. New column matches existing column's path, but no types were compatible.".format(
                    column_path)

            ## UNKNOWN
            else:
                raise Exception(
                    'UNKNOWN: Cannot handle merging column `{}` (canonicalized as: `{}`) in table `{}`.'
                    .format(column_path, canonicalized_column_name,
                            table_name))

            log_message(upsert_table_helper__column)

        return self._get_table_schema(connection, table_path, table_name)
Example #16
    def write_batch(self, stream_buffer):
        if stream_buffer.count == 0:
            return None

        with self.conn.cursor() as cur:
            try:
                self._validate_identifier(stream_buffer.stream)

                cur.execute('BEGIN;')

                current_table_schema = self.get_table_schema(
                    cur, (stream_buffer.stream, ), stream_buffer.stream)

                current_table_version = None

                if current_table_schema:
                    current_table_version = current_table_schema.get(
                        'version', None)

                    if set(stream_buffer.key_properties) \
                            != set(current_table_schema.get('key_properties')):
                        raise PostgresError(
                            '`key_properties` change detected. Existing values are: {}. Streamed values are: {}'
                            .format(current_table_schema.get('key_properties'),
                                    stream_buffer.key_properties))

                    for key in stream_buffer.key_properties:
                        if self.json_schema_to_sql_type(current_table_schema['schema']['properties'][key]) \
                                != self.json_schema_to_sql_type(stream_buffer.schema['properties'][key]):
                            raise PostgresError((
                                '`key_properties` type change detected for "{}". '
                                + 'Existing values are: {}. ' +
                                'Streamed values are: {}, {}, {}'
                            ).format(
                                key,
                                json_schema.get_type(
                                    current_table_schema['schema']
                                    ['properties'][key]),
                                json_schema.get_type(
                                    stream_buffer.schema['properties'][key]),
                                self.json_schema_to_sql_type(
                                    current_table_schema['schema']
                                    ['properties'][key]),
                                self.json_schema_to_sql_type(
                                    stream_buffer.schema['properties'][key])))

                root_table_name = stream_buffer.stream
                target_table_version = current_table_version or stream_buffer.max_version

                if current_table_version is not None and \
                        stream_buffer.max_version is not None:
                    if stream_buffer.max_version < current_table_version:
                        self.LOGGER.warning(
                            '{} - Records from an earlier table version detected.'
                            .format(stream_buffer.stream))
                        cur.execute('ROLLBACK;')
                        return None

                    elif stream_buffer.max_version > current_table_version:
                        root_table_name = stream_buffer.stream + SEPARATOR + str(
                            stream_buffer.max_version)
                        target_table_version = stream_buffer.max_version

                self._validate_identifier(root_table_name)
                written_batches_details = self.write_batch_helper(
                    cur, root_table_name, stream_buffer.schema,
                    stream_buffer.key_properties, stream_buffer.get_batch(),
                    {'version': target_table_version})

                cur.execute('COMMIT;')

                return written_batches_details
            except Exception as ex:
                cur.execute('ROLLBACK;')
                message = 'Exception writing records'
                self.LOGGER.exception(message)
                raise PostgresError(message, ex)
Example #17
    def write_batch(self, stream_buffer):
        if stream_buffer.count == 0:
            return

        with self.conn.cursor() as cur:
            try:
                cur.execute('BEGIN;')

                processed_records = map(
                    partial(self.process_record_message,
                            stream_buffer.use_uuid_pk,
                            self.get_postgres_datetime()),
                    stream_buffer.peek_buffer())
                versions = set()
                max_version = None
                records_all_versions = []
                for record in processed_records:
                    record_version = record.get(SINGER_TABLE_VERSION)
                    if record_version is not None and \
                       (max_version is None or record_version > max_version):
                        max_version = record_version
                    versions.add(record_version)
                    records_all_versions.append(record)

                current_table_schema = self.get_schema(cur,
                                                       self.postgres_schema,
                                                       stream_buffer.stream)

                current_table_version = None

                if current_table_schema:
                    current_table_version = current_table_schema.get(
                        'version', None)

                    if set(stream_buffer.key_properties) \
                            != set(current_table_schema.get('key_properties')):
                        raise PostgresError(
                            '`key_properties` change detected. Existing values are: {}. Streamed values are: {}'
                            .format(current_table_schema.get('key_properties'),
                                    stream_buffer.key_properties))

                if max_version is not None:
                    target_table_version = max_version
                else:
                    target_table_version = None

                if current_table_version is not None and \
                        min(versions) < current_table_version:
                    self.logger.warning(
                        '{} - Records from an earlier table version detected.'
                        .format(stream_buffer.stream))
                if len(versions) > 1:
                    self.logger.warning(
                        '{} - Multiple table versions in stream, only using the latest.'
                        .format(stream_buffer.stream))

                if current_table_version is not None and \
                   target_table_version > current_table_version:
                    root_table_name = stream_buffer.stream + self.SEPARATOR + str(
                        target_table_version)
                else:
                    root_table_name = stream_buffer.stream

                if target_table_version is not None:
                    records = filter(
                        lambda x: x.get(SINGER_TABLE_VERSION) ==
                        target_table_version, records_all_versions)
                else:
                    records = records_all_versions

                root_table_schema = json_schema.simplify(stream_buffer.schema)

                ## Add singer columns to root table
                self.add_singer_columns(root_table_schema,
                                        stream_buffer.key_properties)

                subtables = {}
                key_prop_schemas = {}
                for key in stream_buffer.key_properties:
                    if current_table_schema \
                            and json_schema.get_type(current_table_schema['schema']['properties'][key]) \
                            != json_schema.get_type(root_table_schema['properties'][key]):
                        raise PostgresError(
                            ('`key_properties` type change detected for "{}". '
                             + 'Existing values are: {}. ' +
                             'Streamed values are: {}').format(
                                 key,
                                 json_schema.get_type(
                                     current_table_schema['schema']
                                     ['properties'][key]),
                                 json_schema.get_type(
                                     root_table_schema['properties'][key])))

                    key_prop_schemas[key] = root_table_schema['properties'][
                        key]

                self.denest_schema(root_table_name, root_table_schema,
                                   key_prop_schemas, subtables)

                root_temp_table_name = self.upsert_table_schema(
                    cur, root_table_name, root_table_schema,
                    stream_buffer.key_properties, target_table_version)

                nested_upsert_tables = []
                for table_name, subtable_json_schema in subtables.items():
                    temp_table_name = self.upsert_table_schema(
                        cur, table_name, subtable_json_schema, None, None)
                    nested_upsert_tables.append({
                        'table_name': table_name,
                        'json_schema': subtable_json_schema,
                        'temp_table_name': temp_table_name
                    })

                records_map = {}
                self.denest_records(root_table_name, records, records_map,
                                    stream_buffer.key_properties)
                self.persist_rows(cur, root_table_name, root_temp_table_name,
                                  root_table_schema,
                                  stream_buffer.key_properties,
                                  records_map[root_table_name])
                for nested_upsert_table in nested_upsert_tables:
                    key_properties = []
                    for key in stream_buffer.key_properties:
                        key_properties.append(SINGER_SOURCE_PK_PREFIX + key)
                    self.persist_rows(
                        cur, nested_upsert_table['table_name'],
                        nested_upsert_table['temp_table_name'],
                        nested_upsert_table['json_schema'], key_properties,
                        records_map[nested_upsert_table['table_name']])

                cur.execute('COMMIT;')
            except Exception as ex:
                cur.execute('ROLLBACK;')
                message = 'Exception writing records'
                self.logger.exception(message)
                raise PostgresError(message, ex)

        stream_buffer.flush_buffer()