# Example #1
    def _get_mapping(self, existing_schema, path, schema):
        for to, mapping in existing_schema.get('mappings', {}).items():
            if tuple(mapping['from']) == path \
                    and json_schema.shorthand(mapping) == json_schema.shorthand(schema):
                return to

        return None
def test_sql_shorthand():
    """Shorthand codes: boolean -> 'b', string -> 's', date-time string -> 't';
    making the type nullable ('null' in the type list) must not change the code."""
    cases = [
        ('b', {'type': 'boolean'}),
        ('b', {'type': ['null', 'boolean']}),
        ('s', {'type': ['null', 'string']}),
        ('t', {'type': ['null', 'string'], 'format': 'date-time'}),
        ('t', {'type': 'string', 'format': 'date-time'}),
    ]

    for expected, schema in cases:
        assert expected == json_schema.shorthand(schema)
    def _canonicalize_column_identifier(self, path, schema, mappings):
        """Resolve a unique remote column name for the column at `path` with `schema`.

        Returns an existing mapped name when one with a matching type-shorthand
        exists; otherwise canonicalizes the path into an identifier, appending a
        type suffix on a path collision and integer suffixes on name collisions,
        always truncating so the result fits `self.IDENTIFIER_FIELD_LENGTH`.

        :param path: tuple of keys identifying the (possibly nested) column
        :param schema: JSON Schema for the column being added
        :param mappings: list of existing mapping dicts with 'from'/'to' keys
        :return: canonicalized column name, unique among `mappings`' 'to' names
        """

        from_type__to_name = {}
        existing_paths = set()
        existing_column_names = set()

        for m in mappings:
            from_type__to_name[(m['from'], json_schema.shorthand(m))] = m['to']
            existing_paths.add(m['from'])
            existing_column_names.add(m['to'])

        ## MAPPING EXISTS, NO CANONICALIZATION NECESSARY
        if (path, json_schema.shorthand(schema)) in from_type__to_name:
            return from_type__to_name[(path, json_schema.shorthand(schema))]

        raw_canonicalized_column_name = self.canonicalize_identifier(
            SEPARATOR.join(path))

        def _truncated(suffix):
            # Truncate the raw name so that name + suffix fits the identifier limit.
            return (raw_canonicalized_column_name[
                        :self.IDENTIFIER_FIELD_LENGTH - len(suffix)]
                    + suffix)

        canonicalized_column_name = _truncated('')

        raw_suffix = ''
        ## NO TYPE MATCH: the path exists remotely but with an incompatible type,
        ## so disambiguate by appending the new schema's type shorthand.
        if path in existing_paths:
            raw_suffix = SEPARATOR + json_schema.shorthand(schema)
            canonicalized_column_name = _truncated(raw_suffix)

            self.LOGGER.warning(
                'FIELD COLLISION: Field `{}` exists in remote already. No compatible type found. Appending type suffix: `{}`'
                .format(path, canonicalized_column_name))

        i = 0
        ## NAME COLLISION: bump an integer suffix until the name is unique.
        while canonicalized_column_name in existing_column_names:
            self.LOGGER.warning(
                'NAME COLLISION: Field `{}` collided with `{}` in remote. Adding new integer suffix...'
                .format(path, canonicalized_column_name))

            i += 1
            canonicalized_column_name = _truncated(raw_suffix + SEPARATOR + str(i))

        return canonicalized_column_name
    def add_column_mapping(self, cur, table_name, from_path, to_name,
                           mapped_schema):
        """Record a column mapping (`from_path` -> `to_name`) in the table's
        remote metadata, including the mapped type and, for date-time values
        (shorthand 't'), the 'date-time' format marker."""
        metadata = self._get_table_metadata(cur, table_name)

        # Key order ('type' then 'from') kept stable for serialized metadata.
        new_mapping = {
            'type': json_schema.get_type(mapped_schema),
            'from': from_path
        }

        if json_schema.shorthand(mapped_schema) == 't':
            new_mapping['format'] = 'date-time'

        metadata['mappings'][to_name] = new_mapping

        self._set_table_metadata(cur, table_name, metadata)
# Example #5
    def upsert_table_helper(self, connection, schema, metadata, log_schema_changes=True):
        """
        Upsert `schema` to the remote by:
        - creating the table if necessary
        - adding columns
        - adding column mappings
        - migrating data from old columns to new, etc.

        :param connection: remote connection, type left to be determined by implementing class
        :param schema: TABLE_SCHEMA(local)
        :param metadata: additional information necessary for downstream operations
        :param log_schema_changes: defaults to True, set to False to disable logging of table level schema changes
        :return: TABLE_SCHEMA(remote)
        """
        table_path = schema['path']

        with self._set_timer_tags(metrics.job_timer(),
                                  'upsert_table_schema',
                                  table_path) as timer:

            # Work on a copy so the caller's metadata dict is never mutated.
            _metadata = deepcopy(metadata)
            _metadata['schema_version'] = CURRENT_SCHEMA_VERSION

            table_name = self.add_table_mapping(connection, table_path, _metadata)

            self._set_metrics_tags__table(timer, table_name)

            existing_schema = self._get_table_schema(connection, table_name)

            # Create the table on first sight, then re-read so the rest of the
            # flow operates on the actual remote state.
            existing_table = True
            if existing_schema is None:
                self.add_table(connection, table_path, table_name, _metadata)
                existing_schema = self._get_table_schema(connection, table_name)
                existing_table = False

            self.add_key_properties(connection, table_name, schema.get('key_properties', None))

            ## Build up mappings to compare new columns against existing
            mappings = []

            for to, m in existing_schema.get('mappings', {}).items():
                mapping = json_schema.simple_type(m)
                # 'from' is stored as a list remotely; tuples are used locally as keys.
                mapping['from'] = tuple(m['from'])
                mapping['to'] = to
                mappings.append(mapping)

            ## Only process columns which have single, nullable, types
            column_paths_seen = set()
            single_type_columns = []

            # NOTE(review): assumes every property schema is an `anyOf` of
            # single-type sub-schemas — presumably guaranteed upstream; confirm.
            for column_path, column_schema in schema['schema']['properties'].items():
                column_paths_seen.add(column_path)
                for sub_schema in column_schema['anyOf']:
                    single_type_columns.append((column_path, deepcopy(sub_schema)))

            ### Add any columns missing from new schema
            # Existing remote columns absent from the incoming schema are kept,
            # but forced nullable since new rows will not populate them.
            for m in mappings:
                if not m['from'] in column_paths_seen:
                    single_type_columns.append((m['from'], json_schema.make_nullable(m)))

            ## Process new columns against existing
            table_empty = self.is_table_empty(connection, table_name)

            for column_path, column_schema in single_type_columns:
                upsert_table_helper__start__column = time.monotonic()

                canonicalized_column_name = self._canonicalize_column_identifier(column_path, column_schema, mappings)
                nullable_column_schema = json_schema.make_nullable(column_schema)

                # Closure over this iteration's column_path /
                # canonicalized_column_name; only called within the iteration.
                def log_message(msg):
                    if log_schema_changes:
                        self.LOGGER.info(
                            'Table Schema Change [`{}`.`{}`:`{}`] {} (took {} millis)'.format(
                                table_name,
                                column_path,
                                canonicalized_column_name,
                                msg,
                                _duration_millis(upsert_table_helper__start__column)))

                ## NEW COLUMN
                if not column_path in [m['from'] for m in mappings]:
                    upsert_table_helper__column = "New column"
                    ### NON EMPTY TABLE
                    # Existing rows would have no value for the new column, so
                    # it must be nullable when the table already holds data.
                    if not table_empty:
                        upsert_table_helper__column += ", non empty table"
                        self.LOGGER.warning(
                            'NOT EMPTY: Forcing new column `{}` in table `{}` to be nullable due to table not empty.'.format(
                                column_path,
                                table_name))
                        column_schema = nullable_column_schema

                    self.add_column(connection,
                                    table_name,
                                    canonicalized_column_name,
                                    column_schema)
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            column_schema)

                    # Keep the in-memory mappings in sync with the remote change.
                    mapping = json_schema.simple_type(column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    log_message(upsert_table_helper__column)

                    continue

                ## EXISTING COLUMNS
                ### SCHEMAS MATCH
                if [True for m in mappings if
                    m['from'] == column_path
                    and self.json_schema_to_sql_type(m) == self.json_schema_to_sql_type(column_schema)]:
                    continue
                ### NULLABLE SCHEMAS MATCH
                ###  New column _is not_ nullable, existing column _is_
                # Remote nullable column can already hold the non-null values.
                if [True for m in mappings if
                    m['from'] == column_path
                    and self.json_schema_to_sql_type(m) == self.json_schema_to_sql_type(nullable_column_schema)]:
                    continue

                ### NULL COMPATIBILITY
                ###  New column _is_ nullable, existing column is _not_
                non_null_original_column = [m for m in mappings if
                                            m['from'] == column_path and json_schema.shorthand(
                                                m) == json_schema.shorthand(column_schema)]
                if non_null_original_column:
                    ## MAKE NULLABLE
                    self.make_column_nullable(connection,
                                              table_name,
                                              canonicalized_column_name)
                    # Replace the mapping remotely: drop then re-add as nullable.
                    self.drop_column_mapping(connection, table_name, canonicalized_column_name)
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            nullable_column_schema)

                    # Mirror the remote replacement in the local mappings list.
                    mappings = [m for m in mappings if not (m['from'] == column_path and json_schema.shorthand(
                        m) == json_schema.shorthand(column_schema))]

                    mapping = json_schema.simple_type(nullable_column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    log_message("Made existing column nullable.")

                    continue

                ### FIRST MULTI TYPE
                ###  New column matches existing column path, but the types are incompatible
                duplicate_paths = [m for m in mappings if m['from'] == column_path]

                if 1 == len(duplicate_paths):
                    # Split the single existing column into two typed columns:
                    # one renamed for the existing type, one for the new type.
                    existing_mapping = duplicate_paths[0]
                    existing_column_name = existing_mapping['to']

                    if existing_column_name:
                        self.drop_column_mapping(connection, table_name, existing_column_name)

                    ## Update existing properties
                    mappings = [m for m in mappings if m['from'] != column_path]

                    mapping = json_schema.simple_type(nullable_column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    # Re-canonicalize the old column against the updated
                    # mappings so it receives a type-disambiguated name.
                    existing_column_new_normalized_name = self._canonicalize_column_identifier(column_path,
                                                                                               existing_mapping,
                                                                                               mappings)

                    mapping = json_schema.simple_type(json_schema.make_nullable(existing_mapping))
                    mapping['from'] = column_path
                    mapping['to'] = existing_column_new_normalized_name
                    mappings.append(mapping)

                    ## Add new columns
                    ### NOTE: all migrated columns will be nullable and remain that way

                    #### Table Metadata
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            existing_column_new_normalized_name,
                                            json_schema.make_nullable(existing_mapping))
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            nullable_column_schema)

                    #### Columns
                    self.add_column(connection,
                                    table_name,
                                    existing_column_new_normalized_name,
                                    json_schema.make_nullable(existing_mapping))

                    self.add_column(connection,
                                    table_name,
                                    canonicalized_column_name,
                                    nullable_column_schema)

                    ## Migrate existing data
                    self.migrate_column(connection,
                                        table_name,
                                        existing_mapping['to'],
                                        existing_column_new_normalized_name)

                    ## Drop existing column
                    self.drop_column(connection,
                                     table_name,
                                     existing_mapping['to'])

                    upsert_table_helper__column = "Splitting `{}` into `{}` and `{}`. New column matches existing column path, but the types are incompatible.".format(
                        existing_column_name,
                        existing_column_new_normalized_name,
                        canonicalized_column_name
                    )

                ## REST MULTI TYPE
                # Path already split into multiple typed columns; just add one
                # more column for the new incompatible type.
                elif 1 < len(duplicate_paths):
                    ## Add new column
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            nullable_column_schema)
                    self.add_column(connection,
                                    table_name,
                                    canonicalized_column_name,
                                    nullable_column_schema)

                    mapping = json_schema.simple_type(nullable_column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    upsert_table_helper__column = "Adding new column to split column `{}`. New column matches existing column's path, but no types were compatible.".format(
                        column_path
                    )

                ## UNKNOWN
                else:
                    raise Exception(
                        'UNKNOWN: Cannot handle merging column `{}` (canonicalized as: `{}`) in table `{}`.'.format(
                            column_path,
                            canonicalized_column_name,
                            table_name
                        ))

                log_message(upsert_table_helper__column)

            # Indexes are only created for tables we created in this call.
            if not existing_table:
                for column_names in self.new_table_indexes(schema):
                    self.add_index(connection, table_name, column_names)

            return self._get_table_schema(connection, table_name)
# Example #6
def _mapping_name(field, schema):
    """Build a mapping name of the form `<field><SEPARATOR><type-shorthand>`."""
    return '{}{}{}'.format(field, SEPARATOR, json_schema.shorthand(schema))