def _get_mapping(self, existing_schema, path, schema):
    """Find the remote column already mapped from `path` with a matching type.

    Scans `existing_schema`'s `mappings` for an entry whose `from` path equals
    `path` and whose shorthand type equals that of `schema`.

    :param existing_schema: TABLE_SCHEMA(remote) containing a `mappings` dict
    :param path: tuple column path to look up
    :param schema: JSON schema of the column being mapped
    :return: the mapped-to column name, or None when no compatible mapping exists
    """
    wanted_shorthand = json_schema.shorthand(schema)
    for column_name, mapping in existing_schema.get('mappings', {}).items():
        path_matches = tuple(mapping['from']) == path
        if path_matches and json_schema.shorthand(mapping) == wanted_shorthand:
            return column_name
    return None
def test_sql_shorthand():
    """`json_schema.shorthand` maps simple schemas to one-letter type codes."""
    expectations = [
        ('b', {'type': 'boolean'}),
        ('b', {'type': ['null', 'boolean']}),
        ('s', {'type': ['null', 'string']}),
        ('t', {'type': ['null', 'string'], 'format': 'date-time'}),
        ('t', {'type': 'string', 'format': 'date-time'}),
    ]
    for expected_code, schema in expectations:
        assert expected_code == json_schema.shorthand(schema)
def _canonicalize_column_identifier(self, path, schema, mappings):
    """Return the remote column name for the column at `path` with type `schema`.

    Resolution order:
      1. If a mapping with the same path AND same shorthand type already
         exists, reuse its name verbatim.
      2. Otherwise canonicalize the joined path and truncate it to the
         dialect's identifier length limit.
      3. If the path exists remotely but with an incompatible type, append
         a type-shorthand suffix so both columns can coexist.
      4. If the resulting name still collides with any existing column,
         append an increasing integer suffix until it is unique.

    :param path: tuple column path from the incoming schema
    :param schema: JSON schema of the column (single, simple type)
    :param mappings: list of dicts with `from`, `to`, and type info describing
        the columns already present remotely
    :return: str column identifier safe to use in the remote table
    """
    from_type__to_name = {}
    existing_paths = set()
    existing_column_names = set()

    # Index existing mappings by (path, shorthand-type) for exact-match reuse,
    # and collect the path/name sets used for collision detection below.
    for m in mappings:
        from_type__to_name[(m['from'], json_schema.shorthand(m))] = m['to']
        existing_paths.add(m['from'])
        existing_column_names.add(m['to'])

    ## MAPPING EXISTS, NO CANONICALIZATION NECESSARY
    if (path, json_schema.shorthand(schema)) in from_type__to_name:
        return from_type__to_name[(path, json_schema.shorthand(schema))]

    raw_canonicalized_column_name = self.canonicalize_identifier(SEPARATOR.join(path))
    # Truncate to the dialect's maximum identifier length.
    canonicalized_column_name = raw_canonicalized_column_name[:self.IDENTIFIER_FIELD_LENGTH]

    raw_suffix = ''
    ## NO TYPE MATCH
    if path in existing_paths:
        # Same path exists remotely with a different type: disambiguate by
        # appending the shorthand type, truncating the base name so the
        # suffixed result still fits within IDENTIFIER_FIELD_LENGTH.
        raw_suffix = SEPARATOR + json_schema.shorthand(schema)
        canonicalized_column_name = raw_canonicalized_column_name[:self.IDENTIFIER_FIELD_LENGTH - len(raw_suffix)] + raw_suffix

        self.LOGGER.warning('FIELD COLLISION: Field `{}` exists in remote already. No compatible type found. Appending type suffix: `{}`'.format(
            path,
            canonicalized_column_name))

    i = 0
    ## NAME COLLISION
    # Distinct paths can canonicalize/truncate to the same identifier; keep
    # appending `<type-suffix><SEPARATOR><i>` until the name is unused.
    while canonicalized_column_name in existing_column_names:
        self.LOGGER.warning('NAME COLLISION: Field `{}` collided with `{}` in remote. Adding new integer suffix...'.format(
            path,
            canonicalized_column_name))

        i += 1
        suffix = raw_suffix + SEPARATOR + str(i)
        canonicalized_column_name = raw_canonicalized_column_name[:self.IDENTIFIER_FIELD_LENGTH - len(suffix)] + suffix

    return canonicalized_column_name
def add_column_mapping(self, cur, table_name, from_path, to_name, mapped_schema):
    """Record, in the table's metadata, that `from_path` maps to column `to_name`.

    :param cur: remote cursor/connection handle
    :param table_name: remote table whose metadata is updated
    :param from_path: tuple column path in the incoming schema
    :param to_name: canonicalized remote column name
    :param mapped_schema: JSON schema the remote column was created with
    """
    metadata = self._get_table_metadata(cur, table_name)

    new_mapping = {'type': json_schema.get_type(mapped_schema),
                   'from': from_path}
    # Preserve the date-time format so the shorthand type survives a round trip.
    if json_schema.shorthand(mapped_schema) == 't':
        new_mapping['format'] = 'date-time'

    metadata['mappings'][to_name] = new_mapping
    self._set_table_metadata(cur, table_name, metadata)
def upsert_table_helper(self, connection, schema, metadata, log_schema_changes=True):
    """
    Upserts the `schema` to remote by:
    - creating table if necessary
    - adding columns
    - adding column mappings
    - migrating data from old columns to new, etc.

    :param connection: remote connection, type left to be determined by implementing class
    :param schema: TABLE_SCHEMA(local)
    :param metadata: additional information necessary for downstream operations,
    :param log_schema_changes: defaults to True, set to false to disable logging of table level schema changes
    :return: TABLE_SCHEMA(remote)
    """
    table_path = schema['path']

    # Time the whole upsert; the timer is tagged with the resolved table name below.
    with self._set_timer_tags(metrics.job_timer(),
                              'upsert_table_schema',
                              table_path) as timer:

        _metadata = deepcopy(metadata)
        _metadata['schema_version'] = CURRENT_SCHEMA_VERSION

        table_name = self.add_table_mapping(connection, table_path, _metadata)

        self._set_metrics_tags__table(timer, table_name)

        existing_schema = self._get_table_schema(connection, table_name)

        existing_table = True
        # No remote schema means the table does not exist yet: create it,
        # then re-fetch so `existing_schema` reflects the fresh table.
        if existing_schema is None:
            self.add_table(connection, table_path, table_name, _metadata)
            existing_schema = self._get_table_schema(connection, table_name)
            existing_table = False

        self.add_key_properties(connection, table_name, schema.get('key_properties', None))

        ## Build up mappings to compare new columns against existing
        mappings = []
        for to, m in existing_schema.get('mappings', {}).items():
            mapping = json_schema.simple_type(m)
            mapping['from'] = tuple(m['from'])
            mapping['to'] = to
            mappings.append(mapping)

        ## Only process columns which have single, nullable, types
        column_paths_seen = set()
        single_type_columns = []

        # Each property is an anyOf of simple schemas; flatten to one
        # (path, sub_schema) pair per simple type.
        for column_path, column_schema in schema['schema']['properties'].items():
            column_paths_seen.add(column_path)
            for sub_schema in column_schema['anyOf']:
                single_type_columns.append((column_path, deepcopy(sub_schema)))

        ### Add any columns missing from new schema
        # Remote-only columns are re-processed as nullable so they are kept.
        for m in mappings:
            if not m['from'] in column_paths_seen:
                single_type_columns.append((m['from'], json_schema.make_nullable(m)))

        ## Process new columns against existing
        table_empty = self.is_table_empty(connection, table_name)

        for column_path, column_schema in single_type_columns:
            upsert_table_helper__start__column = time.monotonic()

            canonicalized_column_name = self._canonicalize_column_identifier(column_path, column_schema, mappings)
            nullable_column_schema = json_schema.make_nullable(column_schema)

            # Closure over the current column's loop vars; called once per
            # column with a human-readable description of what changed.
            def log_message(msg):
                if log_schema_changes:
                    self.LOGGER.info(
                        'Table Schema Change [`{}`.`{}`:`{}`] {} (took {} millis)'.format(
                            table_name,
                            column_path,
                            canonicalized_column_name,
                            msg,
                            _duration_millis(upsert_table_helper__start__column)))

            ## NEW COLUMN
            if not column_path in [m['from'] for m in mappings]:
                upsert_table_helper__column = "New column"
                ### NON EMPTY TABLE
                # Existing rows would have no value for the new column, so it
                # must be nullable.
                if not table_empty:
                    upsert_table_helper__column += ", non empty table"
                    self.LOGGER.warning(
                        'NOT EMPTY: Forcing new column `{}` in table `{}` to be nullable due to table not empty.'.format(
                            column_path,
                            table_name))
                    column_schema = nullable_column_schema

                self.add_column(connection,
                                table_name,
                                canonicalized_column_name,
                                column_schema)
                self.add_column_mapping(connection,
                                        table_name,
                                        column_path,
                                        canonicalized_column_name,
                                        column_schema)

                # Keep the in-memory mappings in sync with what was just
                # persisted so later columns see this one.
                mapping = json_schema.simple_type(column_schema)
                mapping['from'] = column_path
                mapping['to'] = canonicalized_column_name
                mappings.append(mapping)

                log_message(upsert_table_helper__column)

                continue

            ## EXISTING COLUMNS
            ### SCHEMAS MATCH
            if [True for m in mappings if
                m['from'] == column_path
                and self.json_schema_to_sql_type(m) == self.json_schema_to_sql_type(column_schema)]:
                continue
            ### NULLABLE SCHEMAS MATCH
            ### New column _is not_ nullable, existing column _is_
            if [True for m in mappings if
                m['from'] == column_path
                and self.json_schema_to_sql_type(m) == self.json_schema_to_sql_type(nullable_column_schema)]:
                continue

            ### NULL COMPATIBILITY
            ### New column _is_ nullable, existing column is _not_
            non_null_original_column = [m for m in mappings if
                                        m['from'] == column_path
                                        and json_schema.shorthand(m) == json_schema.shorthand(column_schema)]
            if non_null_original_column:
                ## MAKE NULLABLE
                self.make_column_nullable(connection,
                                          table_name,
                                          canonicalized_column_name)
                # Replace the mapping rather than editing it: drop then re-add
                # with the nullable schema.
                self.drop_column_mapping(connection, table_name, canonicalized_column_name)
                self.add_column_mapping(connection,
                                        table_name,
                                        column_path,
                                        canonicalized_column_name,
                                        nullable_column_schema)

                mappings = [m for m in mappings if not (m['from'] == column_path and json_schema.shorthand(m) == json_schema.shorthand(column_schema))]

                mapping = json_schema.simple_type(nullable_column_schema)
                mapping['from'] = column_path
                mapping['to'] = canonicalized_column_name
                mappings.append(mapping)

                log_message("Made existing column nullable.")

                continue

            ### FIRST MULTI TYPE
            ### New column matches existing column path, but the types are incompatible
            duplicate_paths = [m for m in mappings if m['from'] == column_path]

            if 1 == len(duplicate_paths):
                # Split the single existing column into two typed columns:
                # a renamed (type-suffixed) copy of the old one plus the new one.
                existing_mapping = duplicate_paths[0]
                existing_column_name = existing_mapping['to']

                if existing_column_name:
                    self.drop_column_mapping(connection, table_name, existing_column_name)

                ## Update existing properties
                mappings = [m for m in mappings if m['from'] != column_path]

                mapping = json_schema.simple_type(nullable_column_schema)
                mapping['from'] = column_path
                mapping['to'] = canonicalized_column_name
                mappings.append(mapping)

                # Canonicalize AFTER appending the new mapping so the renamed
                # old column cannot collide with the new column's name.
                existing_column_new_normalized_name = self._canonicalize_column_identifier(column_path,
                                                                                          existing_mapping,
                                                                                          mappings)

                mapping = json_schema.simple_type(json_schema.make_nullable(existing_mapping))
                mapping['from'] = column_path
                mapping['to'] = existing_column_new_normalized_name
                mappings.append(mapping)

                ## Add new columns
                ### NOTE: all migrated columns will be nullable and remain that way

                #### Table Metadata
                self.add_column_mapping(connection,
                                        table_name,
                                        column_path,
                                        existing_column_new_normalized_name,
                                        json_schema.make_nullable(existing_mapping))
                self.add_column_mapping(connection,
                                        table_name,
                                        column_path,
                                        canonicalized_column_name,
                                        nullable_column_schema)

                #### Columns
                self.add_column(connection,
                                table_name,
                                existing_column_new_normalized_name,
                                json_schema.make_nullable(existing_mapping))

                self.add_column(connection,
                                table_name,
                                canonicalized_column_name,
                                nullable_column_schema)

                ## Migrate existing data
                self.migrate_column(connection,
                                    table_name,
                                    existing_mapping['to'],
                                    existing_column_new_normalized_name)

                ## Drop existing column
                self.drop_column(connection,
                                 table_name,
                                 existing_mapping['to'])

                upsert_table_helper__column = "Splitting `{}` into `{}` and `{}`. New column matches existing column path, but the types are incompatible.".format(
                    existing_column_name,
                    existing_column_new_normalized_name,
                    canonicalized_column_name
                )

            ## REST MULTI TYPE
            elif 1 < len(duplicate_paths):
                # Path already split previously; just add the new typed column.
                ## Add new column
                self.add_column_mapping(connection,
                                        table_name,
                                        column_path,
                                        canonicalized_column_name,
                                        nullable_column_schema)
                self.add_column(connection,
                                table_name,
                                canonicalized_column_name,
                                nullable_column_schema)

                mapping = json_schema.simple_type(nullable_column_schema)
                mapping['from'] = column_path
                mapping['to'] = canonicalized_column_name
                mappings.append(mapping)

                upsert_table_helper__column = "Adding new column to split column `{}`. New column matches existing column's path, but no types were compatible.".format(
                    column_path
                )

            ## UNKNOWN
            else:
                raise Exception(
                    'UNKNOWN: Cannot handle merging column `{}` (canonicalized as: `{}`) in table `{}`.'.format(
                        column_path,
                        canonicalized_column_name,
                        table_name
                    ))

            log_message(upsert_table_helper__column)

        # Indexes are only created for brand-new tables.
        if not existing_table:
            for column_names in self.new_table_indexes(schema):
                self.add_index(connection, table_name, column_names)

        return self._get_table_schema(connection, table_name)
def _mapping_name(field, schema):
    """Compose the mapping name `<field><SEPARATOR><type-shorthand>` for `schema`."""
    type_code = json_schema.shorthand(schema)
    return SEPARATOR.join((field, type_code))