def test_simple_type():
    assert {'type': ['integer', 'null']} \
        == json_schema.simple_type({'type': ['integer', 'null']})

    assert {'type': ['string'], 'format': 'date-time'} \
        == json_schema.simple_type({'type': 'string',
                                    'format': 'date-time',
                                    'something': 1,
                                    'extra': 2})
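# A minimal sketch of what `simple_type` could look like, inferred purely
# from the assertions above (the real implementation lives in the
# json_schema module; this helper name and body are assumptions, not its
# actual source): normalize 'type' to a list, keep 'format' if present,
# and drop every other key.
def _simple_type_sketch(schema):
    t = schema.get('type', [])
    simplified = {'type': t if isinstance(t, list) else [t]}
    if 'format' in schema:
        simplified['format'] = schema['format']
    return simplified

assert _simple_type_sketch({'type': 'string', 'format': 'date-time', 'extra': 2}) \
    == {'type': ['string'], 'format': 'date-time'}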
def _serialize_table_record_field_name(self, remote_schema, streamed_schema, path, value_json_schema):
    """
    Returns the appropriate remote field (column) name for `path`.

    :param remote_schema: TABLE_SCHEMA(remote)
    :param streamed_schema: TABLE_SCHEMA(local)
    :param path: (string, ...)
    :param value_json_schema: dict, JSON Schema
    :return: string
    """
    simple_json_schema = json_schema.simple_type(value_json_schema)

    mapping = self._get_mapping(remote_schema, path, simple_json_schema)
    if mapping is not None:
        return mapping

    ## Numbers are valid as `float` OR `int`
    ## ie, 123.0 and 456 are valid 'number's
    if json_schema.INTEGER in json_schema.get_type(simple_json_schema):
        mapping = self._get_mapping(remote_schema, path, {'type': json_schema.NUMBER})
        if mapping is not None:
            return mapping

    raise Exception('Unknown column path: {} for table: {}'.format(
        path,
        remote_schema['path']))
def _serialize_table_record_field_name(self, remote_schema, path, value_json_schema):
    """
    Returns the appropriate remote field (column) name for `path`.

    :param remote_schema: TABLE_SCHEMA(remote)
    :param path: (string, ...)
    :param value_json_schema: dict, JSON Schema
    :return: string
    """
    simple_json_schema = json_schema.simple_type(value_json_schema)

    mapping = self._get_mapping(remote_schema, path, simple_json_schema)
    if mapping is not None:
        return mapping

    ## Numbers are valid as `float` OR `int`
    ## ie, 123.0 and 456 are valid 'number's
    if json_schema.INTEGER in json_schema.get_type(simple_json_schema):
        mapping = self._get_mapping(remote_schema, path, {'type': json_schema.NUMBER})
        if mapping is not None:
            return mapping

    raise Exception(
        "A compatible column for path {} and JSONSchema {} in table {} cannot be found.".format(
            path,
            simple_json_schema,
            remote_schema['path']))
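# A self-contained sketch of the integer-to-number fallback above. The
# `mappings` shape and the `lookup` helper are assumptions modelled on the
# surrounding code, not the target's actual `_get_mapping` API. The point:
# an 'integer' value with no exact column match can still land in a
# 'number' column, since 123 and 123.0 are both valid JSON Schema numbers.
def lookup(mappings, path, type_):
    for to, m in mappings.items():
        if tuple(m['from']) == path and type_ in m['type']:
            return to
    return None

mappings = {'age__f': {'from': ('age',), 'type': ['number', 'null']}}

assert lookup(mappings, ('age',), 'integer') is None
assert lookup(mappings, ('age',), 'number') == 'age__f'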
def fetch_column_from_path(self, path, table_schema):
    """
    Should only be used for paths which have been established, ie, the
    schema will not be changing etc.

    :param path: (string, ...)
    :param table_schema: TABLE_SCHEMA(remote)
    :return: (string, dict) of (column_name, simple JSON Schema)
    """
    for to, m in table_schema.get('mappings', {}).items():
        if tuple(m['from']) == path:
            return to, json_schema.simple_type(m)

    raise Exception('No column mapping found for path: {}'.format(path))
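# A hypothetical table_schema illustrating the lookup above; the column
# and path names are made up for this example.
#
#   table_schema = {
#       'mappings': {
#           'created_at': {'from': ('created_at',),
#                          'type': ['string', 'null'],
#                          'format': 'date-time'}}}
#
#   fetch_column_from_path(('created_at',), table_schema)
#   would return ('created_at', {'type': ['string', 'null'],
#                                'format': 'date-time'}),
# since simple_type strips the bookkeeping 'from' key and keeps only the
# type information.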
def _serialize_table_record_field_name(self, remote_schema, path, value_json_schema_tuple):
    """
    Returns the appropriate remote field (column) name for `path`.

    :param remote_schema: TABLE_SCHEMA(remote)
    :param path: (string, ...)
    :param value_json_schema_tuple: tuple, JSON Schema as (type,) or (type, format)
    :return: string
    """
    # Rebuild the dict that needs to be passed further down the call stack
    if len(value_json_schema_tuple) == 1:
        value_json_schema = {'type': value_json_schema_tuple[0]}
    else:
        value_json_schema = {'type': value_json_schema_tuple[0],
                             'format': value_json_schema_tuple[1]}

    simple_json_schema = json_schema.simple_type(value_json_schema)

    mapping = self._get_mapping(remote_schema, path, simple_json_schema)
    if mapping is not None:
        return mapping

    ## Numbers are valid as `float` OR `int`
    ## ie, 123.0 and 456 are valid 'number's
    if json_schema.INTEGER in json_schema.get_type(simple_json_schema):
        mapping = self._get_mapping(remote_schema, path, {'type': json_schema.NUMBER})
        if mapping is not None:
            return mapping

    raise Exception(
        "A compatible column for path {} and JSONSchema {} in table {} cannot be found.".format(
            path,
            simple_json_schema,
            remote_schema['path']))
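# A standalone mirror of the tuple-to-dict rebuild above, for illustration
# only. The tuple encoding is inferred from the branch logic: 1-tuples
# carry only a type, 2-tuples add a format.
def rebuild_value_json_schema(value_json_schema_tuple):
    if len(value_json_schema_tuple) == 1:
        return {'type': value_json_schema_tuple[0]}
    return {'type': value_json_schema_tuple[0],
            'format': value_json_schema_tuple[1]}

assert rebuild_value_json_schema(('integer',)) == {'type': 'integer'}
assert rebuild_value_json_schema(('string', 'date-time')) \
    == {'type': 'string', 'format': 'date-time'}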
def upsert_table_helper(self, connection, schema, metadata, log_schema_changes=True):
    """
    Upserts the `schema` to remote by:
    - creating table if necessary
    - adding columns
    - adding column mappings
    - migrating data from old columns to new, etc.

    :param connection: remote connection, type left to be determined by implementing class
    :param schema: TABLE_SCHEMA(local)
    :param metadata: additional information necessary for downstream operations
    :param log_schema_changes: defaults to True, set to False to disable logging of table level schema changes
    :return: TABLE_SCHEMA(remote)
    """
    table_path = schema['path']

    with self._set_timer_tags(metrics.job_timer(),
                              'upsert_table_schema',
                              table_path) as timer:
        _metadata = deepcopy(metadata)
        _metadata['schema_version'] = CURRENT_SCHEMA_VERSION

        table_name = self.add_table_mapping(connection, table_path, _metadata)

        self._set_metrics_tags__table(timer, table_name)

        existing_schema = self._get_table_schema(connection, table_name)

        existing_table = True
        if existing_schema is None:
            self.add_table(connection, table_path, table_name, _metadata)
            existing_schema = self._get_table_schema(connection, table_name)
            existing_table = False

        self.add_key_properties(connection, table_name, schema.get('key_properties', None))

        ## Build up mappings to compare new columns against existing
        mappings = []
        for to, m in existing_schema.get('mappings', {}).items():
            mapping = json_schema.simple_type(m)
            mapping['from'] = tuple(m['from'])
            mapping['to'] = to
            mappings.append(mapping)

        ## Only process columns which have single, nullable, types
        column_paths_seen = set()
        single_type_columns = []

        for column_path, column_schema in schema['schema']['properties'].items():
            column_paths_seen.add(column_path)
            for sub_schema in column_schema['anyOf']:
                single_type_columns.append((column_path, deepcopy(sub_schema)))

        ### Add any columns missing from new schema
        for m in mappings:
            if m['from'] not in column_paths_seen:
                single_type_columns.append((m['from'], json_schema.make_nullable(m)))

        ## Process new columns against existing
        table_empty = self.is_table_empty(connection, table_name)

        for column_path, column_schema in single_type_columns:
            upsert_table_helper__start__column = time.monotonic()

            canonicalized_column_name = self._canonicalize_column_identifier(column_path, column_schema, mappings)
            nullable_column_schema = json_schema.make_nullable(column_schema)

            def log_message(msg):
                if log_schema_changes:
                    self.LOGGER.info(
                        'Table Schema Change [`{}`.`{}`:`{}`] {} (took {} millis)'.format(
                            table_name,
                            column_path,
                            canonicalized_column_name,
                            msg,
                            _duration_millis(upsert_table_helper__start__column)))

            ## NEW COLUMN
            if column_path not in [m['from'] for m in mappings]:
                upsert_table_helper__column = "New column"

                ### NON EMPTY TABLE
                if not table_empty:
                    upsert_table_helper__column += ", non empty table"
                    self.LOGGER.warning(
                        'NOT EMPTY: Forcing new column `{}` in table `{}` to be nullable due to table not empty.'.format(
                            column_path,
                            table_name))
                    column_schema = nullable_column_schema

                self.add_column(connection,
                                table_name,
                                canonicalized_column_name,
                                column_schema)
                self.add_column_mapping(connection,
                                        table_name,
                                        column_path,
                                        canonicalized_column_name,
                                        column_schema)

                mapping = json_schema.simple_type(column_schema)
                mapping['from'] = column_path
                mapping['to'] = canonicalized_column_name
                mappings.append(mapping)

                log_message(upsert_table_helper__column)

                continue

            ## EXISTING COLUMNS
            ### SCHEMAS MATCH
            if [True for m in mappings
                    if m['from'] == column_path
                    and self.json_schema_to_sql_type(m) == self.json_schema_to_sql_type(column_schema)]:
                continue

            ### NULLABLE SCHEMAS MATCH
            ### New column _is not_ nullable, existing column _is_
            if [True for m in mappings
                    if m['from'] == column_path
                    and self.json_schema_to_sql_type(m) == self.json_schema_to_sql_type(nullable_column_schema)]:
                continue

            ### NULL COMPATIBILITY
            ### New column _is_ nullable, existing column is _not_
            non_null_original_column = [m for m in mappings
                                        if m['from'] == column_path
                                        and json_schema.shorthand(m) == json_schema.shorthand(column_schema)]
            if non_null_original_column:
                ## MAKE NULLABLE
                self.make_column_nullable(connection,
                                          table_name,
                                          canonicalized_column_name)
                self.drop_column_mapping(connection, table_name, canonicalized_column_name)
                self.add_column_mapping(connection,
                                        table_name,
                                        column_path,
                                        canonicalized_column_name,
                                        nullable_column_schema)

                mappings = [m for m in mappings
                            if not (m['from'] == column_path
                                    and json_schema.shorthand(m) == json_schema.shorthand(column_schema))]

                mapping = json_schema.simple_type(nullable_column_schema)
                mapping['from'] = column_path
                mapping['to'] = canonicalized_column_name
                mappings.append(mapping)

                log_message("Made existing column nullable.")

                continue

            ### FIRST MULTI TYPE
            ### New column matches existing column path, but the types are incompatible
            duplicate_paths = [m for m in mappings if m['from'] == column_path]

            if 1 == len(duplicate_paths):
                existing_mapping = duplicate_paths[0]
                existing_column_name = existing_mapping['to']

                if existing_column_name:
                    self.drop_column_mapping(connection, table_name, existing_column_name)

                ## Update existing properties
                mappings = [m for m in mappings if m['from'] != column_path]

                mapping = json_schema.simple_type(nullable_column_schema)
                mapping['from'] = column_path
                mapping['to'] = canonicalized_column_name
                mappings.append(mapping)

                existing_column_new_normalized_name = self._canonicalize_column_identifier(column_path,
                                                                                           existing_mapping,
                                                                                           mappings)

                mapping = json_schema.simple_type(json_schema.make_nullable(existing_mapping))
                mapping['from'] = column_path
                mapping['to'] = existing_column_new_normalized_name
                mappings.append(mapping)

                ## Add new columns
                ### NOTE: all migrated columns will be nullable and remain that way

                #### Table Metadata
                self.add_column_mapping(connection,
                                        table_name,
                                        column_path,
                                        existing_column_new_normalized_name,
                                        json_schema.make_nullable(existing_mapping))
                self.add_column_mapping(connection,
                                        table_name,
                                        column_path,
                                        canonicalized_column_name,
                                        nullable_column_schema)

                #### Columns
                self.add_column(connection,
                                table_name,
                                existing_column_new_normalized_name,
                                json_schema.make_nullable(existing_mapping))
                self.add_column(connection,
                                table_name,
                                canonicalized_column_name,
                                nullable_column_schema)

                ## Migrate existing data
                self.migrate_column(connection,
                                    table_name,
                                    existing_mapping['to'],
                                    existing_column_new_normalized_name)

                ## Drop existing column
                self.drop_column(connection,
                                 table_name,
                                 existing_mapping['to'])

                upsert_table_helper__column = "Splitting `{}` into `{}` and `{}`. New column matches existing column path, but the types are incompatible.".format(
                    existing_column_name,
                    existing_column_new_normalized_name,
                    canonicalized_column_name)

            ## REST MULTI TYPE
            elif 1 < len(duplicate_paths):
                ## Add new column
                self.add_column_mapping(connection,
                                        table_name,
                                        column_path,
                                        canonicalized_column_name,
                                        nullable_column_schema)
                self.add_column(connection,
                                table_name,
                                canonicalized_column_name,
                                nullable_column_schema)

                mapping = json_schema.simple_type(nullable_column_schema)
                mapping['from'] = column_path
                mapping['to'] = canonicalized_column_name
                mappings.append(mapping)

                upsert_table_helper__column = "Adding new column to split column `{}`. New column matches existing column's path, but no types were compatible.".format(
                    column_path)

            ## UNKNOWN
            else:
                raise Exception(
                    'UNKNOWN: Cannot handle merging column `{}` (canonicalized as: `{}`) in table `{}`.'.format(
                        column_path,
                        canonicalized_column_name,
                        table_name))

            log_message(upsert_table_helper__column)

        if not existing_table:
            for column_names in self.new_table_indexes(schema):
                self.add_index(connection, table_name, column_names)

        return self._get_table_schema(connection, table_name)
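# Illustrative walkthrough of the FIRST MULTI TYPE branch above, with
# hypothetical canonicalized names and SQL types (the real names come from
# _canonicalize_column_identifier, and the SQL types from
# json_schema_to_sql_type). Suppose `age` was created as a non-nullable
# integer column and a later version of the stream sends `age` as a string:
#
#   before: columns  = {age: BIGINT NOT NULL}
#           mappings = {age: {'from': ('age',), 'type': ['integer']}}
#
#   after:  columns  = {age__i: BIGINT NULL, age__s: TEXT NULL}
#           mappings = {age__i: {'from': ('age',), 'type': ['integer', 'null']},
#                       age__s: {'from': ('age',), 'type': ['string', 'null']}}
#
# Existing rows are copied from `age` into `age__i` by migrate_column, the
# original `age` column is dropped, and both split columns remain nullable
# from then on.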
def _serialize_table_records(self, remote_schema, streamed_schema, records):
    """
    Parse the given table's `records` in preparation for persistence to the
    remote target.

    Base implementation returns a list of dictionaries, where _every_
    dictionary has the same keys as `remote_schema`'s properties.

    :param remote_schema: TABLE_SCHEMA(remote)
    :param streamed_schema: TABLE_SCHEMA(local)
    :param records: [{(path_0, path_1, ...): (_json_schema_string_type, value), ...}, ...]
    :return: [{...}, ...]
    """
    datetime_paths = [k for k, v in streamed_schema['schema']['properties'].items()
                      if json_schema.is_datetime(v)]

    default_paths = {k: v.get('default')
                     for k, v in streamed_schema['schema']['properties'].items()
                     if v.get('default') is not None}

    ## Get the default NULL value so we can assign row values when value is _not_ NULL
    NULL_DEFAULT = self.serialize_table_record_null_value(
        remote_schema, streamed_schema, None, None)

    serialized_rows = []

    remote_fields = set(remote_schema['schema']['properties'].keys())
    default_row = dict([(field, NULL_DEFAULT) for field in remote_fields])

    paths = streamed_schema['schema']['properties'].keys()

    for record in records:
        row = deepcopy(default_row)

        for path in paths:
            json_schema_string_type, value = record.get(path, (None, None))

            ## Serialize fields which are not present but have default values set
            if path in default_paths \
                    and value is None:
                value = default_paths[path]
                json_schema_string_type = json_schema.python_type(value)

            ## Serialize datetime to compatible format
            if path in datetime_paths \
                    and json_schema_string_type == json_schema.STRING \
                    and value is not None:
                value = self.serialize_table_record_datetime_value(
                    remote_schema, streamed_schema, path, value)
                value_json_schema = {'type': json_schema.STRING,
                                     'format': json_schema.DATE_TIME_FORMAT}
            elif json_schema_string_type:
                value_json_schema = {'type': json_schema_string_type}
            else:
                value_json_schema = json_schema.simple_type(
                    streamed_schema['schema']['properties'][path])

            ## Serialize NULL default value
            value = self.serialize_table_record_null_value(
                remote_schema, streamed_schema, path, value)

            field_name = self._serialize_table_record_field_name(
                remote_schema, streamed_schema, path, value_json_schema)

            if field_name in remote_fields \
                    and (field_name not in row
                         or row[field_name] == NULL_DEFAULT):
                row[field_name] = value

        serialized_rows.append(row)

    return serialized_rows
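# Illustrative input and output for _serialize_table_records, with
# hypothetical column names. `records` use the documented shape
# {(path_0, path_1, ...): (_json_schema_string_type, value), ...}:
#
#   records = [{('id',): ('integer', 1),
#               ('updated_at',): ('string', '2020-01-01T00:00:00Z')}]
#
# Each serialized row is keyed by the remote column names resolved through
# _serialize_table_record_field_name, with NULL_DEFAULT filled in for every
# remote column the record did not set, roughly:
#
#   [{'id': 1, 'updated_at': <serialized datetime>, 'name': NULL_DEFAULT}]
#
# (the exact datetime representation depends on
# serialize_table_record_datetime_value).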