def validate_table_design_semantics(table_design, table_name):
    """
    Validate table design against rule set based on values (e.g. name of columns).

    The kind of relation (view, CTAS, or upstream table, chosen by "source_name")
    must be consistent with the schema it lives in: views and CTAS belong to
    transformation schemas, everything else to upstream source schemas.

    Raise a TableDesignSemanticError if anything is amiss.
    """
    if table_design["name"] != table_name.identifier:
        raise TableDesignSemanticError(
            "name in table design must match target '{}'".format(table_name.identifier))

    schema = etl.config.get_dw_config().schema_lookup(table_name.schema)
    source_name = table_design["source_name"]

    if source_name == "VIEW":
        validate_semantics_of_view(table_design)
        if schema.is_upstream_source:
            raise TableDesignSemanticError(
                "invalid upstream source '%s' in view '%s'" % (table_name.schema, table_name.identifier))
    elif source_name == "CTAS":
        validate_semantics_of_ctas(table_design)
        if schema.is_upstream_source:
            raise TableDesignSemanticError(
                "invalid source name '%s' in upstream table '%s'" % (source_name, table_name.identifier))
    else:
        validate_semantics_of_table(table_design)
        if not schema.is_upstream_source:
            raise TableDesignSemanticError(
                "invalid source name '%s' in transformation '%s'" % (source_name, table_name.identifier))
def validate_semantics_of_table(table_design):
    """
    Check for semantics that apply to tables in source schemas.

    Upstream tables must not list dependencies and must not carry natural or
    surrogate key constraints. If extract settings name a split-by column, that
    column must exist, must not be skipped, must be not-null, and must have a
    type suitable for range splitting.

    NOTE(review): a function with this name is defined more than once in this
    file; the later definition wins at import time -- confirm which is intended.

    Raises TableDesignSemanticError if any rule is violated.
    """
    validate_semantics_of_table_or_ctas(table_design)
    if "depends_on" in table_design:
        raise TableDesignSemanticError(
            "upstream table '%s' has dependencies listed" % table_design["name"])

    constraints = table_design.get("constraints", [])
    constraint_types_in_design = [
        constraint_type for constraint in constraints for constraint_type in constraint
    ]
    for constraint_type in constraint_types_in_design:
        if constraint_type in ("natural_key", "surrogate_key"):
            raise TableDesignSemanticError(
                "upstream table '{}' has unexpected {} constraint".format(
                    table_design["name"], constraint_type))

    [split_by_name] = table_design.get("extract_settings", {}).get("split_by", [None])
    if split_by_name:
        # Look up the named column directly. The previous implementation used
        # fy.first(fy.where(...)), which yields None for a non-existent column
        # and then crashed with AttributeError; report a semantic error instead.
        split_by_column = next(
            (column for column in table_design["columns"] if column["name"] == split_by_name),
            None)
        if split_by_column is None:
            raise TableDesignSemanticError(
                "split-by column '{}' not found in columns".format(split_by_name))
        if split_by_column.get("skipped", False):
            raise TableDesignSemanticError("split-by column must not be skipped")
        if not split_by_column.get("not_null", False):
            raise TableDesignSemanticError("split-by column must have not-null constraint")
        if split_by_column["type"] not in ("int", "long", "date", "timestamp"):
            raise TableDesignSemanticError(
                "type of split-by column must be int, long, date or timestamp, not '{}'"
                .format(split_by_column["type"]))
def validate_semantics_of_table(table_design):
    """
    Check for semantics that apply to tables in source schemas.

    Upstream tables must not list dependencies and must not carry natural or
    surrogate key constraints. If extract settings name a split-by column, it
    must resolve to exactly one column of numeric (int or long) type.

    NOTE(review): a function with this name is defined more than once in this
    file; the later definition wins at import time -- confirm which is intended.

    Raises TableDesignSemanticError if any rule is violated.
    """
    validate_semantics_of_table_or_ctas(table_design)
    if "depends_on" in table_design:
        raise TableDesignSemanticError(
            "upstream table '%s' has dependencies listed" % table_design["name"])
    constraints = table_design.get("constraints", [])
    constraint_types_in_design = [t for c in constraints for t in c]
    for constraint_type in ("natural_key", "surrogate_key"):
        if constraint_type in constraint_types_in_design:
            raise TableDesignSemanticError(
                "upstream table '%s' has unexpected %s constraint"
                % (table_design["name"], constraint_type))
    split_by_name = table_design.get('extract_settings', {}).get('split_by', [])
    if split_by_name:
        matching = [c for c in table_design["columns"] if c['name'] == split_by_name[0]]
        # The unpacking below used to raise a bare ValueError when the split-by
        # name matched zero (or several) columns; surface a design error instead.
        if len(matching) != 1:
            raise TableDesignSemanticError(
                "split-by column '{}' must match exactly one column".format(split_by_name[0]))
        [split_by_column] = matching
        if split_by_column["type"] not in ("int", "long"):
            raise TableDesignSemanticError(
                "Split-by column type must be numeric (int or long) not '{}'".
                format(split_by_column["type"]))
def validate_semantics_of_view(table_design):
    """
    Check for semantics that only apply to views.

    Basically, definitions of views may only contain column names.
    """
    # Switching a relation from CTAS to VIEW often leaves stale per-column
    # details (type, sql_type, ...) behind -- catch those leftovers here.
    for column in table_design["columns"]:
        if len(column) != 1:
            raise TableDesignSemanticError(
                "too much information for column of a VIEW: {}".format(list(column)))
    unsupported_sections = ("constraints", "attributes", "extract_settings")
    for section in unsupported_sections:
        if section in table_design:
            raise TableDesignSemanticError("{} not supported for a VIEW".format(section))
def validate_column_references(table_design):
    """
    Make sure that table attributes and constraints only reference columns that actually exist.

    Raises TableDesignSemanticError naming the offending section and the unknown column(s).
    """
    column_list_references = [
        ('constraints', 'primary_key'),
        ('constraints', 'natural_key'),
        ('constraints', 'surrogate_key'),
        ('constraints', 'unique'),
        ('attributes', 'interleaved_sort'),
        ('attributes', 'compound_sort'),
    ]
    valid_columns = frozenset(
        column["name"] for column in table_design["columns"] if not column.get("skipped"))
    constraints = table_design.get("constraints", [])
    for obj, key in column_list_references:
        if obj == 'constraints':
            # Gather this constraint type across every constraint entry so all
            # (e.g. multiple unique constraints) are checked in one pass.
            referenced = []
            for constraint in constraints:
                referenced.extend(constraint.get(key, []))
        else:  # 'attributes'
            referenced = table_design.get(obj, {}).get(key, [])
        unknown = join_with_quotes(frozenset(referenced).difference(valid_columns))
        if unknown:
            raise TableDesignSemanticError(
                "{key} columns in {obj} contain unknown column(s): {unknown}".format(
                    obj=obj, key=key, unknown=unknown))
def validate_semantics_of_ctas(table_design):
    """
    Check for semantics that apply only to CTAS.

    Shares the common table/CTAS rules, then rejects extract settings
    (extraction only makes sense for upstream sources, not transformations).
    """
    validate_semantics_of_table_or_ctas(table_design)
    if "extract_settings" in table_design:
        raise TableDesignSemanticError("Extract settings not supported for transformations")
def unload_to_s3(config: DataWarehouseConfig, relations: List[RelationDescription],
                 allow_overwrite: bool, keep_going: bool, dry_run: bool) -> None:
    """
    Create CSV files for selected tables based on the S3 path in an "unload" source.

    Every unloadable relation must name a defined unload target; failures per
    relation either abort immediately or (with keep_going) are logged and
    reported at the end via ETLDelayedExit.
    """
    logger.info(
        "Loading table design for %d relation(s) to look for unloadable relations",
        len(relations))
    etl.relation.RelationDescription.load_in_parallel(relations)

    unloadable_relations = [relation for relation in relations if relation.is_unloadable]
    if not unloadable_relations:
        logger.warning("Found no relations that are unloadable.")
        return
    logger.info("Starting to unload %s relation(s)", len(unloadable_relations))

    target_lookup = {
        schema.name: schema for schema in config.schemas if schema.is_an_unload_target
    }
    relation_target_tuples = []
    for relation in unloadable_relations:
        # Fail fast before opening a connection if any target is undefined.
        if relation.unload_target not in target_lookup:
            raise TableDesignSemanticError(
                "Unload target specified, but not defined: '%s'" % relation.unload_target)
        relation_target_tuples.append((relation, target_lookup[relation.unload_target]))

    error_occurred = False
    final = len(relation_target_tuples)
    conn = etl.db.connection(config.dsn_etl, autocommit=True, readonly=True)
    with closing(conn) as conn:
        for position, (relation, unload_schema) in enumerate(relation_target_tuples, start=1):
            try:
                index = {"current": position, "final": final}
                unload_relation(conn, relation, unload_schema, index,
                                allow_overwrite=allow_overwrite, dry_run=dry_run)
            except Exception as exc:
                if not keep_going:
                    raise DataUnloadError(exc) from exc
                error_occurred = True
                logger.warning("Unload failed for '%s'", relation.identifier)
                logger.exception("Ignoring this exception and proceeding as requested:")
    if error_occurred:
        raise ETLDelayedExit(
            "At least one error occurred while unloading with 'keep going' option")
def validate_identity_as_surrogate_key(table_design):
    """
    Check whether specification of our identity column is valid and whether it matches surrogate key.
    """
    identity_columns = []
    for column in table_design["columns"]:
        if not column.get("identity"):
            continue
        # NULL columns may not be primary key (identity)
        if not column.get("not_null"):
            raise TableDesignSemanticError("identity column must be set to not null")
        if identity_columns:
            raise TableDesignSemanticError("only one column should have identity")
        identity_columns.append(column["name"])

    surrogate_keys = [
        name
        for constraint in table_design.get("constraints", [])
        for name in constraint.get('surrogate_key', [])
    ]
    if surrogate_keys and surrogate_keys != identity_columns:
        raise TableDesignSemanticError("surrogate key must be identity column")
def validate_semantics_of_view(table_design):
    """
    Check for semantics that only apply to views.

    Basically, definitions of views may only contain column names or descriptions.
    Note that validation doesn't catch this since we didn't completely separate the schema
    for CTAS and VIEW but have that distinction only on the source_name.
    """
    # Moving a relation from CTAS to VIEW tends to leave per-column leftovers
    # (type, sql_type, ...) in the design -- flag anything beyond name/description.
    allowed_fields = ("name", "description")
    for column in table_design["columns"]:
        unwanted_fields = set(column).difference(allowed_fields)
        if unwanted_fields:
            raise TableDesignSemanticError(
                "too much information for column of a VIEW: {}".format(unwanted_fields))
    for section in ("constraints", "attributes", "extract_settings"):
        if section in table_design:
            raise TableDesignSemanticError("{} not supported for a VIEW".format(section))
def validate_semantics_of_table_or_ctas(table_design):
    """
    Check for semantics that apply to tables that are in source schemas or are a CTAS.

    Delegates identity/surrogate-key and column-reference checks, then verifies
    that constraint types other than "unique" appear at most once.
    """
    validate_identity_as_surrogate_key(table_design)
    validate_column_references(table_design)

    seen_constraint_types = set()
    for constraint in table_design.get("constraints", []):
        for constraint_type in constraint:
            if constraint_type != "unique" and constraint_type in seen_constraint_types:
                raise TableDesignSemanticError(
                    "multiple constraints of type {}".format(constraint_type))
            seen_constraint_types.add(constraint_type)