Example #1
def validate_upstream_columns(conn: connection, table: RelationDescription) -> None:
    """
    Compare columns in upstream table to the table design file.
    """
    source_table_name = table.source_table_name

    columns_info = etl.design.bootstrap.fetch_attributes(conn, source_table_name)
    if not columns_info:
        raise UpstreamValidationError("table '%s' is gone or has no columns left" % source_table_name.identifier)
    logger.info("Found %d column(s) in relation '%s'", len(columns_info), source_table_name.identifier)

    current_columns = frozenset(column.name for column in columns_info)
    design_columns = frozenset(column["name"]
                               for column in table.table_design["columns"]
                               if not column["name"].startswith("etl__"))
    if not current_columns.issuperset(design_columns):
        extra_columns = design_columns.difference(current_columns)
        raise UpstreamValidationError("design of '%s' has columns that do not exist upstream: %s" %
                                      (source_table_name.identifier, join_with_quotes(extra_columns)))
    missing_columns = current_columns.difference(design_columns)
    if missing_columns:
        logger.warning("Column(s) that exist upstream in '%s' but not in the design '%s': %s",
                       table.source_name, table.identifier, join_with_quotes(missing_columns))

    current_is_not_null = {column.name for column in columns_info if column.not_null}
    for column in table.table_design["columns"]:
        if column.get("not_null") and column["name"] not in current_is_not_null:
            raise TableDesignValidationError("not null constraint of column '%s' in '%s' not enforced upstream" %
                                             (column["name"], table.identifier))
Example #2
def compare_query_to_design(from_query: Iterable, from_design: Iterable) -> Optional[str]:
    """
    Calculate differences between what was found while running the query to what was declared in the design.

    >>> compare_query_to_design(["a", "b"], ["b", "a"])
    >>> compare_query_to_design(["a", "b"], ["a"])
    "not listed in design = 'b'"
    >>> compare_query_to_design(["a"], ["a", "b"])
    "not found from query = 'b'"
    >>> compare_query_to_design(["a", "b"], ["d", "c", "a"])
    "not listed in design = 'b'; not found from query = 'c', 'd'"
    """
    actual = frozenset(from_query)
    design = frozenset(from_design)

    not_in_design = join_with_quotes(actual - design)
    not_in_query = join_with_quotes(design - actual)

    if not_in_design and not_in_query:
        return "not listed in design = {}; not found from query = {}".format(not_in_design, not_in_query)
    elif not_in_design:
        return "not listed in design = {}".format(not_in_design)
    elif not_in_query:
        return "not found from query = {}".format(not_in_query)
    else:
        return None
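The join_with_quotes helper is used in every example here but never shown. Judging from the doctests above, it sorts its input and joins the items with ", ", wrapping each one in single quotes; a minimal sketch consistent with that behavior (an assumption, not necessarily the project's actual implementation) is:

def join_with_quotes(names):
    # Hypothetical sketch: sorted for deterministic output, each name in single quotes.
    return ", ".join("'{}'".format(name) for name in sorted(names))

print(join_with_quotes(["d", "c"]))  # prints: 'c', 'd'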
Example #3
def validate_upstream_columns(conn: connection, table: RelationDescription) -> None:
    """
    Compare columns in upstream table to the table design file.

    It is an ERROR if the design lists columns that do not exist in the upstream table. Exceptions
    here are calculated columns (those starting with etl__) or columns that are marked as skipped.

    It causes a WARNING to have more columns in the upstream table than are defined in the design
    or to have columns skipped in the design that do not exist upstream.
    """
    source_table_name = table.source_table_name

    columns_info = etl.design.bootstrap.fetch_attributes(conn, source_table_name)
    if not columns_info:
        raise UpstreamValidationError("table '%s' is gone or has no columns left" % source_table_name.identifier)
    logger.info("Found %d column(s) in relation '%s'", len(columns_info), source_table_name.identifier)

    current_columns = frozenset(column.name for column in columns_info)
    design_columns = frozenset(
        column["name"] for column in table.table_design["columns"] if not column["name"].startswith("etl__")
    )
    design_required_columns = frozenset(
        column["name"]
        for column in table.table_design["columns"]
        if column["name"] in design_columns and not column.get("skipped", False)
    )

    missing_required_columns = design_required_columns.difference(current_columns)
    if missing_required_columns:
        raise UpstreamValidationError(
            "design of '%s' has columns that do not exist upstream: %s"
            % (source_table_name.identifier, join_with_quotes(missing_required_columns))
        )

    extra_design_columns = design_columns.difference(current_columns)
    if extra_design_columns:
        logger.warning(
            "Column(s) that are in the design of '%s' but do not exist upstream in '%s': %s",
            table.identifier,
            table.source_name,
            join_with_quotes(extra_design_columns),
        )

    missing_design_columns = current_columns.difference(design_columns)
    if missing_design_columns:
        logger.warning(
            "Column(s) that exist upstream in '%s' but not in the design '%s': %s",
            table.source_name,
            table.identifier,
            join_with_quotes(missing_design_columns),
        )

    current_is_not_null = {column.name for column in columns_info if column.not_null}
    for column in table.table_design["columns"]:
        if column.get("not_null") and column["name"] not in current_is_not_null:
            raise TableDesignValidationError(
                "not null constraint of column '%s' in '%s' not enforced upstream" % (column["name"], table.identifier)
            )
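For illustration, here is how the column sets above fall out for a toy design (column names made up): calculated etl__ columns are excluded from design_columns entirely, while skipped columns are excluded only from design_required_columns.

table_design = {"columns": [
    {"name": "id"},
    {"name": "status", "skipped": True},
    {"name": "etl__processed_at"},
]}
design_columns = frozenset(
    column["name"] for column in table_design["columns"] if not column["name"].startswith("etl__")
)
design_required_columns = frozenset(
    column["name"]
    for column in table_design["columns"]
    if column["name"] in design_columns and not column.get("skipped", False)
)
print(sorted(design_columns))           # ['id', 'status']
print(sorted(design_required_columns))  # ['id']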
Example #4
def validate_upstream_constraints(conn: connection, table: RelationDescription) -> None:
    """
    Compare table constraints between database and table design file.

    Note that "natural_key" or "surrogate_key" constraints are not valid in upstream (source) tables.
    Also, a "primary_key" in upstream may be used as a "unique" constraint in the design (but not vice versa).
    """
    current_constraint = etl.design.bootstrap.fetch_constraints(conn, table.source_table_name)
    design_constraint = table.table_design.get("constraints", [])

    current_primary_key = frozenset([col for c in current_constraint for col in c.get("primary_key", [])])
    current_uniques = [frozenset(c["unique"]) for c in current_constraint if "unique" in c]

    design_primary_key = frozenset([col for c in design_constraint for col in c.get("primary_key", [])])
    design_uniques = [frozenset(c["unique"]) for c in design_constraint if "unique" in c]

    # We'll pluck from the not_used info and report if anything wasn't used in the design at the end.
    not_used = deepcopy(current_constraint)

    if design_primary_key:
        if current_primary_key == design_primary_key:
            for i in range(len(not_used)):
                if "primary_key" in not_used[i]:
                    del not_used[i]
                    break
        elif current_primary_key:
            raise TableDesignValidationError("the primary_key constraint in '%s' (%s) does not match upstream (%s)" %
                                             (table.identifier,
                                              join_with_quotes(design_primary_key),
                                              join_with_quotes(current_primary_key)))
        else:
            raise TableDesignValidationError("the primary key constraint in '%s' (%s) is not enforced upstream" %
                                             (table.identifier, join_with_quotes(design_primary_key)))

    for design_unique in design_uniques:
        if current_primary_key == design_unique:
            for i in range(len(not_used)):
                if "primary_key" in not_used[i]:
                    del not_used[i]
                    break
        if design_unique in current_uniques:
            for i in range(len(not_used)):
                if "unique" in not_used[i] and frozenset(not_used[i]["unique"]) == design_unique:
                    del not_used[i]
                    break
        if current_primary_key != design_unique and design_unique not in current_uniques:
            raise TableDesignValidationError("the unique constraint in '%s' (%s) is not enforced upstream" %
                                             (table.identifier, join_with_quotes(design_unique)))

    for constraint in not_used:
        for constraint_type, columns in constraint.items():
            logger.warning("Upstream source has additional %s constraint (%s) for '%s'",
                           constraint_type, join_with_quotes(columns), table.table_design["source_name"])
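The comparison assumes each constraint entry is a one-key dict mapping a constraint type to a list of column names. A toy pair of inputs (hypothetical column names) that passes the checks, with the upstream primary key backing a "unique" constraint in the design, would be:

current_constraint = [{"primary_key": ["id"]}, {"unique": ["email"]}]
design_constraint = [{"unique": ["id"]}]  # allowed: an upstream primary_key may back a design "unique"
# The unused upstream unique on "email" would only trigger the "additional constraint" warning.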
Example #5
def validate_column_ordering(conn: connection, relation: RelationDescription, tmp_view_name: TempTableName) -> None:
    """
    Download the column order (using the temporary view) and compare with table design.
    """
    attributes = etl.design.bootstrap.fetch_attributes(conn, tmp_view_name)
    actual_columns = [attribute.name for attribute in attributes]

    if not actual_columns and tmp_view_name.is_late_binding_view:
        # Thanks to late-binding views it is not an error for a view to not be able to resolve its columns.
        logger.warning(
            "Order of columns in design of '%s' cannot be validated because external table is missing",
            relation.identifier,
        )
        return

    # Identity columns are inserted after the query has been run, so skip them here.
    expected_columns = [
        column["name"]
        for column in relation.table_design["columns"]
        if not (column.get("skipped") or column.get("identity"))
    ]

    diff = get_list_difference(expected_columns, actual_columns)
    if diff:
        logger.error(
            "Order of columns in design of '%s' does not match result of running its query", relation.identifier
        )
        logger.error(
            "You need to replace, insert and/or delete in '%s' some column(s): %s",
            relation.identifier,
            join_with_quotes(diff),
        )
        raise TableDesignValidationError("invalid columns or column order in '%s'" % relation.identifier)
    else:
        logger.info("Order of columns in design of '%s' matches result of running SQL query", relation.identifier)
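get_list_difference is another helper whose definition is not shown. Assuming it returns the set of elements that would have to be replaced, inserted, or deleted to make the two lists match, a sketch based on a longest common subsequence might look like this (a guess at the behavior, not the project's implementation):

def get_list_difference(expected, actual):
    # Hypothetical sketch: report every element outside a longest common
    # subsequence of the two lists, i.e. a minimal set of columns that must change.
    m, n = len(expected), len(actual)
    lcs = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m):
        for j in range(n):
            if expected[i] == actual[j]:
                lcs[i + 1][j + 1] = lcs[i][j] + 1
            else:
                lcs[i + 1][j + 1] = max(lcs[i][j + 1], lcs[i + 1][j])
    common, i, j = set(), m, n
    while i > 0 and j > 0:
        if expected[i - 1] == actual[j - 1]:
            common.add(expected[i - 1])
            i, j = i - 1, j - 1
        elif lcs[i - 1][j] >= lcs[i][j - 1]:
            i -= 1
        else:
            j -= 1
    return frozenset(expected).union(actual) - common

print(sorted(get_list_difference(["a", "b", "c"], ["a", "c", "b"])))  # ['c']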
Example #6
    def __str__(self):
        # See __init__ for tests
        if len(self._patterns) == 0:
            return "['*.*']"
        else:
            return "[{}]".format(
                join_with_quotes(p.identifier for p in self._patterns))
Example #7
    @classmethod
    def from_file_sets(
            cls,
            file_sets,
            required_relation_selector=None) -> List["RelationDescription"]:
        """
        Return a list of relation descriptions based on a list of file sets.

        If there's a file set without a table design file, then there's a warning and that file set
        is skipped.  (This comes in handy when creating the design file for a CTAS or VIEW automatically.)

        If provided, the required_relation_selector will be used to mark dependencies of high-priority.  A failure
        to dump or load in these relations will end the ETL run.
        """
        relations = []
        for file_set in file_sets:
            if file_set.design_file_name is not None:
                relations.append(cls(file_set))
            else:
                logger.warning(
                    "Found file(s) without matching table design: %s",
                    join_with_quotes(file_set.files))

        if required_relation_selector:
            set_required_relations(relations, required_relation_selector)

        return relations
Example #8
def backup_schemas(schemas: List[DataWarehouseSchema], dry_run=False) -> None:
    """
    For existing schemas, rename them and drop access.
    Once the access is revoked, the backup schemas "disappear" from BI tools.
    """
    dsn_etl = etl.config.get_dw_config().dsn_etl
    with closing(etl.db.connection(dsn_etl, autocommit=True,
                                   readonly=dry_run)) as conn:
        names = [schema.name for schema in schemas]
        found = etl.db.select_schemas(conn, names)
        need_backup = [schema for schema in schemas if schema.name in found]
        if not need_backup:
            logger.info("Found no existing schemas to backup")
            return
        selected_names = join_with_quotes(name for name in names
                                          if name in found)
        if dry_run:
            logger.info("Dry-run: Skipping backup of schema(s): %s",
                        selected_names)
            return

        logger.info("Creating backup of schema(s) %s", selected_names)
        for schema in need_backup:
            logger.info(
                "Revoking access from readers and writers to schema '%s' before backup",
                schema.name)
            revoke_schema_permissions(conn, schema)
            logger.info("Renaming schema '%s' to backup '%s'", schema.name,
                        schema.backup_name)
            etl.db.drop_schema(conn, schema.backup_name)
            etl.db.alter_schema_rename(conn, schema.name, schema.backup_name)
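The readonly=dry_run argument means a dry run cannot accidentally write: the connection itself rejects writes. A minimal sketch of what etl.db.connection might do with these flags, assuming psycopg2 underneath (the real helper may differ):

import psycopg2

def connection(dsn, autocommit=False, readonly=False):
    # Hypothetical sketch: push the flags into the session so that readonly=True
    # makes any write statement fail fast during a dry run.
    conn = psycopg2.connect(dsn)
    conn.set_session(autocommit=autocommit, readonly=readonly)
    return conn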
Example #9
def validate_column_references(table_design):
    """
    Make sure that table attributes and constraints only reference columns that actually exist.
    """
    column_list_references = [('constraints', 'primary_key'),
                              ('constraints', 'natural_key'),
                              ('constraints', 'surrogate_key'),
                              ('constraints', 'unique'),
                              ('attributes', 'interleaved_sort'),
                              ('attributes', 'compound_sort')]
    valid_columns = frozenset(column["name"]
                              for column in table_design["columns"]
                              if not column.get("skipped"))

    constraints = table_design.get("constraints", [])
    for obj, key in column_list_references:
        if obj == 'constraints':
            # This evaluates all unique constraints at once by concatenating all of the columns.
            cols = [
                col for constraint in constraints
                for col in constraint.get(key, [])
            ]
        else:  # 'attributes'
            cols = table_design.get(obj, {}).get(key, [])
        unknown = join_with_quotes(frozenset(cols).difference(valid_columns))
        if unknown:
            raise TableDesignSemanticError(
                "{key} columns in {obj} contain unknown column(s): {unknown}".
                format(obj=obj, key=key, unknown=unknown))
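A quick self-contained illustration of the check (made-up column names): any referenced column that is skipped or missing from "columns" shows up in the difference and would trigger the error.

table_design = {
    "columns": [{"name": "id"}, {"name": "email", "skipped": True}],
    "constraints": [{"unique": ["id", "email"]}],
}
valid_columns = frozenset(c["name"] for c in table_design["columns"] if not c.get("skipped"))
referenced = frozenset(col for constraint in table_design["constraints"] for col in constraint.get("unique", []))
print(sorted(referenced - valid_columns))  # ['email'] -> would raise TableDesignSemanticError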
Example #10
    def __init__(self, source_relations, extracted_targets):
        missing_relations = [
            relation.identifier
            for relation in source_relations
            if relation.identifier not in extracted_targets
        ]
        self.message = (
            "Some source relations did not have extract events after the step start time: "
            + join_with_quotes(missing_relations))
Example #11
    def extract_sources(self) -> None:
        """
        Iterate over sources to be extracted and parallelize extraction at the source level.
        """
        self.logger.info("Starting to extract %d relation(s) in %d schema(s)",
                         len(self.relations), len(self.schemas))
        self.failed_sources.clear()
        max_workers = len(self.schemas)

        # TODO With Python 3.6, we should pass in a thread_name_prefix
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=max_workers) as executor:
            futures = []
            for source_name, relation_group in groupby(
                    self.relations, attrgetter("source_name")):
                future = executor.submit(self.extract_source,
                                         self.schemas[source_name],
                                         list(relation_group))
                futures.append(future)
            if self.keep_going:
                done, not_done = concurrent.futures.wait(
                    futures, return_when=concurrent.futures.ALL_COMPLETED)
            else:
                done, not_done = concurrent.futures.wait(
                    futures, return_when=concurrent.futures.FIRST_EXCEPTION)
        if self.failed_sources:
            self.logger.warning("Failed to extract from these source(s): %s",
                                join_with_quotes(self.failed_sources))

        # Note that iterating over result of futures may raise an exception (which surfaces exceptions from threads)
        missing_tables = []  # type: List
        for future in done:
            missing_tables.extend(future.result())

        if missing_tables:
            self.logger.warning(
                "Failed to extract %d relation(s): %s",
                len(missing_tables),
                join_with_quotes(table_name.identifier
                                 for table_name in missing_tables),
            )
        if not_done:
            raise DataExtractError(
                "Extract failed to complete for {:d} source(s)".format(
                    len(not_done)))
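The keep_going flag only changes the return_when mode: ALL_COMPLETED lets every source finish even after one of them fails, while FIRST_EXCEPTION hands control back as soon as any future raises. A small standalone demonstration of the difference (timings are only illustrative):

import concurrent.futures
import time

def extract(source, delay, fail=False):
    time.sleep(delay)
    if fail:
        raise RuntimeError("extract failed for {}".format(source))
    return source

with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    futures = [
        executor.submit(extract, "a", 0.1),
        executor.submit(extract, "b", 0.2, fail=True),
        executor.submit(extract, "c", 1.0),
    ]
    done, not_done = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_EXCEPTION)

print(len(done), len(not_done))  # typically 2 1 -- "c" is still running when "b" fails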
Example #12
def _sanitize_dependencies(descriptions: List[SortableRelationDescription]) -> None:
    """
    Pass 1 of ordering -- make sure to drop unknown dependencies.

    This will change the sortable relations in place.
    """
    known_tables = frozenset({description.target_table_name for description in descriptions})
    has_unknown_dependencies = set()
    has_pg_catalog_dependencies = set()
    known_unknowns = set()

    for initial_order, description in enumerate(descriptions):
        unmanaged_dependencies = frozenset(dep for dep in description.dependencies if not dep.is_managed)
        pg_catalog_dependencies = frozenset(dep for dep in description.dependencies if dep.schema == "pg_catalog")
        unknowns = description.dependencies - known_tables - unmanaged_dependencies
        if unknowns:
            known_unknowns.update(unknowns)
            has_unknown_dependencies.add(description.target_table_name)
            # Drop the unknowns from the list of dependencies so that the loop below doesn't wait for their resolution.
            description.dependencies = description.dependencies.difference(unknowns)
        if unmanaged_dependencies:
            logger.info(
                "The following dependencies for relation '%s' are not managed by Arthur: %s",
                description.identifier,
                join_with_quotes([dep.identifier for dep in unmanaged_dependencies]),
            )
        if pg_catalog_dependencies:
            has_pg_catalog_dependencies.add(description.target_table_name)

    if has_unknown_dependencies:
        logger.warning(
            "These relations were unknown during dependency ordering: %s",
            join_with_quotes([dep.identifier for dep in known_unknowns]),
        )
        logger.warning(
            "This caused these relations to have dependencies that are not known: %s",
            join_with_quotes([dep.identifier for dep in has_unknown_dependencies]),
        )

    # Make tables that depend on pg_catalog tables depend on all our tables
    # (except those that themselves depend on pg_catalog tables).
    has_no_internal_dependencies = known_tables - known_unknowns - has_pg_catalog_dependencies
    for description in descriptions:
        if description.target_table_name in has_pg_catalog_dependencies:
            description.dependencies.update(has_no_internal_dependencies)
Example #13
def validate_reload(schemas: List[DataWarehouseSchema],
                    relations: List[RelationDescription], keep_going: bool):
    """
    Verify that columns between unloaded tables and reloaded tables are the same.

    Once the designs are validated, we can unload a relation 's.t' with a target 'u' and
    then extract and load it back into 'u.t'.

    Note that the order matters for these lists of columns.  (Which is also why we can't take
    the (symmetric) difference between columns but must be careful checking the column lists.)
    """
    unloaded_relations = [d for d in relations if d.is_unloadable]
    target_lookup = {
        schema.name
        for schema in schemas if schema.is_an_unload_target
    }
    relations_lookup = {d.identifier: d for d in relations}

    for unloaded in unloaded_relations:
        try:
            if unloaded.unload_target not in target_lookup:
                raise TableDesignValidationError(
                    "invalid target '%s' in unloadable relation '%s'" %
                    (unloaded.unload_target, unloaded.identifier))
            else:
                logger.debug("Checking whether '%s' is loaded back in",
                             unloaded.identifier)
                reloaded = TableName(unloaded.unload_target,
                                     unloaded.target_table_name.table)
                if reloaded.identifier in relations_lookup:
                    relation = relations_lookup[reloaded.identifier]
                    logger.info(
                        "Checking for consistency between '%s' and '%s'",
                        unloaded.identifier, relation.identifier)
                    unloaded_columns = unloaded.unquoted_columns
                    reloaded_columns = relation.unquoted_columns
                    if unloaded_columns != reloaded_columns:
                        diff = get_list_difference(reloaded_columns,
                                                   unloaded_columns)
                        logger.error(
                            "Column difference detected between '%s' and '%s'",
                            unloaded.identifier, relation.identifier)
                        logger.error(
                            "You need to replace, insert and/or delete in '%s' some column(s): %s",
                            relation.identifier, join_with_quotes(diff))
                        raise TableDesignValidationError(
                            "unloaded relation '%s' failed to match counterpart"
                            % unloaded.identifier)
        except TableDesignValidationError:
            if keep_going:
                _error_occurred.set()
                logger.exception(
                    "Ignoring failure to validate '%s' and proceeding as requested:",
                    unloaded.identifier)
            else:
                raise
Example #14
def show_pipelines(selection: List[str]) -> None:
    """
    List the currently installed pipelines, possibly using a subset based on the selection pattern.

    Without a selection, prints an overview of the pipelines.
    With a selection, digs into details of each selected pipeline.
    """
    pipelines = list_pipelines(selection)

    if not pipelines:
        logger.warning("Found no pipelines")
        print("*** No pipelines found ***")
    elif selection and len(pipelines) > 1:
        logger.warning("Selection matches more than one pipeline")
    if pipelines:
        if selection:
            logger.info(
                "Currently active and selected pipelines: %s",
                join_with_quotes(pipeline.pipeline_id
                                 for pipeline in pipelines),
            )
        else:
            logger.info(
                "Currently active pipelines: %s",
                join_with_quotes(pipeline.pipeline_id
                                 for pipeline in pipelines))
        print(
            etl.text.format_lines(
                [(pipeline.pipeline_id, pipeline.name, pipeline.health_status,
                  pipeline.state) for pipeline in pipelines],
                header_row=["Pipeline ID", "Name", "Health", "State"],
                max_column_width=80,
            ))
    if selection and len(pipelines) == 1:
        pipeline = pipelines[0]
        print()
        print(
            etl.text.format_lines([[key, pipeline.fields[key]]
                                   for key in sorted(pipeline.fields)],
                                  header_row=["Key", "Value"]))
Example #15
def filter_relations_for_sources(
        source_lookup: Dict[str, DataWarehouseSchema],
        relations: List[RelationDescription]) -> List[RelationDescription]:
    """
    Filter for the relations that a given "extract" stage cares about.
    """
    selected = [
        relation for relation in relations
        if relation.source_name in source_lookup
    ]
    if selected:
        sources = frozenset(relation.source_name for relation in selected)
        logger.info("Selected %d relation(s) from source(s): %s",
                    len(selected), join_with_quotes(sources))
    return selected
Example #16
def _promote_schemas(schemas: List[DataWarehouseSchema],
                     from_where: str,
                     dry_run=False) -> None:
    """
    Promote (staging or backup) schemas into their standard names and permissions.

    Changes getattr(schema, from_where + "_name") -> schema.name; expects from_where
    to be 'backup' or 'staging'.
    """
    attr_name = from_where + "_name"
    from_names = [getattr(schema, attr_name) for schema in schemas]
    from_name_schema_lookup = dict(zip(from_names, schemas))

    dsn_etl = etl.config.get_dw_config().dsn_etl
    with closing(etl.db.connection(dsn_etl, autocommit=True,
                                   readonly=dry_run)) as conn:
        need_promotion = etl.db.select_schemas(conn, from_names)
        if not need_promotion:
            logger.info("Found no %s schemas to promote", from_where)
            return

        # Always log the original names
        selected_names = join_with_quotes(
            from_name_schema_lookup[from_name].name
            for from_name in need_promotion)
        if dry_run:
            logger.info(
                "Dry-run: Skipping promotion of %d schema(s) from %s position: %s",
                len(need_promotion),
                from_where,
                selected_names,
            )
            return

        logger.info("Promoting %d schema(s) from %s position: %s",
                    len(need_promotion), from_where, selected_names)
        for from_name in need_promotion:
            schema = from_name_schema_lookup[from_name]
            logger.info("Renaming schema '%s' from '%s'", schema.name,
                        from_name)
            etl.db.drop_schema(conn, schema.name)
            etl.db.alter_schema_rename(conn, from_name, schema.name)
            logger.info(
                "Granting readers and writers access to schema '%s' after promotion",
                schema.name)
            grant_schema_permissions(conn, schema)
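Both this promotion step and backup_schemas above rely on etl.db.alter_schema_rename, which presumably issues a plain ALTER SCHEMA ... RENAME TO. A hypothetical sketch (identifier quoting may differ in the real helper):

def alter_schema_rename(cx, old_name, new_name):
    # Hypothetical sketch; assumes simple double-quoted identifiers.
    with cx.cursor() as cursor:
        cursor.execute('ALTER SCHEMA "{}" RENAME TO "{}"'.format(old_name, new_name))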
Example #17
def create_schema_and_grant_access(conn,
                                   schema,
                                   owner=None,
                                   use_staging=False,
                                   dry_run=False) -> None:
    group_names = join_with_quotes(schema.groups)
    name = schema.staging_name if use_staging else schema.name
    if dry_run:
        logger.info(
            "Dry-run: Skipping creating schema '%s' and granting access to '%s'",
            name, group_names)
    else:
        logger.info("Creating schema '%s'", name)
        etl.db.create_schema(conn, name, owner)
        etl.db.grant_all_on_schema_to_user(conn, name, schema.owner)
        if use_staging:
            # Don't grant usage on staging schemas to readers/writers
            return None
        logger.info("Granting access to %s", group_names)
        for group in schema.groups:
            # Readers/writers are differentiated in table permissions, not schema permissions
            etl.db.grant_usage(conn, name, group)
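grant_usage presumably wraps the standard GRANT USAGE statement; a hypothetical sketch (the real helper may format group names differently):

def grant_usage(cx, schema, group):
    # Hypothetical sketch: groups only get USAGE on the schema here; table-level
    # permissions for readers/writers are granted elsewhere.
    with cx.cursor() as cursor:
        cursor.execute('GRANT USAGE ON SCHEMA "{}" TO GROUP "{}"'.format(schema, group))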
Example #18
    def __init__(self, failed_relations, bad_apple=None):
        self.message = "required relation(s) with failure: " + join_with_quotes(
            failed_relations)
        if bad_apple:
            self.message += ", triggered by load failure of '{}'".format(
                bad_apple)
Example #19
def order_by_dependencies(relation_descriptions):
    """
    Sort the relations so that every relation is loaded only after all of its dependencies.

    If a table (or view) depends on other tables, then its order is larger than the
    order of any of its managed dependencies. Ties are resolved based on the initial
    order of the tables. (This motivates the use of a priority queue.)

    If a table depends on some system catalogs (living in pg_catalog), then the table
    is treated as if it depended on all other tables.

    Provides warnings about:
        * relations that directly depend on relations not in the input
        * relations that are depended upon but are not in the input
    """
    RelationDescription.load_in_parallel(relation_descriptions)
    descriptions = [
        SortableRelationDescription(description)
        for description in relation_descriptions
    ]

    known_tables = frozenset({
        description.target_table_name
        for description in relation_descriptions
    })
    nr_tables = len(known_tables)

    # Phase 1 -- build up the priority queue all the while making sure we have only dependencies that we know about
    has_unknown_dependencies = set()
    has_internal_dependencies = set()
    known_unknowns = set()
    queue = PriorityQueue()
    for initial_order, description in enumerate(descriptions):
        # superset including internal dependencies
        unmanaged_dependencies = set(dep for dep in description.dependencies
                                     if not dep.is_managed)
        pg_internal_dependencies = set(dep for dep in description.dependencies
                                       if dep.schema == 'pg_catalog')
        unknowns = description.dependencies - known_tables - unmanaged_dependencies
        if unknowns:
            known_unknowns.update(unknowns)
            has_unknown_dependencies.add(description.target_table_name)
            # Drop the unknowns from the list of dependencies so that the loop below doesn't wait for their resolution.
            description.dependencies = description.dependencies.difference(
                unknowns)
        if unmanaged_dependencies:
            logger.info(
                "The following dependencies for relation '%s' are not managed by Arthur: %s",
                description.identifier,
                join_with_quotes(
                    [dep.identifier for dep in unmanaged_dependencies]))
        if pg_internal_dependencies:
            has_internal_dependencies.add(description.target_table_name)
        queue.put((1, initial_order, description))
    if has_unknown_dependencies:
        logger.warning(
            "These relations were unknown during dependency ordering: %s",
            join_with_quotes([dep.identifier for dep in known_unknowns]))
        logger.warning(
            'This caused these relations to have dependencies that are not known: %s',
            join_with_quotes(
                [dep.identifier for dep in has_unknown_dependencies]))
    has_no_internal_dependencies = known_tables - known_unknowns - has_internal_dependencies
    for description in descriptions:
        if description.target_table_name in has_internal_dependencies:
            description.dependencies.update(has_no_internal_dependencies)

    # Phase 2 -- keep looping until all relations have their dependencies ordered before them
    table_map = {
        description.target_table_name: description
        for description in descriptions
    }
    latest = 0
    while not queue.empty():
        minimum, tie_breaker, description = queue.get()
        if minimum > 2 * nr_tables:
            raise CyclicDependencyError(
                "Cannot determine order, suspect cycle in DAG of dependencies")
        others = [
            table_map[dep].order for dep in description.dependencies
            if dep.is_managed
        ]
        if not others:
            latest = description.order = latest + 1
        elif all(others):
            latest = description.order = max(max(others), latest) + 1
        elif any(others):
            at_least = max(order for order in others if order is not None)
            queue.put(
                (max(at_least, latest, minimum) + 1, tie_breaker, description))
        else:
            queue.put((max(latest, minimum) + 1, tie_breaker, description))

    return [
        description.original_description
        for description in sorted(descriptions, key=attrgetter("order"))
    ]
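The heart of phase 2 is the retry trick: a relation whose dependencies are not all ordered yet is pushed back into the queue with a larger minimum, and a minimum above 2 * nr_tables means no progress is being made, i.e. a cycle. A simplified, self-contained sketch of that idea on plain dicts (it assumes every dependency is a known table and merges the two requeue branches):

from queue import PriorityQueue

def order_tables(dependencies):
    # dependencies maps table name -> set of names it depends on.
    order = {name: None for name in dependencies}
    queue = PriorityQueue()
    for tie_breaker, name in enumerate(dependencies):
        queue.put((1, tie_breaker, name))
    nr_tables, latest = len(dependencies), 0
    while not queue.empty():
        minimum, tie_breaker, name = queue.get()
        if minimum > 2 * nr_tables:
            raise RuntimeError("cannot determine order, suspect cycle in dependencies")
        others = [order[dep] for dep in dependencies[name]]
        if not others:
            latest = order[name] = latest + 1
        elif all(position is not None for position in others):
            latest = order[name] = max(max(others), latest) + 1
        else:
            # Some dependency is still unordered: retry later with a larger minimum.
            queue.put((max(latest, minimum) + 1, tie_breaker, name))
    return sorted(order, key=order.get)

print(order_tables({"a": set(), "b": {"a"}, "c": {"a", "b"}}))  # ['a', 'b', 'c']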