def set_required_relations(relations: List[RelationDescription], required_selector: TableSelector) -> None:
    """
    Set the required property of the relations if they are directly or indirectly
    feeding into relations selected by the :required_selector.

    Mutates every relation in :relations in place (via its private ``_is_required``
    flag); returns nothing.
    """
    logger.info("Loading table design for %d relation(s) to mark required relations", len(relations))
    ordered_descriptions = order_by_dependencies(relations)
    # Start with all descriptions that are matching the required selector.
    required_relations = [
        description
        for description in ordered_descriptions
        if required_selector.match(description.target_table_name)
    ]
    # Track identifiers of relations already marked required so that a description
    # which both matches the selector AND feeds a required relation is not appended
    # twice (a duplicate would inflate the count reported in the log line below).
    required_identifiers = {description.target_table_name.identifier for description in required_relations}
    # Walk through descriptions in reverse dependency order, expanding the required
    # set based on dependency fan-out: anything that a required relation depends on
    # becomes required as well (transitively, since required_relations grows as we go).
    for description in reversed(ordered_descriptions):
        if description.target_table_name.identifier in required_identifiers:
            continue
        if any(description.target_table_name in required.dependencies for required in required_relations):
            required_relations.append(description)
            required_identifiers.add(description.target_table_name.identifier)
    # Reset every relation first, then flag the required subset.
    for relation in ordered_descriptions:
        relation._is_required = False
    for relation in required_relations:
        relation._is_required = True
    logger.info(
        "Marked %d relation(s) as required based on selector: %s", len(required_relations), required_selector
    )
def find_matches(relations: Sequence[RelationDescription], selector: TableSelector):
    """Return list of matching relations."""
    # Collect every relation whose target table name the selector accepts,
    # preserving the input order.
    matching = []
    for candidate in relations:
        if selector.match(candidate.target_table_name):
            matching.append(candidate)
    return matching
def fetch_tables(cx: Connection, source: DataWarehouseSchema, selector: TableSelector) -> List[TableName]: """ Retrieve tables (matching selector) for this source, return as a list of TableName instances. The :source configuration contains an "allowlist" (which tables to include) and a "denylist" (which tables to exclude). Note that "exclude" always overrides "include." The list of tables matching the allowlist but not the denylist can be further narrowed down by the pattern in :selector. """ # Look for relations ('r', ordinary tables), materialized views ('m'), and views ('v'). result = etl.db.query( cx, """ SELECT nsp.nspname AS "schema" , cls.relname AS "table" FROM pg_catalog.pg_class AS cls JOIN pg_catalog.pg_namespace AS nsp ON cls.relnamespace = nsp.oid WHERE cls.relname NOT LIKE 'tmp%%' AND cls.relname NOT LIKE 'pg_%%' AND cls.relkind IN ('r', 'm', 'v') ORDER BY nsp.nspname , cls.relname """, ) found = [] for row in result: source_table_name = TableName(row["schema"], row["table"]) target_table_name = TableName(source.name, row["table"]) for reject_pattern in source.exclude_tables: if source_table_name.match_pattern(reject_pattern): logger.debug("Table '%s' matches denylist", source_table_name.identifier) break else: for accept_pattern in source.include_tables: if source_table_name.match_pattern(accept_pattern): if selector.match(target_table_name): found.append(source_table_name) logger.debug("Table '%s' is included in result set", source_table_name.identifier) break else: logger.debug( "Table '%s' matches allowlist but is not selected", source_table_name.identifier) logger.info( "Found %d table(s) matching patterns; allowlist=%s, denylist=%s, subset='%s'", len(found), source.include_tables, source.exclude_tables, selector, ) return found
def find_data_files_in_s3(bucket_name: str, prefix: str) -> Iterator[str]:
    """Return paths of data files."""
    # List everything under the prefix, match against an unconstrained selector,
    # then keep only the files flagged as "data".
    objects = etl.s3.list_objects_for_prefix(bucket_name, prefix)
    matched = _find_matching_files_from(objects, TableSelector())
    yield from (info.filename for info in matched if info.file_type == "data")