def test_case_sensitivity():
    """Matching is case-insensitive by default and case-sensitive when
    ignoreCase is disabled."""
    insensitive = AllowDenyPattern(allow=["Foo.myTable"])
    for candidate in ("foo.mytable", "FOO.MYTABLE", "Foo.MyTable"):
        assert insensitive.allowed(candidate)

    sensitive = AllowDenyPattern(allow=["Foo.myTable"], ignoreCase=False)
    assert not sensitive.allowed("foo.mytable")
    assert sensitive.allowed("Foo.myTable")
def extract_dbt_entities(
    nodes: Dict[str, dict],
    catalog: Dict[str, dict],
    load_catalog: bool,
    target_platform: str,
    environment: str,
    node_type_pattern: AllowDenyPattern,
) -> List[DBTNode]:
    """Convert raw dbt manifest nodes into DBTNode objects.

    :param nodes: manifest entries keyed by dbt unique_id.
    :param catalog: catalog entries keyed by dbt unique_id.
    :param load_catalog: when True, column schemas and physical names are
        taken from the catalog.
    :param target_platform: platform used when building datahub URNs.
    :param environment: environment (fabric) used when building datahub URNs.
    :param node_type_pattern: filter on the node's resource_type.
    :return: list of populated DBTNode objects.
    """
    dbt_entities = []
    for key, node in nodes.items():
        # check if node pattern allowed based on config file
        if not node_type_pattern.allowed(node["resource_type"]):
            continue

        dbtNode = DBTNode()
        dbtNode.dbt_name = key
        dbtNode.database = node["database"]
        dbtNode.schema = node["schema"]
        dbtNode.dbt_file_path = node["original_file_path"]
        dbtNode.node_type = node["resource_type"]
        # Prefer the physical identifier over the dbt name when we are not
        # reading names from the catalog.
        if "identifier" in node and not load_catalog:
            dbtNode.name = node["identifier"]
        else:
            dbtNode.name = node["name"]

        if "materialized" in node["config"]:
            # It's a model
            dbtNode.materialization = node["config"]["materialized"]
            dbtNode.upstream_urns = get_upstreams(
                node["depends_on"]["nodes"],
                nodes,
                load_catalog,
                target_platform,
                environment,
            )
        else:
            # It's a source: get materialization from catalog? required?
            # NOTE(review): raises KeyError if the source is absent from the
            # catalog — confirm upstream guarantees this entry exists.
            dbtNode.materialization = catalog[key]["metadata"]["type"]
            dbtNode.upstream_urns = []

        if dbtNode.materialization != "ephemeral" and load_catalog:
            # we don't want columns if platform isn't 'dbt'
            logger.debug("Loading schema info")
            dbtNode.columns = get_columns(catalog[dbtNode.dbt_name])
        else:
            dbtNode.columns = []

        dbtNode.datahub_urn = get_urn_from_dbtNode(
            dbtNode.database,
            dbtNode.schema,
            dbtNode.name,
            target_platform,
            environment,
        )
        dbt_entities.append(dbtNode)
    return dbt_entities
def extract_dbt_entities(
    all_manifest_entities: Dict[str, Dict[str, Any]],
    all_catalog_entities: Dict[str, Dict[str, Any]],
    sources_results: List[Dict[str, Any]],
    load_schemas: bool,
    use_identifiers: bool,
    tag_prefix: str,
    node_type_pattern: AllowDenyPattern,
    report: SourceReport,
    node_name_pattern: AllowDenyPattern,
) -> List[DBTNode]:
    """Convert raw dbt manifest/catalog entries into DBTNode objects.

    :param all_manifest_entities: manifest entries keyed by dbt unique_id.
    :param all_catalog_entities: catalog entries keyed by dbt unique_id.
    :param sources_results: source freshness results (each holds unique_id
        and max_loaded_at).
    :param load_schemas: NOTE(review): accepted but unused in this body —
        confirm whether callers still rely on it.
    :param use_identifiers: prefer the node's physical identifier over its
        dbt name when available.
    :param tag_prefix: string prepended to every dbt tag.
    :param node_type_pattern: filter on the node's resource_type.
    :param report: ingestion report used to surface warnings.
    :param node_name_pattern: filter on the node's unique_id.
    :return: list of populated DBTNode objects.
    """
    # Index freshness results by unique_id for O(1) lookup per node.
    sources_by_id = {x["unique_id"]: x for x in sources_results}

    dbt_entities = []
    for key, manifest_node in all_manifest_entities.items():
        # check if node pattern allowed based on config file
        if not node_type_pattern.allowed(manifest_node["resource_type"]):
            continue

        # Name resolution precedence: alias > identifier (if enabled) > name.
        name = manifest_node["name"]
        if "identifier" in manifest_node and use_identifiers:
            name = manifest_node["identifier"]
        if manifest_node.get("alias") is not None:
            name = manifest_node["alias"]

        if not node_name_pattern.allowed(key):
            continue

        # Single catalog lookup, reused below for comment, type, and columns
        # (the original looked this up multiple times per node).
        catalog_node = all_catalog_entities.get(key)

        # initialize comment to "" for consistency with descriptions
        # (since dbt null/undefined descriptions as "")
        comment = ""
        if catalog_node is not None and catalog_node["metadata"].get("comment"):
            comment = catalog_node["metadata"]["comment"]

        materialization = None
        upstream_nodes = []
        if "materialized" in manifest_node.get("config", {}):
            # It's a model
            materialization = manifest_node["config"]["materialized"]
            upstream_nodes = manifest_node["depends_on"]["nodes"]

        # It's a source
        catalog_type = None
        if catalog_node is None:
            report.report_warning(
                key,
                f"Entity {key} ({name}) is in manifest but missing from catalog",
            )
        else:
            catalog_type = catalog_node["metadata"]["type"]

        # Owner: node-level meta wins, config-level meta is the fallback.
        meta = manifest_node.get("meta", {})
        owner = meta.get("owner")
        if owner is None:
            owner = manifest_node.get("config", {}).get("meta", {}).get("owner")

        tags = [tag_prefix + tag for tag in manifest_node.get("tags", [])]

        # Meta properties follow the same precedence as owner.
        meta_props = meta if meta else manifest_node.get("config", {}).get("meta", {})

        dbtNode = DBTNode(
            dbt_name=key,
            database=manifest_node["database"],
            schema=manifest_node["schema"],
            name=name,
            dbt_file_path=manifest_node["original_file_path"],
            node_type=manifest_node["resource_type"],
            max_loaded_at=sources_by_id.get(key, {}).get("max_loaded_at"),
            comment=comment,
            description=manifest_node.get("description", ""),
            raw_sql=manifest_node.get("raw_sql"),
            upstream_nodes=upstream_nodes,
            materialization=materialization,
            catalog_type=catalog_type,
            columns=[],
            meta=meta_props,
            tags=tags,
            owner=owner,
        )

        # overwrite columns from catalog
        if dbtNode.materialization != "ephemeral":
            # we don't want columns if platform isn't 'dbt'
            logger.debug("Loading schema info")
            if catalog_node is None:
                report.report_warning(
                    key,
                    f"Entity {dbtNode.dbt_name} is in manifest but missing from catalog",
                )
            else:
                dbtNode.columns = get_columns(catalog_node, manifest_node, tag_prefix)
        else:
            dbtNode.columns = []

        dbt_entities.append(dbtNode)
    return dbt_entities
def test_deny_all():
    """A catch-all deny regex rejects everything, even with an empty allow list."""
    deny_everything = AllowDenyPattern(allow=[], deny=[".*"])
    assert not deny_everything.allowed("foo.table")
def test_default_deny():
    """Names not matched by the allow list are rejected by default."""
    only_mytable = AllowDenyPattern(allow=["foo.mytable"])
    assert not only_mytable.allowed("foo.bar")
def test_single_table():
    """An exact table name present in the allow list is permitted."""
    only_mytable = AllowDenyPattern(allow=["foo.mytable"])
    assert only_mytable.allowed("foo.mytable")
def extract_dbt_entities(
    all_manifest_entities: Dict[str, Dict[str, Any]],
    all_catalog_entities: Dict[str, Dict[str, Any]],
    sources_results: List[Dict[str, Any]],
    load_catalog: bool,
    target_platform: str,
    environment: str,
    node_type_pattern: AllowDenyPattern,
    report: SourceReport,
) -> List[DBTNode]:
    """Convert raw dbt manifest/catalog entries into DBTNode objects.

    :param all_manifest_entities: manifest entries keyed by dbt unique_id.
    :param all_catalog_entities: catalog entries keyed by dbt unique_id.
    :param sources_results: source freshness results (each holds unique_id
        and max_loaded_at).
    :param load_catalog: when True, column schemas are read from the catalog
        and the dbt name is preferred over the physical identifier.
    :param target_platform: platform used when building datahub URNs.
    :param environment: environment (fabric) used when building datahub URNs.
    :param node_type_pattern: filter on the node's resource_type.
    :param report: ingestion report used to surface warnings.
    :return: list of populated DBTNode objects.
    """
    # Index freshness results by unique_id for O(1) lookup per node.
    sources_by_id = {x["unique_id"]: x for x in sources_results}

    dbt_entities = []
    for key, node in all_manifest_entities.items():
        # check if node pattern allowed based on config file
        if not node_type_pattern.allowed(node["resource_type"]):
            continue

        # Prefer the physical identifier over the dbt name when we are not
        # reading names from the catalog.
        name = node["name"]
        if "identifier" in node and not load_catalog:
            name = node["identifier"]

        materialization = None
        upstream_urns = []
        if "materialized" in node.get("config", {}):
            # It's a model
            materialization = node["config"]["materialized"]
            upstream_urns = get_upstreams(
                node["depends_on"]["nodes"],
                all_manifest_entities,
                load_catalog,
                target_platform,
                environment,
            )

        # It's a source
        # Single catalog lookup, reused below for type and columns
        # (the original looked this up twice per node).
        catalog_node = all_catalog_entities.get(key)
        catalog_type = None
        if catalog_node is None:
            report.report_warning(
                key,
                f"Entity {name} is in manifest but missing from catalog",
            )
        else:
            catalog_type = catalog_node["metadata"]["type"]

        dbtNode = DBTNode(
            dbt_name=key,
            database=node["database"],
            schema=node["schema"],
            dbt_file_path=node["original_file_path"],
            node_type=node["resource_type"],
            max_loaded_at=sources_by_id.get(key, {}).get("max_loaded_at"),
            name=name,
            upstream_urns=upstream_urns,
            materialization=materialization,
            catalog_type=catalog_type,
            columns=[],
            datahub_urn=get_urn_from_dbtNode(
                node["database"],
                node["schema"],
                name,
                target_platform,
                environment,
            ),
            meta=node.get("meta", {}),
        )

        # overwrite columns from catalog
        if dbtNode.materialization != "ephemeral" and load_catalog:
            # we don't want columns if platform isn't 'dbt'
            logger.debug("Loading schema info")
            if catalog_node is None:
                report.report_warning(
                    key,
                    f"Entity {dbtNode.dbt_name} is in manifest but missing from catalog",
                )
            else:
                dbtNode.columns = get_columns(catalog_node)
        else:
            dbtNode.columns = []

        dbt_entities.append(dbtNode)
    return dbt_entities