Example #1
def test_case_sensitivity():
    pattern = AllowDenyPattern(allow=["Foo.myTable"])
    assert pattern.allowed("foo.mytable")
    assert pattern.allowed("FOO.MYTABLE")
    assert pattern.allowed("Foo.MyTable")
    pattern = AllowDenyPattern(allow=["Foo.myTable"], ignoreCase=False)
    assert not pattern.allowed("foo.mytable")
    assert pattern.allowed("Foo.myTable")
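These tests assume an AllowDenyPattern class with regex-based allow/deny lists and an ignoreCase flag. A minimal sketch of such a class, for illustration only (the real implementation in DataHub is a pydantic model and may differ in detail):

import re
from typing import List, Optional

class AllowDenyPattern:
    """Hypothetical sketch: a value is allowed if it matches no deny
    pattern and at least one allow pattern."""

    def __init__(
        self,
        allow: Optional[List[str]] = None,
        deny: Optional[List[str]] = None,
        ignoreCase: bool = True,
    ) -> None:
        # default is allow-all, which the tests above rely on
        self.allow = [".*"] if allow is None else allow
        self.deny = deny or []
        self.flags = re.IGNORECASE if ignoreCase else 0

    def allowed(self, string: str) -> bool:
        # deny patterns take precedence over allow patterns
        if any(re.match(p, string, self.flags) for p in self.deny):
            return False
        return any(re.match(p, string, self.flags) for p in self.allow)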
Example #2
def extract_dbt_entities(
    nodes: Dict[str, dict],
    catalog: Dict[str, dict],
    load_catalog: bool,
    target_platform: str,
    environment: str,
    node_type_pattern: AllowDenyPattern,
) -> List[DBTNode]:
    dbt_entities = []
    for key in nodes:
        node = nodes[key]
        dbtNode = DBTNode()

        # check if node pattern allowed based on config file
        if not node_type_pattern.allowed(node["resource_type"]):
            continue
        dbtNode.dbt_name = key
        dbtNode.database = node["database"]
        dbtNode.schema = node["schema"]
        dbtNode.dbt_file_path = node["original_file_path"]
        dbtNode.node_type = node["resource_type"]
        if "identifier" in node and load_catalog is False:
            dbtNode.name = node["identifier"]
        else:
            dbtNode.name = node["name"]

        if "materialized" in node["config"].keys():
            # It's a model
            dbtNode.materialization = node["config"]["materialized"]
            dbtNode.upstream_urns = get_upstreams(
                node["depends_on"]["nodes"],
                nodes,
                load_catalog,
                target_platform,
                environment,
            )
        else:
            # It's a source
            dbtNode.materialization = catalog[key]["metadata"][
                "type"]  # get materialization from catalog? required?
            dbtNode.upstream_urns = []

        if (dbtNode.materialization != "ephemeral" and
                load_catalog):  # we don't want columns if platform isn't 'dbt'
            logger.debug("Loading schema info")
            dbtNode.columns = get_columns(catalog[dbtNode.dbt_name])
        else:
            dbtNode.columns = []

        dbtNode.datahub_urn = get_urn_from_dbtNode(
            dbtNode.database,
            dbtNode.schema,
            dbtNode.name,
            target_platform,
            environment,
        )

        dbt_entities.append(dbtNode)

    return dbt_entities
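For reference, here is a hypothetical shape for one entry of the nodes mapping, inferred from the keys the function above reads; values are illustrative, loosely following dbt's manifest.json layout, not taken from a real project:

# Hypothetical manifest entry; keys mirror what extract_dbt_entities accesses.
example_nodes = {
    "model.jaffle_shop.customers": {
        "resource_type": "model",             # filtered via node_type_pattern
        "database": "analytics",
        "schema": "public",
        "name": "customers",
        "identifier": "customers",            # preferred when load_catalog is False
        "original_file_path": "models/customers.sql",
        "config": {"materialized": "table"},  # "materialized" key marks a model
        "depends_on": {"nodes": []},          # upstream dbt node ids
    }
}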
Example #3
def extract_dbt_entities(
    all_manifest_entities: Dict[str, Dict[str, Any]],
    all_catalog_entities: Dict[str, Dict[str, Any]],
    sources_results: List[Dict[str, Any]],
    load_schemas: bool,
    use_identifiers: bool,
    tag_prefix: str,
    node_type_pattern: AllowDenyPattern,
    report: SourceReport,
    node_name_pattern: AllowDenyPattern,
) -> List[DBTNode]:
    sources_by_id = {x["unique_id"]: x for x in sources_results}

    dbt_entities = []
    for key, manifest_node in all_manifest_entities.items():
        # check if node pattern allowed based on config file
        if not node_type_pattern.allowed(manifest_node["resource_type"]):
            continue

        name = manifest_node["name"]
        if "identifier" in manifest_node and use_identifiers:
            name = manifest_node["identifier"]

        if manifest_node.get("alias") is not None:
            name = manifest_node["alias"]

        if not node_name_pattern.allowed(key):
            continue

        # initialize comment to "" for consistency with descriptions
        # (since dbt null/undefined descriptions as "")
        comment = ""

        if key in all_catalog_entities and all_catalog_entities[key]["metadata"].get(
            "comment"
        ):
            comment = all_catalog_entities[key]["metadata"]["comment"]

        materialization = None
        upstream_nodes = []

        if "materialized" in manifest_node.get("config", {}):
            # It's a model
            materialization = manifest_node["config"]["materialized"]
            upstream_nodes = manifest_node["depends_on"]["nodes"]

        # For sources, the type comes from the catalog
        catalog_node = all_catalog_entities.get(key)
        catalog_type = None

        if catalog_node is None:
            report.report_warning(
                key,
                f"Entity {key} ({name}) is in manifest but missing from catalog",
            )
        else:
            catalog_type = all_catalog_entities[key]["metadata"]["type"]

        meta = manifest_node.get("meta", {})

        owner = meta.get("owner")
        if owner is None:
            owner = manifest_node.get("config", {}).get("meta",
                                                        {}).get("owner")

        tags = manifest_node.get("tags", [])
        tags = [tag_prefix + tag for tag in tags]
        meta_props = manifest_node.get("meta", {})
        if not meta:
            meta_props = manifest_node.get("config", {}).get("meta", {})
        dbtNode = DBTNode(
            dbt_name=key,
            database=manifest_node["database"],
            schema=manifest_node["schema"],
            name=name,
            dbt_file_path=manifest_node["original_file_path"],
            node_type=manifest_node["resource_type"],
            max_loaded_at=sources_by_id.get(key, {}).get("max_loaded_at"),
            comment=comment,
            description=manifest_node.get("description", ""),
            raw_sql=manifest_node.get("raw_sql"),
            upstream_nodes=upstream_nodes,
            materialization=materialization,
            catalog_type=catalog_type,
            columns=[],
            meta=meta_props,
            tags=tags,
            owner=owner,
        )

        # overwrite columns from catalog
        # (ephemeral models are never materialized, so they have no catalog columns)
        if dbtNode.materialization != "ephemeral":
            logger.debug("Loading schema info")
            catalog_node = all_catalog_entities.get(key)

            if catalog_node is None:
                report.report_warning(
                    key,
                    f"Entity {dbtNode.dbt_name} is in manifest but missing from catalog",
                )
            else:
                dbtNode.columns = get_columns(catalog_node, manifest_node, tag_prefix)

        else:
            dbtNode.columns = []

        dbt_entities.append(dbtNode)

    return dbt_entities
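The DBTNode constructor call above implies roughly the following structure. A hypothetical dataclass sketch with field types inferred from the call site; the real class may differ:

from dataclasses import dataclass
from typing import Any, Dict, List, Optional

@dataclass
class DBTNode:
    # Hypothetical sketch; fields and types inferred from usage above.
    dbt_name: str
    database: str
    schema: str
    name: str
    dbt_file_path: str
    node_type: str
    max_loaded_at: Optional[str]
    comment: str
    description: str
    raw_sql: Optional[str]
    upstream_nodes: List[str]
    materialization: Optional[str]
    catalog_type: Optional[str]
    columns: List[Any]
    meta: Dict[str, Any]
    tags: List[str]
    owner: Optional[str]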
Example #4
def test_deny_all():
    pattern = AllowDenyPattern(allow=[], deny=[".*"])
    assert not pattern.allowed("foo.table")
Example #5
def test_default_deny():
    pattern = AllowDenyPattern(allow=["foo.mytable"])
    assert not pattern.allowed("foo.bar")
Example #6
def test_single_table():
    pattern = AllowDenyPattern(allow=["foo.mytable"])
    assert pattern.allowed("foo.mytable")
Example #7
def extract_dbt_entities(
    all_manifest_entities: Dict[str, Dict[str, Any]],
    all_catalog_entities: Dict[str, Dict[str, Any]],
    sources_results: List[Dict[str, Any]],
    load_catalog: bool,
    target_platform: str,
    environment: str,
    node_type_pattern: AllowDenyPattern,
    report: SourceReport,
) -> List[DBTNode]:

    sources_by_id = {x["unique_id"]: x for x in sources_results}

    dbt_entities = []
    for key, node in all_manifest_entities.items():
        # check if node pattern allowed based on config file
        if not node_type_pattern.allowed(node["resource_type"]):
            continue

        name = node["name"]

        if "identifier" in node and not load_catalog:
            name = node["identifier"]

        materialization = None
        upstream_urns = []

        if "materialized" in node.get("config", {}).keys():
            # It's a model
            materialization = node["config"]["materialized"]
            upstream_urns = get_upstreams(
                node["depends_on"]["nodes"],
                all_manifest_entities,
                load_catalog,
                target_platform,
                environment,
            )

        # For sources, the type comes from the catalog
        catalog_node = all_catalog_entities.get(key)
        catalog_type = None

        if catalog_node is None:
            report.report_warning(
                key,
                f"Entity {name} is in manifest but missing from catalog",
            )
        else:
            catalog_type = all_catalog_entities[key]["metadata"]["type"]

        dbtNode = DBTNode(
            dbt_name=key,
            database=node["database"],
            schema=node["schema"],
            dbt_file_path=node["original_file_path"],
            node_type=node["resource_type"],
            max_loaded_at=sources_by_id.get(key, {}).get("max_loaded_at"),
            name=name,
            upstream_urns=upstream_urns,
            materialization=materialization,
            catalog_type=catalog_type,
            columns=[],
            datahub_urn=get_urn_from_dbtNode(
                node["database"],
                node["schema"],
                name,
                target_platform,
                environment,
            ),
            meta=node.get("meta", {}),
        )

        # overwrite columns from catalog
        if (dbtNode.materialization != "ephemeral" and
                load_catalog):  # we don't want columns if platform isn't 'dbt'
            logger.debug("Loading schema info")
            catalog_node = all_catalog_entities.get(key)

            if catalog_node is None:
                report.report_warning(
                    key,
                    f"Entity {dbtNode.dbt_name} is in manifest but missing from catalog",
                )
            else:
                dbtNode.columns = get_columns(catalog_node)

        else:
            dbtNode.columns = []

        dbt_entities.append(dbtNode)

    return dbt_entities
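get_urn_from_dbtNode is not shown in these examples. A plausible sketch, assuming DataHub's standard dataset URN layout; the actual helper may differ:

def get_urn_from_dbtNode(
    database: str, schema_: str, name: str, target_platform: str, environment: str
) -> str:
    # Hypothetical sketch: join the fully qualified name and wrap it in a
    # DataHub dataset URN (urn:li:dataset layout assumed, not confirmed here).
    db_fqn = f"{database}.{schema_}.{name}"
    return (
        f"urn:li:dataset:("
        f"urn:li:dataPlatform:{target_platform},{db_fqn},{environment})"
    )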