def check_test_cnt(
    paths: Sequence[str], manifest: Dict[str, Any], required_tests: Dict[str, int]
) -> int:
    """Check that each changed model has enough tests of every required type.

    Args:
        paths: Paths to check; forwarded to ``get_model_sqls``.
        manifest: Parsed manifest; forwarded to the lookup helpers.
        required_tests: Mapping of test type to the minimum number of tests
            of that type a model must have.

    Returns:
        ``0`` when every model meets every requirement, ``1`` otherwise.
        One line is printed for each unmet requirement.
    """
    status_code = 0
    sqls = get_model_sqls(paths, manifest)
    filenames = set(sqls.keys())

    # get manifest nodes that pre-commit found as changed
    models = get_models(manifest, filenames)

    for model in models:
        childs = get_parent_childs(
            manifest=manifest,
            obj=model,
            manifest_node="child_map",
            node_types=["test"],
        )
        # Tally tests per type directly. No sorting is involved, so the
        # tally does not require the test types to be mutually orderable.
        type_counts: Dict[Any, int] = {}
        for child in childs:
            if isinstance(child, Test):
                type_counts[child.test_type] = (
                    type_counts.get(child.test_type, 0) + 1
                )

        for required_test, required_cnt in required_tests.items():
            test_cnt = type_counts.get(required_test, 0)
            # Only a real shortfall is a failure: a requirement of 0 is
            # satisfied by a model that has no tests of that type.
            if test_cnt < required_cnt:
                status_code = 1
                print(
                    f"{model.model_name}: "
                    f"has only {test_cnt} {required_test} tests, but "
                    f"{required_cnt} are required.",
                )
    return status_code
def check_test_cnt(paths: Sequence[str], manifest: Dict[str, Any], test_cnt: int) -> int:
    """Check that each changed model has at least ``test_cnt`` tests.

    Args:
        paths: Paths to check; forwarded to ``get_model_sqls``.
        manifest: Parsed manifest; forwarded to the lookup helpers.
        test_cnt: Minimum number of tests a model must have.

    Returns:
        ``0`` if every model has enough tests, ``1`` otherwise. One line is
        printed for each model that falls short.
    """
    exit_code = 0
    model_sqls = get_model_sqls(paths, manifest)
    changed_files = set(model_sqls.keys())

    # Only the manifest nodes matching the changed files are inspected.
    for node in get_models(manifest, changed_files):
        related = get_parent_childs(
            manifest=manifest,
            obj=node,
            manifest_node="child_map",
            node_types=["test"],
        )
        # Count only children that are actual ``Test`` objects.
        found = sum(1 for child in related if isinstance(child, Test))
        if found < test_cnt:
            exit_code = 1
            print(
                f"{node.model_name}: "
                f"has only {found} tests, but {test_cnt} are required.",
            )
    return exit_code
def has_description(paths: Sequence[str], manifest: Dict[str, Any]) -> int:
    """Check that every changed model has a description.

    A model passes when either its manifest node or a matching schema entry
    from the given yml files carries a truthy ``description``.

    Args:
        paths: Paths to check; forwarded to the file lookup helpers.
        manifest: Parsed manifest; forwarded to the lookup helpers.

    Returns:
        ``0`` if every model is described, ``1`` otherwise. One line is
        printed for each model without a description.
    """
    yml_files = get_filenames(paths, [".yml", ".yaml"])
    model_sqls = get_model_sqls(paths, manifest)
    changed = set(model_sqls.keys())

    # Manifest nodes for the changed files.
    nodes = get_models(manifest, changed)
    # Schema entries cover the case where the properties file was edited
    # but the model was not re-run.
    schema_entries = get_model_schemas(list(yml_files.values()), changed)

    described_in_manifest = set()
    for node in nodes:
        if node.node.get("description"):
            described_in_manifest.add(node.filename)

    described_in_schema = set()
    for entry in schema_entries:
        if entry.schema.get("description"):
            described_in_schema.add(entry.model_name)

    undescribed = changed.difference(described_in_manifest, described_in_schema)
    for name in undescribed:
        print(
            f"{model_sqls.get(name)}: "
            f"does not have defined description or properties file is missing.",
        )
    return 1 if undescribed else 0
def check_model_columns(
    paths: Sequence[str], manifest: Dict[str, Any], catalog: Dict[str, Any]
) -> int:
    """Compare each changed model's declared columns with the catalog's.

    Args:
        paths: Paths to check; forwarded to ``get_model_sqls``.
        manifest: Parsed manifest; forwarded to the lookup helpers.
        catalog: Parsed catalog; its ``"nodes"`` mapping is looked up by
            model id.

    Returns:
        ``0`` when every model is present in the catalog and
        ``compare_columns`` reports no differences, ``1`` otherwise. A
        message is printed for each mismatch or missing catalog entry.
    """
    status_code = 0
    sqls = get_model_sqls(paths, manifest)
    filenames = set(sqls.keys())

    # get manifest nodes that pre-commit found as changed
    models = get_models(manifest, filenames)

    catalog_nodes = catalog.get("nodes", {})
    for model in models:
        catalog_node = catalog_nodes.get(model.model_id, {})
        if not catalog_node:
            status_code = 1
            print(
                f"Unable to find model `{model.model_id}` in catalog file. "
                f"Make sure you run `dbt docs generate` before executing this hook."
            )
            continue

        model_only, catalog_only = compare_columns(
            catalog_columns=catalog_node.get("columns", {}),
            model_columns=model.node.get("columns", {}),
        )
        # ``dict.get(key, default)`` only falls back when the key is absent.
        # Use ``or`` so a present-but-empty/None ``patch_path`` also gets the
        # generic label instead of printing "None".
        schema_path = model.node.get("patch_path") or "schema"  # pragma: no mutate
        sql_path = sqls.get(model.filename)

        # Both directions of the mismatch are reported in the same format.
        reports = (
            (model_only, f"columns in {schema_path} but not in db (catalog.json)"),
            (catalog_only, f"columns in db (catalog.json) but not in {schema_path}"),
        )
        for columns, problem in reports:
            if not columns:
                continue
            status_code = 1
            listing = "\n".join(  # pragma: no mutate
                f"- name: {col}" for col in columns if col
            )
            print(f"{sql_path}: {problem}:\n{listing}")
    return status_code
def check_column_desc(paths: Sequence[str], manifest: Dict[str, Any]) -> Tuple[int, Dict[str, Any]]:
    """Find columns of changed models that have no description.

    Manifest models are processed first, then schema entries from the given
    yml files. Results for the same model name are merged in that order.

    Args:
        paths: Paths to check; forwarded to the file lookup helpers.
        manifest: Parsed manifest; forwarded to the lookup helpers.

    Returns:
        A ``(status_code, missing)`` tuple. ``status_code`` is ``1`` when at
        least one model still has undescribed columns, else ``0``.
        ``missing`` maps model name to the set of undescribed column names.
    """
    yml_files = get_filenames(paths, [".yml", ".yaml"])
    model_sqls = get_model_sqls(paths, manifest)
    changed = set(model_sqls.keys())

    # Manifest nodes for the changed files.
    nodes = get_models(manifest, changed)
    # Schema entries cover the case where the properties file was edited
    # but the model was not re-run.
    schema_entries = get_model_schemas(list(yml_files.values()), changed)

    missing: Dict[str, Set[str]] = {}
    for entry in itertools.chain(nodes, schema_entries):
        if isinstance(entry, ModelSchema):
            name = entry.model_name
            undocumented = {
                column.get("name")
                for column in entry.schema.get("columns", [])
                if not column.get("description")
            }
        elif isinstance(entry, Model):
            name = entry.filename
            undocumented = {
                column_name
                for column_name, meta in entry.node.get("columns", {}).items()
                if not meta.get("description")
            }
        else:
            continue  # pragma: no cover, no mutate

        previous = missing.get(name)
        if not previous:
            # Nothing recorded yet (or an earlier reset): record new gaps only.
            if undocumented:
                missing[name] = undocumented
        elif undocumented:
            # Both the earlier and the current entry report gaps: combine them.
            missing[name] = previous.union(undocumented)
        else:
            # The current entry reports no gaps: clear the earlier ones.
            missing[name] = set()  # pragma: no mutate

    exit_code = 0
    for name, columns in missing.items():
        if not columns:
            continue
        exit_code = 1
        bullets = "\n- ".join(columns)  # pragma: no mutate
        print(
            f"{model_sqls.get(name)}: "
            f"following columns are missing description:\n- {bullets}",
        )
    return exit_code, missing
def check_child_parent_cnt(
    paths: Sequence[str],
    manifest: Dict[str, Any],
    required_cnt: Sequence[Dict[str, Any]],
) -> int:
    """Check the number of child/parent nodes of each changed model.

    Args:
        paths: Paths to check; forwarded to ``get_model_sqls``.
        manifest: Parsed manifest; forwarded to the lookup helpers.
        required_cnt: Requirements, each a dict with the keys:
            ``"cnt"`` (threshold; a falsy value disables the requirement),
            ``"operator"`` (callable ``(real, cnt) -> bool`` that returns
            true on a violation, default ``operator.lt``),
            ``"type"`` (label used in the printed message) and
            ``"dep"`` (``"childs"`` or ``"parents"``, selects which count
            is compared).

    Returns:
        ``0`` when no requirement is violated, ``1`` otherwise. One line is
        printed for each violation.
    """
    status_code = 0
    sqls = get_model_sqls(paths, manifest)
    filenames = set(sqls.keys())

    # get manifest nodes that pre-commit found as changed
    models = get_models(manifest, filenames)

    for model in models:
        childs = list(
            get_parent_childs(
                manifest=manifest,
                obj=model,
                manifest_node="child_map",
                node_types=["model"],
            )
        )
        parents = list(
            get_parent_childs(
                manifest=manifest,
                obj=model,
                manifest_node="parent_map",
                node_types=["model", "source"],
            )
        )
        real_cnt = {"childs": len(childs), "parents": len(parents)}
        for required in required_cnt:
            req_cnt = required.get("cnt")
            req_operator = required.get("operator", operator.lt)
            req_type = required.get("type")  # pragma: no mutate
            req_dep = required.get("dep", "")  # pragma: no mutate
            # NOTE(review): a ``dep`` other than "childs"/"parents" yields
            # ``None`` here, which the default ``operator.lt`` cannot
            # compare with an int.
            real_value = real_cnt.get(req_dep)
            if req_cnt and req_operator(real_value, req_cnt):
                status_code = 1
                # The space after ``{req_cnt}`` keeps the number from being
                # glued to "is/are" in the printed message.
                print(
                    f"{model.model_name}: "
                    f"has {real_value} {req_type}, but {req_type} {req_cnt} "
                    f"is/are required.",
                )
    return status_code
def validate_tags(paths: Sequence[str], manifest: Dict[str, Any], tags: Sequence[str]) -> int:
    """Check that changed models only use tags from the allowed list.

    Args:
        paths: Paths to check; forwarded to ``get_model_sqls``.
        manifest: Parsed manifest; forwarded to the lookup helpers.
        tags: The allowed tags.

    Returns:
        ``0`` when every model's tags are a subset of ``tags``, ``1``
        otherwise. For each offending model its invalid tags are printed in
        sorted order.
    """
    status_code = 0
    sqls = get_model_sqls(paths, manifest)
    filenames = set(sqls.keys())

    # get manifest nodes that pre-commit found as changed
    models = get_models(manifest, filenames)

    # The allowed set is the same for every model; build it once.
    valid_tags = set(tags)
    for model in models:
        # tags can be specified only from manifest
        model_tags = set(model.node.get("tags", []))
        invalid_tags = model_tags.difference(valid_tags)
        if invalid_tags:
            status_code = 1
            # Sort so the report is stable between runs (set order is not).
            result = "\n- ".join(sorted(invalid_tags))  # pragma: no mutate
            print(
                f"{model.node.get('original_file_path', model.filename)}: "
                f"has invalid tags:\n- {result}",
            )
    return status_code
def has_properties_file(paths: Sequence[str], manifest: Dict[str, Any]) -> Tuple[int, Set[str]]:
    """Check that every changed model has a truthy ``patch_path`` in the manifest.

    Args:
        paths: Paths to check; forwarded to ``get_model_sqls``.
        manifest: Parsed manifest; forwarded to the lookup helpers.

    Returns:
        A ``(status_code, missing)`` tuple. ``missing`` holds the model
        names without a ``patch_path``; ``status_code`` is ``1`` when that
        set is non-empty, else ``0``. One line is printed per missing model.
    """
    model_sqls = get_model_sqls(paths, manifest)
    changed = set(model_sqls.keys())

    # Manifest nodes for the changed files.
    nodes = get_models(manifest, changed)

    with_properties = set()
    for node in nodes:
        if node.node.get("patch_path"):
            with_properties.add(node.filename)

    missing = changed.difference(with_properties)
    for name in missing:
        print(
            f"{model_sqls.get(name)}: "
            f"does not have model properties defined in any .yml file.",
        )
    exit_code = 1 if missing else 0
    return exit_code, missing
def check_test_cnt(
    paths: Sequence[str],
    manifest: Dict[str, Any],
    test_group: Dict[str, int],
    test_cnt: int,
) -> int:
    """Check that each changed model has enough distinct tests from a group.

    For every model, this counts how many names in ``test_group`` match the
    ``test_name`` of at least one of the model's tests.

    Args:
        paths: Paths to check; forwarded to ``get_model_sqls``.
        manifest: Parsed manifest; forwarded to the lookup helpers.
        test_group: Test names to look for (only the keys are used).
        test_cnt: Minimum number of names from ``test_group`` that must be
            present on a model.

    Returns:
        ``0`` if every model reaches ``test_cnt``, ``1`` otherwise. One line
        is printed for each model that falls short.
    """
    exit_code = 0
    model_sqls = get_model_sqls(paths, manifest)
    changed = set(model_sqls.keys())

    def name_of(test):
        return test.test_name

    # Only the manifest nodes matching the changed files are inspected.
    for node in get_models(manifest, changed):
        related = list(
            get_parent_childs(
                manifest=manifest,
                obj=node,
                manifest_node="child_map",
                node_types=["test"],
            )
        )
        model_tests = [child for child in related if isinstance(child, Test)]
        # Distinct test names present on this model.
        present = {
            name for name, _ in groupby(sorted(model_tests, key=name_of), name_of)
        }
        matched = sum(1 for wanted in test_group if wanted in present)
        if matched < test_cnt:
            print(
                f"{node.model_name}: "
                f"has only {matched} test(s) from {test_group}.",
            )
            exit_code = 1
    return exit_code