def build_config_from_args(args, config_manager):
    """Populate a config manager from parsed CLI arguments and return it.

    Calculated fields are always appended. Depending on the manager's
    validation type the following is added on top:

    * column validation: aggregates, plus query groups built from
      ``args.grouped_columns`` when given;
    * row validation: comparison fields built from ``args.comparison_fields``
      when given (also registered as dependent aliases unless ``args.hash``
      is ``"*"``);
    * custom query: aggregates, the custom query type and the source/target
      query files when given.

    Primary keys from ``args.primary_keys`` are applied for every validation
    type when given.

    Args:
        args: Parsed command-line arguments (read via attribute access).
        config_manager (ConfigManager): Validation config manager instance.

    Returns:
        The ``config_manager`` that was passed in, after being updated.

    Raises:
        ValueError: For a custom-query validation when
            ``args.custom_query_type`` is None.
    """
    calculated_fields = get_calculated_config(args, config_manager)
    config_manager.append_calculated_fields(calculated_fields)

    if config_manager.validation_type == consts.COLUMN_VALIDATION:
        column_aggregates = get_aggregate_config(args, config_manager)
        config_manager.append_aggregates(column_aggregates)
        if args.grouped_columns is not None:
            group_names = cli_tools.get_arg_list(args.grouped_columns)
            group_configs = config_manager.build_column_configs(group_names)
            config_manager.append_query_groups(group_configs)
    elif (config_manager.validation_type == consts.ROW_VALIDATION
          and args.comparison_fields is not None):
        field_names = cli_tools.get_arg_list(args.comparison_fields,
                                             default_value=[])
        field_configs = config_manager.build_config_comparison_fields(
            field_names)
        config_manager.append_comparison_fields(field_configs)
        # Only a selective hash (anything but "*") needs the compared columns
        # registered as dependent aliases.
        if args.hash != "*":
            config_manager.append_dependent_aliases(field_names)

    if args.primary_keys is not None:
        key_names = cli_tools.get_arg_list(args.primary_keys)
        key_configs = config_manager.build_column_configs(key_names)
        config_manager.append_primary_keys(key_configs)
        if args.hash != "*":
            config_manager.append_dependent_aliases(key_names)

    # Everything below applies to custom-query validations only.
    if config_manager.validation_type != consts.CUSTOM_QUERY:
        return config_manager

    query_aggregates = get_aggregate_config(args, config_manager)
    config_manager.append_aggregates(query_aggregates)

    if args.custom_query_type is None:
        raise ValueError(
            "Expected custom query type to be given, got empty string.")
    config_manager.append_custom_query_type(args.custom_query_type)

    if args.source_query_file is not None:
        source_files = cli_tools.get_arg_list(args.source_query_file)
        config_manager.append_source_query_file(source_files)
    if args.target_query_file is not None:
        target_files = cli_tools.get_arg_list(args.target_query_file)
        config_manager.append_target_query_file(target_files)

    return config_manager
def find_tables_using_string_matching(args):
    """Return a JSON string with matched tables for use in validations.

    Source and target data clients are created from the connection configs
    that the state manager returns for ``args.source_conn`` and
    ``args.target_conn``. A table map is fetched from each client and the two
    maps are compared with ``_compare_match_tables``.

    Args:
        args: Parsed command-line arguments. Reads ``score_cutoff``,
            ``source_conn``, ``target_conn`` and ``allowed_schemas``.

    Returns:
        str: ``json.dumps`` of the table configs produced by
        ``_compare_match_tables``.
    """
    # Compare against None rather than relying on truthiness: with
    # ``args.score_cutoff or 0.8`` an explicit cutoff of 0 would be discarded
    # and silently replaced by the default.
    score_cutoff = 0.8 if args.score_cutoff is None else args.score_cutoff

    mgr = state_manager.StateManager()
    source_client = clients.get_data_client(
        mgr.get_connection_config(args.source_conn))
    target_client = clients.get_data_client(
        mgr.get_connection_config(args.target_conn))

    # NOTE: the schema allow-list is applied to the source table map only;
    # the target table map is fetched without it.
    allowed_schemas = cli_tools.get_arg_list(args.allowed_schemas)
    source_table_map = get_table_map(source_client,
                                     allowed_schemas=allowed_schemas)
    target_table_map = get_table_map(target_client)

    table_configs = _compare_match_tables(source_table_map,
                                          target_table_map,
                                          score_cutoff=score_cutoff)
    return json.dumps(table_configs)
def get_calculated_config(args, config_manager):
    """Build the calculated-field configs required by ``args.hash``.

    When ``args.hash`` is not set, the config manager is left untouched and
    an empty list is returned. Otherwise the dependent aliases for a
    ``"hash"`` calculation are requested from the config manager, one
    calculated-field config is built per alias, and a ``hash__all``
    comparison field is appended to the config manager at the greatest depth
    found among the aliases (0 when there are none).

    Args:
        args: Parsed command-line arguments; only ``args.hash`` is read.
        config_manager(ConfigManager): Validation config manager instance.

    Returns:
        list: Calculated-field configs, in the order the aliases were
        returned by the config manager.
    """
    if not args.hash:
        return []

    # "*" is forwarded as None; anything else is split into a column list.
    hash_columns = None
    if args.hash != "*":
        hash_columns = cli_tools.get_arg_list(args.hash)

    dependent_fields = config_manager._build_dependent_aliases(
        "hash", hash_columns)
    alias_names = [entry["name"] for entry in dependent_fields]

    # Add to list of necessary columns for selective hashing in order to drop
    # excess columns with invalid data types (i.e. structs) when generating
    # source/target DFs.
    if hash_columns:
        config_manager.append_dependent_aliases(hash_columns)
        config_manager.append_dependent_aliases(alias_names)

    deepest = max((entry["depth"] for entry in dependent_fields), default=0)

    calculated = [
        config_manager.build_config_calculated_fields(
            entry["reference"],
            entry["calc_type"],
            entry["name"],
            entry["depth"],
            None,
        )
        for entry in dependent_fields
    ]

    hash_comparison = config_manager.build_config_comparison_fields(
        ["hash__all"], depth=deepest)
    config_manager.append_comparison_fields(hash_comparison)
    return calculated
# Example #4
# 0
def test_get_arg_list(test_input, expected):
    """Check that ``cli_tools.get_arg_list`` maps the input to the expected value."""
    assert cli_tools.get_arg_list(test_input) == expected
# Example #5
# 0
def test_find_tables_config():
    """Check that the find-tables CLI arguments yield the expected first allowed schema."""
    parsed = cli_tools.configure_arg_parser().parse_args(CLI_FIND_TABLES_ARGS)

    schemas = cli_tools.get_arg_list(parsed.allowed_schemas)
    first_schema = schemas[0]
    assert first_schema == "my_schema"
def get_aggregate_config(args, config_manager):
    """Return the list of formatted aggregation objects requested in ``args``.

    The list always starts with the aggregate returned by
    ``config_manager.build_config_count_aggregate()``. After that, each of the
    ``count``, ``sum``, ``avg``, ``min``, ``max`` and ``bit_xor`` arguments
    that is set contributes its column aggregates, in that order. A value of
    ``"*"`` is forwarded as ``None`` columns; any other value is split into a
    column list with ``cli_tools.get_arg_list``.

    Args:
        args: Parsed command-line arguments. Reads the six aggregate
            arguments above plus ``wildcard_include_string_len`` and
            ``cast_to_bigint``.
        config_manager (ConfigManager): Validation config manager instance.

    Returns:
        list: Aggregate config objects.
    """
    collected = [config_manager.build_config_count_aggregate()]

    type_filter = [
        "float64",
        "float32",
        "int8",
        "int16",
        "int32",
        "int64",
        "decimal",
        "timestamp",
    ]
    if args.wildcard_include_string_len:
        type_filter.append("string")

    use_bigint = bool(args.cast_to_bigint)

    # Each entry pairs an aggregate name (also the attribute name on ``args``)
    # with the data-type filter handed to the config manager. "count" is the
    # only aggregate requested without a type filter.
    requests = (
        ("count", None),
        ("sum", type_filter),
        ("avg", type_filter),
        ("min", type_filter),
        ("max", type_filter),
        ("bit_xor", type_filter),
    )
    for agg_name, allowed_types in requests:
        requested = getattr(args, agg_name)
        if not requested:
            continue
        if requested == "*":
            columns = None
        else:
            columns = cli_tools.get_arg_list(requested)
        collected += config_manager.build_config_column_aggregates(
            agg_name, columns, allowed_types, cast_to_bigint=use_bigint)

    return collected
def build_config_managers_from_args(args):
    """Return a list of config managers ready to execute.

    One config manager is built per entry of the tables list resolved from
    ``args.tables_list``. Non-schema validations are then completed with
    ``build_config_from_args``; schema validations only receive the
    case-folded exclusion columns, when given.

    Args:
        args: Parsed command-line arguments of a validation command.

    Returns:
        list: Config managers, in tables-list order.

    Raises:
        ValueError: If ``args.validate_cmd`` is not one of the known
            validation commands.
    """
    command = args.validate_cmd.capitalize()
    known_types = {
        "Schema": consts.SCHEMA_VALIDATION,
        "Column": consts.COLUMN_VALIDATION,
        "Row": consts.ROW_VALIDATION,
        "Custom-query": consts.CUSTOM_QUERY,
    }
    if command not in known_types:
        raise ValueError(f"Unknown Validation Type: {command}")
    config_type = known_types[command]
    is_schema = config_type == consts.SCHEMA_VALIDATION

    # The BigQuery result handler takes precedence over the generic one.
    handler_spec = args.bq_result_handler or args.result_handler_config
    result_handler_config = None
    if handler_spec:
        result_handler_config = cli_tools.get_result_handler(
            handler_spec, args.service_account)

    # Filters and threshold are only read for non-schema validations;
    # labels are read for every validation type.
    filter_config = []
    threshold = 0.0
    if not is_schema:
        if args.filters:
            filter_config = cli_tools.get_filters(args.filters)
        if args.threshold:
            threshold = args.threshold
    labels = cli_tools.get_labels(args.labels)

    mgr = state_manager.StateManager()
    source_client = clients.get_data_client(
        mgr.get_connection_config(args.source_conn))
    target_client = clients.get_data_client(
        mgr.get_connection_config(args.target_conn))

    output_format = args.format or "table"

    # Random-row options are not read for schema validations.
    use_random_rows = None
    random_row_batch_size = None
    if not is_schema:
        use_random_rows = args.use_random_row
        random_row_batch_size = args.random_row_batch_size

    is_filesystem = source_client._source_type == "FileSystem"
    tables_list = cli_tools.get_tables_list(args.tables_list,
                                            default_value=[{}],
                                            is_filesystem=is_filesystem)

    managers = []
    for table_obj in tables_list:
        manager = ConfigManager.build_config_manager(
            config_type,
            args.source_conn,
            args.target_conn,
            table_obj,
            labels,
            threshold,
            output_format,
            use_random_rows=use_random_rows,
            random_row_batch_size=random_row_batch_size,
            source_client=source_client,
            target_client=target_client,
            result_handler_config=result_handler_config,
            filter_config=filter_config,
            verbose=args.verbose,
        )
        if is_schema:
            if args.exclusion_columns is not None:
                excluded = cli_tools.get_arg_list(args.exclusion_columns)
                manager.append_exclusion_columns(
                    [col.casefold() for col in excluded])
        else:
            manager = build_config_from_args(args, manager)

        managers.append(manager)

    return managers