def __init__(self, config, source_client=None, target_client=None, verbose=False):
    """Initialize a ConfigManager client which supplies the source and target queries to run.

    Args:
        config (Dict): The Validation config supplied.
        source_client (IbisClient): The Ibis client for the source DB. When
            omitted (or falsy), one is created with
            ``clients.get_data_client`` from ``self.get_source_connection()``.
        target_client (IbisClient): The Ibis client for the target DB. When
            omitted (or falsy), one is created with
            ``clients.get_data_client`` from ``self.get_target_connection()``.
        verbose (Bool): If verbose, the Data Validation client will print
            queries run.

    Raises:
        ValueError: If ``self.validation_type`` is not one of
            ``consts.CONFIG_TYPES``.
    """
    self._state_manager = state_manager.StateManager()
    self._config = config

    # Only build a client when the caller did not hand one in.
    if not source_client:
        source_client = clients.get_data_client(self.get_source_connection())
    self.source_client = source_client

    if not target_client:
        target_client = clients.get_data_client(self.get_target_connection())
    self.target_client = target_client

    self.verbose = verbose

    if self.validation_type in consts.CONFIG_TYPES:
        return
    raise ValueError(f"Unknown Configuration Type: {self.validation_type}")
def build_config_managers_from_yaml(args):
    """Returns List[ConfigManager] instances ready to be executed.

    The YAML file located via ``_get_arg_config_file(args)`` is loaded, its
    source and target connections are resolved through a ``StateManager``,
    and one client per side is created and shared by every validation.

    Args:
        args: Parsed CLI arguments; ``args.verbose`` is forwarded to each
            ConfigManager.

    Returns:
        list: One ConfigManager per entry under ``consts.YAML_VALIDATIONS``.
    """
    yaml_configs = _get_yaml_config_from_file(_get_arg_config_file(args))

    connections = state_manager.StateManager()
    source_conn = connections.get_connection_config(
        yaml_configs[consts.YAML_SOURCE]
    )
    target_conn = connections.get_connection_config(
        yaml_configs[consts.YAML_TARGET]
    )

    # Clients are created once and reused for all validations in the file.
    source_client = clients.get_data_client(source_conn)
    target_client = clients.get_data_client(target_conn)

    managers = []
    for validation in yaml_configs[consts.YAML_VALIDATIONS]:
        # Each validation entry is enriched in place with the shared settings.
        validation[consts.CONFIG_SOURCE_CONN] = source_conn
        validation[consts.CONFIG_TARGET_CONN] = target_conn
        validation[consts.CONFIG_RESULT_HANDLER] = yaml_configs[
            consts.YAML_RESULT_HANDLER
        ]
        managers.append(
            ConfigManager(
                validation,
                source_client,
                target_client,
                verbose=args.verbose,
            )
        )
    return managers
def run_raw_query_against_connection(args):
    """Return results of raw query for adhoc usage.

    Args:
        args: Parsed CLI arguments; ``args.conn`` identifies the connection
            to resolve and ``args.query`` is the SQL passed to ``raw_sql``.

    Returns:
        Whatever ``fetchall()`` yields on the cursor returned by the client.
    """
    connection_config = state_manager.StateManager().get_connection_config(
        args.conn
    )
    data_client = clients.get_data_client(connection_config)
    with data_client.raw_sql(args.query, results=True) as cursor:
        return cursor.fetchall()
def find_tables_using_string_matching(args):
    """Return JSON String with matched tables for use in validations.

    Args:
        args: Parsed CLI arguments. Reads ``source_conn``, ``target_conn``,
            ``allowed_schemas`` and ``score_cutoff``.

    Returns:
        str: ``json.dumps`` of the table configs produced by
        ``_compare_match_tables``.
    """
    # Fall back to 0.8 only when no cutoff was supplied. A plain `or` would
    # also discard an explicit cutoff of 0, which is falsy.
    score_cutoff = 0.8 if args.score_cutoff is None else args.score_cutoff

    mgr = state_manager.StateManager()
    source_client = clients.get_data_client(
        mgr.get_connection_config(args.source_conn)
    )
    target_client = clients.get_data_client(
        mgr.get_connection_config(args.target_conn)
    )

    # The schema allow-list is applied to the source side only; the target
    # table map is built without it.
    allowed_schemas = cli_tools.get_arg_list(args.allowed_schemas)
    source_table_map = get_table_map(
        source_client, allowed_schemas=allowed_schemas
    )
    target_table_map = get_table_map(target_client)

    table_configs = _compare_match_tables(
        source_table_map, target_table_map, score_cutoff=score_cutoff
    )
    return json.dumps(table_configs)
def test_compile(module_under_test, fs):
    """Compile a RandomRowBuilder query and check the executed result.

    ``fs`` is requested as a parameter but never referenced in the body
    (presumably a filesystem fixture that must be active -- confirm).
    """
    _create_table_file(TABLE_FILE_PATH, JSON_DATA)
    data_client = clients.get_data_client(CONN_CONFIG)

    key_columns = ["col_a"]
    batch_size = 10
    row_builder = module_under_test.RandomRowBuilder(key_columns, batch_size)
    compiled_query = row_builder.compile(
        data_client, None, CONN_CONFIG["table_name"]
    )
    result = data_client.execute(compiled_query)

    # Only the primary key columns come back, with one row per requested slot.
    assert list(result.columns) == key_columns
    assert len(result) == batch_size
def run_connections(args):
    """Run commands related to connection management.

    Args:
        args: Parsed CLI arguments. ``args.connect_cmd`` selects the action:
            ``"list"`` lists connections, ``"add"`` validates and stores the
            connection described by ``args`` under ``args.connection_name``.

    Raises:
        ValueError: If ``args.connect_cmd`` is neither ``"list"`` nor
            ``"add"``.
    """
    command = args.connect_cmd

    if command == "list":
        cli_tools.list_connections()
        return

    if command == "add":
        connection = cli_tools.get_connection_config_from_args(args)
        # Test getting a client to validate connection details
        clients.get_data_client(connection)
        cli_tools.store_connection(args.connection_name, connection)
        return

    raise ValueError(f"Connections Argument '{command}' is not supported")
def test_random_row_query_builder():
    """Build, compile and execute a random-row query for ``station_id``.

    Checks the compiled SQL against ``EXPECTED_RANDOM_ROW_QUERY``, that ten
    rows are returned, and that the ids differ from one fixed sequence.
    """
    client = clients.get_data_client(BQ_CONN)
    builder = random_row_builder.RandomRowBuilder(["station_id"], 10)
    query = builder.compile(
        client,
        "bigquery-public-data.new_york_citibike",
        "citibike_stations",
    )
    random_rows = client.execute(query)

    # NOTE(review): presumably the ids a non-random ordering would return --
    # confirm against the dataset.
    fixed_sequence = [
        4683,
        4676,
        4675,
        4674,
        4673,
        4671,
        4670,
        4666,
        4665,
        4664,
    ]

    assert query.compile() == EXPECTED_RANDOM_ROW_QUERY
    assert len(random_rows["station_id"]) == 10
    assert list(random_rows["station_id"]) != fixed_sequence
def test_get_pandas_data_client(fs):
    """Check that ``SOURCE_CONN_CONFIG`` resolves to a ``PandasClient``.

    ``fs`` is requested as a parameter but never referenced in the body.
    """
    # The table file is created before the client is requested.
    _create_table_file(SOURCE_TABLE_FILE_PATH, JSON_DATA)
    client = clients.get_data_client(SOURCE_CONN_CONFIG)
    assert isinstance(client, PandasClient)
def test_get_oracle_data_client():
    """Expect a connection failure whose message mentions installing cx_Oracle."""
    install_hint = r".*pip install cx_Oracle"
    with pytest.raises(
        exceptions.DataClientConnectionFailure, match=install_hint
    ):
        clients.get_data_client(ORACLE_CONN_CONFIG)
def build_config_managers_from_args(args):
    """Return a list of config managers ready to execute.

    Args:
        args: Parsed CLI arguments. ``args.validate_cmd`` selects the
            validation type (schema, column, row or custom-query); the
            remaining attributes supply connections, tables, result handler
            and per-type options.

    Returns:
        list: One config manager per table object from
        ``cli_tools.get_tables_list``.

    Raises:
        ValueError: If ``args.validate_cmd`` is not a known validation type.
    """
    configs = []

    validate_cmd = args.validate_cmd.capitalize()
    if validate_cmd == "Schema":
        config_type = consts.SCHEMA_VALIDATION
    elif validate_cmd == "Column":
        config_type = consts.COLUMN_VALIDATION
    elif validate_cmd == "Row":
        config_type = consts.ROW_VALIDATION
    elif validate_cmd == "Custom-query":
        config_type = consts.CUSTOM_QUERY
    else:
        raise ValueError(f"Unknown Validation Type: {validate_cmd}")
    is_schema = config_type == consts.SCHEMA_VALIDATION

    # The BigQuery handler flag takes precedence over the generic one.
    result_handler_config = None
    handler_spec = args.bq_result_handler or args.result_handler_config
    if handler_spec:
        result_handler_config = cli_tools.get_result_handler(
            handler_spec, args.service_account
        )

    # Schema validation will not accept filters, labels, or threshold as flags
    filter_config = []
    labels = []
    threshold = 0.0
    if not is_schema:
        if args.filters:
            filter_config = cli_tools.get_filters(args.filters)
        if args.threshold:
            threshold = args.threshold
        labels = cli_tools.get_labels(args.labels)

    connections = state_manager.StateManager()
    source_client = clients.get_data_client(
        connections.get_connection_config(args.source_conn)
    )
    target_client = clients.get_data_client(
        connections.get_connection_config(args.target_conn)
    )

    output_format = args.format or "table"

    # Random-row options are left as None for schema validation.
    use_random_rows = None
    random_row_batch_size = None
    if not is_schema:
        use_random_rows = args.use_random_row
        random_row_batch_size = args.random_row_batch_size

    is_filesystem = source_client._source_type == "FileSystem"
    tables_list = cli_tools.get_tables_list(
        args.tables_list, default_value=[{}], is_filesystem=is_filesystem
    )

    for table_obj in tables_list:
        manager = ConfigManager.build_config_manager(
            config_type,
            args.source_conn,
            args.target_conn,
            table_obj,
            labels,
            threshold,
            output_format,
            use_random_rows=use_random_rows,
            random_row_batch_size=random_row_batch_size,
            source_client=source_client,
            target_client=target_client,
            result_handler_config=result_handler_config,
            filter_config=filter_config,
            verbose=args.verbose,
        )
        if not is_schema:
            manager = build_config_from_args(args, manager)
        elif args.exclusion_columns is not None:
            excluded = cli_tools.get_arg_list(args.exclusion_columns)
            # Excluded column names are case-folded before being appended.
            manager.append_exclusion_columns(
                [column.casefold() for column in excluded]
            )
        configs.append(manager)

    return configs