def test_get():
    """Table 'get' operator: table found, database missing, and table missing."""

    def tests(env: MasonEnvironment, config: Config, op: Operator):
        # (parameter string, expected-case index passed to the table.get fixture)
        cases = [
            ("database_name:crawler-poc,table_name:catalog_poc_data", 1),   # database and table exist
            ("database_name:bad-database,table_name:catalog_poc_data", 2),  # database DNE
            ("database_name:crawler-poc,table_name:bad-table", 3),          # table DNE
        ]
        for param_string, expected_case in cases:
            params = OperatorParameters(parameter_string=param_string)
            outcome = op.validate(config, params).run(env, Response())
            assert outcome.with_status() == table.get(
                config.metastore().client.name(), expected_case)

    run_tests("table", "get", True, "fatal", ["1", "2"], tests)
def test_delete():
    """Table 'delete' operator: successful delete, database not found, table not found.

    Fix: dropped the redundant ``f`` prefixes on parameter strings that contain
    no placeholders (ruff F541); the literal values are unchanged.
    """
    # NOTE(review): a second test_delete is defined further down in this source;
    # if both end up in the same module, pytest only collects the later one —
    # TODO confirm whether these belong to separate files.

    def tests(env: MasonEnvironment, config: Config, op: Operator):
        # valid delete
        params = OperatorParameters(
            parameter_string="table_name:good_table,database_name:good_database")
        good = op.validate(config, params).run(env, Response())
        assert (good.with_status() == ({
            'Info': ['Table good_table successfully deleted.']
        }, 200))

        # database DNE
        params = OperatorParameters(
            parameter_string="table_name:bad_table,database_name:bad_database")
        bad = op.validate(config, params).run(env, Response())
        assert (bad.with_status() == ({
            'Errors': ['Database bad_database not found.']
        }, 400))

        # table DNE
        params = OperatorParameters(
            parameter_string="table_name:bad_table,database_name:good_database")
        bad = op.validate(config, params).run(env, Response())
        assert (bad.with_status() == ({
            'Errors': ['Table bad_table not found.']
        }, 400))

    run_tests("table", "delete", True, "fatal", ["1"], tests)
def test_post():
    """Table 'infer' workflow: not-yet-existing target, existing target, and the API entry point."""

    def tests(env: MasonEnvironment, config: Config, wf: Workflow):
        dne_param_file = from_root(
            "/test/support/parameters/table_infer_parameters_1.yaml")
        exists_param_file = from_root(
            "/test/support/parameters/table_infer_parameters_2.yaml")

        # target does not exist yet
        params = WorkflowParameters(parameter_path=dne_param_file)
        outcome = wf.validate(env, config, params).run(env, Response())
        assert outcome.with_status() == expects.post(False)

        # target already exists
        params = WorkflowParameters(parameter_path=exists_param_file)
        outcome = wf.validate(env, config, params).run(env, Response())
        assert outcome.with_status() == expects.post(True)

        # same workflow driven through the API runner
        response, status = run(
            "workflow",
            wf.namespace,
            wf.command,
            param_file=dne_param_file,
            config_id=config.id,
            env=env,
            log_level="fatal")
        assert (response, status) == expects.post(False)

    run_tests("table", "infer", True, "fatal", ["1"], tests, workflow=True)
def test_query():
    """Table 'query' operator: a permitted query and an access-denied one, per backend config.

    Fix: removed the dead reassignment of ``query`` before the bad-permissions
    case — it was set to the identical literal a second time.
    """

    def tests(env: MasonEnvironment, config: Config, op: Operator):
        query = "SELECT * from $table limit 3"
        output_path = from_root("/.tmp/")

        # valid query
        params = OperatorParameters(
            parameter_string=
            f"query_string:{query},database_name:good_database,table_name:good_table,output_path:{output_path}"
        )
        result = op.validate(config, params).run(env, Response())
        # Expected Info lines differ by execution backend (config id "1" vs "4").
        # NOTE(review): "succesfully" appears to mirror the operator's own output
        # verbatim, typo included — confirm before correcting the spelling here.
        exp = {
            "1": [
                'Running Query "SELECT * from $table limit 3"',
                'Running Athena query. query_id: test',
                'Running job id=test'
            ],
            "4": [
                f'Table succesfully formatted as parquet and exported to {output_path}'
            ]
        }
        expect = {'Info': exp[config.id]}
        assert (result.with_status() == (expect, 200))

        # bad permissions
        params = OperatorParameters(
            parameter_string=
            f"query_string:{query},database_name:access_denied,table_name:good_table,output_path:{output_path}"
        )
        result = op.validate(config, params).run(env, Response())
        exp_2 = {
            "1": ({
                'Errors': [
                    'Job errored: Access denied for credentials. Ensure associated user or role has permission to CreateNamedQuery on athena'
                ],
                'Info': ['Running Query "SELECT * from $table limit 3"']
            }, 403),
            "4": ({
                'Info': [
                    f'Table succesfully formatted as parquet and exported to {output_path}'
                ]
            }, 200)
        }
        assert (result.with_status() == exp_2[config.id])

    run_tests("table", "query", True, "fatal", ["1", "4"], tests)

    # remove any query output the operators wrote under /.tmp/
    tmp_folder = from_root("/.tmp/")
    if path.exists(tmp_folder):
        shutil.rmtree(tmp_folder)
def test_refresh():
    """Table 'refresh' operator: a fresh refresh succeeds; an in-flight one reports already-refreshing."""

    def tests(env: MasonEnvironment, config: Config, op: Operator):
        # (parameter string, whether the table is already mid-refresh)
        for param_string, already_refreshing in (
            ("table_name:catalog_poc_data,database_name:crawler-poc", False),
            ("table_name:catalog_poc_data_refreshing,database_name:crawler-poc", True),
        ):
            params = OperatorParameters(parameter_string=param_string)
            outcome = op.validate(config, params).run(env, Response())
            assert outcome.with_status() == table.refresh(already_refreshing)

    run_tests("table", "refresh", True, "fatal", ["1"], tests)
def test_infer():
    """Table 'infer' operator: missing database, bad storage path, and a valid inference.

    Fixes: dropped three ``f`` prefixes on strings with no placeholders (ruff
    F541); renamed the ``good`` local, which was misleadingly reused for the
    two error responses; replaced ``list(map(lambda ...))`` with a
    comprehension. Literal values and call order are unchanged.
    """

    def tests(env: MasonEnvironment, config: Config, op: Operator):
        # database DNE
        params = OperatorParameters(
            parameter_string=
            "database_name:bad-database,storage_path:crawler-poc/catalog_poc_data")
        missing_db = op.validate(config, params).run(env, Response())
        assert (missing_db.with_status() == ({
            'Errors': ['Job errored: Metastore database bad-database not found'],
            'Info': ['Table inferred: catalog_poc_data']
        }, 404))

        # bad path
        params = OperatorParameters(
            parameter_string=
            "database_name:crawler-poc,storage_path:crawler-poc/bad-table")
        bad_path = op.validate(config, params).run(env, Response())
        assert (bad_path.with_status() == ({
            'Errors': [
                'No keys at s3://crawler-poc/bad-table',
                'Job errored: Invalid Tables: No keys at s3://crawler-poc/bad-table'
            ]
        }, 404))

        # valid path
        params = OperatorParameters(
            parameter_string=
            "database_name:crawler-poc,storage_path:crawler-poc/catalog_poc_data,output_path:crawler-poc/athena/"
        )
        valid = op.validate(config, params).run(env, Response())

        def clean(s: List[str]) -> List[str]:
            # strip whitespace and normalize uuids so the comparison is stable
            return [clean_uuid(clean_string(i)) for i in s]

        infos = clean(valid.formatted()["Info"])
        expect = [
            'Tableinferred:catalog_poc_data',
            'RunningAthenaquery.query_id:test_id',
            'Runningjobid=test_id'
        ]
        assert (infos == expect)

    run_tests("table", "infer", True, "fatal", ["3"], tests)
def test_delete():
    """Schedule 'delete' operator: successful delete and schedule-not-found.

    Fixes: dropped the redundant ``f`` prefixes on placeholder-less parameter
    strings (ruff F541) and renamed the second ``good`` local, which actually
    holds the error response, to ``bad``. Literal values are unchanged.
    """
    # NOTE(review): an earlier test_delete is defined above in this source; if
    # both end up in the same module, pytest only collects this later one —
    # TODO confirm whether these belong to separate files.

    def tests(env: MasonEnvironment, config: Config, op: Operator):
        # valid delete
        params = OperatorParameters(
            parameter_string="schedule_name:good_schedule")
        good = op.validate(config, params).run(env, Response())
        assert (good.with_status() == ({
            'Info': ['Schedule good_schedule successfully deleted.']
        }, 200))

        # dne
        params = OperatorParameters(
            parameter_string="schedule_name:bad_schedule")
        bad = op.validate(config, params).run(env, Response())
        assert (bad.with_status() == ({
            'Errors': ["Crawler entry with name bad_schedule does not exist"]
        }, 400))

    run_tests("schedule", "delete", True, "fatal", ["1"], tests)
def test_index():
    """Table 'list' operator: listing an existing database vs. a missing one."""

    def tests(env: MasonEnvironment, config: Config, op: Operator):
        # database exists
        params = OperatorParameters(parameter_string="database_name:crawler-poc")
        validated = op.validate(config, params)
        listed = validated.run(env, Response())
        assert listed.with_status() == table.index(
            config.metastore().client.name())

        # database DNE
        params = OperatorParameters(parameter_string="database_name:bad-database")
        missing = op.validate(config, params).run(env, Response())
        assert missing.with_status() == table.index(
            config.metastore().client.name(), False)

    run_tests("table", "list", True, "fatal", ["1", "2"], tests)
def test_format():
    """Table 'format' operator: an unknown format yields InvalidJob; csv yields ExecutedJob.

    Fix: dropped the redundant ``f`` prefixes on parameter strings that contain
    no placeholders (ruff F541); the literal values are unchanged.
    """
    # Live-credential test: pull environment settings from the repo's .env file.
    load_dotenv(from_root("/../.env"), override=True)

    def tests(env: MasonEnvironment, config: Config, op: Operator):
        # unsupported format -> job is rejected
        params = OperatorParameters(
            parameter_string=
            "database_name:mason-sample-data,table_name:tests/in/csv/,format:boogo,output_path:mason-sample-data/tests/out/csv/"
        )
        rejected = op.validate(config, params).run(env, Response())
        invalid_job = rejected.object
        assert (isinstance(invalid_job, InvalidJob))

        # supported csv format -> job executes
        params = OperatorParameters(
            parameter_string=
            "database_name:mason-sample-data,table_name:tests/in/csv/,format:csv,output_path:good_output_path"
        )
        executed = op.validate(config, params).run(env, Response())
        executed_job = executed.object
        assert (isinstance(executed_job, ExecutedJob))

    run_tests("table", "format", True, "fatal", ["4"], tests)
def test_get():
    """Job 'get' operator: fetch results for a known job id and error for an unknown one."""

    def tests(env: MasonEnvironment, config: Config, op: Operator):
        # valid job_id
        params = OperatorParameters(parameter_string="job_id:good_job_id")
        # TODO: consolidate these
        # Expected payload keyed by the configured execution client.
        success_by_client = {
            'spark': {
                'Data': [{
                    'Logs': ['<LOG_DATA>']
                }]
            },
            'athena': {
                'Data': [{
                    'ResultSetMetadata': {
                        'ColumnInfo': [{
                            'CaseSensitive': True,
                            'CatalogName': 'hive',
                            'Label': 'widget',
                            'Name': 'widget',
                            'Nullable': 'UNKNOWN',
                            'Precision': 2147483647,
                            'Scale': 0,
                            'SchemaName': '',
                            'TableName': '',
                            'Type': 'varchar'
                        }]
                    },
                    'Rows': [{
                        'Data': [{
                            'VarCharValue': 'widget'
                        }]
                    }]
                }],
                'Info': ['Job Status: SUCCEEDED']
            },
        }
        good_outcome = op.validate(config, params).run(env, Response())
        client = config.execution().client.name()
        assert good_outcome.with_status() == (success_by_client[client], 200)

        # invalid job_id
        params = OperatorParameters(parameter_string="job_id:bad_job_id")
        bad_outcome = op.validate(config, params).run(env, Response())
        failure_by_client = {
            'spark': {
                'Errors': [
                    'Error from server (NotFound): pods "bad_job_id-driver" not found'
                ]
            },
            'athena': {
                'Errors': [
                    'QueryExecution bad_job_id was not found',
                    'Job errored: Invalid Job: QueryExecution bad_job_id was not found'
                ]
            }
        }
        client = config.execution().client.name()
        assert bad_outcome.with_status() == (failure_by_client[client], 400)

    run_tests("job", "get", True, "fatal", ["1", "2"], tests)