def register_dag(
    self,
    schedule_name: str,
    valid_dag: ValidDag,
    schedule: Optional[Schedule],
    response: Response,
) -> Tuple[str, Response, Optional[ClientDag]]:
    """Store ``valid_dag`` on this instance.

    ``schedule`` is accepted but not read by this implementation.

    Returns:
        ``(schedule_name, response, None)``. An info entry is added to
        ``response`` before the dag is stored.
    """
    response.add_info("Registering DAG in local memory")
    self.dag = valid_dag

    # The third slot of the result is always None here.
    result = (schedule_name, response, None)
    return result
def run(self, env: MasonEnvironment, config: Config,
        parameters: ValidatedParameters,
        response: Response) -> OperatorResponse:
    """Fetch a table from the metastore and run a query job against it.

    Reads ``query_string``, ``database_name`` and ``table_name`` (required)
    and ``output_path`` (optional) from ``parameters``.  The output path is
    only resolved when it is set and ``config.storage()`` is a
    ``StorageClient``; otherwise the job gets ``None`` as its output path.

    Returns:
        An ``OperatorResponse`` wrapping the response and either the result
        of ``config.execution().run_job`` or an ``InvalidJob`` built from
        the metastore lookup's message when no ``Table`` was returned.
    """
    query_string = parameters.get_required("query_string")
    database_name = parameters.get_required("database_name")
    table_name = parameters.get_required("table_name")
    output_path = parameters.get_optional("output_path")

    # TODO?: Sanitize the query string

    table, response = config.metastore().get_table(database_name, table_name)

    destination: Optional[Path] = None
    if output_path and isinstance(config.storage(), StorageClient):
        destination = config.storage().path(output_path)

    # Guard clause: the metastore lookup did not produce a usable table.
    if not isinstance(table, Table):
        return OperatorResponse(response, InvalidJob(table.message()))

    response.add_info(f"Running Query \"{query_string}\"")
    query_job = QueryJob(query_string, table, destination)
    outcome, response = config.execution().run_job(query_job, response)
    return OperatorResponse(response, outcome)
def tests(env: MasonEnvironment, config: Config, op: Operator):
    """Check the operator's ``with_status()`` output for three parameter sets."""

    def status_for(parameter_string: str):
        # Validate and run the operator, returning its (body, status) pair.
        operator_params = OperatorParameters(parameter_string=parameter_string)
        return op.validate(config, operator_params).run(env, Response()).with_status()

    cases = [
        # valid delete
        ("table_name:good_table,database_name:good_database",
         ({'Info': ['Table good_table successfully deleted.']}, 200)),
        # database DNE
        ("table_name:bad_table,database_name:bad_database",
         ({'Errors': ['Database bad_database not found.']}, 400)),
        # table DNE
        ("table_name:bad_table,database_name:good_database",
         ({'Errors': ['Table bad_table not found.']}, 400)),
    ]

    for parameter_string, expected in cases:
        assert status_for(parameter_string) == expected
def register_dag(self, schedule_name: str, valid_dag: ValidDag,
                 schedule: Optional[Schedule], response: Response):
    """Register a single-step ``TableInfer`` dag through ``register_schedule``.

    Only a dag with exactly one valid step whose operator type name is
    ``"TableInfer"`` is accepted; any other dag adds an error to
    ``response``.  The schedule is registered only when the operator's
    storage engine is a ``StorageClient``; otherwise an error carrying the
    engine's ``reason`` is added and ``register_schedule`` is not called
    (previously that path referenced an unbound ``storage_path``).

    Returns:
        ``(schedule_name, response, None)``.
    """
    steps = valid_dag.valid_steps

    # Short-circuit for glue crawler definition since glue as a scheduler is
    # only well defined for Table Infer Operator
    if len(steps) == 1 and steps[0].operator.type_name() == "TableInfer":
        op = steps[0].operator
        params = op.parameters
        db_name = params.get_required("database_name")
        storage_engine = op.config.storage()

        if isinstance(storage_engine, StorageClient):
            storage_path = storage_engine.path(
                params.get_required("storage_path"))
            response = self.register_schedule(db_name, storage_path,
                                              schedule_name, schedule,
                                              response)
        else:
            # Do not rebind ``response`` to add_error's return value: every
            # other call site treats add_error as an in-place mutation.
            response.add_error(
                f"Attempted to register_dag for invalid client: {storage_engine.reason}"
            )
    else:
        response.add_error(
            "Glue Scheduler only defined for TableInfer type which registers a glue crawler"
        )

    return (schedule_name, response, None)
def save_to(self, inpath: Path, outpath: Path, response: Response):
    """Upload ``inpath`` to ``outpath`` via ``self.client()``.

    Any exception raised by the upload is caught and recorded on
    ``response`` as two error entries: a summary line and the result of
    ``message(e)``.

    Returns:
        The same ``response`` object.
    """
    try:
        uploader = self.client()
        uploader.upload(inpath.path_str, outpath.path_str)
    except Exception as upload_error:
        summary = f"Error saving {inpath} to {outpath.path_str}"
        response.add_error(summary)
        response.add_error(message(upload_error))

    return response
def to_response(self, response: Response) -> Response:
    """Fold every invalid table, then this object's own error, into ``response``.

    Each entry of ``self.invalid_tables`` receives the response produced by
    the previous entry.  ``self.error`` is added last, and only when truthy.
    """
    merged = response
    for invalid_table in self.invalid_tables:
        merged = invalid_table.to_response(merged)

    own_error = self.error
    if own_error:
        merged.add_error(own_error)

    return merged
def save(self,
         state_store: MasonStateStore,
         overwrite: bool = False,
         response: Optional[Response] = None) -> Response:
    """Report this object's message as an error instead of saving anything.

    ``state_store`` and ``overwrite`` are accepted but not used here.

    Args:
        response: Response to add the error to.  A fresh ``Response`` is
            created per call when omitted.  The previous default was a
            single ``Response()`` built at definition time, so errors from
            unrelated calls accumulated in it.

    Returns:
        The response, with ``self.get_message()`` added as an error when
        that message is truthy.
    """
    if response is None:
        response = Response()

    message = self.get_message()
    if message:
        response.add_error(message)
    return response
def execute(self,
            env: MasonEnvironment,
            response: Response,
            dry_run: bool = True,
            run_now: bool = False,
            schedule_name: Optional[str] = None) -> Response:
    """Record why this operator is invalid and set the status to 400.

    ``env``, ``dry_run``, ``run_now`` and ``schedule_name`` are accepted but
    not used.

    Returns:
        The same ``response`` with one error entry and status 400.
    """
    failure_text = f"Invalid Operator. Reason: {self.reason}"
    response.add_error(failure_text)
    response.set_status(400)
    return response
def apply(file: str,
          overwrite: bool = False,
          log_level: Optional[str] = None,
          env: Optional[MasonEnvironment] = None):
    """Save every resource found in ``file`` and report the combined outcome.

    Uses ``env`` when it is truthy; otherwise a new ``MasonEnvironment`` is
    created and initialized.  The logger level is set from ``log_level``
    before any resource is loaded.

    Returns:
        ``response.with_status()`` after each resource's ``save`` has been
        applied to the running response.
    """
    if env:
        environment: MasonEnvironment = env
    else:
        environment = MasonEnvironment().initialize()

    logger.set_level(log_level)

    response = Response()
    resources = Resources(environment).get_all(file)
    for resource in resources:
        response = resource.save(environment.state_store, overwrite, response)

    return response.with_status()
def trigger_schedule(self, schedule_name: str, response: Response,
                     env: MasonEnvironment) -> Response:
    """Run the dag stored on this instance as a ``WorkflowRun``.

    ``schedule_name`` is accepted but not used.  When ``self.dag`` is falsy
    an error is added to ``response`` and nothing is run.

    Returns:
        The response from the workflow run, or the original response with
        the "Dag not found" error added.
    """
    registered_dag = self.dag

    # Guard clause: nothing has been registered yet.
    if not registered_dag:
        response.add_error("Dag not found. Run 'register_dag' first.")
        return response

    run = WorkflowRun(registered_dag)
    return run.run(env, response)
def tests(env: MasonEnvironment, config: Config, op: Operator):
    """Compare the operator's output with ``table.refresh`` for two tables."""
    cases = (
        # valid refresh
        ("table_name:catalog_poc_data,database_name:crawler-poc", False),
        # already refreshing
        ("table_name:catalog_poc_data_refreshing,database_name:crawler-poc", True),
    )

    for parameter_string, refresh_flag in cases:
        params = OperatorParameters(parameter_string=parameter_string)
        outcome = op.validate(config, params).run(env, Response())
        assert outcome.with_status() == table.refresh(refresh_flag)
def list_tables(
    self, database_name: str, response: Response
) -> Tuple[Result[TableList, InvalidTables], Response]:
    """List the tables of ``database_name`` through ``self.client().get_tables``.

    A ``ClientError`` from the client is not propagated; its ``response``
    payload is parsed in place of a successful result.  The raw result is
    always attached to ``response`` via ``add_response``.

    Returns:
        ``(Success(TableList), response)`` when the parsed status is 2xx
        and at least one valid table was parsed, otherwise
        ``(Failure(InvalidTables), response)``:

        * parsed error ``"EntityNotFoundException"``: status set to 404;
        * 2xx with no valid tables: status left untouched;
        * any other status: status copied from the parsed result and the
          parsed message used as the error text.
    """
    try:
        result = self.client().get_tables(DatabaseName=database_name)
    except ClientError as e:
        result = e.response

    response.add_response(result)
    error, status, message = self.parse_response(result)

    if error == "EntityNotFoundException":
        response.set_status(404)
        not_found = InvalidTables([], f"Database {database_name} not found")
        return Failure(not_found), response

    if 200 <= status < 300:
        valid: List[Table]
        valid, invalid = self.parse_table_list_data(
            result, Path(database_name, "glue"), database_name)
        if len(valid) > 0:
            response.set_status(status)
            return Success(TableList(valid)), response
        return Failure(InvalidTables([], "No Valid tables found")), response

    response.set_status(status)
    # The message belongs in the second argument, as at the other call
    # sites; passing it first put a string where a list of invalid tables
    # is expected.
    return Failure(InvalidTables([], message)), response
def tests(env: MasonEnvironment, config: Config, op: Operator):
    """Compare the operator's output with ``table.index`` for two databases."""

    def run_for(parameter_string: str):
        # Validate the parameters and run the operator on a fresh Response.
        params = OperatorParameters(parameter_string=parameter_string)
        return op.validate(config, params).run(env, Response())

    # Database Exists
    found = run_for("database_name:crawler-poc")
    assert found.with_status() == table.index(
        config.metastore().client.name())

    # Database DNE
    missing = run_for("database_name:bad-database")
    assert missing.with_status() == table.index(
        config.metastore().client.name(), False)
def run(
    self, config: SparkConfig, job: Job, resp: Optional[Response] = None
) -> Tuple[Union[ExecutedJob, InvalidJob], Response]:
    """Write the merged job config to a YAML file and ``kubectl apply`` it.

    The job is given an id of the form ``mason-<job.type>-<uuid4>`` before
    the config is merged.  The manifest is written to a named temporary
    file, which is closed before kubectl runs so its contents are on disk,
    and removed once the call has returned.  Previously the file was
    created with ``delete=False`` and never removed.

    Args:
        resp: Response to add info to; a new ``Response`` is used when this
            is falsy.

    Returns:
        ``(final, response)`` where ``final`` is ``job.running(stdout)``
        when stdout is non-empty, ``job.errored(stderr)`` when only stderr
        is non-empty, and ``job.running()`` when both are empty.
    """
    import os  # function-scope: only needed to remove the manifest

    # TODO: Replace with python kubernetes api
    # TODO: Set up kubernetes configuration, run on docker version
    response: Response = resp or Response()

    job.set_id("mason" + "-" + job.type + "-" + str(uuid4()))
    merged_config = merge_config(config, job)
    job_id = merged_config["metadata"]["name"]
    conf = dict(merged_config)

    final: Union[ExecutedJob, InvalidJob]

    manifest = tempfile.NamedTemporaryFile(delete=False, mode='w')
    try:
        # Leaving the ``with`` closes the file, which flushes the YAML.
        with manifest:
            yaml.dump(conf, manifest)

        command = ["kubectl", "apply", "-f", manifest.name]
        response.add_info(
            f"Executing Spark Kubernetes Operator. job_id: {job_id}")
        stdout, stderr = run_sys_call(command)
    finally:
        try:
            os.remove(manifest.name)
        except OSError:
            # Best-effort cleanup: a leftover manifest must not mask the
            # outcome of the kubectl call.
            pass

    if len(stdout) > 0:
        final = job.running(stdout)
    elif len(stderr) > 0:
        final = job.errored(stderr)
    else:
        final = job.running()

    return final, response
def run(self,
        job: Job,
        resp: Optional[Response] = None,
        mode: str = "async"
        ) -> Tuple[Union[ExecutedJob, InvalidJob], Response]:
    """Dispatch ``job`` to ``self.run_job`` according to its type.

    * No scheduler configured: ``InvalidJob("Dask Scheduler not defined")``.
    * ``FormatJob``: ``run_job`` is called with ``mode``; when it returns a
      falsy value a default ``ExecutedJob`` describing the queued format
      job is used instead.
    * ``QueryJob``: ``run_job`` is called without ``mode``.
    * Any other job type: ``job.errored(...)``.

    An ``OSError`` raised during dispatch is turned into an ``InvalidJob``
    built from ``message(e)``.

    Returns:
        ``(final, response)``, where ``response`` is ``resp`` or a new
        ``Response`` when ``resp`` is falsy.
    """
    outcome: Union[ExecutedJob, InvalidJob]
    response: Response = resp or Response()

    try:
        if not self.scheduler:
            outcome = InvalidJob("Dask Scheduler not defined")
        elif isinstance(job, FormatJob):
            submitted = self.run_job(job.type, job.spec(), self.scheduler,
                                     mode)
            if submitted:
                outcome = submitted
            else:
                outcome = ExecutedJob(
                    "format_job",
                    f"Job queued to format {job.table.schema.type} table as {job.format} and save to {job.output_path.path_str}"
                )
        elif isinstance(job, QueryJob):
            outcome = self.run_job(job.type, job.spec(), self.scheduler)
        else:
            outcome = job.errored("Job type not supported for Dask")
    except OSError as os_error:
        outcome = InvalidJob(message(os_error))

    return outcome, response
def tests(env: MasonEnvironment, config: Config, op: Operator):
    """Check the job type the operator produces for two ``format`` values."""

    def job_for(parameter_string: str):
        # Validate, run, and return the ``object`` of the operator response.
        params = OperatorParameters(parameter_string=parameter_string)
        return op.validate(config, params).run(env, Response()).object

    rejected = job_for(
        "database_name:mason-sample-data,table_name:tests/in/csv/,format:boogo,output_path:mason-sample-data/tests/out/csv/"
    )
    assert isinstance(rejected, InvalidJob)

    accepted = job_for(
        "database_name:mason-sample-data,table_name:tests/in/csv/,format:csv,output_path:good_output_path"
    )
    assert isinstance(accepted, ExecutedJob)
def test_local_client(self):
    """Run the ``workflow_local_scheduler`` workflow and compare its info log.

    Loads config "8", validates the workflow with six identically
    parameterized steps, runs it, and asserts that the response has no
    errors and that its joined info lines equal ``info`` once both sides
    have passed through ``clean_string`` and ``clean_uuid``.
    """
    base.set_log_level()
    env = self.before()
    config = Resources(env).get_config("8")

    # NOTE(review): the original comment here read "DAG has cycle", yet the
    # assertions below expect a run with no errors -- confirm the intent.
    step_params = {"config_id": "8", "parameters": {"test_param": "test"}}
    # Every step shares the same config id and parameters.
    params = {
        "step_1": step_params,
        "step_2": step_params,
        "step_3": step_params,
        "step_4": step_params,
        "step_5": step_params,
        "step_6": step_params,
    }

    wf = self.get_workflow(env, "workflow_local_scheduler")

    # Fail loudly when either resource could not be loaded.
    if isinstance(wf, MalformedResource):
        raise Exception(f"Workflow not found: {wf.get_message()}")

    if isinstance(config, MalformedResource):
        raise Exception(f"Config not found: {config.get_message()}")

    parameters = WorkflowParameters(parameter_dict=params)
    validated = wf.validate(env, config, parameters)

    assert (isinstance(validated, ValidWorkflow))
    operator_response = validated.run(env, Response())

    # Expected info log.  NOTE(review): the literal is kept exactly as it
    # appears in this view of the file; presumably whitespace inside it is
    # irrelevant because both sides go through clean_string -- confirm.
    info = """ Registering workflow dag test_workflow_local_scheduler_ea5b602c-261c-4e06-af21-375ea912b6a5 with local. Registering DAG in local memory Registered schedule test_workflow_local_scheduler_ea5b602c-261c-4e06-af21-375ea912b6a5 Triggering schedule: test_workflow_local_scheduler_ea5b602c-261c-4e06-af21-375ea912b6a5 Running dag * step_1 | * step_2 | | * step_3 | |/ * | step_4 |/ * step_5 * step_6 Running step step_1 Running operator1 Running step step_2 Running operator2 Running step step_3 Running operator3 Running step step_4 Running operator4 Running step step_5 Running operator5 Running step step_6 Running operator6 """
    response = operator_response.response
    assert (len(response.errors) == 0)
    assert (clean_uuid(clean_string("\n".join(
        response.info))) == clean_uuid(clean_string(info)))
def tests(env: MasonEnvironment, config: Config, op: Operator):
    """Check the operator's ``with_status()`` output for two schedule names."""
    cases = [
        # valid delete
        ("schedule_name:good_schedule",
         ({'Info': ['Schedule good_schedule successfully deleted.']}, 200)),
        # dne
        ("schedule_name:bad_schedule",
         ({'Errors': ["Crawler entry with name bad_schedule does not exist"]},
          400)),
    ]

    for parameter_string, expected in cases:
        params = OperatorParameters(parameter_string=parameter_string)
        outcome = op.validate(config, params).run(env, Response())
        assert outcome.with_status() == expected
def execute_ddl(
    self,
    ddl: DDLStatement,
    database: Database,
    response: Optional[Response] = None
) -> Tuple[Union[ExecutedJob, InvalidJob], Response]:
    """Report that ``execute_ddl`` is not implemented for this client.

    ``ddl`` and ``database`` are accepted but not used.

    Returns:
        An ``InvalidJob`` paired with ``response``, or with a new
        ``Response`` when ``response`` is falsy.
    """
    not_implemented = InvalidJob("Client 'execute_ddl' not implemented")
    outgoing = response or Response()
    return not_implemented, outgoing
def print_response(self, response: Response):
    """Log the response status, then the response body as highlighted JSON.

    The body is dumped with 4-space indentation and sorted keys.  Dates and
    datetimes are written in ISO format.  Any other object the encoder
    cannot handle falls through the fallback below, which returns ``None``,
    so it is emitted as JSON ``null``.
    """

    def _encode_dates(value):
        # Fallback handed to json.dumps for values it cannot serialize.
        if isinstance(value, (datetime.date, datetime.datetime)):
            return value.isoformat()
        return None

    body, status = response.with_status()
    logger.info(f"Response status: {status}")

    rendered = json.dumps(body,
                          indent=4,
                          sort_keys=True,
                          default=_encode_dates)
    logger.info(highlight(rendered, JsonLexer(), TerminalFormatter()))
def tests(env: MasonEnvironment, config: Config, op: Operator):
    """Check the query operator's output per ``config.id`` for two databases."""
    query = "SELECT * from $table limit 3"
    output_path = from_root("/.tmp/")

    def run_against(database_name: str):
        # Same query, table and output path every time; only the database
        # name differs between the scenarios.
        params = OperatorParameters(
            parameter_string=
            f"query_string:{query},database_name:{database_name},table_name:good_table,output_path:{output_path}"
        )
        return op.validate(config, params).run(env, Response())

    # valid query
    valid_result = run_against("good_database")
    info_by_config = {
        "1": [
            'Running Query "SELECT * from $table limit 3"',
            'Running Athena query. query_id: test',
            'Running job id=test'
        ],
        "4": [
            f'Table succesfully formatted as parquet and exported to {output_path}'
        ]
    }
    expected_valid = ({'Info': info_by_config[config.id]}, 200)
    assert valid_result.with_status() == expected_valid

    # bad permissions
    denied_result = run_against("access_denied")
    denied_by_config = {
        "1": ({
            'Errors': [
                'Job errored: Access denied for credentials. Ensure associated user or role has permission to CreateNamedQuery on athena'
            ],
            'Info': ['Running Query "SELECT * from $table limit 3"']
        }, 403),
        "4": ({
            'Info': [
                f'Table succesfully formatted as parquet and exported to {output_path}'
            ]
        }, 200)
    }
    assert denied_result.with_status() == denied_by_config[config.id]
def get_database(
    self,
    database_name: str,
    response: Optional[Response] = None
) -> Tuple[Result[Database, InvalidDatabase], Response]:
    """Build a database result from the tables listed for ``database_name``.

    The result of ``self.list_tables`` is mapped to
    ``Database("s3_table", tables)``; its alternative is mapped to an
    ``InvalidDatabase`` built from the failure's ``error`` or, when that is
    falsy, its ``message()``.

    Returns:
        ``(database, response)`` where ``response`` is the one returned by
        ``list_tables``.
    """

    def as_database(table_list):
        return Database("s3_table", table_list)

    def as_invalid(invalid_tables):
        return InvalidDatabase(invalid_tables.error
                               or invalid_tables.message())

    listed, response = self.list_tables(database_name, response
                                        or Response())
    database = listed.map(as_database).alt(as_invalid)
    return database, response
def run(self, env: MasonEnvironment, response: Response) -> Response:
    """Step through the dag until finished and merge all step responses.

    An info entry with the dag's ``display()`` output is added first.  After
    the stepping loop ends, the responses of the executed steps are merged
    in sorted step order.  When any invalid steps were recorded, a
    "Workflow failed" error plus each step's ``reason`` is added and the
    status is set to 400.
    """
    response.add_info(f"Running dag \n{self.dag.display()}")

    while True:
        if self.finished():
            break
        self.step(env)

    for executed in sorted(self.executed_steps):
        response = response.merge(executed.operator_response.response)

    # Guard clause: no invalid steps means nothing more to report.
    if len(self.invalid_steps) == 0:
        return response

    response.add_error("Workflow failed")
    for invalid_step in self.invalid_steps:
        response.add_error(invalid_step.reason)
    response.set_status(400)
    return response
def tests(env: MasonEnvironment, config: Config, op: Operator):
    """Check the operator's output for a bad database, a bad path and a valid path."""

    def run_with(parameter_string: str):
        # Validate the parameters and run the operator on a fresh Response.
        params = OperatorParameters(parameter_string=parameter_string)
        return op.validate(config, params).run(env, Response())

    def clean(s: List[str]):
        return [clean_uuid(clean_string(item)) for item in s]

    # database DNE
    missing_db = run_with(
        "database_name:bad-database,storage_path:crawler-poc/catalog_poc_data"
    )
    assert missing_db.with_status() == ({
        'Errors': ['Job errored: Metastore database bad-database not found'],
        'Info': ['Table inferred: catalog_poc_data']
    }, 404)

    # bad path
    bad_path = run_with(
        "database_name:crawler-poc,storage_path:crawler-poc/bad-table")
    assert bad_path.with_status() == ({
        'Errors': [
            'No keys at s3://crawler-poc/bad-table',
            'Job errored: Invalid Tables: No keys at s3://crawler-poc/bad-table'
        ]
    }, 404)

    # valid path
    valid_path = run_with(
        "database_name:crawler-poc,storage_path:crawler-poc/catalog_poc_data,output_path:crawler-poc/athena/"
    )
    infos = clean(valid_path.formatted()["Info"])
    expect = [
        'Tableinferred:catalog_poc_data',
        'RunningAthenaquery.query_id:test_id',
        'Runningjobid=test_id'
    ]
    assert infos == expect
def list_objects(self, database_name: str,
                 response: Response) -> Tuple[Result[dict, str], Response]:
    """List objects under ``database_name`` with ``/`` as the delimiter.

    ``database_name`` is split once on ``/``: the first part is passed as
    the bucket, and the second part (or ``'/'`` when ``get`` yields a falsy
    value) as the prefix.

    Returns:
        ``(Success(result), response)`` with the raw result attached to
        ``response``, or ``(Failure(text), response)`` on any exception.  A
        ``ClientError`` whose error code is ``"NoSuchBucket"`` also sets
        the status to 404 and yields a bucket-specific message; every other
        exception yields ``message(e)``.
    """
    try:
        bucket_and_prefix = database_name.split("/", 1)
        listing = self.client().s3.list_objects(
            Bucket=bucket_and_prefix[0],
            Prefix=(get(bucket_and_prefix, 1) or '/'),
            Delimiter='/')
        response.add_response(listing)
        return Success(listing), response
    except ClientError as client_error:
        error_code = client_error.response.get("Error", {}).get("Code", "")
        if error_code == "NoSuchBucket":
            response.set_status(404)
            return Failure(
                f"The specified bucket does not exist: {database_name}"
            ), response
        return Failure(message(client_error)), response
    except Exception as other_error:
        return Failure(message(other_error)), response
def execute(self,
            env: MasonEnvironment,
            response: Response,
            dry_run: bool = True) -> OperatorResponse:
    """Load this operator's module and run it, or only report it as valid.

    * Module is not an ``OperatorDefinition``: an error containing
      ``module.reason`` is added.
    * ``dry_run`` is true: an info entry is added and nothing is run.
    * Otherwise the module's ``run`` result is returned.

    A ``ModuleNotFoundError`` raised anywhere in that sequence is recorded
    as an error on ``response`` instead of being propagated.

    Returns:
        The ``OperatorResponse`` from the module run, or one wrapping
        ``response`` in every other case.
    """
    try:
        module = self.module(env)

        if not isinstance(module, OperatorDefinition):
            response.add_error(
                f"Module does not contain a valid OperatorDefinition. See /examples for sample operator implementations. \nMessage: {module.reason}"
            )
            return OperatorResponse(response)

        if dry_run:
            response.add_info(
                f"Valid Operator: {self.namespace}:{self.command} with specified parameters."
            )
            return OperatorResponse(response)

        ran: OperatorResponse = module.run(env, self.config,
                                           self.parameters, response)
        return ran
    except ModuleNotFoundError as missing:
        response.add_error(f"Module Not Found: {missing}")
        return OperatorResponse(response)
def tests(env: MasonEnvironment, config: Config, op: Operator):
    """Compare the operator's output with ``table.get`` for three lookups."""
    cases = (
        # Database and table Exist
        ("database_name:crawler-poc,table_name:catalog_poc_data", 1),
        # Database DNE
        ("database_name:bad-database,table_name:catalog_poc_data", 2),
        # Table DNE
        ("database_name:crawler-poc,table_name:bad-table", 3),
    )

    for parameter_string, expected_index in cases:
        params = OperatorParameters(parameter_string=parameter_string)
        outcome = op.validate(config, params).run(env, Response())
        assert outcome.with_status() == table.get(
            config.metastore().client.name(), expected_index)
def list_keys(
    self,
    path: str,
    response: Optional[Response] = None
) -> Tuple[List[Path], Response]:
    """Find the keys under ``path`` and convert each one with ``self.get_path``.

    The raw keys are attached to the response as ``{'keys': keys}``.

    Returns:
        ``(paths, response)``; ``paths`` is empty when no keys were found,
        and ``response`` is a new ``Response`` when the argument is falsy.
    """
    resp: Response = response or Response()

    found = self.client().find(path)
    resp.add_response({'keys': found})

    paths = [self.get_path(key) for key in found] if len(found) > 0 else []
    return paths, resp
def to_response(self, response: Response) -> Response:
    """Copy this object's message and logs onto ``response``.

    A non-empty ``self.message`` is added as info.  Each entry of
    ``self.logs`` is added as info when it is a ``str`` and as data
    otherwise.
    """
    if self.message != "":
        response.add_info(self.message)

    for entry in self.logs:
        # Strings are info lines; anything else is recorded as data.
        record = response.add_info if isinstance(entry, str) else response.add_data
        record(entry)

    return response
def parse_response(
        result: dict,
        response: Response) -> Result[TableList, InvalidTables]:
    """Turn a listing result into a table list or a failure.

    ``self`` and ``database_name`` are not parameters; they are taken from
    the enclosing scope.

    * ``"Contents"`` present: every entry with a truthy ``"Key"`` is looked
      up through ``self.get_table``; succeeds when at least one lookup
      yields a ``Table``, otherwise fails with all collected invalid tables.
    * Only ``"CommonPrefixes"`` present: each prefix is added to
      ``response`` as data and a failure suggesting a deeper key is
      returned.
    * Neither present: fails with "No Data returned from AWS".
    """
    contents: Optional[List[dict]] = result.get("Contents")
    prefixes: Optional[List[dict]] = result.get("CommonPrefixes")

    if not contents:
        if prefixes:
            for prefix in prefixes:
                response.add_data(prefix)
            return Failure(
                InvalidTables(
                    [],
                    f"No valid tables at {database_name}. Try appending '/' or specify deeper key."
                ))
        return Failure(InvalidTables([], "No Data returned from AWS"))

    looked_up: List[Union[Table, InvalidTables]] = []
    for entry in contents:
        key: Optional[str] = entry.get("Key")
        if not key:
            continue
        table, response = self.get_table(database_name.split("/")[0],
                                         key,
                                         response=response)
        looked_up.append(table)

    valid, invalid = sequence(looked_up, Table, InvalidTables)
    if len(valid) > 0:
        return Success(TableList(valid))

    # Flatten the invalid tables carried by every failed lookup.
    rejected: List[InvalidTable] = [
        invalid_table for group in invalid
        for invalid_table in group.invalid_tables
    ]
    return Failure(
        InvalidTables(rejected, f"No valid tables at {database_name}"))