def test_is_emr_step_complete(emr_cluster_config):
    """Poll ``is_emr_step_complete`` against a mocked ``describe_step``.

    The mock reports the step as PENDING, RUNNING, COMPLETED and finally FAILED.
    The first two polls must report "not complete", the third "complete", and the
    fourth must raise ``EmrError`` mentioning "step failed".
    """
    context = create_test_pipeline_execution_context()
    job_runner = EmrJobRunner(region=REGION, check_cluster_every=1)
    cluster_id = job_runner.run_job_flow(context.log, emr_cluster_config)

    step_name = "test_step"
    step_cmd = ["ls", "/"]
    step_definition = job_runner.construct_step_dict_for_command(step_name, step_cmd)
    emr_step_id = job_runner.add_job_flow_steps(context.log, cluster_id, [step_definition])[0]

    def describe_step_response(state):
        # Canned describe_step payload for the submitted step in the given state.
        config = {"Jar": "command-runner.jar", "Properties": {}, "Args": step_cmd}
        status = {
            "State": state,
            "StateChangeReason": {"Message": "everything is hosed"},
            "Timeline": {"StartDateTime": _boto3_now()},
        }
        return {
            "Step": {
                "Id": emr_step_id,
                "Name": step_name,
                "Config": config,
                "ActionOnFailure": "CONTINUE",
                "Status": status,
            },
        }

    # One canned response per poll, consumed in order by the mock.
    responses = [
        describe_step_response(state)
        for state in ("PENDING", "RUNNING", "COMPLETED", "FAILED")
    ]

    def poll():
        return job_runner.is_emr_step_complete(context.log, cluster_id, emr_step_id)

    with mock.patch.object(EmrJobRunner, "describe_step", side_effect=responses):
        # PENDING, then RUNNING: not finished yet.
        assert not poll()
        assert not poll()
        # COMPLETED
        assert poll()

        # FAILED surfaces as an EmrError.
        with pytest.raises(EmrError) as exc_info:
            poll()
        assert "step failed" in str(exc_info.value)
def test_is_emr_step_complete(emr_cluster_config):
    '''Drive ``is_emr_step_complete`` through every step state using a mocked ``describe_step``.

    Expected outcomes per mocked state: PENDING -> falsy, RUNNING -> falsy,
    COMPLETED -> truthy, FAILED -> ``EmrError`` whose message contains 'step failed'.
    '''
    context = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION, check_cluster_every=1)
    cluster_id = runner.run_job_flow(context.log, emr_cluster_config)

    step_name = 'test_step'
    step_cmd = ['ls', '/']
    submitted_step_ids = runner.add_job_flow_steps(
        context.log,
        cluster_id,
        [runner.construct_step_dict_for_command(step_name, step_cmd)],
    )
    emr_step_id = submitted_step_ids[0]

    def fake_describe_step_result(step_state):
        # Same shape for every state; only Status.State varies.
        step = {
            'Id': emr_step_id,
            'Name': step_name,
            'Config': {'Jar': 'command-runner.jar', 'Properties': {}, 'Args': step_cmd},
            'ActionOnFailure': 'CONTINUE',
            'Status': {
                'State': step_state,
                'StateChangeReason': {'Message': 'everything is hosed'},
                'Timeline': {'StartDateTime': _boto3_now()},
            },
        }
        return {'Step': step}

    state_sequence = ['PENDING', 'RUNNING', 'COMPLETED', 'FAILED']
    describe_step_returns = [fake_describe_step_result(state) for state in state_sequence]

    def is_complete():
        return runner.is_emr_step_complete(context.log, cluster_id, emr_step_id)

    with mock.patch.object(EmrJobRunner, 'describe_step', side_effect=describe_step_returns):
        pending_result = is_complete()
        assert not pending_result

        running_result = is_complete()
        assert not running_result

        completed_result = is_complete()
        assert completed_result

        with pytest.raises(EmrError) as exc_info:
            is_complete()
        assert 'step failed' in str(exc_info.value)
def test_emr_wait_for_step(emr_cluster_config):
    '''Check ``wait_for_emr_steps_to_complete`` for a step that succeeds and one that fails.

    ``describe_step`` is patched to report PENDING, then RUNNING, then a configurable final
    state. With a final state of COMPLETED the wait must return normally; with FAILED it
    must raise ``EmrError`` whose message contains 'step failed'.
    '''
    context = create_test_pipeline_execution_context()
    emr = EmrJobRunner(region=REGION, check_cluster_every=1)

    cluster_id = emr.run_job_flow(context.log, emr_cluster_config)

    step_name = 'test_step'
    step_cmd = ['ls', '/']
    step_ids = emr.add_job_flow_steps(
        context.log, cluster_id, [emr.construct_step_dict_for_command(step_name, step_cmd)]
    )

    def get_step_dict(step_id, step_state):
        # Canned describe_step payload for `step_id` in state `step_state`.
        return {
            'Step': {
                'Id': step_id,
                'Name': step_name,
                'Config': {'Jar': 'command-runner.jar', 'Properties': {}, 'Args': step_cmd},
                'ActionOnFailure': 'CONTINUE',
                'Status': {
                    'State': step_state,
                    'StateChangeReason': {'Message': 'everything is hosed'},
                    'Timeline': {'StartDateTime': _boto3_now()},
                },
            },
        }

    # Mutable state shared with the patched describe_step below.
    calls = {'num_calls': 0, 'final_state': 'COMPLETED'}

    # States reported on the first and second call; every later call reports the final state.
    early_states = {1: 'PENDING', 2: 'RUNNING'}

    def new_describe_step(_, cluster_id, step_id):
        # Patched onto the class, so the first positional argument is the runner instance.
        # `cluster_id` is accepted only to match the real signature.
        calls['num_calls'] += 1
        state = early_states.get(calls['num_calls'], calls['final_state'])
        return get_step_dict(step_id, state)

    # Happy path: PENDING -> RUNNING -> COMPLETED.
    with mock.patch.object(EmrJobRunner, 'describe_step', new=new_describe_step):
        emr.wait_for_emr_steps_to_complete(context.log, cluster_id, step_ids)

    # Failure path: reset the call counter and end in FAILED instead.
    calls['num_calls'] = 0
    calls['final_state'] = 'FAILED'
    with pytest.raises(EmrError) as exc_info:
        with mock.patch.object(EmrJobRunner, 'describe_step', new=new_describe_step):
            emr.wait_for_emr_steps_to_complete(context.log, cluster_id, step_ids)
    assert 'step failed' in str(exc_info.value)
class EmrPySparkStepLauncher(StepLauncher):
    '''Step launcher that runs a step on an existing EMR cluster via ``spark-submit``.

    Artifacts (the EMR main file, optionally a zip of the local pipeline package, and the
    pickled step run ref) are uploaded under ``staging_prefix`` in ``staging_bucket``. While
    the EMR step runs, pickled events found at the same S3 location are read and yielded.
    '''

    def __init__(
        self,
        region_name,
        staging_bucket,
        staging_prefix,
        wait_for_logs,
        action_on_failure,
        cluster_id,
        spark_config,
        local_pipeline_package_path,
        deploy_local_pipeline_package,
        s3_pipeline_package_path=None,
    ):
        '''Validate and store the launcher configuration.

        Args:
            region_name (str): AWS region used for the boto3 S3 handles and the EmrJobRunner.
            staging_bucket (str): S3 bucket that artifacts are uploaded to.
            staging_prefix (str): Key prefix inside ``staging_bucket`` for artifacts.
            wait_for_logs (bool): If True, retrieve and log the step's stdout/stderr once
                the EMR step has completed.
            action_on_failure (str): Passed through to the EMR step definition.
            cluster_id (str): ID of the EMR cluster that steps are added to.
            spark_config: Spark configuration; flattened into ``spark-submit`` arguments.
            local_pipeline_package_path (str): Local path of the pipeline package; zipped and
                uploaded when ``deploy_local_pipeline_package`` is True.
            deploy_local_pipeline_package (bool): Whether to zip and upload the local package.
            s3_pipeline_package_path (Optional[str]): Stored on the instance; must not be set
                together with ``deploy_local_pipeline_package``.
        '''
        self.region_name = check.str_param(region_name, 'region_name')
        self.staging_bucket = check.str_param(staging_bucket, 'staging_bucket')
        self.staging_prefix = check.str_param(staging_prefix, 'staging_prefix')
        self.wait_for_logs = check.bool_param(wait_for_logs, 'wait_for_logs')
        self.action_on_failure = check.str_param(action_on_failure, 'action_on_failure')
        self.cluster_id = check.str_param(cluster_id, 'cluster_id')
        self.spark_config = spark_config

        # The two package sources are mutually exclusive.
        check.invariant(
            not deploy_local_pipeline_package or not s3_pipeline_package_path,
            'If deploy_local_pipeline_package is set to True, s3_pipeline_package_path should not '
            'also be set.',
        )

        self.local_pipeline_package_path = check.str_param(
            local_pipeline_package_path, 'local_pipeline_package_path'
        )
        self.deploy_local_pipeline_package = check.bool_param(
            deploy_local_pipeline_package, 'deploy_local_pipeline_package'
        )
        self.s3_pipeline_package_path = check.opt_str_param(
            s3_pipeline_package_path, 's3_pipeline_package_path'
        )

        self.emr_job_runner = EmrJobRunner(region=self.region_name)

    def _post_artifacts(self, log, step_run_ref, run_id, step_key):
        '''
        Synchronize the step run ref and pyspark code to an S3 staging bucket for use on EMR.

        For the zip file, consider the following toy example:

            # Folder: my_pyspark_project/
            # a.py
            def foo():
                print(1)

            # b.py
            def bar():
                print(2)

            # main.py
            from a import foo
            from b import bar

            foo()
            bar()

        This will zip up `my_pyspark_project/` as `my_pyspark_project.zip`. Then, when running
        `spark-submit --py-files my_pyspark_project.zip emr_step_main.py` on EMR this will
        print 1, 2.
        '''
        with seven.TemporaryDirectory() as temp_dir:
            s3 = boto3.client('s3', region_name=self.region_name)

            def _upload_file_to_s3(local_path, s3_filename):
                # Upload one local file to this run/step's artifact location.
                key = self._artifact_s3_key(run_id, step_key, s3_filename)
                s3_uri = self._artifact_s3_uri(run_id, step_key, s3_filename)
                log.debug(
                    'Uploading file {local_path} to {s3_uri}'.format(
                        local_path=local_path, s3_uri=s3_uri
                    )
                )
                s3.upload_file(Filename=local_path, Bucket=self.staging_bucket, Key=key)

            # Upload main file.
            # The remote Dagster installation should also have the file, but locating it there
            # could be a pain.
            main_local_path = self._main_file_local_path()
            _upload_file_to_s3(main_local_path, self._main_file_name())

            if self.deploy_local_pipeline_package:
                # Zip and upload package containing pipeline
                zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)
                build_pyspark_zip(zip_local_path, self.local_pipeline_package_path)
                _upload_file_to_s3(zip_local_path, CODE_ZIP_NAME)

            # Create step run ref pickle file
            step_run_ref_local_path = os.path.join(temp_dir, PICKLED_STEP_RUN_REF_FILE_NAME)
            with open(step_run_ref_local_path, 'wb') as step_pickle_file:
                pickle.dump(step_run_ref, step_pickle_file)

            _upload_file_to_s3(step_run_ref_local_path, PICKLED_STEP_RUN_REF_FILE_NAME)

    def launch_step(self, step_context, prior_attempts_count):
        '''Upload artifacts, add the step to the EMR cluster, and yield its events.

        Each event read back from S3 is passed to ``log_step_event`` and then yielded. When
        ``wait_for_logs`` is set, the step's logs are retrieved and logged after the EMR step
        has completed.
        '''
        step_run_ref = step_context_to_step_run_ref(
            step_context, prior_attempts_count, self.local_pipeline_package_path
        )

        run_id = step_context.pipeline_run.run_id
        log = step_context.log

        step_key = step_run_ref.step_key
        self._post_artifacts(log, step_run_ref, run_id, step_key)

        emr_step_def = self._get_emr_step_def(run_id, step_key, step_context.solid.name)
        # Only one step definition is submitted, so take the single returned step ID.
        emr_step_id = self.emr_job_runner.add_job_flow_steps(
            log, self.cluster_id, [emr_step_def]
        )[0]

        s3 = boto3.resource('s3', region_name=self.region_name)
        for event in self.wait_for_completion(log, s3, run_id, step_key, emr_step_id):
            log_step_event(step_context, event)
            yield event

        if self.wait_for_logs:
            self._log_logs_from_s3(log, emr_step_id)

    def wait_for_completion(self, log, s3, run_id, step_key, emr_step_id, check_interval=15):
        '''We want to wait for the EMR steps to complete, and while that's happening, we want to
        yield any events that have been written to S3 for us by the remote process.
        After the EMR steps complete, we want a final chance to fetch events before finishing
        the step.

        Args:
            check_interval (int): Seconds to sleep before each poll of the EMR step state.
        '''
        done = False
        all_events = []
        while not done:
            time.sleep(check_interval)  # AWS rate-limits us if we poll it too often
            done = self.emr_job_runner.is_emr_step_complete(log, self.cluster_id, emr_step_id)

            # The events file is re-read in full on every poll; only the tail that has not
            # been yielded yet is emitted.
            all_events_new = self.read_events(s3, run_id, step_key)
            if len(all_events_new) > len(all_events):
                for event in all_events_new[len(all_events):]:
                    yield event
                all_events = all_events_new

    def read_events(self, s3, run_id, step_key):
        '''Return the list of events unpickled from this step's events file in S3.

        Returns an empty list when the events object does not exist yet; any other
        ``ClientError`` is re-raised.
        '''
        events_s3_obj = s3.Object(  # pylint: disable=no-member
            self.staging_bucket,
            self._artifact_s3_key(run_id, step_key, PICKLED_EVENTS_FILE_NAME),
        )

        try:
            events_data = events_s3_obj.get()['Body'].read()
            # NOTE(security): unpickling executes arbitrary code if the staging bucket
            # contents are not trusted. Access to the bucket must be restricted accordingly.
            return pickle.loads(events_data)
        except ClientError as ex:
            # The file might not be there yet, which is fine
            if ex.response['Error']['Code'] == 'NoSuchKey':
                return []
            # Bare raise keeps the original traceback intact.
            raise

    def _log_logs_from_s3(self, log, emr_step_id):
        '''Retrieves the logs from the remote PySpark process that EMR posted to S3 and logs
        them to the given log.'''
        stdout_log, stderr_log = self.emr_job_runner.retrieve_logs_for_step_id(
            log, self.cluster_id, emr_step_id
        )
        # Since stderr is YARN / Hadoop Log4J output, parse and reformat those log lines for
        # Dagster's logging system.
        records = parse_hadoop_log4j_records(stderr_log)
        for record in records:
            log._log(  # pylint: disable=protected-access
                record.level, record.logger + ': ' + record.message, {}
            )
        log.info(stdout_log)

    def _get_emr_step_def(self, run_id, step_key, solid_name):
        '''From the local Dagster instance, construct EMR steps that will kick off execution on a
        remote EMR cluster.

        Returns the step dict built by ``EmrJobRunner.construct_step_dict_for_command`` for a
        ``spark-submit`` invocation of the staged main file.
        '''
        action_on_failure = self.action_on_failure

        # Execute Solid via spark-submit
        conf = dict(flatten_dict(self.spark_config))
        # Default the Spark application name to the solid name unless configured explicitly.
        conf.setdefault('spark.app.name', solid_name)

        check.invariant(
            conf.get('spark.master', 'yarn') == 'yarn',
            desc='spark.master is configured as %s; cannot set Spark master on EMR to anything '
            'other than "yarn"' % conf.get('spark.master'),
        )

        # NOTE(review): --py-files always points at the staged CODE_ZIP_NAME artifact, which
        # _post_artifacts only uploads when deploy_local_pipeline_package is True, and
        # s3_pipeline_package_path is not consulted here -- confirm this is intended.
        command = (
            [
                EMR_SPARK_HOME + 'bin/spark-submit',
                '--master',
                'yarn',
                '--deploy-mode',
                conf.get('spark.submit.deployMode', 'client'),
            ]
            + format_for_cli(list(flatten_dict(conf)))
            + [
                '--py-files',
                self._artifact_s3_uri(run_id, step_key, CODE_ZIP_NAME),
                self._artifact_s3_uri(run_id, step_key, self._main_file_name()),
                # Arguments to the main file: bucket and key of the pickled step run ref.
                self.staging_bucket,
                self._artifact_s3_key(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),
            ]
        )

        return EmrJobRunner.construct_step_dict_for_command(
            'Execute Solid %s' % solid_name, command, action_on_failure=action_on_failure
        )

    def _main_file_name(self):
        '''Return the base file name of the EMR main file.'''
        return os.path.basename(self._main_file_local_path())

    def _main_file_local_path(self):
        '''Return the local path of the ``emr_step_main`` module file.'''
        return emr_step_main.__file__

    def _artifact_s3_uri(self, run_id, step_key, filename):
        '''Return the ``s3://`` URI of an artifact for the given run and step.'''
        key = self._artifact_s3_key(run_id, step_key, filename)
        return 's3://{bucket}/{key}'.format(bucket=self.staging_bucket, key=key)

    def _artifact_s3_key(self, run_id, step_key, filename):
        '''Return the S3 key ``<staging_prefix>/<run_id>/<step_key>/<basename of filename>``.'''
        return '/'.join([self.staging_prefix, run_id, step_key, os.path.basename(filename)])
class EmrPySparkStepLauncher(StepLauncher):
    """Step launcher that runs a step on an existing EMR cluster via ``spark-submit``.

    Artifacts (the EMR main file, optionally a zip of the local job package, and the pickled
    step run ref) are uploaded under ``staging_prefix`` in ``staging_bucket``. While the EMR
    step runs, pickled events found at the same S3 location are read and yielded.
    """

    def __init__(
        self,
        region_name,
        staging_bucket,
        staging_prefix,
        wait_for_logs,
        action_on_failure,
        cluster_id,
        spark_config,
        local_job_package_path,
        deploy_local_job_package,
        s3_job_package_path=None,
    ):
        """Validate and store the launcher configuration.

        Args:
            region_name (str): AWS region used for the boto3 S3 handles and the EmrJobRunner.
            staging_bucket (str): S3 bucket that artifacts are uploaded to.
            staging_prefix (str): Key prefix inside ``staging_bucket`` for artifacts.
            wait_for_logs (bool): If True, retrieve and emit the step's stdout/stderr once the
                EMR step has completed or failed with an ``EmrError``.
            action_on_failure (str): Passed through to the EMR step definition.
            cluster_id (str): ID of the EMR cluster that steps are added to.
            spark_config: Spark configuration; flattened into ``spark-submit`` arguments.
            local_job_package_path (str): Local path of the job package; zipped and uploaded
                when ``deploy_local_job_package`` is True.
            deploy_local_job_package (bool): Whether to zip and upload the local package.
            s3_job_package_path (Optional[str]): Stored on the instance; must not be set
                together with ``deploy_local_job_package``.
        """
        self.region_name = check.str_param(region_name, "region_name")
        self.staging_bucket = check.str_param(staging_bucket, "staging_bucket")
        self.staging_prefix = check.str_param(staging_prefix, "staging_prefix")
        self.wait_for_logs = check.bool_param(wait_for_logs, "wait_for_logs")
        self.action_on_failure = check.str_param(action_on_failure, "action_on_failure")
        self.cluster_id = check.str_param(cluster_id, "cluster_id")
        self.spark_config = spark_config

        # The two package sources are mutually exclusive.
        check.invariant(
            not deploy_local_job_package or not s3_job_package_path,
            "If deploy_local_job_package is set to True, s3_job_package_path should not "
            "also be set.",
        )

        self.local_job_package_path = check.str_param(
            local_job_package_path, "local_job_package_path"
        )
        self.deploy_local_job_package = check.bool_param(
            deploy_local_job_package, "deploy_local_job_package"
        )
        self.s3_job_package_path = check.opt_str_param(s3_job_package_path, "s3_job_package_path")

        self.emr_job_runner = EmrJobRunner(region=self.region_name)

    def _post_artifacts(self, log, step_run_ref, run_id, step_key):
        """
        Synchronize the step run ref and pyspark code to an S3 staging bucket for use on EMR.

        For the zip file, consider the following toy example:

            # Folder: my_pyspark_project/
            # a.py
            def foo():
                print(1)

            # b.py
            def bar():
                print(2)

            # main.py
            from a import foo
            from b import bar

            foo()
            bar()

        This will zip up `my_pyspark_project/` as `my_pyspark_project.zip`. Then, when running
        `spark-submit --py-files my_pyspark_project.zip emr_step_main.py` on EMR this will
        print 1, 2.
        """
        from dagster_pyspark.utils import build_pyspark_zip

        with tempfile.TemporaryDirectory() as temp_dir:
            s3 = boto3.client("s3", region_name=self.region_name)

            def _upload_file_to_s3(local_path, s3_filename):
                # Upload one local file to this run/step's artifact location.
                key = self._artifact_s3_key(run_id, step_key, s3_filename)
                s3_uri = self._artifact_s3_uri(run_id, step_key, s3_filename)
                log.debug(f"Uploading file {local_path} to {s3_uri}")
                s3.upload_file(Filename=local_path, Bucket=self.staging_bucket, Key=key)

            # Upload main file.
            # The remote Dagster installation should also have the file, but locating it there
            # could be a pain.
            main_local_path = self._main_file_local_path()
            _upload_file_to_s3(main_local_path, self._main_file_name())

            if self.deploy_local_job_package:
                # Zip and upload package containing job
                zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)
                build_pyspark_zip(zip_local_path, self.local_job_package_path)
                _upload_file_to_s3(zip_local_path, CODE_ZIP_NAME)

            # Create step run ref pickle file
            step_run_ref_local_path = os.path.join(temp_dir, PICKLED_STEP_RUN_REF_FILE_NAME)
            with open(step_run_ref_local_path, "wb") as step_pickle_file:
                pickle.dump(step_run_ref, step_pickle_file)

            _upload_file_to_s3(step_run_ref_local_path, PICKLED_STEP_RUN_REF_FILE_NAME)

    def launch_step(self, step_context, prior_attempts_count):
        """Upload artifacts, add the step to the EMR cluster, and yield its events."""
        step_run_ref = step_context_to_step_run_ref(
            step_context, prior_attempts_count, self.local_job_package_path
        )

        run_id = step_context.pipeline_run.run_id
        log = step_context.log

        step_key = step_run_ref.step_key
        self._post_artifacts(log, step_run_ref, run_id, step_key)

        emr_step_def = self._get_emr_step_def(run_id, step_key, step_context.solid.name)
        # Only one step definition is submitted, so take the single returned step ID.
        emr_step_id = self.emr_job_runner.add_job_flow_steps(
            log, self.cluster_id, [emr_step_def]
        )[0]

        yield from self.wait_for_completion_and_log(
            log, run_id, step_key, emr_step_id, step_context
        )

    def wait_for_completion_and_log(self, log, run_id, step_key, emr_step_id, step_context):
        """Yield the step's events as they appear, then emit its logs if requested.

        Each event is passed to ``log_step_event`` before being yielded. When
        ``wait_for_logs`` is set, the step's logs are emitted both after normal completion
        and when waiting fails with an ``EmrError`` (which is then re-raised).
        """
        s3 = boto3.resource("s3", region_name=self.region_name)
        try:
            for event in self.wait_for_completion(log, s3, run_id, step_key, emr_step_id):
                log_step_event(step_context, event)
                yield event
        except EmrError:
            # Surface the remote logs before propagating the failure.
            if self.wait_for_logs:
                self._log_logs_from_s3(log, emr_step_id)
            raise

        if self.wait_for_logs:
            self._log_logs_from_s3(log, emr_step_id)

    def wait_for_completion(self, log, s3, run_id, step_key, emr_step_id, check_interval=15):
        """We want to wait for the EMR steps to complete, and while that's happening, we want to
        yield any events that have been written to S3 for us by the remote process.
        After the EMR steps complete, we want a final chance to fetch events before finishing
        the step.

        Args:
            check_interval (int): Seconds to sleep before each poll of the EMR step state.
        """
        done = False
        all_events = []

        # If this is being called within a `capture_interrupts` context, allow interrupts
        # while waiting for the pyspark execution to complete, so that we can terminate slow or
        # hanging steps
        while not done:
            with raise_execution_interrupts():
                time.sleep(check_interval)  # AWS rate-limits us if we poll it too often
                done = self.emr_job_runner.is_emr_step_complete(
                    log, self.cluster_id, emr_step_id
                )

                all_events_new = self.read_events(s3, run_id, step_key)

            # The events file is re-read in full on every poll; only the tail that has not
            # been yielded yet is emitted.
            if len(all_events_new) > len(all_events):
                yield from all_events_new[len(all_events):]
                all_events = all_events_new

    def read_events(self, s3, run_id, step_key):
        """Return the list of events unpickled from this step's events file in S3.

        Returns an empty list when the events object does not exist yet; any other
        ``ClientError`` is re-raised.
        """
        events_s3_obj = s3.Object(  # pylint: disable=no-member
            self.staging_bucket,
            self._artifact_s3_key(run_id, step_key, PICKLED_EVENTS_FILE_NAME),
        )

        try:
            events_data = events_s3_obj.get()["Body"].read()
            # NOTE(security): unpickling executes arbitrary code if the staging bucket
            # contents are not trusted. Access to the bucket must be restricted accordingly.
            return pickle.loads(events_data)
        except ClientError as ex:
            # The file might not be there yet, which is fine
            if ex.response["Error"]["Code"] == "NoSuchKey":
                return []
            raise

    def _log_logs_from_s3(self, log, emr_step_id):
        """Retrieves the logs from the remote PySpark process that EMR posted to S3 and logs
        them to the given log."""
        stdout_log, stderr_log = self.emr_job_runner.retrieve_logs_for_step_id(
            log, self.cluster_id, emr_step_id
        )
        # Since stderr is YARN / Hadoop Log4J output, parse and reformat those log lines for
        # Dagster's logging system.
        records = parse_hadoop_log4j_records(stderr_log)
        for record in records:
            if record.level:
                log.log(
                    level=record.level,
                    msg="".join(
                        ["Spark Driver stderr: ", record.logger, ": ", record.message]
                    ),
                )
            else:
                # Records without a level are emitted at debug level.
                log.debug(f"Spark Driver stderr: {record.message}")

        # Driver stdout is written straight to this process's stdout, not the Dagster log.
        sys.stdout.write(
            "---------- Spark Driver stdout: ----------\n"
            + stdout_log
            + "\n"
            + "---------- End of Spark Driver stdout ----------\n"
        )

    def _get_emr_step_def(self, run_id, step_key, solid_name):
        """From the local Dagster instance, construct EMR steps that will kick off execution on a
        remote EMR cluster.

        Returns the step dict built by ``EmrJobRunner.construct_step_dict_for_command`` for a
        ``spark-submit`` invocation of the staged main file.
        """
        from dagster_spark.utils import flatten_dict, format_for_cli

        action_on_failure = self.action_on_failure

        # Execute Solid via spark-submit
        conf = dict(flatten_dict(self.spark_config))
        # Default the Spark application name to the solid name unless configured explicitly.
        conf.setdefault("spark.app.name", solid_name)

        spark_master = conf.get("spark.master")
        check.invariant(
            conf.get("spark.master", "yarn") == "yarn",
            desc=(
                f"spark.master is configured as {spark_master}; cannot set Spark master on EMR "
                'to anything other than "yarn"'
            ),
        )

        # NOTE(review): --py-files always points at the staged CODE_ZIP_NAME artifact, which
        # _post_artifacts only uploads when deploy_local_job_package is True, and
        # s3_job_package_path is not consulted here -- confirm this is intended.
        command = (
            [
                EMR_SPARK_HOME + "bin/spark-submit",
                "--master",
                "yarn",
                "--deploy-mode",
                conf.get("spark.submit.deployMode", "client"),
            ]
            + format_for_cli(list(flatten_dict(conf)))
            + [
                "--py-files",
                self._artifact_s3_uri(run_id, step_key, CODE_ZIP_NAME),
                self._artifact_s3_uri(run_id, step_key, self._main_file_name()),
                # Arguments to the main file: bucket and key of the pickled step run ref.
                self.staging_bucket,
                self._artifact_s3_key(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),
            ]
        )

        return EmrJobRunner.construct_step_dict_for_command(
            f"Execute Solid/Op {solid_name}", command, action_on_failure=action_on_failure
        )

    def _main_file_name(self):
        """Return the base file name of the EMR main file."""
        return os.path.basename(self._main_file_local_path())

    def _main_file_local_path(self):
        """Return the local path of the ``emr_step_main`` module file."""
        return emr_step_main.__file__

    def _artifact_s3_uri(self, run_id, step_key, filename):
        """Return the ``s3://`` URI of an artifact for the given run and step."""
        key = self._artifact_s3_key(run_id, step_key, filename)
        return f"s3://{self.staging_bucket}/{key}"

    def _artifact_s3_key(self, run_id, step_key, filename):
        """Return the S3 key ``<staging_prefix>/<run_id>/<step_key>/<basename of filename>``."""
        return "/".join([self.staging_prefix, run_id, step_key, os.path.basename(filename)])