def _GetCompletedJob(self, job_id): """See base class.""" cmd = self.cmd_prefix + [ 'emr', 'describe-step', '--cluster-id', self.cluster_id, '--step-id', job_id ] stdout, stderr, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) if retcode: if 'ThrottlingException' in stderr: logging.warning( 'Rate limited while polling EMR step:\n%s\nRetrying.', stderr) return None else: raise errors.VmUtil.IssueCommandError( f'Getting step status failed:\n{stderr}') result = json.loads(stdout) state = result['Step']['Status']['State'] if state == 'FAILED': raise dpb_service.JobSubmissionError( result['Step']['Status']['FailureDetails']) if state == 'COMPLETED': pending_time = result['Step']['Status']['Timeline'][ 'CreationDateTime'] start_time = result['Step']['Status']['Timeline']['StartDateTime'] end_time = result['Step']['Status']['Timeline']['EndDateTime'] return dpb_service.JobResult(run_time=end_time - start_time, pending_time=start_time - pending_time)
def _GetCompletedJob(self, job_id): """See base class.""" cmd = self.cmd_prefix + [ 'emr', 'describe-step', '--cluster-id', self.cluster_id, '--step-id', job_id ] stdout, _, _ = vm_util.IssueCommand(cmd) result = json.loads(stdout) state = result['Step']['Status']['State'] if state == 'FAILED': raise dpb_service.JobSubmissionError( result['Step']['Status']['FailureDetails']) if state == 'COMPLETED': pending_time = result['Step']['Status']['Timeline']['CreationDateTime'] start_time = result['Step']['Status']['Timeline']['StartDateTime'] end_time = result['Step']['Status']['Timeline']['EndDateTime'] return dpb_service.JobResult( run_time=end_time - start_time, pending_time=start_time - pending_time)
def _GetCompletedJob(self, job_id): """See base class.""" job_name, job_run_id = job_id cmd = self.cmd_prefix + [ 'glue', 'get-job-run', '--job-name', job_name, '--run-id', job_run_id ] stdout, stderr, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False) if retcode: raise errors.VmUtil.IssueCommandError( f'Getting step status failed:\n{stderr}') result = json.loads(stdout) state = result['JobRun']['JobRunState'] if state in ('FAILED', 'ERROR', 'TIMEOUT'): raise dpb_service.JobSubmissionError( result['JobRun'].get('ErrorMessage')) if state == 'SUCCEEDED': started_on = result['JobRun']['StartedOn'] completed_on = result['JobRun']['CompletedOn'] execution_time = result['JobRun']['ExecutionTime'] return dpb_service.JobResult(run_time=execution_time, pending_time=completed_on - started_on - execution_time)
def SubmitJob(self,
              jarfile=None,
              classname=None,
              pyspark_file=None,
              query_file=None,
              job_poll_interval=None,
              job_stdout_file=None,
              job_arguments=None,
              job_files=None,
              job_jars=None,
              job_type=None,
              properties=None):
  """See base class."""
  assert job_type
  args = ['jobs', 'submit', job_type]
  if job_type == self.PYSPARK_JOB_TYPE:
    args.append(pyspark_file)
  cmd = self.DataprocGcloudCommand(*args)
  cmd.flags['cluster'] = self.cluster_id
  cmd.flags['labels'] = util.MakeFormattedDefaultTags()
  job_jars = job_jars or []
  if classname:
    if jarfile:
      # Dataproc does not support both a main class and a main jar, so pass
      # the main jar as an additional jar instead.
      job_jars.append(jarfile)
    cmd.flags['class'] = classname
  elif jarfile:
    cmd.flags['jar'] = jarfile
  if query_file:
    cmd.flags['file'] = query_file
  if job_files:
    cmd.flags['files'] = ','.join(job_files)
  if job_jars:
    cmd.flags['jars'] = ','.join(job_jars)
  # Dataproc writes an object describing the job execution to stdout. Its
  # stderr contains a mix of the job's stderr and the job's stdout. Setting
  # the driver log level (FLAGS.dpb_log_level) suppresses most log messages
  # so that, hopefully, the job's standard output can be separated from them.
  cmd.flags['driver-log-levels'] = 'root={}'.format(FLAGS.dpb_log_level)
  all_properties = self.GetJobProperties()
  all_properties.update(properties or {})
  if all_properties:
    # For commas: https://cloud.google.com/sdk/gcloud/reference/topic/escaping
    cmd.flags['properties'] = '^@^' + '@'.join(
        '{}={}'.format(k, v) for k, v in all_properties.items())
  if job_arguments:
    cmd.additional_flags = ['--'] + job_arguments
  stdout, stderr, retcode = cmd.Issue(timeout=None, raise_on_failure=False)
  if retcode != 0:
    raise dpb_service.JobSubmissionError(stderr)
  results = json.loads(stdout)
  # Otherwise retcode would not have been 0.
  assert results['status']['state'] == 'DONE'
  done_time = GcpDpbDataproc._ParseTime(results['status']['stateStartTime'])
  pending_time = None
  start_time = None
  for state in results['statusHistory']:
    if state['state'] == 'PENDING':
      pending_time = GcpDpbDataproc._ParseTime(state['stateStartTime'])
    elif state['state'] == 'RUNNING':
      start_time = GcpDpbDataproc._ParseTime(state['stateStartTime'])
  assert pending_time and start_time and done_time
  return dpb_service.JobResult(
      run_time=(done_time - start_time).total_seconds(),
      pending_time=(start_time - pending_time).total_seconds())
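# Illustrative only: the '^@^' prefix above switches the gcloud list
# delimiter from ',' to '@' so that property values containing commas survive
# parsing (see the escaping link in the code). With hypothetical properties,
# the resulting flag value looks like this:
example_properties = {
    'spark.driver.memory': '4g',
    'spark.executor.extraJavaOptions': '-Xms1g,-Xmx4g',  # contains a comma
}
flag_value = '^@^' + '@'.join(
    '{}={}'.format(k, v) for k, v in example_properties.items())
# flag_value ==
#   '^@^spark.driver.memory=4g@spark.executor.extraJavaOptions=-Xms1g,-Xmx4g'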
def SubmitJob(self,
              jarfile=None,
              classname=None,
              pyspark_file=None,
              query_file=None,
              job_poll_interval=5,
              job_arguments=None,
              job_files=None,
              job_jars=None,
              job_stdout_file=None,
              job_type=None,
              properties=None):
  """See base class."""

  @vm_util.Retry(
      timeout=EMR_TIMEOUT, poll_interval=job_poll_interval, fuzz=0)
  def WaitForStep(step_id):
    result = self._IsStepDone(step_id)
    if result is None:
      raise EMRRetryableException('Step {0} not complete.'.format(step_id))
    return result

  if job_arguments:
    # Escape commas in arguments
    job_arguments = (arg.replace(',', '\\,') for arg in job_arguments)

  all_properties = self.GetJobProperties()
  all_properties.update(properties or {})

  if job_type == 'hadoop':
    step_type_spec = 'Type=CUSTOM_JAR'
    jar_spec = 'Jar=' + jarfile
    arg_list = []
    # Order is important
    if classname:
      arg_list += [classname]
    arg_list += ['-D{}={}'.format(k, v) for k, v in all_properties.items()]
    if job_arguments:
      arg_list += job_arguments
    arg_spec = 'Args=[' + ','.join(arg_list) + ']'
    step_list = [step_type_spec, jar_spec, arg_spec]
  elif job_type == self.SPARK_JOB_TYPE:
    arg_list = []
    if job_files:
      arg_list += ['--files', ','.join(job_files)]
    if job_jars:
      arg_list += ['--jars', ','.join(job_jars)]
    for k, v in all_properties.items():
      arg_list += ['--conf', '{}={}'.format(k, v)]
    # jarfile must be last before args
    arg_list += ['--class', classname, jarfile]
    if job_arguments:
      arg_list += job_arguments
    arg_spec = '[' + ','.join(arg_list) + ']'
    step_type_spec = 'Type=Spark'
    step_list = [step_type_spec, 'Args=' + arg_spec]
  elif job_type == self.PYSPARK_JOB_TYPE:
    arg_list = []
    if job_files:
      arg_list += ['--files', ','.join(job_files)]
    if job_jars:
      arg_list += ['--jars', ','.join(job_jars)]
    for k, v in all_properties.items():
      arg_list += ['--conf', '{}={}'.format(k, v)]
    # pyspark_file must be last before args
    arg_list += [pyspark_file]
    if job_arguments:
      arg_list += job_arguments
    arg_spec = 'Args=[{}]'.format(','.join(arg_list))
    step_list = ['Type=Spark', arg_spec]
  elif job_type == self.SPARKSQL_JOB_TYPE:
    assert not job_arguments
    arg_list = [query_file]
    jar_spec = 'Jar="command-runner.jar"'
    for k, v in all_properties.items():
      arg_list += ['--conf', '{}={}'.format(k, v)]
    arg_spec = 'Args=[spark-sql,-f,{}]'.format(','.join(arg_list))
    step_list = [jar_spec, arg_spec]

  step_string = ','.join(step_list)

  step_cmd = self.cmd_prefix + [
      'emr', 'add-steps', '--cluster-id', self.cluster_id, '--steps',
      step_string
  ]
  stdout, _, _ = vm_util.IssueCommand(step_cmd)
  result = json.loads(stdout)
  step_id = result['StepIds'][0]

  result = WaitForStep(step_id)
  pending_time = result['Step']['Status']['Timeline']['CreationDateTime']
  start_time = result['Step']['Status']['Timeline']['StartDateTime']
  end_time = result['Step']['Status']['Timeline']['EndDateTime']
  return dpb_service.JobResult(
      run_time=end_time - start_time,
      pending_time=start_time - pending_time)
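# Illustrative only: for a Spark job the code above assembles a single
# 'aws emr add-steps --steps' shorthand string. With hypothetical inputs
# (s3://bucket/my.jar, com.example.Main, one --conf, one argument) it would
# look roughly like this:
example_step_string = ','.join([
    'Type=Spark',
    'Args=[' + ','.join([
        '--conf', 'spark.executor.memory=4g',
        '--class', 'com.example.Main', 's3://bucket/my.jar',
        'arg1',
    ]) + ']',
])
# example_step_string ==
#   'Type=Spark,Args=[--conf,spark.executor.memory=4g,'
#   '--class,com.example.Main,s3://bucket/my.jar,arg1]'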
def SubmitJob(self,
              jarfile=None,
              classname=None,
              pyspark_file=None,
              query_file=None,
              job_poll_interval=None,
              job_stdout_file=None,
              job_arguments=None,
              job_files=None,
              job_jars=None,
              job_type=None,
              properties=None):
  """See base class."""
  assert job_type
  args = ['batches', 'submit', job_type]
  additional_args = []
  if job_type == self.PYSPARK_JOB_TYPE:
    args.append(pyspark_file)
  cmd = self.DataprocGcloudCommand(*args)
  cmd.flags['batch'] = self.cluster_id
  cmd.flags['labels'] = util.MakeFormattedDefaultTags()
  job_jars = job_jars or []
  if classname:
    if jarfile:
      # Dataproc does not support both a main class and a main jar, so pass
      # the main jar as an additional jar instead.
      job_jars.append(jarfile)
    cmd.flags['class'] = classname
  elif jarfile:
    cmd.flags['jar'] = jarfile
  if query_file:
    additional_args += [query_file]
  if job_files:
    cmd.flags['files'] = ','.join(job_files)
  if job_jars:
    cmd.flags['jars'] = ','.join(job_jars)
  if FLAGS.gce_network_name:
    cmd.flags['network'] = FLAGS.gce_network_name
  if self.dpb_version:
    cmd.flags['version'] = self.dpb_version
  if FLAGS.gcp_dataproc_image:
    cmd.flags['container-image'] = FLAGS.gcp_dataproc_image
  all_properties = self.GetJobProperties()
  all_properties.update(properties or {})
  if all_properties:
    # For commas: https://cloud.google.com/sdk/gcloud/reference/topic/escaping
    cmd.flags['properties'] = '^@^' + '@'.join(
        '{}={}'.format(k, v) for k, v in all_properties.items())
  if job_arguments:
    additional_args += ['--'] + job_arguments
  cmd.additional_flags = additional_args
  _, stderr, retcode = cmd.Issue(timeout=None, raise_on_failure=False)
  if retcode != 0:
    raise dpb_service.JobSubmissionError(stderr)
  fetch_batch_cmd = self.DataprocGcloudCommand('batches', 'describe',
                                               self.cluster_id)
  stdout, stderr, retcode = fetch_batch_cmd.Issue(
      timeout=None, raise_on_failure=False)
  if retcode != 0:
    raise dpb_service.JobSubmissionError(stderr)
  results = json.loads(stdout)
  # Otherwise retcode would not have been 0.
  assert results['state'] == 'SUCCEEDED'
  done_time = self._ParseTime(results['stateTime'])
  pending_time = None
  start_time = None
  for state in results['stateHistory']:
    if state['state'] == 'PENDING':
      pending_time = self._ParseTime(state['stateStartTime'])
    elif state['state'] == 'RUNNING':
      start_time = self._ParseTime(state['stateStartTime'])
  assert pending_time and start_time and done_time
  return dpb_service.JobResult(
      run_time=(done_time - start_time).total_seconds(),
      pending_time=(start_time - pending_time).total_seconds())
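# Illustrative only: a hypothetical 'batches describe' response showing how
# pending_time and run_time fall out of the state history parsed above.
# Timestamps are simplified to epoch seconds; the real code parses RFC 3339
# strings with _ParseTime.
sample_batch = {
    'state': 'SUCCEEDED',
    'stateTime': 1300.0,  # time the batch reached SUCCEEDED (made up)
    'stateHistory': [
        {'state': 'PENDING', 'stateStartTime': 1000.0},
        {'state': 'RUNNING', 'stateStartTime': 1060.0},
    ],
}
pending_time = sample_batch['stateHistory'][0]['stateStartTime']
start_time = sample_batch['stateHistory'][1]['stateStartTime']
done_time = sample_batch['stateTime']
assert done_time - start_time == 240.0    # reported as run_time
assert start_time - pending_time == 60.0  # reported as pending_time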