Example #1
 def _GetCompletedJob(self, job_id):
     """See base class."""
     cmd = self.cmd_prefix + [
         'emr', 'describe-step', '--cluster-id', self.cluster_id,
         '--step-id', job_id
     ]
     stdout, stderr, retcode = vm_util.IssueCommand(cmd,
                                                    raise_on_failure=False)
     if retcode:
         if 'ThrottlingException' in stderr:
             logging.warning(
                 'Rate limited while polling EMR step:\n%s\nRetrying.',
                 stderr)
             return None
         else:
             raise errors.VmUtil.IssueCommandError(
                 f'Getting step status failed:\n{stderr}')
     result = json.loads(stdout)
     state = result['Step']['Status']['State']
     if state == 'FAILED':
         raise dpb_service.JobSubmissionError(
             result['Step']['Status']['FailureDetails'])
     if state == 'COMPLETED':
         pending_time = result['Step']['Status']['Timeline'][
             'CreationDateTime']
         start_time = result['Step']['Status']['Timeline']['StartDateTime']
         end_time = result['Step']['Status']['Timeline']['EndDateTime']
         return dpb_service.JobResult(run_time=end_time - start_time,
                                      pending_time=start_time -
                                      pending_time)
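Note that _GetCompletedJob returns None both while the step is still pending or running and when polling was throttled, so callers are expected to poll it until a JobResult comes back. A minimal polling sketch under that assumption (the service object, interval, and timeout values here are hypothetical and not part of the example above, which in the original library relies on its own retry machinery):

import time

def wait_for_emr_step(service, job_id, poll_interval=5, timeout=600):
    """Polls _GetCompletedJob until it returns a JobResult or times out."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        result = service._GetCompletedJob(job_id)  # None => not done yet or throttled
        if result is not None:
            return result
        time.sleep(poll_interval)
    raise TimeoutError('Step {} did not complete within {}s'.format(job_id, timeout))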
Example #2
 def _GetCompletedJob(self, job_id):
   """See base class."""
   cmd = self.cmd_prefix + [
       'emr', 'describe-step', '--cluster-id', self.cluster_id, '--step-id',
       job_id
   ]
   stdout, _, _ = vm_util.IssueCommand(cmd)
   result = json.loads(stdout)
   state = result['Step']['Status']['State']
   if state == 'FAILED':
     raise dpb_service.JobSubmissionError(
         result['Step']['Status']['FailureDetails'])
   if state == 'COMPLETED':
     pending_time = result['Step']['Status']['Timeline']['CreationDateTime']
     start_time = result['Step']['Status']['Timeline']['StartDateTime']
     end_time = result['Step']['Status']['Timeline']['EndDateTime']
     return dpb_service.JobResult(
         run_time=end_time - start_time,
         pending_time=start_time - pending_time)
Example #3
 def _GetCompletedJob(self, job_id):
     """See base class."""
     job_name, job_run_id = job_id
     cmd = self.cmd_prefix + [
         'glue', 'get-job-run', '--job-name', job_name, '--run-id',
         job_run_id
     ]
     stdout, stderr, retcode = vm_util.IssueCommand(cmd,
                                                    raise_on_failure=False)
     if retcode:
         raise errors.VmUtil.IssueCommandError(
             f'Getting step status failed:\n{stderr}')
     result = json.loads(stdout)
     state = result['JobRun']['JobRunState']
     if state in ('FAILED', 'ERROR', 'TIMEOUT'):
         raise dpb_service.JobSubmissionError(
             result['JobRun'].get('ErrorMessage'))
     if state == 'SUCCEEDED':
         started_on = result['JobRun']['StartedOn']
         completed_on = result['JobRun']['CompletedOn']
         execution_time = result['JobRun']['ExecutionTime']
         return dpb_service.JobResult(run_time=execution_time,
                                      pending_time=completed_on -
                                      started_on - execution_time)
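In this Glue variant, run_time is taken directly from ExecutionTime, while pending_time is derived as total wall-clock time minus execution time. A small worked illustration of that arithmetic, assuming the JobRun timestamps have already been converted to epoch seconds (the values below are invented, not taken from the example):

started_on = 1_700_000_000.0    # JobRun.StartedOn
completed_on = 1_700_000_130.0  # JobRun.CompletedOn (130 s of wall-clock time)
execution_time = 100            # JobRun.ExecutionTime, in seconds

run_time = execution_time                                  # 100 s of actual execution
pending_time = completed_on - started_on - execution_time  # 30 s of queueing/startup overhead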
Example #4
    def SubmitJob(self,
                  jarfile=None,
                  classname=None,
                  pyspark_file=None,
                  query_file=None,
                  job_poll_interval=None,
                  job_stdout_file=None,
                  job_arguments=None,
                  job_files=None,
                  job_jars=None,
                  job_type=None,
                  properties=None):
        """See base class."""
        assert job_type
        args = ['jobs', 'submit', job_type]

        if job_type == self.PYSPARK_JOB_TYPE:
            args.append(pyspark_file)

        cmd = self.DataprocGcloudCommand(*args)

        cmd.flags['cluster'] = self.cluster_id
        cmd.flags['labels'] = util.MakeFormattedDefaultTags()

        job_jars = job_jars or []
        if classname:
            if jarfile:
                # Dataproc does not support both a main class and a main jar so just
                # make the main jar an additional jar instead.
                job_jars.append(jarfile)
            cmd.flags['class'] = classname
        elif jarfile:
            cmd.flags['jar'] = jarfile

        if query_file:
            cmd.flags['file'] = query_file

        if job_files:
            cmd.flags['files'] = ','.join(job_files)
        if job_jars:
            cmd.flags['jars'] = ','.join(job_jars)

        # Dataproc gives as stdout an object describing job execution.
        # Its stderr contains a mix of the job's stderr and the job's
        # stdout. We set the driver log level (via FLAGS.dpb_log_level)
        # to suppress those log messages so that, hopefully, the job's
        # standard out can be separated from them.
        cmd.flags['driver-log-levels'] = 'root={}'.format(FLAGS.dpb_log_level)

        all_properties = self.GetJobProperties()
        all_properties.update(properties or {})
        if all_properties:
            # For commas: https://cloud.google.com/sdk/gcloud/reference/topic/escaping
            cmd.flags['properties'] = '^@^' + '@'.join(
                '{}={}'.format(k, v) for k, v in all_properties.items())

        if job_arguments:
            cmd.additional_flags = ['--'] + job_arguments

        stdout, stderr, retcode = cmd.Issue(timeout=None,
                                            raise_on_failure=False)
        if retcode != 0:
            raise dpb_service.JobSubmissionError(stderr)

        results = json.loads(stdout)
        # Otherwise retcode would not have been 0
        assert results['status']['state'] == 'DONE'
        done_time = GcpDpbDataproc._ParseTime(
            results['status']['stateStartTime'])
        pending_time = None
        start_time = None
        for state in results['statusHistory']:
            if state['state'] == 'PENDING':
                pending_time = GcpDpbDataproc._ParseTime(
                    state['stateStartTime'])
            elif state['state'] == 'RUNNING':
                start_time = GcpDpbDataproc._ParseTime(state['stateStartTime'])

        assert pending_time and start_time and done_time

        return dpb_service.JobResult(
            run_time=(done_time - start_time).total_seconds(),
            pending_time=(start_time - pending_time).total_seconds())
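The '^@^' prefix on the properties flag uses gcloud's alternate-delimiter escaping (the topic linked in the comment above): '@' becomes the list separator, so property values containing commas pass through intact. A short illustration of the flag value that construction produces, with made-up property names and values:

all_properties = {
    'spark.driver.memory': '4g',
    'spark.executor.extraJavaOptions': '-Xss4m,-XX:+UseG1GC',  # value contains a comma
}
flag_value = '^@^' + '@'.join(
    '{}={}'.format(k, v) for k, v in all_properties.items())
print(flag_value)
# ^@^spark.driver.memory=4g@spark.executor.extraJavaOptions=-Xss4m,-XX:+UseG1GC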
Example #5
    def SubmitJob(self,
                  jarfile=None,
                  classname=None,
                  pyspark_file=None,
                  query_file=None,
                  job_poll_interval=5,
                  job_arguments=None,
                  job_files=None,
                  job_jars=None,
                  job_stdout_file=None,
                  job_type=None,
                  properties=None):
        """See base class."""
        @vm_util.Retry(timeout=EMR_TIMEOUT,
                       poll_interval=job_poll_interval,
                       fuzz=0)
        def WaitForStep(step_id):
            result = self._IsStepDone(step_id)
            if result is None:
                raise EMRRetryableException(
                    'Step {0} not complete.'.format(step_id))
            return result

        if job_arguments:
            # Escape commas in arguments
            job_arguments = (arg.replace(',', '\\,') for arg in job_arguments)

        all_properties = self.GetJobProperties()
        all_properties.update(properties or {})

        if job_type == 'hadoop':
            step_type_spec = 'Type=CUSTOM_JAR'
            jar_spec = 'Jar=' + jarfile
            arg_list = []
            # Order is important
            if classname:
                arg_list += [classname]
            arg_list += [
                '-D{}={}'.format(k, v) for k, v in all_properties.items()
            ]
            if job_arguments:
                arg_list += job_arguments
            arg_spec = 'Args=[' + ','.join(arg_list) + ']'
            step_list = [step_type_spec, jar_spec, arg_spec]
        elif job_type == self.SPARK_JOB_TYPE:
            arg_list = []
            if job_files:
                arg_list += ['--files', ','.join(job_files)]
            if job_jars:
                arg_list += ['--jars', ','.join(job_jars)]
            for k, v in all_properties.items():
                arg_list += ['--conf', '{}={}'.format(k, v)]
            # jarfile must be last before args
            arg_list += ['--class', classname, jarfile]
            if job_arguments:
                arg_list += job_arguments
            arg_spec = '[' + ','.join(arg_list) + ']'
            step_type_spec = 'Type=Spark'
            step_list = [step_type_spec, 'Args=' + arg_spec]
        elif job_type == self.PYSPARK_JOB_TYPE:
            arg_list = []
            if job_files:
                arg_list += ['--files', ','.join(job_files)]
            if job_jars:
                arg_list += ['--jars', ','.join(job_jars)]
            for k, v in all_properties.items():
                arg_list += ['--conf', '{}={}'.format(k, v)]
            # pyspark_file must be last before args
            arg_list += [pyspark_file]
            if job_arguments:
                arg_list += job_arguments
            arg_spec = 'Args=[{}]'.format(','.join(arg_list))
            step_list = ['Type=Spark', arg_spec]
        elif job_type == self.SPARKSQL_JOB_TYPE:
            assert not job_arguments
            arg_list = [query_file]
            jar_spec = 'Jar="command-runner.jar"'
            for k, v in all_properties.items():
                arg_list += ['--conf', '{}={}'.format(k, v)]
            arg_spec = 'Args=[spark-sql,-f,{}]'.format(','.join(arg_list))
            step_list = [jar_spec, arg_spec]

        step_string = ','.join(step_list)

        step_cmd = self.cmd_prefix + [
            'emr', 'add-steps', '--cluster-id', self.cluster_id, '--steps',
            step_string
        ]
        stdout, _, _ = vm_util.IssueCommand(step_cmd)
        result = json.loads(stdout)
        step_id = result['StepIds'][0]

        result = WaitForStep(step_id)
        pending_time = result['Step']['Status']['Timeline']['CreationDateTime']
        start_time = result['Step']['Status']['Timeline']['StartDateTime']
        end_time = result['Step']['Status']['Timeline']['EndDateTime']
        return dpb_service.JobResult(run_time=end_time - start_time,
                                     pending_time=start_time - pending_time)
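To make the step specification format concrete, here is roughly the --steps string the Spark branch above would build for a hypothetical job (class, jar, argument, and property values are invented for illustration):

classname = 'com.example.WordCount'
jarfile = 's3://my-bucket/wordcount.jar'
job_arguments = ['s3://my-bucket/input', 's3://my-bucket/out,put']  # second value contains a comma
all_properties = {'spark.executor.memory': '4g'}

escaped_args = [arg.replace(',', '\\,') for arg in job_arguments]
arg_list = []
for k, v in all_properties.items():
    arg_list += ['--conf', '{}={}'.format(k, v)]
arg_list += ['--class', classname, jarfile] + escaped_args
step_string = ','.join(['Type=Spark', 'Args=[' + ','.join(arg_list) + ']'])
print(step_string)
# Type=Spark,Args=[--conf,spark.executor.memory=4g,--class,com.example.WordCount,
#   s3://my-bucket/wordcount.jar,s3://my-bucket/input,s3://my-bucket/out\,put]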
Example #6
    def SubmitJob(self,
                  jarfile=None,
                  classname=None,
                  pyspark_file=None,
                  query_file=None,
                  job_poll_interval=None,
                  job_stdout_file=None,
                  job_arguments=None,
                  job_files=None,
                  job_jars=None,
                  job_type=None,
                  properties=None):
        """See base class."""
        assert job_type
        args = ['batches', 'submit', job_type]
        additional_args = []

        if job_type == self.PYSPARK_JOB_TYPE:
            args.append(pyspark_file)

        cmd = self.DataprocGcloudCommand(*args)

        cmd.flags['batch'] = self.cluster_id
        cmd.flags['labels'] = util.MakeFormattedDefaultTags()

        job_jars = job_jars or []
        if classname:
            if jarfile:
                # Dataproc does not support both a main class and a main jar so just
                # make the main jar an additional jar instead.
                job_jars.append(jarfile)
            cmd.flags['class'] = classname
        elif jarfile:
            cmd.flags['jar'] = jarfile

        if query_file:
            additional_args += [query_file]

        if job_files:
            cmd.flags['files'] = ','.join(job_files)
        if job_jars:
            cmd.flags['jars'] = ','.join(job_jars)

        if FLAGS.gce_network_name:
            cmd.flags['network'] = FLAGS.gce_network_name

        if self.dpb_version:
            cmd.flags['version'] = self.dpb_version
        if FLAGS.gcp_dataproc_image:
            cmd.flags['container-image'] = FLAGS.gcp_dataproc_image

        all_properties = self.GetJobProperties()
        all_properties.update(properties or {})
        if all_properties:
            # For commas: https://cloud.google.com/sdk/gcloud/reference/topic/escaping
            cmd.flags['properties'] = '^@^' + '@'.join(
                '{}={}'.format(k, v) for k, v in all_properties.items())

        if job_arguments:
            additional_args += ['--'] + job_arguments
        cmd.additional_flags = additional_args

        _, stderr, retcode = cmd.Issue(timeout=None, raise_on_failure=False)
        if retcode != 0:
            raise dpb_service.JobSubmissionError(stderr)

        fetch_batch_cmd = self.DataprocGcloudCommand('batches', 'describe',
                                                     self.cluster_id)
        stdout, stderr, retcode = fetch_batch_cmd.Issue(timeout=None,
                                                        raise_on_failure=False)
        if retcode != 0:
            raise dpb_service.JobSubmissionError(stderr)

        results = json.loads(stdout)
        # Otherwise retcode would not have been 0
        assert results['state'] == 'SUCCEEDED'
        done_time = self._ParseTime(results['stateTime'])
        pending_time = None
        start_time = None
        for state in results['stateHistory']:
            if state['state'] == 'PENDING':
                pending_time = self._ParseTime(state['stateStartTime'])
            elif state['state'] == 'RUNNING':
                start_time = self._ParseTime(state['stateStartTime'])

        assert pending_time and start_time and done_time

        return dpb_service.JobResult(
            run_time=(done_time - start_time).total_seconds(),
            pending_time=(start_time - pending_time).total_seconds())
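The Dataproc examples above call a _ParseTime helper that is not shown here. A plausible minimal reconstruction, assuming the gcloud timestamps look like '2021-03-05T07:22:03.165000Z' (the real helper in the source repository may differ):

import datetime

def _ParseTime(state_time):
    """Parses a gcloud RFC 3339 timestamp into a naive datetime (assumption)."""
    # %f accepts 1-6 fractional-second digits, so both '.165Z' and '.165000Z' parse.
    return datetime.datetime.strptime(state_time, '%Y-%m-%dT%H:%M:%S.%fZ')

# Subtracting two parsed datetimes yields a timedelta, matching the
# (done_time - start_time).total_seconds() usage in the examples above.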