def _wait_for_process(self, proc_dict, step_num):
    # handle counters, status msgs, and other stuff on stderr
    proc = proc_dict['proc']

    stderr_lines = self._process_stderr_from_script(
        proc.stderr, step_num=step_num)
    tb_lines = _find_python_traceback(stderr_lines)

    # proc.stdout isn't always defined
    if proc.stdout:
        proc.stdout.close()
    proc.stderr.close()

    returncode = proc.wait()

    if returncode != 0:
        # show counters before raising exception
        counters = self._counters[step_num]
        if counters:
            log.info(_format_counters(counters))

        # try to throw a useful exception
        if tb_lines:
            for line in tb_lines:
                log.error(line.rstrip('\r\n'))

        reason = str(
            CalledProcessError(returncode, proc_dict['args']))
        raise StepFailedException(
            reason=reason, step_num=step_num,
            num_steps=len(self._get_steps()))
def test_indent(self):
    self.assertEqual(
        _format_counters(self.COUNTERS, indent='  '),
        ('Counters: 3\n'
         '  File System Counters\n'
         '    FILE: Number of bytes read=8\n'
         '    FILE: Number of bytes written=359982\n'
         '  Job Counters\n'
         '    Launched map tasks=2'))
def test_basic(self):
    self.assertEqual(
        _format_counters(self.COUNTERS),
        ('Counters: 3\n'
         '\tFile System Counters\n'
         '\t\tFILE: Number of bytes read=8\n'
         '\t\tFILE: Number of bytes written=359982\n'
         '\tJob Counters\n'
         '\t\tLaunched map tasks=2'))
def test_custom_desc(self):
    self.assertEqual(
        _format_counters(self.COUNTERS, desc='Counters for step 1'),
        ('Counters for step 1: 3\n'
         '\tFile System Counters\n'
         '\t\tFILE: Number of bytes read=8\n'
         '\t\tFILE: Number of bytes written=359982\n'
         '\tJob Counters\n'
         '\t\tLaunched map tasks=2'))
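# Note: the tests above compare against a self.COUNTERS fixture that is not
# shown in these snippets. Judging from the expected output, it is a
# two-level dict mapping counter group -> counter name -> value; the sketch
# below is a hypothetical reconstruction, not the actual fixture definition.
COUNTERS = {
    'File System Counters': {
        'FILE: Number of bytes read': 8,
        'FILE: Number of bytes written': 359982,
    },
    'Job Counters': {
        'Launched map tasks': 2,
    },
}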
def _log_counters(self, log_interpretation, step_num):
    """Utility for logging counters (if any) for a step."""
    step_type = self._get_step(step_num)['type']

    if not _is_spark_step_type(step_type):
        counters = self._pick_counters(log_interpretation, step_type)
        if counters:
            log.info(_format_counters(counters))
        else:
            log.warning('No counters found')
def test_empty_group(self):
    # counter groups should always have at least one counter
    self.assertEqual(
        _format_counters({
            'File System Counters': {},
            'Job Counters': {
                'Launched map tasks': 2,
            },
        }),
        ('Counters: 1\n'
         '\tJob Counters\n'
         '\t\tLaunched map tasks=2'))
def _log_counters(self, log_interpretation, step_num):
    """Utility for logging counters (if any) for a step."""
    step_type = self._get_step(step_num)['type']

    if not self._step_type_uses_spark(step_type):
        counters = self._pick_counters(log_interpretation, step_type)
        if counters:
            log.info(_format_counters(counters))
        elif self._read_logs():
            # should only log this if we actually looked for counters
            log.warning('No counters found')
def _log_counters(self, log_interpretation, step_num):
    """Utility for logging counters (if any) for a step."""
    step_type = self._get_step(step_num)['type']

    if not _is_spark_step_type(step_type):
        counters = self._pick_counters(
            log_interpretation, step_type)
        if counters:
            log.info(_format_counters(counters))
        elif self._read_logs():
            # should only log this if we actually looked for counters
            log.warning('No counters found')
def _run_step_on_spark(self, step, step_num, last_step_num=None):
    if self._opts['upload_archives'] and self._spark_master() != 'yarn':
        log.warning('Spark master %r will probably ignore archives' %
                    self._spark_master())

    spark_submit_args = self._args_for_spark_step(step_num, last_step_num)

    env = dict(os.environ)
    env.update(self._spark_cmdenv(step_num))

    returncode, step_interpretation = self._run_spark_submit(
        spark_submit_args, env, record_callback=_log_log4j_record)

    counters = None
    if step['type'] == 'streaming':
        counter_file = self.fs.join(
            self._counter_output_dir(step_num), 'part-*')
        counter_json = b''.join(self.fs.cat(counter_file))
        if counter_json.strip():
            # json.loads() on Python 3.4/3.5 can't take bytes
            counters = json.loads(to_unicode(counter_json))

    if isinstance(counters, list):
        self._counters.extend(counters)

        # desc_num is 1-indexed user-readable step num
        for desc_num, counter_dict in enumerate(
                counters, start=(step_num + 1)):
            if counter_dict:
                log.info(_format_counters(
                    counter_dict,
                    desc=('Counters for step %d' % desc_num)))

    # for non-streaming steps, there are no counters.
    # pad self._counters to match number of steps
    while len(self._counters) < (last_step_num or step_num) + 1:
        self._counters.append({})

    if returncode:
        error = _pick_error(dict(step=step_interpretation))
        if error:
            _log_probable_cause_of_failure(log, error)

        reason = str(CalledProcessError(returncode, spark_submit_args))
        raise StepFailedException(
            reason=reason, step_num=step_num, last_step_num=last_step_num,
            num_steps=self._num_steps())
def _run_step_on_spark(self, step, step_num, last_step_num=None):
    if self._opts['upload_archives'] and self._spark_master() != 'yarn':
        log.warning('Spark master %r will probably ignore archives' %
                    self._spark_master())

    spark_submit_args = self._args_for_spark_step(step_num, last_step_num)

    env = dict(os.environ)
    env.update(self._spark_cmdenv(step_num))

    returncode = self._run_spark_submit(
        spark_submit_args, env, record_callback=_log_log4j_record)

    counters = None
    if step['type'] == 'streaming':
        counter_file = self.fs.join(
            self._counter_output_dir(step_num), 'part-*')
        counter_json = b''.join(self.fs.cat(counter_file))
        if counter_json.strip():
            # json.loads() on Python 3.4/3.5 can't take bytes
            counters = json.loads(to_unicode(counter_json))

    if isinstance(counters, list):
        self._counters.extend(counters)

        # desc_num is 1-indexed user-readable step num
        for desc_num, counter_dict in enumerate(
                counters, start=(step_num + 1)):
            if counter_dict:
                log.info(_format_counters(
                    counter_dict,
                    desc=('Counters for step %d' % desc_num)))

    # for non-streaming steps, there are no counters.
    # pad self._counters to match number of steps
    while len(self._counters) < (last_step_num or step_num) + 1:
        self._counters.append({})

    if returncode:
        reason = str(CalledProcessError(returncode, spark_submit_args))
        raise StepFailedException(
            reason=reason, step_num=step_num, last_step_num=last_step_num,
            num_steps=self._num_steps())
def _run_job_in_hadoop(self):
    for step_num, step in enumerate(self._get_steps()):
        self._warn_about_spark_archives(step)

        step_args = self._args_for_step(step_num)
        env = _fix_env(self._env_for_step(step_num))

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d...' %
                 (step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))
        log.debug('  with environment: %r' % sorted(env.items()))

        log_interpretation = {}
        self._log_interpretations.append(log_interpretation)

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

            step_interpretation = _interpret_hadoop_jar_command_stderr(
                step_proc.stderr,
                record_callback=_log_record_from_hadoop)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_hadoop(to_unicode(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:
                # we are the child process
                os.execvpe(step_args[0], step_args, env)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_interpretation = (
                        _interpret_hadoop_jar_command_stderr(
                            master,
                            record_callback=_log_record_from_hadoop))
                    _, returncode = os.waitpid(pid, 0)

        # make sure output_dir is filled
        if 'output_dir' not in step_interpretation:
            step_interpretation['output_dir'] = (
                self._step_output_uri(step_num))

        log_interpretation['step'] = step_interpretation

        step_type = step['type']

        if not _is_spark_step_type(step_type):
            counters = self._pick_counters(log_interpretation, step_type)
            if counters:
                log.info(_format_counters(counters))
            else:
                log.warning('No counters found')

        if returncode:
            error = self._pick_error(log_interpretation, step_type)
            if error:
                log.error('Probable cause of failure:\n\n%s\n' %
                          _format_error(error))

            # use CalledProcessError's well-known message format
            reason = str(CalledProcessError(returncode, step_args))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                num_steps=self._num_steps())
def test_round_trip(self):
    # are we outputting counters in the same format as the Hadoop binary?
    self.assertEqual(
        _parse_indented_counters(
            _format_counters(self.COUNTERS).splitlines()),
        self.COUNTERS)
def _run_job_in_hadoop(self):
    for step_num, step in enumerate(self._get_steps()):
        self._warn_about_spark_archives(step)

        step_args = self._args_for_step(step_num)
        env = self._env_for_step(step_num)

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d...' %
                 (step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))
        log.debug('  with environment: %r' % sorted(env.items()))

        log_interpretation = {}
        self._log_interpretations.append(log_interpretation)

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

            step_interpretation = _interpret_hadoop_jar_command_stderr(
                step_proc.stderr,
                record_callback=_log_record_from_hadoop)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_hadoop(to_string(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:
                # we are the child process
                os.execvpe(step_args[0], step_args, env)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_interpretation = (
                        _interpret_hadoop_jar_command_stderr(
                            master,
                            record_callback=_log_record_from_hadoop))
                    _, returncode = os.waitpid(pid, 0)

        # make sure output_dir is filled
        if 'output_dir' not in step_interpretation:
            step_interpretation['output_dir'] = (
                self._step_output_uri(step_num))

        log_interpretation['step'] = step_interpretation

        step_type = step['type']

        if not _is_spark_step_type(step_type):
            counters = self._pick_counters(log_interpretation, step_type)
            if counters:
                log.info(_format_counters(counters))
            else:
                log.warning('No counters found')

        if returncode:
            error = self._pick_error(log_interpretation, step_type)
            if error:
                log.error('Probable cause of failure:\n\n%s\n' %
                          _format_error(error))

            # use CalledProcessError's well-known message format
            reason = str(CalledProcessError(returncode, step_args))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                num_steps=self._num_steps())
def _invoke_step(self, step_num, step_type):
    """Run the mapper or reducer for the given step.
    """
    step = self._get_step(step_num)
    if step['type'] != 'streaming':
        raise Exception("LocalMRJobRunner cannot run %s steps" %
                        step['type'])

    jobconf = self._jobconf_for_step(step_num)

    outfile_prefix = 'step-%04d-%s' % (step_num, step_type)

    # allow setting number of tasks from jobconf
    if step_type == 'reducer':
        num_tasks = int(
            jobconf_from_dict(jobconf, 'mapreduce.job.reduces',
                              self._DEFAULT_REDUCE_TASKS))
    else:
        num_tasks = int(
            jobconf_from_dict(jobconf, 'mapreduce.job.maps',
                              self._DEFAULT_MAP_TASKS))

    # get file splits for mappers and reducers
    keep_sorted = (step_type == 'reducer')
    file_splits = self._get_file_splits(self._step_input_paths(),
                                        num_tasks,
                                        keep_sorted=keep_sorted)

    # since we have grabbed the files from _prev_outfiles as input
    # to this step, reset _prev_outfiles
    self._prev_outfiles = []

    # Start the tasks associated with the step:
    # if we need to sort, then just sort all input files into one file
    # otherwise, split the files needed for mappers and reducers
    # and setup the task environment for each

    # The correctly-ordered list of (task_num, file_name) pairs
    file_tasks = sorted([(t['task_num'], file_name)
                         for file_name, t in file_splits.items()],
                        key=lambda t: t[0])

    for task_num, input_path in file_tasks:
        # make a new working_dir for each task
        working_dir = os.path.join(self._get_local_tmp_dir(),
                                   'job_local_dir', str(step_num),
                                   step_type, str(task_num))
        self._setup_working_dir(working_dir)

        log.debug("File name %s" % input_path)

        # setup environment variables
        split_kwargs = {}
        if step_type == 'mapper':
            # mappers have extra file split info
            split_kwargs = dict(
                input_file=file_splits[input_path]['orig_name'],
                input_start=file_splits[input_path]['start'],
                input_length=file_splits[input_path]['length'])

        env = self._subprocess_env(step_num, step_type, task_num,
                                   working_dir, **split_kwargs)

        output_path = os.path.join(
            self._get_local_tmp_dir(),
            outfile_prefix + '_part-%05d' % task_num)
        log.debug('Writing to %s' % output_path)

        self._run_step(step_num, step_type, input_path, output_path,
                       working_dir, env)

        self._prev_outfiles.append(output_path)

    self._per_step_runner_finish(step_num)

    counters = self._counters[step_num]
    if counters:
        log.info(_format_counters(counters))
def _log_counters(self, step_num):
    counters = self.counters()[step_num]
    if counters:
        log.info('\n%s\n' % _format_counters(counters))
def test_empty(self):
    self.assertEqual(_format_counters({}), 'Counters: 0')
def _invoke_step(self, step_num, step_type):
    """Run the mapper or reducer for the given step.
    """
    step = self._get_step(step_num)
    if step['type'] != 'streaming':
        raise Exception("LocalMRJobRunner cannot run %s steps" %
                        step['type'])

    jobconf = self._jobconf_for_step(step_num)

    outfile_prefix = 'step-%d-%s' % (step_num, step_type)

    # allow setting number of tasks from jobconf
    if step_type == 'reducer':
        num_tasks = int(jobconf_from_dict(
            jobconf, 'mapreduce.job.reduces', self._DEFAULT_REDUCE_TASKS))
    else:
        num_tasks = int(jobconf_from_dict(
            jobconf, 'mapreduce.job.maps', self._DEFAULT_MAP_TASKS))

    # get file splits for mappers and reducers
    keep_sorted = (step_type == 'reducer')
    file_splits = self._get_file_splits(
        self._step_input_paths(), num_tasks, keep_sorted=keep_sorted)

    # since we have grabbed the files from _prev_outfiles as input
    # to this step, reset _prev_outfiles
    self._prev_outfiles = []

    # Start the tasks associated with the step:
    # if we need to sort, then just sort all input files into one file
    # otherwise, split the files needed for mappers and reducers
    # and setup the task environment for each

    # The correctly-ordered list of (task_num, file_name) pairs
    file_tasks = sorted([
        (t['task_num'], file_name)
        for file_name, t in file_splits.items()], key=lambda t: t[0])

    for task_num, input_path in file_tasks:
        # make a new working_dir for each task
        working_dir = os.path.join(
            self._get_local_tmp_dir(), 'job_local_dir', str(step_num),
            step_type, str(task_num))
        self._setup_working_dir(working_dir)

        log.debug("File name %s" % input_path)

        # setup environment variables
        split_kwargs = {}
        if step_type == 'mapper':
            # mappers have extra file split info
            split_kwargs = dict(
                input_file=file_splits[input_path]['orig_name'],
                input_start=file_splits[input_path]['start'],
                input_length=file_splits[input_path]['length'])

        env = self._subprocess_env(
            step_num, step_type, task_num, working_dir, **split_kwargs)

        output_path = os.path.join(
            self._get_local_tmp_dir(),
            outfile_prefix + '_part-%05d' % task_num)
        log.info('writing to %s' % output_path)

        self._run_step(step_num, step_type, input_path, output_path,
                       working_dir, env)

        self._prev_outfiles.append(output_path)

    self.per_step_runner_finish(step_num)

    counters = self._counters[step_num]
    if counters:
        log.info(_format_counters(counters))