def __init__(self, **kwargs): """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(HadoopJobRunner, self).__init__(**kwargs) self._hadoop_tmp_dir = fully_qualify_hdfs_path( posixpath.join(self._opts['hadoop_tmp_dir'], self._job_key)) # Keep track of local files to upload to HDFS. We'll add them # to this manager just before we need them. hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(hdfs_files_dir) # Set output dir if it wasn't set explicitly self._output_dir = fully_qualify_hdfs_path( self._output_dir or posixpath.join(self._hadoop_tmp_dir, 'output')) # Track job and (YARN) application ID to enable log parsing self._application_id = None self._job_id = None # Keep track of where the hadoop streaming jar is self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar'] self._searched_for_hadoop_streaming_jar = False # Keep track of the status of each step that ran # # these are dictionaries with the same keys as # mrjob.logs.parse._parse_hadoop_streaming_log() self._steps_info = []
def __init__(self, **kwargs): """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(HadoopJobRunner, self).__init__(**kwargs) self._hdfs_tmp_dir = fully_qualify_hdfs_path( posixpath.join(self._opts['hdfs_scratch_dir'], self._job_name)) # Keep track of local files to upload to HDFS. We'll add them # to this manager just before we need them. hdfs_files_dir = posixpath.join(self._hdfs_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(hdfs_files_dir) # Set output dir if it wasn't set explicitly self._output_dir = fully_qualify_hdfs_path( self._output_dir or posixpath.join(self._hdfs_tmp_dir, 'output')) self._hadoop_log_dir = hadoop_log_dir(self._opts['hadoop_home']) # Running jobs via hadoop assigns a new timestamp to each job. # Running jobs via mrjob only adds steps. # Store both of these values to enable log parsing. self._job_timestamp = None self._start_step_num = 0 # init hadoop version cache self._hadoop_version = None
def __init__(self, **kwargs): """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(HadoopJobRunner, self).__init__(**kwargs) self._hadoop_tmp_dir = fully_qualify_hdfs_path( posixpath.join(self._opts['hadoop_tmp_dir'], self._job_key)) # Keep track of local files to upload to HDFS. We'll add them # to this manager just before we need them. hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(hdfs_files_dir) # Set output dir if it wasn't set explicitly self._output_dir = fully_qualify_hdfs_path( self._output_dir or posixpath.join(self._hadoop_tmp_dir, 'output')) # Track job and (YARN) application ID to enable log parsing self._application_id = None self._job_id = None # Keep track of where the hadoop streaming jar is self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar'] self._searched_for_hadoop_streaming_jar = False # List of dicts (one for each step) potentially containing # the keys 'history', 'step', and 'task' ('step' will always # be filled because it comes from the hadoop jar command output, # others will be filled as needed) self._log_interpretations = []
def __init__(self, **kwargs): """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(DataprocJobRunner, self).__init__(**kwargs) # Lazy-load gcloud config as needed - invocations fail in PyCharm # debugging self._gcloud_config = None # Google Cloud Platform - project self._gcp_project = (self._opts['gcp_project'] or self.gcloud_config()['core.project']) # Google Compute Engine - Region / Zone self._gce_region = (self._opts['region'] or self.gcloud_config()['compute.region']) self._gce_zone = (self._opts['zone'] or self.gcloud_config()['compute.zone']) # cluster_id can be None here self._cluster_id = self._opts['cluster_id'] self._api_client = None self._gcs_fs = None self._fs = None # BEGIN - setup directories base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir']) self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir) # use job key to make a unique tmp dir self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/' # pick/validate output dir if self._output_dir: self._output_dir = _check_and_fix_fs_dir(self._output_dir) else: self._output_dir = self._job_tmpdir + 'output/' # END - setup directories # manage local files that we want to upload to GCS. We'll add them # to this manager just before we need them. fs_files_dir = self._job_tmpdir + 'files/' self._upload_mgr = UploadDirManager(fs_files_dir) # when did our particular task start? self._dataproc_job_start = None # init hadoop, ami version caches self._image_version = None self._hadoop_version = None # This will be filled by _run_steps() # NOTE - log_interpretations will be empty except job_id until we # parse task logs self._log_interpretations = []
def test_unhide_files(self): # avoid giving names to files that Hadoop will ignore as input sd = UploadDirManager('hdfs:///') sd.add('.foo.log') sd.add('_bar.txt') self.assertEqual(sd.path_to_uri(), {'.foo.log': 'hdfs:///foo.log', '_bar.txt': 'hdfs:///bar.txt'})
def test_dot_underscore(self): sd = UploadDirManager('hdfs:///') sd.add('._') sd.add('._.txt') sd.add('._foo') self.assertEqual(sd.path_to_uri(), {'._': 'hdfs:///1', '._.txt': 'hdfs:///1.txt', '._foo': 'hdfs:///foo'})
def __init__(self, **kwargs): """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(HadoopJobRunner, self).__init__(**kwargs) self._hdfs_tmp_dir = fully_qualify_hdfs_path( posixpath.join( self._opts['hdfs_scratch_dir'], self._job_name)) # Keep track of local files to upload to HDFS. We'll add them # to this manager just before we need them. hdfs_files_dir = posixpath.join(self._hdfs_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(hdfs_files_dir) # Set output dir if it wasn't set explicitly self._output_dir = fully_qualify_hdfs_path( self._output_dir or posixpath.join(self._hdfs_tmp_dir, 'output')) self._hadoop_log_dir = hadoop_log_dir(self._opts['hadoop_home']) # Running jobs via hadoop assigns a new timestamp to each job. # Running jobs via mrjob only adds steps. # Store both of these values to enable log parsing. self._job_timestamp = None self._start_step_num = 0 # init hadoop version cache self._hadoop_version = None
def __init__(self, **kwargs): super(SparkMRJobRunner, self).__init__(**kwargs) self._spark_tmp_dir = self._pick_spark_tmp_dir() # where local files are uploaded into Spark if is_uri(self._spark_tmp_dir): spark_files_dir = posixpath.join(self._spark_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(spark_files_dir) # where to put job output (if not set explicitly) if not self._output_dir: self._output_dir = posixpath.join(self._spark_tmp_dir, 'output') # keep track of where the spark-submit binary is self._spark_submit_bin = self._opts['spark_submit_bin']
def __init__(self, **kwargs): """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(HadoopJobRunner, self).__init__(**kwargs) self._hadoop_tmp_dir = fully_qualify_hdfs_path( posixpath.join( self._opts['hadoop_tmp_dir'], self._job_key)) # Keep track of local files to upload to HDFS. We'll add them # to this manager just before we need them. hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(hdfs_files_dir) # Set output dir if it wasn't set explicitly self._output_dir = fully_qualify_hdfs_path( self._output_dir or posixpath.join(self._hadoop_tmp_dir, 'output')) # Track job and (YARN) application ID to enable log parsing self._application_id = None self._job_id = None # Keep track of where the hadoop streaming jar is self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar'] self._searched_for_hadoop_streaming_jar = False # List of dicts (one for each step) potentially containing # the keys 'history', 'step', and 'task' ('step' will always # be filled because it comes from the hadoop jar command output, # others will be filled as needed) self._log_interpretations = []
def __init__(self, **kwargs): """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(HadoopJobRunner, self).__init__(**kwargs) self._hadoop_tmp_dir = fully_qualify_hdfs_path( posixpath.join( self._opts['hadoop_tmp_dir'], self._job_key)) # Keep track of local files to upload to HDFS. We'll add them # to this manager just before we need them. hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(hdfs_files_dir) # Set output dir if it wasn't set explicitly self._output_dir = fully_qualify_hdfs_path( self._output_dir or posixpath.join(self._hadoop_tmp_dir, 'output')) # Track job and (YARN) application ID to enable log parsing self._application_id = None self._job_id = None # Keep track of where the hadoop streaming jar is self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar'] self._searched_for_hadoop_streaming_jar = False # Keep track of the status of each step that ran # # these are dictionaries with the same keys as # mrjob.logs.parse._parse_hadoop_streaming_log() self._steps_info = []
def __init__(self, **kwargs): """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(HadoopJobRunner, self).__init__(**kwargs) if self._opts['hadoop_home']: log.warning( 'hadoop_home is deprecated since 0.5.0 and will be removed' ' in v0.6.0. In most cases, mrjob will now find the hadoop' ' binary and streaming jar without help. If not, use the' ' hadoop_bin and hadoop_streaming_jar options.') self._hadoop_tmp_dir = fully_qualify_hdfs_path( posixpath.join(self._opts['hadoop_tmp_dir'], self._job_key)) # Keep track of local files to upload to HDFS. We'll add them # to this manager just before we need them. hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(hdfs_files_dir) # Set output dir if it wasn't set explicitly self._output_dir = fully_qualify_hdfs_path( self._output_dir or posixpath.join(self._hadoop_tmp_dir, 'output')) # Fully qualify step_output_dir, if set if self._step_output_dir: self._step_output_dir = fully_qualify_hdfs_path( self._step_output_dir) # Track job and (YARN) application ID to enable log parsing self._application_id = None self._job_id = None # Keep track of where the hadoop streaming jar is self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar'] self._searched_for_hadoop_streaming_jar = False # Keep track of where the spark-submit binary is self._spark_submit_bin = self._opts['spark_submit_bin'] # List of dicts (one for each step) potentially containing # the keys 'history', 'step', and 'task' ('step' will always # be filled because it comes from the hadoop jar command output, # others will be filled as needed) self._log_interpretations = []
def test_hidden_file_name_collision(self): sd = UploadDirManager('hdfs:///') sd.add('foo/_bar.py') sd.add('_bar.py') self.assertEqual(sd.path_to_uri(), {'foo/_bar.py': 'hdfs:///bar.py', '_bar.py': 'hdfs:///bar-1.py'})
def test_unknown_uri(self): sd = UploadDirManager('hdfs:///') sd.add('foo/bar.py') self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'}) self.assertEqual(sd.uri('hdfs://host/path/to/bar.py'), 'hdfs://host/path/to/bar.py') # checking unknown URIs doesn't add them self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
def test_underscores_only(self): sd = UploadDirManager('hdfs:///') sd.add('_') sd.add('_.txt') self.assertEqual(sd.path_to_uri(), {'_': 'hdfs:///1', '_.txt': 'hdfs:///1.txt'})
def test_unknown_uri(self): sd = UploadDirManager("hdfs:///") sd.add("foo/bar.py") self.assertEqual(sd.path_to_uri(), {"foo/bar.py": "hdfs:///bar.py"}) self.assertEqual(sd.uri("hdfs://host/path/to/bar.py"), "hdfs://host/path/to/bar.py") # checking unknown URIs doesn't add them self.assertEqual(sd.path_to_uri(), {"foo/bar.py": "hdfs:///bar.py"})
def test_name_collision(self): sd = UploadDirManager('hdfs:///') sd.add('foo/bar.py') sd.add('bar.py') self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py', 'bar.py': 'hdfs:///bar-1.py'})
def __init__(self, max_output_files=None, mrjob_cls=None, **kwargs): """Create a spark runner :param max_output_files: limit on number of output files when running streaming jobs. Can only be set on command line (not config file) :param mrjob_cls: class of the job you want to run. Used for running streaming steps in Spark SparkMRJobRunner ignores the keyword arguments *hadoop_input_format*, *hadoop_output_format*, and *sort_values* (see :py:meth:`MRJobRunner.__init__`). These are only set by the job as a way to communicate certain attributes to the runner, and the Spark runner instead inspects the job directly. """ # need to set this before checking steps in superclass __init__() self._mrjob_cls = mrjob_cls super(SparkMRJobRunner, self).__init__(**kwargs) self._max_output_files = max_output_files self._spark_tmp_dir = self._pick_spark_tmp_dir() # where local files are uploaded into Spark if is_uri(self._spark_tmp_dir): spark_files_dir = posixpath.join(self._spark_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(spark_files_dir) # where to put job output (if not set explicitly) if not self._output_dir: self._output_dir = self.fs.join(self._spark_tmp_dir, 'output') # keep track of where the spark-submit binary is self._spark_submit_bin = self._opts['spark_submit_bin'] # where to store a .zip file containing the MRJob, with a unique # module name self._job_script_zip_path = None # counters, one per job step. (Counters will be {} for non-streaming # steps because Spark doesn't have counters). self._counters = []
def __init__(self, **kwargs): """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(HadoopJobRunner, self).__init__(**kwargs) self._hdfs_tmp_dir = fully_qualify_hdfs_path( posixpath.join( self._opts['hdfs_scratch_dir'], self._job_name)) # Keep track of local files to upload to HDFS. We'll add them # to this manager just before we need them. hdfs_files_dir = posixpath.join(self._hdfs_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(hdfs_files_dir) # Set output dir if it wasn't set explicitly self._output_dir = fully_qualify_hdfs_path( self._output_dir or posixpath.join(self._hdfs_tmp_dir, 'output'))
def __init__(self, max_output_files=None, mrjob_cls=None, **kwargs): """Create a Spark runner. :param max_output_files: limit on number of output files when running streaming jobs. Can only be set on command line (not config file) :param mrjob_cls: class of the job you want to run. Used for running streaming steps in Spark """ # need to set this before checking steps in superclass __init__() self._mrjob_cls = mrjob_cls super(SparkMRJobRunner, self).__init__(**kwargs) self._max_output_files = max_output_files if self._opts['spark_tmp_dir']: self._check_spark_tmp_dir_opt() self._spark_tmp_dir = self._pick_spark_tmp_dir() # where local files are uploaded into Spark if is_uri(self._spark_tmp_dir): spark_files_dir = posixpath.join(self._spark_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(spark_files_dir) # where to put job output (if not set explicitly) if not self._output_dir: self._output_dir = self.fs.join(self._spark_tmp_dir, 'output') # keep track of where the spark-submit binary is self._spark_submit_bin = self._opts['spark_submit_bin'] # where to store a .zip file containing the MRJob, with a unique # module name self._job_script_zip_path = None # counters, one per job step. (Counters will be {} for non-streaming # steps because Spark doesn't have counters). self._counters = []
def uri_adds_trailing_slash(self): sd = UploadDirManager('s3://bucket/dir') sd.add('foo/bar.py') self.assertEqual(sd.uri('foo/bar.py'), 's3://bucket/dir/bar.py') self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 's3://bucket/dir/bar.py'})
class HadoopJobRunner(MRJobRunner, LogInterpretationMixin): """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster. Invoked when you run your job with ``-r hadoop``. Input and support files can be either local or on HDFS; use ``hdfs://...`` URLs to refer to files on HDFS. """ alias = 'hadoop' OPTION_STORE_CLASS = HadoopRunnerOptionStore def __init__(self, **kwargs): """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(HadoopJobRunner, self).__init__(**kwargs) self._hadoop_tmp_dir = fully_qualify_hdfs_path( posixpath.join( self._opts['hadoop_tmp_dir'], self._job_key)) # Keep track of local files to upload to HDFS. We'll add them # to this manager just before we need them. hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(hdfs_files_dir) # Set output dir if it wasn't set explicitly self._output_dir = fully_qualify_hdfs_path( self._output_dir or posixpath.join(self._hadoop_tmp_dir, 'output')) # Track job and (YARN) application ID to enable log parsing self._application_id = None self._job_id = None # Keep track of where the hadoop streaming jar is self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar'] self._searched_for_hadoop_streaming_jar = False # List of dicts (one for each step) potentially containing # the keys 'history', 'step', and 'task' ('step' will always # be filled because it comes from the hadoop jar command output, # others will be filled as needed) self._log_interpretations = [] @property def fs(self): """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local filesystem. """ if self._fs is None: self._fs = CompositeFilesystem( HadoopFilesystem(self._opts['hadoop_bin']), LocalFilesystem()) return self._fs def get_hadoop_version(self): """Invoke the hadoop executable to determine its version""" return self.fs.get_hadoop_version() def get_hadoop_bin(self): """Find the hadoop binary. A list: binary followed by arguments.""" return self.fs.get_hadoop_bin() def get_hadoop_streaming_jar(self): """Find the path of the hadoop streaming jar, or None if not found.""" if not (self._hadoop_streaming_jar or self._searched_for_hadoop_streaming_jar): self._hadoop_streaming_jar = self._find_hadoop_streaming_jar() if self._hadoop_streaming_jar: log.info('Found Hadoop streaming jar: %s' % self._hadoop_streaming_jar) else: log.warning('Hadoop streaming jar not found. Use' ' --hadoop-streaming-jar') self._searched_for_hadoop_streaming_jar = True return self._hadoop_streaming_jar def _find_hadoop_streaming_jar(self): """Search for the hadoop streaming jar. See :py:meth:`_hadoop_streaming_jar_dirs` for where we search.""" for path in unique(self._hadoop_streaming_jar_dirs()): log.info('Looking for Hadoop streaming jar in %s' % path) streaming_jars = [] for path in self.fs.ls(path): if HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)): streaming_jars.append(path) if streaming_jars: # prefer shorter names and shallower paths def sort_key(p): return (len(p.split('/')), len(posixpath.basename(p)), p) streaming_jars.sort(key=sort_key) return streaming_jars[0] return None def _hadoop_dirs(self): """Yield all possible hadoop directories (used for streaming jar and logs). 
May yield duplicates""" if self._opts['hadoop_home']: yield self._opts['hadoop_home'] for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL', 'HADOOP_MAPRED_HOME'): path = os.environ.get(name) if path: yield path # guess it from the path of the Hadoop binary hadoop_home = hadoop_prefix_from_bin(self.get_hadoop_bin()[0]) if hadoop_home: yield hadoop_home # try HADOOP_*_HOME for name, path in sorted(os.environ.items()): if name.startswith('HADOOP_') and name.endswith('_HOME'): yield path def _hadoop_streaming_jar_dirs(self): """Yield all possible places to look for the Hadoop streaming jar. May yield duplicates. """ for hadoop_dir in self._hadoop_dirs(): yield hadoop_dir # use hard-coded paths to work out-of-the-box on EMR for path in _EMR_HADOOP_STREAMING_JAR_DIRS: yield path def _hadoop_log_dirs(self, output_dir=None): """Yield all possible places to look for hadoop logs.""" # hadoop_log_dirs opt overrides all this if self._opts['hadoop_log_dirs']: for path in self._opts['hadoop_log_dirs']: yield path return hadoop_log_dir = os.environ.get('HADOOP_LOG_DIR') if hadoop_log_dir: yield hadoop_log_dir if uses_yarn(self.get_hadoop_version()): yarn_log_dir = os.environ.get('YARN_LOG_DIR') if yarn_log_dir: yield yarn_log_dir yield _DEFAULT_YARN_HDFS_LOG_DIR if output_dir: # Cloudera style of logging yield posixpath.join(output_dir, '_logs') for hadoop_dir in self._hadoop_dirs(): yield posixpath.join(hadoop_dir, 'logs') # hard-coded log paths for EMR, so this can work out-of-the-box for path in _EMR_HADOOP_LOG_DIRS: yield path def _run(self): self._check_input_exists() self._create_setup_wrapper_script() self._add_job_files_for_upload() self._upload_local_files_to_hdfs() self._run_job_in_hadoop() def _check_input_exists(self): """Make sure all input exists before continuing with our job. """ for path in self._input_paths: if path == '-': continue # STDIN always exists if self._opts['check_input_paths']: if not self.fs.exists(path): raise AssertionError( 'Input path %s does not exist!' 
% (path,)) def _add_job_files_for_upload(self): """Add files needed for running the job (setup and input) to self._upload_mgr.""" for path in self._get_input_paths(): self._upload_mgr.add(path) for path in self._working_dir_mgr.paths(): self._upload_mgr.add(path) def _upload_local_files_to_hdfs(self): """Copy files managed by self._upload_mgr to HDFS """ self.fs.mkdir(self._upload_mgr.prefix) log.info('Copying local files into %s' % self._upload_mgr.prefix) for path, uri in self._upload_mgr.path_to_uri().items(): self._upload_to_hdfs(path, uri) def _upload_to_hdfs(self, path, target): log.debug('Uploading %s -> %s on HDFS' % (path, target)) self.fs._put(path, target) def _dump_stdin_to_local_file(self): """Dump sys.stdin to a local file, and return the path to it.""" stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN') # prompt user, so they don't think the process has stalled log.info('reading from STDIN') log.debug('dumping stdin to local file %s' % stdin_path) stdin_file = open(stdin_path, 'wb') for line in self._stdin: stdin_file.write(line) return stdin_path def _run_job_in_hadoop(self): for step_num in range(self._num_steps()): step_args = self._args_for_step(step_num) # log this *after* _args_for_step(), which can start a search # for the Hadoop streaming jar log.info('Running step %d of %d' % (step_num + 1, self._num_steps())) log.debug('> %s' % cmd_line(step_args)) log_interpretation = {} self._log_interpretations.append(log_interpretation) # try to use a PTY if it's available try: pid, master_fd = pty.fork() except (AttributeError, OSError): # no PTYs, just use Popen # user won't get much feedback for a while, so tell them # Hadoop is running log.debug('No PTY available, using Popen() to invoke Hadoop') step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE) step_interpretation = _interpret_hadoop_jar_command_stderr( step_proc.stderr, record_callback=_log_record_from_hadoop) # there shouldn't be much output to STDOUT for line in step_proc.stdout: _log_line_from_hadoop(to_string(line).strip('\r\n')) step_proc.stdout.close() step_proc.stderr.close() returncode = step_proc.wait() else: # we have PTYs if pid == 0: # we are the child process os.execvp(step_args[0], step_args) else: log.debug('Invoking Hadoop via PTY') with os.fdopen(master_fd, 'rb') as master: # reading from master gives us the subprocess's # stderr and stdout (it's a fake terminal) step_interpretation = ( _interpret_hadoop_jar_command_stderr( master, record_callback=_log_record_from_hadoop)) _, returncode = os.waitpid(pid, 0) # make sure output_dir is filled if 'output_dir' not in step_interpretation: step_interpretation['output_dir'] = ( self._hdfs_step_output_dir(step_num)) log_interpretation['step'] = step_interpretation counters = self._pick_counters(log_interpretation) if counters: log.info(_format_counters(counters)) else: log.warning('No counters found') if returncode: error = self._pick_error(log_interpretation) if error: log.error('Probable cause of failure:\n\n%s\n' % _format_error(error)) # use CalledProcessError's well-known message format reason = str(CalledProcessError(returncode, step_args)) raise StepFailedException( reason=reason, step_num=step_num, num_steps=self._num_steps()) def _args_for_step(self, step_num): step = self._get_step(step_num) if step['type'] == 'streaming': return self._args_for_streaming_step(step_num) elif step['type'] == 'jar': return self._args_for_jar_step(step_num) else: raise AssertionError('Bad step type: %r' % (step['type'],)) def _args_for_streaming_step(self, 
step_num): hadoop_streaming_jar = self.get_hadoop_streaming_jar() if not hadoop_streaming_jar: raise Exception('no Hadoop streaming jar') args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar] # set up uploading from HDFS to the working dir args.extend( self._upload_args(self._upload_mgr)) # Add extra hadoop args first as hadoop args could be a hadoop # specific argument (e.g. -libjar) which must come before job # specific args. args.extend(self._hadoop_args_for_step(step_num)) mapper, combiner, reducer = ( self._hadoop_streaming_commands(step_num)) # if no reducer, shut off reducer tasks if not reducer: args.extend(['-D', ('%s=0' % translate_jobconf( 'mapreduce.job.reduces', self.get_hadoop_version()))]) # set up input for input_uri in self._hdfs_step_input_files(step_num): args.extend(['-input', input_uri]) # set up output args.append('-output') args.append(self._hdfs_step_output_dir(step_num)) args.append('-mapper') args.append(mapper) if combiner: args.append('-combiner') args.append(combiner) if reducer: args.append('-reducer') args.append(reducer) return args def _args_for_jar_step(self, step_num): step = self._get_step(step_num) # special case for consistency with EMR runner. # # This might look less like duplicated code if we ever # implement #780 (fetching jars from URIs) if step['jar'].startswith('file:///'): jar = step['jar'][7:] # keep leading slash else: jar = step['jar'] args = self.get_hadoop_bin() + ['jar', jar] if step.get('main_class'): args.append(step['main_class']) # TODO: merge with logic in mrjob/emr.py def interpolate(arg): if arg == mrjob.step.JarStep.INPUT: return ','.join(self._hdfs_step_input_files(step_num)) elif arg == mrjob.step.JarStep.OUTPUT: return self._hdfs_step_output_dir(step_num) else: return arg if step.get('args'): args.extend(interpolate(arg) for arg in step['args']) return args def _hdfs_step_input_files(self, step_num): """Get the hdfs:// URI for input for the given step.""" if step_num == 0: return [self._upload_mgr.uri(p) for p in self._get_input_paths()] else: return [posixpath.join( self._hadoop_tmp_dir, 'step-output', str(step_num))] def _hdfs_step_output_dir(self, step_num): if step_num == len(self._get_steps()) - 1: return self._output_dir else: return posixpath.join( self._hadoop_tmp_dir, 'step-output', str(step_num + 1)) def _cleanup_local_tmp(self): super(HadoopJobRunner, self)._cleanup_local_tmp() if self._hadoop_tmp_dir: log.info('deleting %s from HDFS' % self._hadoop_tmp_dir) try: self.fs.rm(self._hadoop_tmp_dir) except Exception as e: log.exception(e) ### LOG (implementation of LogInterpretationMixin) ### def _stream_history_log_dirs(self, output_dir=None): """Yield lists of directories to look for the history log in.""" for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)): if self.fs.exists(log_dir): log.info('Looking for history log in %s...' % log_dir) # logs aren't always in a subdir named history/ yield [log_dir] def _stream_task_log_dirs(self, application_id=None, output_dir=None): """Yield lists of directories to look for the task logs in.""" # Note: this is unlikely to be super-helpful on "real" (multi-node) # pre-YARN Hadoop because task logs aren't generally shipped to a # local directory. It's a start, anyways. See #1201. for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)): if application_id: path = self.fs.join(log_dir, 'userlogs', application_id) else: path = self.fs.join(log_dir, 'userlogs') if self.fs.exists(path): log.info('Looking for task syslogs in %s...' 
% path) yield [path] def counters(self): return [_pick_counters(log_interpretation) for log_interpretation in self._log_interpretations]
class HadoopJobRunner(MRJobRunner): """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster. Input and support files can be either local or on HDFS; use ``hdfs://...`` URLs to refer to files on HDFS. """ alias = 'hadoop' OPTION_STORE_CLASS = HadoopRunnerOptionStore def __init__(self, **kwargs): """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(HadoopJobRunner, self).__init__(**kwargs) self._hdfs_tmp_dir = fully_qualify_hdfs_path( posixpath.join(self._opts['hdfs_scratch_dir'], self._job_name)) # Keep track of local files to upload to HDFS. We'll add them # to this manager just before we need them. hdfs_files_dir = posixpath.join(self._hdfs_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(hdfs_files_dir) # Set output dir if it wasn't set explicitly self._output_dir = fully_qualify_hdfs_path( self._output_dir or posixpath.join(self._hdfs_tmp_dir, 'output')) self._hadoop_log_dir = hadoop_log_dir(self._opts['hadoop_home']) # Running jobs via hadoop assigns a new timestamp to each job. # Running jobs via mrjob only adds steps. # Store both of these values to enable log parsing. self._job_timestamp = None self._start_step_num = 0 # init hadoop version cache self._hadoop_version = None @property def fs(self): """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local filesystem. """ if self._fs is None: self._fs = CompositeFilesystem( HadoopFilesystem(self._opts['hadoop_bin']), LocalFilesystem()) return self._fs def get_hadoop_version(self): """Invoke the hadoop executable to determine its version""" if not self._hadoop_version: stdout = self.invoke_hadoop(['version'], return_stdout=True) if stdout: first_line = stdout.split('\n')[0] m = HADOOP_VERSION_RE.match(first_line) if m: self._hadoop_version = m.group('version') log.info("Using Hadoop version %s" % self._hadoop_version) return self._hadoop_version self._hadoop_version = '0.20.203' log.info("Unable to determine Hadoop version. Assuming 0.20.203.") return self._hadoop_version def _run(self): if self._opts['bootstrap_mrjob']: self._add_python_archive(self._create_mrjob_tar_gz()) self._check_input_exists() self._create_wrapper_script() self._add_job_files_for_upload() self._upload_local_files_to_hdfs() self._run_job_in_hadoop() def _check_input_exists(self): """Make sure all input exists before continuing with our job. """ for path in self._input_paths: if path == '-': continue # STDIN always exists if not self.path_exists(path): raise AssertionError('Input path %s does not exist!' 
% (path, )) def _add_job_files_for_upload(self): """Add files needed for running the job (setup and input) to self._upload_mgr.""" for path in self._get_input_paths(): self._upload_mgr.add(path) for path in self._working_dir_mgr.paths(): self._upload_mgr.add(path) def _upload_local_files_to_hdfs(self): """Copy files managed by self._upload_mgr to HDFS """ self._mkdir_on_hdfs(self._upload_mgr.prefix) log.info('Copying local files into %s' % self._upload_mgr.prefix) for path, uri in self._upload_mgr.path_to_uri().iteritems(): self._upload_to_hdfs(path, uri) def _mkdir_on_hdfs(self, path): log.debug('Making directory %s on HDFS' % path) self.invoke_hadoop(['fs', '-mkdir', path]) def _upload_to_hdfs(self, path, target): log.debug('Uploading %s -> %s on HDFS' % (path, target)) self.invoke_hadoop(['fs', '-put', path, target]) def _dump_stdin_to_local_file(self): """Dump sys.stdin to a local file, and return the path to it.""" stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN') # prompt user, so they don't think the process has stalled log.info('reading from STDIN') log.debug('dumping stdin to local file %s' % stdin_path) stdin_file = open(stdin_path, 'w') for line in self._stdin: stdin_file.write(line) return stdin_path def _run_job_in_hadoop(self): self._counters = [] steps = self._get_steps() for step_num, step in enumerate(steps): log.debug('running step %d of %d' % (step_num + 1, len(steps))) streaming_args = self._streaming_args(step, step_num, len(steps)) log.debug('> %s' % cmd_line(streaming_args)) # try to use a PTY if it's available try: pid, master_fd = pty.fork() except (AttributeError, OSError): # no PTYs, just use Popen step_proc = Popen(streaming_args, stdout=PIPE, stderr=PIPE) self._process_stderr_from_streaming(step_proc.stderr) # there shouldn't be much output to STDOUT for line in step_proc.stdout: log.error('STDOUT: ' + line.strip('\n')) returncode = step_proc.wait() else: # we have PTYs if pid == 0: # we are the child process os.execvp(streaming_args[0], streaming_args) else: master = os.fdopen(master_fd) # reading from master gives us the subprocess's # stderr and stdout (it's a fake terminal) self._process_stderr_from_streaming(master) _, returncode = os.waitpid(pid, 0) master.close() if returncode == 0: # parsing needs step number for whole job self._fetch_counters([step_num + self._start_step_num]) # printing needs step number relevant to this run of mrjob self.print_counters([step_num + 1]) else: msg = ('Job failed with return code %d: %s' % (returncode, streaming_args)) log.error(msg) # look for a Python traceback cause = self._find_probable_cause_of_failure( [step_num + self._start_step_num]) if cause: # log cause, and put it in exception cause_msg = [] # lines to log and put in exception cause_msg.append('Probable cause of failure (from %s):' % cause['log_file_uri']) cause_msg.extend( line.strip('\n') for line in cause['lines']) if cause['input_uri']: cause_msg.append('(while reading from %s)' % cause['input_uri']) for line in cause_msg: log.error(line) # add cause_msg to exception message msg += '\n' + '\n'.join(cause_msg) + '\n' raise CalledProcessError(returncode, streaming_args) def _process_stderr_from_streaming(self, stderr): def treat_eio_as_eof(iter): # on Linux, the PTY gives us a specific IOError when the # when the child process exits, rather than EOF. 
while True: try: yield iter.next() # okay for StopIteration to bubble up except IOError, e: if e.errno == errno.EIO: return else: raise for line in treat_eio_as_eof(stderr): line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2) log.info('HADOOP: ' + line) if 'Streaming Job Failed!' in line: raise Exception(line) # The job identifier is printed to stderr. We only want to parse it # once because we know how many steps we have and just want to know # what Hadoop thinks the first step's number is. m = HADOOP_JOB_TIMESTAMP_RE.match(line) if m and self._job_timestamp is None: self._job_timestamp = m.group('timestamp') self._start_step_num = int(m.group('step_num'))
class HadoopJobRunner(MRJobRunner): """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster. Invoked when you run your job with ``-r hadoop``. Input and support files can be either local or on HDFS; use ``hdfs://...`` URLs to refer to files on HDFS. """ alias = 'hadoop' OPTION_STORE_CLASS = HadoopRunnerOptionStore def __init__(self, **kwargs): """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(HadoopJobRunner, self).__init__(**kwargs) self._hadoop_tmp_dir = fully_qualify_hdfs_path( posixpath.join( self._opts['hadoop_tmp_dir'], self._job_key)) # Keep track of local files to upload to HDFS. We'll add them # to this manager just before we need them. hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(hdfs_files_dir) # Set output dir if it wasn't set explicitly self._output_dir = fully_qualify_hdfs_path( self._output_dir or posixpath.join(self._hadoop_tmp_dir, 'output')) # Running jobs via hadoop assigns a new timestamp to each job. # Running jobs via mrjob only adds steps. # Store both of these values to enable log parsing. self._job_timestamp = None self._start_step_num = 0 # Keep track of where the hadoop streaming jar is self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar'] self._searched_for_hadoop_streaming_jar = False @property def fs(self): """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local filesystem. """ if self._fs is None: self._fs = CompositeFilesystem( HadoopFilesystem(self._opts['hadoop_bin']), LocalFilesystem()) return self._fs def get_hadoop_version(self): """Invoke the hadoop executable to determine its version""" return self.fs.get_hadoop_version() def get_hadoop_bin(self): """Find the hadoop binary. A list: binary followed by arguments.""" return self.fs.get_hadoop_bin() def get_hadoop_streaming_jar(self): """Find the path of the hadoop streaming jar, or None if not found.""" if not (self._hadoop_streaming_jar or self._searched_for_hadoop_streaming_jar): self._hadoop_streaming_jar = self._find_hadoop_streaming_jar() if self._hadoop_streaming_jar: log.info('Found Hadoop streaming jar: %s' % self._hadoop_streaming_jar) else: log.warning('Hadoop streaming jar not found. Use' ' --hadoop-streaming-jar') self._searched_for_hadoop_streaming_jar = True return self._hadoop_streaming_jar def _find_hadoop_streaming_jar(self): """Search for the hadoop streaming jar. 
See :py:meth:`_hadoop_streaming_jar_dirs` for where we search.""" for path in unique(self._hadoop_streaming_jar_dirs()): log.info('Looking for Hadoop streaming jar in %s' % path) streaming_jars = [] for path in self.fs.ls(path): if HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)): streaming_jars.append(path) if streaming_jars: # prefer shorter names and shallower paths def sort_key(p): return (len(p.split('/')), len(posixpath.basename(p)), p) streaming_jars.sort(key=sort_key) return streaming_jars[0] return None def _hadoop_streaming_jar_dirs(self): """Yield all possible places to look for the Hadoop streaming jar.""" if self._opts['hadoop_home']: yield self._opts['hadoop_home'] for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL', 'HADOOP_MAPRED_HOME'): path = os.environ.get(name) if path: yield path # guess it from the path of the Hadoop binary hadoop_home = hadoop_prefix_from_bin(self.get_hadoop_bin()[0]) if hadoop_home: yield hadoop_home # try HADOOP_*_HOME for name, path in sorted(os.environ.items()): if name.startswith('HADOOP_') and name.endswith('_HOME'): yield path # use hard-coded paths to work out-of-the-box on EMR for path in _EMR_HADOOP_STREAMING_JAR_DIRS: yield path def _run(self): self._check_input_exists() self._create_setup_wrapper_script() self._add_job_files_for_upload() self._upload_local_files_to_hdfs() self._run_job_in_hadoop() def _check_input_exists(self): """Make sure all input exists before continuing with our job. """ for path in self._input_paths: if path == '-': continue # STDIN always exists if self._opts['check_input_paths']: if not self.fs.exists(path): raise AssertionError( 'Input path %s does not exist!' % (path,)) def _add_job_files_for_upload(self): """Add files needed for running the job (setup and input) to self._upload_mgr.""" for path in self._get_input_paths(): self._upload_mgr.add(path) for path in self._working_dir_mgr.paths(): self._upload_mgr.add(path) def _upload_local_files_to_hdfs(self): """Copy files managed by self._upload_mgr to HDFS """ self.fs.mkdir(self._upload_mgr.prefix) log.info('Copying local files into %s' % self._upload_mgr.prefix) for path, uri in self._upload_mgr.path_to_uri().items(): self._upload_to_hdfs(path, uri) def _upload_to_hdfs(self, path, target): log.debug('Uploading %s -> %s on HDFS' % (path, target)) self.fs._put(path, target) def _dump_stdin_to_local_file(self): """Dump sys.stdin to a local file, and return the path to it.""" stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN') # prompt user, so they don't think the process has stalled log.info('reading from STDIN') log.debug('dumping stdin to local file %s' % stdin_path) stdin_file = open(stdin_path, 'wb') for line in self._stdin: stdin_file.write(line) return stdin_path def _run_job_in_hadoop(self): self._counters = [] for step_num in range(self._num_steps()): log.debug('running step %d of %d' % (step_num + 1, self._num_steps())) step_args = self._args_for_step(step_num) log.debug('> %s' % cmd_line(step_args)) # try to use a PTY if it's available try: pid, master_fd = pty.fork() except (AttributeError, OSError): # no PTYs, just use Popen step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE) self._process_stderr_from_streaming(step_proc.stderr) # there shouldn't be much output to STDOUT for line in step_proc.stdout: log.error('STDOUT: ' + to_string(line.strip(b'\n'))) step_proc.stdout.close() step_proc.stderr.close() returncode = step_proc.wait() else: # we have PTYs if pid == 0: # we are the child process os.execvp(step_args[0], 
step_args) else: with os.fdopen(master_fd, 'rb') as master: # reading from master gives us the subprocess's # stderr and stdout (it's a fake terminal) self._process_stderr_from_streaming(master) _, returncode = os.waitpid(pid, 0) if returncode == 0: # parsing needs step number for whole job self._fetch_counters([step_num + self._start_step_num]) # printing needs step number relevant to this run of mrjob self.print_counters([step_num + 1]) else: msg = ('Job failed with return code %d: %s' % (returncode, step_args)) log.error(msg) # look for a Python traceback cause = self._find_probable_cause_of_failure( [step_num + self._start_step_num]) if cause: # log cause, and put it in exception cause_msg = [] # lines to log and put in exception cause_msg.append('Probable cause of failure (from %s):' % cause['log_file_uri']) cause_msg.extend(line.strip('\n') for line in cause['lines']) if cause['input_uri']: cause_msg.append('(while reading from %s)' % cause['input_uri']) for line in cause_msg: log.error(line) # add cause_msg to exception message msg += '\n' + '\n'.join(cause_msg) + '\n' raise CalledProcessError(returncode, step_args) def _process_stderr_from_streaming(self, stderr): def treat_eio_as_eof(iter): # on Linux, the PTY gives us a specific IOError when the # when the child process exits, rather than EOF. while True: try: yield next(iter) # okay for StopIteration to bubble up except IOError as e: if e.errno == errno.EIO: return else: raise for line in treat_eio_as_eof(stderr): line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2) log.info('HADOOP: ' + to_string(line)) if b'Streaming Job Failed!' in line: raise Exception(line) # The job identifier is printed to stderr. We only want to parse it # once because we know how many steps we have and just want to know # what Hadoop thinks the first step's number is. m = HADOOP_JOB_TIMESTAMP_RE.match(line) if m and self._job_timestamp is None: self._job_timestamp = m.group('timestamp') self._start_step_num = int(m.group('step_num')) def _args_for_step(self, step_num): step = self._get_step(step_num) if step['type'] == 'streaming': return self._args_for_streaming_step(step_num) elif step['type'] == 'jar': return self._args_for_jar_step(step_num) else: raise AssertionError('Bad step type: %r' % (step['type'],)) def _args_for_streaming_step(self, step_num): version = self.get_hadoop_version() hadoop_streaming_jar = self.get_hadoop_streaming_jar() if not hadoop_streaming_jar: raise Exception('no Hadoop streaming jar') args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar] # -files/-archives (generic options, new-style) if supports_new_distributed_cache_options(version): # set up uploading from HDFS to the working dir args.extend( self._upload_args(self._upload_mgr)) # Add extra hadoop args first as hadoop args could be a hadoop # specific argument (e.g. -libjar) which must come before job # specific args. 
args.extend(self._hadoop_args_for_step(step_num)) # set up input for input_uri in self._hdfs_step_input_files(step_num): args.extend(['-input', input_uri]) # set up output args.append('-output') args.append(self._hdfs_step_output_dir(step_num)) # -cacheFile/-cacheArchive (streaming options, old-style) if not supports_new_distributed_cache_options(version): # set up uploading from HDFS to the working dir args.extend( self._pre_0_20_upload_args(self._upload_mgr)) mapper, combiner, reducer = ( self._hadoop_streaming_commands(step_num)) args.append('-mapper') args.append(mapper) if combiner: args.append('-combiner') args.append(combiner) if reducer: args.append('-reducer') args.append(reducer) else: args.extend(['-jobconf', 'mapred.reduce.tasks=0']) return args def _args_for_jar_step(self, step_num): step = self._get_step(step_num) # special case for consistency with EMR runner. # # This might look less like duplicated code if we ever # implement #780 (fetching jars from URIs) if step['jar'].startswith('file:///'): jar = step['jar'][7:] # keep leading slash else: jar = step['jar'] args = self.get_hadoop_bin() + ['jar', jar] if step.get('main_class'): args.append(step['main_class']) # TODO: merge with logic in mrjob/emr.py def interpolate(arg): if arg == mrjob.step.JarStep.INPUT: return ','.join(self._hdfs_step_input_files(step_num)) elif arg == mrjob.step.JarStep.OUTPUT: return self._hdfs_step_output_dir(step_num) else: return arg if step.get('args'): args.extend(interpolate(arg) for arg in step['args']) return args def _hdfs_step_input_files(self, step_num): """Get the hdfs:// URI for input for the given step.""" if step_num == 0: return [self._upload_mgr.uri(p) for p in self._get_input_paths()] else: return [posixpath.join( self._hadoop_tmp_dir, 'step-output', str(step_num))] def _hdfs_step_output_dir(self, step_num): if step_num == len(self._get_steps()) - 1: return self._output_dir else: return posixpath.join( self._hadoop_tmp_dir, 'step-output', str(step_num + 1)) def _cleanup_local_tmp(self): super(HadoopJobRunner, self)._cleanup_local_tmp() if self._hadoop_tmp_dir: log.info('deleting %s from HDFS' % self._hadoop_tmp_dir) try: self.fs.rm(self._hadoop_tmp_dir) except Exception as e: log.exception(e) ### LOG FETCHING/PARSING ### def _enforce_path_regexp(self, paths, regexp, step_nums): """Helper for log fetching functions to filter out unwanted logs. Keyword arguments are checked against their corresponding regex groups. """ for path in paths: m = regexp.match(path) if (m and (step_nums is None or int(m.group('step_num')) in step_nums) and (self._job_timestamp is None or m.group('timestamp') == self._job_timestamp)): yield path def _ls_logs(self, log_type, step_nums=None): """List logs on the local filesystem by path relative to log root directory """ return [] # in YARN, you can just ask the yarn bin: # http://hortonworks.com/blog/simplifying-user-logs-management-and-access-in-yarn/ # noqa # TODO: redo this to look in # $HADOOP_LOG_DIR # dirname(hadoop_bin[0])/../logs # <output_dir>/_logs # ??? other places ??? def _fetch_counters(self, step_nums, skip_s3_wait=False): """Read Hadoop counters from local logs. 
Args: step_nums -- the steps belonging to us, so that we can ignore errors from other jobs run with the same timestamp """ uris = self._ls_logs('job', step_nums) new_counters = scan_for_counters_in_files(uris, self, self.get_hadoop_version()) # only include steps relevant to the current job for step_num in step_nums: self._counters.append(new_counters.get(step_num, {})) def counters(self): return self._counters def _find_probable_cause_of_failure(self, step_nums): task_attempt_logs = self._ls_logs('task') step_logs = self._ls_logs('step') job_logs = self._ls_logs('job') log.info('Scanning logs for probable cause of failure') return best_error_from_logs(self, task_attempt_logs, step_logs, job_logs)
class HadoopJobRunner(MRJobRunner): """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster. Invoked when you run your job with ``-r hadoop``. Input and support files can be either local or on HDFS; use ``hdfs://...`` URLs to refer to files on HDFS. """ alias = 'hadoop' OPTION_STORE_CLASS = HadoopRunnerOptionStore def __init__(self, **kwargs): """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(HadoopJobRunner, self).__init__(**kwargs) self._hadoop_tmp_dir = fully_qualify_hdfs_path( posixpath.join(self._opts['hadoop_tmp_dir'], self._job_key)) # Keep track of local files to upload to HDFS. We'll add them # to this manager just before we need them. hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(hdfs_files_dir) # Set output dir if it wasn't set explicitly self._output_dir = fully_qualify_hdfs_path( self._output_dir or posixpath.join(self._hadoop_tmp_dir, 'output')) # Track job and (YARN) application ID to enable log parsing self._application_id = None self._job_id = None # Keep track of where the hadoop streaming jar is self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar'] self._searched_for_hadoop_streaming_jar = False # Keep track of the status of each step that ran # # these are dictionaries with the same keys as # mrjob.logs.parse._parse_hadoop_streaming_log() self._steps_info = [] @property def fs(self): """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local filesystem. """ if self._fs is None: self._fs = CompositeFilesystem( HadoopFilesystem(self._opts['hadoop_bin']), LocalFilesystem()) return self._fs def get_hadoop_version(self): """Invoke the hadoop executable to determine its version""" return self.fs.get_hadoop_version() def get_hadoop_bin(self): """Find the hadoop binary. A list: binary followed by arguments.""" return self.fs.get_hadoop_bin() def get_hadoop_streaming_jar(self): """Find the path of the hadoop streaming jar, or None if not found.""" if not (self._hadoop_streaming_jar or self._searched_for_hadoop_streaming_jar): self._hadoop_streaming_jar = self._find_hadoop_streaming_jar() if self._hadoop_streaming_jar: log.info('Found Hadoop streaming jar: %s' % self._hadoop_streaming_jar) else: log.warning('Hadoop streaming jar not found. Use' ' --hadoop-streaming-jar') self._searched_for_hadoop_streaming_jar = True return self._hadoop_streaming_jar def _find_hadoop_streaming_jar(self): """Search for the hadoop streaming jar. See :py:meth:`_hadoop_streaming_jar_dirs` for where we search.""" for path in unique(self._hadoop_streaming_jar_dirs()): log.info('Looking for Hadoop streaming jar in %s' % path) streaming_jars = [] for path in self.fs.ls(path): if HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)): streaming_jars.append(path) if streaming_jars: # prefer shorter names and shallower paths def sort_key(p): return (len(p.split('/')), len(posixpath.basename(p)), p) streaming_jars.sort(key=sort_key) return streaming_jars[0] return None def _hadoop_dirs(self): """Yield all possible hadoop directories (used for streaming jar and logs). 
May yield duplicates""" if self._opts['hadoop_home']: yield self._opts['hadoop_home'] for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL', 'HADOOP_MAPRED_HOME'): path = os.environ.get(name) if path: yield path # guess it from the path of the Hadoop binary hadoop_home = hadoop_prefix_from_bin(self.get_hadoop_bin()[0]) if hadoop_home: yield hadoop_home # try HADOOP_*_HOME for name, path in sorted(os.environ.items()): if name.startswith('HADOOP_') and name.endswith('_HOME'): yield path def _hadoop_streaming_jar_dirs(self): """Yield all possible places to look for the Hadoop streaming jar. May yield duplicates. """ for hadoop_dir in self._hadoop_dirs(): yield hadoop_dir # use hard-coded paths to work out-of-the-box on EMR for path in _EMR_HADOOP_STREAMING_JAR_DIRS: yield path def _hadoop_log_dirs(self, output_dir=None): """Yield all possible places to look for hadoop logs.""" # hadoop_log_dirs opt overrides all this if self._opts['hadoop_log_dirs']: for path in self._opts['hadoop_log_dirs']: yield path return hadoop_log_dir = os.environ.get('HADOOP_LOG_DIR') if hadoop_log_dir: yield hadoop_log_dir if uses_yarn(self.get_hadoop_version()): yarn_log_dir = os.environ.get('YARN_LOG_DIR') if yarn_log_dir: yield yarn_log_dir if output_dir: # Cloudera style of logging yield posixpath.join(output_dir, '_logs') for hadoop_dir in self._hadoop_dirs(): yield posixpath.join(hadoop_dir, 'logs') # hard-coded log paths for EMR, so this can work out-of-the-box for path in _EMR_HADOOP_LOG_DIRS: yield path def _run(self): self._check_input_exists() self._create_setup_wrapper_script() self._add_job_files_for_upload() self._upload_local_files_to_hdfs() self._run_job_in_hadoop() def _check_input_exists(self): """Make sure all input exists before continuing with our job. """ for path in self._input_paths: if path == '-': continue # STDIN always exists if self._opts['check_input_paths']: if not self.fs.exists(path): raise AssertionError('Input path %s does not exist!' 
% (path, )) def _add_job_files_for_upload(self): """Add files needed for running the job (setup and input) to self._upload_mgr.""" for path in self._get_input_paths(): self._upload_mgr.add(path) for path in self._working_dir_mgr.paths(): self._upload_mgr.add(path) def _upload_local_files_to_hdfs(self): """Copy files managed by self._upload_mgr to HDFS """ self.fs.mkdir(self._upload_mgr.prefix) log.info('Copying local files into %s' % self._upload_mgr.prefix) for path, uri in self._upload_mgr.path_to_uri().items(): self._upload_to_hdfs(path, uri) def _upload_to_hdfs(self, path, target): log.debug('Uploading %s -> %s on HDFS' % (path, target)) self.fs._put(path, target) def _dump_stdin_to_local_file(self): """Dump sys.stdin to a local file, and return the path to it.""" stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN') # prompt user, so they don't think the process has stalled log.info('reading from STDIN') log.debug('dumping stdin to local file %s' % stdin_path) stdin_file = open(stdin_path, 'wb') for line in self._stdin: stdin_file.write(line) return stdin_path def _run_job_in_hadoop(self): for step_num in range(self._num_steps()): step_args = self._args_for_step(step_num) # log this *after* _args_for_step(), which can start a search # for the Hadoop streaming jar log.info('Running step %d of %d' % (step_num + 1, self._num_steps())) log.debug('> %s' % cmd_line(step_args)) # try to use a PTY if it's available try: pid, master_fd = pty.fork() except (AttributeError, OSError): # no PTYs, just use Popen # user won't get much feedback for a while, so tell them # Hadoop is running log.debug('No PTY available, using Popen() to invoke Hadoop') step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE) step_info = _process_stderr_from_streaming(step_proc.stderr) # there shouldn't be much output to STDOUT for line in step_proc.stdout: _log_line_from_hadoop(to_string(line).strip('\r\n')) step_proc.stdout.close() step_proc.stderr.close() returncode = step_proc.wait() else: # we have PTYs if pid == 0: # we are the child process os.execvp(step_args[0], step_args) else: log.debug('Invoking Hadoop via PTY') with os.fdopen(master_fd, 'rb') as master: # reading from master gives us the subprocess's # stderr and stdout (it's a fake terminal) step_info = _process_stderr_from_streaming( _wrap_streaming_pty_output(master)) _, returncode = os.waitpid(pid, 0) # make sure output_dir is filled if not step_info['output_dir']: step_info['output_dir'] = self._hdfs_step_output_dir(step_num) if not step_info['counters']: pass # TODO: fetch counters; see _fetch_counters() self._steps_info.append(step_info) # just print counters for this one step self._print_counters(step_nums=[step_num]) if returncode: err_lines = [ 'Job failed with return code %d: %s' % (returncode, cmd_line(step_args)) ] cause = self._find_probable_cause_of_failure(**step_info) if cause: err_lines.append('') # pad with empty line err_lines.extend(_format_cause_of_failure(cause)) for err_line in err_lines: log.error(err_line) raise Exception('\n'.join(err_lines) + '\n') def _args_for_step(self, step_num): step = self._get_step(step_num) if step['type'] == 'streaming': return self._args_for_streaming_step(step_num) elif step['type'] == 'jar': return self._args_for_jar_step(step_num) else: raise AssertionError('Bad step type: %r' % (step['type'], )) def _args_for_streaming_step(self, step_num): version = self.get_hadoop_version() hadoop_streaming_jar = self.get_hadoop_streaming_jar() if not hadoop_streaming_jar: raise Exception('no Hadoop 
streaming jar') args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar] # -files/-archives (generic options, new-style) if supports_new_distributed_cache_options(version): # set up uploading from HDFS to the working dir args.extend(self._upload_args(self._upload_mgr)) # Add extra hadoop args first as hadoop args could be a hadoop # specific argument (e.g. -libjar) which must come before job # specific args. args.extend(self._hadoop_args_for_step(step_num)) # set up input for input_uri in self._hdfs_step_input_files(step_num): args.extend(['-input', input_uri]) # set up output args.append('-output') args.append(self._hdfs_step_output_dir(step_num)) # -cacheFile/-cacheArchive (streaming options, old-style) if not supports_new_distributed_cache_options(version): # set up uploading from HDFS to the working dir args.extend(self._pre_0_20_upload_args(self._upload_mgr)) mapper, combiner, reducer = (self._hadoop_streaming_commands(step_num)) args.append('-mapper') args.append(mapper) if combiner: args.append('-combiner') args.append(combiner) if reducer: args.append('-reducer') args.append(reducer) else: args.extend(['-jobconf', 'mapred.reduce.tasks=0']) return args def _args_for_jar_step(self, step_num): step = self._get_step(step_num) # special case for consistency with EMR runner. # # This might look less like duplicated code if we ever # implement #780 (fetching jars from URIs) if step['jar'].startswith('file:///'): jar = step['jar'][7:] # keep leading slash else: jar = step['jar'] args = self.get_hadoop_bin() + ['jar', jar] if step.get('main_class'): args.append(step['main_class']) # TODO: merge with logic in mrjob/emr.py def interpolate(arg): if arg == mrjob.step.JarStep.INPUT: return ','.join(self._hdfs_step_input_files(step_num)) elif arg == mrjob.step.JarStep.OUTPUT: return self._hdfs_step_output_dir(step_num) else: return arg if step.get('args'): args.extend(interpolate(arg) for arg in step['args']) return args def _hdfs_step_input_files(self, step_num): """Get the hdfs:// URI for input for the given step.""" if step_num == 0: return [self._upload_mgr.uri(p) for p in self._get_input_paths()] else: return [ posixpath.join(self._hadoop_tmp_dir, 'step-output', str(step_num)) ] def _hdfs_step_output_dir(self, step_num): if step_num == len(self._get_steps()) - 1: return self._output_dir else: return posixpath.join(self._hadoop_tmp_dir, 'step-output', str(step_num + 1)) def _cleanup_local_tmp(self): super(HadoopJobRunner, self)._cleanup_local_tmp() if self._hadoop_tmp_dir: log.info('deleting %s from HDFS' % self._hadoop_tmp_dir) try: self.fs.rm(self._hadoop_tmp_dir) except Exception as e: log.exception(e) ### LOG FETCHING/PARSING ### def _find_probable_cause_of_failure(self, application_id=None, job_id=None, output_dir=None, **ignored): """Find probable cause of failure. Currently we just scan task logs. On YARN, you must set application_id, and pre-YARN, you must set job_id. """ # package up logs for _find_error_intask_logs(), # and log where we're looking. hadoop_version = self.get_hadoop_version() yarn = uses_yarn(hadoop_version) if yarn and application_id is None: log.warning("No application ID!") return None if not yarn and job_id is None: log.warning("No job ID!") return None # Note: this is unlikely to be super-helpful on "real" (multi-node) # pre-YARN Hadoop because task logs aren't generally shipped to a local # directory. It's a start, anyways. See #1201. 
def stream_task_log_dirs(): for log_dir in unique( self._hadoop_log_dirs(output_dir=output_dir)): if yarn: path = self.fs.join(log_dir, 'userlogs', application_id) else: # sometimes pre-YARN attempt logs are organized by job_id, # sometimes not. Play it safe path = self.fs.join(log_dir, 'userlogs') if self.fs.exists(path): log.info('looking for logs in %s' % path) yield [path] return _find_error_in_task_logs(self.fs, stream_task_log_dirs(), hadoop_version, application_id=application_id, job_id=job_id) # TODO: catch timeouts, etc. # TODO: redo this def _fetch_counters(self, step_nums, skip_s3_wait=False): """Read Hadoop counters from local logs. Args: step_nums -- the steps belonging to us, so that we can ignore errors from other jobs run with the same timestamp """ uris = self._ls_logs('job', step_nums) new_counters = scan_for_counters_in_files(uris, self, self.get_hadoop_version()) # only include steps relevant to the current job for step_num in step_nums: self._counters.append(new_counters.get(step_num, {})) def counters(self): return [step_info['counters'] for step_info in self._steps_info]
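# A self-contained sketch of the PTY-or-Popen pattern that _run_job_in_hadoop()
# above uses to stream the hadoop command's output line by line. The command
# and the handle_line() callback are placeholders, not mrjob names; unlike the
# runner, the fallback branch below merges stderr into stdout for simplicity.
import os
from subprocess import Popen, PIPE, STDOUT

try:
    import pty  # POSIX-only
except ImportError:
    pty = None


def run_and_stream(args, handle_line):
    """Run *args*, pass each output line to handle_line(), return exit code."""
    pid = None
    if pty is not None:
        try:
            pid, master_fd = pty.fork()
        except OSError:
            pid = None

    if pid is None:
        # no PTY available; fall back to a plain pipe
        proc = Popen(args, stdout=PIPE, stderr=STDOUT)
        for line in proc.stdout:
            handle_line(line.decode('utf-8', 'replace'))
        return proc.wait()

    if pid == 0:
        # child: become the command; _exit() so a failed exec can't fall
        # through into the parent's code
        try:
            os.execvp(args[0], args)
        finally:
            os._exit(127)
    else:
        # parent: the PTY master interleaves the child's stdout and stderr
        try:
            with os.fdopen(master_fd, 'rb') as master:
                for line in master:
                    handle_line(line.decode('utf-8', 'replace'))
        except OSError:
            # on Linux, reading the master raises EIO once the child exits
            pass
        _, status = os.waitpid(pid, 0)
        return os.WEXITSTATUS(status) if os.WIFEXITED(status) else -1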
def test_uri(self):
    sd = UploadDirManager('hdfs:///')
    sd.add('foo/bar.py')
    self.assertEqual(sd.uri('foo/bar.py'), 'hdfs:///bar.py')
def __init__(self, **kwargs): """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(DataprocJobRunner, self).__init__(**kwargs) # Dataproc requires a master and >= 2 core instances # num_core_instances refers ONLY to number of CORE instances and does # NOT include the required 1 instance for master # In other words, minimum cluster size is 3 machines, 1 master and 2 # "num_core_instances" workers if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS: raise DataprocException( 'Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS) if (self._opts['core_instance_type'] != self._opts['task_instance_type']): raise DataprocException( 'Dataproc v1 expects core/task instance types to be identical') # Lazy-load gcloud config as needed - invocations fail in PyCharm # debugging self._gcloud_config = None # Google Cloud Platform - project self._gcp_project = ( self._opts['gcp_project'] or self.gcloud_config()['core.project']) # Google Compute Engine - Region / Zone self._gce_region = ( self._opts['region'] or self.gcloud_config()['compute.region']) self._gce_zone = ( self._opts['zone'] or self.gcloud_config()['compute.zone']) # cluster_id can be None here self._cluster_id = self._opts['cluster_id'] self._api_client = None self._gcs_fs = None self._fs = None # BEGIN - setup directories base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir']) self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir) # use job key to make a unique tmp dir self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/' # pick/validate output dir if self._output_dir: self._output_dir = _check_and_fix_fs_dir(self._output_dir) else: self._output_dir = self._job_tmpdir + 'output/' # END - setup directories # manage local files that we want to upload to GCS. We'll add them # to this manager just before we need them. fs_files_dir = self._job_tmpdir + 'files/' self._upload_mgr = UploadDirManager(fs_files_dir) # when did our particular task start? self._dataproc_job_start = None # init hadoop, ami version caches self._image_version = None self._hadoop_version = None # This will be filled by _run_steps() # NOTE - log_interpretations will be empty except job_id until we # parse task logs self._log_interpretations = []
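# Tiny illustration of how the Dataproc runner composes its GCS scratch layout
# from the chosen cloud tmp dir and the job key. The concrete values below are
# made up for the example; only the string composition comes from the
# __init__ above.
cloud_tmp_dir = 'gs://mrjob-us-central1-0123456789abcdef/tmp/'   # hypothetical
job_key = 'mr_word_count.dave.20160423.012345.678901'            # hypothetical

job_tmpdir = cloud_tmp_dir + job_key + '/'
output_dir = job_tmpdir + 'output/'   # default when no output dir is given
files_dir = job_tmpdir + 'files/'     # prefix handed to UploadDirManager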
def test_simple(self):
    sd = UploadDirManager("hdfs:///")
    sd.add("foo/bar.py")
    self.assertEqual(sd.path_to_uri(), {"foo/bar.py": "hdfs:///bar.py"})
def test_empty(self):
    sd = UploadDirManager("hdfs:///")
    self.assertEqual(sd.path_to_uri(), {})
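# The tests above and below pin down the UploadDirManager contract: add()
# registers a local path, uri() maps it to <prefix><basename>, path_to_uri()
# returns the whole mapping, and re-adding a path changes nothing. A minimal
# sketch that satisfies just these tests; it deliberately ignores the harder
# case of distinct paths that share a basename, which these tests don't
# exercise. This is an illustration, not mrjob's implementation.
import os


class MinimalUploadDirManager(object):
    def __init__(self, prefix):
        self.prefix = prefix
        self._path_to_name = {}

    def add(self, path):
        # idempotent: keep the name we assigned the first time
        if path not in self._path_to_name:
            self._path_to_name[path] = os.path.basename(path)

    def uri(self, path):
        return self.prefix + self._path_to_name[path]

    def path_to_uri(self):
        return {path: self.uri(path) for path in self._path_to_name}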
def __init__(self, **kwargs): """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(DataprocJobRunner, self).__init__(**kwargs) # Lazy-load gcloud config as needed - invocations fail in PyCharm # debugging self._gcloud_config = None # Google Cloud Platform - project self._gcp_project = ( self._opts['gcp_project'] or self.gcloud_config()['core.project']) # Google Compute Engine - Region / Zone self._gce_region = ( self._opts['region'] or self.gcloud_config()['compute.region']) self._gce_zone = ( self._opts['zone'] or self.gcloud_config()['compute.zone']) # cluster_id can be None here self._cluster_id = self._opts['cluster_id'] self._api_client = None self._gcs_fs = None self._fs = None # BEGIN - setup directories base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir']) self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir) # use job key to make a unique tmp dir self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/' # pick/validate output dir if self._output_dir: self._output_dir = _check_and_fix_fs_dir(self._output_dir) else: self._output_dir = self._job_tmpdir + 'output/' # END - setup directories # manage working dir for bootstrap script self._bootstrap_dir_mgr = BootstrapWorkingDirManager() # manage local files that we want to upload to GCS. We'll add them # to this manager just before we need them. fs_files_dir = self._job_tmpdir + 'files/' self._upload_mgr = UploadDirManager(fs_files_dir) self._bootstrap = self._bootstrap_python() + self._parse_bootstrap() for cmd in self._bootstrap: for maybe_path_dict in cmd: if isinstance(maybe_path_dict, dict): self._bootstrap_dir_mgr.add(**maybe_path_dict) # we'll create the script later self._master_bootstrap_script_path = None # when did our particular task start? self._dataproc_job_start = None # init hadoop, ami version caches self._image_version = None self._hadoop_version = None # This will be filled by _run_steps() # NOTE - log_interpretations will be empty except job_id until we # parse task logs self._log_interpretations = []
class DataprocJobRunner(MRJobRunner): """Runs an :py:class:`~mrjob.job.MRJob` on Google Cloud Dataproc. Invoked when you run your job with ``-r dataproc``. :py:class:`DataprocJobRunner` runs your job in an Dataproc cluster, which is basically a temporary Hadoop cluster. Input, support, and jar files can be either local or on GCS; use ``gs://...`` URLs to refer to files on GCS. This class has some useful utilities for talking directly to GCS and Dataproc, so you may find it useful to instantiate it without a script:: from mrjob.dataproc import DataprocJobRunner ... """ alias = 'dataproc' # Don't need to bootstrap mrjob in the setup wrapper; that's what # the bootstrap script is for! BOOTSTRAP_MRJOB_IN_SETUP = False OPTION_STORE_CLASS = DataprocRunnerOptionStore def __init__(self, **kwargs): """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(DataprocJobRunner, self).__init__(**kwargs) # Lazy-load gcloud config as needed - invocations fail in PyCharm # debugging self._gcloud_config = None # Google Cloud Platform - project self._gcp_project = ( self._opts['gcp_project'] or self.gcloud_config()['core.project']) # Google Compute Engine - Region / Zone self._gce_region = ( self._opts['region'] or self.gcloud_config()['compute.region']) self._gce_zone = ( self._opts['zone'] or self.gcloud_config()['compute.zone']) # cluster_id can be None here self._cluster_id = self._opts['cluster_id'] self._api_client = None self._gcs_fs = None self._fs = None # BEGIN - setup directories base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir']) self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir) # use job key to make a unique tmp dir self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/' # pick/validate output dir if self._output_dir: self._output_dir = _check_and_fix_fs_dir(self._output_dir) else: self._output_dir = self._job_tmpdir + 'output/' # END - setup directories # manage working dir for bootstrap script self._bootstrap_dir_mgr = BootstrapWorkingDirManager() # manage local files that we want to upload to GCS. We'll add them # to this manager just before we need them. fs_files_dir = self._job_tmpdir + 'files/' self._upload_mgr = UploadDirManager(fs_files_dir) self._bootstrap = self._bootstrap_python() + self._parse_bootstrap() for cmd in self._bootstrap: for maybe_path_dict in cmd: if isinstance(maybe_path_dict, dict): self._bootstrap_dir_mgr.add(**maybe_path_dict) # we'll create the script later self._master_bootstrap_script_path = None # when did our particular task start? self._dataproc_job_start = None # init hadoop, ami version caches self._image_version = None self._hadoop_version = None # This will be filled by _run_steps() # NOTE - log_interpretations will be empty except job_id until we # parse task logs self._log_interpretations = [] def gcloud_config(self): """Lazy load gcloud SDK configs""" if not self._gcloud_config: self._gcloud_config = _read_gcloud_config() return self._gcloud_config @property def api_client(self): if not self._api_client: credentials = GoogleCredentials.get_application_default() api_client = discovery.build( _DATAPROC_API_ENDPOINT, _DATAPROC_API_VERSION, credentials=credentials) self._api_client = api_client.projects().regions() return self._api_client @property def fs(self): """:py:class:`~mrjob.fs.base.Filesystem` object for SSH, S3, GCS, and the local filesystem. 
""" if self._fs is not None: return self._fs self._gcs_fs = GCSFilesystem() self._fs = CompositeFilesystem(self._gcs_fs, LocalFilesystem()) return self._fs def _get_tmpdir(self, given_tmpdir): """Helper for _fix_tmpdir""" if given_tmpdir: return given_tmpdir mrjob_buckets = self.fs.list_buckets( self._gcp_project, prefix='mrjob-') # Loop over buckets until we find one that matches region # NOTE - because this is a tmpdir, we look for a GCS bucket in the # same GCE region chosen_bucket_name = None gce_lower_location = self._gce_region.lower() for tmp_bucket in mrjob_buckets: tmp_bucket_name = tmp_bucket['name'] # NOTE - GCP ambiguous Behavior - Bucket location is being # returned as UPPERCASE, ticket filed as of Apr 23, 2016 as docs # suggest lowercase lower_location = tmp_bucket['location'].lower() if lower_location == gce_lower_location: # Regions are both specified and match log.info("using existing temp bucket %s" % tmp_bucket_name) chosen_bucket_name = tmp_bucket_name break # Example default - "mrjob-us-central1-RANDOMHEX" if not chosen_bucket_name: chosen_bucket_name = '-'.join( ['mrjob', gce_lower_location, random_identifier()]) return 'gs://%s/tmp/' % chosen_bucket_name def _run(self): self._launch() self._run_steps() def _launch(self): self._prepare_for_launch() self._launch_cluster() def _prepare_for_launch(self): self._check_input_exists() self._check_output_not_exists() self._create_setup_wrapper_script() self._add_bootstrap_files_for_upload() self._add_job_files_for_upload() self._upload_local_files_to_fs() def _check_input_exists(self): """Make sure all input exists before continuing with our job. """ if not self._opts['check_input_paths']: return for path in self._input_paths: if path == '-': continue # STDIN always exists if is_uri(path) and not is_gcs_uri(path): continue # can't check non-GCS URIs, hope for the best if not self.fs.exists(path): raise AssertionError( 'Input path %s does not exist!' % (path,)) def _check_output_not_exists(self): """Verify the output path does not already exist. This avoids provisioning a cluster only to have Hadoop refuse to launch. """ if self.fs.exists(self._output_dir): raise IOError( 'Output path %s already exists!' % (self._output_dir,)) def _add_bootstrap_files_for_upload(self): """Add files needed by the bootstrap script to self._upload_mgr. Tar up mrjob if bootstrap_mrjob is True. Create the master bootstrap script if necessary. 
""" # lazily create mrjob.zip if self._bootstrap_mrjob(): self._create_mrjob_zip() self._bootstrap_dir_mgr.add('file', self._mrjob_zip_path) # all other files needed by the script are already in # _bootstrap_dir_mgr for path in self._bootstrap_dir_mgr.paths(): self._upload_mgr.add(path) # now that we know where the above files live, we can create # the master bootstrap script self._create_master_bootstrap_script_if_needed() if self._master_bootstrap_script_path: self._upload_mgr.add(self._master_bootstrap_script_path) self._upload_mgr.add(_MAX_HOURS_IDLE_BOOTSTRAP_ACTION_PATH) def _add_job_files_for_upload(self): """Add files needed for running the job (setup and input) to self._upload_mgr.""" for path in self._get_input_paths(): self._upload_mgr.add(path) for path in self._working_dir_mgr.paths(): self._upload_mgr.add(path) # TODO - mtai @ davidmarin - hadoop_streaming_jar is currently ignored, # see _HADOOP_STREAMING_JAR_URI # if self._opts['hadoop_streaming_jar']: # self._upload_mgr.add(self._opts['hadoop_streaming_jar']) for step in self._get_steps(): if step.get('jar'): self._upload_mgr.add(step['jar']) def _upload_local_files_to_fs(self): """Copy local files tracked by self._upload_mgr to FS.""" bucket_name, _ = parse_gcs_uri(self._job_tmpdir) self._create_fs_tmp_bucket(bucket_name) log.info('Copying non-input files into %s' % self._upload_mgr.prefix) for path, gcs_uri in self._upload_mgr.path_to_uri().items(): log.debug('uploading %s -> %s' % (path, gcs_uri)) # TODO - mtai @ davidmarin - Implement put function for other FSs self.fs.put(path, gcs_uri) self._wait_for_fs_sync() def _create_fs_tmp_bucket(self, bucket_name, location=None): """Create a temp bucket if missing Tie the temporary bucket to the same region as the GCE job and set a 28-day TTL """ # Return early if our bucket already exists try: self.fs.get_bucket(bucket_name) return except google_errors.HttpError as e: if not e.resp.status == 404: raise log.info('creating FS bucket %r' % bucket_name) location = location or self._gce_region # NOTE - By default, we create a bucket in the same GCE region as our # job (tmp buckets ONLY) # https://cloud.google.com/storage/docs/bucket-locations self.fs.create_bucket( self._gcp_project, bucket_name, location=location, object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS) self._wait_for_fs_sync() ### Running the job ### def cleanup(self, mode=None): super(DataprocJobRunner, self).cleanup(mode=mode) # stop the cluster if it belongs to us (it may have stopped on its # own already, but that's fine) if self._cluster_id and not self._opts['cluster_id']: self._cleanup_cluster() def _cleanup_cloud_tmp(self): # delete all the files we created if not self._job_tmpdir: return try: log.info('Removing all files in %s' % self._job_tmpdir) self.fs.rm(self._job_tmpdir) self._job_tmpdir = None except Exception as e: log.exception(e) # TODO - mtai @ davidmarin - Re-enable log support and supporting cleanup def _cleanup_logs(self): super(DataprocJobRunner, self)._cleanup_logs() def _cleanup_job(self): job_prefix = self._dataproc_job_prefix() for current_job in self._api_job_list( cluster_name=self._cluster_id, state_matcher='ACTIVE'): # Kill all active jobs with the same job_prefix as this job current_job_id = current_job['reference']['jobId'] if not current_job_id.startswith(job_prefix): continue self._api_job_cancel(current_job_id) self._wait_for_api('job cancellation') def _cleanup_cluster(self): if not self._cluster_id: # If we don't have a cluster, then we can't terminate it. 
return try: log.info("Attempting to terminate cluster") self._api_cluster_delete(self._cluster_id) except Exception as e: log.exception(e) return log.info('cluster %s successfully terminated' % self._cluster_id) def _wait_for_api(self, msg): _wait_for(msg, self._opts['check_cluster_every']) def _wait_for_fs_sync(self): """Sleep for a little while, to give FS a chance to sync up. """ _wait_for('GCS sync (eventual consistency)', self._opts['cloud_fs_sync_secs']) def _build_dataproc_hadoop_job(self, step_num): """This function creates a "HadoopJob" to be passed to self._api_job_submit_hadoop :param step_num: :return: output_hadoop_job """ # Reference: https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob # noqa args = list() file_uris = list() archive_uris = list() properties = dict() step = self._get_step(step_num) assert step['type'] in ('streaming', 'jar'), ( 'Bad step type: %r' % (step['type'],)) # TODO - mtai @ davidmarin - Might be trivial to support jar running, # see "mainJarFileUri" of variable "output_hadoop_job" in this function # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob # noqa assert step['type'] == 'streaming', 'Jar not implemented' main_jar_uri = _HADOOP_STREAMING_JAR_URI # TODO - mtai @ davidmarin - Not clear if we should move _upload_args # to file_uris, currently works fine as-is # TODO - dmarin @ mtai - Probably a little safer to do the API's way, # assuming the API supports distributed cache syntax (so we can pick # the names of the uploaded files). args.extend(self._upload_args()) args.extend(self._hadoop_args_for_step(step_num)) mapper, combiner, reducer = (self._hadoop_streaming_commands(step_num)) if mapper: args += ['-mapper', mapper] if combiner: args += ['-combiner', combiner] if reducer: args += ['-reducer', reducer] for current_input_uri in self._step_input_uris(step_num): args += ['-input', current_input_uri] args += ['-output', self._step_output_uri(step_num)] # TODO - mtai @ davidmarin - Add back support to specify a different # mainJarFileURI output_hadoop_job = dict( args=args, fileUris=file_uris, archiveUris=archive_uris, properties=properties, mainJarFileUri=main_jar_uri ) return output_hadoop_job def _launch_cluster(self): """Create an empty cluster on Dataproc, and set self._cluster_id to its ID.""" bucket_name, _ = parse_gcs_uri(self._job_tmpdir) self._create_fs_tmp_bucket(bucket_name) # clusterName must be a match of # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).' 
# as documented in an API error message # (not currently documented in the Dataproc docs) if not self._cluster_id: self._cluster_id = '-'.join( ['mrjob', self._gce_zone.lower(), random_identifier()]) # Create the cluster if it's missing, otherwise join an existing one try: self._api_cluster_get(self._cluster_id) log.info('Adding job to existing cluster - %s' % self._cluster_id) except google_errors.HttpError as e: if not e.resp.status == 404: raise log.info( 'Creating Dataproc Hadoop cluster - %s' % self._cluster_id) cluster_data = self._cluster_create_args() self._api_cluster_create(cluster_data) self._wait_for_cluster_ready(self._cluster_id) # keep track of when we launched our job self._dataproc_job_start = time.time() return self._cluster_id def _wait_for_cluster_ready(self, cluster_id): # See https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters#State # noqa cluster_state = None # Poll until cluster is ready while cluster_state not in _DATAPROC_CLUSTER_STATES_READY: result_describe = self.api_client.clusters().get( projectId=self._gcp_project, region=_DATAPROC_API_REGION, clusterName=cluster_id).execute() cluster_state = result_describe['status']['state'] if cluster_state in _DATAPROC_CLUSTER_STATES_ERROR: raise DataprocException(result_describe) self._wait_for_api('cluster to accept jobs') assert cluster_state in _DATAPROC_CLUSTER_STATES_READY log.info("Cluster %s ready", cluster_id) return cluster_id def _dataproc_job_prefix(self): return _cleanse_gcp_job_id(self._job_key) def _run_steps(self): """Wait for every step of the job to complete, one by one.""" total_steps = self._num_steps() # define out steps for step_num in range(total_steps): job_id = self._launch_step(step_num) self._wait_for_step_to_complete( job_id, step_num=step_num, num_steps=total_steps) log.info('Completed Dataproc Hadoop Job - %s', job_id) # After all steps completed, wait for the last output (which is # usually written to GCS) to sync self._wait_for_fs_sync() def _launch_step(self, step_num): # Build each step hadoop_job = self._build_dataproc_hadoop_job(step_num) # Clean-up step name step_name = '%s---step-%05d-of-%05d' % ( self._dataproc_job_prefix(), step_num + 1, self._num_steps()) # Submit it log.info('Submitting Dataproc Hadoop Job - %s', step_name) result = self._api_job_submit_hadoop(step_name, hadoop_job) log.info('Submitted Dataproc Hadoop Job - %s', step_name) job_id = result['reference']['jobId'] assert job_id == step_name return job_id def _wait_for_step_to_complete( self, job_id, step_num=None, num_steps=None): """Helper for _wait_for_step_to_complete(). Wait for step with the given ID to complete, and fetch counters. If it fails, attempt to diagnose the error, and raise an exception. 
This also adds an item to self._log_interpretations """ log_interpretation = dict(job_id=job_id) self._log_interpretations.append(log_interpretation) while True: # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus # noqa job_result = self._api_job_get(job_id) job_state = job_result['status']['state'] log.info('%s => %s' % (job_id, job_state)) # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State # noqa if job_state in _DATAPROC_JOB_STATES_ACTIVE: self._wait_for_api('job completion') continue # we're done, will return at the end of this elif job_state == 'DONE': break raise StepFailedException(step_num=step_num, num_steps=num_steps) def _intermediate_output_uri(self, step_num): # TODO: davidmarin @ mtai: noticed this is 1-indexed and uses # %05d instead of %04d. Any particular reason? return 'hdfs:///tmp/mrjob/%s/step-output/%05d/' % ( self._job_key, step_num + 1) def counters(self): # TODO - mtai @ davidmarin - Counters are currently always empty as we # are not processing task logs return [_pick_counters(log_interpretation) for log_interpretation in self._log_interpretations] ### Bootstrapping ### def get_hadoop_version(self): if self._hadoop_version is None: self._store_cluster_info() return self._hadoop_version def get_image_version(self): """Get the version that our cluster is running. """ if self._image_version is None: self._store_cluster_info() return self._image_version def _store_cluster_info(self): """Set self._image_version and self._hadoop_version.""" if not self._cluster_id: raise AssertionError('cluster has not yet been created') cluster = self._api_cluster_get(self._cluster_id) self._image_version = ( cluster['config']['softwareConfig']['imageVersion']) # protect against new versions, including patch versions # we didn't explicitly request. See #1428 self._hadoop_version = map_version( self._image_version, _DATAPROC_IMAGE_TO_HADOOP_VERSION) ### Bootstrapping ### def _create_master_bootstrap_script_if_needed(self): """Helper for :py:meth:`_add_bootstrap_files_for_upload`. Create the master bootstrap script and write it into our local temp directory. Set self._master_bootstrap_script_path. This will do nothing if there are no bootstrap scripts or commands, or if it has already been called.""" if self._master_bootstrap_script_path: return # don't bother if we're not starting a cluster if self._cluster_id: return # Also don't bother if we're not bootstrapping if not (self._bootstrap or self._bootstrap_mrjob()): return # create mrjob.zip if we need it, and add commands to install it mrjob_bootstrap = [] if self._bootstrap_mrjob(): assert self._mrjob_zip_path path_dict = { 'type': 'file', 'name': None, 'path': self._mrjob_zip_path} self._bootstrap_dir_mgr.add(**path_dict) # find out where python keeps its libraries mrjob_bootstrap.append([ "__mrjob_PYTHON_LIB=$(%s -c " "'from distutils.sysconfig import get_python_lib;" " print(get_python_lib())')" % cmd_line(self._python_bin())]) # unzip mrjob.zip mrjob_bootstrap.append( ['sudo unzip ', path_dict, ' -d $__mrjob_PYTHON_LIB']) # re-compile pyc files now, since mappers/reducers can't # write to this directory. 
Don't fail if there is extra # un-compileable crud in the tarball (this would matter if # sh_bin were 'sh -e') mrjob_bootstrap.append( ['sudo %s -m compileall -q' ' -f $__mrjob_PYTHON_LIB/mrjob && true' % cmd_line(self._python_bin())]) # we call the script b.py because there's a character limit on # bootstrap script names (or there was at one time, anyway) path = os.path.join(self._get_local_tmp_dir(), 'b.py') log.info('writing master bootstrap script to %s' % path) contents = self._master_bootstrap_script_content( self._bootstrap + mrjob_bootstrap) for line in contents: log.debug('BOOTSTRAP: ' + line.rstrip('\r\n')) with open(path, 'w') as f: for line in contents: f.write(line) self._master_bootstrap_script_path = path def _bootstrap_python(self): """Return a (possibly empty) list of parsed commands (in the same format as returned by parse_setup_cmd())'""" if not self._opts['bootstrap_python']: return [] if PY2: # Python 2 is already installed; install pip and dev packages return [ ['sudo apt-get install -y python-pip python-dev'], ] else: return [ ['sudo apt-get install -y python3 python3-pip python3-dev'], ] def _parse_bootstrap(self): """Parse the *bootstrap* option with :py:func:`mrjob.setup.parse_setup_cmd()`. """ return [parse_setup_cmd(cmd) for cmd in self._opts['bootstrap']] def _master_bootstrap_script_content(self, bootstrap): """Create the contents of the master bootstrap script. """ out = [] def writeln(line=''): out.append(line + '\n') # shebang sh_bin = self._opts['sh_bin'] if not sh_bin[0].startswith('/'): sh_bin = ['/usr/bin/env'] + sh_bin writeln('#!' + cmd_line(sh_bin)) writeln() # store $PWD writeln('# store $PWD') writeln('__mrjob_PWD=$PWD') # FYI - mtai @ davidmarin - begin section, mtai had to add this # otherwise initialization didn't work # // kept blowing up in all subsequent invocations of $__mrjob_PWD/ writeln('if [ $__mrjob_PWD = "/" ]; then') writeln(' __mrjob_PWD=""') writeln('fi') # FYI - mtai @ davidmarin - end section writeln() # download files writeln('# download files and mark them executable') cp_to_local = 'hadoop fs -copyToLocal' for name, path in sorted( self._bootstrap_dir_mgr.name_to_path('file').items()): uri = self._upload_mgr.uri(path) output_string = '%s %s $__mrjob_PWD/%s' % ( cp_to_local, pipes.quote(uri), pipes.quote(name)) writeln(output_string) # make everything executable, like Hadoop Distributed Cache writeln('chmod a+x $__mrjob_PWD/%s' % pipes.quote(name)) writeln() # run bootstrap commands writeln('# bootstrap commands') for cmd in bootstrap: # reconstruct the command line, substituting $__mrjob_PWD/<name> # for path dicts line = '' for token in cmd: if isinstance(token, dict): # it's a path dictionary line += '$__mrjob_PWD/' line += pipes.quote(self._bootstrap_dir_mgr.name(**token)) else: # it's raw script line += token writeln(line) writeln() return out def get_cluster_id(self): return self._cluster_id def _cluster_create_args(self): gcs_init_script_uris = [] if self._master_bootstrap_script_path: gcs_init_script_uris.append( self._upload_mgr.uri(self._master_bootstrap_script_path)) # always add idle termination script # add it last, so that we don't count bootstrapping as idle time gcs_init_script_uris.append( self._upload_mgr.uri(_MAX_HOURS_IDLE_BOOTSTRAP_ACTION_PATH)) # NOTE - Cluster initializationActions can only take scripts with no # script args, so the auto-term script receives 'mrjob-max-secs-idle' # via metadata instead of as an arg cluster_metadata = dict() cluster_metadata['mrjob-version'] = mrjob.__version__ 
cluster_metadata['mrjob-max-secs-idle'] = str(int( self._opts['max_hours_idle'] * 3600)) cluster_config = dict( gceClusterConfig=dict( zoneUri=_gcp_zone_uri( project=self._gcp_project, zone=self._gce_zone), serviceAccountScopes=_DEFAULT_GCE_SERVICE_ACCOUNT_SCOPES, metadata=cluster_metadata ), initializationActions=[ dict(executableFile=init_script_uri) for init_script_uri in gcs_init_script_uris ] ) # Task tracker master_conf = _gcp_instance_group_config( project=self._gcp_project, zone=self._gce_zone, count=1, instance_type=self._opts['master_instance_type'] ) # Compute + storage worker_conf = _gcp_instance_group_config( project=self._gcp_project, zone=self._gce_zone, count=self._opts['num_core_instances'], instance_type=self._opts['core_instance_type'] ) # Compute ONLY secondary_worker_conf = _gcp_instance_group_config( project=self._gcp_project, zone=self._gce_zone, count=self._opts['num_task_instances'], instance_type=self._opts['task_instance_type'], is_preemptible=True ) cluster_config['masterConfig'] = master_conf cluster_config['workerConfig'] = worker_conf if self._opts['num_task_instances']: cluster_config['secondaryWorkerConfig'] = secondary_worker_conf # See - https://cloud.google.com/dataproc/dataproc-versions if self._opts['image_version']: cluster_config['softwareConfig'] = dict( imageVersion=self._opts['image_version']) return dict(projectId=self._gcp_project, clusterName=self._cluster_id, config=cluster_config) ### Dataproc-specific Stuff ### def _api_cluster_get(self, cluster_id): return self.api_client.clusters().get( projectId=self._gcp_project, region=_DATAPROC_API_REGION, clusterName=cluster_id ).execute() def _api_cluster_create(self, cluster_data): # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/create # noqa # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/get # noqa return self.api_client.clusters().create( projectId=self._gcp_project, region=_DATAPROC_API_REGION, body=cluster_data ).execute() def _api_cluster_delete(self, cluster_id): return self.api_client.clusters().delete( projectId=self._gcp_project, region=_DATAPROC_API_REGION, clusterName=cluster_id ).execute() def _api_job_list(self, cluster_name=None, state_matcher=None): # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/list#JobStateMatcher # noqa list_kwargs = dict( projectId=self._gcp_project, region=_DATAPROC_API_REGION, ) if cluster_name: list_kwargs['clusterName'] = cluster_name if state_matcher: list_kwargs['jobStateMatcher'] = state_matcher list_request = self.api_client.jobs().list(**list_kwargs) while list_request: try: resp = list_request.execute() except google_errors.HttpError as e: if e.resp.status == 404: return raise for current_item in resp['items']: yield current_item list_request = self.api_client.jobs().list_next(list_request, resp) def _api_job_get(self, job_id): return self.api_client.jobs().get( projectId=self._gcp_project, region=_DATAPROC_API_REGION, jobId=job_id ).execute() def _api_job_cancel(self, job_id): return self.api_client.jobs().cancel( projectId=self._gcp_project, region=_DATAPROC_API_REGION, jobId=job_id ).execute() def _api_job_delete(self, job_id): return self.api_client.jobs().delete( projectId=self._gcp_project, region=_DATAPROC_API_REGION, jobId=job_id ).execute() def _api_job_submit_hadoop(self, step_name, hadoop_job): # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/submit # noqa # 
https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob # noqa # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobReference # noqa job_data = dict( reference=dict(projectId=self._gcp_project, jobId=step_name), placement=dict(clusterName=self._cluster_id), hadoopJob=hadoop_job ) jobs_submit_kwargs = dict( projectId=self._gcp_project, region=_DATAPROC_API_REGION, body=dict(job=job_data) ) return self.api_client.jobs().submit(**jobs_submit_kwargs).execute()
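# A generic sketch of the poll-until-terminal-state loop that both
# _wait_for_cluster_ready() and _wait_for_step_to_complete() above follow.
# fetch_state, ready_states, error_states and check_every are illustrative
# stand-ins, not Dataproc API names.
import time


def wait_for_state(fetch_state, ready_states, error_states, check_every=30):
    """Call fetch_state() until it returns a ready state; raise on error."""
    while True:
        state = fetch_state()
        if state in error_states:
            raise RuntimeError('reached error state: %s' % state)
        if state in ready_states:
            return state
        time.sleep(check_every)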
class HadoopJobRunner(MRJobBinRunner, LogInterpretationMixin): """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster. Invoked when you run your job with ``-r hadoop``. Input and support files can be either local or on HDFS; use ``hdfs://...`` URLs to refer to files on HDFS. """ alias = 'hadoop' OPT_NAMES = MRJobBinRunner.OPT_NAMES | { 'hadoop_bin', 'hadoop_extra_args', 'hadoop_log_dirs', 'hadoop_streaming_jar', 'hadoop_tmp_dir', 'spark_deploy_mode', 'spark_master', } # supports everything (so far) _STEP_TYPES = {'jar', 'spark', 'spark_jar', 'spark_script', 'streaming'} def __init__(self, **kwargs): """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(HadoopJobRunner, self).__init__(**kwargs) self._hadoop_tmp_dir = fully_qualify_hdfs_path( posixpath.join(self._opts['hadoop_tmp_dir'], self._job_key)) # Keep track of local files to upload to HDFS. We'll add them # to this manager just before we need them. hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(hdfs_files_dir) # Set output dir if it wasn't set explicitly self._output_dir = fully_qualify_hdfs_path( self._output_dir or posixpath.join(self._hadoop_tmp_dir, 'output')) # Fully qualify step_output_dir, if set if self._step_output_dir: self._step_output_dir = fully_qualify_hdfs_path( self._step_output_dir) # Track job and (YARN) application ID to enable log parsing self._application_id = None self._job_id = None # Keep track of where the hadoop streaming jar is self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar'] self._searched_for_hadoop_streaming_jar = False # List of dicts (one for each step) potentially containing # the keys 'history', 'step', and 'task' ('step' will always # be filled because it comes from the hadoop jar command output, # others will be filled as needed) self._log_interpretations = [] def _default_opts(self): return combine_dicts( super(HadoopJobRunner, self)._default_opts(), dict(hadoop_tmp_dir='tmp/mrjob', )) @property def fs(self): """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local filesystem. """ if self._fs is None: self._fs = CompositeFilesystem() # don't pass [] to fs; this means not to use hadoop until # fs.set_hadoop_bin() is called (used for running hadoop over SSH). hadoop_bin = self._opts['hadoop_bin'] or None self._fs.add_fs('hadoop', HadoopFilesystem(hadoop_bin)) self._fs.add_fs('local', LocalFilesystem()) return self._fs def get_hadoop_version(self): """Invoke the hadoop executable to determine its version""" return self.fs.hadoop.get_hadoop_version() def get_hadoop_bin(self): """Find the hadoop binary. A list: binary followed by arguments.""" return self.fs.hadoop.get_hadoop_bin() def get_hadoop_streaming_jar(self): """Find the path of the hadoop streaming jar, or None if not found.""" if not (self._hadoop_streaming_jar or self._searched_for_hadoop_streaming_jar): self._hadoop_streaming_jar = self._find_hadoop_streaming_jar() if self._hadoop_streaming_jar: log.info('Found Hadoop streaming jar: %s' % self._hadoop_streaming_jar) else: log.warning('Hadoop streaming jar not found. Use' ' --hadoop-streaming-jar') self._searched_for_hadoop_streaming_jar = True return self._hadoop_streaming_jar def _find_hadoop_streaming_jar(self): """Search for the hadoop streaming jar. 
See :py:meth:`_hadoop_streaming_jar_dirs` for where we search.""" for path in unique(self._hadoop_streaming_jar_dirs()): log.info('Looking for Hadoop streaming jar in %s...' % path) streaming_jars = [] for path in self.fs.ls(path): if _HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)): streaming_jars.append(path) if streaming_jars: # prefer shorter names and shallower paths def sort_key(p): return (len(p.split('/')), len(posixpath.basename(p)), p) streaming_jars.sort(key=sort_key) return streaming_jars[0] return None def _hadoop_dirs(self): """Yield all possible hadoop directories (used for streaming jar and logs). May yield duplicates""" for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL', 'HADOOP_MAPRED_HOME'): path = os.environ.get(name) if path: yield path # guess it from the path of the Hadoop binary hadoop_home = _hadoop_prefix_from_bin(self.get_hadoop_bin()[0]) if hadoop_home: yield hadoop_home # try HADOOP_*_HOME for name, path in sorted(os.environ.items()): if name.startswith('HADOOP_') and name.endswith('_HOME'): yield path def _hadoop_streaming_jar_dirs(self): """Yield all possible places to look for the Hadoop streaming jar. May yield duplicates. """ for hadoop_dir in self._hadoop_dirs(): yield hadoop_dir # use hard-coded paths to work out-of-the-box on EMR for path in _EMR_HADOOP_STREAMING_JAR_DIRS: yield path def _hadoop_log_dirs(self, output_dir=None): """Yield all possible places to look for hadoop logs.""" # hadoop_log_dirs opt overrides all this if self._opts['hadoop_log_dirs']: for path in self._opts['hadoop_log_dirs']: yield path return hadoop_log_dir = os.environ.get('HADOOP_LOG_DIR') if hadoop_log_dir: yield hadoop_log_dir yarn = uses_yarn(self.get_hadoop_version()) if yarn: yarn_log_dir = os.environ.get('YARN_LOG_DIR') if yarn_log_dir: yield yarn_log_dir yield _DEFAULT_YARN_HDFS_LOG_DIR if output_dir: # Cloudera style of logging yield posixpath.join(output_dir, '_logs') for hadoop_dir in self._hadoop_dirs(): yield posixpath.join(hadoop_dir, 'logs') # hard-coded fallback paths if yarn: for path in _FALLBACK_HADOOP_YARN_LOG_DIRS: yield path for path in _FALLBACK_HADOOP_LOG_DIRS: yield path def _run(self): self._find_binaries_and_jars() self._create_setup_wrapper_scripts() self._add_job_files_for_upload() self._upload_local_files() self._run_job_in_hadoop() def _find_binaries_and_jars(self): """Find hadoop and (if needed) spark-submit bin up-front, before continuing with the job. (This is just for user-interaction purposes; these would otherwise lazy-load as needed.) """ # this triggers looking for Hadoop binary self.get_hadoop_version() if self._has_hadoop_streaming_steps(): self.get_hadoop_streaming_jar() if self._has_spark_steps(): self.get_spark_submit_bin() def _add_job_files_for_upload(self): """Add files needed for running the job (setup and input) to self._upload_mgr.""" for path in self._working_dir_mgr.paths('archive'): self._upload_mgr.add(path) for path in self._py_files(): self._upload_mgr.add(path) def _dump_stdin_to_local_file(self): """Dump sys.stdin to a local file, and return the path to it.""" stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN') # prompt user, so they don't think the process has stalled log.info('reading from STDIN') log.debug('dumping stdin to local file %s...' 
% stdin_path) stdin_file = open(stdin_path, 'wb') for line in self._stdin: stdin_file.write(line) return stdin_path def _run_job_in_hadoop(self): for step_num, step in enumerate(self._get_steps()): self._warn_about_spark_archives(step) step_args = self._args_for_step(step_num) env = _fix_env(self._env_for_step(step_num)) # log this *after* _args_for_step(), which can start a search # for the Hadoop streaming jar log.info('Running step %d of %d...' % (step_num + 1, self._num_steps())) log.debug('> %s' % cmd_line(step_args)) log.debug(' with environment: %r' % sorted(env.items())) log_interpretation = {} self._log_interpretations.append(log_interpretation) # try to use a PTY if it's available try: pid, master_fd = pty.fork() except (AttributeError, OSError): # no PTYs, just use Popen # user won't get much feedback for a while, so tell them # Hadoop is running log.debug('No PTY available, using Popen() to invoke Hadoop') step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env) step_interpretation = _interpret_hadoop_jar_command_stderr( step_proc.stderr, record_callback=_log_record_from_hadoop) # there shouldn't be much output to STDOUT for line in step_proc.stdout: _log_line_from_driver(to_unicode(line).strip('\r\n')) step_proc.stdout.close() step_proc.stderr.close() returncode = step_proc.wait() else: # we have PTYs if pid == 0: # we are the child process try: os.execvpe(step_args[0], step_args, env) # now we are no longer Python except OSError as ex: # use _exit() so we don't do cleanup, etc. that's # the parent process's job os._exit(ex.errno) finally: # if we got some other exception, still exit hard os._exit(-1) else: log.debug('Invoking Hadoop via PTY') with os.fdopen(master_fd, 'rb') as master: # reading from master gives us the subprocess's # stderr and stdout (it's a fake terminal) step_interpretation = ( _interpret_hadoop_jar_command_stderr( master, record_callback=_log_record_from_hadoop)) _, returncode = os.waitpid(pid, 0) # make sure output_dir is filled if 'output_dir' not in step_interpretation: step_interpretation['output_dir'] = ( self._step_output_uri(step_num)) log_interpretation['step'] = step_interpretation self._log_counters(log_interpretation, step_num) step_type = step['type'] if returncode: error = self._pick_error(log_interpretation, step_type) if error: log.error('Probable cause of failure:\n\n%s\n' % _format_error(error)) # use CalledProcessError's well-known message format reason = str(CalledProcessError(returncode, step_args)) raise StepFailedException(reason=reason, step_num=step_num, num_steps=self._num_steps()) def _warn_about_spark_archives(self, step): """If *step* is a Spark step, the *upload_archives* option is set, and *spark_master* is not ``'yarn'``, warn that *upload_archives* will be ignored by Spark.""" if (_is_spark_step_type(step['type']) and self._spark_master() != 'yarn' and self._opts['upload_archives']): log.warning('Spark will probably ignore archives because' " spark_master is not 'yarn'") def _spark_master(self): return self._opts['spark_master'] or 'yarn' def _args_for_step(self, step_num): step = self._get_step(step_num) if step['type'] == 'streaming': return self._args_for_streaming_step(step_num) elif step['type'] == 'jar': return self._args_for_jar_step(step_num) elif _is_spark_step_type(step['type']): return self._args_for_spark_step(step_num) else: raise ValueError('Bad step type: %r' % (step['type'], )) def _args_for_streaming_step(self, step_num): hadoop_streaming_jar = self.get_hadoop_streaming_jar() if not 
hadoop_streaming_jar: raise Exception('no Hadoop streaming jar') return (self.get_hadoop_bin() + ['jar', hadoop_streaming_jar] + self._hadoop_streaming_jar_args(step_num)) def _args_for_jar_step(self, step_num): step = self._get_step(step_num) args = [] args.extend(self.get_hadoop_bin()) # special case for consistency with EMR runner. # # This might look less like duplicated code if we ever # implement #780 (fetching jars from URIs) if step['jar'].startswith('file:///'): jar = step['jar'][7:] # keep leading slash else: jar = step['jar'] args.extend(['jar', jar]) if step.get('main_class'): args.append(step['main_class']) if step.get('args'): args.extend(self._interpolate_jar_step_args( step['args'], step_num)) return args def _env_for_step(self, step_num): step = self._get_step(step_num) env = dict(os.environ) # when running spark-submit, set its environment directly. See #1464 if _is_spark_step_type(step['type']): env.update(self._spark_cmdenv(step_num)) return env def _default_step_output_dir(self): return posixpath.join(self._hadoop_tmp_dir, 'step-output') def _cleanup_hadoop_tmp(self): if self._hadoop_tmp_dir: log.info('Removing HDFS temp directory %s...' % self._hadoop_tmp_dir) try: self.fs.rm(self._hadoop_tmp_dir) except Exception as e: log.exception(e) def _manifest_download_commands(self): cp_to_local = self.get_hadoop_bin() + ['fs', '-copyToLocal'] return [ ('*://*', cmd_line(cp_to_local)), ] ### LOG (implementation of LogInterpretationMixin) ### def _stream_history_log_dirs(self, output_dir=None): """Yield lists of directories to look for the history log in.""" if not self._read_logs(): return for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)): if _logs_exist(self.fs, log_dir): log.info('Looking for history log in %s...' % log_dir) # logs aren't always in a subdir named history/ yield [log_dir] def _stream_task_log_dirs(self, application_id=None, output_dir=None): """Yield lists of directories to look for the task logs in.""" # Note: this is unlikely to be super-helpful on "real" (multi-node) # pre-YARN Hadoop because task logs aren't generally shipped to a # local directory. It's a start, anyways. See #1201. if not self._read_logs(): return for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)): if application_id: path = self.fs.join(log_dir, 'userlogs', application_id) else: path = self.fs.join(log_dir, 'userlogs') if _logs_exist(self.fs, path): log.info('Looking for task syslogs in %s...' % path) yield [path] def counters(self): return [ _pick_counters(log_interpretation) for log_interpretation in self._log_interpretations ]
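# A quick illustration of the sort key _find_hadoop_streaming_jar() above uses
# to pick among multiple streaming-jar candidates: shallower paths win first,
# then shorter basenames. The candidate paths are made up for the example.
import posixpath


def sort_key(p):
    return (len(p.split('/')), len(posixpath.basename(p)), p)


candidates = [
    '/usr/lib/hadoop-mapreduce/hadoop-streaming-2.7.3.jar',
    '/usr/lib/hadoop/hadoop-streaming.jar',
    '/usr/lib/hadoop/contrib/streaming/hadoop-streaming-2.7.3.jar',
]

print(sorted(candidates, key=sort_key)[0])
# -> '/usr/lib/hadoop/hadoop-streaming.jar'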
def test_simple(self):
    sd = UploadDirManager('hdfs:///')
    sd.add('foo/bar.py')
    self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
def test_add_is_idempotent(self):
    sd = UploadDirManager("hdfs:///")
    sd.add("foo/bar.py")
    self.assertEqual(sd.path_to_uri(), {"foo/bar.py": "hdfs:///bar.py"})
    sd.add("foo/bar.py")
    self.assertEqual(sd.path_to_uri(), {"foo/bar.py": "hdfs:///bar.py"})
def test_add_is_idempotent(self):
    sd = UploadDirManager('hdfs:///')
    sd.add('foo/bar.py')
    self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
    sd.add('foo/bar.py')
    self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
class HadoopJobRunner(MRJobBinRunner, LogInterpretationMixin): """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster. Invoked when you run your job with ``-r hadoop``. Input and support files can be either local or on HDFS; use ``hdfs://...`` URLs to refer to files on HDFS. """ alias = 'hadoop' OPT_NAMES = MRJobBinRunner.OPT_NAMES | { 'bootstrap_spark', 'hadoop_bin', 'hadoop_extra_args', 'hadoop_log_dirs', 'hadoop_streaming_jar', 'hadoop_tmp_dir', 'spark_master', } def __init__(self, **kwargs): """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(HadoopJobRunner, self).__init__(**kwargs) self._hadoop_tmp_dir = fully_qualify_hdfs_path( posixpath.join( self._opts['hadoop_tmp_dir'], self._job_key)) # Keep track of local files to upload to HDFS. We'll add them # to this manager just before we need them. hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(hdfs_files_dir) # Set output dir if it wasn't set explicitly self._output_dir = fully_qualify_hdfs_path( self._output_dir or posixpath.join(self._hadoop_tmp_dir, 'output')) # Fully qualify step_output_dir, if set if self._step_output_dir: self._step_output_dir = fully_qualify_hdfs_path( self._step_output_dir) # Track job and (YARN) application ID to enable log parsing self._application_id = None self._job_id = None # Keep track of where the hadoop streaming jar is self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar'] self._searched_for_hadoop_streaming_jar = False # Keep track of where the spark-submit binary is self._spark_submit_bin = self._opts['spark_submit_bin'] # List of dicts (one for each step) potentially containing # the keys 'history', 'step', and 'task' ('step' will always # be filled because it comes from the hadoop jar command output, # others will be filled as needed) self._log_interpretations = [] def _default_opts(self): return combine_dicts( super(HadoopJobRunner, self)._default_opts(), dict( hadoop_tmp_dir='tmp/mrjob', spark_master='yarn', ) ) @property def fs(self): """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local filesystem. """ if self._fs is None: self._fs = CompositeFilesystem( HadoopFilesystem(self._opts['hadoop_bin']), LocalFilesystem()) return self._fs def get_hadoop_version(self): """Invoke the hadoop executable to determine its version""" return self.fs.get_hadoop_version() def get_hadoop_bin(self): """Find the hadoop binary. A list: binary followed by arguments.""" return self.fs.get_hadoop_bin() def get_hadoop_streaming_jar(self): """Find the path of the hadoop streaming jar, or None if not found.""" if not (self._hadoop_streaming_jar or self._searched_for_hadoop_streaming_jar): self._hadoop_streaming_jar = self._find_hadoop_streaming_jar() if self._hadoop_streaming_jar: log.info('Found Hadoop streaming jar: %s' % self._hadoop_streaming_jar) else: log.warning('Hadoop streaming jar not found. Use' ' --hadoop-streaming-jar') self._searched_for_hadoop_streaming_jar = True return self._hadoop_streaming_jar def _find_hadoop_streaming_jar(self): """Search for the hadoop streaming jar. See :py:meth:`_hadoop_streaming_jar_dirs` for where we search.""" for path in unique(self._hadoop_streaming_jar_dirs()): log.info('Looking for Hadoop streaming jar in %s...' 
% path) streaming_jars = [] for path in self.fs.ls(path): if _HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)): streaming_jars.append(path) if streaming_jars: # prefer shorter names and shallower paths def sort_key(p): return (len(p.split('/')), len(posixpath.basename(p)), p) streaming_jars.sort(key=sort_key) return streaming_jars[0] return None def _hadoop_dirs(self): """Yield all possible hadoop directories (used for streaming jar and logs). May yield duplicates""" for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL', 'HADOOP_MAPRED_HOME'): path = os.environ.get(name) if path: yield path # guess it from the path of the Hadoop binary hadoop_home = _hadoop_prefix_from_bin(self.get_hadoop_bin()[0]) if hadoop_home: yield hadoop_home # try HADOOP_*_HOME for name, path in sorted(os.environ.items()): if name.startswith('HADOOP_') and name.endswith('_HOME'): yield path def _hadoop_streaming_jar_dirs(self): """Yield all possible places to look for the Hadoop streaming jar. May yield duplicates. """ for hadoop_dir in self._hadoop_dirs(): yield hadoop_dir # use hard-coded paths to work out-of-the-box on EMR for path in _EMR_HADOOP_STREAMING_JAR_DIRS: yield path def _hadoop_log_dirs(self, output_dir=None): """Yield all possible places to look for hadoop logs.""" # hadoop_log_dirs opt overrides all this if self._opts['hadoop_log_dirs']: for path in self._opts['hadoop_log_dirs']: yield path return hadoop_log_dir = os.environ.get('HADOOP_LOG_DIR') if hadoop_log_dir: yield hadoop_log_dir yarn = uses_yarn(self.get_hadoop_version()) if yarn: yarn_log_dir = os.environ.get('YARN_LOG_DIR') if yarn_log_dir: yield yarn_log_dir yield _DEFAULT_YARN_HDFS_LOG_DIR if output_dir: # Cloudera style of logging yield posixpath.join(output_dir, '_logs') for hadoop_dir in self._hadoop_dirs(): yield posixpath.join(hadoop_dir, 'logs') # hard-coded fallback paths if yarn: for path in _FALLBACK_HADOOP_YARN_LOG_DIRS: yield path for path in _FALLBACK_HADOOP_LOG_DIRS: yield path def get_spark_submit_bin(self): if not self._spark_submit_bin: self._spark_submit_bin = self._find_spark_submit_bin() return self._spark_submit_bin def _find_spark_submit_bin(self): # TODO: this is very similar to _find_hadoop_bin() (in fs) for path in unique(self._spark_submit_bin_dirs()): log.info('Looking for spark-submit binary in %s...' % ( path or '$PATH')) spark_submit_bin = which('spark-submit', path=path) if spark_submit_bin: log.info('Found spark-submit binary: %s' % spark_submit_bin) return [spark_submit_bin] else: log.info("Falling back to 'spark-submit'") return ['spark-submit'] def _spark_submit_bin_dirs(self): # $SPARK_HOME spark_home = os.environ.get('SPARK_HOME') if spark_home: yield os.path.join(spark_home, 'bin') yield None # use $PATH # some other places recommended by install docs (see #1366) yield '/usr/lib/spark/bin' yield '/usr/local/spark/bin' yield '/usr/local/lib/spark/bin' def _run(self): self._find_binaries_and_jars() self._create_setup_wrapper_script() self._add_job_files_for_upload() self._upload_local_files_to_hdfs() self._run_job_in_hadoop() def _find_binaries_and_jars(self): """Find hadoop and (if needed) spark-submit bin up-front, before continuing with the job. (This is just for user-interaction purposes; these would otherwise lazy-load as needed.) 
""" # this triggers looking for Hadoop binary self.get_hadoop_version() if self._has_streaming_steps(): self.get_hadoop_streaming_jar() if self._has_spark_steps(): self.get_spark_submit_bin() def _add_job_files_for_upload(self): """Add files needed for running the job (setup and input) to self._upload_mgr.""" for path in self._get_input_paths(): self._upload_mgr.add(path) for path in self._working_dir_mgr.paths(): self._upload_mgr.add(path) def _upload_local_files_to_hdfs(self): """Copy files managed by self._upload_mgr to HDFS """ self.fs.mkdir(self._upload_mgr.prefix) log.info('Copying local files to %s...' % self._upload_mgr.prefix) for path, uri in self._upload_mgr.path_to_uri().items(): self._upload_to_hdfs(path, uri) def _upload_to_hdfs(self, path, target): log.debug(' %s -> %s' % (path, target)) self.fs._put(path, target) def _dump_stdin_to_local_file(self): """Dump sys.stdin to a local file, and return the path to it.""" stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN') # prompt user, so they don't think the process has stalled log.info('reading from STDIN') log.debug('dumping stdin to local file %s...' % stdin_path) stdin_file = open(stdin_path, 'wb') for line in self._stdin: stdin_file.write(line) return stdin_path def _run_job_in_hadoop(self): for step_num, step in enumerate(self._get_steps()): self._warn_about_spark_archives(step) step_args = self._args_for_step(step_num) env = _fix_env(self._env_for_step(step_num)) # log this *after* _args_for_step(), which can start a search # for the Hadoop streaming jar log.info('Running step %d of %d...' % (step_num + 1, self._num_steps())) log.debug('> %s' % cmd_line(step_args)) log.debug(' with environment: %r' % sorted(env.items())) log_interpretation = {} self._log_interpretations.append(log_interpretation) # try to use a PTY if it's available try: pid, master_fd = pty.fork() except (AttributeError, OSError): # no PTYs, just use Popen # user won't get much feedback for a while, so tell them # Hadoop is running log.debug('No PTY available, using Popen() to invoke Hadoop') step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env) step_interpretation = _interpret_hadoop_jar_command_stderr( step_proc.stderr, record_callback=_log_record_from_hadoop) # there shouldn't be much output to STDOUT for line in step_proc.stdout: _log_line_from_hadoop(to_unicode(line).strip('\r\n')) step_proc.stdout.close() step_proc.stderr.close() returncode = step_proc.wait() else: # we have PTYs if pid == 0: # we are the child process os.execvpe(step_args[0], step_args, env) else: log.debug('Invoking Hadoop via PTY') with os.fdopen(master_fd, 'rb') as master: # reading from master gives us the subprocess's # stderr and stdout (it's a fake terminal) step_interpretation = ( _interpret_hadoop_jar_command_stderr( master, record_callback=_log_record_from_hadoop)) _, returncode = os.waitpid(pid, 0) # make sure output_dir is filled if 'output_dir' not in step_interpretation: step_interpretation['output_dir'] = ( self._step_output_uri(step_num)) log_interpretation['step'] = step_interpretation step_type = step['type'] if not _is_spark_step_type(step_type): counters = self._pick_counters(log_interpretation, step_type) if counters: log.info(_format_counters(counters)) else: log.warning('No counters found') if returncode: error = self._pick_error(log_interpretation, step_type) if error: log.error('Probable cause of failure:\n\n%s\n' % _format_error(error)) # use CalledProcessError's well-known message format reason = str(CalledProcessError(returncode, 
step_args)) raise StepFailedException( reason=reason, step_num=step_num, num_steps=self._num_steps()) def _warn_about_spark_archives(self, step): """If *step* is a Spark step, the *upload_archives* option is set, and *spark_master* is not ``'yarn'``, warn that *upload_archives* will be ignored by Spark.""" if (_is_spark_step_type(step['type']) and self._opts['spark_master'] != 'yarn' and self._opts['upload_archives']): log.warning('Spark will probably ignore archives because' " spark_master is not set to 'yarn'") def _args_for_step(self, step_num): step = self._get_step(step_num) if step['type'] == 'streaming': return self._args_for_streaming_step(step_num) elif step['type'] == 'jar': return self._args_for_jar_step(step_num) elif _is_spark_step_type(step['type']): return self._args_for_spark_step(step_num) else: raise AssertionError('Bad step type: %r' % (step['type'],)) def _args_for_streaming_step(self, step_num): hadoop_streaming_jar = self.get_hadoop_streaming_jar() if not hadoop_streaming_jar: raise Exception('no Hadoop streaming jar') return (self.get_hadoop_bin() + ['jar', hadoop_streaming_jar] + self._hadoop_streaming_jar_args(step_num)) def _args_for_jar_step(self, step_num): step = self._get_step(step_num) args = [] args.extend(self.get_hadoop_bin()) # -libjars, -D args.extend(self._hadoop_generic_args_for_step(step_num)) # special case for consistency with EMR runner. # # This might look less like duplicated code if we ever # implement #780 (fetching jars from URIs) if step['jar'].startswith('file:///'): jar = step['jar'][7:] # keep leading slash else: jar = step['jar'] args.extend(['jar', jar]) if step.get('main_class'): args.append(step['main_class']) if step.get('args'): args.extend( self._interpolate_input_and_output(step['args'], step_num)) return args def _spark_submit_arg_prefix(self): return ['--master', self._opts['spark_master']] def _env_for_step(self, step_num): step = self._get_step(step_num) env = dict(os.environ) # when running spark-submit, set its environment directly. See #1464 if _is_spark_step_type(step['type']): env.update(self._spark_cmdenv(step_num)) return env def _default_step_output_dir(self): return posixpath.join(self._hadoop_tmp_dir, 'step-output') def _cleanup_hadoop_tmp(self): if self._hadoop_tmp_dir: log.info('Removing HDFS temp directory %s...' % self._hadoop_tmp_dir) try: self.fs.rm(self._hadoop_tmp_dir) except Exception as e: log.exception(e) ### LOG (implementation of LogInterpretationMixin) ### def _stream_history_log_dirs(self, output_dir=None): """Yield lists of directories to look for the history log in.""" for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)): if _logs_exist(self.fs, log_dir): log.info('Looking for history log in %s...' % log_dir) # logs aren't always in a subdir named history/ yield [log_dir] def _stream_task_log_dirs(self, application_id=None, output_dir=None): """Yield lists of directories to look for the task logs in.""" # Note: this is unlikely to be super-helpful on "real" (multi-node) # pre-YARN Hadoop because task logs aren't generally shipped to a # local directory. It's a start, anyways. See #1201. for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)): if application_id: path = self.fs.join(log_dir, 'userlogs', application_id) else: path = self.fs.join(log_dir, 'userlogs') if _logs_exist(self.fs, path): log.info('Looking for task syslogs in %s...' % path) yield [path] def counters(self): return [_pick_counters(log_interpretation) for log_interpretation in self._log_interpretations]
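# --- Illustrative sketch (not part of mrjob itself) ----------------------
# The streaming-jar search above keeps every jar whose basename matches
# _HADOOP_STREAMING_JAR_RE, sorts the candidates by path depth, then
# basename length, then the path string, and returns the first one. The
# standalone helper below mirrors that sort key so the tie-breaking order
# is easy to see in isolation; the helper's name and the candidate paths
# in the doctest are made up for illustration.
import posixpath  # repeated here only so this sketch is self-contained


def _rank_streaming_jar_candidates(candidate_paths):
    """Return candidates in the order the runner would prefer them.

    >>> _rank_streaming_jar_candidates([
    ...     '/usr/lib/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.3.jar',
    ...     '/usr/lib/hadoop-mapreduce/hadoop-streaming.jar',
    ... ])[0]
    '/usr/lib/hadoop-mapreduce/hadoop-streaming.jar'
    """
    def sort_key(p):
        return (len(p.split('/')),            # shallower paths first
                len(posixpath.basename(p)),   # then shorter basenames
                p)                            # then lexicographic order

    return sorted(candidate_paths, key=sort_key)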
class DataprocJobRunner(HadoopInTheCloudJobRunner, LogInterpretationMixin): """Runs an :py:class:`~mrjob.job.MRJob` on Google Cloud Dataproc. Invoked when you run your job with ``-r dataproc``. :py:class:`DataprocJobRunner` runs your job in an Dataproc cluster, which is basically a temporary Hadoop cluster. Input, support, and jar files can be either local or on GCS; use ``gs://...`` URLs to refer to files on GCS. This class has some useful utilities for talking directly to GCS and Dataproc, so you may find it useful to instantiate it without a script:: from mrjob.dataproc import DataprocJobRunner ... """ alias = 'dataproc' OPT_NAMES = HadoopInTheCloudJobRunner.OPT_NAMES | { 'gcloud_bin', 'project_id', } def __init__(self, **kwargs): """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(DataprocJobRunner, self).__init__(**kwargs) # check for library support if google is None: raise ImportError( 'You must install google-cloud and google-cloud-dataproc' ' to connect to Dataproc') # Dataproc requires a master and >= 2 core instances # num_core_instances refers ONLY to number of CORE instances and does # NOT include the required 1 instance for master # In other words, minimum cluster size is 3 machines, 1 master and 2 # "num_core_instances" workers if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS: raise DataprocException('Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS) if (self._opts['core_instance_type'] != self._opts['task_instance_type']): raise DataprocException( 'Dataproc v1 expects core/task instance types to be identical') # load credentials and project ID self._credentials, auth_project_id = google.auth.default( scopes=_DEFAULT_GCE_SERVICE_ACCOUNT_SCOPES) self._project_id = self._opts['project_id'] or auth_project_id if not self._project_id: raise DataprocException( 'project_id must be set. Use --project_id or' ' set $GOOGLE_CLOUD_PROJECT') self._fix_zone_and_region_opts() # cluster_id can be None here self._cluster_id = self._opts['cluster_id'] self._api_client = None self._gcs_fs = None self._fs = None # BEGIN - setup directories base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir']) self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir) # use job key to make a unique tmp dir self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/' # pick/validate output dir if self._output_dir: self._output_dir = _check_and_fix_fs_dir(self._output_dir) else: self._output_dir = self._job_tmpdir + 'output/' # END - setup directories # manage local files that we want to upload to GCS. We'll add them # to this manager just before we need them. fs_files_dir = self._job_tmpdir + 'files/' self._upload_mgr = UploadDirManager(fs_files_dir) # when did our particular task start? 
self._dataproc_job_start = None # init hadoop, ami version caches self._image_version = None self._hadoop_version = None # map driver_output_uri to a dict with the keys: # log_uri: uri of file we're reading from # pos: position in file # buffer: bytes read from file already self._driver_output_state = {} # This will be filled by _run_steps() # NOTE - log_interpretations will be empty except job_id until we # parse task logs self._log_interpretations = [] def _fix_zone_and_region_opts(self): """Ensure that exactly one of region and zone is set.""" if self._opts['region'] and self._opts['zone']: log.warning('you do not need to set region if you set zone') self._opts['region'] = None return if not (self._opts['region'] or self._opts['zone']): if environ.get('CLOUDSDK_COMPUTE_ZONE'): self._opts['zone'] = environ['CLOUDSDK_COMPUTE_ZONE'] elif environ.get('CLOUDSDK_COMPUTE_REGION'): self._opts['region'] = environ['CLOUDSDK_COMPUTE_REGION'] else: self._opts['region'] = _DEFAULT_GCE_REGION def _default_opts(self): return combine_dicts( super(DataprocJobRunner, self)._default_opts(), dict( bootstrap_python=True, check_cluster_every=_DEFAULT_CHECK_CLUSTER_EVERY, cleanup=['CLUSTER', 'JOB', 'LOCAL_TMP'], cloud_fs_sync_secs=_DEFAULT_CLOUD_FS_SYNC_SECS, gcloud_bin=['gcloud'], image_version=_DEFAULT_IMAGE_VERSION, instance_type=_DEFAULT_INSTANCE_TYPE, master_instance_type=_DEFAULT_INSTANCE_TYPE, num_core_instances=_DATAPROC_MIN_WORKERS, num_task_instances=0, sh_bin=['/bin/sh', '-ex'], )) def _combine_opts(self, opt_list): """Blank out overridden *zone* and *region* opts.""" # copy opt_list so we can modify it opt_list = [dict(opts) for opts in opt_list] # blank out any instance_fleets/groups before the last config # where they are set blank_out = False for opts in reversed(opt_list): if blank_out: opts['region'] = None opts['zone'] = None elif any(opts.get(k) is not None for k in ('region', 'zone')): blank_out = True # now combine opts, with region/zone blanked out return super(DataprocJobRunner, self)._combine_opts(opt_list) @property def cluster_client(self): return google.cloud.dataproc_v1.ClusterControllerClient( **self._client_create_kwargs()) @property def job_client(self): return google.cloud.dataproc_v1.JobControllerClient( **self._client_create_kwargs()) def _client_create_kwargs(self): if self._opts['region']: endpoint = '%s-%s' % (self._opts['region'], _DEFAULT_ENDPOINT) return dict(channel=google.api_core.grpc_helpers.create_channel( endpoint, credentials=self._credentials)) else: return dict(credentials=self._credentials) @property def api_client(self): raise NotImplementedError( '"api_client" was disabled in v0.6.2. Use "cluster_client"' ' or "job_client" instead.') @property def fs(self): """:py:class:`~mrjob.fs.base.Filesystem` object for SSH, S3, GCS, and the local filesystem. """ if self._fs is not None: return self._fs self._gcs_fs = GCSFilesystem( credentials=self._credentials, local_tmp_dir=self._get_local_tmp_dir(), project_id=self._project_id, ) self._fs = CompositeFilesystem(self._gcs_fs, LocalFilesystem()) return self._fs def _fs_chunk_size(self): """Chunk size for cloud storage Blob objects. 
Currently only used for uploading.""" if self._opts['cloud_upload_part_size']: return int(self._opts['cloud_upload_part_size'] * 1024 * 1024) else: return None def _get_tmpdir(self, given_tmpdir): """Helper for _fix_tmpdir""" if given_tmpdir: return given_tmpdir # Loop over buckets until we find one that matches region # NOTE - because this is a tmpdir, we look for a GCS bucket in the # same GCE region chosen_bucket_name = None # determine region for bucket region = self._region() for tmp_bucket_name in self.fs.get_all_bucket_names(prefix='mrjob-'): tmp_bucket = self.fs.get_bucket(tmp_bucket_name) # NOTE - GCP ambiguous Behavior - Bucket location is being # returned as UPPERCASE, ticket filed as of Apr 23, 2016 as docs # suggest lowercase. (As of Feb. 12, 2018, this is still true, # observed on google-cloud-sdk) if tmp_bucket.location.lower() == region: # Regions are both specified and match log.info("using existing temp bucket %s" % tmp_bucket_name) chosen_bucket_name = tmp_bucket_name break # Example default - "mrjob-us-central1-RANDOMHEX" if not chosen_bucket_name: chosen_bucket_name = '-'.join( ['mrjob', region, random_identifier()]) return 'gs://%s/tmp/' % chosen_bucket_name def _region(self): # region of cluster, which is either the region set by the user, # or the region derived from the zone they set. # used to pick bucket location and name cluster return self._opts['region'] or _zone_to_region(self._opts['zone']) def _run(self): self._launch() self._run_steps() def _launch(self): self._prepare_for_launch() self._launch_cluster() def _prepare_for_launch(self): self._check_output_not_exists() self._create_setup_wrapper_scripts() self._add_bootstrap_files_for_upload() self._add_job_files_for_upload() self._upload_local_files_to_fs() def _check_output_not_exists(self): """Verify the output path does not already exist. This avoids provisioning a cluster only to have Hadoop refuse to launch. """ if self.fs.exists(self._output_dir): raise IOError('Output path %s already exists!' % (self._output_dir, )) def _add_bootstrap_files_for_upload(self): """Add files needed by the bootstrap script to self._upload_mgr. Tar up mrjob if bootstrap_mrjob is True. Create the master bootstrap script if necessary. 
""" # lazily create mrjob.zip if self._bootstrap_mrjob(): self._create_mrjob_zip() self._bootstrap_dir_mgr.add('file', self._mrjob_zip_path) # all other files needed by the script are already in # _bootstrap_dir_mgr for path in self._bootstrap_dir_mgr.paths(): self._upload_mgr.add(path) # now that we know where the above files live, we can create # the master bootstrap script self._create_master_bootstrap_script_if_needed() if self._master_bootstrap_script_path: self._upload_mgr.add(self._master_bootstrap_script_path) self._upload_mgr.add(_MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH) def _add_job_files_for_upload(self): """Add files needed for running the job (setup and input) to self._upload_mgr.""" for path in self._working_dir_mgr.paths(): self._upload_mgr.add(path) # TODO - mtai @ davidmarin - hadoop_streaming_jar is currently ignored, # see _HADOOP_STREAMING_JAR_URI # if self._opts['hadoop_streaming_jar']: # self._upload_mgr.add(self._opts['hadoop_streaming_jar']) for step in self._get_steps(): if step.get('jar'): self._upload_mgr.add(step['jar']) def _upload_local_files_to_fs(self): """Copy local files tracked by self._upload_mgr to FS.""" bucket_name, _ = parse_gcs_uri(self._job_tmpdir) self._create_fs_tmp_bucket(bucket_name) log.info('Copying non-input files into %s' % self._upload_mgr.prefix) for path, gcs_uri in self._upload_mgr.path_to_uri().items(): log.debug('uploading %s -> %s' % (path, gcs_uri)) self.fs.put(path, gcs_uri, chunk_size=self._fs_chunk_size()) self._wait_for_fs_sync() def _create_fs_tmp_bucket(self, bucket_name, location=None): """Create a temp bucket if missing Tie the temporary bucket to the same region as the GCE job and set a 28-day TTL """ # Return early if our bucket already exists try: self.fs.get_bucket(bucket_name) return except google.api_core.exceptions.NotFound: pass log.info('creating FS bucket %r' % bucket_name) location = location or self._opts['region'] or _zone_to_region( self._opts['zone']) # NOTE - By default, we create a bucket in the same GCE region as our # job (tmp buckets ONLY) # https://cloud.google.com/storage/docs/bucket-locations self.fs.create_bucket( bucket_name, location=location, object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS) self._wait_for_fs_sync() ### Running the job ### def cleanup(self, mode=None): super(DataprocJobRunner, self).cleanup(mode=mode) # close our SSH tunnel, if any self._kill_ssh_tunnel() # stop the cluster if it belongs to us (it may have stopped on its # own already, but that's fine) if self._cluster_id and not self._opts['cluster_id']: self._cleanup_cluster() def _cleanup_cloud_tmp(self): # delete all the files we created if not self._job_tmpdir: return try: log.info('Removing all files in %s' % self._job_tmpdir) self.fs.rm(self._job_tmpdir) self._job_tmpdir = None except Exception as e: log.exception(e) # TODO - mtai @ davidmarin - Re-enable log support and supporting cleanup def _cleanup_logs(self): super(DataprocJobRunner, self)._cleanup_logs() def _cleanup_job(self): job_prefix = self._dataproc_job_prefix() for job in self._list_jobs(cluster_name=self._cluster_id, state_matcher=_STATE_MATCHER_ACTIVE): # Kill all active jobs with the same job_prefix as this job job_id = job.reference.job_id if not job_id.startswith(job_prefix): continue self._cancel_job(job_id) self._wait_for_api('job cancellation') def _cleanup_cluster(self): if not self._cluster_id: # If we don't have a cluster, then we can't terminate it. 
return try: log.info("Attempting to terminate cluster") self._delete_cluster(self._cluster_id) except Exception as e: log.exception(e) return log.info('cluster %s successfully terminated' % self._cluster_id) def _wait_for_api(self, msg): _wait_for(msg, self._opts['check_cluster_every']) def _wait_for_fs_sync(self): """Sleep for a little while, to give FS a chance to sync up. """ _wait_for('GCS sync (eventual consistency)', self._opts['cloud_fs_sync_secs']) def _build_dataproc_hadoop_job(self, step_num): """This function creates a "HadoopJob" to be passed to self._submit_hadoop_job :param step_num: :return: output_hadoop_job """ # Reference: https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob # noqa args = list() file_uris = list() archive_uris = list() properties = dict() step = self._get_step(step_num) assert step['type'] in ('streaming', 'jar'), ('Bad step type: %r' % (step['type'], )) # TODO - mtai @ davidmarin - Might be trivial to support jar running, # see "main_jar_file_uri" of variable "output_hadoop_job" in # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob # noqa assert step['type'] == 'streaming', 'Jar not implemented' main_jar_uri = _HADOOP_STREAMING_JAR_URI # TODO - mtai @ davidmarin - Not clear if we should move _upload_args # to file_uris, currently works fine as-is # TODO - dmarin @ mtai - Probably a little safer to do the API's way, # assuming the API supports distributed cache syntax (so we can pick # the names of the uploaded files). args.extend(self._upload_args()) args.extend(self._hadoop_args_for_step(step_num)) mapper, combiner, reducer = (self._hadoop_streaming_commands(step_num)) if mapper: args += ['-mapper', mapper] if combiner: args += ['-combiner', combiner] if reducer: args += ['-reducer', reducer] for current_input_uri in self._step_input_uris(step_num): args += ['-input', current_input_uri] args += ['-output', self._step_output_uri(step_num)] # TODO - mtai @ davidmarin - Add back support to specify a different # main_jar_file_uri output_hadoop_job = dict(args=args, file_uris=file_uris, archive_uris=archive_uris, properties=properties, main_jar_file_uri=main_jar_uri) return output_hadoop_job def _launch_cluster(self): """Create an empty cluster on Dataproc, and set self._cluster_id to its ID.""" bucket_name, _ = parse_gcs_uri(self._job_tmpdir) self._create_fs_tmp_bucket(bucket_name) # clusterName must be a match of # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).' 
# as documented in an API error message # (not currently documented in the Dataproc docs) if not self._cluster_id: self._cluster_id = '-'.join( ['mrjob', self._region(), random_identifier()]) # Create the cluster if it's missing, otherwise join an existing one try: self._get_cluster(self._cluster_id) log.info('Adding job to existing cluster - %s' % self._cluster_id) except google.api_core.exceptions.NotFound: log.info('Creating Dataproc Hadoop cluster - %s' % self._cluster_id) cluster_data = self._cluster_create_kwargs() self._create_cluster(cluster_data) self._wait_for_cluster_ready(self._cluster_id) self._set_up_ssh_tunnel() # keep track of when we launched our job self._dataproc_job_start = time.time() return self._cluster_id def _wait_for_cluster_ready(self, cluster_id): # See https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters#State # noqa cluster_state = None # Poll until cluster is ready while cluster_state not in ('RUNNING', 'UPDATING'): cluster = self._get_cluster(cluster_id) cluster_state = cluster.status.State.Name(cluster.status.state) if cluster_state in ('ERROR', 'DELETING'): raise DataprocException(cluster) self._wait_for_api('cluster to accept jobs') return cluster_id def _dataproc_job_prefix(self): return _cleanse_gcp_job_id(self._job_key) def _run_steps(self): """Wait for every step of the job to complete, one by one.""" total_steps = self._num_steps() # define out steps for step_num in range(total_steps): job_id = self._launch_step(step_num) self._wait_for_step_to_complete(job_id, step_num=step_num, num_steps=total_steps) log.info('Completed Dataproc Hadoop Job - %s', job_id) # After all steps completed, wait for the last output (which is # usually written to GCS) to sync self._wait_for_fs_sync() def _launch_step(self, step_num): # Build each step hadoop_job = self._build_dataproc_hadoop_job(step_num) # Clean-up step name step_name = '%s---step-%05d-of-%05d' % ( self._dataproc_job_prefix(), step_num + 1, self._num_steps()) # Submit it log.info('Submitting Dataproc Hadoop Job - %s', step_name) result = self._submit_hadoop_job(step_name, hadoop_job) log.info('Submitted Dataproc Hadoop Job - %s', step_name) job_id = result.reference.job_id assert job_id == step_name return job_id def _wait_for_step_to_complete(self, job_id, step_num=None, num_steps=None): """Helper for _wait_for_step_to_complete(). Wait for step with the given ID to complete, and fetch counters. If it fails, attempt to diagnose the error, and raise an exception. 
This also adds an item to self._log_interpretations """ log_interpretation = dict(job_id=job_id) self._log_interpretations.append(log_interpretation) step_interpretation = {} log_interpretation['step'] = step_interpretation while True: # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus # noqa job = self._get_job(job_id) job_state = job.status.State.Name(job.status.state) driver_output_uri = job.driver_output_resource_uri log.info('%s => %s' % (job_id, job_state)) # interpret driver output so far if driver_output_uri: self._update_step_interpretation(step_interpretation, driver_output_uri) if step_interpretation.get('progress'): log.info(' ' + step_interpretation['progress']['message']) # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State # noqa # these are the states covered by the ACTIVE job state matcher, # plus SETUP_DONE if job_state in ('PENDING', 'RUNNING', 'CANCEL_PENDING', 'SETUP_DONE'): self._wait_for_api('job completion') continue # print counters if job wasn't CANCELLED if job_state != 'CANCELLED': self._log_counters(log_interpretation, step_num) # we're done, will return at the end of this if job_state == 'DONE': break else: raise StepFailedException(step_num=step_num, num_steps=num_steps) def _default_step_output_dir(self): # put intermediate data in HDFS return 'hdfs:///tmp/mrjob/%s/step-output' % self._job_key def _update_step_interpretation(self, step_interpretation, driver_output_uri): new_lines = self._get_new_driver_output_lines(driver_output_uri) _interpret_new_dataproc_step_stderr(step_interpretation, new_lines) def _get_new_driver_output_lines(self, driver_output_uri): """Get a list of complete job driver output lines that are new since the last time we checked. """ state = self._driver_output_state.setdefault( driver_output_uri, dict(log_uri=None, pos=0, buffer=b'')) # driver output is in logs with names like driveroutput.000000000 log_uris = sorted(self.fs.ls(driver_output_uri + '*')) for log_uri in log_uris: # initialize log_uri with first URI we see if state['log_uri'] is None: state['log_uri'] = log_uri # skip log files already parsed if log_uri < state['log_uri']: continue # when parsing the next file, reset *pos* elif log_uri > state['log_uri']: state['pos'] = 0 state['log_uri'] = log_uri log_blob = self.fs._get_blob(log_uri) try: # TODO: use start= kwarg once google-cloud-storage 1.9 is out new_data = log_blob.download_as_string()[state['pos']:] except google.api_core.exceptions.NotFound: # handle race condition where blob was just created break state['buffer'] += new_data state['pos'] += len(new_data) # convert buffer into lines, saving leftovers for next time stream = BytesIO(state['buffer']) state['buffer'] = b'' lines = [] for line_bytes in stream: if line_bytes.endswith(b'\n'): lines.append(to_unicode(line_bytes)) else: # leave final partial line (if any) in buffer state['buffer'] = line_bytes return lines def counters(self): # TODO - mtai @ davidmarin - Counters are currently always empty as we # are not processing task logs return [ _pick_counters(log_interpretation) for log_interpretation in self._log_interpretations ] ### Bootstrapping ### def get_hadoop_version(self): if self._hadoop_version is None: self._store_cluster_info() return self._hadoop_version def get_image_version(self): """Get the version that our cluster is running. 
""" if self._image_version is None: self._store_cluster_info() return self._image_version def _store_cluster_info(self): """Set self._image_version and self._hadoop_version.""" if not self._cluster_id: raise AssertionError('cluster has not yet been created') cluster = self._get_cluster(self._cluster_id) self._image_version = (cluster.config.software_config.image_version) # protect against new versions, including patch versions # we didn't explicitly request. See #1428 self._hadoop_version = map_version(self._image_version, _DATAPROC_IMAGE_TO_HADOOP_VERSION) def _bootstrap_pre_commands(self): # don't run the bootstrap script in / (see #1601) return [ 'mkdir /tmp/mrjob', 'cd /tmp/mrjob', ] ### Bootstrapping ### def _bootstrap_python(self): """Return a (possibly empty) list of parsed commands (in the same format as returned by parse_setup_cmd())'""" if not self._opts['bootstrap_python']: return [] if PY2: # Python 2 is already installed; install pip and dev packages return [ ['sudo apt-get install -y python-pip python-dev'], ] else: return [ ['sudo apt-get install -y python3 python3-pip python3-dev'], ] def get_cluster_id(self): return self._cluster_id def _cluster_create_kwargs(self): gcs_init_script_uris = [] if self._master_bootstrap_script_path: gcs_init_script_uris.append( self._upload_mgr.uri(self._master_bootstrap_script_path)) # always add idle termination script # add it last, so that we don't count bootstrapping as idle time gcs_init_script_uris.append( self._upload_mgr.uri(_MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH)) # NOTE - Cluster initialization_actions can only take scripts with no # script args, so the auto-term script receives 'mrjob-max-secs-idle' # via metadata instead of as an arg cluster_metadata = dict() cluster_metadata['mrjob-version'] = mrjob.__version__ cluster_metadata['mrjob-max-secs-idle'] = str( int(self._opts['max_mins_idle'] * 60)) gce_cluster_config = dict( service_account_scopes=_DEFAULT_GCE_SERVICE_ACCOUNT_SCOPES, metadata=cluster_metadata) if self._opts['zone']: gce_cluster_config['zone_uri'] = _gcp_zone_uri( project=self._project_id, zone=self._opts['zone']) cluster_config = dict(gce_cluster_config=gce_cluster_config, initialization_actions=[ dict(executable_file=init_script_uri) for init_script_uri in gcs_init_script_uris ]) # Task tracker master_conf = _gcp_instance_group_config( project=self._project_id, zone=self._opts['zone'], count=1, instance_type=self._opts['master_instance_type'], ) # Compute + storage worker_conf = _gcp_instance_group_config( project=self._project_id, zone=self._opts['zone'], count=self._opts['num_core_instances'], instance_type=self._opts['core_instance_type']) # Compute ONLY secondary_worker_conf = _gcp_instance_group_config( project=self._project_id, zone=self._opts['zone'], count=self._opts['num_task_instances'], instance_type=self._opts['task_instance_type'], is_preemptible=True) cluster_config['master_config'] = master_conf cluster_config['worker_config'] = worker_conf if self._opts['num_task_instances']: cluster_config['secondary_worker_config'] = secondary_worker_conf # See - https://cloud.google.com/dataproc/dataproc-versions if self._opts['image_version']: cluster_config['software_config'] = dict( image_version=self._opts['image_version']) kwargs = dict(project_id=self._project_id, cluster_name=self._cluster_id, config=cluster_config) return self._add_extra_cluster_params(kwargs) ### Dataproc-specific Stuff ### def _get_cluster(self, cluster_id): return self.cluster_client.get_cluster(cluster_name=cluster_id, 
**self._project_id_and_region()) def _create_cluster(self, cluster_data): # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/create # noqa # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/get # noqa self.cluster_client.create_cluster(cluster=cluster_data, **self._project_id_and_region()) def _delete_cluster(self, cluster_id): return self.cluster_client.delete_cluster( cluster_name=cluster_id, **self._project_id_and_region()) def _list_jobs(self, cluster_name=None, state_matcher=None): # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/list#JobStateMatcher # noqa list_kwargs = self._project_id_and_region() if cluster_name: list_kwargs['cluster_name'] = cluster_name if state_matcher: list_kwargs['job_state_matcher'] = state_matcher return self.job_client.list_jobs(**list_kwargs) def _get_job(self, job_id): return self.job_client.get_job(job_id=job_id, **self._project_id_and_region()) def _cancel_job(self, job_id): return self.job_client.cancel_job(job_id=job_id, **self._project_id_and_region()) def _submit_hadoop_job(self, step_name, hadoop_job): # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/submit # noqa # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob # noqa # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobReference # noqa return self.job_client.submit_job(job=dict( reference=dict(project_id=self._project_id, job_id=step_name), placement=dict(cluster_name=self._cluster_id), hadoop_job=hadoop_job, ), **self._project_id_and_region()) def _project_id_and_region(self): return dict( project_id=self._project_id, region=(self._opts['region'] or 'global'), ) def _manifest_download_commands(self): return [ # TODO: SSH in and figure out how to use gsutil or similar #('gs://*', 'gsutil cp'), ('*://*', 'hadoop fs -copyToLocal'), ] ### SSH hooks ### def _job_tracker_host(self): return '%s-m' % self._cluster_id def _ssh_tunnel_config(self): return _SSH_TUNNEL_CONFIG def _launch_ssh_proc(self, args): ssh_proc = super(DataprocJobRunner, self)._launch_ssh_proc(args) # enter an empty passphrase if creating a key for the first time ssh_proc.stdin.write(b'\n\n') return ssh_proc def _ssh_launch_wait_secs(self): """Wait 20 seconds because gcloud has to update project metadata (unless we were going to check the cluster sooner anyway).""" return min(20.0, self._opts['check_cluster_every']) def _ssh_tunnel_args(self, bind_port): if not self._opts['gcloud_bin']: self._give_up_on_ssh_tunnel = True return None if not self._cluster_id: return cluster = self._get_cluster(self._cluster_id) zone = cluster.config.gce_cluster_config.zone_uri.split('/')[-1] return self._opts['gcloud_bin'] + [ 'compute', 'ssh', '--zone', zone, self._job_tracker_host(), '--', ] + self._ssh_tunnel_opts(bind_port)
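# --- Illustrative sketch (not part of mrjob itself) ----------------------
# _build_dataproc_hadoop_job() and _submit_hadoop_job() above boil down to
# posting a plain dict to the Dataproc Jobs API: a JobReference, a cluster
# placement, and a HadoopJob whose main jar is the streaming jar and whose
# args are ordinary hadoop-streaming arguments (upload args, extra hadoop
# args, -mapper/-combiner/-reducer, -input, -output). The function below
# just assembles such a dict so its shape is visible in one place; the
# project, cluster, bucket, and jar values are placeholders, not values
# taken from mrjob.
def _example_streaming_job_payload():
    hadoop_job = dict(
        # placeholder standing in for _HADOOP_STREAMING_JAR_URI
        main_jar_file_uri=(
            'file:///usr/lib/hadoop-mapreduce/hadoop-streaming.jar'),
        args=[
            '-files', 'gs://mrjob-example-bucket/tmp/files/mr_word_count.py',
            '-mapper', 'python mr_word_count.py --step-num=0 --mapper',
            '-reducer', 'python mr_word_count.py --step-num=0 --reducer',
            '-input', 'gs://mrjob-example-bucket/tmp/files/input.txt',
            '-output', 'gs://mrjob-example-bucket/tmp/output/',
        ],
        file_uris=[],
        archive_uris=[],
        properties={},
    )

    # _submit_hadoop_job() passes this as job=..., with project_id and
    # region supplied separately by _project_id_and_region()
    return dict(
        reference=dict(project_id='example-project',
                       job_id='mr-word-count---step-00001-of-00001'),
        placement=dict(cluster_name='mrjob-us-central1-abcdef0123456789'),
        hadoop_job=hadoop_job,
    )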
class HadoopJobRunner(MRJobRunner): """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster. Input and support files can be either local or on HDFS; use ``hdfs://...`` URLs to refer to files on HDFS. """ alias = 'hadoop' OPTION_STORE_CLASS = HadoopRunnerOptionStore def __init__(self, **kwargs): """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(HadoopJobRunner, self).__init__(**kwargs) self._hdfs_tmp_dir = fully_qualify_hdfs_path( posixpath.join( self._opts['hdfs_scratch_dir'], self._job_name)) # Keep track of local files to upload to HDFS. We'll add them # to this manager just before we need them. hdfs_files_dir = posixpath.join(self._hdfs_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(hdfs_files_dir) # Set output dir if it wasn't set explicitly self._output_dir = fully_qualify_hdfs_path( self._output_dir or posixpath.join(self._hdfs_tmp_dir, 'output')) self._hadoop_log_dir = hadoop_log_dir(self._opts['hadoop_home']) # Running jobs via hadoop assigns a new timestamp to each job. # Running jobs via mrjob only adds steps. # Store both of these values to enable log parsing. self._job_timestamp = None self._start_step_num = 0 # init hadoop version cache self._hadoop_version = None @property def fs(self): """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local filesystem. """ if self._fs is None: self._fs = CompositeFilesystem( HadoopFilesystem(self._opts['hadoop_bin']), LocalFilesystem()) return self._fs def get_hadoop_version(self): """Invoke the hadoop executable to determine its version""" if not self._hadoop_version: stdout = self.invoke_hadoop(['version'], return_stdout=True) if stdout: first_line = stdout.split('\n')[0] m = HADOOP_VERSION_RE.match(first_line) if m: self._hadoop_version = m.group('version') log.info("Using Hadoop version %s" % self._hadoop_version) return self._hadoop_version self._hadoop_version = '0.20.203' log.info("Unable to determine Hadoop version. Assuming 0.20.203.") return self._hadoop_version def _run(self): if self._opts['bootstrap_mrjob']: self._add_python_archive(self._create_mrjob_tar_gz()) self._check_input_exists() self._create_wrapper_script() self._add_job_files_for_upload() self._upload_local_files_to_hdfs() self._run_job_in_hadoop() def _check_input_exists(self): """Make sure all input exists before continuing with our job. """ for path in self._input_paths: if path == '-': continue # STDIN always exists if not self.path_exists(path): raise AssertionError( 'Input path %s does not exist!' 
% (path,)) def _add_job_files_for_upload(self): """Add files needed for running the job (setup and input) to self._upload_mgr.""" for path in self._get_input_paths(): self._upload_mgr.add(path) for path in self._working_dir_mgr.paths(): self._upload_mgr.add(path) def _upload_local_files_to_hdfs(self): """Copy files managed by self._upload_mgr to HDFS """ self._mkdir_on_hdfs(self._upload_mgr.prefix) log.info('Copying local files into %s' % self._upload_mgr.prefix) for path, uri in self._upload_mgr.path_to_uri().iteritems(): self._upload_to_hdfs(path, uri) def _mkdir_on_hdfs(self, path): log.debug('Making directory %s on HDFS' % path) self.invoke_hadoop(['fs', '-mkdir', path]) def _upload_to_hdfs(self, path, target): log.debug('Uploading %s -> %s on HDFS' % (path, target)) self.invoke_hadoop(['fs', '-put', path, target]) def _dump_stdin_to_local_file(self): """Dump sys.stdin to a local file, and return the path to it.""" stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN') # prompt user, so they don't think the process has stalled log.info('reading from STDIN') log.debug('dumping stdin to local file %s' % stdin_path) stdin_file = open(stdin_path, 'w') for line in self._stdin: stdin_file.write(line) return stdin_path def _run_job_in_hadoop(self): self._counters = [] steps = self._get_steps() for step_num, step in enumerate(steps): log.debug('running step %d of %d' % (step_num + 1, len(steps))) streaming_args = self._streaming_args(step, step_num, len(steps)) log.debug('> %s' % cmd_line(streaming_args)) step_proc = Popen(streaming_args, stdout=PIPE, stderr=PIPE) # TODO: use a pty or something so that the hadoop binary # won't buffer the status messages self._process_stderr_from_streaming(step_proc.stderr) # there shouldn't be much output to STDOUT for line in step_proc.stdout: log.error('STDOUT: ' + line.strip('\n')) returncode = step_proc.wait() if returncode == 0: # parsing needs step number for whole job self._fetch_counters([step_num + self._start_step_num]) # printing needs step number relevant to this run of mrjob self.print_counters([step_num + 1]) else: msg = ('Job failed with return code %d: %s' % (step_proc.returncode, streaming_args)) log.error(msg) # look for a Python traceback cause = self._find_probable_cause_of_failure( [step_num + self._start_step_num]) if cause: # log cause, and put it in exception cause_msg = [] # lines to log and put in exception cause_msg.append('Probable cause of failure (from %s):' % cause['log_file_uri']) cause_msg.extend(line.strip('\n') for line in cause['lines']) if cause['input_uri']: cause_msg.append('(while reading from %s)' % cause['input_uri']) for line in cause_msg: log.error(line) # add cause_msg to exception message msg += '\n' + '\n'.join(cause_msg) + '\n' raise Exception(msg) raise CalledProcessError(step_proc.returncode, streaming_args) def _process_stderr_from_streaming(self, stderr): for line in stderr: line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2) log.info('HADOOP: ' + line) if 'Streaming Job Failed!' in line: raise Exception(line) # The job identifier is printed to stderr. We only want to parse it # once because we know how many steps we have and just want to know # what Hadoop thinks the first step's number is. 
            m = HADOOP_JOB_TIMESTAMP_RE.match(line)
            if m and self._job_timestamp is None:
                self._job_timestamp = m.group('timestamp')
                self._start_step_num = int(m.group('step_num'))

    def _streaming_args(self, step, step_num, num_steps):
        version = self.get_hadoop_version()

        streaming_args = (self._opts['hadoop_bin'] +
                          ['jar', self._opts['hadoop_streaming_jar']])

        # -files/-archives (generic options, new-style)
        if supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            streaming_args.extend(
                self._new_upload_args(self._upload_mgr))

        # Add extra hadoop args first as hadoop args could be a hadoop
        # specific argument (e.g. -libjar) which must come before job
        # specific args.
        streaming_args.extend(
            self._hadoop_conf_args(step, step_num, num_steps))

        # set up input
        for input_uri in self._hdfs_step_input_files(step_num):
            streaming_args.extend(['-input', input_uri])

        # set up output
        streaming_args.append('-output')
        streaming_args.append(self._hdfs_step_output_dir(step_num))

        # -cacheFile/-cacheArchive (streaming options, old-style)
        if not supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            streaming_args.extend(
                self._old_upload_args(self._upload_mgr))

        mapper, combiner, reducer = (
            self._hadoop_streaming_commands(step, step_num))

        streaming_args.append('-mapper')
        streaming_args.append(mapper)

        if combiner:
            streaming_args.append('-combiner')
            streaming_args.append(combiner)

        if reducer:
            streaming_args.append('-reducer')
            streaming_args.append(reducer)
        else:
            streaming_args.extend(['-jobconf', 'mapred.reduce.tasks=0'])

        return streaming_args

    def _hdfs_step_input_files(self, step_num):
        """Get the hdfs:// URI for input for the given step."""
        if step_num == 0:
            return [self._upload_mgr.uri(p) for p in self._get_input_paths()]
        else:
            return [posixpath.join(
                self._hdfs_tmp_dir, 'step-output', str(step_num))]

    def _hdfs_step_output_dir(self, step_num):
        if step_num == len(self._get_steps()) - 1:
            return self._output_dir
        else:
            return posixpath.join(
                self._hdfs_tmp_dir, 'step-output', str(step_num + 1))

    def _cleanup_local_scratch(self):
        super(HadoopJobRunner, self)._cleanup_local_scratch()

        if self._hdfs_tmp_dir:
            log.info('deleting %s from HDFS' % self._hdfs_tmp_dir)

            try:
                self.invoke_hadoop(['fs', '-rmr', self._hdfs_tmp_dir])
            except Exception as e:
                log.exception(e)
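# --- Illustrative sketch (not part of mrjob itself) ----------------------
# For a single streaming step on a Hadoop version that supports the
# new-style distributed cache options, _streaming_args() above produces an
# argv list shaped roughly like the one below. The hdfs:// URIs, script
# name, and jar path are placeholders; the -mapper/-reducer values come
# from _hadoop_streaming_commands(), and '-jobconf mapred.reduce.tasks=0'
# is emitted instead of -reducer when the step has no reducer.
_EXAMPLE_STREAMING_ARGV = [
    'hadoop', 'jar', '/path/to/hadoop-streaming.jar',
    # generic options (new-style upload args, then extra hadoop args)
    '-files', 'hdfs:///user/hadoop/tmp/mrjob/files/mr_word_count.py',
    # input and output
    '-input', 'hdfs:///user/hadoop/tmp/mrjob/files/input.txt',
    '-output', 'hdfs:///user/hadoop/tmp/mrjob/output',
    # the streaming commands themselves
    '-mapper', 'python mr_word_count.py --step-num=0 --mapper',
    '-reducer', 'python mr_word_count.py --step-num=0 --reducer',
]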
class DataprocJobRunner(HadoopInTheCloudJobRunner, LogInterpretationMixin): """Runs an :py:class:`~mrjob.job.MRJob` on Google Cloud Dataproc. Invoked when you run your job with ``-r dataproc``. :py:class:`DataprocJobRunner` runs your job in an Dataproc cluster, which is basically a temporary Hadoop cluster. Input, support, and jar files can be either local or on GCS; use ``gs://...`` URLs to refer to files on GCS. This class has some useful utilities for talking directly to GCS and Dataproc, so you may find it useful to instantiate it without a script:: from mrjob.dataproc import DataprocJobRunner ... """ alias = 'dataproc' OPT_NAMES = HadoopInTheCloudJobRunner.OPT_NAMES | { 'cluster_properties', 'core_instance_config', 'gcloud_bin', 'master_instance_config', 'network', 'project_id', 'service_account', 'service_account_scopes', 'subnet', 'task_instance_config', } # no Spark support yet (see #1765) _STEP_TYPES = {'jar', 'streaming'} def __init__(self, **kwargs): """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(DataprocJobRunner, self).__init__(**kwargs) # check for library support if google is None: raise ImportError('You must install google-cloud-logging and ' 'google-cloud-storage to connect to Dataproc') # Dataproc requires a master and >= 2 core instances # num_core_instances refers ONLY to number of CORE instances and does # NOT include the required 1 instance for master # In other words, minimum cluster size is 3 machines, 1 master and 2 # "num_core_instances" workers if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS: raise DataprocException('Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS) if (self._opts['core_instance_type'] != self._opts['task_instance_type']): raise DataprocException( 'Dataproc v1 expects core/task instance types to be identical') # see #1820 if self._opts['image_id']: log.warning('mrjob does not yet support custom machine images' ' on Dataproc') # load credentials and project ID self._credentials, auth_project_id = google.auth.default( scopes=[_FULL_SCOPE]) # needed for $GOOGLE_APPLICATION_CREDENTIALS self._project_id = self._opts['project_id'] or auth_project_id if not self._project_id: raise DataprocException( 'project_id must be set. Use --project_id or' ' set $GOOGLE_CLOUD_PROJECT') self._fix_zone_and_region_opts() if self._opts['service_account_scopes']: self._opts['service_account_scopes'] = [ _fully_qualify_scope_uri(s) for s in self._opts['service_account_scopes'] ] # cluster_id can be None here self._cluster_id = self._opts['cluster_id'] self._api_client = None self._gcs_fs = None self._fs = None # BEGIN - setup directories base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir']) self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir) # use job key to make a unique tmp dir self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/' # pick/validate output dir if self._output_dir: self._output_dir = _check_and_fix_fs_dir(self._output_dir) else: self._output_dir = self._job_tmpdir + 'output/' # END - setup directories # manage local files that we want to upload to GCS. We'll add them # to this manager just before we need them. fs_files_dir = self._job_tmpdir + 'files/' self._upload_mgr = UploadDirManager(fs_files_dir) # when did our particular task start? 
self._dataproc_job_start = None # init hadoop, ami version caches self._image_version = None self._hadoop_version = None # map driver_output_uri to a dict with the keys: # log_uri: uri of file we're reading from # pos: position in file # buffer: bytes read from file already self._driver_output_state = {} # This will be filled by _run_steps() # NOTE - log_interpretations will be empty except job_id until we # parse task logs self._log_interpretations = [] def _fix_zone_and_region_opts(self): """Ensure that exactly one of region and zone is set.""" if self._opts['region'] and self._opts['zone']: log.warning('you do not need to set region if you set zone') self._opts['region'] = None return if not (self._opts['region'] or self._opts['zone']): if environ.get('CLOUDSDK_COMPUTE_ZONE'): self._opts['zone'] = environ['CLOUDSDK_COMPUTE_ZONE'] elif environ.get('CLOUDSDK_COMPUTE_REGION'): self._opts['region'] = environ['CLOUDSDK_COMPUTE_REGION'] else: self._opts['region'] = _DEFAULT_GCE_REGION def _default_opts(self): return combine_dicts( super(DataprocJobRunner, self)._default_opts(), dict( bootstrap_python=True, check_cluster_every=_DEFAULT_CHECK_CLUSTER_EVERY, cleanup=['CLUSTER', 'JOB', 'LOCAL_TMP'], cloud_fs_sync_secs=_DEFAULT_CLOUD_FS_SYNC_SECS, image_version=_DEFAULT_IMAGE_VERSION, instance_type=_DEFAULT_INSTANCE_TYPE, master_instance_type=_DEFAULT_INSTANCE_TYPE, num_core_instances=_DATAPROC_MIN_WORKERS, num_task_instances=0, )) def _combine_opts(self, opt_list): """Blank out conflicts between *network*/*subnet* and *region*/*zone*.""" opt_list = _blank_out_conflicting_opts(opt_list, ['region', 'zone']) opt_list = _blank_out_conflicting_opts(opt_list, ['network', 'subnet']) # now combine opts, with region/zone blanked out return super(DataprocJobRunner, self)._combine_opts(opt_list) @property def cluster_client(self): return google.cloud.dataproc_v1beta2.ClusterControllerClient( **self._client_create_kwargs()) @property def job_client(self): return google.cloud.dataproc_v1beta2.JobControllerClient( **self._client_create_kwargs()) @property def logging_client(self): return google.cloud.logging.Client(credentials=self._credentials, project=self._project_id) def _client_create_kwargs(self): if self._opts['region']: endpoint = '%s-%s' % (self._opts['region'], _DEFAULT_ENDPOINT) return dict(channel=google.api_core.grpc_helpers.create_channel( endpoint, credentials=self._credentials)) else: return dict(credentials=self._credentials) @property def api_client(self): raise NotImplementedError( '"api_client" was disabled in v0.6.2. Use "cluster_client"' ' or "job_client" instead.') @property def fs(self): """:py:class:`~mrjob.fs.base.Filesystem` object for SSH, S3, GCS, and the local filesystem. 
""" if self._fs is None: self._fs = CompositeFilesystem() location = self._opts['region'] or _zone_to_region( self._opts['zone']) self._fs.add_fs( 'gcs', GCSFilesystem( credentials=self._credentials, project_id=self._project_id, part_size=self._upload_part_size(), location=location, object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS, )) self._fs.add_fs('local', LocalFilesystem()) return self._fs def _get_tmpdir(self, given_tmpdir): """Helper for _fix_tmpdir""" if given_tmpdir: return given_tmpdir # Loop over buckets until we find one that matches region # NOTE - because this is a tmpdir, we look for a GCS bucket in the # same GCE region chosen_bucket_name = None # determine region for bucket region = self._region() for tmp_bucket_name in self.fs.gcs.get_all_bucket_names( prefix='mrjob-'): tmp_bucket = self.fs.gcs.get_bucket(tmp_bucket_name) # NOTE - GCP ambiguous Behavior - Bucket location is being # returned as UPPERCASE, ticket filed as of Apr 23, 2016 as docs # suggest lowercase. (As of Feb. 12, 2018, this is still true, # observed on google-cloud-sdk) if tmp_bucket.location.lower() == region: # Regions are both specified and match log.info("using existing temp bucket %s" % tmp_bucket_name) chosen_bucket_name = tmp_bucket_name break # Example default - "mrjob-us-central1-RANDOMHEX" if not chosen_bucket_name: chosen_bucket_name = '-'.join( ['mrjob', region, random_identifier()]) return 'gs://%s/tmp/' % chosen_bucket_name def _region(self): # region of cluster, which is either the region set by the user, # or the region derived from the zone they set. # used to pick bucket location and name cluster return self._opts['region'] or _zone_to_region(self._opts['zone']) def _run(self): self._launch() self._run_steps() def _launch(self): self._prepare_for_launch() self._launch_cluster() def _prepare_for_launch(self): self._check_output_not_exists() self._create_setup_wrapper_scripts() self._add_bootstrap_files_for_upload() self._add_job_files_for_upload() self._upload_local_files() self._wait_for_fs_sync() def _check_output_not_exists(self): """Verify the output path does not already exist. This avoids provisioning a cluster only to have Hadoop refuse to launch. """ if self.fs.exists(self._output_dir): raise IOError('Output path %s already exists!' % (self._output_dir, )) def _add_bootstrap_files_for_upload(self): """Add files needed by the bootstrap script to self._upload_mgr. Tar up mrjob if bootstrap_mrjob is True. Create the master bootstrap script if necessary. 
""" # lazily create mrjob.zip if self._bootstrap_mrjob(): self._create_mrjob_zip() self._bootstrap_dir_mgr.add('file', self._mrjob_zip_path) # all other files needed by the script are already in # _bootstrap_dir_mgr for path in self._bootstrap_dir_mgr.paths(): self._upload_mgr.add(path) # now that we know where the above files live, we can create # the master bootstrap script self._create_master_bootstrap_script_if_needed() if self._master_bootstrap_script_path: self._upload_mgr.add(self._master_bootstrap_script_path) def _add_job_files_for_upload(self): """Add files needed for running the job (setup and input) to self._upload_mgr.""" for path in self._working_dir_mgr.paths('archive'): self._upload_mgr.add(path) if self._opts['hadoop_streaming_jar']: self._upload_mgr.add(self._opts['hadoop_streaming_jar']) for step in self._get_steps(): if step.get('jar'): self._upload_mgr.add(step['jar']) ### Running the job ### def cleanup(self, mode=None): super(DataprocJobRunner, self).cleanup(mode=mode) # close our SSH tunnel, if any self._kill_ssh_tunnel() # stop the cluster if it belongs to us (it may have stopped on its # own already, but that's fine) if self._cluster_id and not self._opts['cluster_id']: self._cleanup_cluster() def _cleanup_cloud_tmp(self): # delete all the files we created if not self._job_tmpdir: return try: log.info('Removing all files in %s' % self._job_tmpdir) self.fs.rm(self._job_tmpdir) self._job_tmpdir = None except Exception as e: log.exception(e) # TODO - mtai @ davidmarin - Re-enable log support and supporting cleanup def _cleanup_logs(self): super(DataprocJobRunner, self)._cleanup_logs() def _cleanup_job(self): job_prefix = self._dataproc_job_prefix() for job in self._list_jobs(cluster_name=self._cluster_id, state_matcher=_STATE_MATCHER_ACTIVE): # Kill all active jobs with the same job_prefix as this job job_id = job.reference.job_id if not job_id.startswith(job_prefix): continue self._cancel_job(job_id) self._wait_for_api('job cancellation') def _cleanup_cluster(self): if not self._cluster_id: # If we don't have a cluster, then we can't terminate it. return try: log.info("Attempting to terminate cluster") self._delete_cluster(self._cluster_id) except Exception as e: log.exception(e) return log.info('cluster %s successfully terminated' % self._cluster_id) def _wait_for_api(self, msg): _wait_for(msg, self._opts['check_cluster_every']) def _wait_for_fs_sync(self): """Sleep for a little while, to give FS a chance to sync up. """ _wait_for('GCS sync (eventual consistency)', self._opts['cloud_fs_sync_secs']) def _streaming_step_job_kwarg(self, step_num): """Returns a map from ``'hadoop_job'`` to a dict representing a hadoop streaming job. 
""" return dict(hadoop_job=dict( args=self._hadoop_streaming_jar_args(step_num), main_jar_file_uri=self._hadoop_streaming_jar_uri(), )) def _jar_step_job_kwarg(self, step_num): """Returns a map from ``'hadoop_job'`` to a dict representing a Hadoop job that runs a JAR""" step = self._get_step(step_num) hadoop_job = {} hadoop_job['args'] = (self._interpolate_jar_step_args( step['args'], step_num)) jar_uri = self._upload_mgr.uri(step['jar']) # can't specify main_class and main_jar_file_uri; see # https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs#HadoopJob # noqa if step.get('main_class'): hadoop_job['jar_file_uris'] = [jar_uri] hadoop_job['main_class'] = step['main_class'] else: hadoop_job['main_jar_file_uri'] = jar_uri return dict(hadoop_job=hadoop_job) def _hadoop_streaming_jar_uri(self): if self._opts['hadoop_streaming_jar']: return self._upload_mgr.uri(self._opts['hadoop_streaming_jar']) else: return _HADOOP_STREAMING_JAR_URI def _launch_cluster(self): """Create an empty cluster on Dataproc, and set self._cluster_id to its ID.""" self.fs.mkdir(self._job_tmpdir) # clusterName must be a match of # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).' # as documented in an API error message # (not currently documented in the Dataproc docs) if not self._cluster_id: self._cluster_id = '-'.join( ['mrjob', self._region(), random_identifier()]) # Create the cluster if it's missing, otherwise join an existing one try: self._get_cluster(self._cluster_id) log.info('Adding job to existing cluster - %s' % self._cluster_id) except google.api_core.exceptions.NotFound: log.info('Creating Dataproc Hadoop cluster - %s' % self._cluster_id) cluster_data = self._cluster_create_kwargs() self._create_cluster(cluster_data) self._wait_for_cluster_ready(self._cluster_id) self._set_up_ssh_tunnel() # keep track of when we launched our job self._dataproc_job_start = time.time() return self._cluster_id def _wait_for_cluster_ready(self, cluster_id): # See https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters#State # noqa cluster_state = None # Poll until cluster is ready while cluster_state not in ('RUNNING', 'UPDATING'): cluster = self._get_cluster(cluster_id) cluster_state = cluster.status.State.Name(cluster.status.state) if cluster_state in ('ERROR', 'DELETING'): raise DataprocException(cluster) self._wait_for_api('cluster to accept jobs') return cluster_id def _dataproc_job_prefix(self): return _cleanse_gcp_job_id(self._job_key) def _run_steps(self): """Wait for every step of the job to complete, one by one.""" total_steps = self._num_steps() # define out steps for step_num in range(total_steps): job_id = self._launch_step(step_num) self._wait_for_step_to_complete(job_id, step_num=step_num, num_steps=total_steps) log.info('Completed Dataproc Hadoop Job - %s', job_id) # After all steps completed, wait for the last output (which is # usually written to GCS) to sync self._wait_for_fs_sync() def _launch_step(self, step_num): step = self._get_step(step_num) # Clean-up step name step_name = '%s---step-%05d-of-%05d' % ( self._dataproc_job_prefix(), step_num + 1, self._num_steps()) # Build step # job_kwarg is a single-item dict, where the key is 'hadoop_job', # 'spark_job', etc. 
if step['type'] == 'streaming': job_kwarg = self._streaming_step_job_kwarg(step_num) elif step['type'] == 'jar': job_kwarg = self._jar_step_job_kwarg(step_num) else: raise NotImplementedError('Unsupported step type: %r' % step['type']) # Submit it log.info('Submitting Dataproc Hadoop Job - %s', step_name) result = self._submit_job(step_name, job_kwarg) log.info('Submitted Dataproc Hadoop Job - %s', step_name) job_id = result.reference.job_id assert job_id == step_name return job_id def _wait_for_step_to_complete(self, job_id, step_num, num_steps): """Helper for _wait_for_step_to_complete(). Wait for step with the given ID to complete, and fetch counters. If it fails, attempt to diagnose the error, and raise an exception. This also adds an item to self._log_interpretations """ log_interpretation = dict(job_id=job_id) self._log_interpretations.append(log_interpretation) log_interpretation['step'] = {} step_type = self._get_step(step_num)['type'] while True: # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus # noqa job = self._get_job(job_id) job_state = job.status.State.Name(job.status.state) log.info('%s => %s' % (job_id, job_state)) log_interpretation['step']['driver_output_uri'] = ( job.driver_output_resource_uri) self._interpret_step_logs(log_interpretation, step_type) progress = log_interpretation['step'].get('progress') if progress: log.info(' ' + progress['message']) # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State # noqa # these are the states covered by the ACTIVE job state matcher, # plus SETUP_DONE if job_state in ('PENDING', 'RUNNING', 'CANCEL_PENDING', 'SETUP_DONE'): self._wait_for_api('job completion') continue # print counters if job wasn't CANCELLED if job_state != 'CANCELLED': self._log_counters(log_interpretation, step_num) if job_state == 'ERROR': error = self._pick_error(log_interpretation, step_type) if error: log.error('Probable cause of failure:\n\n%s\n\n' % _format_error(error)) # we're done, will return at the end of this if job_state == 'DONE': break else: raise StepFailedException(step_num=step_num, num_steps=num_steps) def _default_step_output_dir(self): # put intermediate data in HDFS return 'hdfs:///tmp/mrjob/%s/step-output' % self._job_key ### log intepretation ### # step def _interpret_step_logs(self, log_interpretation, step_type): """Hook for interpreting step logs. Unlike with most runners, you may call this multiple times and it will continue to parse the step log incrementally, which is useful for getting job progress.""" # don't turn this off even if read_logs opt is false; it's # the only way this runner can track job progress driver_output_uri = log_interpretation.get('step', {}).get('driver_output_uri') if driver_output_uri: self._update_step_interpretation(log_interpretation['step'], driver_output_uri) def _update_step_interpretation(self, step_interpretation, driver_output_uri): new_lines = self._get_new_driver_output_lines(driver_output_uri) _interpret_new_dataproc_step_stderr(step_interpretation, new_lines) def _get_new_driver_output_lines(self, driver_output_uri): """Get a list of complete job driver output lines that are new since the last time we checked. 
""" state = self._driver_output_state.setdefault( driver_output_uri, dict(log_uri=None, pos=0, buffer=b'')) # driver output is in logs with names like driveroutput.000000000 log_uris = sorted(self.fs.ls(driver_output_uri + '*')) for log_uri in log_uris: # initialize log_uri with first URI we see if state['log_uri'] is None: # log the location of job driver output just once log.info(' Parsing job driver output from %s*' % driver_output_uri) state['log_uri'] = log_uri # skip log files already parsed if log_uri < state['log_uri']: continue # when parsing the next file, reset *pos* elif log_uri > state['log_uri']: state['pos'] = 0 state['log_uri'] = log_uri log_blob = self.fs.gcs._get_blob(log_uri) try: new_data = log_blob.download_as_string(start=state['pos']) except (google.api_core.exceptions.NotFound, google.api_core.exceptions.RequestRangeNotSatisfiable): # blob was just created, or no more data is available break state['buffer'] += new_data state['pos'] += len(new_data) # convert buffer into lines, saving leftovers for next time stream = BytesIO(state['buffer']) state['buffer'] = b'' lines = [] for line_bytes in stream: if line_bytes.endswith(b'\n'): lines.append(to_unicode(line_bytes)) else: # leave final partial line (if any) in buffer state['buffer'] = line_bytes return lines # history def _interpret_history_log(self, log_interpretation): """Does nothing. We can't get the history logs, and we don't need them.""" if not self._read_logs(): return log_interpretation.setdefault('history', {}) # task def _interpret_task_logs(self, log_interpretation, step_type, error_attempt_ids=(), partial=True): """Scan node manager log to find failed container IDs of failed tasks, and then scan the corresponding stderr and syslogs.""" if 'task' in log_interpretation and ( partial or not log_interpretation['task'].get('partial')): return # already interpreted if not self._read_logs(): return step_interpretation = log_interpretation.get('step') or {} application_id = step_interpretation.get('application_id') if not application_id: log.warning( "Can't parse node manager logs; missing application ID") return log_interpretation['task'] = self._task_log_interpretation( application_id, step_type, partial) def _task_log_interpretation(self, application_id, step_type, partial=True): """Helper for :py:meth:`_interpret_task_logs`""" # not bothering with _read_logs() since this is a helper method result = {} for container_id in self._failed_task_container_ids(application_id): error = _parse_task_syslog_records( self._task_syslog_records(application_id, container_id, step_type)) if not error.get('hadoop_error'): # not sure if this ever happens, since we already know # which containers failed continue error['container_id'] = container_id # fix weird munging of java stacktrace error['hadoop_error']['message'] = _fix_java_stack_trace( error['hadoop_error']['message']) task_error = _parse_task_stderr( self._task_stderr_lines(application_id, container_id, step_type)) if task_error: task_error['message'] = _fix_traceback(task_error['message']) error['task_error'] = task_error result.setdefault('errors', []).append(error) # if partial is true, bail out when we find the first task error if task_error and partial: result['partial'] = True return result return result def _failed_task_container_ids(self, application_id): """Stream container IDs of failed tasks, in reverse order.""" container_id_prefix = 'container' + application_id[11:] log_filter = self._make_log_filter( 'yarn-yarn-nodemanager', {'jsonPayload.class': 
_CONTAINER_EXECUTOR_CLASS_NAME}) log.info('Scanning node manager logs for IDs of failed tasks...') # it doesn't seem to work to do self.logging_client.logger(); # there's some RPC dispute about whether the log name should # be qualified by project name or not entries = self.logging_client.list_entries( filter_=log_filter, order_by=google.cloud.logging.DESCENDING) for entry in entries: message = entry.payload.get('message') if not message: continue m = _CONTAINER_EXIT_RE.match(message) if not m: continue returncode = int(m.group('returncode')) if not returncode: continue container_id = m.group('container_id') # matches some other step if not container_id.startswith(container_id_prefix): continue log.debug(' %s' % container_id) yield container_id def _task_stderr_lines(self, application_id, container_id, step_type): """Yield lines from a specific stderr log.""" log_filter = self._make_log_filter( 'yarn-userlogs', { 'jsonPayload.application': application_id, 'jsonPayload.container': container_id, # TODO: pick based on step_type 'jsonPayload.container_logname': 'stderr', }) log.info(' reading stderr log...') entries = self.logging_client.list_entries(filter_=log_filter) # use log4j parsing to handle tab -> newline conversion for record in _log_entries_to_log4j(entries): for line in record['message'].split('\n'): yield line def _task_syslog_records(self, application_id, container_id, step_type): """Yield log4j records from a specific syslog. """ log_filter = self._make_log_filter( 'yarn-userlogs', { 'jsonPayload.application': application_id, 'jsonPayload.container': container_id, # TODO: pick based on step_type 'jsonPayload.container_logname': 'syslog', }) log.info(' reading syslog...') entries = self.logging_client.list_entries(filter_=log_filter) return _log_entries_to_log4j(entries) # misc def _make_log_filter(self, log_name=None, extra_values=None): # we only want logs from this project, cluster, and region d = {} d['resource.labels.cluster_name'] = self._cluster_id d['resource.labels.project_id'] = self._project_id d['resource.labels.region'] = self._region() d['resource.type'] = 'cloud_dataproc_cluster' if log_name: d['logName'] = 'projects/%s/logs/%s' % (self._project_id, log_name) if extra_values: d.update(extra_values) return _log_filter_str(d) def counters(self): return [ _pick_counters(log_interpretation) for log_interpretation in self._log_interpretations ] ### Bootstrapping ### def get_hadoop_version(self): if self._hadoop_version is None: self._store_cluster_info() return self._hadoop_version def get_image_version(self): """Get the version that our cluster is running. """ if self._image_version is None: self._store_cluster_info() return self._image_version def _store_cluster_info(self): """Set self._image_version and self._hadoop_version.""" if not self._cluster_id: raise ValueError('cluster has not yet been created') cluster = self._get_cluster(self._cluster_id) self._image_version = (cluster.config.software_config.image_version) # protect against new versions, including patch versions # we didn't explicitly request. 
See #1428 self._hadoop_version = map_version(self._image_version, _DATAPROC_IMAGE_TO_HADOOP_VERSION) def _bootstrap_pre_commands(self): # don't run the bootstrap script in / (see #1601) return [ 'mkdir /tmp/mrjob', 'cd /tmp/mrjob', ] ### Bootstrapping ### def _bootstrap_python(self): """Return a (possibly empty) list of parsed commands (in the same format as returned by parse_setup_cmd())'""" if not self._opts['bootstrap_python']: return [] if PY2: # Python 2 is already installed; install pip and dev packages return [ ['sudo apt-get install -y python-pip python-dev'], ] else: return [ ['sudo apt-get install -y python3 python3-pip python3-dev'], ] def get_cluster_id(self): return self._cluster_id def _cluster_create_kwargs(self): gcs_init_script_uris = [] if self._master_bootstrap_script_path: gcs_init_script_uris.append( self._upload_mgr.uri(self._master_bootstrap_script_path)) cluster_metadata = dict() cluster_metadata['mrjob-version'] = mrjob.__version__ # TODO: remove mrjob-max-secs-idle once lifecycle_config is visible # through the gcloud utility and the Google Cloud Console cluster_metadata['mrjob-max-secs-idle'] = str( int(self._opts['max_mins_idle'] * 60)) gce_cluster_config = dict( metadata=cluster_metadata, service_account_scopes=self._opts['service_account_scopes'], ) if self._opts['network']: gce_cluster_config['network_uri'] = self._opts['network'] if self._opts['subnet']: gce_cluster_config['subnetwork_uri'] = self._opts['subnet'] if self._opts['service_account']: gce_cluster_config['service_account'] = ( self._opts['service_account']) if self._opts['service_account_scopes']: gce_cluster_config['service_account_scopes'] = ( self._opts['service_account_scopes']) if self._opts['zone']: gce_cluster_config['zone_uri'] = _gcp_zone_uri( project=self._project_id, zone=self._opts['zone']) cluster_config = dict(gce_cluster_config=gce_cluster_config, initialization_actions=[ dict(executable_file=init_script_uri) for init_script_uri in gcs_init_script_uris ]) # Task tracker master_conf = _gcp_instance_group_config( project=self._project_id, zone=self._opts['zone'], count=1, instance_type=self._opts['master_instance_type'], ) if self._opts['master_instance_config']: master_conf.update(self._opts['master_instance_config']) # Compute + storage worker_conf = _gcp_instance_group_config( project=self._project_id, zone=self._opts['zone'], count=self._opts['num_core_instances'], instance_type=self._opts['core_instance_type']) if self._opts['core_instance_config']: worker_conf.update(self._opts['core_instance_config']) # Compute ONLY secondary_worker_conf = _gcp_instance_group_config( project=self._project_id, zone=self._opts['zone'], count=self._opts['num_task_instances'], instance_type=self._opts['task_instance_type'], is_preemptible=True) if self._opts['task_instance_config']: secondary_worker_conf.update(self._opts['task_instance_config']) cluster_config['master_config'] = master_conf cluster_config['worker_config'] = worker_conf if secondary_worker_conf.get('num_instances'): cluster_config['secondary_worker_config'] = secondary_worker_conf cluster_config['lifecycle_config'] = dict(idle_delete_ttl=dict( seconds=int(self._opts['max_mins_idle'] * 60))) software_config = {} if self._opts['cluster_properties']: software_config['properties'] = _values_to_text( self._opts['cluster_properties']) # See - https://cloud.google.com/dataproc/dataproc-versions if self._opts['image_version']: software_config['image_version'] = self._opts['image_version'] if software_config: 
cluster_config['software_config'] = software_config # in Python 2, dict keys loaded from JSON will be unicode, which # the Google protobuf objects don't like if PY2: cluster_config = _clean_json_dict_keys(cluster_config) kwargs = dict(project_id=self._project_id, cluster_name=self._cluster_id, config=cluster_config) return self._add_extra_cluster_params(kwargs) ### Dataproc-specific Stuff ### def _get_cluster(self, cluster_id): return self.cluster_client.get_cluster(cluster_name=cluster_id, **self._project_id_and_region()) def _create_cluster(self, cluster_data): # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/create # noqa # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/get # noqa self.cluster_client.create_cluster(cluster=cluster_data, **self._project_id_and_region()) def _delete_cluster(self, cluster_id): return self.cluster_client.delete_cluster( cluster_name=cluster_id, **self._project_id_and_region()) def _list_jobs(self, cluster_name=None, state_matcher=None): # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/list#JobStateMatcher # noqa list_kwargs = self._project_id_and_region() if cluster_name: list_kwargs['cluster_name'] = cluster_name if state_matcher: list_kwargs['job_state_matcher'] = state_matcher return self.job_client.list_jobs(**list_kwargs) def _get_job(self, job_id): return self.job_client.get_job(job_id=job_id, **self._project_id_and_region()) def _cancel_job(self, job_id): return self.job_client.cancel_job(job_id=job_id, **self._project_id_and_region()) def _submit_job(self, step_name, job_kwarg): # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/submit # noqa # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob # noqa # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobReference # noqa submit_job_kwargs = dict(job=dict( reference=dict(project_id=self._project_id, job_id=step_name), placement=dict(cluster_name=self._cluster_id), **job_kwarg), **self._project_id_and_region()) log.debug(' submit_job(%s)' % ', '.join('%s=%r' % (k, v) for k, v in sorted(submit_job_kwargs.items()))) return self.job_client.submit_job(**submit_job_kwargs) def _project_id_and_region(self): return dict( project_id=self._project_id, region=(self._opts['region'] or 'global'), ) def _manifest_download_commands(self): return [ # TODO: SSH in and figure out how to use gsutil or similar # ('gs://*', 'gsutil cp'), ('*://*', 'hadoop fs -copyToLocal'), ] ### SSH hooks ### def _job_tracker_host(self): return '%s-m' % self._cluster_id def _ssh_tunnel_config(self): return _SSH_TUNNEL_CONFIG def _launch_ssh_proc(self, args): ssh_proc = super(DataprocJobRunner, self)._launch_ssh_proc(args) # enter an empty passphrase if creating a key for the first time ssh_proc.stdin.write(b'\n\n') return ssh_proc def _ssh_launch_wait_secs(self): """Wait 20 seconds because gcloud has to update project metadata (unless we were going to check the cluster sooner anyway).""" return min(20.0, self._opts['check_cluster_every']) def _ssh_tunnel_args(self, bind_port): if not self._cluster_id: return gcloud_bin = self._opts['gcloud_bin'] or ['gcloud'] cluster = self._get_cluster(self._cluster_id) zone = cluster.config.gce_cluster_config.zone_uri.split('/')[-1] return gcloud_bin + [ 'compute', 'ssh', '--zone', zone, self._job_tracker_host(), '--', ] + self._ssh_tunnel_opts(bind_port)
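
# Editorial sketch (not part of mrjob): roughly the shape of the dict that
# _cluster_create_kwargs() above returns, before _add_extra_cluster_params()
# is applied. All values below are hypothetical; master_config/worker_config
# actually come from _gcp_instance_group_config(), whose keys aren't shown
# in this excerpt.
def _example_cluster_create_kwargs():
    """Illustrative (non-functional) cluster-creation kwargs."""
    return dict(
        project_id='my-project',               # hypothetical
        cluster_name='mrjob-us-west1-abc123',  # hypothetical
        config=dict(
            gce_cluster_config=dict(
                metadata={'mrjob-version': '0.6.0',
                          'mrjob-max-secs-idle': '600'},
                service_account_scopes=[],
            ),
            initialization_actions=[
                # the master bootstrap script uploaded to GCS, if any
                dict(executable_file='gs://mrjob-us-west1-abc123/tmp/b.py'),
            ],
            master_config={},    # from _gcp_instance_group_config()
            worker_config={},    # from _gcp_instance_group_config()
            lifecycle_config=dict(idle_delete_ttl=dict(seconds=600)),
        ),
    )
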
class HadoopJobRunner(MRJobRunner, LogInterpretationMixin): """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster. Invoked when you run your job with ``-r hadoop``. Input and support files can be either local or on HDFS; use ``hdfs://...`` URLs to refer to files on HDFS. """ alias = 'hadoop' OPTION_STORE_CLASS = HadoopRunnerOptionStore def __init__(self, **kwargs): """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(HadoopJobRunner, self).__init__(**kwargs) if self._opts['hadoop_home']: log.warning( 'hadoop_home is deprecated since 0.5.0 and will be removed' ' in v0.6.0. In most cases, mrjob will now find the hadoop' ' binary and streaming jar without help. If not, use the' ' hadoop_bin and hadoop_streaming_jar options.') self._hadoop_tmp_dir = fully_qualify_hdfs_path( posixpath.join(self._opts['hadoop_tmp_dir'], self._job_key)) # Keep track of local files to upload to HDFS. We'll add them # to this manager just before we need them. hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(hdfs_files_dir) # Set output dir if it wasn't set explicitly self._output_dir = fully_qualify_hdfs_path( self._output_dir or posixpath.join(self._hadoop_tmp_dir, 'output')) # Fully qualify step_output_dir, if set if self._step_output_dir: self._step_output_dir = fully_qualify_hdfs_path( self._step_output_dir) # Track job and (YARN) application ID to enable log parsing self._application_id = None self._job_id = None # Keep track of where the hadoop streaming jar is self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar'] self._searched_for_hadoop_streaming_jar = False # Keep track of where the spark-submit binary is self._spark_submit_bin = self._opts['spark_submit_bin'] # List of dicts (one for each step) potentially containing # the keys 'history', 'step', and 'task' ('step' will always # be filled because it comes from the hadoop jar command output, # others will be filled as needed) self._log_interpretations = [] @property def fs(self): """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local filesystem. """ if self._fs is None: self._fs = CompositeFilesystem( HadoopFilesystem(self._opts['hadoop_bin']), LocalFilesystem()) return self._fs def get_hadoop_version(self): """Invoke the hadoop executable to determine its version""" return self.fs.get_hadoop_version() def get_hadoop_bin(self): """Find the hadoop binary. A list: binary followed by arguments.""" return self.fs.get_hadoop_bin() def get_hadoop_streaming_jar(self): """Find the path of the hadoop streaming jar, or None if not found.""" if not (self._hadoop_streaming_jar or self._searched_for_hadoop_streaming_jar): self._hadoop_streaming_jar = self._find_hadoop_streaming_jar() if self._hadoop_streaming_jar: log.info('Found Hadoop streaming jar: %s' % self._hadoop_streaming_jar) else: log.warning('Hadoop streaming jar not found. Use' ' --hadoop-streaming-jar') self._searched_for_hadoop_streaming_jar = True return self._hadoop_streaming_jar def _find_hadoop_streaming_jar(self): """Search for the hadoop streaming jar. See :py:meth:`_hadoop_streaming_jar_dirs` for where we search.""" for path in unique(self._hadoop_streaming_jar_dirs()): log.info('Looking for Hadoop streaming jar in %s...' 
% path) streaming_jars = [] for path in self.fs.ls(path): if _HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)): streaming_jars.append(path) if streaming_jars: # prefer shorter names and shallower paths def sort_key(p): return (len(p.split('/')), len(posixpath.basename(p)), p) streaming_jars.sort(key=sort_key) return streaming_jars[0] return None def _hadoop_dirs(self): """Yield all possible hadoop directories (used for streaming jar and logs). May yield duplicates""" if self._opts['hadoop_home']: yield self._opts['hadoop_home'] for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL', 'HADOOP_MAPRED_HOME'): path = os.environ.get(name) if path: yield path # guess it from the path of the Hadoop binary hadoop_home = _hadoop_prefix_from_bin(self.get_hadoop_bin()[0]) if hadoop_home: yield hadoop_home # try HADOOP_*_HOME for name, path in sorted(os.environ.items()): if name.startswith('HADOOP_') and name.endswith('_HOME'): yield path def _hadoop_streaming_jar_dirs(self): """Yield all possible places to look for the Hadoop streaming jar. May yield duplicates. """ for hadoop_dir in self._hadoop_dirs(): yield hadoop_dir # use hard-coded paths to work out-of-the-box on EMR for path in _EMR_HADOOP_STREAMING_JAR_DIRS: yield path def _hadoop_log_dirs(self, output_dir=None): """Yield all possible places to look for hadoop logs.""" # hadoop_log_dirs opt overrides all this if self._opts['hadoop_log_dirs']: for path in self._opts['hadoop_log_dirs']: yield path return hadoop_log_dir = os.environ.get('HADOOP_LOG_DIR') if hadoop_log_dir: yield hadoop_log_dir yarn = uses_yarn(self.get_hadoop_version()) if yarn: yarn_log_dir = os.environ.get('YARN_LOG_DIR') if yarn_log_dir: yield yarn_log_dir yield _DEFAULT_YARN_HDFS_LOG_DIR if output_dir: # Cloudera style of logging yield posixpath.join(output_dir, '_logs') for hadoop_dir in self._hadoop_dirs(): yield posixpath.join(hadoop_dir, 'logs') # hard-coded fallback paths if yarn: for path in _FALLBACK_HADOOP_YARN_LOG_DIRS: yield path for path in _FALLBACK_HADOOP_LOG_DIRS: yield path def get_spark_submit_bin(self): if not self._spark_submit_bin: self._spark_submit_bin = self._find_spark_submit_bin() return self._spark_submit_bin def _find_spark_submit_bin(self): # TODO: this is very similar to _find_hadoop_bin() (in fs) for path in unique(self._spark_submit_bin_dirs()): log.info('Looking for spark-submit binary in %s...' % (path or '$PATH')) spark_submit_bin = which('spark-submit', path=path) if spark_submit_bin: log.info('Found spark-submit binary: %s' % spark_submit_bin) return [spark_submit_bin] else: log.info("Falling back to 'spark-submit'") return ['spark-submit'] def _spark_submit_bin_dirs(self): # $SPARK_HOME spark_home = os.environ.get('SPARK_HOME') if spark_home: yield os.path.join(spark_home, 'bin') yield None # use $PATH # some other places recommended by install docs (see #1366) yield '/usr/lib/spark/bin' yield '/usr/local/spark/bin' yield '/usr/local/lib/spark/bin' def _run(self): self._find_binaries_and_jars() self._check_input_exists() self._create_setup_wrapper_script() self._add_job_files_for_upload() self._upload_local_files_to_hdfs() self._run_job_in_hadoop() def _find_binaries_and_jars(self): """Find hadoop and (if needed) spark-submit bin up-front, before continuing with the job. (This is just for user-interaction purposes; these would otherwise lazy-load as needed.) 
""" # this triggers looking for Hadoop binary self.get_hadoop_version() if self._has_streaming_steps(): self.get_hadoop_streaming_jar() if self._has_spark_steps(): self.get_spark_submit_bin() def _check_input_exists(self): """Make sure all input exists before continuing with our job. """ for path in self._input_paths: if path == '-': continue # STDIN always exists if self._opts['check_input_paths']: if not self.fs.exists(path): raise AssertionError('Input path %s does not exist!' % (path, )) def _add_job_files_for_upload(self): """Add files needed for running the job (setup and input) to self._upload_mgr.""" for path in self._get_input_paths(): self._upload_mgr.add(path) for path in self._working_dir_mgr.paths(): self._upload_mgr.add(path) def _upload_local_files_to_hdfs(self): """Copy files managed by self._upload_mgr to HDFS """ self.fs.mkdir(self._upload_mgr.prefix) log.info('Copying local files to %s...' % self._upload_mgr.prefix) for path, uri in self._upload_mgr.path_to_uri().items(): self._upload_to_hdfs(path, uri) def _upload_to_hdfs(self, path, target): log.debug(' %s -> %s' % (path, target)) self.fs._put(path, target) def _dump_stdin_to_local_file(self): """Dump sys.stdin to a local file, and return the path to it.""" stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN') # prompt user, so they don't think the process has stalled log.info('reading from STDIN') log.debug('dumping stdin to local file %s...' % stdin_path) stdin_file = open(stdin_path, 'wb') for line in self._stdin: stdin_file.write(line) return stdin_path def _run_job_in_hadoop(self): for step_num, step in enumerate(self._get_steps()): self._warn_about_spark_archives(step) step_args = self._args_for_step(step_num) env = self._env_for_step(step_num) # log this *after* _args_for_step(), which can start a search # for the Hadoop streaming jar log.info('Running step %d of %d...' 
% (step_num + 1, self._num_steps())) log.debug('> %s' % cmd_line(step_args)) log.debug(' with environment: %r' % sorted(env.items())) log_interpretation = {} self._log_interpretations.append(log_interpretation) # try to use a PTY if it's available try: pid, master_fd = pty.fork() except (AttributeError, OSError): # no PTYs, just use Popen # user won't get much feedback for a while, so tell them # Hadoop is running log.debug('No PTY available, using Popen() to invoke Hadoop') step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env) step_interpretation = _interpret_hadoop_jar_command_stderr( step_proc.stderr, record_callback=_log_record_from_hadoop) # there shouldn't be much output to STDOUT for line in step_proc.stdout: _log_line_from_hadoop(to_string(line).strip('\r\n')) step_proc.stdout.close() step_proc.stderr.close() returncode = step_proc.wait() else: # we have PTYs if pid == 0: # we are the child process os.execvpe(step_args[0], step_args, env) else: log.debug('Invoking Hadoop via PTY') with os.fdopen(master_fd, 'rb') as master: # reading from master gives us the subprocess's # stderr and stdout (it's a fake terminal) step_interpretation = ( _interpret_hadoop_jar_command_stderr( master, record_callback=_log_record_from_hadoop)) _, returncode = os.waitpid(pid, 0) # make sure output_dir is filled if 'output_dir' not in step_interpretation: step_interpretation['output_dir'] = ( self._step_output_uri(step_num)) log_interpretation['step'] = step_interpretation step_type = step['type'] if not _is_spark_step_type(step_type): counters = self._pick_counters(log_interpretation, step_type) if counters: log.info(_format_counters(counters)) else: log.warning('No counters found') if returncode: error = self._pick_error(log_interpretation, step_type) if error: log.error('Probable cause of failure:\n\n%s\n' % _format_error(error)) # use CalledProcessError's well-known message format reason = str(CalledProcessError(returncode, step_args)) raise StepFailedException(reason=reason, step_num=step_num, num_steps=self._num_steps()) def _warn_about_spark_archives(self, step): """If *step* is a Spark step, the *upload_archives* option is set, and *spark_master* is not ``'yarn'``, warn that *upload_archives* will be ignored by Spark.""" if (_is_spark_step_type(step['type']) and self._opts['spark_master'] != 'yarn' and self._opts['upload_archives']): log.warning('Spark will probably ignore archives because' " spark_master is not set to 'yarn'") def _args_for_step(self, step_num): step = self._get_step(step_num) if step['type'] == 'streaming': return self._args_for_streaming_step(step_num) elif step['type'] == 'jar': return self._args_for_jar_step(step_num) elif _is_spark_step_type(step['type']): return self._args_for_spark_step(step_num) else: raise AssertionError('Bad step type: %r' % (step['type'], )) def _args_for_streaming_step(self, step_num): hadoop_streaming_jar = self.get_hadoop_streaming_jar() if not hadoop_streaming_jar: raise Exception('no Hadoop streaming jar') mapper, combiner, reducer = (self._hadoop_streaming_commands(step_num)) args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar] # set up uploading from HDFS to the working dir args.extend(self._upload_args()) # if no reducer, shut off reducer tasks. This has to come before # extra hadoop args, which could contain jar-specific args # (e.g. -outputformat). See #1331. # # might want to just integrate this into _hadoop_args_for_step? 
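        # (illustrative: on YARN/Hadoop 2, translate_jobconf() keeps the name
        # 'mapreduce.job.reduces'; on Hadoop 1 it would translate to the
        # older 'mapred.reduce.tasks')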
if not reducer: args.extend([ '-D', ('%s=0' % translate_jobconf('mapreduce.job.reduces', self.get_hadoop_version())) ]) # Add extra hadoop args first as hadoop args could be a hadoop # specific argument which must come before job # specific args. args.extend(self._hadoop_args_for_step(step_num)) # set up input for input_uri in self._step_input_uris(step_num): args.extend(['-input', input_uri]) # set up output args.append('-output') args.append(self._step_output_uri(step_num)) args.append('-mapper') args.append(mapper) if combiner: args.append('-combiner') args.append(combiner) if reducer: args.append('-reducer') args.append(reducer) return args def _args_for_jar_step(self, step_num): step = self._get_step(step_num) args = [] args.extend(self.get_hadoop_bin()) # -libjars, -D args.extend(self._hadoop_generic_args_for_step(step_num)) # special case for consistency with EMR runner. # # This might look less like duplicated code if we ever # implement #780 (fetching jars from URIs) if step['jar'].startswith('file:///'): jar = step['jar'][7:] # keep leading slash else: jar = step['jar'] args.extend(['jar', jar]) if step.get('main_class'): args.append(step['main_class']) if step.get('args'): args.extend( self._interpolate_input_and_output(step['args'], step_num)) return args def _spark_submit_arg_prefix(self): return ['--master', self._opts['spark_master']] def _env_for_step(self, step_num): step = self._get_step(step_num) env = dict(os.environ) # when running spark-submit, set its environment directly. See #1464 if _is_spark_step_type(step['type']): env.update(self._spark_cmdenv(step_num)) return env def _default_step_output_dir(self): return posixpath.join(self._hadoop_tmp_dir, 'step-output') def _cleanup_hadoop_tmp(self): if self._hadoop_tmp_dir: log.info('Removing HDFS temp directory %s...' % self._hadoop_tmp_dir) try: self.fs.rm(self._hadoop_tmp_dir) except Exception as e: log.exception(e) ### LOG (implementation of LogInterpretationMixin) ### def _stream_history_log_dirs(self, output_dir=None): """Yield lists of directories to look for the history log in.""" for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)): if _logs_exist(self.fs, log_dir): log.info('Looking for history log in %s...' % log_dir) # logs aren't always in a subdir named history/ yield [log_dir] def _stream_task_log_dirs(self, application_id=None, output_dir=None): """Yield lists of directories to look for the task logs in.""" # Note: this is unlikely to be super-helpful on "real" (multi-node) # pre-YARN Hadoop because task logs aren't generally shipped to a # local directory. It's a start, anyways. See #1201. for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)): if application_id: path = self.fs.join(log_dir, 'userlogs', application_id) else: path = self.fs.join(log_dir, 'userlogs') if _logs_exist(self.fs, path): log.info('Looking for task syslogs in %s...' % path) yield [path] def counters(self): return [ _pick_counters(log_interpretation) for log_interpretation in self._log_interpretations ]
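
# Editorial sketch (not part of mrjob): for a mapper-only streaming step,
# _args_for_streaming_step() above assembles a command roughly like the
# following (paths, upload args, and the mapper command are hypothetical):
#
#   hadoop jar /path/to/hadoop-streaming.jar \
#       <upload args from _upload_args()> \
#       -D mapreduce.job.reduces=0 \
#       <extra hadoop args> \
#       -input hdfs:///user/me/tmp/mrjob/<job key>/files/STDIN \
#       -output hdfs:///user/me/tmp/mrjob/<job key>/step-output/... \
#       -mapper 'python mr_job.py --step-num=0 --mapper'
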
class DataprocJobRunner(MRJobRunner): """Runs an :py:class:`~mrjob.job.MRJob` on Google Cloud Dataproc. Invoked when you run your job with ``-r dataproc``. :py:class:`DataprocJobRunner` runs your job in an Dataproc cluster, which is basically a temporary Hadoop cluster. Input, support, and jar files can be either local or on GCS; use ``gs://...`` URLs to refer to files on GCS. This class has some useful utilities for talking directly to GCS and Dataproc, so you may find it useful to instantiate it without a script:: from mrjob.dataproc import DataprocJobRunner ... """ alias = 'dataproc' # Don't need to bootstrap mrjob in the setup wrapper; that's what # the bootstrap script is for! BOOTSTRAP_MRJOB_IN_SETUP = False OPTION_STORE_CLASS = DataprocRunnerOptionStore def __init__(self, **kwargs): """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(DataprocJobRunner, self).__init__(**kwargs) # Lazy-load gcloud config as needed - invocations fail in PyCharm # debugging self._gcloud_config = None # Google Cloud Platform - project self._gcp_project = (self._opts['gcp_project'] or self.gcloud_config()['core.project']) # Google Compute Engine - Region / Zone self._gce_region = (self._opts['region'] or self.gcloud_config()['compute.region']) self._gce_zone = (self._opts['zone'] or self.gcloud_config()['compute.zone']) # cluster_id can be None here self._cluster_id = self._opts['cluster_id'] self._api_client = None self._gcs_fs = None self._fs = None # BEGIN - setup directories base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir']) self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir) # use job key to make a unique tmp dir self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/' # pick/validate output dir if self._output_dir: self._output_dir = _check_and_fix_fs_dir(self._output_dir) else: self._output_dir = self._job_tmpdir + 'output/' # END - setup directories # manage working dir for bootstrap script self._bootstrap_dir_mgr = BootstrapWorkingDirManager() # manage local files that we want to upload to GCS. We'll add them # to this manager just before we need them. fs_files_dir = self._job_tmpdir + 'files/' self._upload_mgr = UploadDirManager(fs_files_dir) self._bootstrap = self._bootstrap_python() + self._parse_bootstrap() for cmd in self._bootstrap: for maybe_path_dict in cmd: if isinstance(maybe_path_dict, dict): self._bootstrap_dir_mgr.add(**maybe_path_dict) # we'll create the script later self._master_bootstrap_script_path = None # when did our particular task start? self._dataproc_job_start = None # init hadoop, ami version caches self._image_version = None self._hadoop_version = None # This will be filled by _run_steps() # NOTE - log_interpretations will be empty except job_id until we # parse task logs self._log_interpretations = [] def gcloud_config(self): """Lazy load gcloud SDK configs""" if not self._gcloud_config: self._gcloud_config = _read_gcloud_config() return self._gcloud_config @property def api_client(self): if not self._api_client: credentials = GoogleCredentials.get_application_default() api_client = discovery.build(_DATAPROC_API_ENDPOINT, _DATAPROC_API_VERSION, credentials=credentials) self._api_client = api_client.projects().regions() return self._api_client @property def fs(self): """:py:class:`~mrjob.fs.base.Filesystem` object for SSH, S3, GCS, and the local filesystem. 
""" if self._fs is not None: return self._fs self._gcs_fs = GCSFilesystem() self._fs = CompositeFilesystem(self._gcs_fs, LocalFilesystem()) return self._fs def _get_tmpdir(self, given_tmpdir): """Helper for _fix_tmpdir""" if given_tmpdir: return given_tmpdir mrjob_buckets = self.fs.list_buckets(self._gcp_project, prefix='mrjob-') # Loop over buckets until we find one that matches region # NOTE - because this is a tmpdir, we look for a GCS bucket in the # same GCE region chosen_bucket_name = None gce_lower_location = self._gce_region.lower() for tmp_bucket in mrjob_buckets: tmp_bucket_name = tmp_bucket['name'] # NOTE - GCP ambiguous Behavior - Bucket location is being # returned as UPPERCASE, ticket filed as of Apr 23, 2016 as docs # suggest lowercase lower_location = tmp_bucket['location'].lower() if lower_location == gce_lower_location: # Regions are both specified and match log.info("using existing temp bucket %s" % tmp_bucket_name) chosen_bucket_name = tmp_bucket_name break # Example default - "mrjob-us-central1-RANDOMHEX" if not chosen_bucket_name: chosen_bucket_name = '-'.join( ['mrjob', gce_lower_location, random_identifier()]) return 'gs://%s/tmp/' % chosen_bucket_name def _run(self): self._launch() self._run_steps() def _launch(self): self._prepare_for_launch() self._launch_cluster() def _prepare_for_launch(self): self._check_input_exists() self._check_output_not_exists() self._create_setup_wrapper_script() self._add_bootstrap_files_for_upload() self._add_job_files_for_upload() self._upload_local_files_to_fs() def _check_input_exists(self): """Make sure all input exists before continuing with our job. """ if not self._opts['check_input_paths']: return for path in self._input_paths: if path == '-': continue # STDIN always exists if is_uri(path) and not is_gcs_uri(path): continue # can't check non-GCS URIs, hope for the best if not self.fs.exists(path): raise AssertionError('Input path %s does not exist!' % (path, )) def _check_output_not_exists(self): """Verify the output path does not already exist. This avoids provisioning a cluster only to have Hadoop refuse to launch. """ if self.fs.exists(self._output_dir): raise IOError('Output path %s already exists!' % (self._output_dir, )) def _add_bootstrap_files_for_upload(self): """Add files needed by the bootstrap script to self._upload_mgr. Tar up mrjob if bootstrap_mrjob is True. Create the master bootstrap script if necessary. 
""" # lazily create mrjob.zip if self._bootstrap_mrjob(): self._create_mrjob_zip() self._bootstrap_dir_mgr.add('file', self._mrjob_zip_path) # all other files needed by the script are already in # _bootstrap_dir_mgr for path in self._bootstrap_dir_mgr.paths(): self._upload_mgr.add(path) # now that we know where the above files live, we can create # the master bootstrap script self._create_master_bootstrap_script_if_needed() if self._master_bootstrap_script_path: self._upload_mgr.add(self._master_bootstrap_script_path) self._upload_mgr.add(_MAX_HOURS_IDLE_BOOTSTRAP_ACTION_PATH) def _add_job_files_for_upload(self): """Add files needed for running the job (setup and input) to self._upload_mgr.""" for path in self._get_input_paths(): self._upload_mgr.add(path) for path in self._working_dir_mgr.paths(): self._upload_mgr.add(path) # TODO - mtai @ davidmarin - hadoop_streaming_jar is currently ignored, # see _HADOOP_STREAMING_JAR_URI # if self._opts['hadoop_streaming_jar']: # self._upload_mgr.add(self._opts['hadoop_streaming_jar']) for step in self._get_steps(): if step.get('jar'): self._upload_mgr.add(step['jar']) def _upload_local_files_to_fs(self): """Copy local files tracked by self._upload_mgr to FS.""" bucket_name, _ = parse_gcs_uri(self._job_tmpdir) self._create_fs_tmp_bucket(bucket_name) log.info('Copying non-input files into %s' % self._upload_mgr.prefix) for path, gcs_uri in self._upload_mgr.path_to_uri().items(): log.debug('uploading %s -> %s' % (path, gcs_uri)) # TODO - mtai @ davidmarin - Implement put function for other FSs self.fs.put(path, gcs_uri) self._wait_for_fs_sync() def _create_fs_tmp_bucket(self, bucket_name, location=None): """Create a temp bucket if missing Tie the temporary bucket to the same region as the GCE job and set a 28-day TTL """ # Return early if our bucket already exists try: self.fs.get_bucket(bucket_name) return except google_errors.HttpError as e: if not e.resp.status == 404: raise log.info('creating FS bucket %r' % bucket_name) location = location or self._gce_region # NOTE - By default, we create a bucket in the same GCE region as our # job (tmp buckets ONLY) # https://cloud.google.com/storage/docs/bucket-locations self.fs.create_bucket( self._gcp_project, bucket_name, location=location, object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS) self._wait_for_fs_sync() ### Running the job ### def cleanup(self, mode=None): super(DataprocJobRunner, self).cleanup(mode=mode) # stop the cluster if it belongs to us (it may have stopped on its # own already, but that's fine) if self._cluster_id and not self._opts['cluster_id']: self._cleanup_cluster() def _cleanup_cloud_tmp(self): # delete all the files we created if not self._job_tmpdir: return try: log.info('Removing all files in %s' % self._job_tmpdir) self.fs.rm(self._job_tmpdir) self._job_tmpdir = None except Exception as e: log.exception(e) # TODO - mtai @ davidmarin - Re-enable log support and supporting cleanup def _cleanup_logs(self): super(DataprocJobRunner, self)._cleanup_logs() def _cleanup_job(self): job_prefix = self._dataproc_job_prefix() for current_job in self._api_job_list(cluster_name=self._cluster_id, state_matcher='ACTIVE'): # Kill all active jobs with the same job_prefix as this job current_job_id = current_job['reference']['jobId'] if not current_job_id.startswith(job_prefix): continue self._api_job_cancel(current_job_id) self._wait_for_api('job cancellation') def _cleanup_cluster(self): if not self._cluster_id: # If we don't have a cluster, then we can't terminate it. 
return try: log.info("Attempting to terminate cluster") self._api_cluster_delete(self._cluster_id) except Exception as e: log.exception(e) return log.info('cluster %s successfully terminated' % self._cluster_id) def _wait_for_api(self, msg): _wait_for(msg, self._opts['check_cluster_every']) def _wait_for_fs_sync(self): """Sleep for a little while, to give FS a chance to sync up. """ _wait_for('GCS sync (eventual consistency)', self._opts['cloud_fs_sync_secs']) def _build_dataproc_hadoop_job(self, step_num): """This function creates a "HadoopJob" to be passed to self._api_job_submit_hadoop :param step_num: :return: output_hadoop_job """ # Reference: https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob # noqa args = list() file_uris = list() archive_uris = list() properties = dict() step = self._get_step(step_num) assert step['type'] in ('streaming', 'jar'), ('Bad step type: %r' % (step['type'], )) # TODO - mtai @ davidmarin - Might be trivial to support jar running, # see "mainJarFileUri" of variable "output_hadoop_job" in this function # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob # noqa assert step['type'] == 'streaming', 'Jar not implemented' main_jar_uri = _HADOOP_STREAMING_JAR_URI # TODO - mtai @ davidmarin - Not clear if we should move _upload_args # to file_uris, currently works fine as-is # TODO - dmarin @ mtai - Probably a little safer to do the API's way, # assuming the API supports distributed cache syntax (so we can pick # the names of the uploaded files). args.extend(self._upload_args()) args.extend(self._hadoop_args_for_step(step_num)) mapper, combiner, reducer = (self._hadoop_streaming_commands(step_num)) if mapper: args += ['-mapper', mapper] if combiner: args += ['-combiner', combiner] if reducer: args += ['-reducer', reducer] for current_input_uri in self._step_input_uris(step_num): args += ['-input', current_input_uri] args += ['-output', self._step_output_uri(step_num)] # TODO - mtai @ davidmarin - Add back support to specify a different # mainJarFileURI output_hadoop_job = dict(args=args, fileUris=file_uris, archiveUris=archive_uris, properties=properties, mainJarFileUri=main_jar_uri) return output_hadoop_job def _launch_cluster(self): """Create an empty cluster on Dataproc, and set self._cluster_id to its ID.""" bucket_name, _ = parse_gcs_uri(self._job_tmpdir) self._create_fs_tmp_bucket(bucket_name) # clusterName must be a match of # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).' 
# as documented in an API error message # (not currently documented in the Dataproc docs) if not self._cluster_id: self._cluster_id = '-'.join( ['mrjob', self._gce_zone.lower(), random_identifier()]) # Create the cluster if it's missing, otherwise join an existing one try: self._api_cluster_get(self._cluster_id) log.info('Adding job to existing cluster - %s' % self._cluster_id) except google_errors.HttpError as e: if not e.resp.status == 404: raise log.info('Creating Dataproc Hadoop cluster - %s' % self._cluster_id) cluster_data = self._cluster_create_args() self._api_cluster_create(cluster_data) self._wait_for_cluster_ready(self._cluster_id) # keep track of when we launched our job self._dataproc_job_start = time.time() return self._cluster_id def _wait_for_cluster_ready(self, cluster_id): # See https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters#State # noqa cluster_state = None # Poll until cluster is ready while cluster_state not in _DATAPROC_CLUSTER_STATES_READY: result_describe = self.api_client.clusters().get( projectId=self._gcp_project, region=_DATAPROC_API_REGION, clusterName=cluster_id).execute() cluster_state = result_describe['status']['state'] if cluster_state in _DATAPROC_CLUSTER_STATES_ERROR: raise DataprocException(result_describe) self._wait_for_api('cluster to accept jobs') assert cluster_state in _DATAPROC_CLUSTER_STATES_READY log.info("Cluster %s ready", cluster_id) return cluster_id def _dataproc_job_prefix(self): return _cleanse_gcp_job_id(self._job_key) def _run_steps(self): """Wait for every step of the job to complete, one by one.""" total_steps = self._num_steps() # define out steps for step_num in range(total_steps): job_id = self._launch_step(step_num) self._wait_for_step_to_complete(job_id, step_num=step_num, num_steps=total_steps) log.info('Completed Dataproc Hadoop Job - %s', job_id) # After all steps completed, wait for the last output (which is # usually written to GCS) to sync self._wait_for_fs_sync() def _launch_step(self, step_num): # Build each step hadoop_job = self._build_dataproc_hadoop_job(step_num) # Clean-up step name step_name = '%s---step-%05d-of-%05d' % ( self._dataproc_job_prefix(), step_num + 1, self._num_steps()) # Submit it log.info('Submitting Dataproc Hadoop Job - %s', step_name) result = self._api_job_submit_hadoop(step_name, hadoop_job) log.info('Submitted Dataproc Hadoop Job - %s', step_name) job_id = result['reference']['jobId'] assert job_id == step_name return job_id def _wait_for_step_to_complete(self, job_id, step_num=None, num_steps=None): """Helper for _wait_for_step_to_complete(). Wait for step with the given ID to complete, and fetch counters. If it fails, attempt to diagnose the error, and raise an exception. 
This also adds an item to self._log_interpretations """ log_interpretation = dict(job_id=job_id) self._log_interpretations.append(log_interpretation) while True: # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus # noqa job_result = self._api_job_get(job_id) job_state = job_result['status']['state'] log.info('%s => %s' % (job_id, job_state)) # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State # noqa if job_state in _DATAPROC_JOB_STATES_ACTIVE: self._wait_for_api('job completion') continue # we're done, will return at the end of this elif job_state == 'DONE': break raise StepFailedException(step_num=step_num, num_steps=num_steps) def _intermediate_output_uri(self, step_num): # TODO: davidmarin @ mtai: noticed this is 1-indexed and uses # %05d instead of %04d. Any particular reason? return 'hdfs:///tmp/mrjob/%s/step-output/%05d/' % (self._job_key, step_num + 1) def counters(self): # TODO - mtai @ davidmarin - Counters are currently always empty as we # are not processing task logs return [ _pick_counters(log_interpretation) for log_interpretation in self._log_interpretations ] ### Bootstrapping ### def get_hadoop_version(self): if self._hadoop_version is None: self._store_cluster_info() return self._hadoop_version def get_image_version(self): """Get the version that our cluster is running. """ if self._image_version is None: self._store_cluster_info() return self._image_version def _store_cluster_info(self): """Set self._image_version and self._hadoop_version.""" if not self._cluster_id: raise AssertionError('cluster has not yet been created') cluster = self._api_cluster_get(self._cluster_id) self._image_version = ( cluster['config']['softwareConfig']['imageVersion']) # protect against new versions, including patch versions # we didn't explicitly request. See #1428 self._hadoop_version = map_version(self._image_version, _DATAPROC_IMAGE_TO_HADOOP_VERSION) ### Bootstrapping ### def _create_master_bootstrap_script_if_needed(self): """Helper for :py:meth:`_add_bootstrap_files_for_upload`. Create the master bootstrap script and write it into our local temp directory. Set self._master_bootstrap_script_path. This will do nothing if there are no bootstrap scripts or commands, or if it has already been called.""" if self._master_bootstrap_script_path: return # don't bother if we're not starting a cluster if self._cluster_id: return # Also don't bother if we're not bootstrapping if not (self._bootstrap or self._bootstrap_mrjob()): return # create mrjob.zip if we need it, and add commands to install it mrjob_bootstrap = [] if self._bootstrap_mrjob(): assert self._mrjob_zip_path path_dict = { 'type': 'file', 'name': None, 'path': self._mrjob_zip_path } self._bootstrap_dir_mgr.add(**path_dict) # find out where python keeps its libraries mrjob_bootstrap.append([ "__mrjob_PYTHON_LIB=$(%s -c " "'from distutils.sysconfig import get_python_lib;" " print(get_python_lib())')" % cmd_line(self._python_bin()) ]) # unzip mrjob.zip mrjob_bootstrap.append( ['sudo unzip ', path_dict, ' -d $__mrjob_PYTHON_LIB']) # re-compile pyc files now, since mappers/reducers can't # write to this directory. 
Don't fail if there is extra # un-compileable crud in the tarball (this would matter if # sh_bin were 'sh -e') mrjob_bootstrap.append([ 'sudo %s -m compileall -f $__mrjob_PYTHON_LIB/mrjob && true' % cmd_line(self._python_bin()) ]) # we call the script b.py because there's a character limit on # bootstrap script names (or there was at one time, anyway) path = os.path.join(self._get_local_tmp_dir(), 'b.py') log.info('writing master bootstrap script to %s' % path) contents = self._master_bootstrap_script_content(self._bootstrap + mrjob_bootstrap) for line in contents: log.debug('BOOTSTRAP: ' + line.rstrip('\r\n')) with open(path, 'w') as f: for line in contents: f.write(line) self._master_bootstrap_script_path = path def _bootstrap_python(self): """Return a (possibly empty) list of parsed commands (in the same format as returned by parse_setup_cmd())'""" if not self._opts['bootstrap_python']: return [] if PY2: # Python 2 is already installed; install pip and dev packages return [ ['sudo apt-get install -y python-pip python-dev'], ] else: return [ ['sudo apt-get install -y python3 python3-pip python3-dev'], ] def _parse_bootstrap(self): """Parse the *bootstrap* option with :py:func:`mrjob.setup.parse_setup_cmd()`. """ return [parse_setup_cmd(cmd) for cmd in self._opts['bootstrap']] def _master_bootstrap_script_content(self, bootstrap): """Create the contents of the master bootstrap script. """ out = [] def writeln(line=''): out.append(line + '\n') # shebang sh_bin = self._opts['sh_bin'] if not sh_bin[0].startswith('/'): sh_bin = ['/usr/bin/env'] + sh_bin writeln('#!' + cmd_line(sh_bin)) writeln() # store $PWD writeln('# store $PWD') writeln('__mrjob_PWD=$PWD') # FYI - mtai @ davidmarin - begin section, mtai had to add this # otherwise initialization didn't work # // kept blowing up in all subsequent invocations of $__mrjob_PWD/ writeln('if [ $__mrjob_PWD = "/" ]; then') writeln(' __mrjob_PWD=""') writeln('fi') # FYI - mtai @ davidmarin - end section writeln() # download files writeln('# download files and mark them executable') cp_to_local = 'hadoop fs -copyToLocal' for name, path in sorted( self._bootstrap_dir_mgr.name_to_path('file').items()): uri = self._upload_mgr.uri(path) output_string = '%s %s $__mrjob_PWD/%s' % ( cp_to_local, pipes.quote(uri), pipes.quote(name)) writeln(output_string) # make everything executable, like Hadoop Distributed Cache writeln('chmod a+x $__mrjob_PWD/%s' % pipes.quote(name)) writeln() # run bootstrap commands writeln('# bootstrap commands') for cmd in bootstrap: # reconstruct the command line, substituting $__mrjob_PWD/<name> # for path dicts line = '' for token in cmd: if isinstance(token, dict): # it's a path dictionary line += '$__mrjob_PWD/' line += pipes.quote(self._bootstrap_dir_mgr.name(**token)) else: # it's raw script line += token writeln(line) writeln() return out def get_cluster_id(self): return self._cluster_id def _cluster_create_args(self): gcs_init_script_uris = [] if self._master_bootstrap_script_path: gcs_init_script_uris.append( self._upload_mgr.uri(self._master_bootstrap_script_path)) # always add idle termination script # add it last, so that we don't count bootstrapping as idle time gcs_init_script_uris.append( self._upload_mgr.uri(_MAX_HOURS_IDLE_BOOTSTRAP_ACTION_PATH)) # NOTE - Cluster initializationActions can only take scripts with no # script args, so the auto-term script receives 'mrjob-max-secs-idle' # via metadata instead of as an arg cluster_metadata = dict() cluster_metadata['mrjob-version'] = mrjob.__version__ 
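        # (illustrative: with max_hours_idle=0.5, the line below stores
        # 'mrjob-max-secs-idle': '1800' in the cluster metadata)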
cluster_metadata['mrjob-max-secs-idle'] = str( int(self._opts['max_hours_idle'] * 3600)) cluster_config = dict(gceClusterConfig=dict( zoneUri=_gcp_zone_uri(project=self._gcp_project, zone=self._gce_zone), serviceAccountScopes=_DEFAULT_GCE_SERVICE_ACCOUNT_SCOPES, metadata=cluster_metadata), initializationActions=[ dict(executableFile=init_script_uri) for init_script_uri in gcs_init_script_uris ]) # Task tracker master_conf = _gcp_instance_group_config( project=self._gcp_project, zone=self._gce_zone, count=1, instance_type=self._opts['master_instance_type']) # Compute + storage worker_conf = _gcp_instance_group_config( project=self._gcp_project, zone=self._gce_zone, count=self._opts['num_core_instances'], instance_type=self._opts['core_instance_type']) # Compute ONLY secondary_worker_conf = _gcp_instance_group_config( project=self._gcp_project, zone=self._gce_zone, count=self._opts['num_task_instances'], instance_type=self._opts['task_instance_type'], is_preemptible=True) cluster_config['masterConfig'] = master_conf cluster_config['workerConfig'] = worker_conf if self._opts['num_task_instances']: cluster_config['secondaryWorkerConfig'] = secondary_worker_conf # See - https://cloud.google.com/dataproc/dataproc-versions if self._opts['image_version']: cluster_config['softwareConfig'] = dict( imageVersion=self._opts['image_version']) return dict(projectId=self._gcp_project, clusterName=self._cluster_id, config=cluster_config) ### Dataproc-specific Stuff ### def _api_cluster_get(self, cluster_id): return self.api_client.clusters().get( projectId=self._gcp_project, region=_DATAPROC_API_REGION, clusterName=cluster_id).execute() def _api_cluster_create(self, cluster_data): # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/create # noqa # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/get # noqa return self.api_client.clusters().create(projectId=self._gcp_project, region=_DATAPROC_API_REGION, body=cluster_data).execute() def _api_cluster_delete(self, cluster_id): return self.api_client.clusters().delete( projectId=self._gcp_project, region=_DATAPROC_API_REGION, clusterName=cluster_id).execute() def _api_job_list(self, cluster_name=None, state_matcher=None): # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/list#JobStateMatcher # noqa list_kwargs = dict( projectId=self._gcp_project, region=_DATAPROC_API_REGION, ) if cluster_name: list_kwargs['clusterName'] = cluster_name if state_matcher: list_kwargs['jobStateMatcher'] = state_matcher list_request = self.api_client.jobs().list(**list_kwargs) while list_request: try: resp = list_request.execute() except google_errors.HttpError as e: if e.resp.status == 404: return raise for current_item in resp['items']: yield current_item list_request = self.api_client.jobs().list_next(list_request, resp) def _api_job_get(self, job_id): return self.api_client.jobs().get(projectId=self._gcp_project, region=_DATAPROC_API_REGION, jobId=job_id).execute() def _api_job_cancel(self, job_id): return self.api_client.jobs().cancel(projectId=self._gcp_project, region=_DATAPROC_API_REGION, jobId=job_id).execute() def _api_job_delete(self, job_id): return self.api_client.jobs().delete(projectId=self._gcp_project, region=_DATAPROC_API_REGION, jobId=job_id).execute() def _api_job_submit_hadoop(self, step_name, hadoop_job): # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/submit # noqa # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob 
# noqa # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobReference # noqa job_data = dict(reference=dict(projectId=self._gcp_project, jobId=step_name), placement=dict(clusterName=self._cluster_id), hadoopJob=hadoop_job) jobs_submit_kwargs = dict(projectId=self._gcp_project, region=_DATAPROC_API_REGION, body=dict(job=job_data)) return self.api_client.jobs().submit(**jobs_submit_kwargs).execute()
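# For orientation: the request body assembled above and handed to
# clusters().create() ends up shaped roughly like the literal below.
# This is a hand-written sketch, not captured API traffic; the project,
# zone, bucket, and instance settings are hypothetical, and the
# instance-group configs built by _gcp_instance_group_config() are
# abbreviated.
example_cluster_data = {
    'projectId': 'my-project',
    'clusterName': 'mrjob-us-west1-a-0123456789abcdef',
    'config': {
        'gceClusterConfig': {
            'zoneUri': ('https://www.googleapis.com/compute/v1/'
                        'projects/my-project/zones/us-west1-a'),
            'serviceAccountScopes': [
                'https://www.googleapis.com/auth/cloud-platform'],
            'metadata': {'mrjob-max-secs-idle': '600'},
        },
        'initializationActions': [
            {'executableFile': 'gs://my-bucket/tmp/b.sh'},
        ],
        'masterConfig': {'numInstances': 1},   # abbreviated
        'workerConfig': {'numInstances': 2},   # abbreviated
        'softwareConfig': {'imageVersion': '1.0'},
    },
}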
def __init__(self, **kwargs): """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(DataprocJobRunner, self).__init__(**kwargs) # check for library support if google is None: raise ImportError( 'You must install google-cloud and google-cloud-dataproc' ' to connect to Dataproc') # Dataproc requires a master and >= 2 core instances # num_core_instances refers ONLY to number of CORE instances and does # NOT include the required 1 instance for master # In other words, minimum cluster size is 3 machines, 1 master and 2 # "num_core_instances" workers if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS: raise DataprocException('Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS) if (self._opts['core_instance_type'] != self._opts['task_instance_type']): raise DataprocException( 'Dataproc v1 expects core/task instance types to be identical') # load credentials and project ID self._credentials, auth_project_id = google.auth.default( scopes=_DEFAULT_GCE_SERVICE_ACCOUNT_SCOPES) self._project_id = self._opts['project_id'] or auth_project_id if not self._project_id: raise DataprocException( 'project_id must be set. Use --project_id or' ' set $GOOGLE_CLOUD_PROJECT') self._fix_zone_and_region_opts() # cluster_id can be None here self._cluster_id = self._opts['cluster_id'] self._api_client = None self._gcs_fs = None self._fs = None # BEGIN - setup directories base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir']) self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir) # use job key to make a unique tmp dir self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/' # pick/validate output dir if self._output_dir: self._output_dir = _check_and_fix_fs_dir(self._output_dir) else: self._output_dir = self._job_tmpdir + 'output/' # END - setup directories # manage local files that we want to upload to GCS. We'll add them # to this manager just before we need them. fs_files_dir = self._job_tmpdir + 'files/' self._upload_mgr = UploadDirManager(fs_files_dir) # when did our particular task start? self._dataproc_job_start = None # init hadoop, ami version caches self._image_version = None self._hadoop_version = None # This will be filled by _run_steps() # NOTE - log_interpretations will be empty except job_id until we # parse task logs self._log_interpretations = []
def test_empty(self):
    sd = UploadDirManager('hdfs:///')
    self.assertEqual(sd.path_to_uri(), {})
class DataprocJobRunner(HadoopInTheCloudJobRunner): """Runs an :py:class:`~mrjob.job.MRJob` on Google Cloud Dataproc. Invoked when you run your job with ``-r dataproc``. :py:class:`DataprocJobRunner` runs your job in a Dataproc cluster, which is basically a temporary Hadoop cluster. Input, support, and jar files can be either local or on GCS; use ``gs://...`` URLs to refer to files on GCS. This class has some useful utilities for talking directly to GCS and Dataproc, so you may find it useful to instantiate it without a script:: from mrjob.dataproc import DataprocJobRunner ... """ alias = 'dataproc' OPT_NAMES = HadoopInTheCloudJobRunner.OPT_NAMES | { 'gcp_project', } def __init__(self, **kwargs): """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(DataprocJobRunner, self).__init__(**kwargs) # Dataproc requires a master and >= 2 core instances # num_core_instances refers ONLY to number of CORE instances and does # NOT include the required 1 instance for master # In other words, minimum cluster size is 3 machines, 1 master and 2 # "num_core_instances" workers if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS: raise DataprocException( 'Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS) if (self._opts['core_instance_type'] != self._opts['task_instance_type']): raise DataprocException( 'Dataproc v1 expects core/task instance types to be identical') # Lazy-load gcloud config as needed - invocations fail in PyCharm # debugging self._gcloud_config = None # Google Cloud Platform - project self._gcp_project = ( self._opts['gcp_project'] or self.gcloud_config()['core.project']) # Google Compute Engine - Region / Zone self._gce_region = ( self._opts['region'] or self.gcloud_config()['compute.region']) self._gce_zone = ( self._opts['zone'] or self.gcloud_config()['compute.zone']) # cluster_id can be None here self._cluster_id = self._opts['cluster_id'] self._api_client = None self._gcs_fs = None self._fs = None # BEGIN - setup directories base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir']) self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir) # use job key to make a unique tmp dir self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/' # pick/validate output dir if self._output_dir: self._output_dir = _check_and_fix_fs_dir(self._output_dir) else: self._output_dir = self._job_tmpdir + 'output/' # END - setup directories # manage local files that we want to upload to GCS. We'll add them # to this manager just before we need them. fs_files_dir = self._job_tmpdir + 'files/' self._upload_mgr = UploadDirManager(fs_files_dir) # when did our particular task start? 
self._dataproc_job_start = None # init hadoop, ami version caches self._image_version = None self._hadoop_version = None # This will be filled by _run_steps() # NOTE - log_interpretations will be empty except job_id until we # parse task logs self._log_interpretations = [] def _default_opts(self): return combine_dicts( super(DataprocJobRunner, self)._default_opts(), dict( bootstrap_python=True, check_cluster_every=_DEFAULT_CHECK_CLUSTER_EVERY, cleanup=['CLUSTER', 'JOB', 'LOCAL_TMP'], cloud_fs_sync_secs=_DEFAULT_CLOUD_FS_SYNC_SECS, image_version=_DEFAULT_IMAGE_VERSION, instance_type=_DEFAULT_INSTANCE_TYPE, master_instance_type=_DEFAULT_INSTANCE_TYPE, num_core_instances=_DATAPROC_MIN_WORKERS, num_task_instances=0, sh_bin=['/bin/sh', '-ex'], ) ) def gcloud_config(self): """Lazy load gcloud SDK configs""" if not self._gcloud_config: self._gcloud_config = _read_gcloud_config() return self._gcloud_config @property def api_client(self): if not self._api_client: credentials = GoogleCredentials.get_application_default() api_client = discovery.build( _DATAPROC_API_ENDPOINT, _DATAPROC_API_VERSION, credentials=credentials) self._api_client = api_client.projects().regions() return self._api_client @property def fs(self): """:py:class:`~mrjob.fs.base.Filesystem` object for SSH, S3, GCS, and the local filesystem. """ if self._fs is not None: return self._fs self._gcs_fs = GCSFilesystem() self._fs = CompositeFilesystem(self._gcs_fs, LocalFilesystem()) return self._fs def _get_tmpdir(self, given_tmpdir): """Helper for _fix_tmpdir""" if given_tmpdir: return given_tmpdir mrjob_buckets = self.fs.list_buckets( self._gcp_project, prefix='mrjob-') # Loop over buckets until we find one that matches region # NOTE - because this is a tmpdir, we look for a GCS bucket in the # same GCE region chosen_bucket_name = None gce_lower_location = self._gce_region.lower() for tmp_bucket in mrjob_buckets: tmp_bucket_name = tmp_bucket['name'] # NOTE - GCP ambiguous Behavior - Bucket location is being # returned as UPPERCASE, ticket filed as of Apr 23, 2016 as docs # suggest lowercase lower_location = tmp_bucket['location'].lower() if lower_location == gce_lower_location: # Regions are both specified and match log.info("using existing temp bucket %s" % tmp_bucket_name) chosen_bucket_name = tmp_bucket_name break # Example default - "mrjob-us-central1-RANDOMHEX" if not chosen_bucket_name: chosen_bucket_name = '-'.join( ['mrjob', gce_lower_location, random_identifier()]) return 'gs://%s/tmp/' % chosen_bucket_name def _run(self): self._launch() self._run_steps() def _launch(self): self._prepare_for_launch() self._launch_cluster() def _prepare_for_launch(self): self._check_output_not_exists() self._create_setup_wrapper_script() self._add_bootstrap_files_for_upload() self._add_job_files_for_upload() self._upload_local_files_to_fs() def _check_output_not_exists(self): """Verify the output path does not already exist. This avoids provisioning a cluster only to have Hadoop refuse to launch. """ if self.fs.exists(self._output_dir): raise IOError( 'Output path %s already exists!' % (self._output_dir,)) def _add_bootstrap_files_for_upload(self): """Add files needed by the bootstrap script to self._upload_mgr. Tar up mrjob if bootstrap_mrjob is True. Create the master bootstrap script if necessary. 
""" # lazily create mrjob.zip if self._bootstrap_mrjob(): self._create_mrjob_zip() self._bootstrap_dir_mgr.add('file', self._mrjob_zip_path) # all other files needed by the script are already in # _bootstrap_dir_mgr for path in self._bootstrap_dir_mgr.paths(): self._upload_mgr.add(path) # now that we know where the above files live, we can create # the master bootstrap script self._create_master_bootstrap_script_if_needed() if self._master_bootstrap_script_path: self._upload_mgr.add(self._master_bootstrap_script_path) self._upload_mgr.add(_MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH) def _add_job_files_for_upload(self): """Add files needed for running the job (setup and input) to self._upload_mgr.""" for path in self._get_input_paths(): self._upload_mgr.add(path) for path in self._working_dir_mgr.paths(): self._upload_mgr.add(path) # TODO - mtai @ davidmarin - hadoop_streaming_jar is currently ignored, # see _HADOOP_STREAMING_JAR_URI # if self._opts['hadoop_streaming_jar']: # self._upload_mgr.add(self._opts['hadoop_streaming_jar']) for step in self._get_steps(): if step.get('jar'): self._upload_mgr.add(step['jar']) def _upload_local_files_to_fs(self): """Copy local files tracked by self._upload_mgr to FS.""" bucket_name, _ = parse_gcs_uri(self._job_tmpdir) self._create_fs_tmp_bucket(bucket_name) log.info('Copying non-input files into %s' % self._upload_mgr.prefix) for path, gcs_uri in self._upload_mgr.path_to_uri().items(): log.debug('uploading %s -> %s' % (path, gcs_uri)) # TODO - mtai @ davidmarin - Implement put function for other FSs self.fs.put(path, gcs_uri) self._wait_for_fs_sync() def _create_fs_tmp_bucket(self, bucket_name, location=None): """Create a temp bucket if missing Tie the temporary bucket to the same region as the GCE job and set a 28-day TTL """ # Return early if our bucket already exists try: self.fs.get_bucket(bucket_name) return except google_errors.HttpError as e: if not e.resp.status == 404: raise log.info('creating FS bucket %r' % bucket_name) location = location or self._gce_region # NOTE - By default, we create a bucket in the same GCE region as our # job (tmp buckets ONLY) # https://cloud.google.com/storage/docs/bucket-locations self.fs.create_bucket( self._gcp_project, bucket_name, location=location, object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS) self._wait_for_fs_sync() ### Running the job ### def cleanup(self, mode=None): super(DataprocJobRunner, self).cleanup(mode=mode) # stop the cluster if it belongs to us (it may have stopped on its # own already, but that's fine) if self._cluster_id and not self._opts['cluster_id']: self._cleanup_cluster() def _cleanup_cloud_tmp(self): # delete all the files we created if not self._job_tmpdir: return try: log.info('Removing all files in %s' % self._job_tmpdir) self.fs.rm(self._job_tmpdir) self._job_tmpdir = None except Exception as e: log.exception(e) # TODO - mtai @ davidmarin - Re-enable log support and supporting cleanup def _cleanup_logs(self): super(DataprocJobRunner, self)._cleanup_logs() def _cleanup_job(self): job_prefix = self._dataproc_job_prefix() for current_job in self._api_job_list( cluster_name=self._cluster_id, state_matcher='ACTIVE'): # Kill all active jobs with the same job_prefix as this job current_job_id = current_job['reference']['jobId'] if not current_job_id.startswith(job_prefix): continue self._api_job_cancel(current_job_id) self._wait_for_api('job cancellation') def _cleanup_cluster(self): if not self._cluster_id: # If we don't have a cluster, then we can't terminate it. 
return try: log.info("Attempting to terminate cluster") self._api_cluster_delete(self._cluster_id) except Exception as e: log.exception(e) return log.info('cluster %s successfully terminated' % self._cluster_id) def _wait_for_api(self, msg): _wait_for(msg, self._opts['check_cluster_every']) def _wait_for_fs_sync(self): """Sleep for a little while, to give FS a chance to sync up. """ _wait_for('GCS sync (eventual consistency)', self._opts['cloud_fs_sync_secs']) def _build_dataproc_hadoop_job(self, step_num): """This function creates a "HadoopJob" to be passed to self._api_job_submit_hadoop :param step_num: :return: output_hadoop_job """ # Reference: https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob # noqa args = list() file_uris = list() archive_uris = list() properties = dict() step = self._get_step(step_num) assert step['type'] in ('streaming', 'jar'), ( 'Bad step type: %r' % (step['type'],)) # TODO - mtai @ davidmarin - Might be trivial to support jar running, # see "mainJarFileUri" of variable "output_hadoop_job" in this function # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob # noqa assert step['type'] == 'streaming', 'Jar not implemented' main_jar_uri = _HADOOP_STREAMING_JAR_URI # TODO - mtai @ davidmarin - Not clear if we should move _upload_args # to file_uris, currently works fine as-is # TODO - dmarin @ mtai - Probably a little safer to do the API's way, # assuming the API supports distributed cache syntax (so we can pick # the names of the uploaded files). args.extend(self._upload_args()) args.extend(self._hadoop_args_for_step(step_num)) mapper, combiner, reducer = (self._hadoop_streaming_commands(step_num)) if mapper: args += ['-mapper', mapper] if combiner: args += ['-combiner', combiner] if reducer: args += ['-reducer', reducer] for current_input_uri in self._step_input_uris(step_num): args += ['-input', current_input_uri] args += ['-output', self._step_output_uri(step_num)] # TODO - mtai @ davidmarin - Add back support to specify a different # mainJarFileURI output_hadoop_job = dict( args=args, fileUris=file_uris, archiveUris=archive_uris, properties=properties, mainJarFileUri=main_jar_uri ) return output_hadoop_job def _launch_cluster(self): """Create an empty cluster on Dataproc, and set self._cluster_id to its ID.""" bucket_name, _ = parse_gcs_uri(self._job_tmpdir) self._create_fs_tmp_bucket(bucket_name) # clusterName must be a match of # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).' 
# as documented in an API error message # (not currently documented in the Dataproc docs) if not self._cluster_id: self._cluster_id = '-'.join( ['mrjob', self._gce_zone.lower(), random_identifier()]) # Create the cluster if it's missing, otherwise join an existing one try: self._api_cluster_get(self._cluster_id) log.info('Adding job to existing cluster - %s' % self._cluster_id) except google_errors.HttpError as e: if not e.resp.status == 404: raise log.info( 'Creating Dataproc Hadoop cluster - %s' % self._cluster_id) cluster_data = self._cluster_create_kwargs() self._api_cluster_create(cluster_data) self._wait_for_cluster_ready(self._cluster_id) # keep track of when we launched our job self._dataproc_job_start = time.time() return self._cluster_id def _wait_for_cluster_ready(self, cluster_id): # See https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters#State # noqa cluster_state = None # Poll until cluster is ready while cluster_state not in _DATAPROC_CLUSTER_STATES_READY: result_describe = self.api_client.clusters().get( projectId=self._gcp_project, region=_DATAPROC_API_REGION, clusterName=cluster_id).execute() cluster_state = result_describe['status']['state'] if cluster_state in _DATAPROC_CLUSTER_STATES_ERROR: raise DataprocException(result_describe) self._wait_for_api('cluster to accept jobs') assert cluster_state in _DATAPROC_CLUSTER_STATES_READY log.info("Cluster %s ready", cluster_id) return cluster_id def _dataproc_job_prefix(self): return _cleanse_gcp_job_id(self._job_key) def _run_steps(self): """Wait for every step of the job to complete, one by one.""" total_steps = self._num_steps() # define out steps for step_num in range(total_steps): job_id = self._launch_step(step_num) self._wait_for_step_to_complete( job_id, step_num=step_num, num_steps=total_steps) log.info('Completed Dataproc Hadoop Job - %s', job_id) # After all steps completed, wait for the last output (which is # usually written to GCS) to sync self._wait_for_fs_sync() def _launch_step(self, step_num): # Build each step hadoop_job = self._build_dataproc_hadoop_job(step_num) # Clean-up step name step_name = '%s---step-%05d-of-%05d' % ( self._dataproc_job_prefix(), step_num + 1, self._num_steps()) # Submit it log.info('Submitting Dataproc Hadoop Job - %s', step_name) result = self._api_job_submit_hadoop(step_name, hadoop_job) log.info('Submitted Dataproc Hadoop Job - %s', step_name) job_id = result['reference']['jobId'] assert job_id == step_name return job_id def _wait_for_step_to_complete( self, job_id, step_num=None, num_steps=None): """Helper for _wait_for_step_to_complete(). Wait for step with the given ID to complete, and fetch counters. If it fails, attempt to diagnose the error, and raise an exception. 
This also adds an item to self._log_interpretations """ log_interpretation = dict(job_id=job_id) self._log_interpretations.append(log_interpretation) while True: # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus # noqa job_result = self._api_job_get(job_id) job_state = job_result['status']['state'] log.info('%s => %s' % (job_id, job_state)) # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State # noqa if job_state in _DATAPROC_JOB_STATES_ACTIVE: self._wait_for_api('job completion') continue # we're done, will return at the end of this elif job_state == 'DONE': break raise StepFailedException(step_num=step_num, num_steps=num_steps) def _default_step_output_dir(self): # put intermediate data in HDFS return 'hdfs:///tmp/mrjob/%s/step-output' % self._job_key def counters(self): # TODO - mtai @ davidmarin - Counters are currently always empty as we # are not processing task logs return [_pick_counters(log_interpretation) for log_interpretation in self._log_interpretations] ### Bootstrapping ### def get_hadoop_version(self): if self._hadoop_version is None: self._store_cluster_info() return self._hadoop_version def get_image_version(self): """Get the version that our cluster is running. """ if self._image_version is None: self._store_cluster_info() return self._image_version def _store_cluster_info(self): """Set self._image_version and self._hadoop_version.""" if not self._cluster_id: raise AssertionError('cluster has not yet been created') cluster = self._api_cluster_get(self._cluster_id) self._image_version = ( cluster['config']['softwareConfig']['imageVersion']) # protect against new versions, including patch versions # we didn't explicitly request. See #1428 self._hadoop_version = map_version( self._image_version, _DATAPROC_IMAGE_TO_HADOOP_VERSION) ### Bootstrapping ### def _bootstrap_python(self): """Return a (possibly empty) list of parsed commands (in the same format as returned by parse_setup_cmd())'""" if not self._opts['bootstrap_python']: return [] if PY2: # Python 2 is already installed; install pip and dev packages return [ ['sudo apt-get install -y python-pip python-dev'], ] else: return [ ['sudo apt-get install -y python3 python3-pip python3-dev'], ] def get_cluster_id(self): return self._cluster_id def _cluster_create_kwargs(self): gcs_init_script_uris = [] if self._master_bootstrap_script_path: gcs_init_script_uris.append( self._upload_mgr.uri(self._master_bootstrap_script_path)) # always add idle termination script # add it last, so that we don't count bootstrapping as idle time gcs_init_script_uris.append( self._upload_mgr.uri(_MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH)) # NOTE - Cluster initializationActions can only take scripts with no # script args, so the auto-term script receives 'mrjob-max-secs-idle' # via metadata instead of as an arg cluster_metadata = dict() cluster_metadata['mrjob-version'] = mrjob.__version__ cluster_metadata['mrjob-max-secs-idle'] = str(int( self._opts['max_mins_idle'] * 60)) cluster_config = dict( gceClusterConfig=dict( zoneUri=_gcp_zone_uri( project=self._gcp_project, zone=self._gce_zone), serviceAccountScopes=_DEFAULT_GCE_SERVICE_ACCOUNT_SCOPES, metadata=cluster_metadata ), initializationActions=[ dict(executableFile=init_script_uri) for init_script_uri in gcs_init_script_uris ] ) # Task tracker master_conf = _gcp_instance_group_config( project=self._gcp_project, zone=self._gce_zone, count=1, instance_type=self._opts['master_instance_type'] ) # Compute + storage worker_conf = 
_gcp_instance_group_config( project=self._gcp_project, zone=self._gce_zone, count=self._opts['num_core_instances'], instance_type=self._opts['core_instance_type'] ) # Compute ONLY secondary_worker_conf = _gcp_instance_group_config( project=self._gcp_project, zone=self._gce_zone, count=self._opts['num_task_instances'], instance_type=self._opts['task_instance_type'], is_preemptible=True ) cluster_config['masterConfig'] = master_conf cluster_config['workerConfig'] = worker_conf if self._opts['num_task_instances']: cluster_config['secondaryWorkerConfig'] = secondary_worker_conf # See - https://cloud.google.com/dataproc/dataproc-versions if self._opts['image_version']: cluster_config['softwareConfig'] = dict( imageVersion=self._opts['image_version']) kwargs = dict(projectId=self._gcp_project, clusterName=self._cluster_id, config=cluster_config) return self._add_extra_cluster_params(kwargs) ### Dataproc-specific Stuff ### def _api_cluster_get(self, cluster_id): return self.api_client.clusters().get( projectId=self._gcp_project, region=_DATAPROC_API_REGION, clusterName=cluster_id ).execute() def _api_cluster_create(self, cluster_data): # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/create # noqa # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/get # noqa return self.api_client.clusters().create( projectId=self._gcp_project, region=_DATAPROC_API_REGION, body=cluster_data ).execute() def _api_cluster_delete(self, cluster_id): return self.api_client.clusters().delete( projectId=self._gcp_project, region=_DATAPROC_API_REGION, clusterName=cluster_id ).execute() def _api_job_list(self, cluster_name=None, state_matcher=None): # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/list#JobStateMatcher # noqa list_kwargs = dict( projectId=self._gcp_project, region=_DATAPROC_API_REGION, ) if cluster_name: list_kwargs['clusterName'] = cluster_name if state_matcher: list_kwargs['jobStateMatcher'] = state_matcher list_request = self.api_client.jobs().list(**list_kwargs) while list_request: try: resp = list_request.execute() except google_errors.HttpError as e: if e.resp.status == 404: return raise for current_item in resp['items']: yield current_item list_request = self.api_client.jobs().list_next(list_request, resp) def _api_job_get(self, job_id): return self.api_client.jobs().get( projectId=self._gcp_project, region=_DATAPROC_API_REGION, jobId=job_id ).execute() def _api_job_cancel(self, job_id): return self.api_client.jobs().cancel( projectId=self._gcp_project, region=_DATAPROC_API_REGION, jobId=job_id ).execute() def _api_job_delete(self, job_id): return self.api_client.jobs().delete( projectId=self._gcp_project, region=_DATAPROC_API_REGION, jobId=job_id ).execute() def _api_job_submit_hadoop(self, step_name, hadoop_job): # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/submit # noqa # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob # noqa # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobReference # noqa job_data = dict( reference=dict(projectId=self._gcp_project, jobId=step_name), placement=dict(clusterName=self._cluster_id), hadoopJob=hadoop_job ) jobs_submit_kwargs = dict( projectId=self._gcp_project, region=_DATAPROC_API_REGION, body=dict(job=job_data) ) return self.api_client.jobs().submit(**jobs_submit_kwargs).execute()
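# The class docstring above trails off mid-example. A minimal sketch of
# the "no script" usage it alludes to might look like the following; it
# assumes working Google Cloud credentials and mrjob installed, and the
# bucket path is hypothetical.
from mrjob.dataproc import DataprocJobRunner

runner = DataprocJobRunner(conf_paths=[])  # skip mrjob.conf lookup
for uri in runner.fs.ls('gs://my-mrjob-bucket/tmp/'):
    print(uri)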
def __init__(self, **kwargs): """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(DataprocJobRunner, self).__init__(**kwargs) # check for library support if google is None: raise ImportError('You must install google-cloud-logging and ' 'google-cloud-storage to connect to Dataproc') # Dataproc requires a master and >= 2 core instances # num_core_instances refers ONLY to number of CORE instances and does # NOT include the required 1 instance for master # In other words, minimum cluster size is 3 machines, 1 master and 2 # "num_core_instances" workers if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS: raise DataprocException('Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS) if (self._opts['core_instance_type'] != self._opts['task_instance_type']): raise DataprocException( 'Dataproc v1 expects core/task instance types to be identical') # see #1820 if self._opts['image_id']: log.warning('mrjob does not yet support custom machine images' ' on Dataproc') # load credentials and project ID self._credentials, auth_project_id = google.auth.default( scopes=[_FULL_SCOPE]) # needed for $GOOGLE_APPLICATION_CREDENTIALS self._project_id = self._opts['project_id'] or auth_project_id if not self._project_id: raise DataprocException( 'project_id must be set. Use --project_id or' ' set $GOOGLE_CLOUD_PROJECT') self._fix_zone_and_region_opts() if self._opts['service_account_scopes']: self._opts['service_account_scopes'] = [ _fully_qualify_scope_uri(s) for s in self._opts['service_account_scopes'] ] # cluster_id can be None here self._cluster_id = self._opts['cluster_id'] self._api_client = None self._gcs_fs = None self._fs = None # BEGIN - setup directories base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir']) self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir) # use job key to make a unique tmp dir self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/' # pick/validate output dir if self._output_dir: self._output_dir = _check_and_fix_fs_dir(self._output_dir) else: self._output_dir = self._job_tmpdir + 'output/' # END - setup directories # manage local files that we want to upload to GCS. We'll add them # to this manager just before we need them. fs_files_dir = self._job_tmpdir + 'files/' self._upload_mgr = UploadDirManager(fs_files_dir) # when did our particular task start? self._dataproc_job_start = None # init hadoop, ami version caches self._image_version = None self._hadoop_version = None # map driver_output_uri to a dict with the keys: # log_uri: uri of file we're reading from # pos: position in file # buffer: bytes read from file already self._driver_output_state = {} # This will be filled by _run_steps() # NOTE - log_interpretations will be empty except job_id until we # parse task logs self._log_interpretations = []
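# The project-ID fallback buried in the constructor above is worth
# calling out: the explicit option wins, then the project bound to the
# application-default credentials. A standalone sketch of that
# resolution order (google.auth is the real library; the function name
# and `opt_project_id` parameter are made up for illustration):
import google.auth

def resolve_project_id(opt_project_id=None):
    credentials, auth_project_id = google.auth.default()
    project_id = opt_project_id or auth_project_id
    if not project_id:
        raise ValueError('project_id must be set. Use --project_id or'
                         ' set $GOOGLE_CLOUD_PROJECT')
    return credentials, project_id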
class HadoopJobRunner(MRJobRunner): """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster. Invoked when you run your job with ``-r hadoop``. Input and support files can be either local or on HDFS; use ``hdfs://...`` URLs to refer to files on HDFS. """ alias = 'hadoop' OPTION_STORE_CLASS = HadoopRunnerOptionStore def __init__(self, **kwargs): """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(HadoopJobRunner, self).__init__(**kwargs) self._hdfs_tmp_dir = fully_qualify_hdfs_path( posixpath.join( self._opts['hdfs_scratch_dir'], self._job_name)) # Keep track of local files to upload to HDFS. We'll add them # to this manager just before we need them. hdfs_files_dir = posixpath.join(self._hdfs_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(hdfs_files_dir) # Set output dir if it wasn't set explicitly self._output_dir = fully_qualify_hdfs_path( self._output_dir or posixpath.join(self._hdfs_tmp_dir, 'output')) self._hadoop_log_dir = hadoop_log_dir(self._opts['hadoop_home']) # Running jobs via hadoop assigns a new timestamp to each job. # Running jobs via mrjob only adds steps. # Store both of these values to enable log parsing. self._job_timestamp = None self._start_step_num = 0 # init hadoop version cache self._hadoop_version = None @property def fs(self): """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local filesystem. """ if self._fs is None: self._fs = CompositeFilesystem( HadoopFilesystem(self._opts['hadoop_bin']), LocalFilesystem()) return self._fs def get_hadoop_version(self): """Invoke the hadoop executable to determine its version""" if not self._hadoop_version: stdout = self.invoke_hadoop(['version'], return_stdout=True) if stdout: first_line = stdout.split('\n')[0] m = HADOOP_VERSION_RE.match(first_line) if m: self._hadoop_version = m.group('version') log.info("Using Hadoop version %s" % self._hadoop_version) return self._hadoop_version self._hadoop_version = '0.20.203' log.info("Unable to determine Hadoop version. Assuming 0.20.203.") return self._hadoop_version def _run(self): self._check_input_exists() self._create_setup_wrapper_script() self._add_job_files_for_upload() self._upload_local_files_to_hdfs() self._run_job_in_hadoop() def _check_input_exists(self): """Make sure all input exists before continuing with our job. """ for path in self._input_paths: if path == '-': continue # STDIN always exists if self._opts['check_input_paths']: if not self.path_exists(path): raise AssertionError( 'Input path %s does not exist!' 
% (path,)) def _add_job_files_for_upload(self): """Add files needed for running the job (setup and input) to self._upload_mgr.""" for path in self._get_input_paths(): self._upload_mgr.add(path) for path in self._working_dir_mgr.paths(): self._upload_mgr.add(path) def _upload_local_files_to_hdfs(self): """Copy files managed by self._upload_mgr to HDFS """ self._mkdir_on_hdfs(self._upload_mgr.prefix) log.info('Copying local files into %s' % self._upload_mgr.prefix) for path, uri in self._upload_mgr.path_to_uri().iteritems(): self._upload_to_hdfs(path, uri) def _mkdir_on_hdfs(self, path): log.debug('Making directory %s on HDFS' % path) self.invoke_hadoop(['fs', '-mkdir', path]) def _upload_to_hdfs(self, path, target): log.debug('Uploading %s -> %s on HDFS' % (path, target)) self.invoke_hadoop(['fs', '-put', path, target]) def _dump_stdin_to_local_file(self): """Dump sys.stdin to a local file, and return the path to it.""" stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN') # prompt user, so they don't think the process has stalled log.info('reading from STDIN') log.debug('dumping stdin to local file %s' % stdin_path) stdin_file = open(stdin_path, 'w') for line in self._stdin: stdin_file.write(line) return stdin_path def _run_job_in_hadoop(self): self._counters = [] for step_num in xrange(self._num_steps()): log.debug('running step %d of %d' % (step_num + 1, self._num_steps())) step_args = self._args_for_step(step_num) log.debug('> %s' % cmd_line(step_args)) # try to use a PTY if it's available try: pid, master_fd = pty.fork() except (AttributeError, OSError): # no PTYs, just use Popen step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE) self._process_stderr_from_streaming(step_proc.stderr) # there shouldn't be much output to STDOUT for line in step_proc.stdout: log.error('STDOUT: ' + line.strip('\n')) returncode = step_proc.wait() else: # we have PTYs if pid == 0: # we are the child process os.execvp(step_args[0], step_args) else: master = os.fdopen(master_fd) # reading from master gives us the subprocess's # stderr and stdout (it's a fake terminal) self._process_stderr_from_streaming(master) _, returncode = os.waitpid(pid, 0) master.close() if returncode == 0: # parsing needs step number for whole job self._fetch_counters([step_num + self._start_step_num]) # printing needs step number relevant to this run of mrjob self.print_counters([step_num + 1]) else: msg = ('Job failed with return code %d: %s' % (returncode, step_args)) log.error(msg) # look for a Python traceback cause = self._find_probable_cause_of_failure( [step_num + self._start_step_num]) if cause: # log cause, and put it in exception cause_msg = [] # lines to log and put in exception cause_msg.append('Probable cause of failure (from %s):' % cause['log_file_uri']) cause_msg.extend(line.strip('\n') for line in cause['lines']) if cause['input_uri']: cause_msg.append('(while reading from %s)' % cause['input_uri']) for line in cause_msg: log.error(line) # add cause_msg to exception message msg += '\n' + '\n'.join(cause_msg) + '\n' raise CalledProcessError(returncode, step_args) def _process_stderr_from_streaming(self, stderr): def treat_eio_as_eof(iter): # on Linux, the PTY gives us a specific IOError when the # when the child process exits, rather than EOF. 
while True: try: yield iter.next() # okay for StopIteration to bubble up except IOError, e: if e.errno == errno.EIO: return else: raise for line in treat_eio_as_eof(stderr): line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2) log.info('HADOOP: ' + line) if 'Streaming Job Failed!' in line: raise Exception(line) # The job identifier is printed to stderr. We only want to parse it # once because we know how many steps we have and just want to know # what Hadoop thinks the first step's number is. m = HADOOP_JOB_TIMESTAMP_RE.match(line) if m and self._job_timestamp is None: self._job_timestamp = m.group('timestamp') self._start_step_num = int(m.group('step_num'))
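# _process_stderr_from_streaming() above keys off HADOOP_JOB_TIMESTAMP_RE
# to learn the job timestamp and first step number. For orientation only:
# old-style Hadoop streaming prints job IDs of the form
# job_<timestamp>_<number>, so a stand-in pattern (not necessarily the
# one mrjob actually uses) could be exercised like this:
import re

EXAMPLE_JOB_TIMESTAMP_RE = re.compile(
    r'.*?job_(?P<timestamp>\d+)_(?P<step_num>\d+)')

m = EXAMPLE_JOB_TIMESTAMP_RE.match(
    '15/04/15 12:34:56 INFO streaming.StreamJob:'
    ' Running job: job_201504151234_0003')
assert m.group('timestamp') == '201504151234'
assert m.group('step_num') == '0003'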
def test_uri(self):
    sd = UploadDirManager("hdfs:///")
    sd.add("foo/bar.py")
    self.assertEqual(sd.uri("foo/bar.py"), "hdfs:///bar.py")
class HadoopJobRunner(MRJobRunner): """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster. Invoked when you run your job with ``-r hadoop``. Input and support files can be either local or on HDFS; use ``hdfs://...`` URLs to refer to files on HDFS. """ alias = 'hadoop' OPTION_STORE_CLASS = HadoopRunnerOptionStore def __init__(self, **kwargs): """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(HadoopJobRunner, self).__init__(**kwargs) self._hdfs_tmp_dir = fully_qualify_hdfs_path( posixpath.join(self._opts['hdfs_scratch_dir'], self._job_name)) # Keep track of local files to upload to HDFS. We'll add them # to this manager just before we need them. hdfs_files_dir = posixpath.join(self._hdfs_tmp_dir, 'files', '') self._upload_mgr = UploadDirManager(hdfs_files_dir) # Set output dir if it wasn't set explicitly self._output_dir = fully_qualify_hdfs_path( self._output_dir or posixpath.join(self._hdfs_tmp_dir, 'output')) self._hadoop_log_dir = hadoop_log_dir(self._opts['hadoop_home']) # Running jobs via hadoop assigns a new timestamp to each job. # Running jobs via mrjob only adds steps. # Store both of these values to enable log parsing. self._job_timestamp = None self._start_step_num = 0 # init hadoop version cache self._hadoop_version = None @property def fs(self): """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local filesystem. """ if self._fs is None: self._fs = CompositeFilesystem( HadoopFilesystem(self._opts['hadoop_bin']), LocalFilesystem()) return self._fs def get_hadoop_version(self): """Invoke the hadoop executable to determine its version""" if not self._hadoop_version: stdout = self.invoke_hadoop(['version'], return_stdout=True) if stdout: first_line = stdout.split('\n')[0] m = HADOOP_VERSION_RE.match(first_line) if m: self._hadoop_version = m.group('version') log.info("Using Hadoop version %s" % self._hadoop_version) return self._hadoop_version self._hadoop_version = '0.20.203' log.info("Unable to determine Hadoop version. Assuming 0.20.203.") return self._hadoop_version def _run(self): self._check_input_exists() self._create_setup_wrapper_script() self._add_job_files_for_upload() self._upload_local_files_to_hdfs() self._run_job_in_hadoop() def _check_input_exists(self): """Make sure all input exists before continuing with our job. """ for path in self._input_paths: if path == '-': continue # STDIN always exists if self._opts['check_input_paths']: if not self.path_exists(path): raise AssertionError('Input path %s does not exist!' 
% (path, )) def _add_job_files_for_upload(self): """Add files needed for running the job (setup and input) to self._upload_mgr.""" for path in self._get_input_paths(): self._upload_mgr.add(path) for path in self._working_dir_mgr.paths(): self._upload_mgr.add(path) def _upload_local_files_to_hdfs(self): """Copy files managed by self._upload_mgr to HDFS """ self._mkdir_on_hdfs(self._upload_mgr.prefix) log.info('Copying local files into %s' % self._upload_mgr.prefix) for path, uri in self._upload_mgr.path_to_uri().items(): self._upload_to_hdfs(path, uri) def _mkdir_on_hdfs(self, path): log.debug('Making directory %s on HDFS' % path) hadoop_version = self.get_hadoop_version() # from version 0.23 / 2.x on, -mkdir needs a -p option to create # parent directories # version == 0.23 if ((mrjob.compat.version_gte(hadoop_version, "0.23") and not mrjob.compat.version_gte(hadoop_version, "0.24"))): self.invoke_hadoop(['fs', '-mkdir', '-p', path]) # version >= 2.0 elif mrjob.compat.version_gte(hadoop_version, "2.0"): self.invoke_hadoop(['fs', '-mkdir', '-p', path]) # for version 0.20, 1.x else: self.invoke_hadoop(['fs', '-mkdir', path]) def _upload_to_hdfs(self, path, target): log.debug('Uploading %s -> %s on HDFS' % (path, target)) self.invoke_hadoop(['fs', '-put', path, target]) def _dump_stdin_to_local_file(self): """Dump sys.stdin to a local file, and return the path to it.""" stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN') # prompt user, so they don't think the process has stalled log.info('reading from STDIN') log.debug('dumping stdin to local file %s' % stdin_path) stdin_file = open(stdin_path, 'wb') for line in self._stdin: stdin_file.write(line) return stdin_path def _run_job_in_hadoop(self): self._counters = [] for step_num in range(self._num_steps()): log.debug('running step %d of %d' % (step_num + 1, self._num_steps())) step_args = self._args_for_step(step_num) log.debug('> %s' % cmd_line(step_args)) # try to use a PTY if it's available try: pid, master_fd = pty.fork() except (AttributeError, OSError): # no PTYs, just use Popen step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE) self._process_stderr_from_streaming(step_proc.stderr) # there shouldn't be much output to STDOUT for line in step_proc.stdout: log.error('STDOUT: ' + to_string(line.strip(b'\n'))) returncode = step_proc.wait() else: # we have PTYs if pid == 0: # we are the child process os.execvp(step_args[0], step_args) else: with os.fdopen(master_fd, 'rb') as master: # reading from master gives us the subprocess's # stderr and stdout (it's a fake terminal) self._process_stderr_from_streaming(master) _, returncode = os.waitpid(pid, 0) if returncode == 0: # parsing needs step number for whole job self._fetch_counters([step_num + self._start_step_num]) # printing needs step number relevant to this run of mrjob self.print_counters([step_num + 1]) else: msg = ('Job failed with return code %d: %s' % (returncode, step_args)) log.error(msg) # look for a Python traceback cause = self._find_probable_cause_of_failure( [step_num + self._start_step_num]) if cause: # log cause, and put it in exception cause_msg = [] # lines to log and put in exception cause_msg.append('Probable cause of failure (from %s):' % cause['log_file_uri']) cause_msg.extend( line.strip('\n') for line in cause['lines']) if cause['input_uri']: cause_msg.append('(while reading from %s)' % cause['input_uri']) for line in cause_msg: log.error(line) # add cause_msg to exception message msg += '\n' + '\n'.join(cause_msg) + '\n' raise 
CalledProcessError(returncode, step_args) def _process_stderr_from_streaming(self, stderr): def treat_eio_as_eof(iter): # on Linux, the PTY gives us a specific IOError when the # when the child process exits, rather than EOF. while True: try: yield next(iter) # okay for StopIteration to bubble up except IOError as e: if e.errno == errno.EIO: return else: raise for line in treat_eio_as_eof(stderr): line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2) log.info('HADOOP: ' + to_string(line)) if b'Streaming Job Failed!' in line: raise Exception(line) # The job identifier is printed to stderr. We only want to parse it # once because we know how many steps we have and just want to know # what Hadoop thinks the first step's number is. m = HADOOP_JOB_TIMESTAMP_RE.match(line) if m and self._job_timestamp is None: self._job_timestamp = m.group('timestamp') self._start_step_num = int(m.group('step_num')) def _args_for_step(self, step_num): step = self._get_step(step_num) if step['type'] == 'streaming': return self._args_for_streaming_step(step_num) elif step['type'] == 'jar': return self._args_for_jar_step(step_num) else: raise AssertionError('Bad step type: %r' % (step['type'], )) def _args_for_streaming_step(self, step_num): version = self.get_hadoop_version() args = (self._opts['hadoop_bin'] + ['jar', self._opts['hadoop_streaming_jar']]) # -files/-archives (generic options, new-style) if supports_new_distributed_cache_options(version): # set up uploading from HDFS to the working dir args.extend(self._new_upload_args(self._upload_mgr)) # Add extra hadoop args first as hadoop args could be a hadoop # specific argument (e.g. -libjar) which must come before job # specific args. args.extend(self._hadoop_args_for_step(step_num)) # set up input for input_uri in self._hdfs_step_input_files(step_num): args.extend(['-input', input_uri]) # set up output args.append('-output') args.append(self._hdfs_step_output_dir(step_num)) # -cacheFile/-cacheArchive (streaming options, old-style) if not supports_new_distributed_cache_options(version): # set up uploading from HDFS to the working dir args.extend(self._old_upload_args(self._upload_mgr)) mapper, combiner, reducer = (self._hadoop_streaming_commands(step_num)) args.append('-mapper') args.append(mapper) if combiner: args.append('-combiner') args.append(combiner) if reducer: args.append('-reducer') args.append(reducer) else: args.extend(['-jobconf', 'mapred.reduce.tasks=0']) return args def _args_for_jar_step(self, step_num): step = self._get_step(step_num) # special case for consistency with EMR runner. 
# # This might look less like duplicated code if we ever # implement #780 (fetching jars from URIs) if step['jar'].startswith('file:///'): jar = step['jar'][7:] # keep leading slash else: jar = step['jar'] args = (self._opts['hadoop_bin'] + ['jar', jar]) if step.get('main_class'): args.append(step['main_class']) # TODO: merge with logic in mrjob/emr.py def interpolate(arg): if arg == mrjob.step.JarStep.INPUT: return ','.join(self._hdfs_step_input_files(step_num)) elif arg == mrjob.step.JarStep.OUTPUT: return self._hdfs_step_output_dir(step_num) else: return arg if step.get('args'): args.extend(interpolate(arg) for arg in step['args']) return args def _hdfs_step_input_files(self, step_num): """Get the hdfs:// URI for input for the given step.""" if step_num == 0: return [self._upload_mgr.uri(p) for p in self._get_input_paths()] else: return [ posixpath.join(self._hdfs_tmp_dir, 'step-output', str(step_num)) ] def _hdfs_step_output_dir(self, step_num): if step_num == len(self._get_steps()) - 1: return self._output_dir else: return posixpath.join(self._hdfs_tmp_dir, 'step-output', str(step_num + 1)) def _cleanup_local_scratch(self): super(HadoopJobRunner, self)._cleanup_local_scratch() if self._hdfs_tmp_dir: log.info('deleting %s from HDFS' % self._hdfs_tmp_dir) try: self.invoke_hadoop(['fs', '-rmr', self._hdfs_tmp_dir]) except Exception as e: log.exception(e) ### LOG FETCHING/PARSING ### def _enforce_path_regexp(self, paths, regexp, step_nums): """Helper for log fetching functions to filter out unwanted logs. Keyword arguments are checked against their corresponding regex groups. """ for path in paths: m = regexp.match(path) if (m and (step_nums is None or int(m.group('step_num')) in step_nums) and (self._job_timestamp is None or m.group('timestamp') == self._job_timestamp)): yield path def _ls_logs(self, relative_path): """List logs on the local filesystem by path relative to log root directory """ return self.ls(os.path.join(self._hadoop_log_dir, relative_path)) def _fetch_counters(self, step_nums, skip_s3_wait=False): """Read Hadoop counters from local logs. Args: step_nums -- the steps belonging to us, so that we can ignore errors from other jobs run with the same timestamp """ job_logs = self._enforce_path_regexp(self._ls_logs('history/'), HADOOP_JOB_LOG_URI_RE, step_nums) uris = list(job_logs) new_counters = scan_for_counters_in_files(uris, self, self.get_hadoop_version()) # only include steps relevant to the current job for step_num in step_nums: self._counters.append(new_counters.get(step_num, {})) def counters(self): return self._counters def _find_probable_cause_of_failure(self, step_nums): all_task_attempt_logs = [] try: all_task_attempt_logs.extend(self._ls_logs('userlogs/')) except IOError: # sometimes the master doesn't have these pass # TODO: get these logs from slaves if possible task_attempt_logs = self._enforce_path_regexp( all_task_attempt_logs, TASK_ATTEMPTS_LOG_URI_RE, step_nums) step_logs = self._enforce_path_regexp(self._ls_logs('steps/'), STEP_LOG_URI_RE, step_nums) job_logs = self._enforce_path_regexp(self._ls_logs('history/'), HADOOP_JOB_LOG_URI_RE, step_nums) log.info('Scanning logs for probable cause of failure') return best_error_from_logs(self, task_attempt_logs, step_logs, job_logs)
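# For a concrete picture of what _args_for_streaming_step() produces on
# a Hadoop version with new-style distributed cache options, the argument
# list for a one-step job looks roughly like the list below. This is a
# hand-written sketch; every path, jar location, and job name here is
# hypothetical.
example_streaming_args = [
    'hadoop', 'jar', '/home/hadoop/contrib/streaming/hadoop-streaming.jar',
    '-files', 'hdfs:///user/me/tmp/mrjob/my_job.20160101.000000/files/my_job.py',
    '-input', 'hdfs:///user/me/tmp/mrjob/my_job.20160101.000000/files/input.txt',
    '-output', 'hdfs:///user/me/tmp/mrjob/my_job.20160101.000000/output',
    '-mapper', 'python my_job.py --step-num=0 --mapper',
    '-reducer', 'python my_job.py --step-num=0 --reducer',
]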
class HadoopJobRunner(MRJobRunner): """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster. Invoked when you run your job with ``-r hadoop``. Input and support files can be either local or on HDFS; use ``hdfs://...`` URLs to refer to files on HDFS. """ alias = "hadoop" OPTION_STORE_CLASS = HadoopRunnerOptionStore def __init__(self, **kwargs): """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. """ super(HadoopJobRunner, self).__init__(**kwargs) self._hdfs_tmp_dir = fully_qualify_hdfs_path(posixpath.join(self._opts["hdfs_scratch_dir"], self._job_key)) # Keep track of local files to upload to HDFS. We'll add them # to this manager just before we need them. hdfs_files_dir = posixpath.join(self._hdfs_tmp_dir, "files", "") self._upload_mgr = UploadDirManager(hdfs_files_dir) # Set output dir if it wasn't set explicitly self._output_dir = fully_qualify_hdfs_path(self._output_dir or posixpath.join(self._hdfs_tmp_dir, "output")) self._hadoop_log_dir = hadoop_log_dir(self._opts["hadoop_home"]) # Running jobs via hadoop assigns a new timestamp to each job. # Running jobs via mrjob only adds steps. # Store both of these values to enable log parsing. self._job_timestamp = None self._start_step_num = 0 # init hadoop version cache self._hadoop_version = None @property def fs(self): """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local filesystem. """ if self._fs is None: self._fs = CompositeFilesystem(HadoopFilesystem(self._opts["hadoop_bin"]), LocalFilesystem()) return self._fs def get_hadoop_version(self): """Invoke the hadoop executable to determine its version""" if not self._hadoop_version: stdout = self.invoke_hadoop(["version"], return_stdout=True) if stdout: first_line = stdout.split("\n")[0] m = HADOOP_VERSION_RE.match(first_line) if m: self._hadoop_version = m.group("version") log.info("Using Hadoop version %s" % self._hadoop_version) return self._hadoop_version self._hadoop_version = "0.20.203" log.info("Unable to determine Hadoop version. Assuming 0.20.203.") return self._hadoop_version def _run(self): self._check_input_exists() self._create_setup_wrapper_script() self._add_job_files_for_upload() self._upload_local_files_to_hdfs() self._run_job_in_hadoop() def _check_input_exists(self): """Make sure all input exists before continuing with our job. """ for path in self._input_paths: if path == "-": continue # STDIN always exists if self._opts["check_input_paths"]: if not self.path_exists(path): raise AssertionError("Input path %s does not exist!" 
% (path,)) def _add_job_files_for_upload(self): """Add files needed for running the job (setup and input) to self._upload_mgr.""" for path in self._get_input_paths(): self._upload_mgr.add(path) for path in self._working_dir_mgr.paths(): self._upload_mgr.add(path) def _upload_local_files_to_hdfs(self): """Copy files managed by self._upload_mgr to HDFS """ self._mkdir_on_hdfs(self._upload_mgr.prefix) log.info("Copying local files into %s" % self._upload_mgr.prefix) for path, uri in self._upload_mgr.path_to_uri().items(): self._upload_to_hdfs(path, uri) def _mkdir_on_hdfs(self, path): log.debug("Making directory %s on HDFS" % path) hadoop_version = self.get_hadoop_version() # from version 0.23 / 2.x on, -mkdir needs a -p option to create # parent directories # version == 0.23 if mrjob.compat.version_gte(hadoop_version, "0.23") and not mrjob.compat.version_gte(hadoop_version, "0.24"): self.invoke_hadoop(["fs", "-mkdir", "-p", path]) # version >= 2.0 elif mrjob.compat.version_gte(hadoop_version, "2.0"): self.invoke_hadoop(["fs", "-mkdir", "-p", path]) # for version 0.20, 1.x else: self.invoke_hadoop(["fs", "-mkdir", path]) def _upload_to_hdfs(self, path, target): log.debug("Uploading %s -> %s on HDFS" % (path, target)) self.invoke_hadoop(["fs", "-put", path, target]) def _dump_stdin_to_local_file(self): """Dump sys.stdin to a local file, and return the path to it.""" stdin_path = os.path.join(self._get_local_tmp_dir(), "STDIN") # prompt user, so they don't think the process has stalled log.info("reading from STDIN") log.debug("dumping stdin to local file %s" % stdin_path) stdin_file = open(stdin_path, "wb") for line in self._stdin: stdin_file.write(line) return stdin_path def _run_job_in_hadoop(self): self._counters = [] for step_num in range(self._num_steps()): log.debug("running step %d of %d" % (step_num + 1, self._num_steps())) step_args = self._args_for_step(step_num) log.debug("> %s" % cmd_line(step_args)) # try to use a PTY if it's available try: pid, master_fd = pty.fork() except (AttributeError, OSError): # no PTYs, just use Popen step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE) self._process_stderr_from_streaming(step_proc.stderr) # there shouldn't be much output to STDOUT for line in step_proc.stdout: log.error("STDOUT: " + to_string(line.strip(b"\n"))) step_proc.stdout.close() step_proc.stderr.close() returncode = step_proc.wait() else: # we have PTYs if pid == 0: # we are the child process os.execvp(step_args[0], step_args) else: with os.fdopen(master_fd, "rb") as master: # reading from master gives us the subprocess's # stderr and stdout (it's a fake terminal) self._process_stderr_from_streaming(master) _, returncode = os.waitpid(pid, 0) if returncode == 0: # parsing needs step number for whole job self._fetch_counters([step_num + self._start_step_num]) # printing needs step number relevant to this run of mrjob self.print_counters([step_num + 1]) else: msg = "Job failed with return code %d: %s" % (returncode, step_args) log.error(msg) # look for a Python traceback cause = self._find_probable_cause_of_failure([step_num + self._start_step_num]) if cause: # log cause, and put it in exception cause_msg = [] # lines to log and put in exception cause_msg.append("Probable cause of failure (from %s):" % cause["log_file_uri"]) cause_msg.extend(line.strip("\n") for line in cause["lines"]) if cause["input_uri"]: cause_msg.append("(while reading from %s)" % cause["input_uri"]) for line in cause_msg: log.error(line) # add cause_msg to exception message msg += "\n" + "\n".join(cause_msg) 
+ "\n" raise CalledProcessError(returncode, step_args) def _process_stderr_from_streaming(self, stderr): def treat_eio_as_eof(iter): # on Linux, the PTY gives us a specific IOError when the # when the child process exits, rather than EOF. while True: try: yield next(iter) # okay for StopIteration to bubble up except IOError as e: if e.errno == errno.EIO: return else: raise for line in treat_eio_as_eof(stderr): line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2) log.info("HADOOP: " + to_string(line)) if b"Streaming Job Failed!" in line: raise Exception(line) # The job identifier is printed to stderr. We only want to parse it # once because we know how many steps we have and just want to know # what Hadoop thinks the first step's number is. m = HADOOP_JOB_TIMESTAMP_RE.match(line) if m and self._job_timestamp is None: self._job_timestamp = m.group("timestamp") self._start_step_num = int(m.group("step_num")) def _args_for_step(self, step_num): step = self._get_step(step_num) if step["type"] == "streaming": return self._args_for_streaming_step(step_num) elif step["type"] == "jar": return self._args_for_jar_step(step_num) else: raise AssertionError("Bad step type: %r" % (step["type"],)) def _args_for_streaming_step(self, step_num): version = self.get_hadoop_version() args = self._opts["hadoop_bin"] + ["jar", self._opts["hadoop_streaming_jar"]] # -files/-archives (generic options, new-style) if supports_new_distributed_cache_options(version): # set up uploading from HDFS to the working dir args.extend(self._new_upload_args(self._upload_mgr)) # Add extra hadoop args first as hadoop args could be a hadoop # specific argument (e.g. -libjar) which must come before job # specific args. args.extend(self._hadoop_args_for_step(step_num)) # set up input for input_uri in self._hdfs_step_input_files(step_num): args.extend(["-input", input_uri]) # set up output args.append("-output") args.append(self._hdfs_step_output_dir(step_num)) # -cacheFile/-cacheArchive (streaming options, old-style) if not supports_new_distributed_cache_options(version): # set up uploading from HDFS to the working dir args.extend(self._old_upload_args(self._upload_mgr)) mapper, combiner, reducer = self._hadoop_streaming_commands(step_num) args.append("-mapper") args.append(mapper) if combiner: args.append("-combiner") args.append(combiner) if reducer: args.append("-reducer") args.append(reducer) else: args.extend(["-jobconf", "mapred.reduce.tasks=0"]) return args def _args_for_jar_step(self, step_num): step = self._get_step(step_num) # special case for consistency with EMR runner. 
# # This might look less like duplicated code if we ever # implement #780 (fetching jars from URIs) if step["jar"].startswith("file:///"): jar = step["jar"][7:] # keep leading slash else: jar = step["jar"] args = self._opts["hadoop_bin"] + ["jar", jar] if step.get("main_class"): args.append(step["main_class"]) # TODO: merge with logic in mrjob/emr.py def interpolate(arg): if arg == mrjob.step.JarStep.INPUT: return ",".join(self._hdfs_step_input_files(step_num)) elif arg == mrjob.step.JarStep.OUTPUT: return self._hdfs_step_output_dir(step_num) else: return arg if step.get("args"): args.extend(interpolate(arg) for arg in step["args"]) return args def _hdfs_step_input_files(self, step_num): """Get the hdfs:// URI for input for the given step.""" if step_num == 0: return [self._upload_mgr.uri(p) for p in self._get_input_paths()] else: return [posixpath.join(self._hdfs_tmp_dir, "step-output", str(step_num))] def _hdfs_step_output_dir(self, step_num): if step_num == len(self._get_steps()) - 1: return self._output_dir else: return posixpath.join(self._hdfs_tmp_dir, "step-output", str(step_num + 1)) def _cleanup_local_scratch(self): super(HadoopJobRunner, self)._cleanup_local_scratch() if self._hdfs_tmp_dir: log.info("deleting %s from HDFS" % self._hdfs_tmp_dir) try: self.invoke_hadoop(["fs", "-rmr", self._hdfs_tmp_dir]) except Exception as e: log.exception(e) ### LOG FETCHING/PARSING ### def _enforce_path_regexp(self, paths, regexp, step_nums): """Helper for log fetching functions to filter out unwanted logs. Keyword arguments are checked against their corresponding regex groups. """ for path in paths: m = regexp.match(path) if ( m and (step_nums is None or int(m.group("step_num")) in step_nums) and (self._job_timestamp is None or m.group("timestamp") == self._job_timestamp) ): yield path def _ls_logs(self, relative_path): """List logs on the local filesystem by path relative to log root directory """ return self.ls(os.path.join(self._hadoop_log_dir, relative_path)) def _fetch_counters(self, step_nums, skip_s3_wait=False): """Read Hadoop counters from local logs. Args: step_nums -- the steps belonging to us, so that we can ignore errors from other jobs run with the same timestamp """ job_logs = self._enforce_path_regexp(self._ls_logs("history/"), HADOOP_JOB_LOG_URI_RE, step_nums) uris = list(job_logs) new_counters = scan_for_counters_in_files(uris, self, self.get_hadoop_version()) # only include steps relevant to the current job for step_num in step_nums: self._counters.append(new_counters.get(step_num, {})) def counters(self): return self._counters def _find_probable_cause_of_failure(self, step_nums): all_task_attempt_logs = [] try: all_task_attempt_logs.extend(self._ls_logs("userlogs/")) except IOError: # sometimes the master doesn't have these pass # TODO: get these logs from slaves if possible task_attempt_logs = self._enforce_path_regexp(all_task_attempt_logs, TASK_ATTEMPTS_LOG_URI_RE, step_nums) step_logs = self._enforce_path_regexp(self._ls_logs("steps/"), STEP_LOG_URI_RE, step_nums) job_logs = self._enforce_path_regexp(self._ls_logs("history/"), HADOOP_JOB_LOG_URI_RE, step_nums) log.info("Scanning logs for probable cause of failure") return best_error_from_logs(self, task_attempt_logs, step_logs, job_logs)
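# The step-chaining convention in _hdfs_step_input_files() and
# _hdfs_step_output_dir() above is easiest to see with concrete values.
# A minimal sketch, with a hypothetical scratch dir and a three-step job:
import posixpath

hdfs_tmp_dir = "hdfs:///user/me/tmp/mrjob/my_job.20160101.000000"
output_dir = "hdfs:///user/me/my_job_output"
num_steps = 3

def step_output_dir(step_num):
    # intermediate steps write under <scratch>/step-output/<n + 1>;
    # only the final step writes to the real output dir
    if step_num == num_steps - 1:
        return output_dir
    return posixpath.join(hdfs_tmp_dir, "step-output", str(step_num + 1))

assert step_output_dir(0) == hdfs_tmp_dir + "/step-output/1"
assert step_output_dir(2) == output_dir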
def test_uri_adds_trailing_slash(self):
    sd = UploadDirManager("s3://bucket/dir")
    sd.add("foo/bar.py")
    self.assertEqual(sd.uri("foo/bar.py"), "s3://bucket/dir/bar.py")
    self.assertEqual(sd.path_to_uri(),
                     {"foo/bar.py": "s3://bucket/dir/bar.py"})
def test_name_collision(self):
    sd = UploadDirManager("hdfs:///")
    sd.add("foo/bar.py")
    sd.add("bar.py")
    self.assertEqual(sd.path_to_uri(),
                     {"foo/bar.py": "hdfs:///bar.py",
                      "bar.py": "hdfs:///bar-1.py"})
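# Taken together, these tests pin down UploadDirManager's naming rules:
# the prefix gains a trailing slash if it lacks one, each path maps to
# its basename, and collisions get a -1, -2, ... suffix before the
# extension. A from-scratch sketch that satisfies the tests above,
# purely for illustration (mrjob's real UploadDirManager may differ in
# details):
import posixpath

class SimpleUploadDirManager(object):
    def __init__(self, prefix):
        # ensure the URI prefix ends with exactly one slash
        self.prefix = prefix if prefix.endswith("/") else prefix + "/"
        self._path_to_name = {}
        self._names_taken = set()

    def add(self, path):
        if path in self._path_to_name:
            return
        base = posixpath.basename(path)
        name, ext = posixpath.splitext(base)
        candidate, i = base, 0
        # on a name collision, append -1, -2, ... before the extension
        while candidate in self._names_taken:
            i += 1
            candidate = "%s-%d%s" % (name, i, ext)
        self._names_taken.add(candidate)
        self._path_to_name[path] = candidate

    def uri(self, path):
        return self.prefix + self._path_to_name[path]

    def path_to_uri(self):
        return dict((p, self.prefix + n)
                    for p, n in self._path_to_name.items())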