def test_empty_fs(self):
    """A composite with no registered filesystems handles no paths."""
    composite = CompositeFilesystem()

    # neither URIs nor local paths are claimed by an empty composite
    for path in ('s3://walrus/fish', '/'):
        self.assertFalse(composite.can_handle_path(path))

    with self.assertRaises(IOError):
        composite.ls('/')
def fs(self):
    """Return this runner's composite filesystem, building it on first use.

    S3 and GCS filesystems are only registered when ``boto3_installed`` /
    ``google_libs_installed`` are true; the Hadoop and local filesystems
    are always registered, in that order, after them.
    """
    # Spark supports basically every filesystem there is
    if self._fs:
        return self._fs

    # store the composite on self before registering anything
    composite = self._fs = CompositeFilesystem()

    if boto3_installed:
        # the S3 option names are identical to S3Filesystem's keyword names
        s3_opt_names = (
            'aws_access_key_id',
            'aws_secret_access_key',
            'aws_session_token',
            's3_endpoint',
            's3_region',
        )
        s3_kwargs = {name: self._opts[name] for name in s3_opt_names}
        composite.add_fs(
            's3',
            S3Filesystem(**s3_kwargs),
            disable_if=_is_permanent_boto3_error)

    if google_libs_installed:
        gcs_fs = GCSFilesystem(
            project_id=self._opts['project_id'],
            location=self._opts['gcs_region'],
            object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS,
        )
        composite.add_fs(
            'gcs', gcs_fs, disable_if=_is_permanent_google_error)

    # Hadoop FS is responsible for all URIs that fall through to it
    composite.add_fs('hadoop', HadoopFilesystem(self._opts['hadoop_bin']))
    composite.add_fs('local', LocalFilesystem())

    return self._fs
def fs(self):
    """Return this runner's composite filesystem, building it on first use.

    S3 and GCS filesystems are only registered when ``boto3_installed`` /
    ``google_libs_installed`` are true; the Hadoop and local filesystems
    are always registered, in that order, after them.
    """
    # Spark supports basically every filesystem there is
    if self._fs:
        return self._fs

    # store the composite on self before registering anything
    composite = self._fs = CompositeFilesystem()

    if boto3_installed:
        # the S3 option names are identical to S3Filesystem's keyword names
        s3_opt_names = (
            'aws_access_key_id',
            'aws_secret_access_key',
            'aws_session_token',
            's3_endpoint',
            's3_region',
        )
        s3_kwargs = {name: self._opts[name] for name in s3_opt_names}
        composite.add_fs(
            's3',
            S3Filesystem(**s3_kwargs),
            disable_if=_is_permanent_boto3_error)

    if google_libs_installed:
        gcs_fs = GCSFilesystem(project_id=self._opts['google_project_id'])
        composite.add_fs(
            'gcs', gcs_fs, disable_if=_is_permanent_google_error)

    composite.add_fs('hadoop', HadoopFilesystem(self._opts['hadoop_bin']))
    composite.add_fs('local', LocalFilesystem())

    return self._fs
def test_forward_join(self):
    """join() is forwarded to the filesystem that handles the first path."""
    # join() is a special case since it takes multiple arguments
    composite = CompositeFilesystem()
    composite.add_fs('s3', self.s3_fs)

    joined = composite.join('s3://walrus/fish', 'salmon')

    self.assertEqual(joined, self.s3_fs.join.return_value)
    self.s3_fs.join.assert_called_once_with('s3://walrus/fish', 'salmon')
def fs(self):
    """:py:class:`~mrjob.fs.base.Filesystem` object for the local
    filesystem.
    """
    if self._fs is not None:
        return self._fs

    # wrap LocalFilesystem in CompositeFilesystem to get IOError
    # on URIs (see #1185)
    self._fs = CompositeFilesystem()
    self._fs.add_fs('local', LocalFilesystem())

    return self._fs
def fs(self):
    """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and
    the local filesystem.

    Built on first access and cached on ``self._fs``.
    """
    if self._fs is not None:
        return self._fs

    self._fs = CompositeFilesystem()

    # hadoop is registered ahead of the local filesystem
    hadoop_fs = HadoopFilesystem(self._opts['hadoop_bin'])
    self._fs.add_fs('hadoop', hadoop_fs)
    self._fs.add_fs('local', LocalFilesystem())

    return self._fs
def fs(self):
    """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and
    the local filesystem.

    Built on first access and cached on ``self._fs``.
    """
    if self._fs is not None:
        return self._fs

    self._fs = CompositeFilesystem()

    # don't pass [] to fs; this means not to use hadoop until
    # fs.set_hadoop_bin() is called (used for running hadoop over SSH).
    # Any falsy hadoop_bin option is therefore normalized to None.
    configured_bin = self._opts['hadoop_bin']
    hadoop_bin = configured_bin if configured_bin else None

    self._fs.add_fs('hadoop', HadoopFilesystem(hadoop_bin))
    self._fs.add_fs('local', LocalFilesystem())

    return self._fs
def fs(self):
    """:py:class:`~mrjob.fs.base.Filesystem` object for the local
    filesystem.

    Methods on :py:class:`~mrjob.fs.base.Filesystem` objects will be
    forwarded to :py:class:`~mrjob.runner.MRJobRunner` until mrjob 0.6.0,
    but **this behavior is deprecated.**
    """
    if self._fs is not None:
        return self._fs

    # wrap LocalFilesystem in CompositeFilesystem to get IOError
    # on URIs (see #1185)
    local_fs = LocalFilesystem()
    self._fs = CompositeFilesystem(local_fs)

    return self._fs
def fs(self):
    """:py:class:`~mrjob.fs.base.Filesystem` object for GCS and the
    local filesystem.

    Built on first access; the GCS filesystem is also kept on
    ``self._gcs_fs``.
    """
    if self._fs is None:
        self._gcs_fs = GCSFilesystem()
        # GCS is passed ahead of the local filesystem
        self._fs = CompositeFilesystem(self._gcs_fs, LocalFilesystem())

    return self._fs
def fs(self):
    """:py:class:`~mrjob.fs.base.Filesystem` object for GCS and the
    local filesystem.

    Built on first access and cached on ``self._fs``.
    """
    if self._fs is not None:
        return self._fs

    self._fs = CompositeFilesystem()

    # prefer an explicit region; otherwise derive one from the zone
    location = self._opts['region']
    if not location:
        location = _zone_to_region(self._opts['zone'])

    gcs_fs = GCSFilesystem(
        credentials=self._credentials,
        project_id=self._project_id,
        part_size=self._upload_part_size(),
        location=location,
        object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS,
    )
    self._fs.add_fs('gcs', gcs_fs)
    self._fs.add_fs('local', LocalFilesystem())

    return self._fs
def test_forward_put_with_part_size(self):
    """put() hands *part_size_mb* to the target filesystem positionally."""
    composite = CompositeFilesystem()
    composite.add_fs('s3', self.s3_fs)

    src = '/path/to/file'
    dest = 's3://walrus/file'
    composite.put(src, dest, part_size_mb=99999)

    self.s3_fs.put.assert_called_once_with(src, dest, 99999)
def test_forward_put(self):
    """put() is routed by its destination path, not its source."""
    # put() is a special case since the path that matters comes second
    composite = CompositeFilesystem()
    composite.add_fs('s3', self.s3_fs)

    src = '/path/to/file'
    dest = 's3://walrus/file'
    composite.put(src, dest)

    self.s3_fs.put.assert_called_once_with(src, dest)
def fs(self):
    """:py:class:`~mrjob.fs.base.Filesystem` object for GCS and the
    local filesystem.

    Built on first access; the GCS filesystem is also kept on
    ``self._gcs_fs``.
    """
    if self._fs is None:
        self._gcs_fs = GCSFilesystem(
            credentials=self._credentials,
            local_tmp_dir=self._get_local_tmp_dir(),
            project_id=self._project_id,
        )
        # GCS is passed ahead of the local filesystem
        self._fs = CompositeFilesystem(self._gcs_fs, LocalFilesystem())

    return self._fs
def test_forward_fs_extensions(self):
    """Filesystem-specific extras are reachable through the composite."""
    composite = CompositeFilesystem()
    composite.add_fs('s3', self.s3_fs)
    composite.add_fs('hadoop', self.hadoop_fs)

    self.assertEqual(composite.create_bucket, self.s3_fs.create_bucket)
    self.assertEqual(composite.get_hadoop_version,
                     self.hadoop_fs.get_hadoop_version)

    # "client" is not forwarded; accessing it raises AttributeError
    with self.assertRaises(AttributeError):
        composite.client
def test_pick_fs(self):
    """The first registered filesystem that can handle a path gets it."""
    composite = CompositeFilesystem()
    composite.add_fs('s3', self.s3_fs)
    composite.add_fs('hadoop', self.hadoop_fs)

    s3_uri = 's3://walrus/fish'

    self.assertEqual(composite.ls(s3_uri), self.s3_fs.ls.return_value)

    # hadoop fs could have handled it, but s3_fs got it first
    self.assertTrue(self.hadoop_fs.can_handle_path(s3_uri))
    self.assertFalse(self.hadoop_fs.ls.called)

    self.assertEqual(composite.ls('hdfs:///user/hadoop/'),
                     self.hadoop_fs.ls.return_value)

    # don't move on to the next FS on an error (unlike old
    # CompositeFilesystem implementation)
    self.s3_fs.ls.side_effect = IOError

    with self.assertRaises(IOError):
        composite.ls(s3_uri)
def test_disable_fs(self):
    """An error matching *disable_if* disables that filesystem for good."""
    class NoCredentialsError(Exception):
        pass

    def is_missing_credentials(ex):
        return isinstance(ex, NoCredentialsError)

    composite = CompositeFilesystem()

    # tentatively use S3 filesystem, if set up
    composite.add_fs('s3', self.s3_fs, disable_if=is_missing_credentials)
    composite.add_fs('hadoop', self.hadoop_fs)

    self.s3_fs.ls.side_effect = NoCredentialsError

    # calling ls() on S3 fs disables it, so we move on to hadoop fs
    listing = composite.ls('s3://walrus/')
    self.assertEqual(listing, self.hadoop_fs.ls.return_value)
    self.assertTrue(self.s3_fs.ls.called)

    self.assertIn('s3', composite._disabled)

    # now that s3 fs is disabled, we won't even try to call it
    contents = composite.cat('s3://walrus/fish')
    self.assertEqual(contents, self.hadoop_fs.cat.return_value)
    self.assertFalse(self.s3_fs.cat.called)
class DataprocJobRunner(HadoopInTheCloudJobRunner, LogInterpretationMixin): """Runs an :py:class:`~mrjob.job.MRJob` on Google Cloud Dataproc. Invoked when you run your job with ``-r dataproc``. :py:class:`DataprocJobRunner` runs your job in an Dataproc cluster, which is basically a temporary Hadoop cluster. Input, support, and jar files can be either local or on GCS; use ``gs://...`` URLs to refer to files on GCS. This class has some useful utilities for talking directly to GCS and Dataproc, so you may find it useful to instantiate it without a script:: from mrjob.dataproc import DataprocJobRunner ... """ alias = 'dataproc' OPT_NAMES = HadoopInTheCloudJobRunner.OPT_NAMES | { 'cluster_properties', 'core_instance_config', 'gcloud_bin', 'master_instance_config', 'network', 'project_id', 'service_account', 'service_account_scopes', 'subnet', 'task_instance_config', } # no Spark support yet (see #1765) _STEP_TYPES = {'jar', 'streaming'} def __init__(self, **kwargs): """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`. 
""" super(DataprocJobRunner, self).__init__(**kwargs) # check for library support if google is None: raise ImportError('You must install google-cloud-logging and ' 'google-cloud-storage to connect to Dataproc') # Dataproc requires a master and >= 2 core instances # num_core_instances refers ONLY to number of CORE instances and does # NOT include the required 1 instance for master # In other words, minimum cluster size is 3 machines, 1 master and 2 # "num_core_instances" workers if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS: raise DataprocException('Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS) if (self._opts['core_instance_type'] != self._opts['task_instance_type']): raise DataprocException( 'Dataproc v1 expects core/task instance types to be identical') # see #1820 if self._opts['image_id']: log.warning('mrjob does not yet support custom machine images' ' on Dataproc') # load credentials and project ID self._credentials, auth_project_id = google.auth.default( scopes=[_FULL_SCOPE]) # needed for $GOOGLE_APPLICATION_CREDENTIALS self._project_id = self._opts['project_id'] or auth_project_id if not self._project_id: raise DataprocException( 'project_id must be set. 
Use --project_id or' ' set $GOOGLE_CLOUD_PROJECT') self._fix_zone_and_region_opts() if self._opts['service_account_scopes']: self._opts['service_account_scopes'] = [ _fully_qualify_scope_uri(s) for s in self._opts['service_account_scopes'] ] # cluster_id can be None here self._cluster_id = self._opts['cluster_id'] self._api_client = None self._gcs_fs = None self._fs = None # BEGIN - setup directories base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir']) self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir) # use job key to make a unique tmp dir self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/' # pick/validate output dir if self._output_dir: self._output_dir = _check_and_fix_fs_dir(self._output_dir) else: self._output_dir = self._job_tmpdir + 'output/' # END - setup directories # manage local files that we want to upload to GCS. We'll add them # to this manager just before we need them. fs_files_dir = self._job_tmpdir + 'files/' self._upload_mgr = UploadDirManager(fs_files_dir) # when did our particular task start? 
self._dataproc_job_start = None # init hadoop, ami version caches self._image_version = None self._hadoop_version = None # map driver_output_uri to a dict with the keys: # log_uri: uri of file we're reading from # pos: position in file # buffer: bytes read from file already self._driver_output_state = {} # This will be filled by _run_steps() # NOTE - log_interpretations will be empty except job_id until we # parse task logs self._log_interpretations = [] def _fix_zone_and_region_opts(self): """Ensure that exactly one of region and zone is set.""" if self._opts['region'] and self._opts['zone']: log.warning('you do not need to set region if you set zone') self._opts['region'] = None return if not (self._opts['region'] or self._opts['zone']): if environ.get('CLOUDSDK_COMPUTE_ZONE'): self._opts['zone'] = environ['CLOUDSDK_COMPUTE_ZONE'] elif environ.get('CLOUDSDK_COMPUTE_REGION'): self._opts['region'] = environ['CLOUDSDK_COMPUTE_REGION'] else: self._opts['region'] = _DEFAULT_GCE_REGION def _default_opts(self): return combine_dicts( super(DataprocJobRunner, self)._default_opts(), dict( bootstrap_python=True, check_cluster_every=_DEFAULT_CHECK_CLUSTER_EVERY, cleanup=['CLUSTER', 'JOB', 'LOCAL_TMP'], cloud_fs_sync_secs=_DEFAULT_CLOUD_FS_SYNC_SECS, image_version=_DEFAULT_IMAGE_VERSION, instance_type=_DEFAULT_INSTANCE_TYPE, master_instance_type=_DEFAULT_INSTANCE_TYPE, num_core_instances=_DATAPROC_MIN_WORKERS, num_task_instances=0, )) def _combine_opts(self, opt_list): """Blank out conflicts between *network*/*subnet* and *region*/*zone*.""" opt_list = _blank_out_conflicting_opts(opt_list, ['region', 'zone']) opt_list = _blank_out_conflicting_opts(opt_list, ['network', 'subnet']) # now combine opts, with region/zone blanked out return super(DataprocJobRunner, self)._combine_opts(opt_list) @property def cluster_client(self): return google.cloud.dataproc_v1beta2.ClusterControllerClient( **self._client_create_kwargs()) @property def job_client(self): return 
google.cloud.dataproc_v1beta2.JobControllerClient( **self._client_create_kwargs()) @property def logging_client(self): return google.cloud.logging.Client(credentials=self._credentials, project=self._project_id) def _client_create_kwargs(self): if self._opts['region']: endpoint = '%s-%s' % (self._opts['region'], _DEFAULT_ENDPOINT) return dict(channel=google.api_core.grpc_helpers.create_channel( endpoint, credentials=self._credentials)) else: return dict(credentials=self._credentials) @property def api_client(self): raise NotImplementedError( '"api_client" was disabled in v0.6.2. Use "cluster_client"' ' or "job_client" instead.') @property def fs(self): """:py:class:`~mrjob.fs.base.Filesystem` object for SSH, S3, GCS, and the local filesystem. """ if self._fs is None: self._fs = CompositeFilesystem() location = self._opts['region'] or _zone_to_region( self._opts['zone']) self._fs.add_fs( 'gcs', GCSFilesystem( credentials=self._credentials, project_id=self._project_id, part_size=self._upload_part_size(), location=location, object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS, )) self._fs.add_fs('local', LocalFilesystem()) return self._fs def _get_tmpdir(self, given_tmpdir): """Helper for _fix_tmpdir""" if given_tmpdir: return given_tmpdir # Loop over buckets until we find one that matches region # NOTE - because this is a tmpdir, we look for a GCS bucket in the # same GCE region chosen_bucket_name = None # determine region for bucket region = self._region() for tmp_bucket_name in self.fs.gcs.get_all_bucket_names( prefix='mrjob-'): tmp_bucket = self.fs.gcs.get_bucket(tmp_bucket_name) # NOTE - GCP ambiguous Behavior - Bucket location is being # returned as UPPERCASE, ticket filed as of Apr 23, 2016 as docs # suggest lowercase. (As of Feb. 
12, 2018, this is still true, # observed on google-cloud-sdk) if tmp_bucket.location.lower() == region: # Regions are both specified and match log.info("using existing temp bucket %s" % tmp_bucket_name) chosen_bucket_name = tmp_bucket_name break # Example default - "mrjob-us-central1-RANDOMHEX" if not chosen_bucket_name: chosen_bucket_name = '-'.join( ['mrjob', region, random_identifier()]) return 'gs://%s/tmp/' % chosen_bucket_name def _region(self): # region of cluster, which is either the region set by the user, # or the region derived from the zone they set. # used to pick bucket location and name cluster return self._opts['region'] or _zone_to_region(self._opts['zone']) def _run(self): self._launch() self._run_steps() def _launch(self): self._prepare_for_launch() self._launch_cluster() def _prepare_for_launch(self): self._check_output_not_exists() self._create_setup_wrapper_scripts() self._add_bootstrap_files_for_upload() self._add_job_files_for_upload() self._upload_local_files() self._wait_for_fs_sync() def _check_output_not_exists(self): """Verify the output path does not already exist. This avoids provisioning a cluster only to have Hadoop refuse to launch. """ if self.fs.exists(self._output_dir): raise IOError('Output path %s already exists!' % (self._output_dir, )) def _add_bootstrap_files_for_upload(self): """Add files needed by the bootstrap script to self._upload_mgr. Tar up mrjob if bootstrap_mrjob is True. Create the master bootstrap script if necessary. 
""" # lazily create mrjob.zip if self._bootstrap_mrjob(): self._create_mrjob_zip() self._bootstrap_dir_mgr.add('file', self._mrjob_zip_path) # all other files needed by the script are already in # _bootstrap_dir_mgr for path in self._bootstrap_dir_mgr.paths(): self._upload_mgr.add(path) # now that we know where the above files live, we can create # the master bootstrap script self._create_master_bootstrap_script_if_needed() if self._master_bootstrap_script_path: self._upload_mgr.add(self._master_bootstrap_script_path) def _add_job_files_for_upload(self): """Add files needed for running the job (setup and input) to self._upload_mgr.""" for path in self._working_dir_mgr.paths('archive'): self._upload_mgr.add(path) if self._opts['hadoop_streaming_jar']: self._upload_mgr.add(self._opts['hadoop_streaming_jar']) for step in self._get_steps(): if step.get('jar'): self._upload_mgr.add(step['jar']) ### Running the job ### def cleanup(self, mode=None): super(DataprocJobRunner, self).cleanup(mode=mode) # close our SSH tunnel, if any self._kill_ssh_tunnel() # stop the cluster if it belongs to us (it may have stopped on its # own already, but that's fine) if self._cluster_id and not self._opts['cluster_id']: self._cleanup_cluster() def _cleanup_cloud_tmp(self): # delete all the files we created if not self._job_tmpdir: return try: log.info('Removing all files in %s' % self._job_tmpdir) self.fs.rm(self._job_tmpdir) self._job_tmpdir = None except Exception as e: log.exception(e) # TODO - mtai @ davidmarin - Re-enable log support and supporting cleanup def _cleanup_logs(self): super(DataprocJobRunner, self)._cleanup_logs() def _cleanup_job(self): job_prefix = self._dataproc_job_prefix() for job in self._list_jobs(cluster_name=self._cluster_id, state_matcher=_STATE_MATCHER_ACTIVE): # Kill all active jobs with the same job_prefix as this job job_id = job.reference.job_id if not job_id.startswith(job_prefix): continue self._cancel_job(job_id) self._wait_for_api('job cancellation') 
def _cleanup_cluster(self): if not self._cluster_id: # If we don't have a cluster, then we can't terminate it. return try: log.info("Attempting to terminate cluster") self._delete_cluster(self._cluster_id) except Exception as e: log.exception(e) return log.info('cluster %s successfully terminated' % self._cluster_id) def _wait_for_api(self, msg): _wait_for(msg, self._opts['check_cluster_every']) def _wait_for_fs_sync(self): """Sleep for a little while, to give FS a chance to sync up. """ _wait_for('GCS sync (eventual consistency)', self._opts['cloud_fs_sync_secs']) def _streaming_step_job_kwarg(self, step_num): """Returns a map from ``'hadoop_job'`` to a dict representing a hadoop streaming job. """ return dict(hadoop_job=dict( args=self._hadoop_streaming_jar_args(step_num), main_jar_file_uri=self._hadoop_streaming_jar_uri(), )) def _jar_step_job_kwarg(self, step_num): """Returns a map from ``'hadoop_job'`` to a dict representing a Hadoop job that runs a JAR""" step = self._get_step(step_num) hadoop_job = {} hadoop_job['args'] = (self._interpolate_jar_step_args( step['args'], step_num)) jar_uri = self._upload_mgr.uri(step['jar']) # can't specify main_class and main_jar_file_uri; see # https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs#HadoopJob # noqa if step.get('main_class'): hadoop_job['jar_file_uris'] = [jar_uri] hadoop_job['main_class'] = step['main_class'] else: hadoop_job['main_jar_file_uri'] = jar_uri return dict(hadoop_job=hadoop_job) def _hadoop_streaming_jar_uri(self): if self._opts['hadoop_streaming_jar']: return self._upload_mgr.uri(self._opts['hadoop_streaming_jar']) else: return _HADOOP_STREAMING_JAR_URI def _launch_cluster(self): """Create an empty cluster on Dataproc, and set self._cluster_id to its ID.""" self.fs.mkdir(self._job_tmpdir) # clusterName must be a match of # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).' 
# as documented in an API error message # (not currently documented in the Dataproc docs) if not self._cluster_id: self._cluster_id = '-'.join( ['mrjob', self._region(), random_identifier()]) # Create the cluster if it's missing, otherwise join an existing one try: self._get_cluster(self._cluster_id) log.info('Adding job to existing cluster - %s' % self._cluster_id) except google.api_core.exceptions.NotFound: log.info('Creating Dataproc Hadoop cluster - %s' % self._cluster_id) cluster_data = self._cluster_create_kwargs() self._create_cluster(cluster_data) self._wait_for_cluster_ready(self._cluster_id) self._set_up_ssh_tunnel() # keep track of when we launched our job self._dataproc_job_start = time.time() return self._cluster_id def _wait_for_cluster_ready(self, cluster_id): # See https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters#State # noqa cluster_state = None # Poll until cluster is ready while cluster_state not in ('RUNNING', 'UPDATING'): cluster = self._get_cluster(cluster_id) cluster_state = cluster.status.State.Name(cluster.status.state) if cluster_state in ('ERROR', 'DELETING'): raise DataprocException(cluster) self._wait_for_api('cluster to accept jobs') return cluster_id def _dataproc_job_prefix(self): return _cleanse_gcp_job_id(self._job_key) def _run_steps(self): """Wait for every step of the job to complete, one by one.""" total_steps = self._num_steps() # define out steps for step_num in range(total_steps): job_id = self._launch_step(step_num) self._wait_for_step_to_complete(job_id, step_num=step_num, num_steps=total_steps) log.info('Completed Dataproc Hadoop Job - %s', job_id) # After all steps completed, wait for the last output (which is # usually written to GCS) to sync self._wait_for_fs_sync() def _launch_step(self, step_num): step = self._get_step(step_num) # Clean-up step name step_name = '%s---step-%05d-of-%05d' % ( self._dataproc_job_prefix(), step_num + 1, self._num_steps()) # Build step # job_kwarg is a 
single-item dict, where the key is 'hadoop_job', # 'spark_job', etc. if step['type'] == 'streaming': job_kwarg = self._streaming_step_job_kwarg(step_num) elif step['type'] == 'jar': job_kwarg = self._jar_step_job_kwarg(step_num) else: raise NotImplementedError('Unsupported step type: %r' % step['type']) # Submit it log.info('Submitting Dataproc Hadoop Job - %s', step_name) result = self._submit_job(step_name, job_kwarg) log.info('Submitted Dataproc Hadoop Job - %s', step_name) job_id = result.reference.job_id assert job_id == step_name return job_id def _wait_for_step_to_complete(self, job_id, step_num, num_steps): """Helper for _wait_for_step_to_complete(). Wait for step with the given ID to complete, and fetch counters. If it fails, attempt to diagnose the error, and raise an exception. This also adds an item to self._log_interpretations """ log_interpretation = dict(job_id=job_id) self._log_interpretations.append(log_interpretation) log_interpretation['step'] = {} step_type = self._get_step(step_num)['type'] while True: # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus # noqa job = self._get_job(job_id) job_state = job.status.State.Name(job.status.state) log.info('%s => %s' % (job_id, job_state)) log_interpretation['step']['driver_output_uri'] = ( job.driver_output_resource_uri) self._interpret_step_logs(log_interpretation, step_type) progress = log_interpretation['step'].get('progress') if progress: log.info(' ' + progress['message']) # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State # noqa # these are the states covered by the ACTIVE job state matcher, # plus SETUP_DONE if job_state in ('PENDING', 'RUNNING', 'CANCEL_PENDING', 'SETUP_DONE'): self._wait_for_api('job completion') continue # print counters if job wasn't CANCELLED if job_state != 'CANCELLED': self._log_counters(log_interpretation, step_num) if job_state == 'ERROR': error = self._pick_error(log_interpretation, step_type) if error: 
log.error('Probable cause of failure:\n\n%s\n\n' % _format_error(error)) # we're done, will return at the end of this if job_state == 'DONE': break else: raise StepFailedException(step_num=step_num, num_steps=num_steps) def _default_step_output_dir(self): # put intermediate data in HDFS return 'hdfs:///tmp/mrjob/%s/step-output' % self._job_key ### log intepretation ### # step def _interpret_step_logs(self, log_interpretation, step_type): """Hook for interpreting step logs. Unlike with most runners, you may call this multiple times and it will continue to parse the step log incrementally, which is useful for getting job progress.""" # don't turn this off even if read_logs opt is false; it's # the only way this runner can track job progress driver_output_uri = log_interpretation.get('step', {}).get('driver_output_uri') if driver_output_uri: self._update_step_interpretation(log_interpretation['step'], driver_output_uri) def _update_step_interpretation(self, step_interpretation, driver_output_uri): new_lines = self._get_new_driver_output_lines(driver_output_uri) _interpret_new_dataproc_step_stderr(step_interpretation, new_lines) def _get_new_driver_output_lines(self, driver_output_uri): """Get a list of complete job driver output lines that are new since the last time we checked. 
""" state = self._driver_output_state.setdefault( driver_output_uri, dict(log_uri=None, pos=0, buffer=b'')) # driver output is in logs with names like driveroutput.000000000 log_uris = sorted(self.fs.ls(driver_output_uri + '*')) for log_uri in log_uris: # initialize log_uri with first URI we see if state['log_uri'] is None: # log the location of job driver output just once log.info(' Parsing job driver output from %s*' % driver_output_uri) state['log_uri'] = log_uri # skip log files already parsed if log_uri < state['log_uri']: continue # when parsing the next file, reset *pos* elif log_uri > state['log_uri']: state['pos'] = 0 state['log_uri'] = log_uri log_blob = self.fs.gcs._get_blob(log_uri) try: new_data = log_blob.download_as_string(start=state['pos']) except (google.api_core.exceptions.NotFound, google.api_core.exceptions.RequestRangeNotSatisfiable): # blob was just created, or no more data is available break state['buffer'] += new_data state['pos'] += len(new_data) # convert buffer into lines, saving leftovers for next time stream = BytesIO(state['buffer']) state['buffer'] = b'' lines = [] for line_bytes in stream: if line_bytes.endswith(b'\n'): lines.append(to_unicode(line_bytes)) else: # leave final partial line (if any) in buffer state['buffer'] = line_bytes return lines # history def _interpret_history_log(self, log_interpretation): """Does nothing. 
We can't get the history logs, and we don't need them.""" if not self._read_logs(): return log_interpretation.setdefault('history', {}) # task def _interpret_task_logs(self, log_interpretation, step_type, error_attempt_ids=(), partial=True): """Scan node manager log to find failed container IDs of failed tasks, and then scan the corresponding stderr and syslogs.""" if 'task' in log_interpretation and ( partial or not log_interpretation['task'].get('partial')): return # already interpreted if not self._read_logs(): return step_interpretation = log_interpretation.get('step') or {} application_id = step_interpretation.get('application_id') if not application_id: log.warning( "Can't parse node manager logs; missing application ID") return log_interpretation['task'] = self._task_log_interpretation( application_id, step_type, partial) def _task_log_interpretation(self, application_id, step_type, partial=True): """Helper for :py:meth:`_interpret_task_logs`""" # not bothering with _read_logs() since this is a helper method result = {} for container_id in self._failed_task_container_ids(application_id): error = _parse_task_syslog_records( self._task_syslog_records(application_id, container_id, step_type)) if not error.get('hadoop_error'): # not sure if this ever happens, since we already know # which containers failed continue error['container_id'] = container_id # fix weird munging of java stacktrace error['hadoop_error']['message'] = _fix_java_stack_trace( error['hadoop_error']['message']) task_error = _parse_task_stderr( self._task_stderr_lines(application_id, container_id, step_type)) if task_error: task_error['message'] = _fix_traceback(task_error['message']) error['task_error'] = task_error result.setdefault('errors', []).append(error) # if partial is true, bail out when we find the first task error if task_error and partial: result['partial'] = True return result return result def _failed_task_container_ids(self, application_id): """Stream container IDs of failed 
tasks, in reverse order.""" container_id_prefix = 'container' + application_id[11:] log_filter = self._make_log_filter( 'yarn-yarn-nodemanager', {'jsonPayload.class': _CONTAINER_EXECUTOR_CLASS_NAME}) log.info('Scanning node manager logs for IDs of failed tasks...') # it doesn't seem to work to do self.logging_client.logger(); # there's some RPC dispute about whether the log name should # be qualified by project name or not entries = self.logging_client.list_entries( filter_=log_filter, order_by=google.cloud.logging.DESCENDING) for entry in entries: message = entry.payload.get('message') if not message: continue m = _CONTAINER_EXIT_RE.match(message) if not m: continue returncode = int(m.group('returncode')) if not returncode: continue container_id = m.group('container_id') # matches some other step if not container_id.startswith(container_id_prefix): continue log.debug(' %s' % container_id) yield container_id def _task_stderr_lines(self, application_id, container_id, step_type): """Yield lines from a specific stderr log.""" log_filter = self._make_log_filter( 'yarn-userlogs', { 'jsonPayload.application': application_id, 'jsonPayload.container': container_id, # TODO: pick based on step_type 'jsonPayload.container_logname': 'stderr', }) log.info(' reading stderr log...') entries = self.logging_client.list_entries(filter_=log_filter) # use log4j parsing to handle tab -> newline conversion for record in _log_entries_to_log4j(entries): for line in record['message'].split('\n'): yield line def _task_syslog_records(self, application_id, container_id, step_type): """Yield log4j records from a specific syslog. 
""" log_filter = self._make_log_filter( 'yarn-userlogs', { 'jsonPayload.application': application_id, 'jsonPayload.container': container_id, # TODO: pick based on step_type 'jsonPayload.container_logname': 'syslog', }) log.info(' reading syslog...') entries = self.logging_client.list_entries(filter_=log_filter) return _log_entries_to_log4j(entries) # misc def _make_log_filter(self, log_name=None, extra_values=None): # we only want logs from this project, cluster, and region d = {} d['resource.labels.cluster_name'] = self._cluster_id d['resource.labels.project_id'] = self._project_id d['resource.labels.region'] = self._region() d['resource.type'] = 'cloud_dataproc_cluster' if log_name: d['logName'] = 'projects/%s/logs/%s' % (self._project_id, log_name) if extra_values: d.update(extra_values) return _log_filter_str(d) def counters(self): return [ _pick_counters(log_interpretation) for log_interpretation in self._log_interpretations ] ### Bootstrapping ### def get_hadoop_version(self): if self._hadoop_version is None: self._store_cluster_info() return self._hadoop_version def get_image_version(self): """Get the version that our cluster is running. """ if self._image_version is None: self._store_cluster_info() return self._image_version def _store_cluster_info(self): """Set self._image_version and self._hadoop_version.""" if not self._cluster_id: raise ValueError('cluster has not yet been created') cluster = self._get_cluster(self._cluster_id) self._image_version = (cluster.config.software_config.image_version) # protect against new versions, including patch versions # we didn't explicitly request. 
# See #1428
        self._hadoop_version = map_version(self._image_version,
                                           _DATAPROC_IMAGE_TO_HADOOP_VERSION)

    def _bootstrap_pre_commands(self):
        """Return the shell commands ``mkdir /tmp/mrjob`` and
        ``cd /tmp/mrjob``, as a list of strings."""
        # don't run the bootstrap script in / (see #1601)
        return [
            'mkdir /tmp/mrjob',
            'cd /tmp/mrjob',
        ]

    ### Bootstrapping ###

    def _bootstrap_python(self):
        """Return a (possibly empty) list of parsed commands (in the same
        format as returned by parse_setup_cmd()).

        The list is empty if the *bootstrap_python* opt is false; otherwise
        it holds a single ``apt-get install`` command, whose package list
        depends on whether we are running under Python 2.
        """
        if not self._opts['bootstrap_python']:
            return []

        if PY2:
            # Python 2 is already installed; install pip and dev packages
            return [
                ['sudo apt-get install -y python-pip python-dev'],
            ]
        else:
            return [
                ['sudo apt-get install -y python3 python3-pip python3-dev'],
            ]

    def get_cluster_id(self):
        """Return ``self._cluster_id``."""
        return self._cluster_id

    def _cluster_create_kwargs(self):
        """Build the keyword arguments describing the cluster to create.

        Assembles a ``config`` dict (GCE settings, initialization actions,
        master/worker/secondary-worker instance groups, idle-deletion
        lifecycle and software config) from ``self._opts``, and returns
        ``dict(project_id=..., cluster_name=..., config=...)`` after passing
        it through :py:meth:`_add_extra_cluster_params`.
        """
        # the master bootstrap script (if any) is the only init action
        gcs_init_script_uris = []
        if self._master_bootstrap_script_path:
            gcs_init_script_uris.append(
                self._upload_mgr.uri(self._master_bootstrap_script_path))

        cluster_metadata = dict()
        cluster_metadata['mrjob-version'] = mrjob.__version__

        # TODO: remove mrjob-max-secs-idle once lifecycle_config is visible
        # through the gcloud utility and the Google Cloud Console
        cluster_metadata['mrjob-max-secs-idle'] = str(
            int(self._opts['max_mins_idle'] * 60))

        gce_cluster_config = dict(
            metadata=cluster_metadata,
            service_account_scopes=self._opts['service_account_scopes'],
        )

        # optional GCE settings are only included when the opt is set
        if self._opts['network']:
            gce_cluster_config['network_uri'] = self._opts['network']

        if self._opts['subnet']:
            gce_cluster_config['subnetwork_uri'] = self._opts['subnet']

        if self._opts['service_account']:
            gce_cluster_config['service_account'] = (
                self._opts['service_account'])

        if self._opts['service_account_scopes']:
            gce_cluster_config['service_account_scopes'] = (
                self._opts['service_account_scopes'])

        if self._opts['zone']:
            gce_cluster_config['zone_uri'] = _gcp_zone_uri(
                project=self._project_id, zone=self._opts['zone'])

        cluster_config = dict(gce_cluster_config=gce_cluster_config,
                              initialization_actions=[
                                  dict(executable_file=init_script_uri)
                                  for init_script_uri in gcs_init_script_uris
                              ])

        # Task tracker
        master_conf = _gcp_instance_group_config(
            project=self._project_id, zone=self._opts['zone'],
            count=1, instance_type=self._opts['master_instance_type'],
        )
        # user-supplied *_instance_config opts override the generated config
        if self._opts['master_instance_config']:
            master_conf.update(self._opts['master_instance_config'])

        # Compute + storage
        worker_conf = _gcp_instance_group_config(
            project=self._project_id, zone=self._opts['zone'],
            count=self._opts['num_core_instances'],
            instance_type=self._opts['core_instance_type'])
        if self._opts['core_instance_config']:
            worker_conf.update(self._opts['core_instance_config'])

        # Compute ONLY
        secondary_worker_conf = _gcp_instance_group_config(
            project=self._project_id, zone=self._opts['zone'],
            count=self._opts['num_task_instances'],
            instance_type=self._opts['task_instance_type'],
            is_preemptible=True)
        if self._opts['task_instance_config']:
            secondary_worker_conf.update(self._opts['task_instance_config'])

        cluster_config['master_config'] = master_conf
        cluster_config['worker_config'] = worker_conf
        # leave out secondary workers entirely if there are none
        if secondary_worker_conf.get('num_instances'):
            cluster_config['secondary_worker_config'] = secondary_worker_conf

        cluster_config['lifecycle_config'] = dict(idle_delete_ttl=dict(
            seconds=int(self._opts['max_mins_idle'] * 60)))

        software_config = {}

        if self._opts['cluster_properties']:
            software_config['properties'] = _values_to_text(
                self._opts['cluster_properties'])

        # See - https://cloud.google.com/dataproc/dataproc-versions
        if self._opts['image_version']:
            software_config['image_version'] = self._opts['image_version']

        if software_config:
            cluster_config['software_config'] = software_config

        # in Python 2, dict keys loaded from JSON will be unicode, which
        # the Google protobuf objects don't like
        if PY2:
            cluster_config = _clean_json_dict_keys(cluster_config)

        kwargs = dict(project_id=self._project_id,
                      cluster_name=self._cluster_id,
                      config=cluster_config)

        return self._add_extra_cluster_params(kwargs)

    ### Dataproc-specific Stuff ###

    def _get_cluster(self, cluster_id):
        """Return the result of ``cluster_client.get_cluster()`` for
        *cluster_id* in our project and region."""
        return self.cluster_client.get_cluster(
            cluster_name=cluster_id,
            **self._project_id_and_region())

    def _create_cluster(self, cluster_data):
        """Call ``cluster_client.create_cluster()`` with *cluster_data* in
        our project and region. The client's return value is discarded."""
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/create  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/get  # noqa
        self.cluster_client.create_cluster(
            cluster=cluster_data,
            **self._project_id_and_region())

    def _delete_cluster(self, cluster_id):
        """Return the result of ``cluster_client.delete_cluster()`` for
        *cluster_id* in our project and region."""
        return self.cluster_client.delete_cluster(
            cluster_name=cluster_id,
            **self._project_id_and_region())

    def _list_jobs(self, cluster_name=None, state_matcher=None):
        """Return the result of ``job_client.list_jobs()`` for our project
        and region.

        :param cluster_name: if set, passed through as ``cluster_name``
        :param state_matcher: if set, passed through as
                              ``job_state_matcher``
        """
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/list#JobStateMatcher  # noqa
        list_kwargs = self._project_id_and_region()

        if cluster_name:
            list_kwargs['cluster_name'] = cluster_name

        if state_matcher:
            list_kwargs['job_state_matcher'] = state_matcher

        return self.job_client.list_jobs(**list_kwargs)

    def _get_job(self, job_id):
        """Return the result of ``job_client.get_job()`` for *job_id* in
        our project and region."""
        return self.job_client.get_job(
            job_id=job_id,
            **self._project_id_and_region())

    def _cancel_job(self, job_id):
        """Return the result of ``job_client.cancel_job()`` for *job_id* in
        our project and region."""
        return self.job_client.cancel_job(
            job_id=job_id,
            **self._project_id_and_region())

    def _submit_job(self, step_name, job_kwarg):
        """Submit a job to our cluster and return the result of
        ``job_client.submit_job()``.

        :param step_name: used as the ``job_id`` of the job's reference
        :param job_kwarg: dict of extra keyword arguments merged into the
                          ``job`` description
        """
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/submit  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobReference  # noqa

        submit_job_kwargs = dict(
            job=dict(
                reference=dict(project_id=self._project_id, job_id=step_name),
                placement=dict(cluster_name=self._cluster_id),
                **job_kwarg),
            **self._project_id_and_region())

        # log the exact kwargs, in sorted order
        log.debug(' submit_job(%s)' % ', '.join(
            '%s=%r' % (k, v) for k, v in sorted(submit_job_kwargs.items())))

        return self.job_client.submit_job(**submit_job_kwargs)

    def _project_id_and_region(self):
        """Return a dict with ``project_id`` and ``region`` keys, for use
        as keyword arguments to client calls. *region* falls back to
        ``'global'`` when the *region* opt is not set."""
        return dict(
            project_id=self._project_id,
            region=(self._opts['region'] or 'global'),
        )

    def _manifest_download_commands(self):
        """Return a list of ``(glob, command)`` tuples; currently every URI
        (``*://*``) maps to ``hadoop fs -copyToLocal``."""
        return [
            # TODO: SSH in and figure out how to use gsutil or similar
            # ('gs://*', 'gsutil cp'),
            ('*://*', 'hadoop fs -copyToLocal'),
        ]

    ### SSH hooks ###

    def _job_tracker_host(self):
        """Return the cluster ID with ``-m`` appended (the host that
        :py:meth:`_ssh_tunnel_args` connects to)."""
        return '%s-m' % self._cluster_id

    def _ssh_tunnel_config(self):
        """Return the module-level ``_SSH_TUNNEL_CONFIG``."""
        return _SSH_TUNNEL_CONFIG

    def _launch_ssh_proc(self, args):
        """Launch the SSH process via the superclass, write two newlines
        to its stdin, and return it."""
        ssh_proc = super(DataprocJobRunner, self)._launch_ssh_proc(args)

        # enter an empty passphrase if creating a key for the first time
        ssh_proc.stdin.write(b'\n\n')

        return ssh_proc

    def _ssh_launch_wait_secs(self):
        """Wait 20 seconds because gcloud has to update project metadata
        (unless we were going to check the cluster sooner anyway)."""
        return min(20.0, self._opts['check_cluster_every'])

    def _ssh_tunnel_args(self, bind_port):
        """Return the ``gcloud compute ssh`` command line (a list) used to
        open the SSH tunnel on *bind_port*, or ``None`` if there is no
        cluster ID yet."""
        if not self._cluster_id:
            return

        gcloud_bin = self._opts['gcloud_bin'] or ['gcloud']

        # the zone name is the last path component of the cluster's zone URI
        cluster = self._get_cluster(self._cluster_id)
        zone = cluster.config.gce_cluster_config.zone_uri.split('/')[-1]

        return gcloud_bin + [
            'compute', 'ssh',
            '--zone', zone,
            self._job_tracker_host(),
            '--',
        ] + self._ssh_tunnel_opts(bind_port)
class MRJobRunner(object): """Abstract base class for all runners""" # this class handles the basic runner framework, options and config files, # arguments to mrjobs, and setting up job working dirs and environments. # this will put files from setup scripts, py_files, and bootstrap_mrjob # into the job's working dir, but won't actually run/import them # # command lines to run substeps (including Spark) are handled by # mrjob.bin.MRJobBinRunner #: alias for this runner, used on the command line with ``-r`` alias = None # libjars is only here because the job can set it; might want to # handle this with a warning from the launcher instead OPT_NAMES = { 'bootstrap_mrjob', 'check_input_paths', 'cleanup', 'cleanup_on_failure', 'cmdenv', 'jobconf', 'label', 'libjars', 'local_tmp_dir', 'owner', 'py_files', 'read_logs', 'setup', 'upload_archives', 'upload_dirs', 'upload_files' } # re-define this as a set of step types supported by your runner _STEP_TYPES = None # if this is true, when bootstrap_mrjob is true, create a mrjob.zip # and patch it into the *py_files* option _BOOTSTRAP_MRJOB_IN_PY_FILES = True ### methods to call from your batch script ### def __init__(self, mr_job_script=None, conf_paths=None, extra_args=None, file_upload_args=None, hadoop_input_format=None, hadoop_output_format=None, input_paths=None, output_dir=None, partitioner=None, sort_values=None, stdin=None, steps=None, step_output_dir=None, **opts): """All runners take the following keyword arguments: :type mr_job_script: str :param mr_job_script: the path of the ``.py`` file containing the :py:class:`~mrjob.job.MRJob`. If this is None, you won't actually be able to :py:meth:`run` the job, but other utilities (e.g. :py:meth:`ls`) will work. :type conf_paths: None or list :param conf_paths: List of config files to combine and use, or None to search for mrjob.conf in the default locations. :type extra_args: list of str :param extra_args: a list of extra cmd-line arguments to pass to the mr_job script. 
This is a hook to allow jobs to take additional arguments. :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``. The file at the given path will be uploaded to the local directory of the mr_job script when it runs, and then passed into the script with ``--ARGNAME``. Useful for passing in SQLite DBs and other configuration files to your job. :type hadoop_input_format: str :param hadoop_input_format: name of an optional Hadoop ``InputFormat`` class. Passed to Hadoop along with your first step with the ``-inputformat`` option. Note that if you write your own class, you'll need to include it in your own custom streaming jar (see :mrjob-opt:`hadoop_streaming_jar`). :type hadoop_output_format: str :param hadoop_output_format: name of an optional Hadoop ``OutputFormat`` class. Passed to Hadoop along with your first step with the ``-outputformat`` option. Note that if you write your own class, you'll need to include it in your own custom streaming jar (see :mrjob-opt:`hadoop_streaming_jar`). :type input_paths: list of str :param input_paths: Input files for your job. Supports globs and recursively walks directories (e.g. ``['data/common/', 'data/training/*.gz']``). If this is left blank, we'll read from stdin :type output_dir: str :param output_dir: An empty/non-existent directory where Hadoop should put the final output from the job. If you don't specify an output directory, we'll output into a subdirectory of this job's temporary directory. You can control this from the command line with ``--output-dir``. This option cannot be set from configuration files. If used with the hadoop runner, this path does not need to be fully qualified with ``hdfs://`` URIs because it's understood that it has to be on HDFS. :type partitioner: str :param partitioner: Optional name of a Hadoop partitioner class, e.g. ``'org.apache.hadoop.mapred.lib.HashPartitioner'``. Hadoop streaming will use this to determine how mapper output should be sorted and distributed to reducers. 
:type sort_values: bool :param sort_values: if true, set partitioners and jobconf variables so that reducers to receive the values associated with any key in sorted order (sorted by their *encoded* value). Also known as secondary sort. :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use as stdin. This is a hook for testing; if you set ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll get passed through to the runner. If for some reason your lines are missing newlines, we'll add them; this makes it easier to write automated tests. :param steps: a list of descriptions of steps to run (see :doc:`step` for description formats) :type step_output_dir: str :param step_output_dir: An empty/non-existent directory where Hadoop should put output from all steps other than the last one (this only matters for multi-step jobs). Currently ignored by local runners. """ self._ran_job = False # opts are made from: # # empty defaults (everything set to None) # runner-specific defaults # opts from config file(s) # opts from command line self._opts = self._combine_confs( [(None, {key: None for key in self.OPT_NAMES})] + [(None, self._default_opts())] + load_opts_from_mrjob_confs(self.alias, conf_paths) + [('the command line', opts)] ) log.debug('Active configuration:') log.debug(pprint.pformat({ opt_key: self._obfuscate_opt(opt_key, opt_value) for opt_key, opt_value in self._opts.items() })) self._fs = None # a local tmp directory that will be cleaned up when we're done # access/make this using self._get_local_tmp_dir() self._local_tmp_dir = None self._working_dir_mgr = WorkingDirManager() # mapping from dir to path for corresponding archive. 
we pick # paths during init(), but don't actually create the archives # until self._create_dir_archives() is called self._dir_to_archive_path = {} # dir archive names (the filename minus ".tar.gz") already taken self._dir_archive_names_taken = set() # set of dir_archives that have actually been created self._dir_archives_created = set() # track (name, path) of files and archives to upload to spark # if not using a setup script. self._spark_files = [] self._spark_archives = [] # set this to an :py:class:`~mrjob.setup.UploadDirManager` in # runners that upload files to HDFS, S3, etc. self._upload_mgr = None self._script_path = mr_job_script if self._script_path: self._working_dir_mgr.add('file', self._script_path) # give this job a unique name self._job_key = self._make_unique_job_key() # extra args to our job self._extra_args = list(extra_args) if extra_args else [] for extra_arg in self._extra_args: if isinstance(extra_arg, dict): if extra_arg.get('type') != 'file': raise NotImplementedError self._working_dir_mgr.add(**extra_arg) self._spark_files.append( (extra_arg['name'], extra_arg['path'])) # extra file arguments to our job if file_upload_args: log.warning('file_upload_args is deprecated and will be removed' ' in v0.6.0. 
Pass dicts to extra_args instead.') for arg, path in file_upload_args: arg_file = parse_legacy_hash_path('file', path) self._working_dir_mgr.add(**arg_file) self._extra_args.extend([arg, arg_file]) self._spark_files.append((arg_file['name'], arg_file['path'])) # set up uploading for hash_path in self._opts['upload_files']: uf = parse_legacy_hash_path('file', hash_path, must_name='upload_files') self._working_dir_mgr.add(**uf) self._spark_files.append((uf['name'], uf['path'])) for hash_path in self._opts['upload_archives']: ua = parse_legacy_hash_path('archive', hash_path, must_name='upload_archives') self._working_dir_mgr.add(**ua) self._spark_archives.append((ua['name'], ua['path'])) for hash_path in self._opts['upload_dirs']: # pick name based on directory path ud = parse_legacy_hash_path('dir', hash_path, must_name='upload_archives') # but feed working_dir_mgr the archive's path archive_path = self._dir_archive_path(ud['path']) self._working_dir_mgr.add( 'archive', archive_path, name=ud['name']) self._spark_archives.append((ud['name'], archive_path)) # Where to read input from (log files, etc.) 
self._input_paths = input_paths or ['-'] # by default read from stdin if PY2: self._stdin = stdin or sys.stdin else: self._stdin = stdin or sys.stdin.buffer self._stdin_path = None # temp file containing dump from stdin # where to keep the input manifest self._input_manifest_path = None # store output_dir self._output_dir = output_dir # store partitioner self._partitioner = partitioner # store sort_values self._sort_values = sort_values # store step_output_dir self._step_output_dir = step_output_dir # store hadoop input and output formats self._hadoop_input_format = hadoop_input_format self._hadoop_output_format = hadoop_output_format # check and store *steps* self._steps = None if steps is None: if not mr_job_script: self._steps = [] # otherwise we'll load steps on-the-fly, see _load_steps() else: self._check_steps(steps) self._steps = copy.deepcopy(steps) # this variable marks whether a cleanup has happened and this runner's # output stream is no longer available. self._closed = False ### Options #### def _default_opts(self): try: owner = getpass.getuser() except: owner = None return dict( check_input_paths=True, cleanup=['ALL'], cleanup_on_failure=['NONE'], owner=owner, ) def _combine_confs(self, source_and_opt_list): """Combine several opt dictionaries into one. *source_and_opt_list* is a list of tuples of *source*, *opts* where *opts* is a dictionary and *source* is either None or a description of where the opts came from (usually a path). Only override this if you need truly fine-grained control, including knowledge of the options' source. """ opt_list = [ self._fix_opts(opts, source) for source, opts in source_and_opt_list ] return self._combine_opts(opt_list) def _combine_opts(self, opt_list): """Combine several opt dictionaries into one. *opt_list* is a list of dictionaries containing validated options Override this if you need to base options off the values of other options, but don't need to issue warnings etc. about the options' source. 
""" return combine_opts(self._opt_combiners(), *opt_list) def _opt_combiners(self): """A dictionary mapping opt name to combiner funciton. This won't necessarily include every opt name (we default to :py:func:`~mrjob.conf.combine_value`). """ return _combiners(self.OPT_NAMES) def _fix_opts(self, opts, source=None): """Take an options dictionary, and either return a sanitized version of it, or raise an exception. *source* is either a string describing where the opts came from or None. This ensures that opt dictionaries are really dictionaries and handles deprecated options. """ if source is None: source = 'defaults' # defaults shouldn't trigger warnings if not isinstance(opts, dict): raise TypeError( 'options for %s (from %s) must be a dict' % (self.alias, source)) deprecated_aliases = _deprecated_aliases(self.OPT_NAMES) results = {} for k, v in sorted(opts.items()): # rewrite deprecated aliases if k in deprecated_aliases: if v is None: # don't care continue aliased_opt = deprecated_aliases log.warning('Deprecated option %s (from %s) has been renamed' ' to %s and will be removed in v0.7.0' % ( k, source, aliased_opt)) if opts.get(aliased_opt) is not None: return # don't overwrite non-aliased opt k = aliased_opt if k in self.OPT_NAMES: results[k] = None if v is None else self._fix_opt(k, v, source) elif v: log.warning('Unexpected option %s (from %s)' % (k, source)) return results def _fix_opt(self, opt_key, opt_value, source): """Fix a single option, returning its correct value or raising an exception. This is not called for options that are ``None``. This currently handles cleanup opts. Override this if you require additional opt validation or cleanup. 
""" if opt_key in ('cleanup', 'cleanup_on_failure'): return self._fix_cleanup_opt(opt_key, opt_value, source) else: return opt_value def _fix_cleanup_opt(self, opt_key, opt_value, source): """Fix a cleanup option, or raise ValueError.""" if isinstance(opt_value, string_types): opt_value = [opt_value] if 'NONE' in opt_value and len(set(opt_value)) > 1: raise ValueError( 'Cannot clean up both nothing and something!' ' (%s option from %s)' % (opt_key, source)) for cleanup_type in opt_value: if cleanup_type not in CLEANUP_CHOICES: raise ValueError( '%s must be one of %s, not %s (from %s)' % ( opt_key, ', '.join(CLEANUP_CHOICES), opt_value, source)) return opt_value def _obfuscate_opt(self, opt_key, opt_value): """Return value of opt to show in debug printout. Used to obfuscate credentials, etc.""" return opt_value ### Filesystem object ### @property def fs(self): """:py:class:`~mrjob.fs.base.Filesystem` object for the local filesystem. """ if self._fs is None: # wrap LocalFilesystem in LocalFilesystem to get IOError # on URIs (see #1185) self._fs = CompositeFilesystem() self._fs.add_fs('local', LocalFilesystem()) return self._fs ### Running the job and parsing output ### def run(self): """Run the job, and block until it finishes. Raise :py:class:`~mrjob.step.StepFailedException` if there are any problems (except on :py:class:`~mrjob.inline.InlineMRJobRunner`, where we raise the actual exception that caused the step to fail). 
""" if self._ran_job: raise ValueError('Job already ran!') if self._num_steps() == 0: raise ValueError('Job has no steps!') self._create_dir_archives() # TODO: no point in checking input paths if we're going to # make a manifest out of them self._check_input_paths() self._add_input_files_for_upload() self._create_input_manifest_if_needed() self._run() self._ran_job = True last_step = self._get_steps()[-1] # only print this message if the last step uses our output dir if 'args' not in last_step or OUTPUT in last_step['args']: log.info('job output is in %s' % self._output_dir) def cat_output(self): """Stream the jobs output, as a stream of ``bytes``. If there are multiple output files, there will be an empty bytestring (``b''``) between them. .. versionadded:: 0.6.0 In previous versions, you'd use :py:meth:`stream_output`. """ output_dir = self.get_output_dir() if output_dir is None: raise ValueError('Run the job before streaming output') if self._closed is True: log.warning( 'WARNING! Trying to stream output from a closed runner, output' ' will probably be empty.') log.info('Streaming final output from %s...' % output_dir) def split_path(path): while True: base, name = os.path.split(path) # no more elements if not name: break yield name path = base def ls_output(): for filename in self.fs.ls(output_dir): subpath = filename[len(output_dir):] # Hadoop ignores files and dirs inside the output dir # whose names start with '_' or '.'. See #1337. if not (any(name[0] in '_.' for name in split_path(subpath))): yield filename for i, filename in enumerate(ls_output()): if i > 0: yield b'' # EOF of previous file for chunk in self.fs._cat_file(filename): yield chunk def stream_output(self): """Like :py:meth:`cat_output` except that it groups bytes into lines. Equivalent to ``mrjob.util.to_lines(runner.cat_output())``. .. deprecated:: 0.6.0 """ log.warning('stream_output() is deprecated and will be removed in' ' v0.7.0. 
use mrjob.util.to_lines(runner.cat_output())' ' instead.') return to_lines(self.cat_output()) def _cleanup_mode(self, mode=None): """Actual cleanup action to take based on various options""" if self._script_path and not self._ran_job: return mode or self._opts['cleanup_on_failure'] else: return mode or self._opts['cleanup'] def _cleanup_cloud_tmp(self): """Cleanup any files/directories on cloud storage (e.g. S3) we created while running this job. Should be safe to run this at any time, or multiple times. """ pass # only EMR runner does this def _cleanup_hadoop_tmp(self): """Cleanup any files/directories on HDFS we created while running this job. Should be safe to run this at any time, or multiple times. """ pass # only Hadoop runner does this def _cleanup_local_tmp(self): """Cleanup any files/directories on the local machine we created while running this job. Should be safe to run this at any time, or multiple times. This particular function removes any local tmp directories added to the list self._local_tmp_dirs This won't remove output_dir if it's outside of our tmp dir. """ if self._local_tmp_dir: log.info('Removing temp directory %s...' % self._local_tmp_dir) try: rmtree(self._local_tmp_dir) except OSError as e: log.exception(e) self._local_tmp_dir = None def _cleanup_cluster(self): """Terminate the cluster if there is one.""" pass # this only happens on EMR def _cleanup_logs(self): """Cleanup any log files that are created as a side-effect of the job. """ pass # this only happens on EMR def _cleanup_job(self): """Stop any jobs that we created that are still running.""" pass # currently disabled (see #1241) def cleanup(self, mode=None): """Clean up running jobs, temp files, and logs, subject to the *cleanup* option passed to the constructor. If you create your runner in a ``with`` block, :py:meth:`cleanup` will be called automatically:: with mr_job.make_runner() as runner: ... 
# cleanup() called automatically here :param mode: override *cleanup* passed into the constructor. Should be a list of strings from :py:data:`~mrjob.options.CLEANUP_CHOICES` """ mode = self._cleanup_mode(mode) def mode_has(*args): return any((choice in mode) for choice in args) if self._script_path and not self._ran_job: if mode_has('CLUSTER', 'ALL'): self._cleanup_cluster() if mode_has('JOB', 'ALL'): self._cleanup_job() if mode_has('ALL', 'TMP', 'CLOUD_TMP'): self._cleanup_cloud_tmp() if mode_has('ALL', 'TMP', 'HADOOP_TMP'): self._cleanup_hadoop_tmp() if mode_has('ALL', 'TMP', 'LOCAL_TMP'): self._cleanup_local_tmp() if mode_has('ALL', 'LOGS'): self._cleanup_logs() self._closed = True def counters(self): """Get counters associated with this run in this form:: [{'group name': {'counter1': 1, 'counter2': 2}}, {'group name': ...}] The list contains an entry for every step of the current job. """ raise NotImplementedError ### hooks for the with statement ### def __enter__(self): """Don't do anything special at start of with block""" return self def __exit__(self, type, value, traceback): """Call self.cleanup() at end of with block.""" self.cleanup() ### more runner information ### def get_opts(self): """Get options set for this runner, as a dict.""" log.warning('get_opts() is deprecated and will be removed in v0.7.0') return copy.deepcopy(self._opts) def get_job_key(self): """Get the unique key for the job run by this runner. This has the format ``label.owner.date.time.microseconds`` """ return self._job_key def get_output_dir(self): """Find the directory containing the job output. If the job hasn't run yet, returns None""" if self._script_path and not self._ran_job: return None return self._output_dir ### other methods you need to implement in your subclass ### def get_hadoop_version(self): """Return the version number of the Hadoop environment as a string if Hadoop is being used or simulated. Return None if not applicable. 
:py:class:`~mrjob.emr.EMRJobRunner` infers this from the cluster. :py:class:`~mrjob.hadoop.HadoopJobRunner` gets this from ``hadoop version``. :py:class:`~mrjob.local.LocalMRJobRunner` has an additional `hadoop_version` option to specify which version it simulates. :py:class:`~mrjob.inline.InlineMRJobRunner` does not simulate Hadoop at all. """ return None # you'll probably wan't to add your own __init__() and cleanup() as well def _run(self): """Run the job.""" raise NotImplementedError ### internal utilities for implementing MRJobRunners ### def _get_local_tmp_dir(self): """Create a tmp directory on the local filesystem that will be cleaned up by self.cleanup()""" if not self._local_tmp_dir: tmp_dir = (self._opts['local_tmp_dir'] or tempfile.gettempdir()) path = os.path.join(tmp_dir, self._job_key) log.info('Creating temp directory %s' % path) if os.path.isdir(path): rmtree(path) os.makedirs(path) self._local_tmp_dir = path return self._local_tmp_dir def _make_unique_job_key(self, label=None, owner=None): """Come up with a useful unique ID for this job. Optionally, you can specify a custom label or owner (otherwise we use :py:meth:`_label` and :py:meth:`_owner`. We use this to choose the output directory, etc. for the job. 
""" if label is None: label = self._label() if owner is None: owner = self._owner() now = datetime.datetime.utcnow() return '%s.%s.%s.%06d' % ( label, owner, now.strftime('%Y%m%d.%H%M%S'), now.microsecond) def _label(self): """Return *label* opt, or if not set, the name of the file containing the MRJob, minus extension, or if none, ``'no_script'``""" if self._opts['label']: return self._opts['label'] elif self._script_path: return os.path.basename(self._script_path).split('.')[0] else: return 'no_script' def _owner(self): """Return *owner* opt (which defaults to :py:func:`getpass.getuser`), or ``'no_user'`` if not set.""" if self._opts['owner']: # owner opt defaults to getpass.getuser() return self._opts['owner'] else: return 'no_user' def _get_steps(self): """If *steps* was not set at init time, call the job script to find out how many steps it has, and whether there are mappers and reducers for each step. Validate its output. Returns output as described in :ref:`steps-format`. """ if self._steps is None: log.warning( 'querying jobs for steps is deprecated and' ' will go away in v0.7.0') steps = self._load_steps() self._check_steps(steps) self._steps = steps return self._steps def _load_steps(self): """Ask job how many steps it has, and whether there are mappers and reducers for each step. Returns output as described in :ref:`steps-format`. If this is called, you can assume self._script_path is set. """ raise NotImplementedError def _check_steps(self, steps): """Look at the step definition (*steps*). If it is not supported by the runner, raise :py:class:`NotImplementedError`. If it is not supported by mrjob, raise :py:class:`ValueError`. """ if not self._STEP_TYPES: # use __class__.__name__ because only MRJobRunner would # trigger this raise NotImplementedError( '%s cannot run steps!' 
% self.__class__.__name__) for step_num, step in enumerate(steps): self._check_step(step, step_num) def _check_step(self, step, step_num): """Raise an exception if the given step is invalid (:py:class:`ValueError`) or not handled by this runner (:py:class:`NotImplementedError`). By default, we check that *step* has a support step type, only uses an input manifest if it's the first step, and that :py:attr:`_script_path` exists if necessary. You can re-define this in your subclass. """ if step.get('type') not in self._STEP_TYPES: raise NotImplementedError( 'step %d has type %r, but %s runner only supports:' ' %s' % (step_num, step.get('type'), self.alias, ', '.join(sorted(self._STEP_TYPES)))) if step.get('input_manifest') and step_num != 0: raise ValueError( 'step %d may not take an input manifest (only' ' first step can' % step_num) # some step types assume a MRJob script if not self._script_path: if step['type'] == 'spark': raise ValueError( "SparkStep (step %d) can't run without a MRJob script" " (try SparkScriptStep instead)" % step_num) elif step['type'] == 'streaming': for mrc in ('mapper', 'combiner', 'reducer'): if not step.get(mrc): continue substep = step[mrc] if substep['type'] == 'script': raise ValueError( "%s (step %d) can't run without a MRJob" " script" % (mrc, step_num)) def _get_step(self, step_num): """Get a single step (calls :py:meth:`_get_steps`).""" return self._get_steps()[step_num] def _num_steps(self): """Get the number of steps (calls :py:meth:`get_steps`).""" return len(self._get_steps()) def _uses_input_manifest(self): """Does the first step take an input manifest?""" return bool(self._get_step(0).get('input_manifest')) def _has_streaming_steps(self): """Are any of our steps Hadoop Streaming steps?""" return any(step['type'] == 'streaming' for step in self._get_steps()) def _has_spark_steps(self): """Are any of our steps Spark steps? (e.g. 
spark, spark_jar, spark_script) Generally used to determine if we need to install Spark on a cluster. """ return any(_is_spark_step_type(step['type']) for step in self._get_steps()) def _has_pyspark_steps(self): """Do any of our steps involve running Python on Spark? Includes spark and spark_script types, but not spark_jar. Generally used to tell if we need a Spark setup script. """ return any(_is_pyspark_step_type(step['type']) for step in self._get_steps()) def _args_for_task(self, step_num, mrc): return [ '--step-num=%d' % step_num, '--%s' % mrc, ] + self._mr_job_extra_args() def _mr_job_extra_args(self, local=False): """Return arguments to add to every invocation of MRJob. :type local: boolean :param local: if this is True, use files' local paths rather than the path they'll have inside Hadoop streaming """ result = [] for extra_arg in self._extra_args: if isinstance(extra_arg, dict): if local: result.append(extra_arg['path']) else: result.append(self._working_dir_mgr.name(**extra_arg)) else: result.append(extra_arg) return result def _dir_archive_path(self, dir_path): """Assign a path for the archive of *dir_path* but don't actually create anything.""" if dir_path not in self._dir_to_archive_path: # we can check local paths now if not (is_uri(dir_path) or os.path.isdir(dir_path)): raise OSError('%s is not a directory!' 
% dir_path) name = name_uniquely( dir_path, names_taken=self._dir_archive_names_taken) self._dir_archive_names_taken.add(name) self._dir_to_archive_path[dir_path] = os.path.join( self._get_local_tmp_dir(), 'archives', name + '.tar.gz') return self._dir_to_archive_path[dir_path] def _create_dir_archives(self): """Call this to create all dir archives""" for dir_path in sorted(set(self._dir_to_archive_path)): self._create_dir_archive(dir_path) def _create_dir_archive(self, dir_path): """Helper for :py:meth:`archive_dir`""" if not self.fs.exists(dir_path): raise OSError('%s does not exist') tar_gz_path = self._dir_archive_path(dir_path) if tar_gz_path in self._dir_archives_created: return # already created if not os.path.isdir(os.path.dirname(tar_gz_path)): os.makedirs(os.path.dirname(tar_gz_path)) # for remote files tmp_download_path = os.path.join( self._get_local_tmp_dir(), 'tmp-download') log.info('Archiving %s -> %s' % (dir_path, tar_gz_path)) with tarfile.open(tar_gz_path, mode='w:gz') as tar_gz: for path in self.fs.ls(dir_path): # fs.ls() only lists files if path == dir_path: raise OSError('%s is a file, not a directory!' % dir_path) # TODO: do we need this? if os.path.realpath(path) == os.path.realpath(tar_gz_path): raise OSError( 'attempted to archive %s into itself!' 
% tar_gz_path) if is_uri(path): path_in_tar_gz = path[len(dir_path):].lstrip('/') log.info(' downloading %s -> %s' % ( path, tmp_download_path)) with open(tmp_download_path, 'wb') as f: for chunk in self.fs.cat(path): f.write(chunk) local_path = tmp_download_path else: path_in_tar_gz = path[len(dir_path):].lstrip(os.sep) local_path = path log.debug(' adding %s to %s' % (path, tar_gz_path)) tar_gz.add(local_path, path_in_tar_gz, recursive=False) self._dir_archives_created.add(tar_gz_path) def _bootstrap_mrjob(self): """Should we bootstrap mrjob?""" if self._opts['bootstrap_mrjob'] is None: return self._opts['interpreter'] is None else: return bool(self._opts['bootstrap_mrjob']) def _get_input_paths(self): """Get the paths to input files, dumping STDIN to a local file if need be.""" if self._input_manifest_path: return [self._input_manifest_path] if '-' in self._input_paths: if self._stdin_path is None: # prompt user, so they don't think the process has stalled log.info('reading from STDIN') stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN') log.debug('dumping stdin to local file %s' % stdin_path) with open(stdin_path, 'wb') as stdin_file: for line in self._stdin: # catch missing newlines (often happens with test data) if not line.endswith(b'\n'): line += b'\n' stdin_file.write(line) self._stdin_path = stdin_path return [self._stdin_path if p == '-' else p for p in self._input_paths] def _create_input_manifest_if_needed(self): """Create a file with a list of URIs of input files.""" if self._input_manifest_path or not self._uses_input_manifest(): return uris = [] log.info('finding input files to add to manifest...') for path in self._get_input_paths(): log.debug(' in %s' % path) if is_uri(path): # URIs might be globs for uri in self.fs.ls(path): uris.append(uri) else: # local paths are expected to be single files # (shell would resolve globs) if self._upload_mgr: uris.append(self._upload_mgr.uri(path)) else: # just make sure job can find files from it's 
working dir uris.append(os.path.abspath(path)) log.info('found %d input files' % len(uris)) path = os.path.join(self._get_local_tmp_dir(), 'input-manifest.txt') self._write_script(uris, path, 'input manifest') self._input_manifest_path = path if self._upload_mgr: self._upload_mgr.add(self._input_manifest_path) def _check_input_paths(self): """Check that input exists prior to running the job, if the `check_input_paths` option is true.""" if not self._opts['check_input_paths']: return for path in self._input_paths: self._check_input_path(path) def _check_input_path(self, path): """Raise :py:class:`IOError` if the given input does not exist or is otherwise invalid. Override this to provide custom check behavior.""" if path == '-': return # STDIN always exists if not self.fs.can_handle_path(path): return # no way to check (e.g. non-S3 URIs on EMR) if not self.fs.exists(path): raise IOError( 'Input path %s does not exist!' % (path,)) def _add_input_files_for_upload(self): """If there is an upload manager, add input files to it.""" if self._upload_mgr: for path in self._get_input_paths(): self._upload_mgr.add(path) def _intermediate_output_dir(self, step_num, local=False): """A directory for intermediate output for the given step number.""" join = os.path.join if local else posixpath.join return join( self._step_output_dir or self._default_step_output_dir(), '%04d' % step_num) def _default_step_output_dir(self): """Where to put output for steps other than the last one, if not specified by the *output_dir* constructor keyword. Usually you want this to be on HDFS (most efficient). Define this in your runner subclass. """ raise NotImplementedError def _step_input_uris(self, step_num): """A list of URIs to use as input for the given step. 
For all except the first step, this list will have a single item (a directory).""" if step_num == 0: return [self._upload_mgr.uri(path) if self._upload_mgr else to_uri(path) for path in self._get_input_paths()] else: return [to_uri(self._intermediate_output_dir(step_num - 1))] def _step_output_uri(self, step_num): """URI to use as output for the given step. This is either an intermediate dir (see :py:meth:`intermediate_output_uri`) or ``self._output_dir`` for the final step.""" if step_num == len(self._get_steps()) - 1: return to_uri(self._output_dir) else: return to_uri(self._intermediate_output_dir(step_num)) def _jobconf_for_step(self, step_num): """Get the jobconf dictionary, optionally including step-specific jobconf info. Also translate jobconfs to the current Hadoop version, if necessary. """ step = self._get_step(step_num) # _sort_values_jobconf() isn't relevant to Spark, # but it doesn't do any harm either jobconf = combine_jobconfs(self._sort_values_jobconf(), self._opts['jobconf'], step.get('jobconf')) # if user is using the wrong jobconfs, add in the correct ones # and log a warning hadoop_version = self.get_hadoop_version() if hadoop_version: jobconf = translate_jobconf_dict(jobconf, hadoop_version) return jobconf def _sort_values_jobconf(self): """Jobconf dictionary to enable sorting by value. 
""" if not self._sort_values: return {} # translate _SORT_VALUES_JOBCONF to the correct Hadoop version, # without logging a warning hadoop_version = self.get_hadoop_version() jobconf = {} for k, v in _SORT_VALUES_JOBCONF.items(): if hadoop_version: jobconf[translate_jobconf(k, hadoop_version)] = v else: for j in translate_jobconf_for_all_versions(k): jobconf[j] = v return jobconf def _sort_values_partitioner(self): """Partitioner to use with *sort_values* keyword to the constructor.""" if self._sort_values: return _SORT_VALUES_PARTITIONER else: return None def _upload_args(self): # just upload every file and archive in the working dir manager return self._upload_args_helper('-files', None, '-archives', None) def _upload_args_helper( self, files_opt_str, files, archives_opt_str, archives, always_use_hash=True): args = [] file_hash_paths = list( self._arg_hash_paths('file', files, always_use_hash=always_use_hash)) if file_hash_paths: args.append(files_opt_str) args.append(','.join(file_hash_paths)) archive_hash_paths = list( self._arg_hash_paths('archive', archives, always_use_hash=always_use_hash)) if archive_hash_paths: args.append(archives_opt_str) args.append(','.join(archive_hash_paths)) return args def _arg_hash_paths(self, type, named_paths=None, always_use_hash=True): """Helper function for the *upload_args methods.""" if named_paths is None: # just return everything managed by _working_dir_mgr named_paths = sorted( self._working_dir_mgr.name_to_path(type).items()) for name, path in named_paths: if not name: name = self._working_dir_mgr.name(type, path) if self._upload_mgr: uri = self._upload_mgr.uri(path) else: uri = path if not always_use_hash and _basename(uri) == name: yield uri else: yield '%s#%s' % (uri, name) def _write_script(self, lines, path, description): """Write text of a setup script, input manifest, etc. to the given file. By default, this writes binary data. Redefine :py:meth:`write_lines` to use other line endings. 
:param lines: a list of lines as ``str`` :param path: path of file to write to :param description: what we're writing to, for debug messages """ log.debug('Writing %s to %s:' % (description, path)) for line in lines: log.debug(' ' + line) self._write_script_lines(lines, path) def _write_script_lines(self, lines, path): """Write text to the given file. By default, this writes binary data, but can be redefined to use local line endings.""" with open(path, 'wb') as f: for line in lines: f.write((line + '\n').encode('utf-8'))
def __init__(self, **kwargs):
    """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same
    arguments as
    :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
    which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.

    Besides validating options, this loads Google credentials, settles
    on a project ID and a region/zone, and picks the cloud tmp dir and
    output dir. No cluster is created here.

    :raises ImportError: if the Google client libraries could not be
                         imported (``google`` is ``None``)
    :raises DataprocException: if *num_core_instances* is too small, if
                               the core and task instance types differ,
                               or if no project ID can be determined
    """
    super(DataprocJobRunner, self).__init__(**kwargs)

    # check for library support
    if google is None:
        raise ImportError(
            'You must install google-cloud-logging and '
            'google-cloud-storage to connect to Dataproc')

    # Dataproc requires a master and >= 2 core instances
    # num_core_instances refers ONLY to number of CORE instances and does
    # NOT include the required 1 instance for master
    # In other words, minimum cluster size is 3 machines, 1 master and 2
    # "num_core_instances" workers
    if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS:
        raise DataprocException(
            'Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS)

    if (self._opts['core_instance_type'] !=
            self._opts['task_instance_type']):
        raise DataprocException(
            'Dataproc v1 expects core/task instance types to be identical')

    # see #1820
    if self._opts['image_id']:
        log.warning('mrjob does not yet support custom machine images'
                    ' on Dataproc')

    # load credentials and project ID
    self._credentials, auth_project_id = google.auth.default(
        scopes=[_FULL_SCOPE])  # needed for $GOOGLE_APPLICATION_CREDENTIALS

    # an explicitly configured project_id wins over the one that came
    # with the credentials
    self._project_id = self._opts['project_id'] or auth_project_id

    if not self._project_id:
        raise DataprocException(
            'project_id must be set. Use --project_id or'
            ' set $GOOGLE_CLOUD_PROJECT')

    # must happen before _get_tmpdir() below, which reads the region
    self._fix_zone_and_region_opts()

    if self._opts['service_account_scopes']:
        self._opts['service_account_scopes'] = [
            _fully_qualify_scope_uri(s)
            for s in self._opts['service_account_scopes']
        ]

    # cluster_id can be None here
    self._cluster_id = self._opts['cluster_id']

    self._api_client = None
    self._gcs_fs = None
    # must be initialized before _get_tmpdir() below, which may use
    # self.fs (the fs property checks whether self._fs is None)
    self._fs = None

    # BEGIN - setup directories
    base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir'])

    self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir)

    # use job key to make a unique tmp dir
    self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/'

    # pick/validate output dir
    if self._output_dir:
        self._output_dir = _check_and_fix_fs_dir(self._output_dir)
    else:
        self._output_dir = self._job_tmpdir + 'output/'
    # END - setup directories

    # manage local files that we want to upload to GCS. We'll add them
    # to this manager just before we need them.
    fs_files_dir = self._job_tmpdir + 'files/'
    self._upload_mgr = UploadDirManager(fs_files_dir)

    # when did our particular task start?
    self._dataproc_job_start = None

    # init hadoop, ami version caches
    # (filled in lazily by _store_cluster_info())
    self._image_version = None
    self._hadoop_version = None

    # map driver_output_uri to a dict with the keys:
    # log_uri: uri of file we're reading from
    # pos: position in file
    # buffer: bytes read from file already
    self._driver_output_state = {}

    # This will be filled by _run_steps()
    # NOTE - log_interpretations will be empty except job_id until we
    # parse task logs
    self._log_interpretations = []
@property
def fs(self):
    """:py:class:`~mrjob.fs.base.Filesystem` object for GCS and the
    local filesystem.

    Built on first access and cached in ``self._fs``. The GCS
    filesystem is given the runner's credentials and project ID, and
    uses the cluster's region (see :py:meth:`_region`) as its location.
    """
    if self._fs is None:
        # Assemble the filesystem completely before caching it. If
        # building the GCS filesystem raises, self._fs stays None rather
        # than holding an empty CompositeFilesystem that every later
        # access would silently return.
        fs = CompositeFilesystem()

        fs.add_fs('gcs', GCSFilesystem(
            credentials=self._credentials,
            project_id=self._project_id,
            part_size=self._upload_part_size(),
            # region set by the user, or derived from their zone
            location=self._region(),
            object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS,
        ))

        fs.add_fs('local', LocalFilesystem())

        self._fs = fs

    return self._fs
12, 2018, this is still true, # observed on google-cloud-sdk) if tmp_bucket.location.lower() == region: # Regions are both specified and match log.info("using existing temp bucket %s" % tmp_bucket_name) chosen_bucket_name = tmp_bucket_name break # Example default - "mrjob-us-central1-RANDOMHEX" if not chosen_bucket_name: chosen_bucket_name = '-'.join( ['mrjob', region, random_identifier()]) return 'gs://%s/tmp/' % chosen_bucket_name def _region(self): # region of cluster, which is either the region set by the user, # or the region derived from the zone they set. # used to pick bucket location and name cluster return self._opts['region'] or _zone_to_region(self._opts['zone']) def _run(self): self._launch() self._run_steps() def _launch(self): self._prepare_for_launch() self._launch_cluster() def _prepare_for_launch(self): self._check_output_not_exists() self._create_setup_wrapper_scripts() self._add_bootstrap_files_for_upload() self._add_job_files_for_upload() self._upload_local_files() self._wait_for_fs_sync() def _check_output_not_exists(self): """Verify the output path does not already exist. This avoids provisioning a cluster only to have Hadoop refuse to launch. """ if self.fs.exists(self._output_dir): raise IOError( 'Output path %s already exists!' % (self._output_dir,)) def _add_bootstrap_files_for_upload(self): """Add files needed by the bootstrap script to self._upload_mgr. Tar up mrjob if bootstrap_mrjob is True. Create the master bootstrap script if necessary. 
""" # lazily create mrjob.zip if self._bootstrap_mrjob(): self._create_mrjob_zip() self._bootstrap_dir_mgr.add('file', self._mrjob_zip_path) # all other files needed by the script are already in # _bootstrap_dir_mgr for path in self._bootstrap_dir_mgr.paths(): self._upload_mgr.add(path) # now that we know where the above files live, we can create # the master bootstrap script self._create_master_bootstrap_script_if_needed() if self._master_bootstrap_script_path: self._upload_mgr.add(self._master_bootstrap_script_path) def _add_job_files_for_upload(self): """Add files needed for running the job (setup and input) to self._upload_mgr.""" if self._opts['hadoop_streaming_jar']: self._upload_mgr.add(self._opts['hadoop_streaming_jar']) for step in self._get_steps(): if step.get('jar'): self._upload_mgr.add(step['jar']) ### Running the job ### def cleanup(self, mode=None): super(DataprocJobRunner, self).cleanup(mode=mode) # close our SSH tunnel, if any self._kill_ssh_tunnel() # stop the cluster if it belongs to us (it may have stopped on its # own already, but that's fine) if self._cluster_id and not self._opts['cluster_id']: self._cleanup_cluster() def _cleanup_cloud_tmp(self): # delete all the files we created if not self._job_tmpdir: return try: log.info('Removing all files in %s' % self._job_tmpdir) self.fs.rm(self._job_tmpdir) self._job_tmpdir = None except Exception as e: log.exception(e) # TODO - mtai @ davidmarin - Re-enable log support and supporting cleanup def _cleanup_logs(self): super(DataprocJobRunner, self)._cleanup_logs() def _cleanup_job(self): job_prefix = self._dataproc_job_prefix() for job in self._list_jobs( cluster_name=self._cluster_id, state_matcher=_STATE_MATCHER_ACTIVE): # Kill all active jobs with the same job_prefix as this job job_id = job.reference.job_id if not job_id.startswith(job_prefix): continue self._cancel_job(job_id) self._wait_for_api('job cancellation') def _cleanup_cluster(self): if not self._cluster_id: # If we don't have a 
cluster, then we can't terminate it. return try: log.info("Attempting to terminate cluster") self._delete_cluster(self._cluster_id) except Exception as e: log.exception(e) return log.info('cluster %s successfully terminated' % self._cluster_id) def _wait_for_api(self, msg): _wait_for(msg, self._opts['check_cluster_every']) def _wait_for_fs_sync(self): """Sleep for a little while, to give FS a chance to sync up. """ _wait_for('GCS sync (eventual consistency)', self._opts['cloud_fs_sync_secs']) def _streaming_step_job_kwarg(self, step_num): """Returns a map from ``'hadoop_job'`` to a dict representing a hadoop streaming job. """ return dict( hadoop_job=dict( args=self._hadoop_streaming_jar_args(step_num), main_jar_file_uri=self._hadoop_streaming_jar_uri(), ) ) def _jar_step_job_kwarg(self, step_num): """Returns a map from ``'hadoop_job'`` to a dict representing a Hadoop job that runs a JAR""" step = self._get_step(step_num) hadoop_job = {} hadoop_job['args'] = ( self._interpolate_jar_step_args(step['args'], step_num)) jar_uri = self._upload_mgr.uri(step['jar']) # can't specify main_class and main_jar_file_uri; see # https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs#HadoopJob # noqa if step.get('main_class'): hadoop_job['jar_file_uris'] = [jar_uri] hadoop_job['main_class'] = step['main_class'] else: hadoop_job['main_jar_file_uri'] = jar_uri return dict(hadoop_job=hadoop_job) def _hadoop_streaming_jar_uri(self): if self._opts['hadoop_streaming_jar']: return self._upload_mgr.uri(self._opts['hadoop_streaming_jar']) else: return _HADOOP_STREAMING_JAR_URI def _launch_cluster(self): """Create an empty cluster on Dataproc, and set self._cluster_id to its ID.""" self.fs.mkdir(self._job_tmpdir) # clusterName must be a match of # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).' 
# as documented in an API error message # (not currently documented in the Dataproc docs) if not self._cluster_id: self._cluster_id = '-'.join( ['mrjob', self._region(), random_identifier()]) # Create the cluster if it's missing, otherwise join an existing one try: self._get_cluster(self._cluster_id) log.info('Adding job to existing cluster - %s' % self._cluster_id) except google.api_core.exceptions.NotFound: log.info( 'Creating Dataproc Hadoop cluster - %s' % self._cluster_id) cluster_data = self._cluster_create_kwargs() self._create_cluster(cluster_data) self._wait_for_cluster_ready(self._cluster_id) self._set_up_ssh_tunnel() # keep track of when we launched our job self._dataproc_job_start = time.time() return self._cluster_id def _wait_for_cluster_ready(self, cluster_id): # See https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters#State # noqa cluster_state = None # Poll until cluster is ready while cluster_state not in ('RUNNING', 'UPDATING'): cluster = self._get_cluster(cluster_id) cluster_state = cluster.status.State.Name(cluster.status.state) if cluster_state in ('ERROR', 'DELETING'): raise DataprocException(cluster) self._wait_for_api('cluster to accept jobs') return cluster_id def _dataproc_job_prefix(self): return _cleanse_gcp_job_id(self._job_key) def _run_steps(self): """Wait for every step of the job to complete, one by one.""" total_steps = self._num_steps() # define out steps for step_num in range(total_steps): job_id = self._launch_step(step_num) self._wait_for_step_to_complete( job_id, step_num=step_num, num_steps=total_steps) log.info('Completed Dataproc Hadoop Job - %s', job_id) # After all steps completed, wait for the last output (which is # usually written to GCS) to sync self._wait_for_fs_sync() def _launch_step(self, step_num): step = self._get_step(step_num) # Clean-up step name step_name = '%s---step-%05d-of-%05d' % ( self._dataproc_job_prefix(), step_num + 1, self._num_steps()) # Build step # job_kwarg is a 
single-item dict, where the key is 'hadoop_job', # 'spark_job', etc. if step['type'] == 'streaming': job_kwarg = self._streaming_step_job_kwarg(step_num) elif step['type'] == 'jar': job_kwarg = self._jar_step_job_kwarg(step_num) else: raise NotImplementedError( 'Unsupported step type: %r' % step['type']) # Submit it log.info('Submitting Dataproc Hadoop Job - %s', step_name) result = self._submit_job(step_name, job_kwarg) log.info('Submitted Dataproc Hadoop Job - %s', step_name) job_id = result.reference.job_id assert job_id == step_name return job_id def _wait_for_step_to_complete(self, job_id, step_num, num_steps): """Helper for _wait_for_step_to_complete(). Wait for step with the given ID to complete, and fetch counters. If it fails, attempt to diagnose the error, and raise an exception. This also adds an item to self._log_interpretations """ log_interpretation = dict(job_id=job_id) self._log_interpretations.append(log_interpretation) log_interpretation['step'] = {} step_type = self._get_step(step_num)['type'] while True: # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus # noqa job = self._get_job(job_id) job_state = job.status.State.Name(job.status.state) log.info('%s => %s' % (job_id, job_state)) log_interpretation['step']['driver_output_uri'] = ( job.driver_output_resource_uri) self._interpret_step_logs(log_interpretation, step_type) progress = log_interpretation['step'].get('progress') if progress: log.info(' ' + progress['message']) # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State # noqa # these are the states covered by the ACTIVE job state matcher, # plus SETUP_DONE if job_state in ('PENDING', 'RUNNING', 'CANCEL_PENDING', 'SETUP_DONE'): self._wait_for_api('job completion') continue # print counters if job wasn't CANCELLED if job_state != 'CANCELLED': self._log_counters(log_interpretation, step_num) if job_state == 'ERROR': error = self._pick_error(log_interpretation, step_type) if error: 
def _get_new_driver_output_lines(self, driver_output_uri):
    """Get a list of complete job driver output lines that are
    new since the last time we checked.

    Progress is tracked per *driver_output_uri* in
    ``self._driver_output_state`` (which log file we're on, how far
    into it we've read, and any trailing partial line), so each call
    only downloads data past the previously recorded position.

    :param driver_output_uri: URI prefix shared by the job's driver
                              output files

    :return: a list of lines, each ending in a newline. A final line
             that doesn't yet end in a newline is held back until a
             later call.
    """
    # first call for this URI starts from scratch
    state = self._driver_output_state.setdefault(
        driver_output_uri,
        dict(log_uri=None, pos=0, buffer=b''))

    # driver output is in logs with names like driveroutput.000000000
    # (sorting by name is what defines "earlier" and "later" files below)
    log_uris = sorted(self.fs.ls(driver_output_uri + '*'))

    for log_uri in log_uris:
        # initialize log_uri with first URI we see
        if state['log_uri'] is None:
            # log the location of job driver output just once
            log.info(
                ' Parsing job driver output from %s*' % driver_output_uri)
            state['log_uri'] = log_uri

        # skip log files already parsed
        if log_uri < state['log_uri']:
            continue

        # when parsing the next file, reset *pos*
        elif log_uri > state['log_uri']:
            state['pos'] = 0
            state['log_uri'] = log_uri

        log_blob = self.fs.gcs._get_blob(log_uri)

        try:
            # only fetch bytes we haven't already seen
            new_data = log_blob.download_as_string(start=state['pos'])
        except (google.api_core.exceptions.NotFound,
                google.api_core.exceptions.RequestRangeNotSatisfiable):
            # blob was just created, or no more data is available
            break

        state['buffer'] += new_data
        state['pos'] += len(new_data)

    # convert buffer into lines, saving leftovers for next time
    stream = BytesIO(state['buffer'])
    state['buffer'] = b''

    lines = []

    for line_bytes in stream:
        if line_bytes.endswith(b'\n'):
            lines.append(to_unicode(line_bytes))
        else:
            # leave final partial line (if any) in buffer
            state['buffer'] = line_bytes

    return lines
We can't get the history logs, and we don't need them.""" if not self._read_logs(): return log_interpretation.setdefault('history', {}) # task def _interpret_task_logs(self, log_interpretation, step_type, error_attempt_ids=(), partial=True): """Scan node manager log to find failed container IDs of failed tasks, and then scan the corresponding stderr and syslogs.""" if 'task' in log_interpretation and ( partial or not log_interpretation['task'].get('partial')): return # already interpreted if not self._read_logs(): return step_interpretation = log_interpretation.get('step') or {} application_id = step_interpretation.get('application_id') if not application_id: log.warning( "Can't parse node manager logs; missing application ID") return log_interpretation['task'] = self._task_log_interpretation( application_id, step_type, partial) def _task_log_interpretation( self, application_id, step_type, partial=True): """Helper for :py:meth:`_interpret_task_logs`""" # not bothering with _read_logs() since this is a helper method result = {} for container_id in self._failed_task_container_ids(application_id): error = _parse_task_syslog_records( self._task_syslog_records( application_id, container_id, step_type)) if not error.get('hadoop_error'): # not sure if this ever happens, since we already know # which containers failed continue error['container_id'] = container_id # fix weird munging of java stacktrace error['hadoop_error']['message'] = _fix_java_stack_trace( error['hadoop_error']['message']) task_error = _parse_task_stderr( self._task_stderr_lines( application_id, container_id, step_type)) if task_error: task_error['message'] = _fix_traceback(task_error['message']) error['task_error'] = task_error result.setdefault('errors', []).append(error) # if partial is true, bail out when we find the first task error if task_error and partial: result['partial'] = True return result return result def _failed_task_container_ids(self, application_id): """Stream container IDs of failed 
def _task_syslog_records(self, application_id, container_id, step_type):
    """Fetch the syslog of one container, as log4j records.

    Lists the ``yarn-userlogs`` log entries for the given application
    and container whose log name is ``syslog``, and hands them to
    ``_log_entries_to_log4j()``.

    :param application_id: ID of the application the container ran in
    :param container_id: ID of the container whose syslog we want
    :param step_type: currently unused (see the TODO below)

    :return: whatever ``_log_entries_to_log4j()`` returns for the
             matching entries
    """
    payload_filter = {
        'jsonPayload.application': application_id,
        'jsonPayload.container': container_id,
        # TODO: pick based on step_type
        'jsonPayload.container_logname': 'syslog',
    }
    syslog_filter = self._make_log_filter('yarn-userlogs', payload_filter)

    log.info(' reading syslog...')

    return _log_entries_to_log4j(
        self.logging_client.list_entries(filter_=syslog_filter))
def _cluster_create_kwargs(self):
    """Build the description of the cluster to create.

    The result has the keys ``project_id``, ``cluster_name`` and
    ``config``, and is passed through
    ``self._add_extra_cluster_params()`` before being returned.
    ``config`` covers:

    * GCE settings (metadata, service account and scopes, network or
      subnet, zone)
    * the master bootstrap script, if any, as an initialization action
    * master, worker, and (if non-empty) secondary worker instance
      groups
    * idle-deletion lifecycle settings
    * software settings (cluster properties and image version), if any
    """
    opts = self._opts

    # the master bootstrap script (if any) is our only init action
    initialization_actions = []
    if self._master_bootstrap_script_path:
        initialization_actions.append(dict(
            executable_file=self._upload_mgr.uri(
                self._master_bootstrap_script_path)))

    cluster_metadata = {'mrjob-version': mrjob.__version__}

    max_secs_idle = int(opts['max_mins_idle'] * 60)

    # TODO: remove mrjob-max-secs-idle once lifecycle_config is visible
    # through the gcloud utility and the Google Cloud Console
    cluster_metadata['mrjob-max-secs-idle'] = str(max_secs_idle)

    # service_account_scopes is always included, even when unset
    gce_cluster_config = dict(
        metadata=cluster_metadata,
        service_account_scopes=opts['service_account_scopes'],
    )

    # these are only included if the corresponding option is set
    optional_gce_settings = (
        ('network_uri', 'network'),
        ('subnetwork_uri', 'subnet'),
        ('service_account', 'service_account'),
    )
    for config_key, opt_name in optional_gce_settings:
        if opts[opt_name]:
            gce_cluster_config[config_key] = opts[opt_name]

    if opts['zone']:
        gce_cluster_config['zone_uri'] = _gcp_zone_uri(
            project=self._project_id, zone=opts['zone'])

    cluster_config = dict(
        gce_cluster_config=gce_cluster_config,
        initialization_actions=initialization_actions,
    )

    # Task tracker
    master_conf = _gcp_instance_group_config(
        project=self._project_id,
        zone=opts['zone'],
        count=1,
        instance_type=opts['master_instance_type'],
    )
    if opts['master_instance_config']:
        master_conf.update(opts['master_instance_config'])

    # Compute + storage
    worker_conf = _gcp_instance_group_config(
        project=self._project_id,
        zone=opts['zone'],
        count=opts['num_core_instances'],
        instance_type=opts['core_instance_type'],
    )
    if opts['core_instance_config']:
        worker_conf.update(opts['core_instance_config'])

    # Compute ONLY
    secondary_worker_conf = _gcp_instance_group_config(
        project=self._project_id,
        zone=opts['zone'],
        count=opts['num_task_instances'],
        instance_type=opts['task_instance_type'],
        is_preemptible=True,
    )
    if opts['task_instance_config']:
        secondary_worker_conf.update(opts['task_instance_config'])

    cluster_config['master_config'] = master_conf
    cluster_config['worker_config'] = worker_conf
    # leave out the secondary worker group entirely if it's empty
    if secondary_worker_conf.get('num_instances'):
        cluster_config['secondary_worker_config'] = secondary_worker_conf

    cluster_config['lifecycle_config'] = dict(
        idle_delete_ttl=dict(seconds=max_secs_idle))

    software_config = {}

    if opts['cluster_properties']:
        software_config['properties'] = _values_to_text(
            opts['cluster_properties'])

    # See - https://cloud.google.com/dataproc/dataproc-versions
    if opts['image_version']:
        software_config['image_version'] = opts['image_version']

    if software_config:
        cluster_config['software_config'] = software_config

    # in Python 2, dict keys loaded from JSON will be unicode, which
    # the Google protobuf objects don't like
    if PY2:
        cluster_config = _clean_json_dict_keys(cluster_config)

    return self._add_extra_cluster_params(dict(
        project_id=self._project_id,
        cluster_name=self._cluster_id,
        config=cluster_config,
    ))
self.cluster_client.get_cluster( cluster_name=cluster_id, **self._project_id_and_region() ) def _create_cluster(self, cluster_data): # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/create # noqa # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/get # noqa self.cluster_client.create_cluster( cluster=cluster_data, **self._project_id_and_region() ) def _delete_cluster(self, cluster_id): return self.cluster_client.delete_cluster( cluster_name=cluster_id, **self._project_id_and_region() ) def _list_jobs(self, cluster_name=None, state_matcher=None): # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/list#JobStateMatcher # noqa list_kwargs = self._project_id_and_region() if cluster_name: list_kwargs['cluster_name'] = cluster_name if state_matcher: list_kwargs['job_state_matcher'] = state_matcher return self.job_client.list_jobs(**list_kwargs) def _get_job(self, job_id): return self.job_client.get_job( job_id=job_id, **self._project_id_and_region() ) def _cancel_job(self, job_id): return self.job_client.cancel_job( job_id=job_id, **self._project_id_and_region() ) def _submit_job(self, step_name, job_kwarg): # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/submit # noqa # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob # noqa # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobReference # noqa submit_job_kwargs = dict( job=dict( reference=dict(project_id=self._project_id, job_id=step_name), placement=dict(cluster_name=self._cluster_id), **job_kwarg ), **self._project_id_and_region() ) log.debug(' submit_job(%s)' % ', '.join( '%s=%r' % (k, v) for k, v in sorted(submit_job_kwargs.items()))) return self.job_client.submit_job(**submit_job_kwargs) def _project_id_and_region(self): return dict( project_id=self._project_id, region=(self._opts['region'] or 'global'), ) def _manifest_download_commands(self): 
return [ # TODO: SSH in and figure out how to use gsutil or similar # ('gs://*', 'gsutil cp'), ('*://*', 'hadoop fs -copyToLocal'), ] ### SSH hooks ### def _job_tracker_host(self): return '%s-m' % self._cluster_id def _ssh_tunnel_config(self): return _SSH_TUNNEL_CONFIG def _launch_ssh_proc(self, args): ssh_proc = super(DataprocJobRunner, self)._launch_ssh_proc(args) # enter an empty passphrase if creating a key for the first time ssh_proc.stdin.write(b'\n\n') return ssh_proc def _ssh_launch_wait_secs(self): """Wait 20 seconds because gcloud has to update project metadata (unless we were going to check the cluster sooner anyway).""" return min(20.0, self._opts['check_cluster_every']) def _ssh_tunnel_args(self, bind_port): if not self._cluster_id: return gcloud_bin = self._opts['gcloud_bin'] or ['gcloud'] cluster = self._get_cluster(self._cluster_id) zone = cluster.config.gce_cluster_config.zone_uri.split('/')[-1] return gcloud_bin + [ 'compute', 'ssh', '--zone', zone, self._job_tracker_host(), '--', ] + self._ssh_tunnel_opts(bind_port)
class HadoopJobRunner(MRJobBinRunner, LogInterpretationMixin):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.
    Invoked when you run your job with ``-r hadoop``.

    Input and support files can be either local or on HDFS; use
    ``hdfs://...`` URLs to refer to files on HDFS.
    """
    alias = 'hadoop'

    OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
        'hadoop_bin',
        'hadoop_extra_args',
        'hadoop_log_dirs',
        'hadoop_streaming_jar',
        'hadoop_tmp_dir',
        'spark_deploy_mode',
        'spark_master',
    }

    # supports everything (so far)
    _STEP_TYPES = {'jar', 'spark', 'spark_jar', 'spark_script', 'streaming'}

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hadoop_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(self._opts['hadoop_tmp_dir'], self._job_key))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or
            posixpath.join(self._hadoop_tmp_dir, 'output'))

        # Fully qualify step_output_dir, if set
        if self._step_output_dir:
            self._step_output_dir = fully_qualify_hdfs_path(
                self._step_output_dir)

        # Track job and (YARN) application ID to enable log parsing
        self._application_id = None
        self._job_id = None

        # Keep track of where the hadoop streaming jar is
        self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar']
        self._searched_for_hadoop_streaming_jar = False

        # List of dicts (one for each step) potentially containing
        # the keys 'history', 'step', and 'task' ('step' will always
        # be filled because it comes from the hadoop jar command output,
        # others will be filled as needed)
        self._log_interpretations = []

    def _default_opts(self):
        """Return the superclass defaults plus a default *hadoop_tmp_dir*."""
        return combine_dicts(
            super(HadoopJobRunner, self)._default_opts(),
            dict(hadoop_tmp_dir='tmp/mrjob', ))

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem()

            # don't pass [] to fs; this means not to use hadoop until
            # fs.set_hadoop_bin() is called (used for running hadoop over SSH).
            hadoop_bin = self._opts['hadoop_bin'] or None

            self._fs.add_fs('hadoop', HadoopFilesystem(hadoop_bin))
            self._fs.add_fs('local', LocalFilesystem())

        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        return self.fs.hadoop.get_hadoop_version()

    def get_hadoop_bin(self):
        """Find the hadoop binary. A list: binary followed by arguments."""
        return self.fs.hadoop.get_hadoop_bin()

    def get_hadoop_streaming_jar(self):
        """Find the path of the hadoop streaming jar, or None if not found.

        The search only happens once; its result (found or not) is
        remembered on the runner.
        """
        if not (self._hadoop_streaming_jar or
                self._searched_for_hadoop_streaming_jar):

            self._hadoop_streaming_jar = self._find_hadoop_streaming_jar()

            if self._hadoop_streaming_jar:
                log.info('Found Hadoop streaming jar: %s' %
                         self._hadoop_streaming_jar)
            else:
                log.warning('Hadoop streaming jar not found. Use'
                            ' --hadoop-streaming-jar')

            self._searched_for_hadoop_streaming_jar = True

        return self._hadoop_streaming_jar

    def _find_hadoop_streaming_jar(self):
        """Search for the hadoop streaming jar. See
        :py:meth:`_hadoop_streaming_jar_dirs` for where we search.

        Returns the best match from the first directory containing any
        matching jar, or None if no directory has one.
        """
        # prefer shorter names and shallower paths
        def sort_key(p):
            return (len(p.split('/')),
                    len(posixpath.basename(p)),
                    p)

        for jar_dir in unique(self._hadoop_streaming_jar_dirs()):
            log.info('Looking for Hadoop streaming jar in %s...' % jar_dir)

            # use distinct names for the directory and the files in it
            # (the original re-used one loop variable for both)
            streaming_jars = [
                jar_path for jar_path in self.fs.ls(jar_dir)
                if _HADOOP_STREAMING_JAR_RE.match(
                    posixpath.basename(jar_path))
            ]

            if streaming_jars:
                return min(streaming_jars, key=sort_key)

        return None

    def _hadoop_dirs(self):
        """Yield all possible hadoop directories (used for streaming jar
        and logs). May yield duplicates"""
        for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL',
                     'HADOOP_MAPRED_HOME'):
            path = os.environ.get(name)
            if path:
                yield path

        # guess it from the path of the Hadoop binary
        hadoop_home = _hadoop_prefix_from_bin(self.get_hadoop_bin()[0])
        if hadoop_home:
            yield hadoop_home

        # try HADOOP_*_HOME
        for name, path in sorted(os.environ.items()):
            if name.startswith('HADOOP_') and name.endswith('_HOME'):
                yield path

    def _hadoop_streaming_jar_dirs(self):
        """Yield all possible places to look for the Hadoop streaming jar.
        May yield duplicates.
        """
        for hadoop_dir in self._hadoop_dirs():
            yield hadoop_dir

        # use hard-coded paths to work out-of-the-box on EMR
        for path in _EMR_HADOOP_STREAMING_JAR_DIRS:
            yield path

    def _hadoop_log_dirs(self, output_dir=None):
        """Yield all possible places to look for hadoop logs."""
        # hadoop_log_dirs opt overrides all this
        if self._opts['hadoop_log_dirs']:
            for path in self._opts['hadoop_log_dirs']:
                yield path
            return

        hadoop_log_dir = os.environ.get('HADOOP_LOG_DIR')
        if hadoop_log_dir:
            yield hadoop_log_dir

        yarn = uses_yarn(self.get_hadoop_version())

        if yarn:
            yarn_log_dir = os.environ.get('YARN_LOG_DIR')
            if yarn_log_dir:
                yield yarn_log_dir

            yield _DEFAULT_YARN_HDFS_LOG_DIR

        if output_dir:
            # Cloudera style of logging
            yield posixpath.join(output_dir, '_logs')

        for hadoop_dir in self._hadoop_dirs():
            yield posixpath.join(hadoop_dir, 'logs')

        # hard-coded fallback paths
        if yarn:
            for path in _FALLBACK_HADOOP_YARN_LOG_DIRS:
                yield path

        for path in _FALLBACK_HADOOP_LOG_DIRS:
            yield path

    def _run(self):
        """Find binaries, prepare and upload files, then run each step."""
        self._find_binaries_and_jars()
        self._create_setup_wrapper_scripts()
        self._add_job_files_for_upload()
        self._upload_local_files()
        self._run_job_in_hadoop()

    def _find_binaries_and_jars(self):
        """Find hadoop and (if needed) spark-submit bin up-front, before
        continuing with the job.

        (This is just for user-interaction purposes; these would otherwise
        lazy-load as needed.)
        """
        # this triggers looking for Hadoop binary
        self.get_hadoop_version()

        if self._has_streaming_steps():
            self.get_hadoop_streaming_jar()

        if self._has_spark_steps():
            self.get_spark_submit_bin()

    def _add_job_files_for_upload(self):
        """Add the job's *py_files* to self._upload_mgr."""
        for path in self._py_files():
            self._upload_mgr.add(path)

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN')
        # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s...' % stdin_path)
        # close (and so flush) the file before handing back its path;
        # otherwise the handle leaks and readers may see a truncated file
        with open(stdin_path, 'wb') as stdin_file:
            for line in self._stdin:
                stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        """Run each step of the job as a subprocess, interpreting its output
        as it runs.

        Raises :py:class:`StepFailedException` if a step's process ends
        with a non-zero status.
        """
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_args = self._args_for_step(step_num)
            env = _fix_env(self._env_for_step(step_num))

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug(' with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr,
                    record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_driver(to_unicode(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvpe(step_args[0], step_args, env)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            self._log_counters(log_interpretation, step_num)

            step_type = step['type']

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(
                    reason=reason, step_num=step_num,
                    num_steps=self._num_steps())

    def _warn_about_spark_archives(self, step):
        """If *step* is a Spark step, the *upload_archives* option is set,
        and *spark_master* is not ``'yarn'``, warn that *upload_archives*
        will be ignored by Spark."""
        if (_is_spark_step_type(step['type']) and
                self._spark_master() != 'yarn' and
                self._opts['upload_archives']):
            log.warning('Spark will probably ignore archives because'
                        " spark_master is not 'yarn'")

    def _spark_master(self):
        """Return the *spark_master* option, defaulting to ``'yarn'``."""
        return self._opts['spark_master'] or 'yarn'

    def _args_for_step(self, step_num):
        """Return the command line (a list) for the given step.

        Raises :py:class:`ValueError` for an unrecognized step type.
        """
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._args_for_streaming_step(step_num)
        elif step['type'] == 'jar':
            return self._args_for_jar_step(step_num)
        elif _is_spark_step_type(step['type']):
            return self._args_for_spark_step(step_num)
        else:
            raise ValueError('Bad step type: %r' % (step['type'], ))

    def _args_for_streaming_step(self, step_num):
        """Return the command line for a streaming step: the hadoop binary,
        ``jar <streaming jar>``, and the streaming jar's arguments.

        Raises :py:class:`Exception` if no streaming jar can be found.
        """
        hadoop_streaming_jar = self.get_hadoop_streaming_jar()
        if not hadoop_streaming_jar:
            raise Exception('no Hadoop streaming jar')

        return (self.get_hadoop_bin() + ['jar', hadoop_streaming_jar] +
                self._hadoop_streaming_jar_args(step_num))

    def _args_for_jar_step(self, step_num):
        """Return the command line for a jar step: the hadoop binary,
        ``jar <jar>``, then the main class and arguments (if any)."""
        step = self._get_step(step_num)

        args = []

        args.extend(self.get_hadoop_bin())

        # special case for consistency with EMR runner.
        #
        # This might look less like duplicated code if we ever
        # implement #780 (fetching jars from URIs)
        if step['jar'].startswith('file:///'):
            jar = step['jar'][7:]  # keep leading slash
        else:
            jar = step['jar']

        args.extend(['jar', jar])

        if step.get('main_class'):
            args.append(step['main_class'])

        if step.get('args'):
            args.extend(
                self._interpolate_jar_step_args(step['args'], step_num))

        return args

    def _env_for_step(self, step_num):
        """Return a copy of ``os.environ`` to run the given step with,
        updated with Spark's environment for Spark steps."""
        step = self._get_step(step_num)

        env = dict(os.environ)

        # when running spark-submit, set its environment directly. See #1464
        if _is_spark_step_type(step['type']):
            env.update(self._spark_cmdenv(step_num))

        return env

    def _default_step_output_dir(self):
        """Return the ``step-output`` subdirectory of the HDFS temp dir."""
        return posixpath.join(self._hadoop_tmp_dir, 'step-output')

    def _cleanup_hadoop_tmp(self):
        """Remove this job's HDFS temp directory, logging (rather than
        raising) any exception."""
        if self._hadoop_tmp_dir:
            log.info('Removing HDFS temp directory %s...' %
                     self._hadoop_tmp_dir)
            try:
                self.fs.rm(self._hadoop_tmp_dir)
            except Exception as e:
                log.exception(e)

    def _manifest_download_commands(self):
        """Return a list of ``(glob, command)`` pairs; every URI is
        downloaded with ``<hadoop bin> fs -copyToLocal``."""
        cp_to_local = self.get_hadoop_bin() + ['fs', '-copyToLocal']

        return [
            ('*://*', cmd_line(cp_to_local)),
        ]

    ### LOG (implementation of LogInterpretationMixin) ###

    def _stream_history_log_dirs(self, output_dir=None):
        """Yield lists of directories to look for the history log in."""
        if not self._read_logs():
            return

        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if _logs_exist(self.fs, log_dir):
                log.info('Looking for history log in %s...' % log_dir)
                # logs aren't always in a subdir named history/
                yield [log_dir]

    def _stream_task_log_dirs(self, application_id=None, output_dir=None):
        """Yield lists of directories to look for the task logs in."""
        # Note: this is unlikely to be super-helpful on "real" (multi-node)
        # pre-YARN Hadoop because task logs aren't generally shipped to a
        # local directory. It's a start, anyways. See #1201.
        if not self._read_logs():
            return

        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if application_id:
                path = self.fs.join(log_dir, 'userlogs', application_id)
            else:
                path = self.fs.join(log_dir, 'userlogs')

            if _logs_exist(self.fs, path):
                log.info('Looking for task syslogs in %s...' % path)
                yield [path]

    def counters(self):
        """Return a list with one set of counters per step run so far."""
        return [
            _pick_counters(log_interpretation)
            for log_interpretation in self._log_interpretations
        ]
class MRJobRunner(object): """Abstract base class for all runners""" # this class handles the basic runner framework, options and config files, # arguments to mrjobs, and setting up job working dirs and environments. # this will put files from setup scripts, py_files, and bootstrap_mrjob # into the job's working dir, but won't actually run/import them # # command lines to run substeps (including Spark) are handled by # mrjob.bin.MRJobBinRunner #: alias for this runner, used on the command line with ``-r`` alias = None # libjars is only here because the job can set it; might want to # handle this with a warning from the launcher instead OPT_NAMES = { 'bootstrap_mrjob', 'check_input_paths', 'cleanup', 'cleanup_on_failure', 'cmdenv', 'jobconf', 'label', 'libjars', 'local_tmp_dir', 'owner', 'py_files', 'read_logs', 'setup', 'upload_archives', 'upload_dirs', 'upload_files' } # re-define this as a set of step types supported by your runner _STEP_TYPES = None # if this is true, when bootstrap_mrjob is true, create a mrjob.zip # and patch it into the *py_files* option _BOOTSTRAP_MRJOB_IN_PY_FILES = True ### methods to call from your batch script ### def __init__(self, mr_job_script=None, conf_paths=None, extra_args=None, file_upload_args=None, hadoop_input_format=None, hadoop_output_format=None, input_paths=None, output_dir=None, partitioner=None, sort_values=None, stdin=None, steps=None, step_output_dir=None, **opts): """All runners take the following keyword arguments: :type mr_job_script: str :param mr_job_script: the path of the ``.py`` file containing the :py:class:`~mrjob.job.MRJob`. If this is None, you won't actually be able to :py:meth:`run` the job, but other utilities (e.g. :py:meth:`ls`) will work. :type conf_paths: None or list :param conf_paths: List of config files to combine and use, or None to search for mrjob.conf in the default locations. :type extra_args: list of str :param extra_args: a list of extra cmd-line arguments to pass to the mr_job script. 
This is a hook to allow jobs to take additional arguments. :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``. The file at the given path will be uploaded to the local directory of the mr_job script when it runs, and then passed into the script with ``--ARGNAME``. Useful for passing in SQLite DBs and other configuration files to your job. :type hadoop_input_format: str :param hadoop_input_format: name of an optional Hadoop ``InputFormat`` class. Passed to Hadoop along with your first step with the ``-inputformat`` option. Note that if you write your own class, you'll need to include it in your own custom streaming jar (see :mrjob-opt:`hadoop_streaming_jar`). :type hadoop_output_format: str :param hadoop_output_format: name of an optional Hadoop ``OutputFormat`` class. Passed to Hadoop along with your first step with the ``-outputformat`` option. Note that if you write your own class, you'll need to include it in your own custom streaming jar (see :mrjob-opt:`hadoop_streaming_jar`). :type input_paths: list of str :param input_paths: Input files for your job. Supports globs and recursively walks directories (e.g. ``['data/common/', 'data/training/*.gz']``). If this is left blank, we'll read from stdin :type output_dir: str :param output_dir: An empty/non-existent directory where Hadoop should put the final output from the job. If you don't specify an output directory, we'll output into a subdirectory of this job's temporary directory. You can control this from the command line with ``--output-dir``. This option cannot be set from configuration files. If used with the hadoop runner, this path does not need to be fully qualified with ``hdfs://`` URIs because it's understood that it has to be on HDFS. :type partitioner: str :param partitioner: Optional name of a Hadoop partitioner class, e.g. ``'org.apache.hadoop.mapred.lib.HashPartitioner'``. Hadoop streaming will use this to determine how mapper output should be sorted and distributed to reducers. 
:type sort_values: bool :param sort_values: if true, set partitioners and jobconf variables so that reducers to receive the values associated with any key in sorted order (sorted by their *encoded* value). Also known as secondary sort. :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use as stdin. This is a hook for testing; if you set ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll get passed through to the runner. If for some reason your lines are missing newlines, we'll add them; this makes it easier to write automated tests. :param steps: a list of descriptions of steps to run (see :doc:`step` for description formats) :type step_output_dir: str :param step_output_dir: An empty/non-existent directory where Hadoop should put output from all steps other than the last one (this only matters for multi-step jobs). Currently ignored by local runners. """ self._ran_job = False # opts are made from: # # empty defaults (everything set to None) # runner-specific defaults # opts from config file(s) # opts from command line self._opts = self._combine_confs( [(None, {key: None for key in self.OPT_NAMES})] + [(None, self._default_opts())] + load_opts_from_mrjob_confs(self.alias, conf_paths) + [('the command line', opts)]) log.debug('Active configuration:') log.debug( pprint.pformat({ opt_key: self._obfuscate_opt(opt_key, opt_value) for opt_key, opt_value in self._opts.items() })) self._fs = None # a local tmp directory that will be cleaned up when we're done # access/make this using self._get_local_tmp_dir() self._local_tmp_dir = None self._working_dir_mgr = WorkingDirManager() # mapping from dir to path for corresponding archive. 
we pick # paths during init(), but don't actually create the archives # until self._create_dir_archives() is called self._dir_to_archive_path = {} # dir archive names (the filename minus ".tar.gz") already taken self._dir_archive_names_taken = set() # set of dir_archives that have actually been created self._dir_archives_created = set() # track (name, path) of files and archives to upload to spark # if not using a setup script. self._spark_files = [] self._spark_archives = [] # set this to an :py:class:`~mrjob.setup.UploadDirManager` in # runners that upload files to HDFS, S3, etc. self._upload_mgr = None self._script_path = mr_job_script if self._script_path: self._working_dir_mgr.add('file', self._script_path) # give this job a unique name self._job_key = self._make_unique_job_key() # extra args to our job self._extra_args = list(extra_args) if extra_args else [] for extra_arg in self._extra_args: if isinstance(extra_arg, dict): if extra_arg.get('type') != 'file': raise NotImplementedError self._working_dir_mgr.add(**extra_arg) self._spark_files.append( (extra_arg['name'], extra_arg['path'])) # extra file arguments to our job if file_upload_args: log.warning('file_upload_args is deprecated and will be removed' ' in v0.6.0. 
Pass dicts to extra_args instead.') for arg, path in file_upload_args: arg_file = parse_legacy_hash_path('file', path) self._working_dir_mgr.add(**arg_file) self._extra_args.extend([arg, arg_file]) self._spark_files.append((arg_file['name'], arg_file['path'])) # set up uploading for hash_path in self._opts['upload_files']: uf = parse_legacy_hash_path('file', hash_path, must_name='upload_files') self._working_dir_mgr.add(**uf) self._spark_files.append((uf['name'], uf['path'])) for hash_path in self._opts['upload_archives']: ua = parse_legacy_hash_path('archive', hash_path, must_name='upload_archives') self._working_dir_mgr.add(**ua) self._spark_archives.append((ua['name'], ua['path'])) for hash_path in self._opts['upload_dirs']: # pick name based on directory path ud = parse_legacy_hash_path('dir', hash_path, must_name='upload_archives') # but feed working_dir_mgr the archive's path archive_path = self._dir_archive_path(ud['path']) self._working_dir_mgr.add('archive', archive_path, name=ud['name']) self._spark_archives.append((ud['name'], archive_path)) # Where to read input from (log files, etc.) 
self._input_paths = input_paths or ['-'] # by default read from stdin if PY2: self._stdin = stdin or sys.stdin else: self._stdin = stdin or sys.stdin.buffer self._stdin_path = None # temp file containing dump from stdin # where to keep the input manifest self._input_manifest_path = None # store output_dir self._output_dir = output_dir # store partitioner self._partitioner = partitioner # store sort_values self._sort_values = sort_values # store step_output_dir self._step_output_dir = step_output_dir # store hadoop input and output formats self._hadoop_input_format = hadoop_input_format self._hadoop_output_format = hadoop_output_format # check and store *steps* self._steps = None if steps is None: if not mr_job_script: self._steps = [] # otherwise we'll load steps on-the-fly, see _load_steps() else: self._check_steps(steps) self._steps = copy.deepcopy(steps) # this variable marks whether a cleanup has happened and this runner's # output stream is no longer available. self._closed = False ### Options #### def _default_opts(self): try: owner = getpass.getuser() except: owner = None return dict( check_input_paths=True, cleanup=['ALL'], cleanup_on_failure=['NONE'], owner=owner, ) def _combine_confs(self, source_and_opt_list): """Combine several opt dictionaries into one. *source_and_opt_list* is a list of tuples of *source*, *opts* where *opts* is a dictionary and *source* is either None or a description of where the opts came from (usually a path). Only override this if you need truly fine-grained control, including knowledge of the options' source. """ opt_list = [ self._fix_opts(opts, source) for source, opts in source_and_opt_list ] return self._combine_opts(opt_list) def _combine_opts(self, opt_list): """Combine several opt dictionaries into one. *opt_list* is a list of dictionaries containing validated options Override this if you need to base options off the values of other options, but don't need to issue warnings etc. about the options' source. 
""" return combine_opts(self._opt_combiners(), *opt_list) def _opt_combiners(self): """A dictionary mapping opt name to combiner funciton. This won't necessarily include every opt name (we default to :py:func:`~mrjob.conf.combine_value`). """ return _combiners(self.OPT_NAMES) def _fix_opts(self, opts, source=None): """Take an options dictionary, and either return a sanitized version of it, or raise an exception. *source* is either a string describing where the opts came from or None. This ensures that opt dictionaries are really dictionaries and handles deprecated options. """ if source is None: source = 'defaults' # defaults shouldn't trigger warnings if not isinstance(opts, dict): raise TypeError('options for %s (from %s) must be a dict' % (self.alias, source)) deprecated_aliases = _deprecated_aliases(self.OPT_NAMES) results = {} for k, v in sorted(opts.items()): # rewrite deprecated aliases if k in deprecated_aliases: if v is None: # don't care continue aliased_opt = deprecated_aliases log.warning('Deprecated option %s (from %s) has been renamed' ' to %s and will be removed in v0.7.0' % (k, source, aliased_opt)) if opts.get(aliased_opt) is not None: return # don't overwrite non-aliased opt k = aliased_opt if k in self.OPT_NAMES: results[k] = None if v is None else self._fix_opt(k, v, source) elif v: log.warning('Unexpected option %s (from %s)' % (k, source)) return results def _fix_opt(self, opt_key, opt_value, source): """Fix a single option, returning its correct value or raising an exception. This is not called for options that are ``None``. This currently handles cleanup opts. Override this if you require additional opt validation or cleanup. 
""" if opt_key in ('cleanup', 'cleanup_on_failure'): return self._fix_cleanup_opt(opt_key, opt_value, source) else: return opt_value def _fix_cleanup_opt(self, opt_key, opt_value, source): """Fix a cleanup option, or raise ValueError.""" if isinstance(opt_value, string_types): opt_value = [opt_value] if 'NONE' in opt_value and len(set(opt_value)) > 1: raise ValueError('Cannot clean up both nothing and something!' ' (%s option from %s)' % (opt_key, source)) for cleanup_type in opt_value: if cleanup_type not in CLEANUP_CHOICES: raise ValueError( '%s must be one of %s, not %s (from %s)' % (opt_key, ', '.join(CLEANUP_CHOICES), opt_value, source)) return opt_value def _obfuscate_opt(self, opt_key, opt_value): """Return value of opt to show in debug printout. Used to obfuscate credentials, etc.""" return opt_value ### Filesystem object ### @property def fs(self): """:py:class:`~mrjob.fs.base.Filesystem` object for the local filesystem. """ if self._fs is None: # wrap LocalFilesystem in LocalFilesystem to get IOError # on URIs (see #1185) self._fs = CompositeFilesystem() self._fs.add_fs('local', LocalFilesystem()) return self._fs ### Running the job and parsing output ### def run(self): """Run the job, and block until it finishes. Raise :py:class:`~mrjob.step.StepFailedException` if there are any problems (except on :py:class:`~mrjob.inline.InlineMRJobRunner`, where we raise the actual exception that caused the step to fail). 
""" if self._ran_job: raise ValueError('Job already ran!') if self._num_steps() == 0: raise ValueError('Job has no steps!') self._create_dir_archives() # TODO: no point in checking input paths if we're going to # make a manifest out of them self._check_input_paths() self._add_input_files_for_upload() self._create_input_manifest_if_needed() self._run() self._ran_job = True last_step = self._get_steps()[-1] # only print this message if the last step uses our output dir if 'args' not in last_step or OUTPUT in last_step['args']: log.info('job output is in %s' % self._output_dir) def cat_output(self): """Stream the jobs output, as a stream of ``bytes``. If there are multiple output files, there will be an empty bytestring (``b''``) between them. .. versionadded:: 0.6.0 In previous versions, you'd use :py:meth:`stream_output`. """ output_dir = self.get_output_dir() if output_dir is None: raise ValueError('Run the job before streaming output') if self._closed is True: log.warning( 'WARNING! Trying to stream output from a closed runner, output' ' will probably be empty.') log.info('Streaming final output from %s...' % output_dir) def split_path(path): while True: base, name = os.path.split(path) # no more elements if not name: break yield name path = base def ls_output(): for filename in self.fs.ls(output_dir): subpath = filename[len(output_dir):] # Hadoop ignores files and dirs inside the output dir # whose names start with '_' or '.'. See #1337. if not (any(name[0] in '_.' for name in split_path(subpath))): yield filename for i, filename in enumerate(ls_output()): if i > 0: yield b'' # EOF of previous file for chunk in self.fs._cat_file(filename): yield chunk def stream_output(self): """Like :py:meth:`cat_output` except that it groups bytes into lines. Equivalent to ``mrjob.util.to_lines(runner.cat_output())``. .. deprecated:: 0.6.0 """ log.warning('stream_output() is deprecated and will be removed in' ' v0.7.0. 
use mrjob.util.to_lines(runner.cat_output())' ' instead.') return to_lines(self.cat_output()) def _cleanup_mode(self, mode=None): """Actual cleanup action to take based on various options""" if self._script_path and not self._ran_job: return mode or self._opts['cleanup_on_failure'] else: return mode or self._opts['cleanup'] def _cleanup_cloud_tmp(self): """Cleanup any files/directories on cloud storage (e.g. S3) we created while running this job. Should be safe to run this at any time, or multiple times. """ pass # only EMR runner does this def _cleanup_hadoop_tmp(self): """Cleanup any files/directories on HDFS we created while running this job. Should be safe to run this at any time, or multiple times. """ pass # only Hadoop runner does this def _cleanup_local_tmp(self): """Cleanup any files/directories on the local machine we created while running this job. Should be safe to run this at any time, or multiple times. This particular function removes any local tmp directories added to the list self._local_tmp_dirs This won't remove output_dir if it's outside of our tmp dir. """ if self._local_tmp_dir: log.info('Removing temp directory %s...' % self._local_tmp_dir) try: shutil.rmtree(self._local_tmp_dir) except OSError as e: log.exception(e) self._local_tmp_dir = None def _cleanup_cluster(self): """Terminate the cluster if there is one.""" pass # this only happens on EMR def _cleanup_logs(self): """Cleanup any log files that are created as a side-effect of the job. """ pass # this only happens on EMR def _cleanup_job(self): """Stop any jobs that we created that are still running.""" pass # currently disabled (see #1241) def cleanup(self, mode=None): """Clean up running jobs, temp files, and logs, subject to the *cleanup* option passed to the constructor. If you create your runner in a ``with`` block, :py:meth:`cleanup` will be called automatically:: with mr_job.make_runner() as runner: ... 
# cleanup() called automatically here :param mode: override *cleanup* passed into the constructor. Should be a list of strings from :py:data:`~mrjob.options.CLEANUP_CHOICES` """ mode = self._cleanup_mode(mode) def mode_has(*args): return any((choice in mode) for choice in args) if self._script_path and not self._ran_job: if mode_has('CLUSTER', 'ALL'): self._cleanup_cluster() if mode_has('JOB', 'ALL'): self._cleanup_job() if mode_has('ALL', 'TMP', 'CLOUD_TMP'): self._cleanup_cloud_tmp() if mode_has('ALL', 'TMP', 'HADOOP_TMP'): self._cleanup_hadoop_tmp() if mode_has('ALL', 'TMP', 'LOCAL_TMP'): self._cleanup_local_tmp() if mode_has('ALL', 'LOGS'): self._cleanup_logs() self._closed = True def counters(self): """Get counters associated with this run in this form:: [{'group name': {'counter1': 1, 'counter2': 2}}, {'group name': ...}] The list contains an entry for every step of the current job. """ raise NotImplementedError ### hooks for the with statement ### def __enter__(self): """Don't do anything special at start of with block""" return self def __exit__(self, type, value, traceback): """Call self.cleanup() at end of with block.""" self.cleanup() ### more runner information ### def get_opts(self): """Get options set for this runner, as a dict.""" log.warning('get_opts() is deprecated and will be removed in v0.7.0') return copy.deepcopy(self._opts) def get_job_key(self): """Get the unique key for the job run by this runner. This has the format ``label.owner.date.time.microseconds`` """ return self._job_key def get_output_dir(self): """Find the directory containing the job output. If the job hasn't run yet, returns None""" if self._script_path and not self._ran_job: return None return self._output_dir ### other methods you need to implement in your subclass ### def get_hadoop_version(self): """Return the version number of the Hadoop environment as a string if Hadoop is being used or simulated. Return None if not applicable. 
:py:class:`~mrjob.emr.EMRJobRunner` infers this from the cluster. :py:class:`~mrjob.hadoop.HadoopJobRunner` gets this from ``hadoop version``. :py:class:`~mrjob.local.LocalMRJobRunner` has an additional `hadoop_version` option to specify which version it simulates. :py:class:`~mrjob.inline.InlineMRJobRunner` does not simulate Hadoop at all. """ return None # you'll probably wan't to add your own __init__() and cleanup() as well def _run(self): """Run the job.""" raise NotImplementedError ### internal utilities for implementing MRJobRunners ### def _get_local_tmp_dir(self): """Create a tmp directory on the local filesystem that will be cleaned up by self.cleanup()""" if not self._local_tmp_dir: tmp_dir = (self._opts['local_tmp_dir'] or tempfile.gettempdir()) path = os.path.join(tmp_dir, self._job_key) log.info('Creating temp directory %s' % path) if os.path.isdir(path): shutil.rmtree(path) os.makedirs(path) self._local_tmp_dir = path return self._local_tmp_dir def _make_unique_job_key(self, label=None, owner=None): """Come up with a useful unique ID for this job. Optionally, you can specify a custom label or owner (otherwise we use :py:meth:`_label` and :py:meth:`_owner`. We use this to choose the output directory, etc. for the job. 
""" if label is None: label = self._label() if owner is None: owner = self._owner() now = datetime.datetime.utcnow() return '%s.%s.%s.%06d' % (label, owner, now.strftime('%Y%m%d.%H%M%S'), now.microsecond) def _label(self): """Return *label* opt, or if not set, the name of the file containing the MRJob, minus extension, or if none, ``'no_script'``""" if self._opts['label']: return self._opts['label'] elif self._script_path: return os.path.basename(self._script_path).split('.')[0] else: return 'no_script' def _owner(self): """Return *owner* opt (which defaults to :py:func:`getpass.getuser`), or ``'no_user'`` if not set.""" if self._opts['owner']: # owner opt defaults to getpass.getuser() return self._opts['owner'] else: return 'no_user' def _get_steps(self): """If *steps* was not set at init time, call the job script to find out how many steps it has, and whether there are mappers and reducers for each step. Validate its output. Returns output as described in :ref:`steps-format`. """ if self._steps is None: log.warning('querying jobs for steps is deprecated and' ' will go away in v0.7.0') steps = self._load_steps() self._check_steps(steps) self._steps = steps return self._steps def _load_steps(self): """Ask job how many steps it has, and whether there are mappers and reducers for each step. Returns output as described in :ref:`steps-format`. If this is called, you can assume self._script_path is set. """ raise NotImplementedError def _check_steps(self, steps): """Look at the step definition (*steps*). If it is not supported by the runner, raise :py:class:`NotImplementedError`. If it is not supported by mrjob, raise :py:class:`ValueError`. """ if not self._STEP_TYPES: # use __class__.__name__ because only MRJobRunner would # trigger this raise NotImplementedError('%s cannot run steps!' 
% self.__class__.__name__) for step_num, step in enumerate(steps): if step.get('type') not in self._STEP_TYPES: raise NotImplementedError( 'step %d has type %r, but %s runner only supports:' ' %s' % (step_num, step.get('type'), self.alias, ', '.join( sorted(self._STEP_TYPES)))) if step.get('input_manifest') and step_num != 0: raise ValueError('step %d may not take an input manifest (only' ' first step can' % step_num) # some step types assume a MRJob script if not self._script_path: if step['type'] == 'spark': raise ValueError( "SparkStep (step %d) can't run without a MRJob script" " (try SparkScriptStep instead)" % step_num) elif step['type'] == 'streaming': for mrc in ('mapper', 'combiner', 'reducer'): if not step.get(mrc): continue substep = step[mrc] if substep['type'] == 'script': raise ValueError( "%s (step %d) can't run without a MRJob" " script" % (mrc, step_num)) def _get_step(self, step_num): """Get a single step (calls :py:meth:`_get_steps`).""" return self._get_steps()[step_num] def _num_steps(self): """Get the number of steps (calls :py:meth:`get_steps`).""" return len(self._get_steps()) def _uses_input_manifest(self): """Does the first step take an input manifest?""" return bool(self._get_step(0).get('input_manifest')) def _has_streaming_steps(self): """Are any of our steps Hadoop Streaming steps?""" return any(step['type'] == 'streaming' for step in self._get_steps()) def _has_spark_steps(self): """Are any of our steps Spark steps? (e.g. spark, spark_jar, spark_script) Generally used to determine if we need to install Spark on a cluster. """ return any( _is_spark_step_type(step['type']) for step in self._get_steps()) def _has_pyspark_steps(self): """Do any of our steps involve running Python on Spark? Includes spark and spark_script types, but not spark_jar. Generally used to tell if we need a Spark setup script. 
""" return any( _is_pyspark_step_type(step['type']) for step in self._get_steps()) def _args_for_task(self, step_num, mrc): return [ '--step-num=%d' % step_num, '--%s' % mrc, ] + self._mr_job_extra_args() def _mr_job_extra_args(self, local=False): """Return arguments to add to every invocation of MRJob. :type local: boolean :param local: if this is True, use files' local paths rather than the path they'll have inside Hadoop streaming """ result = [] for extra_arg in self._extra_args: if isinstance(extra_arg, dict): if local: result.append(extra_arg['path']) else: result.append(self._working_dir_mgr.name(**extra_arg)) else: result.append(extra_arg) return result def _dir_archive_path(self, dir_path): """Assign a path for the archive of *dir_path* but don't actually create anything.""" if dir_path not in self._dir_to_archive_path: # we can check local paths now if not (is_uri(dir_path) or os.path.isdir(dir_path)): raise OSError('%s is not a directory!' % dir_path) name = name_uniquely(dir_path, names_taken=self._dir_archive_names_taken) self._dir_archive_names_taken.add(name) self._dir_to_archive_path[dir_path] = os.path.join( self._get_local_tmp_dir(), 'archives', name + '.tar.gz') return self._dir_to_archive_path[dir_path] def _create_dir_archives(self): """Call this to create all dir archives""" for dir_path in sorted(set(self._dir_to_archive_path)): self._create_dir_archive(dir_path) def _create_dir_archive(self, dir_path): """Helper for :py:meth:`archive_dir`""" if not self.fs.exists(dir_path): raise OSError('%s does not exist') tar_gz_path = self._dir_archive_path(dir_path) if tar_gz_path in self._dir_archives_created: return # already created if not os.path.isdir(os.path.dirname(tar_gz_path)): os.makedirs(os.path.dirname(tar_gz_path)) # for remote files tmp_download_path = os.path.join(self._get_local_tmp_dir(), 'tmp-download') log.info('Archiving %s -> %s' % (dir_path, tar_gz_path)) with tarfile.open(tar_gz_path, mode='w:gz') as tar_gz: for path in 
self.fs.ls(dir_path): # fs.ls() only lists files if path == dir_path: raise OSError('%s is a file, not a directory!' % dir_path) # TODO: do we need this? if os.path.realpath(path) == os.path.realpath(tar_gz_path): raise OSError('attempted to archive %s into itself!' % tar_gz_path) if is_uri(path): path_in_tar_gz = path[len(dir_path):].lstrip('/') log.info(' downloading %s -> %s' % (path, tmp_download_path)) with open(tmp_download_path, 'wb') as f: for chunk in self.fs.cat(path): f.write(chunk) local_path = tmp_download_path else: path_in_tar_gz = path[len(dir_path):].lstrip(os.sep) local_path = path log.debug(' adding %s to %s' % (path, tar_gz_path)) tar_gz.add(local_path, path_in_tar_gz, recursive=False) self._dir_archives_created.add(tar_gz_path) def _bootstrap_mrjob(self): """Should we bootstrap mrjob?""" if self._opts['bootstrap_mrjob'] is None: return self._opts['interpreter'] is None else: return bool(self._opts['bootstrap_mrjob']) def _get_input_paths(self): """Get the paths to input files, dumping STDIN to a local file if need be.""" if self._input_manifest_path: return [self._input_manifest_path] if '-' in self._input_paths: if self._stdin_path is None: # prompt user, so they don't think the process has stalled log.info('reading from STDIN') stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN') log.debug('dumping stdin to local file %s' % stdin_path) with open(stdin_path, 'wb') as stdin_file: for line in self._stdin: # catch missing newlines (often happens with test data) if not line.endswith(b'\n'): line += b'\n' stdin_file.write(line) self._stdin_path = stdin_path return [self._stdin_path if p == '-' else p for p in self._input_paths] def _create_input_manifest_if_needed(self): """Create a file with a list of URIs of input files.""" if self._input_manifest_path or not self._uses_input_manifest(): return uris = [] log.info('finding input files to add to manifest...') for path in self._get_input_paths(): log.debug(' in %s' % path) if is_uri(path): 
# URIs might be globs for uri in self.fs.ls(path): uris.append(uri) else: # local paths are expected to be single files # (shell would resolve globs) if self._upload_mgr: uris.append(self._upload_mgr.uri(path)) else: # just make sure job can find files from it's working dir uris.append(os.path.abspath(path)) log.info('found %d input files' % len(uris)) path = os.path.join(self._get_local_tmp_dir(), 'input-manifest.txt') self._write_script(uris, path, 'input manifest') self._input_manifest_path = path if self._upload_mgr: self._upload_mgr.add(self._input_manifest_path) def _check_input_paths(self): """Check that input exists prior to running the job, if the `check_input_paths` option is true.""" if not self._opts['check_input_paths']: return for path in self._input_paths: self._check_input_path(path) def _check_input_path(self, path): """Raise :py:class:`IOError` if the given input does not exist or is otherwise invalid. Override this to provide custom check behavior.""" if path == '-': return # STDIN always exists if not self.fs.can_handle_path(path): return # no way to check (e.g. non-S3 URIs on EMR) if not self.fs.exists(path): raise IOError('Input path %s does not exist!' % (path, )) def _add_input_files_for_upload(self): """If there is an upload manager, add input files to it.""" if self._upload_mgr: for path in self._get_input_paths(): self._upload_mgr.add(path) def _intermediate_output_dir(self, step_num, local=False): """A directory for intermediate output for the given step number.""" join = os.path.join if local else posixpath.join return join(self._step_output_dir or self._default_step_output_dir(), '%04d' % step_num) def _default_step_output_dir(self): """Where to put output for steps other than the last one, if not specified by the *output_dir* constructor keyword. Usually you want this to be on HDFS (most efficient). Define this in your runner subclass. 
""" raise NotImplementedError def _step_input_uris(self, step_num): """A list of URIs to use as input for the given step. For all except the first step, this list will have a single item (a directory).""" if step_num == 0: return [ self._upload_mgr.uri(path) if self._upload_mgr else to_uri(path) for path in self._get_input_paths() ] else: return [to_uri(self._intermediate_output_dir(step_num - 1))] def _step_output_uri(self, step_num): """URI to use as output for the given step. This is either an intermediate dir (see :py:meth:`intermediate_output_uri`) or ``self._output_dir`` for the final step.""" if step_num == len(self._get_steps()) - 1: return to_uri(self._output_dir) else: return to_uri(self._intermediate_output_dir(step_num)) def _jobconf_for_step(self, step_num): """Get the jobconf dictionary, optionally including step-specific jobconf info. Also translate jobconfs to the current Hadoop version, if necessary. """ step = self._get_step(step_num) # _sort_values_jobconf() isn't relevant to Spark, # but it doesn't do any harm either jobconf = combine_jobconfs(self._sort_values_jobconf(), self._opts['jobconf'], step.get('jobconf')) # if user is using the wrong jobconfs, add in the correct ones # and log a warning hadoop_version = self.get_hadoop_version() if hadoop_version: jobconf = translate_jobconf_dict(jobconf, hadoop_version) return jobconf def _sort_values_jobconf(self): """Jobconf dictionary to enable sorting by value. 
""" if not self._sort_values: return {} # translate _SORT_VALUES_JOBCONF to the correct Hadoop version, # without logging a warning hadoop_version = self.get_hadoop_version() jobconf = {} for k, v in _SORT_VALUES_JOBCONF.items(): if hadoop_version: jobconf[translate_jobconf(k, hadoop_version)] = v else: for j in translate_jobconf_for_all_versions(k): jobconf[j] = v return jobconf def _sort_values_partitioner(self): """Partitioner to use with *sort_values* keyword to the constructor.""" if self._sort_values: return _SORT_VALUES_PARTITIONER else: return None def _upload_args(self): # just upload every file and archive in the working dir manager return self._upload_args_helper('-files', None, '-archives', None) def _upload_args_helper(self, files_opt_str, files, archives_opt_str, archives): args = [] file_hash_paths = list(self._arg_hash_paths('file', files)) if file_hash_paths: args.append(files_opt_str) args.append(','.join(file_hash_paths)) archive_hash_paths = list(self._arg_hash_paths('archive', archives)) if archive_hash_paths: args.append(archives_opt_str) args.append(','.join(archive_hash_paths)) return args def _arg_hash_paths(self, type, named_paths=None): """Helper function for the *upload_args methods.""" if named_paths is None: # just return everything managed by _working_dir_mgr named_paths = sorted( self._working_dir_mgr.name_to_path(type).items()) for name, path in named_paths: if not name: name = self._working_dir_mgr.name(type, path) if self._upload_mgr: uri = self._upload_mgr.uri(path) else: uri = path yield '%s#%s' % (uri, name) def _write_script(self, lines, path, description): """Write text of a setup script, input manifest, etc. to the given file. By default, this writes binary data. Redefine :py:meth:`write_lines` to use other line endings. 
:param lines: a list of lines as ``str`` :param path: path of file to write to :param description: what we're writing to, for debug messages """ log.debug('Writing %s to %s:' % (description, path)) for line in lines: log.debug(' ' + line) self._write_script_lines(lines, path) def _write_script_lines(self, lines, path): """Write text to the given file. By default, this writes binary data, but can be redefined to use local line endings.""" with open(path, 'wb') as f: for line in lines: f.write((line + '\n').encode('utf-8'))
class HadoopJobRunner(MRJobBinRunner, LogInterpretationMixin):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.
    Invoked when you run your job with ``-r hadoop``.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """
    alias = 'hadoop'

    OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
        'hadoop_bin',
        'hadoop_extra_args',
        'hadoop_log_dirs',
        'hadoop_streaming_jar',
        'hadoop_tmp_dir',
        'spark_deploy_mode',
        'spark_master',
    }

    # supports everything (so far)
    _STEP_TYPES = {
        'jar', 'spark', 'spark_jar', 'spark_script', 'streaming'}

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hadoop_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(
                self._opts['hadoop_tmp_dir'], self._job_key))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or
            posixpath.join(self._hadoop_tmp_dir, 'output'))

        # Fully qualify step_output_dir, if set
        if self._step_output_dir:
            self._step_output_dir = fully_qualify_hdfs_path(
                self._step_output_dir)

        # Track job and (YARN) application ID to enable log parsing
        self._application_id = None
        self._job_id = None

        # Keep track of where the hadoop streaming jar is
        self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar']
        self._searched_for_hadoop_streaming_jar = False

        # List of dicts (one for each step) potentially containing
        # the keys 'history', 'step', and 'task' ('step' will always
        # be filled because it comes from the hadoop jar command output,
        # others will be filled as needed)
        self._log_interpretations = []

    def _default_opts(self):
        """Superclass defaults plus defaults for *hadoop_tmp_dir*,
        *spark_deploy_mode* and *spark_master*."""
        return combine_dicts(
            super(HadoopJobRunner, self)._default_opts(),
            dict(
                hadoop_tmp_dir='tmp/mrjob',
                spark_deploy_mode='client',
                spark_master='yarn',
            )
        )

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem()

            # don't pass [] to fs; this means not to use hadoop until
            # fs.set_hadoop_bin() is called (used for running hadoop over SSH).
            hadoop_bin = self._opts['hadoop_bin'] or None

            self._fs.add_fs('hadoop',
                            HadoopFilesystem(hadoop_bin))
            self._fs.add_fs('local', LocalFilesystem())

        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        return self.fs.hadoop.get_hadoop_version()

    def get_hadoop_bin(self):
        """Find the hadoop binary. A list: binary followed by arguments."""
        return self.fs.hadoop.get_hadoop_bin()

    def get_hadoop_streaming_jar(self):
        """Find the path of the hadoop streaming jar, or None if not found.

        The search happens at most once; the result is cached.
        """
        if not (self._hadoop_streaming_jar or
                self._searched_for_hadoop_streaming_jar):

            self._hadoop_streaming_jar = self._find_hadoop_streaming_jar()

            if self._hadoop_streaming_jar:
                log.info('Found Hadoop streaming jar: %s' %
                         self._hadoop_streaming_jar)
            else:
                log.warning('Hadoop streaming jar not found. Use'
                            ' --hadoop-streaming-jar')

            self._searched_for_hadoop_streaming_jar = True

        return self._hadoop_streaming_jar

    def _find_hadoop_streaming_jar(self):
        """Search for the hadoop streaming jar. See
        :py:meth:`_hadoop_streaming_jar_dirs` for where we search.

        Returns the best match from the first directory that contains any
        match, or None.
        """
        for jar_dir in unique(self._hadoop_streaming_jar_dirs()):
            log.info('Looking for Hadoop streaming jar in %s...' % jar_dir)

            streaming_jars = []
            for path in self.fs.ls(jar_dir):
                if _HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)):
                    streaming_jars.append(path)

            if streaming_jars:
                # prefer shorter names and shallower paths
                def sort_key(p):
                    return (len(p.split('/')),
                            len(posixpath.basename(p)),
                            p)

                streaming_jars.sort(key=sort_key)

                return streaming_jars[0]

        return None

    def _hadoop_dirs(self):
        """Yield all possible hadoop directories (used for streaming jar
        and logs). May yield duplicates"""
        for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL',
                     'HADOOP_MAPRED_HOME'):
            path = os.environ.get(name)
            if path:
                yield path

        # guess it from the path of the Hadoop binary
        hadoop_home = _hadoop_prefix_from_bin(self.get_hadoop_bin()[0])
        if hadoop_home:
            yield hadoop_home

        # try HADOOP_*_HOME
        for name, path in sorted(os.environ.items()):
            if name.startswith('HADOOP_') and name.endswith('_HOME'):
                yield path

    def _hadoop_streaming_jar_dirs(self):
        """Yield all possible places to look for the Hadoop streaming jar.
        May yield duplicates.
        """
        for hadoop_dir in self._hadoop_dirs():
            yield hadoop_dir

        # use hard-coded paths to work out-of-the-box on EMR
        for path in _EMR_HADOOP_STREAMING_JAR_DIRS:
            yield path

    def _hadoop_log_dirs(self, output_dir=None):
        """Yield all possible places to look for hadoop logs."""
        # hadoop_log_dirs opt overrides all this
        if self._opts['hadoop_log_dirs']:
            for path in self._opts['hadoop_log_dirs']:
                yield path
            return

        hadoop_log_dir = os.environ.get('HADOOP_LOG_DIR')
        if hadoop_log_dir:
            yield hadoop_log_dir

        yarn = uses_yarn(self.get_hadoop_version())

        if yarn:
            yarn_log_dir = os.environ.get('YARN_LOG_DIR')
            if yarn_log_dir:
                yield yarn_log_dir

            yield _DEFAULT_YARN_HDFS_LOG_DIR

        if output_dir:
            # Cloudera style of logging
            yield posixpath.join(output_dir, '_logs')

        for hadoop_dir in self._hadoop_dirs():
            yield posixpath.join(hadoop_dir, 'logs')

        # hard-coded fallback paths
        if yarn:
            for path in _FALLBACK_HADOOP_YARN_LOG_DIRS:
                yield path

        for path in _FALLBACK_HADOOP_LOG_DIRS:
            yield path

    def _run(self):
        """Find binaries, prepare and upload files, then run every step."""
        self._find_binaries_and_jars()
        self._create_setup_wrapper_scripts()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _find_binaries_and_jars(self):
        """Find hadoop and (if needed) spark-submit bin up-front, before
        continuing with the job.

        (This is just for user-interaction purposes; these would otherwise
        lazy-load as needed.)
        """
        # this triggers looking for Hadoop binary
        self.get_hadoop_version()

        if self._has_streaming_steps():
            self.get_hadoop_streaming_jar()

        if self._has_spark_steps():
            self.get_spark_submit_bin()

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

        for path in self._py_files():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self.fs.mkdir(self._upload_mgr.prefix)

        log.info('Copying local files to %s...' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().items():
            self._upload_to_hdfs(path, uri)

    def _upload_to_hdfs(self, path, target):
        """Copy local file *path* to *target* via the Hadoop filesystem."""
        log.debug(' %s -> %s' % (path, target))
        self.fs.hadoop.put(path, target)

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN')
        # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s...' % stdin_path)
        # close (and so flush) the file before handing its path back
        with open(stdin_path, 'wb') as stdin_file:
            for line in self._stdin:
                stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        """Run each step in turn, parsing the command's output as it runs.

        Raises :py:class:`StepFailedException` as soon as a step exits with
        a non-zero status.
        """
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_args = self._args_for_step(step_num)
            env = _fix_env(self._env_for_step(step_num))

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug(' with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr,
                    record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_driver(to_unicode(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    try:
                        os.execvpe(step_args[0], step_args, env)
                        # on success, we are no longer Python
                    except OSError as ex:
                        # use _exit() so the child doesn't go on to parse
                        # logs, raise, or run cleanup; that's the parent's
                        # job
                        os._exit(ex.errno or 1)
                    finally:
                        # some other exception; still exit hard
                        os._exit(-1)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            self._log_counters(log_interpretation, step_num)

            step_type = step['type']

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(
                    reason=reason, step_num=step_num,
                    num_steps=self._num_steps())

    def _warn_about_spark_archives(self, step):
        """If *step* is a Spark step, the *upload_archives* option is set,
        and *spark_master* is not ``'yarn'``, warn that *upload_archives*
        will be ignored by Spark."""
        if (_is_spark_step_type(step['type']) and
                self._opts['spark_master'] != 'yarn' and
                self._opts['upload_archives']):
            log.warning('Spark will probably ignore archives because'
                        " spark_master is not set to 'yarn'")

    def _args_for_step(self, step_num):
        """Command line (as a list) for the given step, based on its type.

        Raises :py:class:`ValueError` for an unknown step type.
        """
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._args_for_streaming_step(step_num)
        elif step['type'] == 'jar':
            return self._args_for_jar_step(step_num)
        elif _is_spark_step_type(step['type']):
            return self._args_for_spark_step(step_num)
        else:
            raise ValueError('Bad step type: %r' % (step['type'],))

    def _args_for_streaming_step(self, step_num):
        """Command line for a streaming step: the hadoop binary, ``jar``,
        the streaming jar, then the streaming jar's arguments.

        Raises :py:class:`Exception` if no streaming jar is available.
        """
        hadoop_streaming_jar = self.get_hadoop_streaming_jar()
        if not hadoop_streaming_jar:
            raise Exception('no Hadoop streaming jar')

        return (self.get_hadoop_bin() + ['jar', hadoop_streaming_jar] +
                self._hadoop_streaming_jar_args(step_num))

    def _args_for_jar_step(self, step_num):
        """Command line for a jar step: the hadoop binary, ``jar``, the jar
        path, then the step's main class and args, if any."""
        step = self._get_step(step_num)

        args = []

        args.extend(self.get_hadoop_bin())

        # special case for consistency with EMR runner.
        #
        # This might look less like duplicated code if we ever
        # implement #780 (fetching jars from URIs)
        if step['jar'].startswith('file:///'):
            jar = step['jar'][7:]  # keep leading slash
        else:
            jar = step['jar']

        args.extend(['jar', jar])

        if step.get('main_class'):
            args.append(step['main_class'])

        if step.get('args'):
            args.extend(
                self._interpolate_step_args(step['args'], step_num))

        return args

    def _env_for_step(self, step_num):
        """Environment for the step's command: a copy of ``os.environ``,
        plus the Spark cmdenv for Spark steps."""
        step = self._get_step(step_num)

        env = dict(os.environ)

        # when running spark-submit, set its environment directly. See #1464
        if _is_spark_step_type(step['type']):
            env.update(self._spark_cmdenv(step_num))

        return env

    def _default_step_output_dir(self):
        """Intermediate step output goes under the job's HDFS tmp dir."""
        return posixpath.join(self._hadoop_tmp_dir, 'step-output')

    def _cleanup_hadoop_tmp(self):
        """Remove the job's HDFS tmp dir, logging (not raising) on failure."""
        if self._hadoop_tmp_dir:
            log.info('Removing HDFS temp directory %s...' %
                     self._hadoop_tmp_dir)
            try:
                self.fs.rm(self._hadoop_tmp_dir)
            except Exception as e:
                log.exception(e)

    def _manifest_download_commands(self):
        """Map the ``*://*`` URI pattern to a ``fs -copyToLocal`` command
        line built on the hadoop binary."""
        cp_to_local = self.get_hadoop_bin() + ['fs', '-copyToLocal']

        return [
            ('*://*', cmd_line(cp_to_local)),
        ]

    ### LOG (implementation of LogInterpretationMixin) ###

    def _stream_history_log_dirs(self, output_dir=None):
        """Yield lists of directories to look for the history log in."""
        if not self._read_logs():
            return

        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if _logs_exist(self.fs, log_dir):
                log.info('Looking for history log in %s...' % log_dir)
                # logs aren't always in a subdir named history/
                yield [log_dir]

    def _stream_task_log_dirs(self, application_id=None, output_dir=None):
        """Yield lists of directories to look for the task logs in."""
        # Note: this is unlikely to be super-helpful on "real" (multi-node)
        # pre-YARN Hadoop because task logs aren't generally shipped to a
        # local directory. It's a start, anyways. See #1201.
        if not self._read_logs():
            return

        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if application_id:
                path = self.fs.join(log_dir, 'userlogs', application_id)
            else:
                path = self.fs.join(log_dir, 'userlogs')

            if _logs_exist(self.fs, path):
                log.info('Looking for task syslogs in %s...' % path)
                yield [path]

    def counters(self):
        """Counters picked from each step's log interpretation, one entry
        per step run so far."""
        return [_pick_counters(log_interpretation)
                for log_interpretation in self._log_interpretations]
class SparkMRJobRunner(MRJobBinRunner):
    """Runs a :py:class:`~mrjob.job.MRJob` on your Spark cluster (with or
    without Hadoop). Invoked when you run your job with ``-r spark``.

    See :ref:`running-on-your-spark-cluster` for more information.

    The Spark runner can also run "classic" MRJobs directly on Spark, without
    using Hadoop streaming. See :ref:`classic-mrjobs-on-spark`.

    .. versionadded:: 0.6.8
    """
    alias = 'spark'

    # Apart from the ``spark_*`` options, everything here exists only to
    # configure filesystems.
    #
    # max_output_files is deliberately absent: it can only be read from
    # the command line, not mrjob.conf (see #2040)
    OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
        'aws_access_key_id',
        'aws_secret_access_key',
        'aws_session_token',
        'cloud_fs_sync_secs',
        'cloud_part_size_mb',
        'gcs_region',  # used when creating buckets on GCS
        'hadoop_bin',
        'project_id',  # used by GCS filesystem
        's3_endpoint',
        's3_region',  # used when creating buckets on S3
        'spark_deploy_mode',
        'spark_master',
        'spark_tmp_dir',  # where to put temp files in Spark
    }

    # Every step type other than Hadoop JARs.
    # Streaming steps are run through mrjob/spark/harness.py (see #1972)
    _STEP_TYPES = {
        'spark',
        'spark_jar',
        'spark_script',
        'streaming',
    }

    def __init__(self, max_output_files=None, mrjob_cls=None, **kwargs):
        """Create a Spark runner.

        :param max_output_files: limit on number of output files when
                                 running streaming jobs. Can only be
                                 set on command line (not config file)
        :param mrjob_cls: class of the job you want to run. Used for
                          running streaming steps in Spark
        """
        # the superclass __init__() checks steps, and our _check_step()
        # reads this attribute, so it has to exist first
        self._mrjob_cls = mrjob_cls

        super(SparkMRJobRunner, self).__init__(**kwargs)

        self._max_output_files = max_output_files

        tmp_dir = self._pick_spark_tmp_dir()
        self._spark_tmp_dir = tmp_dir

        # local files only get uploaded when the temp dir is a URI
        if is_uri(tmp_dir):
            self._upload_mgr = UploadDirManager(
                posixpath.join(tmp_dir, 'files', ''))

        # default location for job output, unless one was set explicitly
        if not self._output_dir:
            self._output_dir = self.fs.join(tmp_dir, 'output')

        # remember where the spark-submit binary is
        self._spark_submit_bin = self._opts['spark_submit_bin']

        # path of a .zip file holding the MRJob script under a unique
        # module name (created lazily)
        self._job_script_zip_path = None

        # one counters entry per job step. Non-streaming steps get {}
        # because Spark doesn't have counters.
        self._counters = []

        # TODO: we may eventually want log interpretation, but it shouldn't
        # include counters, as they are not found in logs.

    def _check_step(self, step, step_num):
        """Reject steps that use input manifests or run commands, and
        streaming steps when no job class was supplied."""
        super(SparkMRJobRunner, self)._check_step(step, step_num)

        if step.get('input_manifest'):
            raise NotImplementedError(
                'spark runner does not support input manifests')

        # the remaining checks only concern streaming steps
        if step['type'] != 'streaming':
            return

        if not self._mrjob_cls:
            raise ValueError(
                'You must set mrjob_cls to run streaming steps')

        # commands aren't supported yet, though they *could* be (see #1956)
        for mrc in ('mapper', 'combiner', 'reducer'):
            if not step.get(mrc):
                continue

            substep = step[mrc]
            if 'command' in substep or 'pre_filter' in substep:
                raise NotImplementedError(
                    "step %d's %s runs a command, but spark"
                    " runner does not support commands" % (
                        step_num, mrc))

    def _default_opts(self):
        """Superclass defaults plus a default for *cloud_part_size_mb*."""
        inherited = super(SparkMRJobRunner, self)._default_opts()
        own = dict(
            cloud_part_size_mb=_DEFAULT_CLOUD_PART_SIZE_MB,
        )
        return combine_dicts(inherited, own)

    def _run(self):
        """Locate spark-submit, prepare and upload files, then run every
        step."""
        self.get_spark_submit_bin()  # find spark-submit up front
        self._create_setup_wrapper_scripts()
        self._upload_local_files()
        self._run_steps_on_spark()

    def _pick_spark_tmp_dir(self):
        """Choose the job-specific temp directory for Spark."""
        if self._opts['spark_tmp_dir']:
            return self.fs.join(self._opts['spark_tmp_dir'], self._job_key)

        master = self._spark_master() or 'local'

        if not master.startswith('local'):
            # use HDFS (same default as HadoopJobRunner)
            return posixpath.join(
                fully_qualify_hdfs_path('tmp/mrjob'), self._job_key)

        # local masters (including local-cluster) need a local temp dir.
        # The "-spark" suffix avoids colliding with the default local
        # temp dir
        return os.path.join(gettempdir(), self._job_key + '-spark')

    def _default_step_output_dir(self):
        """Directory under the Spark temp dir for intermediate step
        output."""
        return self.fs.join(self._spark_tmp_dir, 'step-output')

    def _counter_output_dir(self, step_num):
        """Directory under the Spark temp dir where counters for the step
        numbered *step_num* are written."""
        dir_name = 'counter-output-step-%d' % step_num
        return self.fs.join(self._spark_tmp_dir, dir_name)

    def counters(self):
        """Return a deep copy of the per-step counters."""
        return deepcopy(self._counters)

    @property
    def fs(self):
        """The composite filesystem for this runner, built on first
        access."""
        if self._fs:
            return self._fs

        # Spark supports basically every filesystem there is
        self._fs = composite = CompositeFilesystem()

        if boto3_installed:
            s3_fs = S3Filesystem(
                aws_access_key_id=self._opts['aws_access_key_id'],
                aws_secret_access_key=self._opts['aws_secret_access_key'],
                aws_session_token=self._opts['aws_session_token'],
                s3_endpoint=self._opts['s3_endpoint'],
                s3_region=self._opts['s3_region'],
            )
            composite.add_fs(
                's3', s3_fs, disable_if=_is_permanent_boto3_error)

        if google_libs_installed:
            gcs_fs = GCSFilesystem(
                project_id=self._opts['project_id'],
                location=self._opts['gcs_region'],
                object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS,
            )
            composite.add_fs(
                'gcs', gcs_fs, disable_if=_is_permanent_google_error)

        # Hadoop FS is responsible for all URIs that fall through to it
        composite.add_fs(
            'hadoop', HadoopFilesystem(self._opts['hadoop_bin']))

        composite.add_fs('local', LocalFilesystem())

        return self._fs

    # making mr_job_script visible in Spark

    def _job_script_module_name(self):
        """A unique module name to use with the MRJob script: the job key
        with every character outside the regex class replaced by ``_``."""
        return re.sub(r'[^\w\d]', '_', self._job_key)

    def _create_job_script_zip(self):
        """Zip up the job script under its unique module name (once) and
        return the path of the archive."""
        if self._job_script_zip_path:
            return self._job_script_zip_path

        archive_path = os.path.join(self._get_local_tmp_dir(), 'script.zip')
        arcname = self._job_script_module_name() + '.py'

        log.debug('archiving %s -> %s as %s' % (
            self._script_path, archive_path, arcname))
        with _create_zip_file(archive_path) as archive:
            archive.write(self._script_path, arcname=arcname)

        self._job_script_zip_path = archive_path
        return self._job_script_zip_path

    def _py_files(self):
        """Patch in :py:attr:`_job_script_zip_path`, if running streaming
        steps."""
        result = super(SparkMRJobRunner, self)._py_files()

        if self._has_streaming_steps():
            result.append(self._create_job_script_zip())

        return result

    # running the job

    def _run_steps_on_spark(self):
        """Run each group of steps as a single spark-submit invocation."""
        all_steps = self._get_steps()
        total = len(all_steps)

        for group in self._group_steps(all_steps):
            first_num = group['step_num']
            final_num = first_num + len(group['steps']) - 1

            # the Spark harness can run several streaming steps in one job
            if first_num != final_num:
                step_desc = 'steps %d-%d' % (first_num + 1, final_num + 1)
            else:
                step_desc = 'step %d' % (first_num + 1)

            log.info('Running %s of %d' % (step_desc, total))

            self._run_step_on_spark(group['steps'][0], first_num, final_num)

    def _group_steps(self, steps):
        """Group streaming steps together.

        Returns a list of dicts with:

        type -- shared type of steps
        steps -- list of steps in group
        step_num -- (0-indexed) number of first step
        """
        groups = []

        for step_num, step in enumerate(steps):
            # a streaming step joins the previous group when that group is
            # also streaming and its first step has the same jobconf
            joins_previous = (
                step['type'] == 'streaming' and
                bool(groups) and
                groups[-1]['type'] == 'streaming' and
                step.get('jobconf') ==
                groups[-1]['steps'][0].get('jobconf'))

            if joins_previous:
                groups[-1]['steps'].append(step)
                continue

            # otherwise, start a new step group
            groups.append(dict(
                type=step['type'],
                steps=[step],
                step_num=step_num))

        return groups

    def _run_step_on_spark(self, step, step_num, last_step_num=None):
        """Run steps *step_num* through *last_step_num* with spark-submit,
        collect their counters, and raise
        :py:class:`StepFailedException` on a non-zero return code."""
        if self._opts['upload_archives'] and self._spark_master() != 'yarn':
            log.warning('Spark master %r will probably ignore archives' %
                        self._spark_master())

        submit_args = self._args_for_spark_step(step_num, last_step_num)

        step_env = dict(os.environ)
        step_env.update(self._spark_cmdenv(step_num))

        returncode = self._run_spark_submit(
            submit_args, step_env, record_callback=_log_log4j_record)

        parsed = None
        if step['type'] == 'streaming':
            part_glob = self.fs.join(
                self._counter_output_dir(step_num), 'part-*')
            raw_json = b''.join(self.fs.cat(part_glob))

            if raw_json.strip():
                # json.loads() on Python 3.4/3.5 can't take bytes
                parsed = json.loads(to_unicode(raw_json))

        if isinstance(parsed, list):
            self._counters.extend(parsed)

            # desc_num is 1-indexed user-readable step num
            for desc_num, step_counters in enumerate(
                    parsed, start=(step_num + 1)):
                if not step_counters:
                    continue
                log.info(_format_counters(
                    step_counters,
                    desc=('Counters for step %d' % desc_num)))

        # non-streaming steps have no counters, so pad self._counters
        # with empty dicts until it covers every step run so far
        wanted = (last_step_num or step_num) + 1
        self._counters.extend(
            {} for _ in range(wanted - len(self._counters)))

        if not returncode:
            return

        reason = str(CalledProcessError(returncode, submit_args))
        raise StepFailedException(
            reason=reason, step_num=step_num,
            last_step_num=last_step_num,
            num_steps=self._num_steps())

    def _spark_script_path(self, step_num):
        """For streaming steps, return the path of the harness script
        (and handle other spark step types the usual way)."""
        if self._get_step(step_num)['type'] == 'streaming':
            return self._spark_harness_path()

        return super(SparkMRJobRunner, self)._spark_script_path(step_num)

    def _spark_script_args(self, step_num, last_step_num=None):
        """Generate spark harness args for streaming steps (and handle
        other spark step types the usual way).
        """
        if last_step_num is None:
            last_step_num = step_num

        steps = self._get_steps()[step_num:last_step_num + 1]

        if steps[0]['type'] != 'streaming':
            return super(SparkMRJobRunner, self)._spark_script_args(
                step_num, last_step_num)

        # positional args: job class, INPUT, OUTPUT.
        # OUTPUT is the output dir for the *last* step
        args = [
            '%s.%s' % (self._job_script_module_name(),
                       self._mrjob_cls.__name__),
            ','.join(self._step_input_uris(step_num)),
            self._step_output_uri(last_step_num),
        ]

        # input/output formats. '' indicates we know there is none
        args += ['--hadoop-input-format', self._hadoop_input_format or '']
        args += ['--hadoop-output-format', self._hadoop_output_format or '']

        args.append(
            '--sort-values' if self._sort_values else '--no-sort-values')

        args += ['--steps-desc', json.dumps(steps)]

        # counters are simulated by writing them to this directory
        args += ['--counter-output-dir', self._counter_output_dir(step_num)]

        # step range
        args += ['--first-step-num', str(step_num),
                 '--last-step-num', str(last_step_num)]

        # passthrough args for the job.
        # if on local[*] master, keep file upload args as-is (see #2031)
        passthrough = self._mr_job_extra_args(
            local=not self._spark_executors_have_own_wd())
        if passthrough:
            args += ['--job-args', cmd_line(passthrough)]

        jobconf = self._jobconf_for_step(step_num)

        # output compression
        compress = jobconf_from_dict(
            jobconf, 'mapreduce.output.fileoutputformat.compress')
        codec = jobconf_from_dict(
            jobconf, 'mapreduce.output.fileoutputformat.compress.codec')
        if compress and compress != 'false' and codec:
            args += ['--compression-codec', codec]

        # number of reducers
        reduces = jobconf_from_dict(jobconf, 'mapreduce.job.reduces')
        if reduces and int(reduces) > 0:
            args += ['--num-reducers', str(reduces)]

        # cap on output files
        if self._max_output_files:
            args += ['--max-output-files', str(self._max_output_files)]

        return args

    def _spark_harness_path(self):
        """Where to find the Spark harness."""
        harness_file = mrjob.spark.harness.__file__
        # point at the .py source rather than the compiled .pyc
        return (harness_file[:-1] if harness_file.endswith('.pyc')
                else harness_file)

    # "streaming" steps run on Spark too

    def _has_spark_steps(self):
        """Treat streaming steps as Spark steps."""
        inherited = super(SparkMRJobRunner, self)._has_spark_steps()
        return inherited or self._has_streaming_steps()

    def _has_hadoop_streaming_steps(self):
        """Always false: this runner never runs "streaming" steps on
        Hadoop."""
        return False

    def _has_streaming_steps(self):
        """Are any of our steps "streaming" steps that would normally run
        on Hadoop Streaming?"""
        for step in self._get_steps():
            if step['type'] == 'streaming':
                return True
        return False

    def _is_pyspark_step(self, step):
        """Treat streaming steps as Spark steps that use Python."""
        inherited = super(SparkMRJobRunner, self)._is_pyspark_step(step)
        return inherited or step['type'] == 'streaming'
class SparkMRJobRunner(MRJobBinRunner):
    """Runs a :py:class:`~mrjob.job.MRJob` on your Spark cluster (with or
    without Hadoop). Invoked when you run your job with ``-r spark``.

    See :ref:`running-on-your-spark-cluster` for more information.

    The Spark runner can also run "classic" MRJobs directly on Spark, without
    using Hadoop streaming. See :ref:`classic-mrjobs-on-spark`.

    .. versionadded:: 0.6.8
    """
    alias = 'spark'

    # other than ``spark_*``, these options are only used for filesystems
    #
    # max_output_files doesn't appear here because it can only be read from
    # the command line, not mrjob.conf (see #2040)
    OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
        'aws_access_key_id',
        'aws_secret_access_key',
        'aws_session_token',
        'cloud_fs_sync_secs',
        'cloud_part_size_mb',
        'emulate_map_input_file',
        'gcs_region',  # used when creating buckets on GCS
        'hadoop_bin',
        'project_id',  # used by GCS filesystem
        's3_endpoint',
        's3_region',  # used when creating buckets on S3
        'spark_deploy_mode',
        'spark_master',
        'spark_tmp_dir',  # where to put temp files in Spark
    }

    # everything except Hadoop JARs
    # streaming jobs will be run using mrjob/spark/harness.py (see #1972)
    _STEP_TYPES = {
        'spark', 'spark_jar', 'spark_script', 'streaming',
    }

    def __init__(self, max_output_files=None, mrjob_cls=None, **kwargs):
        """Create a Spark runner.

        :param max_output_files: limit on number of output files when
                                 running streaming jobs. Can only be
                                 set on command line (not config file)
        :param mrjob_cls: class of the job you want to run. Used for
                          running streaming steps in Spark
        """
        # need to set this before checking steps in superclass __init__()
        self._mrjob_cls = mrjob_cls

        super(SparkMRJobRunner, self).__init__(**kwargs)

        self._max_output_files = max_output_files

        # only sanity-check spark_tmp_dir when the user actually set it
        if self._opts['spark_tmp_dir']:
            self._check_spark_tmp_dir_opt()

        self._spark_tmp_dir = self._pick_spark_tmp_dir()

        # where local files are uploaded into Spark
        if is_uri(self._spark_tmp_dir):
            spark_files_dir = posixpath.join(self._spark_tmp_dir, 'files', '')
            self._upload_mgr = UploadDirManager(spark_files_dir)

        # where to put job output (if not set explicitly)
        if not self._output_dir:
            self._output_dir = self.fs.join(self._spark_tmp_dir, 'output')

        # keep track of where the spark-submit binary is
        self._spark_submit_bin = self._opts['spark_submit_bin']

        # where to store a .zip file containing the MRJob, with a unique
        # module name
        self._job_script_zip_path = None

        # counters, one per job step. (Counters will be {} for non-streaming
        # steps because Spark doesn't have counters).
        self._counters = []

        # TODO: we may eventually want log interpretation, but it shouldn't
        # include counters, as they are not found in logs.

    def _check_spark_tmp_dir_opt(self):
        """Log a warning if the *spark_tmp_dir* option may not be visible
        to Spark executors (see #2062).

        A warning is issued when exactly one of "the temp dir's URI starts
        with ``file://``" and "the Spark master starts with ``local``"
        is true.
        """
        tmp_dir_is_local = to_uri(
            self._opts['spark_tmp_dir']).startswith('file://')

        # treat an unset master as 'local', exactly like
        # _pick_spark_tmp_dir() does, so that a missing master can't
        # crash __init__() with an AttributeError
        master = self._spark_master() or 'local'
        spark_master_is_local = master.startswith('local')

        if tmp_dir_is_local != spark_master_is_local:
            log.warning(
                'Warning: executors on Spark master %s may not be able to'
                ' access spark_tmp_dir %s' %
                (master, self._opts['spark_tmp_dir']))

    def _check_step(self, step, step_num):
        """Don't try to run steps that include commands or use manifests.

        :raises NotImplementedError: if the step uses an input manifest,
                                     or is a streaming step whose mapper,
                                     combiner or reducer has a ``command``
                                     or ``pre_filter``
        :raises ValueError: if the step is a streaming step and no
                            *mrjob_cls* was passed to the constructor
        """
        super(SparkMRJobRunner, self)._check_step(step, step_num)

        if step.get('input_manifest'):
            raise NotImplementedError(
                'spark runner does not support input manifests')

        # we don't currently support commands, but we *could* (see #1956).
        if step['type'] == 'streaming':
            if not self._mrjob_cls:
                raise ValueError(
                    'You must set mrjob_cls to run streaming steps')

            for mrc in ('mapper', 'combiner', 'reducer'):
                if step.get(mrc):
                    if 'command' in step[mrc] or 'pre_filter' in step[mrc]:
                        raise NotImplementedError(
                            "step %d's %s runs a command, but spark"
                            " runner does not support commands" % (
                                step_num, mrc))

    def _default_opts(self):
        """Return the superclass's default options combined with a default
        for *cloud_part_size_mb*."""
        return combine_dicts(
            super(SparkMRJobRunner, self)._default_opts(),
            dict(
                cloud_part_size_mb=_DEFAULT_CLOUD_PART_SIZE_MB,
            ),
        )

    def _run(self):
        """Find spark-submit, create setup wrapper scripts, upload local
        files, and run all steps, in that order."""
        self.get_spark_submit_bin()  # find spark-submit up front
        self._create_setup_wrapper_scripts()
        self._upload_local_files()
        self._run_steps_on_spark()

    def _pick_spark_tmp_dir(self):
        """Return the job-specific temp directory to use with Spark.

        Uses the *spark_tmp_dir* option if set; otherwise a local temp
        directory when the Spark master starts with ``local``, or a
        fully qualified HDFS path for any other master.
        """
        if self._opts['spark_tmp_dir']:
            return self.fs.join(self._opts['spark_tmp_dir'], self._job_key)
        else:
            master = self._spark_master() or 'local'
            if master.startswith('local'):  # including local-cluster
                # need a local temp dir
                # add "-spark" so we don't collide with default local temp dir
                return os.path.join(
                    gettempdir(), self._job_key + '-spark')
            else:
                # use HDFS (same default as HadoopJobRunner)
                return posixpath.join(
                    fully_qualify_hdfs_path('tmp/mrjob'), self._job_key)

    def _default_step_output_dir(self):
        """Return the ``step-output`` directory inside the Spark temp
        dir."""
        return self.fs.join(self._spark_tmp_dir, 'step-output')

    def _counter_output_dir(self, step_num):
        """Return the directory inside the Spark temp dir that holds
        counter output for the (0-indexed) step *step_num*."""
        return self.fs.join(
            self._spark_tmp_dir, 'counter-output-step-%d' % step_num)

    def counters(self):
        """Return a deep copy of the list of counters, one entry per
        step."""
        return deepcopy(self._counters)

    @property
    def fs(self):
        """The :py:class:`CompositeFilesystem` for this runner, created
        the first time it is accessed.

        S3 and GCS filesystems are only added when their client libraries
        are installed; Hadoop and local filesystems are always added.
        """
        # Spark supports basically every filesystem there is

        if not self._fs:
            self._fs = CompositeFilesystem()

            if boto3_installed:
                self._fs.add_fs('s3', S3Filesystem(
                    aws_access_key_id=self._opts['aws_access_key_id'],
                    aws_secret_access_key=self._opts['aws_secret_access_key'],
                    aws_session_token=self._opts['aws_session_token'],
                    s3_endpoint=self._opts['s3_endpoint'],
                    s3_region=self._opts['s3_region'],
                ), disable_if=_is_permanent_boto3_error)

            if google_libs_installed:
                self._fs.add_fs('gcs', GCSFilesystem(
                    project_id=self._opts['project_id'],
                    location=self._opts['gcs_region'],
                    object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS,
                ), disable_if=_is_permanent_google_error)

            # Hadoop FS is responsible for all URIs that fall through to it
            self._fs.add_fs('hadoop', HadoopFilesystem(
                self._opts['hadoop_bin']))

            self._fs.add_fs('local', LocalFilesystem())

        return self._fs

    # making mr_job_script visible in Spark

    def _job_script_module_name(self):
        """A unique module name to use with the MRJob script."""
        return re.sub(r'[^\w\d]', '_', self._job_key)

    def _create_job_script_zip(self):
        """Create (if not already created) a ``script.zip`` in the local
        temp dir containing the job script under its unique module name,
        and return the zip file's path."""
        if not self._job_script_zip_path:
            zip_path = os.path.join(self._get_local_tmp_dir(), 'script.zip')
            name_in_zip = self._job_script_module_name() + '.py'

            log.debug('archiving %s -> %s as %s' % (
                self._script_path, zip_path, name_in_zip))
            with _create_zip_file(zip_path) as zip_file:
                zip_file.write(self._script_path, arcname=name_in_zip)

            self._job_script_zip_path = zip_path

        return self._job_script_zip_path

    def _py_files(self):
        """Patch in :py:attr:`_job_script_zip_path`, if running streaming
        steps."""
        py_files = super(SparkMRJobRunner, self)._py_files()

        if self._has_streaming_steps():
            py_files.append(self._create_job_script_zip())

        return py_files

    # running the job

    def _run_steps_on_spark(self):
        """Run the job's steps, one spark-submit invocation per group
        returned by :py:meth:`_group_steps`."""
        steps = self._get_steps()

        for group in self._group_steps(steps):
            step_num = group['step_num']
            last_step_num = step_num + len(group['steps']) - 1

            # the Spark harness can run several streaming steps in one job
            if step_num == last_step_num:
                step_desc = 'step %d' % (step_num + 1)
            else:
                step_desc = 'steps %d-%d' % (step_num + 1, last_step_num + 1)

            log.info('Running %s of %d' % (step_desc, len(steps)))

            self._run_step_on_spark(group['steps'][0], step_num, last_step_num)

    def _group_steps(self, steps):
        """Group streaming steps together.

        Consecutive streaming steps are merged into one group as long as
        each has the same ``jobconf`` as the group's first step.

        :return: a list of dicts with:

                 * ``type`` -- shared type of steps
                 * ``steps`` -- list of steps in group
                 * ``step_num`` -- (0-indexed) number of first step
        """
        groups = []

        for step_num, step in enumerate(steps):
            # should we add *step* to existing group of streaming steps?
            if (step['type'] == 'streaming' and groups and
                    groups[-1]['type'] == 'streaming' and
                    step.get('jobconf') ==
                    groups[-1]['steps'][0].get('jobconf')):
                groups[-1]['steps'].append(step)
            else:
                # start a new step group
                groups.append(dict(
                    type=step['type'],
                    steps=[step],
                    step_num=step_num))

        return groups

    def _run_step_on_spark(self, step, step_num, last_step_num=None):
        """Run one group of steps with spark-submit and record counters.

        :param step: first step of the group (only its type is examined)
        :param step_num: (0-indexed) number of the first step
        :param last_step_num: (0-indexed) number of the last step, if more
                              than one step is being run
        :raises StepFailedException: if spark-submit returns non-zero
        """
        if self._opts['upload_archives'] and self._spark_master() != 'yarn':
            log.warning('Spark master %r will probably ignore archives' %
                        self._spark_master())

        spark_submit_args = self._args_for_spark_step(step_num, last_step_num)

        env = dict(os.environ)
        env.update(self._spark_cmdenv(step_num))

        returncode = self._run_spark_submit(spark_submit_args, env,
                                            record_callback=_log_log4j_record)

        # counters are read before the return code is checked, so whatever
        # was written gets recorded even if the step then fails
        counters = None
        if step['type'] == 'streaming':
            counter_file = self.fs.join(
                self._counter_output_dir(step_num), 'part-*')
            counter_json = b''.join(self.fs.cat(counter_file))
            if counter_json.strip():
                # json.loads() on Python 3.4/3.5 can't take bytes
                counters = json.loads(to_unicode(counter_json))

        if isinstance(counters, list):
            self._counters.extend(counters)

            # desc_num is 1-indexed user-readable step num
            for desc_num, counter_dict in enumerate(
                    counters, start=(step_num + 1)):
                if counter_dict:
                    log.info(_format_counters(
                        counter_dict,
                        desc=('Counters for step %d' % desc_num)))

        # for non-streaming steps, there are no counters.
        # pad self._counters to match number of steps
        while len(self._counters) < (last_step_num or step_num) + 1:
            self._counters.append({})

        if returncode:
            reason = str(CalledProcessError(returncode, spark_submit_args))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                last_step_num=last_step_num,
                num_steps=self._num_steps())

    def _spark_script_path(self, step_num):
        """For streaming steps, return the path of the harness script
        (and handle other spark step types the usual way)."""
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._spark_harness_path()
        else:
            return super(SparkMRJobRunner, self)._spark_script_path(step_num)

    def _spark_script_args(self, step_num, last_step_num=None):
        """Generate spark harness args for streaming steps (and handle
        other spark step types the usual way).

        :param step_num: (0-indexed) number of the first step
        :param last_step_num: (0-indexed) number of the last step;
                              defaults to *step_num*
        :return: list of command-line args for the script
        """
        if last_step_num is None:
            last_step_num = step_num

        steps = self._get_steps()[step_num:last_step_num + 1]

        if steps[0]['type'] != 'streaming':
            return super(SparkMRJobRunner, self)._spark_script_args(
                step_num, last_step_num)

        args = []

        # class name
        args.append('%s.%s' % (self._job_script_module_name(),
                               self._mrjob_cls.__name__))

        # INPUT
        args.append(
            ','.join(self._step_input_uris(step_num)))

        # OUTPUT
        # note that we use the output dir for the *last* step
        args.append(
            self._step_output_uri(last_step_num))

        # --hadoop-input-format. Pass '' to indicate we know there is none
        args.extend(['--hadoop-input-format',
                     self._hadoop_input_format or ''])

        # --hadoop-output-format. Pass '' to indicate we know there is none
        args.extend(['--hadoop-output-format',
                     self._hadoop_output_format or ''])

        # --sort-values
        if self._sort_values:
            args.append('--sort-values')
        else:
            args.append('--no-sort-values')

        # --steps-desc
        args.extend(['--steps-desc', json.dumps(steps)])

        # --counter-output-dir, to simulate counters
        args.extend(['--counter-output-dir',
                     self._counter_output_dir(step_num)])

        # --first-step-num, --last-step-num (step range)
        args.extend(['--first-step-num', str(step_num),
                     '--last-step-num', str(last_step_num)])

        # --job-args (passthrough args)

        # if on local[*] master, keep file upload args as-is (see #2031)
        job_args = self._mr_job_extra_args(
            local=not self._spark_executors_have_own_wd())

        if job_args:
            args.extend(['--job-args', cmd_line(job_args)])

        # --compression-codec
        jobconf = self._jobconf_for_step(step_num)

        compress_conf = jobconf_from_dict(
            jobconf, 'mapreduce.output.fileoutputformat.compress')
        codec_conf = jobconf_from_dict(
            jobconf, 'mapreduce.output.fileoutputformat.compress.codec')

        if compress_conf and compress_conf != 'false' and codec_conf:
            args.extend(['--compression-codec', codec_conf])

        # --num-reducers
        num_reducers = jobconf_from_dict(jobconf, 'mapreduce.job.reduces')

        if num_reducers and int(num_reducers) > 0:
            args.extend(['--num-reducers', str(num_reducers)])

        # --max-output-files
        if self._max_output_files:
            args.extend(['--max-output-files',
                         str(self._max_output_files)])

        # --emulate-map-input-file
        if self._opts['emulate_map_input_file']:
            args.append('--emulate-map-input-file')

        return args

    def _spark_harness_path(self):
        """Where to find the Spark harness."""
        path = mrjob.spark.harness.__file__
        # use the .py source, not the compiled .pyc
        if path.endswith('.pyc'):
            path = path[:-1]
        return path

    # "streaming" steps run on Spark too

    def _has_spark_steps(self):
        """Treat streaming steps as Spark steps."""
        return (super(SparkMRJobRunner, self)._has_spark_steps() or
                self._has_streaming_steps())

    def _has_hadoop_streaming_steps(self):
        """Always return False."""
        # the Spark runner doesn't run "streaming" steps on Hadoop
        return False

    def _has_streaming_steps(self):
        """Are any of our steps "streaming" steps that would normally run
        on Hadoop Streaming?"""
        return any(step['type'] == 'streaming'
                   for step in self._get_steps())

    def _is_pyspark_step(self, step):
        """Treat streaming steps as Spark steps that use Python."""
        return (super(SparkMRJobRunner, self)._is_pyspark_step(step) or
                step['type'] == 'streaming')
class SparkMRJobRunner(MRJobBinRunner):
    """Runs a :py:class:`~mrjob.job.MRJob` on your Spark cluster (with or
    without Hadoop). Invoked when you run your job with ``-r spark``.
    """
    alias = 'spark'

    # Apart from the ``spark_*`` options, everything here exists only to
    # configure filesystems.
    OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
        'aws_access_key_id',
        'aws_secret_access_key',
        'aws_session_token',
        'cloud_fs_sync_secs',
        'cloud_part_size_mb',
        'google_project_id',  # used by GCS filesystem
        'hadoop_bin',
        's3_endpoint',
        's3_region',  # only used along with s3_endpoint
        'spark_deploy_mode',
        'spark_master',
        'spark_tmp_dir',  # where to put temp files in Spark
    }

    # Every step type other than Hadoop JARs.
    # Streaming jobs will be run using mrjob_spark_harness.py (see #1972)
    _STEP_TYPES = {
        'spark',
        'spark_jar',
        'spark_script',
        # 'streaming',
    }

    def __init__(self, **kwargs):
        """Create a Spark runner; all keyword args go to the superclass."""
        super(SparkMRJobRunner, self).__init__(**kwargs)

        tmp_dir = self._pick_spark_tmp_dir()
        self._spark_tmp_dir = tmp_dir

        # local files only get uploaded when the temp dir is a URI
        if is_uri(tmp_dir):
            self._upload_mgr = UploadDirManager(
                posixpath.join(tmp_dir, 'files', ''))

        # default location for job output, unless one was set explicitly
        if not self._output_dir:
            self._output_dir = posixpath.join(tmp_dir, 'output')

        # remember where the spark-submit binary is
        self._spark_submit_bin = self._opts['spark_submit_bin']

    def _default_opts(self):
        """Superclass defaults plus defaults for *spark_master* and
        *spark_deploy_mode*."""
        inherited = super(SparkMRJobRunner, self)._default_opts()
        own = dict(
            spark_master='local[*]',
            spark_deploy_mode='client',
        )
        return combine_dicts(inherited, own)

    def _run(self):
        """Locate spark-submit, prepare and upload files, then run every
        step."""
        self.get_spark_submit_bin()  # find spark-submit up front
        self._create_setup_wrapper_scripts()
        self._add_job_files_for_upload()
        self._upload_local_files()
        self._run_steps_on_spark()

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        if not self._upload_mgr:
            return

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

        # no need to upload py_files, spark-submit handles this

    def _pick_spark_tmp_dir(self):
        """Choose the job-specific temp directory for Spark."""
        tmp_dir_opt = self._opts['spark_tmp_dir']

        if tmp_dir_opt:
            # URIs are joined POSIX-style, anything else with os.path
            join = posixpath.join if is_uri(tmp_dir_opt) else os.path.join
            return join(tmp_dir_opt, self._job_key)

        if self._spark_master_is_local():
            # need a local temp dir. The "-spark" suffix avoids colliding
            # with the default local temp dir
            return os.path.join(gettempdir(), self._job_key + '-spark')

        # use HDFS (same default as HadoopJobRunner)
        return posixpath.join(
            fully_qualify_hdfs_path('tmp/mrjob'), self._job_key)

    def _default_step_output_dir(self):
        """Directory under the Spark temp dir for intermediate step
        output."""
        return posixpath.join(self._spark_tmp_dir, 'step-output')

    @property
    def fs(self):
        """The composite filesystem for this runner, built on first
        access."""
        if self._fs:
            return self._fs

        # Spark supports basically every filesystem there is
        self._fs = composite = CompositeFilesystem()

        if boto3_installed:
            s3_fs = S3Filesystem(
                aws_access_key_id=self._opts['aws_access_key_id'],
                aws_secret_access_key=self._opts['aws_secret_access_key'],
                aws_session_token=self._opts['aws_session_token'],
                s3_endpoint=self._opts['s3_endpoint'],
                s3_region=self._opts['s3_region'],
            )
            composite.add_fs(
                's3', s3_fs, disable_if=_is_permanent_boto3_error)

        if google_libs_installed:
            gcs_fs = GCSFilesystem(
                project_id=self._opts['google_project_id']
            )
            composite.add_fs(
                'gcs', gcs_fs, disable_if=_is_permanent_google_error)

        composite.add_fs(
            'hadoop', HadoopFilesystem(self._opts['hadoop_bin']))

        composite.add_fs('local', LocalFilesystem())

        return self._fs

    def _upload_local_files(self):
        """Copy every file registered with the upload manager to its
        destination URI."""
        # in local mode, nothing to upload
        if not self._upload_mgr:
            return

        dest_prefix = self._upload_mgr.prefix
        self.fs.mkdir(dest_prefix)

        log.info('Copying local files to %s' % self._upload_mgr.prefix)
        for local_path, dest_uri in self._upload_mgr.path_to_uri().items():
            log.debug(' %s -> %s' % (local_path, dest_uri))
            self.fs.put(local_path, dest_uri)

    def _run_steps_on_spark(self):
        """Run each step of the job in order, one spark-submit each."""
        for num, step in enumerate(self._get_steps()):
            self._run_step_on_spark(step, num)

    def _run_step_on_spark(self, step, step_num):
        """Run the step numbered *step_num* with spark-submit, raising
        :py:class:`StepFailedException` on a non-zero return code."""
        if self._opts['upload_archives'] and self._spark_master() != 'yarn':
            log.warning('Spark master %r will probably ignore archives' %
                        self._spark_master())

        submit_args = self._args_for_spark_step(step_num)

        step_env = dict(os.environ)
        step_env.update(self._spark_cmdenv(step_num))

        returncode = self._run_spark_submit(
            submit_args, step_env, record_callback=_log_log4j_record)

        if not returncode:
            return

        reason = str(CalledProcessError(returncode, submit_args))
        raise StepFailedException(
            reason=reason, step_num=step_num,
            num_steps=self._num_steps())