Example #1
    def test_unknown_uri(self):
        sd = UploadDirManager("hdfs:///")
        sd.add("foo/bar.py")
        self.assertEqual(sd.path_to_uri(), {"foo/bar.py": "hdfs:///bar.py"})
        self.assertEqual(sd.uri("hdfs://host/path/to/bar.py"),
                         "hdfs://host/path/to/bar.py")
        # checking unknown URIs doesn't add them
        self.assertEqual(sd.path_to_uri(), {"foo/bar.py": "hdfs:///bar.py"})
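
These snippets are bare test methods, scraped without their surrounding test
class or imports. A minimal harness for running them is sketched below; it
assumes mrjob is installed and that UploadDirManager is importable from
mrjob.setup, as in recent mrjob releases.

import unittest

from mrjob.setup import UploadDirManager


class UploadDirManagerTestCase(unittest.TestCase):

    def test_unknown_uri(self):
        sd = UploadDirManager('hdfs:///')
        sd.add('foo/bar.py')
        self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
        # URIs the manager doesn't know about pass through unchanged
        self.assertEqual(sd.uri('hdfs://host/path/to/bar.py'),
                         'hdfs://host/path/to/bar.py')


if __name__ == '__main__':
    unittest.main()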
Example #2
    def test_unknown_uri(self):
        sd = UploadDirManager('hdfs:///')
        sd.add('foo/bar.py')
        self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
        self.assertEqual(sd.uri('hdfs://host/path/to/bar.py'),
                         'hdfs://host/path/to/bar.py')
        # checking unknown URIs doesn't add them
        self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
Example #3
    def test_hidden_file_name_collision(self):
        sd = UploadDirManager('hdfs:///')
        sd.add('foo/_bar.py')
        sd.add('_bar.py')
        self.assertEqual(sd.path_to_uri(),
                         {'foo/_bar.py': 'hdfs:///bar.py',
                          '_bar.py': 'hdfs:///bar-1.py'})
Example #4
    def test_name_collision(self):
        sd = UploadDirManager('hdfs:///')
        sd.add('foo/bar.py')
        sd.add('bar.py')
        self.assertEqual(sd.path_to_uri(),
                         {'foo/bar.py': 'hdfs:///bar.py',
                          'bar.py': 'hdfs:///bar-1.py'})
Example #5
    def test_underscores_only(self):
        sd = UploadDirManager('hdfs:///')
        sd.add('_')
        sd.add('_.txt')

        self.assertEqual(sd.path_to_uri(),
                         {'_': 'hdfs:///1',
                          '_.txt': 'hdfs:///1.txt'})
Example #6
    def test_unhide_files(self):
        # avoid giving names to files that Hadoop will ignore as input
        sd = UploadDirManager('hdfs:///')
        sd.add('.foo.log')
        sd.add('_bar.txt')
        self.assertEqual(sd.path_to_uri(),
                         {'.foo.log': 'hdfs:///foo.log',
                          '_bar.txt': 'hdfs:///bar.txt'})
Example #7
    def test_dot_underscore(self):
        sd = UploadDirManager('hdfs:///')

        sd.add('._')
        sd.add('._.txt')
        sd.add('._foo')

        self.assertEqual(sd.path_to_uri(),
                         {'._': 'hdfs:///1',
                          '._.txt': 'hdfs:///1.txt',
                          '._foo': 'hdfs:///foo'})
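
Taken together, the tests above pin down the naming rules: leading '.' and '_'
characters are stripped so Hadoop won't silently ignore the uploaded files, a
name whose stem is left empty falls back to '1', and name collisions get a
'-1', '-2', ... suffix. The sketch below reimplements just those rules to make
them concrete; it is written against the expectations in these tests, not
against mrjob's real UploadDirManager, whose internals may differ.

import posixpath


class SimpleUploadDirManager(object):
    """Illustrative reimplementation of the naming rules exercised above."""

    def __init__(self, prefix):
        self.prefix = prefix
        self._path_to_name = {}   # local path -> unique basename
        self._names_taken = set()

    def add(self, path):
        """Assign *path* a unique, 'unhidden' basename (idempotent)."""
        if path not in self._path_to_name:
            name = self._unique(self._unhide(posixpath.basename(path)))
            self._path_to_name[path] = name
            self._names_taken.add(name)
        return self.uri(path)

    def uri(self, path):
        """URI for *path* in the upload dir; unknown paths pass through."""
        if path in self._path_to_name:
            return self.prefix + self._path_to_name[path]
        return path

    def path_to_uri(self):
        return dict((path, self.prefix + name)
                    for path, name in self._path_to_name.items())

    def _unhide(self, name):
        # Hadoop ignores input files whose names start with '.' or '_', so
        # strip those characters; if nothing is left of the stem, use '1'
        stem, ext = posixpath.splitext(name)
        return (stem.lstrip('._') or '1') + ext

    def _unique(self, name):
        # append -1, -2, ... until the name no longer collides
        if name not in self._names_taken:
            return name
        stem, ext = posixpath.splitext(name)
        suffix = 1
        while '%s-%d%s' % (stem, suffix, ext) in self._names_taken:
            suffix += 1
        return '%s-%d%s' % (stem, suffix, ext)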
Example #8
class DataprocJobRunner(HadoopInTheCloudJobRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` on Google Cloud Dataproc.
    Invoked when you run your job with ``-r dataproc``.

    :py:class:`DataprocJobRunner` runs your job in a Dataproc cluster, which
    is basically a temporary Hadoop cluster.

    Input, support, and jar files can be either local or on GCS; use
    ``gs://...`` URLs to refer to files on GCS.

    This class has some useful utilities for talking directly to GCS and
    Dataproc, so you may find it useful to instantiate it without a script::

        from mrjob.dataproc import DataprocJobRunner
        ...
    """
    alias = 'dataproc'

    OPT_NAMES = HadoopInTheCloudJobRunner.OPT_NAMES | {
        'gcp_project',
    }

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same
        arguments as
        :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(DataprocJobRunner, self).__init__(**kwargs)

        # Dataproc requires a master and >= 2 core instances.
        # num_core_instances refers ONLY to the number of CORE instances; it
        # does NOT include the required master instance. In other words, the
        # minimum cluster is 3 machines: 1 master and 2 core
        # ("num_core_instances") workers.
        if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS:
            raise DataprocException('Dataproc expects at LEAST %d workers' %
                                    _DATAPROC_MIN_WORKERS)

        if (self._opts['core_instance_type'] !=
                self._opts['task_instance_type']):
            raise DataprocException(
                'Dataproc v1 expects core/task instance types to be identical')

        # Lazy-load gcloud config as needed - gcloud invocations fail when
        # running under the PyCharm debugger
        self._gcloud_config = None

        # Google Cloud Platform - project
        self._gcp_project = (self._opts['gcp_project']
                             or self.gcloud_config()['core.project'])

        # Google Compute Engine - Region / Zone
        self._gce_region = (self._opts['region']
                            or self.gcloud_config()['compute.region'])
        self._gce_zone = (self._opts['zone']
                          or self.gcloud_config()['compute.zone'])

        # cluster_id can be None here
        self._cluster_id = self._opts['cluster_id']

        self._api_client = None
        self._gcs_fs = None
        self._fs = None

        # BEGIN - setup directories
        base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir'])

        self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir)

        # use job key to make a unique tmp dir
        self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/'

        # pick/validate output dir
        if self._output_dir:
            self._output_dir = _check_and_fix_fs_dir(self._output_dir)
        else:
            self._output_dir = self._job_tmpdir + 'output/'
        # END - setup directories

        # manage local files that we want to upload to GCS. We'll add them
        # to this manager just before we need them.
        fs_files_dir = self._job_tmpdir + 'files/'
        self._upload_mgr = UploadDirManager(fs_files_dir)

        # when did our particular task start?
        self._dataproc_job_start = None

        # init hadoop, image version caches
        self._image_version = None
        self._hadoop_version = None

        # This will be filled by _run_steps()
        # NOTE - log_interpretations will be empty except job_id until we
        # parse task logs
        self._log_interpretations = []

    def _default_opts(self):
        return combine_dicts(
            super(DataprocJobRunner, self)._default_opts(),
            dict(
                bootstrap_python=True,
                check_cluster_every=_DEFAULT_CHECK_CLUSTER_EVERY,
                cleanup=['CLUSTER', 'JOB', 'LOCAL_TMP'],
                cloud_fs_sync_secs=_DEFAULT_CLOUD_FS_SYNC_SECS,
                image_version=_DEFAULT_IMAGE_VERSION,
                instance_type=_DEFAULT_INSTANCE_TYPE,
                master_instance_type=_DEFAULT_INSTANCE_TYPE,
                num_core_instances=_DATAPROC_MIN_WORKERS,
                num_task_instances=0,
                sh_bin=['/bin/sh', '-ex'],
            ))

    def gcloud_config(self):
        """Lazy load gcloud SDK configs"""
        if not self._gcloud_config:
            self._gcloud_config = _read_gcloud_config()

        return self._gcloud_config

    @property
    def api_client(self):
        if not self._api_client:
            credentials = GoogleCredentials.get_application_default()

            api_client = discovery.build(_DATAPROC_API_ENDPOINT,
                                         _DATAPROC_API_VERSION,
                                         credentials=credentials)
            self._api_client = api_client.projects().regions()

        return self._api_client

    @property
    def fs(self):
        """:py:class:`~mrjob.fs.base.Filesystem` object for SSH, S3, GCS, and
        the local filesystem.
        """
        if self._fs is not None:
            return self._fs

        self._gcs_fs = GCSFilesystem()

        self._fs = CompositeFilesystem(self._gcs_fs, LocalFilesystem())
        return self._fs

    def _get_tmpdir(self, given_tmpdir):
        """Helper for _fix_tmpdir"""
        if given_tmpdir:
            return given_tmpdir

        mrjob_buckets = self.fs.list_buckets(self._gcp_project,
                                             prefix='mrjob-')

        # Loop over buckets until we find one that matches region
        # NOTE - because this is a tmpdir, we look for a GCS bucket in the
        # same GCE region
        chosen_bucket_name = None
        gce_lower_location = self._gce_region.lower()
        for tmp_bucket in mrjob_buckets:
            tmp_bucket_name = tmp_bucket['name']

            # NOTE - ambiguous GCP behavior - bucket location is returned in
            # UPPERCASE even though the docs suggest lowercase (ticket filed
            # as of Apr 23, 2016)
            lower_location = tmp_bucket['location'].lower()
            if lower_location == gce_lower_location:
                # Regions are both specified and match
                log.info("using existing temp bucket %s" % tmp_bucket_name)
                chosen_bucket_name = tmp_bucket_name
                break

        # Example default - "mrjob-us-central1-RANDOMHEX"
        if not chosen_bucket_name:
            chosen_bucket_name = '-'.join(
                ['mrjob', gce_lower_location,
                 random_identifier()])

        return 'gs://%s/tmp/' % chosen_bucket_name

    def _run(self):
        self._launch()
        self._run_steps()

    def _launch(self):
        self._prepare_for_launch()
        self._launch_cluster()

    def _prepare_for_launch(self):
        self._check_output_not_exists()
        self._create_setup_wrapper_script()
        self._add_bootstrap_files_for_upload()
        self._add_job_files_for_upload()
        self._upload_local_files_to_fs()

    def _check_output_not_exists(self):
        """Verify the output path does not already exist. This avoids
        provisioning a cluster only to have Hadoop refuse to launch.
        """
        if self.fs.exists(self._output_dir):
            raise IOError('Output path %s already exists!' %
                          (self._output_dir, ))

    def _add_bootstrap_files_for_upload(self):
        """Add files needed by the bootstrap script to self._upload_mgr.

        Tar up mrjob if bootstrap_mrjob is True.

        Create the master bootstrap script if necessary.

        """
        # lazily create mrjob.zip
        if self._bootstrap_mrjob():
            self._create_mrjob_zip()
            self._bootstrap_dir_mgr.add('file', self._mrjob_zip_path)

        # all other files needed by the script are already in
        # _bootstrap_dir_mgr
        for path in self._bootstrap_dir_mgr.paths():
            self._upload_mgr.add(path)

        # now that we know where the above files live, we can create
        # the master bootstrap script
        self._create_master_bootstrap_script_if_needed()
        if self._master_bootstrap_script_path:
            self._upload_mgr.add(self._master_bootstrap_script_path)
            self._upload_mgr.add(_MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH)

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

        # TODO - mtai @ davidmarin - hadoop_streaming_jar is currently ignored,
        # see _HADOOP_STREAMING_JAR_URI
        # if self._opts['hadoop_streaming_jar']:
        #     self._upload_mgr.add(self._opts['hadoop_streaming_jar'])

        for step in self._get_steps():
            if step.get('jar'):
                self._upload_mgr.add(step['jar'])

    def _upload_local_files_to_fs(self):
        """Copy local files tracked by self._upload_mgr to FS."""
        bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
        self._create_fs_tmp_bucket(bucket_name)

        log.info('Copying non-input files into %s' % self._upload_mgr.prefix)

        for path, gcs_uri in self._upload_mgr.path_to_uri().items():
            log.debug('uploading %s -> %s' % (path, gcs_uri))

            # TODO - mtai @ davidmarin - Implement put function for other FSs
            self.fs.put(path, gcs_uri)

        self._wait_for_fs_sync()

    def _create_fs_tmp_bucket(self, bucket_name, location=None):
        """Create a temp bucket if missing

        Tie the temporary bucket to the same region as the GCE job and set a
        28-day TTL
        """
        # Return early if our bucket already exists
        try:
            self.fs.get_bucket(bucket_name)
            return
        except google_errors.HttpError as e:
            if not e.resp.status == 404:
                raise

        log.info('creating FS bucket %r' % bucket_name)

        location = location or self._gce_region

        # NOTE - By default, we create a bucket in the same GCE region as our
        # job (tmp buckets ONLY)
        # https://cloud.google.com/storage/docs/bucket-locations
        self.fs.create_bucket(
            self._gcp_project,
            bucket_name,
            location=location,
            object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS)

        self._wait_for_fs_sync()

    ### Running the job ###
    def cleanup(self, mode=None):
        super(DataprocJobRunner, self).cleanup(mode=mode)

        # stop the cluster if it belongs to us (it may have stopped on its
        # own already, but that's fine)
        if self._cluster_id and not self._opts['cluster_id']:
            self._cleanup_cluster()

    def _cleanup_cloud_tmp(self):
        # delete all the files we created
        if not self._job_tmpdir:
            return

        try:
            log.info('Removing all files in %s' % self._job_tmpdir)
            self.fs.rm(self._job_tmpdir)
            self._job_tmpdir = None
        except Exception as e:
            log.exception(e)

    # TODO - mtai @ davidmarin - Re-enable log support and supporting cleanup
    def _cleanup_logs(self):
        super(DataprocJobRunner, self)._cleanup_logs()

    def _cleanup_job(self):
        job_prefix = self._dataproc_job_prefix()
        for current_job in self._api_job_list(cluster_name=self._cluster_id,
                                              state_matcher='ACTIVE'):
            # Kill all active jobs with the same job_prefix as this job
            current_job_id = current_job['reference']['jobId']

            if not current_job_id.startswith(job_prefix):
                continue

            self._api_job_cancel(current_job_id)
            self._wait_for_api('job cancellation')

    def _cleanup_cluster(self):
        if not self._cluster_id:
            # If we don't have a cluster, then we can't terminate it.
            return

        try:
            log.info("Attempting to terminate cluster")
            self._api_cluster_delete(self._cluster_id)
        except Exception as e:
            log.exception(e)
            return
        log.info('cluster %s successfully terminated' % self._cluster_id)

    def _wait_for_api(self, msg):
        _wait_for(msg, self._opts['check_cluster_every'])

    def _wait_for_fs_sync(self):
        """Sleep for a little while, to give FS a chance to sync up.
        """
        _wait_for('GCS sync (eventual consistency)',
                  self._opts['cloud_fs_sync_secs'])

    def _build_dataproc_hadoop_job(self, step_num):
        """This function creates a "HadoopJob" to be passed to
        self._api_job_submit_hadoop

        :param step_num:
        :return: output_hadoop_job
        """
        # Reference: https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa

        args = list()
        file_uris = list()
        archive_uris = list()
        properties = dict()

        step = self._get_step(step_num)

        assert step['type'] in ('streaming', 'jar'), ('Bad step type: %r' %
                                                      (step['type'], ))

        # TODO - mtai @ davidmarin - Might be trivial to support jar running,
        # see "mainJarFileUri" of variable "output_hadoop_job" in this function
        #         https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa

        assert step['type'] == 'streaming', 'Jar not implemented'
        main_jar_uri = _HADOOP_STREAMING_JAR_URI

        # TODO - mtai @ davidmarin - Not clear if we should move _upload_args
        # to file_uris, currently works fine as-is

        # TODO - dmarin @ mtai - Probably a little safer to do the API's way,
        # assuming the API supports distributed cache syntax (so we can pick
        # the names of the uploaded files).
        args.extend(self._upload_args())

        args.extend(self._hadoop_args_for_step(step_num))

        mapper, combiner, reducer = (self._hadoop_streaming_commands(step_num))

        if mapper:
            args += ['-mapper', mapper]

        if combiner:
            args += ['-combiner', combiner]

        if reducer:
            args += ['-reducer', reducer]

        for current_input_uri in self._step_input_uris(step_num):
            args += ['-input', current_input_uri]

        args += ['-output', self._step_output_uri(step_num)]

        # TODO - mtai @ davidmarin - Add back support to specify a different
        # mainJarFileURI
        output_hadoop_job = dict(args=args,
                                 fileUris=file_uris,
                                 archiveUris=archive_uris,
                                 properties=properties,
                                 mainJarFileUri=main_jar_uri)
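        # For a typical one-step streaming job, output_hadoop_job ends up
        # looking roughly like this (illustrative placeholder values only;
        # the exact args depend on _upload_args() and the step's commands):
        #
        #   dict(args=['-files', 'gs://bucket/tmp/key/files/mr_job.py#mr_job.py',
        #              '-mapper', 'python mr_job.py --step-num=0 --mapper',
        #              '-reducer', 'python mr_job.py --step-num=0 --reducer',
        #              '-input', 'gs://bucket/tmp/key/files/input.txt',
        #              '-output', 'gs://bucket/tmp/key/output/'],
        #        fileUris=[], archiveUris=[], properties={},
        #        mainJarFileUri=_HADOOP_STREAMING_JAR_URI)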
        return output_hadoop_job

    def _launch_cluster(self):
        """Create an empty cluster on Dataproc, and set self._cluster_id to
        its ID."""
        bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
        self._create_fs_tmp_bucket(bucket_name)

        # clusterName must be a match of
        # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).'
        # as documented in an API error message
        # (not currently documented in the Dataproc docs)
        if not self._cluster_id:
            self._cluster_id = '-'.join(
                ['mrjob', self._gce_zone.lower(),
                 random_identifier()])

        # Create the cluster if it's missing, otherwise join an existing one
        try:
            self._api_cluster_get(self._cluster_id)
            log.info('Adding job to existing cluster - %s' % self._cluster_id)
        except google_errors.HttpError as e:
            if not e.resp.status == 404:
                raise

            log.info('Creating Dataproc Hadoop cluster - %s' %
                     self._cluster_id)

            cluster_data = self._cluster_create_kwargs()

            self._api_cluster_create(cluster_data)

            self._wait_for_cluster_ready(self._cluster_id)

        # keep track of when we launched our job
        self._dataproc_job_start = time.time()
        return self._cluster_id

    def _wait_for_cluster_ready(self, cluster_id):
        # See https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters#State  # noqa
        cluster_state = None

        # Poll until cluster is ready
        while cluster_state not in _DATAPROC_CLUSTER_STATES_READY:
            result_describe = self.api_client.clusters().get(
                projectId=self._gcp_project,
                region=_DATAPROC_API_REGION,
                clusterName=cluster_id).execute()

            cluster_state = result_describe['status']['state']
            if cluster_state in _DATAPROC_CLUSTER_STATES_ERROR:
                raise DataprocException(result_describe)

            self._wait_for_api('cluster to accept jobs')

        assert cluster_state in _DATAPROC_CLUSTER_STATES_READY
        log.info("Cluster %s ready", cluster_id)

        return cluster_id

    def _dataproc_job_prefix(self):
        return _cleanse_gcp_job_id(self._job_key)

    def _run_steps(self):
        """Wait for every step of the job to complete, one by one."""
        total_steps = self._num_steps()
        # run our steps one at a time
        for step_num in range(total_steps):
            job_id = self._launch_step(step_num)

            self._wait_for_step_to_complete(job_id,
                                            step_num=step_num,
                                            num_steps=total_steps)

            log.info('Completed Dataproc Hadoop Job - %s', job_id)

        # After all steps completed, wait for the last output (which is
        # usually written to GCS) to sync
        self._wait_for_fs_sync()

    def _launch_step(self, step_num):
        # Build each step
        hadoop_job = self._build_dataproc_hadoop_job(step_num)

        # Clean-up step name
        step_name = '%s---step-%05d-of-%05d' % (
            self._dataproc_job_prefix(), step_num + 1, self._num_steps())

        # Submit it
        log.info('Submitting Dataproc Hadoop Job - %s', step_name)
        result = self._api_job_submit_hadoop(step_name, hadoop_job)
        log.info('Submitted Dataproc Hadoop Job - %s', step_name)

        job_id = result['reference']['jobId']
        assert job_id == step_name

        return job_id

    def _wait_for_step_to_complete(self,
                                   job_id,
                                   step_num=None,
                                   num_steps=None):
        """Helper for _wait_for_step_to_complete(). Wait for
        step with the given ID to complete, and fetch counters.
        If it fails, attempt to diagnose the error, and raise an
        exception.

        This also adds an item to self._log_interpretations
        """
        log_interpretation = dict(job_id=job_id)
        self._log_interpretations.append(log_interpretation)

        while True:
            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
            job_result = self._api_job_get(job_id)
            job_state = job_result['status']['state']

            log.info('%s => %s' % (job_id, job_state))

            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
            if job_state in _DATAPROC_JOB_STATES_ACTIVE:
                self._wait_for_api('job completion')
                continue

            # we're done, will return at the end of this
            elif job_state == 'DONE':
                break

            raise StepFailedException(step_num=step_num, num_steps=num_steps)

    def _default_step_output_dir(self):
        # put intermediate data in HDFS
        return 'hdfs:///tmp/mrjob/%s/step-output' % self._job_key

    def counters(self):
        # TODO - mtai @ davidmarin - Counters are currently always empty as we
        # are not processing task logs
        return [
            _pick_counters(log_interpretation)
            for log_interpretation in self._log_interpretations
        ]

    ### Cluster Info ###

    def get_hadoop_version(self):
        if self._hadoop_version is None:
            self._store_cluster_info()
        return self._hadoop_version

    def get_image_version(self):
        """Get the version that our cluster is running.
        """
        if self._image_version is None:
            self._store_cluster_info()
        return self._image_version

    def _store_cluster_info(self):
        """Set self._image_version and self._hadoop_version."""
        if not self._cluster_id:
            raise AssertionError('cluster has not yet been created')

        cluster = self._api_cluster_get(self._cluster_id)
        self._image_version = (
            cluster['config']['softwareConfig']['imageVersion'])
        # protect against new versions, including patch versions
        # we didn't explicitly request. See #1428
        self._hadoop_version = map_version(self._image_version,
                                           _DATAPROC_IMAGE_TO_HADOOP_VERSION)

    ### Bootstrapping ###

    def _bootstrap_python(self):
        """Return a (possibly empty) list of parsed commands (in the same
        format as returned by parse_setup_cmd())'"""
        if not self._opts['bootstrap_python']:
            return []

        if PY2:
            # Python 2 is already installed; install pip and dev packages
            return [
                ['sudo apt-get install -y python-pip python-dev'],
            ]
        else:
            return [
                ['sudo apt-get install -y python3 python3-pip python3-dev'],
            ]

    def get_cluster_id(self):
        return self._cluster_id

    def _cluster_create_kwargs(self):
        gcs_init_script_uris = []
        if self._master_bootstrap_script_path:
            gcs_init_script_uris.append(
                self._upload_mgr.uri(self._master_bootstrap_script_path))

            # always add idle termination script
            # add it last, so that we don't count bootstrapping as idle time
            gcs_init_script_uris.append(
                self._upload_mgr.uri(_MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH))

        # NOTE - Cluster initializationActions can only take scripts with no
        # script args, so the auto-term script receives 'mrjob-max-secs-idle'
        # via metadata instead of as an arg
        cluster_metadata = dict()
        cluster_metadata['mrjob-version'] = mrjob.__version__
        cluster_metadata['mrjob-max-secs-idle'] = str(
            int(self._opts['max_mins_idle'] * 60))

        cluster_config = dict(
            gceClusterConfig=dict(
                zoneUri=_gcp_zone_uri(project=self._gcp_project,
                                      zone=self._gce_zone),
                serviceAccountScopes=_DEFAULT_GCE_SERVICE_ACCOUNT_SCOPES,
                metadata=cluster_metadata),
            initializationActions=[
                dict(executableFile=init_script_uri)
                for init_script_uri in gcs_init_script_uris
            ])

        # Master (cluster coordination)
        master_conf = _gcp_instance_group_config(
            project=self._gcp_project,
            zone=self._gce_zone,
            count=1,
            instance_type=self._opts['master_instance_type'])

        # Compute + storage
        worker_conf = _gcp_instance_group_config(
            project=self._gcp_project,
            zone=self._gce_zone,
            count=self._opts['num_core_instances'],
            instance_type=self._opts['core_instance_type'])

        # Compute ONLY
        secondary_worker_conf = _gcp_instance_group_config(
            project=self._gcp_project,
            zone=self._gce_zone,
            count=self._opts['num_task_instances'],
            instance_type=self._opts['task_instance_type'],
            is_preemptible=True)

        cluster_config['masterConfig'] = master_conf
        cluster_config['workerConfig'] = worker_conf
        if self._opts['num_task_instances']:
            cluster_config['secondaryWorkerConfig'] = secondary_worker_conf

        # See - https://cloud.google.com/dataproc/dataproc-versions
        if self._opts['image_version']:
            cluster_config['softwareConfig'] = dict(
                imageVersion=self._opts['image_version'])

        kwargs = dict(projectId=self._gcp_project,
                      clusterName=self._cluster_id,
                      config=cluster_config)

        return self._add_extra_cluster_params(kwargs)

    ### Dataproc-specific Stuff ###

    def _api_cluster_get(self, cluster_id):
        return self.api_client.clusters().get(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            clusterName=cluster_id).execute()

    def _api_cluster_create(self, cluster_data):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/create  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/get  # noqa
        return self.api_client.clusters().create(projectId=self._gcp_project,
                                                 region=_DATAPROC_API_REGION,
                                                 body=cluster_data).execute()

    def _api_cluster_delete(self, cluster_id):
        return self.api_client.clusters().delete(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            clusterName=cluster_id).execute()

    def _api_job_list(self, cluster_name=None, state_matcher=None):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/list#JobStateMatcher  # noqa
        list_kwargs = dict(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
        )
        if cluster_name:
            list_kwargs['clusterName'] = cluster_name

        if state_matcher:
            list_kwargs['jobStateMatcher'] = state_matcher

        list_request = self.api_client.jobs().list(**list_kwargs)
        while list_request:
            try:
                resp = list_request.execute()
            except google_errors.HttpError as e:
                if e.resp.status == 404:
                    return
                raise

            for current_item in resp['items']:
                yield current_item

            list_request = self.api_client.jobs().list_next(list_request, resp)

    def _api_job_get(self, job_id):
        return self.api_client.jobs().get(projectId=self._gcp_project,
                                          region=_DATAPROC_API_REGION,
                                          jobId=job_id).execute()

    def _api_job_cancel(self, job_id):
        return self.api_client.jobs().cancel(projectId=self._gcp_project,
                                             region=_DATAPROC_API_REGION,
                                             jobId=job_id).execute()

    def _api_job_delete(self, job_id):
        return self.api_client.jobs().delete(projectId=self._gcp_project,
                                             region=_DATAPROC_API_REGION,
                                             jobId=job_id).execute()

    def _api_job_submit_hadoop(self, step_name, hadoop_job):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/submit  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobReference  # noqa
        job_data = dict(reference=dict(projectId=self._gcp_project,
                                       jobId=step_name),
                        placement=dict(clusterName=self._cluster_id),
                        hadoopJob=hadoop_job)

        jobs_submit_kwargs = dict(projectId=self._gcp_project,
                                  region=_DATAPROC_API_REGION,
                                  body=dict(job=job_data))
        return self.api_client.jobs().submit(**jobs_submit_kwargs).execute()
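
As the class docstring notes, it can be handy to instantiate this runner on
its own just to talk to GCS and Dataproc. A rough sketch (it assumes gcloud is
configured with a default project/region/zone, that application-default
credentials are available, and that 'gs://my-bucket/tmp/' is a placeholder):

from mrjob.dataproc import DataprocJobRunner

runner = DataprocJobRunner(conf_paths=[])

# the composite filesystem exposed by runner.fs can read GCS directly
for uri in runner.fs.ls('gs://my-bucket/tmp/'):
    print(uri)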
Example #9
    def test_add_is_idempotent(self):
        sd = UploadDirManager("hdfs:///")
        sd.add("foo/bar.py")
        self.assertEqual(sd.path_to_uri(), {"foo/bar.py": "hdfs:///bar.py"})
        sd.add("foo/bar.py")
        self.assertEqual(sd.path_to_uri(), {"foo/bar.py": "hdfs:///bar.py"})
Example #10
    def test_name_collision(self):
        sd = UploadDirManager("hdfs:///")
        sd.add("foo/bar.py")
        sd.add("bar.py")
        self.assertEqual(sd.path_to_uri(),
                         {"foo/bar.py": "hdfs:///bar.py",
                          "bar.py": "hdfs:///bar-1.py"})
Example #11
class SparkMRJobRunner(MRJobBinRunner):
    """Runs a :py:class:`~mrjob.job.MRJob` on your Spark cluster (with or
    without Hadoop). Invoked when you run your job with ``-r spark``.
    """
    alias = 'spark'

    # other than ``spark_*``, these options are only used for filesystems
    OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
        'aws_access_key_id',
        'aws_secret_access_key',
        'aws_session_token',
        'cloud_fs_sync_secs',
        'cloud_part_size_mb',
        'google_project_id',  # used by GCS filesystem
        'hadoop_bin',
        's3_endpoint',
        's3_region',  # only used along with s3_endpoint
        'spark_deploy_mode',
        'spark_master',
        'spark_tmp_dir',  # where to put temp files in Spark
    }

    # everything except Hadoop JARs
    # streaming jobs will be run using mrjob_spark_harness.py (see #1972)
    _STEP_TYPES = {
        'spark', 'spark_jar', 'spark_script', # 'streaming',
    }

    def __init__(self, **kwargs):
        super(SparkMRJobRunner, self).__init__(**kwargs)

        self._spark_tmp_dir = self._pick_spark_tmp_dir()

        # where local files are uploaded into Spark
        if is_uri(self._spark_tmp_dir):
            spark_files_dir = posixpath.join(self._spark_tmp_dir, 'files', '')
            self._upload_mgr = UploadDirManager(spark_files_dir)

        # where to put job output (if not set explicitly)
        if not self._output_dir:
            self._output_dir = posixpath.join(self._spark_tmp_dir, 'output')

        # keep track of where the spark-submit binary is
        self._spark_submit_bin = self._opts['spark_submit_bin']

    def _default_opts(self):
        return combine_dicts(
            super(SparkMRJobRunner, self)._default_opts(),
            dict(
                spark_master='local[*]',
                spark_deploy_mode='client',
            )
        )

    def _run(self):
        self.get_spark_submit_bin()  # find spark-submit up front
        self._create_setup_wrapper_scripts()
        self._add_job_files_for_upload()
        self._upload_local_files()
        self._run_steps_on_spark()

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        if self._upload_mgr:
            for path in self._working_dir_mgr.paths():
                self._upload_mgr.add(path)

        # no need to upload py_files, spark-submit handles this

    def _pick_spark_tmp_dir(self):
        if self._opts['spark_tmp_dir']:
            if is_uri(self._opts['spark_tmp_dir']):
                return posixpath.join(
                    self._opts['spark_tmp_dir'], self._job_key)
            else:
                return os.path.join(
                    self._opts['spark_tmp_dir'], self._job_key)
        elif self._spark_master_is_local():
            # need a local temp dir
            # add "-spark" so we don't collide with default local temp dir
            return os.path.join(
                gettempdir(), self._job_key + '-spark')
        else:
            # use HDFS (same default as HadoopJobRunner)
            return posixpath.join(
                fully_qualify_hdfs_path('tmp/mrjob'), self._job_key)

    def _default_step_output_dir(self):
        return posixpath.join(self._spark_tmp_dir, 'step-output')

    @property
    def fs(self):
        # Spark supports basically every filesystem there is

        if not self._fs:
            self._fs = CompositeFilesystem()

            if boto3_installed:
                self._fs.add_fs('s3', S3Filesystem(
                    aws_access_key_id=self._opts['aws_access_key_id'],
                    aws_secret_access_key=self._opts['aws_secret_access_key'],
                    aws_session_token=self._opts['aws_session_token'],
                    s3_endpoint=self._opts['s3_endpoint'],
                    s3_region=self._opts['s3_region'],
                ), disable_if=_is_permanent_boto3_error)

            if google_libs_installed:
                self._fs.add_fs('gcs', GCSFilesystem(
                    project_id=self._opts['google_project_id']
                ), disable_if=_is_permanent_google_error)

            self._fs.add_fs('hadoop', HadoopFilesystem(
                self._opts['hadoop_bin']))

            self._fs.add_fs('local', LocalFilesystem())

        return self._fs

    def _upload_local_files(self):
        # in local mode, nothing to upload
        if not self._upload_mgr:
            return

        self.fs.mkdir(self._upload_mgr.prefix)

        log.info('Copying local files to %s' % self._upload_mgr.prefix)
        for src_path, uri in self._upload_mgr.path_to_uri().items():
            log.debug('  %s -> %s' % (src_path, uri))
            self.fs.put(src_path, uri)

    def _run_steps_on_spark(self):
        for step_num, step in enumerate(self._get_steps()):
            self._run_step_on_spark(step, step_num)

    def _run_step_on_spark(self, step, step_num):
        if self._opts['upload_archives'] and self._spark_master() != 'yarn':
            log.warning('Spark master %r will probably ignore archives' %
                        self._spark_master())

        spark_submit_args = self._args_for_spark_step(step_num)

        env = dict(os.environ)
        env.update(self._spark_cmdenv(step_num))

        returncode = self._run_spark_submit(spark_submit_args, env,
                                            record_callback=_log_log4j_record)

        if returncode:
            reason = str(CalledProcessError(returncode, spark_submit_args))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                num_steps=self._num_steps())
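
Note that this version of the runner only accepts Spark step types
('streaming' is commented out of _STEP_TYPES), so a job driven through it
needs a Spark step. A rough usage sketch, with placeholder module, class, and
input names:

import sys

from mr_spark_wordcount import MRSparkWordcount  # placeholder job module

# '-r spark' selects SparkMRJobRunner; spark_master defaults to 'local[*]'
job = MRSparkWordcount(['-r', 'spark', 'input.txt'])

with job.make_runner() as runner:
    runner.run()
    # stream the job output (as bytes) back to stdout
    for chunk in runner.cat_output():
        sys.stdout.write(chunk.decode('utf-8'))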
Example #12
class DataprocJobRunner(HadoopInTheCloudJobRunner, LogInterpretationMixin):
    """Runs an :py:class:`~mrjob.job.MRJob` on Google Cloud Dataproc.
    Invoked when you run your job with ``-r dataproc``.

    :py:class:`DataprocJobRunner` runs your job in a Dataproc cluster, which
    is basically a temporary Hadoop cluster.

    Input, support, and jar files can be either local or on GCS; use
    ``gs://...`` URLs to refer to files on GCS.

    This class has some useful utilities for talking directly to GCS and
    Dataproc, so you may find it useful to instantiate it without a script::

        from mrjob.dataproc import DataprocJobRunner
        ...
    """
    alias = 'dataproc'

    OPT_NAMES = HadoopInTheCloudJobRunner.OPT_NAMES | {
        'gcloud_bin',
        'project_id',
    }

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same
        arguments as
        :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(DataprocJobRunner, self).__init__(**kwargs)

        # check for library support
        if google is None:
            raise ImportError(
                'You must install google-cloud and google-cloud-dataproc'
                ' to connect to Dataproc')

        # Dataproc requires a master and >= 2 core instances.
        # num_core_instances refers ONLY to the number of CORE instances; it
        # does NOT include the required master instance. In other words, the
        # minimum cluster is 3 machines: 1 master and 2 core
        # ("num_core_instances") workers.
        if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS:
            raise DataprocException('Dataproc expects at LEAST %d workers' %
                                    _DATAPROC_MIN_WORKERS)

        if (self._opts['core_instance_type'] !=
                self._opts['task_instance_type']):
            raise DataprocException(
                'Dataproc v1 expects core/task instance types to be identical')

        # load credentials and project ID
        self._credentials, auth_project_id = google.auth.default(
            scopes=_DEFAULT_GCE_SERVICE_ACCOUNT_SCOPES)

        self._project_id = self._opts['project_id'] or auth_project_id

        if not self._project_id:
            raise DataprocException(
                'project_id must be set. Use --project_id or'
                ' set $GOOGLE_CLOUD_PROJECT')

        self._fix_zone_and_region_opts()

        # cluster_id can be None here
        self._cluster_id = self._opts['cluster_id']

        self._api_client = None
        self._gcs_fs = None
        self._fs = None

        # BEGIN - setup directories
        base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir'])

        self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir)

        # use job key to make a unique tmp dir
        self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/'

        # pick/validate output dir
        if self._output_dir:
            self._output_dir = _check_and_fix_fs_dir(self._output_dir)
        else:
            self._output_dir = self._job_tmpdir + 'output/'
        # END - setup directories

        # manage local files that we want to upload to GCS. We'll add them
        # to this manager just before we need them.
        fs_files_dir = self._job_tmpdir + 'files/'
        self._upload_mgr = UploadDirManager(fs_files_dir)

        # when did our particular task start?
        self._dataproc_job_start = None

        # init hadoop, image version caches
        self._image_version = None
        self._hadoop_version = None

        # map driver_output_uri to a dict with the keys:
        # log_uri: uri of file we're reading from
        # pos: position in file
        # buffer: bytes read from file already
        self._driver_output_state = {}

        # This will be filled by _run_steps()
        # NOTE - log_interpretations will be empty except job_id until we
        # parse task logs
        self._log_interpretations = []

    def _fix_zone_and_region_opts(self):
        """Ensure that exactly one of region and zone is set."""
        if self._opts['region'] and self._opts['zone']:
            log.warning('you do not need to set region if you set zone')
            self._opts['region'] = None
            return

        if not (self._opts['region'] or self._opts['zone']):
            if environ.get('CLOUDSDK_COMPUTE_ZONE'):
                self._opts['zone'] = environ['CLOUDSDK_COMPUTE_ZONE']
            elif environ.get('CLOUDSDK_COMPUTE_REGION'):
                self._opts['region'] = environ['CLOUDSDK_COMPUTE_REGION']
            else:
                self._opts['region'] = _DEFAULT_GCE_REGION
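        # to summarize the precedence above: if both are set, zone wins and
        # region is cleared; if neither is set, fall back to
        # $CLOUDSDK_COMPUTE_ZONE, then $CLOUDSDK_COMPUTE_REGION, then
        # _DEFAULT_GCE_REGION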

    def _default_opts(self):
        return combine_dicts(
            super(DataprocJobRunner, self)._default_opts(),
            dict(
                bootstrap_python=True,
                check_cluster_every=_DEFAULT_CHECK_CLUSTER_EVERY,
                cleanup=['CLUSTER', 'JOB', 'LOCAL_TMP'],
                cloud_fs_sync_secs=_DEFAULT_CLOUD_FS_SYNC_SECS,
                gcloud_bin=['gcloud'],
                image_version=_DEFAULT_IMAGE_VERSION,
                instance_type=_DEFAULT_INSTANCE_TYPE,
                master_instance_type=_DEFAULT_INSTANCE_TYPE,
                num_core_instances=_DATAPROC_MIN_WORKERS,
                num_task_instances=0,
                sh_bin=['/bin/sh', '-ex'],
            ))

    def _combine_opts(self, opt_list):
        """Blank out overridden *zone* and *region* opts."""
        # copy opt_list so we can modify it
        opt_list = [dict(opts) for opts in opt_list]

        # blank out region/zone in any configs that come before the last
        # config where either of them is set
        blank_out = False
        for opts in reversed(opt_list):
            if blank_out:
                opts['region'] = None
                opts['zone'] = None
            elif any(opts.get(k) is not None for k in ('region', 'zone')):
                blank_out = True

        # now combine opts, with region/zone blanked out
        return super(DataprocJobRunner, self)._combine_opts(opt_list)

    @property
    def cluster_client(self):
        return google.cloud.dataproc_v1.ClusterControllerClient(
            **self._client_create_kwargs())

    @property
    def job_client(self):
        return google.cloud.dataproc_v1.JobControllerClient(
            **self._client_create_kwargs())

    def _client_create_kwargs(self):
        if self._opts['region']:
            endpoint = '%s-%s' % (self._opts['region'], _DEFAULT_ENDPOINT)
            return dict(channel=google.api_core.grpc_helpers.create_channel(
                endpoint, credentials=self._credentials))
        else:
            return dict(credentials=self._credentials)

    @property
    def api_client(self):
        raise NotImplementedError(
            '"api_client" was disabled in v0.6.2. Use "cluster_client"'
            ' or "job_client" instead.')

    @property
    def fs(self):
        """:py:class:`~mrjob.fs.base.Filesystem` object for SSH, S3, GCS, and
        the local filesystem.
        """
        if self._fs is not None:
            return self._fs

        self._gcs_fs = GCSFilesystem(
            credentials=self._credentials,
            local_tmp_dir=self._get_local_tmp_dir(),
            project_id=self._project_id,
        )

        self._fs = CompositeFilesystem(self._gcs_fs, LocalFilesystem())
        return self._fs

    def _fs_chunk_size(self):
        """Chunk size for cloud storage Blob objects. Currently
        only used for uploading."""
        if self._opts['cloud_upload_part_size']:
            return int(self._opts['cloud_upload_part_size'] * 1024 * 1024)
        else:
            return None

    def _get_tmpdir(self, given_tmpdir):
        """Helper for _fix_tmpdir"""
        if given_tmpdir:
            return given_tmpdir

        # Loop over buckets until we find one that matches region
        # NOTE - because this is a tmpdir, we look for a GCS bucket in the
        # same GCE region
        chosen_bucket_name = None

        # determine region for bucket
        region = self._region()

        for tmp_bucket_name in self.fs.get_all_bucket_names(prefix='mrjob-'):
            tmp_bucket = self.fs.get_bucket(tmp_bucket_name)

            # NOTE - ambiguous GCP behavior - bucket location is returned in
            # UPPERCASE even though the docs suggest lowercase (ticket filed
            # as of Apr 23, 2016; still true as of Feb 12, 2018, observed
            # with google-cloud-sdk)
            if tmp_bucket.location.lower() == region:
                # Regions are both specified and match
                log.info("using existing temp bucket %s" % tmp_bucket_name)
                chosen_bucket_name = tmp_bucket_name
                break

        # Example default - "mrjob-us-central1-RANDOMHEX"
        if not chosen_bucket_name:
            chosen_bucket_name = '-'.join(
                ['mrjob', region, random_identifier()])

        return 'gs://%s/tmp/' % chosen_bucket_name

    def _region(self):
        # region of cluster, which is either the region set by the user,
        # or the region derived from the zone they set.
        # used to pick bucket location and name cluster
        return self._opts['region'] or _zone_to_region(self._opts['zone'])

    def _run(self):
        self._launch()
        self._run_steps()

    def _launch(self):
        self._prepare_for_launch()
        self._launch_cluster()

    def _prepare_for_launch(self):
        self._check_output_not_exists()
        self._create_setup_wrapper_scripts()
        self._add_bootstrap_files_for_upload()
        self._add_job_files_for_upload()
        self._upload_local_files_to_fs()

    def _check_output_not_exists(self):
        """Verify the output path does not already exist. This avoids
        provisioning a cluster only to have Hadoop refuse to launch.
        """
        if self.fs.exists(self._output_dir):
            raise IOError('Output path %s already exists!' %
                          (self._output_dir, ))

    def _add_bootstrap_files_for_upload(self):
        """Add files needed by the bootstrap script to self._upload_mgr.

        Tar up mrjob if bootstrap_mrjob is True.

        Create the master bootstrap script if necessary.

        """
        # lazily create mrjob.zip
        if self._bootstrap_mrjob():
            self._create_mrjob_zip()
            self._bootstrap_dir_mgr.add('file', self._mrjob_zip_path)

        # all other files needed by the script are already in
        # _bootstrap_dir_mgr
        for path in self._bootstrap_dir_mgr.paths():
            self._upload_mgr.add(path)

        # now that we know where the above files live, we can create
        # the master bootstrap script
        self._create_master_bootstrap_script_if_needed()
        if self._master_bootstrap_script_path:
            self._upload_mgr.add(self._master_bootstrap_script_path)
            self._upload_mgr.add(_MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH)

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

        # TODO - mtai @ davidmarin - hadoop_streaming_jar is currently ignored,
        # see _HADOOP_STREAMING_JAR_URI
        # if self._opts['hadoop_streaming_jar']:
        #     self._upload_mgr.add(self._opts['hadoop_streaming_jar'])

        for step in self._get_steps():
            if step.get('jar'):
                self._upload_mgr.add(step['jar'])

    def _upload_local_files_to_fs(self):
        """Copy local files tracked by self._upload_mgr to FS."""
        bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
        self._create_fs_tmp_bucket(bucket_name)

        log.info('Copying non-input files into %s' % self._upload_mgr.prefix)

        for path, gcs_uri in self._upload_mgr.path_to_uri().items():
            log.debug('uploading %s -> %s' % (path, gcs_uri))

            self.fs.put(path, gcs_uri, chunk_size=self._fs_chunk_size())

        self._wait_for_fs_sync()

    def _create_fs_tmp_bucket(self, bucket_name, location=None):
        """Create a temp bucket if missing

        Tie the temporary bucket to the same region as the GCE job and set a
        28-day TTL
        """
        # Return early if our bucket already exists
        try:
            self.fs.get_bucket(bucket_name)
            return
        except google.api_core.exceptions.NotFound:
            pass

        log.info('creating FS bucket %r' % bucket_name)

        location = location or self._opts['region'] or _zone_to_region(
            self._opts['zone'])

        # NOTE - By default, we create a bucket in the same GCE region as our
        # job (tmp buckets ONLY)
        # https://cloud.google.com/storage/docs/bucket-locations
        self.fs.create_bucket(
            bucket_name,
            location=location,
            object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS)

        self._wait_for_fs_sync()

    ### Running the job ###

    def cleanup(self, mode=None):
        super(DataprocJobRunner, self).cleanup(mode=mode)

        # close our SSH tunnel, if any
        self._kill_ssh_tunnel()

        # stop the cluster if it belongs to us (it may have stopped on its
        # own already, but that's fine)
        if self._cluster_id and not self._opts['cluster_id']:
            self._cleanup_cluster()

    def _cleanup_cloud_tmp(self):
        # delete all the files we created
        if not self._job_tmpdir:
            return

        try:
            log.info('Removing all files in %s' % self._job_tmpdir)
            self.fs.rm(self._job_tmpdir)
            self._job_tmpdir = None
        except Exception as e:
            log.exception(e)

    # TODO - mtai @ davidmarin - Re-enable log support and supporting cleanup
    def _cleanup_logs(self):
        super(DataprocJobRunner, self)._cleanup_logs()

    def _cleanup_job(self):
        job_prefix = self._dataproc_job_prefix()
        for job in self._list_jobs(cluster_name=self._cluster_id,
                                   state_matcher=_STATE_MATCHER_ACTIVE):
            # Kill all active jobs with the same job_prefix as this job
            job_id = job.reference.job_id

            if not job_id.startswith(job_prefix):
                continue

            self._cancel_job(job_id)
            self._wait_for_api('job cancellation')

    def _cleanup_cluster(self):
        if not self._cluster_id:
            # If we don't have a cluster, then we can't terminate it.
            return

        try:
            log.info("Attempting to terminate cluster")
            self._delete_cluster(self._cluster_id)
        except Exception as e:
            log.exception(e)
            return
        log.info('cluster %s successfully terminated' % self._cluster_id)

    def _wait_for_api(self, msg):
        _wait_for(msg, self._opts['check_cluster_every'])

    def _wait_for_fs_sync(self):
        """Sleep for a little while, to give FS a chance to sync up.
        """
        _wait_for('GCS sync (eventual consistency)',
                  self._opts['cloud_fs_sync_secs'])

    def _build_dataproc_hadoop_job(self, step_num):
        """This function creates a "HadoopJob" to be passed to
        self._submit_hadoop_job

        :param step_num:
        :return: output_hadoop_job
        """
        # Reference: https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa

        args = list()
        file_uris = list()
        archive_uris = list()
        properties = dict()

        step = self._get_step(step_num)

        assert step['type'] in ('streaming', 'jar'), ('Bad step type: %r' %
                                                      (step['type'], ))

        # TODO - mtai @ davidmarin - Might be trivial to support jar running,
        # see "main_jar_file_uri" of variable "output_hadoop_job" in
        #         https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa

        assert step['type'] == 'streaming', 'Jar not implemented'
        main_jar_uri = _HADOOP_STREAMING_JAR_URI

        # TODO - mtai @ davidmarin - Not clear if we should move _upload_args
        # to file_uris, currently works fine as-is

        # TODO - dmarin @ mtai - Probably a little safer to do the API's way,
        # assuming the API supports distributed cache syntax (so we can pick
        # the names of the uploaded files).
        args.extend(self._upload_args())

        args.extend(self._hadoop_args_for_step(step_num))

        mapper, combiner, reducer = (self._hadoop_streaming_commands(step_num))

        if mapper:
            args += ['-mapper', mapper]

        if combiner:
            args += ['-combiner', combiner]

        if reducer:
            args += ['-reducer', reducer]

        for current_input_uri in self._step_input_uris(step_num):
            args += ['-input', current_input_uri]

        args += ['-output', self._step_output_uri(step_num)]

        # TODO - mtai @ davidmarin - Add back support to specify a different
        # main_jar_file_uri
        output_hadoop_job = dict(args=args,
                                 file_uris=file_uris,
                                 archive_uris=archive_uris,
                                 properties=properties,
                                 main_jar_file_uri=main_jar_uri)
        return output_hadoop_job

    def _launch_cluster(self):
        """Create an empty cluster on Dataproc, and set self._cluster_id to
        its ID."""
        bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
        self._create_fs_tmp_bucket(bucket_name)

        # clusterName must be a match of
        # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).'
        # as documented in an API error message
        # (not currently documented in the Dataproc docs)
        if not self._cluster_id:
            self._cluster_id = '-'.join(
                ['mrjob', self._region(),
                 random_identifier()])

        # Create the cluster if it's missing, otherwise join an existing one
        try:
            self._get_cluster(self._cluster_id)
            log.info('Adding job to existing cluster - %s' % self._cluster_id)
        except google.api_core.exceptions.NotFound:
            log.info('Creating Dataproc Hadoop cluster - %s' %
                     self._cluster_id)

            cluster_data = self._cluster_create_kwargs()
            self._create_cluster(cluster_data)

            self._wait_for_cluster_ready(self._cluster_id)

        self._set_up_ssh_tunnel()

        # keep track of when we launched our job
        self._dataproc_job_start = time.time()
        return self._cluster_id

    def _wait_for_cluster_ready(self, cluster_id):
        # See https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters#State  # noqa
        cluster_state = None

        # Poll until cluster is ready
        while cluster_state not in ('RUNNING', 'UPDATING'):
            cluster = self._get_cluster(cluster_id)
            cluster_state = cluster.status.State.Name(cluster.status.state)

            if cluster_state in ('ERROR', 'DELETING'):
                raise DataprocException(cluster)

            self._wait_for_api('cluster to accept jobs')

        return cluster_id

    def _dataproc_job_prefix(self):
        return _cleanse_gcp_job_id(self._job_key)

    def _run_steps(self):
        """Wait for every step of the job to complete, one by one."""
        total_steps = self._num_steps()
        # run our steps, one at a time
        for step_num in range(total_steps):
            job_id = self._launch_step(step_num)

            self._wait_for_step_to_complete(job_id,
                                            step_num=step_num,
                                            num_steps=total_steps)

            log.info('Completed Dataproc Hadoop Job - %s', job_id)

        # After all steps completed, wait for the last output (which is
        # usually written to GCS) to sync
        self._wait_for_fs_sync()

    def _launch_step(self, step_num):
        # Build each step
        hadoop_job = self._build_dataproc_hadoop_job(step_num)

        # Clean-up step name
        step_name = '%s---step-%05d-of-%05d' % (
            self._dataproc_job_prefix(), step_num + 1, self._num_steps())

        # Submit it
        log.info('Submitting Dataproc Hadoop Job - %s', step_name)
        result = self._submit_hadoop_job(step_name, hadoop_job)
        log.info('Submitted Dataproc Hadoop Job - %s', step_name)

        job_id = result.reference.job_id
        assert job_id == step_name

        return job_id

    def _wait_for_step_to_complete(self,
                                   job_id,
                                   step_num=None,
                                   num_steps=None):
        """Helper for _wait_for_step_to_complete(). Wait for
        step with the given ID to complete, and fetch counters.
        If it fails, attempt to diagnose the error, and raise an
        exception.

        This also adds an item to self._log_interpretations
        """
        log_interpretation = dict(job_id=job_id)
        self._log_interpretations.append(log_interpretation)

        step_interpretation = {}
        log_interpretation['step'] = step_interpretation

        while True:
            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
            job = self._get_job(job_id)

            job_state = job.status.State.Name(job.status.state)

            driver_output_uri = job.driver_output_resource_uri

            log.info('%s => %s' % (job_id, job_state))

            # interpret driver output so far
            if driver_output_uri:
                self._update_step_interpretation(step_interpretation,
                                                 driver_output_uri)

            if step_interpretation.get('progress'):
                log.info(' ' + step_interpretation['progress']['message'])

            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
            # these are the states covered by the ACTIVE job state matcher,
            # plus SETUP_DONE
            if job_state in ('PENDING', 'RUNNING', 'CANCEL_PENDING',
                             'SETUP_DONE'):
                self._wait_for_api('job completion')
                continue

            # print counters if job wasn't CANCELLED
            if job_state != 'CANCELLED':
                self._log_counters(log_interpretation, step_num)

            # we're done, will return at the end of this
            if job_state == 'DONE':
                break
            else:
                raise StepFailedException(step_num=step_num,
                                          num_steps=num_steps)

    def _default_step_output_dir(self):
        # put intermediate data in HDFS
        return 'hdfs:///tmp/mrjob/%s/step-output' % self._job_key

    def _update_step_interpretation(self, step_interpretation,
                                    driver_output_uri):
        new_lines = self._get_new_driver_output_lines(driver_output_uri)
        _interpret_new_dataproc_step_stderr(step_interpretation, new_lines)

    def _get_new_driver_output_lines(self, driver_output_uri):
        """Get a list of complete job driver output lines that are
        new since the last time we checked.
        """
        state = self._driver_output_state.setdefault(
            driver_output_uri, dict(log_uri=None, pos=0, buffer=b''))

        # driver output is in logs with names like driveroutput.000000000
        log_uris = sorted(self.fs.ls(driver_output_uri + '*'))

        for log_uri in log_uris:
            # initialize log_uri with first URI we see
            if state['log_uri'] is None:
                state['log_uri'] = log_uri

            # skip log files already parsed
            if log_uri < state['log_uri']:
                continue

            # when parsing the next file, reset *pos*
            elif log_uri > state['log_uri']:
                state['pos'] = 0
                state['log_uri'] = log_uri

            log_blob = self.fs._get_blob(log_uri)

            try:
                # TODO: use start= kwarg once google-cloud-storage 1.9 is out
                new_data = log_blob.download_as_string()[state['pos']:]
            except google.api_core.exceptions.NotFound:
                # handle race condition where blob was just created
                break

            state['buffer'] += new_data
            state['pos'] += len(new_data)

        # convert buffer into lines, saving leftovers for next time
        stream = BytesIO(state['buffer'])
        state['buffer'] = b''

        lines = []

        for line_bytes in stream:
            if line_bytes.endswith(b'\n'):
                lines.append(to_unicode(line_bytes))
            else:
                # leave final partial line (if any) in buffer
                state['buffer'] = line_bytes

        return lines
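
    # Illustration of the bookkeeping above (hypothetical blob contents): if
    # a driver-output blob currently holds b'line 1\nline 2\nlin', the first
    # call returns the two complete lines and leaves b'lin' in
    # state['buffer'] with state['pos'] == 17; once the blob grows to
    # b'line 1\nline 2\nline 3\n', the next call downloads only the 4 new
    # bytes, appends them to the buffered b'lin', and returns ['line 3\n'].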

    def counters(self):
        # TODO - mtai @ davidmarin - Counters are currently always empty as we
        # are not processing task logs
        return [
            _pick_counters(log_interpretation)
            for log_interpretation in self._log_interpretations
        ]

    ### Bootstrapping ###

    def get_hadoop_version(self):
        if self._hadoop_version is None:
            self._store_cluster_info()
        return self._hadoop_version

    def get_image_version(self):
        """Get the version that our cluster is running.
        """
        if self._image_version is None:
            self._store_cluster_info()
        return self._image_version

    def _store_cluster_info(self):
        """Set self._image_version and self._hadoop_version."""
        if not self._cluster_id:
            raise AssertionError('cluster has not yet been created')

        cluster = self._get_cluster(self._cluster_id)
        self._image_version = (cluster.config.software_config.image_version)
        # protect against new versions, including patch versions
        # we didn't explicitly request. See #1428
        self._hadoop_version = map_version(self._image_version,
                                           _DATAPROC_IMAGE_TO_HADOOP_VERSION)

    def _bootstrap_pre_commands(self):
        # don't run the bootstrap script in / (see #1601)
        return [
            'mkdir /tmp/mrjob',
            'cd /tmp/mrjob',
        ]

    def _bootstrap_python(self):
        """Return a (possibly empty) list of parsed commands (in the same
        format as returned by parse_setup_cmd())."""
        if not self._opts['bootstrap_python']:
            return []

        if PY2:
            # Python 2 is already installed; install pip and dev packages
            return [
                ['sudo apt-get install -y python-pip python-dev'],
            ]
        else:
            return [
                ['sudo apt-get install -y python3 python3-pip python3-dev'],
            ]

    def get_cluster_id(self):
        return self._cluster_id

    def _cluster_create_kwargs(self):
        gcs_init_script_uris = []
        if self._master_bootstrap_script_path:
            gcs_init_script_uris.append(
                self._upload_mgr.uri(self._master_bootstrap_script_path))

            # always add idle termination script
            # add it last, so that we don't count bootstrapping as idle time
            gcs_init_script_uris.append(
                self._upload_mgr.uri(_MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH))

        # NOTE - Cluster initialization_actions can only take scripts with no
        # script args, so the auto-term script receives 'mrjob-max-secs-idle'
        # via metadata instead of as an arg
        cluster_metadata = dict()
        cluster_metadata['mrjob-version'] = mrjob.__version__
        cluster_metadata['mrjob-max-secs-idle'] = str(
            int(self._opts['max_mins_idle'] * 60))

        gce_cluster_config = dict(
            service_account_scopes=_DEFAULT_GCE_SERVICE_ACCOUNT_SCOPES,
            metadata=cluster_metadata)

        if self._opts['zone']:
            gce_cluster_config['zone_uri'] = _gcp_zone_uri(
                project=self._project_id, zone=self._opts['zone'])

        cluster_config = dict(gce_cluster_config=gce_cluster_config,
                              initialization_actions=[
                                  dict(executable_file=init_script_uri)
                                  for init_script_uri in gcs_init_script_uris
                              ])

        # Task tracker
        master_conf = _gcp_instance_group_config(
            project=self._project_id,
            zone=self._opts['zone'],
            count=1,
            instance_type=self._opts['master_instance_type'],
        )

        # Compute + storage
        worker_conf = _gcp_instance_group_config(
            project=self._project_id,
            zone=self._opts['zone'],
            count=self._opts['num_core_instances'],
            instance_type=self._opts['core_instance_type'])

        # Compute ONLY
        secondary_worker_conf = _gcp_instance_group_config(
            project=self._project_id,
            zone=self._opts['zone'],
            count=self._opts['num_task_instances'],
            instance_type=self._opts['task_instance_type'],
            is_preemptible=True)

        cluster_config['master_config'] = master_conf
        cluster_config['worker_config'] = worker_conf
        if self._opts['num_task_instances']:
            cluster_config['secondary_worker_config'] = secondary_worker_conf

        # See - https://cloud.google.com/dataproc/dataproc-versions
        if self._opts['image_version']:
            cluster_config['software_config'] = dict(
                image_version=self._opts['image_version'])

        kwargs = dict(project_id=self._project_id,
                      cluster_name=self._cluster_id,
                      config=cluster_config)

        return self._add_extra_cluster_params(kwargs)

    ### Dataproc-specific Stuff ###

    def _get_cluster(self, cluster_id):
        return self.cluster_client.get_cluster(cluster_name=cluster_id,
                                               **self._project_id_and_region())

    def _create_cluster(self, cluster_data):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/create  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/get  # noqa

        self.cluster_client.create_cluster(cluster=cluster_data,
                                           **self._project_id_and_region())

    def _delete_cluster(self, cluster_id):
        return self.cluster_client.delete_cluster(
            cluster_name=cluster_id, **self._project_id_and_region())

    def _list_jobs(self, cluster_name=None, state_matcher=None):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/list#JobStateMatcher  # noqa
        list_kwargs = self._project_id_and_region()

        if cluster_name:
            list_kwargs['cluster_name'] = cluster_name

        if state_matcher:
            list_kwargs['job_state_matcher'] = state_matcher

        return self.job_client.list_jobs(**list_kwargs)

    def _get_job(self, job_id):
        return self.job_client.get_job(job_id=job_id,
                                       **self._project_id_and_region())

    def _cancel_job(self, job_id):
        return self.job_client.cancel_job(job_id=job_id,
                                          **self._project_id_and_region())

    def _submit_hadoop_job(self, step_name, hadoop_job):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/submit  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobReference  # noqa
        return self.job_client.submit_job(job=dict(
            reference=dict(project_id=self._project_id, job_id=step_name),
            placement=dict(cluster_name=self._cluster_id),
            hadoop_job=hadoop_job,
        ),
                                          **self._project_id_and_region())

    def _project_id_and_region(self):
        return dict(
            project_id=self._project_id,
            region=(self._opts['region'] or 'global'),
        )

    def _manifest_download_commands(self):
        return [
            # TODO: SSH in and figure out how to use gsutil or similar
            #('gs://*', 'gsutil cp'),
            ('*://*', 'hadoop fs -copyToLocal'),
        ]

    ### SSH hooks ###

    def _job_tracker_host(self):
        return '%s-m' % self._cluster_id

    def _ssh_tunnel_config(self):
        return _SSH_TUNNEL_CONFIG

    def _launch_ssh_proc(self, args):
        ssh_proc = super(DataprocJobRunner, self)._launch_ssh_proc(args)

        # enter an empty passphrase if creating a key for the first time
        ssh_proc.stdin.write(b'\n\n')

        return ssh_proc

    def _ssh_launch_wait_secs(self):
        """Wait 20 seconds because gcloud has to update project metadata
        (unless we were going to check the cluster sooner anyway)."""
        return min(20.0, self._opts['check_cluster_every'])

    def _ssh_tunnel_args(self, bind_port):
        if not self._opts['gcloud_bin']:
            self._give_up_on_ssh_tunnel = True
            return None

        if not self._cluster_id:
            return

        cluster = self._get_cluster(self._cluster_id)
        zone = cluster.config.gce_cluster_config.zone_uri.split('/')[-1]

        return self._opts['gcloud_bin'] + [
            'compute',
            'ssh',
            '--zone',
            zone,
            self._job_tracker_host(),
            '--',
        ] + self._ssh_tunnel_opts(bind_port)
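
Both _wait_for_cluster_ready() and _wait_for_step_to_complete() above follow the same poll/sleep/re-check pattern against the Dataproc API: fetch the current state, stop on success, raise on a failure state, and otherwise sleep for check_cluster_every seconds before asking again. A minimal, self-contained sketch of that pattern (poll_until_done, get_state and the state sets are placeholders for illustration, not mrjob or Dataproc APIs):

import time


def poll_until_done(get_state, wait_secs=10.0,
                    ok_states=('DONE',),
                    error_states=('ERROR', 'CANCELLED')):
    """Poll get_state() until it returns a terminal state.

    Returns the final state, or raises RuntimeError on a failure state.
    """
    while True:
        state = get_state()
        if state in ok_states:
            return state
        if state in error_states:
            raise RuntimeError('ended in state %s' % state)
        # not finished yet; wait before hitting the API again
        time.sleep(wait_secs)


# toy usage: a fake job that reports RUNNING twice before finishing
_states = iter(['PENDING', 'RUNNING', 'RUNNING', 'DONE'])
print(poll_until_done(lambda: next(_states), wait_secs=0.01))
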
Example #17
0
 def test_add_is_idempotent(self):
     sd = UploadDirManager('hdfs:///')
     sd.add('foo/bar.py')
     self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
     sd.add('foo/bar.py')
     self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
Example #18
0
 def test_empty(self):
     sd = UploadDirManager('hdfs:///')
     self.assertEqual(sd.path_to_uri(), {})
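
The two tests above pin down UploadDirManager's basic contract: a path added under a prefix maps to prefix + basename, add() is idempotent, and an empty manager yields an empty mapping. A minimal sketch covering just what these tests exercise (SimpleUploadDirManager is an illustrative stand-in, not mrjob's class, which also handles cases such as name collisions):

import posixpath


class SimpleUploadDirManager(object):
    """Minimal sketch: map local paths to URIs under *prefix*."""

    def __init__(self, prefix):
        self.prefix = prefix
        self._path_to_name = {}

    def add(self, path):
        # idempotent: adding the same path twice changes nothing
        self._path_to_name.setdefault(path, posixpath.basename(path))

    def path_to_uri(self):
        return dict((path, self.prefix + name)
                    for path, name in self._path_to_name.items())


sd = SimpleUploadDirManager('hdfs:///')
sd.add('foo/bar.py')
sd.add('foo/bar.py')
assert sd.path_to_uri() == {'foo/bar.py': 'hdfs:///bar.py'}
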
Example #19
0
class HadoopJobRunner(MRJobRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.
    Invoked when you run your job with ``-r hadoop``.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """
    alias = 'hadoop'

    OPTION_STORE_CLASS = HadoopRunnerOptionStore

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hadoop_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(
                self._opts['hadoop_tmp_dir'], self._job_key))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or
            posixpath.join(self._hadoop_tmp_dir, 'output'))

        # Running jobs via hadoop assigns a new timestamp to each job.
        # Running jobs via mrjob only adds steps.
        # Store both of these values to enable log parsing.
        self._job_timestamp = None
        self._start_step_num = 0

        # Keep track of where the hadoop streaming jar is
        self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar']
        self._searched_for_hadoop_streaming_jar = False

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem(
                HadoopFilesystem(self._opts['hadoop_bin']),
                LocalFilesystem())
        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        return self.fs.get_hadoop_version()

    def get_hadoop_bin(self):
        """Find the hadoop binary. A list: binary followed by arguments."""
        return self.fs.get_hadoop_bin()

    def get_hadoop_streaming_jar(self):
        """Find the path of the hadoop streaming jar, or None if not found."""
        if not (self._hadoop_streaming_jar or
                self._searched_for_hadoop_streaming_jar):

            self._hadoop_streaming_jar = self._find_hadoop_streaming_jar()

            if self._hadoop_streaming_jar:
                log.info('Found Hadoop streaming jar: %s' %
                         self._hadoop_streaming_jar)
            else:
                log.warning('Hadoop streaming jar not found. Use'
                            ' --hadoop-streaming-jar')

            self._searched_for_hadoop_streaming_jar = True

        return self._hadoop_streaming_jar

    def _find_hadoop_streaming_jar(self):
        """Search for the hadoop streaming jar. See
        :py:meth:`_hadoop_streaming_jar_dirs` for where we search."""
        for path in unique(self._hadoop_streaming_jar_dirs()):
            log.info('Looking for Hadoop streaming jar in %s' % path)

            streaming_jars = []
            for path in self.fs.ls(path):
                if HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)):
                    streaming_jars.append(path)

            if streaming_jars:
                # prefer shorter names and shallower paths
                def sort_key(p):
                    return (len(p.split('/')),
                            len(posixpath.basename(p)),
                            p)

                streaming_jars.sort(key=sort_key)

                return streaming_jars[0]

        return None

    def _hadoop_streaming_jar_dirs(self):
        """Yield all possible places to look for the Hadoop streaming jar."""
        if self._opts['hadoop_home']:
            yield self._opts['hadoop_home']

        for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL',
                     'HADOOP_MAPRED_HOME'):
            path = os.environ.get(name)
            if path:
                yield path

        # guess it from the path of the Hadoop binary
        hadoop_home = hadoop_prefix_from_bin(self.get_hadoop_bin()[0])
        if hadoop_home:
            yield hadoop_home

        # try HADOOP_*_HOME
        for name, path in sorted(os.environ.items()):
            if name.startswith('HADOOP_') and name.endswith('_HOME'):
                yield path

        # use hard-coded paths to work out-of-the-box on EMR
        for path in _EMR_HADOOP_STREAMING_JAR_DIRS:
            yield path

    def _run(self):
        self._check_input_exists()
        self._create_setup_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if self._opts['check_input_paths']:
                if not self.fs.exists(path):
                    raise AssertionError(
                        'Input path %s does not exist!' % (path,))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self.fs.mkdir(self._upload_mgr.prefix)

        log.info('Copying local files into %s' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().items():
            self._upload_to_hdfs(path, uri)

    def _upload_to_hdfs(self, path, target):
        log.debug('Uploading %s -> %s on HDFS' % (path, target))
        self.fs._put(path, target)

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN')
        # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s' % stdin_path)
        stdin_file = open(stdin_path, 'wb')
        for line in self._stdin:
            stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        self._counters = []

        for step_num in range(self._num_steps()):
            log.debug('running step %d of %d' %
                      (step_num + 1, self._num_steps()))

            step_args = self._args_for_step(step_num)

            log.debug('> %s' % cmd_line(step_args))

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen
                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

                self._process_stderr_from_streaming(step_proc.stderr)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    log.error('STDOUT: ' + to_string(line.strip(b'\n')))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvp(step_args[0], step_args)
                else:
                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        self._process_stderr_from_streaming(master)
                        _, returncode = os.waitpid(pid, 0)

            if returncode == 0:
                # parsing needs step number for whole job
                self._fetch_counters([step_num + self._start_step_num])
                # printing needs step number relevant to this run of mrjob
                self.print_counters([step_num + 1])
            else:
                msg = ('Job failed with return code %d: %s' %
                       (returncode, step_args))
                log.error(msg)
                # look for a Python traceback
                cause = self._find_probable_cause_of_failure(
                    [step_num + self._start_step_num])
                if cause:
                    # log cause, and put it in exception
                    cause_msg = []  # lines to log and put in exception
                    cause_msg.append('Probable cause of failure (from %s):' %
                                     cause['log_file_uri'])
                    cause_msg.extend(line.strip('\n')
                                     for line in cause['lines'])
                    if cause['input_uri']:
                        cause_msg.append('(while reading from %s)' %
                                         cause['input_uri'])

                    for line in cause_msg:
                        log.error(line)

                    # add cause_msg to exception message
                    msg += '\n' + '\n'.join(cause_msg) + '\n'

                raise CalledProcessError(returncode, step_args)

    def _process_stderr_from_streaming(self, stderr):

        def treat_eio_as_eof(iter):
            # on Linux, the PTY gives us a specific IOError when the
            # child process exits, rather than EOF.
            while True:
                try:
                    yield next(iter)  # okay for StopIteration to bubble up
                except IOError as e:
                    if e.errno == errno.EIO:
                        return
                    else:
                        raise

        for line in treat_eio_as_eof(stderr):
            line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2)
            log.info('HADOOP: ' + to_string(line))

            if b'Streaming Job Failed!' in line:
                raise Exception(line)

            # The job identifier is printed to stderr. We only want to parse it
            # once because we know how many steps we have and just want to know
            # what Hadoop thinks the first step's number is.
            m = HADOOP_JOB_TIMESTAMP_RE.match(line)
            if m and self._job_timestamp is None:
                self._job_timestamp = m.group('timestamp')
                self._start_step_num = int(m.group('step_num'))

    def _args_for_step(self, step_num):
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._args_for_streaming_step(step_num)
        elif step['type'] == 'jar':
            return self._args_for_jar_step(step_num)
        else:
            raise AssertionError('Bad step type: %r' % (step['type'],))

    def _args_for_streaming_step(self, step_num):
        version = self.get_hadoop_version()

        hadoop_streaming_jar = self.get_hadoop_streaming_jar()
        if not hadoop_streaming_jar:
            raise Exception('no Hadoop streaming jar')

        args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar]

        # -files/-archives (generic options, new-style)
        if supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            args.extend(
                self._upload_args(self._upload_mgr))

        # Add extra hadoop args first as hadoop args could be a hadoop
        # specific argument (e.g. -libjar) which must come before job
        # specific args.
        args.extend(self._hadoop_args_for_step(step_num))

        # set up input
        for input_uri in self._hdfs_step_input_files(step_num):
            args.extend(['-input', input_uri])

        # set up output
        args.append('-output')
        args.append(self._hdfs_step_output_dir(step_num))

        # -cacheFile/-cacheArchive (streaming options, old-style)
        if not supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            args.extend(
                self._pre_0_20_upload_args(self._upload_mgr))

        mapper, combiner, reducer = (
            self._hadoop_streaming_commands(step_num))

        args.append('-mapper')
        args.append(mapper)

        if combiner:
            args.append('-combiner')
            args.append(combiner)

        if reducer:
            args.append('-reducer')
            args.append(reducer)
        else:
            args.extend(['-jobconf', 'mapred.reduce.tasks=0'])

        return args

    def _args_for_jar_step(self, step_num):
        step = self._get_step(step_num)

        # special case for consistency with EMR runner.
        #
        # This might look less like duplicated code if we ever
        # implement #780 (fetching jars from URIs)
        if step['jar'].startswith('file:///'):
            jar = step['jar'][7:]  # keep leading slash
        else:
            jar = step['jar']

        args = self.get_hadoop_bin() + ['jar', jar]

        if step.get('main_class'):
            args.append(step['main_class'])

        # TODO: merge with logic in mrjob/emr.py
        def interpolate(arg):
            if arg == mrjob.step.JarStep.INPUT:
                return ','.join(self._hdfs_step_input_files(step_num))
            elif arg == mrjob.step.JarStep.OUTPUT:
                return self._hdfs_step_output_dir(step_num)
            else:
                return arg

        if step.get('args'):
            args.extend(interpolate(arg) for arg in step['args'])

        return args

    def _hdfs_step_input_files(self, step_num):
        """Get the hdfs:// URI for input for the given step."""
        if step_num == 0:
            return [self._upload_mgr.uri(p)
                    for p in self._get_input_paths()]
        else:
            return [posixpath.join(
                self._hadoop_tmp_dir, 'step-output', str(step_num))]

    def _hdfs_step_output_dir(self, step_num):
        if step_num == len(self._get_steps()) - 1:
            return self._output_dir
        else:
            return posixpath.join(
                self._hadoop_tmp_dir, 'step-output', str(step_num + 1))

    def _cleanup_local_tmp(self):
        super(HadoopJobRunner, self)._cleanup_local_tmp()

        if self._hadoop_tmp_dir:
            log.info('deleting %s from HDFS' % self._hadoop_tmp_dir)
            try:
                self.fs.rm(self._hadoop_tmp_dir)
            except Exception as e:
                log.exception(e)

    ### LOG FETCHING/PARSING ###

    def _enforce_path_regexp(self, paths, regexp, step_nums):
        """Helper for log fetching functions to filter out unwanted
        logs. Only yields paths whose regex groups match the given
        step numbers and this job's timestamp.
        """
        for path in paths:
            m = regexp.match(path)
            if (m and
                (step_nums is None or
                 int(m.group('step_num')) in step_nums) and
                (self._job_timestamp is None or
                 m.group('timestamp') == self._job_timestamp)):

                yield path

    def _ls_logs(self, log_type, step_nums=None):
        """List logs on the local filesystem by path relative to log root
        directory
        """
        return []
        # in YARN, you can just ask the yarn bin:
        # http://hortonworks.com/blog/simplifying-user-logs-management-and-access-in-yarn/  # noqa

        # TODO: redo this to look in
        # $HADOOP_LOG_DIR
        # dirname(hadoop_bin[0])/../logs
        # <output_dir>/_logs
        # ??? other places ???

    def _fetch_counters(self, step_nums, skip_s3_wait=False):
        """Read Hadoop counters from local logs.

        Args:
        step_nums -- the steps belonging to us, so that we can ignore errors
                     from other jobs run with the same timestamp
        """
        uris = self._ls_logs('job', step_nums)
        new_counters = scan_for_counters_in_files(uris, self,
                                                  self.get_hadoop_version())

        # only include steps relevant to the current job
        for step_num in step_nums:
            self._counters.append(new_counters.get(step_num, {}))

    def counters(self):
        return self._counters

    def _find_probable_cause_of_failure(self, step_nums):
        task_attempt_logs = self._ls_logs('task')
        step_logs = self._ls_logs('step')
        job_logs = self._ls_logs('job')

        log.info('Scanning logs for probable cause of failure')
        return best_error_from_logs(self, task_attempt_logs, step_logs,
                                    job_logs)
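
The _run_job_in_hadoop() method above tries pty.fork() so that Hadoop's stdout and stderr arrive through one fake terminal, and falls back to Popen pipes when no PTY is available; the nested treat_eio_as_eof() exists because a Linux PTY reports the child's exit as an EIO error rather than a clean EOF. A self-contained, Unix-only sketch of that fallback pattern (run_capturing_output and its behaviour are illustrative, not an mrjob API):

import os
import pty
from subprocess import PIPE, Popen


def run_capturing_output(args):
    """Run *args*, reading combined output via a PTY when possible,
    falling back to plain pipes otherwise. Returns (returncode, bytes)."""
    try:
        pid, master_fd = pty.fork()
    except (AttributeError, OSError):
        # no PTY support; just use Popen
        proc = Popen(args, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        return proc.returncode, out + err

    if pid == 0:  # child: replace ourselves with the command
        os.execvp(args[0], args)

    # parent: the master fd carries the child's stdout and stderr
    chunks = []
    with os.fdopen(master_fd, 'rb') as master:
        while True:
            try:
                data = master.read(1024)
            except OSError:  # a Linux PTY raises EIO at end-of-output
                break
            if not data:
                break
            chunks.append(data)

    _, status = os.waitpid(pid, 0)
    return os.WEXITSTATUS(status), b''.join(chunks)


print(run_capturing_output(['echo', 'hello'])[1])
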
Example #20
0
 def test_add_is_idempotent(self):
     sd = UploadDirManager('hdfs:///')
     sd.add('foo/bar.py')
     self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
     sd.add('foo/bar.py')
     self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
Example #21
0
 def test_simple(self):
     sd = UploadDirManager('hdfs:///')
     sd.add('foo/bar.py')
     self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
Example #22
0
class HadoopJobRunner(MRJobRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.
    Invoked when you run your job with ``-r hadoop``.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """
    alias = 'hadoop'

    OPTION_STORE_CLASS = HadoopRunnerOptionStore

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hdfs_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(self._opts['hdfs_scratch_dir'], self._job_name))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hdfs_tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or posixpath.join(self._hdfs_tmp_dir, 'output'))

        self._hadoop_log_dir = hadoop_log_dir(self._opts['hadoop_home'])

        # Running jobs via hadoop assigns a new timestamp to each job.
        # Running jobs via mrjob only adds steps.
        # Store both of these values to enable log parsing.
        self._job_timestamp = None
        self._start_step_num = 0

        # init hadoop version cache
        self._hadoop_version = None

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem(
                HadoopFilesystem(self._opts['hadoop_bin']), LocalFilesystem())
        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        if not self._hadoop_version:
            stdout = self.invoke_hadoop(['version'], return_stdout=True)
            if stdout:
                first_line = stdout.split('\n')[0]
                m = HADOOP_VERSION_RE.match(first_line)
                if m:
                    self._hadoop_version = m.group('version')
                    log.info("Using Hadoop version %s" % self._hadoop_version)
                    return self._hadoop_version
            self._hadoop_version = '0.20.203'
            log.info("Unable to determine Hadoop version. Assuming 0.20.203.")
        return self._hadoop_version

    def _run(self):
        self._check_input_exists()
        self._create_setup_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if self._opts['check_input_paths']:
                if not self.path_exists(path):
                    raise AssertionError('Input path %s does not exist!' %
                                         (path, ))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self._mkdir_on_hdfs(self._upload_mgr.prefix)

        log.info('Copying local files into %s' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().items():
            self._upload_to_hdfs(path, uri)

    def _mkdir_on_hdfs(self, path):
        log.debug('Making directory %s on HDFS' % path)

        hadoop_version = self.get_hadoop_version()
        # from version 0.23 / 2.x on, -mkdir needs a -p option to create
        # parent directories

        # version == 0.23
        if ((mrjob.compat.version_gte(hadoop_version, "0.23")
             and not mrjob.compat.version_gte(hadoop_version, "0.24"))):
            self.invoke_hadoop(['fs', '-mkdir', '-p', path])

        # version >= 2.0
        elif mrjob.compat.version_gte(hadoop_version, "2.0"):
            self.invoke_hadoop(['fs', '-mkdir', '-p', path])

        # for version 0.20, 1.x
        else:
            self.invoke_hadoop(['fs', '-mkdir', path])

    def _upload_to_hdfs(self, path, target):
        log.debug('Uploading %s -> %s on HDFS' % (path, target))
        self.invoke_hadoop(['fs', '-put', path, target])

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN')
        # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s' % stdin_path)
        stdin_file = open(stdin_path, 'wb')
        for line in self._stdin:
            stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        self._counters = []

        for step_num in range(self._num_steps()):
            log.debug('running step %d of %d' %
                      (step_num + 1, self._num_steps()))

            step_args = self._args_for_step(step_num)

            log.debug('> %s' % cmd_line(step_args))

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen
                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

                self._process_stderr_from_streaming(step_proc.stderr)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    log.error('STDOUT: ' + to_string(line.strip(b'\n')))

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvp(step_args[0], step_args)
                else:
                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        self._process_stderr_from_streaming(master)
                        _, returncode = os.waitpid(pid, 0)

            if returncode == 0:
                # parsing needs step number for whole job
                self._fetch_counters([step_num + self._start_step_num])
                # printing needs step number relevant to this run of mrjob
                self.print_counters([step_num + 1])
            else:
                msg = ('Job failed with return code %d: %s' %
                       (returncode, step_args))
                log.error(msg)
                # look for a Python traceback
                cause = self._find_probable_cause_of_failure(
                    [step_num + self._start_step_num])
                if cause:
                    # log cause, and put it in exception
                    cause_msg = []  # lines to log and put in exception
                    cause_msg.append('Probable cause of failure (from %s):' %
                                     cause['log_file_uri'])
                    cause_msg.extend(
                        line.strip('\n') for line in cause['lines'])
                    if cause['input_uri']:
                        cause_msg.append('(while reading from %s)' %
                                         cause['input_uri'])

                    for line in cause_msg:
                        log.error(line)

                    # add cause_msg to exception message
                    msg += '\n' + '\n'.join(cause_msg) + '\n'

                raise CalledProcessError(returncode, step_args)

    def _process_stderr_from_streaming(self, stderr):
        def treat_eio_as_eof(iter):
            # on Linux, the PTY gives us a specific IOError when the
            # child process exits, rather than EOF.
            while True:
                try:
                    yield next(iter)  # okay for StopIteration to bubble up
                except IOError as e:
                    if e.errno == errno.EIO:
                        return
                    else:
                        raise

        for line in treat_eio_as_eof(stderr):
            line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2)
            log.info('HADOOP: ' + to_string(line))

            if b'Streaming Job Failed!' in line:
                raise Exception(line)

            # The job identifier is printed to stderr. We only want to parse it
            # once because we know how many steps we have and just want to know
            # what Hadoop thinks the first step's number is.
            m = HADOOP_JOB_TIMESTAMP_RE.match(line)
            if m and self._job_timestamp is None:
                self._job_timestamp = m.group('timestamp')
                self._start_step_num = int(m.group('step_num'))

    def _args_for_step(self, step_num):
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._args_for_streaming_step(step_num)
        elif step['type'] == 'jar':
            return self._args_for_jar_step(step_num)
        else:
            raise AssertionError('Bad step type: %r' % (step['type'], ))

    def _args_for_streaming_step(self, step_num):
        version = self.get_hadoop_version()

        args = (self._opts['hadoop_bin'] +
                ['jar', self._opts['hadoop_streaming_jar']])

        # -files/-archives (generic options, new-style)
        if supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            args.extend(self._new_upload_args(self._upload_mgr))

        # Add extra hadoop args first as hadoop args could be a hadoop
        # specific argument (e.g. -libjar) which must come before job
        # specific args.
        args.extend(self._hadoop_args_for_step(step_num))

        # set up input
        for input_uri in self._hdfs_step_input_files(step_num):
            args.extend(['-input', input_uri])

        # set up output
        args.append('-output')
        args.append(self._hdfs_step_output_dir(step_num))

        # -cacheFile/-cacheArchive (streaming options, old-style)
        if not supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            args.extend(self._old_upload_args(self._upload_mgr))

        mapper, combiner, reducer = (self._hadoop_streaming_commands(step_num))

        args.append('-mapper')
        args.append(mapper)

        if combiner:
            args.append('-combiner')
            args.append(combiner)

        if reducer:
            args.append('-reducer')
            args.append(reducer)
        else:
            args.extend(['-jobconf', 'mapred.reduce.tasks=0'])

        return args

    def _args_for_jar_step(self, step_num):
        step = self._get_step(step_num)

        # special case for consistency with EMR runner.
        #
        # This might look less like duplicated code if we ever
        # implement #780 (fetching jars from URIs)
        if step['jar'].startswith('file:///'):
            jar = step['jar'][7:]  # keep leading slash
        else:
            jar = step['jar']

        args = (self._opts['hadoop_bin'] + ['jar', jar])

        if step.get('main_class'):
            args.append(step['main_class'])

        # TODO: merge with logic in mrjob/emr.py
        def interpolate(arg):
            if arg == mrjob.step.JarStep.INPUT:
                return ','.join(self._hdfs_step_input_files(step_num))
            elif arg == mrjob.step.JarStep.OUTPUT:
                return self._hdfs_step_output_dir(step_num)
            else:
                return arg

        if step.get('args'):
            args.extend(interpolate(arg) for arg in step['args'])

        return args

    def _hdfs_step_input_files(self, step_num):
        """Get the hdfs:// URI for input for the given step."""
        if step_num == 0:
            return [self._upload_mgr.uri(p) for p in self._get_input_paths()]
        else:
            return [
                posixpath.join(self._hdfs_tmp_dir, 'step-output',
                               str(step_num))
            ]

    def _hdfs_step_output_dir(self, step_num):
        if step_num == len(self._get_steps()) - 1:
            return self._output_dir
        else:
            return posixpath.join(self._hdfs_tmp_dir, 'step-output',
                                  str(step_num + 1))

    def _cleanup_local_scratch(self):
        super(HadoopJobRunner, self)._cleanup_local_scratch()

        if self._hdfs_tmp_dir:
            log.info('deleting %s from HDFS' % self._hdfs_tmp_dir)

            try:
                self.invoke_hadoop(['fs', '-rmr', self._hdfs_tmp_dir])
            except Exception as e:
                log.exception(e)

    ### LOG FETCHING/PARSING ###

    def _enforce_path_regexp(self, paths, regexp, step_nums):
        """Helper for log fetching functions to filter out unwanted
        logs. Only yields paths whose regex groups match the given
        step numbers and this job's timestamp.
        """
        for path in paths:
            m = regexp.match(path)
            if (m and
                (step_nums is None or int(m.group('step_num')) in step_nums)
                    and (self._job_timestamp is None
                         or m.group('timestamp') == self._job_timestamp)):

                yield path

    def _ls_logs(self, relative_path):
        """List logs on the local filesystem by path relative to log root
        directory
        """
        return self.ls(os.path.join(self._hadoop_log_dir, relative_path))

    def _fetch_counters(self, step_nums, skip_s3_wait=False):
        """Read Hadoop counters from local logs.

        Args:
        step_nums -- the steps belonging to us, so that we can ignore errors
                     from other jobs run with the same timestamp
        """
        job_logs = self._enforce_path_regexp(self._ls_logs('history/'),
                                             HADOOP_JOB_LOG_URI_RE, step_nums)
        uris = list(job_logs)
        new_counters = scan_for_counters_in_files(uris, self,
                                                  self.get_hadoop_version())

        # only include steps relevant to the current job
        for step_num in step_nums:
            self._counters.append(new_counters.get(step_num, {}))

    def counters(self):
        return self._counters

    def _find_probable_cause_of_failure(self, step_nums):
        all_task_attempt_logs = []
        try:
            all_task_attempt_logs.extend(self._ls_logs('userlogs/'))
        except IOError:
            # sometimes the master doesn't have these
            pass
        # TODO: get these logs from slaves if possible
        task_attempt_logs = self._enforce_path_regexp(
            all_task_attempt_logs, TASK_ATTEMPTS_LOG_URI_RE, step_nums)
        step_logs = self._enforce_path_regexp(self._ls_logs('steps/'),
                                              STEP_LOG_URI_RE, step_nums)
        job_logs = self._enforce_path_regexp(self._ls_logs('history/'),
                                             HADOOP_JOB_LOG_URI_RE, step_nums)
        log.info('Scanning logs for probable cause of failure')
        return best_error_from_logs(self, task_attempt_logs, step_logs,
                                    job_logs)
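
_mkdir_on_hdfs() above encodes a version quirk: starting with Hadoop 0.23 / 2.x, ``hadoop fs -mkdir`` needs an explicit -p flag to create parent directories, while 0.20.x / 1.x created them without it. A small standalone sketch of that decision (mkdir_args and _version_tuple are illustrative helpers, not mrjob functions; mrjob itself uses mrjob.compat.version_gte as shown above):

def _version_tuple(version):
    """'0.20.203' -> (0, 20, 203); non-numeric parts are ignored."""
    return tuple(int(part) for part in version.split('.') if part.isdigit())


def mkdir_args(hadoop_version, path):
    """Arguments for ``hadoop fs -mkdir``, adding -p where it's required."""
    v = _version_tuple(hadoop_version)
    # 0.23.x and 2.x (or later) need -p to create parent directories
    if v >= (2, 0) or (0, 23) <= v < (0, 24):
        return ['fs', '-mkdir', '-p', path]
    # 0.20.x / 1.x create parents without it
    return ['fs', '-mkdir', path]


assert mkdir_args('0.20.203', '/tmp/x') == ['fs', '-mkdir', '/tmp/x']
assert mkdir_args('2.7.1', '/tmp/x') == ['fs', '-mkdir', '-p', '/tmp/x']
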
Example #23
0
class HadoopJobRunner(MRJobRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.
    Invoked when you run your job with ``-r hadoop``.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """
    alias = 'hadoop'

    OPTION_STORE_CLASS = HadoopRunnerOptionStore

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hadoop_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(self._opts['hadoop_tmp_dir'], self._job_key))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or posixpath.join(self._hadoop_tmp_dir, 'output'))

        # Track job and (YARN) application ID to enable log parsing
        self._application_id = None
        self._job_id = None

        # Keep track of where the hadoop streaming jar is
        self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar']
        self._searched_for_hadoop_streaming_jar = False

        # Keep track of the status of each step that ran
        #
        # these are dictionaries with the same keys as
        # mrjob.logs.parse._parse_hadoop_streaming_log()
        self._steps_info = []

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem(
                HadoopFilesystem(self._opts['hadoop_bin']), LocalFilesystem())
        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        return self.fs.get_hadoop_version()

    def get_hadoop_bin(self):
        """Find the hadoop binary. A list: binary followed by arguments."""
        return self.fs.get_hadoop_bin()

    def get_hadoop_streaming_jar(self):
        """Find the path of the hadoop streaming jar, or None if not found."""
        if not (self._hadoop_streaming_jar
                or self._searched_for_hadoop_streaming_jar):

            self._hadoop_streaming_jar = self._find_hadoop_streaming_jar()

            if self._hadoop_streaming_jar:
                log.info('Found Hadoop streaming jar: %s' %
                         self._hadoop_streaming_jar)
            else:
                log.warning('Hadoop streaming jar not found. Use'
                            ' --hadoop-streaming-jar')

            self._searched_for_hadoop_streaming_jar = True

        return self._hadoop_streaming_jar

    def _find_hadoop_streaming_jar(self):
        """Search for the hadoop streaming jar. See
        :py:meth:`_hadoop_streaming_jar_dirs` for where we search."""
        for path in unique(self._hadoop_streaming_jar_dirs()):
            log.info('Looking for Hadoop streaming jar in %s' % path)

            streaming_jars = []
            for path in self.fs.ls(path):
                if HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)):
                    streaming_jars.append(path)

            if streaming_jars:
                # prefer shorter names and shallower paths
                def sort_key(p):
                    return (len(p.split('/')), len(posixpath.basename(p)), p)

                streaming_jars.sort(key=sort_key)

                return streaming_jars[0]

        return None

    def _hadoop_dirs(self):
        """Yield all possible hadoop directories (used for streaming jar
        and logs). May yield duplicates"""
        if self._opts['hadoop_home']:
            yield self._opts['hadoop_home']

        for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL',
                     'HADOOP_MAPRED_HOME'):
            path = os.environ.get(name)
            if path:
                yield path

        # guess it from the path of the Hadoop binary
        hadoop_home = hadoop_prefix_from_bin(self.get_hadoop_bin()[0])
        if hadoop_home:
            yield hadoop_home

        # try HADOOP_*_HOME
        for name, path in sorted(os.environ.items()):
            if name.startswith('HADOOP_') and name.endswith('_HOME'):
                yield path

    def _hadoop_streaming_jar_dirs(self):
        """Yield all possible places to look for the Hadoop streaming jar.
        May yield duplicates.
        """
        for hadoop_dir in self._hadoop_dirs():
            yield hadoop_dir

        # use hard-coded paths to work out-of-the-box on EMR
        for path in _EMR_HADOOP_STREAMING_JAR_DIRS:
            yield path

    def _hadoop_log_dirs(self, output_dir=None):
        """Yield all possible places to look for hadoop logs."""
        # hadoop_log_dirs opt overrides all this
        if self._opts['hadoop_log_dirs']:
            for path in self._opts['hadoop_log_dirs']:
                yield path
            return

        hadoop_log_dir = os.environ.get('HADOOP_LOG_DIR')
        if hadoop_log_dir:
            yield hadoop_log_dir

        if uses_yarn(self.get_hadoop_version()):
            yarn_log_dir = os.environ.get('YARN_LOG_DIR')
            if yarn_log_dir:
                yield yarn_log_dir

        if output_dir:
            # Cloudera style of logging
            yield posixpath.join(output_dir, '_logs')

        for hadoop_dir in self._hadoop_dirs():
            yield posixpath.join(hadoop_dir, 'logs')

        # hard-coded log paths for EMR, so this can work out-of-the-box
        for path in _EMR_HADOOP_LOG_DIRS:
            yield path

    def _run(self):
        self._check_input_exists()
        self._create_setup_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if self._opts['check_input_paths']:
                if not self.fs.exists(path):
                    raise AssertionError('Input path %s does not exist!' %
                                         (path, ))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self.fs.mkdir(self._upload_mgr.prefix)

        log.info('Copying local files into %s' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().items():
            self._upload_to_hdfs(path, uri)

    def _upload_to_hdfs(self, path, target):
        log.debug('Uploading %s -> %s on HDFS' % (path, target))
        self.fs._put(path, target)

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN')
        # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s' % stdin_path)
        stdin_file = open(stdin_path, 'wb')
        for line in self._stdin:
            stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        for step_num in range(self._num_steps()):
            step_args = self._args_for_step(step_num)

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

                step_info = _process_stderr_from_streaming(step_proc.stderr)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_hadoop(to_string(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvp(step_args[0], step_args)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_info = _process_stderr_from_streaming(
                            _wrap_streaming_pty_output(master))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if not step_info['output_dir']:
                step_info['output_dir'] = self._hdfs_step_output_dir(step_num)

            if not step_info['counters']:
                pass  # TODO: fetch counters; see _fetch_counters()

            self._steps_info.append(step_info)

            # just print counters for this one step
            self._print_counters(step_nums=[step_num])

            if returncode:
                err_lines = [
                    'Job failed with return code %d: %s' %
                    (returncode, cmd_line(step_args))
                ]

                cause = self._find_probable_cause_of_failure(**step_info)

                if cause:
                    err_lines.append('')  # pad with empty line
                    err_lines.extend(_format_cause_of_failure(cause))

                for err_line in err_lines:
                    log.error(err_line)
                raise Exception('\n'.join(err_lines) + '\n')

    def _args_for_step(self, step_num):
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._args_for_streaming_step(step_num)
        elif step['type'] == 'jar':
            return self._args_for_jar_step(step_num)
        else:
            raise AssertionError('Bad step type: %r' % (step['type'], ))

    def _args_for_streaming_step(self, step_num):
        version = self.get_hadoop_version()

        hadoop_streaming_jar = self.get_hadoop_streaming_jar()
        if not hadoop_streaming_jar:
            raise Exception('no Hadoop streaming jar')

        args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar]

        # -files/-archives (generic options, new-style)
        if supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            args.extend(self._upload_args(self._upload_mgr))

        # Add extra Hadoop args first, since they may include Hadoop-specific
        # generic options (e.g. -libjars) which must come before job-specific
        # args.
        args.extend(self._hadoop_args_for_step(step_num))

        # set up input
        for input_uri in self._hdfs_step_input_files(step_num):
            args.extend(['-input', input_uri])

        # set up output
        args.append('-output')
        args.append(self._hdfs_step_output_dir(step_num))

        # -cacheFile/-cacheArchive (streaming options, old-style)
        if not supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            args.extend(self._pre_0_20_upload_args(self._upload_mgr))

        mapper, combiner, reducer = self._hadoop_streaming_commands(step_num)

        args.append('-mapper')
        args.append(mapper)

        if combiner:
            args.append('-combiner')
            args.append(combiner)

        if reducer:
            args.append('-reducer')
            args.append(reducer)
        else:
            args.extend(['-jobconf', 'mapred.reduce.tasks=0'])

        return args
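
# For orientation only: a hypothetical argument list this method might
# assemble for a one-step job. The jar location, HDFS URIs, and
# mapper/reducer commands below are all made up.
args = [
    'hadoop', 'jar', '/usr/lib/hadoop/hadoop-streaming.jar',
    '-files', 'hdfs:///user/me/tmp/files/mr_word_count.py#mr_word_count.py',
    '-input', 'hdfs:///user/me/tmp/files/input.txt',
    '-output', 'hdfs:///user/me/tmp/output',
    '-mapper', 'python mr_word_count.py --step-num=0 --mapper',
    '-reducer', 'python mr_word_count.py --step-num=0 --reducer',
]
print(' '.join(args))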

    def _args_for_jar_step(self, step_num):
        step = self._get_step(step_num)

        # special case for consistency with EMR runner.
        #
        # This might look less like duplicated code if we ever
        # implement #780 (fetching jars from URIs)
        if step['jar'].startswith('file:///'):
            jar = step['jar'][7:]  # keep leading slash
        else:
            jar = step['jar']

        args = self.get_hadoop_bin() + ['jar', jar]

        if step.get('main_class'):
            args.append(step['main_class'])

        # TODO: merge with logic in mrjob/emr.py
        def interpolate(arg):
            if arg == mrjob.step.JarStep.INPUT:
                return ','.join(self._hdfs_step_input_files(step_num))
            elif arg == mrjob.step.JarStep.OUTPUT:
                return self._hdfs_step_output_dir(step_num)
            else:
                return arg

        if step.get('args'):
            args.extend(interpolate(arg) for arg in step['args'])

        return args

    def _hdfs_step_input_files(self, step_num):
        """Get the hdfs:// URI for input for the given step."""
        if step_num == 0:
            return [self._upload_mgr.uri(p) for p in self._get_input_paths()]
        else:
            return [
                posixpath.join(self._hadoop_tmp_dir, 'step-output',
                               str(step_num))
            ]

    def _hdfs_step_output_dir(self, step_num):
        if step_num == len(self._get_steps()) - 1:
            return self._output_dir
        else:
            return posixpath.join(self._hadoop_tmp_dir, 'step-output',
                                  str(step_num + 1))
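
# A small sketch of how these two methods chain steps together for a
# hypothetical three-step job (the tmp dir and final output dir are made up):
# step 0 reads the uploaded input, each later step reads step-output/<step_num>,
# and only the last step writes to the real output dir.
import posixpath

hadoop_tmp_dir = 'hdfs:///user/me/tmp/mrjob/job-123'
output_dir = 'hdfs:///user/me/output'
num_steps = 3

for step_num in range(num_steps):
    if step_num == num_steps - 1:
        out = output_dir
    else:
        out = posixpath.join(hadoop_tmp_dir, 'step-output', str(step_num + 1))
    print('step %d -> %s' % (step_num, out))
# step 0 -> hdfs:///user/me/tmp/mrjob/job-123/step-output/1
# step 1 -> hdfs:///user/me/tmp/mrjob/job-123/step-output/2
# step 2 -> hdfs:///user/me/output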

    def _cleanup_local_tmp(self):
        super(HadoopJobRunner, self)._cleanup_local_tmp()

        if self._hadoop_tmp_dir:
            log.info('deleting %s from HDFS' % self._hadoop_tmp_dir)
            try:
                self.fs.rm(self._hadoop_tmp_dir)
            except Exception as e:
                log.exception(e)

    ### LOG FETCHING/PARSING ###

    def _find_probable_cause_of_failure(self,
                                        application_id=None,
                                        job_id=None,
                                        output_dir=None,
                                        **ignored):
        """Find probable cause of failure. Currently we just scan task logs.

        On YARN, you must set application_id, and pre-YARN, you must set
        job_id.
        """
        # package up logs for _find_error_in_task_logs(),
        # and log where we're looking.
        hadoop_version = self.get_hadoop_version()
        yarn = uses_yarn(hadoop_version)

        if yarn and application_id is None:
            log.warning("No application ID!")
            return None

        if not yarn and job_id is None:
            log.warning("No job ID!")
            return None

        # Note: this is unlikely to be super-helpful on "real" (multi-node)
        # pre-YARN Hadoop, because task logs generally aren't shipped to a
        # local directory. It's a start, anyway. See #1201.
        def stream_task_log_dirs():
            for log_dir in unique(
                    self._hadoop_log_dirs(output_dir=output_dir)):

                if yarn:
                    path = self.fs.join(log_dir, 'userlogs', application_id)
                else:
                    # sometimes pre-YARN attempt logs are organized by job_id,
                    # sometimes not. Play it safe
                    path = self.fs.join(log_dir, 'userlogs')

                if self.fs.exists(path):
                    log.info('looking for logs in %s' % path)
                    yield [path]

        return _find_error_in_task_logs(self.fs,
                                        stream_task_log_dirs(),
                                        hadoop_version,
                                        application_id=application_id,
                                        job_id=job_id)

        # TODO: catch timeouts, etc.

    # TODO: redo this
    def _fetch_counters(self, step_nums, skip_s3_wait=False):
        """Read Hadoop counters from local logs.

        Args:
        step_nums -- the steps belonging to us, so that we can ignore errors
                     from other jobs run with the same timestamp
        """
        uris = self._ls_logs('job', step_nums)
        new_counters = scan_for_counters_in_files(uris, self,
                                                  self.get_hadoop_version())

        # only include steps relevant to the current job
        for step_num in step_nums:
            self._counters.append(new_counters.get(step_num, {}))

    def counters(self):
        return [step_info['counters'] for step_info in self._steps_info]
Example #24
0
 def uri_adds_trailing_slash(self):
     sd = UploadDirManager("s3://bucket/dir")
     sd.add("foo/bar.py")
     self.assertEqual(sd.uri("foo/bar.py"), "s3://bucket/dir/bar.py")
     self.assertEqual(sd.path_to_uri(), {"foo/bar.py": "s3://bucket/dir/bar.py"})
Example #25
0
 def uri_adds_trailing_slash(self):
     sd = UploadDirManager('s3://bucket/dir')
     sd.add('foo/bar.py')
     self.assertEqual(sd.uri('foo/bar.py'), 's3://bucket/dir/bar.py')
     self.assertEqual(sd.path_to_uri(),
                      {'foo/bar.py': 's3://bucket/dir/bar.py'})
Example #26
0
class HadoopJobRunner(MRJobRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.
    Invoked when you run your job with ``-r hadoop``.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """
    alias = 'hadoop'

    OPTION_STORE_CLASS = HadoopRunnerOptionStore

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hdfs_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(
                self._opts['hdfs_scratch_dir'], self._job_name))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hdfs_tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or
            posixpath.join(self._hdfs_tmp_dir, 'output'))

        self._hadoop_log_dir = hadoop_log_dir(self._opts['hadoop_home'])

        # Running jobs via hadoop assigns a new timestamp to each job.
        # Running jobs via mrjob only adds steps.
        # Store both of these values to enable log parsing.
        self._job_timestamp = None
        self._start_step_num = 0

        # init hadoop version cache
        self._hadoop_version = None

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem(
                HadoopFilesystem(self._opts['hadoop_bin']),
                LocalFilesystem())
        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        if not self._hadoop_version:
            stdout = self.invoke_hadoop(['version'], return_stdout=True)
            if stdout:
                first_line = stdout.split('\n')[0]
                m = HADOOP_VERSION_RE.match(first_line)
                if m:
                    self._hadoop_version = m.group('version')
                    log.info("Using Hadoop version %s" % self._hadoop_version)
                    return self._hadoop_version
            self._hadoop_version = '0.20.203'
            log.info("Unable to determine Hadoop version. Assuming 0.20.203.")
        return self._hadoop_version
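
# A hedged sketch of the sort of parsing this does on the first line of
# `hadoop version` output; the regex below is illustrative, not mrjob's
# HADOOP_VERSION_RE.
import re

first_line = 'Hadoop 1.0.3'
m = re.match(r'^Hadoop (?P<version>\S+)', first_line)
if m:
    print(m.group('version'))  # -> '1.0.3'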

    def _run(self):
        self._check_input_exists()
        self._create_setup_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if self._opts['check_input_paths']:
                if not self.path_exists(path):
                    raise AssertionError(
                        'Input path %s does not exist!' % (path,))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self._mkdir_on_hdfs(self._upload_mgr.prefix)

        log.info('Copying local files into %s' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().iteritems():
            self._upload_to_hdfs(path, uri)

    def _mkdir_on_hdfs(self, path):
        log.debug('Making directory %s on HDFS' % path)
        self.invoke_hadoop(['fs', '-mkdir', path])

    def _upload_to_hdfs(self, path, target):
        log.debug('Uploading %s -> %s on HDFS' % (path, target))
        self.invoke_hadoop(['fs', '-put', path, target])
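
# Both helpers above just shell out to the hadoop binary via invoke_hadoop();
# a rough standalone sketch of the same effect (the binary name and paths are
# made up).
import subprocess

def put_on_hdfs(hadoop_bin, local_path, hdfs_uri):
    # roughly equivalent to: hadoop fs -put <local_path> <hdfs_uri>
    subprocess.check_call(
        list(hadoop_bin) + ['fs', '-put', local_path, hdfs_uri])

# put_on_hdfs(['hadoop'], 'mr_word_count.py',
#             'hdfs:///user/me/tmp/files/mr_word_count.py')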

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN')
        # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s' % stdin_path)
        stdin_file = open(stdin_path, 'w')
        for line in self._stdin:
            stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        self._counters = []

        for step_num in xrange(self._num_steps()):
            log.debug('running step %d of %d' %
                      (step_num + 1, self._num_steps()))

            step_args = self._args_for_step(step_num)

            log.debug('> %s' % cmd_line(step_args))

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen
                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

                self._process_stderr_from_streaming(step_proc.stderr)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    log.error('STDOUT: ' + line.strip('\n'))

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvp(step_args[0], step_args)
                else:
                    master = os.fdopen(master_fd)
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    self._process_stderr_from_streaming(master)
                    _, returncode = os.waitpid(pid, 0)
                    master.close()

            if returncode == 0:
                # parsing needs step number for whole job
                self._fetch_counters([step_num + self._start_step_num])
                # printing needs step number relevant to this run of mrjob
                self.print_counters([step_num + 1])
            else:
                msg = ('Job failed with return code %d: %s' %
                       (returncode, step_args))
                log.error(msg)
                # look for a Python traceback
                cause = self._find_probable_cause_of_failure(
                    [step_num + self._start_step_num])
                if cause:
                    # log cause, and put it in exception
                    cause_msg = []  # lines to log and put in exception
                    cause_msg.append('Probable cause of failure (from %s):' %
                                     cause['log_file_uri'])
                    cause_msg.extend(line.strip('\n')
                                     for line in cause['lines'])
                    if cause['input_uri']:
                        cause_msg.append('(while reading from %s)' %
                                         cause['input_uri'])

                    for line in cause_msg:
                        log.error(line)

                    # add cause_msg to exception message
                    msg += '\n' + '\n'.join(cause_msg) + '\n'

                raise CalledProcessError(returncode, step_args)

    def _process_stderr_from_streaming(self, stderr):

        def treat_eio_as_eof(iter):
            # on Linux, the PTY gives us a specific IOError when the
            # child process exits, rather than EOF.
            while True:
                try:
                    yield iter.next()  # okay for StopIteration to bubble up
                except IOError, e:
                    if e.errno == errno.EIO:
                        return
                    else:
                        raise

        for line in treat_eio_as_eof(stderr):
            line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2)
            log.info('HADOOP: ' + line)

            if 'Streaming Job Failed!' in line:
                raise Exception(line)

            # The job identifier is printed to stderr. We only want to parse it
            # once because we know how many steps we have and just want to know
            # what Hadoop thinks the first step's number is.
            m = HADOOP_JOB_TIMESTAMP_RE.match(line)
            if m and self._job_timestamp is None:
                self._job_timestamp = m.group('timestamp')
                self._start_step_num = int(m.group('step_num'))
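
# The timestamp/step-number parsing at the end of this example is looking for
# Hadoop's "Running job: job_<timestamp>_<number>" line on stderr; a hedged
# illustration of that kind of match (this regex is not mrjob's actual
# HADOOP_JOB_TIMESTAMP_RE).
import re

JOB_ID_RE = re.compile(r'Running job: job_(?P<timestamp>\d+)_(?P<step_num>\d+)')

line = 'INFO streaming.StreamJob: Running job: job_201601011234_0007'
m = JOB_ID_RE.search(line)
print(m.group('timestamp'), int(m.group('step_num')))  # -> 201601011234 7
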
Example #27
0
class HadoopJobRunner(MRJobRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.
    Invoked when you run your job with ``-r hadoop``.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """
    alias = 'hadoop'

    OPTION_STORE_CLASS = HadoopRunnerOptionStore

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hadoop_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(self._opts['hadoop_tmp_dir'], self._job_key))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or posixpath.join(self._hadoop_tmp_dir, 'output'))

        # Running jobs via hadoop assigns a new timestamp to each job.
        # Running jobs via mrjob only adds steps.
        # Store both of these values to enable log parsing.
        self._job_timestamp = None
        self._start_step_num = 0

        # Keep track of where the hadoop streaming jar is
        self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar']
        self._searched_for_hadoop_streaming_jar = False

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem(
                HadoopFilesystem(self._opts['hadoop_bin']), LocalFilesystem())
        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        return self.fs.get_hadoop_version()

    def get_hadoop_bin(self):
        """Find the hadoop binary. A list: binary followed by arguments."""
        return self.fs.get_hadoop_bin()

    def get_hadoop_streaming_jar(self):
        """Find the path of the hadoop streaming jar, or None if not found."""
        if not (self._hadoop_streaming_jar
                or self._searched_for_hadoop_streaming_jar):

            self._hadoop_streaming_jar = self._find_hadoop_streaming_jar()

            if self._hadoop_streaming_jar:
                log.info('Found Hadoop streaming jar: %s' %
                         self._hadoop_streaming_jar)
            else:
                log.warning('Hadoop streaming jar not found. Use'
                            ' --hadoop-streaming-jar')

            self._searched_for_hadoop_streaming_jar = True

        return self._hadoop_streaming_jar

    def _find_hadoop_streaming_jar(self):
        """Search for the hadoop streaming jar. See
        :py:meth:`_hadoop_streaming_jar_dirs` for where we search."""
        for path in unique(self._hadoop_streaming_jar_dirs()):
            log.info('Looking for Hadoop streaming jar in %s' % path)

            streaming_jars = []
            for jar_path in self.fs.ls(path):
                if HADOOP_STREAMING_JAR_RE.match(
                        posixpath.basename(jar_path)):
                    streaming_jars.append(jar_path)

            if streaming_jars:
                # prefer shallower paths, then shorter names
                def sort_key(p):
                    return (len(p.split('/')), len(posixpath.basename(p)), p)

                streaming_jars.sort(key=sort_key)

                return streaming_jars[0]

        return None

    def _hadoop_streaming_jar_dirs(self):
        """Yield all possible places to look for the Hadoop streaming jar."""
        if self._opts['hadoop_home']:
            yield self._opts['hadoop_home']

        for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL',
                     'HADOOP_MAPRED_HOME'):
            path = os.environ.get(name)
            if path:
                yield path

        # guess it from the path of the Hadoop binary
        hadoop_home = hadoop_prefix_from_bin(self.get_hadoop_bin()[0])
        if hadoop_home:
            yield hadoop_home

        # try HADOOP_*_HOME
        for name, path in sorted(os.environ.items()):
            if name.startswith('HADOOP_') and name.endswith('_HOME'):
                yield path

        # use hard-coded paths to work out-of-the-box on EMR
        for path in _EMR_HADOOP_STREAMING_JAR_DIRS:
            yield path

    def _run(self):
        self._check_input_exists()
        self._create_setup_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if self._opts['check_input_paths']:
                if not self.fs.exists(path):
                    raise AssertionError('Input path %s does not exist!' %
                                         (path, ))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self.fs.mkdir(self._upload_mgr.prefix)

        log.info('Copying local files into %s' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().items():
            self._upload_to_hdfs(path, uri)

    def _upload_to_hdfs(self, path, target):
        log.debug('Uploading %s -> %s on HDFS' % (path, target))
        self.fs._put(path, target)

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN')
        # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s' % stdin_path)
        stdin_file = open(stdin_path, 'wb')
        for line in self._stdin:
            stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        self._counters = []

        for step_num in range(self._num_steps()):
            log.debug('running step %d of %d' %
                      (step_num + 1, self._num_steps()))

            step_args = self._args_for_step(step_num)

            log.debug('> %s' % cmd_line(step_args))

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen
                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

                self._process_stderr_from_streaming(step_proc.stderr)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    log.error('STDOUT: ' + to_string(line.strip(b'\n')))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvp(step_args[0], step_args)
                else:
                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        self._process_stderr_from_streaming(master)
                        _, returncode = os.waitpid(pid, 0)

            if returncode == 0:
                # parsing needs step number for whole job
                self._fetch_counters([step_num + self._start_step_num])
                # printing needs step number relevant to this run of mrjob
                self.print_counters([step_num + 1])
            else:
                msg = ('Job failed with return code %d: %s' %
                       (returncode, step_args))
                log.error(msg)
                # look for a Python traceback
                cause = self._find_probable_cause_of_failure(
                    [step_num + self._start_step_num])
                if cause:
                    # log cause, and put it in exception
                    cause_msg = []  # lines to log and put in exception
                    cause_msg.append('Probable cause of failure (from %s):' %
                                     cause['log_file_uri'])
                    cause_msg.extend(
                        line.strip('\n') for line in cause['lines'])
                    if cause['input_uri']:
                        cause_msg.append('(while reading from %s)' %
                                         cause['input_uri'])

                    for line in cause_msg:
                        log.error(line)

                    # add cause_msg to exception message
                    msg += '\n' + '\n'.join(cause_msg) + '\n'

                raise CalledProcessError(returncode, step_args)

    def _process_stderr_from_streaming(self, stderr):
        def treat_eio_as_eof(iter):
            # on Linux, the PTY gives us a specific IOError when the
            # child process exits, rather than EOF.
            while True:
                try:
                    yield next(iter)  # okay for StopIteration to bubble up
                except IOError as e:
                    if e.errno == errno.EIO:
                        return
                    else:
                        raise

        for line in treat_eio_as_eof(stderr):
            line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2)
            log.info('HADOOP: ' + to_string(line))

            if b'Streaming Job Failed!' in line:
                raise Exception(line)

            # The job identifier is printed to stderr. We only want to parse it
            # once because we know how many steps we have and just want to know
            # what Hadoop thinks the first step's number is.
            m = HADOOP_JOB_TIMESTAMP_RE.match(line)
            if m and self._job_timestamp is None:
                self._job_timestamp = m.group('timestamp')
                self._start_step_num = int(m.group('step_num'))

    def _args_for_step(self, step_num):
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._args_for_streaming_step(step_num)
        elif step['type'] == 'jar':
            return self._args_for_jar_step(step_num)
        else:
            raise AssertionError('Bad step type: %r' % (step['type'], ))

    def _args_for_streaming_step(self, step_num):
        version = self.get_hadoop_version()

        hadoop_streaming_jar = self.get_hadoop_streaming_jar()
        if not hadoop_streaming_jar:
            raise Exception('no Hadoop streaming jar')

        args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar]

        # -files/-archives (generic options, new-style)
        if supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            args.extend(self._upload_args(self._upload_mgr))

        # Add extra Hadoop args first, since they may include Hadoop-specific
        # generic options (e.g. -libjars) which must come before job-specific
        # args.
        args.extend(self._hadoop_args_for_step(step_num))

        # set up input
        for input_uri in self._hdfs_step_input_files(step_num):
            args.extend(['-input', input_uri])

        # set up output
        args.append('-output')
        args.append(self._hdfs_step_output_dir(step_num))

        # -cacheFile/-cacheArchive (streaming options, old-style)
        if not supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            args.extend(self._pre_0_20_upload_args(self._upload_mgr))

        mapper, combiner, reducer = self._hadoop_streaming_commands(step_num)

        args.append('-mapper')
        args.append(mapper)

        if combiner:
            args.append('-combiner')
            args.append(combiner)

        if reducer:
            args.append('-reducer')
            args.append(reducer)
        else:
            args.extend(['-jobconf', 'mapred.reduce.tasks=0'])

        return args

    def _args_for_jar_step(self, step_num):
        step = self._get_step(step_num)

        # special case for consistency with EMR runner.
        #
        # This might look less like duplicated code if we ever
        # implement #780 (fetching jars from URIs)
        if step['jar'].startswith('file:///'):
            jar = step['jar'][7:]  # keep leading slash
        else:
            jar = step['jar']

        args = self.get_hadoop_bin() + ['jar', jar]

        if step.get('main_class'):
            args.append(step['main_class'])

        # TODO: merge with logic in mrjob/emr.py
        def interpolate(arg):
            if arg == mrjob.step.JarStep.INPUT:
                return ','.join(self._hdfs_step_input_files(step_num))
            elif arg == mrjob.step.JarStep.OUTPUT:
                return self._hdfs_step_output_dir(step_num)
            else:
                return arg

        if step.get('args'):
            args.extend(interpolate(arg) for arg in step['args'])

        return args

    def _hdfs_step_input_files(self, step_num):
        """Get the hdfs:// URI for input for the given step."""
        if step_num == 0:
            return [self._upload_mgr.uri(p) for p in self._get_input_paths()]
        else:
            return [
                posixpath.join(self._hadoop_tmp_dir, 'step-output',
                               str(step_num))
            ]

    def _hdfs_step_output_dir(self, step_num):
        if step_num == len(self._get_steps()) - 1:
            return self._output_dir
        else:
            return posixpath.join(self._hadoop_tmp_dir, 'step-output',
                                  str(step_num + 1))

    def _cleanup_local_tmp(self):
        super(HadoopJobRunner, self)._cleanup_local_tmp()

        if self._hadoop_tmp_dir:
            log.info('deleting %s from HDFS' % self._hadoop_tmp_dir)
            try:
                self.fs.rm(self._hadoop_tmp_dir)
            except Exception as e:
                log.exception(e)

    ### LOG FETCHING/PARSING ###

    def _enforce_path_regexp(self, paths, regexp, step_nums):
        """Helper for log fetching functions to filter out unwanted
        logs. Only yield paths whose regexp match has a step number in
        step_nums (when given) and, when known, this job's timestamp.
        """
        for path in paths:
            m = regexp.match(path)
            if (m and
                (step_nums is None or int(m.group('step_num')) in step_nums)
                    and (self._job_timestamp is None
                         or m.group('timestamp') == self._job_timestamp)):

                yield path

    def _ls_logs(self, log_type, step_nums=None):
        """List logs on the local filesystem by path relative to log root
        directory
        """
        return []
        # in YARN, you can just ask the yarn bin:
        # http://hortonworks.com/blog/simplifying-user-logs-management-and-access-in-yarn/  # noqa

        # TODO: redo this to look in
        # $HADOOP_LOG_DIR
        # dirname(hadoop_bin[0])/../logs
        # <output_dir>/_logs
        # ??? other places ???

    def _fetch_counters(self, step_nums, skip_s3_wait=False):
        """Read Hadoop counters from local logs.

        Args:
        step_nums -- the steps belonging to us, so that we can ignore errors
                     from other jobs run with the same timestamp
        """
        uris = self._ls_logs('job', step_nums)
        new_counters = scan_for_counters_in_files(uris, self,
                                                  self.get_hadoop_version())

        # only include steps relevant to the current job
        for step_num in step_nums:
            self._counters.append(new_counters.get(step_num, {}))

    def counters(self):
        return self._counters

    def _find_probable_cause_of_failure(self, step_nums):
        task_attempt_logs = self._ls_logs('task')
        step_logs = self._ls_logs('step')
        job_logs = self._ls_logs('job')

        log.info('Scanning logs for probable cause of failure')
        return best_error_from_logs(self, task_attempt_logs, step_logs,
                                    job_logs)
Example #28
0
 def test_simple(self):
     sd = UploadDirManager('hdfs:///')
     sd.add('foo/bar.py')
     self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
Example #29
0
class DataprocJobRunner(HadoopInTheCloudJobRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` on Google Cloud Dataproc.
    Invoked when you run your job with ``-r dataproc``.

    :py:class:`DataprocJobRunner` runs your job in a Dataproc cluster, which
    is basically a temporary Hadoop cluster.

    Input, support, and jar files can be either local or on GCS; use
    ``gs://...`` URLs to refer to files on GCS.

    This class has some useful utilities for talking directly to GCS and
    Dataproc, so you may find it useful to instantiate it without a script::

        from mrjob.dataproc import DataprocJobRunner
        ...
    """
    alias = 'dataproc'

    OPT_NAMES = HadoopInTheCloudJobRunner.OPT_NAMES | {
        'gcp_project',
    }

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same
        arguments as
        :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(DataprocJobRunner, self).__init__(**kwargs)

        # Dataproc requires a master and >= 2 core instances.
        # num_core_instances refers ONLY to the number of CORE instances
        # and does NOT include the master instance.
        # In other words, the minimum cluster size is 3 machines:
        # 1 master and 2 core workers ("num_core_instances").
        if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS:
            raise DataprocException(
                'Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS)

        if (self._opts['core_instance_type'] !=
                self._opts['task_instance_type']):
            raise DataprocException(
                'Dataproc v1 expects core/task instance types to be identical')

        # Lazy-load gcloud config as needed - invocations fail in PyCharm
        # debugging
        self._gcloud_config = None

        # Google Cloud Platform - project
        self._gcp_project = (
            self._opts['gcp_project'] or self.gcloud_config()['core.project'])

        # Google Compute Engine - Region / Zone
        self._gce_region = (
            self._opts['region'] or self.gcloud_config()['compute.region'])
        self._gce_zone = (
            self._opts['zone'] or self.gcloud_config()['compute.zone'])

        # cluster_id can be None here
        self._cluster_id = self._opts['cluster_id']

        self._api_client = None
        self._gcs_fs = None
        self._fs = None

        # BEGIN - setup directories
        base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir'])

        self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir)

        # use job key to make a unique tmp dir
        self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/'

        # pick/validate output dir
        if self._output_dir:
            self._output_dir = _check_and_fix_fs_dir(self._output_dir)
        else:
            self._output_dir = self._job_tmpdir + 'output/'
        # END - setup directories

        # manage local files that we want to upload to GCS. We'll add them
        # to this manager just before we need them.
        fs_files_dir = self._job_tmpdir + 'files/'
        self._upload_mgr = UploadDirManager(fs_files_dir)

        # when did our particular task start?
        self._dataproc_job_start = None

        # init hadoop, ami version caches
        self._image_version = None
        self._hadoop_version = None

        # This will be filled by _run_steps()
        # NOTE - log_interpretations will be empty except job_id until we
        # parse task logs
        self._log_interpretations = []

    def _default_opts(self):
        return combine_dicts(
            super(DataprocJobRunner, self)._default_opts(),
            dict(
                bootstrap_python=True,
                check_cluster_every=_DEFAULT_CHECK_CLUSTER_EVERY,
                cleanup=['CLUSTER', 'JOB', 'LOCAL_TMP'],
                cloud_fs_sync_secs=_DEFAULT_CLOUD_FS_SYNC_SECS,
                image_version=_DEFAULT_IMAGE_VERSION,
                instance_type=_DEFAULT_INSTANCE_TYPE,
                master_instance_type=_DEFAULT_INSTANCE_TYPE,
                num_core_instances=_DATAPROC_MIN_WORKERS,
                num_task_instances=0,
                sh_bin=['/bin/sh', '-ex'],
            )
        )

    def gcloud_config(self):
        """Lazy load gcloud SDK configs"""
        if not self._gcloud_config:
            self._gcloud_config = _read_gcloud_config()

        return self._gcloud_config

    @property
    def api_client(self):
        if not self._api_client:
            credentials = GoogleCredentials.get_application_default()

            api_client = discovery.build(
                _DATAPROC_API_ENDPOINT, _DATAPROC_API_VERSION,
                credentials=credentials)
            self._api_client = api_client.projects().regions()

        return self._api_client

    @property
    def fs(self):
        """:py:class:`~mrjob.fs.base.Filesystem` object for SSH, S3, GCS, and
        the local filesystem.
        """
        if self._fs is not None:
            return self._fs

        self._gcs_fs = GCSFilesystem()

        self._fs = CompositeFilesystem(self._gcs_fs, LocalFilesystem())
        return self._fs

    def _get_tmpdir(self, given_tmpdir):
        """Helper for _fix_tmpdir"""
        if given_tmpdir:
            return given_tmpdir

        mrjob_buckets = self.fs.list_buckets(
            self._gcp_project, prefix='mrjob-')

        # Loop over buckets until we find one that matches region
        # NOTE - because this is a tmpdir, we look for a GCS bucket in the
        # same GCE region
        chosen_bucket_name = None
        gce_lower_location = self._gce_region.lower()
        for tmp_bucket in mrjob_buckets:
            tmp_bucket_name = tmp_bucket['name']

            # NOTE - ambiguous GCP behavior: bucket location is returned in
            # UPPERCASE even though the docs suggest lowercase (ticket filed
            # as of Apr 23, 2016)
            lower_location = tmp_bucket['location'].lower()
            if lower_location == gce_lower_location:
                # Regions are both specified and match
                log.info("using existing temp bucket %s" % tmp_bucket_name)
                chosen_bucket_name = tmp_bucket_name
                break

        # Example default - "mrjob-us-central1-RANDOMHEX"
        if not chosen_bucket_name:
            chosen_bucket_name = '-'.join(
                ['mrjob', gce_lower_location, random_identifier()])

        return 'gs://%s/tmp/' % chosen_bucket_name
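
# A minimal sketch of the default tmp-bucket naming described above
# ("mrjob-<region>-<random hex>"); random_identifier() here just stands in
# for whatever mrjob actually generates.
import random

def random_identifier():
    # stand-in for mrjob's real random suffix generator
    return '%016x' % random.getrandbits(64)

print('gs://%s/tmp/' % '-'.join(['mrjob', 'us-central1', random_identifier()]))
# e.g. gs://mrjob-us-central1-3f9c2d1a0b7e4c55/tmp/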

    def _run(self):
        self._launch()
        self._run_steps()

    def _launch(self):
        self._prepare_for_launch()
        self._launch_cluster()

    def _prepare_for_launch(self):
        self._check_output_not_exists()
        self._create_setup_wrapper_script()
        self._add_bootstrap_files_for_upload()
        self._add_job_files_for_upload()
        self._upload_local_files_to_fs()

    def _check_output_not_exists(self):
        """Verify the output path does not already exist. This avoids
        provisioning a cluster only to have Hadoop refuse to launch.
        """
        if self.fs.exists(self._output_dir):
            raise IOError(
                'Output path %s already exists!' % (self._output_dir,))

    def _add_bootstrap_files_for_upload(self):
        """Add files needed by the bootstrap script to self._upload_mgr.

        Tar up mrjob if bootstrap_mrjob is True.

        Create the master bootstrap script if necessary.

        """
        # lazily create mrjob.zip
        if self._bootstrap_mrjob():
            self._create_mrjob_zip()
            self._bootstrap_dir_mgr.add('file', self._mrjob_zip_path)

        # all other files needed by the script are already in
        # _bootstrap_dir_mgr
        for path in self._bootstrap_dir_mgr.paths():
            self._upload_mgr.add(path)

        # now that we know where the above files live, we can create
        # the master bootstrap script
        self._create_master_bootstrap_script_if_needed()
        if self._master_bootstrap_script_path:
            self._upload_mgr.add(self._master_bootstrap_script_path)
            self._upload_mgr.add(_MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH)

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

        # TODO - mtai @ davidmarin - hadoop_streaming_jar is currently ignored,
        # see _HADOOP_STREAMING_JAR_URI
        # if self._opts['hadoop_streaming_jar']:
        #     self._upload_mgr.add(self._opts['hadoop_streaming_jar'])

        for step in self._get_steps():
            if step.get('jar'):
                self._upload_mgr.add(step['jar'])

    def _upload_local_files_to_fs(self):
        """Copy local files tracked by self._upload_mgr to FS."""
        bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
        self._create_fs_tmp_bucket(bucket_name)

        log.info('Copying non-input files into %s' % self._upload_mgr.prefix)

        for path, gcs_uri in self._upload_mgr.path_to_uri().items():
            log.debug('uploading %s -> %s' % (path, gcs_uri))

            # TODO - mtai @ davidmarin - Implement put function for other FSs
            self.fs.put(path, gcs_uri)

        self._wait_for_fs_sync()

    def _create_fs_tmp_bucket(self, bucket_name, location=None):
        """Create a temp bucket if missing

        Tie the temporary bucket to the same region as the GCE job and set a
        28-day TTL
        """
        # Return early if our bucket already exists
        try:
            self.fs.get_bucket(bucket_name)
            return
        except google_errors.HttpError as e:
            if not e.resp.status == 404:
                raise

        log.info('creating FS bucket %r' % bucket_name)

        location = location or self._gce_region

        # NOTE - By default, we create a bucket in the same GCE region as our
        # job (tmp buckets ONLY)
        # https://cloud.google.com/storage/docs/bucket-locations
        self.fs.create_bucket(
            self._gcp_project, bucket_name, location=location,
            object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS)

        self._wait_for_fs_sync()

    ### Running the job ###
    def cleanup(self, mode=None):
        super(DataprocJobRunner, self).cleanup(mode=mode)

        # stop the cluster if it belongs to us (it may have stopped on its
        # own already, but that's fine)
        if self._cluster_id and not self._opts['cluster_id']:
            self._cleanup_cluster()

    def _cleanup_cloud_tmp(self):
        # delete all the files we created
        if not self._job_tmpdir:
            return

        try:
            log.info('Removing all files in %s' % self._job_tmpdir)
            self.fs.rm(self._job_tmpdir)
            self._job_tmpdir = None
        except Exception as e:
            log.exception(e)

    # TODO - mtai @ davidmarin - Re-enable log support and supporting cleanup
    def _cleanup_logs(self):
        super(DataprocJobRunner, self)._cleanup_logs()

    def _cleanup_job(self):
        job_prefix = self._dataproc_job_prefix()
        for current_job in self._api_job_list(
                cluster_name=self._cluster_id, state_matcher='ACTIVE'):
            # Kill all active jobs with the same job_prefix as this job
            current_job_id = current_job['reference']['jobId']

            if not current_job_id.startswith(job_prefix):
                continue

            self._api_job_cancel(current_job_id)
            self._wait_for_api('job cancellation')

    def _cleanup_cluster(self):
        if not self._cluster_id:
            # If we don't have a cluster, then we can't terminate it.
            return

        try:
            log.info("Attempting to terminate cluster")
            self._api_cluster_delete(self._cluster_id)
        except Exception as e:
            log.exception(e)
            return
        log.info('cluster %s successfully terminated' % self._cluster_id)

    def _wait_for_api(self, msg):
        _wait_for(msg, self._opts['check_cluster_every'])

    def _wait_for_fs_sync(self):
        """Sleep for a little while, to give FS a chance to sync up.
        """
        _wait_for('GCS sync (eventual consistency)',
                  self._opts['cloud_fs_sync_secs'])

    def _build_dataproc_hadoop_job(self, step_num):
        """This function creates a "HadoopJob" to be passed to
        self._api_job_submit_hadoop

        :param step_num:
        :return: output_hadoop_job
        """
        # Reference: https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa

        args = list()
        file_uris = list()
        archive_uris = list()
        properties = dict()

        step = self._get_step(step_num)

        assert step['type'] in ('streaming', 'jar'), (
            'Bad step type: %r' % (step['type'],))

        # TODO - mtai @ davidmarin - Might be trivial to support jar running,
        # see "mainJarFileUri" of variable "output_hadoop_job" in this function
        #         https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa

        assert step['type'] == 'streaming', 'Jar not implemented'
        main_jar_uri = _HADOOP_STREAMING_JAR_URI

        # TODO - mtai @ davidmarin - Not clear if we should move _upload_args
        # to file_uris, currently works fine as-is

        # TODO - dmarin @ mtai - Probably a little safer to do the API's way,
        # assuming the API supports distributed cache syntax (so we can pick
        # the names of the uploaded files).
        args.extend(self._upload_args())

        args.extend(self._hadoop_args_for_step(step_num))

        mapper, combiner, reducer = self._hadoop_streaming_commands(step_num)

        if mapper:
            args += ['-mapper', mapper]

        if combiner:
            args += ['-combiner', combiner]

        if reducer:
            args += ['-reducer', reducer]

        for current_input_uri in self._step_input_uris(step_num):
            args += ['-input', current_input_uri]

        args += ['-output', self._step_output_uri(step_num)]

        # TODO - mtai @ davidmarin - Add back support to specify a different
        # mainJarFileURI
        output_hadoop_job = dict(
            args=args,
            fileUris=file_uris,
            archiveUris=archive_uris,
            properties=properties,
            mainJarFileUri=main_jar_uri
        )
        return output_hadoop_job
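
# For orientation: a made-up example of the HadoopJob payload shape this
# method returns for a single streaming step. Every URI and command here is
# illustrative, including the streaming-jar path.
example_hadoop_job = dict(
    args=[
        '-files', 'gs://mrjob-us-central1-abc123/tmp/job/files/mr_word_count.py',
        '-mapper', 'python mr_word_count.py --step-num=0 --mapper',
        '-reducer', 'python mr_word_count.py --step-num=0 --reducer',
        '-input', 'gs://mrjob-us-central1-abc123/tmp/job/files/input.txt',
        '-output', 'gs://mrjob-us-central1-abc123/tmp/job/output/',
    ],
    fileUris=[],
    archiveUris=[],
    properties={},
    mainJarFileUri='file:///usr/lib/hadoop-mapreduce/hadoop-streaming.jar',
)
print(example_hadoop_job['mainJarFileUri'])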

    def _launch_cluster(self):
        """Create an empty cluster on Dataproc, and set self._cluster_id to
        its ID."""
        bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
        self._create_fs_tmp_bucket(bucket_name)

        # clusterName must be a match of
        # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).'
        # as documented in an API error message
        # (not currently documented in the Dataproc docs)
        if not self._cluster_id:
            self._cluster_id = '-'.join(
                ['mrjob', self._gce_zone.lower(), random_identifier()])

        # Create the cluster if it's missing, otherwise join an existing one
        try:
            self._api_cluster_get(self._cluster_id)
            log.info('Adding job to existing cluster - %s' % self._cluster_id)
        except google_errors.HttpError as e:
            if not e.resp.status == 404:
                raise

            log.info(
                'Creating Dataproc Hadoop cluster - %s' % self._cluster_id)

            cluster_data = self._cluster_create_kwargs()

            self._api_cluster_create(cluster_data)

            self._wait_for_cluster_ready(self._cluster_id)

        # keep track of when we launched our job
        self._dataproc_job_start = time.time()
        return self._cluster_id

    def _wait_for_cluster_ready(self, cluster_id):
        # See https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters#State  # noqa
        cluster_state = None

        # Poll until cluster is ready
        while cluster_state not in _DATAPROC_CLUSTER_STATES_READY:
            result_describe = self.api_client.clusters().get(
                projectId=self._gcp_project,
                region=_DATAPROC_API_REGION,
                clusterName=cluster_id).execute()

            cluster_state = result_describe['status']['state']
            if cluster_state in _DATAPROC_CLUSTER_STATES_ERROR:
                raise DataprocException(result_describe)

            self._wait_for_api('cluster to accept jobs')

        assert cluster_state in _DATAPROC_CLUSTER_STATES_READY
        log.info("Cluster %s ready", cluster_id)

        return cluster_id

    def _dataproc_job_prefix(self):
        return _cleanse_gcp_job_id(self._job_key)

    def _run_steps(self):
        """Wait for every step of the job to complete, one by one."""
        total_steps = self._num_steps()
        # run each step in order, waiting for it to complete
        for step_num in range(total_steps):
            job_id = self._launch_step(step_num)

            self._wait_for_step_to_complete(
                job_id, step_num=step_num, num_steps=total_steps)

            log.info('Completed Dataproc Hadoop Job - %s', job_id)

        # After all steps completed, wait for the last output (which is
        # usually written to GCS) to sync
        self._wait_for_fs_sync()

    def _launch_step(self, step_num):
        # Build each step
        hadoop_job = self._build_dataproc_hadoop_job(step_num)

        # Clean-up step name
        step_name = '%s---step-%05d-of-%05d' % (
            self._dataproc_job_prefix(), step_num + 1, self._num_steps())

        # Submit it
        log.info('Submitting Dataproc Hadoop Job - %s', step_name)
        result = self._api_job_submit_hadoop(step_name, hadoop_job)
        log.info('Submitted Dataproc Hadoop Job - %s', step_name)

        job_id = result['reference']['jobId']
        assert job_id == step_name

        return job_id

    def _wait_for_step_to_complete(
            self, job_id, step_num=None, num_steps=None):
        """Helper for _wait_for_step_to_complete(). Wait for
        step with the given ID to complete, and fetch counters.
        If it fails, attempt to diagnose the error, and raise an
        exception.

        This also adds an item to self._log_interpretations
        """
        log_interpretation = dict(job_id=job_id)
        self._log_interpretations.append(log_interpretation)

        while True:
            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
            job_result = self._api_job_get(job_id)
            job_state = job_result['status']['state']

            log.info('%s => %s' % (job_id, job_state))

            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
            if job_state in _DATAPROC_JOB_STATES_ACTIVE:
                self._wait_for_api('job completion')
                continue

            # we're done, will return at the end of this
            elif job_state == 'DONE':
                break

            raise StepFailedException(step_num=step_num, num_steps=num_steps)

    def _default_step_output_dir(self):
        # put intermediate data in HDFS
        return 'hdfs:///tmp/mrjob/%s/step-output' % self._job_key

    def counters(self):
        # TODO - mtai @ davidmarin - Counters are currently always empty as we
        # are not processing task logs
        return [_pick_counters(log_interpretation)
                for log_interpretation in self._log_interpretations]

    ### Cluster Info ###

    def get_hadoop_version(self):
        if self._hadoop_version is None:
            self._store_cluster_info()
        return self._hadoop_version

    def get_image_version(self):
        """Get the version that our cluster is running.
        """
        if self._image_version is None:
            self._store_cluster_info()
        return self._image_version

    def _store_cluster_info(self):
        """Set self._image_version and self._hadoop_version."""
        if not self._cluster_id:
            raise AssertionError('cluster has not yet been created')

        cluster = self._api_cluster_get(self._cluster_id)
        self._image_version = (
            cluster['config']['softwareConfig']['imageVersion'])
        # protect against new versions, including patch versions
        # we didn't explicitly request. See #1428
        self._hadoop_version = map_version(
            self._image_version, _DATAPROC_IMAGE_TO_HADOOP_VERSION)

    ### Bootstrapping ###

    def _bootstrap_python(self):
        """Return a (possibly empty) list of parsed commands (in the same
        format as returned by parse_setup_cmd())."""
        if not self._opts['bootstrap_python']:
            return []

        if PY2:
            # Python 2 is already installed; install pip and dev packages
            return [
                ['sudo apt-get install -y python-pip python-dev'],
            ]
        else:
            return [
                ['sudo apt-get install -y python3 python3-pip python3-dev'],
            ]

    def get_cluster_id(self):
        return self._cluster_id

    def _cluster_create_kwargs(self):
        gcs_init_script_uris = []
        if self._master_bootstrap_script_path:
            gcs_init_script_uris.append(
                self._upload_mgr.uri(self._master_bootstrap_script_path))

            # add the idle termination script last, so that we don't count
            # bootstrapping as idle time
            gcs_init_script_uris.append(
                self._upload_mgr.uri(_MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH))

        # NOTE - Cluster initializationActions can only take scripts with no
        # script args, so the auto-term script receives 'mrjob-max-secs-idle'
        # via metadata instead of as an arg
        cluster_metadata = dict()
        cluster_metadata['mrjob-version'] = mrjob.__version__
        cluster_metadata['mrjob-max-secs-idle'] = str(int(
            self._opts['max_mins_idle'] * 60))

        cluster_config = dict(
            gceClusterConfig=dict(
                zoneUri=_gcp_zone_uri(
                    project=self._gcp_project, zone=self._gce_zone),
                serviceAccountScopes=_DEFAULT_GCE_SERVICE_ACCOUNT_SCOPES,
                metadata=cluster_metadata
            ),
            initializationActions=[
                dict(executableFile=init_script_uri)
                for init_script_uri in gcs_init_script_uris
            ]
        )

        # Master node
        master_conf = _gcp_instance_group_config(
            project=self._gcp_project, zone=self._gce_zone,
            count=1, instance_type=self._opts['master_instance_type']
        )

        # Compute + storage
        worker_conf = _gcp_instance_group_config(
            project=self._gcp_project, zone=self._gce_zone,
            count=self._opts['num_core_instances'],
            instance_type=self._opts['core_instance_type']
        )

        # Compute ONLY
        secondary_worker_conf = _gcp_instance_group_config(
            project=self._gcp_project, zone=self._gce_zone,
            count=self._opts['num_task_instances'],
            instance_type=self._opts['task_instance_type'],
            is_preemptible=True
        )

        cluster_config['masterConfig'] = master_conf
        cluster_config['workerConfig'] = worker_conf
        if self._opts['num_task_instances']:
            cluster_config['secondaryWorkerConfig'] = secondary_worker_conf

        # See - https://cloud.google.com/dataproc/dataproc-versions
        if self._opts['image_version']:
            cluster_config['softwareConfig'] = dict(
                imageVersion=self._opts['image_version'])

        kwargs = dict(projectId=self._gcp_project,
                      clusterName=self._cluster_id,
                      config=cluster_config)

        return self._add_extra_cluster_params(kwargs)
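
    # For reference, a sketch of the shape of the dict this method returns
    # (field names follow the calls above; the values here are hypothetical):
    #
    #   {
    #       'projectId': 'my-project',
    #       'clusterName': 'mrjob-us-central1-0123456789abcdef',
    #       'config': {
    #           'gceClusterConfig': {'zoneUri': ..., 'metadata': {...},
    #                                'serviceAccountScopes': [...]},
    #           'initializationActions': [{'executableFile': 'gs://...'}],
    #           'masterConfig': {...},
    #           'workerConfig': {...},
    #           # 'secondaryWorkerConfig' appears only if num_task_instances
    #           'softwareConfig': {'imageVersion': '1.0'},
    #       },
    #   }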

    ### Dataproc-specific Stuff ###

    def _api_cluster_get(self, cluster_id):
        return self.api_client.clusters().get(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            clusterName=cluster_id
        ).execute()

    def _api_cluster_create(self, cluster_data):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/create  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/get  # noqa
        return self.api_client.clusters().create(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            body=cluster_data
        ).execute()

    def _api_cluster_delete(self, cluster_id):
        return self.api_client.clusters().delete(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            clusterName=cluster_id
        ).execute()

    def _api_job_list(self, cluster_name=None, state_matcher=None):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/list#JobStateMatcher  # noqa
        list_kwargs = dict(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
        )
        if cluster_name:
            list_kwargs['clusterName'] = cluster_name

        if state_matcher:
            list_kwargs['jobStateMatcher'] = state_matcher

        list_request = self.api_client.jobs().list(**list_kwargs)
        while list_request:
            try:
                resp = list_request.execute()
            except google_errors.HttpError as e:
                if e.resp.status == 404:
                    return
                raise

            for current_item in resp['items']:
                yield current_item

            list_request = self.api_client.jobs().list_next(list_request, resp)

    def _api_job_get(self, job_id):
        return self.api_client.jobs().get(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            jobId=job_id
        ).execute()

    def _api_job_cancel(self, job_id):
        return self.api_client.jobs().cancel(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            jobId=job_id
        ).execute()

    def _api_job_delete(self, job_id):
        return self.api_client.jobs().delete(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            jobId=job_id
        ).execute()

    def _api_job_submit_hadoop(self, step_name, hadoop_job):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/submit  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobReference  # noqa
        job_data = dict(
            reference=dict(projectId=self._gcp_project, jobId=step_name),
            placement=dict(clusterName=self._cluster_id),
            hadoopJob=hadoop_job
        )

        jobs_submit_kwargs = dict(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            body=dict(job=job_data)
        )
        return self.api_client.jobs().submit(**jobs_submit_kwargs).execute()
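
For orientation, here is a sketch of the request body that
_api_job_submit_hadoop() assembles for jobs().submit(). Field names follow the
Dataproc v1 REST API references linked above; the project, cluster, job ID and
hadoopJob contents are hypothetical (the real values come from
self._gcp_project, self._cluster_id, _launch_step() and
_build_dataproc_hadoop_job()):

job_data = {
    'reference': {
        'projectId': 'my-project',
        'jobId': 'my-job-key---step-00001-of-00001',
    },
    'placement': {'clusterName': 'mrjob-us-central1-0123456789abcdef'},
    'hadoopJob': {
        'mainJarFileUri':
            'file:///usr/lib/hadoop-mapreduce/hadoop-streaming.jar',
        'args': ['-mapper', 'python my_job.py --step-num=0 --mapper',
                 '-input', 'hdfs:///tmp/input',
                 '-output', 'hdfs:///tmp/output'],
    },
}

request_body = {'job': job_data}  # what jobs().submit(body=...) receives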
Example #30
0
class DataprocJobRunner(HadoopInTheCloudJobRunner, LogInterpretationMixin):
    """Runs an :py:class:`~mrjob.job.MRJob` on Google Cloud Dataproc.
    Invoked when you run your job with ``-r dataproc``.

    :py:class:`DataprocJobRunner` runs your job in a Dataproc cluster, which
    is basically a temporary Hadoop cluster.

    Input, support, and jar files can be either local or on GCS; use
    ``gs://...`` URLs to refer to files on GCS.

    This class has some useful utilities for talking directly to GCS and
    Dataproc, so you may find it useful to instantiate it without a script::

        from mrjob.dataproc import DataprocJobRunner
        ...
    """
    alias = 'dataproc'

    OPT_NAMES = HadoopInTheCloudJobRunner.OPT_NAMES | {
        'cluster_properties',
        'core_instance_config',
        'gcloud_bin',
        'master_instance_config',
        'network',
        'project_id',
        'service_account',
        'service_account_scopes',
        'subnet',
        'task_instance_config',
    }

    # no Spark support yet (see #1765)
    _STEP_TYPES = {'jar', 'streaming'}

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same
        arguments as
        :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(DataprocJobRunner, self).__init__(**kwargs)

        # check for library support
        if google is None:
            raise ImportError(
                'You must install google-cloud-logging and '
                'google-cloud-storage to connect to Dataproc')

        # Dataproc requires a master and >= 2 core instances.
        # num_core_instances counts ONLY the core instances and does NOT
        # include the required master, so the minimum cluster size is
        # 3 machines: 1 master plus 2 "num_core_instances" workers.
        if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS:
            raise DataprocException(
                'Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS)

        if (self._opts['core_instance_type'] !=
                self._opts['task_instance_type']):
            raise DataprocException(
                'Dataproc v1 expects core/task instance types to be identical')

        # see #1820
        if self._opts['image_id']:
            log.warning('mrjob does not yet support custom machine images'
                        ' on Dataproc')

        # load credentials and project ID
        self._credentials, auth_project_id = google.auth.default(
            scopes=[_FULL_SCOPE])  # needed for $GOOGLE_APPLICATION_CREDENTIALS

        self._project_id = self._opts['project_id'] or auth_project_id

        if not self._project_id:
            raise DataprocException(
                'project_id must be set. Use --project_id or'
                ' set $GOOGLE_CLOUD_PROJECT')

        self._fix_zone_and_region_opts()

        if self._opts['service_account_scopes']:
            self._opts['service_account_scopes'] = [
                _fully_qualify_scope_uri(s)
                for s in self._opts['service_account_scopes']
            ]

        # cluster_id can be None here
        self._cluster_id = self._opts['cluster_id']

        self._api_client = None
        self._gcs_fs = None
        self._fs = None

        # BEGIN - setup directories
        base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir'])

        self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir)

        # use job key to make a unique tmp dir
        self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/'

        # pick/validate output dir
        if self._output_dir:
            self._output_dir = _check_and_fix_fs_dir(self._output_dir)
        else:
            self._output_dir = self._job_tmpdir + 'output/'
        # END - setup directories

        # manage local files that we want to upload to GCS. We'll add them
        # to this manager just before we need them.
        fs_files_dir = self._job_tmpdir + 'files/'
        self._upload_mgr = UploadDirManager(fs_files_dir)

        # when did our particular task start?
        self._dataproc_job_start = None

        # init hadoop, ami version caches
        self._image_version = None
        self._hadoop_version = None

        # map driver_output_uri to a dict with the keys:
        # log_uri: uri of file we're reading from
        # pos: position in file
        # buffer: bytes read but not yet parsed into complete lines
        self._driver_output_state = {}

        # This will be filled by _run_steps()
        # NOTE - each log interpretation will contain only job_id until we
        # parse task logs
        self._log_interpretations = []

    def _fix_zone_and_region_opts(self):
        """Ensure that exactly one of region and zone is set."""
        if self._opts['region'] and self._opts['zone']:
            log.warning('you do not need to set region if you set zone')
            self._opts['region'] = None
            return

        if not (self._opts['region'] or self._opts['zone']):
            if environ.get('CLOUDSDK_COMPUTE_ZONE'):
                self._opts['zone'] = environ['CLOUDSDK_COMPUTE_ZONE']
            elif environ.get('CLOUDSDK_COMPUTE_REGION'):
                self._opts['region'] = environ['CLOUDSDK_COMPUTE_REGION']
            else:
                self._opts['region'] = _DEFAULT_GCE_REGION

    def _default_opts(self):
        return combine_dicts(
            super(DataprocJobRunner, self)._default_opts(),
            dict(
                bootstrap_python=True,
                check_cluster_every=_DEFAULT_CHECK_CLUSTER_EVERY,
                cleanup=['CLUSTER', 'JOB', 'LOCAL_TMP'],
                cloud_fs_sync_secs=_DEFAULT_CLOUD_FS_SYNC_SECS,
                gcloud_bin=['gcloud'],
                image_version=_DEFAULT_IMAGE_VERSION,
                instance_type=_DEFAULT_INSTANCE_TYPE,
                master_instance_type=_DEFAULT_INSTANCE_TYPE,
                num_core_instances=_DATAPROC_MIN_WORKERS,
                num_task_instances=0,
            )
        )
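
    # Any of these defaults (or any other option in OPT_NAMES) can also be set
    # in mrjob.conf. A minimal, hypothetical example:
    #
    #   runners:
    #     dataproc:
    #       project_id: my-project
    #       region: us-west1
    #       num_core_instances: 4
    #       max_mins_idle: 30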

    def _combine_opts(self, opt_list):
        """Blank out conflicts between *network*/*subnet* and
        *region*/*zone*."""
        opt_list = _blank_out_conflicting_opts(opt_list, ['region', 'zone'])
        opt_list = _blank_out_conflicting_opts(opt_list, ['network', 'subnet'])

        # now combine opts, with region/zone blanked out
        return super(DataprocJobRunner, self)._combine_opts(opt_list)

    @property
    def cluster_client(self):
        return google.cloud.dataproc_v1beta2.ClusterControllerClient(
            **self._client_create_kwargs())

    @property
    def job_client(self):
        return google.cloud.dataproc_v1beta2.JobControllerClient(
            **self._client_create_kwargs())

    @property
    def logging_client(self):
        return google.cloud.logging.Client(credentials=self._credentials,
                                           project=self._project_id)

    def _client_create_kwargs(self):
        if self._opts['region']:
            endpoint = '%s-%s' % (self._opts['region'], _DEFAULT_ENDPOINT)
            return dict(
                channel=google.api_core.grpc_helpers.create_channel(
                    endpoint, credentials=self._credentials))
        else:
            return dict(credentials=self._credentials)

    @property
    def api_client(self):
        raise NotImplementedError(
            '"api_client" was disabled in v0.6.2. Use "cluster_client"'
            ' or "job_client" instead.')

    @property
    def fs(self):
        """:py:class:`~mrjob.fs.base.Filesystem` object for SSH, S3, GCS, and
        the local filesystem.
        """
        if self._fs is not None:
            return self._fs

        self._gcs_fs = GCSFilesystem(
            credentials=self._credentials,
            local_tmp_dir=self._get_local_tmp_dir(),
            project_id=self._project_id,
        )

        self._fs = CompositeFilesystem(self._gcs_fs, LocalFilesystem())
        return self._fs

    def _fs_chunk_size(self):
        """Chunk size for cloud storage Blob objects. Currently
        only used for uploading."""
        if self._opts['cloud_part_size_mb']:
            return int(self._opts['cloud_part_size_mb'] * 1024 * 1024)
        else:
            return None

    def _get_tmpdir(self, given_tmpdir):
        """Helper for _fix_tmpdir"""
        if given_tmpdir:
            return given_tmpdir

        # Loop over buckets until we find one that matches region
        # NOTE - because this is a tmpdir, we look for a GCS bucket in the
        # same GCE region
        chosen_bucket_name = None

        # determine region for bucket
        region = self._region()

        for tmp_bucket_name in self.fs.get_all_bucket_names(prefix='mrjob-'):
            tmp_bucket = self.fs.get_bucket(tmp_bucket_name)

            # NOTE - ambiguous GCP behavior: bucket locations are returned
            # in UPPERCASE even though the docs suggest lowercase. Ticket
            # filed Apr 23, 2016; still true as of Feb 12, 2018 (observed
            # with google-cloud-sdk).
            if tmp_bucket.location.lower() == region:
                # Regions are both specified and match
                log.info("using existing temp bucket %s" % tmp_bucket_name)
                chosen_bucket_name = tmp_bucket_name
                break

        # Example default - "mrjob-us-central1-RANDOMHEX"
        if not chosen_bucket_name:
            chosen_bucket_name = '-'.join(
                ['mrjob', region, random_identifier()])

        return 'gs://%s/tmp/' % chosen_bucket_name

    def _region(self):
        # region of cluster, which is either the region set by the user,
        # or the region derived from the zone they set.
        # used to pick bucket location and name cluster
        return self._opts['region'] or _zone_to_region(self._opts['zone'])

    def _run(self):
        self._launch()
        self._run_steps()

    def _launch(self):
        self._prepare_for_launch()
        self._launch_cluster()

    def _prepare_for_launch(self):
        self._check_output_not_exists()
        self._create_setup_wrapper_scripts()
        self._add_bootstrap_files_for_upload()
        self._add_job_files_for_upload()
        self._upload_local_files_to_fs()

    def _check_output_not_exists(self):
        """Verify the output path does not already exist. This avoids
        provisioning a cluster only to have Hadoop refuse to launch.
        """
        if self.fs.exists(self._output_dir):
            raise IOError(
                'Output path %s already exists!' % (self._output_dir,))

    def _add_bootstrap_files_for_upload(self):
        """Add files needed by the bootstrap script to self._upload_mgr.

        Tar up mrjob if bootstrap_mrjob is True.

        Create the master bootstrap script if necessary.

        """
        # lazily create mrjob.zip
        if self._bootstrap_mrjob():
            self._create_mrjob_zip()
            self._bootstrap_dir_mgr.add('file', self._mrjob_zip_path)

        # all other files needed by the script are already in
        # _bootstrap_dir_mgr
        for path in self._bootstrap_dir_mgr.paths():
            self._upload_mgr.add(path)

        # now that we know where the above files live, we can create
        # the master bootstrap script
        self._create_master_bootstrap_script_if_needed()
        if self._master_bootstrap_script_path:
            self._upload_mgr.add(self._master_bootstrap_script_path)

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

        if self._opts['hadoop_streaming_jar']:
            self._upload_mgr.add(self._opts['hadoop_streaming_jar'])

        for step in self._get_steps():
            if step.get('jar'):
                self._upload_mgr.add(step['jar'])

    def _upload_local_files_to_fs(self):
        """Copy local files tracked by self._upload_mgr to FS."""
        bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
        self._create_fs_tmp_bucket(bucket_name)

        log.info('Copying non-input files into %s' % self._upload_mgr.prefix)

        for path, gcs_uri in self._upload_mgr.path_to_uri().items():
            log.debug('uploading %s -> %s' % (path, gcs_uri))

            self.fs.put(path, gcs_uri, chunk_size=self._fs_chunk_size())

        self._wait_for_fs_sync()

    def _create_fs_tmp_bucket(self, bucket_name, location=None):
        """Create a temp bucket if missing

        Tie the temporary bucket to the same region as the GCE job and set a
        28-day TTL
        """
        # Return early if our bucket already exists
        try:
            self.fs.get_bucket(bucket_name)
            return
        except google.api_core.exceptions.NotFound:
            pass

        log.info('creating FS bucket %r' % bucket_name)

        location = location or self._opts['region'] or _zone_to_region(
            self._opts['zone'])

        # NOTE - By default, we create a bucket in the same GCE region as our
        # job (tmp buckets ONLY)
        # https://cloud.google.com/storage/docs/bucket-locations
        self.fs.create_bucket(
            bucket_name, location=location,
            object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS)

        self._wait_for_fs_sync()

    ### Running the job ###

    def cleanup(self, mode=None):
        super(DataprocJobRunner, self).cleanup(mode=mode)

        # close our SSH tunnel, if any
        self._kill_ssh_tunnel()

        # stop the cluster if it belongs to us (it may have stopped on its
        # own already, but that's fine)
        if self._cluster_id and not self._opts['cluster_id']:
            self._cleanup_cluster()

    def _cleanup_cloud_tmp(self):
        # delete all the files we created
        if not self._job_tmpdir:
            return

        try:
            log.info('Removing all files in %s' % self._job_tmpdir)
            self.fs.rm(self._job_tmpdir)
            self._job_tmpdir = None
        except Exception as e:
            log.exception(e)

    # TODO - mtai @ davidmarin - Re-enable log support and supporting cleanup
    def _cleanup_logs(self):
        super(DataprocJobRunner, self)._cleanup_logs()

    def _cleanup_job(self):
        job_prefix = self._dataproc_job_prefix()
        for job in self._list_jobs(
                cluster_name=self._cluster_id,
                state_matcher=_STATE_MATCHER_ACTIVE):
            # Kill all active jobs with the same job_prefix as this job
            job_id = job.reference.job_id

            if not job_id.startswith(job_prefix):
                continue

            self._cancel_job(job_id)
            self._wait_for_api('job cancellation')

    def _cleanup_cluster(self):
        if not self._cluster_id:
            # If we don't have a cluster, then we can't terminate it.
            return

        try:
            log.info("Attempting to terminate cluster")
            self._delete_cluster(self._cluster_id)
        except Exception as e:
            log.exception(e)
            return
        log.info('cluster %s successfully terminated' % self._cluster_id)

    def _wait_for_api(self, msg):
        _wait_for(msg, self._opts['check_cluster_every'])

    def _wait_for_fs_sync(self):
        """Sleep for a little while, to give FS a chance to sync up.
        """
        _wait_for('GCS sync (eventual consistency)',
                  self._opts['cloud_fs_sync_secs'])

    def _streaming_step_job_kwarg(self, step_num):
        """Returns a map from ``'hadoop_job'`` to a dict representing
        a hadoop streaming job.
        """
        return dict(
            hadoop_job=dict(
                args=self._hadoop_streaming_jar_args(step_num),
                main_jar_file_uri=self._hadoop_streaming_jar_uri(),
            )
        )
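
    # _submit_job() below splices the dict above into the job body via
    # **job_kwarg, roughly (a sketch; the values on the right are
    # hypothetical):
    #
    #   job=dict(
    #       reference=dict(project_id='my-project', job_id=step_name),
    #       placement=dict(cluster_name='mrjob-us-west1-0123456789abcdef'),
    #       hadoop_job=dict(args=[...], main_jar_file_uri=...),
    #   )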

    def _jar_step_job_kwarg(self, step_num):
        """Returns a map from ``'hadoop_job'`` to a dict representing
        a Hadoop job that runs a JAR"""
        step = self._get_step(step_num)

        hadoop_job = {}

        hadoop_job['args'] = (
            self._interpolate_step_args(step['args'], step_num))

        jar_uri = self._upload_mgr.uri(step['jar'])

        # can't specify main_class and main_jar_file_uri; see
        # https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa
        if step.get('main_class'):
            hadoop_job['jar_file_uris'] = [jar_uri]
            hadoop_job['main_class'] = step['main_class']
        else:
            hadoop_job['main_jar_file_uri'] = jar_uri

        return dict(hadoop_job=hadoop_job)

    def _hadoop_streaming_jar_uri(self):
        if self._opts['hadoop_streaming_jar']:
            return self._upload_mgr.uri(self._opts['hadoop_streaming_jar'])
        else:
            return _HADOOP_STREAMING_JAR_URI

    def _launch_cluster(self):
        """Create an empty cluster on Dataproc, and set self._cluster_id to
        its ID."""
        bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
        self._create_fs_tmp_bucket(bucket_name)

        # clusterName must match the regex
        # '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).'
        # as documented in an API error message
        # (not currently documented in the Dataproc docs)
        if not self._cluster_id:
            self._cluster_id = '-'.join(
                ['mrjob', self._region(), random_identifier()])

        # Create the cluster if it's missing, otherwise join an existing one
        try:
            self._get_cluster(self._cluster_id)
            log.info('Adding job to existing cluster - %s' % self._cluster_id)
        except google.api_core.exceptions.NotFound:
            log.info(
                'Creating Dataproc Hadoop cluster - %s' % self._cluster_id)

            cluster_data = self._cluster_create_kwargs()
            self._create_cluster(cluster_data)

            self._wait_for_cluster_ready(self._cluster_id)

        self._set_up_ssh_tunnel()

        # keep track of when we launched our job
        self._dataproc_job_start = time.time()
        return self._cluster_id

    def _wait_for_cluster_ready(self, cluster_id):
        # See https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters#State  # noqa
        cluster_state = None

        # Poll until cluster is ready
        while cluster_state not in ('RUNNING', 'UPDATING'):
            cluster = self._get_cluster(cluster_id)
            cluster_state = cluster.status.State.Name(cluster.status.state)

            if cluster_state in ('ERROR', 'DELETING'):
                raise DataprocException(cluster)

            self._wait_for_api('cluster to accept jobs')

        return cluster_id

    def _dataproc_job_prefix(self):
        return _cleanse_gcp_job_id(self._job_key)

    def _run_steps(self):
        """Wait for every step of the job to complete, one by one."""
        total_steps = self._num_steps()
        # launch and wait for each of our steps, one at a time
        for step_num in range(total_steps):
            job_id = self._launch_step(step_num)

            self._wait_for_step_to_complete(
                job_id, step_num=step_num, num_steps=total_steps)

            log.info('Completed Dataproc Hadoop Job - %s', job_id)

        # After all steps have completed, wait for the last output (which is
        # usually written to GCS) to sync
        self._wait_for_fs_sync()

    def _launch_step(self, step_num):
        step = self._get_step(step_num)

        # Clean-up step name
        step_name = '%s---step-%05d-of-%05d' % (
            self._dataproc_job_prefix(), step_num + 1, self._num_steps())

        # Build step

        # job_kwarg is a single-item dict, where the key is 'hadoop_job',
        # 'spark_job', etc.
        if step['type'] == 'streaming':
            job_kwarg = self._streaming_step_job_kwarg(step_num)
        elif step['type'] == 'jar':
            job_kwarg = self._jar_step_job_kwarg(step_num)
        else:
            raise NotImplementedError(
                'Unsupported step type: %r' % step['type'])

        # Submit it
        log.info('Submitting Dataproc Hadoop Job - %s', step_name)
        result = self._submit_job(step_name, job_kwarg)
        log.info('Submitted Dataproc Hadoop Job - %s', step_name)

        job_id = result.reference.job_id
        assert job_id == step_name

        return job_id

    def _wait_for_step_to_complete(self, job_id, step_num, num_steps):
        """Helper for _wait_for_step_to_complete(). Wait for
        step with the given ID to complete, and fetch counters.
        If it fails, attempt to diagnose the error, and raise an
        exception.

        This also adds an item to self._log_interpretations
        """
        log_interpretation = dict(job_id=job_id)
        self._log_interpretations.append(log_interpretation)

        log_interpretation['step'] = {}
        step_type = self._get_step(step_num)['type']

        while True:
            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
            job = self._get_job(job_id)

            job_state = job.status.State.Name(job.status.state)

            log.info('%s => %s' % (job_id, job_state))

            log_interpretation['step']['driver_output_uri'] = (
                job.driver_output_resource_uri)

            self._interpret_step_logs(log_interpretation, step_type)

            progress = log_interpretation['step'].get('progress')
            if progress:
                log.info(' ' + progress['message'])

            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
            # these are the states covered by the ACTIVE job state matcher,
            # plus SETUP_DONE
            if job_state in ('PENDING', 'RUNNING',
                             'CANCEL_PENDING', 'SETUP_DONE'):
                self._wait_for_api('job completion')
                continue

            # print counters if job wasn't CANCELLED
            if job_state != 'CANCELLED':
                self._log_counters(log_interpretation, step_num)

            if job_state == 'ERROR':
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n\n' %
                              _format_error(error))

            # we're done, will return at the end of this
            if job_state == 'DONE':
                break
            else:
                raise StepFailedException(
                    step_num=step_num, num_steps=num_steps)

    def _default_step_output_dir(self):
        # put intermediate data in HDFS
        return 'hdfs:///tmp/mrjob/%s/step-output' % self._job_key

    ### log intepretation ###

    # step

    def _interpret_step_logs(self, log_interpretation, step_type):
        """Hook for interpreting step logs.

        Unlike with most runners, you may call this multiple times and it
        will continue to parse the step log incrementally, which is useful
        for getting job progress."""
        # don't turn this off even if read_logs opt is false; it's
        # the only way this runner can track job progress

        driver_output_uri = log_interpretation.get(
            'step', {}).get('driver_output_uri')

        if driver_output_uri:
            self._update_step_interpretation(
                log_interpretation['step'], driver_output_uri)

    def _update_step_interpretation(
            self, step_interpretation, driver_output_uri):
        new_lines = self._get_new_driver_output_lines(driver_output_uri)
        _interpret_new_dataproc_step_stderr(step_interpretation, new_lines)

    def _get_new_driver_output_lines(self, driver_output_uri):
        """Get a list of complete job driver output lines that are
        new since the last time we checked.
        """
        state = self._driver_output_state.setdefault(
            driver_output_uri,
            dict(log_uri=None, pos=0, buffer=b''))

        # driver output is in logs with names like driveroutput.000000000
        log_uris = sorted(self.fs.ls(driver_output_uri + '*'))

        for log_uri in log_uris:
            # initialize log_uri with first URI we see
            if state['log_uri'] is None:
                # log the location of job driver output just once
                log.info(
                    '  Parsing job driver output from %s*' % driver_output_uri)
                state['log_uri'] = log_uri

            # skip log files already parsed
            if log_uri < state['log_uri']:
                continue

            # when parsing the next file, reset *pos*
            elif log_uri > state['log_uri']:
                state['pos'] = 0
                state['log_uri'] = log_uri

            log_blob = self.fs._get_blob(log_uri)

            try:
                new_data = log_blob.download_as_string(start=state['pos'])
            except (google.api_core.exceptions.NotFound,
                    google.api_core.exceptions.RequestRangeNotSatisfiable):
                # blob was just created, or no more data is available
                break

            state['buffer'] += new_data
            state['pos'] += len(new_data)

        # convert buffer into lines, saving leftovers for next time
        stream = BytesIO(state['buffer'])
        state['buffer'] = b''

        lines = []

        for line_bytes in stream:
            if line_bytes.endswith(b'\n'):
                lines.append(to_unicode(line_bytes))
            else:
                # leave final partial line (if any) in buffer
                state['buffer'] = line_bytes

        return lines

    # history

    def _interpret_history_log(self, log_interpretation):
        """Does nothing. We can't get the history logs, and we don't need
        them."""
        if not self._read_logs():
            return

        log_interpretation.setdefault('history', {})

    # task

    def _interpret_task_logs(self, log_interpretation, step_type,
                             error_attempt_ids=(), partial=True):
        """Scan node manager log to find failed container IDs of failed
        tasks, and then scan the corresponding stderr and syslogs."""
        if 'task' in log_interpretation and (
                partial or not log_interpretation['task'].get('partial')):
            return   # already interpreted

        if not self._read_logs():
            return

        step_interpretation = log_interpretation.get('step') or {}

        application_id = step_interpretation.get('application_id')
        if not application_id:
            log.warning(
                "Can't parse node manager logs; missing application ID")
            return

        log_interpretation['task'] = self._task_log_interpretation(
            application_id, step_type, partial)

    def _task_log_interpretation(
            self, application_id, step_type, partial=True):
        """Helper for :py:meth:`_interpret_task_logs`"""
        # not bothering with _read_logs() since this is a helper method
        result = {}

        for container_id in self._failed_task_container_ids(application_id):
            error = _parse_task_syslog_records(
                self._task_syslog_records(
                    application_id, container_id, step_type))

            if not error.get('hadoop_error'):
                # not sure if this ever happens, since we already know
                # which containers failed
                continue

            error['container_id'] = container_id

            # fix weird munging of java stacktrace
            error['hadoop_error']['message'] = _fix_java_stack_trace(
                error['hadoop_error']['message'])

            task_error = _parse_task_stderr(
                self._task_stderr_lines(
                    application_id, container_id, step_type))

            if task_error:
                task_error['message'] = _fix_traceback(task_error['message'])
                error['task_error'] = task_error

            result.setdefault('errors', []).append(error)

            # if partial is true, bail out when we find the first task error
            if task_error and partial:
                result['partial'] = True
                return result

        return result

    def _failed_task_container_ids(self, application_id):
        """Stream container IDs of failed tasks, in reverse order."""
        container_id_prefix = 'container' + application_id[11:]

        log_filter = self._make_log_filter(
            'yarn-yarn-nodemanager',
            {'jsonPayload.class': _CONTAINER_EXECUTOR_CLASS_NAME})

        log.info('Scanning node manager logs for IDs of failed tasks...')

        # it doesn't seem to work to do self.logging_client.logger();
        # there's some RPC dispute about whether the log name should
        # be qualified by project name or not
        entries = self.logging_client.list_entries(
            filter_=log_filter, order_by=google.cloud.logging.DESCENDING)

        for entry in entries:
            message = entry.payload.get('message')
            if not message:
                continue

            m = _CONTAINER_EXIT_RE.match(message)
            if not m:
                continue

            returncode = int(m.group('returncode'))
            if not returncode:
                continue

            container_id = m.group('container_id')
            # matches some other step
            if not container_id.startswith(container_id_prefix):
                continue

            log.debug('  %s' % container_id)
            yield container_id

    def _task_stderr_lines(self, application_id, container_id, step_type):
        """Yield lines from a specific stderr log."""
        log_filter = self._make_log_filter(
            'yarn-userlogs', {
                'jsonPayload.application': application_id,
                'jsonPayload.container': container_id,
                # TODO: pick based on step_type
                'jsonPayload.container_logname': 'stderr',
            })

        log.info('    reading stderr log...')
        entries = self.logging_client.list_entries(filter_=log_filter)

        # use log4j parsing to handle tab -> newline conversion
        for record in _log_entries_to_log4j(entries):
            for line in record['message'].split('\n'):
                yield line

    def _task_syslog_records(self, application_id, container_id, step_type):
        """Yield log4j records from a specific syslog.
        """
        log_filter = self._make_log_filter(
            'yarn-userlogs', {
                'jsonPayload.application': application_id,
                'jsonPayload.container': container_id,
                # TODO: pick based on step_type
                'jsonPayload.container_logname': 'syslog',
            })

        log.info('    reading syslog...')
        entries = self.logging_client.list_entries(filter_=log_filter)

        return _log_entries_to_log4j(entries)

    # misc

    def _make_log_filter(self, log_name=None, extra_values=None):
        # we only want logs from this project, cluster, and region
        d = {}

        d['resource.labels.cluster_name'] = self._cluster_id
        d['resource.labels.project_id'] = self._project_id
        d['resource.labels.region'] = self._region()
        d['resource.type'] = 'cloud_dataproc_cluster'

        if log_name:
            d['logName'] = 'projects/%s/logs/%s' % (
                self._project_id, log_name)

        if extra_values:
            d.update(extra_values)

        return _log_filter_str(d)

    def counters(self):
        return [_pick_counters(log_interpretation)
                for log_interpretation in self._log_interpretations]

    ### Cluster Info ###

    def get_hadoop_version(self):
        if self._hadoop_version is None:
            self._store_cluster_info()
        return self._hadoop_version

    def get_image_version(self):
        """Get the version that our cluster is running.
        """
        if self._image_version is None:
            self._store_cluster_info()
        return self._image_version

    def _store_cluster_info(self):
        """Set self._image_version and self._hadoop_version."""
        if not self._cluster_id:
            raise ValueError('cluster has not yet been created')

        cluster = self._get_cluster(self._cluster_id)
        self._image_version = (
            cluster.config.software_config.image_version)
        # protect against new versions, including patch versions
        # we didn't explicitly request. See #1428
        self._hadoop_version = map_version(
            self._image_version, _DATAPROC_IMAGE_TO_HADOOP_VERSION)

    def _bootstrap_pre_commands(self):
        # don't run the bootstrap script in / (see #1601)
        return [
            'mkdir /tmp/mrjob',
            'cd /tmp/mrjob',
        ]

    ### Bootstrapping ###

    def _bootstrap_python(self):
        """Return a (possibly empty) list of parsed commands (in the same
        format as returned by parse_setup_cmd())."""
        if not self._opts['bootstrap_python']:
            return []

        if PY2:
            # Python 2 is already installed; install pip and dev packages
            return [
                ['sudo apt-get install -y python-pip python-dev'],
            ]
        else:
            return [
                ['sudo apt-get install -y python3 python3-pip python3-dev'],
            ]

    def get_cluster_id(self):
        return self._cluster_id

    def _cluster_create_kwargs(self):
        gcs_init_script_uris = []
        if self._master_bootstrap_script_path:
            gcs_init_script_uris.append(
                self._upload_mgr.uri(self._master_bootstrap_script_path))

        cluster_metadata = dict()
        cluster_metadata['mrjob-version'] = mrjob.__version__

        # TODO: remove mrjob-max-secs-idle once lifecycle_config is visible
        # through the gcloud utility and the Google Cloud Console
        cluster_metadata['mrjob-max-secs-idle'] = str(int(
            self._opts['max_mins_idle'] * 60))

        gce_cluster_config = dict(
            metadata=cluster_metadata,
            service_account_scopes=self._opts['service_account_scopes'],
        )

        if self._opts['network']:
            gce_cluster_config['network_uri'] = self._opts['network']

        if self._opts['subnet']:
            gce_cluster_config['subnetwork_uri'] = self._opts['subnet']

        if self._opts['service_account']:
            gce_cluster_config['service_account'] = (
                self._opts['service_account'])

        if self._opts['service_account_scopes']:
            gce_cluster_config['service_account_scopes'] = (
                self._opts['service_account_scopes'])

        if self._opts['zone']:
            gce_cluster_config['zone_uri'] = _gcp_zone_uri(
                project=self._project_id, zone=self._opts['zone'])

        cluster_config = dict(
            gce_cluster_config=gce_cluster_config,
            initialization_actions=[
                dict(executable_file=init_script_uri)
                for init_script_uri in gcs_init_script_uris
            ]
        )

        # Master node
        master_conf = _gcp_instance_group_config(
            project=self._project_id, zone=self._opts['zone'],
            count=1, instance_type=self._opts['master_instance_type'],
        )
        if self._opts['master_instance_config']:
            master_conf.update(self._opts['master_instance_config'])

        # Compute + storage
        worker_conf = _gcp_instance_group_config(
            project=self._project_id, zone=self._opts['zone'],
            count=self._opts['num_core_instances'],
            instance_type=self._opts['core_instance_type']
        )
        if self._opts['core_instance_config']:
            worker_conf.update(self._opts['core_instance_config'])

        # Compute ONLY
        secondary_worker_conf = _gcp_instance_group_config(
            project=self._project_id, zone=self._opts['zone'],
            count=self._opts['num_task_instances'],
            instance_type=self._opts['task_instance_type'],
            is_preemptible=True
        )
        if self._opts['task_instance_config']:
            secondary_worker_conf.update(self._opts['task_instance_config'])

        cluster_config['master_config'] = master_conf
        cluster_config['worker_config'] = worker_conf
        if secondary_worker_conf.get('num_instances'):
            cluster_config['secondary_worker_config'] = secondary_worker_conf

        cluster_config['lifecycle_config'] = dict(
            idle_delete_ttl=dict(
                seconds=int(self._opts['max_mins_idle'] * 60)))

        software_config = {}

        if self._opts['cluster_properties']:
            software_config['properties'] = _values_to_text(
                self._opts['cluster_properties'])

        # See - https://cloud.google.com/dataproc/dataproc-versions
        if self._opts['image_version']:
            software_config['image_version'] = self._opts['image_version']

        if software_config:
            cluster_config['software_config'] = software_config

        # in Python 2, dict keys loaded from JSON will be unicode, which
        # the Google protobuf objects don't like
        if PY2:
            cluster_config = _clean_json_dict_keys(cluster_config)

        kwargs = dict(project_id=self._project_id,
                      cluster_name=self._cluster_id,
                      config=cluster_config)

        return self._add_extra_cluster_params(kwargs)

    ### Dataproc-specific Stuff ###

    def _get_cluster(self, cluster_id):
        return self.cluster_client.get_cluster(
            cluster_name=cluster_id,
            **self._project_id_and_region()
        )

    def _create_cluster(self, cluster_data):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/create  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/get  # noqa

        self.cluster_client.create_cluster(
            cluster=cluster_data,
            **self._project_id_and_region()
        )

    def _delete_cluster(self, cluster_id):
        return self.cluster_client.delete_cluster(
            cluster_name=cluster_id,
            **self._project_id_and_region()
        )

    def _list_jobs(self, cluster_name=None, state_matcher=None):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/list#JobStateMatcher  # noqa
        list_kwargs = self._project_id_and_region()

        if cluster_name:
            list_kwargs['cluster_name'] = cluster_name

        if state_matcher:
            list_kwargs['job_state_matcher'] = state_matcher

        return self.job_client.list_jobs(**list_kwargs)

    def _get_job(self, job_id):
        return self.job_client.get_job(
            job_id=job_id,
            **self._project_id_and_region()
        )

    def _cancel_job(self, job_id):
        return self.job_client.cancel_job(
            job_id=job_id,
            **self._project_id_and_region()
        )

    def _submit_job(self, step_name, job_kwarg):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/submit  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobReference  # noqa

        submit_job_kwargs = dict(
            job=dict(
                reference=dict(project_id=self._project_id, job_id=step_name),
                placement=dict(cluster_name=self._cluster_id),
                **job_kwarg
            ),
            **self._project_id_and_region()
        )

        log.debug('  submit_job(%s)' % ', '.join(
            '%s=%r' % (k, v) for k, v in sorted(submit_job_kwargs.items())))

        return self.job_client.submit_job(**submit_job_kwargs)

    def _project_id_and_region(self):
        return dict(
            project_id=self._project_id,
            region=(self._opts['region'] or 'global'),
        )

    def _manifest_download_commands(self):
        return [
            # TODO: SSH in and figure out how to use gsutil or similar
            # ('gs://*', 'gsutil cp'),
            ('*://*', 'hadoop fs -copyToLocal'),
        ]

    ### SSH hooks ###

    def _job_tracker_host(self):
        return '%s-m' % self._cluster_id

    def _ssh_tunnel_config(self):
        return _SSH_TUNNEL_CONFIG

    def _launch_ssh_proc(self, args):
        ssh_proc = super(DataprocJobRunner, self)._launch_ssh_proc(args)

        # enter an empty passphrase if creating a key for the first time
        ssh_proc.stdin.write(b'\n\n')

        return ssh_proc

    def _ssh_launch_wait_secs(self):
        """Wait 20 seconds because gcloud has to update project metadata
        (unless we were going to check the cluster sooner anyway)."""
        return min(20.0, self._opts['check_cluster_every'])

    def _ssh_tunnel_args(self, bind_port):
        if not self._opts['gcloud_bin']:
            self._give_up_on_ssh_tunnel = True
            return None

        if not self._cluster_id:
            return

        cluster = self._get_cluster(self._cluster_id)
        zone = cluster.config.gce_cluster_config.zone_uri.split('/')[-1]

        return self._opts['gcloud_bin'] + [
            'compute', 'ssh',
            '--zone', zone,
            self._job_tracker_host(),
            '--',
        ] + self._ssh_tunnel_opts(bind_port)
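
As the class docstring above suggests, this runner can be instantiated without
a script in order to talk to GCS and Dataproc directly. A minimal sketch,
assuming Google credentials and a default project are already configured; the
option values and bucket name below are hypothetical:

from mrjob.dataproc import DataprocJobRunner

# any option named in OPT_NAMES (or inherited from the base runner) can be
# passed as a keyword argument
runner = DataprocJobRunner(project_id='my-project', region='us-west1')

# the composite GCS + local filesystem exposed by the `fs` property
for uri in runner.fs.ls('gs://my-bucket/tmp/'):
    print(uri)

print(runner.get_cluster_id())  # None until a cluster is created or joined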
Example #31
0
class HadoopJobRunner(MRJobRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """
    alias = 'hadoop'

    OPTION_STORE_CLASS = HadoopRunnerOptionStore

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hdfs_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(self._opts['hdfs_scratch_dir'], self._job_name))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hdfs_tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or posixpath.join(self._hdfs_tmp_dir, 'output'))

        self._hadoop_log_dir = hadoop_log_dir(self._opts['hadoop_home'])

        # Running jobs via hadoop assigns a new timestamp to each job.
        # Running jobs via mrjob only adds steps.
        # Store both of these values to enable log parsing.
        self._job_timestamp = None
        self._start_step_num = 0

        # init hadoop version cache
        self._hadoop_version = None

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem(
                HadoopFilesystem(self._opts['hadoop_bin']), LocalFilesystem())
        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        if not self._hadoop_version:
            stdout = self.invoke_hadoop(['version'], return_stdout=True)
            if stdout:
                first_line = stdout.split('\n')[0]
                m = HADOOP_VERSION_RE.match(first_line)
                if m:
                    self._hadoop_version = m.group('version')
                    log.info("Using Hadoop version %s" % self._hadoop_version)
                    return self._hadoop_version
            self._hadoop_version = '0.20.203'
            log.info("Unable to determine Hadoop version. Assuming 0.20.203.")
        return self._hadoop_version

    def _run(self):
        if self._opts['bootstrap_mrjob']:
            self._add_python_archive(self._create_mrjob_tar_gz())

        self._check_input_exists()
        self._create_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if not self.path_exists(path):
                raise AssertionError('Input path %s does not exist!' %
                                     (path, ))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self._mkdir_on_hdfs(self._upload_mgr.prefix)

        log.info('Copying local files into %s' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().iteritems():
            self._upload_to_hdfs(path, uri)

    def _mkdir_on_hdfs(self, path):
        log.debug('Making directory %s on HDFS' % path)
        self.invoke_hadoop(['fs', '-mkdir', path])

    def _upload_to_hdfs(self, path, target):
        log.debug('Uploading %s -> %s on HDFS' % (path, target))
        self.invoke_hadoop(['fs', '-put', path, target])

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN')
        # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s' % stdin_path)
        stdin_file = open(stdin_path, 'w')
        for line in self._stdin:
            stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        self._counters = []
        steps = self._get_steps()

        for step_num, step in enumerate(steps):
            log.debug('running step %d of %d' % (step_num + 1, len(steps)))

            streaming_args = self._streaming_args(step, step_num, len(steps))

            log.debug('> %s' % cmd_line(streaming_args))

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen
                step_proc = Popen(streaming_args, stdout=PIPE, stderr=PIPE)

                self._process_stderr_from_streaming(step_proc.stderr)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    log.error('STDOUT: ' + line.strip('\n'))

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvp(streaming_args[0], streaming_args)
                else:
                    master = os.fdopen(master_fd)
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    self._process_stderr_from_streaming(master)
                    _, returncode = os.waitpid(pid, 0)
                    master.close()

            if returncode == 0:
                # parsing needs step number for whole job
                self._fetch_counters([step_num + self._start_step_num])
                # printing needs step number relevant to this run of mrjob
                self.print_counters([step_num + 1])
            else:
                msg = ('Job failed with return code %d: %s' %
                       (returncode, streaming_args))
                log.error(msg)
                # look for a Python traceback
                cause = self._find_probable_cause_of_failure(
                    [step_num + self._start_step_num])
                if cause:
                    # log cause, and put it in exception
                    cause_msg = []  # lines to log and put in exception
                    cause_msg.append('Probable cause of failure (from %s):' %
                                     cause['log_file_uri'])
                    cause_msg.extend(
                        line.strip('\n') for line in cause['lines'])
                    if cause['input_uri']:
                        cause_msg.append('(while reading from %s)' %
                                         cause['input_uri'])

                    for line in cause_msg:
                        log.error(line)

                    # add cause_msg to exception message
                    msg += '\n' + '\n'.join(cause_msg) + '\n'

                raise CalledProcessError(returncode, streaming_args)

    def _process_stderr_from_streaming(self, stderr):
        def treat_eio_as_eof(iter):
            # on Linux, the PTY gives us a specific IOError when the
            # child process exits, rather than EOF.
            while True:
                try:
                    yield next(iter)  # okay for StopIteration to bubble up
                except IOError as e:
                    if e.errno == errno.EIO:
                        return
                    else:
                        raise

        for line in treat_eio_as_eof(stderr):
            line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2)
            log.info('HADOOP: ' + line)

            if 'Streaming Job Failed!' in line:
                raise Exception(line)

            # The job identifier is printed to stderr. We only want to parse it
            # once because we know how many steps we have and just want to know
            # what Hadoop thinks the first step's number is.
            m = HADOOP_JOB_TIMESTAMP_RE.match(line)
            if m and self._job_timestamp is None:
                self._job_timestamp = m.group('timestamp')
                self._start_step_num = int(m.group('step_num'))
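
A minimal, self-contained sketch (not mrjob code) of the PTY pattern used above: fork a pseudo-terminal, exec the command in the child, read the combined output in the parent, and treat the EIO that Linux raises once the child exits as end-of-file. The command in the trailing usage comment is only an example.

import errno
import os
import pty

def run_with_pty(argv):
    """Run *argv* under a pseudo-terminal and yield its output lines."""
    pid, master_fd = pty.fork()
    if pid == 0:
        # child: replace ourselves with the command
        os.execvp(argv[0], argv)
    else:
        # parent: the master fd gives us the child's stdout and stderr
        with os.fdopen(master_fd, 'rb') as master:
            try:
                for line in master:
                    yield line
            except OSError as e:
                # on Linux the PTY raises EIO at child exit instead of EOF
                if e.errno != errno.EIO:
                    raise
        os.waitpid(pid, 0)

# for line in run_with_pty(['echo', 'hello from a PTY']):
#     print(line)
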
Пример #32
0
class HadoopJobRunner(MRJobRunner, LogInterpretationMixin):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.
    Invoked when you run your job with ``-r hadoop``.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """
    alias = 'hadoop'

    OPTION_STORE_CLASS = HadoopRunnerOptionStore

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        if self._opts['hadoop_home']:
            log.warning(
                'hadoop_home is deprecated since 0.5.0 and will be removed'
                ' in v0.6.0. In most cases, mrjob will now find the hadoop'
                ' binary and streaming jar without help. If not, use the'
                ' hadoop_bin and hadoop_streaming_jar options.')

        self._hadoop_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(self._opts['hadoop_tmp_dir'], self._job_key))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or posixpath.join(self._hadoop_tmp_dir, 'output'))

        # Fully qualify step_output_dir, if set
        if self._step_output_dir:
            self._step_output_dir = fully_qualify_hdfs_path(
                self._step_output_dir)

        # Track job and (YARN) application ID to enable log parsing
        self._application_id = None
        self._job_id = None

        # Keep track of where the hadoop streaming jar is
        self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar']
        self._searched_for_hadoop_streaming_jar = False

        # Keep track of where the spark-submit binary is
        self._spark_submit_bin = self._opts['spark_submit_bin']

        # List of dicts (one for each step) potentially containing
        # the keys 'history', 'step', and 'task' ('step' will always
        # be filled because it comes from the hadoop jar command output,
        # others will be filled as needed)
        self._log_interpretations = []

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem(
                HadoopFilesystem(self._opts['hadoop_bin']), LocalFilesystem())
        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        return self.fs.get_hadoop_version()

    def get_hadoop_bin(self):
        """Find the hadoop binary. A list: binary followed by arguments."""
        return self.fs.get_hadoop_bin()

    def get_hadoop_streaming_jar(self):
        """Find the path of the hadoop streaming jar, or None if not found."""
        if not (self._hadoop_streaming_jar
                or self._searched_for_hadoop_streaming_jar):

            self._hadoop_streaming_jar = self._find_hadoop_streaming_jar()

            if self._hadoop_streaming_jar:
                log.info('Found Hadoop streaming jar: %s' %
                         self._hadoop_streaming_jar)
            else:
                log.warning('Hadoop streaming jar not found. Use'
                            ' --hadoop-streaming-jar')

            self._searched_for_hadoop_streaming_jar = True

        return self._hadoop_streaming_jar

    def _find_hadoop_streaming_jar(self):
        """Search for the hadoop streaming jar. See
        :py:meth:`_hadoop_streaming_jar_dirs` for where we search."""
        for path in unique(self._hadoop_streaming_jar_dirs()):
            log.info('Looking for Hadoop streaming jar in %s...' % path)

            streaming_jars = []
            for path in self.fs.ls(path):
                if _HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)):
                    streaming_jars.append(path)

            if streaming_jars:
                # prefer shorter names and shallower paths
                def sort_key(p):
                    return (len(p.split('/')), len(posixpath.basename(p)), p)

                streaming_jars.sort(key=sort_key)

                return streaming_jars[0]

        return None
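
    # Illustration of the preference above with two hypothetical candidates:
    #
    #   candidates = ['/usr/lib/hadoop-mapreduce/hadoop-streaming-2.7.1.jar',
    #                 '/usr/lib/hadoop/hadoop-streaming.jar']
    #
    # Both are five path components deep, so the shorter basename wins and
    # sorted(candidates, key=sort_key)[0] would be the second path.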

    def _hadoop_dirs(self):
        """Yield all possible hadoop directories (used for streaming jar
        and logs). May yield duplicates"""
        if self._opts['hadoop_home']:
            yield self._opts['hadoop_home']

        for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL',
                     'HADOOP_MAPRED_HOME'):
            path = os.environ.get(name)
            if path:
                yield path

        # guess it from the path of the Hadoop binary
        hadoop_home = _hadoop_prefix_from_bin(self.get_hadoop_bin()[0])
        if hadoop_home:
            yield hadoop_home

        # try HADOOP_*_HOME
        for name, path in sorted(os.environ.items()):
            if name.startswith('HADOOP_') and name.endswith('_HOME'):
                yield path

    def _hadoop_streaming_jar_dirs(self):
        """Yield all possible places to look for the Hadoop streaming jar.
        May yield duplicates.
        """
        for hadoop_dir in self._hadoop_dirs():
            yield hadoop_dir

        # use hard-coded paths to work out-of-the-box on EMR
        for path in _EMR_HADOOP_STREAMING_JAR_DIRS:
            yield path

    def _hadoop_log_dirs(self, output_dir=None):
        """Yield all possible places to look for hadoop logs."""
        # hadoop_log_dirs opt overrides all this
        if self._opts['hadoop_log_dirs']:
            for path in self._opts['hadoop_log_dirs']:
                yield path
            return

        hadoop_log_dir = os.environ.get('HADOOP_LOG_DIR')
        if hadoop_log_dir:
            yield hadoop_log_dir

        yarn = uses_yarn(self.get_hadoop_version())

        if yarn:
            yarn_log_dir = os.environ.get('YARN_LOG_DIR')
            if yarn_log_dir:
                yield yarn_log_dir

            yield _DEFAULT_YARN_HDFS_LOG_DIR

        if output_dir:
            # Cloudera style of logging
            yield posixpath.join(output_dir, '_logs')

        for hadoop_dir in self._hadoop_dirs():
            yield posixpath.join(hadoop_dir, 'logs')

        # hard-coded fallback paths
        if yarn:
            for path in _FALLBACK_HADOOP_YARN_LOG_DIRS:
                yield path

        for path in _FALLBACK_HADOOP_LOG_DIRS:
            yield path

    def get_spark_submit_bin(self):
        if not self._spark_submit_bin:
            self._spark_submit_bin = self._find_spark_submit_bin()
        return self._spark_submit_bin

    def _find_spark_submit_bin(self):
        # TODO: this is very similar to _find_hadoop_bin() (in fs)
        for path in unique(self._spark_submit_bin_dirs()):
            log.info('Looking for spark-submit binary in %s...' %
                     (path or '$PATH'))

            spark_submit_bin = which('spark-submit', path=path)

            if spark_submit_bin:
                log.info('Found spark-submit binary: %s' % spark_submit_bin)
                return [spark_submit_bin]
        else:
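            # for/else: we only get here if the loop above finished every
            # iteration without returning, i.e. spark-submit wasn't found
            # in any candidate directory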
            log.info("Falling back to 'spark-submit'")
            return ['spark-submit']

    def _spark_submit_bin_dirs(self):
        # $SPARK_HOME
        spark_home = os.environ.get('SPARK_HOME')
        if spark_home:
            yield os.path.join(spark_home, 'bin')

        yield None  # use $PATH

        # some other places recommended by install docs (see #1366)
        yield '/usr/lib/spark/bin'
        yield '/usr/local/spark/bin'
        yield '/usr/local/lib/spark/bin'
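
    # A rough standard-library analogue of the search above (sketch only;
    # shutil.which() falls back to $PATH when path is None, much like the
    # None yielded by _spark_submit_bin_dirs()):
    #
    #   import shutil
    #   for d in ['/usr/lib/spark/bin', None]:
    #       found = shutil.which('spark-submit', path=d)
    #       if found:
    #           break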

    def _run(self):
        self._find_binaries_and_jars()
        self._check_input_exists()
        self._create_setup_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _find_binaries_and_jars(self):
        """Find hadoop and (if needed) spark-submit bin up-front, before
        continuing with the job.

        (This is just for user-interaction purposes; these would otherwise
        lazy-load as needed.)
        """
        # this triggers looking for Hadoop binary
        self.get_hadoop_version()

        if self._has_streaming_steps():
            self.get_hadoop_streaming_jar()

        if self._has_spark_steps():
            self.get_spark_submit_bin()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if self._opts['check_input_paths']:
                if not self.fs.exists(path):
                    raise AssertionError('Input path %s does not exist!' %
                                         (path, ))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self.fs.mkdir(self._upload_mgr.prefix)

        log.info('Copying local files to %s...' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().items():
            self._upload_to_hdfs(path, uri)

    def _upload_to_hdfs(self, path, target):
        log.debug('  %s -> %s' % (path, target))
        self.fs._put(path, target)

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN')
        # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s...' % stdin_path)
        stdin_file = open(stdin_path, 'wb')
        for line in self._stdin:
            stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_args = self._args_for_step(step_num)
            env = self._env_for_step(step_num)

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr, record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_hadoop(to_string(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvpe(step_args[0], step_args, env)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            step_type = step['type']

            if not _is_spark_step_type(step_type):
                counters = self._pick_counters(log_interpretation, step_type)
                if counters:
                    log.info(_format_counters(counters))
                else:
                    log.warning('No counters found')

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(reason=reason,
                                          step_num=step_num,
                                          num_steps=self._num_steps())

    def _warn_about_spark_archives(self, step):
        """If *step* is a Spark step, the *upload_archives* option is set,
        and *spark_master* is not ``'yarn'``, warn that *upload_archives*
        will be ignored by Spark."""
        if (_is_spark_step_type(step['type'])
                and self._opts['spark_master'] != 'yarn'
                and self._opts['upload_archives']):
            log.warning('Spark will probably ignore archives because'
                        " spark_master is not set to 'yarn'")

    def _args_for_step(self, step_num):
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._args_for_streaming_step(step_num)
        elif step['type'] == 'jar':
            return self._args_for_jar_step(step_num)
        elif _is_spark_step_type(step['type']):
            return self._args_for_spark_step(step_num)
        else:
            raise AssertionError('Bad step type: %r' % (step['type'], ))

    def _args_for_streaming_step(self, step_num):
        hadoop_streaming_jar = self.get_hadoop_streaming_jar()
        if not hadoop_streaming_jar:
            raise Exception('no Hadoop streaming jar')

        mapper, combiner, reducer = (self._hadoop_streaming_commands(step_num))

        args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar]

        # set up uploading from HDFS to the working dir
        args.extend(self._upload_args())

        # if no reducer, shut off reducer tasks. This has to come before
        # extra hadoop args, which could contain jar-specific args
        # (e.g. -outputformat). See #1331.
        #
        # might want to just integrate this into _hadoop_args_for_step?
        if not reducer:
            args.extend([
                '-D',
                ('%s=0' % translate_jobconf('mapreduce.job.reduces',
                                            self.get_hadoop_version()))
            ])

        # Add extra hadoop args first as hadoop args could be a hadoop
        # specific argument which must come before job
        # specific args.
        args.extend(self._hadoop_args_for_step(step_num))

        # set up input
        for input_uri in self._step_input_uris(step_num):
            args.extend(['-input', input_uri])

        # set up output
        args.append('-output')
        args.append(self._step_output_uri(step_num))

        args.append('-mapper')
        args.append(mapper)

        if combiner:
            args.append('-combiner')
            args.append(combiner)

        if reducer:
            args.append('-reducer')
            args.append(reducer)

        return args
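
    # Roughly what the assembled command looks like for a one-step job with
    # no reducer (illustrative only; the jar location, HDFS paths and mapper
    # command are made up):
    #
    #   hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar \
    #       -files hdfs:///user/me/tmp/files/my_job.py#my_job.py \
    #       -D mapreduce.job.reduces=0 \
    #       -input hdfs:///user/me/tmp/files/input.txt \
    #       -output hdfs:///user/me/tmp/output \
    #       -mapper 'python my_job.py --step-num=0 --mapper'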

    def _args_for_jar_step(self, step_num):
        step = self._get_step(step_num)

        args = []

        args.extend(self.get_hadoop_bin())

        # -libjars, -D
        args.extend(self._hadoop_generic_args_for_step(step_num))

        # special case for consistency with EMR runner.
        #
        # This might look less like duplicated code if we ever
        # implement #780 (fetching jars from URIs)
        if step['jar'].startswith('file:///'):
            jar = step['jar'][7:]  # keep leading slash
        else:
            jar = step['jar']

        args.extend(['jar', jar])

        if step.get('main_class'):
            args.append(step['main_class'])

        if step.get('args'):
            args.extend(
                self._interpolate_input_and_output(step['args'], step_num))

        return args
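
    # The slicing above keeps the leading slash, e.g. (hypothetical jar):
    #
    #   'file:///usr/lib/hadoop/lib/foo.jar'[7:] == '/usr/lib/hadoop/lib/foo.jar'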

    def _spark_submit_arg_prefix(self):
        return ['--master', self._opts['spark_master']]

    def _env_for_step(self, step_num):
        step = self._get_step(step_num)

        env = dict(os.environ)

        # when running spark-submit, set its environment directly. See #1464
        if _is_spark_step_type(step['type']):
            env.update(self._spark_cmdenv(step_num))

        return env

    def _default_step_output_dir(self):
        return posixpath.join(self._hadoop_tmp_dir, 'step-output')

    def _cleanup_hadoop_tmp(self):
        if self._hadoop_tmp_dir:
            log.info('Removing HDFS temp directory %s...' %
                     self._hadoop_tmp_dir)
            try:
                self.fs.rm(self._hadoop_tmp_dir)
            except Exception as e:
                log.exception(e)

    ### LOG (implementation of LogInterpretationMixin) ###

    def _stream_history_log_dirs(self, output_dir=None):
        """Yield lists of directories to look for the history log in."""
        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if _logs_exist(self.fs, log_dir):
                log.info('Looking for history log in %s...' % log_dir)
                # logs aren't always in a subdir named history/
                yield [log_dir]

    def _stream_task_log_dirs(self, application_id=None, output_dir=None):
        """Yield lists of directories to look for the task logs in."""
        # Note: this is unlikely to be super-helpful on "real" (multi-node)
        # pre-YARN Hadoop because task logs aren't generally shipped to a
        # local directory. It's a start, anyways. See #1201.
        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if application_id:
                path = self.fs.join(log_dir, 'userlogs', application_id)
            else:
                path = self.fs.join(log_dir, 'userlogs')

            if _logs_exist(self.fs, path):
                log.info('Looking for task syslogs in %s...' % path)
                yield [path]

    def counters(self):
        return [
            _pick_counters(log_interpretation)
            for log_interpretation in self._log_interpretations
        ]
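
For reference, a hedged sketch of the kind of value counters() returns: one dict per step, mapping counter groups to counter names and amounts (the group and counter names below are only illustrative, not taken from real Hadoop output).

example_counters = [
    {'Job Counters': {'Launched map tasks': 2, 'Launched reduce tasks': 1}},
    {'Job Counters': {'Launched map tasks': 1, 'Launched reduce tasks': 1}},
]
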
Пример #33
0
class HadoopJobRunner(MRJobRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.
    Invoked when you run your job with ``-r hadoop``.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """

    alias = "hadoop"

    OPTION_STORE_CLASS = HadoopRunnerOptionStore

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hdfs_tmp_dir = fully_qualify_hdfs_path(posixpath.join(self._opts["hdfs_scratch_dir"], self._job_key))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hdfs_tmp_dir, "files", "")
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(self._output_dir or posixpath.join(self._hdfs_tmp_dir, "output"))

        self._hadoop_log_dir = hadoop_log_dir(self._opts["hadoop_home"])

        # Running jobs via hadoop assigns a new timestamp to each job.
        # Running jobs via mrjob only adds steps.
        # Store both of these values to enable log parsing.
        self._job_timestamp = None
        self._start_step_num = 0

        # init hadoop version cache
        self._hadoop_version = None

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem(HadoopFilesystem(self._opts["hadoop_bin"]), LocalFilesystem())
        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        if not self._hadoop_version:
            stdout = self.invoke_hadoop(["version"], return_stdout=True)
            if stdout:
                first_line = stdout.split("\n")[0]
                m = HADOOP_VERSION_RE.match(first_line)
                if m:
                    self._hadoop_version = m.group("version")
                    log.info("Using Hadoop version %s" % self._hadoop_version)
                    return self._hadoop_version
            self._hadoop_version = "0.20.203"
            log.info("Unable to determine Hadoop version. Assuming 0.20.203.")
        return self._hadoop_version

    def _run(self):
        self._check_input_exists()
        self._create_setup_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == "-":
                continue  # STDIN always exists

            if self._opts["check_input_paths"]:
                if not self.path_exists(path):
                    raise AssertionError("Input path %s does not exist!" % (path,))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self._mkdir_on_hdfs(self._upload_mgr.prefix)

        log.info("Copying local files into %s" % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().items():
            self._upload_to_hdfs(path, uri)

    def _mkdir_on_hdfs(self, path):
        log.debug("Making directory %s on HDFS" % path)

        hadoop_version = self.get_hadoop_version()
        # from version 0.23 / 2.x on, -mkdir needs a -p option to create
        # parent directories

        # version == 0.23
        if mrjob.compat.version_gte(hadoop_version, "0.23") and not mrjob.compat.version_gte(hadoop_version, "0.24"):
            self.invoke_hadoop(["fs", "-mkdir", "-p", path])

        # version >= 2.0
        elif mrjob.compat.version_gte(hadoop_version, "2.0"):
            self.invoke_hadoop(["fs", "-mkdir", "-p", path])

        # for version 0.20, 1.x
        else:
            self.invoke_hadoop(["fs", "-mkdir", path])

    def _upload_to_hdfs(self, path, target):
        log.debug("Uploading %s -> %s on HDFS" % (path, target))
        self.invoke_hadoop(["fs", "-put", path, target])

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = os.path.join(self._get_local_tmp_dir(), "STDIN")
        # prompt user, so they don't think the process has stalled
        log.info("reading from STDIN")

        log.debug("dumping stdin to local file %s" % stdin_path)
        stdin_file = open(stdin_path, "wb")
        for line in self._stdin:
            stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        self._counters = []

        for step_num in range(self._num_steps()):
            log.debug("running step %d of %d" % (step_num + 1, self._num_steps()))

            step_args = self._args_for_step(step_num)

            log.debug("> %s" % cmd_line(step_args))

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen
                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

                self._process_stderr_from_streaming(step_proc.stderr)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    log.error("STDOUT: " + to_string(line.strip(b"\n")))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvp(step_args[0], step_args)
                else:
                    with os.fdopen(master_fd, "rb") as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        self._process_stderr_from_streaming(master)
                        _, returncode = os.waitpid(pid, 0)

            if returncode == 0:
                # parsing needs step number for whole job
                self._fetch_counters([step_num + self._start_step_num])
                # printing needs step number relevant to this run of mrjob
                self.print_counters([step_num + 1])
            else:
                msg = "Job failed with return code %d: %s" % (returncode, step_args)
                log.error(msg)
                # look for a Python traceback
                cause = self._find_probable_cause_of_failure([step_num + self._start_step_num])
                if cause:
                    # log cause, and put it in exception
                    cause_msg = []  # lines to log and put in exception
                    cause_msg.append("Probable cause of failure (from %s):" % cause["log_file_uri"])
                    cause_msg.extend(line.strip("\n") for line in cause["lines"])
                    if cause["input_uri"]:
                        cause_msg.append("(while reading from %s)" % cause["input_uri"])

                    for line in cause_msg:
                        log.error(line)

                    # add cause_msg to exception message
                    msg += "\n" + "\n".join(cause_msg) + "\n"

                raise CalledProcessError(returncode, step_args)

    def _process_stderr_from_streaming(self, stderr):
        def treat_eio_as_eof(iter):
            # on Linux, the PTY gives us a specific IOError when the
            # child process exits, rather than EOF.
            while True:
                try:
                    yield next(iter)  # okay for StopIteration to bubble up
                except IOError as e:
                    if e.errno == errno.EIO:
                        return
                    else:
                        raise

        for line in treat_eio_as_eof(stderr):
            line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2)
            log.info("HADOOP: " + to_string(line))

            if b"Streaming Job Failed!" in line:
                raise Exception(line)

            # The job identifier is printed to stderr. We only want to parse it
            # once because we know how many steps we have and just want to know
            # what Hadoop thinks the first step's number is.
            m = HADOOP_JOB_TIMESTAMP_RE.match(line)
            if m and self._job_timestamp is None:
                self._job_timestamp = m.group("timestamp")
                self._start_step_num = int(m.group("step_num"))

    def _args_for_step(self, step_num):
        step = self._get_step(step_num)

        if step["type"] == "streaming":
            return self._args_for_streaming_step(step_num)
        elif step["type"] == "jar":
            return self._args_for_jar_step(step_num)
        else:
            raise AssertionError("Bad step type: %r" % (step["type"],))

    def _args_for_streaming_step(self, step_num):
        version = self.get_hadoop_version()

        args = self._opts["hadoop_bin"] + ["jar", self._opts["hadoop_streaming_jar"]]

        # -files/-archives (generic options, new-style)
        if supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            args.extend(self._new_upload_args(self._upload_mgr))

        # Add extra hadoop args first as hadoop args could be a hadoop
        # specific argument (e.g. -libjar) which must come before job
        # specific args.
        args.extend(self._hadoop_args_for_step(step_num))

        # set up input
        for input_uri in self._hdfs_step_input_files(step_num):
            args.extend(["-input", input_uri])

        # set up output
        args.append("-output")
        args.append(self._hdfs_step_output_dir(step_num))

        # -cacheFile/-cacheArchive (streaming options, old-style)
        if not supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            args.extend(self._old_upload_args(self._upload_mgr))

        mapper, combiner, reducer = self._hadoop_streaming_commands(step_num)

        args.append("-mapper")
        args.append(mapper)

        if combiner:
            args.append("-combiner")
            args.append(combiner)

        if reducer:
            args.append("-reducer")
            args.append(reducer)
        else:
            args.extend(["-jobconf", "mapred.reduce.tasks=0"])

        return args

    def _args_for_jar_step(self, step_num):
        step = self._get_step(step_num)

        # special case for consistency with EMR runner.
        #
        # This might look less like duplicated code if we ever
        # implement #780 (fetching jars from URIs)
        if step["jar"].startswith("file:///"):
            jar = step["jar"][7:]  # keep leading slash
        else:
            jar = step["jar"]

        args = self._opts["hadoop_bin"] + ["jar", jar]

        if step.get("main_class"):
            args.append(step["main_class"])

        # TODO: merge with logic in mrjob/emr.py
        def interpolate(arg):
            if arg == mrjob.step.JarStep.INPUT:
                return ",".join(self._hdfs_step_input_files(step_num))
            elif arg == mrjob.step.JarStep.OUTPUT:
                return self._hdfs_step_output_dir(step_num)
            else:
                return arg

        if step.get("args"):
            args.extend(interpolate(arg) for arg in step["args"])

        return args

    def _hdfs_step_input_files(self, step_num):
        """Get the hdfs:// URI for input for the given step."""
        if step_num == 0:
            return [self._upload_mgr.uri(p) for p in self._get_input_paths()]
        else:
            return [posixpath.join(self._hdfs_tmp_dir, "step-output", str(step_num))]

    def _hdfs_step_output_dir(self, step_num):
        if step_num == len(self._get_steps()) - 1:
            return self._output_dir
        else:
            return posixpath.join(self._hdfs_tmp_dir, "step-output", str(step_num + 1))
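
    # Sketch of how intermediate data flows for a hypothetical 3-step job:
    #
    #   step 0: reads the uploaded input URIs,
    #           writes <hdfs_tmp_dir>/step-output/1
    #   step 1: reads <hdfs_tmp_dir>/step-output/1,
    #           writes <hdfs_tmp_dir>/step-output/2
    #   step 2: reads <hdfs_tmp_dir>/step-output/2,
    #           writes self._output_dir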

    def _cleanup_local_scratch(self):
        super(HadoopJobRunner, self)._cleanup_local_scratch()

        if self._hdfs_tmp_dir:
            log.info("deleting %s from HDFS" % self._hdfs_tmp_dir)

            try:
                self.invoke_hadoop(["fs", "-rmr", self._hdfs_tmp_dir])
            except Exception as e:
                log.exception(e)

    ### LOG FETCHING/PARSING ###

    def _enforce_path_regexp(self, paths, regexp, step_nums):
        """Helper for log fetching functions to filter out unwanted
        logs. Keyword arguments are checked against their corresponding
        regex groups.
        """
        for path in paths:
            m = regexp.match(path)
            if (
                m
                and (step_nums is None or int(m.group("step_num")) in step_nums)
                and (self._job_timestamp is None or m.group("timestamp") == self._job_timestamp)
            ):

                yield path

    def _ls_logs(self, relative_path):
        """List logs on the local filesystem by path relative to log root
        directory
        """
        return self.ls(os.path.join(self._hadoop_log_dir, relative_path))

    def _fetch_counters(self, step_nums, skip_s3_wait=False):
        """Read Hadoop counters from local logs.

        Args:
        step_nums -- the steps belonging to us, so that we can ignore errors
                     from other jobs run with the same timestamp
        """
        job_logs = self._enforce_path_regexp(self._ls_logs("history/"), HADOOP_JOB_LOG_URI_RE, step_nums)
        uris = list(job_logs)
        new_counters = scan_for_counters_in_files(uris, self, self.get_hadoop_version())

        # only include steps relevant to the current job
        for step_num in step_nums:
            self._counters.append(new_counters.get(step_num, {}))

    def counters(self):
        return self._counters

    def _find_probable_cause_of_failure(self, step_nums):
        all_task_attempt_logs = []
        try:
            all_task_attempt_logs.extend(self._ls_logs("userlogs/"))
        except IOError:
            # sometimes the master doesn't have these
            pass
        # TODO: get these logs from slaves if possible
        task_attempt_logs = self._enforce_path_regexp(all_task_attempt_logs, TASK_ATTEMPTS_LOG_URI_RE, step_nums)
        step_logs = self._enforce_path_regexp(self._ls_logs("steps/"), STEP_LOG_URI_RE, step_nums)
        job_logs = self._enforce_path_regexp(self._ls_logs("history/"), HADOOP_JOB_LOG_URI_RE, step_nums)
        log.info("Scanning logs for probable cause of failure")
        return best_error_from_logs(self, task_attempt_logs, step_logs, job_logs)
Пример #34
0
 def uri_adds_trailing_slash(self):
     sd = UploadDirManager('s3://bucket/dir')
     sd.add('foo/bar.py')
     self.assertEqual(sd.uri('foo/bar.py'), 's3://bucket/dir/bar.py')
     self.assertEqual(sd.path_to_uri(),
                      {'foo/bar.py': 's3://bucket/dir/bar.py'})
Пример #35
0
class HadoopJobRunner(MRJobRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """
    alias = 'hadoop'

    OPTION_STORE_CLASS = HadoopRunnerOptionStore

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hdfs_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(self._opts['hdfs_scratch_dir'], self._job_name))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hdfs_tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or posixpath.join(self._hdfs_tmp_dir, 'output'))

        self._hadoop_log_dir = hadoop_log_dir(self._opts['hadoop_home'])

        # Running jobs via hadoop assigns a new timestamp to each job.
        # Running jobs via mrjob only adds steps.
        # Store both of these values to enable log parsing.
        self._job_timestamp = None
        self._start_step_num = 0

        # init hadoop version cache
        self._hadoop_version = None

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem(
                HadoopFilesystem(self._opts['hadoop_bin']), LocalFilesystem())
        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        if not self._hadoop_version:
            stdout = self.invoke_hadoop(['version'], return_stdout=True)
            if stdout:
                first_line = stdout.split('\n')[0]
                m = HADOOP_VERSION_RE.match(first_line)
                if m:
                    self._hadoop_version = m.group('version')
                    log.info("Using Hadoop version %s" % self._hadoop_version)
                    return self._hadoop_version
            self._hadoop_version = '0.20.203'
            log.info("Unable to determine Hadoop version. Assuming 0.20.203.")
        return self._hadoop_version

    def _run(self):
        if self._opts['bootstrap_mrjob']:
            self._add_python_archive(self._create_mrjob_tar_gz())

        self._check_input_exists()
        self._create_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if not self.path_exists(path):
                raise AssertionError('Input path %s does not exist!' %
                                     (path, ))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self._mkdir_on_hdfs(self._upload_mgr.prefix)

        log.info('Copying local files into %s' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().items():
            self._upload_to_hdfs(path, uri)

    def _mkdir_on_hdfs(self, path):
        log.debug('Making directory %s on HDFS' % path)
        self.invoke_hadoop(['fs', '-mkdir', path])

    def _upload_to_hdfs(self, path, target):
        log.debug('Uploading %s -> %s on HDFS' % (path, target))
        self.invoke_hadoop(['fs', '-put', path, target])

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN')
        # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s' % stdin_path)
        stdin_file = open(stdin_path, 'w')
        for line in self._stdin:
            stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        self._counters = []
        steps = self._get_steps()

        for step_num, step in enumerate(steps):
            log.debug('running step %d of %d' % (step_num + 1, len(steps)))

            streaming_args = self._streaming_args(step, step_num, len(steps))

            log.debug('> %s' % cmd_line(streaming_args))
            step_proc = Popen(streaming_args, stdout=PIPE, stderr=PIPE)

            # TODO: use a pty or something so that the hadoop binary
            # won't buffer the status messages
            self._process_stderr_from_streaming(step_proc.stderr)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                log.error('STDOUT: ' + line.strip('\n'))

            returncode = step_proc.wait()
            if returncode == 0:
                # parsing needs step number for whole job
                self._fetch_counters([step_num + self._start_step_num])
                # printing needs step number relevant to this run of mrjob
                self.print_counters([step_num + 1])
            else:
                msg = ('Job failed with return code %d: %s' %
                       (step_proc.returncode, streaming_args))
                log.error(msg)
                # look for a Python traceback
                cause = self._find_probable_cause_of_failure(
                    [step_num + self._start_step_num])
                if cause:
                    # log cause, and put it in exception
                    cause_msg = []  # lines to log and put in exception
                    cause_msg.append('Probable cause of failure (from %s):' %
                                     cause['log_file_uri'])
                    cause_msg.extend(
                        line.strip('\n') for line in cause['lines'])
                    if cause['input_uri']:
                        cause_msg.append('(while reading from %s)' %
                                         cause['input_uri'])

                    for line in cause_msg:
                        log.error(line)

                    # add cause_msg to exception message
                    msg += '\n' + '\n'.join(cause_msg) + '\n'

                raise Exception(msg)

    def _process_stderr_from_streaming(self, stderr):
        for line in stderr:
            line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2)
            log.info('HADOOP: ' + line)

            if 'Streaming Job Failed!' in line:
                raise Exception(line)

            # The job identifier is printed to stderr. We only want to parse it
            # once because we know how many steps we have and just want to know
            # what Hadoop thinks the first step's number is.
            m = HADOOP_JOB_TIMESTAMP_RE.match(line)
            if m and self._job_timestamp is None:
                self._job_timestamp = m.group('timestamp')
                self._start_step_num = int(m.group('step_num'))

    def _streaming_args(self, step, step_num, num_steps):
        version = self.get_hadoop_version()

        streaming_args = (self._opts['hadoop_bin'] +
                          ['jar', self._opts['hadoop_streaming_jar']])

        # -files/-archives (generic options, new-style)
        if supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            streaming_args.extend(self._new_upload_args(self._upload_mgr))

        # Add extra hadoop args first as hadoop args could be a hadoop
        # specific argument (e.g. -libjar) which must come before job
        # specific args.
        streaming_args.extend(self._hadoop_conf_args(step_num, num_steps))

        # set up input
        for input_uri in self._hdfs_step_input_files(step_num):
            streaming_args.extend(['-input', input_uri])

        # set up output
        streaming_args.append('-output')
        streaming_args.append(self._hdfs_step_output_dir(step_num))

        # -cacheFile/-cacheArchive (streaming options, old-style)
        if not supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            streaming_args.extend(self._old_upload_args(self._upload_mgr))

        mapper, combiner, reducer = (self._hadoop_streaming_commands(
            step, step_num))

        streaming_args.append('-mapper')
        streaming_args.append(mapper)

        if combiner:
            streaming_args.append('-combiner')
            streaming_args.append(combiner)

        if reducer:
            streaming_args.append('-reducer')
            streaming_args.append(reducer)
        else:
            streaming_args.extend(['-jobconf', 'mapred.reduce.tasks=0'])

        return streaming_args

    def _hdfs_step_input_files(self, step_num):
        """Get the hdfs:// URI for input for the given step."""
        if step_num == 0:
            return [self._upload_mgr.uri(p) for p in self._get_input_paths()]
        else:
            return [
                posixpath.join(self._hdfs_tmp_dir, 'step-output',
                               str(step_num))
            ]

    def _hdfs_step_output_dir(self, step_num):
        if step_num == len(self._get_steps()) - 1:
            return self._output_dir
        else:
            return posixpath.join(self._hdfs_tmp_dir, 'step-output',
                                  str(step_num + 1))

    def _cleanup_local_scratch(self):
        super(HadoopJobRunner, self)._cleanup_local_scratch()

        if self._hdfs_tmp_dir:
            log.info('deleting %s from HDFS' % self._hdfs_tmp_dir)

            try:
                self.invoke_hadoop(['fs', '-rmr', self._hdfs_tmp_dir])
            except Exception as e:
                log.exception(e)
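
    # Small note on the cleanup above: 'fs -rmr' is deprecated on Hadoop 2.x
    # and later; there the same recursive delete is spelled 'fs -rm -r', e.g.:
    #
    #   self.invoke_hadoop(['fs', '-rm', '-r', self._hdfs_tmp_dir])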
Пример #36
0
class DataprocJobRunner(MRJobRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` on Google Cloud Dataproc.
    Invoked when you run your job with ``-r dataproc``.

    :py:class:`DataprocJobRunner` runs your job in a Dataproc cluster, which
    is basically a temporary Hadoop cluster.

    Input, support, and jar files can be either local or on GCS; use
    ``gs://...`` URLs to refer to files on GCS.

    This class has some useful utilities for talking directly to GCS and
    Dataproc, so you may find it useful to instantiate it without a script::

        from mrjob.dataproc import DataprocJobRunner
        ...
    """
    alias = 'dataproc'

    # Don't need to bootstrap mrjob in the setup wrapper; that's what
    # the bootstrap script is for!
    BOOTSTRAP_MRJOB_IN_SETUP = False

    OPTION_STORE_CLASS = DataprocRunnerOptionStore

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same
        arguments as
        :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(DataprocJobRunner, self).__init__(**kwargs)

        # Lazy-load gcloud config as needed - invocations fail in PyCharm
        # debugging
        self._gcloud_config = None

        # Google Cloud Platform - project
        self._gcp_project = (self._opts['gcp_project']
                             or self.gcloud_config()['core.project'])

        # Google Compute Engine - Region / Zone
        self._gce_region = (self._opts['region']
                            or self.gcloud_config()['compute.region'])
        self._gce_zone = (self._opts['zone']
                          or self.gcloud_config()['compute.zone'])

        # cluster_id can be None here
        self._cluster_id = self._opts['cluster_id']

        self._api_client = None
        self._gcs_fs = None
        self._fs = None

        # BEGIN - setup directories
        base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir'])

        self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir)

        # use job key to make a unique tmp dir
        self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/'

        # pick/validate output dir
        if self._output_dir:
            self._output_dir = _check_and_fix_fs_dir(self._output_dir)
        else:
            self._output_dir = self._job_tmpdir + 'output/'
        # END - setup directories

        # manage working dir for bootstrap script
        self._bootstrap_dir_mgr = BootstrapWorkingDirManager()

        # manage local files that we want to upload to GCS. We'll add them
        # to this manager just before we need them.
        fs_files_dir = self._job_tmpdir + 'files/'
        self._upload_mgr = UploadDirManager(fs_files_dir)

        self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()

        for cmd in self._bootstrap:
            for maybe_path_dict in cmd:
                if isinstance(maybe_path_dict, dict):
                    self._bootstrap_dir_mgr.add(**maybe_path_dict)

        # we'll create the script later
        self._master_bootstrap_script_path = None

        # when did our particular task start?
        self._dataproc_job_start = None

        # init hadoop, ami version caches
        self._image_version = None
        self._hadoop_version = None

        # This will be filled by _run_steps()
        # NOTE - log_interpretations will be empty except job_id until we
        # parse task logs
        self._log_interpretations = []

    def gcloud_config(self):
        """Lazy load gcloud SDK configs"""
        if not self._gcloud_config:
            self._gcloud_config = _read_gcloud_config()

        return self._gcloud_config

    @property
    def api_client(self):
        if not self._api_client:
            credentials = GoogleCredentials.get_application_default()

            api_client = discovery.build(_DATAPROC_API_ENDPOINT,
                                         _DATAPROC_API_VERSION,
                                         credentials=credentials)
            self._api_client = api_client.projects().regions()

        return self._api_client

    @property
    def fs(self):
        """:py:class:`~mrjob.fs.base.Filesystem` object for SSH, S3, GCS, and
        the local filesystem.
        """
        if self._fs is not None:
            return self._fs

        self._gcs_fs = GCSFilesystem()

        self._fs = CompositeFilesystem(self._gcs_fs, LocalFilesystem())
        return self._fs

    def _get_tmpdir(self, given_tmpdir):
        """Helper for _fix_tmpdir"""
        if given_tmpdir:
            return given_tmpdir

        mrjob_buckets = self.fs.list_buckets(self._gcp_project,
                                             prefix='mrjob-')

        # Loop over buckets until we find one that matches region
        # NOTE - because this is a tmpdir, we look for a GCS bucket in the
        # same GCE region
        chosen_bucket_name = None
        gce_lower_location = self._gce_region.lower()
        for tmp_bucket in mrjob_buckets:
            tmp_bucket_name = tmp_bucket['name']

            # NOTE - ambiguous GCP behavior - bucket location is returned in
            # UPPERCASE even though the docs suggest lowercase; ticket filed
            # as of Apr 23, 2016
            lower_location = tmp_bucket['location'].lower()
            if lower_location == gce_lower_location:
                # Regions are both specified and match
                log.info("using existing temp bucket %s" % tmp_bucket_name)
                chosen_bucket_name = tmp_bucket_name
                break

        # Example default - "mrjob-us-central1-RANDOMHEX"
        if not chosen_bucket_name:
            chosen_bucket_name = '-'.join(
                ['mrjob', gce_lower_location,
                 random_identifier()])

        return 'gs://%s/tmp/' % chosen_bucket_name

    def _run(self):
        self._launch()
        self._run_steps()

    def _launch(self):
        self._prepare_for_launch()
        self._launch_cluster()

    def _prepare_for_launch(self):
        self._check_input_exists()
        self._check_output_not_exists()
        self._create_setup_wrapper_script()
        self._add_bootstrap_files_for_upload()
        self._add_job_files_for_upload()
        self._upload_local_files_to_fs()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        if not self._opts['check_input_paths']:
            return

        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if is_uri(path) and not is_gcs_uri(path):
                continue  # can't check non-GCS URIs, hope for the best

            if not self.fs.exists(path):
                raise AssertionError('Input path %s does not exist!' %
                                     (path, ))

    def _check_output_not_exists(self):
        """Verify the output path does not already exist. This avoids
        provisioning a cluster only to have Hadoop refuse to launch.
        """
        if self.fs.exists(self._output_dir):
            raise IOError('Output path %s already exists!' %
                          (self._output_dir, ))

    def _add_bootstrap_files_for_upload(self):
        """Add files needed by the bootstrap script to self._upload_mgr.

        Tar up mrjob if bootstrap_mrjob is True.

        Create the master bootstrap script if necessary.

        """
        # lazily create mrjob.zip
        if self._bootstrap_mrjob():
            self._create_mrjob_zip()
            self._bootstrap_dir_mgr.add('file', self._mrjob_zip_path)

        # all other files needed by the script are already in
        # _bootstrap_dir_mgr
        for path in self._bootstrap_dir_mgr.paths():
            self._upload_mgr.add(path)

        # now that we know where the above files live, we can create
        # the master bootstrap script
        self._create_master_bootstrap_script_if_needed()
        if self._master_bootstrap_script_path:
            self._upload_mgr.add(self._master_bootstrap_script_path)
            self._upload_mgr.add(_MAX_HOURS_IDLE_BOOTSTRAP_ACTION_PATH)

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

        # TODO - mtai @ davidmarin - hadoop_streaming_jar is currently ignored,
        # see _HADOOP_STREAMING_JAR_URI
        # if self._opts['hadoop_streaming_jar']:
        #     self._upload_mgr.add(self._opts['hadoop_streaming_jar'])

        for step in self._get_steps():
            if step.get('jar'):
                self._upload_mgr.add(step['jar'])

    def _upload_local_files_to_fs(self):
        """Copy local files tracked by self._upload_mgr to FS."""
        bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
        self._create_fs_tmp_bucket(bucket_name)

        log.info('Copying non-input files into %s' % self._upload_mgr.prefix)

        for path, gcs_uri in self._upload_mgr.path_to_uri().items():
            log.debug('uploading %s -> %s' % (path, gcs_uri))

            # TODO - mtai @ davidmarin - Implement put function for other FSs
            self.fs.put(path, gcs_uri)

        self._wait_for_fs_sync()

    def _create_fs_tmp_bucket(self, bucket_name, location=None):
        """Create a temp bucket if missing

        Tie the temporary bucket to the same region as the GCE job and set a
        28-day TTL
        """
        # Return early if our bucket already exists
        try:
            self.fs.get_bucket(bucket_name)
            return
        except google_errors.HttpError as e:
            if not e.resp.status == 404:
                raise

        log.info('creating FS bucket %r' % bucket_name)

        location = location or self._gce_region

        # NOTE - By default, we create a bucket in the same GCE region as our
        # job (tmp buckets ONLY)
        # https://cloud.google.com/storage/docs/bucket-locations
        self.fs.create_bucket(
            self._gcp_project,
            bucket_name,
            location=location,
            object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS)

        self._wait_for_fs_sync()

    ### Running the job ###
    def cleanup(self, mode=None):
        super(DataprocJobRunner, self).cleanup(mode=mode)

        # stop the cluster if it belongs to us (it may have stopped on its
        # own already, but that's fine)
        if self._cluster_id and not self._opts['cluster_id']:
            self._cleanup_cluster()

    def _cleanup_cloud_tmp(self):
        # delete all the files we created
        if not self._job_tmpdir:
            return

        try:
            log.info('Removing all files in %s' % self._job_tmpdir)
            self.fs.rm(self._job_tmpdir)
            self._job_tmpdir = None
        except Exception as e:
            log.exception(e)

    # TODO - mtai @ davidmarin - Re-enable log support and supporting cleanup
    def _cleanup_logs(self):
        super(DataprocJobRunner, self)._cleanup_logs()

    def _cleanup_job(self):
        job_prefix = self._dataproc_job_prefix()
        for current_job in self._api_job_list(cluster_name=self._cluster_id,
                                              state_matcher='ACTIVE'):
            # Kill all active jobs with the same job_prefix as this job
            current_job_id = current_job['reference']['jobId']

            if not current_job_id.startswith(job_prefix):
                continue

            self._api_job_cancel(current_job_id)
            self._wait_for_api('job cancellation')

    def _cleanup_cluster(self):
        if not self._cluster_id:
            # If we don't have a cluster, then we can't terminate it.
            return

        try:
            log.info("Attempting to terminate cluster")
            self._api_cluster_delete(self._cluster_id)
        except Exception as e:
            log.exception(e)
            return
        log.info('cluster %s successfully terminated' % self._cluster_id)

    def _wait_for_api(self, msg):
        _wait_for(msg, self._opts['check_cluster_every'])

    def _wait_for_fs_sync(self):
        """Sleep for a little while, to give FS a chance to sync up.
        """
        _wait_for('GCS sync (eventual consistency)',
                  self._opts['cloud_fs_sync_secs'])

    def _build_dataproc_hadoop_job(self, step_num):
        """This function creates a "HadoopJob" to be passed to
        self._api_job_submit_hadoop

        :param step_num:
        :return: output_hadoop_job
        """
        # Reference: https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa

        args = list()
        file_uris = list()
        archive_uris = list()
        properties = dict()

        step = self._get_step(step_num)

        assert step['type'] in ('streaming', 'jar'), ('Bad step type: %r' %
                                                      (step['type'], ))

        # TODO - mtai @ davidmarin - Might be trivial to support jar running,
        # see "mainJarFileUri" of variable "output_hadoop_job" in this function
        #         https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa

        assert step['type'] == 'streaming', 'Jar not implemented'
        main_jar_uri = _HADOOP_STREAMING_JAR_URI

        # TODO - mtai @ davidmarin - Not clear if we should move _upload_args
        # to file_uris, currently works fine as-is

        # TODO - dmarin @ mtai - Probably a little safer to do the API's way,
        # assuming the API supports distributed cache syntax (so we can pick
        # the names of the uploaded files).
        args.extend(self._upload_args())

        args.extend(self._hadoop_args_for_step(step_num))

        mapper, combiner, reducer = self._hadoop_streaming_commands(step_num)

        if mapper:
            args += ['-mapper', mapper]

        if combiner:
            args += ['-combiner', combiner]

        if reducer:
            args += ['-reducer', reducer]

        for current_input_uri in self._step_input_uris(step_num):
            args += ['-input', current_input_uri]

        args += ['-output', self._step_output_uri(step_num)]

        # TODO - mtai @ davidmarin - Add back support to specify a different
        # mainJarFileURI
        output_hadoop_job = dict(args=args,
                                 fileUris=file_uris,
                                 archiveUris=archive_uris,
                                 properties=properties,
                                 mainJarFileUri=main_jar_uri)
        return output_hadoop_job
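
    # For illustration only (not part of the original source): a hedged
    # sketch of the kind of dict _build_dataproc_hadoop_job() returns for a
    # one-step streaming job. All concrete URIs, file names, and commands
    # below are hypothetical.
    #
    #   {
    #       'mainJarFileUri': _HADOOP_STREAMING_JAR_URI,
    #       'args': [
    #           # ... args from _upload_args() / _hadoop_args_for_step() ...
    #           '-mapper', 'python mr_your_job.py --step-num=0 --mapper',
    #           '-reducer', 'python mr_your_job.py --step-num=0 --reducer',
    #           '-input', 'gs://your-tmp-bucket/tmp/files/input.txt',
    #           '-output', 'gs://your-tmp-bucket/tmp/output/',
    #       ],
    #       'fileUris': [],
    #       'archiveUris': [],
    #       'properties': {},
    #   }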

    def _launch_cluster(self):
        """Create an empty cluster on Dataproc, and set self._cluster_id to
        its ID."""
        bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
        self._create_fs_tmp_bucket(bucket_name)

        # clusterName must be a match of
        # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).'
        # as documented in an API error message
        # (not currently documented in the Dataproc docs)
        if not self._cluster_id:
            self._cluster_id = '-'.join(
                ['mrjob', self._gce_zone.lower(),
                 random_identifier()])

        # Create the cluster if it's missing, otherwise join an existing one
        try:
            self._api_cluster_get(self._cluster_id)
            log.info('Adding job to existing cluster - %s' % self._cluster_id)
        except google_errors.HttpError as e:
            if not e.resp.status == 404:
                raise

            log.info('Creating Dataproc Hadoop cluster - %s' %
                     self._cluster_id)

            cluster_data = self._cluster_create_args()

            self._api_cluster_create(cluster_data)

            self._wait_for_cluster_ready(self._cluster_id)

        # keep track of when we launched our job
        self._dataproc_job_start = time.time()
        return self._cluster_id

    def _wait_for_cluster_ready(self, cluster_id):
        # See https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters#State  # noqa
        cluster_state = None

        # Poll until cluster is ready
        while cluster_state not in _DATAPROC_CLUSTER_STATES_READY:
            result_describe = self.api_client.clusters().get(
                projectId=self._gcp_project,
                region=_DATAPROC_API_REGION,
                clusterName=cluster_id).execute()

            cluster_state = result_describe['status']['state']
            if cluster_state in _DATAPROC_CLUSTER_STATES_ERROR:
                raise DataprocException(result_describe)

            self._wait_for_api('cluster to accept jobs')

        assert cluster_state in _DATAPROC_CLUSTER_STATES_READY
        log.info("Cluster %s ready", cluster_id)

        return cluster_id

    def _dataproc_job_prefix(self):
        return _cleanse_gcp_job_id(self._job_key)

    def _run_steps(self):
        """Wait for every step of the job to complete, one by one."""
        total_steps = self._num_steps()
        # run our steps, one by one
        for step_num in range(total_steps):
            job_id = self._launch_step(step_num)

            self._wait_for_step_to_complete(job_id,
                                            step_num=step_num,
                                            num_steps=total_steps)

            log.info('Completed Dataproc Hadoop Job - %s', job_id)

        # After all steps completed, wait for the last output (which is
        # usually written to GCS) to sync
        self._wait_for_fs_sync()

    def _launch_step(self, step_num):
        # Build each step
        hadoop_job = self._build_dataproc_hadoop_job(step_num)

        # Clean-up step name
        step_name = '%s---step-%05d-of-%05d' % (
            self._dataproc_job_prefix(), step_num + 1, self._num_steps())

        # Submit it
        log.info('Submitting Dataproc Hadoop Job - %s', step_name)
        result = self._api_job_submit_hadoop(step_name, hadoop_job)
        log.info('Submitted Dataproc Hadoop Job - %s', step_name)

        job_id = result['reference']['jobId']
        assert job_id == step_name

        return job_id

    def _wait_for_step_to_complete(self,
                                   job_id,
                                   step_num=None,
                                   num_steps=None):
        """Helper for _wait_for_step_to_complete(). Wait for
        step with the given ID to complete, and fetch counters.
        If it fails, attempt to diagnose the error, and raise an
        exception.

        This also adds an item to self._log_interpretations
        """
        log_interpretation = dict(job_id=job_id)
        self._log_interpretations.append(log_interpretation)

        while True:
            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
            job_result = self._api_job_get(job_id)
            job_state = job_result['status']['state']

            log.info('%s => %s' % (job_id, job_state))

            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
            if job_state in _DATAPROC_JOB_STATES_ACTIVE:
                self._wait_for_api('job completion')
                continue

            # we're done, will return at the end of this
            elif job_state == 'DONE':
                break

            raise StepFailedException(step_num=step_num, num_steps=num_steps)

    def _intermediate_output_uri(self, step_num):
        # TODO: davidmarin @ mtai: noticed this is 1-indexed and uses
        # %05d instead of %04d. Any particular reason?
        return 'hdfs:///tmp/mrjob/%s/step-output/%05d/' % (self._job_key,
                                                           step_num + 1)

    def counters(self):
        # TODO - mtai @ davidmarin - Counters are currently always empty as we
        # are not processing task logs
        return [
            _pick_counters(log_interpretation)
            for log_interpretation in self._log_interpretations
        ]

    ### Cluster info ###

    def get_hadoop_version(self):
        if self._hadoop_version is None:
            self._store_cluster_info()
        return self._hadoop_version

    def get_image_version(self):
        """Get the version that our cluster is running.
        """
        if self._image_version is None:
            self._store_cluster_info()
        return self._image_version

    def _store_cluster_info(self):
        """Set self._image_version and self._hadoop_version."""
        if not self._cluster_id:
            raise AssertionError('cluster has not yet been created')

        cluster = self._api_cluster_get(self._cluster_id)
        self._image_version = (
            cluster['config']['softwareConfig']['imageVersion'])
        # protect against new versions, including patch versions
        # we didn't explicitly request. See #1428
        self._hadoop_version = map_version(self._image_version,
                                           _DATAPROC_IMAGE_TO_HADOOP_VERSION)

    ### Bootstrapping ###

    def _create_master_bootstrap_script_if_needed(self):
        """Helper for :py:meth:`_add_bootstrap_files_for_upload`.

        Create the master bootstrap script and write it into our local
        temp directory. Set self._master_bootstrap_script_path.

        This will do nothing if there are no bootstrap scripts or commands,
        or if it has already been called."""
        if self._master_bootstrap_script_path:
            return

        # don't bother if we're not starting a cluster
        if self._cluster_id:
            return

        # Also don't bother if we're not bootstrapping
        if not (self._bootstrap or self._bootstrap_mrjob()):
            return

        # create mrjob.zip if we need it, and add commands to install it
        mrjob_bootstrap = []
        if self._bootstrap_mrjob():
            assert self._mrjob_zip_path
            path_dict = {
                'type': 'file',
                'name': None,
                'path': self._mrjob_zip_path
            }
            self._bootstrap_dir_mgr.add(**path_dict)

            # find out where python keeps its libraries
            mrjob_bootstrap.append([
                "__mrjob_PYTHON_LIB=$(%s -c "
                "'from distutils.sysconfig import get_python_lib;"
                " print(get_python_lib())')" % cmd_line(self._python_bin())
            ])
            # unzip mrjob.zip
            mrjob_bootstrap.append(
                ['sudo unzip ', path_dict, ' -d $__mrjob_PYTHON_LIB'])
            # re-compile pyc files now, since mappers/reducers can't
            # write to this directory. Don't fail if there is extra
            # un-compileable crud in the tarball (this would matter if
            # sh_bin were 'sh -e')
            mrjob_bootstrap.append([
                'sudo %s -m compileall -f $__mrjob_PYTHON_LIB/mrjob && true' %
                cmd_line(self._python_bin())
            ])

        # we call the script b.py because there's a character limit on
        # bootstrap script names (or there was at one time, anyway)
        path = os.path.join(self._get_local_tmp_dir(), 'b.py')
        log.info('writing master bootstrap script to %s' % path)

        contents = self._master_bootstrap_script_content(self._bootstrap +
                                                         mrjob_bootstrap)
        for line in contents:
            log.debug('BOOTSTRAP: ' + line.rstrip('\r\n'))

        with open(path, 'w') as f:
            for line in contents:
                f.write(line)

        self._master_bootstrap_script_path = path

    def _bootstrap_python(self):
        """Return a (possibly empty) list of parsed commands (in the same
        format as returned by parse_setup_cmd())."""
        if not self._opts['bootstrap_python']:
            return []

        if PY2:
            # Python 2 is already installed; install pip and dev packages
            return [
                ['sudo apt-get install -y python-pip python-dev'],
            ]
        else:
            return [
                ['sudo apt-get install -y python3 python3-pip python3-dev'],
            ]

    def _parse_bootstrap(self):
        """Parse the *bootstrap* option with
        :py:func:`mrjob.setup.parse_setup_cmd()`.
        """
        return [parse_setup_cmd(cmd) for cmd in self._opts['bootstrap']]

    def _master_bootstrap_script_content(self, bootstrap):
        """Create the contents of the master bootstrap script.
        """
        out = []

        def writeln(line=''):
            out.append(line + '\n')

        # shebang
        sh_bin = self._opts['sh_bin']
        if not sh_bin[0].startswith('/'):
            sh_bin = ['/usr/bin/env'] + sh_bin
        writeln('#!' + cmd_line(sh_bin))
        writeln()

        # store $PWD
        writeln('# store $PWD')
        writeln('__mrjob_PWD=$PWD')

        # FYI - mtai @ davidmarin - begin section; mtai had to add this,
        # otherwise initialization didn't work: when $PWD is '/', the
        # resulting '//' kept blowing up in all subsequent uses of
        # $__mrjob_PWD/
        writeln('if [ $__mrjob_PWD = "/" ]; then')
        writeln('  __mrjob_PWD=""')
        writeln('fi')
        # FYI - mtai @ davidmarin - end section

        writeln()

        # download files
        writeln('# download files and mark them executable')

        cp_to_local = 'hadoop fs -copyToLocal'

        for name, path in sorted(
                self._bootstrap_dir_mgr.name_to_path('file').items()):
            uri = self._upload_mgr.uri(path)

            output_string = '%s %s $__mrjob_PWD/%s' % (
                cp_to_local, pipes.quote(uri), pipes.quote(name))

            writeln(output_string)
            # make everything executable, like Hadoop Distributed Cache
            writeln('chmod a+x $__mrjob_PWD/%s' % pipes.quote(name))
        writeln()

        # run bootstrap commands
        writeln('# bootstrap commands')
        for cmd in bootstrap:
            # reconstruct the command line, substituting $__mrjob_PWD/<name>
            # for path dicts
            line = ''
            for token in cmd:
                if isinstance(token, dict):
                    # it's a path dictionary
                    line += '$__mrjob_PWD/'
                    line += pipes.quote(self._bootstrap_dir_mgr.name(**token))
                else:
                    # it's raw script
                    line += token
            writeln(line)
        writeln()

        return out
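
    # Purely illustrative (not from the original source): assuming
    # bootstrap_python is enabled and mrjob.zip is the only uploaded file
    # (the mrjob install commands are omitted for brevity), the script
    # assembled by _master_bootstrap_script_content() comes out roughly
    # like this; the shebang depends on the sh_bin option:
    #
    #   #!<sh_bin>
    #
    #   # store $PWD
    #   __mrjob_PWD=$PWD
    #   if [ $__mrjob_PWD = "/" ]; then
    #     __mrjob_PWD=""
    #   fi
    #
    #   # download files and mark them executable
    #   hadoop fs -copyToLocal gs://<tmp-bucket>/files/mrjob.zip $__mrjob_PWD/mrjob.zip
    #   chmod a+x $__mrjob_PWD/mrjob.zip
    #
    #   # bootstrap commands
    #   sudo apt-get install -y python3 python3-pip python3-dev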

    def get_cluster_id(self):
        return self._cluster_id

    def _cluster_create_args(self):
        gcs_init_script_uris = []
        if self._master_bootstrap_script_path:
            gcs_init_script_uris.append(
                self._upload_mgr.uri(self._master_bootstrap_script_path))

            # always add idle termination script
            # add it last, so that we don't count bootstrapping as idle time
            gcs_init_script_uris.append(
                self._upload_mgr.uri(_MAX_HOURS_IDLE_BOOTSTRAP_ACTION_PATH))

        # NOTE - Cluster initializationActions can only take scripts with no
        # script args, so the auto-term script receives 'mrjob-max-secs-idle'
        # via metadata instead of as an arg
        cluster_metadata = dict()
        cluster_metadata['mrjob-version'] = mrjob.__version__
        cluster_metadata['mrjob-max-secs-idle'] = str(
            int(self._opts['max_hours_idle'] * 3600))

        cluster_config = dict(
            gceClusterConfig=dict(
                zoneUri=_gcp_zone_uri(project=self._gcp_project,
                                      zone=self._gce_zone),
                serviceAccountScopes=_DEFAULT_GCE_SERVICE_ACCOUNT_SCOPES,
                metadata=cluster_metadata),
            initializationActions=[
                dict(executableFile=init_script_uri)
                for init_script_uri in gcs_init_script_uris
            ])

        # Master node (job tracker / resource manager)
        master_conf = _gcp_instance_group_config(
            project=self._gcp_project,
            zone=self._gce_zone,
            count=1,
            instance_type=self._opts['master_instance_type'])

        # Compute + storage
        worker_conf = _gcp_instance_group_config(
            project=self._gcp_project,
            zone=self._gce_zone,
            count=self._opts['num_core_instances'],
            instance_type=self._opts['core_instance_type'])

        # Compute ONLY
        secondary_worker_conf = _gcp_instance_group_config(
            project=self._gcp_project,
            zone=self._gce_zone,
            count=self._opts['num_task_instances'],
            instance_type=self._opts['task_instance_type'],
            is_preemptible=True)

        cluster_config['masterConfig'] = master_conf
        cluster_config['workerConfig'] = worker_conf
        if self._opts['num_task_instances']:
            cluster_config['secondaryWorkerConfig'] = secondary_worker_conf

        # See - https://cloud.google.com/dataproc/dataproc-versions
        if self._opts['image_version']:
            cluster_config['softwareConfig'] = dict(
                imageVersion=self._opts['image_version'])

        return dict(projectId=self._gcp_project,
                    clusterName=self._cluster_id,
                    config=cluster_config)
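
    # Not part of the original source - a hedged sketch of the request body
    # that _cluster_create_args() assembles (field names follow the Dataproc
    # v1 REST API linked above; the concrete values here are hypothetical):
    #
    #   {
    #       'projectId': 'my-project',
    #       'clusterName': 'mrjob-us-central1-a-0123456789abcdef',
    #       'config': {
    #           'gceClusterConfig': {
    #               'zoneUri': '.../zones/us-central1-a',
    #               'serviceAccountScopes': [...],
    #               'metadata': {'mrjob-version': '...',
    #                            'mrjob-max-secs-idle': '...'},
    #           },
    #           'initializationActions': [{'executableFile': 'gs://.../b.py'}],
    #           'masterConfig': {...},           # always 1 master instance
    #           'workerConfig': {...},           # num_core_instances workers
    #           'secondaryWorkerConfig': {...},  # preemptible task instances
    #           'softwareConfig': {'imageVersion': '...'},  # if set
    #       },
    #   }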

    ### Dataproc-specific Stuff ###

    def _api_cluster_get(self, cluster_id):
        return self.api_client.clusters().get(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            clusterName=cluster_id).execute()

    def _api_cluster_create(self, cluster_data):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/create  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/get  # noqa
        return self.api_client.clusters().create(projectId=self._gcp_project,
                                                 region=_DATAPROC_API_REGION,
                                                 body=cluster_data).execute()

    def _api_cluster_delete(self, cluster_id):
        return self.api_client.clusters().delete(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            clusterName=cluster_id).execute()

    def _api_job_list(self, cluster_name=None, state_matcher=None):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/list#JobStateMatcher  # noqa
        list_kwargs = dict(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
        )
        if cluster_name:
            list_kwargs['clusterName'] = cluster_name

        if state_matcher:
            list_kwargs['jobStateMatcher'] = state_matcher

        list_request = self.api_client.jobs().list(**list_kwargs)
        while list_request:
            try:
                resp = list_request.execute()
            except google_errors.HttpError as e:
                if e.resp.status == 404:
                    return
                raise

            for current_item in resp['items']:
                yield current_item

            list_request = self.api_client.jobs().list_next(list_request, resp)

    def _api_job_get(self, job_id):
        return self.api_client.jobs().get(projectId=self._gcp_project,
                                          region=_DATAPROC_API_REGION,
                                          jobId=job_id).execute()

    def _api_job_cancel(self, job_id):
        return self.api_client.jobs().cancel(projectId=self._gcp_project,
                                             region=_DATAPROC_API_REGION,
                                             jobId=job_id).execute()

    def _api_job_delete(self, job_id):
        return self.api_client.jobs().delete(projectId=self._gcp_project,
                                             region=_DATAPROC_API_REGION,
                                             jobId=job_id).execute()

    def _api_job_submit_hadoop(self, step_name, hadoop_job):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/submit  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobReference  # noqa
        job_data = dict(reference=dict(projectId=self._gcp_project,
                                       jobId=step_name),
                        placement=dict(clusterName=self._cluster_id),
                        hadoopJob=hadoop_job)

        jobs_submit_kwargs = dict(projectId=self._gcp_project,
                                  region=_DATAPROC_API_REGION,
                                  body=dict(job=job_data))
        return self.api_client.jobs().submit(**jobs_submit_kwargs).execute()
Example #37
0
class DataprocJobRunner(MRJobRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` on Google Cloud Dataproc.
    Invoked when you run your job with ``-r dataproc``.

    :py:class:`DataprocJobRunner` runs your job in a Dataproc cluster, which
    is basically a temporary Hadoop cluster.

    Input, support, and jar files can be either local or on GCS; use
    ``gs://...`` URLs to refer to files on GCS.

    This class has some useful utilities for talking directly to GCS and
    Dataproc, so you may find it useful to instantiate it without a script::

        from mrjob.dataproc import DataprocJobRunner
        ...
    """
    alias = 'dataproc'

    # Don't need to bootstrap mrjob in the setup wrapper; that's what
    # the bootstrap script is for!
    BOOTSTRAP_MRJOB_IN_SETUP = False

    OPTION_STORE_CLASS = DataprocRunnerOptionStore

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same
        arguments as
        :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(DataprocJobRunner, self).__init__(**kwargs)

        # Lazy-load gcloud config as needed - invocations fail in PyCharm
        # debugging
        self._gcloud_config = None

        # Google Cloud Platform - project
        self._gcp_project = (
            self._opts['gcp_project'] or self.gcloud_config()['core.project'])

        # Google Compute Engine - Region / Zone
        self._gce_region = (
            self._opts['region'] or self.gcloud_config()['compute.region'])
        self._gce_zone = (
            self._opts['zone'] or self.gcloud_config()['compute.zone'])

        # cluster_id can be None here
        self._cluster_id = self._opts['cluster_id']

        self._api_client = None
        self._gcs_fs = None
        self._fs = None

        # BEGIN - setup directories
        base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir'])

        self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir)

        # use job key to make a unique tmp dir
        self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/'

        # pick/validate output dir
        if self._output_dir:
            self._output_dir = _check_and_fix_fs_dir(self._output_dir)
        else:
            self._output_dir = self._job_tmpdir + 'output/'
        # END - setup directories

        # manage working dir for bootstrap script
        self._bootstrap_dir_mgr = BootstrapWorkingDirManager()

        # manage local files that we want to upload to GCS. We'll add them
        # to this manager just before we need them.
        fs_files_dir = self._job_tmpdir + 'files/'
        self._upload_mgr = UploadDirManager(fs_files_dir)

        self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()

        for cmd in self._bootstrap:
            for maybe_path_dict in cmd:
                if isinstance(maybe_path_dict, dict):
                    self._bootstrap_dir_mgr.add(**maybe_path_dict)

        # we'll create the script later
        self._master_bootstrap_script_path = None

        # when did our particular task start?
        self._dataproc_job_start = None

        # init hadoop, image version caches
        self._image_version = None
        self._hadoop_version = None

        # This will be filled by _run_steps()
        # NOTE - each log interpretation will contain only job_id until we
        # parse task logs
        self._log_interpretations = []

    def gcloud_config(self):
        """Lazy load gcloud SDK configs"""
        if not self._gcloud_config:
            self._gcloud_config = _read_gcloud_config()

        return self._gcloud_config

    @property
    def api_client(self):
        if not self._api_client:
            credentials = GoogleCredentials.get_application_default()

            api_client = discovery.build(
                _DATAPROC_API_ENDPOINT, _DATAPROC_API_VERSION,
                credentials=credentials)
            self._api_client = api_client.projects().regions()

        return self._api_client

    @property
    def fs(self):
        """:py:class:`~mrjob.fs.base.Filesystem` object for SSH, S3, GCS, and
        the local filesystem.
        """
        if self._fs is not None:
            return self._fs

        self._gcs_fs = GCSFilesystem()

        self._fs = CompositeFilesystem(self._gcs_fs, LocalFilesystem())
        return self._fs
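
    # Illustrative note (not in the original source): the composite
    # filesystem gives the runner one interface for both GCS URIs and local
    # paths, e.g. (hypothetical paths):
    #
    #   runner.fs.exists('gs://my-bucket/tmp/input.txt')  # GCSFilesystem
    #   runner.fs.exists('/tmp/local-input.txt')          # LocalFilesystem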

    def _get_tmpdir(self, given_tmpdir):
        """Helper for _fix_tmpdir"""
        if given_tmpdir:
            return given_tmpdir

        mrjob_buckets = self.fs.list_buckets(
            self._gcp_project, prefix='mrjob-')

        # Loop over buckets until we find one that matches region
        # NOTE - because this is a tmpdir, we look for a GCS bucket in the
        # same GCE region
        chosen_bucket_name = None
        gce_lower_location = self._gce_region.lower()
        for tmp_bucket in mrjob_buckets:
            tmp_bucket_name = tmp_bucket['name']

            # NOTE - ambiguous GCP behavior: bucket location is returned in
            # UPPERCASE even though the docs suggest lowercase (ticket filed
            # as of Apr 23, 2016)
            lower_location = tmp_bucket['location'].lower()
            if lower_location == gce_lower_location:
                # Regions are both specified and match
                log.info("using existing temp bucket %s" % tmp_bucket_name)
                chosen_bucket_name = tmp_bucket_name
                break

        # Example default - "mrjob-us-central1-RANDOMHEX"
        if not chosen_bucket_name:
            chosen_bucket_name = '-'.join(
                ['mrjob', gce_lower_location, random_identifier()])

        return 'gs://%s/tmp/' % chosen_bucket_name

    def _run(self):
        self._launch()
        self._run_steps()

    def _launch(self):
        self._prepare_for_launch()
        self._launch_cluster()

    def _prepare_for_launch(self):
        self._check_input_exists()
        self._check_output_not_exists()
        self._create_setup_wrapper_script()
        self._add_bootstrap_files_for_upload()
        self._add_job_files_for_upload()
        self._upload_local_files_to_fs()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        if not self._opts['check_input_paths']:
            return

        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if is_uri(path) and not is_gcs_uri(path):
                continue  # can't check non-GCS URIs, hope for the best

            if not self.fs.exists(path):
                raise AssertionError(
                    'Input path %s does not exist!' % (path,))

    def _check_output_not_exists(self):
        """Verify the output path does not already exist. This avoids
        provisioning a cluster only to have Hadoop refuse to launch.
        """
        if self.fs.exists(self._output_dir):
            raise IOError(
                'Output path %s already exists!' % (self._output_dir,))

    def _add_bootstrap_files_for_upload(self):
        """Add files needed by the bootstrap script to self._upload_mgr.

        Create a zip of mrjob if bootstrap_mrjob is True.

        Create the master bootstrap script if necessary.

        """
        # lazily create mrjob.zip
        if self._bootstrap_mrjob():
            self._create_mrjob_zip()
            self._bootstrap_dir_mgr.add('file', self._mrjob_zip_path)

        # all other files needed by the script are already in
        # _bootstrap_dir_mgr
        for path in self._bootstrap_dir_mgr.paths():
            self._upload_mgr.add(path)

        # now that we know where the above files live, we can create
        # the master bootstrap script
        self._create_master_bootstrap_script_if_needed()
        if self._master_bootstrap_script_path:
            self._upload_mgr.add(self._master_bootstrap_script_path)
            self._upload_mgr.add(_MAX_HOURS_IDLE_BOOTSTRAP_ACTION_PATH)

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

        # TODO - mtai @ davidmarin - hadoop_streaming_jar is currently ignored,
        # see _HADOOP_STREAMING_JAR_URI
        # if self._opts['hadoop_streaming_jar']:
        #     self._upload_mgr.add(self._opts['hadoop_streaming_jar'])

        for step in self._get_steps():
            if step.get('jar'):
                self._upload_mgr.add(step['jar'])

    def _upload_local_files_to_fs(self):
        """Copy local files tracked by self._upload_mgr to FS."""
        bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
        self._create_fs_tmp_bucket(bucket_name)

        log.info('Copying non-input files into %s' % self._upload_mgr.prefix)

        for path, gcs_uri in self._upload_mgr.path_to_uri().items():
            log.debug('uploading %s -> %s' % (path, gcs_uri))

            # TODO - mtai @ davidmarin - Implement put function for other FSs
            self.fs.put(path, gcs_uri)

        self._wait_for_fs_sync()

    def _create_fs_tmp_bucket(self, bucket_name, location=None):
        """Create a temp bucket if missing

        Tie the temporary bucket to the same region as the GCE job and set a
        28-day TTL
        """
        # Return early if our bucket already exists
        try:
            self.fs.get_bucket(bucket_name)
            return
        except google_errors.HttpError as e:
            if not e.resp.status == 404:
                raise

        log.info('creating FS bucket %r' % bucket_name)

        location = location or self._gce_region

        # NOTE - By default, we create a bucket in the same GCE region as our
        # job (tmp buckets ONLY)
        # https://cloud.google.com/storage/docs/bucket-locations
        self.fs.create_bucket(
            self._gcp_project, bucket_name, location=location,
            object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS)

        self._wait_for_fs_sync()

    ### Running the job ###
    def cleanup(self, mode=None):
        super(DataprocJobRunner, self).cleanup(mode=mode)

        # stop the cluster if it belongs to us (it may have stopped on its
        # own already, but that's fine)
        if self._cluster_id and not self._opts['cluster_id']:
            self._cleanup_cluster()

    def _cleanup_cloud_tmp(self):
        # delete all the files we created
        if not self._job_tmpdir:
            return

        try:
            log.info('Removing all files in %s' % self._job_tmpdir)
            self.fs.rm(self._job_tmpdir)
            self._job_tmpdir = None
        except Exception as e:
            log.exception(e)

    # TODO - mtai @ davidmarin - Re-enable log support and supporting cleanup
    def _cleanup_logs(self):
        super(DataprocJobRunner, self)._cleanup_logs()

    def _cleanup_job(self):
        job_prefix = self._dataproc_job_prefix()
        for current_job in self._api_job_list(
                cluster_name=self._cluster_id, state_matcher='ACTIVE'):
            # Kill all active jobs with the same job_prefix as this job
            current_job_id = current_job['reference']['jobId']

            if not current_job_id.startswith(job_prefix):
                continue

            self._api_job_cancel(current_job_id)
            self._wait_for_api('job cancellation')

    def _cleanup_cluster(self):
        if not self._cluster_id:
            # If we don't have a cluster, then we can't terminate it.
            return

        try:
            log.info("Attempting to terminate cluster")
            self._api_cluster_delete(self._cluster_id)
        except Exception as e:
            log.exception(e)
            return
        log.info('cluster %s successfully terminated' % self._cluster_id)

    def _wait_for_api(self, msg):
        _wait_for(msg, self._opts['check_cluster_every'])

    def _wait_for_fs_sync(self):
        """Sleep for a little while, to give FS a chance to sync up.
        """
        _wait_for('GCS sync (eventual consistency)',
                  self._opts['cloud_fs_sync_secs'])

    def _build_dataproc_hadoop_job(self, step_num):
        """This function creates a "HadoopJob" to be passed to
        self._api_job_submit_hadoop

        :param step_num:
        :return: output_hadoop_job
        """
        # Reference: https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa

        args = list()
        file_uris = list()
        archive_uris = list()
        properties = dict()

        step = self._get_step(step_num)

        assert step['type'] in ('streaming', 'jar'), (
            'Bad step type: %r' % (step['type'],))

        # TODO - mtai @ davidmarin - Might be trivial to support jar running,
        # see "mainJarFileUri" of variable "output_hadoop_job" in this function
        #         https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa

        assert step['type'] == 'streaming', 'Jar not implemented'
        main_jar_uri = _HADOOP_STREAMING_JAR_URI

        # TODO - mtai @ davidmarin - Not clear if we should move _upload_args
        # to file_uris, currently works fine as-is

        # TODO - dmarin @ mtai - Probably a little safer to do the API's way,
        # assuming the API supports distributed cache syntax (so we can pick
        # the names of the uploaded files).
        args.extend(self._upload_args())

        args.extend(self._hadoop_args_for_step(step_num))

        mapper, combiner, reducer = self._hadoop_streaming_commands(step_num)

        if mapper:
            args += ['-mapper', mapper]

        if combiner:
            args += ['-combiner', combiner]

        if reducer:
            args += ['-reducer', reducer]

        for current_input_uri in self._step_input_uris(step_num):
            args += ['-input', current_input_uri]

        args += ['-output', self._step_output_uri(step_num)]

        # TODO - mtai @ davidmarin - Add back support to specify a different
        # mainJarFileURI
        output_hadoop_job = dict(
            args=args,
            fileUris=file_uris,
            archiveUris=archive_uris,
            properties=properties,
            mainJarFileUri=main_jar_uri
        )
        return output_hadoop_job

    def _launch_cluster(self):
        """Create an empty cluster on Dataproc, and set self._cluster_id to
        its ID."""
        bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
        self._create_fs_tmp_bucket(bucket_name)

        # clusterName must be a match of
        # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).'
        # as documented in an API error message
        # (not currently documented in the Dataproc docs)
        if not self._cluster_id:
            self._cluster_id = '-'.join(
                ['mrjob', self._gce_zone.lower(), random_identifier()])

        # Create the cluster if it's missing, otherwise join an existing one
        try:
            self._api_cluster_get(self._cluster_id)
            log.info('Adding job to existing cluster - %s' % self._cluster_id)
        except google_errors.HttpError as e:
            if not e.resp.status == 404:
                raise

            log.info(
                'Creating Dataproc Hadoop cluster - %s' % self._cluster_id)

            cluster_data = self._cluster_create_args()

            self._api_cluster_create(cluster_data)

            self._wait_for_cluster_ready(self._cluster_id)

        # keep track of when we launched our job
        self._dataproc_job_start = time.time()
        return self._cluster_id

    def _wait_for_cluster_ready(self, cluster_id):
        # See https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters#State  # noqa
        cluster_state = None

        # Poll until cluster is ready
        while cluster_state not in _DATAPROC_CLUSTER_STATES_READY:
            result_describe = self.api_client.clusters().get(
                projectId=self._gcp_project,
                region=_DATAPROC_API_REGION,
                clusterName=cluster_id).execute()

            cluster_state = result_describe['status']['state']
            if cluster_state in _DATAPROC_CLUSTER_STATES_ERROR:
                raise DataprocException(result_describe)

            self._wait_for_api('cluster to accept jobs')

        assert cluster_state in _DATAPROC_CLUSTER_STATES_READY
        log.info("Cluster %s ready", cluster_id)

        return cluster_id

    def _dataproc_job_prefix(self):
        return _cleanse_gcp_job_id(self._job_key)

    def _run_steps(self):
        """Wait for every step of the job to complete, one by one."""
        total_steps = self._num_steps()
        # run our steps, one by one
        for step_num in range(total_steps):
            job_id = self._launch_step(step_num)

            self._wait_for_step_to_complete(
                job_id, step_num=step_num, num_steps=total_steps)

            log.info('Completed Dataproc Hadoop Job - %s', job_id)

        # After all steps completed, wait for the last output (which is
        # usually written to GCS) to sync
        self._wait_for_fs_sync()

    def _launch_step(self, step_num):
        # Build each step
        hadoop_job = self._build_dataproc_hadoop_job(step_num)

        # Clean-up step name
        step_name = '%s---step-%05d-of-%05d' % (
            self._dataproc_job_prefix(), step_num + 1, self._num_steps())

        # Submit it
        log.info('Submitting Dataproc Hadoop Job - %s', step_name)
        result = self._api_job_submit_hadoop(step_name, hadoop_job)
        log.info('Submitted Dataproc Hadoop Job - %s', step_name)

        job_id = result['reference']['jobId']
        assert job_id == step_name

        return job_id
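
    # For illustration (not from the original source): given the step-name
    # format above, a two-step job whose cleansed job key is, say,
    # 'mr-word-count-user-20160423-000000-000000' would submit Dataproc jobs
    # named (and, per the assert above, with job IDs equal to):
    #
    #   mr-word-count-user-20160423-000000-000000---step-00001-of-00002
    #   mr-word-count-user-20160423-000000-000000---step-00002-of-00002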

    def _wait_for_step_to_complete(
            self, job_id, step_num=None, num_steps=None):
        """Helper for _wait_for_step_to_complete(). Wait for
        step with the given ID to complete, and fetch counters.
        If it fails, attempt to diagnose the error, and raise an
        exception.

        This also adds an item to self._log_interpretations
        """
        log_interpretation = dict(job_id=job_id)
        self._log_interpretations.append(log_interpretation)

        while True:
            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
            job_result = self._api_job_get(job_id)
            job_state = job_result['status']['state']

            log.info('%s => %s' % (job_id, job_state))

            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
            if job_state in _DATAPROC_JOB_STATES_ACTIVE:
                self._wait_for_api('job completion')
                continue

            # we're done, will return at the end of this
            elif job_state == 'DONE':
                break

            raise StepFailedException(step_num=step_num, num_steps=num_steps)

    def _intermediate_output_uri(self, step_num):
        # TODO: davidmarin @ mtai: noticed this is 1-indexed and uses
        # %05d instead of %04d. Any particular reason?
        return 'hdfs:///tmp/mrjob/%s/step-output/%05d/' % (
            self._job_key, step_num + 1)

    def counters(self):
        # TODO - mtai @ davidmarin - Counters are currently always empty as we
        # are not processing task logs
        return [_pick_counters(log_interpretation)
                for log_interpretation in self._log_interpretations]

    ### Cluster info ###

    def get_hadoop_version(self):
        if self._hadoop_version is None:
            self._store_cluster_info()
        return self._hadoop_version

    def get_image_version(self):
        """Get the version that our cluster is running.
        """
        if self._image_version is None:
            self._store_cluster_info()
        return self._image_version

    def _store_cluster_info(self):
        """Set self._image_version and self._hadoop_version."""
        if not self._cluster_id:
            raise AssertionError('cluster has not yet been created')

        cluster = self._api_cluster_get(self._cluster_id)
        self._image_version = (
            cluster['config']['softwareConfig']['imageVersion'])
        # protect against new versions, including patch versions
        # we didn't explicitly request. See #1428
        self._hadoop_version = map_version(
            self._image_version, _DATAPROC_IMAGE_TO_HADOOP_VERSION)

    ### Bootstrapping ###

    def _create_master_bootstrap_script_if_needed(self):
        """Helper for :py:meth:`_add_bootstrap_files_for_upload`.

        Create the master bootstrap script and write it into our local
        temp directory. Set self._master_bootstrap_script_path.

        This will do nothing if there are no bootstrap scripts or commands,
        or if it has already been called."""
        if self._master_bootstrap_script_path:
            return

        # don't bother if we're not starting a cluster
        if self._cluster_id:
            return

        # Also don't bother if we're not bootstrapping
        if not (self._bootstrap or self._bootstrap_mrjob()):
            return

        # create mrjob.zip if we need it, and add commands to install it
        mrjob_bootstrap = []
        if self._bootstrap_mrjob():
            assert self._mrjob_zip_path
            path_dict = {
                'type': 'file', 'name': None, 'path': self._mrjob_zip_path}
            self._bootstrap_dir_mgr.add(**path_dict)

            # find out where python keeps its libraries
            mrjob_bootstrap.append([
                "__mrjob_PYTHON_LIB=$(%s -c "
                "'from distutils.sysconfig import get_python_lib;"
                " print(get_python_lib())')" %
                cmd_line(self._python_bin())])
            # unzip mrjob.zip
            mrjob_bootstrap.append(
                ['sudo unzip ', path_dict, ' -d $__mrjob_PYTHON_LIB'])
            # re-compile pyc files now, since mappers/reducers can't
            # write to this directory. Don't fail if there is extra
            # un-compileable crud in the tarball (this would matter if
            # sh_bin were 'sh -e')
            mrjob_bootstrap.append(
                ['sudo %s -m compileall -q'
                 ' -f $__mrjob_PYTHON_LIB/mrjob && true' %
                 cmd_line(self._python_bin())])

        # we call the script b.py because there's a character limit on
        # bootstrap script names (or there was at one time, anyway)
        path = os.path.join(self._get_local_tmp_dir(), 'b.py')
        log.info('writing master bootstrap script to %s' % path)

        contents = self._master_bootstrap_script_content(
            self._bootstrap + mrjob_bootstrap)
        for line in contents:
            log.debug('BOOTSTRAP: ' + line.rstrip('\r\n'))

        with open(path, 'w') as f:
            for line in contents:
                f.write(line)

        self._master_bootstrap_script_path = path

    def _bootstrap_python(self):
        """Return a (possibly empty) list of parsed commands (in the same
        format as returned by parse_setup_cmd())."""
        if not self._opts['bootstrap_python']:
            return []

        if PY2:
            # Python 2 is already installed; install pip and dev packages
            return [
                ['sudo apt-get install -y python-pip python-dev'],
            ]
        else:
            return [
                ['sudo apt-get install -y python3 python3-pip python3-dev'],
            ]

    def _parse_bootstrap(self):
        """Parse the *bootstrap* option with
        :py:func:`mrjob.setup.parse_setup_cmd()`.
        """
        return [parse_setup_cmd(cmd) for cmd in self._opts['bootstrap']]

    def _master_bootstrap_script_content(self, bootstrap):
        """Create the contents of the master bootstrap script.
        """
        out = []

        def writeln(line=''):
            out.append(line + '\n')

        # shebang
        sh_bin = self._opts['sh_bin']
        if not sh_bin[0].startswith('/'):
            sh_bin = ['/usr/bin/env'] + sh_bin
        writeln('#!' + cmd_line(sh_bin))
        writeln()

        # store $PWD
        writeln('# store $PWD')
        writeln('__mrjob_PWD=$PWD')

        # FYI - mtai @ davidmarin - begin section; mtai had to add this,
        # otherwise initialization didn't work: when $PWD is '/', the
        # resulting '//' kept blowing up in all subsequent uses of
        # $__mrjob_PWD/
        writeln('if [ $__mrjob_PWD = "/" ]; then')
        writeln('  __mrjob_PWD=""')
        writeln('fi')
        # FYI - mtai @ davidmarin - end section

        writeln()

        # download files
        writeln('# download files and mark them executable')

        cp_to_local = 'hadoop fs -copyToLocal'

        for name, path in sorted(
                self._bootstrap_dir_mgr.name_to_path('file').items()):
            uri = self._upload_mgr.uri(path)

            output_string = '%s %s $__mrjob_PWD/%s' % (
                cp_to_local, pipes.quote(uri), pipes.quote(name))

            writeln(output_string)
            # make everything executable, like Hadoop Distributed Cache
            writeln('chmod a+x $__mrjob_PWD/%s' % pipes.quote(name))
        writeln()

        # run bootstrap commands
        writeln('# bootstrap commands')
        for cmd in bootstrap:
            # reconstruct the command line, substituting $__mrjob_PWD/<name>
            # for path dicts
            line = ''
            for token in cmd:
                if isinstance(token, dict):
                    # it's a path dictionary
                    line += '$__mrjob_PWD/'
                    line += pipes.quote(self._bootstrap_dir_mgr.name(**token))
                else:
                    # it's raw script
                    line += token
            writeln(line)
        writeln()

        return out

    def get_cluster_id(self):
        return self._cluster_id

    def _cluster_create_args(self):
        gcs_init_script_uris = []
        if self._master_bootstrap_script_path:
            gcs_init_script_uris.append(
                self._upload_mgr.uri(self._master_bootstrap_script_path))

            # always add idle termination script
            # add it last, so that we don't count bootstrapping as idle time
            gcs_init_script_uris.append(
                self._upload_mgr.uri(_MAX_HOURS_IDLE_BOOTSTRAP_ACTION_PATH))

        # NOTE - Cluster initializationActions can only take scripts with no
        # script args, so the auto-term script receives 'mrjob-max-secs-idle'
        # via metadata instead of as an arg
        cluster_metadata = dict()
        cluster_metadata['mrjob-version'] = mrjob.__version__
        cluster_metadata['mrjob-max-secs-idle'] = str(int(
            self._opts['max_hours_idle'] * 3600))

        cluster_config = dict(
            gceClusterConfig=dict(
                zoneUri=_gcp_zone_uri(
                    project=self._gcp_project, zone=self._gce_zone),
                serviceAccountScopes=_DEFAULT_GCE_SERVICE_ACCOUNT_SCOPES,
                metadata=cluster_metadata
            ),
            initializationActions=[
                dict(executableFile=init_script_uri)
                for init_script_uri in gcs_init_script_uris
            ]
        )

        # Master node (job tracker / resource manager)
        master_conf = _gcp_instance_group_config(
            project=self._gcp_project, zone=self._gce_zone,
            count=1, instance_type=self._opts['master_instance_type']
        )

        # Compute + storage
        worker_conf = _gcp_instance_group_config(
            project=self._gcp_project, zone=self._gce_zone,
            count=self._opts['num_core_instances'],
            instance_type=self._opts['core_instance_type']
        )

        # Compute ONLY
        secondary_worker_conf = _gcp_instance_group_config(
            project=self._gcp_project, zone=self._gce_zone,
            count=self._opts['num_task_instances'],
            instance_type=self._opts['task_instance_type'],
            is_preemptible=True
        )

        cluster_config['masterConfig'] = master_conf
        cluster_config['workerConfig'] = worker_conf
        if self._opts['num_task_instances']:
            cluster_config['secondaryWorkerConfig'] = secondary_worker_conf

        # See - https://cloud.google.com/dataproc/dataproc-versions
        if self._opts['image_version']:
            cluster_config['softwareConfig'] = dict(
                imageVersion=self._opts['image_version'])

        return dict(projectId=self._gcp_project,
                    clusterName=self._cluster_id,
                    config=cluster_config)

    ### Dataproc-specific Stuff ###

    def _api_cluster_get(self, cluster_id):
        return self.api_client.clusters().get(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            clusterName=cluster_id
        ).execute()

    def _api_cluster_create(self, cluster_data):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/create  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/get  # noqa
        return self.api_client.clusters().create(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            body=cluster_data
        ).execute()

    def _api_cluster_delete(self, cluster_id):
        return self.api_client.clusters().delete(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            clusterName=cluster_id
        ).execute()

    def _api_job_list(self, cluster_name=None, state_matcher=None):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/list#JobStateMatcher  # noqa
        list_kwargs = dict(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
        )
        if cluster_name:
            list_kwargs['clusterName'] = cluster_name

        if state_matcher:
            list_kwargs['jobStateMatcher'] = state_matcher

        list_request = self.api_client.jobs().list(**list_kwargs)
        while list_request:
            try:
                resp = list_request.execute()
            except google_errors.HttpError as e:
                if e.resp.status == 404:
                    return
                raise

            for current_item in resp['items']:
                yield current_item

            list_request = self.api_client.jobs().list_next(list_request, resp)

    def _api_job_get(self, job_id):
        return self.api_client.jobs().get(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            jobId=job_id
        ).execute()

    def _api_job_cancel(self, job_id):
        return self.api_client.jobs().cancel(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            jobId=job_id
        ).execute()

    def _api_job_delete(self, job_id):
        return self.api_client.jobs().delete(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            jobId=job_id
        ).execute()

    def _api_job_submit_hadoop(self, step_name, hadoop_job):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/submit  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobReference  # noqa
        job_data = dict(
            reference=dict(projectId=self._gcp_project, jobId=step_name),
            placement=dict(clusterName=self._cluster_id),
            hadoopJob=hadoop_job
        )

        jobs_submit_kwargs = dict(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            body=dict(job=job_data)
        )
        return self.api_client.jobs().submit(**jobs_submit_kwargs).execute()
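Below is a minimal sketch, not taken from the runner above, of how the _api_job_* helpers could be combined to submit a step and poll it to completion; the status.state values are assumptions based on the Dataproc REST docs linked in the comments.

import time

def _submit_and_wait(runner, step_name, hadoop_job, poll_interval=10):
    """Hypothetical helper: submit a Hadoop step, then poll until it ends."""
    result = runner._api_job_submit_hadoop(step_name, hadoop_job)
    job_id = result['reference']['jobId']

    while True:
        job = runner._api_job_get(job_id)
        state = job['status']['state']  # assumed: PENDING / RUNNING / DONE / ERROR
        if state in ('DONE', 'ERROR', 'CANCELLED'):
            return job
        time.sleep(poll_interval)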
Example #38
0
class HadoopJobRunner(MRJobRunner, LogInterpretationMixin):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.
    Invoked when you run your job with ``-r hadoop``.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """
    alias = 'hadoop'

    OPTION_STORE_CLASS = HadoopRunnerOptionStore

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hadoop_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(
                self._opts['hadoop_tmp_dir'], self._job_key))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '')
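        # (the trailing '' makes the prefix end in '/', so uploaded files get
        # URIs like <hadoop_tmp_dir>/files/<name>)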
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or
            posixpath.join(self._hadoop_tmp_dir, 'output'))

        # Track job and (YARN) application ID to enable log parsing
        self._application_id = None
        self._job_id = None

        # Keep track of where the hadoop streaming jar is
        self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar']
        self._searched_for_hadoop_streaming_jar = False

        # List of dicts (one for each step) potentially containing
        # the keys 'history', 'step', and 'task' ('step' will always
        # be filled because it comes from the hadoop jar command output,
        # others will be filled as needed)
        self._log_interpretations = []

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem(
                HadoopFilesystem(self._opts['hadoop_bin']),
                LocalFilesystem())
        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        return self.fs.get_hadoop_version()

    def get_hadoop_bin(self):
        """Find the hadoop binary. A list: binary followed by arguments."""
        return self.fs.get_hadoop_bin()

    def get_hadoop_streaming_jar(self):
        """Find the path of the hadoop streaming jar, or None if not found."""
        if not (self._hadoop_streaming_jar or
                self._searched_for_hadoop_streaming_jar):

            self._hadoop_streaming_jar = self._find_hadoop_streaming_jar()

            if self._hadoop_streaming_jar:
                log.info('Found Hadoop streaming jar: %s' %
                         self._hadoop_streaming_jar)
            else:
                log.warning('Hadoop streaming jar not found. Use'
                            ' --hadoop-streaming-jar')

            self._searched_for_hadoop_streaming_jar = True

        return self._hadoop_streaming_jar

    def _find_hadoop_streaming_jar(self):
        """Search for the hadoop streaming jar. See
        :py:meth:`_hadoop_streaming_jar_dirs` for where we search."""
        for path in unique(self._hadoop_streaming_jar_dirs()):
            log.info('Looking for Hadoop streaming jar in %s' % path)

            streaming_jars = []
            for path in self.fs.ls(path):
                if HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)):
                    streaming_jars.append(path)

            if streaming_jars:
                # prefer shorter names and shallower paths
                def sort_key(p):
                    return (len(p.split('/')),
                            len(posixpath.basename(p)),
                            p)
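                # e.g. '/usr/lib/hadoop/hadoop-streaming.jar' (5 path parts)
                # sorts ahead of
                # '/usr/lib/hadoop/contrib/streaming/hadoop-streaming.jar'
                # (7 parts); for equally deep paths the shorter basename wins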

                streaming_jars.sort(key=sort_key)

                return streaming_jars[0]

        return None

    def _hadoop_dirs(self):
        """Yield all possible hadoop directories (used for streaming jar
        and logs). May yield duplicates"""
        if self._opts['hadoop_home']:
            yield self._opts['hadoop_home']

        for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL',
                     'HADOOP_MAPRED_HOME'):
            path = os.environ.get(name)
            if path:
                yield path

        # guess it from the path of the Hadoop binary
        hadoop_home = hadoop_prefix_from_bin(self.get_hadoop_bin()[0])
        if hadoop_home:
            yield hadoop_home

        # try HADOOP_*_HOME
        for name, path in sorted(os.environ.items()):
            if name.startswith('HADOOP_') and name.endswith('_HOME'):
                yield path

    def _hadoop_streaming_jar_dirs(self):
        """Yield all possible places to look for the Hadoop streaming jar.
        May yield duplicates.
        """
        for hadoop_dir in self._hadoop_dirs():
            yield hadoop_dir

        # use hard-coded paths to work out-of-the-box on EMR
        for path in _EMR_HADOOP_STREAMING_JAR_DIRS:
            yield path

    def _hadoop_log_dirs(self, output_dir=None):
        """Yield all possible places to look for hadoop logs."""
        # hadoop_log_dirs opt overrides all this
        if self._opts['hadoop_log_dirs']:
            for path in self._opts['hadoop_log_dirs']:
                yield path
            return

        hadoop_log_dir = os.environ.get('HADOOP_LOG_DIR')
        if hadoop_log_dir:
            yield hadoop_log_dir

        if uses_yarn(self.get_hadoop_version()):
            yarn_log_dir = os.environ.get('YARN_LOG_DIR')
            if yarn_log_dir:
                yield yarn_log_dir

            yield _DEFAULT_YARN_HDFS_LOG_DIR

        if output_dir:
            # Cloudera style of logging
            yield posixpath.join(output_dir, '_logs')

        for hadoop_dir in self._hadoop_dirs():
            yield posixpath.join(hadoop_dir, 'logs')

        # hard-coded log paths for EMR, so this can work out-of-the-box
        for path in _EMR_HADOOP_LOG_DIRS:
            yield path

    def _run(self):
        self._check_input_exists()
        self._create_setup_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if self._opts['check_input_paths']:
                if not self.fs.exists(path):
                    raise AssertionError(
                        'Input path %s does not exist!' % (path,))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self.fs.mkdir(self._upload_mgr.prefix)

        log.info('Copying local files into %s' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().items():
            self._upload_to_hdfs(path, uri)

    def _upload_to_hdfs(self, path, target):
        log.debug('Uploading %s -> %s on HDFS' % (path, target))
        self.fs._put(path, target)

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN')
        # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s' % stdin_path)
        # use a with block so the dump is flushed and closed before the
        # caller reads from stdin_path
        with open(stdin_path, 'wb') as stdin_file:
            for line in self._stdin:
                stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        for step_num in range(self._num_steps()):
            step_args = self._args_for_step(step_num)

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr,
                    record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_hadoop(to_string(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvp(step_args[0], step_args)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._hdfs_step_output_dir(step_num))

            log_interpretation['step'] = step_interpretation

            counters = self._pick_counters(log_interpretation)
            if counters:
                log.info(_format_counters(counters))
            else:
                log.warning('No counters found')

            if returncode:
                error = self._pick_error(log_interpretation)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(
                    reason=reason, step_num=step_num,
                    num_steps=self._num_steps())

    def _args_for_step(self, step_num):
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._args_for_streaming_step(step_num)
        elif step['type'] == 'jar':
            return self._args_for_jar_step(step_num)
        else:
            raise AssertionError('Bad step type: %r' % (step['type'],))

    def _args_for_streaming_step(self, step_num):
        hadoop_streaming_jar = self.get_hadoop_streaming_jar()
        if not hadoop_streaming_jar:
            raise Exception('no Hadoop streaming jar')

        args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar]

        # set up uploading from HDFS to the working dir
        args.extend(
            self._upload_args(self._upload_mgr))

        # Add extra hadoop args first as hadoop args could be a hadoop
        # specific argument (e.g. -libjar) which must come before job
        # specific args.
        args.extend(self._hadoop_args_for_step(step_num))

        mapper, combiner, reducer = (
            self._hadoop_streaming_commands(step_num))

        # if no reducer, shut off reducer tasks
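        # (translate_jobconf() is expected to pick the property name matching
        # the Hadoop version in use, e.g. 'mapred.reduce.tasks' on 1.x)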
        if not reducer:
            args.extend(['-D', ('%s=0' % translate_jobconf(
                'mapreduce.job.reduces', self.get_hadoop_version()))])

        # set up input
        for input_uri in self._hdfs_step_input_files(step_num):
            args.extend(['-input', input_uri])

        # set up output
        args.append('-output')
        args.append(self._hdfs_step_output_dir(step_num))

        args.append('-mapper')
        args.append(mapper)

        if combiner:
            args.append('-combiner')
            args.append(combiner)

        if reducer:
            args.append('-reducer')
            args.append(reducer)

        return args

    def _args_for_jar_step(self, step_num):
        step = self._get_step(step_num)

        # special case for consistency with EMR runner.
        #
        # This might look less like duplicated code if we ever
        # implement #780 (fetching jars from URIs)
        if step['jar'].startswith('file:///'):
            jar = step['jar'][7:]  # keep leading slash
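            # e.g. 'file:///usr/lib/foo.jar' -> '/usr/lib/foo.jar'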
        else:
            jar = step['jar']

        args = self.get_hadoop_bin() + ['jar', jar]

        if step.get('main_class'):
            args.append(step['main_class'])

        # TODO: merge with logic in mrjob/emr.py
        def interpolate(arg):
            if arg == mrjob.step.JarStep.INPUT:
                return ','.join(self._hdfs_step_input_files(step_num))
            elif arg == mrjob.step.JarStep.OUTPUT:
                return self._hdfs_step_output_dir(step_num)
            else:
                return arg

        if step.get('args'):
            args.extend(interpolate(arg) for arg in step['args'])

        return args

    def _hdfs_step_input_files(self, step_num):
        """Get the hdfs:// URI for input for the given step."""
        if step_num == 0:
            return [self._upload_mgr.uri(p)
                    for p in self._get_input_paths()]
        else:
            return [posixpath.join(
                self._hadoop_tmp_dir, 'step-output', str(step_num))]

    def _hdfs_step_output_dir(self, step_num):
        if step_num == len(self._get_steps()) - 1:
            return self._output_dir
        else:
            return posixpath.join(
                self._hadoop_tmp_dir, 'step-output', str(step_num + 1))
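
    # To illustrate the two methods above for a hypothetical 3-step job,
    # data flows through HDFS as:
    #   uploaded input -> <tmp>/step-output/1 -> <tmp>/step-output/2
    #   -> self._output_dir
    # where <tmp> is self._hadoop_tmp_dir.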

    def _cleanup_local_tmp(self):
        super(HadoopJobRunner, self)._cleanup_local_tmp()

        if self._hadoop_tmp_dir:
            log.info('deleting %s from HDFS' % self._hadoop_tmp_dir)
            try:
                self.fs.rm(self._hadoop_tmp_dir)
            except Exception as e:
                log.exception(e)

    ### LOG (implementation of LogInterpretationMixin) ###

    def _stream_history_log_dirs(self, output_dir=None):
        """Yield lists of directories to look for the history log in."""
        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if self.fs.exists(log_dir):
                log.info('Looking for history log in %s...' % log_dir)
                # logs aren't always in a subdir named history/
                yield [log_dir]

    def _stream_task_log_dirs(self, application_id=None, output_dir=None):
        """Yield lists of directories to look for the task logs in."""
        # Note: this is unlikely to be super-helpful on "real" (multi-node)
        # pre-YARN Hadoop because task logs aren't generally shipped to a
        # local directory. It's a start, anyways. See #1201.
        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if application_id:
                path = self.fs.join(log_dir, 'userlogs', application_id)
            else:
                path = self.fs.join(log_dir, 'userlogs')

            if self.fs.exists(path):
                log.info('Looking for task syslogs in %s...' % path)
                yield [path]

    def counters(self):
        return [_pick_counters(log_interpretation)
                for log_interpretation in self._log_interpretations]
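As a usage sketch (MRWordCount and my_job.py are hypothetical; the output-reading calls follow the mrjob runner API of this era and may differ in other versions), a driver script exercising this runner might look like:

from my_job import MRWordCount   # hypothetical MRJob subclass

# '-r hadoop' selects HadoopJobRunner through its alias
job = MRWordCount(args=['-r', 'hadoop', 'hdfs:///data/words.txt'])

with job.make_runner() as runner:
    runner.run()                     # ends up in _run() above
    for line in runner.stream_output():
        key, value = job.parse_output_line(line)
        print(key, value)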
Example #39
0
 def test_empty(self):
     sd = UploadDirManager("hdfs:///")
     self.assertEqual(sd.path_to_uri(), {})
Example #40
0
class HadoopJobRunner(MRJobBinRunner, LogInterpretationMixin):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.
    Invoked when you run your job with ``-r hadoop``.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """
    alias = 'hadoop'

    OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
        'bootstrap_spark',
        'hadoop_bin',
        'hadoop_extra_args',
        'hadoop_log_dirs',
        'hadoop_streaming_jar',
        'hadoop_tmp_dir',
        'spark_master',
        'spark_submit_bin',  # read in __init__() below
    }

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hadoop_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(
                self._opts['hadoop_tmp_dir'], self._job_key))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or
            posixpath.join(self._hadoop_tmp_dir, 'output'))

        # Fully qualify step_output_dir, if set
        if self._step_output_dir:
            self._step_output_dir = fully_qualify_hdfs_path(
                self._step_output_dir)

        # Track job and (YARN) application ID to enable log parsing
        self._application_id = None
        self._job_id = None

        # Keep track of where the hadoop streaming jar is
        self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar']
        self._searched_for_hadoop_streaming_jar = False

        # Keep track of where the spark-submit binary is
        self._spark_submit_bin = self._opts['spark_submit_bin']

        # List of dicts (one for each step) potentially containing
        # the keys 'history', 'step', and 'task' ('step' will always
        # be filled because it comes from the hadoop jar command output,
        # others will be filled as needed)
        self._log_interpretations = []

    def _default_opts(self):
        return combine_dicts(
            super(HadoopJobRunner, self)._default_opts(),
            dict(
                hadoop_tmp_dir='tmp/mrjob',
                spark_master='yarn',
            )
        )

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem(
                HadoopFilesystem(self._opts['hadoop_bin']),
                LocalFilesystem())
        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        return self.fs.get_hadoop_version()

    def get_hadoop_bin(self):
        """Find the hadoop binary. A list: binary followed by arguments."""
        return self.fs.get_hadoop_bin()

    def get_hadoop_streaming_jar(self):
        """Find the path of the hadoop streaming jar, or None if not found."""
        if not (self._hadoop_streaming_jar or
                self._searched_for_hadoop_streaming_jar):

            self._hadoop_streaming_jar = self._find_hadoop_streaming_jar()

            if self._hadoop_streaming_jar:
                log.info('Found Hadoop streaming jar: %s' %
                         self._hadoop_streaming_jar)
            else:
                log.warning('Hadoop streaming jar not found. Use'
                            ' --hadoop-streaming-jar')

            self._searched_for_hadoop_streaming_jar = True

        return self._hadoop_streaming_jar

    def _find_hadoop_streaming_jar(self):
        """Search for the hadoop streaming jar. See
        :py:meth:`_hadoop_streaming_jar_dirs` for where we search."""
        for path in unique(self._hadoop_streaming_jar_dirs()):
            log.info('Looking for Hadoop streaming jar in %s...' % path)

            streaming_jars = []
            for path in self.fs.ls(path):
                if _HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)):
                    streaming_jars.append(path)

            if streaming_jars:
                # prefer shorter names and shallower paths
                def sort_key(p):
                    return (len(p.split('/')),
                            len(posixpath.basename(p)),
                            p)

                streaming_jars.sort(key=sort_key)

                return streaming_jars[0]

        return None

    def _hadoop_dirs(self):
        """Yield all possible hadoop directories (used for streaming jar
        and logs). May yield duplicates"""
        for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL',
                     'HADOOP_MAPRED_HOME'):
            path = os.environ.get(name)
            if path:
                yield path

        # guess it from the path of the Hadoop binary
        hadoop_home = _hadoop_prefix_from_bin(self.get_hadoop_bin()[0])
        if hadoop_home:
            yield hadoop_home

        # try HADOOP_*_HOME
        for name, path in sorted(os.environ.items()):
            if name.startswith('HADOOP_') and name.endswith('_HOME'):
                yield path

    def _hadoop_streaming_jar_dirs(self):
        """Yield all possible places to look for the Hadoop streaming jar.
        May yield duplicates.
        """
        for hadoop_dir in self._hadoop_dirs():
            yield hadoop_dir

        # use hard-coded paths to work out-of-the-box on EMR
        for path in _EMR_HADOOP_STREAMING_JAR_DIRS:
            yield path

    def _hadoop_log_dirs(self, output_dir=None):
        """Yield all possible places to look for hadoop logs."""
        # hadoop_log_dirs opt overrides all this
        if self._opts['hadoop_log_dirs']:
            for path in self._opts['hadoop_log_dirs']:
                yield path
            return

        hadoop_log_dir = os.environ.get('HADOOP_LOG_DIR')
        if hadoop_log_dir:
            yield hadoop_log_dir

        yarn = uses_yarn(self.get_hadoop_version())

        if yarn:
            yarn_log_dir = os.environ.get('YARN_LOG_DIR')
            if yarn_log_dir:
                yield yarn_log_dir

            yield _DEFAULT_YARN_HDFS_LOG_DIR

        if output_dir:
            # Cloudera style of logging
            yield posixpath.join(output_dir, '_logs')

        for hadoop_dir in self._hadoop_dirs():
            yield posixpath.join(hadoop_dir, 'logs')

        # hard-coded fallback paths
        if yarn:
            for path in _FALLBACK_HADOOP_YARN_LOG_DIRS:
                yield path

        for path in _FALLBACK_HADOOP_LOG_DIRS:
            yield path

    def get_spark_submit_bin(self):
        if not self._spark_submit_bin:
            self._spark_submit_bin = self._find_spark_submit_bin()
        return self._spark_submit_bin

    def _find_spark_submit_bin(self):
        # TODO: this is very similar to _find_hadoop_bin() (in fs)
        for path in unique(self._spark_submit_bin_dirs()):
            log.info('Looking for spark-submit binary in %s...' % (
                path or '$PATH'))

            spark_submit_bin = which('spark-submit', path=path)

            if spark_submit_bin:
                log.info('Found spark-submit binary: %s' % spark_submit_bin)
                return [spark_submit_bin]
        else:
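            # for/else: reached only if the loop above ran to completion
            # without finding (and returning) a spark-submit binary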
            log.info("Falling back to 'spark-submit'")
            return ['spark-submit']

    def _spark_submit_bin_dirs(self):
        # $SPARK_HOME
        spark_home = os.environ.get('SPARK_HOME')
        if spark_home:
            yield os.path.join(spark_home, 'bin')

        yield None  # use $PATH

        # some other places recommended by install docs (see #1366)
        yield '/usr/lib/spark/bin'
        yield '/usr/local/spark/bin'
        yield '/usr/local/lib/spark/bin'

    def _run(self):
        self._find_binaries_and_jars()
        self._create_setup_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _find_binaries_and_jars(self):
        """Find hadoop and (if needed) spark-submit bin up-front, before
        continuing with the job.

        (This is just for user-interaction purposes; these would otherwise
        lazy-load as needed.)
        """
        # this triggers looking for Hadoop binary
        self.get_hadoop_version()

        if self._has_streaming_steps():
            self.get_hadoop_streaming_jar()

        if self._has_spark_steps():
            self.get_spark_submit_bin()

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self.fs.mkdir(self._upload_mgr.prefix)

        log.info('Copying local files to %s...' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().items():
            self._upload_to_hdfs(path, uri)

    def _upload_to_hdfs(self, path, target):
        log.debug('  %s -> %s' % (path, target))
        self.fs._put(path, target)

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN')
        # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s...' % stdin_path)
        # use a with block so the dump is flushed and closed before the
        # caller reads from stdin_path
        with open(stdin_path, 'wb') as stdin_file:
            for line in self._stdin:
                stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_args = self._args_for_step(step_num)
            env = _fix_env(self._env_for_step(step_num))

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr,
                    record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_hadoop(to_unicode(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvpe(step_args[0], step_args, env)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            step_type = step['type']

            if not _is_spark_step_type(step_type):
                counters = self._pick_counters(log_interpretation, step_type)
                if counters:
                    log.info(_format_counters(counters))
                else:
                    log.warning('No counters found')

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(
                    reason=reason, step_num=step_num,
                    num_steps=self._num_steps())

    def _warn_about_spark_archives(self, step):
        """If *step* is a Spark step, the *upload_archives* option is set,
        and *spark_master* is not ``'yarn'``, warn that *upload_archives*
        will be ignored by Spark."""
        if (_is_spark_step_type(step['type']) and
                self._opts['spark_master'] != 'yarn' and
                self._opts['upload_archives']):
            log.warning('Spark will probably ignore archives because'
                        " spark_master is not set to 'yarn'")

    def _args_for_step(self, step_num):
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._args_for_streaming_step(step_num)
        elif step['type'] == 'jar':
            return self._args_for_jar_step(step_num)
        elif _is_spark_step_type(step['type']):
            return self._args_for_spark_step(step_num)
        else:
            raise AssertionError('Bad step type: %r' % (step['type'],))

    def _args_for_streaming_step(self, step_num):
        hadoop_streaming_jar = self.get_hadoop_streaming_jar()
        if not hadoop_streaming_jar:
            raise Exception('no Hadoop streaming jar')

        return (self.get_hadoop_bin() + ['jar', hadoop_streaming_jar] +
                self._hadoop_streaming_jar_args(step_num))

    def _args_for_jar_step(self, step_num):
        step = self._get_step(step_num)

        args = []

        args.extend(self.get_hadoop_bin())

        # -libjars, -D
        args.extend(self._hadoop_generic_args_for_step(step_num))

        # special case for consistency with EMR runner.
        #
        # This might look less like duplicated code if we ever
        # implement #780 (fetching jars from URIs)
        if step['jar'].startswith('file:///'):
            jar = step['jar'][7:]  # keep leading slash
        else:
            jar = step['jar']

        args.extend(['jar', jar])

        if step.get('main_class'):
            args.append(step['main_class'])

        if step.get('args'):
            args.extend(
                self._interpolate_input_and_output(step['args'], step_num))

        return args

    def _spark_submit_arg_prefix(self):
        return ['--master', self._opts['spark_master']]

    def _env_for_step(self, step_num):
        step = self._get_step(step_num)

        env = dict(os.environ)

        # when running spark-submit, set its environment directly. See #1464
        if _is_spark_step_type(step['type']):
            env.update(self._spark_cmdenv(step_num))

        return env

    def _default_step_output_dir(self):
        return posixpath.join(self._hadoop_tmp_dir, 'step-output')

    def _cleanup_hadoop_tmp(self):
        if self._hadoop_tmp_dir:
            log.info('Removing HDFS temp directory %s...' %
                     self._hadoop_tmp_dir)
            try:
                self.fs.rm(self._hadoop_tmp_dir)
            except Exception as e:
                log.exception(e)

    ### LOG (implementation of LogInterpretationMixin) ###

    def _stream_history_log_dirs(self, output_dir=None):
        """Yield lists of directories to look for the history log in."""
        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if _logs_exist(self.fs, log_dir):
                log.info('Looking for history log in %s...' % log_dir)
                # logs aren't always in a subdir named history/
                yield [log_dir]

    def _stream_task_log_dirs(self, application_id=None, output_dir=None):
        """Yield lists of directories to look for the task logs in."""
        # Note: this is unlikely to be super-helpful on "real" (multi-node)
        # pre-YARN Hadoop because task logs aren't generally shipped to a
        # local directory. It's a start, anyways. See #1201.
        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if application_id:
                path = self.fs.join(log_dir, 'userlogs', application_id)
            else:
                path = self.fs.join(log_dir, 'userlogs')

            if _logs_exist(self.fs, path):
                log.info('Looking for task syslogs in %s...' % path)
                yield [path]

    def counters(self):
        return [_pick_counters(log_interpretation)
                for log_interpretation in self._log_interpretations]
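For reference, each element of the list returned by counters() is a dict mapping counter group to counter name to value; a hypothetical two-step result (group names and numbers are illustrative only) might look like:

[
    {'Job Counters': {'Launched map tasks': 4, 'Launched reduce tasks': 1},
     'File System Counters': {'HDFS: Number of bytes read': 1048576}},
    {'Job Counters': {'Launched map tasks': 2, 'Launched reduce tasks': 1}},
]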
Example #41
0
 def test_simple(self):
     sd = UploadDirManager("hdfs:///")
     sd.add("foo/bar.py")
     self.assertEqual(sd.path_to_uri(), {"foo/bar.py": "hdfs:///bar.py"})
Example #42
0
class HadoopJobRunner(MRJobRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """
    alias = 'hadoop'

    OPTION_STORE_CLASS = HadoopRunnerOptionStore

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hdfs_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(
                self._opts['hdfs_scratch_dir'], self._job_name))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hdfs_tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or
            posixpath.join(self._hdfs_tmp_dir, 'output'))

        self._hadoop_log_dir = hadoop_log_dir(self._opts['hadoop_home'])

        # Running jobs via hadoop assigns a new timestamp to each job.
        # Running jobs via mrjob only adds steps.
        # Store both of these values to enable log parsing.
        self._job_timestamp = None
        self._start_step_num = 0

        # init hadoop version cache
        self._hadoop_version = None

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem(
                HadoopFilesystem(self._opts['hadoop_bin']),
                LocalFilesystem())
        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        if not self._hadoop_version:
            stdout = self.invoke_hadoop(['version'], return_stdout=True)
            if stdout:
                first_line = stdout.split('\n')[0]
                m = HADOOP_VERSION_RE.match(first_line)
                if m:
                    self._hadoop_version = m.group('version')
                    log.info("Using Hadoop version %s" % self._hadoop_version)
                    return self._hadoop_version
            self._hadoop_version = '0.20.203'
            log.info("Unable to determine Hadoop version. Assuming 0.20.203.")
        return self._hadoop_version

    def _run(self):
        if self._opts['bootstrap_mrjob']:
            self._add_python_archive(self._create_mrjob_tar_gz())

        self._check_input_exists()
        self._create_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if not self.path_exists(path):
                raise AssertionError(
                    'Input path %s does not exist!' % (path,))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self._mkdir_on_hdfs(self._upload_mgr.prefix)

        log.info('Copying local files into %s' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().iteritems():
            self._upload_to_hdfs(path, uri)

    def _mkdir_on_hdfs(self, path):
        log.debug('Making directory %s on HDFS' % path)
        self.invoke_hadoop(['fs', '-mkdir', path])

    def _upload_to_hdfs(self, path, target):
        log.debug('Uploading %s -> %s on HDFS' % (path, target))
        self.invoke_hadoop(['fs', '-put', path, target])

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN')
        # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s' % stdin_path)
        # use a with block so the dump is flushed and closed before the
        # caller reads from stdin_path
        with open(stdin_path, 'w') as stdin_file:
            for line in self._stdin:
                stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        self._counters = []
        steps = self._get_steps()

        for step_num, step in enumerate(steps):
            log.debug('running step %d of %d' % (step_num + 1, len(steps)))

            streaming_args = self._streaming_args(step, step_num, len(steps))

            log.debug('> %s' % cmd_line(streaming_args))
            step_proc = Popen(streaming_args, stdout=PIPE, stderr=PIPE)

            # TODO: use a pty or something so that the hadoop binary
            # won't buffer the status messages
            self._process_stderr_from_streaming(step_proc.stderr)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                log.error('STDOUT: ' + line.strip('\n'))

            returncode = step_proc.wait()
            if returncode == 0:
                # parsing needs step number for whole job
                self._fetch_counters([step_num + self._start_step_num])
                # printing needs step number relevant to this run of mrjob
                self.print_counters([step_num + 1])
            else:
                msg = ('Job failed with return code %d: %s' %
                       (step_proc.returncode, streaming_args))
                log.error(msg)
                # look for a Python traceback
                cause = self._find_probable_cause_of_failure(
                    [step_num + self._start_step_num])
                if cause:
                    # log cause, and put it in exception
                    cause_msg = []  # lines to log and put in exception
                    cause_msg.append('Probable cause of failure (from %s):' %
                                     cause['log_file_uri'])
                    cause_msg.extend(line.strip('\n')
                                     for line in cause['lines'])
                    if cause['input_uri']:
                        cause_msg.append('(while reading from %s)' %
                                         cause['input_uri'])

                    for line in cause_msg:
                        log.error(line)

                    # add cause_msg to exception message
                    msg += '\n' + '\n'.join(cause_msg) + '\n'

                raise Exception(msg)

    def _process_stderr_from_streaming(self, stderr):
        for line in stderr:
            line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2)
            log.info('HADOOP: ' + line)

            if 'Streaming Job Failed!' in line:
                raise Exception(line)

            # The job identifier is printed to stderr. We only want to parse it
            # once because we know how many steps we have and just want to know
            # what Hadoop thinks the first step's number is.
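            # (presumably a stderr line like
            # 'Running job: job_201601011234_0003', giving timestamp
            # '201601011234' and step_num 3)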
            m = HADOOP_JOB_TIMESTAMP_RE.match(line)
            if m and self._job_timestamp is None:
                self._job_timestamp = m.group('timestamp')
                self._start_step_num = int(m.group('step_num'))

    def _streaming_args(self, step, step_num, num_steps):
        version = self.get_hadoop_version()

        streaming_args = (self._opts['hadoop_bin'] +
                          ['jar', self._opts['hadoop_streaming_jar']])

        # -files/-archives (generic options, new-style)
        if supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            streaming_args.extend(
                self._new_upload_args(self._upload_mgr))

        # Add extra hadoop args first as hadoop args could be a hadoop
        # specific argument (e.g. -libjar) which must come before job
        # specific args.
        streaming_args.extend(
            self._hadoop_conf_args(step, step_num, num_steps))

        # set up input
        for input_uri in self._hdfs_step_input_files(step_num):
            streaming_args.extend(['-input', input_uri])

        # set up output
        streaming_args.append('-output')
        streaming_args.append(self._hdfs_step_output_dir(step_num))

        # -cacheFile/-cacheArchive (streaming options, old-style)
        if not supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            streaming_args.extend(
                self._old_upload_args(self._upload_mgr))

        mapper, combiner, reducer = (
            self._hadoop_streaming_commands(step, step_num))

        streaming_args.append('-mapper')
        streaming_args.append(mapper)

        if combiner:
            streaming_args.append('-combiner')
            streaming_args.append(combiner)

        if reducer:
            streaming_args.append('-reducer')
            streaming_args.append(reducer)
        else:
            streaming_args.extend(['-jobconf', 'mapred.reduce.tasks=0'])

        return streaming_args

    def _hdfs_step_input_files(self, step_num):
        """Get the hdfs:// URI for input for the given step."""
        if step_num == 0:
            return [self._upload_mgr.uri(p)
                    for p in self._get_input_paths()]
        else:
            return [posixpath.join(
                self._hdfs_tmp_dir, 'step-output', str(step_num))]

    def _hdfs_step_output_dir(self, step_num):
        if step_num == len(self._get_steps()) - 1:
            return self._output_dir
        else:
            return posixpath.join(
                self._hdfs_tmp_dir, 'step-output', str(step_num + 1))

    def _cleanup_local_scratch(self):
        super(HadoopJobRunner, self)._cleanup_local_scratch()

        if self._hdfs_tmp_dir:
            log.info('deleting %s from HDFS' % self._hdfs_tmp_dir)

            try:
                self.invoke_hadoop(['fs', '-rmr', self._hdfs_tmp_dir])
            except Exception as e:
                log.exception(e)
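As a rough illustration (assumed, not taken from the code above) of what the new-style and old-style upload args add to the streaming command line:

# new-style generic option:
#   -files 'hdfs:///.../files/my_job.py#my_job.py,hdfs:///.../files/setup.sh#setup.sh'
# old-style streaming options, one flag per file or archive:
#   -cacheFile 'hdfs:///.../files/my_job.py#my_job.py'
#   -cacheArchive 'hdfs:///.../files/deps.tar.gz#deps.tar.gz'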
Example #43
0
class HadoopJobRunner(MRJobRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.
    Invoked when you run your job with ``-r hadoop``.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """
    alias = 'hadoop'

    OPTION_STORE_CLASS = HadoopRunnerOptionStore

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hadoop_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(
                self._opts['hadoop_tmp_dir'], self._job_key))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or
            posixpath.join(self._hadoop_tmp_dir, 'output'))

        # Track job and (YARN) application ID to enable log parsing
        self._application_id = None
        self._job_id = None

        # Keep track of where the hadoop streaming jar is
        self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar']
        self._searched_for_hadoop_streaming_jar = False

        # Keep track of the status of each step that ran
        #
        # these are dictionaries with the same keys as
        # mrjob.logs.parse._parse_hadoop_streaming_log()
        self._steps_info = []

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem(
                HadoopFilesystem(self._opts['hadoop_bin']),
                LocalFilesystem())
        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        return self.fs.get_hadoop_version()

    def get_hadoop_bin(self):
        """Find the hadoop binary. A list: binary followed by arguments."""
        return self.fs.get_hadoop_bin()

    def get_hadoop_streaming_jar(self):
        """Find the path of the hadoop streaming jar, or None if not found."""
        if not (self._hadoop_streaming_jar or
                self._searched_for_hadoop_streaming_jar):

            self._hadoop_streaming_jar = self._find_hadoop_streaming_jar()

            if self._hadoop_streaming_jar:
                log.info('Found Hadoop streaming jar: %s' %
                         self._hadoop_streaming_jar)
            else:
                log.warning('Hadoop streaming jar not found. Use'
                            ' --hadoop-streaming-jar')

            self._searched_for_hadoop_streaming_jar = True

        return self._hadoop_streaming_jar

    def _find_hadoop_streaming_jar(self):
        """Search for the hadoop streaming jar. See
        :py:meth:`_hadoop_streaming_jar_dirs` for where we search."""
        for path in unique(self._hadoop_streaming_jar_dirs()):
            log.info('Looking for Hadoop streaming jar in %s' % path)

            streaming_jars = []
            for path in self.fs.ls(path):
                if HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)):
                    streaming_jars.append(path)

            if streaming_jars:
                # prefer shorter names and shallower paths
                def sort_key(p):
                    return (len(p.split('/')),
                            len(posixpath.basename(p)),
                            p)

                streaming_jars.sort(key=sort_key)

                return streaming_jars[0]

        return None

    def _hadoop_dirs(self):
        """Yield all possible hadoop directories (used for streaming jar
        and logs). May yield duplicates"""
        if self._opts['hadoop_home']:
            yield self._opts['hadoop_home']

        for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL',
                     'HADOOP_MAPRED_HOME'):
            path = os.environ.get(name)
            if path:
                yield path

        # guess it from the path of the Hadoop binary
        hadoop_home = hadoop_prefix_from_bin(self.get_hadoop_bin()[0])
        if hadoop_home:
            yield hadoop_home

        # try HADOOP_*_HOME
        for name, path in sorted(os.environ.items()):
            if name.startswith('HADOOP_') and name.endswith('_HOME'):
                yield path

    def _hadoop_streaming_jar_dirs(self):
        """Yield all possible places to look for the Hadoop streaming jar.
        May yield duplicates.
        """
        for hadoop_dir in self._hadoop_dirs():
            yield hadoop_dir

        # use hard-coded paths to work out-of-the-box on EMR
        for path in _EMR_HADOOP_STREAMING_JAR_DIRS:
            yield path

    def _hadoop_log_dirs(self, output_dir=None):
        """Yield all possible places to look for hadoop logs."""
        # hadoop_log_dirs opt overrides all this
        if self._opts['hadoop_log_dirs']:
            for path in self._opts['hadoop_log_dirs']:
                yield path
            return

        hadoop_log_dir = os.environ.get('HADOOP_LOG_DIR')
        if hadoop_log_dir:
            yield hadoop_log_dir

        if uses_yarn(self.get_hadoop_version()):
            yarn_log_dir = os.environ.get('YARN_LOG_DIR')
            if yarn_log_dir:
                yield yarn_log_dir

        if output_dir:
            # Cloudera style of logging
            yield posixpath.join(output_dir, '_logs')

        for hadoop_dir in self._hadoop_dirs():
            yield posixpath.join(hadoop_dir, 'logs')

        # hard-coded log paths for EMR, so this can work out-of-the-box
        for path in _EMR_HADOOP_LOG_DIRS:
            yield path

    def _run(self):
        self._check_input_exists()
        self._create_setup_wrapper_script()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if self._opts['check_input_paths']:
                if not self.fs.exists(path):
                    raise AssertionError(
                        'Input path %s does not exist!' % (path,))

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._get_input_paths():
            self._upload_mgr.add(path)

        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self.fs.mkdir(self._upload_mgr.prefix)

        log.info('Copying local files into %s' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().items():
            self._upload_to_hdfs(path, uri)

    def _upload_to_hdfs(self, path, target):
        log.debug('Uploading %s -> %s on HDFS' % (path, target))
        self.fs._put(path, target)

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN')
         # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s' % stdin_path)
        stdin_file = open(stdin_path, 'wb')
        for line in self._stdin:
            stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        for step_num in range(self._num_steps()):
            step_args = self._args_for_step(step_num)

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

                step_info = _process_stderr_from_streaming(
                    step_proc.stderr)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_hadoop(to_string(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvp(step_args[0], step_args)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_info = _process_stderr_from_streaming(
                            _wrap_streaming_pty_output(master))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if not step_info['output_dir']:
                step_info['output_dir'] = self._hdfs_step_output_dir(step_num)

            if not step_info['counters']:
                pass  # TODO: fetch counters; see _fetch_counters()

            self._steps_info.append(step_info)

            # just print counters for this one step
            self._print_counters(step_nums=[step_num])

            if returncode:
                err_lines = [
                    'Job failed with return code %d: %s' %
                    (returncode, cmd_line(step_args))]

                cause = self._find_probable_cause_of_failure(**step_info)

                if cause:
                    err_lines.append('')  # pad with empty line
                    err_lines.extend(_format_cause_of_failure(cause))

                for err_line in err_lines:
                    log.error(err_line)
                raise Exception('\n'.join(err_lines) + '\n')
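
# Illustrative sketch (separate from the class above): the PTY-or-Popen
# fallback used in _run_job_in_hadoop(), reduced to a standalone helper.
# Running the child on a pseudo-terminal makes it flush its output line by
# line, so progress messages appear as they are printed rather than in one
# block at the end. Assumes a Unix-like platform where the pty module works;
# run_and_stream is a hypothetical name.
import os
import pty
from subprocess import Popen, PIPE, STDOUT

def run_and_stream(args):
    """Run *args*, echoing its output as it arrives.

    Return 0 if the command succeeded, nonzero otherwise.
    """
    try:
        pid, master_fd = pty.fork()
    except (AttributeError, OSError):
        # no PTY available; fall back to a pipe (output may be block-buffered)
        proc = Popen(args, stdout=PIPE, stderr=STDOUT)
        for line in proc.stdout:
            print(line.decode('utf-8', 'replace'), end='')
        proc.stdout.close()
        return proc.wait()

    if pid == 0:  # child: become the command
        os.execvp(args[0], args)

    # parent: the master side of the PTY carries both stdout and stderr
    while True:
        try:
            data = os.read(master_fd, 4096)
        except OSError:  # Linux raises EIO once the child closes its end
            break
        if not data:
            break
        print(data.decode('utf-8', 'replace'), end='')

    os.close(master_fd)
    _, status = os.waitpid(pid, 0)
    return status

# e.g. run_and_stream(['echo', 'hello world'])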

    def _args_for_step(self, step_num):
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._args_for_streaming_step(step_num)
        elif step['type'] == 'jar':
            return self._args_for_jar_step(step_num)
        else:
            raise AssertionError('Bad step type: %r' % (step['type'],))

    def _args_for_streaming_step(self, step_num):
        version = self.get_hadoop_version()

        hadoop_streaming_jar = self.get_hadoop_streaming_jar()
        if not hadoop_streaming_jar:
            raise Exception('no Hadoop streaming jar')

        args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar]

        # -files/-archives (generic options, new-style)
        if supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            args.extend(
                self._upload_args(self._upload_mgr))

        # Add extra Hadoop args first, since they may include Hadoop-specific
        # options (e.g. -libjars) that must come before job-specific args.
        args.extend(self._hadoop_args_for_step(step_num))

        # set up input
        for input_uri in self._hdfs_step_input_files(step_num):
            args.extend(['-input', input_uri])

        # set up output
        args.append('-output')
        args.append(self._hdfs_step_output_dir(step_num))

        # -cacheFile/-cacheArchive (streaming options, old-style)
        if not supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            args.extend(
                self._pre_0_20_upload_args(self._upload_mgr))

        mapper, combiner, reducer = (
            self._hadoop_streaming_commands(step_num))

        args.append('-mapper')
        args.append(mapper)

        if combiner:
            args.append('-combiner')
            args.append(combiner)

        if reducer:
            args.append('-reducer')
            args.append(reducer)
        else:
            args.extend(['-jobconf', 'mapred.reduce.tasks=0'])

        return args
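
# Illustrative sketch (separate from the class above): roughly what the
# assembled streaming command looks like. Every path, URI and script name
# below is made up. With new-style distributed cache support, -files is a
# generic option and must precede the streaming-specific options; on pre-0.20
# Hadoop the equivalent -cacheFile/-cacheArchive options go after -output
# instead, as in the method above.
example_streaming_args = [
    'hadoop', 'jar', '/path/to/hadoop-streaming.jar',
    # upload args (generic options)
    '-files',
    'hdfs:///user/me/tmp/mrjob/files/mr_word_count.py#mr_word_count.py',
    # input/output
    '-input', 'hdfs:///user/me/tmp/mrjob/files/STDIN',
    '-output', 'hdfs:///user/me/tmp/mrjob/step-output/1',
    # the streaming commands themselves
    '-mapper', 'python mr_word_count.py --step-num=0 --mapper',
    '-reducer', 'python mr_word_count.py --step-num=0 --reducer',
]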

    def _args_for_jar_step(self, step_num):
        step = self._get_step(step_num)

        # special case for consistency with EMR runner.
        #
        # This might look less like duplicated code if we ever
        # implement #780 (fetching jars from URIs)
        if step['jar'].startswith('file:///'):
            jar = step['jar'][7:]  # keep leading slash
        else:
            jar = step['jar']

        args = self.get_hadoop_bin() + ['jar', jar]

        if step.get('main_class'):
            args.append(step['main_class'])

        # TODO: merge with logic in mrjob/emr.py
        def interpolate(arg):
            if arg == mrjob.step.JarStep.INPUT:
                return ','.join(self._hdfs_step_input_files(step_num))
            elif arg == mrjob.step.JarStep.OUTPUT:
                return self._hdfs_step_output_dir(step_num)
            else:
                return arg

        if step.get('args'):
            args.extend(interpolate(arg) for arg in step['args'])

        return args
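
# Illustrative sketch (separate from the class above): how INPUT/OUTPUT
# placeholder interpolation behaves for jar steps. The marker strings and
# URIs here are hypothetical; the method above checks for the real
# mrjob.step.JarStep.INPUT and mrjob.step.JarStep.OUTPUT constants.
def interpolate_jar_args(raw_args, input_uris, output_dir,
                         input_marker='<input>', output_marker='<output>'):
    """Replace placeholder args with comma-joined inputs / the output dir."""
    def interpolate(arg):
        if arg == input_marker:
            return ','.join(input_uris)
        elif arg == output_marker:
            return output_dir
        else:
            return arg

    return [interpolate(arg) for arg in raw_args]

# interpolate_jar_args(['<input>', '<output>'],
#                      ['hdfs:///data/a', 'hdfs:///data/b'],
#                      'hdfs:///out')
# -> ['hdfs:///data/a,hdfs:///data/b', 'hdfs:///out']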

    def _hdfs_step_input_files(self, step_num):
        """Get the hdfs:// URI for input for the given step."""
        if step_num == 0:
            return [self._upload_mgr.uri(p)
                    for p in self._get_input_paths()]
        else:
            return [posixpath.join(
                self._hadoop_tmp_dir, 'step-output', str(step_num))]

    def _hdfs_step_output_dir(self, step_num):
        if step_num == len(self._get_steps()) - 1:
            return self._output_dir
        else:
            return posixpath.join(
                self._hadoop_tmp_dir, 'step-output', str(step_num + 1))
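
# Illustrative sketch (separate from the class above): how the two helpers
# above chain steps together. Step N writes to .../step-output/N+1 and step
# N+1 reads from the same directory, so intermediate data stays on HDFS.
# The plan_step_io name and the example paths are hypothetical.
import posixpath

def plan_step_io(num_steps, input_uris, output_dir, tmp_dir):
    """Return a list of (input URIs, output dir) pairs, one per step."""
    plan = []
    for step_num in range(num_steps):
        if step_num == 0:
            inputs = list(input_uris)
        else:
            inputs = [posixpath.join(tmp_dir, 'step-output', str(step_num))]

        if step_num == num_steps - 1:
            output = output_dir
        else:
            output = posixpath.join(
                tmp_dir, 'step-output', str(step_num + 1))

        plan.append((inputs, output))
    return plan

# plan_step_io(2, ['hdfs:///data/in'], 'hdfs:///data/out', 'hdfs:///tmp/job')
# -> [(['hdfs:///data/in'], 'hdfs:///tmp/job/step-output/1'),
#     (['hdfs:///tmp/job/step-output/1'], 'hdfs:///data/out')]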

    def _cleanup_local_tmp(self):
        super(HadoopJobRunner, self)._cleanup_local_tmp()

        if self._hadoop_tmp_dir:
            log.info('deleting %s from HDFS' % self._hadoop_tmp_dir)
            try:
                self.fs.rm(self._hadoop_tmp_dir)
            except Exception as e:
                log.exception(e)
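
# Illustrative sketch (separate from the class above): best-effort cleanup of
# a temporary HDFS directory, mirroring the method above but shelling out to
# the ``hadoop fs -rm -r`` command directly. The binary name and flags are
# assumptions about a Hadoop 2.x-style install (older versions used -rmr).
import logging
import subprocess

sketch_log = logging.getLogger(__name__)

def remove_hdfs_dir(hdfs_uri, hadoop_bin='hadoop'):
    """Delete *hdfs_uri* recursively; log (but swallow) any failure."""
    try:
        subprocess.check_call(
            [hadoop_bin, 'fs', '-rm', '-r', '-f', hdfs_uri])
    except Exception:
        sketch_log.exception('could not delete %s' % hdfs_uri)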

    ### LOG FETCHING/PARSING ###

    def _find_probable_cause_of_failure(self, application_id=None, job_id=None,
                                        output_dir=None, **ignored):
        """Find probable cause of failure. Currently we just scan task logs.

        On YARN, you must set application_id, and pre-YARN, you must set
        job_id.
        """
        # package up logs for _find_error_in_task_logs(),
        # and log where we're looking.
        hadoop_version = self.get_hadoop_version()
        yarn = uses_yarn(hadoop_version)

        if yarn and application_id is None:
            log.warning("No application ID!")
            return None

        if not yarn and job_id is None:
            log.warning("No job ID!")
            return None

        # Note: this is unlikely to be super-helpful on "real" (multi-node)
        # pre-YARN Hadoop because task logs aren't generally shipped to a local
        # directory. It's a start, anyway. See #1201.
        def stream_task_log_dirs():
            for log_dir in unique(
                    self._hadoop_log_dirs(output_dir=output_dir)):

                if yarn:
                    path = self.fs.join(log_dir, 'userlogs', application_id)
                else:
                    # pre-YARN attempt logs are sometimes organized by job_id
                    # and sometimes not, so play it safe and search all of
                    # userlogs
                    path = self.fs.join(log_dir, 'userlogs')

                if self.fs.exists(path):
                    log.info('looking for logs in %s' % path)
                    yield [path]

        return _find_error_in_task_logs(
            self.fs, stream_task_log_dirs(), hadoop_version,
            application_id=application_id, job_id=job_id)

        # TODO: catch timeouts, etc.
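
# Illustrative sketch (separate from the class above): the log-directory
# search from stream_task_log_dirs() as a standalone generator over the
# local filesystem. As in the method above, it looks under
# <log dir>/userlogs/<application id> when an application ID is known and
# under <log dir>/userlogs otherwise. The argument names and example paths
# are hypothetical.
import os

def candidate_task_log_dirs(log_dirs, application_id=None):
    """Yield existing directories that may contain task logs."""
    seen = set()
    for log_dir in log_dirs:
        if log_dir in seen:
            continue
        seen.add(log_dir)

        if application_id:
            path = os.path.join(log_dir, 'userlogs', application_id)
        else:
            path = os.path.join(log_dir, 'userlogs')

        if os.path.isdir(path):
            yield path

# for path in candidate_task_log_dirs(
#         ['/var/log/hadoop-yarn', '/usr/local/hadoop/logs'],
#         application_id='application_1450486922681_0004'):
#     print('would scan', path)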

    # TODO: redo this
    def _fetch_counters(self, step_nums, skip_s3_wait=False):
        """Read Hadoop counters from local logs.

        Args:
        step_nums -- the steps belonging to us, so that we can ignore errors
                     from other jobs run with the same timestamp
        """
        uris = self._ls_logs('job', step_nums)
        new_counters = scan_for_counters_in_files(uris, self,
                                                  self.get_hadoop_version())

        # only include steps relevant to the current job
        for step_num in step_nums:
            self._counters.append(new_counters.get(step_num, {}))

    def counters(self):
        return [step_info['counters'] for step_info in self._steps_info]