class DataprocJobRunner(MRJobRunner):
    """Runs an :py:class:`~mrjob.job.MRJob` on Google Cloud Dataproc.
    Invoked when you run your job with ``-r dataproc``.

    :py:class:`DataprocJobRunner` runs your job in a Dataproc cluster, which
    is basically a temporary Hadoop cluster.

    Input, support, and jar files can be either local or on GCS; use
    ``gs://...`` URLs to refer to files on GCS.

    This class has some useful utilities for talking directly to GCS and
    Dataproc, so you may find it useful to instantiate it without a script::

        from mrjob.dataproc import DataprocJobRunner
        ...
    """
    alias = 'dataproc'

    # Don't need to bootstrap mrjob in the setup wrapper; that's what
    # the bootstrap script is for!
    BOOTSTRAP_MRJOB_IN_SETUP = False

    OPTION_STORE_CLASS = DataprocRunnerOptionStore

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same
        arguments as :py:class:`~mrjob.runner.MRJobRunner`, plus some
        additional options which can be defaulted in
        :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(DataprocJobRunner, self).__init__(**kwargs)

        # Lazy-load gcloud config as needed - invocations fail in PyCharm
        # debugging
        self._gcloud_config = None

        # Google Cloud Platform - project
        self._gcp_project = (
            self._opts['gcp_project'] or self.gcloud_config()['core.project'])

        # Google Compute Engine - Region / Zone
        self._gce_region = (
            self._opts['region'] or self.gcloud_config()['compute.region'])
        self._gce_zone = (
            self._opts['zone'] or self.gcloud_config()['compute.zone'])

        # cluster_id can be None here
        self._cluster_id = self._opts['cluster_id']

        self._api_client = None
        self._gcs_fs = None
        self._fs = None

        # BEGIN - setup directories
        base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir'])

        self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir)

        # use job key to make a unique tmp dir
        self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/'

        # pick/validate output dir
        if self._output_dir:
            self._output_dir = _check_and_fix_fs_dir(self._output_dir)
        else:
            self._output_dir = self._job_tmpdir + 'output/'
        # END - setup directories

        # manage working dir for bootstrap script
        self._bootstrap_dir_mgr = BootstrapWorkingDirManager()

        # manage local files that we want to upload to GCS. We'll add them
        # to this manager just before we need them.
        fs_files_dir = self._job_tmpdir + 'files/'
        self._upload_mgr = UploadDirManager(fs_files_dir)

        self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()

        for cmd in self._bootstrap:
            for maybe_path_dict in cmd:
                if isinstance(maybe_path_dict, dict):
                    self._bootstrap_dir_mgr.add(**maybe_path_dict)

        # we'll create the script later
        self._master_bootstrap_script_path = None

        # when did our particular task start?
        self._dataproc_job_start = None

        # init hadoop / image version caches
        self._image_version = None
        self._hadoop_version = None

        # This will be filled by _run_steps()
        # NOTE - log_interpretations will be empty except job_id until we
        # parse task logs
        self._log_interpretations = []

    def gcloud_config(self):
        """Lazy-load the gcloud SDK config."""
        if not self._gcloud_config:
            self._gcloud_config = _read_gcloud_config()

        return self._gcloud_config

    @property
    def api_client(self):
        if not self._api_client:
            credentials = GoogleCredentials.get_application_default()

            api_client = discovery.build(
                _DATAPROC_API_ENDPOINT, _DATAPROC_API_VERSION,
                credentials=credentials)
            self._api_client = api_client.projects().regions()

        return self._api_client

    @property
    def fs(self):
        """:py:class:`~mrjob.fs.base.Filesystem` object for GCS and the
        local filesystem.
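
        A sketch of talking to GCS directly through this filesystem
        (assumes the gcloud SDK is configured and application-default
        credentials are available; the bucket name is hypothetical)::

            runner = DataprocJobRunner(conf_paths=[])
            for uri in runner.fs.ls('gs://my-bucket/logs/'):
                print(uri)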
""" if self._fs is not None: return self._fs self._gcs_fs = GCSFilesystem() self._fs = CompositeFilesystem(self._gcs_fs, LocalFilesystem()) return self._fs def _get_tmpdir(self, given_tmpdir): """Helper for _fix_tmpdir""" if given_tmpdir: return given_tmpdir mrjob_buckets = self.fs.list_buckets(self._gcp_project, prefix='mrjob-') # Loop over buckets until we find one that matches region # NOTE - because this is a tmpdir, we look for a GCS bucket in the # same GCE region chosen_bucket_name = None gce_lower_location = self._gce_region.lower() for tmp_bucket in mrjob_buckets: tmp_bucket_name = tmp_bucket['name'] # NOTE - GCP ambiguous Behavior - Bucket location is being # returned as UPPERCASE, ticket filed as of Apr 23, 2016 as docs # suggest lowercase lower_location = tmp_bucket['location'].lower() if lower_location == gce_lower_location: # Regions are both specified and match log.info("using existing temp bucket %s" % tmp_bucket_name) chosen_bucket_name = tmp_bucket_name break # Example default - "mrjob-us-central1-RANDOMHEX" if not chosen_bucket_name: chosen_bucket_name = '-'.join( ['mrjob', gce_lower_location, random_identifier()]) return 'gs://%s/tmp/' % chosen_bucket_name def _run(self): self._launch() self._run_steps() def _launch(self): self._prepare_for_launch() self._launch_cluster() def _prepare_for_launch(self): self._check_input_exists() self._check_output_not_exists() self._create_setup_wrapper_script() self._add_bootstrap_files_for_upload() self._add_job_files_for_upload() self._upload_local_files_to_fs() def _check_input_exists(self): """Make sure all input exists before continuing with our job. """ if not self._opts['check_input_paths']: return for path in self._input_paths: if path == '-': continue # STDIN always exists if is_uri(path) and not is_gcs_uri(path): continue # can't check non-GCS URIs, hope for the best if not self.fs.exists(path): raise AssertionError('Input path %s does not exist!' % (path, )) def _check_output_not_exists(self): """Verify the output path does not already exist. This avoids provisioning a cluster only to have Hadoop refuse to launch. """ if self.fs.exists(self._output_dir): raise IOError('Output path %s already exists!' % (self._output_dir, )) def _add_bootstrap_files_for_upload(self): """Add files needed by the bootstrap script to self._upload_mgr. Tar up mrjob if bootstrap_mrjob is True. Create the master bootstrap script if necessary. 
""" # lazily create mrjob.zip if self._bootstrap_mrjob(): self._create_mrjob_zip() self._bootstrap_dir_mgr.add('file', self._mrjob_zip_path) # all other files needed by the script are already in # _bootstrap_dir_mgr for path in self._bootstrap_dir_mgr.paths(): self._upload_mgr.add(path) # now that we know where the above files live, we can create # the master bootstrap script self._create_master_bootstrap_script_if_needed() if self._master_bootstrap_script_path: self._upload_mgr.add(self._master_bootstrap_script_path) self._upload_mgr.add(_MAX_HOURS_IDLE_BOOTSTRAP_ACTION_PATH) def _add_job_files_for_upload(self): """Add files needed for running the job (setup and input) to self._upload_mgr.""" for path in self._get_input_paths(): self._upload_mgr.add(path) for path in self._working_dir_mgr.paths(): self._upload_mgr.add(path) # TODO - mtai @ davidmarin - hadoop_streaming_jar is currently ignored, # see _HADOOP_STREAMING_JAR_URI # if self._opts['hadoop_streaming_jar']: # self._upload_mgr.add(self._opts['hadoop_streaming_jar']) for step in self._get_steps(): if step.get('jar'): self._upload_mgr.add(step['jar']) def _upload_local_files_to_fs(self): """Copy local files tracked by self._upload_mgr to FS.""" bucket_name, _ = parse_gcs_uri(self._job_tmpdir) self._create_fs_tmp_bucket(bucket_name) log.info('Copying non-input files into %s' % self._upload_mgr.prefix) for path, gcs_uri in self._upload_mgr.path_to_uri().items(): log.debug('uploading %s -> %s' % (path, gcs_uri)) # TODO - mtai @ davidmarin - Implement put function for other FSs self.fs.put(path, gcs_uri) self._wait_for_fs_sync() def _create_fs_tmp_bucket(self, bucket_name, location=None): """Create a temp bucket if missing Tie the temporary bucket to the same region as the GCE job and set a 28-day TTL """ # Return early if our bucket already exists try: self.fs.get_bucket(bucket_name) return except google_errors.HttpError as e: if not e.resp.status == 404: raise log.info('creating FS bucket %r' % bucket_name) location = location or self._gce_region # NOTE - By default, we create a bucket in the same GCE region as our # job (tmp buckets ONLY) # https://cloud.google.com/storage/docs/bucket-locations self.fs.create_bucket( self._gcp_project, bucket_name, location=location, object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS) self._wait_for_fs_sync() ### Running the job ### def cleanup(self, mode=None): super(DataprocJobRunner, self).cleanup(mode=mode) # stop the cluster if it belongs to us (it may have stopped on its # own already, but that's fine) if self._cluster_id and not self._opts['cluster_id']: self._cleanup_cluster() def _cleanup_cloud_tmp(self): # delete all the files we created if not self._job_tmpdir: return try: log.info('Removing all files in %s' % self._job_tmpdir) self.fs.rm(self._job_tmpdir) self._job_tmpdir = None except Exception as e: log.exception(e) # TODO - mtai @ davidmarin - Re-enable log support and supporting cleanup def _cleanup_logs(self): super(DataprocJobRunner, self)._cleanup_logs() def _cleanup_job(self): job_prefix = self._dataproc_job_prefix() for current_job in self._api_job_list(cluster_name=self._cluster_id, state_matcher='ACTIVE'): # Kill all active jobs with the same job_prefix as this job current_job_id = current_job['reference']['jobId'] if not current_job_id.startswith(job_prefix): continue self._api_job_cancel(current_job_id) self._wait_for_api('job cancellation') def _cleanup_cluster(self): if not self._cluster_id: # If we don't have a cluster, then we can't terminate it. 
            return

        try:
            log.info("Attempting to terminate cluster")
            self._api_cluster_delete(self._cluster_id)
        except Exception as e:
            log.exception(e)
            return
        log.info('cluster %s successfully terminated' % self._cluster_id)

    def _wait_for_api(self, msg):
        _wait_for(msg, self._opts['check_cluster_every'])

    def _wait_for_fs_sync(self):
        """Sleep for a little while, to give FS a chance to sync up.
        """
        _wait_for('GCS sync (eventual consistency)',
                  self._opts['cloud_fs_sync_secs'])

    def _build_dataproc_hadoop_job(self, step_num):
        """Build a "HadoopJob" dict to be passed to
        self._api_job_submit_hadoop.

        :param step_num: 0-indexed step number
        :return: *output_hadoop_job*, a dict in the HadoopJob format
        """
        # Reference: https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa
        args = list()
        file_uris = list()
        archive_uris = list()
        properties = dict()

        step = self._get_step(step_num)

        assert step['type'] in ('streaming', 'jar'), (
            'Bad step type: %r' % (step['type'],))

        # TODO - mtai @ davidmarin - Might be trivial to support jar running,
        # see "mainJarFileUri" of variable "output_hadoop_job" in this
        # function
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa
        assert step['type'] == 'streaming', 'Jar not implemented'
        main_jar_uri = _HADOOP_STREAMING_JAR_URI

        # TODO - mtai @ davidmarin - Not clear if we should move _upload_args
        # to file_uris; currently works fine as-is
        # TODO - dmarin @ mtai - Probably a little safer to do it the API's
        # way, assuming the API supports distributed cache syntax (so we can
        # pick the names of the uploaded files).
        args.extend(self._upload_args())

        args.extend(self._hadoop_args_for_step(step_num))

        mapper, combiner, reducer = (
            self._hadoop_streaming_commands(step_num))

        if mapper:
            args += ['-mapper', mapper]

        if combiner:
            args += ['-combiner', combiner]

        if reducer:
            args += ['-reducer', reducer]

        for current_input_uri in self._step_input_uris(step_num):
            args += ['-input', current_input_uri]

        args += ['-output', self._step_output_uri(step_num)]

        # TODO - mtai @ davidmarin - Add back support to specify a different
        # mainJarFileURI
        output_hadoop_job = dict(
            args=args,
            fileUris=file_uris,
            archiveUris=archive_uris,
            properties=properties,
            mainJarFileUri=main_jar_uri
        )
        return output_hadoop_job

    def _launch_cluster(self):
        """Create an empty cluster on Dataproc, and set self._cluster_id to
        its ID."""
        bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
        self._create_fs_tmp_bucket(bucket_name)

        # clusterName must match the
        # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).'
        # as documented in an API error message
        # (not currently documented in the Dataproc docs)
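        # For reference, a generated name like 'mrjob-us-east1-b-39fa0cbe'
        # satisfies this pattern; a quick local check (a sketch, anchoring
        # with '$' to test the whole name; not run by mrjob itself):
        #
        #     import re
        #     assert re.match(
        #         r'(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?)$',
        #         'mrjob-us-east1-b-39fa0cbe')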
        if not self._cluster_id:
            self._cluster_id = '-'.join(
                ['mrjob', self._gce_zone.lower(), random_identifier()])

        # Create the cluster if it's missing; otherwise join an existing one
        try:
            self._api_cluster_get(self._cluster_id)
            log.info('Adding job to existing cluster - %s' % self._cluster_id)
        except google_errors.HttpError as e:
            if not e.resp.status == 404:
                raise

            log.info(
                'Creating Dataproc Hadoop cluster - %s' % self._cluster_id)

            cluster_data = self._cluster_create_args()

            self._api_cluster_create(cluster_data)

            self._wait_for_cluster_ready(self._cluster_id)

        # keep track of when we launched our job
        self._dataproc_job_start = time.time()
        return self._cluster_id

    def _wait_for_cluster_ready(self, cluster_id):
        # See https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters#State  # noqa
        cluster_state = None

        # Poll until cluster is ready
        while cluster_state not in _DATAPROC_CLUSTER_STATES_READY:
            result_describe = self.api_client.clusters().get(
                projectId=self._gcp_project,
                region=_DATAPROC_API_REGION,
                clusterName=cluster_id).execute()

            cluster_state = result_describe['status']['state']
            if cluster_state in _DATAPROC_CLUSTER_STATES_ERROR:
                raise DataprocException(result_describe)

            self._wait_for_api('cluster to accept jobs')

        assert cluster_state in _DATAPROC_CLUSTER_STATES_READY
        log.info("Cluster %s ready", cluster_id)

        return cluster_id

    def _dataproc_job_prefix(self):
        return _cleanse_gcp_job_id(self._job_key)

    def _run_steps(self):
        """Wait for every step of the job to complete, one by one."""
        total_steps = self._num_steps()
        # run our steps
        for step_num in range(total_steps):
            job_id = self._launch_step(step_num)

            self._wait_for_step_to_complete(
                job_id, step_num=step_num, num_steps=total_steps)

            log.info('Completed Dataproc Hadoop Job - %s', job_id)

        # After all steps are complete, wait for the last output (which is
        # usually written to GCS) to sync
        self._wait_for_fs_sync()

    def _launch_step(self, step_num):
        # Build each step
        hadoop_job = self._build_dataproc_hadoop_job(step_num)

        # Clean up step name
        step_name = '%s---step-%05d-of-%05d' % (
            self._dataproc_job_prefix(), step_num + 1, self._num_steps())

        # Submit it
        log.info('Submitting Dataproc Hadoop Job - %s', step_name)
        result = self._api_job_submit_hadoop(step_name, hadoop_job)
        log.info('Submitted Dataproc Hadoop Job - %s', step_name)

        job_id = result['reference']['jobId']
        assert job_id == step_name

        return job_id

    def _wait_for_step_to_complete(
            self, job_id, step_num=None, num_steps=None):
        """Helper for :py:meth:`_run_steps`. Wait for the step with the
        given ID to complete, and fetch counters.
        If it fails, attempt to diagnose the error, and raise an exception.
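
        For a passing step, the polling loop below typically logs a
        progression like this (the job ID shown is hypothetical; the
        ``%s => %s`` format comes from the loop itself)::

            my-job---step-00001-of-00001 => PENDING
            my-job---step-00001-of-00001 => RUNNING
            my-job---step-00001-of-00001 => DONE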

        This also adds an item to self._log_interpretations.
        """
        log_interpretation = dict(job_id=job_id)
        self._log_interpretations.append(log_interpretation)

        while True:
            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
            job_result = self._api_job_get(job_id)

            job_state = job_result['status']['state']

            log.info('%s => %s' % (job_id, job_state))

            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
            if job_state in _DATAPROC_JOB_STATES_ACTIVE:
                self._wait_for_api('job completion')
                continue

            # we're done; will return at the end of this
            elif job_state == 'DONE':
                break

            raise StepFailedException(step_num=step_num, num_steps=num_steps)

    def _intermediate_output_uri(self, step_num):
        # TODO: davidmarin @ mtai: noticed this is 1-indexed and uses
        # %05d instead of %04d. Any particular reason?
        return 'hdfs:///tmp/mrjob/%s/step-output/%05d/' % (
            self._job_key, step_num + 1)

    def counters(self):
        # TODO - mtai @ davidmarin - Counters are currently always empty as
        # we are not processing task logs
        return [_pick_counters(log_interpretation)
                for log_interpretation in self._log_interpretations]

    ### Cluster Info ###

    def get_hadoop_version(self):
        if self._hadoop_version is None:
            self._store_cluster_info()
        return self._hadoop_version

    def get_image_version(self):
        """Get the image version that our cluster is running.
        """
        if self._image_version is None:
            self._store_cluster_info()
        return self._image_version

    def _store_cluster_info(self):
        """Set self._image_version and self._hadoop_version."""
        if not self._cluster_id:
            raise AssertionError('cluster has not yet been created')

        cluster = self._api_cluster_get(self._cluster_id)
        self._image_version = (
            cluster['config']['softwareConfig']['imageVersion'])
        # protect against new versions, including patch versions
        # we didn't explicitly request. See #1428
        self._hadoop_version = map_version(
            self._image_version, _DATAPROC_IMAGE_TO_HADOOP_VERSION)

    ### Bootstrapping ###

    def _create_master_bootstrap_script_if_needed(self):
        """Helper for :py:meth:`_add_bootstrap_files_for_upload`.

        Create the master bootstrap script and write it into our local
        temp directory. Set self._master_bootstrap_script_path.

        This will do nothing if there are no bootstrap scripts or commands,
        or if it has already been called."""
        if self._master_bootstrap_script_path:
            return

        # don't bother if we're not starting a cluster
        if self._cluster_id:
            return

        # Also don't bother if we're not bootstrapping
        if not (self._bootstrap or self._bootstrap_mrjob()):
            return

        # create mrjob.zip if we need it, and add commands to install it
        mrjob_bootstrap = []
        if self._bootstrap_mrjob():
            assert self._mrjob_zip_path
            path_dict = {
                'type': 'file', 'name': None, 'path': self._mrjob_zip_path}
            self._bootstrap_dir_mgr.add(**path_dict)

            # find out where python keeps its libraries
            mrjob_bootstrap.append([
                "__mrjob_PYTHON_LIB=$(%s -c "
                "'from distutils.sysconfig import get_python_lib;"
                " print(get_python_lib())')" %
                cmd_line(self._python_bin())])

            # unzip mrjob.zip
            mrjob_bootstrap.append(
                ['sudo unzip ', path_dict, ' -d $__mrjob_PYTHON_LIB'])

            # re-compile pyc files now, since mappers/reducers can't
            # write to this directory.
            # Don't fail if there is extra un-compileable crud in the
            # tarball (this would matter if sh_bin were 'sh -e')
            mrjob_bootstrap.append(
                ['sudo %s -m compileall -q'
                 ' -f $__mrjob_PYTHON_LIB/mrjob && true' %
                 cmd_line(self._python_bin())])

        # we call the script b.py because there's a character limit on
        # bootstrap script names (or there was at one time, anyway)
        path = os.path.join(self._get_local_tmp_dir(), 'b.py')
        log.info('writing master bootstrap script to %s' % path)

        contents = self._master_bootstrap_script_content(
            self._bootstrap + mrjob_bootstrap)
        for line in contents:
            log.debug('BOOTSTRAP: ' + line.rstrip('\r\n'))

        with open(path, 'w') as f:
            for line in contents:
                f.write(line)

        self._master_bootstrap_script_path = path

    def _bootstrap_python(self):
        """Return a (possibly empty) list of parsed commands (in the same
        format as returned by parse_setup_cmd())."""
        if not self._opts['bootstrap_python']:
            return []

        if PY2:
            # Python 2 is already installed; install pip and dev packages
            return [
                ['sudo apt-get install -y python-pip python-dev'],
            ]
        else:
            return [
                ['sudo apt-get install -y python3 python3-pip python3-dev'],
            ]

    def _parse_bootstrap(self):
        """Parse the *bootstrap* option with
        :py:func:`mrjob.setup.parse_setup_cmd()`.
        """
        return [parse_setup_cmd(cmd) for cmd in self._opts['bootstrap']]

    def _master_bootstrap_script_content(self, bootstrap):
        """Create the contents of the master bootstrap script.
        """
        out = []

        def writeln(line=''):
            out.append(line + '\n')

        # shebang
        sh_bin = self._opts['sh_bin']
        if not sh_bin[0].startswith('/'):
            sh_bin = ['/usr/bin/env'] + sh_bin
        writeln('#!' + cmd_line(sh_bin))
        writeln()

        # store $PWD
        writeln('# store $PWD')
        writeln('__mrjob_PWD=$PWD')
        # FYI - mtai @ davidmarin - begin section; mtai had to add this,
        # otherwise initialization didn't work
        # // kept blowing up in all subsequent invocations of $__mrjob_PWD/
        writeln('if [ $__mrjob_PWD = "/" ]; then')
        writeln(' __mrjob_PWD=""')
        writeln('fi')
        # FYI - mtai @ davidmarin - end section
        writeln()

        # download files
        writeln('# download files and mark them executable')
        cp_to_local = 'hadoop fs -copyToLocal'

        for name, path in sorted(
                self._bootstrap_dir_mgr.name_to_path('file').items()):
            uri = self._upload_mgr.uri(path)

            output_string = '%s %s $__mrjob_PWD/%s' % (
                cp_to_local,
                pipes.quote(uri),
                pipes.quote(name))

            writeln(output_string)
            # make everything executable, like Hadoop Distributed Cache
            writeln('chmod a+x $__mrjob_PWD/%s' % pipes.quote(name))
        writeln()

        # run bootstrap commands
        writeln('# bootstrap commands')
        for cmd in bootstrap:
            # reconstruct the command line, substituting $__mrjob_PWD/<name>
            # for path dicts
            line = ''
            for token in cmd:
                if isinstance(token, dict):
                    # it's a path dictionary
                    line += '$__mrjob_PWD/'
                    line += pipes.quote(self._bootstrap_dir_mgr.name(**token))
                else:
                    # it's raw script
                    line += token
            writeln(line)
        writeln()

        return out

    def get_cluster_id(self):
        return self._cluster_id

    def _cluster_create_args(self):
        gcs_init_script_uris = []
        if self._master_bootstrap_script_path:
            gcs_init_script_uris.append(
                self._upload_mgr.uri(self._master_bootstrap_script_path))

        # always add idle termination script
        # add it last, so that we don't count bootstrapping as idle time
        gcs_init_script_uris.append(
            self._upload_mgr.uri(_MAX_HOURS_IDLE_BOOTSTRAP_ACTION_PATH))

        # NOTE - Cluster initializationActions can only take scripts with no
        # script args, so the auto-term script receives 'mrjob-max-secs-idle'
        # via metadata instead of as an arg
        cluster_metadata = dict()
        cluster_metadata['mrjob-version'] = mrjob.__version__
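        # (On the cluster, an init script can read such metadata back from
        # the GCE metadata server; a sketch, assuming the standard endpoint,
        # for 'mrjob-max-secs-idle', which is set just below:
        #
        #   curl -H 'Metadata-Flavor: Google' \
        #     http://metadata.google.internal/computeMetadata/v1/instance/attributes/mrjob-max-secs-idle
        # )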
        cluster_metadata['mrjob-max-secs-idle'] = str(int(
            self._opts['max_hours_idle'] * 3600))

        cluster_config = dict(
            gceClusterConfig=dict(
                zoneUri=_gcp_zone_uri(
                    project=self._gcp_project, zone=self._gce_zone),
                serviceAccountScopes=_DEFAULT_GCE_SERVICE_ACCOUNT_SCOPES,
                metadata=cluster_metadata
            ),
            initializationActions=[
                dict(executableFile=init_script_uri)
                for init_script_uri in gcs_init_script_uris
            ]
        )

        # Task tracker
        master_conf = _gcp_instance_group_config(
            project=self._gcp_project, zone=self._gce_zone,
            count=1, instance_type=self._opts['master_instance_type']
        )

        # Compute + storage
        worker_conf = _gcp_instance_group_config(
            project=self._gcp_project, zone=self._gce_zone,
            count=self._opts['num_core_instances'],
            instance_type=self._opts['core_instance_type']
        )

        # Compute ONLY
        secondary_worker_conf = _gcp_instance_group_config(
            project=self._gcp_project, zone=self._gce_zone,
            count=self._opts['num_task_instances'],
            instance_type=self._opts['task_instance_type'],
            is_preemptible=True
        )

        cluster_config['masterConfig'] = master_conf
        cluster_config['workerConfig'] = worker_conf
        if self._opts['num_task_instances']:
            cluster_config['secondaryWorkerConfig'] = secondary_worker_conf

        # See - https://cloud.google.com/dataproc/dataproc-versions
        if self._opts['image_version']:
            cluster_config['softwareConfig'] = dict(
                imageVersion=self._opts['image_version'])

        return dict(projectId=self._gcp_project,
                    clusterName=self._cluster_id,
                    config=cluster_config)

    ### Dataproc-specific Stuff ###

    def _api_cluster_get(self, cluster_id):
        return self.api_client.clusters().get(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            clusterName=cluster_id
        ).execute()

    def _api_cluster_create(self, cluster_data):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/create  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/get  # noqa
        return self.api_client.clusters().create(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            body=cluster_data
        ).execute()

    def _api_cluster_delete(self, cluster_id):
        return self.api_client.clusters().delete(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            clusterName=cluster_id
        ).execute()

    def _api_job_list(self, cluster_name=None, state_matcher=None):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/list#JobStateMatcher  # noqa
        list_kwargs = dict(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
        )
        if cluster_name:
            list_kwargs['clusterName'] = cluster_name

        if state_matcher:
            list_kwargs['jobStateMatcher'] = state_matcher

        list_request = self.api_client.jobs().list(**list_kwargs)
        while list_request:
            try:
                resp = list_request.execute()
            except google_errors.HttpError as e:
                if e.resp.status == 404:
                    return
                raise

            for current_item in resp['items']:
                yield current_item

            list_request = self.api_client.jobs().list_next(
                list_request, resp)

    def _api_job_get(self, job_id):
        return self.api_client.jobs().get(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            jobId=job_id
        ).execute()

    def _api_job_cancel(self, job_id):
        return self.api_client.jobs().cancel(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            jobId=job_id
        ).execute()

    def _api_job_delete(self, job_id):
        return self.api_client.jobs().delete(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            jobId=job_id
        ).execute()

    def _api_job_submit_hadoop(self, step_name, hadoop_job):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/submit  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobReference  # noqa
        job_data = dict(
            reference=dict(projectId=self._gcp_project, jobId=step_name),
            placement=dict(clusterName=self._cluster_id),
            hadoopJob=hadoop_job
        )

        jobs_submit_kwargs = dict(
            projectId=self._gcp_project,
            region=_DATAPROC_API_REGION,
            body=dict(job=job_data)
        )
        return self.api_client.jobs().submit(**jobs_submit_kwargs).execute()
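
# A minimal end-to-end sketch of running a job on Dataproc from Python
# (the job class, module, and bucket names are hypothetical; assumes the
# gcloud SDK is configured with a project, region, and zone, and that
# application-default credentials are available):
#
#     from mr_word_count import MRWordCount
#
#     job = MRWordCount(['-r', 'dataproc', 'gs://my-bucket/input/'])
#     with job.make_runner() as runner:
#         runner.run()
#         for line in runner.stream_output():
#             print(line)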