def __init__(self, **kwargs):
    """Set up a :py:class:`~mrjob.dataproc.DataprocJobRunner`.

    Accepts the same keyword arguments as
    :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
    which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
    """
    super(DataprocJobRunner, self).__init__(**kwargs)

    # gcloud config is loaded lazily, only when actually needed
    # (invocations fail in PyCharm debugging). This must be set before
    # any gcloud_config() call below.
    self._gcloud_config = None

    # Google Cloud Platform project: an explicitly set option wins,
    # otherwise fall back to the gcloud config value
    gcp_project = self._opts['gcp_project']
    if not gcp_project:
        gcp_project = self.gcloud_config()['core.project']
    self._gcp_project = gcp_project

    # Google Compute Engine region and zone, with the same fallback rule
    gce_region = self._opts['region']
    if not gce_region:
        gce_region = self.gcloud_config()['compute.region']
    self._gce_region = gce_region

    gce_zone = self._opts['zone']
    if not gce_zone:
        gce_zone = self.gcloud_config()['compute.zone']
    self._gce_zone = gce_zone

    # may legitimately be None at this point
    self._cluster_id = self._opts['cluster_id']

    # nothing is assigned to these during construction
    self._api_client = None
    self._gcs_fs = None
    self._fs = None

    # --- directory setup ---
    self._cloud_tmp_dir = _check_and_fix_fs_dir(
        self._get_tmpdir(self._opts['cloud_tmp_dir']))

    # the job key makes this tmp dir unique to the job
    self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/'

    # validate a caller-supplied output dir, or default to one under the
    # job's tmp dir
    self._output_dir = (
        _check_and_fix_fs_dir(self._output_dir)
        if self._output_dir
        else self._job_tmpdir + 'output/')
    # --- end directory setup ---

    # tracks the working dir for the bootstrap script
    self._bootstrap_dir_mgr = BootstrapWorkingDirManager()

    # tracks local files that we want to upload to GCS; files get added
    # to this manager just before they are needed
    self._upload_mgr = UploadDirManager(self._job_tmpdir + 'files/')

    self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()

    # every dict found inside a bootstrap command is passed to the
    # bootstrap working dir manager as keyword arguments
    path_dicts = (
        token
        for command in self._bootstrap
        for token in command
        if isinstance(token, dict))
    for path_dict in path_dicts:
        self._bootstrap_dir_mgr.add(**path_dict)

    # the script itself is created later
    self._master_bootstrap_script_path = None

    # when did our particular task start?
    self._dataproc_job_start = None

    # caches for the image version and hadoop version
    self._image_version = None
    self._hadoop_version = None

    # This will be filled by _run_steps()
    # NOTE - log_interpretations will be empty except job_id until we
    # parse task logs
    self._log_interpretations = []