def test_dot_underscore(self):
    """Check the URIs assigned to paths whose names begin with ``._``."""
    manager = UploadDirManager('hdfs:///')

    # order of addition matters: it is the same order the original used
    for path in ('._', '._.txt', '._foo'):
        manager.add(path)

    expected = {
        '._': 'hdfs:///1',
        '._.txt': 'hdfs:///1.txt',
        '._foo': 'hdfs:///foo',
    }
    self.assertEqual(manager.path_to_uri(), expected)
def __init__(self, **kwargs):
    """Initialize the runner, then pick the Spark tmp dir and derive the
    upload manager, default output dir, and spark-submit binary from it.
    """
    super(SparkMRJobRunner, self).__init__(**kwargs)

    tmp_dir = self._pick_spark_tmp_dir()
    self._spark_tmp_dir = tmp_dir

    # an upload manager is only created here when the tmp dir is a URI;
    # the trailing '' makes the joined files dir end with a slash
    if is_uri(tmp_dir):
        files_uri = posixpath.join(tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(files_uri)

    # default the job output location if it was not set explicitly
    if not self._output_dir:
        self._output_dir = posixpath.join(tmp_dir, 'output')

    # remember where the spark-submit binary is
    self._spark_submit_bin = self._opts['spark_submit_bin']
def __init__(self, **kwargs):
    """:py:class:`~mrjob.hadoop.HadoopJobRunner` accepts every argument
    that :py:class:`~mrjob.runner.MRJobRunner` does, along with some
    extra options that can be given defaults in
    :ref:`mrjob.conf <mrjob.conf>`.
    """
    super(HadoopJobRunner, self).__init__(**kwargs)

    if self._opts['hadoop_home']:
        deprecation_msg = (
            'hadoop_home is deprecated since 0.5.0 and will be removed'
            ' in v0.6.0. In most cases, mrjob will now find the hadoop'
            ' binary and streaming jar without help. If not, use the'
            ' hadoop_bin and hadoop_streaming_jar options.')
        log.warning(deprecation_msg)

    # per-job tmp dir: <hadoop_tmp_dir>/<job key>, fully qualified
    job_tmp_path = posixpath.join(
        self._opts['hadoop_tmp_dir'], self._job_key)
    self._hadoop_tmp_dir = fully_qualify_hdfs_path(job_tmp_path)

    # Local files to upload to HDFS are registered with this manager
    # just before they are needed. The trailing '' makes the files dir
    # end with a slash.
    self._upload_mgr = UploadDirManager(
        posixpath.join(self._hadoop_tmp_dir, 'files', ''))

    # Fall back to <tmp dir>/output when no output dir was set explicitly
    output_dir = self._output_dir
    if not output_dir:
        output_dir = posixpath.join(self._hadoop_tmp_dir, 'output')
    self._output_dir = fully_qualify_hdfs_path(output_dir)

    # Fully qualify step_output_dir too, but only if it is set
    if self._step_output_dir:
        qualified_step_dir = fully_qualify_hdfs_path(self._step_output_dir)
        self._step_output_dir = qualified_step_dir

    # Job and (YARN) application IDs, tracked to enable log parsing
    self._job_id = None
    self._application_id = None

    # Where the hadoop streaming jar is, and whether we looked for it yet
    self._searched_for_hadoop_streaming_jar = False
    self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar']

    # Where the spark-submit binary is
    self._spark_submit_bin = self._opts['spark_submit_bin']

    # One dict per step, potentially containing the keys 'history',
    # 'step', and 'task'. 'step' is always filled in (it comes from the
    # hadoop jar command output); the others are filled as needed.
    self._log_interpretations = []
def __init__(self, max_output_files=None, mrjob_cls=None, **kwargs):
    """Create a Spark runner.

    :param max_output_files: limit on the number of output files when
                             running streaming jobs. Can only be set on
                             the command line (not in a config file)
    :param mrjob_cls: class of the job you want to run. Used for
                      running streaming steps in Spark

    The keyword arguments *hadoop_input_format*, *hadoop_output_format*,
    and *sort_values* (see :py:meth:`MRJobRunner.__init__`) are ignored
    by SparkMRJobRunner. The job only sets them to communicate certain
    attributes to the runner, and the Spark runner inspects the job
    directly instead.
    """
    # must be in place before the superclass __init__() checks steps
    self._mrjob_cls = mrjob_cls

    super(SparkMRJobRunner, self).__init__(**kwargs)

    self._max_output_files = max_output_files

    tmp_dir = self._pick_spark_tmp_dir()
    self._spark_tmp_dir = tmp_dir

    # an upload manager is only created here when the tmp dir is a URI;
    # the trailing '' makes the joined files dir end with a slash
    if is_uri(tmp_dir):
        files_uri = posixpath.join(tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(files_uri)

    # default the job output location if it was not set explicitly
    if not self._output_dir:
        self._output_dir = self.fs.join(tmp_dir, 'output')

    # remember where the spark-submit binary is
    self._spark_submit_bin = self._opts['spark_submit_bin']

    # path of a .zip file containing the MRJob under a unique module
    # name; not known yet
    self._job_script_zip_path = None

    # one entry per job step. (Non-streaming steps get {} because Spark
    # doesn't have counters.)
    self._counters = []
def __init__(self, max_output_files=None, mrjob_cls=None, **kwargs):
    """Build a Spark runner.

    :param max_output_files: limit on the number of output files when
                             running streaming jobs. Can only be set on
                             the command line (not in a config file)
    :param mrjob_cls: class of the job you want to run. Used for
                      running streaming steps in Spark
    """
    # must be in place before the superclass __init__() checks steps
    self._mrjob_cls = mrjob_cls

    super(SparkMRJobRunner, self).__init__(**kwargs)

    self._max_output_files = max_output_files

    # only validate the spark_tmp_dir option when the user supplied one
    if self._opts['spark_tmp_dir']:
        self._check_spark_tmp_dir_opt()

    tmp_dir = self._pick_spark_tmp_dir()
    self._spark_tmp_dir = tmp_dir

    # an upload manager is only created here when the tmp dir is a URI;
    # the trailing '' makes the joined files dir end with a slash
    if is_uri(tmp_dir):
        files_uri = posixpath.join(tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(files_uri)

    # default the job output location if it was not set explicitly
    if not self._output_dir:
        self._output_dir = self.fs.join(tmp_dir, 'output')

    # remember where the spark-submit binary is
    self._spark_submit_bin = self._opts['spark_submit_bin']

    # path of a .zip file containing the MRJob under a unique module
    # name; not known yet
    self._job_script_zip_path = None

    # one entry per job step. (Non-streaming steps get {} because Spark
    # doesn't have counters.)
    self._counters = []
def __init__(self, **kwargs):
    """:py:class:`~mrjob.dataproc.DataprocJobRunner` accepts every
    argument that :py:class:`~mrjob.runner.MRJobRunner` does, along with
    some extra options that can be given defaults in
    :ref:`mrjob.conf <mrjob.conf>`.
    """
    super(DataprocJobRunner, self).__init__(**kwargs)

    # check for library support
    if google is None:
        raise ImportError('You must install google-cloud-logging and '
                          'google-cloud-storage to connect to Dataproc')

    # Dataproc requires a master and >= 2 core instances.
    # num_core_instances counts ONLY the core instances; the one required
    # master instance is NOT included. So the smallest cluster is
    # 3 machines: 1 master and 2 "num_core_instances" workers.
    worker_count = self._opts['num_core_instances']
    if worker_count < _DATAPROC_MIN_WORKERS:
        raise DataprocException(
            'Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS)

    core_type = self._opts['core_instance_type']
    task_type = self._opts['task_instance_type']
    if core_type != task_type:
        raise DataprocException(
            'Dataproc v1 expects core/task instance types to be identical')

    # see #1820
    if self._opts['image_id']:
        log.warning('mrjob does not yet support custom machine images'
                    ' on Dataproc')

    # load credentials and project ID
    # (the scopes are needed for $GOOGLE_APPLICATION_CREDENTIALS)
    default_auth = google.auth.default(scopes=[_FULL_SCOPE])
    self._credentials, auth_project_id = default_auth

    # an explicit project_id option wins over the one from credentials
    project_id = self._opts['project_id'] or auth_project_id
    self._project_id = project_id
    if not project_id:
        raise DataprocException(
            'project_id must be set. Use --project_id or'
            ' set $GOOGLE_CLOUD_PROJECT')

    self._fix_zone_and_region_opts()

    # replace each configured scope with its fully qualified URI
    if self._opts['service_account_scopes']:
        qualified_scopes = []
        for scope in self._opts['service_account_scopes']:
            qualified_scopes.append(_fully_qualify_scope_uri(scope))
        self._opts['service_account_scopes'] = qualified_scopes

    # cluster_id can be None here
    self._cluster_id = self._opts['cluster_id']

    # clients/filesystems start out unset
    self._api_client = None
    self._gcs_fs = None
    self._fs = None

    # BEGIN - setup directories
    tmp_root = self._get_tmpdir(self._opts['cloud_tmp_dir'])
    self._cloud_tmp_dir = _check_and_fix_fs_dir(tmp_root)

    # the job key makes the tmp dir unique
    self._job_tmpdir = ''.join((self._cloud_tmp_dir, self._job_key, '/'))

    # pick/validate output dir
    self._output_dir = (
        _check_and_fix_fs_dir(self._output_dir) if self._output_dir
        else self._job_tmpdir + 'output/')
    # END - setup directories

    # Local files that we want to upload to GCS are registered with this
    # manager just before they are needed.
    self._upload_mgr = UploadDirManager(self._job_tmpdir + 'files/')

    # when did our particular task start?
    self._dataproc_job_start = None

    # init hadoop, ami version caches
    self._image_version = None
    self._hadoop_version = None

    # maps driver_output_uri to a dict with the keys:
    #   log_uri: uri of file we're reading from
    #   pos: position in file
    #   buffer: bytes read from file already
    self._driver_output_state = {}

    # Filled in by _run_steps().
    # NOTE - log_interpretations will be empty except job_id until we
    # parse task logs
    self._log_interpretations = []
def __init__(self, **kwargs):
    """:py:class:`~mrjob.dataproc.DataprocJobRunner` accepts every
    argument that :py:class:`~mrjob.runner.MRJobRunner` does, along with
    some extra options that can be given defaults in
    :ref:`mrjob.conf <mrjob.conf>`.
    """
    super(DataprocJobRunner, self).__init__(**kwargs)

    # gcloud config is lazy-loaded as needed - invocations fail in
    # PyCharm debugging
    self._gcloud_config = None

    # Google Cloud Platform - project
    # (gcloud config is only consulted when the option is unset)
    project = self._opts['gcp_project']
    if not project:
        project = self.gcloud_config()['core.project']
    self._gcp_project = project

    # Google Compute Engine - Region / Zone
    region = self._opts['region']
    if not region:
        region = self.gcloud_config()['compute.region']
    self._gce_region = region

    zone = self._opts['zone']
    if not zone:
        zone = self.gcloud_config()['compute.zone']
    self._gce_zone = zone

    # cluster_id can be None here
    self._cluster_id = self._opts['cluster_id']

    # clients/filesystems start out unset
    self._api_client = None
    self._gcs_fs = None
    self._fs = None

    # BEGIN - setup directories
    tmp_root = self._get_tmpdir(self._opts['cloud_tmp_dir'])
    self._cloud_tmp_dir = _check_and_fix_fs_dir(tmp_root)

    # the job key makes the tmp dir unique
    self._job_tmpdir = ''.join((self._cloud_tmp_dir, self._job_key, '/'))

    # pick/validate output dir
    self._output_dir = (
        _check_and_fix_fs_dir(self._output_dir) if self._output_dir
        else self._job_tmpdir + 'output/')
    # END - setup directories

    # manage working dir for bootstrap script
    self._bootstrap_dir_mgr = BootstrapWorkingDirManager()

    # Local files that we want to upload to GCS are registered with this
    # manager just before they are needed.
    self._upload_mgr = UploadDirManager(self._job_tmpdir + 'files/')

    self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()

    # register every dict token found in the bootstrap commands with the
    # bootstrap working dir manager
    for command in self._bootstrap:
        for token in command:
            if not isinstance(token, dict):
                continue
            self._bootstrap_dir_mgr.add(**token)

    # we'll create the script later
    self._master_bootstrap_script_path = None

    # when did our particular task start?
    self._dataproc_job_start = None

    # init hadoop, ami version caches
    self._image_version = None
    self._hadoop_version = None

    # Filled in by _run_steps().
    # NOTE - log_interpretations will be empty except job_id until we
    # parse task logs
    self._log_interpretations = []
def uri_adds_trailing_slash(self):
    """A prefix given without a trailing slash still yields URIs with a
    ``/`` between the prefix and the file name.
    """
    # NOTE(review): this name lacks the ``test_`` prefix, so if it lives
    # in a unittest.TestCase it is presumably never collected by default
    # test discovery -- confirm whether that is intentional.
    manager = UploadDirManager('s3://bucket/dir')
    local_path = 'foo/bar.py'
    expected_uri = 's3://bucket/dir/bar.py'

    manager.add(local_path)

    self.assertEqual(manager.uri(local_path), expected_uri)
    self.assertEqual(manager.path_to_uri(), {local_path: expected_uri})
def test_uri(self):
    """``uri()`` returns the upload URI of a path that has been added."""
    manager = UploadDirManager('hdfs:///')
    local_path = 'foo/bar.py'

    manager.add(local_path)

    self.assertEqual(manager.uri(local_path), 'hdfs:///bar.py')
def test_add_is_idempotent(self):
    """Adding the same path twice leaves the path-to-URI map unchanged."""
    manager = UploadDirManager('hdfs:///')
    expected = {'foo/bar.py': 'hdfs:///bar.py'}

    # the mapping must be the same after the first and the second add
    for _ in range(2):
        manager.add('foo/bar.py')
        self.assertEqual(manager.path_to_uri(), expected)
def test_simple(self):
    """A single added path maps to the prefix plus its base name."""
    manager = UploadDirManager('hdfs:///')

    manager.add('foo/bar.py')

    expected = {'foo/bar.py': 'hdfs:///bar.py'}
    self.assertEqual(manager.path_to_uri(), expected)
def test_empty(self):
    """A manager with no added paths reports an empty mapping."""
    manager = UploadDirManager('hdfs:///')
    mapping = manager.path_to_uri()
    self.assertEqual(mapping, {})
def __init__(self, **kwargs):
    """:py:class:`~mrjob.dataproc.DataprocJobRunner` accepts every
    argument that :py:class:`~mrjob.runner.MRJobRunner` does, along with
    some extra options that can be given defaults in
    :ref:`mrjob.conf <mrjob.conf>`.
    """
    super(DataprocJobRunner, self).__init__(**kwargs)

    # Dataproc requires a master and >= 2 core instances.
    # num_core_instances counts ONLY the core instances; the one required
    # master instance is NOT included. So the smallest cluster is
    # 3 machines: 1 master and 2 "num_core_instances" workers.
    worker_count = self._opts['num_core_instances']
    if worker_count < _DATAPROC_MIN_WORKERS:
        raise DataprocException(
            'Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS)

    core_type = self._opts['core_instance_type']
    task_type = self._opts['task_instance_type']
    if core_type != task_type:
        raise DataprocException(
            'Dataproc v1 expects core/task instance types to be identical')

    # gcloud config is lazy-loaded as needed - invocations fail in
    # PyCharm debugging
    self._gcloud_config = None

    # Google Cloud Platform - project
    # (gcloud config is only consulted when the option is unset)
    project = self._opts['gcp_project']
    if not project:
        project = self.gcloud_config()['core.project']
    self._gcp_project = project

    # Google Compute Engine - Region / Zone
    region = self._opts['region']
    if not region:
        region = self.gcloud_config()['compute.region']
    self._gce_region = region

    zone = self._opts['zone']
    if not zone:
        zone = self.gcloud_config()['compute.zone']
    self._gce_zone = zone

    # cluster_id can be None here
    self._cluster_id = self._opts['cluster_id']

    # clients/filesystems start out unset
    self._api_client = None
    self._gcs_fs = None
    self._fs = None

    # BEGIN - setup directories
    tmp_root = self._get_tmpdir(self._opts['cloud_tmp_dir'])
    self._cloud_tmp_dir = _check_and_fix_fs_dir(tmp_root)

    # the job key makes the tmp dir unique
    self._job_tmpdir = ''.join((self._cloud_tmp_dir, self._job_key, '/'))

    # pick/validate output dir
    self._output_dir = (
        _check_and_fix_fs_dir(self._output_dir) if self._output_dir
        else self._job_tmpdir + 'output/')
    # END - setup directories

    # Local files that we want to upload to GCS are registered with this
    # manager just before they are needed.
    self._upload_mgr = UploadDirManager(self._job_tmpdir + 'files/')

    # when did our particular task start?
    self._dataproc_job_start = None

    # init hadoop, ami version caches
    self._image_version = None
    self._hadoop_version = None

    # Filled in by _run_steps().
    # NOTE - log_interpretations will be empty except job_id until we
    # parse task logs
    self._log_interpretations = []
def __init__(self, **kwargs):
    """:py:class:`~mrjob.dataproc.DataprocJobRunner` accepts every
    argument that :py:class:`~mrjob.runner.MRJobRunner` does, along with
    some extra options that can be given defaults in
    :ref:`mrjob.conf <mrjob.conf>`.
    """
    super(DataprocJobRunner, self).__init__(**kwargs)

    # check for library support
    if google is None:
        raise ImportError(
            'You must install google-cloud and google-cloud-dataproc'
            ' to connect to Dataproc')

    # Dataproc requires a master and >= 2 core instances.
    # num_core_instances counts ONLY the core instances; the one required
    # master instance is NOT included. So the smallest cluster is
    # 3 machines: 1 master and 2 "num_core_instances" workers.
    worker_count = self._opts['num_core_instances']
    if worker_count < _DATAPROC_MIN_WORKERS:
        raise DataprocException(
            'Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS)

    core_type = self._opts['core_instance_type']
    task_type = self._opts['task_instance_type']
    if core_type != task_type:
        raise DataprocException(
            'Dataproc v1 expects core/task instance types to be identical')

    # load credentials and project ID
    default_auth = google.auth.default(
        scopes=_DEFAULT_GCE_SERVICE_ACCOUNT_SCOPES)
    self._credentials, auth_project_id = default_auth

    # an explicit project_id option wins over the one from credentials
    project_id = self._opts['project_id'] or auth_project_id
    self._project_id = project_id
    if not project_id:
        raise DataprocException(
            'project_id must be set. Use --project_id or'
            ' set $GOOGLE_CLOUD_PROJECT')

    self._fix_zone_and_region_opts()

    # cluster_id can be None here
    self._cluster_id = self._opts['cluster_id']

    # clients/filesystems start out unset
    self._api_client = None
    self._gcs_fs = None
    self._fs = None

    # BEGIN - setup directories
    tmp_root = self._get_tmpdir(self._opts['cloud_tmp_dir'])
    self._cloud_tmp_dir = _check_and_fix_fs_dir(tmp_root)

    # the job key makes the tmp dir unique
    self._job_tmpdir = ''.join((self._cloud_tmp_dir, self._job_key, '/'))

    # pick/validate output dir
    self._output_dir = (
        _check_and_fix_fs_dir(self._output_dir) if self._output_dir
        else self._job_tmpdir + 'output/')
    # END - setup directories

    # Local files that we want to upload to GCS are registered with this
    # manager just before they are needed.
    self._upload_mgr = UploadDirManager(self._job_tmpdir + 'files/')

    # when did our particular task start?
    self._dataproc_job_start = None

    # init hadoop, ami version caches
    self._image_version = None
    self._hadoop_version = None

    # Filled in by _run_steps().
    # NOTE - log_interpretations will be empty except job_id until we
    # parse task logs
    self._log_interpretations = []