def _step_output_uri(self, step_num):
    """URI to use as output for the given step.

    This is either an intermediate dir (see
    :py:meth:`intermediate_output_uri`) or ``self._output_dir``
    for the final step.
    """
    last_step_num = len(self._get_steps()) - 1

    # every step but the last writes to its own intermediate dir
    if step_num != last_step_num:
        return to_uri(self._intermediate_output_dir(step_num))

    return to_uri(self._output_dir)
def _step_input_uris(self, step_num):
    """A list of URIs to use as input for the given step.

    For all except the first step, this list will have a single item
    (a directory).
    """
    if step_num > 0:
        # later steps read the previous step's output dir
        return [to_uri(self._intermediate_output_dir(step_num - 1))]

    # the first step reads the job's input paths, uploaded if needed
    uris = []
    for input_path in self._get_input_paths():
        if self._upload_mgr:
            uris.append(self._upload_mgr.uri(input_path))
        else:
            uris.append(to_uri(input_path))
    return uris
def _step_input_uris(self, step_num):
    """A list of URIs to use as input for the given step.

    For all except the first step, this list will have a single item
    (a directory).
    """
    # NOTE(review): this appears to duplicate an identical
    # _step_input_uris defined earlier in this file; in Python the
    # later definition silently wins. Confirm and remove one copy.
    if step_num == 0:
        # first step: the job's own input paths; route each through
        # the upload manager when one exists (remote filesystems)
        return [
            self._upload_mgr.uri(path) if self._upload_mgr
            else to_uri(path)
            for path in self._get_input_paths()
        ]
    else:
        # later steps: read the previous step's intermediate output dir
        return [to_uri(self._intermediate_output_dir(step_num - 1))]
def test_relative_path_to_uri(self):
    # resolve symlinks so the expected URI matches what to_uri()
    # produces after resolving the cwd
    tmp_dir = realpath(gettempdir())

    with save_cwd():
        chdir(tmp_dir)
        uri = to_uri('foo.db')

        # a relative path is resolved against the current working dir
        self.assertEqual(uri[:8], 'file:///')
        expected = 'file://' + join(pathname2url(tmp_dir), 'foo.db')
        self.assertEqual(uri, expected)
def _check_spark_tmp_dir_opt(self):
    # warn if spark_tmp_dir isn't actually visible to Spark executors
    # (see #2062)
    spark_tmp_dir = self._opts['spark_tmp_dir']
    spark_master = self._spark_master()

    tmp_dir_on_local_fs = to_uri(spark_tmp_dir).startswith('file://')
    master_is_local = spark_master.startswith('local')

    # a local tmp dir with a remote master (or vice versa) means the
    # executors and the tmp dir live on different filesystems
    if tmp_dir_on_local_fs != master_is_local:
        log.warning(
            'Warning: executors on Spark master %s may not be able to'
            ' access spark_tmp_dir %s' % (spark_master, spark_tmp_dir))
def test_local_output_dir_and_step_output_dir(self):
    input1_path = self.makefile('input1')
    input2_path = self.makefile('input2')
    output_dir = self.makedirs('output')
    step_output_dir = self.makedirs('step_output')

    # this has three steps, which lets us test step numbering
    job = MRCountingJob([
        '-r', 'local',
        '--output-dir', output_dir,
        '--step-output-dir', step_output_dir,
        input1_path, input2_path])
    job.sandbox()

    with job.make_runner() as runner:
        self.assertEqual(runner._num_steps(), 3)

        # step 0 reads the input files, as file:// URIs
        first_inputs = runner._step_input_uris(0)
        self.assertEqual(
            [os.path.basename(u) for u in first_inputs],
            ['input1', 'input2'])
        self.assertEqual(
            [u[:8] for u in first_inputs],
            ['file:///', 'file:///'])

        # intermediate steps write to numbered subdirs of
        # --step-output-dir; each later step reads the previous
        # step's output; the last step writes to --output-dir
        prev_output_uri = None
        for step_num in range(3):
            if step_num > 0:
                self.assertEqual(
                    runner._step_input_uris(step_num),
                    [prev_output_uri])

            prev_output_uri = runner._step_output_uri(step_num)
            if step_num < 2:
                expected = to_uri(
                    os.path.join(step_output_dir, '%04d' % step_num))
            else:
                expected = to_uri(output_dir)
            self.assertEqual(prev_output_uri, expected)
def test_to_uri(self):
    # (input, expected) pairs: absolute local paths gain a file://
    # scheme; anything already a URI passes through unchanged
    cases = [
        ('/path/to/file', 'file:///path/to/file'),
        ('s3://a/uri', 's3://a/uri'),
    ]
    for path, expected in cases:
        self.assertEqual(to_uri(path), expected)