def test_multi_step_counters(self):
    """Run MRCountingJob on the local runner with two lines of STDIN and
    check that the runner reports one counter dict per step.
    """
    job = MRCountingJob(['-r', 'local', '-'])
    job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

    # three entries, one per step, all with the same expected contents
    expected_counters = [
        {'group': {'counter_name': 2}} for _ in range(3)
    ]

    with job.make_runner() as runner:
        runner.run()

        self.assertEqual(runner.counters(), expected_counters)
def test_multi_step_counters(self):
    """Run MRCountingJob with the test's mrjob.conf and two lines of STDIN,
    then check that the runner reports one counter dict per step.
    """
    job = MRCountingJob(['-c', self.mrjob_conf_path, '-'])
    job.sandbox(stdin=StringIO('foo\nbar\n'))

    # three entries, one per step, all with the same expected contents
    expected_counters = [
        {'group': {'counter_name': 2}} for _ in range(3)
    ]

    with job.make_runner() as runner:
        runner.run()

        self.assertEqual(runner.counters(), expected_counters)
def test_output_dir_and_step_output_dir(self):
    """Check the step input/output URIs chosen by the hadoop runner when
    both --output-dir and --step-output-dir are passed.

    Step 0 should read the two input files, each later step should read
    the previous step's output, intermediate output should be numbered
    under the step output dir, and the last step should write to the
    output dir.
    """
    first_input = self.makefile('input1')
    second_input = self.makefile('input2')

    # MRCountingJob has three steps, which lets us test step numbering
    args = ['-r', 'hadoop']
    args += ['--hadoop-bin', 'false']  # shouldn't run; just in case
    args += ['--output-dir', 'hdfs:///tmp/output']
    args += ['--step-output-dir', 'hdfs://tmp/step-output']
    args += [first_input, second_input]

    job = MRCountingJob(args)
    job.sandbox()

    # expected output URI of each step, in step order
    expected_output_uris = [
        'hdfs://tmp/step-output/0000',
        'hdfs://tmp/step-output/0001',
        'hdfs:///tmp/output',
    ]

    with job.make_runner() as runner:
        self.assertEqual(runner._num_steps(), 3)

        self.add_files_for_upload(runner)

        previous_output_uri = None

        for step_num, expected_uri in enumerate(expected_output_uris):
            step_input_uris = runner._step_input_uris(step_num)

            if step_num == 0:
                # the first step reads the job's input files
                self.assertEqual(
                    [os.path.basename(uri) for uri in step_input_uris],
                    ['input1', 'input2'])
            else:
                # later steps read what the previous step wrote
                self.assertEqual(step_input_uris, [previous_output_uri])

            step_output_uri = runner._step_output_uri(step_num)
            self.assertEqual(step_output_uri, expected_uri)

            previous_output_uri = step_output_uri
def test_local_output_dir_and_step_output_dir(self):
    """Check the step input/output URIs chosen by the local runner when
    both --output-dir and --step-output-dir point at local directories.

    Step 0 should read the two input files as file:/// URIs, each later
    step should read the previous step's output, intermediate output
    should be numbered under the step output dir, and the last step
    should write to the output dir.
    """
    first_input = self.makefile('input1')
    second_input = self.makefile('input2')

    output_dir = self.makedirs('output')
    step_output_dir = self.makedirs('step_output')

    # MRCountingJob has three steps, which lets us test step numbering
    args = ['-r', 'local']
    args += ['--output-dir', output_dir]
    args += ['--step-output-dir', step_output_dir]
    args += [first_input, second_input]

    job = MRCountingJob(args)
    job.sandbox()

    # expected output location of each step (as a local path), in order
    expected_output_paths = [
        os.path.join(step_output_dir, '0000'),
        os.path.join(step_output_dir, '0001'),
        output_dir,
    ]

    with job.make_runner() as runner:
        self.assertEqual(runner._num_steps(), 3)

        previous_output_uri = None

        for step_num, expected_path in enumerate(expected_output_paths):
            step_input_uris = runner._step_input_uris(step_num)

            if step_num == 0:
                # the first step reads the job's input files...
                self.assertEqual(
                    [os.path.basename(uri) for uri in step_input_uris],
                    ['input1', 'input2'])
                # ...and they should be expressed as file:/// URIs
                self.assertEqual(
                    [uri[:8] for uri in step_input_uris],
                    ['file:///', 'file:///'])
            else:
                # later steps read what the previous step wrote
                self.assertEqual(step_input_uris, [previous_output_uri])

            step_output_uri = runner._step_output_uri(step_num)
            self.assertEqual(step_output_uri, to_uri(expected_path))

            previous_output_uri = step_output_uri
def test_multi_step_counters(self):
    """Run MRCountingJob on two lines read from STDIN, drain and parse its
    output, and check the counters recorded on the runner for each step.
    """
    # the only input is STDIN (the '-' argument)
    job_input = StringIO('foo\nbar\n')

    mr_job = MRCountingJob(['-c', self.mrjob_conf_path, '-'])
    mr_job.sandbox(stdin=job_input)

    # parsed (key, value) pairs; collected but not asserted on
    results = []

    # three entries, one per step, all with the same expected contents
    expected_counters = [
        {'group': {'counter_name': 2}} for _ in range(3)
    ]

    with mr_job.make_runner() as runner:
        runner.run()

        parsed_lines = (
            mr_job.parse_output_line(line)
            for line in runner.stream_output())
        # unpacking requires every parsed line to be a (key, value) pair
        results.extend((key, value) for key, value in parsed_lines)

        assert_equal(runner._counters, expected_counters)
def test_gz_split_regression(self):
    """Regression test for file splitting with gzipped input.

    Writes two one-line .gz files and one plain file, asks the local
    runner for file splits, and expects three distinct task_num values
    among the resulting splits.
    """
    gz_path_1 = os.path.join(self.tmp_dir, '1.gz')
    gz_path_2 = os.path.join(self.tmp_dir, '2.gz')
    path_3 = os.path.join(self.tmp_dir, '3')

    # write the two gzipped fixtures, closing each explicitly
    for gz_path, payload in ((gz_path_1, b'x\n'), (gz_path_2, b'y\n')):
        gz_file = gzip.GzipFile(gz_path, 'wb')
        gz_file.write(payload)
        gz_file.close()

    # the plain fixture has no trailing newline
    with open(path_3, 'wb') as plain_file:
        plain_file.write(b'z')

    input_paths = [gz_path_1, gz_path_2, path_3]

    mr_job = MRCountingJob(['--no-conf', '-r', 'local'] + input_paths)

    with mr_job.make_runner() as r:
        splits = r._get_file_splits(input_paths, 1)

        task_nums = set(split['task_num'] for split in splits.values())
        self.assertEqual(len(task_nums), 3)