def test_input_files(self):
    """Run MRWordCount over one plain and one gzipped input file.

    The plain file holds three lines and the gzipped file holds one.
    The job output is expected to pair each input, named by its
    ``file://``-prefixed path, with 3 and 1 respectively, and the
    ``combiners`` counter of the first step must exceed 2.
    """
    plain_path = os.path.join(self.tmp_dir, 'input')
    gz_path = os.path.join(self.tmp_dir, 'input.gz')

    # Build both fixtures before starting the job.
    with open(plain_path, 'wb') as plain_file:
        plain_file.write(b'bar\nqux\nfoo\n')
    with gzip.GzipFile(gz_path, 'wb') as gz_file:
        gz_file.write(b'foo\n')

    job = MRWordCount(['-r', self.RUNNER, plain_path, gz_path])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()
        output = list(job.parse_output(runner.cat_output()))

        # Read the counters while the runner is still open.
        combiner_count = runner.counters()[0]['count']['combiners']
        self.assertGreater(combiner_count, 2)

    expected = [('file://' + plain_path, 3), ('file://' + gz_path, 1)]
    self.assertEqual(sorted(output), expected)
def test_input_files_and_setting_number_of_tasks(self):
    """Run MRWordCount with explicit map/reduce task counts.

    Writes a three-line plain file and a one-line gzipped file, then
    runs the job with ``mapred.map.tasks=3`` and
    ``mapred.reduce.tasks=3`` passed via ``--jobconf``.  The test
    asserts that the first step's ``combiners`` counter is exactly 3
    and that the sorted output pairs each input path with 3 and 1
    respectively.
    """
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'wb') as input_file:
        input_file.write(b'bar\nqux\nfoo\n')

    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    # Use a context manager so the gzip handle is closed (and the
    # trailer flushed) even if the write raises.
    with gzip.GzipFile(input_gz_path, 'wb') as input_gz:
        input_gz.write(b'foo\n')

    mr_job = MRWordCount([
        '-r', self.RUNNER,
        '--jobconf=mapred.map.tasks=3',
        '--jobconf=mapred.reduce.tasks=3',
        input_path, input_gz_path,
    ])
    mr_job.sandbox()

    results = []

    with mr_job.make_runner() as runner:
        runner.run()

        results.extend(mr_job.parse_output(runner.cat_output()))

        # NOTE(review): presumably one combiner runs per map task,
        # hence 3 -- confirm against the runner implementation.
        self.assertEqual(runner.counters()[0]['count']['combiners'], 3)

    self.assertEqual(sorted(results),
                     [(input_path, 3), (input_gz_path, 1)])
def test_input_files(self):
    """Run MRWordCount over one plain and one gzipped input file.

    The plain file holds three lines and the gzipped file holds one.
    The job output is expected to pair each input path (unprefixed)
    with 3 and 1 respectively, and the ``combiners`` counter of the
    first step must exceed 2.
    """
    tmp_dir = self.tmp_dir
    text_path = os.path.join(tmp_dir, 'input')
    gzipped_path = os.path.join(tmp_dir, 'input.gz')

    # Create the uncompressed fixture, then the compressed one.
    with open(text_path, 'wb') as text_file:
        text_file.write(b'bar\nqux\nfoo\n')
    with gzip.GzipFile(gzipped_path, 'wb') as gzipped_file:
        gzipped_file.write(b'foo\n')

    job_args = ['-r', self.RUNNER, text_path, gzipped_path]
    word_count_job = MRWordCount(job_args)
    word_count_job.sandbox()

    with word_count_job.make_runner() as runner:
        runner.run()
        pairs = list(word_count_job.parse_output(runner.cat_output()))

        # Counters are read while the runner context is still active.
        first_step_counters = runner.counters()[0]
        self.assertGreater(first_step_counters['count']['combiners'], 2)

    pairs.sort()
    self.assertEqual(pairs, [(text_path, 3), (gzipped_path, 1)])