def test_empty(self):
    # Runs on the local runner rather than inline: per the original
    # note, this doesn't work on the inline runner because Spark
    # doesn't have a working dir to upload stop_words.txt to. (The
    # original note also points "below" for what does and doesn't
    # work in the inline runner.)
    wordcount_job = MRSparkScriptWordcount(['-r', 'local'])
    # sandbox() is called with no stdin, so the job gets no input here
    wordcount_job.sandbox()

    with wordcount_job.make_runner() as local_runner:
        local_runner.run()

        # no input was supplied, so no output lines are expected
        output_lines = list(to_lines(local_runner.cat_output()))
        output_lines.sort()
        self.assertEqual(output_lines, [])
def test_spark_script_mrjob(self):
    # Feed four short lines to the job on the 'spark' runner and check
    # the per-word counts parsed back from its output.
    #
    # NOTE(review): a second method with this same name (using the
    # 'local' runner) appears right after this one; if both live in the
    # same class, the later definition replaces this one -- confirm.
    wordcount_job = MRSparkScriptWordcount(['-r', 'spark'])
    wordcount_job.sandbox(
        stdin=BytesIO(b'one fish\ntwo fish\nred fish\nblue fish\n'))

    with wordcount_job.make_runner() as spark_runner:
        spark_runner.run()

        # each output line is evaluated into a (word, count) pair
        word_counts = {
            word: count
            for word, count in map(
                safeeval, to_lines(spark_runner.cat_output()))
        }

    expected_counts = dict(blue=1, fish=4, one=1, red=1, two=1)
    self.assertEqual(word_counts, expected_counts)
def test_spark_script_mrjob(self):
    # Same word-count check as a regular job run, but on the 'local'
    # runner: four short lines in, per-word totals out.
    #
    # NOTE(review): another method with this same name (using the
    # 'spark' runner) appears just before this one; if both live in the
    # same class, this definition replaces the earlier one -- confirm.
    input_bytes = b'one fish\ntwo fish\nred fish\nblue fish\n'
    expected = {'blue': 1, 'fish': 4, 'one': 1, 'red': 1, 'two': 1}

    wc_job = MRSparkScriptWordcount(['-r', 'local'])
    wc_job.sandbox(stdin=BytesIO(input_bytes))

    tallies = {}
    with wc_job.make_runner() as local_runner:
        local_runner.run()

        # every output line evaluates to a (key, value) pair
        parsed_pairs = map(safeeval, to_lines(local_runner.cat_output()))
        tallies.update((key, value) for key, value in parsed_pairs)

    self.assertEqual(tallies, expected)
def test_count_words(self):
    # Run the word count on the 'local' runner over three lines of
    # "Mary had a little lamb" and compare the sorted (word, count)
    # pairs against the expected totals.
    rhyme = b'Mary had a little lamb\nlittle lamb\nlittle lamb'
    expected_pairs = [
        ('a', 1),
        ('had', 1),
        ('lamb', 3),
        ('little', 3),
        ('mary', 1),
    ]

    wordcount_job = MRSparkScriptWordcount(['-r', 'local'])
    wordcount_job.sandbox(stdin=BytesIO(rhyme))

    with wordcount_job.make_runner() as local_runner:
        local_runner.run()

        # evaluate each output line, then sort so the comparison does
        # not depend on the order the job emitted them in
        actual_pairs = [
            safeeval(raw_line)
            for raw_line in to_lines(local_runner.cat_output())
        ]
        actual_pairs.sort()

        self.assertEqual(actual_pairs, expected_pairs)
def test_no_spark_script_steps(self):
    # Sanity check only (_STEP_TYPES is tested in a lot of other ways):
    # asking for a runner with '-r inline' is expected to raise
    # NotImplementedError.
    inline_job = MRSparkScriptWordcount(['-r', 'inline'])
    inline_job.sandbox()

    with self.assertRaises(NotImplementedError):
        inline_job.make_runner()