# Assumed imports for the test methods below (the original excerpt does
# not show them). The methods reference self, so they presumably belong
# to a TestCase for the Spark runner; mrjob_conf_patcher and
# self.start() are assumed to come from the test suite's sandbox
# helpers.
from io import BytesIO
from os import listdir
from os.path import join
from unittest.mock import patch

from mrjob.examples.mr_word_freq_count import MRWordFreqCount


def test_max_output_files(self):
    job = MRWordFreqCount(['-r', 'spark', '--max-output-files', '1'])
    job.sandbox(stdin=BytesIO(b'one two one\n two three\n'))

    with job.make_runner() as runner:
        runner.run()

        self.assertEqual(self._num_output_files(runner), 1)

def test_num_reducers(self):
    jobconf_args = ['--jobconf', 'mapreduce.job.reduces=1']

    job = MRWordFreqCount(['-r', 'spark'] + jobconf_args)
    job.sandbox(stdin=BytesIO(b'one two one\n two three\n'))

    with job.make_runner() as runner:
        runner.run()

        self.assertEqual(self._num_output_files(runner), 1)

def test_basic_job(self):
    job = MRWordFreqCount(['-r', 'spark'])
    job.sandbox(
        stdin=BytesIO(b'one fish\ntwo fish\nred fish\nblue fish\n'))

    with job.make_runner() as runner:
        runner.run()

        output = dict(job.parse_output(runner.cat_output()))

        self.assertEqual(output,
                         dict(blue=1, fish=4, one=1, red=1, two=1))

def test_max_output_files_is_cmd_line_only(self):
    self.start(mrjob_conf_patcher(
        dict(runners=dict(spark=dict(max_output_files=1)))))

    log = self.start(patch('mrjob.runner.log'))

    job = MRWordFreqCount(['-r', 'spark'])
    job.sandbox(stdin=BytesIO(b'one two one\n two three\n'))

    with job.make_runner() as runner:
        runner.run()

        # by default there should be at least 2 output files
        self.assertNotEqual(self._num_output_files(runner), 1)

    self.assertTrue(log.warning.called)

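# The tests above call self._num_output_files(), which is not defined in
# this excerpt. A minimal sketch, assuming it simply counts the part-*
# files Spark wrote to the runner's output directory:
def _num_output_files(self, runner):
    # one part-* file is written per output partition
    return sum(
        1 for f in listdir(runner.get_output_dir())
        if f.startswith('part'))
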
import re

from mrjob import protocol
from mrjob.job import MRJob

WORD_RE = re.compile(r"[\w']+")


class MRWordFreqCount(MRJob):

    OUTPUT_PROTOCOL = protocol.JSONProtocol

    def mapper(self, _, line):
        for word in WORD_RE.findall(line):
            yield (word.lower(), 1)

    def reducer(self, word, counts):
        yield (word, sum(counts))


if __name__ == '__main__':
    MRWordFreqCount().run()

def test_file_uris_as_input(self):
    input1 = self.makefile('input1.txt', b'cat rat bat')
    input2 = 'file://' + self.makefile('input2.txt', b'dog dog dog')

    job = MRWordFreqCount([input1, input2])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        self.assertEqual(dict(job.parse_output(runner.cat_output())),
                         dict(bat=1, cat=1, dog=3, rat=1))

def test_the_wheels_on_the_bus(self):
    RAW_INPUT = b"""
        The wheels on the bus go round and round,
        round and round,
        round and round
        The wheels on the bus go round and round,
        all through the town.
        """

    EXPECTED_OUTPUT = {
        u'all': 1,
        u'and': 4,
        u'bus': 2,
        u'go': 2,
        u'on': 2,
        u'round': 8,
        u'the': 5,
        u'through': 1,
        u'town': 1,
        u'wheels': 2,
    }

    self.assertEqual(run_job(MRWordFreqCount(), RAW_INPUT),
                     EXPECTED_OUTPUT)

def test_compression(self):
    # deliberately mix Hadoop 1 and 2 config properties
    jobconf_args = [
        '--jobconf',
        ('mapred.output.compression.codec='
         'org.apache.hadoop.io.compress.GzipCodec'),
        '--jobconf',
        'mapreduce.output.fileoutputformat.compress=true',
    ]

    job = MRWordFreqCount(['-r', 'spark'] + jobconf_args)
    job.sandbox(stdin=BytesIO(b'fa la la la la\nla la la la\n'))

    with job.make_runner() as runner:
        runner.run()

        self.assertTrue(
            runner.fs.exists(join(runner.get_output_dir(), 'part*.gz')))

        self.assertEqual(dict(job.parse_output(runner.cat_output())),
                         dict(fa=1, la=8))

def test_empty(self):
    self.assertEqual(run_job(MRWordFreqCount()), {})

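# run_job() is also not defined in this excerpt. A minimal sketch,
# assuming it sandboxes the job on the given raw input, runs it, and
# returns the parsed output as a dict, consistent with how
# test_the_wheels_on_the_bus and test_empty call it:
def run_job(job, raw_input=b''):
    job.sandbox(stdin=BytesIO(raw_input))
    with job.make_runner() as runner:
        runner.run()
        return dict(job.parse_output(runner.cat_output()))
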
# Driver script: run the word-frequency job and print the ten most
# frequent words.
import sys
from operator import itemgetter

from mrjob.examples.mr_word_freq_count import MRWordFreqCount

if __name__ == '__main__':
    job = MRWordFreqCount(args=sys.argv[1:])

    results = []

    with job.make_runner() as runner:
        runner.run()

        # collect (word, count) pairs from the job's output
        for key, value in job.parse_output(runner.cat_output()):
            results.append((key, value))

    # sort by count, highest first, and print the top ten
    sorted_results = sorted(results, key=itemgetter(1), reverse=True)
    print(sorted_results[:10])