Exemplo n.º 1
0
 def test_says_boom(self):
     # Running MRBoom is expected to raise. assertRaises fails the test on
     # its own if run_job() returns normally, so no manual else/raise
     # branch is needed.
     with self.assertRaises(Exception) as caught:
         run_job(MRBoom([]), b'some input')

     # the text of the raised exception must mention BOOM
     self.assertIn('BOOM', str(caught.exception))
Exemplo n.º 2
0
    def test_works_with_built_in_json_module(self):
        # regression test: make sure we're not trying to serialize dict_items
        #
        # both protocol attributes on the job class are patched to
        # StandardJSONProtocol before the job runs
        for protocol_attr in ('INTERNAL_PROTOCOL', 'OUTPUT_PROTOCOL'):
            self.start(patch.object(
                MRTextClassifier, protocol_attr, StandardJSONProtocol))

        docs_pattern = join(
            dirname(mrjob.__file__), 'examples', 'docs-to-classify', '*')

        # use --min-df 1 because we have so few documents
        args = ['--min-df', '1']
        args.extend(glob(docs_pattern))

        # the output is not inspected; the job only has to run to completion
        run_job(MRTextClassifier(args))
Exemplo n.º 3
0
    def test_can_tell_milne_from_whitman(self):
        docs_dir = join(
            dirname(mrjob.__file__), 'examples', 'docs-to-classify')

        # use --min-df 1 because we have so few documents
        args = ['--min-df', '1'] + glob(join(docs_dir, '*'))

        results = run_job(MRTextClassifier(args))

        # NOTE(review): this filter keeps docs whose 'in_test_set' flag is
        # falsy, although the check below is described as being about the
        # test set -- confirm which set is intended
        selected_docs = []
        for key, doc in results.items():
            if key[0] != 'doc' or doc['in_test_set']:
                continue
            selected_docs.append(doc)

        # make sure that there are some docs in the test set
        self.assertGreater(len(selected_docs), 3)

        for doc in selected_docs:
            doc_id = doc['id']

            for cat in ('milne', 'whitman'):
                # a doc is labeled with a category exactly when its score
                # for that category is positive
                labeled = bool(doc['cats'].get(cat))
                scored_positive = bool(doc['cat_to_score'][cat] > 0)

                # include doc ID to make debugging easier
                self.assertEqual(
                    (doc_id, labeled),
                    (doc_id, scored_positive))

        # the empty doc should only be something that appears with no input
        self.assertNotIn(('doc', ''), results)
    def test_files(self):
        # three input files: a single line, two lines, and no content
        input_paths = [
            self.makefile('cats.txt', b'cats are the best'),
            self.makefile('dogs.txt', b'woof woof woof\nwoof woof'),
            self.makefile('empty.txt'),
        ]

        # the job reports one total under the None key
        result = run_job(MRCountLinesRight(input_paths))
        self.assertEqual(result, {None: 3})
Exemplo n.º 5
0
    def test_files(self):
        cats_path = self.makefile('cats.txt', b'cats are the best')
        dogs_path = self.makefile('dogs.txt', b'woof woof woof\nwoof woof')
        blank_path = self.makefile('empty.txt')

        job = MRCountLinesByFile([cats_path, dogs_path, blank_path])
        result = run_job(job)

        # counts are keyed by file:// URI; the empty file has no entry
        expected = {
            'file://' + cats_path: 1,
            'file://' + dogs_path: 2,
        }
        self.assertEqual(result, expected)
    def test_ignore_stop_words(self):
        joke = b"""
        A Car Joke:
        When is a car not a car?
        When it turns into a driveway!
        """

        # the job should emit exactly one result, u'car', under the None key
        # (presumably words such as 'a' and 'when' are skipped as stop words
        # -- confirm against MRMostUsedWord)
        result = run_job(MRMostUsedWord(), joke)
        self.assertEqual(result, {None: u'car'})
Exemplo n.º 7
0
    def test_ignore_stop_words(self):
        # short text in which 'car' shows up three times
        text = b"""
        A Car Joke:
        When is a car not a car?
        When it turns into a driveway!
        """
        most_used = {None: u'car'}

        job = MRMostUsedWord()
        self.assertEqual(run_job(job, text), most_used)
Exemplo n.º 8
0
    def test_frequency(self):
        # two lines of input, with no trailing newline after the second
        raw_lines = b'\n'.join([
            b'Oh where, oh where, has my little dog gone?',
            b'Oh where, oh where can he be?',
        ])

        # keys are (word, next word) pairs; each value looks like
        # [count of word, count of pair, percentage] -- TODO confirm the
        # exact meaning against MRNextWordStats
        expected = {
            ('can', 'he'): [1, 1, 100.0],
            ('dog', 'gone'): [1, 1, 100.0],
            ('has', 'my'): [1, 1, 100.0],
            ('he', 'be'): [1, 1, 100.0],
            ('little', 'dog'): [1, 1, 100.0],
            ('my', 'little'): [1, 1, 100.0],
            ('oh', 'where'): [4, 4, 100.0],
            ('where', 'can'): [4, 1, 25.0],
            ('where', 'has'): [4, 1, 25.0],
            ('where', 'oh'): [4, 2, 50.0],
        }

        result = run_job(MRNextWordStats([]), raw_lines)
        self.assertEqual(result, expected)
Exemplo n.º 9
0
    def test_the_wheels_on_the_bus(self):
        lyrics = b"""
        The wheels on the bus go round and round,
        round and round, round and round
        The wheels on the bus go round and round,
        all through the town.
        """

        # only the words that contain the letter u are expected back
        counts_with_u = {
            u'bus': 2,
            u'round': 8,
            u'through': 1,
        }

        job = MRWordsContainingUFreqCount(['-r', 'local'])
        result = run_job(job, lyrics)
        self.assertEqual(result, counts_with_u)
Exemplo n.º 10
0
    def test_input_manifest(self):
        # first WET file: written uncompressed into memory and handed to
        # the job as raw input
        stdin_wet = BytesIO()
        stdin_writer = WARCWriter(stdin_wet, gzip=False)

        stdin_records = (
            ('https://nophonenumbershere.info', b'THIS-IS-NOT-A-NUMBER'),
            ('https://big.directory/',
             b'The Time: (612) 777-9311\nJenny: (201) 867-5309\n'),
        )
        for url, payload in stdin_records:
            write_conversion_record(stdin_writer, url, payload)

        # second WET file: gzipped, written to the temp dir
        gz_path = join(self.tmp_dir, 'wet2.warc.wet.gz')
        with open(gz_path, 'wb') as gz_file:
            gz_writer = WARCWriter(gz_file, gzip=True)
            write_conversion_record(
                gz_writer,
                'https://jseventplanning.biz/',
                b'contact us at +1 201 867 5309')

        # the job gets the gzipped path plus '-' (presumably "read the raw
        # input too" -- confirm against run_job)
        job = MRPhoneToURL(['-r', self.RUNNER, gz_path, '-'])
        result = run_job(job, raw_input=stdin_wet.getvalue())
        self.assertEqual(result, self.EXPECTED_OUTPUT)
Exemplo n.º 11
0
    def test_the_wheels_on_the_bus(self):
        lyrics = b"""
        The wheels on the bus go round and round,
        round and round, round and round
        The wheels on the bus go round and round,
        all through the town.
        """

        # every word of the lyrics, lowercased, with its number of uses
        word_counts = {
            u'all': 1,
            u'and': 4,
            u'bus': 2,
            u'go': 2,
            u'on': 2,
            u'round': 8,
            u'the': 5,
            u'through': 1,
            u'town': 1,
            u'wheels': 2,
        }

        result = run_job(MRWordFreqCount(), lyrics)
        self.assertEqual(result, word_counts)
Exemplo n.º 12
0
    def test_the_wheels_on_the_bus(self):
        # four lines of song lyrics, wrapped in leading/trailing whitespace
        song = b"""
        The wheels on the bus go round and round,
        round and round, round and round
        The wheels on the bus go round and round,
        all through the town.
        """
        expected_counts = {
            u'all': 1,
            u'and': 4,
            u'bus': 2,
            u'go': 2,
            u'on': 2,
            u'round': 8,
            u'the': 5,
            u'through': 1,
            u'town': 1,
            u'wheels': 2,
        }

        job = MRWordFreqCount()
        # the job's output must match the per-word counts exactly
        self.assertEqual(run_job(job, song), expected_counts)
Exemplo n.º 13
0
    def test_setup_cmd(self):
        # write two conversion records into a WET file in the temp dir
        wet_path = join(self.tmp_dir, 'wet.warc.wet.gz')
        records = (
            ('https://big.directory/',
             b'The Time: (612) 777-9311\nJenny: (201) 867-5309\n'),
            ('https://jseventplanning.biz/',
             b'contact us at +1 201 867 5309'),
        )
        with open(wet_path, 'wb') as wet_file:
            warc_writer = WARCWriter(wet_file)
            for url, payload in records:
                write_conversion_record(warc_writer, url, payload)

        # the --setup command creates this marker file
        marker_path = join(self.tmp_dir, 'touched')
        touch_cmd = 'touch ' + marker_path

        # nothing has created the marker before the job runs
        self.assertFalse(exists(marker_path))

        job_args = ['-r', self.RUNNER, '--setup', touch_cmd, wet_path]
        result = run_job(MRPhoneToURL(job_args))
        self.assertEqual(result, self.EXPECTED_OUTPUT)

        # the marker exists once the job has run
        self.assertTrue(exists(marker_path))
Exemplo n.º 14
0
    def test_setup_cmd(self):
        tmp_dir = self.tmp_dir
        input_path = join(tmp_dir, 'wet.warc.wet.gz')

        with open(input_path, 'wb') as out:
            record_writer = WARCWriter(out)

            # one record with two phone numbers, one record with a single one
            write_conversion_record(
                record_writer,
                'https://big.directory/',
                b'The Time: (612) 777-9311\nJenny: (201) 867-5309\n')
            write_conversion_record(
                record_writer,
                'https://jseventplanning.biz/',
                b'contact us at +1 201 867 5309')

        flag_path = join(tmp_dir, 'touched')
        setup = 'touch ' + flag_path

        # the flag file must not exist before the job runs
        self.assertFalse(exists(flag_path))

        job = MRPhoneToURL(['-r', self.RUNNER, '--setup', setup, input_path])
        self.assertEqual(run_job(job), self.EXPECTED_OUTPUT)

        # ...and it must exist afterwards, showing the setup command ran
        self.assertTrue(exists(flag_path))
Exemplo n.º 15
0
    def test_input_manifest(self):
        # in-memory, uncompressed WET data that is passed as raw input
        raw_buffer = BytesIO()
        raw_writer = WARCWriter(raw_buffer, gzip=False)

        write_conversion_record(
            raw_writer,
            'https://nophonenumbershere.info',
            b'THIS-IS-NOT-A-NUMBER')
        write_conversion_record(
            raw_writer,
            'https://big.directory/',
            b'The Time: (612) 777-9311\nJenny: (201) 867-5309\n')

        # gzipped WET file on disk, passed to the job by path
        disk_path = join(self.tmp_dir, 'wet2.warc.wet.gz')
        with open(disk_path, 'wb') as disk_file:
            write_conversion_record(
                WARCWriter(disk_file, gzip=True),
                'https://jseventplanning.biz/',
                b'contact us at +1 201 867 5309')

        job_args = ['-r', self.RUNNER, disk_path, '-']
        output = run_job(
            MRPhoneToURL(job_args), raw_input=raw_buffer.getvalue())

        self.assertEqual(output, self.EXPECTED_OUTPUT)
Exemplo n.º 16
0
 def test_empty(self):
     # with no input given, the word count job yields an empty result
     result = run_job(MRWordFreqCount())
     self.assertEqual(result, {})
Exemplo n.º 17
0
 def test_empty(self):
     # all three counters are still reported, each as zero
     zero_counts = {'chars': 0, 'lines': 0, 'words': 0}

     result = run_job(MRWordCountUtility([]))
     self.assertEqual(result, zero_counts)
Exemplo n.º 18
0
 def test_empty(self):
     # no input means no next-word stats at all
     job = MRNextWordStats([])
     self.assertEqual(run_job(job), {})
Exemplo n.º 19
0
 def test_empty(self):
     # no input, so no phone numbers and no URLs are reported
     result = run_job(MRPhoneToURL())
     self.assertEqual(result, {})
Exemplo n.º 20
0
 def test_empty(self):
     # even with a sample size of 100, empty input gives empty output
     sampler_args = ['--sample-size', '100']
     result = run_job(MRLogSampler(sampler_args))
     self.assertEqual(result, {})
Exemplo n.º 21
0
 def test_empty(self):
     job = MRPhoneToURL()
     # running the job without any input must give an empty dict
     self.assertEqual(run_job(job), {})
Exemplo n.º 22
0
 def test_empty(self):
     job = MRWordFreqCount()
     # nothing to count, so nothing is emitted
     self.assertEqual(run_job(job), {})
Exemplo n.º 23
0
    def test_two_lines(self):
        # two newline-terminated lines: 20 bytes and 5 words in total
        text = b'dog dog dog\ncat cat\n'
        expected = {'chars': 20, 'words': 5, 'lines': 2}

        result = run_job(MRWordCountUtility([]), text)
        self.assertEqual(result, expected)
Exemplo n.º 24
0
 def test_empty(self):
     # with no input files, a zero total is still reported under None
     result = run_job(MRCountLinesRight([]))
     self.assertEqual(result, {None: 0})
Exemplo n.º 25
0
 def test_empty(self):
     # with no input the classifier emits one placeholder doc: empty ID,
     # no categories, no scores, flagged as being in the test set
     placeholder_doc = {
         'cats': {},
         'cat_to_score': {},
         'id': '',
         'in_test_set': True,
     }

     result = run_job(MRTextClassifier([]))
     self.assertEqual(result, {('doc', ''): placeholder_doc})
Exemplo n.º 26
0
 def test_empty(self):
     # no words at all, so no most-used word is reported
     result = run_job(MRMostUsedWord())
     self.assertEqual(result, {})
Exemplo n.º 27
0
 def test_empty(self):
     job = MRMostUsedWord()
     # empty input must produce an empty result dict
     self.assertEqual(run_job(job), {})
Exemplo n.º 28
0
 def test_empty(self):
     # no input files, so there are no per-file line counts
     result = run_job(MRCountLinesByFile([]))
     self.assertEqual(result, {})
Exemplo n.º 29
0
 def test_empty(self):
     # run with the local runner and no input: nothing should come back
     job = MRWordsContainingUFreqCount(['-r', 'local'])
     result = run_job(job)
     self.assertEqual(result, {})