def test_says_boom(self):
    # Running MRBoom must fail, and the error text must mention 'BOOM'.
    # assertRaises already fails the test when nothing is raised, so no
    # hand-written try/except/else bookkeeping is needed.
    with self.assertRaises(Exception) as caught:
        run_job(MRBoom([]), b'some input')

    self.assertIn('BOOM', str(caught.exception))
def test_works_with_built_in_json_module(self):
    # regression test: make sure we're not trying to serialize dict_items
    for protocol_attr in ('INTERNAL_PROTOCOL', 'OUTPUT_PROTOCOL'):
        self.start(patch.object(
            MRTextClassifier, protocol_attr, StandardJSONProtocol))

    docs_pattern = join(
        dirname(mrjob.__file__), 'examples', 'docs-to-classify', '*')

    # use --min-df 1 because we have so few documents
    args = ['--min-df', '1']
    args.extend(glob(docs_pattern))

    # nothing is asserted on the output; the job just has to finish
    # without raising
    run_job(MRTextClassifier(args))
def test_can_tell_milne_from_whitman(self):
    docs_pattern = join(
        dirname(mrjob.__file__), 'examples', 'docs-to-classify', '*')

    # use --min-df 1 because we have so few documents
    args = ['--min-df', '1'] + glob(docs_pattern)
    results = run_job(MRTextClassifier(args))

    # NOTE(review): this keeps the 'doc' records whose 'in_test_set' flag
    # is falsy, even though the check below talks about "the test set" --
    # confirm that is the intended selection.
    checked_docs = []
    for key, doc in results.items():
        if key[0] != 'doc':
            continue
        if doc['in_test_set']:
            continue
        checked_docs.append(doc)

    # make sure that there are some docs in the test set
    self.assertGreater(len(checked_docs), 3)

    for doc in checked_docs:
        # include doc ID to make debugging easier
        doc_id = doc['id']
        for cat in ('milne', 'whitman'):
            labeled = bool(doc['cats'].get(cat))
            scored_positive = bool(doc['cat_to_score'][cat] > 0)
            self.assertEqual((doc_id, labeled), (doc_id, scored_positive))

    # the empty doc should only be something that appears with no input
    self.assertNotIn(('doc', ''), results)
def test_files(self):
    # Three inputs: a one-line file, a two-line file, and a file created
    # without any contents argument. The job should report one grand total.
    input_paths = [
        self.makefile('cats.txt', b'cats are the best'),
        self.makefile('dogs.txt', b'woof woof woof\nwoof woof'),
        self.makefile('empty.txt'),
    ]

    total = run_job(MRCountLinesRight(input_paths))

    self.assertEqual(total, {None: 3})
def test_files(self):
    cats_path = self.makefile('cats.txt', b'cats are the best')
    dogs_path = self.makefile('dogs.txt', b'woof woof woof\nwoof woof')
    blank_path = self.makefile('empty.txt')

    # Output is keyed by the file:// form of each path; no entry is
    # expected for empty.txt.
    line_counts = [(cats_path, 1), (dogs_path, 2)]
    expected = {'file://' + path: count for path, count in line_counts}

    job = MRCountLinesByFile([cats_path, dogs_path, blank_path])

    self.assertEqual(run_job(job), expected)
def test_ignore_stop_words(self):
    # 'a' occurs more often than 'car' in this text, yet 'car' is the
    # expected most-used word.
    joke = (
        b' A Car Joke: When is a car not a car? '
        b'When it turns into a driveway! ')

    most_used = run_job(MRMostUsedWord(), joke)

    self.assertEqual(most_used, {None: u'car'})
def test_frequency(self):
    lyrics = (b'Oh where, oh where, has my little dog gone?\n'
              b'Oh where, oh where can he be?')

    # Keys are (word, next word) pairs.
    # NOTE(review): each value looks like [occurrences of the first word,
    # occurrences of the pair, percentage] -- confirm against the job.
    expected = {
        ('oh', 'where'): [4, 4, 100.0],
        ('where', 'oh'): [4, 2, 50.0],
        ('where', 'has'): [4, 1, 25.0],
        ('where', 'can'): [4, 1, 25.0],
        ('has', 'my'): [1, 1, 100.0],
        ('my', 'little'): [1, 1, 100.0],
        ('little', 'dog'): [1, 1, 100.0],
        ('dog', 'gone'): [1, 1, 100.0],
        ('can', 'he'): [1, 1, 100.0],
        ('he', 'be'): [1, 1, 100.0],
    }

    stats = run_job(MRNextWordStats([]), lyrics)

    self.assertEqual(stats, expected)
def test_the_wheels_on_the_bus(self):
    song = (
        b' The wheels on the bus go round and round, '
        b'round and round, round and round '
        b'The wheels on the bus go round and round, '
        b'all through the town. ')

    # only the words of the song that contain a 'u' are expected back
    expected = {
        u'bus': 2,
        u'round': 8,
        u'through': 1,
    }

    job = MRWordsContainingUFreqCount(['-r', 'local'])

    self.assertEqual(run_job(job, song), expected)
def test_input_manifest(self):
    # First archive: written uncompressed into memory and handed to the
    # job as raw_input (the '-' input argument).
    memory_wet = BytesIO()
    plain_writer = WARCWriter(memory_wet, gzip=False)
    write_conversion_record(
        plain_writer,
        'https://nophonenumbershere.info',
        b'THIS-IS-NOT-A-NUMBER')
    write_conversion_record(
        plain_writer,
        'https://big.directory/',
        b'The Time: (612) 777-9311\nJenny: (201) 867-5309\n')

    # Second archive: gzip-compressed file in the temp dir, passed by path.
    gz_path = join(self.tmp_dir, 'wet2.warc.wet.gz')
    with open(gz_path, 'wb') as gz_file:
        gz_writer = WARCWriter(gz_file, gzip=True)
        write_conversion_record(
            gz_writer,
            'https://jseventplanning.biz/',
            b'contact us at +1 201 867 5309')

    job = MRPhoneToURL(['-r', self.RUNNER, gz_path, '-'])
    output = run_job(job, raw_input=memory_wet.getvalue())

    self.assertEqual(output, self.EXPECTED_OUTPUT)
def test_the_wheels_on_the_bus(self):
    song = (
        b' The wheels on the bus go round and round, '
        b'round and round, round and round '
        b'The wheels on the bus go round and round, '
        b'all through the town. ')

    # every distinct word of the song, lowercased, with its count
    expected = {
        u'the': 5,
        u'wheels': 2,
        u'on': 2,
        u'bus': 2,
        u'go': 2,
        u'round': 8,
        u'and': 4,
        u'all': 1,
        u'through': 1,
        u'town': 1,
    }

    counts = run_job(MRWordFreqCount(), song)

    self.assertEqual(counts, expected)
def test_setup_cmd(self):
    # Pass a --setup command that touches a marker file, then check both
    # the job output and that the marker file now exists.
    archive_path = join(self.tmp_dir, 'wet.warc.wet.gz')
    records = [
        ('https://big.directory/',
         b'The Time: (612) 777-9311\nJenny: (201) 867-5309\n'),
        ('https://jseventplanning.biz/',
         b'contact us at +1 201 867 5309'),
    ]

    with open(archive_path, 'wb') as archive:
        warc_writer = WARCWriter(archive)
        for url, text in records:
            write_conversion_record(warc_writer, url, text)

    marker_path = join(self.tmp_dir, 'touched')
    touch_cmd = 'touch ' + marker_path

    # the marker must not exist before the job runs
    self.assertFalse(exists(marker_path))

    job_args = ['-r', self.RUNNER, '--setup', touch_cmd, archive_path]
    output = run_job(MRPhoneToURL(job_args))
    self.assertEqual(output, self.EXPECTED_OUTPUT)

    self.assertTrue(exists(marker_path))
def test_setup_cmd(self):
    # Write a two-record WET file, run the job with a --setup command
    # that touches a file, and confirm the file appears only afterwards.
    tmp = self.tmp_dir
    wet_file_path = join(tmp, 'wet.warc.wet.gz')

    with open(wet_file_path, 'wb') as out:
        wet_writer = WARCWriter(out)
        write_conversion_record(
            wet_writer,
            'https://big.directory/',
            b'The Time: (612) 777-9311\nJenny: (201) 867-5309\n')
        write_conversion_record(
            wet_writer,
            'https://jseventplanning.biz/',
            b'contact us at +1 201 867 5309')

    touched = join(tmp, 'touched')
    self.assertFalse(exists(touched))

    job = MRPhoneToURL(
        ['-r', self.RUNNER, '--setup', 'touch ' + touched, wet_file_path])
    self.assertEqual(run_job(job), self.EXPECTED_OUTPUT)

    self.assertTrue(exists(touched))
def test_input_manifest(self):
    # Two inputs: an uncompressed in-memory WET stream supplied through
    # raw_input (argument '-'), and a gzipped WET file supplied by path.
    stream = BytesIO()
    stream_writer = WARCWriter(stream, gzip=False)
    stream_records = (
        ('https://nophonenumbershere.info', b'THIS-IS-NOT-A-NUMBER'),
        ('https://big.directory/',
         b'The Time: (612) 777-9311\nJenny: (201) 867-5309\n'),
    )
    for url, payload in stream_records:
        write_conversion_record(stream_writer, url, payload)

    gz_wet_path = join(self.tmp_dir, 'wet2.warc.wet.gz')
    with open(gz_wet_path, 'wb') as gz_out:
        write_conversion_record(
            WARCWriter(gz_out, gzip=True),
            'https://jseventplanning.biz/',
            b'contact us at +1 201 867 5309')

    job = MRPhoneToURL(['-r', self.RUNNER, gz_wet_path, '-'])

    self.assertEqual(
        run_job(job, raw_input=stream.getvalue()),
        self.EXPECTED_OUTPUT)
def test_empty(self):
    # no input: the word-frequency job should yield an empty result
    counts = run_job(MRWordFreqCount())

    self.assertEqual(counts, {})
def test_empty(self):
    # no input: all three tallies are still reported, each as zero
    expected = {'chars': 0, 'lines': 0, 'words': 0}

    tallies = run_job(MRWordCountUtility([]))

    self.assertEqual(tallies, expected)
def test_empty(self):
    # no input: the next-word stats job should yield an empty result
    stats = run_job(MRNextWordStats([]))

    self.assertEqual(stats, {})
def test_empty(self):
    # no input: the phone-to-URL job should yield an empty result
    mapping = run_job(MRPhoneToURL())

    self.assertEqual(mapping, {})
def test_empty(self):
    # no input: even with a sample size of 100, nothing comes back
    sampler = MRLogSampler(['--sample-size', '100'])

    self.assertEqual(run_job(sampler), {})
def test_two_lines(self):
    # two newline-terminated lines: 20 bytes, 5 words in total
    text = b'dog dog dog\ncat cat\n'
    expected = {'chars': 20, 'words': 5, 'lines': 2}

    tallies = run_job(MRWordCountUtility([]), text)

    self.assertEqual(tallies, expected)
def test_empty(self):
    # no input: a single total of zero is still expected under key None
    total = run_job(MRCountLinesRight([]))

    self.assertEqual(total, {None: 0})
def test_empty(self):
    # no input: one placeholder 'doc' record with an empty ID is expected
    placeholder_doc = {
        'cats': {},
        'cat_to_score': {},
        'id': '',
        'in_test_set': True,
    }

    output = run_job(MRTextClassifier([]))

    self.assertEqual(output, {('doc', ''): placeholder_doc})
def test_empty(self):
    # no input: the most-used-word job should yield an empty result
    most_used = run_job(MRMostUsedWord())

    self.assertEqual(most_used, {})
def test_empty(self):
    # no input files: there should be no per-file counts at all
    per_file = run_job(MRCountLinesByFile([]))

    self.assertEqual(per_file, {})
def test_empty(self):
    # no input: the job, run with '-r local', should yield an empty result
    job = MRWordsContainingUFreqCount(['-r', 'local'])

    self.assertEqual(run_job(job), {})