Example #1
    def test_non_log_lines(self):
        lines = StringIO('foo\n'
                         'bar\n'
                         '15/12/11 13:26:08 ERROR streaming.StreamJob:'
                         ' Error Launching job :'
                         ' Output directory already exists\n'
                         'Streaming Command Failed!')

        with no_handlers_for_logger('mrjob.logs.parse'):
            stderr = StringIO()
            log_to_stream('mrjob.logs.parse', stderr)

            self.assertEqual(
                list(_parse_hadoop_log_lines(lines)),
                [
                    # ignore leading non-log lines
                    dict(
                        timestamp='15/12/11 13:26:08',
                        level='ERROR',
                        logger='streaming.StreamJob',
                        thread=None,
                        # no way to know that Streaming Command Failed! wasn't part
                        # of a multi-line message
                        message=('Error Launching job :'
                                 ' Output directory already exists\n'
                                 'Streaming Command Failed!'))
                ])

            # should be one warning for each leading non-log line
            log_lines = stderr.getvalue().splitlines()
            self.assertEqual(len(log_lines), 2)
Example #2
    def test_log_lines(self):
        lines = StringIO('15/12/11 13:26:07 INFO client.RMProxy:'
                         ' Connecting to ResourceManager at /0.0.0.0:8032\n'
                         '15/12/11 13:26:08 ERROR streaming.StreamJob:'
                         ' Error Launching job :'
                         ' Output directory already exists\n')
        self.assertEqual(
            list(_parse_hadoop_log4j_records(lines)), [
                dict(
                    level='INFO',
                    logger='client.RMProxy',
                    message='Connecting to ResourceManager at /0.0.0.0:8032',
                    num_lines=1,
                    start_line=0,
                    thread='',
                    timestamp='15/12/11 13:26:07',
                ),
                dict(
                    level='ERROR',
                    logger='streaming.StreamJob',
                    message=('Error Launching job :'
                             ' Output directory already exists'),
                    num_lines=1,
                    start_line=1,
                    thread='',
                    timestamp='15/12/11 13:26:08',
                ),
            ])
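For orientation, a minimal sketch of the log4j line format these tests exercise. The regex below is illustrative (a hypothetical stand-in, not mrjob's actual pattern); it matches both the plain lines above and the EMR-style lines that carry a thread name in parentheses (see Example #24):

import re

# Hypothetical stand-in for the parser's internal pattern; matches e.g.
# '15/12/11 13:26:07 INFO client.RMProxy: message...' and
# '2015-08-22 00:46:18,411 INFO some.Logger (main): message...'
LOG4J_LINE_RE = re.compile(
    r'^(?P<timestamp>\d{2}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}'
    r'|\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3})'
    r'\s+(?P<level>[A-Z]+)'
    r'\s+(?P<logger>\S+)'
    r'(?: \((?P<thread>.*?)\))?'
    r': (?P<message>.*)$')

m = LOG4J_LINE_RE.match(
    '15/12/11 13:26:07 INFO client.RMProxy:'
    ' Connecting to ResourceManager at /0.0.0.0:8032')
assert m.group('level') == 'INFO'
assert m.group('logger') == 'client.RMProxy'
assert m.group('thread') is None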
Example #3
    def test_hadoop_runner_option_store(self):
        stderr = StringIO()
        with no_handlers_for_logger('mrjob.conf'):
            log_to_stream('mrjob.conf', stderr)

            # HadoopRunnerOptionStore really wants to find the streaming jar
            with patch.object(mrjob.hadoop,
                              'find_hadoop_streaming_jar',
                              return_value='found'):
                opts = HadoopRunnerOptionStore(
                    'hadoop',
                    dict(base_tmp_dir='/scratch',
                         hadoop_home='required',
                         hdfs_scratch_dir='hdfs:///scratch'), [])

            self.assertEqual(opts['local_tmp_dir'], '/scratch')
            self.assertNotIn('base_tmp_dir', opts)
            self.assertIn(
                'Deprecated option base_tmp_dir has been renamed'
                ' to local_tmp_dir', stderr.getvalue())

            self.assertEqual(opts['hadoop_tmp_dir'], 'hdfs:///scratch')
            self.assertNotIn('hdfs_scratch_dir', opts)
            self.assertIn(
                'Deprecated option hdfs_scratch_dir has been renamed'
                ' to hadoop_tmp_dir', stderr.getvalue())
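The deprecation warnings these assertions look for follow a rename-and-warn pattern. A rough sketch of that pattern, with a hypothetical rename table mirroring the options tested above (not mrjob's actual code):

import logging

log = logging.getLogger('mrjob.conf')

# hypothetical rename table, mirroring the options tested above
DEPRECATED_ALIASES = {
    'base_tmp_dir': 'local_tmp_dir',
    'hdfs_scratch_dir': 'hadoop_tmp_dir',
}

def translate_opts(opts):
    """Sketch: replace deprecated option names with their new names,
    warning once per renamed option."""
    translated = {}
    for key, value in opts.items():
        new_key = DEPRECATED_ALIASES.get(key, key)
        if new_key != key:
            log.warning('Deprecated option %s has been renamed to %s' %
                        (key, new_key))
        translated[new_key] = value
    return translated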
Example #4
    def test_yarn_output(self):
        # abbreviated version of real output from Hadoop 2.7.0.
        # Including things that might be interesting to parse later on
        lines = StringIO(
            '15/12/11 13:32:44 INFO client.RMProxy:'
            ' Connecting to ResourceManager at /0.0.0.0:8032\n'
            '15/12/11 13:32:45 INFO mapreduce.JobSubmitter:'
            ' Submitting tokens for job: job_1449857544442_0002\n'
            '15/12/11 13:32:45 INFO impl.YarnClientImpl:'
            ' Submitted application application_1449857544442_0002\n'
            '15/12/11 13:32:45 INFO mapreduce.Job:'
            ' The url to track the job:'
            ' http://0a7802e19139:8088/proxy/application_1449857544442_0002/\n'
            '15/12/11 13:33:11 INFO mapreduce.Job:  map 100% reduce 100%\n'
            '15/12/11 13:33:11 INFO mapreduce.Job:'
            ' Job job_1449857544442_0002 completed successfully\n'
            '15/12/11 13:33:11 INFO mapreduce.Job: Counters: 49\n'
            '        File System Counters\n'
            '                FILE: Number of bytes read=86\n'
            '15/12/11 13:33:11 INFO streaming.StreamJob:'
            ' Output directory:'
            ' hdfs:///user/root/tmp/mrjob/mr_wc.root.20151211.181326.984074'
            '/output\n')

        self.assertEqual(
            _parse_hadoop_streaming_log(lines),
            dict(application_id='application_1449857544442_0002',
                 counters={
                     'File System Counters': {
                         'FILE: Number of bytes read': 86,
                     }
                 },
                 job_id='job_1449857544442_0002',
                 output_dir=('hdfs:///user/root/tmp/mrjob'
                             '/mr_wc.root.20151211.181326.984074/output')))
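The counter block in this output ('File System Counters' with a deeper-indented 'FILE: ...=86' beneath it) uses a two-level indentation scheme. A naive sketch of parsing it (illustrative only; the real parser is stricter about indentation, as Example #29 shows):

def parse_indented_counters(lines):
    """Sketch: a less-indented line starts a counter group;
    deeper-indented 'name=value' lines are counters in that group."""
    counters = {}
    group, group_indent = None, None
    for line in lines:
        indent = len(line) - len(line.lstrip())
        if group is None or indent <= group_indent:
            group, group_indent = line.strip(), indent
        else:
            name, _, value = line.strip().rpartition('=')
            counters.setdefault(group, {})[name] = int(value)
    return counters

assert parse_indented_counters([
    '        File System Counters',
    '                FILE: Number of bytes read=86',
]) == {'File System Counters': {'FILE: Number of bytes read': 86}}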
Example #5
    def test_failed_job(self):
        mr_job = MRTwoStepJob(['-r', 'dataproc', '-v'])
        mr_job.sandbox()

        with no_handlers_for_logger('mrjob.dataproc'):
            stderr = StringIO()
            log_to_stream('mrjob.dataproc', stderr)

            self._dataproc_client.job_get_advances_states = (
                collections.deque(['SETUP_DONE', 'RUNNING', 'ERROR']))

            with mr_job.make_runner() as runner:
                self.assertIsInstance(runner, DataprocJobRunner)

                self.assertRaises(StepFailedException, runner.run)

                self.assertIn(' => ERROR\n', stderr.getvalue())

                cluster_id = runner.get_cluster_id()

        # job should get terminated
        cluster = (
            self._dataproc_client._cache_clusters[_TEST_PROJECT][cluster_id])
        cluster_state = self._dataproc_client.get_state(cluster)
        self.assertEqual(cluster_state, 'DELETING')
Example #6
    def test_dry_run(self):
        stdout = StringIO()
        self.maybe_terminate_quietly(
            stdout=stdout, max_mins_idle=0.6, dry_run=True)

        # shouldn't *actually* terminate clusters
        self.assertEqual(self.ids_of_terminated_clusters(), [])
Example #7
    def test_cleanup_options(self):
        stderr = StringIO()
        with no_handlers_for_logger('mrjob.runner'):
            log_to_stream('mrjob.runner', stderr)
            opts = RunnerOptionStore(
                'inline',
                dict(cleanup=['LOCAL_SCRATCH', 'REMOTE_SCRATCH'],
                     cleanup_on_failure=['JOB_FLOW', 'SCRATCH']),
                [])

            self.assertEqual(opts['cleanup'], ['LOCAL_TMP', 'CLOUD_TMP'])
            self.assertIn(
                'Deprecated cleanup option LOCAL_SCRATCH has been renamed'
                ' to LOCAL_TMP', stderr.getvalue())
            self.assertIn(
                'Deprecated cleanup option REMOTE_SCRATCH has been renamed'
                ' to CLOUD_TMP', stderr.getvalue())

            self.assertEqual(opts['cleanup_on_failure'], ['CLUSTER', 'TMP'])
            self.assertIn(
                'Deprecated cleanup_on_failure option JOB_FLOW has been'
                ' renamed to CLUSTER', stderr.getvalue())
            self.assertIn(
                'Deprecated cleanup_on_failure option SCRATCH has been renamed'
                ' to TMP', stderr.getvalue())
Example #8
    def test_pre_yarn_output(self):
        # actual output from Hadoop 1.0.3 on EMR AMI 2.4.9
        # Including things that might be interesting to parse later on
        lines = StringIO(
            '15/12/11 23:08:37 INFO streaming.StreamJob:'
            ' getLocalDirs(): [/mnt/var/lib/hadoop/mapred]\n'
            '15/12/11 23:08:37 INFO streaming.StreamJob:'
            ' Running job: job_201512112247_0003\n'
            '15/12/11 23:08:37 INFO streaming.StreamJob:'
            ' Tracking URL:'
            ' http://ip-172-31-27-129.us-west-2.compute.internal:9100'
            '/jobdetails.jsp?jobid=job_201512112247_0003\n'
            '15/12/11 23:09:16 INFO streaming.StreamJob:'
            '  map 100%  reduce 100%\n'
            '15/12/11 23:09:22 INFO streaming.StreamJob:'
            ' Output: hdfs:///user/hadoop/tmp/mrjob'
            '/mr_wc.hadoop.20151211.230352.433691/output\n')

        self.assertEqual(
            _parse_hadoop_streaming_log(lines),
            dict(application_id=None,
                 counters=None,
                 job_id='job_201512112247_0003',
                 output_dir=('hdfs:///user/hadoop/tmp/mrjob'
                             '/mr_wc.hadoop.20151211.230352.433691/output')))
Example #9
    def test_with_all_job_flows(self):
        self.mock_emr_job_flows.update(JOB_FLOWS_BY_ID)
        emr_conn = EMRJobRunner(conf_paths=[]).make_emr_conn()
        emr_conn.run_jobflow('no name', log_uri=None)
        main(['-q', '--no-conf'])
        lines = list(StringIO(self.stdout.getvalue()))
        self.assertEqual(len(lines), len(JOB_FLOWS_BY_ID) - 1)
Example #10
    def test_verbose(self):
        with patch.object(sys, 'stderr', StringIO()) as stderr:
            MRJob.set_up_logging(verbose=True)
            log = logging.getLogger('__main__')
            log.info('INFO')
            log.debug('DEBUG')
            self.assertEqual(stderr.getvalue(), 'INFO\nDEBUG\n')
Example #11
    def assert_hadoop_version(self, JobClass, version_string):
        mr_job = JobClass()
        mock_log = StringIO()
        with no_handlers_for_logger('mrjob.job'):
            log_to_stream('mrjob.job', mock_log)
            self.assertEqual(mr_job.jobconf()['hadoop_version'],
                             version_string)
            self.assertIn('should be a string', mock_log.getvalue())
Example #12
    def test_default_options(self):
        with no_handlers_for_logger('__main__'):
            with patch.object(sys, 'stderr', StringIO()) as stderr:
                MRJob.set_up_logging()
                log = logging.getLogger('__main__')
                log.info('INFO')
                log.debug('DEBUG')
                self.assertEqual(stderr.getvalue(), 'INFO\n')
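Examples #10 and #12 pin down the same contract: log records go to stderr as bare messages, at DEBUG level when verbose and at INFO otherwise. A simplified sketch of such a helper (assuming root-logger configuration; not mrjob's exact implementation):

import logging
import sys

def set_up_logging(verbose=False, stream=None):
    """Sketch: emit bare log messages to stderr (or the given stream),
    at DEBUG when verbose, INFO otherwise."""
    handler = logging.StreamHandler(stream or sys.stderr)
    handler.setFormatter(logging.Formatter('%(message)s'))
    root = logging.getLogger()
    root.setLevel(logging.DEBUG if verbose else logging.INFO)
    root.addHandler(handler)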
Example #13
    def test_messy_error(self):
        counter_string = (
            b'Job JOBID="_001" FAILED_REDUCES="0"'
            b' COUNTERS="THIS IS NOT ACTUALLY A COUNTER"')
        with no_handlers_for_logger(''):
            stderr = StringIO()
            log_to_stream('mrjob.parse', stderr, level=logging.WARN)
            self.assertEqual(({}, 1),
                             parse_hadoop_counters_from_line(counter_string))
            self.assertIn('Cannot parse Hadoop counter string',
                          stderr.getvalue())
Example #14
    def updated_and_warnings(self, jobconf, hadoop_version):
        jobconf = jobconf.copy()
        with no_handlers_for_logger('mrjob.runner'):
            stderr = StringIO()
            log_to_stream('mrjob.runner', stderr)
            self.runner._update_jobconf_for_hadoop_version(
                jobconf, hadoop_version)

        return jobconf, stderr.getvalue()
Example #15
    def test_trailing_carriage_return(self):
        lines = StringIO('15/12/11 13:26:07 INFO client.RMProxy:'
                         ' Connecting to ResourceManager at /0.0.0.0:8032\r\n')
        self.assertEqual(list(_parse_hadoop_log_lines(lines)), [
            dict(timestamp='15/12/11 13:26:07',
                 level='INFO',
                 logger='client.RMProxy',
                 thread=None,
                 message='Connecting to ResourceManager at /0.0.0.0:8032')
        ])
Example #16
    def test_exclude(self):
        for cluster in CLUSTERS:
            self.add_mock_emr_cluster(cluster)

        main(['-q', '--no-conf', '-x', 'my_key,my_value'])

        lines = list(StringIO(self.stdout.getvalue()))
        self.assertEqual(len(lines), len(CLUSTERS_BY_ID) - 2)
        self.assertNotIn('j-COMPLETED', self.stdout.getvalue())
        self.assertNotIn('j-RUNNING1STEP', self.stdout.getvalue())
Example #17
    def test_option_debug_printout(self):
        stderr = StringIO()

        with no_handlers_for_logger():
            log_to_stream('mrjob.runner', stderr, debug=True)

            InlineMRJobRunner(owner='dave')

        self.assertIn("'owner'", stderr.getvalue())
        self.assertIn("'dave'", stderr.getvalue())
Example #18
    def get_debug_printout(self, opt_store_class, alias, opts):
        stderr = StringIO()

        with no_handlers_for_logger():
            log_to_stream('mrjob.runner', stderr, debug=True)

            # debug printout happens in constructor
            opt_store_class(alias, opts, [])

        return stderr.getvalue()
Example #19
    def test_empty_runner_error(self):
        conf = dict(runner=dict(local=dict(local_tmp_dir='/tmp')))
        path = self.save_conf('basic', conf)

        stderr = StringIO()
        with no_handlers_for_logger():
            log_to_stream('mrjob.runner', stderr)
            RunnerOptionStore('inline', {}, [path])
            self.assertEqual("No configs specified for inline runner\n",
                             stderr.getvalue())
Example #20
    def test_runner_option_store(self):
        stderr = StringIO()
        with no_handlers_for_logger('mrjob.conf'):
            log_to_stream('mrjob.conf', stderr)
            opts = RunnerOptionStore(
                'inline', dict(base_tmp_dir='/scratch'), [])

            self.assertEqual(opts['local_tmp_dir'], '/scratch')
            self.assertNotIn('base_tmp_dir', opts)
            self.assertIn('Deprecated option base_tmp_dir has been renamed'
                          ' to local_tmp_dir', stderr.getvalue())
Example #21
    def test_with_all_clusters(self):
        for cluster in CLUSTERS:
            self.add_mock_emr_cluster(cluster)

        emr_conn = self.connect_emr()
        emr_conn.run_jobflow('no name',
                             job_flow_role='fake-instance-profile',
                             service_role='fake-service-role')
        main(['-q', '--no-conf'])

        lines = list(StringIO(self.stdout.getvalue()))
        self.assertEqual(len(lines), len(CLUSTERS_BY_ID) - 1)
Example #22
    def test_recurse(self):
        path = os.path.join(self.tmp_dir, 'LOL.conf')
        recurse_conf = dict(include=path)
        with open(path, 'w') as f:
            dump_mrjob_conf(recurse_conf, f)

        stderr = StringIO()
        with no_handlers_for_logger():
            log_to_stream('mrjob.conf', stderr)
            RunnerOptionStore('inline', {}, [path])
            self.assertIn('%s tries to recursively include %s!' % (path, path),
                          stderr.getvalue())
Example #23
    def test_attrs_should_be_classes(self):
        with no_handlers_for_logger('mrjob.job'):
            stderr = StringIO()
            log_to_stream('mrjob.job', stderr)
            job = self.StrangeJob()
            self.assertIsInstance(job.input_protocol(), JSONProtocol)
            self.assertIsInstance(job.internal_protocol(), JSONProtocol)
            self.assertIsInstance(job.output_protocol(), JSONProtocol)
            logs = stderr.getvalue()
            self.assertIn('INPUT_PROTOCOL should be a class', logs)
            self.assertIn('INTERNAL_PROTOCOL should be a class', logs)
            self.assertIn('OUTPUT_PROTOCOL should be a class', logs)
Example #24
    def test_thread(self):
        lines = StringIO(
            '2015-08-22 00:46:18,411 INFO amazon.emr.metrics.MetricsSaver'
            ' (main): Thread 1 created MetricsLockFreeSaver 1\n')

        self.assertEqual(list(_parse_hadoop_log_lines(lines)), [
            dict(timestamp='2015-08-22 00:46:18,411',
                 level='INFO',
                 logger='amazon.emr.metrics.MetricsSaver',
                 thread='main',
                 message='Thread 1 created MetricsLockFreeSaver 1')
        ])
Example #25
class CollectEMRStatsTestCase(TestCase):
    @patch('mrjob.tools.emr.collect_emr_stats.describe_all_job_flows')
    @patch('mrjob.tools.emr.collect_emr_stats.EMRJobRunner')
    def test_collect_active_job_flows(self, mock_job_runner,
                                      mock_describe_jobflows):

        collect_active_job_flows(conf_paths=[])

        # check that EMRJobRunner was constructed with the right args
        self.assertEqual(mock_job_runner.call_count, 1)
        self.assertEqual(mock_job_runner.call_args_list, [call(conf_paths=[])])
        self.assertEqual(mock_describe_jobflows.call_count, 1)

        # check that describe_jobflows was called with the right states
        active_states = ['STARTING', 'BOOTSTRAPPING', 'WAITING', 'RUNNING']
        args, kwargs = mock_describe_jobflows.call_args
        self.assertEqual(active_states, kwargs['states'])

    def test_job_flows_to_stats(self):

        # mock jobflows
        NUM_JOB_FLOWS = 30
        job_flows = []
        for i in range(NUM_JOB_FLOWS):
            job_flow_id = 'j-%04d' % i
            job_flows.append(
                MockEmrObject(
                    jobflowid=job_flow_id,
                    instancecount=i,  # each jobflow has different instance count
                ))

        stats = job_flows_to_stats(job_flows)

        self.assertEqual(stats['num_jobflows'], NUM_JOB_FLOWS)
        self.assertEqual(stats['total_instance_count'],
                         sum(range(NUM_JOB_FLOWS)))

    @patch('mrjob.tools.emr.collect_emr_stats.job_flows_to_stats')
    @patch('mrjob.tools.emr.collect_emr_stats.collect_active_job_flows')
    @patch('sys.stdout', StringIO())
    def test_main_no_conf(self, mock_collect_active_jobflows,
                          mock_job_flows_to_stats):

        mock_collect_active_jobflows.return_value = []
        mock_job_flows_to_stats.return_value = {}
        main(['-q', '--no-conf'])

        # check if args for calling collect_active_jobflows are correct
        self.assertEqual(mock_collect_active_jobflows.call_count, 1)
        self.assertEqual(mock_collect_active_jobflows.call_args_list,
                         [call([])])
        self.assertEqual(mock_job_flows_to_stats.call_count, 1)
Example #26
    def _test_recoverable_error(self, ex):
        self.mock_paths = ['/path/to/logs/oak', ex]

        with no_handlers_for_logger('mrjob.logs.wrap'):
            stderr = StringIO()
            log_to_stream('mrjob.logs.wrap', stderr)

            self.assertEqual(self._ls_logs([['/path/to/logs']]),
                             [dict(path='/path/to/logs/oak')])

            self.mock_fs.ls.assert_called_once_with('/path/to/logs')

            self.assertIn("couldn't ls() /path/to/logs", stderr.getvalue())
Example #27
    def test_io_error(self):
        self.mock_paths = [
            IOError(),
        ]

        with no_handlers_for_logger('mrjob.logs.ls'):
            stderr = StringIO()
            log_to_stream('mrjob.logs.ls', stderr)

            self.assertEqual(list(_ls_logs(self.mock_fs, '/path/to/logs')), [])

            self.mock_fs.ls.assert_called_once_with('/path/to/logs')

            self.assertIn("couldn't ls() /path/to/logs", stderr.getvalue())
Example #28
    def test_deprecated_alias(self):
        with no_handlers_for_logger('mrjob.util'):
            stderr = StringIO()
            log_to_stream('mrjob.util', stderr)

            self.assertEqual(
                list(buffer_iterator_to_line_iterator(
                    chunk for chunk in
                    [b'The quick\nbrown fox\nju',
                     b'mped over\nthe lazy\ndog',
                     b's.\n'])),
                [b'The quick\n', b'brown fox\n', b'jumped over\n',
                 b'the lazy\n', b'dogs.\n'])

            self.assertIn('has been renamed', stderr.getvalue())
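buffer_iterator_to_line_iterator is a deprecated alias for a chunk-to-line buffering generator. A minimal sketch of the underlying behavior (illustrative, not mrjob's code):

def to_lines(chunks):
    """Sketch: yield complete b'\n'-terminated lines from an iterable of
    byte chunks, buffering partial lines across chunk boundaries."""
    buf = b''
    for chunk in chunks:
        buf += chunk
        while b'\n' in buf:
            line, buf = buf.split(b'\n', 1)
            yield line + b'\n'
    if buf:
        yield buf  # trailing data with no final newline

assert list(to_lines([b'The quick\nbrown fox\nju',
                      b'mped over\nthe lazy\ndogs.\n'])) == [
    b'The quick\n', b'brown fox\n', b'jumped over\n', b'the lazy\n',
    b'dogs.\n']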
Example #29
    def test_indentation_is_required(self):
        lines = [
            'File System Counters',
            '   FILE: Number of bytes read=8',
        ]

        with no_handlers_for_logger('mrjob.logs.step'):
            stderr = StringIO()
            log_to_stream('mrjob.logs.step', stderr)

            # counter line is interpreted as group
            self.assertEqual(_parse_indented_counters(lines), {})

            # should complain
            self.assertNotEqual(stderr.getvalue(), '')
Example #30
    def test_dry_run(self):
        stdout = StringIO()
        self.maybe_terminate_quietly(
            stdout=stdout, max_mins_idle=0.6, dry_run=True)

        # dry_run doesn't actually try to lock
        expected_stdout_lines = self.EXPECTED_STDOUT_LINES + [
            'Terminated cluster j-IDLE_AND_LOCKED (IDLE_AND_LOCKED);'
            ' was idle for 2:00:00']

        self.assertEqual(set(stdout.getvalue().splitlines()),
                         set(expected_stdout_lines))

        # shouldn't *actually* terminate clusters
        self.assertEqual(self.ids_of_terminated_clusters(), [])