Example #1
    def test_audit_differs0001(self, monkeypatch, mock_platform, caplog,
                               no_results_instantiation_check):
        """Check that if the audit differs, a crash occurs"""

        make_reboot_raise(monkeypatch)
        no_envlogs(monkeypatch)

        config = Config(os.path.join(TEST_DIR, "example.krun"))
        krun.util.assign_platform(config, mock_platform)
        emulate_first_reboot(mock_platform, config)
        results_path = config.results_filename()

        # mutate the audit, so it won't match later
        results = Results(config, mock_platform, results_file=results_path)
        results.audit._audit["wibble"] = "wut"
        results.write_to_file()

        sched = ExecutionScheduler(config,
                                   mock_platform.mailer,
                                   mock_platform,
                                   dry_run=True)
        with pytest.raises(krun.util.FatalKrunError):
            sched.run()

        expect = "This is only valid if the machine you are using is identical"
        assert expect in caplog.text

        os.unlink(sched.manifest.path)
        os.unlink(results_path)
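
Note: several tests in this listing take a no_results_instantiation_check
fixture that is never defined here. The main/inner_main examples further down
set Results.ok_to_instantiate = True before loading a results file, which
suggests Results guards its constructor behind that class flag. A minimal
sketch of such a fixture under that assumption (illustrative only, not the
project's actual fixture):

import pytest

from krun.results import Results


@pytest.fixture
def no_results_instantiation_check(monkeypatch):
    # Assumption: lift the ok_to_instantiate guard so tests can build
    # Results objects directly; monkeypatch restores the flag afterwards.
    monkeypatch.setattr(Results, "ok_to_instantiate", True)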
Example #2
    def __init__(self,
                 config,
                 mailer,
                 platform,
                 resume=False,
                 reboot=False,
                 dry_run=False,
                 started_by_init=False):
        self.mailer = mailer

        self.config = config
        self.work_deque = deque()
        self.eta_avail = 0
        self.jobs_done = 0
        self.platform = platform
        self.resume = resume
        self.reboot = reboot
        self.dry_run = dry_run
        self.started_by_init = started_by_init

        if resume:
            self.results = Results(self.config,
                                   platform,
                                   results_file=self.config.results_filename())
        else:
            self.results = Results(self.config, platform)

        self.log_path = self.config.log_filename(self.resume)
Example #3
def emulate_first_reboot(platform, config):
    platform.starting_temperatures = platform.take_temperature_readings()
    manifest = ManifestManager(config, platform, new_file=True)
    manifest.set_starting_temperatures(platform.starting_temperatures)
    results = Results(config, platform)
    results.write_to_file()
    return manifest
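
This helper fakes the on-disk state Krun leaves behind after its initial
reboot: a manifest with starting temperatures plus an empty results file. A
hedged usage sketch (the test name and assertion are illustrative; it assumes
the example.krun schedule is non-empty):

    def test_after_first_reboot(self, mock_platform,
                                no_results_instantiation_check):
        # Illustrative only: rebuild the post-first-reboot state, then
        # check the manifest still has work scheduled.
        config = Config(os.path.join(TEST_DIR, "example.krun"))
        krun.util.assign_platform(config, mock_platform)
        manifest = emulate_first_reboot(mock_platform, config)
        assert manifest.num_execs_left > 0
        os.unlink(manifest.path)
        os.unlink(config.results_filename())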
Example #4
    def test_audit_differs0001(self, monkeypatch, mock_platform, caplog):
        """Check that if the audit differs, a crash occurs"""

        make_reboot_raise(monkeypatch)
        no_envlogs(monkeypatch)

        config = Config(os.path.join(TEST_DIR, "example.krun"))
        krun.util.assign_platform(config, mock_platform)
        emulate_first_reboot(mock_platform, config, monkeypatch)
        results_path = config.results_filename()

        # mutate the audit, so it won't match later
        results = Results(config, mock_platform, results_file=results_path)
        results.audit._audit["wibble"] = "wut"
        results.write_to_file()

        sched = ExecutionScheduler(config, mock_platform.mailer, mock_platform,
                                   dry_run=True)
        with pytest.raises(krun.util.FatalKrunError):
            sched.run()

        expect = "This is only valid if the machine you are using is identical"
        assert expect in caplog.text

        os.unlink(sched.manifest.path)
        os.unlink(results_path)
Example #5
File: krun.py Project: bennn/krun
def main(parser):
    args = parser.parse_args()

    if args.dump is not None:
        if not args.filename.endswith(".json.bz2"):
            usage(parser)
        else:
            results = Results(None, None, results_file=args.filename)
            if args.dump == "config" or "audit":
                text = unicode(results.__getattribute__(args.dump))
            else:
                text = json.dumps(results.__getattribute__(args.dump),
                                  sort_keys=True, indent=2)
            # String data read in from JSON are unicode objects. This matters
            # for us as some data in the audit includes unicode characters.
            # If it does, a simple print no longer suffices if the system
            # locale is (e.g.) ASCII. In this case print will raise an
            # exception. The correct thing to do is to encode() the unicode to
            # the system locale.
            print(text.encode(locale.getpreferredencoding()))
            sys.exit(0)

    if not args.filename.endswith(".krun"):
        usage(parser)

    try:
        if os.stat(args.filename).st_size <= 0:
            util.fatal('Krun configuration file %s is empty.' % args.filename)
    except OSError:
        util.fatal('Krun configuration file %s does not exist.' % args.filename)

    config = Config(args.filename)

    if args.info:
        # Info mode doesn't run the experiment.
        # Just prints some metrics and exits.
        util.print_session_info(config)
        return

    if args.strip_results:
        util.strip_results(config, args.strip_results)
        return

    attach_log_file(config, args.resume)
    debug("Krun invoked with arguments: %s" % sys.argv)

    mail_recipients = config.MAIL_TO
    if type(mail_recipients) is not list:
        util.fatal("MAIL_TO config should be a list")

    mailer = Mailer(mail_recipients, max_mails=config.MAX_MAILS)

    try:
        inner_main(mailer, config, args)
    except util.FatalKrunError as e:
        subject = "Fatal Krun Exception"
        mailer.send(subject, e.args[0], bypass_limiter=True)
        util.run_shell_cmd_list(config.POST_EXECUTION_CMDS)
        raise e
Example #6
    def test_eq(self, mock_platform):
        results = Results(None,
                          None,
                          results_file="krun/tests/quick_results.json.bz2")
        assert results == results
        assert not results == None
        assert not results == \
            Results(Config("krun/tests/example.krun"), mock_platform)
Example #7
def emulate_first_reboot(platform, config, monkeypatch):
    no_results_instantiation_check(monkeypatch)

    platform.starting_temperatures = platform.take_temperature_readings()
    manifest = ManifestManager(config, platform, new_file=True)
    manifest.set_starting_temperatures(platform.starting_temperatures)
    results = Results(config, platform)
    results.write_to_file()
    return manifest
Example #8
    def test_dump_config(self, no_results_instantiation_check):
        """Simulates krun.py --dump-config RESULTS_FILE.json.bz2
        """

        res_path = os.path.join(TEST_DIR, "quick_results.json.bz2")
        conf_path = os.path.join(TEST_DIR, "quick.krun")
        results = Results(None, None, results_file=res_path)
        with open(conf_path) as fp:
            config = fp.read()
            assert config == results.dump("config")
Example #9
def strip_results(config, key_spec):
    from krun.platform import detect_platform
    from krun.results import Results

    platform = detect_platform(None)
    results = Results(config, platform, results_file=config.results_filename())
    n_removed = results.strip_results(key_spec)
    if n_removed > 0:
        results.write_to_file()
    info("Removed %d result keys" % n_removed)
Example #10
File: util.py Project: bennn/krun
def strip_results(config, key_spec):
    from krun.platform import detect_platform
    from krun.results import Results

    platform = detect_platform(None)
    results = Results(config, platform,
                      results_file=config.results_filename())
    n_removed = results.strip_results(key_spec)
    if n_removed > 0:
        results.write_to_file()
    info("Removed %d result keys" % n_removed)
Example #11
def fake_results(mock_platform, no_results_instantiation_check):
    results = Results(None, mock_platform)
    mock_platform.num_cpus = 2
    mock_platform.num_per_core_measurements = 2

    results.eta_estimates = {"bench:vm:variant": [1., 1.]}
    results.wallclock_times = {"bench:vm:variant": [[2., 2.], [2., 2.]]}
    results.core_cycle_counts = {
        "bench:vm:variant": [[[3., 3.], [3., 3.]], [[3., 3.], [3., 3.]]]
    }
    results.aperf_counts = {
        "bench:vm:variant": [[[4., 4.], [4., 4.]], [[4., 4.], [4., 4.]]]
    }
    results.mperf_counts = {
        "bench:vm:variant": [[[5., 5.], [5., 5.]], [[5., 5.], [5., 5.]]]
    }
    results.pexec_flags = {"bench:vm:variant": ["C", "T"]}
    return results
Example #12
    def test_run_schedule0005(self, mock_platform, monkeypatch,
                              no_results_instantiation_check):
        def dummy_execjob_run(self, mailer, dryrun=False):
            return self.empty_measurements, {}, "E"  # pretend jobs fail

        monkeypatch.setattr(ExecutionJob, 'run', dummy_execjob_run)

        config = Config(os.path.join(TEST_DIR, "example.krun"))
        n_reboots, sched = run_with_captured_reboots(config, mock_platform,
                                                     monkeypatch)
        assert n_reboots == 8  # 2 benchmarks, 2 vms, 2 execs

        results = Results(config,
                          mock_platform,
                          results_file=config.results_filename())
        type_check_results(results)

        assert len(results.wallclock_times) == 4  # 2 benchmarks, 2 vms
        for key, execs in results.wallclock_times.iteritems():
            assert len(execs) == 2

            for _exec in execs:
                assert len(_exec) == 0  # due to error

        os.unlink(config.results_filename())
        os.unlink(sched.manifest.path)
Example #13
    def test_write_results_to_disk(self, mock_platform):
        config = Config("krun/tests/example.krun")
        out_file = "krun/tests/example_results.json.bz2"
        results0 = Results(config, mock_platform)
        results0.audit = dict()
        results0.starting_temperatures = [4355, 9879]
        results0.data = {u"dummy:Java:default-java": [[1.000726]]}
        results0.eta_estimates = {u"dummy:Java:default-java": [1.1]}
        results0.reboots = 5
        results0.error_flag = False
        results0.write_to_file()
        results1 = Results(config, None, results_file=out_file)
        assert results0 == results1
        # Clean-up generated file.
        os.unlink(out_file)
Example #14
def to_strip():
    from krun.platform import detect_platform
    from krun.results import Results

    path = os.path.join(TEST_DIR, "quick.krun")
    config = Config(path)

    platform = detect_platform(None)
    results = Results(config, platform, results_file=config.results_filename())
    return results
Example #15
    def test_error_flag_persists0002(self, monkeypatch, mock_platform,
                                     no_results_instantiation_check):
        """Check a changed dmesg will correctly set the error flag"""

        make_reboot_raise(monkeypatch)
        no_envlogs(monkeypatch)

        # pretend dmesg changes
        def dummy_check_for_dmesg_changes(self):
            return True

        monkeypatch.setattr(mock_platform, 'check_dmesg_for_changes',
                            dummy_check_for_dmesg_changes)

        config = Config(os.path.join(TEST_DIR, "example.krun"))
        krun.util.assign_platform(config, mock_platform)
        emulate_first_reboot(mock_platform, config)
        results_path = config.results_filename()

        # To start, the error flag is not set
        results = Results(config, mock_platform, results_file=results_path)
        assert not results.error_flag

        # run an execution where the dmesg changes
        sched = ExecutionScheduler(config,
                                   mock_platform.mailer,
                                   mock_platform,
                                   dry_run=True)
        try:
            sched.run()
        except _TestReboot:
            pass
        else:
            assert False

        # reload results and check the error flag is now set
        results = Results(config, mock_platform, results_file=results_path)
        assert results.error_flag

        os.unlink(sched.manifest.path)
        os.unlink(results_path)
Example #16
    def test_error_flag_persists0001(self, monkeypatch, mock_platform,
                                     no_results_instantiation_check):
        """Check a failing exec will correctly set the error flag"""

        make_reboot_raise(monkeypatch)
        no_envlogs(monkeypatch)

        # pretend exec fails
        def dummy_job_run(self, mailer, dry):
            measurements = self.make_empty_measurement()
            return measurements, None, 'E'  # measurements, instr_data, flag

        monkeypatch.setattr(ExecutionJob, 'run', dummy_job_run)

        config = Config(os.path.join(TEST_DIR, "example.krun"))
        krun.util.assign_platform(config, mock_platform)
        emulate_first_reboot(mock_platform, config)
        results_path = config.results_filename()

        # To start, the error flag is not set
        results = Results(config, mock_platform, results_file=results_path)
        assert not results.error_flag

        # run a (failing) execution, which will dump the results file
        sched = ExecutionScheduler(config,
                                   mock_platform.mailer,
                                   mock_platform,
                                   dry_run=True)
        try:
            sched.run()
        except _TestReboot:
            pass
        else:
            assert False

        # reload results and check the error flag is now set
        results = Results(config, mock_platform, results_file=results_path)
        assert results.error_flag

        os.unlink(sched.manifest.path)
        os.unlink(results_path)
Example #17
    def __init__(self, config, mailer, platform,
                 resume=False, reboot=False, dry_run=False,
                 started_by_init=False):
        self.mailer = mailer

        self.config = config
        self.work_deque = deque()
        self.eta_avail = 0
        self.jobs_done = 0
        self.platform = platform
        self.resume = resume
        self.reboot = reboot
        self.dry_run = dry_run
        self.started_by_init = started_by_init

        if resume:
            self.results = Results(self.config, platform,
                                   results_file=self.config.results_filename())
        else:
            self.results = Results(self.config, platform)

        self.log_path = self.config.log_filename(self.resume)
Example #18
    def test_run_schedule0002(self, mock_platform, monkeypatch):
        config = Config(os.path.join(TEST_DIR, "example.krun"))
        n_reboots, sched = run_with_captured_reboots(config, mock_platform,
                                                     monkeypatch)
        assert n_reboots == 8  # 2 benchmarks, 2 vms, 2 execs

        results = Results(config, mock_platform,
                          results_file=config.results_filename())
        type_check_results(results)

        assert len(results.wallclock_times) == 4  # 2 benchmarks, 2 vms
        for key, execs in results.wallclock_times.iteritems():
            assert len(execs) == 2
            for _exec in execs:
                assert len(_exec) == 0  # due to dry run

        os.unlink(config.results_filename())
        os.unlink(sched.manifest.path)
Example #19
    def test_run_schedule(self, monkeypatch, mock_platform):
        json_file = os.path.join(TEST_DIR, "example_results.json.bz2")

        def dummy_shell_cmd(text):
            pass

        monkeypatch.setattr(subprocess, 'call', dummy_shell_cmd)
        monkeypatch.setattr(krun.util, 'run_shell_cmd', dummy_shell_cmd)
        config = Config(os.path.join(TEST_DIR, "example.krun"))
        krun.util.assign_platform(config, mock_platform)
        sched = ExecutionScheduler(config,
                                   mock_platform.mailer,
                                   mock_platform,
                                   resume=False,
                                   reboot=False,
                                   dry_run=True,
                                   started_by_init=False)
        sched.build_schedule()
        assert len(sched) == 8
        sched.run()
        assert len(sched) == 0

        results = Results(Config(os.path.join(TEST_DIR, "example.krun")),
                          mock_platform,
                          results_file=json_file)

        for k, execs in results.data.iteritems():
            assert type(execs) is list
            for one_exec in execs:
                assert type(one_exec) is list
                assert all([type(x) is float for x in one_exec])

        for k, execs in results.eta_estimates.iteritems():
            assert type(execs) is list
            assert all([type(x) is float for x in execs])

        assert type(results.starting_temperatures) is dict
        assert type(results.reboots) is int
        assert type(results.audit) is type(Audit(dict()))
        assert type(results.config) is type(Config())
        assert type(results.error_flag) is bool

        os.unlink(json_file)
Example #20
def fake_results(mock_platform, no_results_instantiation_check):
    results = Results(None, mock_platform)
    mock_platform.num_cpus = 2
    mock_platform.num_per_core_measurements = 2

    results.eta_estimates = {"bench:vm:variant": [1., 1.]}
    results.wallclock_times = {"bench:vm:variant": [[2., 2.], [2., 2.]]}
    results.core_cycle_counts = {"bench:vm:variant":
                                 [[[3., 3.], [3., 3.,]], [[3., 3.], [3., 3.]]]}
    results.aperf_counts = {"bench:vm:variant":
                            [[[4., 4.], [4., 4.,]], [[4., 4.], [4., 4.]]]}
    results.mperf_counts = {"bench:vm:variant":
                            [[[5., 5.], [5., 5.,]], [[5., 5.], [5., 5.]]]}
    return results
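
The nesting in these fakes follows the data layout used elsewhere in this
listing: each "bench:vm:variant" key maps to a list of process executions;
wallclock_times stores one list of per-iteration times per execution, and the
per-core counters add one more level, one list per CPU core with
num_per_core_measurements entries each. A small shape check under those
assumptions (illustrative only):

def check_counter_shapes(results, platform):
    # Validates the nesting described above against the platform's
    # advertised core and measurement counts.
    for counters in (results.core_cycle_counts, results.aperf_counts,
                     results.mperf_counts):
        for key, execs in counters.items():
            for one_exec in execs:
                assert len(one_exec) == platform.num_cpus
                for core in one_exec:
                    assert len(core) == platform.num_per_core_measurements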
Example #21
    def test_run_schedule0001(self, monkeypatch, mock_platform):
        config = Config(os.path.join(TEST_DIR, "one_exec.krun"))
        n_reboots, sched = run_with_captured_reboots(config, mock_platform,
                                                     monkeypatch)
        assert sched.manifest.total_num_execs == 1
        assert sched.manifest.num_execs_left == 0
        assert n_reboots == 1

        results = Results(config, mock_platform,
                          results_file=config.results_filename())
        type_check_results(results)

        assert len(results.wallclock_times) == 1  # 1 benchmark, 1 vm
        for key, execs in results.wallclock_times.iteritems():
            assert len(execs) == 1
            for _exec in execs:
                assert len(_exec) == 0  # due to dry run

        os.unlink(config.results_filename())
        os.unlink(sched.manifest.path)
Example #22
    def test_read_results_from_disk(self):
        config = Config("krun/tests/quick.krun")
        results = Results(None,
                          None,
                          results_file="krun/tests/quick_results.json.bz2")
        expected = {
            u'nbody:CPython:default-python': [[0.022256]],
            u'dummy:CPython:default-python': [[1.005115]],
            u'nbody:Java:default-java': [[26.002632]],
            u'dummy:Java:default-java': [[1.000941]]
        }
        assert results.config == config
        assert results.audit[u'uname'] == u'Linux'
        assert results.audit[u'debian_version'] == u'jessie/sid'
        assert results.data == expected
        assert results.starting_temperatures == {"x": 3333, "y": 4444}
        assert results.eta_estimates == \
            {
                u'nbody:CPython:default-python': [0.022256],
                u'dummy:CPython:default-python': [1.005115],
                u'nbody:Java:default-java': [26.002632],
                u'dummy:Java:default-java': [1.000941]
            }
Example #23
    def test_run_schedule0004(self, mock_platform, monkeypatch,
                              no_results_instantiation_check):
        config = Config(os.path.join(TEST_DIR, "example_skip_1vm.krun"))
        n_reboots, sched = run_with_captured_reboots(config, mock_platform,
                                                     monkeypatch)
        assert n_reboots == 4  # 2 benchmarks, 2 vms, 2 execs, one VM skipped

        results = Results(config,
                          mock_platform,
                          results_file=config.results_filename())
        type_check_results(results)

        assert len(results.wallclock_times) == 4  # 2 benchmarks, 2 vms
        for key, execs in results.wallclock_times.iteritems():
            if "CPython" in key:
                assert len(execs) == 0
            else:
                assert len(execs) == 2

            for _exec in execs:
                assert len(_exec) == 0  # due to dry run

        os.unlink(config.results_filename())
        os.unlink(sched.manifest.path)
Example #24
    def test_write_results_to_disk(self, mock_platform,
                                   no_results_instantiation_check):
        config = Config("krun/tests/example.krun")
        mock_platform.num_cpus = 4
        mock_platform.num_per_core_measurements = mock_platform.num_cpus
        out_file = "krun/tests/example_results.json.bz2"
        results0 = Results(config, mock_platform)
        results0.audit = dict()
        results0.starting_temperatures = [4355, 9879]
        results0.wallclock_times = {u"dummy:Java:default-java": [[1.000726]]}
        results0.eta_estimates = {u"dummy:Java:default-java": [1.1]}
        results0.core_cycle_counts = {
            u"dummy:Java:default-java": [[[2], [3], [4], [5]]]
        }
        results0.aperf_counts = {
            u"dummy:Java:default-java": [[[3], [4], [5], [6]]]
        }
        results0.mperf_counts = {
            u"dummy:Java:default-java": [[[4], [5], [6], [7]]]
        }
        results0.reboots = 5
        results0.error_flag = False
        results0.write_to_file()
        results1 = Results(config, None, results_file=out_file)
        assert results0 == results1
        # Clean-up generated file.
        os.unlink(out_file)
Example #25
def main(parser):
    args = parser.parse_args()

    if args.dump is not None:
        if not args.filename.endswith(".json.bz2"):
            usage(parser)
        else:
            Results.ok_to_instantiate = True
            results = Results(None, None, results_file=args.filename)
            text = results.dump(args.dump)
            # String data read in from JSON are unicode objects. This matters
            # for us as some data in the audit includes unicode characters.
            # If it does, a simple print no longer suffices if the system
            # locale is (e.g.) ASCII. In this case print will raise an
            # exception. The correct thing to do is to encode() the unicode to
            # the system locale.
            print(text.encode(locale.getpreferredencoding()))
            sys.exit(0)

    if not args.filename.endswith(".krun"):
        usage(parser)

    try:
        if os.stat(args.filename).st_size <= 0:
            util.fatal('Krun configuration file %s is empty.' % args.filename)
    except OSError:
        util.fatal('Krun configuration file %s does not exist.' % args.filename)

    config = Config(args.filename)

    if args.info:
        # Info mode doesn't run the experiment.
        # Just prints some metrics and exits.
        util.print_session_info(config)
        return

    manifest_filename = ManifestManager.get_filename(config)
    on_first_invocation = not (os.path.isfile(manifest_filename) and
                               os.stat(manifest_filename).st_size > 0)

    log_file = config.log_filename()
    if on_first_invocation and os.path.exists(log_file):
        util.fatal("Log file '%s' already exists. "
                   "Move the file away before running Krun." % log_file)

    attach_log_file(config, not on_first_invocation)
    debug("Krun invoked with arguments: %s" % sys.argv)

    mail_recipients = config.MAIL_TO
    if type(mail_recipients) is not list:
        util.fatal("MAIL_TO config should be a list")

    mailer = Mailer(mail_recipients, max_mails=config.MAX_MAILS)

    try:
        inner_main(mailer, on_first_invocation, config, args)
    except Exception as exn:
        error_info = sys.exc_info()
        subject = "Fatal Krun Exception"
        lines = ["Fatal Krun error: %s\n" % str(error_info[1])]
        for frame in traceback.format_tb(error_info[2]):
            lines.append(frame)
        msg = "".join(lines)
        util.log_and_mail(mailer, debug, subject, msg, bypass_limiter=True)
        raise exn
Example #26
    def test_write_results_to_disk(self, mock_platform,
                                   no_results_instantiation_check):
        config = Config("krun/tests/example.krun")
        mock_platform.num_cpus = 4
        mock_platform.num_per_core_measurements = mock_platform.num_cpus
        out_file = "krun/tests/example_results.json.bz2"
        results0 = Results(config, mock_platform)
        results0.audit = dict()
        results0.starting_temperatures = [4355, 9879]
        results0.wallclock_times = {u"dummy:Java:default-java": [[1.000726]]}
        results0.eta_estimates = {u"dummy:Java:default-java": [1.1]}
        results0.core_cycle_counts = {u"dummy:Java:default-java": [[[2], [3], [4], [5]]]}
        results0.aperf_counts = {u"dummy:Java:default-java": [[[3], [4], [5], [6]]]}
        results0.mperf_counts = {u"dummy:Java:default-java": [[[4], [5], [6], [7]]]}
        results0.reboots = 5
        results0.error_flag = False
        results0.write_to_file()
        results1 = Results(config, None, results_file=out_file)
        assert results0 == results1
        # Clean-up generated file.
        os.unlink(out_file)
Example #27
File: krun.py Project: bennn/krun
def inner_main(mailer, config, args):
    out_file = config.results_filename()
    out_file_exists = os.path.exists(out_file)

    if out_file_exists and not os.path.isfile(out_file):
        util.fatal(
            "Output file '%s' exists but is not a regular file" % out_file)

    if out_file_exists and not args.resume:
        util.fatal("Output file '%s' already exists. "
                   "Either resume the session (--resume) or "
                   "move the file away" % out_file)

    if not out_file_exists and args.resume:
        util.fatal("No results file to resume. Expected '%s'" % out_file)

    if args.started_by_init and not args.reboot:
        util.fatal("--started-by-init makes no sense without --reboot")

    if args.started_by_init and not args.resume:
        util.fatal("--started-by-init makes no sense without --resume")

    if args.develop:
        warn("Developer mode enabled. Results will not be reliable.")

    # Initialise platform instance and assign to VM defs.
    # This needs to be done early, so VM sanity checks can run.
    platform = detect_platform(mailer)

    if not args.develop:
        debug("Checking platform preliminaries")
        platform.check_preliminaries()
    else:
        # Needed to skip the use of certain tools and techniques.
        # E.g. switching user.
        warn("Not checking platform prerequisites due to developer mode")
        platform.developer_mode = True

    platform.collect_audit()

    # If the user has asked for resume-mode, the current machine must be
    # identical to the one on which the previous results were gathered.
    error_msg = ("You have asked Krun to resume an interrupted benchmark. " +
                 "This is only valid if the machine you are using is " +
                 "identical to the one on which the last results were " +
                 "gathered, which is not the case.")
    current = None
    if args.resume:
        # output file must exist, due to check above
        assert(out_file_exists)
        current = Results(config, platform, results_file=out_file)
        from krun.audit import Audit
        if not Audit(platform.audit) == current.audit:
            util.fatal(error_msg)

        debug("Using pre-recorded initial temperature readings")
        platform.starting_temperatures = current.starting_temperatures
    else:
        # Touch the config file to update its mtime. This is required
        # by resume-mode which uses the mtime to determine the name of
        # the log file, should this benchmark be resumed.
        _, _, rc = util.run_shell_cmd("touch " + args.filename)
        if rc != 0:
            util.fatal("Could not touch config file: " + args.filename)

        info(("Wait %s secs to allow system to cool prior to "
             "collecting initial temperature readings") %
             config.TEMP_READ_PAUSE)

        if args.develop or args.dry_run:
            info("SIMULATED: time.sleep(%s)" % config.TEMP_READ_PAUSE)
        else:
            time.sleep(config.TEMP_READ_PAUSE)

        debug("Taking fresh initial temperature readings")
        platform.starting_temperatures = platform.take_temperature_readings()

    # Assign platform to VM defs -- needs to happen early for sanity checks
    util.assign_platform(config, platform)

    sanity_checks(config, platform)

    # Build job queue -- each job is an execution
    sched = ExecutionScheduler(config,
                               mailer,
                               platform,
                               resume=args.resume,
                               reboot=args.reboot,
                               dry_run=args.dry_run,
                               started_by_init=args.started_by_init)
    sched.build_schedule()
    sched.run()
Example #28
class ExecutionScheduler(object):
    """Represents our entire benchmarking session"""
    def __init__(self,
                 config,
                 mailer,
                 platform,
                 resume=False,
                 reboot=False,
                 dry_run=False,
                 started_by_init=False):
        self.mailer = mailer

        self.config = config
        self.work_deque = deque()
        self.eta_avail = 0
        self.jobs_done = 0
        self.platform = platform
        self.resume = resume
        self.reboot = reboot
        self.dry_run = dry_run
        self.started_by_init = started_by_init

        if resume:
            self.results = Results(self.config,
                                   platform,
                                   results_file=self.config.results_filename())
        else:
            self.results = Results(self.config, platform)

        self.log_path = self.config.log_filename(self.resume)

    def set_eta_avail(self):
        """call after adding job before eta should become available"""
        self.eta_avail = len(self)

    def jobs_until_eta_known(self):
        return self.eta_avail - self.jobs_done

    def add_job(self, job):
        self.work_deque.append(job)

    def remove_job_by_key(self, key):
        for job in self.work_deque:
            if job.key == key:
                job_to_remove = job
                break
        else:
            raise JobMissingError(key)
        debug("Removed %s from schedule" % key)
        self.work_deque.remove(job_to_remove)

    def next_job(self, peek=False):
        try:
            job = self.work_deque.popleft()
            if peek:
                self.work_deque.appendleft(job)
            return job
        except IndexError:
            raise ScheduleEmpty()  # we are done

    def __len__(self):
        return len(self.work_deque)

    def get_estimated_exec_duration(self, key):
        previous_exec_times = self.results.eta_estimates.get(key)
        if previous_exec_times:
            return mean(previous_exec_times)
        else:
            return None  # we don't know

    def get_estimated_overall_duration(self):
        etas = [j.get_estimated_exec_duration() for j in self.work_deque]
        if None in etas:
            return None  # we don't know
        return sum(etas)

    def get_exec_estimate_time_formatter(self, key):
        return TimeEstimateFormatter(self.get_estimated_exec_duration(key))

    def get_overall_time_estimate_formatter(self):
        return TimeEstimateFormatter(self.get_estimated_overall_duration())

    def add_eta_info(self, key, exec_time):
        self.results.eta_estimates[key].append(exec_time)

    def build_schedule(self):
        """Builds a queue of process execution jobs.

        Returns two sets: non_skipped_keys, skipped_keys"""

        skipped_keys, non_skipped_keys = set(), set()

        one_exec_scheduled = False
        eta_avail_job = None
        for exec_n in xrange(self.config.N_EXECUTIONS):
            for vm_name, vm_info in self.config.VMS.items():
                for bmark, param in self.config.BENCHMARKS.items():
                    for variant in vm_info["variants"]:
                        job = ExecutionJob(self, vm_name, vm_info, bmark,
                                           variant, param)
                        if not self.config.should_skip(job.key):
                            non_skipped_keys |= set([job.key])
                            if one_exec_scheduled and not eta_avail_job:
                                # At the first job of the second round of
                                # executions, the ETA becomes known.
                                eta_avail_job = job
                                self.set_eta_avail()
                            self.add_job(job)
                        else:
                            skipped_keys |= set([job.key])
                            if not one_exec_scheduled:
                                debug("%s is in skip list. Not scheduling." %
                                      job.key)
            one_exec_scheduled = True
        self.expected_reboots = len(self)
        # Resume mode: if previous results are available, remove the
        # jobs from the schedule which have already been executed, and
        # add the results to this object, ready to be saved to a Json file.
        if self.resume:
            self._remove_previous_execs_from_schedule()
            # Sanity check ETA estimates
            for key, exec_data in self.results.data.iteritems():
                got_len = len(self.results.eta_estimates[key])
                expect_len = len(exec_data)
                if expect_len != got_len:
                    msg = "ETA estimates didn't tally with results: "
                    msg += "key=%s, expect_len=%d, got_len=%d " % \
                        (key, expect_len, got_len)
                    msg += "data[%s]=%s; " % (key, str(self.results.data[key]))
                    msg += "eta[%s]=%s" % \
                        (key, str(self.results.eta_estimates[key]))
                    util.log_and_mail(self.mailer,
                                      error,
                                      "Fatal Krun Error",
                                      msg,
                                      bypass_limiter=True,
                                      exit=True)
        return non_skipped_keys, skipped_keys

    def _remove_previous_execs_from_schedule(self):
        for key in self.results.data:
            num_completed_jobs = self.results.jobs_completed(key)
            if num_completed_jobs > 0:
                try:
                    debug("%s has already been run %d times." %
                          (key, num_completed_jobs))
                    for _ in range(num_completed_jobs):
                        self.remove_job_by_key(key)
                        self.jobs_done += 1
                except JobMissingError as excn:
                    tup = (excn.key, self.config.results_filename(),
                           self.config.filename)
                    msg = ("Failed to resume benchmarking session.\n"
                           "The execution %s appears in the results "
                           "file: %s, but not in the config file: %s." % tup)
                    util.fatal(msg)

    def _make_pre_post_cmd_env(self):
        """Prepare an environment dict for pre/post execution hooks"""

        jobs_until_eta_known = self.jobs_until_eta_known()
        if jobs_until_eta_known > 0:
            eta_val = "Unknown. Known in %d process executions." % \
                self.jobs_until_eta_known()
        else:
            eta_val = self.get_overall_time_estimate_formatter().finish_str

        return {
            "KRUN_RESULTS_FILE": self.config.results_filename(),
            "KRUN_LOG_FILE": self.config.log_filename(resume=True),
            "KRUN_ETA_DATUM": now_str(),
            "KRUN_ETA_VALUE": eta_val,
        }

    def run(self):
        """Benchmark execution starts here"""
        jobs_left = len(self)
        if jobs_left == 0:
            debug("Krun started with an empty queue of jobs")

        if not self.started_by_init:
            util.log_and_mail(self.mailer,
                              debug,
                              "Benchmarking started",
                              "Benchmarking started.\nLogging to %s" %
                              self.log_path,
                              bypass_limiter=True)

        if self.reboot and not self.started_by_init:
            # Reboot before first benchmark (dumps results file).
            info("Reboot prior to first execution")
            self._reboot()

        if self.reboot and self.started_by_init and jobs_left > 0:
            debug("Waiting %s seconds for the system to come up." %
                  str(STARTUP_WAIT_SECONDS))
            if self.dry_run:
                info("SIMULATED: time.sleep (would have waited %s seconds)." %
                     STARTUP_WAIT_SECONDS)
            else:
                time.sleep(STARTUP_WAIT_SECONDS)

        # Important that the dmesg is collected after the above startup wait.
        # Otherwise we get spurious dmesg changes.
        self.platform.collect_starting_dmesg()

        while True:
            self.platform.wait_for_temperature_sensors()

            jobs_left = len(self)
            if jobs_left == 0:
                break

            # Run the user's pre-process-execution commands
            util.run_shell_cmd_list(self.config.PRE_EXECUTION_CMDS,
                                    extra_env=self._make_pre_post_cmd_env())

            job = self.next_job()

            # We collect rough execution times separately from the real
            # results. The reason for this is that, even if a benchmark
            # crashes, it still takes time, and we need to account for this
            # when making estimates. A crashing benchmark will give an empty
            # list of iteration times, meaning we can't use 'raw_exec_result'
            # below for estimates.
            exec_start_time = time.time()
            raw_exec_result = job.run(self.mailer, self.dry_run)
            exec_end_time = time.time()

            exec_result = util.format_raw_exec_results(raw_exec_result)

            if not exec_result and not self.dry_run:
                self.results.error_flag = True

            self.results.data[job.key].append(exec_result)

            eta_info = exec_end_time - exec_start_time
            if self.reboot:
                # Add time taken to wait for system to come up if we are in
                # reboot mode.
                eta_info += STARTUP_WAIT_SECONDS
            self.add_eta_info(job.key, eta_info)

            info("%d executions left in scheduler queue" % (jobs_left - 1))

            # We dump the json after each experiment so we can monitor the
            # json file mid-run. It is overwritten each time.
            self.results.write_to_file()

            self.jobs_done += 1

            # Run the user's post-process-execution commands with updated
            # ETA estimates. Important that this happens *after* dumping
            # results, as the user is likely copying intermediate results to
            # another host.
            util.run_shell_cmd_list(self.config.POST_EXECUTION_CMDS,
                                    extra_env=self._make_pre_post_cmd_env())

            tfmt = self.get_overall_time_estimate_formatter()

            if self.eta_avail == self.jobs_done:
                # We just found out roughly how much time the session has
                # left; mail out.
                msg = "ETA for current session now known: %s" % tfmt.finish_str
                util.log_and_mail(self.mailer,
                                  debug,
                                  "ETA for Current Session Available",
                                  msg,
                                  bypass_limiter=True)

            info("{:<25s}: {} ({} from now)".format("Estimated completion",
                                                    tfmt.finish_str,
                                                    tfmt.delta_str))

            try:
                job = self.next_job(peek=True)
            except ScheduleEmpty:
                pass  # no next job
            else:
                # print info about the next job
                info("Next execution is '%s(%d)' (%s variant) under '%s'" %
                     (job.benchmark, job.parameter, job.variant, job.vm_name))

                tfmt = self.get_exec_estimate_time_formatter(job.key)
                info("{:<35s}: {} ({} from now)".format(
                    "Estimated completion (next execution)", tfmt.finish_str,
                    tfmt.delta_str))

            if (self.eta_avail is not None and
                    self.jobs_done < self.eta_avail):
                info("Executions until ETA known: %s" %
                     self.jobs_until_eta_known())

            if self.platform.check_dmesg_for_changes():
                self.results.error_flag = True

            if self.reboot and len(self) > 0:
                info("Reboot in preparation for next execution")
                self._reboot()

        self.platform.save_power()

        info("Done: Results dumped to %s" % self.config.results_filename())
        err_msg = "Errors/warnings occurred -- read the log!"
        if self.results.error_flag:
            warn(err_msg)

        msg = "Session completed. Log file at: '%s'" % (self.log_path)

        if self.results.error_flag:
            msg += "\n\n%s" % err_msg

        if self.reboot:
            msg += "\n\nDon't forget to disable Krun at boot."

        util.log_and_mail(self.mailer,
                          info,
                          "Benchmarks Complete",
                          msg,
                          bypass_limiter=True)

    def _reboot(self):
        self.results.reboots += 1
        debug("About to execute reboot: %g, expecting %g in total." %
              (self.results.reboots, self.expected_reboots))
        # Dump the results file. This may already have been done, but we
        # have changed self.results.reboots, which needs to be written out.
        self.results.write_to_file()

        if self.results.reboots > self.expected_reboots:
            util.fatal(("HALTING now to prevent an infinite reboot loop: " +
                        "INVARIANT num_reboots <= num_jobs violated. " +
                        "Krun was about to execute reboot number: %g. " +
                        "%g jobs have been completed, %g are left to go.") %
                       (self.results.reboots, self.jobs_done, len(self)))
        if self.dry_run:
            info("SIMULATED: reboot (restarting Krun in-place)")
            args = sys.argv
            if not self.started_by_init:
                args.extend(["--resume", "--started-by-init"])
                debug("Simulated reboot with args: " + " ".join(args))
            os.execv(args[0], args)  # replace myself
            assert False  # unreachable
        else:
            subprocess.call(self.platform.get_reboot_cmd())
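
The ETA machinery in this class is plain running means:
get_estimated_exec_duration averages the recorded times for a key, and the
overall estimate sums those means over every queued job, or stays unknown if
any key has no history yet. A standalone sketch of the arithmetic (assuming
eta_estimates maps keys to lists of times in seconds):

def mean(xs):
    return sum(xs) / float(len(xs))  # true division under Python 2

def overall_eta(eta_estimates, queued_keys):
    # None means "not known yet", mirroring the scheduler's behaviour.
    per_job = [mean(eta_estimates[key]) if eta_estimates.get(key) else None
               for key in queued_keys]
    if None in per_job:
        return None
    return sum(per_job)

# e.g. two queued executions of a key with history [1.0, 1.2]:
#   overall_eta({"bench:vm:variant": [1.0, 1.2]},
#               ["bench:vm:variant"] * 2) == 2.2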
Example #29
def inner_main(mailer, on_first_invocation, config, args):
    out_file = config.results_filename()
    out_file_exists = os.path.exists(out_file)

    instr_dir = util.get_instr_json_dir(config)
    instr_dir_exists = os.path.exists(instr_dir)

    envlog_dir = util.get_envlog_dir(config)
    envlog_dir_exists = os.path.exists(envlog_dir)

    if out_file_exists and not os.path.isfile(out_file):
        util.fatal(
            "Output file '%s' exists but is not a regular file" % out_file)

    if out_file_exists and on_first_invocation:
        util.fatal("Output results file '%s' already exists. "
                   "Move the file away before running Krun." % out_file)

    if instr_dir_exists and on_first_invocation:
        util.fatal("Instrumentation dir '%s' exists." % instr_dir)

    if envlog_dir_exists and on_first_invocation:
        util.fatal("Env log dir '%s' exists." % envlog_dir)

    if not out_file_exists and not on_first_invocation:
        util.fatal("No results file to resume. Expected '%s'" % out_file)

    # Initialise platform instance and assign to VM defs.
    # This needs to be done early, so VM sanity checks can run.
    platform = detect_platform(mailer, config)

    platform.quick_mode = args.quick
    platform.no_user_change = args.no_user_change
    platform.no_tickless_check = args.no_tickless_check
    platform.no_pstate_check = args.no_pstate_check
    platform.hardware_reboots = args.hardware_reboots

    # Create the instrumentation directory if required
    if on_first_invocation:
        # We only want to make a dir if >=1 VM is in instrumentation mode.
        for vm in config.VMS.itervalues():
            if vm['vm_def'].instrument:
                util.make_instr_dir(config)
                break

    debug("Checking platform preliminaries")
    platform.check_preliminaries()

    # Make a bit of noise if this is a virtualised environment
    if platform.is_virtual():
        warn("This appears to be a virtualised host. The results will be flawed. "
             "Use bare-metal for reliable results!")

    platform.collect_audit()

    # At this point the config file is OK, and on-disk state is consistent,
    # so let's daemonise (if requested).
    if args.daemonise:
        util.daemonise()

    if not on_first_invocation:
        # output file must exist, due to check above
        assert(out_file_exists)

        debug("Using pre-recorded initial temperature readings")
        manifest = ManifestManager(config, platform)

        platform_temps = {}
        for sensor, tup in manifest.starting_temperatures.iteritems():
            platform_temps[sensor] = tup[1]
        platform.starting_temperatures = platform_temps
    else:
        manifest = ManifestManager(config, platform, new_file=True)
        if manifest.num_execs_left == 0:
            # No executions, or all skipped
            fatal("Empty schedule!")

        try:
            info(("Wait %s secs to allow system to cool prior to "
                 "collecting initial temperature readings") %
                 config.TEMP_READ_PAUSE)

            # This part is wrapped in hooks, so that if daemons or networking are
            # taken down for process executions, then the initial temperature
            # reading gets the same treatment.
            util.run_shell_cmd_list(config.PRE_EXECUTION_CMDS,)
            platform.sleep(config.TEMP_READ_PAUSE)

            debug("Taking fresh initial temperature readings")
            platform.starting_temperatures = platform.take_temperature_readings()
            manifest.set_starting_temperatures(platform.starting_temperatures)

            # Write out an empty results file. After the initial reboot Krun
            # will expect this to exist.
            Results.ok_to_instantiate = True
            results = Results(config, platform)
            results.write_to_file()
        except:
            raise
        finally:
            util.run_shell_cmd_list(config.POST_EXECUTION_CMDS,)

        log_path = config.log_filename(resume=False)
        util.log_and_mail(mailer, debug,
                          "Benchmarking started",
                          "Benchmarking started.\nLogging to %s" %
                          log_path, bypass_limiter=True)

        util.reboot(manifest, platform)

    # Assign platform to VM defs -- needs to happen early for sanity checks
    util.assign_platform(config, platform)

    sanity_checks(config, platform)

    # Build job queue -- each job is an execution
    sched = ExecutionScheduler(config,
                               mailer,
                               platform,
                               dry_run=args.dry_run)
    sched.run()
Example #30
File: krun.py Project: warsier/krun
def inner_main(mailer, on_first_invocation, config, args):
    out_file = config.results_filename()
    out_file_exists = os.path.exists(out_file)

    instr_dir = util.get_instr_json_dir(config)
    instr_dir_exists = os.path.exists(instr_dir)

    envlog_dir = util.get_envlog_dir(config)
    envlog_dir_exists = os.path.exists(envlog_dir)

    if out_file_exists and not os.path.isfile(out_file):
        util.fatal("Output file '%s' exists but is not a regular file" %
                   out_file)

    if out_file_exists and on_first_invocation:
        util.fatal("Output results file '%s' already exists. "
                   "Move the file away before running Krun." % out_file)

    if instr_dir_exists and on_first_invocation:
        util.fatal("Instrumentation dir '%s' exists." % instr_dir)

    if envlog_dir_exists and on_first_invocation:
        util.fatal("Env log dir '%s' exists." % envlog_dir)

    if not out_file_exists and not on_first_invocation:
        util.fatal("No results file to resume. Expected '%s'" % out_file)

    # Initialise platform instance and assign to VM defs.
    # This needs to be done early, so VM sanity checks can run.
    platform = detect_platform(mailer, config)

    platform.quick_mode = args.quick
    platform.no_user_change = args.no_user_change
    platform.no_tickless_check = args.no_tickless_check
    platform.no_pstate_check = args.no_pstate_check
    platform.hardware_reboots = args.hardware_reboots

    # Create the instrumentation directory if required
    if on_first_invocation:
        # We only want to make a dir if >=1 VM is in instrumentation mode.
        for vm in config.VMS.itervalues():
            if vm['vm_def'].instrument:
                util.make_instr_dir(config)
                break

    debug("Checking platform preliminaries")
    platform.check_preliminaries()

    # Make a bit of noise if this is a virtualised environment
    if platform.is_virtual():
        warn(
            "This appears to be a virtualised host. The results will be flawed. "
            "Use bare-metal for reliable results!")

    platform.collect_audit()

    # At this point the config file is OK, and on-disk state is consistent,
    # so let's daemonise (if requested).
    if args.daemonise:
        util.daemonise()

    if not on_first_invocation:
        # output file must exist, due to check above
        assert (out_file_exists)

        debug("Using pre-recorded initial temperature readings")
        manifest = ManifestManager(config, platform)

        platform_temps = {}
        for sensor, tup in manifest.starting_temperatures.iteritems():
            platform_temps[sensor] = tup[1]
        platform.starting_temperatures = platform_temps
    else:
        manifest = ManifestManager(config, platform, new_file=True)
        if manifest.num_execs_left == 0:
            # No executions, or all skipped
            fatal("Empty schedule!")

        try:
            info(("Wait %s secs to allow system to cool prior to "
                  "collecting initial temperature readings") %
                 config.TEMP_READ_PAUSE)

            # This part is wrapped in hooks, so that if daemons or networking are
            # taken down for process executions, then the initial temperature
            # reading gets the same treatment.
            util.run_shell_cmd_list(config.PRE_EXECUTION_CMDS, )
            platform.sleep(config.TEMP_READ_PAUSE)

            debug("Taking fresh initial temperature readings")
            platform.starting_temperatures = platform.take_temperature_readings()
            manifest.set_starting_temperatures(platform.starting_temperatures)

            # Write out an empty results file. After the initial reboot Krun
            # will expect this to exist.
            Results.ok_to_instantiate = True
            results = Results(config, platform)
            results.write_to_file()
        except:
            raise
        finally:
            util.run_shell_cmd_list(config.POST_EXECUTION_CMDS, )

        log_path = config.log_filename(resume=False)
        util.log_and_mail(mailer,
                          debug,
                          "Benchmarking started",
                          "Benchmarking started.\nLogging to %s" % log_path,
                          bypass_limiter=True)

        util.reboot(manifest, platform)

    # Assign platform to VM defs -- needs to happen early for sanity checks
    util.assign_platform(config, platform)

    sanity_checks(config, platform)

    # Build job queue -- each job is an execution
    sched = ExecutionScheduler(config, mailer, platform, dry_run=args.dry_run)
    sched.run()
Example #31
File: krun.py Project: warsier/krun
def main(parser):
    args = parser.parse_args()

    if args.dump is not None:
        if not args.filename.endswith(".json.bz2"):
            usage(parser)
        else:
            Results.ok_to_instantiate = True
            results = Results(None, None, results_file=args.filename)
            text = results.dump(args.dump)
            # String data read in from JSON are unicode objects. This matters
            # for us as some data in the audit includes unicode characters.
            # If it does, a simple print no longer suffices if the system
            # locale is (e.g.) ASCII. In this case print will raise an
            # exception. The correct thing to do is to encode() the unicode to
            # the system locale.
            print(text.encode(locale.getpreferredencoding()))
            sys.exit(0)

    if not args.filename.endswith(".krun"):
        usage(parser)

    try:
        if os.stat(args.filename).st_size <= 0:
            util.fatal('Krun configuration file %s is empty.' % args.filename)
    except OSError:
        util.fatal('Krun configuration file %s does not exist.' %
                   args.filename)

    config = Config(args.filename)

    if args.info:
        # Info mode doesn't run the experiment.
        # Just prints some metrics and exits.
        util.print_session_info(config)
        return

    manifest_filename = ManifestManager.get_filename(config)
    on_first_invocation = not (os.path.isfile(manifest_filename)
                               and os.stat(manifest_filename).st_size > 0)

    log_file = config.log_filename()
    if on_first_invocation and os.path.exists(log_file):
        util.fatal("Log file '%s' already exists. "
                   "Move the file away before running Krun." % log_file)

    attach_log_file(config, not on_first_invocation)
    debug("Krun invoked with arguments: %s" % sys.argv)

    mail_recipients = config.MAIL_TO
    if type(mail_recipients) is not list:
        util.fatal("MAIL_TO config should be a list")

    mailer = Mailer(mail_recipients, max_mails=config.MAX_MAILS)

    try:
        inner_main(mailer, on_first_invocation, config, args)
    except Exception as exn:
        error_info = sys.exc_info()
        subject = "Fatal Krun Exception"
        lines = ["Fatal Krun error: %s\n" % str(error_info[1])]
        for frame in traceback.format_tb(error_info[2]):
            lines.append(frame)
        msg = "".join(lines)
        util.log_and_mail(mailer, debug, subject, msg, bypass_limiter=True)
        raise exn
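main() above wraps inner_main() so that any fatal exception is both logged/mailed and re-raised. Below is a standalone sketch of that "mail the traceback, then re-raise" pattern; the mailer here is a stand-in with a hypothetical send() method, not Krun's Mailer class.

import sys
import traceback

def run_with_error_mail(fn, mailer):
    try:
        fn()
    except Exception:
        etype, evalue, etb = sys.exc_info()
        lines = ["Fatal error: %s\n" % str(evalue)]
        lines.extend(traceback.format_tb(etb))
        mailer.send("Fatal Krun Exception", "".join(lines))
        raise  # a bare raise preserves the original traceback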
Example No. 34
    def run(self):
        """Benchmark execution starts here"""

        # In reboot mode, wait for the system to come up before we proceed
        if self.platform.hardware_reboots:
            debug("Waiting %s seconds for the system to come up." %
                  str(STARTUP_WAIT_SECONDS))
            self.platform.sleep(STARTUP_WAIT_SECONDS)

        # Important that the dmesg is collected after the above startup wait.
        # Otherwise we get spurious dmesg changes.
        self.platform.collect_starting_dmesg()

        assert self.manifest.num_execs_left > 0
        self.platform.wait_for_temperature_sensors()

        bench, vm, variant = self.manifest.next_exec_key.split(":")
        key_pexec_idx = self.manifest.next_exec_key_index()
        job = ExecutionJob(self, vm, self.config.VMS[vm], bench, variant,
                           self.config.BENCHMARKS[bench], key_pexec_idx)

        # Default to error state. This is the value the finally block will see
        # if an exception is raised inside the try block, otherwise it is
        # re-assigned based on the result of running the benchmark.
        flag = 'E'

        # Run the pre-exec commands, the benchmark and the post-exec commands.
        # These are wrapped in a try/except, so that the post-exec commands
        # are always executed, even if an exception has occurred. We only
        # reboot /after/ the post-exec commands have completed.
        results = None
        try:
            # Run the user's pre-process-execution commands. We can't put an
            # ETA estimate in the environment for the pre-commands as we have
            # not (and should not) load the results file into memory yet.
            #
            # It might seem tempting to move this outside the try block, to
            # ensure that post-hooks are only run if pre-hooks ran. We don't,
            # thus avoiding the case where only *part* of the pre-hooks run,
            # but the post-hooks then don't run.
            util.run_shell_cmd_list(self.config.PRE_EXECUTION_CMDS)

            # We collect rough execution times separate from real results. The
            # reason for this is that, even if a benchmark crashes it takes
            # time and we need to account for this when making estimates. A
            # crashing benchmark will give an empty list of iteration times,
            # meaning we can't use 'measurements' below for estimates.
            exec_start_time = time.time()
            measurements, instr_data, flag = job.run(self.mailer, self.dry_run)
            exec_end_time = time.time()

            # Only now is it OK to load the results file into memory.
            Results.ok_to_instantiate = True
            results = Results(self.config, self.platform,
                              results_file=self.config.results_filename())

            # Bail early if the process execution needs to be re-run.
            if flag == "O":
                util.run_shell_cmd_list(
                    self.config.POST_EXECUTION_CMDS,
                    extra_env=self._make_post_cmd_env(results)
                )
                info("Rebooting to re-run previous process execution")
                util.reboot(self.manifest, self.platform, update_count=False)
                # reboot() does not return
                raise RuntimeError("reached unreachable code!")

            # Store new result.
            results.append_exec_measurements(job.key, measurements, flag)

            # Store instrumentation data in a separate file
            if job.vm_info["vm_def"].instrument:
                key_exec_num = self.manifest.completed_exec_counts[job.key]
                util.dump_instr_json(job.key, key_exec_num, self.config, instr_data)

            eta_info = exec_end_time - exec_start_time
            if self.platform.hardware_reboots:
                # Add time taken to wait for system to come up if we are in
                # hardware-reboot mode.
                eta_info += STARTUP_WAIT_SECONDS
            results.eta_estimates[job.key].append(eta_info)
            self.manifest.update(flag)
        finally:
            # Run the user's post-process-execution commands with updated
            # ETA estimates. Important that this happens *after* dumping
            # results, as the user is likely copying intermediate results to
            # another host.

            # _make_post_cmd_env() needs the results to make an ETA. If an
            # exception occurred in the above try block, there's a chance
            # that the results have not been loaded.
            if results is None:
                Results.ok_to_instantiate = True
                results = Results(self.config, self.platform,
                                  results_file=self.config.results_filename())

            # If errors occurred, set the error flag in the results file
            if self.platform.check_dmesg_for_changes(self.manifest) or \
                    flag == 'E':
                results.error_flag = True

            results.write_to_file()
            util.run_shell_cmd_list(
                self.config.POST_EXECUTION_CMDS,
                extra_env=self._make_post_cmd_env(results)
            )

        tfmt = self.get_overall_time_estimate_formatter(results)

        if self.manifest.eta_avail_idx == self.manifest.next_exec_idx:
            # We just found out roughly how long the session has left, mail out.
            msg = "ETA for current session now known: %s" % tfmt.finish_str
            util.log_and_mail(self.mailer, debug,
                              "ETA for Current Session Available",
                              msg, bypass_limiter=True)

        info("{:<25s}: {} ({} from now)".format(
            "Estimated completion (whole session)", tfmt.finish_str,
            tfmt.delta_str))

        info("%d executions left in scheduler queue" % self.manifest.num_execs_left)

        if self.manifest.num_execs_left > 0 and \
                self.manifest.eta_avail_idx > self.manifest.next_exec_idx:
            info("Executions until ETA known: %s" %
                 (self.manifest.eta_avail_idx -
                  self.manifest.next_exec_idx))

        # Although it would have been nice to have checked this prior to
        # running the execution, it depends on the results file, which we
        # should not load prior to the process execution.
        util.check_audit_unchanged(results, self.platform)

        assert self.manifest.num_execs_left >= 0
        if self.manifest.num_execs_left > 0:
            # print info about the next job
            benchmark, vm_name, variant = \
                self.manifest.next_exec_key.split(":")
            info("Next execution is '%s(%d)' (%s variant) under '%s'" %
                 (benchmark, self.config.BENCHMARKS[benchmark], variant, vm_name))

            tfmt = self.get_exec_estimate_time_formatter(job.key, results)
            info("{:<35s}: {} ({} from now)".format(
                "Estimated completion (next execution)",
                tfmt.finish_str,
                tfmt.delta_str))

            info("Reboot in preparation for next execution")
            util.reboot(self.manifest, self.platform)
        elif self.manifest.num_execs_left == 0:
            self.platform.save_power()
            if self.config.ENABLE_PINNING:
                self.platform.clear_cpu_pinning()

            info("Done: Results dumped to %s" % self.config.results_filename())
            err_msg = "Errors/warnings occurred -- read the log!"
            if results.error_flag:
                warn(err_msg)

            msg = "Session completed. Log file at: '%s'" % (self.log_path)

            if results.error_flag:
                msg += "\n\n%s" % err_msg

            msg += "\n\nDon't forget to disable Krun at boot."

            util.log_and_mail(self.mailer, info, "Benchmarks Complete", msg,
                              bypass_limiter=True)
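The run() method above does its ETA accounting from rough wall-clock timings rather than from the benchmark's own iteration times, so that crashing executions (which return no iterations but still cost time) are accounted for. Here is a standalone sketch of that bookkeeping, with hypothetical names:

import time

eta_estimates = {}  # key -> list of per-execution durations (seconds)

def record_exec(key, fn, startup_wait=0):
    start = time.time()
    fn()
    # In hardware-reboot mode, also count the post-reboot startup wait.
    eta_estimates.setdefault(key, []).append(
        time.time() - start + startup_wait)

def estimate_remaining(keys_left):
    total = 0.0
    for key in keys_left:
        times = eta_estimates.get(key)
        if not times:
            return None  # no history for this key yet: ETA unknown
        total += sum(times) / len(times)
    return total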
Example No. 35
class ExecutionScheduler(object):
    """Represents our entire benchmarking session"""

    def __init__(self, config, mailer, platform,
                 resume=False, reboot=False, dry_run=False,
                 started_by_init=False):
        self.mailer = mailer

        self.config = config
        self.work_deque = deque()
        self.eta_avail = 0
        self.jobs_done = 0
        self.platform = platform
        self.resume = resume
        self.reboot = reboot
        self.dry_run = dry_run
        self.started_by_init = started_by_init

        if resume:
            self.results = Results(self.config, platform,
                                   results_file=self.config.results_filename())
        else:
            self.results = Results(self.config, platform)

        self.log_path = self.config.log_filename(self.resume)

    def set_eta_avail(self):
        """call after adding job before eta should become available"""
        self.eta_avail = len(self)

    def jobs_until_eta_known(self):
        return self.eta_avail - self.jobs_done

    def add_job(self, job):
        self.work_deque.append(job)

    def remove_job_by_key(self, key):
        for job in self.work_deque:
            if job.key == key:
                job_to_remove = job
                break
        else:
            raise JobMissingError(key)
        debug("Removed %s from schedule" % key)
        self.work_deque.remove(job_to_remove)

    def next_job(self, peek=False):
        try:
            job = self.work_deque.popleft()
            if peek:
                self.work_deque.appendleft(job)
            return job
        except IndexError:
            raise ScheduleEmpty() # we are done

    def __len__(self):
        return len(self.work_deque)

    def get_estimated_exec_duration(self, key):
        previous_exec_times = self.results.eta_estimates.get(key)
        if previous_exec_times:
            return mean(previous_exec_times)
        else:
            return None # we don't know

    def get_estimated_overall_duration(self):
        etas = [j.get_estimated_exec_duration() for j in self.work_deque]
        if None in etas:
            return None # we don't know
        return sum(etas)

    def get_exec_estimate_time_formatter(self, key):
        return TimeEstimateFormatter(self.get_estimated_exec_duration(key))

    def get_overall_time_estimate_formatter(self):
        return TimeEstimateFormatter(self.get_estimated_overall_duration())

    def add_eta_info(self, key, exec_time):
        self.results.eta_estimates[key].append(exec_time)

    def build_schedule(self):
        """Builds a queue of process execution jobs.

        Returns two sets: non_skipped_keys, skipped_keys"""

        skipped_keys, non_skipped_keys = set(), set()

        one_exec_scheduled = False
        eta_avail_job = None
        for exec_n in xrange(self.config.N_EXECUTIONS):
            for vm_name, vm_info in self.config.VMS.items():
                for bmark, param in self.config.BENCHMARKS.items():
                    for variant in vm_info["variants"]:
                        job = ExecutionJob(self, vm_name, vm_info, bmark, variant, param)
                        if not self.config.should_skip(job.key):
                            non_skipped_keys |= set([job.key])
                            if one_exec_scheduled and not eta_avail_job:
                                # ETA becomes known at the first job of the
                                # second round of executions.
                                eta_avail_job = job
                                self.set_eta_avail()
                            self.add_job(job)
                        else:
                            skipped_keys |= set([job.key])
                            if not one_exec_scheduled:
                                debug("%s is in skip list. Not scheduling." %
                                      job.key)
            one_exec_scheduled = True
        self.expected_reboots = len(self)
        # Resume mode: if previous results are available, remove the
        # jobs from the schedule which have already been executed, and
        # add the results to this object, ready to be saved to a Json file.
        if self.resume:
            self._remove_previous_execs_from_schedule()
            # Sanity check ETA estimates
            for key, exec_data in self.results.data.iteritems():
                got_len = len(self.results.eta_estimates[key])
                expect_len = len(exec_data)
                if expect_len != got_len:
                    msg = "ETA estimates didn't tally with results: "
                    msg += "key=%s, expect_len=%d, got_len=%d " % \
                        (key, expect_len, got_len)
                    msg += "data[%s]=%s; " % (key, str(self.results.data[key]))
                    msg += "eta[%s]=%s" % \
                        (key, str(self.results.eta_estimates[key]))
                    util.log_and_mail(self.mailer, error,
                                      "Fatal Krun Error",
                                      msg, bypass_limiter=True, exit=True)
        return non_skipped_keys, skipped_keys

    def _remove_previous_execs_from_schedule(self):
        for key in self.results.data:
            num_completed_jobs = self.results.jobs_completed(key)
            if num_completed_jobs > 0:
                try:
                    debug("%s has already been run %d times." %
                          (key, num_completed_jobs))
                    for _ in range(num_completed_jobs):
                        self.remove_job_by_key(key)
                        self.jobs_done += 1
                except JobMissingError as excn:
                    tup = (excn.key, self.config.results_filename(),
                           self.config.filename)
                    msg = ("Failed to resume benchmarking session.\n"
                           "The execution %s appears in the results "
                           "file: %s, but not in the config file: %s." % tup)
                    util.fatal(msg)

    def _make_pre_post_cmd_env(self):
        """Prepare an environment dict for pre/post execution hooks"""

        jobs_until_eta_known = self.jobs_until_eta_known()
        if jobs_until_eta_known > 0:
            eta_val = "Unknown. Known in %d process executions." % \
                self.jobs_until_eta_known()
        else:
            eta_val = self.get_overall_time_estimate_formatter().finish_str

        return {
            "KRUN_RESULTS_FILE": self.config.results_filename(),
            "KRUN_LOG_FILE": self.config.log_filename(resume=True),
            "KRUN_ETA_DATUM": now_str(),
            "KRUN_ETA_VALUE": eta_val,
        }

    def run(self):
        """Benchmark execution starts here"""
        jobs_left = len(self)
        if jobs_left == 0:
            debug("Krun started with an empty queue of jobs")

        if not self.started_by_init:
            util.log_and_mail(self.mailer, debug,
                              "Benchmarking started",
                              "Benchmarking started.\nLogging to %s" %
                              self.log_path,
                              bypass_limiter=True)

        if self.reboot and not self.started_by_init:
            # Reboot before first benchmark (dumps results file).
            info("Reboot prior to first execution")
            self._reboot()

        if self.reboot and self.started_by_init and jobs_left > 0:
            debug("Waiting %s seconds for the system to come up." %
                 str(STARTUP_WAIT_SECONDS))
            if self.dry_run:
                info("SIMULATED: time.sleep (would have waited %s seconds)." %
                     STARTUP_WAIT_SECONDS)
            else:
                time.sleep(STARTUP_WAIT_SECONDS)

        # Important that the dmesg is collected after the above startup wait.
        # Otherwise we get spurious dmesg changes.
        self.platform.collect_starting_dmesg()

        while True:
            self.platform.wait_for_temperature_sensors()

            jobs_left = len(self)
            if jobs_left == 0:
                break

            # Run the user's pre-process-execution commands
            util.run_shell_cmd_list(
                self.config.PRE_EXECUTION_CMDS,
                extra_env=self._make_pre_post_cmd_env()
            )

            job = self.next_job()

            # We collect rough execution times separate from real results. The
            # reason for this is that, even if a benchmark crashes it takes
            # time and we need to account for this when making estimates. A
            # crashing benchmark will give an empty list of iteration times,
            # meaning we can't use 'raw_exec_result' below for estimates.
            exec_start_time = time.time()
            raw_exec_result = job.run(self.mailer, self.dry_run)
            exec_end_time = time.time()

            exec_result = util.format_raw_exec_results(raw_exec_result)

            if not exec_result and not self.dry_run:
                self.results.error_flag = True

            self.results.data[job.key].append(exec_result)

            eta_info = exec_end_time - exec_start_time
            if self.reboot:
                # Add time taken to wait for system to come up if we are in
                # reboot mode.
                eta_info += STARTUP_WAIT_SECONDS
            self.add_eta_info(job.key, eta_info)

            info("%d executions left in scheduler queue" % (jobs_left - 1))

            # We dump the json after each experiment so we can monitor the
            # json file mid-run. It is overwritten each time.
            self.results.write_to_file()

            self.jobs_done += 1

            # Run the user's post-process-execution commands with updated
            # ETA estimates. Important that this happens *after* dumping
            # results, as the user is likely copying intermediate results to
            # another host.
            util.run_shell_cmd_list(
                self.config.POST_EXECUTION_CMDS,
                extra_env=self._make_pre_post_cmd_env()
            )

            tfmt = self.get_overall_time_estimate_formatter()

            if self.eta_avail == self.jobs_done:
                # We just found out roughly how long the session has left, mail out.
                msg = "ETA for current session now known: %s" % tfmt.finish_str
                util.log_and_mail(self.mailer, debug,
                                  "ETA for Current Session Available",
                                  msg, bypass_limiter=True)

            info("{:<25s}: {} ({} from now)".format(
                "Estimated completion", tfmt.finish_str, tfmt.delta_str))

            try:
                job = self.next_job(peek=True)
            except ScheduleEmpty:
                pass  # no next job
            else:
                # print info about the next job
                info("Next execution is '%s(%d)' (%s variant) under '%s'" %
                     (job.benchmark, job.parameter, job.variant, job.vm_name))

                tfmt = self.get_exec_estimate_time_formatter(job.key)
                info("{:<35s}: {} ({} from now)".format(
                    "Estimated completion (next execution)",
                    tfmt.finish_str,
                    tfmt.delta_str))

            if (self.eta_avail is not None) and (self.jobs_done < self.eta_avail):
                info("Executions until ETA known: %s" % self.jobs_until_eta_known())

            if self.platform.check_dmesg_for_changes():
                self.results.error_flag = True

            if self.reboot and len(self) > 0:
                info("Reboot in preparation for next execution")
                self._reboot()

        self.platform.save_power()

        info("Done: Results dumped to %s" % self.config.results_filename())
        err_msg = "Errors/warnings occurred -- read the log!"
        if self.results.error_flag:
            warn(err_msg)

        msg = "Session completed. Log file at: '%s'" % (self.log_path)

        if self.results.error_flag:
            msg += "\n\n%s" % err_msg

        if self.reboot:
            msg += "\n\nDon't forget to disable Krun at boot."

        util.log_and_mail(self.mailer, info, "Benchmarks Complete", msg,
                          bypass_limiter=True)

    def _reboot(self):
        self.results.reboots += 1
        debug("About to execute reboot: %g, expecting %g in total." %
              (self.results.reboots, self.expected_reboots))
        # Dump the results file. This may already have been done, but we
        # have changed self.nreboots, which needs to be written out.
        self.results.write_to_file()

        if self.results.reboots > self.expected_reboots:
            util.fatal(("HALTING now to prevent an infinite reboot loop: " +
                        "INVARIANT num_reboots <= num_jobs violated. " +
                        "Krun was about to execute reboot number: %g. " +
                        "%g jobs have been completed, %g are left to go.") %
                       (self.results.reboots, self.jobs_done, len(self)))
        if self.dry_run:
            info("SIMULATED: reboot (restarting Krun in-place)")
            args = sys.argv
            if not self.started_by_init:
                args.extend(["--resume", "--started-by-init"])
                debug("Simulated reboot with args: " + " ".join(args))
            os.execv(args[0], args)  # replace myself
            assert False  # unreachable
        else:
            subprocess.call(self.platform.get_reboot_cmd())
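
In dry-run mode, _reboot() above simulates a reboot by replacing the current process with a fresh invocation of itself, with extra flags so the new process resumes the session. A minimal sketch of that trick follows; it assumes, as the code above does, that sys.argv[0] is directly executable:

import os
import sys

def simulated_reboot(extra_args):
    args = sys.argv + extra_args
    os.execv(args[0], args)  # replaces this process; never returns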