Example #1
    def test_timeout_unhandled(self):
        '''with timeout handling disabled, jobs are marked FAILED'''

        remote_dir = tempfile.TemporaryDirectory(prefix="remote")
        remote_path = os.path.join(remote_dir.name, 'side0.dat')
        with open(remote_path, 'w') as fp:
            fp.write('9\n')

        # Same as the previous test, but square.py sleeps so that the job
        # is still running when the launcher shuts down
        job = create_job(name='square_testjob2',
                         app='square',
                         args='side0.dat --sleep 5',
                         url_in=f'local:{remote_dir.name}',
                         stage_out_files='square*',
                         url_out=f'local:{remote_dir.name}',
                         auto_timeout_retry=False)

        # Job reaches the RUNNING state and then times out
        success = util.run_launcher_until_state(job, 'RUNNING')
        self.assertTrue(success)

        # On termination, the actively running job is marked RUN_TIMEOUT
        def check():
            job.refresh_from_db()
            return job.state == 'RUN_TIMEOUT'

        success = util.poll_until_returns_true(check, timeout=12)
        self.assertTrue(success)
        self.assertEqual(job.state, 'RUN_TIMEOUT')

        # If we run the launcher again, it will pick up the timed-out job,
        # but without timeout handling, it fails
        success = util.run_launcher_until_state(job, 'FAILED')
        self.assertTrue(success)
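The timeout examples lean on a generic polling helper. A minimal sketch of what util.poll_until_returns_true might look like, inferred only from the call sites above (hypothetical implementation, not Balsam's actual test util):

    import time

    def poll_until_returns_true(check, period=1.0, timeout=12.0):
        # Call check() every `period` seconds until it returns True or
        # `timeout` seconds elapse; return the last result so callers
        # can assert on it.
        start = time.time()
        result = False
        while time.time() - start < timeout:
            result = check()
            if result:
                break
            time.sleep(period)
        return result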
Example #2
    def test_error_unhandled(self):
        '''test unhandled return code from app'''

        remote_dir = tempfile.TemporaryDirectory(prefix="remote")
        remote_path = os.path.join(remote_dir.name, 'side0.dat')
        with open(remote_path, 'w') as fp:
            fp.write('9\n')

        # Same as previous test, but square.py returns nonzero
        job = create_job(name='square_testjob2',
                         app='square',
                         args='side0.dat --retcode 1',
                         url_in=f'local:{remote_dir.name}',
                         stage_out_files='square*',
                         url_out=f'local:{remote_dir.name}')
        self.assertEqual(job.application_args, 'side0.dat --retcode 1')
        self.assertEqual(BalsamJob.objects.all().count(), 1)

        # The job is marked FAILED due to unhandled nonzero return code
        success = util.run_launcher_until_state(job, 'FAILED')
        self.assertTrue(success)

        # (But actually the application ran and printed its result correctly)
        result = float(job.read_file_in_workdir('square.dat'))
        self.assertEqual(result, 81.0)

        preproc_out_contents = job.read_file_in_workdir('preprocess.log')

        jobid_line = [line for line in preproc_out_contents.split('\n')
                      if 'jobid' in line][0]
        self.assertIn(str(job.pk), jobid_line)
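Several examples pull the 'jobid' line out of a log with an indexed list comprehension, which fails with a bare IndexError if the line is missing. A more defensive helper (hypothetical; not part of the test util) could replace that pattern:

    def first_line_containing(text, needle):
        # Return the first line of `text` containing `needle`;
        # fail with a readable message instead of an IndexError.
        for line in text.split('\n'):
            if needle in line:
                return line
        raise ValueError(f'no line containing {needle!r}')

    # usage: jobid_line = first_line_containing(preproc_out_contents, 'jobid')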
Example #3
    def test_static(self):
        '''test normal processing of a pre-defined DAG'''

        NUM_SIDES, NUM_RANKS = 3, 2
        pre = (self.apps['make_sides'].default_preprocess +
               f' {NUM_SIDES} {NUM_RANKS}')
        parent = create_job(name='make_sides', app='make_sides', preproc=pre)

        # Each side length is mapped to a square area in a set of mapping jobs.
        # These 3 "square_jobs" all have the same parent make_sides, but each
        # takes a different input file
        square_jobs = {
            i: create_job(name=f'square{i}',
                          app='square',
                          args=f'side{i}.dat',
                          input_files=f'side{i}.dat')
            for i in range(NUM_SIDES)
        }
        for job in square_jobs.values():
            job.set_parents([parent])

        # The final reduce job depends on all the square jobs: all square.dat
        # files will be staged in and final results staged out to a remote
        # directory
        remote_dir = tempfile.TemporaryDirectory(prefix="remote")
        reduce_job = create_job(name='reduce',
                                app='reduce',
                                input_files="square*.dat*",
                                url_out=f'local:{remote_dir.name}',
                                stage_out_files='summary*.dat reduce.out')
        reduce_job.set_parents(square_jobs.values())

        # Run the entire DAG until finished
        success = util.run_launcher_until_state(reduce_job,
                                                'JOB_FINISHED',
                                                timeout=180.0)
        self.assertTrue(success)
        for job in (parent, *square_jobs.values(), reduce_job):
            job.refresh_from_db()
            self.assertEqual(job.state, 'JOB_FINISHED')

        # Double-check the calculation result, thereby testing the flow of data
        workdir = parent.working_directory
        files = (os.path.join(workdir, f"side{i}.dat")
                 for i in range(NUM_SIDES))
        sides = [float(open(f).read()) for f in files]
        self.assertTrue(all(0.5 <= s <= 5.0 for s in sides))
        expected_result = sum(s**2 for s in sides)

        resultpath = os.path.join(remote_dir.name, 'reduce.out')
        result = open(resultpath).read()
        self.assertIn('Total area:', result)
        result_line = [line for line in result.split('\n')
                       if 'Total area:' in line][0]
        result = float(result_line.split()[-1])
        self.assertAlmostEqual(result, expected_result)
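For concreteness, the reduce check above is just a sum of squared side lengths. A self-contained illustration with made-up values (the real sides are generated randomly in [0.5, 5.0]):

    sides = [2.0, 3.0, 1.5]                     # hypothetical side lengths
    expected_result = sum(s**2 for s in sides)  # 4.0 + 9.0 + 2.25
    assert abs(expected_result - 15.25) < 1e-9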
Example #4
    def test_child_timeout(self):
        '''timeout handling in a dag'''

        # Same DAG triplet as above: one parent with 2 children A & B
        NUM_SIDES, NUM_RANKS = 2, 1
        pre = (self.apps['make_sides'].default_preprocess +
               f' {NUM_SIDES} {NUM_RANKS}')
        parent = create_job(name='make_sides', app='make_sides', preproc=pre)

        chA = create_job(name='square0',
                         app='square',
                         args='side0.dat',
                         input_files='side0.dat')
        chB = create_job(name='square1',
                         app='square',
                         args='side1.dat --sleep 30',
                         input_files='side1.dat',
                         post_timeout_handler=True)
        chA.set_parents([parent])
        chB.set_parents([parent])

        # Run until A finishes, but B will still be hanging
        def check():
            chA.refresh_from_db()
            chB.refresh_from_db()
            return chA.state == 'JOB_FINISHED' and chB.state == 'RUNNING'

        success = util.run_launcher_until(check)
        self.assertTrue(success)

        # Give the launcher time to clean up and mark B as timed out
        def check():
            chB.refresh_from_db()
            return chB.state == 'RUN_TIMEOUT'

        success = util.poll_until_returns_true(check, timeout=12)
        self.assertTrue(success)
        self.assertEqual(chB.state, 'RUN_TIMEOUT')

        # Since B has a timeout handler, the timeout is handled
        # gracefully when we re-run the launcher
        success = util.run_launcher_until_state(chB, 'JOB_FINISHED')
        self.assertTrue(success)

        parent.refresh_from_db()
        chA.refresh_from_db()
        chB.refresh_from_db()
        self.assertEqual(parent.state, 'JOB_FINISHED')
        self.assertEqual(chA.state, 'JOB_FINISHED')
        self.assertEqual(chB.state, 'JOB_FINISHED')

        # The data-flow was correct
        self.triplet_data_check(parent, chA, chB)

        # The post-processor in fact handled the timeout
        self.assertIn('recognized timeout',
                      chB.read_file_in_workdir('postprocess.log'))
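The 'recognized timeout' assertion implies chB's postprocess script can tell it was invoked to handle a timeout rather than a normal exit. A hypothetical sketch of such a handler follows; the environment-variable name and wording are assumptions for illustration, not Balsam's actual postprocessing interface:

    # hypothetical_post.py -- illustrative only
    import os
    import sys

    # Assumption: the launcher exports the state that triggered this
    # postprocess run (Balsam's real mechanism may differ).
    trigger = os.environ.get('BALSAM_TRIGGER_STATE', '')

    if trigger == 'RUN_TIMEOUT':
        print('Invoked to handle RUN_TIMEOUT')
        print('recognized timeout')
        sys.exit(0)  # exit cleanly so the job can be marked finished
    print('hello from hypothetical_post')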
Example #5
    def test_timeout_post_handler(self):
        '''test postprocess handling option for timed-out jobs'''

        remote_dir = tempfile.TemporaryDirectory(prefix="remote")
        remote_path = os.path.join(remote_dir.name, 'side0.dat')
        with open(remote_path, 'w') as fp:
            fp.write('9\n')

        # Same as the previous test, but square.py sleeps so that the job
        # is still running when the launcher shuts down
        job = create_job(name='square_testjob2',
                         app='square',
                         args='side0.dat --sleep 5',
                         url_in=f'local:{remote_dir.name}',
                         stage_out_files='square*',
                         url_out=f'local:{remote_dir.name}',
                         post_timeout_handler=True)

        # Job reaches the RUNNING state and then times out
        success = util.run_launcher_until_state(job, 'RUNNING')
        self.assertTrue(success)

        # On termination, the actively running job is marked RUN_TIMEOUT
        def check():
            job.refresh_from_db()
            return job.state == 'RUN_TIMEOUT'

        success = util.poll_until_returns_true(check, timeout=12)
        self.assertTrue(success)
        self.assertEqual(job.state, 'RUN_TIMEOUT')

        # If we run the launcher again, it will pick up the timed-out job
        success = util.run_launcher_until_state(job, 'JOB_FINISHED')
        self.assertTrue(success)
        self.assertNotIn('RESTART_READY', job.state_history)
        self.assertIn('handled timeout in square_post', job.state_history)

        # The postprocessor handled the timeout; did not restart
        post_contents = job.read_file_in_workdir('postprocess.log')
        self.assertIn('Invoked to handle RUN_TIMEOUT', post_contents)
        self.assertIn('recognized timeout', post_contents)
Example #6
    def test_kill_during_execution_serial(self):
        '''Serial job running in mpi_ensemble is properly terminated'''
        killer_job = create_job(name="killer",
                                app="killer",
                                args="when-running")
        slow_job = create_job(name="slow_job", app="slow", args="30")

        success = util.run_launcher_until_state(killer_job, 'JOB_FINISHED')
        self.assertTrue(success)

        slow_job.refresh_from_db()
        self.assertEqual(slow_job.state, "USER_KILLED")
        stdout = slow_job.read_file_in_workdir('slow_job.out')
        self.assertIn("Sleeping for a long time", stdout)
        self.assertIn("RUNNING", slow_job.state_history)
        self.assertIn("USER_KILLED", slow_job.state_history)
        self.assertNotIn("RUN_DONE", slow_job.state_history)
Example #7
    def test_kill_during_preprocess(self):
        '''Job killed while pre-processing is properly marked'''
        killer_job = create_job(name="killer", app="killer")
        slow_job = create_job(name="slow_job",
                              app="slow",
                              preproc=self.slow_name,
                              args="30")

        success = util.run_launcher_until_state(killer_job, 'JOB_FINISHED')
        self.assertTrue(success)

        slow_job.refresh_from_db()
        self.assertEqual(slow_job.state, "USER_KILLED")
        preproc_out = slow_job.read_file_in_workdir('preprocess.log')
        self.assertIn("Sleeping for a long time", preproc_out)
        self.assertNotIn("RUN_DONE", slow_job.state_history)
        self.assertIn("STAGED_IN", slow_job.state_history)
Example #8
    def test_kill_during_execution_mpi(self):
        '''Parallel MPIRunner job is properly terminated'''
        launcherInfo = util.launcher_info()
        if len(launcherInfo.workerGroup.workers) < 2:
            self.skipTest("Need at least 2 workers to run this test")

        killer_job = create_job(name="killer", app="killer")
        slow_job = create_job(name="slow_job",
                              app="slow",
                              ranks_per_node=2,
                              args="30 parallel")

        success = util.run_launcher_until_state(killer_job, 'JOB_FINISHED')
        self.assertTrue(success)

        slow_job.refresh_from_db()
        self.assertEqual(slow_job.state, "USER_KILLED")
        stdout = slow_job.read_file_in_workdir('slow_job.out')
        self.assertIn("Rank 0 Sleeping for a long time", stdout)
        self.assertIn("Rank 1 Sleeping for a long time", stdout)
        self.assertIn("RUNNING", slow_job.state_history)
        self.assertIn("USER_KILLED", slow_job.state_history)
        self.assertNotIn("RUN_DONE", slow_job.state_history)
Example #9
    def test_error_handled(self):
        '''test postprocessor-handled nonzero return code'''

        remote_dir = tempfile.TemporaryDirectory(prefix="remote")
        remote_path = os.path.join(remote_dir.name, 'side0.dat')
        with open(remote_path, 'w') as fp:
            fp.write('9\n')

        # Same as previous test, but square.py returns nonzero
        job = create_job(name='square_testjob2',
                         app='square',
                         args='side0.dat --retcode 1',
                         url_in=f'local:{remote_dir.name}',
                         stage_out_files='square*',
                         url_out=f'local:{remote_dir.name}',
                         post_error_handler=True)
        self.assertEqual(job.application_args, 'side0.dat --retcode 1')
        self.assertEqual(BalsamJob.objects.all().count(), 1)

        # The job finished successfully despite a nonzero return code
        success = util.run_launcher_until_state(job, 'JOB_FINISHED')
        self.assertTrue(success)

        # Make sure at some point, it was marked with RUN_ERROR
        self.assertIn('RUN_ERROR', job.state_history)

        # It was saved by the postprocessor:
        self.assertIn('handled error in square_post', job.state_history)

        # We can also check the postprocessor stdout:
        post_contents = job.read_file_in_workdir('postprocess.log')
        self.assertIn("recognized error", post_contents)
        self.assertIn("Invoked to handle RUN_ERROR", post_contents)

        # job id sanity check
        jobid_line = [line for line in post_contents.split('\n')
                      if 'jobid' in line][0]
        self.assertIn(str(job.pk), jobid_line)
Example #10
    def test_dynamic(self):
        '''test dynamic generation of child jobs'''

        # The parent will create between 4 and 8 child jobs in the course
        # of its post-processing step:
        NUM_SIDES, NUM_RANKS = random.randint(4, 8), 1
        pre = (self.apps['make_sides'].default_preprocess +
               f' {NUM_SIDES} {NUM_RANKS}')
        post = self.apps['make_sides'].default_postprocess + ' --dynamic-spawn'
        parent = create_job(name='make_sides',
                            app='make_sides',
                            preproc=pre,
                            postproc=post)

        # The final reduce job will depend on all these spawned child jobs, but
        # they do not exist yet!  We will allow these dependencies to be
        # established dynamically; for now the reduce step just depends on the
        # top-level parent of the tree.
        remote_dir = tempfile.TemporaryDirectory(prefix="remote")
        reduce_job = create_job(name='sum_squares',
                                app='reduce',
                                input_files="square*.dat*",
                                url_out=f'local:{remote_dir.name}',
                                stage_out_files='summary?.dat *.out')
        reduce_job.set_parents([parent])

        # Run the entire DAG until finished
        success = util.run_launcher_until_state(reduce_job,
                                                'JOB_FINISHED',
                                                timeout=200.0)
        self.assertTrue(success)
        for job in BalsamJob.objects.all():
            self.assertEqual(job.state, 'JOB_FINISHED')

        # Double-check the calculation result, thereby testing the flow of data
        workdir = parent.working_directory
        files = (os.path.join(workdir, f"side{i}.dat")
                 for i in range(NUM_SIDES))
        sides = [float(open(f).read()) for f in files]
        self.assertTrue(all(0.5 <= s <= 5.0 for s in sides))
        expected_result = sum(s**2 for s in sides)

        resultpath = os.path.join(remote_dir.name, 'sum_squares.out')
        result = open(resultpath).read()
        self.assertIn('Total area:', result)
        result_line = [line for line in result.split('\n')
                       if 'Total area:' in line][0]
        result = float(result_line.split()[-1])
        self.assertAlmostEqual(result, expected_result)

        # Checking the post-processor log, we see that those jobs were actually
        # spawned.
        post_contents = parent.read_file_in_workdir('postprocess.log')
        for i in range(NUM_SIDES):
            self.assertIn(f'spawned square{i} job', post_contents)

        # The spawned jobs' state histories confirm this.
        square_jobs = BalsamJob.objects.filter(name__startswith='square')
        self.assertEqual(square_jobs.count(), NUM_SIDES)
        for job in square_jobs:
            self.assertIn(f'spawned by {parent.cute_id}', job.state_history)

        # Make sure that the correct number of dependencies was created for
        # the reduce job: one for each dynamically spawned job (plus the
        # original parent)
        self.assertEqual(reduce_job.get_parents().count(), NUM_SIDES + 1)
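Judging from the log lines asserted above ('spawned square{i} job'), the parent's postprocessor creates one square job per side file and wires each new child into the reduce job's dependencies. A rough, runnable sketch of that control flow with stubbed helpers (spawn_child and add_dependency are stand-ins, not a verified Balsam API):

    import glob

    def spawn_child(name, app, args, input_files):
        # Stub: a real postprocess would create a BalsamJob here.
        print(f'spawned {name} job')
        return name

    def add_dependency(parent, child):
        # Stub: a real postprocess would link the two jobs here.
        print(f'{child} now waits on {parent}')

    reduce_job = 'sum_squares'  # assumed name of the downstream reduce job
    for i, side_file in enumerate(sorted(glob.glob('side*.dat'))):
        child = spawn_child(name=f'square{i}', app='square',
                            args=side_file, input_files=side_file)
        add_dependency(parent=child, child=reduce_job)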
Example #11
    def test_parent_timeout(self):
        '''timeout handling (with rescue job) in a dag'''

        # Same DAG triplet as above: one parent with 2 children A & B
        NUM_SIDES, NUM_RANKS = 2, 1
        pre = (self.apps['make_sides'].default_preprocess +
               f' {NUM_SIDES} {NUM_RANKS}')
        parent = create_job(name='make_sides',
                            app='make_sides',
                            preproc=pre,
                            args='--sleep 30',
                            post_timeout_handler=True)

        chA = create_job(name='square0',
                         app='square',
                         args='side0.dat',
                         input_files='side0.dat')
        chB = create_job(name='square1',
                         app='square',
                         args='side1.dat',
                         input_files='side1.dat')
        chA.set_parents([parent])
        chB.set_parents([parent])

        # We run the launcher and kill it once parent starts running
        success = util.run_launcher_until_state(parent, 'RUNNING')
        self.assertTrue(success)

        # Parent timed out
        def check():
            parent.refresh_from_db()
            return parent.state == 'RUN_TIMEOUT'

        success = util.poll_until_returns_true(check, timeout=12)
        self.assertTrue(success)

        parent.refresh_from_db()
        chA.refresh_from_db()
        chB.refresh_from_db()
        self.assertEqual(parent.state, 'RUN_TIMEOUT')
        self.assertEqual(chA.state, 'AWAITING_PARENTS')
        self.assertEqual(chB.state, 'AWAITING_PARENTS')

        # On re-run, everything finishes okay
        def check():
            chA.refresh_from_db()
            chB.refresh_from_db()
            return chA.state == 'JOB_FINISHED' and chB.state == 'JOB_FINISHED'

        success = util.run_launcher_until(check, timeout=120)
        self.assertTrue(success)

        parent.refresh_from_db()
        chA.refresh_from_db()
        chB.refresh_from_db()

        self.assertEqual(parent.state, 'JOB_FINISHED')
        self.assertEqual(chA.state, 'JOB_FINISHED')
        self.assertEqual(chB.state, 'JOB_FINISHED')

        # What happened: a rescue job was created by the time-out handler and
        # ran in the second launcher invocation
        jobs = BalsamJob.objects.all()
        self.assertEqual(jobs.count(), 4)

        # This rescue job was made to be the parent of A and B
        rescue_job = chB.get_parents().first()
        self.assertEqual(rescue_job.state, 'JOB_FINISHED')

        # The job state history shows how this happened:
        self.assertIn(f'spawned by {parent.cute_id}', rescue_job.state_history)
        self.assertIn(f'spawned rescue job {rescue_job.cute_id}',
                      parent.state_history)

        # It happened during the post-processing step:
        post_log = parent.read_file_in_workdir('postprocess.log')
        self.assertIn('Creating rescue job', post_log)

        # Data flow correct:
        self.triplet_data_check(rescue_job, chA, chB)
Example #12
    def test_normal(self):
        '''normal processing of a single job'''

        # A mock "remote" data source has a file side0.dat
        # This file contains the side length of a square: 9
        remote_dir = tempfile.TemporaryDirectory(prefix="remote")
        remote_path = os.path.join(remote_dir.name, 'side0.dat')
        with open(remote_path, 'w') as fp:
            fp.write('9\n')

        job = create_job(name='square_testjob',
                         app='square',
                         url_in=f'local:{remote_dir.name}',
                         stage_out_files='square*',
                         url_out=f'local:{remote_dir.name}',
                         args='')

        # Sanity check test case isolation
        self.assertEqual(job.state, 'CREATED')
        self.assertEqual(job.application_args, '')
        self.assertEqual(BalsamJob.objects.all().count(), 1)

        # Run the launcher and make sure that the job gets carried all the way
        # through to completion
        success = util.run_launcher_until_state(job, 'JOB_FINISHED')
        self.assertTrue(success)

        # job staged in this remote side0.dat file; it contains "9"
        staged_in_file_contents = job.read_file_in_workdir('side0.dat')
        self.assertIn('9\n', staged_in_file_contents)

        # Preprocess script actually ran:
        preproc_out_contents = job.read_file_in_workdir('preprocess.log')

        # Preprocess inherited the correct job from the environment:
        jobid_line = [line for line in preproc_out_contents.split('\n')
                      if 'jobid' in line][0]
        self.assertIn(str(job.pk), jobid_line)

        # Preprocess recognized the side0.dat file
        # And it altered the job application_args accordingly:
        self.assertIn('set square.py input to side0.dat', preproc_out_contents)
        self.assertIn('side0.dat', job.application_args)

        # application stdout was written to the job's .out file
        app_stdout = job.read_file_in_workdir('square_testjob.out')
        self.assertIn("Hello from square", app_stdout)

        # the square.py app wrote its result to square.dat
        app_outfile = job.read_file_in_workdir('square.dat')

        # The result of squaring 9 is 81
        result = float(app_outfile)
        self.assertEqual(result, 81.0)

        # the job finished normally, so square_post.py just said hello
        post_contents = job.read_file_in_workdir('postprocess.log')

        jobid_line = [line for line in post_contents.split('\n')
                      if 'jobid' in line][0]
        self.assertIn(str(job.pk), jobid_line)
        self.assertIn('hello from square_post', post_contents)

        # After stage-out, the remote directory contains two new files
        # that matched the pattern square*: square.dat and
        # square_testjob.out
        remote_square = os.path.join(remote_dir.name, 'square.dat')
        remote_stdout = os.path.join(remote_dir.name, 'square_testjob.out')
        self.assertTrue(os.path.exists(remote_square))
        self.assertTrue(os.path.exists(remote_stdout))

        result_remote = float(open(remote_square).read())
        self.assertEqual(result_remote, 81.0)
        self.assertIn("Hello from square", open(remote_stdout).read())