Example #1
    def test_Theta(self):
        '''MPI/OMP C binary for Theta: check thread/rank placement'''
        launcherInfo = util.launcher_info()

        if launcherInfo.host_type != 'CRAY':
            self.skipTest('did not recognize Cray environment')
        if launcherInfo.num_workers < 2:
            self.skipTest('need at least two nodes reserved to run this test')

        binary = glob.glob(os.path.join(self.app_path, 'omp.theta.x'))
        self.app.executable = binary[0]
        self.app.save()

        def check():
            jobs = BalsamJob.objects.all()
            return all(j.state == 'JOB_FINISHED' for j in jobs)

        success = util.run_launcher_until(check)
        self.assertTrue(success)
        self.job0.refresh_from_db()
        self.job1.refresh_from_db()
        self.job2.refresh_from_db()

        self.assertEqual(self.job0.state, 'JOB_FINISHED')
        self.assertEqual(self.job1.state, 'JOB_FINISHED')
        self.assertEqual(self.job2.state, 'JOB_FINISHED')

        # Check output of dummy MPI/OpenMP C program
        self.check_omp_exe_output(self.job0)
        self.check_omp_exe_output(self.job1)
        self.check_omp_exe_output(self.job2)
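
These tests lean on util.run_launcher_until, which runs the launcher while repeatedly evaluating a predicate, then tears the launcher down once the predicate passes or a timeout expires. A minimal sketch of the polling core, assuming launcher start/stop is handled elsewhere (the function name and defaults here are illustrative, not the real util API):

import time

def poll_until_true(predicate, timeout=60.0, interval=1.0):
    '''Evaluate predicate() every `interval` seconds until it returns
    True or `timeout` seconds elapse; report success as a boolean.'''
    deadline = time.time() + timeout
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(interval)
    return False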
Example #2
    def test_serial(self):
        '''Populate DB, run launcher, get timing data from job histories
        Serial: all jobs pack into MPIEnsembles and can run concurrently'''
        done_query = BalsamJob.objects.filter(state='JOB_FINISHED')

        for (num_nodes, rpn, jpn) in self.experiments:
            title = f'{num_nodes}nodes_{rpn}rpn_{jpn}jpn'
            self.create_serial_expt(num_nodes, rpn, jpn)
            num_jobs = num_nodes * jpn

            launcher_start_time = datetime.now()
            success = util.run_launcher_until(
                lambda: done_query.count() == num_jobs,
                timeout=1000,
                maxrpn=rpn)
            self.assertEqual(done_query.count(), num_jobs)

            time_data = util.process_job_times(time0=launcher_start_time)
            self.assertEqual(len(time_data['PREPROCESSED']), num_jobs)
            self.assertEqual(len(time_data['JOB_FINISHED']), num_jobs)

            cdf_table = util.print_jobtimes_cdf(time_data)
            # Use the experiment title in the file name; a fixed name would be
            # overwritten on every pass through the loop
            resultpath = util.benchmark_outfile_path(f'serial_no_op_{title}.dat')

            with open(resultpath, 'w') as fp:
                header = f'# {num_nodes} nodes, {rpn} rpn, {jpn} jpn ({num_jobs} total jobs)'
                comment = 'All jobs pack into MPIEnsembles and can run concurrently'
                fp.write(util.FormatTable.create_header(header, comment))
                fp.write(cdf_table)
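
The timing helpers aggregate per-state timestamps and render them as a cumulative-distribution table: for each state, what fraction of jobs had reached it by time t. A sketch of that computation, assuming time_data maps a state name to a list of elapsed seconds since launcher start (the function name is hypothetical; the real formatting lives in util.print_jobtimes_cdf):

def jobtimes_cdf(elapsed_seconds):
    '''Return (elapsed_time, cumulative_fraction) rows: the empirical
    CDF of job arrival times at one state.'''
    times = sorted(elapsed_seconds)
    n = len(times)
    return [(t, (i + 1) / n) for i, t in enumerate(times)]

# e.g. jobtimes_cdf(time_data['JOB_FINISHED']) gives the rows behind
# one column of the .dat table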
Example #3
    def test_parent_error(self):
        '''test dag error handling'''

        # Same DAG triplet as above: one parent with 2 children A & B
        NUM_SIDES, NUM_RANKS = 2, 1
        pre = self.apps[
            'make_sides'].default_preprocess + f' {NUM_SIDES} {NUM_RANKS}'
        parent = create_job(name='make_sides',
                            app='make_sides',
                            preproc=pre,
                            args='--retcode 1',
                            post_error_handler=True)

        chA = create_job(name='square0',
                         app='square',
                         args='side0.dat',
                         input_files='side0.dat')
        chB = create_job(name='square1',
                         app='square',
                         args='side1.dat',
                         input_files='side1.dat')
        chA.set_parents([parent])
        chB.set_parents([parent])

        # Parent will give an error, but it will be handled
        def check():
            parent.refresh_from_db()
            chA.refresh_from_db()
            chB.refresh_from_db()
            jobs = parent, chA, chB
            return all(j.state == 'JOB_FINISHED' for j in jobs)

        # Everything should finish successfully
        success = util.run_launcher_until(check, timeout=120)
        self.assertTrue(success)

        parent.refresh_from_db()
        chA.refresh_from_db()
        chB.refresh_from_db()

        # The parent state history shows that an error was handled
        self.assertIn('RUN_ERROR', parent.state_history)
        self.assertIn('handled error; it was okay', parent.state_history)

        # The post-processor handled it
        post_log = parent.read_file_in_workdir('postprocess.log')
        self.assertIn('the job was actually done', post_log)

        # Data flow okay:
        self.triplet_data_check(parent, chA, chB)

        # no rescue jobs had to be created:
        jobs = BalsamJob.objects.all()
        self.assertEqual(jobs.count(), 3)
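
The post_error_handler=True flag asks Balsam to run the application's postprocess script even when the job lands in RUN_ERROR, so the script can decide whether the nonzero return code actually matters. A hedged sketch of such a handler, assuming the legacy balsam.launcher.dag module exposes the active job as current_job and that update_state records its message in state_history (both assumptions; the log strings mirror the assertions above):

# postprocess.py -- sketch of an error handler (API assumed)
import balsam.launcher.dag as dag

job = dag.current_job
if job.state == 'RUN_ERROR':
    # Inspect output and conclude the nonzero return code was benign
    print('the job was actually done')  # captured in postprocess.log
    job.update_state('POSTPROCESSED', 'handled error; it was okay')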
Example #4
    def test_child_timeout(self):
        '''timeout handling in a dag'''

        # Same DAG triplet as above: one parent with 2 children A & B
        NUM_SIDES, NUM_RANKS = 2, 1
        pre = self.apps[
            'make_sides'].default_preprocess + f' {NUM_SIDES} {NUM_RANKS}'
        parent = create_job(name='make_sides', app='make_sides', preproc=pre)

        chA = create_job(name='square0',
                         app='square',
                         args='side0.dat',
                         input_files='side0.dat')
        chB = create_job(name='square1',
                         app='square',
                         args='side1.dat --sleep 30',
                         input_files='side1.dat',
                         post_timeout_handler=True)
        chA.set_parents([parent])
        chB.set_parents([parent])

        # Run until A finishes; B (which sleeps for 30 s) should still be running
        def check():
            chA.refresh_from_db()
            chB.refresh_from_db()
            return chA.state == 'JOB_FINISHED' and chB.state == 'RUNNING'

        success = util.run_launcher_until(check)
        self.assertTrue(success)

        # Give the launcher time to clean up and mark B as timed out
        def check():
            chB.refresh_from_db()
            return chB.state == 'RUN_TIMEOUT'

        success = util.poll_until_returns_true(check, timeout=12)
        self.assertTrue(success)
        self.assertEqual(chB.state, 'RUN_TIMEOUT')

        # Since B has a timeout handler, the timeout is handled gracefully
        # when we re-run the launcher
        success = util.run_launcher_until_state(chB, 'JOB_FINISHED')
        self.assertTrue(success)

        parent.refresh_from_db()
        chA.refresh_from_db()
        self.assertEqual(parent.state, 'JOB_FINISHED')
        self.assertEqual(chA.state, 'JOB_FINISHED')
        self.assertEqual(chB.state, 'JOB_FINISHED')

        # The data-flow was correct
        self.triplet_data_check(parent, chA, chB)

        # The post-processor in fact handled the timeout
        self.assertIn('recognized timeout',
                      chB.read_file_in_workdir('postprocess.log'))
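
Timeout handling is symmetric: with post_timeout_handler=True, the next launcher invocation hands the RUN_TIMEOUT job to its postprocess script instead of simply restarting it. A sketch in the same assumed style as the error-handler sketch above (API and state names are assumptions; the log string matches the assertion in this test):

# postprocess.py -- sketch of a timeout handler (API assumed)
import balsam.launcher.dag as dag

job = dag.current_job
if job.state == 'RUN_TIMEOUT':
    # The partial output turns out to be usable; finish the job here
    print('recognized timeout')  # captured in postprocess.log
    job.update_state('POSTPROCESSED', 'handled timeout in post')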
Example #5
    def test_child_error(self):
        '''error handling in a dag'''

        # Same DAG triplet as above: one parent with 2 children A & B
        NUM_SIDES, NUM_RANKS = 2, 1
        pre = self.apps[
            'make_sides'].default_preprocess + f' {NUM_SIDES} {NUM_RANKS}'
        parent = create_job(name='make_sides', app='make_sides', preproc=pre)

        chA = create_job(name='square0',
                         app='square',
                         args='side0.dat',
                         input_files='side0.dat')
        chB = create_job(name='square1',
                         app='square',
                         args='side1.dat --retcode 1',
                         input_files='side1.dat',
                         post_error_handler=True)
        chA.set_parents([parent])
        chB.set_parents([parent])

        # Child B will give a RUN_ERROR, but it will be handled
        def check():
            return all(j.state == 'JOB_FINISHED'
                       for j in BalsamJob.objects.all())

        success = util.run_launcher_until(check, timeout=120)
        self.assertTrue(success)

        parent.refresh_from_db()
        chA.refresh_from_db()
        chB.refresh_from_db()

        self.assertEqual(parent.state, 'JOB_FINISHED')
        self.assertEqual(chA.state, 'JOB_FINISHED')
        self.assertEqual(chB.state, 'JOB_FINISHED')

        # Data flow was correct
        self.triplet_data_check(parent, chA, chB)

        # The post-processor handled the nonzero return code in B
        self.assertIn('recognized error',
                      chB.read_file_in_workdir('postprocess.log'))
Example #6
    def test_many_write(self):
        '''Many ranks can simultaneously add a job to the DB'''
        job = create_job(name="mpi_insert",
                         app='mpi4py-insert',
                         num_nodes=self.num_nodes,
                         ranks_per_node=16)
        num_ranks = job.num_ranks

        def check():
            jobs = BalsamJob.objects.filter(state='JOB_FINISHED')
            return jobs.count() == num_ranks + 1

        success = util.run_launcher_until(check, timeout=200)
        self.assertTrue(success)
        job.refresh_from_db()
        self.assertEqual(job.state, 'JOB_FINISHED')
        jobs = BalsamJob.objects.all()
        self.assertListEqual(['JOB_FINISHED'] * len(jobs),
                             [j.state for j in jobs])

        created_jobs = BalsamJob.objects.filter(name__icontains='hello')
        self.assertEqual(created_jobs.count(), num_ranks)
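
The mpi4py-insert app evidently has every MPI rank insert one new job into the database, which is why the test expects num_ranks + 1 finished jobs (the inserter itself plus its insertions) and num_ranks jobs with 'hello' in the name. A minimal sketch of such an app, assuming dag.add_job accepts the usual BalsamJob fields (the application name here is hypothetical):

# mpi4py_insert.py -- sketch (dag.add_job signature assumed)
from mpi4py import MPI
import balsam.launcher.dag as dag

rank = MPI.COMM_WORLD.Get_rank()
# All ranks hit the database at once; this concurrent write access
# is exactly what the test exercises
dag.add_job(name=f'hello{rank}', application='hello',
            num_nodes=1, ranks_per_node=1)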
Example #7
    def test_parent_timeout(self):
        '''timeout handling (with rescue job) in a dag'''

        # Same DAG triplet as above: one parent with 2 children A & B
        NUM_SIDES, NUM_RANKS = 2, 1
        pre = self.apps[
            'make_sides'].default_preprocess + f' {NUM_SIDES} {NUM_RANKS}'
        parent = create_job(name='make_sides',
                            app='make_sides',
                            preproc=pre,
                            args='--sleep 30',
                            post_timeout_handler=True)

        chA = create_job(name='square0',
                         app='square',
                         args='side0.dat',
                         input_files='side0.dat')
        chB = create_job(name='square1',
                         app='square',
                         args='side1.dat',
                         input_files='side1.dat')
        chA.set_parents([parent])
        chB.set_parents([parent])

        # We run the launcher and kill it once parent starts running
        success = util.run_launcher_until_state(parent, 'RUNNING')
        self.assertTrue(success)

        # Parent timed out
        def check():
            parent.refresh_from_db()
            return parent.state == 'RUN_TIMEOUT'

        success = util.poll_until_returns_true(check, timeout=12)
        self.assertTrue(success)

        parent.refresh_from_db()
        chA.refresh_from_db()
        chB.refresh_from_db()
        self.assertEqual(parent.state, 'RUN_TIMEOUT')
        self.assertEqual(chA.state, 'AWAITING_PARENTS')
        self.assertEqual(chB.state, 'AWAITING_PARENTS')

        # On re-run, everything finishes okay
        def check():
            chA.refresh_from_db()
            chB.refresh_from_db()
            return chA.state == 'JOB_FINISHED' and chB.state == 'JOB_FINISHED'

        success = util.run_launcher_until(check, timeout=120)
        self.assertTrue(success)

        parent.refresh_from_db()
        chA.refresh_from_db()
        chB.refresh_from_db()

        self.assertEqual(parent.state, 'JOB_FINISHED')
        self.assertEqual(chA.state, 'JOB_FINISHED')
        self.assertEqual(chB.state, 'JOB_FINISHED')

        # What happened: a rescue job was created by the time-out handler and
        # ran in the second launcher invocation
        jobs = BalsamJob.objects.all()
        self.assertEqual(jobs.count(), 4)

        # This rescue job was made to be the parent of A and B
        rescue_job = chB.get_parents().first()
        self.assertEqual(rescue_job.state, 'JOB_FINISHED')

        # The job state history shows how this happened:
        self.assertIn(f'spawned by {parent.cute_id}', rescue_job.state_history)
        self.assertIn(f'spawned rescue job {rescue_job.cute_id}',
                      parent.state_history)

        # It happened during the post-processing step:
        post_log = parent.read_file_in_workdir('postprocess.log')
        self.assertIn('Creating rescue job', post_log)

        # Data flow correct:
        self.triplet_data_check(rescue_job, chA, chB)
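
The rescue mechanism boils down to the timeout handler cloning the timed-out job and letting the children wait on the clone instead. A heavily hedged sketch, assuming a spawn_child(clone=True) helper like the one in the legacy dag API and that Balsam re-points the original's children at the clone (both inferred from the assertions above, not verified):

# postprocess.py -- sketch of a rescue-spawning handler (API assumed)
import balsam.launcher.dag as dag

job = dag.current_job
if job.state == 'RUN_TIMEOUT':
    print('Creating rescue job')  # captured in postprocess.log
    rescue = dag.spawn_child(clone=True)  # copy of this job
    job.update_state('JOB_FINISHED',
                     f'spawned rescue job {rescue.cute_id}')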
Example #8
    def test_dag_error_timeout_mixture(self):
        '''test error/timeout handling mechanisms on 81 jobs (takes a couple min)'''

        # We will run 27 triplets of jobs (3*3*3 state combinations; 81 jobs
        # in total). Each triplet is a tree with one parent and two children.
        # Try every possible combination of normal/timeout/fail.
        # Timeout jobs sleep for a couple of seconds, which raises their
        # chance of being interrupted and timed out, though this is not
        # guaranteed
        from itertools import product
        states = 'normal timeout fail'.split()
        triplets = product(states, repeat=3)

        # Parent job template
        parent_types = {
            'normal':
            create_job(name='make_sides',
                       app='make_sides',
                       args='',
                       post_error_handler=True,
                       post_timeout_handler=True,
                       wtime=0),
            'timeout':
            create_job(name='make_sides',
                       app='make_sides',
                       args='--sleep 2',
                       post_error_handler=True,
                       post_timeout_handler=True,
                       wtime=0),
            'fail':
            create_job(name='make_sides',
                       app='make_sides',
                       args='--retcode 1',
                       post_error_handler=True,
                       post_timeout_handler=True,
                       wtime=0),
        }

        # Child job template
        child_types = {
            'normal':
            create_job(name='square',
                       app='square',
                       args='',
                       post_error_handler=True,
                       post_timeout_handler=True,
                       wtime=0),
            'timeout':
            create_job(name='square',
                       app='square',
                       args='--sleep 2',
                       post_error_handler=True,
                       post_timeout_handler=True,
                       wtime=0),
            'fail':
            create_job(name='square',
                       app='square',
                       args='--retcode 1',
                       post_error_handler=True,
                       post_timeout_handler=True,
                       wtime=0),
        }

        # Create all 81 jobs
        job_triplets = {}
        for triplet in triplets:
            parent, childA, childB = triplet

            # Load the templates and clone them by clearing the primary keys
            # (see the pk-reset sketch after this example)
            jobP = BalsamJob.objects.get(pk=parent_types[parent].pk)
            jobA = BalsamJob.objects.get(pk=child_types[childA].pk)
            jobB = BalsamJob.objects.get(pk=child_types[childB].pk)
            jobP.pk, jobA.pk, jobB.pk = None, None, None
            for job in (jobP, jobA, jobB):
                #job.working_directory = ''  # no working path in this branch (computed on the fly)
                job.save()

            # Parent makes two sides (one per child); it runs on either
            # 1 rank (serial) or 2 ranks (MPI)
            NUM_SIDES, NUM_RANKS = 2, random.randint(1, 2)
            pre = self.apps[
                'make_sides'].default_preprocess + f' {NUM_SIDES} {NUM_RANKS}'
            jobP.preprocess = pre
            jobP.save()

            jobA.application_args += "  side0.dat"
            jobA.input_files += "side0.dat"
            jobA.save()
            jobA.set_parents([jobP])

            jobB.application_args += "  side1.dat"
            jobB.input_files += "side1.dat"
            jobB.save()
            jobB.set_parents([jobP])

            job_triplets[triplet] = (jobP, jobA, jobB)

        # Remove jobs that were only used as templates
        for j in parent_types.values():
            j.delete()
        for j in child_types.values():
            j.delete()
        del parent_types, child_types
        self.assertEqual(BalsamJob.objects.all().count(), 81)

        # for job in BalsamJob.objects.all():
        #     self.assertEqual(job.working_directory, '')

        # Run the entire DAG until finished, with two interruptions

        def check(N_run, N_finish):
            running = BalsamJob.objects.filter(state='RUNNING')
            finished = BalsamJob.objects.filter(state='JOB_FINISHED')
            return running.count() >= N_run and finished.count() >= N_finish

        util.run_launcher_until(check, args=(1, 1))  # interrupt with >= 1 job running
        util.run_launcher_until(check, args=(2, 5))  # interrupt again with >= 2 running

        # Get rid of the sleep now to speed up the finish, preserving the
        # rest of each job's argument string (e.g. the side*.dat file names)
        slow_jobs = BalsamJob.objects.filter(
            application_args__contains="sleep")
        for job in slow_jobs:
            job.application_args = job.application_args.replace(
                '--sleep 2', '--sleep 0')
            job.save()

        def check():
            return all(j.state == 'JOB_FINISHED'
                       for j in BalsamJob.objects.all())

        # Just check that all jobs reach JOB_FINISHED state
        success = util.run_launcher_until(check, timeout=360.0)
        self.assertTrue(success)

        # No race conditions in working directory naming: each job must have a
        # unique working directory
        workdirs = [job.working_directory for job in BalsamJob.objects.all()]
        self.assertEqual(len(workdirs), len(set(workdirs)))
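
The template trick near the top of this test (re-fetch a row by pk, set pk to None, save) is the standard Django idiom for cloning a model instance: clearing the primary key makes the next save() an INSERT of a brand-new row. A small sketch of the idiom in isolation (the import path is an assumption; these tests receive BalsamJob from their own module scope):

from balsam.service.models import BalsamJob  # import path assumed

def clone_job(pk):
    '''Return a new, independent copy of the BalsamJob with this pk.'''
    job = BalsamJob.objects.get(pk=pk)  # fresh instance from the DB
    job.pk = None                       # forget the existing row's identity
    job.save()                          # Django INSERTs a new row
    return job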