def test_Theta(self):
    '''MPI/OMP C binary for Theta: check thread/rank placement'''
    launcherInfo = util.launcher_info()
    if launcherInfo.host_type != 'CRAY':
        self.skipTest('did not recognize Cray environment')
    if launcherInfo.num_workers < 2:
        self.skipTest('need at least two nodes reserved to run this test')

    binary = glob.glob(os.path.join(self.app_path, 'omp.theta.x'))
    self.app.executable = binary[0]
    self.app.save()

    def check():
        jobs = BalsamJob.objects.all()
        return all(j.state == 'JOB_FINISHED' for j in jobs)

    util.run_launcher_until(check)
    self.job0.refresh_from_db()
    self.job1.refresh_from_db()
    self.job2.refresh_from_db()
    self.assertEqual(self.job0.state, 'JOB_FINISHED')
    self.assertEqual(self.job1.state, 'JOB_FINISHED')
    self.assertEqual(self.job2.state, 'JOB_FINISHED')

    # Check output of dummy MPI/OpenMP C program
    self.check_omp_exe_output(self.job0)
    self.check_omp_exe_output(self.job1)
    self.check_omp_exe_output(self.job2)
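# check_omp_exe_output is defined elsewhere in this suite. The helper below
# is only a minimal sketch of the kind of placement check it could perform;
# it is a hypothetical illustration that assumes the dummy binary writes one
# "rank <r> ... thread <t>" line per OpenMP thread to <job name>.out, and
# that the job exposes num_ranks and threads_per_rank fields.
def _check_omp_exe_output_sketch(self, job):
    output = job.read_file_in_workdir(f'{job.name}.out')
    hello_lines = [l for l in output.split('\n') if 'thread' in l]
    # Expect one line per (MPI rank, OpenMP thread) pair
    self.assertEqual(len(hello_lines), job.num_ranks * job.threads_per_rank)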
def test_serial(self):
    '''Populate DB, run launcher, get timing data from job histories

    Serial: all jobs pack into MPIEnsembles and can run concurrently'''
    done_query = BalsamJob.objects.filter(state='JOB_FINISHED')

    for (num_nodes, rpn, jpn) in self.experiments:
        title = f'{num_nodes}nodes_{rpn}rpn_{jpn}jpn'
        self.create_serial_expt(num_nodes, rpn, jpn)
        num_jobs = num_nodes * jpn

        launcher_start_time = datetime.now()
        success = util.run_launcher_until(
            lambda: done_query.count() == num_jobs,
            timeout=1000, maxrpn=rpn)
        self.assertEqual(done_query.count(), num_jobs)

        time_data = util.process_job_times(time0=launcher_start_time)
        self.assertEqual(len(time_data['PREPROCESSED']), num_jobs)
        self.assertEqual(len(time_data['JOB_FINISHED']), num_jobs)

        cdf_table = util.print_jobtimes_cdf(time_data)
        resultpath = util.benchmark_outfile_path('serial_no_op.dat')
        with open(resultpath, 'w') as fp:
            title = f'# {num_nodes} nodes, {rpn} rpn, {jpn} jpn ({num_jobs} total jobs)'
            comment = 'All jobs pack into MPIEnsembles and can run concurrently'
            fp.write(util.FormatTable.create_header(title, comment))
            fp.write(cdf_table)
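# util.print_jobtimes_cdf is a test utility defined elsewhere. The static
# helper below is a hypothetical sketch of the tabulation its name suggests:
# for each elapsed time t (seconds since launcher start), the fraction of
# jobs that had reached the state by t.
@staticmethod
def _jobtimes_cdf_sketch(elapsed_seconds):
    times = sorted(elapsed_seconds)
    n = len(times)
    # One (time, cumulative fraction) row per job, in increasing time order
    return [(t, (i + 1) / n) for i, t in enumerate(times)]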
def test_parent_error(self):
    '''test dag error handling'''
    # Same DAG triplet as above: one parent with 2 children A & B
    NUM_SIDES, NUM_RANKS = 2, 1
    pre = self.apps['make_sides'].default_preprocess + f' {NUM_SIDES} {NUM_RANKS}'
    parent = create_job(name='make_sides', app='make_sides',
                        preproc=pre, args='--retcode 1',
                        post_error_handler=True)
    chA = create_job(name='square0', app='square', args='side0.dat',
                     input_files='side0.dat')
    chB = create_job(name='square1', app='square', args='side1.dat',
                     input_files='side1.dat')
    chA.set_parents([parent])
    chB.set_parents([parent])

    # Parent will give an error, but it will be handled
    def check():
        parent.refresh_from_db()
        chA.refresh_from_db()
        chB.refresh_from_db()
        jobs = parent, chA, chB
        return all(j.state == 'JOB_FINISHED' for j in jobs)

    # Everything finished successfully
    success = util.run_launcher_until(check, timeout=120)
    self.assertTrue(success)
    parent.refresh_from_db()
    chA.refresh_from_db()
    chB.refresh_from_db()

    # The parent state history shows that an error was handled
    self.assertIn('RUN_ERROR', parent.state_history)
    self.assertIn('handled error; it was okay', parent.state_history)

    # The post-processor handled it
    post_log = parent.read_file_in_workdir('postprocess.log')
    self.assertIn('the job was actually done', post_log)

    # Data flow okay:
    self.triplet_data_check(parent, chA, chB)

    # No rescue jobs had to be created:
    jobs = BalsamJob.objects.all()
    self.assertEqual(jobs.count(), 3)
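# For reference, an error-handling postprocess script might look roughly
# like the commented sketch below. The dag names (dag.ERROR,
# dag.current_job, update_state) follow the legacy Balsam dag API but are
# assumptions here; this is not the suite's actual handler.
#
#   import balsam.launcher.dag as dag
#   if dag.ERROR:
#       # the app exited nonzero even though its output is complete
#       print("the job was actually done")
#       dag.current_job.update_state('POSTPROCESSED',
#                                    'handled error; it was okay')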
def test_child_timeout(self):
    '''timeout handling in a dag'''
    # Same DAG triplet as above: one parent with 2 children A & B
    NUM_SIDES, NUM_RANKS = 2, 1
    pre = self.apps['make_sides'].default_preprocess + f' {NUM_SIDES} {NUM_RANKS}'
    parent = create_job(name='make_sides', app='make_sides', preproc=pre)
    chA = create_job(name='square0', app='square', args='side0.dat',
                     input_files='side0.dat')
    chB = create_job(name='square1', app='square',
                     args='side1.dat --sleep 30',
                     input_files='side1.dat', post_timeout_handler=True)
    chA.set_parents([parent])
    chB.set_parents([parent])

    # Run until A finishes, but B will still be hanging
    def check():
        chA.refresh_from_db()
        chB.refresh_from_db()
        return chA.state == 'JOB_FINISHED' and chB.state == 'RUNNING'

    success = util.run_launcher_until(check)
    self.assertTrue(success)

    # Give the launcher time to clean up and mark B as timed out
    def check():
        chB.refresh_from_db()
        return chB.state == 'RUN_TIMEOUT'

    success = util.poll_until_returns_true(check, timeout=12)
    self.assertEqual(chB.state, 'RUN_TIMEOUT')

    # Since B has a timeout handler, the timeout is handled gracefully
    # when we re-run the launcher
    success = util.run_launcher_until_state(chB, 'JOB_FINISHED')
    parent.refresh_from_db()
    chA.refresh_from_db()
    self.assertEqual(parent.state, 'JOB_FINISHED')
    self.assertEqual(chA.state, 'JOB_FINISHED')
    self.assertEqual(chB.state, 'JOB_FINISHED')

    # The data-flow was correct
    self.triplet_data_check(parent, chA, chB)

    # The post-processor in fact handled the timeout
    self.assertIn('recognized timeout',
                  chB.read_file_in_workdir('postprocess.log'))
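# Analogous hypothetical sketch for the timeout branch of a postprocess
# handler (same assumed legacy dag API as in the sketch above):
#
#   import balsam.launcher.dag as dag
#   if dag.TIMEOUT:
#       print("recognized timeout")
#       dag.current_job.update_state('POSTPROCESSED', 'handled timeout')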
def test_child_error(self):
    '''error handling in a dag'''
    # Same DAG triplet as above: one parent with 2 children A & B
    NUM_SIDES, NUM_RANKS = 2, 1
    pre = self.apps['make_sides'].default_preprocess + f' {NUM_SIDES} {NUM_RANKS}'
    parent = create_job(name='make_sides', app='make_sides', preproc=pre)
    chA = create_job(name='square0', app='square', args='side0.dat',
                     input_files='side0.dat')
    chB = create_job(name='square1', app='square',
                     args='side1.dat --retcode 1',
                     input_files='side1.dat', post_error_handler=True)
    chA.set_parents([parent])
    chB.set_parents([parent])

    # Child B will give a RUN_ERROR, but it will be handled
    def check():
        return all(j.state == 'JOB_FINISHED'
                   for j in BalsamJob.objects.all())

    success = util.run_launcher_until(check, timeout=120)
    self.assertTrue(success)
    parent.refresh_from_db()
    chA.refresh_from_db()
    chB.refresh_from_db()
    self.assertEqual(parent.state, 'JOB_FINISHED')
    self.assertEqual(chA.state, 'JOB_FINISHED')
    self.assertEqual(chB.state, 'JOB_FINISHED')

    # Data flow was correct
    self.triplet_data_check(parent, chA, chB)

    # The post-processor handled the nonzero return code in B
    self.assertIn('recognized error',
                  chB.read_file_in_workdir('postprocess.log'))
def test_many_write(self):
    '''Many ranks can simultaneously add a job to the DB'''
    job = create_job(name="mpi_insert", app='mpi4py-insert',
                     num_nodes=self.num_nodes, ranks_per_node=16)
    num_ranks = job.num_ranks

    def check():
        jobs = BalsamJob.objects.filter(state='JOB_FINISHED')
        return jobs.count() == num_ranks + 1

    success = util.run_launcher_until(check, timeout=200)
    job.refresh_from_db()
    self.assertEqual(job.state, 'JOB_FINISHED')

    jobs = BalsamJob.objects.all()
    self.assertListEqual(['JOB_FINISHED'] * len(jobs),
                         [j.state for j in jobs])

    created_jobs = BalsamJob.objects.filter(name__icontains='hello')
    self.assertEqual(created_jobs.count(), num_ranks)
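# The 'mpi4py-insert' app is defined elsewhere. The commented sketch below
# is a hypothetical illustration of what such an app might do: every rank
# concurrently inserts one job into the DB, stressing concurrent writes.
# dag.add_job follows the legacy Balsam dag API, and the 'hello' naming
# (matching the name__icontains filter above) is an assumption here.
#
#   from mpi4py import MPI
#   import balsam.launcher.dag as dag
#   rank = MPI.COMM_WORLD.Get_rank()
#   dag.add_job(name=f'hello{rank}', application='hello')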
def test_parent_timeout(self):
    '''timeout handling (with rescue job) in a dag'''
    # Same DAG triplet as above: one parent with 2 children A & B
    NUM_SIDES, NUM_RANKS = 2, 1
    pre = self.apps['make_sides'].default_preprocess + f' {NUM_SIDES} {NUM_RANKS}'
    parent = create_job(name='make_sides', app='make_sides', preproc=pre,
                        args='--sleep 30', post_timeout_handler=True)
    chA = create_job(name='square0', app='square', args='side0.dat',
                     input_files='side0.dat')
    chB = create_job(name='square1', app='square', args='side1.dat',
                     input_files='side1.dat')
    chA.set_parents([parent])
    chB.set_parents([parent])

    # We run the launcher and kill it once parent starts running
    success = util.run_launcher_until_state(parent, 'RUNNING')
    self.assertTrue(success)

    # Parent timed out
    def check():
        parent.refresh_from_db()
        return parent.state == 'RUN_TIMEOUT'

    success = util.poll_until_returns_true(check, timeout=12)
    self.assertTrue(success)

    parent.refresh_from_db()
    chA.refresh_from_db()
    chB.refresh_from_db()
    self.assertEqual(parent.state, 'RUN_TIMEOUT')
    self.assertEqual(chA.state, 'AWAITING_PARENTS')
    self.assertEqual(chB.state, 'AWAITING_PARENTS')

    # On re-run, everything finishes okay
    def check():
        chA.refresh_from_db()
        chB.refresh_from_db()
        return chA.state == 'JOB_FINISHED' and chB.state == 'JOB_FINISHED'

    success = util.run_launcher_until(check, timeout=120)
    parent.refresh_from_db()
    chA.refresh_from_db()
    chB.refresh_from_db()
    self.assertEqual(parent.state, 'JOB_FINISHED')
    self.assertEqual(chA.state, 'JOB_FINISHED')
    self.assertEqual(chB.state, 'JOB_FINISHED')

    # What happened: a rescue job was created by the time-out handler and
    # ran in the second launcher invocation
    jobs = BalsamJob.objects.all()
    self.assertEqual(jobs.count(), 4)

    # This rescue job was made to be the parent of A and B
    rescue_job = chB.get_parents().first()
    self.assertEqual(rescue_job.state, 'JOB_FINISHED')

    # The job state history shows how this happened:
    self.assertIn(f'spawned by {parent.cute_id}', rescue_job.state_history)
    self.assertIn(f'spawned rescue job {rescue_job.cute_id}',
                  parent.state_history)

    # It happened during the post-processing step:
    post_log = parent.read_file_in_workdir('postprocess.log')
    self.assertIn('Creating rescue job', post_log)

    # Data flow correct:
    self.triplet_data_check(rescue_job, chA, chB)
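# A hypothetical sketch of the rescue mechanism inside the timeout handler:
# clone the timed-out job and hand its children over to the clone.
# dag.spawn_child(clone=True) and dag.children follow the legacy Balsam dag
# API; the surrounding logic is an assumption, not this suite's handler.
#
#   import balsam.launcher.dag as dag
#   if dag.TIMEOUT:
#       print("Creating rescue job")
#       rescue = dag.spawn_child(clone=True)
#       for child in dag.children:
#           child.set_parents([rescue])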
def test_dag_error_timeout_mixture(self):
    '''test error/timeout handling mechanisms on 81 jobs (takes a couple min)'''
    # We will run 3*3*3 triples of jobs (81 total)
    # Each triple is a tree with 1 parent and 2 children
    # Try every possible permutation of normal/timeout/fail
    # Timeout jobs sleep for a couple seconds and have a higher chance of
    # being interrupted and timed-out; this is not guaranteed though
    from itertools import product
    states = 'normal timeout fail'.split()
    triplets = product(states, repeat=3)

    # Parent job templates
    parent_types = {
        'normal': create_job(name='make_sides', app='make_sides',
                             args='', post_error_handler=True,
                             post_timeout_handler=True, wtime=0),
        'timeout': create_job(name='make_sides', app='make_sides',
                              args='--sleep 2', post_error_handler=True,
                              post_timeout_handler=True, wtime=0),
        'fail': create_job(name='make_sides', app='make_sides',
                           args='--retcode 1', post_error_handler=True,
                           post_timeout_handler=True, wtime=0),
    }

    # Child job templates
    child_types = {
        'normal': create_job(name='square', app='square', args='',
                             post_error_handler=True,
                             post_timeout_handler=True, wtime=0),
        'timeout': create_job(name='square', app='square',
                              args='--sleep 2', post_error_handler=True,
                              post_timeout_handler=True, wtime=0),
        'fail': create_job(name='square', app='square',
                           args='--retcode 1', post_error_handler=True,
                           post_timeout_handler=True, wtime=0),
    }

    # Create all 81 jobs
    job_triplets = {}
    for triplet in triplets:
        parent, childA, childB = triplet

        # Load the templates and clone them: clearing pk and saving
        # inserts a new row
        jobP = BalsamJob.objects.get(pk=parent_types[parent].pk)
        jobA = BalsamJob.objects.get(pk=child_types[childA].pk)
        jobB = BalsamJob.objects.get(pk=child_types[childB].pk)
        jobP.pk, jobA.pk, jobB.pk = None, None, None
        for job in (jobP, jobA, jobB):
            # no working path in this branch (computed on the fly)
            job.save()

        # Parent has two children (sides); either 1 rank (serial) or
        # 2 ranks (mpi)
        NUM_SIDES, NUM_RANKS = 2, random.randint(1, 2)
        pre = self.apps['make_sides'].default_preprocess + f' {NUM_SIDES} {NUM_RANKS}'
        jobP.preprocess = pre
        jobP.save()

        jobA.application_args += " side0.dat"
        jobA.input_files += "side0.dat"
        jobA.save()
        jobA.set_parents([jobP])

        jobB.application_args += " side1.dat"
        jobB.input_files += "side1.dat"
        jobB.save()
        jobB.set_parents([jobP])

        job_triplets[triplet] = (jobP, jobA, jobB)

    # Remove jobs that were only used as templates
    for j in parent_types.values():
        j.delete()
    for j in child_types.values():
        j.delete()
    del parent_types, child_types
    self.assertEqual(BalsamJob.objects.all().count(), 81)

    # Run the entire DAG until finished, with two interruptions
    def check(N_run, N_finish):
        running = BalsamJob.objects.filter(state='RUNNING')
        finished = BalsamJob.objects.filter(state='JOB_FINISHED')
        return running.count() >= N_run and finished.count() >= N_finish

    util.run_launcher_until(check, args=(1, 1))  # interrupt at least 1
    util.run_launcher_until(check, args=(2, 5))  # interrupt at least 2

    # Get rid of the sleep now to speed up finish
    slow_jobs = BalsamJob.objects.filter(application_args__contains="sleep")
    for job in slow_jobs:
        job.application_args = '--sleep 0'
        job.save()

    # Just check that all jobs reach JOB_FINISHED state
    def check():
        return all(j.state == 'JOB_FINISHED'
                   for j in BalsamJob.objects.all())

    success = util.run_launcher_until(check, timeout=360.0)
    self.assertTrue(success)

    # No race conditions in working directory naming: each job must have
    # a unique working directory
    workdirs = [job.working_directory for job in BalsamJob.objects.all()]
    self.assertEqual(len(workdirs), len(set(workdirs)))
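# The template cloning above relies on a standard Django idiom: fetching a
# row, clearing its pk, and calling save() performs an INSERT rather than
# an UPDATE. Minimal commented sketch (some_template_pk is a placeholder):
#
#   template = BalsamJob.objects.get(pk=some_template_pk)
#   template.pk = None
#   template.save()   # INSERT, not UPDATE: a brand-new job with a fresh pk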