def newdep(args):
    from balsam import setup
    setup()
    # from balsam.core import models
    from balsam.launcher import dag
    parent = match_uniq_job(args.parent)
    child = match_uniq_job(args.child)
    dag.add_dependency(parent, child)
    print(f"Created link {parent.cute_id} --> {child.cute_id}")
def mkchild(args):
    from balsam import setup
    setup()
    from balsam.launcher import dag
    if not dag.current_job:
        raise RuntimeError("mkchild requires that BALSAM_JOB_ID is in the environment")
    child_job = newjob(args)
    dag.add_dependency(dag.current_job, child_job)
    print(f"Created link {dag.current_job.cute_id} --> {child_job.cute_id}")
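# Illustrative only: a minimal argparse wiring that could drive newdep()
# above. The subcommand and flag names here are assumptions, not taken
# from the source; match_uniq_job() and newjob() are helpers defined
# elsewhere in the original code.
import argparse

def make_parser():
    parser = argparse.ArgumentParser(prog="balsam")
    subparsers = parser.add_subparsers()

    dep = subparsers.add_parser("dep", help="add a parent --> child link")
    dep.add_argument("parent", help="string uniquely matching the parent job")
    dep.add_argument("child", help="string uniquely matching the child job")
    dep.set_defaults(func=newdep)
    return parser

# Usage sketch:
#   args = make_parser().parse_args()
#   args.func(args)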
description="joining final outputfiles", num_nodes=1, ranks_per_node=1, node_packing_count=node_pack_count, args=join_args, wall_time_minutes=10, application="join_art_rootfiles") for ievent in range(nevents): beamon_args = f"{_file} {ievent}" beamon_job = dag.add_job(name=f"beamon_{i}_{ievent}", workflow=workflow_main, description="uboone full beam on chain", num_nodes=1, ranks_per_node=1, node_packing_count=node_pack_count, args=beamon_args, wall_time_minutes=80, application="beamon_chain_run1") # add_dependency(parent, child) dag.add_dependency(timestamp_job, beamon_job) dag.add_dependency(beamon_job, mergeFinal_job) print("Total number of events to be processed: ", tot_events) print("Total number of files to be processed: ", files_processed) print(workflow_timestamp) print(workflow_main) print(workflow_join)
for py_script in Path(path_to_ts_estimate).glob('**/*.py'):
    job_dir, script_name = os.path.split(str(py_script))
    job_to_add = BalsamJob(
        name=script_name,
        workflow=workflow_name,
        application='python',
        args=str(py_script),
        input_files='',
        ranks_per_node=1,
        threads_per_rank=balsam_exe_settings['threads_per_rank'],
        node_packing_count=node_packing_count,
        user_workdir=job_dir,
    )
    job_to_add.save()
    # for menten
    # ranks_per_node=1,
    # node_packing_count=node_packing_count,
    # threads_per_rank not specified

    # all job_to_add jobs are children of the 01 job, as listed in jobs_to_be_finished
    # nested for loop because BalsamJob.objects.filter(name=dep_job) returns a
    # django QuerySet for a single dep_job, e.g. (H_00_relax.py)
    # no nested loop required if workflow__contains=dependent_workflow_name
    for job in pending_simulations:
        for sub_job in job:
            add_dependency(sub_job, job_to_add)  # parent, child
    # do not run 03 until all 02 for a given reaction are done
    for job in pending_simulations_dep:
        add_dependency(job_to_add, job)  # parent, child
    app_path = os.path.join(work_dir, job)
    app_desc = 'Run ' + app_name
    run_line = sys.executable + ' ' + app_path
    add_app(app_name, run_line, app_desc)

    job_name = 'job_' + app_name
    dag.add_job(name=job_name,
                workflow="libe_workflow",
                application=app_name,
                num_nodes=num_nodes,
                ranks_per_node=ranks_per_node,
                stage_out_url="local:" + work_dir,
                stage_out_files=job_name + ".out")

    # Add a dependency between jobs so they run one at a time.
    if prev_job_name:
        BalsamJob = dag.BalsamJob
        parent = BalsamJob.objects.get(name=prev_job_name)
        child = BalsamJob.objects.get(name=job_name)
        dag.add_dependency(parent, child)
    prev_job_name = job_name

# Check how to do this in the API - until then, use the CLI
run_cmd("balsam ls apps", True)
run_cmd("balsam ls jobs", True)
print("")
run_cmd("echo -e To launch jobs run: balsam launcher --consume-all")
print("")
reco2_job = dag.add_job(name=f"reco2_{i}_{ievent}", workflow=workflow, description="uboone testing reco2", num_nodes=1, ranks_per_node=1, node_packing_count=node_pack_count, args=reco2_args, wall_time_minutes=50, application="uboonecode_v08_00_00_27") reco2_post_job = dag.add_job(name=f"reco2_post_{i}_{ievent}", workflow=workflow, description="uboone testing reco2 post", num_nodes=1, ranks_per_node=1, node_packing_count=node_pack_count, args=reco2_post_args, wall_time_minutes=50, application="uboonecode_v08_00_00_27") # add_dependency(parent, child) dag.add_dependency(reco1_job, celltree_job) dag.add_dependency(celltree_job, larcv_job) dag.add_dependency(larcv_job, reco1a_job) dag.add_dependency(reco1a_job, reco2_job) dag.add_dependency(reco2_job, reco2_post_job) dag.add_dependency(reco2_post_job, mergeFinal_job) print("Total number of events to be processed: ", tot_events)
description = "joining final outputfiles", num_nodes = 1, ranks_per_node = 1, node_packing_count = node_pack_count, args = join_args, wall_time_minutes = 1, application= "join_art_rootfiles_preproc" ) for ievent in range(nevents): beamoff_args = f"{_file} {ievent} {ts_string}" beamoff_job = dag.add_job( name = f"beamoff_{i}_{ievent}", workflow = workflow, description = "uboone full beam off chain", num_nodes = 1, ranks_per_node = 1, node_packing_count = node_pack_count, args = beamoff_args, wall_time_minutes = 50, application= "uboonecode_beamoff_chain_run1" ) # add_dependency(parent, child) dag.add_dependency(beamoff_job,mergeFinal_job) print("Total number of events to be processed: ", tot_events) print(workflow)
for ievent in range(nevents):
    v01b_args = f"{_file} {ievent}"
    v27_args = " "
    v01b_job = dag.add_job(name=f"v01b_{i}_{ievent}",
                           workflow=workflow,
                           description="uboone testing v08_00_00_01b chain",
                           num_nodes=1,
                           ranks_per_node=1,
                           node_packing_count=node_pack_count,
                           args=v01b_args,
                           wall_time_minutes=50,
                           application="uboonecode_v08_00_00_01b_combined")
    v27_job = dag.add_job(name=f"v27_{i}_{ievent}",
                          workflow=workflow,
                          description="uboone testing v08_00_00_27",
                          num_nodes=1,
                          ranks_per_node=1,
                          node_packing_count=node_pack_count,
                          args=v27_args,
                          wall_time_minutes=50,
                          application="uboonecode_v08_00_00_27_combined")
    # add_dependency(parent, child)
    dag.add_dependency(v01b_job, v27_job)
    dag.add_dependency(v27_job, mergeFinal_job)

print("Total number of events to be processed: ", tot_events)
def mock_addjobs():
    job1 = dag.add_job(name="added1")
    job2 = dag.add_job(name="added2")
    job3 = dag.add_job(name="added3")
    dag.add_dependency(parent=job2, child=job3)
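# A sketch of exercising mock_addjobs() (an assumption, not from the source):
# after it runs, "added3" should have exactly one parent, "added2", while
# "added1" stays independent. get_parents() is the legacy-Balsam counterpart
# of the get_children() call used in the fault-tolerance example below.
def check_mock_dag():
    mock_addjobs()
    job3 = dag.BalsamJob.objects.get(name="added3")
    assert [p.name for p in job3.get_parents()] == ["added2"]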
        input_files='',
        user_workdir=job_dir,
        node_packing_count=node_packing_count,
        ranks_per_node=1,
    )
    job_to_add.save()

    # for a given rxn_name, get all BalsamJob objects that it depends on
    dependancy.append(
        BalsamJob.objects.filter(name=py_script).exclude(
            state="JOB_FINISHED"))

# add dependencies
for job in pending_simulations_dep_1:
    for adding_job in dependancy:
        # handle double species like O2_O+O
        try:
            add_dependency(adding_job, job)  # parent, child
        except RuntimeError:
            pass

# ads_vib job dependencies
dependent_workflow_name_2 = facetpath + '_vib'
pending_simulations_dep_2 = BalsamJob.objects.filter(
    workflow__contains=dependent_workflow_name_2).exclude(
        state="JOB_FINISHED")
for pending_job in pending_simulations_dep_2:
    for submitted_job in all_submitted_jobs:
        balsam_submitted_job = BalsamJob.objects.filter(
            name=submitted_job).exclude(state="JOB_FINISHED")
        add_dependency(balsam_submitted_job, pending_job)  # parent, child
r_grid = np.linspace(0.8, 1.3)
water_scan = []
for i, r in enumerate(r_grid):
    job = BalsamJob(
        name=f"task{i}",
        workflow="demo",
        description=f"r = {r:.3f}",
        application="nwchem-water",
        args="input.nw",
        num_nodes=1,
        ranks_per_node=64,
        cpu_affinity="depth",
        data={'r': r, 'theta': 104.5},
    )
    water_scan.append(job)
    job.save()

plotjob = BalsamJob(
    name="plot",
    application="plot-pes",
    workflow="demo",
    input_files="",
)
plotjob.save()

for job in water_scan:
    add_dependency(parent=job, child=plotjob)
description="uboone reco1 beam off chain", num_nodes=1, ranks_per_node=1, node_packing_count=node_pack_count, args=beamoff_reco1_args, wall_time_minutes=80, application="beamoff_chain_run1_reco1") beamoff_reco2_job = dag.add_job( name=f"beamoff_reco2_{i}_{ievent}", workflow=workflow_main_reco2, description="uboone reco2 beamoff on chain", num_nodes=1, ranks_per_node=1, node_packing_count=node_pack_count, args=beamoff_reco2_args, wall_time_minutes=80, application="beamoff_chain_run1_reco2") # add_dependency(parent, child) dag.add_dependency(timestamp_job, beamoff_reco1_job) dag.add_dependency(beamoff_reco1_job, beamoff_reco2_job) dag.add_dependency(beamoff_reco2_job, mergeFinal_job) print("Total number of events to be processed: ", tot_events) print("Total number of files to be processed: ", files_processed) print(workflow_timestamp) print(workflow_main_reco1) print(workflow_main_reco2) print(workflow_join)
def postprocess_training():
    # First, get this current job:
    current_job = dag.current_job

    # Let's parse the arguments to get the logdir:
    args, unknown = generic_parser()
    print(args)
    print(unknown)

    # We should be able to see the log dir.
    print("Attempting to scrape tensorboard information from {}".format(
        args.log_directory))
    train_steps, train_loss, test_steps, test_loss = tabulate_events(
        args.log_directory)
    value = quantify_overtraining(minibatch_size=args.minibatch_size,
                                  train_loss=train_loss,
                                  test_loss=test_loss,
                                  train_steps=train_steps,
                                  test_steps=test_steps)

    if "2D" in dag.current_job.application:
        dimension = "2D"
    else:
        dimension = "3D"
    print(dimension)

    if value == 0:
        print("Should spawn another identical job")
        next_job = spawn_training_job(
            num_nodes=dag.current_job.num_nodes,
            wall_time_minutes=dag.current_job.wall_time_minutes,
            name=dag.current_job.name + "C",
            workflow=dag.current_job.workflow,
            dimension=dimension,
            args=dag.current_job.args)
        dag.add_dependency(dag.current_job, next_job)
    elif value == -1:
        print("Aborting this path")
    # elif value == 1:
    #     args.learning_rate = str(float(args.learning_rate)*0.5)
    #     new_args = " ".join(unknown)
    #     for key in args:
    #         new_args += "--{key} {value}".format(key=key, value=args[key])
    #     print("Should spawn another job, lower LR")
    #     next_job = spawn_training_job(
    #         num_nodes=dag.current_job.num_nodes,
    #         wall_time_minutes=dag.current_job.wall_time_minutes,
    #         name=dag.current_job.name + "-",
    #         workflow=dag.current_job.workflow,
    #         dimension=dimension,
    #         args=new_args
    #     )
    #     dag.add_dependency(dag.current_job, next_job)
    #     print("Decrease learning rate")
    elif value == 2:
        # Inference files:
        inference_files = [
            "/lus/theta-fs0/projects/datascience/cadams/wire_pixel_preprocessed_files_split/val_event_id_1_of_8.root",
            "/lus/theta-fs0/projects/datascience/cadams/wire_pixel_preprocessed_files_split/val_event_id_2_of_8.root",
            "/lus/theta-fs0/projects/datascience/cadams/wire_pixel_preprocessed_files_split/val_event_id_3_of_8.root",
            "/lus/theta-fs0/projects/datascience/cadams/wire_pixel_preprocessed_files_split/val_event_id_4_of_8.root",
            "/lus/theta-fs0/projects/datascience/cadams/wire_pixel_preprocessed_files_split/val_event_id_5_of_8.root",
            "/lus/theta-fs0/projects/datascience/cadams/wire_pixel_preprocessed_files_split/val_event_id_6_of_8.root",
            "/lus/theta-fs0/projects/datascience/cadams/wire_pixel_preprocessed_files_split/val_event_id_7_of_8.root",
            "/lus/theta-fs0/projects/datascience/cadams/wire_pixel_preprocessed_files_split/val_event_id_8_of_8.root",
        ]
        for i, _file in enumerate(inference_files):
            basename = os.path.basename(_file)
            basename = basename.replace('.root', '_out.root')
            out_file = args.checkpoint_directory + basename
            args.file = _file
            args.minibatch_size = 1
            args.iterations = 10
            new_args = " ".join(unknown)
            for key in vars(args):
                new_args += "--{key} {value} ".format(
                    key=key.replace("_", "-"), value=getattr(args, key))
            # Add the output file
            new_args += "--output-file {}".format(out_file)
            print(new_args)
            next_job = spawn_inference_job(
                num_nodes=dag.current_job.num_nodes,
                wall_time_minutes=dag.current_job.wall_time_minutes,
                name=dag.current_job.name + "I{}".format(i),
                workflow=dag.current_job.workflow,
                dimension=dimension,
                args=new_args)
            dag.add_dependency(dag.current_job, next_job)
            break
        print("Spawn inference jobs")
else: print("make_sides_post recognized timeout flag") num_sides = int(os.environ['BALSAM_FT_NUM_SIDES']) num_files = len(glob.glob("side*.dat")) if num_files == num_sides: print("it's okay, the job was actually done") current_job.update_state("JOB_FINISHED", "handled error; it was okay") exit(0) elif num_files == 0: print("Creating rescue job") children = current_job.get_children() rescue = dag.spawn_child(clone=True, application_args="--sleep 0 --retcode 0") rescue.set_parents([]) current_job.update_state("JOB_FINISHED", f"spawned rescue job {rescue.cute_id}") for child in children: child.set_parents([rescue]) exit(0) if '--dynamic-spawn' not in sys.argv: sys.exit(0) reduce_job = current_job.get_child_by_name('sum_squares') for i, sidefile in enumerate(glob.glob("side*.dat")): square_job = dag.spawn_child(name=f"square{i}", application="square", application_args=sidefile, input_files=sidefile) dag.add_dependency(parent=square_job, child=reduce_job) print(f"spawned square{i} job")