def execute_pattern(self, pattern, resource):
    """Execute an AllPairs pattern on the given resource.

    First creates one ComputeUnit per element of set 1 (and, if defined,
    set 2), linking each element's output file into the pilot staging
    area.  Then submits one comparison ComputeUnit per window pair,
    waits for completion, and appends timing information to
    ``pattern._execution_profile``.

    :param pattern:  the AllPairs pattern instance to execute.
    :param resource: the resource handle providing the pilot/unit managers.
    """
    pattern_start_time = datetime.datetime.now()

    def unit_state_cb(unit, state):
        # Callback function for the messages printed when
        # RADICAL_ENMD_VERBOSE = info
        if state == radical.pilot.DONE:
            # FIX: corrected "succefully" typo in the log message.
            self.get_logger().info("Task {0} state has finished successfully.".format(unit.uid))
        if state == radical.pilot.FAILED:
            # In case a unit fails, report the error message.
            self.get_logger().error("Task {0} FAILED.".format(unit.uid))
            self.get_logger().error("Error: {0}".format(unit.stderr))

    def comparisons(set1, set2):
        # Return the list of [i, j] (1-based) index pairs to be compared.
        #
        # FIX: the two branches were swapped in the original -- the
        # "set2 is None" branch iterated over len(set2) (a TypeError on
        # None) while the two-set branch paired set1 with itself.
        ret = list()
        if set2 is None:
            # Single set: all unordered pairs within set1 (j > i).
            for i in range(1, len(set1) + 1):
                for j in range(i + 1, len(set1) + 1):
                    ret.append([i, j])
        else:
            # Two sets: full cross product between set1 and set2.
            for i in range(1, len(set1) + 1):
                for j in range(1, len(set2) + 1):
                    ret.append([i, j])
        return ret

    #-----------------------------------------------------------------------
    # Starting Plugin Execution
    self.get_logger().debug("Set 1 is {0}".format(pattern.set1_elements()))
    self.get_logger().debug("Set 2 is {0}".format(pattern.set2_elements()))
    NumElementsSet1 = len(pattern.set1_elements())
    Permutations = pattern.permutations
    if pattern.set2_elements() is None:
        self.get_logger().info("Number of Elements {0}".format(NumElementsSet1))
        self.get_logger().info("Executing All Pairs Pattern on the set {0} with {1} cores on {2}"
                               .format(pattern.set1_elements(), resource._cores, resource._resource_key))
    else:
        NumElementsSet2 = len(pattern.set2_elements())
        self.get_logger().info("Number of Elements of the First Set {0}".format(NumElementsSet1))
        # FIX: the original logged "First Set" twice; this line reports set 2.
        self.get_logger().info("Number of Elements of the Second Set {0}".format(NumElementsSet2))
        self.get_logger().info("Executing All Pairs Pattern on the sets {0}-{1} with {2} cores on {3}"
                               .format(pattern.set1_elements(), pattern.set2_elements(),
                                       resource._cores, resource._resource_key))

    STAGING_AREA = 'staging:///'

    try:
        resource._umgr.register_callback(unit_state_cb)

        CUDesc_list = list()
        self.get_logger().info("Creating the Elements of Set 1")
        for i in range(1, NumElementsSet1 + 1):
            kernel = pattern.set1element_initialization(element=i)
            link_out_data = kernel.get_arg("--filename=")
            kernel._bind_to_resource(resource._resource_key)
            self.get_logger().debug("Kernels : {0}, Name: {1}".format(kernel, dir(kernel)))
            # Output file staging: after the file is created in the unit's
            # sandbox it is linked into the shared staging area.
            OUTPUT_FILE = {'source': link_out_data,
                           'target': os.path.join(STAGING_AREA, link_out_data),
                           'action': radical.pilot.LINK}
            cudesc = radical.pilot.ComputeUnitDescription()
            cudesc.pre_exec = kernel._cu_def_pre_exec
            cudesc.executable = kernel._cu_def_executable
            cudesc.arguments = kernel.arguments
            cudesc.mpi = kernel.uses_mpi
            cudesc.output_staging = [OUTPUT_FILE]
            self.get_logger().debug("Pre Exec: {0} Executable: {1} Arguments: {2} MPI: {3} Output: {4}".format(
                cudesc.pre_exec, kernel._cu_def_executable, cudesc.arguments, cudesc.mpi, cudesc.output_staging))
            CUDesc_list.append(cudesc)

        if pattern.set2_elements() is not None:
            self.get_logger().info("Creating the Elements of Set 2")
            for i in range(1, NumElementsSet2 + 1):
                kernel = pattern.set2element_initialization(element=i)
                link_out_data = kernel.get_arg("--filename=")
                kernel._bind_to_resource(resource._resource_key)
                self.get_logger().debug("Kernels : {0}, Name: {1}".format(kernel, dir(kernel)))
                OUTPUT_FILE = {'source': link_out_data,
                               'target': os.path.join(STAGING_AREA, link_out_data),
                               'action': radical.pilot.LINK}
                cudesc = radical.pilot.ComputeUnitDescription()
                cudesc.pre_exec = kernel._cu_def_pre_exec
                cudesc.executable = kernel._cu_def_executable
                cudesc.arguments = kernel.arguments
                cudesc.mpi = kernel.uses_mpi
                cudesc.output_staging = [OUTPUT_FILE]
                self.get_logger().debug("Pre Exec: {0} Executable: {1} Arguments: {2} MPI: {3} Output: {4}".format(
                    cudesc.pre_exec, kernel._cu_def_executable, cudesc.arguments, cudesc.mpi, cudesc.output_staging))
                CUDesc_list.append(cudesc)

        # Run every element-initialization unit and wait for all of them.
        Units = resource._umgr.submit_units(CUDesc_list)
        resource._umgr.wait_units()

        CUDesc_list = list()
        all_cus = []
        windowsize1 = pattern._windowsize1
        windowsize2 = pattern._windowsize2
        journal = dict()
        step_timings = {"name": "AllPairs", "timings": {}}
        step_start_time_abs = datetime.datetime.now()

        for i in range(1, NumElementsSet1 + 1, windowsize1):
            if pattern.set2_elements() is None:
                # Single set: compare window [i, i+w) against every window
                # [j, j+w) with j >= i, so each pair is built only once.
                for j in range(i, NumElementsSet1 + 1, windowsize1):
                    kernel = pattern.element_comparison(elements1=range(i, i + windowsize1),
                                                        elements2=range(j, j + windowsize1))
                    # The kernel argument may be a Python-literal list of
                    # filenames or a single plain filename.
                    try:
                        link_input1 = ast.literal_eval(kernel.get_arg("--inputfile1="))
                    except:
                        link_input1 = [kernel.get_arg("--inputfile1=")]
                    try:
                        link_input2 = ast.literal_eval(kernel.get_arg("--inputfile2="))
                    except:
                        link_input2 = [kernel.get_arg("--inputfile2=")]
                    link_output = kernel.get_arg("--outputfile=")
                    kernel._bind_to_resource(resource._resource_key)
                    self.get_logger().info("Kernels : {0}, Name: {1}".format(kernel, dir(kernel)))
                    self.get_logger().debug("i = {0}, j = {1}, window size = {2}".format(i, j, windowsize1))
                    self.get_logger().debug("Link Input 1 = {0}".format(link_input1))
                    self.get_logger().debug("Link Input 2 = {0}".format(link_input2))
                    # Link the window's input files from the staging area
                    # into the comparison unit's sandbox.
                    INPUT_FILE1 = [{'source': os.path.join(STAGING_AREA, link_input1[k - 1]),
                                    'target': link_input1[k - 1],
                                    'action': radical.pilot.LINK} for k in range(1, windowsize1 + 1)]
                    if i != j:
                        INPUT_FILE2 = [{'source': os.path.join(STAGING_AREA, link_input2[k - 1]),
                                        'target': link_input2[k - 1],
                                        'action': radical.pilot.LINK} for k in range(1, windowsize1 + 1)]
                    else:
                        # Same window on both sides: the files are already
                        # linked via INPUT_FILE1.
                        INPUT_FILE2 = []
                    cudesc = radical.pilot.ComputeUnitDescription()
                    cudesc.name = "comp; {el11};{el21}".format(el11=i, el21=j)
                    cudesc.pre_exec = kernel._cu_def_pre_exec
                    cudesc.executable = kernel._cu_def_executable
                    cudesc.arguments = kernel.arguments
                    cudesc.mpi = kernel.uses_mpi
                    self.get_logger().debug("Input File 1: {0}".format(INPUT_FILE1))
                    self.get_logger().debug("Input File 2: {0}".format(INPUT_FILE2))
                    if kernel._cu_def_input_data is None:
                        self.get_logger().debug("Input Staging without Kernel CU DEF Input Data")
                        cudesc.input_staging = INPUT_FILE1 + INPUT_FILE2
                    else:
                        self.get_logger().debug("Input Staging with Kernel CU DEF Input Data")
                        cudesc.input_staging = kernel._cu_def_input_data + INPUT_FILE1 + INPUT_FILE2
                    cudesc.output_staging = [link_output]
                    self.get_logger().debug("Pre Exec: {0} Executable: {1} Arguments: {2} MPI: {3} Input: {4} Output: {5}".format(
                        cudesc.pre_exec, kernel._cu_def_executable, cudesc.arguments, cudesc.mpi,
                        cudesc.input_staging, cudesc.output_staging))
                    all_cus.append(cudesc)
            else:
                # Two sets: compare every window of set 1 against every
                # window of set 2 (full cross product).
                for j in range(1, NumElementsSet2 + 1, windowsize2):
                    kernel = pattern.element_comparison(elements1=range(i, i + windowsize1),
                                                        elements2=range(j, j + windowsize2))
                    try:
                        link_input1 = ast.literal_eval(kernel.get_arg("--inputfile1="))
                    except:
                        link_input1 = [kernel.get_arg("--inputfile1=")]
                    try:
                        link_input2 = ast.literal_eval(kernel.get_arg("--inputfile2="))
                    except:
                        link_input2 = [kernel.get_arg("--inputfile2=")]
                    link_output = kernel.get_arg("--outputfile=")
                    kernel._bind_to_resource(resource._resource_key)
                    self.get_logger().info("Kernels : {0}, Name: {1}".format(kernel, dir(kernel)))
                    INPUT_FILE1 = [{'source': os.path.join(STAGING_AREA, link_input1[k - 1]),
                                    'target': link_input1[k - 1],
                                    'action': radical.pilot.LINK} for k in range(1, windowsize1 + 1)]
                    INPUT_FILE2 = [{'source': os.path.join(STAGING_AREA, link_input2[k - 1]),
                                    'target': link_input2[k - 1],
                                    'action': radical.pilot.LINK} for k in range(1, windowsize2 + 1)]
                    cudesc = radical.pilot.ComputeUnitDescription()
                    cudesc.name = "comp; {el11};{el21}".format(el11=i, el21=j)
                    cudesc.pre_exec = kernel._cu_def_pre_exec
                    cudesc.executable = kernel._cu_def_executable
                    cudesc.arguments = kernel.arguments
                    cudesc.mpi = kernel.uses_mpi
                    if kernel._cu_def_input_data is None:
                        self.get_logger().debug("Input Staging without Kernel CU DEF Input Data")
                        cudesc.input_staging = INPUT_FILE1 + INPUT_FILE2
                    else:
                        self.get_logger().debug("Input Staging with Kernel CU DEF Input Data")
                        cudesc.input_staging = kernel._cu_def_input_data + INPUT_FILE1 + INPUT_FILE2
                    cudesc.output_staging = [link_output]
                    self.get_logger().debug("Pre Exec: {0} Executable: {1} Arguments: {2} MPI: {3} Input: {4} Output: {5}".format(
                        cudesc.pre_exec, kernel._cu_def_executable, cudesc.arguments, cudesc.mpi,
                        cudesc.input_staging, cudesc.output_staging))
                    all_cus.append(cudesc)

        # Submit all comparison units at once and wait for completion.
        sub_unit = resource._umgr.submit_units(all_cus)
        resource._umgr.wait_units()
        step_end_time_abs = datetime.datetime.now()
        self.get_logger().info("Pattern execution successful.")

        # Process CU information and append it to the profiling dictionary.
        tinfo = extract_timing_info(sub_unit, pattern_start_time, step_start_time_abs, step_end_time_abs)
        self.get_logger().debug("Extracted timings Information")
        mean_unit_time = 0
        for unit in sub_unit:
            mean_unit_time = mean_unit_time + (unit.stop_time - unit.start_time).total_seconds()
        mean_unit_time = mean_unit_time / len(sub_unit)
        self.get_logger().debug("Mean CU execution time is %f" % mean_unit_time)
        for key, val in tinfo.iteritems():
            step_timings['timings'][key] = val
        self.get_logger().debug("Created step timings")
        # Write the whole thing to the profiling dict
        pattern._execution_profile.append(step_timings)
        self.get_logger().debug("Wrote the whole thing to the profiling dict")

    except KeyboardInterrupt:
        traceback.print_exc()
def execute_pattern(self, pattern, resource):
    """Execute a ReplicaExchange pattern on the given resource.

    For each of ``pattern.nr_cycles`` cycles, runs one MD ComputeUnit per
    replica, waits for completion, then (within the same cycle) computes
    a swap matrix and exchanges parameters between replica pairs.  When
    the RADICAL_ENMD_PROFILING environment variable is "1", per-step
    timings are appended to ``pattern._execution_profile``.
    """
    try:
        try:
            # +1 because the cycle loop below uses range(1, cycles),
            # which runs exactly nr_cycles times.
            cycles = pattern.nr_cycles + 1
        except:
            self.get_logger().exception("Number of cycles (nr_cycles) must be defined for pattern ReplicaExchange!")
            raise

        # Profiling is opt-in via environment variable.
        do_profile = os.getenv("RADICAL_ENMD_PROFILING", "0")
        if do_profile == "1":
            pattern._execution_profile = []
            all_cus = []

        # shared data needs to be processed here
        #

        # Pilot must be active before units are submitted.
        resource._pmgr.wait_pilots(resource._pilot.uid, "Active")

        if do_profile == "1":
            pattern_start_time = datetime.datetime.now()

        replicas = pattern.get_replicas()

        for c in range(1, cycles):
            if do_profile == "1":
                step_timings = {"name": "md_run_{0}".format(c), "timings": {}}
                step_start_time_abs = datetime.datetime.now()

            # -------------------------------------------------------------
            # MD phase: one ComputeUnit per replica, submitted one by one.
            md_units = []
            for r in replicas:
                self.get_logger().info("Building input files for replica %d" % r.id)
                pattern.build_input_file(r)
                self.get_logger().info("Preparing replica %d for MD run" % r.id)
                r_kernel = pattern.prepare_replica_for_md(r)
                r_kernel._bind_to_resource(resource._resource_key)
                # need to process data directives here
                #
                cu = radical.pilot.ComputeUnitDescription()
                cu.pre_exec = r_kernel._cu_def_pre_exec
                cu.executable = r_kernel._cu_def_executable
                cu.arguments = r_kernel.arguments
                cu.mpi = r_kernel.uses_mpi
                cu.cores = r_kernel.cores
                cu.input_staging = r_kernel._cu_def_input_data
                cu.output_staging = r_kernel._cu_def_output_data
                sub_replica = resource._umgr.submit_units(cu)
                md_units.append(sub_replica)

            if do_profile == "1":
                all_cus.extend(md_units)

            self.get_logger().info("Performing MD step for replicas")
            resource._umgr.wait_units()

            if do_profile == "1":
                step_end_time_abs = datetime.datetime.now()

            # Collect a report of any failed MD units.
            failed_units = ""
            for unit in md_units:
                if unit.state != radical.pilot.DONE:
                    failed_units += " * MD step: Unit {0} failed with an error: {1}\n".format(unit.uid, unit.stderr)

            # NOTE(review): the collected failed_units message is never
            # logged or passed to sys.exit(), so the process dies silently
            # on MD failure -- confirm whether sys.exit(failed_units) was
            # intended.
            if len(failed_units) > 0:
                sys.exit()

            if do_profile == "1":
                # Process CU information and append it to the dictionary
                if isinstance(pattern_start_time, datetime.datetime):
                    if isinstance(step_start_time_abs, datetime.datetime):
                        if isinstance(step_end_time_abs, datetime.datetime):
                            tinfo = extract_timing_info(
                                md_units, pattern_start_time, step_start_time_abs, step_end_time_abs
                            )
                        else:
                            sys.exit(
                                "Ensemble MD Toolkit Error: step_end_time_abs for {0} is not datetime.datetime instance.".format(
                                    step_timings["name"]
                                )
                            )
                    else:
                        sys.exit(
                            "Ensemble MD Toolkit Error: step_start_time_abs for {0} is not datetime.datetime instance.".format(
                                step_timings["name"]
                            )
                        )
                else:
                    sys.exit("Ensemble MD Toolkit Error: pattern_start_time is not datetime.datetime instance.")

                for key, val in tinfo.iteritems():
                    step_timings["timings"][key] = val

                # Write the whole thing to the profiling dict
                pattern._execution_profile.append(step_timings)

            # ---------------------------------------------------------------
            # Exchange phase.
            # NOTE(review): c ranges over 1..cycles-1, so "c < cycles" is
            # always true and the exchange also runs after the last MD
            # cycle -- possibly "c < cycles - 1" was intended; confirm.
            if c < cycles:
                if do_profile == "1":
                    step_timings = {"name": "local_exchange_{0}".format(c), "timings": {}}
                    step_start_time_abs = datetime.datetime.now()

                # computing swap matrix
                self.get_logger().info("Computing swap matrix")
                swap_matrix = pattern.get_swap_matrix(replicas)

                # this is actual exchange
                for r_i in replicas:
                    r_j = pattern.exchange(r_i, replicas, swap_matrix)
                    if r_j != r_i:
                        # swap parameters
                        self.get_logger().info(
                            "Performing exchange of parameters between replica %d and replica %d" % (r_j.id, r_i.id)
                        )
                        pattern.perform_swap(r_i, r_j)

                if do_profile == "1":
                    step_end_time_abs = datetime.datetime.now()
                    # processing timings
                    step_start_time_rel = step_start_time_abs - pattern_start_time
                    step_end_time_rel = step_end_time_abs - pattern_start_time
                    tinfo = {
                        "step_start_time": {"abs": step_start_time_abs, "rel": step_start_time_rel},
                        "step_end_time": {"abs": step_end_time_abs, "rel": step_end_time_rel},
                    }
                    for key, val in tinfo.iteritems():
                        step_timings["timings"][key] = val
                    # Write the whole thing to the profiling dict
                    pattern._execution_profile.append(step_timings)

        # --------------------------------------------------------------------
        # End of simulation loop
        # ------------------------

    except Exception, ex:
        self.get_logger().exception("Fatal error during execution: {0}.".format(str(ex)))
        raise
def execute_pattern(self, pattern, resource):
    """Execute a simulation-analysis loop pattern on the given resource.

    Runs an optional ``pre_loop`` kernel, then for each iteration runs
    all simulation kernels (one ComputeUnit per simulation instance)
    followed by all analysis kernels (one per analysis instance).  Data
    movement directives (upload/link/copy/download) declared on each
    kernel are translated into RADICAL-Pilot staging directives after
    placeholder resolution.  Per-step timings are appended to
    ``pattern._execution_profile``.
    """
    pattern_start_time = datetime.datetime.now()

    #-----------------------------------------------------------------------
    #
    def unit_state_cb (unit, state) :
        # NOTE(review): the format string uses {0} twice, so STDOUT prints
        # the stderr value again and unit.stdout is never shown -- the
        # second placeholder presumably should be {1}; confirm.
        if state == radical.pilot.FAILED:
            self.get_logger().error("ComputeUnit error: STDERR: {0}, STDOUT: {0}".format(unit.stderr, unit.stdout))
            self.get_logger().error("Pattern execution FAILED.")

    self.get_logger().info("Executing simulation-analysis loop with {0} iterations on {1} allocated core(s) on '{2}'".format(pattern.iterations, resource._cores, resource._resource_key))

    working_dirs = {}          # maps step/instance labels to remote working directories
    all_cus = []               # every CU submitted by this pattern run
    pattern._execution_profile = []

    try:
        resource._umgr.register_callback(unit_state_cb)

        ########################################################################
        # execute pre_loop
        #
        try:
            ################################################################
            # EXECUTE PRE-LOOP
            step_timings = {"name": "pre_loop", "timings": {}}
            step_start_time_abs = datetime.datetime.now()

            pre_loop = pattern.pre_loop()
            pre_loop._bind_to_resource(resource._resource_key)

            cu = radical.pilot.ComputeUnitDescription()
            cu.name = "pre_loop"
            cu.pre_exec = pre_loop._cu_def_pre_exec
            cu.executable = pre_loop._cu_def_executable
            cu.arguments = pre_loop.arguments
            cu.mpi = pre_loop.uses_mpi
            cu.input_staging = pre_loop._cu_def_input_data
            cu.output_staging = pre_loop._cu_def_output_data

            self.get_logger().debug("Created pre_loop CU: {0}.".format(cu.as_dict()))

            unit = resource._umgr.submit_units(cu)
            all_cus.append(unit)
            self.get_logger().info("Submitted ComputeUnit(s) for pre_loop step.")
            self.get_logger().info("Waiting for ComputeUnit(s) in pre_loop step to complete.")
            resource._umgr.wait_units()
            self.get_logger().info("Pre_loop completed.")
            step_end_time_abs = datetime.datetime.now()

            # NOTE(review): this raise is caught by the enclosing
            # "except Exception" below, which treats ANY failure here as
            # "pre_loop() not defined" -- real pre-loop failures are
            # silently swallowed.  Also the message reports unit.stdout,
            # not unit.stderr; confirm which was intended.
            if unit.state != radical.pilot.DONE:
                raise EnsemblemdError("Pre-loop CU failed with error: {0}".format(unit.stdout))

            pre_loop_cu = [unit]
            working_dirs["pre_loop"] = saga.Url(unit.working_directory).path

            # Process CU information and append it to the dictionary
            tinfo = extract_timing_info(pre_loop_cu, pattern_start_time, step_start_time_abs, step_end_time_abs)
            for key, val in tinfo.iteritems():
                step_timings['timings'][key] = val

            # Write the whole thing to the profiling dict
            pattern._execution_profile.append(step_timings)

        except Exception:
            # Doesn't exist. That's fine as it is not mandatory.
            self.get_logger().info("pre_loop() not defined. Skipping.")
            pass

        ########################################################################
        # execute simulation analysis loop
        #
        for iteration in range(1, pattern.iterations+1):
            working_dirs['iteration_{0}'.format(iteration)] = {}

            ################################################################
            # EXECUTE SIMULATION STEPS
            step_timings = {"name": "simulation_iteration_{0}".format(iteration), "timings": {}}
            step_start_time_abs = datetime.datetime.now()

            # A simulation step may return a single kernel or a list of
            # kernels; probe with (iteration=1, instance=1) to count them.
            if isinstance(pattern.simulation_step(iteration=1, instance=1),list):
                num_sim_kerns = len(pattern.simulation_step(iteration=1, instance=1))
            else:
                num_sim_kerns = 1
            #print num_sim_kerns

            all_sim_cus = []
            for kern_step in range(0,num_sim_kerns):
                s_units = []
                for s_instance in range(1, pattern._simulation_instances+1):
                    if isinstance(pattern.simulation_step(iteration=iteration, instance=s_instance),list):
                        sim_step = pattern.simulation_step(iteration=iteration, instance=s_instance)[kern_step]
                    else:
                        sim_step = pattern.simulation_step(iteration=iteration, instance=s_instance)
                    sim_step._bind_to_resource(resource._resource_key)

                    # Resolve all placeholders
                    #if sim_step.link_input_data is not None:
                    #    for i in range(len(sim_step.link_input_data)):
                    #        sim_step.link_input_data[i] = resolve_placeholder_vars(working_dirs, s_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "simulation", sim_step.link_input_data[i])

                    cud = radical.pilot.ComputeUnitDescription()
                    cud.name = "sim ;{iteration} ;{instance}".format(iteration=iteration, instance=s_instance)
                    cud.pre_exec = sim_step._cu_def_pre_exec
                    cud.executable = sim_step._cu_def_executable
                    cud.arguments = sim_step.arguments
                    cud.mpi = sim_step.uses_mpi
                    cud.input_staging = None
                    cud.output_staging = None

                    # INPUT DATA:
                    #------------------------------------------------------------------------------------------------------------------
                    # upload_input_data: "src > dst" entries become transfer
                    # directives; a bare "src" keeps its basename as target.
                    data_in = []
                    if sim_step._kernel._upload_input_data is not None:
                        if isinstance(sim_step._kernel._upload_input_data,list):
                            pass
                        else:
                            sim_step._kernel._upload_input_data = [sim_step._kernel._upload_input_data]
                        for i in range(0,len(sim_step._kernel._upload_input_data)):
                            var=resolve_placeholder_vars(working_dirs, s_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "simulation", sim_step._kernel._upload_input_data[i])
                            if len(var.split('>')) > 1:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': var.split('>')[1].strip()
                                }
                            else:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': os.path.basename(var.split('>')[0].strip())
                                }
                            data_in.append(temp)
                        if cud.input_staging is None:
                            cud.input_staging = data_in
                        else:
                            cud.input_staging += data_in
                    #------------------------------------------------------------------------------------------------------------------

                    #------------------------------------------------------------------------------------------------------------------
                    # link_input_data: same parsing, staged with LINK action.
                    data_in = []
                    if sim_step._kernel._link_input_data is not None:
                        if isinstance(sim_step._kernel._link_input_data,list):
                            pass
                        else:
                            sim_step._kernel._link_input_data = [sim_step._kernel._link_input_data]
                        for i in range(0,len(sim_step._kernel._link_input_data)):
                            var=resolve_placeholder_vars(working_dirs, s_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "simulation", sim_step._kernel._link_input_data[i])
                            if len(var.split('>')) > 1:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': var.split('>')[1].strip(),
                                    'action': radical.pilot.LINK
                                }
                            else:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': os.path.basename(var.split('>')[0].strip()),
                                    'action': radical.pilot.LINK
                                }
                            data_in.append(temp)
                        if cud.input_staging is None:
                            cud.input_staging = data_in
                        else:
                            cud.input_staging += data_in
                    #------------------------------------------------------------------------------------------------------------------

                    #------------------------------------------------------------------------------------------------------------------
                    # copy_input_data: same parsing, staged with COPY action.
                    data_in = []
                    if sim_step._kernel._copy_input_data is not None:
                        if isinstance(sim_step._kernel._copy_input_data,list):
                            pass
                        else:
                            sim_step._kernel._copy_input_data = [sim_step._kernel._copy_input_data]
                        for i in range(0,len(sim_step._kernel._copy_input_data)):
                            var=resolve_placeholder_vars(working_dirs, s_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "simulation", sim_step._kernel._copy_input_data[i])
                            if len(var.split('>')) > 1:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': var.split('>')[1].strip(),
                                    'action': radical.pilot.COPY
                                }
                            else:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': os.path.basename(var.split('>')[0].strip()),
                                    'action': radical.pilot.COPY
                                }
                            data_in.append(temp)
                        if cud.input_staging is None:
                            cud.input_staging = data_in
                        else:
                            cud.input_staging += data_in
                    #------------------------------------------------------------------------------------------------------------------

                    #------------------------------------------------------------------------------------------------------------------
                    # download input data (already in directive form).
                    if sim_step.download_input_data is not None:
                        data_in = sim_step.download_input_data
                        if cud.input_staging is None:
                            cud.input_staging = data_in
                        else:
                            cud.input_staging += data_in
                    #------------------------------------------------------------------------------------------------------------------

                    # OUTPUT DATA:
                    #------------------------------------------------------------------------------------------------------------------
                    # copy_output_data: staged out with COPY action.
                    data_out = []
                    if sim_step._kernel._copy_output_data is not None:
                        if isinstance(sim_step._kernel._copy_output_data,list):
                            pass
                        else:
                            sim_step._kernel._copy_output_data = [sim_step._kernel._copy_output_data]
                        for i in range(0,len(sim_step._kernel._copy_output_data)):
                            var=resolve_placeholder_vars(working_dirs, s_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "simulation", sim_step._kernel._copy_output_data[i])
                            if len(var.split('>')) > 1:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': var.split('>')[1].strip(),
                                    'action': radical.pilot.COPY
                                }
                            else:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': os.path.basename(var.split('>')[0].strip()),
                                    'action': radical.pilot.COPY
                                }
                            data_out.append(temp)
                        if cud.output_staging is None:
                            cud.output_staging = data_out
                        else:
                            cud.output_staging += data_out
                    #------------------------------------------------------------------------------------------------------------------

                    #------------------------------------------------------------------------------------------------------------------
                    # download_output_data: transferred back (no action key).
                    data_out = []
                    if sim_step._kernel._download_output_data is not None:
                        if isinstance(sim_step._kernel._download_output_data,list):
                            pass
                        else:
                            sim_step._kernel._download_output_data = [sim_step._kernel._download_output_data]
                        for i in range(0,len(sim_step._kernel._download_output_data)):
                            var=resolve_placeholder_vars(working_dirs, s_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "simulation", sim_step._kernel._download_output_data[i])
                            if len(var.split('>')) > 1:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': var.split('>')[1].strip()
                                }
                            else:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': os.path.basename(var.split('>')[0].strip())
                                }
                            data_out.append(temp)
                        if cud.output_staging is None:
                            cud.output_staging = data_out
                        else:
                            cud.output_staging += data_out
                    #------------------------------------------------------------------------------------------------------------------

                    if sim_step.cores is not None:
                        cud.cores = sim_step.cores

                    s_units.append(cud)
                    # 'single' instance type: only one CU for all instances.
                    if sim_step.get_instance_type() == 'single':
                        break

                self.get_logger().debug("Created simulation CU: {0}.".format(cud.as_dict()))

                s_cus = resource._umgr.submit_units(s_units)
                all_cus.extend(s_cus)
                all_sim_cus.extend(s_cus)

                self.get_logger().info("Submitted tasks for simulation iteration {0}.".format(iteration))
                self.get_logger().info("Waiting for simulations in iteration {0}/ kernel {1}: {2} to complete.".format(iteration,kern_step+1,sim_step.name))
                resource._umgr.wait_units()
                self.get_logger().info("Simulations in iteration {0}/ kernel {1}: {2} completed.".format(iteration,kern_step+1,sim_step.name))

                # NOTE(review): failed_units is collected here but never
                # logged or acted upon -- failures pass silently; confirm
                # intended handling.
                failed_units = ""
                for unit in s_cus:
                    if unit.state != radical.pilot.DONE:
                        failed_units += " * Simulation task {0} failed with an error: {1}\n".format(unit.uid, unit.stderr)

            step_end_time_abs = datetime.datetime.now()

            # TODO: ensure working_dir <-> instance mapping
            i = 0
            for cu in s_cus:
                i += 1
                working_dirs['iteration_{0}'.format(iteration)]['simulation_{0}'.format(i)] = saga.Url(cu.working_directory).path

            # Process CU information and append it to the dictionary
            tinfo = extract_timing_info(all_sim_cus, pattern_start_time, step_start_time_abs, step_end_time_abs)
            for key, val in tinfo.iteritems():
                step_timings['timings'][key] = val

            # Write the whole thing to the profiling dict
            pattern._execution_profile.append(step_timings)

            ################################################################
            # EXECUTE ANALYSIS STEPS
            step_timings = {"name": "analysis_iteration_{0}".format(iteration), "timings": {}}
            step_start_time_abs = datetime.datetime.now()

            # As above, an analysis step may be a single kernel or a list.
            if isinstance(pattern.analysis_step(iteration=1, instance=1),list):
                num_ana_kerns = len(pattern.analysis_step(iteration=1, instance=1))
            else:
                num_ana_kerns = 1
            #print num_ana_kerns

            all_ana_cus = []
            for kern_step in range(0,num_ana_kerns):
                a_units = []
                for a_instance in range(1, pattern._analysis_instances+1):
                    if isinstance(pattern.analysis_step(iteration=iteration, instance=a_instance),list):
                        ana_step = pattern.analysis_step(iteration=iteration, instance=a_instance)[kern_step]
                    else:
                        ana_step = pattern.analysis_step(iteration=iteration, instance=a_instance)
                    ana_step._bind_to_resource(resource._resource_key)

                    # Resolve all placeholders
                    #if ana_step.link_input_data is not None:
                    #    for i in range(len(ana_step.link_input_data)):
                    #        ana_step.link_input_data[i] = resolve_placeholder_vars(working_dirs, a_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "analysis", ana_step.link_input_data[i])

                    cud = radical.pilot.ComputeUnitDescription()
                    cud.name = "ana ; {iteration}; {instance}".format(iteration=iteration, instance=a_instance)
                    cud.pre_exec = ana_step._cu_def_pre_exec
                    cud.executable = ana_step._cu_def_executable
                    cud.arguments = ana_step.arguments
                    cud.mpi = ana_step.uses_mpi
                    cud.input_staging = None
                    cud.output_staging = None

                    #------------------------------------------------------------------------------------------------------------------
                    # upload_input_data (same "src > dst" parsing as the
                    # simulation branch).
                    data_in = []
                    if ana_step._kernel._upload_input_data is not None:
                        if isinstance(ana_step._kernel._upload_input_data,list):
                            pass
                        else:
                            ana_step._kernel._upload_input_data = [ana_step._kernel._upload_input_data]
                        for i in range(0,len(ana_step._kernel._upload_input_data)):
                            var=resolve_placeholder_vars(working_dirs, a_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "analysis", ana_step._kernel._upload_input_data[i])
                            if len(var.split('>')) > 1:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': var.split('>')[1].strip()
                                }
                            else:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': os.path.basename(var.split('>')[0].strip())
                                }
                            data_in.append(temp)
                        if cud.input_staging is None:
                            cud.input_staging = data_in
                        else:
                            cud.input_staging += data_in
                    #------------------------------------------------------------------------------------------------------------------

                    #------------------------------------------------------------------------------------------------------------------
                    # link_input_data: staged with LINK action.
                    data_in = []
                    if ana_step._kernel._link_input_data is not None:
                        if isinstance(ana_step._kernel._link_input_data,list):
                            pass
                        else:
                            ana_step._kernel._link_input_data = [ana_step._kernel._link_input_data]
                        for i in range(0,len(ana_step._kernel._link_input_data)):
                            var=resolve_placeholder_vars(working_dirs, a_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "analysis", ana_step._kernel._link_input_data[i])
                            if len(var.split('>')) > 1:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': var.split('>')[1].strip(),
                                    'action': radical.pilot.LINK
                                }
                            else:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': os.path.basename(var.split('>')[0].strip()),
                                    'action': radical.pilot.LINK
                                }
                            data_in.append(temp)
                        if cud.input_staging is None:
                            cud.input_staging = data_in
                        else:
                            cud.input_staging += data_in
                    #------------------------------------------------------------------------------------------------------------------

                    #------------------------------------------------------------------------------------------------------------------
                    # copy_input_data: staged with COPY action.
                    data_in = []
                    if ana_step._kernel._copy_input_data is not None:
                        if isinstance(ana_step._kernel._copy_input_data,list):
                            pass
                        else:
                            ana_step._kernel._copy_input_data = [ana_step._kernel._copy_input_data]
                        for i in range(0,len(ana_step._kernel._copy_input_data)):
                            var=resolve_placeholder_vars(working_dirs, a_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "analysis", ana_step._kernel._copy_input_data[i])
                            if len(var.split('>')) > 1:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': var.split('>')[1].strip(),
                                    'action': radical.pilot.COPY
                                }
                            else:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': os.path.basename(var.split('>')[0].strip()),
                                    'action': radical.pilot.COPY
                                }
                            data_in.append(temp)
                        if cud.input_staging is None:
                            cud.input_staging = data_in
                        else:
                            cud.input_staging += data_in
                    #------------------------------------------------------------------------------------------------------------------

                    #------------------------------------------------------------------------------------------------------------------
                    # download input data (already in directive form).
                    if ana_step.download_input_data is not None:
                        data_in = ana_step.download_input_data
                        if cud.input_staging is None:
                            cud.input_staging = data_in
                        else:
                            cud.input_staging += data_in
                    #------------------------------------------------------------------------------------------------------------------

                    #------------------------------------------------------------------------------------------------------------------
                    # copy_output_data: staged out with COPY action.
                    data_out = []
                    if ana_step._kernel._copy_output_data is not None:
                        if isinstance(ana_step._kernel._copy_output_data,list):
                            pass
                        else:
                            ana_step._kernel._copy_output_data = [ana_step._kernel._copy_output_data]
                        for i in range(0,len(ana_step._kernel._copy_output_data)):
                            var=resolve_placeholder_vars(working_dirs, a_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "analysis", ana_step._kernel._copy_output_data[i])
                            if len(var.split('>')) > 1:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': var.split('>')[1].strip(),
                                    'action': radical.pilot.COPY
                                }
                            else:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': os.path.basename(var.split('>')[0].strip()),
                                    'action': radical.pilot.COPY
                                }
                            data_out.append(temp)
                        if cud.output_staging is None:
                            cud.output_staging = data_out
                        else:
                            cud.output_staging += data_out
                    #------------------------------------------------------------------------------------------------------------------

                    #------------------------------------------------------------------------------------------------------------------
                    # download_output_data: transferred back (no action key).
                    data_out = []
                    if ana_step._kernel._download_output_data is not None:
                        if isinstance(ana_step._kernel._download_output_data,list):
                            pass
                        else:
                            ana_step._kernel._download_output_data = [ana_step._kernel._download_output_data]
                        for i in range(0,len(ana_step._kernel._download_output_data)):
                            var=resolve_placeholder_vars(working_dirs, a_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "analysis", ana_step._kernel._download_output_data[i])
                            if len(var.split('>')) > 1:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': var.split('>')[1].strip()
                                }
                            else:
                                temp = {
                                    'source': var.split('>')[0].strip(),
                                    'target': os.path.basename(var.split('>')[0].strip())
                                }
                            data_out.append(temp)
                        if cud.output_staging is None:
                            cud.output_staging = data_out
                        else:
                            cud.output_staging += data_out
                    #------------------------------------------------------------------------------------------------------------------

                    if ana_step.cores is not None:
                        cud.cores = ana_step.cores

                    a_units.append(cud)
                    # NOTE(review): missing call parentheses -- this compares
                    # the bound method object to 'single' and is always
                    # False, so the 'single' break never fires here (compare
                    # with the simulation branch, which calls
                    # get_instance_type()).
                    if ana_step.get_instance_type == 'single':
                        break

                self.get_logger().debug("Created analysis CU: {0}.".format(cud.as_dict()))

                a_cus = resource._umgr.submit_units(a_units)
                all_cus.extend(a_cus)
                all_ana_cus.extend(a_cus)

                self.get_logger().info("Submitted tasks for analysis iteration {0}.".format(iteration))
                self.get_logger().info("Waiting for analysis tasks in iteration {0}/kernel {1}: {2} to complete.".format(iteration,kern_step+1,ana_step.name))
                resource._umgr.wait_units()
                self.get_logger().info("Analysis in iteration {0}/kernel {1}: {2} completed.".format(iteration,kern_step+1,ana_step.name))

                # NOTE(review): as in the simulation branch, failed_units is
                # built but never used.
                failed_units = ""
                for unit in a_cus:
                    if unit.state != radical.pilot.DONE:
                        failed_units += " * Analysis task {0} failed with an error: {1}\n".format(unit.uid, unit.stderr)

            step_end_time_abs = datetime.datetime.now()

            i = 0
            for cu in a_cus:
                i += 1
                working_dirs['iteration_{0}'.format(iteration)]['analysis_{0}'.format(i)] = saga.Url(cu.working_directory).path

            # Process CU information and append it to the dictionary
            tinfo = extract_timing_info(all_ana_cus, pattern_start_time, step_start_time_abs, step_end_time_abs)
            for key, val in tinfo.iteritems():
                step_timings['timings'][key] = val

            # Write the whole thing to the profiling dict
            pattern._execution_profile.append(step_timings)

    except KeyboardInterrupt:
        traceback.print_exc()
def execute_pattern(self, pattern, resource):
    """Execute a simulation-analysis loop pattern on the given resource.

    Runs an optional ``pre_loop`` step, then for each of
    ``pattern.iterations`` iterations submits one batch of simulation
    ComputeUnits followed by one batch of analysis ComputeUnits through
    ``resource._umgr``.  Working directories of finished units are recorded
    in ``working_dirs`` (consumed by ``resolve_placeholder_vars`` to link
    steps), and per-step timing dictionaries are appended to
    ``pattern._execution_profile``.

    :param pattern:  simulation-analysis loop pattern instance; must provide
                     ``pre_loop()`` (optional), ``simulation_step()`` and
                     ``analysis_step()``.
    :param resource: resource handle providing ``_umgr``, ``_cores`` and
                     ``_resource_key``.
    :raises EnsemblemdError: if the pre_loop ComputeUnit fails.
    :raises Exception: any fatal error during execution is logged and
                       re-raised.
    """
    pattern_start_time = datetime.datetime.now()

    #-----------------------------------------------------------------------
    # Callback: log every ComputeUnit that enters the FAILED state.
    def unit_state_cb(unit, state):
        if state == radical.pilot.FAILED:
            self.get_logger().error("ComputeUnit error: STDERR: {0}, STDOUT: {0}".format(unit.stderr, unit.stdout))
            self.get_logger().error("Pattern execution FAILED.")

    self.get_logger().info("Executing simulation-analysis loop with {0} iterations on {1} allocated core(s) on '{2}'".format(pattern.iterations, resource._cores, resource._resource_key))

    working_dirs = {}
    all_cus = []
    pattern._execution_profile = []

    try:
        resource._umgr.register_callback(unit_state_cb)

        ########################################################################
        # execute pre_loop
        #
        try:
            pre_loop = pattern.pre_loop()
            pre_loop._bind_to_resource(resource._resource_key)

            cu                = radical.pilot.ComputeUnitDescription()
            cu.name           = "pre_loop"
            cu.pre_exec       = pre_loop._cu_def_pre_exec
            cu.executable     = pre_loop._cu_def_executable
            cu.arguments      = pre_loop.arguments
            cu.mpi            = pre_loop.uses_mpi
            cu.input_staging  = pre_loop._cu_def_input_data
            cu.output_staging = pre_loop._cu_def_output_data

            self.get_logger().debug("Created pre_loop CU: {0}.".format(cu.as_dict()))

            unit = resource._umgr.submit_units(cu)
            all_cus.append(unit)

            self.get_logger().info("Submitted ComputeUnit(s) for pre_loop step.")
            self.get_logger().info("Waiting for ComputeUnit(s) in pre_loop step to complete.")
            resource._umgr.wait_units()
            self.get_logger().info("Pre_loop completed.")

            if unit.state != radical.pilot.DONE:
                raise EnsemblemdError("Pre-loop CU failed with error: {0}".format(unit.stdout))

            working_dirs["pre_loop"] = saga.Url(unit.working_directory).path

        # FIX: a failed pre_loop CU used to be swallowed by the broad
        # 'except Exception' below and mis-reported as "not defined".
        # Re-raise it explicitly so genuine pre-loop failures abort the run.
        except EnsemblemdError:
            raise
        except Exception:
            # pre_loop() is optional -- if the pattern does not implement it,
            # skipping it is the intended behavior.
            self.get_logger().info("pre_loop() not defined. Skipping.")

        ########################################################################
        # execute simulation analysis loop
        #
        for iteration in range(1, pattern.iterations+1):
            working_dirs['iteration_{0}'.format(iteration)] = {}

            ################################################################
            # EXECUTE SIMULATION STEPS
            step_timings = {
                "name": "simulation_iteration_{0}".format(iteration),
                "timings": {}
            }
            step_start_time_abs = datetime.datetime.now()

            s_units = []
            for s_instance in range(1, pattern._simulation_instances+1):
                sim_step = pattern.simulation_step(iteration=iteration, instance=s_instance)
                sim_step._bind_to_resource(resource._resource_key)

                # Resolve all placeholders
                if sim_step.link_input_data is not None:
                    for i in range(len(sim_step.link_input_data)):
                        sim_step.link_input_data[i] = resolve_placeholder_vars(working_dirs, s_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "simulation", sim_step.link_input_data[i])

                cud                = radical.pilot.ComputeUnitDescription()
                cud.name           = "sim ;{iteration} ;{instance}".format(iteration=iteration, instance=s_instance)
                cud.pre_exec       = sim_step._cu_def_pre_exec
                cud.executable     = sim_step._cu_def_executable
                cud.arguments      = sim_step.arguments
                cud.mpi            = sim_step.uses_mpi
                cud.input_staging  = sim_step._cu_def_input_data
                cud.output_staging = sim_step._cu_def_output_data

                # 'cores' is optional on a kernel step; absence is fine.
                # FIX: was a bare 'except:' which also hid unrelated errors.
                try:
                    cud.cores = sim_step.cores
                except AttributeError:
                    pass

                s_units.append(cud)
                self.get_logger().debug("Created simulation CU: {0}.".format(cud.as_dict()))

            s_cus = resource._umgr.submit_units(s_units)
            all_cus.extend(s_cus)

            self.get_logger().info("Submitted tasks for simulation iteration {0}.".format(iteration))
            self.get_logger().info("Waiting for simulations in iteration {0} to complete.".format(iteration))
            resource._umgr.wait_units()
            self.get_logger().info("Simulations in iteration {0} completed.".format(iteration))

            failed_units = ""
            for unit in s_cus:
                if unit.state != radical.pilot.DONE:
                    failed_units += " * Simulation task {0} failed with an error: {1}\n".format(unit.uid, unit.stderr)
            # FIX: the collected failure report was previously discarded.
            if failed_units:
                self.get_logger().error("Failed units in simulation iteration {0}:\n{1}".format(iteration, failed_units))

            # TODO: ensure working_dir <-> instance mapping
            i = 0
            for cu in s_cus:
                i += 1
                working_dirs['iteration_{0}'.format(iteration)]['simulation_{0}'.format(i)] = saga.Url(cu.working_directory).path

            step_end_time_abs = datetime.datetime.now()

            # Process CU information and append it to the dictionary
            tinfo = extract_timing_info(s_cus, pattern_start_time, step_start_time_abs, step_end_time_abs)

            for key, val in tinfo.iteritems():
                step_timings['timings'][key] = val

            # Write the whole thing to the profiling dict
            pattern._execution_profile.append(step_timings)

            ################################################################
            # EXECUTE ANALYSIS STEPS
            step_timings = {
                "name": "analysis_iteration_{0}".format(iteration),
                "timings": {}
            }
            step_start_time_abs = datetime.datetime.now()

            a_units = []
            analysis_list = None
            for a_instance in range(1, pattern._analysis_instances+1):
                analysis_list = pattern.analysis_step(iteration=iteration, instance=a_instance)
                if not isinstance(analysis_list, list):
                    analysis_list = [analysis_list]

                if len(analysis_list) > 1:
                    # Multi-kernel analysis: kernels run sequentially; each
                    # later kernel copies the previous kernel's files in via
                    # its pre_exec (see 'cp -n' below).
                    kernel_wd = ""
                    cur_kernel = 1
                    for ana_step in analysis_list:
                        a_units = []
                        ana_step._bind_to_resource(resource._resource_key)

                        # Resolve all placeholders
                        if ana_step.link_input_data is not None:
                            for i in range(len(ana_step.link_input_data)):
                                ana_step.link_input_data[i] = resolve_placeholder_vars(working_dirs, a_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "analysis", ana_step.link_input_data[i])

                        cud          = radical.pilot.ComputeUnitDescription()
                        cud.name     = "ana ; {iteration}; {instance}".format(iteration=iteration, instance=a_instance)
                        cud.pre_exec = ana_step._cu_def_pre_exec
                        if cur_kernel > 1:
                            # Pull the previous kernel's outputs into this CU's
                            # sandbox ('-n': do not overwrite existing files).
                            cud.pre_exec.append('cp -n %s/*.* .'%kernel_wd)
                        cud.executable     = ana_step._cu_def_executable
                        cud.arguments      = ana_step.arguments
                        cud.mpi            = ana_step.uses_mpi
                        cud.input_staging  = ana_step._cu_def_input_data
                        cud.output_staging = ana_step._cu_def_output_data

                        # FIX: was a bare 'except:'; 'cores' is optional.
                        try:
                            cud.cores = ana_step.cores
                        except AttributeError:
                            pass

                        a_units.append(cud)
                        self.get_logger().debug("Created analysis CU: {0}.".format(cud.as_dict()))

                        a_cus = resource._umgr.submit_units(a_units)
                        all_cus.extend(a_cus)

                        self.get_logger().info("Submitted tasks for analysis iteration {0}/ kernel {1}.".format(iteration,cur_kernel))
                        self.get_logger().info("Waiting for analysis tasks in iteration {0}/kernel {1} to complete.".format(iteration,cur_kernel))
                        resource._umgr.wait_units()
                        self.get_logger().info("Analysis in iteration {0}/kernel {1}:{2} completed.".format(iteration,cur_kernel,ana_step.name))

                        failed_units = ""
                        for unit in a_cus:
                            if unit.state != radical.pilot.DONE:
                                failed_units += " * Analysis task {0} failed with an error: {1}\n".format(unit.uid, unit.stderr)
                            else:
                                kernel_wd = saga.Url(unit.working_directory).path
                        # FIX: the collected failure report was previously discarded.
                        if failed_units:
                            self.get_logger().error("Failed units in analysis iteration {0}/kernel {1}:\n{2}".format(iteration, cur_kernel, failed_units))

                        cur_kernel += 1

                    # Record the last kernel's working directory for this
                    # iteration (key is fixed to 'analysis_1' by design).
                    working_dirs['iteration_{0}'.format(iteration)]['analysis_1'] = saga.Url(unit.working_directory).path

                else:
                    # Single-kernel analysis: build the CU here, submit the
                    # whole batch after the instance loop.
                    analysis_step = analysis_list[0]
                    analysis_step._bind_to_resource(resource._resource_key)

                    # Resolve all placeholders
                    if analysis_step.link_input_data is not None:
                        for i in range(len(analysis_step.link_input_data)):
                            analysis_step.link_input_data[i] = resolve_placeholder_vars(working_dirs, a_instance, iteration, pattern._simulation_instances, pattern._analysis_instances, "analysis", analysis_step.link_input_data[i])

                    cud                = radical.pilot.ComputeUnitDescription()
                    cud.name           = "ana; {iteration};{instance}".format(iteration=iteration, instance=a_instance)
                    cud.pre_exec       = analysis_step._cu_def_pre_exec
                    cud.executable     = analysis_step._cu_def_executable
                    cud.arguments      = analysis_step.arguments
                    cud.mpi            = analysis_step.uses_mpi
                    cud.input_staging  = analysis_step._cu_def_input_data
                    cud.output_staging = analysis_step._cu_def_output_data

                    a_units.append(cud)
                    self.get_logger().debug("Created analysis CU: {0}.".format(cud.as_dict()))

            if len(analysis_list) == 1:
                a_cus = resource._umgr.submit_units(a_units)
                all_cus.extend(a_cus)

                self.get_logger().info("Submitted tasks for analysis iteration {0}.".format(iteration))
                self.get_logger().info("Waiting for analysis tasks in iteration {0} to complete.".format(iteration))
                resource._umgr.wait_units()
                self.get_logger().info("Analysis in iteration {0} completed.".format(iteration))

                failed_units = ""
                for unit in a_cus:
                    if unit.state != radical.pilot.DONE:
                        failed_units += " * Analysis task {0} failed with an error: {1}\n".format(unit.uid, unit.stderr)
                # FIX: the collected failure report was previously discarded.
                if failed_units:
                    self.get_logger().error("Failed units in analysis iteration {0}:\n{1}".format(iteration, failed_units))

                # TODO: ensure working_dir <-> instance mapping
                i = 0
                for cu in a_cus:
                    i += 1
                    working_dirs['iteration_{0}'.format(iteration)]['analysis_{0}'.format(i)] = saga.Url(cu.working_directory).path

            step_end_time_abs = datetime.datetime.now()

            # Process CU information and append it to the dictionary
            tinfo = extract_timing_info(a_cus, pattern_start_time, step_start_time_abs, step_end_time_abs)

            for key, val in tinfo.iteritems():
                step_timings['timings'][key] = val

            # Write the whole thing to the profiling dict
            pattern._execution_profile.append(step_timings)

    except Exception as ex:
        self.get_logger().error("Fatal error during execution: {0}.".format(str(ex)))
        raise
def execute_pattern(self, pattern, resource):
    """Execute a ReplicaExchange pattern on the given resource.

    Stages the pattern's shared input files into the pilot staging area,
    then for ``pattern.nr_cycles`` cycles: submits one MD ComputeUnit per
    replica, waits, and (on all but the last cycle) submits one exchange
    ComputeUnit per replica, composes the swap matrix from the exchange
    units' stdout and performs parameter swaps between replicas.  When the
    RADICAL_ENMD_PROFILING env var is '1', per-step timings are appended to
    ``pattern._execution_profile``.
    """
    try:
        try:
            cycles = pattern.nr_cycles+1
        except:
            self.get_logger().exception("Number of cycles (nr_cycles) must be defined for pattern ReplicaExchange!")
            raise

        # Profiling is opt-in via environment variable.
        do_profile = os.getenv('RADICAL_ENMD_PROFILING', '0')

        if do_profile == '1':
            pattern._execution_profile = []
            all_cus = []

        # shared data: transfer shared inputs to the pilot staging area once,
        # then COPY them into each CU sandbox via sd_shared_list.
        pattern.prepare_shared_data()

        shared_input_file_urls = pattern.shared_urls
        shared_input_files = pattern.shared_files

        sd_shared_list = []

        for i in range(len(shared_input_files)):

            sd_pilot = {'source': shared_input_file_urls[i],
                        'target': 'staging:///%s' % shared_input_files[i],
                        'action': radical.pilot.TRANSFER
            }

            resource._pilot.stage_in(sd_pilot)

            sd_shared = {'source': 'staging:///%s' % shared_input_files[i],
                         'target': shared_input_files[i],
                         'action': radical.pilot.COPY
            }
            sd_shared_list.append(sd_shared)

        # Pilot must be active before units are submitted.
        resource._pmgr.wait_pilots(resource._pilot.uid,'Active')

        if do_profile == '1':
            pattern_start_time = datetime.datetime.now()

        replicas = pattern.get_replicas()

        for c in range(1, cycles):

            if do_profile == '1':
                step_timings = {
                    "name": "md_run_{0}".format(c),
                    "timings": {}
                }
                step_start_time_abs = datetime.datetime.now()

            ############################################################
            # MD step: one CU per replica, submitted individually.
            md_units = []
            for r in replicas:
                self.get_logger().info("Cycle %d: Building input files for replica %d" % ((c), r.id) )
                pattern.build_input_file(r)
                self.get_logger().info("Cycle %d: Preparing replica %d for MD run" % ((c), r.id) )
                r_kernel = pattern.prepare_replica_for_md(r)

                # md.amber kernels get the pattern name passed through to
                # binding -- presumably for kernel-specific configuration
                # (TODO confirm against _bind_to_resource).
                if ((r_kernel._kernel.get_name()) == "md.amber"):
                    r_kernel._bind_to_resource(resource._resource_key, pattern.name)
                else:
                    r_kernel._bind_to_resource(resource._resource_key)

                # processing data directives
                # need means to distinguish between copy and link
                copy_out = []
                items_out = r_kernel._kernel._copy_output_data
                # copy_output_data is not mandatory
                if items_out:
                    for item in items_out:
                        i_out = {
                            'source': item,
                            'target': 'staging:///%s' % item,
                            'action': radical.pilot.COPY
                        }
                        copy_out.append(i_out)

                cu = radical.pilot.ComputeUnitDescription()
                cu.name = "md ;{cycle} ;{replica}".format(cycle=c, replica=r.id)
                cu.pre_exec = r_kernel._cu_def_pre_exec
                cu.executable = r_kernel._cu_def_executable
                cu.arguments = r_kernel.arguments
                cu.mpi = r_kernel.uses_mpi
                cu.cores = r_kernel.cores
                # Shared inputs are prepended to the kernel's own staging.
                cu.input_staging = sd_shared_list + r_kernel._cu_def_input_data
                cu.output_staging = copy_out + r_kernel._cu_def_output_data

                sub_replica = resource._umgr.submit_units(cu)
                md_units.append(sub_replica)

            if do_profile == '1':
                all_cus.extend(md_units)

            self.get_logger().info("Cycle %d: Performing MD step for replicas" % (c) )
            resource._umgr.wait_units()

            if do_profile == '1':
                step_end_time_abs = datetime.datetime.now()

            failed_units = ""
            for unit in md_units:
                if unit.state != radical.pilot.DONE:
                    failed_units += " * MD step: Unit {0} failed with an error: {1}\n".format(unit.uid, unit.stderr)

            # NOTE(review): exits without printing the collected
            # failed_units report -- consider logging it before exit.
            if len(failed_units) > 0:
                sys.exit()

            if do_profile == '1':
                # Process CU information and append it to the dictionary
                if isinstance(pattern_start_time, datetime.datetime):
                    if isinstance(step_start_time_abs, datetime.datetime):
                        if isinstance(step_end_time_abs, datetime.datetime):
                            tinfo = extract_timing_info(md_units, pattern_start_time, step_start_time_abs, step_end_time_abs)
                        else:
                            sys.exit("Ensemble MD Toolkit Error: step_end_time_abs for {0} is not datetime.datetime instance.".format(step_timings["name"]))
                    else:
                        sys.exit("Ensemble MD Toolkit Error: step_start_time_abs for {0} is not datetime.datetime instance.".format(step_timings["name"]))
                else:
                    sys.exit("Ensemble MD Toolkit Error: pattern_start_time is not datetime.datetime instance.")

                for key, val in tinfo.iteritems():
                    step_timings['timings'][key] = val

                # Write the whole thing to the profiling dict
                pattern._execution_profile.append(step_timings)

            #-----------------------------------------------------------
            # Exchange step: skipped after the final MD cycle.
            if (c < cycles):
                if do_profile == '1':
                    step_timings = {
                        "name": "ex_run_{0}".format(c),
                        "timings": {}
                    }
                    step_start_time_abs = datetime.datetime.now()

                ex_units = []
                for r in replicas:
                    self.get_logger().info("Cycle %d: Preparing replica %d for Exchange run" % ((c), r.id) )
                    ex_kernel = pattern.prepare_replica_for_exchange(r)
                    ex_kernel._bind_to_resource(resource._resource_key)

                    cu = radical.pilot.ComputeUnitDescription()
                    cu.name = "ex ;{cycle} ;{replica}".format(cycle=c, replica=r.id)
                    cu.pre_exec = ex_kernel._cu_def_pre_exec
                    cu.executable = ex_kernel._cu_def_executable
                    cu.arguments = ex_kernel.arguments
                    cu.mpi = ex_kernel.uses_mpi
                    cu.cores = ex_kernel.cores
                    cu.input_staging = ex_kernel._cu_def_input_data
                    cu.output_staging = ex_kernel._cu_def_output_data

                    sub_replica = resource._umgr.submit_units(cu)
                    ex_units.append(sub_replica)

                self.get_logger().info("Cycle %d: Performing Exchange step for replicas" % (c) )
                resource._umgr.wait_units()

                if do_profile == '1':
                    step_end_time_abs = datetime.datetime.now()
                    all_cus.extend(ex_units)

                failed_units = ""
                for unit in ex_units:
                    if unit.state != radical.pilot.DONE:
                        failed_units += " * EX step: Unit {0} failed with an error: {1}\n".format(unit.uid, unit.stderr)

                # NOTE(review): same silent exit as the MD step above.
                if len(failed_units) > 0:
                    sys.exit()

                if do_profile == '1':
                    # Process CU information and append it to the dictionary
                    if isinstance(pattern_start_time, datetime.datetime):
                        if isinstance(step_start_time_abs, datetime.datetime):
                            if isinstance(step_end_time_abs, datetime.datetime):
                                tinfo = extract_timing_info(ex_units, pattern_start_time, step_start_time_abs, step_end_time_abs)
                            else:
                                sys.exit("Ensemble MD Toolkit Error: step_end_time_abs for {0} is not datetime.datetime instance.".format(step_timings["name"]))
                        else:
                            sys.exit("Ensemble MD Toolkit Error: step_start_time_abs for {0} is not datetime.datetime instance.".format(step_timings["name"]))
                    else:
                        sys.exit("Ensemble MD Toolkit Error: pattern_start_time is not datetime.datetime instance.")

                    for key, val in tinfo.iteritems():
                        step_timings['timings'][key] = val

                    # Write the whole thing to the profiling dict
                    pattern._execution_profile.append(step_timings)

                    step_timings = {
                        "name": "post_processing_{0}".format(c),
                        "timings": {}
                    }
                    step_start_time_abs = datetime.datetime.now()

                #---------------------------------------------------------------
                # Each exchange unit prints its swap-matrix column on stdout;
                # collect the whitespace-split tokens per replica.
                matrix_columns = []
                for r in ex_units:
                    d = str(r.stdout)
                    data = d.split()
                    matrix_columns.append(data)

                # writing swap matrix out (one file per cycle; best-effort)
                sw_file = "matrix_columns_" + str(c)
                try:
                    w_file = open( sw_file, "w")
                    for i in matrix_columns:
                        for j in i:
                            w_file.write("%s " % j)
                        w_file.write("\n")
                    w_file.close()
                except IOError:
                    self.get_logger().info('Warning: unable to access file %s' % sw_file)

                # computing swap matrix
                self.get_logger().info("Cycle %d: Composing swap matrix" % (c) )
                swap_matrix = pattern.get_swap_matrix(replicas, matrix_columns)

                # this is actual exchange
                for r_i in replicas:
                    r_j = pattern.exchange(r_i, replicas, swap_matrix)
                    if (r_j != r_i):
                        self.get_logger().info("Performing exchange of parameters between replica %d and replica %d" % ( r_j.id, r_i.id ))
                        # swap parameters
                        pattern.perform_swap(r_i, r_j)

                if do_profile == '1':
                    step_end_time_abs = datetime.datetime.now()

                    # processing timings
                    step_start_time_rel = step_start_time_abs - pattern_start_time
                    step_end_time_rel = step_end_time_abs - pattern_start_time

                    tinfo = {
                        "step_start_time": {
                            "abs": step_start_time_abs,
                            "rel": step_start_time_rel
                        },
                        "step_end_time": {
                            "abs": step_end_time_abs,
                            "rel": step_end_time_rel
                        }
                    }
                    for key, val in tinfo.iteritems():
                        step_timings['timings'][key] = val

                    # Write the whole thing to the profiling dict
                    pattern._execution_profile.append(step_timings)

        #------------------------------------------------------------------
        # End of simulation loop
        #------------------------

    except Exception, ex:
        self.get_logger().exception("Fatal error during execution: {0}.".format(str(ex)))
        raise
def execute_pattern(self, pattern, resource):
    """Execute a ReplicaExchange pattern with a global exchange calculator.

    Unlike the per-replica exchange variant, this plugin bulk-submits all
    MD ComputeUnits per cycle and runs a single global exchange-calculation
    ComputeUnit (``prepare_global_ex_calc``) per cycle, then delegates the
    actual parameter swap to ``pattern.do_exchange``.  Profiling is opt-in
    via the RADICAL_ENMD_PROFILING environment variable.
    """
    try:
        try:
            cycles = pattern.nr_cycles+1
        except:
            self.get_logger().exception("Number of cycles (nr_cycles) \
must be defined for pattern ReplicaExchange!")
            raise

        do_profile = os.getenv('RADICAL_ENMD_PROFILING', '0')

        if do_profile == '1':
            pattern._execution_profile = []
            all_cus = []

        # shared data: transfer shared inputs to pilot staging once, then
        # COPY them into CU sandboxes via sd_shared_list.
        pattern.prepare_shared_data()

        shared_input_file_urls = pattern.shared_urls
        shared_input_files = pattern.shared_files

        sd_shared_list = []

        for i in range(len(shared_input_files)):

            sd_pilot = {'source': shared_input_file_urls[i],
                        'target': 'staging:///%s' % shared_input_files[i],
                        'action': radical.pilot.TRANSFER
            }

            resource._pilot.stage_in(sd_pilot)

            sd_shared = {'source': 'staging:///%s' % shared_input_files[i],
                         'target': shared_input_files[i],
                         'action': radical.pilot.COPY
            }
            sd_shared_list.append(sd_shared)

        # Pilot must be active before units are submitted.
        resource._pmgr.wait_pilots(resource._pilot.uid,'Active')

        if do_profile == '1':
            pattern_start_time = datetime.datetime.now()

        replicas = pattern.get_replicas()

        #-------------------------------------------------------------------
        # GL = 0: submit global calculator before
        # GL = 1: submit global calculator after
        GL = 0

        for c in range(1, cycles):

            if do_profile == '1':
                step_timings = {
                    "name": "md_run_{0}".format(c),
                    "timings": {}
                }
                step_start_time_abs = datetime.datetime.now()

            ############################################################
            # MD step: build one CU per replica, then bulk-submit.
            md_units = []
            cus = []
            for r in replicas:
                self.get_logger().info("Cycle %d: Preparing replica %d for MD-step" % ((c), r.id) )
                r_kernel = pattern.prepare_replica_for_md(r)

                # md.amber kernels get the pattern name passed through to
                # binding -- presumably kernel-specific (TODO confirm).
                if ((r_kernel._kernel.get_name()) == "md.amber"):
                    r_kernel._bind_to_resource(resource._resource_key, pattern.name)
                else:
                    r_kernel._bind_to_resource(resource._resource_key)

                # processing data directives
                # need means to distinguish between copy and link
                #-----------------------------------------------------------
                # Kernel outputs copied back into the staging area ...
                copy_out = []
                items_out = r_kernel._kernel._copy_output_data
                if items_out:
                    for item in items_out:
                        i_out = {
                            'source': item,
                            'target': 'staging:///%s' % item,
                            'action': radical.pilot.COPY
                        }
                        copy_out.append(i_out)

                #-----------------------------------------------------------
                # ... and staged inputs copied from it into the sandbox.
                copy_in = []
                items_in = r_kernel._kernel._copy_input_data
                if items_in:
                    for item in items_in:
                        i_in = {
                            'source': 'staging:///%s' % item,
                            'target': item,
                            'action': radical.pilot.COPY
                        }
                        copy_in.append(i_in)

                #-----------------------------------------------------------
                cu = radical.pilot.ComputeUnitDescription()
                cu.name = "md ;{cycle} ;{replica}"\
                          .format(cycle=c, replica=r.id)
                cu.pre_exec = r_kernel._cu_def_pre_exec
                cu.executable = r_kernel._cu_def_executable
                cu.post_exec = r_kernel._cu_def_post_exec
                cu.arguments = r_kernel.arguments
                cu.mpi = r_kernel.uses_mpi
                cu.cores = r_kernel.cores

                #-----------------------------------------------------------
                in_list = []
                if r_kernel._cu_def_input_data:
                    in_list = in_list + r_kernel._cu_def_input_data
                if copy_in:
                    in_list = in_list + copy_in
                cu.input_staging = in_list

                #-----------------------------------------------------------
                out_list = []
                if r_kernel._cu_def_output_data:
                    out_list = out_list + r_kernel._cu_def_output_data
                if copy_out:
                    out_list = out_list + copy_out
                cu.output_staging = out_list

                #-----------------------------------------------------------
                cus.append(cu)

            # bulk submission
            sub_replicas = resource._umgr.submit_units(cus)
            for r in sub_replicas:
                md_units.append(r)

            if do_profile == '1':
                all_cus.extend(md_units)

            self.get_logger().info("Cycle %d: Performing MD-step for replicas" % (c) )
            resource._umgr.wait_units()

            if do_profile == '1':
                step_end_time_abs = datetime.datetime.now()

            failed_units = ""
            for unit in md_units:
                if unit.state != radical.pilot.DONE:
                    failed_units += " * MD step: Unit {0} failed with an error: {1}\n".format(unit.uid, unit.stderr)

            # NOTE(review): exits without printing the collected
            # failed_units report -- consider logging it before exit.
            if len(failed_units) > 0:
                sys.exit()

            if do_profile == '1':
                # Process CU information and append it to the dictionary
                if isinstance(pattern_start_time, datetime.datetime):
                    if isinstance(step_start_time_abs, datetime.datetime):
                        if isinstance(step_end_time_abs, datetime.datetime):
                            tinfo = extract_timing_info(md_units, pattern_start_time, step_start_time_abs, step_end_time_abs)
                        else:
                            sys.exit("Ensemble MD Toolkit Error: step_end_time_abs for {0} is not datetime.datetime instance.".format(step_timings["name"]))
                    else:
                        sys.exit("Ensemble MD Toolkit Error: step_start_time_abs for {0} is not datetime.datetime instance.".format(step_timings["name"]))
                else:
                    sys.exit("Ensemble MD Toolkit Error: pattern_start_time is not datetime.datetime instance.")

                for key, val in tinfo.iteritems():
                    step_timings['timings'][key] = val

                # Write the whole thing to the profiling dict
                pattern._execution_profile.append(step_timings)

            #---------------------------------------------------------------
            # NOTE(review): with c in range(1, cycles) this guard is always
            # true (compare the sibling plugin's 'c < cycles') -- the
            # exchange step therefore runs every cycle. Confirm intent.
            if (c <= cycles):
                if do_profile == '1':
                    step_timings = {
                        "name": "ex_run_{0}".format(c),
                        "timings": {}
                    }
                    step_start_time_abs = datetime.datetime.now()

                #-----------------------------------------------------------
                # global calc: one CU computes the exchange for all replicas.
                #-----------------------------------------------------------
                ex_units = []
                self.get_logger().info("Cycle %d: Preparing replicas for Exchange-Step" % (c) )
                gl_ex_kernel = pattern.prepare_global_ex_calc(GL, c, \
                                                              replicas)
                gl_ex_kernel._bind_to_resource(resource._resource_key)

                cu = radical.pilot.ComputeUnitDescription()

                #-----------------------------------------------------------
                copy_out = []
                items_out = gl_ex_kernel._kernel._copy_output_data
                if items_out:
                    for item in items_out:
                        i_out = {
                            'source': item,
                            'target': 'staging:///%s' % item,
                            'action': radical.pilot.COPY
                        }
                        copy_out.append(i_out)

                #-----------------------------------------------------------
                copy_in = []
                items_in = gl_ex_kernel._kernel._copy_input_data
                if items_in:
                    for item in items_in:
                        i_in = {
                            'source': 'staging:///%s' % item,
                            'target': item,
                            'action': radical.pilot.COPY
                        }
                        copy_in.append(i_in)

                #-----------------------------------------------------------
                in_list = []
                if gl_ex_kernel._cu_def_input_data:
                    in_list = in_list + gl_ex_kernel._cu_def_input_data
                if copy_in:
                    in_list = in_list + copy_in
                cu.input_staging = in_list

                #-----------------------------------------------------------
                out_list = []
                if gl_ex_kernel._cu_def_output_data:
                    out_list = out_list + gl_ex_kernel._cu_def_output_data
                if copy_out:
                    out_list = out_list + copy_out
                cu.output_staging = out_list

                #-----------------------------------------------------------
                cu.pre_exec = gl_ex_kernel._cu_def_pre_exec
                cu.executable = gl_ex_kernel._cu_def_executable
                cu.post_exec = gl_ex_kernel._cu_def_post_exec
                cu.arguments = gl_ex_kernel.arguments
                cu.mpi = gl_ex_kernel.uses_mpi
                cu.cores = gl_ex_kernel.cores

                sub_replica = resource._umgr.submit_units(cu)
                resource._umgr.wait_units()
                ex_units.append(sub_replica)

                if do_profile == '1':
                    step_end_time_abs = datetime.datetime.now()
                    all_cus.extend(ex_units)

                failed_units = ""
                for unit in ex_units:
                    if unit.state != radical.pilot.DONE:
                        failed_units += " * EX step: Unit {0} failed with an error: {1}\n".format(unit.uid, unit.stderr)

                # NOTE(review): same silent exit as the MD step above.
                if len(failed_units) > 0:
                    sys.exit()

                if do_profile == '1':
                    # Process CU information and append it to the dictionary
                    if isinstance(pattern_start_time, datetime.datetime):
                        if isinstance(step_start_time_abs, datetime.datetime):
                            if isinstance(step_end_time_abs, datetime.datetime):
                                tinfo = extract_timing_info(ex_units, pattern_start_time, step_start_time_abs, step_end_time_abs)
                            else:
                                sys.exit("Ensemble MD Toolkit Error: step_end_time_abs for {0} is not datetime.datetime instance.".format(step_timings["name"]))
                        else:
                            sys.exit("Ensemble MD Toolkit Error: step_start_time_abs for {0} is not datetime.datetime instance.".format(step_timings["name"]))
                    else:
                        sys.exit("Ensemble MD Toolkit Error: pattern_start_time is not datetime.datetime instance.")

                    for key, val in tinfo.iteritems():
                        step_timings['timings'][key] = val

                    # Write the whole thing to the profiling dict
                    pattern._execution_profile.append(step_timings)

                    step_timings = {
                        "name": "post_processing_{0}".format(c),
                        "timings": {}
                    }
                    step_start_time_abs = datetime.datetime.now()

                #-----------------------------------------------------------
                # The actual parameter swap is delegated to the pattern.
                pattern.do_exchange(c, replicas)

                if do_profile == '1':
                    step_end_time_abs = datetime.datetime.now()

                    # processing timings
                    step_start_time_rel = step_start_time_abs - pattern_start_time
                    step_end_time_rel = step_end_time_abs - pattern_start_time

                    tinfo = {
                        "step_start_time": {
                            "abs": step_start_time_abs,
                            "rel": step_start_time_rel
                        },
                        "step_end_time": {
                            "abs": step_end_time_abs,
                            "rel": step_end_time_rel
                        }
                    }
                    for key, val in tinfo.iteritems():
                        step_timings['timings'][key] = val

                    # Write the whole thing to the profiling dict
                    pattern._execution_profile.append(step_timings)

        #-----------------------------------------------------------
        # End of simulation loop
        #-------------------------------------------------------------------

        self.get_logger().info("Replica Exchange simulation finished successfully!")

    except KeyboardInterrupt:
        traceback.print_exc()