def test_create_task_from_cu():
    """
    **Purpose**: Test if the 'create_task_from_cu' function generates a Task
    with the correct uid, name, parent_stage and parent_pipeline from a RP
    ComputeUnit.

    The ComputeUnit is built from a description whose ``name`` is a
    comma-separated list of the six expected values; the assertions check
    that each of them ends up in the matching Task field.
    """

    session = rp.Session(dburl=MLAB)

    try:
        umgr = rp.UnitManager(session=session)

        cud = rp.ComputeUnitDescription()
        cud.name = 'uid, name, parent_stage_uid, parent_stage_name, parent_pipeline_uid, parent_pipeline_name'
        cud.executable = '/bin/echo'

        cu = rp.ComputeUnit(umgr, cud)
        t = create_task_from_cu(cu)

        assert t.uid == 'uid'
        assert t.name == 'name'
        assert t.parent_stage['uid'] == 'parent_stage_uid'
        assert t.parent_stage['name'] == 'parent_stage_name'
        assert t.parent_pipeline['uid'] == 'parent_pipeline_uid'
        assert t.parent_pipeline['name'] == 'parent_pipeline_name'

    finally:
        # Release the session even when an assertion fails; the session was
        # previously never closed by this test.
        session.close()
def enter(self, project=None):
    """
    Bind this object to a project, create its unit manager and submit its
    pilot.

    :param project: optional project to attach to; when given it replaces
        ``self.project``, otherwise the already stored project is used.
    """
    if project is not None:
        self.project = project
    current = self.project

    # the user may not have opened a session yet -- do it on their behalf
    if current.session is None:
        current.open_rp()

    umgr = rp.UnitManager(session=current.session)
    self.unit_manager = umgr

    # make the project aware of us so that it can clean up later
    self.project.schedulers.add(self)

    pilot = self.project.pilot_manager.submit_pilots(self.desc)
    self.pilot = pilot
    umgr.add_pilots(pilot)
    umgr.register_callback(self.unit_callback)

    # folder name is derived from the session uid and the pilot uid
    session_uid = current.session._uid
    pilot_uid = pilot._uid
    self._folder_name = '%s-%s' % (session_uid, pilot_uid)

    self.stage_generators()
pdesc2.project = '' pilot_list.append(pdesc2) # Continue adding pilot by creating a new descrption and appending it to # the list. # Submit the pilot list to the Pilot Manager. Actually all the pilots are # submitted to the Pilot Manager at once. print "Submitting Compute Pilots to Pilot Manager ..." pilots = pmgr.submit_pilots(pilot_list) # Combine the ComputePilot, the ComputeUnits and a scheduler via # a UnitManager object. The scheduler that supports multi-pilot sessions # is Round Robin. Direct Submittion does not. print "Initializing Unit Manager ..." umgr = rp.UnitManager(session=session, scheduler=rp.SCHEDULER_ROUND_ROBIN) # Register our callback with the UnitManager. This callback will get # called every time any of the units managed by the UnitManager # change their state. umgr.register_callback(unit_state_cb) # Add the created ComputePilot to the UnitManager. print "Registering Compute Pilots with Unit Manager ..." umgr.add_pilots(pilots) NUMBER_JOBS = 64 # the total number of cus to run # submit CUs to pilot job cudesc_list = [] for i in range(NUMBER_JOBS):
def test_bw_tagging():
    """
    Run two batches of ``/bin/hostname`` units on an ``ncsa.bw_aprun`` pilot
    and check that each tagged unit wrote the same hostname as the unit it
    is tagged with.

    Every unit of the second batch sets ``cud.tag`` to the uid of the
    first-batch unit with the same index.  Both batches stage their hostname
    file back to the client, and the first lines of the paired files are
    compared.

    The session is always closed and the staged files are always removed,
    also when a unit submission or an assertion fails.
    """

    n = 5  # number of units to run per batch

    def hostname_file(stage, idx):
        # name of the file a unit writes and stages back to the client
        return '%s_t%s_hostname.txt' % (stage, idx)

    def describe(stage, idx):
        # create a new CU description, and fill it.
        # Here we don't use dict initialization.
        fname = hostname_file(stage, idx)
        cud = rp.ComputeUnitDescription()
        cud.executable = '/bin/hostname'
        cud.arguments = ['>', fname]
        cud.cpu_processes = 1
        cud.cpu_threads = 16
        cud.output_staging = {'source': 'unit:///%s' % fname,
                              'target': 'client:///%s' % fname,
                              'action': rp.TRANSFER}
        return cud

    def first_line(path):
        # read via a context manager so the file handle is closed again
        with open(path, 'r') as fin:
            return fin.readline().strip()

    # we use a reporter class for nicer output
    report = ru.Reporter(name='radical.pilot')
    report.title('Getting Started (RP version %s)' % rp.version)

    # Create a new session. No need to try/except this: if session creation
    # fails, there is not much we can do anyways...
    session = rp.Session()

    try:
        # Add a Pilot Manager. Pilot managers manage one or more pilots.
        pmgr = rp.PilotManager(session=session)

        # Here we use a dict to initialize the description object
        pd_init = {'resource': 'ncsa.bw_aprun',
                   'runtime': 10,  # pilot runtime (min)
                   'exit_on_error': True,
                   'project': 'gk4',
                   'queue': 'high',
                   'access_schema': 'gsissh',
                   'cores': 128
                   }
        pdesc = rp.ComputePilotDescription(pd_init)

        # Launch the pilot.
        pilot = pmgr.submit_pilots(pdesc)

        report.header('submit units')

        # Register the ComputePilot in a UnitManager object.
        umgr = rp.UnitManager(session=session)
        umgr.add_pilots(pilot)

        # First batch: untagged units, each runs '/bin/hostname'.
        report.info('create %d unit description(s)\n\t' % n)
        cuds = list()
        for i in range(n):
            cuds.append(describe('s1', i))
            report.progress()
        report.ok('>>ok\n')

        # Submit the unit descriptions to the UnitManager and wait for all
        # units to reach a final state (DONE, CANCELED or FAILED).
        cus = umgr.submit_units(cuds)
        report.header('gather results')
        umgr.wait_units()

        # Second batch: unit i is tagged with the uid of first-batch unit i.
        report.info('create %d unit description(s)\n\t' % n)
        cuds = list()
        for i in range(n):
            cud = describe('s2', i)
            cud.tag = cus[i].uid
            cuds.append(cud)
            report.progress()
        report.ok('>>ok\n')

        umgr.submit_units(cuds)
        report.header('gather results')
        umgr.wait_units()

        for i in range(n):
            assert first_line(hostname_file('s1', i)) == \
                   first_line(hostname_file('s2', i))

    finally:
        report.header('finalize')
        try:
            session.close(download=True)
        finally:
            report.header()
            # Remove only the files this test staged; a blanket '*.txt'
            # glob would also delete unrelated files in the working dir.
            for stage in ('s1', 's2'):
                for i in range(n):
                    path = hostname_file(stage, i)
                    if os.path.exists(path):
                        os.remove(path)
pdesc = rp.ComputePilotDescription() pdesc.resource = "xsede.stampede" pdesc.runtime = 40 # minutes pdesc.cores = 32 pdesc.project = "TG-MCB090174" pilot_2 = pmgr.submit_pilots(pdesc) # reuse the pilot description for the third pilot pdesc.cores = 128 pilot_3 = pmgr.submit_pilots(pdesc) # Combine the ComputePilot, the ComputeUnits and a scheduler via # a UnitManager object. umgr = rp.UnitManager(session=session, scheduler=rp.SCHED_BACKFILLING) # Register our callback with the UnitManager. This callback will get # called every time any of the units managed by the UnitManager # change their state. umgr.register_callback(unit_state_cb) # Add the previsouly created ComputePilot to the UnitManager. umgr.add_pilots([pilot_1, pilot_2, pilot_3]) # # wait until first pilots become active pilot_1.wait(state=rp.ACTIVE) # Create a workload of 8 ComputeUnits. cus = list()
def test_ordered_scheduler():
    """
    Submit shuffled pipeline/stage/task units carrying 'order' tags to a
    local pilot and wait for them to finish.

    Each unit is tagged with its pipeline as namespace ('ns'), its stage as
    'order' and the number of tasks per stage as 'size'.  The session is
    closed in all cases.
    """
    report = ru.Reporter(name='radical.pilot')
    report.title('Getting Started (RP version %s)' % rp.version)

    session = rp.Session()

    try:
        # read the config used for resource details
        report.info('read config')
        report.ok('>>ok\n')

        report.header('submit pilots')
        pilot_descr = rp.ComputePilotDescription({'resource': 'local.localhost',
                                                  'runtime': 5,
                                                  'exit_on_error': True,
                                                  'cores': 10})
        pmgr = rp.PilotManager(session=session)
        pilot = pmgr.submit_pilots(pilot_descr)

        report.header('submit pipelines')
        umgr = rp.UnitManager(session=session)
        umgr.add_pilots(pilot)

        n_pipes, n_stages, n_tasks = 2, 5, 4

        # enumerate every (pipeline, stage, task) coordinate up front
        coords = [(pipe, stage, task)
                  for pipe in range(n_pipes)
                  for stage in range(n_stages)
                  for task in range(n_tasks)]

        cuds = []
        for pipe, stage, task in coords:
            cud = rp.ComputeUnitDescription()
            cud.executable = '%s/pipeline_task.sh' % pwd
            cud.arguments = [pipe, stage, task, 10]
            cud.cpu_processes = 1
            cud.tags = {'order': {'ns': pipe,
                                  'order': stage,
                                  'size': n_tasks}}
            cud.name = 'p%03d-s%03d-t%03d' % (pipe, stage, task)
            cuds.append(cud)
            report.progress()

        # hand the units over in random order
        import random
        random.shuffle(cuds)

        # Submitting the descriptions to the unit manager triggers the
        # scheduler to start assigning units to the pilot.
        umgr.submit_units(cuds)

        # block until every unit reached a final state
        report.header('gather results')
        umgr.wait_units()

    except Exception as err:
        # something unexpected happened in the pilot code above
        report.error('caught Exception: %s\n' % err)
        ru.print_exception_trace()
        raise

    except (KeyboardInterrupt, SystemExit):
        # a callback called sys.exit(), or the main thread is exiting for
        # some other reason: report it and fall through to the cleanup
        ru.print_exception_trace()
        report.warn('exit requested\n')

    finally:
        # always clean up the session, whether or not an exception occurred
        report.header('finalize')
        session.close(download=False)

    report.header()
def __enter__(self):
    """
    Context-manager entry: create an RP session plus one pilot manager and
    one unit manager bound to that session.

    The objects are stored on the instance as ``session``, ``pmgr`` and
    ``umgr``.
    """
    # NOTE(review): no value is returned in the visible code, so
    # ``with ... as x`` would bind ``x`` to None -- confirm whether a
    # ``return`` follows outside of this excerpt.

    # create the managers
    self.session = rp.Session()
    self.pmgr = rp.PilotManager(session=self.session)
    self.umgr = rp.UnitManager(session=self.session)
# Create a new session. No need to try/except this: if session creation # fails, there is not much we can do anyways... session = rp.Session() print "session id: %s" % session.uid # all other pilot code is now tried/excepted. If an exception is caught, we # can rely on the session object to exist and be valid, and we can thus tear # the whole RP stack down via a 'session.close()' call in the 'finally' # clause... try: pmgr = rp.PilotManager(session=session) pmgr.register_callback(pilot_state_cb) umgr = rp.UnitManager(session=session, scheduler=SCHED) umgr.register_callback(unit_state_cb, rp.UNIT_STATE) umgr.register_callback(wait_queue_size_cb, rp.WAIT_QUEUE_SIZE) cuds = list() for unit_count in range(0, UNITS): cud = rp.ComputeUnitDescription() cud.executable = "/bin/sh" cud.arguments = ["-c", "echo $HOSTNAME:$OSG_HOSTNAME && sleep %d" % SLEEP] cud.cores = 1 cuds.append(cud) units = umgr.submit_units(cuds)
def _process_tasks(self, task_queue, rmgr, mq_hostname, port):
    '''
    **Purpose**: The new thread that gets spawned by the main tmgr process
    invokes this function. This function receives tasks from 'task_queue'
    and submits them to the RADICAL Pilot RTS.

    :param task_queue: queue delivering batches (iterables) of task dicts
    :param rmgr: resource manager providing ``_session`` and ``pilot``
    :param mq_hostname: host of the RabbitMQ server
    :param port: port of the RabbitMQ server
    :raises EnTKError: wraps any non-interrupt exception of the main loop

    A KeyboardInterrupt in the main loop is logged and ends the function
    without re-raising.
    '''

    placeholders = dict()

    # ----------------------------------------------------------------------
    def open_connection():
        # one blocking connection to the rmq server; the caller closes it
        return pika.BlockingConnection(
            pika.ConnectionParameters(host=mq_hostname, port=port))

    # ----------------------------------------------------------------------
    def load_placeholder(task, rts_uid):
        # record path and RTS uid of a task under pipeline -> stage -> task

        parent_pipeline = str(task.parent_pipeline['name'])
        parent_stage = str(task.parent_stage['name'])

        if parent_pipeline not in placeholders:
            placeholders[parent_pipeline] = dict()

        if parent_stage not in placeholders[parent_pipeline]:
            placeholders[parent_pipeline][parent_stage] = dict()

        # NOTE(review): both parent names went through str() above, so only
        # 'task.name' can actually be None in this check.
        if None not in [parent_pipeline, parent_stage, task.name]:
            placeholders[parent_pipeline][parent_stage][task.name] = \
                {'path': task.path,
                 'rts_uid': rts_uid}

    # ----------------------------------------------------------------------
    def unit_state_cb(unit, state):
        # push units that reached a final state to the 'completed' queue

        try:

            self._log.debug('Unit %s in state %s' % (unit.uid, unit.state))

            if unit.state not in rp.FINAL:
                return

            # Acquire a connection+channel to the rmq server
            mq_connection = open_connection()

            try:
                mq_channel = mq_connection.channel()

                task = create_task_from_cu(unit, self._prof)

                self._advance(task, 'Task', states.COMPLETED,
                              mq_channel, '%s-cb-to-sync' % self._sid)

                load_placeholder(task, unit.uid)

                task_as_dict = json.dumps(task.to_dict())

                mq_channel.basic_publish(
                    exchange='',
                    routing_key='%s-completedq-1' % self._sid,
                    body=task_as_dict)

                self._log.info(
                    'Pushed task %s with state %s to completed '
                    'queue %s-completedq-1',
                    task.uid, task.state, self._sid)

            finally:
                # do not leak the connection when any step above fails
                mq_connection.close()

        except KeyboardInterrupt:
            self._log.exception(
                'Execution interrupted (probably by Ctrl+C)'
                ' exit callback thread gracefully...')
            # re-raise the active exception to keep its traceback
            raise

        except Exception as e:
            self._log.exception('Error in RP callback thread: %s', e)

    # ----------------------------------------------------------------------
    umgr = rp.UnitManager(session=rmgr._session)
    umgr.add_pilots(rmgr.pilot)
    umgr.register_callback(unit_state_cb)

    try:

        while not self._tmgr_terminate.is_set():

            try:
                body = task_queue.get(block=True, timeout=10)
            except Queue.Empty:
                # Ignore, we don't always have new tasks to run
                continue

            # every successful get() is acknowledged, also for empty batches
            task_queue.task_done()

            if not body:
                continue

            bulk_cuds = list()

            for msg in body:

                task = Task()
                task.from_dict(msg)
                bulk_cuds.append(
                    create_cud_from_task(task, placeholders, self._prof))

                mq_connection = open_connection()
                try:
                    mq_channel = mq_connection.channel()
                    self._advance(task, 'Task', states.SUBMITTING,
                                  mq_channel,
                                  '%s-tmgr-to-sync' % self._sid)
                finally:
                    # close the connection even if the state update fails
                    mq_connection.close()

            umgr.submit_units(bulk_cuds)

    except KeyboardInterrupt:
        self._log.exception('Execution interrupted (probably by Ctrl+C), '
                            'cancel task processor gracefully...')

    except Exception as e:
        self._log.exception('%s failed with %s', self._uid, e)
        raise EnTKError(e)
def test_local_integration():
    """
    Run two batches of ``/bin/hostname`` units with an LFS requirement on a
    ``local.localhost`` pilot and compare the hostnames they report.

    The resource config of ``local.localhost`` is first extended with
    ``lfs_path_per_node`` / ``lfs_size_per_node`` and read back from the
    session.  Every unit of the second batch sets ``cud.tag`` to the uid of
    the first-batch unit with the same index; the first lines of the paired
    hostname files staged to the client must be equal.

    The session is always closed and the staged files are always removed,
    also when an assertion fails.
    """

    n = 16  # number of units per batch

    def hostname_file(stage, idx):
        # name of the file a unit writes and stages back to the client
        return '%s_t%s_hostname.txt' % (stage, idx)

    def describe(stage, idx):
        # create a new CU description, and fill it.
        # Here we don't use dict initialization.
        fname = hostname_file(stage, idx)
        cud = rp.ComputeUnitDescription()
        cud.executable = '/bin/hostname'
        cud.arguments = ['>', fname]
        cud.cpu_processes = 1
        cud.cpu_threads = 1
        cud.lfs_per_process = 10  # MB
        cud.output_staging = {'source': 'unit:///%s' % fname,
                              'target': 'client:///%s' % fname,
                              'action': rp.TRANSFER}
        return cud

    def first_line(path):
        # read via a context manager so the file handle is closed again
        with open(path, 'r') as fin:
            return fin.readline().strip()

    # Create a new session. No need to try/except this: if session creation
    # fails, there is not much we can do anyways...
    session = rp.Session()

    try:
        # Add a Pilot Manager. Pilot managers manage one or more pilots.
        pmgr = rp.PilotManager(session=session)

        # Update localhost lfs path and size
        cfg = session.get_resource_config('local.localhost')
        new_cfg = rp.ResourceConfig('local.localhost', cfg)
        new_cfg.lfs_path_per_node = '/tmp'
        new_cfg.lfs_size_per_node = 1024  # MB
        session.add_resource_config(new_cfg)
        cfg = session.get_resource_config('local.localhost')

        # Check that the updated config is read by the session
        assert 'lfs_path_per_node' in cfg.keys()
        assert 'lfs_size_per_node' in cfg.keys()
        assert cfg['lfs_path_per_node'] == '/tmp'
        assert cfg['lfs_size_per_node'] == 1024

        # Here we use a dict to initialize the description object
        pd_init = {'resource': 'local.localhost',
                   'runtime': 15,  # pilot runtime (min)
                   'cores': 4
                   }
        pdesc = rp.ComputePilotDescription(pd_init)

        # Launch the pilot.
        pilot = pmgr.submit_pilots(pdesc)

        # Register the ComputePilot in a UnitManager object.
        umgr = rp.UnitManager(session=session)
        umgr.add_pilots(pilot)

        # First batch: n untagged units, 1 core and 10MB of LFS each.
        cuds = [describe('s1', i) for i in range(n)]

        # Submit the unit descriptions to the UnitManager and wait for all
        # units to finish.
        cus = umgr.submit_units(cuds)
        umgr.wait_units()

        # Second batch: unit i is tagged with the uid of first-batch unit i.
        cuds2 = list()
        for i in range(n):
            cud = describe('s2', i)
            cud.tag = cus[i].uid
            cuds2.append(cud)

        umgr.submit_units(cuds2)
        umgr.wait_units()

        for i in range(n):
            assert first_line(hostname_file('s1', i)) == \
                   first_line(hostname_file('s2', i))

    finally:
        try:
            session.close()
        finally:
            # Remove only the files this test staged; a blanket '*.txt'
            # glob would also delete unrelated files in the working dir.
            for stage in ('s1', 's2'):
                for i in range(n):
                    path = hostname_file(stage, i)
                    if os.path.exists(path):
                        os.remove(path)
def test_pass_issue_359():
    """
    Submit one 8-core MPI unit to each of several ``xsede.stampede`` pilots
    of different sizes and assert that every unit ends in state DONE.

    One UnitManager and one pilot are created per entry of ``core_configs``.
    Pilots are cancelled and the session is closed in all cases.
    """

    session = rp.Session()

    # Bound before the 'try' so that the 'finally' clause can tell whether
    # the pilot manager was ever created; otherwise an early failure would
    # be masked by a NameError during cleanup.
    pmgr = None

    try:
        c = rp.Context('ssh')
        c.user_id = CONFIG["xsede.stampede"]["user_id"]
        session.add_context(c)

        pmgr = rp.PilotManager(session=session)
        pmgr.register_callback(pilot_state_cb)

        core_configs = [8, 16, 17, 32, 33]

        # one unit manager + pilot per core configuration
        umgr_list = []
        for cores in core_configs:

            umgr = rp.UnitManager(session=session,
                                  scheduler=rp.SCHED_DIRECT_SUBMISSION)
            umgr.register_callback(unit_state_cb)

            pdesc = rp.ComputePilotDescription()
            pdesc.resource = "xsede.stampede"
            pdesc.project = CONFIG["xsede.stampede"]["project"]
            pdesc.runtime = 10
            pdesc.cores = cores

            pilot = pmgr.submit_pilots(pdesc)
            umgr.add_pilots(pilot)

            umgr_list.append(umgr)

        # one MPI unit per unit manager
        unit_list = []
        for umgr in umgr_list:

            test_task = rp.ComputeUnitDescription()
            test_task.pre_exec = CONFIG["xsede.stampede"]["pre_exec"]
            test_task.input_staging = ["../helloworld_mpi.py"]
            test_task.executable = "python"
            test_task.arguments = ["helloworld_mpi.py"]
            test_task.mpi = True
            test_task.cores = 8

            unit = umgr.submit_units(test_task)
            unit_list.append(unit)

        for umgr in umgr_list:
            umgr.wait_units()

        for unit in unit_list:
            print("* Task %s - state: %s, exit code: %s, started: %s, finished: %s, stdout: %s"
                  % (unit.uid, unit.state, unit.exit_code, unit.start_time, unit.stop_time, unit.stdout))

            assert (unit.state == rp.DONE)

    except Exception:
        print('test failed')
        raise

    finally:
        try:
            if pmgr is not None:
                pmgr.cancel_pilots()
                pmgr.wait_pilots()
        finally:
            # close the session even if cancelling the pilots failed
            session.close()
def run_test(cfg):
    """
    Run ten MPI 'helloworld' units on one pilot and verify their output.

    :param cfg: dict read by this function with the keys ``cp_resource``,
        ``cp_schema``, ``cp_project``, ``cp_queue``, ``cp_runtime``,
        ``cp_cores`` (pilot), ``cu_pre_exec``, ``executable``, ``cu_cores``
        (units) and ``pwd`` (base dir of the staged script).

    Every unit must end in state DONE and its stdout must contain
    ``'mpi rank <i>/<cu_cores>'`` for each rank.  Exceptions are printed and
    re-raised; the session is closed in all cases.
    """

    # Create a new session. No need to try/except this: if session creation
    # fails, there is not much we can do anyways...
    session = rp.Session()
    print("session id: %s" % session.uid)

    # all other pilot code is now tried/excepted.  If an exception is caught,
    # we can rely on the session object to exist and be valid, and we can
    # thus tear the whole RP stack down via a 'session.close()' call in the
    # 'finally' clause...
    try:

        # Add a Pilot Manager. Pilot managers manage one or more pilots.
        print("Initializing Pilot Manager ...")
        pmgr = rp.PilotManager(session=session)

        # This callback will get called every time any of the pilots
        # managed by the PilotManager change their state.
        pmgr.register_callback(pilot_state_cb)

        pdesc = rp.ComputePilotDescription()
        pdesc.resource = cfg['cp_resource']
        if cfg['cp_schema']:
            pdesc.access_schema = cfg['cp_schema']
        pdesc.project = cfg['cp_project']
        pdesc.queue = cfg['cp_queue']
        pdesc.runtime = cfg['cp_runtime']
        pdesc.cores = cfg['cp_cores']
        pdesc.cleanup = True

        # submit the pilot.
        print("Submitting Compute Pilot to Pilot Manager ...")
        pilot = pmgr.submit_pilots(pdesc)

        # Combine the ComputePilot, the ComputeUnits and a scheduler via
        # a UnitManager object.
        print("Initializing Unit Manager ...")
        umgr = rp.UnitManager(session=session,
                              scheduler=rp.SCHEDULER_DIRECT_SUBMISSION)

        # This callback will get called every time any of the units
        # managed by the UnitManager change their state.
        umgr.register_callback(unit_state_cb)

        # Add the created ComputePilot to the UnitManager.
        print("Registering Compute Pilot with Unit Manager ...")
        umgr.add_pilots(pilot)

        NUMBER_JOBS = 10  # the total number of cus to run

        # submit CUs to pilot job
        cudesc_list = []
        for i in range(NUMBER_JOBS):
            cudesc = rp.ComputeUnitDescription()
            if cfg['cu_pre_exec']:
                cudesc.pre_exec = cfg['cu_pre_exec']
            cudesc.executable = cfg['executable']
            cudesc.arguments = ["helloworld_mpi.py"]
            cudesc.input_staging = ["%s/../examples/helloworld_mpi.py" % cfg['pwd']]
            cudesc.cores = cfg['cu_cores']
            cudesc.mpi = True
            cudesc_list.append(cudesc)

        # Submit the unit descriptions to the UnitManager.  This will
        # trigger the selected scheduler to start assigning units to pilots.
        print("Submit Compute Units to Unit Manager ...")
        cu_set = umgr.submit_units(cudesc_list)

        print("Waiting for CUs to complete ...")
        umgr.wait_units()

        for unit in cu_set:
            print("* Task %s - state: %s, exit code: %s, started: %s, finished: %s, stdout: %s"
                  % (unit.uid, unit.state, unit.exit_code, unit.start_time, unit.stop_time, unit.stdout))

            assert (unit.state == rp.DONE)
            for i in range(cfg['cu_cores']):
                assert ('mpi rank %d/%d' % (i + 1, cfg['cu_cores']) in unit.stdout)

        # Only claim success once every unit has actually been verified;
        # wait_units() also returns for FAILED or CANCELED units.
        print("All CUs completed successfully!")

    except Exception as e:
        # Something unexpected happened in the pilot code above
        print("caught Exception: %s" % e)
        raise

    except (KeyboardInterrupt, SystemExit) as e:
        # the callback called sys.exit(), and we can here catch the
        # corresponding KeyboardInterrupt exception for shutdown.  We also
        # catch SystemExit (which gets raised if the main threads exits for
        # some other reason).
        print("need to exit now: %s" % e)
        raise

    finally:
        # always clean up the session, no matter if we caught an exception
        # or not.
        print("closing session")
        print("SESSION ID: %s" % session.uid)
        session.close(cleanup=False)
'target': staged_file, 'action': rp.TRANSFER } # Synchronously stage the data to the pilot pilot.stage_in(sd_pilot) # Configure the staging directive for shared input file. sd_shared = { 'source': staged_file, 'target': SHARED_INPUT_FILE, 'action': rp.LINK } # Combine the ComputePilot, the ComputeUnits and a scheduler via # a UnitManager object. umgr = rp.UnitManager(session, rp.SCHEDULER_BACKFILLING) # Add the previously created ComputePilot to the UnitManager. umgr.add_pilots(pilot) compute_unit_descs = [] for unit_idx in range(len(radical_cockpit_occupants)): # Configure the per unit input file. input_file = 'input_file-%d.txt' % (unit_idx + 1) # Configure the for per unit output file. output_file = 'output_file-%d.txt' % (unit_idx + 1) # Actual task description.
pmgr.register_callback(pilot_state_cb) # Define a 2-core local pilot that runs for 10 minutes. pdesc = rp.ComputePilotDescription() pdesc.resource = "local.localhost" pdesc.runtime = 10 pdesc.cores = 1 # Launch the pilot. pilot = pmgr.submit_pilots(pdesc) # Combine the ComputePilot, the ComputeUnits and a scheduler via # a UnitManager object. umgr = rp.UnitManager( session=session, scheduler=rp.SCHED_DIRECT_SUBMISSION, output_transfer_workers=4, input_transfer_workers=4) # Register our callback with the UnitManager. This callback will get # called every time any of the units managed by the UnitManager # change their state. umgr.register_callback(unit_state_change_cb) # Add the previsouly created ComputePilot to the UnitManager. umgr.add_pilots(pilot) # Create a workload of 8 ComputeUnits (tasks). compute_units = [] for unit_count in range(0, 8): cu = rp.ComputeUnitDescription()
'action': rp.TRANSFER } # Synchronously stage the data to the pilot pilot.stage_in(sd_pilot) # Configure the staging directive for shared input file. sd_shared = { 'source': staged_file, 'target': 'input.txt', 'action': rp.LINK } # Combine the ComputePilot, the ComputeUnits and a scheduler via # a UnitManager object. print "Initializing Unit Manager ..." umgr = rp.UnitManager(session=session, scheduler=rp.SCHED_DIRECT) umgr.register_callback(unit_state_cb) # Add the created ComputePilot to the UnitManager. print "Registering Compute Pilot with Unit Manager ..." umgr.add_pilots(pilot) NUMBER_OF_TRAJECTORIES = total_file_lines # i should define a "good" window based on division rules. in respect to the NUMBER_OF_TRAJECTORIES # submit CUs to pilot job cudesc_list = [] sd_inter_in_list = list() #print 'window is ' + str(WINDOW_SIZE) #print 'traj count is ' + str(NUMBER_OF_TRAJECTORIES) for i in range(1, NUMBER_OF_TRAJECTORIES + 1, WINDOW_SIZE):
def run_test(cfg):
    """
    Exercise pilot-level and unit-level data staging with ``wc`` units.

    :param cfg: dict read by this function with the keys ``cp_resource``,
        ``cp_cores``, ``cp_project``, ``cp_queue``, ``cp_runtime`` and
        ``cp_schema``.

    ``/etc/passwd`` is staged to the pilot as ``staging:///f1``; every unit
    then stages in ``f1`` and ``f2``, runs ``wc f1 f2`` and stages both
    files out again.  Unit details are printed once all units are done.
    Exceptions are printed and re-raised; the session is always closed.
    """

    def directive(source, target, action):
        # build one staging directive
        return {'source': source, 'target': target, 'action': action}

    # If session creation fails there is nothing sensible left to do, so it
    # is deliberately not guarded.
    session = rp.Session()
    print("session id: %s" % session.uid)

    # From here on the session exists, so the 'finally' clause can always
    # tear the RP stack down through 'session.close()'.
    try:

        pmgr = rp.PilotManager(session=session)
        pmgr.register_callback(pilot_state_cb)

        pilot_descr = rp.ComputePilotDescription()
        pilot_descr.resource = cfg['cp_resource']
        pilot_descr.cores = cfg['cp_cores']
        pilot_descr.project = cfg['cp_project']
        pilot_descr.queue = cfg['cp_queue']
        pilot_descr.runtime = cfg['cp_runtime']
        pilot_descr.cleanup = False
        pilot_descr.access_schema = cfg['cp_schema']

        pilot = pmgr.submit_pilots(pilot_descr)

        # pilot-level staging: make f1 available in the pilot staging area
        pilot.stage_in(directive('file:///etc/passwd', 'staging:///f1',
                                 rp.TRANSFER))

        umgr = rp.UnitManager(session=session, scheduler=SCHED)
        umgr.register_callback(unit_state_cb, rp.UNIT_STATE)
        umgr.register_callback(wait_queue_size_cb, rp.WAIT_QUEUE_SIZE)
        umgr.add_pilots(pilot)

        # unit-level staging directives shared by all units
        input_sd_umgr = directive('file:///etc/group', 'f2', rp.COPY)
        input_sd_agent = directive('staging:///f1', 'f1', rp.COPY)
        output_sd_agent = directive('f1', 'staging:///f1.bak', rp.COPY)
        output_sd_umgr = directive('f2', 'f2.bak', rp.TRANSFER)

        def make_cud():
            # one single-core 'wc f1 f2' unit with its own staging lists
            descr = rp.ComputeUnitDescription()
            descr.executable = "wc"
            descr.arguments = ["f1", "f2"]
            descr.cores = 1
            descr.input_staging = [input_sd_umgr, input_sd_agent]
            descr.output_staging = [output_sd_umgr, output_sd_agent]
            return descr

        cuds = [make_cud() for _ in range(0, UNITS)]

        units = umgr.submit_units(cuds)
        umgr.wait_units()

        for unit in units:
            details = (unit.uid, unit.state, unit.exit_code,
                       unit.start_time, unit.stop_time)
            print("* Task %s state %s, exit code: %s, started: %s, finished: %s"
                  % details)

        # os.system ("radicalpilot-stats -m stat,plot -s %s > %s.stat" % (session.uid, session_name))

    except Exception as err:
        # something unexpected happened in the pilot code above
        print("caught Exception: %s" % err)
        raise

    except (KeyboardInterrupt, SystemExit) as err:
        # a callback called sys.exit(), or the main thread is exiting for
        # some other reason -- report it and pass it on
        print("need to exit now: %s" % err)
        raise

    finally:
        # always clean up the session, whether or not an exception occurred
        print("closing session")
        print("SESSION ID: %s" % session.uid)
        session.close(cleanup=False)
'runtime': 15, # pilot runtime (min) 'exit_on_error': True, 'project': config[resource]['project'], 'queue': config[resource]['queue'], 'access_schema': config[resource]['schema'], 'cores': config[resource]['cores'] } pdesc = rp.ComputePilotDescription(pd_init) # Launch the pilot. pilot = pmgr.submit_pilots(pdesc) report.header('submit units') # Register the ComputePilot in a UnitManager object. umgr = rp.UnitManager(session=session) umgr.add_pilots(pilot) # Create a workload of ComputeUnits. # Each compute unit runs '/bin/date'. n = 128 # number of units to run report.info('create %d unit description(s)\n\t' % n) cuds = list() for i in range(0, n): # create a new CU description, and fill it. # Here we don't use dict initialization. cud = rp.ComputeUnitDescription() cud.executable = '/bin/date' cuds.append(cud)
# Create a Compute Unit that sorts the local password file and writes the # output to result.dat. # # The exact command that is executed by the agent is: # "/usr/bin/sort -o result.dat passwd" # cud = rp.ComputeUnitDescription() cud.executable = "/usr/bin/sort" cud.arguments = ["-o", "result.dat", "passwd"] cud.input_staging = "/etc/passwd" cud.output_staging = "result.dat" # Combine the ComputePilot, the ComputeUnits and a scheduler via # a UnitManager object. umgr = rp.UnitManager(session, rp.SCHED_DIRECT_SUBMISSION) # Register our callback with the UnitManager. This callback will get # called every time any of the units managed by the UnitManager # change their state. umgr.register_callback(unit_state_cb) # Add the previously created ComputePilot to the UnitManager. umgr.add_pilots(pilot) # Submit the previously created ComputeUnit description to the # PilotManager. This will trigger the selected scheduler to start # assigning the ComputeUnit to the ComputePilot. unit = umgr.submit_units(cud) # Wait for the compute unit to reach a terminal state (DONE or FAILED).
cu.arguments = ["helloworld_mpi.py"] cu.input_staging = ["helloworld_mpi.py"] # These two parameters are relevant to MPI execution: # 'cores' sets the number of cores required by the task # 'mpi' identifies the task as an MPI taskg cu.cores = 8 cu.mpi = True cud_list.append(cu) # Combine the ComputePilot, the ComputeUnits and a scheduler via # a UnitManager object. umgr = rp.UnitManager( session=session, scheduler=rp.SCHED_DIRECT_SUBMISSION) # Register our callback with the UnitManager. This callback will get # called every time any of the units managed by the UnitManager # change their state. umgr.register_callback(unit_state_cb) # Add the previously created ComputePilot to the UnitManager. umgr.add_pilots(pilot) # Submit the previously created ComputeUnit descriptions to the # PilotManager. This will trigger the selected scheduler to start # assigning ComputeUnits to the ComputePilots. units = umgr.submit_units(cud_list)
def test_bw_integration():
    """
    Run two batches of ``/bin/hostname`` units with an LFS requirement on an
    ``ncsa.bw_aprun`` pilot and compare the hostnames they report.

    Every unit of the second batch sets ``cud.tag`` to the uid of the
    first-batch unit with the same index (and explicitly resets
    ``cpu_process_type`` to None); the first lines of the paired hostname
    files staged to the client must be equal.

    The session is always closed and the staged files are always removed,
    also when an assertion fails.
    """

    n = 4  # number of units per batch

    def hostname_file(stage, idx):
        # name of the file a unit writes and stages back to the client
        return '%s_t%s_hostname.txt' % (stage, idx)

    def describe(stage, idx):
        # create a new CU description, and fill it.
        # Here we don't use dict initialization.
        fname = hostname_file(stage, idx)
        cud = rp.ComputeUnitDescription()
        cud.executable = '/bin/hostname'
        cud.arguments = ['>', fname]
        cud.cpu_processes = 1
        cud.cpu_threads = 16
        cud.lfs_per_process = 10  # MB
        cud.output_staging = {'source': 'unit:///%s' % fname,
                              'target': 'client:///%s' % fname,
                              'action': rp.TRANSFER}
        return cud

    def first_line(path):
        # read via a context manager so the file handle is closed again
        with open(path, 'r') as fin:
            return fin.readline().strip()

    # Create a new session. No need to try/except this: if session creation
    # fails, there is not much we can do anyways...
    session = rp.Session()

    try:
        # Add a Pilot Manager. Pilot managers manage one or more pilots.
        pmgr = rp.PilotManager(session=session)

        # Here we use a dict to initialize the description object
        pd_init = {'resource': 'ncsa.bw_aprun',
                   'runtime': 10,  # pilot runtime (min)
                   'cores': 128,
                   'project': 'gk4',
                   'queue': 'high'
                   }
        pdesc = rp.ComputePilotDescription(pd_init)

        # Launch the pilot.
        pilot = pmgr.submit_pilots(pdesc)

        # Register the ComputePilot in a UnitManager object.
        umgr = rp.UnitManager(session=session)
        umgr.add_pilots(pilot)

        # First batch: n untagged units, each with 16 threads and 10MB LFS.
        cuds = [describe('s1', i) for i in range(n)]

        # Submit the unit descriptions to the UnitManager and wait for all
        # units to finish.
        cus = umgr.submit_units(cuds)
        umgr.wait_units()

        # Second batch: unit i is tagged with the uid of first-batch unit i.
        cuds2 = list()
        for i in range(n):
            cud = describe('s2', i)
            cud.tag = cus[i].uid
            cud.cpu_process_type = None
            cuds2.append(cud)

        umgr.submit_units(cuds2)
        umgr.wait_units()

        # Check that each pair of units reported the same hostname
        for i in range(n):
            assert first_line(hostname_file('s1', i)) == \
                   first_line(hostname_file('s2', i))

    finally:
        try:
            session.close()
        finally:
            # Remove only the files this test staged; a blanket '*.txt'
            # glob would also delete unrelated files in the working dir.
            for stage in ('s1', 's2'):
                for i in range(n):
                    path = hostname_file(stage, i)
                    if os.path.exists(path):
                        os.remove(path)