def test_s3import_via_pilotapi():
    COORDINATION_URL = "redis://localhost:6379"
    from pilot import PilotComputeService, PilotDataService, ComputeDataService, State

    pilot_data_service = PilotDataService(coordination_url=COORDINATION_URL)

    ###################################################################################################
    # Pick one of the Pilot Data Descriptions below
    pilot_data_description_aws = {
        "service_url": "s3://pilot-data-andre-workflow",
        "size": 100,
        "affinity_datacenter_label": "us-east-1",
        "affinity_machine_label": "",
        "access_key_id": "AKIAJPGNDJRYIG5LIEUA",
        "secret_access_key": "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",
    }

    pd = pilot_data_service.create_pilot(pilot_data_description=pilot_data_description_aws)

    data_unit_description = {
        "file_urls": ['s3://pilot-data-cec5d816-fa8f-11e1-ab5e-e61f1322a75c/du-67b4c762-fa90-11e1-ab5e-e61f1322a75c/ip-10-84-173-21512MB_2.input-chunk-02'],
        "affinity_datacenter_label": "us-east-1",
        "affinity_machine_label": ""
    }

    # submit the data unit to the pilot data store and wait for the transfer to complete
    input_data_unit = pd.submit_data_unit(data_unit_description)
    input_data_unit.wait()
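# Minimal continuation sketch (assumption: it is called after the test above with its
# pilot_data_service and input_data_unit). It only uses calls that already appear in these
# snippets -- get_url(), cancel() -- to inspect the imported Data Unit and tear the service
# down again; the helper name itself is hypothetical.
def teardown_s3import(pilot_data_service, input_data_unit):
    # the DU URL can be persisted and later passed to DataUnit(du_url=...) to reconnect
    du_url = input_data_unit.get_url()
    print("Imported Data Unit available at: %s" % du_url)
    # release the pilot store(s) managed by this service
    pilot_data_service.cancel()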
def create_pilotdata():
    # assumes PilotDataService and COORDINATION_URL are available at module level
    pilot_data_service = PilotDataService(coordination_url=COORDINATION_URL)
    pilot_data_description = {
        "service_url": "ssh://localhost/tmp/pilot-data/",
    }
    pilotdata = pilot_data_service.create_pilot(pilot_data_description=pilot_data_description)
    return pilotdata
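# Hedged usage sketch for the helper above: the data unit description and the local file
# path are illustrative placeholders, not values taken from the original code. It relies
# only on submit_data_unit() and wait(), which the other snippets already use.
def stage_local_file():
    pilotdata = create_pilotdata()
    data_unit_description = {
        "file_urls": ["/tmp/some-local-file.txt"],  # hypothetical input file
    }
    du = pilotdata.submit_data_unit(data_unit_description)
    du.wait()
    return du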
def start(self): darelogger.info("Creating Compute Engine service ") self.pilot_compute_service = PilotComputeService( coordination_url=COORDINATION_URL) self.pilot_data_service = PilotDataService( coordination_url=COORDINATION_URL) for compute_pilot, desc in list( self.workflow.compute_pilot_repo.items()): self.pilot_compute_service.create_pilot( pilot_compute_description=desc) for data_pilot, desc in list(self.workflow.data_pilot_repo.items()): self.data_pilot_service_repo.append( self.pilot_data_service.create_pilot( pilot_data_description=desc)) self.compute_data_service = ComputeDataServiceDecentral() self.compute_data_service.add_pilot_compute_service( self.pilot_compute_service) self.compute_data_service.add_pilot_data_service( self.pilot_data_service) ### run the steps self.step_start_lock = threading.RLock() self.step_run_lock = threading.RLock() for step_id in list(self.workflow.step_units_repo.keys()): darelogger.info(" Sumitted step %s " % step_id) self.step_start_lock.acquire() self.start_thread_step_id = step_id self.step_start_lock.release() self.step_threads[step_id] = threading.Thread( target=self.start_step) self.step_threads[step_id].start() while (1): count_step = [ v.is_alive() for k, v in list(self.step_threads.items()) ] darelogger.info('count_step %s' % count_step) if not True in count_step and len(count_step) > 0: break time.sleep(10) darelogger.info(" All Steps Done processing") self.quit(message='quit gracefully')
def start(self):
    # try:
    from pilot import PilotComputeService, PilotDataService, ComputeDataService, State
    darelogger.info("Create Compute Engine service ")
    self.pilot_compute_service = PilotComputeService(coordination_url=COORDINATION_URL)
    self.pilot_data_service = PilotDataService()

    for compute_pilot, desc in self.workflow.compute_pilot_repo.items():
        self.compute_pilot_service_repo.append(self.pilot_compute_service.create_pilot(pilot_compute_description=desc))

    #for data_pilot, desc in self.workflow.data_pilot_repo.items():
    #    self.data_pilot_service_repo.append(self.pilot_data_service.create_pilot(pilot_data_description=desc))

    self.compute_data_service = ComputeDataService()
    self.compute_data_service.add_pilot_compute_service(self.pilot_compute_service)
    # self.compute_data_service.add_pilot_data_service(self.pilot_data_service)

    self.step_thread = {}

    ### run the steps
    self.step_start_lock = threading.RLock()
    self.step_run_lock = threading.RLock()

    for step_id in self.workflow.step_units_repo.keys():
        darelogger.info(" Submitted step %s " % step_id)
        self.step_start_lock.acquire()
        self.start_thread_step_id = step_id
        self.step_start_lock.release()
        self.step_thread[step_id] = threading.Thread(target=self.start_step)
        self.step_thread[step_id].start()

    while True:
        count_step = [v.is_alive() for k, v in self.step_thread.items()]
        darelogger.info('count_step %s' % count_step)
        if True not in count_step and len(count_step) > 0:
            break
        time.sleep(10)

    darelogger.info(" All Steps Done processing")
    self.cancel()
base_dir = "/Users/luckow/workspace-saga/applications/pilot-store/test/data1"
url_list = os.listdir(base_dir)

# make absolute paths
absolute_url_list = [os.path.join(base_dir, i) for i in url_list]

data_unit_description = {
    "file_urls": absolute_url_list,
    "number_of_replicas": 2
}
logging.debug("Pilot Data Description: \n%s" % str(data_unit_description))

# create pilot data service (factory for pilot stores (physical, distributed storage))
pilot_data_service = PilotDataService(coordination_url=COORDINATION_URL)
pd1 = pilot_data_service.create_pilot({
    'service_url': "ssh://localhost/tmp/pilotdata-1/",
    'size': 100,
    'affinity_datacenter_label': "eu-de-south",
    'affinity_machine_label': "mymachine-1"
})

pd2 = pilot_data_service.create_pilot({
    'service_url': "ssh://localhost/tmp/pilotdata-2/",
    'size': 100,
    'affinity_datacenter_label': "eu-de-south",
    'affinity_machine_label': "mymachine-2"
})
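# Hedged sketch of what typically follows in these examples: hand both pilot stores to a
# ComputeDataService (assumed to be imported from pilot) and submit the replicated Data Unit
# so its two replicas can be placed on pd1 and pd2. All calls used here
# (add_pilot_data_service, submit_data_unit, wait, get_url) appear elsewhere in these
# snippets; the variable names are assumptions.
compute_data_service = ComputeDataService()
compute_data_service.add_pilot_data_service(pilot_data_service)

du = compute_data_service.submit_data_unit(data_unit_description)
du.wait()
logging.debug("Replicated Data Unit URL: %s" % du.get_url())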
import os
import sys
import logging

logging.basicConfig(level=logging.DEBUG)

sys.path.append(os.path.join(os.path.dirname(__file__), "../.."))
from pilot import PilotDataService, ComputeDataService, DataUnit, State

if __name__ == "__main__":
    if len(sys.argv) == 2:
        reconnect_url = sys.argv[1]
    else:
        print "Usage: " + sys.executable + " " + __file__ + " <Data Unit URL to Reconnect to>"
        sys.exit(-1)

    # create pilot data service (factory for pilot stores (physical, distributed storage))
    pilot_data_service = PilotDataService()
    pd_new = pilot_data_service.create_pilot({
        'service_url': "ssh://localhost/tmp/pilotdata-reconnect/",
        'size': 100,
        'affinity_datacenter_label': "eu-de-south",
        'affinity_machine_label': "mymachine-1"
    })
    logging.debug("Pilot Data URL: %s" % pilot_data_service.url)

    ###########################################################################
    # PD should only be scheduled to machine 1

    logging.debug("Connect to PD URL: %s" % reconnect_url)
    pd = DataUnit(du_url=reconnect_url)
import os
import sys
import logging

sys.path.append(os.path.join(os.path.dirname(__file__), "../.."))
from pilot import PilotDataService, ComputeDataService, DataUnit, State

COORDINATION_URL = "redis://localhost:6379"

if __name__ == "__main__":
    if len(sys.argv) == 2:
        reconnect_url = sys.argv[1]
    else:
        print "Usage: " + sys.executable + " " + __file__ + " <Data Unit URL to Reconnect to>"
        sys.exit(-1)

    # create pilot data service (factory for pilot stores (physical, distributed storage))
    pilot_data_service = PilotDataService(coordination_url=COORDINATION_URL)
    pd_new = pilot_data_service.create_pilot({
        'service_url': "ssh://localhost/tmp/pilotdata-reconnect/",
        'size': 100,
        'affinity_datacenter_label': "eu-de-south",
        'affinity_machine_label': "mymachine-1"
    })
    logging.debug("Pilot Data URL: %s" % pilot_data_service.url)

    ###########################################################################
    # PD should only be scheduled to machine 1

    logging.debug("Connect to PD URL: %s" % reconnect_url)
    pd = DataUnit(du_url=reconnect_url)
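    # Hedged continuation sketch for the reconnect scripts above: once the DataUnit handle
    # has been re-created from its URL, wait for it and log where it lives. Only wait() and
    # get_url(), which other snippets already rely on, are used; anything beyond that (e.g.
    # exporting the files) would depend on API details not shown here.
    pd.wait()
    logging.debug("Reconnected Data Unit URL: %s" % pd.get_url())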
# create pilot job service and initiate a pilot job
pilot_compute_description = {
    "service_url": 'fork://localhost',
    "number_of_processes": 1,
    "working_directory": "/tmp/pilot-compute/",
    'affinity_datacenter_label': "eu-de-south",
    'affinity_machine_label': "mymachine-1"
}

pilotjob = pilot_compute_service.create_pilot(pilot_compute_description=pilot_compute_description)

# create pilot data service (factory for data pilots (physical, distributed storage))
# and pilot data
pilot_data_service = PilotDataService()
pilot_data_description = {
    "service_url": "ssh://localhost/tmp/pilot-data/",
    "size": 100,
    "affinity_datacenter_label": "eu-de-south",
    "affinity_machine_label": "mymachine-1"
}

ps = pilot_data_service.create_pilot(pilot_data_description=pilot_data_description)

compute_data_service = ComputeDataService()
compute_data_service.add_pilot_compute_service(pilot_compute_service)
compute_data_service.add_pilot_data_service(pilot_data_service)

# Create Data Unit Description
base_dir = "/Users/luckow/workspace-saga/applications/pilot-store/test/data1"
url_list = os.listdir(base_dir)
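# Hedged sketch of the next step these examples usually take: turn the directory listing
# into a Data Unit description and submit it through the ComputeDataService so the scheduler
# can place it on the pilot data store created above. The affinity labels mirror the ones
# used above; the variable names are assumptions.
absolute_url_list = [os.path.join(base_dir, i) for i in url_list]
data_unit_description = {
    "file_urls": absolute_url_list,
    "affinity_datacenter_label": "eu-de-south",
    "affinity_machine_label": "mymachine-1"
}
input_du = compute_data_service.submit_data_unit(data_unit_description)
input_du.wait()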
# What files? Create Pilot Data Description using remote SSH URLs
# make remote paths
remote_url_list = ["ssh://localhost" + os.path.join(base_dir, i) for i in url_list]
data_unit_description2 = {
    "file_urls": remote_url_list,
    "affinity_datacenter_label": "eu-de-south",
    "affinity_machine_label": "mymachine-2",
}
logging.debug("Pilot Data Description 2: \n%s" % str(data_unit_description2))

# create compute data service
compute_data_service = ComputeDataService()

# create pilot data service (factory for pilot stores (physical, distributed storage))
pilot_data_service = PilotDataService()
ps1 = pilot_data_service.create_pilot(
    {
        "service_url": "ssh://localhost/tmp/pilotdata-1/",
        "size": 100,
        "affinity_datacenter_label": "eu-de-south",
        "affinity_machine_label": "mymachine-1",
    }
)
ps2 = pilot_data_service.create_pilot(
    {
        "service_url": "ssh://localhost/tmp/pilotdata-2/",
        "size": 100,
        "affinity_datacenter_label": "eu-de-south",
        "affinity_machine_label": "mymachine-2",
    }
)
class DareManager(object):
    """DARE manager:
        - reads the different configuration files
        - submits compute/data units for the various steps"""

    def __init__(self, conffile="/path/to/conf/file"):
        """Constructor"""
        self.dare_conffile = conffile
        self.workflow = PrepareWorkFlow(self.dare_conffile)
        self.updater = Updater(self.workflow.update_site_db,
                               self.workflow.dare_web_id)
        self.dare_id = "dare-" + str(uuid.uuid1())
        self.data_pilot_service_repo = []
        self.step_threads = {}
        try:
            self.start()
        except KeyboardInterrupt:
            self.quit(message='KeyboardInterrupt')

    def start(self):
        darelogger.info("Creating Compute Engine service ")
        self.pilot_compute_service = PilotComputeService(
            coordination_url=COORDINATION_URL)
        self.pilot_data_service = PilotDataService(
            coordination_url=COORDINATION_URL)

        for compute_pilot, desc in list(
                self.workflow.compute_pilot_repo.items()):
            self.pilot_compute_service.create_pilot(
                pilot_compute_description=desc)

        for data_pilot, desc in list(self.workflow.data_pilot_repo.items()):
            self.data_pilot_service_repo.append(
                self.pilot_data_service.create_pilot(
                    pilot_data_description=desc))

        self.compute_data_service = ComputeDataServiceDecentral()
        self.compute_data_service.add_pilot_compute_service(
            self.pilot_compute_service)
        self.compute_data_service.add_pilot_data_service(
            self.pilot_data_service)

        ### run the steps
        self.step_start_lock = threading.RLock()
        self.step_run_lock = threading.RLock()

        for step_id in list(self.workflow.step_units_repo.keys()):
            darelogger.info(" Submitted step %s " % step_id)
            self.step_start_lock.acquire()
            self.start_thread_step_id = step_id
            self.step_start_lock.release()
            self.step_threads[step_id] = threading.Thread(
                target=self.start_step)
            self.step_threads[step_id].start()

        while True:
            count_step = [
                v.is_alive() for k, v in list(self.step_threads.items())
            ]
            darelogger.info('count_step %s' % count_step)
            if True not in count_step and len(count_step) > 0:
                break
            time.sleep(10)

        darelogger.info(" All Steps Done processing")
        self.quit(message='quit gracefully')

    def check_to_start_step(self, step_id):
        flags = []
        darelogger.info(
            self.workflow.step_units_repo[step_id].UnitInfo['start_after_steps'])
        if self.workflow.step_units_repo[step_id].get_status() == StepUnitStates.New:
            for dep_step_id in self.workflow.step_units_repo[step_id].UnitInfo['start_after_steps']:
                if self.workflow.step_units_repo[dep_step_id].get_status() != StepUnitStates.Done:
                    flags.append(False)
                darelogger.info(
                    self.workflow.step_units_repo[dep_step_id].get_status())
            return False if False in flags else True

    def start_step(self):
        self.step_start_lock.acquire()
        step_id = self.start_thread_step_id
        self.step_start_lock.release()

        while True:
            darelogger.info(" Checking to start step %s " % step_id)
            if self.check_to_start_step(step_id):
                self.run_step(step_id)
                break
            else:
                darelogger.info(" Cannot start this step %s sleeping..."
                                % step_id)
                time.sleep(10)

    def run_step(self, step_id):
        #self.step_run_lock.acquire()
        #job started update status
        this_su = self.workflow.step_units_repo[step_id].UnitInfo
        self.updater.update_status(
            this_su['dare_web_id'],
            "%s in step %s" % ('Running', this_su['name']))

        darelogger.info(" Started running %s " % step_id)

        jobs = []
        job_start_times = {}
        job_states = {}
        NUMBER_JOBS = len(
            self.workflow.step_units_repo[step_id].UnitInfo['compute_units'])
        for cu_id in self.workflow.step_units_repo[step_id].UnitInfo['compute_units']:
            compute_unit_desc = self.workflow.compute_units_repo[cu_id]
            input_dus = compute_unit_desc.pop('input_data_units')
            output_dus = compute_unit_desc.pop('output_data_units')

            input_data_units = []
            for du_id in input_dus:
                input_data_units.append(
                    self.compute_data_service.submit_data_unit(
                        self.workflow.data_units_repo[du_id]))

            output_data_units = []
            for du_id in output_dus:
                output_data_units.append(
                    self.compute_data_service.submit_data_unit(
                        self.workflow.data_units_repo[du_id]))

            compute_unit_desc["input_data"] = [
                du.get_url() for du in input_data_units
            ]
            compute_unit_desc["output_data"] = [{
                du.get_url(): ['std*']
            } for du in output_data_units]

            compute_unit = self.compute_data_service.submit_compute_unit(
                compute_unit_desc)
            darelogger.info("Compute Unit: Description: \n%s"
                            % (str(self.workflow.compute_units_repo[cu_id])))

            jobs.append(compute_unit)
            job_start_times[compute_unit] = time.time()
            job_states[compute_unit] = compute_unit.get_state()

        darelogger.debug(
            "************************ All Jobs submitted ************************")

        while 1:
            finish_counter = 0
            result_map = {}
            for i in range(0, NUMBER_JOBS):
                old_state = job_states[jobs[i]]
                state = jobs[i].get_state()
                # count how many jobs are in each state (the original
                # "if state in result_map == False" check never fired because of
                # comparison chaining; dict.get() already handles the first occurrence)
                result_map[state] = result_map.get(state, 0) + 1
                #print "counter: " + str(i) + " job: " + str(jobs[i]) + " state: " + state
                if old_state != state:
                    darelogger.debug("Job " + str(jobs[i]) + " changed from: "
                                     + old_state + " to " + state)
                if old_state != state and self.has_finished(state) == True:
                    darelogger.info("%s step Job: "
                                    % (self.workflow.step_units_repo[step_id].UnitInfo['name'])
                                    + str(jobs[i]) + " Runtime: "
                                    + str(time.time() - job_start_times[jobs[i]])
                                    + " s.")
                if self.has_finished(state) == True:
                    finish_counter = finish_counter + 1
                job_states[jobs[i]] = state

            darelogger.debug("Current states: " + str(result_map))
            time.sleep(5)
            if finish_counter == NUMBER_JOBS:
                break

        self.workflow.step_units_repo[step_id].set_status(StepUnitStates.Done)

        #self.compute_data_service.wait()
        darelogger.debug(" Compute jobs for step %s complete" % step_id)

        #runtime = time.time()-starttime

        #all jobs done update status
        self.updater.update_status(this_su['dare_web_id'],
                                   "%s is Done" % this_su['name'])

        #self.step_run_lock.release()

    def has_finished(self, state):
        state = state.lower()
        if state == "done" or state == "failed" or state == "canceled":
            return True
        else:
            return False

    def quit(self, message=None):
        if message:
            darelogger.debug(message)

        darelogger.debug("Terminating steps")
        for step, thread in list(self.step_threads.items()):
            darelogger.debug("Stopping step %s" % step)
            # relies on a private CPython 2 Thread API; threads are not stopped cleanly
            thread._Thread__stop()

        darelogger.debug("Terminating Pilot Compute/Data Service")
        try:
            self.compute_data_service.cancel()
            self.pilot_data_service.cancel()
            self.pilot_compute_service.cancel()
        except:
            pass
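# Hedged usage sketch: DareManager kicks off the whole workflow from its constructor, so
# driving it is a one-liner. The configuration file path is a placeholder; COORDINATION_URL,
# darelogger and the PrepareWorkFlow/Updater helpers are expected to be provided by the
# enclosing package.
if __name__ == "__main__":
    DareManager(conffile="/path/to/workflow.cfg")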
# create pilot job service and initiate a pilot job
pilot_compute_description = {
    "service_url": 'fork://localhost',
    "number_of_processes": 1,
    "working_directory": os.getcwd() + "/work/",
    'affinity_datacenter_label': "eu-de-south",
    'affinity_machine_label': "mymachine-1"
}

pilot_compute_service.create_pilot(
    pilot_compute_description=pilot_compute_description)

# create pilot data service (factory for data pilots (physical, distributed storage))
# and pilot data
pilot_data_service = PilotDataService(coordination_url=COORDINATION_URL)
pilot_data_description = {
    "service_url": "ssh://localhost/tmp/pilot-data/",
    "size": 100,
    "affinity_datacenter_label": "eu-de-south",
    "affinity_machine_label": "mymachine-1",
    #"userkey":"/Users/luckow/.ssh/rsa_osg",
}

pilot_data_service.create_pilot(
    pilot_data_description=pilot_data_description)

compute_data_service = ComputeDataService()
compute_data_service.add_pilot_compute_service(pilot_compute_service)
compute_data_service.add_pilot_data_service(pilot_data_service)
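# Hedged sketch of submitting work through the ComputeDataService configured above. The
# compute unit description fields (executable, arguments, output, error) follow the usual
# BigJob/pilot conventions but are assumptions here, as is the /bin/date example; only
# submit_compute_unit() and wait() are taken from the surrounding snippets.
compute_unit_description = {
    "executable": "/bin/date",
    "arguments": [],
    "number_of_processes": 1,
    "output": "stdout.txt",
    "error": "stderr.txt",
    "affinity_datacenter_label": "eu-de-south",
    "affinity_machine_label": "mymachine-1"
}
compute_unit = compute_data_service.submit_compute_unit(compute_unit_description)
compute_data_service.wait()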
# What files? Create Pilot Data Description using remote SSH URLs
# make remote paths
remote_url_list = [
    "ssh://localhost" + os.path.join(base_dir, i) for i in url_list
]
data_unit_description2 = {
    "file_urls": remote_url_list,
    'affinity_datacenter_label': "eu-de-south",
    'affinity_machine_label': "mymachine-2"
}
logging.debug("Pilot Data Description 2: \n%s" % str(data_unit_description2))

# create pilot data service (factory for pilot stores (physical, distributed storage))
pilot_data_service = PilotDataService(coordination_url=COORDINATION_URL)
ps1 = pilot_data_service.create_pilot({
    'service_url': "ssh://localhost/tmp/pilotdata-1/",
    'size': 100,
    'affinity_datacenter_label': "eu-de-south",
    'affinity_machine_label': "mymachine-1"
})

ps2 = pilot_data_service.create_pilot({
    'service_url': "ssh://localhost/tmp/pilotdata-2/",
    'size': 100,
    'affinity_datacenter_label': "eu-de-south",
    'affinity_machine_label': "mymachine-2"
})
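# Hedged sketch: submit data_unit_description2 directly to the second pilot store (ps2),
# which matches its affinity labels. submit_data_unit() on a pilot object, wait() and
# get_url() are used exactly as in the S3 import snippet; the variable name du2 is an
# assumption.
du2 = ps2.submit_data_unit(data_unit_description2)
du2.wait()
logging.debug("Data Unit 2 URL: %s" % du2.get_url())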
import uuid

#logging.basicConfig(level=logging.DEBUG)
#sys.path.append(os.path.join(os.path.dirname(__file__), "../.."))
from pilot import PilotComputeService, PilotDataService, ComputeDataService, State
from bigjob import logger

#COORDINATION_URL = "redis://*****:*****@gw68.quarry.iu.teragrid.org:6379/pcs/pcs-4867ff08-e192-11e1-a694-00003e980000"

if __name__ == "__main__":
    # COORDINATION_URL must be defined (e.g. by uncommenting and filling in the URL above)
    # before this script can run
    print COORDINATION_URL

    # create pilot data service (factory for data pilots (physical, distributed storage))
    # and pilot data
    pilot_data_service = PilotDataService(coordination_url=COORDINATION_URL)

    ###################################################################################################
    # Pick one of the Pilot Data Descriptions below

    # pilot_data_description_aws={
    #     "service_url": "s3://pilot-data-" + str(uuid.uuid1()),
    #     "size": 100,
    #     "affinity_datacenter_label": "us-east-1",
    #     "affinity_machine_label": ""
    # }

    pilot_data_description_india = {
        "service_url": "walrus://149.165.146.135/pilot-data-" + str(uuid.uuid1()),
        #"service_url": "ssh://localhost/tmp/pilot-data-" + str(uuid.uuid1()),
        "size": 100,