def run(self, pdb_on_exception=False):
    """
    Run the rocket (check out a job from the database and execute it).

    Args:
        pdb_on_exception (bool): whether to invoke the debugger on a caught
            exception. Default False.

    Returns:
        bool: False if no Firework was ready to run (nothing matched the
            query); True otherwise, even when the launch ends FIZZLED.
    """
    all_stored_data = {}  # combined stored data for *all* the Tasks
    all_update_spec = {}  # combined update_spec for *all* the Tasks
    all_mod_spec = []  # combined mod_spec for *all* the Tasks

    lp = self.launchpad
    launch_dir = os.path.abspath(os.getcwd())
    logdir = lp.get_logdir() if lp else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=logdir,
                             stream_level=ROCKET_STREAM_LOGLEVEL)

    # check a FW job out of the launchpad
    if lp:
        m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
    else:  # offline mode
        m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

        # record the run start time in the offline bookkeeping file
        fpath = zpath("FW_offline.json")
        with zopen(fpath) as f_in:
            d = json.loads(f_in.read())
            d['started_on'] = datetime.utcnow().isoformat()
            with zopen(fpath, "wt") as f_out:
                f_out.write(json.dumps(d, ensure_ascii=False))

        launch_id = None  # we don't need this in offline mode...

    if not m_fw:
        print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
        return False

    final_state = None
    # initialized before the try so the catch-all handler can always reference them
    ping_stop = None
    btask_stops = []

    try:
        if '_launch_dir' in m_fw.spec and lp:
            prev_dir = launch_dir
            launch_dir = os.path.expandvars(m_fw.spec['_launch_dir'])
            # FIX: was `os.path.abspath(launch_dir)`, which returns a non-empty
            # string and is therefore always truthy, making this branch dead
            # code. `isabs` is the correct test for a relative _launch_dir.
            if not os.path.isabs(launch_dir):
                launch_dir = os.path.normpath(os.path.join(os.getcwd(), launch_dir))

            # thread-safe "mkdir -p"
            try:
                os.makedirs(launch_dir)
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise

            os.chdir(launch_dir)

            if not os.path.samefile(launch_dir, prev_dir):
                lp.change_launch_dir(launch_id, launch_dir)

            # best-effort cleanup of the now-empty original directory
            if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                try:
                    os.rmdir(prev_dir)
                except Exception:
                    pass

        recovery = m_fw.spec.get('_recovery', None)
        if recovery:
            # resume a previously interrupted launch from its checkpoint
            recovery_dir = recovery.get('_prev_dir')
            recovery_mode = recovery.get('_mode')
            starting_task = recovery.get('_task_n')
            all_stored_data.update(recovery.get('_all_stored_data'))
            all_update_spec.update(recovery.get('_all_update_spec'))
            all_mod_spec.extend(recovery.get('_all_mod_spec'))
            if lp:
                l_logger.log(
                    logging.INFO,
                    'Recovering from task number {} in folder {}.'.format(starting_task,
                                                                          recovery_dir))
            if recovery_mode == 'cp' and launch_dir != recovery_dir:
                # copy-mode recovery: bring previous outputs into the new dir
                if lp:
                    l_logger.log(
                        logging.INFO,
                        'Copying data from recovery folder {} to folder {}.'.format(recovery_dir,
                                                                                    launch_dir))
                distutils.dir_util.copy_tree(recovery_dir, launch_dir, update=1)

        else:
            starting_task = 0
            files_in = m_fw.spec.get("_files_in", {})
            prev_files = m_fw.spec.get("_files_prev", {})
            for f in set(files_in.keys()).intersection(prev_files.keys()):
                # We use zopen for the file objects for transparent handling
                # of zipped files. shutil.copyfileobj does the actual copy
                # in chunks that avoid memory issues.
                with zopen(prev_files[f], "rb") as fin, zopen(files_in[f], "wb") as fout:
                    shutil.copyfileobj(fin, fout)

        if lp:
            message = 'RUNNING fw_id: {} in directory: {}'.\
                format(m_fw.fw_id, os.getcwd())
            l_logger.log(logging.INFO, message)

        # write FW.json and/or FW.yaml to the directory
        if PRINT_FW_JSON:
            m_fw.to_file('FW.json', indent=4)
        if PRINT_FW_YAML:
            m_fw.to_file('FW.yaml')

        my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
        my_spec["_fw_env"] = self.fworker.env

        # set up heartbeat (pinging the server that we're still alive)
        ping_stop = start_ping_launch(lp, launch_id)

        # start background tasks
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                btask_stops.append(start_background_task(bt, m_fw.spec))

        # execute the Firetasks!
        for t_counter, t in enumerate(m_fw.tasks[starting_task:], start=starting_task):
            # persist a checkpoint before each task so a crash can be recovered
            checkpoint = {'_task_n': t_counter,
                          '_all_stored_data': all_stored_data,
                          '_all_update_spec': all_update_spec,
                          '_all_mod_spec': all_mod_spec}
            Rocket.update_checkpoint(lp, launch_dir, launch_id, checkpoint)

            if lp:
                l_logger.log(logging.INFO, "Task started: %s." % t.fw_name)

            if my_spec.get("_add_launchpad_and_fw_id"):
                t.fw_id = m_fw.fw_id
                if FWData().MULTIPROCESSING:
                    # hack because AutoProxy manager can't access attributes
                    t.launchpad = LaunchPad.from_dict(self.launchpad.to_dict())
                else:
                    t.launchpad = self.launchpad

            if my_spec.get("_add_fworker"):
                t.fworker = self.fworker

            try:
                m_action = t.run_task(my_spec)
            except BaseException as e:
                traceback.print_exc()
                tb = traceback.format_exc()
                stop_backgrounds(ping_stop, btask_stops)
                do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
                if pdb_on_exception:
                    pdb.post_mortem()
                # If the exception is serializable, save its details
                try:
                    exception_details = e.to_dict()
                except AttributeError:
                    exception_details = None
                except BaseException as e:
                    if lp:
                        l_logger.log(logging.WARNING,
                                     "Exception couldn't be serialized: %s " % e)
                    exception_details = None

                try:
                    m_task = t.to_dict()
                except Exception:
                    m_task = None

                m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                                 '_task': m_task,
                                                 '_exception': {'_stacktrace': tb,
                                                                '_details': exception_details}},
                                    exit=True)
                m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

                if lp:
                    final_state = 'FIZZLED'
                    lp.complete_launch(launch_id, m_action, final_state)
                else:
                    fpath = zpath("FW_offline.json")
                    with zopen(fpath) as f_in:
                        d = json.loads(f_in.read())
                        d['fwaction'] = m_action.to_dict()
                        d['state'] = 'FIZZLED'
                        d['completed_on'] = datetime.utcnow().isoformat()
                        with zopen(fpath, "wt") as f_out:
                            f_out.write(json.dumps(d, ensure_ascii=False))

                return True

            # read in a FWAction from a file, in case the task is not Python and cannot return
            # it explicitly
            if os.path.exists('FWAction.json'):
                m_action = FWAction.from_file('FWAction.json')
            elif os.path.exists('FWAction.yaml'):
                m_action = FWAction.from_file('FWAction.yaml')

            if not m_action:
                m_action = FWAction()

            # update the global stored data with the data to store and update from this
            # particular Task
            all_stored_data.update(m_action.stored_data)
            all_update_spec.update(m_action.update_spec)
            all_mod_spec.extend(m_action.mod_spec)

            # update spec for next task as well
            my_spec.update(m_action.update_spec)
            for mod in m_action.mod_spec:
                apply_mod(mod, my_spec)
            if lp:
                l_logger.log(logging.INFO, "Task completed: %s " % t.fw_name)
            if m_action.skip_remaining_tasks:
                break

        # add job packing info if this is needed
        if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
            all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

        # perform finishing operation
        stop_backgrounds(ping_stop, btask_stops)
        for b in btask_stops:
            b.set()
        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor

        # last background monitors
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                if bt.run_on_finish:
                    for task in bt.tasks:
                        task.run_task(m_fw.spec)

        m_action.stored_data = all_stored_data
        m_action.mod_spec = all_mod_spec
        m_action.update_spec = all_update_spec

        m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

        if lp:
            final_state = 'COMPLETED'
            lp.complete_launch(launch_id, m_action, final_state)
        else:
            fpath = zpath("FW_offline.json")
            with zopen(fpath) as f_in:
                d = json.loads(f_in.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'COMPLETED'
                d['completed_on'] = datetime.utcnow().isoformat()
                with zopen(fpath, "wt") as f_out:
                    f_out.write(json.dumps(d, ensure_ascii=False))

        return True

    except LockedWorkflowError as e:
        # the run itself finished; only the DB update failed on a lock
        l_logger.log(logging.DEBUG, traceback.format_exc())
        l_logger.log(logging.WARNING,
                     "Firework {} reached final state {} but couldn't complete the update of "
                     "the database. Reason: {}\nRefresh the WF to recover the result "
                     "(lpad admin refresh -i {}).".format(
                         self.fw_id, final_state, e, self.fw_id))
        return True

    except Exception:
        # problems while processing the results. high probability of malformed data.
        traceback.print_exc()
        stop_backgrounds(ping_stop, btask_stops)
        # restore initial state to prevent the raise of further exceptions
        if lp:
            lp.restore_backup_data(launch_id, m_fw.fw_id)

        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        # the action produced by the task is discarded
        m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                         '_task': None,
                                         '_exception': {'_stacktrace': traceback.format_exc(),
                                                        '_details': None}},
                            exit=True)

        try:
            m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)
        except Exception:
            traceback.print_exc()

        if lp:
            try:
                lp.complete_launch(launch_id, m_action, 'FIZZLED')
            except LockedWorkflowError as e:
                l_logger.log(logging.DEBUG, traceback.format_exc())
                l_logger.log(logging.WARNING,
                             "Firework {} fizzled but couldn't complete the update of the database."
                             " Reason: {}\nRefresh the WF to recover the result "
                             "(lpad admin refresh -i {}).".format(
                                 self.fw_id, final_state, e, self.fw_id))
            return True
        else:
            fpath = zpath("FW_offline.json")
            with zopen(fpath) as f_in:
                d = json.loads(f_in.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'FIZZLED'
                d['completed_on'] = datetime.utcnow().isoformat()
                with zopen(fpath, "wt") as f_out:
                    f_out.write(json.dumps(d, ensure_ascii=False))

            return True
def run(self, pdb_on_exception=False):
    """
    Run the rocket (check out a job from the database and execute it).

    Args:
        pdb_on_exception (bool): whether to invoke the debugger on a caught
            exception. Default False.

    Returns:
        bool: False if no Firework was ready to run (nothing matched the
            query); True otherwise, even when the launch ends FIZZLED.
    """
    all_stored_data = {}  # combined stored data for *all* the Tasks
    all_update_spec = {}  # combined update_spec for *all* the Tasks
    all_mod_spec = []  # combined mod_spec for *all* the Tasks

    lp = self.launchpad
    launch_dir = os.path.abspath(os.getcwd())
    logdir = lp.get_logdir() if lp else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=logdir,
                             stream_level=ROCKET_STREAM_LOGLEVEL)

    # check a FW job out of the launchpad
    if lp:
        m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
    else:  # offline mode
        m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

        # record the run start time in the offline bookkeeping file
        fpath = zpath("FW_offline.json")
        with zopen(fpath) as f_in:
            d = json.loads(f_in.read())
            d['started_on'] = datetime.utcnow().isoformat()
            with zopen(fpath, "wt") as f_out:
                f_out.write(json.dumps(d, ensure_ascii=False))

        launch_id = None  # we don't need this in offline mode...

    if not m_fw:
        print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
        return False

    final_state = None
    # initialized before the try so the catch-all handler can always reference them
    ping_stop = None
    btask_stops = []

    try:
        if '_launch_dir' in m_fw.spec and lp:
            prev_dir = launch_dir
            launch_dir = os.path.expandvars(m_fw.spec['_launch_dir'])
            # FIX: was `os.path.abspath(launch_dir)`, which returns a non-empty
            # string and is therefore always truthy, making this branch dead
            # code. `isabs` is the correct test for a relative _launch_dir.
            if not os.path.isabs(launch_dir):
                launch_dir = os.path.normpath(os.path.join(os.getcwd(), launch_dir))

            # thread-safe "mkdir -p"
            try:
                os.makedirs(launch_dir)
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise

            os.chdir(launch_dir)

            if not os.path.samefile(launch_dir, prev_dir):
                lp.change_launch_dir(launch_id, launch_dir)

            # best-effort cleanup of the now-empty original directory
            # FIX: bare `except:` would also swallow KeyboardInterrupt/SystemExit
            if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                try:
                    os.rmdir(prev_dir)
                except Exception:
                    pass

        recovery = m_fw.spec.get('_recovery', None)
        if recovery:
            # resume a previously interrupted launch from its checkpoint
            recovery_dir = recovery.get('_prev_dir')
            recovery_mode = recovery.get('_mode')
            starting_task = recovery.get('_task_n')
            all_stored_data.update(recovery.get('_all_stored_data'))
            all_update_spec.update(recovery.get('_all_update_spec'))
            all_mod_spec.extend(recovery.get('_all_mod_spec'))
            if lp:
                l_logger.log(
                    logging.INFO,
                    'Recovering from task number {} in folder {}.'.format(starting_task,
                                                                          recovery_dir))
            if recovery_mode == 'cp' and launch_dir != recovery_dir:
                # copy-mode recovery: bring previous outputs into the new dir
                if lp:
                    l_logger.log(
                        logging.INFO,
                        'Copying data from recovery folder {} to folder {}.'.format(recovery_dir,
                                                                                    launch_dir))
                distutils.dir_util.copy_tree(recovery_dir, launch_dir, update=1)

        else:
            starting_task = 0
            files_in = m_fw.spec.get("_files_in", {})
            prev_files = m_fw.spec.get("_files_prev", {})
            for f in set(files_in.keys()).intersection(prev_files.keys()):
                # We use zopen for the file objects for transparent handling
                # of zipped files. shutil.copyfileobj does the actual copy
                # in chunks that avoid memory issues.
                with zopen(prev_files[f], "rb") as fin, zopen(files_in[f], "wb") as fout:
                    shutil.copyfileobj(fin, fout)

        if lp:
            message = 'RUNNING fw_id: {} in directory: {}'.\
                format(m_fw.fw_id, os.getcwd())
            l_logger.log(logging.INFO, message)

        # write FW.json and/or FW.yaml to the directory
        if PRINT_FW_JSON:
            m_fw.to_file('FW.json', indent=4)
        if PRINT_FW_YAML:
            m_fw.to_file('FW.yaml')

        my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
        my_spec["_fw_env"] = self.fworker.env

        # set up heartbeat (pinging the server that we're still alive)
        ping_stop = start_ping_launch(lp, launch_id)

        # start background tasks
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                btask_stops.append(start_background_task(bt, m_fw.spec))

        # execute the Firetasks!
        for t_counter, t in enumerate(m_fw.tasks[starting_task:], start=starting_task):
            # persist a checkpoint before each task so a crash can be recovered
            checkpoint = {'_task_n': t_counter,
                          '_all_stored_data': all_stored_data,
                          '_all_update_spec': all_update_spec,
                          '_all_mod_spec': all_mod_spec}
            Rocket.update_checkpoint(lp, launch_dir, launch_id, checkpoint)

            if lp:
                l_logger.log(logging.INFO, "Task started: %s." % t.fw_name)

            if my_spec.get("_add_launchpad_and_fw_id"):
                t.fw_id = m_fw.fw_id
                if FWData().MULTIPROCESSING:
                    # hack because AutoProxy manager can't access attributes
                    t.launchpad = LaunchPad.from_dict(self.launchpad.to_dict())
                else:
                    t.launchpad = self.launchpad

            if my_spec.get("_add_fworker"):
                t.fworker = self.fworker

            try:
                m_action = t.run_task(my_spec)
            except BaseException as e:
                traceback.print_exc()
                tb = traceback.format_exc()
                stop_backgrounds(ping_stop, btask_stops)
                do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
                if pdb_on_exception:
                    pdb.post_mortem()
                # If the exception is serializable, save its details
                try:
                    exception_details = e.to_dict()
                except AttributeError:
                    exception_details = None
                except BaseException as e:
                    if lp:
                        l_logger.log(logging.WARNING,
                                     "Exception couldn't be serialized: %s " % e)
                    exception_details = None

                try:
                    m_task = t.to_dict()
                except Exception:  # FIX: was a bare `except:`
                    m_task = None

                m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                                 '_task': m_task,
                                                 '_exception': {'_stacktrace': tb,
                                                                '_details': exception_details}},
                                    exit=True)
                m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

                if lp:
                    final_state = 'FIZZLED'
                    lp.complete_launch(launch_id, m_action, final_state)
                else:
                    fpath = zpath("FW_offline.json")
                    with zopen(fpath) as f_in:
                        d = json.loads(f_in.read())
                        d['fwaction'] = m_action.to_dict()
                        d['state'] = 'FIZZLED'
                        d['completed_on'] = datetime.utcnow().isoformat()
                        with zopen(fpath, "wt") as f_out:
                            f_out.write(json.dumps(d, ensure_ascii=False))

                return True

            # read in a FWAction from a file, in case the task is not Python and cannot return
            # it explicitly
            if os.path.exists('FWAction.json'):
                m_action = FWAction.from_file('FWAction.json')
            elif os.path.exists('FWAction.yaml'):
                m_action = FWAction.from_file('FWAction.yaml')

            if not m_action:
                m_action = FWAction()

            # update the global stored data with the data to store and update from this
            # particular Task
            all_stored_data.update(m_action.stored_data)
            all_update_spec.update(m_action.update_spec)
            all_mod_spec.extend(m_action.mod_spec)

            # update spec for next task as well
            my_spec.update(m_action.update_spec)
            for mod in m_action.mod_spec:
                apply_mod(mod, my_spec)
            if lp:
                l_logger.log(logging.INFO, "Task completed: %s " % t.fw_name)
            if m_action.skip_remaining_tasks:
                break

        # add job packing info if this is needed
        if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
            all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

        # perform finishing operation
        stop_backgrounds(ping_stop, btask_stops)
        for b in btask_stops:
            b.set()
        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor

        # last background monitors
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                if bt.run_on_finish:
                    for task in bt.tasks:
                        task.run_task(m_fw.spec)

        m_action.stored_data = all_stored_data
        m_action.mod_spec = all_mod_spec
        m_action.update_spec = all_update_spec

        m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

        if lp:
            final_state = 'COMPLETED'
            lp.complete_launch(launch_id, m_action, final_state)
        else:
            fpath = zpath("FW_offline.json")
            with zopen(fpath) as f_in:
                d = json.loads(f_in.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'COMPLETED'
                d['completed_on'] = datetime.utcnow().isoformat()
                with zopen(fpath, "wt") as f_out:
                    f_out.write(json.dumps(d, ensure_ascii=False))

        return True

    except LockedWorkflowError as e:
        # the run itself finished; only the DB update failed on a lock
        l_logger.log(logging.DEBUG, traceback.format_exc())
        l_logger.log(logging.WARNING,
                     "Firework {} reached final state {} but couldn't complete the update of "
                     "the database. Reason: {}\nRefresh the WF to recover the result "
                     "(lpad admin refresh -i {}).".format(
                         self.fw_id, final_state, e, self.fw_id))
        return True

    except Exception:  # FIX: was a bare `except:`
        # problems while processing the results. high probability of malformed data.
        traceback.print_exc()
        stop_backgrounds(ping_stop, btask_stops)
        # restore initial state to prevent the raise of further exceptions
        if lp:
            lp.restore_backup_data(launch_id, m_fw.fw_id)

        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        # the action produced by the task is discarded
        m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                         '_task': None,
                                         '_exception': {'_stacktrace': traceback.format_exc(),
                                                        '_details': None}},
                            exit=True)

        try:
            m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)
        except Exception:  # FIX: was a bare `except:`
            traceback.print_exc()

        if lp:
            try:
                lp.complete_launch(launch_id, m_action, 'FIZZLED')
            except LockedWorkflowError as e:
                l_logger.log(logging.DEBUG, traceback.format_exc())
                l_logger.log(logging.WARNING,
                             "Firework {} fizzled but couldn't complete the update of the database."
                             " Reason: {}\nRefresh the WF to recover the result "
                             "(lpad admin refresh -i {}).".format(
                                 self.fw_id, final_state, e, self.fw_id))
            return True
        else:
            fpath = zpath("FW_offline.json")
            with zopen(fpath) as f_in:
                d = json.loads(f_in.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'FIZZLED'
                d['completed_on'] = datetime.utcnow().isoformat()
                with zopen(fpath, "wt") as f_out:
                    f_out.write(json.dumps(d, ensure_ascii=False))

            return True
def run(self):
    """
    Run the rocket (check out a job from the database and execute it).

    Returns:
        bool: False if no Firework was ready to run (nothing matched the
            query); True otherwise, even when the launch ends FIZZLED.
    """
    all_stored_data = {}  # combined stored data for *all* the Tasks
    all_update_spec = {}  # combined update_spec for *all* the Tasks
    all_mod_spec = []  # combined mod_spec for *all* the Tasks

    lp = self.launchpad
    launch_dir = os.path.abspath(os.getcwd())

    # check a FW job out of the launchpad
    if lp:
        m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
    else:  # offline mode
        m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

        # record the run start time in the offline bookkeeping file
        with open('FW_offline.json', 'r+') as f:
            d = json.loads(f.read())
            d['started_on'] = datetime.utcnow().isoformat()
            f.seek(0)
            f.write(json.dumps(d))
            f.truncate()

        launch_id = None  # we don't need this in offline mode...

    if not m_fw:
        print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
        return False

    if lp:
        message = 'RUNNING fw_id: {} in directory: {}'.\
            format(m_fw.fw_id, os.getcwd())
        lp.log_message(logging.INFO, message)

    # write FW.json and/or FW.yaml to the directory
    if PRINT_FW_JSON:
        m_fw.to_file('FW.json', indent=4)
    if PRINT_FW_YAML:
        m_fw.to_file('FW.yaml')

    # FIX: initialized before the try; previously they were first assigned
    # inside the try, so the catch-all handler below raised NameError when an
    # exception fired before start_ping_launch ran.
    ping_stop = None
    btask_stops = []

    try:
        if '_launch_dir' in m_fw.spec:
            prev_dir = launch_dir
            launch_dir = os.path.expandvars(m_fw.spec['_launch_dir'])

            # thread-safe "mkdir -p"
            try:
                os.makedirs(launch_dir)
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise

            os.chdir(launch_dir)
            launch_dir = os.path.abspath(os.getcwd())

            if lp:
                lp.change_launch_dir(launch_id, launch_dir)

            # best-effort cleanup of the now-empty original directory
            # FIX: bare `except:` would also swallow KeyboardInterrupt/SystemExit
            if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                try:
                    os.rmdir(prev_dir)
                except Exception:
                    pass

        if m_fw.spec.get('_recover_launch', None):
            # resume from the task index recorded by a previous failed launch
            launch_to_recover = lp.get_launch_by_id(m_fw.spec['_recover_launch']['_launch_id'])
            starting_task = launch_to_recover.action.stored_data.get('_exception', {}).get('_failed_task_n', 0)
            recover_launch_dir = launch_to_recover.launch_dir
            if lp:
                lp.log_message(
                    logging.INFO,
                    'Recovering from task number {} in folder {}.'.format(starting_task,
                                                                          recover_launch_dir))
            if m_fw.spec['_recover_launch']['_recover_mode'] == 'cp' and launch_dir != recover_launch_dir:
                # copy-mode recovery: bring previous outputs into the new dir
                if lp:
                    lp.log_message(
                        logging.INFO,
                        'Copying data from recovery folder {} to folder {}.'.format(recover_launch_dir,
                                                                                    launch_dir))
                distutils.dir_util.copy_tree(recover_launch_dir, launch_dir, update=1)

        else:
            starting_task = 0

        my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
        my_spec["_fw_env"] = self.fworker.env

        # set up heartbeat (pinging the server that we're still alive)
        ping_stop = start_ping_launch(lp, launch_id)

        # start background tasks
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                btask_stops.append(start_background_task(bt, m_fw.spec))

        # execute the FireTasks!
        for t_counter, t in enumerate(m_fw.tasks[starting_task:], start=starting_task):
            if lp:
                lp.log_message(logging.INFO, "Task started: %s." % t.fw_name)
            try:
                m_action = t.run_task(my_spec)
            except BaseException as e:
                traceback.print_exc()
                tb = traceback.format_exc()
                stop_backgrounds(ping_stop, btask_stops)
                do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
                # If the exception is serializable, save its details
                try:
                    exception_details = e.to_dict()
                except AttributeError:
                    exception_details = None
                except BaseException as e:
                    if lp:
                        lp.log_message(logging.WARNING,
                                       "Exception couldn't be serialized: %s " % e)
                    exception_details = None

                try:
                    m_task = t.to_dict()
                except Exception:  # FIX: was a bare `except:`
                    m_task = None

                m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                                 '_task': m_task,
                                                 '_exception': {'_stacktrace': tb,
                                                                '_details': exception_details,
                                                                '_failed_task_n': t_counter}},
                                    exit=True)

                if lp:
                    lp.complete_launch(launch_id, m_action, 'FIZZLED')
                else:
                    with open('FW_offline.json', 'r+') as f:
                        d = json.loads(f.read())
                        d['fwaction'] = m_action.to_dict()
                        d['state'] = 'FIZZLED'
                        f.seek(0)
                        f.write(json.dumps(d))
                        f.truncate()

                return True

            # read in a FWAction from a file, in case the task is not Python and cannot return it explicitly
            if os.path.exists('FWAction.json'):
                m_action = FWAction.from_file('FWAction.json')
            elif os.path.exists('FWAction.yaml'):
                m_action = FWAction.from_file('FWAction.yaml')

            if not m_action:
                m_action = FWAction()

            # update the global stored data with the data to store and update from this particular Task
            all_stored_data.update(m_action.stored_data)
            all_update_spec.update(m_action.update_spec)
            all_mod_spec.extend(m_action.mod_spec)

            # update spec for next task as well
            my_spec.update(m_action.update_spec)
            for mod in m_action.mod_spec:
                apply_mod(mod, my_spec)
            if lp:
                lp.log_message(logging.INFO, "Task completed: %s " % t.fw_name)
            if m_action.skip_remaining_tasks:
                break

        # add job packing info if this is needed
        if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
            all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

        # perform finishing operation
        stop_backgrounds(ping_stop, btask_stops)
        for b in btask_stops:
            b.set()
        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor

        # last background monitors
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                if bt.run_on_finish:
                    for task in bt.tasks:
                        task.run_task(m_fw.spec)

        m_action.stored_data = all_stored_data
        m_action.mod_spec = all_mod_spec
        m_action.update_spec = all_update_spec

        if lp:
            lp.complete_launch(launch_id, m_action, 'COMPLETED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'COMPLETED'
                d['completed_on'] = datetime.utcnow().isoformat()
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        return True

    except Exception:  # FIX: was a bare `except:`
        # problems while processing the results. high probability of malformed data.
        traceback.print_exc()
        stop_backgrounds(ping_stop, btask_stops)
        # restore initial state to prevent the raise of further exceptions
        if lp:
            lp.restore_backup_data(launch_id, m_fw.fw_id)

        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        # the action produced by the task is discarded
        m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                         '_task': None,
                                         '_exception': {'_stacktrace': traceback.format_exc(),
                                                        '_details': None}},
                            exit=True)
        if lp:
            lp.complete_launch(launch_id, m_action, 'FIZZLED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'FIZZLED'
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        return True
def run(self):
    """
    Run the rocket (check out a job from the database and execute it).

    Returns:
        bool: False if no Firework was ready to run (nothing matched the
            query); True otherwise, even when the launch ends FIZZLED.
    """
    all_stored_data = {}  # combined stored data for *all* the Tasks
    all_update_spec = {}  # combined update_spec for *all* the Tasks
    all_mod_spec = []  # combined mod_spec for *all* the Tasks

    lp = self.launchpad
    launch_dir = os.path.abspath(os.getcwd())

    # check a FW job out of the launchpad
    if lp:
        m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
    else:  # offline mode
        m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

        # record the run start time in the offline bookkeeping file
        with open('FW_offline.json', 'r+') as f:
            d = json.loads(f.read())
            d['started_on'] = datetime.utcnow().isoformat()
            f.seek(0)
            f.write(json.dumps(d))
            f.truncate()

        launch_id = None  # we don't need this in offline mode...

    if not m_fw:
        print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
        return False

    # FIX: initialized before the try; previously they were first assigned
    # inside the try, so the catch-all handler below raised NameError when an
    # exception fired before start_ping_launch ran.
    ping_stop = None
    btask_stops = []

    try:
        if '_launch_dir' in m_fw.spec and lp:
            prev_dir = launch_dir
            launch_dir = os.path.expandvars(m_fw.spec['_launch_dir'])
            # FIX: was `os.path.abspath(launch_dir)`, which returns a non-empty
            # string and is therefore always truthy, making this branch dead
            # code. `isabs` is the correct test for a relative _launch_dir.
            if not os.path.isabs(launch_dir):
                launch_dir = os.path.normpath(os.path.join(os.getcwd(), launch_dir))

            # thread-safe "mkdir -p"
            try:
                os.makedirs(launch_dir)
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise

            os.chdir(launch_dir)

            if not os.path.samefile(launch_dir, prev_dir):
                lp.change_launch_dir(launch_id, launch_dir)

            # best-effort cleanup of the now-empty original directory
            # FIX: bare `except:` would also swallow KeyboardInterrupt/SystemExit
            if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                try:
                    os.rmdir(prev_dir)
                except Exception:
                    pass

        if m_fw.spec.get('_recover_launch', None):
            # resume from the task index recorded by a previous failed launch
            launch_to_recover = lp.get_launch_by_id(m_fw.spec['_recover_launch']['_launch_id'])
            starting_task = launch_to_recover.action.stored_data.get('_exception', {}).get('_failed_task_n', 0)
            recover_launch_dir = launch_to_recover.launch_dir
            if lp:
                lp.log_message(
                    logging.INFO,
                    'Recovering from task number {} in folder {}.'.format(starting_task,
                                                                          recover_launch_dir))
            if m_fw.spec['_recover_launch']['_recover_mode'] == 'cp' and launch_dir != recover_launch_dir:
                # copy-mode recovery: bring previous outputs into the new dir
                if lp:
                    lp.log_message(
                        logging.INFO,
                        'Copying data from recovery folder {} to folder {}.'.format(recover_launch_dir,
                                                                                    launch_dir))
                distutils.dir_util.copy_tree(recover_launch_dir, launch_dir, update=1)

        else:
            starting_task = 0

        if lp:
            message = 'RUNNING fw_id: {} in directory: {}'.\
                format(m_fw.fw_id, os.getcwd())
            lp.log_message(logging.INFO, message)

        # write FW.json and/or FW.yaml to the directory
        if PRINT_FW_JSON:
            m_fw.to_file('FW.json', indent=4)
        if PRINT_FW_YAML:
            m_fw.to_file('FW.yaml')

        my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
        my_spec["_fw_env"] = self.fworker.env

        # set up heartbeat (pinging the server that we're still alive)
        ping_stop = start_ping_launch(lp, launch_id)

        # start background tasks
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                btask_stops.append(start_background_task(bt, m_fw.spec))

        # execute the FireTasks!
        for t_counter, t in enumerate(m_fw.tasks[starting_task:], start=starting_task):
            if lp:
                lp.log_message(logging.INFO, "Task started: %s." % t.fw_name)

            if my_spec.get("_add_launchpad_and_fw_id"):
                t.launchpad = self.launchpad
                t.fw_id = m_fw.fw_id

            try:
                m_action = t.run_task(my_spec)
            except BaseException as e:
                traceback.print_exc()
                tb = traceback.format_exc()
                stop_backgrounds(ping_stop, btask_stops)
                do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
                # If the exception is serializable, save its details
                try:
                    exception_details = e.to_dict()
                except AttributeError:
                    exception_details = None
                except BaseException as e:
                    if lp:
                        lp.log_message(logging.WARNING,
                                       "Exception couldn't be serialized: %s " % e)
                    exception_details = None

                try:
                    m_task = t.to_dict()
                except Exception:  # FIX: was a bare `except:`
                    m_task = None

                m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                                 '_task': m_task,
                                                 '_exception': {'_stacktrace': tb,
                                                                '_details': exception_details,
                                                                '_failed_task_n': t_counter}},
                                    exit=True)
                m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

                if lp:
                    lp.complete_launch(launch_id, m_action, 'FIZZLED')
                else:
                    with open('FW_offline.json', 'r+') as f:
                        d = json.loads(f.read())
                        d['fwaction'] = m_action.to_dict()
                        d['state'] = 'FIZZLED'
                        f.seek(0)
                        f.write(json.dumps(d))
                        f.truncate()

                return True

            # read in a FWAction from a file, in case the task is not Python and cannot return it explicitly
            if os.path.exists('FWAction.json'):
                m_action = FWAction.from_file('FWAction.json')
            elif os.path.exists('FWAction.yaml'):
                m_action = FWAction.from_file('FWAction.yaml')

            if not m_action:
                m_action = FWAction()

            # update the global stored data with the data to store and update from this particular Task
            all_stored_data.update(m_action.stored_data)
            all_update_spec.update(m_action.update_spec)
            all_mod_spec.extend(m_action.mod_spec)

            # update spec for next task as well
            my_spec.update(m_action.update_spec)
            for mod in m_action.mod_spec:
                apply_mod(mod, my_spec)
            if lp:
                lp.log_message(logging.INFO, "Task completed: %s " % t.fw_name)
            if m_action.skip_remaining_tasks:
                break

        # add job packing info if this is needed
        if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
            all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

        # perform finishing operation
        stop_backgrounds(ping_stop, btask_stops)
        for b in btask_stops:
            b.set()
        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor

        # last background monitors
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                if bt.run_on_finish:
                    for task in bt.tasks:
                        task.run_task(m_fw.spec)

        m_action.stored_data = all_stored_data
        m_action.mod_spec = all_mod_spec
        m_action.update_spec = all_update_spec

        m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

        if lp:
            lp.complete_launch(launch_id, m_action, 'COMPLETED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'COMPLETED'
                d['completed_on'] = datetime.utcnow().isoformat()
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        return True

    except Exception:  # FIX: was a bare `except:`
        # problems while processing the results. high probability of malformed data.
        traceback.print_exc()
        stop_backgrounds(ping_stop, btask_stops)
        # restore initial state to prevent the raise of further exceptions
        if lp:
            lp.restore_backup_data(launch_id, m_fw.fw_id)

        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        # the action produced by the task is discarded
        m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                         '_task': None,
                                         '_exception': {'_stacktrace': traceback.format_exc(),
                                                        '_details': None}},
                            exit=True)
        try:
            m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)
        except Exception:  # FIX: was a bare `except:`
            traceback.print_exc()

        if lp:
            lp.complete_launch(launch_id, m_action, 'FIZZLED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'FIZZLED'
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        return True
def run(self):
    """
    Run the rocket: check a job (FireWork) out of the database — or, in
    offline mode, read it from ``FW.json`` in the current directory — and
    execute its tasks sequentially.

    Returns:
        bool: True if a FireWork was checked out and processed (whether it
        COMPLETED or FIZZLED); False if no FireWork matched the query.
    """
    all_stored_data = {}   # combined stored_data for *all* the Tasks
    all_update_spec = {}   # combined update_spec for *all* the Tasks
    all_mod_spec = []      # combined mod_spec for *all* the Tasks

    lp = self.launchpad
    launch_dir = os.path.abspath(os.getcwd())

    # check a FW job out of the launchpad
    if lp:
        m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
    else:  # offline mode
        m_fw = FireWork.from_file(os.path.join(os.getcwd(), "FW.json"))
        # set the run start time in the offline bookkeeping file
        with open('FW_offline.json', 'r+') as f:
            d = json.loads(f.read())
            d['started_on'] = datetime.utcnow().isoformat()
            f.seek(0)
            f.write(json.dumps(d))
            f.truncate()
        launch_id = None  # we don't need this in offline mode...

    if not m_fw:
        print("No FireWorks are ready to run and match query! {}".format(
            self.fworker.query))
        return False

    if '_launch_dir' in m_fw.spec:
        prev_dir = launch_dir
        os.chdir(m_fw.spec['_launch_dir'])
        launch_dir = os.path.abspath(os.getcwd())
        if lp:
            lp._change_launch_dir(launch_id, launch_dir)
        if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
            # best-effort removal of the now-empty original directory
            try:
                os.rmdir(prev_dir)
            except Exception:  # was bare except; don't swallow SystemExit/KeyboardInterrupt
                pass

    # write FW.json and/or FW.yaml to the directory
    if PRINT_FW_JSON:
        m_fw.to_file('FW.json', indent=4)
    if PRINT_FW_YAML:
        m_fw.to_file('FW.yaml')

    try:
        my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original

        # set up heartbeat (pinging the server that we're still alive)
        ping_stop = start_ping_launch(lp, launch_id)

        # start background tasks
        btask_stops = []
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                btask_stops.append(start_background_task(bt, m_fw.spec))

        # execute the FireTasks!
        for my_task in m_fw.tasks:
            m_action = my_task.run_task(my_spec)

            # read in a FWAction from a file, in case the task is not
            # Python and cannot return it explicitly
            if os.path.exists('FWAction.json'):
                m_action = FWAction.from_file('FWAction.json')
            elif os.path.exists('FWAction.yaml'):
                m_action = FWAction.from_file('FWAction.yaml')

            if not m_action:
                m_action = FWAction()

            # update the global stored data with the data to store and
            # update from this particular Task
            all_stored_data.update(m_action.stored_data)
            all_update_spec.update(m_action.update_spec)
            all_mod_spec.extend(m_action.mod_spec)

            # update spec for next task as well
            my_spec.update(m_action.update_spec)
            for mod in m_action.mod_spec:
                apply_mod(mod, my_spec)

            if m_action.skip_remaining_tasks:
                break

        # add job packing info if this is needed
        if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
            all_stored_data['multiprocess_name'] = \
                multiprocessing.current_process().name

        # perform finishing operation
        stop_backgrounds(ping_stop, btask_stops)
        for b in btask_stops:
            b.set()
        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor

        # last background monitors
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                if bt.run_on_finish:
                    for task in bt.tasks:
                        task.run_task(m_fw.spec)

        # the final action reports the accumulated data from every task
        m_action.stored_data = all_stored_data
        m_action.mod_spec = all_mod_spec
        m_action.update_spec = all_update_spec

        if lp:
            lp.complete_launch(launch_id, m_action, 'COMPLETED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'COMPLETED'
                d['completed_on'] = datetime.utcnow().isoformat()
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        return True

    except Exception:  # was bare except; let SystemExit/KeyboardInterrupt propagate
        stop_backgrounds(ping_stop, btask_stops)
        traceback.print_exc()
        # record the failure; my_task may be unbound or unserializable,
        # so fall back to a None task payload
        try:
            m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                             '_task': my_task.to_dict(),
                                             '_exception': traceback.format_exc()},
                                exit=True)
        except Exception:
            m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                             '_task': None,
                                             '_exception': traceback.format_exc()},
                                exit=True)
        if lp:
            lp.complete_launch(launch_id, m_action, 'FIZZLED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'FIZZLED'
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()
        return True
def run(self):
    """
    Run the rocket: check a job (Firework) out of the database — or, in
    offline mode, read it from ``FW.json`` in the current directory — and
    execute its tasks sequentially.

    Returns:
        bool: True if a Firework was checked out and processed (whether it
        COMPLETED or FIZZLED); False if no Firework matched the query.
    """
    all_stored_data = {}   # combined stored_data for *all* the Tasks
    all_update_spec = {}   # combined update_spec for *all* the Tasks
    all_mod_spec = []      # combined mod_spec for *all* the Tasks

    lp = self.launchpad
    launch_dir = os.path.abspath(os.getcwd())

    # check a FW job out of the launchpad
    if lp:
        m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
    else:  # offline mode
        m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))
        # set the run start time in the offline bookkeeping file
        with open('FW_offline.json', 'r+') as f:
            d = json.loads(f.read())
            d['started_on'] = datetime.utcnow().isoformat()
            f.seek(0)
            f.write(json.dumps(d))
            f.truncate()
        launch_id = None  # we don't need this in offline mode...

    if not m_fw:
        print("No FireWorks are ready to run and match query! {}".format(
            self.fworker.query))
        return False

    if '_launch_dir' in m_fw.spec:
        prev_dir = launch_dir
        os.chdir(m_fw.spec['_launch_dir'])
        launch_dir = os.path.abspath(os.getcwd())
        if lp:
            lp.change_launch_dir(launch_id, launch_dir)
        if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
            # best-effort removal of the now-empty original directory
            try:
                os.rmdir(prev_dir)
            except Exception:  # was bare except; don't swallow SystemExit/KeyboardInterrupt
                pass

    if lp:
        message = 'RUNNING fw_id: {} in directory: {}'.\
            format(m_fw.fw_id, os.getcwd())
        lp.log_message(logging.INFO, message)

    # write FW.json and/or FW.yaml to the directory
    if PRINT_FW_JSON:
        m_fw.to_file('FW.json', indent=4)
    if PRINT_FW_YAML:
        m_fw.to_file('FW.yaml')

    try:
        my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
        my_spec["_fw_env"] = self.fworker.env

        # set up heartbeat (pinging the server that we're still alive)
        ping_stop = start_ping_launch(lp, launch_id)

        # start background tasks
        btask_stops = []
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                btask_stops.append(start_background_task(bt, m_fw.spec))

        # execute the FireTasks!
        for t in m_fw.tasks:
            # BUGFIX: guard lp — it is None in offline mode and the
            # unconditional call raised AttributeError there
            if lp:
                lp.log_message(logging.INFO, "Task started: %s." % t.fw_name)

            m_action = t.run_task(my_spec)

            # read in a FWAction from a file, in case the task is not
            # Python and cannot return it explicitly
            if os.path.exists('FWAction.json'):
                m_action = FWAction.from_file('FWAction.json')
            elif os.path.exists('FWAction.yaml'):
                m_action = FWAction.from_file('FWAction.yaml')

            if not m_action:
                m_action = FWAction()

            # update the global stored data with the data to store and
            # update from this particular Task
            all_stored_data.update(m_action.stored_data)
            all_update_spec.update(m_action.update_spec)
            all_mod_spec.extend(m_action.mod_spec)

            # update spec for next task as well
            my_spec.update(m_action.update_spec)
            for mod in m_action.mod_spec:
                apply_mod(mod, my_spec)

            # BUGFIX: same offline-mode guard as above
            if lp:
                lp.log_message(logging.INFO, "Task completed: %s " % t.fw_name)

            if m_action.skip_remaining_tasks:
                break

        # add job packing info if this is needed
        if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
            all_stored_data['multiprocess_name'] = \
                multiprocessing.current_process().name

        # perform finishing operation
        stop_backgrounds(ping_stop, btask_stops)
        for b in btask_stops:
            b.set()
        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor

        # last background monitors
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                if bt.run_on_finish:
                    for task in bt.tasks:
                        task.run_task(m_fw.spec)

        # the final action reports the accumulated data from every task
        m_action.stored_data = all_stored_data
        m_action.mod_spec = all_mod_spec
        m_action.update_spec = all_update_spec

        if lp:
            lp.complete_launch(launch_id, m_action, 'COMPLETED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'COMPLETED'
                d['completed_on'] = datetime.utcnow().isoformat()
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        return True

    except Exception:  # was bare except; let SystemExit/KeyboardInterrupt propagate
        stop_backgrounds(ping_stop, btask_stops)
        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        traceback.print_exc()
        # record the failure; t may be unbound or unserializable,
        # so fall back to a None task payload
        try:
            m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                             '_task': t.to_dict(),
                                             '_exception': traceback.format_exc()},
                                exit=True)
        except Exception:
            m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                             '_task': None,
                                             '_exception': traceback.format_exc()},
                                exit=True)
        if lp:
            lp.complete_launch(launch_id, m_action, 'FIZZLED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'FIZZLED'
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()
        return True