def runprocess(proc, message_queue):
    """
    Called by the master shell program, this function forks a shell process.
    The master process returns the PID of the child. The child process
    runs the shell process and then sends a message back to the
    master process indicating its status.

    The child never returns from this function: it always terminates via
    SystemExit, even when proc.start() raises. In that case the process is
    reported as FAIL and the child exits with status 1.

    @param proc: Process to run in the child
    @param message_queue: Queue used to report state changes to the master
    @return: PID of the forked child (in the master only)
    """
    import os
    import sys
    import traceback
    import smq

    pid = os.fork()
    if pid:
        # Master
        return pid

    # Child
    def report_state():
        # Tell the master which state this workunit is in now.
        message_queue.send(smq.Message("state:%d" % proc.state,
                                       "str",
                                       proc.workunit_name,
                                       MASTER_SENDER_NAME))

    exit_code = 0
    try:
        L.debug("Forked %s as %d" % (proc.workunit_name, os.getpid()))
        # Updating master
        proc.state = States.RUNNING
        proc.message_queue = message_queue
        report_state()

        proc.start()

        # If proc.start does not update its state, assume SUCCESS
        if proc.state == States.RUNNING:
            proc.state = States.SUCCESS
    except Exception:
        # Without this handler the exception would unwind into the
        # master's call stack *inside the child*, and the master would
        # never learn that the workunit stopped.
        L.debug("%d failed in start():\n%s"
                % (os.getpid(), traceback.format_exc()))
        proc.state = States.FAIL
        exit_code = 1

    report_state()
    L.debug("%d finished start() with status %s"
            % (os.getpid(), strstate(proc.state)))
    # sys.exit rather than the site-provided exit(), which is not
    # guaranteed to exist (e.g. under "python -S").
    sys.exit(exit_code)
def update_dag_job_state(result, is_valid):
    """
    Record the validation outcome of a result in its DAG file.

    Looks up the DAG and the process that belong to the result, marks the
    process SUCCESS or FAIL, and saves the DAG. If the DAG or the process
    cannot be found, a message is printed and nothing is changed.

    @param result: Result whose name identifies the DAG and workunit
    @param is_valid: True marks the process SUCCESS, otherwise FAIL
    @raise dag.DagException: Re-raised when the DAG file cannot be saved
    """
    import dag.boinc
    from dag import States, strstate
    import dag

    job_dag = dag.boinc.result_to_dag(result.name)
    if not job_dag:
        print("Could not get DAG file for %s" % result.name)
        return

    workunit = dag.boinc.name_result2workunit(result.name)
    target = job_dag.get_process(workunit)
    if not target:
        print("In dag %s, could not find workunit %s"
              % (job_dag.filename, workunit))
        return

    target.state = States.SUCCESS if is_valid else States.FAIL
    print("Marking %s as %s" % (result.name, strstate(target.state)))

    try:
        job_dag.save()
    except dag.DagException as save_error:
        print("Could not save dag file '%s'" % job_dag.filename)
        raise save_error
def update_dag_job_state(result, is_valid):
    """
    Mark the process behind a result as SUCCESS or FAIL and save its DAG.

    Prints a message and returns without changes when either the DAG or
    the workunit's process cannot be located.

    @param result: Result object; its name is used for all lookups
    @param is_valid: Whether the result passed validation
    @raise dag.DagException: Re-raised after reporting a failed save
    """
    import dag.boinc
    from dag import States, strstate
    import dag

    the_dag = dag.boinc.result_to_dag(result.name)
    if not the_dag:
        print("Could not get DAG file for %s" % result.name)
        return

    unit_name = dag.boinc.name_result2workunit(result.name)
    process = the_dag.get_process(unit_name)
    if not process:
        message = "In dag %s, could not find workunit %s"
        print(message % (the_dag.filename, unit_name))
        return

    if not is_valid:
        process.state = States.FAIL
    else:
        process.state = States.SUCCESS
    state_label = strstate(process.state)
    print("Marking %s as %s" % (result.name, state_label))

    try:
        the_dag.save()
    except dag.DagException as failure:
        print("Could not save dag file '%s'" % the_dag.filename)
        raise failure
def process_messages(root_dag, message_queue):
    """
    Reads through messages in the queue for the master and acts on them.

    Every message gets its own reply: the reply text is computed per
    message and sent straight back to that message's sender, so a reply
    can neither be lost nor delivered to the sender of a later message.

    @see: perform_operation

    @param root_dag: Main DAG object
    @type root_dag: dag.DAG
    @param message_queue: Message queue being read
    @type message_queue: smq.Queue
    """
    global kill_switch
    from smq import Message
    from dag import FINISHED_STATES

    def send(text, recipient):
        message_queue.send(Message(text, "str", MASTER_SENDER_NAME,
                                   recipient))

    while message_queue.has_message(MASTER_SENDER_NAME):
        message = message_queue.next(MASTER_SENDER_NAME)
        L.debug("Processing Message from %s: %s..."
                % (message.sender, message.content[0:15]))

        # Reset for every message so an earlier reply is never re-sent.
        retval = None
        stop_processing = False

        if message.content == "shutdown":
            kill_switch = True
            retval = "Shutting down shell processes"
        elif message.content.startswith("state:"):
            proc = root_dag.get_process(message.sender)
            if not proc:
                retval = ("Cannot change state. Unknown process %s"
                          % message.sender)
                # As before, an unknown sender ends this pass over the
                # queue; the difference is that the error is sent first.
                stop_processing = True
            else:
                newstate = message.content.replace("state:", "")
                proc.state = int(newstate)
                L.debug("Changed state of %s to %s"
                        % (proc.workunit_name, strstate(proc.state)))
                root_dag.save()
                if proc.state in FINISHED_STATES:
                    # Build the list first: removing while iterating
                    # running_children directly would skip entries.
                    finished = [child for child in running_children
                                if child[0] == proc.workunit_name]
                    for ended in finished:
                        running_children.remove(ended)
        elif message.content == "dump":
            retval = dump_state(root_dag, message_queue)
        else:
            retval = perform_operation(root_dag, message)

        if retval is not None:
            send(retval, message.sender)
        if stop_processing:
            break
def modify_dag(root_dag, cmd, cmd_args, debug=False):
    """
    This is the main operating function. This takes a command
    and performs an action on the dag.

    Supported commands: attach, print, help, list, remove, run, stage,
    start, recreate, reset, cancel, update, state, uuid.

    @param root_dag: DAG to be modified
    @type root_dag: dag.DAG
    @param cmd: Command to execute
    @type cmd: str
    @param cmd_args: Optional arguments for commands.
    @type cmd_args: list
    @param debug: Optional debug flag
    @type debug: bool
    @return: Human-readable summary of what was done
    @rtype: str
    @raise Exception: On an unknown command, a missing argument or an
        unknown workunit name
    @raise dag.DagException: When the command is not valid for the DAG's
        engine or for the given state name
    """
    import os.path as OP
    import sys
    import dag

    return_message = ""

    if cmd == "attach":
        from dag.shell import Waiter
        if len(cmd_args) != 2:
            raise Exception("Attach requires a workunit name"
                            " and a process id number (PID).")
        new_process = Waiter(cmd_args[0], [cmd_args[1], ])
        # Every process that has not started yet waits on the new Waiter.
        for process in root_dag.processes:
            if (process.state in [dag.States.CREATED, dag.States.STAGED]
                    and not isinstance(process, Waiter)):
                new_process.children.append(process)
        root_dag.processes.append(new_process)
        root_dag.save()
        return "Attached %s" % cmd_args[0]

    elif cmd == "print":
        if len(cmd_args) == 0:
            return_message += "%s\n" % root_dag
        else:
            proc = root_dag.get_process(cmd_args[0])
            if proc:
                return_message += "%s\n" % proc
            else:
                return_message += ("No such process found: {0}\n"
                                   .format(cmd_args[0]))
        return return_message

    elif cmd == "help":
        if not cmd_args:
            return get_help_string(None)
        else:
            return get_help_string(cmd_args[0])

    elif cmd == "list":
        for proc in root_dag.processes:
            return_message += "%s: %s\n" % (proc.workunit_name, proc.cmd)
        return return_message

    elif cmd in ["remove", "run", "stage"]:
        if len(cmd_args) == 0:
            raise Exception("%s requires at least one workunit name" % cmd)
        for wuname in cmd_args:
            proc = root_dag.get_process(wuname)
            remove_all = (cmd == "remove" and wuname == "all")
            if not proc and not remove_all:
                # Fail with a clear message instead of passing None on to
                # clean_workunit / stage_files.
                raise Exception("No such workunit: %s" % wuname)

            if cmd == "remove":
                if remove_all:
                    print("Are you sure you want to remove ALL workunits"
                          " (yes or no)?")
                    answer = sys.stdin.readline().strip()
                    if answer not in ["y", "Y", "yes", "Yes", "YES"]:
                        # Cancel workunit
                        print("Canceled.")
                        sys.exit(1)
                    count = 0
                    progress_bar = None
                    if not debug:
                        from progressbar import ProgressBar, Percentage, Bar
                        num_processes = len(root_dag.processes)
                        if num_processes:
                            progress_bar = ProgressBar(
                                widgets=[Percentage(), Bar()],
                                maxval=num_processes).start()
                    for each_proc in root_dag.processes:
                        if debug:
                            print("Removing %s" % each_proc.workunit_name)
                        clean_workunit(root_dag, each_proc)
                        count += 1
                        if progress_bar:
                            progress_bar.update(count)
                    if progress_bar:
                        print("")  # reset line return
                    root_dag.processes = []  # clear process list
                else:
                    if debug:
                        print("Removing %s" % wuname)
                    clean_workunit(root_dag, proc)
                    root_dag.processes.remove(proc)  # remove process
                return_message += "Removed %s\n" % wuname

            if cmd in ["run", "stage"]:
                print("Staging %s" % wuname)
                stage_files(root_dag, proc)
                if proc.state == dag.States.CREATED:
                    proc.state = dag.States.STAGED
                if cmd == "run":
                    return_message += "Starting %s\n" % wuname
                    if root_dag.incomplete_prereqs(proc):
                        # The original never filled in the %s placeholder.
                        raise Exception("Cannot start %s."
                                        " Missing dependencies." % wuname)
                    schedule_work(root_dag, proc, root_dag.filename)
                    if isinstance(proc, dag.InternalProcess):
                        proc.state = dag.States.SUCCESS
                        return_message += "Finished %s" % wuname
                    else:
                        proc.state = dag.States.RUNNING

            # Save after each workunit so a failure on a later one does
            # not lose the changes already made.
            root_dag.save()
            print("updated dagfile")

    elif cmd == "start":
        start_processes(root_dag, OP.abspath(root_dag.filename),
                        True, root_dag.num_cores)
        return_message += "Started processes"

    elif cmd == "recreate":
        if not cmd_args:
            raise Exception("recreate requires a specific file type"
                            " to recreate.")
        if cmd_args[0] == "result_template":
            if root_dag.engine != dag.Engine.BOINC:
                raise dag.DagException("Can only make result template"
                                       " with BOINC jobs.")
            if len(cmd_args) < 2:
                raise Exception("recreate result_template requires"
                                " a workunit name.")
            import dag.boinc
            proc = root_dag.get_process(cmd_args[1])
            if not proc:
                raise Exception("No such workunit: %s" % cmd_args[1])
            dag.boinc.create_result_template(
                proc, proc.result_template.full_path())
            print("Created result template")
            # Only report success when something was actually recreated.
            return_message += "Recreated %s\n" % cmd_args[0]
        else:
            print("Do not know how to recreate: '%s'" % cmd_args[0])

    elif cmd == "reset":
        for wuname in cmd_args:
            proc = root_dag.get_process(wuname)
            if not proc:
                return_message += "No such workunit: %s\n" % wuname
                continue
            clean_workunit(root_dag, proc)
            proc.workunit_name = None
            proc.workunit_template = None
            proc.result_template = None
            proc.state = dag.States.CREATED
            root_dag.save()
            return_message += "Reset %s" % wuname

    elif cmd == "cancel":
        if root_dag.engine == dag.Engine.LSF:
            raise dag.DagException("Cannot yet cancel LSF jobs.")
        elif root_dag.engine == dag.Engine.SHELL:
            if not hasattr(root_dag, "message_queue"):
                raise dag.DagException("Cannot stop shell process "
                                       "without message queue")
        proc_list = [root_dag.get_process(wuname) for wuname in cmd_args]
        # Import the engine module in the branch that uses it; "import dag"
        # alone does not guarantee the submodule attribute exists.
        if root_dag.engine == dag.Engine.BOINC:
            import dag.boinc
            dag.boinc.cancel_workunits(proc_list)
        elif root_dag.engine == dag.Engine.SHELL:
            import dag.shell
            dag.shell.cancel_workunits(root_dag, proc_list)
        root_dag.save()
        return_message += "Cancelled %s" % ", ".join(cmd_args)

    elif cmd == "update":
        update_state(cmd_args, root_dag)
        if root_dag.engine == dag.Engine.LSF:
            start_processes(root_dag, root_dag.filename, False)
        return_message += "Updated process"

    elif cmd == "state":
        count_only = "--count" in cmd_args
        # Skip the flag so "--count" may come before or after the state.
        state_args = [arg for arg in cmd_args if arg != "--count"]
        if not state_args:
            raise dag.DagException("Missing state name.")
        all_state_names = [dag.strstate(i)
                           for i in range(0, dag.States.NUM_STATES)]
        states_to_view = state_args[0]
        if states_to_view == "all":
            states_to_view = ",".join(all_state_names)
        for state_name in states_to_view.split(","):
            state = dag.intstate(state_name.upper())
            if state is None:
                print("%s is not a valid state." % state_name)
                print("States are %s" % ", ".join(all_state_names))
                raise dag.DagException("Invalid State")
            proc_list = root_dag.get_processes_by_state(state)
            if count_only:
                return_message += "%s: %d\n" % (dag.strstate(state),
                                                len(proc_list))
            else:
                for i in proc_list:
                    return_message += "%s" % i

    elif cmd == "uuid":
        if not cmd_args:
            raise Exception("uuid requires a workunit name")
        proc = root_dag.get_process(cmd_args[0])
        if not proc:
            raise Exception("No such workunit: %s" % cmd_args[0])
        return_message += str(proc.uuid)

    else:
        # NOTE(review): the raise makes the message below unreachable for
        # callers; kept as in the original - confirm intent.
        if not debug:
            return_message += "Unknown command: %s" % cmd
        raise Exception("Unknown command: %s" % cmd)

    return return_message
" name is not given, all processes are listed."), "recreate": ("Regenerates specified temporary files." " Options are: 'result_template'"), "reset": ("Clears generated values, such as workunit name," " and moves process to CREATED state."), "remove": ("Removes a workunit. 'all' can be supplied instead" " of a workunit name to remove ALL of the workunits."), "run": ("Stars a specific process, by workunit name. This should be run" " after 'stage'"), "stage": ("Copies necessary files to their required locations" " on the server."), "start": "Starts ALL processes", "state": ("Prints processes in a given state. The optional \"--count\"" " flag may be used to show only a count of the number " "of processes in that state. States are: {0}" .format(", ".join([dag.strstate(i) for i in range(0, dag.States.NUM_STATES)]))), "update": "Update the state of a workunit.", "uuid": "Gets UUID for a work unit." } def get_help_string(command=None): """ Prints help for a command using the command_help dict. Help is printed to standard output. @param command: Command for which help is required @type command: str """ if not command in command_help:
def clean(result):
    """
    Copy the output of a finished result into its job's data directory.

    Only results whose name ends in "_0" are handled. The output is
    copied to the process directory or to the DAG's directory; if the
    process is neither SUCCESS nor RUNNING it goes into a "bad_results"
    subdirectory instead. If the destination file already exists, the
    new output is appended to it.

    @param result: Result object providing name and output_files
    @return: True when nothing needed to be done or the output was
        stored, False when the DAG, process or upload file is missing,
        None when the result name is malformed
    @raise Exception: Re-raises any error from copying or appending,
        after saving the upload via dag.boinc.save_bad_res_output
    """
    import re
    import dag.util as dag_utils
    import boinctools
    import dag
    import dag.boinc
    import shutil
    import stat
    from os import path as OP
    import os

    if len(result.name) >= 2:
        if result.name[-2:] != "_0":
            print("Not cleaning %s" % result.name)
            return True

    print("Cleaning %s" % result.name)
    wuname = re.findall(r"^(.*)_\d*$", result.name)
    if len(wuname) == 0:
        print("Malformed result name")
        return None
    wuname = wuname[0]

    try:
        the_dag = dag.boinc.result_to_dag(result.name)
    except dag_utils.NoDagMarkerException as ndme:
        print("Warning: Missing dag")
        print("Skipping clean up")
        print("Message:")
        # Exceptions only have .message on Python 2; fall back to str().
        print(getattr(ndme, "message", ndme))
        return False
    except dag.MissingDAGFile:
        print("Missing dag file for result '%s'. Attempting to move output to invalid_results directory" % result.name)
        # NOTE(review): this uses boinctools.save_bad_res_output while the
        # rest of the function uses dag.boinc.save_bad_res_output - confirm.
        for output_file in result.output_files:
            boinctools.save_bad_res_output(output_file[0], wuname)
        return False

    if not the_dag:
        return False

    dagpath = dag.boinc.marker_to_dagpath(
        dag.boinc.dag_marker_filename(wuname))
    dagdir = OP.split(dagpath)[0]

    print("Getting process %s" % wuname)
    proc = the_dag.get_process(wuname)
    if not proc:
        print("%s was not found in the job batch file %s. Moving resultfile to invalid_results" % (wuname, dagpath))
        for output_file in result.output_files:
            dag.boinc.save_bad_res_output(output_file[0], wuname)
        return False

    if not proc.output_files:
        return True

    source_file = "%s_0" % result.name
    output_file = proc.output_files[0]
    logical_name = output_file.logical_name
    print("Clean filename \"%s\"" % logical_name)

    # Get output dir
    if output_file.dir:
        output_dir = proc.directory  # final destination (If it can be written there)
    else:
        output_dir = dagdir

    # If the process is not marked successful (or still running), put the
    # data in the bad_results directory.
    if proc.state not in [dag.States.SUCCESS, dag.States.RUNNING]:
        output_dir = OP.join(output_dir, "bad_results")
        print("Process has not been marked as successful. It is %s instead. Saving output in %s" % (dag.strstate(proc.state), output_dir))
        if not OP.isdir(output_dir):
            os.mkdir(output_dir)

    dest_file = OP.join(output_dir, output_file.logical_name)
    upload_path = boinctools.dir_hier_path(source_file).replace(
        "/download", "/upload")

    if not OP.isfile(upload_path):
        print("Output file not found: '%s'" % upload_path)
        return False

    def report_copy_failure(error):
        # Keep the upload in invalid_results and explain what went wrong.
        dag.boinc.save_bad_res_output(upload_path, wuname)
        print("ERROR - Could not copy result output file to data directory, %s. It was copied to invalid_results/%s" % (dagdir, wuname))
        print("ERROR - Message:\n%s" % getattr(error, "message", error))
        if isinstance(error, IOError):
            print(error.strerror)

    if not OP.isfile(dest_file):
        # Copy file. If that fails, move it to the invalid_results directory
        print("Copying {0} to {1}.".format(upload_path, dest_file))
        try:
            shutil.copy(upload_path, dest_file)
            os.chmod(dest_file,
                     stat.S_IRUSR | stat.S_IWUSR
                     | stat.S_IRGRP | stat.S_IWGRP)
        except Exception as e:
            report_copy_failure(e)
            raise  # bare raise keeps the original traceback
    else:
        # output already exists, append.
        try:
            # Both files are closed even if the read or write fails.
            with open(upload_path, "r") as new_file:
                with open(dest_file, "a") as old_file:
                    old_file.write("\n")
                    old_file.write(new_file.read())
        except Exception as e:
            report_copy_failure(e)
            raise
    return True
def clean(result):
    """
    Store the uploaded output of a result next to its DAG.

    Results whose name does not end in "_0" are skipped. The upload is
    copied into the process directory (or the DAG's directory); when the
    process is not SUCCESS or RUNNING a "bad_results" subdirectory is
    used. An existing destination file is appended to rather than
    overwritten.

    @param result: Result object providing name and output_files
    @return: True if the output was stored or there was nothing to do,
        False if the DAG, the process or the upload file is missing,
        None if the result name is malformed
    @raise Exception: Re-raises copy/append errors once the upload has
        been handed to dag.boinc.save_bad_res_output
    """
    import re
    import dag.util as dag_utils
    import boinctools
    import dag
    import dag.boinc
    import shutil
    import stat
    from os import path as OP
    import os

    if len(result.name) >= 2:
        if result.name[-2:] != "_0":
            print("Not cleaning %s" % result.name)
            return True

    print("Cleaning %s" % result.name)
    wuname = re.findall(r"^(.*)_\d*$", result.name)
    if len(wuname) == 0:
        print("Malformed result name")
        return None
    wuname = wuname[0]

    try:
        the_dag = dag.boinc.result_to_dag(result.name)
    except dag_utils.NoDagMarkerException as ndme:
        print("Warning: Missing dag")
        print("Skipping clean up")
        print("Message:")
        # .message exists only on Python 2 exceptions; fall back to str().
        print(getattr(ndme, "message", ndme))
        return False
    except dag.MissingDAGFile:
        print("Missing dag file for result '%s'. Attempting to move output to invalid_results directory" % result.name)
        # NOTE(review): boinctools.save_bad_res_output here, but
        # dag.boinc.save_bad_res_output everywhere else - confirm.
        for output_file in result.output_files:
            boinctools.save_bad_res_output(output_file[0], wuname)
        return False

    if not the_dag:
        return False

    dagpath = dag.boinc.marker_to_dagpath(
        dag.boinc.dag_marker_filename(wuname))
    dagdir = OP.split(dagpath)[0]

    print("Getting process %s" % wuname)
    proc = the_dag.get_process(wuname)
    if not proc:
        print("%s was not found in the job batch file %s. Moving resultfile to invalid_results" % (wuname, dagpath))
        for output_file in result.output_files:
            dag.boinc.save_bad_res_output(output_file[0], wuname)
        return False

    if not proc.output_files:
        return True

    source_file = "%s_0" % result.name
    output_file = proc.output_files[0]
    logical_name = output_file.logical_name
    print("Clean filename \"%s\"" % logical_name)

    # Get output dir
    if output_file.dir:
        output_dir = proc.directory  # final destination (If it can be written there)
    else:
        output_dir = dagdir

    # Output of a process that is neither successful nor running goes
    # into the bad_results directory.
    if proc.state not in [dag.States.SUCCESS, dag.States.RUNNING]:
        output_dir = OP.join(output_dir, "bad_results")
        print("Process has not been marked as successful. It is %s instead. Saving output in %s" % (dag.strstate(proc.state), output_dir))
        if not OP.isdir(output_dir):
            os.mkdir(output_dir)

    dest_file = OP.join(output_dir, output_file.logical_name)
    upload_path = boinctools.dir_hier_path(source_file).replace(
        "/download", "/upload")

    if not OP.isfile(upload_path):
        print("Output file not found: '%s'" % upload_path)
        return False

    def handle_store_error(error):
        # Preserve the upload in invalid_results and report the cause.
        dag.boinc.save_bad_res_output(upload_path, wuname)
        print("ERROR - Could not copy result output file to data directory, %s. It was copied to invalid_results/%s" % (dagdir, wuname))
        print("ERROR - Message:\n%s" % getattr(error, "message", error))
        if isinstance(error, IOError):
            print(error.strerror)

    if OP.isfile(dest_file):
        # output already exists, append.
        try:
            # Context managers close both handles even on failure.
            with open(upload_path, "r") as uploaded:
                with open(dest_file, "a") as existing:
                    existing.write("\n")
                    existing.write(uploaded.read())
        except Exception as e:
            handle_store_error(e)
            raise  # bare raise keeps the original traceback
    else:
        # Copy file. If that fails, move it to the invalid_results directory
        print("Copying {0} to {1}.".format(upload_path, dest_file))
        try:
            shutil.copy(upload_path, dest_file)
            os.chmod(dest_file,
                     stat.S_IRUSR | stat.S_IWUSR
                     | stat.S_IRGRP | stat.S_IWGRP)
        except Exception as e:
            handle_store_error(e)
            raise
    return True