handler.workdir = workdir handler.master = master if master != None: SYS_STATS = True # get configuration from master sconf = client.config(master) dconf = json.loads(sconf) handler.config = dconf logging.debug("Config size: %d" % (len(sconf))) logging.debug(str(dconf)) # send done to master client.done(master, id) # NOTE: Time conflict. The master might already send 'step' request, # before server_forever() is started, so 'step' might be lost. # Delay done until the server is up and running. # Check out Asynchronous Mixins example for SocketServer # Comment: the constructor might already activate the server, # so there is no problem. # NOTE: Fixed using time.sleep(5) in master.py logging.debug("Supervisor sent /done to master") logging.info("Starting host server pid %d, id %s, port %d with master %s\n" % (pid, id, port, master)) if SYS_STATS: sys_t = threading.Thread(target=timed_sys_stats_reporter, args=(None, ))
def Execute(args):
    """Run all active tasks of the current superstep and notify the master.

    For every task in ``args.active`` the executables configured for the
    task's bunch are provisioned into the host's exec directory (fetched
    from the master through ``client.getexec`` when the local copy is
    missing or older than the start of this call), then a worker process is
    started for the first executable in the list.  At most ``max_tasks``
    workers run at the same time.  A worker that exits with a non-zero
    status is reported to the master through ``client.error``.  When no
    worker is left, ``client.done`` is sent to the master.

    Tasks whose program cannot be determined or found on disk are logged
    and skipped; they do not abort the remaining tasks.

    Args:
        args: object providing ``active``, ``workdir``, ``pid``, ``config``,
            ``master``, ``id``, ``qactname`` and ``server`` (with ``host``,
            ``port`` and ``superstep_count``).
    """
    logging.info("Execute " + str(args.active) + "")

    tnow = time.time()

    overall_timer = perf.Timer(logging)
    task_name = "overall: "
    if args.active:
        # label the superstep timer with the name of the first active task
        task_name = "overall: %s" % get_task_name(args.active[0])
    # build the key once so that start() and stop() always use the same name
    overall_key = "superstep-%d-%s" % (args.server.superstep_count, task_name)
    overall_timer.start(overall_key)

    execdir = os.path.join(args.workdir, "snapw.%d/exec" % (args.pid))
    if args.active:
        config.mkdir_p(execdir)

    def execute_single_task(task):
        """Provision and start one task.

        Returns a ``(process, prog)`` tuple, or ``None`` when the task could
        not be started (the reason is logged).
        """
        # get the executables
        bunch = get_task_name(task)
        try:
            raw_execs = args.config["bunch"][bunch]["exec"]
            execlist = [name for name in raw_execs.split(",") if name]
        except (KeyError, TypeError, AttributeError):
            # bunch not configured, or configured without a usable "exec"
            execlist = []

        timer = perf.Timer(logging)
        timer.start('provision-execlist')

        # provision execlist on disk
        for item in execlist:
            execpath = os.path.join(execdir, item)
            # check if the program exists and its mtime
            try:
                mtime = int(os.stat(execpath).st_mtime)
            except OSError:
                mtime = None

            if mtime and mtime >= tnow:
                # local copy is at least as new as this call; keep it
                continue

            # the file does not exist or it is older than current time,
            # contact the head task
            content = client.getexec(args.master, item, mtime)

            swc = str(len(content)) if content else "None"
            logging.debug("Host received %s" % (swc))

            if content:
                logging.debug("Host saving to %s" % (execpath))
                with open(execpath, "w") as f:
                    f.write(content)
                # stamp the file with the start time of this call
                os.utime(execpath, (tnow, tnow))

        timer.stop('provision-execlist')

        if not execlist:
            logging.error(
                "task %s not started, no executables configured for bunch %s"
                % (task, bunch))
            return None

        prog = execlist[0]
        logging.debug("Task %s, exec %s" % (prog, execlist))
        progpath = os.path.join(execdir, prog)

        if not os.path.exists(progpath):
            logging.error("task %s not started, program %s not found"
                          % (task, progpath))
            return None

        # NOTE(review): taskdir is created relative to the current working
        # directory while the worker runs in workdir/taskdir -- this only
        # matches when the host's cwd is args.workdir; confirm.
        taskdir = "snapw.%d/tasks/%s" % (args.pid, task)
        config.mkdir_p(taskdir)

        qdir = os.path.join(args.workdir, args.qactname, task)
        tdir = os.path.join(args.workdir, taskdir)

        logging.info("starting task %s, prog %s, workdir %s, qdir %s\n"
                     % (task, prog, tdir, qdir))

        # get server information
        host = args.server.host
        port = args.server.port

        # construct a command line for worker
        cmd = python + " %s -t %s -h %s:%d -q %s" % (
            progpath, task, host, port, qdir)
        logging.info("starting cmd %s" % (cmd))

        # Pass an argument vector instead of splitting the formatted string,
        # so paths or task names containing whitespace stay single arguments.
        argv = python.split() + [progpath, "-t", task,
                                 "-h", "%s:%d" % (host, port),
                                 "-q", qdir]

        # start the work process
        p = subprocess.Popen(argv, cwd=tdir, close_fds=True)
        return p, prog

    # Dynamically check what the number of processors we have on each host
    # In any error, default to 1.
    max_tasks = int(args.config['par_tasks'])
    if max_tasks <= 0:
        try:
            max_tasks = os.sysconf('SC_NPROCESSORS_ONLN')
        except (ValueError, OSError, AttributeError):
            max_tasks = 1
    if max_tasks < 1:
        # a non-positive limit would never start a task and loop forever
        max_tasks = 1

    # execute the tasks in a parallel fashion by running
    # at most max_tasks processes at any point.
    task_list = args.active[:]
    procs = []
    logging.info("Running %d tasks with %d-way parallelism: %s"
                 % (len(task_list), max_tasks, str(task_list)))

    timer = perf.Timer(logging)
    pcounter = 0
    timer_keys = {}   # worker pid -> name of its "prog-N" timer

    while True:
        while task_list and len(procs) < max_tasks:
            task = task_list.pop()
            timer_key = "prog-%d" % pcounter
            pcounter += 1

            timer.start(timer_key)
            started = execute_single_task(task)
            if started is None:
                # task could not be started (already logged); close its
                # timer and move on to the next task
                timer.stop(timer_key)
                continue

            p, prog = started
            timer.update_extra(timer_key, "step: %d, pid: %d, prog: %s"
                               % (args.server.superstep_count, p.pid, prog))
            timer_keys[p.pid] = timer_key
            procs.append(p)

        # iterate over a copy: finished workers are removed from procs
        for p in procs[:]:
            pid = p.pid
            logging.debug("polling %d" % pid)
            status = p.poll()
            if status is None:
                continue

            timer.stop(timer_keys.pop(pid))
            logging.debug("finished %d with status %s" % (pid, str(status)))

            # error reporting
            if status != 0:
                msg = "Pid %d terminated unexpectedly with status %d" % (
                    pid, status)
                logging.error(msg)
                client.error(args.master, args.id, msg)

            procs.remove(p)

        if not procs and not task_list:
            break
        time.sleep(0.1)

    overall_timer.stop(overall_key)

    # send done to master
    client.done(args.master, args.id)
handler.workdir = workdir handler.master = master if master != None: # SYS_STATS = True # get configuration from master sconf = client.config(master) dconf = json.loads(sconf) handler.config = dconf logging.debug("Config size: %d" % (len(sconf))) logging.debug(str(dconf)) # send done to master client.done(master, id) # NOTE: Time conflict. The master might already send 'step' request, # before server_forever() is started, so 'step' might be lost. # Delay done until the server is up and running. # Check out Asynchronous Mixins example for SocketServer # Comment: the constructor might already activate the server, # so there is no problem. # NOTE: Fixed using time.sleep(5) in master.py logging.debug("Supervisor sent /done to master") logging.info("Starting host server pid %d, id %s, port %d with master %s\n" % (pid, id, port, master)) if SYS_STATS: sys_t = threading.Thread(target=timed_sys_stats_reporter, args=(None, ))
def Execute(args):
    """Run the active tasks one after another, then notify the master.

    For every task in ``args.active`` the executables configured for the
    task's bunch (the part of the task name before the first ``-``) are
    provisioned into the host's exec directory, fetching them from the
    master through ``client.getexec`` when the local copy is missing or
    older than the start of this call.  The first executable of the list is
    then started through ``pexec.Exec`` and polled until it terminates.
    After the last task ``client.done`` is sent to the master.

    Tasks whose program cannot be determined or found on disk are reported
    in ``args.flog`` and skipped.

    Args:
        args: object providing ``flog``, ``active``, ``workdir``, ``pid``,
            ``config``, ``master``, ``id``, ``qactname`` and ``server``
            (with ``host`` and ``port``).
    """
    args.flog.write("Execute " + str(args.active) + "\n")

    tnow = time.time()

    execdir = os.path.join(args.workdir, "snapw.%d/exec" % (args.pid))
    if args.active:
        config.mkdir_p(execdir)

    # execute the tasks sequentially
    for task in args.active:

        # get the executables
        bunch = task.split("-", 1)[0]
        try:
            raw_execs = args.config["bunch"][bunch]["exec"]
            execlist = [name for name in raw_execs.split(",") if name]
        except (KeyError, TypeError, AttributeError):
            # bunch not configured, or configured without a usable "exec"
            execlist = []

        for item in execlist:
            execpath = os.path.join(execdir, item)
            # check if the program exists and its mtime
            try:
                mtime = int(os.stat(execpath).st_mtime)
            except OSError:
                mtime = None

            if mtime and mtime >= tnow:
                # local copy is at least as new as this call; keep it
                continue

            # the file does not exist or it is older than current time,
            # contact the head task
            content = client.getexec(args.master, item, mtime)

            swc = str(len(content)) if content else "None"
            # single-argument print(...) behaves the same on Python 2 and 3
            print("Host received %s" % (swc))

            if content:
                print("Host saving to %s" % (execpath))
                with open(execpath, "w") as f:
                    f.write(content)
                # stamp the file with the start time of this call
                os.utime(execpath, (tnow, tnow))

        if not execlist:
            # nothing to run: report and skip instead of failing on [0]
            line = ("*** Error: task %s not started, "
                    "no executables configured for bunch %s\n" % (task, bunch))
            args.flog.write(line)
            args.flog.flush()
            continue

        prog = execlist[0]
        print("Task %s, exec %s" % (prog, execlist))
        progpath = os.path.join(execdir, prog)

        if not os.path.exists(progpath):
            line = "*** Error: task %s not started, program %s not found\n" % (
                task, progpath)
            args.flog.write(line)
            args.flog.flush()
            continue

        # NOTE(review): taskdir is created relative to the current working
        # directory while the worker runs in workdir/taskdir -- this only
        # matches when the host's cwd is args.workdir; confirm.
        taskdir = "snapw.%d/tasks/%s" % (args.pid, task)
        config.mkdir_p(taskdir)

        qdir = os.path.join(args.workdir, args.qactname, task)
        tdir = os.path.join(args.workdir, taskdir)

        line = "starting task %s, prog %s, workdir %s, qdir %s\n" % (
            task, prog, tdir, qdir)
        args.flog.write(line)
        args.flog.flush()

        # get server information
        host = args.server.host
        port = args.server.port

        # construct a command line
        cmd = python + " %s -t %s -h %s:%d -q %s" % (
            progpath, task, host, port, qdir)
        args.flog.write("starting cmd %s\n" % (cmd))
        args.flog.flush()

        # start the work process
        p = pexec.Exec(tdir, cmd)

        # wait for the process to complete
        while True:
            args.flog.write("polling\n")
            args.flog.flush()
            status = pexec.Poll(p)
            if status is not None:
                break
            time.sleep(0.1)

        args.flog.write("finished\n")
        args.flog.flush()

    # send done to master
    client.done(args.master, args.id)
def Execute(args):
    """Run the active tasks in parallel, then notify the master.

    For every task in ``args.active`` the executables configured for the
    task's bunch (the part of the task name before the first ``-``) are
    provisioned into the host's exec directory, fetching them from the
    master through ``client.getexec`` when the local copy is missing or
    older than the start of this call.  The first executable of the list is
    started through ``pexec.Exec``.  At most ``max_tasks`` workers (the
    number of online processors, 1 if that cannot be determined) run at the
    same time.  When no worker is left ``client.done`` is sent to the
    master.

    Tasks whose program cannot be determined or found on disk are reported
    in ``args.flog`` and skipped.

    Args:
        args: object providing ``flog``, ``active``, ``workdir``, ``pid``,
            ``config``, ``master``, ``id``, ``qactname`` and ``server``
            (with ``host`` and ``port``).
    """
    args.flog.write("Execute " + str(args.active) + "\n")

    tnow = time.time()

    execdir = os.path.join(args.workdir, "snapw.%d/exec" % (args.pid))
    if args.active:
        config.mkdir_p(execdir)

    def execute_single_task(task):
        """Provision and start one task.

        Returns the handle from ``pexec.Exec``, or ``None`` when the task
        could not be started (the reason is written to ``args.flog``).
        """
        # get the executables
        bunch = task.split("-", 1)[0]
        try:
            raw_execs = args.config["bunch"][bunch]["exec"]
            execlist = [name for name in raw_execs.split(",") if name]
        except (KeyError, TypeError, AttributeError):
            # bunch not configured, or configured without a usable "exec"
            execlist = []

        for item in execlist:
            execpath = os.path.join(execdir, item)
            # check if the program exists and its mtime
            try:
                mtime = int(os.stat(execpath).st_mtime)
            except OSError:
                mtime = None

            if mtime and mtime >= tnow:
                # local copy is at least as new as this call; keep it
                continue

            # the file does not exist or it is older than current time,
            # contact the head task
            content = client.getexec(args.master, item, mtime)

            swc = str(len(content)) if content else "None"
            # single-argument print(...) behaves the same on Python 2 and 3
            print("Host received %s" % (swc))

            if content:
                print("Host saving to %s" % (execpath))
                with open(execpath, "w") as f:
                    f.write(content)
                # stamp the file with the start time of this call
                os.utime(execpath, (tnow, tnow))

        if not execlist:
            # nothing to run: report and skip instead of failing on [0]
            line = ("*** Error: task %s not started, "
                    "no executables configured for bunch %s\n" % (task, bunch))
            args.flog.write(line)
            args.flog.flush()
            return None

        prog = execlist[0]
        print("Task %s, exec %s" % (prog, execlist))
        progpath = os.path.join(execdir, prog)

        if not os.path.exists(progpath):
            line = "*** Error: task %s not started, program %s not found\n" % (
                task, progpath)
            args.flog.write(line)
            args.flog.flush()
            return None

        # NOTE(review): taskdir is created relative to the current working
        # directory while the worker runs in workdir/taskdir -- this only
        # matches when the host's cwd is args.workdir; confirm.
        taskdir = "snapw.%d/tasks/%s" % (args.pid, task)
        config.mkdir_p(taskdir)

        qdir = os.path.join(args.workdir, args.qactname, task)
        tdir = os.path.join(args.workdir, taskdir)

        line = "starting task %s, prog %s, workdir %s, qdir %s\n" % (
            task, prog, tdir, qdir)
        args.flog.write(line)
        args.flog.flush()

        # get server information
        host = args.server.host
        port = args.server.port

        # construct a command line
        cmd = python + " %s -t %s -h %s:%d -q %s" % (
            progpath, task, host, port, qdir)
        args.flog.write("starting cmd %s\n" % (cmd))
        args.flog.flush()

        # start the work process
        return pexec.Exec(tdir, cmd)

    # Dynamically check what the number of processors we have on each host
    # In any error, default to 1.
    try:
        max_tasks = os.sysconf('SC_NPROCESSORS_ONLN')
    except (ValueError, OSError, AttributeError):
        max_tasks = 1
    if max_tasks < 1:
        # a non-positive limit would never start a task and loop forever
        max_tasks = 1
    args.flog.write("Running tasks with " + str(max_tasks) + "-way parallelism\n")

    # execute the tasks in a parallel fashion by running
    # at most max_tasks processes at any point.
    task_list = args.active[:]
    procs = []
    while True:
        while task_list and len(procs) < max_tasks:
            task = task_list.pop()
            started = execute_single_task(task)
            if started is not None:
                # tasks that could not be started were already reported
                procs.append(started)

        # iterate over a copy: finished workers are removed from procs
        for p in procs[:]:
            # wait for the process to complete
            pid = pexec.GetPid(p)
            args.flog.write("polling " + str(pid) + "\n")
            args.flog.flush()
            status = pexec.Poll(p)
            if status is not None:
                args.flog.write("finished " + str(pid) + "\n")
                args.flog.flush()
                procs.remove(p)

        if not procs and not task_list:
            break
        time.sleep(1.0)

    # send done to master
    client.done(args.master, args.id)
def Execute(args):
    """Run the active tasks in parallel, then notify the master.

    For every task in ``args.active`` the executables configured for the
    task's bunch (the part of the task name before the first ``-``) are
    provisioned into the host's exec directory, fetching them from the
    master through ``client.getexec`` when the local copy is missing or
    older than the start of this call.  The first executable of the list is
    started through ``pexec.Exec``.  At most ``max_tasks`` workers (the
    number of online processors, 1 if that cannot be determined) run at the
    same time.  When no worker is left ``client.done`` is sent to the
    master.

    Tasks whose program cannot be determined or found on disk are reported
    in ``args.flog`` and skipped.

    Args:
        args: object providing ``flog``, ``active``, ``workdir``, ``pid``,
            ``config``, ``master``, ``id``, ``qactname`` and ``server``
            (with ``host`` and ``port``).
    """
    args.flog.write("Execute " + str(args.active) + "\n")

    tnow = time.time()

    execdir = os.path.join(args.workdir, "snapw.%d/exec" % (args.pid))
    if args.active:
        config.mkdir_p(execdir)

    def execute_single_task(task):
        """Provision and start one task.

        Returns the handle from ``pexec.Exec``, or ``None`` when the task
        could not be started (the reason is written to ``args.flog``).
        """
        # get the executables
        bunch = task.split("-", 1)[0]
        try:
            raw_execs = args.config["bunch"][bunch]["exec"]
            execlist = [name for name in raw_execs.split(",") if name]
        except (KeyError, TypeError, AttributeError):
            # bunch not configured, or configured without a usable "exec"
            execlist = []

        for item in execlist:
            execpath = os.path.join(execdir, item)
            # check if the program exists and its mtime
            try:
                mtime = int(os.stat(execpath).st_mtime)
            except OSError:
                mtime = None

            if mtime and mtime >= tnow:
                # local copy is at least as new as this call; keep it
                continue

            # the file does not exist or it is older than current time,
            # contact the head task
            content = client.getexec(args.master, item, mtime)

            swc = str(len(content)) if content else "None"
            # single-argument print(...) behaves the same on Python 2 and 3
            print("Host received %s" % (swc))

            if content:
                print("Host saving to %s" % (execpath))
                with open(execpath, "w") as f:
                    f.write(content)
                # stamp the file with the start time of this call
                os.utime(execpath, (tnow, tnow))

        if not execlist:
            # nothing to run: report and skip instead of failing on [0]
            line = ("*** Error: task %s not started, "
                    "no executables configured for bunch %s\n" % (task, bunch))
            args.flog.write(line)
            args.flog.flush()
            return None

        prog = execlist[0]
        print("Task %s, exec %s" % (prog, execlist))
        progpath = os.path.join(execdir, prog)

        if not os.path.exists(progpath):
            line = "*** Error: task %s not started, program %s not found\n" % (
                task, progpath)
            args.flog.write(line)
            args.flog.flush()
            return None

        # NOTE(review): taskdir is created relative to the current working
        # directory while the worker runs in workdir/taskdir -- this only
        # matches when the host's cwd is args.workdir; confirm.
        taskdir = "snapw.%d/tasks/%s" % (args.pid, task)
        config.mkdir_p(taskdir)

        qdir = os.path.join(args.workdir, args.qactname, task)
        tdir = os.path.join(args.workdir, taskdir)

        line = "starting task %s, prog %s, workdir %s, qdir %s\n" % (
            task, prog, tdir, qdir)
        args.flog.write(line)
        args.flog.flush()

        # get server information
        host = args.server.host
        port = args.server.port

        # construct a command line
        cmd = python + " %s -t %s -h %s:%d -q %s" % (
            progpath, task, host, port, qdir)
        args.flog.write("starting cmd %s\n" % (cmd))
        args.flog.flush()

        # start the work process
        return pexec.Exec(tdir, cmd)

    # Dynamically check what the number of processors we have on each host
    # In any error, default to 1.
    try:
        max_tasks = os.sysconf('SC_NPROCESSORS_ONLN')
    except (ValueError, OSError, AttributeError):
        max_tasks = 1
    if max_tasks < 1:
        # a non-positive limit would never start a task and loop forever
        max_tasks = 1
    args.flog.write("Running tasks with " + str(max_tasks) + "-way parallelism\n")

    # execute the tasks in a parallel fashion by running
    # at most max_tasks processes at any point.
    task_list = args.active[:]
    procs = []
    while True:
        while task_list and len(procs) < max_tasks:
            task = task_list.pop()
            started = execute_single_task(task)
            if started is not None:
                # tasks that could not be started were already reported
                procs.append(started)

        # iterate over a copy: finished workers are removed from procs
        for p in procs[:]:
            # wait for the process to complete
            pid = pexec.GetPid(p)
            args.flog.write("polling " + str(pid) + "\n")
            args.flog.flush()
            status = pexec.Poll(p)
            if status is not None:
                args.flog.write("finished " + str(pid) + "\n")
                args.flog.flush()
                procs.remove(p)

        if not procs and not task_list:
            break
        time.sleep(1.0)

    # send done to master
    client.done(args.master, args.id)