def do_GET(self):
    """Serve one GET request on the host server.

    Recognised paths:

    * ``/prepare`` - move the ``qin`` directory to ``qact`` (removing or
      renaming a leftover ``qact`` first), create a new ``qin`` and report
      the number of entries in ``qact`` to the master via ``client.ready``.
    * ``/quit``    - acknowledge, clear ``SYS_STATS`` and flag the server to
      stop running.
    * ``/getkv``   - reply with the JSON-encoded key-value file, or with the
      literal text ``None`` while ``superstep_count`` is not above 1.
    * ``/dummy``   - acknowledge only.
    * ``/step``    - acknowledge, then start a thread running ``Execute``
      over the entries of ``qact``.
    * ``/config``  - reply with the configuration as JSON.

    Any other path is answered with a plain-text echo of the request.

    Changes from the previous revision: the second, unreachable ``/quit``
    branch was dropped; ``SYS_STATS`` is now the module-level name instead
    of a dead local; the echo text is only built when it is actually sent,
    and it is sent with a ``Content-Length`` header like every other reply.
    """
    # Without this declaration the "/quit" assignment below only created a
    # local variable that nothing read.
    # NOTE(review): assumes a module-level SYS_STATS flag is what was meant.
    global SYS_STATS

    path = self.path

    if path == "/prepare":
        prepare_timer = perf.Timer(logging)
        self.server.superstep_count += 1
        # Format the label once so start() and stop() always use the same
        # name, even if the counter changes while this request runs.
        timer_label = "prepare-%d" % self.server.superstep_count
        prepare_timer.start(timer_label)

        # move qin to qact
        qinname = "snapw.%d/qin" % (self.pid)
        qactname = "snapw.%d/qact" % (self.pid)

        # get rid of an existing qact: delete it, or keep it under a
        # timestamped name when in debug mode or when deletion failed
        if os.path.exists(qactname):
            removed = False
            if not self.config['debug']:
                try:
                    shutil.rmtree(qactname)
                    logging.debug("removed dir %s" % qactname)
                    removed = True
                except Exception:
                    # best effort: fall back to renaming below
                    logging.error("error on removing dir %s" % qactname)

            if not removed:
                t = time.time()
                s = time.strftime("%Y%m%d-%H%M%S", time.localtime(t))
                # microsecond part of the timestamp, zero padded
                mus = "%06d" % (t*1000000 - int(t)*1000000)
                qactnewname = "%s-%s-%s" % (qactname, s, mus)
                os.rename(qactname, qactnewname)
                logging.debug("renamed %s to %s" % (qactname, qactnewname))

        # get the number of active tasks, rename existing qin
        numtasks = 0
        if os.path.exists(qinname):
            os.rename(qinname, qactname)
            numtasks = len(os.listdir(qactname))

        # create new qin
        config.mkdir_p(qinname)

        logging.info("preparing next step: %s, %s" % (qinname, qactname))

        # send ready to master
        client.ready(self.master, self.id, numtasks)

        self.send_response(200)
        self.send_header('Content-Length', 0)
        self.end_headers()

        prepare_timer.stop(timer_label)
        return

    if path == "/quit":
        self.send_response(200)
        self.send_header('Content-Length', 0)
        self.end_headers()

        SYS_STATS = False
        # set the flag to terminate the server
        self.server.running = False
        self.server.self_dummy()
        return

    if path == "/getkv":
        logging.debug("getting kv file")
        self.send_response(200)
        if self.server.superstep_count > 1:
            body = json.dumps(get_kv_file("supervisor"))
        else:
            body = "None"
        self.send_header('Content-Length', len(body))
        self.end_headers()
        self.wfile.write(body)
        return

    if path == "/dummy":
        logging.debug("dummy request")
        self.send_response(200)
        self.send_header('Content-Length', 0)
        self.end_headers()
        return

    if path == "/step":
        logging.info("execute next step")
        self.send_response(200)
        self.send_header('Content-Length', 0)
        self.end_headers()

        # TODO, implement null action,
        #   skip execution if there are no tasks to execute,
        #   qact does not exist

        # get the tasks to execute
        qactname = "snapw.%d/qact" % (self.pid)
        active = []
        if os.path.exists(qactname):
            active = os.listdir(qactname)

        logging.debug("active tasks %s" % (str(active)))

        # Execute receives this handler and reads these two attributes
        # (presumably; Execute is defined outside this block).
        self.qactname = qactname
        self.active = active    # the task list

        # start a thread to execute the work tasks
        t = threading.Thread(target=Execute, args=(self, ))
        t.start()
        return

    if path == "/config":
        logging.debug("get configuration")
        body = json.dumps(self.config)
        self.send_response(200)
        self.send_header('Content-Length', len(body))
        self.end_headers()
        self.wfile.write(body)
        return

    # Unknown path: echo the request back. The text is assembled only here
    # because nothing else uses it.
    parsed_path = urlparse.urlparse(path)
    message_parts = [
        'CLIENT VALUES:',
        'client_address=%s (%s)' % (self.client_address,
                                    self.address_string()),
        'command=%s' % self.command,
        'path=%s' % path,
        'real path=%s' % parsed_path.path,
        'query=%s' % parsed_path.query,
        'request_version=%s' % self.request_version,
        '',
        'SERVER VALUES:',
        'server_type=%s' % "host server",
        'server_version=%s' % self.server_version,
        'sys_version=%s' % self.sys_version,
        'protocol_version=%s' % self.protocol_version,
        '',
        'HEADERS RECEIVED:',
    ]
    for name, value in sorted(self.headers.items()):
        message_parts.append('%s=%s' % (name, value.rstrip()))
    message_parts.append('')
    message = '\r\n'.join(message_parts)

    self.send_response(200)
    self.send_header('Content-Length', len(message))
    self.end_headers()
    self.wfile.write(message)
    return
def do_GET(self):
    """Serve one GET request on the head (master) server.

    Recognised requests:

    * ``/start``  - reset the superstep and snapshot counters, start the
      "master" timer and call ``StartHostServer`` for every configured host.
      This branch does not return early, so the reply is the request echo.
    * ``/quit``   - delegate to ``self._quit()``.
    * ``/getkv``  - reply with the JSON key-value file, prefixed with
      ``DONE `` once neither ``executing`` nor ``iterate`` is set; reply
      with the text ``None`` while ``superstep_count`` is not above 1.
    * ``/dummy``  - acknowledge only.
    * ``/config`` - reply with the configuration as JSON.
    * ``/exec?p=<file>&t=<mtime>`` - reply with the content of ``<file>``
      when its modification time is newer than ``<mtime>``, else 304.
      A missing ``p`` yields 400 and a file that cannot be stat'ed yields
      404 (both used to raise before any reply was sent).
    * ``/done/<host>`` - record that a host finished the step; once all
      hosts did, optionally snapshot, send the initial ``__Start__``
      message if not yet sent, and call ``Prepare`` for every host.
    * ``/ready/<host>/<numtasks>`` - record that a host is ready; once all
      hosts are, either quit (no host reported tasks) or start the next
      superstep on every host.
    * ``/error/<host>/<msg>`` - log the error and force-quit the master.

    Any other path is answered with a plain-text echo of the request.

    Locks are released in ``finally`` blocks so an exception raised while
    one is held (for example a non-numeric host id) cannot leave it locked.
    """
    parsed_path = urlparse.urlparse(self.path)
    command = parsed_path.path
    dargs = dict(urlparse.parse_qsl(parsed_path.query))

    subpath = self.path.split("/")
    # First path component; empty when the path contains no "/" at all, so
    # the comparisons below cannot raise IndexError.
    action = subpath[1] if len(subpath) > 1 else ""

    if self.path == "/start":
        logging.info("starting host servers")

        self.server.timer.start("master")
        self.server.superstep_count = 0
        self.server.snapshot_counter = 0

        master = self.config["master"]
        for h in self.config["hosts"]:
            self.StartHostServer(h, master)
        # no return here: the reply is the request echo at the bottom

    elif self.path == "/quit":
        self._quit()
        return

    elif self.path == "/getkv":
        logging.debug("getting kv file")
        self.send_response(200)
        if self.server.superstep_count > 1:
            # We're done computing everything when neither flag is set, so
            # let the caller know this is the final copy of the k-v file.
            final = not self.server.executing and not self.server.iterate
            body = json.dumps(get_kv_file("master"))
            if final:
                body = "DONE " + body
        else:
            body = "None"
        self.send_header('Content-Length', len(body))
        self.end_headers()
        self.wfile.write(body)
        return

    elif self.path == "/dummy":
        logging.debug("dummy request")
        self.send_response(200)
        self.send_header('Content-Length', 0)
        self.end_headers()
        return

    elif self.path == "/config":
        logging.debug("get configuration")
        body = json.dumps(self.config)
        self.send_response(200)
        self.send_header('Content-Length', len(body))
        self.end_headers()
        self.wfile.write(body)
        return

    elif command == "/exec":
        # NOTE(security): `p` is taken verbatim from the query string, so
        # this serves any file the process can read. Left unchanged because
        # the set of legitimate paths is not visible here - restrict it if
        # the server can be reached by untrusted clients.
        pname = dargs.get("p")
        try:
            ptime = int(dargs.get("t"))
        except (TypeError, ValueError):
            # "t" missing or not a number: treat the client copy as oldest
            ptime = 0

        if pname is None:
            self.send_response(400)
            self.send_header('Content-Length', 0)
            self.end_headers()
            return

        try:
            mtime = int(os.stat(pname).st_mtime)
        except OSError:
            self.send_response(404)
            self.send_header('Content-Length', 0)
            self.end_headers()
            return

        if mtime <= ptime:
            # the file has not changed
            self.send_response(304)
            self.send_header('Content-Length', 0)
            self.end_headers()
            return

        # binary mode keeps Content-Length equal to the bytes sent
        with open(pname, "rb") as f:
            content = f.read()

        self.send_response(200)
        self.send_header('Content-Length', len(content))
        self.end_headers()
        self.wfile.write(content)
        return

    elif action == "done":
        self.send_response(200)
        self.send_header('Content-Length', 0)
        self.end_headers()

        if len(subpath) > 2:
            host = subpath[2]

            self.server.global_lock.acquire()
            try:
                cur_superstep = self.server.superstep_count
                if cur_superstep > 0:
                    # int(host) raises ValueError for a non-numeric id
                    self.server.timer.stop("superstep-%d-host-%d" %
                                           (cur_superstep, int(host)))
            finally:
                self.server.global_lock.release()

            self.server.done_lock.acquire()
            try:
                self.server.done.add(host)
                str_log = "host %s completed work" % (str(self.server.done))
                done_size = len(self.server.done)
            finally:
                self.server.done_lock.release()
            logging.info(str_log)

            if done_size == len(self.config["hosts"]):
                logging.info("all hosts completed")

                # Fix possible concurrency issue with supervisor.py
                if cur_superstep == 0:
                    time.sleep(5)

                if self.server.snapshot_enabled:
                    self.server.global_lock.acquire()
                    try:
                        self.server.snapshot_counter += 1
                        cmd = "./snapshot.sh %d" % (
                            self.server.snapshot_counter - 1)
                    finally:
                        self.server.global_lock.release()
                    logging.info(cmd)
                    os.system(cmd)

                # initialize a set of ready servers,
                # clear the continue indicator
                self.server.ready_lock.acquire()
                try:
                    self.server.ready = set()
                finally:
                    self.server.ready_lock.release()
                self.server.iterate = False

                # send a start message at the beginning
                if not self.server.start:
                    self.server.start = True
                    self.server.executing = True
                    (starthost, starttask) = self.GetStartInfo(self.config)
                    s = "send __Start__ message for task %s to host %s" % (
                        starttask, starthost)
                    logging.debug(s)
                    client.message(starthost, "__Main__", starttask,
                                   "__Start__")

                # send a prepare command to all the hosts
                hosts = self.config["hosts"]
                logging.debug("hosts " + str(hosts))
                for h in hosts:
                    logging.debug("send prepare to " + str(h))
                    self.Prepare(h)
                    logging.debug("done sending prepare to " + str(h))
        return

    elif action == "ready":
        self.send_response(200)
        self.send_header('Content-Length', 0)
        self.end_headers()

        if len(subpath) > 2:
            host = subpath[2]

            # get the number of active tasks on the host
            try:
                numtasks = int(subpath[3])
            except (IndexError, ValueError):
                # count missing or malformed: treat as no active tasks
                numtasks = 0

            # execute the next step, if there are active tasks
            if numtasks > 0:
                self.server.iterate = True

            self.server.ready_lock.acquire()
            try:
                self.server.ready.add(host)
                str_log = "host %s ready" % (str(self.server.ready))
                ready_size = len(self.server.ready)
            finally:
                self.server.ready_lock.release()
            logging.debug(str_log)

            hosts = self.config["hosts"]
            if ready_size == len(hosts):
                # stop the execution, if there are no more tasks to execute
                if not self.server.iterate:
                    logging.info("all tasks completed")
                    self.server.executing = False
                    self.server.iterate = False
                    time.sleep(10)
                    self._quit(force=True)
                    return

                logging.info("all hosts ready")

                # initialize a set of done servers
                self.server.done_lock.acquire()
                try:
                    self.server.done = set()
                finally:
                    self.server.done_lock.release()

                self.server.global_lock.acquire()
                try:
                    self.server.superstep_count += 1
                    for h in hosts:
                        h_id = int(h['id'])
                        self.server.timer.start(
                            "superstep-%d-host-%d" %
                            (self.server.superstep_count, h_id))
                finally:
                    self.server.global_lock.release()

                # send a step start command to all the hosts
                # TODO: create a thread for this step
                for h in hosts:
                    logging.info("send next step to " + str(h))
                    self.StartStep(h)
        return

    elif action == "error":
        self.send_response(200)
        self.send_header('Content-Length', 0)
        self.end_headers()

        if len(subpath) > 3:
            src_host = subpath[2]
            encoded_msg = subpath[3]
            msg_dict = urlparse.parse_qs(encoded_msg)
            # .get(): a report without a "msg" field must still reach the
            # shutdown below instead of dying on KeyError
            logging.critical("Error msg from supervisor %s: %s" %
                             (src_host, msg_dict.get('msg')))
            logging.critical("Terminating master now")
            self._quit(force=True)
        return

    # "/start" and unknown paths: echo the request back. The text is
    # assembled only here because nothing else uses it.
    message_parts = [
        'CLIENT VALUES:',
        'client_address=%s (%s)' % (self.client_address,
                                    self.address_string()),
        'command=%s' % self.command,
        'path=%s' % self.path,
        'real path=%s' % parsed_path.path,
        'query=%s' % parsed_path.query,
        'request_version=%s' % self.request_version,
        '',
        'SERVER VALUES:',
        'server_type=%s' % "head server",
        'server_version=%s' % self.server_version,
        'sys_version=%s' % self.sys_version,
        'protocol_version=%s' % self.protocol_version,
        '',
        'HEADERS RECEIVED:',
    ]
    for name, value in sorted(self.headers.items()):
        message_parts.append('%s=%s' % (name, value.rstrip()))
    message_parts.append('')
    message = '\r\n'.join(message_parts)

    self.send_response(200)
    self.send_header('Content-Length', len(message))
    self.end_headers()
    self.wfile.write(message)
    return
def do_GET(self):
    """Serve one GET request on the head (master) server.

    Recognised requests:

    * ``/start``  - reset the superstep and snapshot counters, start the
      "master" timer and call ``StartHostServer`` for every configured host.
      This branch does not return early, so the reply is the request echo.
    * ``/quit``   - delegate to ``self._quit()``.
    * ``/getkv``  - reply with the JSON key-value file, prefixed with
      ``DONE `` once neither ``executing`` nor ``iterate`` is set; reply
      with the text ``None`` while ``superstep_count`` is not above 1.
    * ``/dummy``  - acknowledge only.
    * ``/config`` - reply with the configuration as JSON.
    * ``/exec?p=<file>&t=<mtime>`` - reply with the content of ``<file>``
      when its modification time is newer than ``<mtime>``, else 304.
      A missing ``p`` yields 400 and a file that cannot be stat'ed yields
      404 (both used to raise before any reply was sent).
    * ``/done/<host>`` - record that a host finished the step; once all
      hosts did, optionally snapshot, send the initial ``__Start__``
      message if not yet sent, and call ``Prepare`` for every host.
    * ``/ready/<host>/<numtasks>`` - record that a host is ready; once all
      hosts are, either quit (no host reported tasks) or start the next
      superstep on every host.
    * ``/error/<host>/<msg>`` - log the error and force-quit the master.

    Any other path is answered with a plain-text echo of the request.

    Locks are released in ``finally`` blocks so an exception raised while
    one is held (for example a non-numeric host id) cannot leave it locked.
    """
    parsed_path = urlparse.urlparse(self.path)
    command = parsed_path.path
    dargs = dict(urlparse.parse_qsl(parsed_path.query))

    subpath = self.path.split("/")
    # First path component; empty when the path contains no "/" at all, so
    # the comparisons below cannot raise IndexError.
    action = subpath[1] if len(subpath) > 1 else ""

    if self.path == "/start":
        logging.info("starting host servers")

        self.server.timer.start("master")
        self.server.superstep_count = 0
        self.server.snapshot_counter = 0

        master = self.config["master"]
        for h in self.config["hosts"]:
            self.StartHostServer(h, master)
        # no return here: the reply is the request echo at the bottom

    elif self.path == "/quit":
        self._quit()
        return

    elif self.path == "/getkv":
        logging.debug("getting kv file")
        self.send_response(200)
        if self.server.superstep_count > 1:
            # We're done computing everything when neither flag is set, so
            # let the caller know this is the final copy of the k-v file.
            final = not self.server.executing and not self.server.iterate
            body = json.dumps(get_kv_file("master"))
            if final:
                body = "DONE " + body
        else:
            body = "None"
        self.send_header('Content-Length', len(body))
        self.end_headers()
        self.wfile.write(body)
        return

    elif self.path == "/dummy":
        logging.debug("dummy request")
        self.send_response(200)
        self.send_header('Content-Length', 0)
        self.end_headers()
        return

    elif self.path == "/config":
        logging.debug("get configuration")
        body = json.dumps(self.config)
        self.send_response(200)
        self.send_header('Content-Length', len(body))
        self.end_headers()
        self.wfile.write(body)
        return

    elif command == "/exec":
        # NOTE(security): `p` is taken verbatim from the query string, so
        # this serves any file the process can read. Left unchanged because
        # the set of legitimate paths is not visible here - restrict it if
        # the server can be reached by untrusted clients.
        pname = dargs.get("p")
        try:
            ptime = int(dargs.get("t"))
        except (TypeError, ValueError):
            # "t" missing or not a number: treat the client copy as oldest
            ptime = 0

        if pname is None:
            self.send_response(400)
            self.send_header('Content-Length', 0)
            self.end_headers()
            return

        try:
            mtime = int(os.stat(pname).st_mtime)
        except OSError:
            self.send_response(404)
            self.send_header('Content-Length', 0)
            self.end_headers()
            return

        if mtime <= ptime:
            # the file has not changed
            self.send_response(304)
            self.send_header('Content-Length', 0)
            self.end_headers()
            return

        # binary mode keeps Content-Length equal to the bytes sent
        with open(pname, "rb") as f:
            content = f.read()

        self.send_response(200)
        self.send_header('Content-Length', len(content))
        self.end_headers()
        self.wfile.write(content)
        return

    elif action == "done":
        self.send_response(200)
        self.send_header('Content-Length', 0)
        self.end_headers()

        if len(subpath) > 2:
            host = subpath[2]

            self.server.global_lock.acquire()
            try:
                cur_superstep = self.server.superstep_count
                if cur_superstep > 0:
                    # int(host) raises ValueError for a non-numeric id
                    self.server.timer.stop("superstep-%d-host-%d" %
                                           (cur_superstep, int(host)))
            finally:
                self.server.global_lock.release()

            self.server.done_lock.acquire()
            try:
                self.server.done.add(host)
                str_log = "host %s completed work" % (str(self.server.done))
                done_size = len(self.server.done)
            finally:
                self.server.done_lock.release()
            logging.info(str_log)

            if done_size == len(self.config["hosts"]):
                logging.info("all hosts completed")

                # Fix possible concurrency issue with supervisor.py
                if cur_superstep == 0:
                    time.sleep(5)

                if self.server.snapshot_enabled:
                    self.server.global_lock.acquire()
                    try:
                        self.server.snapshot_counter += 1
                        cmd = "./snapshot.sh %d" % (
                            self.server.snapshot_counter - 1)
                    finally:
                        self.server.global_lock.release()
                    logging.info(cmd)
                    os.system(cmd)

                # initialize a set of ready servers,
                # clear the continue indicator
                self.server.ready_lock.acquire()
                try:
                    self.server.ready = set()
                finally:
                    self.server.ready_lock.release()
                self.server.iterate = False

                # send a start message at the beginning
                if not self.server.start:
                    self.server.start = True
                    self.server.executing = True
                    (starthost, starttask) = self.GetStartInfo(self.config)
                    s = "send __Start__ message for task %s to host %s" % (
                        starttask, starthost)
                    logging.debug(s)
                    client.message(starthost, "__Main__", starttask,
                                   "__Start__")

                # send a prepare command to all the hosts
                hosts = self.config["hosts"]
                logging.debug("hosts " + str(hosts))
                for h in hosts:
                    logging.debug("send prepare to " + str(h))
                    self.Prepare(h)
                    logging.debug("done sending prepare to " + str(h))
        return

    elif action == "ready":
        self.send_response(200)
        self.send_header('Content-Length', 0)
        self.end_headers()

        if len(subpath) > 2:
            host = subpath[2]

            # get the number of active tasks on the host
            try:
                numtasks = int(subpath[3])
            except (IndexError, ValueError):
                # count missing or malformed: treat as no active tasks
                numtasks = 0

            # execute the next step, if there are active tasks
            if numtasks > 0:
                self.server.iterate = True

            self.server.ready_lock.acquire()
            try:
                self.server.ready.add(host)
                str_log = "host %s ready" % (str(self.server.ready))
                ready_size = len(self.server.ready)
            finally:
                self.server.ready_lock.release()
            logging.debug(str_log)

            hosts = self.config["hosts"]
            if ready_size == len(hosts):
                # stop the execution, if there are no more tasks to execute
                if not self.server.iterate:
                    logging.info("all tasks completed")
                    self.server.executing = False
                    self.server.iterate = False
                    time.sleep(10)
                    self._quit(force=True)
                    return

                logging.info("all hosts ready")

                # initialize a set of done servers
                self.server.done_lock.acquire()
                try:
                    self.server.done = set()
                finally:
                    self.server.done_lock.release()

                self.server.global_lock.acquire()
                try:
                    self.server.superstep_count += 1
                    for h in hosts:
                        h_id = int(h['id'])
                        self.server.timer.start(
                            "superstep-%d-host-%d" %
                            (self.server.superstep_count, h_id))
                finally:
                    self.server.global_lock.release()

                # send a step start command to all the hosts
                # TODO: create a thread for this step
                for h in hosts:
                    logging.info("send next step to " + str(h))
                    self.StartStep(h)
        return

    elif action == "error":
        self.send_response(200)
        self.send_header('Content-Length', 0)
        self.end_headers()

        if len(subpath) > 3:
            src_host = subpath[2]
            encoded_msg = subpath[3]
            msg_dict = urlparse.parse_qs(encoded_msg)
            # .get(): a report without a "msg" field must still reach the
            # shutdown below instead of dying on KeyError
            logging.critical("Error msg from supervisor %s: %s" %
                             (src_host, msg_dict.get('msg')))
            logging.critical("Terminating master now")
            self._quit(force=True)
        return

    # "/start" and unknown paths: echo the request back. The text is
    # assembled only here because nothing else uses it.
    message_parts = [
        'CLIENT VALUES:',
        'client_address=%s (%s)' % (self.client_address,
                                    self.address_string()),
        'command=%s' % self.command,
        'path=%s' % self.path,
        'real path=%s' % parsed_path.path,
        'query=%s' % parsed_path.query,
        'request_version=%s' % self.request_version,
        '',
        'SERVER VALUES:',
        'server_type=%s' % "head server",
        'server_version=%s' % self.server_version,
        'sys_version=%s' % self.sys_version,
        'protocol_version=%s' % self.protocol_version,
        '',
        'HEADERS RECEIVED:',
    ]
    for name, value in sorted(self.headers.items()):
        message_parts.append('%s=%s' % (name, value.rstrip()))
    message_parts.append('')
    message = '\r\n'.join(message_parts)

    self.send_response(200)
    self.send_header('Content-Length', len(message))
    self.end_headers()
    self.wfile.write(message)
    return