def get_job(self):
    """
    Dispatch loop: take job references off the "get_job" queue, load each
    job from ZooKeeper, hand READY jobs to a run queue and mark them RUNNING.

    Runs until self._stop becomes truthy. A job whose payload has a truthy
    "nthread" goes to the "sigle_run" queue, every other job goes to
    "mult_run". If handling a job raises, the error is logged and the
    reference is put back on the "get_job" queue so that it is retried.
    """
    while 1:
        if self._stop:
            log.warn("get_job stopping")
            return

        # empty() + get() must be one atomic step with respect to the other
        # consumers, hence the lock. `with` guarantees the lock is released
        # even when get() raises, which the manual acquire/release did not.
        item = None
        with self.locks["get_job"]:
            if not self.queues["get_job"].empty():
                item = self.queues["get_job"].get(timeout=5)
                self.queues["get_job"].task_done()
        if item is None:
            # Nothing pending: back off without holding the lock.
            time.sleep(0.5)
            continue

        dist_role, dist_node, jid = item
        jid_path = os.path.join(self.zookeeper_conf.nodes, dist_role,
                                dist_node, "jobs", jid)
        try:
            job = self.zkconn.get(jid_path)[0]
            data = msgpack.loads(job)

            # Build the cipher once and reuse it for the re-encryption below.
            crypt = None
            if data["env"] == "aes":
                crypt = Crypt(self.main_conf.token)
                data["payload"] = crypt.loads(data.get("payload"))

            # Only jobs that are still READY are dispatched.
            if data["payload"]["status"] != "READY":
                continue

            data["payload"]["role"] = dist_role
            data["payload"]["node_name"] = dist_node

            # Send to the execution queue (payload is in decrypted form here).
            if data["payload"].get("nthread"):
                self.queues["sigle_run"].put(msgpack.dumps(data), timeout=5)
            else:
                self.queues["mult_run"].put(msgpack.dumps(data), timeout=5)

            # Change the stored job status to RUNNING.
            data["payload"]["status"] = "RUNNING"
            if crypt is not None:
                data["payload"] = crypt.dumps(data.get("payload"))
            self.zkconn.set(jid_path, msgpack.dumps(data))
        except Exception:
            # Exception (not a bare except) so SystemExit/KeyboardInterrupt
            # are not swallowed by the worker loop.
            log.error(traceback.format_exc())
            # NOTE(review): if the failure happened after the job was queued
            # for execution, this retry may dispatch it a second time.
            self.queues["get_job"].put((dist_role, dist_node, jid))
def __init__(self, config):
    """
    Initialise the instance from the parsed configuration.

    @param config dict: configuration mapping; its "swall" and "fs" entries
        are wrapped in Conf objects and the whole mapping is handed to MQ
    """
    swall_conf = Conf(config["swall"])

    # Configuration sections and the node identity taken from them.
    self.main_conf = swall_conf
    self.fs_conf = Conf(config["fs"])
    self.node = swall_conf.node_name
    self.node_ip = swall_conf.node_ip

    # Functions, message queue handle, stop flag and environment settings.
    self.node_funcs = self.load_module()
    self.mq = MQ(config)
    self._stop = 0
    self.sys_envs = self.load_env()

    # Register this object with a fresh job subject.
    subject = JobSubject()
    self.job_sub = subject
    subject.register(self)

    # Cipher keyed with the configured token.
    self.crypt = Crypt(self.main_conf.token)
def get_job_info(self, node_name, jid):
    """
    Return the payload (status information) of a job.

    @param node_name string: node name
    @param jid string: job id
    @return dict: the job payload, decrypted when the job's env is "aes";
        an empty dict when self.mq.get_res returns nothing for the job
    """
    data = self.mq.get_res(node_name, jid)
    if not data:
        return {}
    if data.get("env") == "aes":
        return Crypt(self.main_conf.token).loads(data.get("payload"))
    # Plaintext jobs carry the payload as-is. Previously this case fell
    # through and returned {} even though the job existed.
    return data.get("payload") or {}
def crond_clear_job(self):
    """
    Periodically delete old job znodes.

    Every 5 seconds, until self._stop becomes truthy, walk the "jobs"
    directory of every node in self.nodes and delete each job whose
    modification time is at least keep_job_time seconds old. keep_job_time
    is read from self.main_conf and defaults to 604800. The job status is
    only used for the log message; it does not influence the deletion.
    """
    while 1:
        if self._stop:
            log.warn("crond_clear_job stopping")
            return
        try:
            # Constant for the whole sweep, so look it up once.
            keep_job_time = getattr(self.main_conf, "keep_job_time", 604800)
            # Iterate over a snapshot of the node names.
            for node_name in list(self.nodes.keys()):
                job_path = os.path.join(self.zookeeper_conf.nodes,
                                        self.nodes[node_name]["role"],
                                        node_name, "jobs")
                for jid in self.zkconn.get_children(job_path):
                    jid_path = os.path.join(job_path, jid)
                    znode = self.zkconn.get(jid_path)
                    job = znode[0]
                    # presumably mtime is in milliseconds -- TODO confirm
                    mtime = znode[1]["mtime"] / 1000

                    # time.time() replaces strftime('%s'), which is a
                    # platform-specific extension and not portable.
                    delay_sec = int(time.time()) - mtime
                    if delay_sec < keep_job_time:
                        continue

                    # Decode (and decrypt) only jobs that are about to be
                    # deleted; the status is needed for the log line.
                    data = msgpack.loads(job)
                    if data["env"] == "aes":
                        crypt = Crypt(self.main_conf.token)
                        data["payload"] = crypt.loads(data.get("payload"))
                    status = data["payload"]["status"]

                    if self.zkconn.delete(jid_path):
                        log.info(
                            "delete the timeout %s %s job [%s] ok" %
                            (keep_job_time, status, jid))
                    else:
                        log.error(
                            "delete the timeout %s %s job [%s] fail" %
                            (keep_job_time, status, jid))
        except Exception:
            # Keep the cleaner alive on errors, but let SystemExit and
            # KeyboardInterrupt propagate.
            log.error(traceback.format_exc())
        time.sleep(5)
def get_job_info(self, role, node_name, jid):
    """
    Return the payload (status information) of a job stored in ZooKeeper.

    @param role string: role
    @param node_name string: node name
    @param jid string: job id
    @return dict: the job payload, decrypted when the job's env is "aes";
        an empty dict when the job znode does not exist
    """
    jid_path = os.path.join(self.zookeeper_conf.nodes, role, node_name,
                            "jobs", jid)
    if not self.zkconn.exists(jid_path):
        return {}

    data = msgpack.loads(self.zkconn.get(jid_path)[0])
    if data.get("env") == "aes":
        return Crypt(self.main_conf.token).loads(data.get("payload"))
    # Plaintext jobs carry the payload as-is. Previously this case fell
    # through and returned {} even though the job existed.
    return data.get("payload") or {}
def _send_job(self, data, role, node_name):
    """
    Send a job to the corresponding ZooKeeper directory.

    The job is created at <nodes>/<role>/<node_name>/jobs/<jid>. When the
    job's env is "aes", data["payload"] is replaced in place by its
    encrypted form before the job is serialised.

    @param data dict: job description; data["payload"]["jid"] names the znode
    @param role string: role
    @param node_name string: node name
    @return int:1 for success else 0
    """
    ret = 0
    try:
        job_path = os.path.join(self.zookeeper_conf.nodes, role, node_name,
                                "jobs", data["payload"]["jid"])
        if data.get("env") == "aes":
            crypt = Crypt(self.main_conf.token)
            data["payload"] = crypt.dumps(data.get("payload"))
        self.zkconn.create(job_path, msgpack.dumps(data))
        ret = 1
    except ZKClientError as e:
        # Prefer the exception's own message attribute when it has one and
        # fall back to its string form otherwise.
        log.error("send_job error:%s" % (getattr(e, "message", None) or e))
    # The result was computed but never handed back to the caller before.
    return ret
def send_ret(self):
    """
    Result loop: take finished job results off the "ret_job" queue and write
    them back to the job's znode in ZooKeeper.

    Runs until self._stop becomes truthy. When the job's env is "aes" the
    payload is encrypted before it is written. Any error while handling one
    result is logged and the loop moves on to the next item.
    """
    while 1:
        if self._stop:
            log.warn("send_ret stopping")
            return

        # empty() + get() must be atomic with respect to other consumers.
        # `with` releases the lock even when get() raises.
        raw = None
        with self.locks["ret_job"]:
            if not self.queues["ret_job"].empty():
                raw = self.queues["ret_job"].get(timeout=5)
                self.queues["ret_job"].task_done()
        if raw is None:
            # Nothing to send: back off without holding the lock.
            time.sleep(0.5)
            continue

        try:
            # Decoding and path building are inside the try so that a single
            # malformed result cannot terminate the loop.
            data = msgpack.loads(raw)
            payload = data["payload"]
            role = payload["role"]
            node_name = payload["node_name"]
            jid = payload["jid"]
            jid_path = os.path.join(self.zookeeper_conf.nodes, role,
                                    node_name, "jobs", jid)
            log.info(
                "[%s %s] send the result of job [%s]" % (role, node_name, jid))

            if data["env"] == "aes":
                crypt = Crypt(self.main_conf.token)
                data["payload"] = crypt.dumps(payload)

            # Serialise once; the same bytes are written twice below.
            packed = msgpack.dumps(data)
            # set() has been seen to report success without the znode being
            # updated, so the write is deliberately issued twice.
            self.zkconn.set(jid_path, packed)
            time.sleep(0.0001)
            set_ret = self.zkconn.set(jid_path, packed)
            if set_ret != 0:
                log.error("send result error,retcode is [%s]" % set_ret)
        except Exception:
            log.error(traceback.format_exc())
def _send_job(self, node_data): """ 发送job @param data dict: @param node_name string: @return int:1 for success else 0 """ ret = 0 try: key_str = self.main_conf.token crypt = Crypt(key_str) jobs = [] for node in node_data: data = node[0] node_name = node[1] if data.get("env") == "aes": data["payload"] = crypt.dumps(data.get("payload")) jobs.append((node_name, data)) if jobs: self.keeper.mq.mset_job(jobs) ret = 1 except Exception, e: log.error("send_job error:%s" % traceback.format_exc())
def get_job(self, job_data):
    """
    Fetch the stored data of several jobs in one call.

    Payloads of jobs whose env is "aes" are decrypted. For a finished
    "sys.get" job with a non-empty return value, the fetched file is copied
    to its local destination and payload["return"] is rewritten (see
    _fetch_sys_get_file). Errors are logged and whatever was collected up to
    that point is returned.

    @param job_data: passed unchanged to self.mq.mget_job
    @return dict: one entry per key returned by mget_job, mapping to that
        job's data (falsy values are passed through untouched)
    """
    ret = {}
    crypt = None  # created on first use; plaintext jobs need no cipher
    try:
        rets = self.mq.mget_job(job_data)
        for node, data in rets.items():
            if data:
                if data.get("env") == "aes":
                    if crypt is None:
                        crypt = Crypt(self.main_conf.token)
                    data["payload"] = crypt.loads(data.get("payload"))
                payload = data["payload"]
                if (payload["cmd"] == "sys.get"
                        and payload["status"] == "FINISH"
                        and payload["return"] != ""):
                    self._fetch_sys_get_file(payload)
            ret[node] = data
    except Exception:
        log.error(traceback.format_exc())
    # The result was collected but never handed back to the caller before.
    return ret


def _fetch_sys_get_file(self, payload):
    """
    Place the file produced by a finished sys.get job at its local path.

    payload["return"] holds the file id on entry and is rewritten in place:
    the local path when the file is already there or was copied
    successfully, "" when the copy failed. It is left unchanged for a
    "help" invocation or when the file could not be brought into the cache.

    @param payload dict: decrypted job payload with "args", "kwargs" and
        "return" entries
    """
    args = payload["args"]
    kwargs = payload["kwargs"]
    # `args` may be empty when both paths were given as keyword arguments,
    # so it is checked before being indexed.
    if args and args[0] == "help":
        return

    fid = payload["return"]
    if "local_path" in kwargs and "remote_path" in kwargs:
        local_path = kwargs["local_path"]
        remote_path = kwargs["remote_path"]
    else:
        local_path = args[1]
        remote_path = args[0]
    stat = kwargs.get("stat")

    # A directory target receives the remote file's base name.
    if local_path.endswith('/') or os.path.isdir(local_path):
        local_path = os.path.join(local_path, os.path.basename(remote_path))

    # The local file already has the expected checksum: nothing to copy.
    if checksum(local_path) == fid:
        payload["return"] = local_path
        return

    cache_dir = app_abs_path(self.main_conf.cache)
    cached_file = os.path.join(cache_dir, fid)
    if not check_cache(cache_dir, fid):
        FsClient = load_fclient(app_abs_path(self.main_conf.fs_plugin),
                                ftype=self.fs_conf.fs_type)
        FsClient(self.fs_conf).download(fid, cached_file)
    if not check_cache(cache_dir, fid):
        # NOTE(review): the download did not populate the cache; the return
        # value keeps the file id, exactly as before.
        return

    if not make_dirs(os.path.dirname(local_path)):
        log.error("创建目标目录:%s失败" % local_path)
    if cp(cached_file, local_path, stat):
        payload["return"] = local_path
    else:
        payload["return"] = ""
def get_job(self, role, node_name, jid):
    """
    Fetch a job from ZooKeeper.

    The payload is decrypted when the job's env is "aes". For a finished
    "sys.get" job with a non-empty return value, the fetched file is copied
    to its local destination and payload["return"] is rewritten (see
    _fetch_sys_get_file).

    @param role string: role
    @param node_name string: node name
    @param jid string: job id
    @return dict:a job info, or an empty dict when a ZKClientError or
        KeyboardInterrupt was caught (the error is logged)
    """
    ret = {}
    try:
        node_path = os.path.join(self.zookeeper_conf.nodes, role, node_name,
                                 "jobs", jid)
        data = msgpack.loads(self.zkconn.get(node_path)[0])
        if data.get("env") == "aes":
            crypt = Crypt(self.main_conf.token)
            data["payload"] = crypt.loads(data.get("payload"))
        payload = data["payload"]
        if (payload["cmd"] == "sys.get"
                and payload["status"] == "FINISH"
                and payload["return"] != ""):
            self._fetch_sys_get_file(payload)
        ret = data
    except (ZKClientError, KeyboardInterrupt) as e:
        # Built-in exceptions have no `.message` on Python 3, so fall back
        # to the string form when the attribute is absent or empty.
        log.error(getattr(e, "message", None) or str(e))
    # The result was computed but never handed back to the caller before.
    return ret


def _fetch_sys_get_file(self, payload):
    """
    Place the file produced by a finished sys.get job at its local path.

    payload["return"] holds the file id on entry and is rewritten in place:
    the local path when the file is already there or was copied
    successfully, "" when the copy failed. It is left unchanged for a
    "help" invocation or when the file could not be brought into the cache.

    @param payload dict: decrypted job payload with "args", "kwargs" and
        "return" entries
    """
    args = payload["args"]
    kwargs = payload["kwargs"]
    # `args` may be empty when both paths were given as keyword arguments,
    # so it is checked before being indexed.
    if args and args[0] == "help":
        return

    fid = payload["return"]
    if "local_path" in kwargs and "remote_path" in kwargs:
        local_path = kwargs["local_path"]
        remote_path = kwargs["remote_path"]
    else:
        local_path = args[1]
        remote_path = args[0]
    stat = kwargs.get("stat")

    # A directory target receives the remote file's base name.
    if local_path.endswith('/') or os.path.isdir(local_path):
        local_path = os.path.join(local_path, os.path.basename(remote_path))

    # The local file already has the expected checksum: nothing to copy.
    if checksum(local_path) == fid:
        payload["return"] = local_path
        return

    cache_dir = app_abs_path(self.main_conf.cache)
    cached_file = os.path.join(cache_dir, fid)
    if not check_cache(cache_dir, fid):
        FsClient = load_fclient(app_abs_path(self.main_conf.fs_plugin),
                                ftype=self.fs_conf.fs_type)
        FsClient(self.fs_conf).download(fid, cached_file)
    if not check_cache(cache_dir, fid):
        # NOTE(review): the download did not populate the cache; the return
        # value keeps the file id, exactly as before.
        return

    if not make_dirs(os.path.dirname(local_path)):
        log.error("创建目标目录:%s失败" % local_path)
    if cp(cached_file, local_path, stat):
        payload["return"] = local_path
    else:
        payload["return"] = ""