def jobRunStatusWatch(data, stat, event=None):
    """ZooKeeper data watcher: broadcast a job-status-change event.

    When the /cabbage/jobs/<jobId>/status node changes, the job id is taken
    from path segment 3 and a JobStatusChangeEvent is published locally.
    All failures are logged and swallowed (watcher callbacks must not raise).
    """
    try:
        if not data or event is None:
            return
        # path looks like /cabbage/jobs/<jobId>/status -> segment 3 is the id
        changedJobId = event.path.split("/")[3]
        zope.event.notify(JobStatusChangeEvent(changedJobId))
    except Exception:
        Logger.exception(log)
def action(queue):
    """Worker loop: forever consume job ids from *queue* and run doAction.

    Each iteration is individually guarded so one failing job does not kill
    the loop; errors are logged and the loop continues.
    """
    while True:
        try:
            doAction(queue.get())
        except Exception:
            Logger.exception(log)
def runJob(q):
    """Event-pump loop: forever take events from queue *q* and publish them
    via zope.event.

    FIX: the local variable was misspelled ``evnet``; renamed to ``event``.
    Errors are logged and swallowed so the pump keeps running.
    """
    while True:
        try:
            event = q.get()
            zope.event.notify(event)
        except Exception:
            Logger.exception(log)
def jobReadyWatch(children):
    """Child watcher on the jobs "readies" node: cache newly-ready jobs.

    For every job id not yet in the local cache, loads the job from the
    server store, caches it, and maps each of its task names back to the
    job id in the task cache.

    FIX: removed a large block of commented-out dead routing code and
    translated the remaining comments to English.  Per-job failures are
    logged and do not abort processing of the other children.
    """
    try:
        for jobId in children:
            try:
                job = StoreHolder.getServerStore().getJob(jobId)
                if CacheHolder.getCache().hasKey(jobId, JOBS) is False:
                    CacheHolder.getCache().put(jobId, job, JOBS)
                    # remember which job each task name belongs to
                    for taskName in job.tasks:
                        TaskCacheHolder.getJobCache().put(taskName, job.jobId)
            except Exception:
                Logger.exception(log)
    except Exception:
        Logger.exception(log)
def addBroberServerHandler(event):
    """Register a Cabbage client for a newly added broker server.

    NOTE(review): the public name keeps the project's existing "Brober"
    spelling so callers are unaffected.
    """
    if not (event and event.brokerServer):
        return
    server = event.brokerServer
    Logger.info(log, "添加队列服务器【%s】,URI:【%s】" % (server.hostName, server.connectUri))
    newCabbage = Cabbage(hostName=server.hostName, broker=server.connectUri)
    CabbageHolder.getServerCabbages()[server.hostName] = newCabbage
    Logger.debug(log, "添加队列服务器【%s】" % CabbageHolder.getServerCabbagesStr())
def handleRequest(conn, addr):
    """Serve a single client connection.

    Decodes and dispatches the request via doRequestHandle; if it produces a
    reply message, the encoded reply is written back on the same socket.
    Failures are logged and swallowed.
    """
    try:
        reply = doRequestHandle(conn)
        # doRequestHandle returns a message object, or a falsy value
        # when no response should be sent
        if reply:
            conn.sendall(MessageCodec().encode(reply))
    except Exception:
        Logger.exception(log)
def workBrokerQueueChangeHandler(event):
    """React to a broker-queue change for this host.

    Refreshes the cached work record for HOST_NAME and, when the change came
    from a real event while the node is online, restarts celery so the new
    queue assignment takes effect.
    """
    with storeFactory.store() as store:
        currentWork = store.getWork(HOST_NAME)
        CacheHolder.getCache().put(HOST_NAME, currentWork, WORKS)
        if event.isEvent and (currentWork.status == ON_LINE):
            Logger.info(log, "restart")
            CabbageControlHolder.getCabbageControl().restartCelery()
def workStatusWatch(data, stat, event=None):
    """ZooKeeper data watcher on a worker's status node.

    Only reacts when the changed path belongs to this host; then publishes a
    ClientWorkStatusEvent carrying the new status data.
    """
    try:
        if not data or event is None:
            return
        # e.g. path = /cabbage/works/huamac/status -> segment 3 is the host
        watchedHost = event.path.split("/")[3]
        if watchedHost == HOST_NAME:
            zope.event.notify(ClientWorkStatusEvent(data))
    except Exception:
        Logger.exception(log)
def workOnlineWatch(data, stat=None, event=None):
    """ZooKeeper watcher on a worker's online flag; marks dead nodes LOST.

    A DELETED event on /cabbage/works/<hostName>/online means the ephemeral
    node vanished, i.e. the worker died: its stored status is set to LOST.
    """
    if event is None:
        return
    if event.type == "DELETED":
        # path: /cabbage/works/<hostName>/online -> segment 3 is the host
        deadHost = event.path.split("/")[3]
        work = StoreHolder.getServerStore().getWork(deadHost)
        work.status = LOST
        Logger.info(log, "节点:【%s】IP:【%s】已经死亡!" % (deadHost, work.ip))
        StoreHolder.getServerStore().updateWorkStatus(work)
def configWatch(children):
    """Child watcher on the config root: load every option into ConfigHolder.

    For each config child node, (re)registers a data listener and copies the
    current value into the BASE section of the in-memory config.
    """
    try:
        for optionName in children:
            optionPath = CONFIG_PATH + "/" + optionName
            kazooClient.addDataListener(optionPath, configOptionDataChange)
            optionValue = kazooClient.getData(optionPath)
            ConfigHolder.getConfig().setProperty(BASE, optionName, optionValue)
    except Exception:
        Logger.exception(log)
def workBrokerQueueWatch(children):
    """Child watcher on this host's queue node.

    Publishes a WorkBrokerQueueChangeEvent with the new queue list unless the
    queue node's data is the DO_NOTHING sentinel.
    """
    try:
        queueNodePath = "/" + CABBAGE + "/" + WORKS + "/" + HOST_NAME + "/" + QUEUES
        sentinel = kazooClient.getData(queueNodePath)
        if sentinel and sentinel == DO_NOTHING:
            return
        zope.event.notify(WorkBrokerQueueChangeEvent(children, isEvent=True))
    except Exception:
        Logger.exception(log)
def updateTaksSent(self, taskName):
    """Thread-safely increment the sent counter for *taskName*.

    FIX: the lock was acquired inside the ``try`` while ``finally`` always
    released it, so a failed acquire would attempt to release an unheld
    lock; ``with self.lock:`` makes the pairing correct.  Also collapses
    the membership test into a single ``dict.get``.
    (Public name keeps the project's existing "Taks" spelling.)
    """
    try:
        with self.lock:
            self.taskSent[taskName] = self.taskSent.get(taskName, 0) + 1
    except Exception:
        Logger.exception(log)
def jobAduitStatusWatch(data, stat, event=None):
    """ZooKeeper watcher on a job's audit-status node.

    When the status becomes JOB_AUTH_PASS, reloads the job, refreshes the
    cache, and publishes a JobAuditPassEvent.
    (Public name keeps the project's existing "Aduit" spelling.)
    """
    try:
        if not (data and data == JOB_AUTH_PASS and event is not None):
            return
        # example path: /cabbage/jobs/job-47778319-.../status -> segment 3
        passedJobId = event.path.split("/")[3]
        with storeFactory.store() as store:
            job = store.getJob(passedJobId)
            updateJobCache(passedJobId, job)
            zope.event.notify(JobAuditPassEvent(passedJobId))
    except Exception:
        Logger.exception(log)
def taskSucceeded(state, event, app):
    """Celery monitor callback for a task-succeeded event.

    Resolves the owning job, optionally persists the task result (when the
    broker app has a real result backend), computes queue time from the
    monitor timestamps, and feeds the success counters.

    FIX: removed the large block of commented-out dead lookup code; logic is
    unchanged.  All failures are logged and swallowed so the monitor loop
    keeps running.
    """
    taskId = event['uuid']
    task = state.tasks.get(taskId)
    # job id / task name come from the shared id cache, not from task.kwargs
    # (task.name was reported to be frequently missing -- see old FIXME)
    jobId, taskName = _getJobIdAndTaskName(taskId)
    queueTime = 0
    runtime = 0
    log.debug("【%s】 TASK SUCCEEDED !" % (event['uuid']))
    try:
        job = CacheHolder.getCache().get(jobId, JOBS)
        result = AsyncResult(taskId, app=CabbageHolder.getServerCabbage(job.brokerServer).getApp())
        if not isinstance(result.backend, DisabledBackend):
            log.debug("【%s】 TASK SUCCEEDED result【%s】 !" % (event['uuid'], result.result))
            if result.result:
                jobResults.addResult(jobId, result.result)
        if task and task.started and task.received:
            # time the task spent waiting in the queue before starting
            queueTime = task.started - task.received
            runtime = event['runtime']
        CabbageCounterHolder.getCabbageCounter().updateTaskSucceeded(
            taskName, _getHostName(event), runtime, queueTime)
    except Exception:
        Logger.exception(log)
def taskReceived(state, event, app):
    """Celery monitor callback: count a task-received event per host.

    FIX: the bare ``except:`` is narrowed to ``except Exception:`` so that
    SystemExit/KeyboardInterrupt are no longer swallowed; removed dead
    commented-out debug code.
    """
    log.debug("%s" % event)
    try:
        CabbageCounterHolder.getCabbageCounter().updateTaskReceived(
            event['name'], _getHostName(event))
    except Exception:
        Logger.exception(log)
def jobAuditPassHandler(event):
    """Handle a job audit-pass event on this node.

    Syncs the job's files/script locally, then restarts celery so the worker
    child process picks up the newly loaded module.
    """
    try:
        passedJobId = event.jobId
        syncJob(passedJobId)
        # function-scope import -- presumably avoids a circular import at
        # module load time (TODO confirm)
        from cabbage.process.cabbage_control_holder import CabbageControlHolder
        CabbageControlHolder.getCabbageControl().restartCelery()
    except Exception:
        Logger.exception(log)
def saveJob(self, job):
    """Persist *job* to ZooKeeper under /cabbage/jobs/<jobId>.

    Writes one child znode per scalar job attribute, plus per-attachment,
    per-work and per-task subtrees, and finally creates a node under
    /cabbage/jobs/<READIES>/<jobId> that watchers use as the "job is fully
    written" signal.  The creation order matters: the readies node must be
    last so observers never see a partially written job.
    """
    parent = "/" + CABBAGE + "/" + JOBS + "/" + job.jobId
    self.client.create(parent, makepath=True)
    Logger.debug(log, parent)
    # one znode per scalar attribute
    self.client.create(parent + "/" + JOB_NAME, value=job.jobName)
    self.client.create(parent + "/" + FILE_PATH, value=job.filePath)
    self.client.create(parent + "/" + FILE_NAME, value=job.fileName)
    self.client.create(parent + "/" + FILE_TYPE, value=job.fileType)
    self.client.create(parent + "/" + STATUS, value=job.status)
    self.client.create(parent + "/" + AUDIT_STATUS, value=job.auditStatus)
    self.client.create(parent + "/" + RUN_STRATEGY, value=job.runStrategy)
    self.client.create(parent + "/" + STRATEGY_VALUE, value=job.strategyValue)
    self.client.create(parent + "/" + ATTACH_FILES)
    # NOTE(review): the project constant really is spelled REULST_BACKEND
    self.client.create(parent + "/" + REULST_BACKEND, value=job.resultBackend)
    # attachment files: path node holds the file path, child node the type
    for f in job.attachFiles:
        self.client.create(parent + "/" + ATTACH_FILES + "/" + f.fileName, value=f.filePath, makepath=True)
        self.client.create(parent + "/" + ATTACH_FILES + "/" + f.fileName + "/" + FILE_TYPE, value=f.fileType, makepath=True)
    # one node per assigned worker, value is its port
    for w in job.works:
        self.client.create(parent + "/" + WORKS + "/" + LIST + "/" + w.hostName, value=w.port, makepath=True)
    if not self.client.isExistPath(parent + "/" + WORKS + "/" + READIES):
        self.client.create(parent + "/" + WORKS + "/" + READIES)
    # task list plus a per-task results container
    if job.tasks:
        for task in job.tasks:
            self.client.create(parent + "/" + TASKS + "/" + task, makepath=True)
            self.client.create("/" + CABBAGE + "/" + JOBS + "/" + RESULTS + "/" + job.jobId + "/" + task, makepath=True)
    self.client.create(parent + "/" + BROKER_SERVER, value=job.brokerServer)
    self.client.create(parent + "/" + QUEUE, value=job.brokerQueue)
    # for q in job.queue:
    #     self.client.create(parent+"/"+BROKER_SERVER+"/"+QUEUES+"/"+q)
    # This dedicated node exists because cluster nodes may start watching
    # before the directories above are fully created and would observe
    # incomplete data; node monitoring therefore keys off this path alone.
    self.client.create("/" + CABBAGE + "/" + JOBS + "/" + READIES + "/" + job.jobId)
def addScriptJobId(self, jobId):
    """Load the job's script on this node and report readiness to the store.

    Loads the script for *jobId*, looks up this host's work record, and sends
    the be-ready notification.  Failures are logged and swallowed.

    FIX: removed the explicit ``store.close()`` on the success path -- the
    ``finally`` clause already closes the store, so the old code closed it
    twice.
    """
    store = None
    try:
        store = StoreHolder.getStore()
        log.info("节点【%s】当前任务【%s】的脚本开始加载。。。。" % (HOST_NAME, jobId))
        self.loadJobScript(jobId, store)
        work = store.getWork(HOST_NAME)
        self.sendBeReady(jobId, work, store)
    except Exception:
        Logger.exception(log)
    finally:
        if store:
            store.close()
def updateTaskReceived(self, taskName, hostName):
    """Thread-safely increment the received counter for (taskName, hostName).

    FIX: the lock was acquired inside the ``try`` while ``finally`` always
    released it, so a failed acquire would release an unheld lock;
    ``with self.lock:`` makes the pairing correct.  ``setdefault`` replaces
    the nested membership checks.
    """
    try:
        with self.lock:
            hostCounts = self.taskReceived.setdefault(taskName, {})
            hostCounts[hostName] = hostCounts.get(hostName, 0) + 1
    except Exception:
        Logger.exception(log)
def doAction(jobId):
    """Upload a job's result files to its configured result backend.

    Ensures the job is cached (loading it from the store on a miss), then
    dispatches to the NFS or HDFS backend according to job.resultBackend;
    jobs with no result backend are a no-op.

    FIX: ``job.resultBackend == None`` replaced with the idiomatic
    ``is None`` identity test.
    """
    if not CacheHolder.getCache().hasKey(jobId, JOBS):
        with storeFactory.store() as store:
            job = store.getJob(jobId)
            CacheHolder.getCache().put(jobId, job, JOBS)
    job = CacheHolder.getCache().get(jobId, JOBS)
    Logger.debug(log, "upload files. job【%s】" % str(job.asDict()))
    if job.resultBackend is None:
        return
    elif job.resultBackend == NFS:
        CabbageNfsBackend(jobId).save()
    elif job.resultBackend == HDFS:
        CabbageHdfsBackend(jobId).save()
def updateTaskSucceeded(self, taskName, hostName, runTime, queueTime):
    """Thread-safely accumulate success count, runtime and queue time for
    (taskName, hostName).

    FIX: the lock was acquired inside the ``try`` while ``finally`` always
    released it, so a failed acquire would release an unheld lock;
    ``with self.lock:`` makes the pairing correct.  ``setdefault`` replaces
    the nested membership checks.
    """
    try:
        with self.lock:
            hostStats = self.taskSucceeded.setdefault(taskName, {})
            if hostName in hostStats:
                m = hostStats[hostName]
                m[TASK_COUNT] += 1
                m[TASK_RUNTIME] += runTime
                m[TASK_QUEUE_TIME] += queueTime
            else:
                hostStats[hostName] = {TASK_COUNT: 1,
                                       TASK_RUNTIME: runTime,
                                       TASK_QUEUE_TIME: queueTime}
    except Exception:
        Logger.exception(log)
def jobChildWatch(children):
    """Child watcher on the jobs root.

    For every job not yet cached whose broker queue this worker serves,
    registers status/audit-status data listeners and caches the job.
    Failures are logged and swallowed.
    """
    try:
        for child in children:
            jobId = str(child)
            with storeFactory.store() as store:
                job = store.getJob(jobId)
                work = store.getWork(HOST_NAME)
                if CacheHolder.getCache().hasKey(jobId, JOBS) is False and job.brokerQueue in work.queues:
                    # start watching this job's run-status and audit-status
                    parent = "/" + CABBAGE + "/" + JOBS + "/" + jobId
                    kazooClient.addDataListener(parent + "/" + STATUS, jobRunStatusWatch)
                    kazooClient.addDataListener(parent + "/" + AUDIT_STATUS, jobAduitStatusWatch)
                    updateJobCache(jobId, job)
    except Exception:
        Logger.exception(log)
def taskSent(state, event, app):
    """Celery monitor callback: count a task-sent event.

    FIX: the bare ``except:`` is narrowed to ``except Exception:`` so that
    SystemExit/KeyboardInterrupt are no longer swallowed; removed dead
    commented-out debug code.
    """
    try:
        taskName = event['name']
        taskId = event['uuid']
        # resolve the owning job id from the task-name cache
        # NOTE(review): jobId/taskId are currently unused beyond the lookup;
        # kept to preserve existing behavior of the cache access
        jobId = TaskCacheHolder.getJobCache().get(taskName)
        CabbageCounterHolder.getCabbageCounter().updateTaksSent(taskName)
    except Exception:
        Logger.exception(log)
def taskFailed(state, event, app):
    """Celery monitor callback for a task-failed event.

    Resolves the owning job, appends the enriched event dict to a per-broker
    daily fail-log file under TASK_FAILLOG_PATH, and feeds the failure
    counters.  Failures of the handler itself are logged and swallowed.

    FIX: removed the block of commented-out dead lookup code; logic is
    unchanged.
    """
    eventOutDic = event.copy()
    taskName = None
    try:
        taskId = event['uuid']
        task = state.tasks.get(taskId)
        # job id / task name come from the shared id cache (task.name is
        # unreliable -- see the equivalent note in taskSucceeded)
        jobId, taskName = _getJobIdAndTaskName(taskId)
        eventOutDic[JOB_ID] = jobId
        job = CacheHolder.getCache().get(eventOutDic[JOB_ID], JOBS)
        brokerServer = job.brokerServer
        taskPath = ConfigHolder.getConfig().getProperty(BASE, TASK_FAILLOG_PATH)
        if not os.path.isdir(taskPath):
            os.makedirs(taskPath)
        dateStr = getNowDateStr()
        # append the failed event to <broker>_<yyyymmdd>.log
        with open(taskPath + "/" + brokerServer + "_" + dateStr + ".log", "a+") as writer:
            writer.write(str(eventOutDic) + "\n")
        CabbageCounterHolder.getCabbageCounter().updateTaksFail(
            taskName, _getHostName(event))
    except Exception:
        Logger.exception(log)
def doAction(self, actionFun):
    """Atomically snapshot all four counter dicts, hand the copies to
    *actionFun*, then clear the live counters.

    FIX: the lock was acquired inside the ``try`` while ``finally`` always
    released it, so a failed acquire would release an unheld lock;
    ``with self.lock:`` makes the pairing correct.

    Note: the counters are cleared only if *actionFun* returns without
    raising, matching the original behavior.
    """
    try:
        with self.lock:
            sendDict = self.taskSent.copy()
            receivedDict = self.taskReceived.copy()
            failDict = self.taskFail.copy()
            succeedDict = self.taskSucceeded.copy()
            actionFun(sendDict, receivedDict, failDict, succeedDict)
            self.taskFail.clear()
            self.taskSent.clear()
            self.taskSucceeded.clear()
            self.taskReceived.clear()
    except Exception:
        Logger.exception(log)
def save(self):
    """Upload this job's local hourly result files to HDFS, then delete them.

    Local layout is <localPath>/<jobId>/result/<yyyymmdd>/<hour-file>; the
    remote layout mirrors it under <HDFS_ROOT_PATH>/<jobId>/<yyyymmdd>.
    At hour 0 the previous day's directory is flushed instead; otherwise
    only files for hours strictly before the current hour are uploaded
    (the current hour's file may still be written to).
    """
    hdfsPath = ConfigHolder.getConfig().getProperty(BASE, HDFS_ROOT_PATH)
    dateStr = getNowDateStr()
    if self.jobId:
        localPath = getLocalFilesPath()
        dateStr = getNowDateStr()
        hour = getNowHour()
        if hour == 0:  # at midnight, submit the previous day's data
            dateStr = formatDate(subDay(getNow(), 1), f="%Y%m%d")
        p = localPath + "/" + self.jobId + "/result/" + dateStr
        Logger.debug(log, "upload file to hdfs. jobId【%s】 date【%s】" % (self.jobId, dateStr))
        if not os.path.isdir(p):
            return
        fileNames = os.listdir(p)
        if len(fileNames) == 0:
            return
        client = HdfsClientHolder.getHdfsClient()
        # ensure <hdfsPath>/<jobId>/<dateStr> exists remotely
        remoteDire = hdfsPath + "/" + self.jobId
        if not client.isDirectory(remoteDire):
            client.mkdir(remoteDire)
        remoteDire = remoteDire + "/" + dateStr
        if not client.isDirectory(remoteDire):
            client.mkdir(remoteDire)
        Logger.info(log, "hour:%s files:%s" % (hour, ",".join(fileNames)))
        for fileName in fileNames:
            if hour != 0:
                # result files are named by hour; skip current/future hours
                if int(fileName) >= hour:
                    continue
            self.uploadToHdfs(client, localPath, self.jobId, hdfsPath, fileName, dateStr)
            # local copy is removed only after a successful upload
            os.remove(p + "/" + fileName)
def jobRemoveHandler(event):
    """Handle a job-removal event: stop/revoke its tasks and purge caches.

    If a running JobRun is cached it is stopped directly; otherwise every
    task of the job is revoked by name on its broker.  The job is then marked
    JOB_DELETE in the store, its task-name mappings are dropped (so a later
    job may reuse the same task names), and the job cache entry is removed.

    FIX: the bare ``except:`` is narrowed to ``except Exception:`` so that
    SystemExit/KeyboardInterrupt are no longer swallowed.
    """
    try:
        jobId = event.jobId
        if JobCacheHolder.getJobCache().has_key(jobId):
            jobRun = JobCacheHolder.getJobCache().get(jobId)
            if jobRun:
                # stop the locally running tasks
                jobRun.stop()
        else:
            job = CacheHolder.getCache().get(jobId, JOBS)
            for taskName in job.tasks:
                CabbageHolder.getServerCabbage(job.brokerServer).revokeByTaskName(taskName)
        with storeFactory.store() as store:
            store.updateJobStatus(jobId, JOB_DELETE)
        # drop the cached task-name mappings so the next job may reuse names
        tasks = CacheHolder.getCache().get(jobId, JOBS).tasks
        for taskName in tasks:
            if TaskCacheHolder.getJobCache().has_key(taskName):
                TaskCacheHolder.getJobCache().remove(taskName)
        CacheHolder.getCache().remove(jobId, JOBS)
    except Exception:
        Logger.exception(log)
def save(self):
    """Move this job's local hourly result files onto the shared NFS path.

    Local layout is <localPath>/<jobId>/result/<yyyymmdd>/<hour-file>; each
    file is moved to <NFS_DIRECTORY>/<jobId>/<yyyymmdd>/ and renamed to
    <host>_<ip>_<role>_<hour>.  At hour 0 the previous day's directory is
    flushed instead; otherwise only files for hours strictly before the
    current hour are moved.  All failures are logged and swallowed.
    """
    try:
        nfsPath = ConfigHolder.getConfig().getProperty(BASE, NFS_DIRECTORY)
        dateStr = getNowDateStr()
        if self.jobId:
            localPath = getLocalFilesPath()
            dateStr = getNowDateStr()
            hour = getNowHour()
            if hour == 0:  # at midnight, submit the previous day's data
                dateStr = formatDate(subDay(getNow(), 1), f="%Y%m%d")
            localPath = localPath + "/" + self.jobId + "/result/" + dateStr
            Logger.info(log, "upload file to nfs. jobId【%s】 date【%s】" % (self.jobId, dateStr))
            if not os.path.isdir(localPath):
                return
            fileNames = os.listdir(localPath)
            if len(fileNames) == 0:
                return
            remoteDire = nfsPath + "/" + self.jobId + "/" + dateStr
            if not os.path.isdir(remoteDire):
                os.makedirs(remoteDire)
                # os.chmod(remoteDire,777)
            Logger.info(log, "hour:%s files:%s" % (hour, ",".join(fileNames)))
            for fileName in fileNames:
                if hour != 0:
                    # result files are named by hour; skip current/future hours
                    if int(fileName) >= hour:
                        continue
                # prefix with host/ip/role so files from many nodes coexist
                newFileName = None
                if os.environ[CABBAGE] == MASTER:
                    newFileName = HOST_NAME + "_" + LOCAL_IP + "_" + MASTER + "_" + fileName
                else:
                    newFileName = HOST_NAME + "_" + LOCAL_IP + "_" + NODE + "_" + fileName
                if os.path.isfile(localPath + "/" + fileName):
                    shutil.move(localPath + "/" + fileName, remoteDire + "/" + newFileName)
    except Exception as e:
        Logger.exception(log)
def jobWebWatch(children):
    """Child watcher (web/master side) on the jobs root: rebuild celery routes.

    Caches every job not yet cached, then -- for every non-deleted job --
    builds a CELERY_ROUTES mapping taskName -> {queue, routing_key} grouped
    by broker server, and installs a fresh Cabbage client per broker with
    those routes.  The store is always returned to the factory.
    """
    store = storeFactory.getStore()
    try:
        # broker hostname -> accumulated routes dict
        brokers = {}
        for jobId in children:
            try:
                job = store.getJob(jobId)
                if CacheHolder.getCache().hasKey(jobId, JOBS) is False:
                    CacheHolder.getCache().put(jobId, job, JOBS)
                # lazy shortcut (per original author): every non-deleted job
                # goes into the router
                if job.status != JOB_DELETE:
                    brokerServer = job.brokerServer
                    routes = {}
                    for taskName in job.tasks:
                        que = store.getQueue(job.brokerQueue)
                        routes[taskName] = {'queue': que.queueName, 'routing_key': que.routingKey}
                        TaskCacheHolder.getJobCache().put(taskName, job.jobId)
                    if brokerServer in brokers:
                        brokers[brokerServer].update(routes)
                    else:
                        brokers[brokerServer] = routes
            except Exception:
                Logger.exception(log)
        for broker, routes in brokers.items():
            brokerServer = store.getBrokerServer(broker)
            # bug fix (per original author): rebuild the Cabbage client so
            # tasks are not submitted to the default celery queue
            cabbage = Cabbage(hostName=brokerServer.hostName, broker=brokerServer.connectUri)
            cabbage.app.conf.update(CELERY_ROUTES=routes)
            CabbageHolder.getServerCabbages()[brokerServer.hostName] = cabbage
            Logger.info(log, "更新队列服务器【%s】ROUTES【%s】" % (brokerServer.hostName, str(routes)))
    except Exception:
        Logger.exception(log)
    finally:
        # NOTE(review): "returnStroe" is the store factory's actual API name
        storeFactory.returnStroe(store)
# -*- encoding: utf-8 -*-
'''
Created on 2016-08-17

@author: hua

Tiny logging self-test module: raises a deliberate exception at import time
and logs it (tagged with this host's name) through the project logger.
'''
from cabbage.common.log.logger import Logger
from cabbage.utils.host_name import getHostName


def error():
    """Raise a test exception so the handler below has something to log."""
    raise Exception("dadsfasdf")


log = Logger.getLogger(__name__)

try:
    error()
except Exception as e:
    log.exception(getHostName())