def run(self):
    """Run the job process end to end: build the Contract, start it, wait
    for completion, and report progress over the DataExchange socket.

    Sends "job_started" after the contract is created, "job_done" on
    success, and "job_failed" (reason = formatted traceback) on error.
    Job timing stats are always logged in the `finally` block.
    """
    self.log("job process %s started" % (self.JID,))
    if self.LogFilePath is not None:
        self.LogFile = open(self.LogFilePath, "w")
    try:
        with self.T["JobProcess/run"]:
            setproctitle("striped_job %s" % (self.JID,))
            self.log("started: dataset: %s, fraction: %s, %d workers" % (
                self.JobDesc.DatasetName, self.JobDesc.Fraction, len(self.Workers)))
            callback_delegate = self
            with self.T["JobProcess/run/create_contract"]:
                self.Contract = Contract(
                    self.JID, self.DataServerURL, self.BulkTransportPort,
                    self.DataClient.dataset(self.JobDesc.DatasetName),
                    self.JobDesc, self.Workers, callback_delegate,
                    self.log, self.T)
            self.DataExchange.send(DXMessage(
                "job_started",
                nworkers=len(self.Workers),
                jid=self.JID,
                total_events=self.Contract.TotalEvents,
                selected_events=self.Contract.SelectedEvents,
                selected_frames=json.dumps(self.Contract.SelectedFrames)))
            self.log("job_started sent")
            with self.T["JobProcess/run/start_contract"]:
                self.Contract.start()
            # All three timestamps start equal; worker-exit callbacks move
            # First/LastWorkerExitT forward as workers finish.
            self.ContractStartedT = self.FirstWorkerExitT = self.LastWorkerExitT = time.time()
            self.log("contract started. waiting...")
            with self.T["JobProcess/run/wait_contract"]:
                self.Contract.wait()
            self.DataExchange.send(DXMessage("job_done", total_events=self.TotalEvents))
            self.log("Job finished. Worker exit timestamps: first: %.5f, last:%.5f" % (
                self.FirstWorkerExitT - self.ContractStartedT,
                self.LastWorkerExitT - self.ContractStartedT))
            self.DataExchange.close()
            self.log("---- exit ----")
    except Exception:
        # Was a bare "except:" -- narrowed so SystemExit/KeyboardInterrupt
        # still propagate; any other failure is reported to the client.
        tb = traceback.format_exc()
        self.DataExchange.send(DXMessage("job_failed").append(reason=tb))
        self.log("Exception: ------------\n%s" % (tb,))
    finally:
        self.log("----- job stats: -----\n" + self.T.formatStats())
        if self.LogFile is not None:
            self.LogFile.close()
def sendHistograms(self):
    """Send every histogram accumulator that has new fills to the client
    in a single "hist" message; send nothing when no accumulator filled."""
    message = DXMessage("hist")
    n_attached = 0
    for hist_id, accumulator in self.HAccumulators.items():
        if not accumulator.NFills:
            continue
        message.append("h:" + hist_id, accumulator.dump())
        n_attached += 1
    if n_attached:
        self.DXSock.send(message)
def run(self):
    """Validate the job request, select workers, and execute the job in a
    child JobProcess, reporting failures over the DataExchange socket.

    Failure paths (invalid token, no matching workers, any exception) set
    self.Failed and notify both the client and the Server. On success the
    Server is told the job ended. Fixes vs. original: "forkers" typo in
    the failure reason, unused local `failed` removed, bare except narrowed.
    """
    self.log("started: %s" % (self.JID,))
    job_description = self.JobDescription
    try:
        self.log("validating...")
        validated = self.Server.validate_job(job_description)
        if not validated:
            self.log("job request validation failed %s" % (repr(job_description.AuthToken),))
            self.DataExchange.send(
                DXMessage("job_failed").append(reason="Token validation failed"))
            self.Failed = True
            self.Server.jobFailed(self, "Token validation failed")
        else:
            self.log("validated: token=%s identity=[%s]" % (
                job_description.AuthToken, job_description.Identity))
            workers = self.Server.workers(tags=job_description.WorkerTags)
            self.log("workers: %s" % ([wi.Addr for wi in workers],))
            if not workers:
                self.log("no workers found for tags=%s" % (job_description.WorkerTags,))
                self.DataExchange.send(DXMessage("job_failed").append(
                    reason="No available workers found for tags=%s" % (job_description.WorkerTags,)))
                self.Failed = True
                # typo fix: was "No available forkers ..."
                self.Server.jobFailed(
                    self, "No available workers found for tags=%s" % (job_description.WorkerTags,))
            else:
                process = JobProcess(self.JID, self.DataServerURL, self.BulkTransportPort,
                                     self.DataExchange, self.DataClient, workers,
                                     job_description, self.LogFilePath)
                process.start()
                self.Started = time.time()
                self.Server.jobStarted(self)
                process.join()
                self.log("job process exited with status %s" % (process.exitcode,))
    except Exception:
        exception = traceback.format_exc()
        self.log("failed: %s" % (exception,))
        self.Server.jobFailed(self, exception)
        self.Failed = True
    finally:
        self.Ended = time.time()
        self.DataExchange.close()
        if not self.Failed:
            self.log("ended")
            self.Server.jobEnded(self)
        # NOTE(review): collapsed source is ambiguous on whether this was
        # inside the success branch; dropping the reference unconditionally
        # is the safe cleanup -- confirm against history.
        self.Server = None
# End-of-frame hook: report the per-frame event count to the client, then --
# when there are pending histogram fills -- flush all histogram collectors in
# one "hist" message and reset the fill counter.
# NOTE(review): the source was collapsed onto one line; the statements from
# "self.LastFlush = t" onward appear to belong inside the "if self.NFills:"
# branch, but confirm against version history before reformatting.
def endOfFrame(self, nevents): #self.log("end of frame") self.DXSock.send(DXMessage("events", events_delta=nevents)) t = time.time() if self.NFills: self.LastFlush = t msg = DXMessage("hist") for hid, hb in self.HCollectors.items(): msg.append("h:"+hid, hb.dump()) #print "counts:", counts self.DXSock.send(msg) self.NFills = 0
def run(self):
    """Accept-loop of the job server: listen on self.Port, read one
    "job_request" per connection, and hand validated requests to the
    JobQueue as JobTask objects.

    Ownership of the DataExchange socket passes to the JobTask on success
    (data_exchange is set to None so `finally` does not close it).
    Fixes vs. original: archive file written via `with` so the handle is
    closed (was a leak); bare except narrowed to `except Exception`.
    """
    self.Sock = socket(AF_INET, SOCK_STREAM)
    self.Sock.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
    self.Sock.bind(('', self.Port))
    self.Sock.listen(5)
    data_exchange_listener = DataExchangeSocket(self.Sock)
    while not self.Stop:
        data_exchange = None
        try:
            data_exchange = data_exchange_listener.accept()
            msg = data_exchange.recv()
            if msg and msg.Type == 'job_request':
                job_description = JobDescription.fromDXMsg(msg)
                exists = self.DataClient.dataset(job_description.DatasetName).exists
                if not exists:
                    self.log("Dataset not found: %s" % (job_description.DatasetName,))
                    data_exchange.send(DXMessage("job_failed").append(
                        reason="Dataset '%s' not found" % (job_description.DatasetName,)))
                else:
                    jid = self.jid()
                    self.log("Job description received. Job id %s assigned" % (jid,))
                    job_log_file_path = None if self.LogFileDir is None \
                        else "%s/job_%s.log" % (self.LogFileDir, jid)
                    jt = JobTask(self, jid, job_description, self.DataServerURL,
                                 self.BulkTransportPort, self.DataClient,
                                 data_exchange, job_log_file_path)
                    self.JobQueue << jt
                    data_exchange = None        # the job task owns it now !
                    if self.SourceArchive is not None:
                        # `with` closes the archive file (original leaked the handle)
                        with open("%s/ws_%s.txt" % (self.SourceArchive, jid), "w") as f:
                            f.write(job_description.WorkerText)
                    self.purgeJobHistory()
        except Exception:
            dump = traceback.format_exc()
            self.log("Uncaught exception: %s" % (dump,))
            if data_exchange is not None:
                data_exchange.send(DXMessage("job_failed").append(
                    reason="Exception: %s" % (dump,)))
        finally:
            if data_exchange is not None:
                data_exchange.close()
                data_exchange = None
def toDXMsg(self):
    """Serialize this task description into a "worker_task" DXMessage."""
    message = DXMessage(
        "worker_task",
        jid=self.JID,
        worker_module_name=self.WorkerModuleName,
        bulk_data_name=self.BulkDataName,
        dataset_name=self.DatasetName,
        use_data_cache="yes" if self.UseDataCache else "no",
        data_server_url=self.DataServerURL,
        data_mod_url=self.DataModURL,
        data_mod_token=self.DataModToken,
    )
    message.append(
        histograms=json.dumps(self.HDescriptors),
        # already encoded by the job server -- passed through undecoded
        user_params=self.UserParams,
    )
    return message
def flushAll(self, nevents):
    """Flush every stream buffer to the socket, then emit a "flush"
    message (and reset the fill counter) if any fills are pending."""
    for stream_name, stream_buffer in self.SBuffers.items():
        flushed = stream_buffer.flush()
        # flushed is a list of (nevents, data) pairs; skip None/empty results
        if flushed is not None and len(flushed) > 0:
            out = DXMessage("stream", name=stream_name, format="encode")(data=encodeData(flushed))
            self.DXSock.send(out)
    if self.NFills > 0:
        self.DXSock.send(DXMessage("flush", nevents=nevents))
        self.NFills = 0
def run(self):
    """Serve "worker_task" requests one connection at a time (synchronous,
    single job at a time). Exceptions from runWorker are reported back to
    the peer as an "exception" message when possible."""
    self.log("Started")
    signal.signal(signal.SIGINT, self.sigint)
    self.log("Listening...")
    self.Sock.listen(10)
    while not self.Stop:
        self.log("accepting new connection...")
        sock, addr = self.Sock.accept()
        # synchronously run only 1 job at a time, for now
        dxsock = DataExchangeSocket(sock)
        msg = dxsock.recv()
        if msg and msg.Type == "worker_task":
            worker_params = WorkerParams.fromDXMsg(msg)
            frames = json.loads(msg["frames"])
            wid = msg["wid"]
            try:
                self.runWorker(worker_params, dxsock, frames, wid)
            except:
                formatted = traceback.format_exc()
                self.log("jid/wid=%s/%s: SocketWorkerServer.runWorker() exception:\n%s" % (
                    worker_params.JID, wid, formatted,))
                try:
                    dxsock.send(DXMessage("exception").append(info=formatted))
                except:
                    self.log("Error sending 'exception' message:\n%s" % (traceback.format_exc(),))
        self.log("closing socket")
        dxsock.close()
def run(self):
    """Worker-farm main loop: bind an ephemeral port, register it with the
    worker registry via a pinger, start NWorkers child Worker processes,
    then accept and authenticate "request" connections, handing each
    verified one to an AccumulatorDriver (which takes socket ownership).

    Fixes vs. original: log typo "startied" -> "started"; unused local
    `nrunning` removed; bare excepts narrowed to `except Exception`.
    """
    signal.signal(signal.SIGINT, self.sigint)
    self.Sock = socket(AF_INET, SOCK_STREAM)
    self.Sock.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
    self.Sock.bind(("", 0))         # ephemeral port, advertised via the pinger
    self.Sock.listen(10)
    port = self.Sock.getsockname()[1]
    pinger = WorkerRegistryPinger(self.RegistryAddress, port, self.Tag)
    pinger.start()
    #
    # Start workers
    #
    self.Workers = [Worker(i, self.NWorkers, self.StripedServerURL,
                           self.WorkerLogFileTemplate, self.CacheLimit,
                           self.ModuleStorage)
                    for i in range(self.NWorkers)]
    for w in self.Workers:
        w.start()
        self.log("started worker %d with pid %d" % (w.ID, w.pid))
    while not self.Stop:
        sock, addr = self.Sock.accept()
        dxsock = DataExchangeSocket(sock)
        close_sock = True
        self.log("Client connected: %s" % (addr,))
        # read job description JSON
        try:
            msg = dxsock.recv()
        except Exception:
            self.log("Can not read initial message. Closing the connection. Error:\n%s" % (
                traceback.format_exc(),))
            msg = None
        if msg and msg.Type == 'request':
            try:
                request = WorkerRequest.fromDXMsg(msg)
                self.log("Request received:\n jid/wid: %s/%s\n dataset: %s\n data_url: %s\n frames: %s\n" % (
                    request.JID, request.WID, request.DatasetName,
                    request.DataServerURL, request.RGIDs))
                signature, t, salt, alg = msg["worker_authenticator"].split(":")
                key = pinger.Key
                verified, reason = request.verifySignature(key, signature, t, salt, alg)
                if not verified:
                    self.log("Signature verification failed: %s" % (reason,))
                    dxsock.send(DXMessage("exception").append(
                        info="Authentication failed: %s" % (reason,)))
                else:
                    self.Accumulators << AccumulatorDriver(
                        dxsock, request, self.Workers, self.ModuleStorage,
                        self.BulkDataTransport, self.LogFile)
                    close_sock = False      # the AccumulatorDriver owns the socket now
            except Exception:
                self.log("Error processing the request. Closing the connection\n%s" % (
                    traceback.format_exc(),))
        if close_sock:
            dxsock.close()
def workerExited(self, wid, status, t, nevents, nrunning):
    """Record first/last worker-exit timestamps and forward a
    "worker_exit" notification to the client."""
    if self.FirstWorkerExitT is None:
        self.FirstWorkerExitT = time.time()
    self.LastWorkerExitT = time.time()
    with self.T["callback/worker_exit"]:
        notification = DXMessage("worker_exit",
                                 nrunning=nrunning, wid=wid, status=status,
                                 t=t, nevents=nevents)
        self.DataExchange.send(notification)
def updateReceived(self, wid, hists, streams, nevents_delta):
    """Forward a worker's update (histograms and/or streams, or an "empty"
    heartbeat) to the client; abort the Contract if the client link broke.

    Fixes vs. original: the identical try/send/log pattern, repeated three
    times, is factored into one local helper; bare excepts narrowed.
    """
    self.TotalEvents += nevents_delta

    def _send(msg):
        # Returns False (and logs) when the client connection is broken.
        try:
            self.DataExchange.send(msg)
            return True
        except Exception:
            self.log("Error sending message to the client:\n%s" % (traceback.format_exc(),))
            return False

    delivered = True
    if hists:
        msg = DXMessage("histograms", total_events=self.TotalEvents, wid=wid)
        for k, v in hists.items():
            msg[k] = v
        delivered = _send(msg) and delivered
    if streams:
        for k, data in streams.items():
            msg = DXMessage("stream", name=k, format="pickle",
                            total_events=self.TotalEvents, wid=wid)
            # still pickled data because the WorkerInterface does not unpickle
            msg.append(data=data)
            delivered = _send(msg) and delivered
    if not streams and not hists:
        # heartbeat so the client still sees the running event total
        delivered = _send(DXMessage("empty", total_events=self.TotalEvents, wid=wid)) and delivered
    if not delivered:
        self.log("Client disconnected (because of the communication error). Aborting")
        self.Contract.abort()
# Dispatch a message arriving from a worker.  "data": load the bulk-storage
# payload; pass it straight to the client when there is no Accumulator,
# otherwise feed it to the Accumulator and forward only pass-through results
# (accumulated events are counted in self.EventsSeen instead).  "hist":
# merge "h:"-prefixed entries into the histogram accumulators and send them
# on at most every HistSendInterval seconds.  Anything else is relayed as-is.
# NOTE(review): source collapsed onto one line -- the position of
# storage.unlink() (inside the else-branch vs. after the if/else) is
# ambiguous; confirm against version history before reformatting.
def messageFromWorker(self, worker_interface, msg): # Can be message, hist, stream, flush, exception if msg.Type == "data": storage = BulkStorage.open(msg["storage"]) #print "Accumulator.messageFromWorker(data): keys:", storage.keys() events_delta = msg["events_delta"] #self.log("data message: events_delta=%s" % (events_delta,)) data = storage.asDict() if self.Accumulator is None: msg = DXMessage("data", events_delta = self.eventsDelta(events_delta), format="encode")(data=encodeData(data)) self.DXSock.send(msg) else: through = None try: with self.T["accumulate"]: through = self.Accumulator.add(data) except: self.DXSock.send(DXMessage("exception").append(info=traceback.format_exc())) if through is not None: with self.T["send through data"]: msg = DXMessage("data", events_delta = self.eventsDelta(events_delta), format="encode")(data=encodeData(through)) self.DXSock.send(msg) else: self.EventsSeen += events_delta storage.unlink() elif msg.Type == "hist": for k, v in msg.items(): if k.startswith("h:"): hid = k[2:] self.HAccumulators[hid].add(v) #print("AccumulatorDriver: h(%s).Counts->%s" % (hid, self.HAccumulators[hid].H.Counts)) now = time.time() if now > self.HistSentTime + self.HistSendInterval: self.sendHistograms() self.HistSentTime = now else: self.DXSock.send(msg)
def messageReceived(self, wid, nevents, message):
    """Relay a worker's free-form message to the client, tagged with the
    worker id and its event count."""
    with self.T["callback/message"]:
        relayed = DXMessage("message", wid=wid, nevents=nevents).append(message=message)
        self.DataExchange.send(relayed)
def message(self, message):
    """Send a free-form "message" (with nevents=0) over the client socket."""
    out = DXMessage("message", nevents=0).append(message=message)
    self.DXSock.send(out)
def run(self):
    """AccumulatorDriver main loop: stage the worker module and optional
    bulk data, instantiate the user's Accumulator (if the module defines
    one), fan the frames out to WorkerInterfaces, wait for them, and send
    the accumulated result and histograms to the client.

    Fixes vs. original: `storage`/`module_file` are initialized before the
    `try` so the `finally` cleanup cannot raise NameError when an early
    statement fails; the module file is written via `with` (handle was
    leaked); bare except narrowed to `except Exception`.
    """
    storage = None
    bulk_data = None
    module_file = None
    try:
        worker_module_name = "m_%s_%s" % (os.getpid(), self.Request.JID)
        module_file = "%s/%s.py" % (self.ModuleStorage, worker_module_name)
        with open(module_file, "w") as f:
            f.write(self.Request.WorkerText)
        frames = self.Request.RGIDs
        frames_by_worker = distribute_items(frames, len(self.Workers))
        params = WorkerParams.fromRequest(self.Request, worker_module_name)
        #
        # Store bulk data in shared memory
        #
        if self.Request.BulkDataName:
            with self.T["wait_for_bulk_data"]:
                t0 = time.time()
                bulk_data = self.BulkDataTransport.pop(self.Request.BulkDataName, timeout=30)
                t1 = time.time()
                self.log("bulk data %s received, %d bytes encoded, %.2f wait time" % (
                    self.Request.BulkDataName, len(bulk_data), t1 - t0))
            bulk_data = decodeData(bulk_data)
            with self.T["store_bulk_data"]:
                assert isinstance(bulk_data, dict)
                # over-allocate (+10% and slack) for safety
                n = sum([len(v.data) + 1000 for v in bulk_data.values()])
                n = int(n * 1.1) + 1000000
                storage = BulkStorage.create(params.BulkDataName, bulk_data)
                storage.save()
                self.log("bulk data stored. %f MB allocated" % (float(n) / 1024 / 1024,))
        #
        # Create Accumulator if specified
        #
        worker_module = __import__(worker_module_name, {}, {}, ["Accumulator"])
        if hasattr(worker_module, "Accumulator"):
            job_interface = self.JobInterface(self)
            db_interface = self.DBInterface(self)
            self.Accumulator = worker_module.Accumulator(
                params.UserParams, bulk_data, job_interface, db_interface)
        worker_interfaces = []
        for iw, (w, frames) in enumerate(zip(self.Workers, frames_by_worker)):
            if frames:
                wid = "%s/%d" % (self.Request.WID, iw)
                wi = WorkerInterface(self, w.Address, params, wid, frames)
                wi.start()
                worker_interfaces.append(wi)
        for wi in worker_interfaces:
            wi.join()
        self.log("all worker interfaces closed")
        if self.Accumulator is not None:
            data = self.Accumulator.values()
            if data is not None:
                with self.T["send accumulated data"]:
                    events_delta = self.eventsDelta()
                    self.log("sending accumulated data with events_delta=%d" % (events_delta,))
                    self.DXSock.send(DXMessage("data", events_delta=events_delta,
                                               format="encode")(data=encodeData(data)))
        self.sendHistograms()
    except Exception:
        self.DXSock.send(DXMessage("exception").append(info=traceback.format_exc()))
    finally:
        self.DXSock.close()
        self.log("socket closed")
        if storage:
            storage.unlink()
            self.log("bulk storage unlinked")
        if module_file is not None:
            os.unlink(module_file)
            if module_file.endswith(".py"):
                try:
                    os.unlink(module_file + "c")   # stale .pyc, if any
                except OSError:
                    pass
        self.log("---- Accumulator stats ----\n" + self.T.formatStats())
def bumpEvents(self, events_delta):
    """Report a positive event-count increment to the client; zero or
    negative deltas are silently ignored."""
    if events_delta > 0:
        self.DXSock.send(DXMessage("events", delta=events_delta))
def run(self):
    """Accept worker "request" connections, verify each request's
    signature against the pinger key, and run the worker for verified
    requests; every failure path reports an "exception" message back and
    the socket is always closed at the end of the iteration."""
    signal.signal(signal.SIGINT, self.sigint)
    self.Pinger.start()
    while not self.Stop:
        self.log("accepting...")
        sock, addr = self.Sock.accept()
        dxsock = DataExchangeSocket(sock)
        self.log("Client connected: %s" % (addr,))
        # read job description JSON
        try:
            msg = dxsock.recv()
        except:
            self.log("Can not read initial message. Closing the connection. Error:\n%s" % (
                traceback.format_exc(),))
            msg = None
        jid = None
        wid = None
        if msg and msg.Type == 'request':
            try:
                params = WorkerRequest.fromDXMsg(msg)
                signature, t, salt, alg = msg["worker_authenticator"].split(":")
                key = self.Pinger.Key
                verified, reason = params.verifySignature(key, signature, t, salt, alg)
                if not verified:
                    self.log("Signature verification failed: %s" % (reason,))
                    dxsock.send(DXMessage("exception").append(
                        info="Authentication failed: %s" % (reason,)))
                else:
                    jid, wid = params.JID, params.WID
                    try:
                        self.runWorker(params, dxsock, addr)
                    except StripedNotFoundException as e:
                        dxsock.send(DXMessage("exception").append(info=str(e)))
                    except:
                        formatted = traceback.format_exc()
                        self.log("jid/wid=%s/%s: SocketWorkerServer.runWorker() exception:\n%s" % (
                            jid, wid, formatted,))
                        dxsock.send(DXMessage("exception").append(info=formatted))
            except:
                self.log("jid/wid=%s/%s: Error processing the request. Closing the connection\n%s" % (
                    jid, wid, traceback.format_exc(),))
        dxsock.close()
        self.log("jid/wid=%s/%s: socket closed" % (jid, wid))
def dataLoadFailure(self, rgid, info):
    """Notify the client that row-group `rgid` failed to load."""
    failure = DXMessage("data_load_failure", rgid=rgid, info=info)
    self.DXSock.send(failure)
def updateReceived(self, wid, data, nevents_delta):
    """Pack a worker's update into an "update" message, one "data:<key>"
    field per (key, value) pair, and write it straight to the socket."""
    update = DXMessage("update", events_delta=nevents_delta, wid=wid)
    for key, value in data:
        update["data:" + key] = value
    update.toSocket(self.Sock)
def sendData(self, events_delta, data):
    """Place `data` into a uniquely-named BulkStorage segment and tell the
    peer (via a "data" message) where to find it."""
    # unique per driver instance and per message sequence number
    storage_name = "%s_%s" % (self.ID, self.dataSequence())
    storage = BulkStorage.create(storage_name, data)
    # NOTE(review): unlike the other BulkStorage.create() call site, no
    # .save() here -- presumably create() persists; confirm.
    notice = DXMessage("data", events_delta=events_delta, storage=storage_name)
    self.DXSock.send(notice)
def exceptionReceived(self, wid, info):
    """Forward a worker's exception report to the client."""
    with self.T["callback/exception"]:
        report = DXMessage("exception", wid=wid).append(info=info)
        self.DataExchange.send(report)
def dataReceived(self, wid, events_delta, data):
    """Forward a worker's data payload to the client."""
    with self.T["callback/data"]:
        payload = DXMessage("data", wid=wid, events_delta=events_delta).append(data=data)
        self.DataExchange.send(payload)
def eventsDelta(self, wid, events_delta):
    """Forward a worker's event-count increment to the client."""
    with self.T["callback/eventsDelta"]:
        self.DataExchange.send(
            DXMessage("events", wid=wid, events_delta=events_delta))
def dataLoadFailureReceived(self, wid, rgid):
    """Forward a worker's data-load failure for row-group `rgid` to the client."""
    with self.T["callback/data_load_failure"]:
        self.DataExchange.send(
            DXMessage("data_load_failure", wid=wid, rgid=rgid))
def sendData___(self, events_delta, data):
    """Retired variant of sendData (note the trailing underscores): sends
    the payload encoded inline in the "data" message instead of via a
    BulkStorage segment."""
    inline = DXMessage("data", events_delta=events_delta,
                       format="encode")(data=encodeData(data))
    self.DXSock.send(inline)