예제 #1
0
    def run(self):
        """Run the job: build the contract, start it, wait for it, report.

        Sends "job_started" to the client once the contract has been created
        and "job_done" after the contract wait returns.  If anything in
        between raises, the traceback is logged and a "job_failed" message
        carrying it is sent instead.  Timing statistics collected in self.T
        are logged on the way out in every case, and the log file (if any)
        is closed.
        """
        self.log("job process %s started" % (self.JID, ))

        if self.LogFilePath is not None:
            self.LogFile = open(self.LogFilePath, "w")

        try:
            with self.T["JobProcess/run"]:
                setproctitle("striped_job %s" % (self.JID, ))
                self.log("started: dataset: %s, fraction: %s, %d workers" %
                         (self.JobDesc.DatasetName, self.JobDesc.Fraction,
                          len(self.Workers)))

                # This object is passed to the contract as its callback delegate.
                callback_delegate = self
                with self.T["JobProcess/run/create_contract"]:
                    self.Contract = Contract(
                        self.JID, self.DataServerURL, self.BulkTransportPort,
                        self.DataClient.dataset(self.JobDesc.DatasetName),
                        self.JobDesc, self.Workers, callback_delegate,
                        self.log, self.T)

                self.DataExchange.send(
                    DXMessage("job_started",
                              nworkers=len(self.Workers),
                              jid=self.JID,
                              total_events=self.Contract.TotalEvents,
                              selected_events=self.Contract.SelectedEvents,
                              selected_frames=json.dumps(
                                  self.Contract.SelectedFrames)))

                self.log("job_started sent")

                with self.T["JobProcess/run/start_contract"]:
                    self.Contract.start()

                # All three timestamps start out equal to the contract start time.
                started_at = time.time()
                self.ContractStartedT = started_at
                self.FirstWorkerExitT = started_at
                self.LastWorkerExitT = started_at

                self.log("contract started. waiting...")

                with self.T["JobProcess/run/wait_contract"]:
                    self.Contract.wait()

                self.DataExchange.send(
                    DXMessage("job_done", total_events=self.TotalEvents))

                self.log(
                    "Job finished. Worker exit timestamps: first: %.5f, last:%.5f"
                    % (self.FirstWorkerExitT - self.ContractStartedT,
                       self.LastWorkerExitT - self.ContractStartedT))
                self.DataExchange.close()
                self.log("---- exit ----")
        except:
            tb = traceback.format_exc()
            # Log before notifying the client: if the failure was caused by a
            # broken client connection, the send below raises as well, and
            # the original traceback would otherwise never reach the log.
            self.log("Exception: ------------\n%s" % (tb, ))
            self.DataExchange.send(DXMessage("job_failed").append(reason=tb))
        finally:
            self.log("----- job stats: -----\n" + self.T.formatStats())
            if self.LogFile is not None:
                self.LogFile.close()
예제 #2
0
 def sendHistograms(self):
     """Send every histogram accumulator that has fills in one "hist" message.

     The dump() of each accumulator with a non-zero NFills is appended under
     the key "h:<hid>".  Nothing is sent when no accumulator qualifies.
     """
     hist_msg = DXMessage("hist")
     have_payload = False
     for hist_id, accumulator in self.HAccumulators.items():
         if not accumulator.NFills:
             continue
         hist_msg.append("h:" + hist_id, accumulator.dump())
         have_payload = True
     if have_payload:
         self.DXSock.send(hist_msg)
예제 #3
0
 def run(self):
     """Validate the job request, pick workers and run the job process.

     On a validation failure or when no workers match the requested tags, a
     "job_failed" message is sent to the client and the server is notified
     through jobFailed().  Otherwise a JobProcess is started and joined.
     In every case the end time is recorded, the data exchange is closed,
     and jobEnded() is reported unless the task was marked as failed.
     """
     self.log("started: %s" % (self.JID, ))
     job_description = self.JobDescription
     try:
         self.log("validating...")
         validated = self.Server.validate_job(job_description)
         if not validated:
             reason = "Token validation failed"
             self.log("job request validation failed %s" %
                      (repr(job_description.AuthToken), ))
             self.DataExchange.send(
                 DXMessage("job_failed").append(reason=reason))
             self.Failed = True
             self.Server.jobFailed(self, reason)
         else:
             self.log("validated: token=%s identity=[%s]" %
                      (job_description.AuthToken, job_description.Identity))
             workers = self.Server.workers(tags=job_description.WorkerTags)
             self.log("workers: %s" % ([wi.Addr for wi in workers], ))
             if not workers:
                 # One reason string for both the client and the server
                 # (the server copy previously read "forkers").
                 reason = ("No available workers found for tags=%s" %
                           (job_description.WorkerTags, ))
                 self.log("no workers found for tags=%s" %
                          (job_description.WorkerTags, ))
                 self.DataExchange.send(
                     DXMessage("job_failed").append(reason=reason))
                 self.Failed = True
                 self.Server.jobFailed(self, reason)
             else:
                 process = JobProcess(self.JID, self.DataServerURL,
                                      self.BulkTransportPort,
                                      self.DataExchange, self.DataClient,
                                      workers, job_description,
                                      self.LogFilePath)
                 process.start()
                 self.Started = time.time()
                 self.Server.jobStarted(self)
                 process.join()
                 self.log("job process exited with status %s" %
                          (process.exitcode, ))
     except:
         exception = traceback.format_exc()
         self.log("failed: %s" % (exception, ))
         self.Server.jobFailed(self, exception)
         self.Failed = True
     finally:
         self.Ended = time.time()
         self.DataExchange.close()
         if not self.Failed:
             self.log("ended")
             self.Server.jobEnded(self)
         # Drop the reference to the server once the task is finished.
         self.Server = None
예제 #4
0
 def endOfFrame(self, nevents):
     """Report the frame's event count and flush histograms if any were filled.

     Always sends an "events" message with events_delta=nevents.  When
     self.NFills is non-zero, additionally records the flush time, sends a
     "hist" message with the dump of every collector, and resets NFills.
     """
     self.DXSock.send(DXMessage("events", events_delta=nevents))
     now = time.time()
     if not self.NFills:
         return
     self.LastFlush = now
     hist_msg = DXMessage("hist")
     for hist_id, collector in self.HCollectors.items():
         hist_msg.append("h:" + hist_id, collector.dump())
     self.DXSock.send(hist_msg)
     self.NFills = 0
예제 #5
0
    def run(self):
        """Listen for job requests and queue a JobTask for each valid one.

        Binds a listening socket on self.Port and loops until self.Stop is
        set.  For each accepted connection the first message is read; a
        "job_request" for an existing dataset gets a job id and is queued as
        a JobTask, which takes over the connection.  A request for an unknown
        dataset is answered with "job_failed".  Any connection still owned
        by this loop at the end of an iteration is closed.
        """
        self.Sock = socket(AF_INET, SOCK_STREAM)
        self.Sock.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
        self.Sock.bind(('', self.Port))
        self.Sock.listen(5)
        data_exchange_listener = DataExchangeSocket(self.Sock)

        while not self.Stop:
            data_exchange = None
            try:
                data_exchange = data_exchange_listener.accept()
                msg = data_exchange.recv()
                if msg and msg.Type == 'job_request':
                    job_description = JobDescription.fromDXMsg(msg)
                    exists = self.DataClient.dataset(
                        job_description.DatasetName).exists
                    if not exists:
                        self.log("Dataset not found: %s" %
                                 (job_description.DatasetName, ))
                        data_exchange.send(
                            DXMessage("job_failed").append(
                                reason="Dataset '%s' not found" %
                                (job_description.DatasetName, )))
                    else:
                        jid = self.jid()
                        self.log(
                            "Job description received. Job id %s assigned" %
                            (jid, ))
                        job_log_file_path = None if self.LogFileDir is None else "%s/job_%s.log" % (
                            self.LogFileDir, jid)
                        jt = JobTask(self, jid, job_description,
                                     self.DataServerURL,
                                     self.BulkTransportPort, self.DataClient,
                                     data_exchange, job_log_file_path)
                        self.JobQueue << jt
                        data_exchange = None  # the job task owns it now !
                        if self.SourceArchive is not None:
                            # Context manager so the archive file is flushed
                            # and closed even if the write fails.
                            archive_path = "%s/ws_%s.txt" % (
                                self.SourceArchive, jid)
                            with open(archive_path, "w") as archive:
                                archive.write(job_description.WorkerText)
                self.purgeJobHistory()
            except:
                # Catch everything so one bad request does not end the loop.
                dump = traceback.format_exc()
                self.log("Uncaught exception: %s" % (dump, ))
                if data_exchange is not None:
                    data_exchange.send(
                        DXMessage("job_failed").append(reason="Exception: %s" %
                                                       (dump, )))
            finally:
                # Only close connections that were not handed to a JobTask.
                if data_exchange is not None:
                    data_exchange.close()
                    data_exchange = None
예제 #6
0
 def toDXMsg(self):
     """Build the "worker_task" DXMessage describing these parameters.

     The histogram descriptors are JSON-encoded; user_params is passed
     through untouched (it is already encoded by the job server).
     """
     header = dict(
         jid=self.JID,
         worker_module_name=self.WorkerModuleName,
         bulk_data_name=self.BulkDataName,
         dataset_name=self.DatasetName,
         use_data_cache="yes" if self.UseDataCache else "no",
         data_server_url=self.DataServerURL,
         data_mod_url=self.DataModURL,
         data_mod_token=self.DataModToken,
     )
     task_msg = DXMessage("worker_task", **header)
     encoded_histograms = json.dumps(self.HDescriptors)
     # user_params stays encoded here; do not decode it yet
     task_msg.append(histograms=encoded_histograms,
                     user_params=self.UserParams)
     return task_msg
예제 #7
0
 def flushAll(self, nevents):
     """Flush every stream buffer and, if there were fills, send a "flush".

     Each buffer whose flush() returns a non-empty result is sent as a
     "stream" message with the encoded values.  A "flush" message carrying
     nevents follows when self.NFills is positive; NFills is then reset.
     """
     for stream_name, stream_buffer in self.SBuffers.items():
         flushed = stream_buffer.flush()
         if flushed is None or len(flushed) == 0:
             continue
         # flushed is a list of (nevents, data)
         stream_msg = DXMessage("stream", name=stream_name, format="encode")
         self.DXSock.send(stream_msg(data=encodeData(flushed)))

     had_fills = self.NFills > 0
     if had_fills:
         self.DXSock.send(DXMessage("flush", nevents=nevents))
     self.NFills = 0
예제 #8
0
 def run(self):
     """Accept connections and run one worker task per connection, serially.

     For each connection the first message is read; a "worker_task" message
     is parsed into worker parameters, frames and a worker id, and handed to
     runWorker().  Any failure while parsing or running is logged and
     reported to the peer as an "exception" message.  The connection is
     always closed before the next accept.
     """
     self.log("Started")
     signal.signal(signal.SIGINT, self.sigint)
     self.log("Listening...")
     self.Sock.listen(10)
     while not self.Stop:
         self.log("accepting new connection...")
         sock, addr = self.Sock.accept()             # synchronously run only 1 job at a time, for now
         dxsock = DataExchangeSocket(sock)
         try:
             # A peer that disconnects or sends garbage must not take the
             # whole server loop down.
             try:
                 msg = dxsock.recv()
             except Exception:
                 self.log("Can not read initial message. Closing the connection. Error:\n%s" % (traceback.format_exc(),))
                 msg = None
             if msg and msg.Type == "worker_task":
                 # Filled in as parsing progresses so the error log can
                 # report whatever was known at the point of failure.
                 jid = wid = None
                 try:
                     worker_params = WorkerParams.fromDXMsg(msg)
                     jid = worker_params.JID
                     frames = json.loads(msg["frames"])
                     wid = msg["wid"]
                     self.runWorker(worker_params, dxsock, frames, wid)
                 except:
                     formatted = traceback.format_exc()
                     self.log("jid/wid=%s/%s: SocketWorkerServer.runWorker() exception:\n%s" % (jid, wid, formatted,))
                     try:
                         dxsock.send(DXMessage("exception").append(info=formatted))
                     except:
                         self.log("Error sending 'exception' message:\n%s" % (traceback.format_exc(),))
         finally:
             self.log("closing socket")
             dxsock.close()
예제 #9
0
    def run(self):
        """Start the worker processes and dispatch incoming requests.

        Binds a listening socket on a system-assigned port, starts a
        WorkerRegistryPinger advertising that port, and starts NWorkers
        Worker objects.  Then loops until self.Stop: each accepted
        connection must open with a "request" message whose
        "worker_authenticator" verifies against the pinger's key.  A
        verified request is handed, together with its socket, to a new
        AccumulatorDriver; in every other case the socket is closed here.
        """
        signal.signal(signal.SIGINT, self.sigint)

        self.Sock = socket(AF_INET, SOCK_STREAM)
        self.Sock.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
        self.Sock.bind(("", 0))          # port 0: let the OS pick a free port
        self.Sock.listen(10)

        port = self.Sock.getsockname()[1]
        pinger = WorkerRegistryPinger(self.RegistryAddress, port, self.Tag)
        pinger.start()

        #
        # Start workers
        #
        self.Workers = [Worker(i, self.NWorkers, self.StripedServerURL, self.WorkerLogFileTemplate, self.CacheLimit, self.ModuleStorage)
                for i in range(self.NWorkers)]
        for w in self.Workers:
            w.start()
            self.log("started worker %d with pid %d" % (w.ID, w.pid))

        while not self.Stop:
            sock, addr = self.Sock.accept()
            dxsock = DataExchangeSocket(sock)
            # Stays True unless the socket is handed over to an AccumulatorDriver.
            close_sock = True
            self.log("Client connected: %s" % (addr,))

            # read the initial request message
            try:
                msg = dxsock.recv()
            except:
                self.log("Can not read initial message. Closing the connection. Error:\n%s" %
                            (traceback.format_exc(),))
                msg = None
            if msg and msg.Type == 'request':
                try:
                    request = WorkerRequest.fromDXMsg(msg)
                    self.log("Request received:\n  jid/wid: %s/%s\n  dataset: %s\n  data_url: %s\n  frames: %s\n" % (
                                request.JID, request.WID, request.DatasetName, request.DataServerURL,
                                request.RGIDs)
                    )
                    signature, t, salt, alg = msg["worker_authenticator"].split(":")
                    key = pinger.Key
                    verified, reason = request.verifySignature(key, signature, t, salt, alg)
                    if not verified:
                        self.log("Signature verification failed: %s" % (reason,))
                        dxsock.send(DXMessage("exception").append(info="Authentication failed: %s" % (reason,)))
                    else:
                        self.Accumulators << AccumulatorDriver(dxsock, request, self.Workers, self.ModuleStorage, self.BulkDataTransport, self.LogFile)
                        close_sock = False
                except:
                    self.log("Error processing the request. Closing the connection\n%s" % (traceback.format_exc(),))

            if close_sock:
                dxsock.close()
예제 #10
0
 def workerExited(self, wid, status, t, nevents, nrunning):
     """Record worker exit times and forward a "worker_exit" message.

     FirstWorkerExitT is set only if it is still None; LastWorkerExitT is
     updated on every call.
     """
     if self.FirstWorkerExitT is None:
         self.FirstWorkerExitT = time.time()
     self.LastWorkerExitT = time.time()
     with self.T["callback/worker_exit"]:
         exit_fields = dict(nrunning=nrunning,
                            wid=wid,
                            status=status,
                            t=t,
                            nevents=nevents)
         exit_msg = DXMessage("worker_exit", **exit_fields)
         self.DataExchange.send(exit_msg)
예제 #11
0
    def updateReceived(self, wid, hists, streams, nevents_delta):
        """Forward a worker update (histograms and/or streams) to the client.

        Adds nevents_delta to self.TotalEvents, then sends one "histograms"
        message if hists is non-empty, one "stream" message per entry of
        streams, or a single "empty" message when both are empty.  If any
        send fails, the error is logged and the contract is aborted after
        all sends have been attempted.
        """

        def deliver(outgoing):
            # Returns True on success, False (after logging) on any failure.
            try:
                self.DataExchange.send(outgoing)
            except:
                self.log("Error sending message to the client:\n%s" %
                         (traceback.format_exc(), ))
                return False
            return True

        self.TotalEvents += nevents_delta
        send_failed = False

        if hists:
            hist_msg = DXMessage("histograms",
                                 total_events=self.TotalEvents,
                                 wid=wid)
            for key, value in hists.items():
                hist_msg[key] = value
            if not deliver(hist_msg):
                send_failed = True

        if streams:
            for stream_name, payload in streams.items():
                stream_msg = DXMessage("stream",
                                       name=stream_name,
                                       format="pickle",
                                       total_events=self.TotalEvents,
                                       wid=wid)
                # this is still pickled data because the WorkerInterface
                # does not unpickle
                stream_msg.append(data=payload)
                if not deliver(stream_msg):
                    send_failed = True

        if not streams and not hists:
            empty_msg = DXMessage("empty",
                                  total_events=self.TotalEvents,
                                  wid=wid)
            if not deliver(empty_msg):
                send_failed = True

        if send_failed:
            self.log(
                "Client disconnected (because of the communication error). Aborting"
            )
            self.Contract.abort()
예제 #12
0
 def messageFromWorker(self, worker_interface, msg):
     """Handle one message coming from a worker.

     "data": the named bulk storage is opened and read.  Without an
         Accumulator the data is forwarded as an encoded "data" message.
         With one, the data is fed to Accumulator.add(); a non-None result
         is forwarded, otherwise the events are counted in EventsSeen.
         A failure in add() is reported as an "exception" message.  The
         storage is always unlinked, including when forwarding fails.
     "hist": every "h:<hid>" item is added to the matching histogram
         accumulator, and the histograms are sent out if more than
         HistSendInterval has passed since HistSentTime.
     Anything else is forwarded unchanged.
     """
     # Can be message, hist, stream, flush, exception
     if msg.Type == "data":
         storage = BulkStorage.open(msg["storage"])
         try:
             events_delta = msg["events_delta"]
             data = storage.asDict()
             if self.Accumulator is None:
                 out = DXMessage("data", events_delta = self.eventsDelta(events_delta), format="encode")(data=encodeData(data))
                 self.DXSock.send(out)
             else:
                 through = None
                 try:
                     with self.T["accumulate"]:
                         through = self.Accumulator.add(data)
                 except:
                     self.DXSock.send(DXMessage("exception").append(info=traceback.format_exc()))
                 if through is not None:
                     with self.T["send through data"]:
                         out = DXMessage("data", events_delta = self.eventsDelta(events_delta), format="encode")(data=encodeData(through))
                         self.DXSock.send(out)
                 else:
                     self.EventsSeen += events_delta
         finally:
             # Release the storage even when reading or sending raised,
             # so a failed message does not leave it behind.
             storage.unlink()
     elif msg.Type == "hist":
         for k, v in msg.items():
             if k.startswith("h:"):
                 hid = k[2:]
                 self.HAccumulators[hid].add(v)
         now = time.time()
         if now > self.HistSentTime + self.HistSendInterval:
             self.sendHistograms()
             self.HistSentTime = now
     else:
         self.DXSock.send(msg)
예제 #13
0
 def messageReceived(self, wid, nevents, message):
     """Forward a worker's text message to the client as a "message" DXMessage."""
     with self.T["callback/message"]:
         envelope = DXMessage("message", wid=wid, nevents=nevents)
         outgoing = envelope.append(message=message)
         self.DataExchange.send(outgoing)
예제 #14
0
 def message(self, message):
     """Send a text message over DXSock as a "message" DXMessage with nevents=0."""
     envelope = DXMessage("message", nevents=0)
     outgoing = envelope.append(message=message)
     self.DXSock.send(outgoing)
예제 #15
0
    def run(self):
        """Run one request: distribute frames to workers and collect results.

        Writes the request's worker source to a module file, optionally
        receives and stores bulk data, creates the user Accumulator if the
        module defines one, starts a WorkerInterface for each worker that
        got frames, and waits for all of them.  Afterwards the accumulated
        data (if any) and the histograms are sent.  Any exception is
        reported to the peer as an "exception" message.  The socket, the
        bulk storage and the module file are always cleaned up.
        """
        # Initialised before the try so the cleanup in ``finally`` can tell
        # what was actually created.
        storage = None
        bulk_data = None
        module_file = None
        try:
            worker_module_name = "m_%s_%s" % (os.getpid(), self.Request.JID)
            module_file = "%s/%s.py" % (self.ModuleStorage, worker_module_name)
            # Context manager: the source must be flushed and closed before
            # the module is imported below.
            with open(module_file, "w") as module_source:
                module_source.write(self.Request.WorkerText)

            frames = self.Request.RGIDs
            frames_by_worker = distribute_items(frames, len(self.Workers))
            params = WorkerParams.fromRequest(self.Request, worker_module_name)

            #
            # Store bulk data in shared memory
            #
            if self.Request.BulkDataName:
                with self.T["wait_for_bulk_data"]:
                    t0 = time.time()
                    bulk_data = self.BulkDataTransport.pop(self.Request.BulkDataName, timeout=30)
                    t1 = time.time()
                    self.log("bulk data %s received, %d bytes encoded, %.2f wait time" % (self.Request.BulkDataName, len(bulk_data), t1-t0))
                    bulk_data = decodeData(bulk_data)
                with self.T["store_bulk_data"]:
                    assert isinstance(bulk_data, dict)
                    # n is only a size estimate used in the log line below.
                    n = sum(len(v.data) + 1000 for v in bulk_data.values())
                    n = int(n*1.1)+1000000      # for safety
                    storage = BulkStorage.create(params.BulkDataName, bulk_data)
                    storage.save()
                self.log("bulk data stored. %f MB allocated" % (float(n)/1024/1024,))

            #
            # Create Accumulator if specified
            #
            worker_module = __import__(worker_module_name, {}, {}, ["Accumulator"])
            if hasattr(worker_module, "Accumulator"):
                job_interface = self.JobInterface(self)
                db_interface = self.DBInterface(self)
                self.Accumulator = worker_module.Accumulator(
                    params.UserParams, bulk_data,
                    job_interface, db_interface
                )

            worker_interfaces = []
            for iw, (w, worker_frames) in enumerate(zip(self.Workers, frames_by_worker)):
                if worker_frames:
                    wid = "%s/%d" % (self.Request.WID, iw)
                    wi = WorkerInterface(self, w.Address, params, wid, worker_frames)
                    wi.start()
                    worker_interfaces.append(wi)

            for wi in worker_interfaces:
                wi.join()
            self.log("all worker interfaces closed")

            if self.Accumulator is not None:
                data = self.Accumulator.values()
                if data is not None:
                    with self.T["send accumulated data"]:
                        events_delta = self.eventsDelta()
                        self.log("sending accumulated data with events_delta=%d" % (events_delta,))
                        self.DXSock.send(DXMessage("data", events_delta = events_delta,
                                format="encode")(data=encodeData(data)))

            self.sendHistograms()

        except:
            self.DXSock.send(DXMessage("exception").append(info=traceback.format_exc()))

        finally:
            self.DXSock.close()
            self.log("socket closed")

            if storage:
                storage.unlink()
                self.log("bulk storage unlinked")

            # Best-effort removal of the module file and its compiled
            # companion: a missing file must not raise out of ``finally``
            # and suppress the stats log below.
            if module_file is not None:
                for stale_path in (module_file, module_file + "c"):
                    try:
                        os.unlink(stale_path)
                    except OSError:
                        pass

            self.log("---- Accumulator stats ----\n" + self.T.formatStats())
예제 #16
0
 def bumpEvents(self, events_delta):
     """Send an "events" message carrying delta=events_delta when it is positive."""
     if not events_delta > 0:
         return
     progress_msg = DXMessage("events", delta=events_delta)
     self.DXSock.send(progress_msg)
예제 #17
0
    def run(self):
        """Accept connections and run one authenticated worker request each.

        Installs the SIGINT handler, starts the pinger and loops until
        self.Stop.  Each connection must open with a "request" message whose
        "worker_authenticator" field verifies against the pinger's key;
        verified requests are passed to runWorker().  Failures are logged
        and, where a handler does so below, reported to the peer as an
        "exception" message.  The socket is closed at the end of every
        iteration.
        """
        signal.signal(signal.SIGINT, self.sigint)
        self.Pinger.start()
        while not self.Stop:
            self.log("accepting...")
            sock, addr = self.Sock.accept()
            dxsock = DataExchangeSocket(sock)
            #print "Client connected: %s" % (addr,)
            self.log("Client connected: %s" % (addr, ))

            # read job description JSON
            #print "reading params..."

            try:
                msg = dxsock.recv()
            except:
                self.log(
                    "Can not read initial message. Closing the connection. Error:\n%s"
                    % (traceback.format_exc(), ))
                msg = None
            # jid/wid stay None until the request is verified; the log lines
            # below print whatever is known at that point.
            jid = None
            wid = None
            if msg and msg.Type == 'request':
                try:
                    params = WorkerRequest.fromDXMsg(msg)
                    # The authenticator is four ":"-separated fields.
                    signature, t, salt, alg = msg[
                        "worker_authenticator"].split(":")
                    #print "worker_authenticator:", (signature, t, salt, alg)
                    key = self.Pinger.Key
                    verified, reason = params.verifySignature(
                        key, signature, t, salt, alg)
                    #self.log("worker request verification: %s %s" % (verified, reason))
                    if not verified:
                        self.log("Signature verification failed: %s" %
                                 (reason, ))
                        dxsock.send(
                            DXMessage("exception").append(
                                info="Authentication failed: %s" % (reason, )))
                    else:
                        jid, wid = params.JID, params.WID

                        try:
                            self.runWorker(params, dxsock, addr)
                        except StripedNotFoundException as e:
                            # Reported to the peer with its message only;
                            # this case is not written to the log.
                            dxsock.send(
                                DXMessage("exception").append(info=str(e)))
                        except:
                            formatted = traceback.format_exc()
                            self.log(
                                "jid/wid=%s/%s: SocketWorkerServer.runWorker() exception:\n%s"
                                % (
                                    jid,
                                    wid,
                                    formatted,
                                ))
                            dxsock.send(
                                DXMessage("exception").append(info=formatted))
                except:
                    # Also reached when one of the sends above raises.
                    self.log(
                        "jid/wid=%s/%s: Error processing the request. Closing the connection\n%s"
                        % (
                            jid,
                            wid,
                            traceback.format_exc(),
                        ))
            dxsock.close()
            self.log("jid/wid=%s/%s: socket closed" % (jid, wid))
예제 #18
0
 def dataLoadFailure(self, rgid, info):
     """Send a "data_load_failure" message with the given rgid and info."""
     failure_msg = DXMessage("data_load_failure", rgid=rgid, info=info)
     self.DXSock.send(failure_msg)
예제 #19
0
 def updateReceived(self, wid, data, nevents_delta):
     """Write an "update" message with the given data items to self.Sock.

     Each (key, value) pair from data is stored under "data:<key>".
     """
     update = DXMessage("update", events_delta=nevents_delta, wid=wid)
     # NOTE(review): data is unpacked directly as (key, value) pairs, so it
     # presumably is a sequence of pairs rather than a dict - confirm
     # against the callers.
     for key, value in data:
         update["data:" + key] = value
     update.toSocket(self.Sock)
예제 #20
0
 def sendData(self, events_delta, data):
     """Put data into a new bulk storage and send a "data" message naming it.

     The storage name is "<ID>_<sequence>", with the sequence taken from
     self.dataSequence().
     """
     storage_name = "_".join(("%s" % (self.ID,), "%s" % (self.dataSequence(),)))
     # NOTE(review): the storage object is only held until this method
     # returns and save() is not called on it here - confirm that
     # BulkStorage.create() alone is enough for the receiver to open it.
     storage = BulkStorage.create(storage_name, data)
     announcement = DXMessage("data",
                              events_delta=events_delta,
                              storage=storage_name)
     self.DXSock.send(announcement)
예제 #21
0
 def exceptionReceived(self, wid, info):
     """Forward a worker exception to the client as an "exception" message."""
     with self.T["callback/exception"]:
         envelope = DXMessage("exception", wid=wid)
         outgoing = envelope.append(info=info)
         self.DataExchange.send(outgoing)
예제 #22
0
 def dataReceived(self, wid, events_delta, data):
     """Forward worker data to the client as a "data" message."""
     with self.T["callback/data"]:
         envelope = DXMessage("data", wid=wid, events_delta=events_delta)
         outgoing = envelope.append(data=data)
         self.DataExchange.send(outgoing)
예제 #23
0
 def eventsDelta(self, wid, events_delta):
     """Forward a worker's event-count increment to the client as "events"."""
     with self.T["callback/eventsDelta"]:
         progress_msg = DXMessage("events",
                                  wid=wid,
                                  events_delta=events_delta)
         self.DataExchange.send(progress_msg)
예제 #24
0
 def dataLoadFailureReceived(self, wid, rgid):
     """Forward a worker's data load failure to the client."""
     with self.T["callback/data_load_failure"]:
         failure_msg = DXMessage("data_load_failure", wid=wid, rgid=rgid)
         self.DataExchange.send(failure_msg)
예제 #25
0
 def sendData___(self, events_delta, data):
     """Send data inline as an encoded "data" message over DXSock."""
     header = DXMessage("data", events_delta=events_delta, format="encode")
     outgoing = header(data=encodeData(data))
     self.DXSock.send(outgoing)