Exemplo n.º 1
0
 def finalize(self, cookie):
     try:
         logger.debug("InprocessApplicationProxy.finalize()")
         cookie = streamer.loads(cookie)
         self.app.finalize(cookie)
     except Exception, x:
         handleApplicationFailure(x)
Exemplo n.º 2
0
 def do_work(self, task_data):
     try:
         logger.debug("InprocessApplicationProxy.do_work()")
         task_data = streamer.loads(task_data)
         task_result = self.app.do_work(task_data)
         return streamer.dumps(task_result)
     except Exception, x:
         handleApplicationFailure(x)
Exemplo n.º 3
0
    def run(self):
        """Main loop of the worker: register, initialize the application, then process tasks.

        Sequence, as implemented below:
          1. log the initial configuration and register to the master;
          2. fetch the boot/init messages with master.get_init_data();
          3. create the application proxy and register
             self.finalize_application as an at-exit handler on self.program;
          4. initialize the application and report the output with
             master.put_init_result();
          5. until self.should_stop() is true: sleep, fetch a task, run it
             and report the result with master.put_task_result().

        An ApplicationFailure raised by do_work() is serialized with
        streamer.dumps() and reported with error=1. An ApplicationFailure
        that escapes anywhere else inside the outer try block is caught and
        silently ignored, which ends this method.
        """

        import MSGWrap
        
        from diane.config import log_configuration
        log_configuration(title='initial configuration')        

        # payload passed along with every MSGWrap.sendStatus() call below
        msg_data = { '_worker_uuid' : self.uuid }

        try:
            self.registerToMaster()

            # NOTE(review): StandingCall presumably wraps calls to the master using the
            # heartbeat delay and the should_stop predicate -- confirm against its definition
            master = StandingCall(self.master, config.HEARTBEAT_DELAY, should_stop = self.should_stop)
            
            
            app_boot,app_init = master.get_init_data(self.uuid) #(config.HEARTBEAT_DELAY,-1,self.should_stop,self.master,'get_init_data',self.uuid)
            # the boot message is decoded here only to enrich msg_data;
            # the still-encoded app_boot is what gets passed to create_application_proxy()
            _boot = streamer.loads(app_boot)
            msg_data['_master_uuid'] = _boot.master_uuid
            msg_data['_runid'] = _boot.runid
            # NOTE(review): os is not referenced anywhere else in this method
            import os
            msg_data['ganga_job_uuid'] = self.ganga_job_uuid

            # FIXME: if worker restart enabled, save diane.config.__all_configs and restore it after run has finished
            MSGWrap.sendStatus('_worker_create_application_proxy_start', msg_data)
            self.application = create_application_proxy(app_boot,app_init,agent=self)
            MSGWrap.sendStatus('_worker_create_application_proxy_finish', msg_data)
            
            # registered after the proxy exists and before initialize() is called
            self.program.registerAtExitHandler(self.finalize_application)
            
            MSGWrap.sendStatus('_worker_initialize_start', msg_data)
            app_init_output = self.application.initialize(app_init)
            MSGWrap.sendStatus('_worker_initialize_finish', msg_data)

            # config may have been updated and the value of config.HEARTBEAT_DELAY may have changed -> need to create the object again
            # FIXME: use a REFERENCE to config.HEARTBEAT_DELAY
            master = StandingCall(self.master, config.HEARTBEAT_DELAY, should_stop = self.should_stop)

            # the last argument is the error flag: 0 is passed unconditionally here
            master.put_init_result(self.uuid,app_init_output,0) #(config.HEARTBEAT_DELAY,-1,self.should_stop,self.master,'put_init_result',self.uuid,app_init_output,0)

            while not self.should_stop():
                time.sleep(config.PULL_REQUEST_DELAY) # PENDING: this parameter should be dynamically controlled by the master
                tid,task_data = master.get_task_data(self.uuid) #(config.HEARTBEAT_DELAY,-1,self.should_stop,self.master,'get_task_data',self.uuid)
                try:
                    # msg_data is reused across iterations; 'tid' is overwritten for each task
                    msg_data['tid'] = tid
                    MSGWrap.sendStatus('_worker_do_work_start', msg_data)
                    task_result = self.application.do_work(task_data)
                    MSGWrap.sendStatus('_worker_do_work_finish', msg_data)
                    error = 0
                except diane.application.ApplicationFailure,x: # recoverable problem
                    # the failure object itself is sent back as the task result;
                    # no '_worker_do_work_finish' status is sent on this path
                    task_result = streamer.dumps(x)
                    error = 1
                    #FIXME: reporting failure is not yet well-defined
                
                # reported for both outcomes; error tells the master which one it was
                master.put_task_result(self.uuid,tid,task_result,error) #(config.HEARTBEAT_DELAY,-1,self.should_stop,self.master,'put_task_result',self.uuid,tid,task_result,error)

        except diane.application.ApplicationFailure,x: # recoverable problem but raised by the application init
            # deliberately ignored: the method simply returns
            pass
Exemplo n.º 4
0
 def initialize(self, app_init):
     try:
         logger.debug("InprocessApplicationProxy.initialize()")
         app_init = streamer.loads(app_init)
         app_init_output = self.app.initialize(app_init)
         app_init_output = streamer.dumps(app_init_output)
         return app_init_output
     except Exception, x:
         handleApplicationFailure(x)
Exemplo n.º 5
0
def create_application_proxy(boot_msg, app_init, agent, **kwds):
    boot = streamer.loads(boot_msg)

    import os

    if boot.darname:
        agent.ftc.download(boot.darname)
        dar = tarfile.open(boot.darname, "r:gz")
        try:
            dar.extractall("_python")
        except AttributeError:  # python < 2.5
            os.system("mkdir -p _python")
            os.system("cd _python; tar xfzv ../%s" % boot.darname)

    import sys

    app_python_path = os.path.abspath("_python")
    sys.path.insert(0, app_python_path)

    diane.config.restore_config(boot.config)
    logger.info("application boot and run data received")
    boot.log()
    diane.config.log_configuration(title="updated configuration")
    boot.agent = agent
    c = diane.config.getConfig("WorkerAgent")

    boot.application_shell_command = c.APPLICATION_SHELL
    boot.application_shell_pre_process = ""
    boot.application_shell_post_process = ""

    # perform a setup action of the application
    setup_application = importName(boot.name, "setup_application")
    if setup_application:
        try:
            r = setup_application(streamer.loads(app_init), agent)
            if not r is None:
                boot.application_shell_pre_process, boot.application_shell_post_process = r
        except Exception, x:
            handleApplicationFailure(x)
Exemplo n.º 6
0
    def put_init_result(self,worker_uuid,init_result,error):
        wid = self._resolve_wid(worker_uuid)
        # during the execution of this method the tasks may not be scheduled
        # to this worker because it is not in the cache waiting list        
        init_result = streamer.loads(init_result)
        logger.debug('put_init_result %d %s',wid,repr(init_result))
        self.update_contact(wid)

        w = self.worker_registry.get(wid)
        require_worker_initialized(w,False)
        w.init_output = init_result

        self.journal.addEntry('put_init_result',wid=wid)
        
        try:
            logger.debug('task_scheduler.worker_initialized(w) w.wid=%d w.worker_uuid=%s'%(w.wid,w.worker_uuid))            
            self.task_scheduler.worker_initialized(w)
        except Exception,x:
            logger.exception('Error in TaskScheduler.worker_initialized() callback')
Exemplo n.º 7
0
    def put_task_result(self,worker_uuid,tid,task_result,error):
        task_result = streamer.loads(task_result)
        wid = self._resolve_wid(worker_uuid)
        logger.debug('put_task_data %d %d %s',wid,tid,repr(task_result))
        self.journal.addEntry('put_task_result_request',wid=wid,tid=tid,error=error)
        
        try:
            worker_entry = self.worker_registry.get(wid)
            worker_entry.alive_lock.acquire()

            self.update_contact(wid)
            require_worker_initialized(worker_entry)

            # protect against multiple calls from the same worker with the same task
            # this may happen not only because of the login error in the Worker Agent
            # running lattice qcd application I observed TRANSIENT exception on the worker
            # but the call apparently made it to the master
            try:
                task_info=worker_entry.processing_tasks[tid]
            except KeyError:
                logger.debug('ignored multiple call to put_task_data() %d %d',wid,tid)
                return
            
            task_info.details.time_finish = time.time()
            del worker_entry.processing_tasks[tid]

            self.journal.addEntry('put_task_result',wid=wid,tid=tid,error=error)
            
            if error:
                task_info.update(TaskStatus.FAILED,task_result)
                try:
                    logger.debug('task_scheduler.tasks_failed(%s)'%str([task_info.tid]))
                    logger.warning('task %s (%s) failed: %s',task_info.tid,repr(task_info.application_label),task_result)
                    self.task_scheduler.tasks_failed([task_info])
                except Exception,x:
                    logger.exception('Error in TaskScheduler.tasks_failed() callback')
            else: