Exemplo n.º 1
0
 def __init__(self, config, config_section_name):
     '''
     Build the executor's collaborators from an already-loaded configuration.

     config              -- configuration object; only config.get(section, option) is used here
     config_section_name -- section that holds 'resources_fpath' and 'jobs_fpath'

     Raises CLAUDEConfigError if a required section or option is missing.
     '''
     logger.debug('Initializing AppExecutor')

     self.config = config
     self.config_main_section_name = config_section_name
     self.config_files_section_name = 'files'
     self.queues_section_name = 'queues'

     # File names taken from AppFNames and kept on the instance
     self.remote_config_fname = AppFNames.CONFIG
     self.remote_system_output_fname = AppFNames.SYSTEM_OUTPUT
     self.remote_duration_output_fname = AppFNames.DURATION_OUTPUT
     #TODO: retrieve app_output from the remote resource if necessary and send it to the job creator
     self.remote_app_output_fname = AppFNames.APP_OUTPUT
     self.remote_s3cfg_fname = AppFNames.S3CFG
     self.remote_pid_fname = AppFNames.PID

     main_section = self.config_main_section_name
     queues_section = self.queues_section_name
     try:
         # Each option is read right before the object that needs it is built
         resources_fpath = self.config.get(main_section, 'resources_fpath')
         self.res_manager = ResourceManager(resources_fpath)

         jobs_fpath = self.config.get(main_section, 'jobs_fpath')
         self.job_manager = JobManager(jobs_fpath)

         executor_queue = self.config.get(queues_section, 'app_executor_queue')
         self.consumer = Consumer(executor_queue)

         monitor_queue = self.config.get(queues_section, 'app_monitor_queue')
         self.producer_app_monitor = Producer(monitor_queue)
     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
         raise CLAUDEConfigError

     self.state = AppExecutorSM()
     self.selector = Selector(self.res_manager, self.job_manager, self.state, self._submit_job)
     self.ssh_manager = SSHManager()

     # Register every resource that is already RUNNING with the SSH manager
     for res in self.res_manager.get_resources_in_state(resource_states.RUNNING):
         res_host = res.host
         creds = res.credentials
         transport = TransportCredentials(creds.username,
                                          creds.password,
                                          proxy_host=creds.proxy_host,
                                          proxy_username=creds.proxy_username)
         self.ssh_manager.add_resource(res_host, transport)
Exemplo n.º 2
0
class AppExecutor(object):
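    '''
    Executes jobs on remote resources.

    Reads control messages from the executor queue, lets a Selector assign
    new jobs to resources, runs them over SSH, and reports finished or
    failed jobs to the return queue recorded on each job.
    '''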

    def __init__(self, config, config_section_name):
        '''
        Build the executor's collaborators from an already-loaded configuration.

        config              -- configuration object; only config.get(section, option) is used here
        config_section_name -- section that holds 'resources_fpath' and 'jobs_fpath'

        Raises CLAUDEConfigError if a required section or option is missing.
        '''
        logger.debug('Initializing AppExecutor')

        self.config = config
        self.config_main_section_name = config_section_name
        self.config_files_section_name = 'files'
        self.queues_section_name = 'queues'

        # Names of the files exchanged with a job's remote working directory
        self.remote_config_fname = AppFNames.CONFIG
        self.remote_system_output_fname = AppFNames.SYSTEM_OUTPUT
        self.remote_duration_output_fname = AppFNames.DURATION_OUTPUT
        #TODO: retrieve app_output from the remote resource if necessary and send it to the job creator
        self.remote_app_output_fname = AppFNames.APP_OUTPUT
        self.remote_s3cfg_fname = AppFNames.S3CFG
        self.remote_pid_fname = AppFNames.PID

        main_section = self.config_main_section_name
        queues_section = self.queues_section_name
        try:
            # Each option is read right before the object that needs it is built
            resources_fpath = self.config.get(main_section, 'resources_fpath')
            self.res_manager = ResourceManager(resources_fpath)

            jobs_fpath = self.config.get(main_section, 'jobs_fpath')
            self.job_manager = JobManager(jobs_fpath)

            executor_queue = self.config.get(queues_section, 'app_executor_queue')
            self.consumer = Consumer(executor_queue)

            monitor_queue = self.config.get(queues_section, 'app_monitor_queue')
            self.producer_app_monitor = Producer(monitor_queue)
        except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
            raise CLAUDEConfigError

        self.state = AppExecutorSM()
        # The selector submits jobs through this executor's _submit_job
        self.selector = Selector(self.res_manager, self.job_manager, self.state, self._submit_job)
        self.ssh_manager = SSHManager()

        # Register every resource that is already RUNNING with the SSH manager
        for res in self.res_manager.get_resources_in_state(resource_states.RUNNING):
            res_host = res.host
            creds = res.credentials
            transport = TransportCredentials(creds.username,
                                             creds.password,
                                             proxy_host=creds.proxy_host,
                                             proxy_username=creds.proxy_username)
            self.ssh_manager.add_resource(res_host, transport)
        
    def start(self):
        '''
        Run the executor's main loop until it is interrupted with control-C.

        Before looping: if any job is already RUNNING an "ask me" message is
        sent to the app monitor, and if any job is NEW the state machine is
        put into ASSIGNING.

        Each iteration drains the incoming queue, lets the selector assign
        jobs while in ASSIGNING, notifies the app monitor once jobs are
        ASSIGNED, and finally reports finished jobs.
        '''
        already_running = self.job_manager.get_jobs_in_state(job_states.RUNNING)
        if len(already_running) > 0:
            self._msg_create_ask_me()

        waiting_jobs = self.job_manager.get_jobs_in_state(job_states.NEW)
        if len(waiting_jobs) > 0:
            self.state.set(AppExecutorSM.ASSIGNING)

        try:
            while True:
                # Handle queued messages one by one until the queue reports nothing left
                got_message = self._check_queue()
                while got_message:
                    got_message = self._check_queue()

                if self.state.get() == AppExecutorSM.ASSIGNING:
                    self.selector.update_hardware_resources()
                    self.selector.assign_jobs()

                # The state is read again on purpose: it may have changed during assignment
                if self.state.get() == AppExecutorSM.ASSIGNED:
                    self._msg_create_ask_me()
                    self.state.set(AppExecutorSM.WAITING)

                self._process_finished_jobs()
        except KeyboardInterrupt:
            logger.info('Caught control-C')
        
    def _process_finished_jobs(self):
        '''
        Report finished jobs to their creators and forget them.

        Every TERMINATED job, then every FAILED job, is announced with a
        "job finished" message, stamped and marked SENT, and the job store
        is saved after each one. Afterwards all SENT jobs are deleted.
        '''
        for finished_state in (job_states.TERMINATED, job_states.FAILED):
            for job in self.job_manager.get_jobs_in_state(finished_state):
                # Sent while job.state is still the finished state: the message's
                # error code is derived from it
                self._msg_create_job_finished(job)

                add_timestamp(job, timestamps.SENT)
                job.state = job_states.SENT
                self.job_manager.save()

        # Collect first (keyed by jid), delete afterwards, so nothing is
        # removed while the lookup result is still being walked
        sent_by_jid = dict((job.jid, job)
                           for job in self.job_manager.get_jobs_in_state(job_states.SENT))
        for job in sent_by_jid.values():
            self.job_manager.delete(job)
        
    def _msg_create_job_finished(self, job):
        '''
        Send a type-10 message about `job` to the job's own return queue.

        The message carries the job id, an error code of 1 when the job is
        FAILED, and the system/duration outputs when they are non-empty.
        A new Producer bound to job.returnqueue on job.returnip is created
        for the delivery.
        '''
        envelope = claude_msgs_pb2.Msg()
        envelope.type = 10
        # form_message is defined elsewhere; only the first item of its result is used
        payload, _type_name = form_message(envelope)

        payload.jid = job.jid
        if job.state == job_states.FAILED:
            payload.errorcode = 1

        # Optional outputs are copied only when the job actually has them
        for field in ('systemoutput', 'durationoutput'):
            value = getattr(job, field)
            if value:
                setattr(payload, field, value)

        wire_bytes = envelope.SerializeToString()
        Producer(job.returnqueue, host=job.returnip).put(wire_bytes)
        
    def _msg_create_job_exists(self, jid, returnqueue, returnip):
        '''
        Send a type-11 message carrying `jid` to `returnqueue` on host `returnip`.

        Used by the add-job handler when the job manager refuses a job.
        '''
        envelope = claude_msgs_pb2.Msg()
        envelope.type = 11
        payload, _type_name = form_message(envelope)
        payload.jid = jid

        wire_bytes = envelope.SerializeToString()
        Producer(returnqueue, host=returnip).put(wire_bytes)
        
    def _check_queue(self):
        '''
        Take at most one message from the executor queue and dispatch it.

        Returns True when the consumer delivered a message (even if its type
        has no handler here) and False when it delivered nothing.
        '''
        raw = self.consumer.get()
        if not raw:
            return False

        envelope = claude_msgs_pb2.Msg()
        envelope.ParseFromString(raw)
        payload, _type_name = form_message(envelope)

        # Message type -> handler; types not listed are ignored
        handlers = {
            3: self._msg_handler_add_resource,
            4: self._msg_handler_add_job,
            6: self._msg_handler_req_apps_to_monitor,
            8: self._msg_handler_finished_apps,
            9: self._msg_handler_kill_job,
        }
        handler = handlers.get(envelope.type)
        if handler is not None:
            handler(payload)

        return True
    
    def _msg_handler_kill_job(self, msg):
        '''
        Handle a request to kill the job identified by msg.jid.

        Only jobs in state NEW or RUNNING are accepted. A RUNNING job is
        first killed on its resource; then the KILLED timestamp is recorded
        and the job store is saved.

        No exception leaves this handler: an unknown job, a missing resource
        or an unreachable resource is logged and the request is dropped, so a
        single bad kill request cannot stop the executor's main loop.

        NOTE(review): job.state is not changed here, so a killed NEW job still
        looks like a NEW job to the rest of this class - confirm that the
        KILLED timestamp is honoured wherever jobs are assigned.
        '''
        jid = msg.jid

        try:
            job = self.job_manager.get_job_by_jid(jid)

            if (job.state == job_states.NEW) or (job.state == job_states.RUNNING):
                if job.state == job_states.RUNNING:
                    resource = self.res_manager.get_resource_by_rid(job.rid)
                    self._kill_job(job, resource)

                add_timestamp(job, timestamps.KILLED)
                self.job_manager.save()
                logger.info('Job "%s" was killed' % jid)
            else:
                logger.warning('Cannot kill job "%s" in state %s' % (jid, job.state))
        except CLAUDEJobDoesNotExist:
            # Previously swallowed silently; the request is still dropped, but visibly
            logger.warning('Cannot kill job "%s": no such job' % jid)
        except CLAUDEResourceDoesNotExist:
            logger.error('Something is terribly wrong with the system...', exc_info=True)
        except CLAUDEResourceError:
            # _kill_job raises this when the host cannot be reached; it can only
            # occur after `job` was looked up, so job.rid is safe to read here.
            # The job is not stamped as killed because the kill did not happen.
            logger.warning('Problem with resource %s, job "%s" was not killed' % (job.rid, jid))
            
    def _msg_handler_finished_apps(self, msg):
        '''
        Handle a notification that the applications of msg.jids have finished.

        For each listed job that is still RUNNING: fetch its outputs, release
        its working directory and switch the state machine to ASSIGNING. If
        the resource misbehaves during that, only a warning is logged. Either
        way the job is then stamped and stored as TERMINATED.

        Jobs in any other state are skipped; unknown jobs or resources are
        logged as errors.
        '''
        for jid in msg.jids:
            try:
                job = self.job_manager.get_job_by_jid(jid)
                resource = self.res_manager.get_resource_by_rid(job.rid)

                if not (job.state == job_states.RUNNING):
                    continue

                try:
                    self._retrieve_output(job, resource)
                    self._free_job(job, resource)
                    self.state.set(AppExecutorSM.ASSIGNING)
                except CLAUDEResourceError:
                    logger.warning('Problem with resource %s' % resource.rid)

                # Reached with or without a resource problem above
                add_timestamp(job, timestamps.TERMINATED)
                job.state = job_states.TERMINATED
                self.job_manager.save()

                logger.info('Job "%s" has terminated on resource "%s"' % (jid, job.rid))
            except (CLAUDEResourceDoesNotExist, CLAUDEJobDoesNotExist):
                logger.error('Something is terribly wrong with the system...', exc_info=True)
        
    def _msg_handler_req_apps_to_monitor(self, msg):
        '''
        Answer a request for the applications that should be monitored.

        Sends one type-7 message to the app monitor queue with an entry per
        RUNNING job (job id, host, credentials and pid). Nothing is sent when
        no job is RUNNING. The content of the incoming `msg` is not read.
        '''
        running_jobs = self.job_manager.get_jobs_in_state(job_states.RUNNING)
        if len(running_jobs) == 0:
            return

        envelope = claude_msgs_pb2.Msg()
        envelope.type = 7
        reply, _type_name = form_message(envelope)

        for running_job in running_jobs:
            try:
                # Looked up before an entry is added, so a missing resource leaves no entry behind
                resource = self.res_manager.get_resource_by_rid(running_job.rid)

                entry = reply.jobs.add()
                entry.jid = running_job.jid
                entry.host = resource.host

                source = resource.credentials
                target = entry.credentials
                target.username = source.username
                target.password = source.password
                target.proxy_host = source.proxy_host
                target.proxy_username = source.proxy_username

                entry.pid = running_job.pid
            except CLAUDEResourceDoesNotExist:
                logger.error('Something is terribly wrong with the system...', exc_info=True)

        self.producer_app_monitor.put(envelope.SerializeToString())
    
    def _msg_handler_add_resource(self, msg):
        '''
        Register the resource described by `msg`.

        The resource is handed to the resource manager in state RUNNING. When
        the manager returns a resource, the selector refreshes its hardware
        view, the host is registered with the SSH manager and the state
        machine moves to ASSIGNING. A falsy result from the manager means
        nothing else happens.
        '''
        #TODO: state should be set somewhere else
        state = resource_states.RUNNING

        total = msg.tresources
        tresources = HardwareResources(total.ram, total.cpu, total.disk)

        incoming = msg.credentials
        credentials = TransportCredentials(incoming.username, incoming.password, incoming.proxy_host, incoming.proxy_username)

        params = dict((param.key, param.value) for param in msg.params)

        resource = self.res_manager.add_resource(msg.rid, msg.rtype, msg.host, credentials, state, tresources, msg.rootdir, params)
        if not resource:
            return

        self.selector.update_hardware_resources()

        # The SSH manager gets credentials rebuilt from the stored resource
        stored = resource.credentials
        ssh_credentials = TransportCredentials(stored.username,
                                               stored.password,
                                               proxy_host=stored.proxy_host,
                                               proxy_username=stored.proxy_username)
        self.ssh_manager.add_resource(msg.host, ssh_credentials)

        self.state.set(AppExecutorSM.ASSIGNING)
        
    def _msg_handler_add_job(self, msg):
        '''
        Register the job described by `msg` as a NEW job.

        A fresh UUID string becomes the job's working directory name. The
        fields s3cfg, keepworkingdir, chainedjob and appoutput fall back to
        None when `msg` has no such attribute.

        When the job manager accepts the job the state machine moves to
        ASSIGNING; otherwise a "job exists" message is sent to the job's
        return queue.
        '''
        working_dir = str(uuid.uuid4())
        state = job_states.NEW

        requested = msg.rresources
        rresources = HardwareResources(requested.ram, requested.cpu, requested.disk)

        # Optional fields: a missing attribute is treated as None
        s3cfg = getattr(msg, 's3cfg', None)
        keepworkingdir = getattr(msg, 'keepworkingdir', None)
        chainedjob = getattr(msg, 'chainedjob', None)
        appoutput = getattr(msg, 'appoutput', None)

        params = dict((param.key, param.value) for param in msg.params)
        chainedjobparams = dict((param.key, param.value) for param in msg.chainedjobparams)

        accepted = self.job_manager.add_job(msg.jid,
                                            working_dir,
                                            state,
                                            rresources,
                                            msg.script,
                                            msg.returnip,
                                            msg.returnqueue,
                                            s3cfg,
                                            keepworkingdir,
                                            chainedjob,
                                            appoutput,
                                            params,
                                            chainedjobparams)
        if accepted:
            self.state.set(AppExecutorSM.ASSIGNING)
        else:
            self._msg_create_job_exists(msg.jid, msg.returnqueue, msg.returnip)

    def _msg_create_ask_me(self):
        '''
        Send an otherwise empty type-2 message to the app monitor queue.
        '''
        envelope = claude_msgs_pb2.Msg()
        envelope.type = 2
        # Called as in the sibling message builders; its result is not needed here
        _payload, _type_name = form_message(envelope)

        self.producer_app_monitor.put(envelope.SerializeToString())
    
    def _kill_job(self, job, resource):
        '''
        Kill the process job.pid on the host of `resource`.

        Raises CLAUDEResourceError when the host cannot be connected to or
        the connection is not established; the cause is logged first.
        '''
        target_host = resource.host

        try:
            ssh = self.ssh_manager.get_connection(target_host)
            ssh.kill_by_pid(job.pid)

            logger.debug('Process "%s" was killed on host "%s"' % (job.pid, target_host))
        except CLAUDEConnectingError:
            logger.error('Error connecting to host %s' % target_host, exc_info=True)
            raise CLAUDEResourceError
        except CLAUDENotConnectedError:
            logger.error('No connection to host %s' % target_host, exc_info=True)
            raise CLAUDEResourceError
    
    def _retrieve_output(self, job, resource):
        try:
            host = resource.host
            connection = self.ssh_manager.get_connection(host)
            
            with TemporaryDirectory() as tmp_dir:
                local_output_fpath = os.path.join(tmp_dir, 'local_system.tmp')
                remote_output_fpath = os.path.join(os.path.join(resource.rootdir, job.workingdir), self.remote_system_output_fname)
                connection.get(remote_output_fpath, local_output_fpath)
                with open(local_output_fpath) as f:
                    job.systemoutput = f.read()
                    
                local_output_fpath = os.path.join(tmp_dir, 'local_duration.tmp')
                remote_output_fpath = os.path.join(os.path.join(resource.rootdir, job.workingdir), self.remote_duration_output_fname)
                connection.get(remote_output_fpath, local_output_fpath)
                with open(local_output_fpath) as f:
                    job.durationoutput = f.read()
                    
                self.job_manager.save()
        except CLAUDEConnectingError:
            logger.error('Error connecting to host %s' % host, exc_info=True)
            raise CLAUDEResourceError
        except CLAUDENotConnectedError:
            logger.error('No connection to host %s' % host, exc_info=True)
            raise CLAUDEResourceError
        except CLAUDEFileContentRetrievingTimeout:
            logger.error('Error retrieving output of job %s from host %s' % (job.jid, host), exc_info=True)
    
    def _free_job(self, job, resource):
        try:
            host = resource.host
            connection = self.ssh_manager.get_connection(host)
            
            if not job.keepworkingdir:
                working_dir = os.path.join(resource.rootdir, job.workingdir)
                connection.rmdir(working_dir)
                
                logger.debug('Working dir "%s" was removed from host "%s"' % (working_dir, host))
        except CLAUDEConnectingError:
            logger.error('Error connecting to host %s' % host, exc_info=True)
            raise CLAUDEResourceError
        except CLAUDENotConnectedError:
            logger.error('No connection to host %s' % host, exc_info=True)
            raise CLAUDEResourceError
        
    def _submit_job(self, job, resource):
        '''
        Deploy `job` to `resource` and start it there.

        Steps, in order: (re)create the job's working directory on the host,
        upload the optional s3cfg file, the job script and the common script,
        upload a generated job config when the job has params or a chained
        job, launch the script, read back its pid, and store the job as
        RUNNING on this resource.

        Raises CLAUDEResourceError when the host cannot be reached.
        Raises CLAUDEJobFailed for configuration, I/O or timeout problems,
        after trying to release the job's working directory via _free_job.
        Note that _free_job can itself raise CLAUDEResourceError, which then
        propagates instead of CLAUDEJobFailed.
        '''
        host = resource.host
        
        try:
            connection = self.ssh_manager.get_connection(host)
            
            # Start from a clean working directory: a leftover one is removed first
            working_dir = os.path.join(resource.rootdir, job.workingdir)
            if connection.exists(working_dir):
                logger.warning('Working dir "%s" exists on host "%s", removing it first' % (working_dir, host))
                connection.rmdir(working_dir)
                
            connection.mkdir(working_dir)
            # The local s3cfg path comes from the 'files' config section
            if job.s3cfg:
                local_s3cfg_fpath = self.config.get(self.config_files_section_name, 's3cfg')
                remote_s3cfg_fpath = os.path.join(working_dir, self.remote_s3cfg_fname)
                connection.put(local_s3cfg_fpath, remote_s3cfg_fpath)
            else:
                logger.warning('s3cfg is not in use')
                
            # job.script is an option name in the 'files' section, not a path;
            # the option's value is the local script path
            job_script_name = job.script
            local_script_fpath = self.config.get(self.config_files_section_name, job_script_name)
            script_fname = os.path.basename(local_script_fpath)
            remote_script_fpath = os.path.join(working_dir, script_fname)
            connection.put(local_script_fpath, remote_script_fpath)
            
            # The 'common' script is uploaded next to the job script for every job
            local_common_script_fname = self.config.get(self.config_files_section_name, 'common')
            common_script_fname = os.path.basename(local_common_script_fname)
            remote_common_script_fname = os.path.join(working_dir, common_script_fname)
            connection.put(local_common_script_fname, remote_common_script_fname)
            
            # A job config file is only generated when there is something to put in it
            if job.params or job.chainedjob:
                job_config = self._create_job_config(job)
                remote_job_config_fpath = os.path.join(working_dir, self.remote_config_fname)
                
                # Written to a temporary local file first, then uploaded
                with TemporaryDirectory() as tmp_dir:
                    local_job_config_fpath = os.path.join(tmp_dir, self.remote_config_fname)
                    
                    with open(local_job_config_fpath, 'w') as job_config_file:
                        job_config.write(job_config_file)
                        
                    connection.put(local_job_config_fpath, remote_job_config_fpath)
            
            # The script receives its own working directory as its only argument
            connection.launch_app('python %s %s' % (remote_script_fpath, working_dir))
            # The pid is read from the pid file inside the working directory
            pid = connection.retrieve_pid(os.path.join(working_dir, self.remote_pid_fname))
            
            add_timestamp(job, timestamps.SUBMITTED)
            job.state = job_states.RUNNING
            job.rid = resource.rid
            job.pid = pid
            self.job_manager.save()
            
            logger.info('Job "%s" was submitted to resource "%s"' % (job.jid, resource.rid))
        except (CLAUDENotConnectedError, CLAUDEConnectingError):
            # Connection-level failure: reported as a resource problem, no cleanup attempted
            raise CLAUDEResourceError
        except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, IOError, CLAUDEFileContentRetrievingTimeout, ValueError, CLAUDEIOError, OSError):
            # Job-level failure: release the working directory, then report the job as failed
            self._free_job(job, resource)
            raise CLAUDEJobFailed
        
    def _create_job_config(self, job):
        job_config = ConfigParser.RawConfigParser()
        
        if job.params:
            job_section = 'job'
            job_config.add_section(job_section)
            
            for param in job.params:
                job_config.set(job_section, param.key, param.value)
                
        if job.chainedjob:
            chained_job_section = 'chainedjob'
            job_config.add_section(chained_job_section)
            
            job_config.set(chained_job_section, 'claude_service_ip', get_interface_ip())
            job_config.set(chained_job_section, 'claude_service_queue', self.config.get(self.queues_section_name, 'app_executor_queue'))
            
            if job.chainedjobparams:
                for param in job.chainedjobparams:
                    job_config.set(chained_job_section, param.key, param.value)
        
        return job_config