def poll_controllers(self):
    """Print the controller stats of every known worker.

    For each worker a Thrift controller client is requested using the
    worker's LAN DNS name.  When the client is falsy, a "could not
    connect" line is printed for that worker instead.
    """
    for node in Worker.objects.all():
        client = rpc.getThriftControllerClient(node.lan_dns)
        if not client:
            print("could not connect to controller on %s" % node)
            continue
        stats = client.stats()
        # Same output as printing both values separated by one space.
        print("%s %s" % (client, stats))
        # TODO update worker stats.
예제 #2
0
 def poll_controllers(self):
     """Query each worker's controller and print what it reports.

     Iterates over all Worker rows, asks ``rpc`` for a controller client
     for the worker's ``lan_dns`` and prints the client together with its
     ``stats()`` result.  A falsy client is reported as a failed
     connection.
     """
     for current in Worker.objects.all():
         thrift_client = rpc.getThriftControllerClient(current.lan_dns)
         if not thrift_client:
             print("could not connect to controller on %s" % current)
         else:
             reported = thrift_client.stats()
             # Client and stats are printed on one line, space separated.
             print("%s %s" % (thrift_client, reported))
             # TODO update worker stats.
예제 #3
0
    def update_status(self, instance_name):
        """Advance the worker named ``instance_name`` through its states.

        The worker goes created -> initializing -> updating -> controllable.
        A single call may take more than one step: a worker that has just
        become ``initializing`` is immediately checked for the next step.

        Returns one of WorkerManager.WORKER_NOT_READY, WORKER_INITIALIZING,
        WORKER_UPDATING or WORKER_CONTROLLABLE.  Returns None when the
        worker's status is none of the three states handled here.
        """
        worker = Worker.objects.filter(instance_name=instance_name)[0]

        if worker.status == Worker.States.created:
            ec2 = self.ec2_connection()
            instance = ec2.get_all_instances([instance_name])[0].instances[0]
            if instance.state != 'running':
                logger.debug('Worker %s is still reporting as %s',
                             worker.instance_name, instance.state)
                return WorkerManager.WORKER_NOT_READY

            # The instance is up: record its addresses and announce it.
            logger.info('Worker %s is now initializing: %s',
                        worker.instance_name, instance.public_dns_name)
            worker.status = Worker.States.initializing
            worker.lan_dns = instance.private_dns_name
            worker.wan_dns = instance.public_dns_name
            worker.save()
            mail.report_new_worker(worker)
            # No return here: continue with the initializing step below.

        if worker.status == Worker.States.initializing:
            logger.debug('Trying to update controller on %s',
                         worker.instance_name)
            if not self.update_worker(worker.lan_dns):
                return WorkerManager.WORKER_INITIALIZING
            worker.status = Worker.States.updating
            worker.save()
            logger.info('Worker %s is now updating', worker.instance_name)
            return WorkerManager.WORKER_UPDATING

        if worker.status == Worker.States.updating:
            logger.debug('Checking if controller is up on %s',
                         worker.instance_name)
            try:
                client = rpc.getThriftControllerClient(worker.lan_dns)
                # Only used as a liveness probe; the result is discarded.
                client.get_worker_load_stats()
                worker.status = Worker.States.controllable
                worker.save()
                logger.info('Worker %s is now controllable',
                            worker.instance_name)
                return WorkerManager.WORKER_CONTROLLABLE
            except Exception as err:
                transport_error = TTransport.TTransportException
                if (isinstance(err, transport_error)
                        and err.type == transport_error.NOT_OPEN):
                    logger.info('Controller on worker %s not responding yet.',
                                worker.lan_dns)
                else:
                    logger.exception(
                        'Unexpected exception while checking worker %s',
                        worker.lan_dns)
                return WorkerManager.WORKER_UPDATING
 def _delete_deploy(self, deploy):
     """Delete ``deploy``, first trying to kill its engine on the worker.

     The kill is attempted only when the deploy has a truthy ``base_port``
     and is best effort: a failure is logged and the deletion goes ahead.
     After the deploy is removed, its index is deleted too if it is
     flagged as deleted and has no deploys left.
     """
     logger.debug('Deleting deploy: %r', deploy)
     if deploy.base_port:
         try:
             controller = rpc.getThriftControllerClient(deploy.worker.lan_dns)
             controller.kill_engine(deploy.index.code, deploy.base_port)
         except Exception:
             # Catch Exception rather than everything, so KeyboardInterrupt
             # and SystemExit are not swallowed by this best-effort cleanup.
             logger.exception('Failed when attempting to kill the IndexEngine for the deploy %s', deploy)

     # Keep a reference to the index before the deploy row goes away.
     index = deploy.index
     deploy.delete()

     if index.deleted and index.deploys.count() == 0:
         index.delete()
    def _handle_created(self, deploy):
        """Ask the deploy's worker controller to start the index engine.

        Returns DeployManager.WORKER_NOT_READY_YET while the worker is not
        ready, and DeployManager.INDEX_INITIALIZING once the controller
        reports that the engine started.  Returns None when the start
        failed; the log message states it will be tried again next round.
        """
        if not deploy.worker.is_ready():
            logger.info(
                'Waiting to initialize index "%s" (%s) on %s:%d. The worker is not ready yet',
                deploy.index.name, deploy.index.code,
                deploy.worker.instance_name, deploy.base_port)
            return DeployManager.WORKER_NOT_READY_YET

        controller = rpc.getThriftControllerClient(deploy.worker.lan_dns)

        # Baseline settings; the index's own configuration, merged in
        # further down, may override any of them.
        engine_args = {'functions': deploy.index.get_functions_dict()}

        # there should be exactly one recovery service
        recovery = Service.objects.get(name='recovery')
        # log based storage
        engine_args.update(
            log_based_storage=True,
            log_server_host=recovery.host,
            log_server_port=recovery.port)

        engine_args.update(deploy.index.configuration.get_data())

        # Keys assigned from here on win over the index configuration.
        port = self._get_free_port(deploy)
        engine_args['base_port'] = port
        engine_args['index_code'] = deploy.index.code

        analyzer = deploy.index.get_json_for_analyzer()
        if analyzer:
            engine_args['analyzer_config'] = analyzer

        logger.info('Initializing index "%s" (%s) on %s:%d',
                    deploy.index.name, deploy.index.code,
                    deploy.worker.instance_name, port)

        # override xmx with the one defined for this deploy
        engine_args['xmx'] = deploy.effective_xmx

        logger.debug("deploy: %r\n----\nindex: %r\n----\nstart args: %r",
                     deploy, deploy.index, engine_args)

        if not controller.start_engine(json.dumps(engine_args)):
            logger.warn('Deploy failed starting. Will try again in next round.')
            return None

        # The port is always recorded; the status only advances when the
        # index is not flagged as deleted.
        Deploy.objects.filter(id=deploy.id).update(base_port=port)
        Deploy.objects.filter(id=deploy.id, index__deleted=False).update(
            status=Deploy.States.initializing)
        return DeployManager.INDEX_INITIALIZING
 def update_worker(self, dns):
     """Tell the controller at ``dns`` to update itself, then restart it.

     This machine's canonical hostname is passed to the controller's
     ``update_worker`` call.  A return code of 0 from that call is treated
     as success and a controller restart is requested.

     Returns True when the update succeeded and the restart was requested.
     Returns False when the controller returned a non-zero code or any
     exception was raised while talking to it.
     """
     try:
         controller = rpc.getThriftControllerClient(dns)
         host = socket.gethostbyname_ex(socket.gethostname())[0]
         retcode = controller.update_worker(host)
         if retcode != 0:
             logger.warn('Updating worker %s failed with return code %s', dns, retcode)
             return False
         try:
             logger.debug('Worker %s updated. Restarting...', dns)
             controller.restart_controller()
             logger.warn("Restart controller didn't throw an exception. Did it restart?")
         except TTransport.TTransportException:
             # restart will always fail
             pass
         # Report success explicitly: callers test the result for
         # truthiness to decide whether the worker is now updating.
         return True
     except Exception as e:
         if isinstance(e, TTransport.TTransportException) and e.type == TTransport.TTransportException.NOT_OPEN:
             logger.info('Controller on worker %s not responding yet.', dns)
         else:
             logger.exception('Unexpected exception while updating worker %s', dns)
         return False
    def update_status(self, instance_name):
        """Move the worker with this instance name to its next state.

        Handles the ``created``, ``initializing`` and ``updating`` states in
        that order.  A worker promoted from ``created`` falls straight
        through to the ``initializing`` handling within the same call.

        Returns a WorkerManager.WORKER_* constant describing where the
        worker stands, or None when its status is not one of the three
        states handled here.
        """
        worker = Worker.objects.filter(instance_name=instance_name)[0]

        if worker.status == Worker.States.created:
            connection = self.ec2_connection()
            reservation = connection.get_all_instances([instance_name])[0]
            ec2_instance = reservation.instances[0]
            if ec2_instance.state != 'running':
                logger.debug('Worker %s is still reporting as %s', worker.instance_name, ec2_instance.state)
                return WorkerManager.WORKER_NOT_READY

            logger.info('Worker %s is now initializing: %s', worker.instance_name, ec2_instance.public_dns_name)
            worker.status = Worker.States.initializing
            worker.lan_dns = ec2_instance.private_dns_name
            worker.wan_dns = ec2_instance.public_dns_name
            worker.save()
            mail.report_new_worker(worker)
            # Deliberately no return: the next check runs right away.

        if worker.status == Worker.States.initializing:
            logger.debug('Trying to update controller on %s', worker.instance_name)
            if not self.update_worker(worker.lan_dns):
                return WorkerManager.WORKER_INITIALIZING
            worker.status = Worker.States.updating
            worker.save()
            logger.info('Worker %s is now updating', worker.instance_name)
            return WorkerManager.WORKER_UPDATING

        if worker.status == Worker.States.updating:
            logger.debug('Checking if controller is up on %s', worker.instance_name)
            try:
                # A successful stats call is taken as proof the controller
                # is back up; its result is not used.
                rpc.getThriftControllerClient(worker.lan_dns).get_worker_load_stats()
                worker.status = Worker.States.controllable
                worker.save()
                logger.info('Worker %s is now controllable', worker.instance_name)
                return WorkerManager.WORKER_CONTROLLABLE
            except Exception as error:
                not_open = (
                    isinstance(error, TTransport.TTransportException)
                    and error.type == TTransport.TTransportException.NOT_OPEN
                )
                if not_open:
                    logger.info('Controller on worker %s not responding yet.', worker.lan_dns)
                else:
                    logger.exception('Unexpected exception while checking worker %s', worker.lan_dns)
                return WorkerManager.WORKER_UPDATING
예제 #8
0
 def update_worker(self, dns):
     """Request an update and a restart from the controller at ``dns``.

     The controller's ``update_worker`` is called with this machine's
     canonical hostname.  When it returns 0 the controller is asked to
     restart; a transport error from that restart call is expected and
     ignored.

     Returns True when the update returned 0 and the restart was
     requested.  Returns False on a non-zero return code or on any
     exception raised while contacting the controller.
     """
     try:
         controller = rpc.getThriftControllerClient(dns)
         host = socket.gethostbyname_ex(socket.gethostname())[0]
         retcode = controller.update_worker(host)
         if retcode != 0:
             logger.warn(
                 'Updating worker %s failed with return code %s',
                 dns, retcode)
             return False
         try:
             logger.debug('Worker %s updated. Restarting...', dns)
             controller.restart_controller()
             logger.warn(
                 "Restart controller didn't throw an exception. Did it restart?"
             )
         except TTransport.TTransportException:
             # restart will always fail
             pass
         # Without an explicit True the function returned None on success,
         # which callers testing the result would treat as a failure.
         return True
     except Exception as e:
         if isinstance(
                 e, TTransport.TTransportException
         ) and e.type == TTransport.TTransportException.NOT_OPEN:
             logger.info('Controller on worker %s not responding yet.', dns)
         else:
             logger.exception(
                 'Unexpected exception while updating worker %s', dns)
         return False
예제 #9
0
#
# This script is invoked by upgrade_frontend.sh to tell every
# worker to update its nebu installation from this frontend
# and then restart its controller.
#
# author: santip
#

from nebu.models import Worker
import rpc
import socket
from thrift.transport import TTransport

# Walk over every worker, ask its controller to update using this machine's
# hostname and, when the update returns 0, request a controller restart.
for w in Worker.objects.all():
    print('Upgrading worker %d at %s' % (w.id, w.wan_dns))
    dns = w.lan_dns
    controller = rpc.getThriftControllerClient(dns)
    host = socket.gethostbyname_ex(socket.gethostname())[0]
    retcode = controller.update_worker(host)
    if retcode != 0:
        # The update did not return 0: skip the restart for this worker.
        continue
    try:
        print('Worker %s updated. Restarting...' % dns)
        controller.restart_controller()
        print("Restart controller didn't throw an exception. Did it restart?")
    except TTransport.TTransportException:
        # restart will always fail
        pass
print('Done')