示例#1
0
def listen_on(data):
    """Relay messages from a private watch queue to the SocketIO client.

    Pops messages off the NamedQueue identified by ``data['wq_id']`` and
    forwards them to the client via ``emit`` until a STOP message, a
    timeout, or an unexpected first message ends the stream.
    """
    info = get_user_info(request, session)

    # Anonymous connections are ignored outright.
    if info.get('uname', None) is None:
        return

    who = (info.get('uname', None), info['ip'])
    LOGGER.info("[%s@%s] SocketIO:Listen - Event received => %s" % (who[0], who[1], data))

    try:
        queue = NamedQueue(data['wq_id'], private=True)
        if data['from_start']:
            # The very first message carries the stream state; give the
            # dispatcher a short window to answer.
            first = queue.pop(timeout=15)

            if first is None:
                emit('error', {'err_msg': 'Never got any response from the dispatcher. Try reloading the page...',
                               'status_code': 404, 'msg': None})
                LOGGER.info("[%s@%s] SocketIO:Listen - Timeout reached. Event terminated." % who)
                return
            if first['status'] == 'START':
                emit('start', {'err_msg': None, 'status_code': 200, 'msg': "Start listening..."})
            elif first['status'] == 'STOP':
                emit('stop', {'err_msg': None, 'status_code': 200, 'msg': "All messages received, closing queue..."})
                LOGGER.info("[%s@%s] SocketIO:Listen - Event terminated gracefully." % who)
                return
            else:
                emit('error', {'err_msg': 'Unexpected status code for the first message',
                               'status_code': 500, 'msg': first})
                LOGGER.info("[%s@%s] SocketIO:Listen - Unexpected message received. "
                            "Event terminated." % who)
                return

        # Main relay loop: forward cache keys until STOP or timeout.
        while True:
            current = queue.pop(timeout=300)

            if current is None:
                emit('error', {'err_msg': 'Never got any response from the dispatcher. Try reloading the page...',
                               'status_code': 404, 'msg': None})
                LOGGER.info("[%s@%s] SocketIO:Listen - Timeout reached. Event terminated." % who)
                break
            status = current['status']
            if status == 'STOP':
                emit('stop', {'err_msg': None, 'status_code': 200, 'msg': "All messages received, closing queue..."})
                LOGGER.info("[%s@%s] SocketIO:Listen - Event terminated gracefully." % who)
                break
            elif status == 'OK':
                emit('cachekey', {'err_msg': None, 'status_code': 200, 'msg': current['cache_key']})
            elif status == 'FAIL':
                emit('cachekeyerr', {'err_msg': None, 'status_code': 200, 'msg': current['cache_key']})

    except Exception:
        LOGGER.exception("[%s@%s] SocketIO:Listen" % who)
    finally:
        LOGGER.info("[%s@%s] SocketIO:Listen - Connection to client was terminated" % who)
def wait_for_networking(timeout):
    """Wait until the redis server backing NamedQueue is reachable.

    Round-trips a throwaway message on a uniquely named queue until the
    operation succeeds or ``timeout`` one-second attempts have elapsed.

    :param timeout: maximum number of 1-second attempts before giving up
    :return: True as soon as redis answered, False otherwise
    """
    import time  # local import keeps this drop-in for files that don't import time

    # Unique queue name so concurrent checks never see each other's message.
    # .hex works on both Python 2 and 3 (get_hex() is Python 2 only).
    uid = uuid.uuid4().hex
    for _each_second in range(timeout):
        try:
            q = NamedQueue('hostagent-redischeck-%s' % uid)
            q.push('can i reach you')
            # blocking=False: we only care that the round-trip didn't raise.
            q.pop(timeout=1, blocking=False)
            return True
        except Exception as e:
            print('waiting for redis reachability. %s ' % str(e))
            # Bug fix: the loop previously retried immediately, so "timeout"
            # bounded iterations rather than seconds. Pause so each failed
            # attempt costs ~1 second, as the loop variable name implies.
            time.sleep(1)
    return False
示例#3
0
def get_all_messages(notification_queue, **kwargs):
    """
    Get all messages on the specified notification queue

    Variables:
    complete_queue       => Queue to get the message from

    Arguments:
    None

    Data Block:
    None

    Result example:
    []            # List of messages
    """
    queue = NamedQueue("nq-%s" % notification_queue,
                       host=config.core.redis.persistent.host,
                       port=config.core.redis.persistent.port,
                       db=config.core.redis.persistent.db)

    # Drain the queue without blocking; stop at the first empty pop.
    messages = []
    msg = queue.pop(blocking=False)
    while msg is not None:
        messages.append(msg)
        msg = queue.pop(blocking=False)

    return make_api_response(messages)
示例#4
0
 def _send_control_queue_call(cls, shard, state, **kw):
     """Send an RPC on the given dispatcher shard's control queue.

     Registers a reply queue, pushes the task, and waits up to 5 seconds
     for the dispatcher's response (None on timeout).
     """
     reply_name = reply_queue_name(state)
     kw['state'] = state
     kw['watch_queue'] = reply_name
     task = Task({}, **kw)
     forge.get_control_queue('control-queue-' + str(shard)).push(task.raw)
     return NamedQueue(reply_name).pop(timeout=5)
示例#5
0
    def _check_time_drift(self):
        dispatcher = '0'
        name = reply_queue_name('cli_get_time')
        t = Task({}, **{
            'state': 'get_system_time',
            'watch_queue': name,
        })
        forge.get_control_queue('control-queue-' + dispatcher).push(t.raw)
        nq = NamedQueue(name)
        r = nq.pop(timeout=5)
        if r is None or 'time' not in r:
            self.log.warn('timed out trying to determine dispatchers clock.')
            return

        clock_difference = abs(r['time'] - time.time())
        if clock_difference > 600:
            self.log.info(
                'Dispatchers clock %s away from ours. Clocks are not set correctly',
                clock_difference)
        else:
            self.log.debug('Clock drift from dispatcher: %s.',
                           clock_difference)
示例#6
0
def get_message(notification_queue, **kwargs):
    """
    Get one message on the specified notification queue

    Variables:
    complete_queue       => Queue to get the message from

    Arguments:
    None

    Data Block:
    None

    Result example:
    {}          # A message
    """
    queue = NamedQueue("nq-%s" % notification_queue,
                       host=config.core.redis.persistent.host,
                       port=config.core.redis.persistent.port,
                       db=config.core.redis.persistent.db)

    # Non-blocking pop: returns None when the queue is empty.
    return make_api_response(queue.pop(blocking=False))
示例#7
0
def get_message(wq_id, **kwargs):
    """
    Get a message from a live watch queue. 
    Note: This method is not optimal because it requires the
          UI to pull the information. The prefered method is the
          socket server.
    
    Variables:
    wq_id       => Queue to get the message from
    
    Arguments: 
    None
    
    Data Block:
    None
    
    Result example:
    {
     "type": "",         # Type of message
     "err_msg": "",      # Error message
     "status_code": 400, # Status code of the error
     "msg": ""           # Message
    } 
    """
    raw = NamedQueue(wq_id).pop(blocking=False)

    if raw is None:
        # Nothing on the queue right now.
        response = {
            'type': 'timeout',
            'err_msg': 'Timeout waiting for a message.',
            'status_code': 408,
            'msg': None
        }
    else:
        status = raw['status']
        # Statuses that map to a canned message body.
        canned = {
            'STOP': ('stop', "All messages received, closing queue..."),
            'START': ('start', "Start listening..."),
        }
        if status in canned:
            msg_type, text = canned[status]
            response = {
                'type': msg_type,
                'err_msg': None,
                'status_code': 200,
                'msg': text
            }
        elif status == 'OK' or status == 'FAIL':
            # Both carry a cache key; only the message type differs.
            response = {
                'type': 'cachekey' if status == 'OK' else 'cachekeyerr',
                'err_msg': None,
                'status_code': 200,
                'msg': raw['cache_key']
            }
        else:
            response = {
                'type': 'error',
                'err_msg': "Unknown message",
                'status_code': 500,
                'msg': raw
            }

    return make_api_response(response)
示例#8
0
def get_messages(wq_id, **kwargs):
    """
    Get all messages currently on a watch queue. 
    Note: This method is not optimal because it requires the
          UI to pull the information. The prefered method is the
          socket server when possible.
    
    Variables:
    wq_id       => Queue to get the message from
    
    Arguments: 
    None
    
    Data Block:
    None
    
    Result example:
    []            # List of messages
    """
    # Statuses that map to a canned message body.
    canned = {
        'STOP': ('stop', "All messages received, closing queue..."),
        'START': ('start', "Start listening..."),
    }

    resp_list = []
    queue = NamedQueue(wq_id)

    while True:
        raw = queue.pop(blocking=False)
        if raw is None:
            # Queue drained.
            break

        status = raw['status']
        if status in canned:
            msg_type, text = canned[status]
            response = {
                'type': msg_type,
                'err_msg': None,
                'status_code': 200,
                'msg': text
            }
        elif status == 'OK' or status == 'FAIL':
            # Both carry a cache key; only the message type differs.
            response = {
                'type': 'cachekey' if status == 'OK' else 'cachekeyerr',
                'err_msg': None,
                'status_code': 200,
                'msg': raw['cache_key']
            }
        else:
            response = {
                'type': 'error',
                'err_msg': "Unknown message",
                'status_code': 500,
                'msg': raw
            }

        resp_list.append(response)

    return make_api_response(resp_list)
示例#9
0
    def _init_registration(self):
        """Create or refresh this host's registration record.

        VM hosts read their registration off a NamedQueue fed by their
        controller; physical hosts read (or create) theirs in the
        datastore. The result is stored on self.registration and any
        config overrides it carries are applied.
        """
        def stamp_host_facts(reg):
            # Host facts re-captured on every registration refresh.
            reg['hostname'] = net.get_hostname()
            reg['ip'] = self.ip
            reg['machine_info'] = sysinfo.get_machine_info()
            reg['last_checkin'] = isotime.now_as_iso()
            reg['platform'] = sysinfo.get_platform()

        def ensure_role(reg, role):
            # The roles list may be absent on older records.
            if 'roles' not in reg:
                reg['roles'] = []
            if role not in reg['roles']:
                reg['roles'].append(role)

        if self.is_a_vm():
            # Peek the registration left for us on the VM queue (pop then
            # push back so it stays available).
            nq = NamedQueue('vm-%s' % self.mac, db=DATABASE_NUM)
            reg = nq.pop()
            nq.push(reg)

            self.log.info('Updating our registration.')
            stamp_host_facts(reg)
            reg['updated'] = time.asctime()
            reg['system_name'] = config.system.name
            ensure_role(reg, "hostagent")

        else:
            reg = self.store.get_node(self.mac)

            if not reg:
                self.log.info(
                    'This appears to be our first run on this host. Registering ourselves.'
                )
                reg = DEFAULT_REGISTRATION.copy()
                stamp_host_facts(reg)
                reg['mac_address'] = self.mac
                reg['profile'] = 'idle'
                reg['created'] = time.asctime()
                ensure_role(reg, "controller")
                ensure_role(reg, "hostagent")
                self.store.save_node(self.mac, reg)
            else:
                # Just do an update of the extra info in registration.
                self.log.info('Updating our registration.')
                stamp_host_facts(reg)
                if not reg.get('profile', None):
                    reg['profile'] = config.workers.default_profile
                reg['updated'] = time.asctime()
                reg['system_name'] = config.system.name
                if not reg.get('is_vm', False):
                    ensure_role(reg, "controller")
                ensure_role(reg, "hostagent")
                self.store.save_node(self.mac, reg)

        self.registration = reg

        msgs = forge.apply_overrides(reg.get('config_overrides', None))
        if msgs:
            self.log.info("Using %s.", " and ".join(msgs))

        self.log.info('Our registration: %s',
                      pprint.pformat(self.registration))
示例#10
0
class HostAgent(object):
    """Per-host supervisor daemon for worker nodes.

    Registers the host in the datastore, starts the components provisioned
    by the host's profile (ServiceManager, VmManager, or FlexManager for
    flex profiles), then serves RPC requests pushed onto a NamedQueue named
    after this host's MAC address. Heartbeats are published periodically on
    the 'status' CommsQueue.
    """

    def __init__(self):
        # Identity: the MAC address doubles as our RPC queue name and
        # datastore registration key.
        self.ip = net.get_hostip()
        self.mac = net.get_mac_for_ip(self.ip)
        self.store = forge.get_datastore()
        self.log = logging.getLogger('assemblyline.agent')
        self.log.info('Starting HostAgent: MAC[%s] STORE[%s]' %
                      (self.mac, self.store))

        # This hosts registration from riak (Hosts tab in UI).
        self.registration = None
        self.service_manager = None
        self.vm_manager = None
        self.flex_manager = None
        self.lock = None
        # NOTE(review): consumer_thread is joined in stop() but nothing in
        # this class assigns it; run() starts executor_thread instead and
        # never joins it — confirm which thread handle is intended.
        self.consumer_thread = None
        self._should_run = False
        self.host_profile = {}
        self.executor_thread = None

        # Chores are actions that we run periodically and which we coalesce
        # when the same chore is requested multiple times in the same tick.
        # Jobs are executed as they are received.
        self.jobs = LocalQueue()
        self.last_heartbeat = 0
        # Dispatch table: AgentRequest message type -> bound handler method.
        self.rpc_handlers = {
            AgentRequest.PING: self.ping,
            AgentRequest.DRAIN: self.drain,
            AgentRequest.UNDRAIN: self.undrain,
            AgentRequest.SHUTDOWN: self.shutdown,
            AgentRequest.VM_LIST: self.list_vms,
            AgentRequest.VM_START: self.start_vm,
            AgentRequest.VM_STOP: self.stop_vm,
            AgentRequest.VM_STOP_ALL: self.stop_all_vms,
            AgentRequest.VM_RESTART: self.restart_vm,
            AgentRequest.VM_REFRESH_ALL: self.refresh_vm_all,
            AgentRequest.VM_REFRESH_FLEET: self.refresh_vm_fleet,
            AgentRequest.VM_GET_REVERT_TIMES: self.vm_get_revert_times,
            AgentRequest.START_SERVICES: self.start_services,
            AgentRequest.STOP_SERVICES: self.stop_services,
        }

        self._should_run = True

        # Fetch and update our host registration information in riak.
        # self._init_registration() defer registration until later

    # noinspection PyUnresolvedReferences
    def register_host(self):
        """Create this host's datastore registration if none exists.

        Returns a human-readable status string describing what happened.
        VMs are skipped; their registration travels via the VM queue.
        """
        if self.is_a_vm():
            return "This is a VM, no need to register."

        existing_reg = self.store.get_node(self.mac)
        if existing_reg:
            return "already registered: %s" % pprint.pformat(existing_reg)
        reg = DEFAULT_REGISTRATION.copy()
        reg['hostname'] = net.get_hostname()
        reg['ip'] = self.ip
        reg['mac_address'] = self.mac
        reg['machine_info'] = sysinfo.get_machine_info()
        reg['last_checkin'] = isotime.now_as_iso()
        reg['platform'] = sysinfo.get_platform()
        reg['profile'] = config.workers.default_profile
        reg['created'] = time.asctime()
        if 'roles' not in reg:
            reg['roles'] = []
        if "controller" not in reg["roles"]:
            reg['roles'].append("controller")
        if "hostagent" not in reg["roles"]:
            reg['roles'].append("hostagent")
        self.store.save_node(self.mac, reg)
        return 'Registered %s with %s' % (self.mac, pprint.pformat(reg))

    def _init_queues(self):
        # Our RPC queue is keyed by MAC address; peers address us through it.
        self.rpcqueue = NamedQueue(self.mac)

    def is_a_vm(self):
        """Return True when our MAC carries the VM prefix and KVM is enabled."""
        if self.mac.startswith(VM_MAC_PREFIX) and config.workers.install_kvm:
            return True
        return False

    # noinspection PyUnresolvedReferences
    def _init_registration(self):
        """Create or refresh this host's registration record.

        VM hosts read theirs off a NamedQueue (pop then push back so it
        stays available); physical hosts read or create theirs in the
        datastore. Stores the result on self.registration and applies any
        config overrides the record carries.
        """
        if self.is_a_vm():
            nq = NamedQueue('vm-%s' % self.mac, db=DATABASE_NUM)
            reg = nq.pop()
            nq.push(reg)

            self.log.info('Updating our registration.')
            reg['hostname'] = net.get_hostname()
            reg['ip'] = self.ip
            reg['machine_info'] = sysinfo.get_machine_info()
            reg['last_checkin'] = isotime.now_as_iso()
            reg['platform'] = sysinfo.get_platform()
            reg['updated'] = time.asctime()
            reg['system_name'] = config.system.name
            if 'roles' not in reg:
                reg['roles'] = []
            if "hostagent" not in reg["roles"]:
                reg['roles'].append("hostagent")

        else:
            reg = self.store.get_node(self.mac)

            if not reg:
                self.log.info(
                    'This appears to be our first run on this host. Registering ourselves.'
                )
                reg = DEFAULT_REGISTRATION.copy()
                reg['hostname'] = net.get_hostname()
                reg['ip'] = self.ip
                reg['mac_address'] = self.mac
                reg['machine_info'] = sysinfo.get_machine_info()
                reg['last_checkin'] = isotime.now_as_iso()
                reg['platform'] = sysinfo.get_platform()
                reg['profile'] = 'idle'
                reg['created'] = time.asctime()
                if 'roles' not in reg:
                    reg['roles'] = []
                if "controller" not in reg["roles"]:
                    reg['roles'].append("controller")
                if "hostagent" not in reg["roles"]:
                    reg['roles'].append("hostagent")
                self.store.save_node(self.mac, reg)
            else:
                # Just do an update of the extra info in registration.
                self.log.info('Updating our registration.')
                reg['hostname'] = net.get_hostname()
                reg['ip'] = self.ip
                if not reg.get('profile', None):
                    reg['profile'] = config.workers.default_profile
                reg['machine_info'] = sysinfo.get_machine_info()
                reg['last_checkin'] = isotime.now_as_iso()
                reg['platform'] = sysinfo.get_platform()
                reg['updated'] = time.asctime()
                reg['system_name'] = config.system.name
                if 'roles' not in reg:
                    reg['roles'] = []
                # VMs never act as controller for the physical host.
                if "controller" not in reg["roles"] and not reg.get(
                        'is_vm', False):
                    reg['roles'].append("controller")
                if "hostagent" not in reg["roles"]:
                    reg['roles'].append("hostagent")
                self.store.save_node(self.mac, reg)

        self.registration = reg

        msgs = forge.apply_overrides(reg.get('config_overrides', None))
        if msgs:
            self.log.info("Using %s.", " and ".join(msgs))

        self.log.info('Our registration: %s',
                      pprint.pformat(self.registration))

    def _wait_for_networking(self, timeout):
        """Round-trip a message on a throwaway queue until redis answers.

        Returns True on the first successful round-trip, False after
        ``timeout`` attempts.
        """
        # NOTE(review): failures retry immediately (pop is non-blocking),
        # so ``timeout`` bounds iterations rather than seconds. Also
        # get_hex() is Python 2 only; .hex works on 2 and 3.
        uid = uuid.uuid4().get_hex()
        for each_second in xrange(timeout):
            try:
                q = NamedQueue('hostagent-redischeck-%s' % uid)
                q.push('can i reach you')
                q.pop(timeout=1, blocking=False)
                return True
            except Exception as e:
                self.log.info('waiting for redis reachability. %s ', str(e))
        return False

    def _check_time_drift(self):
        """Ask dispatcher 0 for its clock and log large differences from ours."""
        dispatcher = '0'
        name = reply_queue_name('cli_get_time')
        t = Task({}, **{
            'state': 'get_system_time',
            'watch_queue': name,
        })
        forge.get_control_queue('control-queue-' + dispatcher).push(t.raw)
        nq = NamedQueue(name)
        r = nq.pop(timeout=5)
        if r is None or 'time' not in r:
            self.log.warn('timed out trying to determine dispatchers clock.')
            return

        # More than 10 minutes of drift suggests misconfigured clocks.
        clock_difference = abs(r['time'] - time.time())
        if clock_difference > 600:
            self.log.info(
                'Dispatchers clock %s away from ours. Clocks are not set correctly',
                clock_difference)
        else:
            self.log.debug('Clock drift from dispatcher: %s.',
                           clock_difference)

    # noinspection PyBroadException
    def _clear_tempdir(self):
        """Best-effort removal of our 'al' temp directory from prior runs."""
        # Clear our temporary folder of any files left from previous executions.
        try:
            altemp_dir = os.path.join(tempfile.gettempdir(), 'al')
            shutil.rmtree(altemp_dir, ignore_errors=True)
        except:
            self.log.exception(
                'while clearing temporary directory during sysprep')

    def sysprep(self):
        """Basic prep and return.

        Refreshes registration and queues, clears temp files, waits for
        redis, checks clock drift, then syspreps any provisioned VMs.
        Raises ProvisioningError when the host cannot be provisioned.
        """
        self._init_registration()
        self._init_queues()
        self.log.info('performing sysprep')
        self._clear_tempdir()
        self._wait_for_networking(20)
        self._check_time_drift()

        if not self.registration:
            raise ProvisioningError('Host registration not found.')

        if not self.registration.get('enabled', None):
            raise ProvisioningError('Host explicitly disabled.')

        profile_name = self.registration.get('profile', None)
        if not profile_name:
            raise ProvisioningError('Host has no assigned profile.')

        if 'profile_definition' not in self.registration:

            self.host_profile = self.store.get_profile(profile_name)
            if not self.host_profile:
                # NOTE(review): %s here is never interpolated — the extra
                # arg just rides along in Exception.args.
                raise ProvisioningError(
                    'Host profile does not appear to exist in datastore: %s.',
                    profile_name)

            self.log.info('Our profile: %s', pprint.pformat(self.host_profile))

        else:
            # Inline profile definition takes precedence over the datastore.
            self.host_profile = self.registration.get('profile_definition', {})

        self.log.info('Our profile: %s', pprint.pformat(self.host_profile))
        vm_config = self.host_profile.get('virtual_machines', {})
        if vm_config and not profile_name.startswith('flex'):
            from assemblyline.al.common.vm import VmManager
            self.vm_manager = VmManager(vm_config)
            self.vm_manager.sysprep()

        # if we are are running within a VM. patch hosts files.
        if self.is_a_vm():
            nq = NamedQueue('vm-%s' % self.mac, db=DATABASE_NUM)
            nq.push(self.registration)

    # noinspection PyUnusedLocal
    def undrain(self, msg):
        """Resume normal operation after a drain; reopens the datastore."""
        self.store = forge.get_datastore()
        if self.service_manager:
            self.service_manager.undrain()
        if self.vm_manager:
            self.vm_manager.undrain()
        return True

    # noinspection PyUnusedLocal
    def drain(self, msg):
        """Drain managers and close the datastore connection."""
        if self.service_manager:
            self.service_manager.drain()
        if self.vm_manager:
            self.vm_manager.drain()
        if self.store:
            self.store.close()
        return True

    # noinspection PyUnusedLocal
    def list_vms(self, _msg):
        # Thin RPC delegate to the VM manager.
        return self.vm_manager.list_vms()

    # noinspection PyUnusedLocal
    def stop_all_vms(self, _msg):
        # Thin RPC delegate to the VM manager.
        return self.vm_manager.stop_all()

    def start_vm(self, msg):
        # RPC: start the VM instance named in the message body.
        instance_name = msg.body.get('name', None)
        return self.vm_manager.start_vm(instance_name)

    def stop_vm(self, msg):
        # RPC: stop the VM instance named in the message body.
        instance_name = msg.body.get('name', None)
        return self.vm_manager.stop_vm(instance_name)

    def restart_vm(self, msg):
        # RPC: restart the VM instance named in the message body.
        instance_name = msg.body.get('name', None)
        return self.vm_manager.restart_vm(instance_name)

    def refresh_vm_fleet(self, msg):
        # RPC: refresh the VM fleet named in the message body.
        fleet_name = msg.body.get('name', None)
        return self.vm_manager.refresh_fleet(fleet_name)

    # noinspection PyUnusedLocal
    def vm_get_revert_times(self, _msg):
        # Thin RPC delegate to the VM manager.
        return self.vm_manager.get_revert_times()

    # noinspection PyUnusedLocal
    def refresh_vm_all(self, _msg):
        # Thin RPC delegate to the VM manager.
        return self.vm_manager.refresh_all()

    @staticmethod
    def _handle_unknown_request(msg):
        # Fallback handler for message types absent from rpc_handlers.
        # NOTE(review): the %s is never interpolated; the mtype is only
        # carried as a second Exception arg.
        raise Exception('Unknown message type: %s', msg.mtype)

    def start_services(self, _):
        """RPC: (re)start the service manager if it is not running."""
        self._start_services()
        return 'started'

    # noinspection PyUnusedLocal
    def stop_services(self, msg):
        """RPC: shut down the service manager."""
        self._stop_services()
        return 'stopped'

    @staticmethod
    def _handle_exception(msg, e):
        # Format a handler failure into the RPC error reply body.
        return 'Exception while processing msg %s: %s' % (msg.mtype, str(e))

    def _handle_request(self, msg):
        """Dispatch an RPC message to its registered handler."""
        self.log.info('Processing RPC: %s', msg.mtype)
        handler = self.rpc_handlers.get(msg.mtype,
                                        self._handle_unknown_request)
        return handler(msg)

    # noinspection PyBroadException
    def _rpc_executor_thread_main(self):
        """Consumer loop: pop raw RPCs off our queue and enqueue parsed jobs.

        Parse failures are answered immediately with a failed reply; valid
        messages go onto self.jobs for the main loop to execute.
        """
        self.send_heartbeat()
        while self._should_run:
            try:
                self.log.debug('Checking for RPCs on %s. Waiting: %s',
                               self.rpcqueue.name, self.jobs.qsize())
                raw = self.rpcqueue.pop(timeout=1, blocking=True)
                if not raw:
                    continue

                # RPCs are in assemblyline.al.common.Message format.
                msg = None
                error = None
                try:
                    msg = AgentRequest.parse(raw)
                except Exception as e:
                    self.log.exception('While processing rpc: %s', raw)
                    error = str(e)

                # TODO should we just block instead of using job queue ?
                if msg:
                    self.jobs.push(msg)
                else:
                    reply_to_rpc(raw, response_body=error, succeeded=False)
            except KeyboardInterrupt:
                self._should_run = False
                self.log.error('Thread got CTL-C in consumer thread.')
                return
            except Exception:
                self.log.exception('Unhandled Exception in consumer thread.')
                # Back off briefly so a persistent failure doesn't spin.
                time.sleep(2)
                continue

    def _complete_chores_if_due(self):
        """Run periodic chores; currently just the heartbeat on its interval."""
        now = time.time()
        since_last_heartbeat = now - self.last_heartbeat
        if abs(since_last_heartbeat) >= config.system.update_interval:
            self.send_heartbeat()
            self.last_heartbeat = now

    # noinspection PyUnusedLocal
    def ping(self, _msg):
        """RPC liveness check."""
        self.log.info('PING')
        return 'PONG'

    def heartbeat(self):
        """Build the heartbeat payload: identity, resources, manager stats."""
        # NOTE(review): phymem_usage()/NUM_CPUS are legacy psutil APIs —
        # confirm the pinned psutil version still provides them.
        heartbeat = {
            'mac': self.mac,
            'time': isotime.now_as_iso(),
            'registration': self.registration,
            'resources': {
                'cpu_usage.percent': psutil.cpu_percent(),
                'mem_usage.percent': psutil.phymem_usage().percent,
                'disk_usage.percent': psutil.disk_usage('/').percent,
                'disk_usage.free': psutil.disk_usage('/').free
            }
        }

        profile = self.registration.get('profile', None)
        if profile:
            heartbeat['profile'] = profile

        heartbeat['profile_definition'] = self.host_profile

        vm_host_mac = self.registration.get('vm_host_mac', None)
        if vm_host_mac:
            heartbeat['vm_host_mac'] = vm_host_mac

        if self.vm_manager:
            heartbeat['vmm'] = self.vm_manager.get_stats()
        else:
            heartbeat['vmm'] = None

        if self.service_manager:
            heartbeat['services'] = {
                'status': 'up',
                'details': self.service_manager.get_stats()
            }
        else:
            heartbeat['services'] = None

        if self.flex_manager:
            # Flex stats are merged directly into the top-level payload.
            heartbeat.update(self.flex_manager.heartbeat())

        return heartbeat

    def send_heartbeat(self):
        """Publish our heartbeat to the 'status' comms channel."""
        self.log.debug(r'heartbeat.')
        heartbeat = self.heartbeat()
        msg = Message(to='*',
                      mtype=MT_SVCHEARTBEAT,
                      sender=self.mac,
                      body=heartbeat)
        CommsQueue('status').publish(msg.as_dict())

    @staticmethod
    def shutdown(msg):
        # RPC: raising here unwinds run() via serve_forever's handler.
        raise RemoteShutdownInterrupt(str(msg))

    # noinspection PyBroadException
    def start_components(self):
        """Start the managers dictated by our registration/profile.

        Flex profiles hand control to FlexManager and return early;
        otherwise service and VM managers are started as provisioned.
        Raises ProvisioningError on missing/disabled registration.
        """
        if not self.registration:
            raise ProvisioningError('Host registration not found.')

        if not self.registration.get('enabled', None):
            raise ProvisioningError('Host explicitly disabled.')

        profile_name = self.registration.get('profile', None)
        if not profile_name:
            raise ProvisioningError('Host has no assigned profile.')

        if 'profile_definition' not in self.registration:
            self.host_profile = self.store.get_profile(profile_name)
            if not self.host_profile:
                raise ProvisioningError(
                    'Host profile definition not found for %s.', profile_name)
            self.log.info('Our profile: %s', pprint.pformat(self.host_profile))

            # Prior to startup, remove any safe-start queues associated with our mac address.
            NamedQueue('safe-start-%s' % self.mac).delete()

            if profile_name.startswith('flex'):
                self.log.info(
                    'We have been provisioned as a flex node. Starting FlexManager.'
                )
                self.flex_manager = FlexManager()
                self.flex_manager.start()
                return
        else:
            self.host_profile = self.registration.get('profile_definition', {})

        services_config = self.host_profile.get('services', None)
        if services_config:
            config_overrides = self.registration.get('config_overrides', {})
            self.service_manager = ServiceManager(services_config,
                                                  config_overrides)
            self.service_manager.start()
        else:
            self.log.info('No services provisioned for this host.')

        vm_config = self.host_profile.get('virtual_machines', {})
        if vm_config:
            from assemblyline.al.common.vm import VmManager
            self.vm_manager = VmManager(vm_config)
            self.vm_manager.start()
        else:
            self.log.info('No virtual machines provisioned for this host.')
            try:
                from assemblyline.al.common.vm import VmManager
                # Attempt to launch VmManager to cleanup any old VMs that may be left over
                # from a previously configured profile on this node.
                self.vm_manager = VmManager()
                self.vm_manager.start()
                self.vm_manager.shutdown()
            except:
                pass

    def _stop_services(self):
        # Idempotent: no-op when the service manager is not running.
        if self.service_manager:
            self.log.info('Stopping ServiceManager')
            self.service_manager.shutdown()
            self.service_manager = None

    def _start_services(self):
        """Start the service manager from our profile, if not already up."""
        if self.service_manager:
            # already running
            return

        services_config = self.host_profile.get('services', None)
        if services_config:
            config_overrides = self.registration.get('config_overrides', {})
            self.service_manager = ServiceManager(services_config,
                                                  config_overrides)
            self.service_manager.start()
        else:
            self.log.info('No services provisioned for this host.')

    def stop_components(self):
        """Shut down whichever managers are running, clearing each handle."""
        if self.service_manager:
            self._stop_services()

        if self.vm_manager:
            self.log.info('Stopping VmManager.')
            self.vm_manager.shutdown()
            self.vm_manager = None

        if self.flex_manager:
            self.log.info('Stopping Flex Manager.')
            self.flex_manager.shutdown()
            self.flex_manager = None

    def run(self):
        """Main loop: start components and the RPC consumer, execute jobs.

        Each job is dispatched through _handle_request and answered with
        reply_to_rpc; RemoteShutdownInterrupt is acknowledged then re-raised
        so serve_forever can stop the agent.
        """
        # Clean up any leftover resources from workers
        worker_cleanup(self.mac, self.log)
        # Start up the core components (service and vmm managers)
        # and then kick of the rpc receiver.
        self._init_registration()
        self._init_queues()
        self.start_components()
        self.executor_thread = threading.Thread(
            target=self._rpc_executor_thread_main, name='agent_rpc_consumer')
        self.executor_thread.start()

        while self._should_run:
            self._complete_chores_if_due()
            job = self.jobs.pop(timeout=0.5)
            if not job:
                continue

            succeeded = True
            try:
                result = self._handle_request(job)
            except RemoteShutdownInterrupt:
                # Acknowledge the RPC before unwinding to serve_forever.
                reply_to_rpc(job,
                             response_body='Host Agent Shutting down.',
                             succeeded=True)
                raise
            except Exception as e:  # pylint:disable=W0703
                succeeded = False
                result = 'Error while completing job: %s' % str(e)
                self.log.exception('while completing job')

            reply_to_rpc(job, response_body=result, succeeded=succeeded)

        self.log.info('_should_run is false. exiting.')
        return

    def stop(self):
        """Stop the main loop, shut down components and close the datastore."""
        self.log.info('Stopping: MAC[%s] STORE[%s]' % (self.mac, self.store))
        self._should_run = False
        self.stop_components()
        if self.consumer_thread:
            self.consumer_thread.join(5)

        if self.store:
            self.store.close()

        worker_cleanup(self.mac, self.log)

    # noinspection PyUnusedLocal
    def _stop_signal_handler(self, signal_num, interrupted_frame):
        # SIGINT handler installed by serve_forever.
        self.log.info("Shutting down due to signal.")
        self.stop()

    def serve_forever(self):
        """Run the agent until interrupted, always stopping cleanly."""
        try:
            # Listen for our shutdown signal
            signal.signal(signal.SIGINT, self._stop_signal_handler)
            # Inject a message onto the agents queue.
            self.run()
        except KeyboardInterrupt:
            self.log.info('Shutting down due to KeyboardInterrupt.')
            self.stop()
        except RemoteShutdownInterrupt as ri:
            msg = 'Shutting down due to remote command: %s' % ri
            self.log.info(msg)
            self.stop()
        except Exception as ex:
            msg = 'Shutting down due to unhandled exception: %s' % get_stacktrace_info(
                ex)
            self.log.error(msg)
            self.stop()
示例#11
0
class FlexManager(object):
    """Transient worker provisioner.

    Periodically finds the busiest ("bottleneck") service queues and spins
    up extra service/VM workers on this node to drain them, tearing the
    provisioned profile down again once the queues shrink.
    """

    # Regardless of CPU/RAM requirements we limit
    # the number of worker instances to this.
    MAX_WORKERS = 50

    # Maximum number of times to requery the datastore on error
    DATASTORE_RETRY_LIMIT = 8

    # Maximum and minimum ticks spent on the same main service
    MAX_TICKS = 144
    MIN_TICKS = 12

    # Tick length in seconds
    SECONDS_PER_TICKS = 5

    # Start/Stop Flexing threshold (queue sizes). START_FLEX scales with
    # the dispatcher's max-inflight setting but never drops below 100.
    START_FLEX = max(100, config.core.dispatcher.max.inflight * 5 / 100)
    STOP_FLEX = 25

    # noinspection PyGlobalUndefined,PyUnresolvedReferences
    def __init__(self):
        # Delay these imports so most nodes don't import them.
        global Scheduler
        from apscheduler.scheduler import Scheduler

        # Queue sizes captured when the current flex job was started.
        self.bottleneck_queue_sizes = {}
        # CPU core count and RAM size; populated in start().
        self.cores = None
        self.datastore = forge.get_datastore()
        # Provisioning profile currently running (built by
        # _create_profile_for_cur_bottleneck).
        self.flex_profile = None
        self.flex_scheduler = None
        self.log = logging.getLogger('assemblyline.flex')
        self.mac = net.get_mac_for_ip(net.get_hostip())
        # Name of the busiest flexed service ('' when idle).
        self.main_bottleneck = ''
        self.needs_cleanup = True
        # Queue sizes from the previous scan; used to debounce short spikes.
        self.previous_queue_sizes = {}
        self.safe_start_dict = {}
        # Spawned workers report safe startup on this queue
        # (drained in _wait_for_safe_start).
        self.safeq = NamedQueue('safe-start-%s' % self.mac)
        self.service_manager = None
        self.ram_mb = None
        self.tick_count = 0
        self.vm_manager = None

    # noinspection PyUnresolvedReferences
    def start(self):
        """Sample machine resources, schedule the periodic flex check and
        kick off the first bottleneck scan."""
        # Machine-wide resource discovery via psutil.
        self.cores = psutil.NUM_CPUS
        self.ram_mb = int(psutil.TOTAL_PHYMEM / (1024 * 1024))
        self.flex_scheduler = Scheduler()

        # Sanity bounds on the machine and on configured tick limits.
        assert (self.cores > 0)
        assert (self.ram_mb > 256)
        assert ((self.MAX_TICKS > 6) and (self.MAX_TICKS < 10000))

        self.log.info('Blacklisted for Flex: %s',
                      " ".join(config.services.flex_blacklist))

        # Re-evaluate the flexing status once per tick.
        self.flex_scheduler.add_interval_job(self._check_flexing_status,
                                             kwargs={},
                                             seconds=self.SECONDS_PER_TICKS)
        self._find_new_bottlenecks()

        self.flex_scheduler.start()

    def heartbeat(self):
        """Build the heartbeat fragment for flexed VMs and services.

        Returns a dict with an optional 'vmm' key (only when the VM manager
        reports stats) and a 'services' key that is either
        {'status': 'up', 'details': ...} or None.
        """
        beat = {}

        vm_stats = self.vm_manager.get_stats() if self.vm_manager else None
        if vm_stats:
            beat['vmm'] = vm_stats

        svc_stats = (self.service_manager.get_stats()
                     if self.service_manager else None)
        beat['services'] = (
            {'status': 'up', 'details': svc_stats} if svc_stats else None)

        return beat

    def shutdown(self):
        """Tear down the scheduler, any flexed managers and the datastore."""
        self.log.info('shutting down flex manager.')
        self.flex_scheduler.shutdown()

        manager = self.service_manager
        if manager:
            manager.shutdown()
            self.service_manager = None

        manager = self.vm_manager
        if manager:
            manager.shutdown()
            self.vm_manager = None

        self.datastore.close()

    def _log_queue_progress(self, current_lengths):
        # One line per tracked service: queue size at flex start --> now.
        for srv, queue_size in self.bottleneck_queue_sizes.iteritems():
            self.log.info("    %s: %d --> %d" %
                          (srv, queue_size, current_lengths[srv]))

    def _check_flexing_status(self):
        """Scheduler tick (every SECONDS_PER_TICKS seconds).

        Respawns the flex job after MAX_TICKS, rescans for bottlenecks when
        the main bottleneck has drained below STOP_FLEX or when nothing is
        currently flexed; otherwise just logs queue progress.
        """
        self.tick_count += 1

        if self.tick_count > self.MAX_TICKS:
            self.log.info(
                "flexnode has been running for max period. respawning.")
            self._find_new_bottlenecks()
            return

        if self.main_bottleneck:
            # Current queue lengths for every service we are flexing on.
            service_queue_lengths = {
                x: get_service_queue_length(x)
                for x in self.bottleneck_queue_sizes.keys()
            }
            if service_queue_lengths[self.main_bottleneck] < self.STOP_FLEX:
                self.log.info(
                    "flexnode main bottleneck (%s) has shrunk under minimum threshold, "
                    "looking for new bottlenecks..." % self.main_bottleneck)
                self._log_queue_progress(service_queue_lengths)
                self._find_new_bottlenecks()
                return

            self.log.info("flexnode bottleneck progress:")
            self._log_queue_progress(service_queue_lengths)

        else:
            self.log.info(
                "flexnode has no services up, checking for new job...")
            self._find_new_bottlenecks()

    def _wait_for_safe_start(self):
        """Block until services flagged safe_start finish initializing.

        Each spawned worker of a safe_start service is expected to push a
        token onto this node's safe-start queue once initialized; we drain
        one token per worker, waiting at most 2 minutes per service, before
        allowing the current flex profile to be torn down.
        """
        # If safe_start is enabled, we wait until the service has initialized (or timed out) before
        # respawning a new blitz.
        if not self.bottleneck_queue_sizes or not self.flex_profile or not self.safe_start_dict:
            # Nothing to wait on..
            self.log.info("Safe-start has nothing to wait for.")
            return

        # Only services whose safe-start flag is truthy need draining.
        for srv in [k for k, v in self.safe_start_dict.iteritems() if v]:
            self.log.info("Waiting on safe-start for service: %s", srv)

            # Before shutting down a service, make sure all of our spawned instances have come up
            worker_count = int(
                self.flex_profile.get("services", {}).get(srv, {}).get(
                    'workers', None) or 0)
            # Maximum 2 minute wait
            max_wait = 120
            while worker_count > 0 and max_wait > 0:
                start = time.time()
                if self.safeq.pop(timeout=max_wait) is not None:
                    worker_count -= 1
                end = time.time()
                max_wait -= int(end - start)

            if worker_count > 0:
                # BUG FIX: the service name argument was missing, so the
                # literal '%s' appeared in the log output.
                self.log.warning(
                    "Service %s is being stopped, but safe_start timed out!",
                    srv)
            else:
                # Only claim success when every worker checked in.
                self.log.info(
                    "Safe-start completed successfully for service: %s", srv)

    # precondition: holding scheduler_thread_lock
    def _find_new_bottlenecks(self):
        """Tear down the previous flex job (if any), then start a new one
        sized against the currently busiest service queues."""
        self.log.info("Finding new bottlenecks...")

        if self.needs_cleanup:
            self.tick_count = 0
            # Let safe-start services finish initializing before teardown.
            self._wait_for_safe_start()

            if self.service_manager:
                self.service_manager.shutdown()
                self.service_manager = None
            if self.vm_manager:
                self.vm_manager.shutdown()
                self.vm_manager = None

            # Cleanup queue should also be handled here
            worker_cleanup(self.mac, self.log)

            # Delete any lingering safe-start queue entries (timeout/restart)
            self.safeq.delete()
            self.needs_cleanup = False

        flexable_services, self.flex_profile, self.safe_start_dict = \
            self._create_profile_for_cur_bottleneck()

        if not flexable_services:
            # Nothing busy enough to flex on; go idle.
            self.main_bottleneck = ''
            self.bottleneck_queue_sizes = {}
            return

        self.bottleneck_queue_sizes = dict(flexable_services)
        self.main_bottleneck = flexable_services[0][0]
        self.log.info(
            "Starting Transient ServiceManager with profile: %s. TTL:%d secs.",
            str(self.flex_profile),
            self.MAX_TICKS * self.SECONDS_PER_TICKS)

        self.service_manager = ServiceManager(
            self.flex_profile.get('services'))
        self.service_manager.start()

        if config.workers.install_kvm:
            from assemblyline.al.common.vm import VmManager
            self.vm_manager = VmManager(
                self.flex_profile.get('virtual_machines'))
            self.vm_manager.sysprep()
            self.vm_manager.start()
        self.needs_cleanup = True

    def _determine_busiest_services(self):
        """Return (service, queue_length) pairs for all non-blacklisted
        services whose queue is at least START_FLEX, busiest first."""
        queue_lengths = get_service_queue_lengths()

        # pop() with a default replaces the redundant membership check
        # wrapped around a try/except KeyError.
        for blacklisted_service in config.services.flex_blacklist:
            queue_lengths.pop(blacklisted_service, None)

        flexable_services = sorted(
            [(k, v)
             for k, v in queue_lengths.iteritems() if v >= self.START_FLEX],
            key=lambda x: x[1],
            reverse=True)

        return flexable_services

    # noinspection PyBroadException
    def _load_profile_data(self):
        """Fetch (profile_map, service_map, vm_list) from the datastore.

        Retries up to DATASTORE_RETRY_LIMIT times on transient errors and
        raises ProvisioningError when every attempt fails.
        """
        # Occasionally we encounter missing profile data, which usually works when a retry occurs.
        retries = self.DATASTORE_RETRY_LIMIT
        while retries > 0:
            try:
                profile_map = self.datastore.get_all_profiles()
                service_map = {
                    x['name']: x
                    for x in self.datastore.list_services()
                }
                vm_list = self.datastore.list_virtualmachines()
                return profile_map, service_map, vm_list
            except Exception:
                # BUG FIX: was a bare 'except:', which also swallowed
                # SystemExit/KeyboardInterrupt during shutdown.
                self.log.exception("Error retrieving profile data:")
                time.sleep(.5)
                retries -= 1
        raise ProvisioningError("Unable to retrieve profile data.")

    def _load_allocation_for_service(self, service_to_load):
        """Build an allocation descriptor for ``service_to_load``.

        Returns a VmAllocation when a virtual machine named after the
        service exists, otherwise a ServiceAllocation from the service map,
        or None when the service is unknown.
        """
        vm_to_alloc = None
        profile_map, service_map, vm_list = self._load_profile_data()
        for vm in vm_list:
            # Each VM record is reduced to a single-service worker map:
            # {vm name: worker count}.
            vm['srv_list'] = {vm['name']: vm['num_workers']}

            # NOTE(review): since srv_list's only key is vm['name'], the
            # two conditions below are redundant with each other.
            if service_to_load in vm['srv_list'] and vm[
                    'name'] == service_to_load:
                vm_to_alloc = copy.copy(vm)
                break
            else:
                self.log.info(
                    'This is not the vm we are looking for: %s with %s' %
                    (vm['name'], vm['srv_list']))

        if vm_to_alloc:
            # Total CPU cost of the VM: cores per service times its worker
            # count, summed over the VM's service list.
            cpu_usage = 0
            for service in vm_to_alloc['srv_list']:
                cpu_usage += service_map.get(service, {}).get(
                    'cpu_cores', 1) * vm_to_alloc['srv_list'][service]

            return VmAllocation(service_to_load, cpu_usage, vm_to_alloc['ram'],
                                vm_to_alloc['srv_list'])
        else:
            try:
                # Defaults: 1 core / 1024MB ram when the service record
                # does not specify them.
                return ServiceAllocation(
                    service_to_load,
                    service_map[service_to_load].get('cpu_cores', 1),
                    service_map[service_to_load].get('ram_mb',
                                                     1024), service_to_load)
            except KeyError:
                return None

    def _create_profile_for_cur_bottleneck(self):
        """Build a provisioning profile sized to the current bottlenecks.

        Returns (flexable_services, profile, safe_start_dict):
        flexable_services is the selected (service, queue size) list sorted
        busiest first, profile is a services/VM provisioning map and
        safe_start_dict maps each provisioned service to its
        SERVICE_SAFE_START flag.
        """
        profile = {
            'services': {},
            'system_overrides': {},
            'virtual_machines': {}
        }
        safe_start_dict = {}
        resources = {}
        ratio = {}
        allocation = {}

        temp_flexable_services = self._determine_busiest_services()

        if not temp_flexable_services:
            self.log.info(
                "There are no services that meet the minimum requirement for flexing..."
            )
            return temp_flexable_services, profile, safe_start_dict

        # Only flex on services that were already busy on the previous
        # pass; this debounces short-lived queue spikes.
        flexable_services = [(k, v) for k, v in temp_flexable_services
                             if k in self.previous_queue_sizes]

        if not flexable_services:
            self.previous_queue_sizes = {
                k: v
                for k, v in temp_flexable_services
            }
            return flexable_services, profile, safe_start_dict
        else:
            self.previous_queue_sizes = {}

        self.log.info(
            "The following services are candidates for current flex node:")
        for srv, queue_size in flexable_services:
            self.log.info("    %s: %d" % (srv, queue_size))
            res = self._load_allocation_for_service(srv)
            if not res:
                # BUG FIX: the message was passed as (fmt, arg) to the
                # exception constructor and was never interpolated.
                raise ProvisioningError(
                    "No automatic provisioning profile for %s" % srv)

            resources[srv] = res
            # Allocation weight relative to the busiest service.
            ratio[srv] = queue_size * 1.0 / flexable_services[0][1]
            allocation[srv] = 0

        # Allocate services: weighted round-robin until the CPU/RAM budget
        # is exhausted or MAX_WORKERS is reached per service.
        available_ram = self.ram_mb - 256  # keep some ram for the host
        available_cpu = self.cores - 0.1   # keep a sliver of cpu headroom
        failed_list = []
        main_service, _ = flexable_services[0]
        main_resources = resources[main_service]
        something_changed = True

        while something_changed:
            something_changed = False
            # Can I allocated the main service ?
            if main_resources.cores <= available_cpu and main_resources.ram_mb <= available_ram and \
                    allocation[main_service] < self.MAX_WORKERS:
                available_cpu -= main_resources.cores
                available_ram -= main_resources.ram_mb
                allocation[main_service] += 1
                something_changed = True
            else:
                if main_service not in failed_list:
                    failed_list.append(main_service)
                    something_changed = True

            for srv, queue_size in flexable_services[1:]:
                # is it time to allocate this service
                if (int(ratio[srv] * allocation[main_service]) != allocation[srv] or main_service in failed_list) and \
                        srv not in failed_list:
                    res = resources[srv]
                    if res.cores <= available_cpu and res.ram_mb <= available_ram and \
                            allocation[srv] < self.MAX_WORKERS:
                        available_cpu -= res.cores
                        available_ram -= res.ram_mb
                        allocation[srv] += 1
                        something_changed = True
                    else:
                        failed_list.append(srv)
                        something_changed = True

        self.log.info(
            "Machine has %s cores and %sMB ram. The flex manager will provision %s cores and %sMB ram."
            % (self.cores, self.ram_mb, self.cores - available_cpu,
               self.ram_mb - available_ram))

        for srv, _ in flexable_services:
            for x in range(0, allocation[srv]):
                resources[srv].update_profile_for_allocation(profile)

                # Get the name of the optional startup guard queue (used to prevent restart during initialization)
                safe_start_dict[srv] = service_by_name(srv).SERVICE_SAFE_START

        return flexable_services, profile, safe_start_dict