Example #1
class ProvisionerQueryService(ServiceProcess):
    """Provisioner querying service
    """

    declare = ServiceProcess.service_declare(name='provisioner_query',
                                             version='0.1.0',
                                             dependencies=[])

    def slc_init(self):
        interval = float(
            self.spawn_args.get("interval_seconds", DEFAULT_QUERY_INTERVAL))

        self.client = ProvisionerClient(self)

        log.debug('Starting provisioner query loop - %s second interval',
                  interval)
        self.loop = LoopingCall(self.query)
        self.loop.start(interval)

    def slc_terminate(self):
        if self.loop:
            self.loop.stop()

    @defer.inlineCallbacks
    def query(self):
        try:
            yield self._do_query()
        except Exception, e:
            log.error("Error sending provisioner query request: %s",
                      e,
                      exc_info=True)
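A minimal sketch of the spawn arguments this service reads; only "interval_seconds" is consulted in slc_init, the value below is an illustrative assumption, and DEFAULT_QUERY_INTERVAL is used when the key is absent:

spawn_args = {
    "interval_seconds": 10.0,  # assumed value; falls back to DEFAULT_QUERY_INTERVAL
}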
Example #2
File: dtrs.py  Project: timf/epu
class DeployableTypeRegistryService(ServiceProcess):
    """Deployable Type Registry service interface
    """
    declare = ServiceProcess.service_declare(name='dtrs', version='0.1.0', dependencies=[])

    def slc_init(self):
        registry = self.spawn_args.get('registry')
        registry_dir = self.spawn_args.get('registry_dir')

        if registry is None and registry_dir is None:
            raise ValueError("DTRS needs either 'registry' or 'registry_dir' in spawnargs")

        if registry is not None:
            self.registry = registry

        else:
            log.info('DTRS configured to use directory %s' % registry_dir)
            self.registry = DeployableTypeRegistry(registry_dir)
            self.registry.load()

    def op_lookup(self, content, headers, msg):
        """Resolve a deployable type
        """
        # hide the password so it doesn't get logged
        hide_password = deepcopy(content)
        if hide_password.get('vars') and 'cassandra_password' in hide_password['vars']:
            hide_password['vars']['cassandra_password'] = '******' 
        if hide_password.get('vars') and 'broker_password' in hide_password['vars']:
            hide_password['vars']['broker_password'] = '******'

        log.debug('Received DTRS lookup. content: %s', hide_password)
        # just using a file for this right now, to keep it simple
        dt_id = content['deployable_type']
        nodes = content.get('nodes')
        vars = content.get('vars')

        dt = self.registry.get(dt_id)
        if not dt:
            return self._dtrs_error(msg, 'Unknown deployable type name: '+ dt_id)

        doc_tpl = dt['document']
        defaults = dt.get('vars')
        all_vars = {}
        if defaults:
            all_vars.update(defaults)
        if vars:
            try:
                process_vars(vars, dt_id)
            except DeployableTypeValidationError, e:
                return self._dtrs_error(msg, str(e))
            
            all_vars.update(vars)

        template = string.Template(doc_tpl)
        try:
            document = template.substitute(all_vars)
        except KeyError,e:
            return self._dtrs_error(msg,
                    'DT doc has variable not present in request or defaults: %s'
                    % str(e))
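The lookup request carries 'deployable_type' plus optional 'nodes' and 'vars'; request vars are merged over the registry defaults and substituted into the document template. A small self-contained sketch of that substitution step (the template text and variable names are illustrative, not taken from a real registry entry):

import string

doc_tpl = "image: $image_id\ncassandra_password: $cassandra_password"  # stand-in for dt['document']
defaults = {"image_id": "ami-00000000"}           # stand-in for dt.get('vars')
request_vars = {"cassandra_password": "secret"}   # stand-in for content.get('vars')

all_vars = dict(defaults)
all_vars.update(request_vars)
document = string.Template(doc_tpl).substitute(all_vars)  # raises KeyError if a template var is missing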
Example #3
class ProvisionerService(ServiceProcess):
    """Provisioner service interface
    """

    # Declaration of service
    declare = ServiceProcess.service_declare(name='provisioner',
                                             version='0.1.0',
                                             dependencies=[])

    @defer.inlineCallbacks
    def slc_init(self):
        cei_events.event("provisioner", "init_begin")

        try:
            store = self.spawn_args['store']
            site_drivers = self.spawn_args['site_drivers']
            context_client = self.spawn_args['context_client']
        except KeyError, e:
            raise KeyError("Missing provisioner spawn_arg: " + str(e))

        self.store = store

        notifier = self.spawn_args.get('notifier')
        self.notifier = notifier or ProvisionerNotifier(self)
        self.dtrs = DeployableTypeRegistryClient(self)

        self.core = ProvisionerCore(self.store, self.notifier, self.dtrs,
                                    site_drivers, context_client)
        yield self.core.recover()
        cei_events.event("provisioner", "init_end")

        # operator can disable new launches
        self.enabled = True
        self.terminate_all_deferred = None
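Per slc_init above, three spawn arguments are required and one is optional; a sketch of the expected shape (the values are placeholders, not working objects):

spawn_args = {
    "store": None,            # persistent provisioner store (required)
    "site_drivers": {},       # mapping of site name -> IaaS driver (required)
    "context_client": None,   # context broker client (required)
    # "notifier" is optional; a ProvisionerNotifier(self) is created when omitted
}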
Example #4
class EPUControllerListService(ServiceProcess):
    """Provides list of EPU Controller service names
    """

    declare = ServiceProcess.service_declare(name='epu_controller_list',
                                             version='0.1.0',
                                             dependencies=[])

    def slc_init(self):
        # Allow direct list for tests, etc.
        self.controller_list = self.spawn_args.get('controller_list_direct',
                                                   None)

        # It's valid to have a zero-length list, so only check 'is None'
        if self.controller_list is None:
            controller_list_path = self.spawn_args.get('controller_list_path',
                                                       None)
            if not controller_list_path:
                raise Exception(
                    "There is no 'controller_list_path' configuration")
            self.controller_list = self._intake_file(controller_list_path)

        if self.controller_list:
            log.debug("Initialized with controller list:\n%s\n" %
                      self.controller_list)
        else:
            log.debug("Initialized with empty controller list")

    def _intake_file(self, controller_list_path):
        if not os.path.exists(controller_list_path):
            raise Exception(
                "The 'controller_list_path' file does not exist: %s" %
                controller_list_path)
        controller_list = []
        f = open(controller_list_path)
        for line in f.readlines():
            name = line.strip()
            if not name:
                continue
            if name.startswith("#"):
                continue
            controller_list.append(name)
        return controller_list

    def plc_terminate(self):
        log.debug('EPU Controller List service: shutdown triggered')

    @defer.inlineCallbacks
    def op_list(self, content, headers, msg):
        """Return a list of zero to N controller names
        """
        yield self.reply_ok(msg, self.controller_list)
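The controller list can be supplied directly or loaded from a file with one controller name per line; blank lines and lines starting with "#" are skipped by _intake_file. Both configurations below are illustrative sketches:

# direct list, handy for tests (names are assumptions)
spawn_args = {"controller_list_direct": ["epu_controller_one", "epu_controller_two"]}

# or a file path; the path here is an assumption
spawn_args = {"controller_list_path": "/opt/epu/controller_list.txt"}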
Example #5
class EPUWorkProducer(ServiceProcess):
    """EPU Work Producer.
    """
    declare = ServiceProcess.service_declare(name='epu_work_producer',
                                             version='0.1.0',
                                             dependencies=[])

    def slc_init(self):
        self.queue_name_work = self.get_scoped_name(
            "system", self.spawn_args["queue_name_work"])
        self.web_resource = Sidechannel()
        listen_port = self.spawn_args["listen_port"]
        reactor.listenTCP(int(listen_port), server.Site(self.web_resource))
        self.work_produce_loop = LoopingCall(self.work_seek)
        self.work_produce_loop.start(1, now=False)

    @defer.inlineCallbacks
    def work_seek(self):
        try:
            while True:
                job = self.web_resource.queue.get(block=False)
                if job is None:
                    raise Queue.Empty()

                yield self.send(
                    self.queue_name_work, 'work', {
                        "work_amount": job.length,
                        "batchid": job.batchid,
                        "jobid": job.jobid
                    })

                extradict = {
                    "batchid": job.batchid,
                    "jobid": job.jobid,
                    "work_amount": job.length
                }
                cei_events.event("workproducer", "job_sent", extra=extradict)

                # This is an unfortunate hack to work around a memory leak in ion.
                # Some caches are only cleared after a received message is handled.
                # Since this process sends messages "spontaneously" -- triggered by a
                # LoopingCall -- we must manually clear the cache.
                self.message_client.workbench.manage_workbench_cache(
                    'Default Context')

        except Queue.Empty:
            return
        except Exception, e:
            # unhandled exceptions will terminate the LoopingCall
            log.error("Error adding work: %s", e, exc_info=True)
Example #6
class EPUWorkerService(ServiceProcess):
    """EPU Worker service.
    """
    declare = ServiceProcess.service_declare(name='epu_worker',
                                             version='0.1.0',
                                             dependencies=[])

    def slc_init(self):
        queue_name = self.spawn_args["queue_name_work"]
        self.workReceiver = WorkerReceiver(name=queue_name,
                                           label=__name__,
                                           scope=WorkerReceiver.SCOPE_SYSTEM,
                                           handler=self.receive)
        self.queue_name_work = self.workReceiver.xname
        extradict = {"queue_name_work": self.queue_name_work}
        cei_events.event("worker", "init_begin", extra=extradict)
        self.laterinitialized = False
        reactor.callLater(0, self.later_init)

    @defer.inlineCallbacks
    def later_init(self):
        spawnId = yield self.workReceiver.attach()
        log.debug("spawnId: %s" % spawnId)
        self.laterinitialized = True
        extradict = {"queue_name_work": self.queue_name_work}
        cei_events.event("worker", "init_end", extra=extradict)

    @defer.inlineCallbacks
    def op_work(self, content, headers, msg):
        if not self.laterinitialized:
            log.error("message got here without the later-init")
        sleepsecs = int(content['work_amount'])
        extradict = {
            "batchid": content['batchid'],
            "jobid": content['jobid'],
            "work_amount": sleepsecs
        }
        cei_events.event("worker", "job_begin", extra=extradict)
        log.info("WORK: sleeping for %d seconds ---" % sleepsecs)
        yield pu.asleep(sleepsecs)
        yield self.reply(msg, 'result', {'result': 'work_complete'}, {})
        cei_events.event("worker", "job_end", extra=extradict)
Example #7
class EPUControllerClientSample(ServiceProcess):

    declare = ServiceProcess.service_declare(name='epu_reconfigure_sample',
                                             version='0.1.0',
                                             dependencies=[])

    def slc_init(self, proc=None, **kwargs):
        self.client = EPUControllerClient()
        reactor.callLater(5, self.send_reconfigure)

    @defer.inlineCallbacks
    def send_reconfigure(self):
        newconf = {}
        newconf["preserve_n"] = "%s" % self.spawn_args["preserve_n"]
        newconf["unique_instances"] = {
            'b2db408e': {
                'some_unique_name': 'some_unique_value123'
            },
            '3633541e': {
                'some_unique_name': 'some_other_unique_value456'
            }
        }
        self.client.reconfigure(newconf)
Example #8
class QueueStatService(ServiceProcess):
    """Queue stat subscription service

    Only works on a RabbitMQ server running *on localhost*
    """

    declare = ServiceProcess.service_declare(name='queuestat',
                                             version='0.1.0',
                                             dependencies=[])

    def slc_init(self):
        erlang_cookie = self.spawn_args.get('erlang_cookie', None)
        if not erlang_cookie:
            cookie_path = self.spawn_args.get('erlang_cookie_path', None)
            erlang_cookie = read_cookie(cookie_path)

        rabbitmq_node = self.spawn_args.get('rabbitmq_node',
                                            'rabbit@localhost')
        node_name = twotp.node.buildNodeName(rabbitmq_node)
        twotp_process = twotp.node.Process(node_name, erlang_cookie)
        self.rabbitmq = RabbitMQControlService(twotp_process, node_name)

        self.interval = float(
            self.spawn_args.get('interval_seconds', DEFAULT_INTERVAL_SECONDS))
        self.sensor_id = self.spawn_args.get('sensor_id', DEFAULT_SENSOR_ID)
        self.loop = LoopingCall(self._wrapped_do_poll)

        # a dict of sets of (subscriber,op) tuples
        self.watched_queues = {}

    def plc_terminate(self):
        log.debug('Shutdown triggered')
        self.rabbitmq.shutdown()

    def op_watch_queue(self, content, headers, msg):
        """Start watching a queue for updates. If queue is already being
        watched by this subscriber, this operation does nothing.
        """
        queue_name = content.get('queue_name')
        subscriber_name = content.get('subscriber_name')
        subscriber_op = content.get('subscriber_op')

        if not (queue_name and subscriber_name and subscriber_op):
            log.warn("Got invalid watch request: %s" % content)
            return

        sub_tuple = (subscriber_name, subscriber_op)

        queue_subs = self.watched_queues.get(queue_name, None)
        if queue_subs is None:
            queue_subs = set()
            self.watched_queues[queue_name] = queue_subs
        queue_subs.add(sub_tuple)

        if not self.loop.running:
            log.debug('starting LoopingCall, to poll queues')
            self.loop.start(self.interval)

    def op_unwatch_queue(self, content, headers, msg):
        """Stop watching a queue. If queue is not being watched by subscriber,
        this operation does nothing.
        """
        queue_name = content.get('queue_name')
        subscriber_name = content.get('subscriber_name')
        subscriber_op = content.get('subscriber_op')

        if not (queue_name and subscriber_name and subscriber_op):
            log.warn("Got invalid unwatch request: %s" % content)
            return

        sub_tuple = (subscriber_name, subscriber_op)

        queue_subs = self.watched_queues.get(queue_name, None)
        if queue_subs:
            queue_subs.discard(sub_tuple)
            if not queue_subs:
                del self.watched_queues[queue_name]

        if not self.watched_queues and self.loop.running:
            log.debug('No queues are being watched, disabling LoopingCall')
            self.loop.stop()

    @defer.inlineCallbacks
    def _wrapped_do_poll(self):
        try:
            yield self._do_poll()
        except Exception, e:
            log.error("Error in RabbitMQ poll: %s", str(e), exc_info=True)
Example #9
class AppControllerService(ServiceProcess):
    """
    Defines an application controller service to perform load balancing.
    """

    declare = ServiceProcess.service_declare(name="app_controller",
                                             version="0.1.0",
                                             dependencies=["attributestore"])

    def __init__(self, *args, **kwargs):
        ServiceProcess.__init__(self, *args, **kwargs)

        self.routing = {
        }  # mapping of queues to a list of bindings (station ids/sensor ids)
        self.workers = {
        }  # mapping of known worker vms to info about those vms (cores / running instances)

        # get configs for current exchange setup from exchange space, queues as per what TopicWorkerReceiver (below) uses
        exchcnfg = self.container.exchange_manager.exchange_space.exchange
        msgcnfg = messaging.worker('temp')

        # for timing
        self._timer = time.time()

        # for reconfigure events
        self._reconfigure_timeout = None

        # provisioner vars are common vars for all worker instances
        self.prov_vars = {
            'sqlt_vars': {
                'inp_exchange': INP_EXCHANGE_NAME,
                'inp_exchange_type': exchcnfg.exchange_type,
                'inp_exchange_durable': str(exchcnfg.durable).lower(),
                'inp_exchange_autodelete': str(exchcnfg.auto_delete).lower(),
                'inp_queue_durable': msgcnfg['durable'],
                'inp_queue_autodelete': msgcnfg['auto_delete'],
                'det_topic': DETECTION_TOPIC,
                'det_exchange': OUT_EXCHANGE_NAME,
                'det_exchange_type': exchcnfg.exchange_type,
                'det_exchange_durable': str(exchcnfg.durable).lower(),
                'det_exchange_autodelete': str(exchcnfg.auto_delete).lower()
            }
        }

    @defer.inlineCallbacks
    def slc_init(self):
        # Service life cycle state.

        # consume the announcement queue
        self.announce_recv = TopicWorkerReceiver(name=ANNOUNCE_QUEUE,
                                                 scope='global',
                                                 process=self,
                                                 handler=self._recv_announce)

        # declares queue and starts listening on it
        yield self.announce_recv.attach()

        # get topic based routing to all sensor data (for anything missed on the announcement queue)
        #self.all_data_recv = TopicWorkerReceiver(name="ta_alldata",
        #                                         scope='global',
        #                                         binding_key = "ta.*.BHZ",
        #                                         process=self,
        #                                         handler=self._recv_data)

        #yield self.all_data_recv.attach()
        #yield self.all_data_recv.initialize()
        #self.counter = 0

        self.epu_controller_client = EPUControllerClient()

        self.attribute_store_client = AttributeStoreClient()
        yield self._load_sql_def()

    @defer.inlineCallbacks
    def _recv_announce(self, data, msg):
        """
        Received an instrument announcement. Set up a binding for it.
        """
        jsdata = json.loads(data)
        station_name = jsdata['content']

        log.info("Instrument Station Announce: " + station_name)

        found = self.has_station_binding(station_name)

        if found:
            log.error("Duplicate announcement")
        else:
            yield self.bind_station(station_name)

        yield msg.ack()

    #def _recv_data(self, data, msg):
    #    #log.info("<-- data packet" + msg.headers.__str__())
    #    log.info("data " + self.counter.__str__())
    #    self.counter += 1
    #    msg.ack()

    @defer.inlineCallbacks
    def bind_station(self, station_name, queue_name=None):
        """
        Binds a station to a queue. Typically you do not specify the queue name; this method
        will find a queue with room. If a queue name is given, no checking will be done - it
        will simply be added.
        """
        if queue_name == None:
            queue_name = "W%s" % (len(self.routing.keys()) + 1)

            # find a queue with enough room
            added = False
            for queues in self.routing.keys():
                qlen = len(self.routing[queues])
                if qlen < STATIONS_PER_QUEUE:
                    queue_name = queues
                    break

        binding_key = '%s' % station_name

        yield self._create_queue(queue_name, binding_key)

        if not self.routing.has_key(queue_name):
            self.routing[queue_name] = []
            self.request_sqlstream(queue_name)

        self.routing[queue_name].append(station_name)

        log.info("Created binding %s to queue %s" % (binding_key, queue_name))

    @defer.inlineCallbacks
    def _create_queue(self, queue_name, binding_key):
        """
        Creates a queue and/or binding to a queue (just the binding if the queue exists).
        TODO: replace this with proper method of doing so.
        """
        recv = TopicWorkerReceiver(name=queue_name,
                                   scope='global',
                                   binding_key=binding_key,
                                   process=self)
        yield recv.initialize()  # creates queue but does not listen

    def request_sqlstream(self, queue_name, op_unit_id=None):
        """
        Requests a SQLStream operational unit to be created, or an additional SQLStream on an existing operational unit.
        @param queue_name   The queue the SQL Stream unit should consume from.
        @param op_unit_id   The operational unit id that should be used to create a SQL Stream instance. If specified, will always create on that op unit. Otherwise, it will find available space on an existing VM or create a new VM.
        """

        # if this var is true, at the end of this method, instead of reconfiguring via
        # the decision engine, we will directly ask the agent on op_unit_id to spawn the
        # sqlstream engine. This will hopefully be taken out when we can reconfigure
        # workers on the fly.
        direct_request = False

        if op_unit_id != None and not self.workers.has_key(op_unit_id):
            log.error("request_sqlstream: op_unit (%s) requested but unknown" %
                      op_unit_id)

        if op_unit_id == None:
            # find an available op unit
            for (worker, info) in self.workers.items():
                availcores = info['metrics']['cores'] - (
                    len(info['sqlstreams']) * CORES_PER_SQLSTREAM)
                if availcores >= CORES_PER_SQLSTREAM:
                    log.info(
                        "request_sqlstream - asking existing operational unit (%s) to spawn new SQLStream"
                        % worker)
                    # Request spawn new sqlstream instance on this worker
                    # wait for rpc message to app controller that says sqlstream is up
                    op_unit_id = worker

                    direct_request = True

                    # record the fact we are using this worker now
                    # TODO : needs to be an integer to indicate number of starting up, or a
                    # unique key per each starter
                    #info['sqlstreams']['spawning'] = True
                    break

        if op_unit_id == None:
            op_unit_id = str(uuid.uuid4())[:8]
            log.info("request_sqlstream - requesting new operational unit %s" %
                     op_unit_id)

        # now we have an op_unit_id, update the config
        if not self.workers.has_key(op_unit_id):
            self.workers[op_unit_id] = {
                'metrics': {
                    'cores': 2
                },  # all workers should have at least two, will be updated when status is updated
                'state': '',
                'sqlstreams': {}
            }
            streamcount = 0
        else:
            streamcount = len(self.workers[op_unit_id]['sqlstreams'])

        ssid = str(streamcount + 1)

        stream_conf = {'sqlt_vars': {'inp_queue': queue_name}, 'ssid': ssid}

        self.workers[op_unit_id]['sqlstreams'][ssid] = {
            'conf': stream_conf,
            'state': ''
        }

        if direct_request == True:
            self._start_sqlstream(op_unit_id, stream_conf)
        else:
            self.request_reconfigure()  # schedule a reconfigure event!

    def request_reconfigure(self):
        """
        Rate limiter for actual request reconfigure call.
        Waits 4 seconds for any more reconfigure attempts, each of which delays the call by another 4 seconds.
        When the timeout finally fires, the real reconfigure is sent.
        """
        if self._reconfigure_timeout != None and self._reconfigure_timeout.active(
        ):
            log.info(
                "request_reconfigure: delay already active, resetting to 4 seconds"
            )
            self._reconfigure_timeout.reset(4)
        else:

            def callReconfigure():
                log.info(
                    "request_reconfigure: delay complete, actually performing reconfigure"
                )
                self._reconfigure_timeout = None
                self._request_reconfigure()

            log.info(
                "request_reconfigure: starting delay to 4 seconds to prevent flooding EPU controller"
            )
            self._reconfigure_timeout = reactor.callLater(4, callReconfigure)

    def _request_reconfigure(self):
        """
        Requests a reconfiguration from the Decision Engine. This takes care of provisioning
        workers.

        This method builds the JSON required to reconfigure/configure the decision engine.
        """

        # TODO: likely does not need to send prov vars every time as this is reconfigure

        provvars = self.prov_vars.copy()
        #provvars['sqldefs'] = provvars['sqldefs'].replace("$", "$$")    # escape template vars once so it doesn't get clobbered in provisioner replacement

        conf = {
            'preserve_n': len(self.workers),
            #PROVISIONER_VARS_KEY : self.prov_vars,
            'unique_instances': {}
        }

        for (wid, winfo) in self.workers.items():
            conf['unique_instances'][wid] = {'agent_args': {'sqlstreams': []}}
            conf['unique_instances'][wid]['agent_args'].update(self.prov_vars)
            ssdefs = conf['unique_instances'][wid]['agent_args']['sqlstreams']
            for (ssid, ssinfo) in winfo['sqlstreams'].items():
                ssdefs.append({
                    'ssid': ssinfo['conf']['ssid'],
                    'sqlt_vars': ssinfo['conf']['sqlt_vars']
                })

        if DEBUG_WRITE_PROV_JSON:
            f = open('/tmp/prov.json', 'w')
            json.dump(conf, f, indent=1)
            f.close()
            log.debug(
                "Wrote /tmp/prov.json due to DEBUG_WRITE_PROV_JSON being on in the config."
            )

            for (wid, winfo) in conf['unique_instances'].items():
                wdict = winfo.copy()
                wdict['agent_args']['opunit_id'] = wid

                f = open('/tmp/sa-' + wid + '.json', 'w')
                json.dump(wdict, f, indent=1)
                f.close()

                log.debug("Wrote /tmp/sa-%s.json." % wid)

            # merge and write individual worker configs while we're at it
            #for (wid, winfo) in self.workers.items():
            #    wdict = { 'agent_args': { 'opunit_id' : wid,
            #                              'sqlstreams': str(conf['unique_instances'][wid]['sqlstreams']),   # TODO: unstringify this
            #                              'sqlt_vars' : self.prov_vars['sqlt_vars'] } }

            #    f = open('/tmp/sa-' + wid + '.json', 'w')
            #    json.dump(wdict, f, indent=1)
            #    f.close()

        self.epu_controller_client.reconfigure(conf)

        # record the time we sent this
        self._timer = time.time()

    def has_station_binding(self, station_name):
        """
        Returns true if we know about this station.
        """
        for queues in self.routing.keys():
            found = station_name in self.routing[queues]
            if found:
                return True

        return False

    def op_opunit_status(self, content, headers, msg):
        """
        Handles an application agent reporting an operational unit's status.
        Details include its current state, metrics about the system, status of
        SQLstream instances.
        """
        self._update_opunit_status(content)
        self.reply_ok(msg, {'value': 'ok'}, {})

    def request_opunit_status(self, opunit_id):
        """
        Asks an AppAgent to report in its status.
        """
        proc_id = self.workers[opunit_id]['proc_id']
        d = self.rpc_send(proc_id, 'get_opunit_status', {})
        d.addCallback(lambda res: self._update_opunit_status(res[0]))

    def _update_opunit_status(self, status):
        """
        Internal method to handle updating an op unit's status.
        Status updates can either come from heartbeats initiated by the AppAgent, or
        on request from the AppController. This method handles both of those.
        """
        opunit_id = status['id']
        proc_id = status['proc_id']
        state = status['state']
        metrics = status['metrics']
        sqlstreams = status['sqlstreams']

        sstext = ""
        for ssid, sinfo in sqlstreams.items():
            sstext += "(id: %s status: %s queue: %s)" % (ssid, sinfo['state'],
                                                         sinfo['inp_queue'])

        # get amount of time since we requested opunits
        timediff = time.time() - self._timer

        log.info(
            "Op Unit (%s) status update (+%s sec) : state (%s), sqlstreams (%d): %s"
            % (opunit_id, str(timediff), state, len(sqlstreams), sstext))

        if not self.workers.has_key(status['id']):
            self.workers[status['id']] = {}

        self.workers[opunit_id].update({
            'metrics': metrics,
            'state': state,
            'proc_id': proc_id,
            'sqlstreams': sqlstreams
        })

        # display a message if all known opunits are running
        allstate = [
            ssinfo.get('state', None) for ssinfo in [
                winfo['sqlstreams'] for winfo in self.workers.values()
                if len(winfo['sqlstreams']) > 0
            ]
        ]
        if set(allstate) == set(["SUCCESS"]):
            log.info("All known workers are running (+%s sec)" % timediff)

    def _start_sqlstream(self, op_unit_id, conf):
        """
        Tells an op unit to start a SQLStream instance.
        """
        proc_id = self.workers[op_unit_id]['proc_id']
        self.rpc_send(proc_id, 'start_sqlstream', conf)

    def _load_sql_def(self):
        """
        Loads SQL Templates from disk and puts them in a store.
        Called at startup.

        XXX fix:
        Gets SQLStream detection application SQL definitions, either from
        disk or in memory. SQL files stored on disk are loaded once and stored
        in memory after they have been translated through string.Template.

        You may override the SQL defs by sending an RPC message ("set_sql_defs") to
        the Application Controller. These defs will take the place of the current
        in memory defs. They are expected to be templates, in which certain vars will be
        updated. See op_set_sql_defs for more information.
        """
        fulltemplatelist = []
        for filename in ["catalog.sqlt", "funcs.sqlt", "detections.sqlt"]:
            f = resource_stream(__name__, "data/%s" % filename)
            #f = open(os.path.join(os.path.dirname(__file__), "app_controller_service", filename), "r")
            fulltemplatelist.extend(f.readlines())
            f.close()

        fulltemplate = "".join(fulltemplatelist)

        self.attribute_store_client.put(SQLTDEFS_KEY, fulltemplate)

    def op_set_sql_defs(self, content, headers, msg):
        """
        Updates the current cached SQL defs for the SQLStream detection application.
        This overrides what is found on the disk.

        Note it does not update the SQL files on disk, so if the AppControllerService is
        restarted, it will need to be updated with the current defs again.

        This method expects that the only key in content, also named content, is a full 
        SQL definition (the concatenation of "catalog.sqlt" and "detections.sqlt") with
        Python string.Template vars as substitution points for the following variables:

        * inp_queue                 - The input queue name to read messages from.
        * inp_queue_autodelete      - The input queue's auto_delete setting.
        * inp_queue_durable         - The input queue's durable setting.
        * inp_exchange              - The exchange where the input queue resides.
        * inp_exchange_type         - The exchange's type (topic/fanout/direct).
        * inp_exchange_durable      - The exchange's durable setting.
        * inp_exchange_autodelete   - The exchange's auto_delete setting.
        * det_topic                 - The topic string that should be used for detections.
        * det_exchange              - The exchange where detections should be published.
        * det_exchange_type         - The detection exchange's type (topic/fanout/direct).
        * det_exchange_durable      - The detection exchange's durable setting.
        * det_exchange_autodelete   - The detection exchange's auto_delete setting.

        If these variables are not present, no error is thrown - it will use whatever you
        gave it. So your updated SQL definitions may hardcode the variables above.
        """
        defs = content['content']
        self.attribute_store_client.put(SQLTDEFS_KEY, defs)
        self.reply_ok(msg, {'value': 'ok'}, {})
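A small sketch of how the string.Template variables listed in op_set_sql_defs are filled in; the template text below is an illustrative stand-in for the real catalog.sqlt/detections.sqlt content, and the substituted values are assumptions:

import string

sql_tpl = string.Template(
    "-- reads from $inp_exchange/$inp_queue, publishes $det_topic to $det_exchange"
)
filled = sql_tpl.safe_substitute(
    inp_exchange="magnet.topic",  # assumed exchange name
    inp_queue="W1",               # queue name in the form built by bind_station
    det_topic="detections",       # assumed detection topic
    det_exchange="magnet.topic",
)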
Example #10
class EPUManagementService(ServiceProcess):
    """EPU Management service interface

    The system is bootstrapped with a launch plan using the CEI bootstrap
    tool ("cloudinit.d").  When the system has reached an operational state
    without error, the EPU system is in effect, compensating for load and
    failures.

    This service manages the EPU system as a whole through remote messages.

    It is not implemented; when it is implemented it will be backed almost
    entirely by the functionality currently in the "epumgmt" tool via API.
    More basic things like launching new EPUs entirely into a running system
    will be a combination of cloudinit.d API and epumgmt API functionality.
    Currently the cloudinit.d and epumgmt tools are run from the commandline.
    
    This class represents the service interface in order to architecturally
    represent "EPU management from the outside" as a service itself which
    can be integrated via AMQP instead of commandline/scripts.
    """

    declare = ServiceProcess.service_declare(name='epu_management',
                                             version='0.1.0',
                                             dependencies=[])

    def slc_init(self):
        """Initialize the service.

        Reads in all credentials and IaaS coordinates available for
        bootstrapping new EPUs.  Also there would be a datastore dependency
        configuration.
        """
        pass

    def op_create_system(self, content, headers, msg):
        """Create an entirely new system based on the input launch plan.

        This is a pass-through to the cloudinit.d "boot" subcommand via API.

        The return message merely indicates success or failure.  To query an
        in-progress, failed or launched run, the client would interact with
        the datastore to get the latest information.

        Input: one serialized launch plan.
        """
        pass

    def op_destroy_system(self, content, headers, msg):
        """Entirely destroy a booted system.

        This will cause the provisioner node to initiate terminations on all
        known workers via IaaS.  Then it will move to destroying anything
        launched by the bootstrap tool.

        Input: flag to indicate whether or not log files should be retrieved
        before destruction (for post-mortem analysis).
        """
        pass

    def op_add_epu(self, content, headers, msg):
        """Create a new EPU entirely.

        Brings one or many services into being initially, using the EPU
        infrastructure.

        Everything in the input service spec will have an EPU controller
        running on the same instance that this will launch.

        If the services have dependencies that must be running ahead of time,
        it is assumed the operator understands that these dependencies will
        need to be resolved and operational already.  Launch-plans are
        "vetted" as a whole, but adding new EPUs on top of an already
        launched system is a manual decision that requires knowledge of what
        the system is capable of sustaining change-wise.

        Input: list of one to many service specs (just a "piece" of a normal
        launch plan).  The IaaS coordinates and credentials must have been
        given to the service at initialization time.
        """
        pass

    def op_remove_epu(self, content, headers, msg):
        """Remove an EPU entirely.

        The same caveats apply as in the add_epu operation: the system needs
        to be able to handle such a thing.  If a service is deprecated, it
        is implied that the messages drain out before the entire capability
        is removed.  This will need to be taken care of by a higher level
        metric/tool to know when it is safe to delete an EPU entirely.

        Note that this is different than reducing the node count of a certain
        worker set to zero (see the reconfigure operation).  That can create
        the same temporary effect of course (no workers for this service in
        the system) but that, for example, leaves room for workers coming
        online when the demand increases past a certain threshold.
        """
        pass

    def op_reconfigure_epu(self, content, headers, msg):
        """Reconfigure a running EPU with a new policy.

        Given the name of one or many EPU controller(s) in the system, this
        is a convenience operation that ensures the new policy is configured.

        An EPU controller's policy is dictated by its "decision engine".
        The default decision engine supports policy reconfiguration but not
        all of them are required to.  An attempt to try to configure an EPU
        controller with such a decision engine will result in a harmless error.

        See the NPreservingEngine class notes for reconfiguration details.
        """
        pass

    def op_find_workers(self, content, headers, msg):
        """Interact with the provisioner to discover any new worker nodes
        that were launched in the system since the last query.

        Input: optionally filter by a particular HA service.

        Returns the newest workers, before return it has updated the
        datastore with any new information.
        """
        pass

    def op_service_status(self, content, headers, msg):
        """Return the status of one or more services.  Each service status
        will list the:

        1) Known workers, past and present.  Instance information, time of
        launch (and failure/termination if applicable), hostnames.

        2) Worker status: IaaS status as well as the more "semantic" knowledge
        of health that is acquired via heartbeats (or the lack thereof).
        """
        pass
Example #11
class SiamCiReceiverService(ServiceProcess):
    """
    Simple service to receive asynchronous responses from the SIAM-CI adapter
    in java
    """
    # Declaration of service
    declare = ServiceProcess.service_declare(name='siamci_receiver',
                                             version='0.1.0',
                                             dependencies=[])

    def __init__(self, *args, **kwargs):
        ServiceProcess.__init__(self, *args, **kwargs)
        log.debug('SiamCiReceiverService.__init__()')
        
        self.rc = ResourceClient(proc=self)
        self.mc = MessageClient(proc=self)
        
        self.checkTimeout = None
        
        # the set of id's given via op_expect:
        self.expect = set()
        
        # the (id, content) pairs accepted via op_acceptResponse:
        self.accepted = {}


    def slc_init(self):
        log.debug('SiamCiReceiverService.slc_init()')

    def slc_terminate(self):
        """
        Just logs the expect and accepted sets
        """
        if log.getEffectiveLevel() <= logging.DEBUG:
            log.debug('SiamCiReceiverService.slc_terminate() ======= ')
            for e in self.expect:
                log.debug('---- expect: ' +str(e))
            for a in self.accepted:
                log.debug('---accepted: ' +str(a))


    def _get_publish_id(self, content, headers, msg):
        """
        Gets the publish_id from the headers or the content.
        Note: the java client puts the publish_id in the headers; we check there first.        
        If not found in the headers, we check in the content (if it is a dict); this is 
        basically to support python-side clients for testing purposes.
        
        @return: the publish ID; None if not found
        """
        publish_id = None
        if 'publish_id' in headers.keys():
            publish_id = headers['publish_id']
            log.debug('_get_publish_id: publish_id = "'+publish_id+ '" (from headers)')
        elif isinstance(content, dict) and 'publish_id' in content.keys():
            publish_id = content['publish_id']
            log.debug('_get_publish_id: publish_id = "'+publish_id+ '" (from content)')
            
        return publish_id
       
    
    @defer.inlineCallbacks
    def op_expect(self, content, headers, msg):
        log.debug('op_expect: ' +str(content))
        
        publish_id = self._get_publish_id(content, headers, msg)
        if publish_id:
            self.expect.add(publish_id)
        else:
            log.warn('op_expect: publish_id not given')
        
        yield self.reply_ok(msg, {'value' : "TODO-some-result"})


    @defer.inlineCallbacks
    def op_acceptResponse(self, content, headers, msg):
        publish_id = self._get_publish_id(content, headers, msg)
        if publish_id:
            self.accepted[publish_id] = content
            yield self.reply_ok(msg, {'op_acceptResponse' : "OK: response for publish_id='" +str(publish_id)+ "' accepted"})
        else:
            log.warn('op_acceptResponse: publish_id not given')
            yield self.reply_err(msg, "op_acceptResponse : WARNING: publish_id not given")


    @defer.inlineCallbacks
    def op_setExpectedTimeout(self, content, headers, msg):
        """
        Sets the timeout for the op_getExpected operation. There is no
        timeout by default.
        """
        
        if 'timeout' in content.keys() and content['timeout']:
            self.checkTimeout = content['timeout']
            yield self.reply_ok(msg, {'checkTimeout' : self.checkTimeout})
        else:
            yield self.reply_err(msg, "Missing 'timeout' for op_setExpectedTimeout operation")
            
    @defer.inlineCallbacks
    def op_getExpected(self, content, headers, msg):
        """
        Returns a list with expected id's that have not been received.
        
        If the content includes a 'timeout' parameter, this is used to allow time for expected 
        responses to be received. If not, then the timeout indicated in the last call to 
        op_setExpectedTimeout, if any, will be used. Otherwise, no timeout at all is used.
        
        @return: a list with expected id's that have not been received
        """
        
        log.debug('op_getExpected: ' +str(headers))
        
        timeout = None
        if 'timeout' in content.keys() and content['timeout']:
            timeout = content['timeout']   # content in this operation takes precedence
        else:
            timeout = self.checkTimeout    # use the overall timeout, if any
            
        # the total time in seconds we will wait while there is still expected id's
        remaining = timeout if timeout else 0.0
        
        expected = self._get_still_expected()
        while len(expected) > 0 and remaining > 0.0:
            yield pu.asleep(0.2);   # sleep for a moment
            remaining -= 0.2
            expected = self._get_still_expected()

        yield self.reply_ok(msg, expected)


    def _get_still_expected(self):
        expected = []
        for e in self.expect:
            if not e in self.accepted.keys():
                expected.append(e)
        return expected


    @defer.inlineCallbacks
    def op_getAccepted(self, content, headers, msg):
        """
        Returns the content received for a given publish_id; None if not received yet.
        """
        publish_id = self._get_publish_id(content, headers, msg)
        if publish_id:
            if publish_id in self.accepted:
                yield self.reply_ok(msg, self.accepted[publish_id])
            else:
                yield self.reply_ok(msg, None)
        else:
            log.warn('op_getAccepted: publish_id not given')
            yield self.reply_err(msg, 'op_getAccepted: publish_id not given')
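Per _get_publish_id above, the java client carries publish_id in the message headers while python-side test clients may put it in the content dict; both shapes are sketched below with an assumed id value:

headers = {"publish_id": "some-publish-id-123"}   # java-side convention
content = {"publish_id": "some-publish-id-123"}   # python-side testing convention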
Example #12
class EPUControllerService(ServiceProcess):
    """EPU Controller service interface
    """

    declare = ServiceProcess.service_declare(name=DEFAULT_NAME,
                                             version='0.1.0',
                                             dependencies=[])

    @defer.inlineCallbacks
    def slc_init(self):

        scoped_name = self.get_scoped_name("system", self.svc_name)
        self.scoped_name = scoped_name

        queue_name_work = self.spawn_args.get("queue_name_work")
        if queue_name_work:
            self.queue_name_work = self.get_scoped_name(
                "system", queue_name_work)

            extradict = {"queue_name_work": self.queue_name_work}
            cei_events.event(self.svc_name, "init_begin", extra=extradict)
            yield self._make_queue(queue_name_work)

            queuestat_client = QueueStatClient(self)
            yield queuestat_client.watch_queue(self.queue_name_work,
                                               self.scoped_name, 'sensor_info')
            cei_events.event(self.svc_name, "queue_watched")

        else:
            self.worker_queue_receiver = None
            self.queue_name_work = None
            extradict = None
            cei_events.event(self.svc_name, "init_begin", extra=extradict)

        engineclass = "epu.decisionengine.impls.NpreservingEngine"
        if self.spawn_args.has_key("engine_class"):
            engineclass = self.spawn_args["engine_class"]
            log.info("Using configured decision engine: %s" % engineclass)
        else:
            log.info("Using default decision engine: %s" % engineclass)

        if self.spawn_args.has_key("engine_conf"):
            engine_conf = self.spawn_args["engine_conf"]
            if isinstance(engine_conf, str):
                engine_conf = json.loads(engine_conf)
        else:
            engine_conf = None

        if self.spawn_args.has_key("cassandra"):
            cass = self.spawn_args["cassandra"]
            host = cass['hostname']
            username = cass['username']
            password = cass['password']
            port = cass['port']
            keyspace = cass['keyspace']

            store = CassandraControllerStore(self.svc_name, host, port,
                                             username, password, keyspace,
                                             CoreInstance, SensorItem)
            store.initialize()
            store.activate()
        elif self.spawn_args.has_key('store'):
            store = self.spawn_args['store']
        else:
            store = ControllerStore()

        self.core = ControllerCore(ProvisionerClient(self),
                                   engineclass,
                                   scoped_name,
                                   conf=engine_conf,
                                   store=store)

        # run state recovery and engine initialization

        # this one needs to run before any messages start arriving. It pulls
        # information from persistence and refreshes local caches.
        yield self.core.run_recovery()

        # temporarily doing this later due to a potential bug in ioncore where
        # queues may not be bound before slc_init runs. This means if the
        # provisioner is quick to reply to dump_state some messages may be
        # missed.
        reactor.callLater(1, self._delayed_init)

    @defer.inlineCallbacks
    def _delayed_init(self):
        yield self.core.run_initialize()

        self.core.begin_controlling()
        cei_events.event(self.svc_name, "init_end")

    @defer.inlineCallbacks
    def _make_queue(self, name):
        self.worker_queue_receiver = ServiceWorkerReceiver(label=name,
                                                           name=name,
                                                           scope='system')
        yield self.worker_queue_receiver.initialize()

    def op_heartbeat(self, content, headers, msg):
        log.debug("Got node heartbeat: %s", content)
        return self.core.new_heartbeat(content)

    def op_instance_state(self, content, headers, msg):
        return self.core.new_instance_state(content)

    def op_sensor_info(self, content, headers, msg):
        return self.core.new_sensor_info(content)

    def op_reconfigure(self, content, headers, msg):
        log.info("EPU Controller: reconfigure: '%s'" % content)
        return self.core.run_reconfigure(content)

    @defer.inlineCallbacks
    def op_reconfigure_rpc(self, content, headers, msg):
        log.info("EPU Controller: reconfigure_rpc: '%s'" % content)
        yield self.core.run_reconfigure(content)
        yield self.reply_ok(msg, "")

    @defer.inlineCallbacks
    def op_de_state(self, content, headers, msg):
        state = self.core.de_state()
        extradict = {"state": state}
        cei_events.event(self.svc_name, "de_state", extra=extradict)
        yield self.reply_ok(msg, state)

    @defer.inlineCallbacks
    def op_whole_state(self, content, headers, msg):
        state = yield self.core.whole_state()
        yield self.reply_ok(msg, state)

    @defer.inlineCallbacks
    def op_node_error(self, content, headers, msg):
        node_id = content
        state = yield self.core.node_error(node_id)
        yield self.reply_ok(msg, state)
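A sketch of the spawn arguments EPUControllerService inspects in slc_init; every value is illustrative, "engine_conf" may also be passed as a JSON string, and "cassandra" or "store" selects the controller store (a plain ControllerStore is used when neither is given):

spawn_args = {
    "queue_name_work": "epu_work",                                 # optional work queue
    "engine_class": "epu.decisionengine.impls.NpreservingEngine",  # default when omitted
    "engine_conf": {"preserve_n": 2},                               # assumed engine settings
    "cassandra": {                                                  # optional persistent store
        "hostname": "localhost",
        "port": 9160,        # assumed Cassandra port
        "username": "cassandra_user",
        "password": "cassandra_pass",
        "keyspace": "epu_controller",
    },
}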