class ProvisionerQueryService(ServiceProcess):
    """Provisioner querying service
    """

    declare = ServiceProcess.service_declare(name='provisioner_query',
                                             version='0.1.0',
                                             dependencies=[])

    def slc_init(self):
        interval = float(self.spawn_args.get("interval_seconds",
                                             DEFAULT_QUERY_INTERVAL))

        self.client = ProvisionerClient(self)

        log.debug('Starting provisioner query loop - %s second interval',
                  interval)
        self.loop = LoopingCall(self.query)
        self.loop.start(interval)

    def slc_terminate(self):
        if self.loop:
            self.loop.stop()

    @defer.inlineCallbacks
    def query(self):
        try:
            yield self._do_query()
        except Exception, e:
            log.error("Error sending provisioner query request: %s", e,
                      exc_info=True)
class ProvisionerService(ServiceProcess):
    """Provisioner service interface
    """

    # Declaration of service
    declare = ServiceProcess.service_declare(name='provisioner',
                                             version='0.1.0',
                                             dependencies=[])

    @defer.inlineCallbacks
    def slc_init(self):
        cei_events.event("provisioner", "init_begin")

        try:
            store = self.spawn_args['store']
            site_drivers = self.spawn_args['site_drivers']
            context_client = self.spawn_args['context_client']
        except KeyError, e:
            raise KeyError("Missing provisioner spawn_arg: " + str(e))

        self.store = store

        notifier = self.spawn_args.get('notifier')
        self.notifier = notifier or ProvisionerNotifier(self)
        self.dtrs = DeployableTypeRegistryClient(self)

        self.core = ProvisionerCore(self.store, self.notifier, self.dtrs,
                                    site_drivers, context_client)
        yield self.core.recover()
        cei_events.event("provisioner", "init_end")

        # operator can disable new launches
        self.enabled = True
        self.terminate_all_deferred = None
class DeployableTypeRegistryService(ServiceProcess):
    """Deployable Type Registry service interface
    """

    declare = ServiceProcess.service_declare(name='dtrs',
                                             version='0.1.0',
                                             dependencies=[])

    def slc_init(self):
        registry = self.spawn_args.get('registry')
        registry_dir = self.spawn_args.get('registry_dir')

        if registry is None and registry_dir is None:
            raise ValueError("DTRS needs either 'registry' or 'registry_dir' in spawnargs")

        if registry is not None:
            self.registry = registry
        else:
            log.info('DTRS configured to use directory %s' % registry_dir)
            self.registry = DeployableTypeRegistry(registry_dir)
            self.registry.load()

    def op_lookup(self, content, headers, msg):
        """Resolve a deployable type
        """
        # hide the password so it doesn't get logged
        hide_password = deepcopy(content)
        if hide_password.get('vars') and 'cassandra_password' in hide_password['vars']:
            hide_password['vars']['cassandra_password'] = '******'
        if hide_password.get('vars') and 'broker_password' in hide_password['vars']:
            hide_password['vars']['broker_password'] = '******'

        log.debug('Received DTRS lookup. content: %s', hide_password)

        # just using a file for this right now, to keep it simple
        dt_id = content['deployable_type']
        nodes = content.get('nodes')
        vars = content.get('vars')

        dt = self.registry.get(dt_id)
        if not dt:
            return self._dtrs_error(msg, 'Unknown deployable type name: ' + dt_id)

        doc_tpl = dt['document']
        defaults = dt.get('vars')
        all_vars = {}
        if defaults:
            all_vars.update(defaults)
        if vars:
            try:
                process_vars(vars, dt_id)
            except DeployableTypeValidationError, e:
                return self._dtrs_error(msg, str(e))
            all_vars.update(vars)

        template = string.Template(doc_tpl)
        try:
            document = template.substitute(all_vars)
        except KeyError, e:
            return self._dtrs_error(msg,
                    'DT doc has variable not present in request or defaults: %s' % str(e))
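# Illustrative sketch (not part of the service above): how op_lookup resolves a
# deployable type document. Registry defaults and caller-supplied vars are merged,
# then substituted into the document with string.Template. The registry entry and
# variable names below are hypothetical.
import string

_example_dt = {
    'document': 'role: ${worker_role}\nbroker: ${broker_hostname}',
    'vars': {'worker_role': 'epu-worker'},              # defaults stored in the registry
}
_request = {
    'deployable_type': 'example-worker',
    'vars': {'broker_hostname': 'amqp.example.org'},    # vars supplied by the caller
}

_all_vars = {}
_all_vars.update(_example_dt['vars'])
_all_vars.update(_request['vars'])
_document = string.Template(_example_dt['document']).substitute(_all_vars)
assert _document == 'role: epu-worker\nbroker: amqp.example.org'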
class EPUControllerListService(ServiceProcess):
    """Provides list of EPU Controller service names
    """

    declare = ServiceProcess.service_declare(name='epu_controller_list',
                                             version='0.1.0',
                                             dependencies=[])

    def slc_init(self):
        # Allow direct list for tests, etc.
        self.controller_list = self.spawn_args.get('controller_list_direct', None)

        # It's valid to have a zero length list, only check 'is None'
        if self.controller_list is None:
            controller_list_path = self.spawn_args.get('controller_list_path', None)
            if not controller_list_path:
                raise Exception("There is no 'controller_list_path' configuration")
            self.controller_list = self._intake_file(controller_list_path)

        if self.controller_list:
            log.debug("Initialized with controller list:\n%s\n" % self.controller_list)
        else:
            log.debug("Initialized with empty controller list")

    def _intake_file(self, controller_list_path):
        if not os.path.exists(controller_list_path):
            raise Exception("The 'controller_list_path' file does not exist: %s" % controller_list_path)

        controller_list = []
        f = open(controller_list_path)
        for line in f.readlines():
            name = line.strip()
            if not name:
                continue
            if name.startswith("#"):
                continue
            controller_list.append(name)
        return controller_list

    def plc_terminate(self):
        log.debug('EPU Controller List service: shutdown triggered')

    @defer.inlineCallbacks
    def op_list(self, content, headers, msg):
        """Return a list of zero to N controller names
        """
        yield self.reply_ok(msg, self.controller_list)
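# Illustrative sketch (hypothetical file contents): the controller list format
# consumed by _intake_file above. Blank lines and lines starting with "#" are
# skipped; every other line is taken as a controller service name.
_example_listing = """
# production controllers
epu_controller_hadoop

epu_controller_sqlstream
"""

_names = [line.strip() for line in _example_listing.splitlines()
          if line.strip() and not line.strip().startswith("#")]
assert _names == ['epu_controller_hadoop', 'epu_controller_sqlstream']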
class EPUWorkProducer(ServiceProcess):
    """EPU Work Producer.
    """

    declare = ServiceProcess.service_declare(name='epu_work_producer',
                                             version='0.1.0',
                                             dependencies=[])

    def slc_init(self):
        self.queue_name_work = self.get_scoped_name(
            "system", self.spawn_args["queue_name_work"])
        self.web_resource = Sidechannel()
        listen_port = self.spawn_args["listen_port"]
        reactor.listenTCP(int(listen_port), server.Site(self.web_resource))
        self.work_produce_loop = LoopingCall(self.work_seek)
        self.work_produce_loop.start(1, now=False)

    @defer.inlineCallbacks
    def work_seek(self):
        try:
            while True:
                job = self.web_resource.queue.get(block=False)
                if job is None:
                    raise Queue.Empty()

                yield self.send(self.queue_name_work, 'work',
                                {"work_amount": job.length,
                                 "batchid": job.batchid,
                                 "jobid": job.jobid})

                extradict = {"batchid": job.batchid,
                             "jobid": job.jobid,
                             "work_amount": job.length}
                cei_events.event("workproducer", "job_sent", extra=extradict)

                # This is an unfortunate hack to work around a memory leak in ion.
                # Some caches are only cleared after a received message is handled.
                # Since this process sends messages "spontaneously" -- triggered by a
                # LoopingCall -- we must manually clear the cache.
                self.message_client.workbench.manage_workbench_cache('Default Context')

        except Queue.Empty:
            return
        except Exception, e:
            # unhandled exceptions will terminate the LoopingCall
            log.error("Error adding work: %s", e, exc_info=True)
class EPUWorkerService(ServiceProcess):
    """EPU Worker service.
    """

    declare = ServiceProcess.service_declare(name='epu_worker',
                                             version='0.1.0',
                                             dependencies=[])

    def slc_init(self):
        queue_name = self.spawn_args["queue_name_work"]
        self.workReceiver = WorkerReceiver(name=queue_name,
                                           label=__name__,
                                           scope=WorkerReceiver.SCOPE_SYSTEM,
                                           handler=self.receive)
        self.queue_name_work = self.workReceiver.xname
        extradict = {"queue_name_work": self.queue_name_work}
        cei_events.event("worker", "init_begin", extra=extradict)
        self.laterinitialized = False
        reactor.callLater(0, self.later_init)

    @defer.inlineCallbacks
    def later_init(self):
        spawnId = yield self.workReceiver.attach()
        log.debug("spawnId: %s" % spawnId)
        self.laterinitialized = True
        extradict = {"queue_name_work": self.queue_name_work}
        cei_events.event("worker", "init_end", extra=extradict)

    @defer.inlineCallbacks
    def op_work(self, content, headers, msg):
        if not self.laterinitialized:
            log.error("message got here without the later-init")

        sleepsecs = int(content['work_amount'])
        extradict = {"batchid": content['batchid'],
                     "jobid": content['jobid'],
                     "work_amount": sleepsecs}
        cei_events.event("worker", "job_begin", extra=extradict)
        log.info("WORK: sleeping for %d seconds ---" % sleepsecs)
        yield pu.asleep(sleepsecs)
        yield self.reply(msg, 'result', {'result': 'work_complete'}, {})
        cei_events.event("worker", "job_end", extra=extradict)
class EPUControllerClientSample(ServiceProcess):

    declare = ServiceProcess.service_declare(name='epu_reconfigure_sample',
                                             version='0.1.0',
                                             dependencies=[])

    def slc_init(self, proc=None, **kwargs):
        self.client = EPUControllerClient()
        reactor.callLater(5, self.send_reconfigure)

    @defer.inlineCallbacks
    def send_reconfigure(self):
        newconf = {}
        newconf["preserve_n"] = "%s" % self.spawn_args["preserve_n"]
        newconf["unique_instances"] = {
            'b2db408e': {'some_unique_name': 'some_unique_value123'},
            '3633541e': {'some_unique_name': 'some_other_unique_value456'}
        }
        # yield so the inlineCallbacks-decorated method is a generator
        yield self.client.reconfigure(newconf)
class AppControllerService(ServiceProcess):
    """
    Defines an application controller service to perform load balancing.
    """

    declare = ServiceProcess.service_declare(name="app_controller",
                                             version="0.1.0",
                                             dependencies=["attributestore"])

    def __init__(self, *args, **kwargs):
        ServiceProcess.__init__(self, *args, **kwargs)

        self.routing = {}   # mapping of queues to a list of bindings (station ids/sensor ids)
        self.workers = {}   # mapping of known worker vms to info about those vms (cores / running instances)

        # get configs for current exchange setup from exchange space, queues as per
        # what TopicWorkerReceiver (below) uses
        exchcnfg = self.container.exchange_manager.exchange_space.exchange
        msgcnfg = messaging.worker('temp')

        # for timing
        self._timer = time.time()

        # for reconfigure events
        self._reconfigure_timeout = None

        # provisioner vars are common vars for all worker instances
        self.prov_vars = {'sqlt_vars': {'inp_exchange': INP_EXCHANGE_NAME,
                                        'inp_exchange_type': exchcnfg.exchange_type,
                                        'inp_exchange_durable': str(exchcnfg.durable).lower(),
                                        'inp_exchange_autodelete': str(exchcnfg.auto_delete).lower(),
                                        'inp_queue_durable': msgcnfg['durable'],
                                        'inp_queue_autodelete': msgcnfg['auto_delete'],
                                        'det_topic': DETECTION_TOPIC,
                                        'det_exchange': OUT_EXCHANGE_NAME,
                                        'det_exchange_type': exchcnfg.exchange_type,
                                        'det_exchange_durable': str(exchcnfg.durable).lower(),
                                        'det_exchange_autodelete': str(exchcnfg.auto_delete).lower()}}

    @defer.inlineCallbacks
    def slc_init(self):
        # Service life cycle state.

        # consume the announcement queue
        self.announce_recv = TopicWorkerReceiver(name=ANNOUNCE_QUEUE,
                                                 scope='global',
                                                 process=self,
                                                 handler=self._recv_announce)

        # declares queue and starts listening on it
        yield self.announce_recv.attach()

        # get topic based routing to all sensor data (for anything missed on the
        # announcement queue)
        #self.all_data_recv = TopicWorkerReceiver(name="ta_alldata",
        #                                         scope='global',
        #                                         binding_key="ta.*.BHZ",
        #                                         process=self,
        #                                         handler=self._recv_data)
        #yield self.all_data_recv.attach()
        #yield self.all_data_recv.initialize()
        #self.counter = 0

        self.epu_controller_client = EPUControllerClient()

        self.attribute_store_client = AttributeStoreClient()
        yield self._load_sql_def()

    @defer.inlineCallbacks
    def _recv_announce(self, data, msg):
        """
        Received an instrument announcement. Set up a binding for it.
        """
        jsdata = json.loads(data)
        station_name = jsdata['content']

        log.info("Instrument Station Announce: " + station_name)

        found = self.has_station_binding(station_name)

        if found:
            log.error("Duplicate announcement")
        else:
            yield self.bind_station(station_name)

        yield msg.ack()

    #def _recv_data(self, data, msg):
    #    #log.info("<-- data packet" + msg.headers.__str__())
    #    log.info("data " + self.counter.__str__())
    #    self.counter += 1
    #    msg.ack()

    @defer.inlineCallbacks
    def bind_station(self, station_name, queue_name=None):
        """
        Binds a station to a queue. Typically you do not specify the queue name;
        this method will find a queue with room. If a queue name is given, no
        checking will be done - it will simply be added.
""" if queue_name == None: queue_name = "W%s" % (len(self.routing.keys()) + 1) # find a queue with enough room added = False for queues in self.routing.keys(): qlen = len(self.routing[queues]) if qlen < STATIONS_PER_QUEUE: queue_name = queues break binding_key = '%s' % station_name yield self._create_queue(queue_name, binding_key) if not self.routing.has_key(queue_name): self.routing[queue_name] = [] self.request_sqlstream(queue_name) self.routing[queue_name].append(station_name) log.info("Created binding %s to queue %s" % (binding_key, queue_name)) @defer.inlineCallbacks def _create_queue(self, queue_name, binding_key): """ Creates a queue and/or binding to a queue (just the binding if the queue exists). TODO: replace this with proper method of doing so. """ recv = TopicWorkerReceiver(name=queue_name, scope='global', binding_key=binding_key, process=self) yield recv.initialize() # creates queue but does not listen def request_sqlstream(self, queue_name, op_unit_id=None): """ Requests a SQLStream operational unit to be created, or an additional SQLStream on an exiting operational unit. @param queue_name The queue the SQL Stream unit should consume from. @param op_unit_id The operational unit id that should be used to create a SQL Stream instance. If specified, will always create on that op unit. Otherwise, it will find available space on an existing VM or create a new VM. """ # if this var is true, at the end of this method, instead of reconfiguring via # the decision engine, we will directly ask the agent on op_unit_id to spawn the # sqlstream engine. This will hopefully be taken out when we can reconfigure # workers on the fly. direct_request = False if op_unit_id != None and not self.workers.has_key(op_unit_id): log.error("request_sqlstream: op_unit (%s) requested but unknown" % op_unit_id) if op_unit_id == None: # find an available op unit for (worker, info) in self.workers.items(): availcores = info['metrics']['cores'] - ( len(info['sqlstreams']) * CORES_PER_SQLSTREAM) if availcores >= CORES_PER_SQLSTREAM: log.info( "request_sqlstream - asking existing operational unit (%s) to spawn new SQLStream" % worker) # Request spawn new sqlstream instance on this worker # wait for rpc message to app controller that says sqlstream is up op_unit_id = worker direct_request = True # record the fact we are using this worker now # TODO : needs to be an integer to indicate number of starting up, or a # unique key per each starter #info['sqlstreams']['spawning'] = True break if op_unit_id == None: op_unit_id = str(uuid.uuid4())[:8] log.info("request_sqlstream - requesting new operational unit %s" % op_unit_id) # now we have an op_unit_id, update the config if not self.workers.has_key(op_unit_id): self.workers[op_unit_id] = { 'metrics': { 'cores': 2 }, # all workers should have at least two, will be updated when status is updated 'state': '', 'sqlstreams': {} } streamcount = 0 else: streamcount = len(self.workers[op_unit_id]['sqlstreams']) ssid = str(streamcount + 1) stream_conf = {'sqlt_vars': {'inp_queue': queue_name}, 'ssid': ssid} self.workers[op_unit_id]['sqlstreams'][ssid] = { 'conf': stream_conf, 'state': '' } if direct_request == True: self._start_sqlstream(op_unit_id, stream_conf) else: self.request_reconfigure() # schedule a reconfigure event! def request_reconfigure(self): """ Rate limiter for actual request reconfigure call. Waits 4 seconds for any more reconfigure attempts, each of which delays the call by another 4 seconds. When the timeout finally calls, the real reconfigure is sent. 
""" if self._reconfigure_timeout != None and self._reconfigure_timeout.active( ): log.info( "request_reconfigure: delay already active, resetting to 4 seconds" ) self._reconfigure_timeout.reset(4) else: def callReconfigure(): log.info( "request_reconfigure: delay complete, actually performing reconfigure" ) self._reconfigure_timeout = None self._request_reconfigure() log.info( "request_reconfigure: starting delay to 4 seconds to prevent flooding EPU controller" ) self._reconfigure_timeout = reactor.callLater(4, callReconfigure) def _request_reconfigure(self): """ Requests a reconfiguration from the Decision Engine. This takes care of provisioning workers. This method builds the JSON required to reconfigure/configure the decision engine. """ # TODO: likely does not need to send prov vars every time as this is reconfigure provvars = self.prov_vars.copy() #provvars['sqldefs'] = provvars['sqldefs'].replace("$", "$$") # escape template vars once so it doesn't get clobbered in provisioner replacement conf = { 'preserve_n': len(self.workers), #PROVISIONER_VARS_KEY : self.prov_vars, 'unique_instances': {} } for (wid, winfo) in self.workers.items(): conf['unique_instances'][wid] = {'agent_args': {'sqlstreams': []}} conf['unique_instances'][wid]['agent_args'].update(self.prov_vars) ssdefs = conf['unique_instances'][wid]['agent_args']['sqlstreams'] for (ssid, ssinfo) in winfo['sqlstreams'].items(): ssdefs.append({ 'ssid': ssinfo['conf']['ssid'], 'sqlt_vars': ssinfo['conf']['sqlt_vars'] }) if DEBUG_WRITE_PROV_JSON: f = open('/tmp/prov.json', 'w') json.dump(conf, f, indent=1) f.close() log.debug( "Wrote /tmp/prov.json due to DEBUG_WRITE_PROV_JSON being on in the config." ) for (wid, winfo) in conf['unique_instances'].items(): wdict = winfo.copy() wdict['agent_args']['opunit_id'] = wid f = open('/tmp/sa-' + wid + '.json', 'w') json.dump(wdict, f, indent=1) f.close() log.debug("Wrote /tmp/sa-%s.json." % wid) # merge and write individual worker configs while we're at it #for (wid, winfo) in self.workers.items(): # wdict = { 'agent_args': { 'opunit_id' : wid, # 'sqlstreams': str(conf['unique_instances'][wid]['sqlstreams']), # TODO: unstringify this # 'sqlt_vars' : self.prov_vars['sqlt_vars'] } } # f = open('/tmp/sa-' + wid + '.json', 'w') # json.dump(wdict, f, indent=1) # f.close() self.epu_controller_client.reconfigure(conf) # record the time we sent this self._timer = time.time() def has_station_binding(self, station_name): """ Returns true if we know about this station. """ for queues in self.routing.keys(): found = station_name in self.routing[queues] if found: return True return False def op_opunit_status(self, content, headers, msg): """ Handles an application agent reporting an operational unit's status. Details include its current state, metrics about the system, status of SQLstream instances. """ self._update_opunit_status(content) self.reply_ok(msg, {'value': 'ok'}, {}) def request_opunit_status(self, opunit_id): """ Asks an AppAgent to report in its status. """ proc_id = self.workers[opunit_id]['proc_id'] d = self.rpc_send(proc_id, 'get_opunit_status', {}) d.addCallback(lambda res: self._update_opunit_status(res[0])) def _update_opunit_status(self, status): """ Internal method to handle updating an op unit's status. Status updates can either come from heartbeats initiated by the AppAgent, or on request from the AppController. This method handles both of those. 
""" opunit_id = status['id'] proc_id = status['proc_id'] state = status['state'] metrics = status['metrics'] sqlstreams = status['sqlstreams'] sstext = "" for ssid, sinfo in sqlstreams.items(): sstext += "(id: %s status: %s queue: %s)" % (ssid, sinfo['state'], sinfo['inp_queue']) # get amount of time since we requested opunits timediff = time.time() - self._timer log.info( "Op Unit (%s) status update (+%s sec) : state (%s), sqlstreams (%d): %s" % (opunit_id, str(timediff), state, len(sqlstreams), sstext)) if not self.workers.has_key(status['id']): self.workers[status['id']] = {} self.workers[opunit_id].update({ 'metrics': metrics, 'state': state, 'proc_id': proc_id, 'sqlstreams': sqlstreams }) # display a message if all known opunits are running allstate = [ ssinfo.get('state', None) for ssinfo in [ winfo['sqlstreams'] for winfo in self.workers.values() if len(winfo['sqlstreams']) > 0 ] ] if set(allstate) == set(["SUCCESS"]): log.info("All known workers are running (+%s sec)" % timediff) def _start_sqlstream(self, op_unit_id, conf): """ Tells an op unit to start a SQLStream instance. """ proc_id = self.workers[op_unit_id]['proc_id'] self.rpc_send(proc_id, 'start_sqlstream', conf) def _load_sql_def(self): """ Loads SQL Templates from disk and puts them in a store. Called at startup. XXX fix: Gets SQLStream detection application SQL definitions, either from disk or in memory. SQL files stored on disk are loaded once and stored in memory after they have been translated through string.Template. You may override the SQL defs by sending an RPC message ("set_sql_defs") to the Application Controller. These defs will take the place of the current in memory defs. They are expected to be templates, in which certain vars will be updated. See op_set_sql_defs for more information. """ fulltemplatelist = [] for filename in ["catalog.sqlt", "funcs.sqlt", "detections.sqlt"]: f = resource_stream(__name__, "data/%s" % filename) #f = open(os.path.join(os.path.dirname(__file__), "app_controller_service", filename), "r") fulltemplatelist.extend(f.readlines()) f.close() fulltemplate = "".join(fulltemplatelist) self.attribute_store_client.put(SQLTDEFS_KEY, fulltemplate) def op_set_sql_defs(self, content, headers, msg): """ Updates the current cached SQL defs for the SQLStream detection application. This overrides what is found on the disk. Note it does not update the SQL files on disk, so if the AppControllerService is restarted, it will need to be updated with the current defs again. This method expects that the only key in content, also named content, is a full SQL definition (the concatenation of "catalog.sqlt" and "detections.sqlt") with Python string.Template vars as substitution points for the following variables: * inp_queue - The input queue name to read messages from. * inp_queue_autodelete - The input queue's auto_delete setting. * inp_queue_durable - The input queue's durable setting. * inp_exchange - The exchange where the input queue resides. * inp_exchange_type - The exchange's type (topic/fanout/direct). * inp_exchange_durable - The exchange's durable setting. * inp_exchange_autodelete - The exchange's auto_delete setting. * det_topic - The topic string that should be used for detections. * det_exchange - The exchange where detections should be published. * det_exchange_type - The detection exchange's type (topic/fanout/direct). * det_exchange_durable - The detection exchange's durable setting. * det_exchange_autodelete - The detection exchange's auto_delete setting. 
        If these variables are not present, no error is thrown - it will use whatever
        you gave it. So your updated SQL definitions may hardcode the variables above.
        """
        defs = content['content']

        self.attribute_store_client.put(SQLTDEFS_KEY, defs)
        self.reply_ok(msg, {'value': 'ok'}, {})
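# Illustrative sketch (not the real SQLStream defs): how the templated SQL defs
# stored via op_set_sql_defs get their variables filled in. The one-line template
# below is hypothetical; the variable names match the list documented above and the
# 'sqlt_vars' built in AppControllerService.__init__ / request_sqlstream.
import string

_example_defs = ("CREATE OR REPLACE SERVER amqp_in OPTIONS ("
                 "exchange '${inp_exchange}', queue '${inp_queue}', "
                 "durable '${inp_queue_durable}');")

_sqlt_vars = {'inp_exchange': 'magnet.topic',    # hypothetical values
              'inp_queue': 'W1',
              'inp_queue_durable': 'false'}

# safe_substitute leaves any variables we did not supply untouched
print string.Template(_example_defs).safe_substitute(_sqlt_vars)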
class EPUManagementService(ServiceProcess):
    """EPU Management service interface

    The system is bootstrapped with a launch plan using the CEI bootstrap tool
    ("cloudinit.d"). When the system has reached an operational state without
    error, the EPU system is in effect, compensating for load and failures.

    This service manages the EPU system as a whole through remote messages.

    It is not implemented; when it is implemented it will be backed almost
    entirely by the functionality currently in the "epumgmt" tool via API.
    More basic things like launching new EPUs entirely into a running system
    will be a combination of cloudinit.d API and epumgmt API functionality.

    Currently the cloudinit.d and epumgmt tools are run from the commandline.
    This class represents the service interface in order to architecturally
    represent "EPU management from the outside" as a service itself which can
    be integrated via AMQP instead of commandline/scripts.
    """

    declare = ServiceProcess.service_declare(name='epu_management',
                                             version='0.1.0',
                                             dependencies=[])

    def slc_init(self):
        """Initialize the service.

        Reads in all credentials and IaaS coordinates available for
        bootstrapping new EPUs. Also there would be a datastore dependency
        configuration.
        """
        pass

    def op_create_system(self, content, headers, msg):
        """Create an entirely new system based on the input launch plan.

        This is a pass-through to the cloudinit.d "boot" subcommand via API.

        The return message merely indicates success or failure. To query an
        in-progress, failed or launched run, the client would interact with the
        datastore to get the latest information.

        Input: one serialized launch plan.
        """
        pass

    def op_destroy_system(self, content, headers, msg):
        """Entirely destroy a booted system.

        This will cause the provisioner node to initiate terminations on all
        known workers via IaaS. Then it will move to destroying anything
        launched by the bootstrap tool.

        Input: flag to indicate whether or not log files should be retrieved
        before destruction (for post-mortem analysis).
        """
        pass

    def op_add_epu(self, content, headers, msg):
        """Create a new EPU entirely.

        Brings one or many services into being initially, using the EPU
        infrastructure. Everything in the input service spec will have an EPU
        controller running on the same instance that this will launch.

        If the services have dependencies that must be running ahead of time,
        it is assumed the operator understands that these dependencies will
        need to be resolved and operational already.

        Launch-plans are "vetted" as a whole, but adding new EPUs on top of an
        already launched system is a manual decision that requires knowledge of
        what the system is capable of sustaining change-wise.

        Input: list of one to many service specs (just a "piece" of a normal
        launch plan). The IaaS coordinates and credentials must have been given
        to the service at initialization time.
        """
        pass

    def op_remove_epu(self, content, headers, msg):
        """Remove an EPU entirely.

        The same caveats apply as in the add_epu operation: the system needs to
        be able to handle such a thing. If a service is deprecated, it is
        implied that the messages drain out before the entire capability is
        removed. This will need to be taken care of by a higher level
        metric/tool to know when it is safe to delete an EPU entirely.

        Note that this is different than reducing the node count of a certain
        worker set to zero (see the reconfigure operation).
        That can create the same temporary effect of course (no workers for this
        service in the system), but that, for example, leaves room for workers
        coming online when the demand increases past a certain threshold.
        """
        pass

    def op_reconfigure_epu(self, content, headers, msg):
        """Reconfigure a running EPU with a new policy.

        Given the name of one or many EPU controller(s) in the system, this is
        a convenience operation that ensures the new policy is configured.

        An EPU controller's policy is dictated by its "decision engine". The
        default decision engine supports policy reconfiguration, but not all of
        them are required to. An attempt to reconfigure an EPU controller whose
        decision engine does not support it will result in a harmless error.

        See the NPreservingEngine class notes for reconfiguration details.
        """
        pass

    def op_find_workers(self, content, headers, msg):
        """Interact with the provisioner to discover any new worker nodes that
        were launched in the system since the last query.

        Input: optionally filter by a particular HA service.

        Returns the newest workers; before returning, it updates the datastore
        with any new information.
        """
        pass

    def op_service_status(self, content, headers, msg):
        """Return the status of one or more services.

        Each service status will list the:

        1) Known workers, past and present. Instance information, time of
           launch (and failure/termination if applicable), hostnames.

        2) Worker status: IaaS status as well as the more "semantic" knowledge
           of health that is acquired via heartbeats (or the lack thereof).
        """
        pass
    def __init__(self, *args, **kwargs):
        self.fake_state = None
        ServiceProcess.__init__(self, *args, **kwargs)
class SiamCiReceiverService(ServiceProcess):
    """
    Simple service to receive asynchronous responses from the SIAM-CI adapter
    in java.
    """

    # Declaration of service
    declare = ServiceProcess.service_declare(name='siamci_receiver',
                                             version='0.1.0',
                                             dependencies=[])

    def __init__(self, *args, **kwargs):
        ServiceProcess.__init__(self, *args, **kwargs)
        log.debug('SiamCiReceiverService.__init__()')

        self.rc = ResourceClient(proc=self)
        self.mc = MessageClient(proc=self)

        self.checkTimeout = None

        # the set of id's given via op_expect:
        self.expect = set()

        # the (id, content) pairs accepted via op_acceptResponse:
        self.accepted = {}

    def slc_init(self):
        log.debug('SiamCiReceiverService.slc_init()')

    def slc_terminate(self):
        """
        Just logs the expect and accepted sets
        """
        if log.getEffectiveLevel() <= logging.DEBUG:
            log.debug('SiamCiReceiverService.slc_terminate() ======= ')
            for e in self.expect:
                log.debug('---- expect: ' + str(e))
            for a in self.accepted:
                log.debug('---accepted: ' + str(a))

    def _get_publish_id(self, content, headers, msg):
        """
        Gets the publish_id from the headers or the content.

        Note: the java client puts the publish_id in the headers; we check there
        first. If not found in the headers, we check in the content (if it is a
        dict); this is basically to support python-side clients for testing
        purposes.

        @return: the publish ID; None if not found
        """
        publish_id = None
        if 'publish_id' in headers.keys():
            publish_id = headers['publish_id']
            log.debug('_get_publish_id: publish_id = "' + publish_id + '" (from headers)')
        elif isinstance(content, dict) and 'publish_id' in content.keys():
            publish_id = content['publish_id']
            log.debug('_get_publish_id: publish_id = "' + publish_id + '" (from content)')

        return publish_id

    @defer.inlineCallbacks
    def op_expect(self, content, headers, msg):
        log.debug('op_expect: ' + str(content))

        publish_id = self._get_publish_id(content, headers, msg)
        if publish_id:
            self.expect.add(publish_id)
        else:
            log.warn('op_expect: publish_id not given')

        yield self.reply_ok(msg, {'value': "TODO-some-result"})

    @defer.inlineCallbacks
    def op_acceptResponse(self, content, headers, msg):
        publish_id = self._get_publish_id(content, headers, msg)
        if publish_id:
            self.accepted[publish_id] = content
            yield self.reply_ok(msg, {'op_acceptResponse': "OK: response for publish_id='" + str(publish_id) + "' accepted"})
        else:
            log.warn('op_acceptResponse: publish_id not given')
            yield self.reply_err(msg, "op_acceptResponse : WARNING: publish_id not given")

    @defer.inlineCallbacks
    def op_setExpectedTimeout(self, content, headers, msg):
        """
        Sets the timeout for the op_getExpected operation. There is no timeout
        by default.
        """
        if 'timeout' in content.keys() and content['timeout']:
            self.checkTimeout = content['timeout']
            yield self.reply_ok(msg, {'checkTimeout': self.checkTimeout})
        else:
            yield self.reply_err(msg, "Missing 'timeout' for op_setExpectedTimeout operation")

    @defer.inlineCallbacks
    def op_getExpected(self, content, headers, msg):
        """
        Returns a list with expected id's that have not been received.

        If the content includes a 'timeout' parameter, this is used to allow time
        for expected responses to be received. If not, then the timeout indicated
        in the last call to op_setExpectedTimeout, if any, will be used. Otherwise,
        no timeout at all is used.
        @return: a list with expected id's that have not been received
        """
        log.debug('op_getExpected: ' + str(headers))

        timeout = None
        if 'timeout' in content.keys() and content['timeout']:
            timeout = content['timeout']   # content in this operation takes precedence
        else:
            timeout = self.checkTimeout    # use the overall timeout, if any

        # the total time in seconds we will wait while there are still expected id's
        remaining = timeout if timeout else 0.0

        expected = self._get_still_expected()
        while len(expected) > 0 and remaining > 0.0:
            yield pu.asleep(0.2)   # sleep for a moment
            remaining -= 0.2
            expected = self._get_still_expected()

        yield self.reply_ok(msg, expected)

    def _get_still_expected(self):
        expected = []
        for e in self.expect:
            if not e in self.accepted.keys():
                expected.append(e)
        return expected

    @defer.inlineCallbacks
    def op_getAccepted(self, content, headers, msg):
        """
        Returns the content received for a given publish_id; None if not received yet.
        """
        publish_id = self._get_publish_id(content, headers, msg)
        if publish_id:
            if publish_id in self.accepted:
                yield self.reply_ok(msg, self.accepted[publish_id])
            else:
                yield self.reply_ok(msg, None)
        else:
            log.warn('op_getAccepted: publish_id not given')
            yield self.reply_err(msg, 'op_getAccepted: publish_id not given')
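# Illustrative walkthrough (hypothetical publish_id and payload) of the protocol
# implemented by SiamCiReceiverService above, mirroring its op_* handlers without
# any messaging:
_expect, _accepted = set(), {}

_expect.add('cmd-123')                          # op_expect
_accepted['cmd-123'] = {'status': 'OK'}         # op_acceptResponse

_still_expected = [e for e in _expect if e not in _accepted]    # op_getExpected
assert _still_expected == []
assert _accepted.get('cmd-123') == {'status': 'OK'}             # op_getAccepted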
    def __init__(self, *args, **kwargs):
        self.requests = defaultdict(list)
        self.poll_count = {}
        self.errors = {}
        ServiceProcess.__init__(self, *args, **kwargs)
class QueueStatService(ServiceProcess):
    """Queue stat subscription service

    Only works on a RabbitMQ server running *on localhost*
    """

    declare = ServiceProcess.service_declare(name='queuestat',
                                             version='0.1.0',
                                             dependencies=[])

    def slc_init(self):
        erlang_cookie = self.spawn_args.get('erlang_cookie', None)
        if not erlang_cookie:
            cookie_path = self.spawn_args.get('erlang_cookie_path', None)
            erlang_cookie = read_cookie(cookie_path)

        rabbitmq_node = self.spawn_args.get('rabbitmq_node', 'rabbit@localhost')
        node_name = twotp.node.buildNodeName(rabbitmq_node)
        twotp_process = twotp.node.Process(node_name, erlang_cookie)
        self.rabbitmq = RabbitMQControlService(twotp_process, node_name)

        self.interval = float(self.spawn_args.get('interval_seconds',
                                                  DEFAULT_INTERVAL_SECONDS))
        self.sensor_id = self.spawn_args.get('sensor_id', DEFAULT_SENSOR_ID)

        self.loop = LoopingCall(self._wrapped_do_poll)

        # a dict of sets of (subscriber,op) tuples
        self.watched_queues = {}

    def plc_terminate(self):
        log.debug('Shutdown triggered')
        self.rabbitmq.shutdown()

    def op_watch_queue(self, content, headers, msg):
        """Start watching a queue for updates. If queue is already being watched
        by this subscriber, this operation does nothing.
        """
        queue_name = content.get('queue_name')
        subscriber_name = content.get('subscriber_name')
        subscriber_op = content.get('subscriber_op')

        if not (queue_name and subscriber_name and subscriber_op):
            log.warn("Got invalid watch request: %s" % content)
            return

        sub_tuple = (subscriber_name, subscriber_op)

        queue_subs = self.watched_queues.get(queue_name, None)
        if queue_subs is None:
            queue_subs = set()
            self.watched_queues[queue_name] = queue_subs
        queue_subs.add(sub_tuple)

        if not self.loop.running:
            log.debug('starting LoopingCall, to poll queues')
            self.loop.start(self.interval)

    def op_unwatch_queue(self, content, headers, msg):
        """Stop watching a queue. If queue is not being watched by subscriber,
        this operation does nothing.
        """
        queue_name = content.get('queue_name')
        subscriber_name = content.get('subscriber_name')
        subscriber_op = content.get('subscriber_op')

        if not (queue_name and subscriber_name and subscriber_op):
            log.warn("Got invalid unwatch request: %s" % content)
            return

        sub_tuple = (subscriber_name, subscriber_op)

        queue_subs = self.watched_queues.get(queue_name, None)
        if queue_subs:
            queue_subs.discard(sub_tuple)
            if not queue_subs:
                del self.watched_queues[queue_name]

        if not self.watched_queues and self.loop.running:
            log.debug('No queues are being watched, disabling LoopingCall')
            self.loop.stop()

    @defer.inlineCallbacks
    def _wrapped_do_poll(self):
        try:
            yield self._do_poll()
        except Exception, e:
            log.error("Error in RabbitMQ poll: %s", str(e), exc_info=True)
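# Illustrative sketch (hypothetical values): the message content expected by
# op_watch_queue / op_unwatch_queue above. Queue stats are delivered by invoking
# subscriber_op on the named subscriber process, as EPUControllerService does
# below with its 'sensor_info' operation.
_watch_request = {
    'queue_name': 'mysys.epu_work_queue',          # queue to poll on the local RabbitMQ node
    'subscriber_name': 'mysys.epu_controller',     # scoped name of the subscriber process
    'subscriber_op': 'sensor_info',                # op invoked with each stat update
}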
class EPUControllerService(ServiceProcess):
    """EPU Controller service interface
    """

    declare = ServiceProcess.service_declare(name=DEFAULT_NAME,
                                             version='0.1.0',
                                             dependencies=[])

    @defer.inlineCallbacks
    def slc_init(self):
        scoped_name = self.get_scoped_name("system", self.svc_name)
        self.scoped_name = scoped_name

        queue_name_work = self.spawn_args.get("queue_name_work")
        if queue_name_work:
            self.queue_name_work = self.get_scoped_name("system", queue_name_work)

            extradict = {"queue_name_work": self.queue_name_work}
            cei_events.event(self.svc_name, "init_begin", extra=extradict)
            yield self._make_queue(queue_name_work)

            queuestat_client = QueueStatClient(self)
            yield queuestat_client.watch_queue(self.queue_name_work,
                                               self.scoped_name, 'sensor_info')
            cei_events.event(self.svc_name, "queue_watched")

        else:
            self.worker_queue_receiver = None
            self.queue_name_work = None
            extradict = None
            cei_events.event(self.svc_name, "init_begin", extra=extradict)

        engineclass = "epu.decisionengine.impls.NpreservingEngine"
        if self.spawn_args.has_key("engine_class"):
            engineclass = self.spawn_args["engine_class"]
            log.info("Using configured decision engine: %s" % engineclass)
        else:
            log.info("Using default decision engine: %s" % engineclass)

        if self.spawn_args.has_key("engine_conf"):
            engine_conf = self.spawn_args["engine_conf"]
            if isinstance(engine_conf, str):
                engine_conf = json.loads(engine_conf)
        else:
            engine_conf = None

        if self.spawn_args.has_key("cassandra"):
            cass = self.spawn_args["cassandra"]
            host = cass['hostname']
            username = cass['username']
            password = cass['password']
            port = cass['port']
            keyspace = cass['keyspace']

            store = CassandraControllerStore(self.svc_name, host, port,
                                             username, password, keyspace,
                                             CoreInstance, SensorItem)
            store.initialize()
            store.activate()
        elif self.spawn_args.has_key('store'):
            store = self.spawn_args['store']
        else:
            store = ControllerStore()

        self.core = ControllerCore(ProvisionerClient(self), engineclass,
                                   scoped_name, conf=engine_conf, store=store)

        # run state recovery and engine initialization
        # this one needs to run before any messages start arriving. It pulls
        # information from persistence and refreshes local caches.
        yield self.core.run_recovery()

        # temporarily doing this later due to a potential bug in ioncore where
        # queues may not be bound before slc_init runs. This means if the
        # provisioner is quick to reply to dump_state some messages may be
        # missed.
        reactor.callLater(1, self._delayed_init)

    @defer.inlineCallbacks
    def _delayed_init(self):
        yield self.core.run_initialize()

        self.core.begin_controlling()
        cei_events.event(self.svc_name, "init_end")

    @defer.inlineCallbacks
    def _make_queue(self, name):
        self.worker_queue_receiver = ServiceWorkerReceiver(label=name,
                                                           name=name,
                                                           scope='system')
        yield self.worker_queue_receiver.initialize()

    def op_heartbeat(self, content, headers, msg):
        log.debug("Got node heartbeat: %s", content)
        return self.core.new_heartbeat(content)

    def op_instance_state(self, content, headers, msg):
        return self.core.new_instance_state(content)

    def op_sensor_info(self, content, headers, msg):
        return self.core.new_sensor_info(content)

    def op_reconfigure(self, content, headers, msg):
        log.info("EPU Controller: reconfigure: '%s'" % content)
        return self.core.run_reconfigure(content)

    @defer.inlineCallbacks
    def op_reconfigure_rpc(self, content, headers, msg):
        log.info("EPU Controller: reconfigure_rpc: '%s'" % content)
        yield self.core.run_reconfigure(content)
        yield self.reply_ok(msg, "")

    @defer.inlineCallbacks
    def op_de_state(self, content, headers, msg):
        state = self.core.de_state()
        extradict = {"state": state}
        cei_events.event(self.svc_name, "de_state", extra=extradict)
        yield self.reply_ok(msg, state)

    @defer.inlineCallbacks
    def op_whole_state(self, content, headers, msg):
        state = yield self.core.whole_state()
        yield self.reply_ok(msg, state)

    @defer.inlineCallbacks
    def op_node_error(self, content, headers, msg):
        node_id = content
        state = yield self.core.node_error(node_id)
        yield self.reply_ok(msg, state)