def getNodes(api, num, **constraints):
    """Search for 'num' suitable PlanetLab nodes and assign them node ids.

    Builds 'num' Node helpers (each requiring at least one external
    interface), backtracking-searches for a feasible allocation over the
    non-blacklisted candidate set, then assigns the chosen node ids in
    parallel.

    :param api: API handle forwarded to Node construction
    :param num: number of nodes to allocate
    :param constraints: reserved for future use; currently ignored
    :return: list of Node helpers with node ids assigned
    """
    # Now do the backtracking search for a suitable solution
    # First with existing slice nodes
    import node as Node

    nodes = []
    for i in xrange(num):
        node = Node.Node(api)
        node.min_num_external_interface = 1
        nodes.append(node)

    node = nodes[0]
    candidates = filterBlacklist(node.find_candidates())
    # Every request draws from the same candidate pool.
    # (Fixed: removed the dead 'reqs = []' initializer that was
    # immediately overwritten.)
    reqs = [candidates] * num

    def pickbest(fullset, nreq, node=nodes[0]):
        # Prune the pool to the 'nreq' best-rated candidates; the default
        # 'node' binds the first helper at definition time on purpose.
        if len(fullset) > nreq:
            rated = zip(node.rate_nodes(fullset), fullset)
            rated.sort(reverse=True)
            del rated[nreq:]
            return set(map(operator.itemgetter(1), rated))
        else:
            return fullset

    solution = resourcealloc.alloc(reqs, sample=pickbest)

    # Do assign nodes, a few at a time
    runner = ParallelRun(maxthreads=4)
    for node, node_id in zip(nodes, solution):
        runner.put(node.assign_node_id, node_id)
    runner.join()

    return nodes
def getNodes(api, num, **constraints):
    """Allocate 'num' suitable PlanetLab nodes via backtracking search
    and hand each one its allocated node id.
    """
    import node as Node

    # Each helper demands at least one external interface.
    reqs = []
    nodes = []
    remaining = num
    while remaining:
        helper = Node.Node(api)
        helper.min_num_external_interface = 1
        nodes.append(helper)
        remaining -= 1

    first = nodes[0]
    candidates = filterBlacklist(first.find_candidates())
    reqs = [candidates] * num

    def pickbest(fullset, nreq, node=nodes[0]):
        # Small pools need no pruning.
        if len(fullset) <= nreq:
            return fullset
        rated = zip(node.rate_nodes(fullset), fullset)
        rated.sort(reverse=True)
        return set(map(operator.itemgetter(1), rated[:nreq]))

    solution = resourcealloc.alloc(reqs, sample=pickbest)

    # Assign the selected ids, four workers at a time.
    runner = ParallelRun(maxthreads=4)
    for helper, node_id in zip(nodes, solution):
        runner.put(helper.assign_node_id, node_id)
    runner.join()

    return nodes
def do_wait_nodes(self):
    """Configure every Node element and wait for its provisioning to finish.

    First pass injects per-node configuration (home path, SSH key,
    slice name); second pass waits for all nodes in parallel.  Nodes
    that turn out unresponsive are blacklisted before re-raising.
    """
    for guid, node in self._elements.iteritems():
        if isinstance(node, self._node.Node):
            # Just inject configuration stuff
            node.home_path = "nepi-node-%s" % (guid,)
            node.ident_path = self.sliceSSHKey
            node.slicename = self.slicename

            # Show the magic
            self._logger.info("PlanetLab Node %s configured at %s",
                    guid, node.hostname)

    try:
        runner = ParallelRun(maxthreads=64, maxqueue=1)
        # Shared flag: workers append on failure so the feeder loop
        # below can stop queuing further waits.
        abort = []

        def waitforit(guid, node):
            try:
                # Freshly provisioned nodes get a generous 20-minute
                # timeout; already-up nodes only one minute.
                node.wait_provisioning(
                    (20*60 if node._node_id in self._just_provisioned else 60)
                )

                self._logger.info("READY Node %s at %s", guid, node.hostname)

                # Prepare dependency installer now
                node.prepare_dependencies()
            except:
                # Record the failure for the feeder loop, then let the
                # runner propagate the exception.
                abort.append(None)
                raise

        for guid, node in self._elements.iteritems():
            if abort:
                # A worker already failed; stop scheduling more waits.
                break

            if isinstance(node, self._node.Node):
                self._logger.info("Waiting for Node %s configured at %s",
                        guid, node.hostname)
                runner.put(waitforit, guid, node)

        runner.join()
    except self._node.UnresponsiveNodeError:
        # Uh...
        self._logger.warn("UNRESPONSIVE Nodes")

        # Mark all dead nodes (which are unresponsive) on the blacklist
        # and re-raise
        for guid, node in self._elements.iteritems():
            if isinstance(node, self._node.Node):
                if not node.is_alive():
                    self._logger.warn("Blacklisting %s for unresponsiveness",
                            node.hostname)
                    self._blacklist.add(node.hostname)
                    node.unassign_node()

        try:
            self._save_blacklist()
        except:
            # not important... best-effort persistence only
            import traceback
            traceback.print_exc()

        raise
def test_run_simple(self):
    """All ten queued increments must have executed once destroy() returns."""
    runner = ParallelRun(maxthreads=4)
    runner.start()

    count = [0]

    def inc(count):
        # One-element list lets worker threads mutate state the test can see.
        count[0] += 1

    for x in xrange(10):
        runner.put(inc, count)

    # destroy() drains remaining work and stops the workers.
    runner.destroy()

    # 'assertEqual' replaces the deprecated 'assertEquals' alias.
    self.assertEqual(count[0], 10)
def test_run_simple(self):
    """Every one of the ten queued jobs must run exactly once."""
    tally = [0]

    def bump(cell):
        cell[0] += 1

    runner = ParallelRun(maxthreads=4)
    runner.start()

    for _ in xrange(10):
        runner.put(bump, tally)

    runner.destroy()
    self.assertEquals(tally[0], 10)
def test_run_interrupt(self):
    """empty() must cancel pending jobs so destroy() returns quickly."""
    def nap():
        import time
        time.sleep(5)

    begin = datetime.datetime.now()

    runner = ParallelRun(maxthreads=4)
    runner.start()
    for _ in xrange(100):
        runner.put(nap)
    runner.empty()
    runner.destroy()

    elapsed = (datetime.datetime.now() - begin).seconds
    self.assertTrue(elapsed < 500)
def test_run_interrupt(self):
    """Flushing the queue must let the runner shut down promptly."""
    def sleeper():
        import time
        time.sleep(5)

    t0 = datetime.datetime.now()

    runner = ParallelRun(maxthreads=4)
    runner.start()

    queued = 0
    while queued < 100:
        runner.put(sleeper)
        queued += 1

    runner.empty()
    runner.destroy()

    t1 = datetime.datetime.now()
    self.assertTrue((t1 - t0).seconds < 500)
def test_run_error(self):
    """A failing job must not block others; sync() re-raises its error."""
    count = [0]

    def inc(count):
        count[0] += 1

    def error():
        raise RuntimeError()

    runner = ParallelRun(maxthreads=4)
    runner.start()

    for x in xrange(4):
        runner.put(inc, count)
    runner.put(error)

    runner.destroy()

    # The four successful increments ran despite the failing job.
    # 'assertEqual' replaces the deprecated 'assertEquals' alias.
    self.assertEqual(count[0], 4)

    # The stored worker exception surfaces when synchronizing.
    self.assertRaises(RuntimeError, runner.sync)
def test_run_error(self):
    """Good jobs complete even when one job raises; sync() reports it."""
    tally = [0]

    def bump(cell):
        cell[0] += 1

    def boom():
        raise RuntimeError()

    runner = ParallelRun(maxthreads=4)
    runner.start()

    for _ in xrange(4):
        runner.put(bump, tally)
    runner.put(boom)

    runner.destroy()

    self.assertEquals(tally[0], 4)
    self.assertRaises(RuntimeError, runner.sync)
class ExperimentController(object):
    """
    .. note::

        An experiment, or scenario, is defined by a concrete set of resources,
        and the behavior, configuration and interconnection of those resources.
        The Experiment Description (ED) is a detailed representation of a
        single experiment. It contains all the necessary information to allow
        repeating the experiment. NEPI allows to describe experiments by
        registering components (resources), configuring them and
        interconnecting them.

        A same experiment (scenario) can be executed many times, generating
        different results. We call an experiment execution (instance) a 'run'.

        The ExperimentController (EC), is the entity responsible of managing
        an experiment run. The same scenario can be recreated (and re-run) by
        instantiating an EC and recreating the same experiment description.

        An experiment is represented as a graph of interconnected resources.
        A resource is a generic concept in the sense that any component taking
        part of an experiment, whether physical of virtual, is considered a
        resource. A resources could be a host, a virtual machine, an
        application, a simulator, a IP address.

        A ResourceManager (RM), is the entity responsible for managing a
        single resource. ResourceManagers are specific to a resource type
        (i.e. An RM to control a Linux application will not be the same as the
        RM used to control a ns-3 simulation). To support a new type of
        resource, a new RM must be implemented. NEPI already provides a
        variety of RMs to control basic resources, and new can be extended
        from the existing ones.

        Through the EC interface the user can create ResourceManagers (RMs),
        configure them and interconnect them, to describe an experiment.
        Describing an experiment through the EC does not run the experiment.
        Only when the 'deploy()' method is invoked on the EC, the EC will take
        actions to transform the 'described' experiment into a 'running'
        experiment.

        While the experiment is running, it is possible to continue to
        create/configure/connect RMs, and to deploy them to involve new
        resources in the experiment (this is known as 'interactive'
        deployment).

        An experiments in NEPI is identified by a string id, which is either
        given by the user, or automatically generated by NEPI. The purpose of
        this identifier is to separate files and results that belong to
        different experiment scenarios. However, since a same 'experiment' can
        be run many times, the experiment id is not enough to identify an
        experiment instance (run). For this reason, the ExperimentController
        has two identifier, the exp_id, which can be re-used in different
        ExperimentController, and the run_id, which is unique to one
        ExperimentController instance, and is automatically generated by NEPI.

    """

    @classmethod
    def load(cls, filepath, format=SFormats.XML):
        """ Loads a persisted experiment description from 'filepath'
        and returns the reconstructed ExperimentController. """
        serializer = ECSerializer()
        ec = serializer.load(filepath)
        return ec

    def __init__(self, exp_id=None, local_dir=None, persist=False,
            fm=None, add_node_callback=None, add_edge_callback=None,
            **kwargs):
        """ ExperimentController entity to model and execute a network
        experiment.

        :param exp_id: Human readable name to identify the experiment
        :type exp_id: str

        :param local_dir: Path to local directory where to store experiment
            related files
        :type local_dir: str

        :param persist: Save an XML description of the experiment after
            completion at local_dir
        :type persist: bool

        :param fm: FailureManager object. If None is given, the default
            FailureManager class will be used
        :type fm: FailureManager

        :param add_node_callback: Callback to invoke for node instantiation
            when automatic topology creation mode is used
        :type add_node_callback: function

        :param add_edge_callback: Callback to invoke for edge instantiation
            when automatic topology creation mode is used
        :type add_edge_callback: function

        """
        super(ExperimentController, self).__init__()

        # Logging
        self._logger = logging.getLogger("ExperimentController")

        # Run identifier. It identifies a concrete execution instance (run)
        # of an experiment.
        # Since a same experiment (same configuration) can be executed many
        # times, this run_id permits to separate result files generated on
        # different experiment executions
        self._run_id = tsformat()

        # Experiment identifier. Usually assigned by the user
        # Identifies the experiment scenario (i.e. configuration,
        # resources used, etc)
        self._exp_id = exp_id or "exp-%s" % os.urandom(8).encode('hex')

        # Local path where to store experiment related files (results, etc)
        if not local_dir:
            local_dir = tempfile.gettempdir() # /tmp

        self._local_dir = local_dir
        self._exp_dir = os.path.join(local_dir, self.exp_id)
        self._run_dir = os.path.join(self.exp_dir, self.run_id)

        # If True persist the experiment controller in XML format, after
        # completion
        self._persist = persist

        # generator of globally unique ids
        self._guid_generator = guid.GuidGenerator()

        # Resource managers
        self._resources = dict()

        # Scheduler. It is a queue that holds tasks scheduled for
        # execution, and yields the next task to be executed
        # ordered by execution and arrival time
        self._scheduler = HeapScheduler()

        # Tasks
        self._tasks = dict()

        # RM groups (for deployment)
        self._groups = dict()

        # generator of globally unique id for groups
        self._group_id_generator = guid.GuidGenerator()

        # Flag to stop processing thread
        self._stop = False

        # Entity in charge of managing system failures
        # BUGFIX: the original ignored a caller-supplied 'fm'
        # ("if not fm: self._fm = FailureManager()"), leaving self._fm
        # unset and crashing on set_ec() when fm was provided.
        self._fm = fm or FailureManager()
        self._fm.set_ec(self)

        # EC state
        self._state = ECState.RUNNING

        # Automatically construct experiment description
        self._netgraph = None
        if add_node_callback or add_edge_callback or kwargs.get("topology"):
            self._build_from_netgraph(add_node_callback, add_edge_callback,
                    **kwargs)

        # The runner is a pool of threads used to parallelize
        # execution of tasks
        self._nthreads = 20
        self._runner = None

        # Event processing thread
        self._cond = threading.Condition()
        self._thread = threading.Thread(target=self._process)
        self._thread.setDaemon(True)
        self._thread.start()

    @property
    def logger(self):
        """ Returns the logger instance of the Experiment Controller """
        return self._logger

    @property
    def fm(self):
        """ Returns the failure manager """
        return self._fm

    @property
    def failure_level(self):
        """ Returns the level of FAILURE of the experiment """
        return self._fm._failure_level

    @property
    def ecstate(self):
        """ Returns the state of the Experiment Controller """
        return self._state

    @property
    def exp_id(self):
        """ Returns the experiment id assigned by the user """
        return self._exp_id

    @property
    def run_id(self):
        """ Returns the experiment instance (run) identifier (automatically
        generated) """
        return self._run_id

    @property
    def nthreads(self):
        """ Returns the number of processing nthreads used """
        return self._nthreads

    @property
    def local_dir(self):
        """ Root local directory for experiment files """
        return self._local_dir

    @property
    def exp_dir(self):
        """ Local directory to store results and other files related to the
        experiment. """
        return self._exp_dir

    @property
    def run_dir(self):
        """ Local directory to store results and other files related to the
        experiment run. """
        return self._run_dir

    @property
    def persist(self):
        """ If True, persists the ExperimentController to XML format upon
        experiment completion """
        return self._persist

    @property
    def netgraph(self):
        """ Return NetGraph instance if experiment description was
        automatically generated """
        return self._netgraph

    @property
    def abort(self):
        """ Returns True if the experiment has failed and should be
        interrupted, False otherwise. """
        return self._fm.abort

    def inform_failure(self, guid):
        """ Reports a failure in a RM to the EC for evaluation

        :param guid: Resource id
        :type guid: int
        """
        return self._fm.eval_failure(guid)

    def wait_finished(self, guids):
        """ Blocking method that waits until all RMs in the 'guids' list
        have reached a state >= STOPPED (i.e. STOPPED, FAILED or
        RELEASED ), or until a failure in the experiment occurs
        (i.e. abort == True)

        :param guids: List of guids
        :type guids: list
        """
        def quit():
            return self.abort

        return self.wait(guids, state=ResourceState.STOPPED, quit=quit)

    def wait_started(self, guids):
        """ Blocking method that waits until all RMs in the 'guids' list
        have reached a state >= STARTED, or until a failure in the
        experiment occurs (i.e. abort == True)

        :param guids: List of guids
        :type guids: list
        """
        def quit():
            return self.abort

        return self.wait(guids, state=ResourceState.STARTED, quit=quit)

    def wait_released(self, guids):
        """ Blocking method that waits until all RMs in the 'guids' list
        have reached a state == RELEASED, or until the EC fails

        :param guids: List of guids
        :type guids: list
        """
        def quit():
            return self._state == ECState.FAILED

        return self.wait(guids, state=ResourceState.RELEASED, quit=quit)

    def wait_deployed(self, guids):
        """ Blocking method that waits until all RMs in the 'guids' list
        have reached a state >= READY, or until a failure in the
        experiment occurs (i.e. abort == True)

        :param guids: List of guids
        :type guids: list
        """
        def quit():
            return self.abort

        return self.wait(guids, state=ResourceState.READY, quit=quit)

    def wait(self, guids, state, quit):
        """ Blocking method that waits until all RMs in the 'guids' list
        have reached a state >= 'state', or until the 'quit' callback
        yields True

        :param guids: List of guids
        :type guids: list
        """
        if isinstance(guids, int):
            guids = [guids]

        # Make a copy to avoid modifying the original guids list
        guids = list(guids)

        while True:
            # If there are no more guids to wait for
            # or the quit function returns True, exit the loop
            if len(guids) == 0 or quit():
                break

            # If a guid reached one of the target states, remove it from list
            guid = guids.pop()
            rm = self.get_resource(guid)
            rstate = rm.state

            if rstate >= state:
                self.logger.debug(
                    " %s guid %d DONE - state is %s, required is >= %s " % (
                        rm.get_rtype(), guid, rstate, state))
            else:
                # Debug...
                self.logger.debug(
                    " WAITING FOR guid %d - state is %s, required is >= %s " % (
                        guid, rstate, state))

                # Not there yet: put it back and poll again in half a second
                guids.append(guid)
                time.sleep(0.5)

    def plot(self, dirpath=None, format=PFormats.FIGURE, show=False):
        """ Renders the experiment topology; returns the output file path. """
        plotter = ECPlotter()
        fpath = plotter.plot(self, dirpath=dirpath, format=format, show=show)
        return fpath

    def serialize(self, format=SFormats.XML):
        """ Returns a serialized description of this EC.

        NOTE(review): this delegates to serializer.load(self, ...), while
        the classmethod 'load' above passes a filepath — confirm against
        ECSerializer that 'load' is really the intended entry point here.
        """
        serializer = ECSerializer()
        sec = serializer.load(self, format=format)
        return sec

    def save(self, dirpath=None, format=SFormats.XML):
        """ Persists the EC description under 'dirpath' (default: run_dir)
        and returns the path of the saved file. """
        if dirpath == None:
            dirpath = self.run_dir

        try:
            os.makedirs(dirpath)
        except OSError:
            # Directory already exists; that's fine
            pass

        serializer = ECSerializer()
        path = serializer.save(self, dirpath, format=format)
        return path

    def get_task(self, tid):
        """ Returns a task by its id

        :param tid: Id of the task
        :type tid: int

        :rtype: Task
        """
        return self._tasks.get(tid)

    def get_resource(self, guid):
        """ Returns a registered ResourceManager by its guid

        :param guid: Id of the resource
        :type guid: int

        :rtype: ResourceManager
        """
        rm = self._resources.get(guid)
        return rm

    def get_resources_by_type(self, rtype):
        """ Returns the ResourceManager objects of type rtype

        :param rtype: Resource type
        :type rtype: string

        :rtype: list of ResourceManagers
        """
        rms = []
        for guid, rm in self._resources.iteritems():
            if rm.get_rtype() == rtype:
                rms.append(rm)
        return rms

    def remove_resource(self, guid):
        """ Removes the RM with guid 'guid' from the EC registry. """
        del self._resources[guid]

    @property
    def resources(self):
        """ Returns the guids of all ResourceManagers

        :return: Set of all RM guids
        :rtype: list
        """
        keys = self._resources.keys()
        return keys

    def filter_resources(self, rtype):
        """ Returns the guids of all ResourceManagers of type rtype

        :param rtype: Resource type
        :type rtype: string

        :rtype: list of guids
        """
        rms = []
        for guid, rm in self._resources.iteritems():
            if rm.get_rtype() == rtype:
                rms.append(rm.guid)
        return rms

    def register_resource(self, rtype, guid=None):
        """ Registers a new ResourceManager of type 'rtype' in the experiment

        This method will assign a new 'guid' for the RM, if no guid
        is specified.

        :param rtype: Type of the RM
        :type rtype: str

        :return: Guid of the RM
        :rtype: int
        """
        # Get next available guid
        guid = self._guid_generator.next(guid)

        # Instantiate RM
        rm = ResourceFactory.create(rtype, self, guid)

        # Store RM
        self._resources[guid] = rm

        return guid

    def get_attributes(self, guid):
        """ Returns all the attributes of the RM with guid 'guid'

        :param guid: Guid of the RM
        :type guid: int

        :return: List of attributes
        :rtype: list
        """
        rm = self.get_resource(guid)
        return rm.get_attributes()

    def get_attribute(self, guid, name):
        """ Returns the attribute 'name' of the RM with guid 'guid'

        :param guid: Guid of the RM
        :type guid: int

        :param name: Name of the attribute
        :type name: str

        :return: The attribute with name 'name'
        :rtype: Attribute
        """
        rm = self.get_resource(guid)
        return rm.get_attribute(name)

    def register_connection(self, guid1, guid2):
        """ Registers a connection between a RM with guid 'guid1'
        and another RM with guid 'guid2'.

        The order of the in which the two guids are provided is not
        important, since the connection relationship is symmetric.

        :param guid1: First guid to connect
        :type guid1: ResourceManager

        :param guid2: Second guid to connect
        :type guid2: ResourceManager
        """
        rm1 = self.get_resource(guid1)
        rm2 = self.get_resource(guid2)

        rm1.register_connection(guid2)
        rm2.register_connection(guid1)

    def register_condition(self, guids1, action, guids2, state, time=None):
        """ Registers an action START, STOP or DEPLOY for all RM on list
        guids1 to occur at time 'time' after all elements in list guids2
        have reached state 'state'.

        :param guids1: List of guids of RMs subjected to action
        :type guids1: list

        :param action: Action to perform (either START, STOP or DEPLOY)
        :type action: ResourceAction

        :param guids2: List of guids of RMs to we waited for
        :type guids2: list

        :param state: State to wait for on RMs of list guids2 (STARTED,
            STOPPED, etc)
        :type state: ResourceState

        :param time: Time to wait after guids2 has reached status
        :type time: string
        """
        if isinstance(guids1, int):
            guids1 = [guids1]
        if isinstance(guids2, int):
            guids2 = [guids2]

        for guid1 in guids1:
            rm = self.get_resource(guid1)
            rm.register_condition(action, guids2, state, time)

    def enable_trace(self, guid, name):
        """ Enables a trace to be collected during the experiment run

        :param guid: Guid of the RM
        :type guid: int

        :param name: Name of the trace
        :type name: str
        """
        rm = self.get_resource(guid)
        rm.enable_trace(name)

    def trace_enabled(self, guid, name):
        """ Returns True if the trace of name 'name' is enabled

        :param guid: Guid of the RM
        :type guid: int

        :param name: Name of the trace
        :type name: str
        """
        rm = self.get_resource(guid)
        return rm.trace_enabled(name)

    def trace(self, guid, name, attr=TraceAttr.ALL, block=512, offset=0):
        """ Returns information on a collected trace, the trace stream or
        blocks (chunks) of the trace stream

        :param name: Name of the trace
        :type name: str

        :param attr: Can be one of:
            - TraceAttr.ALL (complete trace content),
            - TraceAttr.STREAM (block in bytes to read starting at offset),
            - TraceAttr.PATH (full path to the trace file),
            - TraceAttr.SIZE (size of trace file).
        :type attr: str

        :param block: Number of bytes to retrieve from trace, when attr is
            TraceAttr.STREAM
        :type block: int

        :param offset: Number of 'blocks' to skip, when attr is
            TraceAttr.STREAM
        :type offset: int

        :rtype: str
        """
        rm = self.get_resource(guid)
        return rm.trace(name, attr, block, offset)

    def get_traces(self, guid):
        """ Returns the list of the trace names of the RM with guid 'guid'

        :param guid: Guid of the RM
        :type guid: int

        :return: List of trace names
        :rtype: list
        """
        rm = self.get_resource(guid)
        return rm.get_traces()

    def discover(self, guid):
        """ Discovers an available resource matching the criteria defined
        by the RM with guid 'guid', and associates that resource to the RM

        Not all RM types require (or are capable of) performing resource
        discovery. For the RM types which are not capable of doing so,
        invoking this method does not have any consequences.

        :param guid: Guid of the RM
        :type guid: int
        """
        rm = self.get_resource(guid)
        return rm.discover()

    def provision(self, guid):
        """ Provisions the resource associated to the RM with guid 'guid'.

        Provisioning means making a resource 'accessible' to the user.
        Not all RM types require (or are capable of) performing resource
        provisioning. For the RM types which are not capable of doing so,
        invoking this method does not have any consequences.

        :param guid: Guid of the RM
        :type guid: int
        """
        rm = self.get_resource(guid)
        return rm.provision()

    def get(self, guid, name):
        """ Returns the value of the attribute with name 'name' on the
        RM with guid 'guid'

        :param guid: Guid of the RM
        :type guid: int

        :param name: Name of the attribute
        :type name: str

        :return: The value of the attribute with name 'name'
        """
        rm = self.get_resource(guid)
        return rm.get(name)

    def set(self, guid, name, value):
        """ Modifies the value of the attribute with name 'name' on the
        RM with guid 'guid'.

        :param guid: Guid of the RM
        :type guid: int

        :param name: Name of the attribute
        :type name: str

        :param value: Value of the attribute
        """
        rm = self.get_resource(guid)
        rm.set(name, value)

    def get_global(self, rtype, name):
        """ Returns the value of the global attribute with name 'name' on the
        RMs of rtype 'rtype'.

        :param rtype: Type of the RM
        :type rtype: str

        :param name: Name of the attribute
        :type name: str

        :return: The value of the attribute with name 'name'
        """
        rclass = ResourceFactory.get_resource_type(rtype)
        return rclass.get_global(name)

    def set_global(self, rtype, name, value):
        """ Modifies the value of the global attribute with name 'name' on
        the RMs of with rtype 'rtype'.

        :param rtype: Type of the RM
        :type rtype: str

        :param name: Name of the attribute
        :type name: str

        :param value: Value of the attribute
        """
        rclass = ResourceFactory.get_resource_type(rtype)
        return rclass.set_global(name, value)

    def state(self, guid, hr=False):
        """ Returns the state of a resource

        :param guid: Resource guid
        :type guid: integer

        :param hr: Human readable. Forces return of a status string
            instead of a number
        :type hr: boolean
        """
        rm = self.get_resource(guid)
        state = rm.state

        if hr:
            return ResourceState2str.get(state)

        return state

    def stop(self, guid):
        """ Stops the RM with guid 'guid'

        Stopping a RM means that the resource it controls will no longer
        take part of the experiment.

        :param guid: Guid of the RM
        :type guid: int
        """
        rm = self.get_resource(guid)
        return rm.stop()

    def start(self, guid):
        """ Starts the RM with guid 'guid'

        Starting a RM means that the resource it controls will begin
        taking part of the experiment.

        :param guid: Guid of the RM
        :type guid: int
        """
        rm = self.get_resource(guid)
        return rm.start()

    def get_start_time(self, guid):
        """ Returns the start time of the RM as a timestamp """
        rm = self.get_resource(guid)
        return rm.start_time

    def get_stop_time(self, guid):
        """ Returns the stop time of the RM as a timestamp """
        rm = self.get_resource(guid)
        return rm.stop_time

    def get_discover_time(self, guid):
        """ Returns the discover time of the RM as a timestamp """
        rm = self.get_resource(guid)
        return rm.discover_time

    def get_provision_time(self, guid):
        """ Returns the provision time of the RM as a timestamp """
        rm = self.get_resource(guid)
        return rm.provision_time

    def get_ready_time(self, guid):
        """ Returns the deployment time of the RM as a timestamp """
        rm = self.get_resource(guid)
        return rm.ready_time

    def get_release_time(self, guid):
        """ Returns the release time of the RM as a timestamp """
        rm = self.get_resource(guid)
        return rm.release_time

    def get_failed_time(self, guid):
        """ Returns the time failure occured for the RM as a timestamp """
        rm = self.get_resource(guid)
        return rm.failed_time

    def set_with_conditions(self, name, value, guids1, guids2, state,
            time=None):
        """ Modifies the value of attribute with name 'name' on all RMs
        on the guids1 list when time 'time' has elapsed since all
        elements in guids2 list have reached state 'state'.

        :param name: Name of attribute to set in RM
        :type name: string

        :param value: Value of attribute to set in RM
        :type value: string

        :param guids1: List of guids of RMs subjected to action
        :type guids1: list

        :param guids2: List of guids of RMs to we waited for
        :type guids2: list

        :param state: State to wait for on RMs (STARTED, STOPPED, etc)
        :type state: ResourceState

        :param time: Time to wait after guids2 has reached status
        :type time: string
        """
        if isinstance(guids1, int):
            guids1 = [guids1]
        if isinstance(guids2, int):
            guids2 = [guids2]

        for guid1 in guids1:
            # BUGFIX: the original looked up 'guid' (undefined in this
            # scope) instead of the loop variable 'guid1'.
            rm = self.get_resource(guid1)
            rm.set_with_conditions(name, value, guids2, state, time)

    def deploy(self, guids=None, wait_all_ready=True, group=None):
        """ Deploys all ResourceManagers in the guids list.

        If the argument 'guids' is not given, all RMs with state NEW
        are deployed.

        :param guids: List of guids of RMs to deploy
        :type guids: list

        :param wait_all_ready: Wait until all RMs are ready in order to
            start the RMs
        :type wait_all_ready: bool

        :param group: Id of deployment group in which to deploy RMs
        :type group: int
        """
        self.logger.debug(" ------- DEPLOY START ------ ")

        if not guids:
            # If no guids list was passed, all 'NEW' RMs will be deployed
            guids = []
            for guid, rm in self._resources.iteritems():
                if rm.state == ResourceState.NEW:
                    guids.append(guid)

        if isinstance(guids, int):
            guids = [guids]

        # Create deployment group
        # New guids can be added to a same deployment group later on
        new_group = False
        if not group:
            new_group = True
            group = self._group_id_generator.next()

        if group not in self._groups:
            self._groups[group] = []

        self._groups[group].extend(guids)

        def wait_all_and_start(group):
            # Function that checks if all resources are READY
            # before scheduling a start_with_conditions for each RM
            reschedule = False

            # Get all guids in group
            guids = self._groups[group]

            for guid in guids:
                if self.state(guid) < ResourceState.READY:
                    reschedule = True
                    break

            if reschedule:
                callback = functools.partial(wait_all_and_start, group)
                self.schedule("1s", callback)
            else:
                # If all resources are ready, we schedule the start
                for guid in guids:
                    rm = self.get_resource(guid)
                    self.schedule("0s", rm.start_with_conditions)

                    if rm.conditions.get(ResourceAction.STOP):
                        # Only if the RM has STOP conditions we
                        # schedule a stop. Otherwise the RM will stop
                        # immediately
                        self.schedule("0s", rm.stop_with_conditions)

        if wait_all_ready and new_group:
            # Schedule a function to check that all resources are
            # READY, and only then schedule the start.
            # This aims at reducing the number of tasks looping in the
            # scheduler.
            # Instead of having many start tasks, we will have only one for
            # the whole group.
            callback = functools.partial(wait_all_and_start, group)
            self.schedule("0s", callback)

        for guid in guids:
            rm = self.get_resource(guid)
            rm.deployment_group = group
            self.schedule("0s", rm.deploy_with_conditions)

            if not wait_all_ready:
                self.schedule("0s", rm.start_with_conditions)

                if rm.conditions.get(ResourceAction.STOP):
                    # Only if the RM has STOP conditions we
                    # schedule a stop. Otherwise the RM will stop immediately
                    self.schedule("0s", rm.stop_with_conditions)

    def release(self, guids=None):
        """ Releases all ResourceManagers in the guids list.

        If the argument 'guids' is not given, all RMs registered
        in the experiment are released.

        :param guids: List of RM guids
        :type guids: list
        """
        if self._state == ECState.RELEASED:
            return

        if isinstance(guids, int):
            guids = [guids]

        if not guids:
            guids = self.resources

        for guid in guids:
            rm = self.get_resource(guid)
            self.schedule("0s", rm.release)

        self.wait_released(guids)

        if self.persist:
            self.save()

        for guid in guids:
            if self.get(guid, "hardRelease"):
                # BUGFIX: removed stray line-continuation backslash that
                # followed this call in the original.
                self.remove_resource(guid)

        # Mark the EC state as RELEASED
        self._state = ECState.RELEASED

    def shutdown(self):
        """ Releases all resources and stops the ExperimentController """
        # If there was a major failure we can't exit gracefully
        if self._state == ECState.FAILED:
            raise RuntimeError("EC failure. Can not exit gracefully")

        # Remove all pending tasks from the scheduler queue
        for tid in list(self._scheduler.pending):
            self._scheduler.remove(tid)

        # Remove pending tasks from the workers queue
        self._runner.empty()

        self.release()

        # Mark the EC state as TERMINATED
        self._state = ECState.TERMINATED

        # Stop processing thread
        self._stop = True

        # Notify condition to wake up the processing thread
        self._notify()

        if self._thread.is_alive():
            self._thread.join()

    def schedule(self, date, callback, track=False):
        """ Schedules a callback to be executed at time 'date'.

        :param date: string containing execution time for the task.
            It can be expressed as an absolute time, using
            timestamp format, or as a relative time matching
            ^\d+.\d+(h|m|s|ms|us)$

        :param callback: code to be executed for the task. Must be a
            Python function, and receives args and kwargs as arguments.

        :param track: if set to True, the task will be retrievable with
            the get_task() method

        :return : The Id of the task
        :rtype: int
        """
        timestamp = stabsformat(date)
        task = Task(timestamp, callback)
        task = self._scheduler.schedule(task)

        if track:
            self._tasks[task.id] = task

        # Notify condition to wake up the processing thread
        self._notify()

        return task.id

    def _process(self):
        """ Process scheduled tasks.

        .. note::
            Tasks are scheduled by invoking the schedule method with a
            target callback and an execution time. The schedule method
            creates a new Task object with that callback and execution
            time, and pushes it into the '_scheduler' queue. The
            execution time and the order of arrival of tasks are used to
            order the tasks in the queue.

            The _process method is executed in an independent thread
            held by the ExperimentController for as long as the
            experiment is running. This method takes tasks from the
            '_scheduler' queue in a loop and processes them in parallel
            using multithreading. The environmental variable
            NEPI_NTHREADS can be used to control the number of threads
            used to process tasks. The default value is 50.

            To execute tasks in parallel, a ParallelRunner (PR) object
            is used. This object keeps a pool of threads (workers), and
            a queue of tasks scheduled for 'immediate' execution.

            On each iteration, the '_process' loop will take the next
            task that is scheduled for 'future' execution from the
            '_scheduler' queue, and if the execution time of that task
            is >= to the current time, it will push that task into the
            PR for 'immediate execution'. As soon as a worker is free,
            the PR will assign the next task to that worker.

            Upon receiving a task to execute, each PR worker (thread)
            will invoke the _execute method of the EC, passing the task
            as argument. The _execute method will then invoke
            task.callback inside a try/except block. If an exception is
            raised by the tasks.callback, it will be trapped by the try
            block, logged to standard error (usually the console), and
            the task will be marked as failed.
        """
        self._nthreads = int(
            os.environ.get("NEPI_NTHREADS", str(self._nthreads)))

        self._runner = ParallelRun(maxthreads=self.nthreads)
        self._runner.start()

        while not self._stop:
            try:
                self._cond.acquire()

                task = self._scheduler.next()

                if not task:
                    # No task to execute. Wait for a new task to be
                    # scheduled.
                    self._cond.wait()
                else:
                    # The task timestamp is in the future. Wait for
                    # timeout or until another task is scheduled.
                    now = tnow()
                    if now < task.timestamp:
                        # Calculate timeout in seconds
                        timeout = tdiffsec(task.timestamp, now)

                        # Re-schedule task with the same timestamp
                        self._scheduler.schedule(task)

                        task = None

                        # Wait timeout or until a new task awakes the
                        # condition
                        self._cond.wait(timeout)

                self._cond.release()

                if task:
                    # Process tasks in parallel
                    self._runner.put(self._execute, task)
            except:
                import traceback
                err = traceback.format_exc()
                self.logger.error(
                    "Error while processing tasks in the EC: %s" % err)

                # Set the EC to FAILED state
                self._state = ECState.FAILED

                # Set the FailureManager failure level to EC failure
                self._fm.set_ec_failure()

        self.logger.debug("Exiting the task processing loop ... ")

        self._runner.sync()
        self._runner.destroy()

    def _execute(self, task):
        """ Executes a single task.

        :param task: Object containing the callback to execute
        :type task: Task
        """
        try:
            # Invoke callback
            task.result = task.callback()
            task.status = TaskStatus.DONE
        except:
            import traceback
            err = traceback.format_exc()
            task.result = err
            task.status = TaskStatus.ERROR

            self.logger.error(
                "Error occurred while executing task: %s" % err)

    def _notify(self):
        """ Awakes the processing thread if it is blocked waiting
        for new tasks to arrive
        """
        self._cond.acquire()
        self._cond.notify()
        self._cond.release()

    def _build_from_netgraph(self, add_node_callback, add_edge_callback,
            **kwargs):
        """ Automates experiment description using a NetGraph instance.
        """
        self._netgraph = NetGraph(**kwargs)

        if add_node_callback:
            ### Add resources to the EC
            for nid in self.netgraph.nodes():
                add_node_callback(self, nid)

        if add_edge_callback:
            #### Add connections between resources
            for nid1, nid2 in self.netgraph.edges():
                add_edge_callback(self, nid1, nid2)
def _do_in_factory_order(self, action, order, postaction = None, poststep = None):
    """
    Performs `action` on every created element, processed factory by factory
    in the factory order given by `order`.

    :param action: Either the name of a factory method (str) to invoke as
        ``getattr(factory, action)(self, guid)``, or a callable invoked as
        ``action(self, guid)``
    :param order: Iterable of factory ids; an entry may be wrapped in a
        ``Parallel()`` marker to request threaded execution for that factory
    :param postaction: Optional callable ``postaction(self, guid)`` run right
        after `action` for each element
    :param poststep: Optional callable ``poststep(self, guid)`` run for each
        element after ALL elements of the factory finished `action`
    """
    logger = self._logger

    # Group element guids by the factory id that created them
    # (self._create maps guid -> factory_id).
    guids = collections.defaultdict(list)
    # order guids (elements) according to factory_id
    for guid, factory_id in self._create.iteritems():
        guids[factory_id].append(guid)

    # configure elements following the factory_id order
    for factory_id in order:
        # Create a parallel runner if we're given a Parallel() wrapper
        runner = None
        if isinstance(factory_id, Parallel):
            runner = ParallelRun(factory_id.maxthreads)
            factory_id = factory_id.factory

        # omit the factories that have no element to create
        if factory_id not in guids:
            continue

        # configure action
        factory = self._factories[factory_id]
        # A string action naming a falsy/absent factory hook is skipped for
        # the whole factory.
        if isinstance(action, basestring) and not getattr(factory, action):
            continue

        def perform_action(guid):
            # Runs the action (factory hook or plain callable) and then the
            # optional postaction for a single element.
            # NOTE: closes over `factory` and `action`; safe because the
            # runner is synced before the next factory iteration rebinds them.
            if isinstance(action, basestring):
                getattr(factory, action)(self, guid)
            else:
                action(self, guid)
            if postaction:
                postaction(self, guid)

        # perform the action on all elements, in parallel if so requested
        if runner:
            logger.debug("TestbedController: Starting parallel %s", action)
            runner.start()

        for guid in guids[factory_id]:
            if runner:
                logger.debug("TestbedController: Scheduling %s on %s", action, guid)
                runner.put(perform_action, guid)
            else:
                logger.debug("TestbedController: Performing %s on %s", action, guid)
                perform_action(guid)

        # sync
        # Wait for all scheduled perform_action tasks before the post hook.
        if runner:
            runner.sync()

        # post hook
        if poststep:
            for guid in guids[factory_id]:
                if runner:
                    logger.debug("TestbedController: Scheduling post-%s on %s", action, guid)
                    runner.put(poststep, self, guid)
                else:
                    logger.debug("TestbedController: Performing post-%s on %s", action, guid)
                    poststep(self, guid)

            # sync
            # NOTE(review): join() appears to run only when poststep is
            # given — confirm the runner's worker threads are reclaimed on
            # the poststep-less path as well.
            if runner:
                runner.join()

        logger.debug("TestbedController: Finished parallel %s", action)
def _do_in_factory_order(self, action, order, postaction=None, poststep=None):
    """
    Runs `action` over every created element, one factory at a time,
    following the factory ordering in `order`.

    :param action: Name of a factory hook (str), invoked as
        ``getattr(factory, action)(self, guid)``, or a callable invoked
        as ``action(self, guid)``
    :param order: Iterable of factory ids; wrap an entry in ``Parallel()``
        to process that factory's elements with a thread pool
    :param postaction: Optional per-element callable run immediately after
        `action`
    :param poststep: Optional per-element callable run once every element
        of the factory completed `action`
    """
    log = self._logger

    # Bucket the element guids by their owning factory id.
    by_factory = collections.defaultdict(list)
    for elem_guid, fid in self._create.iteritems():
        by_factory[fid].append(elem_guid)

    for entry in order:
        runner = None
        fid = entry
        if isinstance(entry, Parallel):
            # A Parallel() wrapper requests a thread pool of the given size.
            runner = ParallelRun(entry.maxthreads)
            fid = entry.factory

        # Factories that created nothing are ignored.
        if fid not in by_factory:
            continue

        factory = self._factories[fid]
        if isinstance(action, basestring) and not getattr(factory, action):
            continue

        def do_one(elem_guid):
            # Apply the hook (or callable) plus the optional postaction
            # to a single element.
            if isinstance(action, basestring):
                getattr(factory, action)(self, elem_guid)
            else:
                action(self, elem_guid)
            if postaction:
                postaction(self, elem_guid)

        if runner:
            log.debug("TestbedController: Starting parallel %s", action)
            runner.start()

        for elem_guid in by_factory[fid]:
            if not runner:
                log.debug(
                    "TestbedController: Performing %s on %s",
                    action, elem_guid)
                do_one(elem_guid)
            else:
                log.debug(
                    "TestbedController: Scheduling %s on %s",
                    action, elem_guid)
                runner.put(do_one, elem_guid)

        # Drain the pool before moving on to the post hook.
        if runner:
            runner.sync()

        if poststep:
            for elem_guid in by_factory[fid]:
                if not runner:
                    log.debug(
                        "TestbedController: Performing post-%s on %s",
                        action, elem_guid)
                    poststep(self, elem_guid)
                else:
                    log.debug(
                        "TestbedController: Scheduling post-%s on %s",
                        action, elem_guid)
                    runner.put(poststep, self, elem_guid)

            if runner:
                runner.join()

        log.debug("TestbedController: Finished parallel %s", action)
class ExperimentController(object): """ .. note:: An experiment, or scenario, is defined by a concrete set of resources, and the behavior, configuration and interconnection of those resources. The Experiment Description (ED) is a detailed representation of a single experiment. It contains all the necessary information to allow repeating the experiment. NEPI allows to describe experiments by registering components (resources), configuring them and interconnecting them. A same experiment (scenario) can be executed many times, generating different results. We call an experiment execution (instance) a 'run'. The ExperimentController (EC), is the entity responsible of managing an experiment run. The same scenario can be recreated (and re-run) by instantiating an EC and recreating the same experiment description. An experiment is represented as a graph of interconnected resources. A resource is a generic concept in the sense that any component taking part of an experiment, whether physical of virtual, is considered a resource. A resources could be a host, a virtual machine, an application, a simulator, a IP address. A ResourceManager (RM), is the entity responsible for managing a single resource. ResourceManagers are specific to a resource type (i.e. An RM to control a Linux application will not be the same as the RM used to control a ns-3 simulation). To support a new type of resource, a new RM must be implemented. NEPI already provides a variety of RMs to control basic resources, and new can be extended from the existing ones. Through the EC interface the user can create ResourceManagers (RMs), configure them and interconnect them, to describe an experiment. Describing an experiment through the EC does not run the experiment. Only when the 'deploy()' method is invoked on the EC, the EC will take actions to transform the 'described' experiment into a 'running' experiment. 
While the experiment is running, it is possible to continue to create/configure/connect RMs, and to deploy them to involve new resources in the experiment (this is known as 'interactive' deployment). An experiments in NEPI is identified by a string id, which is either given by the user, or automatically generated by NEPI. The purpose of this identifier is to separate files and results that belong to different experiment scenarios. However, since a same 'experiment' can be run many times, the experiment id is not enough to identify an experiment instance (run). For this reason, the ExperimentController has two identifier, the exp_id, which can be re-used in different ExperimentController, and the run_id, which is unique to one ExperimentController instance, and is automatically generated by NEPI. """ @classmethod def load(cls, filepath, format = SFormats.XML): serializer = ECSerializer() ec = serializer.load(filepath) return ec def __init__(self, exp_id = None, local_dir = None, persist = False, fm = None, add_node_callback = None, add_edge_callback = None, **kwargs): """ ExperimentController entity to model an execute a network experiment. :param exp_id: Human readable name to identify the experiment :type exp_id: str :param local_dir: Path to local directory where to store experiment related files :type local_dir: str :param persist: Save an XML description of the experiment after completion at local_dir :type persist: bool :param fm: FailureManager object. 
If None is given, the default FailureManager class will be used :type fm: FailureManager :param add_node_callback: Callback to invoke for node instantiation when automatic topology creation mode is used :type add_node_callback: function :param add_edge_callback: Callback to invoke for edge instantiation when automatic topology creation mode is used :type add_edge_callback: function """ super(ExperimentController, self).__init__() # Logging self._logger = logging.getLogger("ExperimentController") # Run identifier. It identifies a concrete execution instance (run) # of an experiment. # Since a same experiment (same configuration) can be executed many # times, this run_id permits to separate result files generated on # different experiment executions self._run_id = tsformat() # Experiment identifier. Usually assigned by the user # Identifies the experiment scenario (i.e. configuration, # resources used, etc) self._exp_id = exp_id or "exp-%s" % os.urandom(8).encode('hex') # Local path where to store experiment related files (results, etc) if not local_dir: local_dir = tempfile.gettempdir() # /tmp self._local_dir = local_dir self._exp_dir = os.path.join(local_dir, self.exp_id) self._run_dir = os.path.join(self.exp_dir, self.run_id) # If True persist the experiment controller in XML format, after completion self._persist = persist # generator of globally unique ids self._guid_generator = guid.GuidGenerator() # Resource managers self._resources = dict() # Scheduler. 
It a queue that holds tasks scheduled for # execution, and yields the next task to be executed # ordered by execution and arrival time self._scheduler = HeapScheduler() # Tasks self._tasks = dict() # RM groups (for deployment) self._groups = dict() # generator of globally unique id for groups self._group_id_generator = guid.GuidGenerator() # Flag to stop processing thread self._stop = False # Entity in charge of managing system failures if not fm: self._fm = FailureManager() self._fm.set_ec(self) # EC state self._state = ECState.RUNNING # Automatically construct experiment description self._netgraph = None if add_node_callback or add_edge_callback or kwargs.get("topology"): self._build_from_netgraph(add_node_callback, add_edge_callback, **kwargs) # The runner is a pool of threads used to parallelize # execution of tasks self._nthreads = 20 self._runner = None # Event processing thread self._cond = threading.Condition() self._thread = threading.Thread(target = self._process) self._thread.setDaemon(True) self._thread.start() @property def logger(self): """ Returns the logger instance of the Experiment Controller """ return self._logger @property def fm(self): """ Returns the failure manager """ return self._fm @property def failure_level(self): """ Returns the level of FAILURE of th experiment """ return self._fm._failure_level @property def ecstate(self): """ Returns the state of the Experiment Controller """ return self._state @property def exp_id(self): """ Returns the experiment id assigned by the user """ return self._exp_id @property def run_id(self): """ Returns the experiment instance (run) identifier (automatically generated) """ return self._run_id @property def nthreads(self): """ Returns the number of processing nthreads used """ return self._nthreads @property def local_dir(self): """ Root local directory for experiment files """ return self._local_dir @property def exp_dir(self): """ Local directory to store results and other files related to the 
experiment. """ return self._exp_dir @property def run_dir(self): """ Local directory to store results and other files related to the experiment run. """ return self._run_dir @property def persist(self): """ If True, persists the ExperimentController to XML format upon experiment completion """ return self._persist @property def netgraph(self): """ Return NetGraph instance if experiment description was automatically generated """ return self._netgraph @property def abort(self): """ Returns True if the experiment has failed and should be interrupted, False otherwise. """ return self._fm.abort def inform_failure(self, guid): """ Reports a failure in a RM to the EC for evaluation :param guid: Resource id :type guid: int """ return self._fm.eval_failure(guid) def wait_finished(self, guids): """ Blocking method that waits until all RMs in the 'guids' list have reached a state >= STOPPED (i.e. STOPPED, FAILED or RELEASED ), or until a failure in the experiment occurs (i.e. abort == True) :param guids: List of guids :type guids: list """ def quit(): return self.abort return self.wait(guids, state = ResourceState.STOPPED, quit = quit) def wait_started(self, guids): """ Blocking method that waits until all RMs in the 'guids' list have reached a state >= STARTED, or until a failure in the experiment occurs (i.e. abort == True) :param guids: List of guids :type guids: list """ def quit(): return self.abort return self.wait(guids, state = ResourceState.STARTED, quit = quit) def wait_released(self, guids): """ Blocking method that waits until all RMs in the 'guids' list have reached a state == RELEASED, or until the EC fails :param guids: List of guids :type guids: list """ def quit(): return self._state == ECState.FAILED return self.wait(guids, state = ResourceState.RELEASED, quit = quit) def wait_deployed(self, guids): """ Blocking method that waits until all RMs in the 'guids' list have reached a state >= READY, or until a failure in the experiment occurs (i.e. 
abort == True) :param guids: List of guids :type guids: list """ def quit(): return self.abort return self.wait(guids, state = ResourceState.READY, quit = quit) def wait(self, guids, state, quit): """ Blocking method that waits until all RMs in the 'guids' list have reached a state >= 'state', or until the 'quit' callback yields True :param guids: List of guids :type guids: list """ if isinstance(guids, int): guids = [guids] # Make a copy to avoid modifying the original guids list guids = list(guids) while True: # If there are no more guids to wait for # or the quit function returns True, exit the loop if len(guids) == 0 or quit(): break # If a guid reached one of the target states, remove it from list guid = guids.pop() rm = self.get_resource(guid) rstate = rm.state if rstate >= state: self.logger.debug(" %s guid %d DONE - state is %s, required is >= %s " % ( rm.get_rtype(), guid, rstate, state)) else: # Debug... self.logger.debug(" WAITING FOR guid %d - state is %s, required is >= %s " % ( guid, rstate, state)) guids.append(guid) time.sleep(0.5) def plot(self, dirpath = None, format= PFormats.FIGURE, show = False): plotter = ECPlotter() fpath = plotter.plot(self, dirpath = dirpath, format= format, show = show) return fpath def serialize(self, format = SFormats.XML): serializer = ECSerializer() sec = serializer.load(self, format = format) return sec def save(self, dirpath = None, format = SFormats.XML): if dirpath == None: dirpath = self.run_dir try: os.makedirs(dirpath) except OSError: pass serializer = ECSerializer() path = serializer.save(self, dirpath, format = format) return path def get_task(self, tid): """ Returns a task by its id :param tid: Id of the task :type tid: int :rtype: Task """ return self._tasks.get(tid) def get_resource(self, guid): """ Returns a registered ResourceManager by its guid :param guid: Id of the resource :type guid: int :rtype: ResourceManager """ rm = self._resources.get(guid) return rm def get_resources_by_type(self, rtype): """ 
Returns the ResourceManager objects of type rtype :param rtype: Resource type :type rtype: string :rtype: list of ResourceManagers """ rms = [] for guid, rm in self._resources.iteritems(): if rm.get_rtype() == rtype: rms.append(rm) return rms def remove_resource(self, guid): del self._resources[guid] @property def resources(self): """ Returns the guids of all ResourceManagers :return: Set of all RM guids :rtype: list """ keys = self._resources.keys() return keys def filter_resources(self, rtype): """ Returns the guids of all ResourceManagers of type rtype :param rtype: Resource type :type rtype: string :rtype: list of guids """ rms = [] for guid, rm in self._resources.iteritems(): if rm.get_rtype() == rtype: rms.append(rm.guid) return rms def register_resource(self, rtype, guid = None): """ Registers a new ResourceManager of type 'rtype' in the experiment This method will assign a new 'guid' for the RM, if no guid is specified. :param rtype: Type of the RM :type rtype: str :return: Guid of the RM :rtype: int """ # Get next available guid guid = self._guid_generator.next(guid) # Instantiate RM rm = ResourceFactory.create(rtype, self, guid) # Store RM self._resources[guid] = rm return guid def get_attributes(self, guid): """ Returns all the attributes of the RM with guid 'guid' :param guid: Guid of the RM :type guid: int :return: List of attributes :rtype: list """ rm = self.get_resource(guid) return rm.get_attributes() def get_attribute(self, guid, name): """ Returns the attribute 'name' of the RM with guid 'guid' :param guid: Guid of the RM :type guid: int :param name: Name of the attribute :type name: str :return: The attribute with name 'name' :rtype: Attribute """ rm = self.get_resource(guid) return rm.get_attribute(name) def register_connection(self, guid1, guid2): """ Registers a connection between a RM with guid 'guid1' and another RM with guid 'guid2'. 
The order of the in which the two guids are provided is not important, since the connection relationship is symmetric. :param guid1: First guid to connect :type guid1: ResourceManager :param guid2: Second guid to connect :type guid: ResourceManager """ rm1 = self.get_resource(guid1) rm2 = self.get_resource(guid2) rm1.register_connection(guid2) rm2.register_connection(guid1) def register_condition(self, guids1, action, guids2, state, time = None): """ Registers an action START, STOP or DEPLOY for all RM on list guids1 to occur at time 'time' after all elements in list guids2 have reached state 'state'. :param guids1: List of guids of RMs subjected to action :type guids1: list :param action: Action to perform (either START, STOP or DEPLOY) :type action: ResourceAction :param guids2: List of guids of RMs to we waited for :type guids2: list :param state: State to wait for on RMs of list guids2 (STARTED, STOPPED, etc) :type state: ResourceState :param time: Time to wait after guids2 has reached status :type time: string """ if isinstance(guids1, int): guids1 = [guids1] if isinstance(guids2, int): guids2 = [guids2] for guid1 in guids1: rm = self.get_resource(guid1) rm.register_condition(action, guids2, state, time) def enable_trace(self, guid, name): """ Enables a trace to be collected during the experiment run :param name: Name of the trace :type name: str """ rm = self.get_resource(guid) rm.enable_trace(name) def trace_enabled(self, guid, name): """ Returns True if the trace of name 'name' is enabled :param name: Name of the trace :type name: str """ rm = self.get_resource(guid) return rm.trace_enabled(name) def trace(self, guid, name, attr = TraceAttr.ALL, block = 512, offset = 0): """ Returns information on a collected trace, the trace stream or blocks (chunks) of the trace stream :param name: Name of the trace :type name: str :param attr: Can be one of: - TraceAttr.ALL (complete trace content), - TraceAttr.STREAM (block in bytes to read starting at offset), - 
TraceAttr.PATH (full path to the trace file), - TraceAttr.SIZE (size of trace file). :type attr: str :param block: Number of bytes to retrieve from trace, when attr is TraceAttr.STREAM :type name: int :param offset: Number of 'blocks' to skip, when attr is TraceAttr.STREAM :type name: int :rtype: str """ rm = self.get_resource(guid) return rm.trace(name, attr, block, offset) def get_traces(self, guid): """ Returns the list of the trace names of the RM with guid 'guid' :param guid: Guid of the RM :type guid: int :return: List of trace names :rtype: list """ rm = self.get_resource(guid) return rm.get_traces() def discover(self, guid): """ Discovers an available resource matching the criteria defined by the RM with guid 'guid', and associates that resource to the RM Not all RM types require (or are capable of) performing resource discovery. For the RM types which are not capable of doing so, invoking this method does not have any consequences. :param guid: Guid of the RM :type guid: int """ rm = self.get_resource(guid) return rm.discover() def provision(self, guid): """ Provisions the resource associated to the RM with guid 'guid'. Provisioning means making a resource 'accessible' to the user. Not all RM types require (or are capable of) performing resource provisioning. For the RM types which are not capable of doing so, invoking this method does not have any consequences. :param guid: Guid of the RM :type guid: int """ rm = self.get_resource(guid) return rm.provision() def get(self, guid, name): """ Returns the value of the attribute with name 'name' on the RM with guid 'guid' :param guid: Guid of the RM :type guid: int :param name: Name of the attribute :type name: str :return: The value of the attribute with name 'name' """ rm = self.get_resource(guid) return rm.get(name) def set(self, guid, name, value): """ Modifies the value of the attribute with name 'name' on the RM with guid 'guid'. 
:param guid: Guid of the RM :type guid: int :param name: Name of the attribute :type name: str :param value: Value of the attribute """ rm = self.get_resource(guid) rm.set(name, value) def get_global(self, rtype, name): """ Returns the value of the global attribute with name 'name' on the RMs of rtype 'rtype'. :param guid: Guid of the RM :type guid: int :param name: Name of the attribute :type name: str :return: The value of the attribute with name 'name' """ rclass = ResourceFactory.get_resource_type(rtype) return rclass.get_global(name) def set_global(self, rtype, name, value): """ Modifies the value of the global attribute with name 'name' on the RMs of with rtype 'rtype'. :param guid: Guid of the RM :type guid: int :param name: Name of the attribute :type name: str :param value: Value of the attribute """ rclass = ResourceFactory.get_resource_type(rtype) return rclass.set_global(name, value) def state(self, guid, hr = False): """ Returns the state of a resource :param guid: Resource guid :type guid: integer :param hr: Human readable. Forces return of a status string instead of a number :type hr: boolean """ rm = self.get_resource(guid) state = rm.state if hr: return ResourceState2str.get(state) return state def stop(self, guid): """ Stops the RM with guid 'guid' Stopping a RM means that the resource it controls will no longer take part of the experiment. :param guid: Guid of the RM :type guid: int """ rm = self.get_resource(guid) return rm.stop() def start(self, guid): """ Starts the RM with guid 'guid' Starting a RM means that the resource it controls will begin taking part of the experiment. 
:param guid: Guid of the RM :type guid: int """ rm = self.get_resource(guid) return rm.start() def get_start_time(self, guid): """ Returns the start time of the RM as a timestamp """ rm = self.get_resource(guid) return rm.start_time def get_stop_time(self, guid): """ Returns the stop time of the RM as a timestamp """ rm = self.get_resource(guid) return rm.stop_time def get_discover_time(self, guid): """ Returns the discover time of the RM as a timestamp """ rm = self.get_resource(guid) return rm.discover_time def get_provision_time(self, guid): """ Returns the provision time of the RM as a timestamp """ rm = self.get_resource(guid) return rm.provision_time def get_ready_time(self, guid): """ Returns the deployment time of the RM as a timestamp """ rm = self.get_resource(guid) return rm.ready_time def get_release_time(self, guid): """ Returns the release time of the RM as a timestamp """ rm = self.get_resource(guid) return rm.release_time def get_failed_time(self, guid): """ Returns the time failure occured for the RM as a timestamp """ rm = self.get_resource(guid) return rm.failed_time def set_with_conditions(self, name, value, guids1, guids2, state, time = None): """ Modifies the value of attribute with name 'name' on all RMs on the guids1 list when time 'time' has elapsed since all elements in guids2 list have reached state 'state'. 
:param name: Name of attribute to set in RM :type name: string :param value: Value of attribute to set in RM :type name: string :param guids1: List of guids of RMs subjected to action :type guids1: list :param action: Action to register (either START or STOP) :type action: ResourceAction :param guids2: List of guids of RMs to we waited for :type guids2: list :param state: State to wait for on RMs (STARTED, STOPPED, etc) :type state: ResourceState :param time: Time to wait after guids2 has reached status :type time: string """ if isinstance(guids1, int): guids1 = [guids1] if isinstance(guids2, int): guids2 = [guids2] for guid1 in guids1: rm = self.get_resource(guid) rm.set_with_conditions(name, value, guids2, state, time) def deploy(self, guids = None, wait_all_ready = True, group = None): """ Deploys all ResourceManagers in the guids list. If the argument 'guids' is not given, all RMs with state NEW are deployed. :param guids: List of guids of RMs to deploy :type guids: list :param wait_all_ready: Wait until all RMs are ready in order to start the RMs :type guid: int :param group: Id of deployment group in which to deploy RMs :type group: int """ self.logger.debug(" ------- DEPLOY START ------ ") if not guids: # If no guids list was passed, all 'NEW' RMs will be deployed guids = [] for guid, rm in self._resources.iteritems(): if rm.state == ResourceState.NEW: guids.append(guid) if isinstance(guids, int): guids = [guids] # Create deployment group # New guids can be added to a same deployment group later on new_group = False if not group: new_group = True group = self._group_id_generator.next() if group not in self._groups: self._groups[group] = [] self._groups[group].extend(guids) def wait_all_and_start(group): # Function that checks if all resources are READY # before scheduling a start_with_conditions for each RM reschedule = False # Get all guids in group guids = self._groups[group] for guid in guids: if self.state(guid) < ResourceState.READY: reschedule = True 
break if reschedule: callback = functools.partial(wait_all_and_start, group) self.schedule("1s", callback) else: # If all resources are ready, we schedule the start for guid in guids: rm = self.get_resource(guid) self.schedule("0s", rm.start_with_conditions) if rm.conditions.get(ResourceAction.STOP): # Only if the RM has STOP conditions we # schedule a stop. Otherwise the RM will stop immediately self.schedule("0s", rm.stop_with_conditions) if wait_all_ready and new_group: # Schedule a function to check that all resources are # READY, and only then schedule the start. # This aims at reducing the number of tasks looping in the # scheduler. # Instead of having many start tasks, we will have only one for # the whole group. callback = functools.partial(wait_all_and_start, group) self.schedule("0s", callback) for guid in guids: rm = self.get_resource(guid) rm.deployment_group = group self.schedule("0s", rm.deploy_with_conditions) if not wait_all_ready: self.schedule("0s", rm.start_with_conditions) if rm.conditions.get(ResourceAction.STOP): # Only if the RM has STOP conditions we # schedule a stop. Otherwise the RM will stop immediately self.schedule("0s", rm.stop_with_conditions) def release(self, guids = None): """ Releases all ResourceManagers in the guids list. If the argument 'guids' is not given, all RMs registered in the experiment are released. 
:param guids: List of RM guids :type guids: list """ if self._state == ECState.RELEASED: return if isinstance(guids, int): guids = [guids] if not guids: guids = self.resources for guid in guids: rm = self.get_resource(guid) self.schedule("0s", rm.release) self.wait_released(guids) if self.persist: self.save() for guid in guids: if self.get(guid, "hardRelease"): self.remove_resource(guid)\ # Mark the EC state as RELEASED self._state = ECState.RELEASED def shutdown(self): """ Releases all resources and stops the ExperimentController """ # If there was a major failure we can't exit gracefully if self._state == ECState.FAILED: raise RuntimeError("EC failure. Can not exit gracefully") # Remove all pending tasks from the scheduler queue for tid in list(self._scheduler.pending): self._scheduler.remove(tid) # Remove pending tasks from the workers queue self._runner.empty() self.release() # Mark the EC state as TERMINATED self._state = ECState.TERMINATED # Stop processing thread self._stop = True # Notify condition to wake up the processing thread self._notify() if self._thread.is_alive(): self._thread.join() def schedule(self, date, callback, track = False): """ Schedules a callback to be executed at time 'date'. :param date: string containing execution time for the task. It can be expressed as an absolute time, using timestamp format, or as a relative time matching ^\d+.\d+(h|m|s|ms|us)$ :param callback: code to be executed for the task. Must be a Python function, and receives args and kwargs as arguments. :param track: if set to True, the task will be retrievable with the get_task() method :return : The Id of the task :rtype: int """ timestamp = stabsformat(date) task = Task(timestamp, callback) task = self._scheduler.schedule(task) if track: self._tasks[task.id] = task # Notify condition to wake up the processing thread self._notify() return task.id def _process(self): """ Process scheduled tasks. .. 
note:: Tasks are scheduled by invoking the schedule method with a target callback and an execution time. The schedule method creates a new Task object with that callback and execution time, and pushes it into the '_scheduler' queue. The execution time and the order of arrival of tasks are used to order the tasks in the queue. The _process method is executed in an independent thread held by the ExperimentController for as long as the experiment is running. This method takes tasks from the '_scheduler' queue in a loop and processes them in parallel using multithreading. The environmental variable NEPI_NTHREADS can be used to control the number of threads used to process tasks. The default value is 50. To execute tasks in parallel, a ParallelRunner (PR) object is used. This object keeps a pool of threads (workers), and a queue of tasks scheduled for 'immediate' execution. On each iteration, the '_process' loop will take the next task that is scheduled for 'future' execution from the '_scheduler' queue, and if the execution time of that task is >= to the current time, it will push that task into the PR for 'immediate execution'. As soon as a worker is free, the PR will assign the next task to that worker. Upon receiving a task to execute, each PR worker (thread) will invoke the _execute method of the EC, passing the task as argument. The _execute method will then invoke task.callback inside a try/except block. If an exception is raised by the tasks.callback, it will be trapped by the try block, logged to standard error (usually the console), and the task will be marked as failed. """ self._nthreads = int(os.environ.get("NEPI_NTHREADS", str(self._nthreads))) self._runner = ParallelRun(maxthreads = self.nthreads) self._runner.start() while not self._stop: try: self._cond.acquire() task = self._scheduler.next() if not task: # No task to execute. Wait for a new task to be scheduled. self._cond.wait() else: # The task timestamp is in the future. 
Wait for timeout # or until another task is scheduled. now = tnow() if now < task.timestamp: # Calculate timeout in seconds timeout = tdiffsec(task.timestamp, now) # Re-schedule task with the same timestamp self._scheduler.schedule(task) task = None # Wait timeout or until a new task awakes the condition self._cond.wait(timeout) self._cond.release() if task: # Process tasks in parallel self._runner.put(self._execute, task) except: import traceback err = traceback.format_exc() self.logger.error("Error while processing tasks in the EC: %s" % err) # Set the EC to FAILED state self._state = ECState.FAILED # Set the FailureManager failure level to EC failure self._fm.set_ec_failure() self.logger.debug("Exiting the task processing loop ... ") self._runner.sync() self._runner.destroy() def _execute(self, task): """ Executes a single task. :param task: Object containing the callback to execute :type task: Task """ try: # Invoke callback task.result = task.callback() task.status = TaskStatus.DONE except: import traceback err = traceback.format_exc() task.result = err task.status = TaskStatus.ERROR self.logger.error("Error occurred while executing task: %s" % err) def _notify(self): """ Awakes the processing thread if it is blocked waiting for new tasks to arrive """ self._cond.acquire() self._cond.notify() self._cond.release() def _build_from_netgraph(self, add_node_callback, add_edge_callback, **kwargs): """ Automates experiment description using a NetGraph instance. """ self._netgraph = NetGraph(**kwargs) if add_node_callback: ### Add resources to the EC for nid in self.netgraph.nodes(): add_node_callback(self, nid) if add_edge_callback: #### Add connections between resources for nid1, nid2 in self.netgraph.edges(): add_edge_callback(self, nid1, nid2)
def do_resource_discovery(self, recover = False): to_provision = self._to_provision = set() reserved = set(self._blacklist) for guid, node in self._elements.iteritems(): if isinstance(node, self._node.Node) and node._node_id is not None: reserved.add(node.hostname) # Initial algo: # look for perfectly defined nodes # (ie: those with only one candidate) reserve_lock = threading.RLock() def assignifunique(guid, node): # Try existing nodes first # If we have only one candidate, simply use it candidates = node.find_candidates( filter_slice_id = self.slice_id) node_id = None candidate_hosts = set(candidates.keys() if candidates else []) reserve_lock.acquire() try: candidate_hosts -= reserved if len(candidate_hosts) == 1: hostname = iter(candidate_hosts).next() node_id = candidates[hostname] reserved.add(hostname) elif not candidate_hosts: # Try again including unassigned nodes reserve_lock.release() try: candidates = node.find_candidates() finally: reserve_lock.acquire() candidate_hosts = set(candidates.keys() if candidates else []) candidate_hosts -= reserved if len(candidate_hosts) > 1: return if len(candidate_hosts) == 1: hostname = iter(candidate_hosts).next() node_id = candidates[hostname] to_provision.add(node_id) reserved.add(hostname) elif not candidates: raise RuntimeError, "Cannot assign resources for node %s, no candidates with %s" % (guid, node.make_filter_description()) finally: reserve_lock.release() if node_id is not None: node.assign_node_id(node_id) runner = ParallelRun(maxthreads=4) # don't overload the PLC API, just 4 threads to hide latencies and that's it runner.start() for guid, node in self._elements.iteritems(): if isinstance(node, self._node.Node) and node._node_id is None: runner.put(assignifunique, guid, node) runner.sync() # Now do the backtracking search for a suitable solution # First with existing slice nodes reqs = [] nodes = [] def genreqs(node, filter_slice_id=None): # Try existing nodes first # If we have only one candidate, simply use 
it candidates = node.find_candidates( filter_slice_id = filter_slice_id) for r in reserved: if candidates.has_key(r): del candidates[r] reqs.append(candidates.values()) nodes.append(node) for guid, node in self._elements.iteritems(): if isinstance(node, self._node.Node) and node._node_id is None: runner.put(genreqs, node, self.slice_id) runner.sync() if nodes and reqs: if recover: raise RuntimeError, "Impossible to recover: unassigned host for Nodes %r" % (nodes,) def pickbest(fullset, nreq, node=nodes[0]): if len(fullset) > nreq: fullset = zip(node.rate_nodes(fullset),fullset) fullset.sort(reverse=True) del fullset[nreq:] return set(map(operator.itemgetter(1),fullset)) else: return fullset try: solution = resourcealloc.alloc(reqs, sample=pickbest) except resourcealloc.ResourceAllocationError: # Failed, try again with all nodes reqs = [] for node in nodes: runner.put(genreqs, node) runner.sync() solution = resourcealloc.alloc(reqs, sample=pickbest) to_provision.update(solution) # Do assign nodes for node, node_id in zip(nodes, solution): runner.put(node.assign_node_id, node_id) runner.join()