def _launch_job(self, n_boards, spalloc_kw_args):
    """Create a spalloc job and block until it is ready.

    :param n_boards: passed to ``Job`` as its positional argument
    :param spalloc_kw_args: keyword arguments forwarded to ``Job``
    :return: the ready job and the hostname it reports
    :raises Exception: whatever waiting for the job (or reading its
        hostname) raised; the job is destroyed before re-raising
    """
    job = Job(n_boards, **spalloc_kw_args)
    try:
        job.wait_until_ready()
        # Read the hostname while still inside the guarded region (the
        # original note: get params from the job before starting, so that
        # hanging doesn't occur).
        hostname = job.hostname
    except Exception:
        # Give the job back rather than leaving it allocated.
        job.destroy()
        raise
    return job, hostname
def run_job(job_args, job_kwargs, ip_file_filename):
    """Create a job, wait for it, then run the user's command or print info.

    Reads the module-level ``args`` for the ``boot``, ``command`` and
    ``no_destroy`` options.

    :param job_args: positional arguments for ``Job``
    :param job_kwargs: keyword arguments for ``Job``
    :param ip_file_filename: file the job's connections are written to, also
        handed on to ``print_info`` / ``run_command``
    :return: 6 if the job could not be created; the non-zero code from
        ``wait_for_job_ready`` if waiting failed; otherwise 0 after printing
        the job details, or the result of ``run_command``
    """
    # Reason passed to destroy() when the job is torn down at the end.
    reason = None

    # Create the job; a connection problem is reported, not raised.
    try:
        job = Job(*job_args, **job_kwargs)
    except (OSError, IOError, ProtocolError, ProtocolTimeoutError) as e:
        message = "Could not connect to server: {}".format(e)
        info(t.red(message))
        return 6

    try:
        # Wait for the job to become ready, keeping the user informed.
        code, reason = wait_for_job_ready(job)
        if code != 0:
            return code

        # The machine is ready: record how to reach it.
        write_ips_to_csv(job.connections, ip_file_filename)

        # Boot the machine if requested and a MachineController is available.
        if MachineController is not None and args.boot:
            update("Job {}: Booting...", t.yellow, job.id)
            controller = MachineController(job.hostname)
            controller.boot(job.width, job.height)

        update("Job {}: Ready!", t.green, job.id)

        # Run the user's application if one was given...
        if args.command:
            return run_command(args.command, job.id, job.machine_name,
                               job.connections, job.width, job.height,
                               ip_file_filename)

        # ...otherwise just print the details of the allocation.
        print_info(job.machine_name, job.connections, job.width, job.height,
                   ip_file_filename)
        return 0
    finally:
        # Destroy the job unless asked to only disconnect the client.
        if not args.no_destroy:
            job.destroy(reason)
        else:
            job.close()
def _launch_job(self, n_boards, spalloc_kw_args):
    """Create a spalloc job and block until it is ready.

    :param n_boards: passed to ``Job`` as its positional argument
    :param spalloc_kw_args: keyword arguments forwarded to ``Job``
    :return: the ready job and the hostname it reports
    :raises BaseException: anything raised while waiting (including
        interrupts); the job is destroyed before re-raising
    """
    job = Job(n_boards, **spalloc_kw_args)
    try:
        job.wait_until_ready()
        # Read the hostname while still inside the guarded region (the
        # original note: get params from the job before starting, so that
        # hanging doesn't occur).
        hostname = job.hostname
    except BaseException:
        # Catch-all on purpose: the job is released whatever went wrong,
        # then the error continues to propagate.
        job.destroy()
        raise
    return job, hostname
def _launch_job(self, n_boards, spalloc_kw_args):
    """
    Create a spalloc job, wait for it to be ready and report its hostname.

    If waiting for the job (or reading its hostname) fails, the job is
    destroyed and the error is re-raised.

    :param int n_boards:
    :param dict(str, str or int) spalloc_kw_args:
    :rtype: tuple(~.Job, str)
    """
    job = Job(n_boards, **spalloc_kw_args)
    try:
        job.wait_until_ready()
        # Read the hostname while still inside the guarded region (the
        # original note: get params from the job before starting, so that
        # hanging doesn't occur).
        hostname = job.hostname
    except Exception:
        # Give the job back rather than leaving it allocated.
        job.destroy()
        raise
    return job, hostname
def __init__(self, network, dt=0.001, period=10.0, timescale=1.0,
             hostname=None, use_spalloc=None,
             allocation_fudge_factor=0.6):
    """Create a new Simulator with the given network.

    Builds the model and netlist, obtains a machine (a named host or a
    spalloc allocation), boots it, places and routes the netlist, loads
    the application and checks that no core ended up in a failure state.

    Parameters
    ----------
    network :
        Network to build; its ``config`` is consulted for the "node_io",
        "node_io_kwargs", "placer" and "placer_kwargs" options.
    dt : float
        Timestep handed to the model builder.
    period : float or None
        Duration of one period of the simulator. This determines how much
        memory will be allocated to store precomputed and probed data.
        `None` currently raises ``NotImplementedError``.
    timescale : float
        Scaling factor to apply to the simulation, e.g., a value of `0.5`
        will cause the simulation to run at half real-time.
    hostname : string or None
        Hostname of the SpiNNaker machine to use; if None then the machine
        specified in the config file will be used.
    use_spalloc : bool or None
        Allocate a SpiNNaker machine for the simulator using ``spalloc``.
        If None then the setting specified in the config file will be
        used. Ignored when `hostname` is given.

    Other Parameters
    ----------------
    allocation_fudge_factor:
        Fudge factor to allocate more cores than really necessary when
        using `spalloc` to ensure that (a) there are sufficient "live"
        cores in the allocated machine, (b) there is sufficient room for a
        good place and route solution. This should generally be more than
        0.1 (10% more cores than necessary) to account for the usual rate
        of missing chips.

    Raises
    ------
    NotImplementedError
        If `period` is None.
    Exception
        If any core is found in an exit/dead/watchdog/runtime_exception
        state after the application has been loaded.

    Notes
    -----
    If a spalloc job was allocated and any later step of construction
    fails, the job is destroyed, the simulator is marked as closed and it
    is removed from the set of open simulators before the error is
    re-raised, so that the allocated boards are not leaked.
    """
    # Add this simulator to the set of open simulators
    Simulator._add_simulator(self)

    # Create the IO controller
    io_cls = getconfig(network.config, Simulator, "node_io", Ethernet)
    io_kwargs = getconfig(network.config, Simulator, "node_io_kwargs",
                          dict())
    self.io_controller = io_cls(**io_kwargs)

    # Calculate the machine timestep, this is measured in microseconds
    # (hence the 1e6 scaling factor).
    self.timescale = timescale
    machine_timestep = int((dt / timescale) * 1e6)

    # Determine the maximum run-time
    self.max_steps = None if period is None else int(period / dt)

    self.steps = 0  # Steps simulated

    # "Run indefinitely" mode (period=None, hence max_steps=None) is not
    # implemented, so it is rejected before any expensive work is done.
    builder_kwargs = self.io_controller.builder_kwargs
    if self.max_steps is None:
        raise NotImplementedError

    # Create a model from the network, using the IO controller
    logger.debug("Building model")
    start_build = time.time()
    self.model = Model(dt=dt, machine_timestep=machine_timestep,
                       decoder_cache=get_default_decoder_cache())
    self.model.build(network, **builder_kwargs)
    logger.info("Build took {:.3f} seconds".format(time.time() -
                                                   start_build))

    self.model.decoder_cache.shrink()
    self.dt = self.model.dt
    self._closed = False  # Whether the simulator has been closed or not

    self.host_sim = self._create_host_sim()

    # Holder for probe data
    self.data = {}

    # Holder for profiling data
    self.profiler_data = {}

    # Convert the model into a netlist
    logger.info("Building netlist")
    start = time.time()
    self.netlist = self.model.make_netlist(self.max_steps or 0)

    # Determine whether to use a spalloc machine or not
    if use_spalloc is None:
        # Default is to not use spalloc; this is indicated by either the
        # absence of the option in the config file OR the option being set
        # to false.
        use_spalloc = (
            rc.has_option("spinnaker_machine", "use_spalloc") and
            rc.getboolean("spinnaker_machine", "use_spalloc"))

    # Create a controller for the machine and boot if necessary
    self.job = None
    try:
        if not use_spalloc or hostname is not None:
            # Use the specified machine rather than trying to get one
            # allocated.
            if hostname is None:
                hostname = rc.get("spinnaker_machine", "hostname")
        else:
            # Attempt to get a machine allocated to us
            from spalloc import Job

            # Determine how many boards to ask for (assuming 16 usable
            # cores per chip and 48 chips per board).
            n_cores = self.netlist.n_cores * (1.0 + allocation_fudge_factor)
            n_boards = int(np.ceil((n_cores / 16.) / 48.))

            # Request the job
            self.job = Job(n_boards)
            logger.info("Allocated job ID %d...", self.job.id)

            # Wait until we're given the machine
            logger.info("Waiting for machine allocation...")
            self.job.wait_until_ready()

            # spalloc recommends a slight delay before attempting to boot
            # the machine, later versions of spalloc server may relax this
            # requirement.
            time.sleep(5.0)

            # Store the hostname
            hostname = self.job.hostname
            logger.info("Using %d board(s) of \"%s\" (%s)",
                        len(self.job.boards), self.job.machine_name,
                        hostname)

        self.controller = MachineController(hostname)
        self.controller.boot()

        # Get a system-info object to place & route against
        logger.info("Getting SpiNNaker machine specification")
        system_info = self.controller.get_system_info()

        # Place & Route
        logger.info("Placing and routing")
        self.netlist.place_and_route(
            system_info,
            place=getconfig(network.config, Simulator,
                            'placer', rig.place_and_route.place),
            place_kwargs=getconfig(network.config, Simulator,
                                   'placer_kwargs', {}),
        )

        logger.info("{} cores in use".format(len(self.netlist.placements)))
        chips = set(six.itervalues(self.netlist.placements))
        logger.info("Using {}".format(chips))

        # Prepare the simulator against the placed, allocated and routed
        # netlist.
        self.io_controller.prepare(self.model, self.controller,
                                   self.netlist)

        # Load the application
        logger.info("Loading application")
        self.netlist.load_application(self.controller, system_info)

        # Check if any cores are in bad states
        if self.controller.count_cores_in_state(
                ["exit", "dead", "watchdog", "runtime_exception"]):
            # Report every placed core that is not waiting at sync0,
            # together with its IO buffer, before giving up.
            for vertex, (x, y) in six.iteritems(self.netlist.placements):
                p = self.netlist.allocations[vertex][Cores].start
                status = self.controller.get_processor_status(p, x, y)
                if status.cpu_state is not AppState.sync0:
                    print("Core ({}, {}, {}) in state {!s}".format(
                        x, y, p, status))
                    print(self.controller.get_iobuf(p, x, y))
            raise Exception("Unexpected core failures.")

        # NOTE: the precision was previously written as "{:3f}" (a field
        # width of 3), unlike the "{:.3f}" used for the build time above.
        logger.info(
            "Preparing and loading machine took {:.3f} seconds".format(
                time.time() - start))

        logger.info("Setting router timeout to 16 cycles")
        for x, y in system_info.chips():
            with self.controller(x=x, y=y):
                # Rewrite the word at 0xf1000000 with its last byte
                # replaced by 0x10, leaving the other three bytes as read.
                data = self.controller.read(0xf1000000, 4)
                self.controller.write(0xf1000000, data[:-1] + b'\x10')
    except BaseException:
        # Construction failed. If boards were allocated for us nobody else
        # holds a reference that could release them, so give them back now
        # and make sure nothing tries to close this simulator later.
        if self.job is not None:
            self.job.destroy()
            self.job = None
            self._closed = True
            Simulator._remove_simulator(self)
        raise
class Simulator(object):
    """SpiNNaker simulator for Nengo models.

    The simulator period determines how much data will be stored on SpiNNaker
    and is the maximum length of simulation allowed before data is transferred
    between the machine and the host PC. If the period is set to `None`
    function of time Nodes will not be optimised and probes will be disabled.
    For any other value simulation lengths of less than or equal to the period
    will be in real-time, longer simulations will be possible but will include
    short gaps when data is transferred between SpiNNaker and the host.

    .. note:: `period=None` is not implemented at present; the constructor
       raises ``NotImplementedError`` for it.

    :py:meth:`~.close` should be called when the simulator will no longer be
    used. This will close all sockets used to communicate with the SpiNNaker
    machine and will leave the machine in a clean state. Failure to call
    `close` may result in later failures. Alternatively `with` may be used::

        sim = nengo_spinnaker.Simulator(network)
        with sim:
            sim.run(10.0)
    """

    # Simulators that have been constructed and not yet closed.
    _open_simulators = set()

    @classmethod
    def _add_simulator(cls, simulator):
        """Record `simulator` in the set of open simulators."""
        cls._open_simulators.add(simulator)

    @classmethod
    def _remove_simulator(cls, simulator):
        """Remove `simulator` from the set of open simulators.

        Raises ``KeyError`` if the simulator is not in the set.
        """
        cls._open_simulators.remove(simulator)

    def __init__(self, network, dt=0.001, period=10.0, timescale=1.0,
                 hostname=None, use_spalloc=None,
                 allocation_fudge_factor=0.6):
        """Create a new Simulator with the given network.

        Builds the model and netlist, obtains a machine (a named host or a
        spalloc allocation), boots it, places and routes the netlist, loads
        the application and checks that no core ended up in a failure state.

        Parameters
        ----------
        network :
            Network to build; its ``config`` is consulted for the "node_io",
            "node_io_kwargs", "placer" and "placer_kwargs" options.
        dt : float
            Timestep handed to the model builder.
        period : float or None
            Duration of one period of the simulator. This determines how much
            memory will be allocated to store precomputed and probed data.
            `None` currently raises ``NotImplementedError``.
        timescale : float
            Scaling factor to apply to the simulation, e.g., a value of `0.5`
            will cause the simulation to run at half real-time.
        hostname : string or None
            Hostname of the SpiNNaker machine to use; if None then the machine
            specified in the config file will be used.
        use_spalloc : bool or None
            Allocate a SpiNNaker machine for the simulator using ``spalloc``.
            If None then the setting specified in the config file will be
            used. Ignored when `hostname` is given.

        Other Parameters
        ----------------
        allocation_fudge_factor:
            Fudge factor to allocate more cores than really necessary when
            using `spalloc` to ensure that (a) there are sufficient "live"
            cores in the allocated machine, (b) there is sufficient room for a
            good place and route solution. This should generally be more than
            0.1 (10% more cores than necessary) to account for the usual rate
            of missing chips.

        Raises
        ------
        NotImplementedError
            If `period` is None.
        Exception
            If any core is found in an exit/dead/watchdog/runtime_exception
            state after the application has been loaded.

        Notes
        -----
        If a spalloc job was allocated and any later step of construction
        fails, the job is destroyed, the simulator is marked as closed and it
        is removed from the set of open simulators before the error is
        re-raised, so that the allocated boards are not leaked.
        """
        # Add this simulator to the set of open simulators
        Simulator._add_simulator(self)

        # Create the IO controller
        io_cls = getconfig(network.config, Simulator, "node_io", Ethernet)
        io_kwargs = getconfig(network.config, Simulator, "node_io_kwargs",
                              dict())
        self.io_controller = io_cls(**io_kwargs)

        # Calculate the machine timestep, this is measured in microseconds
        # (hence the 1e6 scaling factor).
        self.timescale = timescale
        machine_timestep = int((dt / timescale) * 1e6)

        # Determine the maximum run-time
        self.max_steps = None if period is None else int(period / dt)

        self.steps = 0  # Steps simulated

        # "Run indefinitely" mode (period=None, hence max_steps=None) is not
        # implemented, so it is rejected before any expensive work is done.
        builder_kwargs = self.io_controller.builder_kwargs
        if self.max_steps is None:
            raise NotImplementedError

        # Create a model from the network, using the IO controller
        logger.debug("Building model")
        start_build = time.time()
        self.model = Model(dt=dt, machine_timestep=machine_timestep,
                           decoder_cache=get_default_decoder_cache())
        self.model.build(network, **builder_kwargs)
        logger.info("Build took {:.3f} seconds".format(time.time() -
                                                       start_build))

        self.model.decoder_cache.shrink()
        self.dt = self.model.dt
        self._closed = False  # Whether the simulator has been closed or not

        self.host_sim = self._create_host_sim()

        # Holder for probe data
        self.data = {}

        # Holder for profiling data
        self.profiler_data = {}

        # Convert the model into a netlist
        logger.info("Building netlist")
        start = time.time()
        self.netlist = self.model.make_netlist(self.max_steps or 0)

        # Determine whether to use a spalloc machine or not
        if use_spalloc is None:
            # Default is to not use spalloc; this is indicated by either the
            # absence of the option in the config file OR the option being set
            # to false.
            use_spalloc = (
                rc.has_option("spinnaker_machine", "use_spalloc") and
                rc.getboolean("spinnaker_machine", "use_spalloc"))

        # Create a controller for the machine and boot if necessary
        self.job = None
        try:
            if not use_spalloc or hostname is not None:
                # Use the specified machine rather than trying to get one
                # allocated.
                if hostname is None:
                    hostname = rc.get("spinnaker_machine", "hostname")
            else:
                # Attempt to get a machine allocated to us
                from spalloc import Job

                # Determine how many boards to ask for (assuming 16 usable
                # cores per chip and 48 chips per board).
                n_cores = (self.netlist.n_cores *
                           (1.0 + allocation_fudge_factor))
                n_boards = int(np.ceil((n_cores / 16.) / 48.))

                # Request the job
                self.job = Job(n_boards)
                logger.info("Allocated job ID %d...", self.job.id)

                # Wait until we're given the machine
                logger.info("Waiting for machine allocation...")
                self.job.wait_until_ready()

                # spalloc recommends a slight delay before attempting to boot
                # the machine, later versions of spalloc server may relax this
                # requirement.
                time.sleep(5.0)

                # Store the hostname
                hostname = self.job.hostname
                logger.info("Using %d board(s) of \"%s\" (%s)",
                            len(self.job.boards), self.job.machine_name,
                            hostname)

            self.controller = MachineController(hostname)
            self.controller.boot()

            # Get a system-info object to place & route against
            logger.info("Getting SpiNNaker machine specification")
            system_info = self.controller.get_system_info()

            # Place & Route
            logger.info("Placing and routing")
            self.netlist.place_and_route(
                system_info,
                place=getconfig(network.config, Simulator,
                                'placer', rig.place_and_route.place),
                place_kwargs=getconfig(network.config, Simulator,
                                       'placer_kwargs', {}),
            )

            logger.info("{} cores in use".format(
                len(self.netlist.placements)))
            chips = set(six.itervalues(self.netlist.placements))
            logger.info("Using {}".format(chips))

            # Prepare the simulator against the placed, allocated and routed
            # netlist.
            self.io_controller.prepare(self.model, self.controller,
                                       self.netlist)

            # Load the application
            logger.info("Loading application")
            self.netlist.load_application(self.controller, system_info)

            # Check if any cores are in bad states
            if self.controller.count_cores_in_state(
                    ["exit", "dead", "watchdog", "runtime_exception"]):
                # Report every placed core that is not waiting at sync0,
                # together with its IO buffer, before giving up.
                for vertex, (x, y) in six.iteritems(
                        self.netlist.placements):
                    p = self.netlist.allocations[vertex][Cores].start
                    status = self.controller.get_processor_status(p, x, y)
                    if status.cpu_state is not AppState.sync0:
                        print("Core ({}, {}, {}) in state {!s}".format(
                            x, y, p, status))
                        print(self.controller.get_iobuf(p, x, y))
                raise Exception("Unexpected core failures.")

            # NOTE: the precision was previously written as "{:3f}" (a field
            # width of 3), unlike the "{:.3f}" used for the build time above.
            logger.info(
                "Preparing and loading machine took {:.3f} seconds".format(
                    time.time() - start))

            logger.info("Setting router timeout to 16 cycles")
            for x, y in system_info.chips():
                with self.controller(x=x, y=y):
                    # Rewrite the word at 0xf1000000 with its last byte
                    # replaced by 0x10, leaving the other three bytes as read.
                    data = self.controller.read(0xf1000000, 4)
                    self.controller.write(0xf1000000, data[:-1] + b'\x10')
        except BaseException:
            # Construction failed. If boards were allocated for us nobody
            # else holds a reference that could release them, so give them
            # back now and make sure nothing tries to close this simulator
            # later.
            if self.job is not None:
                self.job.destroy()
                self.job = None
                self._closed = True
                Simulator._remove_simulator(self)
            raise

    def __enter__(self):
        """Enter a context which will close the simulator when exited."""
        # Return self to allow usage like:
        #
        #     with nengo_spinnaker.Simulator(model) as sim:
        #         sim.run(1.0)
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        """Exit a context and close the simulator."""
        self.close()

    def run(self, time_in_seconds):
        """Simulate for the given length of time.

        The duration is converted to a whole number of steps by rounding
        ``time_in_seconds / dt`` to the nearest integer.
        """
        # Determine how many steps to simulate for
        steps = int(np.round(float(time_in_seconds) / self.dt))
        self.run_steps(steps)

    def run_steps(self, steps):
        """Simulate a given number of steps.

        The run is split into consecutive chunks of at most ``max_steps``
        steps each. Nothing is run when `steps` is zero or negative.
        """
        while steps > 0:
            n_steps = min((steps, self.max_steps))
            self._run_steps(n_steps)
            steps -= n_steps

    def _run_steps(self, steps):
        """Simulate for the given number of steps.

        Runs one chunk on the machine while stepping the host simulator
        alongside it, then retrieves the simulation data.

        Raises
        ------
        Exception
            If the simulator has been closed, if `steps` is None although a
            period was specified, or if cores fail to reach an expected
            state.
        """
        if self._closed:
            raise Exception("Simulator has been closed and can't be used to "
                            "run further simulations.")

        if steps is None:
            if self.max_steps is not None:
                raise Exception(
                    "Cannot run indefinitely if a simulator period was "
                    "specified. Create a new simulator with Simulator(model, "
                    "period=None) to perform indefinite time simulations."
                )
        else:
            assert steps <= self.max_steps

        # Prepare the simulation
        self.netlist.before_simulation(self, steps)

        # Wait for all cores to hit SYNC0 (either by remaining it or entering
        # it from init)
        self._wait_for_transition(AppState.init, AppState.sync0,
                                  self.netlist.n_cores)
        self.controller.send_signal("sync0")

        # Get a new thread for the IO
        io_thread = self.io_controller.spawn()

        # Run the simulation
        try:
            # Prep
            exp_time = steps * self.dt / self.timescale
            io_thread.start()

            # Wait for all cores to hit SYNC1
            self._wait_for_transition(AppState.sync0, AppState.sync1,
                                      self.netlist.n_cores)
            logger.info("Running simulation...")
            self.controller.send_signal("sync1")

            # Execute the local model
            host_steps = 0
            start_time = time.time()
            run_time = 0.0
            local_timestep = self.dt / self.timescale
            while run_time < exp_time:
                # Run a step
                self.host_sim.step()
                run_time = time.time() - start_time

                # If that step took less than timestep then spin
                time.sleep(0.0001)
                # NOTE(review): host_steps is never incremented, so the
                # condition below is never true and this loop never runs;
                # the host simulator is only paced by the sleep above.
                # Confirm whether per-step pacing was intended before
                # changing it.
                while run_time < host_steps * local_timestep:
                    time.sleep(0.0001)
                    run_time = time.time() - start_time
        finally:
            # Stop the IO thread whatever occurs
            io_thread.stop()

        # Wait for cores to re-enter sync0
        self._wait_for_transition(AppState.run, AppState.sync0,
                                  self.netlist.n_cores)

        # Retrieve simulation data
        start = time.time()
        logger.info("Retrieving simulation data")
        self.netlist.after_simulation(self, steps)
        # NOTE: precision was previously written as "{:3f}" (a field width).
        logger.info("Retrieving data took {:.3f} seconds".format(
            time.time() - start
        ))

        # Increase the steps count
        self.steps += steps

    def _wait_for_transition(self, from_state, desired_to_state, num_verts):
        """Wait for cores to leave `from_state` and reach `desired_to_state`.

        Polls once a second until no core is in `from_state`, then waits up
        to five seconds for `num_verts` cores to be in `desired_to_state`.

        Raises
        ------
        Exception
            If the number of cores that reached the desired state differs
            from `num_verts`; the state and IO buffer of every placed core
            in another state are printed first.
        """
        while True:
            # If no cores are still in from_state, stop
            if self.controller.count_cores_in_state(from_state) == 0:
                break

            # Wait a bit
            time.sleep(1.0)

        # Check if any cores haven't exited cleanly
        num_ready = self.controller.wait_for_cores_to_reach_state(
            desired_to_state, num_verts, timeout=5.0)

        if num_ready != num_verts:
            # Loop through all placed vertices
            for vertex, (x, y) in six.iteritems(self.netlist.placements):
                p = self.netlist.allocations[vertex][Cores].start
                status = self.controller.get_processor_status(p, x, y)
                if status.cpu_state is not desired_to_state:
                    print("Core ({}, {}, {}) in state {!s}".format(
                        x, y, p, status.cpu_state))
                    print(self.controller.get_iobuf(p, x, y))

            raise Exception("Unexpected core failures before reaching %s "
                            "state." % desired_to_state)

    def _create_host_sim(self):
        """Build the host-side simulator with wall-clock driven Nodes.

        Every Node of the host network with a callable output is temporarily
        given a wrapper which ignores the time it is passed and instead
        calls the original function with the wall-clock time elapsed since
        the first wrapped call (shared by all Nodes), scaled by
        ``timescale``. The original outputs are put back on the Nodes once
        the simulator has been built.
        """
        # change node_functions to reflect time
        # TODO: improve the reference simulator so that this is not needed
        #       by adding a realtime option
        node_functions = {}
        node_info = dict(start=None)  # start time shared by every wrapper
        for node in self.io_controller.host_network.all_nodes:
            if callable(node.output):
                old_func = node.output
                # The original function is bound as a default argument so
                # each wrapper keeps its own function.
                if node.size_in == 0:
                    def func(t, f=old_func):
                        now = time.time()
                        if node_info['start'] is None:
                            node_info['start'] = now

                        t = (now - node_info['start']) * self.timescale
                        return f(t)
                else:
                    def func(t, x, f=old_func):
                        now = time.time()
                        if node_info['start'] is None:
                            node_info['start'] = now

                        t = (now - node_info['start']) * self.timescale
                        return f(t, x)
                node.output = func
                node_functions[node] = old_func

        # Build the host simulator
        host_sim = nengo.Simulator(self.io_controller.host_network,
                                   dt=self.dt)
        # reset node functions
        for node, func in node_functions.items():
            node.output = func

        return host_sim

    def close(self):
        """Clean the SpiNNaker board and prevent further simulation.

        Calling this on an already closed simulator does nothing. The
        spalloc job (if one was allocated) is destroyed and the simulator is
        removed from the set of open simulators even if closing the IO
        controller or stopping the application raises.
        """
        if self._closed:
            return

        self._closed = True
        try:
            # Stop the application
            self.io_controller.close()
            self.controller.send_signal("stop")
        finally:
            # Destroy the job if we allocated one
            if self.job is not None:
                self.job.destroy()

            # Remove this simulator from the list of open simulators
            Simulator._remove_simulator(self)

    def trange(self, dt=None):
        """Return the times of the steps simulated so far.

        Parameters
        ----------
        dt : float or None
            Spacing of the returned times; the simulator's own `dt` is used
            when None. (Previously this argument was silently ignored.)

        Returns
        -------
        ndarray
            ``dt * [1, 2, ..., n]`` with ``n = int(steps * self.dt / dt)``,
            i.e., one entry per simulated step when `dt` is None.
        """
        dt = self.dt if dt is None else dt
        n_steps = int(self.steps * (self.dt / dt))
        return dt * np.arange(1, n_steps + 1)
def __init__(self, network, dt=0.001, period=10.0, timescale=1.0,
             hostname=None, use_spalloc=None,
             allocation_fudge_factor=0.6):
    """Create a new Simulator with the given network.

    Builds the model and netlist, obtains a machine (a named host or a
    spalloc allocation), boots it, places and routes the netlist, loads
    the application and checks that no core ended up in a failure state.

    Parameters
    ----------
    network :
        Network to build; its ``config`` is consulted for the "node_io",
        "node_io_kwargs", "placer" and "placer_kwargs" options.
    dt : float
        Timestep handed to the model builder.
    period : float or None
        Duration of one period of the simulator. This determines how much
        memory will be allocated to store precomputed and probed data.
        `None` currently raises ``NotImplementedError``.
    timescale : float
        Scaling factor to apply to the simulation, e.g., a value of `0.5`
        will cause the simulation to run at half real-time.
    hostname : string or None
        Hostname of the SpiNNaker machine to use; if None then the machine
        specified in the config file will be used.
    use_spalloc : bool or None
        Allocate a SpiNNaker machine for the simulator using ``spalloc``.
        If None then the setting specified in the config file will be
        used. Ignored when `hostname` is given.

    Other Parameters
    ----------------
    allocation_fudge_factor:
        Fudge factor to allocate more cores than really necessary when
        using `spalloc` to ensure that (a) there are sufficient "live"
        cores in the allocated machine, (b) there is sufficient room for a
        good place and route solution. This should generally be more than
        0.1 (10% more cores than necessary) to account for the usual rate
        of missing chips.

    Raises
    ------
    NotImplementedError
        If `period` is None.
    Exception
        If any core is found in an exit/dead/watchdog/runtime_exception
        state after the application has been loaded.

    Notes
    -----
    If a spalloc job was allocated and any later step of construction
    fails, the job is destroyed, the simulator is marked as closed and it
    is removed from the set of open simulators before the error is
    re-raised, so that the allocated boards are not leaked.
    """
    # Add this simulator to the set of open simulators
    Simulator._add_simulator(self)

    # Create the IO controller
    io_cls = getconfig(network.config, Simulator, "node_io", Ethernet)
    io_kwargs = getconfig(network.config, Simulator, "node_io_kwargs",
                          dict())
    self.io_controller = io_cls(**io_kwargs)

    # Calculate the machine timestep, this is measured in microseconds
    # (hence the 1e6 scaling factor).
    self.timescale = timescale
    machine_timestep = int((dt / timescale) * 1e6)

    # Determine the maximum run-time
    self.max_steps = None if period is None else int(period / dt)

    self.steps = 0  # Steps simulated

    # "Run indefinitely" mode (period=None, hence max_steps=None) is not
    # implemented, so it is rejected before any expensive work is done.
    builder_kwargs = self.io_controller.builder_kwargs
    if self.max_steps is None:
        raise NotImplementedError

    # Create a model from the network, using the IO controller
    logger.debug("Building model")
    start_build = time.time()
    self.model = Model(dt=dt, machine_timestep=machine_timestep,
                       decoder_cache=get_default_decoder_cache())
    self.model.build(network, **builder_kwargs)
    logger.info("Build took {:.3f} seconds".format(time.time() -
                                                   start_build))

    self.model.decoder_cache.shrink()
    self.dt = self.model.dt
    self._closed = False  # Whether the simulator has been closed or not

    self.host_sim = self._create_host_sim()

    # Holder for probe data
    self.data = {}

    # Holder for profiling data
    self.profiler_data = {}

    # Convert the model into a netlist
    logger.info("Building netlist")
    start = time.time()
    self.netlist = self.model.make_netlist(self.max_steps or 0)

    # Determine whether to use a spalloc machine or not
    if use_spalloc is None:
        # Default is to not use spalloc; this is indicated by either the
        # absence of the option in the config file OR the option being set
        # to false.
        use_spalloc = (
            rc.has_option("spinnaker_machine", "use_spalloc") and
            rc.getboolean("spinnaker_machine", "use_spalloc"))

    # Create a controller for the machine and boot if necessary
    self.job = None
    try:
        if not use_spalloc or hostname is not None:
            # Use the specified machine rather than trying to get one
            # allocated.
            if hostname is None:
                hostname = rc.get("spinnaker_machine", "hostname")
        else:
            # Attempt to get a machine allocated to us
            from spalloc import Job

            # Determine how many boards to ask for (assuming 16 usable
            # cores per chip and 48 chips per board).
            n_cores = self.netlist.n_cores * (1.0 + allocation_fudge_factor)
            n_boards = int(np.ceil((n_cores / 16.) / 48.))

            # Request the job
            self.job = Job(n_boards)
            logger.info("Allocated job ID %d...", self.job.id)

            # Wait until we're given the machine
            logger.info("Waiting for machine allocation...")
            self.job.wait_until_ready()

            # spalloc recommends a slight delay before attempting to boot
            # the machine, later versions of spalloc server may relax this
            # requirement.
            time.sleep(5.0)

            # Store the hostname
            hostname = self.job.hostname
            logger.info("Using %d board(s) of \"%s\" (%s)",
                        len(self.job.boards), self.job.machine_name,
                        hostname)

        self.controller = MachineController(hostname)
        self.controller.boot()

        # Get a system-info object to place & route against
        logger.info("Getting SpiNNaker machine specification")
        system_info = self.controller.get_system_info()

        # Place & Route
        logger.info("Placing and routing")
        self.netlist.place_and_route(
            system_info,
            place=getconfig(network.config, Simulator,
                            'placer', rig.place_and_route.place),
            place_kwargs=getconfig(network.config, Simulator,
                                   'placer_kwargs', {}),
        )

        logger.info("{} cores in use".format(len(self.netlist.placements)))
        chips = set(six.itervalues(self.netlist.placements))
        logger.info("Using {}".format(chips))

        # Prepare the simulator against the placed, allocated and routed
        # netlist.
        self.io_controller.prepare(self.model, self.controller,
                                   self.netlist)

        # Load the application
        logger.info("Loading application")
        self.netlist.load_application(self.controller, system_info)

        # Check if any cores are in bad states
        if self.controller.count_cores_in_state(
                ["exit", "dead", "watchdog", "runtime_exception"]):
            # Report every placed core that is not waiting at sync0,
            # together with its IO buffer, before giving up.
            for vertex, (x, y) in six.iteritems(self.netlist.placements):
                p = self.netlist.allocations[vertex][Cores].start
                status = self.controller.get_processor_status(p, x, y)
                if status.cpu_state is not AppState.sync0:
                    print("Core ({}, {}, {}) in state {!s}".format(
                        x, y, p, status))
                    print(self.controller.get_iobuf(p, x, y))
            raise Exception("Unexpected core failures.")

        # NOTE: the precision was previously written as "{:3f}" (a field
        # width of 3), unlike the "{:.3f}" used for the build time above.
        logger.info(
            "Preparing and loading machine took {:.3f} seconds".format(
                time.time() - start))

        logger.info("Setting router timeout to 16 cycles")
        for x, y in system_info.chips():
            with self.controller(x=x, y=y):
                # Rewrite the word at 0xf1000000 with its last byte
                # replaced by 0x10, leaving the other three bytes as read.
                data = self.controller.read(0xf1000000, 4)
                self.controller.write(0xf1000000, data[:-1] + b'\x10')
    except BaseException:
        # Construction failed. If boards were allocated for us nobody else
        # holds a reference that could release them, so give them back now
        # and make sure nothing tries to close this simulator later.
        if self.job is not None:
            self.job.destroy()
            self.job = None
            self._closed = True
            Simulator._remove_simulator(self)
        raise
class Simulator(object):
    """SpiNNaker simulator for Nengo models.

    The simulator period determines how much data will be stored on SpiNNaker
    and is the maximum length of simulation allowed before data is transferred
    between the machine and the host PC. If the period is set to `None`
    function of time Nodes will not be optimised and probes will be disabled.
    For any other value simulation lengths of less than or equal to the period
    will be in real-time, longer simulations will be possible but will include
    short gaps when data is transferred between SpiNNaker and the host.

    .. note:: `period=None` is not implemented at present; the constructor
       raises ``NotImplementedError`` for it.

    :py:meth:`~.close` should be called when the simulator will no longer be
    used. This will close all sockets used to communicate with the SpiNNaker
    machine and will leave the machine in a clean state. Failure to call
    `close` may result in later failures. Alternatively `with` may be used::

        sim = nengo_spinnaker.Simulator(network)
        with sim:
            sim.run(10.0)
    """

    # Simulators that have been constructed and not yet closed.
    _open_simulators = set()

    @classmethod
    def _add_simulator(cls, simulator):
        """Record `simulator` in the set of open simulators."""
        cls._open_simulators.add(simulator)

    @classmethod
    def _remove_simulator(cls, simulator):
        """Remove `simulator` from the set of open simulators.

        Raises ``KeyError`` if the simulator is not in the set.
        """
        cls._open_simulators.remove(simulator)

    def __init__(self, network, dt=0.001, period=10.0, timescale=1.0,
                 hostname=None, use_spalloc=None,
                 allocation_fudge_factor=0.6):
        """Create a new Simulator with the given network.

        Builds the model and netlist, obtains a machine (a named host or a
        spalloc allocation), boots it, places and routes the netlist, loads
        the application and checks that no core ended up in a failure state.

        Parameters
        ----------
        network :
            Network to build; its ``config`` is consulted for the "node_io",
            "node_io_kwargs", "placer" and "placer_kwargs" options.
        dt : float
            Timestep handed to the model builder.
        period : float or None
            Duration of one period of the simulator. This determines how much
            memory will be allocated to store precomputed and probed data.
            `None` currently raises ``NotImplementedError``.
        timescale : float
            Scaling factor to apply to the simulation, e.g., a value of `0.5`
            will cause the simulation to run at half real-time.
        hostname : string or None
            Hostname of the SpiNNaker machine to use; if None then the machine
            specified in the config file will be used.
        use_spalloc : bool or None
            Allocate a SpiNNaker machine for the simulator using ``spalloc``.
            If None then the setting specified in the config file will be
            used. Ignored when `hostname` is given.

        Other Parameters
        ----------------
        allocation_fudge_factor:
            Fudge factor to allocate more cores than really necessary when
            using `spalloc` to ensure that (a) there are sufficient "live"
            cores in the allocated machine, (b) there is sufficient room for a
            good place and route solution. This should generally be more than
            0.1 (10% more cores than necessary) to account for the usual rate
            of missing chips.

        Raises
        ------
        NotImplementedError
            If `period` is None.
        Exception
            If any core is found in an exit/dead/watchdog/runtime_exception
            state after the application has been loaded.

        Notes
        -----
        If a spalloc job was allocated and any later step of construction
        fails, the job is destroyed, the simulator is marked as closed and it
        is removed from the set of open simulators before the error is
        re-raised, so that the allocated boards are not leaked.
        """
        # Add this simulator to the set of open simulators
        Simulator._add_simulator(self)

        # Create the IO controller
        io_cls = getconfig(network.config, Simulator, "node_io", Ethernet)
        io_kwargs = getconfig(network.config, Simulator, "node_io_kwargs",
                              dict())
        self.io_controller = io_cls(**io_kwargs)

        # Calculate the machine timestep, this is measured in microseconds
        # (hence the 1e6 scaling factor).
        self.timescale = timescale
        machine_timestep = int((dt / timescale) * 1e6)

        # Determine the maximum run-time
        self.max_steps = None if period is None else int(period / dt)

        self.steps = 0  # Steps simulated

        # "Run indefinitely" mode (period=None, hence max_steps=None) is not
        # implemented, so it is rejected before any expensive work is done.
        builder_kwargs = self.io_controller.builder_kwargs
        if self.max_steps is None:
            raise NotImplementedError

        # Create a model from the network, using the IO controller
        logger.debug("Building model")
        start_build = time.time()
        self.model = Model(dt=dt, machine_timestep=machine_timestep,
                           decoder_cache=get_default_decoder_cache())
        self.model.build(network, **builder_kwargs)
        logger.info("Build took {:.3f} seconds".format(time.time() -
                                                       start_build))

        self.model.decoder_cache.shrink()
        self.dt = self.model.dt
        self._closed = False  # Whether the simulator has been closed or not

        self.host_sim = self._create_host_sim()

        # Holder for probe data
        self.data = {}

        # Holder for profiling data
        self.profiler_data = {}

        # Convert the model into a netlist
        logger.info("Building netlist")
        start = time.time()
        self.netlist = self.model.make_netlist(self.max_steps or 0)

        # Determine whether to use a spalloc machine or not
        if use_spalloc is None:
            # Default is to not use spalloc; this is indicated by either the
            # absence of the option in the config file OR the option being set
            # to false.
            use_spalloc = (
                rc.has_option("spinnaker_machine", "use_spalloc") and
                rc.getboolean("spinnaker_machine", "use_spalloc"))

        # Create a controller for the machine and boot if necessary
        self.job = None
        try:
            if not use_spalloc or hostname is not None:
                # Use the specified machine rather than trying to get one
                # allocated.
                if hostname is None:
                    hostname = rc.get("spinnaker_machine", "hostname")
            else:
                # Attempt to get a machine allocated to us
                from spalloc import Job

                # Determine how many boards to ask for (assuming 16 usable
                # cores per chip and 48 chips per board).
                n_cores = (self.netlist.n_cores *
                           (1.0 + allocation_fudge_factor))
                n_boards = int(np.ceil((n_cores / 16.) / 48.))

                # Request the job
                self.job = Job(n_boards)
                logger.info("Allocated job ID %d...", self.job.id)

                # Wait until we're given the machine
                logger.info("Waiting for machine allocation...")
                self.job.wait_until_ready()

                # spalloc recommends a slight delay before attempting to boot
                # the machine, later versions of spalloc server may relax this
                # requirement.
                time.sleep(5.0)

                # Store the hostname
                hostname = self.job.hostname
                logger.info("Using %d board(s) of \"%s\" (%s)",
                            len(self.job.boards), self.job.machine_name,
                            hostname)

            self.controller = MachineController(hostname)
            self.controller.boot()

            # Get a system-info object to place & route against
            logger.info("Getting SpiNNaker machine specification")
            system_info = self.controller.get_system_info()

            # Place & Route
            logger.info("Placing and routing")
            self.netlist.place_and_route(
                system_info,
                place=getconfig(network.config, Simulator,
                                'placer', rig.place_and_route.place),
                place_kwargs=getconfig(network.config, Simulator,
                                       'placer_kwargs', {}),
            )

            logger.info("{} cores in use".format(
                len(self.netlist.placements)))
            chips = set(six.itervalues(self.netlist.placements))
            logger.info("Using {}".format(chips))

            # Prepare the simulator against the placed, allocated and routed
            # netlist.
            self.io_controller.prepare(self.model, self.controller,
                                       self.netlist)

            # Load the application
            logger.info("Loading application")
            self.netlist.load_application(self.controller, system_info)

            # Check if any cores are in bad states
            if self.controller.count_cores_in_state(
                    ["exit", "dead", "watchdog", "runtime_exception"]):
                # Report every placed core that is not waiting at sync0,
                # together with its IO buffer, before giving up.
                for vertex, (x, y) in six.iteritems(
                        self.netlist.placements):
                    p = self.netlist.allocations[vertex][Cores].start
                    status = self.controller.get_processor_status(p, x, y)
                    if status.cpu_state is not AppState.sync0:
                        print("Core ({}, {}, {}) in state {!s}".format(
                            x, y, p, status))
                        print(self.controller.get_iobuf(p, x, y))
                raise Exception("Unexpected core failures.")

            # NOTE: the precision was previously written as "{:3f}" (a field
            # width of 3), unlike the "{:.3f}" used for the build time above.
            logger.info(
                "Preparing and loading machine took {:.3f} seconds".format(
                    time.time() - start))

            logger.info("Setting router timeout to 16 cycles")
            for x, y in system_info.chips():
                with self.controller(x=x, y=y):
                    # Rewrite the word at 0xf1000000 with its last byte
                    # replaced by 0x10, leaving the other three bytes as read.
                    data = self.controller.read(0xf1000000, 4)
                    self.controller.write(0xf1000000, data[:-1] + b'\x10')
        except BaseException:
            # Construction failed. If boards were allocated for us nobody
            # else holds a reference that could release them, so give them
            # back now and make sure nothing tries to close this simulator
            # later.
            if self.job is not None:
                self.job.destroy()
                self.job = None
                self._closed = True
                Simulator._remove_simulator(self)
            raise

    def __enter__(self):
        """Enter a context which will close the simulator when exited."""
        # Return self to allow usage like:
        #
        #     with nengo_spinnaker.Simulator(model) as sim:
        #         sim.run(1.0)
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        """Exit a context and close the simulator."""
        self.close()

    def run(self, time_in_seconds):
        """Simulate for the given length of time.

        The duration is converted to a whole number of steps by rounding
        ``time_in_seconds / dt`` to the nearest integer.
        """
        # Determine how many steps to simulate for
        steps = int(np.round(float(time_in_seconds) / self.dt))
        self.run_steps(steps)

    def run_steps(self, steps):
        """Simulate a given number of steps.

        The run is split into consecutive chunks of at most ``max_steps``
        steps each. Nothing is run when `steps` is zero or negative.
        """
        while steps > 0:
            n_steps = min((steps, self.max_steps))
            self._run_steps(n_steps)
            steps -= n_steps

    def _run_steps(self, steps):
        """Simulate for the given number of steps.

        Runs one chunk on the machine while stepping the host simulator
        alongside it, then retrieves the simulation data.

        Raises
        ------
        Exception
            If the simulator has been closed, if `steps` is None although a
            period was specified, or if cores fail to reach an expected
            state.
        """
        if self._closed:
            raise Exception("Simulator has been closed and can't be used to "
                            "run further simulations.")

        if steps is None:
            if self.max_steps is not None:
                raise Exception(
                    "Cannot run indefinitely if a simulator period was "
                    "specified. Create a new simulator with Simulator(model, "
                    "period=None) to perform indefinite time simulations."
                )
        else:
            assert steps <= self.max_steps

        # Prepare the simulation
        self.netlist.before_simulation(self, steps)

        # Wait for all cores to hit SYNC0 (either by remaining it or entering
        # it from init)
        self._wait_for_transition(AppState.init, AppState.sync0,
                                  self.netlist.n_cores)
        self.controller.send_signal("sync0")

        # Get a new thread for the IO
        io_thread = self.io_controller.spawn()

        # Run the simulation
        try:
            # Prep
            exp_time = steps * self.dt / self.timescale
            io_thread.start()

            # Wait for all cores to hit SYNC1
            self._wait_for_transition(AppState.sync0, AppState.sync1,
                                      self.netlist.n_cores)
            logger.info("Running simulation...")
            self.controller.send_signal("sync1")

            # Execute the local model
            host_steps = 0
            start_time = time.time()
            run_time = 0.0
            local_timestep = self.dt / self.timescale
            while run_time < exp_time:
                # Run a step
                self.host_sim.step()
                run_time = time.time() - start_time

                # If that step took less than timestep then spin
                time.sleep(0.0001)
                # NOTE(review): host_steps is never incremented, so the
                # condition below is never true and this loop never runs;
                # the host simulator is only paced by the sleep above.
                # Confirm whether per-step pacing was intended before
                # changing it.
                while run_time < host_steps * local_timestep:
                    time.sleep(0.0001)
                    run_time = time.time() - start_time
        finally:
            # Stop the IO thread whatever occurs
            io_thread.stop()

        # Wait for cores to re-enter sync0
        self._wait_for_transition(AppState.run, AppState.sync0,
                                  self.netlist.n_cores)

        # Retrieve simulation data
        start = time.time()
        logger.info("Retrieving simulation data")
        self.netlist.after_simulation(self, steps)
        # NOTE: precision was previously written as "{:3f}" (a field width).
        logger.info("Retrieving data took {:.3f} seconds".format(
            time.time() - start
        ))

        # Increase the steps count
        self.steps += steps

    def _wait_for_transition(self, from_state, desired_to_state, num_verts):
        """Wait for cores to leave `from_state` and reach `desired_to_state`.

        Polls once a second until no core is in `from_state`, then waits up
        to five seconds for `num_verts` cores to be in `desired_to_state`.

        Raises
        ------
        Exception
            If the number of cores that reached the desired state differs
            from `num_verts`; the state and IO buffer of every placed core
            in another state are printed first.
        """
        while True:
            # If no cores are still in from_state, stop
            if self.controller.count_cores_in_state(from_state) == 0:
                break

            # Wait a bit
            time.sleep(1.0)

        # Check if any cores haven't exited cleanly
        num_ready = self.controller.wait_for_cores_to_reach_state(
            desired_to_state, num_verts, timeout=5.0)

        if num_ready != num_verts:
            # Loop through all placed vertices
            for vertex, (x, y) in six.iteritems(self.netlist.placements):
                p = self.netlist.allocations[vertex][Cores].start
                status = self.controller.get_processor_status(p, x, y)
                if status.cpu_state is not desired_to_state:
                    print("Core ({}, {}, {}) in state {!s}".format(
                        x, y, p, status.cpu_state))
                    print(self.controller.get_iobuf(p, x, y))

            raise Exception("Unexpected core failures before reaching %s "
                            "state." % desired_to_state)

    def _create_host_sim(self):
        """Build the host-side simulator with wall-clock driven Nodes.

        Every Node of the host network with a callable output is temporarily
        given a wrapper which ignores the time it is passed and instead
        calls the original function with the wall-clock time elapsed since
        the first wrapped call (shared by all Nodes), scaled by
        ``timescale``. The original outputs are put back on the Nodes once
        the simulator has been built.
        """
        # change node_functions to reflect time
        # TODO: improve the reference simulator so that this is not needed
        #       by adding a realtime option
        node_functions = {}
        node_info = dict(start=None)  # start time shared by every wrapper
        for node in self.io_controller.host_network.all_nodes:
            if callable(node.output):
                old_func = node.output
                # The original function is bound as a default argument so
                # each wrapper keeps its own function.
                if node.size_in == 0:
                    def func(t, f=old_func):
                        now = time.time()
                        if node_info['start'] is None:
                            node_info['start'] = now

                        t = (now - node_info['start']) * self.timescale
                        return f(t)
                else:
                    def func(t, x, f=old_func):
                        now = time.time()
                        if node_info['start'] is None:
                            node_info['start'] = now

                        t = (now - node_info['start']) * self.timescale
                        return f(t, x)
                node.output = func
                node_functions[node] = old_func

        # Build the host simulator
        host_sim = nengo.Simulator(self.io_controller.host_network,
                                   dt=self.dt)
        # reset node functions
        for node, func in node_functions.items():
            node.output = func

        return host_sim

    def close(self):
        """Clean the SpiNNaker board and prevent further simulation.

        Calling this on an already closed simulator does nothing. The
        spalloc job (if one was allocated) is destroyed and the simulator is
        removed from the set of open simulators even if closing the IO
        controller or stopping the application raises.
        """
        if self._closed:
            return

        self._closed = True
        try:
            # Stop the application
            self.io_controller.close()
            self.controller.send_signal("stop")
        finally:
            # Destroy the job if we allocated one
            if self.job is not None:
                self.job.destroy()

            # Remove this simulator from the list of open simulators
            Simulator._remove_simulator(self)

    def trange(self, dt=None):
        """Return the times of the steps simulated so far.

        Parameters
        ----------
        dt : float or None
            Spacing of the returned times; the simulator's own `dt` is used
            when None. (Previously this argument was silently ignored.)

        Returns
        -------
        ndarray
            ``dt * [1, 2, ..., n]`` with ``n = int(steps * self.dt / dt)``,
            i.e., one entry per simulated step when `dt` is None.
        """
        dt = self.dt if dt is None else dt
        n_steps = int(self.steps * (self.dt / dt))
        return dt * np.arange(1, n_steps + 1)
def __call__(self, spalloc_server, spalloc_user, n_chips,
             spalloc_port=None):
    """ Request a machine from a spalloc server and wait until it is ready.

    :param spalloc_server: The server from which the machine should be\
        requested
    :param spalloc_port: The port of the SPALLOC server; when None the\
        ``port`` argument is not passed to ``Job`` at all
    :param spalloc_user: The user to allocate the machine to
    :param n_chips: The number of chips required
    :return: a dict of machine parameters; ``machine_name`` is the job's\
        hostname, ``machine_width``/``machine_height`` are the job's\
        dimensions and ``machine_allocation_controller`` is the started\
        controller wrapping the job
    :raises: whatever ``Job.wait_until_ready`` raises; the job is destroyed\
        before the exception is propagated
    """

    # Work out how many boards are needed
    n_boards = float(n_chips) / self._N_CHIPS_PER_BOARD

    # If the number of boards rounded up is less than 10% bigger than the
    # actual number of boards, add another board just in case
    if math.ceil(n_boards) - n_boards < 0.1:
        n_boards += 1
    n_boards = int(math.ceil(n_boards))

    # Only pass the port on when one was given, so that Job falls back to
    # its own default otherwise
    job_kwargs = {"hostname": spalloc_server, "owner": spalloc_user}
    if spalloc_port is not None:
        job_kwargs["port"] = spalloc_port
    job = Job(n_boards, **job_kwargs)

    try:
        job.wait_until_ready()
    except BaseException:
        # Release the allocation on *any* failure (including a keyboard
        # interrupt), then re-raise with the original traceback intact
        job.destroy()
        raise

    # get param from jobs before starting, so that hanging doesn't occur
    width = job.width
    height = job.height
    hostname = job.hostname

    machine_allocation_controller = _SpallocJobController(job)
    machine_allocation_controller.start()

    return {
        "machine_name": hostname,
        "machine_version": self._MACHINE_VERSION,
        "machine_width": width,
        "machine_height": height,
        "machine_n_boards": None,
        "machine_down_chips": None,
        "machine_down_cores": None,
        "machine_bmp_details": None,
        "reset_machine_on_start_up": False,
        "auto_detect_bmp": False,
        "scamp_connection_data": None,
        "boot_port_number": None,
        "max_sdram_size": None,
        "machine_allocation_controller": machine_allocation_controller
    }
def main(argv=None):
    """Command-line entry point: request (and allocate) a SpiNNaker machine.

    Parses the command line, creates (or resumes) a spalloc ``Job``, waits
    for it to become ready while reporting state changes on stderr, and
    then either runs the user's command or prints the machine details.

    Parameters
    ----------
    argv : list of str or None
        Argument list handed to ``ArgumentParser.parse_args``.

    Returns
    -------
    int
        0 when the machine details were printed; the value returned by
        ``run_command`` when a command was given; 1 if the job was
        destroyed; 2 if the server did not recognise the job; 3 if the job
        entered an unrecognised state; 4 on keyboard interrupt while
        waiting; 6 if the server could not be contacted.

    Invalid arguments are reported through ``parser.error``.
    """
    t = Terminal(stream=sys.stderr)
    cfg = config.read_config()

    parser = argparse.ArgumentParser(
        description="Request (and allocate) a SpiNNaker machine.")
    parser.add_argument("--version", "-V", action="version",
                        version=__version__)
    parser.add_argument("--quiet", "-q", action="store_true",
                        default=False,
                        help="suppress informational messages")
    parser.add_argument("--debug", action="store_true",
                        default=False,
                        help="enable additional diagnostic information")
    parser.add_argument("--no-destroy", "-D", action="store_true",
                        default=False,
                        help="do not destroy the job on exit")
    # Booting is only offered when a MachineController is available
    if MachineController is not None:
        parser.add_argument("--boot", "-B", action="store_true",
                            default=False,
                            help="boot the machine once powered on")

    allocation_args = parser.add_argument_group(
        "allocation requirement arguments")
    allocation_args.add_argument("what", nargs="*", default=[], type=int,
                                 metavar="WHAT",
                                 help="what to allocate: nothing or 1 "
                                      "requests 1 SpiNN-5 board, NUM requests "
                                      "at least NUM SpiNN-5 boards, WIDTH "
                                      "HEIGHT means WIDTHxHEIGHT triads of "
                                      "SpiNN-5 boards and X Y Z requests a "
                                      "board the specified logical board "
                                      "coordinate.")
    allocation_args.add_argument("--resume", "-r", type=int,
                                 help="if given, resume keeping the "
                                      "specified job alive rather than "
                                      "creating a new job (all allocation "
                                      "requirements will be ignored)")
    allocation_args.add_argument("--machine", "-m", nargs="?",
                                 default=cfg["machine"],
                                 help="only allocate boards which are part "
                                      "of a specific machine, or any machine "
                                      "if no machine is given "
                                      "(default: %(default)s)")
    allocation_args.add_argument("--tags", "-t", nargs="*", metavar="TAG",
                                 default=cfg["tags"] or ["default"],
                                 help="only allocate boards which have (at "
                                      "least) the specified flags "
                                      "(default: {})".format(
                                          " ".join(cfg["tags"] or [])))
    allocation_args.add_argument("--min-ratio", type=float, metavar="RATIO",
                                 default=cfg["min_ratio"],
                                 help="when allocating by number of boards, "
                                      "require that the allocation be at "
                                      "least as square as this ratio "
                                      "(default: %(default)s)")
    # A value of None in the config is exposed on the command line as -1
    allocation_args.add_argument("--max-dead-boards", type=int,
                                 metavar="NUM",
                                 default=(-1 if cfg["max_dead_boards"] is None
                                          else cfg["max_dead_boards"]),
                                 help="boards allowed to be "
                                      "dead in the allocation, or -1 to allow "
                                      "any number of dead boards "
                                      "(default: %(default)s)")
    allocation_args.add_argument("--max-dead-links", type=int,
                                 metavar="NUM",
                                 default=(-1 if cfg["max_dead_links"] is None
                                          else cfg["max_dead_links"]),
                                 help="inter-board links allowed to be "
                                      "dead in the allocation, or -1 to allow "
                                      "any number of dead links "
                                      "(default: %(default)s)")
    allocation_args.add_argument(
        "--require-torus", "-w", action="store_true",
        default=cfg["require_torus"],
        help="require that the allocation contain "
             "torus (a.k.a. wrap-around) "
             "links {}".format("(default)" if cfg["require_torus"] else ""))
    allocation_args.add_argument(
        "--no-require-torus", "-W", action="store_false",
        dest="require_torus",
        help="do not require that the allocation "
             "contain torus (a.k.a. wrap-around) "
             "links {}".format("" if cfg["require_torus"] else "(default)"))

    command_args = parser.add_argument_group("command wrapping arguments")
    command_args.add_argument("--command", "-c", nargs=argparse.REMAINDER,
                              help="execute the specified command once boards "
                                   "have been allocated and deallocate the "
                                   "boards when the application exits ({} and "
                                   "{hostname} are substituted for the chip "
                                   "chip at (0, 0)'s hostname, {w} and "
                                   "{h} give the dimensions of the SpiNNaker "
                                   "machine in chips, {ethernet_ips} is a "
                                   "temporary file containing a CSV with "
                                   "three columns: x, y and hostname giving "
                                   "the hostname of each Ethernet connected "
                                   "SpiNNaker chip)")

    server_args = parser.add_argument_group("spalloc server arguments")
    server_args.add_argument("--owner", default=cfg["owner"],
                             help="by convention, the email address of the "
                                  "owner of the job (default: %(default)s)")
    server_args.add_argument("--hostname", "-H", default=cfg["hostname"],
                             help="hostname or IP of the spalloc server "
                                  "(default: %(default)s)")
    server_args.add_argument("--port", "-P", default=cfg["port"], type=int,
                             help="port number of the spalloc server "
                                  "(default: %(default)s)")
    server_args.add_argument(
        "--keepalive", type=int, metavar="SECONDS",
        default=(-1 if cfg["keepalive"] is None else cfg["keepalive"]),
        help="the interval at which to require "
             "keepalive messages to be sent to "
             "prevent the server cancelling the "
             "job, or -1 to not require keepalive "
             "messages (default: %(default)s)")
    server_args.add_argument("--reconnect-delay",
                             default=cfg["reconnect_delay"], type=float,
                             metavar="SECONDS",
                             help="seconds to wait before "
                                  "reconnecting to the server if the "
                                  "connection is lost (default: %(default)s)")
    server_args.add_argument("--timeout", default=cfg["timeout"], type=float,
                             metavar="SECONDS",
                             help="seconds to wait for a response "
                                  "from the server (default: %(default)s)")

    args = parser.parse_args(argv)

    # Fail if no owner is defined (unless resuming)
    if not args.owner and args.resume is None:
        parser.error(
            "--owner must be specified (typically your email address)")

    # Fail if server not specified
    if args.hostname is None:
        parser.error("--hostname of spalloc server must be specified")

    # Set universal job arguments (negative values mean "not set")
    job_kwargs = {
        "hostname": args.hostname,
        "port": args.port,
        "reconnect_delay":
            args.reconnect_delay if args.reconnect_delay >= 0.0 else None,
        "timeout": args.timeout if args.timeout >= 0.0 else None,
    }

    # Compare against None (as the owner check above does) so that a job
    # ID of 0 is still treated as a request to resume.
    if args.resume is not None:
        job_args = []
        job_kwargs.update({
            "resume_job_id": args.resume,
        })
    else:
        # Make sure 'what' takes the right form
        if len(args.what) not in (0, 1, 2, 3):
            parser.error("expected either no arguments, one argument, NUM, "
                         "two arguments, WIDTH HEIGHT, or three arguments "
                         "X Y Z")

        # Unpack arguments for the job and server
        job_args = args.what
        job_kwargs.update({
            "owner": args.owner,
            "keepalive": args.keepalive if args.keepalive >= 0.0 else None,
            "machine": args.machine,
            # Tags are only used when no specific machine was requested
            "tags": args.tags if args.machine is None else None,
            "min_ratio": args.min_ratio,
            "max_dead_boards":
                args.max_dead_boards if args.max_dead_boards >= 0.0 else None,
            "max_dead_links":
                args.max_dead_links if args.max_dead_links >= 0.0 else None,
            "require_torus": args.require_torus,
        })

    # Set debug level
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)

    # Create temporary file in which to write CSV of all board IPs.
    # mkstemp hands back an already-open OS-level descriptor; only the
    # filename is used below, so close the descriptor straight away rather
    # than leaking it for the lifetime of the process.
    fd, ip_file_filename = tempfile.mkstemp(".csv", "spinnaker_ips_")
    os.close(fd)

    def info(msg):
        """Write an informational message to the terminal unless --quiet."""
        if not args.quiet:
            t.stream.write("{}\n".format(msg))

    # Reason for destroying the job
    reason = None

    try:
        # Create the job
        try:
            job = Job(*job_args, **job_kwargs)
        except (OSError, IOError) as e:
            info(t.red("Could not connect to server: {}".format(e)))
            return 6

        try:
            # Wait for it to become ready, keeping the user informed along
            # the way
            old_state = None
            cur_state = job.state
            while True:
                # Show debug info on state-change
                if old_state != cur_state:
                    if cur_state == JobState.queued:
                        info(
                            t.update(
                                t.yellow("Job {}: Waiting in queue...".format(
                                    job.id))))
                    elif cur_state == JobState.power:
                        info(
                            t.update(
                                t.yellow(
                                    "Job {}: Waiting for power on...".format(
                                        job.id))))
                    elif cur_state == JobState.ready:
                        # Here we go!
                        break
                    elif cur_state == JobState.destroyed:
                        # Exit with error state
                        try:
                            reason = job.reason
                        except (IOError, OSError):
                            reason = None

                        if reason is not None:
                            info(
                                t.update(
                                    t.red("Job {}: Destroyed: {}".format(
                                        job.id, reason))))
                        else:
                            info(t.red("Job {}: Destroyed.".format(job.id)))
                        return 1
                    elif cur_state == JobState.unknown:
                        info(
                            t.update(
                                t.red("Job {}: Job not recognised by server.".
                                      format(job.id))))
                        return 2
                    else:
                        info(
                            t.update(
                                t.red(
                                    "Job {}: Entered an unrecognised state {}."
                                    .format(job.id, cur_state))))
                        return 3

                try:
                    old_state = cur_state
                    cur_state = job.wait_for_state_change(cur_state)
                except KeyboardInterrupt:
                    # Gracefully terminate from keyboard interrupt
                    info(
                        t.update(
                            t.red("Job {}: Keyboard interrupt.".format(
                                job.id))))
                    reason = "Keyboard interrupt."
                    return 4

            # Machine is now ready
            write_ips_to_csv(job.connections, ip_file_filename)

            # Boot the machine if required
            if MachineController is not None and args.boot:
                info(t.update(t.yellow("Job {}: Booting...".format(job.id))))
                mc = MachineController(job.hostname)
                mc.boot(job.width, job.height)

            info(t.update(t.green("Job {}: Ready!".format(job.id))))

            # Either run the user's application or just print the details.
            if args.command:
                return run_command(args.command, job.id, job.machine_name,
                                   job.connections, job.width, job.height,
                                   ip_file_filename)
            else:
                print_info(job.machine_name, job.connections,
                           job.width, job.height, ip_file_filename)
                return 0
        finally:
            # Destroy job and disconnect client
            if args.no_destroy:
                job.close()
            else:
                job.destroy(reason)
    finally:
        # Delete IP address list file
        os.remove(ip_file_filename)