def __call__(self, argv=None):
    """Run this command: parse arguments, connect, then invoke the body.

    Returns the process exit code: 0 on success, 1 on communication or
    server errors; a :py:class:`Terminate` raised by the body exits
    directly via ``t.exit()``.
    """
    # Assemble the parser, appending the shared server-connection options.
    cfg = config.read_config()
    parser = self.get_parser(cfg)
    group = parser.add_argument_group("spalloc server arguments")
    self.build_server_arg_group(group, cfg)
    args = parser.parse_args(argv)

    # A server hostname is mandatory (either from config or command line).
    if args.hostname is None:
        parser.error("--hostname of spalloc server must be specified")
    self.verify_arguments(args)

    try:
        with self.client_factory(args.hostname, args.port) as client:
            # Fail fast if the server speaks an incompatible protocol.
            version_verify(client, args.timeout)
            self.body(client, args)
            return 0
    except (IOError, OSError, ProtocolError, ProtocolTimeoutError) as e:
        sys.stderr.write("Error communicating with server: {}\n".format(e))
        return 1
    except SpallocServerException as srv_exn:
        sys.stderr.write("Error from server: {}\n".format(srv_exn))
        return 1
    except Terminate as t:
        t.exit()
def test_options(filename, option_name, config_value, value):
    """Check that a single config option parses to the expected value.

    When ``config_value`` is None the option is left out of the written
    file, so the option's default value is exercised instead.
    """
    # Assemble the config file contents before writing them in one go.
    pieces = ["[spalloc]\n"]
    if config_value is not None:
        pieces.append("{}={}".format(option_name, config_value))
    with open(filename, "w") as f:
        f.write("".join(pieces))

    cfg = read_config([filename])
    assert option_name in cfg
    assert cfg[option_name] == value
def test_priority(tempdir):
    """Later config files override earlier ones; unset options fall back
    to earlier files or built-in defaults."""
    contents = [
        ("f1", "[spalloc]\nport=123\nhostname=bar"),
        ("f2", "[spalloc]\nport=321\ntags=qux"),
    ]
    paths = []
    for name, text in contents:
        path = os.path.join(tempdir, name)
        with open(path, "w") as f:
            f.write(text)
        paths.append(path)

    cfg = read_config(paths)
    # Defined in both files: the later file (f2) wins.
    assert cfg["port"] == 321
    # Defined in neither: built-in default applies.
    assert cfg["reconnect_delay"] == 5.0
    # Defined only in the earlier file: survives.
    assert cfg["hostname"] == "bar"
    # Defined only in the later file.
    assert cfg["tags"] == ["qux"]
def main(argv=None):
    """Command-line entry point: list all active jobs on a spalloc server.

    Parameters
    ----------
    argv : [str, ...] or None
        Command-line arguments (defaults to ``sys.argv``).

    Returns
    -------
    int
        0 on success, 1 on a communication error, 2 if the server version
        is incompatible.
    """
    # Terminal wraps stderr so the table on stdout stays clean/pipeable.
    t = Terminal(stream=sys.stderr)
    cfg = config.read_config()
    parser = argparse.ArgumentParser(description="List all active jobs.")
    parser.add_argument("--version", "-V", action="version",
                        version=__version__)
    parser.add_argument("--watch", "-w", action="store_true", default=False,
                        help="watch the list of live jobs in real time")
    filter_args = parser.add_argument_group("filtering arguments")
    filter_args.add_argument("--machine", "-m",
                             help="list only jobs on the specified "
                             "machine")
    filter_args.add_argument("--owner", "-o",
                             help="list only jobs belonging to a particular "
                             "owner")
    server_args = parser.add_argument_group("spalloc server arguments")
    server_args.add_argument("--hostname", "-H", default=cfg["hostname"],
                             help="hostname or IP of the spalloc server "
                             "(default: %(default)s)")
    server_args.add_argument("--port", "-P", default=cfg["port"],
                             type=int,
                             help="port number of the spalloc server "
                             "(default: %(default)s)")
    server_args.add_argument("--timeout", default=cfg["timeout"],
                             type=float, metavar="SECONDS",
                             help="seconds to wait for a response "
                             "from the server (default: %(default)s)")
    args = parser.parse_args(argv)

    # Fail if server not specified
    if args.hostname is None:
        parser.error("--hostname of spalloc server must be specified")

    client = ProtocolClient(args.hostname, args.port)
    try:
        # Connect to server and ensure compatible version
        client.connect()
        version = tuple(
            map(int, client.version(timeout=args.timeout).split(".")))
        if not (VERSION_RANGE_START <= version < VERSION_RANGE_STOP):
            sys.stderr.write("Incompatible server version ({}).\n".format(
                ".".join(map(str, version))))
            return 2

        # In watch mode, ask the server to push job-change notifications.
        if args.watch:
            client.notify_job(timeout=args.timeout)

        while True:
            jobs = client.list_jobs(timeout=args.timeout)

            # Clear the screen before reprinting the table
            if args.watch:
                sys.stdout.write(t.clear_screen())

            print(render_job_list(t, jobs, args.machine, args.owner))

            # Exit or wait for changes, if requested
            if not args.watch:
                return 0
            else:
                # Wait for state change
                try:
                    client.wait_for_notification()
                except KeyboardInterrupt:
                    # Gracefully exit
                    print("")
                    return 0

                # Print a newline to separate old table from the new table
                # when it gets printed if ANSI screen clearing is not
                # possible.
                print("")
    except (IOError, OSError, ProtocolTimeoutError) as e:
        sys.stderr.write("Error communicating with server: {}\n".format(e))
        return 1
    finally:
        client.close()
def main(argv=None):
    """Command-line entry point: show the state of spalloc machines.

    With no machine argument, lists all machines; with a machine name,
    shows that machine in detail. ``--watch`` redraws on server
    notifications.

    Parameters
    ----------
    argv : [str, ...] or None
        Command-line arguments (defaults to ``sys.argv``).

    Returns
    -------
    int
        0 on success, 1 on a communication error, 2 if the server version
        is incompatible (other codes may come from list/show helpers).
    """
    t = Terminal()
    cfg = config.read_config()
    parser = argparse.ArgumentParser(
        description="Get the state of individual machines.")
    parser.add_argument("--version", "-V", action="version",
                        version=__version__)
    parser.add_argument("machine", nargs="?",
                        help="if given, specifies the machine to inspect")
    parser.add_argument("--watch", "-w", action="store_true", default=False,
                        help="update the output when things change.")
    parser.add_argument("--detailed", "-d", action="store_true",
                        default=False,
                        help="list detailed job information")
    server_args = parser.add_argument_group("spalloc server arguments")
    server_args.add_argument("--hostname", "-H", default=cfg["hostname"],
                             help="hostname or IP of the spalloc server "
                             "(default: %(default)s)")
    server_args.add_argument("--port", "-P", default=cfg["port"],
                             type=int,
                             help="port number of the spalloc server "
                             "(default: %(default)s)")
    server_args.add_argument("--timeout", default=cfg["timeout"],
                             type=float, metavar="SECONDS",
                             help="seconds to wait for a response "
                             "from the server (default: %(default)s)")
    args = parser.parse_args(argv)

    # Fail if server not specified
    if args.hostname is None:
        parser.error("--hostname of spalloc server must be specified")

    # Fail if --detailed used without specifying machine
    if args.machine is None and args.detailed:
        parser.error(
            "--detailed only works when a specific machine is specified")

    client = ProtocolClient(args.hostname, args.port)
    try:
        # Connect to server and ensure compatible version
        client.connect()
        version = tuple(
            map(int, client.version(timeout=args.timeout).split(".")))
        if not (VERSION_RANGE_START <= version < VERSION_RANGE_STOP):
            sys.stderr.write("Incompatible server version ({}).\n".format(
                ".".join(map(str, version))))
            return 2

        while True:
            # Re-register for notifications and clear the display before
            # each redraw when watching.
            if args.watch:
                client.notify_machine(args.machine, timeout=args.timeout)
                t.stream.write(t.clear_screen())
                # Prevent errors on stderr being cleared away due to clear
                # being buffered
                t.stream.flush()

            # Get all information
            machines = client.list_machines(timeout=args.timeout)
            jobs = client.list_jobs(timeout=args.timeout)

            # Display accordingly
            if args.machine is None:
                retval = list_machines(t, machines, jobs)
            else:
                retval = show_machine(t, machines, jobs, args.machine,
                                      not args.detailed)

            # Wait for changes (if required)
            if retval != 0 or not args.watch:
                return retval
            else:
                try:
                    client.wait_for_notification()
                    print("")
                except KeyboardInterrupt:
                    print("")
                    return 0
    except (IOError, OSError, ProtocolTimeoutError) as e:
        sys.stderr.write("Error communicating with server: {}\n".format(e))
        return 1
    finally:
        client.close()
def main(argv=None):
    """Command-line entry point: translate between board/chip coordinate
    systems by asking the server where a board or chip is.

    Exactly one of ``--board``, ``--physical``, ``--chip`` or
    ``--job-chip`` must be given (enforced by the mutually exclusive
    argument group).

    Parameters
    ----------
    argv : [str, ...] or None
        Command-line arguments (defaults to ``sys.argv``).

    Returns
    -------
    int
        0 on success, 1 on a communication error, 2 if the server version
        is incompatible, 4 if no board exists at the given location.
    """
    cfg = config.read_config()
    parser = argparse.ArgumentParser(
        description="Find out the location (physical or logical) "
                    "of a chip or board.")
    parser.add_argument("--version", "-V", action="version",
                        version=__version__)
    control_args = parser.add_mutually_exclusive_group(required=True)
    control_args.add_argument("--board", "-b", "--logical", "-l", nargs=4,
                              metavar=("MACHINE", "X", "Y", "Z"),
                              help="specify the logical board coordinate")
    control_args.add_argument("--physical", "-p", nargs=4,
                              metavar=("MACHINE", "CABINET", "FRAME",
                                       "BOARD"),
                              help="specify a board's physical location")
    control_args.add_argument("--chip", "-c", nargs=3,
                              metavar=("MACHINE", "X", "Y"),
                              help="specify a board by chip coordinates (as "
                              "if the whole machine is being used)")
    control_args.add_argument("--job-chip", "-j", nargs=3,
                              metavar=("JOB_ID", "X", "Y"),
                              help="specify the chip coordinates of a chip "
                              "within a job's boards")
    server_args = parser.add_argument_group("spalloc server arguments")
    server_args.add_argument("--hostname", "-H", default=cfg["hostname"],
                             help="hostname or IP of the spalloc server "
                             "(default: %(default)s)")
    server_args.add_argument("--port", "-P", default=cfg["port"],
                             type=int,
                             help="port number of the spalloc server "
                             "(default: %(default)s)")
    server_args.add_argument("--timeout", default=cfg["timeout"],
                             type=float, metavar="SECONDS",
                             help="seconds to wait for a response "
                             "from the server (default: %(default)s)")
    args = parser.parse_args(argv)

    # Fail if server not specified
    if args.hostname is None:
        parser.error("--hostname of spalloc server must be specified")

    client = ProtocolClient(args.hostname, args.port)
    try:
        # Connect to server and ensure compatible version
        client.connect()
        version = tuple(
            map(int, client.version(timeout=args.timeout).split(".")))
        if not (VERSION_RANGE_START <= version < VERSION_RANGE_STOP):
            sys.stderr.write("Incompatible server version ({}).\n".format(
                ".".join(map(str, version))))
            return 2

        # Work out what the user asked for
        # Build the keyword arguments for the server's where_is call from
        # whichever coordinate style was supplied; int() conversions may
        # raise ValueError, which is reported as a usage error.
        try:
            show_board_chip = False
            if args.board:
                machine, x, y, z = args.board
                where_is_kwargs = {
                    "machine": machine,
                    "x": int(x),
                    "y": int(y),
                    "z": int(z),
                }
            elif args.physical:
                machine, c, f, b = args.physical
                where_is_kwargs = {
                    "machine": machine,
                    "cabinet": int(c),
                    "frame": int(f),
                    "board": int(b),
                }
            elif args.chip:
                machine, x, y = args.chip
                where_is_kwargs = {
                    "machine": machine,
                    "chip_x": int(x),
                    "chip_y": int(y),
                }
                show_board_chip = True
            elif args.job_chip:
                job_id, x, y = args.job_chip
                where_is_kwargs = {
                    "job_id": int(job_id),
                    "chip_x": int(x),
                    "chip_y": int(y),
                }
                show_board_chip = True
        except ValueError as e:
            parser.error("Error: {}".format(e))

        # Ask the server
        location = client.where_is(**where_is_kwargs)
        if location is None:
            sys.stderr.write("No boards at the specified location.\n")
            return 4
        else:
            # Render the server's answer as a definition list.
            out = OrderedDict()
            out["Machine"] = location["machine"]
            out["Physical location"] = \
                "Cabinet {}, Frame {}, Board {}".format(
                    *location["physical"])
            out["Board coordinate"] = tuple(location["logical"])
            out["Machine chip coordinates"] = tuple(location["chip"])
            if show_board_chip:
                out["Coordinates within board"] = \
                    tuple(location["board_chip"])
            out["Job using board"] = location["job_id"]
            if location["job_id"]:
                out["Coordinates within job"] = tuple(location["job_chip"])
            print(render_definitions(out))
            return 0
    except (IOError, OSError, ProtocolTimeoutError) as e:
        sys.stderr.write("Error communicating with server: {}\n".format(e))
        return 1
    finally:
        client.close()
def __init__(self, *args, **kwargs):
    """Request a SpiNNaker machine.

    A :py:class:`.Job` is constructed in one of the following styles::

        >>> # Any single (SpiNN-5) board
        >>> Job()
        >>> Job(1)

        >>> # Any machine with at least 4 boards
        >>> Job(4)

        >>> # Any 7-or-more board machine with an aspect ratio at least as
        >>> # square as 1:2
        >>> Job(7, min_ratio=0.5)

        >>> # Any 4x5 triad segment of a machine (may or may-not be a
        >>> # torus/full machine)
        >>> Job(4, 5)

        >>> # Any torus-connected (full machine) 4x2 machine
        >>> Job(4, 2, require_torus=True)

        >>> # Board x=3, y=2, z=1 on the machine named "m"
        >>> Job(3, 2, 1, machine="m")

        >>> # Keep using (and keeping-alive) an existing allocation
        >>> Job(resume_job_id=123)

    Once finished with a Job, the :py:meth:`.destroy` (or in unusual
    applications :py:meth:`.Job.close`) method must be called to destroy
    the job, close the connection to the server and terminate the
    background keep-alive thread. Alternatively, a Job may be used as a
    context manager which automatically calls :py:meth:`.destroy` on
    exiting the block::

        >>> with Job() as j:
        ...     # ...for example...
        ...     my_boot(j.hostname, j.width, j.height)
        ...     my_application(j.hostname)

    The following keyword-only parameters below are used both to specify
    the server details as well as the job requirements. Most parameters
    default to the values supplied in the local
    :py:mod:`~spalloc.config` file allowing usage as in the examples
    above.

    Parameters
    ----------
    hostname : str
        **Required.** The name of the spalloc server to connect to. (Read
        from config file if not specified.)
    port : int
        The port number of the spalloc server to connect to. (Read from
        config file if not specified.)
    reconnect_delay : float
        Number of seconds between attempts to reconnect to the server.
        (Read from config file if not specified.)
    timeout : float or None
        Timeout for waiting for replies from the server. If None, will
        keep trying forever. (Read from config file if not specified.)
    config_filenames : [str, ...]
        If given must be a list of filenames to read configuration
        options from. If not supplied, the default config file locations
        are searched. Set to an empty list to prevent using values from
        config files.

    Other Parameters
    ----------------
    resume_job_id : int or None
        If supplied, rather than creating a new job, take on an existing
        one, keeping it alive as required by the original job. If this
        argument is used, all other requirements are ignored.
    owner : str
        **Required.** The name of the owner of the job. By convention
        this should be your email address. (Read from config file if not
        specified.)
    keepalive : float or None
        The number of seconds after which the server may consider the job
        dead if this client cannot communicate with it. If None, no
        timeout will be used and the job will run until explicitly
        destroyed. Use with extreme caution. (Read from config file if
        not specified.)
    machine : str or None
        Specify the name of a machine which this job must be executed on.
        If None, the first suitable machine available will be used,
        according to the tags selected below. Must be None when tags are
        given. (Read from config file if not specified.)
    tags : [str, ...] or None
        The set of tags which any machine running this job must have. If
        None is supplied, only machines with the "default" tag will be
        used. If machine is given, this argument must be None. (Read from
        config file if not specified.)
    min_ratio : float
        The aspect ratio (h/w) which the allocated region must be 'at
        least as square as'. Set to 0.0 for any allowable shape, 1.0 to
        be exactly square etc. Ignored when allocating single boards or
        specific rectangles of triads.
    max_dead_boards : int or None
        The maximum number of broken or unreachable boards to allow in
        the allocated region. If None, any number of dead boards is
        permitted, as long as the board on the bottom-left corner is
        alive. (Read from config file if not specified.)
    max_dead_links : int or None
        The maximum number of broken links allow in the allocated region.
        When require_torus is True this includes wrap-around links,
        otherwise peripheral links are not counted. If None, any number
        of broken links is allowed. (Read from config file if not
        specified.).
    require_torus : bool
        If True, only allocate blocks with torus connectivity. In general
        this will only succeed for requests to allocate an entire machine.
        Must be False when allocating boards. (Read from config file if
        not specified.)

    Raises
    ------
    ValueError
        If no hostname is available, if no owner is given for a new job,
        or if both tags and machine are specified.
    JobDestroyedError
        If ``resume_job_id`` refers to a job which no longer exists.
    """
    # Read configuration
    config_filenames = kwargs.pop("config_filenames", SEARCH_PATH)
    config = read_config(config_filenames)

    # Get protocol client options; explicit kwargs win over config values.
    hostname = kwargs.get("hostname", config["hostname"])
    owner = kwargs.get("owner", config["owner"])
    port = kwargs.get("port", config["port"])
    self._reconnect_delay = kwargs.get("reconnect_delay",
                                       config["reconnect_delay"])
    self._timeout = kwargs.get("timeout", config["timeout"])
    if hostname is None:
        raise ValueError("A hostname must be specified.")

    # Cached responses of _get_state and _get_machine_info
    self._last_state = None
    self._last_machine_info = None

    # Connection to server (and associated lock)
    self._client = ProtocolClient(hostname, port)
    self._client_lock = threading.RLock()

    # Set-up (but don't start) background keepalive thread
    # NOTE(review): ``target=self._keepalive_thread`` binds the class's
    # method *before* this assignment shadows the name with the Thread
    # object on the instance, so the thread still runs the method.
    # Confusing but functional.
    self._keepalive_thread = threading.Thread(
        target=self._keepalive_thread,
        name="job-keepalive-thread")
    self._keepalive_thread.daemon = True

    # Event fired when the background thread should shut-down
    self._stop = threading.Event()

    # Check version compatibility (fail fast if can't communicate with
    # server)
    self._client.connect(timeout=self._timeout)
    self._assert_compatible_version()

    # Resume/create the job
    resume_job_id = kwargs.get("resume_job_id", None)
    if resume_job_id:
        self.id = resume_job_id

        # If the job no longer exists, we can't get the keepalive interval
        # (and there's nothing to keepalive) so just bail out.
        job_state = self._get_state()
        if (job_state.state == JobState.unknown or
                job_state.state == JobState.destroyed):
            raise JobDestroyedError("Job {} does not exist: {}{}{}".format(
                resume_job_id,
                job_state.state.name,
                ": " if job_state.reason is not None else "",
                job_state.reason if job_state.reason is not None else ""))

        # Snag the keepalive interval from the job
        self._keepalive = job_state.keepalive

        logger.info("Spalloc resumed job %d", self.id)
    else:
        # Get job creation arguments
        job_args = args
        job_kwargs = {
            "owner": owner,
            "keepalive": kwargs.get("keepalive", config["keepalive"]),
            "machine": kwargs.get("machine", config["machine"]),
            "tags": kwargs.get("tags", config["tags"]),
            "min_ratio": kwargs.get("min_ratio", config["min_ratio"]),
            "max_dead_boards": kwargs.get("max_dead_boards",
                                          config["max_dead_boards"]),
            "max_dead_links": kwargs.get("max_dead_links",
                                         config["max_dead_links"]),
            "require_torus": kwargs.get("require_torus",
                                        config["require_torus"]),
            "timeout": self._timeout,
        }

        # Sanity check arguments
        if job_kwargs["owner"] is None:
            raise ValueError("An owner must be specified.")
        if ((job_kwargs["tags"] is not None) and
                (job_kwargs["machine"] is not None)):
            raise ValueError(
                "Only one of tags and machine may be specified.")

        self._keepalive = job_kwargs["keepalive"]

        # Create the job (failing fast if can't communicate)
        self.id = self._client.create_job(*job_args, **job_kwargs)

        logger.info("Created spalloc job %d", self.id)

    # Start keepalive thread now that everything is up
    self._keepalive_thread.start()
def main(argv=None):
    """Command-line entry point: manage a running job (watch, power,
    list IPs, destroy, or show info).

    If no job ID is given, the single live job belonging to ``--owner``
    is used; ambiguity or absence is an error.

    Parameters
    ----------
    argv : [str, ...] or None
        Command-line arguments (defaults to ``sys.argv``).

    Returns
    -------
    int
        0 on success, 1 on a communication error, 2 if the server version
        is incompatible, 3 if the job could not be identified (other
        codes may come from the per-action helpers).
    """
    t = Terminal()
    cfg = config.read_config()
    parser = argparse.ArgumentParser(description="Manage running jobs.")
    parser.add_argument("--version", "-V", action="version",
                        version=__version__)
    parser.add_argument("job_id", type=int, nargs="?",
                        help="the job ID of interest, optional if the "
                        "current owner only has one job")
    parser.add_argument("--owner", "-o", default=cfg["owner"],
                        help="if no job ID is provided and this owner has "
                        "only one job, this job is assumed "
                        "(default: %(default)s)")
    control_args = parser.add_mutually_exclusive_group()
    control_args.add_argument("--info", "-i", action="store_true",
                              help="Show basic job information (the "
                              "default)")
    control_args.add_argument("--watch", "-w", action="store_true",
                              help="watch this job for state changes")
    control_args.add_argument("--power-on", "--reset", "-p", "-r",
                              action="store_true",
                              help="power-on or reset the job's boards")
    control_args.add_argument("--power-off", action="store_true",
                              help="power-off the job's boards")
    control_args.add_argument("--ethernet-ips", "-e", action="store_true",
                              help="output the IPs of all Ethernet "
                              "connected chips as a CSV")
    control_args.add_argument("--destroy", "-D", nargs="?",
                              metavar="REASON", const="",
                              help="destroy a queued or running job")
    server_args = parser.add_argument_group("spalloc server arguments")
    server_args.add_argument("--hostname", "-H", default=cfg["hostname"],
                             help="hostname or IP of the spalloc server "
                             "(default: %(default)s)")
    server_args.add_argument("--port", "-P", default=cfg["port"],
                             type=int,
                             help="port number of the spalloc server "
                             "(default: %(default)s)")
    server_args.add_argument("--timeout", default=cfg["timeout"],
                             type=float, metavar="SECONDS",
                             help="seconds to wait for a response "
                             "from the server (default: %(default)s)")
    args = parser.parse_args(argv)

    # Fail if server not specified
    if args.hostname is None:
        parser.error("--hostname of spalloc server must be specified")

    # Fail if job *and* owner not specified
    if args.job_id is None and args.owner is None:
        parser.error("job ID (or --owner) not specified")

    client = ProtocolClient(args.hostname, args.port)
    try:
        # Connect to server and ensure compatible version
        client.connect()
        version = tuple(
            map(int, client.version(timeout=args.timeout).split(".")))
        if not (VERSION_RANGE_START <= version < VERSION_RANGE_STOP):
            sys.stderr.write("Incompatible server version ({}).\n".format(
                ".".join(map(str, version))))
            return 2

        # If no Job ID specified, attempt to discover one
        if args.job_id is None:
            jobs = client.list_jobs(timeout=args.timeout)
            job_ids = [
                job["job_id"] for job in jobs if job["owner"] == args.owner
            ]
            if len(job_ids) == 0:
                sys.stderr.write("Owner {} has no live jobs.\n".format(
                    args.owner))
                return 3
            elif len(job_ids) > 1:
                sys.stderr.write(
                    "Ambiguous: {} has {} live jobs: {}\n".format(
                        args.owner, len(job_ids),
                        ", ".join(map(str, job_ids))))
                return 3
            else:
                args.job_id = job_ids[0]

        # Do as the user asked
        if args.watch:
            return watch_job(t, client, args.timeout, args.job_id)
        elif args.power_on:
            return power_job(client, args.timeout, args.job_id, True)
        elif args.power_off:
            return power_job(client, args.timeout, args.job_id, False)
        elif args.ethernet_ips:
            return list_ips(client, args.timeout, args.job_id)
        elif args.destroy is not None:
            # Set default destruction message
            if args.destroy == "" and args.owner:
                args.destroy = "Destroyed by {}".format(args.owner)
            return destroy_job(client, args.timeout, args.job_id,
                               args.destroy)
        else:
            return show_job_info(t, client, args.timeout, args.job_id)
    except (IOError, OSError, ProtocolTimeoutError) as e:
        sys.stderr.write("Error communicating with server: {}\n".format(e))
        return 1
    finally:
        client.close()
def parse_argv(argv):
    """Build the argument parser for the allocate command and parse argv.

    Config-file values are folded in as argument defaults; ``None``
    sentinels for numeric options are mapped to ``-1`` so argparse can
    display them.

    Parameters
    ----------
    argv : [str, ...] or None
        Command-line arguments (defaults to ``sys.argv``).

    Returns
    -------
    (argparse.ArgumentParser, argparse.Namespace)
        The parser (for later ``parser.error`` calls) and parsed
        arguments.
    """
    cfg = config.read_config()
    parser = argparse.ArgumentParser(
        description="Request (and allocate) a SpiNNaker machine.")
    parser.add_argument("--version", "-V", action="version",
                        version=__version__)
    parser.add_argument("--quiet", "-q", action="store_true", default=False,
                        help="suppress informational messages")
    parser.add_argument("--debug", action="store_true", default=False,
                        help="enable additional diagnostic information")
    parser.add_argument("--no-destroy", "-D", action="store_true",
                        default=False,
                        help="do not destroy the job on exit")
    # --boot is only meaningful when rig's MachineController is installed.
    if MachineController is not None:
        parser.add_argument("--boot", "-B", action="store_true",
                            default=False,
                            help="boot the machine once powered on")

    allocation_args = parser.add_argument_group(
        "allocation requirement arguments")
    allocation_args.add_argument(
        "what", nargs="*", default=[], type=int, metavar="WHAT",
        help="what to allocate: nothing or 1 requests 1 SpiNN-5 board, NUM "
        "requests at least NUM SpiNN-5 boards, WIDTH HEIGHT means "
        "WIDTHxHEIGHT triads of SpiNN-5 boards and X Y Z requests a "
        "board the specified logical board coordinate.")
    allocation_args.add_argument(
        "--resume", "-r", type=int,
        help="if given, resume keeping the specified job alive rather than "
        "creating a new job (all allocation requirements will be ignored)")
    allocation_args.add_argument(
        "--machine", "-m", nargs="?", default=cfg["machine"],
        help="only allocate boards which are part of a specific machine, or "
        "any machine if no machine is given (default: %(default)s)")
    allocation_args.add_argument(
        "--tags", "-t", nargs="*", metavar="TAG",
        default=cfg["tags"] or ["default"],
        help="only allocate boards which have (at least) the specified flags "
        "(default: {})".format(" ".join(cfg["tags"] or [])))
    allocation_args.add_argument(
        "--min-ratio", type=float, metavar="RATIO",
        default=cfg["min_ratio"],
        help="when allocating by number of boards, require that the "
        "allocation be at least as square as this ratio (default: "
        "%(default)s)")
    allocation_args.add_argument(
        "--max-dead-boards", type=int, metavar="NUM",
        default=(
            -1 if cfg["max_dead_boards"] is None
            else cfg["max_dead_boards"]),
        help="boards allowed to be dead in the allocation, or -1 to allow "
        "any number of dead boards (default: %(default)s)")
    allocation_args.add_argument(
        "--max-dead-links", type=int, metavar="NUM",
        default=(
            -1 if cfg["max_dead_links"] is None
            else cfg["max_dead_links"]),
        help="inter-board links allowed to be dead in the allocation, or -1 "
        "to allow any number of dead links (default: %(default)s)")
    allocation_args.add_argument(
        "--require-torus", "-w", action="store_true",
        default=cfg["require_torus"],
        help="require that the allocation contain torus (a.k.a. wrap-around) "
        "links {}".format("(default)" if cfg["require_torus"] else ""))
    allocation_args.add_argument(
        "--no-require-torus", "-W", action="store_false",
        dest="require_torus",
        help="do not require that the allocation contain torus (a.k.a. "
        "wrap-around) links {}".format(
            "" if cfg["require_torus"] else "(default)"))

    command_args = parser.add_argument_group("command wrapping arguments")
    command_args.add_argument(
        "--command", "-c", nargs=argparse.REMAINDER,
        help="execute the specified command once boards have been allocated "
        "and deallocate the boards when the application exits ({} and "
        "{hostname} are substituted for the chip chip at (0, 0)'s hostname, "
        "{w} and {h} give the dimensions of the SpiNNaker machine in chips, "
        "{ethernet_ips} is a temporary file containing a CSV with three "
        "columns: x, y and hostname giving the hostname of each Ethernet "
        "connected SpiNNaker chip)")

    server_args = parser.add_argument_group("spalloc server arguments")
    server_args.add_argument(
        "--owner", default=cfg["owner"],
        help="by convention, the email address of the owner of the job "
        "(default: %(default)s)")
    server_args.add_argument(
        "--hostname", "-H", default=cfg["hostname"],
        help="hostname or IP of the spalloc server (default: %(default)s)")
    server_args.add_argument(
        "--port", "-P", default=cfg["port"], type=int,
        help="port number of the spalloc server (default: %(default)s)")
    server_args.add_argument(
        "--keepalive", type=int, metavar="SECONDS",
        default=(-1 if cfg["keepalive"] is None else cfg["keepalive"]),
        help="the interval at which to require keepalive messages to be "
        "sent to prevent the server cancelling the job, or -1 to not "
        "require keepalive messages (default: %(default)s)")
    server_args.add_argument(
        "--reconnect-delay", default=cfg["reconnect_delay"], type=float,
        metavar="SECONDS",
        help="seconds to wait before reconnecting to the server if the "
        "connection is lost (default: %(default)s)")
    server_args.add_argument(
        "--timeout", default=cfg["timeout"], type=float, metavar="SECONDS",
        help="seconds to wait for a response from the server "
        "(default: %(default)s)")
    return parser, parser.parse_args(argv)
def main(argv=None):
    """Command-line entry point: request and allocate a SpiNNaker machine,
    then either run a wrapped command or print connection details.

    Waits for the job to become ready, reporting state changes along the
    way, and always destroys (or, with ``--no-destroy``, merely closes)
    the job on the way out.

    Parameters
    ----------
    argv : [str, ...] or None
        Command-line arguments (defaults to ``sys.argv``).

    Returns
    -------
    int
        0 on success; 1 job destroyed; 2 job unknown to the server;
        3 unrecognised job state; 4 keyboard interrupt while waiting;
        6 could not connect to the server (or the wrapped command's exit
        code when ``--command`` is used).
    """
    # Informational messages go to stderr so stdout stays machine-readable.
    t = Terminal(stream=sys.stderr)
    cfg = config.read_config()
    parser = argparse.ArgumentParser(
        description="Request (and allocate) a SpiNNaker machine.")
    parser.add_argument("--version", "-V", action="version",
                        version=__version__)
    parser.add_argument("--quiet", "-q", action="store_true", default=False,
                        help="suppress informational messages")
    parser.add_argument("--debug", action="store_true", default=False,
                        help="enable additional diagnostic information")
    parser.add_argument("--no-destroy", "-D", action="store_true",
                        default=False,
                        help="do not destroy the job on exit")
    # --boot is only available when rig's MachineController is installed.
    if MachineController is not None:
        parser.add_argument("--boot", "-B", action="store_true",
                            default=False,
                            help="boot the machine once powered on")
    allocation_args = parser.add_argument_group(
        "allocation requirement arguments")
    allocation_args.add_argument("what", nargs="*", default=[], type=int,
                                 metavar="WHAT",
                                 help="what to allocate: nothing or 1 "
                                 "requests 1 SpiNN-5 board, NUM requests "
                                 "at least NUM SpiNN-5 boards, WIDTH "
                                 "HEIGHT means WIDTHxHEIGHT triads of "
                                 "SpiNN-5 boards and X Y Z requests a "
                                 "board the specified logical board "
                                 "coordinate.")
    allocation_args.add_argument("--resume", "-r", type=int,
                                 help="if given, resume keeping the "
                                 "specified job alive rather than "
                                 "creating a new job (all allocation "
                                 "requirements will be ignored)")
    allocation_args.add_argument("--machine", "-m", nargs="?",
                                 default=cfg["machine"],
                                 help="only allocate boards which are part "
                                 "of a specific machine, or any machine "
                                 "if no machine is given "
                                 "(default: %(default)s)")
    allocation_args.add_argument("--tags", "-t", nargs="*", metavar="TAG",
                                 default=cfg["tags"] or ["default"],
                                 help="only allocate boards which have (at "
                                 "least) the specified flags "
                                 "(default: {})".format(
                                     " ".join(cfg["tags"] or [])))
    allocation_args.add_argument("--min-ratio", type=float, metavar="RATIO",
                                 default=cfg["min_ratio"],
                                 help="when allocating by number of boards, "
                                 "require that the allocation be at "
                                 "least as square as this ratio "
                                 "(default: %(default)s)")
    allocation_args.add_argument("--max-dead-boards", type=int,
                                 metavar="NUM",
                                 default=(-1 if cfg["max_dead_boards"]
                                          is None
                                          else cfg["max_dead_boards"]),
                                 help="boards allowed to be "
                                 "dead in the allocation, or -1 to allow "
                                 "any number of dead boards "
                                 "(default: %(default)s)")
    allocation_args.add_argument("--max-dead-links", type=int,
                                 metavar="NUM",
                                 default=(-1 if cfg["max_dead_links"]
                                          is None
                                          else cfg["max_dead_links"]),
                                 help="inter-board links allowed to be "
                                 "dead in the allocation, or -1 to allow "
                                 "any number of dead links "
                                 "(default: %(default)s)")
    allocation_args.add_argument(
        "--require-torus", "-w", action="store_true",
        default=cfg["require_torus"],
        help="require that the allocation contain "
        "torus (a.k.a. wrap-around) "
        "links {}".format("(default)" if cfg["require_torus"] else ""))
    allocation_args.add_argument(
        "--no-require-torus", "-W", action="store_false",
        dest="require_torus",
        help="do not require that the allocation "
        "contain torus (a.k.a. wrap-around) "
        "links {}".format("" if cfg["require_torus"] else "(default)"))
    command_args = parser.add_argument_group("command wrapping arguments")
    command_args.add_argument("--command", "-c", nargs=argparse.REMAINDER,
                              help="execute the specified command once "
                              "boards have been allocated and deallocate "
                              "the boards when the application exits ({} "
                              "and {hostname} are substituted for the "
                              "chip chip at (0, 0)'s hostname, {w} and "
                              "{h} give the dimensions of the SpiNNaker "
                              "machine in chips, {ethernet_ips} is a "
                              "temporary file containing a CSV with "
                              "three columns: x, y and hostname giving "
                              "the hostname of each Ethernet connected "
                              "SpiNNaker chip)")
    server_args = parser.add_argument_group("spalloc server arguments")
    server_args.add_argument("--owner", default=cfg["owner"],
                             help="by convention, the email address of the "
                             "owner of the job (default: %(default)s)")
    server_args.add_argument("--hostname", "-H", default=cfg["hostname"],
                             help="hostname or IP of the spalloc server "
                             "(default: %(default)s)")
    server_args.add_argument("--port", "-P", default=cfg["port"],
                             type=int,
                             help="port number of the spalloc server "
                             "(default: %(default)s)")
    server_args.add_argument(
        "--keepalive", type=int, metavar="SECONDS",
        default=(-1 if cfg["keepalive"] is None else cfg["keepalive"]),
        help="the interval at which to require "
        "keepalive messages to be sent to "
        "prevent the server cancelling the "
        "job, or -1 to not require keepalive "
        "messages (default: %(default)s)")
    server_args.add_argument("--reconnect-delay",
                             default=cfg["reconnect_delay"], type=float,
                             metavar="SECONDS",
                             help="seconds to wait before "
                             "reconnecting to the server if the "
                             "connection is lost (default: %(default)s)")
    server_args.add_argument("--timeout", default=cfg["timeout"],
                             type=float, metavar="SECONDS",
                             help="seconds to wait for a response "
                             "from the server (default: %(default)s)")
    args = parser.parse_args(argv)

    # Fail if no owner is defined (unless resuming)
    if not args.owner and args.resume is None:
        parser.error(
            "--owner must be specified (typically your email address)")

    # Fail if server not specified
    if args.hostname is None:
        parser.error("--hostname of spalloc server must be specified")

    # Set universal job arguments; negative values are the CLI's stand-in
    # for None ("no limit"/"no timeout").
    job_kwargs = {
        "hostname": args.hostname,
        "port": args.port,
        "reconnect_delay":
            args.reconnect_delay if args.reconnect_delay >= 0.0 else None,
        "timeout": args.timeout if args.timeout >= 0.0 else None,
    }

    if args.resume:
        job_args = []
        job_kwargs.update({
            "resume_job_id": args.resume,
        })
    else:
        # Make sure 'what' takes the right form
        if len(args.what) not in (0, 1, 2, 3):
            parser.error("expected either no arguments, one argument, NUM, "
                         "two arguments, WIDTH HEIGHT, or three arguments "
                         "X Y Z")

        # Unpack arguments for the job and server
        job_args = args.what
        job_kwargs.update({
            "owner": args.owner,
            "keepalive": args.keepalive if args.keepalive >= 0.0 else None,
            "machine": args.machine,
            "tags": args.tags if args.machine is None else None,
            "min_ratio": args.min_ratio,
            "max_dead_boards":
                args.max_dead_boards
                if args.max_dead_boards >= 0.0 else None,
            "max_dead_links":
                args.max_dead_links
                if args.max_dead_links >= 0.0 else None,
            "require_torus": args.require_torus,
        })

    # Set debug level
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)

    # Create temporary file in which to write CSV of all board IPs
    _, ip_file_filename = tempfile.mkstemp(".csv", "spinnaker_ips_")

    def info(msg):
        # Print an informational message unless --quiet was given.
        if not args.quiet:
            t.stream.write("{}\n".format(msg))

    # Reason for destroying the job
    reason = None
    try:
        # Create the job
        try:
            job = Job(*job_args, **job_kwargs)
        except (OSError, IOError) as e:
            info(t.red("Could not connect to server: {}".format(e)))
            return 6
        try:
            # Wait for it to become ready, keeping the user informed along
            # the way
            old_state = None
            cur_state = job.state
            while True:
                # Show debug info on state-change
                if old_state != cur_state:
                    if cur_state == JobState.queued:
                        info(t.update(t.yellow(
                            "Job {}: Waiting in queue...".format(job.id))))
                    elif cur_state == JobState.power:
                        info(t.update(t.yellow(
                            "Job {}: Waiting for power on...".format(
                                job.id))))
                    elif cur_state == JobState.ready:
                        # Here we go!
                        break
                    elif cur_state == JobState.destroyed:
                        # Exit with error state
                        try:
                            reason = job.reason
                        except (IOError, OSError):
                            reason = None

                        if reason is not None:
                            info(t.update(t.red(
                                "Job {}: Destroyed: {}".format(
                                    job.id, reason))))
                        else:
                            info(t.red("Job {}: Destroyed.".format(job.id)))
                        return 1
                    elif cur_state == JobState.unknown:
                        info(t.update(t.red(
                            "Job {}: Job not recognised by server.".format(
                                job.id))))
                        return 2
                    else:
                        info(t.update(t.red(
                            "Job {}: Entered an unrecognised state {}."
                            .format(job.id, cur_state))))
                        return 3

                try:
                    old_state = cur_state
                    cur_state = job.wait_for_state_change(cur_state)
                except KeyboardInterrupt:
                    # Gracefully terminate from keyboard interrupt
                    info(t.update(t.red(
                        "Job {}: Keyboard interrupt.".format(job.id))))
                    reason = "Keyboard interrupt."
                    return 4

            # Machine is now ready
            write_ips_to_csv(job.connections, ip_file_filename)

            # Boot the machine if required
            if MachineController is not None and args.boot:
                info(t.update(t.yellow(
                    "Job {}: Booting...".format(job.id))))
                mc = MachineController(job.hostname)
                mc.boot(job.width, job.height)

            info(t.update(t.green("Job {}: Ready!".format(job.id))))

            # Either run the user's application or just print the details.
            if args.command:
                return run_command(args.command, job.id, job.machine_name,
                                   job.connections, job.width, job.height,
                                   ip_file_filename)
            else:
                print_info(job.machine_name, job.connections,
                           job.width, job.height, ip_file_filename)
                return 0
        finally:
            # Destroy job and disconnect client
            if args.no_destroy:
                job.close()
            else:
                job.destroy(reason)
    finally:
        # Delete IP address list file
        os.remove(ip_file_filename)