예제 #1
0
    def __call__(self, argv=None):
        """Parse the command line, connect to the server and run the body.

        Parameters
        ----------
        argv : [str, ...] or None
            Forwarded unchanged to the parser's ``parse_args``.

        Returns
        -------
        int
            0 once :py:meth:`body` has completed; 1 after a communication or
            server-side error has been reported on stderr.
        """
        defaults = config.read_config()
        parser = self.get_parser(defaults)
        self.build_server_arg_group(
            parser.add_argument_group("spalloc server arguments"), defaults)
        options = parser.parse_args(argv)

        # A server hostname is mandatory; parser.error() reports and exits.
        if options.hostname is None:
            parser.error("--hostname of spalloc server must be specified")
        self.verify_arguments(options)

        try:
            with self.client_factory(options.hostname, options.port) as conn:
                version_verify(conn, options.timeout)
                self.body(conn, options)
                return 0
        except (IOError, OSError, ProtocolError, ProtocolTimeoutError) as err:
            sys.stderr.write(
                "Error communicating with server: {}\n".format(err))
            return 1
        except SpallocServerException as server_err:
            sys.stderr.write("Error from server: {}\n".format(server_err))
            return 1
        except Terminate as request:
            # The Terminate object decides how the process finishes.
            request.exit()
예제 #2
0
def test_options(filename, option_name, config_value, value):
    """Check that one config option is read back as ``value``.

    When ``config_value`` is None the option line is left out of the file,
    so the value read back is whatever ``read_config`` supplies on its own.
    """
    # Assemble the file contents first, then write them in one go.
    contents = "[spalloc]\n"
    if config_value is not None:
        contents += "{}={}".format(option_name, config_value)
    with open(filename, "w") as config_file:
        config_file.write(contents)

    parsed = read_config([filename])

    assert option_name in parsed
    assert parsed[option_name] == value
예제 #3
0
def test_priority(tempdir):
    """Check how options from two config files are combined.

    ``port`` appears in both files and must take the value from the second;
    ``hostname`` and ``tags`` each appear in only one file; ``reconnect_delay``
    appears in neither and must still be present.
    """
    fixtures = (
        ("f1", "[spalloc]\nport=123\nhostname=bar"),
        ("f2", "[spalloc]\nport=321\ntags=qux"),
    )

    # Write each fixture and remember the paths in the order given above.
    paths = []
    for name, text in fixtures:
        path = os.path.join(tempdir, name)
        with open(path, "w") as config_file:
            config_file.write(text)
        paths.append(path)

    merged = read_config(paths)

    assert merged["port"] == 321
    assert merged["reconnect_delay"] == 5.0
    assert merged["hostname"] == "bar"
    assert merged["tags"] == ["qux"]
예제 #4
0
def main(argv=None):
    """List the server's jobs once, or repeatedly when ``--watch`` is given.

    Parameters
    ----------
    argv : [str, ...] or None
        Command line arguments, handed to ``ArgumentParser.parse_args``.

    Returns
    -------
    int
        0 on success (including Ctrl+C while waiting in watch mode), 1 if
        communication with the server fails and 2 if the server reports a
        version outside the supported range.
    """
    term = Terminal(stream=sys.stderr)
    defaults = config.read_config()

    parser = argparse.ArgumentParser(description="List all active jobs.")
    parser.add_argument(
        "--version", "-V", action="version", version=__version__)
    parser.add_argument(
        "--watch", "-w", action="store_true", default=False,
        help="watch the list of live jobs in real time")

    filtering = parser.add_argument_group("filtering arguments")
    filtering.add_argument(
        "--machine", "-m",
        help="list only jobs on the specified machine")
    filtering.add_argument(
        "--owner", "-o",
        help="list only jobs belonging to a particular owner")

    server = parser.add_argument_group("spalloc server arguments")
    server.add_argument(
        "--hostname", "-H", default=defaults["hostname"],
        help="hostname or IP of the spalloc server (default: %(default)s)")
    server.add_argument(
        "--port", "-P", default=defaults["port"], type=int,
        help="port number of the spalloc server (default: %(default)s)")
    server.add_argument(
        "--timeout", default=defaults["timeout"], type=float,
        metavar="SECONDS",
        help="seconds to wait for a response from the server "
             "(default: %(default)s)")

    opts = parser.parse_args(argv)

    # A server hostname is mandatory; parser.error() reports and exits.
    if opts.hostname is None:
        parser.error("--hostname of spalloc server must be specified")

    watching = opts.watch
    client = ProtocolClient(opts.hostname, opts.port)
    try:
        # Connect and make sure the server speaks a compatible version
        client.connect()
        reported = client.version(timeout=opts.timeout)
        version = tuple(int(field) for field in reported.split("."))
        if not (VERSION_RANGE_START <= version < VERSION_RANGE_STOP):
            sys.stderr.write("Incompatible server version ({}).\n".format(
                ".".join(str(field) for field in version)))
            return 2

        # Register for job notifications once, before the first listing
        if watching:
            client.notify_job(timeout=opts.timeout)

        while True:
            jobs = client.list_jobs(timeout=opts.timeout)

            # Wipe the previous table before drawing the new one
            if watching:
                sys.stdout.write(term.clear_screen())

            print(render_job_list(term, jobs, opts.machine, opts.owner))

            # One-shot mode: a single table is all that was asked for
            if not watching:
                return 0

            # Block until the server reports a change; Ctrl+C ends the watch
            try:
                client.wait_for_notification()
            except KeyboardInterrupt:
                print("")
                return 0

            # Blank line keeps successive tables apart when the terminal
            # cannot be cleared with ANSI codes.
            print("")

    except (IOError, OSError, ProtocolTimeoutError) as err:
        sys.stderr.write("Error communicating with server: {}\n".format(err))
        return 1
    finally:
        client.close()
예제 #5
0
def main(argv=None):
    """Show an overview of all machines, or the state of a single machine.

    Parameters
    ----------
    argv : [str, ...] or None
        Command line arguments, handed to ``ArgumentParser.parse_args``.

    Returns
    -------
    int
        The value returned by ``list_machines``/``show_machine`` when not
        watching (or as soon as it is non-zero), 0 after Ctrl+C while
        watching, 1 if communication with the server fails and 2 if the
        server reports a version outside the supported range.
    """
    term = Terminal()
    defaults = config.read_config()

    parser = argparse.ArgumentParser(
        description="Get the state of individual machines.")
    parser.add_argument(
        "--version", "-V", action="version", version=__version__)
    parser.add_argument(
        "machine", nargs="?",
        help="if given, specifies the machine to inspect")
    parser.add_argument(
        "--watch", "-w", action="store_true", default=False,
        help="update the output when things change.")
    parser.add_argument(
        "--detailed", "-d", action="store_true", default=False,
        help="list detailed job information")

    server = parser.add_argument_group("spalloc server arguments")
    server.add_argument(
        "--hostname", "-H", default=defaults["hostname"],
        help="hostname or IP of the spalloc server (default: %(default)s)")
    server.add_argument(
        "--port", "-P", default=defaults["port"], type=int,
        help="port number of the spalloc server (default: %(default)s)")
    server.add_argument(
        "--timeout", default=defaults["timeout"], type=float,
        metavar="SECONDS",
        help="seconds to wait for a response from the server "
             "(default: %(default)s)")

    opts = parser.parse_args(argv)

    # A server hostname is mandatory; parser.error() reports and exits.
    if opts.hostname is None:
        parser.error("--hostname of spalloc server must be specified")

    # --detailed makes no sense for the all-machines overview
    if opts.detailed and opts.machine is None:
        parser.error(
            "--detailed only works when a specific machine is specified")

    client = ProtocolClient(opts.hostname, opts.port)
    try:
        # Connect and make sure the server speaks a compatible version
        client.connect()
        reported = client.version(timeout=opts.timeout)
        version = tuple(int(field) for field in reported.split("."))
        if not (VERSION_RANGE_START <= version < VERSION_RANGE_STOP):
            sys.stderr.write("Incompatible server version ({}).\n".format(
                ".".join(str(field) for field in version)))
            return 2

        while True:
            if opts.watch:
                client.notify_machine(opts.machine, timeout=opts.timeout)
                term.stream.write(term.clear_screen())
                # Flush now so that a buffered clear cannot later wipe out
                # error messages written to stderr.
                term.stream.flush()

            # Fetch everything needed for either display
            machines = client.list_machines(timeout=opts.timeout)
            jobs = client.list_jobs(timeout=opts.timeout)

            # Overview of every machine, or a close-up of the named one
            if opts.machine is None:
                status = list_machines(term, machines, jobs)
            else:
                status = show_machine(term, machines, jobs, opts.machine,
                                      not opts.detailed)

            # Stop on failure, or straight away when not watching
            if status != 0 or not opts.watch:
                return status

            # Otherwise block until something changes; Ctrl+C ends the watch
            try:
                client.wait_for_notification()
                print("")
            except KeyboardInterrupt:
                print("")
                return 0

    except (IOError, OSError, ProtocolTimeoutError) as err:
        sys.stderr.write("Error communicating with server: {}\n".format(err))
        return 1
    finally:
        client.close()
예제 #6
0
def main(argv=None):
    """Look up where a board or chip is and print the server's answer.

    Exactly one of ``--board``, ``--physical``, ``--chip`` or ``--job-chip``
    must be given; it is converted into keyword arguments for the server's
    ``where_is`` command.

    Parameters
    ----------
    argv : [str, ...] or None
        Command line arguments, handed to ``ArgumentParser.parse_args``.

    Returns
    -------
    int
        0 when a location was found and printed, 1 if communication with the
        server fails, 2 if the server reports a version outside the supported
        range and 4 if the server knows no board at the requested location.
    """
    cfg = config.read_config()

    parser = argparse.ArgumentParser(
        description="Find out the location (physical or logical) "
        "of a chip or board.")

    parser.add_argument("--version",
                        "-V",
                        action="version",
                        version=__version__)

    control_args = parser.add_mutually_exclusive_group(required=True)
    control_args.add_argument("--board",
                              "-b",
                              "--logical",
                              "-l",
                              nargs=4,
                              metavar=("MACHINE", "X", "Y", "Z"),
                              help="specify the logical board coordinate")
    control_args.add_argument("--physical",
                              "-p",
                              nargs=4,
                              metavar=("MACHINE", "CABINET", "FRAME", "BOARD"),
                              help="specify a board's physical location")
    control_args.add_argument("--chip",
                              "-c",
                              nargs=3,
                              metavar=("MACHINE", "X", "Y"),
                              help="specify a board by chip coordinates (as "
                              "if the whole machine is being used)")
    control_args.add_argument("--job-chip",
                              "-j",
                              nargs=3,
                              metavar=("JOB_ID", "X", "Y"),
                              help="specify the chip coordinates of a chip "
                              "within a job's boards")

    server_args = parser.add_argument_group("spalloc server arguments")

    server_args.add_argument("--hostname",
                             "-H",
                             default=cfg["hostname"],
                             help="hostname or IP of the spalloc server "
                             "(default: %(default)s)")
    server_args.add_argument("--port",
                             "-P",
                             default=cfg["port"],
                             type=int,
                             help="port number of the spalloc server "
                             "(default: %(default)s)")
    server_args.add_argument("--timeout",
                             default=cfg["timeout"],
                             type=float,
                             metavar="SECONDS",
                             help="seconds to wait for a response "
                             "from the server (default: %(default)s)")

    args = parser.parse_args(argv)

    # Fail if server not specified
    if args.hostname is None:
        parser.error("--hostname of spalloc server must be specified")

    client = ProtocolClient(args.hostname, args.port)
    try:
        # Connect to server and ensure compatible version
        client.connect()
        version = tuple(
            map(int,
                client.version(timeout=args.timeout).split(".")))
        if not (VERSION_RANGE_START <= version < VERSION_RANGE_STOP):
            sys.stderr.write("Incompatible server version ({}).\n".format(
                ".".join(map(str, version))))
            return 2

        # Work out what the user asked for. The argument group is required
        # and mutually exclusive, so exactly one branch below is taken.
        try:
            show_board_chip = False
            if args.board:
                machine, x, y, z = args.board
                where_is_kwargs = {
                    "machine": machine,
                    "x": int(x),
                    "y": int(y),
                    "z": int(z),
                }
            elif args.physical:
                machine, c, f, b = args.physical
                where_is_kwargs = {
                    "machine": machine,
                    "cabinet": int(c),
                    "frame": int(f),
                    "board": int(b),
                }
            elif args.chip:
                machine, x, y = args.chip
                where_is_kwargs = {
                    "machine": machine,
                    "chip_x": int(x),
                    "chip_y": int(y),
                }
                show_board_chip = True
            elif args.job_chip:
                job_id, x, y = args.job_chip
                where_is_kwargs = {
                    "job_id": int(job_id),
                    "chip_x": int(x),
                    "chip_y": int(y),
                }
                show_board_chip = True
        except ValueError as e:
            # A coordinate was not an integer; report it as a usage error
            # (parser.error() exits, the finally below still closes the
            # connection).
            parser.error("Error: {}".format(e))

        # Ask the server, honouring --timeout like every other command so
        # that an unresponsive server cannot block this call forever.
        location = client.where_is(timeout=args.timeout, **where_is_kwargs)
        if location is None:
            sys.stderr.write("No boards at the specified location.\n")
            return 4
        else:
            out = OrderedDict()
            out["Machine"] = location["machine"]
            out["Physical location"] = "Cabinet {}, Frame {}, Board {}".format(
                *location["physical"])
            out["Board coordinate"] = tuple(location["logical"])
            out["Machine chip coordinates"] = tuple(location["chip"])
            # Only meaningful when the query itself was in chip coordinates
            if show_board_chip:
                out["Coordinates within board"] = tuple(location["board_chip"])
            out["Job using board"] = location["job_id"]
            if location["job_id"]:
                out["Coordinates within job"] = tuple(location["job_chip"])

            print(render_definitions(out))
            return 0

    except (IOError, OSError, ProtocolTimeoutError) as e:
        sys.stderr.write("Error communicating with server: {}\n".format(e))
        return 1
    finally:
        client.close()
예제 #7
0
    def __init__(self, *args, **kwargs):
        """Request a SpiNNaker machine.

        A :py:class:`.Job` is constructed in one of the following styles::

            >>> # Any single (SpiNN-5) board
            >>> Job()
            >>> Job(1)

            >>> # Any machine with at least 4 boards
            >>> Job(4)

            >>> # Any 7-or-more board machine with an aspect ratio at least as
            >>> # square as 1:2
            >>> Job(7, min_ratio=0.5)

            >>> # Any 4x5 triad segment of a machine (may or may-not be a
            >>> # torus/full machine)
            >>> Job(4, 5)

            >>> # Any torus-connected (full machine) 4x2 machine
            >>> Job(4, 2, require_torus=True)

            >>> # Board x=3, y=2, z=1 on the machine named "m"
            >>> Job(3, 2, 1, machine="m")

            >>> # Keep using (and keeping-alive) an existing allocation
            >>> Job(resume_job_id=123)

        Once finished with a Job, the :py:meth:`.destroy` (or in unusual
        applications :py:meth:`.Job.close`) method must be called to destroy
        the job, close the connection to the server and terminate the
        background keep-alive thread. Alternatively, a Job may be used as a
        context manager which automatically calls :py:meth:`.destroy` on
        exiting the block::

            >>> with Job() as j:
            ...     # ...for example...
            ...     my_boot(j.hostname, j.width, j.height)
            ...     my_application(j.hostname)

        The following keyword-only parameters below are used both to specify
        the server details as well as the job requirements. Most parameters
        default to the values supplied in the local :py:mod:`~spalloc.config`
        file allowing usage as in the examples above.

        Parameters
        ----------
        hostname : str
            **Required.** The name of the spalloc server to connect to. (Read
            from config file if not specified.)
        port : int
            The port number of the spalloc server to connect to. (Read from
            config file if not specified.)
        reconnect_delay : float
            Number of seconds between attempts to reconnect to the server.
            (Read from config file if not specified.)
        timeout : float or None
            Timeout for waiting for replies from the server. If None, will keep
            trying forever. (Read from config file if not specified.)
        config_filenames : [str, ...]
            If given must be a list of filenames to read configuration options
            from. If not supplied, the default config file locations are
            searched. Set to an empty list to prevent using values from config
            files.

        Other Parameters
        ----------------
        resume_job_id : int or None
            If supplied, rather than creating a new job, take on an existing
            one, keeping it alive as required by the original job. If this
            argument is used, all other requirements are ignored.
        owner : str
            **Required.** The name of the owner of the job. By convention this
            should be your email address. (Read from config file if not
            specified.)
        keepalive : float or None
            The number of seconds after which the server may consider the job
            dead if this client cannot communicate with it. If None, no timeout
            will be used and the job will run until explicitly destroyed. Use
            with extreme caution. (Read from config file if not specified.)
        machine : str or None
            Specify the name of a machine which this job must be executed on.
            If None, the first suitable machine available will be used,
            according to the tags selected below. Must be None when tags are
            given. (Read from config file if not specified.)
        tags : [str, ...] or None
            The set of tags which any machine running this job must have. If
            None is supplied, only machines with the "default" tag will be
            used. If machine is given, this argument must be None.  (Read from
            config file if not specified.)
        min_ratio : float
            The aspect ratio (h/w) which the allocated region must be 'at least
            as square as'. Set to 0.0 for any allowable shape, 1.0 to be
            exactly square etc. Ignored when allocating single boards or
            specific rectangles of triads.
        max_dead_boards : int or None
            The maximum number of broken or unreachable boards to allow in the
            allocated region. If None, any number of dead boards is permitted,
            as long as the board on the bottom-left corner is alive. (Read from
            config file if not specified.)
        max_dead_links : int or None
            The maximum number of broken links to allow in the allocated
            region. When require_torus is True this includes wrap-around
            links, otherwise peripheral links are not counted.  If None, any
            number of broken links is allowed. (Read from config file if not
            specified.)
        require_torus : bool
            If True, only allocate blocks with torus connectivity. In general
            this will only succeed for requests to allocate an entire machine.
            Must be False when allocating boards. (Read from config file if not
            specified.)

        Raises
        ------
        ValueError
            If no hostname is available, or (when creating a new job) no owner
            is available or both tags and machine are specified.
        JobDestroyedError
            If ``resume_job_id`` refers to a job whose state is unknown or
            destroyed.
        """
        # Read configuration
        config_filenames = kwargs.pop("config_filenames", SEARCH_PATH)
        config = read_config(config_filenames)

        # Get protocol client options
        hostname = kwargs.get("hostname", config["hostname"])
        owner = kwargs.get("owner", config["owner"])
        port = kwargs.get("port", config["port"])
        self._reconnect_delay = kwargs.get("reconnect_delay",
                                           config["reconnect_delay"])
        self._timeout = kwargs.get("timeout", config["timeout"])
        if hostname is None:
            raise ValueError("A hostname must be specified.")

        # Cached responses of _get_state and _get_machine_info
        self._last_state = None
        self._last_machine_info = None

        # Connection to server (and associated lock)
        self._client = ProtocolClient(hostname, port)
        self._client_lock = threading.RLock()

        # Set-up (but don't start) background keepalive thread. Note that
        # this instance attribute replaces the bound method of the same name
        # once the Thread has captured it as its target.
        self._keepalive_thread = threading.Thread(
            target=self._keepalive_thread, name="job-keepalive-thread")
        self._keepalive_thread.daemon = True

        # Event fired when the background thread should shut-down
        self._stop = threading.Event()

        # Check version compatibility (fail fast if can't communicate with
        # server)
        self._client.connect(timeout=self._timeout)
        try:
            self._assert_compatible_version()

            # Resume/create the job
            resume_job_id = kwargs.get("resume_job_id", None)
            if resume_job_id:
                self.id = resume_job_id

                # If the job no longer exists, we can't get the keepalive
                # interval (and there's nothing to keepalive) so just bail
                # out.
                job_state = self._get_state()
                if (job_state.state == JobState.unknown
                        or job_state.state == JobState.destroyed):
                    reason = job_state.reason
                    raise JobDestroyedError(
                        "Job {} does not exist: {}{}{}".format(
                            resume_job_id, job_state.state.name,
                            ": " if reason is not None else "",
                            reason if reason is not None else ""))

                # Snag the keepalive interval from the job
                self._keepalive = job_state.keepalive

                logger.info("Spalloc resumed job %d", self.id)
            else:
                # Get job creation arguments
                job_args = args
                job_kwargs = {
                    "owner":
                    owner,
                    "keepalive":
                    kwargs.get("keepalive", config["keepalive"]),
                    "machine":
                    kwargs.get("machine", config["machine"]),
                    "tags":
                    kwargs.get("tags", config["tags"]),
                    "min_ratio":
                    kwargs.get("min_ratio", config["min_ratio"]),
                    "max_dead_boards":
                    kwargs.get("max_dead_boards", config["max_dead_boards"]),
                    "max_dead_links":
                    kwargs.get("max_dead_links", config["max_dead_links"]),
                    "require_torus":
                    kwargs.get("require_torus", config["require_torus"]),
                    "timeout":
                    self._timeout,
                }

                # Sanity check arguments
                if job_kwargs["owner"] is None:
                    raise ValueError("An owner must be specified.")
                if ((job_kwargs["tags"] is not None)
                        and (job_kwargs["machine"] is not None)):
                    raise ValueError(
                        "Only one of tags and machine may be specified.")

                self._keepalive = job_kwargs["keepalive"]

                # Create the job (failing fast if can't communicate)
                self.id = self._client.create_job(*job_args, **job_kwargs)

                logger.info("Created spalloc job %d", self.id)
        except BaseException:
            # Construction failed after the connection was opened: the caller
            # never receives an object to close, so release the socket here
            # before passing the original exception on unchanged.
            self._client.close()
            raise

        # Start keepalive thread now that everything is up
        self._keepalive_thread.start()
예제 #8
0
def main(argv=None):
    """Inspect or control a single job on the server.

    The job is either named explicitly by ID or, failing that, taken to be
    the only live job belonging to ``--owner``.

    Parameters
    ----------
    argv : [str, ...] or None
        Command line arguments, handed to ``ArgumentParser.parse_args``.

    Returns
    -------
    int
        The value returned by the selected action, 1 if communication with
        the server fails, 2 if the server reports a version outside the
        supported range and 3 if the owner has no live job or more than one.
    """
    term = Terminal()
    defaults = config.read_config()

    parser = argparse.ArgumentParser(description="Manage running jobs.")
    parser.add_argument(
        "--version", "-V", action="version", version=__version__)
    parser.add_argument(
        "job_id", type=int, nargs="?",
        help="the job ID of interest, optional if the current owner only "
             "has one job")
    parser.add_argument(
        "--owner", "-o", default=defaults["owner"],
        help="if no job ID is provided and this owner has only one job, "
             "this job is assumed (default: %(default)s)")

    # At most one action may be selected per invocation
    actions = parser.add_mutually_exclusive_group()
    actions.add_argument(
        "--info", "-i", action="store_true",
        help="Show basic job information (the default)")
    actions.add_argument(
        "--watch", "-w", action="store_true",
        help="watch this job for state changes")
    actions.add_argument(
        "--power-on", "--reset", "-p", "-r", action="store_true",
        help="power-on or reset the job's boards")
    actions.add_argument(
        "--power-off", action="store_true",
        help="power-off the job's boards")
    actions.add_argument(
        "--ethernet-ips", "-e", action="store_true",
        help="output the IPs of all Ethernet connected chips as a CSV")
    actions.add_argument(
        "--destroy", "-D", nargs="?", metavar="REASON", const="",
        help="destroy a queued or running job")

    server = parser.add_argument_group("spalloc server arguments")
    server.add_argument(
        "--hostname", "-H", default=defaults["hostname"],
        help="hostname or IP of the spalloc server (default: %(default)s)")
    server.add_argument(
        "--port", "-P", default=defaults["port"], type=int,
        help="port number of the spalloc server (default: %(default)s)")
    server.add_argument(
        "--timeout", default=defaults["timeout"], type=float,
        metavar="SECONDS",
        help="seconds to wait for a response from the server "
             "(default: %(default)s)")

    opts = parser.parse_args(argv)

    # A server hostname is mandatory; parser.error() reports and exits.
    if opts.hostname is None:
        parser.error("--hostname of spalloc server must be specified")

    # Without a job ID there must at least be an owner to search by
    if opts.job_id is None and opts.owner is None:
        parser.error("job ID (or --owner) not specified")

    client = ProtocolClient(opts.hostname, opts.port)
    try:
        # Connect and make sure the server speaks a compatible version
        client.connect()
        reported = client.version(timeout=opts.timeout)
        version = tuple(int(field) for field in reported.split("."))
        if not (VERSION_RANGE_START <= version < VERSION_RANGE_STOP):
            sys.stderr.write("Incompatible server version ({}).\n".format(
                ".".join(str(field) for field in version)))
            return 2

        # No job ID given: it must be the owner's one and only live job
        if opts.job_id is None:
            owned = [
                job["job_id"]
                for job in client.list_jobs(timeout=opts.timeout)
                if job["owner"] == opts.owner
            ]
            if not owned:
                sys.stderr.write("Owner {} has no live jobs.\n".format(
                    opts.owner))
                return 3
            if len(owned) > 1:
                sys.stderr.write("Ambiguous: {} has {} live jobs: {}\n".format(
                    opts.owner, len(owned),
                    ", ".join(str(job_id) for job_id in owned)))
                return 3
            opts.job_id = owned[0]

        # Carry out the requested action; every branch returns
        if opts.watch:
            return watch_job(term, client, opts.timeout, opts.job_id)
        if opts.power_on:
            return power_job(client, opts.timeout, opts.job_id, True)
        if opts.power_off:
            return power_job(client, opts.timeout, opts.job_id, False)
        if opts.ethernet_ips:
            return list_ips(client, opts.timeout, opts.job_id)
        if opts.destroy is not None:
            # --destroy with no REASON yields ""; name the owner instead
            # when one is known.
            if opts.destroy == "" and opts.owner:
                opts.destroy = "Destroyed by {}".format(opts.owner)
            return destroy_job(client, opts.timeout, opts.job_id, opts.destroy)

        # Nothing else selected (or --info): show the basic information
        return show_job_info(term, client, opts.timeout, opts.job_id)

    except (IOError, OSError, ProtocolTimeoutError) as err:
        sys.stderr.write("Error communicating with server: {}\n".format(err))
        return 1
    finally:
        client.close()
예제 #9
0
def parse_argv(argv):
    """Build the allocation command-line parser and parse ``argv`` with it.

    Defaults for most options come from the mapping returned by
    ``config.read_config()``.  Options whose configured value is ``None``
    (``max_dead_boards``, ``max_dead_links``, ``keepalive``) are exposed on
    the command line with a default of ``-1``.

    :param argv: the argument list handed to ``parser.parse_args``.
    :return: a ``(parser, args)`` pair: the constructed
        ``argparse.ArgumentParser`` and the namespace produced by parsing.
    """
    cfg = config.read_config()

    # Compute the effective tag default once so that the value used and the
    # value advertised in the help text can never disagree.
    default_tags = cfg["tags"] or ["default"]

    parser = argparse.ArgumentParser(
        description="Request (and allocate) a SpiNNaker machine.")
    parser.add_argument("--version", "-V", action="version",
                        version=__version__)
    parser.add_argument("--quiet", "-q", action="store_true",
                        default=False,
                        help="suppress informational messages")
    parser.add_argument("--debug", action="store_true",
                        default=False,
                        help="enable additional diagnostic information")
    parser.add_argument("--no-destroy", "-D", action="store_true",
                        default=False,
                        help="do not destroy the job on exit")
    # The --boot option is only offered when MachineController is available
    if MachineController is not None:
        parser.add_argument("--boot", "-B", action="store_true",
                            default=False,
                            help="boot the machine once powered on")

    allocation_args = parser.add_argument_group(
        "allocation requirement arguments")
    allocation_args.add_argument(
        "what", nargs="*", default=[], type=int, metavar="WHAT",
        help="what to allocate: nothing or 1 requests 1 SpiNN-5 board, NUM "
        "requests at least NUM SpiNN-5 boards, WIDTH HEIGHT means "
        "WIDTHxHEIGHT triads of SpiNN-5 boards and X Y Z requests a "
        "board the specified logical board coordinate.")
    allocation_args.add_argument(
        "--resume", "-r", type=int,
        help="if given, resume keeping the specified job alive rather than "
        "creating a new job (all allocation requirements will be ignored)")
    allocation_args.add_argument(
        "--machine", "-m", nargs="?", default=cfg["machine"],
        help="only allocate boards which are part of a specific machine, or "
        "any machine if no machine is given (default: %(default)s)")
    allocation_args.add_argument(
        "--tags", "-t", nargs="*", metavar="TAG",
        default=default_tags,
        help="only allocate boards which have (at least) the specified flags "
        "(default: {})".format(" ".join(default_tags)))
    allocation_args.add_argument(
        "--min-ratio", type=float, metavar="RATIO", default=cfg["min_ratio"],
        help="when allocating by number of boards, require that the "
        "allocation be at least as square as this ratio (default: "
        "%(default)s)")
    allocation_args.add_argument(
        "--max-dead-boards", type=int, metavar="NUM", default=(
            -1 if cfg["max_dead_boards"] is None else cfg["max_dead_boards"]),
        help="boards allowed to be dead in the allocation, or -1 to allow "
        "any number of dead boards (default: %(default)s)")
    allocation_args.add_argument(
        "--max-dead-links", type=int, metavar="NUM", default=(
            -1 if cfg["max_dead_links"] is None else cfg["max_dead_links"]),
        help="inter-board links allowed to be dead in the allocation, or -1 "
        "to allow any number of dead links (default: %(default)s)")
    # --require-torus / --no-require-torus share one destination; the
    # "(default)" marker is attached to whichever matches the configuration.
    allocation_args.add_argument(
        "--require-torus", "-w", action="store_true",
        default=cfg["require_torus"],
        help="require that the allocation contain torus (a.k.a. wrap-around) "
        "links {}".format("(default)" if cfg["require_torus"] else ""))
    allocation_args.add_argument(
        "--no-require-torus", "-W", action="store_false", dest="require_torus",
        help="do not require that the allocation contain torus (a.k.a. "
        "wrap-around) links {}".format(
            "" if cfg["require_torus"] else "(default)"))

    command_args = parser.add_argument_group("command wrapping arguments")
    command_args.add_argument(
        "--command", "-c", nargs=argparse.REMAINDER,
        help="execute the specified command once boards have been allocated "
        "and deallocate the boards when the application exits ({} and "
        "{hostname} are substituted for the chip chip at (0, 0)'s hostname, "
        "{w} and {h} give the dimensions of the SpiNNaker machine in chips, "
        "{ethernet_ips} is a temporary file containing a CSV with three "
        "columns: x, y and hostname giving the hostname of each Ethernet "
        "connected SpiNNaker chip)")

    server_args = parser.add_argument_group("spalloc server arguments")
    server_args.add_argument(
        "--owner", default=cfg["owner"],
        help="by convention, the email address of the owner of the job "
        "(default: %(default)s)")
    server_args.add_argument(
        "--hostname", "-H", default=cfg["hostname"],
        help="hostname or IP of the spalloc server (default: %(default)s)")
    server_args.add_argument(
        "--port", "-P", default=cfg["port"], type=int,
        help="port number of the spalloc server (default: %(default)s)")
    server_args.add_argument(
        "--keepalive", type=int, metavar="SECONDS",
        default=(-1 if cfg["keepalive"] is None else cfg["keepalive"]),
        help="the interval at which to require keepalive messages to be "
        "sent to prevent the server cancelling the job, or -1 to not "
        "require keepalive messages (default: %(default)s)")
    server_args.add_argument(
        "--reconnect-delay", default=cfg["reconnect_delay"], type=float,
        metavar="SECONDS",
        help="seconds to wait before reconnecting to the server if the "
        "connection is lost (default: %(default)s)")
    server_args.add_argument(
        "--timeout", default=cfg["timeout"], type=float, metavar="SECONDS",
        help="seconds to wait for a response from the server "
        "(default: %(default)s)")
    return parser, parser.parse_args(argv)
예제 #10
0
def main(argv=None):
    """Command-line entry point: allocate (or resume) a job and wait for it.

    Parses ``argv``, creates a ``Job`` (new, or resumed when ``--resume`` is
    given), reports state changes on the terminal until the job is ready,
    optionally boots the machine, and then either runs the user's
    ``--command`` or prints the machine details.  On the way out the job is
    destroyed (or merely closed when ``--no-destroy`` is given) and the
    temporary CSV file of board IPs is deleted.

    :param argv: the argument list handed to ``parser.parse_args``.
    :return: an integer exit status:

        * ``0`` -- machine details were printed successfully
        * the value returned by ``run_command`` when ``--command`` is given
        * ``1`` -- the job was destroyed while waiting
        * ``2`` -- the job was not recognised by the server
        * ``3`` -- the job entered an unrecognised state
        * ``4`` -- interrupted from the keyboard while waiting
        * ``6`` -- could not connect to the server
    """
    t = Terminal(stream=sys.stderr)

    cfg = config.read_config()

    # Compute the effective tag default once so that the value used and the
    # value advertised in the help text can never disagree.
    default_tags = cfg["tags"] or ["default"]

    parser = argparse.ArgumentParser(
        description="Request (and allocate) a SpiNNaker machine.")

    parser.add_argument("--version",
                        "-V",
                        action="version",
                        version=__version__)

    parser.add_argument("--quiet",
                        "-q",
                        action="store_true",
                        default=False,
                        help="suppress informational messages")
    parser.add_argument("--debug",
                        action="store_true",
                        default=False,
                        help="enable additional diagnostic information")
    parser.add_argument("--no-destroy",
                        "-D",
                        action="store_true",
                        default=False,
                        help="do not destroy the job on exit")

    # The --boot option is only offered when MachineController is available
    if MachineController is not None:
        parser.add_argument("--boot",
                            "-B",
                            action="store_true",
                            default=False,
                            help="boot the machine once powered on")

    allocation_args = parser.add_argument_group(
        "allocation requirement arguments")
    allocation_args.add_argument("what",
                                 nargs="*",
                                 default=[],
                                 type=int,
                                 metavar="WHAT",
                                 help="what to allocate: nothing or 1 "
                                 "requests 1 SpiNN-5 board, NUM requests "
                                 "at least NUM SpiNN-5 boards, WIDTH "
                                 "HEIGHT means WIDTHxHEIGHT triads of "
                                 "SpiNN-5 boards and X Y Z requests a "
                                 "board the specified logical board "
                                 "coordinate.")
    allocation_args.add_argument("--resume",
                                 "-r",
                                 type=int,
                                 help="if given, resume keeping the "
                                 "specified job alive rather than "
                                 "creating a new job (all allocation "
                                 "requirements will be ignored)")
    allocation_args.add_argument("--machine",
                                 "-m",
                                 nargs="?",
                                 default=cfg["machine"],
                                 help="only allocate boards which are part "
                                 "of a specific machine, or any machine "
                                 "if no machine is given "
                                 "(default: %(default)s)")
    allocation_args.add_argument("--tags",
                                 "-t",
                                 nargs="*",
                                 metavar="TAG",
                                 default=default_tags,
                                 help="only allocate boards which have (at "
                                 "least) the specified flags "
                                 "(default: {})".format(
                                     " ".join(default_tags)))
    allocation_args.add_argument("--min-ratio",
                                 type=float,
                                 metavar="RATIO",
                                 default=cfg["min_ratio"],
                                 help="when allocating by number of boards, "
                                 "require that the allocation be at "
                                 "least as square as this ratio "
                                 "(default: %(default)s)")
    allocation_args.add_argument("--max-dead-boards",
                                 type=int,
                                 metavar="NUM",
                                 default=(-1 if cfg["max_dead_boards"] is None
                                          else cfg["max_dead_boards"]),
                                 help="boards allowed to be "
                                 "dead in the allocation, or -1 to allow "
                                 "any number of dead boards "
                                 "(default: %(default)s)")
    allocation_args.add_argument("--max-dead-links",
                                 type=int,
                                 metavar="NUM",
                                 default=(-1 if cfg["max_dead_links"] is None
                                          else cfg["max_dead_links"]),
                                 help="inter-board links allowed to be "
                                 "dead in the allocation, or -1 to allow "
                                 "any number of dead links "
                                 "(default: %(default)s)")
    # --require-torus / --no-require-torus share one destination; the
    # "(default)" marker is attached to whichever matches the configuration.
    allocation_args.add_argument(
        "--require-torus",
        "-w",
        action="store_true",
        default=cfg["require_torus"],
        help="require that the allocation contain "
        "torus (a.k.a. wrap-around) "
        "links {}".format("(default)" if cfg["require_torus"] else ""))
    allocation_args.add_argument(
        "--no-require-torus",
        "-W",
        action="store_false",
        dest="require_torus",
        help="do not require that the allocation "
        "contain torus (a.k.a. wrap-around) "
        "links {}".format("" if cfg["require_torus"] else "(default)"))

    command_args = parser.add_argument_group("command wrapping arguments")
    command_args.add_argument("--command",
                              "-c",
                              nargs=argparse.REMAINDER,
                              help="execute the specified command once boards "
                              "have been allocated and deallocate the "
                              "boards when the application exits ({} and "
                              "{hostname} are substituted for the chip "
                              "chip at (0, 0)'s hostname, {w} and "
                              "{h} give the dimensions of the SpiNNaker "
                              "machine in chips, {ethernet_ips} is a "
                              "temporary file containing a CSV with "
                              "three columns: x, y and hostname giving "
                              "the hostname of each Ethernet connected "
                              "SpiNNaker chip)")

    server_args = parser.add_argument_group("spalloc server arguments")

    server_args.add_argument("--owner",
                             default=cfg["owner"],
                             help="by convention, the email address of the "
                             "owner of the job (default: %(default)s)")
    server_args.add_argument("--hostname",
                             "-H",
                             default=cfg["hostname"],
                             help="hostname or IP of the spalloc server "
                             "(default: %(default)s)")
    server_args.add_argument("--port",
                             "-P",
                             default=cfg["port"],
                             type=int,
                             help="port number of the spalloc server "
                             "(default: %(default)s)")
    server_args.add_argument(
        "--keepalive",
        type=int,
        metavar="SECONDS",
        default=(-1 if cfg["keepalive"] is None else cfg["keepalive"]),
        help="the interval at which to require "
        "keepalive messages to be sent to "
        "prevent the server cancelling the "
        "job, or -1 to not require keepalive "
        "messages (default: %(default)s)")
    server_args.add_argument("--reconnect-delay",
                             default=cfg["reconnect_delay"],
                             type=float,
                             metavar="SECONDS",
                             help="seconds to wait before "
                             "reconnecting to the server if the "
                             "connection is lost (default: %(default)s)")
    server_args.add_argument("--timeout",
                             default=cfg["timeout"],
                             type=float,
                             metavar="SECONDS",
                             help="seconds to wait for a response "
                             "from the server (default: %(default)s)")

    args = parser.parse_args(argv)

    # Whether an existing job is being resumed.  Tested against None (not
    # truthiness) so that a job ID of 0 is handled the same way everywhere.
    resuming = args.resume is not None

    # Fail if no owner is defined (unless resuming)
    if not args.owner and not resuming:
        parser.error(
            "--owner must be specified (typically your email address)")

    # Fail if server not specified
    if args.hostname is None:
        parser.error("--hostname of spalloc server must be specified")

    # Set universal job arguments (negative command-line values mean "None")
    job_kwargs = {
        "hostname":
        args.hostname,
        "port":
        args.port,
        "reconnect_delay":
        args.reconnect_delay if args.reconnect_delay >= 0.0 else None,
        "timeout":
        args.timeout if args.timeout >= 0.0 else None,
    }

    if resuming:
        job_args = []
        job_kwargs.update({
            "resume_job_id": args.resume,
        })
    else:
        # Make sure 'what' takes the right form
        if len(args.what) not in (0, 1, 2, 3):
            parser.error("expected either no arguments, one argument, NUM, "
                         "two arguments, WIDTH HEIGHT, or three arguments "
                         "X Y Z")

        # Unpack arguments for the job and server
        job_args = args.what
        job_kwargs.update({
            "owner":
            args.owner,
            "keepalive":
            args.keepalive if args.keepalive >= 0.0 else None,
            "machine":
            args.machine,
            # Tags are only passed on when no specific machine was requested
            "tags":
            args.tags if args.machine is None else None,
            "min_ratio":
            args.min_ratio,
            "max_dead_boards":
            args.max_dead_boards if args.max_dead_boards >= 0.0 else None,
            "max_dead_links":
            args.max_dead_links if args.max_dead_links >= 0.0 else None,
            "require_torus":
            args.require_torus,
        })

    # Set debug level
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)

    # Create temporary file in which to write CSV of all board IPs.  mkstemp
    # hands back an already-open OS-level descriptor; only the path is used
    # here, so close the descriptor straight away rather than leaking it
    # (an open handle would also prevent the later removal on Windows).
    ip_file_fd, ip_file_filename = tempfile.mkstemp(".csv", "spinnaker_ips_")
    os.close(ip_file_fd)

    def info(msg):
        # Informational output is suppressed by --quiet
        if not args.quiet:
            t.stream.write("{}\n".format(msg))

    # Reason for destroying the job
    reason = None

    try:
        # Create the job
        try:
            job = Job(*job_args, **job_kwargs)
        except (OSError, IOError) as e:
            info(t.red("Could not connect to server: {}".format(e)))
            return 6
        try:
            # Wait for it to become ready, keeping the user informed along the
            # way
            old_state = None
            cur_state = job.state
            while True:
                # Show debug info on state-change
                if old_state != cur_state:
                    if cur_state == JobState.queued:
                        info(
                            t.update(
                                t.yellow("Job {}: Waiting in queue...".format(
                                    job.id))))
                    elif cur_state == JobState.power:
                        info(
                            t.update(
                                t.yellow(
                                    "Job {}: Waiting for power on...".format(
                                        job.id))))
                    elif cur_state == JobState.ready:
                        # Here we go!
                        break
                    elif cur_state == JobState.destroyed:
                        # Exit with error state; failing to fetch the reason
                        # is tolerated and simply yields a shorter message.
                        try:
                            reason = job.reason
                        except (IOError, OSError):
                            reason = None

                        if reason is not None:
                            info(
                                t.update(
                                    t.red("Job {}: Destroyed: {}".format(
                                        job.id, reason))))
                        else:
                            info(t.red("Job {}: Destroyed.".format(job.id)))
                        return 1
                    elif cur_state == JobState.unknown:
                        info(
                            t.update(
                                t.red("Job {}: Job not recognised by server.".
                                      format(job.id))))
                        return 2
                    else:
                        info(
                            t.update(
                                t.red(
                                    "Job {}: Entered an unrecognised state {}."
                                    .format(job.id, cur_state))))
                        return 3

                try:
                    old_state = cur_state
                    cur_state = job.wait_for_state_change(cur_state)
                except KeyboardInterrupt:
                    # Gracefully terminate from keyboard interrupt; the
                    # reason is picked up by the cleanup in 'finally' below.
                    info(
                        t.update(
                            t.red("Job {}: Keyboard interrupt.".format(
                                job.id))))
                    reason = "Keyboard interrupt."
                    return 4

            # Machine is now ready
            write_ips_to_csv(job.connections, ip_file_filename)

            # Boot the machine if required
            if MachineController is not None and args.boot:
                info(t.update(t.yellow("Job {}: Booting...".format(job.id))))
                mc = MachineController(job.hostname)
                mc.boot(job.width, job.height)

            info(t.update(t.green("Job {}: Ready!".format(job.id))))

            # Either run the user's application or just print the details.
            if args.command:
                return run_command(args.command, job.id, job.machine_name,
                                   job.connections, job.width, job.height,
                                   ip_file_filename)

            else:
                print_info(job.machine_name, job.connections, job.width,
                           job.height, ip_file_filename)
                return 0
        finally:
            # Destroy job and disconnect client
            if args.no_destroy:
                job.close()
            else:
                job.destroy(reason)
    finally:
        # Delete IP address list file
        os.remove(ip_file_filename)