Exemplo n.º 1
0
def spawn(
    image_id: str,
    total_count: int,
    instance_type: str,
    key_name: str,
    security_group_id: str,
):
    """Top up the pool of EC2 instances to a desired total.

    The count reported by `get_num_instances` is compared with `total_count`
    and only the shortfall is launched via `run`. If there is no shortfall, a
    message is printed and nothing is launched.

    Args:
        image_id (str): Image ID.
        total_count (int): Desired number of instances.
        instance_type (str): Type of the instance.
        key_name (str): Name of the key pair.
        security_group_id (str): Security group.
    """
    shortfall = total_count - get_num_instances()

    # Nothing to launch: the target has already been reached or exceeded.
    if shortfall <= 0:
        out.out("Already enough instances available.")
        return

    run(
        image_id=image_id,
        count=shortfall,
        instance_type=instance_type,
        key_name=key_name,
        security_group_id=security_group_id,
    )
Exemplo n.º 2
0
def stop_running():
    """Stop every EC2 instance reported in the "running" state.

    Prints a message instead when there is no such instance.
    """
    running = list(get_state("running"))

    if not running:
        out.out("No running instances.")
        return

    stop(*running)
Exemplo n.º 3
0
def start_stopped():
    """Start every EC2 instance reported in the "stopped" state.

    Prints a message instead when there is no such instance.
    """
    stopped = list(get_state("stopped"))

    if not stopped:
        out.out("No stopped instances.")
        return

    start(*stopped)
Exemplo n.º 4
0
def terminate_all():
    """Terminate every EC2 instance returned by `get_instances`.

    The instance IDs are passed to `aws ec2 terminate-instances` in a single
    call. Prints a message instead when there are no instances.
    """
    ids = [description["InstanceId"] for description in get_instances()]

    if not ids:
        out.out("No instances to terminate.")
        return

    execute_command(
        "aws",
        "ec2",
        "terminate-instances",
        "--instance-ids",
        *ids,
    )
Exemplo n.º 5
0
    def _minimise_l_bfgs_b(f,
                           vs,
                           f_calls=10000,
                           iters=1000,
                           trace=False,
                           names=None,
                           jit=False):
        """Minimise `f` over the latent vector of `vs` with SciPy's L-BFGS-B.

        Args:
            f (function): Objective. It is called once as `f(vs)` up front and
                afterwards through the wrapper built by `wrap_f`.
            vs: Variable container providing `get_latent_vector` and
                `set_latent_vector`.
            f_calls (int, optional): Passed to SciPy as `maxfun`. Defaults to
                `10000`.
            iters (int, optional): Passed to SciPy as `maxiter`. Defaults to
                `1000`.
            trace (bool, optional): Show progress and print the termination
                message. Defaults to `False`.
            names (optional): Variable names; normalised with
                `_convert_and_validate_names` before use.
            jit (bool, optional): Forwarded to `wrap_f`. Defaults to `False`.

        Returns:
            The optimal objective value as returned by `fmin_l_bfgs_b`. If
            `iters` or `f_calls` is zero, the initial objective value converted
            with `B.to_numpy` is returned instead.
        """
        names = _convert_and_validate_names(names)

        # Evaluate the objective once so that every variable is initialised
        # and available before the latent vector is extracted.
        initial_value = f(vs)

        # SciPy always performs at least one iteration, so a budget of zero is
        # handled here.
        if iters == 0 or f_calls == 0:
            return B.to_numpy(initial_value)

        # Starting point of the optimisation.
        start_vector = B.to_numpy(vs.get_latent_vector(*names))

        def _to_float64(*xs):
            # The optimiser expects to get `float64`s.
            return [B.cast(np.float64, B.to_numpy(x)) for x in xs]

        # `history` collects the function evaluations made through
        # `objective`.
        history, objective = wrap_f(vs, names, f, jit, _to_float64)

        def _run_optimiser(callback_=lambda _: None):
            return fmin_l_bfgs_b(
                func=objective,
                x0=start_vector,
                maxiter=iters,
                maxfun=f_calls,
                callback=callback_,
                disp=0,
            )

        if not trace:
            # Silent mode: simply perform the minimisation.
            x_opt, val_opt, info = _run_optimiser()
        else:
            # Report the best objective value seen so far after every
            # iteration.
            title = 'Minimisation of "{}"'.format(f.__name__)
            with out.Progress(name=title, total=iters) as progress:

                def _report(_):
                    progress({"Objective value": np.min(history)})

                x_opt, val_opt, info = _run_optimiser(_report)

            with out.Section("Termination message"):
                out.out(convert(info["task"], str))

        # Write the optimum back into the variable container.
        vs.set_latent_vector(x_opt, *names)

        return val_opt
Exemplo n.º 6
0
def print_logs(path: str):
    """Print the last 100 lines of a log file for every instance.

    The command `tail -n100 <path>` is broadcast through `ssh_map`, and each
    result is printed in a section named after the instance's IP.

    Args:
        path (str): Path to the log.
    """
    tail_command = f"tail -n100 {path}"
    logs_by_ip = ssh_map([tail_command], broadcast=True)

    for ip, log in logs_by_ip.items():
        with out.Section(ip):
            out.out(log)
Exemplo n.º 7
0
def test_time_report_calculation(monkeypatch):
    """Check that the elapsed time is rendered in front of the message."""
    monkeypatch.setattr(out, "report_time", True)

    # Test that time is correctly calculated.
    with Mock() as mock:
        # Pretend that output started 2 hours, 2 minutes, and 2 seconds ago.
        # The start time is patched through `monkeypatch` so that the
        # module-level value is restored after the test instead of leaking
        # into later tests.
        monkeypatch.setattr(
            out,
            "_time_start",
            time.time() - 2 * 60 * 60 - 2 * 60 - 2,
            raising=False,
        )
        out.out("a")

    assert len(mock) == 1
    assert mock[0] == "02:02:02 | a\n"
Exemplo n.º 8
0
def test_out_newlines():
    """Check that every line of a multi-line message is indented."""
    with Mock() as stream:
        out.out("a\nb")

        with out.Section():
            out.out("c\nd")

    expected_writes = [
        "a\nb\n",
        "    c\n    d\n",
    ]
    assert len(stream) == len(expected_writes)
    for index, expected in enumerate(expected_writes):
        assert stream[index] == expected
Exemplo n.º 9
0
    def minimise_l_bfgs_b(f,
                          vs,
                          f_calls=10000,
                          iters=1000,
                          trace=False,
                          names=None):
        """Minimise `f` over the vector of `vs` with SciPy's L-BFGS-B.

        Args:
            f (function): Objective. It is called once as `f(vs)` up front and
                afterwards through the wrapper built by `wrap_f`.
            vs: Variable container providing `get_vector` and `set_vector`.
            f_calls (int, optional): Passed to SciPy as `maxfun`. Defaults to
                `10000`.
            iters (int, optional): Passed to SciPy as `maxiter`. Defaults to
                `1000`.
            trace (bool, optional): Show progress and print the termination
                message. Defaults to `False`.
            names (list, optional): Variable names forwarded to `get_vector`
                and `set_vector`. Defaults to no names.

        Returns:
            The optimal objective value as returned by `fmin_l_bfgs_b`. If
            `iters` or `f_calls` is zero, the initial objective value converted
            with `B.to_numpy` is returned instead.
        """
        names = [] if names is None else names

        # Run function once to ensure that all variables are initialised and
        # available.
        val_init = f(vs)

        # SciPy doesn't perform zero iterations, so handle that edge case
        # manually.
        if iters == 0 or f_calls == 0:
            return B.to_numpy(val_init)

        # Extract initial value.
        x0 = B.to_numpy(vs.get_vector(*names))

        # Wrap the function and get the list of function evaluations.
        f_vals, f_wrapped = wrap_f(vs, names, f)

        # Perform optimisation routine.
        def perform_minimisation(callback_=lambda _: None):
            return fmin_l_bfgs_b(func=f_wrapped,
                                 x0=x0,
                                 maxiter=iters,
                                 maxfun=f_calls,
                                 callback=callback_,
                                 disp=0)

        if trace:
            # Print progress during minimisation.
            with out.Progress(name='Minimisation of "{}"'.format(f.__name__),
                              total=iters) as progress:
                def callback(_):
                    progress({'Objective value': np.min(f_vals)})

                x_opt, val_opt, info = perform_minimisation(callback)

            with out.Section('Termination message'):
                # Depending on the SciPy version, the task is reported as
                # `bytes` or as `str`. Only decode when it is `bytes`, so that
                # a `str` does not raise an `AttributeError`.
                task = info['task']
                if isinstance(task, bytes):
                    task = task.decode('utf-8')
                out.out(task)
        else:
            # Don't print progress; simply perform minimisation.
            x_opt, val_opt, info = perform_minimisation()

        vs.set_vector(x_opt, *names)  # Assign optimum.

        return val_opt  # Return optimal value.
Exemplo n.º 10
0
def exception(x, e):
    """Handle an exception raised during function evaluation.

    The current traceback is printed as a warning, and NaN is returned for
    both the function value and the gradient.

    Args:
        x (tensor): Current input.
        e (:class:`Exception`): Caught exception.

    Returns:
        tuple: Tuple containing NaN and NaNs for the gradient.
    """
    # The report is taken from the traceback of the exception currently being
    # handled.
    report = traceback.format_exc().strip()
    with out.Section("Caught exception during function evaluation"):
        out.out(report)

    # The gradient has the shape of the input and is NaN everywhere.
    nan_gradient = np.empty(x.shape)
    nan_gradient[...] = np.nan

    return np.nan, nan_gradient
Exemplo n.º 11
0
def test_section():
    """Check indentation and headers of unnamed, named, and nested sections."""
    with Mock() as stream:
        out.out("before")

        with out.Section():
            out.out("message1")

        with out.Section("name"):
            out.out("message2")

            with out.Section():
                out.out("message3")

        out.out("after")

    expected_writes = [
        "before\n",
        "    message1\n",
        "name:\n",
        "    message2\n",
        "        message3\n",
        "after\n",
    ]
    assert len(stream) == len(expected_writes)
    for index, expected in enumerate(expected_writes):
        assert stream[index] == expected
Exemplo n.º 12
0
def test_time_report_interval(monkeypatch):
    """Check that a time stamp is only printed again once it has changed."""
    monkeypatch.setattr(out, "report_time", True)

    with Mock() as stream:
        out.out("a")
        out.out("b")
        # Let one second pass so that the third message gets a new time stamp.
        time.sleep(1.0)
        out.out("c")

    expected_writes = [
        "00:00:00 | a\n",
        "         | b\n",
        "00:00:01 | c\n",
    ]
    assert len(stream) == len(expected_writes)
    for index, expected in enumerate(expected_writes):
        assert stream[index] == expected
Exemplo n.º 13
0
def test_out():
    """Check that a single message results in one newline-terminated write."""
    with Mock() as stream:
        out.out("message")

    number_of_writes = len(stream)
    written = str(stream)

    assert number_of_writes == 1
    assert written == "message\n"
Exemplo n.º 14
0
def manage_cluster(
    commands: List[List[str]],
    instance_type: str,
    key_name: str,
    security_group_id: str,
    image_id: str,
    sync_sources: List[str],
    sync_target: Path,
    monitor_aws_repo: str,
    monitor_call: str,
    monitor_delay: int,
):
    """Manage the cluster.

    Command-line flags are parsed from `sys.argv` and the requested actions
    are performed in this fixed order:

    1. `--sync-stopped`: start the stopped instances in five batches, sync
       each batch, stop the batch again, and then exit the script.
    2. `--spawn N`: start all stopped instances, spawn new instances to make a
       total of `N`, and wait until all instances are running.
    3. `--kill`: kill all experiments.
    4. `--stop`: stop all running instances.
    5. `--terminate`: terminate all instances.
    6. `--start`: divide `commands` over the running instances and start the
       experiments.

    Afterwards, unless `--sync-stopped` was given, the function loops forever:
    it syncs and then sleeps for `--sync-sleep` seconds (120 by default). It
    therefore never returns normally.

    The module-level `config` is read for the keys `"setup_commands"` and
    `"teardown_commands"`.

    Args:
        commands (list[list[str]]): One list of commands for every experiment.
        instance_type (str): Type of the instance.
        key_name (str): Name of the key pair.
        security_group_id (str): Security group.
        image_id (str): Image ID.
        sync_sources (list[str]): List of sources to sync.
        sync_target (:class:`.util.Path`): Directory to sync to.
        monitor_aws_repo (str): Path to the root of this repo. The repo
            must consider the virtual environment "venv" which has the repo
            installed in editable mode.
        monitor_call (str): Call to start the monitor. See :mod:`.monitor`.
        monitor_delay (int): Number of seconds to wait before starting the
            monitor.

    Raises:
        ValueError: If `--start` is given while no instances are running.
        SystemExit: After `--sync-stopped` has completed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--spawn",
        type=int,
        help="Spawn instances.",
    )
    parser.add_argument(
        "--start",
        action="store_true",
        help="Start experiments.",
    )
    parser.add_argument(
        "--terminate",
        action="store_true",
        help="Terminate all instances. This is a kill switch.",
    )
    parser.add_argument(
        "--kill",
        action="store_true",
        help="Kill all running experiments, but keep the instances running.",
    )
    parser.add_argument(
        "--stop",
        action="store_true",
        help="Stop all running instances",
    )
    parser.add_argument(
        "--sync-stopped",
        action="store_true",
        help="Synchronise all stopped instances.",
    )
    parser.add_argument(
        "--sync-sleep",
        default=120,
        type=int,
        help="Number of seconds to sleep before syncing again.",
    )
    args = parser.parse_args()

    if args.sync_stopped:
        with out.Section("Syncing all stopped instances in five batches"):
            for batch in np.array_split(get_state("stopped"), 5):
                # Batches can be empty.
                if len(batch) == 0:
                    continue

                # Start the instances.
                start(*batch)

                try:
                    # Wait for the instances to have booted.
                    out.out(
                        "Waiting a minute for the instances to have booted...")
                    time.sleep(60)

                    # Refresh the instances to get the IPs.
                    instance_ids = [
                        instance["InstanceId"] for instance in batch
                    ]
                    batch = get_instances(*instance_ids)

                    # Sync.
                    sync(
                        sync_sources,
                        sync_target,
                        ips=[
                            instance["PublicIpAddress"] for instance in batch
                        ],
                    )
                finally:
                    # Stop the instances again, also if syncing failed, so
                    # that no instance is left running.
                    stop(*batch)

        out.out("Syncing completed: not continuing execution of script.")
        # Raise `SystemExit` directly: the `exit` helper is installed by the
        # `site` module for interactive use and is not guaranteed to exist.
        raise SystemExit

    if args.spawn:
        with out.Section("Starting all stopped instances"):
            start_stopped()

        with out.Section("Spawning instances"):
            spawn(
                image_id=image_id,
                total_count=args.spawn,
                instance_type=instance_type,
                key_name=key_name,
                security_group_id=security_group_id,
            )

        while not check_all_running():
            out.out("Waiting for all instances to be running...")
            time.sleep(5)

        out.out("Waiting a minute for all instances to have booted...")
        time.sleep(60)

    if args.kill:
        with out.Section("Killing all experiments"):
            kill_all()

    if args.stop:
        with out.Section("Stopping all instances"):
            stop_running()

    if args.terminate:
        with out.Section("Terminating all instances"):
            terminate_all()

    if args.start:
        num_instances = len(get_running_ips())

        # Without instances there is nothing to divide the commands over. Fail
        # with a clear message rather than with NumPy's error about the number
        # of sections.
        if num_instances == 0:
            raise ValueError(
                "Cannot start experiments: there are no running instances to "
                "divide the commands over."
            )

        # Split the indices rather than the commands themselves: converting
        # the commands to an array fails if experiments have different numbers
        # of commands.
        index_pieces = np.array_split(np.arange(len(commands)), num_instances)
        # Ensure that we have regular Python lists.
        pieces = [
            [list(commands[i]) for i in indices.tolist()]
            for indices in index_pieces
        ]

        with out.Section("Starting experiments"):
            out.kv("Number of commands", len(commands))
            out.kv("Number of instances", num_instances)
            out.kv("Maximum runs per instance",
                   max(len(piece) for piece in pieces))
            ssh_map(
                *[[
                    *config["setup_commands"],
                    # Concatenate the commands of all experiments in a piece.
                    *sum(piece, []),
                    *config["teardown_commands"],
                ] for piece in pieces],
                start_experiment=True,
                in_experiment=True,
                start_monitor=True,
                monitor_aws_repo=monitor_aws_repo,
                monitor_delay=monitor_delay,
                monitor_call=monitor_call,
            )

    # Keep syncing until the script is interrupted.
    while True:
        out.kv("Instances still running", len(get_running_ips()))
        sync(sync_sources, sync_target)
        out.out(f"Sleeping for {args.sync_sleep} second(s)...")
        time.sleep(args.sync_sleep)