示例#1
0
def configure_client_instances(
        num_clients: int,
        num_cpu: int,
        num_ram: float,
        gpu: bool = False) -> Tuple[List[Instance], List[str]]:
    """Build one client Instance per requested client.

    Each instance is named ``client_<i>`` (0-based), placed in the
    "clients" group, and sized with the given CPU/RAM/GPU resources.

    Returns:
        A tuple ``(instances, instance_names)`` of equal length.
    """
    names: List[str] = []
    pool: List[Instance] = []
    for idx in range(num_clients):
        label = f"client_{idx}"
        names.append(label)
        pool.append(
            Instance(
                name=label,
                group="clients",
                num_cpu=num_cpu,
                num_ram=num_ram,
                gpu=gpu,
            ))
    return pool, names
示例#2
0

# Pre-built pools of client instances reused by the SETTINGS entries below:
# a 100-client pool and a 10-client pool, each client sized at 2 vCPUs / 4 GB RAM.
client_instances_100, client_names_100 = configure_client_instances(
    num_clients=100, num_cpu=2, num_ram=4)

client_instances_10, client_names_10 = configure_client_instances(
    num_clients=10, num_cpu=2, num_ram=4)

SETTINGS = {
    ###
    ### FedFS vs FedAvg
    ###
    "fn-c50-r40-fedavg-16":
    Baseline(
        instances=[
            Instance(name="server", group="server", num_cpu=4, num_ram=16)
        ] + client_instances_100,
        server=ServerSetting(
            instance_name="server",
            strategy="fedavg",
            rounds=FN_ROUNDS,
            min_num_clients=FN_MIN_NUM_CLIENTS,
            sample_fraction=FN_SAMPLE_FRACTION_50,
            min_sample_size=FN_MIN_SAMPLE_SIZE_50,
            training_round_timeout=16,
            lr_initial=FN_LR_INITIAL,
            partial_updates=False,
            importance_sampling=False,
            dynamic_timeout=False,
        ),
        clients=configure_clients(
示例#3
0
def run(baseline: str, setting: str, adapter: str) -> None:
    """Run baseline.

    Provisions a cluster for the given baseline/setting, installs the
    Flower wheel on every instance, starts a logserver, the Flower
    server, and all Flower clients, then installs a shutdown watcher.

    Args:
        baseline: Name of the baseline to run (e.g. a ``tf_``-prefixed
            name selects the TensorFlow example extras).
        setting: Name of the setting within the baseline.
        adapter: Cluster adapter to use; "docker" changes remote paths
            and the SSH key used for the tail-logfile hint.
    """
    print(f"Starting baseline with {setting} settings.")

    # Remote home directory differs between the docker and EC2 adapters.
    wheel_remote_path = (f"/root/{WHEEL_FILENAME}" if adapter == "docker" else
                         f"/home/ubuntu/{WHEEL_FILENAME}")

    settings = load_baseline_setting(baseline, setting)

    # Get instances and add a logserver to the list
    # NOTE: this appends to settings.instances in place.
    instances = settings.instances
    instances.append(
        Instance(name="logserver", group="logserver", num_cpu=2, num_ram=2))

    # Configure cluster
    log(INFO, "(1/9) Configure cluster.")
    cluster = configure_cluster(adapter, instances, baseline, setting)

    # Start the cluster; this takes some time
    log(INFO, "(2/9) Start cluster.")
    cluster.start()

    # Upload wheel to all instances
    log(INFO, "(3/9) Upload wheel to all instances.")
    cluster.upload_all(WHEEL_LOCAL_PATH, wheel_remote_path)

    # Install the wheel on all instances
    log(INFO, "(4/9) Install wheel on all instances.")
    # A single install with the framework-specific extras is sufficient;
    # the previous extra-less install that ran first here was redundant
    # (it was immediately superseded by this install) and has been removed.
    extras = ["examples-tensorflow"
              ] if "tf_" in baseline else ["examples-pytorch"]
    cluster.exec_all(
        command.install_wheel(wheel_remote_path=wheel_remote_path,
                              wheel_extras=extras))

    # Download datasets in server and clients
    log(INFO, "(5/9) Download dataset on server and clients.")
    cluster.exec_all(command.download_dataset(baseline=baseline),
                     groups=["server", "clients"])

    # Start logserver
    log(INFO, "(6/9) Start logserver.")
    logserver = cluster.get_instance("logserver")
    cluster.exec(
        logserver.name,
        command.start_logserver(
            logserver_s3_bucket=CONFIG.get("aws", "logserver_s3_bucket"),
            logserver_s3_key=f"{baseline}_{setting}_{now()}.log",
        ),
    )

    # Start Flower server on Flower server instances
    log(INFO, "(7/9) Start server.")
    cluster.exec(
        "server",
        command.start_server(
            log_host=f"{logserver.private_ip}:8081",
            baseline=baseline,
            setting=setting,
        ),
    )

    # Start Flower clients
    log(INFO, "(8/9) Start clients.")
    server = cluster.get_instance("server")

    # Fan out client start commands concurrently; block until all have
    # been issued before installing the shutdown watcher.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        concurrent.futures.wait([
            executor.submit(
                cluster.exec,
                client_setting.instance_name,
                command.start_client(
                    log_host=f"{logserver.private_ip}:8081",
                    server_address=f"{server.private_ip}:8080",
                    baseline=baseline,
                    setting=setting,
                    cid=client_setting.cid,
                ),
            ) for client_setting in settings.clients
        ])

    # Shutdown server and client instances after 10min if not at least one
    # Flower process is running on them
    log(INFO, "(9/9) Start shutdown watcher script.")
    cluster.exec_all(command.watch_and_shutdown("flwr", adapter))

    # Give user info how to tail logfile
    private_key = (DOCKER_PRIVATE_KEY if adapter == "docker" else
                   path.expanduser(CONFIG.get("ssh", "private_key")))

    log(
        INFO,
        "If you would like to tail the central logfile run:\n\n\t%s\n",
        command.tail_logfile(adapter, private_key, logserver),
    )
示例#4
0

# Pre-built pools of client instances reused by the SETTINGS entries below:
# a 50-client pool and a 10-client pool, each client sized at 2 vCPUs / 8 GB RAM.
client_instances_50, client_names_50 = configure_client_instances(
    num_clients=50, num_cpu=2, num_ram=8)

client_instances_10, client_names_10 = configure_client_instances(
    num_clients=10, num_cpu=2, num_ram=8)

SETTINGS = {
    ###
    ### FedFS vs FedAvg
    ###
    "fn-c25-r50-fedavg-230":
    Baseline(
        instances=[
            Instance(name="server", group="server", num_cpu=4, num_ram=16)
        ] + client_instances_50,
        server=ServerSetting(
            instance_name="server",
            strategy="fedavg",
            rounds=FN_ROUNDS,
            min_num_clients=FN_MIN_NUM_CLIENTS,
            sample_fraction=FN_SAMPLE_FRACTION_25,
            min_sample_size=FN_MIN_SAMPLE_SIZE_25,
            training_round_timeout=FN_TRAINING_ROUND_TIMEOUT,
            lr_initial=FN_LR_INITIAL,
            partial_updates=False,
            importance_sampling=False,
            dynamic_timeout=False,
        ),
        clients=configure_clients(