Example #1
import numpy as np

import bluefog.torch as bf
from bluefog.common import topology_util

# problem_setup, pin_model_to_device, dynamic_neighbor_allreduce_train, and
# evaluation are helpers defined elsewhere in the test module.


def test_dynamic_neighbor_allreduce_optimizer(device, atc_style, kwargs):
    error_threshold = kwargs.get("error_threshold", 1.5)

    problem_builder, train_dataloader, test_dataloader, model, optimizer, num_epochs = \
        problem_setup()

    isCUDA = pin_model_to_device(device, model)

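    # Wrap the plain optimizer in Bluefog's distributed optimizer: ATC
    # (adapt-then-combine) applies the local gradient step before the
    # neighbor averaging, while AWC (adapt-with-combine) performs the
    # averaging together with the local step.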
    base_dist_optimizer = (bf.DistributedAdaptThenCombineOptimizer if atc_style else
                           bf.DistributedAdaptWithCombineOptimizer)
    optimizer = base_dist_optimizer(optimizer, model=model,
                                    communication_type=bf.CommunicationType.neighbor_allreduce)

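    # The generator yields a fresh (send_ranks, recv_ranks) pair on every
    # call, so each training step communicates with a different single peer.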
    dynamic_topo_gen = topology_util.GetDynamicOnePeerSendRecvRanks(
        bf.load_topology(), bf.rank())

    # Train and test
    train_mse = []
    test_mse = []
    for _ in range(num_epochs):
        dynamic_neighbor_allreduce_train(model, optimizer, train_dataloader, isCUDA,
                                         dynamic_topo_gen)
        train_mse.append(evaluation(model, train_dataloader, isCUDA))
        test_mse.append(evaluation(model, test_dataloader, isCUDA))
    train_mse = np.array(train_mse)
    test_mse = np.array(test_mse)

    # Check if the MSEs in the last three epochs are small enough
    assert (
        train_mse[-3:].max() < error_threshold*problem_builder.noise_level**2
    ), "Train MSE in the last three epochs doesn't converge."
    assert (
        test_mse[-3:].max() < error_threshold*problem_builder.noise_level**2
    ), "Test MSE in the last three epochs doesn't converge."
Example #2
                     "[expo2(Default), ring, mesh, star].")

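# The exact network-wide average serves as ground truth; mse tracks the
# relative consensus error of the decentralized iterates against it.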
x_bar = bf.allreduce(x, average=True)
mse = [torch.norm(x - x_bar, p=2) / torch.norm(x_bar, p=2)]

if not args.asynchronous_mode:
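    # Leaving the weights as None makes neighbor_allreduce fall back to the
    # currently registered static topology and its default weights.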
    self_weight = None
    neighbor_weights = None
    send_neighbors = None

    if args.enable_dynamic_topology:
        if args.virtual_topology == "InnerOuterExpo2":
            dynamic_neighbor_allreduce_gen = topology_util.GetInnerOuterExpo2DynamicSendRecvRanks(
                bf.size(), local_size=bf.local_size(), self_rank=bf.rank())
        else:
            dynamic_neighbor_allreduce_gen = topology_util.GetDynamicOnePeerSendRecvRanks(
                bf.load_topology(), bf.rank())

    for ite in range(args.max_iters):
        if args.enable_dynamic_topology:
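            # Draw this iteration's peers, then weight self plus each
            # received neighbor uniformly so the mixing weights sum to one.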
            send_neighbors, recv_neighbors = next(
                dynamic_neighbor_allreduce_gen)
            neighbor_weights = {
                r: 1 / (len(recv_neighbors) + 1)
                for r in recv_neighbors
            }
            self_weight = 1 / (len(recv_neighbors) + 1)

        x = bf.neighbor_allreduce(x,
                                  name='x',
                                  self_weight=self_weight,
                                  neighbor_weights=neighbor_weights,
                                  send_neighbors=send_neighbors)
        # Relative consensus error after this round of mixing.
        mse.append(torch.norm(x - x_bar, p=2) / torch.norm(x_bar, p=2))
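Both snippets assume one Bluefog process per rank; a typical launch (sketched
here with a hypothetical script name, not part of the original listing) is:

bfrun -np 4 python your_script.py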