Example #1
    def _test_reduce_helper(
        self,
        group,
        group_id,
        rank,
        op,
        master_value,
        worker_value,
        expected_value,
        cuda=False,
        rank_to_GPU=None,
    ):
        for src in group:
            if rank == src:
                # The destination rank seeds the tensor with master_value
                # and checks that reduce leaves the expected result in place.
                tensor = _build_tensor(src + 1).fill_(master_value)
                if cuda:
                    tensor = tensor.cuda(rank_to_GPU[rank][0])
                dist.reduce(tensor, src, op, group_id)
                self.assertEqual(tensor, _build_tensor(src + 1, expected_value))
            else:
                # Other ranks contribute worker_value; dist.reduce leaves
                # their input tensors unmodified.
                tensor = _build_tensor(src + 1).fill_(worker_value)
                if cuda:
                    tensor = tensor.cuda(rank_to_GPU[rank][0])
                dist.reduce(tensor, src, op, group_id)

        self._barrier()
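
This helper comes from a test suite, so it leans on fixtures like _build_tensor and self._barrier. For a self-contained feel for the same dist.reduce semantics, here is a minimal sketch; the "gloo" backend, the address/port, and the world size of 2 are illustrative assumptions, not part of the original test.

import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def worker(rank, world_size):
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    tensor = torch.full((3,), float(rank + 1))
    # Sum every rank's tensor into dst=0; other ranks keep their input.
    dist.reduce(tensor, dst=0, op=dist.ReduceOp.SUM)
    if rank == 0:
        print(tensor)  # tensor([3., 3., 3.]) with two ranks: 1.0 + 2.0
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(worker, args=(2,), nprocs=2)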
Example #2
def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that process with rank
    0 has the averaged results. Returns a dict with the same fields as
    loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        for k, v in loss_dict.items():
            loss_names.append(k)
            all_losses.append(v)
        all_losses = torch.stack(all_losses, dim=0)
        dist.reduce(all_losses, dst=0)
        if dist.get_rank() == 0:
            # only main process gets accumulated, so only divide by
            # world_size in this case
            all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses
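
A sketch of how reduce_loss_dict is typically called inside a training step; the loss names and the final print are illustrative assumptions, not part of the snippet.

loss_dict = {"loss_cls": loss_cls, "loss_reg": loss_reg}  # per-rank losses
losses_reduced = reduce_loss_dict(loss_dict)
if dist.get_rank() == 0:
    # Only rank 0 holds the averaged values after dist.reduce(dst=0).
    total = sum(loss for loss in losses_reduced.values())
    print("averaged total loss: %.4f" % total.item())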
Example #3
import torch
import torch.distributed as dist

import model


def run():
    modell = model.CNN()
    # modell = model.AlexNet()

    size = dist.get_world_size()
    rank = dist.get_rank()

    group = dist.new_group(list(range(size)))

    while True:
        # Push the current global weights from rank 0 to every worker.
        for param in modell.parameters():
            dist.broadcast(param.data, src=0, group=group)

        # Gather the workers' updated weights. Rank 0 contributes zeros,
        # so the sum covers only the (size - 1) workers and dividing by
        # (size - 1) yields their average. dist.reduce_op is deprecated;
        # dist.ReduceOp is the current spelling.
        for param in modell.parameters():
            tensor_temp = torch.zeros_like(param.data)
            dist.reduce(tensor_temp, dst=0, op=dist.ReduceOp.SUM, group=group)
            param.data = tensor_temp / (size - 1)
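
The loop above reads like the server side (rank 0) of a parameter-server scheme: broadcast the current weights, then average what the workers send back. A hedged sketch of the matching worker-side step, assuming each worker trains locally between the two collectives; train_locally is a hypothetical placeholder, not a function from the original code.

def worker_step(modell, group):
    # Receive the current global weights from the server (rank 0).
    for param in modell.parameters():
        dist.broadcast(param.data, src=0, group=group)

    train_locally(modell)  # hypothetical local training pass

    # Send the updated weights back; they are summed into rank 0, which
    # contributes zeros and divides by the number of workers.
    for param in modell.parameters():
        dist.reduce(param.data, dst=0, op=dist.ReduceOp.SUM, group=group)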
Example #4
def run(size, rank):
    modell = model.CNN()
    # modell = model.AlexNet()

    optimizer = torch.optim.Adam(modell.parameters(), lr=LR)
    loss_func = torch.nn.CrossEntropyLoss()

    if IID:
        train_loader = Mnist().get_train_data()
        test_data = Mnist().get_test_data()
    elif rank > 0:
        # Ranks 1..5 each load their own non-IID split. Rank 0 gets no
        # data in this branch, exactly as in the original code.
        noniid = Mnist_noniid()
        train_loader = getattr(noniid, 'get_train_data%d' % rank)()
        test_data = getattr(noniid, 'get_test_data%d' % rank)()

    # Keep only the last test batch (this is what the original loop does).
    for step, (b_x, b_y) in enumerate(test_data):
        test_x = b_x
        test_y = b_y

    group = dist.new_group(list(range(size)))

    for epoch in range(MAX_EPOCH):

        # get_new_model (defined elsewhere in this repo) refreshes the
        # local copy with the latest aggregated weights; see the sketch
        # after this example.
        modell = get_new_model(modell, group)

        test_output, last_layer = modell(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy()
        accuracy = float((pred_y == test_y.data.numpy()).astype(int).sum()) / float(test_y.size(0))

        for step, (b_x, b_y) in enumerate(train_loader):
            output = modell(b_x)[0]
            loss = loss_func(output, b_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Sum every rank's weights into rank 0 (dist.reduce_op is the
        # deprecated spelling of dist.ReduceOp).
        for param in modell.parameters():
            dist.reduce(param.data, dst=0, op=dist.ReduceOp.SUM, group=group)

        msg = ('Epoch: %d  Rank: %d | train loss: %.4f | test accuracy: %.2f'
               % (epoch, rank, loss.data.numpy(), accuracy))
        print(msg)
        with open('./test.txt', 'a') as f:
            print(msg, file=f)
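
Both this example and the next call get_new_model, which is not shown in these snippets. What follows is purely a guess at its shape, consistent with the server-side averaging in Example #3: rank 0 holds the freshly averaged weights and every other rank pulls them via broadcast. The two-argument signature matches the call above; the next example calls it with one argument.

def get_new_model(modell, group):
    # Assumed reconstruction: receive the latest averaged weights from
    # rank 0 so every rank starts the epoch from the same parameters.
    for param in modell.parameters():
        dist.broadcast(param.data, src=0, group=group)
    return modell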
Example #5
def run(size, rank):
    modell = model.CNN()
    # modell = model.AlexNet()

    optimizer = torch.optim.Adam(modell.parameters(), lr=LR)
    loss_func = torch.nn.CrossEntropyLoss()

    if IID:
        train_loader = Mnist().get_train_data()
        test_data = Mnist().get_test_data()
    elif rank > 0:
        # Ranks 1..5 each load their own non-IID split; as in the previous
        # example, rank 0 is left without data in this branch.
        noniid = Mnist_noniid()
        train_loader = getattr(noniid, 'get_train_data%d' % rank)()
        test_data = getattr(noniid, 'get_test_data%d' % rank)()

    # Shape from (2000, 28, 28) to (2000, 1, 28, 28), values scaled to [0, 1].
    test_x = torch.unsqueeze(test_data.test_data, dim=1).type(
        torch.FloatTensor) / 255.
    test_y = test_data.test_labels

    group = dist.new_group(list(range(size)))

    for epoch in range(MAX_EPOCH):

        # Refresh the local copy with the latest aggregated weights; note
        # that get_new_model is called without a group argument here,
        # unlike the previous example.
        modell = get_new_model(modell)

        for step, (b_x, b_y) in enumerate(train_loader):
            output = modell(b_x)[0]
            loss = loss_func(output, b_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Sum every replica's weights into rank 0. Reducing param.data
        # rather than the Parameter itself avoids an in-place update of a
        # leaf tensor that requires grad; dist.reduce_op is deprecated in
        # favor of dist.ReduceOp.
        for param in modell.parameters():
            dist.reduce(param.data, dst=0, op=dist.ReduceOp.SUM, group=group)

        test_output, last_layer = modell(test_x)
        pred_y = torch.max(test_output, 1)[1].data.numpy()
        accuracy = float(
            (pred_y == test_y.data.numpy()).astype(int).sum()) / float(
                test_y.size(0))
        print('Epoch: ', epoch, ' Rank: ', rank,
              '| train loss: %.4f' % loss.data.numpy(),
              '| test accuracy: %.2f' % accuracy)
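
Unlike Example #2, nothing visible here divides the reduced sum by the number of contributors, so after the reduce rank 0 holds a sum of all replicas rather than a mean; presumably get_new_model performs that division. If it does not, a minimal fix on rank 0 would be:

if rank == 0:
    with torch.no_grad():
        for param in modell.parameters():
            param.data /= size  # turn the reduced sum into an average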