```python
#coding=utf-8
import torch
import ctypes
import dadt.pytorch as dadt

dadt.init(broad_cast_executor='nccl', all_reduce_executor='nccl')

device = torch.device('cuda:{}'.format(dadt.local_rank()))
# device = torch.device('cpu')

if 0 == dadt.rank():
    x = torch.tensor([1, 2, 3, 4], device=device, dtype=torch.float)
else:
    x = torch.tensor([1, 1, 1, 1], device=device, dtype=torch.float)

y = dadt.all_reduce(x, "x")
print(dadt.rank(), y)

y = dadt.all_reduce(x, "x")
print(dadt.rank(), y)

dadt.shutdown()
```
```python
#coding=utf-8
import torch
import ctypes
import dadt.pytorch as dadt

dadt.init(broad_cast_executor='mpi')

device = torch.device('cuda:{}'.format(dadt.local_rank()))

if 0 == dadt.rank():
    x = torch.tensor([1, 2, 3, 4], device=device)
else:
    x = torch.tensor([0, 0, 0, 0], device=device)

y = dadt.broad_cast(x, "x")
print(dadt.rank(), y)

dadt.shutdown()
```
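The broadcast example above shows that `dadt.broad_cast(tensor, name)` distributes rank 0's values to every rank, so one natural use in data-parallel training is to make all workers start from identical model weights. The sketch below is only an illustration built from the calls shown in these examples; the `broadcast_parameters` helper and the in-place copy are my own, not a utility provided by dadt, and whether `dadt.DistributedOptimizer` already performs such an initial broadcast is not shown here.

```python
#coding=utf-8
import torch
import torchvision.models as models
import dadt.pytorch as dadt


def broadcast_parameters(model):
    """Hypothetical helper: overwrite every rank's parameters with rank 0's copy."""
    with torch.no_grad():
        for name, param in model.named_parameters():
            # the string name identifies the same tensor across all processes
            synced = dadt.broad_cast(param.data, name)
            param.data.copy_(synced)


dadt.init(broad_cast_executor='nccl', all_reduce_executor='nccl')

device = torch.device('cuda:{}'.format(dadt.local_rank()))
model = models.resnet18(pretrained=False).to(device)

# after this call every rank holds rank 0's initial weights
broadcast_parameters(model)

dadt.shutdown()
```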
```python
import os
import time

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import datasets, models, transforms

import dadt.pytorch as dadt


def train():
    # init dadt
    dadt.init(cycle_duration_ms=5,
              broad_cast_executor='nccl',
              all_reduce_executor='nccl',
              group_buffer_size=0)

    # Data augmentation and normalization for training
    # Just normalization for validation
    data_transforms = {
        'train': transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        'val': transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
    }

    data_dir = 'hymenoptera_data'
    image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                              data_transforms[x])
                      for x in ['train', 'val']}
    dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x],
                                                  batch_size=4,
                                                  shuffle=True,
                                                  num_workers=4)
                   for x in ['train', 'val']}
    dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
    class_names = image_datasets['train'].classes

    device = torch.device("cuda:{}".format(dadt.local_rank()))

    model_ft = models.resnet101(pretrained=False)
    num_ftrs = model_ft.fc.in_features
    # Here the size of each output sample is set to 2.
    # Alternatively, it can be generalized to nn.Linear(num_ftrs, len(class_names)).
    model_ft.fc = nn.Linear(num_ftrs, 2)
    model_ft = model_ft.to(device)

    criterion = nn.CrossEntropyLoss()

    # Observe that all parameters are being optimized
    optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.002, momentum=0.9)

    # init distributed optimizer
    d_optimizer = dadt.DistributedOptimizer(optimizer=optimizer_ft)

    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

    model_ft.train()

    total_cost_time = 0.0
    total_count = 0.0

    for epoch in range(2500000):
        running_loss = 0.0
        running_corrects = 0

        for inputs, labels in dataloaders['train']:
            start_time = time.time()

            inputs = inputs.to(device)
            labels = labels.to(device)

            # zero the parameter gradients
            d_optimizer.zero_grad()

            outputs = model_ft(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)

            loss.backward()
            d_optimizer.step()

            cost_time = int(round((time.time() - start_time) * 1000))
            total_cost_time += cost_time
            total_count += 1

            # statistics
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)

            print('Rank:{}, cost time:{}, avg time:{} epoch:{}, loss:{}'.format(
                dadt.rank(), cost_time, total_cost_time / total_count, epoch,
                loss.item()))

        print('--------------------------------------------------------------------------')

        epoch_loss = running_loss / dataset_sizes['train']
        epoch_acc = running_corrects.double() / dataset_sizes['train']
        print('Rank:{}, {} Loss: {:.4f} Acc: {:.4f}'.format(
            dadt.rank(), 'train', epoch_loss, epoch_acc))
```
```python
import argparse

import torch
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torchvision import datasets, transforms

import dadt.pytorch as dadt

# Net, train and test are assumed to be defined as in the standard
# PyTorch MNIST example.


def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    # initialize dadt
    dadt.init(broad_cast_executor='nccl',
              all_reduce_executor='nccl',
              all_reduce_buffer_size=64 * 1024 * 1024)

    torch.manual_seed(args.seed)

    # get device by rank
    device = torch.device("cuda:{}".format(dadt.local_rank()))

    kwargs = {'batch_size': args.batch_size}
    kwargs.update({'num_workers': 1, 'pin_memory': True, 'shuffle': True})

    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307,), (0.3081,))])

    # each local rank uses its own data directory
    dataset1 = datasets.MNIST('./data{}'.format(dadt.local_rank()),
                              train=True,
                              download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('./data{}'.format(dadt.local_rank()),
                              train=False,
                              transform=transform)

    train_loader = torch.utils.data.DataLoader(dataset1, **kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **kwargs)

    model = Net().to(device)

    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    # wrap the local optimizer with dadt's distributed optimizer
    distribute_optimizer = dadt.DistributedOptimizer(optimizer=optimizer)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, distribute_optimizer, epoch)
        test(model, device, test_loader)
        scheduler.step()

    # only rank 0 saves the model
    if args.save_model and 0 == dadt.rank():
        torch.save(model.state_dict(), "mnist_cnn.pt")

    # shut down background thread
    dadt.shutdown()
```
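`main()` above relies on a `Net` model plus `train` and `test` helpers that are not shown. As a reference, here is a minimal sketch of what `train` might look like, following the standard PyTorch MNIST example: the use of `F.nll_loss` assumes `Net` returns log-probabilities as in that example, and the per-rank log format is my own, not part of dadt.

```python
import torch.nn.functional as F

import dadt.pytorch as dadt


def train(args, model, device, train_loader, optimizer, epoch):
    # `optimizer` is the dadt.DistributedOptimizer wrapper created in main()
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        # step() goes through the dadt wrapper, which handles the
        # cross-rank gradient synchronization
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Rank:{} Epoch:{} [{}/{}] Loss: {:.6f}'.format(
                dadt.rank(), epoch, batch_idx * len(data),
                len(train_loader.dataset), loss.item()))
```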