Example #1
# GET handler: write the test model's f() output with the "_blog" suffix appended
def get(self):
    test_model = test()
    self.write(test_model.f() + "_blog")
Example #2
def main_worker(gpu, ngpus_per_node, args):
    print("gpu:", gpu)
    args.gpu = gpu
    if args.rank == 0:  # (the first node has only three GPUs, so it needs special handling)
        newrank = args.rank * ngpus_per_node + gpu
    else:
        newrank = args.rank * ngpus_per_node + gpu - 1
    # Initialize the process group (TCP-based init_method)
    print("begin init")
    dist.init_process_group(init_method=args.init_method,
                            backend="nccl",
                            world_size=args.world_size,
                            rank=newrank)
    print("end init")

    # Build communication groups: rank 0 acts as the server, and broadcast is used to emulate send/recv, so the server needs a separate group with every client
    group = []
    for i in range(1, args.world_size):
        group.append(dist.new_group([0, i]))
    allgroup = dist.new_group([i for i in range(args.world_size)])

    if newrank == 0:
        """ server"""

        print("使用{}号服务器的第{}块GPU作为server".format(args.rank, gpu))

        #在模型训练期间,server只负责整合参数并分发,不参与任何计算
        #设置cpu
        args.device = torch.device(
            'cuda:{}'.format(args.gpu)
            if torch.cuda.is_available() and args.gpu != -1 else 'cpu')

        net = CNNMnist().to(args.device)
        w_avg = copy.deepcopy(net.state_dict())
        for j in range(args.epochs):
            if j == args.epochs - 1:
                # Final epoch: keep the aggregated weights
                for i in w_avg.keys():
                    temp = w_avg[i].to(args.device)
                    w_avg[i] = average_gradients(temp, group, allgroup)
            else:
                # Intermediate epochs: take part in the collective but discard the result
                for i in w_avg.keys():
                    temp = w_avg[i].to(args.device)
                    average_gradients(temp, group, allgroup)
        torch.save(w_avg, 'w_avg')
        net.load_state_dict(w_avg)
        # Load the test data
        trans_mnist = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])
        dataset_test = datasets.MNIST('data/',
                                      train=False,
                                      download=True,
                                      transform=trans_mnist)
        test_set = torch.utils.data.DataLoader(dataset_test,
                                               batch_size=args.bs)
        test_accuracy, test_loss = test(net, test_set, args)
        print("Testing accuracy: {:.2f}".format(test_accuracy))
        print("Testing loss: {:.2f}".format(test_loss))

    else:
        """clents"""

        print("使用{}号服务器的第{}块GPU作为第{}个client".format(args.rank, gpu, newrank))

        #设置gpu
        args.device = torch.device(
            'cuda:{}'.format(args.gpu)
            if torch.cuda.is_available() and args.gpu != -1 else 'cpu')

        print("begin train...")
        net = CNNMnist().to(args.device)
        print(net)
        data = torch.load("data/distributed/data_of_client{}".format(newrank))
        bsz = 64
        train_set = torch.utils.data.DataLoader(data, batch_size=bsz)

        optimizer = torch.optim.SGD(net.parameters(), lr=args.lr, momentum=0.5)
        num_batches = ceil(len(train_set.dataset) / float(bsz))
        start = time.time()
        for epoch in range(args.epochs):
            # Three local passes over the client's data before each aggregation round
            for local_iter in range(3):
                epoch_loss = 0.0
                for data, target in train_set:
                    data, target = data.to(args.device), target.to(args.device)
                    optimizer.zero_grad()
                    output = net(data)
                    loss = F.nll_loss(output, target)
                    epoch_loss += loss.item()
                    loss.backward()
                    optimizer.step()
                if local_iter == 2:
                    print('Rank ', dist.get_rank(), ', epoch ', epoch, ': ',
                          epoch_loss / num_batches)
            """federated learning"""
            w_avg = copy.deepcopy(net.state_dict())

            for k in w_avg.keys():
                print("k:", k)
                temp = average_gradients(w_avg[k].to(args.device), group,
                                         allgroup)
                w_avg[k] = temp
            net.load_state_dict(w_avg)

        end = time.time()
        print(" training time:{}".format((end - start)))

        train_accuracy, train_loss = test(net, train_set, args)
        print("Training accuracy: {:.2f}".format(train_accuracy))
        print("Training loss: {:.2f}".format(train_loss))
Example #3
        device = torch.device('cpu')
    elif torch.cuda.is_available() and args.use_gpu:
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')

    model.to(device)
    logger.info('Running with device %s', device)

    transform = torchvision.transforms.Compose(
        [torchvision.transforms.ToTensor()])

    if args.test:
        test_dataset = XrayImageFolder(args.test_data, transform=transform)
        test_dataloader = DataLoader(test_dataset, batch_size=cfg.BATCH_SIZE)
        test(model, test_dataloader, device)

    elif args.inference:
        img = cv2.imread(args.img_path)
        try:
            img = transform(img).unsqueeze(0)
        except TypeError:
            logger.exception('Possible incorrect image path or image type')
            raise  # re-raise: otherwise execution continues and fails later at img.to(device)

        start_time = datetime.now()
        with torch.no_grad():
            img = img.to(device)
            output = model.inference(img)

        label = output.data.argmax()
        prob = output.detach()[0][label].item() * 100
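
The confidence computed above (output.detach()[0][label].item() * 100) assumes that model.inference already returns probabilities. If it returns raw logits instead (this snippet does not say either way), a softmax would be needed first; a small illustrative sketch with placeholder values:

import torch
import torch.nn.functional as F

logits = torch.tensor([[1.2, -0.4]])   # stand-in for the model's raw output
probs = F.softmax(logits, dim=1)       # convert logits to class probabilities
label = probs.argmax(dim=1).item()     # predicted class index
prob = probs[0][label].item() * 100    # confidence of the prediction, in percent
print(label, round(prob, 2))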
Example #4
# GET handler: write the test model's f() output with the "_admin" suffix appended
def get(self):
    test_model = test()
    self.write(test_model.f() + "_admin")
Example #5
    print('run')
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            train_epoch(i, train_loader, model, criterion, optimizer, opt,
                        train_logger, train_batch_logger)
        if not opt.no_val:
            validation_loss = val_epoch(i, val_loader, model, criterion, opt,
                                        val_logger)

        if not opt.no_train and not opt.no_val:
            scheduler.step(validation_loss)

    if opt.test:
        spatial_transform = Compose([
            Scale(int(opt.sample_size / opt.scale_in_test)),
            CornerCrop(opt.sample_size, opt.crop_position_in_test),
            ToTensor(opt.norm_value), norm_method
        ])
        temporal_transform = LoopPadding(opt.sample_duration)
        target_transform = VideoID()

        test_data = get_test_set(opt, spatial_transform, temporal_transform,
                                 target_transform)
        test_loader = torch.utils.data.DataLoader(
            test_data,
            batch_size=opt.batch_size,
            shuffle=False,
            num_workers=opt.n_threads,
            pin_memory=True)
        test.test(test_loader, model, opt, test_data.class_names)
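
In the loop above, scheduler.step(validation_loss) suggests a ReduceLROnPlateau-style scheduler, although its construction is not shown in this excerpt. A minimal, self-contained sketch of how such a scheduler is typically wired to a validation metric (the model, optimizer settings, and metric values below are placeholders):

import torch

model = torch.nn.Linear(10, 2)                      # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.1, patience=10)

for epoch in range(3):
    optimizer.zero_grad()
    loss = model(torch.randn(4, 10)).sum()          # dummy training step
    loss.backward()
    optimizer.step()
    validation_loss = 1.0 / (epoch + 1)             # dummy validation metric
    scheduler.step(validation_loss)                 # lowers the LR when the metric plateaus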