model=copy.deepcopy(global_model), global_round=epoch) local_weights.append(copy.deepcopy(w)) local_losses.append(copy.deepcopy(loss)) # update global weights global_weights = average_weights(local_weights) # update global weights global_model.load_state_dict(global_weights) loss_avg = sum(local_losses) / len(local_losses) train_loss.append(loss_avg) # Calculate avg training accuracy over all users at every epoch list_acc, list_loss = [], [] global_model.eval() for c in range(args.num_users): local_model = LocalUpdate(args=args, dataset=train_dataset, idxs=user_groups[idx], logger=logger) acc, loss = local_model.inference(model=global_model) list_acc.append(acc) list_loss.append(loss) train_accuracy.append(sum(list_acc) / len(list_acc)) # print global training loss after every 'i' rounds if (epoch + 1) % print_every == 0: print(f' \nAvg Training Stats after {epoch+1} global rounds:') print(f'Training Loss : {np.mean(np.array(train_loss))}') print('Train Accuracy: {:.2f}% \n'.format(100 *
def main():
    """Train a federated model, then test it for every participant coalition.

    The function parses command-line options via ``args_parser``, loads the
    dataset and per-user index groups, builds the global model, and runs
    ``args.epochs`` rounds of local training followed by weight averaging.
    After training it evaluates the global model once for each coalition
    bit pattern and prints the results.

    Side effects visible in this function:
        * reads ``users.txt`` (first line only) and ``traindataset.txt`` from
          the current working directory;
        * overwrites ``args.num_users`` with ``len(user_groups)``;
        * creates a ``SummaryWriter`` under ``../logs``;
        * prints labels and per-coalition test accuracy to stdout.

    Raises:
        ValueError: if either text file contains a non-integer token.
        SystemExit: if the model or (for ``cnn``) the dataset is unrecognized.
    """
    logger = SummaryWriter('../logs')

    args = args_parser()
    exp_details(args)

    if args.gpu:
        torch.cuda.set_device(0)
    device = 'cuda' if args.gpu else 'cpu'

    # load dataset and user groups
    train_dataset, test_dataset, user_groups = get_dataset(args)
    args.num_users = len(user_groups)

    # BUILD MODEL
    if args.model == 'cnn':
        # Convolutional neural network
        if args.dataset == 'mnist':
            global_model = CNNMnist(args=args)
        elif args.dataset == 'fmnist':
            global_model = CNNFashion_Mnist(args=args)
        elif args.dataset == 'cifar':
            global_model = CNNCifar(args=args)
        else:
            # Without this branch global_model would be unbound below.
            exit('Error: unrecognized dataset')
    elif args.model == 'mlp':
        # Multi-layer perceptron: input width is the flattened sample size.
        img_size = train_dataset[0][0].shape
        len_in = 1
        for x in img_size:
            len_in *= x
        global_model = MLP(dim_in=len_in, dim_hidden=64,
                           dim_out=args.num_classes)
    else:
        exit('Error: unrecognized model')

    # Set the model to train and send it to device.
    global_model.to(device)
    global_model.train()

    # Training
    train_loss, train_accuracy = [], []

    # Read which participants currently take part in training
    # (0 means the participant is included, 1 means it is not).
    # str.split() without an argument ignores repeated spaces and the
    # trailing newline, which split(' ') + int() would choke on.
    with open('users.txt', "r") as fp:
        users = [int(token) for token in fp.readline().split()]

    for epoch in range(args.epochs):
        local_weights, local_losses = [], []

        global_model.train()
        m = max(int(args.frac * args.num_users), 1)
        idxs_users = np.random.choice(range(args.num_users), m, replace=False)

        for idx in idxs_users:
            local_model = LocalUpdate(args=args, dataset=train_dataset,
                                      idxs=user_groups[idx], logger=logger)
            w, loss = local_model.update_weights(
                model=copy.deepcopy(global_model), global_round=epoch)
            local_weights.append(copy.deepcopy(w))
            local_losses.append(copy.deepcopy(loss))

        # update global weights
        global_weights = average_weights(local_weights)
        global_model.load_state_dict(global_weights)

        loss_avg = sum(local_losses) / len(local_losses)
        train_loss.append(loss_avg)

        # Calculate avg training accuracy over all users at every epoch
        list_acc = []
        global_model.eval()
        for user_id in range(args.num_users):
            # Index by the loop variable: the previous code reused the stale
            # `idx` from the sampling loop and so evaluated the same user's
            # data num_users times.
            local_model = LocalUpdate(args=args, dataset=train_dataset,
                                      idxs=user_groups[user_id], logger=logger)
            acc, _ = local_model.inference(model=global_model)
            list_acc.append(acc)
        train_accuracy.append(sum(list_acc) / len(list_acc))

        # Periodic progress printing is disabled; it used to be:
        # if (epoch+1) % print_every == 0:
        #     print(f' \nAvg Training Stats after {epoch+1} global rounds:')
        #     print(f'Training Loss : {np.mean(np.array(train_loss))}')
        #     print('Train Accuracy: {:.2f}% \n'.format(100*train_accuracy[-1]))

    # Test inference after completion of training

    # Read which labels were assigned to which participant: one
    # whitespace-separated row of integers per line of the file.
    with open('traindataset.txt') as ftrain:
        testlabels = [[int(token) for token in line.split()]
                      for line in ftrain]
    print("USERS LABELS")
    print(testlabels)

    # Run the test for every coalition pattern j in [0, 2**num_users - 2].
    for j in range((2 ** args.num_users) - 1):
        binary = numberToBinary(j, len(users))
        test_acc, test_loss = test_inference(
            args, global_model, test_dataset, testlabels, binary, len(binary))

        # Print test results
        print("RESZTVEVOK")
        print(users)
        print("TEST NUMBER")
        print(j)
        print("TEST BINARY")
        print(binary)
        print("TEST LABELS")
        print(testlabels)
        print("Test Accuracy")
        print("{:.2f}%".format(100 * test_acc))
        print()

    # Saving the objects train_loss and train_accuracy is disabled; the
    # commented-out code started with:
    # file_name = '../save/objects/{}_{}_{}_C[{}]_iid[{}]_E[{}]_B[{}].pkl'. ...
def main_test(args):
    """Run federated training with a server-side optimizer step.

    Args:
        args: Experiment configuration object. Attributes read here:
            ``save_path``, ``exp_folder``, ``dataset``, ``model``, ``epochs``,
            ``frac``, ``iid``, ``local_ep``, ``local_bs``, ``seed``, ``gpu``,
            ``optimizer``, ``norm``, ``num_users``, ``num_classes``,
            ``resume``, ``hold_normalize`` and ``verbose``.
            ``args.save_path`` is extended in place with ``args.exp_folder``
            and ``args.num_classes`` is forced to 100 for ``'cifar100'``.

    Returns:
        tuple: ``(train_loss, val_acc_list, client_loss[0],
        client_conv_grad[0], client_fc_grad[0], client_total_grad_norm[0])``.
        ``train_loss`` and ``val_acc_list`` have one entry per executed
        round; the remaining four are the values recorded for client 0 only.

    Side effects:
        Creates the checkpoint and log directories, writes a temporary
        state dict every 100 rounds, and writes a final checkpoint to
        ``SAVE_PATH``. The checkpoint always holds ``'global_model'`` and
        ``'a_iter'`` (index of the last finished round), plus
        ``'model_<i>'`` entries when ``args.hold_normalize`` is set, so that
        the resume branch below can read back what this function wrote.

    Raises:
        SystemExit: if the model or (for ``cnn``) the dataset is unrecognized.
        KeyError: on resume, if the checkpoint lacks ``'global_model'`` or a
            required ``'model_<i>'`` entry.
    """
    now = datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')
    # define paths
    logger = SummaryWriter('../logs')

    # Disabled because args is passed in (e.g. as an easydict):
    # args = args_parser()

    # Checkpoint locations; exist_ok avoids the exists()/makedirs() race.
    args.save_path = os.path.join(args.save_path, args.exp_folder)
    os.makedirs(args.save_path, exist_ok=True)
    save_path_tmp = os.path.join(args.save_path, 'tmp_{}'.format(now))
    os.makedirs(save_path_tmp, exist_ok=True)
    SAVE_PATH = os.path.join(
        args.save_path,
        '{}_{}_T[{}]_C[{}]_iid[{}]_E[{}]_B[{}]'.format(
            args.dataset, args.model, args.epochs, args.frac, args.iid,
            args.local_ep, args.local_bs))

    # Fix the random seeds
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    # torch.cuda.set_device(0)
    device = torch.device(
        "cuda:{}".format(args.gpu) if torch.cuda.is_available() else "cpu")

    # Create the log file
    log_path = os.path.join('../logs', args.exp_folder)
    os.makedirs(log_path, exist_ok=True)

    loggertxt = get_logger(
        os.path.join(log_path, '{}_{}_{}_{}.log'.format(
            args.model, args.optimizer, args.norm, now)))
    logging.info(args)

    # csv
    csv_save = '../csv/' + now
    csv_path = os.path.join(csv_save, 'accuracy.csv')
    csv_logger_keys = ['train_loss', 'accuracy']
    csvlogger = CSVLogger(csv_path, csv_logger_keys)

    # load dataset and per-client loaders
    train_dataset, test_dataset, client_loader_dict = get_dataset(args)

    # Set automatically for CIFAR-100
    if args.dataset == 'cifar100':
        args.num_classes = 100

    # BUILD MODEL
    if args.model == 'cnn':
        # Convolutional neural network
        if args.dataset == 'mnist':
            global_model = CNNMnist(args=args)
        elif args.dataset == 'fmnist':
            global_model = CNNFashion_Mnist(args=args)
        elif args.dataset == 'cifar':
            global_model = CNNCifar(args=args)
        elif args.dataset == 'cifar100':
            global_model = CNNCifar(args=args)
        else:
            # Without this branch global_model would be unbound below.
            exit('Error: unrecognized dataset')
    elif args.model == 'mlp':
        # Multi-layer perceptron: input width is the flattened sample size.
        img_size = train_dataset[0][0].shape
        len_in = 1
        for x in img_size:
            len_in *= x
        global_model = MLP(dim_in=len_in, dim_hidden=64,
                           dim_out=args.num_classes)
    elif args.model == 'cnn_vc':
        global_model = CNNCifar_fedVC(args=args)
    elif args.model == 'cnn_vcbn':
        global_model = CNNCifar_VCBN(args=args)
    elif args.model == 'cnn_vcgn':
        global_model = CNNCifar_VCGN(args=args)
    elif args.model == 'resnet18_ws':
        global_model = resnet18(num_classes=args.num_classes, weight_stand=1)
    elif args.model == 'resnet18':
        global_model = resnet18(num_classes=args.num_classes, weight_stand=0)
    elif args.model == 'resnet32':
        global_model = ResNet32_test(num_classes=args.num_classes)
    elif args.model == 'resnet18_mabn':
        global_model = resnet18_mabn(num_classes=args.num_classes)
    elif args.model == 'vgg':
        global_model = vgg11()
    elif args.model == 'cnn_ws':
        global_model = CNNCifar_WS(args=args)
    else:
        exit('Error: unrecognized model')

    loggertxt.info(global_model)

    # One private model copy per client (for FedBN-style runs where the
    # normalization layers are not communicated). Copies are taken before
    # the global model is moved to `device`.
    client_models = [copy.deepcopy(global_model)
                     for _ in range(args.num_users)]

    # Set the model to train and send it to device.
    global_model.to(device)
    global_model.train()

    # Training
    train_loss = []
    val_acc_list = []

    # Per-client traces (used to inspect "how does BN help").
    client_loss = [[] for _ in range(args.num_users)]
    client_conv_grad = [[] for _ in range(args.num_users)]
    client_fc_grad = [[] for _ in range(args.num_users)]
    client_total_grad_norm = [[] for _ in range(args.num_users)]

    # Resume from a previous checkpoint
    if args.resume:
        # map_location lets a checkpoint saved on GPU load on this device.
        checkpoint = torch.load(SAVE_PATH, map_location=device)
        global_model.load_state_dict(checkpoint['global_model'])
        for client_idx in range(args.num_users):
            state_key = ('model_{}'.format(client_idx)
                         if args.hold_normalize else 'global_model')
            client_models[client_idx].load_state_dict(checkpoint[state_key])
        # Checkpoints without 'a_iter' restart from round 0.
        resume_iter = int(checkpoint.get('a_iter', -1)) + 1
        print('Resume trainig form epoch {}'.format(resume_iter))
    else:
        resume_iter = 0

    # learning rate scheduler
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, gamma=0.1,step_size=500)

    # start training; continue at resume_iter instead of repeating rounds
    last_epoch = resume_iter - 1
    for epoch in tqdm(range(resume_iter, args.epochs)):
        local_weights, local_losses = [], []
        if args.verbose:
            print(f'\n | Global Training Round : {epoch + 1} |\n')

        global_model.train()
        m = max(int(args.frac * args.num_users), 1)
        idxs_users = np.random.choice(range(args.num_users), m, replace=False)

        for idx in idxs_users:
            # Disabled: copying the global weights into client_models[idx],
            # skipping keys containing 'bn' when args.hold_normalize is set.
            torch.cuda.empty_cache()

            local_model = LocalUpdate(args=args, logger=logger,
                                      train_loader=client_loader_dict[idx],
                                      device=device)
            (w, loss, batch_loss, conv_grad, fc_grad,
             total_gard_norm) = local_model.update_weights(
                model=copy.deepcopy(global_model), global_round=epoch,
                idx_user=idx)
            local_weights.append(copy.deepcopy(w))
            # Client's average loss for the round, e.g. 0.35 (described in
            # the original comment as the mean of the batch losses).
            local_losses.append(copy.deepcopy(loss))

            # scheduler over all rounds
            # scheduler.step()

            # For loss graphs: keep the progression separately per client.
            client_loss[idx].append(batch_loss)
            client_conv_grad[idx].append(conv_grad)
            client_fc_grad[idx].append(fc_grad)
            client_total_grad_norm[idx].append(total_gard_norm)

            # Disabled: copy gn/bn back into the client model.
            # client_models[idx].load_state_dict(w)

            del local_model
            del w

        # update global weights
        global_weights = average_weights(
            local_weights, client_loader_dict, idxs_users)

        # Server optimizer step. Statement order is kept exactly as before.
        # NOTE(review): the optimizer is rebuilt every round, so opt_state
        # comes from a fresh optimizer; confirm this is intended.
        # opt = OptRepo.name2cls('adam')(global_model.parameters(), lr=0.01, betas=(0.9, 0.99), eps=1e-3)
        opt = OptRepo.name2cls('sgd')(
            global_model.parameters(), lr=10, momentum=0.9)
        opt.zero_grad()
        opt_state = opt.state_dict()
        global_weights = aggregation(global_weights, global_model)
        global_model.load_state_dict(global_weights)
        opt = OptRepo.name2cls('sgd')(
            global_model.parameters(), lr=10, momentum=0.9)
        # opt = OptRepo.name2cls('adam')(global_model.parameters(), lr=0.01, betas=(0.9, 0.99), eps=1e-3)
        opt.load_state_dict(opt_state)
        opt.step()

        loss_avg = sum(local_losses) / len(local_losses)
        train_loss.append(loss_avg)

        global_model.eval()
        # Per-user training-set inference is disabled; the accuracy below is
        # computed on test_dataset although the log line says "Train".
        round_accuracy = test_inference(
            args, global_model, test_dataset, device=device)
        val_acc_list.append(round_accuracy)

        # Stats are logged every round (no print_every gating).
        loggertxt.info(f' \nAvg Training Stats after {epoch + 1} global rounds:')
        loggertxt.info(f'Training Loss : {loss_avg}')
        loggertxt.info('Train Accuracy: {:.2f}% \n'.format(100 * round_accuracy))
        csvlogger.write_row([loss_avg, 100 * round_accuracy])
        if (epoch + 1) % 100 == 0:
            tmp_save_path = os.path.join(
                save_path_tmp, 'tmp_{}.pt'.format(epoch + 1))
            torch.save(global_model.state_dict(), tmp_save_path)

        last_epoch = epoch

    # Test inference after completion of training
    test_acc = test_inference(args, global_model, test_dataset, device=device)

    print(' Saving checkpoints to {}...'.format(SAVE_PATH))
    # Always store the keys the resume branch reads; client models are added
    # only when normalization layers are held per client.
    final_state = {
        'global_model': global_model.state_dict(),
        'a_iter': last_epoch,
    }
    if args.hold_normalize:
        for idx, model in enumerate(client_models):
            final_state['model_{}'.format(idx)] = model.state_dict()
    torch.save(final_state, SAVE_PATH)

    loggertxt.info(f' \n Results after {args.epochs} global rounds of training:')
    # loggertxt.info("|---- Avg Train Accuracy: {:.2f}%".format(100*train_accuracy[-1]))
    loggertxt.info("|---- Test Accuracy: {:.2f}%".format(100 * test_acc))

    # The averages below do not work well when frac != 1.
    # batch_loss_list = np.array(client_loss).sum(axis=0) / args.num_users
    # conv_grad_list = np.array(client_conv_grad).sum(axis=0) / args.num_users
    # fc_grad_list = np.array(client_fc_grad).sum(axis=0) / args.num_users
    # total_grad_list = np.array(client_total_grad_norm).sum(axis=0) /args.num_users

    # An average over clients was intended, but only client 0 is returned
    # for now; a bug is expected if clients have different batch counts.
    return (train_loss, val_acc_list, client_loss[0], client_conv_grad[0],
            client_fc_grad[0], client_total_grad_norm[0])
def main(): start_time = time.time() # define paths path_project = os.path.abspath('..') logger = SummaryWriter('../logs') args = args_parser() args = adatok.arguments(args) exp_details(args) if args.gpu: torch.cuda.set_device(args.gpu) device = 'cuda' if args.gpu else 'cpu' # load dataset and user groups train_dataset, test_dataset, user_groups = get_dataset(args) if adatok.data.image_initialization == True: adatok.data.image_initialization = False return # BUILD MODEL if args.model == 'cnn': # Convolutional neural netork if args.dataset == 'mnist': global_model = CNNMnist(args=args) elif args.dataset == 'fmnist': global_model = CNNFashion_Mnist(args=args) elif args.dataset == 'cifar': global_model = CNNCifar(args=args) elif args.model == 'mlp': # Multi-layer preceptron img_size = train_dataset[0][0].shape len_in = 1 for x in img_size: len_in *= x global_model = MLP(dim_in=len_in, dim_hidden=64, dim_out=args.num_classes) else: exit('Error: unrecognized model') # Set the model to train and send it to device. 
global_model.to(device) global_model.train() #print(global_model) # copy weights global_weights = global_model.state_dict() # Training train_loss, train_accuracy = [], [] val_acc_list, net_list = [], [] cv_loss, cv_acc = [], [] print_every = 2 val_loss_pre, counter = 0, 0 for epoch in tqdm(range(args.epochs)): local_weights, local_losses = [], [] #print(f'\n | Global Training Round : {epoch+1} |\n') global_model.train() m = max(int(args.frac * args.num_users), 1) idxs_users = np.random.choice(range(args.num_users), m, replace=False) for idx in idxs_users: local_model = LocalUpdate(args=args, dataset=train_dataset, idxs=user_groups[idx], logger=logger) w, loss = local_model.update_weights( model=copy.deepcopy(global_model), global_round=epoch) local_weights.append(copy.deepcopy(w)) local_losses.append(copy.deepcopy(loss)) # update global weights global_weights = average_weights(local_weights) # update global weights global_model.load_state_dict(global_weights) loss_avg = sum(local_losses) / len(local_losses) train_loss.append(loss_avg) # Calculate avg training accuracy over all users at every epoch list_acc, list_loss = [], [] global_model.eval() for c in range(args.num_users): local_model = LocalUpdate(args=args, dataset=train_dataset, idxs=user_groups[idx], logger=logger) acc, loss = local_model.inference(model=global_model) list_acc.append(acc) list_loss.append(loss) train_accuracy.append(sum(list_acc) / len(list_acc)) # print global training loss after every 'i' rounds '''if (epoch+1) % print_every == 0: print(f' \nAvg Training Stats after {epoch+1} global rounds:') print(f'Training Loss : {np.mean(np.array(train_loss))}') print('Train Accuracy: {:.2f}% \n'.format(100*train_accuracy[-1]))''' # Test inference after completion of training for i in adatok.data.test_groups_in_binary: adatok.data.actual_test_group_in_binary = i test_acc, test_loss = test_inference(args, global_model, test_dataset) print("Resoults") print(epoch) 
print(adatok.data.actual_train_group_in_binary) print(adatok.data.actual_test_group_in_binary) print(test_acc) print(test_loss) '''