# Module-level imports assumed by the run() variants below (args, MySGD,
# quantization, pull_judge, get_upload, and test_model are defined elsewhere
# in the repository):
import os

import torch
from torch.autograd import Variable


# Variant 1: every worker uploads a quantized gradient; after the averaged
# update is applied to the global model (models[0]), only a randomly chosen
# subset of "pulling" workers downloads the new global weights, while the
# remaining workers keep applying their own local quantized update.
def run(workers, models, save_path, train_data_list, test_data, iterations_epoch):
    workers_num = len(workers)
    print('Model recved successfully!')

    # One local optimizer per worker.
    if args.lr == 0.0:
        if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
            learning_rate = 0.1
        else:
            learning_rate = 0.01
    else:
        learning_rate = args.lr
    optimizers_list = []
    for i in workers:
        optimizer = MySGD(models[i].parameters(), lr=learning_rate)
        optimizers_list.append(optimizer)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 50
    else:
        decay_period = 1000

    print('Begin!')

    # Store (train loss, energy, iterations).
    trainloss_file = './trainloss' + args.model + '.txt'
    if os.path.isfile(trainloss_file):
        os.remove(trainloss_file)
    f_trainloss = open(trainloss_file, 'a')

    log_file = args.model + 'log.txt'
    if os.path.isfile(log_file):
        os.remove(log_file)
    f_log = open(log_file, 'a')

    train_data_iter_list = []
    for i in workers:
        train_data_iter_list.append(iter(train_data_list[i - 1]))

    epoch_train_loss = 0.0
    total_time = 0.0
    total_pulling_ratio = 0.0
    epoch_avg_pull_ratio = 0.0
    clock_epoch = 0
    test_loss = 0
    test_acc = 0

    for iteration in range(args.epochs * iterations_epoch):
        clock_epoch += 1
        iteration_loss = 0.0
        epoch = int((iteration + 1) / iterations_epoch)

        for i in workers:
            models[i].train()

        # Local step: each worker computes its update on one mini-batch.
        g_list = []
        for i in workers:
            try:
                data, target = next(train_data_iter_list[i - 1])
            except StopIteration:
                # Restart the worker's data iterator once it is exhausted.
                train_data_iter_list[i - 1] = iter(train_data_list[i - 1])
                data, target = next(train_data_iter_list[i - 1])
            data, target = Variable(data), Variable(target)

            optimizers_list[i - 1].zero_grad()
            output = models[i](data)
            loss = criterion(output, target)
            loss.backward()

            delta_ws = optimizers_list[i - 1].get_delta_w()
            g_list.append(delta_ws)
            iteration_loss += loss.data.item() / workers_num
        epoch_train_loss += iteration_loss

        # Quantize each worker's update (the returned compression ratio is not used here).
        g_q_list = []
        for g in g_list:
            g_quantization, compression_ratio = quantization(g, args.bit)
            g_q_list.append(g_quantization)

        # Synchronization: average the quantized updates and apply them to the global model.
        g_avg = []
        for p_idx, param in enumerate(models[0].parameters()):
            global_update_layer = torch.zeros_like(param.data)
            for w in workers:
                global_update_layer += g_q_list[w - 1][p_idx]
            tensor = global_update_layer / workers_num
            g_avg.append(tensor)
            param.data -= tensor

        # Only the sampled "pulling" workers download the new global weights;
        # the others apply their own quantized update locally.
        pull_workers = 0
        pull_workers_list = pull_judge(workers_num, args.ratio)
        for w in workers:
            isPulling = w in pull_workers_list
            if isPulling:
                pull_workers += 1
            for p_idx, param in enumerate(models[0].parameters()):
                if isPulling:
                    # Copy the global weights; do not alias the global parameter tensor.
                    list(models[w].parameters())[p_idx].data = param.data.clone()
                else:
                    list(models[w].parameters())[p_idx].data -= g_q_list[w - 1][p_idx]

        print('Epoch {}, Loss:{}'.format(epoch, loss.data.item()))  # loss of the last worker
        total_pulling_ratio += pull_workers / workers_num
        epoch_avg_pull_ratio += pull_workers / workers_num

        f_log.write(str(args.this_rank) + "\t" +
                    str(iteration_loss) + "\t" +
                    str(epoch) + "\t" +
                    str(pull_workers / workers_num) +  # ratio of pulling workers
                    "\t" + str(iteration) +
                    "\t" + str(pull_workers_list) +
                    '\n')
        f_log.flush()

        # Log the train loss once per epoch; run the test every second epoch.
        if iteration % iterations_epoch == 0:
            if iteration % (2 * iterations_epoch) == 0:
                test_loss, test_acc = test_model(0, models[0], test_data, criterion=criterion)
            f_trainloss.write(str(args.this_rank) + "\t" +
                              str(epoch_train_loss / float(clock_epoch)) + "\t" +
                              str(test_loss) + "\t" +
                              str(test_acc) + "\t" +
                              str(total_pulling_ratio) +  # accumulated pulling ratio of workers
                              "\t" + str(epoch) +
                              "\t" + str(epoch_avg_pull_ratio / clock_epoch) +  # average pulling ratio within the epoch
                              "\t" + str(iteration) +
                              "\t" + str(total_time) +  # time (placeholder, never updated)
                              '\n')
            f_trainloss.flush()
            epoch_train_loss = 0.0
            epoch_avg_pull_ratio = 0.0
            clock_epoch = 0

        # Decay the learning rate once per decay_period epochs, at the epoch boundary.
        if iteration % iterations_epoch == 0 and (epoch + 1) % decay_period == 0:
            for i in workers:
                for param_group in optimizers_list[i - 1].param_groups:
                    param_group['lr'] *= 0.1
                    print('LR Decreased! Now: {}'.format(param_group['lr']))

    f_log.close()
    f_trainloss.close()
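

# quantization() and pull_judge() are defined elsewhere in the repository and
# are not part of this excerpt.  The sketches below are minimal, hypothetical
# implementations inferred only from how they are called above (per-layer
# uniform quantization to `bit` bits, and a random choice of pulling workers);
# the actual implementations may differ.
import random


def quantization(g, bit):
    """Sketch: uniformly quantize each layer tensor in g to 2**bit levels.

    Returns the quantized per-layer tensors and the nominal compression ratio
    (fraction of the original 32-bit size).  Not the original code.
    """
    levels = 2 ** bit - 1
    g_quantized = []
    for g_layer in g:
        g_min, g_max = g_layer.min(), g_layer.max()
        scale = (g_max - g_min) / levels if levels > 0 else 1.0
        if float(scale) == 0.0:
            # Constant layer: nothing to quantize.
            g_quantized.append(g_layer.clone())
            continue
        q = torch.round((g_layer - g_min) / scale) * scale + g_min
        g_quantized.append(q)
    compression_ratio = bit / 32.0
    return g_quantized, compression_ratio


def pull_judge(workers_num, ratio):
    """Sketch: randomly pick round(ratio * workers_num) workers (1-indexed)
    to pull the global model this round.  Not the original code."""
    pull_num = max(1, int(round(ratio * workers_num)))
    return random.sample(range(1, workers_num + 1), pull_num)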
# Variant 2: error-compensated sparsification.  Each worker keeps a residual
# g_remain of the update the server already has, adds a compensation term
# built from the history tensors h, and uploads only the large changes
# selected by get_upload().  The server accumulates the averaged changes
# into global_g.
def run(workers, models, save_path, train_data_list, test_data, iterations_epoch):
    workers_num = len(workers)
    print('Model recved successfully!')

    optimizers_list = []
    for i in workers:
        if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
            optimizer = MySGD(models[i].parameters(), lr=0.1)
        else:
            optimizer = MySGD(models[i].parameters(), lr=0.01)
        optimizers_list.append(optimizer)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 500
    else:
        decay_period = 1000000

    print('Begin!')

    # Server-side estimate of the averaged update, one tensor per layer of the global model.
    global_g = [torch.zeros_like(param.data) for param in models[0].parameters()]

    # Store (train loss, energy, iterations).
    trainloss_file = './trainloss' + args.model + '_' + args.file_name + '_ec.txt'
    if os.path.isfile(trainloss_file):
        os.remove(trainloss_file)
    f_trainloss = open(trainloss_file, 'a')

    train_data_iter_list = []
    for i in workers:
        train_data_iter_list.append(iter(train_data_list[i - 1]))

    epoch_train_loss = 0.0
    global_clock = 0
    g_remain_list = []
    ratio = args.ratio
    threshold = 0.

    # Compensation history, one list of layer tensors per worker.
    h_last_list = []    # h_t
    h_remain_list = []  # h_{t-1}
    alpha = args.alpha
    beta = args.beta
    print(alpha, " and ", beta)

    for iteration in range(args.epochs * iterations_epoch):
        iteration_loss = 0.0
        g_list = []
        # Averaged sparsified change collected from the workers in this round.
        g_change_average = [torch.zeros_like(param.data) for param in models[0].parameters()]
        global_clock += 1

        for i in workers:
            try:
                data, target = next(train_data_iter_list[i - 1])
            except StopIteration:
                train_data_iter_list[i - 1] = iter(train_data_list[i - 1])
                data, target = next(train_data_iter_list[i - 1])
            data, target = Variable(data), Variable(target)

            optimizers_list[i - 1].zero_grad()
            output = models[i](data)
            loss = criterion(output, target)
            loss.backward()

            delta_ws = optimizers_list[i - 1].get_delta_w()
            g_list.append(delta_ws)
            iteration_loss += loss.data.item() / workers_num

            if global_clock == 1:
                # First round: initialize the residual and the compensation history,
                # and upload the full update (the change is the update itself).
                g_remain = [torch.zeros_like(g_layer) + g_layer for g_layer in delta_ws]
                g_remain_list.append(g_remain)
                h_remain = [torch.zeros_like(g_layer) for g_layer in delta_ws]
                h_remain_list.append(h_remain)
                h_last = [torch.zeros_like(g_layer) for g_layer in delta_ws]
                h_last_list.append(h_last)
                test_loss = str(0)  # placeholder until the first evaluation
                test_acc = str(0)   # placeholder until the first evaluation

                for g_change_layer_idx, g_change_layer in enumerate(g_change_average):
                    g_change_layer.data += delta_ws[g_change_layer_idx].data / workers_num
                sparsification_ratio = 1.0
            else:
                # Add the compensation term alpha * (h_t - h_{t-1}) to the new update.
                new_delta_ws = [torch.zeros_like(g_layer) + g_layer for g_layer in delta_ws]
                for idx, g_layer in enumerate(delta_ws):
                    new_delta_ws[idx] += alpha * (h_last_list[i - 1][idx] - h_remain_list[i - 1][idx])
                print(ratio)

                # Select the large changes to upload; the rest stays in the residual g_remain.
                g_remain, g_large_change, sparsification_ratio = get_upload(
                    g_remain_list[i - 1], new_delta_ws, ratio, args.isCompensate, threshold)
                g_remain_list[i - 1] = g_remain

                for g_change_layer_idx, g_change_layer in enumerate(g_change_average):
                    g_change_layer.data += g_large_change[g_change_layer_idx].data / workers_num

                # Update the compensation history h.
                h_last = [torch.zeros_like(g_layer) for g_layer in delta_ws]
                h_remain = h_last_list[i - 1]
                for idx, g_layer in enumerate(delta_ws):
                    h_last[idx] = h_remain[idx] * beta
                    if args.add == 1:
                        h_last[idx] += (delta_ws[idx] - g_remain[idx])
                    else:
                        h_last[idx] -= (delta_ws[idx] - g_remain[idx])
                h_remain_list[i - 1] = h_remain
                h_last_list[i - 1] = h_last

        # Synchronization: accumulate the averaged changes on the server, apply them
        # to the global model (models[0]) and broadcast the new weights to every worker.
        g_square_sum = 0.0  # used as the threshold for get_upload in the next round
        for p_idx, param in enumerate(models[0].parameters()):
            global_g[p_idx].data += g_change_average[p_idx].data
            param.data -= global_g[p_idx].data
            for w in workers:
                # Copy the global weights back to worker w (do not alias the global tensor).
                list(models[w].parameters())[p_idx].data = param.data.clone()
            g_square_sum += torch.sum(global_g[p_idx].data * global_g[p_idx].data)
        g_square_sum = torch.sqrt(g_square_sum)
        threshold = g_square_sum.data.item()

        epoch_train_loss += iteration_loss
        epoch = int(iteration / iterations_epoch)
        # print('Epoch {}, Loss:{}'.format(epoch, loss.data.item()))

        # Log every iteration; run the test at the end of each epoch.
        if (iteration + 1) % iterations_epoch == 0:
            test_loss, test_acc = test_model(0, models[0], test_data, criterion=criterion)
        f_trainloss.write(str(args.this_rank) + "\t" +
                          str(epoch_train_loss / float(iterations_epoch)) + "\t" +
                          str(iteration_loss) + "\t" +
                          str(0) + "\t" +
                          str(epoch) + "\t" +
                          str(0) + "\t" +
                          str(iteration) + "\t" +
                          str(sparsification_ratio) +  # sparsification ratio (last worker)
                          "\t" + str(global_clock) +   # global clock
                          "\t" + str(test_loss) +      # test_loss
                          "\t" + str(test_acc) +       # test_acc
                          '\n')
        f_trainloss.flush()
        if (iteration + 1) % iterations_epoch == 0:
            # Reset the accumulated train loss after it has been logged.
            epoch_train_loss = 0.0

        # Reduce the scaling factor (sparsification ratio) at the specified epochs.
        if (iteration + 1) % iterations_epoch == 0 and (epoch + 1) in [0, 1000]:
            ratio = ratio * 0.1
            print('--------------------------------')
            print(ratio)

        for i in workers:
            models[i].train()
        # Decay the learning rate once per decay_period epochs, at the epoch boundary.
        if (iteration + 1) % iterations_epoch == 0 and (epoch + 1) % decay_period == 0:
            for i in workers:
                for param_group in optimizers_list[i - 1].param_groups:
                    param_group['lr'] *= 0.1
                    print('LR Decreased! Now: {}'.format(param_group['lr']))

    f_trainloss.close()
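

# get_upload() is defined elsewhere in the repository.  The sketch below is one
# plausible reading of its contract, inferred from the call sites above: the
# worker keeps g_remain as the copy of its update that the server already has,
# compares it with the new update, and uploads only the entries whose change is
# large (the top-`ratio` fraction of each layer, or entries above `threshold`).
# The isCompensate flag is accepted but ignored in this sketch; the real
# implementation may behave differently.
def get_upload(g_remain, g_new, ratio, isCompensate, threshold):
    """Sketch: return (updated g_remain, sparse per-layer change to upload,
    achieved sparsification ratio).  Not the original code."""
    g_large_change = []
    uploaded, total = 0, 0
    for idx, g_layer in enumerate(g_new):
        change = g_layer - g_remain[idx]
        k = max(1, int(ratio * change.numel()))
        # Keep an entry if it is among the k largest absolute changes of the
        # layer, or if it exceeds the server-provided threshold.
        flat = change.abs().view(-1)
        topk_min = torch.topk(flat, k).values.min()
        mask = (change.abs() >= topk_min) | (change.abs() > threshold)
        sparse_change = change * mask.to(change.dtype)
        # The uploaded part moves from the residual to the server's copy.
        g_remain[idx] = g_remain[idx] + sparse_change
        g_large_change.append(sparse_change)
        uploaded += int(mask.sum().item())
        total += change.numel()
    return g_remain, g_large_change, uploaded / float(total)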
# Variant 3: sparsification baseline without the compensation history.  Each
# worker keeps a residual g_remain and uploads only the large changes selected
# by get_upload(); the server accumulates the averaged changes into global_g.
def run(workers, models, save_path, train_data_list, test_data, iterations_epoch):
    workers_num = len(workers)
    print('Model recved successfully!')

    optimizers_list = []
    for i in workers:
        if args.model in ['MnistCNN', 'AlexNet', 'ResNet18OnCifar10']:
            optimizer = MySGD(models[i].parameters(), lr=0.1)
        else:
            optimizer = MySGD(models[i].parameters(), lr=0.01)
        optimizers_list.append(optimizer)

    if args.model in ['MnistCNN', 'AlexNet']:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CrossEntropyLoss()

    if args.model in ['AlexNet', 'ResNet18OnCifar10']:
        decay_period = 500
    else:
        decay_period = 1000000

    print('Begin!')

    # Server-side estimate of the averaged update, one tensor per layer of the global model.
    global_g = [torch.zeros_like(param.data) for param in models[0].parameters()]

    # Store (train loss, energy, iterations).
    trainloss_file = './trainloss_oldsimu' + args.model + '_w15r1lr0.1.txt'
    if os.path.isfile(trainloss_file):
        os.remove(trainloss_file)
    f_trainloss = open(trainloss_file, 'a')

    train_data_iter_list = []
    for i in workers:
        train_data_iter_list.append(iter(train_data_list[i - 1]))

    epoch_train_loss = 0.0
    global_clock = 0
    g_remain_list = []
    ratio = args.ratio
    threshold = 0.
    print("Begin for")

    for iteration in range(args.epochs * iterations_epoch):
        iteration_loss = 0.0
        g_list = []
        # Averaged sparsified change collected from the workers in this round.
        g_change_average = [torch.zeros_like(param.data) for param in models[0].parameters()]
        global_clock += 1

        for i in workers:
            try:
                data, target = next(train_data_iter_list[i - 1])
            except StopIteration:
                train_data_iter_list[i - 1] = iter(train_data_list[i - 1])
                data, target = next(train_data_iter_list[i - 1])
            data, target = Variable(data), Variable(target)

            optimizers_list[i - 1].zero_grad()
            output = models[i](data)
            loss = criterion(output, target)
            loss.backward()

            delta_ws = optimizers_list[i - 1].get_delta_w()  # this worker's local update g
            g_list.append(delta_ws)
            iteration_loss += loss.data.item() / workers_num

            if global_clock == 1:
                # First round: initialize the residual and upload the full update
                # (the change is the update itself).
                g_remain = [torch.zeros_like(g_layer) + g_layer for g_layer in delta_ws]
                g_remain_list.append(g_remain)
                test_loss = str(0)  # placeholder until the first evaluation
                test_acc = str(0)   # placeholder until the first evaluation

                for g_change_layer_idx, g_change_layer in enumerate(g_change_average):
                    g_change_layer.data += delta_ws[g_change_layer_idx].data / workers_num
                sparsification_ratio = 1.0
            else:
                # Select the large changes to upload; the rest stays in the residual g_remain.
                g_remain, g_large_change, sparsification_ratio = get_upload(
                    g_remain_list[i - 1], delta_ws, ratio, args.isCompensate, threshold)
                g_remain_list[i - 1] = g_remain  # worker-side residual update

                for g_change_layer_idx, g_change_layer in enumerate(g_change_average):
                    g_change_layer.data += g_large_change[g_change_layer_idx].data / workers_num

        # Synchronization: accumulate the averaged changes on the server, apply them
        # to the global model (models[0]) and broadcast the new weights to every worker.
        g_square_sum = 0.0  # threshold computed on the dense server-side global_g
        for p_idx, param in enumerate(models[0].parameters()):
            global_g[p_idx].data += g_change_average[p_idx].data
            param.data -= global_g[p_idx].data
            for w in workers:
                # Copy the global weights back to worker w (do not alias the global tensor).
                list(models[w].parameters())[p_idx].data = param.data.clone()
            g_square_sum += torch.sum(global_g[p_idx].data * global_g[p_idx].data)
        g_square_sum = torch.sqrt(g_square_sum)
        threshold = g_square_sum.data.item()

        epoch_train_loss += iteration_loss
        epoch = int(iteration / iterations_epoch)
        print('Epoch {}, Loss:{}'.format(epoch, loss.data.item()))  # loss of the last worker

        # Log every iteration; run the test at the end of each epoch.
        if (iteration + 1) % iterations_epoch == 0:
            test_loss, test_acc = test_model(0, models[0], test_data, criterion=criterion)
        f_trainloss.write(str(args.this_rank) + "\t" +
                          str(epoch_train_loss / float(iterations_epoch)) + "\t" +
                          str(iteration_loss) + "\t" +
                          str(0) + "\t" +
                          str(epoch) + "\t" +
                          str(0) + "\t" +
                          str(iteration) + "\t" +
                          str(sparsification_ratio) +  # sparsification ratio (last worker)
                          "\t" + str(global_clock) +   # global clock
                          "\t" + str(test_loss) +      # test_loss
                          "\t" + str(test_acc) +       # test_acc
                          '\n')
        f_trainloss.flush()
        if (iteration + 1) % iterations_epoch == 0:
            # Reset the accumulated train loss after it has been logged.
            epoch_train_loss = 0.0

        # Reduce the scaling factor (sparsification ratio) at the specified epochs.
        if (iteration + 1) % iterations_epoch == 0 and (epoch + 1) in [0, 1000]:
            ratio = ratio * 0.1
            print('--------------------------------')
            print(ratio)

        for i in workers:
            models[i].train()
        # Decay the learning rate once per decay_period epochs, at the epoch boundary.
        if (iteration + 1) % iterations_epoch == 0 and (epoch + 1) % decay_period == 0:
            for i in workers:
                for param_group in optimizers_list[i - 1].param_groups:
                    param_group['lr'] *= 0.1
                    print('LR Decreased! Now: {}'.format(param_group['lr']))

    f_trainloss.close()
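

# MySGD is defined elsewhere in the repository.  A minimal sketch of what
# MySGD.get_delta_w() might look like, assuming MySGD extends torch.optim.SGD
# and get_delta_w() returns the per-layer update lr * grad without touching
# the parameters (all three run() variants above apply the updates manually);
# the real class may also handle momentum or weight decay.
from torch.optim import SGD


class MySGD(SGD):
    def get_delta_w(self):
        """Sketch: return the per-layer update lr * grad without applying it."""
        delta_ws = []
        for group in self.param_groups:
            lr = group['lr']
            for param in group['params']:
                if param.grad is None:
                    # Keep the layer indexing aligned with model.parameters().
                    delta_ws.append(torch.zeros_like(param.data))
                else:
                    delta_ws.append(lr * param.grad.data)
        return delta_ws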