# local SGD step for the current worker; leave the data loop after tau local updates
loss.backward()
optimizers[worker].step()
wcounter += 1
if wcounter == tau:
    break

# after the local updates, average one disjoint pair of workers this round
avg_index = (r % (N_w // 2)) * 2
ps_functions.average_model(nets[avg_index], nets[avg_index + 1])
ps_functions.synch_weight(nets[avg_index + 1], nets[avg_index])
for n in range(N_w):
    if n != avg_index and n != avg_index + 1:
        ps_functions.average_model2(nets[n], nets[avg_index])

# periodic evaluation on the average of all worker models
if (r * tau) % 120 == 0:
    ps_functions.initialize_zero(ps_model)
    for n in range(N_w):
        ps_functions.weight_accumulate(nets[n], ps_model, N_w)
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = ps_model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print('Accuracy of the network on the 10000 test images: %d %%'
          % (100 * correct / total))
    results[0][res_ind] = 100 * correct / total
    res_ind += 1
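# --- The ps_functions helpers used throughout these snippets are defined
# elsewhere in the repository and are not shown here. A minimal sketch of what
# the four most common ones presumably do, with names and call signatures taken
# from the call sites above; the bodies are our assumption, not the
# repository's actual implementation:
import torch

def initialize_zero(model):
    # Zero every parameter so the model can serve as an accumulator.
    with torch.no_grad():
        for p in model.parameters():
            p.zero_()

def weight_accumulate(src_model, dst_model, scale):
    # dst += src / scale; summing over N_w workers with scale = N_w leaves
    # dst holding the uniform average of the worker models.
    with torch.no_grad():
        for p_src, p_dst in zip(src_model.parameters(), dst_model.parameters()):
            p_dst.add_(p_src, alpha=1.0 / scale)

def synch_weight(dst_model, src_model):
    # Hard copy: dst <- src, parameter by parameter.
    with torch.no_grad():
        for p_dst, p_src in zip(dst_model.parameters(), src_model.parameters()):
            p_dst.copy_(p_src)

def average_model(model_a, model_b):
    # In-place pairwise average written into the first argument:
    # a <- (a + b) / 2.
    with torch.no_grad():
        for p_a, p_b in zip(model_a.parameters(), model_b.parameters()):
            p_a.mul_(0.5).add_(p_b, alpha=0.5)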
ps_functions.gradient_average(nets[c][n], ps_model, lr)
i = 0
c += 1
if c == num_cl:
    c = 0
    per += 1
iter_ind += 1

# every `period` full passes over the clusters, average the cluster heads at
# the parameter server and broadcast the result back to every worker
if per == period:
    ps_functions.ps_param_zero(ps_model)
    for cl in range(num_cl):
        # ps_functions.gradient_accumulate(old_nets[cl][0], nets[cl][0], ps_model, is_avg, num_cl)
        ps_functions.weight_accumulate(nets[cl][0], ps_model, num_cl)
    # ps_functions.sparse_grad(top_k_ps, ps_model, device)
    for cl in range(num_cl):
        for n in range(num_w_per_cluster):
            # old_nets[cl][n] = ps_functions.gradient_average(nets[cl][n], ps_model, scale)
            ps_functions.weight_broadcast(nets[cl][n], ps_model)
    per = 0
    i = 0
    c = 0

##### Change lr for next iteration during the warm-up phase #####################
ps_functions.warmup_lr(optimizers, num_cl, num_w_per_cluster, lr, iter_ind, max_ind)
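# warmup_lr is likewise external. A minimal sketch, assuming a linear ramp up
# to the target lr over the first max_ind iterations and assuming optimizers
# is nested as optimizers[cl][n] like nets; both the schedule and the indexing
# are inferred from the call above, not confirmed by the source:
def warmup_lr(optimizers, num_cl, num_w_per_cluster, lr, iter_ind, max_ind):
    if iter_ind >= max_ind:
        return  # warm-up finished; leave the configured lr untouched
    warm_lr = lr * (iter_ind + 1) / max_ind  # linear ramp from lr/max_ind to lr
    for cl in range(num_cl):
        for n in range(num_w_per_cluster):
            for group in optimizers[cl][n].param_groups:
                group['lr'] = warm_lr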
# one local mini-batch per worker, stashing the updated model in the reserve copy
inputs, labels = data
inputs, labels = inputs.to(device), labels.to(device)
optimizers[worker].zero_grad()
preds = nets[worker](inputs)
loss = criterions[worker](preds, labels)
loss.backward()
ps_functions.synch_weight(reserveNets[worker], nets[worker])
break

# each worker rebuilds its model as the average of 5 reserve models taken at
# stride 2 around the ring, then applies its own optimizer step
for worker in range(N_w):
    ps_functions.initialize_zero(nets[worker])
    index = worker - 4
    if index < 0:
        index += N_w
    for i in range(5):
        ps_functions.weight_accumulate(
            reserveNets[int((index + (i * 2)) % N_w)], nets[worker], 5)
    optimizers[worker].step()

# w_index sends its model to other workers #
# other workers upon receiving the model take the average
for n in range(N_w):
    if n != w_index:
        ps_functions.average_model(nets[n], nets[w_index])

if (r % 100) == 0 and r != 0:
    ## reset of extraModel
    ps_functions.initialize_zero(avg_model)  # model
    ## take average
    for worker in range(N_w):
        ps_functions.weight_accumulate(nets[worker], avg_model, N_w)
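# The index arithmetic above gives each worker a stride-2 neighbourhood centred
# on itself: worker-4, worker-2, worker, worker+2, worker+4 (mod N_w). A quick
# standalone check, with N_w = 10 chosen only for illustration:
N_w_demo = 10
for worker_demo in range(N_w_demo):
    start = (worker_demo - 4) % N_w_demo
    neighbours = [(start + i * 2) % N_w_demo for i in range(5)]
    print(worker_demo, neighbours)  # e.g. 0 -> [6, 8, 0, 2, 4]
# Every worker mixes only with same-parity peers, so for even N_w this gossip
# step alone keeps the even and odd rings separate.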
# broadcast worker w_index's model and momentum; every other worker averages toward it
for n in range(N_w):
    if n != w_index:
        ps_functions.average_model(nets[n], nets[w_index])
# averaging the momentums
for n in range(N_w):
    if n != w_index:
        ps_functions.average_momentum(optimizers[n], optimizers[w_index])

if (r % 100) == 0 and r != 0:
    ## reset of extraModel
    ps_functions.initialize_zero(avg_model)  # model
    ps_functions.momentum_zero(avg_Optimizer)
    ## take average
    for worker in range(N_w):
        ps_functions.weight_accumulate(nets[worker], avg_model, N_w)  # model
        ps_functions.momentum_accumulate(avg_Optimizer, optimizers[worker], N_w)
    ## assign all worker models
    for worker in range(N_w):
        ps_functions.synch_weight(nets[worker], avg_model)  # model
        ps_functions.momentum_Avg(avg_Optimizer, optimizers[worker])  # momentum

# periodic evaluation on the average of all worker models
if r % 100 == 0:
    ps_functions.initialize_zero(ps_model)
    for n in range(N_w):
        ps_functions.weight_accumulate(nets[n], ps_model, N_w)
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = ps_model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
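# The momentum helpers mirror the weight helpers but act on optimizer state.
# A sketch of average_momentum alone, assuming plain torch.optim.SGD with
# momentum, whose per-parameter buffer lives in
# optimizer.state[p]['momentum_buffer'] (the optimizer type is our assumption):
import torch

def average_momentum(opt_a, opt_b):
    # a's momentum buffer <- (a + b) / 2, the analogue of average_model.
    with torch.no_grad():
        for group_a, group_b in zip(opt_a.param_groups, opt_b.param_groups):
            for p_a, p_b in zip(group_a['params'], group_b['params']):
                buf_a = opt_a.state[p_a].get('momentum_buffer')
                buf_b = opt_b.state[p_b].get('momentum_buffer')
                if buf_a is not None and buf_b is not None:
                    buf_a.mul_(0.5).add_(buf_b, alpha=0.5)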
# forward/backward for one mini-batch, then leave the data loop
loss = criterions[worker](preds, labels)
loss.backward()
break

# 4: model differences accumulated since the previous round
for worker in range(N_w):
    ps_functions.weight_dif(netsCurrent[worker], netsOLD[worker], netsDif[worker])

for worker in range(N_w):
    ps_functions.synch_weight(nets[worker], netsAvg[worker])  # 5
    totalRand = 0
    rand = abs(np.random.normal(1, 0, N_n))
    normalizationFactor = sum(rand) / N_n
    # mix in each neighbour's difference with a (randomly scaled) weight
    for i in range(N_n):
        neighbor = int(connectionMatrix[worker][i])
        constant = (N_n + 1) * normalizationFactor / rand[i]
        ps_functions.weight_accumulate(netsDif[neighbor], nets[worker], constant)  # PWdif & 6
    ps_functions.weight_accumulate(netsDif[worker], nets[worker], N_n + 1)
    ps_functions.synch_weight(netsAvg[worker], nets[worker])  # 7
    optimizers[worker].step()  # 8
    ps_functions.synch_weight(netsOLD[worker], netsCurrent[worker])  # 9
    ps_functions.synch_weight(netsCurrent[worker], nets[worker])  # 10

# periodic evaluation on the average of all worker models
if r % 100 == 0:
    ps_functions.initialize_zero(ps_model)
    for n in range(N_w):
        ps_functions.weight_accumulate(nets[n], ps_model, N_w)
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = ps_model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
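# Note that np.random.normal(1, 0, N_n) has standard deviation 0, so rand is
# exactly a vector of ones, normalizationFactor is 1, and every constant
# collapses to N_n + 1: each of the N_n + 1 accumulated differences (the
# worker's own plus N_n neighbours) enters with uniform weight 1 / (N_n + 1).
# A standalone check, with N_n = 4 chosen only for illustration:
import numpy as np

N_n_demo = 4
rand_demo = np.abs(np.random.normal(1, 0, N_n_demo))   # -> [1. 1. 1. 1.]
norm_demo = rand_demo.sum() / N_n_demo                 # -> 1.0
print((N_n_demo + 1) * norm_demo / rand_demo)          # -> [5. 5. 5. 5.]
# A nonzero standard deviation would instead give each neighbour a randomly
# perturbed mixing weight, normalized so the total contribution stays fixed.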
optimizers[worker].zero_grad()
preds = nets[worker](inputs)
loss = criterions[worker](preds, labels)
loss.backward()
break

# 4
for worker in range(N_w):
    ps_functions.weight_dif(netsCurrent[worker], netsOLD[worker], netsDif[worker])

for worker in range(N_w):
    ps_functions.synch_weight(nets[worker], netsAvg[worker])  # 5
    # fixed stride-2 ring neighbourhood instead of the randomized weights above
    index = worker - N_n
    if index < 0:
        index += N_w
    for i in range(N_n + 1):
        ps_functions.weight_accumulate(
            netsDif[int((index + (i * 2)) % N_w)], nets[worker], N_n + 1)  # PWdif & 6
    ps_functions.synch_weight(netsAvg[worker], nets[worker])  # 7
    optimizers[worker].step()  # 8
    ps_functions.synch_weight(netsOLD[worker], netsCurrent[worker])  # 9
    ps_functions.synch_weight(netsCurrent[worker], nets[worker])  # 10

# w_index sends its model to other workers #
# other workers upon receiving the model take the average
if r % 100 == 0:
    ps_functions.initialize_zero(ps_model)
    for n in range(N_w):
        ps_functions.weight_accumulate(nets[n], ps_model, N_w)
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = ps_model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
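# The evaluation block repeated in each snippet could be factored into a single
# helper; a sketch (the name evaluate_accuracy is ours, not the repository's):
import torch

def evaluate_accuracy(model, testloader, device):
    # Returns test-set accuracy in percent, mirroring the inline loops above.
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in testloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return 100 * correct / total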