from torch.autograd import Variable
from torch.nn import MSELoss


def compute_q_loss(global_state, state, reward, next_global_state, next_state):
    # `args` and `training_agents` are assumed to be defined at module level.
    # This function uses the legacy (pre-0.4) PyTorch Variable/volatile API.
    if args.cuda:
        global_state = global_state.cuda()
        state = state.cuda()
        reward = reward.cuda()
        next_global_state = next_global_state.cuda()
        next_state = next_state.cuda()

    global_state = Variable(global_state, requires_grad=True)
    state = Variable(state, requires_grad=True)
    # The target-network inputs never need gradients.
    next_global_state = Variable(next_global_state, volatile=True)
    next_state = Variable(next_state, volatile=True)

    current_q_values, _ = training_agents[0].act(global_state, state)
    max_next_q_values, _ = training_agents[0].target_act(
        next_global_state, next_state)
    max_next_q_values = max_next_q_values.max(1)[0]

    # average the per-agent rewards and bootstrap with the target network
    expected_q_values = Variable(
        reward.mean(dim=1)) + args.gamma * max_next_q_values

    loss = MSELoss()(current_q_values, expected_q_values)
    loss.backward()
    return loss.cpu().data[0]
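# Hypothetical usage sketch, not from the original source: `make_batch` and the
# optimizer wiring are assumptions used only for illustration. It shows that
# compute_q_loss accumulates gradients via loss.backward(), so the caller
# zeroes gradients before the call and applies the optimizer step after it.
def train_step_example(optimizer):
    optimizer.zero_grad()
    global_state, state, reward, next_global_state, next_state = make_batch()
    q_loss = compute_q_loss(global_state, state, reward,
                            next_global_state, next_state)
    optimizer.step()
    return q_loss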
import torch
from torch.nn import MSELoss


def fit(self, train_x, train_y, val_x, val_y, bin_size, lr, batch_size,
        with_gap, earlystop, verbose):
    self.batch_size = batch_size
    optimizer = torch.optim.Adam(self.model.parameters(), lr=lr, amsgrad=True)
    loss_list = []
    for i in range(self.maxepochs):
        optimizer.zero_grad()
        x, y = self.loader.get_batch()
        x = x.to(self.device)
        y = y.to(self.device)
        ypred = self.model(x)
        if ypred.dim() == 2:
            ypred = ypred.squeeze(1)
        assert ypred.size() == y.size()
        loss = MSELoss(reduction='mean')(ypred, y)
        loss.backward()
        optimizer.step()

        # Track the validation loss when early stopping, otherwise the training
        # loss; list.append mutates in place, so loss_list must not be rebound.
        if earlystop:
            loss_list.append(self.evaluate(val_x, val_y))
        elif train_x.size()[0] == batch_size:
            loss_list.append(loss.item())

        # Stop once the tracked loss has effectively plateaued.
        if len(loss_list) > 5 \
                and abs(loss_list[-2] / loss_list[-1] - 1) < 0.0001:
            break

    if earlystop:
        return None, loss_list[-1]
    else:
        return loss_list[-1], self.evaluate(val_x, val_y)
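# Hypothetical usage sketch, not from the original source: the `Regressor`
# class name, the data tensors, and the hyperparameter values are assumptions
# that only illustrate the calling convention of fit() above. With
# earlystop=False the method returns (last training loss, validation loss);
# with earlystop=True it returns (None, last validation loss).
# model = Regressor(...)
# train_loss, val_loss = model.fit(train_x, train_y, val_x, val_y,
#                                  bin_size=32, lr=1e-3, batch_size=64,
#                                  with_gap=False, earlystop=False, verbose=True)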