def _test_and_save(self, episode, test_game):
    # Run one evaluation episode with the model in eval mode, then checkpoint.
    self.training = False
    self.model.train(self.training)
    test_eps_variable = None
    total_reward = 0
    with torch.no_grad():
        test_lstm_hidden_vb = (
            Variable(torch.zeros(1, self.hidden_dim).type(self.dtype)),
            Variable(torch.zeros(1, self.hidden_dim).type(self.dtype)))
        current_experience = test_game.reset()
        test_game.visual()
        eps_count = 0
        test_actions = []
        test_done = False
        while eps_count < self.max_episode and not test_done:
            test_p_vb, test_v_vb, test_lstm_hidden_vb = self.model(
                preprocess_state(current_experience.state1, self.dtype,
                                 is_volatile=False),
                test_lstm_hidden_vb, test_eps_variable)
            # Act greedily w.r.t. the policy during evaluation.
            test_action = test_p_vb.max(1)[1].data[0]
            test_actions.append(test_action)
            # Repeat the chosen action over the skipped frames.
            for _ in range(self.skip_frame):
                current_experience = test_game.step(test_action)
                test_game.visual()
                total_reward += current_experience.reward
                if current_experience.terminal1:
                    test_done = True
                    break
            eps_count += self.skip_frame
    self._save_model(episode, total_reward)
def normal(x, mu, sigma):
    # Gaussian density N(x; mu, sigma), where `sigma` is the variance
    # (not the standard deviation).
    pi = np.array([math.pi])
    pi = torch.from_numpy(pi).float()
    pi = Variable(pi)
    a = (-1 * (x - mu).pow(2) / (2 * sigma)).exp()
    b = 1 / (2 * sigma * pi.expand_as(sigma)).sqrt()
    return a * b
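# Quick sanity check for `normal` (a sketch): since `sigma` is treated as the
# variance, the standard-normal density at x = 0 should come out to
# 1 / sqrt(2 * pi) ~= 0.3989.
x = Variable(torch.zeros(1, 1))
mu = Variable(torch.zeros(1, 1))
sigma = Variable(torch.ones(1, 1))  # variance = 1
print(normal(x, mu, sigma).data[0, 0])  # ~0.3989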
def forward(self, x, lstm_hidden_vb=None, eps=None):
    # Convolutional encoder over the RGB frame.
    x = x.view(x.size(0), 3, self.input_dims[1], self.input_dims[1])
    x = F.leaky_relu(self.down1(F.leaky_relu(self.conv1(x))))
    x = F.leaky_relu(self.down2(F.leaky_relu(self.conv2(x))))
    x = F.leaky_relu(self.down3(F.leaky_relu(self.conv3(x))))
    x = F.leaky_relu(self.down4(F.leaky_relu(self.conv4(x))))
    x = x.view(x.size(0), -1)
    # Recurrent core.
    hx, cx = self.lstm(x, lstm_hidden_vb)
    x = self.linear_encoder(hx)
    # Latent sample with a fixed log-variance of -self.sig.
    mu = self.linear_mu(hx)
    sigma = Variable(torch.zeros(mu.size(0), mu.size(1)) - self.sig)
    z = self.sampler(mu, sigma, eps=eps)
    # The encoder features are perturbed with the same fixed log-variance.
    sigma_det = Variable(torch.zeros(mu.size(0), mu.size(1)) - self.sig)
    x = self.sampler(x, sigma_det, eps=eps)
    x = F.leaky_relu(torch.cat([x, z], dim=1))
    # Policy and value heads.
    p = self.policy_5(x)
    p = self.policy_6(p)
    v = self.value_5(x)
    return p, v, (hx, cx)
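# The `sampler` implementation is not shown here. A minimal sketch of a
# reparameterised sampler consistent with the call sites above, assuming
# `sigma` holds a log-variance (the constant `-self.sig` above suggests a
# fixed log-variance); the repo's actual sampler may differ:
def _reparameterised_sample(mu, sigma, eps=None):
    # z = mu + std * eps, where std = exp(sigma / 2) and eps ~ N(0, I).
    # Passing `eps` explicitly lets the caller reuse one noise draw
    # across several calls, as forward() does above.
    if eps is None:
        eps = Variable(torch.randn(mu.size()))
    return mu + (0.5 * sigma).exp() * eps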
def _reset_lstm_hidden_vb_episode(self, training=True):  # seq_len, batch_size, hidden_dim
    # Zero the LSTM hidden/cell states and the latent at episode start.
    # During evaluation the tensors are created under no_grad so no graph
    # is built; otherwise the two branches are identical.
    def _zero_states():
        self.lstm_hidden_vb = (
            Variable(torch.zeros(self.batch_size, self.hidden_dim).type(self.dtype)),
            Variable(torch.zeros(self.batch_size, self.hidden_dim).type(self.dtype)))
        self.latent = Variable(torch.zeros(self.batch_size, self.hidden_dim))
        self.eps_variable = None

    if not training:
        with torch.no_grad():
            _zero_states()
    else:
        _zero_states()
def preprocess_state(state, dtype, is_volatile=False):
    # Wrap a numpy state (or a list of them) into Variables with a
    # leading batch dimension; is_volatile disables graph construction.
    def _to_variable(s):
        return Variable(torch.from_numpy(s).unsqueeze(0).type(dtype))

    if isinstance(state, list):
        if is_volatile:
            with torch.no_grad():
                state_vb = [_to_variable(s) for s in state]
        else:
            state_vb = [_to_variable(s) for s in state]
    else:
        if is_volatile:
            with torch.no_grad():
                state_vb = _to_variable(state)
        else:
            state_vb = _to_variable(state)
    return state_vb
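# Minimal usage sketch for `preprocess_state`, assuming a flat float32
# observation (hypothetical shape -- the real shape comes from the env wrapper):
state = np.random.rand(3 * 80 * 80).astype(np.float32)
state_vb = preprocess_state(state, torch.FloatTensor, is_volatile=True)
print(state_vb.size())  # torch.Size([1, 19200]) -- batch dim added by unsqueeze(0)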
def forward(self, x, z_prev, lstm_hidden_vb=None, eps=None):
    # Convolutional encoder over the RGB frame.
    x = x.view(x.size(0), 3, self.input_dims[1], self.input_dims[1])
    x = F.leaky_relu(self.down1(F.leaky_relu(self.conv1(x))), inplace=True)
    x = F.leaky_relu(self.down2(F.leaky_relu(self.conv2(x))), inplace=True)
    x = F.leaky_relu(self.down3(F.leaky_relu(self.conv3(x))), inplace=True)
    x = F.leaky_relu(self.down4(F.leaky_relu(self.conv4(x))), inplace=True)
    x = x.view(x.size(0), -1)
    # Recurrent core.
    hx, cx = self.lstm(x, lstm_hidden_vb)
    # Prior over the latent, conditioned on the previous latent z_prev.
    if self.crelu:
        z_prev = self.crelu_z(z_prev)
    else:
        z_prev = F.leaky_relu(z_prev, inplace=True)
    mu_prior = self.prior_mu(z_prev)
    sigma_prior = Variable(
        torch.zeros(mu_prior.size(0), mu_prior.size(1)) - self.sig)
    # Posterior parameters from the LSTM state.
    x = self.linear_encoder(hx)
    mu = self.linear_mu(hx)
    sigma = self.linear_sigma(hx)
    self.x = Variable(x.data)  # cache detached encoder features
    z = self.sampler(mu, sigma, eps=eps)
    if self.crelu:
        x = self.crelu_x(torch.cat([x, z], dim=1))
    else:
        x = F.leaky_relu(torch.cat([x, z], dim=1), inplace=True)
    # Policy and value heads.
    p = self.policy_5(x)
    p = self.policy_6(p)
    v = self.value_5(x)
    return p, v, z, (hx, cx), (mu, sigma), (mu_prior, sigma_prior)
def forward(self, input):
    # Noisy linear layer: perturb weights and biases with learned sigmas
    # scaled by sampled epsilon noise.
    return F.linear(
        input,
        self.weight + self.sigma_weight * Variable(self.epsilon_weight),
        self.bias + self.sigma_bias * Variable(self.epsilon_bias))
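# The layer above matches a NoisyNet-style linear layer (Fortunato et al.,
# 2017): the epsilon buffers must be resampled between forward passes. A
# sketch of such a resampling method, assuming independent Gaussian noise per
# parameter and `in_features`/`out_features` attributes (hypothetical names;
# the repo may use factorised noise instead):
def sample_noise(self):
    # Draw fresh N(0, 1) noise for the weight and bias perturbations.
    self.epsilon_weight = torch.randn(self.out_features, self.in_features)
    self.epsilon_bias = torch.randn(self.out_features)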
def _backward(self, sT_vb):
    self.optimizer.zero_grad()
    # preparation: bootstrap value for the final state of each rollout,
    # zeroed where the rollout ended on a terminal state
    _, valueT_vb, _ = self.model(sT_vb, self.lstm_hidden_vb)
    for i in range(self.batch_size):
        if self.A3C_Experiences[i].terminal1[-1]:
            valueT_vb.data[i] = 0
    valueT_vb = Variable(valueT_vb.data)
    rollout_steps = [
        len(self.A3C_Experiences[i].reward) for i in range(self.batch_size)
    ]
    policy_vb = [
        self.A3C_Experiences[i].policy_vb for i in range(self.batch_size)
    ]
    action_batch_vb = [
        self.A3C_Experiences[i].action for i in range(self.batch_size)
    ]
    # Per-step log-probabilities and policy entropies.
    policy_log_vb = [[
        torch.log(policy_vb[i][j]) for j in range(len(policy_vb[i]))
    ] for i in range(len(policy_vb))]
    entropy_vb = [[
        -(policy_log_vb[i][j] * policy_vb[i][j]).sum(1)
        for j in range(len(policy_vb[i]))
    ] for i in range(len(policy_vb))]
    # Keep only the log-probability of the action actually taken.
    policy_log_vb = [[
        policy_log_vb[i][j].gather(
            1, Variable(action_batch_vb[i][j]).unsqueeze(0).detach())
        for j in range(len(action_batch_vb[i]))
    ] for i in range(len(action_batch_vb))]
    for i in range(self.batch_size):
        self.A3C_Experiences[i].value0_vb.append(
            Variable(valueT_vb.data[i]))
    gae_ts = torch.zeros(self.batch_size, 1)
    if self.gpu >= 0:
        gae_ts = gae_ts.cuda()
    # compute loss: n-step returns with GAE, walking each rollout backwards
    policy_loss_vb = [0. for _ in range(self.batch_size)]
    value_loss_vb = [0. for _ in range(self.batch_size)]
    loss_model_vb = 0
    for j in range(self.batch_size):
        for i in reversed(range(rollout_steps[j])):
            valueT_vb[j] = (self.gamma * valueT_vb[j]
                            + self.A3C_Experiences[j].reward[i])
            advantage_vb = (valueT_vb[j]
                            - self.A3C_Experiences[j].value0_vb[i])
            value_loss_vb[j] = value_loss_vb[j] + 0.5 * advantage_vb.pow(2)
            # One-step TD error feeding the GAE recursion.
            tderr_ts = (self.A3C_Experiences[j].reward[i]
                        + self.gamma * self.A3C_Experiences[j].value0_vb[i + 1].data
                        - self.A3C_Experiences[j].value0_vb[i].data)
            gae_ts[j] = gae_ts[j] * self.tau * self.gamma + tderr_ts
            policy_loss_vb[j] = policy_loss_vb[j] - (
                policy_log_vb[j][i] * Variable(gae_ts[j])
                + self.beta * entropy_vb[j][i])
        loss_model_vb = loss_model_vb + (
            policy_loss_vb[j] + self.lam * value_loss_vb[j]) / rollout_steps[j]
    self.model.zero_grad()
    loss_model_vb.backward()
    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_grad)
    p_loss_avg = 0
    v_loss_avg = 0
    loss_avg = loss_model_vb.data.cpu().numpy()
    for i in range(self.batch_size):
        p_loss_avg += policy_loss_vb[i].data.cpu().numpy() / self.batch_size
        v_loss_avg += value_loss_vb[i].data.cpu().numpy() / self.batch_size
    # log training stats
    self.p_loss_avg += p_loss_avg
    self.v_loss_avg += v_loss_avg
    self.loss_avg += loss_avg
    self.loss_counter += 1
    self.logger.warning("Reporting @ Step: " + str(self.train_step) +
                        " | Elapsed Time: " +
                        str(time.time() - self.start_time))
    self.logger.warning(
        "Iteration: {}; current p_loss: {}; average p_loss: {}".format(
            self.train_step, p_loss_avg,
            self.p_loss_avg / self.loss_counter))
    self.logger.warning(
        "Iteration: {}; current v_loss: {}; average v_loss: {}".format(
            self.train_step, v_loss_avg,
            self.v_loss_avg / self.loss_counter))
    self.logger.warning(
        "Iteration: {}; current loss : {}; average loss : {}".format(
            self.train_step, loss_avg,
            self.loss_avg / self.loss_counter))
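# The inner loop above implements Generalized Advantage Estimation: walking
# the rollout backwards, it maintains gae_t = delta_t + gamma * tau * gae,
# where delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) is the one-step TD error.
# A standalone sketch of the same recursion on plain Python floats, handy for
# checking the bookkeeping:
def _gae_reference(rewards, values, gamma=0.99, tau=1.0):
    # `values` has len(rewards) + 1 entries: V(s_0)..V(s_T), with the
    # bootstrap value (0 on terminal states) in the last slot.
    advantages, gae_t = [], 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae_t = delta + gamma * tau * gae_t
        advantages.insert(0, gae_t)
    return advantages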
def _reset_lstm_hidden_vb_rollout(self):
    # Detach the hidden state and latent from the previous rollout's graph
    # by rewrapping the underlying tensors in fresh Variables.
    self.lstm_hidden_vb = (Variable(self.lstm_hidden_vb[0].data),
                           Variable(self.lstm_hidden_vb[1].data))
    self.latent = Variable(self.latent.data)
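# Rewrapping `.data` truncates backprop-through-time at rollout boundaries.
# In PyTorch >= 0.4 the same idiom is written with `.detach()` -- a sketch,
# assuming the same (h, c) tuple layout:
def _reset_lstm_hidden_vb_rollout_modern(self):
    self.lstm_hidden_vb = (self.lstm_hidden_vb[0].detach(),
                           self.lstm_hidden_vb[1].detach())
    self.latent = self.latent.detach()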