import math

import numpy as np
import torch
import torch.nn.functional as F
from torch.autograd import Variable

# `normal` (Gaussian density helper) and `Buffer` (fixed-size history buffer)
# are project-local helpers; their import paths are not shown in this fragment.


class Agent(object):
    def __init__(self, model, env, args, state):
        self.model = model
        self.env = env
        self.state = state
        self.hx = None
        self.cx = None
        self.eps_len = 0
        self.args = args
        self.values = []
        self.log_probs = []
        self.rewards = []
        self.entropies = []
        self.done = True
        self.info = None
        self.reward = 0
        self.gpu_id = -1
        # history of the walker's hull x-position, used to detect stagnation
        self.position_history = Buffer(200)

    def action_train(self):
        if self.args.model == 'CONV':
            self.state = self.state.unsqueeze(0)
        value, mu, sigma, (self.hx, self.cx) = self.model(
            (Variable(self.state), (self.hx, self.cx)))
        mu = torch.clamp(mu, -1.0, 1.0)
        sigma = F.softplus(sigma) + 1e-5
        eps = torch.randn(mu.size())
        pi = np.array([math.pi])
        pi = torch.from_numpy(pi).float()
        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                eps = Variable(eps).cuda()
                pi = Variable(pi).cuda()
        else:
            eps = Variable(eps)
            pi = Variable(pi)

        # sample an action from N(mu, sigma) via the reparameterization trick
        # (sigma holds the variance, hence sigma.sqrt())
        action = (mu + sigma.sqrt() * eps).data
        act = Variable(action)
        prob = normal(act, mu, sigma, self.gpu_id, gpu=self.gpu_id >= 0)
        action = torch.clamp(action, -1.0, 1.0)
        # differential entropy of a Gaussian: 0.5 * (log(2 * pi * sigma) + 1)
        entropy = 0.5 * ((sigma * 2 * pi.expand_as(sigma)).log() + 1)
        self.entropies.append(entropy)
        log_prob = (prob + 1e-6).log()
        self.log_probs.append(log_prob)

        state, reward, self.done, self.info = self.env.step(
            action.cpu().numpy()[0])
        reward = max(min(float(reward), 1.0), -1.0)
        self.state = torch.from_numpy(state).float()
        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                self.state = self.state.cuda()
        self.eps_len += 1

        # update position history
        self.position_history.push(self.env.env.hull.position.x)
        # check for stagnation
        if self._is_stagnating():
            self.done = True
            self.reward = -100

        self.done = self.done or self.eps_len >= self.args.max_episode_length
        self.values.append(value)
        self.rewards.append(reward)
        return self

    def action_test(self):
        with torch.no_grad():
            if self.done:
                # reset the LSTM hidden state at the start of an episode
                if self.gpu_id >= 0:
                    with torch.cuda.device(self.gpu_id):
                        self.cx = Variable(torch.zeros(1, 128).cuda())
                        self.hx = Variable(torch.zeros(1, 128).cuda())
                else:
                    self.cx = Variable(torch.zeros(1, 128))
                    self.hx = Variable(torch.zeros(1, 128))
            else:
                self.cx = Variable(self.cx.data)
                self.hx = Variable(self.hx.data)
            if self.args.model == 'CONV':
                self.state = self.state.unsqueeze(0)
            value, mu, sigma, (self.hx, self.cx) = self.model(
                (Variable(self.state), (self.hx, self.cx)))
        # act deterministically at test time: use the clamped mean
        mu = torch.clamp(mu.data, -1.0, 1.0)
        action = mu.cpu().numpy()[0]
        state, self.reward, self.done, self.info = self.env.step(action)
        self.state = torch.from_numpy(state).float()
        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                self.state = self.state.cuda()
        self.eps_len += 1

        # update position history
        self.position_history.push(self.env.env.hull.position.x)
        # check for stagnation
        if self._is_stagnating():
            self.done = True
            self.reward = -100

        self.done = self.done or self.eps_len >= self.args.max_episode_length
        return self

    def _is_stagnating(self):
        # the agent is considered stuck if its hull has not moved over the
        # whole span of the position history window
        if self.position_history.is_full():
            pos_past = self.position_history.get(0)
            pos_now = self.position_history.get(-1)
            if pos_now - pos_past == 0:
                return True
        return False

    def clear_actions(self):
        self.values = []
        self.log_probs = []
        self.rewards = []
        self.entropies = []
        return self
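# Agent.action_train scores the sampled action with a Gaussian density through
# the imported `normal` helper. For reference, a minimal sketch of such a
# helper, assuming `sigma` holds the variance (the agent samples with
# `sigma.sqrt()`) and that `gpu_id`/`gpu` only control where the constant is
# allocated; the project's actual implementation may differ.
def normal(x, mu, sigma, gpu_id, gpu=False):
    pi = np.array([math.pi])
    pi = torch.from_numpy(pi).float()
    if gpu:
        with torch.cuda.device(gpu_id):
            pi = Variable(pi).cuda()
    else:
        pi = Variable(pi)
    # N(x; mu, sigma) = exp(-(x - mu)^2 / (2 * sigma)) / sqrt(2 * pi * sigma)
    a = (-1 * (x - mu).pow(2) / (2 * sigma)).exp()
    b = 1 / (2 * sigma * pi.expand_as(sigma)).sqrt()
    return a * b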
# `Buffer` and `assert_equal` are assumed to come from the project's modules
# and test utilities, respectively.
def test_set():
    buf = Buffer([0, 0, 0, 0])
    buf.set(1, 16)
    assert_equal(buf.get(1), 16)
def test_get():
    buf = Buffer([0, 16, 0])
    assert_equal(buf.get(1), 16)
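# The tests above and Agent.position_history exercise a small fixed-size
# buffer through push/get/set/is_full. A minimal sketch consistent with that
# usage; the dual constructor (capacity or initial list) is an assumption and
# the project's real Buffer may differ.
class Buffer(object):
    def __init__(self, size_or_items):
        if isinstance(size_or_items, int):
            self.capacity = size_or_items
            self.items = []
        else:
            self.items = list(size_or_items)
            self.capacity = len(self.items)

    def push(self, item):
        # drop the oldest entry once the buffer is full
        if len(self.items) >= self.capacity:
            self.items.pop(0)
        self.items.append(item)

    def get(self, index):
        # supports negative indices, e.g. get(-1) for the newest entry
        return self.items[index]

    def set(self, index, value):
        self.items[index] = value

    def is_full(self):
        return len(self.items) >= self.capacity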
    ep_ret += rew
    ep_len += 1

    if done or (t == local_steps_per_epoch - 1):
        # if not done:
        #     print("WARNING: trajectory cut off by epoch at %d steps." % ep_len)
        # bootstrap the return of a cut-off trajectory with the value estimate,
        # otherwise close it out with the final reward
        last_val = rew if done else v_t
        buffer.finish_path(last_val)
        if done:
            rewards.append(ep_ret)
        obs, rew, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0

agent.update(buffer.get())

# evaluation: run 10 rendered episodes with the trained policy
for i in range(10):
    obs, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    rewards = []
    while not (d or ep_len == 1000):
        act, _, _ = agent.get_action(obs)
        obs, r, d, _ = env.step(act[0])
        ep_len += 1
        ep_ret += r
        rewards.append(r)
        env.render()
    obs, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    print(np.mean(np.array(rewards)))
    print(rewards)
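# buffer.finish_path(last_val) above closes out a trajectory, bootstrapping
# with last_val when the rollout is cut off by the epoch boundary. A minimal
# sketch of what such a method typically computes (GAE-lambda advantages and
# discounted rewards-to-go); the buffer implementation is not shown in this
# fragment, so every attribute name below is an assumption.
import numpy as np


def discount_cumsum(x, discount):
    # y[t] = x[t] + discount * y[t + 1]
    out = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out


def finish_path(self, last_val=0):
    path_slice = slice(self.path_start_idx, self.ptr)
    rews = np.append(self.rew_buf[path_slice], last_val)
    vals = np.append(self.val_buf[path_slice], last_val)
    # GAE-lambda advantage estimates
    deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
    self.adv_buf[path_slice] = discount_cumsum(deltas, self.gamma * self.lam)
    # discounted rewards-to-go, used as value-function targets
    self.ret_buf[path_slice] = discount_cumsum(rews, self.gamma)[:-1]
    self.path_start_idx = self.ptr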