예제 #1
0
 def evaluate(self,
              state: np.ndarray,
              hidden: np.ndarray = None,
              done=None):
     """Run a gradient-free forward pass and return NumPy results.

     Args:
         state: Input passed through ``totorch`` onto ``self.device``.
         hidden: Optional iterable of recurrent-state arrays; each element is
             converted with ``totorch_many``. ``None`` is forwarded unchanged.
         done: Forwarded to ``self.forward`` as-is.

     Returns:
         ``(policy, value, hidden)`` where policy and value come from
         ``tonumpy`` and hidden from ``tonumpy_many`` applied to the hidden
         state returned by ``self.forward``.
     """
     state_t = totorch(state, self.device)
     if hidden is None:
         hidden_t = None
     else:
         hidden_t = totorch_many(*hidden, device=self.device)

     with torch.no_grad():
         policy, value, next_hidden = self.forward(state_t, hidden_t, done)

     policy_np = tonumpy(policy)
     value_np = tonumpy(value)
     return policy_np, value_np, tonumpy_many(*next_hidden)
예제 #2
0
 def eval_state(self, state, loc):
     """Evaluate ``self.model`` on ``state`` at the given locations.

     Args:
         state: Input converted with ``totorch`` onto ``self.device``.
         loc: Iterable of ``(x, y)`` pairs; it is unzipped into separate x
             and y index tensors that are passed to the model.

     Returns:
         The model output converted with ``tonumpy``.
     """
     xs, ys = zip(*loc)
     with torch.no_grad():
         x_idx = torch.tensor(xs).to(self.device)
         y_idx = torch.tensor(ys).to(self.device)
         state_t = totorch(state, self.device)
         q_values = self.model(state_t, x_idx, y_idx)
     return tonumpy(q_values)
예제 #3
0
def train(global_model, model, env, nsteps, num_episodes, ID, *,
          lr=1e-3, gamma=0.9, lambda_=0.95, log_interval=1):
    """Run one worker's act/update loop until ``num_episodes`` episodes end.

    The worker acts in ``env`` by sampling from the policy that ``model``
    outputs. Every ``nsteps`` steps, or earlier when an episode terminates,
    the collected rollout is turned into lambda returns and passed to
    ``update_params`` together with ``model``, ``global_model`` and an
    RMSprop optimiser built over ``global_model``'s parameters.

    Args:
        global_model: Model whose parameters the optimiser is created for.
        model: Model used for acting; called with a batched state and
            expected to return ``(policy, value)``.
        env: Environment exposing ``reset()`` and
            ``step(action) -> (next_state, reward, done, info)``.
        nsteps: Maximum number of environment steps per rollout.
        num_episodes: Number of completed episodes after which to stop.
        ID: Worker identifier, only used in the log line.
        lr: Learning rate for the RMSprop optimiser.
        gamma: ``gamma`` argument forwarded to ``lambda_return``.
        lambda_: ``lambda_`` argument forwarded to ``lambda_return``.
        log_interval: Print a summary every this many finished episodes;
            a falsy value disables logging.
    """
    opt = torch.optim.RMSprop(global_model.parameters(), lr=lr)
    episode = 0
    episode_steps = 0
    episode_score = 0
    T = 0  # total environment steps taken by this worker
    state = env.reset()
    start = time.time()
    while episode < num_episodes:
        rollout = []
        for t in range(nsteps):
            with torch.no_grad():
                # state[None] adds a leading batch axis of size 1.
                policy, value = model(totorch(state[None], device='cpu'))
                policy, value = tonumpy(policy), tonumpy(value)
            action = np.random.choice(policy.shape[1], p=policy[0])
            next_state, reward, done, _ = env.step(action)
            episode_score += reward
            rollout.append((state, action, reward, value, done))
            state = next_state

            T += 1
            episode_steps += 1

            if done or t == nsteps - 1:
                states, actions, rewards, values, dones = stack_many(*zip(*rollout))
                with torch.no_grad():
                    # Value of the state following the rollout, used to
                    # bootstrap the returns.
                    _, last_values = model(totorch(next_state[None], device='cpu'))
                    last_values = last_values.cpu().numpy()
                    R = lambda_return(rewards, values, last_values, dones,
                                      gamma=gamma, lambda_=lambda_, clip=False)

                update_params(model, global_model, opt, states, actions, R)

                if done:
                    episode += 1
                    state = env.reset()
                    if log_interval and episode % log_interval == 0:
                        time_taken = time.time() - start
                        # A coarse clock can report zero elapsed time for a
                        # very short episode; avoid dividing by zero.
                        fps = episode_steps / time_taken if time_taken > 0 else float('inf')
                        print(f'worker {ID}, total worker steps {T:,} local episode {episode}, episode score {episode_score} episode steps {episode_steps}, time taken {time_taken:,.1f}s, fps {fps:.2f}')
                    episode_steps = 0
                    episode_score = 0
                    start = time.time()
                    # Start a fresh rollout for the new episode.
                    break
예제 #4
0
 def intrinsic_reward(self, next_state: np.ndarray, state_mean: np.ndarray,
                      state_std):
     """Compute the intrinsic reward without tracking gradients.

     All three inputs are converted with ``totorch_many`` onto
     ``self.device`` and passed to ``self._intr_reward``.

     Args:
         next_state: First argument forwarded to ``self._intr_reward``.
         state_mean: Second argument forwarded to ``self._intr_reward``.
         state_std: Third argument forwarded to ``self._intr_reward``.

     Returns:
         The result of ``self._intr_reward`` converted with ``tonumpy``.
     """
     tensors = totorch_many(next_state, state_mean, state_std,
                            device=self.device)
     next_state_t, mean_t, std_t = tensors
     with torch.no_grad():
         reward = self._intr_reward(next_state_t, mean_t, std_t)
     return tonumpy(reward)
예제 #5
0
 def get_pixel_control(self, state:np.ndarray):
     """Return the ``self.Qaux`` head output for ``state`` as NumPy.

     The state is converted with ``totorch`` onto ``self.device``, encoded
     by ``self.policy.model`` and then fed to ``self.Qaux``; no gradients
     are recorded.
     """
     with torch.no_grad():
         state_t = totorch(state, self.device)
         encoded = self.policy.model(state_t)
         aux_q = self.Qaux(encoded)
     return tonumpy(aux_q)
예제 #6
0
 def evaluate(self, state):
     """Forward ``state`` without gradients and return NumPy outputs.

     Returns:
         ``(policy, value_extr, value_intr)``: the three outputs of
         ``self.forward``, each converted with ``tonumpy``.
     """
     with torch.no_grad():
         state_t = totorch(state, self.device)
         outputs = self.forward(state_t)
     policy, value_extr, value_intr = outputs
     return tonumpy(policy), tonumpy(value_extr), tonumpy(value_intr)
예제 #7
0
 def evaluate(self, state: np.ndarray):
     """Query the separate policy and value networks without gradients.

     The state is converted once per network, onto that network's own
     device. The second output of the policy network is discarded.

     Returns:
         ``(policy, value)``, each converted with ``tonumpy``.
     """
     with torch.no_grad():
         policy_input = totorch(state, self.policy.device)
         policy_out, _ = self.policy.forward(policy_input)
         value_input = totorch(state, self.value.device)
         value_out = self.value.forward(value_input)
     return tonumpy(policy_out), tonumpy(value_out)
예제 #8
0
 def get_value(self, state: np.ndarray):
     """Return the value network's output for ``state`` as NumPy.

     The state is converted with ``totorch`` onto ``self.value.device`` and
     passed to ``self.value.forward`` with gradient tracking disabled.
     """
     with torch.no_grad():
         state_t = totorch(state, self.value.device)
         value_out = self.value.forward(state_t)
     return tonumpy(value_out)
예제 #9
0
 def get_policy(self, state: np.ndarray):
     """Return both outputs of the policy network for ``state`` as NumPy.

     The state is converted with ``totorch`` onto ``self.policy.device`` and
     passed to ``self.policy.forward`` with gradient tracking disabled.

     Returns:
         ``(policy, Adv)``, each converted with ``tonumpy``.
     """
     with torch.no_grad():
         state_t = totorch(state, self.policy.device)
         outputs = self.policy.forward(state_t)
     policy_out, adv_out = outputs
     return tonumpy(policy_out), tonumpy(adv_out)
예제 #10
0
 def evaluate(self, state: np.ndarray):
     """Forward ``state`` without gradients and return NumPy outputs.

     Returns:
         ``(policy, value)``: the two outputs of ``self.forward``, each
         converted with ``tonumpy``.
     """
     state_t = totorch(state, self.device)
     with torch.no_grad():
         outputs = self.forward(state_t)
     policy_out, value_out = outputs
     return tonumpy(policy_out), tonumpy(value_out)
예제 #11
0
 def get_pixel_control(self, state:np.ndarray, action_reward, hidden):
     """Return the ``self.Qaux`` head output for a recurrent policy as NumPy.

     Args:
         state: Converted with ``totorch`` onto ``self.device``.
         action_reward: Converted with ``totorch`` onto ``self.device``.
         hidden: Iterable of recurrent-state arrays, converted element-wise
             with ``totorch_many``.

     Returns:
         ``self.Qaux`` applied to the first output of
         ``self.policy.lstm_forward`` (called with ``done=None``), converted
         with ``tonumpy``.
     """
     state_t = totorch(state, self.device)
     action_reward_t = totorch(action_reward, self.device)
     hidden_t = totorch_many(*hidden, device=self.device)
     with torch.no_grad():
         lstm_out, _ = self.policy.lstm_forward(
             state_t, action_reward_t, hidden_t, done=None)
         aux_q = self.Qaux(lstm_out)
     return tonumpy(aux_q)