Example #1
    def take_one_step(self, envs, add_to_replay=False):
        # Resume from each environment's cached state, resetting any env that
        # hasn't been stepped yet.
        states = [
            e.last_state if hasattr(e, 'last_state') else e.reset()
            for e in envs
        ]
        tensor_states = self.tensor(states, torch.float32)
        qvals = self.training_model(tensor_states).detach().cpu().numpy()

        num_states, num_actions = qvals.shape
        # Epsilon-greedy exploration: start from the greedy (argmax) actions,
        # then replace each with a uniformly random action with probability epsilon.
        actions = np.argmax(qvals, axis=-1)
        random_actions = get_rng().integers(num_actions, size=num_states)
        use_random = get_rng().random(num_states) < self.epsilon
        actions = np.choose(use_random, [actions, random_actions])
        rewards = []
        dones = []

        for env, state, action in zip(envs, states, actions):
            next_state, reward, done, info = env.step(action)
            if done:
                # Reset immediately so the cached state is always a live observation.
                next_state = env.reset()
            env.last_state = next_state
            if add_to_replay:
                self.replay_buffer.push(state, action, reward, done)
                self.num_steps += 1
            rewards.append(reward)
            dones.append(done)

        return states, actions, rewards, dones, qvals
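For reference, here is a minimal, self-contained sketch of the epsilon-greedy selection used above. It assumes a plain `numpy.random.default_rng` generator in place of `get_rng()`, and uses `np.where`, which is equivalent to the `np.choose` call in the example.

    import numpy as np

    rng = np.random.default_rng(0)  # stand-in for get_rng() in the example above

    def epsilon_greedy(qvals, epsilon):
        # qvals has shape (num_states, num_actions); returns one action per state.
        num_states, num_actions = qvals.shape
        greedy = np.argmax(qvals, axis=-1)
        random_actions = rng.integers(num_actions, size=num_states)
        use_random = rng.random(num_states) < epsilon
        # Equivalent to np.choose(use_random, [greedy, random_actions]).
        return np.where(use_random, random_actions, greedy)

    qvals = rng.normal(size=(4, 3))
    print(epsilon_greedy(qvals, epsilon=0.1))

With epsilon=0 this is pure exploitation; with epsilon=1 every action is uniformly random.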
Example #2
    def train_batch(self, batch):
        idx = np.arange(len(batch.states))

        for _ in range(self.epochs_per_batch):
            # Reshuffle the sample indices each epoch and carve them into
            # equal-sized minibatches (requires the batch size to be divisible
            # by num_minibatches).
            get_rng().shuffle(idx)
            for k in idx.reshape(self.num_minibatches, -1):
                entropy, loss = self.calculate_loss(
                    batch.states[k], batch.actions[k], batch.action_prob[k],
                    batch.values[k], batch.returns[k], batch.advantages[k])
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
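A standalone sketch of the index-shuffling scheme used by `train_batch` above; the sizes here are made up, and a plain NumPy generator stands in for `get_rng()`. Note that `reshape(num_minibatches, -1)` requires the number of samples to divide evenly.

    import numpy as np

    rng = np.random.default_rng(0)

    num_samples, num_minibatches = 12, 3    # 12 is divisible by 3
    idx = np.arange(num_samples)

    rng.shuffle(idx)                        # new ordering each epoch
    for k in idx.reshape(num_minibatches, -1):
        print(k)                            # index set for one optimizer step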
Example #3
    def take_one_step(self, envs):
        obs, agent_ids = self.obs_for_envs(envs)

        obs_tensor = self.tensor(obs, torch.float32)
        qvals = self.training_model(obs_tensor).detach().cpu().numpy()

        num_states, num_actions = qvals.shape
        # Epsilon-greedy exploration over the batched Q-values, as in Example #1.
        actions = np.argmax(qvals, axis=-1)
        random_actions = get_rng().integers(num_actions, size=num_states)
        use_random = get_rng().random(num_states) < self.epsilon
        actions = np.choose(use_random, [actions, random_actions])

        next_obs, rewards, done = self.act_on_envs(envs, actions)

        return obs, actions, rewards, done, next_obs, agent_ids
Example #4
    def take_one_step(self, envs):
        states = [
            e.last_state if hasattr(e, 'last_state') else e.reset()
            for e in envs
        ]
        entries = []
        tensor_states = torch.tensor(states,
                                     device=self.compute_device,
                                     dtype=torch.float32)
        values, policies = self.model(tensor_states)

        for i, (policy, env) in enumerate(zip(policies, envs)):
            # Sample an action from the softmax distribution over the policy logits.
            policy = softmax(policy.detach().cpu().numpy())
            action = get_rng().choice(len(policy), p=policy)
            next_state, reward, done, info = env.step(action)
            if done:
                next_state = env.reset()
            env.last_state = next_state

            entries.append({
                'action': action,
                'action_prob': policy[action],
                'reward': reward,
                'state': states[i],
                'done': done
            })

        return entries
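A minimal sketch of sampling an action from a policy head for a single state. It assumes `softmax` is the usual exponential normalization (the example above presumably imports it from elsewhere, e.g. scipy.special.softmax); the logits below are made up.

    import numpy as np

    rng = np.random.default_rng(0)  # stand-in for get_rng()

    def softmax(logits):
        # Subtract the max before exponentiating for numerical stability.
        z = np.exp(logits - np.max(logits))
        return z / z.sum()

    logits = np.array([2.0, 0.5, -1.0])         # raw policy outputs for one state
    policy = softmax(logits)                    # probabilities summing to 1
    action = rng.choice(len(policy), p=policy)  # index of the sampled action
    print(policy, action)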
Example #5
    def train_batch(self, batch):
        num_samples = len(batch.obs)
        idx = np.arange(num_samples)
        # Interior split points used by np.split below to carve the shuffled
        # indices into roughly equal minibatches (no divisibility requirement).
        splits = np.linspace(0,
                             num_samples,
                             self.num_minibatches + 2,
                             dtype=int)[1:-1]

        for _ in range(self.epochs_per_batch):
            get_rng().shuffle(idx)
            for k in np.split(idx, splits):
                entropy, loss = self.calculate_loss(
                    batch.obs[k], batch.actions[k], batch.action_prob[k],
                    batch.values[k], batch.returns[k], batch.advantages[k])
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
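The `np.split` variant above avoids the divisibility requirement of the `reshape` version in Example #2. A toy illustration with made-up sizes (note that `np.split` returns one more chunk than there are split points):

    import numpy as np

    num_samples, num_minibatches = 10, 3
    idx = np.arange(num_samples)
    splits = np.linspace(0, num_samples, num_minibatches + 2, dtype=int)[1:-1]
    print(splits)                 # interior split points, here [2 5 7]
    for k in np.split(idx, splits):
        print(k)                  # chunks of nearly equal size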
Example #6
    def sample(self, batch_size):
        # Need enough stored transitions to draw a full batch with a valid
        # tail for each sample.
        assert self.idx >= batch_size + self.tail_length

        idx = self.idx % self.capacity
        # Choose batch_size distinct indices counting back from the most recent
        # write; i0 is the paired index tail_length steps earlier.
        i1 = idx - 1 - get_rng().choice(len(self), batch_size, replace=False)
        i0 = i1 - self.tail_length

        return (
            list(self.states[i0]),  # don't want dtype=object in output
            self.actions[i0],
            self.rewards[i0],
            list(self.states[i1]),  # states n steps later
            self.done[i0],  # whether or not the episode ended before n steps
        )
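A simplified illustration of the tail indexing above, with illustrative names and without the wrap-around handling: random "later" indices are drawn counting back from the most recent write, and the paired "earlier" indices sit `tail_length` slots before them.

    import numpy as np

    rng = np.random.default_rng(0)

    capacity, tail_length, write_idx = 16, 3, 12    # 12 transitions written so far
    states = np.arange(capacity)                    # toy states: their write step

    i1 = (write_idx - 1) - rng.choice(write_idx - tail_length, size=4, replace=False)
    i0 = i1 - tail_length
    print(np.stack([states[i0], states[i1]], axis=1))  # each pair is 3 steps apart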
Example #7
    def take_one_step(self, envs):
        obs, agent_ids = self.obs_for_envs(envs)

        tensor_obs = self.tensor(obs, torch.float32)
        values, policies = self.model(tensor_obs)
        values = values.detach().cpu().numpy()
        policies = policies.detach().cpu().numpy()
        # Sample one action per environment from its policy distribution.
        actions = [
            get_rng().choice(len(policy), p=policy) for policy in policies
        ]

        next_obs, rewards, done = self.act_on_envs(envs, actions)

        return obs, actions, rewards, done, next_obs, agent_ids, policies, values
Example #8
    def take_one_step(self, envs):
        # Resume from each environment's cached observation, resetting any env
        # that hasn't been stepped yet.
        states = [
            e.last_obs if hasattr(e, 'last_obs') else e.reset() for e in envs
        ]
        tensor_states = self.tensor(states, torch.float32)
        values, policies = self.model(tensor_states)
        values = values.detach().cpu().numpy()
        policies = policies.detach().cpu().numpy()
        actions = []
        rewards = []
        dones = []
        for policy, env in zip(policies, envs):
            # Sample an action from the policy distribution and step the env.
            action = get_rng().choice(len(policy), p=policy)
            obs, reward, done, info = env.step(action)
            if done:
                obs = env.reset()
            env.last_obs = obs
            actions.append(action)
            rewards.append(reward)
            dones.append(done)
        return states, actions, rewards, dones, policies, values
Example #9
    def sample(self, batch_size):
        # Only the filled portion of the buffer is eligible for sampling.
        sub_buffer = self.buffer[:self.idx]
        data = get_rng().choice(sub_buffer, batch_size, replace=False)
        # Transpose the sampled transition tuples into per-field sequences
        # (e.g. states, actions, rewards, ...).
        return zip(*data)
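Finally, a self-contained sketch of a flat replay buffer whose `sample` follows the same pattern as Example #9. The class name and `push` signature are hypothetical, and `np.random.default_rng` stands in for `get_rng()`.

    import numpy as np

    rng = np.random.default_rng(0)  # stand-in for get_rng()

    class TinyReplayBuffer:
        # Sketch only: stores whole transition tuples in one flat object array.
        def __init__(self, capacity):
            self.capacity = capacity
            self.buffer = np.empty(capacity, dtype=object)
            self.idx = 0  # total number of pushes; writes wrap around

        def push(self, state, action, reward, done):
            self.buffer[self.idx % self.capacity] = (state, action, reward, done)
            self.idx += 1

        def sample(self, batch_size):
            # Only sample from slots that have actually been written.
            filled = self.buffer[:min(self.idx, self.capacity)]
            data = rng.choice(filled, batch_size, replace=False)
            return zip(*data)

    buf = TinyReplayBuffer(capacity=100)
    for t in range(10):
        buf.push(state=t, action=t % 4, reward=1.0, done=False)
    states, actions, rewards, dones = buf.sample(4)
    print(states, actions)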