# Imports assumed by these excerpts; `get_rng` is a project-level helper
# that returns a numpy random Generator.
import numpy as np
import torch


def take_one_step(self, envs, add_to_replay=False):
    # Fetch the current state of each environment, resetting any that
    # haven't started yet.
    states = [
        e.last_state if hasattr(e, 'last_state') else e.reset()
        for e in envs
    ]
    tensor_states = self.tensor(states, torch.float32)
    qvals = self.training_model(tensor_states).detach().cpu().numpy()

    # Epsilon-greedy action selection: take the argmax action, but swap
    # in a uniformly random action with probability epsilon.
    num_states, num_actions = qvals.shape
    actions = np.argmax(qvals, axis=-1)
    random_actions = get_rng().integers(num_actions, size=num_states)
    use_random = get_rng().random(num_states) < self.epsilon
    actions = np.choose(use_random, [actions, random_actions])

    rewards = []
    dones = []
    for env, state, action in zip(envs, states, actions):
        next_state, reward, done, info = env.step(action)
        if done:
            next_state = env.reset()
        env.last_state = next_state
        if add_to_replay:
            self.replay_buffer.push(state, action, reward, done)
            self.num_steps += 1
        rewards.append(reward)
        dones.append(done)

    return states, actions, rewards, dones, qvals

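# Note on the np.choose trick above: where `use_random` is True (index 1)
# it selects the random action, otherwise (index 0) the greedy one. An
# equivalent and perhaps more familiar formulation would be:
#
#     actions = np.where(use_random, random_actions, actions)
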
def train_batch(self, batch):
    idx = np.arange(len(batch.states))
    for _ in range(self.epochs_per_batch):
        get_rng().shuffle(idx)
        # Note that the reshape requires the batch size to be evenly
        # divisible by num_minibatches.
        for k in idx.reshape(self.num_minibatches, -1):
            entropy, loss = self.calculate_loss(
                batch.states[k], batch.actions[k], batch.action_prob[k],
                batch.values[k], batch.returns[k], batch.advantages[k])
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

def take_one_step(self, envs):
    obs, agent_ids = self.obs_for_envs(envs)
    obs_tensor = self.tensor(obs, torch.float32)
    qvals = self.training_model(obs_tensor).detach().cpu().numpy()

    # Epsilon-greedy action selection, as above.
    num_states, num_actions = qvals.shape
    actions = np.argmax(qvals, axis=-1)
    random_actions = get_rng().integers(num_actions, size=num_states)
    use_random = get_rng().random(num_states) < self.epsilon
    actions = np.choose(use_random, [actions, random_actions])

    next_obs, rewards, done = self.act_on_envs(envs, actions)
    return obs, actions, rewards, done, next_obs, agent_ids

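# For reference, a minimal sketch of what `obs_for_envs` and `act_on_envs`
# presumably do on the same runner class (these helpers are not shown in
# this excerpt; the `last_obs` caching attribute and the use of id(env) as
# an agent identifier are assumptions for illustration):
def obs_for_envs(self, envs):
    # Return the cached observation for each env, resetting any env that
    # has no observation yet, along with a stable per-agent identifier.
    obs = []
    agent_ids = []
    for env in envs:
        if not hasattr(env, 'last_obs'):
            env.last_obs = env.reset()
        obs.append(env.last_obs)
        agent_ids.append(id(env))  # any stable key would do
    return obs, agent_ids


def act_on_envs(self, envs, actions):
    # Step each env, resetting on episode end and caching the next obs.
    next_obs, rewards, dones = [], [], []
    for env, action in zip(envs, actions):
        obs, reward, done, info = env.step(action)
        if done:
            obs = env.reset()
        env.last_obs = obs
        next_obs.append(obs)
        rewards.append(reward)
        dones.append(done)
    return next_obs, rewards, dones
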
def take_one_step(self, envs):
    states = [
        e.last_state if hasattr(e, 'last_state') else e.reset()
        for e in envs
    ]
    entries = []
    tensor_states = torch.tensor(
        states, device=self.compute_device, dtype=torch.float32)
    values, policies = self.model(tensor_states)
    for i, (policy, env) in enumerate(zip(policies, envs)):
        # Sample an action from the policy distribution.
        policy = softmax(policy.detach().cpu().numpy())
        action = get_rng().choice(len(policy), p=policy)
        next_state, reward, done, info = env.step(action)
        if done:
            next_state = env.reset()
        env.last_state = next_state
        entries.append({
            'action': action,
            'action_prob': policy[action],
            'reward': reward,
            'state': states[i],
            'done': done,
        })
    return entries

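# `softmax` is not defined in this excerpt. A numerically stable numpy
# implementation for the 1-D case (equivalent to scipy.special.softmax)
# might look like:
def softmax(x):
    # Subtract the max before exponentiating to avoid overflow;
    # this doesn't change the result.
    e = np.exp(x - np.max(x))
    return e / e.sum()
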
def train_batch(self, batch):
    num_samples = len(batch.obs)
    idx = np.arange(num_samples)
    # Interior split points that divide `idx` into num_minibatches
    # nearly equal chunks; unlike the reshape above, this works even
    # when the batch size doesn't divide evenly.
    splits = np.linspace(
        0, num_samples, self.num_minibatches + 1, dtype=int)[1:-1]
    for _ in range(self.epochs_per_batch):
        get_rng().shuffle(idx)
        for k in np.split(idx, splits):
            entropy, loss = self.calculate_loss(
                batch.obs[k], batch.actions[k], batch.action_prob[k],
                batch.values[k], batch.returns[k], batch.advantages[k])
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

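# A quick check of the splitting logic, assuming num_minibatches = 4 and
# 10 samples: linspace(0, 10, 5, dtype=int) is [0, 2, 5, 7, 10], so the
# interior points are [2, 5, 7] and np.split yields 4 nearly equal chunks:
#
#     >>> np.split(np.arange(10), np.linspace(0, 10, 5, dtype=int)[1:-1])
#     [array([0, 1]), array([2, 3, 4]), array([5, 6]), array([7, 8, 9])]
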
def sample(self, batch_size):
    # Make sure enough entries exist that every sampled transition has a
    # full n-step tail behind it.
    assert self.idx >= batch_size + self.tail_length
    idx = self.idx % self.capacity
    # Sample distinct offsets behind the write pointer; negative indices
    # wrap around the circular buffer.
    i1 = idx - 1 - get_rng().choice(len(self), batch_size, replace=False)
    i0 = i1 - self.tail_length
    return (
        list(self.states[i0]),  # don't want dtype=object in output
        self.actions[i0],
        self.rewards[i0],
        list(self.states[i1]),  # states n steps later
        self.done[i0],  # whether or not the episode ended before n steps
    )

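# For context, a minimal sketch of the circular buffer that `sample`
# appears to assume. The field names (`states`, `actions`, `rewards`,
# `done`, `idx`, `capacity`, `tail_length`) come from the method above;
# the constructor shapes and push logic are assumptions for illustration,
# consistent with the push(state, action, reward, done) call seen earlier.
class ReplayBufferSketch:
    def __init__(self, capacity, tail_length):
        self.capacity = capacity
        self.tail_length = tail_length
        self.idx = 0  # total number of entries ever pushed
        self.states = np.empty(capacity, dtype=object)
        self.actions = np.empty(capacity, dtype=np.int64)
        self.rewards = np.empty(capacity, dtype=np.float32)
        self.done = np.empty(capacity, dtype=bool)

    def __len__(self):
        return min(self.idx, self.capacity)

    def push(self, state, action, reward, done):
        # Overwrite the oldest entry once the buffer is full.
        k = self.idx % self.capacity
        self.states[k] = state
        self.actions[k] = action
        self.rewards[k] = reward
        self.done[k] = done
        self.idx += 1
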
def take_one_step(self, envs):
    obs, agent_ids = self.obs_for_envs(envs)
    tensor_obs = self.tensor(obs, torch.float32)
    values, policies = self.model(tensor_obs)
    values = values.detach().cpu().numpy()
    policies = policies.detach().cpu().numpy()
    actions = [
        get_rng().choice(len(policy), p=policy)
        for policy in policies
    ]
    next_obs, rewards, done = self.act_on_envs(envs, actions)
    return obs, actions, rewards, done, next_obs, agent_ids, policies, values

def take_one_step(self, envs):
    states = [
        e.last_obs if hasattr(e, 'last_obs') else e.reset()
        for e in envs
    ]
    tensor_states = self.tensor(states, torch.float32)
    values, policies = self.model(tensor_states)
    values = values.detach().cpu().numpy()
    policies = policies.detach().cpu().numpy()

    actions = []
    rewards = []
    dones = []
    for policy, env in zip(policies, envs):
        # Sample an action from the policy and advance the environment.
        action = get_rng().choice(len(policy), p=policy)
        obs, reward, done, info = env.step(action)
        if done:
            obs = env.reset()
        env.last_obs = obs
        actions.append(action)
        rewards.append(reward)
        dones.append(done)

    return states, actions, rewards, dones, policies, values

def sample(self, batch_size):
    sub_buffer = self.buffer[:self.idx]
    data = get_rng().choice(sub_buffer, batch_size, replace=False)
    return zip(*data)

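# A matching sketch of the simple (non n-step) buffer assumed by this
# second `sample`, where each entry is a whole transition tuple stored in
# an object array. `buffer` and `idx` come from the method above; the
# push logic and the transition layout are assumptions for illustration.
class SimpleBufferSketch:
    def __init__(self, capacity):
        self.capacity = capacity
        self.idx = 0
        self.buffer = np.empty(capacity, dtype=object)

    def push(self, *transition):
        # Store e.g. (state, action, reward, next_state, done) as a single
        # entry; wrap around once the buffer is full. Note that slicing
        # with self.idx in `sample` still works after wrapping, since the
        # slice is clamped to the buffer's length.
        self.buffer[self.idx % self.capacity] = transition
        self.idx += 1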