import numpy as np

# Collect a batch of trajectories by stepping a pool of environments in
# lockstep under a simple scripted policy. Env and MultiEnvironment are
# assumed to be defined elsewhere in the project.
def get_trajectories(batch_size=32, timesteps=10, policy='random',
                     random_start=False, training=True):
    envs = MultiEnvironment([Env() for _ in range(batch_size)])
    t_states, t_rewards, t_dones, t_actions = [], [], [], []
    # Initial actions
    actions = np.random.randint(envs.action_space.n, size=(batch_size,))
    for t in range(timesteps):
        states, rewards, dones, _ = envs.step(actions)
        t_states.append(states)
        t_rewards.append(rewards)
        t_dones.append(dones)
        t_actions.append(actions)  # the actions that produced this transition
        # Pick the next step's actions
        if policy == 'random':
            actions = np.random.randint(envs.action_space.n, size=(batch_size,))
        elif policy == 'repeat':
            actions = [i % envs.action_space.n for i in range(batch_size)]
    # Reshape to (batch_size, timesteps, ...)
    states = np.swapaxes(t_states, 0, 1)
    rewards = np.swapaxes(t_rewards, 0, 1)
    dones = np.swapaxes(t_dones, 0, 1)
    actions = np.swapaxes(t_actions, 0, 1)
    return states, rewards, dones, actions
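This helper (and the variants below) leans on a MultiEnvironment wrapper that steps a batch of environments in lockstep. A minimal sketch of what such a wrapper might look like, assuming each wrapped env follows the Gym-style reset()/step(action) API; the project's real implementation may differ (for example, by running the envs in subprocesses):

# Hypothetical sketch of the assumed MultiEnvironment wrapper: it steps each
# wrapped Gym-style env with its own action and stacks the results.
import numpy as np

class MultiEnvironment:
    def __init__(self, envs):
        self.envs = envs
        self.action_space = envs[0].action_space  # all envs share one space

    def reset(self):
        return np.stack([env.reset() for env in self.envs])

    def step(self, actions):
        results = [env.step(a) for env, a in zip(self.envs, actions)]
        states, rewards, dones, infos = zip(*results)
        # Assumption: finished envs are reset so rollouts can keep going.
        states = [env.reset() if done else s
                  for env, s, done in zip(self.envs, states, dones)]
        return np.stack(states), np.array(rewards), np.array(dones), list(infos)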
def get_trajectories(batch_size=32, timesteps=10, policy=None):
    envs = MultiEnvironment([BoxesEnv() for _ in range(batch_size)])
    t_states, t_rewards, t_dones, t_actions = [], [], [], []
    for t in range(timesteps):
        actions = np.random.randint(envs.action_space.n, size=(batch_size,))
        states, rewards, dones, _ = envs.step(actions)
        t_states.append(states)
        t_rewards.append(rewards)
        t_dones.append(dones)
        t_actions.append(actions)
    # Reshape to (batch_size, timesteps, ...)
    states = np.swapaxes(t_states, 0, 1)
    rewards = np.swapaxes(t_rewards, 0, 1)
    dones = np.swapaxes(t_dones, 0, 1)
    actions = np.swapaxes(t_actions, 0, 1)
    return states, rewards, dones, actions
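A quick usage sketch; the trailing dimensions of states depend on BoxesEnv's observation shape:

# Collect 8 rollouts of 20 steps each and check the batch-major layout.
states, rewards, dones, actions = get_trajectories(batch_size=8, timesteps=20)
print(states.shape)   # (8, 20) + observation shape
print(rewards.shape)  # (8, 20)
print(dones.shape)    # (8, 20)
print(actions.shape)  # (8, 20)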
# Stateful scripted policy: usually repeat the previous action, sometimes
# mash 'shoot', occasionally resample uniformly at random. GameEnv and
# MultiEnvironment are assumed to be defined elsewhere in the project.
class HeuristicPolicy:
    def __init__(self):
        self.num_actions = envs.action_space.n
        self.prev_action = np.random.randint(self.num_actions)

    def step(self, state):
        flip = np.random.random()
        if flip > 0.90:
            action = 1  # mash the 'shoot' button
        elif flip > 0.25:
            action = self.prev_action
        else:
            action = np.random.randint(self.num_actions)
        self.prev_action = action
        return action

MAX_BATCH_SIZE = 32
envs = MultiEnvironment([GameEnv() for _ in range(MAX_BATCH_SIZE)])
states = envs.reset()

def get_trajectories(batch_size=32, timesteps=10, policy=None, random_start=False):
    global states
    PolicyFn = policy or HeuristicPolicy
    policies = [PolicyFn() for _ in range(batch_size)]
    t_states, t_rewards, t_dones, t_actions = [], [], [], []
    for t in range(timesteps):
        actions = [p.step(s) for p, s in zip(policies, states)]
        actions = actions[:batch_size]  # hack for fixed envs w/ variable batch size
        new_states, rewards, dones, _ = envs.step(actions)
        t_states.append(new_states)
        for i in range(batch_size):
            states[i] = new_states[i]  # carry the rolling state across calls
        t_rewards.append(rewards)
        t_dones.append(dones)
        t_actions.append(actions)
    # Reshape to (batch_size, timesteps, ...)
    traj_states = np.swapaxes(t_states, 0, 1)
    rewards = np.swapaxes(t_rewards, 0, 1)
    dones = np.swapaxes(t_dones, 0, 1)
    actions = np.swapaxes(t_actions, 0, 1)
    return traj_states, rewards, dones, actions
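Each environment gets its own stateful policy instance, so the repeat-previous-action behavior is tracked per env. A usage sketch (fresh names avoid clobbering the global states):

# Roll out the heuristic policy across all environments; any class with a
# step(state) -> action method can be swapped in via the policy argument.
traj_states, traj_rewards, traj_dones, traj_actions = get_trajectories(
    batch_size=MAX_BATCH_SIZE, timesteps=50)
# traj_states: (32, 50, ...); traj_rewards, traj_dones, traj_actions: (32, 50)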