def validate_sync(self, render=False):
    episode_scores = []
    env = self.val_envs
    for episode in range(self.num_val_episodes // self.num_envs):
        states = env.reset()
        episode_score = []
        # start each episode with action 0 and reward 0
        zeros = np.zeros((len(env)), dtype=np.int32)
        prev_actrew = concat_action_reward(zeros, zeros, self.action_size + 1)
        prev_hidden = self.model.get_initial_hidden(len(env))
        for t in range(self.val_steps):
            policies, values, hidden = self.model.evaluate(states[None], prev_actrew, prev_hidden)
            actions = fastsample(policies)
            next_states, rewards, dones, infos = env.step(actions)
            states = next_states
            # carry the recurrent state and action-reward embedding to the next
            # step, resetting the hidden state of any env whose episode ended
            prev_hidden = self.model.mask_hidden(hidden, dones)
            prev_actrew = concat_action_reward(actions, rewards, self.action_size + 1)
            episode_score.append(rewards * (1 - dones))
            if render:
                with self.lock:
                    env.render()
            if dones.sum() == self.num_envs or t == self.val_steps - 1:
                tot_reward = np.sum(np.stack(episode_score), axis=0)
                episode_scores.append(tot_reward)
                break
    return np.mean(episode_scores)

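# `concat_action_reward` is not defined in this file; from its call sites it is
# assumed to build the action-reward input fed to the recurrent model: a one-hot
# encoding of each env's last action with the last scalar reward appended,
# giving a vector of size action_size + 1 per environment. A minimal sketch
# under that assumption (`concat_action_reward_sketch` is a hypothetical name):
import numpy as np

def concat_action_reward_sketch(actions, rewards, size):
    # actions: [num_envs] int action indices; rewards: [num_envs] floats
    out = np.zeros((len(actions), size), dtype=np.float32)
    out[np.arange(len(actions)), actions] = 1.0  # one-hot action in the first action_size slots
    out[:, -1] = rewards  # scalar reward in the final slot
    return out
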
def rollout(self):
    rollout = []
    for t in range(self.nsteps):
        policies, values_extr, values_intr = self.model.evaluate(self.states)
        actions = fastsample(policies)
        next_states, extr_rewards, dones, infos = self.env.step(actions)
        # for convolutional inputs [num_envs, channels, height, width],
        # keep only the most recent frame for the intrinsic-reward model
        next_states__ = next_states[:, -1:] if len(next_states.shape) == 4 else next_states
        intr_rewards = self.model.intrinsic_reward(next_states__, self.state_mean, self.state_std)
        rollout.append((self.states, next_states__, actions, extr_rewards, intr_rewards,
                        values_extr, values_intr, policies, dones))
        self.states = next_states
    states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones = stack_many(*zip(*rollout))
    # bootstrap values for the state reached at the end of the rollout
    last_policy, last_values_extr, last_values_intr = self.model.evaluate(self.states)
    return (states, next_states, actions, extr_rewards, intr_rewards, values_extr,
            values_intr, last_values_extr, last_values_intr, policies, dones)

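# `stack_many` is assumed to be a small helper that, in the idiom
# stack_many(*zip(*rollout)), receives one sequence per rollout field (each of
# length nsteps) and stacks each into an array with time as the leading axis,
# i.e. k sequences in, k arrays of shape [nsteps, num_envs, ...] out. A
# plausible sketch (`stack_many_sketch` is a hypothetical name):
def stack_many_sketch(*seqs):
    return tuple(np.stack(seq) for seq in seqs)
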
def validate_sync(self, render):
    episode_scores = []
    env = self.val_envs
    for episode in range(self.num_val_episodes // len(env)):
        states = env.reset()
        episode_score = []
        prev_hidden = self.model.get_initial_hidden(len(self.val_envs))
        for t in range(self.val_steps):
            policies, values, hidden = self.model.evaluate(states[None], prev_hidden)
            actions = fastsample(policies)
            next_states, rewards, dones, infos = env.step(actions)
            states = next_states
            # carry the recurrent state to the next step, resetting it for
            # any env whose episode ended
            prev_hidden = self.model.mask_hidden(hidden, dones)
            episode_score.append(rewards * (1 - dones))
            if render:
                with self.lock:
                    env.render()
            if dones.sum() == self.num_envs or t == self.val_steps - 1:
                tot_reward = np.sum(np.stack(episode_score), axis=0)
                episode_scores.append(tot_reward)
                break
    return np.mean(episode_scores)

def _validate_async(self, env, num_ep, max_steps, render=False):
    for episode in range(num_ep):
        state = env.reset()
        episode_score = []
        hidden = self.model.get_initial_hidden(1)
        for t in range(max_steps):
            policy, value, hidden = self.model.evaluate(state[None, None], hidden)
            action = int(fastsample(policy))
            next_state, reward, done, info = env.step(action)
            state = next_state
            episode_score.append(reward)
            if render:
                with self.lock:
                    env.render()
            if done or t == max_steps - 1:
                tot_reward = np.sum(episode_score)
                with self.lock:
                    self.validate_rewards.append(tot_reward)
                break
    if render:
        with self.lock:
            env.close()

def _train_onestep(self):
    states = self.env.reset()
    num_steps = self.total_steps // self.num_envs
    start = time.time()
    for t in range(1, num_steps + 1):
        policies, values = self.model.evaluate(states)
        actions = fastsample(policies)
        next_states, rewards, dones, infos = self.env.step(actions)
        # one-step TD target: bootstrap from the value of the next state
        y = rewards + self.gamma * self.model.get_value(next_states) * (1 - dones)
        loss = self.model.backprop(states, y, actions)
        states = next_states

        if self.render_freq > 0 and t % ((self.validate_freq // self.num_envs) * self.render_freq) == 0:
            render = True
        else:
            render = False

        if self.validate_freq > 0 and t % (self.validate_freq // self.num_envs) == 0:
            self.validation_summary(t, loss, start, render)
            start = time.time()

        if self.save_freq > 0 and t % (self.save_freq // self.num_envs) == 0:
            self.s += 1
            self.save(self.s)
            print('saved model')

def rollout(self):
    rollout = []
    for t in range(self.nsteps):
        policies, values = self.model.evaluate(self.states)
        actions = fastsample(policies)
        next_states, rewards, dones, infos = self.env.step(actions)
        rollout.append((self.states, actions, rewards, values, dones))
        self.states = next_states
    states, actions, rewards, values, dones = stack_many(*zip(*rollout))
    # bootstrap value for the state reached at the end of the rollout
    _, last_values = self.model.evaluate(self.states)
    return states, actions, rewards, dones, values, last_values

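# `fastsample` is assumed to draw one action per environment from a batch of
# categorical policies of shape [num_envs, action_size]; the name suggests a
# vectorised alternative to looping over np.random.choice. A sketch using the
# Gumbel-max trick (argmax of log-probs plus Gumbel noise is an exact
# categorical sample); `fastsample_sketch` is a hypothetical name:
def fastsample_sketch(policies):
    noise = np.random.gumbel(size=policies.shape)
    return np.argmax(np.log(policies + 1e-10) + noise, axis=-1)
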
def rollout(self):
    rollout = []
    for t in range(self.nsteps):
        policies, values = self.model.evaluate(self.states)
        # Qaux = self.model.get_pixel_control(self.states, self.prev_hidden, self.prev_actions_rewards[np.newaxis])
        actions = fastsample(policies)
        next_states, rewards, dones, infos = self.env.step(actions)
        rollout.append((self.states, actions, rewards, values, dones))
        self.replay.append((self.states, actions, rewards, values, dones))  # add to replay memory
        self.states = next_states
    states, actions, rewards, values, dones = stack_many(*zip(*rollout))
    _, last_values = self.model.evaluate(self.states)
    return states, actions, rewards, values, dones, last_values

def rollout(self):
    rollout = []
    first_hidden = self.prev_hidden
    for t in range(self.nsteps):
        policies, values, hidden = self.model.evaluate(self.states[None], self.prev_hidden)
        actions = fastsample(policies)
        next_states, rewards, dones, infos = self.env.step(actions)
        rollout.append((self.states, actions, rewards, values, dones))
        self.states = next_states
        self.prev_hidden = self.model.mask_hidden(hidden, dones)  # reset hidden state at end of episode
    states, actions, rewards, values, dones = stack_many(*zip(*rollout))
    _, last_values, _ = self.model.evaluate(self.states[None], self.prev_hidden)
    return states, actions, rewards, first_hidden, dones, values, last_values

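# `model.mask_hidden` is assumed to zero the recurrent state of every env whose
# episode just ended, so the next episode starts from a fresh hidden state. A
# sketch for an LSTM-style (h, c) pair of shape [1, num_envs, hidden_size]
# (the shapes and the standalone name `mask_hidden_sketch` are assumptions):
def mask_hidden_sketch(hidden, dones):
    mask = (1.0 - dones).reshape(1, -1, 1)  # broadcast over the hidden dimension
    return tuple(h * mask for h in hidden)
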
def rollout(self):
    rollout = []
    first_hidden = self.prev_hidden
    for t in range(self.nsteps):
        policies, values, hidden = self.model.evaluate(self.states[None], self.prev_actions_rewards, self.prev_hidden)
        # Qaux = self.model.get_pixel_control(self.states, self.prev_hidden, self.prev_actions_rewards[None])
        actions = fastsample(policies)
        next_states, rewards, dones, infos = self.env.step(actions)
        rollout.append((self.states, actions, rewards, self.prev_actions_rewards, dones, infos))
        self.replay.append((self.states, actions, rewards, self.prev_hidden, self.prev_actions_rewards, dones))  # add to replay memory
        self.states = next_states
        self.prev_hidden = self.model.mask_hidden(hidden, dones)  # reset hidden state at end of episode
        self.prev_actions_rewards = concat_action_reward(actions, rewards, self.action_size + 1)
    states, actions, rewards, prev_actions_rewards, dones, infos = stack_many(*zip(*rollout))
    _, last_values, _ = self.model.evaluate(self.states[None], self.prev_actions_rewards, self.prev_hidden)
    return states, actions, rewards, first_hidden, prev_actions_rewards, dones, last_values

def rollout(self):
    rollout = []
    for t in range(self.nsteps):
        policies, values = self.model.evaluate(self.states)
        actions = fastsample(policies)
        next_states, extr_rewards, dones, infos = self.env.step(actions)
        # normalise observations before computing the forward-dynamics intrinsic reward
        mean, std = self.state_mean[None], self.state_std[None]
        intr_rewards = self.model.intrinsic_reward((self.states - mean) / std, actions, (next_states - mean) / std)
        rewards = extr_rewards + intr_rewards
        rollout.append((self.states, next_states, actions, rewards, values, dones))
        self.states = next_states
    states, next_states, actions, rewards, values, dones = stack_many(*zip(*rollout))
    return states, next_states, actions, rewards, dones, values

def get_action(self, states):
    policies, values_extr, values_intr = self.model.evaluate(states)
    actions = fastsample(policies)
    return actions

def get_action(self, state):
    policy, value = self.model.evaluate(state)
    action = int(fastsample(policy))
    return action

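# Example of how the single-env agent above might be driven, assuming a
# Gym-style `env` whose observations match what self.model.evaluate expects
# (all names here are illustrative, not part of this module):
#
#   state = env.reset()
#   done = False
#   while not done:
#       action = agent.get_action(state[None])  # batch dim, if the model expects one
#       state, reward, done, info = env.step(action)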