def validate_sync(self, render=False):
        episode_scores = []
        env = self.val_envs
        for episode in range(self.num_val_episodes//self.num_envs):
            states = env.reset()
            episode_score = []
            zeros = np.zeros(len(self.val_envs), dtype=np.int32)
            prev_actrew = concat_action_reward(zeros, zeros, self.action_size+1) # start with action 0 and reward 0
            prev_hidden = self.model.get_initial_hidden(len(self.val_envs))
            for t in range(self.val_steps):
                policies, values, hidden = self.model.evaluate(states[None], prev_actrew, prev_hidden)
                actions = fastsample(policies)
                next_states, rewards, dones, infos = env.step(actions)
                states = next_states
                prev_hidden = hidden  # carry the recurrent state forward
                prev_actrew = concat_action_reward(actions, rewards, self.action_size+1)

                episode_score.append(rewards*(1-dones))
                
                if render:
                    with self.lock:
                        env.render()

                if dones.sum() == self.num_envs or t == self.val_steps - 1:
                    tot_reward = np.sum(np.stack(episode_score), axis=0)
                    episode_scores.append(tot_reward)
                    break
        
        return np.mean(episode_scores)
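These snippets rely on a few small helpers from the surrounding codebase whose implementations are not shown here. As a rough guide only, fastsample is assumed to draw one categorical sample per row of action probabilities, and concat_action_reward is assumed to build a [num_envs, action_size + 1] feature of one-hot previous action plus previous reward; a minimal sketch under those assumptions:

import numpy as np

def fastsample(policies):
    # assumed behaviour: one categorical sample per row of action probabilities
    policies = np.asarray(policies)
    cdf = np.cumsum(policies, axis=-1)
    u = np.random.rand(*cdf.shape[:-1], 1)
    return np.argmax(cdf > u, axis=-1)

def concat_action_reward(actions, rewards, size):
    # assumed behaviour: one-hot encode the previous action and place the
    # previous reward in the final slot -> [num_envs, action_size + 1]
    feat = np.zeros((len(actions), size), dtype=np.float32)
    feat[np.arange(len(actions)), actions] = 1.0
    feat[:, -1] = rewards
    return feat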
Example #2
    def rollout(self):
        rollout = []
        for t in range(self.nsteps):
            policies, values_extr, values_intr = self.model.evaluate(
                self.states)
            actions = fastsample(policies)
            next_states, extr_rewards, dones, infos = self.env.step(actions)

            # if observations are image batches [num_envs, channels, height, width],
            # keep only the last channel for the intrinsic reward model
            next_states__ = next_states[:, -1:] if len(next_states.shape) == 4 else next_states
            intr_rewards = self.model.intrinsic_reward(next_states__,
                                                       self.state_mean,
                                                       self.state_std)

            rollout.append(
                (self.states, next_states__, actions, extr_rewards,
                 intr_rewards, values_extr, values_intr, policies, dones))
            self.states = next_states

        states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones = stack_many(
            *zip(*rollout))
        last_policy, last_values_extr, last_values_intr = self.model.evaluate(self.states)
        return states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, last_values_extr, last_values_intr, policies, dones
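Several of the rollout methods convert the list of per-step tuples into per-field arrays with stack_many(*zip(*rollout)). The helper itself is not shown; assuming it simply stacks each field along a new leading time axis, a minimal sketch would be:

import numpy as np

def stack_many(*args):
    # assumed behaviour: np.stack each sequence of per-step arrays along a new
    # leading axis, giving [nsteps, num_envs, ...] for every field
    return tuple(np.stack(arg) for arg in args)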
Example #3
    def validate_sync(self, render):
        episode_scores = []
        env = self.val_envs
        for episode in range(self.num_val_episodes // len(env)):
            states = env.reset()
            episode_score = []
            prev_hidden = self.model.get_initial_hidden(len(self.val_envs))
            for t in range(self.val_steps):
                policies, values, hidden = self.model.evaluate(
                    states[None], prev_hidden)
                actions = fastsample(policies)
                next_states, rewards, dones, infos = env.step(actions)
                states = next_states
                prev_hidden = hidden  # carry the recurrent state forward

                episode_score.append(rewards * (1 - dones))

                if render:
                    with self.lock:
                        env.render()

                if dones.sum() == self.num_envs or t == self.val_steps - 1:
                    tot_reward = np.sum(np.stack(episode_score), axis=0)
                    episode_scores.append(tot_reward)
                    break

        return np.mean(episode_scores)
Example #4
    def _validate_async(self, env, num_ep, max_steps, render=False):
        for episode in range(num_ep):
            state = env.reset()
            episode_score = []
            hidden = self.model.get_initial_hidden(1)
            for t in range(max_steps):
                policy, value, hidden = self.model.evaluate(
                    state[None, None], hidden)
                #print('policy', policy, 'value', value)
                action = int(fastsample(policy))
                next_state, reward, done, info = env.step(action)
                state = next_state

                episode_score.append(reward)

                if render:
                    with self.lock:
                        env.render()

                if done or t == max_steps - 1:
                    tot_reward = np.sum(episode_score)
                    with self.lock:
                        self.validate_rewards.append(tot_reward)

                    break
        if render:
            with self.lock:
                env.close()
Example #5
    def _train_onestep(self):
        states = self.env.reset()
        start = time.time()  # timer used by validation_summary below
        y = np.zeros((self.num_envs))
        num_steps = self.total_steps // self.num_envs
        for t in range(1, num_steps + 1):
            policies, values = self.model.evaluate(states)
            actions = fastsample(policies)
            next_states, rewards, dones, infos = self.env.step(actions)
            y = rewards + self.gamma * self.model.get_value(next_states) * (
                1 - dones)

            l = self.model.backprop(states, y, actions)
            states = next_states

            if self.render_freq > 0 and t % (
                (self.validate_freq // self.num_envs) * self.render_freq) == 0:
                render = True
            else:
                render = False

            if self.validate_freq > 0 and t % (self.validate_freq //
                                               self.num_envs) == 0:
                self.validation_summary(t, l, start, render)
                start = time.time()

            if self.save_freq > 0 and t % (self.save_freq //
                                           self.num_envs) == 0:
                self.s += 1
                self.save(self.s)
                print('saved model')
Example #6
    def rollout(self):
        rollout = []
        for t in range(self.nsteps):
            policies, values = self.model.evaluate(self.states)
            actions = fastsample(policies)
            next_states, rewards, dones, infos = self.env.step(actions)
            rollout.append((self.states, actions, rewards, values, dones))
            self.states = next_states

        states, actions, rewards, values, dones = stack_many(*zip(*rollout))
        _, last_values = self.model.evaluate(next_states)
        return states, actions, rewards, dones, values, last_values
Example #7
    def rollout(self):
        rollout = []
        for t in range(self.nsteps):
            policies, values = self.model.evaluate(self.states)
            # Qaux = self.model.get_pixel_control(self.states, self.prev_hidden, self.prev_actions_rewards[np.newaxis])
            actions = fastsample(policies)
            next_states, rewards, dones, infos = self.env.step(actions)

            rollout.append((self.states, actions, rewards, values, dones))
            self.replay.append((self.states, actions, rewards, values, dones)) # add to replay memory
            self.states = next_states
        
        states, actions, rewards, values, dones = stack_many(*zip(*rollout))
        _, last_values = self.model.evaluate(next_states)
        return states, actions, rewards, values, dones, last_values
Example #8
    def rollout(self):
        rollout = []
        first_hidden = self.prev_hidden
        for t in range(self.nsteps):
            policies, values, hidden = self.model.evaluate(
                self.states[None], self.prev_hidden)
            actions = fastsample(policies)
            next_states, rewards, dones, infos = self.env.step(actions)
            rollout.append((self.states, actions, rewards, values, dones))
            self.states = next_states
            self.prev_hidden = self.model.mask_hidden(
                hidden, dones)  # reset hidden state at end of episode

        states, actions, rewards, values, dones = stack_many(*zip(*rollout))
        _, last_values, _ = self.model.evaluate(self.states[None],
                                                self.prev_hidden)
        return states, actions, rewards, first_hidden, dones, values, last_values
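The recurrent rollouts reset the hidden state wherever an episode has just terminated via self.model.mask_hidden(hidden, dones). That method is not shown; assuming a hidden state shaped [num_envs, hidden_size], the idea is simply to zero the rows of finished environments, e.g.:

import numpy as np

def mask_hidden(hidden, dones):
    # assumed behaviour: zero the recurrent state of environments whose episode
    # just ended, so the next step starts from a fresh hidden state
    mask = (1.0 - np.asarray(dones, dtype=np.float32))[:, None]  # [num_envs, 1]
    return hidden * mask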
Example #9
    def rollout(self):
        rollout = []
        first_hidden = self.prev_hidden
        for t in range(self.nsteps):
            policies, values, hidden = self.model.evaluate(self.states[None], self.prev_actions_rewards, self.prev_hidden)
            #Qaux = self.model.get_pixel_control(self.states, self.prev_hidden, self.prev_actions_rewards[None])
            actions = fastsample(policies)
            next_states, rewards, dones, infos = self.env.step(actions)

            rollout.append((self.states, actions, rewards, self.prev_actions_rewards, dones, infos))
            self.replay.append((self.states, actions, rewards, self.prev_hidden, self.prev_actions_rewards, dones)) # add to replay memory
            self.states = next_states
            self.prev_hidden = self.model.mask_hidden(hidden, dones) # reset hidden state at end of episode
            self.prev_actions_rewards = concat_action_reward(actions, rewards, self.action_size+1)
        
        states, actions, rewards, prev_actions_rewards, dones, infos = stack_many(*zip(*rollout))
        _, last_values, _ = self.model.evaluate(self.states[None], self.prev_actions_rewards, self.prev_hidden)
        return states, actions, rewards, first_hidden, prev_actions_rewards, dones, last_values
Example #10
    def rollout(self):
        rollout = []
        for t in range(self.nsteps):
            start = time.time()
            policies, values = self.model.evaluate(self.states)
            actions = fastsample(policies)
            next_states, extr_rewards, dones, infos = self.env.step(actions)

            mean, std = self.state_mean[None], self.state_std[None]
            intr_rewards = self.model.intrinsic_reward(
                (self.states - mean) / std, actions,
                (next_states - mean) / std)
            rewards = extr_rewards + intr_rewards
            rollout.append(
                (self.states, next_states, actions, rewards, values, dones))
            self.states = next_states

        states, next_states, actions, rewards, values, dones = stack_many(*zip(
            *rollout))
        return states, next_states, actions, rewards, dones, values
Example #11
    def get_action(self, states):
        policies, values_extr, values_intr = self.model.evaluate(states)
        actions = fastsample(policies)
        return actions
Example #12
    def get_action(self, state):
        policy, value = self.model.evaluate(state)
        action = int(fastsample(policy))
        return action