Example #1
  def __call__(self, state, greedy=False):
    action_scale = self.env.max_action

    # initial exploration and intrinsic curiosity
    res = None
    if self.training:
      if self.config.get('initial_explore') and len(self.replay_buffer) < self.config.initial_explore:
        res = np.array([self.env.action_space.sample() for _ in range(self.env.num_envs)])
      elif hasattr(self, 'ag_curiosity'):
        state = self.ag_curiosity.relabel_state(state)
      
    state = flatten_state(state, self.config.modalities + self.config.goal_modalities)  # flatten goal environments
    if hasattr(self, 'state_normalizer'):
      state = self.state_normalizer(state, update=self.training)
    
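    # return random initial-exploration actions (the normalizer above is still updated)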
    if res is not None:
      return res

    state = self.torch(state)

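    # query the actor (or its target network) for an action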
    if self.use_actor_target:
      action, _ = self.actor_target(state)
    else:
      action, _ = self.actor(state)
    action = self.numpy(action)

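    # epsilon exploration: per environment, replace the action with a uniform random action with probability eexplore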
    if self.training and not greedy and self.config.get('eexplore'):
      eexplore = self.config.eexplore
      if hasattr(self, 'ag_curiosity'):
        eexplore = self.ag_curiosity.go_explore * self.config.go_eexplore + eexplore
      mask = (np.random.random((action.shape[0], 1)) < eexplore).astype(np.float32)
      randoms = np.random.random(action.shape) * (2 * action_scale) - action_scale
      action = mask * randoms + (1 - mask) * action
    
    return np.clip(action, -action_scale, action_scale)
Example #2
  def __call__(self, state, greedy=False):
    res = None
    # Initial Exploration 
    if self.training:
      if self.config.get('initial_explore') and len(self.replay_buffer) < self.config.initial_explore:
        res = np.array([self.env.action_space.sample() for _ in range(self.env.num_envs)])
      elif hasattr(self, 'ag_curiosity'):
        state = self.ag_curiosity.relabel_state(state)
    
    state = flatten_state(state)  # flatten goal environments
    if hasattr(self, 'state_normalizer'):
      state = self.state_normalizer(state, update=self.training)

    if res is not None:
      return res

    state = self.torch(state)
  
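    # query the Q-network (or its target) for per-action values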
    if self.use_qvalue_target:
      q_values = self.numpy(self.qvalue_target(state))
    else:
      q_values = self.numpy(self.qvalue(state))
    
    # epsilon-greedy: take a random action with probability config.random_action_prob
    if self.training and not greedy and np.random.random() < self.config.random_action_prob(steps=self.config.env_steps):
      action = np.random.randint(self.env.action_space.n, size=[self.env.num_envs])
    else:
      action = np.argmax(q_values, -1)  # greedy action indices

    return action
Example #3
    def _process_experience(self, exp):
        if getattr(self, 'logger', None):
            self.logger.add_tabular('Replay buffer size', len(self.buffer))
        done = np.expand_dims(exp.done, 1)  # format for replay buffer
        reward = np.expand_dims(exp.reward, 1)  # format for replay buffer

        action = exp.action

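        # goal-conditioned case: also store achieved, behavioral and desired goals with each transition for goal relabeling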
        if self.goal_shape:
            state = flatten_state(exp.state, self.modalities)
            next_state = flatten_state(exp.next_state, self.modalities)
            if hasattr(self, 'achieved_goal'):
                previous_achieved = self.achieved_goal(exp.state)
                achieved = self.achieved_goal(exp.next_state)
            else:
                previous_achieved = exp.state['achieved_goal']
                achieved = exp.next_state['achieved_goal']
            desired = flatten_state(exp.state, self.goal_modalities)
            if hasattr(self, 'ag_curiosity') and self.ag_curiosity.current_goals is not None:
                behavioral = self.ag_curiosity.current_goals
                # recompute online reward
                reward = self.env.compute_reward(achieved, behavioral, {
                    's': state,
                    'a': action,
                    'ns': next_state
                }).reshape(-1, 1)
            else:
                behavioral = desired
            for i in range(self.n_envs):
                self._subbuffers[i].append([
                    state[i], action[i], reward[i], next_state[i], done[i],
                    previous_achieved[i], achieved[i], behavioral[i],
                    desired[i]
                ])
        else:
            state = exp.state
            next_state = exp.next_state
            for i in range(self.n_envs):
                self._subbuffers[i].append(
                    [state[i], action[i], reward[i], next_state[i], done[i]])

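        # flush finished episodes from the per-env subbuffers into the main buffer as stacked trajectories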
        for i in range(self.n_envs):
            if exp.trajectory_over[i]:
                trajectory = [np.stack(a) for a in zip(*self._subbuffers[i])]
                self.buffer.add_trajectory(*trajectory)
                self._subbuffers[i] = []
Example #4
  def __call__(self, state, greedy=False):
    action_scale = self.env.max_action

    # initial exploration and intrinsic curiosity
    res = None
    if self.training:
      if self.config.get('initial_explore') and len(self.replay_buffer) < self.config.initial_explore:
        res = np.array([self.env.action_space.sample() for _ in range(self.env.num_envs)])
      elif hasattr(self, 'ag_curiosity'):
        state = self.ag_curiosity.relabel_state(state)
        
    state = flatten_state(state, self.config.modalities + self.config.goal_modalities)  # flatten goal environments; batch x state_dim
    if hasattr(self, 'state_normalizer'):
      state = self.state_normalizer(state, update=self.training)

    if res is not None:
      return res

    state = self.torch(state)

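    # score each actor proposal with the critic and keep the highest-value proposal for each state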
    action_proposals = self.actor(state) # batch x num_proposals x action_dim
    states = torch.repeat_interleave(state, action_proposals.shape[1], 0)
    q_values = self.critic(states, action_proposals.reshape(-1, action_proposals.shape[-1]))
    q_values = q_values.reshape(state.shape[0], action_proposals.shape[1]) # batch x num_proposals
    best_actions = torch.argmax(q_values, dim=-1, keepdim=True)  # batch x 1
    action = action_proposals.gather(1, torch.tile(best_actions[:, :, None], (1, 1, action_proposals.shape[2]))).squeeze(1)
    action = self.numpy(action)
    
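    # during training, add action noise and optionally epsilon exploration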
    if self.training and not greedy:
      action = self.action_noise(action)
      if self.config.get('eexplore'):
        eexplore = self.config.eexplore
        if hasattr(self, 'ag_curiosity'):
          eexplore = self.ag_curiosity.go_explore * self.config.go_eexplore + eexplore
        mask = (np.random.random((action.shape[0], 1)) < eexplore).astype(np.float32)
        randoms = np.random.random(action.shape) * (2 * action_scale) - action_scale
        action = mask * randoms + (1 - mask) * action

    return np.clip(action, -action_scale, action_scale)