Example 1
    def get_action(self, z):
        h = rnn_output(self.state, z, EXP_MODE)
        # print(len(h), " h:", h)  # h is 256+32 dims (the 32-dim z comes first)
        # So we could have 288*2*18 params, or 288*2*environment.action_space.n (6 for Pong)
        '''
        action = np.dot(h, self.weight) + self.bias
        action[0] = np.tanh(action[0])
        action[1] = sigmoid(action[1])
        action[2] = clip(np.tanh(action[2]))
        '''
        if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
            raise NotImplementedError("MODE_Z_HIDDEN branch not ported to Atari")
            # h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
            # action = np.tanh(np.dot(h, self.weight_output) + self.bias_output)
        else:
            # could sample probabilistically from a softmax over these logits, but take the greedy argmax
            action = np.argmax(np.matmul(h, self.weight) + self.bias)

        # action[1] = (action[1]+1.0) / 2.0
        # action[2] = clip(action[2])
        # print("Action:", action)
        action_one_hot = np.zeros(self.num_actions)
        action_one_hot[action] = 1
        # print("Action hot:", action_one_hot)

        self.state = rnn_next_state(self.rnn, z, action_one_hot, self.state)

        return action
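
The comment in the else branch above points out that the action could also be sampled from a softmax instead of taken greedily. A minimal, self-contained sketch of both options; the helper name select_action is hypothetical and not part of the example:

import numpy as np

def select_action(logits, greedy=True, temperature=1.0):
    # Greedy argmax, as in the example above.
    if greedy:
        return int(np.argmax(logits))
    # Otherwise sample from a softmax over the same logits.
    shifted = (logits - np.max(logits)) / temperature  # stabilize the exponentials
    probs = np.exp(shifted) / np.sum(np.exp(shifted))
    return int(np.random.choice(len(logits), p=probs))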
Example 2
    def get_action(self, z):
        h = rnn_output(self.state, z, EXP_MODE)
        '''
        action = np.dot(h, self.weight) + self.bias
        action[0] = np.tanh(action[0])
        action[1] = sigmoid(action[1])
        action[2] = clip(np.tanh(action[2]))
        '''
        # print(h)
        if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
            h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
            action = np.tanh(np.dot(h, self.weight_output) + self.bias_output)
        else:
            action = sigmoid(np.dot(h, self.weight) + self.bias)
        # action = sigmoid(h)
        # print(np.mean(action))
        # print(action)

        # Collapse the sigmoid outputs into a single integer action id by scaling their mean.
        action_mean = np.mean(action)
        action_mean = action_mean * 62
        action = np.array([int(action_mean)])

        # action[1] = (action[1] + 1.0) / 2.0
        # action[2] = clip(action[2])

        # action = np.array([int(action[2])])

        print("action", action)

        self.state = rnn_next_state(self.rnn, z, action, self.state)

        return action
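
Example 2 collapses the sigmoid outputs into a single integer action by scaling their mean by 62. The same discretization as a standalone sketch; the helper name and the assumption that 62 corresponds to num_actions - 1 are mine, not from the example:

import numpy as np

def discretize_mean(activations, num_actions=63):
    # The mean of sigmoid outputs lies in [0, 1]; scale it to an integer action id.
    # num_actions=63 is an assumption chosen so the scale factor matches the 62 above.
    action_id = int(np.mean(activations) * (num_actions - 1))
    return np.array([action_id])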
Example 3
    def get_action(self, z, epsilon=0.0):
        h = rnn_output(self.state, z, EXP_MODE)
        '''
        action = np.dot(h, self.weight) + self.bias
        action[0] = np.tanh(action[0])
        action[1] = sigmoid(action[1])
        action[2] = clip(np.tanh(action[2]))
        '''
        if np.random.rand() < epsilon:
            action = np.random.randint(0, self.na)
        else:
            if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
                h = np.maximum(
                    np.dot(h, self.weight_hidden) + self.bias_hidden, 0)
                action = np.argmax(
                    np.dot(h, self.weight_output) + self.bias_output)
            else:
                action = np.argmax(np.dot(h, self.weight) + self.bias)

        oh_action = np.zeros(self.na)
        oh_action[action] = 1

        # action[1] = (action[1]+1.0) / 2.0
        # action[2] = clip(action[2])

        # TODO: double-check this function
        self.state = rnn_next_state(self.rnn, z, oh_action, self.state)

        return action
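
Example 3 wraps the controller in epsilon-greedy exploration. The same decision rule as a standalone helper; epsilon_greedy is a hypothetical name used only for illustration:

import numpy as np

def epsilon_greedy(logits, epsilon, rng=np.random):
    # With probability epsilon pick a uniformly random action,
    # otherwise take the greedy argmax, mirroring the branches above.
    if rng.rand() < epsilon:
        return int(rng.randint(0, len(logits)))
    return int(np.argmax(logits))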
Example 4
    def modify_observation(self, obs: dict):
        features = rnn_output(self.rnn_state, self.z, self.features_mode)

        # Append encoded observation (encoding z and hidden state h)
        obs["features"] = features
        if not self.keep_image:
            del obs["image"]

        return obs
Example 5
    def get_action(self, feature):
        h = rnn_output(self.rnn_state, feature, EXP_MODE)
        action, v_preds = self.net.policy.get_action(h, verbose=False)

        action_one_hot = get_one_hot(np.array(action), ACTION_SPACE)

        self.rnn_state = rnn_next_state(self.net.rnn, feature, action_one_hot,
                                        self.rnn_state)
        return h, action, v_preds
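
Example 5 depends on a get_one_hot helper that is not shown. A minimal stand-in, assuming it takes an array of action indices and the size of the action space:

import numpy as np

def get_one_hot(indices, depth):
    # One row per index, with a single 1 in the column given by that index.
    indices = np.asarray(indices, dtype=int).reshape(-1)
    one_hot = np.zeros((indices.size, depth))
    one_hot[np.arange(indices.size), indices] = 1
    return one_hot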
Example 6
    def get_action(self, z):
        h = rnn_output(self.state, z, EXP_MODE)
        if EXP_MODE == MODE_Z_HIDDEN:
            h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
            action = np.tanh(np.dot(h, self.weight_output) + self.bias_output)
        else:
            action = np.tanh(np.dot(h, self.weight) + self.bias)
        action[1] = (action[1] + 1.0) / 2.0
        action[2] = clip(action[2])
        self.state = rnn_next_state(self.rnn, z, action, self.state)
        return action
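
Several of these controllers call a clip helper that is not defined in the snippets. Assuming it clamps each component to [-1, 1], which matches the tanh-based action heads it is applied to, a minimal stand-in would be:

import numpy as np

def clip(x, lo=-1.0, hi=1.0):
    # Clamp x (scalar or array) to the closed interval [lo, hi].
    return np.minimum(np.maximum(x, lo), hi)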
Example 7
 def encode_obs(self, obs, prev_state, action):
     # convert raw obs to z, mu, logvar
     result = np.copy(obs).astype(float) / 255.0
     result = result.reshape(1, 64, 64, 3)
     mu, logvar = self.vae.encode_mu_logvar(result)
     mu = mu[0]
     logvar = logvar[0]
     s = logvar.shape
     z = mu + np.exp(logvar/2.0) * np.random.randn(*s)
     h = rnn_output(prev_state, z, 4)
     next_state = rnn_next_state(self.rnn, z, np.array(action), prev_state)
     return np.concatenate([h, z]), next_state
Example 8
 def encode_obs(obs, prev_state, action):
   # convert raw obs to z, mu, logvar
   result = np.copy(obs).astype(float) / 255.0
   result = result.reshape(1, 64, 64, 3)
   mu, logvar = vae.encode_mu_logvar(result)
   mu = mu[0]
   logvar = logvar[0]
   s = logvar.shape
   z = mu + np.exp(logvar/2.0) * np.random.randn(*s)
   next_state = rnn_next_state(rnn, z, action, prev_state)
   h = rnn_output(prev_state, z, 4)
   return h, next_state
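
Examples 7 and 8 sample z from the VAE posterior with the reparameterization trick: z = mu + sigma * eps, where sigma = exp(logvar / 2). The same step as a standalone helper; sample_z is a hypothetical name:

import numpy as np

def sample_z(mu, logvar):
    # eps is standard Gaussian noise of the same shape as mu.
    eps = np.random.randn(*np.shape(mu))
    return mu + np.exp(logvar / 2.0) * eps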
Example 9
  def get_action(self, z):
    h = rnn_output(self.state, z, EXP_MODE)

    '''
    action = np.dot(h, self.weight) + self.bias
    action[0] = np.tanh(action[0])
    action[1] = sigmoid(action[1])
    action[2] = clip(np.tanh(action[2]))
    '''
    if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer
      h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
      action = np.tanh(np.dot(h, self.weight_output) + self.bias_output)
    else:
      action = np.tanh(np.dot(h, self.weight) + self.bias)

    self.state = rnn_next_state(self.rnn, z, action, self.state)

    return action
Example 10
    def get_action(self, z):
        h = rnn_output(self.state, z, EXP_MODE)

        if self.arglist.inference:
            oppo_intents = []
            for i in range(self.arglist.agent_num - 1):
                act_traj = self.act_traj[i]
                intent = self.oppo_model.get_inference(act_traj)
                oppo_intents.append(intent)
            oppo_intents = np.reshape(
                oppo_intents,
                ((self.arglist.agent_num - 1) * self.arglist.action_space))
            '''
            action = np.dot(h, self.weight) + self.bias
            action[0] = np.tanh(action[0])
            action[1] = sigmoid(action[1])
            action[2] = clip(np.tanh(action[2]))
            '''
            # Oppo intent shape: (batch_size, agent_num, action_space)
            # reshape oppo_intent to agent_num * batch_size * action_space

            controller_input = np.concatenate((h, oppo_intents))
        else:
            controller_input = h

        if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
            x = np.tanh(
                np.dot(controller_input, self.weight_hidden) +
                self.bias_hidden)
            action = np.tanh(np.dot(x, self.weight_output) + self.bias_output)
        else:
            action = np.tanh(np.dot(controller_input, self.weight) + self.bias)
        for i in range(self.action_space):
            action[i] = clip(action[i])

        self.state = rnn_next_state(self.rnn, z, action, self.act_traj,
                                    self.state)
        # self.oppo_state = oppo_next_state(self.oppo_model, action, self.act_traj, self.oppo_state)

        # epsilon exploration: with probability 0.2, replace the action with a
        # random value (note: the list repeats one sample across every dimension)
        if np.random.uniform(0, 1) < 0.2:
            action = [np.random.uniform(-3, 3)] * len(action)
        return action
Example 11
    def get_action(self, z):
        h = rnn_output(self.state, z, EXP_MODE)
        # print('h', h.shape, h)
        '''
        action = np.dot(h, self.weight) + self.bias
        action[0] = np.tanh(action[0])
        action[1] = sigmoid(action[1])
        action[2] = clip(np.tanh(action[2]))
        '''
        if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
            h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
            action = np.tanh(np.dot(h, self.weight_output) + self.bias_output)
        else:
            # print(h.shape)
            # print(self.weight.shape)
            # print(self.bias.shape)
            action = np.tanh(np.dot(h, self.weight) + self.bias)
        # for i in range(ACTION_SIZE):
        #     action[i] = (action[i] + 1.0) / 2.0  # all action values are in range 0 to 1
        # action[2] = clip(action[2])
        self.state = rnn_next_state(self.rnn, z, action, self.state)  # update the MDN-RNN state
        return action
Example 12
    def get_action(self, z, arglist):
        h = rnn_output(self.state, z, EXP_MODE)
        '''
        action = np.dot(h, self.weight) + self.bias
        action[0] = np.tanh(action[0])
        action[1] = sigmoid(action[1])
        action[2] = clip(np.tanh(action[2]))
        '''
        if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
            h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
            action = np.tanh(np.dot(h, self.weight_output) + self.bias_output)
        else:
            action = np.tanh(np.dot(h, self.weight) + self.bias)

        if arglist.competitive:
            obs, rewards, done, win = self.env.step([action[0], 'script'])
        else:
            obs, rewards, done, win = self.env.step(action)

        extra_reward = 0.0  # penalize for turning too frequently
        if arglist.competitive:
            if arglist.train_mode and penalize_turning:
                extra_reward -= np.abs(action[0]) / 10.0
                rewards[0] += extra_reward
            reward = rewards[0]
        else:
            if arglist.train_mode and penalize_turning:
                reward = np.sum(rewards)
                extra_reward -= np.abs(action[0]) / 10.0
                reward += extra_reward

        # recording_reward.append(reward)
        # total_reward += reward

        self.state = rnn_next_state(self.rnn, z, action, self.state)

        return action
Example 13
 def step(self, action):
     self.rnn_state, self.z, reward, done = rnn_sim(self.rnn, self.z, self.rnn_state, action)
     obs = OrderedDict(features=rnn_output(self.rnn_state, self.z, self.features_mode))
     info = {}  # Additional info, not used
     return obs, reward, done, info
Example 14
 def reset(self):
     self.rnn_state = rnn_init_state(self.rnn)
     self.z = self._sample_init_z()
     obs = OrderedDict(features=rnn_output(self.rnn_state, self.z, self.features_mode))
     return obs