def get_action(self, z):
    """Pick a greedy discrete action from the controller head and advance the RNN state.

    The feature vector h is the concatenation produced by rnn_output for the
    current EXP_MODE (z first, then the hidden state).
    """
    h = rnn_output(self.state, z, EXP_MODE)
    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer — never ported for this env
        raise Exception("Not ported to atari")
    # Greedy argmax over a single linear layer; a softmax sample would also
    # be possible, but greedy is what this controller uses.
    logits = np.matmul(h, self.weight) + self.bias
    action = np.argmax(logits)
    # The RNN expects the action one-hot encoded.
    one_hot = np.zeros(self.num_actions)
    one_hot[action] = 1
    self.state = rnn_next_state(self.rnn, z, one_hot, self.state)
    return action
def get_action(self, z):
    """Map the RNN features to a single integer action in [0, 61].

    The network output is squashed, averaged to a scalar in [0, 1), scaled by
    62 and truncated — yielding one discrete action wrapped in an array.
    """
    h = rnn_output(self.state, z, EXP_MODE)
    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
        hidden = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
        action = np.tanh(np.dot(hidden, self.weight_output) + self.bias_output)
    else:
        action = sigmoid(np.dot(h, self.weight) + self.bias)
    # Collapse the output vector to one scalar, then map it onto the
    # 62-way discrete action range.
    scaled = np.mean(action) * 62
    action = np.array([int(scaled)])
    print("action", action)
    self.state = rnn_next_state(self.rnn, z, action, self.state)
    return action
def get_action(self, z, epsilon=0.0):
    """Epsilon-greedy discrete action selection.

    With probability `epsilon` a uniform random action is drawn; otherwise the
    controller head is evaluated greedily. The RNN state is always advanced
    with the one-hot encoded action.
    """
    h = rnn_output(self.state, z, EXP_MODE)
    if np.random.rand() < epsilon:
        # Explore: uniform random action over the action space.
        action = np.random.randint(0, self.na)
    elif EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer with ReLU
        hidden = np.maximum(np.dot(h, self.weight_hidden) + self.bias_hidden, 0)
        action = np.argmax(np.dot(hidden, self.weight_output) + self.bias_output)
    else:
        action = np.argmax(np.dot(h, self.weight) + self.bias)
    one_hot = np.zeros(self.na)
    one_hot[action] = 1
    self.state = rnn_next_state(self.rnn, z, one_hot, self.state)
    return action
def modify_observation(self, obs: dict):
    """Attach the world-model features (encoded z plus RNN hidden state) to an observation.

    Optionally drops the raw image once the features are present.
    """
    obs["features"] = rnn_output(self.rnn_state, self.z, self.features_mode)
    if not self.keep_image:
        # Raw pixels are redundant when only features are consumed downstream.
        del obs["image"]
    return obs
def get_action(self, feature):
    """Run the policy network on the RNN features and step the RNN with the chosen action.

    Returns (h, action, v_preds): the feature vector, the sampled/selected
    action, and the policy's value predictions.
    """
    h = rnn_output(self.rnn_state, feature, EXP_MODE)
    action, v_preds = self.net.policy.get_action(h, verbose=False)
    # One-hot encode for the RNN transition.
    one_hot = get_one_hot(np.array(action), ACTION_SPACE)
    self.rnn_state = rnn_next_state(
        self.net.rnn, feature, one_hot, self.rnn_state)
    return h, action, v_preds
def get_action(self, z):
    """Compute the 3-component continuous action and advance the RNN state.

    Components follow the CarRacing convention: index 0 stays in [-1, 1],
    index 1 is remapped to [0, 1], index 2 is clipped.
    """
    h = rnn_output(self.state, z, EXP_MODE)
    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
        hidden = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
        action = np.tanh(np.dot(hidden, self.weight_output) + self.bias_output)
    else:
        action = np.tanh(np.dot(h, self.weight) + self.bias)
    action[1] = (action[1] + 1.0) / 2.0  # tanh [-1, 1] -> [0, 1]
    action[2] = clip(action[2])
    self.state = rnn_next_state(self.rnn, z, action, self.state)
    return action
def encode_obs(self, obs, prev_state, action):
    """Encode a raw 64x64x3 frame into controller features and advance the RNN.

    Normalizes the frame, samples a VAE latent z via the reparameterization
    trick, and returns (concat([h, z]), next_state) where h is the RNN output
    for `prev_state`.
    """
    # FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # builtin float preserves the original float64 dtype.
    frame = np.copy(obs).astype(float) / 255.0
    frame = frame.reshape(1, 64, 64, 3)
    mu, logvar = self.vae.encode_mu_logvar(frame)
    mu, logvar = mu[0], logvar[0]
    # Reparameterization trick: z = mu + sigma * eps, sigma = exp(logvar / 2).
    z = mu + np.exp(logvar / 2.0) * np.random.randn(*logvar.shape)
    # Features mode hard-coded to 4 — presumably "z concatenated with hidden
    # state"; TODO confirm against the rnn_output mode constants.
    h = rnn_output(prev_state, z, 4)
    next_state = rnn_next_state(self.rnn, z, np.array(action), prev_state)
    return np.concatenate([h, z]), next_state
def encode_obs(obs, prev_state, action):
    """Encode a raw 64x64x3 frame into RNN features and advance the RNN state.

    Returns (h, next_state): h is the RNN output for `prev_state` given the
    sampled latent z; next_state is the RNN state after taking `action`.
    """
    # FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # builtin float preserves the original float64 dtype.
    frame = np.copy(obs).astype(float) / 255.0
    frame = frame.reshape(1, 64, 64, 3)
    mu, logvar = vae.encode_mu_logvar(frame)
    mu, logvar = mu[0], logvar[0]
    # Reparameterization trick: z = mu + sigma * eps, sigma = exp(logvar / 2).
    z = mu + np.exp(logvar / 2.0) * np.random.randn(*logvar.shape)
    # BUG FIX: the original called rnn_output(state, ...) with the undefined
    # name `state`; the features must come from the state being encoded,
    # i.e. prev_state (matching the sibling method version of encode_obs).
    h = rnn_output(prev_state, z, 4)
    next_state = rnn_next_state(rnn, z, action, prev_state)
    return h, next_state
def get_action(self, z):
    """Return the tanh-squashed continuous action for latent z; steps the RNN state."""
    h = rnn_output(self.state, z, EXP_MODE)
    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
        hidden = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
        action = np.tanh(np.dot(hidden, self.weight_output) + self.bias_output)
    else:
        # Single linear layer followed by tanh squashing.
        action = np.tanh(np.dot(h, self.weight) + self.bias)
    self.state = rnn_next_state(self.rnn, z, action, self.state)
    return action
def get_action(self, z):
    """Compute a clipped continuous action, optionally conditioning on inferred
    opponent intents, then advance the RNN state.

    When `arglist.inference` is set, the opponent model predicts one intent
    vector per opponent from its action trajectory; those intents are
    flattened and concatenated to the RNN features before the controller runs.
    With probability 0.2 the action is replaced by a random exploration value.
    """
    h = rnn_output(self.state, z, EXP_MODE)
    if self.arglist.inference:
        oppo_intents = []
        for i in range(self.arglist.agent_num - 1):
            act_traj = self.act_traj[i]
            intent = self.oppo_model.get_inference(act_traj)
            oppo_intents.append(intent)
        # Flatten intents: (agent_num - 1, action_space) -> 1-D vector.
        oppo_intents = np.reshape(
            oppo_intents,
            ((self.arglist.agent_num - 1) * self.arglist.action_space))
        # Controller input = RNN features plus flattened opponent intents.
        controller_input = np.concatenate((h, oppo_intents))
    else:
        controller_input = h
    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
        x = np.tanh(
            np.dot(controller_input, self.weight_hidden) + self.bias_hidden)
        action = np.tanh(np.dot(x, self.weight_output) + self.bias_output)
    else:
        action = np.tanh(np.dot(controller_input, self.weight) + self.bias)
    # Clip every action component into the valid range.
    for i in range(self.action_space):
        action[i] = clip(action[i])
    # NOTE: this rnn_next_state variant also takes the action trajectory.
    self.state = rnn_next_state(self.rnn, z, action, self.act_traj, self.state)
    # Epsilon exploration (p = 0.2).
    # NOTE(review): this draws ONE random value and repeats it across all
    # action components — confirm whether per-component noise was intended.
    if np.random.uniform(0, 1) < 0.2:
        action = [np.random.uniform(-3, 3)] * len(action)
    return action
def get_action(self, z):
    """Controller forward pass: RNN features -> tanh action; also steps the MDN-RNN."""
    h = rnn_output(self.state, z, EXP_MODE)
    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
        hidden = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
        action = np.tanh(np.dot(hidden, self.weight_output) + self.bias_output)
    else:
        action = np.tanh(np.dot(h, self.weight) + self.bias)
    # Advance the MDN-RNN hidden state with the chosen action.
    self.state = rnn_next_state(self.rnn, z, action, self.state)
    return action
def get_action(self, z, arglist):
    """Compute a tanh action, step the REAL environment with it, apply an
    optional turning penalty to the reward, and advance the RNN state.

    NOTE(review): unusually for a controller, this method calls env.step()
    itself; the computed `reward` is not returned — presumably recorded by a
    caller via the commented-out bookkeeping below. Confirm intent.
    """
    h = rnn_output(self.state, z, EXP_MODE)
    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
        h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
        action = np.tanh(np.dot(h, self.weight_output) + self.bias_output)
    else:
        action = np.tanh(np.dot(h, self.weight) + self.bias)
    if arglist.competitive:
        # Competitive mode: our action plus a scripted opponent.
        obs, rewards, done, win = self.env.step([action[0], 'script'])
    else:
        obs, rewards, done, win = self.env.step(action)
    extra_reward = 0.0
    # penalize for turning too frequently
    # NOTE(review): `penalize_turning` is read from enclosing scope — verify
    # it is defined at module level.
    if arglist.competitive:
        if arglist.train_mode and penalize_turning:
            extra_reward -= np.abs(action[0]) / 10.0
            rewards[0] += extra_reward
        reward = rewards[0]
    else:
        if arglist.train_mode and penalize_turning:
            reward = np.sum(rewards)
            extra_reward -= np.abs(action[0]) / 10.0
            reward += extra_reward
    # recording_reward.append(reward)
    # total_reward += reward
    self.state = rnn_next_state(self.rnn, z, action, self.state)
    return action
def step(self, action):
    """Advance the dream environment one step using the learned RNN dynamics.

    Returns the usual gym tuple (obs, reward, done, info); obs carries only
    the world-model features.
    """
    self.rnn_state, self.z, reward, done = rnn_sim(
        self.rnn, self.z, self.rnn_state, action)
    features = rnn_output(self.rnn_state, self.z, self.features_mode)
    obs = OrderedDict(features=features)
    info = {}  # no additional diagnostics
    return obs, reward, done, info
def reset(self):
    """Reset the dream environment: fresh RNN state plus a sampled initial latent z."""
    self.rnn_state = rnn_init_state(self.rnn)
    self.z = self._sample_init_z()
    features = rnn_output(self.rnn_state, self.z, self.features_mode)
    return OrderedDict(features=features)