def get_actions(self, observations):
    flat_obs = self.observation_space.flatten_n(observations)
    if self.state_include_action:
        assert self.prev_actions is not None
        # The recurrent step conditions on the previous action as well.
        all_input = np.concatenate([flat_obs, self.prev_actions], axis=-1)
    else:
        all_input = flat_obs
    probs, hidden_vec = self.f_step_prob(all_input, self.prev_hiddens)
    actions = special.weighted_sample_n(probs, np.arange(self.action_space.n))
    prev_actions = self.prev_actions
    self.prev_actions = self.action_space.flatten_n(actions)
    self.prev_hiddens = hidden_vec
    agent_info = dict(prob=probs)
    if self.state_include_action:
        agent_info["prev_action"] = np.copy(prev_actions)
    return actions, agent_info
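# For reference, a minimal standalone sketch of the step-input construction
# above. It assumes a one-hot action encoding; the array shapes and the names
# n_agents/obs_dim/n_actions are illustrative, not the policy's real API.
import numpy as np

n_agents, obs_dim, n_actions = 4, 6, 3
flat_obs = np.random.randn(n_agents, obs_dim)
# One-hot previous actions, as action_space.flatten_n would produce.
prev_actions = np.eye(n_actions)[np.zeros(n_agents, dtype=int)]

# With state_include_action=True, the GRU step sees [observation, prev action]:
all_input = np.concatenate([flat_obs, prev_actions], axis=-1)
print(all_input.shape)  # (4, 9): obs_dim + n_actions columns per agent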
def get_actions(self, observations):
    # Figure out which agents need valid actions this step.
    agents_to_act = [
        i for i, obs in enumerate(observations) if obs != [None] * len(obs)
    ]
    agents_not_to_act = [
        i for i in range(len(observations)) if i not in agents_to_act
    ]
    if len(agents_to_act) == 0:
        # No agents are acting (shouldn't happen).
        return [None] * len(observations)
    # Copy a valid observation into slots holding [None] placeholders so the
    # batched forward pass has well-formed input for every row.
    valid_obs = next(obs for obs in observations if obs != [None] * len(obs))
    observations = [
        obs if obs != [None] * len(obs) else valid_obs for obs in observations
    ]
    flat_obs = self.observation_space.flatten_n(observations)
    if self.state_include_action:
        assert self.prev_actions is not None
        try:
            all_input = np.concatenate([flat_obs, self.prev_actions], axis=-1)
        except ValueError:
            # prev_actions may have been stored transposed; retry with its transpose.
            all_input = np.concatenate([flat_obs, self.prev_actions.T], axis=-1)
    else:
        all_input = flat_obs
    probs, hidden_vec = self.f_step_prob(all_input, self.prev_hiddens)
    actions = special.weighted_sample_n(probs, np.arange(self.action_space.n))
    # Don't update prev_actions or the hidden state for non-acting agents,
    # and replace their sampled actions with None before returning.
    prev_actions = self.prev_actions
    prev_actions_flattened = self.action_space.flatten_n(actions)
    actions = actions.tolist()
    for i in agents_not_to_act:
        hidden_vec[i] = self.prev_hiddens[i]
        prev_actions_flattened[i, :] = prev_actions[i, :]
        actions[i] = None
    self.prev_actions = prev_actions_flattened
    self.prev_hiddens = hidden_vec
    agent_info = dict(prob=probs)
    if self.state_include_action:
        agent_info["prev_action"] = np.copy(prev_actions)
    return actions, agent_info
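# A self-contained sketch of the [None]-placeholder convention handled above:
# inactive agents submit [None, ..., None] observations, get padded with a
# valid observation for the batched forward pass, and receive None actions
# back. fake_step stands in for f_step_prob and is purely illustrative.
import numpy as np

def fake_step(flat_obs):
    n_actions = 3
    # Pretend forward pass: uniform action probabilities for every agent.
    return np.full((flat_obs.shape[0], n_actions), 1.0 / n_actions)

observations = [[0.1, 0.2], [None, None], [0.3, 0.4]]  # agent 1 is inactive
acting = [i for i, o in enumerate(observations) if o != [None] * len(o)]
valid = next(o for o in observations if o != [None] * len(o))
padded = [o if o != [None] * len(o) else valid for o in observations]

probs = fake_step(np.asarray(padded, dtype=np.float64))
actions = [int(np.random.choice(len(p), p=p)) for p in probs]
actions = [a if i in acting else None for i, a in enumerate(actions)]
print(actions)  # e.g. [2, None, 0]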
def get_actions(self, observations):
    flat_obs = self.observation_space.flatten_n(observations)
    if self.state_include_action:
        assert self.prev_actions is not None
        all_input = np.concatenate([flat_obs, self.prev_actions], axis=-1)
    else:
        all_input = flat_obs
    probs, hidden_vec = self.f_step_prob(all_input, self.prev_hiddens)
    actions = special.weighted_sample_n(probs, np.arange(self.action_space.n))
    prev_actions = self.prev_actions
    self.prev_actions = self.action_space.flatten_n(actions)
    self.prev_hiddens = hidden_vec
    agent_info = dict(prob=probs)
    if self.state_include_action:
        agent_info["prev_action"] = np.copy(prev_actions)
    return actions, agent_info
def get_actions(self, observations):
    flat_obs = self.observation_space.flatten_n(observations)
    if self.hardcoded_q is not None:
        q_func = self.hardcoded_q
    else:
        q_func = tf.get_default_session().run(self.q_func)
    q_vals = flat_obs.dot(q_func)
    # Boltzmann (softmax) policy over Q-values, scaled by the inverse entropy
    # weight; subtract the row max for numerical stability before exp.
    qv = (1.0 / self.ent_wt) * q_vals
    qv = qv - np.max(qv, axis=1, keepdims=True)
    probs = np.exp(qv)
    probs = probs / np.sum(probs, axis=1, keepdims=True)
    actions = special.weighted_sample_n(probs, np.arange(self.action_space.n))
    agent_info = dict(prob=probs)
    return actions, agent_info
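# The softmax above is a Boltzmann policy over Q-values at temperature ent_wt.
# A hedged standalone sketch of the same computation (boltzmann_probs is an
# illustrative helper, not part of the codebase):
import numpy as np

def boltzmann_probs(q_vals, ent_wt):
    qv = q_vals / ent_wt
    qv = qv - np.max(qv, axis=1, keepdims=True)  # guard against overflow in exp
    e = np.exp(qv)
    return e / np.sum(e, axis=1, keepdims=True)

q = np.array([[1.0, 2.0, 3.0]])
print(boltzmann_probs(q, ent_wt=1.0))   # peaked toward the highest-Q action
print(boltzmann_probs(q, ent_wt=10.0))  # near-uniform: entropy dominates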
def get_actions(self, observations):
    flat_obs = self.observation_space.flatten_n(observations)
    if self.state_include_action:
        assert self.prev_actions is not None
        all_input = np.concatenate([flat_obs, self.prev_actions], axis=-1)
    else:
        all_input = flat_obs
    probs, hidden_vec = self.f_step_prob(all_input, self.prev_hiddens)
    actions = special.weighted_sample_n(probs, np.arange(self.action_space.n))
    prev_actions = self.prev_actions
    self.prev_actions = self.action_space.flatten_n(actions)
    self.prev_hiddens = hidden_vec
    agent_info = dict(prob=probs)
    if self.state_include_action:
        agent_info["prev_action"] = np.copy(prev_actions)
    return actions, agent_info
def weighted_sample_n(self, weights_matrix):
    return special.weighted_sample_n(weights_matrix, self._items_arr)
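# A plain-NumPy equivalent consistent with how special.weighted_sample_n is
# used throughout this file: row i of prob_matrix defines a categorical
# distribution over items, and one item is drawn per row. Illustrative sketch
# only, not the library's actual implementation.
import numpy as np

def weighted_sample_n_sketch(prob_matrix, items):
    cdfs = prob_matrix.cumsum(axis=1)                  # row-wise CDFs
    u = np.random.random_sample(prob_matrix.shape[0])  # one uniform draw per row
    k = (cdfs < u[:, None]).sum(axis=1)                # index of the sampled bin
    return items[np.minimum(k, len(items) - 1)]        # clamp rounding overshoot

probs = np.array([[0.1, 0.9], [0.5, 0.5]])
print(weighted_sample_n_sketch(probs, np.arange(2)))  # e.g. [1 0]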