def step(self, obs, expert_qv, expert_action, apply_noise=True, compute_Q=True):
    if self.param_noise is not None and apply_noise:
        actor_tf = self.perturbed_actor_tf
    else:
        actor_tf = self.actor_tf
    feed_dict = {
        self.obs0: U.adjust_shape(self.obs0, [obs]),
        self.expert_qv: U.adjust_shape(self.expert_qv, [expert_qv]),
        self.expert_actions: U.adjust_shape(self.expert_actions, [expert_action])
    }
    if compute_Q:
        action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
    else:
        action = self.sess.run(actor_tf, feed_dict=feed_dict)
        q = None
    if self.action_noise is not None and apply_noise:
        noise = self.action_noise()
        assert noise.shape == action[0].shape
        action += noise
    action = np.clip(action, self.action_range[0], self.action_range[1])
    return action, q, None, None
def act_interpolate(self, obs, states, b_states, dones):
    sess = self.model.sess
    act_model = self.model.act_model
    b_act_model = self.model_burnin.act_model
    feed_dict = {
        act_model.X: adjust_shape(act_model.X, obs),
        b_act_model.X: adjust_shape(b_act_model.X, obs),
    }
    if states is not None:
        feed_dict.update({
            act_model.S: adjust_shape(act_model.S, states),
            b_act_model.S: adjust_shape(b_act_model.S, b_states),
        })
    variables = [
        act_model.action_run, act_model.vf_run, act_model.state,
        b_act_model.vf_run, b_act_model.state, act_model.neglogp_run,
        act_model.latent_mean, b_act_model.latent_mean, b_act_model.action_run
    ]
    a, v, state, b_v, b_state, neglogp, lm, b_lm, b_a = sess.run(variables, feed_dict)
    if state.size == 0:
        state = None
    if b_state.size == 0:
        b_state = None
    return a, v, b_v, state, b_state, neglogp, lm, b_lm, b_a
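A hedged usage sketch for act_interpolate above: a runner would thread two parallel recurrent state streams, one for the main model and one for the burn-in model, through each call. The names runner, env, obs, dones, nsteps, and the initial_state attributes are assumptions for illustration, not names from the original.

# Illustrative rollout loop; all names outside act_interpolate are assumed.
states = runner.model.initial_state           # main-model RNN state (or None)
b_states = runner.model_burnin.initial_state  # burn-in model RNN state (or None)
for _ in range(nsteps):
    a, v, b_v, states, b_states, neglogp, lm, b_lm, b_a = runner.act_interpolate(
        obs, states, b_states, dones)
    obs, rewards, dones, infos = env.step(a)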
def step(self, obs, apply_noise=True, compute_Q=True, states=None, masks=None):
    if self.param_noise is not None and apply_noise:
        actor_tf = self.perturbed_actor_tf
    else:
        actor_tf = self.actor_tf
    if states is not None and masks is not None:
        feed_dict = {
            self.obs0: U.adjust_shape(self.obs0, [obs]),
            self.mask0: U.adjust_shape(self.mask0, [masks]),
            self.state0: U.adjust_shape(self.state0, [states])
        }
    else:
        feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
    if compute_Q:
        action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
    else:
        action = self.sess.run(actor_tf, feed_dict=feed_dict)
        q = None
    if self.action_noise is not None and apply_noise:
        noise = self.action_noise()
        assert noise.shape == action[0].shape
        action += noise
    action = np.clip(action, self.action_range[0], self.action_range[1])
    return action, q, None, None
def prob(self, observation, a):
    sess = self.sess or tf.get_default_session()
    feed_dict = {
        self.X: adjust_shape(self.X, observation),
        self.action_ph: adjust_shape(self.action_ph, a)
    }
    return sess.run(self.pdf, feed_dict)
def __call__(self, obs, action):
    # Query the expert critic for Q(obs, action).
    feed_dict = {self.obs0: U.adjust_shape(self.obs0, obs),
                 self.actions: U.adjust_shape(self.actions, action)}
    q = self.sess.run([self.critic_tf], feed_dict=feed_dict)
    return q
def get_batch_bonus_and_update(self, observation, **extra_feed):
    sess = self.sess
    feed_dict = {self.X: adjust_shape(self.X, observation)}
    for inpt_name, data in extra_feed.items():
        if inpt_name in self.__dict__.keys():
            inpt = self.__dict__[inpt_name]
            if isinstance(inpt, tf.Tensor) and inpt._op.type == 'Placeholder':
                feed_dict[inpt] = adjust_shape(inpt, data)
    # Run the bonus and the training op together; return only the bonus.
    result = sess.run({"bonus": self.bonus, "train": self.train}, feed_dict)
    return result["bonus"]
def _evaluate(self, variables, observation, **extra_feed):
    sess = self.sess
    feed_dict = {self.X: adjust_shape(self.X, observation)}
    for inpt_name, data in extra_feed.items():
        if inpt_name in self.__dict__.keys():
            inpt = self.__dict__[inpt_name]
            if isinstance(inpt, tf.Tensor) and inpt._op.type == 'Placeholder':
                feed_dict[inpt] = adjust_shape(inpt, data)
    return sess.run(variables, feed_dict)
def _evaluate(self, variables, observation, **extra_feed):
    sess = self.sess or tf.get_default_session()
    feed_dict = {self.X: adjust_shape(self.X, observation)}
    for inpt_name, data in extra_feed.items():
        if inpt_name in self.__dict__.keys():
            inpt = self.__dict__[inpt_name]
            if isinstance(inpt, tf.Tensor) and inpt._op.type == 'Placeholder':
                feed_dict[inpt] = adjust_shape(inpt, data)
    # sess.run returns outputs in the same structure as `variables`, so
    # fetching a specific output means passing just that tensor.
    return sess.run(variables, feed_dict)
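A hedged usage sketch for the _evaluate pattern above, assuming a recurrent policy object pi with placeholder attributes S (RNN state) and M (done mask) in the style of baselines' policies; all names here are illustrative, not from the original.

# Fetch action, value, and next RNN state in one session call; the keyword
# names S and M are matched to pi.S and pi.M by the extra_feed loop above.
action, value, state = pi._evaluate(
    [pi.action, pi.vf, pi.state],  # ask for exactly the tensors you need
    obs,
    S=rnn_state,
    M=done_mask)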
def _evaluate(self, variables, observation, **extra_feed):
    sess = self.sess
    feed_dict = {self.X: adjust_shape(self.X, observation)}
    for inpt_name, data in extra_feed.items():
        if inpt_name in self.__dict__.keys():
            inpt = self.__dict__[inpt_name]
            if isinstance(inpt, tf.Tensor) and inpt._op.type == 'Placeholder':
                feed_dict[inpt] = adjust_shape(inpt, data)
    return sess.run(variables, feed_dict)
def step(self, obs, apply_noise=True, compute_Q=True):
    if self.param_noise is not None and apply_noise:
        actor_tf = self.perturbed_actor_tf
    else:
        actor_tf = self.actor_tf
    feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
    if compute_Q:
        action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
    else:
        action = self.sess.run(actor_tf, feed_dict=feed_dict)
        q = None
    if self.action_noise is not None and apply_noise:
        noise = self.action_noise()
        assert noise.shape == action[0].shape
        action += noise
    action = np.clip(action, self.action_range[0], self.action_range[1])
    return action[0], q, None, None
def step(self, obs, apply_noise=True, compute_Q=True):
    """Apply the policy.

    Note on noise: when *deploying* DDPG (e.g. via the `--play` option),
    apply_noise should generally be set to False.
    """
    if self.param_noise is not None and apply_noise:
        actor_tf = self.perturbed_actor_tf
    else:
        actor_tf = self.actor_tf
    feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
    if compute_Q:
        action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
    else:
        action = self.sess.run(actor_tf, feed_dict=feed_dict)
        q = None
    if self.action_noise is not None and apply_noise:
        noise = self.action_noise()
        # daniel: with my fix, both noise and action are (num_envs, ac_dim).
        assert noise.shape == action.shape, '{} {}'.format(noise.shape, action.shape)
        action += noise
    action = np.clip(action, self.action_range[0], self.action_range[1])
    return action, q, None, None
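A hedged usage sketch contrasting training with deployment for the DDPG step above, as its docstring suggests; agent and env are assumed names, not from the original.

# Training: keep exploration (parameter and/or action) noise on.
action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True)

# Deployment (e.g. --play): greedy action, no noise, skip the Q evaluation.
obs = env.reset()
done = False
while not done:
    action, _, _, _ = agent.step(obs, apply_noise=False, compute_Q=False)
    obs, reward, done, info = env.step(action)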
def _evaluate(self, variables, observation, **extra_feed):
    sess = self.sess
    feed_dict = {}
    if self.is_list_obs:
        # One placeholder per observation component.
        for idx, X in enumerate(self.X):
            feed_dict[X] = adjust_shape(X, observation[idx])
    else:
        feed_dict[self.X] = adjust_shape(self.X, observation)
    for inpt_name, data in extra_feed.items():
        if inpt_name in self.__dict__.keys():
            inpt = self.__dict__[inpt_name]
            if isinstance(inpt, tf.Tensor) and inpt._op.type == 'Placeholder':
                feed_dict[inpt] = adjust_shape(inpt, data)
    return sess.run(variables, feed_dict)
def adv_gradient(self, obs, reward, actions, old_obs):
    feed_dict = {
        self.X: adjust_shape(self.X, obs),
        self.reward: adjust_shape(self.reward, reward),
        self.action: adjust_shape(self.action, actions),
        self.old_X: adjust_shape(self.old_X, old_obs),
    }
    return self.sess.run(self.grads, feed_dict)
def step(self, obs, apply_noise=True, compute_Q=True):
    if self.param_noise is not None and apply_noise:
        actor_tf = self.perturbed_actor_tf
    else:
        actor_tf = self.actor_tf
    # Feed the observation only to obs0.
    feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
    if compute_Q:
        action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
    else:
        action = self.sess.run(actor_tf, feed_dict=feed_dict)
        q = None
    # Add noise to the action for exploration.
    if self.action_noise is not None and apply_noise:
        noise = self.action_noise()
        assert noise.shape == action[0].shape
        action += noise
    action = np.clip(action, self.action_range[0], self.action_range[1])
    return action, q, None, None
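Several of the step variants above draw exploration noise from self.action_noise(). Below is a minimal, self-contained Ornstein-Uhlenbeck process in the spirit of what DDPG commonly uses; the class name and parameter defaults are assumptions, not the original implementation.

import numpy as np

class OrnsteinUhlenbeckNoise:
    """Temporally correlated noise: dx = theta*(mu - x)*dt + sigma*sqrt(dt)*N(0,1)."""

    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.reset()

    def __call__(self):
        # One Euler step of the OU stochastic differential equation.
        self.x = (self.x
                  + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.mu.shape))
        return self.x

    def reset(self):
        # Restart the process at the mean, typically at episode boundaries.
        self.x = np.zeros_like(self.mu)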
def __call__(self, obs):
    # Query the expert actor for its action at obs.
    feed_dict = {self.obs0: U.adjust_shape(self.obs0, obs)}
    action = self.sess.run([self.actor_tf], feed_dict=feed_dict)
    return action
def learnt_step(self, obs):
    actor_tf = self.actor_tf
    feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
    action = self.sess.run(actor_tf, feed_dict=feed_dict)
    q = None
    action = np.clip(action, self.action_range[0], self.action_range[1])
    return action, q, None, None
def _evaluate(self, variables, observation, **extra_feed):
    sess = self.sess
    feed_dict = {self.X: adjust_shape(self.X, observation)}
    for inpt_name, data in extra_feed.items():
        if inpt_name in self.__dict__.keys():
            inpt = self.__dict__[inpt_name]
            if isinstance(inpt, tf.Tensor) and inpt._op.type == 'Placeholder':
                feed_dict[inpt] = adjust_shape(inpt, data)
    return sess.run(variables, feed_dict)
def cal_neglogp(self, observation, a):
    """Given an action a, calculate its negative log probability under the policy."""
    sess = self.sess
    feed_dict = {
        self.X: adjust_shape(self.X, observation),
        self.action_modified: a
    }
    return sess.run(self.neglogp_modified, feed_dict)
def step(self, obs, compute_Q=True):
    feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
    if compute_Q:
        action, q = self.sess.run([self.actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
    else:
        action = self.sess.run(self.actor_tf, feed_dict=feed_dict)
        q = None
    action = np.clip(action, self.action_range[0], self.action_range[1])
    return action, q, None, None
def _evaluate(self, variables, observation, action_mask=None, **extra_feed):
    sess = self.sess
    if action_mask is None:
        # Default to an all-valid mask over the fixed 5-action space.
        action_mask = np.ones((5, len(observation[0])), dtype=bool)
    feed_dict = {
        self.X: adjust_shape(self.X, observation),
        self._action_mask_ph: action_mask
    }
    # NOTE: the mask placeholder can interfere with other algorithms.
    for inpt_name, data in extra_feed.items():
        if inpt_name in self.__dict__.keys():
            inpt = self.__dict__[inpt_name]
            if isinstance(inpt, tf.Tensor) and inpt._op.type == 'Placeholder':
                feed_dict[inpt] = adjust_shape(inpt, data)
    return sess.run(variables, feed_dict)
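A hedged usage sketch for the masked _evaluate above. The default mask is all-ones with shape (5, len(observation[0])), i.e. one row per action in the fixed 5-action space; here we mark one action invalid. policy, obs, and the fetched tensors are illustrative names, not from the original.

import numpy as np

batch = len(obs[0])
mask = np.ones((5, batch), dtype=bool)
mask[3, :] = False  # action 3 is unavailable for every batch element
action, value = policy._evaluate([policy.action, policy.vf], obs, action_mask=mask)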
def _evaluate(self, variables, observation, **extra_feed):
    sess = self.sess
    if isinstance(self.X, list):
        # One placeholder per observation component (e.g. image plus
        # low-dimensional state); feed each with its matching component.
        feed_dict = {}
        for count, x in enumerate(self.X):
            feed_dict[x] = adjust_shape(x, observation[count])
    else:
        feed_dict = {self.X: adjust_shape(self.X, observation)}
    for inpt_name, data in extra_feed.items():
        if inpt_name in self.__dict__.keys():
            inpt = self.__dict__[inpt_name]
            if isinstance(inpt, tf.Tensor) and inpt._op.type == 'Placeholder':
                feed_dict[inpt] = adjust_shape(inpt, data)
    return sess.run(variables, feed_dict)
def step(self, obs, apply_noise=True, compute_Q=True):
    if self.param_noise is not None and apply_noise:
        actor_tf = self.perturbed_actor_tf
    else:
        actor_tf = self.actor_tf
    feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
    if compute_Q:
        action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
    else:
        action = self.sess.run(actor_tf, feed_dict=feed_dict)
        q = None
    if self.action_noise is not None and apply_noise:
        noise = self.action_noise()
        assert noise.shape == action[0].shape
        action += noise
    action = np.clip(action, self.action_range[0], self.action_range[1])
    return action, q, None, None
def step(self, obs, apply_noise=True, compute_Q=True):
    if self.param_noise is not None and apply_noise:
        actor_tf = self.perturbed_actor_tf
    else:
        actor_tf = self.actor_tf
    feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
    if compute_Q:
        action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
    else:
        action = self.sess.run(actor_tf, feed_dict=feed_dict)
        q = None
    if self.action_noise is not None and apply_noise:
        noise = self.action_noise()
        assert noise.shape == action[0].shape
        action += noise
    # Clip, then drop the leading batch dimension.
    action = np.clip(action, self.action_range[0], self.action_range[1])[0]
    return action, q, None, None
def step(self, obs, apply_noise=False):
    """Apply the policy, no noise added.

    Returns a 4-tuple, only for compatibility with other code. We just care
    about returning the action as the first argument. Leaving apply_noise
    for compatibility with `baselines/run.py`.
    """
    if not self.use_keras:
        # obs comes from the env at (B,224,224,4); resize each frame to our
        # chosen smaller (x,x) spatial dimension before calling the model.
        obs_new = []
        for b in range(obs.shape[0]):
            resized = cv2.resize(obs[b], (self.obs_shape[0], self.obs_shape[1]))
            obs_new.append(resized)
        obs_new = np.array(obs_new)
        assert obs_new.shape == (obs.shape[0], self.obs_shape[0], self.obs_shape[1], 4)
        obs = obs_new
    actor_tf = self.actor_tf
    feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
    action = self.sess.run(actor_tf, feed_dict=feed_dict)
    action = np.clip(action, self.action_range[0], self.action_range[1])
    return action, None, None, None
def make_feed_dict(self, data):
    if isinstance(self._placeholder, list):
        adj_data = adjust_shape(self._placeholder, data)
        return dict(zip(self._placeholder, adj_data))
    else:
        return {self._placeholder: adjust_shape(self._placeholder, data)}
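A hedged usage sketch for make_feed_dict above. The list branch assumes an adjust_shape variant that handles a list of placeholders elementwise; with a single placeholder it reduces to the one-line version further below. The wrapper and data names are illustrative.

# Single placeholder: data feeds it directly.
feed = input_wrapper.make_feed_dict(batch_obs)

# List of placeholders (e.g. one per observation component): elements are
# zipped with their matching placeholders.
feed = multi_input_wrapper.make_feed_dict([obs_a, obs_b])
values = sess.run(outputs, feed_dict=feed)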
def evaluate(self, vars, input):
    sess = get_session()
    feed_dict = {self.X: adjust_shape(self.X, input)}
    return sess.run(vars, feed_dict)
def compute_Q(self, obs0_n, actions_n):
    q = self.sess.run(self.critic_with_actor_tf, feed_dict={
        self.obs0_n: U.adjust_shape(self.obs0_n, [obs0_n]),
        self.actions_n: U.adjust_shape(self.actions_n, [actions_n])
    })
    return q[0, 0]
def make_feed_dict(self, data):
    return {self._placeholder: adjust_shape(self._placeholder, data)}
def evaluate(self, obs):
    # NOTE: this creates a fresh placeholder on every call, which grows the
    # graph; the placeholder is normally created once and reused.
    eval_X = observation_placeholder(self.ob_space, batch_size=self.nenvs)
    sess = get_session()
    feed_dict = {eval_X: adjust_shape(eval_X, obs)}
    return sess.run(self.out, feed_dict)
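Every snippet above leans on adjust_shape to reconcile incoming data with a placeholder's static shape. Below is a minimal sketch of that contract, assuming unknown (None) dimensions simply become -1 in a reshape; it is an illustration, not the library's exact implementation.

import numpy as np

def adjust_shape_sketch(placeholder, data):
    """Reshape data so every known placeholder dim matches, letting the
    unknown (batch) dimension absorb the remainder. Assumed semantics."""
    if not isinstance(data, (np.ndarray, list)):
        return data  # scalars and other feeds pass through untouched
    shape = [x if x is not None else -1 for x in placeholder.shape.as_list()]
    return np.reshape(np.asarray(data), shape)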