def change_is_goodz(self):
    # Re-perturb the policy parameters using the current parameter-noise stddev.
    sess = get_session()
    sess.run(self.perturb_policy_ops, feed_dict={
        self.param_noise_stddev: self.param_noise.current_stddev,
    })
def train(self):
    mem_replay = self.memory_replay
    batch_size = self.batch_size
    # Only start training once the replay memory holds enough transitions;
    # len(self) is assumed to report the number of stored transitions.
    if len(self) > batch_size * 64:
        for i in range(self.train_multiplier):
            batch = mem_replay.sample(batch_size)
            sess = get_session()
            res = sess.run(
                [
                    self.critic_loss, self.actor_loss, self.cstep,
                    self.astep, self.target_soft_updates
                ],
                feed_dict={
                    self.observation: batch['obs0'],
                    self.action: batch['actions'],
                    self.observation_after: batch['obs1'],
                    self.reward: batch['rewards'],
                    self.terminals1: batch['terminals_1'],
                    self.tau: 5e-4
                })
            # self.sync_target(update='soft')  # not needed: the soft update op
            # is already included in the run list above.
            self.traincounter += 1
            if self.traincounter % 20 == 0:
                print(' ' * 30,
                      'closs: {:6.4f} aloss: {:6.4f}'.format(res[0], res[1]),
                      end='\r')
def __call__(self, obs):
    # Query the (parameter-noise perturbed) actor for a single observation.
    input_observation = np.reshape(obs, (1, len(obs)))
    feed_dict = {self.observation: input_observation}
    sess = get_session()
    [a, q] = sess.run([self.perturbed_actor_tf, self.q_infer],
                      feed_dict=feed_dict)
    actions, q = a[0], q[0]
    # Noise/Q visualisation via loggraph is temporarily disabled.
    # Additive action noise and clipping (as in the baselines DDPG agent)
    # are intentionally not applied here; exploration comes from the
    # perturbed actor itself.
    return actions
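# Usage sketch for __call__ (illustrative only, not part of this class):
# assumes a gym-style environment whose observation is a flat vector matching
# self.observation, and an already-constructed `agent` instance.
#
#     import gym
#     env = gym.make('Pendulum-v0')   # example environment, old gym API
#     obs = env.reset()
#     for _ in range(200):
#         action = agent(obs)         # perturbed actor, no extra action noise
#         obs, reward, done, _ = env.step(action)
#         if done:
#             obs = env.reset()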
def save_agent(self, i):
    sess = get_session()
    self.saver.save(
        sess,
        "/home/daniel/Videos/underworld/underworld_dumpster/model/model",
        global_step=i)
    self.memory_replay.save(
        "/home/daniel/Videos/underworld/underworld_dumpster/mem.pickle" + str(i))
def load_agent(self, i):
    sess = get_session()
    self.saver.restore(
        sess,
        "/home/daniel/Videos/underworld/underworld_dumpster/model/model-" + str(i))
    self.memory_replay.load(
        "/home/daniel/Videos/underworld/underworld_dumpster/mem.pickle" + str(i))
def adapt_param_noise(self):
    sess = get_session()
    # Perturb a separate copy of the policy to adjust the scale used for the
    # next "real" perturbation.
    batch = self.memory_replay.sample(batch_size=self.batch_size)
    sess.run(self.perturb_adaptive_policy_ops, feed_dict={
        self.param_noise_stddev: self.param_noise.current_stddev,
    })
    # Measure how far the adaptively perturbed policy drifts from the
    # unperturbed policy on a sampled batch, then rescale the noise.
    distance = sess.run(self.adaptive_policy_distance, feed_dict={
        self.observation: batch['obs0'],
        self.param_noise_stddev: self.param_noise.current_stddev,
    })
    self.param_noise.adapt(distance)
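# For context: self.param_noise is assumed to follow the adaptive scheme from
# the parameter-space-noise literature (e.g. baselines' AdaptiveParamNoiseSpec):
# grow the stddev while the measured action-space distance stays below a target,
# shrink it otherwise. A minimal illustrative stand-in (class name and
# coefficients are assumptions, not this project's actual implementation):
#
#     class AdaptiveParamNoise:
#         def __init__(self, initial_stddev=0.1, desired_action_stddev=0.2,
#                      adoption_coefficient=1.01):
#             self.current_stddev = initial_stddev
#             self.desired_action_stddev = desired_action_stddev
#             self.adoption_coefficient = adoption_coefficient
#
#         def adapt(self, distance):
#             if distance > self.desired_action_stddev:
#                 self.current_stddev /= self.adoption_coefficient  # too far: shrink
#             else:
#                 self.current_stddev *= self.adoption_coefficient  # too timid: grow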
def sync_target(self, update='hard'):
    sess = get_session()
    if update == 'hard':
        # Copy the online network weights straight into the target networks.
        sess.run(self.target_init_updates)
    else:
        # Polyak averaging: target <- tau * online + (1 - tau) * target.
        sess.run(self.target_soft_updates, feed_dict={self.tau: 5e-4})
def initialize(self):
    sess = get_session()
    sess.run(tf.global_variables_initializer())
    # Finalize the graph so no further ops can be added accidentally.
    sess.graph.finalize()
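# End-to-end sketch of how these methods might be wired together. Hedged: the
# constructor, the replay-feeding call (memory_replay.add(...)) and the
# environment are assumptions, not defined in this file.
#
#     agent = Agent(...)                 # hypothetical constructor
#     agent.initialize()
#     agent.sync_target(update='hard')   # start with target == online networks
#     agent.change_is_goodz()            # draw the first parameter perturbation
#
#     obs = env.reset()
#     for step in range(100000):
#         action = agent(obs)
#         obs1, reward, done, _ = env.step(action)
#         agent.memory_replay.add(obs, action, reward, obs1, done)  # hypothetical API
#         agent.train()
#         obs = env.reset() if done else obs1
#         if step % 50 == 0:
#             agent.adapt_param_noise()  # rescale parameter noise
#             agent.change_is_goodz()    # re-perturb with the new stddev
#         if step % 10000 == 0:
#             agent.save_agent(step)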