Example #1
    def change_is_goodz(self):
        # Apply parameter-space exploration: re-sample the perturbed copy of
        # the policy at the noise process's current stddev.
        sess = get_session()
        sess.run(self.perturb_policy_ops,
                 feed_dict={
                     self.param_noise_stddev: self.param_noise.current_stddev,
                 })
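
The ops behind self.perturb_policy_ops are not shown in this example. A minimal sketch of how such ops are typically built (following the parameter-noise scheme of Plappert et al.), assuming actor_vars and perturbed_actor_vars are matched lists of variables and param_noise_stddev is a scalar placeholder; the helper name here is hypothetical:

    import tensorflow as tf

    def build_perturb_ops(actor_vars, perturbed_actor_vars, param_noise_stddev):
        # For each actor variable, write a noisy copy into the matching
        # perturbed-actor variable: perturbed = actor + N(0, stddev).
        ops = []
        for var, perturbed_var in zip(actor_vars, perturbed_actor_vars):
            noise = tf.random_normal(tf.shape(var), mean=0.0,
                                     stddev=param_noise_stddev)
            ops.append(tf.assign(perturbed_var, var + noise))
        return tf.group(*ops)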
Example #2
    def train(self):
        mem_replay = self.memory_replay
        batch_size = self.batch_size

        # Start learning only once enough transitions have been collected.
        if len(self) > batch_size * 64:

            for i in range(self.train_multiplier):
                batch = mem_replay.sample(batch_size)
                sess = get_session()
                # One gradient step each for the critic and the actor; the
                # soft target update runs in the same session call.
                res = sess.run(
                    [
                        self.critic_loss, self.actor_loss, self.cstep,
                        self.astep, self.target_soft_updates
                    ],
                    feed_dict={
                        self.observation: batch['obs0'],
                        self.action: batch['actions'],
                        self.observation_after: batch['obs1'],
                        self.reward: batch['rewards'],
                        self.terminals1: batch['terminals_1'],
                        self.tau: 5e-4
                    })
                self.traincounter += 1
                if self.traincounter % 20 == 0:
                    print(' ' * 30,
                          'closs: {:6.4f} aloss: {:6.4f}'.format(
                              res[0], res[1]),
                          end='\r')
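
The memory_replay object is not defined in these examples; train() only assumes that sample() returns a dict with the keys fed above. A hypothetical minimal buffer with that interface (class name and capacity handling are assumptions):

    import numpy as np

    class SimpleReplayMemory:
        def __init__(self, limit):
            self.storage = []
            self.limit = limit

        def append(self, obs0, action, reward, obs1, terminal1):
            # Drop the oldest transition once the capacity limit is reached.
            if len(self.storage) >= self.limit:
                self.storage.pop(0)
            self.storage.append((obs0, action, reward, obs1, terminal1))

        def sample(self, batch_size):
            # Uniformly sample transitions and return them under the key
            # names train() expects.
            idx = np.random.randint(0, len(self.storage), size=batch_size)
            obs0, actions, rewards, obs1, terminals = zip(
                *[self.storage[i] for i in idx])
            return {
                'obs0': np.array(obs0),
                'actions': np.array(actions),
                'rewards': np.array(rewards).reshape(-1, 1),
                'obs1': np.array(obs1),
                'terminals_1': np.array(terminals).reshape(-1, 1),
            }

        def __len__(self):
            return len(self.storage)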
Example #3
    def __call__(self, obs):
        # Batch the single observation and query the perturbed actor along
        # with its Q estimate.
        input_observation = np.reshape(obs, (1, len(obs)))
        feed_dict = {self.observation: input_observation}
        sess = get_session()
        [a, q] = sess.run([self.perturbed_actor_tf, self.q_infer],
                          feed_dict=feed_dict)
        actions, q = a[0], q[0]

        # Visualization of the rescaled actions, noise, and Q values is
        # temporarily disabled here; action-space noise and clipping are
        # likewise left out in favor of the parameter-space perturbation.

        return actions
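
Since the agent implements __call__, the object itself can be used as a policy. A hypothetical rollout helper, assuming a Gym-style environment:

    def run_episode(env, agent):
        # Roll out one episode, querying the agent's __call__ for actions.
        obs = env.reset()
        done = False
        total_reward = 0.0
        while not done:
            action = agent(obs)
            obs, reward, done, info = env.step(action)
            total_reward += reward
        return total_reward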
Example #4
    def save_agent(self, i):
        # Checkpoint the TF graph and dump the replay memory; the paths are
        # hardcoded in the original.
        sess = get_session()
        self.saver.save(
            sess,
            "/home/daniel/Videos/underworld/underworld_dumpster/model/model",
            global_step=i)
        self.memory_replay.save(
            "/home/daniel/Videos/underworld/underworld_dumpster/mem.pickle" +
            str(i))
Example #5
    def load_agent(self, i):
        # Restore the checkpoint written by save_agent() and reload the
        # pickled replay memory for the same step index.
        sess = get_session()
        self.saver.restore(
            sess,
            "/home/daniel/Videos/underworld/underworld_dumpster/model/model-" +
            str(i))
        self.memory_replay.load(
            "/home/daniel/Videos/underworld/underworld_dumpster/mem.pickle" +
            str(i))
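
Note that save_agent() passes global_step=i, so tf.train.Saver appends "-<step>" to the base path; that is why load_agent() restores from "model-" + str(i). A minimal standalone round-trip showing that behavior (paths and the variable are placeholders):

    import os
    import tensorflow as tf

    v = tf.get_variable("v", shape=[3])
    saver = tf.train.Saver(max_to_keep=5)
    os.makedirs("/tmp/demo", exist_ok=True)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Writes /tmp/demo/model-42.index, .data-*, etc.
        saver.save(sess, "/tmp/demo/model", global_step=42)
        saver.restore(sess, "/tmp/demo/model-42")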
Example #6
    def adapt_param_noise(self):
        sess = get_session()
        # Perturb a separate copy of the policy to adjust the scale for the
        # next "real" perturbation.
        batch = self.memory_replay.sample(batch_size=self.batch_size)
        sess.run(self.perturb_adaptive_policy_ops,
                 feed_dict={
                     self.param_noise_stddev: self.param_noise.current_stddev,
                 })
        # Measure how far the perturbed policy's actions drift from the
        # unperturbed policy's on a sampled batch.
        distance = sess.run(self.adaptive_policy_distance,
                            feed_dict={
                                self.observation: batch['obs0'],
                                self.param_noise_stddev:
                                self.param_noise.current_stddev,
                            })

        # No MPI averaging in this single-process version; adapt on the raw
        # distance.
        self.param_noise.adapt(distance)
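
The param_noise object's adapt() is not shown. A minimal sketch of the usual adaptive rule (Plappert et al.): grow the stddev while the measured action-space distance stays below a target, shrink it once the distance exceeds the target. Class and parameter names here are assumptions:

    class AdaptiveParamNoise:
        def __init__(self, initial_stddev=0.1, desired_action_stddev=0.2,
                     adaptation_coefficient=1.01):
            self.current_stddev = initial_stddev
            self.desired_action_stddev = desired_action_stddev
            self.adaptation_coefficient = adaptation_coefficient

        def adapt(self, distance):
            # Shrink the noise when the policy drifted too far, grow it
            # when the perturbation had too little effect.
            if distance > self.desired_action_stddev:
                self.current_stddev /= self.adaptation_coefficient
            else:
                self.current_stddev *= self.adaptation_coefficient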
Example #7
    def sync_target(self, update='hard'):
        # 'hard' copies the online weights straight into the target networks;
        # anything else applies a soft (Polyak) update with tau = 5e-4.
        sess = get_session()
        if update == 'hard':
            sess.run(self.target_init_updates)
        else:
            sess.run(self.target_soft_updates, feed_dict={self.tau: 5e-4})
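
target_init_updates and target_soft_updates are built elsewhere. A minimal sketch under the usual DDPG convention, assuming matched source/target variable lists and a tau tensor or float (the helper name is hypothetical):

    import tensorflow as tf

    def build_target_updates(source_vars, target_vars, tau):
        # Hard update: copy source weights into the targets outright.
        init_updates = [tf.assign(t, s)
                        for s, t in zip(source_vars, target_vars)]
        # Soft update: target <- (1 - tau) * target + tau * source.
        soft_updates = [tf.assign(t, (1.0 - tau) * t + tau * s)
                        for s, t in zip(source_vars, target_vars)]
        return tf.group(*init_updates), tf.group(*soft_updates)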
Example #8
    def initialize(self):
        # Initialize all variables, then freeze the graph so later code
        # cannot accidentally add new ops to it.
        sess = get_session()
        sess.run(tf.global_variables_initializer())
        sess.graph.finalize()
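
get_session() itself is not part of these snippets. Judging by how the examples use it, it likely returns one shared TF1 session; a minimal sketch of such a helper (this exact implementation is an assumption):

    import tensorflow as tf

    _session = None

    def get_session():
        # Create a single shared session on first use, reuse it afterwards.
        global _session
        if _session is None:
            _session = tf.Session()
        return _session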