Example #1
class A3C_Thread(object):
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device,
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task"):

        self.thread_index = thread_index  # Thread index

        self._set_local_network(device, network_scope, scene_scope,
                                task_scope)  # Set local network

        self.sync = self.local_network.sync_from(
            global_network)  # Op to sync weights from the global network

        self.learning_rate_input = learning_rate_input  # Set learning rate

        self.max_global_time_step = max_global_time_step  # Set maximum of global time step

        self._set_trainer_optimizer(device, global_network,
                                    grad_applier)  # Set trainer

        self._set_environment(initial_learning_rate)  # Set environment

    # Create local network
    def _set_local_network(self, device, network_scope, scene_scope,
                           task_scope):
        self.local_network = DRLNetwork(action_size=ACTION_SIZE,
                                        device=device,
                                        network_scope=network_scope,
                                        scene_scopes=[scene_scope])

        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]
        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

    # Set trainer and optimizer
    # Set up the Actor-Critic gradient accumulation and the shared optimizer
    # Use the gradient-accumulating AccumTrainer from Zhu
    def _set_trainer_optimizer(self, device, global_network, grad_applier):
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        accum_grad_names = [
            self._local_var_name(x)
            for x in self.trainer.get_accum_grad_list()
        ]
        global_net_vars = [
            x for x in global_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]

        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())
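        # These ops drive the update cycle in process(): reset_gradients clears
        # the local accumulators, accum_gradients adds a batch of gradients, and
        # apply_gradients pushes the accumulated gradients to the matching
        # global-network variables through the shared grad_applier.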

    def _local_var_name(self, var):
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'
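    # Illustrative mapping for a variable named "thread-1/scene/W:0" (the name
    # is hypothetical): _local_var_name -> "scene/W:0",
    # _get_accum_grad_name -> "scene/W_0_accum_grad:0".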

    # Initialize environment and episode statistics
    def _set_environment(self, initial_learning_rate):
        self.episode_max_q = -np.inf
        self.env = None
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        self.episode_length = 0

    # Choose one action according to the pi values
    def choose_action(self, pi_values):
        action = np.random.choice(np.arange(len(pi_values)), p=pi_values)
        return action

    # Take up to LOCAL_T_MAX steps in one call
    # and update the accumulated gradients
    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):

        if self.env is None:
            # lazy initialization: create the environment on first use;
            # the sleep staggers environment creation across threads
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })
        start_local_t = self.local_t

        # Initialization
        states = []
        actions = []
        rewards = []
        values = []
        targets = []
        terminal_end = False

        # Reset accumulated gradient variables
        sess.run(self.reset_gradients)
        # Obtain shared parameters from the global network
        sess.run(self.sync)

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.env.s_t, self.env.target, self.scopes)

            pi_ = np.array(pi_) / np.sum(pi_)
            action = self.choose_action(pi_)

            states.append(self.env.s_t)
            actions.append(action)
            values.append(value_)
            targets.append(self.env.target)

            if VERBOSE and (self.thread_index
                            == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("%s:" % self.scene_scope)
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            # process game
            self.env.step(action)

            # receive game result
            reward = self.env.reward
            terminal = self.env.terminal

            # ad-hoc reward for navigation
            # reward = 10.0 if terminal else -0.01
            if self.episode_length > 5e3: terminal = True

            self.episode_reward += reward
            self.episode_length += 1
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            if terminal:
                terminal_end = True
                sys.stdout.write(
                    "#Thread: %d \n time %d | thread #%d | scene %s | target #%s\n%s %s episode reward = %.3f\n%s %s episode length = %d\n%s %s episode max Q  = %.3f\n"
                    % (self.thread_index, global_t, self.thread_index,
                       self.scene_scope, self.task_scope, self.scene_scope,
                       self.task_scope, self.episode_reward, self.scene_scope,
                       self.task_scope, self.episode_length, self.scene_scope,
                       self.task_scope, self.episode_max_q))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)
                self.episode_reward = 0
                self.episode_length = 0
                self.episode_max_q = -np.inf
                self.env.reset()

                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.env.s_t,
                                             self.env.target, self.scopes)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []
        batch_t = []

        # compute and accumulate gradients
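        # Walking the trajectory in reverse lets R accumulate the discounted
        # n-step return, R_i = r_i + GAMMA * R_{i+1}; td = R - V is the
        # advantage estimate fed to the policy-gradient loss.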
        for (ai, ri, si, Vi, ti) in zip(actions, rewards, states, values,
                                        targets):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
            batch_t.append(ti)

        sess.run(self.accum_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.t: batch_t,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R
                 })
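        # The feed above maps: s = states, a = one-hot actions, t = targets,
        # td = advantages, r = n-step returns.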

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write(
                "#Thread-%d-%s-Local timestep-%d\n" %
                (self.thread_index, self.scene_scope, self.local_t))

        # return the number of local steps advanced
        diff_local_t = self.local_t - start_local_t
        return diff_local_t

    def _record_score(self, sess, writer, summary_op, placeholders, values,
                      global_t):
        feed_dict = {}
        for k in placeholders:
            feed_dict[placeholders[k]] = values[k]
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        if VERBOSE:
            sys.stdout.write('writing to summary writer at time %d\n' %
                             (global_t))
        writer.add_summary(summary_str, global_t)
        # writer.flush()

    def _anneal_learning_rate(self, global_time_step):
        time_step_to_go = max(self.max_global_time_step - global_time_step,
                              0.0)
        learning_rate = self.initial_learning_rate * time_step_to_go / self.max_global_time_step
        return learning_rate
Example #2
class Train(object):
    def __init__(self):
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)

        # self.evluation_gap = 10**6
        print(MAX_TIME_STEP)
        self.device = "/gpu:0" if USE_GPU else "/cpu:0"
        self.network_scope = TASK_TYPE
        self.list_of_tasks = TASK_LIST
        self.scene_scopes = self.list_of_tasks.keys()
        self.global_t = 0
        self.stop_requested = False

        self.initial_learning_rate = self.log_uniform(LR_ALPHA_LOW,
                                                      LR_ALPHA_HIGH,
                                                      LR_ALPHA_LOG_RATE)

        self.global_network = DRLNetwork(action_size=ACTION_SIZE,
                                         device=self.device,
                                         network_scope=self.network_scope,
                                         scene_scopes=self.scene_scopes)

        self.branches = []
        for scene in self.scene_scopes:
            for task in self.list_of_tasks[scene]:
                self.branches.append((scene, task))
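        # branches enumerates every (scene, target) pair; worker threads are
        # assigned to branches round-robin via i % NUM_TASKS below.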

        self.NUM_TASKS = len(self.branches)
        assert NUM_THREADS >= self.NUM_TASKS, \
            "Not enough threads for multitasking: at least {} threads needed.".format(self.NUM_TASKS)

        self.learning_rate_input = tf.placeholder("float")
        self.grad_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=RMSP_ALPHA,
            momentum=0.0,
            epsilon=RMSP_EPSILON,
            clip_norm=GRAD_NORM_CLIP,
            device=self.device)
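        # A single RMSProp applier is shared by all worker threads; the learning
        # rate is fed through learning_rate_input and annealed at each update.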

        # instantiate each training thread
        # each thread is training for one target in one scene
        self.training_threads = []
        for i in range(NUM_THREADS):
            scene, task = self.branches[i % self.NUM_TASKS]
            training_thread = ADQN_Thread(i,
                                          self.global_network,
                                          self.initial_learning_rate,
                                          self.learning_rate_input,
                                          self.grad_applier,
                                          MAX_TIME_STEP,
                                          device=self.device,
                                          network_scope="thread-%d" % (i + 1),
                                          scene_scope=scene,
                                          task_scope=task)
            self.training_threads.append(training_thread)

    def log_uniform(self, lo, hi, rate):
        log_lo = np.log(lo)
        log_hi = np.log(hi)
        v = log_lo * (1 - rate) + log_hi * rate
        return np.exp(v)
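    # log_uniform interpolates linearly in log space, e.g.
    # log_uniform(1e-4, 1e-2, 0.5) == exp((ln(1e-4) + ln(1e-2)) / 2) ~= 1e-3.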

    def train(self):
        # prepare session
        self.sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=False, allow_soft_placement=True))

        init = tf.global_variables_initializer()
        self.sess.run(init)

        # create tensorboard summaries
        self.create_summary()
        self.summary_writer = tf.summary.FileWriter(LOG_FILE, self.sess.graph)

        # init or load checkpoint with saver
        # if you don't need to be able to resume training, use the next line instead.
        # it will result in a much smaller checkpoint file.
        self.saver = tf.train.Saver(max_to_keep=10,
                                    var_list=self.global_network.get_vars())
        # saver = tf.train.Saver(max_to_keep=10)

        self.checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
        if self.checkpoint and self.checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess,
                               self.checkpoint.model_checkpoint_path)
            print("checkpoint loaded: {}".format(
                self.checkpoint.model_checkpoint_path))
            tokens = self.checkpoint.model_checkpoint_path.split("-")
            # set global step
            self.global_t = int(tokens[1])
            print(">>> global step set: {}".format(self.global_t))
        else:
            print("Could not find old checkpoint")

        train_threads = []
        for i in range(NUM_THREADS):
            train_threads.append(
                threading.Thread(target=self.train_function, args=(i, )))

        signal.signal(signal.SIGINT, self.signal_handler)

        # start each training thread
        for t in train_threads:
            t.start()

        print('Press Ctrl+C to stop.')
        signal.pause()

        # wait for all threads to finish
        for t in train_threads:
            t.join()

        print('Now saving data. Please wait.')
        self.saver.save(self.sess,
                        CHECKPOINT_DIR + '/' + 'checkpoint',
                        global_step=self.global_t)
        self.summary_writer.close()

    def create_summary(self):
        self.summary_op = dict()
        self.summary_placeholders = dict()
        for i in range(NUM_THREADS):
            scene, task = self.branches[i % self.NUM_TASKS]
            key = scene + "-" + task

            # summary for tensorboard
            episode_reward_input = tf.placeholder("float")
            episode_length_input = tf.placeholder("float")
            #episode_max_q_input  = tf.placeholder("float")

            scalar_summaries = [
                tf.summary.scalar(key + "/Episode Reward",
                                  episode_reward_input),
                tf.summary.scalar(key + "/Episode Length",
                                  episode_length_input)
                #tf.summary.scalar(key+"/Episode Max Q", episode_max_q_input)
            ]

            self.summary_op[key] = tf.summary.merge(scalar_summaries)
            self.summary_placeholders[key] = {
                "episode_reward_input": episode_reward_input,
                "episode_length_input": episode_length_input,
                #"episode_max_q_input": episode_max_q_input,
                "learning_rate_input": self.learning_rate_input
            }
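            # Keys here must match the summary_values dict built in each
            # worker's process(); _record_score iterates over these
            # placeholders, so extra keys in summary_values are ignored.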

    def train_function(self, parallel_index):
        training_thread = self.training_threads[parallel_index]
        last_global_t = 0

        scene, task = self.branches[parallel_index % self.NUM_TASKS]
        key = scene + "-" + task
        while self.global_t < MAX_TIME_STEP and not self.stop_requested:
            diff_global_t = training_thread.process(
                self.sess, self.global_t, self.summary_writer,
                self.summary_op[key], self.summary_placeholders[key])
            self.global_t += diff_global_t
            # periodically save checkpoints to disk
            if parallel_index == 0 and self.global_t - last_global_t > 1000000:
                print('Save checkpoint at timestamp %d' % self.global_t)
                self.saver.save(self.sess,
                                CHECKPOINT_DIR + '/' + 'checkpoint',
                                global_step=self.global_t)
                last_global_t = self.global_t

    def signal_handler(self, signal, frame):
        print('You pressed Ctrl+C!')
        self.stop_requested = True
class ADQN_Thread(object):
  def __init__(self, thread_index, global_network, initial_learning_rate,
               learning_rate_input, grad_applier, max_global_time_step,
               device, network_scope="network", scene_scope="scene",
               task_scope="task"):
    
    self.thread_index = thread_index                                        # Thread index

    self._set_local_network(device, network_scope, scene_scope, task_scope) # Set local network

    self.sync = self.local_network.sync_from(global_network)                # Op to sync weights from the global network

    self.learning_rate_input = learning_rate_input                          # Set learning rate

    self.max_global_time_step = max_global_time_step                        # Set maximum of global time step
    
    self._set_trainer_optimizer(device, global_network, grad_applier)                     # Set trainer
    
    self._set_environment(initial_learning_rate)                            # Set environment

    self.memory_size = MEMORY_SIZE # memory size for replay buffer

    self.memory = np.zeros((self.memory_size, 2048 * 4 * 2 + 2))  # initialize zero memory [s, a, r, s_]
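    # Each row is a flattened transition [s (2048*4), a (1), r (1), s_ (2048*4)],
    # matching the column slicing used when sampling batches in process().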

    self.replace_target_iter = DQN_REPLACE_TARGET_ITER

    self.batch_size = DQN_BATCH_SIZE

    self.gamma = REWARD_DECAY


  # Create local network
  def _set_local_network(self, device, network_scope, scene_scope, task_scope):
    self.local_network = DRLNetwork(action_size=ACTION_SIZE, device=device, network_scope=network_scope,
                           scene_scopes=[scene_scope])
    
    self.network_scope = network_scope
    self.scene_scope = scene_scope
    self.task_scope = task_scope
    self.scopes = [network_scope, scene_scope, task_scope]
    self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

  # Set trainer and optimizer
  # Set up the DQN gradient accumulation and the shared optimizer
  # Use the gradient-accumulating AccumTrainer from Zhu
  def _set_trainer_optimizer(self, device, global_network, grad_applier):
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(self.local_network.total_loss,
                                  self.local_network.get_vars())

    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    accum_grad_names = [self._local_var_name(x) for x in self.trainer.get_accum_grad_list()]
    global_net_vars = [x for x in global_network.get_vars() if self._get_accum_grad_name(x) in accum_grad_names]

    self.apply_gradients = grad_applier.apply_gradients(global_net_vars, self.trainer.get_accum_grad_list())

  def _local_var_name(self, var):
    return '/'.join(var.name.split('/')[1:])

  def _get_accum_grad_name(self, var):
    return self._local_var_name(var).replace(':','_') + '_accum_grad:0'

  # Initialize environment and episode statistics
  def _set_environment(self, initial_learning_rate):
    self.env = None
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    self.episode_length = 0


  def choose_action(self, actions_value):
    # epsilon-greedy
    if np.random.uniform() < EPSILON:
      action = np.argmax(actions_value)
    else:
      action = np.random.randint(0, ACTION_SIZE)
    return action
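  # Note: with this scheme EPSILON is the probability of taking the greedy
  # action; a random action is taken with probability 1 - EPSILON.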

  # Take up to LOCAL_T_MAX steps in one call and run a DQN update
  def process(self, sess, global_t, summary_writer, summary_op, summary_placeholders):
    #print("start process")

    if self.env is None:
      # lazy initialization: create the environment on first use;
      # the sleep staggers environment creation across threads
      time.sleep(self.thread_index*1.0)
      self.env = Environment({
        'scene_name': self.scene_scope,
        'terminal_state_id': int(self.task_scope)
      })
    start_local_t = self.local_t

    # Reset accumulated gradient variables
    sess.run(self.reset_gradients)
    # Obtain shared parameters from the global network
    sess.run(self.sync)

    # t_max times loop
    for i in range(LOCAL_T_MAX):
      old_s_t = self.env.s_t
      actions_value = self.local_network.run_DQN(sess, self.env.s_t, self.env.target, self.scopes)
      action = self.choose_action(actions_value)

      if VERBOSE and (self.thread_index == 0) and (self.local_t % 1000) == 0:
        sys.stdout.write("%s:" % self.scene_scope)
        sys.stdout.write("Pi = {0} V = {1}\n".format(actions_value, action))

      # process game
      self.env.step(action)

      # receive game result
      reward = self.env.reward
      terminal = self.env.terminal

      # ad-hoc reward for navigation
      # reward = 10.0 if terminal else -0.01
      if self.episode_length > 5e3: terminal = True

      self.episode_reward += reward
      self.episode_length += 1

      """
      print("Local t: {0:d}".format(self.local_t))
      print("Reward: {0:f}".format(reward))
      print("Episode reward: {0:f}".format(self.episode_reward))
      print("Episode length: {0:d}".format(self.episode_length))
      """

      self.local_t += 1

      # store transition to replay buffer
      self.store_transition(old_s_t, action, reward, self.env.s_t)

      if terminal:
        sys.stdout.write("#Thread: %d \n time %d | thread #%d | scene %s | target #%s\n%s %s episode reward = %.3f\n%s %s episode length = %d\n%s %s \n" % (self.thread_index, global_t, self.thread_index, self.scene_scope, self.task_scope, self.scene_scope, self.task_scope, self.episode_reward, self.scene_scope, self.task_scope, self.episode_length, self.scene_scope, self.task_scope))

        summary_values = {
          "episode_reward_input": self.episode_reward,
          "episode_length_input": float(self.episode_length),
          "learning_rate_input": self._anneal_learning_rate(global_t)
        }

        self._record_score(sess, summary_writer, summary_op, summary_placeholders,
                           summary_values, global_t)
        self.episode_reward = 0
        self.episode_length = 0
        self.env.reset()

        break

    # update target network
    if self.local_t % self.replace_target_iter == 0:
      sess.run(self.local_network.replace_target_op)
      # print('\ntarget_params_replaced\n')

    # sample batch memory from all memory
    if self.memory_counter > self.memory_size:
      sample_index = np.random.choice(self.memory_size, size=self.batch_size)
    else:
      sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
    batch_memory = self.memory[sample_index, :]

    batch_memory_s_ = np.reshape(batch_memory[:, -2048*4:], (-1, 2048, 4))
    batch_memory_s = np.reshape(batch_memory[:, :2048*4], (-1, 2048, 4))
    batch_memory_t = np.reshape(np.tile(self.env.target, [self.batch_size, 1]), (-1, 2048, 4))
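    # The first 2048*4 columns hold s and the last 2048*4 hold s_; both are
    # reshaped back to (batch, 2048, 4). The target is tiled so every sample in
    # the batch shares the current episode's target.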

    q_next, q_eval = sess.run(
      [self.local_network.q_next, self.local_network.q_eval],
      feed_dict={
        self.local_network.s_: batch_memory_s_,  # fixed params
        self.local_network.s: batch_memory_s,  # newest params
        self.local_network.t: batch_memory_t
      })

    # change q_target w.r.t q_eval's action
    q_target = q_eval.copy()

    batch_index = np.arange(self.batch_size, dtype=np.int32)
    eval_act_index = batch_memory[:, 2048*4].astype(int)
    reward = batch_memory[:, 2048*4 + 1]

    key_eval = self.network_scope + '/' + self.scene_scope + '/eval'
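    # Q-learning target: q_target[a] = r + gamma * max_a' Q_target(s', a').
    # Note that `terminal` here is the flag from the last rollout step above,
    # not a per-transition flag stored in the replay buffer.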
    if terminal:
      q_target[key_eval][batch_index, eval_act_index] = reward
    else:
      key_target = self.network_scope + '/'+ self.scene_scope + '/target'
      q_target[key_eval][batch_index, eval_act_index] = reward + self.gamma * np.max(q_next[key_target], axis=1)

    for idx in batch_index:
      # train eval network
      sess.run(self.accum_gradients,
               feed_dict={
                 self.local_network.s: [batch_memory_s[idx]],
                 self.local_network.t: [batch_memory_t[idx]],
                 self.local_network.q_target: [q_target[key_eval][idx]]})

      cur_learning_rate = self._anneal_learning_rate(global_t)

      # update global network
      sess.run(self.apply_gradients,
               feed_dict={self.learning_rate_input: cur_learning_rate})

    if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
      sys.stdout.write("#Thread-%d-%s-Local timestep-%d\n" % (self.thread_index, self.scene_scope, self.local_t))

    # return the number of local steps advanced
    diff_local_t = self.local_t - start_local_t
    return diff_local_t


  def _record_score(self, sess, writer, summary_op, placeholders, values, global_t):
    feed_dict = {}
    for k in placeholders:
      feed_dict[placeholders[k]] = values[k]
    summary_str = sess.run(summary_op, feed_dict=feed_dict)
    if VERBOSE: sys.stdout.write('writing to summary writer at time %d\n' % (global_t))
    writer.add_summary(summary_str, global_t)
    # writer.flush()


  def _anneal_learning_rate(self, global_time_step):
    time_step_to_go = max(self.max_global_time_step - global_time_step, 0.0)
    learning_rate = self.initial_learning_rate * time_step_to_go / self.max_global_time_step
    return learning_rate

  def store_transition(self, s, a, r, s_):
    if not hasattr(self, 'memory_counter'):
      self.memory_counter = 0

    transition = np.hstack((np.reshape(s, -1), [a, r], np.reshape(s_,-1)))

    # replace the old memory with new memory
    index = self.memory_counter % self.memory_size
    self.memory[index, :] = transition

    self.memory_counter += 1
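
# Minimal usage sketch (illustrative, not part of the original file): Train()
# builds the global network and one ADQN_Thread per thread slot, and train()
# runs them until MAX_TIME_STEP is reached or Ctrl+C is pressed.
if __name__ == '__main__':
    trainer = Train()
    trainer.train()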