Example #1
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.local_network = GameACNetwork(ACTION_SIZE, device)
        self.local_network.prepare_loss(ENTROPY_BETA)

        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 max_global_time_step):

        self.thread_index = thread_index
        self.learning_rate_input = tf.placeholder("float")
        self.max_global_time_step = max_global_time_step

        self.local_network = GameACNetwork(ACTION_SIZE)
        self.local_network.prepare_loss(ENTROPY_BETA)

        # policy
        self.policy_trainer = AccumTrainer()
        self.policy_trainer.prepare_minimize(
            self.local_network.policy_loss,
            self.local_network.get_policy_vars())
        self.policy_accum_gradients = self.policy_trainer.accumulate_gradients()
        self.policy_reset_gradients = self.policy_trainer.reset_gradients()

        self.policy_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=0.99,
            momentum=0.0,
            epsilon=RMSP_EPSILON)
        self.policy_apply_gradients = self.policy_applier.apply_gradients(
            global_network.get_policy_vars(),
            self.policy_trainer.get_accum_grad_list())

        # value
        self.value_trainer = AccumTrainer()
        self.value_trainer.prepare_minimize(
            self.local_network.value_loss, self.local_network.get_value_vars())
        self.value_accum_gradients = self.value_trainer.accumulate_gradients()
        self.value_reset_gradients = self.value_trainer.reset_gradients()

        self.value_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=0.99,
            momentum=0.0,
            epsilon=RMSP_EPSILON)
        self.value_apply_gradients = self.value_applier.apply_gradients(
            global_network.get_value_vars(),
            self.value_trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # thread0 will record score for TensorBoard
        if self.thread_index == 0:
            self.score_input = tf.placeholder(tf.int32)
            tf.scalar_summary("score", self.score_input)
  def __init__(self, thread_index, global_network, initial_learning_rate, max_global_time_step):

    self.thread_index = thread_index
    self.learning_rate_input = tf.placeholder("float")
    self.max_global_time_step = max_global_time_step

    self.local_network = GameACNetwork(ACTION_SIZE)
    self.local_network.prepare_loss(ENTROPY_BETA)

    # policy
    self.policy_trainer = AccumTrainer()
    self.policy_trainer.prepare_minimize( self.local_network.policy_loss,
                                          self.local_network.get_policy_vars() )
    self.policy_accum_gradients = self.policy_trainer.accumulate_gradients()
    self.policy_reset_gradients = self.policy_trainer.reset_gradients()
  
    self.policy_applier = RMSPropApplier(learning_rate = self.learning_rate_input,
                                         decay = 0.99,
                                         momentum = 0.0,
                                         epsilon = RMSP_EPSILON )
    self.policy_apply_gradients = self.policy_applier.apply_gradients(
        global_network.get_policy_vars(),
        self.policy_trainer.get_accum_grad_list() )

    # value
    self.value_trainer = AccumTrainer()
    self.value_trainer.prepare_minimize( self.local_network.value_loss,
                                         self.local_network.get_value_vars() )
    self.value_accum_gradients = self.value_trainer.accumulate_gradients()
    self.value_reset_gradients = self.value_trainer.reset_gradients()
  
    self.value_applier = RMSPropApplier(learning_rate = self.learning_rate_input,
                                        decay = 0.99,
                                        momentum = 0.0,
                                        epsilon = RMSP_EPSILON )
    self.value_apply_gradients = self.value_applier.apply_gradients(
        global_network.get_value_vars(),
        self.value_trainer.get_accum_grad_list() )
    
    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # thread0 will record score for TensorBoard
    if self.thread_index == 0:
      self.score_input = tf.placeholder(tf.int32)
      tf.scalar_summary("score", self.score_input)
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.local_network = GameACNetwork(ACTION_SIZE, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0
Example #5
def log_uniform(lo, hi, rate):
    log_lo = math.log(lo)
    log_hi = math.log(hi)
    v = log_lo * (1 - rate) + log_hi * rate
    return math.exp(v)


initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH,
                                    INITIAL_ALPHA_LOG_RATE)
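# A worked example of the sampling above, with illustrative bounds rather than
# the repo's actual INITIAL_ALPHA_* constants: log_uniform interpolates in log
# space, so log_uniform(1e-4, 1e-2, 0.5) = exp(0.5*log(1e-4) + 0.5*log(1e-2))
# = 1e-3, the geometric mean of the bounds (a plain uniform draw at rate 0.5
# would instead give the arithmetic mean, 5.05e-3).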

global_t = 0

stop_requested = False

global_network = GameACNetwork(ACTION_SIZE)

training_threads = []

learning_rate_input = tf.placeholder("float")

policy_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                decay=0.99,
                                momentum=0.0,
                                epsilon=RMSP_EPSILON)

value_applier = RMSPropApplier(learning_rate=learning_rate_input,
                               decay=0.99,
                               momentum=0.0,
                               epsilon=RMSP_EPSILON)
class A3CTrainingThread(object):
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.local_network = GameACNetwork(ACTION_SIZE, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0


  def _anneal_learning_rate(self, global_time_step):
    learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
    if learning_rate < 0.0:
      learning_rate = 0.0
    return learning_rate

  def choose_action(self, pi_values):
    values = []
    sum = 0.0
    for rate in pi_values:
      sum = sum + rate
      value = sum
      values.append(value)
    
    r = random.random() * sum
    for i in range(len(values)):
      if values[i] >= r:
        return i
    #fail safe
    return len(values)-1

  def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
    summary_str = sess.run(summary_op, feed_dict={
      score_input: score
    })
    summary_writer.add_summary(summary_str, global_t)
    
  def process(self, sess, global_t, summary_writer, summary_op, score_input):
    states = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    # reset accumulated gradients
    sess.run( self.reset_gradients )

    # copy weights from shared to local
    sess.run( self.sync )

    start_local_t = self.local_t
    
    # t_max times loop
    for i in range(LOCAL_T_MAX):
      pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)      
      action = self.choose_action(pi_)

      states.append(self.game_state.s_t)
      actions.append(action)
      values.append(value_)

      if (self.thread_index == 0) and (self.local_t % 100) == 0:
        print "pi=", pi_
        print " V=", value_

      # process game
      self.game_state.process(action)

      # receive game result
      reward = self.game_state.reward
      terminal = self.game_state.terminal

      self.episode_reward += reward

      # clip reward
      rewards.append( np.clip(reward, -1, 1) )

      self.local_t += 1

      # s_t1 -> s_t
      self.game_state.update()
      
      if terminal:
        terminal_end = True
        print "score=", self.episode_reward

        self._record_score(sess, summary_writer, summary_op, score_input,
                           self.episode_reward, global_t)
          
        self.episode_reward = 0
        self.game_state.reset()
        break

    R = 0.0
    if not terminal_end:
      R = self.local_network.run_value(sess, self.game_state.s_t)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_si = []
    batch_a = []
    batch_td = []
    batch_R = []

    # compute and accumulate gradients
    for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
      R = ri + GAMMA * R
      td = R - Vi
      a = np.zeros([ACTION_SIZE])
      a[ai] = 1

      batch_si.append(si)
      batch_a.append(a)
      batch_td.append(td)
      batch_R.append(R)

    sess.run( self.accum_gradients,
              feed_dict = {
                self.local_network.s: batch_si,
                self.local_network.a: batch_a,
                self.local_network.td: batch_td,
                self.local_network.r: batch_R } )
      
    cur_learning_rate = self._anneal_learning_rate(global_t)

    sess.run( self.apply_gradients,
              feed_dict = { self.learning_rate_input: cur_learning_rate } )

    if (self.thread_index == 0) and (self.local_t % 100) == 0:
      print "TIMESTEP", self.local_t

    # return the number of local steps advanced
    diff_local_t = self.local_t - start_local_t
    return diff_local_t
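The reversed loop above implements the n-step return R = r_i + GAMMA * R, bootstrapped from the critic's value of the last state when the rollout did not end in a terminal. A minimal standalone sketch of that computation; the function name, the plain-list arguments, and the numbers in the usage comment are illustrative, not part of the original code:

def n_step_returns(rewards, values, bootstrap_R, gamma):
    # rewards[i] and values[i] are ordered oldest-first, as collected above.
    # bootstrap_R is V(s_T) for a cut-off rollout, or 0.0 after a terminal.
    batch_td = []
    batch_R = []
    R = bootstrap_R
    for r, V in zip(reversed(rewards), reversed(values)):
        R = r + gamma * R        # discounted n-step return
        batch_td.append(R - V)   # advantage fed to the policy loss
        batch_R.append(R)        # target fed to the value loss
    # flip back to oldest-first so the lists align with the collected states
    batch_td.reverse()
    batch_R.reverse()
    return batch_td, batch_R

# e.g. n_step_returns([0, 0, 1], [0.5, 0.6, 0.7], 0.0, 0.99)
#      -> td = [0.4801, 0.39, 0.3], R = [0.9801, 0.99, 1.0]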
class A3CTrainingThread(object):
  def __init__(self, thread_index, global_network, initial_learning_rate,
               learning_rate_input,
               policy_applier, value_applier,
               max_global_time_step):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.local_network = GameACNetwork(ACTION_SIZE)
    self.local_network.prepare_loss(ENTROPY_BETA)

    # policy
    self.policy_trainer = AccumTrainer()
    self.policy_trainer.prepare_minimize( self.local_network.policy_loss,
                                          self.local_network.get_policy_vars(),
                                          GRAD_NORM_CLIP )
    
    self.policy_accum_gradients = self.policy_trainer.accumulate_gradients()
    self.policy_reset_gradients = self.policy_trainer.reset_gradients()
  
    self.policy_apply_gradients = policy_applier.apply_gradients(
        global_network.get_policy_vars(),
        self.policy_trainer.get_accum_grad_list() )

    # value
    self.value_trainer = AccumTrainer()
    self.value_trainer.prepare_minimize( self.local_network.value_loss,
                                         self.local_network.get_value_vars(),
                                         GRAD_NORM_CLIP )
    self.value_accum_gradients = self.value_trainer.accumulate_gradients()
    self.value_reset_gradients = self.value_trainer.reset_gradients()
  

    self.value_apply_gradients = value_applier.apply_gradients(
        global_network.get_value_vars(),
        self.value_trainer.get_accum_grad_list() )
    
    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # thread0 will record score for TensorBoard
    if self.thread_index == 0:
      self.score_input = tf.placeholder(tf.int32)
      tf.scalar_summary("score", self.score_input)

  def _anneal_learning_rate(self, global_time_step):
    learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
    if learning_rate < 0.0:
      learning_rate = 0.0
    return learning_rate

  def choose_action(self, pi_values):
    values = []
    sum = 0.0
    for rate in pi_values:
      sum = sum + rate
      value = sum
      values.append(value)
    
    r = random.random() * sum
    for i in range(len(values)):
      if values[i] >= r:
        return i
    #fail safe
    return len(values)-1

  def _record_score(self, sess, summary_writer, summary_op, score, global_t):
    summary_str = sess.run(summary_op, feed_dict={
      self.score_input: score
    })
    summary_writer.add_summary(summary_str, global_t)
    
  def process(self, sess, global_t, summary_writer, summary_op):
    states = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    # reset accumulated gradients
    sess.run( self.policy_reset_gradients )
    sess.run( self.value_reset_gradients )

    # copy weights from shared to local
    sess.run( self.sync )

    start_local_t = self.local_t
    
    # t_max times loop
    for i in range(LOCAL_T_MAX):
      pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
      action = self.choose_action(pi_)

      states.append(self.game_state.s_t)
      actions.append(action)
      value_ = self.local_network.run_value(sess, self.game_state.s_t)
      values.append(value_)

      if (self.thread_index == 0) and (self.local_t % 100) == 0:
        print "pi=", pi_
        print " V=", value_

      # process game
      self.game_state.process(action)

      # receive game result
      reward = self.game_state.reward
      terminal = self.game_state.terminal

      self.episode_reward += reward

      rewards.append(reward)

      self.local_t += 1

      self.game_state.update()
      
      if terminal:
        terminal_end = True
        print "score=", self.episode_reward

        if self.thread_index == 0:        
          self._record_score(sess, summary_writer, summary_op, self.episode_reward, global_t)
          
        self.episode_reward = 0
        break

    R = 0.0
    if not terminal_end:
      R = self.local_network.run_value(sess, self.game_state.s_t)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    # compute and accumulate gradients
    for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
      R = ri + GAMMA * R
      td = R - Vi
      a = np.zeros([ACTION_SIZE])
      a[ai] = 1

      sess.run( self.policy_accum_gradients,
                feed_dict = {
                    self.local_network.s: [si],
                    self.local_network.a: [a],
                    self.local_network.td: [td] } )
      
      sess.run( self.value_accum_gradients,
                feed_dict = {
                    self.local_network.s: [si],
                    self.local_network.r: [R] } )

    cur_learning_rate = self._anneal_learning_rate(global_t)

    sess.run( self.policy_apply_gradients,
              feed_dict = { self.learning_rate_input: cur_learning_rate } )
    # Learning rate for Critic is half of Actor's
    sess.run( self.value_apply_gradients,
              feed_dict = { self.learning_rate_input: cur_learning_rate * 0.5 } )

    if (self.thread_index == 0) and (self.local_t % 100) == 0:
      print "TIMESTEP", self.local_t

    # return the number of local steps advanced
    diff_local_t = self.local_t - start_local_t
    return diff_local_t
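choose_action above samples an action index by scanning a running cumulative sum of the policy output, which also tolerates outputs that do not sum to exactly 1. A rough NumPy equivalent, shown only for comparison; the explicit renormalization is an assumption needed because np.random.choice rejects probabilities that do not sum to 1:

import numpy as np

def choose_action_np(pi_values):
    # Renormalize defensively: a float32 softmax can drift from summing to 1.
    pi = np.asarray(pi_values, dtype=np.float64)
    pi = pi / pi.sum()
    return np.random.choice(len(pi), p=pi)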
Example #8
class A3CTrainingThread(object):
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.local_network = GameACNetwork(ACTION_SIZE, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0


  def _anneal_learning_rate(self, global_time_step):
    learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
    if learning_rate < 0.0:
      learning_rate = 0.0
    return learning_rate

  def choose_action(self, pi_values):
    values = []
    sum = 0.0
    for rate in pi_values:
      sum = sum + rate
      value = sum
      values.append(value)
    
    r = random.random() * sum
    for i in range(len(values)):
      if values[i] >= r:
        return i
    #fail safe
    return len(values)-1

  def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
    summary_str = sess.run(summary_op, feed_dict={
      score_input: score
    })
    summary_writer.add_summary(summary_str, global_t)
    
  def process(self, sess, global_t, summary_writer, summary_op, score_input):
    states = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    # reset accumulated gradients
    sess.run( self.reset_gradients )

    # copy weights from shared to local
    sess.run( self.sync )

    start_local_t = self.local_t
    
    # t_max times loop
    for i in range(LOCAL_T_MAX):
      pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
      action = self.choose_action(pi_)

      states.append(self.game_state.s_t)
      actions.append(action)
      value_ = self.local_network.run_value(sess, self.game_state.s_t)
      values.append(value_)

      if (self.thread_index == 0) and (self.local_t % 100) == 0:
        print "pi=", pi_
        print " V=", value_

      # process game
      self.game_state.process(action)

      # receive game result
      reward = self.game_state.reward
      terminal = self.game_state.terminal

      self.episode_reward += reward

      # clip reward
      rewards.append( np.clip(reward, -1, 1) )

      self.local_t += 1

      # s_t1 -> s_t
      self.game_state.update()
      
      if terminal:
        terminal_end = True
        print "score=", self.episode_reward

        self._record_score(sess, summary_writer, summary_op, score_input,
                           self.episode_reward, global_t)
          
        self.episode_reward = 0
        self.game_state.reset()
        break

    R = 0.0
    if not terminal_end:
      R = self.local_network.run_value(sess, self.game_state.s_t)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    # compute and accumulate gradients
    for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
      R = ri + GAMMA * R
      td = R - Vi
      a = np.zeros([ACTION_SIZE])
      a[ai] = 1

      sess.run( self.accum_gradients,
                feed_dict = {
                  self.local_network.s: [si],
                  self.local_network.a: [a],
                  self.local_network.td: [td],
                  self.local_network.r: [R]} )
      
    cur_learning_rate = self._anneal_learning_rate(global_t)

    sess.run( self.apply_gradients,
              feed_dict = { self.learning_rate_input: cur_learning_rate } )

    if (self.thread_index == 0) and (self.local_t % 100) == 0:
      print "TIMESTEP", self.local_t

    # return the number of local steps advanced
    diff_local_t = self.local_t - start_local_t
    return diff_local_t
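_anneal_learning_rate decays the step size linearly from initial_learning_rate down to zero over max_global_time_step global steps. A small worked sketch with illustrative numbers; the function name and the constants used here are hypothetical, not the repo's:

def anneal(initial_lr, max_step, global_t):
    lr = initial_lr * (max_step - global_t) / float(max_step)
    return max(lr, 0.0)   # clamp once the global counter passes max_step

# anneal(7e-4, 1e8, 0)      -> 7e-4     (start of training)
# anneal(7e-4, 1e8, 5e7)    -> 3.5e-4   (half way through: half the rate)
# anneal(7e-4, 1e8, 1.2e8)  -> 0.0      (past the schedule: clamped)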
Example #9
import tensorflow as tf

from game_ac_network import GameACNetwork
from a3c_training_thread import A3CTrainingThread
from rmsprop_applier import RMSPropApplier

from constants import ACTION_SIZE
from constants import PARALLEL_SIZE
from constants import MAX_TIME_STEP
from constants import CHECKPOINT_DIR
from constants import RMSP_EPSILON
from constants import RMSP_ALPHA
from constants import GRAD_NORM_CLIP
from constants import USE_GPU

# use CPU for weight visualize tool
device = "/cpu:0"

global_network = GameACNetwork(ACTION_SIZE, device)

training_threads = []

learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                              decay=RMSP_ALPHA,
                              momentum=0.0,
                              epsilon=RMSP_EPSILON,
                              clip_norm=GRAD_NORM_CLIP,
                              device=device)

for i in range(PARALLEL_SIZE):
    training_thread = A3CTrainingThread(i,
                                        global_network,
Example #10
def choose_action(pi_values):
    values = []
    sum = 0.0
    for rate in pi_values:
        sum = sum + rate
        value = sum
        values.append(value)

    r = random.random() * sum
    for i in range(len(values)):
        if values[i] >= r:
            return i
    #fail safe
    return len(values) - 1


global_network = GameACNetwork(ACTION_SIZE)

training_threads = []
for i in range(PARALLEL_SIZE):
    training_thread = A3CTrainingThread(i, global_network, 1.0, 8000000)
    training_threads.append(training_thread)

sess = tf.Session()
init = tf.initialize_all_variables()
sess.run(init)

saver = tf.train.Saver()
checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
if checkpoint and checkpoint.model_checkpoint_path:
    saver.restore(sess, checkpoint.model_checkpoint_path)
    print "checkpoint loaded:", checkpoint.model_checkpoint_path
def choose_action(pi_values):
  values = []
  sum = 0.0
  for rate in pi_values:
    sum = sum + rate
    value = sum
    values.append(value)
    
  r = random.random() * sum
  for i in range(len(values)):
    if values[i] >= r:
      return i
  #fail safe
  return len(values)-1

global_network = GameACNetwork(ACTION_SIZE)

learning_rate_input = tf.placeholder("float")

policy_applier = RMSPropApplier(learning_rate = learning_rate_input,
                                decay = 0.99,
                                momentum = 0.0,
                                epsilon = RMSP_EPSILON )

value_applier = RMSPropApplier(learning_rate = learning_rate_input,
                               decay = 0.99,
                               momentum = 0.0,
                               epsilon = RMSP_EPSILON )

training_threads = []
for i in range(PARALLEL_SIZE):
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 max_global_time_step):

        self.thread_index = thread_index
        self.learning_rate_input = tf.placeholder("float")
        self.max_global_time_step = max_global_time_step

        self.local_network = GameACNetwork(ACTION_SIZE)
        self.local_network.prepare_loss(ENTROPY_BETA)

        # policy
        self.policy_trainer = AccumTrainer()
        self.policy_trainer.prepare_minimize(
            self.local_network.policy_loss,
            self.local_network.get_policy_vars())
        self.policy_accum_gradients = self.policy_trainer.accumulate_gradients()
        self.policy_reset_gradients = self.policy_trainer.reset_gradients()

        self.policy_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=0.99,
            momentum=0.0,
            epsilon=RMSP_EPSILON)
        self.policy_apply_gradients = self.policy_applier.apply_gradients(
            global_network.get_policy_vars(),
            self.policy_trainer.get_accum_grad_list())

        # value
        self.value_trainer = AccumTrainer()
        self.value_trainer.prepare_minimize(
            self.local_network.value_loss, self.local_network.get_value_vars())
        self.value_accum_gradients = self.value_trainer.accumulate_gradients()
        self.value_reset_gradients = self.value_trainer.reset_gradients()

        self.value_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=0.99,
            momentum=0.0,
            epsilon=RMSP_EPSILON)
        self.value_apply_gradients = self.value_applier.apply_gradients(
            global_network.get_value_vars(),
            self.value_trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # thread0 will record score for TensorBoard
        if self.thread_index == 0:
            self.score_input = tf.placeholder(tf.int32)
            tf.scalar_summary("score", self.score_input)

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step -
            global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        values = []
        sum = 0.0
        for rate in pi_values:
            sum = sum + rate
            value = sum
            values.append(value)

        r = random.random() * sum
        for i in range(len(values)):
            if values[i] >= r:
                return i
        #fail safe
        return len(values) - 1

    def _record_score(self, sess, summary_writer, summary_op, score, global_t):
        summary_str = sess.run(summary_op, feed_dict={self.score_input: score})
        summary_writer.add_summary(summary_str, global_t)

    def process(self, sess, global_t, summary_writer, summary_op):
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.policy_reset_gradients)
        sess.run(self.value_reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        # loop LOCAL_T_MAX times
        for i in range(LOCAL_T_MAX):
            pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
            action = self.choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            value_ = self.local_network.run_value(sess, self.game_state.s_t)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print "pi=", pi_
                print " V=", value_

            # process game
            self.game_state.process(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            rewards.append(reward)

            self.local_t += 1

            self.game_state.update()

            if terminal:
                terminal_end = True
                print "score=", self.episode_reward

                if self.thread_index == 0:
                    self._record_score(sess, summary_writer, summary_op,
                                       self.episode_reward, global_t)

                self.episode_reward = 0
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        # compute and accumulate gradients
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            sess.run(self.policy_accum_gradients,
                     feed_dict={
                         self.local_network.s: [si],
                         self.local_network.a: [a],
                         self.local_network.td: [td]
                     })

            sess.run(self.value_accum_gradients,
                     feed_dict={
                         self.local_network.s: [si],
                         self.local_network.r: [R]
                     })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.policy_apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})
        sess.run(self.value_apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print "TIMESTEP", self.local_t

        # return the number of local steps advanced
        diff_local_t = self.local_t - start_local_t
        return diff_local_t