def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step self.local_network = GameACNetwork(ACTION_SIZE, device) self.local_network.prepare_loss(ENTROPY_BETA) self.trainer = AccumTrainer(device) self.trainer.prepare_minimize(self.local_network.total_loss, self.local_network.get_vars()) self.accum_gradients = self.trainer.accumulate_gradients() self.reset_gradients = self.trainer.reset_gradients() self.apply_gradients = grad_applier.apply_gradients( global_network.get_vars(), self.trainer.get_accum_grad_list()) self.sync = self.local_network.sync_from(global_network) self.game_state = GameState(113 * thread_index) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0
def __init__(self, thread_index, global_network, initial_learning_rate, max_global_time_step): self.thread_index = thread_index self.learning_rate_input = tf.placeholder("float") self.max_global_time_step = max_global_time_step self.local_network = GameACNetwork(ACTION_SIZE) self.local_network.prepare_loss(ENTROPY_BETA) # policy self.policy_trainer = AccumTrainer() self.policy_trainer.prepare_minimize( self.local_network.policy_loss, self.local_network.get_policy_vars()) self.policy_accum_gradients = self.policy_trainer.accumulate_gradients( ) self.policy_reset_gradients = self.policy_trainer.reset_gradients() self.policy_applier = RMSPropApplier( learning_rate=self.learning_rate_input, decay=0.99, momentum=0.0, epsilon=RMSP_EPSILON) self.policy_apply_gradients = self.policy_applier.apply_gradients( global_network.get_policy_vars(), self.policy_trainer.get_accum_grad_list()) # value self.value_trainer = AccumTrainer() self.value_trainer.prepare_minimize( self.local_network.value_loss, self.local_network.get_value_vars()) self.value_accum_gradients = self.value_trainer.accumulate_gradients() self.value_reset_gradients = self.value_trainer.reset_gradients() self.value_applier = RMSPropApplier( learning_rate=self.learning_rate_input, decay=0.99, momentum=0.0, epsilon=RMSP_EPSILON) self.value_apply_gradients = self.value_applier.apply_gradients( global_network.get_value_vars(), self.value_trainer.get_accum_grad_list()) self.sync = self.local_network.sync_from(global_network) self.game_state = GameState(113 * thread_index) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 # thread0 will record score for TensorBoard if self.thread_index == 0: self.score_input = tf.placeholder(tf.int32) tf.scalar_summary("score", self.score_input)
def __init__(self, thread_index, global_network, initial_learning_rate, max_global_time_step): self.thread_index = thread_index self.learning_rate_input = tf.placeholder("float") self.max_global_time_step = max_global_time_step self.local_network = GameACNetwork(ACTION_SIZE) self.local_network.prepare_loss(ENTROPY_BETA) # policy self.policy_trainer = AccumTrainer() self.policy_trainer.prepare_minimize( self.local_network.policy_loss, self.local_network.get_policy_vars() ) self.policy_accum_gradients = self.policy_trainer.accumulate_gradients() self.policy_reset_gradients = self.policy_trainer.reset_gradients() self.policy_applier = RMSPropApplier(learning_rate = self.learning_rate_input, decay = 0.99, momentum = 0.0, epsilon = RMSP_EPSILON ) self.policy_apply_gradients = self.policy_applier.apply_gradients( global_network.get_policy_vars(), self.policy_trainer.get_accum_grad_list() ) # value self.value_trainer = AccumTrainer() self.value_trainer.prepare_minimize( self.local_network.value_loss, self.local_network.get_value_vars() ) self.value_accum_gradients = self.value_trainer.accumulate_gradients() self.value_reset_gradients = self.value_trainer.reset_gradients() self.value_applier = RMSPropApplier(learning_rate = self.learning_rate_input, decay = 0.99, momentum = 0.0, epsilon = RMSP_EPSILON ) self.value_apply_gradients = self.value_applier.apply_gradients( global_network.get_value_vars(), self.value_trainer.get_accum_grad_list() ) self.sync = self.local_network.sync_from(global_network) self.game_state = GameState(113 * thread_index) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 # thread0 will record score for TensorBoard if self.thread_index == 0: self.score_input = tf.placeholder(tf.int32) tf.scalar_summary("score", self.score_input)
def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step self.local_network = GameACNetwork(ACTION_SIZE, device) self.local_network.prepare_loss(ENTROPY_BETA) # TODO: don't need accum trainer anymore with batch self.trainer = AccumTrainer(device) self.trainer.prepare_minimize( self.local_network.total_loss, self.local_network.get_vars() ) self.accum_gradients = self.trainer.accumulate_gradients() self.reset_gradients = self.trainer.reset_gradients() self.apply_gradients = grad_applier.apply_gradients( global_network.get_vars(), self.trainer.get_accum_grad_list() ) self.sync = self.local_network.sync_from(global_network) self.game_state = GameState(113 * thread_index) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0
def log_uniform(lo, hi, rate): log_lo = math.log(lo) log_hi = math.log(hi) v = log_lo * (1 - rate) + log_hi * rate return math.exp(v) initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH, INITIAL_ALPHA_LOG_RATE) global_t = 0 stop_requested = False global_network = GameACNetwork(ACTION_SIZE) training_threads = [] learning_rate_input = tf.placeholder("float") policy_applier = RMSPropApplier(learning_rate=learning_rate_input, decay=0.99, momentum=0.0, epsilon=RMSP_EPSILON) value_applier = RMSPropApplier(learning_rate=learning_rate_input, decay=0.99, momentum=0.0, epsilon=RMSP_EPSILON)
class A3CTrainingThread(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step self.local_network = GameACNetwork(ACTION_SIZE, device) self.local_network.prepare_loss(ENTROPY_BETA) # TODO: don't need accum trainer anymore with batch self.trainer = AccumTrainer(device) self.trainer.prepare_minimize( self.local_network.total_loss, self.local_network.get_vars() ) self.accum_gradients = self.trainer.accumulate_gradients() self.reset_gradients = self.trainer.reset_gradients() self.apply_gradients = grad_applier.apply_gradients( global_network.get_vars(), self.trainer.get_accum_grad_list() ) self.sync = self.local_network.sync_from(global_network) self.game_state = GameState(113 * thread_index) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values): values = [] sum = 0.0 for rate in pi_values: sum = sum + rate value = sum values.append(value) r = random.random() * sum for i in range(len(values)): if values[i] >= r: return i; #fail safe return len(values)-1 def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t): summary_str = sess.run(summary_op, feed_dict={ score_input: score }) summary_writer.add_summary(summary_str, global_t) def process(self, sess, global_t, summary_writer, summary_op, score_input): states = [] actions = [] rewards = [] values = [] terminal_end = False # reset accumulated gradients sess.run( self.reset_gradients ) # copy weights from shared to local sess.run( self.sync ) start_local_t = self.local_t # t_max times loop for i in range(LOCAL_T_MAX): pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t) action = self.choose_action(pi_) states.append(self.game_state.s_t) actions.append(action) values.append(value_) if (self.thread_index == 0) and (self.local_t % 100) == 0: print "pi=", pi_ print " V=", value_ # process game self.game_state.process(action) # receive game result reward = self.game_state.reward terminal = self.game_state.terminal self.episode_reward += reward # clip reward rewards.append( np.clip(reward, -1, 1) ) self.local_t += 1 # s_t1 -> s_t self.game_state.update() if terminal: terminal_end = True print "score=", self.episode_reward self._record_score(sess, summary_writer, summary_op, score_input, self.episode_reward, global_t) self.episode_reward = 0 self.game_state.reset() break R = 0.0 if not terminal_end: R = self.local_network.run_value(sess, self.game_state.s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_si = [] batch_a = [] batch_td = [] batch_R = [] # compute and accmulate gradients for(ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + GAMMA * R td = R - Vi a = np.zeros([ACTION_SIZE]) a[ai] = 1 batch_si.append(si) batch_a.append(a) batch_td.append(td) batch_R.append(R) sess.run( self.accum_gradients, feed_dict = { self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R } ) cur_learning_rate = self._anneal_learning_rate(global_t) sess.run( self.apply_gradients, feed_dict = { self.learning_rate_input: cur_learning_rate } ) if (self.thread_index == 0) and (self.local_t % 100) == 0: print "TIMESTEP", self.local_t # return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t
class A3CTrainingThread(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, policy_applier, value_applier, max_global_time_step): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step self.local_network = GameACNetwork(ACTION_SIZE) self.local_network.prepare_loss(ENTROPY_BETA) # policy self.policy_trainer = AccumTrainer() self.policy_trainer.prepare_minimize( self.local_network.policy_loss, self.local_network.get_policy_vars(), GRAD_NORM_CLIP ) self.policy_accum_gradients = self.policy_trainer.accumulate_gradients() self.policy_reset_gradients = self.policy_trainer.reset_gradients() self.policy_apply_gradients = policy_applier.apply_gradients( global_network.get_policy_vars(), self.policy_trainer.get_accum_grad_list() ) # value self.value_trainer = AccumTrainer() self.value_trainer.prepare_minimize( self.local_network.value_loss, self.local_network.get_value_vars(), GRAD_NORM_CLIP ) self.value_accum_gradients = self.value_trainer.accumulate_gradients() self.value_reset_gradients = self.value_trainer.reset_gradients() self.value_apply_gradients = value_applier.apply_gradients( global_network.get_value_vars(), self.value_trainer.get_accum_grad_list() ) self.sync = self.local_network.sync_from(global_network) self.game_state = GameState(113 * thread_index) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 # thread0 will record score for TensorBoard if self.thread_index == 0: self.score_input = tf.placeholder(tf.int32) tf.scalar_summary("score", self.score_input) def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values): values = [] sum = 0.0 for rate in pi_values: sum = sum + rate value = sum values.append(value) r = random.random() * sum for i in range(len(values)): if values[i] >= r: return i; #fail safe return len(values)-1 def _record_score(self, sess, summary_writer, summary_op, score, global_t): summary_str = sess.run(summary_op, feed_dict={ self.score_input: score }) summary_writer.add_summary(summary_str, global_t) def process(self, sess, global_t, summary_writer, summary_op): states = [] actions = [] rewards = [] values = [] terminal_end = False # 加算された勾配をリセット sess.run( self.policy_reset_gradients ) sess.run( self.value_reset_gradients ) # shared から localにweightをコピー sess.run( self.sync ) start_local_t = self.local_t # t_max times loop for i in range(LOCAL_T_MAX): pi_ = self.local_network.run_policy(sess, self.game_state.s_t) action = self.choose_action(pi_) states.append(self.game_state.s_t) actions.append(action) value_ = self.local_network.run_value(sess, self.game_state.s_t) values.append(value_) if (self.thread_index == 0) and (self.local_t % 100) == 0: print "pi=", pi_ print " V=", value_ # gameを実行 self.game_state.process(action) # 実行した結果 reward = self.game_state.reward terminal = self.game_state.terminal self.episode_reward += reward rewards.append(reward) self.local_t += 1 self.game_state.update() if terminal: terminal_end = True print "score=", self.episode_reward if self.thread_index == 0: self._record_score(sess, summary_writer, summary_op, self.episode_reward, global_t) self.episode_reward = 0 break R = 0.0 if not terminal_end: R = self.local_network.run_value(sess, self.game_state.s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() # 勾配を算出して加算していく for(ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + GAMMA * R td = R - Vi a = np.zeros([ACTION_SIZE]) a[ai] = 1 sess.run( self.policy_accum_gradients, feed_dict = { self.local_network.s: [si], self.local_network.a: [a], self.local_network.td: [td] } ) sess.run( self.value_accum_gradients, feed_dict = { self.local_network.s: [si], self.local_network.r: [R] } ) cur_learning_rate = self._anneal_learning_rate(global_t) sess.run( self.policy_apply_gradients, feed_dict = { self.learning_rate_input: cur_learning_rate } ) # Learning rate for Critic is half of Actor's sess.run( self.value_apply_gradients, feed_dict = { self.learning_rate_input: cur_learning_rate * 0.5 } ) if (self.thread_index == 0) and (self.local_t % 100) == 0: print "TIMESTEP", self.local_t # 進んだlocal step数を返す diff_local_t = self.local_t - start_local_t return diff_local_t
class A3CTrainingThread(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step self.local_network = GameACNetwork(ACTION_SIZE, device) self.local_network.prepare_loss(ENTROPY_BETA) self.trainer = AccumTrainer(device) self.trainer.prepare_minimize( self.local_network.total_loss, self.local_network.get_vars() ) self.accum_gradients = self.trainer.accumulate_gradients() self.reset_gradients = self.trainer.reset_gradients() self.apply_gradients = grad_applier.apply_gradients( global_network.get_vars(), self.trainer.get_accum_grad_list() ) self.sync = self.local_network.sync_from(global_network) self.game_state = GameState(113 * thread_index) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values): values = [] sum = 0.0 for rate in pi_values: sum = sum + rate value = sum values.append(value) r = random.random() * sum for i in range(len(values)): if values[i] >= r: return i; #fail safe return len(values)-1 def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t): summary_str = sess.run(summary_op, feed_dict={ score_input: score }) summary_writer.add_summary(summary_str, global_t) def process(self, sess, global_t, summary_writer, summary_op, score_input): states = [] actions = [] rewards = [] values = [] terminal_end = False # reset accumulated gradients sess.run( self.reset_gradients ) # copy weights from shared to local sess.run( self.sync ) start_local_t = self.local_t # t_max times loop for i in range(LOCAL_T_MAX): pi_ = self.local_network.run_policy(sess, self.game_state.s_t) action = self.choose_action(pi_) states.append(self.game_state.s_t) actions.append(action) value_ = self.local_network.run_value(sess, self.game_state.s_t) values.append(value_) if (self.thread_index == 0) and (self.local_t % 100) == 0: print "pi=", pi_ print " V=", value_ # process game self.game_state.process(action) # receive game result reward = self.game_state.reward terminal = self.game_state.terminal self.episode_reward += reward # clip reward rewards.append( np.clip(reward, -1, 1) ) self.local_t += 1 # s_t1 -> s_t self.game_state.update() if terminal: terminal_end = True print "score=", self.episode_reward self._record_score(sess, summary_writer, summary_op, score_input, self.episode_reward, global_t) self.episode_reward = 0 self.game_state.reset() break R = 0.0 if not terminal_end: R = self.local_network.run_value(sess, self.game_state.s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() # compute and accmulate gradients for(ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + GAMMA * R td = R - Vi a = np.zeros([ACTION_SIZE]) a[ai] = 1 sess.run( self.accum_gradients, feed_dict = { self.local_network.s: [si], self.local_network.a: [a], self.local_network.td: [td], self.local_network.r: [R]} ) cur_learning_rate = self._anneal_learning_rate(global_t) sess.run( self.apply_gradients, feed_dict = { self.learning_rate_input: cur_learning_rate } ) if (self.thread_index == 0) and (self.local_t % 100) == 0: print "TIMESTEP", self.local_t # 進んだlocal step数を返す diff_local_t = self.local_t - start_local_t return diff_local_t
from a3c_training_thread import A3CTrainingThread from rmsprop_applier import RMSPropApplier from constants import ACTION_SIZE from constants import PARALLEL_SIZE from constants import MAX_TIME_STEP from constants import CHECKPOINT_DIR from constants import RMSP_EPSILON from constants import RMSP_ALPHA from constants import GRAD_NORM_CLIP from constants import USE_GPU # use CPU for weight visualize tool device = "/cpu:0" global_network = GameACNetwork(ACTION_SIZE, device) training_threads = [] learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate=learning_rate_input, decay=RMSP_ALPHA, momentum=0.0, epsilon=RMSP_EPSILON, clip_norm=GRAD_NORM_CLIP, device=device) for i in range(PARALLEL_SIZE): training_thread = A3CTrainingThread(i, global_network,
values = [] sum = 0.0 for rate in pi_values: sum = sum + rate value = sum values.append(value) r = random.random() * sum for i in range(len(values)): if values[i] >= r: return i #fail safe return len(values) - 1 global_network = GameACNetwork(ACTION_SIZE) training_threads = [] for i in range(PARALLEL_SIZE): training_thread = A3CTrainingThread(i, global_network, 1.0, 8000000) training_threads.append(training_thread) sess = tf.Session() init = tf.initialize_all_variables() sess.run(init) saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR) if checkpoint and checkpoint.model_checkpoint_path: saver.restore(sess, checkpoint.model_checkpoint_path) print "checkpoint loaded:", checkpoint.model_checkpoint_path
def choose_action(pi_values): values = [] sum = 0.0 for rate in pi_values: sum = sum + rate value = sum values.append(value) r = random.random() * sum for i in range(len(values)): if values[i] >= r: return i; #fail safe return len(values)-1 global_network = GameACNetwork(ACTION_SIZE) learning_rate_input = tf.placeholder("float") policy_applier = RMSPropApplier(learning_rate = learning_rate_input, decay = 0.99, momentum = 0.0, epsilon = RMSP_EPSILON ) value_applier = RMSPropApplier(learning_rate = learning_rate_input, decay = 0.99, momentum = 0.0, epsilon = RMSP_EPSILON ) training_threads = [] for i in range(PARALLEL_SIZE):
class A3CTrainingThread(object): def __init__(self, thread_index, global_network, initial_learning_rate, max_global_time_step): self.thread_index = thread_index self.learning_rate_input = tf.placeholder("float") self.max_global_time_step = max_global_time_step self.local_network = GameACNetwork(ACTION_SIZE) self.local_network.prepare_loss(ENTROPY_BETA) # policy self.policy_trainer = AccumTrainer() self.policy_trainer.prepare_minimize( self.local_network.policy_loss, self.local_network.get_policy_vars()) self.policy_accum_gradients = self.policy_trainer.accumulate_gradients( ) self.policy_reset_gradients = self.policy_trainer.reset_gradients() self.policy_applier = RMSPropApplier( learning_rate=self.learning_rate_input, decay=0.99, momentum=0.0, epsilon=RMSP_EPSILON) self.policy_apply_gradients = self.policy_applier.apply_gradients( global_network.get_policy_vars(), self.policy_trainer.get_accum_grad_list()) # value self.value_trainer = AccumTrainer() self.value_trainer.prepare_minimize( self.local_network.value_loss, self.local_network.get_value_vars()) self.value_accum_gradients = self.value_trainer.accumulate_gradients() self.value_reset_gradients = self.value_trainer.reset_gradients() self.value_applier = RMSPropApplier( learning_rate=self.learning_rate_input, decay=0.99, momentum=0.0, epsilon=RMSP_EPSILON) self.value_apply_gradients = self.value_applier.apply_gradients( global_network.get_value_vars(), self.value_trainer.get_accum_grad_list()) self.sync = self.local_network.sync_from(global_network) self.game_state = GameState(113 * thread_index) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 # thread0 will record score for TensorBoard if self.thread_index == 0: self.score_input = tf.placeholder(tf.int32) tf.scalar_summary("score", self.score_input) def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * ( self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values): values = [] sum = 0.0 for rate in pi_values: sum = sum + rate value = sum values.append(value) r = random.random() * sum for i in range(len(values)): if values[i] >= r: return i #fail safe return len(values) - 1 def _record_score(self, sess, summary_writer, summary_op, score, global_t): summary_str = sess.run(summary_op, feed_dict={self.score_input: score}) summary_writer.add_summary(summary_str, global_t) def process(self, sess, global_t, summary_writer, summary_op): states = [] actions = [] rewards = [] values = [] terminal_end = False # 加算された勾配をリセット sess.run(self.policy_reset_gradients) sess.run(self.value_reset_gradients) # shared から localにweightをコピー sess.run(self.sync) start_local_t = self.local_t # 5回ループ for i in range(LOCAL_T_MAX): pi_ = self.local_network.run_policy(sess, self.game_state.s_t) action = self.choose_action(pi_) states.append(self.game_state.s_t) actions.append(action) value_ = self.local_network.run_value(sess, self.game_state.s_t) values.append(value_) if (self.thread_index == 0) and (self.local_t % 100) == 0: print "pi=", pi_ print " V=", value_ # gameを実行 self.game_state.process(action) # 実行した結果 reward = self.game_state.reward terminal = self.game_state.terminal self.episode_reward += reward rewards.append(reward) self.local_t += 1 self.game_state.update() if terminal: terminal_end = True print "score=", self.episode_reward if self.thread_index == 0: self._record_score(sess, summary_writer, summary_op, self.episode_reward, global_t) self.episode_reward = 0 break R = 0.0 if not terminal_end: R = self.local_network.run_value(sess, self.game_state.s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() # 勾配を算出して加算していく for (ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + GAMMA * R td = R - Vi a = np.zeros([ACTION_SIZE]) a[ai] = 1 sess.run(self.policy_accum_gradients, feed_dict={ self.local_network.s: [si], self.local_network.a: [a], self.local_network.td: [td] }) sess.run(self.value_accum_gradients, feed_dict={ self.local_network.s: [si], self.local_network.r: [R] }) cur_learning_rate = self._anneal_learning_rate(global_t) sess.run(self.policy_apply_gradients, feed_dict={self.learning_rate_input: cur_learning_rate}) sess.run(self.value_apply_gradients, feed_dict={self.learning_rate_input: cur_learning_rate}) if (self.thread_index == 0) and (self.local_t % 100) == 0: print "TIMESTEP", self.local_t # 進んだlocal step数を返す diff_local_t = self.local_t - start_local_t return diff_local_t