def __init__(self, thread_id, env_name, global_model, init_lr, lr_ph, grad_applier,
             max_time_steps, model_dim, gamma):
    self.thread_id = thread_id
    self.global_model = global_model
    self.init_lr = init_lr
    self.grad_applier = grad_applier
    self.lr_ph = lr_ph
    self.max_time_steps = max_time_steps
    self.gamma = gamma

    height, width, num_frames, num_actions = model_dim
    self.local_model = ConvNetA3C(height, width, num_frames, num_actions)
    self.num_actions = num_actions

    trainer = AccumTrainer("/cpu:0")
    trainer.prepare_minimize(self.local_model.loss, self.local_model.params)
    self.accum_grads = trainer.accumulate_gradients()
    self.reset_grads = trainer.reset_gradients()
    self.apply_grads = grad_applier.apply_gradients(
        global_model.params, trainer.get_accum_grad_list())

    self.sync = self.local_model.sync_from(global_model)

    self.env = AtariAleEnvironment(env_name)
    self.s_t = self.env.reset()

    self.start_time = None
    self.ep_rwd, self.num_ep = 0, 0
    self.avg_rwd = None
    self.t = 0
    self.prev_t = 0
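# --- Illustrative sketch (not from the original code): ep_rwd, num_ep, and avg_rwd above look like
# per-episode bookkeeping; one common way to maintain avg_rwd is a running (exponential moving)
# average of finished-episode returns. The helper below is an assumption about how those fields
# could be updated, not the author's implementation.
def update_avg_reward(avg_rwd, ep_rwd, alpha=0.05):
    """Return the new running-average reward after an episode with return ep_rwd."""
    if avg_rwd is None:  # the first finished episode initialises the average
        return ep_rwd
    return (1.0 - alpha) * avg_rwd + alpha * ep_rwd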
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step, device):
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.local_network = GameACNetwork(ACTION_SIZE, device)
        self.local_network.prepare_loss(ENTROPY_BETA)

        # TODO: don't need accum trainer anymore with batch
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(
            self.local_network.total_loss,
            self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()
        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(),
            self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        values = []
        sum = 0.0
        for rate in pi_values:
            sum = sum + rate
            value = sum
            values.append(value)

        r = random.random() * sum
        for i in range(len(values)):
            if values[i] >= r:
                return i
        # fail safe
        return len(values) - 1

    def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
        summary_str = sess.run(summary_op, feed_dict={
            score_input: score
        })
        summary_writer.add_summary(summary_str, global_t)

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
            action = self.choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print "pi=", pi_
                print " V=", value_

            # process game
            self.game_state.process(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if terminal:
                terminal_end = True
                print "score=", self.episode_reward

                self._record_score(sess, summary_writer, summary_op, score_input,
                                   self.episode_reward, global_t)

                self.episode_reward = 0
                self.game_state.reset()
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        sess.run(self.accum_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R})

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print "TIMESTEP", self.local_t

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
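# --- Illustrative sketch (not from the original code): the backward loop in process() above computes
# n-step bootstrapped returns R_t = r_t + GAMMA * R_{t+1} and advantages td_t = R_t - V(s_t).
# The helper below reproduces that arithmetic in isolation; the name and return convention are
# illustrative only.
def n_step_returns_and_advantages(rewards, values, bootstrap_value, gamma=0.99):
    """Compute discounted n-step returns and advantages for a rollout fragment.

    rewards, values: lists in chronological order; bootstrap_value: V(s_T), or 0.0 if terminal.
    """
    R = bootstrap_value
    returns, advantages = [], []
    for r, v in zip(reversed(rewards), reversed(values)):
        R = r + gamma * R
        returns.append(R)
        advantages.append(R - v)
    # reverse back to chronological order
    return list(reversed(returns)), list(reversed(advantages))

# Example: rewards=[0, 0, 1], values=[0.5, 0.4, 0.9], bootstrap_value=0.0 (terminal)
# -> returns ~ [0.9801, 0.99, 1.0], advantages ~ [0.4801, 0.59, 0.1]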
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device, environment):
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        # self.local_network = GameACNetwork(ACTION_SIZE, device)
        self.local_network = global_network.structural_clone(
            network_name="thread-net-%s" % self.thread_index)
        self.local_network.prepare_loss(ENTROPY_BETA)

        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()
        self.apply_gradients, self.grad_summary_op = grad_applier.apply_gradients(
            global_network.get_vars(),
            self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        # self.game_state = GameState(113 * thread_index)
        self.game_state = GymGameState(113 * thread_index, env=environment)

        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0

        self.lstm_last_output_state = None  # cache last lstm hidden states here

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step - global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        values = []
        sum = 0.0
        for rate in pi_values:
            sum = sum + rate
            value = sum
            values.append(value)

        r = random.random() * sum
        for i in range(len(values)):
            if values[i] >= r:
                return i
        # fail safe
        return len(values) - 1

    # def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
    #     summary_str = sess.run(summary_op, feed_dict={
    #         score_input: score,
    #     })
    #     summary_writer.add_summary(summary_str, global_t)

    """next steps
    x init the lstm state before process is called, somewhere
    x reinit lstm state after terminal episodes
    ?!? allow lstm state to persist even after global weights are copied (i guess)
    x feed state in to lstm during policy evals
    how does state work in gradient backups?

    Tests:
    - inspect lstm state inputs, outputs, and episode stored values
    -
    """

    def reset(self):
        # todo: any other states to clean up??
        # could have been absorbed in to a check for validity of game state...
        # but that's kind of magic-y and icky for this scenario
        self.game_state.reset()

    def process(self, sess, global_t, summary_writer, record_score_fn):  # summary_op, score_input):
        states = []
        actions = []
        rewards = []
        values = []
        lstm_states = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        # write weight summaries...only need them from one thread really
        if self.thread_index == 0:
            param_summary = sess.run(self.local_network.param_summary_op)
            summary_writer.add_summary(param_summary, global_step=global_t)

        start_local_t = self.local_t

        # resume with wherever we left off on last time through the action loop
        # TODO: no reason the network itself should care about this
        if self.lstm_last_output_state is None:
            self.lstm_last_output_state = self.local_network.lstm_initial_state_value
        lstm_state = self.lstm_last_output_state
        # lstm_state = self.local_network.lstm_last_output_state_value

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            states.append(self.game_state.s_t)
            lstm_states.append(lstm_state)

            pi_, value_, lstm_state = self.local_network.run(
                sess, self.game_state.s_t, lstm_state)
            action = self.local_network.sample_action(pi_)
            # print "a3c train: pi_: ", pi_
            # print "a3c train: action: ", action

            # pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
            # action = choose_action(pi_)  # self.choose_action(pi_)

            actions.append(action)
            # value_ = self.local_network.run_value(sess, self.game_state.s_t)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print "pi=", pi_
                print " V=", value_

            # process game
            self.game_state.process(action)

            # s_t1 -> s_t
            self.game_state.update()  # not sure why this is separate...

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            # clip reward
            rewards.append(np.clip(reward, -1, 1000))

            self.local_t += 1

            if terminal:
                terminal_end = True
                print "terminal score =", self.episode_reward

                # self._record_score(sess, summary_writer, summary_op, score_input,
                #                    self.episode_reward, global_t)
                record_score_fn(sess, summary_writer, self.episode_reward, global_t)

                self.episode_reward = 0
                self.game_state.reset()
                # ugh. reset lstm state!
                lstm_state = self.local_network.lstm_initial_state_value
                break

        R = 0.0
        if not terminal_end:
            # R = self.local_network.run_value(sess, self.game_state.s_t)
            _, R, _ = self.local_network.run(sess, self.game_state.s_t, lstm_state)

        # self.local_network.lstm_last_output_state_value = lstm_state
        # preserve for next time through the loop
        self.lstm_last_output_state = lstm_state

        # TODO: can't store the lists i pass directly since they'll be destructively reversed by
        # this call....hmmmmm
        # maybe just reverse them here and leave it?
        # start with copying the lists
        self.backup_and_accum_gradients(sess, global_t, summary_writer,
                                        states=states,
                                        lstm_states=lstm_states,
                                        actions=actions,
                                        values=values,
                                        rewards=rewards,
                                        final_reward_estimate=R)

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print("TIMESTEP %d GLOBAL %d" % (self.local_t, global_t))

        # return the number of local steps advanced
        diff_local_t = self.local_t - start_local_t
        return diff_local_t

    def backup_and_accum_gradients(self, sess, global_t, summary_writer,
                                   states, lstm_states, actions, values, rewards,
                                   final_reward_estimate):
        """
        inputs are lists reflecting a recorded episode fragment in the order they occurred
            a = sample{ pi(a | s, lstm_s) }
            v = V(s, lstm_s)
            r = env.step(a)
        :param states: states
        :param actions:
        :param rewards:
        :param lstm_states:
        :return:
        """
        # TODO: copy these and leave the originals alone...
        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()
        lstm_states.reverse()

        R = final_reward_estimate

        # compute and accumulate gradients
        for (ai, ri, si, Vi, lstm_si) in zip(actions, rewards, states, values, lstm_states):
            R = ri + GAMMA * R
            td = R - Vi

            a = self.local_network.feedback_action(ai)
            # a = np.zeros([self.local_network.action_size])
            # a[ai] = 1

            # reshape state input
            # no batching for now
            _, loss_summary = sess.run(
                [self.accum_gradients, self.local_network.loss_summary_op],
                feed_dict=self.local_network.loss_feed_dictionary(si, a, td, R, lstm_si)
                # feed_dict={
                #     self.local_network.s: [si],
                #     self.local_network.a: [a],
                #     self.local_network.td: [td],
                #     self.local_network.r: [R],
                #     self.local_network.lstm_current_state_tensor: lstm_si
                # }
            )
            if self.thread_index == 0:
                summary_writer.add_summary(loss_summary, global_step=global_t)

        """
        idea: maybe possible to do n-step TBPTT after having retroactively computed R for each state
              feed in batches of size up to n_max to a set of parallel networks with

        idea: set up the lstm with say 5 recursive calls. then the initial inputs would need to be
              padded...maybe? would work if made the inputs in batches and altered iteration logic
              to cycle inputs through the history...
        """

        cur_learning_rate = self._anneal_learning_rate(global_t)

        _, grad_summary = sess.run(
            [self.apply_gradients, self.grad_summary_op],
            feed_dict={self.learning_rate_input: cur_learning_rate})
        if self.thread_index == 0:
            summary_writer.add_summary(grad_summary, global_step=global_t)

    # TODO: rename 'states' variable as 'observations' in next version just to be crystal clear
    def process_memory(self, sess, global_t, summary_writer, states, initial_lstm_state,
                       actions, rewards, final_state):
        """
        :param sess:
        :param global_t:
        :param summary_writer:
        :param states:
        :param initial_lstm_state:
        :param actions:
        :param rewards:
        :param final_state: observation after the last game step...use None to signal terminal,
                            otherwise used to compute the final bootstrap Value
        :return:
        """
        # TODO: gotcha - initial_lstm_state must be set carefully
        # if the episode reflects t=0, the state is always known
        # otherwise how can we know what the lstm state output of the *current* policy might
        # plausibly have been, unless the same policy was executed from the very beginning of the
        # historical episode and propagated
        # we could just record the lstm_state prior to the beginning of the history episode as an
        # approximation; we might expect it to converge reasonably after a number of steps to
        # something from the plausible distribution for the current policy...however, over time,
        # the policy will drift away further and further from what created the original lstm_state
        # this suggests the solution that we update the stored initial lstm state in the replay
        # memory after every refresh ...almost like a real memory trace in a human brain might...
        # but how can we update it ????
        # maybe keep one state in reserve just to prime...but then we can only update the lstm
        # state after it, not the one that initial state needs....HMMM. maybe just
        #
        # for certain environments we could just apply the network to s_t+0 repeatedly until the
        # lstm state converges; this works if the problem and/or env don't depend on any direct
        # measure of time..perhaps
        #
        # easiest solution might just be to always reference the episodes to t=0
        # or just ignore first k states when backing up and computing gradients...since presumably
        # we'll have converged to something reasonable by that point

        values = []
        lstm_states = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        lstm_state = initial_lstm_state

        for (s_t, a_t, r_t) in zip(states, actions, rewards):
            # accum lstm states
            lstm_states.append(lstm_state)

            pi_, value_, lstm_state = self.local_network.run(sess, s_t, lstm_state)

            # get values
            values.append(value_)
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step, device,
                 game_function=ale_game_state, local_network=None):
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.local_network = local_network()
        self.local_network.prepare_loss(ENTROPY_BETA)

        # TODO: don't need accum trainer anymore with batch
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(
            self.local_network.total_loss,
            self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()
        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(),
            self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = game_function(thread_index)
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0

        # variable controlling log output
        self.prev_local_t = 0

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        values = []
        sum = 0.0
        for rate in pi_values:
            sum = sum + rate
            value = sum
            values.append(value)

        r = random.random() * sum
        for i in range(len(values)):
            if values[i] >= r:
                return i
        # fail safe
        return len(values) - 1

    def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
        summary_str = sess.run(summary_op, feed_dict={
            score_input: score
        })
        summary_writer.add_summary(summary_str, global_t)

    def set_start_time(self, start_time):
        self.start_time = start_time

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        if USE_LSTM:
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
            action = self.choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("state={}".format(self.game_state.s_t))
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            # process game
            self.game_state.process(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if terminal:
                terminal_end = True
                print("score={}".format(self.episode_reward))

                self._record_score(sess, summary_writer, summary_op, score_input,
                                   self.episode_reward, global_t)

                self.episode_reward = 0
                self.game_state.reset()
                if USE_LSTM:
                    self.local_network.reset_state()
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        if USE_LSTM:
            batch_si.reverse()
            batch_a.reverse()
            batch_td.reverse()
            batch_R.reverse()

            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.local_network.initial_lstm_state: start_lstm_state,
                         self.local_network.step_size: [len(batch_a)]})
        else:
            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R})

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
                global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, policy_applier, value_applier,
                 max_global_time_step):
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.local_network = GameACNetwork(ACTION_SIZE)
        self.local_network.prepare_loss(ENTROPY_BETA)

        # policy
        self.policy_trainer = AccumTrainer()
        self.policy_trainer.prepare_minimize(
            self.local_network.policy_loss,
            self.local_network.get_policy_vars(),
            GRAD_NORM_CLIP)

        self.policy_accum_gradients = self.policy_trainer.accumulate_gradients()
        self.policy_reset_gradients = self.policy_trainer.reset_gradients()
        self.policy_apply_gradients = policy_applier.apply_gradients(
            global_network.get_policy_vars(),
            self.policy_trainer.get_accum_grad_list())

        # value
        self.value_trainer = AccumTrainer()
        self.value_trainer.prepare_minimize(
            self.local_network.value_loss,
            self.local_network.get_value_vars(),
            GRAD_NORM_CLIP)

        self.value_accum_gradients = self.value_trainer.accumulate_gradients()
        self.value_reset_gradients = self.value_trainer.reset_gradients()
        self.value_apply_gradients = value_applier.apply_gradients(
            global_network.get_value_vars(),
            self.value_trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0

        # thread0 will record score for TensorBoard
        if self.thread_index == 0:
            self.score_input = tf.placeholder(tf.int32)
            tf.scalar_summary("score", self.score_input)

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        values = []
        sum = 0.0
        for rate in pi_values:
            sum = sum + rate
            value = sum
            values.append(value)

        r = random.random() * sum
        for i in range(len(values)):
            if values[i] >= r:
                return i
        # fail safe
        return len(values) - 1

    def _record_score(self, sess, summary_writer, summary_op, score, global_t):
        summary_str = sess.run(summary_op, feed_dict={
            self.score_input: score
        })
        summary_writer.add_summary(summary_str, global_t)

    def process(self, sess, global_t, summary_writer, summary_op):
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # reset the accumulated gradients
        sess.run(self.policy_reset_gradients)
        sess.run(self.value_reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
            action = self.choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)

            value_ = self.local_network.run_value(sess, self.game_state.s_t)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print "pi=", pi_
                print " V=", value_

            # run the game
            self.game_state.process(action)

            # result of the step
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            rewards.append(reward)

            self.local_t += 1

            self.game_state.update()

            if terminal:
                terminal_end = True
                print "score=", self.episode_reward

                if self.thread_index == 0:
                    self._record_score(sess, summary_writer, summary_op,
                                       self.episode_reward, global_t)

                self.episode_reward = 0
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        # compute and accumulate gradients
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            sess.run(self.policy_accum_gradients,
                     feed_dict={
                         self.local_network.s: [si],
                         self.local_network.a: [a],
                         self.local_network.td: [td]})

            sess.run(self.value_accum_gradients,
                     feed_dict={
                         self.local_network.s: [si],
                         self.local_network.r: [R]})

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.policy_apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        # Learning rate for Critic is half of Actor's
        sess.run(self.value_apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate * 0.5})

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print "TIMESTEP", self.local_t

        # return the number of local steps advanced
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
class A3CTrainingThread(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device, options): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step self.options = options if options.use_lstm: self.local_network = GameACLSTMNetwork(options.action_size, thread_index, device) else: self.local_network = GameACFFNetwork(options.action_size, device) self.local_network.prepare_loss(options.entropy_beta) # TODO: don't need accum trainer anymore with batch self.trainer = AccumTrainer(device) self.trainer.prepare_minimize( self.local_network.total_loss, self.local_network.get_vars() ) self.accum_gradients = self.trainer.accumulate_gradients() self.reset_gradients = self.trainer.reset_gradients() self.apply_gradients = grad_applier.apply_gradients( global_network.get_vars(), self.trainer.get_accum_grad_list() ) self.sync = self.local_network.sync_from(global_network) self.game_state = GameState(random.randint(0, 2**16), options, thread_index = thread_index) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 self.indent = " |" * self.thread_index self.steps = 0 self.no_reward_steps = 0 self.terminate_on_lives_lost = options.terminate_on_lives_lost and (self.thread_index != 0) if self.options.train_episode_steps > 0: self.max_reward = 0.0 self.max_episode_reward = 0.0 self.episode_states = [] self.episode_actions = [] self.episode_rewards = [] self.episode_values = [] self.episode_liveses = [] self.episode_scores = Episode_scores(options) self.tes = self.options.train_episode_steps if self.options.tes_list is not None: self.tes = self.options.tes_list[thread_index] print("[DIVERSITY]th={}:tes={}".format(thread_index, self.tes)) self.initial_lives = self.game_state.initial_lives self.max_history = int(self.tes * self.options.tes_extend_ratio * 2.1) if self.options.record_new_record_dir is not None: if self.thread_index == 0: if not os.path.exists(self.options.record_new_record_dir): os.makedirs(self.options.record_new_record_dir) self.episode_screens = [] if self.options.record_new_room_dir is not None: if self.thread_index == 0: if not os.path.exists(self.options.record_new_room_dir): os.makedirs(self.options.record_new_room_dir) self.episode_screens = [] self.greediness = options.greediness self.repeat_action_ratio = options.repeat_action_ratio self.prev_action = 0 def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values, global_t): # Add greediness for broader exploration r = random.random() if r < self.greediness: action = int(r * len(pi_values)) elif r < self.repeat_action_ratio: action = self.prev_action else: # Increase randomness of choice if no reward term is too long if self.no_reward_steps > self.options.no_reward_steps: randomness = (self.no_reward_steps - self.options.no_reward_steps) * self.options.randomness pi_values += randomness pi_values /= sum(pi_values) if self.local_t % self.options.randomness_log_interval == 0: elapsed_time = time.time() - self.start_time print("t={:6.0f},s={:9d},th={}:{}randomness={:.8f}".format( elapsed_time, global_t, self.thread_index, self.indent, randomness)) pi_values -= np.finfo(np.float32).epsneg action_samples = 
np.random.multinomial(self.options.num_experiments, pi_values) action = action_samples.argmax(0) self.prev_action = action return action def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t): summary_str = sess.run(summary_op, feed_dict={ score_input: score }) summary_writer.add_summary(summary_str, global_t) def set_start_time(self, start_time): self.start_time = start_time #@profile def process(self, sess, global_t, summary_writer, summary_op, score_input): states = [] actions = [] rewards = [] values = [] liveses = [self.game_state.lives] if self.tes > 0: if self.episode_liveses == []: self.episode_liveses.append(self.game_state.lives) terminal_end = False # reset accumulated gradients sess.run( self.reset_gradients ) # copy weights from shared to local sess.run( self.sync ) start_local_t = self.local_t if self.options.use_lstm: start_lstm_state = self.local_network.lstm_state_out # t_max times loop for i in range(self.options.local_t_max): pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t) action = self.choose_action(pi_, global_t) states.append(self.game_state.s_t) actions.append(action) values.append(value_) liveses.append(self.game_state.lives) if (self.thread_index == 0) and (self.local_t % self.options.log_interval == 0): print("pi={} (thread{})".format(pi_, self.thread_index)) print(" V={} (thread{})".format(value_, self.thread_index)) # process game self.game_state.process(action) # receive game result reward = self.game_state.reward terminal = self.game_state.terminal self.episode_reward += reward if reward > 0 and \ (self.options.rom == "montezuma_revenge.bin" or self.options.gym_env == "MontezumaRevenge-v0"): elapsed_time = time.time() - self.start_time print("t={:6.0f},s={:4.0f},th={}:{}r={:3.0f}RM{:02d}| NEW-SCORE".format( elapsed_time, global_t, self.thread_index, self.indent, self.episode_reward, self.game_state.room_no)) # pseudo-count reward if self.options.psc_use: reward += self.game_state.psc_reward # add basic income after some no reward steps if self.no_reward_steps > self.options.no_reward_steps: reward += self.options.basic_income # clip reward if self.options.reward_clip > 0.0: reward = np.clip(reward, -self.options.reward_clip, self.options.reward_clip) rewards.append( reward ) # collect episode log if self.tes > 0: self.episode_states.append(self.game_state.s_t) self.episode_actions.append(action) self.episode_rewards.append(reward) self.episode_values.append(value_) self.episode_liveses.append(self.game_state.lives) if len(self.episode_states) > self.max_history * 2: self.episode_states = self.episode_states[-self.max_history:] self.episode_actions = self.episode_actions[-self.max_history:] self.episode_rewards = self.episode_rewards[-self.max_history:] self.episode_values = self.episode_values[-self.max_history:] self.episode_liveses = self.episode_liveses[-self.max_history-1:] # requirement for OpenAI Gym: --clear-history-on-death=False if self.options.clear_history_on_death and (liveses[-2] > liveses[-1]): self.episode_states = [] self.episode_actions = [] self.episode_rewards = [] self.episode_values = [] self.episode_liveses = self.episode_liveses[-2:] self.local_t += 1 if self.options.record_new_record_dir is not None \ or self.options.record_new_room_dir is not None: screen = self.game_state.uncropped_screen if self.options.compress_frame: screen = lzma.compress(screen.tobytes(), preset=0) self.episode_screens.append(screen) # terminate if the play time is too long self.steps += 1 if 
self.steps > self.options.max_play_steps: terminal = True # requirement for OpenAI Gym: --terminate-on-lives-lost=False # terminate if lives lost if self.terminate_on_lives_lost and (liveses[-2] > liveses[-1]): terminal = True # count no reward steps if self.game_state.reward == 0.0: self.no_reward_steps += 1 else: self.no_reward_steps = 0 # s_t1 -> s_t self.game_state.update() if self.local_t % self.options.score_log_interval == 0: elapsed_time = time.time() - self.start_time print("t={:6.0f},s={:9d},th={}:{}r={:3.0f}RM{:02d}| l={:.0f},v={:.5f},pr={:.5f}".format( elapsed_time, global_t, self.thread_index, self.indent, self.episode_reward, self.game_state.room_no, self.game_state.lives, value_, self.game_state.psc_reward)) # if self.game_state.room_no != self.game_state.prev_room_no: # elapsed_time = time.time() - self.start_time # print("t={:6.0f},s={:9d},th={}:{}RM{:02d}>RM{:02d}| l={:.0f},v={:.5f},pr={:.5f}".format( # elapsed_time, global_t, self.thread_index, self.indent, # self.game_state.prev_room_no, self.game_state.room_no, # self.game_state.lives, value_, self.game_state.psc_reward)) if self.tes > 0: if self.game_state.lives < self.episode_liveses[-2]: elapsed_time = time.time() - self.start_time print("t={:6.0f},s={:9d},th={}:{}l={:.0f}>{:.0f}RM{:02d}|".format( elapsed_time, global_t, self.thread_index, self.indent, self.episode_liveses[-2], self.game_state.lives, self.game_state.room_no)) # seperate steps after getting reward if self.game_state.reward > 0: if not terminal: break if terminal: terminal_end = True elapsed_time = time.time() - self.start_time end_mark = "end" if self.terminate_on_lives_lost else "END" print("t={:6.0f},s={:9d},th={}:{}r={:3.0f}@{}|".format( elapsed_time, global_t, self.thread_index, self.indent, self.episode_reward, end_mark)) self._record_score(sess, summary_writer, summary_op, score_input, self.episode_reward, global_t) if self.tes > 0: if self.options.record_new_room_dir is not None \ and self.game_state.new_room >= 0: dirname = "s{:09d}-th{}-r{:03.0f}-RM{:02d}".format(global_t, self.thread_index,\ self.episode_reward, self.game_state.new_room) dirname = os.path.join(self.options.record_new_room_dir, dirname) os.makedirs(dirname) for index, screen in enumerate(self.episode_screens): filename = "{:06d}.png".format(index) filename = os.path.join(dirname, filename) screen_image = screen if self.options.compress_frame: screen_image = np.frombuffer(lzma.decompress(screen), dtype=np.uint8).reshape((210, 160)) cv2.imwrite(filename, screen_image) print("@@@ New Room record screens saved to {}".format(dirname)) if self.episode_reward > self.max_episode_reward: if self.options.record_new_record_dir is not None: dirname = "s{:09d}-th{}-r{:03.0f}-RM{:02d}".format(global_t, self.thread_index,\ self.episode_reward, self.game_state.room_no) dirname = os.path.join(self.options.record_new_record_dir, dirname) os.makedirs(dirname) for index, screen in enumerate(self.episode_screens): filename = "{:06d}.png".format(index) filename = os.path.join(dirname, filename) screen_image = screen if self.options.compress_frame: screen_image = np.frombuffer(lzma.decompress(screen), dtype=np.uint8).reshape((210, 160)) cv2.imwrite(filename, screen_image) print("@@@ New Record screens saved to {}".format(dirname)) self.max_episode_reward = self.episode_reward if self.options.record_all_non0_record: self.max_episode_reward = 0 self.max_reward = 0.0 self.episode_states = [] self.episode_actions = [] self.episode_rewards = [] self.episode_values = [] self.episode_liveses = [] 
self.episode_scores.add(self.episode_reward, global_t, self.thread_index) if self.options.record_new_record_dir is not None \ or self.options.record_new_room_dir is not None: self.episode_screens= [] self.episode_reward = 0 self.steps = 0 self.no_reward_steps = 0 self.game_state.reset() if self.options.use_lstm: self.local_network.reset_state() break if self.thread_index == 0 and self.local_t % self.options.performance_log_interval < self.options.local_t_max: elapsed_time = time.time() - self.start_time steps_per_sec = global_t / elapsed_time print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format( global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) if self.options.gym_eval: diff_local_t = self.local_t - start_local_t return diff_local_t, terminal_end # don't train if following condition # requirement for OpenAI Gym: --terminate-on-lives-lost=False if self.options.terminate_on_lives_lost and (self.thread_index == 0) and (not self.options.train_in_eval): return 0, terminal_end else: if self.tes > 0: _ = self.episode_scores.is_highscore(self.episode_reward) if self.episode_reward > self.max_reward: self.max_reward = self.episode_reward if True: tes = self.tes # requirement for OpenAI Gym: --test-extend=False if self.options.tes_extend and self.initial_lives != 0: tes *= self.options.tes_extend_ratio * (self.game_state.lives / self.initial_lives) if self.game_state.lives == self.initial_lives: tes *= 2 tes = int(tes) tes = min(tes, len(self.episode_states)) print("[OHL]SCORE={:3.0f},s={:9d},th={},lives={},steps={},tes={},RM{:02d}".format(self.episode_reward, global_t, self.thread_index, self.game_state.lives, self.steps, tes, self.game_state.room_no)) if tes == 0: states = [] actions = [] rewards = [] values = [] liveses = self.episode_liveses[-1:] else: states = self.episode_states[-tes:] actions = self.episode_actions[-tes:] rewards = self.episode_rewards[-tes:] values = self.episode_values[-tes:] liveses = self.episode_liveses[-tes-1:] if self.options.clear_history_after_ohl: self.episode_states = [] self.episode_actions = [] self.episode_rewards = [] self.episode_values = [] self.episode_liveses = self.episode_liveses[-2:] if len(states) > 0: R = 0.0 if not terminal_end: R = self.local_network.run_value(sess, self.game_state.s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_si = [] batch_a = [] batch_td = [] batch_R = [] lives = liveses.pop() # compute and accmulate gradients for(ai, ri, si, Vi) in zip(actions, rewards, states, values): # Consider the number of lives if (not self.options.use_gym) and self.initial_lives != 0.0 and not self.terminate_on_lives_lost: prev_lives = liveses.pop() if prev_lives > lives: weight = self.options.lives_lost_weight rratio = self.options.lives_lost_rratio R *= rratio * ( (1.0 - weight) + weight * (lives / prev_lives) ) ri = self.options.lives_lost_reward lives = prev_lives R = ri + self.options.gamma * R td = R - Vi a = np.zeros([self.options.action_size]) a[ai] = 1 batch_si.append(si) batch_a.append(a) batch_td.append(td) batch_R.append(R) if self.options.use_lstm: batch_si.reverse() batch_a.reverse() batch_td.reverse() batch_R.reverse() sess.run( self.accum_gradients, feed_dict = { self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R, self.local_network.initial_lstm_state: start_lstm_state, self.local_network.step_size : [len(batch_a)] } ) else: sess.run( self.accum_gradients, feed_dict = { 
self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R} ) cur_learning_rate = self._anneal_learning_rate(global_t) sess.run( self.apply_gradients, feed_dict = { self.learning_rate_input: cur_learning_rate } ) # return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t, terminal_end
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step, device):
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.local_network = GameACNetwork(ACTION_SIZE, device)
        self.local_network.prepare_loss(ENTROPY_BETA)

        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(
            self.local_network.total_loss,
            self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()
        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(),
            self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        values = []
        sum = 0.0
        for rate in pi_values:
            sum = sum + rate
            value = sum
            values.append(value)

        r = random.random() * sum
        for i in range(len(values)):
            if values[i] >= r:
                return i
        # fail safe
        return len(values) - 1

    def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
        summary_str = sess.run(summary_op, feed_dict={
            score_input: score
        })
        summary_writer.add_summary(summary_str, global_t)

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
            action = self.choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)

            value_ = self.local_network.run_value(sess, self.game_state.s_t)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print "pi=", pi_
                print " V=", value_

            # process game
            self.game_state.process(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if terminal:
                terminal_end = True
                print "score=", self.episode_reward

                self._record_score(sess, summary_writer, summary_op, score_input,
                                   self.episode_reward, global_t)

                self.episode_reward = 0
                self.game_state.reset()
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        # compute and accumulate gradients
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: [si],
                         self.local_network.a: [a],
                         self.local_network.td: [td],
                         self.local_network.r: [R]})

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print "TIMESTEP", self.local_t

        # return the number of local steps advanced
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
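# --- Illustrative sketch (not from the original code): choose_action above samples an action index
# in proportion to pi_values by walking a running cumulative sum. The same draw can be written with
# NumPy as shown below; this is an equivalent formulation, not the author's implementation.
import numpy as np

def sample_action(pi_values):
    pi = np.asarray(pi_values, dtype=np.float64)
    cumulative = np.cumsum(pi)
    r = np.random.random() * cumulative[-1]      # scale by the (possibly != 1) total probability mass
    return int(np.searchsorted(cumulative, r))   # first index whose cumulative sum reaches r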
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step, device):
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        if USE_LSTM:
            self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
        else:
            self.local_network = GameACFFNetwork(ACTION_SIZE, device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        # TODO: don't need accum trainer anymore with batch
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()
        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(),
            self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * \
            (self.max_global_time_step - global_time_step) / \
            self.max_global_time_step
        assert learning_rate > 0, 'Learning rate {} is not >0'.format(learning_rate)
        return learning_rate

    def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
        summary_str = sess.run(summary_op, feed_dict={
            score_input: score
        })
        summary_writer.add_summary(summary_str, global_t)

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        states = []
        actions = []
        rewards = []
        values = []

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        if USE_LSTM:
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        start_local_t = self.local_t
        terminal_end = False
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
            action = choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)

            # Debug output for progress
            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print(('local_t = {:10} pi = ' + '{:7.5f} ' * len(pi_) +
                       ' V = {:8.4f} (thread {})').format(
                           self.local_t, *pi_, value_, self.thread_index))

            # process game
            self.game_state.process(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            # clip reward
            # TODO: Does this make sense?
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if terminal:
                terminal_end = True
                print("score=", self.episode_reward)

                self._record_score(sess, summary_writer, summary_op, score_input,
                                   self.episode_reward, global_t)

                self.episode_reward = 0
                self.game_state.reset()
                if USE_LSTM:
                    self.local_network.reset_state()
                break

        # Compute and accumulate gradients
        R = 0.0 if terminal_end else self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        # What is the meaning of these values?
        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        if USE_LSTM:
            batch_si.reverse()
            batch_a.reverse()
            batch_td.reverse()
            batch_R.reverse()

            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.local_network.initial_lstm_state: start_lstm_state,
                         self.local_network.step_size: [len(batch_a)]})
        else:
            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R})

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print("TIMESTEP", self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
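# --- Illustrative sketch (not from the original code): the linear schedule implemented by
# _anneal_learning_rate, written as a standalone function. Note that the class above asserts
# learning_rate > 0, so it would raise once global_time_step reaches max_global_time_step; the
# other variants clamp to 0.0 instead, as shown here.
def linear_annealed_learning_rate(initial_lr, global_time_step, max_global_time_step):
    lr = initial_lr * (max_global_time_step - global_time_step) / max_global_time_step
    return max(lr, 0.0)

# Example: initial_lr=7e-4 halfway through training -> 3.5e-4; at or beyond the end -> 0.0.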
class A3CTrainingThread(object): def __init__(self, thread_index, global_network, initial_learning_rate, max_global_time_step): self.thread_index = thread_index self.learning_rate_input = tf.placeholder("float") self.max_global_time_step = max_global_time_step self.local_network = GameACNetwork(ACTION_SIZE) self.local_network.prepare_loss(ENTROPY_BETA) # policy self.policy_trainer = AccumTrainer() self.policy_trainer.prepare_minimize( self.local_network.policy_loss, self.local_network.get_policy_vars()) self.policy_accum_gradients = self.policy_trainer.accumulate_gradients( ) self.policy_reset_gradients = self.policy_trainer.reset_gradients() self.policy_applier = RMSPropApplier( learning_rate=self.learning_rate_input, decay=0.99, momentum=0.0, epsilon=RMSP_EPSILON) self.policy_apply_gradients = self.policy_applier.apply_gradients( global_network.get_policy_vars(), self.policy_trainer.get_accum_grad_list()) # value self.value_trainer = AccumTrainer() self.value_trainer.prepare_minimize( self.local_network.value_loss, self.local_network.get_value_vars()) self.value_accum_gradients = self.value_trainer.accumulate_gradients() self.value_reset_gradients = self.value_trainer.reset_gradients() self.value_applier = RMSPropApplier( learning_rate=self.learning_rate_input, decay=0.99, momentum=0.0, epsilon=RMSP_EPSILON) self.value_apply_gradients = self.value_applier.apply_gradients( global_network.get_value_vars(), self.value_trainer.get_accum_grad_list()) self.sync = self.local_network.sync_from(global_network) self.game_state = GameState(113 * thread_index) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 # thread0 will record score for TensorBoard if self.thread_index == 0: self.score_input = tf.placeholder(tf.int32) tf.scalar_summary("score", self.score_input) def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * ( self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values): values = [] sum = 0.0 for rate in pi_values: sum = sum + rate value = sum values.append(value) r = random.random() * sum for i in range(len(values)): if values[i] >= r: return i #fail safe return len(values) - 1 def _record_score(self, sess, summary_writer, summary_op, score, global_t): summary_str = sess.run(summary_op, feed_dict={self.score_input: score}) summary_writer.add_summary(summary_str, global_t) def process(self, sess, global_t, summary_writer, summary_op): states = [] actions = [] rewards = [] values = [] terminal_end = False # 加算された勾配をリセット sess.run(self.policy_reset_gradients) sess.run(self.value_reset_gradients) # shared から localにweightをコピー sess.run(self.sync) start_local_t = self.local_t # 5回ループ for i in range(LOCAL_T_MAX): pi_ = self.local_network.run_policy(sess, self.game_state.s_t) action = self.choose_action(pi_) states.append(self.game_state.s_t) actions.append(action) value_ = self.local_network.run_value(sess, self.game_state.s_t) values.append(value_) if (self.thread_index == 0) and (self.local_t % 100) == 0: print "pi=", pi_ print " V=", value_ # gameを実行 self.game_state.process(action) # 実行した結果 reward = self.game_state.reward terminal = self.game_state.terminal self.episode_reward += reward rewards.append(reward) self.local_t += 1 self.game_state.update() if terminal: terminal_end = True print "score=", self.episode_reward if self.thread_index == 0: self._record_score(sess, summary_writer, summary_op, 
self.episode_reward, global_t) self.episode_reward = 0 break R = 0.0 if not terminal_end: R = self.local_network.run_value(sess, self.game_state.s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() # compute and accumulate the gradients for (ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + GAMMA * R td = R - Vi a = np.zeros([ACTION_SIZE]) a[ai] = 1 sess.run(self.policy_accum_gradients, feed_dict={ self.local_network.s: [si], self.local_network.a: [a], self.local_network.td: [td] }) sess.run(self.value_accum_gradients, feed_dict={ self.local_network.s: [si], self.local_network.r: [R] }) cur_learning_rate = self._anneal_learning_rate(global_t) sess.run(self.policy_apply_gradients, feed_dict={self.learning_rate_input: cur_learning_rate}) sess.run(self.value_apply_gradients, feed_dict={self.learning_rate_input: cur_learning_rate}) if (self.thread_index == 0) and (self.local_t % 100) == 0: print "TIMESTEP", self.local_t # return the number of local steps advanced diff_local_t = self.local_t - start_local_t return diff_local_t
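Every variant in this collection anneals the learning rate linearly to zero over max_global_time_step, as in _anneal_learning_rate above. A minimal sketch of that schedule (the standalone function and the example numbers are illustrative, not taken from the repositories):

def anneal_learning_rate(initial_lr, global_step, max_global_step):
    lr = initial_lr * (max_global_step - global_step) / max_global_step
    return max(lr, 0.0)  # clamp once the global step passes the maximum

# Example with initial_lr=7e-4 and max_global_step=1e8:
#   step 0     -> 7.0e-4
#   step 5e7   -> 3.5e-4
#   step >=1e8 -> 0.0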
class A3CTrainingThread(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step if USE_LSTM: self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device) else: self.local_network = GameACFFNetwork(ACTION_SIZE, device) self.local_network.prepare_loss(ENTROPY_BETA) # TODO: don't need accum trainer anymore with batch self.trainer = AccumTrainer(device) self.trainer.prepare_minimize( self.local_network.total_loss, self.local_network.get_vars() ) self.accum_gradients = self.trainer.accumulate_gradients() self.reset_gradients = self.trainer.reset_gradients() self.apply_gradients = grad_applier.apply_gradients( global_network.get_vars(), self.trainer.get_accum_grad_list() ) self.sync = self.local_network.sync_from(global_network) self.game_state = GameState(113 * thread_index) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 # variable controling log output self.prev_local_t = 0 def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values): values = [] sum = 0.0 for rate in pi_values: sum = sum + rate value = sum values.append(value) r = random.random() * sum for i in range(len(values)): if values[i] >= r: return i; #fail safe return len(values)-1 def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t): summary_str = sess.run(summary_op, feed_dict={ score_input: score }) summary_writer.add_summary(summary_str, global_t) def set_start_time(self, start_time): self.start_time = start_time def process(self, sess, global_t, summary_writer, summary_op, score_input): states = [] actions = [] rewards = [] values = [] terminal_end = False # reset accumulated gradients sess.run( self.reset_gradients ) # copy weights from shared to local sess.run( self.sync ) start_local_t = self.local_t if USE_LSTM: start_lstm_state = self.local_network.lstm_state_out # t_max times loop for i in range(LOCAL_T_MAX): pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t) action = self.choose_action(pi_) states.append(self.game_state.s_t) actions.append(action) values.append(value_) if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0): print("pi={}".format(pi_)) print(" V={}".format(value_)) # process game self.game_state.process(action) # receive game result reward = self.game_state.reward terminal = self.game_state.terminal self.episode_reward += reward # clip reward rewards.append( np.clip(reward, -1, 1) ) self.local_t += 1 # s_t1 -> s_t self.game_state.update() if terminal: terminal_end = True print("score={}".format(self.episode_reward)) self._record_score(sess, summary_writer, summary_op, score_input, self.episode_reward, global_t) self.episode_reward = 0 self.game_state.reset() if USE_LSTM: self.local_network.reset_state() break R = 0.0 if not terminal_end: R = self.local_network.run_value(sess, self.game_state.s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_si = [] batch_a = [] batch_td = [] batch_R = [] # compute and accmulate gradients for(ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + GAMMA * R td = R - Vi a = 
np.zeros([ACTION_SIZE]) a[ai] = 1 batch_si.append(si) batch_a.append(a) batch_td.append(td) batch_R.append(R) if USE_LSTM: batch_si.reverse() batch_a.reverse() batch_td.reverse() batch_R.reverse() sess.run( self.accum_gradients, feed_dict = { self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R, self.local_network.initial_lstm_state: start_lstm_state, self.local_network.step_size : [len(batch_a)] } ) else: sess.run( self.accum_gradients, feed_dict = { self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R} ) cur_learning_rate = self._anneal_learning_rate(global_t) sess.run( self.apply_gradients, feed_dict = { self.learning_rate_input: cur_learning_rate } ) if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL): self.prev_local_t += PERFORMANCE_LOG_INTERVAL elapsed_time = time.time() - self.start_time steps_per_sec = global_t / elapsed_time print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format( global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) # return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t
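In the LSTM branch above the batch lists are reversed a second time before the feed: they were reversed once to compute the returns backwards, so the second reverse restores time order, which is needed for the sequence to line up with the LSTM state saved at the start of the rollout (start_lstm_state). A sketch of that reordering as a helper, assuming a non-empty rollout; the helper itself does not exist in the original code:

def build_time_ordered_batch(states, actions, tds, returns, use_lstm):
    # Inputs arrive newest-first (reversed for the return computation).
    batch = list(zip(states, actions, tds, returns))
    if use_lstm:
        # Restore time order so the sequence matches start_lstm_state.
        batch.reverse()
    # For the feed-forward network the sample order does not matter, so the
    # newest-first order is left as-is, mirroring the code above.
    s, a, td, r = (list(column) for column in zip(*batch))
    return s, a, td, r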
class A3CTrainingThread(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step if NETWORK_TYPE == 'LSTM': self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device) elif NETWORK_TYPE == 'DILATED': self.local_network = GameACDilatedNetwork(ACTION_SIZE, device) elif NETWORK_TYPE == 'CONV': self.local_network = GameACFFNetwork(ACTION_SIZE, device) self.local_network.prepare_loss(ENTROPY_BETA) # TODO: don't need accum trainer anymore with batch self.trainer = AccumTrainer(device) self.trainer.prepare_minimize( self.local_network.total_loss, self.local_network.get_vars() ) self.accum_gradients = self.trainer.accumulate_gradients() self.reset_gradients = self.trainer.reset_gradients() self.apply_gradients = grad_applier.apply_gradients( global_network.get_vars(), self.trainer.get_accum_grad_list() ) self.sync = self.local_network.sync_from(global_network) self.game_state = GameState(113 * thread_index) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t): summary_str = sess.run(summary_op, feed_dict={ score_input: score }) summary_writer.add_summary(summary_str, global_t) def process(self, sess, global_t, summary_writer, summary_op, score_input): states = [] actions = [] rewards = [] values = [] terminal_end = False # reset accumulated gradients sess.run( self.reset_gradients ) # copy weights from shared to local sess.run( self.sync ) start_local_t = self.local_t if NETWORK_TYPE == 'LSTM': start_lstm_state = self.local_network.lstm_state_out # t_max times loop for i in range(LOCAL_T_MAX): pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t) action = choose_action(pi_) states.append(self.game_state.s_t) actions.append(action) values.append(value_) if (self.thread_index == 0) and (self.local_t % 100) == 0: print(('local_t = {:10} pi = ' + '{:7.5f} '*len(pi_) + ' V = {:8.4f} (thread {})').format(self.local_t, *pi_, value_, self.thread_index)) # process game self.game_state.process(action) # receive game result reward = self.game_state.reward terminal = self.game_state.terminal self.episode_reward += reward # clip reward rewards.append( np.clip(reward, -1, 1) ) self.local_t += 1 # s_t1 -> s_t self.game_state.update() if terminal: terminal_end = True print ("score=", self.episode_reward) self._record_score(sess, summary_writer, summary_op, score_input, self.episode_reward, global_t) self.episode_reward = 0 self.game_state.reset() if NETWORK_TYPE == 'LSTM': self.local_network.reset_state() break R = 0.0 if not terminal_end: R = self.local_network.run_value(sess, self.game_state.s_t) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_si = [] batch_a = [] batch_td = [] batch_R = [] # compute and accmulate gradients for(ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + GAMMA * R td = R - Vi a = np.zeros([ACTION_SIZE]) a[ai] = 1 batch_si.append(si) batch_a.append(a) batch_td.append(td) batch_R.append(R) if NETWORK_TYPE == 'LSTM': batch_si.reverse() 
batch_a.reverse() batch_td.reverse() batch_R.reverse() sess.run( self.accum_gradients, feed_dict = { self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R, self.local_network.initial_lstm_state: start_lstm_state, self.local_network.step_size : [len(batch_a)] } ) else: sess.run( self.accum_gradients, feed_dict = { self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R} ) cur_learning_rate = self._anneal_learning_rate(global_t) sess.run( self.apply_gradients, feed_dict = { self.learning_rate_input: cur_learning_rate } ) if (self.thread_index == 0) and (self.local_t % 100) == 0: print ("TIMESTEP", self.local_t) # return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t
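This variant calls a module-level choose_action(pi_) that is not shown in the excerpt. The other variants sample an action by accumulating the policy probabilities and drawing a uniform number against the running sum; a compact numpy sketch of the same sampling (illustrative only, not the missing function):

import random
import numpy as np

def choose_action(pi_values):
    cumulative = np.cumsum(pi_values)        # running sum of probabilities
    r = random.random() * cumulative[-1]     # uniform draw in [0, sum)
    index = int(np.searchsorted(cumulative, r))
    return min(index, len(pi_values) - 1)    # fail-safe clamp, as in the other variants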
class A3CTrainingThread(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, device, sess, name="agent"): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step #if USE_LSTM: # self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device) #else: self.local_network = Network(name=name) self.local_network.prepare_loss(FLAGS.entropy_beta) # TODO: don't need accum trainer anymore with batch self.trainer = AccumTrainer(device) self.local_network.vars = self.trainer.prepare_minimize( self.local_network.total_loss, self.local_network.get_train_vars()) self.accum_gradients = self.trainer.accumulate_gradients() self.reset_gradients = self.trainer.reset_gradients() self.apply_gradients = grad_applier.apply_gradients( global_network.get_train_vars(), self.trainer.get_accum_grad_list()) self.sync = self.local_network.sync_from(global_network) #if USE_ALE: # self.game_state = GameState(113 * thread_index) #else: self.game = gym.make('Lis-v2') self.game.configure(str(5000 + thread_index)) # game initialization # observation = env.reset() self.observation, reward, end_episode, _ = self.game.step(1) #self.observation = self.preprocess([self.observation]) self.history = [self.rgb2gray(self.observation) for _ in range(4)] #FLAGS.history_frames self.observation = np.dstack(self.history) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 # variable controling log output self.prev_local_t = 0 def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * ( self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values): values = [] sum = 0.0 for rate in pi_values: sum = sum + rate value = sum values.append(value) r = random.random() * sum for i in range(len(values)): if values[i] >= r: return i # fail safe return len(values) - 1 def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t): summary_str = sess.run(summary_op, feed_dict={score_input: score}) summary_writer.add_summary(summary_str, global_t) def set_start_time(self, start_time): self.start_time = start_time def rgb2gray(self, rgb, i=0): if FLAGS.save_frames: if self.thread_index == 0 and len( os.listdir(os.path.join(FLAGS.model_dir, "images"))) < 1000: scipy.misc.imsave( "%s/%i.png" % (os.path.join(FLAGS.model_dir, "images"), i), rgb["image"][0]) img = np.asarray(rgb["image"][0])[..., :3] img = np.dot(img, [0.299, 0.587, 0.114]) img = scipy.misc.imresize(img, (84, 84)) / 255.0 #flip H # #img = np.fliplr(img) return img #return -np.dot(img, [0.299, 0.587, 0.114]) / 255.0 + 1.0 def preprocess(self, frames, name=0): if len(frames) == 1: gray = self.rgb2gray(frames[0]) return np.dstack([gray, gray, gray, gray]) return np.dstack([self.rgb2gray(frame) for frame in frames]) def action2string(self, action): moveX, moveZ, turn = 0, 0, 0 """if action == 0: moveX = -10 elif action == 1: moveX = 10 elif action == 2: moveZ = -10 elif action == 3: moveZ = 10 elif action == 4: turn = 10 elif action == 5: turn = -10 elif action == 6: pass""" if action == 0: turn = -10 elif action == 1: turn = 10 elif action == 2: moveZ = 10 elif action == 3: pass return "%s %s %s" % (moveX, moveZ, turn) def get_frame(self, index): if index > len(self.history): return self.history[-1] else: return 
self.history[-index] def process(self, sess, global_t, summary_writer, summary_op, score_input): states = [] actions = [] rewards = [] values = [] terminal_end = False # reset accumulated gradients sess.run(self.reset_gradients) # copy weights from shared to local sess.run(self.sync) start_local_t = self.local_t #if USE_LSTM: # start_lstm_state = self.local_network.lstm_state_out # t_max times loop for i in range(FLAGS.local_t_max): #if USE_ALE: # pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t) #else: pi_, value_ = self.local_network.run_policy_and_value( sess, self.observation) #if self.thread_index == 0: #print(pi_) #cv2.namedWindow("img", cv2.WINDOW_NORMAL) #cv2.imshow("img", self.observation) #cv2.waitKey(1) """if self.thread_index == 0 and len(os.listdir(os.path.join(FLAGS.model_dir, "images"))) < 1000: ft = sess.run(self.local_network.col_hiddens[0][0], feed_dict={self.local_network.s: [self.observation]}) print(ft.shape) scipy.misc.imsave("%s/%i-obs.png" % (os.path.join(FLAGS.model_dir, "images"), global_t + i), self.observation[:, :, 3]) for m in range(8): img = ft[0, :, :, m] img = img - np.amin(img) img /= np.amax(img) img *= 255.0 scipy.misc.imsave("%s/%i-feature-%i.png" % (os.path.join(FLAGS.model_dir, "images"), global_t + i, m), img) """ action = self.choose_action(pi_) states.append(self.observation) actions.append(action) values.append(value_) if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0): print("pi={}".format(pi_)) print(" V={}".format(value_)) #if USE_ALE: #self.game_state.process(action) #reward = self.game_state.reward #end_episode = self.game_state.terminal #else: #for i in range(FLAGS.skip_frames): new_obs, reward, end_episode, _ = self.game.step( self.action2string(action)) if len(self.history) > 10: del self.history[0] self.history.append(self.rgb2gray( new_obs, global_t + self.local_t)) #, "%i-a%i" % (global_t, action) def create_history(): return np.dstack([ self.get_frame(1), self.get_frame(2), self.get_frame(3), self.get_frame(4) ]) new_observation = create_history() # process game #self.game_state.process(action) # receive game result #reward = self.game_state.reward terminal = end_episode #self.game_state.terminal self.episode_reward += reward # clip reward rewards.append(np.clip(reward, -1, 1)) self.local_t += 1 #if USE_ALE: # s_t1 -> s_t # self.game_state.update() #else: if terminal: terminal_end = True print("score={}".format(self.episode_reward)) self._record_score(sess, summary_writer, summary_op, score_input, self.episode_reward, global_t) self.episode_reward = 0 #if USE_ALE: self.game.reset() #else: #self.history = [self.rgb2gray(self.game.step(0))] #self.observation = create_history() #if USE_LSTM: # self.local_network.reset_state() break else: self.observation = new_observation R = 0.0 if not terminal_end: #if USE_ALE: # R = self.local_network.run_value(sess, self.game_state.s_t) #else: R = self.local_network.run_value(sess, self.observation) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_si = [] batch_a = [] batch_td = [] batch_R = [] # compute and accmulate gradients for (ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + FLAGS.gamma * R td = R - Vi a = np.zeros([FLAGS.action_size]) a[ai] = 1 batch_si.append(si) batch_a.append(a) batch_td.append(td) batch_R.append(R) sess.run(self.accum_gradients, feed_dict={ self.local_network.s: batch_si, self.local_network.a: batch_a, self.local_network.td: batch_td, self.local_network.r: batch_R }) 
cur_learning_rate = self._anneal_learning_rate(global_t) sess.run(self.apply_gradients, feed_dict={self.learning_rate_input: cur_learning_rate}) if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL): self.prev_local_t += PERFORMANCE_LOG_INTERVAL elapsed_time = time.time() - self.start_time steps_per_sec = global_t / elapsed_time print( "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour" .format(global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) # return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t
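The gym-based variant above converts each RGB observation to an 84x84 grayscale frame and stacks the four most recent frames, newest first, to form the network input. A condensed sketch of that preprocessing (helper names are mine; it assumes the same rgb["image"][0] layout and at least four frames of history):

import numpy as np
import scipy.misc

def to_gray_84(rgb_obs):
    img = np.asarray(rgb_obs["image"][0])[..., :3]
    img = np.dot(img, [0.299, 0.587, 0.114])            # luminance
    return scipy.misc.imresize(img, (84, 84)) / 255.0   # resize and normalise

def stack_history(history):
    # history: list of 84x84 grayscale frames, oldest first
    # newest frame first, matching create_history() above; shape (84, 84, 4)
    return np.dstack(list(reversed(history[-4:])))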
class A3CTrainingthread(object): def __init__(self, sess, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, max_global_time_step, num_trainable_vars): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.max_global_time_step = max_global_time_step if LSTM: initializer = tf.random_uniform_initializer(-0.1, 0.1) with tf.variable_scope("model"+str(thread_index), reuse=None, initializer=initializer): self.local_network = AC3LSTM(num_actions, num_states, num_trainable_vars) else: self.local_network = AC3FF(num_actions, num_states, num_trainable_vars) self.local_network.prepare_loss(entropy_beta) self.trainer = AccumTrainer() self.trainer.prepare_minimize(self.local_network.total_loss, self.local_network.trainable_vars) self.accum_gradients = self.trainer.accumulate_gradients() self.reset_gradients = self.trainer.reset_gradients() self.apply_gradients = grad_applier.apply_gradients( global_network.trainable_vars, self.trainer.get_accum_grad_list() ) self.sync = self.local_network.sync_from(global_network) self.game_state = ChainMDP() self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0: learning_rate = 0 return learning_rate def choose_action(self, pi_values): values = [] sum = 0 for rate in pi_values: sum = sum + rate value = sum values.append(value) r = random.random() * sum for i in range(len(values)): if values[i] >= r: return i; #fail safe return len(values)-1 # Run for one episode def thread(self, sess, global_t): states = [] actions = [] rewards = [] values = [] terminal_end = False if LSTM: self.local_network.reset_state() # reset accumulated gradients sess.run(self.reset_gradients) # copy weights from shared to local sess.run(self.sync) start_local_t = self.local_t mdp = ChainMDP() state = mdp.states[np.random.randint(0, mdp.num_states-1)] discounted_reward = 0 for i in range(local_t_max): if LSTM: action_probs = self.local_network.run_policy(sess, state, update_rnn_state=True) else: action_probs = self.local_network.run_policy(sess, state) action = self.choose_action(action_probs) states.append(state) actions.append(action) if LSTM: # # Do not update the state again value_ = self.local_network.run_value(sess, state, update_rnn_state=False) else: value_ = self.local_network.run_value(sess, state) values.append(value_) reward, next_state, terminal = mdp.act(state, action) self.episode_reward += reward rewards.append(reward) self.local_t += 1 state = next_state if terminal: terminal_end = True discounted_reward = (discount_rate**i)*self.episode_reward self.episode_reward = 0 state = mdp.states[np.random.randint(0, mdp.num_states-1)] if LSTM: self.local_network.reset_state() break R = 0.0 if not terminal_end: if LSTM: # Do not update the state again R = self.local_network.run_value(sess, state, update_rnn_state=False) else: R = self.local_network.run_value(sess, state) # Order from the final time point to the first ### why? 
actions.reverse() states.reverse() rewards.reverse() values.reverse() # compute and accumulate gradients for (action, r, state, V) in zip(actions, rewards, states, values): R = r[0][0] + discount_rate * R td = R - V # temporal difference a = np.zeros([num_actions]) a[action] = 1 #a = np.reshape(a,[1,num_actions]) ### Should be done when the variable is created - or change something on the other end sess.run(self.accum_gradients, feed_dict = { #self.local_network.state: [state], self.local_network.state: np.reshape([float(i) for i in state],[1,mdp.num_states]), ### use np.array( ,dtype=...) instead self.local_network.a: [a], self.local_network.td: [td], self.local_network.r: [R]}) cur_learning_rate = self._anneal_learning_rate(global_t) sess.run(self.apply_gradients, feed_dict = {self.learning_rate_input: cur_learning_rate}) # local step diff_local_t = self.local_t - start_local_t return diff_local_t, discounted_reward
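The inline "### use np.array( ,dtype=...) instead" note above suggests replacing the list comprehension plus np.reshape when building the feed for the state placeholder. A minimal sketch of that suggestion (the function name and dtype are assumptions):

import numpy as np

def state_to_batch(state, num_states):
    # One chain-MDP state -> a (1, num_states) float batch for the feed_dict.
    return np.asarray(state, dtype=np.float32).reshape(1, num_states)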