# -*- coding: utf-8 -*-
import csv
import random
import sys
import time

import numpy as np

# NOTE: the network class (ActorCriticFFNetwork), the AccumTrainer helper, the
# Environment wrapper and the constants ACTION_SIZE, ENTROPY_BETA, GAMMA,
# LOCAL_T_MAX and VERBOSE are assumed to be imported from the project's own
# modules; their exact import paths are not reproduced here.


class A3CTrainingThread(object):

  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device,
               network_scope="network",
               scene_scope="scene",
               task_scope="task"):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.network_scope = network_scope
    self.scene_scope = scene_scope
    self.task_scope = task_scope
    self.scopes = [network_scope, scene_scope, task_scope]

    self.local_network = ActorCriticFFNetwork(
        action_size=ACTION_SIZE,
        device=device,
        network_scope=network_scope,
        scene_scopes=[scene_scope])
    self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(self.local_network.total_loss,
                                  self.local_network.get_vars())
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    accum_grad_names = [
        self._local_var_name(x) for x in self.trainer.get_accum_grad_list()
    ]
    global_net_vars = [
        x for x in global_network.get_vars()
        if self._get_accum_grad_name(x) in accum_grad_names
    ]

    self.apply_gradients = grad_applier.apply_gradients(
        global_net_vars, self.trainer.get_accum_grad_list())

    self.sync = self.local_network.sync_from(global_network)

    self.env = None
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    self.episode_length = 0
    self.episode_max_q = -np.inf

  def _local_var_name(self, var):
    return '/'.join(var.name.split('/')[1:])

  def _get_accum_grad_name(self, var):
    return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'

  def _anneal_learning_rate(self, global_time_step):
    time_step_to_go = max(self.max_global_time_step - global_time_step, 0.0)
    learning_rate = self.initial_learning_rate * time_step_to_go / self.max_global_time_step
    return learning_rate

  def choose_action(self, pi_values):
    # sample an action index proportionally to the policy probabilities
    values = []
    cumulative_sum = 0.0
    for rate in pi_values:
      cumulative_sum += rate
      values.append(cumulative_sum)

    r = random.random() * cumulative_sum
    for i in range(len(values)):
      if values[i] >= r:
        return i
    # fail safe
    return len(values) - 1

  def _record_score(self, sess, writer, summary_op, placeholders, values,
                    global_t):
    feed_dict = {}
    for k in placeholders:
      feed_dict[placeholders[k]] = values[k]
    summary_str = sess.run(summary_op, feed_dict=feed_dict)
    if VERBOSE:
      sys.stdout.write('writing to summary writer at time %d\n' % (global_t))
    writer.add_summary(summary_str, global_t)
    # writer.flush()

  def process(self, sess, global_t, summary_writer, summary_op,
              summary_placeholders):
    if self.env is None:
      # lazy evaluation: stagger environment start-up across threads
      time.sleep(self.thread_index * 1.0)
      self.env = Environment({
          'scene_name': self.scene_scope,
          'terminal_state_id': int(self.task_scope)
      })

    states = []
    actions = []
    rewards = []
    values = []
    targets = []
    rnn_inits = []
    state_representation = []
    usf = []
    reward_vector = []

    terminal_end = False

    # reset accumulated gradients
    sess.run(self.reset_gradients)

    # copy weights from shared to local
    sess.run(self.sync)

    start_local_t = self.local_t
    # remember the LSTM state at the start of this unroll
    # (it is reset to zero at every episode boundary)
    start_lstm_state = self.local_network.lstm_state_out

    # t_max times loop
    for i in range(LOCAL_T_MAX):
      pi_, value_, usf_s_g = self.local_network.run_policy_and_value(
          sess, self.env.s_t, self.env.target, self.scopes)
      imidia_s = self.local_network.run_state(sess, self.env.s_t, self.scopes)
      # usf_s_g = self.local_network.run_usf(sess, self.env.s_t, self.env.target,
      #                                      self.rnn_state_init[0],
      #                                      self.rnn_state_init[1], self.scopes)
      action = self.choose_action(pi_)

      states.append(self.env.s_t)
      actions.append(action)
      values.append(value_)
      targets.append(self.env.target)
      usf.append(usf_s_g)
      state_representation.append(imidia_s)

      if VERBOSE and (self.thread_index == 0) and (self.local_t % 1000) == 0:
        sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

      # process game
      self.env.step(action)

      # receive game result
      reward = self.env.reward
      terminal = self.env.terminal

      # ad-hoc reward for navigation
      reward = 10.0 if terminal else -0.01
      if self.episode_length > 5e3:
        terminal = True

      self.episode_reward += reward
      self.episode_length += 1
      self.episode_max_q = max(self.episode_max_q, np.max(value_))

      # clip reward
      rewards.append(np.clip(reward, -1, 1))

      self.local_t += 1

      # s_t1 -> s_t
      self.env.update()

      if i == (LOCAL_T_MAX - 1) or terminal:
        imidiate_state_representation_next = []
        usf_next = []
        # reward_vector_predictor_next = []
        last_state = self.env.s_t
        imidia_s_next = self.local_network.run_state(sess, self.env.s_t,
                                                     self.scopes)
        state_representation_next = state_representation[1:] + [imidia_s_next]
        if terminal:
          usf_next_imi = 0
        else:
          usf_next_imi = self.local_network.run_usf(sess, self.env.s_t,
                                                    self.env.target,
                                                    self.scopes)
        usf_next = usf[1:] + [usf_next_imi]

      if terminal:
        terminal_end = True
        sys.stdout.write(
            "time %d | thread #%d | scene %s | target #%s\n"
            "%s %s episode reward = %.3f\n"
            "%s %s episode length = %d\n"
            "%s %s episode max Q = %.3f\n" %
            (global_t, self.thread_index, self.scene_scope, self.task_scope,
             self.scene_scope, self.task_scope, self.episode_reward,
             self.scene_scope, self.task_scope, self.episode_length,
             self.scene_scope, self.task_scope, self.episode_max_q))

        oneResult = [
            global_t, self.thread_index, self.scene_scope, self.task_scope,
            self.episode_reward, self.episode_length, self.episode_max_q
        ]
        with open('trainingOutput.csv', 'a+') as fp:
          wr = csv.writer(fp)
          wr.writerow(oneResult)

        summary_values = {
            "episode_reward_input": self.episode_reward,
            "episode_length_input": float(self.episode_length),
            "episode_max_q_input": self.episode_max_q,
            "learning_rate_input": self._anneal_learning_rate(global_t)
        }
        self._record_score(sess, summary_writer, summary_op,
                           summary_placeholders, summary_values, global_t)

        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf
        self.local_network.reset_state()
        self.env.reset()
        break

    R = 0.0
    usf_R = 0.0
    if not terminal_end:
      # bootstrap from the value / USF estimates of the state after the unroll
      R = self.local_network.run_value(sess, self.env.s_t, self.env.target,
                                       self.scopes)
      usf_R = self.local_network.run_usf(sess, self.env.s_t, self.env.target,
                                         self.scopes)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()
    state_representation.reverse()
    state_representation_next.reverse()
    usf_next.reverse()

    batch_si = []
    batch_a = []
    batch_td = []
    batch_R = []
    batch_usf_R = []
    batch_t = []

    # compute and accumulate gradients
    for (ai, ri, si, Vi, ti, state, usf_n) in zip(actions, rewards, states,
                                                  values, targets,
                                                  state_representation_next,
                                                  usf_next):
      R = ri + GAMMA * R
      usf_R = state + GAMMA * usf_R
      # usf_R = state + GAMMA * usf_n
      td = R - Vi
      a = np.zeros([ACTION_SIZE])
      a[ai] = 1

      batch_si.append(si)
      batch_a.append(a)
      batch_td.append(td)
      batch_R.append(R)
      batch_usf_R.append(usf_R)
      batch_t.append(ti)

    # We need to reverse these back into time order, since training unrolls
    # the network for LOCAL_T_MAX steps, unlike inference.
    batch_si.reverse()
    batch_a.reverse()
    batch_td.reverse()
    batch_R.reverse()
    batch_usf_R.reverse()
    batch_t.reverse()

    sess.run(
        self.accum_gradients,
        feed_dict={
            self.local_network.s: batch_si,
            self.local_network.a: batch_a,
            self.local_network.t: batch_t,
            self.local_network.td: batch_td,
            self.local_network.r: batch_R,
            self.local_network.return_usf: batch_usf_R,
            self.local_network.initial_lstm_state: start_lstm_state,
            self.local_network.step_size: [len(batch_a)],
        })

    cur_learning_rate = self._anneal_learning_rate(global_t)

    sess.run(self.apply_gradients,
             feed_dict={self.learning_rate_input: cur_learning_rate})

    if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
      sys.stdout.write("Local timestep %d\n" % self.local_t)

    # return advanced local step size
    diff_local_t = self.local_t - start_local_t
    return diff_local_t
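# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the training code): the backward n-step
# return computation that the gradient-accumulation loop above performs.
# The function name `n_step_returns` and the rollout data in the example are
# hypothetical; `bootstrap_value` and `bootstrap_usf` stand in for the
# quantities produced by run_value() / run_usf(). Only the arithmetic is
# shown, not the TensorFlow plumbing.
# ---------------------------------------------------------------------------
import numpy as np


def n_step_returns(rewards, values, features, bootstrap_value, bootstrap_usf,
                   gamma=0.99):
  """Walk the rollout backwards, bootstrapping from the post-rollout state."""
  returns, advantages, usf_returns = [], [], []
  R = bootstrap_value    # value estimate of the state after the rollout
  usf_R = bootstrap_usf  # successor-feature estimate of that state
  for r, v, phi in zip(reversed(rewards), reversed(values), reversed(features)):
    R = r + gamma * R              # discounted n-step return
    usf_R = phi + gamma * usf_R    # discounted sum of state features (USF target)
    returns.append(R)
    advantages.append(R - v)       # td = R - V(s) weighting the policy gradient
    usf_returns.append(usf_R)
  # reverse back into time order, matching the batch fed to the network
  return returns[::-1], advantages[::-1], usf_returns[::-1]

# Example with hypothetical numbers (three steps, 4-dimensional features):
#   R, A, U = n_step_returns([-0.01, -0.01, 10.0],
#                            [0.2, 0.3, 0.5],
#                            [np.ones(4), np.ones(4) * 2, np.ones(4) * 3],
#                            bootstrap_value=0.0,
#                            bootstrap_usf=np.zeros(4))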
# ---------------------------------------------------------------------------
# Variant of A3CTrainingThread adapted to OpenAI Gym's Pong-v0, with an
# optional LSTM network selected by the USE_LSTM flag.
# ---------------------------------------------------------------------------
import random
import sys
import time

import gym
import numpy as np

# As above, the network classes (ActorCriticLSTMNetwork / ActorCriticFFNetwork),
# AccumTrainer and the constants ACTION_SIZE, ENTROPY_BETA, GAMMA, LOCAL_T_MAX,
# USE_LSTM and VERBOSE are assumed to come from the project's own modules.


class A3CTrainingThread(object):

  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device,
               network_scope="network",
               scene_scope="scene",
               task_scope="task"):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.network_scope = network_scope
    self.scene_scope = scene_scope
    self.task_scope = task_scope
    self.scopes = [network_scope, scene_scope, task_scope]

    if USE_LSTM:
      self.local_network = ActorCriticLSTMNetwork(
          action_size=ACTION_SIZE,
          device=device,
          network_scope=network_scope,
          scene_scopes=[scene_scope])
    else:
      self.local_network = ActorCriticFFNetwork(
          action_size=ACTION_SIZE,
          device=device,
          network_scope=network_scope,
          scene_scopes=[scene_scope])

    self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(self.local_network.total_loss,
                                  self.local_network.get_vars())
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    accum_grad_names = [
        self._local_var_name(x) for x in self.trainer.get_accum_grad_list()
    ]
    global_net_vars = [
        x for x in global_network.get_vars()
        if self._get_accum_grad_name(x) in accum_grad_names
    ]

    self.apply_gradients = grad_applier.apply_gradients(
        global_net_vars, self.trainer.get_accum_grad_list())

    self.sync = self.local_network.sync_from(global_network)

    self.env = None
    self.obs = None
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    self.episode_length = 0
    self.episode_max_q = -np.inf
    self.entropy = np.zeros(20)

  def _local_var_name(self, var):
    return '/'.join(var.name.split('/')[1:])

  def _get_accum_grad_name(self, var):
    return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'

  def _anneal_learning_rate(self, global_time_step):
    time_step_to_go = max(self.max_global_time_step - global_time_step, 0.0)
    learning_rate = self.initial_learning_rate * time_step_to_go / self.max_global_time_step
    return learning_rate

  def choose_action(self, pi_values):
    # sample an action index proportionally to the policy probabilities
    values = []
    cumulative_sum = 0.0
    for rate in pi_values:
      cumulative_sum += rate
      values.append(cumulative_sum)

    r = random.random() * cumulative_sum
    for i in range(len(values)):
      if values[i] >= r:
        return i
    # fail safe
    return len(values) - 1

  def _record_score(self, sess, writer, summary_op, placeholders, values,
                    global_t):
    feed_dict = {}
    for k in placeholders:
      feed_dict[placeholders[k]] = values[k]
    summary_str = sess.run(summary_op, feed_dict=feed_dict)
    if VERBOSE:
      sys.stdout.write('writing to summary writer at time %d\n' % (global_t))
    writer.add_summary(summary_str, global_t)
    # writer.flush()

  def process(self, sess, global_t, summary_writer, summary_op,
              summary_placeholders):
    if self.env is None:
      # lazy evaluation: stagger environment start-up across threads
      time.sleep(self.thread_index * 1.0)
      self.env = gym.make('Pong-v0')
      self.obs = self.env.reset()

    states = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    # reset accumulated gradients
    sess.run(self.reset_gradients)

    # copy weights from shared to local
    sess.run(self.sync)

    start_local_t = self.local_t

    if USE_LSTM:
      start_lstm_state = self.local_network.lstm_state_out

    # t_max times loop
    for i in range(LOCAL_T_MAX):
      pi_, value_ = self.local_network.run_policy_and_value(
          sess, self.obs, self.scopes)
      action = self.choose_action(pi_)

      states.append(self.obs)
      actions.append(action)
      values.append(value_)

      if VERBOSE and (self.thread_index == 0) and (self.local_t % 1000) == 0:
        sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

      # process game
      self.obs, reward, terminal, info = self.env.step(action)

      # ad-hoc reward for navigation
      # reward = 10.0 if terminal else -0.01
      # if self.episode_length > 5e3: terminal = True

      self.episode_reward += reward
      self.episode_length += 1
      self.episode_max_q = max(self.episode_max_q, np.max(value_))

      # clip reward
      rewards.append(np.clip(reward, -1, 1))

      self.local_t += 1

      # s_t1 -> s_t
      # self.env.update()

      if terminal:
        terminal_end = True
        sys.stdout.write(
            "time %d | thread #%d | scene %s \n"
            "%s %s episode reward = %.3f\n"
            "%s %s episode length = %d\n"
            "%s %s episode max Q = %.3f\n" %
            (global_t, self.thread_index, self.scene_scope,
             self.scene_scope, self.task_scope, self.episode_reward,
             self.scene_scope, self.task_scope, self.episode_length,
             self.scene_scope, self.task_scope, self.episode_max_q))

        summary_values = {
            "episode_reward_input": self.episode_reward,
            "episode_length_input": float(self.episode_length),
            "episode_max_q_input": self.episode_max_q,
            "learning_rate_input": self._anneal_learning_rate(global_t),
            # self.entropy is an np.array of shape [1, 20] whose elements all
            # hold the same value, for reasons that are not yet understood
            "episode_entropy": self.entropy[0]
        }
        self._record_score(sess, summary_writer, summary_op,
                           summary_placeholders, summary_values, global_t)

        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf
        self.obs = self.env.reset()
        if USE_LSTM:
          self.local_network.reset_state()
        break

    R = 0.0
    if not terminal_end:
      # bootstrap from the value estimate of the state after the unroll
      R = self.local_network.run_value(sess, self.obs, self.scopes)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_si = []
    batch_a = []
    batch_td = []
    batch_R = []

    # compute and accumulate gradients
    for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
      R = ri + GAMMA * R
      td = R - Vi
      a = np.zeros([ACTION_SIZE])
      a[ai] = 1

      batch_si.append(si)
      batch_a.append(a)
      batch_td.append(td)
      batch_R.append(R)

    cur_learning_rate = self._anneal_learning_rate(global_t)

    if USE_LSTM:
      # restore time order before feeding the LSTM unroll
      batch_si.reverse()
      batch_a.reverse()
      batch_td.reverse()
      batch_R.reverse()

      _, self.entropy = sess.run(
          [self.apply_gradients, self.local_network.entropy],
          feed_dict={
              self.local_network.s: batch_si,
              self.local_network.a: batch_a,
              self.local_network.td: batch_td,
              self.local_network.r: batch_R,
              self.local_network.initial_lstm_state: start_lstm_state,
              self.local_network.step_size: [len(batch_a)],
              self.learning_rate_input: cur_learning_rate
          })

      # _, self.entropy = sess.run(
      #     [self.accum_gradients, self.local_network.entropy],
      #     feed_dict={
      #         self.local_network.s: batch_si,
      #         self.local_network.a: batch_a,
      #         self.local_network.td: batch_td,
      #         self.local_network.r: batch_R,
      #         self.local_network.step_size: [len(batch_a)]
      #     })
    else:
      _, self.entropy = sess.run(
          [self.accum_gradients, self.local_network.entropy],
          feed_dict={
              self.local_network.s: batch_si,
              self.local_network.a: batch_a,
              self.local_network.td: batch_td,
              self.local_network.r: batch_R
          })

      # gradients are applied here for the feed-forward path; the LSTM path
      # applied them directly in the sess.run() above
      sess.run(self.apply_gradients,
               feed_dict={self.learning_rate_input: cur_learning_rate})

    if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
      sys.stdout.write("Local timestep %d\n" % self.local_t)

    # return advanced local step size
    diff_local_t = self.local_t - start_local_t
    return diff_local_t