def _evaluate(self, sess, list_of_tasks, num_episodes, max_steps, success_cutoff):
    scene_scopes = list_of_tasks.keys()
    results = {}
    for scene_scope in scene_scopes:
        for task_scope in list_of_tasks[scene_scope]:
            env = Environment({
                'scene_name': scene_scope,
                'terminal_state_id': int(task_scope)
            })
            ep_lengths = []
            ep_collisions = []
            oracle_lengths = []
            ep_successes = []
            scopes = [self.network_scope, scene_scope, task_scope]
            for i_episode in range(num_episodes):
                env.reset()
                oracle_lengths.append(env.shortest_path_distances[
                    env.current_state_id][env.terminal_state_id])
                terminal = False
                ep_length = 0
                ep_collision = 0
                while not terminal:
                    pi_values = self.local_network.run_policy(
                        sess, env.s_t, env.target, scopes)
                    action = sample_action(pi_values)
                    env.step(action)
                    env.update()
                    terminal = env.terminal
                    if ep_length == max_steps:
                        break
                    if env.collided:
                        ep_collision += 1
                    ep_length += 1
                ep_lengths.append(ep_length)
                ep_collisions.append(ep_collision)
                ep_successes.append(int(ep_length < success_cutoff))
            results[scene_scope + task_scope] = [
                np.mean(ep_lengths),
                np.mean(ep_collisions),
                np.mean(oracle_lengths),
                np.mean(ep_successes)
            ]
    return results
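# Sketch only: _evaluate above relies on sample_action (imported from utils.ops
# elsewhere in these scripts). Assuming it simply draws an action index from the
# policy's probability vector, a minimal NumPy version could look like this.
import numpy as np

def sample_action(pi_values):
    # normalize in case the policy output does not sum exactly to 1
    pi = np.asarray(pi_values, dtype=np.float64)
    pi = pi / pi.sum()
    return int(np.random.choice(len(pi), p=pi))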
def test_scene(dump_file, test_cnt):
    config = Configuration()
    scene = os.path.basename(dump_file).split('.')[0]
    if scene not in TASK_LIST:
        env = THORDiscreteEnvironment({
            'h5_file_path': dump_file,
        })
        task_list = np.random.choice(list(range(env.n_locations)), test_cnt)
    else:
        task_list = TASK_LIST[scene]
    logging.info(
        "testing scene %(scene)s task_list=%(task_list)s from dump file %(dump_file)s"
        % locals())
    for t in task_list:
        target = int(t)
        env = THORDiscreteEnvironment({
            'h5_file_path': dump_file,
            'terminal_state_id': target,
        })
        start = time.time()
        expert = Expert(env)
        logging.debug("building policy takes %f s" % (time.time() - start))
        assert expert.verify_distance_matrix()
        for _ in range(test_cnt):
            env.reset()
            logging.debug("scene=%s target=%d source=%d" %
                          (scene, target, env.current_state_id))
            steps = []
            orig_state = env.current_state_id
            while not env.terminal:
                a = expert.get_next_action()
                logging.debug("state=%d action=%d" % (env.current_state_id, a))
                env.step(a)
                steps.append((env.current_state_id, a))
                assert len(steps) < config.max_steps_per_e, \
                    "current steps is beyond max_steps_per_e"
            logging.debug(
                str(orig_state) +
                ''.join([expert.get_a_str(a) + str(s) for (s, a) in steps]))
            assert len(steps) == env.shortest_path_distances[orig_state][target]
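# Sketch only: a hypothetical driver that runs test_scene over every scene dump
# in a data directory; the directory layout and file pattern are assumptions.
import glob

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    for dump_file in sorted(glob.glob('data/*.h5')):
        test_scene(dump_file, test_cnt=5)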
pi_values_list = []
while not terminal:
    pi_values, value_0 = global_network.run_policy_and_value(
        sess, env.s_t, env.target, scopes)
    pi_values_list.append(pi_values)
    action = sample_action(pi_values)
    action_list.append(action)
    value_list.append(value_0)
    show_target.append(env.observation)
    cur_id.append(env.current_state_id)
    # current_id_list.append()
    x_draw.append(env.x)
    z_draw.append(env.z)
    r_draw.append(env.r)
    ep_action.append(action)
    env.step(action)
    env.update()
    max_value.append(value_0)
    viewer.imshow(env.observation, str(value_0))
    terminal = env.terminal
    if ep_t == 1000:
        break
    if env.collided:
        ep_collision += 1
    ep_reward += env.reward
    ep_t += 1

if 1:  # always executed: record one extra policy/value sample after the loop ends
    pi_values, value_0 = global_network.run_policy_and_value(
        sess, env.s_t, env.target, scopes)
    pi_values_list.append(pi_values)
    action = sample_action(pi_values)
    action_list.append(action)
for i_episode in range(NUM_EVAL_EPISODES):
    env.reset()
    terminal = False
    ep_reward = 0
    ep_collision = 0
    ep_t = 0
    while not terminal:
        usf_s_g = global_network.run_usf(sess, env.s_t, env.target, scopes)
        pi_values = global_network.run_policy(
            sess, env.s_t, env.target, usf_s_g, scopes)
        action = sample_action(pi_values)
        env.step(action)
        env.update()
        terminal = env.terminal
        if ep_t == 500:
            break
        if env.collided:
            ep_collision += 1
        ep_reward += env.reward
        ep_t += 1
    ep_lengths.append(ep_t)
    ep_rewards.append(ep_reward)
    ep_collisions.append(ep_collision)
    if VERBOSE:
        print("episode #{} ends after {} steps".format(i_episode, ep_t))
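# Sketch only: after the evaluation loop above, the per-episode lists can be
# reduced to summary statistics; the print format here is an assumption.
print("evaluation: mean length = {:.1f}, mean reward = {:.2f}, mean collisions = {:.1f}".format(
    np.mean(ep_lengths), np.mean(ep_rewards), np.mean(ep_collisions)))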
class A3CTrainingThread(object):
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device,
                 network_scope="network",
                 scene_scope="scene",
                 training_scene="scene",
                 task_scope="task",
                 checkpoint_scope="checkpoint"):
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step
        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.training_scene = training_scene
        self.task_scope = task_scope
        self.checkpoint_scope = checkpoint_scope
        self.scopes = [network_scope, scene_scope, task_scope]

        self.local_network = ActorCriticFFNetwork(
            action_size=ACTION_SIZE,
            device=device,
            network_scope=network_scope,
            scene_scopes=[scene_scope])
        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())
        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        accum_grad_names = [
            self._local_var_name(x)
            for x in self.trainer.get_accum_grad_list()
        ]
        global_net_vars = [
            x for x in global_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]

        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())
        self.sync = self.local_network.sync_from(global_network)

        self.env = None
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf

    def _local_var_name(self, var):
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'

    def _anneal_learning_rate(self, global_time_step):
        time_step_to_go = max(self.max_global_time_step - global_time_step, 0.0)
        learning_rate = self.initial_learning_rate * time_step_to_go / self.max_global_time_step
        return learning_rate

    def choose_action(self, pi_values):
        values = []
        sum = 0.0
        for rate in pi_values:
            sum = sum + rate
            value = sum
            values.append(value)
        r = random.random() * sum
        for i in range(len(values)):
            if values[i] >= r:
                return i
        # fail safe
        return len(values) - 1

    def _record_score(self, sess, writer, summary_op, placeholders, values,
                      global_t):
        feed_dict = {}
        for k in placeholders:
            feed_dict[placeholders[k]] = values[k]
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        if VERBOSE:
            sys.stdout.write('writing to summary writer at time %d\n' % (global_t))
        writer.add_summary(summary_str, global_t)
        # writer.flush()

    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):
        if self.env is None:
            # lazy evaluation
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.training_scene,
                'terminal_state_id': self.task_scope,
                'checkpoint_state_id': self.checkpoint_scope
            })
            self.env.reset()

        states = []
        actions = []
        rewards = []
        values = []
        targets = []
        checkpoints = []
        positions = []
        auxilaries = []
        auxilaries_cl = []
        aclists = []
        colists = []
        isCheckpointed = []
        collision = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)
        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.env.s_t, self.env.s_position, self.env.checkpoint,
                self.env.s_a_t, self.env.s_c_t, self.env.isCheckpoint,
                self.env.s_aux_cl, self.scopes)
            action = self.choose_action(pi_)

            states.append(self.env.s_t)
            actions.append(action)
            values.append(value_)
            targets.append(self.env.target)
            checkpoints.append(self.env.checkpoint)
            positions.append(self.env.s_position)
            aclists.append(self.env.s_a_t)
            colists.append(self.env.s_c_t)
            collision.append(self.env.s_aux_cl)
            isCheckpointed.append(int(self.env.isCheckpoint))

            if VERBOSE and (self.thread_index == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            # process game
            self.env.step(action)

            # receive game result
            reward = self.env.reward
            terminal = self.env.terminal
            # if self.env.isCheckpoint:
            #     sys.stdout.write("CHECKPOINT \n")

            if self.episode_length > 5e3:
                terminal = True

            self.episode_reward += reward
            self.episode_length += 1
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.env.update()
            auxilaries.append(self.env.s_aux)
            auxilaries_cl.append(self.env.s_aux_cl)

            if terminal:
                terminal_end = True
                sys.stdout.write(
                    "time %d | thread #%d | scene %s | target #%s\n"
                    "%s %s episode reward = %.3f\n"
                    "%s %s episode length = %d\n"
                    "%s %s episode max Q = %.3f\n" %
                    (global_t, self.thread_index, self.scene_scope,
                     self.task_scope, self.scene_scope, self.task_scope,
                     self.episode_reward, self.scene_scope, self.task_scope,
                     self.episode_length, self.scene_scope, self.task_scope,
                     self.episode_max_q))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }
                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)

                self.episode_reward = 0
                self.episode_length = 0
                self.episode_max_q = -np.inf
                self.env.reset()
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.env.s_t,
                                             self.env.s_position,
                                             self.env.checkpoint,
                                             self.env.s_a_t, self.env.s_c_t,
                                             self.env.isCheckpoint,
                                             self.env.s_aux_cl, self.scopes)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()
        positions.reverse()
        auxilaries.reverse()
        auxilaries_cl.reverse()
        aclists.reverse()
        colists.reverse()
        isCheckpointed.reverse()
        collision.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []
        batch_t = []
        batch_c = []
        batch_p = []
        batch_aux = []
        batch_aux_cl = []
        batch_al = []
        batch_cl = []
        batch_ic = []
        batch_collision = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi, ti, ci, pi, auxi, aux_cl_i, ali, cli, ici,
             coli) in zip(actions, rewards, states, values, targets,
                          checkpoints, positions, auxilaries, auxilaries_cl,
                          aclists, colists, isCheckpointed, collision):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
            batch_t.append(ti)
            batch_c.append(ci)
            batch_p.append(pi)
            batch_aux.append(auxi)
            batch_aux_cl.append(aux_cl_i)
            batch_al.append(ali)
            batch_cl.append(cli)
            batch_ic.append(ici)
            batch_collision.append(coli)

        sess.run(self.accum_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.t: batch_p,
                     self.local_network.c: batch_c,
                     self.local_network.td: batch_td,
                     self.local_network.aux: batch_aux,
                     self.local_network.aux_cl: batch_aux_cl,
                     self.local_network.al: batch_al,
                     self.local_network.cl: batch_cl,
                     self.local_network.ic: batch_ic,
                     self.local_network.col: batch_collision,
                     self.local_network.r: batch_R
                 })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("Local timestep %d\n" % self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t, terminal
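# Sketch only: a hypothetical worker loop showing how a thread's process() is
# typically driven; the names global_t, MAX_TIME_STEP, training_threads, sess,
# and the summary objects are assumptions, not part of this class.
def train_function(thread_index):
    global global_t
    training_thread = training_threads[thread_index]
    while global_t < MAX_TIME_STEP:
        diff_global_t, _ = training_thread.process(
            sess, global_t, summary_writer, summary_op, summary_placeholders)
        global_t += diff_global_t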
class A3CTrainingThread(object):
    def __init__(self,
                 thread_index,
                 global_network,
                 global_discriminator,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 grad_applier_discriminator,
                 max_global_time_step,
                 device,
                 device2,
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task"):
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step
        self.network_scope = network_scope
        self.network_scope_D = network_scope + "_d"
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]
        self.scopes_d = [self.network_scope_D, task_scope]

        self.local_network = ActorCriticFFNetwork(
            action_size=ACTION_SIZE,
            device=device,
            network_scope=network_scope,
            scene_scopes=[scene_scope])
        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

        self.trainer = AccumTrainer(device)
        # gradients of the loss w.r.t. the local network variables
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        # PPO loss: we need to keep both the current and the old (pre-update) parameters
        new_variable_list = self.local_network.get_vars()
        old_variable_list = self.local_network.get_vars_old()
        # at the beginning of each iteration the old network is synced with the current one
        self.old_new_sync = self.local_network.sync_curre_old()

        self.accum_gradients = self.trainer.accumulate_gradients()  # assign gradients
        self.reset_gradients = self.trainer.reset_gradients()       # reset after applying

        # names of all accumulated-gradient variables
        accum_grad_names = [
            self._local_var_name(x)
            for x in self.trainer.get_accum_grad_list()
        ]
        # global / local network variables that have a matching accumulated gradient
        global_net_vars = [
            x for x in global_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]
        local_net_vars = [
            x for x in self.local_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]

        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())
        # if training is unstable it can help to apply (and clip) the gradients on
        # the local network first and only then push them to the global network
        self.apply_gradients_local = grad_applier.apply_gradients_local_net(
            local_net_vars, self.trainer.get_accum_grad_list())

        # sync: copy the updated global parameters into the local network
        self.sync = self.local_network.sync_from(global_network)

        # Discriminator (WGAN critic) -----------------------------------------
        self.local_discriminator = Discriminator_WGAN(
            action_size=ACTION_SIZE,
            device=device,
            network_scope=network_scope,
            scene_scopes=[scene_scope])
        self.local_discriminator.prepare_loss_D(ENTROPY_BETA, self.scopes_d)

        self.trainer_D = AccumTrainer_d(device=device, name="AccumTrainer_d")
        self.trainer_D.prepare_minimize(self.local_discriminator.total_loss_d,
                                        self.local_discriminator.get_vars())

        self.accum_gradients_d = self.trainer_D.accumulate_gradients()
        self.reset_gradients_d = self.trainer_D.reset_gradients()

        accum_grad_names_discrimi = [
            self._local_var_name(x)
            for x in self.trainer_D.get_accum_grad_list()
        ]
        global_discri_vars = [
            x for x in global_discriminator.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names_discrimi
        ]
        local_discri_vars = [
            x for x in self.local_discriminator.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names_discrimi
        ]

        # discriminator gradients are applied to the LOCAL discriminator
        self.apply_gradients_discriminator = grad_applier_discriminator.apply_gradients(
            local_discri_vars, self.trainer_D.get_accum_grad_list())
        # WGAN weight clipping op
        self.clip_local_d_weights = self.local_discriminator.clip_weights()

        self.sync_discriminator_l_G = self.local_discriminator.sync_to(global_discriminator)
        self.sync_discriminator_G_l = self.local_discriminator.sync_from(global_discriminator)

        self.D_var_G = global_discriminator.get_vars()
        self.D_var_l = self.local_discriminator.get_vars()
        # ----------------------------------------------------------------------

        self.env = None
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf

    def _local_var_name(self, var):
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'

    def _anneal_learning_rate(self, global_time_step):
        time_step_to_go = max(self.max_global_time_step - global_time_step, 0.0)
        learning_rate = self.initial_learning_rate * time_step_to_go / self.max_global_time_step
        return learning_rate

    def choose_action(self, pi_values):
        values = []
        sum = 0.0
        for rate in pi_values:
            sum = sum + rate
            value = sum
            values.append(value)
        r = random.random() * sum
        for i in range(len(values)):
            if values[i] >= r:
                return i
        # fail safe
        return len(values) - 1

    def _record_score(self, sess, writer, summary_op, placeholders, values,
                      global_t):
        feed_dict = {}
        for k in placeholders:
            feed_dict[placeholders[k]] = values[k]
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        if VERBOSE:
            print('writing to summary writer at time %d\n' % (global_t))
        writer.add_summary(summary_str, global_t)
        # writer.flush()

    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):
        if self.env is None:
            # lazy evaluation
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })
            self.env.reset()
            # each iteration the expert starts from the agent's current state
            self.env_Oracle = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope),
                'initial_state': self.env.current_state_id
            })
            self.env_Oracle.reset()

        # rollout storage for states, actions, targets, etc.
        states = []
        actions = []
        rewards = []
        values = []
        targets = []
        dones = []

        states_oracle = []
        actions_oracle = []
        targets_oracle = []

        terminal_end = False

        # reset accumulated gradients at the start of each iteration
        sess.run(self.reset_gradients)
        # copy weights from shared to local
        sess.run(self.sync)

        # discriminator: copy shared weights to the local critic and reset its gradients
        sess.run(self.sync_discriminator_G_l)
        sess.run(self.reset_gradients_d)

        start_local_t = self.local_t

        self.oracle = ShortestPathOracle(self.env_Oracle, ACTION_SIZE)

        # sample the expert trajectory first
        for i in range(100):
            # the oracle policy follows a shortest path from the current state
            oracle_pi = self.oracle.run_policy(self.env_Oracle.current_state_id)
            oracle_action = self.choose_action(oracle_pi)

            states_oracle.append(self.env_Oracle.s_t)
            actions_oracle.append(oracle_action)
            targets_oracle.append(self.env_Oracle.target)

            self.env_Oracle.step(oracle_action)
            terminal_o = self.env_Oracle.terminal
            self.env_Oracle.update()
            if terminal_o:
                break

        # t_max times loop: each thread runs at most LOCAL_T_MAX steps and then
        # performs a gradient update
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.env.s_t, self.env.target, self.scopes)
            action = self.choose_action(pi_)

            states.append(self.env.s_t)
            actions.append(action)
            values.append(value_)
            targets.append(self.env.target)

            if VERBOSE and (self.thread_index == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            # process game
            self.env.step(action)

            # receive game result
            reward = self.env.reward
            terminal = self.env.terminal

            # ad-hoc reward for navigation: +10 at the goal, -0.01 per step
            # (collisions are not penalised separately here)
            reward = 10.0 if terminal else -0.01
            # do not let an episode run for more than 5000 steps
            if self.episode_length > 5e3:
                terminal = True

            self.episode_reward += reward
            self.episode_length += 1
            # largest value estimate seen in this episode (starts at -inf)
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # clip reward to [-1, 1] even though the terminal reward is 10
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.env.update()

            if terminal:
                score = self.local_discriminator.run_critic(
                    sess, states, targets, actions, self.scopes_d)
                sys.stdout.write("Critic_Score = {0}".format(score))
                terminal_end = True
                sys.stdout.write(
                    "time %d | thread #%d | scene %s | target #%s\n"
                    "%s %s episode reward = %.3f\n"
                    "%s %s episode length = %d\n"
                    "%s %s episode max Q = %.3f\n" %
                    (global_t, self.thread_index, self.scene_scope,
                     self.task_scope, self.scene_scope, self.task_scope,
                     self.episode_reward, self.scene_scope, self.task_scope,
                     self.episode_length, self.scene_scope, self.task_scope,
                     self.episode_max_q))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }
                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)

                # the episode is over: reset the counters and the environment
                self.episode_reward = 0
                self.episode_length = 0
                self.episode_max_q = -np.inf
                self.env.reset()
                break

        # bootstrap the return from the value function unless the episode ended
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.env.s_t,
                                             self.env.target, self.scopes)

        # agent's samples
        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        # expert's samples
        states_oracle.reverse()
        actions_oracle.reverse()
        targets_oracle.reverse()

        # agent's batch
        batch_si = []
        batch_a = []
        batch_actions = []
        batch_td = []
        batch_R = []
        batch_t = []

        # expert's batch
        batch_si_ex = []
        batch_a_ex = []
        batch_t_ex = []

        # agent's batch as seen by the discriminator
        batch_si_d = []
        batch_t_d = []
        batch_actions_d = []

        for (s_e, a_e, t_e) in zip(states_oracle, actions_oracle, targets_oracle):
            batch_si_ex.append(s_e)
            batch_a_ex.append(a_e)
            batch_t_ex.append(t_e)

        for (ai, si, ti) in zip(actions, states, targets):
            batch_actions_d.append(ai)
            batch_si_d.append(si)
            batch_t_d.append(ti)

        cur_learning_rate = self._anneal_learning_rate(global_t)

        # discriminator (critic) updates
        for i in range(10):
            # sess.run(self.reset_gradients_d)
            sess.run(self.accum_gradients_d,
                     feed_dict={
                         self.local_discriminator.s_e: batch_si_ex,
                         self.local_discriminator.Actions_e: batch_a_ex,
                         self.local_discriminator.s_a: batch_si_d,
                         self.local_discriminator.Actions_a: batch_actions_d,
                         self.local_discriminator.t_e: batch_t_ex,
                         self.local_discriminator.t_a: batch_t_d
                     })
            # gradients are applied directly to the local discriminator
            sess.run(self.apply_gradients_discriminator,
                     feed_dict={self.learning_rate_input: 0.00005})
            loss = sess.run(self.local_discriminator.total_loss_d,
                            feed_dict={
                                self.local_discriminator.s_e: batch_si_ex,
                                self.local_discriminator.Actions_e: batch_a_ex,
                                self.local_discriminator.s_a: batch_si_d,
                                self.local_discriminator.Actions_a: batch_actions_d,
                                self.local_discriminator.t_e: batch_t_ex,
                                self.local_discriminator.t_a: batch_t_d
                            })
            # WGAN: clip the critic weights after every update
            sess.run(self.clip_local_d_weights)

        # combine the critic score with the environment rewards
        critic_r = self.local_discriminator.run_critic(sess, batch_si_d,
                                                       batch_t_d,
                                                       batch_actions_d,
                                                       self.scopes_d)
        critic_r = critic_r * 0.1
        rewards = rewards + critic_r

        # compute returns / advantages and stack them as batches for the agent
        for (ai, ri, si, Vi, ti) in zip(actions, rewards, states, values, targets):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1  # one-hot action

            batch_actions.append(ai)
            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
            batch_t.append(ti)

        # PPO: sync the current parameters into this thread's old network
        sess.run(self.old_new_sync)

        for i in range(4):
            # sess.run(self.reset_gradients)
            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.t: batch_t,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                     })
            # apply the gradients to the local network
            sess.run(self.apply_gradients_local,
                     feed_dict={self.learning_rate_input: cur_learning_rate})

        # one global update, then push the local discriminator to the global one
        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})
        sess.run(self.sync_discriminator_l_G)

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("Local timestep %d\n" % self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
class DaggerThread(object):
    def __init__(self,
                 config,
                 global_network,
                 thread_index,
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task"):
        self.thread_index = thread_index
        self.config = config
        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]
        self.local_network = global_network
        self.env = Environment({
            'scene_name': self.scene_scope,
            'terminal_state_id': int(self.task_scope)
        })
        self.env.reset()
        self.expert = Expert(self.env)
        self.local_t = 0
        self.episode_length = 0
        self.first_iteration = True  # first iteration of DAgger
        # training dataset
        self.states = []
        self.actions = []
        self.targets = []

    def choose_action_label_smooth(self, expected_action, epsilon):
        """ P(k) = (1 - epsilon) * P_e + epsilon * 1/N """
        pi_values = [epsilon / float(self.config.action_size)] * self.config.action_size
        pi_values[expected_action] += 1 - epsilon
        return pi_values

    def choose_action_greedy(self, pi_values):
        # greedy choice since this is supervised learning
        return np.argmax(pi_values, axis=0)

    def choose_action(self, pi_values):
        values = []
        s = 0.0
        for rate in pi_values:
            s += rate
            values.append(s)
        r = random.random() * s
        for i in range(len(values)):
            if values[i] >= r:
                return i
        # fail safe
        return len(values) - 1

    def add_summary(self, writer, value_dict):
        if writer is None or len(value_dict) == 0:
            return
        value = [
            tf.Summary.Value(tag=k, simple_value=v)
            for k, v in value_dict.items()
        ]
        summary = tf.Summary(value=value)
        writer.add_summary(summary,
                           global_step=self.local_network.get_global_step())
        logging.debug("writing summary %s" % (str(summary)))

    def train(self, session, writer):
        assert len(self.states) == len(self.actions), \
            "data count of action and state mismatch"
        s = self.states
        a = self.actions
        n_total = len(s)
        assert n_total > 0, "null dataset"
        t = [self.env.s_target] * n_total
        if n_total > self.config.batch_size:
            data = list(zip(s, a))
            np.random.shuffle(data)
            s, a = zip(*data)
        local_t = self.local_t
        scope = self.scene_scope + '/' + self.task_scope
        for epoch in range(self.config.max_epochs):
            train_loss, train_accuracy = self.local_network.run_epoch(
                session, self.scopes, s, t, a, True, writer)
            global_step = self.local_network.get_global_step()
            logging.info(
                "%(scope)s:t=%(local_t)d "
                "train_step=%(global_step)d loss=%(train_loss)f acc=%(train_accuracy)f"
                % locals())
        return

    def process(self, sess, global_t, summary_writer):
        start_local_t = self.local_t
        # draw experience with the current policy or the expert policy
        terminal = False
        for i in range(self.config.local_t_max):
            if self.first_iteration:
                # use the expert policy before any training has happened
                expert_action = action = self.expert.get_next_action()
                expert_lsr_pi = self.choose_action_label_smooth(
                    expert_action, self.config.lsr_epsilon)
            else:
                expert_action = self.expert.get_next_action()
                expert_lsr_pi = self.choose_action_label_smooth(
                    expert_action, self.config.lsr_epsilon)
                pi_ = self.local_network.run_policy(sess, self.env.s_t,
                                                    self.env.s_target,
                                                    self.scopes)
                action = self.choose_action(pi_)
                logging.debug(
                    "action=%(action)d expert_action=%(expert_action)d "
                    "expert_lsr_pi=%(expert_lsr_pi)s pi_=%(pi_)s" % locals())
            self.states.insert(0, self.env.s_t)
            self.actions.insert(0, expert_lsr_pi)
            self.env.step(action)
            self.env.update()
            terminal = True if self.episode_length > self.config.max_steps_per_e \
                else self.env.terminal
            self.episode_length += 1
            self.local_t += 1
            if terminal:
                logging.info(
                    "[episode end] time %d | thread #%d | scene %s | target #%s "
                    "expert:%s episode length = %d\n" %
                    (global_t, self.thread_index, self.scene_scope,
                     self.task_scope,
                     "T" if self.first_iteration else "F",
                     self.episode_length))
                summary_values = {
                    "episode_length_input": float(self.episode_length),
                }
                if not self.first_iteration:
                    # record the agent's score only
                    self.add_summary(summary_writer, summary_values)
                self.episode_length = 0
                self.env.reset()
                break
        # train the policy network with the gathered labels
        self.train(sess, summary_writer)
        self.first_iteration = False
        return self.local_t - start_local_t

    def evaluate(self, sess, n_episodes, expert_agent=False):
        ep_lengths = []
        ep_collisions = []
        accuracies = []
        for i in range(n_episodes):
            self.env.reset()
            terminal = False
            step = 0
            n_collision = 0
            while not terminal:
                if expert_agent:
                    action = self.expert.get_next_action()
                else:
                    expert_action = self.expert.get_next_action()
                    pi_ = self.local_network.run_policy(
                        sess, self.env.s_t, self.env.s_target, self.scopes)
                    action = self.choose_action(pi_)
                    accuracies.append(1.0 if expert_action == action else 0.0)
                    logging.debug(
                        "action=%(action)d expert_action=%(expert_action)d pi_=%(pi_)s"
                        % locals())
                self.env.step(action)
                self.env.update()
                terminal = self.env.terminal
                if step > self.config.max_steps_per_e:
                    terminal = True
                    logging.debug("episode %(i)d hits max steps" % locals())
                n_collision += int(self.env.collided)
                step += 1
            logging.debug("episode %(i)d ends with %(step)d steps" % locals())
            ep_lengths.append(step)
            ep_collisions.append(n_collision)
        return ep_lengths, ep_collisions, accuracies
class Expert(object):
    def __init__(self,
                 loop_index,
                 max_global_time_step,
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task"):
        self.max_global_time_step = max_global_time_step
        self.network_scope = network_scope  # associated with the thread number
        self.scene_scope = scene_scope      # which scene (e.g. a kitchen) this expert runs in
        self.task_scope = task_scope        # the target state
        self.scopes = [network_scope, scene_scope, task_scope]  # ["thread-n", "scene", "target"]
        self.env = None
        self.local_t = 0
        self.oracle = None

    def choose_action(self, oracle_pi_values):
        pi_values = oracle_pi_values
        r = random.random() * np.sum(pi_values)
        values = np.cumsum(pi_values)
        for i in range(len(values)):
            if values[i] >= r:
                return i

    def open_file_and_save(self, file_path, data):
        try:
            with open(file_path, 'ab') as f_handle:
                np.savetxt(f_handle, data, fmt='%s')
        except FileNotFoundError:
            with open(file_path, 'wb') as f_handle:
                np.savetxt(f_handle, data, fmt='%s')

    def save_expert(self):
        states = []
        action_list = []
        target_list = []
        action_history_list = []
        for i in range(10):
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })
            print("Starting episode", i)
            print("current state id:", self.env.current_state_id)
            print("target state id:", self.env.terminal_state_id)
            print("number of possible states:", self.env.n_locations)
            print("-" * 80)
            self.oracle = ShortestPathOracle(self.env, ACTION_SIZE)
            action_itr = 0  # used to build the action history
            while not self.env.terminal:
                s_t = self.env.s_t
                target = self.env.s_target
                # self.oracle = ShortestPathOracle(self.env, ACTION_SIZE)
                # oracle policy: shortest-path action distribution for the current state
                oracle_pi = self.oracle.run_policy(self.env.current_state_id)
                # sample the action probabilistically
                action = self.choose_action(oracle_pi)
                if action_itr == 0:
                    action_his = np.tile(action, (4))
                    action_itr = 1
                else:
                    action_his = np.append(action_his[1:], action)
                # save the current (state, action, target, action-history) tuple
                states.append(s_t)
                action_list.append(action)
                target_list.append(target)
                action_history_list.append(action_his)
                self.env.step(action)  # advance to the next state
                is_terminal = self.env.terminal
                is_collided = self.env.collided
                self.local_t += 1
                if is_collided:
                    print("Wrong action: the expert collided, aborting this episode")
                    break
                # s_t1 -> s_t
                self.env.update()
            print("Done with one episode from start state to goal")
            self.env.reset()
        states = np.reshape(states, newshape=[-1, 8192])
        target_list = np.reshape(target_list, newshape=[-1, 8192])
        self.open_file_and_save('trajectory/observations.csv', states)
        print(np.shape(states))
        print(np.shape(action_list))
        print(np.shape(action_history_list))
        print(np.shape(target_list))
        self.open_file_and_save('trajectory/actions.csv', action_list)
        self.open_file_and_save('trajectory/targets.csv', target_list)
        self.open_file_and_save('trajectory/actions_history.csv', action_history_list)
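# Sketch only: a hypothetical driver that dumps expert trajectories for a single
# scene/target into the trajectory/*.csv files written by save_expert().
if __name__ == '__main__':
    expert = Expert(loop_index=0, max_global_time_step=0,
                    scene_scope='bathroom_02', task_scope='26')
    expert.save_expert()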
class SmashNetTrainingThread(object):
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device,
                 initial_diffidence_rate_seed,
                 mode="train",
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task",
                 encourage_symmetry=False):
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step
        self.network_scope = network_scope  # associated with the thread number
        self.scene_scope = scene_scope
        self.task_scope = task_scope        # the target
        self.scopes = [network_scope, scene_scope, task_scope]  # ["thread-n", "scene", "target"]

        # per-thread SmashNet policy network
        self.local_network = SmashNet(
            action_size=ACTION_SIZE,
            device=device,
            network_scope=network_scope,
            scene_scopes=[scene_scope])
        self.local_network.prepare_loss(self.scopes)  # build this thread's loss

        if mode == "train":
            self.trainer = AccumTrainer(device)
            self.trainer.prepare_minimize(self.local_network.loss,
                                          self.local_network.get_vars())
            self.accum_gradients = self.trainer.accumulate_gradients()
            self.reset_gradients = self.trainer.reset_gradients()

            accum_grad_names = [
                self._local_var_name(x)
                for x in self.trainer.get_accum_grad_list()
            ]
            # accumulated gradients are applied to the global network
            global_net_vars = [
                x for x in global_network.get_vars()
                if self._get_accum_grad_name(x) in accum_grad_names
            ]

            self.apply_gradients = grad_applier.apply_gradients(
                global_net_vars, self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.env = None
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate

        # self.episode_reward = 0
        self.episode_length = 0
        # self.episode_max_q = -np.inf
        self.episode_pi_sim = 0
        self.episode_loss = 0

        self.initial_diffidence_rate_seed = initial_diffidence_rate_seed
        self.oracle = None
        self.mode = mode
        self.encourage_symmetry = encourage_symmetry

    def _local_var_name(self, var):
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'

    def _anneal_rate(self, init_rate, global_time_step):
        time_step_to_go = max(self.max_global_time_step - global_time_step, 0.0)
        rate = init_rate * time_step_to_go / self.max_global_time_step
        return rate

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self._anneal_rate(self.initial_learning_rate, global_time_step)
        return learning_rate

    def _inverse_sigmoid_decay_rate(self, init_rate_seed, global_time_step):
        rate = init_rate_seed * np.exp(-global_time_step / init_rate_seed)
        rate = rate / (1. + rate)
        return rate

    def _anneal_diffidence_rate(self, global_time_step):
        if self.initial_diffidence_rate_seed == 0:
            return 0
        else:
            return self._inverse_sigmoid_decay_rate(
                self.initial_diffidence_rate_seed, global_time_step)

    # TODO: check
    def choose_action(self, smashnet_pi_values, oracle_pi_values, confidence_rate):
        r = random.random()
        if r < confidence_rate:
            pi_values = oracle_pi_values
        else:
            pi_values = smashnet_pi_values
        r = random.random() * np.sum(pi_values)
        values = np.cumsum(pi_values)
        for i in range(len(values)):
            if values[i] >= r:
                return i

    def _record_score(self, sess, writer, summary_op, placeholders, values, global_t):
        feed_dict = {}
        for k in placeholders:
            feed_dict[placeholders[k]] = values[k]
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        writer.add_summary(summary_str, global_t)
        # writer.flush()

    def _evaluate(self, sess, list_of_tasks, num_episodes, max_steps, success_cutoff):
        scene_scopes = list_of_tasks.keys()
        results = {}
        for scene_scope in scene_scopes:
            for task_scope in list_of_tasks[scene_scope]:
                env = Environment({
                    'scene_name': scene_scope,
                    'terminal_state_id': int(task_scope)
                })
                ep_lengths = []
                ep_collisions = []
                oracle_lengths = []
                ep_successes = []
                scopes = [self.network_scope, scene_scope, task_scope]
                for i_episode in range(num_episodes):
                    env.reset()
                    oracle_lengths.append(env.shortest_path_distances[
                        env.current_state_id][env.terminal_state_id])
                    terminal = False
                    ep_length = 0
                    ep_collision = 0
                    while not terminal:
                        pi_values = self.local_network.run_policy(
                            sess, env.s_t, env.target, scopes)
                        action = sample_action(pi_values)
                        env.step(action)
                        env.update()
                        terminal = env.terminal
                        if ep_length == max_steps:
                            break
                        if env.collided:
                            ep_collision += 1
                        ep_length += 1
                    ep_lengths.append(ep_length)
                    ep_collisions.append(ep_collision)
                    ep_successes.append(int(ep_length < success_cutoff))
                results[scene_scope + task_scope] = [
                    np.mean(ep_lengths),
                    np.mean(ep_collisions),
                    np.mean(oracle_lengths),
                    np.mean(ep_successes)
                ]
        return results

    def _flip_policy(self, policy):
        flipped_policy = np.array([policy[3], policy[2], policy[1], policy[0]])
        return flipped_policy

    def process(self, sess, global_t, summary_writer, summary_op, summary_placeholders):
        if self.env is None:
            # lazy evaluation
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })
            self.env.reset()
            # shortest-path action probabilities toward the target
            self.oracle = ShortestPathOracle(self.env, ACTION_SIZE)

        states = []
        targets = []
        oracle_pis = []  # expert policies

        terminal_end = False

        if self.mode == "train":
            # reset accumulated gradients
            sess.run(self.reset_gradients)
            # copy weights from shared to local
            sess.run(self.sync)

        start_local_t = self.local_t

        # t_max times loop (5 steps)
        for i in range(LOCAL_T_MAX):
            flipped_run = self.encourage_symmetry and np.random.random() > 0.5
            if flipped_run:
                s_t = self.env.target
                g = self.env.s_t
            else:
                s_t = self.env.s_t
                g = self.env.target

            smashnet_pi = self.local_network.run_policy(sess, s_t, g, self.scopes)
            if flipped_run:
                smashnet_pi = self._flip_policy(smashnet_pi)
            oracle_pi = self.oracle.run_policy(self.env.current_state_id)

            diffidence_rate = self._anneal_diffidence_rate(global_t)
            action = self.choose_action(smashnet_pi, oracle_pi, diffidence_rate)

            states.append(s_t)
            targets.append(g)
            if flipped_run:
                oracle_pis.append(self._flip_policy(oracle_pi))
            else:
                oracle_pis.append(oracle_pi)

            # if VERBOSE and global_t % 10000 == 0:
            #     print("Thread %d" % (self.thread_index))
            #     sys.stdout.write("SmashNet Pi = {}, Oracle Pi = {}\n".format(
            #         ["{:0.2f}".format(i) for i in smashnet_pi],
            #         ["{:0.2f}".format(i) for i in oracle_pi]))

            if VALIDATE and global_t % VALIDATE_FREQUENCY == 0 and global_t > 0 \
                    and self.thread_index == 0:
                results = self._evaluate(sess,
                                         list_of_tasks=VALID_TASK_LIST,
                                         num_episodes=NUM_VAL_EPISODES,
                                         max_steps=MAX_VALID_STEPS,
                                         success_cutoff=SUCCESS_CUTOFF)
                print("Thread %d" % (self.thread_index))
                print("Validation results: %s" % (results))

            self.env.step(action)
            is_terminal = self.env.terminal or self.episode_length > 5e3
            if self.mode == "val" and self.episode_length > 1e3:
                is_terminal = True

            self.episode_length += 1
            self.episode_pi_sim += 1. - cosine(smashnet_pi, oracle_pi)

            self.local_t += 1

            # s_t1 -> s_t
            self.env.update()

            if is_terminal:
                terminal_end = True
                if self.mode == "val":
                    sess.run(self.sync)
                sys.stdout.write(
                    "time %d | thread #%d | scene %s | target %s | episode length = %d\n"
                    % (global_t, self.thread_index, self.scene_scope,
                       self.task_scope, self.episode_length))
                summary_values = {
                    "episode_length_input": float(self.episode_length),
                    "episode_pi_sim_input": self.episode_pi_sim / float(self.episode_length),
                    "episode_loss_input": float(self.episode_loss)
                }
                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values, global_t)
                self.episode_length = 0
                self.episode_pi_sim = 0
                self.episode_loss = 0
                self.env.reset()
                break

        if self.mode == "train":
            # reverse all three lists so states, targets and oracle policies stay aligned
            states.reverse()
            targets.reverse()
            oracle_pis.reverse()

            batch_si = []
            batch_ti = []
            batch_opi = []

            # compute and accumulate gradients
            for (si, ti, opi) in zip(states, targets, oracle_pis):
                batch_si.append(si)
                batch_ti.append(ti)
                batch_opi.append(opi)

            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.t: batch_ti,
                         self.local_network.opi: batch_opi
                     })

            self.episode_loss += sum(
                sess.run(self.local_network.loss,
                         feed_dict={
                             self.local_network.s: batch_si,
                             self.local_network.t: batch_ti,
                             self.local_network.opi: batch_opi
                         }))

            cur_learning_rate = self._anneal_learning_rate(global_t)
            sess.run(self.apply_gradients,
                     feed_dict={self.learning_rate_input: cur_learning_rate})

        # if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
        #     sys.stdout.write("Local timestep %d\n" % self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
import pdb
import argparse

import gym
import numpy as np
import tensorflow as tf

from utils.accum_trainer import AccumTrainer
from utils.ops import sample_action
from scene_loader import THORDiscreteEnvironment as Environment
from dagger_policy_generators import SmashNet, ShortestPathOracle
from dagger_constants import (ACTION_SIZE, GAMMA, LOCAL_T_MAX, ENTROPY_BETA,
                              VERBOSE, VALID_TASK_LIST, NUM_VAL_EPISODES,
                              VALIDATE, VALIDATE_FREQUENCY, SUCCESS_CUTOFF,
                              MAX_VALID_STEPS)
from network_models.policy_net import Policy_net
from network_models.discriminator import Discriminator
from algo.ppo import PPOTrain

scene_scope = 'bathroom_02'
task_scope = 26  # 26 43 53 32 41

env = Environment({
    'scene_name': scene_scope,
    'terminal_state_id': int(task_scope)
})

act = 3
next_obs, is_terminal, is_collided = env.step(act)
'''
0 = move forward
1 = turn left
2 = turn right
3 = move down
'''
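# Sketch only: a quick sanity check of the step() interface shown above, driving
# the environment with random actions; the range of 4 actions matches the legend
# in the docstring.
for _ in range(10):
    act = np.random.randint(0, 4)
    next_obs, is_terminal, is_collided = env.step(act)
    if is_terminal or is_collided:
        env.reset()
        break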
class A3CTrainingThread(object):
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device,
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task"):
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step
        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]
        self.task_scope_name = 1

        self.local_network = ActorCriticFFNetwork(
            action_size=ACTION_SIZE,
            device=device,
            network_scope=network_scope,
            scene_scopes=[scene_scope])
        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())
        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        accum_grad_names = [
            self._local_var_name(x)
            for x in self.trainer.get_accum_grad_list()
        ]
        global_net_vars = [
            x for x in global_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]

        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())
        self.sync = self.local_network.sync_from(global_network)

        self.env = None
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf

    def _local_var_name(self, var):
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'

    def _anneal_learning_rate(self, global_time_step):
        time_step_to_go = max(self.max_global_time_step - global_time_step, 0.0)
        learning_rate = self.initial_learning_rate * time_step_to_go / self.max_global_time_step
        return learning_rate

    def choose_action(self, pi_values):
        values = []
        sum = 0.0
        for rate in pi_values:
            sum = sum + rate
            value = sum
            values.append(value)
        r = random.random() * sum
        for i in range(len(values)):
            if values[i] >= r:
                return i
        # fail safe
        return len(values) - 1

    def _record_score(self, sess, writer, summary_op, placeholders, values, global_t):
        feed_dict = {}
        for k in placeholders:
            feed_dict[placeholders[k]] = values[k]
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        if VERBOSE:
            print('writing to summary writer at time %d\n' % (global_t))
        writer.add_summary(summary_str, global_t)
        # writer.flush()

    def process(self, sess, global_t, summary_writer, summary_op, summary_placeholders):
        if self.env is None:
            # lazy evaluation: pick a random terminal state for this thread
            time.sleep(self.thread_index * 1.0)
            self.task_scope_name = random.randint(1, 468) - 1
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': self.task_scope_name
            })
            self.env.reset()

        states = []
        actions = []
        rewards = []
        values = []
        targets = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)
        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.env.s_t, self.env.target, self.scopes)
            action = self.choose_action(pi_)

            states.append(self.env.s_t)
            actions.append(action)
            values.append(value_)
            targets.append(self.env.target)

            if VERBOSE and (self.thread_index == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            # process game
            self.env.step(action)

            # receive game result
            reward = self.env.reward
            terminal = self.env.terminal

            # ad-hoc reward for navigation
            reward = 10.0 if terminal else -0.01
            if self.episode_length > 5e3:
                terminal = True

            self.episode_reward += reward
            self.episode_length += 1
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.env.update()

            if terminal:
                terminal_end = True
                print('----------')
                print('real terminal id is {}'.format(self.task_scope_name))
                sys.stdout.write(
                    "time %d | thread #%d | scene %s | target #%s\n"
                    "%s %s episode reward = %.3f\n"
                    "%s %s episode length = %d\n"
                    "%s %s episode max Q = %.3f\n" %
                    (global_t, self.thread_index, self.scene_scope,
                     self.task_scope, self.scene_scope, self.task_scope,
                     self.episode_reward, self.scene_scope, self.task_scope,
                     self.episode_length, self.scene_scope, self.task_scope,
                     self.episode_max_q))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }
                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values, global_t)

                self.episode_reward = 0
                self.episode_length = 0
                self.episode_max_q = -np.inf

                # resample a random target for the next episode
                self.task_scope_name = random.randint(1, 468) - 1
                self.env = Environment({
                    'scene_name': self.scene_scope,
                    'terminal_state_id': self.task_scope_name
                })
                self.env.reset()
                print('init id is {}'.format(self.env.current_state_id))
                print('----------')
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.env.s_t,
                                             self.env.target, self.scopes)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []
        batch_t = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi, ti) in zip(actions, rewards, states, values, targets):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
            batch_t.append(ti)

        if USE_LSTM:
            batch_si.reverse()
            batch_a.reverse()
            batch_td.reverse()
            batch_R.reverse()
            batch_t.reverse()
            # start_lstm_state is captured before the rollout when USE_LSTM is
            # enabled (not shown in this excerpt)
            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.t: batch_t,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.local_network.initial_lstm_state: start_lstm_state,
                     })
            # self.local_network.step_size: [len(batch_a)]
        else:
            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.t: batch_t,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R
                     })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("Local timestep %d\n" % self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
def main(args):
    scene_scope = 'bathroom_02'
    task_scope = 26  # 26 43 53 32 41
    env = Environment({
        'scene_name': scene_scope,
        'terminal_state_id': int(task_scope)
    })

    S_Class = SIAMESE()                       # siamese embedding network
    Policy = Policy_net('policy', S_Class)    # actor-critic graph
    Old_Policy = Policy_net('old_policy', S_Class)  # frozen copy used by PPO
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)  # gradient-update graph
    D = Discriminator(S_Class)                # GAIL discriminator
    '''
    batch_n = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Siamese')
    '''

    # load expert states / targets / actions
    expert_observations = np.genfromtxt('trajectory/observations.csv')  # expert demonstrations
    expert_targets = np.genfromtxt('trajectory/targets.csv')
    expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)
    expert_observations = np.reshape(expert_observations, newshape=[-1, 2048, 4])
    expert_targets = np.reshape(expert_targets, newshape=[-1, 2048, 4])

    saver = tf.train.Saver()  # assign another saver if you want to use BC weights
    if args.restore:
        # a separate saver is needed only for restoring parameters from the BC-pretrained model
        saver2 = tf.train.Saver([
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='policy'),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Siamese')
        ])

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())  # initializes both the old and new policy nets
        if args.restore:
            if args.model == '':
                saver2.restore(sess, args.modeldir + '/' + args.alg + '/' + 'shamane.ckpt')
                print("Model restored")
            else:
                saver.restore(sess, args.modeldir + '/' + args.alg + '/' + 'model.ckpt-' + args.model)

        success_num = 0  # counts how often the agent reached the terminal state
        # var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

        for iteration in range(100000):  # args.iteration -- adversarial training loop
            print("Starting the iteration:", iteration)
            observations = []
            actions = []
            # rewards = []
            targets = []  # for GAIL
            v_preds = []
            run_policy_steps = 0
            while True:  # sample a trajectory from the current (still untrained) agent
                run_policy_steps += 1
                obs = np.stack([env.s_t]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                target = np.stack([env.s_target]).astype(dtype=np.float32)  # keeps the input shape [batch_size, 2048, 4]

                act, v_pred, prob = Policy.act(state=obs, target=target, stochastic=True)  # agent's action and value
                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                observations.append(obs)
                targets.append(target)
                actions.append(act)
                v_preds.append(v_pred)

                # next_obs, reward, done, info = env.step(act)
                next_obs, is_terminal, is_collided = env.step(act)

                if is_terminal:
                    success_num += 1
                    print("Congratulations, the agent just reached the terminal state:", env.terminal_state_id)
                if is_collided:
                    print("Bad luck, the agent collided and could not reach the terminal state:", env.terminal_state_id)

                if is_terminal or is_collided or (run_policy_steps == 100):
                    # run one episode until termination
                    print("Number of steps explored by the agent:", run_policy_steps)
                    # the state after a terminal state has value 0; this list is used to update the value net
                    v_preds_next = v_preds[1:] + [0]
                    print("Environment is resetting after the collision/terminal")
                    obs = env.reset()
                    # reward = -1
                    break  # this break ends the episode; the buffers are rebuilt next iteration

            # print(sum(rewards))
            writer.add_summary(
                tf.Summary(value=[tf.Summary.Value(tag='episode_length',
                                                   simple_value=run_policy_steps)]),
                iteration)
            # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]), iteration)

            if success_num >= 5000:
                saver.save(sess, args.savedir + '/model.ckpt')
                print('Clear!! Model saved.')
                break
            # else:
            #     success_num = 0

            # convert lists to numpy arrays for feeding tf.placeholder
            observations = np.reshape(observations, newshape=[-1, 2048, 4])
            targets = np.reshape(targets, newshape=[-1, 2048, 4])
            actions = np.array(actions).astype(dtype=np.int32)

            # train the discriminator
            Dis_input = [expert_observations, expert_targets, expert_actions,
                         observations, targets, actions]
            if observations.shape[0] < expert_observations.shape[0]:
                High = observations.shape[0]
            else:
                High = expert_observations.shape[0]
            for i in range(100):
                sample_indices = np.random.randint(low=0, high=High, size=32)
                sampled_inp_D = [np.take(a=a, indices=sample_indices, axis=0) for a in Dis_input]
                D.train(expert_s=sampled_inp_D[0],
                        expert_t=sampled_inp_D[1],
                        expert_a=sampled_inp_D[2],
                        agent_s=sampled_inp_D[3],
                        agent_t=sampled_inp_D[4],
                        agent_a=sampled_inp_D[5])
            '''
            D.train(expert_s=expert_observations, expert_t=expert_targets, expert_a=expert_actions,
                    agent_s=observations, agent_t=targets, agent_a=actions)
            '''

            # to obtain per-step rewards an RNN could be used; here the discriminator
            # scores each state/action pair against the expert
            d_rewards = D.get_rewards(agent_s=observations, agent_t=targets, agent_a=actions)
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)  # one reward per state/action pair

            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds,
                                v_preds_next=v_preds_next)  # advantage estimates for PPO
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)  # value of the next state

            # train policy
            inp = [observations, targets, actions, gaes, d_rewards, v_preds_next]
            PPO.assign_policy_parameters()  # copy the current policy weights into the old-policy net
            for epoch in range(100):
                # train the actor-critic on the collected rollout against the already-trained discriminator
                sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32)  # indices are in [low, high)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]
                PPO.train(state=sampled_inp[0],
                          targets=sampled_inp[1],
                          actions=sampled_inp[2],
                          gaes=sampled_inp[3],
                          rewards=sampled_inp[4],
                          v_preds_next=sampled_inp[5])

            summary = PPO.get_summary(obs=inp[0], target=inp[1], actions=inp[2],
                                      gaes=inp[3], rewards=inp[4], v_preds_next=inp[5])
            writer.add_summary(summary, iteration)
        writer.close()
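# PPO.get_gaes above is assumed to compute standard generalized advantage estimation
# over the discriminator rewards; the exact coefficients live inside the PPOTrain class,
# which is not shown here. A minimal sketch under that assumption (the gamma/lam values
# are illustrative only):
import numpy as np

def get_gaes(rewards, v_preds, v_preds_next, gamma=0.95, lam=0.95):
    # one-step TD residuals
    deltas = [r + gamma * v_next - v
              for r, v_next, v in zip(rewards, v_preds_next, v_preds)]
    gaes = np.array(deltas, dtype=np.float32)
    # exponentially weighted sum of the residuals, accumulated from the back of the episode
    for t in reversed(range(len(gaes) - 1)):
        gaes[t] = gaes[t] + gamma * lam * gaes[t + 1]
    return gaes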
def train(rank, scene_scope, task_scope, args, shared_model, counter, lock, optimizer=None):
    torch.manual_seed(args.seed + rank)

    # env = create_atari_env(args.env_name)
    # env.seed(args.seed + rank)
    env = Environment({
        'scene_name': scene_scope,
        'terminal_state_id': int(task_scope)
    })
    model = ActorCriticFFNetwork(ACTION_SIZE)

    if optimizer is None:
        # TODO: discount learning rate based on episode length
        optimizer = my_optim.SharedRMSprop(shared_model.parameters(),
                                           lr=args.lr, alpha=args.alpha, eps=args.eps)
        optimizer.share_memory()

    model.train()

    env.reset()
    state = torch.from_numpy(env.s_t)
    done = True
    episode_length = 0

    for i in range(int(args.max_episode_length)):
        # sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        '''
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)
        '''
        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            print('Thread: ', rank, ', step: ', step, 'epochs:', i)
            episode_length += 1
            logit, value = model(env.s_t, env.target)
            prob = F.softmax(logit, dim=1)
            log_prob = F.log_softmax(logit, dim=1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).data
            log_prob = log_prob.gather(1, Variable(action))

            env.step(action)
            # state, reward, done, _ = env.step(action.numpy())
            env.update()
            state = env.s_t
            reward = env.reward
            done = env.terminal
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            with lock:
                if counter.value % 1000 == 0:
                    print('Now saving data. Please wait.')
                    torch.save(shared_model.state_dict(),
                               CHECKPOINT_DIR + '/' + 'checkpoint.pth.tar')
                counter.value += 1

            if done:
                episode_length = 0
                if env.terminal:
                    print('Task completed')
                counter.value += 1
                env.reset()
                state = env.s_t

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            _, value = model(env.s_t, env.target)
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for t in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[t]
            advantage = R - values[t]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[t] + args.gamma * values[t + 1].data - values[t].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - log_probs[t] * Variable(gae) - args.entropy_coef * entropies[t]

        optimizer.zero_grad()
        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
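# ensure_shared_grads is called above but defined elsewhere in the repository. In the
# common PyTorch A3C pattern it copies the worker's gradients into the shared model
# exactly once before optimizer.step(); a sketch of that assumed behaviour:
def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            # another worker already populated the shared gradients for this step
            return
        shared_param._grad = param.grad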
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step, device,
                 network_scope="network", scene_scope="scene", task_scope="task"):
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]

        self.local_network = ActorCriticFFNetwork(action_size=ACTION_SIZE,
                                                  device=device,
                                                  network_scope=network_scope,
                                                  scene_scopes=[scene_scope])
        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        accum_grad_names = [self._local_var_name(x)
                            for x in self.trainer.get_accum_grad_list()]
        global_net_vars = [x for x in global_network.get_vars()
                           if self._get_accum_grad_name(x) in accum_grad_names]

        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.env = None
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf

    def _local_var_name(self, var):
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'

    def _anneal_learning_rate(self, global_time_step):
        time_step_to_go = max(self.max_global_time_step - global_time_step, 0.0)
        learning_rate = self.initial_learning_rate * time_step_to_go / self.max_global_time_step
        return learning_rate

    def choose_action(self, pi_values):
        # sample an action index proportionally to the policy probabilities
        values = []
        cumulative = 0.0
        for rate in pi_values:
            cumulative += rate
            values.append(cumulative)
        r = random.random() * cumulative
        for i in range(len(values)):
            if values[i] >= r:
                return i
        # fail safe
        return len(values) - 1

    def _record_score(self, sess, writer, summary_op, placeholders, values, global_t):
        feed_dict = {}
        for k in placeholders:
            feed_dict[placeholders[k]] = values[k]
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        if VERBOSE:
            sys.stdout.write('writing to summary writer at time %d\n' % global_t)
        writer.add_summary(summary_str, global_t)
        # writer.flush()

    def process(self, sess, global_t, summary_writer, summary_op, summary_placeholders):
        if self.env is None:
            # lazy evaluation
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })

        states = []
        actions = []
        rewards = []
        values = []
        targets = []
        rnn_inits = []
        state_representation = []
        usf = []
        reward_vector = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        # at each episode start the initial state of the RNN is set to zero
        start_local_t = self.local_t
        start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_, value_, usf_s_g = self.local_network.run_policy_and_value(
                sess, self.env.s_t, self.env.target, self.scopes)
            imidia_s = self.local_network.run_state(sess, self.env.s_t, self.scopes)
            # usf_s_g = self.local_network.run_usf(sess, self.env.s_t, self.env.target, self.rnn_state_init[0], self.rnn_state_init[1], self.scopes)
            action = self.choose_action(pi_)

            states.append(self.env.s_t)
            actions.append(action)
            values.append(value_)
            targets.append(self.env.target)
            usf.append(usf_s_g)
            state_representation.append(imidia_s)

            if VERBOSE and (self.thread_index == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            # process game
            self.env.step(action)

            # receive game result
            reward = self.env.reward
            terminal = self.env.terminal

            # ad-hoc reward for navigation
            reward = 10.0 if terminal else -0.01
            if self.episode_length > 5e3:
                terminal = True

            self.episode_reward += reward
            self.episode_length += 1
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.env.update()

            if i == (LOCAL_T_MAX - 1) or terminal:
                # build the "next step" successor-feature and state-representation targets
                imidiate_state_representation_next = []
                usf_next = []
                # reward_vector_predictor_next = []
                last_state = self.env.s_t
                imidia_s_next = self.local_network.run_state(sess, self.env.s_t, self.scopes)
                state_representation_next = state_representation[1:] + [imidia_s_next]
                if terminal:
                    usf_next_imi = 0
                else:
                    usf_next_imi = self.local_network.run_usf(
                        sess, self.env.s_t, self.env.target, self.scopes)
                usf_next = usf[1:] + [usf_next_imi]

            if terminal:
                terminal_end = True
                sys.stdout.write("time %d | thread #%d | scene %s | target #%s\n"
                                 "%s %s episode reward = %.3f\n"
                                 "%s %s episode length = %d\n"
                                 "%s %s episode max Q = %.3f\n" %
                                 (global_t, self.thread_index, self.scene_scope, self.task_scope,
                                  self.scene_scope, self.task_scope, self.episode_reward,
                                  self.scene_scope, self.task_scope, self.episode_length,
                                  self.scene_scope, self.task_scope, self.episode_max_q))

                oneResult = [global_t, self.thread_index, self.scene_scope, self.task_scope,
                             self.episode_reward, self.episode_length, self.episode_max_q]
                with open('trainingOutput.csv', 'a+') as fp:
                    # fd.write(oneResult)
                    wr = csv.writer(fp)
                    wr.writerow(oneResult)

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }
                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values, global_t)

                self.episode_reward = 0
                self.episode_length = 0
                self.episode_max_q = -np.inf
                self.local_network.reset_state()
                self.env.reset()
                break

        R = 0.0
        usf_R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.env.s_t, self.env.target, self.scopes)
            usf_R = self.local_network.run_usf(sess, self.env.s_t, self.env.target, self.scopes)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()
        state_representation.reverse()
        state_representation_next.reverse()
        usf_next.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []
        batch_usf_R = []
        batch_t = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi, ti, state, usf_n) in zip(actions, rewards, states, values,
                                                      targets, state_representation_next,
                                                      usf_next):
            R = ri + GAMMA * R
            usf_R = state + GAMMA * usf_R
            # usf_R = state + GAMMA * usf_n
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
            batch_usf_R.append(usf_R)
            batch_t.append(ti)

        # reverse the batches back to time order: training unrolls the LSTM for the full
        # LOCAL_T_MAX steps, unlike inference
        batch_si.reverse()
        batch_a.reverse()
        batch_td.reverse()
        batch_R.reverse()
        batch_usf_R.reverse()
        batch_t.reverse()

        sess.run(self.accum_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.t: batch_t,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R,
                     self.local_network.return_usf: batch_usf_R,
                     self.local_network.initial_lstm_state: start_lstm_state,
                     self.local_network.step_size: [len(batch_a)],
                 })

        cur_learning_rate = self._anneal_learning_rate(global_t)
        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("Local timestep %d\n" % self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
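# batch_usf_R above accumulates successor-feature targets with the same recursion as the
# scalar return, but over state embeddings: psi(s_t) = phi(s_t) + GAMMA * psi(s_{t+1}).
# A small numpy sketch of that accumulation; usf_returns is a hypothetical helper name,
# not part of this codebase.
import numpy as np

def usf_returns(state_features, bootstrap_usf, gamma=0.99):
    """state_features: phi(s_t) in time order; bootstrap_usf: psi after the rollout (zeros at a terminal)."""
    psi = np.asarray(bootstrap_usf, dtype=np.float32)
    targets = []
    for phi in reversed(state_features):      # walk from the last step back to the first
        psi = np.asarray(phi, dtype=np.float32) + gamma * psi
        targets.append(psi)
    return targets[::-1]                      # back to time order, as the feed_dict expects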
def test(rank, scene_scope, task_scope, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)

    env = Environment({
        'scene_name': scene_scope,
        'terminal_state_id': int(task_scope)
    })
    model = ActorCriticFFNetwork(ACTION_SIZE)
    model.eval()

    height, width, layers = env.observation.shape
    video = cv2.VideoWriter('video/' + task_scope + '.mp4', -1, 1, (width, height))

    env.reset()
    state = torch.from_numpy(env.s_t)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    img = cv2.cvtColor(env.observation, cv2.COLOR_BGR2RGB)
    video.write(img)
    for i in range(100):
        episode_length += 1
        # sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())

        logit, value = model(env.s_t, env.target)
        prob = F.softmax(logit, dim=1)
        action = prob.max(1, keepdim=True)[1].data.numpy()

        env.step(action[0, 0])
        env.update()
        img = cv2.cvtColor(env.observation, cv2.COLOR_BGR2RGB)
        video.write(img)
        reward = env.reward
        state = env.s_t
        done = env.terminal
        print(env.terminal_state_id, env.current_state_id)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                counter.value, counter.value / (time.time() - start_time),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            env.reset()
            state = env.s_t
            break

        state = torch.from_numpy(state)

    cv2.destroyAllWindows()
    video.release()
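# The deque(maxlen=100) heuristic above ends the episode once the greedy policy has
# emitted the same action for the whole window: actions.count(actions[0]) can only equal
# maxlen when the buffer is full and uniform. A tiny illustration with a window of 3:
from collections import deque

recent = deque(maxlen=3)
for a in [2, 2, 2]:
    recent.append(a)
print(recent.count(recent[0]) == recent.maxlen)  # True -> treat the agent as stuck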
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, global_discriminator,
                 initial_learning_rate, learning_rate_input, grad_applier,
                 grad_applier_discriminator, max_global_time_step, device,
                 network_scope="network", scene_scope="scene", task_scope="task"):
        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.network_scope = network_scope
        self.network_scope_D = network_scope + "_d"
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]
        self.scopes_d = [self.network_scope_D, scene_scope, task_scope]

        self.local_network = ActorCriticFFNetwork(action_size=ACTION_SIZE,
                                                  device=device,
                                                  network_scope=network_scope,
                                                  scene_scopes=[scene_scope])
        self.local_discriminator = Discriminator_WGAN(action_size=ACTION_SIZE,
                                                      device=device,
                                                      network_scope=network_scope,
                                                      scene_scopes=[scene_scope])

        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)
        self.local_discriminator.prepare_loss_D(ENTROPY_BETA, self.scopes_d)

        self.trainer = AccumTrainer(device)
        # a separate accumulator instance for the discriminator
        self.trainer_D = AccumTrainer(device, name="AccumTrainer_d")
        # gradients of the local network variables
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())
        self.trainer_D.prepare_minimize(self.local_discriminator.total_loss_d,
                                        self.local_discriminator.get_vars())

        new_variable_list = self.local_network.get_vars()
        old_variable_list = self.local_network.get_vars_old()
        self.old_new_sync = self.local_network.sync_curre_old()

        # ops that accumulate gradients, and ops that reset the accumulators after the
        # gradients have been applied to the variables
        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        # the same pair of ops for the discriminator
        self.accum_gradients_d = self.trainer_D.accumulate_gradients()
        self.reset_gradients_d = self.trainer_D.reset_gradients()

        # names of all gradient accumulators
        accum_grad_names = [self._local_var_name(x)
                            for x in self.trainer.get_accum_grad_list()]
        accum_grad_names_discrimi = [self._local_var_name(x)
                                     for x in self.trainer_D.get_accum_grad_list()]

        # keep only the variables that have a matching gradient accumulator
        global_net_vars = [x for x in global_network.get_vars()
                           if self._get_accum_grad_name(x) in accum_grad_names]
        local_net_vars = [x for x in self.local_network.get_vars()
                          if self._get_accum_grad_name(x) in accum_grad_names]
        global_discri_vars = [x for x in global_discriminator.get_vars()
                              if self._get_accum_grad_name(x) in accum_grad_names_discrimi]

        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())
        self.apply_gradients_local = grad_applier.apply_gradients_local_net(
            local_net_vars, self.trainer.get_accum_grad_list())

        # discriminator
        self.apply_gradients_discriminator = grad_applier_discriminator.apply_gradients(
            global_discri_vars, self.trainer_D.get_accum_grad_list())
        # the global discriminator weights are clipped directly; if this turns out to be
        # unstable, it is preferable to apply the gradients on the local network first,
        # clip there, and only then update the global one
        self.clip_global_d_weights = global_discriminator.clip_weights()

        # sync ops: copy the updated global parameters back into the local networks
        self.sync = self.local_network.sync_from(global_network)
        self.sync_discriminator = self.local_discriminator.sync_from(global_discriminator)

        self.env = None
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf

    def _local_var_name(self, var):
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'

    def _anneal_learning_rate(self, global_time_step):
        time_step_to_go = max(self.max_global_time_step - global_time_step, 0.0)
        learning_rate = self.initial_learning_rate * time_step_to_go / self.max_global_time_step
        return learning_rate

    def choose_action(self, pi_values):
        # sample an action index proportionally to the policy probabilities
        values = []
        cumulative = 0.0
        for rate in pi_values:
            cumulative += rate
            values.append(cumulative)
        r = random.random() * cumulative
        for i in range(len(values)):
            if values[i] >= r:
                return i
        # fail safe
        return len(values) - 1

    def _record_score(self, sess, writer, summary_op, placeholders, values, global_t):
        feed_dict = {}
        for k in placeholders:
            feed_dict[placeholders[k]] = values[k]
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        if VERBOSE:
            print('writing to summary writer at time %d\n' % global_t)
        writer.add_summary(summary_str, global_t)
        # writer.flush()

    def process(self, sess, global_t, summary_writer, summary_op, summary_placeholders):
        if self.env is None:
            # lazy evaluation
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })
            self.env.reset()  # reset the environment for each thread

        # buffers for states, actions, targets, etc.
        states = []
        actions = []
        rewards = []
        values = []
        targets = []
        dones = []

        terminal_end = False  # the terminal flag is false at the start

        # reset accumulated gradients at the start of each update
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        # t_max times loop: each thread runs at most LOCAL_T_MAX steps, then does a gradient update
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.env.s_t, self.env.target, self.scopes)
            # pi_Old, value_Old = self.local_network.run_policy_and_value_old(sess, self.env.s_t, self.env.target, self.scopes)
            action = self.choose_action(pi_)

            states.append(self.env.s_t)
            actions.append(action)
            values.append(value_)
            targets.append(self.env.target)

            if VERBOSE and (self.thread_index == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            # process game
            self.env.step(action)

            # receive game result
            reward = self.env.reward      # reward from the environment
            terminal = self.env.terminal  # whether the agent reached the terminal state

            # ad-hoc reward for navigation: 10 at the terminal state, -0.01 everywhere
            # else (collisions are not penalised separately)
            reward = 10.0 if terminal else -0.01
            if self.episode_length > 5e3:
                # do not let the agent run for more than 5000 steps; this cutoff does not
                # grant the +10 terminal reward, since the reward was already set above
                terminal = True

            self.episode_reward += reward
            self.episode_length += 1
            # track the maximum value estimate of the episode (starts at -inf)
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # clip reward: keep rewards in [-1, 1] even though the terminal reward is 10
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.env.update()

            if terminal:
                # reaching the terminal state always ends the rollout
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))
                terminal_end = True
                sys.stdout.write("time %d | thread #%d | scene %s | target #%s\n"
                                 "%s %s episode reward = %.3f\n"
                                 "%s %s episode length = %d\n"
                                 "%s %s episode max Q = %.3f\n" %
                                 (global_t, self.thread_index, self.scene_scope, self.task_scope,
                                  self.scene_scope, self.task_scope, self.episode_reward,
                                  self.scene_scope, self.task_scope, self.episode_length,
                                  self.scene_scope, self.task_scope, self.episode_max_q))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }
                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values, global_t)

                # after a terminal state, reset the episode statistics; the agent restarts
                # from a new position
                self.episode_reward = 0
                self.episode_length = 0
                self.episode_max_q = -np.inf
                self.env.reset()
                break

        '''
        Here the discriminator could be called to obtain the reward signal:
        R_D = sess.run(D.get_reward(state, action))
        '''
        # the bootstrapped return is zero at a terminal state; otherwise it is the value
        # of the next state
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.env.s_t, self.env.target, self.scopes)

        Returns = np.zeros_like(rewards)
        Advants = np.zeros_like(rewards)
        lastgaelam = 0
        LAMBDA = 0.9
        GAM = 0.9
        self.nsteps = len(rewards)

        # GAE(lambda) over the rollout; all parameters should already be assigned to the
        # current (new) policy at this point
        for t in reversed(range(self.nsteps)):
            if t == self.nsteps - 1:
                nextnonterminal = 1.0 - bool(R == 0)  # R == 0 means the agent reached the terminal state
                nextvalues = R
            else:
                nextnonterminal = 1.0 - bool(R == 0)
                nextvalues = values[t + 1]
            delta = rewards[t] + GAM * nextvalues * nextnonterminal - values[t]
            Advants[t] = lastgaelam = delta + GAM * LAMBDA * lastgaelam * nextnonterminal
            Returns[t] = Advants[t] + values[t]  # Returns = Advants + values

        Advants = (Advants - Advants.mean()) / (Advants.std() + 1e-5)
        # Returns = (Returns - Returns.mean()) / (Returns.std() + 1e-5)
        Returns = Returns.tolist()
        Advants = Advants.tolist()

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()
        Returns.reverse()
        Advants.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []
        batch_t = []
        batch_advant = []
        batch_Return = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi, ti, Re, Ad) in zip(actions, rewards, states, values,
                                                targets, Returns, Advants):
            R = ri + GAMMA * R  # accumulate the discounted return
            td = R - Vi         # advantage estimate
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1           # one-hot action

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
            batch_t.append(ti)
            batch_advant.append(Ad)
            batch_Return.append(Re)

        sess.run(self.old_new_sync)
        cur_learning_rate = self._anneal_learning_rate(global_t)

        # several epochs over the same batch; the update uses (state, action, advantage,
        # return, reward) tuples, so the ordering inside the batch does not matter
        for i in range(3):
            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.t: batch_t,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.local_network.Returns: batch_Return,
                         self.local_network.Advantages: batch_advant
                     })
            sess.run(self.apply_gradients_local,
                     feed_dict={self.learning_rate_input: cur_learning_rate})

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("Local timestep %d\n" % self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
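# clip_weights() on the global discriminator is assumed to implement the usual WGAN
# critic constraint: clamp every discriminator variable into a small interval after each
# update. A minimal TensorFlow 1.x sketch under that assumption (the function name and
# the 0.01 bound are illustrative, not taken from this codebase):
import tensorflow as tf

def clip_discriminator_weights(var_list, clip_value=0.01):
    """Return a grouped op that clamps each variable in var_list to [-clip_value, clip_value]."""
    clip_ops = [tf.assign(v, tf.clip_by_value(v, -clip_value, clip_value))
                for v in var_list]
    return tf.group(*clip_ops)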