def test(self, render=False):
    # re-initialize game for evaluation
    episode_buffer = []
    self.game_state.reset(random_restart=False, terminate_loss_of_life=False)
    observation = self._reset(testing=True)
    episode_buffer.append(self.game_state.screen_buffer)

    max_steps = self.eval_max_steps
    total_reward = 0.0
    total_steps = 0
    sub_total_reward = 0.0
    sub_steps = 0
    n_episodes = 0
    time.sleep(0.5)

    while max_steps > 0:
        self._update_state_input(observation)
        readout_t = self.net.evaluate(self.state_input)[0]
        action = get_action_index(readout_t,
                                  is_random=(random.random() <= 0.05),
                                  n_actions=self.game_state.n_actions)
        observation, reward, terminal = self.game_state.step(action, render=render)
        if n_episodes == 0:
            episode_buffer.append(observation)
        observation = process_frame(observation, self.resized_h, self.resized_w)
        sub_total_reward += reward
        sub_steps += 1
        max_steps -= 1

        if terminal:
            if n_episodes == 0:
                time_per_step = 0.05
                images = np.array(episode_buffer)
                make_gif(
                    images,
                    self.folder + '/frames/image{ep:010d}.gif'.format(
                        ep=(self.t - self.observe)),
                    duration=len(images) * time_per_step,
                    true_image=True, salience=False)
                episode_buffer = []
            n_episodes += 1
            print("\tTRIAL", n_episodes, "/ REWARD", sub_total_reward,
                  "/ STEPS", sub_steps, "/ TOTAL STEPS", total_steps)
            self.game_state.reset(random_restart=True,
                                  terminate_loss_of_life=False)
            observation = self._reset(testing=True)
            total_reward += sub_total_reward
            total_steps += sub_steps
            sub_total_reward = 0.0
            sub_steps = 0
            time.sleep(0.5)

    # (timestep, total sum of rewards, total # of steps before terminating)
    total_reward = total_reward / max(1, n_episodes)
    total_steps = total_steps / max(1, n_episodes)
    total_reward = round(total_reward, 4)
    self.rewards['eval'].append(
        ((self.t - self.observe), total_reward, total_steps))
    return total_reward, total_steps, n_episodes
def get_meta_state(self, s, g):
    """Compute the 4-channel meta-state from the meta-action (goal / option).

    Parameters
    ==========
    s: the raw state
    g: the goal mask
    """
    return np.dstack([process_frame(s), g])  # stack state and goal
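# A minimal sketch (not part of the original code) of how get_meta_state
# composes its output, assuming process_frame(s) returns a single-channel
# (H, W) frame and the goal mask g has shape (H, W, 3); np.dstack then yields
# the 4-channel meta-state the docstring refers to. All shapes are hypothetical.
import numpy as np

def _meta_state_shape_demo(h=84, w=84):
    frame = np.zeros((h, w), dtype=np.float32)          # stand-in for process_frame(s)
    goal_mask = np.zeros((h, w, 3), dtype=np.float32)   # stand-in for g
    meta_state = np.dstack([frame, goal_mask])          # stack state and goal
    return meta_state.shape                             # -> (84, 84, 4)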
def _reset(self):
    self.state_input.fill(0)
    observation, r_0, terminal = self.game_state.step(0, render=True)
    observation = process_frame(observation, self.resized_h, self.resized_w)
    for _ in range(self.phi_length - 1):
        empty_img = np.zeros((self.resized_w, self.resized_h), dtype=np.uint8)
        self.D.add_sample(empty_img, 0, 0, 0)
    return observation
def rolloutPCL(self, sess, initial_state, rnn_state_init, max_path_length=None,
               episode_count=1):
    # TODO: do not loop over "episode_count" but perform only one sess.run per step
    # Perform rollout of the given environment
    if max_path_length is None:
        max_path_length = self.env.envs[0].spec.tags.get(
            'wrapper_config.TimeLimit.max_episode_steps')

    # Reset rnn_state for every iteration
    s = initial_state
    rnn_state = rnn_state_init
    path_length = np.zeros(len(self.env))

    # Sample one episode
    while np.all(path_length < max_path_length) and not self.env.all_done():
        dummy_lengths = np.ones(len(self.env))
        a, v, rnn_state, _ = self.act(s, rnn_state, dummy_lengths, sess)

        # Get action for every environment
        act_ = [np.argmax(a_) for a_ in a]

        # Sample new state and reward from environment
        s2, r, terminal, info = self.env.step(act_)
        if self.preprocessing_state:
            s2 = U.process_frame(s2, self.preprocessing_config)

        # Add states, rewards, actions, values and terminal information to the
        # PCL episode batch
        self.add_to_batch(s, r, a, v, terminal)

        for i in range(len(self.env)):
            if not self.env.dones[i]:
                path_length[i] = path_length[i] + 1
        s = s2

    episodes = []
    for i in range(len(self.env)):
        path_length_temp = int(path_length[i]) + 1
        episodes.append(
            dict(states=np.expand_dims(
                     self.episode_states_train[i][:path_length_temp], 0),
                 actions=np.expand_dims(
                     self.episode_actions_train[i][:path_length_temp], 0),
                 rewards=np.expand_dims(
                     self.episode_reward_train[i][:path_length_temp], 0),
                 values=np.expand_dims(
                     self.episode_values_train[i][:path_length_temp], 0),
                 path_length=path_length_temp))
    return episodes
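# A minimal sketch, assuming the episode dicts returned by rolloutPCL above:
# given one episode's rewards of shape (1, T), this computes the discounted
# return at every step. The helper name and the gamma value are illustrative,
# not taken from the original code.
import numpy as np

def discounted_returns(episode, gamma=0.99):
    rewards = episode['rewards'][0]                  # shape (T,)
    returns = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):          # accumulate from the last step backwards
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns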
def step(self, m_a):
    """Take a step in this meta-environment.

    A single meta-step may involve many steps in the underlying environment.

    Parameters
    ==========
    m_a: an action of the meta-agent, which is also a goal of this sub-agent.
         Currently this is an input to the get_mask() function.
    """
    if self.sess is None:
        # cannot init before the sess exists
        self.sess = tf.get_default_session()
        self.summary_writer.add_graph(self.sess.graph)

    self.sess.run(self.agent.update_local_ops)
    episode_buffer = []
    episode_values = []
    episode_frames = []
    episode_reward = 0
    episode_step_count = 0
    d = False
    i_r = 0
    m_r = 0

    s = self.get_last_obs()  # the meta-agent is responsible for resetting
    s = process_frame(s)
    self.subgoal.set_meta_action(m_a)
    s = self.subgoal.augment_obs(s)
    episode_frames.append((self.subgoal.visualize(s),
                           ['i_r = 0', 'm_r = 0', 'step = 0']))
    self.agent.start_trial()

    while d == False:
        # Take an action using probabilities from the policy network output.
        a, v = self.agent.sample_av(s, self.sess, i_r)
        s1, f, m_d = self.env.step(a)
        self.last_obs = s1.copy()
        s1 = process_frame(s1)
        s1 = self.subgoal.augment_obs(s1)

        # ARA - todo: make into internal critic or provide an env. wrapper
        i_r, m_r_step, i_d = self.subgoal.intrinsic_reward(s, a, s1, f, m_d)
        m_r += m_r_step
        # if self.flags['verbose']:
        #     print('i_r: ' + str(i_r))
        d = m_d or i_d or episode_step_count == self.max_ep_len - 1

        data = ['i_r = ' + str(i_r),
                'm_r = ' + str(m_r_step),
                'd = ' + str(d),
                'step = ' + str(episode_step_count),
                'a = ' + str(np.round(a, 2)),
                'm_a = ' + str(np.round(m_a, 2)),
                'v = ' + str(v[0, 0])]
        episode_frames.append((self.subgoal.visualize(s1), data))
        episode_buffer.append([s, a, i_r, s1, d, v[0, 0]])
        episode_values.append(v[0, 0])
        episode_reward += i_r
        s = s1
        episode_step_count += 1
        self.total_step_count += 1

    self.episode_count += 1
    if self.flags['verbose']:
        print('\ttotal intrinsic episode reward: ' + str(episode_reward))
        print('\tsubagent length: ' + str(episode_step_count))

    # Update the network using the experience buffer at the end of the episode.
    if len(episode_buffer) != 0 and self.flags['train']:
        v_l, p_l, e_l, g_n, v_n = self.agent.train(episode_buffer, self.sess,
                                                   self.gamma, self.lam, 0.0)
        if self.episode_count % 50 == 0:
            global_ep_count = self.sess.run(self.global_episodes)
            data = {'Perf/Intrinsic Reward': episode_reward,
                    'Perf/Length': episode_step_count,
                    'Perf/Value': np.mean(episode_values),
                    'Perf/Total Step Count': self.total_step_count,
                    'Perf/Global Ep Count': global_ep_count,
                    'Losses/Value Loss': v_l,
                    'Losses/Policy Loss': p_l,
                    'Losses/Entropy': e_l,
                    'Losses/Grad Norm': g_n,
                    'Losses/Var Norm': v_n}
            self.agent.write_summary(data, self.episode_count)

    # ARA - todo: check if max meta-episodes is reached in the meta-agent;
    # only send a done (m_d) signal if the inner env. needs resetting.
    self.frames = episode_frames
    return self.last_obs, m_r, m_d
def evaluate(self, sess, n=0):
    episode_count = sess.run(self.global_episodes)
    s = self.env.reset()
    self.reset_agent()
    self.start_trial()
    step = 0
    s = process_frame(s)
    d = False
    r = 0
    episode_r = 0
    is_meta = hasattr(self.env, 'flags')
    if is_meta:
        self.env.flags['train'] = False
        self.env.flags['verbose'] = True
    printing = True
    frames = []

    while d == False and step < self.max_ep:
        a, v = self.sample_av(s, sess, r)
        s1, r, d = self.env.step(a)
        episode_r += r
        s = process_frame(s1)
        step += 1
        if is_meta:
            frames += self.env.get_frames()
        else:
            data = ['r = ' + str(r),
                    'd = ' + str(d),
                    'v = ' + str(v),
                    'a = ' + str(a),
                    'step = ' + str(step),
                    'cum_r = ' + str(episode_r)]
            frames.append((s1, data))

    print('episode reward: ' + str(episode_r))
    if not printing:
        return

    fig = plt.figure()
    f, d = frames[0]
    lf_sp = fig.add_subplot(121)
    l = plt.imshow(f)
    data_plot = fig.add_subplot(122)
    plt.imshow(np.ones(f.shape))
    plt.axis('off')

    FFMpegWriter = manimation.writers['ffmpeg']
    metadata = dict(title='Episode ' + str(n), artist='Matplotlib',
                    comment='Movie support!')
    writer = FFMpegWriter(fps=15, metadata=metadata)
    movie_path = self.movie_path + "episode_" + str(n) + ".mp4"
    with writer.saving(fig, movie_path, 100):
        for f, data in frames:
            l.set_data(f)
            data_plot.cla()
            data_plot.axis('off')
            h = 3
            for text in data:
                data_plot.text(1, h, text)
                h += 8
            writer.grab_frame()
    plt.close()

    if is_meta:
        self.env.flags['train'] = True
        self.env.flags['verbose'] = False
def run(self):
    # get the first state by doing nothing and preprocess the image to 80x80x4
    observation = self._reset()
    self.t, self.epsilon, self.rewards = self._load()

    # only executed at the very beginning of training and never again
    if self.t == 0 and self.train_with_demo_steps > 0:
        self.train_with_demo_memory_only()

    # set start time
    self.start_time = time.time() - self.wall_t
    print("D size: ", self.D.size)

    total_reward = 0.0
    sub_steps = 0
    while (self.t - self.observe) < self.train_max_steps:
        # Evaluation of policy
        if (self.t - self.observe) >= 0 and \
                (self.t - self.observe) % self.eval_freq == 0:
            terminal = 0
            total_reward, total_steps, n_episodes = self.test()
            self.net.add_accuracy(total_reward, total_steps, n_episodes,
                                  (self.t - self.observe))
            print("TIMESTEP", (self.t - self.observe),
                  "/ AVE REWARD", total_reward,
                  "/ AVE TOTAL STEPS", total_steps,
                  "/ # EPISODES", n_episodes)
            # re-initialize game for training
            self.game_state.reset(random_restart=True)
            observation = self._reset()
            sub_steps = 0
            time.sleep(0.5)

        # choose an action epsilon greedily
        self._update_state_input(observation)
        readout_t = self.net.evaluate(self.state_input)[0]
        action = get_action_index(
            readout_t,
            is_random=(random.random() <= self.epsilon or self.t <= self.observe),
            n_actions=self.game_state.n_actions)

        # scale down epsilon
        if self.epsilon > self.final_epsilon and self.t > self.observe:
            self.epsilon -= (self.init_epsilon - self.final_epsilon) / self.explore

        ##### HUMAN ADVICE OVERRIDE ACTION #####
        if self.use_human_advice and self.psi > self.final_epsilon:
            use_advice = False
            # After n exploration steps, decay psi
            if (self.t - self.observe) >= self.explore:
                self.psi *= self.init_psi
            if random.random() > self.final_epsilon:
                psi_cond = True if self.psi == self.init_psi else (
                    self.psi > random.random())
                if psi_cond:
                    action_advice = self.human_net.evaluate(self.state_input)[0]
                    action_human = np.argmax(action_advice)
                    if action_advice[action_human] >= self.confidence:
                        action = action_human
                        use_advice = True
        ##### HUMAN ADVICE OVERRIDE ACTION #####

        # Training
        # run the selected action and observe the next state and reward
        next_observation, reward, terminal = self.game_state.step(
            action, random_restart=True)
        next_observation = process_frame(next_observation, self.resized_h,
                                         self.resized_w)
        terminal_ = terminal or ((self.t + 1 - self.observe) >= 0 and
                                 (self.t + 1 - self.observe) % self.eval_freq == 0)

        # store the transition in D
        self.D.add_sample(observation, action, reward, (1 if terminal_ else 0))

        # only train if done observing
        if self.t > self.observe and self.t % self.update_freq == 0:
            s_j_batch, a_batch, r_batch, s_j1_batch, terminals = \
                self.D.random_batch(self.batch)
            # perform gradient step
            summary = self.net.train(s_j_batch, a_batch, r_batch, s_j1_batch,
                                     terminals)
            self.net.add_summary(summary, self.t - self.observe)

        self.rewards['train'].append(round(reward, 4))

        # update the old values
        sub_steps += 1
        self.t += 1
        observation = next_observation

        if terminal:
            observation = self._reset()
            sub_steps = 0

        # save progress every SAVE_FREQ iterations
        if (self.t - self.observe) % self.save_freq == 0:
            self.net.save(self.t)
            data = {
                'D.width': self.D.width,
                'D.height': self.D.height,
                'D.max_steps': self.D.max_steps,
                'D.phi_length': self.D.phi_length,
                'D.num_actions': self.D.num_actions,
                'D.actions': self.D.actions,
                'D.rewards': self.D.rewards,
                'D.terminal': self.D.terminal,
                'D.bottom': self.D.bottom,
                'D.top': self.D.top,
                'D.size': self.D.size,
                'epsilon': self.epsilon,
                't': self.t
            }
            print(colored('Saving data...', 'blue'))
            pickle.dump(data,
                        open(self.folder + '/' + self.name + '-dqn.pkl', 'wb'),
                        pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.rewards,
                        open(self.folder + '/' + self.name + '-dqn-rewards.pkl',
                             'wb'),
                        pickle.HIGHEST_PROTOCOL)
            print(colored('Successfully saved data!', 'green'))

            print(colored('Compressing and saving replay memory...', 'blue'))
            save_compressed_images(
                self.folder + '/' + self.name + '-dqn-images.h5', self.D.imgs)
            print(colored('Compressed and saved replay memory', 'green'))

            # write wall time
            self.wall_t = time.time() - self.start_time
            print('Total time: {} seconds'.format(self.wall_t))

        # print info
        state = ""
        if self.t <= self.observe:
            state = "observe"
        elif self.t > self.observe and self.t <= self.observe + self.explore:
            state = "explore"
        else:
            state = "train"

        if self.t % 1000 == 0:
            if self.use_human_advice:
                print("T:", self.t, "/ STATE", state,
                      "/ EPSILON", round(self.epsilon, 4),
                      "/ PSI", round(self.psi, 4),
                      "/ ADVICE", use_advice,
                      "/ ACTION", action,
                      "/ REWARD", reward,
                      "/ Q_MAX %e" % np.max(readout_t))
            else:
                print("T:", self.t, "/ STATE", state,
                      "/ EPSILON", round(self.epsilon, 4),
                      "/ ACTION", action,
                      "/ REWARD", reward,
                      "/ Q_MAX %e" % np.max(readout_t))
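# A minimal sketch (hypothetical defaults, not values from the original
# configuration) of the linear epsilon annealing used in run() above: epsilon
# is decreased by (init_epsilon - final_epsilon) / explore on every step after
# the observation phase, so it reaches final_epsilon after `explore` steps.
def annealed_epsilon(t, observe=50000, explore=1000000,
                     init_epsilon=1.0, final_epsilon=0.1):
    if t <= observe:
        return init_epsilon
    steps_into_explore = min(t - observe, explore)
    return init_epsilon - (init_epsilon - final_epsilon) * steps_into_explore / explore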
def run(self, minutes_limit=5, demo_type=0, model_net=None):
    imgs = []
    acts = []
    rews = []
    terms = []
    rewards = {'train': [], 'eval': []}

    # regular game
    start_time = datetime.now()
    timeout_start = time.time()
    timeout = 60 * minutes_limit
    t = 0
    terminal = False
    terminal_force = False
    is_reset = True
    total_reward = 0.0
    score1 = score2 = 0
    sub_t = 0
    sub_r = 0.
    rewards = []
    sub_steps = []
    total_episodes = 0

    # re-initialize game for evaluation
    self.game_state.reset(
        render=True, random_restart=True,
        terminate_loss_of_life=self.terminate_loss_of_life)
    observation = self._reset()

    while True:
        if demo_type == 1:  # RANDOM AGENT
            action = np.random.randint(self.game_state.n_actions)
        elif demo_type == 2:  # MODEL AGENT
            if sub_t % self._skip == 0:
                self._update_state_input(observation)
                readout_t = model_net.evaluate(self.state_input)[0]
                action = get_action_index(readout_t, is_random=False,
                                          n_actions=self.game_state.n_actions)
        else:  # HUMAN
            action = self.game_state.human_agent_action

        next_observation, reward, terminal = self.game_state.step(
            action, render=True, random_restart=True)
        next_observation = process_frame(next_observation, self.resized_h,
                                         self.resized_w)
        terminal = True if terminal or (
            time.time() > timeout_start + timeout) else False

        # store the transition in D
        # when using frameskip=1, store only every fourth step
        if sub_t % self._skip == 0:
            self.D.add_sample(observation, action, reward, terminal)

        observation = next_observation
        sub_r += reward
        total_reward += reward
        # time.sleep(0.0166666)
        sub_t += 1
        t += 1

        # Ensure that D does not reach max memory, which mitigates
        # problems when combining different human demo files
        if (self.D.size + 3) == self.D.max_steps:
            terminal_force = True
            terminal = True

        if terminal:
            total_episodes += 1
            rewards.append(sub_r)
            sub_steps.append(sub_t)
            sub_r = 0.
            sub_t = 0
            self.game_state.reset(
                render=True, random_restart=True,
                terminate_loss_of_life=self.terminate_loss_of_life)
            observation = self._reset()
            is_reset = True
            time.sleep(0.5)

        if terminal_force or time.time() > timeout_start + timeout:
            break

    if demo_type == 0:  # HUMAN
        self.game_state.stop_thread = True

    print("Duration: {}".format(datetime.now() - start_time))
    print("Total # of episodes: {}".format(total_episodes))
    print("Mean steps: {} / Mean reward: {}".format(
        t / total_episodes, total_reward / total_episodes))
    print("\tsteps / episode:", sub_steps)
    print("\treward / episode:", rewards)
    print("Total Replay memory saved: {}".format(self.D.size))

    # Resize replay memory to exact memory size
    self.D.resize()
    data = {
        'D.width': self.D.width,
        'D.height': self.D.height,
        'D.max_steps': self.D.max_steps,
        'D.phi_length': self.D.phi_length,
        'D.num_actions': self.D.num_actions,
        'D.actions': self.D.actions,
        'D.rewards': self.D.rewards,
        'D.terminal': self.D.terminal,
        'D.bottom': self.D.bottom,
        'D.top': self.D.top,
        'D.size': self.D.size
    }
    images = self.D.imgs
    pkl_file = '{name}-dqn.pkl'.format(name=self.name)
    h5_file = '{name}-dqn-images.h5'.format(name=self.name)
    pickle.dump(data, open(self.folder + pkl_file, 'wb'),
                pickle.HIGHEST_PROTOCOL)
    print(colored('Compressing and saving replay memory...', 'blue'))
    save_compressed_images(self.folder + h5_file, images)
    print(colored('Compressed and saved replay memory', 'green'))
def work(self, sess, coord, saver):
    gamma = self.gamma
    lam = self.lam
    t0 = time.time()
    episode_count = sess.run(self.global_episodes)
    total_steps = 0
    print("Starting worker " + str(self.name))
    with sess.as_default(), sess.graph.as_default():
        while not coord.should_stop():
            sess.run(self.update_local_ops)
            episode_buffer = []
            episode_values = []
            episode_frames = []
            episode_reward = 0
            episode_step_count = 0
            action_mag = []
            d = False
            r = 0

            s = self.env.reset()
            self.reset_agent()
            episode_frames.append((s, ['']))
            s = process_frame(s)
            self.start_trial()

            while d == False:
                # Take an action using probabilities from the policy
                # network output.
                a, v = self.sample_av(s, sess, r)
                s1, r, d = self.env.step(a)
                # if episode_count == 50:
                #     coord.request_stop()
                s1 = process_frame(s1)
                if episode_step_count == self.max_ep - 1:
                    d = True

                data = ['r = ' + str(r), 'd = ' + str(d), 'a = ' + str(a)]
                episode_frames.append((s1, data))
                episode_buffer.append([s, a, r, s1, d, v[0, 0]])
                episode_values.append(v[0, 0])
                episode_reward += r
                s = s1
                total_steps += 1
                episode_step_count += 1

                # If the episode hasn't ended, but the experience buffer is
                # full, then we make an update step using that experience
                # rollout.
                if len(episode_buffer) == self.update_ival and d != True and \
                        episode_step_count != self.max_ep - 1:
                    # Since we don't know what the true final return is,
                    # we "bootstrap" from our current value estimation.
                    v1 = sess.run(self.local_AC.value,
                                  feed_dict={
                                      self.local_AC.inputs: [s],
                                      self.local_AC.prev_actions: [a],
                                      self.local_AC.prev_rewards: [[r]],
                                      self.local_AC.is_training_ph: False,
                                      self.local_AC.state_in[0]: self.rnn_state[0],
                                      self.local_AC.state_in[1]: self.rnn_state[1]
                                  })[0, 0]
                    v_l, p_l, e_l, g_n, v_n = self.train(
                        episode_buffer, sess, gamma, lam, v1)
                    episode_buffer = []
                    sess.run(self.update_local_ops)

            self.episode_rewards.append(episode_reward)
            self.episode_lengths.append(episode_step_count)
            self.episode_mean_values.append(np.mean(episode_values))

            # Update the network using the experience buffer at the end of
            # the episode.
            if len(episode_buffer) != 0:
                v_l, p_l, e_l, g_n, v_n = self.train(
                    episode_buffer, sess, gamma, lam, 0.0)

            # Periodically save model parameters and summary statistics.
            if episode_count % 100 == 0 and self.is_writer:
                saver.save(sess, self.model_path + '/model-' +
                           str(episode_count) + '.cptk')
                s_dt = str(timedelta(seconds=time.time() - t0))
                self.evaluate(sess, episode_count)
                print("Saved Model " + str(episode_count) + '\tat time ' + s_dt)

            if episode_count % 5 == 0 and episode_count != 0:
                data = {
                    'Perf/Reward': episode_reward,
                    'Perf/Length': episode_step_count,
                    'Perf/Value': np.mean(episode_values),
                    'Losses/Value Loss': v_l,
                    'Losses/Policy Loss': p_l,
                    'Losses/Entropy': e_l,
                    'Losses/Grad Norm': g_n,
                    'Losses/Var Norm': v_n
                }
                self.write_summary(data, episode_count)

            if self.is_writer:
                sess.run(self.increment)
            episode_count += 1
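# A minimal sketch of the kind of targets train() above is commonly fed:
# bootstrapped discounted returns and GAE(lambda) advantages computed from the
# rollout buffer, with the bootstrap value v1 standing in for the unknown tail
# of the episode. This is an assumption about the training target, not the
# actual implementation of self.train().
import numpy as np

def returns_and_advantages(rewards, values, bootstrap_value, gamma, lam):
    rewards = np.asarray(rewards, dtype=np.float64)
    values = np.append(np.asarray(values, dtype=np.float64), bootstrap_value)
    returns = np.zeros_like(rewards)
    advantages = np.zeros_like(rewards)
    running_return = bootstrap_value
    running_adv = 0.0
    for t in reversed(range(len(rewards))):
        # discounted return, bootstrapped from the value estimate of the last state
        running_return = rewards[t] + gamma * running_return
        returns[t] = running_return
        # generalized advantage estimate
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        running_adv = delta + gamma * lam * running_adv
        advantages[t] = running_adv
    return returns, advantages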
def main(job, task, worker_num, ps_num, initport, ps_hosts, worker_hosts):
    PS_HOST = ps_hosts.split(",")
    WORKER_HOSTS = worker_hosts.split(",")
    INITPORT = initport
    CLUSTER = dict()
    """
    workers = []
    ps_ = []
    for i in range(ps_num):
        ps_.append('localhost:{}'.format(INITPORT + i))
    for i in range(worker_num):
        workers.append("localhost:{}".format(i + ps_num + INITPORT))
    CLUSTER['worker'] = workers
    CLUSTER['ps'] = ps_
    """

    # Infer the number of workers and ps servers
    cluster = tf.train.ClusterSpec({"ps": PS_HOST, "worker": WORKER_HOSTS})
    print({"ps": PS_HOST, "worker": WORKER_HOSTS})
    num_ps, num_workers = len(PS_HOST), len(WORKER_HOSTS)

    # Get the Cluster Spec
    # cluster = tf.train.ClusterSpec(CLUSTER)

    # Get the current server element
    TASK_ID = task
    JOB = job
    server = tf.train.Server(cluster, job_name=JOB, task_index=TASK_ID)

    # Check if we have a worker or ps node running
    if JOB == 'ps':
        server.join()
    else:
        # Get all required parameters

        # Running parameters
        TOTAL_GLOBAL_EPISODES = 100000

        # Gym environment
        ENV_NAME = 'SpaceInvaders-v0'  # MsPacman CartPole
        NUM_ENVS = 3
        PREPROCESSING = True
        IMAGE_SIZE_PREPROCESSED = 80
        PREPROCESSING_CONFIG = [
            {
                "type": "image_resize",
                "width": IMAGE_SIZE_PREPROCESSED,
                "height": IMAGE_SIZE_PREPROCESSED
            },
            {
                "type": "grayscale"
            }
            # {
            #     "type": "sequence",  # TO-DO: sequence not supported
            #     "length": 2
            # }
        ]

        # Get env parameters
        gw = GymWrapper(ENV_NAME)
        ACTION_DIM = gw.act_space.n
        if PREPROCESSING:
            STATE_DIM = IMAGE_SIZE_PREPROCESSED * IMAGE_SIZE_PREPROCESSED
            types_of_preprocess = []
            for operation in PREPROCESSING_CONFIG:
                types_of_preprocess.append(operation['type'])
                if operation['type'] == "sequence":
                    length_sequence = operation['length']
            print("Do following preprocessing steps: {0}".format(
                types_of_preprocess))
        else:
            PREPROCESSING_CONFIG = None
            STATE_DIM = gw.obs_space.shape[0]

        # Network configuration
        network_config = dict(
            shared=True,
            shared_config=dict(kind=["CNN"],
                               cnn_input_size=IMAGE_SIZE_PREPROCESSED,
                               cnn_output_size=256,
                               lstm_cell_units=16),
            policy_config=dict(layers=[ACTION_DIM], noise_dist=None),
            value_config=dict(layers=[1], noise_dist=None))

        # Learning rate
        LEARNING_RATE = 0.01
        UPDATE_LEARNING_RATE = False

        # Discount rate for advantage estimation and reward discounting
        GAMMA = 0.99

        # Summary LOGDIR
        # LOG_DIR = '~/A3C/MyDistTest/'
        LOG_DIR = os.getcwd() + '_tensorflowlogs'
        LOG_DIR_CHECKPOINT = os.getcwd() + "_modelcheckpoints"

        # Print latest checkpoint
        checkpoint_sync = True

        # Choose RL method (A3C, PCL)
        METHOD = "A3C"
        print("Run method: " + METHOD)

        # PCL variables
        TAU = 0.2
        ROLLOUT = 5

        # Define the global network and get the relevant worker_device
        worker_device = '/job:worker/task:{}/cpu:0'.format(TASK_ID)
        with tf.device(
                tf.train.replica_device_setter(
                    cluster=cluster,              # makes sure global variables defined
                    worker_device=worker_device,  # in this context are synced across processes
                    ps_strategy=U.greedy_ps_strategy(ps_tasks=num_ps))):
            global_episodes = tf.train.get_or_create_global_step()
            # Generate global network
            master_network = AC_Network(STATE_DIM, ACTION_DIM, 'global',
                                        network_config, learning_rate=None,
                                        tau=TAU, rollout=ROLLOUT, method=METHOD)

        with tf.device(worker_device):
            worker = Worker(TASK_ID, STATE_DIM, ACTION_DIM, network_config,
                            LEARNING_RATE, global_episodes, ENV_NAME,
                            number_envs=NUM_ENVS, tau=TAU, rollout=ROLLOUT,
                            method=METHOD,
                            update_learning_rate_=UPDATE_LEARNING_RATE,
                            preprocessing_config=PREPROCESSING_CONFIG)

        # Get summary information
        if worker.name == "worker_0":
            merged_summary = tf.summary.merge_all()
            writer = tf.summary.FileWriter(LOG_DIR,
                                           graph=tf.get_default_graph())
        else:
            merged_summary = None

        local_init_op = tf.global_variables_initializer()
        with tf.Session(server.target) as sess:
            sess.run(local_init_op)

        # Setup monitoring
        is_chief = (TASK_ID == 0)

        # Setup hooks required to coordinate training
        stopHook = tf.train.StopAtStepHook(num_steps=TOTAL_GLOBAL_EPISODES)
        saver = tf.train.Saver(max_to_keep=3,
                               var_list=tf.get_collection(
                                   tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope='global'))
        saverHook = tf.train.CheckpointSaverHook(
            checkpoint_dir=LOG_DIR_CHECKPOINT,
            save_steps=200,
            checkpoint_basename=worker.method,
            saver=saver)

        # Start Training
        with tf.train.MonitoredTrainingSession(master=server.target,
                                               is_chief=is_chief,
                                               chief_only_hooks=[saverHook],
                                               hooks=[stopHook]) as sess:
            # Reload global model from chief
            if is_chief:
                try:
                    saver.restore(
                        sess,
                        tf.train.latest_checkpoint(LOG_DIR_CHECKPOINT,
                                                   latest_filename=None))
                except ValueError:
                    print("No Model Checkpoint available")

            # If a checkpoint is loaded, make sure all workers start after
            # the variables have been reloaded in order to avoid "bad" updates
            if checkpoint_sync:
                while sess.run(worker.global_episodes) == 0:
                    print(worker.name +
                          " Waiting for Sync of Checkpoint loaded by worker_0")
                    time.sleep(1)

            # Update from global
            sess.run(worker.update_local_ops)

            # Inputs to worker.work(gamma, sess, coord, merged_summary,
            # writer_summary)
            gamma = GAMMA
            MINI_BATCH = 40
            REWARD_FACTOR = 0.001
            EPISODE_RUNS = 1000
            episode_count = 0
            total_steps = 0
            train_steps = 0
            print("Starting worker " + str(TASK_ID))

            while not sess.should_stop():
                worker.episode_values = []
                worker.episode_reward = []

                # Objects to hold the batch used to update the agent
                worker.reset_batch()

                # Used by PCL: hold reward and value function mean value of
                # episodes sampled from the replay buffer
                episode_reward_offline = 0
                episode_value_offline = 0
                episode_step_count = 0

                # Restart environment
                s = worker.env.reset()
                if worker.preprocessing_state:
                    s = U.process_frame(s, worker.preprocessing_config)

                if worker.rnn_network:
                    # Set initial rnn state based on number of episodes
                    c_init = np.zeros(
                        (len(worker.env), worker.local_AC.cell_units),
                        np.float32)
                    h_init = np.zeros(
                        (len(worker.env), worker.local_AC.cell_units),
                        np.float32)
                    rnn_state = np.array([c_init, h_init])
                else:
                    rnn_state = None

                # Sample new noisy parameters in fully connected layers if
                # noisy net is used
                # if episode_count % 15 == 0:
                if worker.noisy_policy is not None or worker.noisy_value is not None:
                    sess.run(worker.local_AC.noisy_sampling)

                if worker.method == "PCL":
                    # Perform a rollout of the chosen environment
                    episodes = worker.rolloutPCL(sess, s, rnn_state,
                                                 max_path_length=1000,
                                                 episode_count=len(worker.env))

                    # Add sampled episodes to the replay buffer
                    worker.replay_buffer.add(episodes)

                    # Get rewards and value estimates of the current sample
                    _, _, r_ep, v_ep, _, _ = unpack_episode(episodes)
                    episode_values = np.mean(np.sum(v_ep, axis=1))
                    episode_reward = np.mean(np.sum(r_ep, axis=1))

                    # Train on online episode if applicable
                    train_online = False
                    train_offline = True

                    if train_online:
                        # Train PCL agent
                        _, _, summary = worker.train_pcl(episodes, gamma, sess,
                                                         merged_summary)

                        # Update summary information
                        train_steps = train_steps + 1
                        # if worker.name == "worker_0":
                        #     writer_summary.add_summary(summary, train_steps)

                    if train_offline:
                        # Sample len(envs) many episodes from the replay buffer
                        sampled_episodes = worker.replay_buffer.sample(
                            episode_count=len(worker.env))

                        # Train PCL agent
                        r_ep, v_ep, summary, logits = worker.train_pcl(
                            sampled_episodes, gamma, sess, merged_summary)

                        # Update global network
                        sess.run(worker.update_local_ops)

                        # Update learning rate based on calculated KL divergence
                        if worker.update_learning_rate_:
                            # Calculate KL divergence of the updated policy and
                            # the policy before the update
                            kl_divergence = worker.calculate_kl_divergence(
                                logits, sampled_episodes, sess)

                            # Perform learning rate update based on KL divergence
                            worker.update_learning_rate(kl_divergence, sess)

                        # Update summary information
                        train_steps = train_steps + 1
                        if worker.name == "worker_0":
                            writer.add_summary(summary, train_steps)

                        # Write additional summary information
                        episode_reward_offline = np.mean(np.sum(r_ep, axis=1))
                        episode_value_offline = np.mean(np.sum(v_ep, axis=1))

                elif worker.method == "A3C":
                    # Run an episode
                    while not worker.env.all_done():
                        # Get preferred action distribution
                        dummy_lengths = np.ones(len(worker.env))
                        a, v, rnn_state, _ = worker.act(s, rnn_state,
                                                        dummy_lengths, sess)

                        # Get action for every environment
                        act_ = [np.argmax(a_) for a_ in a]

                        # Sample new state and reward from environment
                        s2, r, terminal, info = worker.env.step(act_)
                        if worker.preprocessing_state:
                            s2 = U.process_frame(s2, worker.preprocessing_config)

                        # Add states, rewards, actions, values and terminal
                        # information to the A3C minibatch
                        worker.add_to_batch(s, r, a, v, terminal)

                        # Get episode information for tracking the training process
                        worker.episode_values.append(v)
                        worker.episode_reward.append(r)

                        # Train on mini batches from the episode
                        if (episode_step_count % MINI_BATCH == 0
                                and episode_step_count > 0) or worker.env.all_done():
                            feed_dict_ = {
                                worker.local_AC.inputs: s2,
                                worker.local_AC.lengths_episodes: dummy_lengths
                            }
                            if worker.rnn_network:
                                feed_dict_[worker.local_AC.state_in[0]] = rnn_state[0]
                                feed_dict_[worker.local_AC.state_in[1]] = rnn_state[1]
                            v1 = sess.run([worker.local_AC.value], feed_dict_)
                            v_l, p_l, e_l, g_n, v_n, summary, logits = worker.train(
                                worker.episode_states_train,
                                worker.episode_reward_train,
                                worker.episode_actions_train,
                                worker.episode_values_train,
                                worker.episode_done_train,
                                sess, gamma, np.squeeze(v1), merged_summary)

                            if worker.env.all_done():
                                # Update global network
                                sess.run(worker.update_local_ops)

                                # Update learning rate based on calculated KL divergence
                                if worker.update_learning_rate_:
                                    # Calculate KL divergence of the updated
                                    # policy and the policy before the update
                                    kl_divergence = worker.calculate_kl_divergence(
                                        logits, worker.episode_states_train,
                                        sess, worker.episode_done_train)

                                    # Perform learning rate update based on KL divergence
                                    if not np.isnan(kl_divergence):
                                        worker.update_learning_rate(
                                            kl_divergence, sess)

                            train_steps = train_steps + 1

                            # Update summary information
                            if worker.name == "worker_0":
                                writer.add_summary(summary, train_steps)

                            # Reset A3C minibatch after it has been used to
                            # update the model
                            worker.reset_batch()

                        # Set previous state for next step
                        s = s2
                        total_steps += 1
                        episode_step_count += 1

                    episode_values = np.mean(
                        np.sum(worker.episode_values, axis=0))
                    episode_reward = np.mean(
                        np.sum(worker.episode_reward, axis=0))

                if episode_count % 20 == 0:
                    print("Reward: " + str(episode_reward),
                          " | Episode", episode_count, " of " + worker.name,
                          " | Global Episode",
                          str(sess.run(worker.global_episodes)))
                    if worker.method == "PCL":
                        print("Reward Offline: " + str(episode_reward_offline),
                              " | Episode", episode_count, " of " + worker.name)

                worker.episode_rewards.append(episode_reward)
                worker.episode_lengths.append(episode_step_count)
                worker.episode_mean_values.append(episode_values)

                sess.run(worker.increment)  # next global episode
                episode_count += 1

            # Ask for all the services to stop.
            print("Worker stops because max episode runs are reached")
trainer = tf.train.AdamOptimizer(learning_rate=1e-5)
ac_net = AC_rnn_ra_Network(s_shape, a_size, 'global_0', None)
saver = tf.train.Saver(max_to_keep=5)

with tf.Session() as sess:
    print('Loading Model...')
    ckpt = tf.train.get_checkpoint_state(model_path)
    saver.restore(sess, ckpt.model_checkpoint_path)

    rnn_state = ac_net.state_init
    i = 0
    d = False
    r = 0
    a = np.array([0, 0])
    while i < max_episode_length and d == False:
        s_p = process_frame(s)
        # Take an action using probabilities from the policy network output.
        a, v, rnn_state = sess.run(
            [ac_net.sample_a, ac_net.value, ac_net.state_out],
            feed_dict={
                ac_net.inputs: [s_p],
                ac_net.prev_actions: [a],
                ac_net.prev_rewards: [[r]],
                ac_net.is_training_ph: False,
                ac_net.state_in[0]: rnn_state[0],
                ac_net.state_in[1]: rnn_state[1]
            })
        s, r, d = env_g.step(a)
        sarray.append(s)
        rarray.append(r)
        i += 1