def work_acer(self):
    b_states = [None]
    done = True
    step = 0
    print(self.name, " using ", self.offline_steps, "offline steps, per online step")
    while step < self.MAX_STEPS:
        # sync the target (average) network with the current parameters
        self.agent.update_target()
        # n-step rollout from the environment, with n = RETURN_STEPS or until done
        b_states, b_actions, b_rewards, b_mus, done = rollout(
            self.agent, self.env, [b_states[-1]], done, self.RETURN_STEPS)
        pi, q_a, val = self.agent.get_retrace_values(b_states[:-1], b_actions)
        # importance weights pi/mu, with a small epsilon to avoid division by zero
        importance_weights = np.divide(pi, np.add(b_mus, 1e-14))
        # gather the weight of the action actually taken at each step
        importance_weights_a = np.take(
            np.reshape(importance_weights, [-1]),
            (np.arange(importance_weights.shape[0]) * importance_weights.shape[1] + b_actions))
        # calculate Retrace targets
        retrace_targets = q_retrace(b_rewards, done, q_a, val,
                                    importance_weights_a, self.DISCOUNT)
        # update step, returns the summary (not used here) and the current global step
        _, step = self.agent.update_step(b_states[:-1], b_actions,
                                         retrace_targets, importance_weights)
        # append the trajectory to the replay buffer
        self.memory.remember((b_states, b_actions, b_rewards, b_mus, done))
        # offline (replay) updates: instead of a rollout, stored trajectories are sampled
        if self.offline_steps > 0 and self.memory.can_sample():
            for _ in range(self.offline_steps):
                mem_states, mem_actions, mem_rewards, mem_mus, mem_done = self.memory.sample_from_memory()
                pi, q_a, val = self.agent.get_retrace_values(mem_states[:-1], mem_actions)
                importance_weights = np.divide(pi, np.add(mem_mus, 1e-14))
                importance_weights_a = np.take(
                    np.reshape(importance_weights, [-1]),
                    (np.arange(importance_weights.shape[0]) * importance_weights.shape[1] + mem_actions))
                retrace_targets = q_retrace(mem_rewards, mem_done, q_a, val,
                                            importance_weights_a, self.DISCOUNT)
                _, step = self.agent.update_step(mem_states[:-1], mem_actions,
                                                 retrace_targets, importance_weights)
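
# q_retrace is called above but not defined in this section. The sketch below
# shows a minimal Retrace target computation in the spirit of ACER (Wang et al.,
# 2017), assuming rewards, q_a, values and rho_a are 1-D arrays of equal length,
# that a terminated rollout bootstraps with 0 and a truncated one bootstraps from
# the last computed state value, and that np is NumPy imported at module level.
# The name q_retrace_sketch is illustrative, not the repository's implementation.
def q_retrace_sketch(rewards, done, q_a, values, rho_a, discount):
    rho_bar = np.minimum(rho_a, 1.0)           # truncated importance weights
    q_ret = 0.0 if done else values[-1]        # bootstrap value
    targets = np.zeros_like(q_a)
    for t in reversed(range(len(rewards))):    # backward recursion over the rollout
        q_ret = rewards[t] + discount * q_ret
        targets[t] = q_ret
        # off-policy correction carried to the preceding step
        q_ret = rho_bar[t] * (q_ret - q_a[t]) + values[t]
    return targets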
def work_and_eval_acer(self, net_saver, TB_DIR, evalrewards=None):
    # avoid a mutable default argument; callers may still pass a list of
    # previous evaluation rewards to resume from
    evalrewards = [] if evalrewards is None else evalrewards
    b_states = [None]
    done = True
    step = 0
    runningreward = 1
    bestreward = 0
    rewardlist = []
    if evalrewards:
        runningreward = evalrewards[-1]
        print(runningreward)
    next_verbose = 0
    summary_writer = tf.summary.FileWriter(TB_DIR + "/tb", self.sess.graph, flush_secs=30)
    print(self.name, " using ", self.offline_steps, "offline steps, per online step")
    while step < self.MAX_STEPS:
        self.agent.update_target()
        # n-step rollout, seeded with the last state of the previous rollout
        b_states, b_actions, b_rewards, b_mus, done = rollout(
            self.agent, self.env, [b_states[-1]], done, self.RETURN_STEPS)
        pi, q_a, val = self.agent.get_retrace_values(b_states[:-1], b_actions)
        rewardlist.append(np.sum(b_rewards))
        importance_weights = np.divide(pi, np.add(b_mus, 1e-14))
        importance_weights_a = np.take(
            np.reshape(importance_weights, [-1]),
            (np.arange(importance_weights.shape[0]) * importance_weights.shape[1] + b_actions))
        retrace_targets = q_retrace(b_rewards, done, q_a, val,
                                    importance_weights_a, self.DISCOUNT)
        # "summary" avoids shadowing the built-in sum()
        summary, step = self.agent.update_step(b_states[:-1], b_actions,
                                               retrace_targets, importance_weights)
        self.memory.remember((b_states, b_actions, b_rewards, b_mus, done))
        if done:
            # episode finished: track best and exponentially smoothed episode return
            bestreward = np.maximum(bestreward, np.sum(rewardlist))
            runningreward = 0.9 * runningreward + 0.1 * np.sum(rewardlist)
            evalrewards.append(runningreward)
            np.savetxt(TB_DIR + "reward.out", evalrewards)
            rewardlist = []
            if step > next_verbose:
                print("Worker ", self.name, "At ", step, " Running/Max: ",
                      runningreward, bestreward, " Frames:", self.memory.counter)
                print("pi:", self.agent.get_pi(b_states[-1]))
                print("Saving Model")
                next_verbose += (self.MAX_STEPS / 100)
                net_saver.save(self.sess, TB_DIR + "checkpoints/model" + str(step) + ".cptk")
        if summary is not None:
            summary_writer.add_summary(summary, step)
        # offline (replay) updates
        if self.offline_steps > 0 and self.memory.can_sample():
            for _ in range(self.offline_steps):
                mem_states, mem_actions, mem_rewards, mem_mus, mem_done = self.memory.sample_from_memory()
                pi, q_a, val = self.agent.get_retrace_values(mem_states[:-1], mem_actions)
                importance_weights = np.divide(pi, np.add(mem_mus, 1e-14))
                importance_weights_a = np.take(
                    np.reshape(importance_weights, [-1]),
                    (np.arange(importance_weights.shape[0]) * importance_weights.shape[1] + mem_actions))
                retrace_targets = q_retrace(mem_rewards, mem_done, q_a, val,
                                            importance_weights_a, self.DISCOUNT)
                summary, step = self.agent.update_step(mem_states[:-1], mem_actions,
                                                       retrace_targets, importance_weights)
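
# rollout() is used by all of these loops but defined elsewhere. The sketch below
# shows one plausible shape of it, assuming a classic Gym-style env (reset()/step()
# returning a 4-tuple) and that agent.get_pi(state) returns the behaviour-policy
# probabilities over discrete actions. It illustrates why the returned state list
# has one more entry than the action list: the seed state plus the state reached
# after every step, so callers evaluate values on b_states[:-1] and seed the next
# rollout with b_states[-1]. The name rollout_sketch is illustrative, not the
# repository's implementation.
def rollout_sketch(agent, env, states, done, n_steps):
    actions, rewards, mus = [], [], []
    if done or states[-1] is None:
        states = [env.reset()]                     # start a fresh episode
        done = False
    for _ in range(n_steps):
        mu = np.ravel(agent.get_pi(states[-1]))    # behaviour policy at current state
        action = np.random.choice(len(mu), p=mu)
        next_state, reward, done, _ = env.step(action)
        actions.append(action)
        rewards.append(reward)
        mus.append(mu)
        states.append(next_state)
        if done:                                   # stop early at episode end
            break
    return states, np.array(actions), np.array(rewards), np.array(mus), done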
def train_acer(agent, env, sess, worker_id, replay_buffer, k_steps=20,
               DISCOUNT=0.99, step_limit=5000000, verbose_every=1000,
               net_saver=None, TB_DIR=None):
    print("Starting Agent", worker_id)
    rewardlist = []
    runningreward = 0
    bestreward = 0
    replay_ratio = 1
    avg_ep_length = 20
    RETURN_STEPS = k_steps
    b_states = [None]
    step = 0
    done = True
    online = True
    write_summary = False
    summary, summary_writer = None, None
    # only worker 0 writes TensorBoard summaries
    if worker_id == 0 and TB_DIR is not None:
        summary_writer = tf.summary.FileWriter(TB_DIR + "/tb", sess.graph, flush_secs=30)
        write_summary = True
    while step < step_limit:
        if online or step < 1000000:
            # on-policy step: fresh rollout, importance weights are all ones
            agent.update_target()
            b_states, b_actions, b_rewards, b_mus, done = rollout(
                agent, env, [b_states[-1]], done, RETURN_STEPS)
            pi, q_a, val = agent.get_retrace_values(b_states[:-1], b_actions)
            importance_weights = np.ones_like(pi)
            importance_weights_a = np.take(
                np.reshape(importance_weights, [-1]),
                (np.arange(importance_weights.shape[0]) * importance_weights.shape[1] + b_actions))
            retrace_targets = q_retrace(b_rewards, done, q_a, val,
                                        importance_weights_a, DISCOUNT)
            summary, step = agent.update_step(b_states[:-1], b_actions,
                                              retrace_targets, importance_weights)
            replay_buffer.remember((b_states, b_actions, b_rewards, b_mus, done))
            rewardlist.append(np.sum(b_rewards))
            if done:
                bestreward = np.maximum(np.sum(rewardlist), bestreward)
                runningreward = 0.95 * runningreward + 0.05 * np.sum(rewardlist)
                replay_ratio = replay_ratio * 0.99 + 0.01
                # randomly switch to offline (replay) updates, more often as training progresses
                offline_decider = np.random.rand(1) * 0.7
                if offline_decider + 0.3 > (1 - step / step_limit):
                    online = False
                avg_ep_length = 0.9 * avg_ep_length + 0.1 * len(rewardlist)
                rewardlist = []
        else:
            # off-policy step: sample a stored trajectory and correct with
            # importance weights pi/mu
            mem_states, mem_actions, mem_rewards, mem_mus, done = replay_buffer.sample_from_memory()
            pi, q_a, val = agent.get_retrace_values(mem_states[:-1], mem_actions)
            importance_weights = np.divide(pi, np.add(mem_mus, 1e-14))
            importance_weights_a = np.take(
                np.reshape(importance_weights, [-1]),
                (np.arange(importance_weights.shape[0]) * importance_weights.shape[1] + mem_actions))
            retrace_targets = q_retrace(mem_rewards, done, q_a, val,
                                        importance_weights_a, DISCOUNT)
            summary, step = agent.update_step(mem_states[:-1], mem_actions,
                                              retrace_targets, importance_weights)
            online = step % 2 == 0
            replay_ratio = replay_ratio * 0.99
        if step % verbose_every == 0:
            print("Worker ", worker_id, "At ", step, " Running/Max: ",
                  runningreward, bestreward, " Replay Ratio: ", replay_ratio)
            print("EPlen:", avg_ep_length * RETURN_STEPS, "pi:", agent.get_pi(b_states[-1]))
            if step % 5000 == 0 and net_saver is not None and TB_DIR is not None:
                print("Saving Model")
                net_saver.save(sess, TB_DIR + "checkpoints/model" + str(step) + ".cptk")
        if write_summary and summary is not None:
            summary_writer.add_summary(summary, step)
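
# The replay buffer used above (self.memory / replay_buffer) only needs
# remember(), can_sample(), sample_from_memory() and a frame counter. Below is a
# minimal sketch of such a trajectory buffer, assuming whole rollouts are stored
# and sampled uniformly; the class name and the max_size/min_trajectories
# defaults are illustrative, not taken from the original code.
import random
from collections import deque

class TrajectoryMemorySketch:
    def __init__(self, max_size=50000, min_trajectories=100):
        self.buffer = deque(maxlen=max_size)   # oldest trajectories are dropped first
        self.min_trajectories = min_trajectories
        self.counter = 0                       # total number of stored frames

    def remember(self, trajectory):
        # trajectory = (states, actions, rewards, mus, done)
        _, actions, _, _, _ = trajectory
        self.counter += len(actions)
        self.buffer.append(trajectory)

    def can_sample(self):
        return len(self.buffer) >= self.min_trajectories

    def sample_from_memory(self):
        # uniform random trajectory; returns (states, actions, rewards, mus, done)
        return random.choice(self.buffer)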