Example #1
    def work_acer(self):
        b_states=[None]
        done = True
        step = 0
        print(self.name, " using ", self.offline_steps, "offline steps per online step")

        while step < self.MAX_STEPS:
            """
            """
            self.agent.update_target()
            # n -step rollout from the environment, with n = RETURN_STEPS or until done.
            b_states, b_actions, b_rewards, b_mus, done = rollout(self.agent, self.env, [b_states[-1]], done, self.RETURN_STEPS)
            # policy, Q(s,a) and V(s) estimates for the rollout states
            pi, q_a, val = self.agent.get_retrace_values(b_states[:-1], b_actions)

            # importance weights pi/mu (a small epsilon avoids division by zero),
            # then the weight of the action actually taken at each step
            importance_weights = np.divide(pi, np.add(b_mus, 1e-14))
            importance_weights_a = np.take(np.reshape(importance_weights, [-1]), (
                    np.arange(importance_weights.shape[0]) * importance_weights.shape[1] + b_actions))
            # calculate Retrace targets
            retrace_targets = q_retrace(b_rewards, done, q_a, val, importance_weights_a, self.DISCOUNT)
            # update step; returns the summary (discarded here) and the current global step
            _, step = self.agent.update_step(b_states[:-1], b_actions, retrace_targets, importance_weights)
            # append trajectory to the replay buffer
            self.memory.remember((b_states, b_actions, b_rewards, b_mus, done))
            # offline updates: instead of a fresh rollout, trajectories are sampled from the replay buffer
            if self.offline_steps > 0 and self.memory.can_sample():
                for _ in range(self.offline_steps):
                    mem_states, mem_actions, mem_rewards, mem_mus, mem_done = self.memory.sample_from_memory()
                    pi, q_a, val = self.agent.get_retrace_values(mem_states[:-1], mem_actions)

                    importance_weights = np.divide(pi, np.add(mem_mus, 1e-14))
                    importance_weights_a = np.take(np.reshape(importance_weights, [-1]), (
                            np.arange(importance_weights.shape[0]) * importance_weights.shape[1] + mem_actions))
                    retrace_targets = q_retrace(mem_rewards, mem_done, q_a, val, importance_weights_a, self.DISCOUNT)
                    _, step = self.agent.update_step(mem_states[:-1], mem_actions, retrace_targets, importance_weights)
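
The Retrace targets in these examples come from a q_retrace helper that is not listed here. Below is a minimal sketch of such a helper, assuming (these are assumptions, not the original implementation) that it receives the per-step rewards, the Q(s,a) and V(s) estimates from get_retrace_values, the untruncated per-action importance weights, and a single done flag for the whole rollout, and that it bootstraps from the last value estimate when the rollout did not end in a terminal state:

import numpy as np

def q_retrace_sketch(rewards, done, q_a, val, rho_a, gamma):
    # Retrace targets, computed backwards through the n-step rollout (sketch).
    rho_bar = np.minimum(rho_a, 1.0)        # truncate the importance weights at 1
    q_ret = 0.0 if done else val[-1]        # assumed bootstrap: 0 on terminal, last V(s) otherwise
    targets = np.zeros(len(rewards))
    for t in reversed(range(len(rewards))):
        q_ret = rewards[t] + gamma * q_ret
        targets[t] = q_ret
        # propagate the truncated off-policy correction one step further back
        q_ret = rho_bar[t] * (q_ret - q_a[t]) + val[t]
    return targets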
Example #2
    def work_and_eval_acer(self, net_saver, TB_DIR, evalrewards=None):
        # avoid a mutable default argument; fall back to a fresh list
        if evalrewards is None:
            evalrewards = []
        b_states = [None]
        done = True
        step = 0
        runningreward = 1
        bestreward = 0
        rewardlist = []
        if evalrewards:
            # resume the running reward from a previous run
            runningreward = evalrewards[-1]
            print(runningreward)
        next_verbose = 0
        summary_writer = tf.summary.FileWriter(TB_DIR + "/tb", self.sess.graph, flush_secs=30)
        print(self.name, " using ", self.offline_steps, "offline steps per online step")

        while step < self.MAX_STEPS:
            # sync the target network, then collect a fresh n-step rollout
            self.agent.update_target()
            b_states, b_actions, b_rewards, b_mus, done = rollout(self.agent, self.env, [b_states[-1]], done,
                                                                  self.RETURN_STEPS)
            pi, q_a, val = self.agent.get_retrace_values(b_states[:-1], b_actions)
            # accumulate the reward of the running episode
            rewardlist.append(np.sum(b_rewards))
            # importance weights pi/mu and the weight of the chosen action at each step
            importance_weights = np.divide(pi, np.add(b_mus, 1e-14))
            importance_weights_a = np.take(np.reshape(importance_weights, [-1]), (
                    np.arange(importance_weights.shape[0]) * importance_weights.shape[1] + b_actions))
            retrace_targets = q_retrace(b_rewards, done, q_a, val, importance_weights_a, self.DISCOUNT)
            # online update; keep the returned summary for TensorBoard
            summary, step = self.agent.update_step(b_states[:-1], b_actions, retrace_targets, importance_weights)
            # append the trajectory to the replay buffer
            self.memory.remember((b_states, b_actions, b_rewards, b_mus, done))
            if done:
                # episode finished: update best/running reward and log progress
                bestreward = np.maximum(bestreward, np.sum(rewardlist))
                runningreward = 0.9 * runningreward + 0.1 * np.sum(rewardlist)
                evalrewards.append(runningreward)
                np.savetxt(TB_DIR + "reward.out", evalrewards)
                rewardlist = []
                if step > next_verbose:
                    print("Worker ", self.name, "At ", step, " Running/Max: ", runningreward, bestreward, " Frames:", self.memory.counter)
                    print("pi:", self.agent.get_pi(b_states[-1]))
                    print("Saving Model")
                    next_verbose += (self.MAX_STEPS / 100)
                    net_saver.save(self.sess, TB_DIR + "checkpoints/model" + str(step) + ".cptk")
                if summary is not None:
                    summary_writer.add_summary(summary, step)

            # offline updates: sample stored trajectories from the replay buffer
            if self.offline_steps > 0 and self.memory.can_sample():
                for _ in range(self.offline_steps):
                    mem_states, mem_actions, mem_rewards, mem_mus, mem_done = self.memory.sample_from_memory()
                    pi, q_a, val = self.agent.get_retrace_values(mem_states[:-1], mem_actions)

                    importance_weights = np.divide(pi, np.add(mem_mus, 1e-14))
                    importance_weights_a = np.take(np.reshape(importance_weights, [-1]), (
                            np.arange(importance_weights.shape[0]) * importance_weights.shape[1] + mem_actions))
                    retrace_targets = q_retrace(mem_rewards, mem_done, q_a, val, importance_weights_a, self.DISCOUNT)
                    summary, step = self.agent.update_step(mem_states[:-1], mem_actions, retrace_targets, importance_weights)
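
The flattened np.take indexing used in these examples to pick out the importance weight of the chosen action is equivalent to NumPy fancy indexing with a per-row index. A small self-contained check (the shapes here are made up purely for illustration):

import numpy as np

importance_weights = np.random.rand(20, 6)      # [steps, actions], hypothetical shapes
actions = np.random.randint(0, 6, size=20)      # one action index per step

flat = np.take(np.reshape(importance_weights, [-1]),
               np.arange(importance_weights.shape[0]) * importance_weights.shape[1] + actions)
fancy = importance_weights[np.arange(importance_weights.shape[0]), actions]
assert np.allclose(flat, fancy)                 # both select weights[t, actions[t]]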
Example #3
def train_acer(agent,
               env,
               sess,
               worker_id,
               replay_buffer,
               k_steps=20,
               DISCOUNT=0.99,
               step_limit=5000000,
               verbose_every=1000,
               net_saver=None,
               TB_DIR=None):
    print("Starting Agent", worker_id)
    rewardlist = []
    runningreward = 0
    bestreward = 0
    replay_ratio = 1
    avg_ep_length = 20
    RETURN_STEPS = k_steps
    b_states = [None]
    step = 0
    done = True
    online = True
    write_summary = False
    summary, summary_writer = None, None
    if worker_id == 0 and TB_DIR is not None:
        summary_writer = tf.summary.FileWriter(TB_DIR + "/tb",
                                               sess.graph,
                                               flush_secs=30)
        write_summary = True
    while step < step_limit:
        if online or step < 1000000:
            # online update: sync the target net and collect a fresh n-step rollout
            agent.update_target()
            b_states, b_actions, b_rewards, b_mus, done = rollout(
                agent, env, [b_states[-1]], done, RETURN_STEPS)
            pi, q_a, val = agent.get_retrace_values(b_states[:-1], b_actions)

            # on-policy data: importance weights are all 1
            importance_weights = np.ones_like(pi)
            importance_weights_a = np.take(
                np.reshape(importance_weights, [-1]),
                (np.arange(importance_weights.shape[0]) *
                 importance_weights.shape[1] + b_actions))
            retrace_targets = q_retrace(b_rewards, done, q_a, val,
                                        importance_weights_a, DISCOUNT)
            summary, step = agent.update_step(b_states[:-1], b_actions,
                                              retrace_targets, importance_weights)
            replay_buffer.remember(
                (b_states, b_actions, b_rewards, b_mus, done))
            rewardlist.append(np.sum(b_rewards))
            if done:
                bestreward = np.maximum(np.sum(rewardlist), bestreward)
                runningreward = 0.95 * runningreward + 0.05 * np.sum(rewardlist)
                replay_ratio = replay_ratio * 0.99 + 0.01
                # randomly switch to offline replay, with a bias towards
                # replay that grows as training progresses
                offline_decider = np.random.rand() * 0.7
                if offline_decider + 0.3 > (1 - step / step_limit):
                    online = False
                avg_ep_length = 0.9 * avg_ep_length + 0.1 * len(rewardlist)
                rewardlist = []

        else:
            # offline update: sample a stored trajectory from the replay buffer
            mem_states, mem_actions, mem_rewards, mem_mus, done = replay_buffer.sample_from_memory()
            pi, q_a, val = agent.get_retrace_values(mem_states[:-1],
                                                    mem_actions)

            importance_weights = np.divide(pi, np.add(mem_mus, 1e-14))
            importance_weights_a = np.take(
                np.reshape(importance_weights, [-1]),
                (np.arange(importance_weights.shape[0]) *
                 importance_weights.shape[1] + mem_actions))
            retrace_targets = q_retrace(mem_rewards, done, q_a, val,
                                        importance_weights_a, DISCOUNT)
            summary, step = agent.update_step(mem_states[:-1], mem_actions,
                                              retrace_targets, importance_weights)
            online = step % 2 == 0
            replay_ratio = replay_ratio * 0.99

        if step % verbose_every == 0:
            print("Worker ", worker_id, "At ", step, " Running/Max: ",
                  runningreward, bestreward, " Replay Ratio: ", replay_ratio)
            print("EPlen:", avg_ep_length * RETURN_STEPS, "pi:",
                  agent.get_pi(b_states[-1]))

        if net_saver is not None and step % 5000 == 0:
            print("Saving Model")
            net_saver.save(sess,
                           TB_DIR + "checkpoints/model" + str(step) + ".cptk")
        if write_summary and summary is not None:
            summary_writer.add_summary(summary, step)
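
All three examples also rely on a rollout helper whose implementation is not shown. The sketch below illustrates the interface implied by the calls above, assuming a gym-style environment (env.reset() / env.step(action)) and that agent.get_pi(state) returns the action probabilities used to act; the returned mus are the full behaviour-policy distributions, matching the element-wise division pi / mu performed afterwards. This is an assumed interface, not the original code:

import numpy as np

def rollout_sketch(agent, env, start_states, done, n_steps):
    # Collect up to n_steps transitions; states ends up one entry longer than actions/rewards.
    states, actions, rewards, mus = list(start_states), [], [], []
    if done:
        states = [env.reset()]                      # start a new episode if the last one ended
    for _ in range(n_steps):
        pi = agent.get_pi(states[-1])               # behaviour-policy probabilities (assumed 1-D)
        action = np.random.choice(len(pi), p=pi)    # sample the action from pi
        next_state, reward, done, _ = env.step(action)
        states.append(next_state)
        actions.append(action)
        rewards.append(reward)
        mus.append(pi)                              # store mu for later importance weighting
        if done:
            break
    return states, np.array(actions), np.array(rewards), np.array(mus), done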