import unittest

# Module paths for the classes under test are assumptions; adjust them to your project.
from memory import EpisodicMemory, Transition


class TestEpisodeMemory(unittest.TestCase):
    def setUp(self):
        self.memory = EpisodicMemory()

    def test_append(self):
        for i in range(20):
            a = Transition([0, 1, 2, 3], [0, 1], [4, 5, 6, 7], 1, True)
            self.memory.push(a)
        self.assertEqual(len(self.memory), 20)

    def test_sample(self):
        for i in range(10):
            a = Transition([0, 1, 2, i], [0, 1], [4, 5, 6, i * i], 1, True)
            self.memory.push(a)

        s, a, s1, r, done = self.memory.sample()
        self.assertEqual(s.shape, (10, 4))
        self.assertEqual(a.shape, (10, 2))
        self.assertEqual(s1.shape, (10, 4))
        self.assertEqual(r.shape, (10, 1))
        self.assertEqual(done.shape, (10, 1))
        # 9.561792 is the discounted return of ten unit rewards, assuming gamma = 0.99
        self.assertAlmostEqual(r[0][0], 9.561792, 6)

    def test_no_reward_accumulation(self):
        self.memory = EpisodicMemory(accum_reward=False)
        for i in range(10):
            a = Transition([0, 1, 2, i], [0, 1], [4, 5, 6, i * i], 1, True)
            self.memory.push(a)

        s, a, s1, r, done = self.memory.sample()
        self.assertAlmostEqual(r[0][0], 1.0)
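The tests above pin down the memory interface: push() appends a Transition, len() reports how many transitions are stored, and sample() returns the whole episode as batched arrays (s, a, s1, r, done), with each reward replaced by its discounted return unless accum_reward=False. The expected value 9.561792 is the return of ten unit rewards under a discount factor of 0.99. Below is a minimal sketch that satisfies these tests; the gamma default and field names are assumptions, not the original implementation.

from collections import namedtuple

import numpy as np

# Field order inferred from the test calls: Transition(s, a, s1, r, done)
Transition = namedtuple("Transition", ["s", "a", "s1", "r", "done"])


class EpisodicMemory:
    """Stores one episode and returns it as a batch of numpy arrays."""

    def __init__(self, accum_reward=True, gamma=0.99):  # gamma=0.99 reproduces 9.561792
        self.accum_reward = accum_reward
        self.gamma = gamma
        self.transitions = []

    def __len__(self):
        return len(self.transitions)

    def push(self, transition):
        self.transitions.append(transition)

    def sample(self):
        s = np.array([t.s for t in self.transitions], dtype=np.float32)
        a = np.array([t.a for t in self.transitions], dtype=np.float32)
        s1 = np.array([t.s1 for t in self.transitions], dtype=np.float32)
        r = np.array([[t.r] for t in self.transitions], dtype=np.float64)
        done = np.array([[t.done] for t in self.transitions], dtype=np.float32)
        if self.accum_reward:
            # Turn per-step rewards into discounted returns, working backwards.
            for i in reversed(range(len(r) - 1)):
                r[i] += self.gamma * r[i + 1]
        return s, a, s1, r, done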
Example #2
import numpy as np
import tensorflow as tf

from pysc2.agents import base_agent
from pysc2.lib.actions import FUNCTIONS

# Project-local modules; the import paths here are assumptions.
from memory import EpisodicMemory, SARSD
from learner import ScreenSelectAndMoveLearner


class ActorCriticMoveToBeacon(base_agent.BaseAgent):
    """A2C agent for move to beacon"""
    def __init__(self):
        super().__init__()
        self.brain = ScreenSelectAndMoveLearner()
        self.memory = EpisodicMemory(accum_reward=False)
        self.sarsd = SARSD()

    def reset(self):
        super().reset()
        self.sarsd.reset()
        print("Total Steps:", self.steps)
        if len(self.memory) > 0:
            self.train()

    def step(self, obs):
        super().step(obs)

        if FUNCTIONS.Move_screen.id in obs.observation.available_actions:
            # Only record observations if army is selected
            state = obs.observation.feature_screen.player_relative  # make it simple for now
            state = np.expand_dims(state, axis=2)

            reward = obs.reward
            done = obs.reward == 1  # treat each beacon hit (reward 1) as the end of a sub-episode

            if hasattr(self, "memory"):
                transition = self.sarsd.observe(state, reward, done)
                if transition is not None: self.memory.push(transition)

            # get action
            action, coords = self.get_action(state)

            self.sarsd.act(action)
            return FUNCTIONS.Move_screen("now", coords)
        else:
            return FUNCTIONS.select_army("select")

    def get_best_action(self, state):
        state = np.expand_dims(state, axis=0)  # convert to batch
        state = tf.constant(state, dtype=tf.float32)
        p = self.brain.P(state)
        argmax = tf.argmax(p, axis=1)
        coords = tf.unravel_index(argmax, state.shape[1:3])
        return argmax.numpy(), tf.squeeze(coords).numpy()

    def get_action(self, state):
        """tf.multinomial doesn't work in eager as of v1.8"""
        state = np.expand_dims(state, axis=0)  # convert to batch
        state = tf.constant(state, dtype=tf.float32)
        p = self.brain.P(state)
        p = p.numpy().squeeze()
        choice = np.random.choice(p.shape[0], p=p)
        coords = np.unravel_index(choice, state.shape[1:3])
        # np.unravel_index returns (row, col) = (y, x); pysc2 spatial actions take (x, y)
        return choice, (coords[1], coords[0])

    def train(self):
        # if self.batch_size >= len(self.memory):
        #   return
        # print("training")
        # s_0, a_0, s_1, r_1, done = self.memory.sample(self.batch_size)
        s_0, a_0, s_1, r_1, done = self.memory.sample()
        self.brain.train(s_0=s_0,
                         a_0=a_0,
                         s_1=s_1,
                         r_t=r_1,
                         done=done,
                         num_actions=6 * 6)  # presumably the flattened 6x6 spatial action grid
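
For context, a minimal driver that runs this agent on the MoveToBeacon minigame with the standard pysc2 run loop; the feature-layer resolutions below are assumptions and must match whatever ScreenSelectAndMoveLearner expects (num_actions=6 * 6 hints at a 6x6 policy grid).

from absl import app
from pysc2.env import run_loop, sc2_env
from pysc2.lib import features


def main(unused_argv):
    # Screen/minimap sizes are guesses; pick them to match the learner's policy output.
    aif = features.AgentInterfaceFormat(
        feature_dimensions=features.Dimensions(screen=36, minimap=36))
    with sc2_env.SC2Env(map_name="MoveToBeacon",
                        players=[sc2_env.Agent(sc2_env.Race.terran)],
                        agent_interface_format=aif,
                        step_mul=8) as env:
        run_loop.run_loop([ActorCriticMoveToBeacon()], env, max_episodes=100)


if __name__ == "__main__":
    app.run(main)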