class TestEpisodeMemory(unittest.TestCase):
    def setUp(self):
        self.memory = EpisodicMemory()

    def test_append(self):
        for i in range(20):
            a = Transition([0, 1, 2, 3], [0, 1], [4, 5, 6, 7], 1, True)
            self.memory.push(a)
        self.assertEqual(len(self.memory), 20)

    def test_sample(self):
        for i in range(10):
            a = Transition([0, 1, 2, i], [0, 1], [4, 5, 6, i * i], 1, True)
            self.memory.push(a)
        s, a, s1, r, done = self.memory.sample()
        self.assertEqual(s.shape, (10, 4))
        self.assertEqual(a.shape, (10, 2))
        self.assertEqual(s1.shape, (10, 4))
        self.assertEqual(r.shape, (10, 1))
        self.assertEqual(done.shape, (10, 1))
        self.assertAlmostEqual(r[0][0], 9.561792, 6)

    def test_no_reward_accumulation(self):
        self.memory = EpisodicMemory(accum_reward=False)
        for i in range(10):
            a = Transition([0, 1, 2, i], [0, 1], [4, 5, 6, i * i], 1, True)
            self.memory.push(a)
        s, a, s1, r, done = self.memory.sample()
        self.assertAlmostEqual(r[0][0], 1.0)
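Assuming `unittest` and the project's `EpisodicMemory` and `Transition` classes are imported at the top of the test file (the module path below is a placeholder, not the project's real layout), the suite runs with the standard unittest runner:

# Hypothetical import block and runner for the tests above; adjust the module
# path to wherever EpisodicMemory and Transition actually live in the project.
import unittest
from memory import EpisodicMemory, Transition  # hypothetical module path

if __name__ == "__main__":
    unittest.main()

Alternatively, `python -m unittest discover` will pick up the class automatically.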
class ActorCriticMoveToBeacon(base_agent.BaseAgent):
    """A2C agent for MoveToBeacon."""

    def __init__(self):
        super().__init__()
        self.brain = ScreenSelectAndMoveLearner()
        self.memory = EpisodicMemory(accum_reward=False)
        self.sarsd = SARSD()

    def reset(self):
        super().reset()
        self.sarsd.reset()
        print("Total Steps:", self.steps)
        if len(self.memory) > 0:
            self.train()

    def step(self, obs):
        super().step(obs)
        if FUNCTIONS.Move_screen.id in obs.observation.available_actions:
            # Only record observations once the army is selected.
            state = obs.observation.feature_screen.player_relative  # keep it simple for now
            state = np.expand_dims(state, axis=2)
            reward = obs.reward
            done = obs.reward == 1
            if hasattr(self, "memory"):
                transition = self.sarsd.observe(state, reward, done)
                if transition is not None:
                    self.memory.push(transition)
            # Pick the next action and remember it for the following transition.
            action, coords = self.get_action(state)
            self.sarsd.act(action)
            return FUNCTIONS.Move_screen("now", coords)
        else:
            return FUNCTIONS.select_army("select")

    def get_best_action(self, state):
        state = np.expand_dims(state, axis=0)  # convert to a batch of one
        state = tf.constant(state, dtype=tf.float32)
        p = self.brain.P(state)
        argmax = tf.argmax(p, axis=1)
        coords = tf.unravel_index(argmax, state.shape[1:3])
        return argmax.numpy(), tf.squeeze(coords).numpy()

    def get_action(self, state):
        """tf.multinomial doesn't work in eager as of v1.8, so sample with numpy."""
        state = np.expand_dims(state, axis=0)  # convert to a batch of one
        state = tf.constant(state, dtype=tf.float32)
        p = self.brain.P(state)
        p = p.numpy().squeeze()
        choice = np.random.choice(p.shape[0], p=p)
        coords = np.unravel_index(choice, state.shape[1:3])
        # np.unravel_index returns (y, x); pysc2 expects screen coords as (x, y).
        return choice, (coords[1], coords[0])

    def train(self):
        # Batched sampling is left for later:
        # if self.batch_size >= len(self.memory):
        #     return
        # s_0, a_0, s_1, r_1, done = self.memory.sample(self.batch_size)
        s_0, a_0, s_1, r_1, done = self.memory.sample()
        self.brain.train(s_0=s_0, a_0=a_0, s_1=s_1, r_t=r_1,
                         done=done, num_actions=6 * 6)
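For context, here is a minimal sketch of how the agent could be hooked into pysc2's run loop on the MoveToBeacon minigame. It assumes `base_agent` and `FUNCTIONS` come from `pysc2.agents.base_agent` and `pysc2.lib.actions`; the screen resolution, step_mul, and episode count below are illustrative guesses, not settings taken from the original project, and should be matched to whatever ScreenSelectAndMoveLearner expects.

# Sketch only: wiring the agent into pysc2's run loop. Values such as the
# 84x84 screen, step_mul=8, and max_episodes=100 are assumptions for
# illustration.
from absl import app
from pysc2.env import run_loop, sc2_env
from pysc2.lib import features


def main(unused_argv):
    agent = ActorCriticMoveToBeacon()
    with sc2_env.SC2Env(
            map_name="MoveToBeacon",
            players=[sc2_env.Agent(sc2_env.Race.terran)],
            agent_interface_format=features.AgentInterfaceFormat(
                feature_dimensions=features.Dimensions(screen=84, minimap=64)),
            step_mul=8,
            visualize=False) as env:
        run_loop.run_loop([agent], env, max_episodes=100)


if __name__ == "__main__":
    app.run(main)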