class TestEpisodeMemory(unittest.TestCase):
    def setUp(self):
        self.memory = EpisodicMemory()

    def test_append(self):
        for i in range(20):
            a = Transition([0, 1, 2, 3], [0, 1], [4, 5, 6, 7], 1, True)
            self.memory.push(a)
        self.assertEqual(len(self.memory), 20)

    def test_sample(self):
        for i in range(10):
            a = Transition([0, 1, 2, i], [0, 1], [4, 5, 6, i * i], 1, True)
            self.memory.push(a)
        s, a, s1, r, done = self.memory.sample()
        self.assertEqual(s.shape, (10, 4))
        self.assertEqual(a.shape, (10, 2))
        self.assertEqual(s1.shape, (10, 4))
        self.assertEqual(r.shape, (10, 1))
        self.assertEqual(done.shape, (10, 1))
        self.assertAlmostEqual(r[0][0], 9.561792, 6)

    def test_no_reward_accumulation(self):
        self.memory = EpisodicMemory(accum_reward=False)
        for i in range(10):
            a = Transition([0, 1, 2, i], [0, 1], [4, 5, 6, i * i], 1, True)
            self.memory.push(a)
        s, a, s1, r, done = self.memory.sample()
        self.assertAlmostEqual(r[0][0], 1.0)
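# The 9.561792 expected in test_sample is the discounted sum of ten unit rewards,
# which matches a discount factor of 0.99: sum_{k=0}^{9} 0.99**k ≈ 9.561792.
# A minimal, standalone sketch of that accumulation; the helper below is
# illustrative only and is not part of EpisodicMemory.
def discounted_return(rewards, gamma=0.99):
    """Discounted sum of rewards, seen from the first step of an episode."""
    g = 0.0
    for k, r in enumerate(rewards):
        g += (gamma ** k) * r
    return g

assert abs(discounted_return([1.0] * 10) - 9.561792) < 1e-5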
class REINFORCE(RLAgent):
    """ REINFORCE with a baseline for discrete actions """

    def __init__(self):
        super().__init__()
        self.gt = Gt()

    def setup(self, observation_space, action_space):
        super().setup(observation_space, action_space)
        self.learner = SimpleDenseLearner(nb_actions=action_space.n, learning_rate=0.1)
        self.memory = EpisodicMemory()
        self.actions = np.arange(self.action_space.n)

    def getAction(self, s_0):
        s_0 = tf.constant([s_0], dtype=tf.float32)
        action_probability = self.learner(s_0).numpy()
        action = np.random.choice(self.actions, p=action_probability[0])
        return action

    def train(self):
        if len(self.memory) == 0:
            return
        s_0, a_0, s_1, r_1, done = self.memory.sample()
        baseline = self.gt.add_episode_rewards(np.squeeze(r_1))
        baseline = np.expand_dims(baseline, axis=1)
        A = r_1 - baseline
        self.learner.train(s_0=s_0, a_0=a_0, r_1=A)
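# A minimal driver sketch for an episodic agent like REINFORCE. It assumes the
# classic Gym API (reset() -> obs, step() -> obs, reward, done, info) and the
# Transition field order (s_0, a_0, s_1, r, done) used in the memory tests above;
# the environment name, episode count, and one-hot action encoding are assumptions.
import gym
import numpy as np

def run_reinforce(episodes=200):
    env = gym.make("CartPole-v1")
    agent = REINFORCE()
    agent.setup(env.observation_space, env.action_space)
    for _ in range(episodes):
        s_0, done = env.reset(), False
        while not done:
            a_0 = agent.getAction(s_0)
            s_1, r_1, done, _ = env.step(a_0)
            a_onehot = np.eye(env.action_space.n)[a_0]  # one-hot, as in the tests
            agent.memory.push(Transition(s_0, a_onehot, s_1, r_1, done))
            s_0 = s_1
        agent.train()  # Monte Carlo update once the episode is complete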
class MCPGWithBaseline(RLAgent):
    """ Monte Carlo actor critic """

    def setup(self, observation_space, action_space):
        super().setup(observation_space, action_space)
        self.learner = SimpleMCPolicyGradientWithBaseline(nb_actions=action_space.n,
                                                          policy_lr=0.001,
                                                          value_lr=0.001,
                                                          gamma=0.99)
        self.memory = EpisodicMemory(gamma=0.99)
        self.actions = np.arange(self.action_space.n)
        print("no. of actions:", self.action_space.n)

    def getAction(self, s_0):
        return TFHelper.get_action(self.learner.P, self.actions, s_0)

    def train(self):
        if len(self.memory) == 0:
            return
        s_0, a_0, s_1, r_1, done = self.memory.sample()
        self.learner.train(s_0=s_0, a_0=a_0, s_1=s_1, r_1=r_1, done=done)
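# The learner's update is not shown here; below is a sketch of the standard
# Monte Carlo policy-gradient-with-baseline losses such a learner typically
# optimizes. The function name, argument names, and shapes are assumptions,
# not SimpleMCPolicyGradientWithBaseline's actual interface.
import tensorflow as tf

def mc_pg_with_baseline_losses(policy_logits, values, a_0, returns):
    """policy_logits: (B, A); values: (B, 1); a_0: one-hot (B, A); returns: (B, 1)."""
    advantage = returns - values                       # G_t - V(s_t)
    log_probs = tf.nn.log_softmax(policy_logits)
    log_pi_a = tf.reduce_sum(log_probs * a_0, axis=1, keepdims=True)
    policy_loss = -tf.reduce_mean(log_pi_a * tf.stop_gradient(advantage))
    value_loss = tf.reduce_mean(tf.square(advantage))  # regress V(s_t) toward G_t
    return policy_loss, value_loss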
class ActorCriticMoveToBeacon(base_agent.BaseAgent):
    """A2C agent for move to beacon"""

    def __init__(self):
        super().__init__()
        self.brain = ScreenSelectAndMoveLearner()
        self.memory = EpisodicMemory(accum_reward=False)
        self.sarsd = SARSD()

    def reset(self):
        super().reset()
        self.sarsd.reset()
        print("Total Steps:", self.steps)
        if len(self.memory) > 0:
            self.train()

    def step(self, obs):
        super().step(obs)
        if FUNCTIONS.Move_screen.id in obs.observation.available_actions:
            # Only record observations if the army is selected
            state = obs.observation.feature_screen.player_relative  # make it simple for now
            state = np.expand_dims(state, axis=2)
            reward = obs.reward
            done = obs.reward == 1
            if hasattr(self, "memory"):
                transition = self.sarsd.observe(state, reward, done)
                if transition is not None:
                    self.memory.push(transition)
            # get action
            action, coords = self.get_action(state)
            self.sarsd.act(action)
            return FUNCTIONS.Move_screen("now", coords)
        else:
            return FUNCTIONS.select_army("select")

    def get_best_action(self, state):
        state = np.expand_dims(state, axis=0)  # convert to batch
        state = tf.constant(state, dtype=tf.float32)
        p = self.brain.P(state)
        argmax = tf.argmax(p, axis=1)
        coords = tf.unravel_index(argmax, state.shape[1:3])
        return argmax.numpy(), tf.squeeze(coords).numpy()

    def get_action(self, state):
        """tf.multinomial doesn't work in eager as of v1.8"""
        state = np.expand_dims(state, axis=0)  # convert to batch
        state = tf.constant(state, dtype=tf.float32)
        p = self.brain.P(state)
        p = p.numpy().squeeze()
        choice = np.random.choice(p.shape[0], p=p)
        coords = np.unravel_index(choice, state.shape[1:3])
        # unravel_index returns (row, col) = (y, x); pysc2 screen args take (x, y)
        return choice, (coords[1], coords[0])

    def train(self):
        # if self.batch_size >= len(self.memory):
        #     return
        # print("training")
        # s_0, a_0, s_1, r_1, done = self.memory.sample(self.batch_size)
        s_0, a_0, s_1, r_1, done = self.memory.sample()
        self.brain.train(s_0=s_0, a_0=a_0, s_1=s_1, r_t=r_1, done=done,
                         num_actions=6 * 6)
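# A standalone sketch of the spatial action sampling performed in get_action
# above: sample one cell from a probability map over the screen and convert it
# to the (x, y) order pysc2 screen arguments expect. The helper name and the
# 6x6 grid are illustrative (train() above happens to pass num_actions=6*6).
import numpy as np

def sample_screen_coords(prob_map):
    """prob_map: (H, W) array of cell probabilities summing to 1."""
    h, w = prob_map.shape
    choice = np.random.choice(h * w, p=prob_map.reshape(-1))
    y, x = np.unravel_index(choice, (h, w))  # unravel_index returns (row, col) = (y, x)
    return int(x), int(y)

coords = sample_screen_coords(np.full((6, 6), 1.0 / 36.0))  # e.g. a uniform 6x6 map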