class TestEpisodeMemory(unittest.TestCase):
    def setUp(self):
        self.memory = EpisodicMemory()

    def test_append(self):
        for i in range(20):
            a = Transition([0, 1, 2, 3], [0, 1], [4, 5, 6, 7], 1, True)
            self.memory.push(a)
        self.assertEqual(len(self.memory), 20)

    def test_sample(self):
        for i in range(10):
            a = Transition([0, 1, 2, i], [0, 1], [4, 5, 6, i * i], 1, True)
            self.memory.push(a)

        s, a, s1, r, done = self.memory.sample()
        self.assertEqual(s.shape, (10, 4))
        self.assertEqual(a.shape, (10, 2))
        self.assertEqual(s1.shape, (10, 4))
        self.assertEqual(r.shape, (10, 1))
        self.assertEqual(done.shape, (10, 1))
        self.assertAlmostEqual(r[0][0], 9.561792, 6)

    def test_no_reward_accumulation(self):
        self.memory = EpisodicMemory(accum_reward=False)
        for i in range(10):
            a = Transition([0, 1, 2, i], [0, 1], [4, 5, 6, i * i], 1, True)
            self.memory.push(a)

        s, a, s1, r, done = self.memory.sample()
        self.assertAlmostEqual(r[0][0], 1.0)
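
For reference, the 9.561792 asserted in test_sample is simply the discounted return of ten unit rewards: the test passes no discount factor, which suggests EpisodicMemory defaults to gamma = 0.99, the value the later examples pass explicitly. A quick check of that arithmetic (plain Python, not part of the test suite):

gamma = 0.99
discounted_return = sum(gamma ** k for k in range(10))  # all ten rewards are 1.0
print(discounted_return)  # ~9.5617925, i.e. 9.561792 to six decimal places
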
Example #3
class REINFORCE(RLAgent):
    """
  REINFORCE with a baseline for discrete actions
  """
    def __init__(self):
        super().__init__()
        self.gt = Gt()

    def setup(self, observation_space, action_space):
        super().setup(observation_space, action_space)
        self.learner = SimpleDenseLearner(nb_actions=action_space.n,
                                          learning_rate=0.1)
        self.memory = EpisodicMemory()
        self.actions = np.arange(self.action_space.n)

    def getAction(self, s_0):
        s_0 = tf.constant([s_0], dtype=tf.float32)
        action_probability = self.learner(s_0).numpy()
        action = np.random.choice(self.actions, p=action_probability[0])
        return action

    def train(self):
        if len(self.memory) == 0:
            return

        s_0, a_0, s_1, r_1, done = self.memory.sample()

        baseline = self.gt.add_episode_rewards(np.squeeze(r_1))
        baseline = np.expand_dims(baseline, axis=1)

        A = r_1 - baseline
        self.learner.train(s_0=s_0, a_0=a_0, r_1=A)
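
The REINFORCE snippet only shows action selection and the policy update; the code that pushes transitions into EpisodicMemory is not part of the example. A minimal driving loop is sketched below under assumed conditions: a classic Gym-style environment whose step() returns (obs, reward, done, info), a one-hot action encoding matching the [0, 1]-style actions in the tests, and the Transition and REINFORCE classes imported from the surrounding project.

import gym
import numpy as np

# Illustrative sketch only: CartPole-v1 and the classic (obs, reward, done, info)
# step API are assumptions; REINFORCE and Transition come from the project above.
env = gym.make("CartPole-v1")
agent = REINFORCE()
agent.setup(env.observation_space, env.action_space)

for episode in range(200):
    s_0, done = env.reset(), False
    while not done:
        a_0 = agent.getAction(s_0)
        s_1, r_1, done, _ = env.step(a_0)
        a_vec = np.eye(env.action_space.n)[a_0]  # one-hot action, as in the tests
        agent.memory.push(Transition(s_0, a_vec, s_1, r_1, done))
        s_0 = s_1
    agent.train()  # Monte Carlo update once the episode is complete
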
Example #4
class MCPGWithBaseline(RLAgent):
  """
    Monte Carlo actor critic
  """

  def setup(self, observation_space, action_space):
    super().setup(observation_space, action_space)
    self.learner = SimpleMCPolicyGradientWithBaseline(nb_actions=action_space.n, policy_lr=0.001, value_lr=0.001, gamma=0.99)
    self.memory = EpisodicMemory(gamma=0.99)
    self.actions = np.arange(self.action_space.n)
    print("no. of actions:", self.action_space.n)

  def getAction(self, s_0):
    return TFHelper.get_action(self.learner.P, self.actions, s_0)

  def train(self):
    if len(self.memory) == 0:
      return

    s_0, a_0, s_1, r_1, done = self.memory.sample()
    self.learner.train(s_0=s_0, a_0=a_0, s_1=s_1, r_1=r_1, done=done)
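
TFHelper.get_action is not shown on this page; going by REINFORCE.getAction in Example #3, it presumably runs the policy network on the state and samples an action index from the resulting probabilities. A hypothetical stand-in with that behaviour (the body is an assumption, not the project's actual helper):

import numpy as np
import tensorflow as tf

def get_action(policy, actions, s_0):
    """Sample an action index from a policy network's output distribution."""
    s_0 = tf.constant([s_0], dtype=tf.float32)  # add a batch dimension
    probs = policy(s_0).numpy()[0]              # per-action probabilities
    return np.random.choice(actions, p=probs)
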
Example #5
    def setup(self, observation_space, action_space):
        super().setup(observation_space, action_space)
        self.learner = SimpleDenseLearner(nb_actions=action_space.n,
                                          learning_rate=0.1)
        self.memory = EpisodicMemory()
        self.actions = np.arange(self.action_space.n)

    def setUp(self):
        self.memory = EpisodicMemory()
Example #7
  def setup(self, observation_space, action_space):
    super().setup(observation_space, action_space)
    self.learner = SimpleMCPolicyGradientWithBaseline(nb_actions=action_space.n, policy_lr=0.001, value_lr=0.001, gamma=0.99)
    self.memory = EpisodicMemory(gamma=0.99)
    self.actions = np.arange(self.action_space.n)
    print("no. of actions:", self.action_space.n)
Example #8
    def __init__(self):
        super().__init__()
        self.brain = ScreenSelectAndMoveLearner()
        self.memory = EpisodicMemory(accum_reward=False)
        self.sarsd = SARSD()
Example #9
class ActorCriticMoveToBeacon(base_agent.BaseAgent):
    """A2C agent for move to beacon"""
    def __init__(self):
        super().__init__()
        self.brain = ScreenSelectAndMoveLearner()
        self.memory = EpisodicMemory(accum_reward=False)
        self.sarsd = SARSD()

    def reset(self):
        super().reset()
        self.sarsd.reset()
        print("Total Steps:", self.steps)
        if len(self.memory) > 0:
            self.train()

    def step(self, obs):
        super().step(obs)

        if FUNCTIONS.Move_screen.id in obs.observation.available_actions:
            # Only record observations if army is selected
            state = obs.observation.feature_screen.player_relative  # make it simple for now
            state = np.expand_dims(state, axis=2)

            reward = obs.reward
            done = obs.reward == 1

            if hasattr(self, "memory"):
                transition = self.sarsd.observe(state, reward, done)
                if transition is not None:
                    self.memory.push(transition)

            # get action
            action, coords = self.get_action(state)

            self.sarsd.act(action)
            return FUNCTIONS.Move_screen("now", coords)
        else:
            return FUNCTIONS.select_army("select")

    def get_best_action(self, state):
        state = np.expand_dims(state, axis=0)  # convert to batch
        state = tf.constant(state, dtype=tf.float32)
        p = self.brain.P(state)
        argmax = tf.argmax(p, axis=1)
        coords = tf.unravel_index(argmax, state.shape[1:3])
        return argmax.numpy(), tf.squeeze(coords).numpy()

    def get_action(self, state):
        """tf.multinomial doesn't work in eager as of v1.8"""
        state = np.expand_dims(state, axis=0)  # convert to batch
        state = tf.constant(state, dtype=tf.float32)
        p = self.brain.P(state)
        p = p.numpy().squeeze()
        choice = np.random.choice(p.shape[0], p=p)
        coords = np.unravel_index(choice, state.shape[1:3])
        # np.unravel_index gives (y, x); Move_screen appears to expect (x, y)
        return choice, (coords[1], coords[0])

    def train(self):
        # if self.batch_size >= len(self.memory):
        #   return
        # print("training")
        # s_0, a_0, s_1, r_1, done = self.memory.sample(self.batch_size)
        s_0, a_0, s_1, r_1, done = self.memory.sample()
        self.brain.train(s_0=s_0,
                         a_0=a_0,
                         s_1=s_1,
                         r_t=r_1,
                         done=done,
                         num_actions=6 * 6)
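
The coordinate handling in get_action is the subtle part: the policy produces one probability per screen cell, np.random.choice picks a flat index, np.unravel_index maps it back to (row, column), that is (y, x), and the pair is swapped because Move_screen appears to expect its point as (x, y). A standalone illustration of that mapping (the 4x4 screen size is arbitrary):

import numpy as np

screen_shape = (4, 4)   # (height, width) = (y, x); real feature screens are larger
flat_choice = 9         # index sampled from the flattened policy output
y, x = np.unravel_index(flat_choice, screen_shape)
print(y, x)             # 2 1 -> row 2, column 1
coords = (x, y)         # swap before FUNCTIONS.Move_screen("now", coords)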