Example #1
    def test3(self):
        exprep = exp_replay.ExpReplay(mem_size=100, state_size=[2, 2], kth=4)
        # Overfill the buffer: 120 steps into a 100-slot memory.
        for i in range(120):
            exprep.add_step(
                Step(cur_step=[[i, i], [i, i]],
                     action=0,
                     next_step=[[i + 1, i + 1], [i + 1, i + 1]],
                     reward=0,
                     done=False))
        # Only the newest 100 steps survive; the latest cur_step is 119.
        self.assertEqual(len(exprep.mem), 100)
        self.assertEqual(exprep.mem[-1:][0].cur_step, [[119, 119], [119, 119]])
        # With kth=4, the last state is the 4 most recent frames stacked depth-wise.
        last_state = exprep.get_last_state()

        self.assertEqual(np.shape(last_state), (2, 2, 4))
        self.assertTrue(
            np.array_equal(last_state[:, :, 0], [[116, 116], [116, 116]]))
        self.assertTrue(
            np.array_equal(last_state[:, :, 1], [[117, 117], [117, 117]]))
        self.assertTrue(
            np.array_equal(last_state[:, :, 2], [[118, 118], [118, 118]]))
        self.assertTrue(
            np.array_equal(last_state[:, :, 3], [[119, 119], [119, 119]]))

        sample = exprep.sample(5)
        self.assertEqual(len(sample), 5)
        self.assertEqual(np.shape(sample[0].cur_step), (2, 2, 4))
        self.assertEqual(np.shape(sample[0].next_step), (2, 2, 4))
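These tests build transitions through a Step record with the five fields seen above. A minimal sketch of that scaffolding, assuming a namedtuple (the field names come from the calls above; the real exp_replay module may define Step differently, and the tests also assume numpy as np and a unittest.TestCase subclass around the methods):

    from collections import namedtuple

    # Record of one transition, with the fields the tests pass in.
    Step = namedtuple('Step',
                      ['cur_step', 'action', 'next_step', 'reward', 'done'])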
Example #2
    def test1(self):
        exprep = exp_replay.ExpReplay(mem_size=100, state_size=[1], kth=1)
        for i in range(120):
            exprep.add_step(
                Step(cur_step=i,
                     action=0,
                     next_step=i + 1,
                     reward=0,
                     done=False))
        # The buffer caps at mem_size; the oldest 20 steps are dropped.
        self.assertEqual(len(exprep.mem), 100)
        self.assertEqual(exprep.mem[-1:][0].cur_step, 119)
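test1 pins down the bounded-FIFO contract: after 120 insertions into a 100-slot memory, only the newest 100 steps remain. A deque with maxlen reproduces the same behavior (an illustration of the asserted contract, not the library's internals):

    from collections import deque

    mem = deque(maxlen=100)   # oldest entries fall off automatically
    for i in range(120):
        mem.append(i)
    assert len(mem) == 100 and mem[-1] == 119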
Example #3
    def test4(self):
        # kth=-1 means get_last_state() returns the raw most recent state.
        exprep = exp_replay.ExpReplay(mem_size=100, state_size=[4], kth=-1)
        for i in range(120):
            exprep.add_step(
                Step(cur_step=[i, i, i, i],
                     action=0,
                     next_step=[i + 1, i + 1, i + 1, i + 1],
                     reward=0,
                     done=False))
        last_state = exprep.get_last_state()
        self.assertEqual(np.shape(last_state), (4, ))
        self.assertTrue(np.array_equal(last_state, [119, 119, 119, 119]))

        sample = exprep.sample(5)
        self.assertEqual(len(sample), 5)
        self.assertEqual(np.shape(sample[0].cur_step), (4, ))
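Taken together, test3 and test4 pin down get_last_state(): with kth=k > 0 the k most recent frames are stacked along a trailing axis, oldest first; with kth=-1 the raw most recent state comes back. A sketch inferred from the assertions above (an assumption about the behavior, not the library's source):

    import numpy as np

    def last_state(frames, kth):
        # frames: list of per-step observations, newest last
        if kth == -1:
            return np.asarray(frames[-1])        # raw state, shape (4,) in test4
        return np.stack(frames[-kth:], axis=-1)  # shape (2, 2, 4) in test3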
Example #4
                     reward=reward,
                     done=done))
            cur_state = next_state
            if t == MAX_STEPS - 1:
                print("Episode {} finished after {} timesteps".format(
                    i, t + 1))
                yield t + 1
        agent.epsilon_decay()
        agent.learn_epoch(exprep, EPOCH_SIZE)
    print('epsilon: {}'.format(agent.epsilon))


env = gym.make('CartPole-v0')
exprep = exp_replay.ExpReplay(mem_size=MEM_SIZE,
                              start_mem=START_MEM,
                              state_size=STATE_SIZE,
                              kth=-1,
                              batch_size=BATCH_SIZE)

sess = tf.Session()
with tf.device('/{}:0'.format(DEVICE)):
    agent = dqn.DQNAgent(session=sess,
                         epsilon=EPSILON,
                         epsilon_anneal=EPSILON_DECAY,
                         end_epsilon=END_EPSILON,
                         lr=LEARNING_RATE,
                         gamma=DISCOUNT_FACTOR,
                         state_size=4,
                         action_size=len(ACTIONS),
                         n_hidden_1=10,
                         n_hidden_2=10)
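One step is still needed before training with a TF1-style graph like this: the variables must be initialized in the session. Assuming dqn.DQNAgent builds its graph in the constructor, the usual follow-up is:

    sess.run(tf.global_variables_initializer())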