Example #1
    def test_network(self):
        # Evaluate the current network for 20 episodes with a fixed 5% random-action rate.
        test_env = AtariEnvironment(self.rom)
        total_reward = 0.
        total_steps = 0.
        q_avg_total = 0.
        max_reward = 0.
        for ep in range(20):
            obs1 = test_env.reset()
            obs2 = test_env.step(test_env.sample_action())[0]
            obs3 = test_env.step(test_env.sample_action())[0]
            obs4, _, done = test_env.step(test_env.sample_action())
            obs1, obs2, obs3, obs4 = (preprocess(obs1), preprocess(obs2),
                                      preprocess(obs3), preprocess(obs4))
            # stack the four most recent frames into a channels-last state
            state = np.transpose([obs1, obs2, obs3, obs4], (1, 2, 0))
            episode_reward = 0.
            num_steps = 0.
            ep_q_total = 0.
            while not test_env.ale.game_over():
                # epsilon is pinned to 0.05 during evaluation
                _, action, reward, new_state, obs1, obs2, obs3, obs4, Qval, done = \
                    self.true_step(0.05, state, obs2, obs3, obs4, test_env)
                state = new_state
                episode_reward += reward
                num_steps += 1.
                ep_q_total += Qval
            max_reward = max(episode_reward, max_reward)
            ep_q_avg = ep_q_total / num_steps
            q_avg_total += ep_q_avg
            total_reward += episode_reward
            total_steps += num_steps

        avg_Q = q_avg_total / 20.
        avg_reward = total_reward / 20.
        avg_steps = total_steps / 20.
        print("Average Q-value: {}".format(avg_Q))
        print("Average episode reward: {}".format(avg_reward))
        print("Average number of steps: {}".format(avg_steps))
        print("Max reward over 20 episodes: {}".format(max_reward))

        return avg_Q, avg_reward, max_reward, avg_steps
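
Example #1 calls a preprocess() helper that is not shown here. A minimal sketch of what it might look like, assuming the usual DQN convention of an 84x84 grayscale frame (the implementation and output size are assumptions, not taken from the code above):

import cv2
import numpy as np

def preprocess(frame):
    # Assumed helper: convert the RGB Atari frame to grayscale and downsample to 84x84.
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    return cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
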
Example #2
    def true_step(self, prob, state, obs2, obs3, obs4, env):
        # Epsilon-greedy step: compute Q-values for the current stacked state,
        # choose an action, and return the next state built from the newest frame.
        q_vals = self.sess.run(
            self.convnet.output,
            feed_dict={self.convnet.input: [np.array(state)]})
        if random.uniform(0, 1) > prob:
            step_action = q_vals.argmax()
        else:
            step_action = env.sample_action()

        # linearly anneal the exploration probability toward its 0.1 floor
        if prob > 0.1:
            prob -= 9.0e-7

        new_obs, step_reward, step_done = env.step(step_action)

        processed_obs = preprocess(new_obs)
        new_state = np.transpose([obs2, obs3, obs4, processed_obs], (1, 2, 0))

        return (prob, step_action, step_reward, new_state,
                obs2, obs3, obs4, processed_obs, q_vals.max(), step_done)
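
The np.transpose call in true_step stacks the three previous frames and the newest one into a channels-last state. A small standalone check, assuming 84x84 frames (the frame size is an assumption, consistent with the preprocess sketch above):

import numpy as np

frames = [np.zeros((84, 84), dtype=np.uint8) for _ in range(4)]  # four consecutive frames
state = np.transpose(frames, (1, 2, 0))                          # stack along the last axis
print(state.shape)  # (84, 84, 4): height, width, frame history
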
Example #3
    def true_step(self, prob, state, obs2, obs3, obs4, env):
        # Variant of the step above: the network's output vector ends with one
        # Q-value per action, so slice the tail of the output to recover them.
        output = self.sess.run(
            self.answer_network.output,
            feed_dict={self.answer_network.input: [np.array(state)]})
        Q_vals = output[0][-len(self.env.action_space):]
        if random.uniform(0, 1) > prob:
            step_action = Q_vals.argmax()
        else:
            step_action = env.sample_action()

        if prob > 0.1:
            prob -= 9.0e-7

        new_obs, step_reward, step_done = env.step(step_action)

        processed_obs = preprocess(new_obs)
        new_state = np.transpose([obs2, obs3, obs4, processed_obs], (1, 2, 0))

        return (prob, step_action, step_reward, new_state,
                obs2, obs3, obs4, processed_obs, Q_vals.max(), step_done)
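
Both true_step variants decrement the exploration probability by 9.0e-7 per call until it reaches the 0.1 floor. Starting from prob = 1.0, as the training loop in Example #4 below does, a quick check of how long the annealing takes:

start_eps, final_eps, decay_per_step = 1.0, 0.1, 9.0e-7
steps_to_floor = (start_eps - final_eps) / decay_per_step
print(round(steps_to_floor))  # 1000000 steps before epsilon settles at 0.1
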
Example #4
# Main training loop: act in the environment, store transitions in replay memory,
# and sample minibatches for updates once the memory is warm.
target_weights = dqn.sess.run(dqn.answer_network.weights)
episode_step_count = []
total_steps = 1.
prob = 1.0  # exploration probability, annealed inside true_step
learning_data = []
weight_average_array = []
loss_vals = []
episode_number = 0

while total_steps < 20000000:
    obs1 = dqn.env.reset()
    obs2 = dqn.env.step(dqn.env.sample_action())[0]
    obs3 = dqn.env.step(dqn.env.sample_action())[0]
    obs4, _, terminal = dqn.env.step(dqn.env.sample_action())
    obs1, obs2, obs3, obs4 = (preprocess(obs1), preprocess(obs2),
                              preprocess(obs3), preprocess(obs4))
    state = np.transpose([obs1, obs2, obs3, obs4], (1, 2, 0))
    steps = 0

    while not dqn.env.ale.game_over():
        prob, action, reward, new_state, obs1, obs2, obs3, obs4, _, terminal = dqn.true_step(
            prob, state, obs2, obs3, obs4, dqn.env)
        dqn.update_replay_memory((state, action, reward, new_state, terminal))
        state = new_state

        # once the replay memory holds at least 1,000 transitions, train every 4th step
        if len(dqn.replay_memory) >= 1000 and total_steps % 4 == 0:
            minibatch = random.sample(dqn.replay_memory, BATCH_SIZE)
            expanded_minibatch = []
            for i in range(len(minibatch)):
                # append the reward, wrapped in a list, to each sampled transition
                minibatch[i] = minibatch[i] + ([minibatch[i][2]], )
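
Example #4 relies on dqn.update_replay_memory and dqn.replay_memory, which are not part of the excerpt. A minimal sketch of a bounded replay memory that would support those calls; the class name, the deque choice, and the one-million-transition cap are assumptions, not taken from the original code:

import random
from collections import deque

REPLAY_CAPACITY = 1000000  # assumed cap; the original limit is not shown

class ReplayMemory:
    def __init__(self):
        # oldest transitions are evicted automatically once the cap is reached
        self.replay_memory = deque(maxlen=REPLAY_CAPACITY)

    def update_replay_memory(self, transition):
        # transition = (state, action, reward, new_state, terminal)
        self.replay_memory.append(transition)

    def sample(self, batch_size):
        # uniform random minibatch, as in the loop above
        return random.sample(self.replay_memory, batch_size)
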