def test_network(self):
    test_env = AtariEnvironment(self.rom)
    total_reward = 0.
    total_steps = 0.
    q_avg_total = 0.
    max_reward = 0.
    for ep in range(20):
        # Build the initial four-frame state: one reset frame plus
        # three frames gathered with random actions.
        obs1 = test_env.reset()
        obs2 = test_env.step(test_env.sample_action())[0]
        obs3 = test_env.step(test_env.sample_action())[0]
        obs4, _, done = test_env.step(test_env.sample_action())
        obs1, obs2, obs3, obs4 = (preprocess(obs1), preprocess(obs2),
                                  preprocess(obs3), preprocess(obs4))
        state = np.transpose([obs1, obs2, obs3, obs4], (1, 2, 0))
        episode_reward = 0.
        num_steps = 0.
        ep_q_total = 0.
        while not test_env.ale.game_over():
            # Evaluate with a small, fixed exploration rate (epsilon = 0.05).
            _, action, reward, new_state, obs1, obs2, obs3, obs4, Qval, done = \
                self.true_step(0.05, state, obs2, obs3, obs4, test_env)
            state = new_state
            episode_reward += reward
            num_steps += 1.
            ep_q_total += Qval
        max_reward = max(episode_reward, max_reward)
        ep_q_avg = ep_q_total / num_steps
        q_avg_total += ep_q_avg
        total_reward += episode_reward
        total_steps += num_steps
    # Average the per-episode statistics over the 20 evaluation episodes.
    avg_Q = q_avg_total / 20.
    avg_reward = total_reward / 20.
    avg_steps = total_steps / 20.
    print("Average Q-value: {}".format(avg_Q))
    print("Average episode reward: {}".format(avg_reward))
    print("Average number of steps: {}".format(avg_steps))
    print("Max reward over 20 episodes: {}".format(max_reward))
    return avg_Q, avg_reward, max_reward, avg_steps
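# `preprocess` is used throughout this listing but not defined in it. A
# minimal sketch under the standard DQN convention (Mnih et al., 2015):
# grayscale conversion plus downsampling to 84x84. The OpenCV calls here
# are an assumption for illustration, not necessarily the original
# implementation.
import cv2


def preprocess(obs):
    # Collapse the RGB channels to a single grayscale channel.
    gray = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
    # Downsample to the 84x84 input the convolutional network expects.
    return cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)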
def true_step(self, prob, state, obs2, obs3, obs4, env):
    # Forward pass: Q-values for the current four-frame state.
    q_vals = self.sess.run(
        self.convnet.output,
        feed_dict={self.convnet.input: [np.array(state)]})
    # Epsilon-greedy action selection: exploit with probability 1 - prob.
    if random.uniform(0, 1) > prob:
        step_action = q_vals.argmax()
    else:
        step_action = env.sample_action()
    # Anneal epsilon linearly toward the 0.1 floor.
    if prob > 0.1:
        prob -= 9.0e-7
    new_obs, step_reward, step_done = env.step(step_action)
    processed_obs = preprocess(new_obs)
    # Slide the frame window: drop the oldest frame, append the newest.
    new_state = np.transpose([obs2, obs3, obs4, processed_obs], (1, 2, 0))
    return (prob, step_action, step_reward, new_state,
            obs2, obs3, obs4, processed_obs, q_vals.max(), step_done)
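# The constants in true_step imply a linear exploration schedule: epsilon
# starts at 1.0 and loses 9.0e-7 per step, reaching the 0.1 floor after
# (1.0 - 0.1) / 9.0e-7 = 1,000,000 steps. A standalone sketch of that
# schedule (function name and signature are illustrative only):
def annealed_epsilon(step, start=1.0, floor=0.1, decay=9.0e-7):
    # Linear decay per environment step, clipped at the floor.
    return max(floor, start - decay * step)


print(annealed_epsilon(0))        # 1.0
print(annealed_epsilon(500000))   # 0.55, halfway through the schedule
print(annealed_epsilon(2000000))  # 0.1, clipped at the floor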
def true_step(self, prob, state, obs2, obs3, obs4, env):
    output = self.sess.run(
        self.answer_network.output,
        feed_dict={self.answer_network.input: [np.array(state)]})
    # The answer network's output vector carries the Q-values as its
    # trailing len(self.env.action_space) entries.
    Q_vals = output[0][-len(self.env.action_space):]
    if random.uniform(0, 1) > prob:
        step_action = Q_vals.argmax()
    else:
        step_action = env.sample_action()
    if prob > 0.1:
        prob -= 9.0e-7
    new_obs, step_reward, step_done = env.step(step_action)
    processed_obs = preprocess(new_obs)
    new_state = np.transpose([obs2, obs3, obs4, processed_obs], (1, 2, 0))
    return (prob, step_action, step_reward, new_state,
            obs2, obs3, obs4, processed_obs, Q_vals.max(), step_done)
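# Unlike the convnet variant above, this version slices the Q-values out
# of a longer output vector. A toy illustration of that indexing, with
# made-up numbers and a hypothetical action-space size:
import numpy as np

output = np.array([[0.3, 0.7, 1.2, 0.5, 2.0, 0.1]])  # batch of one
num_actions = 4                      # stands in for len(env.action_space)
Q_vals = output[0][-num_actions:]    # -> [1.2, 0.5, 2.0, 0.1]
print(Q_vals.argmax())               # 2, the greedy action index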
target_weights = dqn.sess.run(dqn.answer_network.weights)
episode_step_count = []
total_steps = 1.
prob = 1.0
learning_data = []
weight_average_array = []
loss_vals = []
episode_number = 0
while total_steps < 20000000:
    # Build the initial four-frame state, as in test_network.
    obs1 = dqn.env.reset()
    obs2 = dqn.env.step(dqn.env.sample_action())[0]
    obs3 = dqn.env.step(dqn.env.sample_action())[0]
    obs4, _, terminal = dqn.env.step(dqn.env.sample_action())
    obs1, obs2, obs3, obs4 = (preprocess(obs1), preprocess(obs2),
                              preprocess(obs3), preprocess(obs4))
    state = np.transpose([obs1, obs2, obs3, obs4], (1, 2, 0))
    steps = 0
    while not dqn.env.ale.game_over():
        prob, action, reward, new_state, obs1, obs2, obs3, obs4, _, terminal = \
            dqn.true_step(prob, state, obs2, obs3, obs4, dqn.env)
        dqn.update_replay_memory((state, action, reward, new_state, terminal))
        state = new_state
        # Train on a random minibatch every fourth step, once the replay
        # memory holds at least 1,000 transitions.
        if len(dqn.replay_memory) >= 1000 and total_steps % 4 == 0:
            minibatch = random.sample(dqn.replay_memory, BATCH_SIZE)
            expanded_minibatch = []
            for i in range(len(minibatch)):
                minibatch[i] = minibatch[i] + ([minibatch[i][2]],)
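# The fragment above breaks off before the training update itself. As a
# point of reference, a minimal sketch of the standard DQN target (Mnih
# et al., 2015) that such an update computes; `target_q`, `gamma`, and
# the tuple unpacking below are assumptions, not the original code:
import numpy as np


def compute_targets(minibatch, target_q, gamma=0.99):
    # Each transition is (state, action, reward, new_state, terminal).
    rewards = np.array([t[2] for t in minibatch])
    terminals = np.array([t[4] for t in minibatch], dtype=np.float32)
    new_states = np.array([t[3] for t in minibatch])
    # Bootstrap from the target network's best next-state value,
    # zeroing the bootstrap term for terminal transitions.
    max_next_q = target_q(new_states).max(axis=1)
    return rewards + gamma * (1.0 - terminals) * max_next_q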