def testPolicy():
    """Sanity-check GreedyPolicy and LinearDecayGreedyEpsilonPolicy.

    Verifies that the greedy policy returns the argmax action, and that the
    linearly decaying epsilon policy decays by (start - end) / num_steps per
    call and then clamps at end_value.
    """
    num_actions = 6
    start_value = 1
    end_value = 0.1
    num_steps = 1000

    policy1 = GreedyPolicy()
    policy2 = LinearDecayGreedyEpsilonPolicy(
        num_actions, start_value, end_value, num_steps)

    q_values = np.array([1.0, 1.3, 1.2, 1.5, 1.1, 1.4])

    # Greedy policy must pick the argmax: index 3 (value 1.5).
    assert policy1.select_action(q_values) == 3

    # Epsilon starts at start_value; each call decays it by
    # (1 - 0.1) / 1000 = 0.0009.
    assert policy2.epsilon == 1
    policy2.select_action(q_values)
    assert np.isclose(policy2.epsilon, 0.9991)
    policy2.select_action(q_values)
    policy2.select_action(q_values)
    assert np.isclose(policy2.epsilon, 0.9973)

    # After num_steps further calls epsilon has reached end_value ...
    for _ in range(num_steps):
        policy2.select_action(q_values)
    assert np.isclose(policy2.epsilon, end_value)

    # ... and stays clamped there on subsequent calls.
    policy2.select_action(q_values)
    assert np.isclose(policy2.epsilon, end_value)
# --- Start of an episode: reset the frame history and build the first
# --- network input of shape (1, H, W, HISTORY_LENGTH).
history_store.reset()
state = atari_processor.state_for_nn(observation)
history_store.add_history(state)
nn_tmp = history_store.get_history()
nn_input = np.zeros(
    (1, IMAGE_SIZE[0], IMAGE_SIZE[1], HISTORY_LENGTH), dtype=float)
nn_input[0, :] = nn_tmp

episode_interaction_cnt = 0
flag_first = 0  # 0 until the first env step of this episode has been taken
done = False
while not done:
    # Interact with environment and store into memory.
    q_values = model_online.predict(nn_input)
    action = greedy_epsilon_linear_decay_selector.select_action(q_values)
    observation, reward, done, info = env.step(action)

    # Track episode boundaries by watching for a change in `info`
    # (presumably a life counter or similar — TODO confirm against the env).
    # BUG FIX: the original wrote `episode_end == False` (a no-op
    # comparison) instead of an assignment, so episode_end was never bound
    # on the first step of an episode.
    if flag_first == 0:
        info_prev = info
        episode_end = False
        flag_first = 1
    else:
        episode_end = info != info_prev
        info_prev = info

    reward = atari_processor.process_reward(reward)
    state_next = atari_processor.state_for_nn(observation)
    history_store.add_history(state_next)