def run_task(evaluation_config, network_config, reinforce_config):
    """Train a DQNAdaptive agent to pick which tower quadrant to attack,
    then replay recorded test episodes with bar-chart/saliency explanations.

    TensorBoard summaries are written under
    ``evaluation_config.summaries_path`` ("/train" and "/test").
    NOTE(review): formatting below is reconstructed from a collapsed
    one-line source; nesting of some statements is inferred.
    """
    env = TowerExample()
    max_episode_steps = 10000  # NOTE(review): never used in this function
    state = env.reset()
    # Quadrant action ids: Bottom-Right, Bottom-Left, Top-Right, Top-Left.
    TOWER_BR, TOWER_BL, TOWER_TR, TOWER_TL = [1, 2, 3, 4]
    choose_tower = DQNAdaptive(
        name="tower",
        choices=[TOWER_BR, TOWER_BL, TOWER_TR, TOWER_TL],
        network_config=network_config,
        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = tf.summary.FileWriter(training_summaries_path)

    #Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        episode_summary = tf.Summary()
        # Time the network prediction and the environment step separately
        # for the debug log below.
        start_time = time.time()
        tower_to_kill, _ = choose_tower.predict(state.state)
        end_time = time.time()
        action = env.new_action()
        env_start_time = time.time()
        action.attack_quadrant(tower_to_kill)
        # One attack decides the whole episode on this map — presumably the
        # env runs to termination after the quadrant choice; verify.
        state = env.act(action)
        counter = 0
        choose_tower.reward(state.reward)
        total_reward += state.reward
        if state.is_terminal():
            logger.info("End Episode of episode %d!" % (episode + 1))
            logger.info("Total Reward %d!" % (total_reward))
        env_end_time = time.time()
        logger.debug("Counter: %d" % counter)
        logger.debug("Neural Network Time: %.2f" % (end_time - start_time))
        logger.debug("Env Time: %.2f" % (env_end_time - env_start_time))
        choose_tower.end_episode(state.state)
        episode_summary.value.add(tag="Reward", simple_value=total_reward)
        train_summary_writer.add_summary(episode_summary, episode + 1)
        train_summary_writer.flush()

    # Freeze the policy before evaluation.
    choose_tower.disable_learning()

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = tf.summary.FileWriter(test_summaries_path)
    choose_tower.explanation = True
    explanation = Explanation("Tower Capture", (40, 40))
    chart = BarChart("Move Explanation", "Actions", "QVal By Reward Type")
    layer_names = ["HP", "Type 1", "Type 2", "Type 3", "Friend", "Enemy"]

    #Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        episode_summary = tf.Summary()
        # With explanation enabled, predict also returns per-action
        # q-values and saliency maps.
        tower_to_kill, q_values, saliencies = choose_tower.predict(state.state)
        choices = env.actions()['actions']
        # Build one bar group per available action, attaching saliency
        # layers under both the plain and "_Overall" keys.
        for choice, action_value in choices.items():
            key = choice
            explanation.add_layers(layer_names, saliencies[action_value - 1], key=key)
            group = BarGroup("Attack {}".format(choice), saliency_key=key)
            key = choice + "_Overall"
            explanation.add_layers(layer_names, saliencies[action_value - 1], key=key)
            bar = Bar("Attack {}".format(choice), q_values[action_value - 1], saliency_key=key)
            group.add_bar(bar)
            chart.add_bar_group(group)
        explanation.with_bar_chart(chart)
        action = env.new_action()
        action.attack_quadrant(tower_to_kill)
        # Only skip frame rendering when not visualizing.
        action.skip = False if evaluation_config.render else True
        state = env.act(action, explanation=explanation)
        # Idle (no-op) until the episode terminates so the recording and
        # visualization can play out.
        while not state.is_terminal():
            time.sleep(0.5)
            action = env.new_action()
            action.skip = False
            state = env.act(action, explanation=explanation)
        total_reward += state.reward
        time.sleep(10)
        if state.is_terminal():
            logger.info("End Episode of episode %d!" % (episode + 1))
            logger.info("Total Reward %d!" % (total_reward))
        episode_summary.value.add(tag="Reward", simple_value=total_reward)
        test_summary_writer.add_summary(episode_summary, episode + 1)
        test_summary_writer.flush()
def run_task(evaluation_config, network_config, reinforce_config):
    """Train an HRAAdaptive agent on the "multi_step" TowerExample map,
    then run visualized/recorded test episodes with decomposed-Q bar
    charts and per-reward-type saliency layers.

    NOTE(review): formatting below is reconstructed from a collapsed
    one-line source; nesting of some statements is inferred.
    """
    env = TowerExample("multi_step")
    reward_types = sorted(env.reward_types())
    decomposed_rewards = {}
    # NOTE(review): `type` shadows the builtin, and `decomposed_rewards`
    # is never read after this initialization — looks vestigial.
    for type in reward_types:
        decomposed_rewards[type] = 0
    state = env.reset()
    actions = env.actions()['actions']
    # Sort (description, id) pairs by action id so descriptions and ids
    # stay index-aligned.
    actions = sorted(actions.items(), key=operator.itemgetter(1))
    choice_descriptions = list(map(lambda x: x[0], actions))
    choices = list(map(lambda x: x[1], actions))

    # Configure network for reward type
    # One fully-connected head per decomposed reward type.
    networks = []
    for reward_type in reward_types:
        name = reward_type
        layers = [{"type": "FC", "neurons": 50}]
        networks.append({"name": name, "layers": layers})
    network_config.networks = networks

    choose_tower = HRAAdaptive(name="tower",
                               choices=choices,
                               reward_types=reward_types,
                               network_config=network_config,
                               reinforce_config=reinforce_config)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        step = 1
        while not state.is_terminal():
            step += 1
            (tower_to_kill, q_values,
             combined_q_values) = choose_tower.predict(state.state.flatten())
            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True  # no rendering during training
            state = env.act(action)
            # Feed each decomposed reward to its corresponding head.
            for reward_type, reward in state.typed_reward.items():
                choose_tower.reward(reward_type, reward)
                total_reward += reward
        choose_tower.end_episode(state.state.flatten())
        logger.debug("Episode %d : %d, Step: %d" % (episode + 1, total_reward, step))

    # Freeze the policy before evaluation.
    choose_tower.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        layer_names = [
            "HP", "Agent Location", "Small Towers", "Big Towers", "Friend",
            "Enemy"
        ]
        saliency_explanation = Saliency(choose_tower)
        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        step = 0
        while not state.is_terminal():
            step += 1
            explanation = SkyExplanation("Tower Capture", (40, 40))
            (tower_to_kill, q_values,
             combined_q_values) = choose_tower.predict(state.state.flatten())
            # Convert torch tensors to numpy for charting — presumably
            # these are torch tensors given .data.numpy(); verify.
            q_values = q_values.data.numpy()
            combined_q_values = combined_q_values.data.numpy()
            saliencies = saliency_explanation.generate_saliencies(
                step, state.state.flatten(),
                choice_descriptions, layer_names,
                reshape=state.state.shape)
            decomposed_q_chart = BarChart("Q Values", "Actions", "QVal By Reward Type")
            # One bar group per action; one bar per reward type, each with
            # its own saliency layer set keyed "{choice}_{reward_type}".
            for choice_idx, choice in enumerate(choices):
                key = choice_descriptions[choice_idx]
                group = BarGroup("Attack {}".format(key), saliency_key=key)
                explanation.add_layers(layer_names, saliencies["all"], key)
                for reward_index, reward_type in enumerate(reward_types):
                    key = "{}_{}".format(choice, reward_type)
                    bar = Bar(reward_type,
                              q_values[reward_index][choice_idx],
                              saliency_key=key)
                    group.add_bar(bar)
                    explanation.add_layers(layer_names, saliencies[reward_type], key=key)
                decomposed_q_chart.add_bar_group(group)
            explanation.with_bar_chart(decomposed_q_chart)
            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True
            state = env.act(action, explanation=explanation)
            time.sleep(0.5)
            total_reward += state.reward
        logger.info("End Episode of episode %d with %d steps" % (episode + 1, step))
        logger.info("Total Reward %d!" % (total_reward))
# Interactive/demo script: inspect the TowerExample env and build a fake
# saliency explanation. NOTE(review): this chunk is truncated in the
# visible source — the body of the final `for quad ...` loop is missing.
from scaii.env.sky_rts.env.scenarios.tower_example import TowerExample
from scaii.env.explanation import Explanation, BarChart, BarGroup, Bar
import numpy as np


def invert_actions(env):
    """Return the env's action mapping inverted: {action_id: action_name}."""
    out = dict([])
    for k, v in env.actions()['actions'].items():
        out[v] = k
    return out


env = TowerExample()
print("Possible reward types:", env.reward_types())
print("Possible actions:", env.actions())
print("Action description", env.action_desc())
actions = invert_actions(env)
s = env.reset(record=True)
print("acting")
act = env.new_action()
explanation = Explanation("Fake Random Saliency Info", layer_shape=(40, 40))
chart = BarChart("Move Explanation", "Actions", "QVal By Reward Type")
layer_names = ["HP", "Type 1", "Type 2", "Type 3", "Friend", "Enemy"]
# Track the best quadrant/value while iterating — loop body not visible
# in this chunk (truncated).
max_quad = 0
max_value = -np.inf
for quad in range(1, 5):
# Smoke-test script: run two episodes of TowerExample, always attacking
# quadrant 2, and assert the episode terminates after a single action.
from scaii.env.sky_rts.env.scenarios.tower_example import TowerExample
import numpy as np

env = TowerExample()
print("Possible reward types:", env.reward_types())
print("Possible actions:", env.actions())
print("Action description", env.action_desc())
for i in range(0, 2):
    print("episode", i)
    s = env.reset()
    print("acting")
    act = env.new_action()
    act.attack_quadrant(2)
    s = env.act(act)
    # On this map one attack should end the episode, so this loop body is
    # expected to be unreachable; the no-op stepping after the raise is
    # dead code kept from an earlier version — presumably; verify.
    while not s.is_terminal():
        raise Exception("Should not get in loop")
        noop = env.new_action()
        s = env.act(noop)
    print("Reward is:", s.reward, "Terminal?:", s.is_terminal())
    print("With types:", s.typed_reward)
def run_task(evaluation_config, network_config, reinforce_config):
    """Train an HRAAdaptive "Tower" agent, then run explained test episodes.

    TensorBoard summaries are written under
    ``evaluation_config.summaries_path`` ("/train" and "/test").

    Parameters
    ----------
    evaluation_config : object with ``training_episodes``, ``test_episodes``,
        ``summaries_path`` and ``render`` attributes.
    network_config, reinforce_config : forwarded to ``HRAAdaptive``.
    """
    env = TowerExample()
    reward_types = sorted(env.reward_types())

    state = env.reset()
    actions = env.actions()['actions']
    # Sort (description, id) pairs by action id so the description and
    # choice lists stay index-aligned.
    actions = sorted(actions.items(), key=operator.itemgetter(1))
    choice_descriptions = list(map(lambda x: x[0], actions))
    choices = list(map(lambda x: x[1], actions))

    choose_tower = HRAAdaptive(name="Tower",
                               choices=choices,
                               reward_types=reward_types,
                               network_config=network_config,
                               reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = tf.summary.FileWriter(training_summaries_path)

    #Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        episode_summary = tf.Summary()
        step = 1
        while not state.is_terminal():
            step += 1
            tower_to_kill, q_values = choose_tower.predict(state.state)
            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True  # no rendering during training
            state = env.act(action)
            # Feed each decomposed reward to its head; the combined scalar
            # reward is accumulated once per step.
            for reward_type, reward in state.typed_reward.items():
                choose_tower.reward(reward_type, reward)
            total_reward += state.reward
        choose_tower.end_episode(state.state)
        logger.info("Episode %d : %d, Step: %d" % (episode + 1, total_reward, step))
        episode_summary.value.add(tag="Train/Reward", simple_value=total_reward)
        train_summary_writer.add_summary(episode_summary, episode + 1)
        train_summary_writer.flush()

    # Freeze the policy before evaluation.
    choose_tower.disable_learning()

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = tf.summary.FileWriter(test_summaries_path)

    #Test Episodes
    for episode in range(evaluation_config.test_episodes):
        contrastive = True
        explanation = SkyExplanation("Tower Capture", (40, 40))
        layer_names = [
            "HP", "Agent Location", "Small Towers", "Big Towers", "Friend",
            "Enemy"
        ]
        adaptive_explanation = Explanation(choose_tower)
        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        episode_summary = tf.Summary()
        # BUG FIX: `step` was never reset or incremented in the test loop,
        # so the log line below reported the stale count left over from the
        # training loop (or raised NameError when training_episodes == 0).
        step = 0
        while not state.is_terminal():
            step += 1
            tower_to_kill, q_values = choose_tower.predict(state.state)
            combined_q_values = np.sum(q_values, axis=0)
            saliencies = adaptive_explanation.generate_saliencies(
                state.state, contrastive)
            decomposed_q_chart = BarChart("Q Values", "Actions", "QVal By Reward Type")
            # One bar group per action; one bar per reward type, each with
            # saliency layers keyed "{choice}_{reward_type}".
            for choice_idx, choice in enumerate(choices):
                key = choice_descriptions[choice_idx]
                explanation.add_layers(layer_names, saliencies[choice]["all"], key=key)
                group = BarGroup("Attack {}".format(key), saliency_key=key)
                for reward_index, reward_type in enumerate(reward_types):
                    key = "{}_{}".format(choice, reward_type)
                    bar = Bar(reward_type,
                              q_values[reward_index][choice_idx],
                              saliency_key=key)
                    explanation.add_layers(layer_names, saliencies[choice][reward_type], key=key)
                    group.add_bar(bar)
                decomposed_q_chart.add_bar_group(group)
            explanation.with_bar_chart(decomposed_q_chart)
            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            # Only skip frame rendering when not visualizing.
            action.skip = not evaluation_config.render
            state = env.act(action, explanation=explanation)
            time.sleep(0.5)
            total_reward += state.reward
        logger.info("Episode %d : %d, Step: %d" % (episode + 1, total_reward, step))
        episode_summary.value.add(tag="Test/Episode Reward", simple_value=total_reward)
        test_summary_writer.add_summary(episode_summary, episode + 1)
        test_summary_writer.flush()
def state_to_pixels(state):
    """Return the state's feature planes as a CHW array scaled to [0, 255].

    BUG FIX: ``transpose`` returns a *view* into ``state.state``, so the
    in-place HP rescale below used to mutate the caller's state array.
    Copy first so the input is left untouched.
    """
    pixels = state.state.transpose((2, 0, 1)).copy()
    # HP ranges from [0,.13], rescale it here
    pixels[0] *= 6
    # Rescale the full image to [0,255]
    return pixels * 255


def state_to_reward(state):
    """Return a plain dict copy of the state's decomposed (typed) rewards."""
    return dict(state.typed_reward)


# Capture COUNT episodes: save the initial frame as a PNG, then take one
# random quadrant attack and print the decomposed reward it earned.
# (COUNT, action_names, tqdm and imutil are defined/imported elsewhere.)
for i in tqdm(range(COUNT)):
    #env = TowerExample(map_name="multi_step")
    env = TowerExample(map_name="tower_example")
    state = env.reset(record=False)
    pixels = state_to_pixels(state)
    filename = 'towers/images/{:09d}.png'.format(i)
    imutil.show(pixels, filename=filename, normalize_color=False)
    # Compute reward for one randomly-selected action
    tower_id = random.choice(range(1, 5))
    act = env.new_action()
    act.attack_quadrant(tower_id)
    next_state = env.act(act)
    print('Took action {}, got reward {}'.format(action_names[tower_id], state_to_reward(next_state)))