def visualize(agent: SMARTAgent, ep: int, ts: int):
    images = []
    for seed in seeds:
        if seed is not None:
            env.seed(seed)
        state = env.reset()
        goal_point = find(state, 'Goal')
        option = Option(goal_point, depth=0)
        agent.reset(env, option, random_seed=3)
        visualize_decision(agent, state, writer, f'likelihoods: {seed}', ep, ts)
        images.append(env.render('rgb_array'))
        done = False
        while not done:
            action = agent.act(state, option)
            state, reward, done, _ = env.step(action)
            options = _get_option_tree_(agent)
            print(f"@{onehot2directedpoint(state)} : {reward} => {options}")
            rendered = _render_options_(env.render('rgb_array'), options)
            images.append(rendered)
    gif = np.stack(images, 0)  # np.ndarray [t, imx, imy, 3]
    gif_tensor: torch.Tensor = torch.from_numpy(gif).type(
        torch.uint8).unsqueeze(0)  # torch.Tensor[uint8] [1, t, imx, imy, 3]
    gif_tensor = gif_tensor.permute(0, 1, 4, 2, 3)
    writer.add_video('sample trajectory', gif_tensor, global_step=ts)
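# The visualize/test helpers above close over `seeds`, `env`, and `writer`.
# A minimal sketch of that assumed module-level context follows; the seed
# list, log directory, and environment wiring are illustrative assumptions,
# not taken from the original script.
from torch.utils.tensorboard import SummaryWriter

seeds = list(range(5))                        # evaluation seeds iterated above
writer = SummaryWriter(log_dir='runs/smart')  # receives videos, figures and scalars
# `env` is assumed to be a gym-style environment exposing seed/reset/step/render.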
def test(agent: SMARTAgent, ep, ts):
    rewards = [0] * len(seeds)
    for i, seed in enumerate(seeds):
        env.seed(seed)
        state = env.reset()
        goal_point = find(state, 'Goal')
        option = Option(goal_point, depth=0)
        agent.reset(env, option, random_seed=3)
        done = False
        while not done:
            action = agent.act(state)
            state, reward, done, info = env.step(action)
            rewards[i] += reward  # index by position, not by seed value
    for i, seed in enumerate(seeds):
        writer.add_scalar(f"Test Reward: {seed}", rewards[i], global_step=ts)
def visualize_decision(agent: SMARTAgent, state, writer: SummaryWriter, tag: str,
                       ep: int = None, ts: int = None) -> None:
    prev_option = agent._prev_option_()
    parent_option = agent.current_option_node.value
    possibilities = agent.generator.generate(state, prev_option, parent_option)
    probabilities = agent.evaluator._selection_probabilities_(
        state, possibilities, prev_option, parent_option)
    xx, yy, zz = make_mesh_grid(possibilities, probabilities)
    fig = render_mesh(xx, yy, zz)
    writer.add_figure(tag, fig, global_step=ts)
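# A minimal sketch of the mesh helpers used above, assuming each possibility
# carries a 2D grid coordinate in .value and `probabilities` is a flat array
# aligned with `possibilities`. These are illustrative stand-ins, not the
# project's actual implementations.
import numpy as np
import matplotlib.pyplot as plt


def make_mesh_grid(possibilities, probabilities):
    xs = np.asarray([p.value[0] for p in possibilities])
    ys = np.asarray([p.value[1] for p in possibilities])
    xx, yy = np.meshgrid(np.arange(xs.max() + 1), np.arange(ys.max() + 1))
    zz = np.zeros_like(xx, dtype=np.float64)
    zz[ys, xs] = np.asarray(probabilities)  # scatter probabilities onto the grid
    return xx, yy, zz


def render_mesh(xx, yy, zz):
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    ax.plot_surface(xx, yy, zz)  # selection-probability surface over the grid
    return fig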
fig = plt.figure()
images = []
low_level_agent: IAgent = BacktrackingMazeAgent(env)
low_level_agent: IAgent = Grid2PointWrapper(low_level_agent)
evaluator: IEvaluator = GridworldEvaluator(XDIMS + 2, YDIMS + 2, settings, gamma=0.99)
generator: IGenerator = SimpleGridworldGenerator()
fulfils_goal = lambda state, goal: array_equal(state[:, :, -1], goal[:, :, 0])
goal_manager: IGoalManager = SimpleGoalManager(evaluator, generator, 1, fulfils_goal)
memory: IMemory = CompleteMemory(100, 3)
agent = SMARTAgent(goal_manager, low_level_agent, memory)
totals = []
step: int = 0
for iter, seed in enumerate([0] * 500):
    total_reward: int = 0
    print(f"================={seed}=================")
    env = MazeWorld(cache._get_cached_board(seed))
    state, goal = env.reset(3)
    agent.reset(env, state, goal)
    done = False
    states: List[State] = [state]
low_level_agent = MinigridBacktrackingAgent()
shape = env.observation_space.shape
shape = (-1, shape[-1], shape[0], shape[1])
v_model = VModel(shape, 32, 2, device=settings['device'])
q_model = QModel(shape, 32, 2, device=settings['device'])
planning_terminator = DepthPlanningTerminator(max_depth=settings['max_depth'])
evaluator = Evaluator(v_model, q_model, planning_terminator, settings,
                      get_beta=lambda step: 3, gamma=0.99)
generator = SimpleMinigridGenerator()
memory = CompleteMemory(max_length=100000)


def goal_met(s, o):
    agent_loc: np.ndarray = s[:, :, 8]  # imx, imy, onehot
    agent_loc = np.unravel_index(np.argmax(agent_loc), agent_loc.shape)
    return np.all(agent_loc == o.value)


policy_terminator = StrictGoalTerminator(goal_met)
agent = SMARTAgent(
    evaluator, generator, planning_terminator,
    policy_terminator=policy_terminator,
    low_level=low_level_agent,
    memory=memory,
    settings=settings)
testfn = training.make_simple_minigrid_test(env, writer, range(5))
vizfn = training.make_visualize(env, writer, range(5))
training.train(agent, env, settings, testfn=testfn, vizfn=vizfn)
training.summarize(agent, env, settings, list(range(10)), writer)
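# Illustrative check of the shape convention used above: MiniGrid observations
# come out channel-last (H, W, C), while VModel/QModel are assumed to expect
# batched channel-first tensors matching (-1, C, H, W). The concrete sizes
# below are assumptions for demonstration only.
import numpy as np
import torch

obs = np.zeros((7, 7, 20), dtype=np.float32)  # one (H, W, C) observation
batch = torch.from_numpy(obs).unsqueeze(0)    # (1, H, W, C)
batch = batch.permute(0, 3, 1, 2)             # (1, C, H, W)
assert batch.shape == (1, 20, 7, 7)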
                      gamma=0.99)
generator = SimpleMinigridGenerator()
memory = CompleteMemory(max_length=100000)


def goal_met(s, o):
    agent_loc: np.ndarray = s[:, :, 8]  # imx, imy, onehot
    agent_loc = np.unravel_index(np.argmax(agent_loc), agent_loc.shape)
    return np.all(agent_loc == o.value)


policy_terminator = StrictGoalTerminator(goal_met)
agent = SMARTAgent(evaluator, generator, planning_terminator,
                   policy_terminator=policy_terminator,
                   low_level=low_level_agent,
                   memory=memory,
                   settings=settings)


def visualize(rgb_array, options):
    # tiles of size 32 x 32
    for option in options:
        tile_ur: np.ndarray = option.value.astype(np.int32) * 32
        tile_ur = tile_ur[::-1]
        for y in range(4, 28):
            for x in range(4, 28):
                new_x, new_y = tile_ur + np.asarray([x, y])
                colors = [[255, 0, 0], [255, 102, 102], [255, 128, 0],
get_beta = lambda step: 0.001 * step
evaluator: IEvaluator = SimpleMazeworldEvaluator(planning_terminator, v_model, q_model,
                                                 settings, get_beta, gamma=0.99)
generator: IGenerator = SimpleMazeworldGenerator()
low_level: IOptionBasedAgent = BacktrackingMazeAgent(env)
memory: IMemory = CompleteMemory(max_length=100, random_seed=settings['random'])
agent: SMARTAgent = SMARTAgent(evaluator, generator, planning_terminator,
                               policy_terminator, low_level, memory, settings)
step: int = 0
images = []
for seed in [0] * 500:
    env = MazeWorld(cache._get_cached_board(seed))
    total_reward: int = 0
    t: int = 0
    done: bool = False
    state, goal = env.reset(3)
    goal = Option(goal, 0)
    states: List[State] = [state]  # wrap in a list to match the annotation, as in the gridworld script
    agent.reset(env, goal)