def observe(self, experience):
    """Updates internal state counts based on observing this experience.

    Args:
        experience (Experience)
    """
    next_state = AS.AbstractState(experience.next_state)
    self._state_counts[next_state] += 1
def _step(self, action):
    state, env_reward, done, info = self.env.step(action)
    info["env reward"] = env_reward
    abstract_state = AS.AbstractState(state)
    new_reward = sum(
        rule(self._prev_abstract_state, abstract_state)
        for rule in self._reward_rules)
    self._prev_abstract_state = abstract_state
    return state, new_reward, done, info
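A reward rule here is presumably any callable that takes the previous and current AbstractState and returns a float. A minimal sketch of one such rule, assuming a hypothetical `room_change_bonus` and that AbstractState exposes `room_number` (the rule name and bonus value are illustrative, not from the original source):

def room_change_bonus(prev_abstract_state, abstract_state, bonus=1.0):
    # Hypothetical rule: pays a bonus whenever the agent changes rooms
    # between consecutive abstract states.
    if prev_abstract_state is None:
        return 0.0
    if prev_abstract_state.room_number != abstract_state.room_number:
        return bonus
    return 0.0

# The wrapper above would then compose rules additively, e.g.:
#   self._reward_rules = [room_change_bonus]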
def start_abstract_state(self):
    start_state = np.zeros(128).astype(np.uint8)
    indices = np.array([42, 43, 3, 65, 66])
    values = [149, 155, 8, 2, 0]
    start_state[indices] = values
    start_state = State(start_state, None)
    start_state.set_object_changes(3)
    start_state = AS.AbstractState(start_state)
    return start_state
def goal_abstract_state(self):
    goal_state = np.zeros(128).astype(np.uint8)
    indices = np.array([42, 43, 3, 65, 66])
    values = [149, 235, 8, 2, 0]
    goal_state[indices] = values
    goal_state = State(goal_state, None)
    goal_state.set_object_changes(3)
    goal_state = AS.AbstractState(goal_state)
    return goal_state
def _update(self, path, episode, graph_updates, edge_trajectories):
    """Updates internal state based on trying to follow path.

    Args:
        path (list[DirectedEdge]): planned path
        episode (list[Experience]): actually executed experiences
        graph_updates (list[GraphUpdate]): updates to make on the graph
        edge_trajectories (dict): map from edges to the experiences for
            those edges, paired with the number of worker steps the worker
            was active at the beginning of the experience and the
            cumulative reward so far (Experience, int, float)
    """
    # Add all experienced states to the graph
    for experience in episode:
        state = AS.AbstractState(experience.state)
        next_state = AS.AbstractState(experience.next_state)
        if state == next_state:
            # Abstract state must change on positive reward
            # TODO: Fix this: violated by PrivateEye
            # assert experience.reward <= 0, (state, reward, next_state)
            pass
        if not experience.done:
            edge = self._graph.get_edge(state, next_state)
            if edge is not None:
                edge.update_reward(experience.reward, force=self._new_reward)

    # Make graph updates
    for graph_update in graph_updates:
        graph_update.update(self._graph)

    # Add experiences to the worker
    for edge in edge_trajectories:
        # Hindsight-like
        # for edge_to_update in edge.start.neighbors:
        #     if edge_to_update.training() and not edge_to_update.dead:
        trajectory = edge_trajectories[edge]
        for i, (experience, worker_steps, cum_reward) in enumerate(trajectory):
            self._worker.add_experience(edge, experience, worker_steps,
                                        cum_reward, trajectory.success)

    # Add path back to the queue
    self._path_prioritizer.add_path(path)
def visualize(self, state):
    """Returns a PIL Image visualizing this Justification."""
    def plot_state(canvas, state, value):
        canvas[int(state.pixel_y) - 3:int(state.pixel_y) + 3,
               int(state.pixel_x) - 3:int(state.pixel_x) + 3] = value

    def match(state1, state2):
        """Returns True if both states have the same room number and
        inventory."""
        return state1.room_number == state2.room_number and \
            np.array_equal(state1.match_attributes, state2.match_attributes)

    canvas = state.unmodified_pixels
    x_lines, y_lines = AS.AbstractState.bucket_lines()
    x_lines = [int(x) for x in x_lines]
    y_lines = [int(y) for y in y_lines]
    canvas[:, x_lines] = np.array([0., 0., 255.])
    canvas[y_lines, :] = np.array([0., 0., 255.])

    abstract_state = AS.AbstractState(state)
    # Only draw the nodes that match on inventory and room number
    feasible_set = self._graph.feasible_set
    for feasible_node in feasible_set:
        if not match(feasible_node.abstract_state, abstract_state):
            continue
        color = np.array([36., 255., 36.])
        plot_state(canvas, feasible_node.abstract_state, color)

    for goal_edge in self._path:
        goal = goal_edge.end.abstract_state
        if match(goal, abstract_state):
            plot_state(canvas, goal, np.array([255., 109., 182.]))

    # Plot current position
    plot_state(canvas, abstract_state, np.array([255., 255., 109.]))

    image = Image.fromarray(canvas, "RGB")
    width, height = image.size
    image = image.resize((width * 2, height * 2))
    draw = ImageDraw.Draw(image)
    draw.text((0, 0), self._text, (255, 255, 255))
    font = ImageFont.truetype(data.workspace.arial, 8)
    for node in self._graph.nodes:
        if match(node.abstract_state, abstract_state):
            draw.text((node.abstract_state.pixel_x * 2 - 4,
                       node.abstract_state.pixel_y * 2 - 4),
                      str(node.uid), (255, 255, 255), font=font)
    return image
def start_abstract_state(self):
    start_state = np.zeros(128).astype(np.uint8)
    start_state[42] = 3
    start_state[43] = 235
    start_state[3] = 7
    start_state[65] = 0
    start_state[66] = 1
    start_state = State(start_state, None)
    start_state = AS.AbstractState(start_state)
    return start_state
def goal_abstract_state(self):
    goal_state = np.zeros(128).astype(np.uint8)
    goal_state[42] = 60
    goal_state[43] = 148
    goal_state[3] = 1
    goal_state[65] = 0
    goal_state[66] = 15
    goal_state = State(goal_state, None)
    goal_state = AS.AbstractState(goal_state)
    return goal_state
def _step(self, action):
    state, reward, done, info = self.env.step(action)
    info["done"] = done
    abstract_state = AS.AbstractState(state)
    # A single triggered done rule ends the episode
    new_done = any(
        rule(self._prev_abstract_state, abstract_state)
        for rule in self._done_rules)
    done = new_done or done
    self._prev_abstract_state = abstract_state
    return state, reward, done, info
def _make_goal(self, state):
    raise NotImplementedError("Deprecated! set_goal was updated")
    # Unreachable: old implementation kept below for reference only.
    goal = np.zeros(AS.AbstractState.DIM + 2)
    goal[:AS.AbstractState.DIM] = \
        self.goal_abstract_state.numpy - AS.AbstractState(state).unbucketed
    goal[AS.AbstractState.DIM] = float(self._steps) / self.max_steps
    if AS.AbstractState(state) == self.goal_abstract_state:
        goal[AS.AbstractState.DIM + 1] = 1.
    difference = \
        self.goal_abstract_state.numpy - self.start_abstract_state.numpy
    normalization = np.linalg.norm(difference)
    goal[0] /= (normalization * 3)
    goal[1] /= (normalization * 3)
    state_copy = copy.copy(state)
    state_copy.set_goal(goal)
    reward = 0.
    if self.goal_abstract_state == AS.AbstractState(state):
        reward = 1.
    return state_copy, reward
def _skill_state(self, state, goal_edge, step, cum_reward):
    """Adds the goal and step to the state.

    Args:
        state (State)
        goal_edge (DirectedEdge): goal_edge.end is goal
        step (int): number of steps the skill has been active
        cum_reward (float): cumulative reward while the skill has been
            active

    Returns:
        State
    """
    goal_abstract_state = goal_edge.end.abstract_state
    abstract_state_diff = \
        goal_abstract_state.numpy - AS.AbstractState(state).unbucketed
    worker_step_frac = float(step) / self.max_steps(goal_edge)
    on_goal = AS.AbstractState(state) == goal_edge.end.abstract_state
    goal = Goal(abstract_state_diff, worker_step_frac, on_goal, cum_reward)

    # Shallow copy OK: copies references to the np.arrays, and getters
    # don't expose the underlying arrays directly
    state_copy = copy.copy(state)
    state_copy.set_goal(goal)
    return state_copy
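For context, `Goal` presumably bundles the abstract-state difference with the skill's progress features. A minimal sketch of such a container, assuming exactly the four fields constructed above; the real class may differ:

import collections

# Hypothetical stand-in for the Goal object consumed by set_goal; it shows
# the information the goal-conditioned worker receives, nothing more.
Goal = collections.namedtuple(
    "Goal",
    ["abstract_state_diff",  # np.ndarray: goal minus current abstract state
     "worker_step_frac",     # float in [0, 1]: fraction of skill budget used
     "on_goal",              # bool: whether the goal abstract state is reached
     "cum_reward"])          # float: reward accumulated while skill is active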
def act(self, state, test=False):
    """Returns action for the current state.

    Args:
        state (State)
        test (bool): if True, no teleporting is used

    Returns:
        action (Action)
        justification (Justification)
    """
    if len(self._plan) == 0:
        node = self._graph.get_node(AS.AbstractState(state))
        if (node is not None and node.active() and
                not self._explorer.active()):
            # This is happening in a separate process, so this shared
            # graph doesn't get updated
            self._graph_updates.append(Visit(node))
            node.visit()
            self._explorer.activate(node)

        if self._explorer.active():
            action, s = self._explorer.act(state)
            return action, Justification([], self._graph, s)
        elif test:
            # No resetting on test episodes!
            action = DefaultAction(random.randint(0, self._num_actions - 1))
            justification = Justification([], self._graph, "test random")
            return action, justification
        else:
            return EndEpisode(), Justification([], self._graph, "reset")
    elif (self._enable_teleporting and not test and not self._teleported and
          len(self._plan) > 0 and self._plan[-1].start.teleport is not None):
        self._teleported = True
        self._plan = self._plan[-1:]
        s = "teleport to: {}".format(self._plan[-1].start.uid)
        justification = Justification(self._plan, self._graph, s)
        return self._plan[-1].start.teleport, justification

    next_edge = self._plan[0]
    self._allow_setting_teleport = \
        self._allow_setting_teleport and not next_edge.training()
    action = DefaultAction(
        self._worker.act(state, next_edge, len(self._worker_rewards),
                         sum(self._worker_rewards)))
    s = "{} -> {} step={} [{:.2f}], train={}, [{:.2f}]".format(
        next_edge.start.uid, next_edge.end.uid, len(self._worker_rewards),
        sum(self._worker_rewards), next_edge.train_count,
        next_edge.success_rate)
    justification = Justification(copy.copy(self._plan), self._graph, s)
    return action, justification
def __call__(self, experience):
    """Returns another experience with the bonus added to the reward.

    Args:
        experience (Experience)

    Returns:
        Experience
    """
    next_state = AS.AbstractState(experience.next_state)
    next_state_count = self._state_counts[next_state]
    assert next_state_count > 0
    reward_bonus = self._beta / np.sqrt(next_state_count)
    return Experience(experience.state, experience.action,
                      experience.reward + reward_bonus,
                      experience.next_state, experience.done)
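Together with observe above, this implements a count-based exploration bonus of the form r + beta / sqrt(N(s')), where N counts visits to the abstract next state. A minimal self-contained sketch of the same scheme; the class name `CountBonus` and the default beta are illustrative, not from the original source:

import collections

import numpy as np

class CountBonus(object):
    """Count-based bonus: after n visits, the bonus decays as beta/sqrt(n)."""

    def __init__(self, beta=0.1):
        self._beta = beta
        self._state_counts = collections.Counter()

    def observe(self, abstract_state):
        self._state_counts[abstract_state] += 1

    def bonus(self, abstract_state):
        count = self._state_counts[abstract_state]
        assert count > 0, "observe() must be called before bonus()"
        return self._beta / np.sqrt(count)

# E.g., with beta = 0.1, the 1st, 4th, and 100th visits to a state earn
# bonuses of 0.1, 0.05, and 0.01 respectively.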
def crop(state):
    abstract_state = AS.AbstractState(state)
    y = int(abstract_state.pixel_y * 84. / 210.)
    x = int(abstract_state.pixel_x * 84. / 160.)
    cropped = state.pixel_state[:, y - 10:y + 10, max(0, x - 30):x + 30]
    # Reflect-pad when the crop runs off the frame edge
    # TODO: check the right-edge padding; (0, 190 - x) does not restore the
    # crop to a fixed width
    padding = (0, 0)
    if x - 30 < 0:
        padding = (30 - x, 0)
    elif x + 30 > 160:
        padding = (0, 190 - x)
    cropped = torch.FloatTensor(cropped)
    cropped = try_gpu(
        torch.nn.functional.pad(cropped, padding, mode="reflect"))
    return cropped
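For reference, torch.nn.functional.pad with a 2-tuple pads the last dimension as (left, right). A minimal sketch of restoring a fixed-width crop at the left edge; the function name and sizes are illustrative, not taken from the original:

import torch
import torch.nn.functional as F

def crop_width(image, x, half=30):
    # Crop a window of width 2 * half centered at x, reflect-padding back
    # to full width whenever the window falls off the left edge.
    cropped = image[:, :, max(0, x - half):x + half]
    pad_left = max(0, half - x)  # e.g. x=10 -> crop is 40 wide, pad 20
    if pad_left > 0:
        # A 2-tuple pads only the last dimension: (left, right)
        cropped = F.pad(cropped, (pad_left, 0), mode="reflect")
    return cropped

image = torch.zeros(1, 20, 160)
assert crop_width(image, 10).shape[-1] == 60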
def reward(self, next_state, edge, env_reward, done):
    """Defines the worker's intrinsic reward for reaching next_state while
    trying to traverse the edge.

    Args:
        next_state (State)
        edge (DirectedEdge)
        env_reward (float): environment extrinsic reward
        done (bool): True if overall episode ended

    Returns:
        float
    """
    if AS.AbstractState(next_state) == edge.end.abstract_state and \
            not done and env_reward >= 0:
        return 1.
    return 0.
def crop(state):
    abstract_state = AS.AbstractState(state)
    y = int(abstract_state.pixel_y * 84. / 210.)
    cropped = state.pixel_state[:, y - 10:y + 10, :]
    return cropped
def __init__(self, num_actions, worker, abstract_graph_config, start_state,
             runner_config, num_parallel, domain, success_weight):
    """Use from_config to construct Master.

    Args:
        num_actions (int): number of possible actions at each state
        worker (Worker): worker to use
        abstract_graph_config (Config): config for AbstractGraph
        start_state (State): state returned by env.reset(). NOTE: assumed
            that there is only a single start state
        runner_config (Config): config for EpisodeRunner
        num_parallel (int): number of workers to run in parallel
        domain (str): environment domain,
            e.g. MontezumaRevengeNoFrameskip-v4
        success_weight (float): how much to weight successes in priority
    """
    super(Master, self).__init__()
    self._num_actions = num_actions
    self._worker = worker
    self._room_dir = data.room_dir(domain)
    self._success_weight = success_weight
    self._path_prioritizer = MultiPriorityQueue(self._priority_fns())
    self._path_prioritizer.add_path([])

    def eval_to_train(edge):
        if not edge.dead:
            worker.mark_failed_evaluation(edge)

    # All paths are added to the queue via new reliable edges or via new
    # edges as an optimization
    def eval_to_reliable(edge):
        worker.mark_reliable(edge)
        feasible_set = self._graph.feasible_set
        for neighbor_edge in edge.end.neighbors:
            if neighbor_edge.end not in feasible_set:
                path = edge.end.path_to_start() + [neighbor_edge]
                self._path_prioritizer.add_path(path)

    def new_edge_callback(new_edge):
        feasible_set = self._graph.feasible_set
        if new_edge.start in feasible_set:
            if new_edge.end not in feasible_set:
                path = new_edge.start.path_to_start() + [new_edge]
                self._path_prioritizer.add_path(path)
        else:
            # Update priority of paths that have gained a new edge
            for parent_edge in new_edge.start.parents:
                if parent_edge.start in feasible_set:
                    path = parent_edge.start.path_to_start() + [parent_edge]
                    self._path_prioritizer.add_path(path)

        max_edge_degree = abstract_graph_config.max_edge_degree

        ##########################
        # BEGIN HACK
        ##########################
        # Add higher-distance neighbors
        # Backwards
        parent_edges = [
            parent_edge for parent_edge in new_edge.start.parents
            if parent_edge.degree == 1
        ]
        bfs_queue = deque(parent_edges)
        visited = set([parent_edge.start for parent_edge in parent_edges] +
                      [new_edge.end, new_edge.start])
        while len(bfs_queue) > 0:
            edge = bfs_queue.popleft()
            if edge.degree >= max_edge_degree:
                break

            for parent_edge in edge.start.parents:
                parent = parent_edge.start
                # Only traverse degree 1 edges to preserve BFS property
                if parent_edge.degree == 1 and parent not in visited:
                    visited.add(parent)
                    combined_degree = edge.degree + parent_edge.degree
                    combined_reward = edge.reward + parent_edge.reward
                    combined_life_lost = \
                        edge.life_lost or parent_edge.life_lost
                    if not parent.contains_neighbor(edge.end):
                        combined_edge = self._graph.get_edge(
                            parent.abstract_state, edge.end.abstract_state,
                            combined_degree, combined_reward,
                            combined_life_lost)
                        bfs_queue.append(combined_edge)

        # Forwards
        bfs_queue = deque([new_edge])
        visited = set([new_edge.end, new_edge.start])
        while len(bfs_queue) > 0:
            edge = bfs_queue.popleft()
            if edge.degree >= max_edge_degree:
                break

            for neighbor_edge in edge.end.neighbors:
                neighbor = neighbor_edge.end
                # Only traverse degree 1 edges to preserve BFS property
                if neighbor_edge.degree == 1 and neighbor not in visited:
                    visited.add(neighbor)
                    combined_degree = edge.degree + neighbor_edge.degree
                    combined_reward = edge.reward + neighbor_edge.reward
                    combined_life_lost = \
                        edge.life_lost or neighbor_edge.life_lost
                    if not neighbor.contains_parent(edge.start):
                        combined_edge = self._graph.get_edge(
                            edge.start.abstract_state,
                            neighbor.abstract_state, combined_degree,
                            combined_reward, combined_life_lost)
                        bfs_queue.append(combined_edge)
        ##########################
        # END HACK
        ##########################

    edge_callbacks = {
        (DirectedEdge.EVALUATING, DirectedEdge.TRAINING): eval_to_train,
        (DirectedEdge.EVALUATING, DirectedEdge.RELIABLE): eval_to_reliable,
    }
    self._graph = AbstractGraph.from_config(
        abstract_graph_config, AS.AbstractState(start_state),
        edge_callbacks, new_edge_callback, domain)
    self._start_node = self._graph.get_node(AS.AbstractState(start_state))
    self._runner_config = runner_config

    # runners[i] is None when previous episode[i] terminated
    self._runners = [None for _ in range(num_parallel)]
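The forward and backward BFS passes above compose chains of degree-1 edges into single higher-degree shortcut edges, summing degree and reward along the chain and stopping at max_edge_degree. A minimal self-contained sketch of the forward direction on a toy adjacency map; all names here are illustrative stand-ins, not the original classes:

from collections import deque

def compose_forward(edges, start, max_degree=3):
    # edges: node -> list of (neighbor, degree, reward) outgoing edges.
    # Returns shortcut edges (start, end, degree, reward) built by chaining
    # degree-1 edges outward from start, up to max_degree.
    shortcuts = []
    queue = deque([(start, 0, 0.0)])
    visited = {start}
    while queue:
        node, degree, reward = queue.popleft()
        if degree >= max_degree:
            break
        for neighbor, edge_degree, edge_reward in edges.get(node, []):
            # Only traverse degree-1 edges to preserve the BFS property
            if edge_degree == 1 and neighbor not in visited:
                visited.add(neighbor)
                if degree + 1 > 1:
                    shortcuts.append(
                        (start, neighbor, degree + 1, reward + edge_reward))
                queue.append((neighbor, degree + 1, reward + edge_reward))
    return shortcuts

edges = {"a": [("b", 1, 0.0)], "b": [("c", 1, 1.0)], "c": [("d", 1, 0.0)]}
# Yields shortcuts a->c (degree 2, reward 1.0) and a->d (degree 3, reward 1.0)
print(compose_forward(edges, "a"))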
def start_abstract_state(self):
    ram = np.zeros(128)
    ram[[97, 105, 1, 113]] = [39, 30, 18, 31]
    state = State(ram, None)
    return AS.AbstractState(state)
def __init__(self, start_state, edge_window_size, traverse_threshold,
             min_visit_count, max_edge_degree, edge_callbacks,
             new_edge_callback):
    super(OracleGraph, self).__init__(start_state, edge_window_size,
                                      traverse_threshold, min_visit_count,
                                      max_edge_degree, edge_callbacks,
                                      new_edge_callback)

    # Oracle nodes and edges for the first room, as AbstractStates in order
    ROOM_NUMBER = 1
    path = [
        (70, 220),   # down ladder (x, y)
        (70, 210),   # down ladder
        (70, 200),   # down ladder
        (70, 190),   # bottom of ladder
        (80, 190),   # go right
        (90, 190),   # jump across rope
        (100, 190),  # on rope
        (110, 190),  # jump across rope
        (120, 190),  # on ledge
        (130, 180),  # down ladder
        (130, 170),  # down ladder
        (130, 160),  # down ladder
        (130, 150),  # bottom of ladder
        (120, 150),  # left
        (110, 150),  # left
        (100, 150),  # left
        (90, 150),   # left
        (80, 150),   # left
        (70, 150),   # left
        (60, 150),   # left
        (50, 150),   # left
        (40, 150),   # left
        (30, 150),   # left
        (20, 150),   # bottom of ladder
        (20, 160),   # up ladder
        (20, 170),   # up ladder
        (20, 180),   # up ladder
        (10, 180),   # left
        (10, 190),   # jump
        (10, 200),   # jump
    ]
    key_state = (10, 210, ROOM_NUMBER, 2, 14)

    # Revisit the states in order, except now holding the key
    backward_path = list(reversed(path)) + [
        (80, 240),   # right
        (90, 240),   # jump gap
        (100, 240),  # right gap
        (110, 240),  # right
        (120, 240),  # right
    ]
    open_door = (130, 240, ROOM_NUMBER, 0, 10)

    # Add (room #, inv mask, inv)
    for i, _ in enumerate(path):
        path[i] = path[i] + (ROOM_NUMBER, 0, 15)
    for i, _ in enumerate(backward_path):
        backward_path[i] = backward_path[i] + (ROOM_NUMBER, 2, 14)

    path = path + [key_state] + backward_path + [open_door]
    prev_node = self._start_node
    for i, state in enumerate(path):
        ram = np.zeros(128).astype(np.uint8)
        ram[np.array((42, 43, 3, 65, 66))] = state
        ram[42] += 10
        abstract_state = AS.AbstractState(State(ram_state=ram))
        curr_node = self._nodes[abstract_state] = AbstractNode(
            abstract_state, self._min_visit_count, self._uid_count)
        self._uid_count += 1
        edge = DirectedEdge(
            prev_node, curr_node, self._edge_window_size,
            self._edge_window_size, self._traverse_threshold,
            self._callbacks, degree=1)
        prev_node.add_neighbor(edge)
        curr_node.add_parent(edge)
        prev_node = curr_node
def _step(self, action):
    next_state, reward, done, info = super(BeamWrapper, self)._step(action)
    if AS.AbstractState(next_state).room_number != 7:
        done = True
    return next_state, reward, done, info
def goal_abstract_state(self):
    ram = np.zeros(128)
    ram[[97, 105, 1, 113]] = [117, 30, 18, 31]
    goal_state = State(ram, None)
    return AS.AbstractState(goal_state)
def _reset(self):
    state = self.env.reset()
    self._prev_abstract_state = AS.AbstractState(state)
    return state