def plan(self, state, observation):
    for i in range(self.config['iterations']):
        if (i + 1) % 10 == 0:
            logger.debug('{} / {}'.format(i + 1, self.config['iterations']))
        self.run(safe_deepcopy_env(state), observation)
    return self.get_plan()

def get_trajectories(self, initial_state, initial_observation=None, as_observations=True,
                     full_trajectories=True, include_leaves=True):
    """
    Get a list of visited nodes/states/trajectories corresponding to the node subtree

    :param initial_state: the state at the root
    :param initial_observation: the observation for the root state
    :param as_observations: return observations (default), else return the nodes themselves
    :param full_trajectories: return a list of observation sequences, else a list of observations
    :param include_leaves: include leaves or only expanded nodes
    :return: the list of trajectories
    """
    trajectories = []
    if initial_observation is None:
        initial_observation = initial_state.reset()
    if not as_observations:
        initial_observation = self  # Return this node instead of its observation
    if self.children:
        for action, child in self.children.items():
            next_state = safe_deepcopy_env(initial_state)
            next_observation, _, _, _ = next_state.step(action)
            child_trajectories = child.get_trajectories(next_state, next_observation,
                                                        as_observations, full_trajectories, include_leaves)
            if full_trajectories:
                trajectories.extend([[initial_observation] + trajectory for trajectory in child_trajectories])
            else:
                trajectories.extend(child_trajectories)
        if not full_trajectories:
            trajectories.append(initial_observation)
    elif include_leaves:
        trajectories = [[initial_observation]] if full_trajectories else [initial_observation]
    return trajectories

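# Hedged reading of the return shape above (an illustration, not part of the source):
# with full_trajectories=True the method returns one observation sequence per
# root-to-node path, e.g. [[obs_root, obs_a], [obs_root, obs_b, obs_c]]; with
# full_trajectories=False it returns a flat list of visited observations (or of the
# nodes themselves when as_observations=False).
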
def expand(self, next_layer, count=1):
    """
    Expand the node by querying the oracle model for every possible action

    :param next_layer: list of nodes at the next depth, to be updated with new children nodes
    :param count: number of times each transition must be evaluated
    """
    if self.state is None:
        raise Exception("The state should be set before expanding a node")
    try:
        actions = self.state.get_available_actions()
    except AttributeError:
        actions = range(1, self.state.action_space.n)
    self.planner.openings += count
    if self.done and PlaTyPOOSNode.STOP_ON_ANY_TERMINAL_STATE:
        return
    for _ in range(count):
        for action in actions:
            state = safe_deepcopy_env(self.state)
            state.seed(self.planner.np_random.randint(2**30))
            _, reward, done, _ = state.step(action)
            if action not in self.children:
                self.children[action] = type(self)(self, self.planner, state, depth=self.depth + 1)
                next_layer.append(self.children[action])
            self.children[action].update(reward, done)

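# Hedged reading of the expansion above: since transitions may be stochastic, each
# action is simulated `count` times from independently seeded copies of the state, and
# all samples update the same child node, so a child accumulates `count` reward and
# termination observations rather than a single one.
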
def robustify_env(self):
    """
    Important distinction with RobustEPC: the nominal LPV model is stabilized.

    We start with a system:
        dx = A(theta)x + Bu + omega,
    that we first stabilize with u0 = Kx, without constraint satisfaction.
    Then, we predict the interval of the stabilized system under additional controls:
        dx = (A(theta) + BK)x + Bu' + omega,
    where A0 + BK is stable, which eases the similarity transformation to a Metzler system.
    """
    from highway_env.interval import LPV
    a0, da = self.config["A0"], self.config["dA"]
    K = 2 * self.feedback.K0[:, :(self.feedback.K0.shape[1] // 2)]
    da = da / 100
    # da = [np.zeros(a0.shape)]
    lpv = LPV(a0=a0, da=da, x0=self.env.unwrapped.state.squeeze(-1),
              b=self.B, d=self.config["D"], k=K, omega_i=self.config["omega"])
    robust_env = safe_deepcopy_env(self.env)
    robust_env.unwrapped.lpv = lpv
    robust_env.unwrapped.automatic_record_callback = None
    return robust_env

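# A minimal, self-contained illustration (toy matrices, not from the source) of the
# pre-stabilisation idea described in the docstring above: with u0 = Kx, the interval
# predictor runs on the closed-loop nominal matrix A0 + BK, which should be Hurwitz.
import numpy as np

A0_toy = np.array([[0.0, 1.0], [0.0, 0.0]])   # assumed double-integrator nominal dynamics
B_toy = np.array([[0.0], [1.0]])
K_toy = np.array([[-1.0, -2.0]])              # assumed stabilising gain
closed_loop = A0_toy + B_toy @ K_toy
assert np.all(np.linalg.eigvals(closed_loop).real < 0)  # the stabilised nominal model is stable
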
def plan(self, observation):
    action_distribution = Normal(
        torch.zeros(self.config["horizon"], self.action_size),
        torch.ones(self.config["horizon"], self.action_size))
    for i in range(self.config["iterations"]):
        # Evaluate J action sequences from the current belief (in batch)
        actions = action_distribution.sample([self.config["candidates"]])  # Sample actions
        candidates = [safe_deepcopy_env(self.env) for _ in range(self.config["candidates"])]
        returns = torch.zeros(self.config["candidates"])
        # Sample next states
        for t in range(self.config["horizon"]):
            for c, candidate in enumerate(candidates):
                _, reward, _, _ = candidate.step(actions[c, t])
                returns[c] += self.config["gamma"] ** t * reward
        # Re-fit belief to the K best action sequences
        _, topk = returns.topk(self.config["top_candidates"], largest=True, sorted=False)  # K ← argsort({R(j)})
        best_actions = actions[topk]
        # Update belief with new means and standard deviations
        action_distribution = Normal(best_actions.mean(dim=0), best_actions.std(dim=0, unbiased=False))
    # Return first action mean µ_t
    return action_distribution.mean.tolist()

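# A minimal, self-contained sketch of the same cross-entropy loop on a toy problem
# (everything below is assumed for illustration, not part of the source): fit a Normal
# over a scalar action so as to maximise a known reward function.
import torch
from torch.distributions import Normal

def cem_sketch(iterations=5, candidates=64, top_candidates=8):
    reward = lambda a: -(a - 2.0) ** 2                       # toy objective, maximised at a = 2
    distribution = Normal(torch.zeros(1), torch.ones(1))
    for _ in range(iterations):
        actions = distribution.sample([candidates])          # sample candidate actions
        returns = reward(actions).squeeze(-1)                # evaluate each candidate
        _, topk = returns.topk(top_candidates, largest=True, sorted=False)
        elite = actions[topk]                                # keep the elite set
        distribution = Normal(elite.mean(dim=0), elite.std(dim=0, unbiased=False))
    return distribution.mean.item()
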
def plan(self, state, observation):
    for self.episode in range(self.config['episodes']):
        if (self.episode + 1) % max(self.config['episodes'] // 10, 1) == 0:
            logger.debug('{} / {}'.format(self.episode + 1, self.config['episodes']))
        self.run(safe_deepcopy_env(state))
    return self.get_plan()

def plan(self, state, observation):
    self.root.state = safe_deepcopy_env(state)
    self.root.state.seed()
    # self.root.state = state
    for epoch in np.arange(self.config["budget"] // state.action_space.n):
        logger.debug("Expansion {}/{}".format(epoch + 1, self.config["budget"] // state.action_space.n))
        self.run()
    return self.get_plan()

def expand(self):
    try:
        actions = self.state.get_available_actions()
    except AttributeError:
        actions = range(self.state.action_space.n)
    for action in actions:
        # Simulate transition
        state = safe_deepcopy_env(self.state)
        next_observation, reward, done, _ = self.planner.step(state, action)
        # Record the transition
        next_node = self.planner.get_node(next_observation)
        next_node.state = state
        next_node.parents.add(self)
        self.rewards[action] = reward
        self.children[action] = next_node

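# Hedged sketch (hypothetical helper, not from the source) of the behaviour that
# `planner.get_node(observation)` suggests above: nodes are looked up in a registry
# keyed by their observation and may have several parents, so the search structure is
# a graph rather than a tree.
class SketchGraphNode:
    def __init__(self):
        self.parents, self.children, self.rewards = set(), {}, {}

def make_get_node():
    nodes = {}
    def get_node(observation):
        key = str(observation)
        if key not in nodes:
            nodes[key] = SketchGraphNode()
        return nodes[key]
    return get_node
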
def expand(self):
    self.planner.leaves.remove(self)
    if self.state is None:
        raise Exception("The state should be set before expanding a node")
    try:
        actions = self.state.get_available_actions()
    except AttributeError:
        actions = range(self.state.action_space.n)
    for action in actions:
        self.children[action] = type(self)(self, self.planner,
                                           state=safe_deepcopy_env(self.state),
                                           depth=self.depth + 1)
        observation, reward, done, _ = self.planner.step(self.children[action].state, action)
        self.planner.leaves.append(self.children[action])
        self.children[action].update(reward, done, observation)

def expand(self, state, leaves, update_children=False):
    if state is None:
        raise Exception("The state should be set before expanding a node")
    try:
        actions = state.get_available_actions()
    except AttributeError:
        actions = range(state.action_space.n)
    for action in actions:
        self.children[action] = type(self)(self, self.planner)
        if update_children:
            _, reward, done, _ = safe_deepcopy_env(state).step(action)
            self.children[action].update(reward, done)
    idx = leaves.index(self)
    leaves = leaves[:idx] + list(self.children.values()) + leaves[idx + 1:]
    return leaves

def get_obs_visits(self, state=None):
    visits = defaultdict(int)
    updates = defaultdict(int)
    if hasattr(self, "observation"):
        for node in self.get_trajectories(full_trajectories=False, include_leaves=False):
            if hasattr(node, "observation"):
                visits[str(node.observation)] += 1
                if hasattr(node, "updates_count"):
                    updates[str(node.observation)] += node.updates_count
    else:  # Replay required
        for node in self.get_trajectories(full_trajectories=False, include_leaves=False):
            replay_state = safe_deepcopy_env(state)
            for action in node.path():
                observation, _, _, _ = replay_state.step(action)
            visits[str(observation)] += 1
    return visits, updates

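# Hedged note on the replay branch above: when nodes do not store their observation,
# the only way to recover it is to re-simulate the node's action path from a copy of
# the root state, which is why a `state` argument is required in that case.
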
def plan(self, state, observation):
    done = False
    episode = 0
    while not done:
        best, challenger = self.run(safe_deepcopy_env(state))

        # Stopping rule
        done = challenger.value_upper - best.value_lower < self.config["accuracy"] if best is not None else False
        done = done or episode > self.config["episodes"]

        episode += 1
        if episode % 10 == 0:
            logger.debug('Episode {}: delta = {}/{}'.format(
                episode, challenger.value_upper - best.value_lower, self.config["accuracy"]))

    self.budget_used = episode * self.config["horizon"]
    return self.get_plan()

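# Hedged reading of the stopping rule above, as a self-contained illustration with
# assumed bound values: planning stops once the optimistic value of the challenger no
# longer exceeds the pessimistic value of the current best action by more than the
# required accuracy.
def should_stop(best_value_lower, challenger_value_upper, accuracy):
    # The recommended action is near-optimal once the confidence gap has closed.
    return challenger_value_upper - best_value_lower < accuracy

assert should_stop(best_value_lower=0.95, challenger_value_upper=0.97, accuracy=0.05)
assert not should_stop(best_value_lower=0.50, challenger_value_upper=0.90, accuracy=0.05)
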
def estimateQ(self, state, action):
    if self.depth == self.planner.config["horizon"]:
        return
    # logger.debug(f"Run estimateQ at {state.mdp.state},{action} with depth {self.depth}")
    for i in range(self.planner.config["C"]):
        next_state = safe_deepcopy_env(state)
        # We need randomness
        next_state.seed(self.planner.np_random.randint(2**30))
        observation, reward, done, _ = next_state.step(action)
        # observation = str(observation) + str(i)  # Prevent state merge
        self.get_child(observation).count += 1
        self.get_child(observation).state = next_state
    for next_state_node in self.children.values():
        next_state_node.estimateV(next_state_node.state)
    self.value = reward + self.planner.config["gamma"] * sum(
        next_state_node.value * next_state_node.count
        for next_state_node in self.children.values()) / self.planner.config["C"]

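# Hedged reading of the backup above: each action is simulated C times from fresh
# copies of the state, samples landing on the same observation share a child node with
# a visit count N(o), and the action value is the Monte-Carlo estimate
#     Q(s, a) ≈ r(s, a) + gamma * (1 / C) * sum_o N(o) * V(o),
# i.e. a sparse-sampling backup with children weighted by how often they were reached.
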
def robustify_env(self):
    """
    Make a robust version of the environment:
    1. compute the dynamics polytope (A0, dA)
    2. set the LPV interval predictor, so that it can be stepped with the environment
    3. the environment, when provided with an interval predictor, should return pessimistic rewards
    4. disable the recording of environment transitions, since we are not observing when planning

    :return: the robust version of the environment.
    """
    a0, da = self.polytope()
    from highway_env.interval import LPV
    lpv = LPV(a0=a0, da=da, x0=self.env.unwrapped.state.squeeze(-1),
              b=self.B, d=self.config["D"], omega_i=self.config["omega"])
    robust_env = safe_deepcopy_env(self.env)
    robust_env.unwrapped.lpv = lpv
    robust_env.unwrapped.automatic_record_callback = None
    return robust_env

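# Hedged sketch of the dynamics polytope computed in step 1 (matrices below are assumed
# for illustration only): the uncertain dynamics are represented as a nominal matrix
# plus structured deviations, A(theta) = A0 + sum_i theta_i * dA_i with theta_i in [0, 1],
# and the LPV interval predictor propagates bounds valid for every matrix in that set.
import numpy as np

A0_example = np.array([[0.0, 1.0], [-1.0, -1.0]])      # assumed nominal matrix
dA_example = [np.array([[0.0, 0.0], [0.5, 0.0]])]      # assumed structured uncertainty
theta = [0.3]
A_theta = A0_example + sum(t * d for t, d in zip(theta, dA_example))  # one admissible realisation
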
def evaluate(experiment):
    # Prepare workspace
    seed, agent_config, env_config, path = experiment
    gym.logger.set_level(gym.logger.DISABLED)
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    # Make environment
    env = load_environment(env_config)

    # Make agent
    agent_name, agent_config = agent_config
    agent = load_agent(agent_config, env)

    # Evaluate
    print("Evaluating agent {} on seed {}".format(agent_name, seed))
    evaluation = Evaluation(env,
                            agent,
                            directory=path.parent / agent_name,
                            num_episodes=1,
                            sim_seed=seed,
                            display_env=True,
                            display_agent=True,
                            display_rewards=False)
    estimate_value = False
    if estimate_value:
        rewards, values, terminal = [], [], False
        evaluation.seed(episode=0)
        evaluation.reset()
        evaluation.training = False
        gamma = 0.99 or agent.config["gamma"]
        while not terminal:
            # Estimate state value
            oracle_env = safe_deepcopy_env(agent.env)
            oracle = load_agent(agent_configs()["oracle"], oracle_env)
            oracle_done, oracle_rewards = False, []
            while not oracle_done:
                action = oracle.act(None)
                _, oracle_reward, oracle_done, _ = oracle_env.step(action)
                oracle_rewards.append(oracle_reward)
            value = np.sum([gamma ** t * oracle_rewards[t] for t in range(len(oracle_rewards))])
            values.append(value)
            reward, terminal = evaluation.step()
            rewards.append(reward)
        evaluation.close()
        returns = [np.sum([gamma ** t * rewards[k + t] for t in range(len(rewards[k:]))])
                   for k in range(len(rewards))]

        # Save intermediate results
        df = pd.DataFrame({
            "agent": agent_name,
            "time": range(len(rewards)),
            "seed": [seed] * len(rewards),
            "reward": rewards,
            "return": returns,
            "value": values
        })
    else:
        evaluation.test()
        rewards = evaluation.monitor.stats_recorder.episode_rewards_[0]
        length = evaluation.monitor.stats_recorder.episode_lengths[0]
        total_reward = np.sum(rewards)
        cum_discount = lambda signal, gamma: np.sum([gamma ** t * signal[t] for t in range(len(signal))])
        return_ = cum_discount(rewards, 0.9)
        return_undisc = cum_discount(rewards, 0.99)
        result = {
            "agent": agent_name,
            "seed": seed,
            "total_reward": total_reward,
            "return": return_,
            "return_undisc": return_undisc,
            "length": length,
        }
        df = pd.DataFrame.from_records([result])

    with open(path, 'a') as f:
        df.to_csv(f, sep=',', encoding='utf-8', header=f.tell() == 0, index=False)

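# Self-contained check of the discounted-return computation used above (reward values
# below are assumed for illustration only):
import numpy as np

def cum_discount_example(signal, gamma):
    return np.sum([gamma ** t * signal[t] for t in range(len(signal))])

assert np.isclose(cum_discount_example([1.0, 1.0, 1.0], 0.9), 1.0 + 0.9 + 0.81)
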
def plan(self, state, observation):
    self.available_budget = self.config["budget"]
    while self.available_budget > 0:
        rollout = self.rollout(safe_deepcopy_env(state), observation)
        self.update(rollout)
    return self.get_plan()

def plan(self, state, observation):
    self.root = self.get_node(observation, state=state)
    for _ in np.arange(self.config["episodes"]):
        self.run(safe_deepcopy_env(state), observation)
    return self.get_plan()