def expand(self, next_layer, count=1):
    """
    Expand the node by querying the oracle model for every possible action.

    :param next_layer: list of nodes at the next depth, to be updated with new children nodes
    :param count: number of times each transition must be evaluated
    """
    if self.state is None:
        raise Exception("The state should be set before expanding a node")
    try:
        actions = self.state.get_available_actions()
    except AttributeError:
        actions = range(1, self.state.action_space.n)
    self.planner.openings += count
    if self.done and PlaTyPOOSNode.STOP_ON_ANY_TERMINAL_STATE:
        return
    for _ in range(count):
        for action in actions:
            # Step a copy of the current state to evaluate this transition
            state = safe_deepcopy_env(self.state)
            _, reward, done, _ = state.step(action)
            if action not in self.children:
                self.children[action] = type(self)(self, self.planner, state, depth=self.depth + 1)
                next_layer.append(self.children[action])
            self.children[action].update(reward, done)

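# --- Hedged sketch, not part of the source: what the `count` loop in expand() buys on a
# --- stochastic environment. Stepping `count` independent copies of the state yields a
# --- Monte Carlo estimate of each action's expected reward; NoisyBanditEnv is an
# --- illustrative stand-in for the real environment and safe_deepcopy_env.
import copy
import random

class NoisyBanditEnv:
    """One-step toy environment whose reward for an action is its index plus Gaussian noise."""
    def step(self, action):
        return None, action + random.gauss(0.0, 0.5), True, {}

state = NoisyBanditEnv()
count = 100
for action in range(3):
    rewards = []
    for _ in range(count):
        copied = copy.deepcopy(state)        # plays the role of safe_deepcopy_env(self.state)
        _, reward, done, _ = copied.step(action)
        rewards.append(reward)
    print(action, sum(rewards) / count)      # estimated mean reward of each action
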
def plan(self, state, observation):
    """
    Plan a sequence of actions by running several planning iterations from the current state.

    :param state: the initial environment state
    :param observation: the corresponding observation
    :return: the planned action sequence
    """
    for i in range(self.config['iterations']):
        if (i + 1) % 10 == 0:
            logger.debug('{} / {}'.format(i + 1, self.config['iterations']))
        self.run(safe_deepcopy_env(state), observation)
    return self.get_plan()

def plan(self, observation):
    # Initial belief over action sequences: standard Normal of shape (horizon, action_size)
    action_distribution = Normal(
        torch.zeros(self.config["horizon"], self.action_size),
        torch.ones(self.config["horizon"], self.action_size))
    for i in range(self.config["iterations"]):
        # Sample J candidate action sequences from the current belief
        actions = action_distribution.sample([self.config["candidates"]])
        candidates = [safe_deepcopy_env(self.env) for _ in range(self.config["candidates"])]
        returns = torch.zeros(self.config["candidates"])
        # Roll out each candidate sequence and accumulate its discounted return
        for t in range(self.config["horizon"]):
            for c, candidate in enumerate(candidates):
                _, reward, _, _ = candidate.step(actions[c, t])
                returns[c] += self.config["gamma"] ** t * reward
        # Re-fit the belief to the K best action sequences: K ← argsort({R(j)})
        _, topk = returns.topk(self.config["top_candidates"], largest=True, sorted=False)
        best_actions = actions[topk]
        # Update the belief with the new means and standard deviations
        action_distribution = Normal(best_actions.mean(dim=0),
                                     best_actions.std(dim=0, unbiased=False))
    # Return the mean of the final belief as the planned action sequence
    # (only its first action µ_t is typically executed)
    return action_distribution.mean.tolist()

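# --- Hedged sketch, not part of the source: the same cross-entropy method loop as plan()
# --- above, written as a standalone function on a toy 1-D integrator so that the
# --- sample / evaluate / re-fit-to-top-K cycle can be run in isolation. All names
# --- (cem_toy, rollout, the default hyperparameters) are illustrative assumptions.
import torch
from torch.distributions import Normal

def cem_toy(horizon=5, candidates=64, top_candidates=8, iterations=10, gamma=1.0):
    def rollout(action_sequence):
        # Toy deterministic dynamics: x <- x + a, reward = -|x|, starting from x = 1
        x, ret = 1.0, 0.0
        for t in range(horizon):
            x = x + float(action_sequence[t])
            ret += gamma ** t * -abs(x)
        return ret

    belief = Normal(torch.zeros(horizon, 1), torch.ones(horizon, 1))
    for _ in range(iterations):
        actions = belief.sample([candidates])                  # (candidates, horizon, 1)
        returns = torch.tensor([rollout(a) for a in actions])
        _, topk = returns.topk(top_candidates, largest=True, sorted=False)
        best = actions[topk]                                   # keep the K best sequences
        belief = Normal(best.mean(dim=0), best.std(dim=0, unbiased=False))
    return belief.mean.squeeze(-1).tolist()                    # planned action sequence

print(cem_toy())  # the planned actions roughly steer x from 1 towards 0
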
def plan(self, state, observation):
    """
    Plan a sequence of actions by running several planning episodes from the current state.

    :param state: the initial environment state
    :param observation: the corresponding observation
    :return: the planned action sequence
    """
    for self.episode in range(self.config['episodes']):
        if (self.episode + 1) % max(self.config['episodes'] // 10, 1) == 0:
            logger.debug('{} / {}'.format(self.episode + 1, self.config['episodes']))
        self.run(safe_deepcopy_env(state))
    return self.get_plan()

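# --- Hedged sketch, not part of the source: how a plan(state, observation) interface like
# --- the ones above is typically driven in a receding-horizon loop: re-plan from a copy of
# --- the current state at every step and execute only the first planned action.
# --- ToyCountingEnv and StubPlanner are illustrative stand-ins, not the real classes.
import copy

class ToyCountingEnv:
    """Minimal environment: reward 1 for action 1, episode ends after 5 steps."""
    def __init__(self):
        self.t = 0
    def step(self, action):
        self.t += 1
        return self.t, float(action == 1), self.t >= 5, {}

class StubPlanner:
    """Stands in for a real planner, which would run its search iterations here."""
    def plan(self, state, observation):
        return [1]  # a fixed one-action plan

env, planner = ToyCountingEnv(), StubPlanner()
observation, total, done = 0, 0.0, False
while not done:
    actions = planner.plan(copy.deepcopy(env), observation)  # plays the role of safe_deepcopy_env(state)
    observation, reward, done, _ = env.step(actions[0])
    total += reward
print(total)  # 5.0
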
def expand(self, state, leaves, update_children=False):
    if state is None:
        raise Exception("The state should be set before expanding a node")
    try:
        actions = state.get_available_actions()
    except AttributeError:
        actions = range(state.action_space.n)
    for action in actions:
        self.children[action] = type(self)(self, self.planner)
        if update_children:
            _, reward, done, _ = safe_deepcopy_env(state).step(action)
            self.children[action].update(reward, done)
    idx = leaves.index(self)
    leaves = leaves[:idx] + list(self.children.values()) + leaves[idx + 1:]
    return leaves

def expand(self, leaves):
    if self.state is None:
        raise Exception("The state should be set before expanding a node")
    try:
        actions = self.state.get_available_actions()
    except AttributeError:
        actions = range(self.state.action_space.n)
    for action in actions:
        self.children[action] = type(self)(self,
                                           self.planner,
                                           state=safe_deepcopy_env(self.state),
                                           depth=self.depth + 1)
        _, reward, done, _ = self.children[action].state.step(action)
        self.children[action].update(reward, done)
    leaves.remove(self)
    leaves.extend(self.children.values())
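
# --- Hedged sketch, not part of the source: a stripped-down node with the same
# --- leaves-bookkeeping pattern as the expand(leaves) methods above, so that the
# --- breadth-wise growth of the frontier can be checked in isolation. ToyNode and
# --- n_actions are illustrative assumptions; no environment stepping happens here.
class ToyNode:
    def __init__(self, parent=None, depth=0):
        self.parent = parent
        self.depth = depth
        self.children = {}

    def expand(self, leaves, n_actions=2):
        # Create one child per action, then replace this node in the frontier
        # by its children, as in the expand() methods above.
        for action in range(n_actions):
            self.children[action] = ToyNode(parent=self, depth=self.depth + 1)
        leaves.remove(self)
        leaves.extend(self.children.values())

root = ToyNode()
leaves = [root]
for _ in range(3):                 # grow the tree three levels deep
    for node in list(leaves):      # iterate over a copy: expand() mutates leaves
        node.expand(leaves)
print(len(leaves))                 # 2 ** 3 = 8 leaf nodes at depth 3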