Example #1
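This example (like the two that follow) is an excerpt from a larger planning codebase; the base classes and helpers it uses (Policy, Agent, Model, FixedPlanPolicy, PriorityQueue, memoize, plus bookkeeping such as self.log, self.save, self.ep_trace, and self.i_episode) are defined elsewhere and not shown. A minimal sketch of the shared imports, assuming NumPy and an older gym release whose reset()/step() dispatch to _reset()/_step():

from collections import Counter, defaultdict, namedtuple

import gym
import numpy as np

# Project-specific pieces assumed to be importable from the surrounding codebase
# (names taken from the examples below; exact module paths are unknown):
# Policy, Agent, Model, FixedPlanPolicy, PriorityQueue, memoize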
class ValSearchPolicy(Policy):
    """Searches for the maximum reward path using a model."""
    def __init__(self,
                 V,
                 replan=False,
                 epsilon=0,
                 noise=1,
                 anneal=1,
                 **kwargs):
        super().__init__(**kwargs)
        self.V = V
        self.replan = replan
        self.epsilon = epsilon
        self.noise = noise
        self.anneal = anneal
        self.history = None
        self.model = None
        self.plan = None

    def start_episode(self, state):
        self.history = Counter()
        self.model = Model(self.env)
        self.plan = iter(())  # start with no plan

    def finish_episode(self, trace):
        # Record the final berry count (the last element of the observation).
        self.ep_trace['berries'] = self.env._observe()[-1]

    def act(self, state):
        self.history[state] += 1
        try:
            if self.replan:
                raise StopIteration()  # force a fresh plan on every step
            return next(self.plan)
        except StopIteration:
            self.plan = iter(self.make_plan(state))
            return next(self.plan)

    def make_plan(self, state, expansions=2000):

        Node = namedtuple('Node', ('state', 'path', 'reward', 'done'))
        env = self.env
        V = memoize(self.V.predict)
        self.node_history = []

        def eval_node(node, noisy=False):
            if not node.path:
                return np.inf  # the empty plan has infinite cost
            obs = env._observe(node.state)
            noise = (np.random.rand() * self.noise * self.anneal**self.i_episode
                     if noisy else 0)
            value = 0 if node.done else V(obs)[0]
            boredom = -0.1 * self.history[obs]
            score = node.reward + value + noise + boredom
            return -score

        start = Node(env._state, [], 0, False)
        frontier = PriorityQueue(key=eval_node)
        frontier.push(start)
        reward_to_state = defaultdict(lambda: -np.inf)
        reward_to_state[start.state] = 0
        best_finished = start

        def expand(node):
            nonlocal best_finished
            best_finished = min((best_finished, node), key=eval_node)
            s0, p0, r0, _ = node
            for a, s1, r, done in self.model.options(s0):
                node1 = Node(s1, p0 + [a], r0 + r, done)
                if node1.reward <= reward_to_state[s1]:
                    continue  # cannot be better than an existing node
                self.node_history.append({
                    'path': node1.path,
                    'r': node1.reward,
                    'b': self.env._observe(node1.state)[-1],
                    'v': -eval_node(node1),
                })
                reward_to_state[s1] = node1.reward
                if done:
                    best_finished = min((best_finished, node1), key=eval_node)
                else:
                    frontier.push(node1)

        for i in range(expansions):
            if frontier:
                expand(frontier.pop())
            else:
                break

        if frontier:
            plan = min(best_finished, frontier.pop(), key=eval_node)
        else:
            plan = best_finished
        self.log(
            i,
            len(plan.path),
            -round(eval_node(plan, noisy=False), 2),
            plan.done,
        )
        return plan.path
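The search above leans on two helpers that are not shown: a PriorityQueue constructed with a key function (supporting push, pop, truthiness, and, in Example #2 below, iteration over (priority, item) pairs) and a memoize wrapper used to cache V.predict per observation. A minimal sketch consistent with that usage, not the project's actual implementation:

import heapq


class PriorityQueue(list):
    """A list-backed min-heap of (priority, item) pairs; pop returns the item
    with the lowest key. Assumes items with equal priority are comparable."""

    def __init__(self, key):
        super().__init__()
        self.key = key

    def push(self, item):
        heapq.heappush(self, (self.key(item), item))

    def pop(self):
        return heapq.heappop(self)[1]


def memoize(fn):
    """Cache a one-argument function by its (hashable) argument."""
    cache = {}

    def wrapper(x):
        if x not in cache:
            cache[x] = fn(x)
        return cache[x]

    return wrapper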
Example #2
class Astar(Policy):
    """A* search finds the shortest path to a goal."""
    def __init__(self, heuristic):
        assert 0  # this implementation is incorrect
        super().__init__()
        self.heuristic = heuristic
        self.plan = iter(())

    def start_episode(self, state):
        self.history = Counter()
        self.model = Model(self.env)

    def act(self, state):
        self.history[state] += 1
        try:
            return next(self.plan)
        except StopIteration:
            self.plan = iter(self.make_plan(state))
            return next(self.plan)

    def eval_node(self, node):
        if not node.path:
            return np.inf  # the empty plan has infinite cost
        obs = self.env._observe(node.state)
        value = 0 if node.done else self.heuristic(self.env, obs)
        boredom = -0.1 * self.history[obs]
        score = node.reward + value + boredom
        return -score

    def make_plan(self, state, expansions=5000):

        Node = namedtuple('Node', ('state', 'path', 'reward', 'done'))
        eval_node = self.eval_node
        start = Node(self.env._state, [], 0, False)
        frontier = PriorityQueue(key=eval_node)
        frontier.push(start)
        reward_to_state = defaultdict(lambda: -np.inf)
        best_finished = start

        def expand(node):
            nonlocal best_finished
            s0, p0, r0, _ = node
            for a, s1, r, done in self.model.options(s0):
                node1 = Node(s1, p0 + [a], r0 + r, done)
                if node1.reward <= reward_to_state[s1]:
                    continue  # cannot be better than an existing node
                reward_to_state[s1] = node1.reward
                if done:
                    best_finished = min((best_finished, node1), key=eval_node)
                else:
                    frontier.push(node1)

        for i in range(expansions):
            self.save('frontier', [n[1].state for n in frontier])
            if frontier:
                expand(frontier.pop())
            else:
                break

        if frontier:
            # The expansion budget ran out before the frontier was exhausted.
            raise RuntimeError('No plan found.')
        plan = best_finished
        self.save('plan', plan)
        return plan.path
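Astar calls its heuristic as heuristic(env, obs); it plays the role that V plays in Example #1, estimating the reward still obtainable from obs. A hypothetical placeholder heuristic (the class itself is unusable until the assert 0 in __init__ is removed):

def zero_heuristic(env, obs):
    # Hypothetical placeholder: assume no further reward is obtainable.
    return 0


# policy = Astar(zero_heuristic)  # would trip the assert 0 until the class is fixed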
Example #3
class MetaBestFirstSearchEnv(gym.Env):
    """A meta-MDP for best first search with a deterministic transition model."""
    Node = namedtuple('Node', ('state', 'path', 'reward', 'done'))
    State = namedtuple('State', ('frontier', 'reward_to_state', 'best_done'))
    TERM = 'TERM'

    def __init__(self, env, eval_node, expansion_cost=0.01):
        super().__init__()
        self.env = env
        self.expansion_cost = -abs(expansion_cost)

        # This guy interacts with the external environment, what a chump!
        self.surface_agent = Agent()
        self.surface_agent.register(self.env)
        self.eval_node = eval_node

    def _reset(self):
        self.env.reset()
        # Warning: this breaks if the external env resets again.
        self.model = Model(self.env)
        start = self.Node(self.env._state, [], 0, False)
        # The noisy evaluation key is really part of the meta policy.
        frontier = PriorityQueue(key=self.eval_node(noisy=True))
        frontier.push(start)
        reward_to_state = defaultdict(lambda: -np.inf)
        best_done = None
        # Warning: state is mutable (and we mutate it!)
        self._state = self.State(frontier, reward_to_state, best_done)
        return self._state

    def _step(self, action):
        """Expand a node in the frontier."""
        if action is self.TERM:
            # The return of one episode in the external env is
            # one reward in the MetaSearchEnv.
            trace = self._execute_plan()
            external_reward = trace['return']
            return None, external_reward, True, {'trace': trace}
        else:
            return self._expand_node(action), self.expansion_cost, False, {}

    def _execute_plan(self):
        frontier, reward_to_state, best_done = self._state

        if not best_done:
            raise RuntimeError('Cannot make plan.')

        policy = FixedPlanPolicy(best_done.path)
        self.surface_agent.register(policy)
        trace = self.surface_agent.run_episode(reset=False)
        return trace

    def _expand_node(self, node):
        frontier, reward_to_state, best_done = self._state
        s0, p0, r0, _ = node

        for a, s1, r, done in self.model.options(s0):
            node1 = self.Node(s1, p0 + [a], r0 + r, done)
            if node1.reward <= reward_to_state[s1] - 0.002:
                continue  # clearly worse than the best known path to s1
            reward_to_state[s1] = node1.reward
            if done:
                best_done = max((best_done, node1),
                                key=self.eval_node(noisy=False))
            else:
                frontier.push(node1)

        self._state = self.State(frontier, reward_to_state, best_done)
        return self._state
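A rough sketch of how this meta-MDP might be driven, assuming eval_node is a factory that returns a scoring function when called with noisy=True/False (as _reset and _expand_node expect) and an older gym release whose reset()/step() dispatch to _reset()/_step(). The driver below is hypothetical and not part of the original code:

def run_greedy_meta_episode(external_env, eval_node, max_expansions=1000):
    """Expand best-first until some plan reaches a terminal state, then TERM."""
    meta_env = MetaBestFirstSearchEnv(external_env, eval_node)
    state = meta_env.reset()
    meta_return = 0.0
    for _ in range(max_expansions):
        frontier, reward_to_state, best_done = state
        if best_done is not None or not frontier:
            break  # a plan is finished (or there is nothing left to expand)
        node = frontier.pop()  # expand the node the scoring function likes best
        state, reward, _, _ = meta_env.step(node)
        meta_return += reward
    # TERM executes the best finished plan in the external env;
    # _execute_plan raises if no plan has finished yet.
    _, external_reward, _, info = meta_env.step(meta_env.TERM)
    return meta_return + external_reward, info['trace']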