Example #1
    def _state_step(
        self, action: D.T_agent[D.T_concurrency[D.T_event]]
    ) -> TransitionOutcome[D.T_state, D.T_agent[Value[D.T_value]],
                           D.T_agent[D.T_predicate], D.T_agent[D.T_info], ]:

        # Get players' moves
        move1, move2 = action["player1"], action["player2"]

        # Compute rewards
        r1, r2 = {
            (Move.rock, Move.rock): (0, 0),
            (Move.rock, Move.paper): (-1, 1),
            (Move.rock, Move.scissors): (1, -1),
            (Move.paper, Move.rock): (1, -1),
            (Move.paper, Move.paper): (0, 0),
            (Move.paper, Move.scissors): (-1, 1),
            (Move.scissors, Move.rock): (-1, 1),
            (Move.scissors, Move.paper): (1, -1),
            (Move.scissors, Move.scissors): (0, 0),
        }[move1, move2]

        # Compute num_move increment
        last_state = self._memory
        num_move = last_state.num_move + 1

        return TransitionOutcome(
            state=State(num_move=num_move),
            value={
                "player1": Value(reward=r1),
                "player2": Value(reward=r2)
            },
            termination=(num_move >= self._max_moves),
        )
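The `Move` and `State` types used by this step function are not shown in the snippet; a minimal sketch consistent with how they are used here (the exact definitions in the original domain may differ):

from enum import Enum
from typing import NamedTuple


class Move(Enum):  # the three simultaneous moves compared in the reward table above
    rock = 0
    paper = 1
    scissors = 2


class State(NamedTuple):
    num_move: int  # number of rounds played so far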
Example #2
 def _get_transition_value(
     self,
     memory: D.T_memory[D.T_state],
     action: D.T_agent[D.T_concurrency[D.T_event]],
     next_state: Optional[D.T_state] = None,
 ) -> D.T_agent[Value[D.T_value]]:
     return Value(cost=1)
Example #3
 def _get_transition_value(
     self,
     memory: D.T_memory[D.T_state],
     event: D.T_event,
     next_state: Optional[D.T_state] = None,
 ) -> Value[D.T_value]:
     return Value(cost=self.next_state_map[memory][event][next_state][1])
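The nested `next_state_map` is not defined in this snippet. Judging from the lookup, it maps state, then event, then next state to a tuple whose second element is the transition cost; a hypothetical layout (the first tuple element is assumed here to be a transition probability):

# Hypothetical structure inferred from the lookup above:
#   next_state_map[state][event][next_state] == (probability, cost)
next_state_map = {
    "s0": {
        "move_right": {
            "s1": (1.0, 2.5),  # reaching s1 from s0 via move_right costs 2.5
        },
    },
}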
Example #4
    def __init__(
        self,
        from_state: Optional[D.T_state] = None,
        heuristic: Optional[
            Callable[[Domain, D.T_state], D.T_agent[Value[D.T_value]]]
        ] = None,
        weight: float = 1.0,
        verbose: bool = False,
        max_iter: int = 5000,
        max_depth: int = 200,
    ) -> None:
        self._from_state = from_state
        self._heuristic = (
            (lambda _, __: Value(cost=0.0)) if heuristic is None else heuristic
        )
        self._weight = weight
        self.max_iter = max_iter
        self.max_depth = max_depth
        self._plan = []
        self.values = {}

        self._verbose = verbose

        self.heuristic_changed = False
        self._policy = {}
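A hedged usage sketch for a constructor like this one. `MySolver` is a hypothetical stand-in for whichever solver class defines this `__init__`, and the heuristic mirrors the Euclidean-distance heuristics passed to solvers later in these examples:

from math import sqrt

from skdecide import Value  # assuming scikit-decide's Value, as used throughout these examples

# Hypothetical instantiation; MySolver stands in for the class defining the __init__ above.
solver = MySolver(
    heuristic=lambda d, s: Value(
        cost=sqrt((d.num_cols - 1 - s.x) ** 2 + (d.num_rows - 1 - s.y) ** 2)
    ),
    weight=1.5,
    max_iter=10000,
    max_depth=500,
)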
Example #5
 def _get_next_state(
     self,
     memory: D.T_memory[D.T_state],
     action: D.T_agent[D.T_concurrency[D.T_event]],
 ) -> D.T_state:
     env = memory._context[0]
     if self._set_state is None or self._get_state is None:
         env = deepcopy(env)
     elif memory._context[4] != self._get_state(env):
         self._set_state(env, memory._context[4])
     self._gym_env = env  # in case the simulation environment differs from the planner's environment
     obs, reward, done, info = env.step(action)
     outcome = TransitionOutcome(state=obs,
                                 value=Value(reward=reward),
                                 termination=done,
                                 info=info)
     # print('Transition:', str(memory._state), ' -> ', str(action), ' -> ', str(outcome.state))
     return GymDomainStateProxy(
         state=outcome.state,
         context=[
             env,
             memory._state,
             action,
             outcome,
             self._get_state(env) if
             (self._get_state is not None
              and self._set_state is not None) else None,
         ],
     )
Example #6
 def _get_transition_value(
     self,
     memory: D.T_memory[D.T_state],
     action: D.T_agent[D.T_concurrency[D.T_event]],
     next_state: Optional[D.T_state] = None,
 ) -> D.T_agent[Value[D.T_value]]:
     v = super()._get_transition_value(memory, action, next_state)
     return Value(reward=v.reward - 1)
Example #7
 def _get_transition_value(
     self,
     memory: D.T_memory[D.T_state],
     event: D.T_event,
     next_state: Optional[D.T_state] = None,
 ) -> Value[D.T_value]:
     return Value(cost=self.next_state_attributes[memory[-1]][event][
         self.attribute_weight])
Example #8
 def _get_transition_value(
     self,
     memory: D.T_memory[D.T_state],
     action: D.T_agent[D.T_concurrency[D.T_event]],
     next_state: Optional[D.T_state] = None,
 ) -> D.T_agent[Value[D.T_value]]:
     # every move costs 1
     return Value(cost=abs(next_state.x - memory.x) + abs(next_state.y - memory.y))
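The grid examples above and below assume a state exposing integer `x`/`y` coordinates. A minimal stand-in (the real domain's state class may carry more fields) showing the Manhattan cost this method computes:

from typing import NamedTuple


class State(NamedTuple):  # minimal stand-in for the grid state used above
    x: int
    y: int


prev, nxt = State(0, 0), State(1, 0)
cost = abs(nxt.x - prev.x) + abs(nxt.y - prev.y)  # == 1: one move, unit cost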
Example #9
 def _state_step(
     self, action: D.T_agent[D.T_concurrency[D.T_event]]
 ) -> TransitionOutcome[D.T_state, D.T_agent[Value[D.T_value]],
                        D.T_agent[D.T_predicate], D.T_agent[D.T_info], ]:
     obs, reward, done, info = self._gym_env.step(action)
     return TransitionOutcome(state=obs,
                              value=Value(reward=reward),
                              termination=done,
                              info=info)
Example #10
 def _state_step(
     self, action: D.T_agent[D.T_concurrency[D.T_event]]
 ) -> TransitionOutcome[D.T_state, D.T_agent[Value[D.T_value]],
                        D.T_agent[D.T_predicate], D.T_agent[D.T_info], ]:
     o = super()._state_step(action)
     return TransitionOutcome(
         state=GymDomainStateProxy(state=normalize_and_round(
             o.state._state),
                                   context=o.state._context),
         value=Value(reward=o.value.reward - 1),
         termination=o.termination,
         info=o.info,
     )
Example #11
 def _state_step(
     self, action: D.T_agent[D.T_concurrency[D.T_event]]
 ) -> TransitionOutcome[D.T_state, D.T_agent[Value[D.T_value]],
                        D.T_agent[D.T_predicate], D.T_agent[D.T_info], ]:
     self._gym_env.set_state(self._current_state)
     o = super()._state_step(action)
     self._current_state = self._gym_env.get_state()
     return TransitionOutcome(
         state=o.state,
         value=Value(reward=o.value.reward - 1),
         termination=o.termination,
         info=o.info,
     )
Example #12
 def _state_step(
     self, action: D.T_agent[D.T_concurrency[D.T_event]]
 ) -> TransitionOutcome[D.T_state, D.T_agent[Value[D.T_value]],
                        D.T_agent[D.T_predicate], D.T_agent[D.T_info], ]:
     obs, reward, done, info = self._gym_env.step(action)
     if self._set_state is not None and self._get_state is not None:
         state = GymDomainStateProxy(state=obs,
                                     context=self._initial_env_state)
     else:
         state = GymDomainStateProxy(state=obs, context=self._init_env)
     return TransitionOutcome(state=state,
                              value=Value(reward=reward),
                              termination=done,
                              info=info)
Example #13
 def _get_transition_value(
     self,
     memory: D.T_memory[D.T_state],
     action: D.T_agent[D.T_concurrency[D.T_event]],
     next_state: Optional[D.T_state] = None,
 ) -> D.T_agent[Value[D.T_value]]:
     if next_state.x == -1 and next_state.y == -1:
         cost = 2 * (
             self.num_cols + self.num_rows
         )  # dead-end state, penalty higher than optimal goal-reaching paths
     else:
         cost = abs(next_state.x - memory.x) + abs(
             next_state.y - memory.y)  # every move costs 1
     return Value(cost=cost)
Example #14
    def _get_transition_value(
        self,
        memory: D.T_memory[D.T_state],
        action: D.T_agent[D.T_concurrency[D.T_event]],
        next_state: Optional[D.T_state] = None,
    ) -> D.T_agent[Value[D.T_value]]:

        if next_state.x == memory.x and next_state.y == memory.y:
            cost = 2  # big penalty when hitting a wall
        else:
            cost = abs(next_state.x - memory.x) + abs(
                next_state.y - memory.y)  # every move costs 1

        return Value(cost=cost)
Example #15
 def _sample(
     self,
     memory: D.T_memory[D.T_state],
     action: D.T_agent[D.T_concurrency[D.T_event]],
 ) -> EnvironmentOutcome[D.T_agent[D.T_observation],
                         D.T_agent[Value[D.T_value]],
                         D.T_agent[D.T_predicate], D.T_agent[D.T_info], ]:
     o = super()._sample(memory, action)
     return EnvironmentOutcome(
         observation=GymDomainStateProxy(
             state=normalize_and_round(o.observation._state),
             context=o.observation._context,
         ),
         value=Value(reward=o.value.reward - 1),
         termination=o.termination,
         info=o.info,
     )
Example #16
    def __init__(
        self,
        from_state: Optional[D.T_state] = None,
        heuristic: Optional[
            Callable[[Domain, D.T_state], D.T_agent[Value[D.T_value]]]
        ] = None,
        weight: float = 1.0,
        verbose: bool = False,
        render: bool = False,
    ) -> None:

        self._from_state = from_state
        self._heuristic = (
            (lambda _, __: Value(cost=0.0)) if heuristic is None else heuristic
        )
        self._weight = weight
        self._verbose = verbose
        self._render = render
        self._values = {}
        self._plan = []
Example #17
            "config": {
                "domain_factory":
                domain_factory,
                "parallel":
                False,
                "discount":
                1.0,
                "max_tip_expanions":
                1,
                "detect_cycles":
                False,
                "debug_logs":
                False,
                "heuristic":
                lambda d, s: Value(cost=sqrt(
                    (s.x - (rows - 1)) * (s.x - (rows - 1)) +
                    (s.y - (columns - 1)) * (s.y - (columns - 1)))),
            },
        }
    ]

    # Load solvers (filtering out badly installed ones)
    solvers = map(lambda s: dict(s, entry=load_registered_solver(s["entry"])),
                  try_solvers)
    solvers = list(filter(lambda s: s["entry"] is not None, solvers))

    # Run loop to ask user input
    domain = domain_factory()
    while True:
        # Ask user input to select solver
        choice = int(
Example #18
        def decode(val):
            return [val[0], val[1]]


if __name__ == "__main__":

    try_solvers = [
        # A* (planning)
        {
            "name": "A* (planning)",
            "entry": "Astar",
            "config": {
                "domain_factory":
                lambda: MyDomain(),
                "heuristic":
                lambda d, s: Value(cost=sqrt((d.num_cols - 1 - s.x)**2 +
                                             (d.num_rows - 1 - s.y)**2)),
                "parallel":
                True,
                "debug_logs":
                False,
            },
        },
        # IW (planning)
        {
            "name": "IW (planning)",
            "entry": "IW",
            "config": {
                "domain_factory": lambda: MyDomain(),
                "state_features": lambda d, s: [s.x, s.y],
                "use_state_feature_hash": False,
                "parallel": True,
Example #19
 def decode(value):
     if value[1].value:
         return Value(reward=value[0].value)
     else:
         return Value(cost=value[0].value)
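A hedged usage sketch for `decode`; `Cell` is a hypothetical wrapper standing in for whatever objects expose the `.value` attribute that `decode` reads:

from typing import NamedTuple


class Cell(NamedTuple):  # hypothetical wrapper exposing a .value attribute
    value: object


print(decode([Cell(5.0), Cell(True)]))   # flag truthy -> Value carrying reward=5.0
print(decode([Cell(3.0), Cell(False)]))  # flag falsy  -> Value carrying cost=3.0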
Example #20
        @staticmethod
        def decode(val):
            return int(val.value)


if __name__ == "__main__":

    try_solvers = [
        # LRTDP
        {
            "name": "LRTDP",
            "entry": "LRTDP",
            "config": {
                "domain_factory": lambda: MyDomain(),
                "heuristic": lambda d, s: Value(
                    cost=sqrt((d.num_cols - 1 - s.x) ** 2 + (d.num_rows - 1 - s.y) ** 2)
                ),
                "use_labels": True,
                "time_budget": 60000,
                "rollout_budget": 10000,
                "max_depth": 50,
                "discount": 1.0,
                "epsilon": 0.001,
                "online_node_garbage": True,
                "continuous_planning": False,
                "parallel": True,
                "debug_logs": False,
            },
        },
        # ILAO*
        {