def _state_step(
    self, action: D.T_agent[D.T_concurrency[D.T_event]]
) -> TransitionOutcome[
    D.T_state,
    D.T_agent[Value[D.T_value]],
    D.T_agent[D.T_predicate],
    D.T_agent[D.T_info],
]:
    # Get players' moves
    move1, move2 = action["player1"], action["player2"]
    # Compute rewards
    r1, r2 = {
        (Move.rock, Move.rock): (0, 0),
        (Move.rock, Move.paper): (-1, 1),
        (Move.rock, Move.scissors): (1, -1),
        (Move.paper, Move.rock): (1, -1),
        (Move.paper, Move.paper): (0, 0),
        (Move.paper, Move.scissors): (-1, 1),
        (Move.scissors, Move.rock): (-1, 1),
        (Move.scissors, Move.paper): (1, -1),
        (Move.scissors, Move.scissors): (0, 0),
    }[move1, move2]
    # Compute num_move increment
    last_state = self._memory
    num_move = last_state.num_move + 1
    return TransitionOutcome(
        state=State(num_move=num_move),
        value={"player1": Value(reward=r1), "player2": Value(reward=r2)},
        termination=(num_move >= self._max_moves),
    )
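# Hedged usage sketch (illustrative, not from the original source): assuming
# _state_step above belongs to a concrete two-player domain class, here called
# RockPaperScissors (a hypothetical name) whose constructor sets _max_moves,
# one round starting from a fresh initial state could be resolved like this:
#
#   domain = RockPaperScissors(max_moves=3)
#   outcome = domain._state_step({"player1": Move.rock, "player2": Move.paper})
#   assert outcome.value["player1"].reward == -1  # paper beats rock
#   assert outcome.value["player2"].reward == 1
#   assert not outcome.termination  # only 1 of 3 moves played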
def _get_transition_value(
    self,
    memory: D.T_memory[D.T_state],
    action: D.T_agent[D.T_concurrency[D.T_event]],
    next_state: Optional[D.T_state] = None,
) -> D.T_agent[Value[D.T_value]]:
    # Every transition has a constant cost of 1.
    return Value(cost=1)
def _get_transition_value(
    self,
    memory: D.T_memory[D.T_state],
    event: D.T_event,
    next_state: Optional[D.T_state] = None,
) -> Value[D.T_value]:
    # Look up the cost stored for this (state, event, next state) transition
    # in the next-state map.
    return Value(cost=self.next_state_map[memory][event][next_state][1])
def __init__(
    self,
    from_state: Optional[D.T_state] = None,
    heuristic: Optional[
        Callable[[Domain, D.T_state], D.T_agent[Value[D.T_value]]]
    ] = None,
    weight: float = 1.0,
    verbose: bool = False,
    max_iter: int = 5000,
    max_depth: int = 200,
) -> None:
    self._from_state = from_state
    self._heuristic = (
        (lambda _, __: Value(cost=0.0)) if heuristic is None else heuristic
    )
    self._weight = weight
    self.max_iter = max_iter
    self.max_depth = max_depth
    self._plan = []
    self.values = {}
    self._verbose = verbose
    self.heuristic_changed = False
    self._policy = {}
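# Hedged usage sketch (illustrative; "MySolver" stands in for whatever class
# this __init__ belongs to, and the heuristic shown is an assumption): the
# heuristic argument is a callable mapping (domain, state) to a Value; when
# omitted, the zero-cost default makes the search uninformed but keeps it
# admissible for non-negative costs.
#
#   solver = MySolver(
#       heuristic=lambda domain, state: Value(cost=abs(state.x) + abs(state.y)),
#       weight=1.5,
#       max_iter=10000,
#       max_depth=100,
#   )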
def _get_next_state(
    self,
    memory: D.T_memory[D.T_state],
    action: D.T_agent[D.T_concurrency[D.T_event]],
) -> D.T_state:
    env = memory._context[0]
    if self._set_state is None or self._get_state is None:
        env = deepcopy(env)
    elif memory._context[4] != self._get_state(env):
        self._set_state(env, memory._context[4])
    # Just in case the simulation environment differs from the planner's environment...
    self._gym_env = env
    obs, reward, done, info = env.step(action)
    outcome = TransitionOutcome(
        state=obs, value=Value(reward=reward), termination=done, info=info
    )
    # print('Transition:', str(memory._state), ' -> ', str(action), ' -> ', str(outcome.state))
    return GymDomainStateProxy(
        state=outcome.state,
        context=[
            env,
            memory._state,
            action,
            outcome,
            self._get_state(env)
            if (self._get_state is not None and self._set_state is not None)
            else None,
        ],
    )
def _get_transition_value(
    self,
    memory: D.T_memory[D.T_state],
    action: D.T_agent[D.T_concurrency[D.T_event]],
    next_state: Optional[D.T_state] = None,
) -> D.T_agent[Value[D.T_value]]:
    # Shift the parent domain's reward by -1 to penalize every step.
    v = super()._get_transition_value(memory, action, next_state)
    return Value(reward=v.reward - 1)
def _get_transition_value(
    self,
    memory: D.T_memory[D.T_state],
    event: D.T_event,
    next_state: Optional[D.T_state] = None,
) -> Value[D.T_value]:
    return Value(
        cost=self.next_state_attributes[memory[-1]][event][self.attribute_weight]
    )
def _get_transition_value(
    self,
    memory: D.T_memory[D.T_state],
    action: D.T_agent[D.T_concurrency[D.T_event]],
    next_state: Optional[D.T_state] = None,
) -> D.T_agent[Value[D.T_value]]:
    # Every move costs 1: the Manhattan distance between consecutive cells.
    return Value(cost=abs(next_state.x - memory.x) + abs(next_state.y - memory.y))
def _state_step(
    self, action: D.T_agent[D.T_concurrency[D.T_event]]
) -> TransitionOutcome[
    D.T_state,
    D.T_agent[Value[D.T_value]],
    D.T_agent[D.T_predicate],
    D.T_agent[D.T_info],
]:
    obs, reward, done, info = self._gym_env.step(action)
    return TransitionOutcome(
        state=obs, value=Value(reward=reward), termination=done, info=info
    )
def _state_step(
    self, action: D.T_agent[D.T_concurrency[D.T_event]]
) -> TransitionOutcome[
    D.T_state,
    D.T_agent[Value[D.T_value]],
    D.T_agent[D.T_predicate],
    D.T_agent[D.T_info],
]:
    o = super()._state_step(action)
    return TransitionOutcome(
        state=GymDomainStateProxy(
            state=normalize_and_round(o.state._state), context=o.state._context
        ),
        value=Value(reward=o.value.reward - 1),
        termination=o.termination,
        info=o.info,
    )
def _state_step(
    self, action: D.T_agent[D.T_concurrency[D.T_event]]
) -> TransitionOutcome[
    D.T_state,
    D.T_agent[Value[D.T_value]],
    D.T_agent[D.T_predicate],
    D.T_agent[D.T_info],
]:
    self._gym_env.set_state(self._current_state)
    o = super()._state_step(action)
    self._current_state = self._gym_env.get_state()
    return TransitionOutcome(
        state=o.state,
        value=Value(reward=o.value.reward - 1),
        termination=o.termination,
        info=o.info,
    )
def _state_step(
    self, action: D.T_agent[D.T_concurrency[D.T_event]]
) -> TransitionOutcome[
    D.T_state,
    D.T_agent[Value[D.T_value]],
    D.T_agent[D.T_predicate],
    D.T_agent[D.T_info],
]:
    obs, reward, done, info = self._gym_env.step(action)
    if self._set_state is not None and self._get_state is not None:
        state = GymDomainStateProxy(state=obs, context=self._initial_env_state)
    else:
        state = GymDomainStateProxy(state=obs, context=self._init_env)
    return TransitionOutcome(
        state=state, value=Value(reward=reward), termination=done, info=info
    )
def _get_transition_value(
    self,
    memory: D.T_memory[D.T_state],
    action: D.T_agent[D.T_concurrency[D.T_event]],
    next_state: Optional[D.T_state] = None,
) -> D.T_agent[Value[D.T_value]]:
    if next_state.x == -1 and next_state.y == -1:
        # Dead-end state: penalty higher than any optimal goal-reaching path
        cost = 2 * (self.num_cols + self.num_rows)
    else:
        # Every move costs 1
        cost = abs(next_state.x - memory.x) + abs(next_state.y - memory.y)
    return Value(cost=cost)
def _get_transition_value(
    self,
    memory: D.T_memory[D.T_state],
    action: D.T_agent[D.T_concurrency[D.T_event]],
    next_state: Optional[D.T_state] = None,
) -> D.T_agent[Value[D.T_value]]:
    if next_state.x == memory.x and next_state.y == memory.y:
        # Big penalty when hitting a wall
        cost = 2
    else:
        # Every move costs 1
        cost = abs(next_state.x - memory.x) + abs(next_state.y - memory.y)
    return Value(cost=cost)
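# Worked example (illustrative): with memory at (2, 3), a move blocked by a
# wall leaves next_state at (2, 3), so the first branch applies and the cost
# is 2; a legal move to (2, 4) falls through to the Manhattan-distance branch
# and costs abs(2 - 2) + abs(4 - 3) = 1.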
def _sample(
    self,
    memory: D.T_memory[D.T_state],
    action: D.T_agent[D.T_concurrency[D.T_event]],
) -> EnvironmentOutcome[
    D.T_agent[D.T_observation],
    D.T_agent[Value[D.T_value]],
    D.T_agent[D.T_predicate],
    D.T_agent[D.T_info],
]:
    o = super()._sample(memory, action)
    return EnvironmentOutcome(
        observation=GymDomainStateProxy(
            state=normalize_and_round(o.observation._state),
            context=o.observation._context,
        ),
        value=Value(reward=o.value.reward - 1),
        termination=o.termination,
        info=o.info,
    )
def __init__(
    self,
    from_state: Optional[D.T_state] = None,
    heuristic: Optional[
        Callable[[Domain, D.T_state], D.T_agent[Value[D.T_value]]]
    ] = None,
    weight: float = 1.0,
    verbose: bool = False,
    render: bool = False,
) -> None:
    self._from_state = from_state
    self._heuristic = (
        (lambda _, __: Value(cost=0.0)) if heuristic is None else heuristic
    )
    self._weight = weight
    self._verbose = verbose
    self._render = render
    self._values = {}
    self._plan = []
"config": { "domain_factory": domain_factory, "parallel": False, "discount": 1.0, "max_tip_expanions": 1, "detect_cycles": False, "debug_logs": False, "heuristic": lambda d, s: Value(cost=sqrt( (s.x - (rows - 1)) * (s.x - (rows - 1)) + (s.y - (columns - 1)) * (s.y - (columns - 1)))), }, } ] # Load solvers (filtering out badly installed ones) solvers = map(lambda s: dict(s, entry=load_registered_solver(s["entry"])), try_solvers) solvers = list(filter(lambda s: s["entry"] is not None, solvers)) # Run loop to ask user input domain = domain_factory() while True: # Ask user input to select solver choice = int(
def decode(val):
    return [val[0], val[1]]


if __name__ == "__main__":
    try_solvers = [
        # A* (planning)
        {
            "name": "A* (planning)",
            "entry": "Astar",
            "config": {
                "domain_factory": lambda: MyDomain(),
                "heuristic": lambda d, s: Value(
                    cost=sqrt(
                        (d.num_cols - 1 - s.x) ** 2 + (d.num_rows - 1 - s.y) ** 2
                    )
                ),
                "parallel": True,
                "debug_logs": False,
            },
        },
        # IW (planning)
        {
            "name": "IW (planning)",
            "entry": "IW",
            "config": {
                "domain_factory": lambda: MyDomain(),
                "state_features": lambda d, s: [s.x, s.y],
                "use_state_feature_hash": False,
                "parallel": True,
def decode(value):
    # value[1] flags whether value[0] is to be read as a reward (truthy)
    # or a cost.
    if value[1].value:
        return Value(reward=value[0].value)
    else:
        return Value(cost=value[0].value)
@staticmethod
def decode(val):
    return int(val.value)


if __name__ == "__main__":
    try_solvers = [
        # LRTDP
        {
            "name": "LRTDP",
            "entry": "LRTDP",
            "config": {
                "domain_factory": lambda: MyDomain(),
                "heuristic": lambda d, s: Value(
                    cost=sqrt(
                        (d.num_cols - 1 - s.x) ** 2 + (d.num_rows - 1 - s.y) ** 2
                    )
                ),
                "use_labels": True,
                "time_budget": 60000,
                "rollout_budget": 10000,
                "max_depth": 50,
                "discount": 1.0,
                "epsilon": 0.001,
                "online_node_garbage": True,
                "continuous_planning": False,
                "parallel": True,
                "debug_logs": False,
            },
        },
        # ILAO*
        {