def simulation_process(self, child_node: Node) -> None:
    # From the child node onwards, sample actions randomly for n time steps
    # or until a terminal state is reached, and collect the rewards
    current_state = child_node.state
    for t in range(self.episode_duration):
        # Sample action randomly
        action = np.random.choice(self.action_space)
        # Simulate next state
        next_state_values = self.prediction_model(s_t=current_state, a_t=action)
        next_state_idx = self.env.get_state_space_idx(observation=next_state_values)
        next_state = State(x=next_state_values[0], y=next_state_values[1],
                           x_pos=next_state_idx[0], y_pos=next_state_idx[1])
        # Add reward
        self.reward_collection += self.reward_model.get_reward(s=next_state, a=action)
        # Check if a terminal state was reached
        if self.env.is_terminal(state=next_state):
            break
        else:
            current_state = next_state
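The rollout above simply strings together random actions for at most episode_duration steps and accumulates the model-based rewards. As a minimal, self-contained illustration of that bounded random-rollout pattern, consider the toy 1-D random walk below; the environment and reward are invented purely for illustration and are not part of this project:

import numpy as np

def random_rollout(start: int, horizon: int = 20, goal: int = 5) -> float:
    # Random rollout: take random steps until the goal (terminal state)
    # or the horizon is reached, accumulating reward along the way.
    total_reward, state = 0.0, start
    for _ in range(horizon):
        state += np.random.choice([-1, 1])        # random action
        if state == goal:                          # terminal state reached
            total_reward += 1.0
            break
    return total_reward

print(random_rollout(start=0))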
def expansion_process(self, leaf_node: Node) -> Node:
    # If not all actions have been explored yet, choose randomly among the unexplored ones
    available_actions = [
        self.action_space[i] for i in range(len(self.action_space))
        if leaf_node.action_visits[i] == 0
    ]
    action = np.random.choice(available_actions)
    # Simulate next state
    next_state_values = self.prediction_model(s_t=leaf_node.state, a_t=action)
    next_state_idx = self.env.get_state_space_idx(observation=next_state_values)
    next_state = State(x=next_state_values[0], y=next_state_values[1],
                       x_pos=next_state_idx[0], y_pos=next_state_idx[1])
    self.reward_collection += self.reward_model.get_reward(s=next_state, a=action)
    # Add the new node to the tree
    child_node = Node(state=next_state, action_space=self.action_space)
    leaf_node.add_successor(node=child_node, action=action)
    # Update visit statistics of the expanded leaf node
    if leaf_node.total_visits == np.inf:
        leaf_node.total_visits = 1
    else:
        leaf_node.total_visits += 1
    leaf_node.action_visits[action] += 1
    self.action_trajectory.put(action)
    return child_node
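To see how these two phases fit together, a single planning iteration could look roughly like the sketch below. The agent variable, the omitted selection step, and reading agent.reward_collection afterwards are assumptions made for illustration, not code from this project:

# Hypothetical sketch of one expand-and-rollout iteration (selection and backup omitted).
def planning_iteration(agent, leaf_node):
    child_node = agent.expansion_process(leaf_node=leaf_node)  # attach one unexplored action
    agent.simulation_process(child_node=child_node)            # random rollout from the new node
    return agent.reward_collection                             # reward gathered along the way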
def gen_node_from_observation(self, observation: List[float]) -> Node:
    """
    Generates a node object for the environment observation
    :param observation: raw environment observation as [x, y] values
    :return: node wrapping the corresponding state and the agent's action space
    """
    x_idx, y_idx = self.env.get_state_space_idx(observation=observation)
    state = State(x=observation[0], y=observation[1], x_pos=x_idx, y_pos=y_idx)
    return Node(state=state, action_space=self.action_space)
def tree_traversal(self, ref_node: Node, ref_state: State,
                   search_tree: queue.LifoQueue) -> Tuple[Union[Node, None], queue.LifoQueue]:
    """
    Searches the successors of ref_node for a node whose state index matches
    ref_state, pushing every visited successor onto the search tree.
    :param ref_node: node whose successors are inspected
    :param ref_state: state to look for
    :param search_tree: LIFO queue used as the search frontier
    :return: the matching node (or None) and the updated search tree
    """
    for action_id in self.action_space:
        nodes_list = ref_node.successor_nodes[action_id]
        if len(nodes_list) == 0:
            continue
        for node in nodes_list:
            search_tree.put(node)
            if node.state.get_state_idx() == ref_state.get_state_idx():
                return node, search_tree
    return None, search_tree
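The traversal relies on a queue.LifoQueue as its search frontier, which gives depth-first order when nodes are repeatedly popped and their successors pushed. The following self-contained example shows only that generic pattern on a toy successor map; it is not the project's Node API:

import queue

# Toy successor map standing in for Node.successor_nodes (assumption for illustration).
successors = {"root": ["a", "b"], "a": ["c"], "b": [], "c": []}

def find(target: str) -> bool:
    frontier = queue.LifoQueue()          # LIFO frontier -> depth-first order
    frontier.put("root")
    while not frontier.empty():
        node = frontier.get()
        if node == target:
            return True
        for child in successors[node]:
            frontier.put(child)
    return False

print(find("c"))   # True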
def gen_state_from_observation(self, observation: List[float]) -> State:
    """
    Generates a state object for the environment observation
    :param observation: raw environment observation as [x, y] values
    :return: state object; grid indices are only set for a discrete state space
    """
    if self.state_space_type == STATE_SPACE_TYPE.DISCRETE:
        x_idx, y_idx = self.env.get_state_space_idx(observation=observation)
    else:
        x_idx, y_idx = None, None
    return State(x=observation[0], y=observation[1], x_pos=x_idx, y_pos=y_idx)
def get_future_reward(self, state: State, action: int) -> float:
    """
    Computes the future reward obtained by taking action a in state s,
    assuming deterministic system dynamics.
    :param state: current state s
    :param action: action a to evaluate
    :return: reward of action a taken in state s
    """
    s_t1_obs = self.transition_model.state_transition(state, action)
    x_t1_idx, y_t1_idx = self.env.get_state_space_idx(observation=s_t1_obs)
    new_state = State(x=s_t1_obs[0], y=s_t1_obs[1], x_pos=x_t1_idx, y_pos=y_t1_idx)
    reward = self.value_space[new_state.x_idx][new_state.y_idx]
    return reward
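One natural use of such a one-step lookahead is greedy action selection: evaluate every action's future reward from the current state and pick the argmax. The sketch below assumes the surrounding class exposes get_future_reward and an iterable action_space, as above; the greedy_action helper itself is hypothetical and only illustrative:

import numpy as np

def greedy_action(agent, state):
    # Deterministic one-step lookahead over all actions; pick the most valuable one.
    future_rewards = [agent.get_future_reward(state=state, action=a)
                      for a in agent.action_space]
    return agent.action_space[int(np.argmax(future_rewards))]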
def simulate_n_steps(self):
    for i in range(self.n_simulations + 1):
        # Sample a previously observed state-action pair from experience
        sample_state_idx = np.random.choice(range(len(self.state_action_observations.keys())))
        sample_state = list(self.state_action_observations.keys())[sample_state_idx]
        sample_action = np.random.choice(self.state_action_observations[sample_state])
        # Simulate one step and get reward
        state_t0 = self.env.get_state_space_value(x_idx=sample_state[0], y_idx=sample_state[1])
        s_t1 = self.prediction_model(s_t=state_t0, a_t=sample_action)
        x_t1_idx, y_t1_idx = self.env.get_state_space_idx(observation=s_t1)
        state_t1 = State(x=s_t1[0], y=s_t1[1], x_pos=x_t1_idx, y_pos=y_t1_idx)
        reward_t1 = self.reward_model.get_reward(s=state_t1, a=sample_action)
        # Make Q-learning update with the simulated transition
        self.q_learning_update(state_t0=state_t0, action_t0=Action(action=sample_action),
                               reward=reward_t1, state_t1=state_t1)
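The q_learning_update call itself is not shown here, but for a tabular agent it would typically apply the standard rule Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)). Below is a minimal sketch of that rule on a NumPy table; the table layout, the integer state/action encoding, and the hyperparameter values are assumptions, not this project's implementation:

import numpy as np

def q_learning_update(q_table: np.ndarray, s: int, a: int, r: float, s_next: int,
                      alpha: float = 0.1, gamma: float = 0.95) -> None:
    # Standard tabular Q-learning: move Q(s, a) toward the bootstrapped target.
    td_target = r + gamma * np.max(q_table[s_next])
    q_table[s, a] += alpha * (td_target - q_table[s, a])

# Tiny usage example on a 3-state, 2-action table.
q = np.zeros((3, 2))
q_learning_update(q, s=0, a=1, r=1.0, s_next=2)
print(q[0, 1])   # 0.1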