Example #1
    def step(self, reward, observation):
        if not self.keep_subtree:
            self.subtree_node = Node(None, observation)
            self.expansion(self.subtree_node)

        for i in range(self.num_iterations):
            self.MCTS_iteration()
        action, sub_tree = self.choose_action()
        self.subtree_node = sub_tree
        return action
Example #2
    def insertStart(self, data):

        self.counter += 1

        newNode = Node(data)

        if not self.head:
            self.head = newNode
        else:
            newNode.nextNode = self.head
            self.head = newNode
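
The insertStart snippet above assumes a Node that stores the payload in data and links through nextNode; a minimal sketch of that holder (an assumption for illustration, not taken from the source) is:

class Node:
    def __init__(self, data):
        # payload plus a pointer to the next element of the singly linked list
        self.data = data
        self.nextNode = None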
Example #3
    def start(self, observation):
        self.episode_counter += 1
        if self.keep_tree and self.root is None:
            self.root = Node(None, observation)
            self.expansion(self.root)

        if self.keep_tree:
            self.subtree_node = self.root
        else:
            self.subtree_node = Node(None, observation)
            self.expansion(self.subtree_node)

        action = BaseDynaAgent.start(self, observation)
        return action
Example #4
    def expansion(self, node):
        for a in self.action_list:
            next_state, is_terminal, reward = self.true_model(
                node.get_state(),
                a)  # assuming a deterministic model
            # if np.array_equal(next_state, node.get_state()):
            #     continue
            value = self.get_initial_value(next_state)
            child = Node(node,
                         next_state,
                         is_terminal=is_terminal,
                         action_from_par=a,
                         reward_from_par=reward,
                         value=value)
            node.add_child(child)

            buffer_prev_state = self.getStateRepresentation(node.get_state())
            act_ind = self.getActionIndex(a)
            buffer_prev_action = torch.tensor([act_ind],
                                              device=self.device).view(1, 1)
            buffer_reward = torch.tensor([reward], device=self.device)
            buffer_state = None
            buffer_action = None
            if not is_terminal:
                buffer_state = self.getStateRepresentation(next_state)
                buffer_action = self.policy(buffer_state)
            self.updateTransitionBuffer(
                utils.transition(buffer_prev_state, buffer_prev_action,
                                 buffer_reward, buffer_state, buffer_action,
                                 is_terminal, self.time_step, 0))
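
Example #4 (and the variant in Example #11 below) pushes each expanded edge into a replay buffer through utils.transition, which the source never shows. A namedtuple with the field layout implied by these call sites is one plausible sketch; the field names here are assumptions:

from collections import namedtuple

# assumed layout inferred from the eight positional arguments used above;
# the real utils.transition may differ
transition = namedtuple('transition', [
    'prev_state', 'prev_action', 'reward', 'state', 'action',
    'is_terminal', 'time_step', 'index'
])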
Example #5
    def expansion(self, node):
        children_list = []
        sort_list = []
        for a in self.action_list:
            next_state, is_terminal, reward = self.true_model(
                node.get_state(),
                a)  # assuming a deterministic model
            # if np.array_equal(next_state, node.get_state()):
            #     continue
            value = self.get_initial_value(next_state)
            child = Node(node,
                         next_state,
                         is_terminal=is_terminal,
                         action_from_par=a,
                         reward_from_par=reward,
                         value=value)
            children_list.append(child)
            sort_value = self.get_state_value(next_state)
            sort_list.append(sort_value)

        children_list = [
            x for _, x in sorted(zip(sort_list, children_list),
                                 key=lambda pair: pair[0],
                                 reverse=True)
        ]
        for i in range(self.branch_factor):
            node.add_child(children_list[i])
Example #6
    def enqueue(self, data):
        n = Node(data)
        if self.count() == 0:
            self.first = self.last = n
            return

        self.last.next = n
        self.last = n
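
enqueue above relies on self.first, self.last, and a count() helper that are not shown; a hedged sketch of count and a matching dequeue (both assumed, and assuming Node initialises next to None and stores the payload in data) could be:

    def count(self):
        # walk the chain from first to last; a stored length counter would also work
        c, n = 0, self.first
        while n is not None:
            c += 1
            n = n.next
        return c

    def dequeue(self):
        # remove and return the oldest element; raising on empty is an assumed choice
        if self.first is None:
            raise IndexError("dequeue from empty queue")
        n = self.first
        self.first = n.next
        if self.first is None:
            self.last = None
        return n.data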
Example #7
    def start(self, observation):
        if self.keep_tree and self.root is None:
            self.root = Node(None, observation)
            self.expansion(self.root)

        if self.keep_tree:
            self.subtree_node = self.root
            print(self.subtree_node.get_avg_value())
        else:
            self.subtree_node = Node(None, observation)
            self.expansion(self.subtree_node)

        # self.render_tree()

        for i in range(self.num_iterations):
            self.MCTS_iteration()
        action, sub_tree = self.choose_action()
        self.subtree_node = sub_tree
        return action
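
Examples #1, #3, and #7 expose the usual start/step/end agent interface; a hedged sketch of the episode loop that would drive it (the environment API returning (observation, reward, done) is an assumption, not part of the source) is:

def run_episode(agent, env):
    # one episode: start() picks the first action, step() every later one,
    # end() closes the episode when the environment reports termination
    observation = env.reset()
    action = agent.start(observation)
    done = False
    while not done:
        observation, reward, done = env.step(action)
        if done:
            agent.end(reward)
        else:
            action = agent.step(reward, observation)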
Example #8
    def append_to_tail(self, data):
        end = Node(data)
        n = self.head

        if n is None:
            self.head = end
            return

        while n.next is not None:
            n = n.next

        n.next = end
Example #9
    def expansion(self, node):
        for a in self.action_list:
            next_state, is_terminal, reward = self.true_model(
                node.get_state(),
                a)  # assuming a deterministic model
            # if np.array_equal(next_state, node.get_state()):
            #     continue
            value = self.get_initial_value(next_state)
            child = Node(node,
                         next_state,
                         is_terminal=is_terminal,
                         action_from_par=a,
                         reward_from_par=reward,
                         value=value)
            node.add_child(child)
Example #10
    def add(self, data):
        self.no_of_nodes += 1
        node = Node(data)
        if self.head is None:
            node.node_next = None
            self.head = node
        else:
            node.node_next = self.head
            self.head = node
Example #11
    def expansion(self, node):
        for a in self.action_list:
            next_state, is_terminal, reward = self.true_model(
                node.get_state(),
                a)  # assuming a deterministic model
            # if np.array_equal(next_state, node.get_state()):
            #     continue
            value = self.get_initial_value(next_state)
            child = Node(node,
                         next_state,
                         is_terminal=is_terminal,
                         action_from_par=a,
                         reward_from_par=reward,
                         value=value)
            node.add_child(child)

            buffer_prev_state = self.getStateRepresentation(node.get_state())
            act_ind = self.getActionIndex(a)
            buffer_prev_action = torch.tensor([act_ind],
                                              device=self.device).view(1, 1)
            buffer_reward = torch.tensor([reward], device=self.device)
            buffer_state = None
            buffer_action = None
            if not is_terminal:
                buffer_state = self.getStateRepresentation(next_state)
                buffer_action = self.policy(buffer_state)
            with torch.no_grad():
                real_prev_action = self.action_list[buffer_prev_action.item()]
                prev_state_value = self.getStateActionValue(
                    buffer_prev_state, real_prev_action).item()
                state_value = 0
                if not is_terminal:
                    buffer_state = buffer_state.float()
                    # bootstrap with the max Q-value of the next state
                    state_value = self._vf['q']['network'](buffer_state).max(
                        1)[0].item()
                td_error = buffer_reward.item(
                ) + self.gamma * state_value - prev_state_value
                if (td_error >= self.td_average):
                    self.updateTransitionBuffer(
                        utils.transition(buffer_prev_state.float(),
                                         buffer_prev_action,
                                         buffer_reward.float(), buffer_state,
                                         None, is_terminal, self.time_step, 0))
                    self.mcts_count += 1

                    self.update_average_td_error(td_error)
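
Example #11 stores an expanded transition only when its TD error reaches the running average self.td_average; the update_average_td_error it calls is not shown, but an exponential moving average is one plausible sketch (the smoothing factor is an assumption):

    def update_average_td_error(self, td_error, alpha=0.01):
        # exponential moving average over observed TD errors; alpha is assumed
        self.td_average = (1 - alpha) * self.td_average + alpha * td_error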
Example #12
    def step(self, reward, observation):
        if not self.keep_subtree:
            self.subtree_node = Node(None, observation)
            self.expansion(self.subtree_node)

        self.time_step += 1

        self.state = self.getStateRepresentation(observation)

        reward = torch.tensor([reward], device=self.device)
        self.action = self.policy(self.state)

        # store the new transition in buffer
        if self.episode_counter % 2 == 1:
            self.updateTransitionBuffer(
                utils.transition(self.prev_state, self.prev_action, reward,
                                 self.state, self.action, False,
                                 self.time_step, 0))
        # update target
        if self._target_vf['counter'] >= self._target_vf['update_rate']:
            self.setTargetValueFunction(self._vf['q'], 'q')
            # self.setTargetValueFunction(self._vf['s'], 's')

        # update value function with the buffer
        if self._vf['q']['training']:
            if len(self.transition_buffer) >= self._vf['q']['batch_size']:
                transition_batch = self.getTransitionFromBuffer(
                    n=self._vf['q']['batch_size'])
                self.updateValueFunction(transition_batch, 'q')
        if self._vf['s']['training']:
            if len(self.transition_buffer) >= self._vf['s']['batch_size']:
                transition_batch = self.getTransitionFromBuffer(
                    n=self._vf['s']['batch_size'])
                self.updateValueFunction(transition_batch, 's')

        # train/plan with model
        self.trainModel()
        self.plan()

        self.updateStateRepresentation()

        self.prev_state = self.getStateRepresentation(observation)
        self.prev_action = self.action  # alternatively, self.policy could be called again here

        return self.action_list[self.prev_action.item()]
Example #13
class MCTSAgent(BaseAgent):
    name = "MCTSAgent"

    def __init__(self, params={}):

        self.time_step = 0
        # self.writer = SummaryWriter()

        self.prev_state = None
        self.state = None

        self.action_list = params['action_list']
        self.num_actions = self.action_list.shape[0]
        self.actions_shape = self.action_list.shape[1:]

        self.gamma = params['gamma']
        self.epsilon = params['epsilon']

        self.device = params['device']

        if is_gridWorld:
            self.transition_dynamics = params['transition_dynamics']
        else:
            self.true_model = params['true_fw_model']
        # MCTS parameters
        self.C = params['c']
        self.num_iterations = params['num_iteration']
        self.num_rollouts = params['num_simulation']
        self.rollout_depth = params['simulation_depth']
        self.keep_subtree = False
        self.keep_tree = False
        self.root = None

        self.is_model_imperfect = False
        self.corrupt_prob = 0.025
        self.corrupt_step = 1

    def start(self, observation):
        if self.keep_tree and self.root is None:
            self.root = Node(None, observation)
            self.expansion(self.root)

        if self.keep_tree:
            self.subtree_node = self.root
            print(self.subtree_node.get_avg_value())
        else:
            self.subtree_node = Node(None, observation)
            self.expansion(self.subtree_node)

        # self.render_tree()

        for i in range(self.num_iterations):
            self.MCTS_iteration()
        action, sub_tree = self.choose_action()
        self.subtree_node = sub_tree
        return action

    def step(self, reward, observation):
        if not self.keep_subtree:
            self.subtree_node = Node(None, observation)
            self.expansion(self.subtree_node)

        for i in range(self.num_iterations):
            self.MCTS_iteration()
        action, sub_tree = self.choose_action()
        self.subtree_node = sub_tree
        return action

    def end(self, reward):
        pass

    def get_initial_value(self, state):
        return 0

    def choose_action(self):
        max_visit = -np.inf
        max_action_list = []
        max_child_list = []
        for child in self.subtree_node.get_childs():
            if child.num_visits > max_visit:
                max_visit = child.num_visits
                max_action_list = [child.get_action_from_par()]
                max_child_list = [child]
            elif child.num_visits == max_visit:
                max_action_list.append(child.get_action_from_par())
                max_child_list.append(child)
        random_ind = random.randint(0, len(max_action_list) - 1)
        return max_action_list[random_ind], max_child_list[random_ind]

#     @timecall(immediate=False)

    def MCTS_iteration(self):
        # self.render_tree()
        selected_node = self.selection()
        # now we decide to expand the leaf or rollout
        if selected_node.is_terminal:
            self.backpropagate(selected_node, 0)
        elif selected_node.num_visits == 0:  # don't expand just roll-out
            rollout_value = self.rollout(selected_node)
            self.backpropagate(selected_node, rollout_value)
        else:  # expand then roll_out
            self.expansion(selected_node)
            rollout_value = self.rollout(selected_node.get_childs()[0])
            self.backpropagate(selected_node.get_childs()[0], rollout_value)

#     @timecall(immediate=False)

    def selection(self):
        selected_node = self.subtree_node
        while len(selected_node.get_childs()) > 0:
            max_uct_value = -np.inf
            child_values = list(
                map(lambda n: n.get_avg_value() + n.reward_from_par,
                    selected_node.get_childs()))
            max_child_value = max(child_values)
            min_child_value = min(child_values)
            for ind, child in enumerate(selected_node.get_childs()):
                if child.num_visits == 0:
                    selected_node = child
                    break
                else:
                    child_value = child_values[ind]
                    if min_child_value != np.inf and max_child_value != np.inf and min_child_value != max_child_value:
                        child_value = (child_value - min_child_value) / (
                            max_child_value - min_child_value)
                    uct_value = child_value + \
                                self.C * ((child.parent.num_visits / child.num_visits) ** 0.5)
                if max_uct_value < uct_value:
                    max_uct_value = uct_value
                    selected_node = child
        return selected_node

#     @timecall(immediate=False)

    def expansion(self, node):
        for a in self.action_list:
            next_state, is_terminal, reward = self.true_model(
                node.get_state(),
                a)  # assuming a deterministic model
            # if np.array_equal(next_state, node.get_state()):
            #     continue
            value = self.get_initial_value(next_state)
            child = Node(node,
                         next_state,
                         is_terminal=is_terminal,
                         action_from_par=a,
                         reward_from_par=reward,
                         value=value)
            node.add_child(child)

#     @timecall(immediate=False)

    def rollout(self, node):
        sum_returns = 0
        for i in range(self.num_rollouts):
            depth = 0
            single_return = 0
            is_terminal = node.is_terminal
            state = node.get_state()
            while not is_terminal and depth < self.rollout_depth:
                a = random.choice(self.action_list)
                next_state, is_terminal, reward = self.true_model(state, a)
                single_return += reward
                depth += 1
                state = next_state
            sum_returns += single_return
        return sum_returns / self.num_rollouts

#     @timecall(immediate=False)

    def backpropagate(self, node, value):
        while node is not None:
            node.add_to_values(value)
            node.inc_visits()
            value *= self.gamma
            value += node.reward_from_par
            node = node.parent

    def true_model(self, state, action):
        action_index = self.getActionIndex(action)
        transition = self.transition_dynamics[int(state[0]),
                                              int(state[1]), action_index]
        next_state, is_terminal, reward = transition[0:2], transition[
            2], transition[3]
        if self.is_model_imperfect:
            r = random.random()
            if r < self.corrupt_prob:
                for _ in range(self.corrupt_step):
                    action_index = random.randint(0, self.num_actions - 1)
                    transition = self.transition_dynamics[int(state[0]),
                                                          int(state[1]),
                                                          action_index]
                    next_state, is_terminal, reward = transition[
                        0:2], transition[2], transition[3]
                    state = next_state
        return next_state, is_terminal, reward

    def show(self):
        queue = [self.subtree_node, "*"]
        while queue:
            node = queue.pop(0)
            if node == "*":
                print("********")
                continue
            node.show()
            for child in node.get_childs():
                queue.append(child)
            if len(node.get_childs()) > 0:
                queue.append("*")

    def render_tree(self):
        def my_layout(node):
            F = TextFace(node.name, tight_text=True)
            add_face_to_node(F, node, column=0, position="branch-right")

        t = Tree()
        ts = TreeStyle()
        ts.show_leaf_name = False
        queue = [(self.subtree_node, None)]
        while queue:
            node, parent = queue.pop(0)
            uct_value = 0
            if node.parent is not None:
                child_values = list(
                    map(lambda n: n.get_avg_value() + n.reward_from_par,
                        node.parent.get_childs()))
                max_child_value = max(child_values)
                min_child_value = min(child_values)
                child_value = node.get_avg_value()
                if min_child_value != np.inf and max_child_value != np.inf and min_child_value != max_child_value:
                    child_value = (child_value - min_child_value) / (
                        max_child_value - min_child_value)
                if node.num_visits == 0:
                    uct_value = np.inf
                else:
                    uct_value = child_value + \
                                self.C * ((node.parent.num_visits / node.num_visits) ** 0.5)

            node_face = str(node.get_state()) + "," + str(node.num_visits) + "," + str(node.get_avg_value()) \
                        + "," + str(node.is_terminal) + "," + str(uct_value)
            if parent is None:
                p = t.add_child(name=node_face)
            else:
                p = parent.add_child(name=node_face)
            for child in node.get_childs():
                queue.append((child, p))

        ts.layout_fn = my_layout
        # t.render('t.png', tree_style=ts)
        # print(t.get_ascii(show_internal=Tree))
        t.show(tree_style=ts)

    def getActionIndex(self, action):
        # print(action)
        if is_gridWorld:
            if action[0] == 0:
                if action[1] == 1:
                    return 2
                else:
                    return 0
            elif action[0] == 1:
                return 3
            else:
                return 1
        for i, a in enumerate(self.action_list):
            if np.array_equal(a, action):
                return i
        raise ValueError("action is not defined")
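
All of the MCTS examples lean on a tree Node class that the source never shows. The sketch below is consistent with the calls they make (get_state, get_childs, add_child, get_action_from_par, add_to_values, inc_visits, get_avg_value) and the fields they read (parent, num_visits, is_terminal, reward_from_par); the internal field names and the value-averaging rule are assumptions:

class Node:
    def __init__(self, parent, state, is_terminal=False,
                 action_from_par=None, reward_from_par=0, value=0):
        self.parent = parent
        self.state = state
        self.is_terminal = is_terminal
        self.action_from_par = action_from_par
        self.reward_from_par = reward_from_par
        self.sum_values = value  # running sum of backed-up values
        self.num_visits = 0
        self.childs = []

    def get_state(self):
        return self.state

    def get_childs(self):
        return self.childs

    def add_child(self, child):
        self.childs.append(child)

    def get_action_from_par(self):
        return self.action_from_par

    def add_to_values(self, value):
        self.sum_values += value

    def inc_visits(self):
        self.num_visits += 1

    def get_avg_value(self):
        # guard against division by zero for freshly expanded, unvisited nodes
        return self.sum_values / max(self.num_visits, 1)

    def show(self):
        print(self.get_state(), self.num_visits, self.get_avg_value())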
Example #14
    def add(self, element):
        new_node = Node(element)
        # new_node.get_children()
        new_node.set_children(self.head_node)
        self.head_node = new_node
        self.size += 1
Example #15
    def push(self, data):
        n = Node(data)
        n.next = self.top
        self.top = n
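
push above links a new Node on top of the previous one; a matching pop (not in the source; the data attribute name is an assumption) would be:

    def pop(self):
        # remove and return the most recently pushed element
        if self.top is None:
            raise IndexError("pop from empty stack")
        n = self.top
        self.top = n.next
        return n.data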