Example #1
    def select_action_following_policy(self, node, random):
        """
        Selects an action according to the policy given by _get_policy() (default is uniform distribution). It only
        takes into account nodes that have not been solved yet: it sets probabilities of already solved nodes to 0 and
        samples an action from the normalized resulting policy. It returns:
            - (action, None): if the successor corresponding to the selected action is not in the tree
            - (action, successor): if the successor corresponding to the selected action exists in the tree
            - (None, None): if all actions have been solved (or the sum of probabilities of the remaining actions is
            lower than min_prob) and therefore the current node needs to be pruned
        :param node: node from which an action is selected.
        :param random: if True, sample from uniform_policy_fn instead of policy_fn.
        :return: A tuple (action, successor), (action, None) or (None, None).
        """
        if random:
            policy = self.uniform_policy_fn(node)
        else:
            policy = self.policy_fn(node)

        if node.is_leaf():
            # return action to expand
            assert not node.solved and not node.data[
                "done"], "Solved: %s.  Done: %s.  Depth: %s" % (str(
                    node.solved), str(node.data["done"]), str(node.depth))
            return sample_pmf(policy), None

        node_children = [None] * self.branching_factor
        available_actions = (policy > 0)
        for child in node.children:
            node_children[child.data["a"]] = child
            if child.solved:
                available_actions[child.data["a"]] = False

        # Take out actions that have been solved
        p = (policy * available_actions)
        ps = p.sum()

        # No actions available?
        if ps <= self.min_cum_prob:
            # All actions recommended by the policy (i.e. with prob > 0) have been (or should be considered) solved. Solve this node.
            # It is possible that some nodes in the subtree are not marked as solved. That is not a problem: the policy assigns those branches a probability lower than min_prob.
            self.solve_and_propagate_label(node)
            return None, None

        # Select action not solved
        p = p / ps
        assert all(
            (p > 0) == available_actions), "p: %s;  avail_a: %s;   ps:%s" % (
                str(p), str(available_actions), str(ps))
        a = sample_pmf(p)

        child = node_children[a]
        if child:
            assert not child.solved and not child.data[
                "done"], "a: %i, Solved: %s.  Done: %s.  Depth: %s.  policy: %s.  avail_actions: %s.  p: %s.  ps: %s.  children: %s." % (
                    a, str(child.solved), str(
                        child.data["done"]), str(child.depth), str(policy),
                    str(available_actions), str(p), str(ps),
                    str([(c.data["a"], c.solved) for c in node.children]))

        return a, child
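
All of these examples rely on sample_pmf, whose implementation is not shown here. Presumably it draws a single index according to a probability mass function; a minimal sketch, assuming a 1-D numpy array of probabilities:

import numpy as np

def sample_pmf(p):
    # Draw one index i with probability p[i]. p is assumed to be a 1-D array
    # of non-negative values summing to (approximately) 1.
    return int(np.random.choice(len(p), p=p))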
Example #2
File: mcts.py  Project: aig-upf/pi-IW
    def select(self, node):
        """
        Selects a node and an action to expand in a tree.
        Returns (node, action) to expand, or (None, None) if the subtree has
        been solved.
        """
        while True:
            if node.data["done"]:
                return node, None

            policy = self._get_policy(node)
            a = sample_pmf(policy)

            if node.is_leaf():
                # return action to expand
                return node, a

            not_in_tree = True
            for child in node.children:
                if child.data["a"] == a:
                    node = child
                    not_in_tree = False
                    break
            if not_in_tree:
                return node, a
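
The select method above returns a (node, action) pair to expand. A hypothetical driver loop could look like the sketch below; the names tree, planner, actor.generate_successor and max_nodes are assumptions based on the surrounding examples, not the actual pi-IW API.

# Hypothetical expansion loop: repeatedly pick a (node, action) pair and
# expand it until the tree reaches a size budget.
while len(tree) < max_nodes:
    node, a = planner.select(tree.root)
    if a is None:
        break  # selected node is terminal (done); a full planner would prune it
    actor.generate_successor(node, a)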
Example #3
 def select(self, unpruned):
     # Get all feature values in the tree
     reachable_features = list(unpruned.keys())
     # Select state based on counts
     counts = np.array([self.visits[f] for f in reachable_features])
     probs = softmax(1 / (counts + 1), temp=self.temp)
     features = reachable_features[sample_pmf(probs)]
     return unpruned[features]
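
The softmax helper with a temp argument used here (and in several later examples, sometimes with temp=0) is not shown. A minimal sketch of what it presumably computes:

import numpy as np

def softmax(x, temp=1.0):
    # Temperature softmax over a 1-D array of scores. temp=0 is treated as the
    # greedy limit (uniform over the maximal entries), since several examples
    # call softmax(..., temp=0).
    x = np.asarray(x, dtype=np.float64)
    if temp == 0:
        p = (x == x.max()).astype(np.float64)
        return p / p.sum()
    z = (x - x.max()) / temp  # subtract the max for numerical stability
    e = np.exp(z)
    return e / e.sum()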
Example #4
def run_episode(plan_step_fn,
                learner,
                dataset,
                cache_subtree,
                add_returns,
                preproc_obs_fn=None,
                render=False):
    # Note: actor, batch_size and discount_factor are presumably defined in the enclosing scope.
    episode_done = False
    actor.reset()
    episode_rewards = []
    aux_replay = ExperienceReplay()  # New auxiliary buffer to save current episode transitions
    while not episode_done:
        # Planning step
        tree_policy = plan_step_fn(len(episode_rewards))

        # Execute action (choose one node as the new root from depth 1)
        a = sample_pmf(tree_policy)
        prev_root_data, current_root_data = actor.step(a,
                                                       cache_subtree,
                                                       render,
                                                       render_size=(512, 512))
        aux_replay.append({
            "observations": prev_root_data["obs"],
            "target_policy": tree_policy
        })
        episode_rewards.append(current_root_data["r"])
        episode_done = current_root_data["done"]

        # Learning step
        if learner is not None:
            batch = dataset.sample(batch_size)
            if preproc_obs_fn is not None:
                batch["observations"] = preproc_obs_fn(batch["observations"])
            obs = tf.constant(batch["observations"], dtype=tf.float32)
            target_policy = tf.constant(batch["target_policy"],
                                        dtype=tf.float32)
            if add_returns:
                returns = tf.constant(batch["returns"], dtype=tf.float32)
                loss, _ = learner.train_step(obs, target_policy, returns)
            else:
                loss, _ = learner.train_step(obs, target_policy)

    # Add episode to the dataset
    if add_returns:
        returns = compute_returns(episode_rewards,
                                  discount_factor)  # Backpropagate rewards
        aux_replay.add_column("returns", returns)  # Add them to the dataset
    dataset.extend(
        aux_replay
    )  # Add transitions to the buffer that will be used for learning

    return episode_rewards
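
compute_returns (and discount_factor, taken from the enclosing scope) are not shown above; presumably the function accumulates discounted returns backwards over the episode rewards, along these lines:

import numpy as np

def compute_returns(rewards, discount_factor):
    # Backward accumulation of discounted returns:
    # R_t = r_t + discount_factor * R_{t+1}, with R after the last step = 0.
    returns = np.zeros(len(rewards), dtype=np.float32)
    acc = 0.0
    for t in reversed(range(len(rewards))):
        acc = rewards[t] + discount_factor * acc
        returns[t] = acc
    return returns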
Example #5
 def _evaluate():
     done = False
     obs = env_eval.reset()
     episode_rewards = []
     while not done:
         x = tf.constant(preproc_obs_fn([obs]).astype(np.float32))
         res = policy_NN(x, training=False)
         p = softmax(res["policy_logits"].numpy().ravel(),
                     temp=args.eval_temp)
         a = sample_pmf(p)
         obs, r, done, info = env_eval.step(a)
         episode_rewards.append(r)
     return episode_rewards
Example #6
def planning_step(actor, planner, dataset, policy_fn, tree_budget, cache_subtree, discount_factor):
    nodes_before_planning = len(actor.tree)
    budget_fn = lambda: len(actor.tree) - nodes_before_planning == tree_budget
    planner.plan(tree=actor.tree,
                 successor_fn=actor.generate_successor,
                 stop_condition_fn=budget_fn,
                 policy_fn=policy_fn)
    tree_policy = softmax_Q_tree_policy(actor.tree, actor.tree.branching_factor, discount_factor, temp=0)
    a = sample_pmf(tree_policy)
    prev_root_data, current_root_data = actor.step(a, cache_subtree=cache_subtree)
    dataset.append({"observations": prev_root_data["obs"],
                    "target_policy": tree_policy})
    return current_root_data["r"], current_root_data["done"]
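
A hypothetical episode loop around planning_step, shown only as a usage sketch; the setup of actor, planner, dataset, policy_fn, tree_budget, cache_subtree and discount_factor is assumed to happen elsewhere.

# Usage sketch (hypothetical): alternate planning and executing the sampled
# action until a terminal state is reached.
episode_rewards = []
episode_done = False
actor.reset()
while not episode_done:
    r, episode_done = planning_step(actor, planner, dataset, policy_fn,
                                    tree_budget, cache_subtree, discount_factor)
    episode_rewards.append(r)
print("Episode reward:", sum(episode_rewards))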
Example #7
    def run_episode(train, use_value_for_tree_policy):
        reset_counts_fn = getattr(planner, "reset_counts", None)
        if callable(reset_counts_fn):
            reset_counts_fn()

        episode_done = False
        tree = actor.reset()
        if args.hierarchical:  # TODO:
            trajectory = [tree.root.low_level_tree.root.data]
        else:
            trajectory = [tree.root.data]
        solved_in_one_planning_step = False
        r_found = False
        while not episode_done:
            time_start_step = time.time()
            interactions_before_step = interactions.value

            # Planning step
            nodes_before_plan = actor.get_tree_size(tree)
            time_start_plan = time.time()
            interactions.reset_budget()
            planner.initialize(tree=tree)
            planner.plan(tree=tree)
            time_plan = time.time() - time_start_plan
            nodes_after_plan = actor.get_tree_size(tree)

            # if len(trajectory) == 1 and reward_in_tree(tree):
            #     solved_in_one_planning_step = True

            if args.hierarchical:
                stats.add(
                    {
                        "n_abstract_states_so_far":
                        len(planner.visits.keys()),
                        "n_abstract_states_in_tree":
                        len(set(n.data["high_level_features"] for n in tree)),
                        "n_abstract_nodes_in_tree":
                        len(tree),
                        "avg_nodes_per_abstract_node":
                        nodes_after_plan / len(tree)
                    },
                    step=interactions.value)

            # Execute action (choose one node as the new root from depth 1)
            time_start_execute_action = time.time()

            if args.debug:
                r_in_tree = reward_in_tree(tree)
                r_found = r_found or r_in_tree

            actor.compute_returns(tree,
                                  discount_factor=args.discount_factor,
                                  add_value=False)
            if args.hierarchical:
                root_node = tree.root.low_level_tree.root
            else:
                root_node = tree.root
            action_returns = compute_node_Q(
                node=root_node,
                n_actions=n_actions,
                discount_factor=args.discount_factor,
                add_value=False)
            if args.compute_value:
                actor.compute_returns(
                    tree,
                    discount_factor=args.discount_factor,
                    add_value=True,
                    use_value_all_nodes=args.use_value_all_nodes)
                Q = compute_node_Q(node=root_node,
                                   n_actions=n_actions,
                                   discount_factor=args.discount_factor,
                                   add_value=args.use_value_all_nodes)

            # TARGET
            # Note: Q is only computed when args.compute_value is True, so
            # use_value_for_tree_policy presumably requires args.compute_value.
            if use_value_for_tree_policy:
                target_policy = softmax(Q, temp=args.target_policy_temp)
            else:
                target_policy = softmax(action_returns,
                                        temp=args.target_policy_temp)

            # EXECUTION POLICY
            if use_value_for_tree_policy:
                Q_aux = compute_node_Q(node=root_node,
                                       n_actions=n_actions,
                                       discount_factor=args.discount_factor,
                                       add_value=False)
                tree_policy = softmax(Q_aux, temp=args.tree_policy_temp)
            else:
                tree_policy = softmax(action_returns,
                                      temp=args.tree_policy_temp)

            if args.tree_policy_counts_temp is not None:
                counts = actor.get_counts(tree, n_actions)
                counts_policy = softmax(counts,
                                        temp=args.tree_policy_counts_temp)
                p = tree_policy * counts_policy
                sum_p = sum(p)
                if sum_p != 0:
                    tree_policy = p / sum_p

            a = sample_pmf(tree_policy)

            if args.render:
                actor.render_tree(tree,
                                  size=(512, 512),
                                  window_name="Tree before step")

            prev_root_data, current_root = actor.step(
                tree, a, cache_subtree=args.cache_subtree)

            prev_root_data["target_policy"] = target_policy
            nodes_after_execution = actor.get_tree_size(tree)

            time_execute_action = time.time() - time_start_execute_action
            if args.debug:
                # Note: counts is only defined when args.tree_policy_counts_temp
                # is not None, so debug mode presumably assumes that setting.
                actions_explored = sum(counts > 0)

            if args.render:
                actor.render(tree, size=(512, 512))
                actor.render_tree(tree,
                                  size=(512, 512),
                                  window_name="Tree after step")
                if args.hierarchical:
                    actor.render_downsampled(
                        tree,
                        max_pix_value=args.downsampling_pix_values,
                        size=(512, 512))

                if args.render_fps is not None:
                    time.sleep(1 / args.render_fps)

            trajectory.append(current_root.data)

            episode_done = current_root.data["done"]

            # Learning step
            time_start_learn = time.time()
            if train and len(dataset) > args.batch_size and len(
                    dataset) > args.replay_min_transitions:
                _, batch = dataset.sample(size=args.batch_size)

                if train:
                    input_dict = {
                        "observations":
                        tf.constant(preproc_obs_fn(batch["observations"]),
                                    dtype=tf.float32),
                        "target_policy":
                        tf.constant(batch["target_policy"], dtype=tf.float32)
                    }
                    if args.compute_value:
                        if value_scalars_to_distrs is not None:
                            input_dict["returns"] = tf.constant(
                                value_scalars_to_distrs(batch["returns"]),
                                dtype=tf.float32)
                        else:
                            input_dict["returns"] = tf.constant(
                                batch["returns"], dtype=tf.float32)

                    loss, train_output = train_fn(input_dict)
                    stats.add(
                        {
                            "loss":
                            loss,
                            "global_gradients_norm":
                            train_output["global_gradients_norm"],
                            "cross_entropy_loss":
                            train_output["cross_entropy_loss"],
                            "regularization_loss":
                            train_output["regularization_loss"]
                        },
                        step=interactions.value)

                    if args.compute_value:
                        if "errors" in train_output.keys():
                            td_errors = train_output["errors"].numpy()
                        else:
                            assert args.use_value_classification
                            td_errors = batch[
                                "returns"] - value_logits_to_scalars(
                                    train_output["value_logits"])
                        stats.add(
                            {
                                "value_loss": train_output["value_loss"],
                                "td_error": np.mean(np.abs(td_errors)),
                            },
                            step=interactions.value)
            time_learn = time.time() - time_start_learn

            # Evaluate
            if args.eval_episodes > 0:
                if interactions.value - interactions.last_eval_interactions >= args.eval_every_interactions:
                    time_start_eval = time.time()
                    eval_sum_rewards = []
                    eval_steps = []
                    for _ in range(args.eval_episodes):
                        eval_rewards = eval_fn()
                        eval_sum_rewards.append(np.sum(eval_rewards))
                        eval_steps.append(len(eval_rewards))
                    stats.add(
                        {
                            "eval_episode_reward": np.mean(eval_sum_rewards),
                            "eval_episode_steps": np.mean(eval_steps),
                            "time_eval": time.time() - time_start_eval
                        },
                        step=interactions.value)
                    stats.report(["eval_episode_reward", "eval_episode_steps"])
                    interactions.last_eval_interactions = interactions.value

            # Statistics
            interactions_step = interactions.value - interactions_before_step
            time_step = time.time() - time_start_step
            stats.add(
                {
                    # "nodes_before_plan": nodes_before_plan,
                    "nodes_after_plan": nodes_after_plan,
                    "nodes_after_execution": nodes_after_execution,
                    "generated_nodes": nodes_after_plan - nodes_before_plan,
                    "discarded_nodes":
                    nodes_after_plan - nodes_after_execution,
                    "delta_nodes": nodes_after_execution -
                    nodes_before_plan,  # generated - discarded
                    "interactions_per_step": interactions_step,
                    "time_plan": time_plan,
                    "time_execute_action": time_execute_action,
                    "time_step": time_step,
                    "time_learn": time_learn,
                    "steps_per_sec": 1 / time_step,
                    "interactions_per_sec": interactions_step / time_step
                },
                step=interactions.value)

        # Add episode to the dataset
        traj_dict = process_trajectory(trajectory=trajectory,
                                       add_returns=add_returns,
                                       discount_factor=args.discount_factor)
        dataset.extend({
            k: traj_dict[k]
            for k in dataset.keys()
        })  # Add transitions to the buffer that will be used for learning

        stats.add(
            {
                "episode_reward": sum(traj_dict['rewards']),
                # "solved_in_one_planning_step": solved_in_one_planning_step,
                "steps_per_episode": len(traj_dict['rewards']),
                "memory_usage": memory_usage_fn(),
                "dataset_size": len(dataset)
            },
            step=interactions.value)

        if args.debug:
            stats.add(
                {
                    "reward_found": r_found,
                    "actions_explored": actions_explored,
                },
                step=interactions.value)

        if add_returns:
            stats.add({"return_init_state": traj_dict["returns"][0]},
                      step=interactions.value)
        if args.compute_value:
            stats.add(
                {
                    "value_init_state":
                    trajectory[0]["v"],
                    "value_init_state_error":
                    np.abs(trajectory[0]["v"] - traj_dict["returns"][0])
                },
                step=interactions.value)

        stats.increment("episodes", step=interactions.value)
        if args.debug:
            report_stats = [
                "episodes", "episode_reward", "reward_found",
                "steps_per_episode", "memory_usage", "dataset_size"
            ]
        else:
            report_stats = [
                "episodes", "episode_reward", "steps_per_episode",
                "memory_usage", "dataset_size"
            ]
        stats.report(report_stats)
        return trajectory
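
compute_node_Q is used above (and in the next example) to obtain one value per action at the root, but its implementation is not shown. A rough sketch, under the assumption that actor.compute_returns has already stored each child's discounted return in data["R"] and its action in data["a"]:

import numpy as np

def compute_node_Q(node, n_actions, discount_factor, add_value=False):
    # Hypothetical reconstruction: Q[a] is read from the child reached by
    # action a, assuming data["R"] already holds that child's discounted
    # return. Unvisited actions get -inf so a temperature-0 softmax never
    # selects them. discount_factor and add_value (mixing in a learned state
    # value) are not used in this simplified sketch.
    Q = np.full(n_actions, -np.inf, dtype=np.float32)
    for child in node.children:
        Q[child.data["a"]] = child.data["R"]
    return Q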
Example #8
    planner = RolloutIW(branching_factor=env.action_space.n,
                        ignore_cached_nodes=True)

    tree = actor.reset()
    episode_done = False
    steps_cnt = 0
    while not episode_done:
        planner.plan(tree=tree,
                     successor_fn=actor.generate_successor,
                     stop_condition_fn=lambda: len(tree) == max_tree_nodes)

        p = softmax_Q_tree_policy(tree,
                                  env.action_space.n,
                                  discount_factor,
                                  temp=0)
        a = sample_pmf(p)
        prev_root_data, current_root_data = actor.step(
            a, cache_subtree=cache_subtree)

        episode_done = current_root_data["done"]
        steps_cnt += 1
        print(
            "\n".join([
                " ".join(row) for row in env.unwrapped.get_char_matrix(
                    actor.tree.root.data["s"])
            ]), "Action: ", current_root_data["a"], "Reward: ",
            current_root_data["r"], "Simulator steps:", actor.nodes_generated,
            "Planning steps:", steps_cnt, "\n")

    print("It took %i steps but the problem can be solved in 13." % steps_cnt)
    abstract_tree = abstract_tree_actor.reset()
    episode_done = False
    stats = Stats()
    abstract_tree_actor.render_tree(abstract_tree, size=None)
    while not episode_done:
        interactions.reset_budget()
        high_level_planner.initialize(tree=abstract_tree)
        high_level_planner.plan(tree=abstract_tree)

        abstract_tree_actor.compute_returns(abstract_tree, discount_factor=discount_factor, add_value=False)
        Q = compute_node_Q(node=abstract_tree.root.low_level_tree.root,
                           n_actions=env.action_space.n,
                           discount_factor=discount_factor,
                           add_value=False)
        low_level_policy = softmax(Q, temp=0)
        a = sample_pmf(low_level_policy)
        abstract_tree_nodes = len(abstract_tree)

        abstract_tree_actor.render_tree(abstract_tree, size=None)
        prev_root_data, current_root = abstract_tree_actor.step(abstract_tree, a, cache_subtree=cache_subtree)

        episode_done = current_root.data["done"]

        stats.increment("planning_steps", step=interactions.value)
        stats.add({"action": current_root.data["a"],
                   "reward": current_root.data["r"],
                   "abstract_tree_nodes": abstract_tree_nodes,},
                   step=interactions.value)

        stats.report()
        cv2.waitKey(display_time)  # wait time in ms