def select_action_following_policy(self, node, random):
    """
    Selects an action according to the policy given by _get_policy() (default
    is the uniform distribution). It only takes into account nodes that have
    not been solved yet: it sets the probabilities of already solved nodes to 0
    and samples an action from the normalized resulting policy.

    It returns:
        - (action, None): if the successor corresponding to the selected action
          is not in the tree
        - (action, successor): if the successor corresponding to the selected
          action exists in the tree
        - (None, None): if all actions have been solved (or the sum of
          probabilities of the remaining actions is lower than min_prob) and
          therefore the current node needs to be pruned
    :param node:
    :return: A tuple (action, successor), (action, None) or (None, None).
    """
    if random:
        policy = self.uniform_policy_fn(node)
    else:
        policy = self.policy_fn(node)

    if node.is_leaf():
        # Return the action to expand
        assert not node.solved and not node.data["done"], \
            "Solved: %s. Done: %s. Depth: %s" % (
                str(node.solved), str(node.data["done"]), str(node.depth))
        return sample_pmf(policy), None

    node_children = [None] * self.branching_factor
    available_actions = (policy > 0)
    for child in node.children:
        node_children[child.data["a"]] = child
        if child.solved:
            available_actions[child.data["a"]] = False

    # Take out actions that have been solved
    p = (policy * available_actions)
    ps = p.sum()

    # No actions available?
    if ps <= self.min_cum_prob:
        # All actions recommended by the policy (i.e. with prob > 0) have been
        # (or should be considered) solved. Solve the node.
        # It is possible that some nodes in the subtree are not marked as
        # solved. That is not a problem: the policy gives those branches a
        # probability lower than min_prob.
        self.solve_and_propagate_label(node)
        return None, None

    # Select an action that has not been solved
    p = p / ps
    assert all((p > 0) == available_actions), \
        "p: %s; avail_a: %s; ps: %s" % (str(p), str(available_actions), str(ps))
    a = sample_pmf(p)

    child = node_children[a]
    if child:
        assert not child.solved and not child.data["done"], \
            "a: %i, Solved: %s. Done: %s. Depth: %s. policy: %s. avail_actions: %s. p: %s. ps: %s. children: %s." % (
                a, str(child.solved), str(child.data["done"]), str(child.depth),
                str(policy), str(available_actions), str(p), str(ps),
                str([(c.data["a"], c.solved) for c in node.children]))
    return a, child

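# Note: the snippets in this section rely on a `sample_pmf` helper that draws
# an index according to a (possibly unnormalized) probability mass function.
# Its implementation is not shown here; a minimal sketch of the assumed
# behavior, for a 1-D NumPy array input, could look like this (illustrative
# stand-in, not the actual helper):
import numpy as np


def sample_pmf_sketch(p):
    """Sample index i with probability p[i] / p.sum() (illustrative only)."""
    p = np.asarray(p, dtype=np.float64)
    return int(np.random.choice(len(p), p=p / p.sum()))
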
def select(self, node):
    """
    Selects a node and an action to expand in a tree.
    Returns (node, action) to expand, or (None, None) if the subtree has been
    solved.
    """
    while True:
        if node.data["done"]:
            return node, None

        policy = self._get_policy(node)
        a = sample_pmf(policy)

        if node.is_leaf():
            # Return the action to expand
            return node, a

        not_in_tree = True
        for child in node.children:
            if child.data["a"] == a:
                node = child
                not_in_tree = False
                break
        if not_in_tree:
            return node, a

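# `select` is typically called from the planner's expansion loop: pick a
# (node, action) pair, ask the actor/simulator for the successor, and repeat
# until the budget is exhausted. A hedged sketch of such a loop; the
# `generate_successor` and `should_stop` names are assumptions, not the
# actual planner API:
def plan_loop_sketch(tree, select_fn, generate_successor, should_stop):
    """Illustrative only: expand (node, action) pairs chosen by select_fn."""
    while not should_stop():
        node, a = select_fn(tree.root)
        if a is None:
            break  # terminal node reached: nothing to expand from here
        generate_successor(node, a)  # adds the successor of (node, a) to the tree
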
def select(self, unpruned):
    # Get all feature values in the tree
    reachable_features = list(unpruned.keys())

    # Select a state based on counts
    counts = np.array([self.visits[f] for f in reachable_features])
    probs = softmax(1 / (counts + 1), temp=self.temp)
    features = reachable_features[sample_pmf(probs)]
    return unpruned[features]

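# `softmax(x, temp)` is used throughout with a temperature parameter, and some
# call sites pass temp=0 (e.g. softmax_Q_tree_policy(..., temp=0)), which
# suggests the zero-temperature case degenerates to a greedy distribution over
# the maxima. A minimal sketch under that assumption (illustrative stand-in):
import numpy as np


def softmax_sketch(x, temp=1.0):
    """Illustrative only: temperature softmax; temp=0 puts uniform mass on the maxima."""
    x = np.asarray(x, dtype=np.float64)
    if temp == 0:
        p = (x == x.max()).astype(np.float64)
    else:
        p = np.exp((x - x.max()) / temp)  # subtract max for numerical stability
    return p / p.sum()
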
def run_episode(plan_step_fn, learner, dataset, cache_subtree, add_returns,
                preproc_obs_fn=None, render=False):
    episode_done = False
    actor.reset()
    episode_rewards = []
    # New auxiliary buffer to save current episode transitions
    aux_replay = ExperienceReplay()

    while not episode_done:
        # Planning step
        tree_policy = plan_step_fn(len(episode_rewards))

        # Execute action (choose one node as the new root from depth 1)
        a = sample_pmf(tree_policy)
        prev_root_data, current_root_data = actor.step(a, cache_subtree, render,
                                                       render_size=(512, 512))
        aux_replay.append({"observations": prev_root_data["obs"],
                           "target_policy": tree_policy})
        episode_rewards.append(current_root_data["r"])
        episode_done = current_root_data["done"]

        # Learning step
        if learner is not None:
            batch = dataset.sample(batch_size)
            if preproc_obs_fn is not None:
                batch["observations"] = preproc_obs_fn(batch["observations"])
            obs = tf.constant(batch["observations"], dtype=tf.float32)
            target_policy = tf.constant(batch["target_policy"], dtype=tf.float32)
            if add_returns:
                returns = tf.constant(batch["returns"], dtype=tf.float32)
                loss, _ = learner.train_step(obs, target_policy, returns)
            else:
                loss, _ = learner.train_step(obs, target_policy)

    # Add episode to the dataset
    if add_returns:
        returns = compute_returns(episode_rewards, discount_factor)  # Backpropagate rewards
        aux_replay.add_column("returns", returns)  # Add them to the dataset
    # Add transitions to the buffer that will be used for learning
    dataset.extend(aux_replay)

    return episode_rewards

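# `compute_returns(rewards, discount_factor)` backpropagates the episode's
# rewards into discounted returns. Its implementation is not shown here; a
# minimal sketch of the standard backward recursion G_t = r_t + gamma * G_{t+1}
# (illustrative stand-in):
import numpy as np


def compute_returns_sketch(rewards, discount_factor):
    """Illustrative only: discounted returns computed backwards over an episode."""
    returns = np.zeros(len(rewards), dtype=np.float64)
    g = 0.0
    for t in reversed(range(len(rewards))):
        g = rewards[t] + discount_factor * g
        returns[t] = g
    return returns
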
def _evaluate():
    done = False
    obs = env_eval.reset()
    episode_rewards = []
    while not done:
        x = tf.constant(preproc_obs_fn([obs]).astype(np.float32))
        res = policy_NN(x, training=False)
        p = softmax(res["policy_logits"].numpy().ravel(), temp=args.eval_temp)
        a = sample_pmf(p)
        obs, r, done, info = env_eval.step(a)
        episode_rewards.append(r)
    return episode_rewards

def planning_step(actor, planner, dataset, policy_fn, tree_budget,
                  cache_subtree, discount_factor):
    nodes_before_planning = len(actor.tree)
    budget_fn = lambda: len(actor.tree) - nodes_before_planning == tree_budget
    planner.plan(tree=actor.tree,
                 successor_fn=actor.generate_successor,
                 stop_condition_fn=budget_fn,
                 policy_fn=policy_fn)
    tree_policy = softmax_Q_tree_policy(actor.tree, actor.tree.branching_factor,
                                        discount_factor, temp=0)
    a = sample_pmf(tree_policy)
    prev_root_data, current_root_data = actor.step(a, cache_subtree=cache_subtree)
    dataset.append({"observations": prev_root_data["obs"],
                    "target_policy": tree_policy})
    return current_root_data["r"], current_root_data["done"]

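# `softmax_Q_tree_policy(tree, n_actions, discount_factor, temp)` turns the
# returns backed up at the root's children into an action distribution. The
# actual implementation is not shown here; a plausible sketch, assuming each
# depth-1 child stores its action in data["a"] and a backed-up (already
# discounted) return in a hypothetical data["R"] field:
import numpy as np


def softmax_Q_tree_policy_sketch(tree, n_actions, discount_factor, temp=0):
    """Illustrative only: distribution over root actions from backed-up returns."""
    assert len(tree.root.children) > 0, "root must have at least one expanded action"
    q = np.full(n_actions, -np.inf)  # unexpanded actions get zero probability
    for child in tree.root.children:
        q[child.data["a"]] = child.data["R"]  # assumed field; discounting already applied
    if temp == 0:
        p = (q == q.max()).astype(np.float64)  # greedy: uniform mass over the maxima
    else:
        p = np.exp((q - q.max()) / temp)       # exp(-inf) -> 0 for unexpanded actions
    return p / p.sum()
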
def run_episode(train, use_value_for_tree_policy):
    reset_counts_fn = getattr(planner, "reset_counts", None)
    if callable(reset_counts_fn):
        reset_counts_fn()

    episode_done = False
    tree = actor.reset()
    if args.hierarchical:
        # TODO:
        trajectory = [tree.root.low_level_tree.root.data]
    else:
        trajectory = [tree.root.data]
    solved_in_one_planning_step = False
    r_found = False

    while not episode_done:
        time_start_step = time.time()
        interactions_before_step = interactions.value

        # Planning step
        nodes_before_plan = actor.get_tree_size(tree)
        time_start_plan = time.time()
        interactions.reset_budget()
        planner.initialize(tree=tree)
        planner.plan(tree=tree)
        time_plan = time.time() - time_start_plan
        nodes_after_plan = actor.get_tree_size(tree)

        # if len(trajectory) == 1 and reward_in_tree(tree):
        #     solved_in_one_planning_step = True

        if args.hierarchical:
            stats.add(
                {
                    "n_abstract_states_so_far": len(planner.visits.keys()),
                    "n_abstract_states_in_tree": len(set(n.data["high_level_features"] for n in tree)),
                    "n_abstract_nodes_in_tree": len(tree),
                    "avg_nodes_per_abstract_node": nodes_after_plan / len(tree)
                },
                step=interactions.value)

        # Execute action (choose one node as the new root from depth 1)
        time_start_execute_action = time.time()
        if args.debug:
            r_in_tree = reward_in_tree(tree)
            r_found = r_found or r_in_tree

        actor.compute_returns(tree,
                              discount_factor=args.discount_factor,
                              add_value=False)
        if args.hierarchical:
            root_node = tree.root.low_level_tree.root
        else:
            root_node = tree.root
        action_returns = compute_node_Q(node=root_node,
                                        n_actions=n_actions,
                                        discount_factor=args.discount_factor,
                                        add_value=False)

        if args.compute_value:
            actor.compute_returns(tree,
                                  discount_factor=args.discount_factor,
                                  add_value=True,
                                  use_value_all_nodes=args.use_value_all_nodes)
            Q = compute_node_Q(node=root_node,
                               n_actions=n_actions,
                               discount_factor=args.discount_factor,
                               add_value=args.use_value_all_nodes)

        # TARGET
        if use_value_for_tree_policy:
            target_policy = softmax(Q, temp=args.target_policy_temp)
        else:
            target_policy = softmax(action_returns, temp=args.target_policy_temp)

        # EXECUTION POLICY
        if use_value_for_tree_policy:
            Q_aux = compute_node_Q(node=root_node,
                                   n_actions=n_actions,
                                   discount_factor=args.discount_factor,
                                   add_value=False)
            tree_policy = softmax(Q_aux, temp=args.tree_policy_temp)
        else:
            tree_policy = softmax(action_returns, temp=args.tree_policy_temp)

        if args.tree_policy_counts_temp is not None:
            counts = actor.get_counts(tree, n_actions)
            counts_policy = softmax(counts, temp=args.tree_policy_counts_temp)
            p = tree_policy * counts_policy
            sum_p = sum(p)
            if sum_p != 0:
                tree_policy = p / sum_p

        a = sample_pmf(tree_policy)

        if args.render:
            actor.render_tree(tree, size=(512, 512), window_name="Tree before step")

        prev_root_data, current_root = actor.step(tree, a,
                                                  cache_subtree=args.cache_subtree)
        prev_root_data["target_policy"] = target_policy

        nodes_after_execution = actor.get_tree_size(tree)
        time_execute_action = time.time() - time_start_execute_action

        if args.debug:
            actions_explored = sum(counts > 0)

        if args.render:
            actor.render(tree, size=(512, 512))
            actor.render_tree(tree, size=(512, 512), window_name="Tree after step")
            if args.hierarchical:
                actor.render_downsampled(tree,
                                         max_pix_value=args.downsampling_pix_values,
                                         size=(512, 512))
            if args.render_fps is not None:
                time.sleep(1 / args.render_fps)

        trajectory.append(current_root.data)
        episode_done = current_root.data["done"]

        # Learning step
        time_start_learn = time.time()
        if train and len(dataset) > args.batch_size and \
                len(dataset) > args.replay_min_transitions:
            _, batch = dataset.sample(size=args.batch_size)
            input_dict = {
                "observations": tf.constant(preproc_obs_fn(batch["observations"]),
                                            dtype=tf.float32),
                "target_policy": tf.constant(batch["target_policy"],
                                             dtype=tf.float32)
            }
            if args.compute_value:
                if value_scalars_to_distrs is not None:
                    input_dict["returns"] = tf.constant(
                        value_scalars_to_distrs(batch["returns"]),
                        dtype=tf.float32)
                else:
                    input_dict["returns"] = tf.constant(batch["returns"],
                                                        dtype=tf.float32)

            loss, train_output = train_fn(input_dict)
            stats.add(
                {
                    "loss": loss,
                    "global_gradients_norm": train_output["global_gradients_norm"],
                    "cross_entropy_loss": train_output["cross_entropy_loss"],
                    "regularization_loss": train_output["regularization_loss"]
                },
                step=interactions.value)
            if args.compute_value:
                if "errors" in train_output.keys():
                    td_errors = train_output["errors"].numpy()
                else:
                    assert args.use_value_classification
                    td_errors = batch["returns"] - value_logits_to_scalars(
                        train_output["value_logits"])
                stats.add(
                    {
                        "value_loss": train_output["value_loss"],
                        "td_error": np.mean(np.abs(td_errors)),
                    },
                    step=interactions.value)
        time_learn = time.time() - time_start_learn

        # Evaluate
        if args.eval_episodes > 0:
            if interactions.value - interactions.last_eval_interactions >= args.eval_every_interactions:
                time_start_eval = time.time()
                eval_sum_rewards = []
                eval_steps = []
                for _ in range(args.eval_episodes):
                    eval_rewards = eval_fn()
                    eval_sum_rewards.append(np.sum(eval_rewards))
                    eval_steps.append(len(eval_rewards))
                stats.add(
                    {
                        "eval_episode_reward": np.mean(eval_sum_rewards),
                        "eval_episode_steps": np.mean(eval_steps),
                        "time_eval": time.time() - time_start_eval
                    },
                    step=interactions.value)
                stats.report(["eval_episode_reward", "eval_episode_steps"])
                interactions.last_eval_interactions = interactions.value

        # Statistics
        interactions_step = interactions.value - interactions_before_step
        time_step = time.time() - time_start_step
        stats.add(
            {
                # "nodes_before_plan": nodes_before_plan,
                "nodes_after_plan": nodes_after_plan,
                "nodes_after_execution": nodes_after_execution,
                "generated_nodes": nodes_after_plan - nodes_before_plan,
                "discarded_nodes": nodes_after_plan - nodes_after_execution,
                "delta_nodes": nodes_after_execution - nodes_before_plan,  # generated - discarded
                "interactions_per_step": interactions_step,
                "time_plan": time_plan,
                "time_execute_action": time_execute_action,
                "time_step": time_step,
                "time_learn": time_learn,
                "steps_per_sec": 1 / time_step,
                "interactions_per_sec": interactions_step / time_step
            },
            step=interactions.value)

    # Add the episode to the dataset
    traj_dict = process_trajectory(trajectory=trajectory,
                                   add_returns=add_returns,
                                   discount_factor=args.discount_factor)
    # Add transitions to the buffer that will be used for learning
    dataset.extend({k: traj_dict[k] for k in dataset.keys()})

    stats.add(
        {
            "episode_reward": sum(traj_dict['rewards']),
            # "solved_in_one_planning_step": solved_in_one_planning_step,
            "steps_per_episode": len(traj_dict['rewards']),
            "memory_usage": memory_usage_fn(),
            "dataset_size": len(dataset)
        },
        step=interactions.value)
    if args.debug:
        stats.add(
            {
                "reward_found": r_found,
                "actions_explored": actions_explored,
            },
            step=interactions.value)
    if add_returns:
        stats.add({"return_init_state": traj_dict["returns"][0]},
                  step=interactions.value)
        if args.compute_value:
            stats.add(
                {
                    "value_init_state": trajectory[0]["v"],
                    "value_init_state_error": np.abs(trajectory[0]["v"] - traj_dict["returns"][0])
                },
                step=interactions.value)
    stats.increment("episodes", step=interactions.value)

    if args.debug:
        report_stats = [
            "episodes", "episode_reward", "reward_found", "steps_per_episode",
            "memory_usage", "dataset_size"
        ]
    else:
        report_stats = [
            "episodes", "episode_reward", "steps_per_episode", "memory_usage",
            "dataset_size"
        ]
    stats.report(report_stats)

    return trajectory

planner = RolloutIW(branching_factor=env.action_space.n, ignore_cached_nodes=True)

tree = actor.reset()
episode_done = False
steps_cnt = 0
while not episode_done:
    planner.plan(tree=tree,
                 successor_fn=actor.generate_successor,
                 stop_condition_fn=lambda: len(tree) == max_tree_nodes)
    p = softmax_Q_tree_policy(tree, env.action_space.n, discount_factor, temp=0)
    a = sample_pmf(p)
    prev_root_data, current_root_data = actor.step(a, cache_subtree=cache_subtree)
    episode_done = current_root_data["done"]
    steps_cnt += 1
    print("\n".join(" ".join(row)
                    for row in env.unwrapped.get_char_matrix(actor.tree.root.data["s"])),
          "Action: ", current_root_data["a"],
          "Reward: ", current_root_data["r"],
          "Simulator steps:", actor.nodes_generated,
          "Planning steps:", steps_cnt, "\n")

print("It took %i steps but the problem can be solved in 13." % steps_cnt)

abstract_tree = abstract_tree_actor.reset()
episode_done = False
stats = Stats()

abstract_tree_actor.render_tree(abstract_tree, size=None)
while not episode_done:
    interactions.reset_budget()
    high_level_planner.initialize(tree=abstract_tree)
    high_level_planner.plan(tree=abstract_tree)

    abstract_tree_actor.compute_returns(abstract_tree,
                                        discount_factor=discount_factor,
                                        add_value=False)
    Q = compute_node_Q(node=abstract_tree.root.low_level_tree.root,
                       n_actions=env.action_space.n,
                       discount_factor=discount_factor,
                       add_value=False)
    low_level_policy = softmax(Q, temp=0)
    a = sample_pmf(low_level_policy)

    abstract_tree_nodes = len(abstract_tree)
    abstract_tree_actor.render_tree(abstract_tree, size=None)
    prev_root_data, current_root = abstract_tree_actor.step(abstract_tree, a,
                                                            cache_subtree=cache_subtree)
    episode_done = current_root.data["done"]

    stats.increment("planning_steps", step=interactions.value)
    stats.add({"action": current_root.data["a"],
               "reward": current_root.data["r"],
               "abstract_tree_nodes": abstract_tree_nodes},
              step=interactions.value)
    stats.report()
    cv2.waitKey(display_time)  # wait time in ms

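# `compute_node_Q` (used both in run_episode above and in this loop) produces
# one value per action at a node from its depth-1 children. Its actual
# implementation is not shown; a minimal sketch, assuming compute_returns has
# already stored a discounted (and, when requested, value-bootstrapped) return
# in each child's hypothetical data["R"] field, so discount_factor and
# add_value are kept only to mirror the call sites:
import numpy as np


def compute_node_Q_sketch(node, n_actions, discount_factor, add_value=False):
    """Illustrative only: per-action backed-up returns; unexpanded actions get -inf."""
    q = np.full(n_actions, -np.inf)
    for child in node.children:
        q[child.data["a"]] = child.data["R"]  # assumed field filled by compute_returns
    return q
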