def take_cleanup_steps(search_policy, eval_env, num_cleanup_steps):
    set_env_difficulty(eval_env, 0.95)
    search_policy.set_cleanup(True)
    cleanup_start = time.perf_counter()
    # Collector.eval_agent(search_policy, eval_env, num_cleanup_steps, by_episode=False)  # random goals in env
    Collector.step_cleanup(search_policy, eval_env, num_cleanup_steps)  # samples goals from nodes in state graph
    cleanup_end = time.perf_counter()
    search_policy.set_cleanup(False)
    cleanup_time = cleanup_end - cleanup_start
    return cleanup_time
def eval_pointenv_dists(agent, eval_env, num_evals=10, eval_distances=(2, 5, 10)):
    for dist in eval_distances:
        # NOTE: samples goal distances in the closed interval [min_dist, max_dist]
        eval_env.set_sample_goal_args(prob_constraint=1, min_dist=dist, max_dist=dist)
        returns = Collector.eval_agent(agent, eval_env, num_evals)

        # For debugging, it's helpful to check the predicted distances for
        # goals of known distance.
        states = dict(observation=[], goal=[])
        for _ in range(num_evals):
            state = eval_env.reset()
            states['observation'].append(state['observation'])
            states['goal'].append(state['goal'])
        pred_dist = list(agent.get_dist_to_goal(states))

        print(f'\tset goal dist = {dist}')
        print(f'\t\treturns = {returns}')
        print(f'\t\tpredicted_dists = {pred_dist}')
        print(f'\t\taverage return = {np.mean(returns)}')
        print(f'\t\taverage predicted_dist = {np.mean(pred_dist):.1f} ({np.std(pred_dist):.2f})')
def visualize_search_path(search_policy, eval_env, difficulty=0.5):
    set_env_difficulty(eval_env, difficulty)

    if search_policy.open_loop:
        state = eval_env.reset()
        start = state['observation']
        goal = state['goal']
        search_policy.select_action(state)
        waypoints = search_policy.get_waypoints()
    else:
        goal, observations, waypoints, _ = Collector.get_trajectory(search_policy, eval_env)
        start = observations[0]

    plt.figure(figsize=(6, 6))
    plot_walls(eval_env.walls)

    waypoint_vec = np.array(waypoints)
    print(f'waypoints: {waypoint_vec}')
    print(f'waypoints shape: {waypoint_vec.shape}')
    print(f'start: {start}')
    print(f'goal: {goal}')

    plt.scatter([start[0]], [start[1]], marker='+', color='red', s=200, label='start')
    plt.scatter([goal[0]], [goal[1]], marker='*', color='green', s=200, label='goal')
    plt.plot(waypoint_vec[:, 0], waypoint_vec[:, 1], 'y-s', alpha=0.3, label='waypoint')
    plt.legend(loc='lower left', bbox_to_anchor=(-0.1, -0.15), ncol=4, fontsize=16)
    plt.show()
def eval_search_policy(search_policy, eval_env, num_evals=10):
    eval_start = time.perf_counter()

    successes = 0.
    for _ in range(num_evals):
        try:
            _, _, _, ep_reward_list = Collector.get_trajectory(search_policy, eval_env)
            # An episode that ends before the time limit counts as a success.
            successes += int(len(ep_reward_list) < eval_env.duration)
        except Exception:
            # Graph search can fail (e.g. no path from start to goal); count it as a failure.
            pass

    eval_end = time.perf_counter()
    eval_time = eval_end - eval_start
    success_rate = successes / num_evals
    return success_rate, eval_time
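# Example usage (a sketch, not part of the original script): sweep over goal
# difficulties and report the search policy's success rate at each one. The
# `search_policy` and `eval_env` objects are assumed to be constructed elsewhere.
# for difficulty in (0.3, 0.5, 0.7, 0.9):
#     set_env_difficulty(eval_env, difficulty)
#     success_rate, eval_time = eval_search_policy(search_policy, eval_env, num_evals=20)
#     print(f'difficulty = {difficulty}: success rate = {success_rate:.2f} ({eval_time:.1f}s)')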
def visualize_compare_search(agent, search_policy, eval_env, difficulty=0.5, seed=0):
    set_env_difficulty(eval_env, difficulty)

    plt.figure(figsize=(12, 5))
    for col_index in range(2):
        use_search = (col_index == 1)
        title = 'search' if use_search else 'no search'
        plt.subplot(1, 2, col_index + 1)
        plot_walls(eval_env.walls)

        # Reset the seeds so both policies face the same start and goal.
        set_global_seed(seed)
        set_env_seed(eval_env, seed + 1)

        policy = search_policy if use_search else agent
        goal, observations, waypoints, _ = Collector.get_trajectory(policy, eval_env)
        start = observations[0]
        obs_vec = np.array(observations)
        waypoint_vec = np.array(waypoints)

        print(f'policy: {title}')
        print(f'start: {start}')
        print(f'goal: {goal}')
        print(f'steps: {obs_vec.shape[0] - 1}')
        print('-' * 10)

        plt.plot(obs_vec[:, 0], obs_vec[:, 1], 'b-o', alpha=0.3)
        plt.scatter([start[0]], [start[1]], marker='+', color='red', s=200, label='start')
        plt.scatter([obs_vec[-1, 0]], [obs_vec[-1, 1]], marker='+', color='green', s=200, label='end')
        plt.scatter([goal[0]], [goal[1]], marker='*', color='green', s=200, label='goal')
        plt.title(title, fontsize=24)
        if use_search:
            plt.plot(waypoint_vec[:, 0], waypoint_vec[:, 1], 'y-s', alpha=0.3, label='waypoint')

    plt.legend(loc='lower left', bbox_to_anchor=(-0.8, -0.15), ncol=4, fontsize=16)
    plt.show()
def visualize_trajectory(agent, eval_env, difficulty=0.5):
    set_env_difficulty(eval_env, difficulty)

    plt.figure(figsize=(8, 4))
    for col_index in range(2):
        plt.subplot(1, 2, col_index + 1)
        plot_walls(eval_env.walls)

        goal, observations_list, _, _ = Collector.get_trajectory(agent, eval_env)
        obs_vec = np.array(observations_list)
        print(f'traj {col_index}, num steps: {len(obs_vec)}')

        plt.plot(obs_vec[:, 0], obs_vec[:, 1], 'b-o', alpha=0.3)
        plt.scatter([obs_vec[0, 0]], [obs_vec[0, 1]], marker='+', color='red', s=200, label='start')
        plt.scatter([obs_vec[-1, 0]], [obs_vec[-1, 1]], marker='+', color='green', s=200, label='end')
        plt.scatter([goal[0]], [goal[1]], marker='*', color='green', s=200, label='goal')
        if col_index == 0:
            plt.legend(loc='lower left', bbox_to_anchor=(0.3, 1), ncol=3, fontsize=16)
    plt.show()
def train_eval(
    policy,
    agent,
    replay_buffer,
    env,
    eval_env,
    num_iterations=int(1e6),
    initial_collect_steps=1000,
    collect_steps=1,
    opt_steps=1,
    batch_size_opt=64,
    eval_func=lambda agent, eval_env: None,
    num_eval_episodes=10,
    opt_log_interval=100,
    eval_interval=10000,
):
    collector = Collector(policy, replay_buffer, env, initial_collect_steps=initial_collect_steps)
    collector.step(collector.initial_collect_steps)  # seed the replay buffer before optimizing

    for i in range(1, num_iterations + 1):
        collector.step(collect_steps)

        agent.train()
        opt_info = agent.optimize(replay_buffer, iterations=opt_steps, batch_size=batch_size_opt)
        if i % opt_log_interval == 0:
            print(f'iteration = {i}, opt_info = {opt_info}')

        if i % eval_interval == 0:
            agent.eval()
            print(f'evaluating iteration = {i}')
            eval_func(agent, eval_env)
            print('-' * 10)
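# Example usage (a sketch, not part of the original script): train the
# goal-conditioned agent and periodically evaluate it on goals of known
# distance with eval_pointenv_dists. The `policy`, `agent`, `replay_buffer`,
# `env`, and `eval_env` objects are assumed to be constructed elsewhere.
# train_eval(
#     policy, agent, replay_buffer, env, eval_env,
#     num_iterations=int(1e5),
#     eval_func=eval_pointenv_dists,
#     eval_interval=10000,
# )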
    torch.save(agent.state_dict(), os.path.join(cfg.ckpt_dir, 'agent.pth'))
elif True:
    ckpt_file = os.path.join(cfg.ckpt_dir, 'agent.pth')
    agent.load_state_dict(torch.load(ckpt_file))
agent.eval()

# from pud.visualize import visualize_trajectory
# eval_env.duration = 100  # We'll give the agent lots of time to try to find the goal.
# visualize_trajectory(agent, eval_env, difficulty=0.5)

# We now implement the search policy, which automatically finds these waypoints
# via graph search. The first step is to fill the replay buffer with random data.
# from pud.collector import Collector
env.set_sample_goal_args(prob_constraint=0.0, min_dist=0, max_dist=np.inf)
rb_vec = Collector.sample_initial_states(eval_env, replay_buffer.max_size)
# from pud.visualize import visualize_buffer
# visualize_buffer(rb_vec, eval_env)

pdist = agent.get_pairwise_dist(rb_vec, aggregate=None)
# from scipy.spatial import distance
# euclidean_dists = distance.pdist(rb_vec)

# As a sanity check, we'll plot the pairwise distances between all observations
# in the replay buffer. We expect to see a range of values from 1 to 20.
# Distributional RL implicitly caps the maximum predicted distance at the
# largest bin. We've used 20 bins, so the critic predicts 20 for all states
# that are at least 20 steps away from one another.
# from pud.visualize import visualize_pairwise_dists
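# A minimal sketch of that sanity check, assuming `pdist` is an array-like of
# predicted distances (its exact shape depends on agent.get_pairwise_dist):
# flatten the predictions and histogram them, expecting mass between 1 and 20.
# plt.figure(figsize=(6, 3))
# plt.hist(np.asarray(pdist).flatten(), bins=range(1, 22), density=True)
# plt.xlabel('predicted distance')
# plt.ylabel('fraction of state pairs')
# plt.show()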