Example #1
File: runner.py Project: etaoxing/sgm-sorb
def take_cleanup_steps(search_policy, eval_env, num_cleanup_steps):
    set_env_difficulty(eval_env, 0.95)

    search_policy.set_cleanup(True)
    cleanup_start = time.perf_counter()
    # Collector.eval_agent(search_policy, eval_env, num_cleanup_steps, by_episode=False) # random goals in env
    Collector.step_cleanup(
        search_policy, eval_env,
        num_cleanup_steps)  # samples goals from nodes in state graph
    cleanup_end = time.perf_counter()
    search_policy.set_cleanup(False)
    cleanup_time = cleanup_end - cleanup_start
    return cleanup_time
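take_cleanup_steps just brackets the graph-cleanup phase with time.perf_counter() and returns the elapsed time. Below is a minimal, self-contained sketch of the same timing pattern, with a dummy function standing in for Collector.step_cleanup and the search policy (names are illustrative only):

import time

def dummy_cleanup(num_steps):
    # Stand-in for Collector.step_cleanup(search_policy, eval_env, num_steps).
    total = 0
    for i in range(num_steps):
        total += i
    return total

cleanup_start = time.perf_counter()
dummy_cleanup(100_000)
cleanup_time = time.perf_counter() - cleanup_start
print(f'cleanup took {cleanup_time:.4f}s')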
Example #2
File: runner.py Project: etaoxing/sgm-sorb
def eval_pointenv_dists(agent,
                        eval_env,
                        num_evals=10,
                        eval_distances=[2, 5, 10]):
    for dist in eval_distances:
        eval_env.set_sample_goal_args(
            prob_constraint=1, min_dist=dist, max_dist=dist
        )  # NOTE: samples goal distances in [min_dist, max_dist] closed interval
        returns = Collector.eval_agent(agent, eval_env, num_evals)
        # For debugging, it's helpful to check the predicted distances for
        # goals of known distance.
        states = dict(observation=[], goal=[])
        for _ in range(num_evals):
            state = eval_env.reset()
            states['observation'].append(state['observation'])
            states['goal'].append(state['goal'])
        pred_dist = list(agent.get_dist_to_goal(states))

        print(f'\tset goal dist = {dist}')
        print(f'\t\treturns = {returns}')
        print(f'\t\tpredicted_dists = {pred_dist}')
        print(f'\t\taverage return = {np.mean(returns)}')
        print(
            f'\t\taverage predicted_dist = {np.mean(pred_dist):.1f} ({np.std(pred_dist):.2f})'
        )
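The debugging idea in eval_pointenv_dists is to sample goals at a known distance and check that agent.get_dist_to_goal predicts roughly that value. The same check can be sketched on synthetic numbers alone; no environment or agent is involved, and the "predicted" distances below are just noisy copies of the true distance:

import numpy as np

rng = np.random.default_rng(0)
for dist in [2, 5, 10]:
    # Pretend the agent's predicted distances are the true distance plus noise.
    pred_dist = dist + rng.normal(scale=0.5, size=10)
    print(f'set goal dist = {dist}, '
          f'average predicted_dist = {np.mean(pred_dist):.1f} ({np.std(pred_dist):.2f})')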
Example #3
def visualize_search_path(search_policy, eval_env, difficulty=0.5):
    set_env_difficulty(eval_env, difficulty)

    if search_policy.open_loop:
        state = eval_env.reset()
        start = state['observation']
        goal = state['goal']

        search_policy.select_action(state)
        waypoints = search_policy.get_waypoints()
    else:
        goal, observations, waypoints, _ = Collector.get_trajectory(search_policy, eval_env)
        start = observations[0]

    plt.figure(figsize=(6, 6))
    plot_walls(eval_env.walls)

    waypoint_vec = np.array(waypoints)

    print(f'waypoints: {waypoint_vec}')
    print(f'waypoints shape: {waypoint_vec.shape}')
    print(f'start: {start}')
    print(f'goal: {goal}')

    plt.scatter([start[0]], [start[1]], marker='+',
                color='red', s=200, label='start')
    plt.scatter([goal[0]], [goal[1]], marker='*',
                color='green', s=200, label='goal')
    plt.plot(waypoint_vec[:, 0], waypoint_vec[:, 1], 'y-s', alpha=0.3, label='waypoint')
    plt.legend(loc='lower left', bbox_to_anchor=(-0.1, -0.15), ncol=4, fontsize=16)
    plt.show()
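The plotting calls above only need (x, y) arrays, so the start/goal/waypoint figure can be reproduced with made-up coordinates, without plot_walls, the environment, or the search policy. A self-contained sketch:

import numpy as np
import matplotlib.pyplot as plt

start = np.array([0.1, 0.1])   # made-up start state
goal = np.array([0.9, 0.9])    # made-up goal
waypoint_vec = np.array([[0.3, 0.2], [0.5, 0.5], [0.7, 0.8]])  # made-up waypoints

plt.figure(figsize=(6, 6))
plt.scatter([start[0]], [start[1]], marker='+', color='red', s=200, label='start')
plt.scatter([goal[0]], [goal[1]], marker='*', color='green', s=200, label='goal')
plt.plot(waypoint_vec[:, 0], waypoint_vec[:, 1], 'y-s', alpha=0.3, label='waypoint')
plt.legend(loc='lower left', bbox_to_anchor=(-0.1, -0.15), ncol=4, fontsize=16)
plt.show()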
Example #4
File: runner.py Project: etaoxing/sgm-sorb
def eval_search_policy(search_policy, eval_env, num_evals=10):
    eval_start = time.perf_counter()

    successes = 0.
    for _ in range(num_evals):
        try:
            _, _, _, ep_reward_list = Collector.get_trajectory(
                search_policy, eval_env)
            # Count the episode as a success if it ended before the time limit.
            successes += int(len(ep_reward_list) < eval_env.duration)
        except Exception:
            # A failed rollout counts as an unsuccessful episode.
            pass

    eval_end = time.perf_counter()
    eval_time = eval_end - eval_start
    success_rate = successes / num_evals
    return success_rate, eval_time
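Success here means the episode ended before eval_env.duration steps, i.e. the agent reached the goal before the time limit. The same bookkeeping on made-up episode lengths (the duration and lengths below are assumptions for illustration only):

duration = 100                            # assumed time limit, like eval_env.duration
episode_lengths = [40, 100, 63, 100, 12]  # made-up rollout lengths
successes = sum(int(length < duration) for length in episode_lengths)
success_rate = successes / len(episode_lengths)
print(f'success rate = {success_rate:.2f}')  # 3 of 5 episodes ended early -> 0.60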
Example #5
def visualize_compare_search(agent, search_policy, eval_env, difficulty=0.5, seed=0):
    set_env_difficulty(eval_env, difficulty)

    plt.figure(figsize=(12, 5))
    for col_index in range(2):
        title = 'no search' if col_index == 0 else 'search'
        plt.subplot(1, 2, col_index + 1)
        plot_walls(eval_env.walls)
        use_search = (col_index == 1)

        set_global_seed(seed)
        set_env_seed(eval_env, seed + 1)

        if use_search:
            policy = search_policy
        else:
            policy = agent
        goal, observations, waypoints, _ = Collector.get_trajectory(policy, eval_env)
        start = observations[0]

        obs_vec = np.array(observations)
        waypoint_vec = np.array(waypoints)

        print(f'policy: {title}')
        print(f'start: {start}')
        print(f'goal: {goal}')
        print(f'steps: {obs_vec.shape[0] - 1}')
        print('-' * 10)

        plt.plot(obs_vec[:, 0], obs_vec[:, 1], 'b-o', alpha=0.3)
        plt.scatter([start[0]], [start[1]], marker='+',
                    color='red', s=200, label='start')
        plt.scatter([obs_vec[-1, 0]], [obs_vec[-1, 1]], marker='+',
                    color='green', s=200, label='end')
        plt.scatter([goal[0]], [goal[1]], marker='*',
                    color='green', s=200, label='goal')
        plt.title(title, fontsize=24)

        if use_search:
            plt.plot(waypoint_vec[:, 0], waypoint_vec[:, 1], 'y-s', alpha=0.3, label='waypoint')
            plt.legend(loc='lower left', bbox_to_anchor=(-0.8, -0.15), ncol=4, fontsize=16)
    plt.show()
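The detail that makes the comparison fair is re-seeding before each column, so the "no search" and "search" rollouts face the same start and goal. A minimal sketch of that idea using only NumPy, with sample_task standing in for eval_env.reset and the seeding helpers:

import numpy as np

def sample_task(rng):
    # Stand-in for eval_env.reset(): a random (start, goal) pair.
    return rng.uniform(size=2), rng.uniform(size=2)

for title in ['no search', 'search']:
    rng = np.random.default_rng(0)   # re-seed before each policy
    start, goal = sample_task(rng)
    print(title, start, goal)        # both policies get the identical task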
Example #6
def visualize_trajectory(agent, eval_env, difficulty=0.5):
    set_env_difficulty(eval_env, difficulty)

    plt.figure(figsize=(8, 4))
    for col_index in range(2):
        plt.subplot(1, 2, col_index + 1)
        plot_walls(eval_env.walls)
        goal, observations_list, _, _ = Collector.get_trajectory(agent, eval_env)
        obs_vec = np.array(observations_list)

        print(f'traj {col_index}, num steps: {len(obs_vec)}')

        plt.plot(obs_vec[:, 0], obs_vec[:, 1], 'b-o', alpha=0.3)
        plt.scatter([obs_vec[0, 0]], [obs_vec[0, 1]], marker='+',
                    color='red', s=200, label='start')
        plt.scatter([obs_vec[-1, 0]], [obs_vec[-1, 1]], marker='+',
                    color='green', s=200, label='end')
        plt.scatter([goal[0]], [goal[1]], marker='*',
                    color='green', s=200, label='goal')
        if col_index == 0:
            plt.legend(loc='lower left', bbox_to_anchor=(0.3, 1), ncol=3, fontsize=16)
    plt.show()
Example #7
File: runner.py Project: etaoxing/sgm-sorb
def train_eval(
    policy,
    agent,
    replay_buffer,
    env,
    eval_env,
    num_iterations=int(1e6),
    initial_collect_steps=1000,
    collect_steps=1,
    opt_steps=1,
    batch_size_opt=64,
    eval_func=lambda agent, eval_env: None,
    num_eval_episodes=10,
    opt_log_interval=100,
    eval_interval=10000,
):
    collector = Collector(policy,
                          replay_buffer,
                          env,
                          initial_collect_steps=initial_collect_steps)
    collector.step(collector.initial_collect_steps)
    for i in range(1, num_iterations + 1):
        collector.step(collect_steps)
        agent.train()
        opt_info = agent.optimize(replay_buffer,
                                  iterations=opt_steps,
                                  batch_size=batch_size_opt)

        if i % opt_log_interval == 0:
            print(f'iteration = {i}, opt_info = {opt_info}')

        if i % eval_interval == 0:
            agent.eval()
            print(f'evaluating iteration = {i}')
            eval_func(agent, eval_env)
            print('-' * 10)
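train_eval interleaves collection, optimization, and periodic logging/evaluation. The control flow can be seen in isolation with a hypothetical DummyAgent standing in for the real agent and collector (a sketch, not the pud API):

class DummyAgent:
    def train(self): pass
    def eval(self): pass
    def optimize(self, replay_buffer, iterations=1, batch_size=64):
        return {'loss': 0.0}  # pretend optimization info

def train_eval_sketch(num_iterations=300, opt_log_interval=100, eval_interval=150):
    agent = DummyAgent()
    for i in range(1, num_iterations + 1):
        # collector.step(collect_steps) would run here in the real loop
        agent.train()
        opt_info = agent.optimize(None)
        if i % opt_log_interval == 0:
            print(f'iteration = {i}, opt_info = {opt_info}')
        if i % eval_interval == 0:
            agent.eval()
            print(f'evaluating iteration = {i}')

train_eval_sketch()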
Example #8
    torch.save(agent.state_dict(), os.path.join(cfg.ckpt_dir, 'agent.pth'))
elif True:
    ckpt_file = os.path.join(cfg.ckpt_dir, 'agent.pth')
    agent.load_state_dict(torch.load(ckpt_file))
    agent.eval()

    # from pud.visualize import visualize_trajectory
    # eval_env.duration = 100 # We'll give the agent lots of time to try to find the goal.
    # visualize_trajectory(agent, eval_env, difficulty=0.5)

    # We will now implement the search policy, which automatically finds these waypoints via graph search.
    # The first step is to fill the replay buffer with random data.
    #
    from pud.collector import Collector
    env.set_sample_goal_args(prob_constraint=0.0, min_dist=0, max_dist=np.inf)
    rb_vec = Collector.sample_initial_states(eval_env, replay_buffer.max_size)

    # from pud.visualize import visualize_buffer
    # visualize_buffer(rb_vec, eval_env)

    pdist = agent.get_pairwise_dist(rb_vec, aggregate=None)
    # from scipy.spatial import distance
    # euclidean_dists = distance.pdist(rb_vec)

    # As a sanity check, we'll plot the pairwise distances between all 
    # observations in the replay buffer. We expect to see a range of values 
    # from 1 to 20. Distributional RL implicitly caps the maximum predicted 
    # distance by the largest bin. We've used 20 bins, so the critic 
    # predicts 20 for all states that are at least 20 steps away from one another.
    # 
    # from pud.visualize import visualize_pairwise_dists
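As a stand-alone version of that sanity check, the sketch below computes Euclidean pairwise distances over random 2-D points with scipy.spatial.distance.pdist and clips them at 20, mimicking how a 20-bin distributional critic caps its predicted distances. The data is purely synthetic, not the project's replay buffer:

import numpy as np
from scipy.spatial import distance

rng = np.random.default_rng(0)
rb_vec = rng.uniform(low=0.0, high=20.0, size=(256, 2))  # fake replay-buffer states

euclidean_dists = distance.pdist(rb_vec)           # condensed pairwise distances
capped_dists = np.minimum(euclidean_dists, 20.0)   # a 20-bin critic cannot predict more than 20

print('min/max euclidean distance:', euclidean_dists.min(), euclidean_dists.max())
print('fraction at the 20-step cap:', np.mean(capped_dists == 20.0))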