Example #1
def evaluate_inferred_reward(reward_irl, inferred_rewards, image_irl, start_states_irl, horizon):
    """
    Computes 1 - %regret for each (reward, inferred_reward) pair. Regret is the amount
    of reward lost by planning with the inferred reward rather than with the true reward.

    reward_irl (list): list of true rewards (n, imsize, imsize)
    inferred_rewards (list): list of inferred rewards (n, imsize, imsize)
    image_irl (list): list of walls for each grid (n, imsize, imsize)
    start_states_irl (list): list of start states (n, 2), each entry of the form [x, y]
    horizon (integer): number of steps used to evaluate the inferred reward
    Returns the average 1 - %regret across all grids.
    """
    reward_percents = []
    for i, (label, reward, wall, start_state) in enumerate(
            zip(reward_irl, inferred_rewards, image_irl, start_states_irl)):
        if i < 10:
            plot_reward(label, reward, wall, 'reward_pics/reward_{}'.format(i))
        percent = evaluate_proxy(wall, start_state, reward, label, episode_length=horizon)
        print("Reward had: {}".format(percent))
        reward_percents.append(percent)

    average_percent_reward = float(sum(reward_percents)) / len(reward_percents)
    print(reward_percents[:10])
    print('On average planning with the inferred rewards is '
          + str(100 * average_percent_reward)
          + '% as good as planning with the true rewards')
    return average_percent_reward
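# Illustrative sketch, not from the original code: evaluate_proxy is assumed to return
# the return obtained by planning with the inferred reward as a fraction of the return
# obtained by planning with the true reward, i.e. 1 - %regret.
def _one_minus_regret(return_with_inferred_plan, return_with_true_plan):
    # Hypothetical helper: _one_minus_regret(8.0, 10.0) == 0.8, i.e. 20% regret.
    return return_with_inferred_plan / return_with_true_plan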
def main(_):
    env_name = XMAGICAL_EMBODIMENT_TO_ENV_NAME[FLAGS.embodiment]
    env = utils.make_env(env_name, seed=0)

    # Reward learning wrapper.
    if FLAGS.config.reward_wrapper.pretrained_path is not None:
        env = utils.wrap_learned_reward(env, FLAGS.config)

    viewer = KeyboardEnvInteractor(action_dim=env.action_space.shape[0])

    env.reset()
    obs = env.render("rgb_array")
    viewer.imshow(obs)

    i = [0]
    rews = []
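    # Note: i is a single-element list so the nested step() closure below can increment
    # the counter in place; rews collects the per-step rewards plotted at the end.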

    def step(action):
        obs, rew, done, info = env.step(action)
        rews.append(rew)
        if obs.ndim != 3:
            obs = env.render("rgb_array")
        if done:
            print(f"Done, score {info['eval_score']:.2f}/1.00")
            print("Episode metrics: ")
            for k, v in info["episode"].items():
                print(f"\t{k}: {v}")
            if FLAGS.exit_on_done:
                return
        i[0] += 1
        return obs

    viewer.run_loop(step)

    utils.plot_reward(rews)
def dqfd_test(exp, agent):
    out_filename_format = '_imagetest/episode_{:0>4d}/{:s}/{:0>6d}'

    overall_reward = []

    for i_episode in range(config.TEST_EPISODE):
        print("Environment reset...")
        exp.reset()
        state = None
        meas = None
        next_state = None
        next_meas = None
        offroad_list = []
        otherlane_list = []
        episode_reward = 0

        # transition_queue = collections.deque(maxlen=config.TRAJECTORY_NUM)
        for steps in range(config.REPLAY_FRAME):
            print("Test episode: %d, frame: %d , length of replaymemory %d" %
                  (i_episode, steps, len(agent.replay_memory)))
            action_no = agent.e_greedy_select_action(state)
            action = exp.reverse_action(action_no)
            next_meas, next_state, reward, done, _ = exp.step(action)
            next_state = utils.rgb_image_to_tensor(next_state['CameraRGB'])
            offroad_list.append(next_meas['offroad'])
            otherlane_list.append(next_meas['other_lane'])
            episode_reward += reward

            # Reset the environment if the car has stayed offroad for the last 5 steps
            # or in the other lane for the last 10 steps
            if len(offroad_list) > 10:
                ar = np.array(offroad_list[-5:]).astype('int64')
                tag1 = np.bitwise_and.reduce(ar)
                br = np.array(otherlane_list[-10:]).astype('int64')
                tag2 = np.bitwise_and.reduce(br)
                tag = tag1 | tag2
                if tag:
                    print("Reset because of off road or intersection!")
                    break

            for name, images in exp.cur_image.items():
                filename = out_filename_format.format(i_episode, name, steps)
                images.save_to_disk(filename)

            state = next_state
            meas = next_meas

        overall_reward.append(episode_reward / steps)
        utils.plot_reward(overall_reward)
        print("Episode finished!")

    print("Test finished!")

    return overall_reward
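# Illustrative sketch, not from the original example: np.bitwise_and.reduce over a
# 0/1 window equals 1 only when every entry is 1, which is how the check in dqfd_test
# detects that the offroad / other_lane flag held for the entire window.
import numpy as np
assert np.bitwise_and.reduce(np.array([1, 1, 1, 1, 1], dtype='int64')) == 1  # reset triggered
assert np.bitwise_and.reduce(np.array([1, 0, 1, 1, 1], dtype='int64')) == 0  # no reset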
def show_agents(grids,
                agent_list,
                agent_names,
                grid_names,
                filename='AgentComparison',
                figtitle=''):
    """Shows how agents perform on a gridworld

    grid - list of gridworlds (see examples in earlier part of file)
    agent_list - list of agent (objects)
    agent_names - names of agents (strings)
    """
    num_ex = len(agent_list)
    num_grids = len(grids)
    fig, axes_grid = plt.subplots(num_grids, num_ex, figsize=(14.0, 4.5))

    if num_grids == 1:
        axes_grid = [axes_grid]

    for i, axes in enumerate(axes_grid):
        # Give each gridworld a name (uncomment to do so)
        # axes[0].set_ylabel(grid_names[i])
        # Generate MDP
        grid = grids[i]
        mdp = GridworldMdp(grid, noise=0.2)
        walls, reward, start = mdp.convert_to_numpy_input()

        for idx, agent in enumerate(agent_list):
            ax = axes[idx]
            ax.set_aspect('equal')

            plot_reward(reward, walls, '', fig=fig, ax=ax)
            plot_trajectory(walls,
                            reward,
                            start,
                            agent,
                            arrow_width=0.35,
                            fig=fig,
                            ax=ax)
            # Only write Agent names if it's the first row
            if i == 0:
                ax.set_title(agent_names[idx],
                             fontname='Times New Roman',
                             fontsize=16)

            print('Agent {} is {}'.format(agent_names[idx], agent))

    # Increase vertical space between subplots
    # fig.subplots_adjust(hspace=0.2)
    # fig.suptitle(figtitle)
    fig.savefig(filename, bbox_inches='tight', dpi=500)
    print("Saved figure to {}.png".format(filename))
def random_gridworld_plot(agent, other_agent, size, filename='RandomGrid'):
    """Plots random gridworld"""
    from gridworld.gridworld import Direction
    from utils import Distribution
    if agent is None:
        raise ValueError("agent cannot be None")

    num_R = 5
    mdp = GridworldMdp.generate_random_connected(size, size, num_R, noise=0)

    walls, reward, start = mdp.convert_to_numpy_input()

    def get_policy(agent):
        num_actions = 5
        imsize = len(walls)

        def dist_to_numpy(dist):
            return dist.as_numpy_array(Direction.get_number_from_direction,
                                       num_actions)

        def action(state):
            # Walls are invalid states and the MDP will refuse to give an action for
            # them. However, the VIN's architecture requires it to provide an action
            # distribution for walls too, so hardcode it to always be STAY.
            x, y = state
            if mdp.walls[y][x]:
                return dist_to_numpy(Distribution({Direction.STAY: 1}))
            return dist_to_numpy(agent.get_action_distribution(state))

        agent.set_mdp(mdp)
        action_dists = [[action((x, y)) for x in range(imsize)]
                        for y in range(imsize)]
        action_dists = np.array(action_dists)
        return action_dists

    fig, axes = plt.subplots(1, 1)
    fig.set_size_inches(5, 5)

    # Reward only
    plot_reward(reward, np.zeros_like(walls), fig=fig, ax=axes, ax_title='')
    fig.savefig(filename + 'R', bbox_inches='tight', dpi=100)

    # Walls only
    plot_reward(np.zeros_like(reward), walls, fig=fig, ax=axes, ax_title='')
    fig.savefig(filename + 'W', bbox_inches='tight', dpi=100)

    # Trajectory + Walls + Rewards
    plot_reward(reward, walls, fig=fig, ax=axes, ax_title='')
    # plot_trajectory(walls, reward, start, agent, fig=fig, ax=axes)
    policy = get_policy(agent)
    plot_policy(walls, policy, fig=fig, ax=axes)
    fig.savefig(filename + 'Ptrue', bbox_inches='tight', dpi=100)

    axes.clear()
    plot_reward(reward, walls, fig=fig, ax=axes, ax_title='')
    predicted = get_policy(other_agent)
    plot_policy_diff(predicted, policy, walls, fig=fig, ax=axes)
    fig.savefig(filename + 'Ppredicted', bbox_inches='tight', dpi=100)
Example #6
    accumulated_reward += state.reward
    accumulated_reward_list.append(accumulated_reward)

    if opt.disp_on:
        if win_all is None:
            plt.subplot(121)
            win_all = plt.imshow(state.screen)
            plt.subplot(122)
            win_pob = plt.imshow(state.pob)
        else:
            win_all.set_data(state.screen)
            win_pob.set_data(state.pob)
        plt.pause(opt.disp_interval)
        plt.draw()
    epi_step = epi_step + 1

    if step % 100 == 0:
        print("Step {}: Accumulated reward = {}".format(step,
                                                        accumulated_reward))

elapsed_time = time() - start_time
print("----- Finished testing -----")
print("Completed {} games, reached targt {} times".format(
    opt.steps, n_completed_episodes))
print("{}% of games the target was reached".format(n_completed_episodes /
                                                   opt.steps))
print("Total time        : {:.2f} seconds".format(elapsed_time))
print("Accumulated reward: ", accumulated_reward_list[-1])

plot_reward(accumulated_reward_list, opt.acc_reward_test_figure_path)
Example #7
            test(env, likelihood)
            env.adapt_a()

            likelihood.reinitialize_optimizer(lr=1e-2)
            l_a2b, l_b2a = control.train_with_buffer(
                env, likelihood, hparam['nb_episode_adapt'])
            likelihood_a2b_adapt[run, :] = l_a2b
            likelihood_b2a_adapt[run, :] = l_b2a

    utils.plot_training(likelihood_a2b, likelihood_b2a, hparam['output'], True)
    utils.plot_adaptation(likelihood_a2b_adapt, likelihood_b2a_adapt,
                          hparam['output'], True)

    if rl_mode:
        reward_a2b = np.cumsum(reward_a2b, axis=1)
        reward_b2a = np.cumsum(reward_b2a, axis=1)
        reward_a2b_adapt = np.cumsum(reward_a2b_adapt, axis=1)
        reward_b2a_adapt = np.cumsum(reward_b2a_adapt, axis=1)
        utils.plot_reward(reward_a2b, reward_b2a, hparam['output'], True)
        utils.plot_reward_adapt(reward_a2b_adapt, reward_b2a_adapt,
                                hparam['output'], True)


def test(env, likelihood):
    env.compare_directions(likelihood)
    env.compare_likelihood(likelihood, 0, 0, 0, 0, 0)
    env.compare_likelihood(likelihood, 1, 0, 0, 0, 0)
    env.compare_likelihood(likelihood, 2, 0, 0, 0, 0)
    env.compare_likelihood(likelihood, 3, 0, 0, 0, 0)
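    # Drop into the ipdb debugger so the comparisons above can be inspected interactively.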
    __import__('ipdb').set_trace()
Example #8
    def run_game(self):

        config = self.config
        n = config.runs_per_agent
        prev_best_reward = -1000

        for run in range(n):

            # potentially, we can change the goals as agent picks up more skills
            env = eval(config.environment)
            test_env = eval(config.environment)
            cLoss, aLoss = [], []

            # 0. instantiate an agent instance of this class
            agent = self.agentCls(**self.agentArgs)
            obs_dim, act_dim = agent.obs_dim, agent.act_dim

            # 1. instantiate a memory pool and warm up
            rpm = ReplayMemory(config.memory_size, obs_dim, act_dim)

            # 2. set up logging file
            save_dir = config.log_path + "{}_{}".format(self.name, run + 1)
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)

            # 3. start training
            test_flag, total_steps = 0, 0
            train_rewards, test_means, test_stds = [], [], []
            pbar = tqdm(total=config.train_total_steps)
            while total_steps < config.train_total_steps:

                para = [
                    config.reward_scale, config.warmup_size, config.batch_size,
                    config.expl_noise
                ]
                train_reward, steps, costC, costA = run_train_episode(
                    env, agent, rpm, *para)

                total_steps += steps
                train_rewards.append(train_reward)
                cLoss.append(costC)
                aLoss.append(costA)

                pbar.set_description('Steps: {} Reward: {}'.format(
                    total_steps, train_reward))
                pbar.update(steps)

                # 4. start testing
                if total_steps // config.test_every_steps >= test_flag:
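                    # Advance test_flag past every test interval crossed during the last
                    # training episode, then run a single evaluation below.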
                    while total_steps // config.test_every_steps >= test_flag:
                        test_flag += 1
                    r_mean, r_std = run_evaluate_episode(test_env, agent)
                    logger.info('Steps {}, Evaluate reward: {}'.format(
                        total_steps, r_mean))
                    test_means.append(r_mean)
                    test_stds.append(r_std)
                    if config.save_model and r_mean > prev_best_reward:
                        prev_best_reward = r_mean
                        ckpt = save_dir + '/Steps_{}_reward_{}.ckpt'.format(
                            total_steps, int(r_mean))
                        agent.save(ckpt, program=agent.pred_program)
                    np.savez(save_dir + '/record.npz',
                             train=train_rewards,
                             mean=test_means,
                             std=test_stds,
                             closs=cLoss,
                             aloss=aLoss)
            if config.visual_result:
                plot_reward(train_rewards)
                plot_reward(test_means, test_stds)
Example #9
                            '-------------------------------------------------'
                        )
                        subtask_list.append(subtask_index)
                        subtask_reward.append(acc_reward)
                        task_r += acc_reward
                if render:
                    task[subtask_index].close()

        task_reward.append(task_r)
        time_task = time.time() - time_task_0

        if task_epi % 1 == 0:
            # record the reward
            plot_reward(subtask_list,
                        subtask_reward,
                        name=model.name + '_' +
                        'subtask_reward_CartPole_SwingUp_' + str(task_epi),
                        xlabel='subtask',
                        y=195)
            plot_reward(range(len(task_reward)),
                        task_reward,
                        name=model.name + '_' +
                        'task_reward_CartPole_SwingUp_' + str(task_epi),
                        xlabel='episode',
                        y=1560,
                        scatter=False)

            print('***************************')
            print('task_episode: ', task_epi, ' time: ', time_task)
            if model.name == 'DPGPMM':
                numbers = []
                for comp in model.DP_mix.comps:
Example #10
                'optimizer_actor_state_dict':
                agent.optimizer_actor.state_dict(),
                'optimizer_critic_state_dict':
                agent.optimizer_critic.state_dict()
            }, 'agent_state_dict.pt')

        if np.mean(scores_window) >= 32:
            print("\n Problem Solved!")
            break

print("Score: {}".format(score))

#Plotting Rewards
plot_reward(scores)

############################## TESTING AGENT ########################################################
#####################################################################################################

# Loading saved parameters to test agent over 100 trials
checkpoint = torch.load('agent_state_dict.pt', map_location="cpu")
agent.actor_main.load_state_dict(checkpoint['actor_main_network_state_dict'])
agent.critic_main.load_state_dict(checkpoint['critic_main_network_state_dict'])


def test(num_episodes=100):

    all_scores = []

    from tqdm import tqdm
Example #11
    config = init_birl_flags()
    if config.datafile is None:
        print('--datafile option is required')
        exit()

    # seed random generators
    set_seeds(config.seed)

    imagetest, rewardtest, ytest = load_dataset(config.datafile)[-3:]
    for image, reward, policy in zip(imagetest, rewardtest, ytest):
        mdp = GridworldMdp.from_numpy_input(image, reward)
        mdp = GridworldMdpLearnableR.from_full_mdp(mdp)
        inferred_reward = birl(mdp,
                               policy,
                               config.beta,
                               num_burn_in=config.num_burn_in,
                               num_samples=config.num_samples,
                               display_step=config.display_step)

        print('The first set of walls is:')
        print(image)
        print('The first reward should be:')
        print(reward)
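        # Rescale the inferred reward so its maximum is 1, putting it on a scale
        # comparable to the true reward before printing and plotting.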
        inferred_reward = inferred_reward / inferred_reward.max()
        inferred_reward = np.reshape(inferred_reward, image.shape)
        print('The inferred reward is:')
        print(inferred_reward)

        plot_reward(reward, inferred_reward)
        break
def dqfd_replay(exp, agent):

    overall_reward = []

    for i_episode in range(config.REPLAY_EPISODE):
        print("Environment reset...")
        exp.reset()
        state = None
        meas = None
        next_state = None
        next_meas = None
        offroad_list = []
        otherlane_list = []
        episode_reward = 0

        # transition_queue = collections.deque(maxlen=config.TRAJECTORY_NUM)
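        # steps starts at DEMO_BUFFER_SIZE (presumably the number of demonstration
        # transitions already in memory), so frame_no below counts new frames from 0.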
        for steps in itertools.count(config.DEMO_BUFFER_SIZE):
            frame_no = steps - config.DEMO_BUFFER_SIZE
            print("Replay episode: %d, frame: %d , length of replaymemory %d" %
                  (i_episode, frame_no, len(agent.replay_memory)))
            action_no = agent.e_greedy_select_action(state)
            action = exp.reverse_action(action_no)
            next_meas, next_state, reward, done, _ = exp.step(action)
            next_state = utils.rgb_image_to_tensor(next_state['CameraRGB'])
            offroad_list.append(next_meas['offroad'])
            otherlane_list.append(next_meas['other_lane'])
            episode_reward += reward

            # Reset the environment if the car has stayed offroad for the last 5 steps
            # or in the other lane for the last 10 steps
            if len(offroad_list) > 10:
                ar = np.array(offroad_list[-5:]).astype('int64')
                tag1 = np.bitwise_and.reduce(ar)
                br = np.array(otherlane_list[-10:]).astype('int64')
                tag2 = np.bitwise_and.reduce(br)
                tag = tag1 | tag2
                if tag:
                    exp.reset()

            if meas:
                transition = Transition(
                    meas, state, torch.tensor([[action_no]]),
                    torch.tensor([[reward]]), next_state, next_meas,
                    torch.zeros(1)
                )  # TODO: use both the measurement and the image later
                agent.replay_memory_push([transition])

            state = next_state
            meas = next_meas

            if agent.replay_memory.is_full:
                print("Trainning!")
                agent.train()
            #
            # if done:
            #     print("episode: %d, memory length: %d  epsilon: %f" % (i_episode, len(agent.replay_memory), agent.epsilon))
            #     break
            if steps % 100 == 0:
                agent.update_target_net()

            if frame_no >= config.REPLAY_FRAME:
                overall_reward.append(episode_reward / config.REPLAY_FRAME)
                utils.plot_reward(overall_reward)
                print("Episode finished!")
                break

        # Save the results every 20 episodes
        if i_episode % 20 == 0:
            print("Saving prameters for the last 20 episodes")
            reward_df = pd.DataFrame(overall_reward)
            reward_df.to_csv('_episode_reward%d.csv' % i_episode)
            with open(config.CARLA_TRAIN_FILE, 'wb') as f:
                pickle.dump(agent, f)
                print("Trained parameters achevied!")

    print("Replay finished!")

    return overall_reward