def policyEval(envs,
               model_path,
               log_dir,
               algo_class,
               algo_args,
               num_timesteps=251,
               num_cpu=1):
    """
    evaluation for the policy in the given envs
    :param envs: the environment we want to evaluate
    :param model_path: (str)the path to the policy ckp
    :param log_dir: (str) the path from a gym temporal file
    :param algo_class:
    :param algo_args:
    :param num_timesteps: (int) numbers of the timesteps we want to evaluate the policy
    :param num_cpu:
    :return:
    """
    tf.reset_default_graph()

    method = algo_class.load(model_path, args=algo_args)

    # detect the custom (single) vec env by checking for the WrapFrameStack wrapper
    using_custom_vec_env = isinstance(envs, WrapFrameStack)

    obs = envs.reset()

    if using_custom_vec_env:
        obs = obs.reshape((1, ) + obs.shape)
    n_done = 0
    last_n_done = 0
    episode_reward = []
    dones = [False for _ in range(num_cpu)]

    for i in range(num_timesteps):
        actions = method.getAction(obs, dones)
        obs, rewards, dones, _ = envs.step(actions)
        if using_custom_vec_env:
            obs = obs.reshape((1, ) + obs.shape)
            # single env: reset manually when the episode is over
            if dones:
                obs = envs.reset()
                obs = obs.reshape((1, ) + obs.shape)

        n_done += np.sum(dones)
        if (n_done - last_n_done) > 1:
            last_n_done = n_done
            _, mean_reward = computeMeanReward(log_dir, n_done)
            episode_reward.append(mean_reward)
            printRed('Episode:{} Reward:{}'.format(n_done, mean_reward))
    _, mean_reward = computeMeanReward(log_dir, n_done)
    printRed('Episode:{} Reward:{}'.format(n_done, mean_reward))

    episode_reward.append(mean_reward)

    episode_reward = np.array(episode_reward)
    envs.close()
    return episode_reward
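

# A minimal, self-contained sketch of the reshape trick used in policyEval for the
# single-env WrapFrameStack case: a lone observation gets a leading batch dimension of 1
# so that it matches the (n_envs, ...) layout the policy expects. The (84, 84, 4) shape
# below is purely illustrative and not taken from this project.
import numpy as np

single_obs = np.zeros((84, 84, 4))                  # e.g. one stacked-frame observation
batched_obs = single_obs.reshape((1,) + single_obs.shape)
assert batched_obs.shape == (1, 84, 84, 4)          # now shaped like a batch of one env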
def callback(_locals, _globals):
    """
    Callback called at each step (for DQN an others) or after n steps (see ACER or PPO2)
    :param _locals: (dict)
    :param _globals: (dict)
    """
    global win, win_smooth, win_episodes, n_steps, viz, params_saved, best_mean_reward
    # Create the Visdom object only if needed
    if viz is None:
        viz = Visdom(port=VISDOM_PORT)

    is_es = registered_rl[ALGO_NAME][1] == AlgoType.EVOLUTION_STRATEGIES

    # Save RL agent parameters
    if not params_saved:
        # Filter locals
        params = filterJSONSerializableObjects(_locals)
        with open(LOG_DIR + "rl_locals.json", "w") as f:
            json.dump(params, f)
        params_saved = True

    # Save the RL model if it has improved
    if (n_steps + 1) % SAVE_INTERVAL == 0:
        # Evaluate network performance
        ok, mean_reward = computeMeanReward(LOG_DIR, N_EPISODES_EVAL, is_es=is_es, return_n_episodes=True)
        if ok:
            # Unpack mean reward and number of episodes
            mean_reward, n_episodes = mean_reward
            print(
                "Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(best_mean_reward, mean_reward))
        else:
            # Not enough episodes recorded yet
            mean_reward = -10000
            n_episodes = 0

        # Save Best model
        if mean_reward > best_mean_reward and n_episodes >= MIN_EPISODES_BEFORE_SAVE:
            # Try saving the running average (only valid for mlp policy)
            try:
                if 'env' in _locals:
                    _locals['env'].save_running_average(LOG_DIR)
                else:
                    _locals['self'].env.save_running_average(LOG_DIR)
            except AttributeError:
                pass

            best_mean_reward = mean_reward
            printGreen("Saving new best model")
            ALGO.save(LOG_DIR + ALGO_NAME + "_model.pkl", _locals)

    # Plots in visdom
    if viz and (n_steps + 1) % LOG_INTERVAL == 0:
        win = timestepsPlot(viz, win, LOG_DIR, ENV_NAME, ALGO_NAME, bin_size=1, smooth=0, title=PLOT_TITLE, is_es=is_es)
        win_smooth = timestepsPlot(viz, win_smooth, LOG_DIR, ENV_NAME, ALGO_NAME, title=PLOT_TITLE + " smoothed",
                                   is_es=is_es)
        win_episodes = episodePlot(viz, win_episodes, LOG_DIR, ENV_NAME, ALGO_NAME, window=EPISODE_WINDOW,
                                   title=PLOT_TITLE + " [Episodes]", is_es=is_es)
    n_steps += 1
    return True
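

# A minimal sketch (not from this project's training script) of how a callback with the
# (_locals, _globals) signature above is typically wired into a stable_baselines-style
# training loop; the PPO2/CartPole choice is an assumption for illustration only, and the
# module-level globals used by callback() (viz, LOG_DIR, ALGO, ...) must be set beforehand:
#
#     from stable_baselines import PPO2
#     model = PPO2("MlpPolicy", "CartPole-v1", verbose=0)
#     model.learn(total_timesteps=10000, callback=callback)  # returning False stops training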
def main():
    load_args = parseArguments()
    train_args, load_path, algo_name, algo_class, srl_model_path, env_kwargs = loadConfigAndSetup(load_args)
    log_dir, envs, algo_args = createEnv(load_args, train_args, algo_name, algo_class, env_kwargs)

    assert (not load_args.plotting and not load_args.action_proba)\
        or load_args.num_cpu == 1, "Error: plotting and action probability display require num_cpu == 1"

    tf.reset_default_graph()
    set_global_seeds(load_args.seed)
    # createTensorflowSession()


    printYellow("Compiling Policy function....")
    printYellow(load_path)
    method = algo_class.load(load_path, args=algo_args)

    dones = [False for _ in range(load_args.num_cpu)]
    # HACK: detect a custom vec env by checking whether the outermost wrapper is WrapFrameStack.
    # This identifies algorithms that wrap their environment the same way deepq does.
    # It is considered a hack because we cannot tell whether this wrapper was added deeper
    # in the environment's wrapper stack.
    using_custom_vec_env = isinstance(envs, WrapFrameStack)

    obs = envs.reset()
    if using_custom_vec_env:
        obs = obs.reshape((1,) + obs.shape)

    # plotting init
    if load_args.plotting:
        plt.pause(0.1)
        fig = plt.figure()
        old_obs = []
        if registered_env[train_args["env"]][2] == PlottingType.PLOT_3D:
            ax = fig.add_subplot(111, projection='3d')
            line, = ax.plot([], [], [], c=[1, 0, 0, 1], label="episode 0")
            point = ax.scatter([0], [0], [0], c=[1, 0, 0, 1])
            min_zone = [+np.inf, +np.inf, +np.inf]
            max_zone = [-np.inf, -np.inf, -np.inf]
            amplitude = [0, 0, 0]
            min_state_dim = 3
        else:
            ax = fig.add_subplot(111)
            line, = ax.plot([], [], c=[1, 0, 0, 1], label="episode 0")
            point = ax.scatter([0], [0], c=[1, 0, 0, 1])
            min_zone = [+np.inf, +np.inf]
            max_zone = [-np.inf, -np.inf]
            amplitude = [0, 0]
            min_state_dim = 2
        fig.legend()

        if train_args["srl_model"] in ["ground_truth", "supervised"]:
            delta_obs = [envs.get_original_obs()[0]]
        else:
            # we need to rebuild the PCA projection in order to visualize correctly in 3D,
            # so load the saved image -> state representations
            path = "/".join(srl_model_path.split("/")[:-1]) + "/image_to_state.json"
            X = np.array(list(json.load(open(path, 'r')).values()))

            X = fixStateDim(X, min_state_dim=min_state_dim)

            # estimate the PCA
            if registered_env[train_args["env"]][2] == PlottingType.PLOT_3D:
                pca = PCA(n_components=3)
            else:
                pca = PCA(n_components=2)
            pca.fit(X)
            delta_obs = [pca.transform(fixStateDim([obs[0]], min_state_dim=min_state_dim))[0]]
        plt.pause(0.00001)

    # check if the algorithm has a defined getActionProba function before allowing action_proba plotting
    if load_args.action_proba:
        if not hasattr(method, "getActionProba"):
            printYellow("Warning: requested flag --action-proba, "
                        "but the algorihtm {} does not implement 'getActionProba'".format(algo_name))
        else:
            fig_prob = plt.figure()
            ax_prob = fig_prob.add_subplot(111)
            old_obs = []
            if train_args["continuous_actions"]:
                ax_prob.set_ylim(np.min(envs.action_space.low), np.max(envs.action_space.high))
                bar = ax_prob.bar(np.arange(np.prod(envs.action_space.shape)),
                                  np.array([0] * np.prod(envs.action_space.shape)),
                                  color=plt.get_cmap('viridis')(int(1 / np.prod(envs.action_space.shape) * 255)))
            else:
                ax_prob.set_ylim(0, 1)
                bar = ax_prob.bar(np.arange(envs.action_space.n), np.array([0] * envs.action_space.n),
                                  color=plt.get_cmap('viridis')(int(1 / envs.action_space.n * 255)))
            plt.pause(1)
            background_prob = fig_prob.canvas.copy_from_bbox(ax_prob.bbox)

    n_done = 0
    last_n_done = 0
    episode = 0
    for i in range(load_args.num_timesteps):
        actions = method.getAction(obs, dones)
        obs, rewards, dones, _ = envs.step(actions)
        if using_custom_vec_env:
            obs = obs.reshape((1,) + obs.shape)

        # plotting
        if load_args.plotting:
            if train_args["srl_model"] in ["ground_truth", "supervised"]:
                ajusted_obs = envs.get_original_obs()[0]
            else:
                ajusted_obs = pca.transform(fixStateDim([obs[0]], min_state_dim=min_state_dim))[0]

            # create a new line, if the episode is finished
            if np.sum(dones) > 0:
                old_obs.append(np.array(delta_obs))
                line.set_c(sns.color_palette()[episode % len(sns.color_palette())])
                episode += 1
                if registered_env[train_args["env"]][2] == PlottingType.PLOT_3D:
                    line, = ax.plot([], [], [], c=[1, 0, 0, 1], label="episode " + str(episode))
                else:
                    line, = ax.plot([], [], c=[1, 0, 0, 1], label="episode " + str(episode))
                fig.legend()
                delta_obs = [adjusted_obs]
            else:
                delta_obs.append(adjusted_obs)

            coor_plt = fixStateDim(np.array(delta_obs), min_state_dim=min_state_dim)[1:]
            unstack_val = coor_plt.shape[1] // train_args.get("num_stack", 1)
            coor_plt = coor_plt[:, -unstack_val:]

            # updating the 3d vertices for the line and the dot drawing, to avoid redrawing the entire image
            if registered_env[train_args["env"]][2] == PlottingType.PLOT_3D:
                line._verts3d = (coor_plt[:, 0], coor_plt[:, 1], coor_plt[:, 2])
                point._offsets3d = (coor_plt[-1:, 0], coor_plt[-1:, 1], coor_plt[-1:, 2])
                if coor_plt.shape[0] > 0:
                    min_zone = np.minimum(np.amin(coor_plt, axis=0), min_zone)
                    max_zone = np.maximum(np.amax(coor_plt, axis=0), max_zone)
                    amplitude = max_zone - min_zone + 1e-10
                ax.set_xlim(min_zone[0] - abs(amplitude[0] * 0.2), max_zone[0] + abs(amplitude[0] * 0.2))
                ax.set_ylim(min_zone[1] - abs(amplitude[1] * 0.2), max_zone[1] + abs(amplitude[1] * 0.2))
                ax.set_zlim(min_zone[2] - abs(amplitude[2] * 0.2), max_zone[2] + abs(amplitude[2] * 0.2))
            else:
                line.set_xdata(coor_plt[:, 0])
                line.set_ydata(coor_plt[:, 1])
                point._offsets = coor_plt[-1:, :]
                if coor_plt.shape[0] > 0:
                    min_zone = np.minimum(np.amin(coor_plt, axis=0), min_zone)
                    max_zone = np.maximum(np.amax(coor_plt, axis=0), max_zone)
                    amplitude = max_zone - min_zone + 1e-10
                ax.set_xlim(min_zone[0] - abs(amplitude[0] * 0.2), max_zone[0] + abs(amplitude[0] * 0.2))
                ax.set_ylim(min_zone[1] - abs(amplitude[1] * 0.2), max_zone[1] + abs(amplitude[1] * 0.2))

            # Draw every 5 frames to avoid UI freezing
            if i % 5 == 0:
                fig.canvas.draw()
                plt.pause(0.000001)

        if load_args.action_proba and hasattr(method, "getActionProba"):
            # When continuous actions are needed, we cannot plot the action probability of every action
            # in the action space, so we show the action directly instead
            if train_args["continuous_actions"]:
                pi = method.getAction(obs, dones)
            else:
                pi = method.getActionProba(obs, dones)

            fig_prob.canvas.restore_region(background_prob)
            for act, rect in enumerate(bar):
                if train_args["continuous_actions"]:
                    rect.set_height(pi[0][act])
                    color_val = np.abs(pi[0][act]) / max(np.max(envs.action_space.high),
                                                         np.max(np.abs(envs.action_space.low)))
                else:
                    rect.set_height(softmax(pi[0])[act])
                    color_val = softmax(pi[0])[act]
                rect.set_color(plt.get_cmap('viridis')(int(color_val * 255)))
                ax_prob.draw_artist(rect)
            fig_prob.canvas.blit(ax_prob.bbox)

        if using_custom_vec_env:
            if dones:
                obs = envs.reset()
                obs = obs.reshape((1,) + obs.shape)

        n_done += np.sum(dones)
        if (n_done - last_n_done) > 1:
            last_n_done = n_done
            _, mean_reward = computeMeanReward(log_dir, n_done)
            print("{} episodes - Mean reward: {:.2f}".format(n_done, mean_reward))
    _, mean_reward = computeMeanReward(log_dir, n_done)
    print("{} episodes - Mean reward: {:.2f}".format(n_done, mean_reward))