import numpy as np
import matplotlib.pyplot as plt


def plot_colored_scatter():  # the original function header was truncated; this name is assumed
    fig, ax = plt.subplots(1, 1, figsize=(6, 6))

    # draw NUM_VALS points uniformly at random in [0, 20) x [0, 20)
    NUM_VALS = 20
    x = np.random.uniform(0, NUM_VALS, size=NUM_VALS)
    y = np.random.uniform(0, NUM_VALS, size=NUM_VALS)

    # map y onto the 'autumn_r' colormap over the range [2, 10], so
    #   y <= 2  is yellow
    #   y >= 10 is red
    #   2 < y < 10 shades from yellow to red according to its value
    COL = MplColorHelper('autumn_r', 2, 10)

    scat = ax.scatter(x, y, s=300, c=COL.get_rgb(y))
    ax.set_title('Well defined discrete colors')
    plt.show()
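
# MplColorHelper is used above (and again in Example #3) but is not defined in
# these snippets. A minimal sketch of such a helper, assuming it is the usual
# thin wrapper around matplotlib's ScalarMappable that maps scalar values in
# [start_val, stop_val] onto a colormap:
import matplotlib.cm as cm
import matplotlib.colors as mcolors


class MplColorHelper:
    def __init__(self, cmap_name, start_val, stop_val):
        self.cmap_name = cmap_name
        # values outside [start_val, stop_val] map to the colormap's end colors
        self.norm = mcolors.Normalize(vmin=start_val, vmax=stop_val)
        self.scalar_map = cm.ScalarMappable(norm=self.norm, cmap=cmap_name)

    def get_rgb(self, val):
        # accepts a scalar or an array; returns the corresponding RGBA value(s)
        return self.scalar_map.to_rgba(val)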


if __name__ == '__main__':
    import gym
    import environments
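    # `environments` is a project-local package; importing it presumably registers the
    # custom MountainCarContinuousColor-v0 id with gym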
    from sac.envs import GymEnv
    from time import sleep
    env = GymEnv('MountainCarContinuousColor-v0')

    env.reset()
    env.env.render()
    env.step(np.array([0.5]))  # continuous Box action spaces expect an array-like action
    env.env.render()
    sleep(3)
Example #2
def main(env_id, seed, entropy_coeff, n_epochs, dynamic_coeff, clip_norm,
         normalize_obs, buffer_size, max_path_length, min_pool_size,
         batch_size, policy_mode, eval_model, e, stochastic):
    tf.set_random_seed(seed=seed)

    env = GymEnv(env_id)
    env.min_action = env.action_space.low[0]
    env.max_action = env.action_space.high[0]
    if hasattr(env, "seed"):
        env.seed(seed)
    else:
        env.env.seed(seed)

    # define value function
    layer_size = 100
    qf = NNQFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec,
                     hidden_layer_sizes=(layer_size, layer_size))

    # select the policy class according to policy_mode
    if policy_mode == "GMMPolicy":
        # use GMM policy
        policy = GMMPolicy(env_spec=env.spec,
                           K=4,
                           hidden_layer_sizes=[layer_size, layer_size],
                           qf=qf,
                           reg=1e-3,
                           squash=True)
    elif policy_mode == "EExploitationPolicy":
        policy = EExploitationPolicy(
            env_spec=env.spec,
            K=4,
            hidden_layer_sizes=[layer_size, layer_size],
            qf=qf,
            reg=1e-3,
            squash=True,
            e=e)

    else:
        # expect policy_mode of the form "Knack-<mode>", e.g. "Knack-p_control"
        prefix, mode = str(policy_mode).split('-')
        if prefix != "Knack":
            raise AssertionError(
                "policy_mode should be GMMPolicy, EExploitationPolicy, "
                "Knack-p_control, Knack-exploitation or Knack-exploration")
        else:
            policy = KnackBasedPolicy(
                a_lim_lows=env.action_space.low,
                a_lim_highs=env.action_space.high,
                mode=mode,
                env_spec=env.spec,
                K=4,
                hidden_layer_sizes=[layer_size, layer_size],
                qf=qf,
                vf=vf,
                reg=1e-3,
                squash=True)

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=n_epochs,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    max_replay_buffer_size = buffer_size
    pool = SimpleReplayBuffer(env_spec=env.spec,
                              max_replay_buffer_size=max_replay_buffer_size)
    sampler_params = {
        'max_path_length': max_path_length,
        'min_pool_size': min_pool_size,
        'batch_size': batch_size
    }
    sampler = NormalizeSampler(
        **sampler_params) if normalize_obs else SimpleSampler(**sampler_params)

    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(base_kwargs=base_kwargs,
                    env=env,
                    policy=policy,
                    pool=pool,
                    qf=qf,
                    vf=vf,
                    lr=3e-4,
                    scale_reward=1.,
                    discount=0.99,
                    tau=1e-2,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    dynamic_coeff=dynamic_coeff,
                    entropy_coeff=entropy_coeff,
                    clip_norm=clip_norm)

    algorithm._sess.run(tf.global_variables_initializer())
    # -------------- setting done ------------------------

    # -------------- main process ------------------------
    with algorithm._sess.as_default():
        algorithm._saver.restore(algorithm._sess, eval_model)

        if stochastic:
            knack_file = os.path.join(os.path.dirname(eval_model),
                                      "array/epoch0_2001.npz")
            final_knacks = np.load(knack_file)['knack_kurtosis'][-1]

        env = algorithm._env

        if hasattr(env, "env"):
            env = env.env

        # np.random.seed(seed)
        # env.seed(seed)
        num_data = 50  # num_data * nprocess == 1500
        steps_thresh = 1000
        data = {'acs': [], 'ep_rets': [], 'obs': [], 'rews': []}
        for i in range(num_data):
            obs = env.reset()
            done = False
            steps = 0
            ret = 0
            tmp_data = {'acs': [], 'obs': [], 'rews': []}
            if stochastic:
                _min = np.min(final_knacks)
                _max = np.max(final_knacks)
            print("start episode {}".format(i))
            while not done:
                steps += 1
                # env.render()
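                # Knack-based switching: compute the kurtosis-based "knack" score for the
                # current state, min-max normalize it against the values loaded from the
                # training run (final_knacks), and act deterministically (exploit) only
                # when it exceeds the 0.8 threshold; otherwise keep sampling stochastically.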
                if stochastic:
                    if hasattr(algorithm._policy, "knack_thresh"):
                        v, mean, var, kurtosis = algorithm._policy.calc_and_update_knack(
                            [obs])
                        knack_value = kurtosis[0]
                        # _min = min(knack_value, _min)
                        # _max = max(knack_value, _max)
                        knack_value = (knack_value - _min) / (_max - _min)
                        if knack_value > 0.8:  # TODO: the 0.8 threshold is a hyperparameter
                            print("knack {}".format(knack_value))
                            was = algorithm._policy._is_deterministic
                            algorithm._policy._is_deterministic = True
                            action, _ = algorithm.policy.get_action(
                                obs.flatten())
                            algorithm._policy._is_deterministic = was
                        else:
                            action, _ = algorithm.policy.get_action(
                                obs.flatten())
                    else:
                        algorithm._policy._is_deterministic = False
                        action, _ = algorithm.policy.get_action(obs.flatten())
                else:
                    if hasattr(algorithm._policy, "_is_deterministic"):
                        algorithm._policy._is_deterministic = True
                    action, _ = algorithm.policy.get_action(obs.flatten())

                obs_next, rew, done, _ = env.step(action)
                tmp_data['obs'].append(obs)
                tmp_data['acs'].append(action)
                tmp_data['rews'].append(rew)
                ret += rew

                obs = obs_next
                if steps >= steps_thresh:
                    done = True

            data['ep_rets'].append(ret)
            for k, v in tmp_data.items():
                data[k].append(v)

    # np.savez_compressed("a.npz", **data)
    # print("return mean: {}".format(np.mean(data['ep_rets'])))
    return data
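
# A hypothetical invocation of the evaluation entry point above. The argument names
# follow main()'s signature; the model path and hyperparameter values are placeholders,
# not values taken from the original project:
#
#   data = main(env_id='MountainCarContinuous-v0', seed=1, entropy_coeff=0.0,
#               n_epochs=2000, dynamic_coeff=False, clip_norm=None,
#               normalize_obs=False, buffer_size=int(1e6), max_path_length=1000,
#               min_pool_size=1000, batch_size=128, policy_mode='GMMPolicy',
#               eval_model='/path/to/model', e=0.0, stochastic=False)
#   print("return mean: {}".format(np.mean(data['ep_rets'])))
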
Example #3
def main(root_dir):
    # tf.set_random_seed(seed=seed)
    # env = GymEnv('MountainCarContinuous-v0')
    env = GymEnv('MountainCarContinuousColor-v0')

    max_replay_buffer_size = int(1e6)
    sampler_params = {'max_path_length': 1000, 'min_pool_size': 1000, 'batch_size': 128}

    # TODO Normalize or not
    sampler = SimpleSampler(**sampler_params)

    entropy_coeff = 0.
    dynamic_coeff = True

    # define value function
    layer_size = 100

    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(layer_size, layer_size))

    # use GMM policy
    policy = GMMPolicy(
        env_spec=env.spec,
        K=4,
        hidden_layer_sizes=[layer_size, layer_size],
        qf=qf,
        reg=1e-3,
        squash=True
    )

    # TODO
    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=10,
        # scale_reward=1,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=20,
        eval_deterministic=True,
    )

    pool = SimpleReplayBuffer(env_spec=env.spec, max_replay_buffer_size=max_replay_buffer_size)
    base_kwargs = dict(base_kwargs, sampler=sampler)

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=3e-4,
        scale_reward=1.,
        discount=0.99,
        tau=1e-2,
        target_update_interval=1,
        action_prior='uniform',
        save_full_state=False,
        dynamic_coeff=dynamic_coeff,
        entropy_coeff=entropy_coeff
    )


    algorithm._sess.run(tf.global_variables_initializer())


    # TODO Normalize or not
    # Currently only MountainCar is available
    with algorithm._sess.as_default():
        model_file = os.path.join(root_dir, 'model')
        algorithm._saver.restore(algorithm._sess, model_file)

        for i in range(1):
            obs = env.reset()
            env.env.render()
            sleep(4.0)
            traj = [obs]
            done = False

            while not done:
                env.env.render()
                action, _ = algorithm.policy.get_action(obs.flatten())  # get_action returns an (action, info) pair, as in Example #2
                obs, rew, done, _ = env.step(action)
                traj.append(obs.flatten())

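            # score every visited state with the (externally defined) sub_goal_detect helper;
            # the kurtosis-based knack scores are used below to rank candidate sub-goal states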
            knack, knack_kurtosis = sub_goal_detect(algorithm, traj)
            idxs = np.argsort(knack_kurtosis)
            # idxs = np.argsort(knack)
            print(idxs[::-1])

            COL = MplColorHelper('Blues', np.min(knack_kurtosis), np.max(knack_kurtosis))
            # replay the whole trajectory once, coloring the car by each state's knack kurtosis
            for j, s in enumerate(traj):
                env.env.state = np.array(s)
                rgba = COL.get_rgb(knack_kurtosis[j])
                env.env.render(car_rgba=rgba)
            sleep(1.0)

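            # for each candidate sub-goal, in decreasing order of knack kurtosis, reset the
            # environment and replay the trajectory prefix that leads up to that state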
            for idx in idxs[::-1]:
                obs = env.reset()
                env.env.state = np.array(traj[0])
                rgba = COL.get_rgb(knack_kurtosis[0])
                env.env.render(car_rgba=rgba)
                for j in range(idx+1):
                    env.env.state = np.array(traj[j])
                    rgba = COL.get_rgb(knack_kurtosis[j])

                    # env.env.viewer.geoms[1].set_color(*(0.0, 0.0, 1.0))
                    env.env.render(car_rgba=rgba)
                sleep(0.5)