Example No. 1
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'Pendulum-v0'
    pickle_path = '../gpirl/notebooks/plots/rllab_trpo_trainig/itr_112.pkl'
    # pickle_path = 'data/acrobat_data_rllab_trpo/exp_1/itr_800.pkl'
    iter_data = joblib.load(pickle_path)
    env = GymEnv(env_name)
    max_r = 1
    while True:
        o = env.reset()
        disc_r = 0
        r_sum = 0
        done = False
        i = 0
        while not done:
            env.render()
            a, _ = iter_data['policy'].get_action(o)
            o, r, done, _ = env.step(a)
            # s = [np.arccos(o[0]), np.arccos(o[1])]
            # r = -np.cos(s[0]) - np.cos(s[1] + s[0])
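            # note: the exponent counts down from a fixed horizon of 500, so later
            # steps receive more weight than with the conventional 0.99**i discounting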
            disc_r += r * 0.99**(500 - i)
            r_sum += r
            i += 1
        # max_r = r
        print("disc_r : {} , sum_r : {}".format(disc_r, r_sum))
        print("last x : {}".format(o[0]))
        print("-------------------------")
Example No. 2
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'Cartpole-v3'
    pickle_path = 'data/Cartpole_v3_data_rllab_TRPO/exp_1/itr_1200.pkl'
    # pickle_path = 'data/acrobat_data_rllab_trpo/exp_1/itr_800.pkl'
    iter_data = joblib.load(pickle_path)
    env = GymEnv(env_name)
    max_r = 1
    while True:
        o = env.reset()
        disc_r = 0
        r_sum = 0
        done = False
        i = 0
        print("stable point : {}".format(env.env._stable_x))
        while not done:
            env.render()
            a, _ = iter_data['policy'].get_action(o)
            o, r, done, _ = env.step(a)
            # s = [np.arccos(o[0]), np.arccos(o[1])]
            # r = -np.cos(s[0]) - np.cos(s[1] + s[0])
            disc_r += r * 0.99**(500 - i)
            r_sum += r
            i += 1
        # max_r = r
        print("disc_r : {} , sum_r : {}".format(disc_r, r_sum))
        print("last x : {}".format(o[0]))
        print("-------------------------")
Example No. 3
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)
    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=350,
        epoch_length=350,
        min_pool_size=350,
        n_epochs=600,
        discount=0.99,
        scale_reward=1.0/140.0,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment the next line to enable plotting
        # plot=True,
    )
    data_path = 'data/%s_data_rllab_%s/%s/'%(env_name.replace('-', '_'), 
                                             str(algo.__class__.__name__), 
                                             exp_name)
    os.makedirs(data_path, exist_ok=True)
    logger.set_snapshot_dir(data_path)
    algo.train()
    logger.set_snapshot_dir(None)
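The training snippet defines main but not how it is invoked; a minimal, assumed entry point (the 'exp_1' experiment name mirrors the folder names used elsewhere on this page) could be:

if __name__ == "__main__":
    # 'exp_1' is a placeholder experiment name matching the data/ paths above
    main("exp_1")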
Example No. 4
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)
    itr_num = 400
    episode_length = 400

    while True:
        o = env.reset()
        disc_r = 0
        r_sum = 0
        done = False
        i = 0
        pickle_path = '../gpirl/notebooks_lunarlander/plots/gpirl_400_iter_post_trainig/itr_{}.pkl'.format(
            itr_num)
        # pickle_path = 'data/LunarLanderContinuous_v3_data_rllab_PPO/exp_1/itr_{}.pkl'.format(itr_num)
        iter_data = joblib.load(pickle_path)
        while i < episode_length:
            env.render()
            a, _ = iter_data['policy'].get_action(o)
            o, r, done, _ = env.step(a)
            # s = [np.arccos(o[0]), np.arccos(o[1])]
            # r = -np.cos(s[0]) - np.cos(s[1] + s[0])
            disc_r += r * 0.99**(500 - i)
            r_sum += r
            i += 1
            if done:
                break
        # max_r = r
        print("disc_r : {} , sum_r : {}".format(disc_r, r_sum))
        print("last x : {}".format(o[0]))
        print("iterations : {}".format(i))
        print("-------------------------")
Example No. 5
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)
    max_r = 1

    itr_num = 0
    itr_inc = 30
    init_max_trials = 6
    norm_max_trials = 4

    max_iters = 400
    trial_num = 1

    while itr_num < 211:
        o = env.reset()
        if itr_num < 1:
            max_trials = init_max_trials
        else:
            max_trials = norm_max_trials

        disc_r = 0
        r_sum = 0
        done = False
        i = 0
        # pickle_path = '../gpirl/notebooks_lunarlander/plots/gpirl_400_iter_post_trainig/itr_{}.pkl'.format(itr_num)
        pickle_path = 'data/LunarLanderContinuous_v3_data_rllab_PPO/exp_1/itr_{}.pkl'.format(
            itr_num)
        iter_data = joblib.load(pickle_path)
        while i < max_iters:
            env.render()
            a, _ = iter_data['policy'].get_action(o)
            o, r, done, _ = env.step(a)
            # s = [np.arccos(o[0]), np.arccos(o[1])]
            # r = -np.cos(s[0]) - np.cos(s[1] + s[0])
            disc_r += r * 0.99**(500 - i)
            r_sum += r
            i += 1
            if done:
                break
        # max_r = r
        print("disc_r : {} , sum_r : {}".format(disc_r, r_sum))
        print("last x : {}".format(o[0]))
        print("iterations : {}".format(i))
        print("-------------------------")
        if trial_num % max_trials == 0:
            if itr_num < 1:
                trial_num = norm_max_trials
            itr_num += itr_inc
            print("**************** itr number : {} **********************".
                  format(itr_num))
        trial_num += 1
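The loop above replays a fixed schedule of PPO checkpoints: the first checkpoint is replayed for init_max_trials episodes and every later one for norm_max_trials episodes before itr_num advances. With itr_inc = 30 and the cutoff of 211, the checkpoints it visits are:

# Checkpoint iterations visited by the sweep above (itr_num starts at 0,
# advances by itr_inc = 30, and the loop exits once itr_num reaches 211):
checkpoints = list(range(0, 211, 30))  # [0, 30, 60, 90, 120, 150, 180, 210]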
Example No. 6
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'Acrobot-v2'
    env = GymEnv(env_name)
    policy = GaussianMLPPolicy(env_spec=env, hidden_sizes=(64, 64))
    algo = PPO(env=env,
               policy=policy,
               n_itr=1500,
               batch_size=8000,
               max_path_length=1000,
               discount=0.95,
               store_paths=True,
               entropy_weight=ent_wt,
               baseline=LinearFeatureBaseline(env_spec=env))
    data_path = 'data/acrobat_data_rllab_ppo/%s/' % exp_name
    os.makedirs(data_path, exist_ok=True)
    logger.set_snapshot_dir(data_path)
    algo.train()
    logger.set_snapshot_dir(None)
Example No. 7
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)
    policy = GaussianMLPPolicy(env_spec=env, hidden_sizes=(64, 64))
    baseline = GaussianMLPBaseline(env_spec=env)
    algo = PPO(env=env,
               policy=policy,
               n_itr=1500,
               batch_size=8000,
               max_path_length=1000,
               discount=0.99,
               store_paths=True,
               entropy_weight=ent_wt,
               baseline=baseline)
    data_path = 'data/%s_data_rllab_%s/%s/' % (env_name.replace(
        '-', '_'), str(algo.__class__.__name__), exp_name)
    os.makedirs(data_path, exist_ok=True)
    logger.set_snapshot_dir(data_path)
    algo.train()
    logger.set_snapshot_dir(None)
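For reference, the data_path format string above resolves to the same directory that the checkpoint replay in Example No. 5 reads its itr_*.pkl files from; a small worked example (exp_name assumed to be 'exp_1'):

env_name = 'LunarLanderContinuous-v3'
data_path = 'data/%s_data_rllab_%s/%s/' % (env_name.replace('-', '_'), 'PPO', 'exp_1')
# -> 'data/LunarLanderContinuous_v3_data_rllab_PPO/exp_1/'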
Example No. 8
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'Acrobot-v2'
    pickle_path = 'data/acrobat_data_moded_reward_ppo/exp_1/itr_800.pkl'
    # pickle_path = 'data/acrobat_data_rllab_trpo/exp_1/itr_800.pkl'
    iter_data = joblib.load(pickle_path)
    env = GymEnv(env_name)
    max_r = 1
    while True:
        o = env.reset()
        disc_r = 0
        done = False
        i = 0
        # print("New episode!")
        while not done:
            env.render()
            a, _ = iter_data['policy'].get_action(o)
            o, r, done, _ = env.step(a)
            s = [np.arccos(o[0]), np.arccos(o[1])]
            # r = -np.cos(s[0]) - np.cos(s[1] + s[0])
            disc_r += r * 0.99**(1000 - i)
            i += 1
        # max_r = r
        print("disc_r : {}".format(disc_r))
Example No. 9
from __future__ import print_function
from inverse_rl.envs import register_custom_envs
import sys, gym, time
import numpy as np
from pyglet.window import key as ks
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import os
#
# Test yourself as a learning agent! Pass environment name as a command-line argument, for example:
#
# python keyboard_agent.py SpaceInvadersNoFrameskip-v4
#
keyboard = ks.KeyStateHandler()
register_custom_envs()
env = gym.make(
    'LunarLanderContinuous-v3' if len(sys.argv) < 2 else sys.argv[1])

Kp = 0.1
Kt = 0.2
gamma = 0.9
save_path = "data/lunarlander_demo/"
ACTIONS = env.action_space
RESET_ACTION = np.asarray([0., 0.])
SKIP_CONTROL = 0  # Repeat the previous control decision SKIP_CONTROL times; this lets
# you test how much frame skip is still usable.
# print("ACTION high low : {} , {}".format(env.action_space.high, env.action_space.low))
human_wants_restart = False
human_sets_pause = False
state_cols = ["state_" + str(i) for i in range(env.observation_space.shape[0])]
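The listing is cut off at this point. For context, a pyglet KeyStateHandler only reports key states after it has been pushed onto the window that gym creates on the first render; a hedged sketch of that wiring is below (the env.unwrapped.viewer.window attribute chain is an assumption about gym's pyglet-based Box2D viewer and may differ between gym versions):

# Hedged sketch: attach the KeyStateHandler so the control loop can poll keys.
env.reset()
env.render()  # creates the pyglet window on first call
env.unwrapped.viewer.window.push_handlers(keyboard)

# Illustrative poll inside the control loop (LunarLanderContinuous actions are
# [main engine, side engine]); the key mapping here is an assumption:
# a = np.asarray([1.0 if keyboard[ks.UP] else 0.0,
#                 (1.0 if keyboard[ks.RIGHT] else 0.0) - (1.0 if keyboard[ks.LEFT] else 0.0)])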