Example #1
def train():
    env = CBNEnv.create(1)
    n_steps = int(1024 / env.num_envs)

    logdir, logger = setup_logger()
    env.set_attr("logger", logger)
    log_callback = lambda _locals, _globals: env.env_method("log_callback")

    policy_kwargs = dict(n_lstm=192, layers=[], feature_extraction="mlp")

    model = A2C(policy=LstmPolicy,
                env=env,
                alpha=0.95,
                gamma=0.93,
                n_steps=n_steps,  # use the per-env rollout length computed above
                vf_coef=0.05,
                ent_coef=0.25,
                learning_rate=1e-4,
                max_grad_norm=10000,
                lr_schedule='linear',
                policy_kwargs=policy_kwargs,
                tensorboard_log=logdir,
                verbose=1,
                seed=94566)

    model.learn(total_timesteps=int(1e7), callback=log_callback)
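A possible continuation of the example above: persisting the trained model with stable-baselines' save/load API. The file name "a2c_cbn_lstm" is an assumption for illustration, not part of the original snippet.

    # Hypothetical continuation of train(): persist the weights and reload them later.
    model.save("a2c_cbn_lstm")  # file name chosen only for illustration
    loaded_model = A2C.load("a2c_cbn_lstm", env=env)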
Example #2
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env):
    """
    Train A2C model for atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_env: (int) The number of environments
    """
    policy_fn = None
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    elif policy == 'lnlstm':
        policy_fn = CnnLnLstmPolicy
    if policy_fn is None:
        raise ValueError("Error: policy {} not implemented".format(policy))

    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)

    model = A2C(policy_fn, env, lr_schedule=lr_schedule)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
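A hedged invocation sketch for the helper above; the environment id and the argument values are illustrative assumptions, not values taken from the original snippet.

# Example call (assumed values): CNN policy on Breakout with 16 parallel environments.
train(env_id='BreakoutNoFrameskip-v4', num_timesteps=int(1e6), seed=0,
      policy='cnn', lr_schedule='constant', num_env=16)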
Example #3
def setup(model_params, output_folder_path):
    # accept either a str or a Path; the "/" joins below require a Path
    output_folder_path = Path(output_folder_path)
    latest_model_path = find_latest_model(output_folder_path)
    if latest_model_path is None:
        print("Creating model...")
        model = A2C(CnnPolicy, **model_params)
    else:
        print("Loading model...")
        model = A2C.load(latest_model_path, **model_params)
    tensorboard_dir = (output_folder_path / "tensorboard")
    ckpt_dir = (output_folder_path / "checkpoints")
    tensorboard_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    checkpoint_callback = CheckpointCallback(save_freq=200,
                                             verbose=2,
                                             save_path=ckpt_dir.as_posix())
    # event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    logging_callback = CustomCallback(model=model, verbose=1)
    callbacks = CallbackList([checkpoint_callback, logging_callback])
    return model, callbacks
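A hedged usage sketch for setup(); the folder name, env, and model_params contents are assumptions chosen for illustration (an image-based environment is used so that CnnPolicy applies).

# Assumed example values; model_params is forwarded to A2C(...) or A2C.load(...).
from pathlib import Path
from stable_baselines.common.cmd_util import make_atari_env

env = make_atari_env("BreakoutNoFrameskip-v4", num_env=1, seed=0)
model_params = dict(env=env, verbose=1, tensorboard_log="output/tensorboard")
model, callbacks = setup(model_params, Path("output"))
model.learn(total_timesteps=100000, callback=callbacks)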
Example #4
def main(env,
         load,
         save_path,
         load_path=None,
         train_timesteps=1.25e6,
         eval_timesteps=5e3):

    # arguments
    print(
        "env %s; load %s; save_path %s; load_path %s; train_timesteps %s; eval_timesteps %s;"
        % (env, load, save_path, load_path, train_timesteps, eval_timesteps))
    train_timesteps = int(float(train_timesteps))
    eval_timesteps = int(float(eval_timesteps))

    # models path
    model_dir = os.getcwd() + "/models/"
    os.makedirs(model_dir, exist_ok=True)

    # logging path
    log_dir = os.getcwd() + "/log/" + save_path
    os.makedirs(log_dir, exist_ok=True)

    # absolute save path and models path
    save_path = model_dir + save_path
    if load and not load_path:
        print("no load path given, exiting...")
        sys.exit()
    elif load:
        load_path = model_dir + load_path

    # make environment, flattened environment, monitor, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(
        env, ['observation', 'achieved_goal', 'desired_goal'])
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # load model, or start from scratch
    if load:
        print("loading model from: " + load_path)
        model = A2C.load(load_path, env=env)
    else:
        print("training model from scratch")
        model = A2C(MlpPolicy, env, verbose=1)

    # evaluate current model
    mean_reward_before_train = evaluate(model, env, num_steps=eval_timesteps)

    # train model
    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0
    model.learn(total_timesteps=train_timesteps, callback=None)

    # save model
    print("saving model to:" + save_path)
    model.save(save_path)

    # evaluate post training model
    mean_reward_after_train = evaluate(model, env, num_steps=eval_timesteps)

    # results
    print("reward before training:" + str(mean_reward_before_train))
    print("reward after training:" + str(mean_reward_after_train))
    print("done")
Example #5
import logging
import sys

import gym
import numpy as np

from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.policies import MlpPolicy
# from stable_baselines.ppo2 import PPO2
from stable_baselines.a2c import A2C

# import gym_fin


logging.basicConfig(stream=sys.stdout)
logger = logging.getLogger(__name__)

env = gym.make('gym_fin:Pension-v0')
# vectorized environments allow to easily multiprocess training
env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run

# model = PPO2(MlpPolicy, env, verbose=0)
model = A2C(MlpPolicy, env, verbose=0)


def evaluate(model, num_steps=1000):
    """Evaluate a RL agent
    :param model: (BaseRLModel object) the RL Agent
    :param num_steps: (int) number of timesteps to evaluate it
    :return: (float) Mean reward for the last 100 episodes
    """
    episode_rewards = [0.0]
    obs = env.reset()
    for i in range(num_steps):
        # _states are only useful when using LSTM policies
        action, _states = model.predict(obs)
        # here, action, rewards and dones are arrays
        # because we are using vectorized env
        obs, rewards, dones, info = env.step(action)

        # accumulate reward; start a new episode total whenever the env signals done
        episode_rewards[-1] += rewards[0]
        if dones[0]:
            obs = env.reset()
            episode_rewards.append(0.0)

    # mean reward over (up to) the last 100 episodes
    mean_100ep_reward = round(np.mean(episode_rewards[-100:]), 1)
    print("Mean reward:", mean_100ep_reward, "Num episodes:", len(episode_rewards))
    return mean_100ep_reward
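A hedged usage sketch of the evaluation helper above; the step and timestep counts are arbitrary illustration values.

# Evaluate the untrained agent, train briefly, then evaluate again (arbitrary counts).
mean_reward_before = evaluate(model, num_steps=10000)
model.learn(total_timesteps=25000)
mean_reward_after = evaluate(model, num_steps=10000)
print("mean reward before training:", mean_reward_before)
print("mean reward after training:", mean_reward_after)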
Example #6
from stable_baselines.a2c import A2C
from stable_baselines.acer import ACER
from stable_baselines.acktr import ACKTR
from stable_baselines.deepq import DeepQ
from stable_baselines.ppo1 import PPO1
from stable_baselines.ppo2 import PPO2
from stable_baselines.trpo_mpi import TRPO
from stable_baselines.common.identity_env import IdentityEnv
from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.deepq import models as deepq_models

learn_func_list = [
    lambda e: A2C(policy=MlpPolicy, learning_rate=1e-3, n_steps=1,
                  gamma=0.7, env=e).learn(total_timesteps=10000, seed=0),
    lambda e: ACER(policy=MlpPolicy, env=e, n_steps=1,
                   replay_ratio=1).learn(total_timesteps=10000, seed=0),
    lambda e: ACKTR(policy=MlpPolicy, env=e, learning_rate=5e-4,
                    n_steps=1).learn(total_timesteps=20000, seed=0),
    lambda e: DeepQ(policy=deepq_models.mlp([32]), batch_size=16, gamma=0.1,
                    exploration_fraction=0.001,
                    env=e).learn(total_timesteps=40000, seed=0),
    lambda e: PPO1(policy=MlpPolicy, env=e, lam=0.7, optim_batchsize=16,
                   optim_stepsize=1e-3).learn(total_timesteps=10000, seed=0),
Example #7
def main(env, load_path, fig_path):

    # arguments
    print("env %s; load_path %s; fig_path %s;" % (env, load_path, fig_path))
    log_path = os.getcwd() + "/log/" + load_path
    os.makedirs(os.getcwd() + "/figs/", exist_ok=True)
    fig_path = os.getcwd() + "/figs/" + fig_path
    load_path = os.getcwd() + "/models/" + load_path

    # make environment, flattened environment, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(env, ['observation', 'achieved_goal', 'desired_goal'])
    env = DummyVecEnv([lambda: env])

    # load model
    model = A2C.load(load_path, env=env)
    obs_initial = env.reset()
    obs = obs_initial

    # plot results
    plot_results(fig_path, log_path)

    # initializations
    niter = 10
    counter = 0
    timestep = 0
    results = [[[0,0,0] for i in range(100)], [[0,0,0,0] for i in range(100)]]
    current = [[[0,0,0] for i in range(100)], [[0,0,0,0] for i in range(100)]]
    print("==============================")

    # check initial positions and quaternions
    print("grip", env.envs[0].env.env.sim.data.get_site_xpos('grip'))
    print("box", env.envs[0].env.env.sim.data.get_site_xpos('box'))
    print("tool", env.envs[0].env.env.sim.data.get_site_xpos('tool'))
    print("mocap", env.envs[0].env.env.sim.data.mocap_pos)
    print("quat", env.envs[0].env.env.sim.data.mocap_quat)
    print("==============================")

    # mocap quaternion check
    for i in range(5):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        quat = env.envs[0].env.env.sim.data.mocap_quat
        print("obs", obs)
        print("quat", quat)
    print("==============================")

    # start rendering
    dists = []
    box_goal_pos = np.array([0.6, 0.05, -0.17])
    while True:
        if counter == niter:
            break
        action, _states = model.predict(obs)
        obs_old = obs
        obs, rewards, dones, info = env.step(action)
        quaternion = env.envs[0].env.env.sim.data.mocap_quat
        # detect that the episode has reset back to the initial observation
        if np.array_equal(obs, obs_initial):
            if counter % 10 == 0:
                xyzs = current[0]
                quats = current[1]
                print(xyzs)
                print(quats)
                filename = log_path + "/" + "results_" + str(counter) + ".txt"
                os.makedirs(log_path + "/", exist_ok=True)
                with open(filename, 'w+') as file:
                    for xyz, quat in zip(xyzs, quats):
                        for coord in xyz:
                            file.write(str(coord) + " ")
                        for quat_coord in quat:
                            file.write(str(quat_coord) + " ")
                        file.write("\n")

            box_end_pos = np.array(obs_old[0][3:6].tolist())
            print(box_end_pos)
            print(np.shape(box_end_pos))
            print(box_goal_pos)
            print(np.shape(box_goal_pos))
            dists.append(np.linalg.norm(box_goal_pos - box_end_pos))
            current = [[[0,0,0] for i in range(100)], [[0,0,0,0] for i in range(100)]]
            timestep = 0
            counter += 1
        print(timestep)
        print("obs", obs)
        print("quat", quaternion)

        # for average trajectory, smoothed
        for i in range(3):
            results[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            results[1][timestep][j] += quaternion[0].tolist()[j]

        # for current trajectory
        for i in range(3):
            current[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            current[1][timestep][j] += quaternion[0].tolist()[j]

        timestep += 1
        env.render()

    # smooth paths by taking average, and calculate mean distance to goal state
    for timestep in range(100):
        for i in range(3):
            results[0][timestep][i] /= niter
        for j in range(4):
            results[1][timestep][j] /= niter
    dist = np.mean(dists)

    # print and write to file
    xyzs = results[0]
    quats = results[1]
    filename = log_path + "/" + "results_avg.txt"
    os.makedirs(log_path + "/", exist_ok=True)
    with open(filename, 'w+') as file:
        for xyz, quat in zip(xyzs, quats):
            for coord in xyz:
                file.write(str(coord) + " ")
            for quat_coord in quat:
                file.write(str(quat_coord) + " ")
            file.write("\n")

    # print average distances
    print("average distance of box from end goal: %f" % dist)
Example #8
    :param rank: (int) index of the subprocess
    :param board: (numpy array) pre-determined board for env. 
    """
    if board is not None: 
        def _init(): 
            env = gym.make(env_id)
            env.seed(seed + rank)
            env.reset_task(board)
            return env 
    else:
        def _init():
            env = gym.make(env_id)
            env.seed(seed + rank)
            return env
    set_global_seeds(seed)
    return _init


if __name__ == '__main__':
    num_episodes = int(sys.argv[1])

    env = make_vec_env('grid-v0', n_envs=1, vec_env_cls=SubprocVecEnv)

    policy_kwargs = {'cnn_extractor': mlp, 'n_lstm': n_lstm}
    # policy_kwargs = {'cnn_extractor': cnn, 'n_lstm': n_lstm}
    model = A2C(policy='CnnLstmPolicy', policy_kwargs=policy_kwargs, tensorboard_log='test_log',
                env=env, gamma=gamma, n_steps=n_steps, lr_schedule=lr_schedule, learning_rate=lr,
                ent_coef=ent_coef, vf_coef=vf_coef, verbose=True)

    model.learn(num_episodes)
    model.save("7x7_" + rules + "_" + "metalearning.zip")
Example #9
import pytest

from stable_baselines.a2c import A2C
from stable_baselines.ppo1 import PPO1
from stable_baselines.ppo2 import PPO2
from stable_baselines.trpo_mpi import TRPO
from stable_baselines.common.identity_env import IdentityEnvMultiBinary, IdentityEnvMultiDiscrete
from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines.common.policies import MlpPolicy

MODEL_FUNC_LIST = [
    lambda e: A2C(policy=MlpPolicy, env=e),
    lambda e: PPO1(policy=MlpPolicy, env=e),
    lambda e: PPO2(policy=MlpPolicy, env=e),
    lambda e: TRPO(policy=MlpPolicy, env=e),
]


@pytest.mark.slow
@pytest.mark.parametrize("model_func", MODEL_FUNC_LIST)
def test_identity_multidiscrete(model_func):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    with a multidiscrete action space

    :param model_func: (lambda (Gym Environment): BaseRLModel) the model generator
    """
    env = DummyVecEnv([lambda: IdentityEnvMultiDiscrete(10)])

    model = model_func(env)