Example #1
    env = DummyVecEnv([lambda: env])  # create the vectorized environment

    print('Action space: ', env.action_space)
    print('State space: ', env.observation_space)

    return env


env = make_env()
custom_callback = CustomCallback(env, render=True)
model = PPO2(policy=CnnPolicy,
             env=env,
             verbose=0,
             learning_rate=0.000025,
             tensorboard_log=log_dir)
model = PPO2.load('./agents/best_mario_ppo2model', env=env, verbose=0)

state = env.reset()
total_reward = 0
while True:
    # Render the environment
    env.render()

    # Sleep to slow down rendering
    time.sleep(1 / 25)

    # Model inference
    action, _ = model.predict(state)

    # Execute one step
    state, reward, done, info = env.step(action)
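The loop above never handles the end of an episode. A minimal, self-contained sketch of the same enjoy loop follows, with CartPole-v1 assumed as a stand-in for the Mario environment (VecEnvs auto-reset, so only the reward bookkeeping needs resetting):

import time

import gym
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make('CartPole-v1')])  # vectorized single env
model = PPO2('MlpPolicy', env, verbose=0)             # stand-in; PPO2.load('...', env=env) behaves the same

state = env.reset()
total_reward = 0
for _ in range(1000):
    env.render()
    time.sleep(1 / 25)

    action, _ = model.predict(state)
    state, reward, done, info = env.step(action)

    total_reward += reward[0]
    if done[0]:  # the VecEnv has already reset the underlying env
        print('episode reward:', total_reward)
        total_reward = 0
env.close()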
Example #2
import gym
import gym_LoL

from stable_baselines.common.policies import MlpLstmPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

from notify_run import Notify

if __name__ == "__main__":
    notify = Notify(endpoint='https://notify.run/1GQn88vSML1rmxdz')
    model = None
    model_path = 'ppo_lol'
    try:
        env = DummyVecEnv([lambda: gym.make('LoL-v0')])
        try:
            model = PPO2.load(model_path, env)
        except ValueError:
            model = PPO2(MlpLstmPolicy, env, verbose=1, nminibatches=1)
        for i in range(100):
            model.learn(total_timesteps=2500)
            model.save(model_path)
    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        model.save(model_path)
        notify.send('Training Failed for LoL-v0')
        raise
    else:
        notify.send('Training Completed for LoL-v0')
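A stripped-down sketch of the load-or-create pattern used above, assuming CartPole-v1 in place of the custom LoL-v0 environment and a hypothetical checkpoint name ppo_cartpole (PPO2.load raises ValueError when the file is missing, which is what the except branch catches):

import gym
from stable_baselines import PPO2
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv

model_path = 'ppo_cartpole'  # hypothetical checkpoint name
env = DummyVecEnv([lambda: gym.make('CartPole-v1')])

try:
    # resume from an existing checkpoint if one is present
    model = PPO2.load(model_path, env)
except ValueError:
    # otherwise start from scratch
    model = PPO2(MlpPolicy, env, verbose=1)

for _ in range(10):
    model.learn(total_timesteps=2500)
    model.save(model_path)  # checkpoint after every learning chunk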
Example #3
def run_experiment(
        not_save=False, 
        folder='experiments', 
        weights_location=None,
        tag=None,
        env='Base',
        env_num=4,
        n=0,
        save_interval=10000,
        train_steps=int(1e6),
        description=None,
        weights=None,
        n_steps=200,
        gamma=0.99,
        ):
    
    env_name = env
    if weights is not None and not os.path.isfile(weights):
        raise ValueError("Weights do not exist")

    # Saving args
    args = deepcopy(locals())

    # Get env
    env = getattr(environments, env)
    envs = DummyVecEnv([lambda : env() for i in range(env_num)])

    args['env_config'] = ""  # str(env.get_org_config())
    
    # Check if folder exists and if is a valid name
    if not not_save:
        id,logger,logs_folder,experiment_csv,experiment_folder = \
                create_experiment_folder(folder=folder,tag=tag,args=args)
    else:
        id = -1
        logs_folder = None
        logger = None
        experiment_csv = None
        experiment_folder = None

    if weights is not None:
        model = PPO2.load(
                weights,
                verbose=0,
                tensorboard_log=logs_folder,
                max_grad_norm=100,
                n_steps=n_steps,
                gamma=gamma,
                )
        model.set_env(envs)
    else:
        model = PPO2(
                    CnnPolicy, 
                    envs,
                    verbose=0,
                    tensorboard_log=logs_folder,
                    max_grad_norm=100,
                    n_steps=n_steps,
                )

    if 'interrupting' in env_name:
        for env in envs.envs:
            env.set_interrupting_params(ppo=model)

    # set bar
    callback = Callback(
            not_save=not_save,
            logger=logger,
            train_steps=train_steps,
            n=n,
            experiment_folder=experiment_folder,
            save_interval=save_interval,
            id=id,
            )

    # Start running experiment
    # Creating nice table
    _width = 40
    del args['env_config']
    max_k_width = max([len(k) for k in args])
    print("\n{}".format("#"*_width))
    print("# {1:^{0}} #".format(_width-4, "RUNNING EXPERIMENT"))
    print("# {1:^{0}} #".format(_width-4, ""))
    print("# {1:<{0}} #".format(_width-4, "{0:{2}s}: {1:03d}".format("ID",id,max_k_width)))
    for k,v in args.items():
        if type(v) is int:
            print("# {1:<{0}} #".format(_width-4,"{0:{2}s}: {1:0d}".format(k,v,max_k_width)))
        elif type(v) is float:
            print("# {1:<{0}} #".format(_width-4,"{0:{2}s}: {1:0.3f}".format(k,v,max_k_width)))
        else:
            print("# {1:<{0}} #".format(_width-4,"{0:{2}s}: {1:s}".format(k,str(v),max_k_width)))
    print("{}".format("#"*_width))
    del args

    print("\n############ STARTING TRAINING ###########\n")
    try:
        with tqdm.tqdm(total=train_steps, leave=True) as bar:
            callback.set_bars(bar)
            model.learn(
                    total_timesteps=train_steps,
                    callback=callback,
                    )

        if not not_save:
            model.save(experiment_folder+"/weights_final")

    except KeyboardInterrupt:
        if not not_save and input("Do you want to DELETE this experiment? (Yes/n) ") == "Yes":
            remove_experiment(experiment_folder, folder, experiment_csv, id)
        else:
            if not not_save:
                model.save(experiment_folder+"/weights_final")
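For the resume-from-weights branch above, PPO2.load accepts keyword overrides for hyperparameters stored in the checkpoint, and set_env attaches a fresh vectorized environment before training continues. A minimal sketch, assuming CartPole-v1 and a hypothetical weights_final.zip checkpoint:

import gym
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

envs = DummyVecEnv([lambda: gym.make('CartPole-v1') for _ in range(4)])

# keyword arguments passed to load() replace the values saved in the checkpoint
model = PPO2.load('weights_final.zip',
                  verbose=0,
                  max_grad_norm=100,
                  n_steps=200,
                  gamma=0.99)
model.set_env(envs)
model.learn(total_timesteps=10000)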
Example #4
resume = False

if __name__ == "__main__":
    # multiprocess environment
    # for now, it doesn't make sense to have multiple environments
    n_cpu = 8
    env = SubprocVecEnv([
        lambda: SwimmerLocomotionEnv(path=fixed_path,
                                     random_path=use_random_path,
                                     use_hard_path=False,
                                     robot_link_length=robot_link_length)
        for i in range(n_cpu)
    ])
    if resume:
        model = PPO2.load("model/reynolds_forward/reynolds_ppo_weight_99",
                          env=env,
                          gamma=gamma,
                          verbose=1,
                          tensorboard_log='./tf_logs/traj_follow/')
    else:
        model = PPO2(MlpPolicy,
                     env,
                     gamma=gamma,
                     verbose=1,
                     tensorboard_log='./tf_logs/reynolds_forward')

    for i in range(100):
        model.learn(total_timesteps=250000, reset_num_timesteps=False)
        model.save("model/reynolds_forward/reynolds_ppo_weight_" + str(i))
Example #5
def run_matchup(drafter1: str, drafter2: str, battler: str, games: int,
                seed: int, concurrency: int) \
        -> Tuple[Tuple[float, float], Tuple[list, list], Tuple[list, list], List[List[Tuple]], Tuple[list, list], List[float]]:
    """
    Run the match-up between `drafter1` and `drafter2` using `battler` battler
    :param drafter1: drafter to play as first player
    :param drafter2: drafter to play as second player
    :param battler: battler to simulate the matches
    :param games: amount of matches to simulate
    :param seed: seed used to generate the matches
    :param concurrency: amount of matches executed at the same time
    :return: a tuple containing (i) a tuple containing the win rate of the
    first and second players, (ii) a tuple containing the average mana curves
    of the first and second players, (iii) a tuple containing the
    `30 * games` individual draft choices of the first and second players;
    (iv) a tuple of 3-uples containing the card alternatives presented to the
    players at each of the `games` episodes; and (v) a tuple containing the
    `games` decks built by the first and second players.
    """
    # parse the battle agent
    battler = agents.parse_battle_agent(battler)

    # initialize envs
    env = [lambda: LOCMDraftEnv(battle_agents=(battler(), battler())) for _ in range(concurrency)]

    # wrap envs in a vectorized env
    env = DummyVecEnv(env)

    for i in range(concurrency):
        # no overlap between episodes at each process
        current_seed = seed + (games // concurrency) * i
        current_seed -= 1  # resetting the env increases the seed by 1

        # set seed to env
        env.env_method('seed', current_seed, indices=[i])

    # reset the env
    env.reset()

    # initialize first player
    if drafter1.endswith('zip'):
        current_drafter = agents.RLDraftAgent(PPO2.load(drafter1))
        current_drafter.use_history = "history" in drafter1
    else:
        current_drafter = agents.parse_draft_agent(drafter1)()

    current_drafter.seed(seed)
    current_drafter.name = drafter1
    drafter1 = current_drafter

    # initialize second player
    if drafter2.endswith('zip'):
        other_drafter = agents.RLDraftAgent(PPO2.load(drafter2))
        other_drafter.use_history = "history" in drafter2
    else:
        other_drafter = agents.parse_draft_agent(drafter2)()

    other_drafter.seed(seed)
    other_drafter.name = drafter2
    drafter2 = other_drafter

    # initialize metrics
    episodes_so_far = 0
    episode_rewards = [[0.0] for _ in range(env.num_envs)]
    drafter1.mana_curve = [0 for _ in range(13)]
    drafter2.mana_curve = [0 for _ in range(13)]
    drafter1.choices = [[] for _ in range(env.num_envs)]
    drafter2.choices = [[] for _ in range(env.num_envs)]
    drafter1.decks = [[[]] for _ in range(env.num_envs)]
    drafter2.decks = [[[]] for _ in range(env.num_envs)]
    alternatives = [[] for _ in range(env.num_envs)]

    # run the episodes
    while True:
        observations = env.get_attr('state')

        # get the current agent's action for all concurrent envs
        if isinstance(current_drafter, agents.RLDraftAgent):
            all_past_choices = env.get_attr('choices')
            new_observations = []

            for i, observation in enumerate(observations):
                new_observation = encode_state_draft(
                    observation,
                    use_history=current_drafter.use_history,
                    past_choices=all_past_choices[i][observation.current_player.id]
                )

                new_observations.append(new_observation)

            actions = current_drafter.act(new_observations)
        else:
            actions = [current_drafter.act(observation)
                       for observation in observations]

        # log chosen cards into current agent's mana curve
        for i, (action, observation) in enumerate(zip(actions, observations)):
            # get chosen index
            try:
                chosen_index = action.origin
            except AttributeError:
                chosen_index = action

            # save choice
            current_drafter.choices[i].append(chosen_index)

            # get chosen card
            chosen_card = observation.current_player.hand[chosen_index]

            # increase amount of cards chosen with the chosen card's cost
            current_drafter.mana_curve[chosen_card.cost] += 1

            # add chosen card to this episode's deck
            current_drafter.decks[i][-1].append(chosen_card.id)

            # save card alternatives
            if observation.current_player.id == PlayerOrder.FIRST:
                alternatives[i].append(tuple(map(lambda c: c.id, observation.current_player.hand)))

        # perform the action and get the outcome
        _, rewards, dones, _ = env.step(actions)

        if isinstance(current_drafter, agents.RLDraftAgent):
            current_drafter.dones = dones

        # update metrics
        for i in range(env.num_envs):
            episode_rewards[i][-1] += rewards[i]

            if dones[i]:
                episode_rewards[i].append(0.0)
                current_drafter.decks[i].append([])
                other_drafter.decks[i].append([])

                episodes_so_far += 1

        # check exiting condition
        if episodes_so_far >= games:
            break

        # swap drafters
        current_drafter, other_drafter = other_drafter, current_drafter

    # normalize mana curves
    total_choices = sum(drafter1.mana_curve)
    drafter1.mana_curve = [freq / total_choices for freq in drafter1.mana_curve]
    drafter2.mana_curve = [freq / total_choices for freq in drafter2.mana_curve]

    # join all parallel rewards
    all_rewards = [reward for rewards in episode_rewards
                   for reward in rewards[:-1]]

    # join all parallel choices
    drafter1.choices = [c for choices in drafter1.choices for c in choices]
    drafter2.choices = [c for choices in drafter2.choices for c in choices]

    # join all parallel decks
    drafter1.decks = [deck for decks in drafter1.decks for deck in decks if deck]
    drafter2.decks = [deck for decks in drafter2.decks for deck in decks if deck]

    # join all parallel alternatives
    alternatives = [turn for env in alternatives for turn in env]

    # cap any unsolicited data from additional episodes
    all_rewards = all_rewards[:games]
    drafter1.choices = drafter1.choices[:30 * games]
    drafter2.choices = drafter2.choices[:30 * games]
    drafter1.decks = drafter1.decks[:games]
    drafter2.decks = drafter2.decks[:games]
    alternatives = alternatives[:30 * games]

    # convert the list of rewards to the first player's win rate
    win_rate = (mean(all_rewards) + 1) * 50

    return (win_rate, 100 - win_rate), \
        (drafter1.mana_curve, drafter2.mana_curve), \
        (drafter1.choices, drafter2.choices), \
        alternatives, \
        (drafter1.decks, drafter2.decks), \
        all_rewards
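The per-process seeding above goes through VecEnv.env_method, which forwards a method call to selected sub-environments. A standalone sketch of that call, with CartPole-v1 assumed in place of LOCMDraftEnv:

import gym
from stable_baselines.common.vec_env import DummyVecEnv

concurrency = 4
games = 100
seed = 42

env = DummyVecEnv([lambda: gym.make('CartPole-v1') for _ in range(concurrency)])

for i in range(concurrency):
    # give each sub-env its own, non-overlapping seed range
    current_seed = seed + (games // concurrency) * i
    env.env_method('seed', current_seed, indices=[i])

env.reset()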
Example #6
envparameters = f.read()
envparameters = envparameters.strip('[')
envparameters = envparameters.strip(']')
f_list = [float(i) for i in envparameters.split(",")]
print("envparameters: " + str(f_list))

my_step_limit = int(f_list[0])
my_step_size = float(f_list[1])
my_maxspeed = float(f_list[2])

# Initialize environment with signal parameters:
env = CustomEnv(step_limit=my_step_limit,
                step_size=my_step_size,
                maxspeed=my_maxspeed)  # 0.01745*5

# Load trained model and execute it forever:
model = PPO2.load("../Models/" + filename)

while True:
    #obs = env.reset()
    obs = env.reset()
    #obs = obs.reshape((1,4))
    #print(env.observation_space.shape)
    #obs, rewards, dones, info = env.step([0,0])
    for i in range(1000000):  #my_step_limit
        action, _states = model.predict(obs)
        print(action)
        obs, rewards, dones, info = env.step(action)
        #obs = np.array(obs).reshape((1,4))
        env.renderSlow(1000)
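The snippet above parses the parameter file by stripping brackets by hand. A sketch of an alternative (not what this example uses) based on ast.literal_eval, assuming the file holds a list literal such as [1000, 0.01, 0.5] and a hypothetical file name envparameters.txt:

import ast

with open('envparameters.txt') as f:  # hypothetical file name
    f_list = [float(x) for x in ast.literal_eval(f.read().strip())]
print('envparameters:', f_list)

my_step_limit = int(f_list[0])
my_step_size = float(f_list[1])
my_maxspeed = float(f_list[2])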
Example #7
parser.add_argument('--name',
                    default=None,
                    required=True,
                    help='Name of the model (required)')
parser.add_argument(
    '--normalize',
    action='store_true',
    help='Normalize the environment for training (default: False)')
args = parser.parse_args()
name_resume = args.name
normalize = args.normalize
commands = [[1, 0], [2, 0], [3, 0]]
if name_resume is not None:

    model = PPO2.load(workDirectory + "/resultats/" + name_resume + "/" +
                      name_resume + ".zip")

env = DummyVecEnv(
    [lambda: e.AidaBulletEnv(
        commands,
        render=False,
        on_rack=False,
    )])
if normalize:
    env = VecNormalize(env,
                       clip_obs=1000.0,
                       clip_reward=1000.0,
                       training=False)
    env.load_running_average(workDirectory + "/resultats/" + name_resume +
                             "/normalizeData")
Example #8
env = MultiAgentSelectObservation(env, DISTRICTS_GROUP_IDS, maac=True)
env = MultiAgentSelectAction(env, DISTRICTS_GROUP_IDS, 1)

no_closures = [1] * n_weeks
weekends = False
(baseline_pd, baseline_ar, inf) = run_model(env.unwrapped._model, n_weeks,
                                            weekends, args.district_name,
                                            no_closures)

ppo_paths = []
for p in args.paths:
    ppo_paths.append(p)

models = {}
# Order of paths and DISTRICTS_GROUP are assumed to be the same.
for district_name, p in zip(DISTRICTS_GROUP, ppo_paths):
    model = PPO2.load(p / "params.zip")
    models[district_name] = model

# Check that we have all the districts
if set(models.keys()) != set(DISTRICTS_GROUP):
    print("set(models.keys())" + str(set(models.keys())))
    print("set(DISTRICTS_GROUP)" + str(set(DISTRICTS_GROUP)))
assert set(models.keys()) == set(DISTRICTS_GROUP)

print("ar-improvement")
for run in range(args.runs):
    attack_rate = evaluate(env, models, DISTRICTS_GROUP_IDS, n_weeks)
    print(baseline_ar - attack_rate)
env.close()
Example #9

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", required=True, help="model path")
    parser.add_argument("--time_steps", required=True, help="time steps")

    args = vars(parser.parse_args())
    time_steps = int(args["time_steps"])
    model_path = str(args["model_path"])

    policy_path = os.path.join(
        model_path, "model_" + str(time_steps) + "_steps"
    )

    model = PPO2.load(policy_path)

    # define a method for the policy fn of your trained model
    def policy_fn(obs):
        return model.predict(obs, deterministic=True)[0]

    # we create the same env as we used for training in train_pushing_ppo.py,
    # such that action and observation space remain coherent with the policy.
    # however, unlike during training, we set the initialization to the
    # same as in the standard CubeEnv, since this is what the policy will be
    # evaluated on eventually.
    initializer = cube_env.RandomInitializer(difficulty=2) # difficulty one means pushing
    env = ExamplePushingTrainingEnv(initializer=initializer, 
                                    frameskip=3, 
                                    visualization=True)
    env = FlatObservationWrapper(env)
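The policy_fn wrapper above simply exposes the deterministic action of the loaded model. A minimal sketch of the same wrapper driving a rollout, with CartPole-v1 assumed instead of the cube-pushing environment:

import gym
from stable_baselines import PPO2

model = PPO2('MlpPolicy', 'CartPole-v1', verbose=0)  # stand-in for PPO2.load(policy_path)

def policy_fn(obs):
    # deterministic=True picks the mode of the action distribution
    return model.predict(obs, deterministic=True)[0]

env = gym.make('CartPole-v1')
obs = env.reset()
for _ in range(200):
    obs, reward, done, info = env.step(policy_fn(obs))
    if done:
        obs = env.reset()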
Example #10
from stable_baselines import PPO2

import argparse
import numpy as np

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('model_zip')
    args = parser.parse_args()

    env = gym.make('simpleEnv:simpleEnv-v0')
    vec_env = DummyVecEnv([lambda: env])

    # model =PPO2.load('simpleEnv-full5x5', vec_env, verbose=0, tensorboard_log='learning-ppo')
    model = PPO2.load(args.model_zip,
                      vec_env,
                      verbose=0,
                      tensorboard_log='learning-ppo')

    obs = env.reset()
    rewards = []
    for i in range(1000):
        env.render(rewards=rewards)

        if i == 20:
            env.reference_trajectory = np.random.normal(
                1, 1, env.obs_dimension)

        action, _ = model.predict(obs)
        s, r, _, _ = env.step(action)
        rewards.append(r)
Example #11
    color = "#56EEF4"
    headType = 'silly'
    tailType = 'sharp'

    return start_response(color, headType, tailType)


BOARD_WIDTH = 11
BOARD_HEIGHT = 11

NUM_ENVS = 1
NUM_LAYERS = 6
LAYER_WIDTH = 39
LAYER_HEIGHT = 39

model = PPO2.load('/snake/model.pkl')

def prepareObservations(you, snakes, food, orientation):
  head = you['body'][0]
  hx = head['x']
  hy = head['y']
  yourLength = len(you['body'])

  observations = [0] * NUM_ENVS * LAYER_HEIGHT * LAYER_WIDTH * NUM_LAYERS
  def assign(point, layer, value):
      x = point['x']
      y = point['y']
      x = (x - hx) * (-1 if orientation & 1 != 0 else 1)
      y = (y - hy) * (-1 if orientation & 2 != 0 else 1)
      x += LAYER_WIDTH / 2
      y += LAYER_HEIGHT / 2
Example #12
import time
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.gail import ExpertDataset, generate_expert_traj
from baselines.common.atari_wrappers import *

env = gym.make('BowlingNoFrameskip-v0')
env = MaxAndSkipEnv(env, skip=4)
env = WarpFrame(env)
env = DummyVecEnv([lambda: env])

dataset = ExpertDataset(expert_path='bowling_demo.npz', verbose=1)

#model = PPO2('CnnPolicy', env, verbose=1)

model = PPO2.load('bowling_model', env=env)

#model.pretrain(dataset, n_epochs=1000)

model.learn(total_timesteps=256000)

model.save('bowling_model')

state = env.reset()
total_reward = 0

while True:
    env.render()
    time.sleep(1 / 60)

    action, _ = model.predict(state)
Example #13
def main():
    args = arg_parser()

    # Create log dir
    tensorboard_log_dir = "./tensorboard_log/"
    os.makedirs(tensorboard_log_dir,exist_ok=True)

    # Create result tmp dir
    figdir = "./fig/"
    os.makedirs(figdir,exist_ok=True)

    # Create ndarray save dir
    nd_dir = "./data_each_term_of_rewardfunction_100episodes/" + str(args.agent) + "/"
    os.makedirs(nd_dir, exist_ok=True)

    # Create and wrap the environment 
    env1 = gym.make(config['env'])
    broken_env = ChangeJointRangeEnv(env1)
    #env1 = NormalEnv(env1) # reward custom

    if args.video:
        broken_env = wrappers.Monitor(broken_env,'./videos/' + args.loaddir + "-" + datetime.datetime.now().isoformat(),force=True,video_callable=(lambda ep: ep % 1 == 0)) # for output video

    # broken_env = DummyVecEnv([lambda :broken_env])  # create a simple vectorized wrapper for multiple environments and call each environment sequentially in the current Python process
    env1 = DummyVecEnv([lambda : env1])

    # when the agent name is passed in via argparse
    agentName = []
    agentName.append(args.agent)


    plainData = []
    brokenData = []
    perror = []
    berror = []

    plt.figure()
    sns.set()
    # fig,ax = plt.subplots()
    for agent in agentName:
        brokenSeedAveReward = []

        if "Curriculum" in agent:
            # ロードディレクトリの指定
            load_dir = "./ISIS2020/trained_Curriculum/" + agent +"/"
        else:
            # ロードディレクトリの指定
            load_dir = "./ISIS2020/trained_agent_dir/" + agent +"/"

        # compute the average reward for each seed, range(1,6)
        for seed in range(1, 6):

            if "range09-16million" in agent and seed >= 4:
                continue

            # create the PPO2 model (the agent to be trained)
            trainedAnt = PPO2(MlpPolicy, env1, verbose=1, tensorboard_log=tensorboard_log_dir)

            # load the saved (trained) model: give the path and zip file name (without extension), per seed
            trainedAnt = PPO2.load(load_dir + "trainedAnt" + "-seed" + str(seed))

            # set the seed
            trainedAnt.set_random_seed(seed + 100)

            print("loaddir:", load_dir + "trainedAnt" + "-seed" + str(seed))

            broken_obs = broken_env.reset()

            broken_total_rewards = [] 
            rewards = 0
            forwards = 0
            ctrls = 0
            contacts = 0
            survives = 0

            # vary k from 0 to 1 in steps of 0.01
            for k in tqdm(range(0, 100)):
                # loop over 100 episodes to compute the reward in the broken (faulty) environment
                for episode in range(100):
                    # iteration of time steps, default is 1000 time steps
                    for i in range(1000):
                        # predict phase
                        action, _states = trainedAnt.predict(broken_obs)

                        # step phase
                        # when evaluating in the broken environment
                        broken_obs, reward, done, info = broken_env.step(action, k)
                        rewards += reward
                        forwards += info['reward_forward']
                        ctrls += info['reward_ctrl']
                        contacts += info['reward_contact']
                        survives += info['reward_survive']

                        if done:
                            break

                    # store the reward obtained at this k in k_gene
                    k_gene[seed-1][episode][k] = rewards

                    # store the value of each term of the reward function
                    reward_forward_map[seed-1][episode][k] = forwards
                    reward_ctrl_map[seed-1][episode][k] = ctrls
                    reward_contact_map[seed-1][episode][k] = contacts
                    reward_survive_map[seed-1][episode][k] = survives

                    # reset the environment
                    broken_obs = broken_env.reset()

                    # record the total reward and reset the accumulators
                    broken_total_rewards.append(rewards)
                    rewards = 0
                    forwards = 0
                    ctrls = 0
                    contacts = 0
                    survives = 0

            broken_reward_average1 = sum(broken_total_rewards)/len(broken_total_rewards)
            brokenSeedAveReward.append(broken_reward_average1)

            del trainedAnt 
        
        # the agent's average rewards in the plain/broken environments are stored here
        broken_ave = sum(brokenSeedAveReward)/len(brokenSeedAveReward)
        brokenData.append(broken_ave)
        broken_error = np.std(brokenSeedAveReward,ddof=1)/np.sqrt(len(brokenSeedAveReward))
        berror.append(broken_error)

    brokenData = np.array(brokenData).flatten()
    berror = np.array(berror)

    

    # print(k_gene)
    for seed in range(1, 6):
        seed_gene = k_gene[seed-1,:,:]
        seed_gene = np.sum(seed_gene, axis=0)
        seed_gene = seed_gene / 100  # average reward
        np.save(nd_dir + str(agentName[0]) + "_rewardForEachK" + "_seed=" + str(seed), seed_gene)

    # reshape the 2-D array of each reward-function term into a 1-D array and save it as .npy
    save_reward_map(reward_forward_map, nd_dir, str(agentName[0]), "_rewardForward")
    save_reward_map(reward_ctrl_map, nd_dir, str(agentName[0]), "_rewardCtrl")
    save_reward_map(reward_contact_map, nd_dir, str(agentName[0]), "_rewardContact")
    save_reward_map(reward_survive_map, nd_dir, str(agentName[0]), "_rewardSurvive")
Example #14
import gym
import pandas as pd
from qtrade_env import QtradeEnv
root_dir = '/Users/liuyehong/Dropbox/CICC/Algorithm_Trading/Platform2/OHLC/data/1Min/'
import pickle


from stable_baselines.common.policies import MlpPolicy, CnnPolicy, LstmPolicy, CnnLstmPolicy, MlpLstmPolicy, MlpLnLstmPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2


# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: QtradeEnv()])

model = PPO2(MlpLnLstmPolicy, env, verbose=1, nminibatches=1)
model.learn(total_timesteps=50000)
model.save('ppo2_mlplnlstm')

del model
model = PPO2.load('ppo2_mlplnlstm', env=env)

obs = env.reset()
for i in range(2000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
Example #15
from stable_baselines import TRPO
from stable_baselines import PPO2
from snake_env.reynolds_swimmer_forward_vel import SwimmerLocomotionEnv
import numpy as np

fixed_path = [(-0.2 * i, 0) for i in range(30)]

use_random_path = True
robot_k = 1.0
robot_link_length = 0.3

model = PPO2.load("model/reynolds_forward/reynolds_ppo_weight_24")
env = SwimmerLocomotionEnv(path=fixed_path,
                           random_path=use_random_path,
                           use_hard_path=False,
                           robot_link_length=robot_link_length,
                           robot_k=robot_k,
                           record_trajectory=True)

obs = env.reset()
total_reward = 0
x_list = []
for i in range(10000):
    action, _states = model.predict(obs)
    #step_time = 0.5
    #action = [-0.8*np.sin(step_time*i), 0.8*np.cos(step_time*i)]
    # print("start of step")
    #print(action)
    x_list.append(action[1])
    obs, rewards, dones, info = env.step(action)
    # print(obs)
Example #16
    def __init__(self,
                 obs_shape,
                 action_space,
                 base=None,
                 base_kwargs=None,
                 load_expert=None,
                 env_name=None,
                 rl_baseline_zoo_dir=None,
                 expert_algo=None,
                 normalize=True):
        super(Policy, self).__init__()

        #TODO: Pass these parameters in
        self.epsilon = 0.1
        self.dril = True

        if base_kwargs is None:
            base_kwargs = {}
        if base is None:
            if env_name in ['duckietown']:
                base = DuckieTownCNN
            elif len(obs_shape) == 3:
                print('CNN base check passed')
                base = CNNBase
            elif len(obs_shape) == 1:
                base = MLPBase
            else:
                raise NotImplementedError

        self.base = base(obs_shape[0], normalize=normalize, **base_kwargs)
        self.action_space = None
        if action_space.__class__.__name__ == "Discrete":
            num_outputs = action_space.n
            self.dist = Categorical(self.base.output_size, num_outputs)
            self.action_space = "Discrete"
        elif action_space.__class__.__name__ == "Box":
            num_outputs = action_space.shape[0]
            self.dist = DiagGaussian(self.base.output_size, num_outputs)
            self.action_space = "Box"
        elif action_space.__class__.__name__ == "MultiBinary":
            raise Exception('Error')
        else:
            raise NotImplementedError

        if load_expert == True and env_name not in [
                'duckietown', 'highway-v0'
        ]:
            print('[Loading Expert --- Base]')
            model_path = os.path.join(rl_baseline_zoo_dir, 'trained_agents',
                                      f'{expert_algo}')
            try:
                import mpi4py
                from stable_baselines import TRPO
            except ImportError:
                mpi4py = None
                DDPG, TRPO = None, None

            from stable_baselines import PPO2

            model_path = f'{model_path}/{env_name}.pkl'
            if env_name in ['AntBulletEnv-v0']:
                baselines_model = TRPO.load(model_path)
            else:
                baselines_model = PPO2.load(model_path)
            for key, value in baselines_model.get_parameters().items():
                print(key, value.shape)

            if base.__name__ == 'CNNBase':
                print(['Loading CNNBase expert model'])
                params = copy_cnn_weights(baselines_model)
            elif load_expert == True and base.__name__ == 'MLPBase':
                print(['Loading MLPBase expert model'])
                params = copy_mlp_weights(baselines_model)

            #TODO: I am not sure what this is doing
            try:
                self.load_state_dict(params)
                self.obs_shape = obs_shape[0]
            except:
                self.base = base(obs_shape[0] + 1, **base_kwargs)
                self.load_state_dict(params)
                self.obs_shape = obs_shape[0] + 1
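The weight-copying branch above iterates over get_parameters(), which returns an ordered mapping from TensorFlow variable names to numpy arrays. A small sketch of that inspection step, assuming a freshly built CartPole-v1 model rather than an rl-baselines-zoo checkpoint:

from stable_baselines import PPO2

baselines_model = PPO2('MlpPolicy', 'CartPole-v1', verbose=0)

# print every parameter name and its shape, as the loop above does
for key, value in baselines_model.get_parameters().items():
    print(key, value.shape)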
Example #17
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: NegativeRewardEnv()])

model = PPO2(get_policy(policy),
             env,
             verbose=0,
             nminibatches=1,
             tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=25000, tb_log_name='PPO2' + model_tag)

model.save(model_folder + "PPO2" + model_tag)
del model
model = PPO2.load(model_folder + "PPO2" + model_tag)

done = False
states = None
obs = env.reset()

while not done:
    action, states = model.predict(obs, states)
    obs, _, done, info = env.step(action)
    env.render()
Example #18
import gym

import time

env = gym.make('InvertedPendulum-v0')

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import PPO2
if __name__ == '__main__':
    # multiprocess environment
    n_cpu = 4

    env = SubprocVecEnv([lambda: env for i in range(n_cpu)])
    model = PPO2.load("PPO2_cartpole_tensorboard/ppo2_cartpole_5")
    print(env)
    # Enjoy trained agent
    obs = env.reset()
    cumul = 0

    # Passing state=None to the predict function means
    # it is the initial state
    state = None
    # When using VecEnv, done is a vector
    done = [False for _ in range(env.num_envs)]
    while True:
        a = time.time()
        action, states = model.predict(obs, state=state, mask=done)
        # print(time.time()-a)
        obs, rewards, dones, info = env.step(action)
        cumul = rewards[0] + cumul
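As the comments above point out, recurrent policies need the previous LSTM state and the episode done mask fed back into predict on every step. A compact sketch of that loop, assuming CartPole-v1 sub-environments and a freshly built MlpLstmPolicy model instead of the saved checkpoint:

import gym
from stable_baselines import PPO2
from stable_baselines.common.policies import MlpLstmPolicy
from stable_baselines.common.vec_env import DummyVecEnv

n_envs = 4
env = DummyVecEnv([lambda: gym.make('CartPole-v1') for _ in range(n_envs)])
# for recurrent policies, the number of envs must be a multiple of nminibatches
model = PPO2(MlpLstmPolicy, env, nminibatches=1, verbose=0)

obs = env.reset()
state = None                                 # None means "start from the initial LSTM state"
done = [False for _ in range(env.num_envs)]  # mask that resets the LSTM state per sub-env

for _ in range(100):
    action, state = model.predict(obs, state=state, mask=done)
    obs, rewards, done, info = env.step(action)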
Example #19
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: BaseEnv(map_name='map1')])

model = PPO2(get_policy(policy),
             env,
             verbose=0,
             nminibatches=1,
             tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=2500000, tb_log_name='PPO2_map1' + model_tag)

model.save(model_folder + "PPO2_map1" + model_tag)
del model
model = PPO2.load(model_folder + "PPO2_map1" + model_tag)

done = False
states = None
obs = env.reset()

while not done:
    action, states = model.predict(obs, states)
    obs, _, done, info = env.step(action)
    env.render()
Example #20
model_params = {
    'n_steps': int(params['n_steps']),
    'gamma': params['gamma'],
    'learning_rate': params['learning_rate'],
    'ent_coef': params['ent_coef'],
    'cliprange': params['cliprange'],
    'noptepochs': int(params['noptepochs']),
    'lam': params['lam'],
}

if curr_idx == -1:
    model = PPO2(MlpLnLstmPolicy, train_env, verbose=0, nminibatches=1,
            tensorboard_log="./tensorboard", **model_params)
else:
    model = PPO2.load('./agents/ppo2_' + reward_strategy + '_' + str(curr_idx) + '.pkl', env=train_env)

for idx in range(curr_idx + 1, 10):
    print('[', idx, '] Training for: ', train_len, ' time steps')

    model.learn(total_timesteps=train_len)

    obs = test_env.reset()
    done, reward_sum = False, 0

    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, info = test_env.step(action)
        reward_sum += reward

    print('[', idx, '] Total reward: ', reward_sum, ' (' + reward_strategy + ')')
Example #21
from stable_baselines import PPO2


def env_create():
    env = ClientDapr("ActorUnity")
    env.create("CartPole-v1")

    print(f"[Client] Created Actor {env.actor_id}", flush=True)

    return env


print("===============================================")
print("INFERING")
print("===============================================")
model = PPO2.load("baselines_ppo_cartpole")
env_local = env_create()

# Start monitoring
print("[Client] Starting to monitor", flush=True)
env_local.monitor_start(1)

# Run Experiment
obs = env_local.reset()
is_done = False

while not (is_done):
    action, _states = model.predict(obs)
    obs, rewards, is_done, info = env_local.step(action)

# Stop Monitoring
Example #22
    if mi == 0:  # base policy
        model = PPO2('MlpPolicy',
                     env,
                     tensorboard_log='./' + model_names[mi] + '_tb/')

        print('Learning Base PPO2 model:', model_names[mi])
        # learning
        model.learn(total_timesteps=BASE_TRAIN_STEPS,
                    tb_log_name=model_names[mi])
        model.save(model_names[mi])

    else:
        print('Learning PPO2 model:', model_names[mi])
        model = PPO2.load(model_names[0],
                          env=env,
                          tensorboard_log='./' + model_names[0] + '_tb/')
        model.learn(total_timesteps=TOTAL_TRAIN_STEPS - BASE_TRAIN_STEPS,
                    tb_log_name=model_names[mi],
                    reset_num_timesteps=False)
        model.save(model_names[mi])

    total_rewards = 0.

    #-------- run the model -------#
    for e in range(NUM_EPISODES):
        obs = env.reset()
        # env.reset()
        epi_rewards = 0.

        for i in range(1000):
Example #23
def cb(a, b):
    global last_time
    t = datetime.now().timestamp()
    if t - last_time > 60:
        last_time = t
        print("SAVING===" * 10)
        model.save(net_name)

# multiprocess environment
env = make_vec_env('MinitaurBulletEnv-v0', n_envs=4)
# env = gym.make('MinitaurBulletEnv-v0', render=True)

try:
    model = PPO2.load(
        net_name,
        policy_kwargs=policy_kwargs,
        env=env
    )
except ValueError:
    model = PPO2(
        MlpPolicy, 
        env, 
        verbose=1,
        tensorboard_log='./tensorboard',
    )

while True:
    model.learn(
        total_timesteps=2000000,
        callback=cb,
        tb_log_name=net_name
Example #24
# This is an example of a single-episode rollout, using the QVM trained agent
# on the MaxCut test set.

import os

import gym
import gym_forest
from stable_baselines import PPO2

MODEL_FILE = os.path.join(os.path.dirname(__file__), '..', 'models', 'qvm.p')
ENV_NAME = 'forest-maxcut-test-v0'
MAX_STEPS = 25

env = gym.make(ENV_NAME)
agent = PPO2.load(MODEL_FILE)

obs = env.reset()
best_reward = 0
eps_reward = 0
for i in range(MAX_STEPS):
    action, _ = agent.predict(obs)
    obs, reward, done, info = env.step(action)
    eps_reward += reward
    if done:
        # early termination returns the remaining episode reward,
        # assuming that we do just as well on the remaining steps
        # here we get the corresponding single-step reward
        single_step_reward = reward / (MAX_STEPS - i)
        print('[{}]\t {}\t reward {:.3f}'.format(i, info['instr'],
                                                 single_step_reward))
Example #25
    :param seed: (int) the inital seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = T4HistoryEnv(dir)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init

dir = '/home/dan/serpent/market_history/sp500'

# env = T4HistoryEnv(dir, continuous_action=True)
# env2 = T4HistoryEnv(dir)
# file_count = len(env.files)
# print(len(env.files))

# env = DummyVecEnv([lambda: env])
# env = Monitor(env, log_dir, allow_early_resets=True)
# env2 = Monitor(env2, log_dir, allow_early_resets=True)\
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)
# env = DummyVecEnv([lambda: env])

env = SubprocVecEnv([make_env('', i) for i in range(8)])
# model = PPO2('CnnPolicy', env, verbose=1, tensorboard_log="./tmp/gym/board/")
model = PPO2.load('sp500_ppo2_pretrain')
model.set_env(env)
model.learn(total_timesteps=int(10e6), log_interval=10, callback=callback)

model.save('sp500_ppo2_pretrain')
Example #26
    import argparse
    import numpy as np
    from stable_baselines import PPO2, logger
    from stable_baselines.common.cmd_util import make_atari_env

    parser = argparse.ArgumentParser()
    parser.add_argument('expert', type=str, help='Expert path (*.zip)')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed for env.')
    parser.add_argument('--note',
                        type=str,
                        default='test',
                        help='Logging directory')
    parser.add_argument('--env',
                        type=str,
                        default='PongNoFrameskip-v4',
                        help='Environment ID')
    args = parser.parse_args()

    logdir = os.path.join('logs', args.env, args.note)
    logger.configure(logdir)
    logger.info(args)

    env = VecFrameStack(make_atari_env(args.env, 1, args.seed), 4)
    model = PPO2.load(args.expert)
    generate_expert_traj(model,
                         save_path=os.path.join(logdir, 'expert'),
                         env=env)
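generate_expert_traj writes an .npz archive of (observation, action) pairs that can be fed back through ExpertDataset for behavioural cloning. A short sketch of that round trip, with CartPole-v1 and the file name expert_cartpole assumed:

from stable_baselines import PPO2
from stable_baselines.gail import ExpertDataset, generate_expert_traj

# train a quick "expert" on CartPole as a stand-in for the loaded Atari expert
expert = PPO2('MlpPolicy', 'CartPole-v1', verbose=0)
expert.learn(total_timesteps=10000)

# roll out 10 episodes and store them in expert_cartpole.npz
generate_expert_traj(expert, save_path='expert_cartpole', n_episodes=10)

# behavioural cloning: pre-train a fresh model on the recorded trajectories
dataset = ExpertDataset(expert_path='expert_cartpole.npz', verbose=1)
student = PPO2('MlpPolicy', 'CartPole-v1', verbose=0)
student.pretrain(dataset, n_epochs=100)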
Example #27
def main(args):
    envconfig_string = args.envconfig
    custom_envconfig = _preprocess_custom_envconfig(
        args.envconfig) if args.envconfig is not None else {}
    env_id = 'gym_auv:' + args.env
    env_name = env_id.split(':')[-1] if ':' in env_id else env_id
    envconfig = gym_auv.SCENARIOS[env_name][
        'config'] if env_name in gym_auv.SCENARIOS else {}
    envconfig.update(custom_envconfig)

    NUM_CPU = 8
    EXPERIMENT_ID = str(int(time())) + args.algo.lower()
    model = {
        'ppo': PPO2,
        'ddpg': DDPG,
        'td3': TD3,
        'a2c': A2C,
        'acer': ACER,
        'acktr': ACKTR
    }[args.algo.lower()]

    if args.mode == 'play':
        agent = model.load(args.agent) if args.agent is not None else None
        envconfig_play = envconfig.copy()
        envconfig_play['show_indicators'] = True
        #envconfig_play['autocamera3d'] = False
        env = create_env(env_id,
                         envconfig_play,
                         test_mode=True,
                         render_mode=args.render,
                         pilot=args.pilot,
                         verbose=True)
        print('Created environment instance')

        if args.scenario:
            env.load(args.scenario)
        vec_env = DummyVecEnv([lambda: env])
        recorded_env = VecVideoRecorder(
            vec_env,
            args.video_dir,
            record_video_trigger=lambda x: x == 0,
            video_length=args.recording_length,
            name_prefix=(args.env
                         if args.video_name == 'auto' else args.video_name))
        print(args.video_dir, args.video_name)
        play_scenario(env, recorded_env, args, agent=agent)
        recorded_env.env.close()

    elif (args.mode == 'enjoy'):
        agent = model.load(args.agent)
        # params = agent.get_parameters()
        # policy_weights = [
        #     params['model/pi_fc0/w:0'],
        #     params['model/pi_fc1/w:0'],
        #     params['model/pi/w:0']
        # ]
        # policy_biases = [
        #     params['model/pi_fc0/b:0'],
        #     params['model/pi_fc1/b:0'],
        #     params['model/pi/b:0']
        # ]
        # for param in params:
        #     print(param, params[param].shape)

        video_folder = os.path.join(DIR_PATH, 'logs', 'videos', args.env,
                                    EXPERIMENT_ID)
        os.makedirs(video_folder, exist_ok=True)

        env = create_env(env_id,
                         envconfig,
                         test_mode=True,
                         render_mode=args.render,
                         pilot=args.pilot)
        if args.scenario:
            env.load(args.scenario)
        vec_env = DummyVecEnv([lambda: env])
        recorded_env = VecVideoRecorder(
            vec_env,
            video_folder,
            record_video_trigger=lambda x: x == 0,
            video_length=args.recording_length,
            name_prefix=(args.env
                         if args.video_name == 'auto' else args.video_name))
        obs = recorded_env.reset()
        state = None
        done = [False for _ in range(vec_env.num_envs)]
        for t_step in range(args.recording_length):
            if args.recurrent:
                action, _states = agent.predict(
                    observation=obs,
                    state=state,
                    mask=done,
                    deterministic=not args.stochastic)
                state = _states
            else:
                action, _states = agent.predict(
                    obs, deterministic=not args.stochastic)
            obs, reward, done, info = recorded_env.step(action)
            recorded_env.render()
            if args.env == 'PathGeneration-v0':
                sleep(1)
        recorded_env.env.close()

    elif (args.mode == 'train'):
        figure_folder = os.path.join(DIR_PATH, 'logs', 'figures', args.env,
                                     EXPERIMENT_ID)
        os.makedirs(figure_folder, exist_ok=True)
        scenario_folder = os.path.join(figure_folder, 'scenarios')
        os.makedirs(scenario_folder, exist_ok=True)
        video_folder = os.path.join(DIR_PATH, 'logs', 'videos', args.env,
                                    EXPERIMENT_ID)
        recording_length = 8000
        os.makedirs(video_folder, exist_ok=True)
        agent_folder = os.path.join(DIR_PATH, 'logs', 'agents', args.env,
                                    EXPERIMENT_ID)
        os.makedirs(agent_folder, exist_ok=True)
        tensorboard_log = os.path.join(DIR_PATH, 'logs', 'tensorboard',
                                       args.env, EXPERIMENT_ID)
        tensorboard_port = 6006

        if (args.nomp or model == DDPG or model == TD3):
            num_cpu = 1
            vec_env = DummyVecEnv(
                [lambda: create_env(env_id, envconfig, pilot=args.pilot)])
        else:
            num_cpu = NUM_CPU
            vec_env = SubprocVecEnv([
                make_mp_env(env_id, i, envconfig, pilot=args.pilot)
                for i in range(num_cpu)
            ])

        if (args.agent is not None):
            agent = model.load(args.agent)
            agent.set_env(vec_env)
        else:
            if (model == PPO2):
                if args.recurrent:
                    hyperparams = {
                        # 'n_steps': 1024,
                        # 'nminibatches': 32,
                        # 'lam': 0.95,
                        # 'gamma': 0.99,
                        # 'noptepochs': 10,
                        # 'ent_coef': 0.0,
                        # 'learning_rate': 0.0003,
                        # 'cliprange': 0.2,
                        'n_steps': 1024,
                        'nminibatches': 1,
                        'lam': 0.98,
                        'gamma': 0.999,
                        'noptepochs': 4,
                        'ent_coef': 0.01,
                        'learning_rate': 2e-3,
                    }

                    class CustomLSTMPolicy(MlpLstmPolicy):
                        def __init__(self,
                                     sess,
                                     ob_space,
                                     ac_space,
                                     n_env,
                                     n_steps,
                                     n_batch,
                                     n_lstm=256,
                                     reuse=False,
                                     **_kwargs):
                            super().__init__(sess,
                                             ob_space,
                                             ac_space,
                                             n_env,
                                             n_steps,
                                             n_batch,
                                             n_lstm,
                                             reuse,
                                             net_arch=[
                                                 256, 256, 'lstm',
                                                 dict(vf=[64], pi=[64])
                                             ],
                                             **_kwargs)

                    agent = PPO2(CustomLSTMPolicy,
                                 vec_env,
                                 verbose=True,
                                 tensorboard_log=tensorboard_log,
                                 **hyperparams)
                else:
                    hyperparams = {
                        # 'n_steps': 1024,
                        # 'nminibatches': 32,
                        # 'lam': 0.95,
                        # 'gamma': 0.99,
                        # 'noptepochs': 10,
                        # 'ent_coef': 0.0,
                        # 'learning_rate': 0.0003,
                        # 'cliprange': 0.2,
                        'n_steps': 1024,
                        'nminibatches': 32,
                        'lam': 0.98,
                        'gamma': 0.999,
                        'noptepochs': 4,
                        'ent_coef': 0.01,
                        'learning_rate': 2e-4,
                    }
                    #policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[64, 64, 64])
                    #policy_kwargs = dict(net_arch=[64, 64, 64])
                    #layers = [256, 128, 64]
                    layers = [64, 64]
                    policy_kwargs = dict(net_arch=[dict(vf=layers, pi=layers)])
                    agent = PPO2(MlpPolicy,
                                 vec_env,
                                 verbose=True,
                                 tensorboard_log=tensorboard_log,
                                 **hyperparams,
                                 policy_kwargs=policy_kwargs)
            elif (model == DDPG):
                hyperparams = {
                    'memory_limit':
                    1000000,
                    'normalize_observations':
                    True,
                    'normalize_returns':
                    False,
                    'gamma':
                    0.98,
                    'actor_lr':
                    0.00156,
                    'critic_lr':
                    0.00156,
                    'batch_size':
                    256,
                    'param_noise':
                    AdaptiveParamNoiseSpec(initial_stddev=0.287,
                                           desired_action_stddev=0.287)
                }
                agent = DDPG(LnMlpPolicy,
                             vec_env,
                             verbose=True,
                             tensorboard_log=tensorboard_log,
                             **hyperparams)
            elif (model == TD3):
                action_noise = NormalActionNoise(mean=np.zeros(2),
                                                 sigma=0.1 * np.ones(2))
                agent = TD3(stable_baselines.td3.MlpPolicy,
                            vec_env,
                            verbose=True,
                            tensorboard_log=tensorboard_log,
                            action_noise=action_noise)
            elif model == A2C:
                hyperparams = {
                    'n_steps': 5,
                    'gamma': 0.995,
                    'ent_coef': 0.00001,
                    'learning_rate': 2e-4,
                }
                layers = [64, 64]
                policy_kwargs = dict(net_arch=[dict(vf=layers, pi=layers)])
                agent = A2C(MlpPolicy,
                            vec_env,
                            verbose=True,
                            tensorboard_log=tensorboard_log,
                            **hyperparams,
                            policy_kwargs=policy_kwargs)
            elif model == ACER:
                agent = ACER(MlpPolicy,
                             vec_env,
                             verbose=True,
                             tensorboard_log=tensorboard_log)
            elif model == ACKTR:
                agent = ACKTR(MlpPolicy,
                              vec_env,
                              verbose=True,
                              tensorboard_log=tensorboard_log)

        print('Training {} agent on "{}"'.format(args.algo.upper(), env_id))

        n_updates = 0
        n_episodes = 0

        def callback(_locals, _globals):
            nonlocal n_updates
            nonlocal n_episodes

            sys.stdout.write('Training update: {}\r'.format(n_updates))
            sys.stdout.flush()

            _self = _locals['self']
            vec_env = _self.get_env()

            class Struct(object):
                pass

            report_env = Struct()
            report_env.history = []
            report_env.config = envconfig
            report_env.nsensors = report_env.config[
                "n_sensors_per_sector"] * report_env.config["n_sectors"]
            report_env.sensor_angle = 2 * np.pi / (report_env.nsensors + 1)
            report_env.last_episode = vec_env.get_attr('last_episode')[0]
            report_env.config = vec_env.get_attr('config')[0]
            report_env.obstacles = vec_env.get_attr('obstacles')[0]

            env_histories = vec_env.get_attr('history')
            for episode in range(max(map(len, env_histories))):
                for env_idx in range(len(env_histories)):
                    if (episode < len(env_histories[env_idx])):
                        report_env.history.append(
                            env_histories[env_idx][episode])
            report_env.episode = len(report_env.history) + 1

            total_t_steps = _self.get_env().get_attr(
                'total_t_steps')[0] * num_cpu
            agent_filepath = os.path.join(agent_folder,
                                          str(total_t_steps) + '.pkl')

            if model == PPO2:
                recording_criteria = n_updates % 70 == 0
                report_criteria = True
                _self.save(agent_filepath)
            elif model == A2C or model == ACER or model == ACKTR:
                save_criteria = n_updates % 100 == 0
                recording_criteria = n_updates % 1000 == 0
                report_criteria = True
                if save_criteria:
                    _self.save(agent_filepath)
            elif model == DDPG or model == TD3:
                save_criteria = n_updates % 10000 == 0
                recording_criteria = n_updates % 50000 == 0
                report_criteria = report_env.episode > n_episodes
                if save_criteria:
                    _self.save(agent_filepath)

            if report_env.last_episode is not None and len(
                    report_env.history) > 0 and report_criteria:
                try:
                    #gym_auv.reporting.plot_trajectory(report_env, fig_dir=scenario_folder, fig_prefix=args.env + '_ep_{}'.format(report_env.episode))
                    gym_auv.reporting.report(report_env,
                                             report_dir=figure_folder)
                    #vec_env.env_method('save', os.path.join(scenario_folder, '_ep_{}'.format(report_env.episode)))
                except OSError as e:
                    print("Ignoring reporting OSError:")
                    print(repr(e))

            if recording_criteria:
                if args.pilot:
                    cmd = 'python run.py enjoy {} --agent "{}" --video-dir "{}" --video-name "{}" --recording-length {} --algo {} --pilot {} --envconfig {}{}'.format(
                        args.env, agent_filepath, video_folder,
                        args.env + '-' + str(total_t_steps), recording_length,
                        args.algo, args.pilot, envconfig_string,
                        ' --recurrent' if args.recurrent else '')
                else:
                    cmd = 'python run.py enjoy {} --agent "{}" --video-dir "{}" --video-name "{}" --recording-length {} --algo {} --envconfig {}{}'.format(
                        args.env, agent_filepath, video_folder,
                        args.env + '-' + str(total_t_steps), recording_length,
                        args.algo, envconfig_string,
                        ' --recurrent' if args.recurrent else '')
                subprocess.Popen(cmd)

            n_episodes = report_env.episode
            n_updates += 1

        agent.learn(
            total_timesteps=1500000000000000000000000000000000000000000,
            tb_log_name='log',
            callback=callback)

    elif (args.mode in ['policyplot', 'vectorfieldplot', 'streamlinesplot']):
        figure_folder = os.path.join(DIR_PATH, 'logs', 'plots', args.env,
                                     EXPERIMENT_ID)
        os.makedirs(figure_folder, exist_ok=True)
        agent = PPO2.load(args.agent)

        if args.testvals:
            testvals = json.load(open(args.testvals, 'r'))
            valuegrid = list(ParameterGrid(testvals))
            for valuedict in valuegrid:
                customconfig = envconfig.copy()
                customconfig.update(valuedict)
                env = create_env(env_id,
                                 envconfig,
                                 test_mode=True,
                                 pilot=args.pilot)
                valuedict_str = '_'.join(
                    (key + '-' + str(val) for key, val in valuedict.items()))

                print('Running {} test for {}...'.format(
                    args.mode, valuedict_str))

                if args.mode == 'policyplot':
                    gym_auv.reporting.plot_actions(env,
                                                   agent,
                                                   fig_dir=figure_folder,
                                                   fig_prefix=valuedict_str)
                elif args.mode == 'vectorfieldplot':
                    gym_auv.reporting.plot_vector_field(
                        env,
                        agent,
                        fig_dir=figure_folder,
                        fig_prefix=valuedict_str)
                elif args.mode == 'streamlinesplot':
                    gym_auv.reporting.plot_streamlines(
                        env,
                        agent,
                        fig_dir=figure_folder,
                        fig_prefix=valuedict_str)

        else:
            env = create_env(env_id,
                             envconfig,
                             test_mode=True,
                             pilot=args.pilot)
            with open(os.path.join(figure_folder, 'config.json'), 'w') as f:
                json.dump(env.config, f)

            if args.mode == 'policyplot':
                gym_auv.reporting.plot_actions(env,
                                               agent,
                                               fig_dir=figure_folder)
            elif args.mode == 'vectorfieldplot':
                gym_auv.reporting.plot_vector_field(env,
                                                    agent,
                                                    fig_dir=figure_folder)
            elif args.mode == 'streamlinesplot':
                gym_auv.reporting.plot_streamlines(env,
                                                   agent,
                                                   fig_dir=figure_folder)

        print('Output folder: ', figure_folder)

    elif args.mode == 'test':
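        # Test mode: run the trained agent for args.episodes episodes per
        # configuration, plotting scenarios and trajectories, reporting statistics
        # and optionally recording video.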
        figure_folder = os.path.join(DIR_PATH, 'logs', 'tests', args.env,
                                     EXPERIMENT_ID)
        scenario_folder = os.path.join(figure_folder, 'scenarios')
        video_folder = os.path.join(figure_folder, 'videos')
        os.makedirs(figure_folder, exist_ok=True)
        os.makedirs(scenario_folder, exist_ok=True)
        os.makedirs(video_folder, exist_ok=True)

        if not args.onlyplot:
            agent = model.load(args.agent)

        def create_test_env(video_name_prefix=args.env, envconfig=envconfig):
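            """Create a raw test environment plus a vectorized wrapper around it
            (wrapped in a VecVideoRecorder when args.video is set); returns both."""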
            print('Creating test environment: ' + env_id)
            env = create_env(env_id,
                             envconfig,
                             test_mode=True,
                             render_mode=args.render if args.video else None,
                             pilot=args.pilot)
            vec_env = DummyVecEnv([lambda: env])
            if args.video:
                video_length = min(500, args.recording_length)
                recorded_env = VecVideoRecorder(vec_env,
                                                video_folder,
                                                record_video_trigger=lambda x:
                                                (x % video_length) == 0,
                                                video_length=video_length,
                                                name_prefix=video_name_prefix)
            active_env = recorded_env if args.video else vec_env

            return env, active_env

        failed_tests = []
        active_env = None  # shared test environment, set by the branches below that create one

        def run_test(id,
                     reset=True,
                     report_dir=figure_folder,
                     scenario=None,
                     max_t_steps=None,
                     env=None,
                     active_env=None):
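            """Run one test episode: plot the scenario, roll out the policy, plot the
            trajectory, write a report and record failures. Returns a deep copy of
            the finished episode (or None when args.onlyplot is set)."""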
            nonlocal failed_tests

            if env is None or active_env is None:
                env, active_env = create_test_env(video_name_prefix=args.env +
                                                  '_' + id)

            if scenario is not None:
                obs = active_env.reset()
                env.load(scenario)
                print('Loaded', scenario)
            else:
                if reset:
                    obs = active_env.reset()
                else:
                    obs = env.observe()

            gym_auv.reporting.plot_scenario(env,
                                            fig_dir=scenario_folder,
                                            fig_postfix=id,
                                            show=args.onlyplot)
            if args.onlyplot:
                return
            cumulative_reward = 0
            t_steps = 0
            done = False

            # Step the policy until the episode ends or the optional step limit is reached
            while not done and (max_t_steps is None or t_steps < max_t_steps):
                action, _states = agent.predict(
                    obs, deterministic=not args.stochastic)
                obs, reward, done, info = active_env.step(action)
                if args.video:
                    active_env.render()
                t_steps += 1
                cumulative_reward += reward[0]
                report_msg = '{:<20}{:<20}{:<20.2f}{:<20.2%}\r'.format(
                    id, t_steps, cumulative_reward, info[0]['progress'])
                sys.stdout.write(report_msg)
                sys.stdout.flush()

                if args.save_snapshots and t_steps % 30 == 0 and not done:
                    env.save_latest_episode(save_history=False)
                    for size in (20, 50, 100, 200, 300, 400, 500):
                        gym_auv.reporting.plot_trajectory(
                            env,
                            fig_dir=scenario_folder,
                            fig_prefix=(args.env + '_t_step_' + str(t_steps) +
                                        '_' + str(size) + '_' + id),
                            local=True,
                            size=size)
                elif done:
                    gym_auv.reporting.plot_trajectory(env,
                                                      fig_dir=scenario_folder,
                                                      fig_prefix=(args.env +
                                                                  '_' + id))

            env.close()

            gym_auv.reporting.report(env, report_dir=report_dir, lastn=-1)
            #gym_auv.reporting.plot_trajectory(env, fig_dir=scenario_folder, fig_prefix=(args.env + '_' + id))
            #env.save(os.path.join(scenario_folder, id))
            if env.collision:
                failed_tests.append(id)
                with open(os.path.join(figure_folder, 'failures.txt'),
                          'w') as f:
                    f.write(', '.join(map(str, failed_tests)))

            return copy.deepcopy(env.last_episode)

        print('Testing scenario "{}" for {} episodes.\n'.format(
            args.env, args.episodes))
        report_msg_header = '{:<20}{:<20}{:<20}{:<20}{:<20}{:<20}{:<20}'.format(
            'Episode', 'Timesteps', 'Cum. Reward', 'Progress', 'Collisions',
            'CT-Error [m]', 'H-Error [deg]')
        print(report_msg_header)
        print('-' * len(report_msg_header))

        if args.testvals:
            testvals = json.load(open(args.testvals, 'r'))
            valuegrid = list(ParameterGrid(testvals))

        if args.scenario:
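            # Scenario given: with args.testvals, evaluate each parameter combination
            # and plot all trajectories together; otherwise load and run the scenario once.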
            if args.testvals:
                episode_dict = {}
                for valuedict in valuegrid:
                    customconfig = envconfig.copy()
                    customconfig.update(valuedict)
                    env, active_env = create_test_env(envconfig=customconfig)
                    valuedict_str = '_'.join(
                        (key + '-' + str(val)
                         for key, val in valuedict.items()))

                    colorval = -np.log10(
                        valuedict['reward_lambda'])  # NOTE: assumes 'reward_lambda' is in the test grid; should be generalised

                    rep_subfolder = os.path.join(figure_folder, valuedict_str)
                    os.makedirs(rep_subfolder, exist_ok=True)
                    for episode in range(args.episodes):
                        last_episode = run_test(valuedict_str + '_ep' +
                                                str(episode),
                                                report_dir=rep_subfolder)
                        episode_dict[valuedict_str] = [last_episode, colorval]
                print('Plotting all')
                gym_auv.reporting.plot_trajectory(env,
                                                  fig_dir=scenario_folder,
                                                  fig_prefix=(args.env +
                                                              '_all_agents'),
                                                  episode_dict=episode_dict)

            else:
                run_test("ep0", reset=True, scenario=args.scenario)

        else:
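            # No scenario file: evaluate on generated test scenarios instead.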
            if args.testvals:
                episode_dict = {}
                agent_index = 1
                for valuedict in valuegrid:
                    customconfig = envconfig.copy()
                    customconfig.update(valuedict)
                    env, active_env = create_test_env(envconfig=customconfig)
                    valuedict_str = '_'.join(
                        (key + '-' + str(val)
                         for key, val in valuedict.items()))

                    colorval = np.log10(
                        valuedict['reward_lambda'])  # NOTE: assumes 'reward_lambda' is in the test grid; should be generalised

                    rep_subfolder = os.path.join(figure_folder, valuedict_str)
                    os.makedirs(rep_subfolder, exist_ok=True)
                    for episode in range(args.episodes):
                        last_episode = run_test(valuedict_str + '_ep' +
                                                str(episode),
                                                report_dir=rep_subfolder)
                    episode_dict['Agent ' +
                                 str(agent_index)] = [last_episode, colorval]
                    agent_index += 1

                gym_auv.reporting.plot_trajectory(env,
                                                  fig_dir=figure_folder,
                                                  fig_prefix=(args.env +
                                                              '_all_agents'),
                                                  episode_dict=episode_dict)
            else:
                env, active_env = create_test_env(video_name_prefix=args.env)
                for episode in range(args.episodes):
                    run_test('ep' + str(episode),
                             env=env,
                             active_env=active_env)

        if args.video and active_env:
            active_env.close()