Example No. 1
def save_results(arg_dict, model_name, env, model_logdir=None, show=False):
    if model_logdir is None:
        model_logdir = arg_dict["logdir"]
    print(f"model_logdir: {model_logdir}")

    results_plotter.EPISODES_WINDOW = 100
    results_plotter.plot_results([model_logdir], arg_dict["steps"], results_plotter.X_TIMESTEPS, arg_dict["algo"] + " " + arg_dict["env_name"] + " reward")
    plt.gcf().set_size_inches(8, 6)
    plt.savefig(os.path.join(model_logdir, model_name) + '_reward_results.png')
    #plot_extended_results(model_logdir, 'd', results_plotter.X_TIMESTEPS, arg_dict["algo"] + " " + arg_dict["env_name"] + " distance", "Episode Distances")
    plt.gcf().set_size_inches(8, 6)
    plt.savefig(os.path.join(model_logdir, model_name) + '_distance_results.png')
    plt.close()
    plt.close()
    if isinstance(env, HERGoalEnvWrapper):
        results_plotter.plot_curves([(np.arange(len(env.env.episode_final_distance)),np.asarray(env.env.episode_final_distance))],'episodes',arg_dict["algo"] + " " + arg_dict["env_name"] + ' final step distance')
    else:
        results_plotter.plot_curves([(np.arange(len(env.unwrapped.episode_final_distance)),np.asarray(env.unwrapped.episode_final_distance))],'episodes',arg_dict["algo"] + " " + arg_dict["env_name"] + ' final step distance')
    plt.gcf().set_size_inches(8, 6)
    plt.ylabel("Step Distances")
    plt.savefig(os.path.join(model_logdir, model_name) + "_final_distance_results.png")
    plt.close()
    print("Congratulations! Training with {} timesteps succeed!".format(arg_dict["steps"]))
    if show:
        plt.show()
Example No. 2
    def train(self,
              symbol='JPM',
              sd=dt.datetime(2009, 1, 1),
              ed=dt.datetime(2010, 12, 31),
              time_steps=int(1e5),
              savepath=None,
              should_plot=False):
        # load data and indicators
        df = self._load_data([symbol], sd, ed)
        df_met = self._get_indicators(symbol, df)

        # set environment
        self.env = Monitor(LoanEnv(df_met),
                           self.log_dir,
                           allow_early_resets=True)

        # train model
        self.model = DQN(MlpPolicy,
                         self.env,
                         prioritized_replay=True,
                         verbose=1)
        self.model.learn(total_timesteps=time_steps, callback=self.debugcb)

        # save and plot
        if savepath is not None:
            self.model.save(savepath)

        if should_plot:
            results_plotter.plot_results([self.log_dir], time_steps,
                                         results_plotter.X_TIMESTEPS,
                                         f'DQN {symbol}')
            plt.show()
Example No. 3
def main():
    """
    Example usage in jupyter-notebook

    .. code-block:: python

        from stable_baselines import results_plotter
        %matplotlib inline
        results_plotter.plot_results(["./log"], 10e6, results_plotter.X_TIMESTEPS, "Breakout")

    Here ./log is a directory containing the monitor.csv files
    """
    import argparse
    import os
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dirs',
                        help='List of log directories',
                        nargs='*',
                        default=['./log'])
    parser.add_argument('--num_timesteps', type=int, default=int(10e6))
    parser.add_argument('--xaxis',
                        help='Variable on X-axis',
                        default=X_TIMESTEPS)
    parser.add_argument('--task_name', help='Title of plot', default='Title')
    args = parser.parse_args()
    args.dirs = [os.path.abspath(folder) for folder in args.dirs]
    results_plotter.plot_results(args.dirs, args.num_timesteps, args.xaxis,
                                 args.task_name)
    plt.show()
Example No. 4
def start_the_game():
    cyber = DDoS_env.cyberEnv(defense_list=global_settings.defense_space,
                              obsv_list=global_settings.observation_space)
    cyber = Monitor(cyber, log_dir)

    defense_agent_decision_model = PPO2(MlpPolicy,
                                        cyber,
                                        verbose=1,
                                        gamma=0.95,
                                        ent_coef=0.1)
    callback = training_result.SaveOnBestTrainingRewardCallback(
        check_freq=global_settings.SAVE_RESULT_FREQ_DDoS, log_dir=log_dir)
    defense_agent_decision_model.learn(
        total_timesteps=global_settings.TOTAL_TRAIN_STEPS, callback=callback)

    # # Evaluate the agent
    # print("Evaluating the agent")
    # from stable_baselines.common.evaluation import evaluate_policy
    # mean_reward, std_reward = evaluate_policy(defense_agent_decision_model, defense_agent_decision_model.get_env(), n_eval_episodes=2)
    # print("Mean Reward %s Std Reward %s"%(mean_reward,std_reward))
    ''' Plot the performance'''
    results_plotter.plot_results([log_dir], global_settings.TOTAL_TRAIN_STEPS,
                                 results_plotter.X_TIMESTEPS,
                                 "DDoS Reward Results")
    plt.show()
Example No. 5
from stable_baselines import results_plotter
import matplotlib.pyplot as plt

log_dir = '/home/yliu2/blimp_ws/exp_log/SAC/HOVER/4act/exp1'
SLEEP_RATE = 2
N_EPISODE = 5000
EPISODE_LENGTH = SLEEP_RATE * 30  #30 sec
TOTAL_TIMESTEPS = EPISODE_LENGTH * N_EPISODE

results_plotter.plot_results([log_dir], TOTAL_TIMESTEPS,
                             results_plotter.X_TIMESTEPS, "SAC BLIMP")
plt.show()
Example No. 6
    gazebo.unpauseSim()

    # Create the vectorized environment
    env = DummyVecEnv([
        make_env(env_id, i, num_cpu - 1, log_dir, gazebo,
                 os.getpgid(gazebo_process.pid), os.getpgid(cf_process.pid))
        for i in range(num_cpu)
    ])
    env = VecNormalize(env)

    # Save best model every n steps and monitors performance
    # save_best_callback = SaveOnBestTrainingRewardCallback(check_freq=250, log_dir=log_dir)
    # Save model every n steps
    checkpoint_callback = CheckpointCallback(save_freq=1000,
                                             save_path='./' + log_dir,
                                             name_prefix='ppo2')

    # Train from scratch
    model = PPO2(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=200000, callback=checkpoint_callback)

    # Load trained params and continue training
    # model = PPO2.load(log_dir + '/best_model')
    # model.set_env(env)
    # model.learn(total_timesteps=200000, callback=save_best_callback, reset_num_timesteps=False)

    results_plotter.plot_results([log_dir], 200000,
                                 results_plotter.X_TIMESTEPS, "PPO Crazyflie")
    plt.show()

    env.close()
Example No. 7
def auto_save_callback(_locals, _globals):
    """Old-style stable-baselines callback, called on every training step."""
    global n_steps, best_mean_reward
    if (n_steps + 1) % 1000 == 0:
        # Evaluate policy training performance
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            print(x[-1], 'timesteps')
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(best_mean_reward, mean_reward))

            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model")
                _locals['self'].save(log_dir + 'best_model.pkl')
    n_steps += 1
    return True


################ TRAINING

model.learn(total_timesteps=time_steps, callback=auto_save_callback, seed=args.random_seed)

# print('save model')
# savemodel(model, MODEL, ENVIRONMENT, DATE)

results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "RGBD Observation")
plt.show()
print('total time', time.time()-start)


Example No. 8
import sys
import numpy as np
import tensorflow as tf
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from gym import wrappers
from datetime import datetime
import time

import os
from stable_baselines.bench import Monitor
from stable_baselines.results_plotter import load_results, ts2xy
from stable_baselines import results_plotter

time_steps = 5000
ll = 5e-5

log_dir = "./logFiles"

results_plotter.plot_results([log_dir], time_steps,
                             results_plotter.X_TIMESTEPS,
                             "IEEE 39 Bus load shedding w/SAC")
plt.savefig(
    log_dir +
    '/IEEE_39Bus_loadshedding_SAC {}_{}.png'.format(str(time_steps), str(ll)))
plt.show()
Example No. 9
                    if self.verbose > 0:
                        print("Saving new best model to {}".format(
                            self.save_path))
                    self.model.save(self.save_path)

        return True


# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment

env = gym.make('SatelliteEnvironment-v0')
env = Monitor(env, log_dir)

# Add some param noise for exploration
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1,
                                     desired_action_stddev=0.1)
# Because we use parameter noise, we should use a MlpPolicy with layer normalization
model = DDPG(LnMlpPolicy, env, param_noise=param_noise, verbose=0)
# Create the callback: check every 1000 steps
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
# Train the agent
time_steps = 1e5
model.learn(total_timesteps=int(time_steps), callback=callback)

results_plotter.plot_results([log_dir], time_steps,
                             results_plotter.X_TIMESTEPS, "DDPG Satellite")
plt.show()
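Several of the examples above and below pass a SaveOnBestTrainingRewardCallback to learn() but show only fragments of its definition. A minimal sketch of such a callback, assuming the BaseCallback API from stable_baselines.common.callbacks (stable-baselines >= 2.10), might look like the following:

import os

import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy


class SaveOnBestTrainingRewardCallback(BaseCallback):
    """Every `check_freq` steps, read the Monitor log in `log_dir` and save the
    model whenever the mean reward over the last 100 episodes improves."""

    def __init__(self, check_freq, log_dir, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            # Retrieve episode rewards logged by the Monitor wrapper
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    if self.verbose > 0:
                        print("Saving new best model to {}".format(self.save_path))
                    self.model.save(self.save_path)
        return True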
Example No. 10
def ttest_env(modelpath, modelname):
    # One entry per log directory: (algorithm class, whether it takes the
    # DummyVecEnv-wrapped env, plot title).
    algo_specs = {
        log_dir_a2c: (A2C, True, "a2c Monitor"),
        log_dir_acer: (ACER, False, "acer Monitor"),
        log_dir_acktr: (ACKTR, False, "ACKTR Monitor"),
        log_dir_dqn: (DQN, False, "DQN Monitor"),
        log_dir_ppo1: (PPO1, False, "PPO1 Monitor"),
        log_dir_poo2: (PPO2, False, "PPO2 Monitor"),
        log_dir_trpo: (TRPO, False, "TRPO Monitor"),
    }
    for name in modelpath:
        os.makedirs(name, exist_ok=True)
        env = IdentityEnv(18, 18, 60)
        env = Monitor(env, name)
        e = DummyVecEnv([lambda: env])
        if name not in algo_specs:
            continue
        algo_cls, use_vec_env, title = algo_specs[name]
        model = algo_cls(policy="MlpPolicy", env=e if use_vec_env else env, verbose=0)
        callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=name)
        time_steps = 1e5
        model.learn(total_timesteps=int(time_steps), callback=callback)
        results_plotter.plot_results([name], time_steps,
                                     results_plotter.X_EPISODES, title)
        plt.show()
Example No. 11
def moving_average(values, window):
    """Smooth values by doing a moving average."""
    weights = np.repeat(1.0, window) / window
    return np.convolve(values, weights, 'valid')


def plot_results(log_folder, title='Learning Curve'):
    """
    plot the results

    :param log_folder: (str) the save location of the results to plot
    :param title: (str) the title of the task to plot
    """
    x, y = ts2xy(load_results(log_folder), 'timesteps')
    y = moving_average(y, window=50)
    # Truncate x
    x = x[len(x) - len(y):]

    fig = plt.figure(title)
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title + " Smoothed")
    plt.show()


from stable_baselines import results_plotter

# Helper from the library
results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS,
                             "Hexworld Coverage")

plot_results(log_dir)
Example No. 12
                    self.model.save(self.save_path)

        return True


# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment
env = gym.make('LunarLanderContinuous-v2')
# print('env before Monitor wrapping:', env)
env = Monitor(env, log_dir)
# print('env after Monitor wrapping:', env)
# Add some param noise for exploration
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1,
                                     desired_action_stddev=0.1)
# Because we use parameter noise, we should use a MlpPolicy with layer normalization
model = DDPG(LnMlpPolicy, env, param_noise=param_noise, verbose=0)
# Create the callback: check every 1000 steps
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

# Train the agent
time_steps = 1e5
model.learn(total_timesteps=int(time_steps), callback=callback)
# print(results_plotter.X_TIMESTEPS)
# print(results_plotter.X_EPISODES)

results_plotter.plot_results([log_dir], time_steps, results_plotter.X_EPISODES,
                             "DDPG LunarLander")
plt.show()
Example No. 13
    envname = "Pendulum-v0"
    envname = "LunarLanderContinuous-v2"
    envname = "BipedalWalker-v2"
    env = gym.make(envname)
    exp_name = env.spec._env_name + '-DPDQN2'

    log_dir = 'logs_dev/' + exp_name
    env = Monitor(env, log_dir, allow_early_resets=True)

    model = DPDQN2(env, verbose=1)

    print("time_steps_todo: " + str(time_steps))
    model.learn(total_timesteps=int(time_steps))
    copyfile(log_dir + ".monitor.csv", "logs_tmp/tmp.monitor.csv")
    results_plotter.plot_results(["logs_tmp"], time_steps,
                                 results_plotter.X_TIMESTEPS,
                                 log_dir.split("/")[1])
    plt.show()

    os.makedirs("models", exist_ok=True)
    model.save("models/" + log_dir.split("/")[1])

    # test

    env = gym.make(envname)
    log_dir = 'logs_test/' + exp_name
    env = Monitor(env, log_dir, allow_early_resets=True)

    model = DPDQN2.load("models/" + log_dir.split("/")[1], env)
    obs = env.reset()
    for i in range(time_steps_test):
Example No. 14
# rewarda2c, eps_step=test_identity('a2c')

# Episode=np.arange(1,len(rewarda2c)+1,1)
# ResultTrain(Episode,Episode_reward=rewarda2c)

# reward_acer, eps_step=test_identity('acer')
# Episode=np.arange(1,len(reward_acer)+1,1)
# ResultTrain(Episode,Episode_reward=reward_acer)
#
# reward_acktr, eps_step=test_identity('acktr')
# Episode=np.arange(1,len(reward_acktr)+1,1)
# ResultTrain(Episode,Episode_reward=reward_acktr)
#
reward_dqn, eps_step = test_identity('dqn')
time_steps = 10000
results_plotter.plot_results([log_dir], time_steps, results_plotter.X_EPISODES,
                             "DQN RESULT")
plt.show()
# Episode=np.arange(1,len(reward_dqn)+1,1)
# print('DQN per-episode training rewards:', reward_dqn)
# ResultTrain(Episode,Episode_reward=reward_dqn)
#
# reward_ppo1, eps_step=test_identity('ppo1')
# Episode=np.arange(1,len(reward_ppo1)+1,1)
# ResultTrain(Episode,Episode_reward=reward_ppo1)
#
# reward_ppo2, eps_step=test_identity('ppo2')
# Episode=np.arange(1,len(reward_ppo2)+1,1)
# ResultTrain(Episode,Episode_reward=reward_ppo2)
#
# reward_trpo, eps_step=test_identity('trpo')
# Episode=np.arange(1,len(reward_trpo)+1,1)
Example No. 15
                print("Saving new best model")
                _locals['self'].save(log_dir + 'best_model.pkl')
    n_steps += 1
    return True


# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment
env = gym.make('CartPole-v1')
env = Monitor(env, log_dir, allow_early_resets=True)

# Add some param noise for exploration
# param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)
# Because we use parameter noise, we should use a MlpPolicy with layer normalization
model = DQN('MlpPolicy',
            env,
            learning_rate=1e-3,
            prioritized_replay=True,
            param_noise=True,
            verbose=1)
# Train the agent
time_steps = 1e5
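# `callback` is presumably the function-style best-model callback whose tail
# appears at the top of this excerpt.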
model.learn(total_timesteps=int(time_steps), callback=callback)

results_plotter.plot_results([log_dir], time_steps,
                             results_plotter.X_TIMESTEPS, "DQN CartPole")
plt.show()
Example No. 16
from stable_baselines import results_plotter
import matplotlib.pyplot as plt
# Helper from the library
results_plotter.plot_results([
    '/home/constantin/Desktop/projects/disertation/rl_logs_1_1-20200120T201830Z-001/rl_logs_1_1/'
], 1e5, results_plotter.X_TIMESTEPS, "Test")
plt.show()
Example No. 17
from stable_baselines import results_plotter
from matplotlib import pyplot as plt
import time

while True:
    results_plotter.plot_results(["./log2"], 10e6, results_plotter.X_TIMESTEPS,
                                 "Breakout")
    plt.pause(10)
    plt.close()
Example No. 18
            }
    else:
        items = {
            "policy": MLP,
            batchsize: args.timesteps_per_batch,
        }
    model = algo(env=env, verbose=1, seed=args.SEED, **items)

    model.set_env(env)
    print("Training for ", args.total_timesteps)

    model.learn(total_timesteps=int(args.total_timesteps))
    # library helper
    plot_results(
        [log_dir],
        int(args.total_timesteps),
        results_plotter.X_TIMESTEPS,
        str(args.algo_name) + "_" + identifer,
    )
    plt.savefig("convergence_plot" + identifer + ".png")
    model.save("policy-" + identifer)

else:

    model = algo.load("policy-" + identifer)
    obs = env.reset()
    done = False
    score = 0
    while not done:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
Example No. 19
             batch_size=1024,
             buffer_size=int(5e5),
             verbose=0,
             param_noise=param_noise,
             action_noise=action_noise,
             tensorboard_log=parent_dir + "tensorboard/",
             n_cpu_tf_sess=multiprocessing.cpu_count())

model.learn(total_timesteps=interval * icount,
            log_interval=interval,
            tb_log_name="DDPG_{}".format(time.strftime("%Y%m%d")),
            callback=callbackList)

obs = env.reset()
dones = False
counter = []
while dones == False:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    counter.append(rewards)

env.close()

print("\nFinal costs:")
pp.pprint(env.cost())

# Plot the reward graph
if useBestCallback:
    plot_results([log_dir], interval * icount, results_plotter.X_TIMESTEPS,
                 "DDPG CityLearn")
    plt.savefig(log_dir + "/rewards.pdf")
Example No. 20
#                  full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None):
model = PPO2(MlpPolicy, env, gamma=0.9, learning_rate=0.0086, nminibatches=64,
             verbose=1, tensorboard_log="./ppo2_filter_tensorboard/")

callback = SaveOnBestTrainingRewardCallback(check_freq=40, log_dir=log_dir)
time_steps = 3e4

start_time = time.time()
model.learn(total_timesteps=int(time_steps), callback=callback)
"---%s seconds ---" % (time.time() - start_time)

# save the last model
model.save("ppo2_LastModel")


results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "PPO2 LunarLander")
plt.show()


# done = False
# while not done:
#     #action = ... # Your agent code here
#     obs, reward, done, info = env.step(env.action_space.sample())
#     env.render()

# for _ in range(10):
#     action = env.action_type.actions_indexes["IDLE"]
#     obs, reward, done, info = env.step(env.action_space.sample())
#     env.render()

# env = gym.make("overtaking-v0")
Example No. 21
    #env = CustomEnv(3, 6, "tcp://*:5556")
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    # Create log dir
    log_dir = "Logs/Custom_env/"
    os.makedirs(log_dir, exist_ok=True)
    # Create the callback: check every 1000 steps
    callback = SaveOnBestTrainingRewardCallback(check_freq=500,
                                                log_dir=log_dir)

    #env = Monitor(env, log_dir)

    model = ACKTR(MlpPolicy, env, verbose=2)
    #model.load("DQN_agent")
    model.learn(total_timesteps=20000, callback=callback)
    model.save("temp_agent")

    a = input("Training completed")

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        probs = model.action_probability(obs)
        obs, rewards, dones, info = env.step(action)
        print("Observation:", obs, rewards, probs)

    results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS,
                                 "Lane Manager")
    plt.show()
Example No. 22
# Instantiate the agent
# model = DQN(EgoAttentionNetwork, env, learning_rate=1e-3, prioritized_replay=True, verbose=1)
model = DQN("MlpPolicy",
            env,
            learning_rate=1e-3,
            prioritized_replay=True,
            verbose=1,
            tensorboard_log="./test_results/DQN_overtaking_tensorboard/" +
            TIMESTAMP)
# create the callback: check every 1000 steps
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
# Train the agent
time_steps = 1000
model.learn(total_timesteps=int(time_steps), callback=callback)

results_plotter.plot_results([log_dir], time_steps,
                             results_plotter.X_TIMESTEPS, "DQN OvertakingEnv")
plt.show()

# # Save the agent
# model.save("dqn_overtaking")
# del model  # delete trained model to demonstrate loading
#
# # Load the trained agent
# model = DQN.load("dqn_overtaking", env=env)
#
# # Evaluate the agent
# mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
#
# # Enjoy trained agent
# obs = env.reset()
# episode_reward = 0
Example No. 23
def main():

    args = get_args()
    choose_device(args.device)
    set_global_seeds(args.seed)

    env_id = args.env
    exp_id = args.exp_id
    algo = args.algo
    env_name = env_id[:-3]
    env_index = env_list.index(env_id)

    # Pass CustomEnv arguments: follow this for your CustomEnv if reward not known prior to training
    env_kwargs = {} if args.env_kwargs is None else args.env_kwargs
    if (args.env_kwargs is not None) and (env_id in ['AirSim-v0']):
        if 'rew_land' in env_kwargs:
            if (int(env_kwargs['rew_land']) in [500, 1000, 10000]):
                env_success[-1] = int(env_kwargs['rew_land'])
            else:
                raise ValueError(
                    'Given env reward not acceptable. Please try again')

    params = [exp_id, env_name.lower()]
    folder = [exp_id, env_name.lower(), args.algo.lower()]
    tensorboard_path, monitor_path, callback_path = None, None, None

    if args.tensorboard:
        tensorboard_path = "tensorboard/{}_{}".format(*params)
        make_dir(tensorboard_path)

    # if args.train_RL: # Begin training here (location of this condition also decides experiment performance)

    # Load hyperparameters from yaml file
    with open('hyperparams/{}.yml'.format(args.algo), 'r') as f:
        hyperparams_dict = yaml.safe_load(f)
        if env_id in list(hyperparams_dict.keys()):
            hyperparams = hyperparams_dict[env_id]
        else:
            raise ValueError("Hyperparameters not found for {}-{}".format(
                args.algo, env_id))

    if args.hyperparams is not None:
        # Overwrite hyperparams if needed
        hyperparams.update(args.hyperparams)

    # OPTIONAL: Print saved hyperparams
    saved_hyperparams = OrderedDict([(key, hyperparams[key])
                                     for key in sorted(hyperparams.keys())])
    if args.verbose > 0:
        pprint(saved_hyperparams)

    if args.n_envs > 1:
        # if args.verbose:
        print("Overwriting n_envs with n={}".format(args.n_envs))
        n_envs = args.n_envs
    else:
        n_envs = hyperparams.get('n_envs', 1)

    # choose Monitor log path according to multiprocessing setting
    if args.monitor:
        if n_envs == 1:
            monitor_path = 'logs/single/{}_{}_{}'.format(*folder)
        else:
            if algo not in ['dqn', 'her', 'sac', 'td3']:
                monitor_path = 'logs/multi/{}_{}_{}'.format(*folder)
        make_dir(monitor_path)

    if int(float(args.timesteps_RL)) > 0:
        # if args.verbose:
        print("Overwriting n_timesteps with n={}".format(
            int(float(args.timesteps_RL))))
        n_timesteps = int(float(args.timesteps_RL))
    else:
        n_timesteps = int(hyperparams['n_timesteps'])

    # Convert to python object if needed
    if 'policy_kwargs' in hyperparams.keys() and isinstance(
            hyperparams['policy_kwargs'], str):
        hyperparams['policy_kwargs'] = eval(hyperparams['policy_kwargs'])

    if 'n_envs' in hyperparams.keys():
        del hyperparams['n_envs']
    del hyperparams['n_timesteps']  #To avoid error

    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    # if (algo=='ppo2' and ('learning_rate' in hyperparams.keys())):
    #     hyperparams['learning_rate'] = linear_schedule(hyperparams['learning_rate'])

    def create_env(n_envs, eval_env=False):
        if algo in ['a2c', 'acer', 'acktr', 'ppo2']:
            if n_envs > 1:
                env = SubprocVecEnv([
                    make_env(env_id,
                             i,
                             args.seed,
                             log_dir=monitor_path,
                             wrapper_class=env_wrapper,
                             env_kwargs=env_kwargs) for i in range(n_envs)
                ])
            else:
                env = DummyVecEnv([
                    make_env(env_id,
                             0,
                             args.seed,
                             log_dir=monitor_path,
                             wrapper_class=env_wrapper,
                             env_kwargs=env_kwargs)
                ])
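            # NOTE: the assignment below replaces the vectorized env built by
            # the if/else above with a plain gym.make-based DummyVecEnv.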
            env = DummyVecEnv([lambda: gym.make(env_id, **env_kwargs)])
            if env_wrapper is not None:
                env = env_wrapper(env)
        elif ((algo in ['dqn', 'her', 'sac', 'td3']) and n_envs > 1):
            raise ValueError(
                "Error: {} does not support multiprocessing!".format(algo))
        elif ((algo in ['ddpg', 'ppo1', 'trpo', 'gail']) and n_envs > 1):
            raise ValueError(
                "Error: {} uses MPI for multiprocessing!".format(algo))
        else:
            env = make_vec_env(env_id,
                               n_envs=n_envs,
                               seed=args.seed,
                               monitor_dir=monitor_path,
                               wrapper_class=env_wrapper,
                               env_kwargs=env_kwargs)

        if args.normalize:  # choose from multiple options
            # env = VecNormalize(env, clip_obs=np.inf)
            env = VecNormalize(env, norm_reward=False, clip_obs=np.inf)
            # env = VecNormalize(env, norm_reward=False, clip_obs=np.inf, **normalize_kwargs)
        return env

    # Zoo: env = SubprocVecEnv([make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs) for i in range(n_envs)])
    # Zoo: env = DummyVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs)])
    env = create_env(n_envs)

    # if args.train_RL: # checking impact of the if-condition position on experiment reproducibility

    callback, callback_path = [], "callbacks/{}_{}_{}".format(*folder)
    save_freq, eval_freq = 100 * episode_len[env_index], 100 * episode_len[
        env_index]
    save_freq, eval_freq = max(save_freq // n_envs,
                               1), max(eval_freq // n_envs, 1)
    make_dir(callback_path)
    if args.check_callback:
        callback.append(
            CheckpointCallback(save_freq=save_freq,
                               save_path=callback_path,
                               name_prefix='rl_model',
                               verbose=1))
    if args.eval_callback:
        callback.append(
            EvalCallback(create_env(1, eval_env=True),
                         best_model_save_path=callback_path,
                         log_path=callback_path,
                         eval_freq=eval_freq,
                         verbose=1))

    model = (algo_list[args.algo])(env=env,
                                   seed=args.seed,
                                   tensorboard_log=tensorboard_path,
                                   n_cpu_tf_sess=1,
                                   verbose=args.verbose,
                                   **hyperparams)
    print('\nTraining {} on {} now... \n'.format(algo, env_id))

    start_time = time.time()
    model.learn(total_timesteps=n_timesteps, callback=callback)
    total_time = time.time() - start_time

    if args.normalize:
        env.save(os.path.join(callback_path, "vec_normalize.pkl"))

    if n_envs > 1 or (algo in ['ddpg', 'trpo', 'gail']):
        print("Took {:.2f}s for multiprocessed version - {:.2f} FPS".format(
            total_time, n_timesteps / total_time))
    else:
        print("Took {:.2f}s for single process version - {:.2f} FPS".format(
            total_time, n_timesteps / total_time))

    env = DummyVecEnv([make_env(env_id, 0, args.seed, env_kwargs=env_kwargs)])

    if args.normalize:
        env = VecNormalize.load(
            os.path.join(callback_path, "vec_normalize.pkl"), env)
        env.training = False
        env.norm_reward = False
        env.seed(args.seed)

    # Evaluate RL model - choose either best model or last available model
    model = (algo_list[algo]).load(os.path.join(callback_path, 'best_model'))
    # model = (algo_list[algo]).load("models/{}_{}_{}".format(*folder))
    model.set_env(env)
    evaluate('policy', model, env_id, env, algo, 100)

    if args.monitor:
        results_plotter.plot_results([monitor_path], n_timesteps,
                                     results_plotter.X_TIMESTEPS,
                                     "{} {}".format(algo, env_id))
        plot_results(monitor_path)

    if args.test:
        print('\nTesting policy...\n')
        episode_reward, total_reward = 0.0, 0.0
        done_count, success_count = 0, 0
        obs = env.reset()
        for _ in range(n_timesteps):
            action, _states = model.predict(obs, deterministic=True)
            if isinstance(env.action_space, gym.spaces.Box):
                action = np.clip(action, env.action_space.low,
                                 env.action_space.high)
            obs, rewards, dones, info = env.step(action)
            episode_reward += rewards
            env.render()
            if dones:
                done_count += 1
                success_count = check_success(env_index, env_success,
                                              success_count)
                total_reward += episode_reward
                episode_reward = 0
                env.reset()
        print('\n{}/{} successful episodes'.format(success_count, done_count))
        average_reward = total_reward / done_count
        print('\nAverage reward: {}'.format(average_reward))
        env.close()
Example No. 24
env = KukaCamGymEnv(renders=True, isDiscrete=True)  # pybullet envs can be created directly
env.cid = p.connect(p.DIRECT)
env = Monitor(env, log_dir)

# Add basic noise parameters
# # Add some param noise for exploration
# param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)
# # Because we use parameter noise, we should use a MlpPolicy with layer normalization
# model = DDPG(LnMlpPolicy, env, param_noise=param_noise, verbose=0)

# Set the hyperparameters
model = PPO2(
    MlpPolicy, env, verbose=1,
    tensorboard_log="./ppo2_kukaWithCam_tboard/")  # verbose controls how much run-state info is printed

# Create the callback: check every 1000 steps
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

# Start training
time_steps = 1e8
model.learn(total_timesteps=int(time_steps), callback=callback)

env.close()
p.disconnect()

# Plot the training results in real time
results_plotter.plot_results([log_dir], time_steps,
                             results_plotter.X_TIMESTEPS, "PPO2 Kuka no Cam")
plt.show()
Example No. 25
                "buffer_size": int(args.timesteps_per_batch)
            }
    else:
        items = {
            "policy": MLP,
            batchsize: args.timesteps_per_batch,
        }

    model = algo(env=env, verbose=1, seed=args.SEED, **items)
    model.set_env(env)

    model.learn(total_timesteps=int(args.total_timesteps))
    # library helper
    plot_results(
        [log_dir],
        int(args.total_timesteps),
        results_plotter.X_TIMESTEPS,
        "TRPO muscle" + identifer,
    )
    plt.savefig("convergence_plot" + identifer + ".png")
    model.save("policy-" + identifer)

else:
    # Use trained policy for the simulation.
    model = TRPO.load("trpo_" + identifer)
    obs = env.reset()

    done = False
    score = 0
    while not done:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
Example No. 26
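    # X and T presumably come from an EvalCallback "evaluations.npz" archive
    # (keys: 'timesteps', 'results', 'ep_lengths') loaded in the part of the
    # script not shown here.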
    R = X['results']
    E = X['ep_lengths']

    av_reward = []
    for i in range(len(R)):
        av_reward.append(np.mean(R[i, :]))

    plt.plot(T, av_reward)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.savefig(log_dir + "evaluations.png")
    # plt.show()

    # plot all training rewards

    results_plotter.plot_results([log_dir], timesteps,
                                 results_plotter.X_TIMESTEPS, "")
    plt.savefig(log_dir + "reward_vs_timesteps.png")
    # plt.show()

    results_plotter.plot_results([log_dir], timesteps,
                                 results_plotter.X_EPISODES, "")
    plt.savefig(log_dir + "reward_vs_episodes.png")
    # plt.show()

    results_plotter.plot_results([log_dir], timesteps,
                                 results_plotter.X_WALLTIME, "")
    plt.savefig(log_dir + "reward_vs_walltime.png")
    # plt.show()

    #### smoothed training rewards
Example No. 27
os.makedirs(log_dir, exist_ok=True)
callback = SaveOnBestTrainingRewardCallback(check_freq=100, log_dir=log_dir)

env = gym.make('DeepRMSCA-v0', **env_args)

# logs will be saved in log_dir/monitor.csv
# in this case, on top of the usual monitored things, we also monitor service and bit rate blocking probabilities
env = Monitor(env, log_dir + 'training', info_keywords=('service_blocking_rate_since_reset','bit_rate_blocking_rate_since_reset'))

policy_args = dict(net_arch=5*[128], act_fun=tf.nn.elu)  # the policy network has five hidden layers with 128 units each

agent = TRPO(MlpPolicy, env, verbose=0, tensorboard_log="./tb/TRPO-DeepRMSCA-v0/", policy_kwargs=policy_args, gamma=.95, learning_rate=10e-5)

agent.learn(total_timesteps=100000, callback=callback)

results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS, "DeepRMSCA TRPO")

import matplotlib.pyplot as plt

def moving_average(values, window):
    """
    Smooth values by doing a moving average
    :param values: (numpy array)
    :param window: (int)
    :return: (numpy array)
    """
    weights = np.repeat(1.0, window) / window
    return np.convolve(values, weights, 'valid')


def plot_results(log_folder, title='Learning Curve'):
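    # (The function body is cut off in this excerpt; presumably it matches the
    # smoothing helper already shown in Example No. 11, i.e. roughly:)
    x, y = ts2xy(load_results(log_folder), 'timesteps')
    y = moving_average(y, window=50)
    x = x[len(x) - len(y):]  # truncate x to match the smoothed y
    plt.figure(title)
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title + " Smoothed")
    plt.show()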
Example No. 28
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            print(x[-1], 'timesteps')
            print(
                "Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                .format(best_mean_reward, mean_reward))

            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model")
                _locals['self'].save(log_dir + 'best_model.pkl')
    n_steps += 1
    return True


################ TRAINING

model.learn(total_timesteps=time_steps, seed=args.random_seed)

# print('save model')
# savemodel(model, MODEL, ENVIRONMENT, DATE)

results_plotter.plot_results([log_dir], time_steps,
                             results_plotter.X_TIMESTEPS,
                             "RGBD Observation with a Sparse Reward Function")
plt.show()
print('total time', time.time() - start)
Example No. 29
import matplotlib.pyplot as plt
from stable_baselines import results_plotter

from config import TIME_STEPS

log_dir = "./monitor_logs/"
results_plotter.plot_results([log_dir], TIME_STEPS, results_plotter.X_TIMESTEPS, "Rewards over episodes")
plt.show()
Example No. 30
# for key, value in baselines_mlp_model.get_parameters().items():
#   print(key, value.shape)
#
# th_model = copy_mlp_weights(baselines_mlp_model)



# obs = env.reset()
# while True:
#     action, states = model.predict(obs)
#     obs, rewards, dones, info = env.step(action)
#     print(rewards, dones)
#     env.render()

print(log_dir)
results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "Witches")
plt.show()



## Output PPO2
## https://stable-baselines.readthedocs.io/en/master/modules/ppo2.html

# -------------------------------------
# | approxkl           | 8.841733e-05 |
# | clipfrac           | 0.0          |
# | ep_len_mean        | 1.4          |   mean episode length
# | ep_reward_mean     | 0.2          |   mean reward per episode
# | explained_variance | -0.0164      |
# | fps                | 2831         |
# | n_updates          | 99           |   number of gradient updates