Example #1
def train_SAC(env, title="Stand Up Task Learning Curve"):
    print(f"action space shape -1:{env.action_space.shape[-1]}")

    # SAC does not need an action-noise object here (action_noise=None below)
    n_actions = env.action_space.shape[-1]  # kept for reference; unused in this snippet
    callback = Logger(log_dir=log_dir)
    timesteps = 20000
    model = SAC('MlpPolicy',
                env,
                learning_rate=0.001,
                learning_starts=10000,
                ent_coef='auto_1.1',
                train_freq=1,
                n_episodes_rollout=-1,
                target_entropy=-21,
                buffer_size=1000000,
                action_noise=None,
                batch_size=64,
                verbose=1,
                policy_kwargs=dict(net_arch=[64, 64]))
    model.learn(total_timesteps=timesteps, callback=callback)

    model.save("SAC_pkl")
    plot_results([log_dir], timesteps, results_plotter.X_TIMESTEPS, title)
    plt.savefig("{}/learn_curve.png".format(log_dir))
    plt.show()
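
The function above saves the trained policy to "SAC_pkl" but never reloads it. A minimal reload-and-rollout sketch (hypothetical follow-up code, assuming the same env instance is still available):

loaded_model = SAC.load("SAC_pkl")
obs = env.reset()
for _ in range(1000):
    # Deterministic actions for evaluation
    action, _states = loaded_model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()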
Example #2
def test_ddpg():
    log_dir = f"model_save/best_model_ddpg_cnn"
    env = ENV(istest=True)
    env.render = True
    env = Monitor(env, log_dir)
    model = DDPG.load(log_dir)
    plot_results(f"model_save/")
    for i in range(10):
        state = env.reset()
        while True:
            action, _states = model.predict(state)
            next_state, reward, done, info = env.step(action)
            state = next_state
            # print("trying:",i,"action:", action,"now profit:",env.profit)
            if done:
                print('stock',i,' total profit=',env.profit,' buy hold=',env.buy_hold)
                break
Example #3
def test_ppo():
    log_dir = f"model_save/best_model_ppo"
    env = ENV_CONTINUE(istest=True)
    env.render = True
    env = Monitor(env, log_dir)
    model = PPO.load(log_dir)
    plot_results(f"model_save/")
    for i in range(10):
        state = env.reset()
        day = 0
        while True:
            action, _states = model.predict(state)
            next_state, reward, done, info = env.step(action)
            state = next_state
            # print("trying:",day,"reward:", reward,"now profit:",env.profit)
            day += 1
            if done:
                print('stock',i,' total profit=',env.profit,' buy hold=',env.buy_hold)
                break
Example #4
def test_td3():
    log_dir = f"model_save/best_model_td3_sp2"
    env = ENV(istest=True)
    env.render = True
    env = Monitor(env, log_dir)
    model = TD3.load(log_dir)
    plot_results(f"model_save/")
    for i in range(10):
        state = env.reset()
        day = 0
        while True:
            action, _states = model.predict(state)
            next_state, reward, done, info = env.step(action)
            state = next_state
            # print("trying:",day,"reward:", reward,"now profit:",env.profit)
            day += 1
            if done:
                print('stock: {}, total profit: {:.2f}%, buy hold: {:.2f}%, sp: {:.4f}, mdd: {:.2f}%, romad: {:.4f}'
                      .format(i, env.profit*100, env.buy_hold*100, env.sp, env.mdd*100, env.romad))
                break
Example #5
def train(args, data):
    # extract data from the data dictionary
    observations = torch.tensor(data['observations'])
    actions = torch.tensor(data['actions'])
    next_observations = torch.tensor(data['next_observations'])
    rewards = torch.tensor(data['rewards'])
    dones = data['dones']

    in_dim = torch.cat([observations[0], actions[0]]).size(0)
    out_dim = torch.cat([next_observations[0], rewards[0]]).size(0)

    dynamics = DynamicsEnsemble(args.ensemble_size, 
                                in_dim, 
                                out_dim, 
                                args.encoder_hidden_dim,
                                args.decoder_hidden_dim,
                                args.latent_dim, 
                                args.n_hidden,
                                )

    # load saved model into dynamics ensemble
    checkpoint = torch.load(args.CHECKPOINTPATH)
    dynamics.load_state_dict(checkpoint['model_state_dict'])
    dynamics.opt.load_state_dict(checkpoint['optimizer_state_dict'])

    # define SAC agent
    agent = SAC('MlpPolicy', dynamics)

    # set up callback
    date_ = str(date.today())
    exp_code = date_ + '_' + str(np.random.randint(1000000))
    log_dir = args.write_to + '_' + exp_code
    callback = TensorboardCallback(exp_code, check_freq=100, log_dir=log_dir)

    timesteps = 1e5
    agent.learn(total_timesteps=int(timesteps), callback=callback)

    plot_results([log_dir], timesteps, results_plotter.X_TIMESTEPS, "SAC: Generative Ensembles")
    plt.show()
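
For reference, a checkpoint with the keys consumed by torch.load above could be written like this (a sketch; dynamics and its .opt optimizer attribute are the ones from the snippet):

torch.save({
    'model_state_dict': dynamics.state_dict(),
    'optimizer_state_dict': dynamics.opt.state_dict(),
}, args.CHECKPOINTPATH)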
Example #6
def plot_results(log_folder, title='Learning Curve'):
    """
    plot the results

    :param log_folder: (str) the save location of the results to plot
    :param title: (str) the title of the task to plot
    """
    x, y = ts2xy(load_results(log_folder), 'timesteps')
    y = moving_average(y, window=50)
    # Truncate x
    x = x[len(x) - len(y):]

    fig = plt.figure(title)
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title + " Smoothed")

    # built-in SB3 plotter, for comparison
    results_plotter.plot_results([log_folder], 3e5,
                                 results_plotter.X_TIMESTEPS,
                                 "TD3 LunarLander")

    plt.show()
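
plot_results above relies on a moving_average helper that is not shown in this fragment; the usual definition (as in the Stable-Baselines3 monitoring examples) is a simple convolution:

def moving_average(values, window):
    # Smooth an array of values with a rolling mean over `window` entries
    weights = np.repeat(1.0, window) / window
    return np.convolve(values, weights, 'valid')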
Example #7
    model = DDPG(CustomPolicy,
                 env,
                 verbose=1,
                 batch_size=64,
                 action_noise=action_noise)

    for i in range(step_iters):  # run for step_iters * training_timesteps

        model.learn(total_timesteps=training_timesteps)

        model.save("./models/ddpg" + str((i + 1) * training_timesteps))
        model.save_replay_buffer("./experiences/ddpg_experience" +
                                 str((i + 1) * training_timesteps))

        #### Show (and record a video of) the model's performance ##########################################
        env_test = RLTetherAviary(gui=False, record=True)
        obs = env_test.reset()
        start = time.time()
        for _ in range(10 * env_test.SIM_FREQ):  # don't shadow the outer loop index i
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env_test.step(action)
            if done: break
        env_test.close()

    env.close()

    results_plotter.plot_results([os.path.join(os.getcwd(), log_dir)],
                                 step_iters * training_timesteps,
                                 results_plotter.X_TIMESTEPS, "DDPG")

    plot_results(os.path.join(os.getcwd(), log_dir), "DDPG")
Example #8
                   env_kwargs={"t_final": 5000},
                   n_envs=1,
                   monitor_dir=log_dir)  # Parallel environments
eval_env = gym.make(env_string, t_final=5000)
# -------------------------------------------------------------------------
# Use deterministic actions for evaluation
eval_callback = EvalCallback(eval_env,
                             best_model_save_path='./logs/',
                             log_path='./logs/',
                             eval_freq=200,
                             deterministic=True,
                             render=False)

ransim_callback = CustomRansimCallback(eval_env,
                                       best_model_save_path='./logs/',
                                       log_path='./logs/',
                                       eval_freq=10 * 5 * 1e2,
                                       deterministic=True,
                                       render=False,
                                       plot_results=True)
# -------------------------------------------------------------------------
model = A2C('MlpPolicy', env, verbose=1)
timesteps = 2 * 100 * 5 * 1e2  # k*n_envs*T_Final/t_c
model.learn(total_timesteps=int(timesteps), callback=ransim_callback)
# -------------------------------------------------------------------------
plot_results([log_dir], timesteps, results_plotter.X_TIMESTEPS, "A2C ran-sim")
plt.savefig(log_dir + 'A2C_ran-sim_rewards_plot.png', format="png")
plt.show()

msa = 1
Example #9
    env = AsistEnvGym(portal_data, room_data, victim_data, "as")
    env = Monitor(env, log_dir)

    n_actions = env.action_space.shape
    # Note: this noise object is never passed to the model below (PPO does not use action noise).
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    model = PPO(MlpPolicy, env, verbose=1)
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                log_dir=log_dir)

    # model = DQN(MlpPolicy, env, verbose=1)
    timeSteps = 80000
    model.learn(total_timesteps=timeSteps, callback=callback)

    plot_results([log_dir], timeSteps, results_plotter.X_TIMESTEPS,
                 "PPO AsistEnv")
    plt.show()

    # obs = env.reset()

    # score = 0
    # done = False
    # while not done:
    #     action, _states = model.predict(obs, deterministic=True)
    #     obs, rewards, done, info = env.step(action)
    #     score += rewards
    #     env.render()
    # with open("tmp.txt", 'w') as ff:
    #     g = ff.write(str(env.visit_node_sequence))
    # print(env.visit_node_sequence)
    # print("Victim_saved:", len(env.graph.safe_victim_list))
Example #10
def using_callback_example():
    # Using Callback: Monitoring Training.

    class SaveOnBestTrainingRewardCallback(BaseCallback):
        """
		Callback for saving a model (the check is done every 'check_freq' steps)
		based on the training reward (in practice, we recommend using 'EvalCallback').

		:param check_freq:
		:param log_dir: Path to the folder where the model will be saved. It must contains the file created by the 'Monitor' wrapper.
		:param verbose: Verbosity level.
		"""
        def __init__(self, check_freq: int, log_dir: str, verbose: int = 1):
            super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
            self.check_freq = check_freq
            self.log_dir = log_dir
            self.save_path = os.path.join(log_dir, "best_model")
            self.best_mean_reward = -np.inf

        def _init_callback(self) -> None:
            # Create folder if needed.
            if self.save_path is not None:
                os.makedirs(self.save_path, exist_ok=True)

        def _on_step(self) -> bool:
            if self.n_calls % self.check_freq == 0:
                # Retrieve training reward.
                x, y = ts2xy(load_results(self.log_dir), "timesteps")
                if len(x) > 0:
                    # Mean training reward over the last 100 episodes.
                    mean_reward = np.mean(y[-100:])
                    if self.verbose > 0:
                        print(f"Num timesteps: {self.num_timesteps}")
                        print(
                            f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward per episode: {mean_reward:.2f}"
                        )

                    # New best model, you could save the agent here.
                    if mean_reward > self.best_mean_reward:
                        self.best_mean_reward = mean_reward
                        # Example for saving best model.
                        if self.verbose > 0:
                            print(f"Saving new best model to {self.save_path}")
                        self.model.save(self.save_path)

            return True

    # Create log dir.
    log_dir = "tmp/"
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment.
    env = gym.make("LunarLanderContinuous-v2")
    env = Monitor(env, log_dir)

    # Add some action noise for exploration.
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
    # Gaussian action noise encourages exploration for the off-policy TD3 agent.
    model = TD3("MlpPolicy", env, action_noise=action_noise, verbose=0)
    # Create the callback: check every 1000 steps.
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                log_dir=log_dir)
    # Train the agent.
    timesteps = 1e5
    model.learn(total_timesteps=int(timesteps), callback=callback)

    plot_results([log_dir], timesteps, results_plotter.X_TIMESTEPS,
                 "TD3 LunarLander")
    plt.show()
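
The callback stores the best weights under tmp/best_model; a short sketch of reloading and scoring them with SB3's standard evaluation helper (reusing log_dir and env from above):

from stable_baselines3.common.evaluation import evaluate_policy

best_model = TD3.load(os.path.join(log_dir, "best_model"), env=env)
mean_reward, std_reward = evaluate_policy(best_model, env, n_eval_episodes=10)
print(f"Best model mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")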
Example #11
        return ProgressBarCallback(self.pbar)

    def __exit__(self, exc_type, exc_val, exc_tb):  # close the callback
        self.pbar.n = self.total_timesteps
        self.pbar.update(0)
        self.pbar.close()


# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment
env = make_vec_env('foo-v0', n_envs=1, monitor_dir=log_dir)
tic = time.perf_counter()

# Create callbacks
save_callback = SaveOnBestTrainingRewardCallback(check_freq=10000, log_dir=log_dir)


model = stable_baselines3.DQN('MlpPolicy', env, verbose=0, learning_rate=1e-4)
# DQN.load() returns a fresh model reconstructed from the saved file, so the
# hyperparameters passed to the constructor above are effectively discarded.
model = model.load('AAA', env)


steps = 10e6
with ProgressBarManager(steps) as progress_callback:
    # This is equivalent to callback=CallbackList([progress_callback, auto_save_callback])
    model = model.learn(steps, callback=[progress_callback, save_callback])
model.save('AAA')
results_plotter.plot_results([log_dir], steps, results_plotter.X_TIMESTEPS, "DQN foo-v0")
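
This fragment only shows __exit__ and the tail of __enter__; the ProgressBarCallback it returns is typically a thin tqdm wrapper, along the lines of the SB3 callback examples (a sketch, not necessarily the original class):

from tqdm.auto import tqdm
from stable_baselines3.common.callbacks import BaseCallback

class ProgressBarCallback(BaseCallback):
    def __init__(self, pbar):
        super().__init__()
        self._pbar = pbar

    def _on_step(self) -> bool:
        # Keep the tqdm bar in sync with the number of environment steps taken
        self._pbar.n = self.num_timesteps
        self._pbar.update(0)
        return True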
Example #12
               channel_type=Channels.RGB_ONLY)
time_steps = 200000
name = "Offworld_DQN4"

env = Monitor(env, log_dir)
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
model = DQN("MlpPolicy",
            env,
            gamma=0.95,
            learning_rate=1e-3,
            verbose=0,
            buffer_size=1000,
            batch_size=16,
            exploration_fraction=0.9,
            exploration_final_eps=0.1,
            exploration_initial_eps=1.0,
            train_freq=1)
print(type(callback))
#, exploration_fraction=0.1, exploration_final_eps=0.02, exploration_initial_eps=1.0, train_freq=1
model.learn(total_timesteps=int(time_steps), callback=callback)

results_plotter.plot_results([log_dir], time_steps,
                             results_plotter.X_TIMESTEPS, name)
plt.savefig(name + '.png')
model.save(name)

model = DQN.load(name)
mean_reward = evaluate(model, num_steps=100)

env.close()
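
evaluate() used above is not part of Stable-Baselines3; a plausible definition in the style of the classic getting-started helper (assumed here, since the original is not shown):

def evaluate(model, num_steps=1000):
    # Run the model in `env` for num_steps and return the mean reward per episode
    episode_rewards = [0.0]
    obs = env.reset()
    for _ in range(num_steps):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)
    mean_reward = np.mean(episode_rewards)
    print("Mean reward:", mean_reward, "Num episodes:", len(episode_rewards))
    return mean_reward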
Example #13
                        default="CartPole-v1")
    args = parser.parse_args()

    log_dir = args.log_folder

    # Load results
    W = load_results(log_dir)
    # print("Results seed: ", W)

    # Save walltime to stats.csv
    df = pd.read_csv(log_dir + 'stats.csv')
    df["Train walltime (s)"] = W["t"].max()
    df.to_csv(log_dir + "stats.csv", index=False)
    # print(df)

    # Plot training rewards

    TIMESTEPS = 1e10

    plot_results([log_dir], TIMESTEPS, X_TIMESTEPS, args.env)
    plt.savefig(log_dir + "reward_vs_timesteps.png")
    # plt.show()

    plot_results([log_dir], TIMESTEPS, X_EPISODES, args.env)
    plt.savefig(log_dir + "reward_vs_episodes.png")
    # plt.show()

    plot_results([log_dir], TIMESTEPS, X_WALLTIME, args.env)
    plt.savefig(log_dir + "reward_vs_walltime.png")
    # plt.show()
Example #14
# Silence FutureWarnings from TensorFlow pulled in via TensorBoard
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import argparse
import os

import numpy as np
import matplotlib.pyplot as plt

from stable_baselines3.common.results_plotter import X_TIMESTEPS, plot_results

parser = argparse.ArgumentParser()
parser.add_argument('algo', type=str)
parser.add_argument('env', type=str)
parser.add_argument('exp_folder', type=str)
args = parser.parse_args()

algo = args.algo
env = args.env
log_path = os.path.join(args.exp_folder, algo)

dirs = [
    os.path.join(log_path, folder) for folder in os.listdir(log_path)
    if env in folder and os.path.isdir(os.path.join(log_path, folder))
]

try:
    plot_results(dirs, 2e6, X_TIMESTEPS, env)
except Exception as e:
    print(e)

plt.show()
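
Assuming the script above is saved as, say, plot_train.py (the file name is not given here), it is invoked with positional algo, env and experiment-folder arguments:

# python plot_train.py a2c CartPole-v1 logs/

which plots every run directory under logs/a2c/ whose name contains the environment id.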
Example #15
# Silence FutureWarnings from TensorFlow pulled in via TensorBoard (optional)
# import warnings
# warnings.filterwarnings("ignore", category=FutureWarning)

import argparse
import os
import matplotlib.pyplot as plt
from stable_baselines3.common.results_plotter import X_EPISODES, X_TIMESTEPS, X_WALLTIME, plot_results

parser = argparse.ArgumentParser()
parser.add_argument("algo", type=str)
parser.add_argument("env", type=str)
parser.add_argument("exp_folder", type=str)
parser.add_argument("axis", choices=["steps", "episodes", "time"], type=str)
args = parser.parse_args()


algo = args.algo
env = args.env
log_path = os.path.join(args.exp_folder, algo)
x_axis = {"steps": X_TIMESTEPS, "episodes": X_EPISODES, "time": X_WALLTIME}[args.axis]

dirs = [
    os.path.join(log_path, folder)
    for folder in os.listdir(log_path)
    if (env in folder and os.path.isdir(os.path.join(log_path, folder)))
]

try:
    plot_results(dirs, 2e6, x_axis, env)
except Exception as e:
    print(e)

plt.show()
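
Compared with the previous example, the extra positional axis argument selects the x-axis through the mapping above, e.g. (script name again assumed):

# python plot_train.py td3 LunarLanderContinuous-v2 logs/ steps   -> reward vs. timesteps
# python plot_train.py td3 LunarLanderContinuous-v2 logs/ time    -> reward vs. wall-clock time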
Example #16
from stable_baselines3.common import results_plotter

log_dir = "/Users/daniel/repos/CitadelsAI/logs"
# results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS, "CitadelsAI")
results_plotter.plot_results([log_dir],
                             num_timesteps=200000,
                             x_axis=results_plotter.X_TIMESTEPS,
                             task_name="CitadelsAI")
print('debug')
Example #17
                    )

                # New best model, you could save the agent here
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    # Example for saving best model
                    if self.verbose > 0:
                        print(f"Saving new best model to {self.save_path}.zip")
                    self.model.save(self.save_path)

        return True


log_dir = "./tmp/a2c-log"
os.makedirs(log_dir, exist_ok=True)

tb_logs = "./tmp/a2c-tb-log"
os.makedirs(tb_logs, exist_ok=True)

env = RCT(settings_path='configs/settings.yml')
env = Monitor(env, log_dir)

n_actions = env.action_space.shape[-1]  # unused: the action noise below is commented out
#action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

model = A2C(MlpPolicy, env, tensorboard_log=tb_logs, verbose=0)
model.learn(total_timesteps=int(5e4), callback=callback)

results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS,
                             "A2C MlpPolicy RCT")
Example #18
    # Save trained model

    print(f"Saving to {save_path}")
    model.save(f"{save_path}/{env_id}")

    if hasattr(model, 'save_replay_buffer') and args.save_replay_buffer:
        print("Saving replay buffer")
        model.save_replay_buffer(save_path)

    if normalize:
        # Important: save the running average; this normalization is needed when testing the agent
        model.get_vec_normalize_env().save(
            os.path.join(params_path, 'vecnormalize.pkl'))
        # Deprecated saving:
        # env.save_running_average(params_path)

    # plot training
    plot_results([save_path], n_timesteps, results_plotter.X_TIMESTEPS,
                 "A2C ran-sim")
    plt.savefig(save_path + 'A2C_ransim_rewards_plot.png', format="png")
    plt.show()

    # plot evaluation
    file_path = os.path.join(save_path, 'evaluations.npz')
    #file_path = 'logs/a2c/ransim-v0_3/evaluations.npz'
    # np.load(file_path)
    plot_evaluation_results(file_path, n_timesteps,
                            results_plotter.X_TIMESTEPS, "A2C_eval_ran-sim")
    plt.savefig(save_path + 'A2C_ransim_rewards_eval_plot.png', format="png")
    plt.show()
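
plot_evaluation_results is a project-specific helper; a hedged sketch of what it plausibly does, based on the evaluations.npz file that SB3's EvalCallback writes (arrays 'timesteps', 'results' and 'ep_lengths'):

def plot_evaluation_results(file_path, n_timesteps, x_axis, title):
    # n_timesteps and x_axis mirror results_plotter.plot_results but are not used in this sketch
    data = np.load(file_path)
    mean_rewards = data['results'].mean(axis=1)  # mean evaluation reward per evaluation point
    plt.figure(title)
    plt.plot(data['timesteps'], mean_rewards)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Mean evaluation reward')
    plt.title(title)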
Example #19
            policy_kwargs=policy_kwargs,
            env=env,
            verbose=0,
            action_noise=action_noise,
            learning_starts=interval,
            tensorboard_log=parent_dir + "tensorboard/")
print()

model.learn(total_timesteps=interval * icount,
            log_interval=log_interval,
            tb_log_name="TD3_{}".format(time.strftime("%Y%m%d")),
            callback=callbackList)

obs = env.reset()
dones = False
counter = []
while not dones:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    counter.append(rewards)

env.close()

print("\nFinal rewards:")
pp.pprint(env.cost())

# Plot the reward graph
if useBestCallback:
    plot_results([log_dir], interval * icount, results_plotter.X_TIMESTEPS,
                 "TD3 CityLearn")
    plt.savefig(log_dir + "/rewards.pdf")
Example #20
def sb3_plot():
    results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS, exp_name)
Example #21
# callback for model training
# saves checkpoint if current version of model is better than all before...
callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                            log_dir=CHECKPOINT_DIR,
                                            save_path=CHECKPOINT_DIR)

# training
total_timesteps = 150000
model.learn(total_timesteps=total_timesteps, callback=callback)

# we don't have to save manually when we use the callback in the model.learn call
# model.save(os.path.join(CHECKPOINT_DIR, "mlp_dqn_cartpole"))

plot_results([CHECKPOINT_DIR],
             num_timesteps=total_timesteps,
             x_axis=results_plotter.X_TIMESTEPS,
             task_name="{} DQN on {}".format(policy_name, env_name),
             figsize=(8, 4))
plt.savefig(os.path.join(CHECKPOINT_DIR, "training_progress.png"))
plt.show()

# restore model from saved...
del model

model = DQN.load(os.path.join(CHECKPOINT_DIR, "best_model"), env=env)

# testing
obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()