def test_channel_first_env(tmp_path):
    # test_cnn uses an environment with an HxWxC layout that gets transposed, but we
    # also want to work with CxHxW envs directly, without the transposing wrapper.
    SAVE_NAME = "cnn_model.zip"

    # Create an environment with transposed images (CxHxW).
    # If the underlying CNN processes the data in the wrong format,
    # it will raise a negative-dimension error when building the convolutions.
    env = FakeImageEnv(screen_height=40,
                       screen_width=40,
                       n_channels=1,
                       discrete=True,
                       channel_first=True)

    model = A2C("CnnPolicy", env, n_steps=100).learn(250)

    assert not is_vecenv_wrapped(model.get_env(), VecTransposeImage)

    obs = env.reset()

    action, _ = model.predict(obs, deterministic=True)

    model.save(tmp_path / SAVE_NAME)
    del model

    model = A2C.load(tmp_path / SAVE_NAME)

    # Check that the prediction is the same
    assert np.allclose(action, model.predict(obs, deterministic=True)[0])

    os.remove(str(tmp_path / SAVE_NAME))
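For contrast, a minimal sketch of the channel-last case, where stable-baselines3 is expected to insert the transposing wrapper itself (the n_steps value and timestep count below are arbitrary assumptions):

# Sketch only: same FakeImageEnv, but with HxWxC observations.
env_hwc = FakeImageEnv(screen_height=40, screen_width=40, n_channels=1,
                       discrete=True, channel_first=False)
model_hwc = A2C("CnnPolicy", env_hwc, n_steps=64).learn(250)
# With channel-last images, the VecTransposeImage wrapper should now be present.
assert is_vecenv_wrapped(model_hwc.get_env(), VecTransposeImage)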
Example #2
def a2c(path):
    env = make_env(HumanPlayer())

    eval_env = make_env(RandomPlayer())

    model = A2C.load(path, env, verbose=1)

    mean, std = evaluate_policy(model, eval_env, n_eval_episodes=10)
    print(f"Loaded policy: mean={mean:.2f} +/- {std}")
    # Show how well we learned by plating a game:
    obs = env.reset()
    done = False
    while not done:
        action, _state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        print(f"{info['turn']: <4} | Reward: {reward: >4} | {info['winner']}")
        env.render()
    print("done")
Example #3
def evaluate(params):

    # Load saved model
    model = A2C.load(exp_name, env=env)
    results = np.zeros(shape=(0,0))
    obs = env.reset()

    # Evaluate the agent
    episode_reward = 0
    for _ in range(params.get("test_episodes")):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        episode_reward += reward
        # Record the result before the running episode reward is reset
        result = ("Reward:", episode_reward, "Success?", info.get('is_success', False))
        results = np.append(results, result, axis=None)

        if done or info.get('is_success', False):
            episode_reward = 0.0
            obs = env.reset()

    return results
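The same evaluation can also be expressed with SB3's built-in helper; a short sketch, assuming env terminates episodes on its own:

from stable_baselines3.common.evaluation import evaluate_policy

# Sketch: lets SB3 handle the reset/step loop and the bookkeeping.
mean_reward, std_reward = evaluate_policy(model, env,
                                          n_eval_episodes=params.get("test_episodes"),
                                          deterministic=True)
print(f"Reward: {mean_reward:.2f} +/- {std_reward:.2f}")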
Example #4
def load_state(run_dir):
    """
    Function that loads the previously saved state of training 
    """

    #looking for the latest saved state
    state_dir = os.path.join(run_dir, "saved_states")
    i = max([int(f.name) for f in os.scandir(state_dir) if f.is_dir()])
    load_dir = os.path.join(state_dir, str(i))

    policy_load_path = os.path.join(load_dir, 'policy')
    rm_load_path = os.path.join(load_dir, 'rm.pth')
    data_buff_load_path = os.path.join(run_dir, 'data_buff.pth')

    args_path = os.path.join(run_dir, "config.json")
    with open(args_path) as f:
        args = argparse.Namespace()
        args.__dict__.update(json.load(f))

    with open(rm_load_path, 'rb') as f:
        reward_model = pickle.load(f)
    with open(data_buff_load_path, 'rb') as f:
        data_buffer = pickle.load(f)
    policy = A2C.load(path=policy_load_path)

    return reward_model, policy, data_buffer, i + 1
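A hypothetical save_state counterpart, mirroring the paths that load_state reads (directory layout inferred from the loader; the real project may differ):

def save_state(run_dir, i, policy, reward_model, data_buffer):
    # Sketch only: writes the same files that load_state expects.
    save_dir = os.path.join(run_dir, "saved_states", str(i))
    os.makedirs(save_dir, exist_ok=True)
    policy.save(os.path.join(save_dir, "policy"))  # A2C.save appends .zip
    with open(os.path.join(save_dir, "rm.pth"), "wb") as f:
        pickle.dump(reward_model, f)
    with open(os.path.join(run_dir, "data_buff.pth"), "wb") as f:
        pickle.dump(data_buffer, f)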
Example #5
end_loop = args.start_iter + args.step
print("START , END", start_loop, end_loop)
for i in range(start_loop, end_loop):
    print("EVAL ", i)
    avg_dis_reward_run = []
    for j in range(0, 10):
        print("SEED 0")
        # lambd = np.load(f"./{args.folder}/buffers/lambda_{args.algo}_{j}.npy")
        # N = np.load(f"./{args.folder}/buffers/N_{args.algo}_{j}.npy")
        model_name = f"./{args.folder}/models/model_{args.algo}_{j}_{i}"
        #print ("Lambd N i ", lambd[i], N[i])
        env.set_N(int(N[i]), list(lambd[i]))
        if args.algo == 0:
            model = PPO.load(model_name, env)
        elif args.algo == 1:
            model = A2C.load(model_name, env)
        elif args.algo == 2:
            model = SAC.load(model_name, env)
        elif args.algo == 3:
            thres_vec = np.load(
                f"./{args.folder}/buffers/thresvec_{args.env_name}_{j}.npy"
            )
            model.set_threshold_vec(thres_vec[i])
        avg_dis_reward = 0.0
        for k in range(100):
            env.seed(k)
            obs = env.reset()
            reward_traj = []
            dis_reward = 0.0
            for t in range(int(1e3)):
                if args.algo == 3:
Example #6
def __init__(self, model_path, env):
    self.model = A2C.load(model_path, env)
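A plausible companion method for such a wrapper class, shown only as a sketch (the name act is an assumption, not from the source):

def act(self, obs, deterministic=True):
    # Hypothetical helper: thin wrapper around model.predict.
    action, _ = self.model.predict(obs, deterministic=deterministic)
    return action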
Example #7
import gym

from stable_baselines3 import A2C
from stable_baselines3.a2c import MlpPolicy, CnnPolicy
from stable_baselines3.common.cmd_util import make_vec_env

# Parallel environments
# env = make_vec_env('SpaceInvaders-v0', n_envs=4)
# env = gym.make('SpaceInvaders-v0')
env = gym.make('Pong-v0')

# model = A2C(MlpPolicy, env, verbose=1)
#model = A2C(CnnPolicy, env, verbose=1)
model = A2C.load("a2c_pong")
#model.set_env(env)
#model.learn(total_timesteps=50000)
#model.save("a2c_pong")

#del model # remove to demonstrate saving and loading

obs = env.reset()

score = 0
wins = 0

while True:
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    #env.render()
    score = score + 1
    if rewards > 0:
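The snippet above is cut off at the reward check; a plausible continuation given the score and wins counters defined earlier (this is an assumption, not the original code):

        # Hypothetical continuation of the truncated loop above.
        wins += 1
    if done:
        print(f"score: {score}, wins: {wins}")
        obs = env.reset()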
Example #8
def process(file):
    env = gym.make('PerigeeRaising-Continuous3D-v0')
    env = NormalizeObservationSpace(env, lambda o: o / env.unwrapped.observation_space.high)
    env = Monitor(env)
    env.seed(42)
    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(get_action_dim(env.action_space))
    evaluate_policy(agent, env, n_eval_episodes=1)

    hist_sc_state = env.unwrapped.hist_sc_state
    hist_action = env.unwrapped.hist_action
    time = np.array(list(map(lambda sc_state: sc_state.getDate().durationFrom(hist_sc_state[0].getDate()),
                             hist_sc_state))) / 3600.0  # Convert to hours
    a = np.array(list(map(lambda sc_state: sc_state.getA(), hist_sc_state))) / 1000.0  # Convert to km
    e = np.array(list(map(lambda sc_state: sc_state.getE(), hist_sc_state)))
    mass = np.array(list(map(lambda sc_state: sc_state.getMass(), hist_sc_state)))
    ra = a * (1.0 + e)
    rp = a * (1.0 - e)
    v = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getVelocity().toArray(), hist_sc_state)))
    h = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getMomentum().toArray(), hist_sc_state)))
    angle_f_v = list(map(lambda q:
                         np.degrees(np.arccos(
                             np.dot(q[0], q[1]) / np.linalg.norm(q[0]) / (np.linalg.norm(q[1]) + 1e-10)
                         )),
                         zip(v, hist_action)))
    hist_action_plane = list(map(lambda q: q[1] - np.dot(q[1], q[0]) * q[0] / (np.linalg.norm(q[0]) ** 2),
                                 zip(h, hist_action)))
    angle_fp_v = list(map(lambda q:
                          np.degrees(np.arccos(
                              np.dot(q[0], q[1] * [1, 1, 0]) / np.linalg.norm(q[0]) / (
                                      np.linalg.norm(q[1] * [1, 1, 0]) + 1e-10)
                          )),
                          zip(v, hist_action_plane)))

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=ra[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(ra[0] - 20.0, ra[0] + 20.0)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("ra (km)")
    axs.plot(time, ra, "k")
    plt.tight_layout()
    fig.savefig("plan_ra.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=rp[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(rp[0] - 5.0, rp[0] + 35.0)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("rp (km)")
    axs.plot(time, rp, "k")
    plt.tight_layout()
    fig.savefig("plan_rp.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=mass[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(mass[0] - 0.04, mass[0])
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("mass (kg)")
    axs.plot(time, mass, "k")
    plt.tight_layout()
    fig.savefig("plan_m.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain')
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(-1.3, 1.3)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("action")
    l1, l2, l3 = axs.plot(time[0:-1], hist_action, "k")
    l1.set_color("#000000")
    l2.set_color("#777777")
    l3.set_color("#BBBBBB")
    axs.legend(["Act1", "Act2", "Act3"], loc='upper left')
    plt.tight_layout()
    fig.savefig("plan_action.pdf", format="pdf")
    plt.close(fig)
Example #9
                        type=str,
                        help='Help (default: ..)',
                        metavar='')
    ARGS = parser.parse_args()

    #### Load the model from file ##############################
    algo = ARGS.exp.split("-")[2]

    if os.path.isfile(ARGS.exp + '/success_model.zip'):
        path = ARGS.exp + '/success_model.zip'
    elif os.path.isfile(ARGS.exp + '/best_model.zip'):
        path = ARGS.exp + '/best_model.zip'
    else:
        print("[ERROR]: no model under the specified path", ARGS.exp)
    if algo == 'a2c':
        model = A2C.load(path)
    if algo == 'ppo':
        model = PPO.load(path)
    if algo == 'sac':
        model = SAC.load(path)
    if algo == 'td3':
        model = TD3.load(path)
    if algo == 'ddpg':
        model = DDPG.load(path)

    #### Parameters to recreate the environment ################
    env_name = ARGS.exp.split("-")[1] + "-aviary-v0"
    OBS = ObservationType.KIN if ARGS.exp.split(
        "-")[3] == 'kin' else ObservationType.RGB
    if ARGS.exp.split("-")[4] == 'rpm':
        ACT = ActionType.RPM
Example #10
# Step 3.b Passing through Normalization and stack frame (Optional)

env = VecFrameStack(
    env,
    n_stack=custom_params['FRAME_STACK'])  # Use 1 for now because we use image
if not custom_params['USING_VAE']:
    env = VecTransposeImage(env)  # Uncomment if using 3d obs
if custom_params['USING_NORMALIZATION']:
    env = VecNormalize.load(osp.join(results_dir, "vec_normalization.pkl"),
                            env)

# Load the agent
if custom_params['algo'] == 'sac':
    model = SAC.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'a2c':
    model = A2C.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'dqn':
    model = DQN.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'ppo':
    model = PPO.load(osp.join(results_dir, "best_model", "best_model.zip"))

else:
    raise ValueError("Error model")

# Load the saved statistics
#  do not update them at test time
env.training = False
# reward normalization is not needed at test time
env.norm_reward = False

obs = env.reset()
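For reference, a sketch of how the vec_normalization.pkl loaded above is typically produced on the training side (env_fn is a hypothetical environment factory; the normalization flags are assumptions):

from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

# Hypothetical training-side setup that would create the statistics file.
train_env = DummyVecEnv([env_fn])
train_env = VecNormalize(train_env, norm_obs=True, norm_reward=True)
# ... train the model on train_env ...
train_env.save(osp.join(results_dir, "vec_normalization.pkl"))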
Example #11
from stable_baselines3.common.evaluation import evaluate_policy

# Create environment
#env = gym.make('LunarLander-v2')
env = gym.make('ransim-v0')

# Instantiate the agent
model = A2C('MlpPolicy', env, verbose=1)
# Train the agent
model.learn(total_timesteps=int(2e3), eval_log_path='log_msa')
# Save the agent
model.save("a2c_ran")
del model  # delete trained model to demonstrate loading

# Load the trained agent
model = A2C.load("a2c_ran")

# Evaluate the agent
#mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, return_episode_rewards=False)
episode_rewards, episode_lengths = evaluate_policy(model,
                                                   env,
                                                   n_eval_episodes=10,
                                                   return_episode_rewards=True)
#print('mean_reward: %.3f  std_reward: %.3f' %(mean_reward, std_reward))
msa = 1
'''# Enjoy trained agent
obs = env.reset()
for i in range(2000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()'''
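Since return_episode_rewards=True yields the raw per-episode lists, the summary that the commented-out line intended can be recovered directly (a small sketch):

import numpy as np

mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
print('mean_reward: %.3f  std_reward: %.3f' % (mean_reward, std_reward))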
Example #12
            n_steps=config["n_steps"],
            vf_coef=config["vf_coef"],
            ent_coef = config["ent_coef"],
            max_grad_norm=config["max_grad_norm"],
            learning_rate=lr,
            rms_prop_eps=config["epsilon"],
            use_rms_prop=config["use_rms_prop"],
            use_sde=config["use_sde"],
            normalize_advantage=config["normalize_advantage"],
            verbose=config["verbose"],
            tensorboard_log="tb/{}/".format(config["session_ID"]),
            policy_kwargs=dict(net_arch=[int(config["policy_hid_dim"]), int(config["policy_hid_dim"])]))
        
        model.learn(learn_total_steps)
        model.save("learned/{}".format(config["session_ID"]))
        env.save("learned/{}.pkl".format(config["session_ID"]))

        env.close()
    else:
        model = A2C.load("learned/{}".format(config["session_ID"]))
        env = DummyVecEnv([lambda: HumanoidBulletEnv(animate=True, max_steps=env_max_steps)])
        env = VecNormalize.load(("learned/{}.pkl".format(config["session_ID"])), env)
        env.training = False
        env.norm_reward = False

        obs = env.reset()
        for i in range(env_max_steps):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
        env.close()
Пример #13
0
def load_model(self):
    print('Loading model from: {}'.format(self.model_path))
    model = A2C.load(self.model_path)
    model.set_env(self.env)
    model.tensorboard_log = self.log_dir
    return model
Example #14
        model = A2C('CnnPolicy', env,
                    gamma=0.8,
                    learning_rate=5e-4,
                    verbose=1,
                    tensorboard_log="logs/")
        model.learn(total_timesteps=int(2e5))
        model.save("a2c_highway")
        # model = A2C('CnnPolicy', env).learn(total_timesteps=int(2e5))
        # model.save("a2c_highway_basic")
        # model.save("a2c_highway_policy5")

    # Record video

    # env.configure({"policy_frequency": 15, "duration": 20 * 15})
    # model = A2C.load("a2c_highway_policy5")
    model = A2C.load("a2c_highwayv0")
    # model = A2C.load("a2c_highway_basic")
    # env.configure({"policy_frequency": 15, "duration": 20 * 15})
    # video_length = 2 * env.config["duration"]
    # env = VecVideoRecorder(env, "videos/",
    #                        record_video_trigger=lambda x: x == 0, video_length=video_length,
    #                        name_prefix="dqn-agent")

    evaluate(env, model)

    for _ in range(5):
        obs = env.reset()
        done = False
        while not done:
            action, _ = model.predict(obs)
            print(action)
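The rollout loop above is cut short; a plausible continuation that actually steps and renders the environment (an assumption, not the original code):

            # Hypothetical continuation of the truncated while-loop above.
            obs, reward, done, info = env.step(action)
            env.render()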
Example #15
import gym

from stable_baselines3 import A2C
from stable_baselines3.a2c import MlpPolicy
from stable_baselines3.common.env_util import make_vec_env

# Parallel environments
env = make_vec_env('CartPole-v1', n_envs=4)

model = A2C(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("a2c_cartpole")

del model  # remove to demonstrate saving and loading

model = A2C.load("a2c_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
Example #16
def process(file):
    env = gym.make('PerigeeRaising-Continuous3D-v0')
    env.unwrapped._ref_sv[2] = 0.0
    env.unwrapped._ref_sv[3] = 0.0
    env.unwrapped._ref_sv[4] = 0.0
    env = NormalizeObservationSpace(
        env, lambda o: o / env.unwrapped.observation_space.high)
    env = Monitor(env)
    env.seed(42)
    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(
        get_action_dim(env.action_space))
    evaluate_policy(agent, env, n_eval_episodes=1)

    hist_sc_state = env.unwrapped.hist_sc_state
    hist_action = env.unwrapped.hist_action
    x = np.array(
        list(
            map(
                lambda sc_state: sc_state.getPVCoordinates().getPosition().
                getX(), hist_sc_state))) / 1000.0  # Convert to km
    y = np.array(
        list(
            map(
                lambda sc_state: sc_state.getPVCoordinates().getPosition().
                getY(), hist_sc_state))) / 1000.0  # Convert to km

    env2 = gym.make('PerigeeRaising-Continuous3D-v0')
    env2.unwrapped._ref_sv[0] = 11000000.0 / 1.05
    env2.unwrapped._ref_sv[1] = 0.05
    env2.unwrapped._ref_sv[2] = 0.0
    env2.unwrapped._ref_sv[3] = 0.0
    env2.unwrapped._ref_sv[4] = 0.0
    env2 = NormalizeObservationSpace(
        env2, lambda o: o / env2.unwrapped.observation_space.high)
    env2 = Monitor(env2)
    env2.seed(42)
    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(
        get_action_dim(env.action_space))
    evaluate_policy(agent, env2, n_eval_episodes=1)

    hist_sc_state2 = env2.unwrapped.hist_sc_state
    hist_action2 = env2.unwrapped.hist_action
    x2 = np.array(
        list(
            map(
                lambda sc_state: sc_state.getPVCoordinates().getPosition().
                getX(), hist_sc_state2))) / 1000.0  # Convert to km
    y2 = np.array(
        list(
            map(
                lambda sc_state: sc_state.getPVCoordinates().getPosition().
                getY(), hist_sc_state2))) / 1000.0  # Convert to km

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.set_xlim(-12000, 12000)
    axs.set_ylim(-12000, 12000)
    axs.grid(False)
    axs.plot(x, y, "k", zorder=2)
    l2, = axs.plot(x2, y2, zorder=1)
    l2.set_color("#777777")
    axs.legend(["Before", "After"],
               loc='upper right',
               frameon=False,
               bbox_to_anchor=(0.0, 1.0))
    im = mpimg.imread('earth.png')
    plt.imshow(im, extent=[-6400, 6400, -6400, 6400], interpolation="none")
    axs.set_aspect('equal')
    plt.text(11000, 0, "Pericenter")
    plt.text(-18500, 0, "Apocenter")
    plt.axis('off')
    plt.tight_layout()
    fig.savefig("orbit.pdf", format="pdf")
    plt.close(fig)
Example #17
def compare_models(ticker):
    # initialize structures for evaluation
    train_data_path = '../data/{}_train.csv'.format(ticker.lower())
    val_data_path = '../data/{}_validation.csv'.format(ticker.lower())
    train_data = pd.read_csv(train_data_path)
    val_data = pd.read_csv(val_data_path)
    val_data['Date'] = pd.to_datetime(val_data['Date'])
    env = SingleStockTradingEnv(train_data_path,
                                engineer_features,
                                initial_value=INITIAL_PORTFOLIO_VALUE,
                                borrowing=BORROWING,
                                long_only=LONG_ONLY)

    # run evaluation for just RL agent
    rl_checkpoint_path = 'checkpoints/{}_rl_no_restrictions'.format(
        ticker.lower())
    a2c = A2C.load(rl_checkpoint_path)
    rl_portfolio_values, rl_agent_holdings, rl_agent_actions, rl_goal_num_shares, rl_fig = evaluate(
        a2c,
        ticker,
        val_data,
        INITIAL_PORTFOLIO_VALUE,
        BORROWING,
        LONG_ONLY,
        use_gp=False,
        plot=True,
        show_plots=False,
        save_plots=False,
        env_type='no_restrictions')
    # get features for GP's
    lookback = 5
    train_features = engineer_features(train_data, lookback=lookback)

    # turn data in dataframes into model inputs
    X_train = torch.Tensor(
        train_features.drop(
            ['Date', 'Volume', 'Returns', 'Close', f'Open -{lookback}'],
            axis=1).values)
    y_train = torch.Tensor(train_features['Returns'].values)

    gp_params = {
        'n_train': 20,
        'training_iter': 10,
        'cvar_limit': -5,  # maximum loss tolerance %
        'gp_limit': 0.3,  # predicted magnitude of GPR such that GP takes over
        'data': {
            'X_train': X_train[-20:],
            'y_train': y_train[-20:]
        }  # last month's worth of data?
    }

    # run evaluation for RL w/ GP agent
    a2c_gp = TradingAgent(use_gp=True,
                          gp_params=gp_params,
                          policy='MlpPolicy',
                          env=env)
    a2c_gp.load(rl_path=rl_checkpoint_path)
    a2c_gp.learn(5000)
    a2c_gp.save(rl_path='checkpoints/{}_a2c_gp_no_restrictions_rl'.format(
        ticker.lower()),
                gp_path='checkpoints/{}_a2c_gp_no_restrictions_gp'.format(
                    ticker.lower()))
    gp_portfolio_values, gp_agent_holdings, gp_agent_actions, gp_goal_num_shares, gp_fig = evaluate(
        a2c_gp,
        ticker,
        val_data,
        INITIAL_PORTFOLIO_VALUE,
        BORROWING,
        LONG_ONLY,
        use_gp=True,
        plot=True,
        show_plots=False,
        save_plots=False,
        env_type='no_restrictions')
    # plot some stuff that might be interesting to look at
    comp_fig = plt.figure(figsize=(20, 5))
    plt.plot(val_data['Date'].iloc[6:],
             np.exp(val_data['Returns'].iloc[6:].cumsum()) *
             INITIAL_PORTFOLIO_VALUE,
             label='Buy and Hold')
    plt.plot(val_data['Date'].iloc[6:], rl_portfolio_values, label='A2C')
    plt.plot(val_data['Date'].iloc[6:], gp_portfolio_values, label='A2C + GP')
    plt.title('Performance Comparison - {}'.format(ticker))
    plt.xlabel('Date')
    plt.ylabel('Portfolio Value')
    plt.legend()

    actions_fig = plt.figure(figsize=(20, 5))
    plt.plot(val_data['Date'].iloc[6:], rl_agent_actions, label='RL Actions')
    plt.plot(val_data['Date'].iloc[6:], gp_agent_actions, label='GP Actions')
    plt.title('Actions Comparison - {}'.format(ticker))
    plt.legend()

    shares_fig = plt.figure(figsize=(20, 5))
    plt.plot(val_data['Date'].iloc[6:],
             rl_agent_holdings,
             label='RL Current # Shares')
    plt.plot(val_data['Date'].iloc[6:],
             gp_goal_num_shares,
             label='GP Target # Shares')
    plt.plot(val_data['Date'].iloc[6:],
             gp_agent_holdings,
             label='GP Current # Shares')
    plt.title('Holdings Comparison - {}'.format(ticker))
    plt.legend()

    # plt.show()

    # save figures
    # rl_fig.savefig('figures/{}_rl_base_no_restrictions.pdf'.format(ticker.lower()), bbox_inches='tight')
    gp_fig.savefig('figures/{}_rl_with_gp_no_restrictions.pdf'.format(
        ticker.lower()),
                   bbox_inches='tight')
    comp_fig.savefig('figures/{}_rl_gp_comparison_no_restrictions.pdf'.format(
        ticker.lower()),
                     bbox_inches='tight')
    actions_fig.savefig(
        'figures/{}_actions_comparison_no_restrictions.pdf'.format(
            ticker.lower()),
        bbox_inches='tight')
    shares_fig.savefig(
        'figures/{}_num_shares_comparison_no_restrictions.pdf'.format(
            ticker.lower()),
        bbox_inches='tight')

    # Calculate and output Sharpe ratios (assume risk-free rate is 0)
    base_log_returns = np.diff(np.log(val_data['Adj Close']))
    base_daily_vol = np.std(base_log_returns)
    base_sharpe = np.sqrt(252) * np.mean(base_log_returns) / base_daily_vol

    rl_log_returns = np.diff(np.log(rl_portfolio_values))
    rl_daily_vol = np.std(rl_log_returns)
    rl_sharpe = np.sqrt(252) * np.mean(rl_log_returns) / rl_daily_vol

    gp_log_returns = np.diff(np.log(gp_portfolio_values))
    gp_daily_vol = np.std(gp_log_returns)
    gp_sharpe = np.sqrt(252) * np.mean(gp_log_returns) / gp_daily_vol

    print('Base: {:.4f}, {:.4f}\tA2C: {:.4f}, {:.4f}\tA2C+GP: {:.4f}, {:.4f}'.
          format(base_sharpe, base_daily_vol, rl_sharpe, rl_daily_vol,
                 gp_sharpe, gp_daily_vol))
Example #18
            # Download from given bucket (gcloud configured with privileges)
            client = gcloud.init_storage_client()
            bucket_name = args.model.split('/')[2]
            model_path = args.model.split(bucket_name + '/')[-1]
            gcloud.read_from_bucket(client, bucket_name, model_path)
            model_path = './' + model_path
        else:
            model_path = args.model

        model = None
        if args.algorithm == 'DQN':
            model = DQN.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'DDPG':
            model = DDPG.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'A2C':
            model = A2C.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'PPO':
            model = PPO.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'SAC':
            model = SAC.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'TD3':
            model = TD3.load(model_path, tensorboard_log=args.tensorboard)
        else:
            raise RuntimeError('Algorithm specified is not registered.')

        model.set_env(env)

    # ---------------------------------------------------------------------------- #
    #       Calculating total training timesteps based on number of episodes       #
    # ---------------------------------------------------------------------------- #
    n_timesteps_episode = env.simulator._eplus_one_epi_len / \
Example #19
def load(self, rl_path, gp_path=None):
    self.rl = A2C.load(rl_path)
    if gp_path is not None:
        state_dict = torch.load(gp_path)
        self.gp.load_state_dict(state_dict)
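Elsewhere in this listing (Example #17) the same agent class is saved via a2c_gp.save(rl_path=..., gp_path=...); a sketch of what that method plausibly looks like, mirroring the loader above:

def save(self, rl_path, gp_path=None):
    # Sketch only: inverse of the load method above.
    self.rl.save(rl_path)
    if gp_path is not None:
        torch.save(self.gp.state_dict(), gp_path)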
Example #20
    # Download from given bucket (gcloud configured with privileges)
    client = gcloud.init_storage_client()
    bucket_name = args.model.split('/')[2]
    model_path = args.model.split(bucket_name + '/')[-1]
    gcloud.read_from_bucket(client, bucket_name, model_path)
    model_path = './' + model_path
else:
    model_path = args.model

model = None
if args.algorithm == 'DQN':
    model = DQN.load(model_path)
elif args.algorithm == 'DDPG':
    model = DDPG.load(model_path)
elif args.algorithm == 'A2C':
    model = A2C.load(model_path)
elif args.algorithm == 'PPO':
    model = PPO.load(model_path)
elif args.algorithm == 'SAC':
    model = SAC.load(model_path)
elif args.algorithm == 'TD3':
    model = TD3.load(model_path)
else:
    raise RuntimeError('Algorithm specified is not registered.')

# ---------------------------------------------------------------------------- #
#                             Execute loaded agent                             #
# ---------------------------------------------------------------------------- #
for i in range(args.episodes):
    obs = env.reset()
    rewards = []
Example #21
def load(cls, filename, **kwargs):
    rlberry_a2c_wrapper = cls(**kwargs)
    rlberry_a2c_wrapper.wrapped = A2CStableBaselines.load(filename)
    return rlberry_a2c_wrapper
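The matching save method would presumably just delegate to the wrapped model (an assumption; the classmethod decorator and the rest of the wrapper class are not shown in the snippet):

def save(self, filename):
    # Hypothetical counterpart to the load classmethod above.
    self.wrapped.save(filename)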
Example #22
import gym
import os
from stable_baselines3.common.monitor import Monitor as M
from stable_baselines3 import A2C
from random import randint
from csv import reader

model = A2C.load("./best_models/combined_600_1000")

env = gym.make('LunarLander-v2')

# read csv file as a list of lists
with open('./moderate_dataset/urgan_test_samples.csv', 'r') as read_obj:
    # pass the file object to reader() to get the reader object
    csv_reader = reader(read_obj)
    # Pass reader object to list() to get a list of lists
    list_of_rows = list(csv_reader)

test_samples = [[float(j) for j in i] for i in list_of_rows]

TEST_LEVEL_NUMS = 20

cumulated_reward_ls = []
last_reward_ls = []

for i in range(TEST_LEVEL_NUMS):
    env.load_terrain(test_samples[i])
    init_position = randint(1, 18)
    env.set_initial_x(init_position)
    # Logs will be saved in log_dir/monitor.csv
    obs = env.reset()
Example #23
env_id = 'PongNoFrameskip-v4'
video_folder = 'logs/videos/'
video_length = 1000
nEnv = 8
startFresh = False
if startFresh:
    env = make_atari_env(env_id, n_envs=nEnv, seed=0)
    env = VecFrameStack(env, n_stack=4)
    env.reset()
    model = A2C('CnnPolicy', env, verbose=1)
    model.learn(total_timesteps=25000)
    model.save("a2c_pong_{}".format(model.num_timesteps))
    record_video(env_id,
                 model,
                 video_length=500,
                 prefix='ac2_' + env_id,
                 video_folder='videos/')
else:
    env = make_atari_env(env_id, n_envs=nEnv, seed=0)
    env = VecFrameStack(env, n_stack=4)
    env.reset()
    trained_model = A2C.load("a2c_pong_200000", verbose=1)
    trained_model.set_env(env)
    trained_model.learn(total_timesteps=1000, reset_num_timesteps=False)
    trained_model.save("a2c_pong_{}".format(trained_model.num_timesteps))
    record_video(env_id,
                 trained_model,
                 video_length=500,
                 prefix='ac2_' + env_id,
                 video_folder='videos/')
Example #24
        model = PPO('MlpPolicy', env=env, verbose=1)

    model.learn(total_timesteps=timesteps)
    model.save("model_cups")


def act(env, model):
    # env is deterministic as in if I say "go right" the gripper will go right all the time.
    obs = env.reset()
    for i in range(100):
        env.render()
        action, _states = model.predict(obs, deterministic=True)
        # print(action)
        obs, reward, done, info = env.step(action)
        if done:
            print('[FINAL] obs=', obs, 'reward=', reward, 'done=', done)
            break


type = "DQN"
TIME_STEPS = 50000
env = gym.make('CupsWorld-v0')
# train(env, type, TIME_STEPS)
if type == "A2C":
    model = A2C.load('model_cups')
elif type == "DQN":
    model = DQN.load('model_cups')
elif type == "PPO":
    model = PPO.load('model_cups')
act(env, model)
env.close()
Example #25
def __init__(self):
    self.env = A2CAgent.create_env(1)
    self.model = A2C.load(MODEL_PATH)
Example #26
def process(file):
    env = gym.make('PerigeeRaising-Continuous3D-v0',
                   use_perturbations=True,
                   perturb_action=True)
    env = NormalizeObservationSpace(
        env, lambda o: o / env.unwrapped.observation_space.high)
    env = Monitor(env)
    env.seed(42)
    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(
        get_action_dim(env.action_space))
    evaluate_policy(agent, env, n_eval_episodes=1)

    hist_sc_state = env.unwrapped.hist_sc_state
    hist_action = env.unwrapped.hist_action
    time = np.array(
        list(
            map(
                lambda sc_state: sc_state.getDate().durationFrom(hist_sc_state[
                    0].getDate()),
                hist_sc_state))) / 3600.0  # Convert to hours
    a = np.array(list(map(lambda sc_state: sc_state.getA(),
                          hist_sc_state))) / 1000.0  # Convert to km
    e = np.array(list(map(lambda sc_state: sc_state.getE(), hist_sc_state)))
    mass = np.array(
        list(map(lambda sc_state: sc_state.getMass(), hist_sc_state)))
    ra = a * (1.0 + e)
    rp = a * (1.0 - e)

    env2 = gym.make('PerigeeRaising-Continuous3D-v0')
    env2 = NormalizeObservationSpace(
        env2, lambda o: o / env2.unwrapped.observation_space.high)
    env2 = Monitor(env2)
    env2.seed(42)
    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(
        get_action_dim(env.action_space))
    evaluate_policy(agent, env2, n_eval_episodes=1)

    hist_sc_state2 = env2.unwrapped.hist_sc_state
    hist_action2 = env2.unwrapped.hist_action
    time2 = np.array(
        list(
            map(
                lambda sc_state: sc_state.getDate().durationFrom(
                    hist_sc_state2[0].getDate()),
                hist_sc_state2))) / 3600.0  # Convert to hours
    a2 = np.array(list(map(lambda sc_state: sc_state.getA(),
                           hist_sc_state2))) / 1000.0  # Convert to km
    e2 = np.array(list(map(lambda sc_state: sc_state.getE(), hist_sc_state2)))
    mass2 = np.array(
        list(map(lambda sc_state: sc_state.getMass(), hist_sc_state2)))
    ra2 = a2 * (1.0 + e2)
    rp2 = a2 * (1.0 - e2)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=ra[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(ra[0] - 20.0, ra[0] + 20.0)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("ra (km)")
    l2, = axs.plot(time2, ra2, "--")
    l2.set_color("#777777")
    axs.plot(time, ra, "k")
    axs.legend(["Planned", "Real"], loc='upper left')
    plt.tight_layout()
    fig.savefig("real_ra.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=rp[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(rp[0] - 5.0, rp[0] + 35.0)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("rp (km)")
    l2, = axs.plot(time2, rp2, "--")
    l2.set_color("#777777")
    axs.plot(time, rp, "k")
    axs.legend(["Planned", "Real"], loc='upper left')
    plt.tight_layout()
    fig.savefig("real_rp.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=mass[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(mass[0] - 0.04, mass[0])
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("mass (kg)")
    l2, = axs.plot(time2, mass2, "--")
    l2.set_color("#777777")
    axs.plot(time, mass, "k")
    axs.legend(["Planned", "Real"], loc='upper right')
    plt.tight_layout()
    fig.savefig("real_m.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain')
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(-1.3, 1.3)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("action")
    l1, l2, l3 = axs.plot(time[0:-1], hist_action)
    l1.set_color("#000000")
    l2.set_color("#777777")
    l3.set_color("#BBBBBB")
    axs.legend(["Act1", "Act2", "Act3"], loc='upper left')
    plt.tight_layout()
    fig.savefig("real_action.pdf", format="pdf")
    plt.close(fig)
Example #27
                    learning_rate=linear_schedule(0.0001),
                    seed=1,
                    gamma=gamma)

        model.learn(total_timesteps=1000000, callback=reward_callback)
        model.save(current_path + "/models/model_" + dt_string)

    elif mode == "train_on_pretrained":

        # Loading pre-trained agent
        model_files = [
            f for f in listdir(current_path + "/models")
            if isfile(join(current_path + "/models", f))
        ]
        model_pre_trained = A2C.load(
            current_path + "/models/" +
            model_files[0])  # Loading the most recently saved agent
        model_pre_trained.set_env(env=env)
        model_pre_trained.learn(total_timesteps=1000000,
                                callback=reward_callback)

    elif mode == "test":

        total_test_episodes = 100

        model_files = [
            f for f in listdir(current_path + "/models")
            if isfile(join(current_path + "/models", f))
        ]
        model = A2C.load(
            current_path + "/models/" +
                    env,
                    gamma=0.8,
                    learning_rate=5e-4,
                    verbose=1,
                    tensorboard_log="logs/")
        model.learn(total_timesteps=int(2e5))
        model.save("a2c_multiv3")
        # model = A2C('CnnPolicy', env).learn(total_timesteps=int(2e5))
        # model.save("a2c_highway_basic")
        # model.save("a2c_highway_policy5")

    # Record video

    # env.configure({"policy_frequency": 15, "duration": 20 * 15})
    # model = A2C.load("a2c_highway_policy5")
    model = A2C.load("a2c_multiv2")
    # model = A2C.load("a2c_highway_basic")
    # env.configure({"policy_frequency": 15, "duration": 20 * 15})
    # video_length = 2 * env.config["duration"]
    # env = VecVideoRecorder(env, "videos/",
    #                        record_video_trigger=lambda x: x == 0, video_length=video_length,
    #                        name_prefix="dqn-agent")

    evaluate(env, model)
    print("End")
    for _ in range(5):
        obs = env.reset()
        done = False
        while not done:
            action, _ = model.predict(obs)
            print(action)