Example #1
def test_performance_her(online_sampling, n_bits):
    """
    Test that DQN+HER can solve the BitFlippingEnv.
    It should not work when n_sampled_goal=0 (DQN alone).
    """
    env = BitFlippingEnv(n_bits=n_bits, continuous=False)

    model = DQN(
        "MultiInputPolicy",
        env,
        replay_buffer_class=HerReplayBuffer,
        replay_buffer_kwargs=dict(
            n_sampled_goal=5,
            goal_selection_strategy="future",
            online_sampling=online_sampling,
            max_episode_length=n_bits,
        ),
        verbose=1,
        learning_rate=5e-4,
        train_freq=1,
        learning_starts=100,
        exploration_final_eps=0.02,
        target_update_interval=500,
        seed=0,
        batch_size=32,
        buffer_size=int(1e5),
    )

    model.learn(total_timesteps=5000, log_interval=50)

    # 90% training success
    assert np.mean(model.ep_success_buffer) > 0.90
Example #2
def test_dqn_train_with_batch_norm():
    model = DQN(
        "MlpPolicy",
        "CartPole-v1",
        policy_kwargs=dict(net_arch=[16, 16], features_extractor_class=FlattenBatchNormDropoutExtractor),
        learning_starts=0,
        seed=1,
        tau=0,  # do not clone the target
    )

    (
        q_net_bias_before,
        q_net_running_mean_before,
        q_net_target_bias_before,
        q_net_target_running_mean_before,
    ) = clone_dqn_batch_norm_stats(model)

    model.learn(total_timesteps=200)

    (
        q_net_bias_after,
        q_net_running_mean_after,
        q_net_target_bias_after,
        q_net_target_running_mean_after,
    ) = clone_dqn_batch_norm_stats(model)

    assert ~th.isclose(q_net_bias_before, q_net_bias_after).all()
    assert ~th.isclose(q_net_running_mean_before, q_net_running_mean_after).all()

    assert th.isclose(q_net_target_bias_before, q_net_target_bias_after).all()
    assert th.isclose(q_net_target_running_mean_before, q_net_target_running_mean_after).all()
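The clone_dqn_batch_norm_stats helper is not shown in this example; a minimal sketch of what it could look like, assuming the custom FlattenBatchNormDropoutExtractor stores its BatchNorm layer under a batch_norm attribute on both the online and target Q-networks:

from stable_baselines3 import DQN

def clone_dqn_batch_norm_stats(model: DQN):
    # Online Q-network: copy the BatchNorm bias and running mean
    # (assumes the feature extractor exposes a `batch_norm` module).
    batch_norm = model.policy.q_net.features_extractor.batch_norm
    bias, running_mean = batch_norm.bias.clone(), batch_norm.running_mean.clone()

    # Target Q-network: same statistics, used to check that tau=0 leaves it untouched.
    target_batch_norm = model.policy.q_net_target.features_extractor.batch_norm
    target_bias = target_batch_norm.bias.clone()
    target_running_mean = target_batch_norm.running_mean.clone()

    return bias, running_mean, target_bias, target_running_mean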
Example #3
def basic_usage_example():
    # Basic Usage: Training, Saving, Loading.

    # Create environment.
    env = gym.make("LunarLander-v2")

    # Instantiate the agent.
    model = DQN("MlpPolicy", env, verbose=1)
    # Train the agent.
    model.learn(total_timesteps=int(2e5))
    # Save the agent.
    model.save("dqn_lunar")
    del model  # Delete trained model to demonstrate loading.

    # Load the trained agent.
    # NOTE: if you have loading issues, you can pass 'print_system_info=True'
    # to compare the system on which the model was trained vs the current one.
    #model = DQN.load("dqn_lunar", env=env, print_system_info=True)
    model = DQN.load("dqn_lunar", env=env)

    # Evaluate the agent.
    # NOTE: If you use wrappers with your environment that modify rewards,
    #       this will be reflected here. To evaluate with the original rewards,
    #       wrap the environment in a "Monitor" wrapper before other wrappers
    #       (see the sketch after this example).
    mean_reward, std_reward = evaluate_policy(model,
                                              model.get_env(),
                                              n_eval_episodes=10)

    # Enjoy trained agent.
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
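Following the NOTE above, a minimal sketch of evaluating with the original rewards by wrapping the environment in a Monitor before any reward-modifying wrappers (reusing the model from this example; the wrapper-free LunarLander env here is an assumption):

import gym
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy

# Monitor records the unmodified episode rewards, so the evaluation is not
# affected by any reward-shaping wrappers added afterwards.
eval_env = Monitor(gym.make("LunarLander-v2"))
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)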
Example #4
def train(time_steps, save=False, **params):
    verbose = params.get('verbose', 1)
    buffer_size = params.get('buffer_size', 10000)
    learning_starts = params.get('learning_starts', 1024)
    env = DQNAgent.create_env(1)
    model = DQN('CnnPolicy', env, verbose=verbose, buffer_size=buffer_size, learning_starts=learning_starts,
                tensorboard_log=TB_LOGS)
    model.learn(time_steps)
    if save:
        model.save(MODEL_PATH)
Example #5
def ai_playing():
    env = Snake_Env(server=False)
    # env = make_vec_env(lambda: env, n_envs=4, monitor_dir="./vec")
    env = Monitor(env, "1e7_bw_dqn")
    obs = env.reset()
    model = DQN("CnnPolicy",
                env,
                verbose=1,
                optimize_memory_usage=True,
                buffer_size=500000)
    model.learn(total_timesteps=1e7)
    model.save("1e7_bw_dqn")
Example #6
def test_dqn():
    model = DQN(
        "MlpPolicy",
        "CartPole-v1",
        policy_kwargs=dict(net_arch=[64, 64]),
        learning_starts=100,
        buffer_size=500,
        learning_rate=3e-4,
        verbose=1,
        create_eval_env=True,
    )
    model.learn(total_timesteps=500, eval_freq=250)
Example #7
def train_dqn(itr = 0, timesteps = 1e7, use_dummy_video = True):
	env = flappy_env.FlappyEnv(use_dummy_video)
	env = Monitor(env, f"flappy_dqn_{itr}")
	obs = env.reset()
	model = DQN(
		"CnnPolicy", 
		env, 
		verbose = 1, 
		optimize_memory_usage = True, 
		buffer_size = 500000, 
		learning_rate = 1e-5, 
		tensorboard_log = f"./dqn_flappy_tensorboard_{itr}/")
	model.learn(total_timesteps = timesteps)
	model.save(f"dqn_flappy_{itr}")
Example #8
def init_and_train_rl_classification_model(
        timesteps, path='data/rl_rps.pth', save=True, n=2000):
    dm, y_oracle = init_dm(CONFIG)
    env = ClassificationEnv(dm, y_oracle)
    # env = MonitorWrapper(env, autolog=True)
    model = DQN(CnnPolicy, env, verbose=1)
    idxs = list(range(n))
    dm.label_samples(idxs, y_oracle[idxs])
    model.learn(total_timesteps=timesteps)
    if save:
        model.save(path)
    env.enable_evaluating(True)
    evaluate(model, env)
    env.enable_evaluating(False)
    return model
Example #9
def train_dqn():

    log_dir = f"model_save/"
    env = ENV_DISCRETE(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True,
    #                clip_obs=10.)
    model = DQN('MlpPolicy', env, verbose=1, batch_size=2048, seed=1)
    callback = SaveOnBestTrainingRewardCallback(check_freq=100,
                                                log_dir=log_dir)
    model.learn(total_timesteps=int(100000),
                callback=callback,
                log_interval=100)
    model.save('model_save/dqn')
Example #10
def train_sqil(env, n=0):
    venv = gym.make(env)
    expert_data = make_sa_dataset(env, max_trajs=5)

    for i in range(n):
        if isinstance(venv.action_space, Discrete):
            model = DQN(SQLPolicy,
                        venv,
                        verbose=1,
                        policy_kwargs=dict(net_arch=[64, 64]),
                        learning_starts=1)
        else:
            model = SAC('MlpPolicy',
                        venv,
                        verbose=1,
                        policy_kwargs=dict(net_arch=[256, 256]),
                        ent_coef='auto',
                        learning_rate=linear_schedule(7.3e-4),
                        train_freq=64,
                        gradient_steps=64,
                        gamma=0.98,
                        tau=0.02)

        model.replay_buffer = SQILReplayBuffer(model.buffer_size,
                                               model.observation_space,
                                               model.action_space,
                                               model.device,
                                               1,
                                               model.optimize_memory_usage,
                                               expert_data=expert_data)
        mean_rewards = []
        std_rewards = []
        for train_steps in range(20):
            if train_steps > 0:
                if 'Bullet' in env:
                    model.learn(total_timesteps=25000, log_interval=1)
                else:
                    model.learn(total_timesteps=16384, log_interval=1)
            mean_reward, std_reward = evaluate_policy(model,
                                                      model.env,
                                                      n_eval_episodes=10)
            mean_rewards.append(mean_reward)
            std_rewards.append(std_reward)
            print("{0} Steps: {1}".format(train_steps, mean_reward))
            np.savez(os.path.join("learners", env,
                                  "sqil_rewards_{0}".format(i)),
                     means=mean_rewards,
                     stds=std_rewards)
Example #11
def test_dqn():
    env = gym.make("fishing-v0")
    check_env(env)

    model = DQN("MlpPolicy", env, verbose=1)
    model.learn(total_timesteps=200)

    # Simulate a run with the trained model, visualize result
    df = env.simulate(model)
    env.plot(df, "dqn-test.png")

    # Evaluate model
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)

    df = env.policyfn(model)
    env.plot_policy(df, "policy-test.png")
Example #12
def train_dqn_growpsace(save_model=False):
    wandb.run = config.tensorboard.run
    wandb.tensorboard.patch(save=False, tensorboardX=True)

    env = gym.make(config.env_name)

    model = DQN("CnnPolicy",
                env,
                verbose=1,
                gradient_steps=20,
                optimize_memory_usage=True)
    model.learn(total_timesteps=config.num_updates,
                log_interval=1,
                callback=WandbStableBaselines3Callback())
    if save_model:
        model.save(f"dqn_{config.env_name}")
Example #13
def main(cfg: DictConfig):
    env = get_env(None, cfg.env)
    model = DQN(MlpPolicy,
                env,
                **cfg.model,
                tensorboard_log='logs/',
                verbose=1)

    callbacks = [TensorboardCallback()]
    if cfg.self_play:
        self_play = EveryNTimesteps(cfg.n_update_selfplay, callback=SelfPlay('ckpts/', cfg.env))
        callbacks.append(self_play)
    if cfg.ckpt_freq:
        ckpt_cb = CheckpointCallback(save_freq=cfg.ckpt_freq, save_path='ckpts/')
        callbacks.append(ckpt_cb)

    model.learn(total_timesteps=cfg.n_total_steps, callback=callbacks, tb_log_name=cfg.log_name)
Example #14
def train(env, type, timesteps):
    env.reset()
    print(check_env(env))
    env = FlattenObservation(env)
    print(env.reward_range)
    print(env.action_space)
    if type == "DQN":
        model = DQN('MlpPolicy',
                    exploration_fraction=0.999,
                    env=env,
                    verbose=1)
    elif type == "A2C":
        model = A2C('MlpPolicy', env=env, verbose=1)
    elif type == "PPO":
        model = PPO('MlpPolicy', env=env, verbose=1)

    model.learn(total_timesteps=timesteps)
    model.save("model_cups")
Example #15
def train(params):

    model = DQN(params.get("policy"),
                env,
                verbose=1,
                buffer_size=params.get("buffer_size"),
                learning_rate=params.get("learning_rate"),
                tensorboard_log=log_dir,
                gamma=params.get("gamma"),
                target_update_interval=params.get("target_update_interval"),
                train_freq=params.get("train_freq"),
                gradient_steps=params.get("gradient_steps"),
                exploration_fraction=params.get("exploration_fraction"),
                exploration_final_eps=params.get("exploration_final_eps"),
                learning_starts=params.get("learning_starts"),
                batch_size=params.get("batch_size"),
                policy_kwargs=policy_kwargs)
    # Train for 1e5 steps
    model.learn(total_timesteps=params.get("train_steps"))
    # Save the trained agent
    model.save(exp_name)
Example #16
def run_dqn_baseline():
    env = make_atari_env('BreakoutNoFrameskip-v4', n_envs=1, seed=0)
    env = VecFrameStack(env, n_stack=4)
    tensorboard_log = os.path.join(os.path.dirname(__file__), 'runs_baseline')
    buffer_size = 100000
    num_training_steps = 1000000

    model = DQN('CnnPolicy',
                env,
                verbose=0,
                buffer_size=buffer_size,
                learning_starts=50000,
                optimize_memory_usage=False,
                tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=num_training_steps)

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
Example #17
def run(experiment: Experiment, params: argparse.Namespace):
    sb3_utils.set_random_seed(params.seed, using_cuda=use_cuda)
    env = helper.make_env(params, 'env')

    # Logs will be saved in log_dir/monitor.csv
    env = Monitor(env)

    with experiment.train():
        callback = SaveOnBestTrainingRewardCallback(experiment,
                                                    check_freq=1000)
        # Deactivate all the DQN extensions to get the original version
        # In practice, it is recommended to keep them activated
        model = DQN(CnnPolicy,
                    env,
                    learning_rate=params.learning_rate,
                    gamma=params.gamma,
                    seed=params.seed,
                    max_grad_norm=params.max_grad_norm,
                    verbose=1,
                    device=device,
                    policy_kwargs={'features_extractor_class': ColoringCNN})
        model.learn(total_timesteps=params.max_ts, callback=callback)
Example #18
def test_eval_success_logging(tmp_path):
    n_bits = 2
    env = BitFlippingEnv(n_bits=n_bits)
    eval_env = DummyVecEnv([lambda: BitFlippingEnv(n_bits=n_bits)])
    eval_callback = EvalCallback(
        eval_env,
        eval_freq=250,
        log_path=tmp_path,
        warn=False,
    )
    model = DQN(
        "MultiInputPolicy",
        env,
        replay_buffer_class=HerReplayBuffer,
        learning_starts=100,
        seed=0,
        replay_buffer_kwargs=dict(max_episode_length=n_bits),
    )
    model.learn(500, callback=eval_callback)
    assert len(eval_callback._is_success_buffer) > 0
    # More than 50% success rate
    assert np.mean(eval_callback._is_success_buffer) > 0.5
Example #19
def test_eval_callback_logs_are_written_with_the_correct_timestep(tmp_path):
    # Skip if no tensorboard installed
    pytest.importorskip("tensorboard")
    from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

    env_name = select_env(DQN)
    model = DQN(
        "MlpPolicy",
        env_name,
        policy_kwargs=dict(net_arch=[32]),
        tensorboard_log=tmp_path,
        verbose=1,
        seed=1,
    )

    eval_env = gym.make(env_name)
    eval_freq = 101
    eval_callback = EvalCallback(eval_env, eval_freq=eval_freq, warn=False)
    model.learn(500, callback=eval_callback)

    acc = EventAccumulator(str(tmp_path / "DQN_1"))
    acc.Reload()
    for event in acc.scalars.Items("eval/mean_reward"):
        assert event.step % eval_freq == 0
Example #20
File: main.py Project: omerv1991/TB_SBP

#env = gym.make('CartPole-v1')
env = gym.make('FrozenLake-v0')

log_dir = "/tmp/gym/"
os.makedirs(log_dir, exist_ok=True)
env = Monitor(env, log_dir)

model = DQN('MlpPolicy', env, verbose=1, batch_size=32,
            learning_starts=1000)  #prioritized_replay=True

model.replay_buffer = TrajReplayBuffer(model.buffer_size,
                                       model.observation_space,
                                       model.action_space,
                                       model.device,
                                       trajectory=True,
                                       seq_num=1)
initial_time = round(time(), 2)
model.learn(total_timesteps=int(100000))

mean_reward, std_reward = evaluate_policy(model,
                                          env,
                                          n_eval_episodes=10,
                                          deterministic=True)

finish_time = round(time(), 2)
total_time = round(finish_time - initial_time, 2)
print("this run took total time of {0} seconds".format(total_time))
plot_results(log_dir)
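Instead of overwriting model.replay_buffer after construction, the same custom buffer could be passed through the replay_buffer_class/replay_buffer_kwargs arguments used in the HER examples above; a sketch under the assumption that TrajReplayBuffer accepts trajectory and seq_num as keyword arguments:

# Hypothetical alternative: let DQN build the custom buffer itself.
model = DQN('MlpPolicy', env, verbose=1, batch_size=32, learning_starts=1000,
            replay_buffer_class=TrajReplayBuffer,
            replay_buffer_kwargs=dict(trajectory=True, seq_num=1))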
Example #21
                tensorboard_log="./dqn_drone_tensorboard2/",
                policy_kwargs=policy_kwargs,
                exploration_fraction=0.4)

    #env_eval = Monitor(env, './logs/')

    eval_callback = EvalCallback(env,
                                 best_model_save_path='./logs/',
                                 log_path='./logs/',
                                 eval_freq=1000,
                                 deterministic=True,
                                 render=False)

    #Deeper NN
    #model = DQN.load("DQN", env=env)
    model.learn(total_timesteps=5_000_000,
                callback=eval_callback)  # Typically not enough
    model.save("DQN")
    #model = DQN.load("DQN", env=env)
    model = DQN.load("logs/best_model", env=env)
    #model = PPO.load("PPO_discrete", env=env)

    logger = Logger(logging_freq_hz=int(env.SIM_FREQ / env.AGGR_PHY_STEPS),
                    num_drones=ARGS.num_drones)
    obs = env.reset()
    start = time.time()
    n_trial = 0
    for i in range(ARGS.duration_sec * env.SIM_FREQ):
        if ARGS.duration_sec * env.SIM_FREQ % AGGR_PHY_STEPS == 0:
            action, _states = model.predict(
                obs,
Example #22
File: dqn.py Project: lusinga/algo
# game = 'Zaxxon-ram-v0'

#env = gym.make('Pong-v0')
env = gym.make(game)

# save_file = 'dqn_pong';
save_file = 'dqn_' + game

print(env.action_space)
print(env.get_action_meanings())

model = DQN(MlpPolicy, env, verbose=1)
#model = DQN.load(save_file)
model.set_env(env)
# model = DQN(CnnPolicy, env, verbose=1)
model.learn(total_timesteps=50000, log_interval=10)
# model.save(save_file)

obs = env.reset()

score = 0
rewards_sum = 0

while True:
    # print(score)
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    score = score + 1
    rewards_sum += reward
    if reward > 0:
Example #23
    # Set up tensorboard logger
    if args.tensorboard:
        log_callback = LoggerCallback(sinergym_logger=bool(args.logger))
        callbacks.append(log_callback)
        # let's change the default dir for TensorboardFormatLogger only
        tb_path = args.tensorboard + '/' + name
        new_logger = configure(tb_path, ["tensorboard"])
        model.set_logger(new_logger)

    callback = CallbackList(callbacks)

    # ---------------------------------------------------------------------------- #
    #                                   TRAINING                                   #
    # ---------------------------------------------------------------------------- #
    model.learn(total_timesteps=timesteps,
                callback=callback,
                log_interval=args.log_interval)
    model.save(env.simulator._env_working_dir_parent + '/' + name)

    # If the algorithm doesn't reset or close the environment, this script will do it in
    # order to correctly log all the simulation data (Energyplus + Sinergym
    # logs)
    if env.simulator._episode_existed:
        env.close()

    # ---------------------------------------------------------------------------- #
    #                           Mlflow artifacts storage                           #
    # ---------------------------------------------------------------------------- #
    if args.mlflow_store:
        # Code for sending output and tensorboard logs to mlflow artifacts.
        mlflow.log_artifacts(local_dir=env.simulator._env_working_dir_parent,
Example #24
def key_handler(event):
    """
    Accepts a key event and makes an appropriate decision.
    :param event: Key event
    :return: void
    """
    global _root
    global _routing_canvas
    global _rl_model
    global _is_first_step
    global _rl_env
    global _rl_target_cell
    global _step_count
    global LEARN_RATE
    global EXPLORE_INIT
    global EXPLORE_FINAL
    global GAMMA
    global TRAIN_TIME_STEPS
    global LOAD_MODEL_NAME

    e_char = event.char

    if e_char == 'l':
        # RL Agent Learning pass

        # AI Gym Environment check - only do this when testing a new environment (resets RNG seed)
        # check_env(_rl_env)
        _step_count = 0  # Reset because check_env increments via step()

        # RL Agent
        _rl_model = DQN('MlpPolicy', _rl_env, verbose=1, learning_rate=LEARN_RATE, exploration_initial_eps=EXPLORE_INIT,
                        exploration_final_eps=EXPLORE_FINAL, gamma=GAMMA)
        print("Beginning RL training")
        _rl_model.learn(total_timesteps=TRAIN_TIME_STEPS)
        print("Finished RL training")
        print("Saving trained model")
        _rl_model.save("agent_" + time.strftime("%d-%m-%YT%H-%M-%S"))
    elif e_char == 't':
        # RL Agent Testing pass

        # AI Gym Environment check - only do this when testing a new environment (resets RNG seed)
        # check_env(_rl_env)
        _step_count = 0  # Reset because check_env increments via step()

        print("Loading trained model")
        if _rl_model is None:
            _rl_model = DQN.load(LOAD_MODEL_NAME)

        obs = _rl_env.reset()
        done = False
        while not done:
            rl_action, states = _rl_model.predict(obs, deterministic=True)
            print("Action " + str(rl_action))
            obs, rewards, done, info = _rl_env.step(rl_action)
    elif e_char == 'r':
        # RL flow debugging (no agent involved, emulate actions randomly)
        if _is_first_step:
            _rl_env.reset()
            _is_first_step = False
        else:
            rand_action = random.randrange(1)
            rl_action_step(rand_action)
    else:
        pass
Example #25
    buffer_size=500000,
    max_grad_norm=10,
    exploration_fraction=0.1,
    exploration_final_eps=0.01,
    device="cuda",
    tensorboard_log="./tb_logs/",
)

# Create an evaluation callback with the same env, called every 10000 iterations
callbacks = []
eval_callback = EvalCallback(
    env,
    callback_on_new_best=None,
    n_eval_episodes=5,
    best_model_save_path=".",
    log_path=".",
    eval_freq=10000,
)
callbacks.append(eval_callback)

kwargs = {}
kwargs["callback"] = callbacks

# Train for a certain number of timesteps
model.learn(total_timesteps=5e5,
            tb_log_name="dqn_airsim_car_run_" + str(time.time()),
            **kwargs)

# Save policy weights
model.save("dqn_airsim_car_policy")
Example #26
    dm, y_oracle = init_dm(CONFIG)
    print(dm)
    env = ClassificationEnv(dm, y_oracle)

    sys.path.insert(0, 'dral')
    if new:
        model = DQN(CnnPolicy, env, verbose=1, learning_rate=2e-4,
                    gamma=0.98, batch_size=32, learning_starts=3000)
    if load:
        model = DQN.load("data/rl_query_rps.pth")
    if test:
        model = init_and_train_rl_classification_model(
            timesteps=100000, path='data/rl_query_dogs_cats.pth')

    # show_grid_imgs(dm.test.get_x(list(range(9))), dm.test.get_y(list(range(9))), (3, 3))
    n_episodes = 5
    for k in range(n_episodes):

        # label images
        y_oracle = label_samples(dm, y_oracle, n=100, random=True)
        dm.train.shuffle()
        print(dm)

        model.learn(total_timesteps=6000, log_interval=30)

        # evaluation
        env.enable_evaluating(True)
        evaluate(model, env)
        env.enable_evaluating(False)
Example #27
        policy_kwargs = dict(activation_fn=torch.nn.Tanh,
                     net_arch=[dict(pi=[1024, 1024], vf=[1024, 1024])])

        if(args.dqn): 
            args.name = 'DQN_' + args.name
            model = DQN('MlpPolicy', gym.make('Trading-v2'), 
            verbose = 1, device = torch.device('cpu'), 
            tensorboard_log = './runs/')
        else: 
            model = PPO('MlpPolicy', make_vec_env('Trading-v2', 8), 
                verbose = 1, device = torch.device('cpu'), 
                tensorboard_log = './runs/')
        
        model.learn(total_timesteps = 20e6, 
                    tb_log_name = args.name, 
                    callback = CheckpointCallback(save_freq = 10000, save_path = "./trained_models", 
                                                  name_prefix = args.name))
        model.save('{}_trading_sb'.format('dqn' if args.dqn else 'ppo'))
    else: 
        print('Loading agent')
        if(args.dqn):
            model = DQN.load('dqn_trading_sb') 
        else: 
            model = PPO.load('ppo_trading_sb')
    # model = PPO('MlpPolicy', env, verbose = 1)


    eval_eps = 100
    pbar = tqdm(total = eval_eps)
    env = gym.make('Trading-v0')
    rewards = []
Example #28
class TradingAgent:
    def __init__(self, model='a2c', use_gp=False, gp_params=None, **kwargs):
        # wrapper around stable_baselines RL implementations
        assert model in ACCEPTED_MODELS, 'Unknown RL model, must be in {}'.format(ACCEPTED_MODELS)
        if model == 'a2c':
            self.rl = A2C(**kwargs)
        elif model == 'ppo':
            self.rl = PPO(**kwargs)
        elif model == 'dqn':
            self.rl = DQN(**kwargs)
        elif model == 'td3':
            self.rl = TD3(**kwargs)

        self.use_gp = use_gp
        if self.use_gp:
            assert gp_params is not None, 'Must provide parameters such as training data, number of iterations, etc. for GPR'
            self.n_train = gp_params['n_train']
            self.retraining_iter = gp_params['training_iter']
            self.cvar_limit = gp_params['cvar_limit']
            self.gp_limit = gp_params['gp_limit']

            self.likelihood = gpytorch.likelihoods.GaussianLikelihood()
            if 'data' in gp_params.keys():
                self.X_train = gp_params['data']['X_train']
                self.y_train = gp_params['data']['y_train']
            else:
                self.X_train = torch.zeros(self.n_train, kwargs['env'].num_features) # hard coded to match dimensions of features
                self.y_train = torch.zeros(self.n_train)
            self.gp = ExactGPModel(self.X_train, self.y_train, self.likelihood)
            self.mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.gp)
            self.opt = torch.optim.Adam(self.gp.parameters(), lr=0.1)

            self.shares = 0
            self.cash = 0
            self.obs = [] # holds up to 2 past observations, helps in keeping X, y aligned

            # for plotting
            self.pred_return = 0
            self.pred_lower = 0
            self.pred_upper = 0

            # for debugging
            self.goal_num_shares = 0

    def learn(self, n_steps):
        # when using gp, load pretrained rl agent - no need to train
        if self.use_gp:
            # train GP using fixed number of steps
            self.__train_gp(100)
        else:
            # train RL agent
            self.rl.learn(n_steps)

    def predict(self, obs, deterministic):
        action, state = self.rl.predict(obs, deterministic=deterministic)

        if self.use_gp:
            # slightly retrain
            self.__train_gp(self.retraining_iter, retrain=True)

            # predict next step returns and CI using GP
            with torch.no_grad(), gpytorch.settings.fast_pred_var():
                output = self.gp(torch.Tensor(obs[2:])[None])
                obs_pred = self.likelihood(output)
                f_mean = output.mean.detach().numpy()[0]
                self.pred_return = f_mean.item()
                f_samples = output.sample(sample_shape=torch.Size((10000,))).detach().numpy()
                lower, upper = obs_pred.confidence_region()
                self.pred_lower = lower.item()
                self.pred_upper = upper.item()

            rl_action = action
            action -= ACTION_OFFSET  # convert the env action into the actual trade size

            # adjust trade size given prediction
            # if self.shares > 0: # long position
            if f_mean > self.gp_limit: # predict positive return over certain threshold
                tail_samples = f_samples[f_samples < lower.item()]
                ps_cvar = np.mean(tail_samples) if len(tail_samples) > 0 else lower.item() # cvar per share
                if ps_cvar < 0:
                    goal_num_shares = self.cvar_limit // ps_cvar
                else:
                    goal_num_shares = self.shares + action # positive return for long - no adjustment needed
                action = min(10, max(0, goal_num_shares - self.shares))
            elif f_mean < -self.gp_limit:
                tail_samples = f_samples[f_samples > upper.item()]
                ps_cvar = np.mean(tail_samples) if len(tail_samples) > 0 else upper.item() # cvar per share
                if ps_cvar < 0:
                    goal_num_shares = self.shares + action # negative return for short - no adjustment needed
                else:
                    goal_num_shares = self.cvar_limit // ps_cvar
                action = max(-10, min(0, goal_num_shares - self.shares))
            else:
                goal_num_shares = self.shares + action
            # print(ps_cvar, lower.item(), upper.item())

            # if not np.isnan(goal_num_shares):
            self.goal_num_shares = goal_num_shares
            # if action > 0: # buy order
            #     action = min(10, max(0, goal_num_shares - self.shares)) # restrict same size trades as original, maintain same direction
            #     # print(goal_num_shares - self.shares, action)
            # elif action < 0: # sell order
            #     action = max(-10, min(0, goal_num_shares - self.shares)) # restrict same size trades as original, maintain same direction

            action += ACTION_OFFSET # adjust for env actions being 1 to N rather than -N/2 to N/2

            # print(f_mean, ps_cvar, self.shares, goal_num_shares, rl_action-ACTION_OFFSET, action-ACTION_OFFSET)

        return action, state

    def update(self, obs, reward=None):
        self.obs.append(obs)
        self.shares, self.cash = obs[:2]
        if reward is not None:
            self.X_train = torch.cat((self.X_train, torch.Tensor(self.obs.pop(0)[2:])[None]))[1:] # self.X_train[1:]
            self.y_train = torch.cat((self.y_train, torch.Tensor([reward])))[1:]

        # print(self.X_train, self.y_train)

        self.gp.set_train_data(self.X_train, self.y_train)

    def save(self, rl_path, gp_path=None):
        self.rl.save(rl_path)
        if gp_path is not None:
            torch.save(self.gp.state_dict(), gp_path)

    def load(self, rl_path, gp_path=None):
        self.rl = A2C.load(rl_path)
        if gp_path is not None:
            state_dict = torch.load(gp_path)
            self.gp.load_state_dict(state_dict)

    def __train_gp(self, n_iter, retrain=False):
        # train GP using fixed number of steps
        self.gp.train()
        self.likelihood.train()

        for i in range(n_iter):
            output = self.gp(self.X_train)
            loss = -self.mll(output, self.y_train)
            self.opt.zero_grad()
            loss.backward()
            self.opt.step()

        self.gp.eval()
        self.likelihood.eval()
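A minimal usage sketch for TradingAgent without the GP extension; the environment, observation, and constructor keyword arguments below are placeholders, not taken from the original project:

# Hypothetical usage: kwargs are forwarded to the underlying DQN constructor.
# `env` and `obs` are assumed to exist (a gym trading environment and its reset() output).
agent = TradingAgent(model='dqn', use_gp=False, policy='MlpPolicy', env=env, verbose=1)
agent.learn(n_steps=100000)
action, _ = agent.predict(obs, deterministic=True)
agent.save('trading_dqn')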
Example #29
from stable_baselines3.common.evaluation import evaluate_policy

env = gym.make('CartPole-v0')

model = DQN(MlpPolicy, env, verbose=1)

#model = DQN(MlpPolicy, env, seed=1423, target_update_interval =5, batch_size=16, train_freq=128, buffer_size=256, gamma=0.95, learning_rate=1e-3, verbose=1)

print("start model evaluation without learning !")
mean_reward_before, std_reward_before = evaluate_policy(model,
                                                        env,
                                                        n_eval_episodes=100)
print("end model evaluation !")

print("start model learning !")
model.learn(total_timesteps=10000, log_interval=4)
print("end model learning !")

print("-> model saved !!")
model.save("dqn_cartpole")

print("start model evaluation with learning !")
mean_reward_after, std_reward_after = evaluate_policy(model,
                                                      env,
                                                      n_eval_episodes=100)
print("end model evaluation !")

print("-> model evaluation without learning")
print(
    f"mean_reward:{mean_reward_before:.2f} +/- std_reward:{std_reward_before:.2f}"
)
Example #30
import sys
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.dqn.policies import MlpPolicy
from stable_baselines3 import DQN
from gym_sudoku.envs.sudoku_env import SudokuEnv

env = SudokuEnv()

if "--train" in sys.argv:
    model = DQN(MlpPolicy, env, verbose=1, learning_starts=100)
    model.learn(total_timesteps=10000)
    model.save("dqn_sudoku")
else:
    model = DQN.load("dqn_sudoku")

obs = env.reset()
env.render()
for _ in range(20):
    action, _states = model.predict(obs, deterministic=True)
    print("Action", action)
    print("States", _states)
    print("Coordinates", env.fill_pointer)
    obs, rewards, done, info = env.step(action)
    env.render()
    if done:
        print("Resetting ==============================================>")
        obs = env.reset()