Example #1
def basic_usage_example():
    # Basic Usage: Training, Saving, Loading.

    # Create environment.
    env = gym.make("LunarLander-v2")

    # Instantiate the agent.
    model = DQN("MlpPolicy", env, verbose=1)
    # Train the agent.
    model.learn(total_timesteps=int(2e5))
    # Save the agent.
    model.save("dqn_lunar")
    del model  # Delete trained model to demonstrate loading.

    # Load the trained agent.
    # NOTE: if you have loading issues, you can pass 'print_system_info=True'
    # to compare the system the model was trained on with the current one.
    #model = DQN.load("dqn_lunar", env=env, print_system_info=True)
    model = DQN.load("dqn_lunar", env=env)

    # Evaluate the agent.
    # NOTE: If you use wrappers with your environment that modify rewards,
    # this will be reflected here. To evaluate with the original rewards,
    # wrap the environment in a Monitor wrapper before any other wrappers.
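    # A minimal sketch of that wrapper order (RewardScalingWrapper is a hypothetical
    # reward-modifying wrapper, not part of Stable-Baselines3):
    #   from stable_baselines3.common.monitor import Monitor
    #   env = Monitor(gym.make("LunarLander-v2"))  # Monitor records the original rewards
    #   env = RewardScalingWrapper(env)            # reward-modifying wrappers go on top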
    mean_reward, std_reward = evaluate_policy(model,
                                              model.get_env(),
                                              n_eval_episodes=10)

    # Enjoy trained agent.
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
        if dones:
            obs = env.reset()
def train(time_steps, save=False, **params):
    verbose = params.get('verbose', 1)
    buffer_size = params.get('buffer_size', 10000)
    learning_starts = params.get('learning_starts', 1024)
    env = DQNAgent.create_env(1)
    model = DQN('CnnPolicy', env, verbose=verbose, buffer_size=buffer_size, learning_starts=learning_starts,
                tensorboard_log=TB_LOGS)
    model.learn(time_steps)
    if save:
        model.save(MODEL_PATH)
Example #3
def ai_playing():
    env = Snake_Env(server=False)
    # env = make_vec_env(lambda: env, n_envs=4, monitor_dir="./vec")
    env = Monitor(env, "1e7_bw_dqn")
    obs = env.reset()
    model = DQN("CnnPolicy",
                env,
                verbose=1,
                optimize_memory_usage=True,
                buffer_size=500000)
    model.learn(total_timesteps=1e7)
    model.save("1e7_bw_dqn")
Example #4
def train_dqn(itr=0, timesteps=1e7, use_dummy_video=True):
    env = flappy_env.FlappyEnv(use_dummy_video)
    env = Monitor(env, f"flappy_dqn_{itr}")
    obs = env.reset()
    model = DQN(
        "CnnPolicy",
        env,
        verbose=1,
        optimize_memory_usage=True,
        buffer_size=500000,
        learning_rate=1e-5,
        tensorboard_log=f"./dqn_flappy_tensorboard_{itr}/")
    model.learn(total_timesteps=timesteps)
    model.save(f"dqn_flappy_{itr}")
Example #5
def init_and_train_rl_classification_model(
        timesteps, path='data/rl_rps.pth', save=True, n=2000):
    dm, y_oracle = init_dm(CONFIG)
    env = ClassificationEnv(dm, y_oracle)
    # env = MonitorWrapper(env, autolog=True)
    model = DQN(CnnPolicy, env, verbose=1)
    idxs = list(range(n))
    dm.label_samples(idxs, y_oracle[idxs])
    model.learn(total_timesteps=timesteps)
    if save:
        model.save(path)
    env.enable_evaluating(True)
    evaluate(model, env)
    env.enable_evaluating(False)
    return model
Example #6
def train_dqn():

    log_dir = "model_save/"
    env = ENV_DISCRETE(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True,
    #                clip_obs=10.)
    model = DQN('MlpPolicy', env, verbose=1, batch_size=2048, seed=1)
    callback = SaveOnBestTrainingRewardCallback(check_freq=100,
                                                log_dir=log_dir)
    model.learn(total_timesteps=int(100000),
                callback=callback,
                log_interval=100)
    model.save('model_save/dqn')
Example #7
def train_dqn_growpsace(save_model=False):
    wandb.run = config.tensorboard.run
    wandb.tensorboard.patch(save=False, tensorboardX=True)

    env = gym.make(config.env_name)

    model = DQN("CnnPolicy",
                env,
                verbose=1,
                gradient_steps=20,
                optimize_memory_usage=True)
    model.learn(total_timesteps=config.num_updates,
                log_interval=1,
                callback=WandbStableBaselines3Callback())
    if save_model:
        model.save(f"dqn_{config.env_name}")
Example #8
def train(env, type, timesteps):
    env.reset()
    check_env(env)  # raises a descriptive error if the env does not follow the Gym API
    env = FlattenObservation(env)
    print(env.reward_range)
    print(env.action_space)
    if type == "DQN":
        model = DQN('MlpPolicy',
                    exploration_fraction=0.999,
                    env=env,
                    verbose=1)
    elif type == "A2C":
        model = A2C('MlpPolicy', env=env, verbose=1)
    elif type == "PPO":
        model = PPO('MlpPolicy', env=env, verbose=1)

    model.learn(total_timesteps=timesteps)
    model.save("model_cups")
Example #9
def train(params):

    model = DQN(params.get("policy"),
                env,
                verbose=1,
                buffer_size=params.get("buffer_size"),
                learning_rate=params.get("learning_rate"),
                tensorboard_log=log_dir,
                gamma=params.get("gamma"),
                target_update_interval=params.get("target_update_interval"),
                train_freq=params.get("train_freq"),
                gradient_steps=params.get("gradient_steps"),
                exploration_fraction=params.get("exploration_fraction"),
                exploration_final_eps=params.get("exploration_final_eps"),
                learning_starts=params.get("learning_starts"),
                batch_size=params.get("batch_size"),
                policy_kwargs=policy_kwargs)
    # Train for the configured number of steps
    model.learn(total_timesteps=params.get("train_steps"))
    # Save the trained agent
    model.save(exp_name)
Example #10
checkpoint_callback = CheckpointCallback(
    save_freq=5000,
    save_path='./checkpoint/v11_dqn_multiInput_4actions_2obs_simpleRF_100000_steps/',
    name_prefix='dqn_policy')

time_steps = 100000
model.learn(
    total_timesteps=int(time_steps),
    log_interval=5,
    tb_log_name="v11_dqn_multiInput_4actions_2obs_simpleRF_100000_steps",
    callback=checkpoint_callback,
)

# Save policy weights
# model.save("model/dqn_airsim_drone_policy")
model.save("model/v11_dqn_multiInput_4actions_2obs_simpleRF_100000_steps")

# time_steps = 100000
# model = DQN(
#     "MultiInputPolicy",
#     env,
#     learning_rate=0.00025,
#     verbose=1,
#     batch_size=32,
#     train_freq=4,
#     target_update_interval=200,
#     learning_starts=200,
#     buffer_size=10000,
#     max_grad_norm=10,
#     exploration_fraction=0.1,
#     exploration_final_eps=0.01,
Example #11
import sys
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.dqn.policies import MlpPolicy
from stable_baselines3 import DQN
from gym_sudoku.envs.sudoku_env import SudokuEnv

env = SudokuEnv()

if "--train" in sys.argv:
    model = DQN(MlpPolicy, env, verbose=1, learning_starts=100)
    model.learn(total_timesteps=10000)
    model.save("dqn_sudoku")
else:
    model = DQN.load("dqn_sudoku")

obs = env.reset()
env.render()
for _ in range(20):
    action, _states = model.predict(obs, deterministic=True)
    print("Action", action)
    print("States", _states)
    print("Coordinates", env.fill_pointer)
    obs, rewards, done, info = env.step(action)
    env.render()
    if done:
        print("Resetting ==============================================>")
        obs = env.reset()

Example #12
    exploration_fraction=0.1,
    exploration_final_eps=0.01,
    device="cuda",
    tensorboard_log="./tb_logs/",
)

# Create an evaluation callback with the same env, called every 10000 iterations
callbacks = []
eval_callback = EvalCallback(
    env,
    callback_on_new_best=None,
    n_eval_episodes=5,
    best_model_save_path=".",
    log_path=".",
    eval_freq=10000,
)
callbacks.append(eval_callback)

kwargs = {}
kwargs["callback"] = callbacks

# Train for a certain number of timesteps
model.learn(
    total_timesteps=5e5,
    tb_log_name="dqn_airsim_drone_run_" + str(time.time()),
    **kwargs
)

# Save policy weights
model.save("dqn_airsim_drone_policy")
Example #13
        if(args.dqn): 
            args.name = 'DQN_' + args.name
            model = DQN('MlpPolicy', gym.make('Trading-v2'), 
            verbose = 1, device = torch.device('cpu'), 
            tensorboard_log = './runs/')
        else: 
            model = PPO('MlpPolicy', make_vec_env('Trading-v2', 8), 
                verbose = 1, device = torch.device('cpu'), 
                tensorboard_log = './runs/')
        
        model.learn(total_timesteps = 20e6, 
                    tb_log_name = args.name, 
                    callback = CheckpointCallback(save_freq = 10000, save_path = "./trained_models", 
                                                  name_prefix = args.name))
        model.save('{}_trading_sb'.format('dqn' if args.dqn else 'ppo'))
    else: 
        print('Loading agent')
        if(args.dqn):
            model = DQN.load('dqn_trading_sb') 
        else: 
            model = PPO.load('ppo_trading_sb')
    # model = PPO('MlpPolicy', env, verbose = 1)


    eval_eps = 100
    pbar = tqdm(total = eval_eps)
    env = gym.make('Trading-v0')
    rewards = []
    baseline_diff = []
    for ep in range(eval_eps): 
Example #14
MODEL_PATH = MODELS_DIR_PATH / MODEL_NAME
LOG_PATH = MODELS_DIR_PATH / 'logs/'
CHECKPOINT_PATH = MODELS_DIR_PATH / 'checkpoints/'
MONITOR_PATH = MODELS_DIR_PATH / 'monitoring/'

config = {
    'simulation_frequency': 15,
    'policy_frequency': 0.5,
    'demand_amplitude': 15000,
    'total_steps': 100,
}

env = gym.make('highway-v0', **config)

checkpoint_callback = CheckpointCallback(save_freq=1000,
                                         save_path=CHECKPOINT_PATH,
                                         name_prefix=MODEL_NAME,
                                         verbose=1)
env = Monitor(env, MONITOR_PATH)

model = DQN(MlpPolicy,
            env,
            verbose=1,
            tensorboard_log=LOG_PATH,
            learning_starts=100,
            target_update_interval=500)
model.learn(total_timesteps=TRAINING_STEPS,
            callback=checkpoint_callback,
            log_interval=4)
model.save(MODEL_PATH)
Example #15
import numpy as np
import gym
import gym_fishing
from stable_baselines3 import DQN
from stable_baselines3.common.env_checker import check_env

env = gym.make('fishing-v0')

check_env(env)

model = DQN('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=200)

## Simulate a run with the trained model, visualize result
df = env.simulate(model)
env.plot(df, "dqn.png")

## Evaluate model
from stable_baselines3.common.evaluation import evaluate_policy
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
print("mean reward:", mean_reward, "std:", std_reward)

## Save and reload the model
model.save("dqn")
model = DQN.load("dqn")
Example #16
checkpoint_callback = CheckpointCallback(
    save_freq=500,
    save_path='./checkpoint/v18_dqn_cnnPolicy_4actions_imageObs_100000_steps/',
    name_prefix='dqn_policy')

time_steps = 100000
model.learn(
    total_timesteps=int(time_steps),
    log_interval=5,
    tb_log_name="v18_dqn_cnnPolicy_4actions_imageObs_100000_steps",
    callback=checkpoint_callback,
)

# Save policy weights
# model.save("model/dqn_airsim_drone_policy")
model.save("model/v18_dqn_cnnPolicy_4actions_imageObs_100000_steps")

# time_steps = 100
# model = DQN(
#     "CnnPolicy",
#     env,
#     learning_rate=0.00025,
#     verbose=1,
#     batch_size=32,
#     train_freq=4,
#     target_update_interval=10000,
#     learning_starts=10000,
#     buffer_size=50000,
#     max_grad_norm=10,
#     exploration_fraction=0.1,
#     exploration_final_eps=0.01,
Example #17
        log_callback = LoggerCallback(sinergym_logger=bool(args.logger))
        callbacks.append(log_callback)
        # let's change the default dir for TensorboardFormatLogger only
        tb_path = args.tensorboard + '/' + name
        new_logger = configure(tb_path, ["tensorboard"])
        model.set_logger(new_logger)

    callback = CallbackList(callbacks)

    # ---------------------------------------------------------------------------- #
    #                                   TRAINING                                   #
    # ---------------------------------------------------------------------------- #
    model.learn(total_timesteps=timesteps,
                callback=callback,
                log_interval=args.log_interval)
    model.save(env.simulator._env_working_dir_parent + '/' + name)

    # If the algorithm doesn't reset or close the environment, this script will do it in
    # order to correctly log all the simulation data (Energyplus + Sinergym
    # logs)
    if env.simulator._episode_existed:
        env.close()

    # ---------------------------------------------------------------------------- #
    #                           Mlflow artifacts storage                           #
    # ---------------------------------------------------------------------------- #
    if args.mlflow_store:
        # Code to send the output and tensorboard logs to mlflow artifacts.
        mlflow.log_artifacts(local_dir=env.simulator._env_working_dir_parent,
                             artifact_path=name)
        if args.evaluation:
Example #18
    buffer_size=500000,
    max_grad_norm=10,
    exploration_fraction=0.1,
    exploration_final_eps=0.01,
    device="cuda",
    tensorboard_log="./tb_logs/",
)

# Create an evaluation callback with the same env, called every 10000 iterations
callbacks = []
eval_callback = EvalCallback(
    env,
    callback_on_new_best=None,
    n_eval_episodes=5,
    best_model_save_path=".",
    log_path=".",
    eval_freq=10000,
)
callbacks.append(eval_callback)

kwargs = {}
kwargs["callback"] = callbacks

# Train for a certain number of timesteps
model.learn(total_timesteps=5e5,
            tb_log_name="dqn_airsim_car_run_" + str(time.time()),
            **kwargs)

# Save policy weights
model.save("dqn_airsim_car_policy")
    print(f"Load agent from {agentPath}")
    # model = PPO.load(agentPath)
    model = DQN.load(agentPath)
    model.set_env(env)
else:
    print(f"Instanciate new agent and save in {agentPath}")
    # model = PPO("CnnPolicy", env_vec, policy_kwargs=policy_kwargs, verbose=1)
    # model = DQN("CnnPolicy", env_vec, policy_kwargs=policy_kwargs, verbose=1)
    model = DQN("CnnPolicy",
                env,
                target_update_interval=1000,
                batch_size=512,
                exploration_final_eps=0.2,
                policy_kwargs=policy_kwargs,
                verbose=1)
    model.save(agentPath)

# Record gif of trained agent
imagesGrid = []
obs = env.reset()
imagesGrid.append(env.render("human"))
for step in range(200):
    action, _ = model.predict(obs, deterministic=False)
    obs, reward, done, info = env.step(action)
    print("reward : ", reward)
    env.render(mode='console')
    imagesGrid.append(env.render("human"))
    if done:
        print("Goal reached!", "reward=", reward)
        break
imagesGrid[0].save(f'_data/visu.gif',
max_steps=100
dqn_model.learn(total_timesteps=max_steps)

# Commented out IPython magic to ensure Python compatibility.
# %tensorboard --logdir {LOG_DIR}

"""## Εκτίμηση απόδοσης"""

from stable_baselines3.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(dqn_model, test_env, n_eval_episodes=10)
print(f"Eval reward: {mean_reward} (+/-{std_reward})")

"""## Σώσιμο εκπαιδευμένου μοντέλου"""

dqn_model.save("dqn_pong")

"""Το μοντέλο θα αποθηκευθεί ως zip και μπορείτε να το κατεβάσετε τοπικά από το αριστερό sidebar του Colab στο "Files" και μετά στο ellipsis menu πάνω στο filename.

## Φόρτωση εκπαιδευμένου μοντέλου

Από το αριστερό sidebar του Colab και το "Files" ανεβάστε το αρχείο zip του εκπαιδευμένου μοντέλου.

Εδώ θα ανεβάσουμε ένα μοντέλο Α2C που έχουμε εκπαιδεύσει νωρίτερα. Μπορείτε να το κατεβάσετε από [εδώ](https://drive.google.com/uc?export=download&id=1COsaNOH8SjbpxxIYc5lOF-QUiUJGU5ZB). 

Αν χρειαστεί μετονομάστε το αρχείο σε a2c_pong.zip
"""

from stable_baselines3 import A2C
# !wget --no-check-certificate https://www.dropbox.com/s/zm02848gzbx3jsl/a2c_pong.zip?dl=1 -O a2c_berzerk.zip
# a2c_model = A2C.load("a2c_berzerk.zip", verbose=10)
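# A minimal sketch of the load-and-evaluate step, assuming the uploaded file was renamed
# to a2c_pong.zip as noted above and that test_env from the evaluation cell is still available:
# a2c_model = A2C.load("a2c_pong.zip")
# mean_reward, std_reward = evaluate_policy(a2c_model, test_env, n_eval_episodes=10)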
        },
        "policy_frequency": 2,
        "duration": 40,
    })
    env.reset()
    model = DQN('CnnPolicy', env,
                gamma=0.8,
                learning_rate=5e-4,
                buffer_size=40*1000,
                learning_starts=200,
                exploration_fraction=0.6,
                target_update_interval=256,
                batch_size=32,
                verbose=1,
                tensorboard_log="logs/")
    model.learn(total_timesteps=int(2e5))
    model.save("dqn_highway")

    # Record video
    model = DQN.load("dqn_highway")
    env.configure({"policy_frequency": 15, "duration": 20 * 15})
    video_length = 2 * env.config["duration"]
    env = VecVideoRecorder(env, "videos/",
                           record_video_trigger=lambda x: x == 0, video_length=video_length,
                           name_prefix="dqn-agent")
    obs = env.reset()
    for _ in range(video_length + 1):
        action, _ = model.predict(obs)
        obs, _, _, _ = env.step(action)
    env.close()
Example #22
File: dqn.py  Project: jfrancis71/PyGenBrix
import argparse

import gym
from stable_baselines3 import DQN
from pfrl.wrappers import atari_wrappers

ap = argparse.ArgumentParser(description="DQN")
ap.add_argument("--env", default="PongNoFrameskip-v4")
ap.add_argument("--frame_stacks", default=4)
ap.add_argument("--learning_starts", default=100000, type=int)
ap.add_argument("--total_timesteps", default=1000000, type=int)
ap.add_argument("--save_path")
ap.add_argument("--tensorboard_log")
ns = ap.parse_args()

env = atari_wrappers.wrap_deepmind(
    atari_wrappers.make_atari(ns.env, max_frames=10000),
    episode_life=True,
    clip_rewards=True,
)
model = DQN('CnnPolicy',
            env,
            verbose=1,
            buffer_size=10000,
            learning_rate=.0001,
            learning_starts=ns.learning_starts,
            target_update_interval=1000,
            tensorboard_log=ns.tensorboard_log)
model.learn(total_timesteps=ns.total_timesteps)
if ns.save_path is not None:
    model.save(ns.save_path)
Example #23
profiler = Profiler()
profiler.start()

# Setup Environment
print('Environment Setup...')
env = gym.make('Desktop-v0', debug=False, show=True, steplimit=100)
outdir = '/tmp/random-agent-results'
env = Monitor(env, directory=outdir, force=True)
episodes = 10
# Setup Agent
print('Agent Setup...')
model = DQN(MlpPolicy, env, verbose=0, buffer_size=500)
print('Returning Trained Model...')
model.learn(total_timesteps=1000, log_interval=4)
print('Saving Trained Model...')
model.save("deepq_desktop")

del model  # remove to demonstrate saving and loading
print('Loading Trained Model...')
model = DQN.load("deepq_desktop")


def unique_reward(last_state, current_state):
    # rewards a current state that differs from the last state
    # (computed as the signed difference of the summed state values)
    return (np.sum(last_state) - np.sum(current_state))


if __name__ == '__main__':
    try:
        print('Running Environment')
        last_state = None
Example #24
class TradingAgent:
    def __init__(self, model='a2c', use_gp=False, gp_params=None, **kwargs):
        # wrapper around stable_baselines RL implementations
        assert model in ACCEPTED_MODELS, 'Unknown RL model, must be in {}'.format(ACCEPTED_MODELS)
        if model == 'a2c':
            self.rl = A2C(**kwargs)
        elif model == 'ppo':
            self.rl = PPO(**kwargs)
        elif model == 'dqn':
            self.rl = DQN(**kwargs)
        elif model == 'td3':
            self.rl = TD3(**kwargs)

        self.use_gp = use_gp
        if self.use_gp:
            assert gp_params is not None, 'Must provide parameters such as training data, number of iterations, etc. for GPR'
            self.n_train = gp_params['n_train']
            self.retraining_iter = gp_params['training_iter']
            self.cvar_limit = gp_params['cvar_limit']
            self.gp_limit = gp_params['gp_limit']

            self.likelihood = gpytorch.likelihoods.GaussianLikelihood()
            if 'data' in gp_params.keys():
                self.X_train = gp_params['data']['X_train']
                self.y_train = gp_params['data']['y_train']
            else:
                self.X_train = torch.zeros(self.n_train, kwargs['env'].num_features) # hard coded to match dimensions of features
                self.y_train = torch.zeros(self.n_train)
            self.gp = ExactGPModel(self.X_train, self.y_train, self.likelihood)
            self.mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.gp)
            self.opt = torch.optim.Adam(self.gp.parameters(), lr=0.1)

            self.shares = 0
            self.cash = 0
            self.obs = [] # holds up to 2 past observations, helps in keeping X, y aligned

            # for plotting
            self.pred_return = 0
            self.pred_lower = 0
            self.pred_upper = 0

            # for debugging
            self.goal_num_shares = 0

    def learn(self, n_steps):
        # when using gp, load pretrained rl agent - no need to train
        if self.use_gp:
            # train GP using fixed number of steps
            self.__train_gp(100)
        else:
            # train RL agent
            self.rl.learn(n_steps)

    def predict(self, obs, deterministic):
        action, state = self.rl.predict(obs, deterministic=deterministic)

        if self.use_gp:
            # slightly retrain
            self.__train_gp(self.retraining_iter, retrain=True)

            # predict next step returns and CI using GP
            with torch.no_grad(), gpytorch.settings.fast_pred_var():
                output = self.gp(torch.Tensor(obs[2:])[None])
                obs_pred = self.likelihood(output)
                f_mean = output.mean.detach().numpy()[0]
                self.pred_return = f_mean.item()
                f_samples = output.sample(sample_shape=torch.Size((10000,))).detach().numpy()
                lower, upper = obs_pred.confidence_region()
                self.pred_lower = lower.item()
                self.pred_upper = upper.item()

            rl_action = action
            action -= ACTION_OFFSET  # shift from the env's action index to the actual (signed) trade size

            # adjust trade size given prediction
            # if self.shares > 0: # long position
            if f_mean > self.gp_limit: # predict positive return over certain threshold
                tail_samples = f_samples[f_samples < lower.item()]
                ps_cvar = np.mean(tail_samples) if len(tail_samples) > 0 else lower.item() # cvar per share
                if ps_cvar < 0:
                    goal_num_shares = self.cvar_limit // ps_cvar
                else:
                    goal_num_shares = self.shares + action # positive return for long - no adjustment needed
                action = min(10, max(0, goal_num_shares - self.shares))
            elif f_mean < -self.gp_limit:
                tail_samples = f_samples[f_samples > upper.item()]
                ps_cvar = np.mean(tail_samples) if len(tail_samples) > 0 else upper.item() # cvar per share
                if ps_cvar < 0:
                    goal_num_shares = self.shares + action # negative return for short - no adjustment needed
                else:
                    goal_num_shares = self.cvar_limit // ps_cvar
                action = max(-10, min(0, goal_num_shares - self.shares))
            else:
                goal_num_shares = self.shares + action
            # print(ps_cvar, lower.item(), upper.item())

            # if not np.isnan(goal_num_shares):
            self.goal_num_shares = goal_num_shares
            # if action > 0: # buy order
            #     action = min(10, max(0, goal_num_shares - self.shares)) # restrict same size trades as original, maintain same direction
            #     # print(goal_num_shares - self.shares, action)
            # elif action < 0: # sell order
            #     action = max(-10, min(0, goal_num_shares - self.shares)) # restrict same size trades as original, maintain same direction

            action += ACTION_OFFSET # adjust for env actions being 1 to N rather than -N/2 to N/2

            # print(f_mean, ps_cvar, self.shares, goal_num_shares, rl_action-ACTION_OFFSET, action-ACTION_OFFSET)

        return action, state

    def update(self, obs, reward=None):
        self.obs.append(obs)
        self.shares, self.cash = obs[:2]
        if reward is not None:
            self.X_train = torch.cat((self.X_train, torch.Tensor(self.obs.pop(0)[2:])[None]))[1:] # self.X_train[1:]
            self.y_train = torch.cat((self.y_train, torch.Tensor([reward])))[1:]

        # print(self.X_train, self.y_train)

        self.gp.set_train_data(self.X_train, self.y_train)

    def save(self, rl_path, gp_path=None):
        self.rl.save(rl_path)
        if gp_path is not None:
            torch.save(self.gp.state_dict(), gp_path)

    def load(self, rl_path, gp_path=None):
        self.rl = A2C.load(rl_path)
        if gp_path is not None:
            state_dict = torch.load(gp_path)
            self.gp.load_state_dict(state_dict)

    def __train_gp(self, n_iter, retrain=False):
        # train GP using fixed number of steps
        self.gp.train()
        self.likelihood.train()

        for i in range(n_iter):
            output = self.gp(self.X_train)
            loss = -self.mll(output, self.y_train)
            self.opt.zero_grad()
            loss.backward()
            self.opt.step()

        self.gp.eval()
        self.likelihood.eval()
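
# A minimal usage sketch of TradingAgent (hypothetical: assumes a gym-style trading env named
# `trading_env` and the constants defined elsewhere in this project, e.g. ACCEPTED_MODELS):
# agent = TradingAgent(model='dqn', policy='MlpPolicy', env=trading_env, verbose=1)
# agent.learn(n_steps=100_000)
# agent.save('dqn_trading_agent')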
Example #25
def key_handler(event):
    """
    Accepts a key event and makes an appropriate decision.
    :param event: Key event
    :return: void
    """
    global _root
    global _routing_canvas
    global _rl_model
    global _is_first_step
    global _rl_env
    global _rl_target_cell
    global _step_count
    global LEARN_RATE
    global EXPLORE_INIT
    global EXPLORE_FINAL
    global GAMMA
    global TRAIN_TIME_STEPS
    global LOAD_MODEL_NAME

    e_char = event.char

    if e_char == 'l':
        # RL Agent Learning pass

        # AI Gym Environment check - only do this when testing a new environment (resets RNG seed)
        # check_env(_rl_env)
        _step_count = 0  # Reset because check_env increments via step()

        # RL Agent
        _rl_model = DQN('MlpPolicy', _rl_env, verbose=1, learning_rate=LEARN_RATE, exploration_initial_eps=EXPLORE_INIT,
                        exploration_final_eps=EXPLORE_FINAL, gamma=GAMMA)
        print("Beginning RL training")
        _rl_model.learn(total_timesteps=TRAIN_TIME_STEPS)
        print("Finished RL training")
        print("Saving trained model")
        _rl_model.save("agent_" + time.strftime("%d-%m-%YT%H-%M-%S"))
    elif e_char == 't':
        # RL Agent Testing pass

        # AI Gym Environment check - only do this when testing a new environment (resets RNG seed)
        # check_env(_rl_env)
        _step_count = 0  # Reset because check_env increments via step()

        print("Loading trained model")
        if _rl_model is None:
            _rl_model = DQN.load(LOAD_MODEL_NAME)

        obs = _rl_env.reset()
        done = False
        while not done:
            rl_action, states = _rl_model.predict(obs, deterministic=True)
            print("Action " + str(rl_action))
            obs, rewards, done, info = _rl_env.step(rl_action)
    elif e_char == 'r':
        # RL flow debugging (no agent involved, emulate actions randomly)
        if _is_first_step:
            _rl_env.reset()
            _is_first_step = False
        else:
            rand_action = random.randrange(1)  # NOTE: randrange(1) always yields 0, so the "random" action is fixed
            rl_action_step(rand_action)
    else:
        pass
                        target_update_interval=10000,
                        exploration_fraction=0.5,
                        exploration_initial_eps=1.0,
                        exploration_final_eps=0.05,
                        max_grad_norm=10,
                        tensorboard_log=log_dir,
                        create_eval_env=False,
                        policy_kwargs=None,
                        verbose=2,
                        seed=None,
                        device='cuda',
                        _init_setup_model=True)
            model.learn(total_timesteps=NUM_EPISODES * MAX_STEPS,
                        tb_log_name=log_name,
                        log_interval=1)
            model.save(model_name)

        if ALGORITHM == 'PPO':
            from stable_baselines3.ppo import MlpPolicy
            model = PPO(MlpPolicy, env, tensorboard_log=log_dir, verbose=2)
            model.learn(total_timesteps=NUM_EPISODES * MAX_STEPS,
                        tb_log_name=log_name,
                        log_interval=1)
            model.save(model_name)

        else:
            print('!ERROR: incorrect algorithm selection!')

        del model

    else:
Example #27
                exploration_fraction=0.4)

    #env_eval = Monitor(env, './logs/')

    eval_callback = EvalCallback(env,
                                 best_model_save_path='./logs/',
                                 log_path='./logs/',
                                 eval_freq=1000,
                                 deterministic=True,
                                 render=False)

    #Deeper NN
    #model = DQN.load("DQN", env=env)
    model.learn(total_timesteps=5_000_000,
                callback=eval_callback)  # Typically not enough
    model.save("DQN")
    #model = DQN.load("DQN", env=env)
    model = DQN.load("logs/best_model", env=env)
    #model = PPO.load("PPO_discrete", env=env)

    logger = Logger(logging_freq_hz=int(env.SIM_FREQ / env.AGGR_PHY_STEPS),
                    num_drones=ARGS.num_drones)
    obs = env.reset()
    start = time.time()
    n_trial = 0
    for i in range(ARGS.duration_sec * env.SIM_FREQ):
        if ARGS.duration_sec * env.SIM_FREQ % AGGR_PHY_STEPS == 0:
            action, _states = model.predict(
                obs,
                deterministic=True,
            )
def learn_with_selfplay(max_agents,
                        num_learn_steps,
                        num_learn_steps_pre_training,
                        num_eval_eps,
                        num_skip_steps=0,
                        model_name='dqn',
                        only_rule_based_op=False,
                        patience=5,
                        image_observations=True,
                        output_folder="output",
                        fine_tune_on=None,
                        opponent_pred_obs=False,
                        adversarial_training=None,
                        save_freq=None):
    """
    Train an agent with regular self-play. If there are checkpoints from previous training, continue training from those checkpoints.

    :param max_agents: Stop after max_agents intermediate agents have been trained. An intermediate agent is saved whenever training
    successfully produced an improved agent.
    :param num_learn_steps: Number of frames / steps for every learning iteration
    :param num_learn_steps_pre_training: Number of frames / steps for pre-training against the rule-based agent
    :param num_eval_eps: Number of episodes for intermediate evaluation. Intermediate evaluation determines whether the trained agent
    improved compared to its previous version
    :param num_skip_steps: Skip num_skip_steps frames, repeating the action from the previous step
    :param model_name: Name for saving the model. If there are already checkpoints with this name, training is continued. Checkpoints are
    saved as model_name{i}, where i is the training iteration.
    :param only_rule_based_op: If set to True, training is only performed against the rule-based agent.
    :param patience: Patience parameter for the evaluation
    :param image_observations: Use image observations instead of feature observations
    :param output_folder: Root folder for outputs
    :param fine_tune_on: If not None, instead of self-play training, train an adversarial policy against the victim specified as a
    string in this parameter
    :param opponent_pred_obs:
        If this is set to True, the predictions of the opponents in the current state will be concatenated to the observations for the
        main agent. This was an attempt to create a stronger adversarial policy that could use this information; however, in our
        experiments this didn't improve the adversarial policy
    :param adversarial_training: If set to True, perform adversarial training using FGSM during training.
    :param save_freq: If not None, save intermediate checkpoints during training with the given frequency
    :return:
    """
    eval_env, eval_env_rule_based, eval_op, train_env, train_env_rule_based = _init_envs(image_observations,
                                                                                         num_skip_steps,
                                                                                         opponent_pred_obs,
                                                                                         adversarial_training)

    # If fine tuning, load model to fine-tune from path
    if fine_tune_on is not None:
        path = Path(output_folder) / 'models' / fine_tune_on
        fine_tune_model = DQN.load(path)
        fine_tune_model.tensorboard_log = None
        if opponent_pred_obs:
            # We can't eval on agents that don't have a q_net so we change eval_op to the original model that is being
            # fine-tuned against, instead of the rule-based agent
            eval_op = fine_tune_model
            eval_env_rule_based.set_opponent(eval_op)
            eval_env_rule_based = OpponentPredictionObs(eval_env_rule_based)
            eval_env.set_opponent(eval_op)
            eval_env = OpponentPredictionObs(eval_env)
    else:
        fine_tune_model = None

    # Initialize first agent
    pre_train_agent = SimpleRuleBasedAgent(train_env_rule_based)
    previous_models = [pre_train_agent]

    # Load potentially saved previous models
    for opponent_id in range(1, max_agents):
        path = _make_model_path(output_folder, model_name, opponent_id)
        if os.path.isfile(path):
            model = DQN.load(path)
            previous_models.append(model)
        else:
            break

    # Initialize first round
    last_agent_id = len(previous_models) - 1
    prev_num_steps = 0
    patience_counter = 0
    tb_path = Path(output_folder) / "tb-log"
    if last_agent_id == 0:
        # main_model = A2C('MlpPolicy', policy_kwargs=dict(optimizer_class=RMSpropTFLike, optimizer_kwargs=dict(eps=1e-5)), env=train_env, verbose=0,
        #                 tensorboard_log="output/tb-log")
        # main_model = A2C('MlpPolicy', train_env, verbose=0, tensorboard_log="output/tb-log")  # , exploration_fraction=0.3)
        main_model = DQN('MlpPolicy', train_env_rule_based, verbose=0, tensorboard_log=tb_path)  # , exploration_fraction=0.3)
    else:
        main_model = copy.deepcopy(previous_models[last_agent_id])
        main_model.set_env(train_env)
        main_model.tensorboard_log = tb_path

    # Start training with self-play over several rounds
    opponent_id = last_agent_id
    while opponent_id < max_agents - 1:
        print(f"Running training round {opponent_id + 1}")
        if fine_tune_on is None:
            # Choose opponent based on setting
            if only_rule_based_op:
                current_train_env = train_env_rule_based
                # Use rule-based as opponent
                current_train_env.set_opponent(SimpleRuleBasedAgent(current_train_env))
            else:
                if opponent_id == 0:
                    current_train_env = train_env_rule_based
                else:
                    current_train_env = train_env
                # Take opponent from the previous version of the model
                current_train_env.set_opponent(previous_models[opponent_id])
        else:  # Use passed fine-tune agent as opponent
            current_train_env = train_env
            current_train_env.set_opponent(fine_tune_model)

        # Train the model
        current_train_env.set_opponent_right_side(True)

        chosen_n_steps = num_learn_steps_pre_training if opponent_id == 0 else num_learn_steps  # Iteration 0 is pre-training

        # In order to generate adversarial examples, the adversarial training wrapper needs a reference to the model that is
        # currently being trained
        if adversarial_training is not None:
            current_train_env.env.victim_model = main_model

        # Optionally add a callback to save intermediate checkpoints
        if save_freq is not None:
            checkpoint_callback = CheckpointCallback(save_freq=save_freq,
                                                     save_path='./output/intermediate/',
                                                     name_prefix=model_name + str(opponent_id + 1) + '_interm')
        else:
            checkpoint_callback = None

        # === LEARNING ===
        main_model.learn(total_timesteps=chosen_n_steps, tb_log_name=model_name, callback=checkpoint_callback)

        # Do evaluation for this training round
        eval_env_rule_based.set_opponent(eval_op)
        avg_round_reward, num_steps = evaluate(main_model, eval_env_rule_based, num_eps=num_eval_eps)
        print(model_name)
        print(f"Average round reward after training: {avg_round_reward}")
        print(f"Average number of steps per episode: {num_steps / num_eval_eps}")

        # Check if there was improvement
        if num_steps > prev_num_steps:  # Model improved compared to last
            print('Model improved')
            prev_num_steps = num_steps
            # Reset patience counter
            patience_counter = 0

            # Save the further trained model to disk
            main_model.save(_make_model_path(output_folder, model_name, opponent_id + 1))
            # Make a copy of the just saved model by loading it
            copy_of_model = DQN.load(_make_model_path(output_folder, model_name, opponent_id + 1))
            # Save the copy to the list
            previous_models.append(copy_of_model)

            # From here we continue training the same main_model against itself
            opponent_id += 1
        else:
            print('Model did not improve')
            patience_counter += 1
            # Do not save the model
            if patience_counter > patience:
                print('Stopping early due to patience')
                break
            # Because our model did not improve compared to the previous one, we reset our main_model to the previous one
            main_model = DQN.load(_make_model_path(output_folder, model_name, opponent_id))
            main_model.set_env(train_env)

            # Opponent does not change

    if not opponent_pred_obs:
        # Evaluate the last model against each of its previous iterations
        # evaluate_against_predecessors(previous_models, env_rule_based=eval_env_rule_based, env_normal=eval_env, num_eval_eps=num_eval_eps)
        pass  # Not useful right now
Example #29
model = DQN(MlpPolicy, env, verbose=1)

#model = DQN(MlpPolicy, env, seed=1423, target_update_interval =5, batch_size=16, train_freq=128, buffer_size=256, gamma=0.95, learning_rate=1e-3, verbose=1)

print("start model evaluation without learning !")
mean_reward_before, std_reward_before = evaluate_policy(model,
                                                        env,
                                                        n_eval_episodes=100)
print("end model evaluation !")

print("start model learning !")
model.learn(total_timesteps=10000, log_interval=4)
print("end model learning !")

print("-> model saved !!")
model.save("dqn_cartpole")

print("start model evaluation with learning !")
mean_reward_after, std_reward_after = evaluate_policy(model,
                                                      env,
                                                      n_eval_episodes=100)
print("end model evaluation !")

print("-> model evaluation without learning")
print(
    f"mean_reward:{mean_reward_before:.2f} +/- std_reward:{std_reward_before:.2f}"
)

print("-> model evaluation with learning")
print(
    f"mean_reward:{mean_reward_after:.2f} +/- std_reward:{std_reward_after:.2f}"
eval_log_path = "eval_logs/dqn_evolve_rl_eval_{}_{}_{}_{}".format(env_name, loss_type, seed, time_int)
eval_callback = EvalCallback(eval_env, log_path=eval_log_path, eval_freq=eval_freq, deterministic=True, render=False, n_eval_episodes=5)

if env_name == 'MountainCar-v0':
    buffer_size = 10000  # max(total_timesteps // 100, 500)
    learning_starts = 1000  # max(total_timesteps // 1000, 100)
    learning_rate = 4e-3
    batch_size = 128
    gamma = 0.98
    train_freq = 16
    target_update_interval = 600
    gradient_steps = 8
    exploration_fraction = 0.2  # (learning_starts + 1000)/total_timesteps
    exploration_final_eps = 0.07

    model = DQN(MlpPolicy, env, policy_kwargs=policy_kwargs, target_update_interval=target_update_interval,
                exploration_fraction=exploration_fraction,
                buffer_size=buffer_size, train_freq=train_freq, learning_starts=learning_starts, seed=seed,
                tensorboard_log=tensorboard_log, verbose=1,
                loss_type=loss_type, dqn_reg_loss_weight=dqn_reg_loss_weight,
                batch_size=batch_size, learning_rate=learning_rate, gamma=gamma, gradient_steps=gradient_steps,
                exploration_final_eps=exploration_final_eps)
else:
    model = DQN(MlpPolicy, env, policy_kwargs=policy_kwargs, target_update_interval=target_update_interval, exploration_fraction=exploration_fraction,
                buffer_size=buffer_size, train_freq=train_freq, learning_starts=learning_starts, seed=seed, tensorboard_log=tensorboard_log, verbose=1,
                loss_type=loss_type, dqn_reg_loss_weight=dqn_reg_loss_weight)

model.learn(total_timesteps=total_timesteps, log_interval=100, callback=eval_callback)

model.save(model_save_name)