def main():
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    if not USE_LOADED_MODEL:
        model = ACKTR('MlpPolicy', env, verbose=1)

        # Multiprocessed RL Training
        start_time = time.time()
        model.learn(total_timesteps=n_timesteps, log_interval=10)
        total_time_multi = time.time() - start_time

        model.save("cartpole_v1_acktr")

    loaded_model = ACKTR.load("cartpole_v1_acktr")
    loaded_model.set_env(env)

    # Single Process RL Training
    single_process_model = ACKTR('MlpPolicy', env_id, verbose=1)
    start_time = time.time()
    single_process_model.learn(n_timesteps)
    total_time_single = time.time() - start_time

    print("Single-process: {0}s, Multi-process: {1}s".format(
        total_time_single, total_time_multi))

    # create separate clean environment for evaluation
    eval_env = gym.make(env_id)
    mean_reward, std_reward = evaluate_policy(loaded_model,
                                              eval_env,
                                              n_eval_episodes=10)
    print(f'Mean reward: {mean_reward} +/- {std_reward:.2f}')
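# NOTE: not part of the original example -- a minimal sketch of the make_env
# factory this snippet assumes, following the usual stable-baselines recipe
# (it relies on `import gym` and stable_baselines.common.set_global_seeds, and on
# module-level env_id / num_cpu / n_timesteps / USE_LOADED_MODEL settings that
# the page does not show).
def make_env(env_id, rank, seed=0):
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init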
Example #2
if __name__ == "__main__":
    env_id = "/home/jim/projects/unity_ray/basic_env_linux/basic_env_linux"
    #env = UnityEnv(env_id, worker_id=2, use_visual=False)
    # Create log dir
    time_int = int(time.time())
    log_dir = "stable_results/basic_env_{}/".format(time_int)
    os.makedirs(log_dir, exist_ok=True)

    #env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
    num_env = 2
    worker_id = 9
    env = SubprocVecEnv([make_env(env_id, log_dir, i+worker_id) for i in range(num_env)])

    model = ACKTR(MlpPolicy, env, verbose=1, ent_coef=0.)
    model.learn(total_timesteps=30000)
    model.save(log_dir+"model")

    #evaluate agent
    episodes = 100
    ep_r = []
    ep_l = []
    for e in range(episodes):
        obs = env.reset()
        total_r = 0.
        total_l = 0.
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, infos = env.step(action)
            total_l += 1.
            total_r += rewards[0]
            if dones[0]:
                break
        ep_r.append(total_r)
        ep_l.append(total_l)
    print("Mean episode reward: {:.3f}, mean episode length: {:.1f}".format(
        sum(ep_r) / len(ep_r), sum(ep_l) / len(ep_l)))
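    # NOTE: not part of the original example -- a sketch of the make_env factory
    # this snippet assumes (it would be defined above the __main__ guard), based
    # on the commented-out UnityEnv call above; per-worker Monitor logging into
    # log_dir is an assumption:
    # def make_env(env_id, log_dir, worker_id):
    #     def _init():
    #         env = UnityEnv(env_id, worker_id=worker_id, use_visual=False)
    #         return Monitor(env, os.path.join(log_dir, str(worker_id)))
    #     return _init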
Example #3
def get_reward(r):
    if r == 'magni_reward':
        return magni_reward
    elif r == 'cameron_reward':
        return cameron_reward
    elif r == 'risk_event':
        return risk_event
    elif r == 'reward_target':
        return reward_target
    elif r == 'default':
        return risk_diff
    else:
        assert False, "Reward must be valid ('magni_reward', 'cameron_reward', 'risk_event', 'reward_target', 'default')"


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-g", "--age_group", default="adult")
    parser.add_argument("-r", "--reward", default="default")
    args = parser.parse_args()
    group = get_group(args.age_group)
    reward_fun = get_reward(args.reward)

    env = DummyVecEnv([
        make_env(group + '#0{}'.format(str(i).zfill(2)), i, reward_fun)
        for i in range(1, 11)
    ])
    model = ACKTR(MlpLstmPolicy, env, verbose=1)
    model.learn(total_timesteps=256000)
    model.save("ACKTR_MlpLSTM_" + group + "_def_reward")
Example #4
def magni_reward(bg_hist, **kwargs):
    bg = bg_hist[-1]
    fBG = 3.5506*(np.log(bg)**.8353-3.7932)
    risk = 10 * (fBG)**2
    return -1*risk

def cameron_reward(bg_hist, **kwargs):
    bg = bg_hist[-1]
    a = .2370  # 1/(mg/dL)
    b = -36.21
    c = 6.0e-5  # (1/(mg/dL)**3)
    d = 177  # mg/dL
    if bg < d:
        risk = a*bg+b+(c*(d-bg)**3)
    else:
        risk = a*bg+b
    return -1*risk

person_options = (['child#0{}'.format(str(i).zfill(2)) for i in range(1, 11)]
                  + ['adolescent#0{}'.format(str(i).zfill(2)) for i in range(1, 11)]
                  + ['adult#0{}'.format(str(i).zfill(2)) for i in range(1, 11)])
for i, p in enumerate(person_options):

    patient_id = p.split('#')[0] + str(i + 1)
    # Create a simulation environment
    print(p)
    patient = T1DPatient.withName(p)
    register(id='simglucose-'+p+'-v0',
             entry_point='simglucose.envs:T1DSimEnv',
             kwargs={'patient_name': p, 'reward_fun': reward_target})
    env = gym.make('simglucose-'+p+'-v0')

    model = ACKTR(MlpLstmPolicy, env, verbose=1)
    model.learn(total_timesteps=250000)
    model.save('mlplstm_trained-'+p+'-reward_target')
    print('Model Trained and Saved for : '+ p)
Example #5
    #env = CustomEnv(3, 6, "tcp://*:5556")
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    # Create log dir
    log_dir = "Logs/Custom_env/"
    os.makedirs(log_dir, exist_ok=True)
    # Create the callback: check every 500 steps
    callback = SaveOnBestTrainingRewardCallback(check_freq=500,
                                                log_dir=log_dir)

    #env = Monitor(env, log_dir)

    model = ACKTR(MlpPolicy, env, verbose=2)
    #model.load("DQN_agent")
    model.learn(total_timesteps=20000, callback=callback)
    model.save("temp_agent")

    input("Training completed, press Enter to run the trained agent")

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        probs = model.action_probability(obs)
        obs, rewards, dones, info = env.step(action)
        print("Observation:", obs, rewards, probs)

    results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS,
                                 "Lane Manager")
    plt.show()
Example #6
    env = VecFrameStack(env, 3)

    model = ACKTR(get_policy(policy),
                  env,
                  n_steps=100,
                  verbose=0,
                  gae_lambda=0.95,
                  vf_fisher_coef=0.5,
                  tensorboard_log=tensorboard_folder,
                  kfac_update=10,
                  n_cpu_tf_sess=2,
                  async_eigen_decomp=False)
    model.learn(total_timesteps=100000000,
                tb_log_name='ACKTR_PPO2' + model_tag)

    model.save(model_folder + "ACKTR_PPO2" + model_tag)
    del model
    model = ACKTR.load(model_folder + "ACKTR_PPO2" + model_tag)

    done = False
    states = None
    action_masks = []
    obs = env.reset()

    while not done:
        action, states = model.predict(obs, states, action_mask=action_masks)
        obs, _, done, infos = env.step(action)
        env.render()
        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            if env_action_mask is not None:
                action_masks.append(env_action_mask)
Example #7
if config['algorithm'] == 'ACKTR':
    model = ACKTR(config['policy_network'],
                  env,
                  learning_rate=config['learning_rate'],
                  gamma=config['gamma'],
                  policy_kwargs=config['policy_kwargs'],
                  verbose=1,
                  tensorboard_log=save_path)

elif config['algorithm'] == 'PPO2':
    env = make_vec_env(lambda: env, n_envs=1)
    model = PPO2(config['policy_network'],
                 env,
                 learning_rate=config['learning_rate'],
                 gamma=config['gamma'],
                 policy_kwargs=config['policy_kwargs'],
                 verbose=1,
                 tensorboard_log=save_path)

elif config['algorithm'] == 'DQN':
    model = DQN(
        config['policy_network'],
        env,
        learning_rate=config['learning_rate'],
        buffer_size=config['buffer_size'],
        target_network_update_freq=64,
        gamma=config['gamma'],  # policy_kwargs = config['policy_kwargs'],
        verbose=1,
        tensorboard_log=save_path)

model.learn(config['total_steps'], callback=callback)
model.save(os.path.join(save_path, 'model'))

env.close()
Example #8
        print(env_id)
        return env

    set_global_seeds(seed)
    return _init


if __name__ == "__main__":
    env = DummyVecEnv([
        make_env('adolescent#0{}'.format(str(i).zfill(2)), i)
        for i in range(1, 11)
    ])
    # model = SAC(LnMlpPolicy, env, verbose=1)
    model = ACKTR(MlpLstmPolicy, env, verbose=1)
    model.learn(total_timesteps=256000)
    model.save("ACKTR_MlpLSTM_adolescent_def_reward")

# for i,p in enumerate(child_options):
#     patient_id = p.split('#')[0] + str(i + 1)

#     register(
#         id='simglucose-' + patient_id + '-v0',
#         entry_point='simglucose.envs:T1DSimEnv',
#         kwargs={'patient_name': p}
#     )

#     env = gym.make('simglucose-' + patient_id + '-v0')
#     model = SAC(LnMlpPolicy, env, verbose=1)
#     print(p, patient_id)
#     model.learn(total_timesteps=250000)
#     print("Finished training for " + patient_id)
Example #9
# Log dir
log_dir = "./tmp10/"
os.makedirs(log_dir, exist_ok=True)
callback = SaveOnBestTrainingRewardCallback(check_freq=10000,
                                            log_dir=log_dir,
                                            save_name="acktr")

env = Manipulator2D()

# multiprocess environment
#env = make_vec_env('CartPole-v1', n_envs=4)
env = Monitor(env, log_dir)
# Custom MLP policy of two layers of size 32 each with tanh activation function
#policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[32, 32])

# Create the agent

#model = PPO2(MlpPolicy, env, verbose=1, policy_kwargs=policy_kwargs,)
#model = PPO2(MlpPolicy, env, verbose=1)
# Train the agent
model = ACKTR(MlpPolicy, env, verbose=1, ent_coef=0.0)
# reference hyperparameters: n_timesteps=3e5, policy='MlpPolicy', ent_coef=0.0
model.learn(total_timesteps=20000000, callback=callback)
# Save the agent
model.save("acktr-man1")

# del model
# # the policy_kwargs are automatically loaded
# model = PPO2.load("ppo2-cartpole")
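# NOTE: not in the original snippet -- a sketch of reloading the saved ACKTR
# agent and running it on the Manipulator2D env, mirroring the commented-out
# PPO2 lines above:
# model = ACKTR.load("acktr-man1")
# obs = env.reset()
# for _ in range(1000):
#     action, _states = model.predict(obs, deterministic=True)
#     obs, reward, done, info = env.step(action)
#     if done:
#         obs = env.reset()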
Example #10
class ACKTR_Agent:
    def __init__(self, params: Params):
        self.params: Params = params
        policy_name = self.params.agent_config['policy']
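        # resolve the policy class object (e.g. MlpPolicy) from its name in the config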
        self.policy = eval(policy_name)

    def create_model(self, n_envs=1):
        """ Create env and agent model """
        env_cls = SprEnv
        self.env = make_vec_env(env_cls,
                                n_envs=n_envs,
                                env_kwargs={"params": self.params},
                                seed=self.params.seed)
        self.model = ACKTR(
            self.policy,
            self.env,
            gamma=self.params.agent_config['gamma'],
            n_steps=self.params.agent_config['n_steps'],
            ent_coef=self.params.agent_config['ent_coef'],
            vf_coef=self.params.agent_config['vf_coef'],
            vf_fisher_coef=self.params.agent_config['vf_fisher_coef'],
            max_grad_norm=self.params.agent_config['max_grad_norm'],
            learning_rate=self.params.agent_config['learning_rate'],
            gae_lambda=self.params.agent_config['gae_lambda'],
            lr_schedule=self.params.agent_config['lr_schedule'],
            kfac_clip=self.params.agent_config['kfac_clip'],
            kfac_update=self.params.agent_config['kfac_update'],
            async_eigen_decomp=self.params.agent_config['async_eigen_decomp'],
            verbose=self.params.agent_config['verbose'],
            tensorboard_log="./tb/acktr/",
            seed=self.params.seed,
            policy_kwargs={"params": self.params})

    def train(self):
        with ProgressBarManager(self.params.training_duration) as callback:
            self.model.learn(total_timesteps=self.params.training_duration,
                             tb_log_name=self.params.tb_log_name,
                             callback=callback)

    def test(self):
        self.params.test_mode = True
        obs = self.env.reset()
        self.setup_writer()
        episode = 1
        step = 0
        episode_reward = [0.0]
        done = False
        # Test for 1 episode
        while not done:
            action, _states = self.model.predict(obs)
            obs, reward, dones, info = self.env.step(action)
            episode_reward[episode - 1] += reward[0]
            if info[0]['sim_time'] >= self.params.testing_duration:
                done = True
                self.write_reward(episode, episode_reward[episode - 1])
                episode += 1
            sys.stdout.write(
                "\rTesting: " +
                f"Current Simulator Time: {info[0]['sim_time']}. Testing duration: {self.params.testing_duration}"
            )
            sys.stdout.flush()
            step += 1
        print("")

    def save_model(self):
        """ Save the model to a zip archive """
        self.model.save(self.params.model_path)

    def load_model(self, path=None):
        """ Load the model from a zip archive """
        if path is not None:
            self.model = ACKTR.load(path)
        else:
            self.model = ACKTR.load(self.params.model_path)
            # Copy the model to the new directory
            self.model.save(self.params.model_path)

    def setup_writer(self):
        episode_reward_filename = f"{self.params.result_dir}/episode_reward.csv"
        episode_reward_header = ['episode', 'reward']
        self.episode_reward_stream = open(episode_reward_filename,
                                          'a+',
                                          newline='')
        self.episode_reward_writer = csv.writer(self.episode_reward_stream)
        self.episode_reward_writer.writerow(episode_reward_header)

    def write_reward(self, episode, reward):
        self.episode_reward_writer.writerow([episode, reward])
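# NOTE: not part of the original class -- a hedged usage sketch; how Params is
# constructed depends on the surrounding project and is not shown on this page:
# if __name__ == "__main__":
#     params = Params(...)          # hypothetical: built from the project's config
#     agent = ACKTR_Agent(params)
#     agent.create_model(n_envs=1)
#     agent.train()
#     agent.save_model()
#     agent.test()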
Example #11
#         index = np.argmin(best_mean_reward)
#         if mean_reward > best_mean_reward[index]:
#             best_mean_reward[index] = mean_reward
#             print('best_mean_reward', best_mean_reward)
#             _locals['self'].save(log_dir + 'best_model_{}.pkl'.format(str(mean_reward)))
#     n_steps += 1
#     return False

# log_dir = 'LiveStream_1229/ACKTRCust3_deletem8_zhongwang_diff_delay/'
log_dir = 'ACKTRtest/'

if not os.path.exists(log_dir):
    os.makedirs(log_dir)

os.environ['CUDA_VISIBLE_DEVICES'] = '1'
tstart = time.time()
num_cpu = 2

env = SubprocVecEnv([make_env(i, log_dir) for i in range(num_cpu)])

model = ACKTR(
    env=env,
    policy=LstmCust3Policy,
    verbose=1,
)

model.learn(total_timesteps=int(5e6), callback=callback)
model.save(log_dir + "last_model")

print('Time taken: {:.2f}'.format(time.time() - tstart))
Example #12
def train(environment, algorithm, timesteps):
    from envs import cpa, mountain_car

    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.bench import Monitor
    from stable_baselines import PPO2, ACKTR, DQN, A2C

    now = datetime.now()
    current_time = now.strftime("%Y-%m-%d-%H-%M-%S")

    training_info_dir = "training_info" + os.path.sep
    current_training_info = "{}-{}-{}".format(current_time, algorithm, environment)
    current_training_info_dir = training_info_dir + current_training_info + os.path.sep

    model_file_path = current_training_info_dir + "model"
    log_file_path = current_training_info_dir + "monitor.csv"

    tensorboard_dir = training_info_dir + TENSORBOARD_DIR_NAME + os.path.sep

    dirs_to_create = [model_file_path, tensorboard_dir]

    for directory in dirs_to_create:
        create_dir(directory)

    env = None

    if environment == 'cpa_sparse':
        env = cpa.CPAEnvSparse()
    elif environment == 'cpa_dense':
        env = cpa.CPAEnvDense()
    elif environment == 'mc_sparse':
        env = mountain_car.MountainCarSparseEnv()
    elif environment == 'mc_dense':
        env = mountain_car.MountainCarDenseEnv()
    else:
        raise Exception("Environment '{}' is unknown.".format(environment))

    # Optional: PPO2 requires a vectorized environment to run
    # the env is now wrapped automatically when passing it to the constructor
    env = Monitor(env, filename=log_file_path, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = None

    if algorithm == 'acktr':
        model = ACKTR('MlpPolicy', env, verbose=1, tensorboard_log=tensorboard_dir)
    elif algorithm == 'ppo':
        model = PPO2('MlpPolicy', env, verbose=1, tensorboard_log=tensorboard_dir)
    elif algorithm == 'a2c':
        model = A2C('MlpPolicy', env, verbose=1, tensorboard_log=tensorboard_dir)
    elif algorithm == 'dqn':
        model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=tensorboard_dir)
    else:
        raise Exception("Algorithm '{}' is unknown.".format(algorithm))

    # Train the agent
    model.learn(total_timesteps=timesteps, tb_log_name=current_training_info)

    model.save(model_file_path)

    print("Finished training model: {}. Saved training info in: {}".format(model, current_training_info_dir))
Example #13
import gym
gym.logger.set_level(40)
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

from env import GoLeftEnv
from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.vec_env import VecVideoRecorder, DummyVecEnv
from stable_baselines.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

env = GoLeftEnv(grid_size=10)
env = make_vec_env(lambda: env, n_envs=1)

callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=0.9,
                                                 verbose=1)
eval_callback = EvalCallback(env,
                             callback_on_new_best=callback_on_best,
                             verbose=1)

model = ACKTR('MlpPolicy', env, verbose=1)
model.learn(int(1e10), callback=eval_callback)

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

model.save('models/best')

env.close()
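# NOTE: VecVideoRecorder is imported above but never used. A sketch of how it
# could wrap the vectorized env to record a short evaluation clip before
# env.close() (folder, trigger and length are illustrative, not from the
# original code):
# video_env = VecVideoRecorder(env, "videos/",
#                              record_video_trigger=lambda step: step == 0,
#                              video_length=200)
# obs = video_env.reset()
# for _ in range(200):
#     action, _ = model.predict(obs, deterministic=True)
#     obs, _, _, _ = video_env.step(action)
# video_env.close()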
Example #14
if not os.path.isdir(tensorboard_folder):
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: BaseEnv(10, 10)])

model = ACKTR(get_policy(policy),
              env,
              verbose=0,
              tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=10000000, tb_log_name='ACKTR_A2C' + model_tag)

model.save(model_folder + "ACKTR_A2C" + model_tag)
del model
model = ACKTR.load(model_folder + "ACKTR_A2C" + model_tag)

done = False
states = None
obs = env.reset()

while not done:
    action, states = model.predict(obs, states)
    obs, _, done, info = env.step(action)
    env.render()
Example #15
        model.learn(total_timesteps=1000000,
                    reset_num_timesteps=False,
                    callback=callback)
        model.save(log_dir + 'model_PPO_' + str(id + 1))

    if args.algo == "acktr":
        id = balboa.utils.tensorboard_latest_directory_number(
            log_dir, 'ACKTR_')
        print('Using acktr')
        if args.load_id is None:
            # tensorboard_log=log_dir
            model = ACKTR("MlpPolicy",
                          env,
                          policy_kwargs=policy_kwargs,
                          ent_coef=0.0,
                          verbose=1)
            # verbose=1, n_steps=48, learning_rate=0.1, lr_schedule='constant',
        else:
            print("Loading model: " + str(args.load_id))
            model = ACKTR.load(log_dir + 'ACKTR_' + str(args.load_id) + ".zip",
                               env=env)
        model.tensorboard_log = log_dir
        # model.learning_rate = stable_baselines.common.schedules.LinearSchedule(1.0, 0.06, initial_p=0.06).value
        # model.cliprange = stable_baselines.common.schedules.LinearSchedule(1.0, 0.2, initial_p=0).value

        model.learn(total_timesteps=3000000,
                    reset_num_timesteps=False,
                    callback=callback)
        print("Saving to: " + log_dir + 'ACKTR_' + str(id + 1))
        model.save(log_dir + 'model_ACKTR_' + str(id + 1))
Example #16
import gym

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import ACKTR

# multiprocess environment
n_cpu = 4
env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])

model = ACKTR(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("acktr_cartpole")

del model  # remove to demonstrate saving and loading

model = ACKTR.load("acktr_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
Example #17
        env.seed(seed)
        print(env_id)
        return env

    set_global_seeds(seed)
    return _init


if __name__ == "__main__":
    env = DummyVecEnv([
        make_env('adult#0{}'.format(str(i).zfill(2)), i) for i in range(1, 11)
    ])
    # model = SAC(LnMlpPolicy, env, verbose=1)
    model = ACKTR(MlpLstmPolicy, env, verbose=1)
    model.learn(total_timesteps=256000)
    model.save("ACKTR_MlpLSTM_adult_def_reward")

# for i,p in enumerate(child_options):
#     patient_id = p.split('#')[0] + str(i + 1)

#     register(
#         id='simglucose-' + patient_id + '-v0',
#         entry_point='simglucose.envs:T1DSimEnv',
#         kwargs={'patient_name': p}
#     )

#     env = gym.make('simglucose-' + patient_id + '-v0')
#     model = SAC(LnMlpPolicy, env, verbose=1)
#     print(p, patient_id)
#     model.learn(total_timesteps=250000)
#     print("Finished training for " + patient_id)