Example #1
def train():
    best_reward, best_reward_timesteps = None, None
    save_path = "model_save/"+MODEL_PATH+"/"
    if save_path is not None:
        os.makedirs(save_path, exist_ok=True)

    # log_dir = f"model_save/"
    log_dir = save_path
    env, env_eval = ENV(util='train', par=PARAM, dt=DT), ENV(util='val', par=PARAM, dt=DT)
    env, env_eval = Monitor(env, log_dir), Monitor(env_eval, log_dir)
    env, env_eval = DummyVecEnv([lambda: env]), DummyVecEnv([lambda: env_eval])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True,
    #                clip_obs=10.)

    if PARAM['algo']=='td3':
        model = TD3('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'], seed=PARAM['seed'],
                    learning_starts=PARAM['learning_starts'])
    elif PARAM['algo']=='ddpg':
        model = DDPG('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'], seed=PARAM['seed'],
                     learning_starts=PARAM['learning_starts'])
    elif PARAM['algo']=='ppo':
        model = PPO('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'], seed=PARAM['seed'])

    eval_callback = EvalCallback(env_eval, best_model_save_path=save_path+MODEL_PATH+'_best_model',
                                 log_path=log_dir, eval_freq=PARAM['eval_freq'], save_freq=PARAM['save_freq'],
                                 deterministic=True, render=False)

    model.learn(total_timesteps=int(PARAM['total_time_step']), callback=eval_callback, log_interval=500)
    print("best mean reward:", eval_callback.best_mean_reward_overall, "timesteps:", eval_callback.best_mean_reward_timestep)
    model.save(save_path+MODEL_PATH+'_final_timesteps')
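
A brief follow-up sketch, not part of the original example, showing how the best checkpoint written by the EvalCallback above could be reloaded and scored; it assumes PARAM['algo'] == 'td3' and that ENV also accepts util='test' in addition to 'train' and 'val':

from stable_baselines3 import TD3
from stable_baselines3.common.evaluation import evaluate_policy

def evaluate_best_model():
    # EvalCallback writes the checkpoint as "best_model.zip" inside best_model_save_path
    best_model_dir = "model_save/" + MODEL_PATH + "/" + MODEL_PATH + "_best_model"
    env_test = ENV(util='test', par=PARAM, dt=DT)   # util='test' is an assumption
    model = TD3.load(best_model_dir + "/best_model", env=env_test)
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    print(f"best model mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")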
Example #2
    def train_DDPG(self, model_name, model_params=config.DDPG_PARAMS):
        """DDPG model"""
        from stable_baselines3 import DDPG
        from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise


        env_train = self.env

        n_actions = env_train.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1*np.ones(n_actions))


        start = time.time()
        model = DDPG('MlpPolicy', 
                    env_train,
                    batch_size=model_params['batch_size'],
                    buffer_size=model_params['buffer_size'],
                    action_noise=action_noise,
                    verbose=model_params['verbose'],
                    tensorboard_log = f"{config.TENSORBOARD_LOG_DIR}/{model_name}"
                    )
        model.learn(total_timesteps=model_params['timesteps'], tb_log_name = "DDPG_run")
        end = time.time()

        model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
        print('Training time (DDPG): ', (end-start)/60,' minutes')
        return model
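
A hypothetical call site for the method above (the enclosing agent class is not part of this excerpt; "agent" is an assumed instance, and config.DDPG_PARAMS is assumed to hold the batch_size, buffer_size, verbose and timesteps keys used above):

# sketch only: 'agent' stands for an instance of the (unshown) enclosing class
ddpg_model = agent.train_DDPG(model_name="ddpg_demo", model_params=config.DDPG_PARAMS)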
Example #3
def main():
    """
   # Example with Vectorized env
   num_cpu = 4  # Number of processes to use
   my_env_kwargs={'renders': False}
   env = make_vec_env('panda-ip-reach-v0', n_envs=num_cpu, env_kwargs=my_env_kwargs)
   """

    # Example with a simple Dummy vec env
    env = gym.envs.make('panda-ip-reach-v0', renders=False)
    env = DummyVecEnv([lambda: env])

    #check_env(pandaenv)

    # The noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    print("n_actions = {0}".format(n_actions))

    #action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.1 * np.ones(n_actions))

    model = DDPG(policy='MlpPolicy',
                 env=env,
                 learning_rate=0.001,
                 buffer_size=1000000,
                 learning_starts=100,
                 batch_size=100,
                 tau=0.005,
                 gamma=0.99,
                 train_freq=1,
                 gradient_steps=-1,
                 action_noise=action_noise,
                 optimize_memory_usage=False,
                 tensorboard_log="./ddpg_panda_reach_tensorboard/",
                 create_eval_env=False,
                 policy_kwargs=None,
                 verbose=1,
                 seed=None,
                 device='auto',
                 _init_setup_model=True)
    """
   print("start model evaluation without learning !")
   mean_reward_before, std_reward_before = evaluate_policy(model, env, n_eval_episodes=1)
   print("end model evaluation !")
   """
    print("start model learning !")
    model.learn(total_timesteps=200000, log_interval=10)
    print("end model learning !")

    print("-> model saved !!")
    model.save("ddpg_panda_reach")
    """
   print("start model evaluation with learning !")
   mean_reward_after, std_reward_after = evaluate_policy(model, env, n_eval_episodes=1)
   print("end model evaluation !")
   """
    """
Example #4
def train_ddpg():

    log_dir = f"model_save/"
    env = ENV(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True,
    #                clip_obs=10.)
    model = DDPG("CnnPolicy", env, policy_kwargs=policy_kwargs, verbose=1, batch_size=2048, seed=1, learning_starts=500000)
    callback = SaveOnBestTrainingRewardCallback(check_freq=480, log_dir=log_dir)
    model.learn(total_timesteps=1000000, callback=callback, log_interval=480)
    model.save('model_save/ddpg_cnn')
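
The policy_kwargs passed to the CnnPolicy above is not included in this excerpt; a purely illustrative definition, assuming image observations and stable-baselines3's standard NatureCNN features extractor, could look like:

import torch as th
from stable_baselines3.common.torch_layers import NatureCNN

# illustrative only: the original policy_kwargs is not shown in the excerpt
policy_kwargs = dict(
    features_extractor_class=NatureCNN,
    features_extractor_kwargs=dict(features_dim=256),
    net_arch=[256, 256],          # actor/critic hidden layer sizes
    activation_fn=th.nn.ReLU,
)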
Example #5
def train_DDPG(env_train, model_name, timesteps=10000):
    """DDPG model"""

    # add the noise objects for DDPG
    n_actions = env_train.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

    start = time.time()
    model = DDPG('MlpPolicy', env_train, action_noise=action_noise)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DDPG): ', (end-start)/60,' minutes')
    return model
Example #6
def train_DDPG(env_train, model_name, timesteps=10000):
    """DDPG model"""
    # the noise objects for DDPG
    n_actions = env_train.action_space.shape[-1]
    # action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))

    start = time.time()
    # note: unlike stable-baselines 2.x, stable_baselines3's DDPG takes no "param_noise" keyword
    model = DDPG('MlpPolicy', env_train, action_noise=action_noise)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DDPG): ', (end - start) / 60, ' minutes')
    return model
Example #7
def train():

    log_dir = f"model_save/"
    env = ENV(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True,
    #                clip_obs=10.)

    model = DDPG('MlpPolicy',
                 env,
                 verbose=1,
                 batch_size=PARAM['batch_size'],
                 seed=PARAM['seed'],
                 learning_starts=PARAM['learning_starts'])
    callback = SaveOnBestTrainingRewardCallback(check_freq=480,
                                                log_dir=log_dir)
    model.learn(total_timesteps=int(PARAM['total_time_step']),
                callback=callback,
                log_interval=480)
    model.save('model_save/' + MODEL_PATH)
Example #8
def train_DDPG(env):

    print(f"action space shape -1:{env.action_space.shape[-1]}")

    # The noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.02 * np.ones(n_actions))

    model = DDPG(
        'MlpPolicy',
        env,
        learning_rate=0.0003,
        learning_starts=5,
        train_freq=10,
        n_episodes_rollout=-1,
        buffer_size=100000,
        action_noise=action_noise,
        batch_size=128,
        verbose=2,
    )
    model.learn(total_timesteps=1000000, log_interval=1)

    model.save("DDPG_pkl")
Example #9
for i in range(n_tests):
    test_name = 'saved_models/a2c_soccer_actions_env_1_' + str(i)
    n_actions = env.action_space.shape[-1]
    model = A2C('MlpPolicy', env)
    model.learn(total_timesteps=25000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)

# DDPG algorithm
for i in range(n_tests):
    test_name = 'saved_models/ddpg_soccer_actions_env_1_' + str(i)
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.3) * np.ones(n_actions))
    model = DDPG('MlpPolicy', env, action_noise=action_noise)
    model.learn(total_timesteps=10000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)

for i in range(n_tests):
    test_name = 'saved_models/ddpg_soccer_actions_env_2_' + str(i)
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.3) * np.ones(n_actions))
    policy_kwargs = dict(net_arch=[400, 300])
    model = DDPG('MlpPolicy', env, action_noise=action_noise, policy_kwargs=policy_kwargs)
    model.learn(total_timesteps=10000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)

for i in range(n_tests):
    test_name = 'saved_models/ddpg_soccer_actions_env_3_' + str(i)
    n_actions = env.action_space.shape[-1]
Example #10
    #### Create the callback: check every 1000 steps
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                log_dir=log_dir)

    #### Train the model ###############################################################################
    model = DDPG(CustomPolicy,
                 env,
                 verbose=1,
                 batch_size=64,
                 action_noise=action_noise)

    for i in range(step_iters):  # run for step_iters * training_timesteps

        model.learn(total_timesteps=training_timesteps)

        model.save("./models/ddpg" + str((i + 1) * training_timesteps))
        model.save_replay_buffer("./experiences/ddpg_experience" +
                                 str((i + 1) * training_timesteps))

        #### Show (and record a video of) the model's performance ##########################################
        env_test = RLTetherAviary(gui=False, record=True)
        obs = env_test.reset()
        start = time.time()
        for _ in range(10 * env_test.SIM_FREQ):  # avoid shadowing the outer loop index used in the save paths
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env_test.step(action)
            if done: break
        env_test.close()

    env.close()
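
A small sketch, assuming the same env and file naming as above, of resuming training from one of the saved checkpoints together with its replay buffer (DDPG.load, load_replay_buffer and the reset_num_timesteps flag are standard stable-baselines3 APIs):

# resume from an earlier checkpoint; paths follow the naming scheme used above
resume_steps = training_timesteps           # e.g. the first saved iteration
model = DDPG.load("./models/ddpg" + str(resume_steps), env=env)
model.load_replay_buffer("./experiences/ddpg_experience" + str(resume_steps))
model.learn(total_timesteps=training_timesteps, reset_num_timesteps=False)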
Example #11
    # HyperParameters
    lr = 3e-4

    model_name = "DDGP_2.pt"
    model_path = "./Model/" + model_name

    # Instantiate Model
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=.75 * np.ones(n_actions))
    model = DDPG(MlpPolicy, env, action_noise=action_noise, verbose=1)

    # Train OR Load Model
    model.learn(total_timesteps=25000)

    model.save(model_path)

    mean_reward, std_reward = evaluate_policy(model,
                                              model.get_env(),
                                              n_eval_episodes=10)

    print("Mean Reward = ", mean_reward)

    print(env.soc_list)

    epsi_sp_list = []
    action_list = []
    soc_list = []
    Concentration_list = []
    Concentration_list1 = []
Example #12
             action_noise=action_noise,
             verbose=1,
             tensorboard_log="./ddpg_pendulum_tensorboard/")

print("start model evaluation without learning !")
mean_reward_before, std_reward_before = evaluate_policy(model,
                                                        env,
                                                        n_eval_episodes=100)
print("end model evaluation !")

print("start model learning !")
model.learn(total_timesteps=10000, log_interval=10)
print("end model learning !")

print("-> model saved !!")
model.save("ddpg_pendulum")

print("start model evaluation with learning !")
mean_reward_after, std_reward_after = evaluate_policy(model,
                                                      env,
                                                      n_eval_episodes=100)
print("end model evaluation !")

print("-> model evaluation without learning")
print(
    f"mean_reward:{mean_reward_before:.2f} +/- std_reward:{std_reward_before:.2f}"
)

print("-> model evaluation with learning")
print(
    f"mean_reward:{mean_reward_after:.2f} +/- std_reward:{std_reward_after:.2f}"
Example #13
import kukakr5Arc

env = gym.make('kukakr5Arc-v1')

# the noise objects for DDPG
n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=float(0.5) *
                                            np.ones(n_actions))

# model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise)
model = DDPG(MlpPolicy, env, verbose=1, action_noise=action_noise)
model.learn(total_timesteps=400000)
model.save(
    "/home/nightmareforev/git/bullet_stuff/multi_kuka_sim/kukakr5Arc/envs/saved_policies/kukakr5Arc_reacher"
)
print('Saving model.... Model saved')

del model  # remove to demonstrate saving and loading

model = DDPG.load(
    "/home/nightmareforev/git/bullet_stuff/multi_kuka_sim/kukakr5Arc/envs/saved_policies/kukakr5Arc_reacher",
    env=env)
print('Loading model.....Model loaded')

# env.render() must be called before env.reset() for rendering to work
#env.render()

#obs = env.reset()
#while True:
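
The commented-out rollout at the end of this example is truncated; a minimal sketch of the usual stable-baselines3 prediction loop it hints at (render called before reset, as the comment above notes):

# illustrative rollout loop, not part of the original excerpt
env.render()
obs = env.reset()
while True:
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()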
Example #14
    env_id = 'gym_spm:spm-v0'
    num_cpu = 4  # Number of processes to use

    env = gym.make('gym_spm:spm-v0')
    # env = make_vec_env(env_id, n_envs=1, seed=0)
    # env = VecCheckNan(env, raise_exception=True)
    # env = check_env(env)

    # The noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=25.67 * np.ones(n_actions))

    # model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=1, tensorboard_log="./TD3_spm_v2_SOC_point5_two_state/")
    model = DDPG(MlpPolicy, env, action_noise=action_noise, verbose=1, tensorboard_log="./DDPG_spm_v2_SOC_point5_two_state/")
    model.learn(total_timesteps=25000, tb_log_name='DDPG_test_run_3_SOCpoint5_two_state')
    model.save('DDPG_test_3_SOC_point5_two_states')


    model = DDPG.load('DDPG_test_2_SOC_point5_two_states', env=env)  # DDPG.load returns a new model; the result must be assigned
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    
    print("Mean Reward = ", mean_reward)
    
    epsi_sp_list = []
    action_list = []
    soc_list = []
    Concentration_list = []
    Concentration_list1 = []
    
    obs = env.reset()
    for _ in range(3600):
Example #15
    model_dir_description = model_dir + details

    # Instantiate Model
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=25.67 * np.ones(n_actions))
    model = DDPG(MlpPolicy,
                 env,
                 action_noise=action_noise,
                 verbose=1,
                 tensorboard_log=log_dir)

    # Train OR Load Model
    if train_model:
        model.learn(total_timesteps=25000, tb_log_name=details)
        model.save(model_dir_description)
    else:
        model = DDPG.load(model_dir_description, env=env)  # DDPG.load returns a new model; the result must be assigned

    mean_reward, std_reward = evaluate_policy(model,
                                              model.get_env(),
                                              n_eval_episodes=10)

    print("Mean Reward = ", mean_reward)

    epsi_sp_list = []
    action_list = []
    soc_list = []
    Concentration_list = []
    Concentration_list1 = []
    env_id = 'gym_spm:spm-v0'
    num_cpu = 4  # Number of processes to use

    train = True

    env = gym.make('gym_spm:spm-v0')

    # The noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=25.67 * np.ones(n_actions))
    model = DDPG(MlpPolicy, env, action_noise=action_noise, verbose=1, tensorboard_log="./DDPG_spm_v2_SOC_point5_two_state/")
    # model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=1, tensorboard_log="./TD3_spm_v2_SOC_point5_two_state/")

    if train:
        model.learn(total_timesteps=2500000, tb_log_name='test_run_3_SOCpoint5_two_state')
        model.save('TD3_test_3_SOC_point5_two_states')
    else:
        model = DDPG.load('TD3_test_2_SOC_point5_two_states', env=env)  # DDPG.load returns a new model; the result must be assigned

    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    
    print("Mean Reward = ", mean_reward)
    
    epsi_sp_list = []
    action_list = []
    soc_list = []
    Concentration_list = []
    Concentration_list1 = []
    
    obs = env.reset()
    for _ in range(3600):