Example #1
def main(output_folder_path: Path):
    # Set gym-carla environment
    agent_config = AgentConfig.parse_file(Path("configurations/agent_configuration.json"))
    carla_config = CarlaConfig.parse_file(Path("configurations/carla_configuration.json"))

    params = {
        "agent_config": agent_config,
        "carla_config": carla_config,
        "ego_agent_class": RLPIDAgent,
        "max_collision": 5
    }

    env = gym.make('roar-pid-v0', params=params)
    env.reset()

    model_params: dict = {
        "verbose": 1,
        "render": True,
        "tensorboard_log": (output_folder_path / "tensorboard").as_posix()
    }
    latest_model_path = find_latest_model(output_folder_path)
    if latest_model_path is None:
        model = DDPG(LnMlpPolicy, env=env, **model_params)  # full tensorboard log can take up space quickly
    else:
        model = DDPG.load(latest_model_path, env=env, **model_params)
        model.render = True
        model.tensorboard_log = (output_folder_path / "tensorboard").as_posix()

    logging_callback = LoggingCallback(model=model)
    checkpoint_callback = CheckpointCallback(save_freq=1000, verbose=2, save_path=(output_folder_path / "checkpoints").as_posix())
    event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    callbacks = CallbackList([checkpoint_callback, event_callback, logging_callback])
    model = model.learn(total_timesteps=int(1e10), callback=callbacks, reset_num_timesteps=False)
    model.save(f"pid_ddpg_{datetime.now()}")
Example #2
def test_callbacks(model_class):

    env_id = 'Pendulum-v0'
    if model_class in [ACER, DQN]:
        env_id = 'CartPole-v1'

    allowed_failures = []
    # The number of training timesteps is kept short;
    # otherwise, the training would take too long or would require
    # custom parameters per algorithm
    if model_class in [PPO1, DQN, TRPO]:
        allowed_failures = ['rollout_end']

    # Create RL model
    model = model_class('MlpPolicy', env_id)

    checkpoint_callback = CheckpointCallback(save_freq=500,
                                             save_path=LOG_FOLDER)

    # For testing: use the same training env
    eval_env = model.get_env()
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200,
                                                     verbose=1)

    eval_callback = EvalCallback(eval_env,
                                 callback_on_new_best=callback_on_best,
                                 best_model_save_path=LOG_FOLDER,
                                 log_path=LOG_FOLDER,
                                 eval_freq=100)

    # Equivalent to the `checkpoint_callback`
    # but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1,
                                             save_path=LOG_FOLDER,
                                             name_prefix='event')
    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    callback = CallbackList(
        [checkpoint_callback, eval_callback, event_callback])

    model.learn(500, callback=callback)
    model.learn(200, callback=None)
    custom_callback = CustomCallback()
    model.learn(200, callback=custom_callback)
    # Check that every callback method was executed
    custom_callback.validate(allowed_failures=allowed_failures)
    # Transform callback into a callback list automatically
    custom_callback = CustomCallback()
    model.learn(500,
                callback=[checkpoint_callback, eval_callback, custom_callback])
    # Check that every callback method was executed
    custom_callback.validate(allowed_failures=allowed_failures)

    # Automatic wrapping, old way of doing callbacks
    model.learn(200, callback=lambda _locals, _globals: True)

    # Cleanup
    if os.path.exists(LOG_FOLDER):
        shutil.rmtree(LOG_FOLDER)
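
The test above exercises a CustomCallback helper that is not shown in this listing. A minimal sketch of what such a callback could look like, assuming the Stable Baselines 2.x BaseCallback hooks; the counter/validate logic below is illustrative rather than the original test helper:

from stable_baselines.common.callbacks import BaseCallback


class CustomCallback(BaseCallback):
    """Counts how often each callback hook fires so a test can verify them."""

    def __init__(self, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.counts = {'training_start': 0, 'rollout_start': 0,
                       'step': 0, 'rollout_end': 0, 'training_end': 0}

    def _on_training_start(self):
        self.counts['training_start'] += 1

    def _on_rollout_start(self):
        self.counts['rollout_start'] += 1

    def _on_step(self):
        self.counts['step'] += 1
        return True  # returning False would stop training early

    def _on_rollout_end(self):
        self.counts['rollout_end'] += 1

    def _on_training_end(self):
        self.counts['training_end'] += 1

    def validate(self, allowed_failures=()):
        # Every hook should have fired at least once, unless explicitly allowed to fail.
        for name, count in self.counts.items():
            assert count > 0 or name in allowed_failures, name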
Example #3
    def train(self, tensorboard_log: str) -> None:

        try:
            self.load_model(tensorboard_log=tensorboard_log)

        except Exception:  # no saved model found; create a new one
            self.create_model(tensorboard_log=tensorboard_log)

        # Stop training if reward gets close to zero
        callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-0.1,
                                                         verbose=1)
        eval_callback = EvalCallback(self.env,
                                     callback_on_new_best=callback_on_best,
                                     verbose=1)

        # Save model at regular time intervals
        checkpoint_callback = CheckpointCallback(
            save_freq=1000, save_path='./model_checkpoints/')

        # Chain callbacks together
        callback = CallbackList([eval_callback, checkpoint_callback])

        # Train model
        self.model.learn(total_timesteps=int(1e10),
                         callback=callback,
                         tb_log_name="run")

        # Training finished (checkpoints were saved by the callbacks)
        print("Training is finished!")
Example #4
def train(env_name,
          num_time_steps,
          policy_kwargs,
          eval_ep,
          eval_freq,
          ckpt_freq,
          load_model=None):
    env = gym.make(env_name)
    env_ = gym.make(env_name)
    rank = MPI.COMM_WORLD.Get_rank()
    today = date.today()
    today = str(today).replace('-', '_')
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    model_name = env_name + '_PPO1_' + today + current_time
    Path('./run/' + model_name).mkdir(parents=True, exist_ok=True)
    path = os.path.join(os.path.dirname(__file__), './run/' + model_name)

    ############################
    #         callback         #
    ############################
    callbacklist = []
    eval_callback = EvalCallback_wandb(env_,
                                       n_eval_episodes=eval_ep,
                                       eval_freq=eval_freq,
                                       log_path=path)
    ckpt_callback = CheckpointCallback(save_freq=ckpt_freq,
                                       save_path='./run/' + model_name +
                                       '/ckpt',
                                       name_prefix='')
    callbacklist.append(eval_callback)
    callbacklist.append(ckpt_callback)
    callback = CallbackList(callbacklist)

    if load_model:
        model = PPO1.load(env=env, load_path=load_model)
    else:
        model = PPO1(MlpPolicy, env, verbose=1, policy_kwargs=policy_kwargs)

    ############################
    #          Logging         #
    ############################
    if rank == 0:
        logger.configure(path)
        config = {}
        config['load'] = [{'load_model': load_model}]
        config['eval'] = [{'eval_freq': eval_freq, 'eval_ep': eval_ep}]
        config['ckpt'] = [{'ckpt_freq': ckpt_freq}]
        config['policy'] = [{'policy_network': policy_kwargs}]
        with open('./run/' + model_name + '/' + model_name + '.txt',
                  'w+') as outfile:
            json.dump(config, outfile, indent=4)
    else:
        logger.configure(path, format_strs=[])
    ############################
    #            run           #
    ############################

    model.learn(total_timesteps=int(num_time_steps), callback=callback)
    model.save(path + '/finish')
Example #5
def main(
  training_env: PSMCartesianHERDDPGEnv,
  eval_env: PSMCartesianHERDDPGEnv = None,
  log_dir='./.logs/results'
):

  os.makedirs(log_dir, exist_ok=True)

  # training_env = Monitor(training_env, log_dir)

  n_actions = training_env.action_space.shape[0]
  noise_std = 0.2
  # Currently using OU noise
  action_noise = OrnsteinUhlenbeckActionNoise(
    mean=np.zeros(n_actions),
    sigma=noise_std * np.ones(n_actions)
  )
  model_class = DDPG  # works also with SAC, DDPG and TD3

  rl_model_kwargs = {
    'actor_lr': 1e-3,
    'critic_lr': 1e-3,
    'action_noise': action_noise,
    'nb_train_steps': 300,
    'nb_rollout_steps': 100,
    'gamma': 0.95,
    'observation_range': (-1.5,
                          1.5),
    'random_exploration': 0.05,
    'normalize_observations': True,
    'critic_l2_reg': 0.01
  }

  # Available strategies (cf paper): future, final, episode, random
  model = HER(
    'MlpPolicy',
    training_env,
    model_class,
    verbose=1,
    n_sampled_goal=4,
    goal_selection_strategy='future',
    buffer_size=int(1e5),
    batch_size=128,
    tensorboard_log="./ddpg_dvrk_tensorboard/",
    **rl_model_kwargs
  )
  # Reset the environment
  training_env.reset()
  # Create callbacks
  checkpoint_callback = CheckpointCallback(
    save_freq=100000,
    save_path="./ddpg_dvrk_tensorboard/"
  )  # save_path="./.model/model_checkpoint/") #save_freq=100000
  # eval_callback = EvalCallback(training_env, best_model_save_path='./ddpg_dvrk_tensorboard/best_model',
  #                             log_path=log_dir, eval_freq=500)
  callback = CallbackList([checkpoint_callback])  # , eval_callback])
  # Train the model
  model.learn(4000000, log_interval=100, callback=callback)
  model.save("./her_robot_env")
Example #6
def train(env_name,
          num_time_steps,
          policy_kwargs,
          eval_ep,
          eval_freq,
          ckpt_freq,
          load_model=None):
    env = gym.make(env_name)
    # env.render()
    env_ = gym.make(env_name)

    today = date.today()
    today = str(today).replace('-', '_')
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    model_name = env_name + '_SAC_' + today + current_time
    Path('./run/' + model_name).mkdir(parents=True, exist_ok=True)
    path = os.path.join(os.path.dirname(__file__), './run/' + model_name)
    env = Monitor(env, filename=path)
    ############################
    #          Logging         #
    ############################
    logger.configure(path)
    config = {}
    config['load'] = [{'load_model': load_model}]
    config['eval'] = [{'eval_freq': eval_freq, 'eval_ep': eval_ep}]
    config['ckpt'] = [{'ckpt_freq': ckpt_freq}]
    config['policy'] = [{'policy_network': policy_kwargs}]
    with open('./run/' + model_name + '/' + model_name + '.txt',
              'w+') as outfile:
        json.dump(config, outfile, indent=4)

    ############################
    #         callback         #
    ############################
    callbacklist = []
    ckpt_callback = CheckpointCallback(save_freq=ckpt_freq,
                                       save_path='./run/' + model_name +
                                       '/ckpt',
                                       name_prefix='')
    eval_callback = EvalCallback_wandb_SAC(env_,
                                           n_eval_episodes=eval_ep,
                                           eval_freq=eval_freq,
                                           log_path=path)
    callbacklist.append(ckpt_callback)
    callbacklist.append(eval_callback)
    callback = CallbackList(callbacklist)

    ############################
    #            run           #
    ############################
    # policy_kwargs = dict(net_arch=[128, dict(vf=[256], pi=[16])])
    model = SAC(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=int(num_time_steps),
                log_interval=20,
                callback=callback)
    model.save(path + "SAC_Walker2d")
Example #7
 def train(self,
           timesteps: int,
           callbacks: Sequence[BaseCallback] = None,
           num_checkpoints=4) -> None:
     callbacks = [] if callbacks is None else callbacks
     cb = CheckpointCallback(save_freq=timesteps // num_checkpoints,
                             save_path=self._dirs.models,
                             name_prefix=self._dirs.prefix)
     self._model.learn(total_timesteps=timesteps,
                       callback=CallbackList([cb, *callbacks]))
Example #8
 def train(self,
           timesteps: int,
           num_checkpoints=4,
           callbacks: Sequence[BaseCallback] = None):
     ppo_offset = 128
     callbacks = [] if callbacks is None else callbacks
     cb = CheckpointCallback(save_freq=timesteps // num_checkpoints,
                             save_path=self._dirs.models,
                             name_prefix=self._dirs.prefix)
     self._model.learn(total_timesteps=timesteps + ppo_offset,
                       callback=CallbackList([cb, *callbacks]),
                       log_interval=100)
Example #9
def main(output_folder_path: Path):
    # Set gym-carla environment
    agent_config = AgentConfig.parse_file(
        Path("configurations/agent_configuration.json"))
    carla_config = CarlaConfig.parse_file(
        Path("configurations/carla_configuration.json"))

    params = {
        "agent_config": agent_config,
        "carla_config": carla_config,
        "ego_agent_class": RLLocalPlannerAgent,
        "max_collision": 5,
    }

    env = gym.make('roar-local-planner-v0', params=params)
    env.reset()

    model_params: dict = {
        "verbose": 1,
        "render": True,
        "env": env,
        "n_cpu_tf_sess": None,
        "buffer_size": 1000,
        "nb_train_steps": 50,
        "nb_rollout_steps": 100,
        # "nb_eval_steps": 50,
        "batch_size": 32,
    }
    latest_model_path = find_latest_model(Path(output_folder_path))
    if latest_model_path is None:
        model = DDPG(CnnPolicy, **model_params)
    else:
        model = DDPG.load(latest_model_path, **model_params)
    tensorboard_dir = (output_folder_path / "tensorboard")
    ckpt_dir = (output_folder_path / "checkpoints")
    tensorboard_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    model.tensorboard_log = tensorboard_dir.as_posix()
    model.render = True
    logging_callback = LoggingCallback(model=model)
    checkpoint_callback = CheckpointCallback(save_freq=1000,
                                             verbose=2,
                                             save_path=ckpt_dir.as_posix())
    event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    callbacks = CallbackList(
        [checkpoint_callback, event_callback, logging_callback])
    model = model.learn(total_timesteps=int(1e10),
                        callback=callbacks,
                        reset_num_timesteps=False)
    model.save(f"local_planner_ddpg_{datetime.now()}")
Example #10
def main(logdir):
    # params
    SLEEP_RATE = 100  #100Hz
    N_EPISODE = 1000
    EPISODE_TIME = 30
    EPISODE_LENGTH = SLEEP_RATE * EPISODE_TIME
    TOTAL_TIMESTEPS = EPISODE_LENGTH * N_EPISODE

    # logdir
    logdir = '/home/yliu2/rl_log/sac_mpc/ALT/3act/2'
    checkpoint_path = os.path.join(logdir, 'checkpoint')
    callback_path = logdir
    final_model_path = logdir + '/final_model'

    # env
    env = BlimpEnv(SLEEP_RATE)
    env = Monitor(env, logdir)
    # env = make_vec_env(lambda: env, n_envs=1, monitor_dir=logdir)
    print("Observation space:", env.observation_space)
    print("Shape:", env.observation_space.shape)
    print("Action space:", env.action_space)
    #
    # # callback
    SAVE_FREQ = EPISODE_LENGTH * 20  # every 20 episodes
    checkpoint_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                             save_path=checkpoint_path,
                                             name_prefix='sac_callback_model')
    save_on_best_training_reward_callback = SaveOnBestTrainingRewardCallback(
        check_freq=SAVE_FREQ, log_dir=callback_path)
    callback = CallbackList(
        [checkpoint_callback, save_on_best_training_reward_callback])

    # training got killed for some reason, so continue from the checkpoint
    model_path = '/home/yliu2/rl_log/sac_mpc/ALT/3act/2/best_model.zip'
    model = SAC.load(model_path)
    model.set_env(env)

    print("---------- Start Learing -----------")
    model.learn(total_timesteps=TOTAL_TIMESTEPS,
                log_interval=SAVE_FREQ,
                callback=callback)

    print("---------- Finish Learning ----------")
    model.save(final_model_path)
    del model  # remove to demonstrate saving and loading
    model = SAC.load(final_model_path)

    results_plotter.plot_results([logdir], TOTAL_TIMESTEPS,
                                 results_plotter.X_TIMESTEPS, "SAC BLIMP")
    plt.show()
Example #11
def main(output_folder_path: Path):
    # Set gym-carla environment
    agent_config = AgentConfig.parse_file(
        Path("configurations/agent_configuration.json"))
    carla_config = CarlaConfig.parse_file(
        Path("configurations/carla_configuration.json"))

    params = {
        "agent_config": agent_config,
        "carla_config": carla_config,
        "ego_agent_class": RLLocalPlannerAgent,
        "max_collision": 5,
    }

    env = gym.make('roar-local-planner-v1', params=params)
    env.reset()

    tensorboard_dir, ckpt_dir = prep_dir(output_folder_path)
    model_params: dict = {
        "verbose": 1,
        "render": True,
        "env": env,
        "n_cpu_tf_sess": 2,
        "buffer_size": 10,
        "random_exploration": 0.1,
        "tensorboard_log": tensorboard_dir.as_posix(),
    }
    latest_model_path = find_latest_model(Path(output_folder_path))
    if latest_model_path is None:
        model = DDPG(
            LnMlpPolicy,
            **model_params)  # full tensorboard log can take up space quickly
    else:
        model = DDPG.load(latest_model_path, **model_params)

    logging_callback = LoggingCallback(model=model)
    checkpoint_callback = CheckpointCallback(save_freq=1000,
                                             verbose=2,
                                             save_path=ckpt_dir.as_posix())
    event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    callbacks = CallbackList(
        [checkpoint_callback, event_callback, logging_callback])
    model = model.learn(total_timesteps=int(1e10),
                        callback=callbacks,
                        reset_num_timesteps=False)
    model.save(f"local_planner_v1_ddpg_{datetime.now()}")
Example #12
    def train(self):

        # Load latest model if available
        try:
            path = os.getcwd()
            os.chdir(os.getcwd() + '/model_checkpoints')
            files = [x for x in os.listdir() if x.endswith(".zip")]
            num = []
            for file in files:
                num.append([int(x) for x in file.split('_') if x.isdigit()][0])
            filename = "rl_model_" + str(max(num)) + "_steps.zip"
            print("Tentative: " + filename)
            self.model = PPO2.load(load_path=filename, env=DummyVecEnv([lambda: self.env]), tensorboard_log='./a2c_rasp_tensorboard/')
            print("Successfully loaded the previous model: " + filename)
            os.chdir(path)
        except Exception:  # no checkpoint found; create a new model
            # Vector-encode our new environment
            env = DummyVecEnv([lambda: self.env])
            # Create new model
            self.model = PPO2('MlpPolicy', env, verbose=1, tensorboard_log='./a2c_rasp_tensorboard/')
            print("Successfully created new model")

        # Stop training if the reward gets close to zero
        callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1e-2, verbose=1)
        eval_callback = EvalCallback(self.env, callback_on_new_best=callback_on_best, verbose=1)

        # Save model at regular time intervals
        checkpoint_callback = CheckpointCallback(save_freq=2000, save_path='./model_checkpoints/')

        # Chain callbacks together
        callback = CallbackList([eval_callback, checkpoint_callback])

        # Train model
        episode = 1
        while episode < 10:
            # Update location of red dot
            _ = self.env.square
            if self.env.trainable:
                print("Beginning episode number {}".format(episode))
                self.model.learn(total_timesteps=int(1e10), callback=callback, tb_log_name="run")
                episode += 1

        # Save trained model
        self.model.save("raspberry_agent")
Example #13
def setup(model_params, output_folder_path):
    latest_model_path = find_latest_model(Path(output_folder_path))
    if latest_model_path is None:
        print("Creating model...")
        model = DDPG(CnnPolicy, **model_params)
    else:
        print("Loading model...")
        model = DDPG.load(latest_model_path, **model_params)
    tensorboard_dir = (output_folder_path / "tensorboard")
    ckpt_dir = (output_folder_path / "checkpoints")
    tensorboard_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    checkpoint_callback = CheckpointCallback(save_freq=200,
                                             verbose=2,
                                             save_path=ckpt_dir.as_posix())
    # event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    logging_callback = LoggingCallback(model=model, verbose=1)
    callbacks = CallbackList([checkpoint_callback, logging_callback])
    return model, callbacks
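
A sketch of how setup() might be driven, mirroring the learn/save pattern used by the other DDPG examples in this listing; model_params and output_folder_path are assumed to be built as in those examples:

# Hypothetical driver for setup(); the surrounding script is not shown.
model, callbacks = setup(model_params, output_folder_path)
model = model.learn(total_timesteps=int(1e10),
                    callback=callbacks,
                    reset_num_timesteps=False)
model.save(f"ddpg_{datetime.now()}")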
Example #14
def main():
    agent_data = pd.read_csv('../output_EURUSD_M1_/agentData.csv')
    agent_data = agent_data.drop(agent_data.columns[0], axis=1)
    agent_data = agent_data.astype('float32')

    env = SubprocVecEnv([lambda: ForexTradingEnv(agent_data)] * 10, )
    #env = DummyVecEnv([lambda: ForexTradingEnv(agent_data)], )

    #    model = DQN(CustomDQNPolicy, env, gamma=0.95, verbose=1, tensorboard_log = "./tensorboard", entcoeff=0.005, adam_epsilon = 1e-6)

    import tensorflow as tf
    from TenorboardCallbacks import TensorboardCallback
    checkpoint_callback = CheckpointCallback(save_freq=1000000,
                                             save_path='./models/',
                                             name_prefix='ppo2')

    for curr in [1]:
        model = PPO2(PPO2Policy_Basic,
                     env,
                     verbose=1,
                     tensorboard_log="./tensorboard",
                     vf_coef=1e-7,
                     ent_coef=1e-4,
                     n_steps=512,
                     gamma=0.99)
        #model = PPO2.load("5_days_model/ppo2_999000000_steps.zip", policy=PPO2Policy_Basic, env = env,verbose=1, tensorboard_log = "./tensorboard")

        model.learn(total_timesteps=10000000000,
                    log_interval=10000000,
                    callback=CallbackList(
                        [TensorboardCallback(env), checkpoint_callback]))
        model.save(model_fileName)

    obs = env.reset()
    for i in range(2000000):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        if i % 1 == 0:
            env.render()
        if done:
            break
Example #15
def learn(env_name, seed, load_path, save_path, tensorboard_log, total_timesteps, n_cpu):
    save_path = env_name if save_path is None else save_path
    checkpoint_callback = CheckpointCallback(save_freq=2000, save_path=save_path)
    eval_env = make_env(env_name, n_cpu, seed)()
    eval_callback = EvalCallback(eval_env, best_model_save_path=save_path+'/best', log_path=tensorboard_log, eval_freq=1000)
    callback = CallbackList([checkpoint_callback, eval_callback])

    policy = CnnPolicy
    # policy = CnnLstmPolicy
    # policy = CnnLnLstmPolicy
    print(env_name, policy)
    # Run this to enable SubprocVecEnv on Mac OS X.
    # export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
    # see https://github.com/rtomayko/shotgun/issues/69#issuecomment-338401331
    env = SubprocVecEnv([make_env(env_name, i, seed) for i in range(n_cpu)])
    if load_path is not None:
        model = PPO2.load(load_path, env, verbose=1, tensorboard_log=tensorboard_log)
    else:
        model = PPO2(policy, env, verbose=1, tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=total_timesteps, log_interval=5, callback=callback)
    print('saving model:', save_path+'/latest_model')
    model.save(save_path+'/latest_model')
    env.close()
Example #16
def get_train_callback(eval_env,
                       seed,
                       log_dir,
                       save_f=10000,
                       eval_f=50000,
                       eval_ep=1000):
    checkpoint_callback = CheckpointCallback(save_freq=save_f,
                                             save_path=log_dir)

    # Separate evaluation env
    eval_callback = EvalTensorboardCallback(
        eval_env,
        best_model_save_path=os.path.join(log_dir, 'best_model'),
        log_path=os.path.join(log_dir, 'evaluation_results'),
        eval_freq=eval_f,
        n_eval_episodes=eval_ep,
        deterministic=True,
        render=False,
        seed=seed)

    # Create the callback list
    callback = CallbackList([checkpoint_callback, eval_callback])

    return callback
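
A possible call site for get_train_callback(); the environment id, algorithm and timestep budget below are placeholders, not part of the original snippet:

import gym
from stable_baselines import PPO2

# Hypothetical usage: build the callback list, then hand it to learn().
eval_env = gym.make("CartPole-v1")
model = PPO2("MlpPolicy", gym.make("CartPole-v1"))
callback = get_train_callback(eval_env, seed=0, log_dir="./logs")
model.learn(total_timesteps=100000, callback=callback)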
Example #17
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = environment(x,y,z, gamma)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init


#points_values=list([[0,LR1],[1000000,LR2]])

#Sched=PiecewiseSchedule(points_values, outside_value=LR2)

if __name__ == '__main__':

    num_cpu = 1  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(x,y,z, i) for i in range(num_cpu)])
    eval_env=environment(x,y,z, gamma)
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)
    scenario=str(f'RG_t{test}_lr{LR}_gamma{gamma}_batch{batch_size}')    
    callbacklist=CallbackList([TimeLimit(episodetimesteps), EvalCallback(eval_env, log_path=scenario, deterministic=False)])
    

        
    model = A2C(CnnPolicy, env, gamma=gamma, verbose=1)#, tensorboard_log=scenario)
    model.learn(total_timesteps=episodetimesteps**99, callback=callbacklist)
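
Example #17 (like the similarly truncated snippets #19 and #26 further down) is cut off at the top: only the _init closure and the __main__ block survived. Based on the make_env(x, y, z, i) calls and the standard Stable Baselines multiprocessing recipe, the missing factory probably looked roughly like the sketch below; the seed default and the docstring are assumptions:

from stable_baselines.common import set_global_seeds

def make_env(x, y, z, rank, seed=0):
    """Return a thunk that builds one seeded copy of the custom environment.

    :param rank: (int) index of the subprocess
    """
    def _init():
        env = environment(x, y, z, gamma)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init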
Example #18
def train_initial_policy(
        model_name,
        algo=ALGO,
        env_name=ENV_NAME,
        time_steps=TIME_STEPS):
    """Uses the specified algorithm on the target environment"""
    print("Using algorithm : ", algo.__name__)
    print("Model saved as : ", "data/models/" +algo.__name__+"_initial_policy_"+env_name+"_.pkl")

    # define the environment here
    env = gym.make(env_name)
    env.seed(SEED)
    if NOISE_VALUE>0 : env = NoisyRealEnv(env, noise_value=NOISE_VALUE)

    if MUJOCO_NORMALIZE:
        env = MujocoNormalized(env)

    print('~~ ENV Obs RANGE : ', env.observation_space.low, env.observation_space.high)
    print('~~~ ENV Action RANGE : ', env.action_space.low, env.action_space.high)

    if algo.__name__  == "ACKTR":
        print('Using SubprocVecEnv')
        env = SubprocVecEnv([lambda: env for i in range(8)])
    elif algo.__name__ == "SAC":
        print('Using standard gym environment')
        env = env
    else:
        print('Using Dummy Vec Env')
        env = DummyVecEnv([lambda : env])

    if NORMALIZE :
        env = VecNormalize(env,
                           training=True,
                           norm_obs=True,
                           norm_reward=False,
                           clip_reward=1e6,
                           )


    with open('data/target_policy_params.yaml') as file:
        args = yaml.load(file, Loader=yaml.FullLoader)
    args = args[algo.__name__][PARAMS_ENV]
    print('~~ Loaded args file ~~')

    if algo.__name__ == "SAC":
        print('Initializing SAC with RLBaselinesZoo hyperparameters .. ')
        print('using 256 node architecture as in the paper')

        class CustomPolicy(ffp_sac):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp", layers=[256, 256])

        model = SAC(CustomPolicy, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )
    elif algo.__name__ == "TD3":
        print('Initializing TD3 with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/td3/HopperBulletEnv-v0/config.yml
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=float(args['noise_std']) * np.ones(n_actions))
        class CustomPolicy2(ffp_td3):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy2, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp", layers=[400, 300])
        model = TD3(CustomPolicy2, env,
                    verbose = 1,
                    tensorboard_log = 'data/TBlogs/initial_policy_training',
                    batch_size = args['batch_size'],
                    buffer_size = args['buffer_size'],
                    gamma = args['gamma'],
                    gradient_steps = args['gradient_steps'],
                    learning_rate = args['learning_rate'],
                    learning_starts = args['learning_starts'],
                    action_noise = action_noise,
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    elif algo.__name__ == "TRPO":
        print('Initializing TRPO with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/sac/HopperBulletEnv-v0/config.yml
        model = TRPO(mlp_standard, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    timesteps_per_batch=args['timesteps_per_batch'],
                    lam=args['lam'],
                    max_kl=args['max_kl'],
                    gamma=args['gamma'],
                    vf_iters=args['vf_iters'],
                    vf_stepsize=args['vf_stepsize'],
                    entcoeff=args['entcoeff'],
                    cg_damping=args['cg_damping'],
                    cg_iters=args['cg_iters'],
                     seed=SEED,
                    )

    elif algo.__name__ == "ACKTR":
        print('Initializing ACKTR')
        model = ACKTR(mlp_standard,
                      env,
                      verbose=1,
                      n_steps=128,
                      ent_coef=0.01,
                      lr_schedule='constant',
                      learning_rate=0.0217,
                      max_grad_norm=0.5,
                      gamma=0.99,
                      vf_coef=0.946,
                      seed=SEED)

    elif algo.__name__ == "PPO2":
        print('Initializing PPO2')
        print('Num envs : ', env.num_envs)
        model = PPO2(mlp_standard,
                     env,
                     n_steps=int(args['n_steps']/env.num_envs),
                     nminibatches=args['nminibatches'],
                     lam=args['lam'],
                     gamma=args['gamma'],
                     ent_coef=args['ent_coef'],
                     noptepochs=args['noptepochs'],
                     learning_rate=args['learning_rate'],
                     cliprange=args['cliprange'],
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     seed=SEED,
                     )

    else:
        print('No algorithm matched. Using SAC .. ')
        model = SAC(CustomPolicy, env,
                    verbose=1,
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    # change model name if using normalization
    if NORMALIZE:
        model_name = model_name.replace('.pkl', 'normalized_.pkl')

    elif MUJOCO_NORMALIZE:
        model_name = model_name.replace('.pkl', 'mujoco_norm_.pkl')

    if SAVE_BEST_FOR_20:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name,
                    log_interval=10,
                    callback=eval_callback)
        save_the_model()
        model_name = model_name.replace('best_', '')
        model.save(model_name)
    elif SAVE_INTERMEDIATE:
        check_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                            save_path=model_name[:-4],
                                            name_prefix=ENV_NAME + '_' + str(SEED),
                                            verbose=1,
                                            )
        eval_env = DummyVecEnv([lambda: gym.make(ENV_NAME)])
        eval_env.seed(SEED)
        eval_callback = EvalCallback(eval_env,
                                     n_eval_episodes=10,
                                     eval_freq=SAVE_FREQ,
                                     log_path=model_name[:-4],
                                     deterministic=False,
                                     render=False,
                                     verbose=1)

        callbacks = CallbackList([check_callback, eval_callback])
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,
                    callback=callbacks)
        model.save(model_name)
        npzfile = np.load(model_name[:-4] + '/evaluations.npz')
        average_rewards = np.mean(npzfile['results'], axis=1)[:, 0]
        with open(model_name[:-4] + "/eval_results.txt", "a") as f:
            for i in range(np.shape(average_rewards)[0]):
                f.write("{}, {}\n".format(npzfile['timesteps'][i], average_rewards[i]))
        evaluate_policy_on_env(env, model, render=False, iters=50)
    else:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,)
        model.save(model_name)
        evaluate_policy_on_env(env, model, render=False, iters=50)

    # save the environment params
    if NORMALIZE:
        # env.save(model_name.replace('.pkl', 'stats_.pkl'))
        env.save('data/models/env_stats/'+env_name+'.pkl')

    print('done :: ', model_name)
    exit()
Example #19
    return _init


if __name__ == '__main__':

    num_cpu = 15  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(x,y,z, i) for i in range(num_cpu)])
    eval_env=environment(x, y, z, gamma, cutoffpenaltyscalar, rg_prob, turnspc, savepath)
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    
    #create callbacks to record data, initiate events during training.
    callbacklist = CallbackList([
        TimeLimit(episodetimesteps),
        EvalCallback(eval_env,
                     log_path=savepath,
                     n_eval_episodes=5,
                     deterministic=False,
                     best_model_save_path=savepath)
    ])
    

    #create model with Stable Baselines package.
    model = A2C(CnnPolicy, env, gamma=gamma, n_steps=updatesteps, learning_rate=LR,  verbose=1)#, tensorboard_log=scenario)
    model.learn(total_timesteps=episodetimesteps**50, callback=callbacklist)  # total timesteps set to a very large number so the program terminates based on the runtime parameter
    
    
    #create learning curve plot
    evaluations= './%s/%s/evaluations.npz' % (storagefolder,scenario)
    data=np.load(evaluations)
    results=data['results']
    y=np.average(results, axis=1)
    timesteps=data['timesteps']
    plt.plot(timesteps,y)
    
Example #20
parser.add_argument(
    "--seeds",
    nargs="+",
    type=int,
    default=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
    help="Seeds for evaluation",
)
parser.add_argument("--fd_port", type=int, default=55555)
args = parser.parse_args()

for b in args.benchmarks:
    for s in args.seeds:
        logger = Logger(experiment_name=f"PPO_{b}_s{s}",
                        output_path=Path(args.outdir))
        perf_logger = logger.add_module(PerformanceTrackingWrapper)
        config = {"seed": s, "logger": perf_logger, "benchmark": b}
        if b == "FastDownwardBenchmark":
            config["port"] = args.fd_port
        env = make_benchmark(config)
        model = PPO2("MlpPolicy", env)
        logging = LoggerCallback(logger)

        checkpoint = CheckpointCallback(
            save_freq=1000,
            save_path=f"{args.outdir}/PPO_{b}_s{s}/models",
            name_prefix="model",
        )
        callback = CallbackList([logging, checkpoint])
        model.learn(total_timesteps=args.timesteps, callback=callback)
        logger.close()
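
This snippet begins mid-way through the argument parser. A plausible reconstruction of the missing definitions, inferred only from the attributes used above (args.benchmarks, args.outdir, args.timesteps), could be:

import argparse

# Hypothetical opening of the parser; names are inferred, defaults are guesses.
parser = argparse.ArgumentParser(description="Train PPO2 on benchmark environments")
parser.add_argument("--outdir", type=str, default="output",
                    help="Directory for logs and model checkpoints")
parser.add_argument("--benchmarks", nargs="+", type=str,
                    help="Benchmarks to train on")
parser.add_argument("--timesteps", type=int, default=1000000,
                    help="Training timesteps per benchmark/seed pair")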
Example #21
	# td3_env = env
	
	checkpoint_on_event = CheckpointCallback(save_freq=1000, save_path= "./logs/model_checkpoints",
                                         name_prefix='rl_model')
	event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

	eval_callback = EvalCallback(td3_env, best_model_save_path='./logs/',
                             log_path='./logs/', eval_freq=100,
                             deterministic=True, render=False)

	

	# td3_model.learning_starts = 100
	
	custom_callback = customCallback(verbose=0)
	callback = CallbackList([custom_callback, checkpoint_on_event])

	td3_model = TD3(Td3MlpPolicy, td3_env,
					gamma = GAMMA,
					learning_rate = LEARNING_RATE,
					buffer_size = BUFFER_SIZE,
					learning_starts = LEARNING_STARTS,
					train_freq = TRAIN_FREQ,
					gradient_steps = GRADIENT_STEPS,
					batch_size = BATCH_SIZE,
					tau = TAU,
					policy_delay = POLICY_DELAY,
					action_noise = td3_noise,
					target_policy_noise = TARGET_POLICY_NOISE,
					target_noise_clip = TARGET_NOISE_CLIP,
					random_exploration = RANDOM_EXPLORATION,
Example #22
               rPiIP='192.168.0.183',
               rPiPort=50000,
               episodeLength=100,
               bullseye=10)

callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-20,
                                                 verbose=1)

eval_callback = EvalCallback(env,
                             best_model_save_path='./logs/best',
                             log_path='./logs/',
                             eval_freq=500,
                             deterministic=True,
                             render=False,
                             callback_on_new_best=callback_on_best)

# Added a checkpoint because I lost model data after a crash, when the webcam shut down because the screen went to sleep :(
checkpoint_callback = CheckpointCallback(save_freq=1000,
                                         save_path='./logs/',
                                         name_prefix='ppo1_model')

cb = CallbackList([eval_callback, checkpoint_callback])

model = DQN(MlpPolicy,
            env,
            verbose=1,
            double_q=True,
            tensorboard_log='./logs/')
model.learn(total_timesteps=2000, callback=cb)
model.save("dqn_rpi_led")
Example #23
def main():

    # Argument parser to select model type
    parser = argparse.ArgumentParser(description="Train a reinforcement learning flight controller.")
    parser.add_argument('-m','--model', help="RL Agent to train on.")
    args = vars(parser.parse_args())

    # Create a Comet experiment with an API key
    experiment = Experiment(api_key="Bq3mQixNCv2jVzq2YBhLdxq9A",
                            project_name="rl-flight-controller", workspace="alexbarnett12",
                            log_env_gpu = False, log_env_cpu = False, log_env_host= False, 
                            log_git_metadata = False, log_git_patch = False)

    # Load training parameters
    cfg = configparser.ConfigParser()
    cfg.read(TRAINING_CONFIG)
    params = cfg["PARAMETERS"]

    # Set training parameters
    learning_rate_max = float(params["learning_rate_max"])
    learning_rate_min = float(params["learning_rate_min"])
    n_steps = int(params["N_steps"])
    noptepochs = int(params["Noptepochs"])
    nminibatches = int(params["Nminibatches"])
    gamma = float(params["Gamma"])
    lam = float(params["Lam"])
    clip = float(params["Clip"])
    ent_coeff = float(params["Ent_coeff"])
    total_timesteps = int(params["Total_timesteps"])

    # Linearly decreasing learning rate (only for PPO2)
    lr_callback = create_lr_callback(learning_rate_max, learning_rate_min)

    # Report hyperparameters to Comet
    hyper_params = {"learning_rate": learning_rate_max, 
                    "steps": n_steps,
                    "epochs": noptepochs,
                    "minibatches": nminibatches,
                    "gamma": gamma,
                    "lambda": lam,
                    "clip_range": clip,
                    "ent_coeff": ent_coeff,
                    "total_timesteps": total_timesteps}
    experiment.log_parameters(hyper_params)

    # You can set the level to logger.DEBUG or logger.WARN if you
    # want to change the amount of output.
    logger.set_level(logger.DEBUG)

    # Create save directory and various save paths
    model_log_dir = create_model_log_dir()
    save_path = "./logs/" + model_log_dir + "/ckpts/"
    best_model_save_path = "./logs/" + model_log_dir + "/best_model/"
    log_path = "./logs/" + model_log_dir + "/results/"
    tensorboard_dir = "./logs/" + model_log_dir + "/tensorboard/"
    model_save_path = "./logs/saved_models/" + model_log_dir

    # Save training and reward params to model directory 
    shutil.copy("./gymfc/reward_params.config", "./logs/" + model_log_dir + "/reward_params.config")
    shutil.copy("./gymfc/training_params.config", "./logs/" + model_log_dir + "/training_params.config")

    # Create a callback to save model checkpoints
    checkpoint_callback = CheckpointCallback(save_freq=100000, save_path=save_path,
                                             name_prefix='rl_model')

    # Create a separate evaluation environment
    #eval_env = gym.make('attitude-fc-v0')

    # Callback to evaluate the model during training
    #eval_callback = EvalCallback(eval_env, best_model_save_path=best_model_save_path,
    #                            log_path=log_path, eval_freq=100000)

    # Create training environment
    env = gym.make('attitude-fc-v0')

    # Callback to add max penalty watchers to Tensorboard
    tb_callback = TensorboardCallback(env)

    # Create the callback list
    #callback = CallbackList([checkpoint_callback, eval_callback, tb_callback])
    callback = CallbackList([checkpoint_callback, tb_callback])
    # RL Agent; Current options are PPO1 or PPO2
    # Note: PPO2 does not work w/o vectorized environments (gymfc is not vectorized)
    if args["model"] == "PPO2":
        print("PPO2!")
        model = PPO2(MlpPolicy, 
                    env,
                    n_steps=n_steps,
                    learning_rate=lr_callback,
                    noptepochs=noptepochs,
                    nminibatches=nminibatches,
                    gamma=gamma,
                    lam=lam,
                    cliprange=clip,
                    ent_coef=ent_coeff,
                    tensorboard_log=tensorboard_dir,
                    policy_kwargs={"layers": [32, 32]})
        experiment.add_tag("PPO2")

    else:
        model = PPO1(MlpPolicy,
                     env,
                     timesteps_per_actorbatch=n_steps,
                     optim_stepsize = learning_rate_max,
                     schedule="linear",
                     optim_epochs=noptepochs,
                     optim_batchsize=nminibatches,
                     gamma=gamma,
                     lam=lam,
                     clip_param=clip,
                     entcoeff=ent_coeff,
                     tensorboard_log=tensorboard_dir)
        experiment.add_tag("PPO1")

    # Train the model. Clean up environment on user cancellation
    try:
        model.learn(total_timesteps=total_timesteps, callback=callback)
    except KeyboardInterrupt:
        print("INFO: Ctrl-C caught. Cleaning up...")
        env.close()
        # eval_env.close()  # eval_env creation is commented out above

    model.save(model_save_path)

    env.close()
    # eval_env.close()  # eval_env creation is commented out above
Example #24
    def __init__(self, env: Env, params: dict, model_path: str, log_path: str):
        """Initialize.

        :param env: gym environment. Assuming observation space is a tuple,
            where first component is from original env, and the second is
            temporal goal state.
        :param params: dict of parameters, like `default_parameters`.
        :param model_path: directory where to save models.
        :param log_path: directory where to save tensorboard logs.
        """
        # Check
        if params["initialize_file"]:
            raise ValueError(
                "Initialization not supported; use resuming option")
        if params["action_bias"]:
            raise ValueError("Action bias is not maintained here")

        # Alias
        original_env = env

        # Load a saved agent for the action bias
        self.biased_agent: Optional[DQN] = None
        if params["action_bias"]:
            loading_params = dict(params)
            loading_params["resume_file"] = params["action_bias"]
            loading_params["action_bias"] = None

            self.biased_agent = TrainStableBaselines(
                env=env,
                params=loading_params,
                model_path=model_path,
                log_path=log_path,
            ).model

        # Collect statistics
        #    (assuming future wrappers do not modify episodes)
        env = MyStatsRecorder(env=env, gamma=params["gamma"])

        # Callbacks
        checkpoint_callback = CustomCheckpointCallback(
            save_path=model_path,
            save_freq=params["save_freq"],
            extra=None,
        )
        stats_logger_callback = StatsLoggerCallback(stats_recorder=env,
                                                    scope="env0")

        callbacks_list = [checkpoint_callback, stats_logger_callback]
        if params["render"]:
            renderer_callback = RendererCallback()
            callbacks_list.append(renderer_callback)

        # If training a passive agent log this too
        if params["active_passive_agents"]:

            # Find the reward shaping env
            reward_shaping_env = find_wrapper(env, RewardShapingWrapper)

            passive_stats_env = MyStatsRecorder(
                env=UnshapedEnv(reward_shaping_env),
                gamma=params["gamma"],
            )

            passive_stats_callback = StatsLoggerCallback(
                stats_recorder=passive_stats_env,
                scope="env1",
            )
            callbacks_list.append(passive_stats_callback)

            # Make it move with the original env
            env = UnshapedEnvWrapper(
                shaped_env=env,
                unshaped_env=passive_stats_env,
            )
            original_reward_getter = env.get_reward  # alias
        else:
            original_reward_getter = None

        # Combine callbacks
        all_callbacks = CallbackList(callbacks_list)

        # Define or load
        resuming = bool(params["resume_file"])
        if not resuming:
            # Normalizer
            normalized_env = NormalizeEnvWrapper(
                env=env,
                training=True,
                entry=0,  # Only env features, not temporal goal state
            )
            flat_env = BoxAutomataStates(normalized_env)
            # Saving normalizer too
            checkpoint_callback.saver.extra_model = normalized_env

            # Agent
            model = DQN(
                env=flat_env,
                policy=ModularPolicy,
                policy_kwargs={
                    "layer_norm": params["layer_norm"],
                    "layers": params["layers"],
                    "shared_layers": params["shared_layers"],
                    "dueling": params["dueling"],
                },
                gamma=params["gamma"],
                learning_rate=params["learning_rate"],
                train_freq=params["train_freq"],
                double_q=True,
                batch_size=params["batch_size"],
                buffer_size=params["buffer_size"],
                learning_starts=params["learning_starts"],
                prioritized_replay=True,
                target_network_update_freq=params[
                    "target_network_update_freq"],
                exploration_fraction=params["exploration_fraction"],
                exploration_final_eps=params["exploration_final_eps"],
                exploration_initial_eps=params["exploration_initial_eps"],
                active_passive_agents=params["active_passive_agents"],
                passive_reward_getter=original_reward_getter,
                tensorboard_log=log_path,
                full_tensorboard_log=False,
                verbose=1,
            )
        else:
            # Reload model
            model, extra_model, counters = checkpoint_callback.load(
                path=params["resume_file"], )

            # Restore normalizer and env
            normalized_env = extra_model
            normalized_env.set_env(env)
            flat_env = BoxAutomataStates(normalized_env)

            # Restore properties
            model.tensorboard_log = log_path
            model.num_timesteps = counters["step"]
            model.learning_starts = params["learning_starts"] + counters["step"]
            model.set_env(flat_env)
            model.passive_reward_getter = original_reward_getter

        # Store
        self.params = params
        self.resuming = resuming
        self.saver = checkpoint_callback
        self.logger = stats_logger_callback
        self.callbacks = all_callbacks
        self.model: DQN = model
        self.normalized_env = normalized_env
        self.testing_agent = model if not params[
            "test_passive"] else model.passive_agent
Example #25
policy_kwargs = dict(act_fun=tf.nn.relu,
                     net_arch=net_arch,
                     obs_norm_init=init_obs_norm,
                     act_norm_init=init_act_norm)

n_time_step = 10 * 10**6

save_gif_callback = SaveGifCallback(save_freq=int(0.5 * 10**6),
                                    save_path=os.path.join(
                                        log_path, run_id, 'training_videos'),
                                    fps=int(1. / env._policy_step))
rwd_term_callback = DiscRwdTerminate(th_perc=0.9, n_skip=500)
rwd_rec_callback = SaveRewardPortionsCallback(
    fullfilename=os.path.join(log_path, run_id, 'reward_portions.txt'))

callback = CallbackList(
    [save_gif_callback, rwd_term_callback, rwd_rec_callback])

model = PPO2(NormalMlpPolicy,
             env,
             gamma=0.95,
             n_steps=8192,
             ent_coef=0.0001,
             nminibatches=4,
             noptepochs=4,
             learning_rate=5.0 * 10**(-4),
             policy_kwargs=policy_kwargs,
             verbose=False,
             tensorboard_log=os.path.join(log_path, run_id))
model.learn(total_timesteps=n_time_step,
            log_interval=1000,
            reset_num_timesteps=False,
Example #26
    return _init


if __name__ == '__main__':

    num_cpu = 12  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(inputfile, i) for i in range(num_cpu)])
    eval_env = environment(inputfile, gamma)
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)
    scenario = str(
        f'{inputfile_s}_t{test}_lr{LR_s}_gamma{gamma_s}_batch{batch_size}')
    callbacklist = CallbackList([
        TimeLimit(episodetimesteps),
        EvalCallback(eval_env, log_path=scenario, n_eval_episodes=5)
    ])

    model = PPO2(MlpPolicy,
                 env,
                 gamma=gamma,
                 n_steps=batch_size,
                 learning_rate=LR,
                 verbose=1)  #, tensorboard_log=scenario)
    model.learn(total_timesteps=episodetimesteps**99, callback=callbacklist)

    filename = './%s/evaluations.npz' % scenario

    data = np.load(filename)
    results = data['results']
    y = np.average(results, axis=1)
Example #27
                                         save_path='./tf_model_logs/')
# Separate evaluation env
eval_env = gym_env.PegInEnv(
    "PandaPegIn",
    has_offscreen_renderer=True,
    # has_renderer=True,
    use_camera_obs=False,
    control_freq=100,
)

eval_callback = EvalCallback(eval_env,
                             best_model_save_path='./tf_model_logs/best_model',
                             log_path='./tf_model_logs/best_model_results',
                             eval_freq=10000)
# Create the callback list
callback = CallbackList([checkpoint_callback, eval_callback])

env = gym_env.PegInEnv(
    "PandaPegIn",
    has_offscreen_renderer=True,
    # has_renderer=True,
    use_camera_obs=False,
    control_freq=100,
)

model = PPO1(MlpPolicy,
             env,
             timesteps_per_actorbatch=2048,
             clip_param=0.2,
             entcoeff=0.0,
             optim_epochs=5,
Example #28
    # Create the vectorized environment
    #env = environment(x,y,z,0.95, 0.05, savepath, 'MlpPolicy', rg_prob='loadenv')
    env = environment(
        x, y, z, gamma, turnspc, policyname, rg_prob='loadenv'
    )  #SubprocVecEnv([make_env(x,y,z, i) for i in range(num_cpu)])
    #eval_env=environment(x, y, z, gamma, turnspc, savepath, policyname, rg_prob='loadenv')
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    #create callbacks to record data, initiate events during training.
    callbacklist = CallbackList([
        TimeLimit(episodetimesteps),
        EvalCallback(env,
                     log_path=savepath,
                     n_eval_episodes=1,
                     eval_freq=10000,
                     deterministic=det,
                     best_model_save_path=savepath)
    ])

    if (os.path.exists("%s/best_model.zip" % savepath)):
        # Instantiate the agent
        model = ACER(policy,
                     env,
                     gamma=gamma,
                     n_steps=episodetimesteps,
                     learning_rate=LR,
                     buffer_size=10000,
                     verbose=1)
        # Load the trained agent
Example #29
    env = SubprocVecEnv([make_env(x, y, z, i) for i in range(num_cpu)])
    eval_env = evalenv(x, y, z, turnspc, policyname)
    env1 = environment(x, y, z, turnspc, scalar, policyname)
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    #create callbacks to record data, initiate events during training.
    callbacklist = CallbackList([
        TimeLimit(episodetimesteps),
        EvalCallback(eval_env,
                     log_path=evpath,
                     n_eval_episodes=100,
                     eval_freq=50000,
                     deterministic=True,
                     best_model_save_path=evpath),
        EvalCallback(env1,
                     log_path=savepath,
                     n_eval_episodes=20,
                     eval_freq=10000,
                     deterministic=False,
                     best_model_save_path=savepath)
    ])
    if (os.path.exists("%s/final_model.zip" % savepath)):
        # Instantiate the agent
        model = ACER(policy,
                     env,
                     gamma=gamma,
                     n_steps=episodetimesteps,
                     learning_rate=LR,
                     buffer_size=5000,
Example #30
def main(logdir):
    # params
    SLEEP_RATE = 100  # Hz (rates tried: 1, 2, 10, 50, 100)
    EPISODE_TIME = 30  # seconds (values tried: 30, 120)
    USE_MPC = False
    N_EPISODE = 1000000
    Action_Choice = np.array([1, 1, 1, 1, 0, 0, 0, 0])
    EPISODE_LENGTH = SLEEP_RATE * EPISODE_TIME
    TOTAL_TIMESTEPS = EPISODE_LENGTH * N_EPISODE

    # logdir
    logdir = os.path.join(logdir, strftime("%Y-%m-%d--%H:%M:%S", localtime()))
    os.makedirs(logdir)
    checkpoint_path = os.path.join(logdir, 'checkpoint')
    callback_path = logdir
    final_model_path = logdir + '/final_model'

    # env
    env = BlimpEnv(SLEEP_RATE, EPISODE_TIME, USE_MPC, Action_Choice)
    env = Monitor(env, logdir)
    # env = make_vec_env(lambda: env, n_envs=1, monitor_dir=logdir)
    print("Observation space:", env.observation_space)
    print("Shape:", env.observation_space.shape)
    print("Action space:", env.action_space)

    # callback
    SAVE_FREQ = EPISODE_LENGTH * 100  # save the model every 100 episodes
    checkpoint_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                             save_path=checkpoint_path,
                                             name_prefix='sac_callback_model')
    save_on_best_training_reward_callback = SaveOnBestTrainingRewardCallback(
        check_freq=SAVE_FREQ, log_dir=callback_path)
    callback = CallbackList(
        [checkpoint_callback, save_on_best_training_reward_callback])

    # agent
    model = SAC(MlpPolicy,
                env,
                gamma=0.98,
                learning_rate=0.0003,
                buffer_size=1000000,
                learning_starts=EPISODE_LENGTH * 20,
                train_freq=1,
                batch_size=256,
                tau=0.01,
                ent_coef='auto',
                target_update_interval=1,
                gradient_steps=1,
                target_entropy='auto',
                action_noise=None,
                verbose=1,
                tensorboard_log=logdir,
                full_tensorboard_log=True,
                _init_setup_model=True)

    print("---------- Start Learing -----------")
    model.learn(total_timesteps=TOTAL_TIMESTEPS,
                log_interval=SAVE_FREQ,
                callback=callback)

    print("---------- Finish Learning ----------")
    model.save(final_model_path)
    del model  # remove to demonstrate saving and loading
    model = SAC.load(final_model_path)

    results_plotter.plot_results([logdir], TOTAL_TIMESTEPS,
                                 results_plotter.X_TIMESTEPS, "SAC BLIMP")
    plt.show()