def train(params, model=None, path=None):
    if model: # indicate in filename that this is a finetune
        if params['name']:
            params['name'] += '_Finetune'
        else:
            params['name'] = 'Finetune'
    
    data_dir, tb_path = get_paths(params, path=path)
    print("Training Parameters: ", params)
    os.makedirs(data_dir, exist_ok=True)
    # Save parameters immediately
    params.save(data_dir)

    rank = mpi_rank_or_zero()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    
    def make_env(i):
        env = get_env(params)
        env = Monitor(env, data_dir + '/' + str(i), allow_early_resets=params['early_reset'])
        return env

    use_her = params['env_args']['use_her'] if 'use_her' in params['env_args'] else False

    if use_her:
        env = make_env(0)
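        # 'future' relabels stored transitions with goals drawn from later states of the
        # same episode (the standard HER goal-selection strategy).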
        goal_selection_strategy = 'future'
    else:
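        # The nested lambda binds each worker index i by value, so every Monitor in the
        # vectorized env writes to its own log file under data_dir.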
        env = DummyVecEnv([(lambda n: lambda: make_env(n))(i) for i in range(params['num_proc'])])

    if model: # sanity-check that the pretrained model's action space matches the new env
        print("Model action space", model.action_space, model.action_space.low)
        print("Env action space", env.action_space, env.action_space.low)
    if params['normalize']:
        env = VecNormalize(env)
    if params['seed']:
        seed = params['seed'] + 100000 * rank
        set_global_seeds(seed)
        params['alg_args']['seed'] = seed
    if 'noise' in params and params['noise']:
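        # Ornstein-Uhlenbeck noise provides temporally correlated exploration, commonly
        # paired with off-policy continuous-control algorithms such as DDPG.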
        from stable_baselines.ddpg import OrnsteinUhlenbeckActionNoise
        n_actions = env.action_space.shape[-1]
        params['alg_args']['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(params['noise'])*np.ones(n_actions))
    
    if model is None:
        alg = get_alg(params)
        policy = get_policy(params)
        if use_her:
            from stable_baselines import HER
            model = HER(policy, env, alg, n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy,
                        verbose=1, tensorboard_log=tb_path, policy_kwargs=params['policy_args'], **params['alg_args'])
        else:
            model = alg(policy, env, verbose=1, tensorboard_log=tb_path, policy_kwargs=params['policy_args'], **params['alg_args'])
    else:
        model.set_env(env)

    model.learn(total_timesteps=params['timesteps'], log_interval=params['log_interval'],
                callback=create_training_callback(data_dir, freq=params['eval_freq'],
                                                  checkpoint_freq=params['checkpoint_freq']))
    print("######## SAVING MODEL TO", data_dir)
    model.save(data_dir +'/final_model')
    if params['normalize']:
        env.save(data_dir + '/normalized_environment.env')
    env.close()
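
# Hypothetical usage sketch (`load_params` is illustrative, not an actual helper here):
# params = load_params('configs/hopper_sac.json')  # any params object with dict access and .save()
# train(params)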

# Simple training callback: reports progress from the Monitor logs every 1000 calls and
# keeps the best model. Assumes a module-level `log_dir` and a VecNormalize-wrapped `env`.
best_mean_reward, n_steps = -np.inf, 0

def callback(_locals, _globals):
    global n_steps, best_mean_reward
    if (n_steps + 1) % 1000 == 0:
        # Evaluate training performance from the Monitor logs
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            print(x[-1], 'timesteps')
            print(
                "Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                .format(best_mean_reward, mean_reward))

            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model")
                _locals['self'].save(log_dir + 'best_model.pkl')
                env.save(os.path.join(log_dir, "vec_normalize.pkl"))
    n_steps += 1
    # Returning False will stop training early
    return True


# env_s = lambda: gym.make("HopperEnvRep-v0")
# env_s = Monitor(env_s, log_dir, allow_early_resets=True)

# env.act_rep = 20

model = SAC(MlpPolicy, env, verbose=1)
# model = PPO2(MlpPolicy, env, verbose=True)
# Note: `use_action_repeat` is not a standard stable-baselines argument; it presumably comes
# from a custom action-repeat SAC variant (cf. `env.act_rep` and HopperEnvRep above).
model.learn(total_timesteps=1000, use_action_repeat=True, callback=callback)

# Don't forget to save the VecNormalize statistics when saving the agent
# log_dir = "logs/hopper_aneal/"
model.save(log_dir + "sac_hopper")
env.save(os.path.join(log_dir, "vec_normalize.pkl"))
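
# Quick sanity-check rollout with the trained policy (deterministic actions); assumes `env`
# is the same VecNormalize-wrapped training env used above, which auto-resets episodes.
obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, _infos = env.step(action)
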
def train_hrl(low_params, high_params, high_training_starts=0, model=None, path=None):
    if model: # indicate in filename that this is a finetune
        if low_params['name']:
            low_params['name'] += '_Finetune'
        else:
            low_params['name'] = 'Finetune'
        if high_params['name']:
            high_params['name'] += '_Finetune'
        else:
            high_params['name'] = 'Finetune'

    params = merge_hrl_params(low_params, high_params)
    data_dir, tb_path = get_paths(params, path=path)

    data_dir_components = data_dir.split('_')
    data_dir_components.insert(-1, 'Low')
    low_data_dir = '_'.join(data_dir_components)
    data_dir_components[-2] = 'High'
    high_data_dir = '_'.join(data_dir_components)

    os.makedirs(high_data_dir, exist_ok=True)
    os.makedirs(low_data_dir, exist_ok=True)
    # Enforce consistency across params by using the split function.
    low_params, high_params = split_hrl_params(params)
    print("HRL PARAMS")
    print("High Params", high_params)
    print("low Params", low_params)
    high_params['env_wrapper_args']['policy'] = '/'.join(low_data_dir.split('/')[-2:])

    low_params.save(low_data_dir)
    high_params.save(high_data_dir)
    
    def make_env(i):
        env = get_env(params)
        print("ENVIRONMENT", env)
        env = Monitor(env, high_data_dir + '/' + str(i), allow_early_resets=params['early_reset'],
                    info_keywords=('low_ep_info',))
        return env

    env = DummyVecEnv([(lambda n: lambda: make_env(n))(i) for i in range(params['num_proc'])])

    if params['normalize']:
        env = VecNormalize(env)

    seed = params['seed']
    if seed:
        set_global_seeds(seed)
        params['alg_args']['seed'] = seed
    
    if model is None:
        alg = get_alg(params)
        policy = get_policy(params)
        model = alg(policy, policy, env, verbose=1, tensorboard_log=tb_path,
                    high_policy_kwargs=params['high_policy_args'],
                    low_policy_kwargs=params['low_policy_args'],
                    **{'low_' + key: value for key, value in params['low_alg_args'].items()},
                    **{'high_' + key: value for key, value in params['high_alg_args'].items()})
    else:
        model.set_env(env)

    model.learn(total_timesteps=params['timesteps'], log_interval=int(params['log_interval'] / 4),
                callback=create_training_callback(high_data_dir, low_level_data_dir=low_data_dir,
                                                  freq=params['eval_freq'], checkpoint_freq=params['checkpoint_freq']),
                high_training_starts=high_training_starts)
    
    model.save(low_data_dir + '/final_model', high_data_dir + '/final_model')
    if params['normalize']:
        # Save next to the high-level logs; the merged data_dir is never actually created.
        env.save(high_data_dir + '/normalized_environment.env')
    env.close()
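
# Hypothetical usage sketch (config names are illustrative only):
# low_params = load_params('configs/maze_low.json')
# high_params = load_params('configs/maze_high.json')
# train_hrl(low_params, high_params, high_training_starts=10000)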

def plot_learning_curve(log_folder, title=''):
    # Plot Monitor-log episode rewards vs. timesteps (wrapper name is illustrative;
    # assumes stable_baselines' load_results/ts2xy and matplotlib.pyplot as plt).
    x, y = ts2xy(load_results(log_folder), 'timesteps')
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title + " Learning Curve Smoothed")
    plt.show()


if __name__ == "__main__":
    rospy.init_node('drone_gym')
    env_id = 'Crazyflie-v0'
    log_dir = 'models/hover/empty_world_small/finalVec'

    env = DummyVecEnv([lambda: gym.make(env_id)])
    # Automatically normalize the input features and reward
    env = VecNormalize(env, norm_obs=True, norm_reward=True)

    # # Save best model every n steps and monitors performance
    # save_best_callback = SaveOnBestTrainingRewardCallback(check_freq=5, log_dir=log_dir)
    # # Save model every n steps
    # checkpoint_callback = CheckpointCallback(save_freq=5, save_path='./' + log_dir, name_prefix='ppo2')

    # Train from scratch
    model = PPO2(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=80000)
    # model.learn(total_timesteps=20, callback=[save_best_callback, checkpoint_callback])

    # Don't forget to save the VecNormalize statistics when saving the agent
    model.save(log_dir + "/ppo2_final")
    stats_path = os.path.join(log_dir, "vec_normalize.pkl")
    env.save(stats_path)
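
    # A minimal reload/evaluation sketch, assuming the paths saved above and the same env_id.
    eval_env = DummyVecEnv([lambda: gym.make(env_id)])
    eval_env = VecNormalize.load(stats_path, eval_env)
    eval_env.training = False     # do not update normalization statistics at test time
    eval_env.norm_reward = False  # report raw, unnormalized rewards during evaluation
    eval_model = PPO2.load(log_dir + "/ppo2_final")
    obs = eval_env.reset()
    for _ in range(100):
        action, _states = eval_model.predict(obs, deterministic=True)
        obs, rewards, dones, _infos = eval_env.step(action)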