Example #1
# Standard-library and gym imports required by the code below; the project-specific
# helpers (init_gym, set_global_seeds, Scaler, NNValueFunction, Policy, run_policy,
# add_value, add_disc_sum_rew, add_gae, build_train_set, logger) are assumed to be
# provided by the surrounding repository and are not shown here.
import os
import pickle
from datetime import datetime

from gym import wrappers

def train_models(env_name, num_episodes, gamma, lam, kl_targ, coef,
                 use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs,
                 max_timesteps, reg_scale, phi_lr, phi_hs, policy_size,
                 phi_obj, load_model):

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    aigym_path = os.path.join('log-files/', env_name, now)
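    # wrap the env with a Monitor that logs episode stats to aigym_path (video recording disabled)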
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    epochs,
                    phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)

    run_policy(env, policy, scaler, num_episodes,
               max_timesteps=max_timesteps)  # run a few episodes to initialize the scaler

    episode = 0
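    # each iteration samples trajectories, estimates returns and GAE advantages,
    # then updates the policy and refits the value function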
    for _ in range(200):
        trajectories, traj_len_list = run_policy(env,
                                                 policy,
                                                 scaler,
                                                 num_episodes,
                                                 max_timesteps=max_timesteps)

        num_traj = len(trajectories)

        episode += len(trajectories)
        add_value(trajectories, val_func)
        add_disc_sum_rew(trajectories, gamma)
        add_gae(trajectories, gamma, lam)

        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)

        policy.update(load_model,
                      observes,
                      actions,
                      advantages,
                      use_lr_adjust,
                      ada_kl_penalty,
                      c=0.)  # update policy
        val_func.fit(observes, disc_sum_rew)

    # Save models
    policy.save_policy()
    val_func.save_val_func()
    logger.log("saved model")
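
A minimal sketch of how the function above might be invoked; the environment name and
every argument value are illustrative placeholders, not settings taken from the original
project:

# Hypothetical call to train_models; all values below are placeholders for illustration.
train_models(env_name='Hopper-v2', num_episodes=20, gamma=0.995, lam=0.98,
             kl_targ=0.003, coef=1.0, use_lr_adjust=True, ada_kl_penalty=True,
             seed=0, epochs=20, phi_epochs=500, max_timesteps=1000,
             reg_scale=0.0, phi_lr=3e-4, phi_hs='100x50', policy_size='large',
             phi_obj='MinVar', load_model=False)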

Example #2
def train_models(env_name, num_episodes, gamma, lam, kl_targ, coef,
                 use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs,
                 max_timesteps, reg_scale, phi_lr, phi_hs, policy_size,
                 phi_obj, load_model, type):

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed) 
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    epochs,
                    phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj,
                    type=type)

    run_policy(env, policy, scaler, num_episodes,
               max_timesteps=max_timesteps, mode=load_model)  # run a few to init scaler

    episode = 0
    for i in range(2000):
        print("sampling and training at iteration %d\n" % i)
        trajectories, traj_len_list = run_policy(env, policy, scaler, num_episodes,
                                                 max_timesteps=max_timesteps,
                                                 mode=load_model)

        num_traj = len(trajectories)

        episode += len(trajectories)
        add_value(trajectories, val_func)
        add_disc_sum_rew(trajectories, gamma)
        add_gae(trajectories, gamma, lam)

        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)

        policy.update(load_model, observes, actions, advantages,
                      use_lr_adjust, ada_kl_penalty, c=0.)  # update policy
        val_func.fit(observes, disc_sum_rew)

    # Save models
    policy.save_policy()
    val_func.save_val_func()
    refine_scaler = False
    if refine_scaler:
        run_policy(env, policy, scaler, num_episodes,
                   max_timesteps=max_timesteps, mode=load_model)  # run a few to refine scaler
    with open('models/scaler/scaler.pkl', 'wb') as output:
        pickle.dump(scaler, output, pickle.HIGHEST_PROTOCOL)
    logger.log("saved model")
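
For completeness, a small sketch of how the pickled scaler could be restored later; the
models/scaler/scaler.pkl path and the Scaler object come from the code above, while the
evaluation-time reuse itself is an assumption:

import pickle

# Hypothetical evaluation-time restore of the scaler saved above.
with open('models/scaler/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)
# the restored scaler can then normalize observations the same way as during training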