Example #1
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, init_logvar):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update, D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        init_logvar: natural log of initial policy variance
    """
    print('load model (l)?')
    loading = input('')
    pybullet.connect(pybullet.DIRECT)
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # print('obs_dim') # 45 for HumanoidFlagrunBulletEnv-v0, HumanoidFlagrunHarderBulletEnv-v0
    # print(obs_dim)
    # print('act_dim') # 17 for HumanoidFlagrunBulletEnv-v0, HumanoidFlagrunHarderBulletEnv-v0
    # print(act_dim)
    # input('')
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)

    val_func = NNValueFunction(obs_dim, hid1_mult, loading)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, init_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)

    policy_model = policy.get_trpo_policy_model()
    valNN_model = val_func.get_valNN_model()
    lr = val_func.get_lr()

    if loading == 'l':
        policy_model.load_weights('pol_weights.h5')
        pol_weights = policy_model.get_weights()
        print('pol_weights')
        print(pol_weights)
        input('')
        loading = 'n'

    save_weights_flag = 1

    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)

        if episode <= batch_size:
            if loading == 'l':
                traj = open('trajectories.obj', 'rb')
                trajectories = pickle.load(traj)
                traj.close()
                print('342')
                input('')
        elif episode == num_episodes - batch_size:
            traj = open('trajectories.obj', 'wb')
            pickle.dump(trajectories, traj)
            traj.close()
            print('348')
            input('')

        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        if episode > 50:
            policy_model = policy.get_trpo_policy_model()
            print('about to save model')
            input('')
            policy_model.save('policy_model')
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()

    if save_weights_flag == 1:
        valNN_model.save('val_weights.h5')

        policy_weights = policy_model.get_weights()
        print('policy_weights')
        print(policy_weights)
        input('')
        # policy_model.save_weights('pol_weights.hdf5')
        policy_model.save_weights('pol_weights.h5')
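The loops above (and those in the later examples) rely on helper functions such as add_value, add_disc_sum_rew, and add_gae that are not shown in these excerpts. A minimal sketch of what they might look like, assuming each trajectory is a dict of NumPy arrays keyed by 'observes', 'rewards', and 'values', and that val_func exposes a predict() method:

import numpy as np

def discount(x, gamma):
    """Discounted cumulative sum (rewards -> returns, TD residuals -> GAE)."""
    out = np.zeros_like(x, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out

def add_value(trajectories, val_func):
    """Attach value-function predictions to each trajectory."""
    for traj in trajectories:
        traj['values'] = val_func.predict(traj['observes'])

def add_disc_sum_rew(trajectories, gamma):
    """Attach the discounted sum of rewards (regression target for the critic)."""
    for traj in trajectories:
        traj['disc_sum_rew'] = discount(traj['rewards'], gamma)

def add_gae(trajectories, gamma, lam):
    """Attach Generalized Advantage Estimates (Schulman et al., 2016)."""
    for traj in trajectories:
        rewards, values = traj['rewards'], traj['values']
        # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        tds = rewards - values + np.append(values[1:] * gamma, 0)
        traj['advantages'] = discount(tds, gamma * lam)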
Example #2
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update, D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
    """

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, False)
    if time_state:
        obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)

    scaler = Scaler(obs_dim, env_name)
    val_func = NNValueFunction(obs_dim, env_name, True)
    arg = [obs_dim, act_dim, kl_targ, time_state, env_name]
    policy = Policy(obs_dim, act_dim, kl_targ, env_name, True)

    episode = 0
    #progresses = None
    while episode < num_episodes:
        trajectories, progress = run_policy(env,
                                            policy,
                                            scaler,
                                            logger,
                                            arg,
                                            episodes=batch_size)
        #TODO change init setup
        try:
            progresses
        except NameError:
            progresses = progress
        else:
            progresses = np.concatenate([progresses, progress], 1)

        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        scaler.save()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    path = os.path.join('savedmodel', env_name, 'prog.dat')
    progresses.dump(path)
    logger.close()
    policy.close_sess()
    val_func.close_sess()
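build_train_set, called in nearly every example, just concatenates the per-episode arrays into flat training batches; normalizing advantages is a common extra step. A hedged sketch under the same trajectory-dict assumption as the helpers after Example #1:

import numpy as np

def build_train_set(trajectories):
    """Concatenate all episodes into single NumPy arrays; normalize advantages."""
    observes = np.concatenate([t['observes'] for t in trajectories])
    actions = np.concatenate([t['actions'] for t in trajectories])
    disc_sum_rew = np.concatenate([t['disc_sum_rew'] for t in trajectories])
    advantages = np.concatenate([t['advantages'] for t in trajectories])
    # Normalizing advantages keeps the policy-gradient step scale-invariant.
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)
    return observes, actions, advantages, disc_sum_rew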
Example #3
def main(env_name, num_iterations, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, coef, use_lr_adjust, ada_kl_penalty, seed, epochs,
         phi_epochs, max_timesteps, reg_scale, phi_lr, phi_hs, policy_size,
         phi_obj):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_iterations: maximum number of iterations to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update, D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        coef: coefficient of Stein control variate
        use_lr_adjust: whether adjust lr based on kl
        ada_kl_penalty: whether adjust kl penalty
        max_timesteps: maximum time steps per trajectory
        reg_scale: regularization coefficient 
        policy_size: policy network size
        phi_obj: FitQ or MinVar
    """

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)

    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    hid1_mult,
                    policy_logvar,
                    epochs,
                    phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    c_ph=coef,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)

    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env,
               policy,
               scaler,
               batch_size=1000,
               max_timesteps=max_timesteps)

    for itr in range(num_iterations):
        logger.log("\n#Training Iter %d" % itr)
        logger.log("Draw Samples..")

        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  batch_size=batch_size,
                                  max_timesteps=max_timesteps)

        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage

        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)

        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew)

        logger.log("Starting Training...")
        policy.update(observes, actions, advantages, \
                use_lr_adjust, ada_kl_penalty)  # update policy

        val_func.fit(observes, disc_sum_rew)  # update value function

        logger.log('--------------------------------\n')

    policy.close_sess()
    val_func.close_sess()
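Several examples widen obs_dim by one for a "time step feature (see run_episode())". A minimal illustration of that convention, assuming run_episode appends a slowly growing step counter to each observation and standardizes with the Scaler's running statistics:

import numpy as np

def run_episode(env, policy, scaler, animate=False):
    """Roll out one episode, appending a scaled time step to each observation."""
    obs = env.reset()
    observes, actions, rewards = [], [], []
    scale, offset = scaler.get()
    done, step = False, 0.0
    while not done:
        if animate:
            env.render()
        obs = np.append(obs, step)      # time-step feature is why obs_dim += 1
        obs = (obs - offset) * scale    # standardize with running statistics
        observes.append(obs)
        action = policy.sample(obs.reshape(1, -1)).flatten()
        actions.append(action)
        obs, reward, done, _ = env.step(action)
        rewards.append(reward)
        step += 1e-3                    # small increment keeps the feature well-scaled
    return (np.asarray(observes), np.asarray(actions),
            np.asarray(rewards, dtype=np.float64))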
Example #4
def eval_models(env_name, num_episodes, gamma, lam, kl_targ, coef,
                use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs,
                max_timesteps, reg_scale, phi_lr, phi_hs, policy_size, phi_obj,
                load_model):

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    epochs,
                    phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)

    logger.log("loading model")
    load_dir = "models/"
    policy.load_model(load_dir)
    val_func.load_val_model(load_dir)

    run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps)

    episode = 0

    trajectories, traj_len_list = run_policy(env,
                                             policy,
                                             scaler,
                                             num_episodes,
                                             max_timesteps=max_timesteps)

    num_traj = len(trajectories)
    logger.log("Avg Length %d total Length %d"%( \
            np.mean(traj_len_list), \
            np.sum(traj_len_list)))

    episode += len(trajectories)
    add_value(trajectories, val_func)
    add_disc_sum_rew(trajectories, gamma)
    add_gae(trajectories, gamma, lam)

    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)

    sub_folder = "eval_data/%s_%s_data_seed=%d_max-steps=%d"%(\
                        env_name, phi_obj,
                        seed, max_timesteps)
    if not os.path.exists(sub_folder):
        os.mkdir(sub_folder)

    # save original gradient
    mc_grad_info = policy.get_batch_gradient(observes,
                                             actions,
                                             advantages,
                                             c=0.)
    mc_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/mc_num_episode=%d.pkl' % (num_episodes),
              'wb') as fp:
        pickle.dump(mc_grad_info, fp)

    policy.update(load_model,
                  observes,
                  actions,
                  advantages,
                  use_lr_adjust,
                  ada_kl_penalty,
                  c=1)  # update policy

    stein_grad_info = policy.get_batch_gradient(observes, \
                    actions, advantages, c=1.)

    stein_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/stein_num_episode=%d.pkl' % (num_episodes),
              'wb') as fp:
        pickle.dump(stein_grad_info, fp)
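GracefulKiller, used in nearly every loop above, is presumably a small signal handler that sets a flag on Ctrl+C instead of raising KeyboardInterrupt, so the batch in progress can finish before the user is asked whether to stop. A plausible sketch:

import signal

class GracefulKiller:
    """Set kill_now on SIGINT/SIGTERM instead of interrupting the training loop."""
    def __init__(self):
        self.kill_now = False
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        self.kill_now = True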
Example #5
    def __init__(self,
                 stateDim: int,
                 actionDim: int,
                 actionMin: np.ndarray,
                 actionMax: np.ndarray,
                 learningRate=0.0005,
                 gamma=0.99,
                 GAElambda=0.95,
                 PPOepsilon=0.2,
                 PPOentropyLossWeight=0,
                 nHidden: int = 2,
                 nUnitsPerLayer: int = 128,
                 mode="PPO-CMA-m",
                 activation="lrelu",
                 H: int = 9,
                 entropyLossWeight: float = 0,
                 sdLowLimit=0.01,
                 useScaler: bool = True,
                 criticTimestepScale=0.001):
        #Create policy network
        print("Creating policy")
        self.actionMin = actionMin.copy()
        self.actionMax = actionMax.copy()
        self.actionDim = actionDim
        self.stateDim = stateDim
        self.useScaler = useScaler
        if useScaler:
            self.scaler = Scaler(stateDim)
        self.scalerInitialized = False
        self.normalizeAdvantages = True
        self.gamma = gamma
        self.GAElambda = GAElambda
        self.criticTimestepScale = 0 if gamma == 0 else criticTimestepScale  #with gamma==0, no need for this
        piEpsilon = None
        nHistory = 1
        negativeAdvantageAvoidanceSigma = 0
        if mode == "PPO-CMA" or mode == "PPO-CMA-m":
            usePPOLoss = False  #if True, we use PPO's clipped surrogate loss function instead of the standard -A_i * log(pi(a_i | s_i))
            separateVarAdapt = True
            self.reluAdvantages = True if mode == "PPO-CMA" else False
            nHistory = H  #policy mean adapts immediately, policy covariance as an aggregate of this many past iterations
            useSigmaSoftClip = True
            negativeAdvantageAvoidanceSigma = 1 if mode == "PPO-CMA-m" else 0
        elif mode == "PPO":
            usePPOLoss = True  #if True, we use PPO's clipped surrogate loss function instead of the standard -A_i * log(pi(a_i | s_i))
            separateVarAdapt = False
            # separateSigmaAdapt=False
            self.reluAdvantages = False
            useSigmaSoftClip = True
            piEpsilon = 0
        else:
            raise ValueError("Unknown mode {}".format(mode))
        self.policy = Policy(
            stateDim,
            actionDim,
            actionMin,
            actionMax,
            entropyLossWeight=PPOentropyLossWeight,
            networkActivation=activation,
            networkDepth=nHidden,
            networkUnits=nUnitsPerLayer,
            networkSkips=False,
            learningRate=learningRate,
            minSigma=sdLowLimit,
            PPOepsilon=PPOepsilon,
            usePPOLoss=usePPOLoss,
            separateVarAdapt=separateVarAdapt,
            nHistory=nHistory,
            useSigmaSoftClip=useSigmaSoftClip,
            piEpsilon=piEpsilon,
            negativeAdvantageAvoidanceSigma=negativeAdvantageAvoidanceSigma)

        #Create critic network, +1 stateDim because at least in OpenAI gym, episodes are time-limited and the value estimates thus depend on simulation time.
        #Thus, we use time step as an additional feature for the critic.
        #Note that this does not mess up generalization, as the feature is not used for the policy during training or at runtime
        print("Creating critic network")
        self.critic = Critic(stateDim=stateDim + 1,
                             learningRate=learningRate,
                             nHidden=nHidden,
                             networkUnits=nUnitsPerLayer,
                             networkActivation=activation,
                             useSkips=False,
                             lossType="L1")

        #Experience trajectory buffers for the memorize() and updateWithMemorized() methods
        self.experienceTrajectories = []
        self.currentTrajectory = []
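The comments above contrast PPO's clipped surrogate loss with the plain -A_i * log pi(a_i | s_i) objective. A small NumPy illustration of the clipped objective, where ratio = pi_new(a|s) / pi_old(a|s) (the function name and signature here are illustrative, not the Policy class's actual internals):

import numpy as np

def ppo_clip_loss(log_prob_new, log_prob_old, advantages, epsilon=0.2):
    """Negative clipped surrogate: L = -E[min(r*A, clip(r, 1-eps, 1+eps)*A)]."""
    ratio = np.exp(log_prob_new - log_prob_old)
    unclipped = ratio * advantages
    clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages
    return -np.mean(np.minimum(unclipped, clipped))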
Example #6
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, eval):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update, D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
    """

    if eval:
        print("Evaluating: ")
        evaluate(env_name, num_episodes)
        exit()

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    #policy.restore_weights() ## -------------
    #val_func.restore_weights() ## -------------
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout

        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

    print("Scaler vars,means: ")
    print(scaler.vars, scaler.means)

    for i in range(3):
        run_episode(env, policy, scaler, animate=True)

    #policy.save_weights()
    #val_func.save_weights()

    #WARNING: scaler is disabled

    logger.close()
    policy.close_sess()
    val_func.close_sess()
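These main() functions are normally driven from the command line. A hypothetical argparse wrapper (flag names and defaults chosen here for illustration, not taken from any of the repositories above):

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train a policy with TRPO/PPO and GAE')
    parser.add_argument('env_name', type=str, help="OpenAI Gym environment name, e.g. 'Hopper-v1'")
    parser.add_argument('-n', '--num_episodes', type=int, default=1000)
    parser.add_argument('-g', '--gamma', type=float, default=0.995)
    parser.add_argument('-l', '--lam', type=float, default=0.98)
    parser.add_argument('-k', '--kl_targ', type=float, default=0.003)
    parser.add_argument('-b', '--batch_size', type=int, default=20)
    args = parser.parse_args()
    main(**vars(args))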
Example #7
    gamma = 0.995
    lam = 0.98
    batch_size = 5

    env = gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode())
    act_dim = env.action_space.shape[0]
    # sess = tf.Session()
    policy = Policy(obs_dim, act_dim)
    val_func = NNValueFunction(obs_dim)
    # sess.run(tf.compat.v1.initializers.global_variables())

    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)

    scaler = Scaler(obs_dim)

    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories, val_func, gamma, lam)
        policy.update(observes, actions, advantages, logger)
        val_func.fit(observes, disc_sum_rew, logger)
        logger.log({
            '_Episode': episode,
        })
        logger.write(display=True)
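Example #7 folds the value, return, and advantage computations into build_train_set itself (it takes val_func, gamma, and lam directly). A hedged sketch of that variant, reusing the helper functions sketched after Example #1:

import numpy as np

def build_train_set(trajectories, val_func, gamma, lam):
    """Variant that also computes values, discounted returns, and GAE."""
    add_value(trajectories, val_func)
    add_disc_sum_rew(trajectories, gamma)
    add_gae(trajectories, gamma, lam)
    observes = np.concatenate([t['observes'] for t in trajectories])
    actions = np.concatenate([t['actions'] for t in trajectories])
    disc_sum_rew = np.concatenate([t['disc_sum_rew'] for t in trajectories])
    advantages = np.concatenate([t['advantages'] for t in trajectories])
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)
    return observes, actions, advantages, disc_sum_rew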
Example #8
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update, D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    # TODO: Replace init_gym with one of my own functions
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S").replace(
        ":", "_")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    pathFolder = logger.pathFolder
    # Replace wrappers.Monitor with a class of my own that controls the simulation
    # I think the wrapper is useless for my example
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)

    # This is to seed the policy with the optimum
    trajectories = initiatePolicyWithOptimum(env, policy, scaler, logger)
    add_value(trajectories, val_func)  # add estimated values to episodes
    add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
    add_gae(trajectories, gamma, lam)  # calculate advantage
    # concatenate all episodes into single NumPy arrays
    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
    print(actions.shape)
    policy.update(observes, actions, advantages, logger)  # update policy
    val_func.fit(observes, disc_sum_rew, logger)  # update value function

    # Not sure whether this is still necessary
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess(pathFolder)
    val_func.close_sess(pathFolder)
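Scaler, instantiated in every example, tracks running observation statistics so that (obs - offset) * scale standardizes inputs online. A simplified sketch, assuming Welford-style accumulation and a get() method returning (scale, offset):

import numpy as np

class Scaler:
    """Running mean/variance of observations for online standardization."""

    def __init__(self, obs_dim):
        self.means = np.zeros(obs_dim)
        self.vars = np.zeros(obs_dim)
        self.count = 0

    def update(self, x):
        """Update statistics with a batch of observations, shape (n, obs_dim)."""
        n = x.shape[0]
        if self.count == 0:
            self.means, self.vars, self.count = x.mean(axis=0), x.var(axis=0), n
            return
        total = self.count + n
        delta = x.mean(axis=0) - self.means
        # Combine per-group moments (parallel variance formula).
        m2 = self.vars * self.count + x.var(axis=0) * n + delta**2 * self.count * n / total
        self.means += delta * n / total
        self.vars = m2 / total
        self.count = total

    def get(self):
        """Return (scale, offset): standardize observations as (obs - offset) * scale."""
        return 1.0 / (np.sqrt(self.vars) + 0.1), self.means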
Example #9
def main2(env_name, num_episodes, gamma, lam, kl_targ, batch_size,
          net_size_factor, noise_bias, weight, use_ppoclip):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update, D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
    """
    global alive_coef, progress_coef, threshold1, threshold2, change_rate
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    now = datetime.now().strftime(
        "%b-%d_%H:%M:%S") + "_multi_hop_{},{},{}".format(
            change_rate, threshold1, threshold2)
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    if weight == "None":
        val_func = NNValueFunction(obs_dim,
                                   net_size_factor=net_size_factor,
                                   alive_coef=alive_coef,
                                   progress_coef=progress_coef,
                                   reward_dim=reward_dim)
        policy = Policy(obs_dim,
                        act_dim,
                        kl_targ,
                        net_size_factor=net_size_factor,
                        noise_bias=noise_bias)

    else:
        token = weight.split(".")
        token[-3] = token[-3][:-5] + "value"
        weight_2 = ".".join(token)
        # assert False, "unreachable"
        val_func = NNValueFunctionContinue(weight_2,
                                           obs_dim,
                                           net_size_factor=net_size_factor,
                                           alive_coef=alive_coef,
                                           progress_coef=progress_coef)
        policy = PolicyContinue(weight,
                                obs_dim,
                                act_dim,
                                kl_targ,
                                net_size_factor=net_size_factor,
                                noise_bias=noise_bias)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    flag1 = False
    flag2 = False
    flag3 = False
    reward_queue = []
    queue_num = 100
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger,
                      scaler)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

        alive_sum = 0
        progr_sum = 0
        for t in trajectories:
            tmp_rewards = t['orig_rewards']
            tmp_rewards = np.sum(tmp_rewards, axis=0)
            alive_sum += tmp_rewards[0]
            progr_sum += tmp_rewards[1]
        reward_queue.append(np.mean([t['rewards'].sum()
                                     for t in trajectories]))
        reward_queue = reward_queue[-queue_num:]
        reward_std = np.std(np.array(reward_queue))

        print("Reward std by {} episode : {}".format(queue_num, reward_std))

        if alive_sum >= 5000:
            flag3 = True

        if (flag3 and alive_sum > progr_sum * threshold1) or flag1:
            flag1 = True
            alive_coef -= change_rate
            progress_coef += change_rate
            val_func.alive_coef = float(alive_coef)
            val_func.progress_coef = float(progress_coef)
            if alive_sum < progr_sum * threshold2:
                flag1 = False

        if progr_sum > alive_sum * threshold1 or flag2:
            flag2 = True
            alive_coef += change_rate
            progress_coef -= change_rate
            val_func.alive_coef = float(alive_coef)
            val_func.progress_coef = float(progress_coef)

            if progr_sum < alive_sum * threshold2:
                flag2 = False

        print(alive_sum, progr_sum)

        logger.log_model_3({
            "alive_coef": alive_coef,
            "progress_coef": progress_coef,
            "alive_sum": alive_sum,
            "progr_sum": progr_sum
        })

    logger.close()
    policy.close_sess()
    val_func.close_sess()
Example #10
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size,hid1_mult,
         policy_logvar, weights_path, init_episode, experiment_name, resume, augment=False):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update, D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    logger = Logger(logname=env_name, sub_dir=experiment_name)
    aigym_path = os.path.join('results', env_name, experiment_name)

    if resume:
        weights_path = aigym_path
        ckpt = tf.train.get_checkpoint_state(weights_path)
        init_episode = int(os.path.basename(ckpt.model_checkpoint_path).split('-')[1])

    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim = 45
    # obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

    # env = wrappers.Monitor(env, aigym_path, force=True)
    if augment:
        obs_dim *= 2
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar, weights_path)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, 5, augment)
    episode = init_episode
    while episode <= num_episodes:
        if episode % 1000 == 0:
            # record one episode
            record(env_name, aigym_path, policy, scaler, augment)
            policy.save(aigym_path, episode)
        trajectories = run_policy(env, policy, scaler, logger, batch_size, augment)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    #record one last episode
    record(env_name, aigym_path, policy, scaler, augment)
    logger.close()
    policy.close_sess()
    val_func.close_sess()
Example #11
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar):
    '''
    '''
    ##################
    #  shared policy #
    ##################

    tic = time.perf_counter()  # time.clock() was removed in Python 3.8

    manager = MPManager()
    manager.start()

    shared_env, shared_obs_dim, shared_act_dim = init_gym(env_name)
    shared_obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    shared_logger = Logger(logname=env_name, now=now + "-Master")
    shared_aigym_path = os.path.join('./video', env_name, now + "-Master")
    #env = wrappers.Monitor(env, aigym_path, force=True)
    shared_scaler = Scaler(shared_obs_dim)

    shared_val_func = NNValueFunction(shared_obs_dim, hid1_mult, -1, None)
    shared_policy = Policy(shared_obs_dim, shared_act_dim, kl_targ, hid1_mult,
                           policy_logvar, -1, None)

    learning_rate_input = tf.placeholder("float")
    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=RMSP_ALPHA,
                                  momentum=0.0,
                                  epsilon=RMSP_EPSILON,
                                  clip_norm=GRAD_NORM_CLIP,
                                  device=device)

    # local policy declarations
    env_a = [None] * N_WORKERS
    obs_dim_a = [None] * N_WORKERS
    act_dim_a = [None] * N_WORKERS
    logger_a = [None] * N_WORKERS
    aigym_path_a = [None] * N_WORKERS
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    val_func_a = [None] * N_WORKERS
    policy_a = [None] * N_WORKERS
    scaler_a = [None] * N_WORKERS
    for i in range(N_WORKERS):
        env_a[i], obs_dim_a[i], act_dim_a[i] = init_gym(env_name)
        obs_dim_a[i] += 1  # add 1 to obs dimension for time step feature (see run_episode())
        logger_a[i] = Logger(logname=env_name, now=now + "-" + str(i))
        aigym_path_a[i] = os.path.join('./video', env_name, now + "-" + str(i))
        #env_a[i] = wrappers.Monitor(env, aigym_path, force=True)
        scaler_a[i] = Scaler(obs_dim_a[i])

        val_func_a[i] = NNValueFunction(obs_dim_a[i], hid1_mult, i,
                                        shared_val_func)
        val_func_a[i].apply_gradients = grad_applier.apply_gradients(
            shared_val_func.get_vars(), val_func_a[i].gradients)

        policy_a[i] = Policy(obs_dim_a[i], act_dim_a[i], kl_targ, hid1_mult,
                             policy_logvar, i, shared_policy)
        policy_a[i].apply_gradients = grad_applier.apply_gradients(
            shared_policy.get_vars(), policy_a[i].gradients)

    # init tensorflow
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            allow_soft_placement=True))
    init = tf.global_variables_initializer()

    ## start sess
    sess.run(init)

    ## run a few episodes to initialize the shared scaler
    run_policy(sess,
               shared_env,
               shared_policy,
               shared_scaler,
               shared_logger,
               episodes=5)

    def single_work(thread_idx):
        """ Per-worker training loop

        Args:
            thread_idx: index of the worker thread; selects its env, policy,
                value function, scaler, and logger from the lists built above
        """
        env = env_a[thread_idx]
        policy = policy_a[thread_idx]
        #obs_dim = obs_dim_a[thread_idx]
        #act_dim = act_dim_a[thread_idx]
        logger = logger_a[thread_idx]
        aigym_path = aigym_path_a[thread_idx]
        scaler = scaler_a[thread_idx]
        val_func = val_func_a[thread_idx]

        print("=== start thread " + str(policy.get_thread_idx()) + " " +
              policy.get_scope() + " ===")
        print(shared_policy.get_vars())
        print(policy.get_vars())

        # run a few episodes of untrained policy to initialize scaler:
        #run_policy(sess, env, policy, scaler, logger, episodes=5)

        #policy.sync(shared_policy)
        #val_func.sync(shared_val_func)
        episode = 0

        while episode < num_episodes:

            ## copy global var into local
            sess.run(policy.sync)
            sess.run(val_func.sync)

            ## compute new model on local policy
            trajectories = run_policy(sess,
                                      env,
                                      policy,
                                      scaler,
                                      logger,
                                      episodes=batch_size)
            episode += len(trajectories)
            add_value(sess, trajectories,
                      val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories,
                             gamma)  # calculated discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(
                trajectories)
            # add various stats to training log:
            log_batch_stats(observes, actions, advantages, disc_sum_rew,
                            logger, episode,
                            time.perf_counter() - tic)

            policy.update(sess, observes, actions, advantages,
                          logger)  # update policy
            val_func.fit(sess, observes, disc_sum_rew,
                         logger)  # update value function

            #cur_learning_rate = self._anneal_learning_rate(global_t)
            feed_dict = {
                policy.old_log_vars_ph: policy.old_log_vars_np,
                policy.old_means_ph: policy.old_means_np,
                policy.obs_ph: observes,
                policy.act_ph: actions,
                policy.advantages_ph: advantages,
                policy.beta_ph: policy.beta,
                policy.lr_ph: policy.lr,
                policy.eta_ph: policy.eta,
                learning_rate_input: policy.lr
            }

            sess.run(policy.apply_gradients, feed_dict)

            shared_policy.update(sess, observes, actions, advantages,
                                 shared_logger)

            feed_dict = {
                val_func.obs_ph: observes,
                val_func.val_ph: disc_sum_rew,
                learning_rate_input: val_func.lr
            }

            sess.run(val_func.apply_gradients, feed_dict)

            shared_val_func.fit(sess, observes, disc_sum_rew, shared_logger)

            shared_logger.log({'_Time': time.perf_counter() - tic})

            logger.write(
                display=True)  # write logger results to file and stdout

        logger.close()

    ## end def single work

    train_threads = []
    for i in range(N_WORKERS):
        train_threads.append(threading.Thread(target=single_work, args=(i, )))

    [t.start() for t in train_threads]
    [t.join() for t in train_threads]

    saver = tf.train.Saver()
    for i in range(N_WORKERS):
        logger_a[i].close()

    #path = os.path.join('log-files', env_name, now+'-Master', 'checkpoint')
    #saver.save(sess, path )

    sess.close()
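Example #12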
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, task_identity):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update, D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    print('Training started for ' + env_name + ' and task_identity ' +
          str(task_identity))

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name=env_name,
                                     task_identity=task_identity)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    env_name, task_identity)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

    scale, offset = scaler.get()
    #scale_and_offset_data = {'scale': scale, 'offset': offset}
    #scale_and_offset_file = 'scale_and_offset_file_' + env_name + '_' + task_identity + '.pkl'
    #with open(scale_and_offset_file, 'wb') as f:
    #    pickle.dump(scale_and_offset_data, f)
    #### Save expert trajectories after sufficient training has been done
    ## Visualization
    #aigym_path = os.path.join(VIDEO_LOGS_DIRECTORY, env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    trajectories = run_policy(env,
                              policy,
                              scaler,
                              logger,
                              episodes=DEMONSTRATOR_EPISODES_TO_LOG)
    data_to_store = {
        DEMONSTRATOR_TRAJECTORY_KEY: trajectories,
        SCALE_KEY: scale,
        OFFSET_KEY: offset
    }
    directory_to_store_trajectories = './../' + DEMONSTRATOR_TRAJECTORIES_DIRECTORY
    if not os.path.exists(directory_to_store_trajectories):
        os.makedirs(directory_to_store_trajectories)
    file_to_store_trajectories = directory_to_store_trajectories + env_name + '_' + task_identity + '.pkl'
    with open(file_to_store_trajectories, "wb") as f:
        pickle.dump(data_to_store, f)

    logger.close()
    policy.close_sess()
    val_func.close_sess()
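Example #12 pickles the demonstrator trajectories together with the scaler's scale and offset. A hypothetical loader that mirrors the dump above; the *_KEY names are the same assumed project constants used when saving:

import pickle

def load_demonstrations(env_name, task_identity, directory):
    """Load demonstrator trajectories plus the scaler statistics saved above."""
    # DEMONSTRATOR_TRAJECTORY_KEY, SCALE_KEY, OFFSET_KEY are assumed to be
    # imported from the same module that wrote the pickle file.
    path = directory + env_name + '_' + task_identity + '.pkl'
    with open(path, 'rb') as f:
        data = pickle.load(f)
    return (data[DEMONSTRATOR_TRAJECTORY_KEY],
            data[SCALE_KEY],
            data[OFFSET_KEY])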
Example #13
def train_models(env_name, num_episodes, gamma, lam, kl_targ, coef,
                 use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs,
                 max_timesteps, reg_scale, phi_lr, phi_hs, policy_size,
                 phi_obj, load_model):

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    epochs,
                    phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)

    run_policy(env,
               policy,
               scaler,
               num_episodes,
               max_timesteps=max_timesteps,
               mode=load_model)  # run a few to init scaler

    episode = 0
    for i in range(2000):
        print("sampling and training at %s iteration\n" % (i))
        trajectories, traj_len_list = run_policy(env,
                                                 policy,
                                                 scaler,
                                                 num_episodes,
                                                 max_timesteps=max_timesteps,
                                                 mode=load_model)

        num_traj = len(trajectories)

        episode += len(trajectories)
        add_value(trajectories, val_func)
        add_disc_sum_rew(trajectories, gamma)
        add_gae(trajectories, gamma, lam)

        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)

        policy.update(load_model,
                      observes,
                      actions,
                      advantages,
                      use_lr_adjust,
                      ada_kl_penalty,
                      c=0.)  # update policy
        val_func.fit(observes, disc_sum_rew)

    # Save models
    policy.save_policy()
    val_func.save_val_func()
    refine_scaler = False
    if refine_scaler:
        run_policy(env,
                   policy,
                   scaler,
                   num_episodes,
                   max_timesteps=max_timesteps,
                   mode=load_model)  # run a few to refine scaler
    with open('models/scaler/scaler.pkl', 'wb') as output:
        pickle.dump(scaler, output, pickle.HIGHEST_PROTOCOL)
    logger.log("saved model")
Example #14
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, **kwargs):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update, D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    memory = deque([])
    memory_size = kwargs['memory_size']
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    target_policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult,
                           policy_logvar)  # kl_targ = 0?
    explore_policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult,
                            policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, target_policy, scaler, logger, episodes=5, fix_drct_dist=0)
    run_policy(env,
               explore_policy,
               scaler,
               logger,
               episodes=5,
               fix_drct_dist=0)
    episode = 0
    fix_drct_dist_range = (0.3, 0)

    while episode < num_episodes:
        # save model
        if episode % 200 == 0:
            save_path = target_policy.saver.save(
                target_policy.sess,
                "/home/csc63182/testspace/models/halfcheetah-trpo/model-%d.ckpt"
                % (episode))

        # run a few episodes
        fix_drct_dist = (
            (episode * fix_drct_dist_range[1]) +
            (num_episodes - episode) * fix_drct_dist_range[0]) / num_episodes
        target_trajectories = run_policy(env,
                                         target_policy,
                                         scaler,
                                         logger,
                                         episodes=batch_size,
                                         fix_drct_dist=0)
        explore_trajectories = run_policy(env,
                                          explore_policy,
                                          scaler,
                                          logger,
                                          episodes=batch_size,
                                          fix_drct_dist=fix_drct_dist)

        # Add to memory
        n_explore = max(0, int(batch_size * (1 - episode / num_episodes)) - 1)
        trajectories = target_trajectories + explore_trajectories[:n_explore]
        episode += batch_size
        memory += trajectories
        while len(memory) > memory_size:
            memory.popleft()

        # train explore network
        add_value(explore_trajectories,
                  val_func)  # add estimated values to episodes
        add_disc_sum_rew(explore_trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(explore_trajectories, gamma, lam)  # calculate advantage
        observes, actions, advantages, disc_sum_rew = build_train_set(
            explore_trajectories)
        explore_policy.update(observes, actions, advantages,
                              logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function

        # train target network
        # re-sample trajectories
        trajectories = sample(memory, batch_size)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        target_policy.update(observes, actions, advantages,
                             logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    with open('rewards_%s.txt' % kwargs['log_postfix'], 'w') as f:
        for reward in rewards_record:
            f.write('%f\n' % reward)
    plt.plot((np.arange(len(rewards_record)) + 1) * batch_size, rewards_record)
    plt.savefig('learning_curve_%s.png' % kwargs['log_postfix'])
    logger.close()
    explore_policy.close_sess()
    target_policy.close_sess()
    val_func.close_sess()
Example #15
def main(env_name, num_episodes, gamma, lamda, kl_targ, batch_size, hid1_mult, init_pol_logvar, animate,\
        save_video, num_episodes_sim, task_params, task_name):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lamda: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update, D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        init_pol_logvar: natural log of initial policy variance
        save_video: Boolean determining if videos of the agent will be saved
        num_episodes_sim: Number of episodes to simulate/save videos for
        task_params: list of parameters to modify each environment for a different task
        task_name: name user assigns to the task being used to modify the environment
    """

    # ****************  Environment Initialization and Paths  ***************
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

    # Paths
    print("\n\n---- PATHS: ----")
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)  # logger object
    aigym_path = os.path.join('./videos', env_name, task_name,
                              now)  # videos folders
    agent_path = os.path.join('agents', env_name,
                              now)  # agent / policy folders
    os.makedirs(agent_path)
    print("Path for Saved Videos: {}".format(aigym_path))
    print("Path for Saved Agents: {}\n".format(agent_path))

    # Initialize Policy, Value Networks and Scaler
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, init_pol_logvar)
    run_policy(env, policy, scaler, logger,
               episodes=5)  # run some episodes to initialize scaler

    # Start Training
    animate = (animate == "True")
    save_video = (save_video == "True")
    saver_perc = int(num_episodes * 0.02)  # determines when the agent and video should be saved
    saver_offset = saver_perc
    killer = GracefulKiller()
    episode = 0

    while episode < num_episodes:

        # Obtain 'batch_size' trajectories and add additional intermediate calculations
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size,
                                  animate=animate)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lamda)  # calculate advantage

        # Concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)

        # Logging Stats
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)

        # Update Policy and Value Networks
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout

        # Store Policy, Value Network and Scaler: every 2% of total episodes or in the first/last episode
        if episode >= saver_offset or episode >= num_episodes or episode <= batch_size or killer.kill_now:
            # TODO: Make saving agent/video a method so that it can be called in killer.kill_now
            saver_offset += saver_perc
            policy.tf_saver.save(policy.sess, "{}/policy_ep_{}".format(
                agent_path, episode))  # Save Policy Network
            val_func.tf_saver.save(val_func.sess, "{}/val_func_ep_{}".format(
                agent_path, episode))  # Save Value Network
            pickle.dump(
                scaler,
                open("{}/scaler_ep_{}.p".format(agent_path, episode), 'wb'))
            print("---- Saved Agent at Episode {} ----".format(episode))

            # Save video of current agent/policy
            if save_video:
                print("---- Saving Video at Episode {} ----".format(episode))
                _ = sim_agent(
                    env,
                    policy,
                    scaler,
                    num_episodes_sim,
                    save_video=True,
                    out_dir=aigym_path +
                    "/vid_ep_{}/{}_{}".format(episode, task_name, task_params))
                env.close()  # closes window open by monitor wrapper
                env, _, _ = init_gym(
                    env_name
                )  # Recreate env as it is killed when saving videos
            print("\n\n")

            # If Ctrl + C is pressed, ask the user if training shall be terminated
            if killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                killer.kill_now = False

    # Terminate Sessions
    env.close()
    logger.close()
    policy.close_sess()
    val_func.close_sess()
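
# --------------------------------------------------------------------------
# Illustrative sketch (not code from the example above): add_disc_sum_rew()
# and add_gae() are assumed to implement the usual discounted-return and
# Generalized Advantage Estimation recursions over trajectories stored as
# dicts of NumPy arrays ('rewards', 'values'). A minimal version might be:

import numpy as np


def discount(x, gamma):
    """Discounted cumulative sum: y[t] = x[t] + gamma * y[t + 1]."""
    y = np.zeros_like(x, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        y[t] = running
    return y


def add_disc_sum_rew_sketch(trajectories, gamma):
    """Attach the discounted return of each trajectory under 'disc_sum_rew'."""
    for traj in trajectories:
        traj['disc_sum_rew'] = discount(traj['rewards'], gamma)


def add_gae_sketch(trajectories, gamma, lam):
    """Attach GAE advantages under 'advantages' (terminal value taken as 0)."""
    for traj in trajectories:
        rewards, values = traj['rewards'], traj['values']
        deltas = rewards + gamma * np.append(values[1:], 0.0) - values  # TD residuals
        traj['advantages'] = discount(deltas, gamma * lam)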
Example #16
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size,
         net_size_factor, noise_bias, weight, use_ppoclip):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    now = datetime.now().strftime("%b-%d_%H:%M:%S") + "_single"
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    if weight == "None":
        val_func = NNValueFunction(obs_dim, net_size_factor=net_size_factor)
        policy = None
        if use_ppoclip == "False":
            policy = Policy(obs_dim,
                            act_dim,
                            kl_targ,
                            net_size_factor=net_size_factor,
                            noise_bias=noise_bias)
        elif use_ppoclip == "True":
            policy = PolicyClip(obs_dim,
                                act_dim,
                                kl_targ,
                                net_size_factor=net_size_factor,
                                noise_bias=noise_bias)
            #assert False, "Not tested"
        else:
            assert False, "Unreachable"
    else:
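        # Resume training from a saved policy weight file; the matching value-
        # network weight file is assumed to use the same name except that the
        # last five characters of the third-to-last dot-separated field are
        # replaced with "value"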
        token = weight.split(".")
        token[-3] = token[-3][:-5] + "value"
        weight_2 = ".".join(token)
        val_func = NNValueFunctionContinue(weight_2,
                                           obs_dim,
                                           net_size_factor=net_size_factor)
        policy = PolicyContinue(weight,
                                obs_dim,
                                act_dim,
                                kl_targ,
                                net_size_factor=net_size_factor,
                                noise_bias=noise_bias)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger,
                      scaler)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    # with open("test_dump", 'w') as f:
    #     pickle.dump(policy, f)
    policy.close_sess()
    val_func.close_sess()
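
# --------------------------------------------------------------------------
# Illustrative sketch (an assumption, not code from the example above): the
# GracefulKiller used throughout these examples is presumably a small signal
# handler that flips a flag on SIGINT/SIGTERM so the training loop can stop
# at a safe point:

import signal


class GracefulKillerSketch:
    """Set kill_now=True when the process receives SIGINT or SIGTERM."""

    def __init__(self):
        self.kill_now = False
        signal.signal(signal.SIGINT, self._exit_gracefully)
        signal.signal(signal.SIGTERM, self._exit_gracefully)

    def _exit_gracefully(self, signum, frame):
        self.kill_now = True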
Example #17
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, print_results, risk_targ):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        print_results: Boolean; if True, save learning-curve plots each batch and a final reward histogram over 1000 evaluation episodes
        risk_targ: risk target passed to the CVaR-constrained Policy
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now_utc = datetime.utcnow()  # create unique directories
    now = str(now_utc.day) + '-' + now_utc.strftime('%b') + '-' + str(
        now_utc.year) + '_' + str(
            ((now_utc.hour - 4) % 24)) + '.' + str(now_utc.minute) + '.' + str(
                now_utc.second)  # adjust for Montreal Time Zone
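    # NOTE: the fixed -4 hour offset corresponds to EDT and is not adjusted
    # for daylight saving time changes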
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    risk_targ, 'CVaR', batch_size, 1)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    kl_terms = np.array([])
    beta_terms = np.array([])
    if print_results:
        rew_graph = np.array([])
        mean_rew_graph = np.array([])
    #big_li_rew_nodisc0 = np.array([])
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        #predicted_values_0 = [t['values'][0] for t in trajectories]
        add_disc_sum_rew(
            trajectories, gamma, scaler.mean_rew,
            np.sqrt(scaler.var_rew))  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam, scaler.mean_rew,
                np.sqrt(scaler.var_rew))  # calculate advantage
        nodisc0 = -0.0001 * np.array(
            [t['rewards'].sum() for t in trajectories])  # scaled for gradients
        disc0 = [t['disc_sum_rew'][0] for t in trajectories]
        print('scaled sum rewards', nodisc0)
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        lamb = policy.update(observes, actions, advantages, nodisc0,
                             logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
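        # Diagnostic plots: per-update KL divergence and the beta Lagrange
        # multiplier of the CVaR constraint; each figure file is overwritten
        # on every batch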
        kl_terms = np.append(kl_terms, policy.check_kl)
        x1 = list(range(1, (len(kl_terms) + 1)))
        rewards = plt.plot(x1, kl_terms)
        plt.title('RAPPO')
        plt.xlabel("Episode")
        plt.ylabel("KL Divergence")
        plt.savefig("KL_curve.png")
        plt.close("KL_curve.png")
        beta_terms = np.append(beta_terms, policy.beta)
        x2 = list(range(1, (len(beta_terms) + 1)))
        mean_rewards = plt.plot(x2, beta_terms)
        plt.title('RAPPO')
        plt.xlabel("Batch")
        plt.ylabel("Beta Lagrange Multiplier")
        plt.savefig("lagrange_beta_curve.png")
        plt.close("lagrange_beta_curve.png")
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        if print_results:
            rew_graph = np.append(rew_graph, disc0)
            x1 = list(range(1, (len(rew_graph) + 1)))
            rewards = plt.plot(x1, rew_graph)
            plt.title('RAPPO')
            plt.xlabel("Episode")
            plt.ylabel("Discounted sum of rewards")
            plt.savefig("learning_curve.png")
            plt.close()
            mean_rew_graph = np.append(mean_rew_graph, np.mean(disc0))
            x2 = list(range(1, (len(mean_rew_graph) + 1)))
            mean_rewards = plt.plot(x2, mean_rew_graph)
            plt.title('RAPPO')
            plt.xlabel("Batch")
            plt.ylabel("Mean of Last Batch")
            plt.savefig("learning_curve2.png")
            plt.close()
    if print_results:
        tr = run_policy(env, policy, scaler, logger, episodes=1000)
        sum_rewww = [t['rewards'].sum() for t in tr]
        hist_dat = np.array(sum_rewww)
        fig = plt.hist(hist_dat, bins=2000, edgecolor='b', linewidth=1.2)
        plt.title('RAPPO')
        plt.xlabel("Sum of Rewards")
        plt.ylabel("Frequency")
        plt.savefig("RA_ppo.png")
        plt.close()
        with open('sum_rew_final_policy.pkl', 'wb') as f:
            pickle.dump(sum_rewww, f)
        logger.final_log()
    logger.close()
    policy.close_sess()
    val_func.close_sess()
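
# --------------------------------------------------------------------------
# Illustrative sketch (an assumption, not code from the example above): in
# this variant add_disc_sum_rew() also receives the running reward mean and
# standard deviation from the Scaler, presumably so rewards are normalized
# before being discounted. A minimal version of that idea:

import numpy as np


def normalized_disc_sum_rew_sketch(rewards, gamma, mean_rew, std_rew):
    """Discounted sum of rewards normalized by running reward statistics."""
    normed = (rewards - mean_rew) / (std_rew + 1e-8)
    out = np.zeros_like(normed, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(normed))):
        running = normed[t] + gamma * running
        out[t] = running
    return out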
Example #18
def main(env_name, num_episodes, gamma, lamda, kl_targ, clipping_range, pol_loss_type, batch_size, init_pol_logvar, animate,\
        save_video, save_rate, num_episodes_sim, task_params, task_name, dims_core_hid, dims_head_hid, act_func_name,\
        episode_to_load, now_to_load):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lamda: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        clipping_range: max value to clip the policy gradient ratio
        pol_loss_type: string determining which type of loss to use for the Policy Network
        batch_size: number of episodes per policy training batch
        init_pol_logvar: natural log of initial policy variance
        animate: Boolean determining if episodes are rendered during training
        save_video: Boolean determining if videos of the agent will be saved
        save_rate: int determining how often (in episodes) the agent and videos are saved
        num_episodes_sim: number of episodes to simulate/save videos for
        task_params: list of parameters to modify each environment for a different task
        task_name: name user assigns to the task being used to modify the environment
        dims_core_hid: list of hidden-layer sizes for the shared core network
        dims_head_hid: list of hidden-layer sizes for the task-specific heads
        act_func_name: activation function name (unused in this snippet)
        episode_to_load: checkpoint episode to resume from (-1 for the latest saved episode; None to train from scratch)
        now_to_load: date-time string of the experiment to resume from (required when episode_to_load is given)
    """


    # ****************  Environment Initialization and Paths  ***************
    task_params_str = ''.join(str(e) +', ' for e in task_params)
    num_tasks = len(task_params)
    envs = [None]*num_tasks
    scalers = [None]*num_tasks
    loggers = [None]*num_tasks

    print ("\n\n------ PATHS: ------")
    start_time = datetime.now()
    if episode_to_load == None: now = start_time.strftime("%b-%d_%H:%M:%S") # If NOT loading from Checkpoint -> used to  create unique directories
    else: 
        assert now_to_load != None,\
            "\n\nWARNING: Date time to load ({}) was not provided. Please provide a valid date time of an experiment".format(now_to_load)
        now = now_to_load
    logs_path = os.path.join('log-files', env_name, task_name, task_params_str, now)

    for task in range(num_tasks):
        # Create task specific environment 
        envs[task], obs_dim, act_dim = init_gym(env_name, task_param = task_params[task])
        obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

        # Create task specific Paths and logger object
        loggers[task] = Logger(logname= [env_name, task_name, task_params_str], now=now, \
                               logname_file= "_{}_{}".format(task_name, task_params[task])) 

        if episode_to_load == None: # If NOT loading from Checkpoint
            scalers[task] = Scaler(obs_dim)            

            # Auxiliary saver (because logger sometimes fails or takes too much time)
            with open(logs_path + '/aux_{}_{}.txt'.format(task_name, task_params[task]), 'w') as f: 
                f.write("_Episode" + "  " + "_MeanReward")

        
    aigym_path= os.path.join('./videos', env_name, task_name, task_params_str, now) # videos folders 
    agent_path = os.path.join('agents', env_name , task_name, task_params_str, now) # agent / policy folders  
    if episode_to_load == None: # If NOT loading from Checkpoint 
        os.makedirs(agent_path)
        with open(agent_path + '/commandline_args.txt', 'w') as f: f.write(' '.join(sys.argv[1:]))  # save commandline command
        with open(logs_path + '/commandline_args.txt', 'w') as f: f.write(' '.join(sys.argv[1:]))  # save commandline command

    print("\nPath for Saved Videos : {}".format(aigym_path)) 
    print("Path for Saved Agents: {}\n".format(agent_path))    


    # ****************  Initialize Policy, Value Networks and Scaler  ***************
    print ("\n\n------ NEURAL NETWORKS: ------")
    dims_core_hid.insert(0, obs_dim) # Modify dims list to have the size of the layer 'n-1' at position '0'
    dims_head_hid.insert(0, dims_head_hid[-1])
    
    val_func = NNValueFunction(obs_dim, dims_core_hid, dims_head_hid, num_tasks)#, act_func_name)
    policy = Policy(obs_dim, act_dim, dims_core_hid, dims_head_hid, num_tasks, pol_loss_type = pol_loss_type)

    # Load from Checkpoint:
    # Validate intended episode to load OR get last episode number if no target load episode was provided
    if episode_to_load != None:
        load_agent_path = agent_path # agent / policy folders
        saved_ep_list = [file.split(".")[0].split("_")[-1] for file in os.listdir(load_agent_path) if "policy" in file]

        if episode_to_load == -1: # Get last saved episode
            episode_to_load = sorted([int(ep_string) for ep_string in saved_ep_list])[-1]

        else: # Validate if episode_to_load was indeed saved 
            assert str(episode_to_load) in saved_ep_list,\
            "\n\nWARNING: Episode you want to load ({}) was not stored during training".format(episode_to_load)

        # Load Policy Network's Ops and Variables & Load Scaler Object
        policy.tf_saver.restore(policy.sess, "{}/policy_ep_{}".format(load_agent_path, episode_to_load)) 
        val_func.tf_saver.restore(val_func.sess, "{}/val_func_ep_{}".format(load_agent_path, episode_to_load))
        scalers = pickle.load(open("{}/scalers_ep_{}.p".format(load_agent_path, episode_to_load), 'rb'))         
        print("\n\n ---- CHECKPOINT LOAD:  Episoded Loaded **{}**".format(episode_to_load))

        # Delete extra epochs that were logged to the auxiliary logs
        for task in range(num_tasks):
            aux_log_path = logs_path + '/aux_{}_{}.txt'.format(task_name, task_params[task])
            aux_log = pd.read_table(aux_log_path, delim_whitespace=True)
            idx_to_cut = aux_log.index[aux_log["_Episode"] == episode_to_load ].tolist()[0]
            aux_log[0:idx_to_cut+1].to_csv(aux_log_path, header=True, index=False, sep=' ', mode='w') # overwrite trimmed aux_log


    # If NOT loading from Checkpoint: run some episodes to initialize scalers and create Tensor board dirs
    elif episode_to_load == None:
        for task in range(num_tasks): run_policy(envs[task], policy, scalers[task], loggers[task], episodes=5, task=task)  

        # Tensor Board writer
        os.makedirs(agent_path + '/tensor_board/policy')
        os.makedirs(agent_path + '/tensor_board/valFunc')

    tb_pol_writer = tf.summary.FileWriter(agent_path + '/tensor_board/policy', graph=policy.g)
    tb_val_writer = tf.summary.FileWriter(agent_path + '/tensor_board/valFunc', graph=val_func.g)


    # ****************  Start Training  ***************
    print ("\n\n------ TRAINNING: ------")
    animate = True if animate == "True" else False
    save_video = True if save_video == "True" else False
    saver_offset = save_rate
    killer = GracefulKiller()

    if episode_to_load == None: episode = 0
    else: episode = episode_to_load
    
    # Episode is counted across all tasks, i.e. N episodes indicates each task has been run N times
    while episode < num_episodes and not killer.kill_now:

        # ****************  Obtain data (train set)  ***************         
        observes_all = [None]*num_tasks
        actions_all = [None]*num_tasks
        advantages_all = [None]*num_tasks
        disc_sum_rew_all = [None]*num_tasks

        episode += batch_size
        for task in range(num_tasks):

            # Obtain 'batch_size' trajectories and add additional intermediate calculations
            trajectories = run_policy(envs[task],policy, scalers[task], loggers[task],episodes=batch_size,task=task,animate=animate)
            
            add_value(trajectories, val_func, task)  # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
            add_gae(trajectories, gamma, lamda)  # calculate advantage

            # Concatenate all episodes into single NumPy arrays
            observes_all[task], actions_all[task], advantages_all[task], disc_sum_rew_all[task] = build_train_set(trajectories)

            # Logging Stats
            log_batch_stats(observes_all[task], actions_all[task], advantages_all[task], disc_sum_rew_all[task], \
                            loggers[task], episode)

        # ****************  Update Policy and Value Networks  ***************
        print ("*************************************")
        for task in range(num_tasks):
            pol_summary = policy.update(task, observes_all[task], actions_all[task], advantages_all[task], loggers[task])  # update policy
            val_summary = val_func.fit(task, observes_all[task], disc_sum_rew_all[task], loggers[task])  # update value function
            # Auxiliary saver (because logger sometimes fails or takes too much time)
            with open(logs_path + '/aux_{}_{}.txt'.format(task_name, task_params[task]), 'a') as f: 
                f.write("\n" + str(loggers[task].log_entry['_Episode']) + "  " + str(loggers[task].log_entry['_MeanReward'])) 
            loggers[task].write(display=True)  # write logger results to file and stdout

            tb_pol_writer.add_summary(pol_summary, global_step=episode)
            tb_val_writer.add_summary(val_summary, global_step=episode)


        # ****************  Storing NN and Videos  ***************
        # Store Policy, Value Network and Scaler: every 'save_rate' episodes or in the first/last episode
        if episode >= saver_offset or episode >= num_episodes or episode <= batch_size or killer.kill_now:
        # TODO: Make saving agent/video a method so that it can be called in killer.kill_now 
            saver_offset += save_rate
            policy.tf_saver.save(policy.sess, "{}/policy_ep_{}".format(agent_path, episode)) # Save Policy Network
            val_func.tf_saver.save(val_func.sess, "{}/val_func_ep_{}".format(agent_path, episode)) # Save Value Network
            pickle.dump(scalers, open("{}/scalers_ep_{}.p".format(agent_path, episode), 'wb'))            
            print ("---- Saved Agent at Episode {} ----".format(episode))

            # Save video of current agent/policy
            if save_video: 
                print ("---- Saving Video at Episode {} ----".format(episode))
                for task in range(num_tasks):
                    print("Environment Wind: {}".format(envs[task].env.world.gravity))
                    _ = sim_agent(envs[task], policy, task, scalers[task], num_episodes_sim, save_video=True, 
                                    out_dir=aigym_path + "/vid_ep_{}/{}_{}".format(episode, task_name, task_params[task]))
                    envs[task].close() # closes window open by monitor wrapper
                    envs[task], _, _ = init_gym(env_name,task_param=task_params[task]) # Recreate env as it was killed
            print("\n\n")

            # If Ctrl + C is pressed, ask the user if training shall be terminated
            if killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                killer.kill_now = False

    # ****************  Terminate Variables  **************
    for task in range(num_tasks):
        envs[task].close()
        loggers[task].close()
    policy.close_sess()
    val_func.close_sess()

    # Save elapsed time
    end_time = datetime.now()
    elapsed_time = end_time - start_time
    delta_time = divmod(elapsed_time.days * 86400 + elapsed_time.seconds, 60)
    delta_str = "Elapsed Time: {} min {} seconds".format(delta_time[0], delta_time[1])
    # save elapsed time, 'a' to append not overwrite
    with open(agent_path + '/commandline_args.txt', 'a') as f: f.write('\n\n' + delta_str) 
    with open(logs_path + '/commandline_args.txt', 'a') as f: f.write('\n\n' + delta_str)  
Example #19
def main(num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar):
    """ Main training loop
    Args:
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym()
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, episodes=5)
    episode = 0
    # Initialize reward list (to keep track of improvements)
    avg_rew_list = []
    while episode < num_episodes:
        print(episode)
        trajectories = run_policy(env, policy, scaler, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        policy.update(observes, actions, advantages)  # update policy
        val_func.fit(observes, disc_sum_rew)  # update value function
        avg_rew_list.append(avg_rewards(trajectories))
        # Every 20000 episodes, save the models (value_func, policy, scaler) and the average rewards
        if not episode % 20000:
            print("Saving models")
            policy.save(episode)
            val_func.save(episode)
            f = open("models/scaler-" + str(episode) + ".pkl", 'wb')
            pickle.dump(scaler, f, pickle.HIGHEST_PROTOCOL)
            f.close()
            f2 = open("models/rewards-" + str(episode) + ".pkl", 'wb')
            pickle.dump(deepcopy(avg_rew_list), f2, pickle.HIGHEST_PROTOCOL)
            f2.close()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    # Show animation at the end of training (this loop runs until the process is interrupted)
    while True:
        obs = env.reset()
        step = 0.0
        scale, offset = scaler.get()
        scale[-1] = 1.0  # leave the time-step feature unscaled
        offset[-1] = 0.0
        done = False
        while not done:
            obs = obs.astype(np.float32).reshape((1, -1))
            obs = np.append(obs, [[step]], axis=1)
            obs = (obs - offset) * scale
            action = policy.sample(obs).reshape((1, -1)).astype(np.float32)
            obs, reward, done, _ = env.step(np.squeeze(action, axis=0))
            env.render1()
            env.render2()
            step += 1e-3
    policy.close_sess()
    val_func.close_sess()
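
# --------------------------------------------------------------------------
# Illustrative sketch (an assumption, not code from the example above): the
# Scaler used in these examples appears to keep running statistics of the
# observations and to expose get() as a (scale, offset) pair applied as
# (obs - offset) * scale. A minimal running mean/variance version could be:

import numpy as np


class ScalerSketch:
    def __init__(self, obs_dim):
        self.means = np.zeros(obs_dim)
        self.vars = np.zeros(obs_dim)
        self.count = 0

    def update(self, x):
        """Fold a batch of observations x (shape [n, obs_dim]) into the running stats."""
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        n = x.shape[0]
        new_count = self.count + n
        delta = batch_mean - self.means
        # Chan et al. parallel update of the pooled variance
        pooled = (self.vars * self.count + batch_var * n +
                  np.square(delta) * self.count * n / new_count)
        self.means = self.means + delta * n / new_count
        self.vars = pooled / new_count
        self.count = new_count

    def get(self):
        """Return (scale, offset) so that (obs - offset) * scale is roughly unit scale."""
        return 1.0 / (np.sqrt(self.vars) + 0.1), self.means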
Example #20
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    env_name = 'hsr'  # override the env_name argument (used only for the log/video paths)
    killer = GracefulKiller()
    #env, obs_dim, act_dim = init_gym(env_name)
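    # init_gym is bypassed in favour of a direct PR2 arm interface, so the
    # observation (10) and action (7) dimensions are hard-coded below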
    env = pr2_agent("r_arm")
    obs_dim = 10
    act_dim = 7
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=1)
    episode = 0
    #while episode < num_episodes:

    while episode < 30000:  # the num_episodes argument is ignored in favour of a hard-coded cap
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
Example #21
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, restore_path,
         out_path, thread_count, animation_mode, gait_name, gait_length,
         gaits_config_path, reward_mask, log_rewards, gait_reward_weight,
         g_colab, progress_reward_weight, phase_time_limit):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    # restore_path = os.path.abspath(restore_path)
    env, obs_dim, act_dim = init_gym(env_name)
    log_rewards = log_rewards or (num_episodes == 0)
    env_list = []
    if thread_count > 1:
        env_list, obs_dim, act_dim = init_gyms(env_name, batch_size)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    start_time = datetime.now()  # create unique directories
    start_time_str = start_time.strftime("%b-%d/%H.%M.%S")
    logger = Logger(logname=env_name, now=start_time_str, out_path=out_path)
    env.env.set_params(gaits_config_path=gaits_config_path,
                       gait_name=gait_name,
                       gait_cycle_len=gait_length,
                       out_path=logger.path,
                       log_rewards=log_rewards,
                       render_mode=animation_mode,
                       reward_mask=reward_mask,
                       contact_reward=gait_reward_weight,
                       g_colab=g_colab,
                       progress_weight=progress_reward_weight,
                       phase_time_limit=phase_time_limit)
    scaler = Scaler(obs_dim)

    val_func = NNValueFunction(obs_dim, logger, restore_path)
    policy = Policy(obs_dim, act_dim, kl_targ, logger, restore_path)

    log_train_info(logger, num_episodes, start_time_str, gait_name,
                   gait_length, batch_size, restore_path, reward_mask,
                   gait_reward_weight, progress_reward_weight,
                   phase_time_limit)

    # run a few episodes of untrained policy to initialize scaler:
    episode = 0
    try:
        if restore_path is None:
            print("\nInitializing scaler (may take some time)... ")
            run_policy(env, policy, scaler, logger, episodes=5)
            print("Done\n")
        else:
            scaler.load(restore_path, obs_dim)

        while episode < num_episodes:
            sim_time = datetime.now()
            if thread_count > 1:
                trajectories = run_policy_parallel(env_list,
                                                   policy,
                                                   scaler,
                                                   logger,
                                                   episodes=batch_size,
                                                   thread_num=thread_count)
            else:
                trajectories = run_policy(env,
                                          policy,
                                          scaler,
                                          logger,
                                          episodes=batch_size)
            sim_time = datetime.now() - sim_time

            episode += len(trajectories)
            add_value(trajectories,
                      val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories,
                             gamma)  # calculate discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(
                trajectories)
            # add various stats to training log:
            train_time = datetime.now() - start_time
            policy_time = datetime.now()
            policy.update(observes, actions, advantages,
                          logger)  # update policy
            policy_time = datetime.now() - policy_time
            val_time = datetime.now()
            val_func.fit(observes, disc_sum_rew,
                         logger)  # update value function
            val_time = datetime.now() - val_time

            log_batch_stats(observes, actions, advantages, disc_sum_rew,
                            logger, episode, train_time, sim_time, policy_time,
                            val_time)
            logger.write(
                display=True)  # write logger results to file and stdout
            print("Estimated time left: {}\n".format(
                estimate_time_left(episode, num_episodes, train_time)))

            if episode % 1000 == 0:
                policy.save()
                val_func.save()
                scaler.save(logger.path)
                print("Data saved at {}\n".format(logger.path))
                update_train_info(logger, episode)
                if animation_mode > 0:
                    run_policy(env,
                               policy,
                               scaler,
                               logger,
                               episodes=1,
                               animate=True,
                               anim_name='episode_{}'.format(episode))
            if episode % 5000 == 0:
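                # archive the rolling value/policy dumps under episode-stamped
                # names so that subsequent saves do not overwrite them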
                os.rename(
                    os.path.join(logger.path, 'value_dump'),
                    os.path.join(logger.path, 'value_dump_' + str(episode)))
                os.rename(
                    os.path.join(logger.path, 'policy_dump'),
                    os.path.join(logger.path, 'policy_dump_' + str(episode)))
                # if episode == 20000:
                #     reward_mask = 63
                #     env.env.set_params(gaits_config_path=gaits_config_path, gait_name=gait_name, gait_cycle_len=gait_length,
                #                        out_path=logger.path, log_rewards=log_rewards, render_mode=animation_mode,
                #                        reward_mask=reward_mask, contact_reward=gait_reward_weight, g_colab=g_colab)
                print("Progress Enabled")
            if killer.kill_now:
                # if input('Terminate training (y/[n])? ') == 'y':
                #     break
                # killer.kill_now = False
                break
    finally:
        if animation_mode > 0 or num_episodes == 0:
            print("Rendering result video")
            try:
                trajectories = run_policy(
                    env,
                    policy,
                    scaler,
                    logger,
                    episodes=1,
                    animate=True,
                    anim_name='final_episode_{}'.format(episode))
                # for walk analysis
                for t in trajectories:
                    logger.log_trajectory(t)
            except Exception as e:
                print("Failed to animate results, error: {}".format(e))
                raise e

        scaler.save(logger.path)
        policy.close_sess()
        val_func.close_sess()
        update_train_info(logger, episode)
        logger.close()
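
# --------------------------------------------------------------------------
# Illustrative sketch (an assumption, not code from the example above):
# estimate_time_left() presumably extrapolates the remaining wall-clock time
# linearly from the elapsed training time and the episode counters:

from datetime import timedelta


def estimate_time_left_sketch(episode, num_episodes, train_time):
    """Linear extrapolation of the remaining training time (a timedelta)."""
    if episode <= 0:
        return timedelta(0)
    remaining = train_time * (max(num_episodes - episode, 0) / episode)
    return timedelta(seconds=int(remaining.total_seconds()))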