Example #1
def train_models(env_name, num_episodes, gamma, lam, kl_targ, coef,
                 use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs,
                 max_timesteps, reg_scale, phi_lr, phi_hs, policy_size,
                 phi_obj, load_model):

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    epochs,
                    phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)
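
    # run the untrained policy for a few episodes to initialize the observation scaler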

    run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps)

    episode = 0
    for _ in range(200):
        trajectories, traj_len_list = run_policy(env,
                                                 policy,
                                                 scaler,
                                                 num_episodes,
                                                 max_timesteps=max_timesteps)

        num_traj = len(trajectories)

        episode += len(trajectories)
        add_value(trajectories, val_func)
        add_disc_sum_rew(trajectories, gamma)
        add_gae(trajectories, gamma, lam)

        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)

        policy.update(load_model,
                      observes,
                      actions,
                      advantages,
                      use_lr_adjust,
                      ada_kl_penalty,
                      c=0.)  # update policy
        val_func.fit(observes, disc_sum_rew)

    # Save models
    policy.save_policy()
    val_func.save_val_func()
    logger.log("saved model")
    def _build_graph(self):
        """ Build and initialize TensorFlow graph """
        self.g = tf.Graph()
        with self.g.as_default():
            self._placeholders()
            self._policy_nn()

            self._logprob()
            self._kl_entropy()
            self._sample()
            self._loss_train_op()
            self.init = tf.global_variables_initializer()

            # Save only policy parameters
            policy_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                scope='policy_nn')

            var_dict = {}
            for var in policy_vars:
                logger.log(var.name)
                var_dict[var.name] = var

            self._init_session()
            self.saver = tf.train.Saver(var_dict)
    def update(self, observes, actions, advantages, use_lr_adjust, ada_kl_penalty):
        """ Update policy based on observations, actions and advantages

        Args:
            observes: observations, shape = (N, obs_dim)
            actions: actions, shape = (N, act_dim)
            advantages: advantages, shape = (N,)
            use_lr_adjust: whether to adjust the learning rate based on KL
            ada_kl_penalty: whether to adapt the KL penalty coefficient beta
        """
        feed_dict = {self.obs_ph: observes,
                     self.act_ph: actions,
                     self.advantages_ph: advantages,
                     self.beta_ph: self.beta,
                     self.eta_ph: self.eta,
                     self.lr_ph: self.lr * self.lr_multiplier,
                     self.lr_phi_ph: self.lr_phi}
        old_means_np, old_log_vars_np = self.sess.run([self.means, self.log_vars],
                                                      feed_dict)
        feed_dict[self.old_log_vars_ph] = old_log_vars_np
        feed_dict[self.old_means_ph] = old_means_np
        loss, kl, entropy = 0, 0, 0
        
        if self.c_ph == 1.:
            # Update phi function & policy network
            logger.log("Training Phi for %d epochs"%self.phi_epochs)
            
            for _ in progressbar(range(self.phi_epochs), "Train Phi:", 25):
                self.sess.run(self.phi_train_op, feed_dict)
                phi_loss = self.sess.run(self.phi_loss, feed_dict)

            logger.record_tabular("Phi_loss", phi_loss)
        
        # Training policy
        logger.log("Training Policy for %d epochs"%self.epochs)
        for _ in progressbar(range(self.epochs), "Train Policy", 25):
            self.sess.run(self.train_op, feed_dict)
            loss, kl, entropy = self.sess.run([self.loss, self.kl, self.entropy], feed_dict)
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break

        if (ada_kl_penalty):
            if kl > self.kl_targ * 2:  # servo beta to reach D_KL target
                self.beta = np.minimum(35, 1.5 * self.beta)  # max clip beta
                if (use_lr_adjust):
                    if self.beta > 30 and self.lr_multiplier > 0.1:
                        self.lr_multiplier /= 1.5
            elif kl < self.kl_targ / 2:
                self.beta = np.maximum(1 / 35, self.beta / 1.5)  # min clip beta
                if (use_lr_adjust):
                    if self.beta < (1 / 30) and self.lr_multiplier < 10:
                        self.lr_multiplier *= 1.5

        logger.record_dicts({
            'PolicyLoss': loss,
            'PolicyEntropy': entropy,
            'KL': kl,
            'Beta': self.beta,
            '_lr_multiplier': self.lr_multiplier})
    def _loss_train_op(self):
      
        # get Phi function and its derivatives 
        phi_value, phi_act_g = self.phi(self.obs_ph, self.act_ph, reuse=False)
        self.phi_value = phi_value
        self.phi_act_g = phi_act_g
        self.phi_nn_vars = self.phi.phi_vars
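
        # Gradients of the Gaussian policy log-density log pi(a|s) with
        # respect to the mean and the variance:
        #   d log pi / d mu      = (a - mu) / sigma^2
        #   d log pi / d sigma^2 = -1/2 * (1/sigma^2 - (a - mu)^2 / sigma^4)
        # (sigma^2 = exp(log_vars)); these appear in ll_mean_g and
        # ll_log_vars_g below.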

        ll_mean_g = 1/tf.exp(self.log_vars) * (self.act_ph - self.means)
        ll_log_vars_g = -1/2 * ( 1/tf.exp(self.log_vars) \
                    - 1/tf.exp(self.log_vars) * \
                    (self.act_ph - self.means) * \
                    (self.act_ph - self.means) * \
                    1 / tf.exp(self.log_vars))

        self.phi_value.set_shape((None,))
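
        # Per-sample gradient terms with the Stein control variate: the
        # importance weight exp(logp - logp_old) times the score multiplied by
        # (advantage - c * phi), plus a correction built from d phi / d a.
        # With c_ph = 0 this reduces to the plain advantage-based estimator.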

        log_vars_inner = tf.expand_dims(tf.exp(self.logp - self.logp_old), 1) \
                        * (ll_log_vars_g * tf.expand_dims(self.advantages_ph 
                        - self.c_ph * self.phi_value, 1) \
                        + 1/2 * self.c_ph * ll_mean_g * self.phi_act_g )
            
        means_inner = tf.expand_dims(tf.exp(self.logp - self.logp_old), 1) \
                        * (ll_mean_g * tf.expand_dims(self.advantages_ph - 
                        self.c_ph * self.phi_value, 1) \
                        + self.c_ph * self.phi_act_g)
        
        loss1_log_vars = - tf.reduce_mean(
                        tf.stop_gradient(log_vars_inner) * \
                        tf.exp(self.log_vars)) 
        
        loss1_mean = -tf.reduce_mean(
                        tf.stop_gradient(means_inner) * \
                        self.means)
        
        loss1 = loss1_log_vars + loss1_mean
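
        # loss2: KL penalty weighted by the adaptive coefficient beta;
        # loss3: squared hinge that kicks in once the KL exceeds 2 * kl_targ.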
        
        loss2 = tf.reduce_mean(self.beta_ph * self.kl)
        
        loss3 = self.eta_ph * tf.square(\
                        tf.maximum(0.0, \
                        self.kl - 2.0 * self.kl_targ))

        self.loss = loss1 + loss2 + loss3
        
        optimizer = tf.train.AdamOptimizer(self.lr_ph)
        self.train_op = optimizer.minimize(self.loss, 
                        var_list= self.policy_nn_vars)

        
        if self.reg_scale > 0.:
            reg_variables = tf.get_collection(\
                    tf.GraphKeys.REGULARIZATION_LOSSES)
            
            reg_term = tf.contrib.layers.apply_regularization(
                        self.phi.kernel_regularizer, 
                        reg_variables)
        else:
            reg_term = 0.
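
        # Two objectives for training phi: 'FitQ' regresses phi(s, a) onto the
        # empirical advantages, while 'MinVar' minimizes the variance of the
        # per-sample gradient terms computed above.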

        if self.c_ph == 1.:
            if self.phi_obj == 'FitQ':
                self.phi_loss = tf.reduce_mean(\
                        tf.square(self.advantages_ph - \
                        self.phi_value), axis=0) + reg_term
            
                logger.log('phi with FitQ as objective function')
        
            elif self.phi_obj == 'MinVar':
                self.means_mse = tf.reduce_sum(\
                        tf.reduce_mean( \
                        tf.square(means_inner - \
                        tf.reduce_mean(means_inner, \
                        axis=0)), axis = 0))
            
                self.logstd_vars_mse = tf.reduce_sum(\
                        tf.reduce_mean( \
                        tf.square(log_vars_inner - \
                        tf.reduce_mean(log_vars_inner, \
                        axis=0)), axis = 0))
            
                self.phi_loss = self.means_mse + self.logstd_vars_mse + reg_term
                logger.log('phi with MinVar as objective function')
            
            else:
                raise NotImplementedError
            
            phi_optimizer = tf.train.AdamOptimizer(self.lr_phi_ph)      
            self.phi_train_op = phi_optimizer.minimize(\
                        self.phi_loss, 
                        var_list=self.phi_nn_vars)
            
        elif self.c_ph == 0.:
            logger.log("Training with PPO")            
            self.phi_train_op = tf.no_op()
    def _policy_nn(self):
        """ 
            Neural net for policy 
            approximation function
        """
        
        with tf.variable_scope("policy_nn"):
            # hidden layer sizes determined by obs_dim 
            # and act_dim (hid2 is geometric mean)
            if self.policy_size == 'small':
                logger.log("using small structure")
                
                hid1_size = self.obs_dim # * 10
                hid3_size = self.act_dim # * 10
                hid2_size = int(np.sqrt(hid1_size * hid3_size))
            
            elif self.policy_size == 'large':
                logger.log('Using large structure ')
                
                hid1_size = self.obs_dim * self.hid1_mult
                hid3_size = self.act_dim  * 10
                hid2_size = int(np.sqrt(hid1_size * hid3_size))
            else:
                raise NotImplementedError
            
            # heuristic to set learning rate based on NN size
            self.lr = 9e-4 / np.sqrt(hid2_size)  # 9e-4 empirically determined
            
            # 3 hidden layers with tanh activations
            out = tf.layers.dense(self.obs_ph,
                        hid1_size, tf.tanh,
                        kernel_initializer=tf.random_normal_initializer(
                        stddev=np.sqrt(1 / self.obs_dim)), name="h1")
            
            out = tf.layers.dense(out, 
                        hid2_size, tf.tanh,
                        kernel_initializer= \
                        tf.random_normal_initializer( \
                        stddev=np.sqrt(1 / hid1_size)),
                        name="h2")
            
            out = tf.layers.dense(out, 
                        hid3_size, tf.tanh,
                        kernel_initializer= \
                        tf.random_normal_initializer( \
                        stddev=np.sqrt(1 / hid2_size)), 
                        name="h3")
            
            self.means = tf.layers.dense(out, self.act_dim,
                        kernel_initializer= \
                        tf.random_normal_initializer( \
                        stddev=np.sqrt(1 / hid3_size)),
                        name="means")

            logvar_speed = (10 * hid3_size) // 48
            log_vars = tf.get_variable('logvars', 
                        (logvar_speed, self.act_dim), 
                        tf.float32,
                        tf.constant_initializer(0.0))


            self.log_vars = tf.reduce_sum(log_vars, axis=0) + self.policy_logvar

            self.policy_nn_vars = tf.get_collection(\
                    tf.GraphKeys.TRAINABLE_VARIABLES, 
                    scope='policy_nn')

            logger.log('Policy Params -- h1: {}, h2: {}, h3: {}, '
                       'lr: {:.3g}, logvar_speed: {}'.format(
                           hid1_size, hid2_size, hid3_size,
                           self.lr, logvar_speed))
def eval_models(env_name, num_episodes, gamma, lam, kl_targ, coef,
                use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs,
                max_timesteps, reg_scale, phi_lr, phi_hs, policy_size, phi_obj,
                load_model):

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    # scaler = Scaler(obs_dim)
    logger.log("loading scaler")
    with open('models/scaler/scaler.pkl', 'rb') as input:
        scaler = pickle.load(input)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    epochs,
                    phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)

    logger.log("loading model")
    load_dir = "models/"
    policy.load_model(load_dir)
    load_v = False  # whether to load the value function baseline or train it from scratch; little impact on the Stein estimator
    if load_v:
        val_func.load_val_model(load_dir)

    episode = 0

    trajectories, traj_len_list = run_policy(env,
                                             policy,
                                             scaler,
                                             num_episodes,
                                             max_timesteps=max_timesteps,
                                             mode=load_model)

    num_traj = len(trajectories)
    logger.log("Avg Length %d total Length %d"%( \
            np.mean(traj_len_list), \
            np.sum(traj_len_list)))

    episode += len(trajectories)

    # Split data into training and validation halves
    random.shuffle(trajectories)
    t_trajectories = trajectories[:int(len(trajectories) / 2)]
    v_trajectories = trajectories[int(len(trajectories) / 2):]

    refit_v = True  # whether to refit the value function baseline before evaluating; little impact on the Stein estimator
    if refit_v:
        tt_trajectories = copy.deepcopy(t_trajectories)
        add_value(tt_trajectories, val_func)
        add_disc_sum_rew(tt_trajectories, gamma)
        add_gae(tt_trajectories, gamma, lam)
        tt_observes, tt_actions, tt_advantages, tt_disc_sum_rew = build_train_set(
            tt_trajectories)
        logger.log("refit value function baseline")
        val_func.fit(tt_observes, tt_disc_sum_rew)  # update value function
        logger.log("done")

    # build training data after refit v
    add_value(t_trajectories, val_func)
    add_disc_sum_rew(t_trajectories, gamma)
    add_gae(t_trajectories, gamma, lam)
    t_observes, t_actions, t_advantages, t_disc_sum_rew = build_train_set(
        t_trajectories)

    # build validation data after refit v
    add_value(v_trajectories, val_func)
    add_disc_sum_rew(v_trajectories, gamma)
    add_gae(v_trajectories, gamma, lam)
    v_observes, v_actions, v_advantages, v_disc_sum_rew = build_train_set(
        v_trajectories)

    sub_folder = "max_timesteps=%s_eval_data/%s_%s_data_seed=%d_max-steps=%d"%(\
                        max_timesteps, env_name, phi_obj,
                        seed, max_timesteps)
    if not os.path.exists(sub_folder):
        os.makedirs(sub_folder)

    # save original gradient
    mc_grad_info = policy.get_batch_gradient(v_observes,
                                             v_actions,
                                             v_advantages,
                                             c=0.)
    mc_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/mc_num_episode=%d.pkl' % (num_episodes),
              'wb') as fp:
        pickle.dump(mc_grad_info, fp)
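
    # Fit phi on the training half (c=1 turns on the Stein control variate),
    # then compare gradients on the held-out validation half below.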

    d = Dataset(dict(ob=t_observes,
                     ac=t_actions,
                     atarg=t_advantages,
                     vtarg=t_disc_sum_rew),
                shuffle=True)
    for _ in range(phi_epochs):  # optim_epochs
        for batch in d.iterate_once(128):  # optim_batchsize
            policy.update(load_model,
                          batch['ob'],
                          batch['ac'],
                          batch['atarg'],
                          use_lr_adjust,
                          ada_kl_penalty,
                          c=1)  # update policy

    stein_grad_info = policy.get_batch_gradient(v_observes, \
                    v_actions, v_advantages, c=1.)

    stein_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/stein_num_episode=%d.pkl' % (num_episodes),
              'wb') as fp:
        pickle.dump(stein_grad_info, fp)
Example #7
    def _loss_train_op(self):

        # get Phi function and its derivatives
        if self.type == 'stein':
            phi_value, phi_act_g = self.phi(self.obs_ph, self.act_ph, reuse=False)
        elif self.type == 'state':
            phi_value, phi_act_g = self.phi(self.obs_ph, reuse=False)
        else:
            raise NotImplementedError
        self.phi_value = phi_value
        self.phi_act_g = phi_act_g
        self.phi_nn_vars = self.phi.phi_vars

        ll_mean_g = 1/tf.exp(self.log_vars) * (self.act_ph - self.means)
        ll_log_vars_g = -1/2 * ( 1/tf.exp(self.log_vars) \
                    - 1/tf.exp(self.log_vars) * \
                    (self.act_ph - self.means) * \
                    (self.act_ph - self.means) * \
                    1 / tf.exp(self.log_vars))

        self.phi_value.set_shape((None,))

        log_vars_inner = tf.expand_dims(tf.exp(self.logp - self.logp_old), 1) \
                        * (ll_log_vars_g * tf.expand_dims(self.advantages_ph 
                        - self.c_ph * self.phi_value, 1) \
                        + 1/2 * self.c_ph * ll_mean_g * self.phi_act_g )
            
        means_inner = tf.expand_dims(tf.exp(self.logp - self.logp_old), 1) \
                        * (ll_mean_g * tf.expand_dims(self.advantages_ph - 
                        self.c_ph * self.phi_value, 1) \
                        + self.c_ph * self.phi_act_g)
        
        loss1_log_vars = - tf.reduce_mean(
                        tf.stop_gradient(log_vars_inner) * \
                        tf.exp(self.log_vars)) 
        
        loss1_mean = -tf.reduce_mean(
                        tf.stop_gradient(means_inner) * \
                        self.means)
        
        loss1 = loss1_log_vars + loss1_mean
        
        loss2 = tf.reduce_mean(self.beta_ph * self.kl)
        
        loss3 = self.eta_ph * tf.square(\
                        tf.maximum(0.0, \
                        self.kl - 2.0 * self.kl_targ))

        self.loss = loss1 + loss2 + loss3
        
        optimizer = tf.train.AdamOptimizer(self.lr_ph)
        self.train_op = optimizer.minimize(self.loss, 
                        var_list= self.policy_nn_vars)

        
        # phi loss train op
        if self.phi_obj == 'MinVar':
            means_mse = tf.reduce_sum(\
                    tf.reduce_mean( \
                    tf.square(means_inner - \
                    tf.reduce_mean(means_inner, \
                    axis=0)), axis = 0))
            
            logstd_vars_mse = tf.reduce_sum(\
                    tf.reduce_mean(\
                    tf.square(log_vars_inner - \
                    tf.reduce_mean(log_vars_inner,\
                    axis=0)), axis = 0))
                        
            gradient = tf.concat([means_inner, log_vars_inner], axis=1)

            est_A = tf.gather(gradient, tf.range(0, tf.shape(gradient)[0] //2))

            est_B = tf.gather(gradient, 
                    tf.range(tf.shape(gradient)[0] //2, 
                    tf.shape(gradient)[0]))
            
            # calculate loss
            est_var = tf.reduce_sum(\
                    tf.square(tf.reduce_mean(\
                    est_A, axis=0) - \
                    tf.reduce_mean(est_B, axis=0)))
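
            # est_var compares the mean gradient of two halves of the batch as
            # a variance diagnostic; note it is not added to phi_loss below,
            # which uses means_mse + logstd_vars_mse.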
        
        
        if self.reg_scale > 0.:
            reg_variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            reg_term = tf.contrib.layers.apply_regularization(
                        self.phi.kernel_regularizer, reg_variables)
           
            for var in reg_variables:
                logger.log("regularized, ", var.name, var.shape)
        else:
            reg_term = 0.

        if self.phi_obj == 'FitQ':
            self.phi_loss = tf.reduce_mean(\
                    tf.square(self.advantages_ph - \
                    self.phi_value), axis=0) + reg_term
            
            logger.log('phi with FitQ as objective function')

        elif self.phi_obj == 'MinVar':
            
            self.phi_loss = means_mse + logstd_vars_mse + reg_term
            logger.log('phi with MinVar as objective function')
        
        else:
            raise NotImplementedError
        
       
        phi_optimizer = tf.train.AdamOptimizer(self.lr_phi_ph)      
        self.phi_train_op = phi_optimizer.minimize(self.phi_loss, var_list=self.phi_nn_vars)

        self.means_inner = means_inner
        self.log_vars_inner = log_vars_inner
def train_models(env_name, num_episodes, 
        gamma, lam, kl_targ, 
        coef, use_lr_adjust, 
        ada_kl_penalty, seed, 
        epochs, phi_epochs,
        max_timesteps, reg_scale,
        phi_lr, phi_hs,
        policy_size, 
        phi_obj, load_model, type): 

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed) 
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, 
            kl_targ,epochs, 
            phi_epochs, 
            policy_size=policy_size,
            phi_hidden_sizes=phi_hs,
            reg_scale=reg_scale,
            lr_phi=phi_lr,
            phi_obj=phi_obj,
            type=type)

    
    run_policy(env, policy, 
            scaler, num_episodes, 
            max_timesteps=max_timesteps, mode=load_model) # run a few to init scaler 
    
    episode = 0
    for i in range(2000):
        print("sampling and training at %s iteration\n"%(i))
        trajectories, traj_len_list = run_policy(env, policy, scaler, 
                            num_episodes, max_timesteps=max_timesteps, mode=load_model)
    
        num_traj = len(trajectories)
    
        episode += len(trajectories)
        add_value(trajectories, val_func)  
        add_disc_sum_rew(trajectories, gamma)  
        add_gae(trajectories, gamma, lam) 
    
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        
        policy.update(load_model, observes, actions, advantages,
                use_lr_adjust, ada_kl_penalty, c=0.)  # update policy
        val_func.fit(observes, disc_sum_rew) 

    # Save models
    policy.save_policy()
    val_func.save_val_func()
    refine_scaler = False
    if refine_scaler == True:
        run_policy(env, policy, 
                scaler, num_episodes, 
                max_timesteps=max_timesteps, mode=load_model) # run a few to refine scaler 
    with open('models/scaler/scaler.pkl', 'wb') as output:
        pickle.dump(scaler, output, pickle.HIGHEST_PROTOCOL)
    logger.log("saved model")
Example #9
def eval_models(env_name, num_episodes, gamma, lam, kl_targ, coef,
                use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs,
                max_timesteps, reg_scale, phi_lr, phi_hs, policy_size, phi_obj,
                load_model):

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    epochs,
                    phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)

    logger.log("loading model")
    load_dir = "models/"
    policy.load_model(load_dir)
    val_func.load_val_model(load_dir)

    run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps)

    episode = 0

    trajectories, traj_len_list = run_policy(env,
                                             policy,
                                             scaler,
                                             num_episodes,
                                             max_timesteps=max_timesteps)

    num_traj = len(trajectories)
    logger.log("Avg Length %d total Length %d"%( \
            np.mean(traj_len_list), \
            np.sum(traj_len_list)))

    episode += len(trajectories)
    add_value(trajectories, val_func)
    add_disc_sum_rew(trajectories, gamma)
    add_gae(trajectories, gamma, lam)

    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)

    sub_folder = "eval_data/%s_%s_data_seed=%d_max-steps=%d"%(\
                        env_name, phi_obj,
                        seed, max_timesteps)
    if not os.path.exists(sub_folder):
        os.makedirs(sub_folder)

    # save original gradient
    mc_grad_info = policy.get_batch_gradient(observes,
                                             actions,
                                             advantages,
                                             c=0.)
    mc_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/mc_num_episode=%d.pkl' % (num_episodes),
              'wb') as fp:
        pickle.dump(mc_grad_info, fp)

    policy.update(load_model,
                  observes,
                  actions,
                  advantages,
                  use_lr_adjust,
                  ada_kl_penalty,
                  c=1)  # update policy

    stein_grad_info = policy.get_batch_gradient(observes, \
                    actions, advantages, c=1.)

    stein_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/stein_num_episode=%d.pkl' % (num_episodes),
              'wb') as fp:
        pickle.dump(stein_grad_info, fp)
Example #10
def main(env_name, num_iterations, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, coef, use_lr_adjust, ada_kl_penalty, seed, epochs,
         phi_epochs, max_timesteps, reg_scale, phi_lr, phi_hs, policy_size,
         phi_obj):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_iterations: maximum number of iterations to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        coef: coefficient of the Stein control variate
        use_lr_adjust: whether to adjust the learning rate based on KL
        ada_kl_penalty: whether to adapt the KL penalty coefficient
        seed: random seed
        epochs: number of policy-update epochs per iteration
        phi_epochs: number of phi-update epochs per iteration
        max_timesteps: maximum time steps per trajectory
        reg_scale: regularization coefficient
        phi_lr: learning rate for the phi network
        phi_hs: hidden layer sizes for the phi network
        policy_size: policy network size
        phi_obj: phi training objective, 'FitQ' or 'MinVar'
    """

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)

    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    hid1_mult,
                    policy_logvar,
                    epochs,
                    phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    c_ph=coef,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)

    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env,
               policy,
               scaler,
               batch_size=1000,
               max_timesteps=max_timesteps)

    for i in range(num_iterations):
        logger.log("\n#Training Iter %d" % i)
        logger.log("Draw Samples..")

        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  batch_size=batch_size,
                                  max_timesteps=max_timesteps)

        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage

        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)

        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew)

        logger.log("Starting Training...")
        policy.update(observes, actions, advantages, \
                use_lr_adjust, ada_kl_penalty)  # update policy

        val_func.fit(observes, disc_sum_rew)  # update value function

        logger.log('--------------------------------\n')

    policy.close_sess()
    val_func.close_sess()
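
Below is a minimal, illustrative launcher for the `main` training loop above. It is not part of the original repository: every default value is an assumption for demonstration only, and it assumes `main` is defined in the same file; the argument names simply mirror `main`'s signature.

# Hypothetical command-line entry point for main(); defaults are placeholders,
# not the repository's actual values.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='PPO with a Stein control variate')
    parser.add_argument('--env_name', default='Hopper-v1')
    parser.add_argument('--num_iterations', type=int, default=100)
    parser.add_argument('--gamma', type=float, default=0.995)
    parser.add_argument('--lam', type=float, default=0.98)
    parser.add_argument('--kl_targ', type=float, default=0.003)
    parser.add_argument('--batch_size', type=int, default=20)
    parser.add_argument('--hid1_mult', type=int, default=10)
    parser.add_argument('--policy_logvar', type=float, default=-1.0)
    parser.add_argument('--coef', type=float, default=1.0)
    parser.add_argument('--use_lr_adjust', type=int, default=0)
    parser.add_argument('--ada_kl_penalty', type=int, default=1)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--phi_epochs', type=int, default=500)
    parser.add_argument('--max_timesteps', type=int, default=1000)
    parser.add_argument('--reg_scale', type=float, default=0.0)
    parser.add_argument('--phi_lr', type=float, default=1e-3)
    parser.add_argument('--phi_hs', default='100x100')
    parser.add_argument('--policy_size', default='large')
    parser.add_argument('--phi_obj', default='MinVar')
    args = parser.parse_args()
    main(**vars(args))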