Example #1
    def update():
        # Prepare hessian func, gradient eval
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x}))
        g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss],
                                        feed_dict=inputs)
        g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old)

        # Core calculations for TRPO or NPG
        x = cg(Hx, g)
        alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS))
        old_params = sess.run(get_pi_params)

        def set_and_eval(step):
            sess.run(set_pi_params,
                     feed_dict={v_ph: old_params - alpha * x * step})
            return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))

        if algo == 'npg':
            # npg has no backtracking or hard kl constraint enforcement
            kl, pi_l_new = set_and_eval(step=1.)

        elif algo == 'trpo':
            # trpo augments npg with a backtracking line search and hard
            # kl constraint enforcement
            for j in range(backtrack_iters):
                kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
                if kl <= delta and pi_l_new <= pi_l_old:
                    logger.log(
                        'Accepting new params at step %d of line search.' % j)
                    logger.store(BacktrackIters=j)
                    break

                if j == backtrack_iters - 1:
                    logger.log('Line search failed! Keeping old params.')
                    logger.store(BacktrackIters=j)
                    kl, pi_l_new = set_and_eval(step=0.)

        # Value function updates
        for _ in range(train_v_iters):
            sess.run(train_vf, feed_dict=inputs)
        v_l_new = sess.run(v_loss, feed_dict=inputs)

        # Log changes from update
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))
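
Example #1 calls a cg(Hx, g) helper that is not shown in the snippet. A minimal sketch of such a conjugate gradient solver, assuming the usual Spinning Up-style signature where the first argument is a Hessian-vector product function (the cg_iters count and EPS constant here are assumptions), might look like:

    import numpy as np

    EPS = 1e-8

    def cg(Ax, b, cg_iters=10):
        """Conjugate gradient: approximately solve A x = b, where A is available
        only through the matrix-vector product function Ax(v)."""
        x = np.zeros_like(b, dtype=np.float64)
        r = np.array(b, dtype=np.float64)   # residual b - A x, with x = 0 initially
        p = r.copy()                        # search direction
        r_dot_old = np.dot(r, r)
        for _ in range(cg_iters):
            z = Ax(p)
            alpha = r_dot_old / (np.dot(p, z) + EPS)
            x += alpha * p
            r -= alpha * z
            r_dot_new = np.dot(r, r)
            p = r + (r_dot_new / r_dot_old) * p
            r_dot_old = r_dot_new
        return x
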
Example #2
    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))
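
Example #2 assumes pi_loss, approx_kl, and clipfrac ops defined elsewhere in the computation graph. As a rough NumPy illustration (not the actual graph definitions), the PPO-clip quantities logged above could be computed from per-sample log-probabilities and advantages like this; logp, logp_old, adv, and clip_ratio are stand-ins for the corresponding tensors:

    import numpy as np

    def ppo_clip_stats(logp, logp_old, adv, clip_ratio=0.2):
        """Clipped surrogate loss plus the diagnostics logged in Example #2."""
        ratio = np.exp(logp - logp_old)
        clipped = np.clip(ratio, 1 - clip_ratio, 1 + clip_ratio)
        pi_loss = -np.mean(np.minimum(ratio * adv, clipped * adv))
        approx_kl = np.mean(logp_old - logp)     # sample-based KL estimate
        clipfrac = np.mean(np.abs(ratio - 1.0) > clip_ratio)  # fraction of clipped samples
        return pi_loss, approx_kl, clipfrac
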
Example #3
    def update(self, dataX, dataZ, mean_x, std_x, mean_y, std_y, mean_z, std_z, mean_r, std_r,
               nEpoch, train_pi_iters, train_v_iters, target_kl, logger):
        self.update_mean_std(mean_x, mean_y, mean_z, mean_r, std_x, std_y, std_z, std_r)
        training_loss_list = []
        range_of_indeces = np.arange(dataX.shape[0])
        nData = dataX.shape[0]
        batchsize = int(self.batchsize)
        # Training
        for i in range(nEpoch):
            avg_loss = 0
            num_batches = 0
            indeces = npr.choice(range_of_indeces, size=(dataX.shape[0],), replace=False)
            for batch in range(int(math.floor(nData / batchsize))):
                # walk through the randomly reordered "old data"
                dataX_batch = dataX[indeces[batch * batchsize:(batch + 1) * batchsize], :]
                dataZ_batch = dataZ[indeces[batch * batchsize:(batch + 1) * batchsize], :]

                # one iteration of feedforward training
                _, loss, output, true_output = self.sess.run([self.train_step, self.mse_, self.curr_nn_output, self.z_],
                                                             feed_dict={self.x_: dataX_batch, self.z_: dataZ_batch})
                training_loss_list.append(loss)
                avg_loss += loss
                num_batches += 1

        # average dynamics-model training loss over the final epoch
        dyn_avg_loss = avg_loss / num_batches

        inputs = {k: v for k, v in zip(self.all_phs, self.buf.get())}
        pi_l_old, v_l_old, ent = self.sess.run([self.pi_loss, self.v_loss, self.approx_ent], feed_dict=inputs)

        for i in range(train_pi_iters):
            _, kl = self.sess.run([self.train_pi, self.approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            self.sess.run(self.train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = self.sess.run([self.pi_loss, self.v_loss, self.approx_kl, self.clipfrac], feed_dict=inputs)

        logger.store(LossPi=pi_l_old, LossV=v_l_old, LossDyn=dyn_avg_loss,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))
        return training_loss_list, dyn_avg_loss, pi_l_old, v_l_old
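
All three updates average statistics across parallel workers with mpi_avg, which is imported from elsewhere in the codebase. A minimal sketch of such a helper, assuming mpi4py, could be:

    import numpy as np
    from mpi4py import MPI

    def mpi_avg(x):
        """Average a scalar or numpy array over all MPI processes (sketch)."""
        comm = MPI.COMM_WORLD
        arr = np.asarray(x, dtype=np.float64)
        is_scalar = (arr.ndim == 0)
        arr = np.atleast_1d(arr)
        out = np.zeros_like(arr)
        comm.Allreduce(arr, out, op=MPI.SUM)   # element-wise sum over all ranks
        out /= comm.Get_size()
        return float(out[0]) if is_scalar else out
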
Example #4
    def set_and_eval(step):
        sess.run(set_pi_params,
                 feed_dict={v_ph: old_params - alpha * x * step})
        return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))
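
For reference, the alpha used by set_and_eval comes from the natural-gradient step-size rule: with search direction x approximating H^-1 g, choosing alpha = sqrt(2 * delta / (x^T H x)) makes the quadratic KL approximation 0.5 * (alpha * x)^T H (alpha * x) equal to the trust-region radius delta. A small NumPy check of that identity, with a made-up positive-definite H and gradient g:

    import numpy as np

    delta = 0.01
    H = np.array([[2.0, 0.3], [0.3, 1.5]])   # stand-in for the KL Hessian
    g = np.array([0.7, -0.2])                # stand-in for the policy gradient

    x = np.linalg.solve(H, g)                # what cg(Hx, g) approximates
    alpha = np.sqrt(2 * delta / (x @ H @ x))

    step = alpha * x
    print(0.5 * step @ H @ step)             # prints ~0.01, i.e. delta
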