grad = sess.run(surrogateFlatLoss, feed_dict={obsPh: obs, aPh: actions, advPh: advantages, logProbSampPh: sampledLogProb})  # , logProbsAllPh: allLogProbs

cjStart = time.time()
newDir = utils.conjugate_gradients(Hx, grad, args.cg_iters)
cjEnd = time.time()

if args.delta_final < 0:
    curDelta = args.delta
else:
    curDelta = utils.annealedNoise(args.delta, args.delta_final, args.epochs, e)

# L function inputs: observations, estimated advantages, log-probability of the sampled action
# (and optionally the log-probabilities of all actions)
LlossOld = sess.run(Lloss, feed_dict={obsPh: obs, aPh: actions, advPh: advantages, logProbSampPh: sampledLogProb})  # , logProbsAllPh: allLogProbs

# largest step length along newDir that keeps the quadratic KL approximation within curDelta
coef = np.sqrt(2 * curDelta / (np.dot(np.transpose(newDir), Hx(newDir)) + 1e-8))

oldParams = sess.run(getPolicyParams)
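
# Hedged sketch (not the repo's utils.conjugate_gradients; names here are hypothetical):
# a minimal NumPy conjugate-gradient solver for H x = g that only needs a
# Hessian-vector product, plus the TRPO step-size coefficient sqrt(2*delta / (d^T H d))
# computed above. Shown only to illustrate the technique the code relies on.
import numpy as np

def conjugate_gradients_sketch(hvp, g, iters=10, residual_tol=1e-10):
    """Approximately solve H x = g using only Hessian-vector products hvp(v)."""
    x = np.zeros_like(g)
    r = g.copy()          # residual g - H x (x starts at zero)
    p = g.copy()          # current search direction
    r_dot_r = np.dot(r, r)
    for _ in range(iters):
        Hp = hvp(p)
        alpha = r_dot_r / (np.dot(p, Hp) + 1e-8)
        x += alpha * p
        r -= alpha * Hp
        new_r_dot_r = np.dot(r, r)
        if new_r_dot_r < residual_tol:
            break
        p = r + (new_r_dot_r / r_dot_r) * p
        r_dot_r = new_r_dot_r
    return x

def trpo_step_coefficient(direction, hvp, delta):
    """Largest scale for `direction` keeping the quadratic KL estimate <= delta."""
    return np.sqrt(2 * delta / (np.dot(direction, hvp(direction)) + 1e-8))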
summaryRet, summaryLen = sess.run([epTotalRewSum, epLenSum], feed_dict={epTotalRewPh: epTotalRet, epLenPh: epLen})
globalStep = e * args.epoch_len + l
writer.add_summary(summaryRet, globalStep)
writer.add_summary(summaryLen, globalStep)
finishedEp += 1
obs, epLen, epTotalRet = env.reset().copy(), 0, 0

simulationEnd = time.time()
print("\tSimulation in epoch {} finished in {}".format(e, simulationEnd - epochSt))

# update the policy, then update the state-value function (multiple times) after that
observations, actions, advEst, sampledLogProb, returns, Vprevs, additionalInfos = buffer.get()

# if minimal is set to False, this will not be used, even though it is still passed in the
# feed_dict of the optimization step (see how the optimization step is defined)
learningRate = utils.annealedNoise(args.learning_rate, 0, args.epochs, e)

# update
updateStart = time.time()
total = args.epoch_len
for j in range(args.update_epochs):
    perm = np.random.permutation(total)
    start = 0
    approxKlCumBeforeVfUpdate = 0
    approxKlCumAfterVfUpdate = 0
    oldParams = sess.run(getTrainableParams)
    while start < total:
        end = np.amin([start + args.minibatch_size, total])
        # KEY TECHNIQUE: stop updating the policy once the KL divergence limit has been breached
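
# Hedged sketch (illustrative only, names hypothetical): the "stop on KL" technique
# referenced in the comment above. After each minibatch policy update, estimate the KL
# divergence between the sampling policy and the current policy, and abort the remaining
# updates of this phase once it exceeds a target threshold.
import numpy as np

def ppo_update_with_kl_stop(update_minibatch, approx_kl, minibatches, target_kl=0.01):
    """update_minibatch(mb) applies one policy gradient step on minibatch mb;
    approx_kl(mb) returns mean(old_log_prob - new_log_prob) on that minibatch."""
    kl = 0.0
    for mb in minibatches:
        update_minibatch(mb)
        kl = approx_kl(mb)
        if kl > 1.5 * target_kl:
            # the policy has moved too far from the policy that collected the data
            break
    return kl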
        nextNonTerminal = 1.0 - nextDone
        nextValue = lastValue
        nextReturn = lastValue
    else:
        nextNonTerminal = 1.0 - dones[t + 1]
        nextValue = predVals[t + 1]
        nextReturn = returns[t + 1]
    delta = rewards[t] + args.gamma * nextValue * nextNonTerminal - predVals[t]
    advantages[t] = lastgaelam = delta + args.gamma * args.lambd * nextNonTerminal * lastgaelam
    returns[t] = rewards[t] + args.gamma * nextNonTerminal * nextReturn

if not args.orig_returns:
    returns = advantages + predVals

# if minimal is set to False, this will not be used, even though it is still passed in the
# feed_dict of the optimization step (see how the optimization step is defined)
learningRatePolicy = utils.annealedNoise(args.learning_rate_policy, 0, args.epochs, e)
learningRateVf = utils.annealedNoise(args.learning_rate_state_value, 0, args.epochs, e)

# update
updateStart = time.time()
total = args.epoch_len
for j in range(args.update_epochs):
    perm = np.random.permutation(total)
    start = 0
    # oldParams = sess.run(getTrainableParams)
    while start < total:
        end = np.amin([start + args.minibatch_size, total])
        sess.run(optimizationStepPolicy, feed_dict={
            obsPh: obs[perm[start:end]],
            aPh: actions[perm[start:end]],
            VPrevPh: predVals[perm[start:end]],
            advPh: utils.normalize(advantages[perm[start:end]]) if args.norm_adv else advantages[perm[start:end]],
            logProbSampPh: sampledLogProb[perm[start:end]],
            learningRatePolicyPh: learningRatePolicy})
        sess.run(optimizationStepVf, feed_dict={
            obsPh: obs[perm[start:end]],
            totalEstimatedDiscountedRewardPh: returns[perm[start:end]],
            VPrevPh: predVals[perm[start:end]],
            learningRateVfPh: learningRateVf})
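
# Hedged sketch: a standalone NumPy version of the GAE(lambda) recursion used above,
# assuming arrays of rewards, value predictions, and done flags plus a bootstrap value
# for the state following the last step. Variable names are hypothetical.
import numpy as np

def gae_advantages(rewards, values, dones, last_value, last_done, gamma=0.99, lambd=0.95):
    values = np.asarray(values, dtype=np.float64)
    T = len(rewards)
    advantages = np.zeros(T, dtype=np.float64)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        if t == T - 1:
            next_non_terminal = 1.0 - last_done
            next_value = last_value
        else:
            next_non_terminal = 1.0 - dones[t + 1]
            next_value = values[t + 1]
        # one-step TD error, then the exponentially weighted GAE recursion
        delta = rewards[t] + gamma * next_value * next_non_terminal - values[t]
        advantages[t] = lastgaelam = delta + gamma * lambd * next_non_terminal * lastgaelam
    returns = advantages + values   # value-function targets, as in the non-orig_returns branch above
    return advantages, returns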
solved = False
while step < args.total_train_steps and not solved:
    episodeStart = time.time()
    obs, epLen, epRet, allRets, allQs, doSample = env.reset(), 0, 0, [], [], True

    # basically this is one episode: the inner loop exits when a terminal state is reached
    # or when the maximum number of steps (per episode or in total) is reached
    while doSample:
        if step < args.start_steps:
            sampledAction = np.asarray([env.action_space.sample()])
        else:
            noise = utils.annealedNoise(args.eps_start, args.eps_end, args.steps_to_decrease, step)
            sampledAction, _ = policy.getSampledActions(np.expand_dims(obs, 0))
            # add annealed Gaussian exploration noise to the sampled action
            sampledAction = sampledAction + np.random.normal(0, noise, (1, outputLength))

        statistics[0].addValue(np.expand_dims(obs, 0))
        statistics[1].addValue(sampledAction)

        predQ = sess.run(Q1.output, feed_dict={obsPh: np.expand_dims(obs, 0), aPh: sampledAction})[0]

        nextObs, reward, terminal, infos = env.step(sampledAction[0])
        epLen += 1
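
# Hedged sketch (names hypothetical, not the repo's utils.annealedNoise): linearly
# annealed Gaussian exploration noise added to a deterministic policy action and clipped
# to the valid action range, as in the sampling branch above.
import numpy as np

def annealed_noise_scale(start, end, decay_steps, step):
    """Linear interpolation from `start` to `end` over `decay_steps` steps, then held at `end`."""
    frac = min(step / float(decay_steps), 1.0)
    return start + frac * (end - start)

def noisy_action(policy_action, action_low, action_high, noise_scale, rng=np.random):
    """Add zero-mean Gaussian noise to the action and clip it to the action bounds."""
    noisy = policy_action + rng.normal(0.0, noise_scale, size=np.shape(policy_action))
    return np.clip(noisy, action_low, action_high)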