Example #1
        # gradient of the surrogate loss, flattened over all policy parameters
        grad = sess.run(surrogateFlatLoss,
                        feed_dict={
                            obsPh: obs,
                            aPh: actions,
                            advPh: advantages,
                            logProbSampPh: sampledLogProb
                        })
        cjStart = time.time()
        newDir = utils.conjugate_gradients(Hx, grad, args.cg_iters)
        cjEnd = time.time()

        if args.delta_final < 0:
            curDelta = args.delta
        else:
            curDelta = utils.annealedNoise(args.delta, args.delta_final,
                                           args.epochs, e)

        # surrogate (L) loss under the current policy; inputs: observations,
        # actions, estimated advantages and log-probs of the sampled actions
        LlossOld = sess.run(
            Lloss,
            feed_dict={
                obsPh: obs,
                aPh: actions,
                advPh: advantages,
                logProbSampPh: sampledLogProb
            })
        # largest step size along newDir that keeps the quadratic KL estimate within curDelta
        coef = np.sqrt(2 * curDelta /
                       (np.dot(np.transpose(newDir), Hx(newDir)) + 1e-8))

        oldParams = sess.run(getPolicyParams)
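
The utils.conjugate_gradients helper called above is where the natural-gradient direction comes from, but its source is not part of this snippet. The sketch below is a minimal NumPy version of such a solver, assuming the signature (Hx, g, iters) seen in the call: it approximately solves H x = g using only Hessian-vector products, so the Fisher matrix never has to be formed; the direction it returns is then scaled by coef so the quadratic KL estimate stays within curDelta.

import numpy as np

def conjugate_gradients(Hx, g, iters, residual_tol=1e-10):
    # Approximately solve H x = g given only the Hessian-vector product Hx(v).
    x = np.zeros_like(g)
    r = g.copy()                 # residual g - H x, equal to g since x starts at zero
    p = r.copy()                 # current search direction
    rdotr = np.dot(r, r)
    for _ in range(iters):
        Hp = Hx(p)
        alpha = rdotr / (np.dot(p, Hp) + 1e-8)
        x += alpha * p
        r -= alpha * Hp
        newRdotr = np.dot(r, r)
        if newRdotr < residual_tol:
            break                # residual small enough, stop early
        p = r + (newRdotr / rdotr) * p
        rdotr = newRdotr
    return x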
Example #2
                    summaryRet, summaryLen = sess.run(
                        [epTotalRewSum, epLenSum],
                        feed_dict={epTotalRewPh: epTotalRet, epLenPh: epLen})
                    globalStep = e * args.epoch_len + l
                    writer.add_summary(summaryRet, globalStep)
                    writer.add_summary(summaryLen, globalStep)
                    finishedEp += 1
                obs, epLen, epTotalRet = env.reset().copy(), 0, 0
              
        simulationEnd = time.time()      
        
        print("\tSimulation in epoch {} finished in {}".format(e, simulationEnd-epochSt))   
            
        # update the policy, then update the state-value function (multiple times)
        observations, actions, advEst, sampledLogProb, returns, Vprevs, additionalInfos = buffer.get()
 
        # if minimal is set to False this is not used, even though it is still passed
        # in the feed_dict of the optimization step (see how the optimization step is defined)
        learningRate = utils.annealedNoise(args.learning_rate, 0, args.epochs, e)
    
        #update
        updateStart = time.time()      
        total = args.epoch_len
        for j in range(args.update_epochs):
            perm = np.random.permutation(total)
            start = 0
            approxKlCumBeforeVfUpdate = 0
            approxKlCumAfterVfUpdate = 0            
            oldParams = sess.run(getTrainableParams)
            
            while start < total:
                end = np.amin([start + args.minibatch_size, total])
                
                # KEY TECHNIQUE: this will stop updating the policy once the KL threshold has been breached
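
The fragment stops right after the "KEY TECHNIQUE" note, so the actual KL check is not shown. The toy below (pure NumPy; every name and number in it is illustrative, not taken from the project) sketches the general shape of that technique: minibatch policy updates are abandoned as soon as an approximate KL estimate between the old and the current policy exceeds a threshold.

import numpy as np

target_kl = 0.01                               # assumed threshold
old_log_probs = 0.05 * np.random.randn(256)    # stand-in for the buffered sampled log-probs
new_log_probs = old_log_probs.copy()           # stand-in for log-probs under the updated policy

stop_update = False
for j in range(4):                             # update epochs
    perm = np.random.permutation(256)
    for start in range(0, 256, 64):            # minibatches
        mb = perm[start:start + 64]
        # common estimator: KL(old || new) ~= mean(old_logp - new_logp)
        approx_kl = np.mean(old_log_probs[mb] - new_log_probs[mb])
        if approx_kl > target_kl:
            stop_update = True                 # KL breached: skip the remaining policy updates
            break
        # stand-in for the policy optimization step: nudge the new log-probs
        new_log_probs[mb] -= 0.005
    if stop_update:
        break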
Example #3
                nextNonTerminal = 1.0 - nextDone
                nextValue = lastValue
                nextReturn = lastValue
            else:
                nextNonTerminal = 1.0 - dones[t+1]
                nextValue = predVals[t+1]
                nextReturn = returns[t+1]
            delta = rewards[t] + args.gamma * nextValue * nextNonTerminal - predVals[t]
            advantages[t] = lastgaelam = delta + args.gamma * args.lambd * nextNonTerminal * lastgaelam
            returns[t] = rewards[t] + args.gamma * nextNonTerminal * nextReturn
            
        if not args.orig_returns:
            returns = advantages + predVals
 
        # if minimal is set to False these are not used, even though they are still passed
        # in the feed_dict of the optimization step (see how the optimization step is defined)
        learningRatePolicy = utils.annealedNoise(args.learning_rate_policy, 0, args.epochs, e)
        learningRateVf = utils.annealedNoise(args.learning_rate_state_value, 0, args.epochs, e)
    
        #update
        updateStart = time.time()      
        total = args.epoch_len
        for j in range(args.update_epochs):
            perm = np.random.permutation(total)
            start = 0
            #oldParams = sess.run(getTrainableParams)
            
            while start < total:
                end = np.amin([start + args.minibatch_size, total])

                sess.run(optimizationStepPolicy,
                         feed_dict={
                             obsPh: obs[perm[start:end]],
                             aPh: actions[perm[start:end]],
                             VPrevPh: predVals[perm[start:end]],
                             advPh: utils.normalize(advantages[perm[start:end]])
                             if args.norm_adv else advantages[perm[start:end]],
                             logProbSampPh: sampledLogProb[perm[start:end]],
                             learningRatePolicyPh: learningRatePolicy
                         })
                sess.run(optimizationStepVf,
                         feed_dict={
                             obsPh: obs[perm[start:end]],
                             totalEstimatedDiscountedRewardPh: returns[perm[start:end]],
                             VPrevPh: predVals[perm[start:end]],
                             learningRateVfPh: learningRateVf
                         })
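
The advantage computation at the top of this example is a fragment of a GAE(lambda) backward pass whose loop header was cut off. The function below is a self-contained reconstruction under the assumptions visible in the snippet (the gamma/lambd hyperparameters, the rewards/predVals/dones names, the lastValue bootstrap and the returns = advantages + predVals target); it is a sketch of the standard recursion rather than the project's exact code.

import numpy as np

def gae(rewards, predVals, dones, lastValue, lastDone, gamma=0.99, lambd=0.95):
    rewards = np.asarray(rewards, dtype=np.float64)
    predVals = np.asarray(predVals, dtype=np.float64)
    dones = np.asarray(dones, dtype=np.float64)
    T = len(rewards)
    advantages = np.zeros(T)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        if t == T - 1:
            nextNonTerminal = 1.0 - lastDone   # done flag of the state after the last stored step
            nextValue = lastValue              # bootstrap value for that state
        else:
            nextNonTerminal = 1.0 - dones[t + 1]
            nextValue = predVals[t + 1]
        # one-step TD error
        delta = rewards[t] + gamma * nextValue * nextNonTerminal - predVals[t]
        # GAE recursion: discounted, lambda-weighted sum of TD errors
        advantages[t] = lastgaelam = delta + gamma * lambd * nextNonTerminal * lastgaelam
    returns = advantages + predVals            # value targets, as in the non-orig_returns branch above
    return advantages, returns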
Example #4
    solved = False
    while step < args.total_train_steps and not solved:

        episodeStart = time.time()

        obs, epLen, epRet, allRets, allQs, doSample = env.reset(), 0, 0, [], [], True

        # basically this is one episode: the loop exits when a terminal state is reached
        # or the maximum number of steps (per episode, or in total) is reached
        while doSample:

            if step < args.start_steps:
                # warm-up phase: sample uniformly from the action space before using the policy
                sampledAction = np.asarray([env.action_space.sample()])
            else:
                noise = utils.annealedNoise(args.eps_start, args.eps_end,
                                            args.steps_to_decrease, step)
                sampledAction, _ = policy.getSampledActions(np.expand_dims(obs, 0))
                # add annealed Gaussian exploration noise to the sampled action
                sampledAction = sampledAction + np.random.normal(
                    0, noise, (1, outputLength))

            statistics[0].addValue(np.expand_dims(obs, 0))
            statistics[1].addValue(sampledAction)

            predQ = sess.run(Q1.output,
                             feed_dict={
                                 obsPh: np.expand_dims(obs, 0),
                                 aPh: sampledAction
                             })[0]

            nextObs, reward, terminal, infos = env.step(sampledAction[0])
            epLen += 1
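
utils.annealedNoise appears throughout these examples, scheduling the exploration noise here and the learning rates and trust-region delta earlier. Its source is not included on this page; the sketch below assumes a plain linear interpolation from a start value to an end value over a fixed number of steps, clamped once the schedule is exhausted.

def annealedNoise(startValue, endValue, decaySteps, currentStep):
    # assumed behaviour: linear anneal from startValue to endValue over decaySteps,
    # held at endValue afterwards
    frac = min(currentStep / float(decaySteps), 1.0)
    return startValue + frac * (endValue - startValue)

# e.g. annealedNoise(0.4, 0.05, 10000, 2500) -> 0.3125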