def policySARSA(mdp, epsilon, alpha, iterNum, maxWalkLen=100, echoSE=False):
    """
    Actor-Critic with a SARSA-style critic: the actor updates the softmax
    policy parameters, the critic updates the action-value estimate.
    """
    InitParameter = 0.1
    if echoSE:
        squareErrors = []
    policy = SoftmaxPolicy(mdp)   # actor
    valuePolicy = Policy(mdp)     # critic
    for i in range(len(policy.parameters)):
        policy.parameters[i] = InitParameter
        valuePolicy.parameters[i] = 0.0
    for _ in range(iterNum):
        if echoSE:
            squareErrors.append(getSquareErrorPolicy(valuePolicy))
        # Start each episode from a random state-action pair.
        state = random.choice(mdp.states)
        sFeature = mdp.getFeature(state)
        action = random.choice(mdp.actions)
        isTerminal = False
        count = 0
        while not isTerminal and count < maxWalkLen:
            isTerminal, nextState, reward, nextSFeature = mdp.transform(state, action)
            nextAction = policy.epsilonGreedy(nextSFeature, epsilon)
            # Critic: one-step SARSA target r + gamma * q(s', a').
            valuePolicy.update(sFeature, action,
                               reward + mdp.gamma * valuePolicy.qFunc(nextSFeature, nextAction),
                               alpha)
            # Actor: push the policy toward actions the critic rates highly.
            policy.update(sFeature, action, valuePolicy.qFunc(sFeature, action), alpha)
            state = nextState
            sFeature = nextSFeature
            action = nextAction
            count += 1
    if echoSE:
        return policy, squareErrors
    return policy
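
# Both update calls above delegate to Policy.update / SoftmaxPolicy.update,
# which are defined elsewhere in this module. For reference, a semi-gradient
# TD update for a linear critic q_w(s, a) = w . phi(s, a) would look roughly
# like the sketch below; this is an illustrative assumption about the update
# rule, not the actual Policy implementation.
def _linearTDUpdateSketch(weights, phi, target, alpha):
    """One semi-gradient step: w <- w + alpha * (target - w . phi) * phi."""
    q = sum(w * f for w, f in zip(weights, phi))
    tdError = target - q
    return [w + alpha * tdError * f for w, f in zip(weights, phi)]
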
def featureQLearning(mdp, epsilon, alpha, iterNum, maxWalkLen=100, echoSE=False):
    """
    Q-learning with function approximation.
    TD target: r + gamma * max_{a'} q(s', a')
    """
    InitParameter = 0.1
    if echoSE:
        squareErrors = []
    policy = Policy(mdp)
    for i in range(len(policy.parameters)):
        policy.parameters[i] = InitParameter
    for _ in range(iterNum):
        if echoSE:
            squareErrors.append(getSquareErrorPolicy(policy))
        # Start each episode from a random state-action pair.
        state = random.choice(mdp.states)
        sFeature = mdp.getFeature(state)
        action = random.choice(mdp.actions)
        isTerminal = False
        count = 0
        while not isTerminal and count < maxWalkLen:
            isTerminal, nextState, reward, nextSFeature = mdp.transform(state, action)
            # Greedy backup over all actions in the next state.
            maxQ = float("-inf")
            for nextAction in mdp.actions:
                q = policy.qFunc(nextSFeature, nextAction)
                if maxQ < q:
                    maxQ = q
            policy.update(sFeature, action, reward + mdp.gamma * maxQ, alpha)
            # Behaviour policy stays epsilon-greedy (off-policy control).
            action = policy.epsilonGreedy(nextSFeature, epsilon)
            state = nextState
            sFeature = nextSFeature
            count += 1
    if echoSE:
        return policy, squareErrors
    return policy
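
# The inner loop above scans every action to form the Q-learning target
# r + gamma * max_{a'} q(s', a'). The helper below shows the same backup in
# isolation with a plain dict of action values; the dict stands in for
# policy.qFunc and is an assumption for illustration only.
def _greedyBackupSketch(reward, gamma, qValues):
    """Q-learning target: reward + gamma * max over next-action values."""
    return reward + gamma * max(qValues.values())

# Example: _greedyBackupSketch(1.0, 0.9, {"n": 0.2, "s": 0.5}) == 1.45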