def featureMCControl(mdp, epsilon, alpha, iterNum, maxWalkLen=100, echoSE=False):
    """Every-visit Monte Carlo control with linear function approximation.

    The update target for each visited (state, action) pair is the sampled
    discounted return from that step onward: qFunc = g_t.

    Args:
        mdp: environment exposing states, actions, gamma, getFeature, transform.
        epsilon: exploration rate for the epsilon-greedy behavior policy.
        alpha: learning-rate used by policy.update.
        iterNum: number of episodes to sample.
        maxWalkLen: cap on the length of a single episode.
        echoSE: when True, also return the per-iteration squared policy error.

    Returns:
        policy, or (policy, squareErrors) when echoSE is True.
    """
    initialWeight = 0.1
    if echoSE:
        squareErrors = []
    policy = Policy(mdp)
    for idx in range(len(policy.parameters)):
        policy.parameters[idx] = initialWeight
    for _ in range(iterNum):
        if echoSE:
            squareErrors.append(getSquareErrorPolicy(policy))
        # Roll out one episode, recording the full trajectory.
        visitedStates, visitedFeatures, takenActions, observedRewards = [], [], [], []
        currentState = random.choice(mdp.states)
        currentFeature = mdp.getFeature(currentState)
        terminated = False
        steps = 0
        while not (terminated or steps >= maxWalkLen):
            chosenAction = policy.epsilonGreedy(currentFeature, epsilon)
            terminated, nextState, reward, nextFeature = mdp.transform(currentState, chosenAction)
            visitedStates.append(currentState)
            visitedFeatures.append(currentFeature)
            observedRewards.append(reward)
            takenActions.append(chosenAction)
            currentState, currentFeature = nextState, nextFeature
            steps += 1
        # Discounted return of the whole episode: g becomes G_0.
        g = 0.0
        for r in reversed(observedRewards):
            g = g * mdp.gamma + r
        # Walk forward through the trajectory, peeling one reward off the
        # return at each step so g always equals the return from step t.
        for feature, act, r in zip(visitedFeatures, takenActions, observedRewards):
            policy.update(feature, act, g, alpha)
            g -= r
            g /= mdp.gamma
    if echoSE:
        return policy, squareErrors
    else:
        return policy
def featureQLearning(mdp, epsilon, alpha, iterNum, maxWalkLen=100, echoSE=False):
    r"""Q-learning with linear function approximation.

    The bootstrap update target is: qFunc = r + \gamma * max_{a'} q(s', a'),
    while actions are selected epsilon-greedily from the current policy.

    Args:
        mdp: environment exposing states, actions, gamma, getFeature, transform.
        epsilon: exploration rate for the epsilon-greedy behavior policy.
        alpha: learning-rate used by policy.update.
        iterNum: number of episodes to sample.
        maxWalkLen: cap on the length of a single episode.
        echoSE: when True, also return the per-iteration squared policy error.

    Returns:
        policy, or (policy, squareErrors) when echoSE is True.
    """
    InitParameter = 0.1
    if echoSE:
        squareErrors = []
    policy = Policy(mdp)
    for i in range(len(policy.parameters)):
        policy.parameters[i] = InitParameter
    for _ in range(iterNum):
        if echoSE:
            squareErrors.append(getSquareErrorPolicy(policy))
        state = random.choice(mdp.states)
        sFeature = mdp.getFeature(state)
        action = random.choice(mdp.actions)
        isTerminal = False
        count = 0
        while not isTerminal and count < maxWalkLen:
            isTerminal, nextState, reward, nextSFeature = mdp.transform(state, action)
            # BUG FIX: the previous code seeded the running max with -1.0,
            # which silently clipped the bootstrap target whenever every
            # q(s', a') was below -1.0 (e.g. with negative rewards). Taking
            # max over all actions directly uses the true greedy value.
            maxQ = max(policy.qFunc(nextSFeature, nextAction) for nextAction in mdp.actions)
            policy.update(sFeature, action, reward + mdp.gamma * maxQ, alpha)
            action = policy.epsilonGreedy(nextSFeature, epsilon)
            state = nextState
            sFeature = nextSFeature
            count += 1
    if echoSE:
        return policy, squareErrors
    else:
        return policy