Example #1
import random

def policySARSA(mdp, epsilon, alpha, iterNum, maxWalkLen=100, echoSE=False):
    """
    Actor-Critic: actor update the policy, critic update the value.
    """

    InitParameter = 0.1

    if echoSE:
        squareErrors = []

    policy = SoftmaxPolicy(mdp)
    valuePolicy = Policy(mdp)

    for i in range(len(policy.parameters)):
        policy.parameters[i] = InitParameter
        valuePolicy.parameters[i] = 0.0

    for _ in range(iterNum):
        if echoSE:
            squareErrors.append(getSquareErrorPolicy(valuePolicy))

        state = random.choice(mdp.states)
        sFeature = mdp.getFeature(state)
        action = random.choice(mdp.actions)
        isTerminal = False

        count = 0
        while not isTerminal and count < maxWalkLen:
            isTerminal, nextState, reward, nextSFeature = mdp.transform(state, action)
            nextAction = policy.epsilonGreedy(nextSFeature, epsilon)

            # Critic: move q(s, a) toward the SARSA target r + gamma * q(s', a').
            valuePolicy.update(sFeature, action,
                               reward + mdp.gamma * valuePolicy.qFunc(nextSFeature, nextAction), alpha)
            # Actor: adjust the softmax policy toward the critic's estimate of q(s, a).
            policy.update(sFeature, action, valuePolicy.qFunc(sFeature, action), alpha)

            state = nextState
            sFeature = nextSFeature
            action = nextAction
            count += 1

    if echoSE:
        return policy, squareErrors
    else:
        return policy
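
Both routines lean on Policy, SoftmaxPolicy and getSquareErrorPolicy helpers defined elsewhere in the module. Below is a minimal sketch of the linear action-value Policy interface they assume (parameters, qFunc, update, epsilonGreedy); the parameter layout and implementation details are assumptions for illustration, not the original helpers.

import random

class Policy:
    """Minimal sketch: a linear q-function with one weight block per action (assumed layout)."""

    def __init__(self, mdp):
        self.mdp = mdp
        self.featureLen = len(mdp.getFeature(mdp.states[0]))
        # One flat weight vector holding a block of featureLen weights per action.
        self.parameters = [0.0] * (self.featureLen * len(mdp.actions))

    def _offset(self, action):
        return self.mdp.actions.index(action) * self.featureLen

    def qFunc(self, sFeature, action):
        # q(s, a) = w_a . phi(s)
        off = self._offset(action)
        return sum(w * f for w, f in zip(self.parameters[off:off + self.featureLen], sFeature))

    def update(self, sFeature, action, target, alpha):
        # Semi-gradient TD step toward the supplied target.
        delta = target - self.qFunc(sFeature, action)
        off = self._offset(action)
        for i, f in enumerate(sFeature):
            self.parameters[off + i] += alpha * delta * f

    def epsilonGreedy(self, sFeature, epsilon):
        # Explore uniformly with probability epsilon, otherwise act greedily.
        if random.random() < epsilon:
            return random.choice(self.mdp.actions)
        return max(self.mdp.actions, key=lambda a: self.qFunc(sFeature, a))
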
def featureQLearning(mdp, epsilon, alpha, iterNum, maxWalkLen=100, echoSE=False):
    """
    qFunc = r + max_{a'}(\gamma * q(\hat{s'}, a'))
    """

    InitParameter = 0.1

    if echoSE:
        squareErrors = []

    policy = Policy(mdp)

    for i in range(len(policy.parameters)):
        policy.parameters[i] = InitParameter

    for _ in range(iterNum):
        if echoSE:
            squareErrors.append(getSquareErrorPolicy(policy))

        state = random.choice(mdp.states)
        sFeature = mdp.getFeature(state)
        action = random.choice(mdp.actions)
        isTerminal = False

        count = 0
        while not isTerminal and count < maxWalkLen:
            isTerminal, nextState, reward, nextSFeature = mdp.transform(state, action)

            # Greedy backup over all actions in the next state (Q-learning target).
            maxQ = float("-inf")
            for nextAction in mdp.actions:
                q = policy.qFunc(nextSFeature, nextAction)
                if maxQ < q:
                    maxQ = q

            policy.update(sFeature, action, reward + mdp.gamma * maxQ, alpha)

            # Behave epsilon-greedily while learning off-policy about the greedy target.
            action = policy.epsilonGreedy(nextSFeature, epsilon)
            state = nextState
            sFeature = nextSFeature
            count += 1

    if echoSE:
        return policy, squareErrors
    else:
        return policy
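
A usage sketch follows, assuming an environment object that exposes the interface used above (states, actions, gamma, getFeature, transform). The GridMDP constructor name and the hyperparameter values are hypothetical, chosen only for illustration.

# Usage sketch: names and hyperparameters below are assumptions, not part of the original code.
import random

random.seed(0)

mdp = GridMDP()  # hypothetical environment exposing states, actions, gamma, getFeature, transform

acPolicy, acErrors = policySARSA(mdp, epsilon=0.2, alpha=0.01, iterNum=10000, echoSE=True)
qPolicy, qErrors = featureQLearning(mdp, epsilon=0.2, alpha=0.01, iterNum=10000, echoSE=True)

# Compare how quickly the two methods drive down the squared value error.
print("actor-critic final error:", acErrors[-1])
print("q-learning final error:", qErrors[-1])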