示例#1
0
            MaxA = a
    return MaxV,MaxA

def accionRandom(a,grid = Grid.grid_estandar(), eps=EPS):
    """Return ``a`` most of the time, otherwise a randomly chosen basic action.

    A single uniform sample is drawn with ``np.random.random()``. When that
    sample is strictly below ``1.0 - eps`` the proposed action ``a`` is
    returned unchanged; otherwise the result of ``np.random.choice`` over
    ``grid.posiblesAccionesBasicas()`` is returned.

    Args:
        a: The action to return when no random replacement happens.
        grid: Object providing ``posiblesAccionesBasicas()``. The default is
            built by ``Grid.grid_estandar()`` once, when this ``def`` statement
            is executed, and that same object is reused by every call that
            omits ``grid``.
        eps: Threshold parameter; the random branch is taken when the sample
            is not below ``1.0 - eps``.

    Returns:
        Either ``a`` or whatever ``np.random.choice`` picks from the basic
        actions.
    """
    sorteo = np.random.random()
    umbral = 1.0 - eps

    # Guard clause: keep the proposed action when the sample is under the threshold.
    if sorteo < umbral:
        return a

    acciones = grid.posiblesAccionesBasicas()
    return np.random.choice(acciones)


# Script entry point: builds a grid and zero-initialised per-(state, action)
# tables before the rest of the script (not visible in this span) runs.
if __name__ == "__main__":

    # NOTE(review): -0.1 is passed positionally; presumably a per-step
    # penalty — confirm against Grid.grid_negativo.
    grid = Grid.grid_negativo(-0.1)
    todos_estados = grid.todos_estados()

    # Q[s][a]: nested dict keyed by state, then by basic action.
    Q = {}
    # alpha_personal[s][a]: same nesting as Q.
    # NOTE(review): presumably a per-pair learning rate — confirm where it is used.
    alpha_personal = {}

    for s in todos_estados:
        Q[s] = {}
        alpha_personal[s] = {}

        # Every basic action gets an entry for every state.
        for a in grid.posiblesAccionesBasicas():
            Q[s][a] = 0
            alpha_personal[s][a] = 1.0


    # NOTE(review): `t` is not used within the visible lines; its role
    # must be confirmed in the code that follows.
    t = 1.0
def MaxQ(Q_s):
    """Find the largest value in a mapping and the key that holds it.

    Args:
        Q_s: Mapping (anything with ``.items()``) from keys to comparable
            values.

    Returns:
        A ``(value, key)`` tuple for the largest value. When several keys
        share the largest value, the first one in iteration order is kept.
        If the mapping is empty, or no value is strictly greater than
        negative infinity, the result is ``(float('-inf'), None)``.
    """
    mejor_valor, mejor_clave = float('-inf'), None

    for clave, valor in Q_s.items():
        # Strict comparison: later ties never displace the current best.
        if valor > mejor_valor:
            mejor_valor, mejor_clave = valor, clave

    return mejor_valor, mejor_clave


# Script entry point: builds a grid, obtains the initial Q / policy / returns
# history from iniciarValores, and prints the starting state of the problem.
if __name__ == "__main__":

    grid = Grid.grid_negativo(penalizacion=-0.2)
    # iniciarValores is defined outside the visible span; it yields three
    # values that are unpacked here.
    Q, politica, historialRetornos = iniciarValores(grid)

    print("Recompensas")
    mostrar_valores(grid.recompensas, grid)

    print("\nPolítica Inicial")
    mostrar_politica(politica, grid)

    print("\nValores Iniciales")
    # V[s] is the first element of MaxQ(Q[s]), i.e. the largest value stored
    # in Q[s], computed for every state that appears in the policy.
    V = {}
    for s in politica:
        V[s] = MaxQ(Q[s])[0]
    mostrar_valores(V, grid)

    # NOTE(review): not used within the visible lines; presumably the number
    # of episodes for a loop that follows — confirm.
    episodios = 10000