MaxA = a return MaxV,MaxA def accionRandom(a,grid = Grid.grid_estandar(), eps=EPS): p = np.random.random() if p < (1.0 - eps): return a else: return np.random.choice(grid.posiblesAccionesBasicas()) if __name__ == "__main__": grid = Grid.grid_negativo(-0.1) todos_estados = grid.todos_estados() Q = {} alpha_personal = {} for s in todos_estados: Q[s] = {} alpha_personal[s] = {} for a in grid.posiblesAccionesBasicas(): Q[s][a] = 0 alpha_personal[s][a] = 1.0 t = 1.0
def MaxQ(Q_s):
    """Return the largest value stored in ``Q_s`` together with its key.

    Args:
        Q_s: Mapping from key to a comparable value (this file calls it
            with ``Q[s]``).

    Returns:
        A ``(value, key)`` tuple. Because the comparison is strict, the
        first key encountered wins on ties. If ``Q_s`` is empty, or no
        value is greater than ``-inf``, the result is
        ``(float('-inf'), None)``.
    """
    best_value = float('-inf')
    best_key = None
    for key, value in Q_s.items():
        # Strict ">" keeps the earliest key when several values are equal.
        if value > best_value:
            best_value = value
            best_key = key
    return (best_value, best_key)


if __name__ == "__main__":
    grid = Grid.grid_negativo(penalizacion=-0.2)
    Q, politica, historialRetornos = iniciarValores(grid)

    print("Recompensas")
    mostrar_valores(grid.recompensas, grid)

    print("\nPolítica Inicial")
    mostrar_politica(politica, grid)

    print("\nValores Iniciales")
    # V[s] is the best Q-value available in state s under the initial Q.
    V = {}
    for s in politica:
        V[s] = MaxQ(Q[s])[0]
    mostrar_valores(V, grid)

    episodios = 10000