from slm_lab.agent import net
from slm_lab.agent.algorithm import policy_util
from slm_lab.agent.algorithm.base import Algorithm
from slm_lab.lib import logger, util
from slm_lab.lib.decorator import lab_api
import numpy as np
import pydash as ps
import torch

logger = logger.get_logger(__name__)


class SARSA(Algorithm):
    '''
    Implementation of SARSA.

    Algorithm:
    Repeat:
        1. Collect some examples by acting in the environment and store them in an on-policy replay memory (either batch or episodic)
        2. For each example, calculate the target (bootstrapped estimate of the discounted value of the state and action taken), y, using a neural network to approximate the Q function. s_t' is the next state following the action actually taken, a_t. a_t' is the action actually taken in the next state s_t'.
            y_t = r_t + gamma * Q(s_t', a_t')
        3. For each example, calculate the current estimate of the discounted value of the state and action taken
            x_t = Q(s_t, a_t)
        4. Calculate L(x, y) where L is a regression loss (e.g. MSE)
        5. Calculate the gradient of L with respect to all the parameters in the network and update the network parameters using the gradient

    e.g. algorithm_spec
    "algorithm": {
        "name": "SARSA",
        "action_pdtype": "default",
        "action_policy": "boltzmann",
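# Hedged illustration, not part of slm_lab: a minimal sketch of the target and
# loss computation described in steps 2-4 of the docstring above. It assumes
# batched tensors (states, actions, rewards, next_states, next_actions, dones)
# and a q_net mapping a state batch to per-action Q-values; the function name
# sarsa_loss and its signature are illustrative, not the Lab's API.
import torch
import torch.nn.functional as F

def sarsa_loss(q_net, states, actions, rewards, next_states, next_actions, dones, gamma=0.99):
    '''Regression loss between x_t = Q(s_t, a_t) and the SARSA target y_t.'''
    # x_t = Q(s_t, a_t): current estimate for the action actually taken
    x = q_net(states).gather(1, actions.long().unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        # y_t = r_t + gamma * Q(s_t', a_t'): bootstrap with the action actually
        # taken in s_t' (on-policy); zero the bootstrap at episode termination
        q_next = q_net(next_states).gather(1, next_actions.long().unsqueeze(1)).squeeze(1)
        y = rewards + gamma * (1 - dones.float()) * q_next
    return F.mse_loss(x, y)  # L(x, y), e.g. MSE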
from slm_lab.env.base import BaseEnv
from slm_lab.env.wrapper import make_gym_env
from slm_lab.env.vec_env import make_gym_venv
from slm_lab.env.registration import try_register_env
from slm_lab.lib import logger, util
from slm_lab.lib.decorator import lab_api
import gym
import numpy as np
import pydash as ps
import roboschool

logger = logger.get_logger(__name__)


class OpenAIEnv(BaseEnv):
    '''
    Wrapper for OpenAI Gym env to work with the Lab.

    e.g. env_spec
    "env": [{
        "name": "PongNoFrameskip-v4",
        "frame_op": "concat",
        "frame_op_len": 4,
        "normalize_state": false,
        "reward_scale": "sign",
        "num_envs": 8,
        "max_t": null,
        "max_frame": 1e7
    }],
    '''

    def __init__(self, spec):
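# Hedged illustration, not part of openai.py: the env_spec from the docstring
# above written out as a Python dict. JSON false/null become False/None; the
# variable name pong_env_spec is illustrative, and whether __init__ accepts
# exactly this shape directly is an assumption; the Lab normally resolves the
# env spec as part of a full experiment spec file.
pong_env_spec = {
    'env': [{
        'name': 'PongNoFrameskip-v4',
        'frame_op': 'concat',     # stack the last frame_op_len observed frames
        'frame_op_len': 4,
        'normalize_state': False,
        'reward_scale': 'sign',   # scale rewards to their sign, common for Atari
        'num_envs': 8,            # number of parallel (vector) envs
        'max_t': None,            # no per-episode time limit override
        'max_frame': 1e7,         # total environment frames for the run
    }],
}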