def __init__(self):
    """Wrap a CartPoleEnv; the action space bundles the real action
    together with NUM_PREDICTED_ACTIONS predicted future actions,
    all drawn from the underlying CartPole action space."""
    super(PredictActionsCartpoleEnv, self).__init__()
    self.cartpole = CartPoleEnv()
    self.observation_space = self.cartpole.observation_space
    # One slot for the actual action plus one per predicted action.
    action_slots = (self.cartpole.action_space,) * (NUM_PREDICTED_ACTIONS + 1)
    self.action_space = spaces.Tuple(action_slots)
def __init__(self):
    """Wrap a CartPoleEnv; an action here is the real CartPole action
    followed by NUM_PREDICTED_OBSERVATIONS predicted future
    observations (each shaped like the observation space)."""
    super(PredictObsCartpoleEnv, self).__init__()
    self.cartpole = CartPoleEnv()
    self.observation_space = self.cartpole.observation_space
    # First element: the real action; remaining elements: predictions.
    predicted_slots = (self.cartpole.observation_space,) * NUM_PREDICTED_OBSERVATIONS
    self.action_space = spaces.Tuple((self.cartpole.action_space,) + predicted_slots)
def cartpole_env(env_id=1, **kwargs):
    """Build a GymEnvWrapper around a fresh CartPoleEnv.

    ``env_id`` is accepted for interface compatibility with sibling
    factories but is not used here. All remaining keyword arguments are
    forwarded to the ``CartPoleEnv`` constructor.
    """
    base_env = CartPoleEnv(**kwargs)
    return GymEnvWrapper(base_env, act_null_value=0)
import gym
import numpy as np
import math
from gym.envs.classic_control.cartpole import CartPoleEnv
from envs.task import Task

# Probe a throwaway CartPoleEnv once at import time to copy its termination
# thresholds into module constants, then delete the instance so no env object
# lingers at module scope.
_env = CartPoleEnv()
_X_THRESHOLD = _env.x_threshold
_THETA_THRESHOLD = _env.theta_threshold_radians
del _env


class CartPoleBalanceTask(Task):
    """Reward function for plain pole balancing."""

    def __call__(self, states, actions, next_states):
        # NOTE(review): assumes states/next_states are batched torch tensors
        # (is_done appears to return a boolean tensor with .float()) — confirm
        # against GYMMB_ContinuousCartPole.is_done and callers.
        next_dones = GYMMB_ContinuousCartPole.is_done(next_states)
        dones = GYMMB_ContinuousCartPole.is_done(states)
        # Reward drops to 0.0 only when BOTH the current and the next state
        # exceed the termination criteria; otherwise it stays at 1.0.
        rewards = 1.0 - next_dones.float() * dones.float()  # basically, you get always 1.0 unless you exceed the is_done termination criteria
        return rewards


class CartPoleSpeedyBalanceTask(Task):
    """Reward function that pays for cart speed rather than mere survival."""

    def __call__(self, states, actions, next_states):
        # Column 1 is the cart velocity (x_dot) in the classic CartPole state.
        rewards = states[:, 1].abs()  # more rewards for abs velocity. No free points
        return rewards


class GYMMB_ContinuousCartPole(CartPoleEnv):
    """ A continuous version. Observation space:
# NOTE(review): the debug lines below are the tail of an update/training-step
# function whose `def` header lies outside this fragment; their true
# indentation level cannot be recovered from here.
logging.debug("actions: %s\n%s", action_tensor.shape, action_tensor)
logging.debug("log_prob: %s\n%s", log_prob_tensor.shape, log_prob_tensor)
logging.debug("log_prob test: %s\n%s", prob_tensor.shape, torch.log(prob_tensor))
logging.debug("rewards: %s\n%s", rewards_tensor.shape, rewards_tensor)
logging.debug("loss: %s\n%s", loss.shape, loss)


if __name__ == "__main__":
    # Create environment. Note that make actually wraps the actual environment.
    # The wrapper will end an episode based on a time limit
    # env = gym.make("CartPole-v0")
    # state = [x, x_dot, theta, theta_dot], actions = [left_or_right] 0 = left 1 = right
    env = CartPoleEnv()  # instanciate the env directly, without the wrapper
    # get the state and action size from the environement
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    # instanciate agent
    agent = PolicyGradientAgent(state_size, action_size)
    # `done = True` forces a reset on the first pass of the loop below.
    done = True
    last_render_time = time.time()
    episode_count = -1
    # forever — training loop body continues past the end of this fragment
    while True:
        # if episode is done