class PredictObsCartpoleEnv(Env):
    def __init__(self):
        super(PredictObsCartpoleEnv, self).__init__()
        self.cartpole = CartPoleEnv()
        self.observation_space = self.cartpole.observation_space
        self.action_space = spaces.Tuple(
            (self.cartpole.action_space,)
            + (self.cartpole.observation_space,) * NUM_PREDICTED_OBSERVATIONS)

    def _seed(self, *n, **kw):
        return self.cartpole._seed(*n, **kw)

    def _render(self, *n, **kw):
        return self.cartpole._render(*n, **kw)

    def _configure(self, *n, **kw):
        return self.cartpole._configure(*n, **kw)

    def _step(self, action):
        # The first element of action is the actual current action.
        current_action = action[0]
        observation, reward, done, info = self.cartpole._step(current_action)
        if not done:
            # We add the newly predicted observations to the list before checking
            # predictions, in order to give the agent a chance to predict the
            # observations it is going to get _this_ round.
            self.predicted_observations.append(action[1:])
            if self.iteration > TIME_BEFORE_BONUS_ALLOWED:
                for i in range(min(NUM_PREDICTED_OBSERVATIONS,
                                   len(self.predicted_observations))):
                    l2dist = np.sqrt(np.sum(np.square(np.subtract(
                        self.predicted_observations[-(i + 1)][i], observation))))
                    bonus = CORRECT_PREDICTION_BONUS * (1 - math.erf(l2dist))
                    reward += bonus
            self.iteration += 1
        return observation, reward, done, info

    def _reset(self):
        observation = self.cartpole._reset()
        self.predicted_observations = []
        self.iteration = 0
        return observation
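# A minimal rollout sketch for the class above. It follows the snippet's
# old-gym underscore API and assumes the module-level constants
# (NUM_PREDICTED_OBSERVATIONS, TIME_BEFORE_BONUS_ALLOWED,
# CORRECT_PREDICTION_BONUS) are defined as in the original module; the
# zero-vector "predictions" are placeholders, not a real predictive model.
env = PredictObsCartpoleEnv()
observation = env._reset()
done = False
while not done:
    # Action tuple layout: the real cartpole action first, then one predicted
    # observation for each of the next NUM_PREDICTED_OBSERVATIONS steps.
    predictions = tuple(np.zeros_like(observation)
                        for _ in range(NUM_PREDICTED_OBSERVATIONS))
    action = (env.cartpole.action_space.sample(),) + predictions
    observation, reward, done, info = env._step(action)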
def __init__(self, max_episode_length=500, random_stable_position=False):
    CartPoleEnv.__init__(self)
    self.action_high = np.asarray([self.force_mag])
    self.action_space = spaces.Box(-self.action_high, self.action_high)
    self._max_episode_length = max_episode_length
    self._time_step = 0
    self._stable_x = None
    if random_stable_position:
        self._rand_pos_max = self.x_threshold - 0.4
        self._stable_x = np.random.uniform(-self._rand_pos_max, self._rand_pos_max)
        # log.info("obs high : {}".format(self.observation_space.high))
        oh = np.hstack((self.observation_space.high,
                        np.asarray([self._rand_pos_max])))
        self.observation_space = spaces.Box(-oh, oh)
    log.debug("Action Space {}".format(self.action_space))
    log.debug("Observations Space {}".format(self.observation_space))
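# Hypothetical usage of the constructor above. The snippet does not show the
# class name, so ContinuousCartPoleEnv is assumed here purely for illustration.
# With random_stable_position=True the observation gains a fifth dimension
# holding the target cart position, and actions become continuous forces.
env = ContinuousCartPoleEnv(random_stable_position=True)  # hypothetical name
force = env.action_space.sample()  # 1-element array in [-force_mag, force_mag]
assert env.observation_space.shape == (5,)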
class PredictActionsCartpoleEnv(Env):
    def __init__(self):
        super(PredictActionsCartpoleEnv, self).__init__()
        self.cartpole = CartPoleEnv()
        self.observation_space = self.cartpole.observation_space
        self.action_space = spaces.Tuple(
            (self.cartpole.action_space,) * (NUM_PREDICTED_ACTIONS + 1))

    def _seed(self, *n, **kw):
        return self.cartpole._seed(*n, **kw)

    def _render(self, *n, **kw):
        return self.cartpole._render(*n, **kw)

    def _configure(self, *n, **kw):
        return self.cartpole._configure(*n, **kw)

    def _step(self, action):
        # The first element of action is the actual current action.
        current_action = action[0]
        observation, reward, done, info = self.cartpole._step(current_action)
        if not done:
            if self.iteration > TIME_BEFORE_BONUS_ALLOWED:
                for i in range(min(NUM_PREDICTED_ACTIONS,
                                   len(self.predicted_actions))):
                    if self.predicted_actions[-(i + 1)][i] == current_action:
                        reward += CORRECT_PREDICTION_BONUS
            self.predicted_actions.append(action[1:])
            self.iteration += 1
        return observation, reward, done, info

    def _reset(self):
        observation = self.cartpole._reset()
        self.predicted_actions = []
        self.iteration = 0
        return observation
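# A matching interaction sketch for PredictActionsCartpoleEnv, under the same
# assumptions about the module-level constants; the random "predictions" here
# are placeholders for a real self-model.
env = PredictActionsCartpoleEnv()
observation = env._reset()
done = False
while not done:
    # Tuple layout: (current action, predicted action 1 step ahead, ...,
    # predicted action NUM_PREDICTED_ACTIONS steps ahead).
    action = tuple(env.cartpole.action_space.sample()
                   for _ in range(NUM_PREDICTED_ACTIONS + 1))
    observation, reward, done, info = env._step(action)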
def reset(self):
    obs = CartPoleEnv.reset(self)
    self.steps_beyond_done = 0
    self.success_steps = 0
    return obs
def __init__(self):
    CartPoleEnv.__init__(self)
    self.steps_beyond_done = 0
    self.success_steps = 0
import gym
import numpy as np
import math
from gym.envs.classic_control.cartpole import CartPoleEnv
from envs.task import Task

_env = CartPoleEnv()
_X_THRESHOLD = _env.x_threshold
_THETA_THRESHOLD = _env.theta_threshold_radians
del _env


class CartPoleBalanceTask(Task):
    def __call__(self, states, actions, next_states):
        next_dones = GYMMB_ContinuousCartPole.is_done(next_states)
        dones = GYMMB_ContinuousCartPole.is_done(states)
        # You always get 1.0 unless you exceed the is_done termination criteria.
        rewards = 1.0 - next_dones.float() * dones.float()
        return rewards


class CartPoleSpeedyBalanceTask(Task):
    def __call__(self, states, actions, next_states):
        # Reward grows with absolute cart velocity: no free points for standing still.
        rewards = states[:, 1].abs()
        return rewards


class GYMMB_ContinuousCartPole(CartPoleEnv):
    """
    A continuous version.

    Observation space:
    """
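# A quick sanity check of the task callables above. This is a sketch: it
# assumes PyTorch tensors shaped [batch, 4] (the CartPole state, with cart
# velocity in column 1) and that the Task subclasses take no constructor
# arguments. Only the speedy task is exercised, since CartPoleBalanceTask
# needs the is_done static method not shown in the excerpt.
import torch

states = torch.randn(8, 4)
actions = torch.randn(8, 1)
next_states = torch.randn(8, 4)
speedy = CartPoleSpeedyBalanceTask()
print(speedy(states, actions, next_states))  # |cart velocity| per batch element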
def cartpole_env(env_id=1, **kwargs):
    return GymEnvWrapper(CartPoleEnv(**kwargs), act_null_value=0)
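# Hypothetical usage of the factory above, assuming GymEnvWrapper exposes the
# usual reset/step interface of rlpyt-style wrappers.
env = cartpole_env()
obs = env.reset()
step_result = env.step(0)  # 0 = push left in the discrete cartpole action space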
logging.debug("actions: %s\n%s", action_tensor.shape, action_tensor) logging.debug("log_prob: %s\n%s", log_prob_tensor.shape, log_prob_tensor) logging.debug("log_prob test: %s\n%s", prob_tensor.shape, torch.log(prob_tensor)) logging.debug("rewards: %s\n%s", rewards_tensor.shape, rewards_tensor) logging.debug("loss: %s\n%s", loss.shape, loss) if __name__ == "__main__": #Create environment. Note that make actually wraps the actual environment. #The wrapper will end an episode based on a time limit # env = gym.make("CartPole-v0") #state = [x,x_dot,theta,theta_dot], actions = [left_or_right] 0 = left 1 = right env = CartPoleEnv() #instanciate the env directly, without the wrapper #get the state and action size from the environement state_size = env.observation_space.shape[0] action_size = env.action_space.n #instanciate agent agent = PolicyGradientAgent(state_size, action_size) done = True last_render_time = time.time() episode_count = -1 #forever while True: #if episode is done