def main(_):
    with tf.Session() as sess:
        env = gym.make(ENV_NAME)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        print(env.observation_space)
        print(env.action_space)

        state_dim = env.observation_space.shape[0]
        try:
            action_dim = env.action_space.shape[0]
            action_bound = env.action_space.high
            # Ensure the action bound is symmetric
            assert (env.action_space.high == -env.action_space.low).all()
            discrete = False
            print('Continuous Action Space')
        except IndexError:
            # Discrete spaces have an empty shape, so shape[0] raises IndexError
            action_dim = env.action_space.n
            action_bound = 1
            discrete = True
            print('Discrete Action Space')

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU)
        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        reward = Reward(REWARD_FACTOR, GAMMA)

        if GYM_MONITOR_EN:
            if not RENDER_ENV:
                env = Monitor(env, MONITOR_DIR, video_callable=False, force=True)
            else:
                env = Monitor(env, MONITOR_DIR, force=True)

        try:
            train(sess, env, actor, critic, noise, reward, discrete)
        except KeyboardInterrupt:
            pass

        if GYM_MONITOR_EN:
            env.close()
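# Noise(DELTA, SIGMA, OU_A, OU_MU) and Reward(REWARD_FACTOR, GAMMA) are defined
# elsewhere in the source. Given the OU_* argument names, Noise is presumably an
# Ornstein-Uhlenbeck process, the usual exploration noise for DDPG. A minimal
# sketch under that assumption (the internals are not from the original):
import numpy as np

class Noise:
    def __init__(self, delta, sigma, ou_a, ou_mu):
        self.delta = delta  # time-step size dt
        self.sigma = sigma  # volatility of the random term
        self.ou_a = ou_a    # mean-reversion rate
        self.ou_mu = ou_mu  # long-run mean the process reverts to

    def ornstein_uhlenbeck_level(self, x):
        # One Euler step of dx = a * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        drift = self.ou_a * (self.ou_mu - x) * self.delta
        diffusion = self.sigma * np.sqrt(self.delta) * np.random.normal(size=np.shape(x))
        return x + drift + diffusion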
        if stateValues[maxValueAction]['value'] < stateValues[action]['value']:
            maxValueAction = action
        if stateValues[minCountAction]['count'] > stateValues[action]['count']:
            minCountAction = action

    # Compute the decay of the exploration rate: linear at first, with a
    # slowly vanishing floor so some exploration always remains
    decayX = 0.5
    decayY = 50
    decay = max(-i_episode * decayX + decayY, 10 / (i_episode + 1))
    if randint(0, 100) < decay:
        explorationHistory[i_episode] += 1
        return minCountAction
    else:
        return maxValueAction


nbEpisodes = 1000
stepsHistory = [0] * nbEpisodes

env = gym.make('LunarLander-v2')
env = Monitor(env, 'tmp/cart-pole', force=True)
for i in range(6):
    print(i)
    history = {}  # 'state' ==> [{'count': int, 'value': float}]
    explorationHistory = [0] * nbEpisodes
    learn(nbEpisodes, i)
env.close()
# gym.upload('tmp/cart-pole', api_key='sk_QoYvL963TwnAqSJXZLOQ')

plt.plot(range(nbEpisodes), stepsHistory,
         range(nbEpisodes), explorationHistory,
         range(nbEpisodes), [195] * nbEpisodes)
plt.ylabel('Number of rewards')
plt.show()
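# For intuition, the decay schedule above starts at decayY = 50 (roughly a 50%
# chance of exploring), falls by decayX = 0.5 per episode, and is floored at
# 10 / (i_episode + 1), so exploration fades but never reaches exactly zero.
# A standalone check of a few episodes:
decayX, decayY = 0.5, 50
for i_episode in (0, 50, 100, 999):
    decay = max(-i_episode * decayX + decayY, 10 / (i_episode + 1))
    print(i_episode, round(decay, 3))
# prints: 0 50.0, 50 25.0, 100 0.099, 999 0.01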
# Pre-fill the replay memory with transitions from a purely random agent
randomAgent = RandomAgent(action_size)
while not randomAgent.memory.isFull():
    state = env.reset()
    for time in range(500):
        action = randomAgent.act(state)
        next_state, reward, done, _ = env.step(action)
        randomAgent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            break
print("Finished filling the random agent's memory")

# Hand the pre-filled memory over to the learning agent
agent.memory = randomAgent.memory
randomAgent = None

env = Monitor(env, 'tmp/cart-pole-ddqn-2', force=True)
for e in range(EPISODES):
    if DEBUG and e >= EPISODES - 10:
        agent.stopExploration()
    state = env.reset()
    for time in range(500):
        # env.render()
        # Act on one input (one state)
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        agent.replay(batch_size)
        state = next_state
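# RandomAgent and its memory are defined elsewhere; the snippet above only
# relies on a bounded buffer exposing isFull(). A minimal sketch of such a
# memory (the class layout is an assumption; only the isFull name comes from
# the code above):
import random
from collections import deque

class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = deque(maxlen=capacity)

    def append(self, transition):
        # Oldest transitions fall off automatically once at capacity
        self.buffer.append(transition)

    def isFull(self):
        return len(self.buffer) >= self.capacity

    def sample(self, batch_size):
        return random.sample(list(self.buffer), batch_size)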
import gym
import numpy as np
from gym.wrappers.monitoring import Monitor

env = gym.make('CartPole-v0')
env = Monitor(env, 'tmp/cart-pole-random-search-1', force=True)
print("Action space: {0}".format(env.action_space))
print("Observation space: {0}\n\tLow: {1}\n\tHigh: {2}".format(
    env.observation_space,
    env.observation_space.low,
    env.observation_space.high,
))

def action_selection(weights, observation):
    # Linear policy: pick the action from the sign of the dot product
    if np.matmul(weights, observation) < 0:
        return 0
    else:
        return 1

def run_episode(weights):
    observation = env.reset()
    total_reward = 0
    for t in range(200):
        env.render()
        action = action_selection(weights, observation)
        observation, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward
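# The source stops at the evaluation helper above. A sketch of the
# random-search driver it would plausibly feed, assuming fresh weights are
# sampled uniformly each episode (run_episode is from above; this driver is
# not in the original):
best_reward = 0
best_weights = None
for episode in range(1000):
    weights = np.random.uniform(-1.0, 1.0, size=4)  # CartPole has 4 observations
    episode_reward = run_episode(weights)
    if episode_reward > best_reward:
        best_reward, best_weights = episode_reward, weights
    if best_reward >= 200:  # CartPole-v0 episodes cap at 200 steps
        print("Reached the 200-step cap after {0} episodes".format(episode + 1))
        break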
import gym
from gym.wrappers.monitoring import Monitor
from gym.monitoring.tests import helpers

with helpers.tempdir() as temp:
    env = gym.make('CartPole-v0')
    # Wrap the environment with a Monitor
    env = Monitor(env, temp)
    # env.monitor.start(temp)
    # env.monitor.start('/tmp/cartpole-experiment-1')
    for i_episode in range(20):
        observation = env.reset()
        for t in range(500):
            env.render()
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            print(observation, reward, done, info)
            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break
    env.close()
    # env.monitor.close()
def monitor_start(self, instance_id, directory, force, resume):
    # Wrap the environment registered under instance_id with a Monitor
    env = self._lookup_env(instance_id)
    self.envs[instance_id] = Monitor(env, directory, video_callable=None,
                                     force=force, resume=resume)
# Source: http://rl-gym-doc.s3-website-us-west-2.amazonaws.com/mlss/lab1.html
import gym
import numpy as np
from gym.wrappers.monitoring import Monitor
from policy import Policy

# Task settings:
env = gym.make('CartPole-v0')  # Change as needed
env = Monitor(env, 'tmp/cart-pole-cross-entropy-1', force=True)
num_steps = 500  # maximum length of episode

# Alg settings:
n_iter = 100  # number of iterations of CEM
batch_size = 25  # number of samples per batch
elite_ratio = 0.2  # fraction of samples used as elite set
dim_theta = Policy.get_dim_theta(env)

# Initialize mean and standard deviation
theta_mean = np.zeros(dim_theta)
theta_std = np.ones(dim_theta)

# Now, for the algorithm
for iteration in range(n_iter):
    # Sample parameter vectors
    thetas = np.vstack([
        np.random.multivariate_normal(theta_mean, np.diag(theta_std**2))
        for _ in range(batch_size)
    ])
    rewards = [
        Policy.make_policy(env, theta).evaluate(env, num_steps)
        for theta in thetas
    ]
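    # Sketch continuation (not the lab's verbatim code): the snippet cuts off
    # before the update step. elite_ratio implies the standard CEM update:
    # keep the top-scoring fraction of samples and refit the Gaussian to them.
    n_elite = int(batch_size * elite_ratio)
    elite_idx = np.argsort(rewards)[-n_elite:]
    elite_thetas = thetas[elite_idx]
    theta_mean = elite_thetas.mean(axis=0)
    theta_std = elite_thetas.std(axis=0)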
# Source: http://rl-gym-doc.s3-website-us-west-2.amazonaws.com/mlss/lab1.html
# Implementation of Monte-Carlo Expectation Maximization
import gym
import numpy as np
from gym.wrappers.monitoring import Monitor
from policy import Policy

# Task settings:
env = gym.make('CartPole-v0')  # Change as needed
env = Monitor(env, 'tmp/cart-pole-monte-carlo-em-1', force=True)
num_steps = 500  # maximum length of episode

# Alg settings:
n_iter = 100  # number of iterations of MC-EM
batch_size = 25  # number of samples per batch
dim_theta = Policy.get_dim_theta(env)

# Initialize mean and variance
theta_mean = np.zeros(dim_theta)
theta_variance = np.ones(dim_theta)

# Now, for the algorithm
for iteration in range(n_iter):
    # Sample parameter vectors
    thetas = np.vstack([
        np.random.multivariate_normal(theta_mean, np.diag(theta_variance))
        for _ in range(batch_size)
    ])
    rewards = [
        Policy.make_policy(env, theta).evaluate(env, num_steps)
        for theta in thetas
    ]
    # Weight parameters by their score, then update theta_mean and theta_variance
    theta_mean = np.average(thetas, axis=0, weights=rewards)
    theta_variance = np.average((thetas - theta_mean) ** 2, axis=0, weights=rewards)
import gym
import numpy as np
from gym.wrappers.monitoring import Monitor

MC_POLICY_EVAL_EP = 10
BASE_NOISE_FACTOR = 0.1
NUM_POLICY_EVAL = 500

env = gym.make('CartPole-v0')
env = Monitor(env, 'tmp/cart-pole-hill-climb-4', force=True)
print("Action space: {0}".format(env.action_space))
print("Observation space: {0}\n\tLow: {1}\n\tHigh: {2}".format(
    env.observation_space,
    env.observation_space.low,
    env.observation_space.high,
))

def action_selection(weights, observation):
    # Linear policy: pick the action from the sign of the dot product
    if np.matmul(weights, observation) < 0:
        return 0
    else:
        return 1

def run_episode(weights):
    observation = env.reset()
    total_reward = 0
    for t in range(200):
        env.render()
        action = action_selection(weights, observation)
        observation, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward
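# The hill-climbing loop itself is cut off. Reading the constants above, a
# plausible sketch: estimate each policy as the mean return over
# MC_POLICY_EVAL_EP episodes, perturb the best weights with noise scaled by
# BASE_NOISE_FACTOR, and try NUM_POLICY_EVAL candidates (the exact update
# rule is an assumption, not from the original):
def evaluate_policy(weights):
    returns = [run_episode(weights) for _ in range(MC_POLICY_EVAL_EP)]
    return np.mean(returns)

best_weights = np.random.uniform(-1.0, 1.0, size=4)
best_score = evaluate_policy(best_weights)
for i in range(NUM_POLICY_EVAL):
    candidate = best_weights + BASE_NOISE_FACTOR * np.random.randn(4)
    score = evaluate_policy(candidate)
    if score > best_score:
        best_weights, best_score = candidate, score
env.close()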