Example #1
def main(_):
    with tf.Session() as sess:
        env = gym.make(ENV_NAME)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        print(env.observation_space)
        print(env.action_space)

        state_dim = env.observation_space.shape[0]

        try:
            action_dim = env.action_space.shape[0]
            action_bound = env.action_space.high
            # Ensure the action bound is symmetric
            assert np.all(env.action_space.high == -env.action_space.low)
            discrete = False
            print('Continuous Action Space')
        except IndexError:
            action_dim = env.action_space.n
            action_bound = 1
            discrete = True
            print('Discrete Action Space')

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU)

        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        reward = Reward(REWARD_FACTOR, GAMMA)

        if GYM_MONITOR_EN:
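            # When not rendering, disable video capture (video_callable=False) and
            # overwrite any existing monitor files in MONITOR_DIR (force=True)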
            if not RENDER_ENV:
                env = Monitor(env,
                              MONITOR_DIR,
                              video_callable=False,
                              force=True)
            else:
                env = Monitor(env, MONITOR_DIR, force=True)

        try:
            train(sess, env, actor, critic, noise, reward, discrete)
        except KeyboardInterrupt:
            pass

        if GYM_MONITOR_EN:
            env.close()
Example #2
        if stateValues[maxValueAction]['value'] < stateValues[action]['value']:
            maxValueAction = action
        if stateValues[minCountAction]['count'] > stateValues[action]['count']:
            minCountAction = action
    # Compute the decay of the exploration
    decayX = 0.5
    decayY = 50
    decay = max(-i_episode * decayX + decayY, 10 / (i_episode + 1))
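    # With probability `decay` percent, explore by picking the least-tried action;
    # otherwise exploit the action with the highest estimated value.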
    if randint(0, 100) < decay:
        explorationHistory[i_episode] += 1
        return minCountAction
    else:
        return maxValueAction


nbEpisodes = 1000
stepsHistory = [0] * nbEpisodes
env = gym.make('LunarLander-v2')
env = Monitor(env, 'tmp/cart-pole', force=True)
for i in range(6):
    print(i)
    history = {}  # 'state' ==> [{'count': int, 'value': float}]
    explorationHistory = [0] * nbEpisodes
    learn(nbEpisodes, i)
env.close()
# gym.upload('tmp/cart-pole', api_key='sk_QoYvL963TwnAqSJXZLOQ')
plt.plot(range(nbEpisodes), stepsHistory, range(nbEpisodes),
         explorationHistory, range(nbEpisodes), [195] * nbEpisodes)
plt.ylabel('Number of rewards')
plt.show()
Example #3
    randomAgent = RandomAgent(action_size)
    while not randomAgent.memory.isFull():
        state = env.reset()
        for time in range(500):
            action = randomAgent.act(state)
            next_state, reward, done, _ = env.step(action)
            randomAgent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
    print("Finish to full the random agent memory")
    agent.memory = randomAgent.memory
    randomAgent = None
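    # Starting the DDQN agent from a pre-filled memory (rather than an empty
    # buffer) gives replay() meaningful minibatches from the first episode onward.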

    env = Monitor(env, 'tmp/cart-pole-ddqn-2', force=True)
    for e in range(EPISODES):
        if DEBUG and e >= EPISODES - 10:
            agent.stopExploration()

        state = env.reset()
        for time in range(500):
            #env.render()
            
            # act on one input (one state)
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)

            agent.remember(state, action, reward, next_state, done)
            agent.replay(batch_size)
            state = next_state
Example #4
import gym
import numpy as np

from gym.wrappers.monitoring import Monitor

env = gym.make('CartPole-v0')
env = Monitor(env, 'tmp/cart-pole-random-search-1', force=True)
print("Action space: {0}".format(env.action_space))
print("Observation space: {0}\n\tLow: {1}\n\tHigh: {2}".format(
    env.observation_space,
    env.observation_space.low,
    env.observation_space.high,
))


def action_selection(weights, observation):
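    # Linear policy: the sign of the dot product between the weight vector and
    # the observation selects the action (0 = push left, 1 = push right).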
    if np.matmul(weights, observation) < 0:
        return 0
    else:
        return 1


def run_episode(weights):
    observation = env.reset()
    total_reward = 0
    for t in range(200):
        env.render()
        action = action_selection(weights, observation)
        observation, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward
Example #5
import gym
from gym.wrappers.monitoring import Monitor
from gym.monitoring.tests import helpers


with helpers.tempdir() as temp:

    env = gym.make('CartPole-v0')
    # Wrap the environment with a Monitor
    env = Monitor(env, temp)
    #env.monitor.start(temp)
    #env.monitor.start('/tmp/cartpole-experiment-1')
    for i_episode in range(20):
        observation = env.reset()
        for t in range(500):
            env.render()
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            print(observation, reward, done, info)
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break

    env.close()
#env.monitor.close()
Example #6
def monitor_start(self, instance_id, directory, force, resume):
    # Wrap the stored environment in a Monitor that records to `directory`
    env = self._lookup_env(instance_id)
    self.envs[instance_id] = Monitor(env, directory, None, force, resume)
Example #7
# Source: http://rl-gym-doc.s3-website-us-west-2.amazonaws.com/mlss/lab1.html
import gym
import numpy as np
from gym.wrappers.monitoring import Monitor

from policy import Policy

# Task settings:
env = gym.make('CartPole-v0')  # Change as needed
env = Monitor(env, 'tmp/cart-pole-cross-entropy-1', force=True)
num_steps = 500  # maximum length of episode
# Alg settings:
n_iter = 100  # number of iterations of CEM
batch_size = 25  # number of samples per batch
elite_ratio = 0.2  # fraction of samples used as elite set

dim_theta = Policy.get_dim_theta(env)

# Initialize mean and standard deviation
theta_mean = np.zeros(dim_theta)
theta_std = np.ones(dim_theta)

# Now, for the algorithm
for iteration in range(n_iter):
    # Sample parameter vectors
    thetas = np.vstack([
        np.random.multivariate_normal(theta_mean, np.diag(theta_std**2))
        for _ in range(batch_size)
    ])
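    # Score each sampled parameter vector by rolling out its policy for up to num_steps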
    rewards = [
        Policy.make_policy(env, theta).evaluate(env, num_steps)
        for theta in thetas
    ]
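    # A full CEM update would then keep the top elite_ratio fraction of thetas by
    # reward and refit theta_mean and theta_std to that elite subset.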
Example #8
# Source: http://rl-gym-doc.s3-website-us-west-2.amazonaws.com/mlss/lab1.html
# Implementation of Monte-Carlo Expectation Maximization
import gym
import numpy as np
from gym.wrappers.monitoring import Monitor

from policy import Policy

# Task settings:
env = gym.make('CartPole-v0')  # Change as needed
env = Monitor(env, 'tmp/cart-pole-monte-carlo-em-1', force=True)
num_steps = 500  # maximum length of episode
# Alg settings:
n_iter = 100  # number of iterations of MC-EM
batch_size = 25  # number of samples per batch

dim_theta = Policy.get_dim_theta(env)

# Initialize mean and variance
theta_mean = np.zeros(dim_theta)
theta_variance = np.ones(dim_theta)

# Now, for the algorithm
for iteration in range(n_iter):
    # Sample parameter vectors
    thetas = np.vstack([
        np.random.multivariate_normal(theta_mean, np.diag(theta_variance))
        for _ in range(batch_size)
    ])
    rewards = [
        Policy.make_policy(env, theta).evaluate(env, num_steps)
        for theta in thetas
    ]
    # Weight parameters by score
    # Update theta_mean, theta_variance
    theta_mean = np.average(thetas, axis=0, weights=rewards)
    theta_variance = np.average((thetas - theta_mean) ** 2, axis=0, weights=rewards)
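    # Reward-weighted averaging acts as the Monte-Carlo M-step: parameter samples
    # with higher returns pull the sampling distribution towards themselves.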
Example #9
import gym
import numpy as np
from gym.wrappers.monitoring import Monitor

MC_POLICY_EVAL_EP = 10
BASE_NOISE_FACTOR = 0.1
NUM_POLICY_EVAL = 500

env = gym.make('CartPole-v0')
env = Monitor(env, 'tmp/cart-pole-hill-climb-4', force=True)

print("Action space: {0}".format(env.action_space))
print("Observation space: {0}\n\tLow: {1}\n\tHigh: {2}".format(
    env.observation_space,
    env.observation_space.low,
    env.observation_space.high,
))


def action_selection(weights, observation):
    if np.matmul(weights, observation) < 0:
        return 0
    else:
        return 1


def run_episode(weights):
    observation = env.reset()
    total_reward = 0
    for t in range(200):
        env.render()