Example #1
    def test_reinforce_doesnt_store_invalid_transitions(self):

        STEPS = 35

        env = DummyEnv()
        agent = REINFORCE.from_environment(env, discount_gamma=0.)
        rollout = Rolling(agent, env)

        rollout.roll(STEPS, verbose=0, push_experience=True)

        data = agent.memory_sampler.sample(-1)

        self.assertEqual(agent.episodes, 3)
        np.testing.assert_array_less(data["state"], 10)
        self.assertEqual(len(data["state"]), STEPS - 4)
Example #2
import gym
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

from trickster.agent import REINFORCE
from trickster.rollout import Trajectory, RolloutConfig
from trickster.utility import visual

env = gym.make("CartPole-v1")
input_shape = env.observation_space.shape
num_actions = env.action_space.n

policy = Sequential([
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(num_actions, activation="softmax")
])
policy.compile(loss="categorical_crossentropy", optimizer=Adam(5e-3))

agent = REINFORCE(policy, action_space=num_actions)

rollout = Trajectory(agent, env, config=RolloutConfig(max_steps=300))

rewards = []
losses = []

for episode in range(1, 501):
    rollout_history = rollout.rollout(verbose=0, push_experience=True)
    agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)

    rewards.append(rollout_history["reward_sum"])
    losses.append(agent_history["loss"])

    print("\rEpisode {:>4} RWD: {:>6.1f}, UTILITY: {: >8.4f}".format(
        episode, np.mean(rewards[-10:]), np.mean(losses[-10:])), end="")
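
# --- The original listing is truncated here. ---
# The script imports trickster.utility.visual, presumably to plot the learning
# curves after training. That helper's API is not shown in this listing, so the
# matplotlib stand-in below is an assumption, not part of the original script.
import matplotlib.pyplot as plt

fig, (ax_reward, ax_loss) = plt.subplots(2, 1, sharex=True)
ax_reward.plot(rewards)
ax_reward.set_ylabel("episode reward")
ax_loss.plot(losses)
ax_loss.set_ylabel("policy loss")
ax_loss.set_xlabel("episode")
plt.show()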
Example #3
actor_stream = LeakyReLU()(actor_stream)
actor_stream = Conv2D(64, (3, 3), strides=(2, 2),
                      padding="same")(actor_stream)  # 4
actor_stream = LeakyReLU()(actor_stream)
actor_stream = Conv2D(128, (4, 4))(actor_stream)
actor_stream = LeakyReLU()(actor_stream)

actor_stream = Dense(32, activation="relu")(actor_stream)
action_probs = Dense(NUM_MOVES, activation="softmax")(actor_stream)

actor = Model(common_input, action_probs, name="Actor")
actor.compile(Adam(1e-3), "categorical_crossentropy")

agent = REINFORCE(actor,
                  action_space=MOVES,
                  memory=Experience(max_length=10000),
                  discount_factor_gamma=0.99,
                  state_preprocessor=preprocess)

screen = CV2Screen(scale=2)
episode = 0
reward_memory = deque(maxlen=100)
critic_losses = deque(maxlen=50)
actor_losses = deque(maxlen=100)

rollout = MultiTrajectory(agent,
                          envs,
                          warmup_episodes=WARMUP,
                          rollout_configs=RolloutConfig(max_steps=512,
                                                        skipframes=4))
history = {"episode": 0}
Example #4
from collections import deque

from keras.models import Sequential
from keras.layers import Flatten, Dense, BatchNormalization, LeakyReLU
from keras.optimizers import RMSprop

from trickster.agent import REINFORCE
from trickster.experience import Experience
from trickster.rollout import Trajectory, MultiTrajectory  # import path for MultiTrajectory assumed from the other examples in this listing

# FakeEnv is a project-local dummy environment; its definition is not included in this listing.
envs = [FakeEnv() for _ in range(10)]
test_env = FakeEnv()

actor = Sequential([  # 200, 160
    Flatten(input_shape=test_env.shape),
    Dense(256),
    BatchNormalization(),
    LeakyReLU(),
    Dense(2, activation="softmax")
])
actor.compile(RMSprop(1e-4, rho=0.99), "categorical_crossentropy")

agent = REINFORCE(actor,
                  2,
                  Experience(),
                  discount_factor_gamma=0.99,
                  state_preprocessor=None)

rollout = MultiTrajectory([agent for _ in range(10)], envs)
test_rollout = Trajectory(agent, test_env)

rewards = deque(maxlen=100)
actor_loss = deque(maxlen=100)
actor_utility = deque(maxlen=100)
actor_entropy = deque(maxlen=100)
critic_loss = deque(maxlen=100)

episode = 0

while True:
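    # --- The original listing is truncated at this point. ---
    # A minimal sketch of one possible loop body, assuming MultiTrajectory and
    # Trajectory expose the same rollout()/fit() interface and history keys as
    # the Trajectory loop in Example #2 (these names are assumptions, not taken
    # from this listing):
    episode += 1
    rollout_history = rollout.rollout(verbose=0, push_experience=True)
    agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)

    rewards.append(rollout_history["reward_sum"])
    actor_loss.append(agent_history["loss"])

    if episode % 100 == 0:
        test_history = test_rollout.rollout(verbose=0, push_experience=False)
        print("Episode {:>5} mean train reward: {:>7.2f} test reward: {:>7.2f}".format(
            episode, sum(rewards) / len(rewards), test_history["reward_sum"]))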
Example #5
from trickster.agent import REINFORCE
from trickster.rollout import Trajectory, RolloutConfig
from trickster.utility import gymic
from trickster.model import mlp

env = gymic.rwd_scaled_env()
input_shape = env.observation_space.shape
num_actions = env.action_space.n

policy = mlp.wide_mlp_actor_categorical(input_shape, num_actions, adam_lr=1e-4)
agent = REINFORCE(policy, action_space=num_actions)
rollout = Trajectory(agent, env, config=RolloutConfig(max_steps=300))

rollout.fit(episodes=500, rollouts_per_update=1, update_batch_size=-1)
rollout.render(repeats=10)
Example #6
import keras

from trickster.agent import REINFORCE
from trickster.rollout import Trajectory
from trickster.experience import Experience
from trickster.utility import gymic
from trickster.model import mlp

env = gymic.rwd_scaled_env("LunarLander-v2")
input_shape = env.observation_space.shape
num_actions = env.action_space.n

policy = mlp.wide_mlp_actor_categorical(input_shape, num_actions)
policy.compile(optimizer=keras.optimizers.SGD(lr=2e-4, momentum=0.9),
               loss="categorical_crossentropy")

agent = REINFORCE(policy,
                  action_space=num_actions,
                  memory=Experience(max_length=10000),
                  discount_factor_gamma=0.99)

rollout = Trajectory(agent, env)
rollout.fit(episodes=1000, rollouts_per_update=16, update_batch_size=-1)
rollout.render(repeats=10)