from trickster.utility import visual

# Single CartPole environment; its observation shape and discrete action
# count size the policy network's input and output layers.
env = gym.make("CartPole-v1")
input_shape = env.observation_space.shape
num_actions = env.action_space.n

# Policy network: two 16-unit ReLU hidden layers followed by a softmax layer
# with one unit per action.
policy = Sequential([
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(num_actions, activation="softmax")
])
policy.compile(loss="categorical_crossentropy", optimizer=Adam(5e-3))

agent = REINFORCE(policy, action_space=num_actions)

# Rollout over the single environment, configured with max_steps=300.
rollout = Trajectory(agent, env, config=RolloutConfig(max_steps=300))

# Per-episode histories appended to by the training loop.
rewards = []
losses = []

# 500 training episodes: one rollout followed by one fit call per episode.
for episode in range(1, 501):
    # push_experience=True: presumably stores the collected samples in the
    # agent's memory -- confirm against the trickster rollout API.
    rollout_history = rollout.rollout(verbose=0, push_experience=True)
    # batch_size=-1 presumably means "fit on the whole memory", which is then
    # cleared (reset_memory=True) -- confirm against the agent API.
    agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)

    rewards.append(rollout_history["reward_sum"])
    losses.append(agent_history["loss"])

    # "\r" plus end="" rewrites the same console line each episode. Both
    # figures are means over the last 10 entries; the value labelled
    # "UTILITY" is the agent's reported "loss".
    print("\rEpisode {:>4} RWD: {:>6.1f}, UTILITY: {: >8.4f}".format(
        episode, np.mean(rewards[-10:]), np.mean(losses[-10:])),
          end="")
示例#2
0
# REINFORCE agent around `actor`; `actor`, `MOVES` and `preprocess` are
# defined outside this excerpt.
agent = REINFORCE(actor,
                  action_space=MOVES,
                  memory=Experience(max_length=10000),
                  discount_factor_gamma=0.99,
                  state_preprocessor=preprocess)

screen = CV2Screen(scale=2)
episode = 0
# Bounded windows: once maxlen is reached, appending drops the oldest entry.
reward_memory = deque(maxlen=100)
critic_losses = deque(maxlen=50)
actor_losses = deque(maxlen=100)

# One rollout object driving all environments in `envs`.
rollout = MultiTrajectory(agent,
                          envs,
                          warmup_episodes=WARMUP,
                          rollout_configs=RolloutConfig(max_steps=512,
                                                        skipframes=4))
# Initial value so history["episode"] can be read even before the first
# roll() call has replaced this dict.
history = {"episode": 0}

# Endless training loop; no exit condition is visible in this excerpt.
while 1:
    rollout.reset()
    episode_actor_losses = []
    # Advance in 4-step chunks until the rollout reports it is finished.
    while not rollout.finished:
        history = rollout.roll(steps=4, verbose=0, learning_batch_size=32)
        reward_memory.append(history["reward_sum"])
        # The guard implies "loss" can be missing from the returned dict
        # (presumably when no learning step ran -- confirm).
        if "loss" in history:
            episode_actor_losses.append(history["loss"])
    # Checkpoint the actor after every pass of the inner loop.
    actor.save("../models/reskiv/reinforce_latest.h5")
    episode = history["episode"]

    # Record a mean loss only when at least one loss was collected.
    if episode_actor_losses:
        actor_losses.append(np.mean(episode_actor_losses))
示例#3
0
          activation="relu",
          input_shape=input_shape,
          kernel_initializer="he_uniform"),
    Dense(16, activation="relu", kernel_initializer="he_uniform"),
    Dense(1, activation="linear", kernel_initializer="he_uniform")
])
# The critic is trained with mean squared error.
critic.compile(loss="mse", optimizer=Adam(5e-4))

# A2C agent combining `actor` and `critic`; `actor` and `env` are defined
# outside this excerpt.
agent = A2C(actor,
            critic,
            action_space=env.action_space,
            memory=Experience(max_length=10000),
            discount_factor_gamma=0.98,
            entropy_penalty_coef=0.01)

# Training rollout on `env`; the test rollout gets its own, freshly created
# CartPole instance and no explicit RolloutConfig.
rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

# Histories collected during training.
rewards = []
actor_loss = []
actor_utility = []
actor_entropy = []
critic_loss = []

for episode in range(1, 1001):
    episode_actor_loss = []
    episode_actor_utility = []
    episode_actor_entropy = []
    episode_critic_loss = []

    for update in range(32):
示例#4
0
from trickster.agent import REINFORCE
from trickster.rollout import MultiTrajectory, RolloutConfig
from trickster.utility import visual

# Eight independent CartPole environments; the first one supplies the
# observation shape and action count used to size the policy network.
envs = [gym.make("CartPole-v1") for _ in range(8)]
input_shape = envs[0].observation_space.shape
num_actions = envs[0].action_space.n

# Policy network: two 16-unit ReLU hidden layers followed by a softmax layer
# with one unit per action.
policy = Sequential([Dense(16, activation="relu", input_shape=input_shape),
                     Dense(16, activation="relu"),
                     Dense(num_actions, activation="softmax")])
policy.compile(loss="categorical_crossentropy", optimizer=Adam(5e-3))

agent = REINFORCE(policy, action_space=num_actions)

# A single rollout object over all eight environments, each configured with
# max_steps=300.
rollout = MultiTrajectory(agent, envs, rollout_configs=RolloutConfig(max_steps=300))

# Per-episode histories appended to by the training loop.
rewards = []
losses = []

# 500 training episodes: one multi-environment rollout, then one fit call.
for episode in range(1, 501):
    rollout_history = rollout.rollout(verbose=0, push_experience=True)
    agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)

    rewards.append(rollout_history["rewards"])
    losses.append(agent_history["loss"])

    # The status line is rewritten in place via "\r"; every 10th episode it
    # is terminated with a newline so that it stays on screen.
    print(
        "\rEpisode {:>4} RWD: {:>6.1f}, UTILITY: {: >8.4f}".format(
            episode,
            np.mean(rewards[-10:]),
            np.mean(losses[-10:]),
        ),
        end="\n" if episode % 10 == 0 else "",
    )

# Plot both histories with a smoothing window of 10.
visual.plot_vectors(
    [rewards, losses],
    ["Reward", "Loss"],
    smoothing_window_size=10,
)
示例#5
0
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(1, activation="linear")
])
# The critic is trained with mean squared error; the learning rate comes
# from a constant defined outside this excerpt.
critic.compile(loss="mse", optimizer=Adam(CRITIC_ADAM_LR))

# PPO agent combining `actor` and `critic`.
# NOTE(review): the discount keyword here is `reward_discount_factor_gamma`,
# while the other agents in this file use `discount_factor_gamma` -- confirm
# which name the installed trickster version expects.
agent = PPO(actor,
            critic,
            action_space=test_env.action_space,
            memory=Experience(max_length=EXPERIENCE_SIZE),
            reward_discount_factor_gamma=DISCOUNT_FACTOR_GAMMA,
            entropy_penalty_coef=ENTROPY_PENALTY_BETA)

# Training rollout over all of `envs`; the test rollout gets its own,
# freshly created CartPole instance and no explicit RolloutConfig.
rollout = MultiRolling(agent,
                       envs,
                       rollout_configs=RolloutConfig(max_steps=MAX_TIMESTEPS))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

# Histories collected during training.
rewards = []
actor_loss = []
actor_utility = []
actor_kld = []
actor_entropy = []
critic_loss = []

for episode in range(1, NUM_EPISODES + 1):
    roll_history = rollout.roll(steps=ROLL_TIMESTEPS,
                                verbose=0,
                                push_experience=True)
    agent_history = agent.fit(epochs=FIT_EPOCHS,
                              batch_size=FIT_BATCH_SIZE,
# The first environment supplies the observation shape and action count
# used to size both the Q-network and the agent.
input_shape = envs[0].observation_space.shape
num_actions = envs[0].action_space.n

# Q-network: two 16-unit ReLU hidden layers and a linear output with one
# value per action, trained with mean squared error.
qnet = Sequential([Dense(16, activation="relu", input_shape=input_shape),
                   Dense(16, activation="relu"),
                   Dense(num_actions, activation="linear")])
qnet.compile(loss="mse", optimizer=Adam(1e-3))

# The agent's action space is taken from the environment rather than
# hard-coded, so it always agrees with the width of the network's output.
agent = DQN(qnet,
            action_space=num_actions,
            memory=Experience(max_length=10000),
            epsilon=1.,
            discount_factor_gamma=0.98,
            use_target_network=True)

# Training rollout over all of `envs`; the test rollout gets its own,
# freshly created CartPole instance and no explicit RolloutConfig.
rollout = MultiRolling(agent, envs, rollout_configs=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

# Histories collected during training.
rewards = []
losses = []

# 500 training episodes of 32 updates each.
for episode in range(1, 501):
    episode_losses = []

    # Each update rolls the training environments forward by 4 steps and then
    # fits the agent on a batch of 32.
    for update in range(32):
        rollout.roll(steps=4, verbose=0, push_experience=True)
        agent_history = agent.fit(batch_size=32, verbose=0)
        episode_losses.append(agent_history["loss"])

    # Evaluation rollout: push_experience=False, so its samples are not handed
    # to the agent, and rendering is off.
    test_history = test_rollout.rollout(verbose=0, push_experience=False, render=False)
    rewards.append(test_history["reward_sum"])
示例#7
0
# Q-network (named `policy` here): two 24-unit ReLU hidden layers and a
# linear output with one value per action, trained with mean squared error.
policy = Sequential([
    Dense(24, activation="relu", input_shape=input_shape),
    Dense(24, activation="relu"),
    Dense(num_actions, activation="linear")
])
policy.compile(loss="mse", optimizer=Adam(1e-4))

agent = DQN(policy,
            action_space=num_actions,
            memory=Experience(max_length=10000),
            epsilon=1.,
            discount_factor_gamma=0.98,
            use_target_network=True)

# Rollout on `env` (defined outside this excerpt), configured with
# max_steps=200.
rollout = Trajectory(agent, env, RolloutConfig(max_steps=200))

# Histories collected during training.
rewards = []
losses = []

# 32 warm-up rollouts with learning_batch_size=0 (presumably: collect
# experience without any learning step -- confirm against the rollout API).
for warmup in range(1, 33):
    rollout.rollout(verbose=0, learning_batch_size=0)

# 500 training episodes, advanced in 2-step chunks with a learning batch of 64.
for episode in range(1, 501):
    # NOTE(review): this calls the underscore-prefixed `_reset()`; another
    # example in this file uses a public `reset()` -- confirm which one this
    # rollout class exposes.
    rollout._reset()
    episode_rewards = []
    episode_losses = []
    while not rollout.finished:
        roll_history = rollout.roll(steps=2, verbose=0, learning_batch_size=64)
        episode_rewards.append(roll_history["reward_sum"])
        # NOTE(review): "loss" is read without a presence check; this raises
        # KeyError if roll() can return a dict without it -- confirm.
        episode_losses.append(roll_history["loss"])
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(1, activation="linear")
])
# The critic is trained with mean squared error.
critic.compile(loss="mse", optimizer=Adam(5e-4))

# A2C agent combining `actor` and `critic`; the action space is taken from
# the first training environment.
agent = A2C(actor,
            critic,
            action_space=envs[0].action_space,
            memory=Experience(max_length=10000),
            discount_factor_gamma=0.98,
            entropy_penalty_coef=0.01)

# Training rollout over all of `envs`; the test rollout gets its own,
# freshly created CartPole instance and no explicit RolloutConfig.
rollout = MultiRolling(agent,
                       envs,
                       rollout_configs=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

# Histories collected during training.
rewards = []
actor_loss = []
actor_utility = []
actor_entropy = []
critic_loss = []

for episode in range(1, 301):
    episode_actor_loss = []
    episode_actor_utility = []
    episode_actor_entropy = []
    episode_critic_loss = []

    for update in range(32):
示例#9
0
# The action count is read from the first dimension of the action space's
# shape (the agent below is created with spaces.CONTINUOUS).
num_actions = env.action_space.shape[0]

# One actor and two critics, all with a learning rate of 5e-4.
actor, critics = mlp.wide_ddpg_actor_critic(input_shape,
                                            output_dim=num_actions,
                                            action_range=2,
                                            num_critics=2,
                                            actor_lr=5e-4,
                                            critic_lr=5e-4)

# TD3 agent; the action bounds (-2, 2) match the action_range=2 passed to the
# network factory above.
agent = TD3(actor,
            critics,
            action_space=spaces.CONTINUOUS,
            memory=Experience(max_length=int(1e4)),
            discount_factor_gamma=0.99,
            action_noise_sigma=0.1,
            action_noise_sigma_decay=1.,
            action_minima=-2,
            action_maxima=2,
            target_noise_sigma=0.2,
            target_noise_clip=0.5)

# NOTE(review): the training and testing rollouts wrap the same `env` object;
# other examples in this file give the test rollout a separate environment
# instance -- confirm that sharing is safe here.
rollout = Rolling(agent, env)
test_rollout = Trajectory(agent, env, RolloutConfig(testing_rollout=True))

# NOTE(review): the keyword is spelled `step_per_update` here, whereas a
# MultiRolling.fit call elsewhere in this file uses `steps_per_update` --
# confirm the spelling Rolling.fit accepts.
rollout.fit(episodes=1000,
            updates_per_episode=64,
            step_per_update=1,
            update_batch_size=32,
            testing_rollout=test_rollout)
test_rollout.render(repeats=10)
示例#10
0
# Separate environment instances for training and for testing.
env = Match(cfg)
test_env = Match(cfg)

# Dueling Q-network sized from the environment's observation shape and
# action count.
ann = mlp.wide_dueling_q_network(env.observation_space.shape,
                                 env.action_space.n,
                                 adam_lr=1e-4)

experience = Experience(10000)
# epsilon_decay=1. keeps epsilon at its initial value of 1. while the buffer
# is being filled; the real decay rate is switched on afterwards.
agent = DoubleDQN(ann,
                  env.action_space,
                  experience,
                  epsilon=1.,
                  epsilon_decay=1.,
                  epsilon_min=0.1)

# Both rollouts share the same configuration object.
rcfg = RolloutConfig(max_steps=1024, skipframes=2)
training_rollout = Rolling(agent, env, rcfg)
testing_rollout = Trajectory(agent, test_env, rcfg)

# Fill the replay buffer before any learning takes place.
print("Filling experience...")
while experience.N < 10000:
    training_rollout.roll(steps=32, verbose=0, push_experience=True)
    # Progress is reported as "<percent> <current>/<target>"; the fraction
    # used to be printed the wrong way round as "10000/<current>".
    print(f"\r{experience.N/10000:.2%} {experience.N}/10000", end="")
print()
# Enable epsilon decay now that the buffer is full.
agent.epsilon_decay = 0.99995

# Logger tracking the reward sum, the agent's own history keys and epsilon.
logger = history.History("reward_sum", *agent.history_keys, "epsilon")

for episode in range(1, 501):

    for update in range(32):
示例#11
0
    keras.layers.BatchNormalization(),
    keras.layers.ReLU(),
    keras.layers.MaxPool2D(),  # 5
    keras.layers.GlobalAveragePooling2D(),  # 16
    keras.layers.Dense(4, kernel_initializer="he_uniform"),
    keras.layers.BatchNormalization(),
    keras.layers.ReLU(),
    keras.layers.Dense(2, kernel_initializer="he_uniform")
])
# Positional arguments: the optimizer, then "mse".
qnet.compile(keras.optimizers.Adam(1e-3), "mse")

# DQN agent; the action space is passed as 2, matching the two-unit output
# layer of `qnet`. No state preprocessor is used.
agent = DQN(qnet, 2, Experience(max_length=10_000), discount_factor_gamma=0.99,
            epsilon=1.0, epsilon_decay=0.99999, epsilon_min=0.3, use_target_network=True,
            state_preprocessor=None)

# The training rollout is configured with skipframes=2; the test rollout runs
# on a separate `test_env` without an explicit RolloutConfig.
rollout = Rolling(agent, env, RolloutConfig(skipframes=2))
test_rollout = Trajectory(agent, test_env)

# Bounded windows: once maxlen is reached, appending drops the oldest entry.
rewards = deque(maxlen=100)
losses = deque(maxlen=100)

episode = 0

# Endless training loop; no exit condition is visible in this excerpt.
while 1:

    # The counter also advances on iterations that only fill the memory.
    episode += 1

    rollout.roll(steps=4, verbose=0, push_experience=True)
    # Until the memory holds 1000 entries, only report the fill level and skip
    # the remainder of the iteration.
    if agent.memory.N < 1000:
        print(f"\rFilling memory... {agent.memory.N}/1000", end="")
        continue
示例#12
0
            action_space=MOVES,
            memory=Experience(max_length=10000),
            discount_factor_gamma=0.995,
            entropy_penalty_coef=0.0,
            state_preprocessor=lambda state: state / 255.)

episode = 1

# Bounded windows of the 10 most recent values: once maxlen is reached,
# appending drops the oldest entry.
reward_memory = deque(maxlen=10)
step_lengths = deque(maxlen=10)
critic_losses = deque(maxlen=10)
actor_losses = deque(maxlen=10)
actor_utility = deque(maxlen=10)
actor_entropy = deque(maxlen=10)

# The training rollout (all of `envs`) and the test rollout (`test_env`) are
# given identical settings: max_steps=512, skipframes=2.
rollout = MultiRolling(agent, envs, rollout_configs=RolloutConfig(max_steps=512, skipframes=2))
test_rollout = Trajectory(agent, test_env, config=RolloutConfig(max_steps=512, skipframes=2))

# Endless training loop; no exit condition is visible in this excerpt.
while 1:
    # Per-episode accumulators for the actor and critic statistics.
    episode_a_losses = []
    episode_a_utility = []
    episode_a_entropy = []
    episode_c_losses = []

    # 32 updates per episode: roll forward 2 steps, then fit.
    for update in range(32):
        rollout.roll(steps=2, verbose=0, push_experience=True)
        # batch_size=-1 presumably means "fit on the whole memory", which is
        # then cleared (reset_memory=True) -- confirm against the agent API.
        agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)

        episode_a_losses.append(agent_history["actor_loss"])
        episode_a_utility.append(agent_history["actor_utility"])
        episode_a_entropy.append(agent_history["actor_entropy"])
示例#13
0
# Value network for the DQN agent: flattens a 64x64x3 input and passes it
# through two tanh hidden layers, each followed by batch normalisation.
critic_input = Input(shape=[64, 64, 3], name="critic_input")

critic_stream = Flatten()(critic_input)
critic_stream = Dense(64, activation="tanh")(critic_stream)
critic_stream = BatchNormalization()(critic_stream)
critic_stream = Dense(32, activation="tanh")(critic_stream)
critic_stream = BatchNormalization()(critic_stream)
# Linear head with one value per move. The network is regressed with MSE as
# a Q-function, so its outputs must be unbounded; a softmax head would force
# them to be positive and to sum to 1, which cannot represent action values.
value_estimate = Dense(NUM_MOVES, activation="linear")(critic_stream)

critic = Model(critic_input, value_estimate, name="Critic")
critic.compile(Adam(5e-4), "mse")

# The preprocessor divides each state by 255 and subtracts 0.5.
agent = DQN(critic, action_space=MOVES, memory=Experience(max_length=10000), discount_factor_gamma=0.99, epsilon=0.7,
            state_preprocessor=lambda state: state / 255. - 0.5)

# NOTE(review): the training and testing rollouts wrap the same `env` object
# -- confirm that sharing one environment between them is intended.
rollout = Rolling(agent, env, config=RolloutConfig(max_steps=512, skipframes=2))
test_rollout = Trajectory(agent, env, config=RolloutConfig(max_steps=512, skipframes=2))

episode = 0
# Bounded windows of the 10 most recent values.
reward_memory = deque(maxlen=10)
losses = deque(maxlen=10)

# Endless training loop; no exit condition is visible in this excerpt.
while 1:
    episode += 1

    episode_losses = []
    # 32 updates per episode: roll forward 4 steps, then fit on a batch of 32.
    for update in range(32):
        rollout.roll(steps=4, verbose=0, push_experience=True)
        agent_history = agent.fit(batch_size=32, verbose=0)
        episode_losses.append(agent_history["loss"])
示例#14
0
from trickster.agent import A2C
from trickster.rollout import MultiRolling, Trajectory, RolloutConfig
from trickster.model import mlp

# Match configuration: 100x100 canvas, two players per side, single-agent
# learning type and vector observations.
cfg = MatchConfig(canvas_size=(100, 100),
                  players_per_side=2,
                  learning_type=MatchConfig.LEARNING_TYPE_SINGLE_AGENT,
                  observation_type=MatchConfig.OBSERVATION_TYPE_VECTOR)

# Eight training environments plus a separate one for testing, all built from
# the same configuration.
envs = [Match(cfg) for _ in range(8)]
test_env = Match(cfg)

# Actor and critic networks sized from the first environment's observation
# shape and action count.
actor, critic = mlp.wide_pg_actor_critic(envs[0].observation_space.shape,
                                         envs[0].action_space.n,
                                         actor_lr=1e-4,
                                         critic_lr=1e-4)

agent = A2C(actor, critic, test_env.action_space, entropy_penalty_coef=0.1)

# Both rollouts share the same configuration object.
rcfg = RolloutConfig(max_steps=512, skipframes=2)

training_rollout = MultiRolling(agent, envs, rcfg)
testing_rollout = Trajectory(agent, test_env, rcfg)

# 1000 episodes of 512 updates each, one step per update, evaluated through
# the testing rollout; afterwards the testing rollout is rendered 10 times.
training_rollout.fit(episodes=1000,
                     updates_per_episode=512,
                     steps_per_update=1,
                     testing_rollout=testing_rollout)
testing_rollout.render(repeats=10)