예제 #1
0
from trickster.agent import PPO
from trickster.rollout import MultiRolling, Trajectory
from trickster.utility import gymic
from trickster.model import mlp

# Train a PPO agent on LunarLander-v2, then render the resulting policy.
ENV_ID = "LunarLander-v2"
NUM_TRAINING_ENVS = 32
ADAM_LR = 1e-4

# The training environments handed to MultiRolling, plus a separate
# environment that is used only by the test rollout.
envs = []
for _ in range(NUM_TRAINING_ENVS):
    envs.append(gymic.rwd_scaled_env(ENV_ID))
test_env = gymic.rwd_scaled_env(ENV_ID)

# Network dimensions are read off the first training environment.
first_env = envs[0]
input_shape = first_env.observation_space.shape
num_actions = first_env.action_space.n

actor = mlp.wide_mlp_actor_categorical(input_shape, num_actions, adam_lr=ADAM_LR)
critic = mlp.wide_mlp_critic_network(input_shape, output_dim=1, adam_lr=ADAM_LR)

ppo_settings = {
    "action_space": num_actions,
    "discount_factor_gamma": 0.99,
    "entropy_penalty_coef": 0.05,
}
agent = PPO(actor, critic, **ppo_settings)

rollout = MultiRolling(agent, envs)
test_rollout = Trajectory(agent, test_env)

# Training schedule; the test rollout is passed in so that fit() can use it.
fit_settings = {
    "episodes": 1000,
    "updates_per_episode": 1,
    "steps_per_update": 32,
    "update_batch_size": 32,
    "testing_rollout": test_rollout,
    "plot_curves": True,
}
rollout.fit(**fit_settings)

test_rollout.render(repeats=10)
예제 #2
0
# Experiment settings. ENV_NAME, gym, the agent classes, the rollout classes
# and the callbacks module are expected to be provided earlier in the script.
ALGO = "DQN"
NUM_ENVS = 4
TRAJECTORY_MAX_STEPS = 200
STEPS_PER_UPDATE = 1
UPDATES_PER_EPOCH = 64
EPOCHS = 200
UPDATE_BATCH_SIZE = 100

envs = []
for _ in range(NUM_ENVS):
    envs.append(gym.make(ENV_NAME))
test_env = gym.make(ENV_NAME)

# Select the agent class by the name configured in ALGO.
available_algorithms = {"DQN": DQN, "DoubleDQN": DoubleDQN}
algo = available_algorithms[ALGO]

agent = algo.from_environment(envs[0])

rollout = MultiRolling(agent, envs, TRAJECTORY_MAX_STEPS)
test_rollout = Trajectory(agent, test_env, TRAJECTORY_MAX_STEPS)

# Callbacks handed to fit(): an evaluator bound to the test rollout and a
# progress printer fed with the rollout's progress keys.
training_callbacks = [
    callbacks.TrajectoryEvaluator(testing_rollout=test_rollout, repeats=4),
    callbacks.ProgressPrinter(rollout.progress_keys),
]

rollout.fit(
    epochs=EPOCHS,
    updates_per_epoch=UPDATES_PER_EPOCH,
    steps_per_update=STEPS_PER_UPDATE,
    update_batch_size=UPDATE_BATCH_SIZE,
    warmup_buffer=True,
    callbacks=training_callbacks,
)

test_rollout.render(repeats=10)
예제 #3
0
# Critic network: two 16-unit ReLU layers followed by a single linear output,
# trained with mean squared error.
critic_layers = [
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(1, activation="linear"),
]
critic = Sequential(critic_layers)
critic.compile(optimizer=Adam(CRITIC_ADAM_LR), loss="mse")

# `actor`, `envs`, `test_env` and the upper-case settings come from earlier
# in the script.
experience_buffer = Experience(max_length=EXPERIENCE_SIZE)
agent = PPO(
    actor,
    critic,
    action_space=test_env.action_space,
    memory=experience_buffer,
    reward_discount_factor_gamma=DISCOUNT_FACTOR_GAMMA,
    entropy_penalty_coef=ENTROPY_PENALTY_BETA,
)

training_config = RolloutConfig(max_steps=MAX_TIMESTEPS)
rollout = MultiRolling(agent, envs, rollout_configs=training_config)

# The test rollout gets its own freshly created CartPole-v1 environment.
evaluation_env = gym.make("CartPole-v1")
test_rollout = Trajectory(agent, evaluation_env)

# Metric histories; presumably filled by the training loop that follows.
rewards = []
actor_loss = []
actor_utility = []
actor_kld = []
actor_entropy = []
critic_loss = []

for episode in range(1, NUM_EPISODES + 1):
    roll_history = rollout.roll(steps=ROLL_TIMESTEPS,
                                verbose=0,
                                push_experience=True)
    agent_history = agent.fit(epochs=FIT_EPOCHS,
예제 #4
0
# Train a DQN agent on CartPole-v1 and render the result.
# Eight environments are created for training and one more for testing.
envs = [gymic.rwd_scaled_env("CartPole-v1") for _ in range(8)]
test_env = gymic.rwd_scaled_env("CartPole-v1")

# Network dimensions are read off the first training environment.
input_shape = envs[0].observation_space.shape
num_actions = envs[0].action_space.n

# The network is built with one output per action.
ann = mlp.wide_mlp_critic_network(input_shape, num_actions, adam_lr=1e-4)

# The action space is taken from the environment instead of being hard-coded,
# so the agent and the network output size cannot drift apart if the
# environment is swapped for one with a different number of actions.
agent = DQN(ann,
            action_space=num_actions,
            memory=Experience(max_length=10000),
            epsilon=1.,
            epsilon_decay=0.99995,
            epsilon_min=0.1,
            discount_factor_gamma=0.98,
            use_target_network=True)

rollout = MultiRolling(agent,
                       envs,
                       rollout_configs=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, test_env)

# Training schedule; the test rollout is passed in so that fit() can use it.
rollout.fit(episodes=500,
            updates_per_episode=128,
            steps_per_update=1,
            update_batch_size=32,
            testing_rollout=test_rollout,
            plot_curves=True)
test_rollout.render()
# DQN with a hand-written training loop. `envs` and the Keras / trickster
# names used below are expected to be provided earlier in the script.
input_shape = envs[0].observation_space.shape
num_actions = envs[0].action_space.n

# Q-network: two 16-unit ReLU layers and one linear output per action.
qnet = Sequential([Dense(16, activation="relu", input_shape=input_shape),
                   Dense(16, activation="relu"),
                   Dense(num_actions, activation="linear")])
qnet.compile(loss="mse", optimizer=Adam(1e-3))

# The action space is taken from the environment instead of being hard-coded,
# so it always matches the number of outputs of `qnet`.
agent = DQN(qnet,
            action_space=num_actions,
            memory=Experience(max_length=10000),
            epsilon=1.,
            discount_factor_gamma=0.98,
            use_target_network=True)

rollout = MultiRolling(agent, envs, rollout_configs=RolloutConfig(max_steps=300))
# The test rollout gets its own freshly created CartPole-v1 environment.
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

rewards = []
losses = []

for episode in range(1, 501):
    episode_losses = []

    # 32 updates per episode: roll 4 steps with experience collection
    # enabled, then fit the agent on a batch of 32.
    for update in range(32):
        rollout.roll(steps=4, verbose=0, push_experience=True)
        agent_history = agent.fit(batch_size=32, verbose=0)
        episode_losses.append(agent_history["loss"])

    # Evaluate without rendering and without pushing experience.
    test_history = test_rollout.rollout(verbose=0, push_experience=False, render=False)
    rewards.append(test_history["reward_sum"])
예제 #6
0
            action_space=MOVES,
            memory=Experience(max_length=10000),
            discount_factor_gamma=0.995,
            entropy_penalty_coef=0.0,
            state_preprocessor=lambda state: state / 255.)

episode = 1

# Bounded histories: each deque keeps only the 10 most recent entries.
HISTORY_WINDOW = 10
reward_memory = deque(maxlen=HISTORY_WINDOW)
step_lengths = deque(maxlen=HISTORY_WINDOW)
critic_losses = deque(maxlen=HISTORY_WINDOW)
actor_losses = deque(maxlen=HISTORY_WINDOW)
actor_utility = deque(maxlen=HISTORY_WINDOW)
actor_entropy = deque(maxlen=HISTORY_WINDOW)

# Training and test rollouts each receive their own config object with the
# same settings.
training_config = RolloutConfig(max_steps=512, skipframes=2)
rollout = MultiRolling(agent, envs, rollout_configs=training_config)
testing_config = RolloutConfig(max_steps=512, skipframes=2)
test_rollout = Trajectory(agent, test_env, config=testing_config)

# Runs until interrupted; no exit condition is visible in this part.
while True:
    # Per-episode metric accumulators, reset on every pass.
    episode_a_losses = []
    episode_a_utility = []
    episode_a_entropy = []
    episode_c_losses = []

    for update in range(32):
        # Roll 2 steps with experience collection enabled, then fit with
        # batch_size=-1 and reset_memory=True.
        rollout.roll(steps=2, verbose=0, push_experience=True)
        agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)

        episode_a_losses.append(agent_history["actor_loss"])
        episode_a_utility.append(agent_history["actor_utility"])
        episode_a_entropy.append(agent_history["actor_entropy"])
# Experiment settings. ENV_NAME, gym, the agent classes, the rollout classes
# and the callbacks module are expected to be provided earlier in the script.
ALGO = "SAC"
TRAJECTORY_MAX_STEPS = 100
STEPS_PER_UPDATE = 1
UPDATES_PER_EPOCH = 32
EPOCHS = 1000
UPDATE_BATCH_SIZE = 64
NUM_ENVS = 2

envs = [gym.make(ENV_NAME) for _ in range(NUM_ENVS)]
test_env = gym.make(ENV_NAME)

# Select the agent class by the name configured in ALGO.
algo = {"DDPG": DDPG, "TD3": TD3, "SAC": SAC}[ALGO]

agent = algo.from_environment(envs[0])

rollout = MultiRolling(agent, envs, TRAJECTORY_MAX_STEPS)
test_rollout = Trajectory(agent, test_env, TRAJECTORY_MAX_STEPS)

# Callbacks for the training run: evaluation and rendering on the test
# rollout, progress printing, and TensorBoard logging.
cbs = [
    callbacks.TrajectoryEvaluator(testing_rollout=test_rollout),
    callbacks.ProgressPrinter(keys=rollout.progress_keys),
    callbacks.TrajectoryRenderer(testing_rollout=test_rollout),
    callbacks.TensorBoard(experiment_name=rollout.experiment_name)
]

# The callbacks must be handed to fit(); previously `cbs` was built but never
# passed on, so none of the callbacks above were ever invoked.
rollout.fit(epochs=EPOCHS,
            updates_per_epoch=UPDATES_PER_EPOCH,
            steps_per_update=STEPS_PER_UPDATE,
            update_batch_size=UPDATE_BATCH_SIZE,
            warmup_buffer=1000,
            callbacks=cbs)
예제 #8
0
    Conv2D(16, 3, padding="same", activation="relu"),  # 20
    Conv2D(32, 3, strides=2, padding="same", activation="relu"),  # 10
    GlobalAveragePooling2D(),  # 32
    Dense(1, activation="linear")
])

# Compile the critic built above with mean squared error.
critic.compile(optimizer=Adam(CRITIC_ADAM_LR), loss="mse")

# `actor`, `envs`, `test_env` and the upper-case settings come from earlier
# in the script.
agent = PPO(
    actor,
    critic,
    action_space=test_env.action_space,
    discount_factor_gamma=DISCOUNT_GAMMA,
    gae_factor_lambda=GAE_LAMBDA,
    entropy_penalty_coef=ENTROPY_PENALTY_BETA,
)

# Training rolls on the workers dispatched by the agent; testing uses the
# agent itself.
workers = agent.dispatch_workers(NUM_PARALLEL_ENVS)
rollout = MultiRolling(workers, envs)
test_rollout = Trajectory(agent, test_env)

# Metric histories; presumably filled by the training loop that follows.
rewards = []
actor_loss = []
actor_utility = []
actor_std = []
actor_kld = []
actor_entropy = []
critic_loss = []

for episode in range(1, NUM_EPISODES + 1):
    roll_history = rollout.roll(steps=ROLL_TIMESTEPS,
                                verbose=0,
                                push_experience=True)
    agent_history = agent.fit(epochs=FIT_EPOCHS,
예제 #9
0
from trickster.agent import A2C
from trickster.rollout import MultiRolling, Trajectory, RolloutConfig
from trickster.model import mlp

# Train an A2C agent on the Match environment, then render the result.
# `Match` and `MatchConfig` are expected to be imported elsewhere.
cfg = MatchConfig(
    canvas_size=(100, 100),
    players_per_side=2,
    learning_type=MatchConfig.LEARNING_TYPE_SINGLE_AGENT,
    observation_type=MatchConfig.OBSERVATION_TYPE_VECTOR,
)

# Eight training environments and one test environment share the same config.
envs = []
for _ in range(8):
    envs.append(Match(cfg))
test_env = Match(cfg)

# Network dimensions are read off the first training environment.
observation_shape = envs[0].observation_space.shape
action_count = envs[0].action_space.n
actor, critic = mlp.wide_pg_actor_critic(
    observation_shape,
    action_count,
    actor_lr=1e-4,
    critic_lr=1e-4,
)

agent = A2C(actor, critic, test_env.action_space, entropy_penalty_coef=0.1)

# A single rollout config object is shared by the training and test rollouts.
rcfg = RolloutConfig(max_steps=512, skipframes=2)

training_rollout = MultiRolling(agent, envs, rcfg)
testing_rollout = Trajectory(agent, test_env, rcfg)

fit_settings = {
    "episodes": 1000,
    "updates_per_episode": 512,
    "steps_per_update": 1,
    "testing_rollout": testing_rollout,
}
training_rollout.fit(**fit_settings)

testing_rollout.render(repeats=10)
예제 #10
0
from trickster.agent import A2C
from trickster.rollout import MultiRolling, Trajectory
from trickster.utility import gymic
from trickster.model import mlp

# Train an A2C agent on the environment that gymic.rwd_scaled_env() creates
# when called without arguments, then render the result.
envs = []
for _ in range(8):
    envs.append(gymic.rwd_scaled_env())

# Network dimensions are read off the first training environment.
first_env = envs[0]
input_shape = first_env.observation_space.shape
num_actions = first_env.action_space.n

actor, critic = mlp.wide_pg_actor_critic(input_shape, num_actions, critic_lr=5e-4)

a2c_settings = {
    "action_space": first_env.action_space,
    "discount_factor_gamma": 0.98,
    "entropy_penalty_coef": 0.05,
}
agent = A2C(actor, critic, **a2c_settings)

rollout = MultiRolling(agent, envs)

# The test rollout gets its own freshly created environment.
evaluation_env = gymic.rwd_scaled_env()
test_rollout = Trajectory(agent, evaluation_env)

rollout.fit(
    episodes=300,
    updates_per_episode=128,
    steps_per_update=1,
    testing_rollout=test_rollout,
)

test_rollout.render()