# PPO on LunarLander-v2 with 32 parallel reward-scaled environments.
from trickster.agent import PPO
from trickster.rollout import MultiRolling, Trajectory
from trickster.utility import gymic
from trickster.model import mlp

envs = [gymic.rwd_scaled_env("LunarLander-v2") for _ in range(32)]
test_env = gymic.rwd_scaled_env("LunarLander-v2")

input_shape = envs[0].observation_space.shape
num_actions = envs[0].action_space.n

actor = mlp.wide_mlp_actor_categorical(input_shape, num_actions, adam_lr=1e-4)
critic = mlp.wide_mlp_critic_network(input_shape, output_dim=1, adam_lr=1e-4)

agent = PPO(actor, critic,
            action_space=num_actions,
            discount_factor_gamma=0.99,
            entropy_penalty_coef=0.05)

rollout = MultiRolling(agent, envs)
test_rollout = Trajectory(agent, test_env)

rollout.fit(episodes=1000, updates_per_episode=1, steps_per_update=32, update_batch_size=32,
            testing_rollout=test_rollout, plot_curves=True)
test_rollout.render(repeats=10)
ALGO = "DQN" NUM_ENVS = 4 TRAJECTORY_MAX_STEPS = 200 STEPS_PER_UPDATE = 1 UPDATES_PER_EPOCH = 64 EPOCHS = 200 UPDATE_BATCH_SIZE = 100 envs = [gym.make(ENV_NAME) for _ in range(NUM_ENVS)] test_env = gym.make(ENV_NAME) algo = {"DQN": DQN, "DoubleDQN": DoubleDQN}[ALGO] agent = algo.from_environment(envs[0]) rollout = MultiRolling(agent, envs, TRAJECTORY_MAX_STEPS) test_rollout = Trajectory(agent, test_env, TRAJECTORY_MAX_STEPS) rollout.fit(epochs=EPOCHS, updates_per_epoch=UPDATES_PER_EPOCH, steps_per_update=STEPS_PER_UPDATE, update_batch_size=UPDATE_BATCH_SIZE, warmup_buffer=True, callbacks=[ callbacks.TrajectoryEvaluator(testing_rollout=test_rollout, repeats=4), callbacks.ProgressPrinter(rollout.progress_keys) ]) test_rollout.render(repeats=10)

# Excerpt: PPO with a hand-built Keras critic and an explicit training loop (CartPole-v1).
critic = Sequential([
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(1, activation="linear")
])
critic.compile(loss="mse", optimizer=Adam(CRITIC_ADAM_LR))

agent = PPO(actor, critic,
            action_space=test_env.action_space,
            memory=Experience(max_length=EXPERIENCE_SIZE),
            reward_discount_factor_gamma=DISCOUNT_FACTOR_GAMMA,
            entropy_penalty_coef=ENTROPY_PENALTY_BETA)

rollout = MultiRolling(agent, envs, rollout_configs=RolloutConfig(max_steps=MAX_TIMESTEPS))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

rewards = []
actor_loss = []
actor_utility = []
actor_kld = []
actor_entropy = []
critic_loss = []

for episode in range(1, NUM_EPISODES + 1):
    roll_history = rollout.roll(steps=ROLL_TIMESTEPS, verbose=0, push_experience=True)
    agent_history = agent.fit(epochs=FIT_EPOCHS,

# DQN on reward-scaled CartPole-v1 with a target network and epsilon decay.
envs = [gymic.rwd_scaled_env("CartPole-v1") for _ in range(8)]
test_env = gymic.rwd_scaled_env("CartPole-v1")

input_shape = envs[0].observation_space.shape
num_actions = envs[0].action_space.n

ann = mlp.wide_mlp_critic_network(input_shape, num_actions, adam_lr=1e-4)

agent = DQN(ann,
            action_space=2,
            memory=Experience(max_length=10000),
            epsilon=1., epsilon_decay=0.99995, epsilon_min=0.1,
            discount_factor_gamma=0.98,
            use_target_network=True)

rollout = MultiRolling(agent, envs, rollout_configs=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, test_env)

rollout.fit(episodes=500, updates_per_episode=128, steps_per_update=1, update_batch_size=32,
            testing_rollout=test_rollout, plot_curves=True)
test_rollout.render()

# Excerpt: DQN on CartPole-v1 with a hand-built Keras Q-network and an explicit training loop.
input_shape = envs[0].observation_space.shape
num_actions = envs[0].action_space.n

qnet = Sequential([
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(num_actions, activation="linear")
])
qnet.compile(loss="mse", optimizer=Adam(1e-3))

agent = DQN(qnet,
            action_space=2,
            memory=Experience(max_length=10000),
            epsilon=1.,
            discount_factor_gamma=0.98,
            use_target_network=True)

rollout = MultiRolling(agent, envs, rollout_configs=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

rewards = []
losses = []

for episode in range(1, 501):
    episode_losses = []
    for update in range(32):
        rollout.roll(steps=4, verbose=0, push_experience=True)
        agent_history = agent.fit(batch_size=32, verbose=0)
        episode_losses.append(agent_history["loss"])
    test_history = test_rollout.rollout(verbose=0, push_experience=False, render=False)
    rewards.append(test_history["reward_sum"])

# Excerpt: a policy-gradient agent on an image environment; pixels are scaled to [0, 1] and frames are skipped.
            action_space=MOVES,
            memory=Experience(max_length=10000),
            discount_factor_gamma=0.995,
            entropy_penalty_coef=0.0,
            state_preprocessor=lambda state: state / 255.)

episode = 1
reward_memory = deque(maxlen=10)
step_lengths = deque(maxlen=10)
critic_losses = deque(maxlen=10)
actor_losses = deque(maxlen=10)
actor_utility = deque(maxlen=10)
actor_entropy = deque(maxlen=10)

rollout = MultiRolling(agent, envs, rollout_configs=RolloutConfig(max_steps=512, skipframes=2))
test_rollout = Trajectory(agent, test_env, config=RolloutConfig(max_steps=512, skipframes=2))

while True:
    episode_a_losses = []
    episode_a_utility = []
    episode_a_entropy = []
    episode_c_losses = []
    for update in range(32):
        rollout.roll(steps=2, verbose=0, push_experience=True)
        agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)
        episode_a_losses.append(agent_history["actor_loss"])
        episode_a_utility.append(agent_history["actor_utility"])
        episode_a_entropy.append(agent_history["actor_entropy"])
ALGO = "SAC" TRAJECTORY_MAX_STEPS = 100 STEPS_PER_UPDATE = 1 UPDATES_PER_EPOCH = 32 EPOCHS = 1000 UPDATE_BATCH_SIZE = 64 NUM_ENVS = 2 envs = [gym.make(ENV_NAME) for _ in range(NUM_ENVS)] test_env = gym.make(ENV_NAME) algo = {"DDPG": DDPG, "TD3": TD3, "SAC": SAC}[ALGO] agent = algo.from_environment(envs[0]) rollout = MultiRolling(agent, envs, TRAJECTORY_MAX_STEPS) test_rollout = Trajectory(agent, test_env, TRAJECTORY_MAX_STEPS) cbs = [ callbacks.TrajectoryEvaluator(testing_rollout=test_rollout), callbacks.ProgressPrinter(keys=rollout.progress_keys), callbacks.TrajectoryRenderer(testing_rollout=test_rollout), callbacks.TensorBoard(experiment_name=rollout.experiment_name) ] rollout.fit(epochs=EPOCHS, updates_per_epoch=UPDATES_PER_EPOCH, steps_per_update=STEPS_PER_UPDATE, update_batch_size=UPDATE_BATCH_SIZE, warmup_buffer=1000)

# Excerpt: PPO with a convolutional critic on an image environment and an explicit training loop.
    Conv2D(16, 3, padding="same", activation="relu"),             # 20
    Conv2D(32, 3, strides=2, padding="same", activation="relu"),  # 10
    GlobalAveragePooling2D(),                                     # 32
    Dense(1, activation="linear")
])
critic.compile(loss="mse", optimizer=Adam(CRITIC_ADAM_LR))

agent = PPO(actor, critic,
            action_space=test_env.action_space,
            discount_factor_gamma=DISCOUNT_GAMMA,
            gae_factor_lambda=GAE_LAMBDA,
            entropy_penalty_coef=ENTROPY_PENALTY_BETA)

rollout = MultiRolling(agent.dispatch_workers(NUM_PARALLEL_ENVS), envs)
test_rollout = Trajectory(agent, test_env)

rewards = []
actor_loss = []
actor_utility = []
actor_std = []
actor_kld = []
actor_entropy = []
critic_loss = []

for episode in range(1, NUM_EPISODES + 1):
    roll_history = rollout.roll(steps=ROLL_TIMESTEPS, verbose=0, push_experience=True)
    agent_history = agent.fit(epochs=FIT_EPOCHS,

# A2C on the Match environment with vector observations and frame skipping.
from trickster.agent import A2C
from trickster.rollout import MultiRolling, Trajectory, RolloutConfig
from trickster.model import mlp

cfg = MatchConfig(canvas_size=(100, 100), players_per_side=2,
                  learning_type=MatchConfig.LEARNING_TYPE_SINGLE_AGENT,
                  observation_type=MatchConfig.OBSERVATION_TYPE_VECTOR)

envs = [Match(cfg) for _ in range(8)]
test_env = Match(cfg)

actor, critic = mlp.wide_pg_actor_critic(envs[0].observation_space.shape,
                                         envs[0].action_space.n,
                                         actor_lr=1e-4, critic_lr=1e-4)

agent = A2C(actor, critic, test_env.action_space, entropy_penalty_coef=0.1)

rcfg = RolloutConfig(max_steps=512, skipframes=2)
training_rollout = MultiRolling(agent, envs, rcfg)
testing_rollout = Trajectory(agent, test_env, rcfg)

training_rollout.fit(episodes=1000, updates_per_episode=512, steps_per_update=1,
                     testing_rollout=testing_rollout)
testing_rollout.render(repeats=10)

# A2C on the default gymic reward-scaled environment, rolling 8 environments in parallel.
from trickster.agent import A2C
from trickster.rollout import MultiRolling, Trajectory
from trickster.utility import gymic
from trickster.model import mlp

envs = [gymic.rwd_scaled_env() for _ in range(8)]

input_shape = envs[0].observation_space.shape
num_actions = envs[0].action_space.n

actor, critic = mlp.wide_pg_actor_critic(input_shape, num_actions, critic_lr=5e-4)

agent = A2C(actor, critic,
            action_space=envs[0].action_space,
            discount_factor_gamma=0.98,
            entropy_penalty_coef=0.05)

rollout = MultiRolling(agent, envs)
test_rollout = Trajectory(agent, gymic.rwd_scaled_env())

rollout.fit(episodes=300, updates_per_episode=128, steps_per_update=1,
            testing_rollout=test_rollout)
test_rollout.render()