# REINFORCE on CartPole-v1 with a single-environment Trajectory rollout.
import gym
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

from trickster.agent import REINFORCE
from trickster.rollout import Trajectory, RolloutConfig
from trickster.utility import visual

env = gym.make("CartPole-v1")
input_shape = env.observation_space.shape
num_actions = env.action_space.n

policy = Sequential([
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(num_actions, activation="softmax")
])
policy.compile(loss="categorical_crossentropy", optimizer=Adam(5e-3))

agent = REINFORCE(policy, action_space=num_actions)
rollout = Trajectory(agent, env, config=RolloutConfig(max_steps=300))

rewards = []
losses = []

for episode in range(1, 501):
    rollout_history = rollout.rollout(verbose=0, push_experience=True)
    agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)

    rewards.append(rollout_history["reward_sum"])
    losses.append(agent_history["loss"])

    print("\rEpisode {:>4} RWD: {:>6.1f}, UTILITY: {: >8.4f}".format(
        episode, np.mean(rewards[-10:]), np.mean(losses[-10:])), end="")
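# The `visual` import above is otherwise unused in this excerpt; presumably the script
# closes by plotting the collected curves. A minimal sketch, assuming the same plot call
# that the multi-environment REINFORCE example further below uses:
visual.plot_vectors([rewards, losses], ["Reward", "Loss"], smoothing_window_size=10)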
# REINFORCE on the reskiv pixel environment with parallel environments (MultiTrajectory).
# (The actor network, the environments and the constants MOVES/WARMUP/preprocess are defined above, outside this excerpt.)
agent = REINFORCE(actor,
                  action_space=MOVES,
                  memory=Experience(max_length=10000),
                  discount_factor_gamma=0.99,
                  state_preprocessor=preprocess)

screen = CV2Screen(scale=2)

episode = 0
reward_memory = deque(maxlen=100)
critic_losses = deque(maxlen=50)
actor_losses = deque(maxlen=100)

rollout = MultiTrajectory(agent, envs, warmup_episodes=WARMUP,
                          rollout_configs=RolloutConfig(max_steps=512, skipframes=4))

history = {"episode": 0}

while 1:
    rollout.reset()
    episode_actor_losses = []

    while not rollout.finished:
        history = rollout.roll(steps=4, verbose=0, learning_batch_size=32)
        reward_memory.append(history["reward_sum"])
        if "loss" in history:
            episode_actor_losses.append(history["loss"])

    actor.save("../models/reskiv/reinforce_latest.h5")
    episode = history["episode"]
    if episode_actor_losses:
        actor_losses.append(np.mean(episode_actor_losses))
activation="relu", input_shape=input_shape, kernel_initializer="he_uniform"), Dense(16, activation="relu", kernel_initializer="he_uniform"), Dense(1, activation="linear", kernel_initializer="he_uniform") ]) critic.compile(loss="mse", optimizer=Adam(5e-4)) agent = A2C(actor, critic, action_space=env.action_space, memory=Experience(max_length=10000), discount_factor_gamma=0.98, entropy_penalty_coef=0.01) rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300)) test_rollout = Trajectory(agent, gym.make("CartPole-v1")) rewards = [] actor_loss = [] actor_utility = [] actor_entropy = [] critic_loss = [] for episode in range(1, 1001): episode_actor_loss = [] episode_actor_utility = [] episode_actor_entropy = [] episode_critic_loss = [] for update in range(32):
# REINFORCE on 8 parallel CartPole-v1 environments with a MultiTrajectory rollout.
import gym
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

from trickster.agent import REINFORCE
from trickster.rollout import MultiTrajectory, RolloutConfig
from trickster.utility import visual

envs = [gym.make("CartPole-v1") for _ in range(8)]
input_shape = envs[0].observation_space.shape
num_actions = envs[0].action_space.n

policy = Sequential([
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(num_actions, activation="softmax")
])
policy.compile(loss="categorical_crossentropy", optimizer=Adam(5e-3))

agent = REINFORCE(policy, action_space=num_actions)
rollout = MultiTrajectory(agent, envs, rollout_configs=RolloutConfig(max_steps=300))

rewards = []
losses = []

for episode in range(1, 501):
    rollout_history = rollout.rollout(verbose=0, push_experience=True)
    agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)

    rewards.append(rollout_history["rewards"])
    losses.append(agent_history["loss"])

    print("\rEpisode {:>4} RWD: {:>6.1f}, UTILITY: {: >8.4f}".format(
        episode, np.mean(rewards[-10:]), np.mean(losses[-10:])), end="")
    if episode % 10 == 0:
        print()

visual.plot_vectors([rewards, losses], ["Reward", "Loss"], smoothing_window_size=10)
# PPO on CartPole-v1 with parallel environments (MultiRolling) and a test Trajectory.
# (The actor network, the environments and the hyperparameter constants are defined above this excerpt.)
critic = Sequential([
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(1, activation="linear")
])
critic.compile(loss="mse", optimizer=Adam(CRITIC_ADAM_LR))

agent = PPO(actor, critic,
            action_space=test_env.action_space,
            memory=Experience(max_length=EXPERIENCE_SIZE),
            reward_discount_factor_gamma=DISCOUNT_FACTOR_GAMMA,
            entropy_penalty_coef=ENTROPY_PENALTY_BETA)

rollout = MultiRolling(agent, envs, rollout_configs=RolloutConfig(max_steps=MAX_TIMESTEPS))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

rewards = []
actor_loss = []
actor_utility = []
actor_kld = []
actor_entropy = []
critic_loss = []

for episode in range(1, NUM_EPISODES + 1):
    roll_history = rollout.roll(steps=ROLL_TIMESTEPS, verbose=0, push_experience=True)
    agent_history = agent.fit(epochs=FIT_EPOCHS, batch_size=FIT_BATCH_SIZE,
# DQN with a target network on parallel CartPole-v1 environments (MultiRolling) and a test Trajectory.
input_shape = envs[0].observation_space.shape
num_actions = envs[0].action_space.n

qnet = Sequential([
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(num_actions, activation="linear")
])
qnet.compile(loss="mse", optimizer=Adam(1e-3))

agent = DQN(qnet,
            action_space=2,
            memory=Experience(max_length=10000),
            epsilon=1.,
            discount_factor_gamma=0.98,
            use_target_network=True)

rollout = MultiRolling(agent, envs, rollout_configs=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

rewards = []
losses = []

for episode in range(1, 501):
    episode_losses = []

    for update in range(32):
        rollout.roll(steps=4, verbose=0, push_experience=True)
        agent_history = agent.fit(batch_size=32, verbose=0)
        episode_losses.append(agent_history["loss"])

    test_history = test_rollout.rollout(verbose=0, push_experience=False, render=False)
    rewards.append(test_history["reward_sum"])
# DQN with a target network: warmup rollouts first, then training through a single Trajectory rollout.
policy = Sequential([
    Dense(24, activation="relu", input_shape=input_shape),
    Dense(24, activation="relu"),
    Dense(num_actions, activation="linear")
])
policy.compile(loss="mse", optimizer=Adam(1e-4))

agent = DQN(policy,
            action_space=num_actions,
            memory=Experience(max_length=10000),
            epsilon=1.,
            discount_factor_gamma=0.98,
            use_target_network=True)

rollout = Trajectory(agent, env, RolloutConfig(max_steps=200))

rewards = []
losses = []

for warmup in range(1, 33):
    rollout.rollout(verbose=0, learning_batch_size=0)

for episode in range(1, 501):
    rollout._reset()
    episode_rewards = []
    episode_losses = []

    while not rollout.finished:
        roll_history = rollout.roll(steps=2, verbose=0, learning_batch_size=64)
        episode_rewards.append(roll_history["reward_sum"])
        episode_losses.append(roll_history["loss"])
# A2C on parallel CartPole-v1 environments (MultiRolling) with a test Trajectory.
# (The actor network and the environments are defined above this excerpt.)
critic = Sequential([
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(1, activation="linear")
])
critic.compile(loss="mse", optimizer=Adam(5e-4))

agent = A2C(actor, critic,
            action_space=envs[0].action_space,
            memory=Experience(max_length=10000),
            discount_factor_gamma=0.98,
            entropy_penalty_coef=0.01)

rollout = MultiRolling(agent, envs, rollout_configs=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

rewards = []
actor_loss = []
actor_utility = []
actor_entropy = []
critic_loss = []

for episode in range(1, 301):
    episode_actor_loss = []
    episode_actor_utility = []
    episode_actor_entropy = []
    episode_critic_loss = []

    for update in range(32):
# TD3 with twin critics on a continuous action space, trained through the Rolling.fit shortcut.
num_actions = env.action_space.shape[0]

actor, critics = mlp.wide_ddpg_actor_critic(input_shape,
                                            output_dim=num_actions,
                                            action_range=2,
                                            num_critics=2,
                                            actor_lr=5e-4,
                                            critic_lr=5e-4)

agent = TD3(actor, critics,
            action_space=spaces.CONTINUOUS,
            memory=Experience(max_length=int(1e4)),
            discount_factor_gamma=0.99,
            action_noise_sigma=0.1,
            action_noise_sigma_decay=1.,
            action_minima=-2,
            action_maxima=2,
            target_noise_sigma=0.2,
            target_noise_clip=0.5)

rollout = Rolling(agent, env)
test_rollout = Trajectory(agent, env, RolloutConfig(testing_rollout=True))

rollout.fit(episodes=1000, updates_per_episode=64, step_per_update=1, update_batch_size=32,
            testing_rollout=test_rollout)
test_rollout.render(repeats=10)
# DoubleDQN on the Match environment with a wide dueling Q-network; the replay buffer is pre-filled before training.
env = Match(cfg)
test_env = Match(cfg)

ann = mlp.wide_dueling_q_network(env.observation_space.shape, env.action_space.n, adam_lr=1e-4)
experience = Experience(10000)

agent = DoubleDQN(ann, env.action_space, experience,
                  epsilon=1., epsilon_decay=1., epsilon_min=0.1)

rcfg = RolloutConfig(max_steps=1024, skipframes=2)
training_rollout = Rolling(agent, env, rcfg)
testing_rollout = Trajectory(agent, test_env, rcfg)

print("Filling experience...")
while experience.N < 10000:
    training_rollout.roll(steps=32, verbose=0, push_experience=True)
    print(f"\r{experience.N/10000:.2%} {experience.N}/10000", end="")
print()

agent.epsilon_decay = 0.99995

logger = history.History("reward_sum", *agent.history_keys, "epsilon")

for episode in range(1, 501):
    for update in range(32):
# DQN with a convolutional Q-network on a pixel environment; the replay memory is filled before updates start.
# (The opening of the model definition, with the preceding convolutional layers, is outside this excerpt.)
    keras.layers.BatchNormalization(),
    keras.layers.ReLU(),
    keras.layers.MaxPool2D(),  # 5
    keras.layers.GlobalAveragePooling2D(),  # 16
    keras.layers.Dense(4, kernel_initializer="he_uniform"),
    keras.layers.BatchNormalization(),
    keras.layers.ReLU(),
    keras.layers.Dense(2, kernel_initializer="he_uniform")
])
qnet.compile(keras.optimizers.Adam(1e-3), "mse")

agent = DQN(qnet, 2, Experience(max_length=10_000),
            discount_factor_gamma=0.99,
            epsilon=1.0,
            epsilon_decay=0.99999,
            epsilon_min=0.3,
            use_target_network=True,
            state_preprocessor=None)

rollout = Rolling(agent, env, RolloutConfig(skipframes=2))
test_rollout = Trajectory(agent, test_env)

rewards = deque(maxlen=100)
losses = deque(maxlen=100)

episode = 0
while 1:
    episode += 1
    rollout.roll(steps=4, verbose=0, push_experience=True)

    if agent.memory.N < 1000:
        print(f"\rFilling memory... {agent.memory.N}/1000", end="")
        continue
# Actor-critic policy-gradient agent on a pixel environment, trained with MultiRolling.
# (The opening of the agent constructor, with the agent class and the actor/critic models, is outside this excerpt.)
            action_space=MOVES,
            memory=Experience(max_length=10000),
            discount_factor_gamma=0.995,
            entropy_penalty_coef=0.0,
            state_preprocessor=lambda state: state / 255.)

episode = 1
reward_memory = deque(maxlen=10)
step_lengths = deque(maxlen=10)
critic_losses = deque(maxlen=10)
actor_losses = deque(maxlen=10)
actor_utility = deque(maxlen=10)
actor_entropy = deque(maxlen=10)

rollout = MultiRolling(agent, envs, rollout_configs=RolloutConfig(max_steps=512, skipframes=2))
test_rollout = Trajectory(agent, test_env, config=RolloutConfig(max_steps=512, skipframes=2))

while 1:
    episode_a_losses = []
    episode_a_utility = []
    episode_a_entropy = []
    episode_c_losses = []

    for update in range(32):
        rollout.roll(steps=2, verbose=0, push_experience=True)
        agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)

        episode_a_losses.append(agent_history["actor_loss"])
        episode_a_utility.append(agent_history["actor_utility"])
        episode_a_entropy.append(agent_history["actor_entropy"])
# DQN on 64x64 RGB observations with a small functional Keras Q-network.
critic_input = Input(shape=[64, 64, 3], name="critic_input")
critic_stream = Flatten()(critic_input)
critic_stream = Dense(64, activation="tanh")(critic_stream)
critic_stream = BatchNormalization()(critic_stream)
critic_stream = Dense(32, activation="tanh")(critic_stream)
critic_stream = BatchNormalization()(critic_stream)
value_estimate = Dense(NUM_MOVES, activation="softmax")(critic_stream)

critic = Model(critic_input, value_estimate, name="Critic")
critic.compile(Adam(5e-4), "mse")

agent = DQN(critic,
            action_space=MOVES,
            memory=Experience(max_length=10000),
            discount_factor_gamma=0.99,
            epsilon=0.7,
            state_preprocessor=lambda state: state / 255. - 0.5)

rollout = Rolling(agent, env, config=RolloutConfig(max_steps=512, skipframes=2))
test_rollout = Trajectory(agent, env, config=RolloutConfig(max_steps=512, skipframes=2))

episode = 0
reward_memory = deque(maxlen=10)
losses = deque(maxlen=10)

while 1:
    episode += 1
    episode_losses = []

    for update in range(32):
        rollout.roll(steps=4, verbose=0, push_experience=True)
        agent_history = agent.fit(batch_size=32, verbose=0)
        episode_losses.append(agent_history["loss"])
# A2C on the Match environment using the mlp model factories and the MultiRolling.fit training shortcut.
# (The Match/MatchConfig environment import is not shown.)
from trickster.agent import A2C
from trickster.rollout import MultiRolling, Trajectory, RolloutConfig
from trickster.model import mlp

cfg = MatchConfig(canvas_size=(100, 100), players_per_side=2,
                  learning_type=MatchConfig.LEARNING_TYPE_SINGLE_AGENT,
                  observation_type=MatchConfig.OBSERVATION_TYPE_VECTOR)

envs = [Match(cfg) for _ in range(8)]
test_env = Match(cfg)

actor, critic = mlp.wide_pg_actor_critic(envs[0].observation_space.shape,
                                         envs[0].action_space.n,
                                         actor_lr=1e-4, critic_lr=1e-4)

agent = A2C(actor, critic, test_env.action_space, entropy_penalty_coef=0.1)

rcfg = RolloutConfig(max_steps=512, skipframes=2)
training_rollout = MultiRolling(agent, envs, rcfg)
testing_rollout = Trajectory(agent, test_env, rcfg)

training_rollout.fit(episodes=1000, updates_per_episode=512, steps_per_update=1,
                     testing_rollout=testing_rollout)
testing_rollout.render(repeats=10)