Пример #1
0
 def setUp(self):
     """Populate a sampler with five experiences holding shifted array pairs."""
     base = np.arange(10)
     mirrored = -base
     experiences = [Experience() for _ in range(5)]
     for offset, experience in enumerate(experiences):
         experience.remember(base + 10 * offset, mirrored + 10 * offset)
     self.sampler = ExperienceSampler(experiences)
Пример #2
0
    def test_remember_handles_multiple_arrays(self):
        """remember() with three arrays stores three parallel memoirs."""
        experience = Experience()
        first = np.arange(100)
        second = first - 10
        third = first / 10

        experience.remember(first, second, third)

        self.assertEqual(len(experience.memoirs), 3)
        for stored, expected in zip(experience.memoirs, (first, second, third)):
            self.assertListEqual(stored.tolist(), expected.tolist())
Пример #3
0
    def test_sampling_considers_explicit_exclusions(self):
        """Values at excluded indices never show up in drawn samples."""
        exclusion = (3, 6)
        banned_a = []
        banned_b = []

        experiences = []
        for idx in range(1, 6):
            arr = np.arange(10 * idx, 10 * (idx + 1))
            shifted = arr + 100
            experience = Experience()
            experience.remember(arr, shifted, exclude=exclusion)
            experiences.append(experience)
            banned_a.extend(arr[pos] for pos in exclusion)
            banned_b.extend(shifted[pos] for pos in exclusion)

        sampler = ExperienceSampler(experiences)
        states, states_next, sample_b = sampler.sample(-1)

        for value_a, value_b in zip(banned_a, banned_b):
            self.assertNotIn(value_a, states)
            self.assertNotIn(value_b, sample_b)
Пример #4
0
# Policy head is trained with categorical cross-entropy, matching a softmax
# action output (the actor's layers are defined above this excerpt).
actor.compile(loss="categorical_crossentropy", optimizer=Adam(1e-4))

# Value network: two 16-unit ReLU layers with He-uniform init, scalar V(s) head.
critic = Sequential([
    Dense(16,
          activation="relu",
          input_shape=input_shape,
          kernel_initializer="he_uniform"),
    Dense(16, activation="relu", kernel_initializer="he_uniform"),
    Dense(1, activation="linear", kernel_initializer="he_uniform")
])
# The critic regresses value targets, hence MSE.
critic.compile(loss="mse", optimizer=Adam(5e-4))

agent = A2C(actor,
            critic,
            action_space=env.action_space,
            memory=Experience(max_length=10000),  # replay buffer capped at 10k
            discount_factor_gamma=0.98,
            entropy_penalty_coef=0.01)  # small entropy bonus for exploration

# Training rollouts are capped at 300 steps; evaluation uses a fresh CartPole.
rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

# Per-metric histories accumulated across training episodes.
rewards = []
actor_loss = []
actor_utility = []
actor_entropy = []
critic_loss = []

# NOTE(review): the loop body is truncated in this excerpt.
for episode in range(1, 1001):
    episode_actor_loss = []
    episode_actor_utility = []
Пример #5
0
# Ten parallel fake environments for data collection plus one held out for tests.
envs = [FakeEnv() for _ in range(10)]
test_env = FakeEnv()

# Policy network: flatten the observation, one 256-unit batch-normalized
# LeakyReLU layer, softmax over the 2 available actions.
actor = Sequential([  # 200, 160
    Flatten(input_shape=test_env.shape),
    Dense(256),
    BatchNormalization(),
    LeakyReLU(),
    Dense(2, activation="softmax")
])
actor.compile(RMSprop(1e-4, rho=0.99), "categorical_crossentropy")

# Policy-gradient agent over 2 actions; observations are fed in raw.
agent = REINFORCE(actor,
                  2,
                  Experience(),
                  discount_factor_gamma=0.99,
                  state_preprocessor=None)

# The list repeats the SAME agent object ten times — one shared agent
# collecting from all ten environments.
rollout = MultiTrajectory([agent for _ in range(10)], envs)
test_rollout = Trajectory(agent, test_env)

# Rolling windows over the last 100 episodes for each tracked metric.
rewards = deque(maxlen=100)
actor_loss = deque(maxlen=100)
actor_utility = deque(maxlen=100)
actor_entropy = deque(maxlen=100)
critic_loss = deque(maxlen=100)

episode = 0

# NOTE(review): the training loop body is truncated in this excerpt.
while 1:
Пример #6
0
    # (truncated above: the Sequential's opening and earlier layers are not visible)
    Dense(16, activation="relu"),
    Dense(num_actions, activation="softmax")  # one probability per discrete action
])
actor.compile(loss="categorical_crossentropy", optimizer=Adam(ACTOR_ADAM_LR))

# Value network mirroring the actor's hidden sizes, with a scalar linear head.
critic = Sequential([
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(1, activation="linear")
])
critic.compile(loss="mse", optimizer=Adam(CRITIC_ADAM_LR))

# PPO agent; hyperparameters come from module-level constants defined
# outside this excerpt.
agent = PPO(actor,
            critic,
            action_space=test_env.action_space,
            memory=Experience(max_length=EXPERIENCE_SIZE),
            reward_discount_factor_gamma=DISCOUNT_FACTOR_GAMMA,
            entropy_penalty_coef=ENTROPY_PENALTY_BETA)

# Collect experience from several environments at once; evaluate on CartPole.
rollout = MultiRolling(agent,
                       envs,
                       rollout_configs=RolloutConfig(max_steps=MAX_TIMESTEPS))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

# Metric histories; PPO additionally tracks the policy KL divergence.
rewards = []
actor_loss = []
actor_utility = []
actor_kld = []
actor_entropy = []
critic_loss = []
Пример #7
0
 def setUp(self):
     """Create three reference arrays, a capped Experience, and its sampler."""
     base = np.arange(100)
     self.a = base
     self.b = base - 100
     self.c = base / 10
     self.xp = Experience(max_length=100)
     self.sampler = ExperienceSampler(self.xp)
Пример #8
0
# LunarLander with rewards scaled down 100x to keep critic targets small.
env = gymic.rwd_scaled_env("LunarLanderContinuous-v2", reward_scale=0.01)

input_shape = env.observation_space.shape
num_actions = env.action_space.shape[0]  # continuous action dimensionality

# Factory builds one actor and num_critics=2 critics (twin critics, as TD3 uses).
actor, critics = mlp.wide_ddpg_actor_critic(input_shape,
                                            output_dim=num_actions,
                                            action_range=2,
                                            num_critics=2,
                                            actor_lr=5e-4,
                                            critic_lr=5e-4)

# TD3 agent: exploration noise sigma 0.1, actions clipped to [-2, 2],
# target policy smoothing noise 0.2 clipped at 0.5.
agent = TD3(actor,
            critics,
            action_space=spaces.CONTINUOUS,
            memory=Experience(max_length=int(1e4)),
            discount_factor_gamma=0.99,
            action_noise_sigma=0.1,
            action_noise_sigma_decay=1.,  # decay factor 1.0: noise never decays
            action_minima=-2,
            action_maxima=2,
            target_noise_sigma=0.2,
            target_noise_clip=0.5)

rollout = Rolling(agent, env)
test_rollout = Trajectory(agent, env, RolloutConfig(testing_rollout=True))

# NOTE(review): the fit() call is truncated in this excerpt.
rollout.fit(episodes=1000,
            updates_per_episode=64,
            step_per_update=1,
            update_batch_size=32,
Пример #9
0
from trickster.utility import history, visual
from trickster.model import mlp

# 2-vs-2 match; a single agent is trained, observing the game as a vector.
cfg = MatchConfig(canvas_size=(128, 128),
                  players_per_side=2,
                  learning_type=MatchConfig.LEARNING_TYPE_SINGLE_AGENT,
                  observation_type=MatchConfig.OBSERVATION_TYPE_VECTOR)

# Separate train and test environments so evaluation never disturbs training.
env = Match(cfg)
test_env = Match(cfg)

# Dueling Q-network sized to the environment's observation and action spaces.
ann = mlp.wide_dueling_q_network(env.observation_space.shape,
                                 env.action_space.n,
                                 adam_lr=1e-4)

experience = Experience(10000)
# NOTE(review): epsilon_decay=1. keeps epsilon at 1.0 forever, so
# epsilon_min=0.1 is never reached — confirm this is intentional.
agent = DoubleDQN(ann,
                  env.action_space,
                  experience,
                  epsilon=1.,
                  epsilon_decay=1.,
                  epsilon_min=0.1)

# Episodes are capped at 1024 steps; the agent acts on every second frame.
rcfg = RolloutConfig(max_steps=1024, skipframes=2)
training_rollout = Rolling(agent, env, rcfg)
testing_rollout = Trajectory(agent, test_env, rcfg)

# Warm-up: fill the replay buffer with 10000 transitions before training.
print("Filling experience...")
while experience.N < 10000:
    training_rollout.roll(steps=32, verbose=0, push_experience=True)
    # BUG FIX: the counter previously printed "10000/{N}" (e.g. "10000/4500"),
    # which reads backwards; it now prints "{N}/10000".
    print(f"\r{experience.N/10000:.2%} {experience.N}/10000", end="")
Пример #10
0
    # (truncated above: the Sequential's opening and first conv block are not visible)
    keras.layers.BatchNormalization(),
    keras.layers.ReLU(),
    keras.layers.MaxPool2D(),  # 10
    keras.layers.Conv2D(16, 3, kernel_initializer="he_uniform", padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.ReLU(),
    keras.layers.MaxPool2D(),  # 5
    keras.layers.GlobalAveragePooling2D(),  # 16
    keras.layers.Dense(4, kernel_initializer="he_uniform"),
    keras.layers.BatchNormalization(),
    keras.layers.ReLU(),
    keras.layers.Dense(2, kernel_initializer="he_uniform")  # one Q-value per action
])
qnet.compile(keras.optimizers.Adam(1e-3), "mse")

# DQN over 2 actions with a 10k replay buffer and a target network; epsilon
# decays very slowly (0.99999 per step) from 1.0 toward a floor of 0.3.
agent = DQN(qnet, 2, Experience(max_length=10_000), discount_factor_gamma=0.99,
            epsilon=1.0, epsilon_decay=0.99999, epsilon_min=0.3, use_target_network=True,
            state_preprocessor=None)

# Act on every second frame during training.
rollout = Rolling(agent, env, RolloutConfig(skipframes=2))
test_rollout = Trajectory(agent, test_env)

# Rolling windows over the last 100 episodes.
rewards = deque(maxlen=100)
losses = deque(maxlen=100)

episode = 0

# NOTE(review): the training loop body is truncated in this excerpt.
while 1:

    episode += 1
Пример #11
0
    def test_experience_constructor_creates_empty_object(self):
        """A freshly constructed Experience has no memoirs and zero samples."""
        experience = Experience()

        self.assertIsNone(experience.memoirs)
        self.assertEqual(experience.N, 0)
Пример #12
0
    def test_remember_considers_max_size(self):
        """Remembering more items than max_length keeps only the newest ones.

        Storing 120 elements in a 100-slot Experience must retain the
        trailing 100 values (20..119) and report N == 100.
        """
        xp = Experience(max_length=100)
        xp.remember(np.arange(120))

        # BUG FIX: the original called assertTrue(xp.N, 100), which only checks
        # that xp.N is truthy — the 100 was silently taken as the failure
        # message, so the size assertion never actually ran.
        self.assertEqual(xp.N, 100)
        self.assertListEqual(xp.memoirs[0].tolist(), list(range(20, 120)))
Пример #13
0
    def test_experience_remembers_array(self):
        """A single remembered array is stored whole and counted in N."""
        experience = Experience()
        values = np.arange(100)

        experience.remember(values)

        self.assertEqual(experience.N, 100)
        self.assertListEqual(experience.memoirs[0].tolist(), list(range(100)))
Пример #14
0
    def test_experience_constructor_considers_max_size_argument(self):
        """The max_length constructor argument is stored verbatim."""
        capped = Experience(max_length=3)
        self.assertEqual(capped.max_length, 3)
Пример #15
0
    def reset(self):
        """Reset the wrapped env, cache its initial state, and return `self.empty`.

        NOTE(review): `self.empty` looks like a blank observation placeholder —
        confirm against the rest of the class, which is not visible here.
        """
        fresh_state = self.env.reset()
        self.initial_state = fresh_state
        return self.empty


envs = [FakeEnv() for _ in range(10)]
test_env = FakeEnv()

# Policy network: flatten the observation, one wide ReLU layer, softmax over
# the 2 available actions.
actor = Sequential([  # 200, 160
    Flatten(input_shape=test_env.shape),
    Dense(200, activation="relu"),
    Dense(2, activation="softmax")
])
actor.compile(RMSprop(1e-4, rho=0.99), "categorical_crossentropy")

# REINFORCE policy-gradient agent over 2 actions; observations are fed in raw.
agent = REINFORCE(actor, 2, Experience(), discount_factor_gamma=0.99,
                  state_preprocessor=None)

# NOTE(review): elsewhere in this codebase MultiTrajectory receives a LIST of
# agents (one per env); here it gets a single agent — confirm both are supported.
rollout = MultiTrajectory(agent, envs)
test_rollout = Trajectory(agent, test_env)

# Rolling metric windows; rewards keeps only the last 10 episodes, the
# loss-style metrics keep the last 80.
rewards = deque(maxlen=10)
actor_loss = deque(maxlen=80)
actor_utility = deque(maxlen=80)
actor_entropy = deque(maxlen=80)
critic_loss = deque(maxlen=80)

episode = 0

# NOTE(review): the training loop body is truncated in this excerpt.
while 1: