def create_env(args: argparse.Namespace, report_each: int = 100, **kwargs) \
        -> tuple[wrappers.EvaluationEnv, np.ndarray, np.ndarray, np.ndarray]:
    # Create the environment
    env = wrappers.EvaluationEnv(gym.make("Taxi-v3"), seed=args.seed, report_each=report_each, **kwargs)

    # Extract a deterministic MDP into three NumPy arrays:
    # - R[state][action] is the reward
    # - D[state][action] is the True/False value indicating end of episode
    # - N[state][action] is the next state
    R, D, N = [
        np.array([[env.P[s][a][0][i] for a in range(env.action_space.n)]
                  for s in range(env.observation_space.n)])
        for i in [2, 3, 1]
    ]

    return env, R, D, N
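
# A minimal sketch of how the extracted arrays can be used: one greedy sweep
# of value iteration over the deterministic MDP. The discount factor `gamma`
# is an assumption; it is not part of the template above.
def value_iteration_sweep(V: np.ndarray, R: np.ndarray, D: np.ndarray, N: np.ndarray,
                          gamma: float = 0.99) -> np.ndarray:
    # Bootstrapped value of every (state, action) pair; episode-ending
    # transitions (where D is True) do not bootstrap on the next state.
    action_values = R + gamma * ~D * V[N]
    # Greedy backup: the new value of a state is its best action value.
    return action_values.max(axis=-1)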
            np.concatenate(states), {
                "actions": np.concatenate(actions),
                "action_probs": np.concatenate(action_probs),
                "advantages": np.concatenate(advantages),
                "returns": np.concatenate(returns)
            },
            batch_size=args.batch_size, epochs=args.epochs, verbose=0,
        )

        # Periodic evaluation
        iteration += 1
        if iteration % args.evaluate_each == 0:
            for _ in range(args.evaluate_for):
                evaluate_episode()

    # Final evaluation
    while True:
        evaluate_episode(start_evaluation=True)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(gym.make("SingleCollect-v0"), args.seed)

    main(env, args)
            for i in range(args.envs):
                replay_buffer.append(Transition(state[i], action[i], reward[i], done[i], next_state[i]))
            state = next_state

            # Training
            if len(replay_buffer) >= 4 * args.batch_size:
                # Note that until now we used `np.random.choice` with `replace=False` to generate
                # batch indices. However, this call is extremely slow for large buffers, because
                # it generates a whole permutation. With `np.random.randint`, indices may repeat,
                # but once the buffer is large, they do so only with low probability.
                batch = np.random.randint(len(replay_buffer), size=args.batch_size)
                states, actions, rewards, dones, next_states = map(
                    np.array, zip(*[replay_buffer[i] for i in batch]))

                # TODO: Perform the training

        # Periodic evaluation
        for _ in range(args.evaluate_for):
            evaluate_episode()

    # Final evaluation
    while True:
        evaluate_episode(start_evaluation=True)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(gym.make(args.env), args.seed)

    main(env, args)
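
# A hedged sketch of the training step left as a TODO above, assuming a
# DQN-style setup in which `network.predict(states)` returns Q-values of shape
# [batch_size, num_actions] and `network.train(states, q_values)` fits them.
# Both method names and the `gamma` parameter are assumptions, not part of the
# template; `dones` is expected to be a boolean NumPy array.
def dqn_training_step(network, states, actions, rewards, dones, next_states, gamma: float) -> None:
    q_values = network.predict(states)
    # One-step TD targets; terminal transitions do not bootstrap.
    targets = rewards + gamma * ~dones * network.predict(next_states).max(axis=-1)
    # Update only the Q-values of the actions that were actually taken.
    q_values[np.arange(len(actions)), actions] = targets
    network.train(states, q_values)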
def main(env: wrappers.EvaluationEnv, args: argparse.Namespace) -> None:
    # Fix random seeds and threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    if args.recodex:
        # TODO: Perform evaluation of a trained model.
        while True:
            state, done = env.reset(start_evaluation=True), False
            while not done:
                # TODO: Choose an action
                action = None
                state, reward, done, _ = env.step(action)

    else:
        # TODO: Perform training
        raise NotImplementedError()


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(gym.make("CartPolePixels-v0"), args.seed)

    main(env, args)
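
# A possible way to fill in the evaluation TODO above, assuming a trained
# `network` whose `predict` returns per-action scores (probabilities or
# Q-values) of shape [batch_size, num_actions]; the `network` object and its
# `predict` method are assumptions of this sketch.
def choose_greedy_action(network, state) -> int:
    # Greedy action according to the model's output for a single state.
    return int(np.argmax(network.predict(np.array([state]))[0]))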
{ "actions": np.concatenate(actions)[:, a], "action_probs": np.concatenate(action_probs)[:, a], "advantages": np.concatenate(advantages), "returns": np.concatenate(returns) }, batch_size=args.batch_size, epochs=args.epochs, verbose=0, ) # Periodic evaluation iteration += 1 if iteration % args.evaluate_each == 0: for _ in range(args.evaluate_for): evaluate_episode() # Final evaluation while True: evaluate_episode(start_evaluation=True) if __name__ == "__main__": args = parser.parse_args([] if "__file__" not in globals() else None) # Create the environment env = wrappers.EvaluationEnv( gym.make("MultiCollect{}-v0".format(args.agents)), args.seed) main(env, args)
            next_state, reward, done, _ = env.step(action)

            states.append(state)
            actions.append(action)
            rewards.append(reward)

            state = next_state

        # TODO(reinforce): Compute returns from the received rewards

        # TODO(reinforce): Add states, actions and returns to the training batch

        # TODO(reinforce): Train using the generated batch.

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO(reinforce): Choose greedy action
            action = None
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(gym.make("CartPole-v1"), args.seed)

    main(env, args)
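
# A minimal sketch of the "compute returns" TODO above: discounted returns
# computed backwards over the episode rewards. The `gamma` parameter is an
# assumption (the template does not define it; gamma=1.0 yields plain
# undiscounted returns).
def compute_returns(rewards: list[float], gamma: float = 1.0) -> list[float]:
    returns, g = [], 0.0
    for reward in reversed(rewards):
        # The return of a step is its reward plus the discounted future return.
        g = reward + gamma * g
        returns.append(g)
    return returns[::-1]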
        replay_buffer.append(episode)

        # Train the network if enough data is available
        if len(replay_buffer) >= args.batch_size:
            network.train([replay_buffer[i] for i in np.random.choice(
                len(replay_buffer), size=args.batch_size, replace=False)])

        # TODO(memory_game): Maybe evaluate the current performance, using
        # the `evaluate_episode()` method returning the achieved return,
        # and setting `training=False` when the performance is high enough;
        # see the sketch after this snippet.

    # Final evaluation
    while True:
        evaluate_episode(True)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(
        memory_game_environment.make(args.cards), args.seed,
        evaluate_for=args.evaluate_for, report_each=args.evaluate_for)

    main(env, args)
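
# A hedged sketch of the optional evaluation TODO above: run a few evaluation
# episodes and compare the mean return to a threshold, flipping `training`
# off when it is reached. The `episodes` and `threshold` parameters are
# assumptions; `evaluate_episode` is passed in as a callable.
def performance_high_enough(evaluate_episode, episodes: int, threshold: float) -> bool:
    # Average the achieved returns of several evaluation episodes.
    returns = [evaluate_episode() for _ in range(episodes)]
    return float(np.mean(returns)) >= threshold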
            # TODO: Predict the action distribution using `network.predict_actions`,
            # and then sample from it using, for example, `np.random.normal`. Do not
            # forget to clip the actions to the `env.action_space.{low,high}`
            # range, for example using `np.clip`.
            actions = None

            # TODO(paac): Perform steps in the vectorized environment

            # TODO(paac): Compute estimates of returns by one-step bootstrapping

            # TODO(paac): Train network using current states, chosen actions and estimated returns

        # Periodic evaluation
        for _ in range(args.evaluate_for):
            evaluate_episode()

    # Final evaluation
    while True:
        evaluate_episode(start_evaluation=True)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(
        wrappers.DiscreteMountainCarWrapper(
            gym.make("MountainCarContinuous-v0"), tiles=args.tiles), args.seed)

    main(env, args)
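
# A hedged sketch of the action-sampling TODO above, assuming that
# `network.predict_actions(states)` returns a pair `(mus, sds)` with the
# means and standard deviations of the per-action normal distributions;
# the exact return value of `predict_actions` is an assumption.
def sample_actions(network, states, env):
    mus, sds = network.predict_actions(states)
    # Sample from the predicted normal distributions.
    actions = np.random.normal(mus, sds)
    # Keep the sampled actions inside the valid action range.
    return np.clip(actions, env.action_space.low, env.action_space.high)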
        # TODO: Perform evaluation of a trained model.
        while True:
            state, done = env.reset(start_evaluation=True), False
            while not done:
                # TODO: Choose an action
                action = None
                state, reward, done, _ = env.step(action)

    else:
        # TODO: Perform training
        #
        # If you want to create N multiprocessing parallel environments, use
        #   vector_env = gym.vector.AsyncVectorEnv(
        #       [lambda: gym.make("CarRacingSoftFS{}-v0".format(args.frame_skip))] * N)
        #   vector_env.seed(args.seed)  # The individual environments will get incremental seeds
        pass


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(
        gym.make("CarRacingSoftFS{}-v0".format(args.frame_skip)), args.seed,
        evaluate_for=15, report_each=1)

    main(env, args)
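
# A short usage sketch for the vectorized environment described above, using
# the batched (pre-0.26) Gym vector API that matches the rest of this
# template; the random actions are only a placeholder for a real policy.
def vector_env_demo(vector_env, steps: int = 10) -> None:
    # `reset` and `step` operate on whole batches of environments at once.
    states = vector_env.reset()
    for _ in range(steps):
        actions = np.array([vector_env.single_action_space.sample()
                            for _ in range(vector_env.num_envs)])
        states, rewards, dones, _ = vector_env.step(actions)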
        state, done = env.reset(), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            # TODO: Perform an action.
            action = None
            next_state, reward, done, _ = env.step(action)

            # TODO: Update the action-value estimates

            state = next_state

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose (greedy) action
            action = None
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(wrappers.DiscreteMountainCarWrapper(gym.make("MountainCar1000-v0")), args.seed)

    main(env, args)
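
# A minimal sketch of the action-value update left as a TODO above: the
# one-step Q-learning update. `Q`, `alpha` and `gamma` are assumptions here
# (standard names for the value table, step size and discount factor, but
# not defined in this template fragment).
def q_learning_update(Q, state, action, reward, done, next_state, alpha: float, gamma: float) -> None:
    # Terminal transitions do not bootstrap on the next state.
    target = reward + (0.0 if done else gamma * np.max(Q[next_state]))
    Q[state, action] += alpha * (target - Q[state, action])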
def main(args: argparse.Namespace) -> np.ndarray:
    # Create a random generator with a fixed seed
    generator = np.random.RandomState(args.seed)

    # Create the environment
    env = wrappers.EvaluationEnv(gym.make("Taxi-v3"), seed=args.seed, report_each=min(200, args.episodes))

    Q = np.zeros((env.observation_space.n, env.action_space.n))

    # The next action is always chosen in the epsilon-greedy way.
    def choose_next_action(Q: np.ndarray) -> tuple[int, float]:
        greedy_action = argmax_with_tolerance(Q[next_state])
        next_action = greedy_action if generator.uniform() >= args.epsilon else env.action_space.sample()
        return next_action, args.epsilon / env.action_space.n + (1 - args.epsilon) * (greedy_action == next_action)

    # The target policy is either the behavior policy (if not args.off_policy),
    # or the greedy policy (if args.off_policy).
    def compute_target_policy(Q: np.ndarray) -> np.ndarray:
        target_policy = np.eye(env.action_space.n)[argmax_with_tolerance(Q, axis=-1)]
        if not args.off_policy:
            target_policy = (1 - args.epsilon) * target_policy + args.epsilon / env.action_space.n
        return target_policy

    # Run the TD algorithm
    for _ in range(args.episodes):
        next_state, done = env.reset(), False

        # Generate episode and update Q using the given TD method
        next_action, next_action_prob = choose_next_action(Q)
        while not done:
            action, action_prob, state = next_action, next_action_prob, next_state
            next_state, reward, done, _ = env.step(action)
            if not done:
                next_action, next_action_prob = choose_next_action(Q)

            # TODO: Perform the update to the state-action value function `Q`, using
            # a TD update with the following parameters:
            # - `args.n`: use `args.n`-step method
            # - `args.off_policy`:
            #   - if False, the epsilon-greedy behaviour policy is also the target policy
            #   - if True, the target policy is the greedy policy
            #     - for SARSA (with any `args.n`) and expected SARSA (with `args.n` > 1),
            #       importance sampling must be used
            # - `args.mode`: this argument can have the following values:
            #   - "sarsa": regular SARSA algorithm
            #   - "expected_sarsa": expected SARSA algorithm
            #   - "tree_backup": tree backup algorithm
            #
            # Perform the updates as soon as you can -- whenever you have all the information
            # to update `Q[state, action]`, do it. For each `action` use its corresponding
            # `action_prob` at the time of taking the `action` as the behaviour policy probability,
            # and the `compute_target_policy(Q)` with the current `Q` as the target policy.
            #
            # Do not forget that when `done` is True, bootstrapping on the
            # `next_state` is not used.
            #
            # Also note that when the episode ends and `args.n` > 1, there will
            # be several state-action pairs that also need to be updated. Perform
            # the updates in the order in which you encountered the state-action
            # pairs and during these updates, use the `compute_target_policy(Q)`
            # with the up-to-date value of `Q`.

    return Q
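
# To make the TODO above concrete, here is its simplest special case as a
# standalone sketch: the on-policy one-step SARSA update (`args.n == 1`,
# `args.mode == "sarsa"`, `args.off_policy == False`). The `alpha` and
# `gamma` parameters are assumed hyperparameters. The n-step variants replace
# the single reward with a sum of n discounted rewards, and the off-policy
# variants additionally weight the target by importance-sampling ratios
# (behaviour `action_prob` versus the target policy from
# `compute_target_policy(Q)`).
def sarsa_update(Q, state, action, reward, done, next_state, next_action,
                 alpha: float, gamma: float) -> None:
    # On-policy one-step SARSA target; no bootstrapping on terminal transitions.
    target = reward + (0.0 if done else gamma * Q[next_state, next_action])
    Q[state, action] += alpha * (target - Q[state, action])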
    while training:
        # To generate an expert trajectory, you can use
        #   state, trajectory = env.expert_trajectory()

        # TODO: Perform a training episode
        state, done = env.reset(), False
        while not done:
            if args.render_each and env.episode and env.episode % args.render_each == 0:
                env.render()

            state, reward, done, _ = env.step(action)

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose (greedy) action
            action = None
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(
        wrappers.DiscreteLunarLanderWrapper(gym.make("LunarLander-v2")), args.seed)

    main(env, args)
            if args.render_each and env.episode % args.render_each == 0:
                # Produce an HTML visualization using all the stored states.
                env.render("html", path="{}{}.html".format(args.env, env.episode))

        return rewards

    # Evaluation in ReCodEx
    if args.recodex:
        while True:
            evaluate_episode(start_evaluation=True)

    # TODO: Perform training.
    #
    # Note that SAC had issues with exploding gradients (the model started
    # to predict NaNs after several updates); the problem went away after
    # passing `clipnorm=10` to `tf.optimizers.Adam`. Note that the value `10`
    # was my first try and is definitely not an optimal one.
    #
    # A vectorized Brax environment can be created using
    #   venv = wrappers.BraxWrapper(args.env, workers=args.threads)
    raise NotImplementedError()


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(wrappers.BraxWrapper(args.env), args.seed)

    main(env, args)
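
# The gradient-clipping workaround mentioned in the TODO above, as a one-line
# sketch. The `learning_rate` parameter is an assumption; `10` is the value
# from the note, not a tuned constant.
def make_clipped_adam(learning_rate: float) -> tf.optimizers.Adam:
    # Adam with global gradient-norm clipping to avoid exploding gradients.
    return tf.optimizers.Adam(learning_rate, clipnorm=10)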
            action = None

            # Perform the action.
            next_state, reward, done, _ = env.step(action)

            states.append(state)
            actions.append(action)
            rewards.append(reward)

            state = next_state

        # TODO: Compute returns from the received rewards and update Q and C.

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose a greedy action
            action = None
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(
        wrappers.DiscreteCartPoleWrapper(gym.make("CartPole-v1")), args.seed)

    main(env, args)
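
# A hedged sketch of the Monte Carlo TODO above, assuming `Q` holds the
# action-value estimates and `C` counts the number of updates of each
# state-action pair; undiscounted returns are also an assumption of this
# sketch.
def monte_carlo_update(Q, C, states, actions, rewards) -> None:
    g = 0.0
    # Walk the episode backwards, accumulating the return.
    for state, action, reward in zip(reversed(states), reversed(actions), reversed(rewards)):
        g += reward
        C[state, action] += 1
        # Incremental running average of the observed returns.
        Q[state, action] += (g - Q[state, action]) / C[state, action]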