def main(env: wrappers.EvaluationEnv, args: argparse.Namespace) -> None:
    # Fix random seeds and number of threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    if args.recodex:
        # TODO: Perform evaluation of a trained model.
        while True:
            state, done = env.reset(start_evaluation=True), False
            while not done:
                # TODO: Choose an action
                action = None
                state, reward, done, _ = env.step(action)

    else:
        # TODO: Perform training
        # If you want to create N multiprocessing parallel environments, use
        #   vector_env = gym.vector.AsyncVectorEnv(
        #       [lambda: gym.make("CarRacingSoftFS{}-v0".format(args.frame_skip))] * N)
        #   vector_env.seed(args.seed)  # The individual environments will get incremental seeds
        pass
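# A minimal sketch of driving N parallel environments during training, assuming
# the older four-tuple gym step API used throughout these templates; `network`
# and its `predict` method are hypothetical placeholders for your model.
import gym
import numpy as np

def vector_training_sketch(network, frame_skip, N, steps, seed):
    vector_env = gym.vector.AsyncVectorEnv(
        [lambda: gym.make("CarRacingSoftFS{}-v0".format(frame_skip))] * N)
    vector_env.seed(seed)  # The individual environments get incremental seeds

    states = vector_env.reset()
    for _ in range(steps):
        # One action per parallel environment, e.g. greedy w.r.t. predicted values.
        actions = np.argmax(network.predict(states), axis=-1)
        # Finished sub-environments are reset automatically by the vector wrapper.
        states, rewards, dones, _ = vector_env.step(actions)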
def main(env: wrappers.EvaluationEnv, args: argparse.Namespace) -> None:
    # Fix random seed
    np.random.seed(args.seed)

    # TODO: Implement a suitable RL algorithm.
    training = True
    while training:
        # To generate an expert trajectory, you can use
        #   state, trajectory = env.expert_trajectory()

        # TODO: Perform a training episode
        state, done = env.reset(), False
        while not done:
            if args.render_each and env.episode and env.episode % args.render_each == 0:
                env.render()

            # TODO: Choose an action
            action = None
            state, reward, done, _ = env.step(action)

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose (greedy) action
            action = None
            state, reward, done, _ = env.step(action)
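# A hedged sketch of turning expert trajectories into a supervised dataset for
# behavioral cloning. It assumes `env.expert_trajectory()` returns the initial
# state together with a sequence of (action, reward, next_state) triples; check
# the wrappers module for the actual trajectory format before relying on this.
def collect_expert_dataset(env, episodes):
    dataset_states, dataset_actions = [], []
    for _ in range(episodes):
        state, trajectory = env.expert_trajectory()
        for action, reward, next_state in trajectory:
            # Each visited state is labeled with the expert's action in it.
            dataset_states.append(state)
            dataset_actions.append(action)
            state = next_state
    return dataset_states, dataset_actions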
def main(env: wrappers.EvaluationEnv, args: argparse.Namespace) -> None:
    # Fix random seeds and threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    if args.recodex:
        # TODO: Perform evaluation of a trained model.
        while True:
            state, done = env.reset(start_evaluation=True), False
            while not done:
                # TODO: Choose an action
                action = None
                state, reward, done, _ = env.step(action)

    else:
        # TODO: Perform training
        raise NotImplementedError()
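# A minimal sketch of the greedy evaluation choice, assuming a trained model
# whose `predict` method returns per-action value estimates for a batch of
# states (the `network` name is a placeholder, not part of this template).
import numpy as np

def greedy_action(network, state):
    q_values = network.predict([state])[0]
    return int(np.argmax(q_values))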
def main(env: wrappers.EvaluationEnv, args: argparse.Namespace) -> None:
    # Fix random seed
    np.random.seed(args.seed)

    # TODO: Variable creation and initialization

    training = True
    while training:
        # Perform episode
        state, done = env.reset(), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            # TODO: Perform an action.
            action = None
            next_state, reward, done, _ = env.step(action)

            # TODO: Update the action-value estimates

            state = next_state

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose (greedy) action
            action = None
            state, reward, done, _ = env.step(action)
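# One possible instantiation of the TODOs above: tabular Q-learning with an
# epsilon-greedy behavior policy. The Q array and the hyperparameters `epsilon`,
# `alpha` (step size) and `gamma` (discount) are assumptions of this sketch,
# not prescribed by the template.
import numpy as np

def epsilon_greedy(Q, state, epsilon):
    if np.random.uniform() < epsilon:
        return np.random.randint(Q.shape[1])
    return int(np.argmax(Q[state]))

def q_learning_update(Q, state, action, reward, done, next_state, alpha, gamma):
    # The TD target bootstraps from the best next action unless the episode ended.
    target = reward + (0 if done else gamma * np.max(Q[next_state]))
    Q[state, action] += alpha * (target - Q[state, action])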
def main(env: wrappers.EvaluationEnv, args: argparse.Namespace) -> None:
    # Fix random seed
    np.random.seed(args.seed)

    # Implement the Q-learning RL algorithm, using linear approximation.
    W = np.zeros([env.observation_space.nvec[-1], env.action_space.n])
    epsilon = args.epsilon

    training = True
    while training:
        # Perform episode
        state, done = env.reset(), False
        while not done:
            if args.render_each and env.episode and env.episode % args.render_each == 0:
                env.render()

            # TODO: Choose an action.
            action = None
            next_state, reward, done, _ = env.step(action)

            # TODO: Update the action-value estimates

            state = next_state

        if args.epsilon_final_at:
            epsilon = np.interp(env.episode + 1, [0, args.epsilon_final_at],
                                [args.epsilon, args.epsilon_final])

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose (greedy) action
            action = None
            state, reward, done, _ = env.step(action)
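# A hedged sketch of the linear-approximation variant, assuming the observation
# is a vector of active tile indices (tile coding), so that
# Q(state, a) = W[state].sum(axis=0)[a]. The step size `alpha` and discount
# `gamma` are assumptions of this sketch.
import numpy as np

def choose_action(W, state, epsilon):
    if np.random.uniform() < epsilon:
        return np.random.randint(W.shape[1])
    return int(np.argmax(W[state].sum(axis=0)))

def linear_q_update(W, state, action, reward, done, next_state, alpha, gamma):
    target = reward + (0 if done else gamma * np.max(W[next_state].sum(axis=0)))
    # The gradient of the linear approximation is 1 for every active tile.
    W[state, action] += alpha * (target - W[state].sum(axis=0)[action])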
def main(env: wrappers.EvaluationEnv, args: argparse.Namespace) -> None:
    # Fix random seed
    np.random.seed(args.seed)

    # TODO:
    # - Create Q, a zero-filled NumPy array with shape [number of states, number of actions],
    #   representing estimated Q value of a given (state, action) pair.
    # - Create C, a zero-filled NumPy array with the same shape,
    #   representing number of observed returns of a given (state, action) pair.

    for _ in range(args.episodes):
        # TODO: Perform an episode, collecting states, actions and rewards.
        state, done = env.reset(), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            # TODO: Compute `action` using epsilon-greedy policy.
            action = None

            # Perform the action.
            next_state, reward, done, _ = env.step(action)

            state = next_state

        # TODO: Compute returns from the received rewards and update Q and C.

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose a greedy action
            action = None
            state, reward, done, _ = env.step(action)
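# One way to realize the Monte Carlo TODOs: walk the episode backwards,
# accumulate discounted returns, and average them into Q through the visit
# counts in C. Every-visit averaging and the discount `gamma` are assumptions
# of this sketch.
def monte_carlo_update(Q, C, states, actions, rewards, gamma=1.0):
    G = 0.0
    for state, action, reward in zip(reversed(states), reversed(actions), reversed(rewards)):
        G = reward + gamma * G
        C[state, action] += 1
        # Incremental mean of all returns observed for this (state, action) pair.
        Q[state, action] += (G - Q[state, action]) / C[state, action]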
def main(env: wrappers.EvaluationEnv, args: argparse.Namespace) -> None:
    # Fix random seeds and number of threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Construct the network
    network = Network(env, args)

    # Replay memory; the `maxlen` parameter can be passed to deque for a size limit,
    # which we do not need in this simple task.
    replay_buffer = collections.deque()
    Transition = collections.namedtuple(
        "Transition", ["state", "action", "reward", "done", "next_state"])

    epsilon = args.epsilon
    training = True
    while training:
        # Perform episode
        state, done = env.reset(), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            # TODO: Choose an action.
            # You can compute the q_values of a given state by
            #   q_values = network.predict([state])[0]
            action = None
            next_state, reward, done, _ = env.step(action)

            # Append state, action, reward, done and next_state to replay_buffer
            replay_buffer.append(Transition(state, action, reward, done, next_state))

            # TODO: If the replay_buffer is large enough, perform a training step
            # using `args.batch_size` uniformly randomly chosen transitions.
            #
            # After you choose `states` and suitable targets, you can train the network as
            #   network.train(states, ...)

            state = next_state

        if args.epsilon_final_at:
            epsilon = np.interp(env.episode + 1, [0, args.epsilon_final_at],
                                [args.epsilon, args.epsilon_final])

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose (greedy) action
            action = None
            state, reward, done, _ = env.step(action)
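# A sketch of the replay-buffer training step: sample transitions uniformly and
# regress the taken actions' q_values toward one-step targets. It uses the
# `network.predict` / `network.train` calls suggested by the template's
# comments; the discount `gamma` is an assumption of this sketch.
import numpy as np

def train_on_batch(network, replay_buffer, batch_size, gamma):
    if len(replay_buffer) < batch_size:
        return
    indices = np.random.choice(len(replay_buffer), size=batch_size, replace=False)
    states, actions, rewards, dones, next_states = map(
        np.array, zip(*[replay_buffer[i] for i in indices]))

    # Start from current predictions so only the taken action's target changes.
    targets = np.array(network.predict(states))
    next_values = np.max(network.predict(next_states), axis=1)
    targets[np.arange(batch_size), actions] = rewards + ~dones * gamma * next_values
    network.train(states, targets)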
def main(env: wrappers.EvaluationEnv, args: argparse.Namespace) -> None:
    # Fix random seeds and threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Construct the agent
    agent = Agent(env, args)

    # Training
    for _ in range(args.episodes // args.batch_size):
        batch_states, batch_actions, batch_returns = [], [], []
        for _ in range(args.batch_size):
            # Perform episode
            states, actions, rewards = [], [], []
            state, done = env.reset(), False
            while not done:
                if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                    env.render()

                # TODO(reinforce): Choose `action` according to the probability
                # distribution (see `np.random.choice`), which you can compute
                # using `agent.predict` and the current `state`.
                action = None

                next_state, reward, done, _ = env.step(action)

                states.append(state)
                actions.append(action)
                rewards.append(reward)

                state = next_state

            # TODO(reinforce): Compute returns from the received rewards

            # TODO(reinforce): Add states, actions and returns to the training batch

        # TODO(reinforce): Train using the generated batch.

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO(reinforce): Choose greedy action
            action = None
            state, reward, done, _ = env.step(action)
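# A sketch of the REINFORCE return computation: the return of each step is the
# discounted sum of rewards from that step to the end of the episode. The
# discount `gamma` is an assumption (undiscounted tasks simply use gamma = 1).
import numpy as np

def compute_returns(rewards, gamma=1.0):
    returns = np.zeros(len(rewards))
    G = 0.0
    for t in reversed(range(len(rewards))):
        G = rewards[t] + gamma * G
        returns[t] = G
    return returns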
def main(env: wrappers.EvaluationEnv, args: argparse.Namespace) -> None:
    # Fix random seeds and number of threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Construct the network
    network = Network(env, args)

    # Replay memory; the `maxlen` parameter can be passed to deque for a size limit,
    # which we do not need in this simple task.
    replay_buffer = collections.deque()
    Transition = collections.namedtuple(
        "Transition", ["state", "action", "reward", "done", "next_state"])

    def evaluate_episode(start_evaluation: bool = False) -> float:
        rewards, state, done = 0, env.reset(start_evaluation), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            # TODO: Predict the action using the greedy policy
            action = None
            state, reward, done, _ = env.step(action)
            rewards += reward
        return rewards

    noise = OrnsteinUhlenbeckNoise(env.action_space.shape[0], 0, args.noise_theta, args.noise_sigma)
    training = True
    while training:
        # Training
        for _ in range(args.evaluate_each):
            state, done = env.reset(), False
            noise.reset()
            while not done:
                # TODO: Predict actions by calling `network.predict_actions`
                # and adding the Ornstein-Uhlenbeck noise. As in paac_continuous,
                # clip the actions to the `env.action_space.{low,high}` range.
                action = None

                next_state, reward, done, _ = env.step(action)
                replay_buffer.append(Transition(state, action, reward, done, next_state))
                state = next_state

                if len(replay_buffer) >= args.batch_size:
                    batch = np.random.choice(len(replay_buffer), size=args.batch_size, replace=False)
                    states, actions, rewards, dones, next_states = map(
                        np.array, zip(*[replay_buffer[i] for i in batch]))
                    # TODO: Perform the training

        # Periodic evaluation
        for _ in range(args.evaluate_for):
            evaluate_episode()

    # Final evaluation
    while True:
        evaluate_episode(start_evaluation=True)
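# A hedged sketch of the remaining DDPG pieces. `network.predict_actions` is
# named by the template; `noise.sample()`, `network.predict_values` (target
# critic evaluated on target-actor actions) and `network.train` are assumed
# placeholders, so adjust them to your Network and noise implementations.
import numpy as np

def noisy_action(network, state, noise, action_space):
    # Exploration: actor output perturbed by Ornstein-Uhlenbeck noise, clipped
    # to the valid action range.
    action = network.predict_actions([state])[0] + noise.sample()
    return np.clip(action, action_space.low, action_space.high)

def ddpg_train_step(network, states, actions, rewards, dones, next_states, gamma):
    # One-step returns bootstrap from the target networks on the next states.
    returns = rewards + ~dones * gamma * network.predict_values(next_states)
    network.train(states, actions, returns)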