parser.add_argument("--render_each", default=0, type=int, help="Render some episodes.") parser.add_argument("--threads", default=4, type=int, help="Maximum number of threads to use.") parser.add_argument("--debug", default=False, type=bool, help="Enable debug outputs.") args = parser.parse_args() # Create the environment env = mountain_car_evaluator.environment(discrete=True) # Construct the network network = Network(threads=args.threads) network.construct(args, env.states, env.actions) epsilon = args.epsilon alpha = args.learning_rate reward_history = [] reward_threshold = -170 reward_mean_length = 150 episodes_since_reset = 0
# Parse arguments
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--episodes", default=None, type=int, help="Training episodes.")
parser.add_argument("--render_each", default=None, type=int, help="Render some episodes.")
parser.add_argument("--alpha", default=None, type=float, help="Learning rate.")
parser.add_argument("--alpha_final", default=None, type=float, help="Final learning rate.")
parser.add_argument("--epsilon", default=None, type=float, help="Exploration factor.")
parser.add_argument("--epsilon_final", default=None, type=float, help="Final exploration factor.")
parser.add_argument("--gamma", default=None, type=float, help="Discounting factor.")
args = parser.parse_args()

# Create the environment
env = mountain_car_evaluator.environment()

# TODO: Implement Q-learning RL algorithm.
#
# The overall structure of the code follows.
training = True
while training:
    # Perform a training episode
    state, done = env.reset(), False
    while not done:
        if args.render_each and env.episode and env.episode % args.render_each == 0:
            env.render()

        # TODO: Choose `action` according to an epsilon-greedy policy.
        next_state, reward, done, _ = env.step(action)

        # TODO: Perform the Q-learning update and advance `state`.

# Perform last 100 evaluation episodes
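# A minimal sketch of the missing pieces above, assuming a tabular Q function.
# This is one possible implementation, not the official solution; `Q` is
# introduced here for illustration, and the interpolation of alpha/epsilon
# toward their *_final values is omitted for brevity.
import numpy as np

Q = np.zeros((env.states, env.actions))
epsilon, alpha, gamma = args.epsilon, args.alpha, args.gamma

state, done = env.reset(), False
while not done:
    # Epsilon-greedy action selection.
    if np.random.uniform() < epsilon:
        action = np.random.randint(env.actions)
    else:
        action = np.argmax(Q[state])

    next_state, reward, done, _ = env.step(action)

    # Q-learning TD update: bootstrap from the greedy action in `next_state`.
    Q[state, action] += alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])
    state = next_state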
parser.add_argument("--epsilon_final", default=0.001, type=float, help="Final exploration factor.") parser.add_argument("--gamma", default=1, type=float, help="Discounting factor.") parser.add_argument("--tiles", default=8, type=int, help="Number of tiles.") # default 8 args = parser.parse_args() # Create the environment env = mountain_car_evaluator.environment(tiles=args.tiles) # Implement Q-learning RL algorithm, using linear approximation. W = np.zeros([env.weights, env.actions]) epsilon = args.epsilon alpha = args.alpha / args.tiles evaluating = False while not evaluating: state, done = env.reset(evaluating), False while not done: if args.render_each and env.episode and env.episode % args.render_each == 0: env.render() # Choose `action` according to epsilon-greedy strategy if np.random.uniform() > epsilon:
import numpy as np
import tensorflow as tf

import mountain_car_evaluator

class Network:
    def __init__(self, threads, seed=42):
        # Create an empty graph and a session
        graph = tf.Graph()
        graph.seed = seed
        self.session = tf.Session(graph=graph, config=tf.ConfigProto(
            inter_op_parallelism_threads=threads, intra_op_parallelism_threads=threads))

    def construct(self, args, num_states, num_actions):
        with self.session.graph.as_default():
            # Input states
            self.states = tf.placeholder(tf.int32, [None])
            # Input q_values (used as targets for training)
            self.q_values = tf.placeholder(tf.float32, [None, num_actions])

            # TODO: Compute one-hot representation of self.states.

            # TODO: Compute the q_values as a single fully connected layer without
            # activation, with `num_actions` outputs, using the one-hot encoded
            # states. It is important to use such a trivial architecture for the
            # network to train at all.

            # Training
            # TODO: Perform the training, using mean squared error of the given
            # `q_values` and the predicted ones.

            # Initialize variables
            self.session.run(tf.global_variables_initializer())

    def predict(self, states):
        # TODO: Predict q_values for given states
        raise NotImplementedError()

    def train(self, states, q_values):
        # TODO: Given states and target Q-values, perform the training
        raise NotImplementedError()

if __name__ == "__main__":
    # Fix random seed
    np.random.seed(42)

    # Parse arguments
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--episodes", default=500, type=int, help="Training episodes.")
    parser.add_argument("--epsilon", default=0.1, type=float, help="Exploration factor.")
    parser.add_argument("--epsilon_final", default=0.1, type=float, help="Final exploration factor.")
    parser.add_argument("--gamma", default=1.0, type=float, help="Discounting factor.")
    parser.add_argument("--learning_rate", default=0.01, type=float, help="Learning rate.")
    parser.add_argument("--render_each", default=0, type=int, help="Render some episodes.")
    parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")
    args = parser.parse_args()

    # Create the environment
    env = mountain_car_evaluator.environment(discrete=True)

    # Construct the network
    network = Network(threads=args.threads)
    network.construct(args, env.states, env.actions)

    evaluating = False
    epsilon = args.epsilon

    while True:
        # TODO: Decide if we want to start evaluating -- for example after
        # processing args.episodes (i.e., env.episode >= args.episodes), but
        # you can use other logic.

        # Perform episode
        state, done = env.reset(evaluating), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            # TODO: Compute q_values using the network and choose an action
            # using the epsilon-greedy policy.
            action = ...

            next_state, reward, done, _ = env.step(action)

            # Perform the network update
            # TODO: Compute the q_values of the next_state.
            # TODO: Update the target q_values for `state`, using the TD update
            # for action `action` (leaving the q_values of other actions unchanged).
            # TODO: Train the network using the computed target q_values for `state`.

            state = next_state

        # Epsilon interpolation
        if args.epsilon_final:
            epsilon = np.exp(np.interp(env.episode + 1, [0, args.episodes],
                                       [np.log(args.epsilon), np.log(args.epsilon_final)]))
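# Below is one possible completion of the TODOs above -- a minimal sketch, not
# the official solution. It one-hot encodes the discrete states, predicts
# Q-values with a single dense layer without activation, and minimizes the
# mean squared error against the supplied target q_values. `NetworkSketch`
# and `self.predictions` are names introduced here for illustration.
class NetworkSketch(Network):
    def construct(self, args, num_states, num_actions):
        with self.session.graph.as_default():
            self.states = tf.placeholder(tf.int32, [None])
            self.q_values = tf.placeholder(tf.float32, [None, num_actions])

            # One-hot representation of the states.
            states_one_hot = tf.one_hot(self.states, num_states)

            # A single fully connected layer without activation.
            self.predictions = tf.layers.dense(states_one_hot, num_actions, activation=None)

            # Mean squared error between target and predicted q_values.
            loss = tf.losses.mean_squared_error(self.q_values, self.predictions)
            self.training = tf.train.GradientDescentOptimizer(args.learning_rate).minimize(loss)

            self.session.run(tf.global_variables_initializer())

    def predict(self, states):
        return self.session.run(self.predictions, {self.states: states})

    def train(self, states, q_values):
        self.session.run(self.training, {self.states: states, self.q_values: q_values})

# Inside the episode loop, the epsilon-greedy choice and the TD update could
# then read as follows (again a sketch; `network` is assumed to have been
# constructed as a NetworkSketch):
#
#     q_values = network.predict([state])[0]
#     if np.random.uniform() < epsilon:
#         action = np.random.randint(env.actions)
#     else:
#         action = np.argmax(q_values)
#     next_state, reward, done, _ = env.step(action)
#
#     # TD target for the taken action; other actions keep their predictions.
#     target = q_values.copy()
#     target[action] = reward + (0 if done else args.gamma * np.max(network.predict([next_state])[0]))
#     network.train([state], [target])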