def main():
    """Train a TD(0) actor-critic agent on MountainCarContinuous-v0.

    Builds the RBF feature transformer plus policy and value models, runs N
    training episodes, then plots per-episode rewards, the running average,
    and the negative state-value surface (cost-to-go).

    Side effects: prints progress, shows matplotlib figures, and (with a
    'monitor' CLI argument) wraps the env in a recording Monitor.
    """
    env = gym.make('MountainCarContinuous-v0')
    ft = FeatureTransformer(env, n_components=100)
    D = ft.dimensions
    pmodel = PolicyModel(D, ft)
    vmodel = ValueModel(D, ft)
    gamma = 0.99  # discount factor

    # Optional: record videos/stats when invoked as `python script.py monitor`.
    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 50
    totalrewards = np.empty(N)
    # (removed unused `costs` array and the always-true `if n % 1 == 0` guard)
    for n in range(N):
        totalreward = play_one_td(env, pmodel, vmodel, gamma)
        totalrewards[n] = totalreward
        print(
            "episode:", n,
            "total reward: %.1f" % totalreward,
            "avg reward (last 100): %.1f" % totalrewards[max(0, n - 100):(n + 1)].mean())
    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()

    plot_running_avg(totalrewards)
    plot_cost_to_go(env, vmodel)
def main():
    """Train a policy for MountainCarContinuous-v0 by random search.

    Creates a TensorFlow-backed PolicyModel, runs random_search to find the
    best-performing parameter vector, then evaluates that policy over 100
    episodes and plots the per-episode rewards from the search.

    Side effects: prints results, shows a matplotlib figure, and (with a
    'monitor' CLI argument) wraps the env in a recording Monitor.
    """
    env = gym.make('MountainCarContinuous-v0')
    ft = FeatureTransformer(env, n_components=100)
    D = ft.dimensions
    pmodel = PolicyModel(ft, D, [], [])
    # PolicyModel manages its own variable initialization via init_vars(),
    # so no explicit global_variables_initializer() run is needed here.
    session = tf.InteractiveSession()
    pmodel.set_session(session)
    pmodel.init_vars()
    gamma = 0.99  # discount factor

    # Optional: record videos/stats when invoked as `python script.py monitor`.
    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    totalrewards, pmodel = random_search(env, pmodel, gamma)
    print("max reward:", np.max(totalrewards))

    # play 100 episodes and check the average
    avg_totalrewards = play_multiple_episodes(env, 100, pmodel, gamma, print_iters=True)
    print("avg reward over 100 episodes with best models:", avg_totalrewards)

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()
def main():
    """Train a TD(0) actor-critic agent (TF1-compat) on MountainCarContinuous-v0.

    Builds the RBF feature transformer, policy and value models sharing one
    tf.compat.v1 session, runs N training episodes, then plots per-episode
    rewards, the running average, and the cost-to-go surface.

    Side effects: prints progress, shows matplotlib figures, and (with a
    'monitor' CLI argument) wraps the env in a recording Monitor.
    """
    env = gym.make('MountainCarContinuous-v0')
    ft = FeatureTransformer(env, n_components=100)
    D = ft.dimensions
    pmodel = PolicyModel(D, ft, [])
    vmodel = ValueModel(D, ft, [])
    init = tf.compat.v1.global_variables_initializer()
    session = tf.compat.v1.InteractiveSession()
    session.run(init)
    pmodel.set_session(session)
    vmodel.set_session(session)
    gamma = 0.95  # discount factor

    # Optional: record videos/stats when invoked as `python script.py monitor`.
    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 50
    totalrewards = np.empty(N)
    # (removed unused `costs` array and the always-true `if n % 1 == 0` guard)
    for n in range(N):
        totalreward, num_steps = play_one_td(env, pmodel, vmodel, gamma)
        totalrewards[n] = totalreward
        print(
            "episode:", n,
            "total reward: %.1f" % totalreward,
            "num steps: %d" % num_steps,
            "avg reward (last 100): %.1f" % totalrewards[max(0, n-100):(n+1)].mean())
    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()

    plot_running_avg(totalrewards)
    plot_cost_to_go(env, vmodel)
def main():
    """Train a policy for MountainCarContinuous-v0 by random search (no TF session).

    Creates a PolicyModel, runs random_search to find the best-performing
    parameter vector, then evaluates that policy over 100 episodes and plots
    the per-episode rewards gathered during the search.

    Side effects: prints results, shows a matplotlib figure, and (with a
    'monitor' CLI argument) wraps the env in a recording Monitor.
    """
    env = gym.make('MountainCarContinuous-v0')
    ft = FeatureTransformer(env)
    D = ft.dimensions
    pmodel = PolicyModel(ft, D, [], [])
    gamma = 0.99  # discount factor

    # Optional: record videos/stats when invoked as `python script.py monitor`.
    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    totalrewards, pmodel = random_search(env, pmodel, gamma)
    print("max reward", np.max(totalrewards))

    # play 100 episodes and check the average
    avg_totalrewards = play_multiple_episodes(env, 100, pmodel, gamma, print_iters=True)
    # fixed typo in the user-facing message: "episoes" -> "episodes"
    print("avg reward over 100 episodes with best models:", avg_totalrewards)

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()
# NOTE(review): this chunk is an extraction artifact — the tail of an n-step
# return function (its opening `if` branch lies outside the visible chunk)
# fused onto a truncated `__main__` training script, all collapsed onto one
# physical line. The visible logic appears to: flush remaining transitions by
# padding rewards with -1 (guessed per-step cost — TODO confirm), compute the
# n-step return G via `multiplier`, update the model, then return the episode
# reward; the script then trains on MountainCar-v0 for N=300 episodes with a
# decaying epsilon (0.1 * 0.97**n). Left byte-identical because the enclosing
# definition is cut off at both ends and safe reconstruction is not possible
# from this view.
actions.pop(0) else: while len(rewards) > 0: guess_rewards = rewards + [-1] * (n - len(rewards)) G = multiplier.dot(guess_rewards) model.update(states[0], actions[0], G) rewards.pop(0) states.pop(0) actions.pop(0) return totalreward if __name__ == '__main__': env = gym.make('MountainCar-v0') ft = FeatureTransformer(env) model = Model(env, ft, "constant") gamma = 0.99 if 'monitor' in sys.argv: filename = os.path.basename(__file__).split('.')[0] monitor_dir = './' + filename + '_' + str(datetime.now()) env = wrappers.Monitor(env, monitor_dir) N = 300 totalrewards = np.empty(N) costs = np.empty(N) for n in range(N): eps = 0.1 * (0.97**n) totalreward = play_one(model, eps, gamma) totalrewards[n] = totalreward
# Notebook-residue scratch cells: interactively poking at the environment and
# the RBF FeatureTransformer. Assumes `env`, `ft`, `observation`, and `np`
# already exist from earlier cells.

# In[70]: inspect the current observation
observation

# In[71]:
# NOTE(review): this samples from observation_space, not action_space, yet the
# result is passed to env.step() below — confirm this is intentional.
actions = np.atleast_1d(env.observation_space.sample())

# In[72]: step the environment with the sampled vector
observation, reward, done, info = env.step(actions)

# In[73]:
observation

# In[74]: transform a single raw observation into RBF features
ft.transform(observations=observation)

# In[33]: rebuild the transformer with 100 RBF components
ft = FeatureTransformer(env, n_components=100)

# In[37]: transform a freshly reset state (2-D batch form)
ft.transform(np.atleast_2d(env.reset()))

# In[38]:
np.atleast_2d(env.reset())
# NOTE(review): this chunk is an extraction artifact — the tail of a
# play_one_td actor-critic episode loop (its `def` header and loop opening lie
# outside the visible chunk) fused onto a truncated TF1 training script, all
# collapsed onto one physical line. The visible logic appears to: step the env
# with a 1-element action list, form the TD target G = r + gamma * V(s'),
# compute the advantage G - V(s), partially fit both the policy and value
# models, and return (totalreward, iters); the script then wires pmodel/vmodel
# to a shared tf.InteractiveSession and trains for N=50 episodes (loop body is
# cut off). Left byte-identical because the enclosing definition is cut off at
# both ends and safe reconstruction is not possible from this view.
# an object where the actual action is stored in object[0] observation, reward, done, info = env.step([action]) totalreward += reward # Update models V_next = vmodel.predict(observation) G = reward + gamma * V_next advantage = G - vmodel.predict(prev_observation) pmodel.partial_fit(prev_observation, action, advantage) vmodel.partial_fit(prev_observation, G) iters += 1 return totalreward, iters # In[7]: ft = FeatureTransformer(env, n_components=100) D = ft.dimensions pmodel = PolicyModel(D, ft, []) vmodel = ValueModel(D, ft, []) init = tf.global_variables_initializer() session = tf.InteractiveSession() session.run(init) pmodel.set_session(session) vmodel.set_session(session) gamma = 0.95 N = 50 totalrewards = np.empty(N) costs = np.empty(N) for n in range(N): totalreward, num_steps = play_one_td(env, pmodel, vmodel, gamma)