def main():
    """Train a DQN with a periodically-synced target network on CartPole-v0."""
    env = gym.make('CartPole-v0')
    gamma = 0.99
    copy_period = 50

    # Network dimensions: flat state size, action count, two hidden layers.
    state_dim = len(env.observation_space.sample())
    n_actions = env.action_space.n
    hidden_layers = [200, 200]
    model = DQN(state_dim, n_actions, hidden_layers, gamma)
    tmodel = DQN(state_dim, n_actions, hidden_layers, gamma)

    # Optionally record video/stats when 'monitor' is given on the CLI.
    if 'monitor' in sys.argv:
        base = os.path.basename(__file__).split('.')[0]
        env = wrappers.Monitor(env, './' + base + '_' + str(datetime.now()))

    n_episodes = 500
    totalrewards = np.empty(n_episodes)
    costs = np.empty(n_episodes)
    for n in range(n_episodes):
        eps = 1.0 / np.sqrt(n + 1)  # decaying exploration rate
        episode_reward = play_one(env, model, tmodel, eps, gamma, copy_period)
        totalrewards[n] = episode_reward
        if n % 100 == 0:
            print("episode:", n, "total reward:", episode_reward, "eps:", eps,
                  "avg reward (last 100):",
                  totalrewards[max(0, n - 100):(n + 1)].mean())

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", totalrewards.sum())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()
    plot_running_avg(totalrewards)
def main():
    """TD actor-critic training on MountainCarContinuous-v0."""
    env = gym.make('MountainCarContinuous-v0')
    ft = FeatureTransformer(env, n_components=100)
    state_dim = ft.dimensions
    pmodel = PolicyModel(state_dim, ft)
    vmodel = ValueModel(state_dim, ft)
    gamma = 0.99

    # Wrap the env in a recording monitor when requested on the CLI.
    if 'monitor' in sys.argv:
        base = os.path.basename(__file__).split('.')[0]
        env = wrappers.Monitor(env, './' + base + '_' + str(datetime.now()))

    n_episodes = 50
    totalrewards = np.empty(n_episodes)
    costs = np.empty(n_episodes)
    for n in range(n_episodes):
        episode_reward = play_one_td(env, pmodel, vmodel, gamma)
        totalrewards[n] = episode_reward
        if n % 1 == 0:  # log every episode
            print(
                "episode:", n,
                "total reward: %.1f" % episode_reward,
                "avg reward (last 100): %.1f" % totalrewards[max(0, n - 100):(n + 1)].mean())

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()
    plot_running_avg(totalrewards)
    plot_cost_to_go(env, vmodel)
def main():
    """Monte-Carlo policy-gradient training on CartPole-v0 (one episode)."""
    env = gym.make('CartPole-v0')
    obs_dim = env.observation_space.shape[0]
    n_actions = env.action_space.n
    pmodel = PolicyModel(obs_dim, n_actions, [])
    vmodel = ValueModel(obs_dim, [10])
    gamma = 0.99

    if 'monitor' in sys.argv:
        base = os.path.basename(__file__).split('.')[0]
        env = wrappers.Monitor(env, './' + base + '_' + str(datetime.now()))

    n_episodes = 1  # single-episode run
    totalrewards = np.empty(n_episodes)
    costs = np.empty(n_episodes)
    for n in range(n_episodes):
        episode_reward = play_one_mc(env, pmodel, vmodel, gamma)
        totalrewards[n] = episode_reward
        if n % 100 == 0:
            print("episode:", n, "total reward:", episode_reward,
                  "avg reward (last 100):",
                  totalrewards[max(0, n - 100):(n + 1)].mean())

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", totalrewards.sum())
    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()
    plot_running_avg(totalrewards)
def main():
    """Train policy and value networks with MC returns on CartPole-v0."""
    env = gym.make('CartPole-v0')
    obs_dim = env.observation_space.shape[0]
    n_actions = env.action_space.n
    p_model = PolicyModel(obs_dim, n_actions, [10])
    v_model = ValueModel(obs_dim, [5, 5])

    # One shared TF session for both networks.
    init = tf.global_variables_initializer()
    session = tf.InteractiveSession()
    session.run(init)
    p_model.set_session(session)
    v_model.set_session(session)

    gamma = 0.99
    n_episodes = 3000
    total_rewards = np.empty(n_episodes)
    for n in range(n_episodes):
        reward = play_one_mc(env, p_model, v_model, gamma)
        total_rewards[n] = reward
        if n % 100 == 0:
            print('episode:', n, 'total reward:', reward,
                  'avg reward (last 100):',
                  total_rewards[max(0, n - 100):n + 1].mean())

    print('avg reward for last 100 episodes:', total_rewards[-100:].mean())
    print('total steps:', total_rewards.sum())
    plt.plot(total_rewards)
    plt.title("Rewards")
    plt.show()
    plot_running_avg(total_rewards)
def main():
    """DQN with a target network on the unwrapped CartPole-v0 env.

    Fixes: the summary used ``totalrewards[-100]`` (a single element) where
    the last-100 slice was intended, and negated the step total even though
    CartPole rewards are +1 per step (so the sum is already the step count).
    """
    env = gym.make('CartPole-v0').env
    gamma = 0.99
    copy_period = 50  # steps between target-network syncs
    D = len(env.observation_space.sample())
    K = env.action_space.n
    sizes = [200, 200]
    model = DQN(D, K, sizes, gamma)
    tmodel = DQN(D, K, sizes, gamma)

    # Shared TF session for both networks.
    init = tf.global_variables_initializer()
    session = tf.InteractiveSession()
    session.run(init)
    model.set_session(session)
    tmodel.set_session(session)

    N = 500
    totalrewards = np.empty(N)
    for n in range(N):
        eps = 1.0 / np.sqrt(n + 1)  # decaying exploration rate
        totalreward = play_one(env, model, tmodel, eps, gamma, copy_period)
        totalrewards[n] = totalreward
        print(f'episode {n}, total reward {totalreward}')

    # BUG FIX: slice the last 100 episodes rather than index one element.
    print(f'avg reward last hundo = {totalrewards[-100:].mean()}')
    # BUG FIX: CartPole rewards are positive; do not negate the sum.
    print(f'tot step = {totalrewards.sum()}')
    plt.plot(totalrewards)
    plt.title('rewards')
    plt.show()
    plot_running_avg(totalrewards)
def main():
    """RBF-feature Q-learning on CartPole-v0."""
    env = gym.make('CartPole-v0')
    transformer = FeatureTransformer(env)
    model = Model(env, transformer)
    gamma = 0.99

    if 'monitor' in sys.argv:
        base = os.path.basename(__file__).split('.')[0]
        env = wrappers.Monitor(env, './' + base + '_' + str(datetime.now()))

    n_episodes = 500
    totalrewards = np.empty(n_episodes)
    costs = np.empty(n_episodes)
    for n in range(n_episodes):
        eps = 1.0 / np.sqrt(n + 1)  # exploration schedule
        reward = play_one(env, model, eps, gamma)
        totalrewards[n] = reward
        if n % 100 == 0:
            print("episode:", n, "total reward:", reward, "eps:", eps,
                  "avg reward (last 100):",
                  totalrewards[max(0, n - 100):(n + 1)].mean())

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", totalrewards.sum())
    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()
    plot_running_avg(totalrewards)
def main():
    """Q-learning with transformed features on CartPole-v0."""
    env = gym.make('CartPole-v0')
    features = FeatureTransformer(env)
    model = Model(env, features)
    gamma = 0.99

    # Optional episode recording.
    if 'monitor' in sys.argv:
        script_name = os.path.basename(__file__).split('.')[0]
        env = wrappers.Monitor(env, './' + script_name + '_' + str(datetime.now()))

    episode_count = 500
    totalrewards = np.empty(episode_count)
    costs = np.empty(episode_count)
    for n in range(episode_count):
        eps = 1.0 / np.sqrt(n + 1)
        ep_reward = play_one(env, model, eps, gamma)
        totalrewards[n] = ep_reward
        if n % 100 == 0:
            print("episode:", n, "total reward:", ep_reward, "eps:", eps,
                  "avg reward (last 100):",
                  totalrewards[max(0, n - 100):(n + 1)].mean())

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", totalrewards.sum())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()
    plot_running_avg(totalrewards)
def main():
    """MC policy-gradient on CartPole-v0 with explicit TF session setup."""
    env = gym.make('CartPole-v0')
    obs_dim = env.observation_space.shape[0]
    n_actions = env.action_space.n
    pmodel = PolicyModel(obs_dim, n_actions, [])
    vmodel = ValueModel(obs_dim, [10])

    # Build the initializer, open a session, and wire it into both models.
    init = tf.global_variables_initializer()
    session = tf.InteractiveSession()
    session.run(init)
    pmodel.set_session(session)
    vmodel.set_session(session)

    gamma = 0.99
    if 'monitor' in sys.argv:
        base = os.path.basename(__file__).split('.')[0]
        env = wrappers.Monitor(env, './' + base + '_' + str(datetime.now()))

    n_episodes = 500
    totalrewards = np.empty(n_episodes)
    costs = np.empty(n_episodes)
    for n in range(n_episodes):
        reward = play_one_mc(env, pmodel, vmodel, gamma)
        totalrewards[n] = reward
        if n % 100 == 0:
            print("episode:", n, "total reward:", reward,
                  "avg reward (last 100):",
                  totalrewards[max(0, n - 100):(n + 1)].mean())

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", totalrewards.sum())
    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()
    plot_running_avg(totalrewards)
def main(show_plots=True):
    """Q-learning with RBF features on MountainCar-v0.

    Fix: the final average used ``total_rewards[-100]`` (one element)
    instead of slicing the last 100 episodes.

    :param show_plots: when True, show reward curves and the cost-to-go plot.
    """
    env = gym.make('MountainCar-v0')
    ft = FeatureTransformer(env)
    model = Model(env, ft, 'constant')
    gamma = 0.99
    N = 300
    total_rewards = np.empty(N)
    for n in range(N):
        eps = 0.1 * (0.97 ** n)  # exponentially decaying exploration
        total_reward = play_one(model, eps, gamma)
        total_rewards[n] = total_reward
        if n % 10 == 0:
            print('episode:', n, 'total reward:', total_reward)
    # BUG FIX: slice the last 100 episodes, not a single element.
    print('avg reward for last 100 episodes:', total_rewards[-100:].mean())
    # MountainCar reward is -1 per step, so -sum equals the step count.
    print('total steps:', -total_rewards.sum())
    if show_plots:
        plt.plot(total_rewards)
        plt.title('Rewards')
        plt.show()
        plot_running_avg(total_rewards)
        # Plot the optimal state-value function
        plot_cost_to_go(env, model)
def main():
    """TD actor-critic on the unwrapped MountainCarContinuous-v0 env."""
    env = gym.make('MountainCarContinuous-v0').env
    ft = FeatureTransformer(env, n_components=100)
    feat_dim = ft.dimensions
    pmodel = PolicyModel(ft, feat_dim, [])
    vmodel = ValueModel(ft, feat_dim, [])

    # Initialize TF variables and share the session across models.
    init = tf.global_variables_initializer()
    session = tf.InteractiveSession()
    session.run(init)
    pmodel.set_session(session)
    vmodel.set_session(session)

    gamma = 0.95
    episode_total = 50
    totalrewards = np.empty(episode_total)
    for episode in range(episode_total):
        reward, _ = play_one_td(env, pmodel, vmodel, gamma)
        totalrewards[episode] = reward
        print(f"episode {episode}, total rewards {reward}")

    plt.plot(totalrewards)
    plt.title('rewards')
    plt.show()
    plot_running_avg(totalrewards)
    plot_cost_to_go(env, vmodel)
def main():
    """Deep Q-learning with a target network on CartPole-v0."""
    env = gym.make('CartPole-v0')
    gamma = 0.99
    copy_period = 50  # sync target net every 50 steps
    obs_dim = len(env.observation_space.sample())
    n_actions = env.action_space.n
    layer_sizes = [200, 200]
    model = DQN(obs_dim, n_actions, layer_sizes, gamma)
    tmodel = DQN(obs_dim, n_actions, layer_sizes, gamma)

    if 'monitor' in sys.argv:
        stem = os.path.basename(__file__).split('.')[0]
        env = wrappers.Monitor(env, './' + stem + '_' + str(datetime.now()))

    episodes = 500
    totalrewards = np.empty(episodes)
    costs = np.empty(episodes)
    for n in range(episodes):
        eps = 1.0 / np.sqrt(n + 1)
        r = play_one(env, model, tmodel, eps, gamma, copy_period)
        totalrewards[n] = r
        if n % 100 == 0:
            print("episode:", n, "total reward:", r, "eps:", eps,
                  "avg reward (last 100):",
                  totalrewards[max(0, n - 100):(n + 1)].mean())

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", totalrewards.sum())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()
    plot_running_avg(totalrewards)
def laplace_env_main():
    """Policy-gradient training on a Laplace-solver environment.

    Fixes applied:
    - ``if n == N`` was never true inside ``range(N)``; the last episode is
      ``n == N - 1``.
    - a second ``tf.InteractiveSession()`` shadowed the session the solver
      was built with; variables are now initialized in the original session.
    - ``iters`` is a Python list, so ``iters.sum()`` raised AttributeError;
      use ``sum(iters)``.
    - ``plot_running_avg(totalrewards)`` referenced an undefined name; the
      recorded iteration counts are plotted instead.
    """
    session = tf.InteractiveSession()
    # train by a 200*200 grid
    laplace_env = create_LaplaceSolver(session, n=200)
    D = laplace_env.trained_NN.D
    K = 10
    pmodel = PolicyModel(D, K, [10, 5])
    vmodel = ValueModel(D, [20, 15])
    # BUG FIX: initialize in the SAME session the solver uses (previously a
    # fresh InteractiveSession was opened here, shadowing the first one).
    session.run(tf.global_variables_initializer())
    pmodel.set_session(session)
    vmodel.set_session(session)

    gamma = 0.99
    N = 60
    iters = []           # iteration counts of converged episodes
    output_strings = []  # per-episode log lines
    for n in range(N):
        # BUG FIX: only the final episode keeps the full error history.
        if n == N - 1:
            ite, all_error = play_one_mc(laplace_env, pmodel, vmodel, gamma)
        else:
            ite, _ = play_one_mc(laplace_env, pmodel, vmodel, gamma)
        iters_np = np.array(iters[max(0, n - 100):(n + 1)])
        output_string = "episode: %d, iter: %d, average iters: %f \n" % (
            n, ite, iters_np.mean())
        output_strings.append(output_string)
        if ite < 100000000:  # discard diverged/aborted runs
            iters.append(ite)
        if len(iters) % 5 == 0:
            print(output_string)
            with open('pg3.text', "a") as text_file:
                text_file.write(output_string)

    iters_np = np.array(iters[-100:])
    print("avg reward for last 100 episodes:", iters_np.mean())
    # BUG FIX: Python lists have no .sum(); use the builtin.
    print("total steps:", sum(iters))

    # After the training process, try the 500*500 grid.
    laplace_env_500 = create_LaplaceSolver(session, n=500)
    ite, errors = last_play(laplace_env_500, pmodel, vmodel)
    plt.plot(iters)
    plt.title("iters_while_training")
    plt.show()
    # BUG FIX: `totalrewards` was undefined here; plot the iteration counts.
    plot_running_avg(np.array(iters))
    plt.plot(errors)
    plt.title("grid 500 * 500 error")
    plt.show()
def main():
    """DQN on Breakout-v0 with convolutional layers and a target network.

    Cleanup: removed an unused ``D`` computation (the conv net consumes raw
    frames, not a flat observation vector), the unused ``costs`` array, and
    dead commented-out constructor calls.
    """
    env = gym.make('Breakout-v0')
    gamma = 0.99
    copy_period = 10000  # steps between target-network syncs
    K = env.action_space.n
    conv_sizes = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]  # (filters, kernel, stride)
    hidden_sizes = [512]
    model = DQN(K, conv_sizes, hidden_sizes, gamma)
    tmodel = DQN(K, conv_sizes, hidden_sizes, gamma)

    init = tf.global_variables_initializer()
    session = tf.InteractiveSession()
    session.run(init)
    model.set_session(session)
    tmodel.set_session(session)

    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 100000
    totalrewards = np.empty(N)
    n_max = 500000  # last step to decrease epsilon
    eps_step = 0.9 / n_max
    eps = 1.0
    for n in range(N):
        t0 = datetime.now()
        totalreward, eps, num_steps = play_one(env, model, tmodel, eps,
                                               eps_step, gamma, copy_period)
        totalrewards[n] = totalreward
        if n % 1 == 0:  # log every episode
            print("episode:", n, "total reward:", totalreward,
                  "eps:", "%.3f" % eps, "num steps:", num_steps,
                  "episode duration:", (datetime.now() - t0),
                  "avg reward (last 100):",
                  "%.3f" % totalrewards[max(0, n - 100):(n + 1)].mean())

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", totalrewards.sum())
    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()
    plot_running_avg(totalrewards)
def main():
    """DQN training for the custom redtiebot-v0 environment.

    Fix: removed a leftover ``import pdb; pdb.set_trace()`` breakpoint that
    halted every run, along with dead commented-out session-setup code.
    """
    env = gym.make('redtiebot-v0')
    gamma = 0.99
    copy_period = 50
    s_time = time.time()
    D = 5   # state dimension (fixed for this env)
    K = 9   # number of discrete actions
    sizes = [10, 15, 20, 15, 10]
    model = DQN(D, K, sizes, gamma, env)
    tmodel = DQN(D, K, sizes, gamma, env)
    tmodel.load()
    # NOTE(review): the initializer op is built but never run here — the
    # models appear to manage their own session internally; confirm.
    init = tf.compat.v1.global_variables_initializer()

    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 500
    model.set_max_guided_run(int(.25 * N))  # guided exploration for first 25%
    totalrewards = np.empty(N)
    for n in range(N):
        eps = 1.0 / np.sqrt(n + 1)
        totalreward = play_one(env, model, tmodel, eps, gamma, copy_period)
        totalrewards[n] = totalreward
        if n % 100 == 0:
            print("episode:", n, "total reward:", totalreward, "eps:", eps,
                  "avg reward (last 100):",
                  totalrewards[max(0, n - 100):(n + 1)].mean())
            print("time: " + str(time.time() - s_time))
            s_time = time.time()

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", totalrewards.sum())
    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()
    plot_running_avg(totalrewards)
def main():
    """RBF Q-learning on CartPole-v0, followed by a greedy demo rollout.

    Fix: the demo loop's ``iters`` counter was never incremented, so the
    5000-step safety cap could never trigger — a non-terminating policy
    would loop forever.
    """
    env = gym.make('CartPole-v0')
    ft = FeatureTransformer(env)
    model = Model(env, ft)
    gamma = 0.99

    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 500
    totalrewards = np.empty(N)
    for n in range(N):
        eps = 1.0 / np.sqrt(n + 1)
        totalreward = play_one(env, model, eps, gamma)
        totalrewards[n] = totalreward
        if n % 100 == 0:
            print("episode:", n, "total reward:", totalreward, "eps:", eps,
                  "avg reward (last 100):",
                  totalrewards[max(0, n - 100):(n + 1)].mean())

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", totalrewards.sum())
    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()
    plot_running_avg(totalrewards)

    # Record one greedy (argmax, no exploration) episode with the trained model.
    env = wrappers.Monitor(env, 'cart_pole')
    observation = env.reset()
    done = False
    iters = 0
    while not done and iters < 5000:
        action = np.argmax(model.predict(observation))
        observation, reward, done, info = env.step(action)
        iters += 1  # BUG FIX: counter was never incremented
def main():
    """RBF Q-learning on the unwrapped CartPole-v0 env.

    Fixes: the summary indexed a single element (``[-100]``) instead of
    slicing the last 100 episodes, and negated the step total even though
    CartPole rewards are +1 per step.
    """
    env = gym.make('CartPole-v0').env
    ft = FeatureTransformer(env)
    model = Model(env, ft)
    gamma = 0.99
    N = 500
    totalrewards = np.empty(N)
    for n in range(N):
        eps = 1 / np.sqrt(n + 1)  # decaying exploration rate
        totalreward = play_one(env, model, eps, gamma)
        totalrewards[n] = totalreward
        if n % 100 == 0:
            print(f'episode {n}, total reward {totalreward}')
    # BUG FIX: slice the last 100 episodes rather than index one element.
    print(f'avg reward last hundo = {totalrewards[-100:].mean()}')
    # BUG FIX: CartPole rewards are positive; do not negate the sum.
    print(f'tot step = {totalrewards.sum()}')
    plt.plot(totalrewards)
    plt.title('rewards')
    plt.show()
    plot_running_avg(totalrewards)
def main():
    """Random search over policy parameters on MountainCarContinuous-v0."""
    env = gym.make('MountainCarContinuous-v0').env
    ft = FeatureTransformer(env, n_components=100)
    feat_dim = ft.dimensions
    pmodel = PolicyModel(ft, feat_dim, [], [])

    # Attach a TF session and initialize the policy's variables.
    session = tf.InteractiveSession()
    pmodel.set_session(session)
    pmodel.init_vars()

    gamma = 0.99
    totalreward, pmodel = random_search(env, pmodel, gamma)
    print(f'max rewards {max(totalreward)}')

    # Evaluate the best policy found over 100 fresh episodes.
    avg_totalrewards = play_multiple_episodes(env, 100, pmodel, gamma)
    print(f'avg reward = {avg_totalrewards}')

    plt.plot(totalreward)
    plt.title('rewards')
    plt.show()
    plot_running_avg(totalreward)
def main():
    """Q-learning with a feature transformer on CartPole-v0.

    Fix: the final average indexed a single element (``[-100]``) instead of
    slicing the last 100 episodes. Also dropped a dead ``#main()`` line.
    """
    env = gym.make('CartPole-v0')
    ft = FeatureTransformer()
    model = Model(env, ft)
    N = 500
    total_rewards = np.empty(N)
    for n in range(N):
        eps = 1 / np.sqrt(n + 1)  # decaying exploration rate
        total_reward = play_one(model, eps)
        total_rewards[n] = total_reward
        if (n + 1) % 100 == 0:
            print('episode:', (n + 1), 'total reward:', total_reward,
                  'eps:', eps, 'avg_reward (last 100):',
                  total_rewards[max(0, n - 100):n + 1].mean())
    # BUG FIX: slice the last 100 episodes, not a single element.
    print('avg reward for the last 100 episodes:', total_rewards[-100:].mean())
    plt.plot(total_rewards)
    plt.title('Total Rewards')
    plt.show()
    plot_running_avg(total_rewards)
# NOTE(review): fragment — the enclosing setup (env/model/gamma definitions)
# is outside this view; presumably the tail of a TD(lambda) training main.
lambda_ = 0.7  # eligibility-trace decay parameter for TD(lambda)
if 'monitor' in sys.argv:
    filename = os.path.basename(__file__).split('.')[0]
    monitor_dir = './' + filename + '_' + str(datetime.now())
    env = wrappers.Monitor(env, monitor_dir)
N = 500
totalrewards = np.empty(N)
# costs = np.empty(N)
for n in range(N):
    # Alternative exploration schedules kept for reference:
    # eps = 1.0/(0.1*n+1)
    # eps = 0.1*(0.97**n)
    eps = 1.0/np.sqrt(n+1)  # decaying exploration rate
    # eps = 0.1
    states_actions_rewards, totalreward = play_one(model, env, eps, gamma, lambda_)
    totalrewards[n] = totalreward
    if n % 100 == 0:
        print("episode:", n, "total reward:", totalreward, "eps:", eps, "avg reward (last 100):", totalrewards[max(0, n-100):(n+1)].mean())
print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
print("total steps:", totalrewards.sum())
plt.plot(totalrewards)
plt.title("Rewards")
plt.show()
plot_running_avg(totalrewards)
# BUG FIX: resolved a leftover git merge conflict (HEAD had N = 600,
# upstream/master had N = 500); keep upstream's 500, matching the episode
# count used by the other training scripts in this file.
N = 500
totalrewards = np.empty(N)
costs = np.empty(N)
for n in range(N):
    eps = 1.0/np.sqrt(n+1)  # decaying exploration rate
    totalreward = play_one(env, model, tmodel, eps, gamma, copy_period)
    totalrewards[n] = totalreward
    if n % 100 == 0:
        print("episode:", n, "total reward:", totalreward, "eps:", eps, "avg reward (last 100):", totalrewards[max(0, n-100):(n+1)].mean())
print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
print("total steps:", totalrewards.sum())
plt.plot(totalrewards)
plt.title("Rewards")
plt.show()
plot_running_avg(totalrewards)

if __name__ == '__main__':
    main()