def main(args):
    myLander = LunarLander()
    myLander.set_discount(args.discount)
    myrl, trainRewards, totalLoss = train_QL(myLander,
                                             numTrials=args.num_train_trials,
                                             numEpochs=args.num_epochs,
                                             memsize=args.memsize)
    print("Training completed. Switching to testing.")

    plt.plot(trainRewards)
    plt.ylabel('trainingReward')
    plt.xlabel('Trial No.')
    plt.savefig('plots/trainprogress_memSz' + str(args.memsize)
                + '_epochs' + str(args.num_epochs) + '.png')
    # plt.show()

    plt.clf()
    plt.plot(totalLoss)
    plt.savefig('plots/loss_v_time_memSz_' + str(args.memsize)
                + '_epochs' + str(args.num_epochs) + '.png')
    # plt.show()

    # Now test the trained model: disable exploration and simulate greedily.
    myrl.explorationProb = 0
    simulate(myLander, myrl, memD=None, numTrials=args.num_test_trials,
             do_training=False, verbose=False)
def main():
    myLander = LunarLander()
    myrl, trainRewards = train_QL(myLander, improvedSmallFeatureExtractor, numTrials=1000000)
    # myrl, trainRewards = train_QL(myLander, improvedFeatureExtractor, numTrials=500000)
    # myrl, trainRewards = train_QL(myLander, roundedFeatureExtractor, numTrials=500)
    print("Training completed. Switching to testing.")

    plt.plot(trainRewards)
    plt.ylabel('trainingReward')
    plt.xlabel('Trial No.')
    plt.savefig("output/trainprogress" + time.strftime("%m%d_%H%M"))
    plt.show()

    # Now test the trained model: disable exploration and simulate greedily.
    myrl.explorationProb = 0
    testRewards = simulate(myLander, myrl, numTrials=100, do_training=False)

    # Plot progress from testing.
    plt.clf()
    plt.plot(testRewards)
    plt.ylabel('testReward')
    plt.xlabel('Trial No.')
    plt.savefig("output/testprogress" + time.strftime("%m%d_%H%M"))
    plt.show()
def test_lander(weight_dict, featureExtractor):
    newLander = LunarLander()
    myrl = QLearningAlgorithm(newLander.actions, newLander.discount, featureExtractor)
    myrl.weights = weight_dict
    myrl.explorationProb = 0.0
    simulate(newLander, myrl, numTrials=100, do_training=False, do_render=True)
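# Hedged usage sketch for test_lander: assumes trained weights were pickled
# to disk earlier. The filename and the feature extractor choice below are
# illustrative assumptions, not taken from the original code.
#
#     import pickle
#
#     with open("weights.pkl", "rb") as f:
#         saved_weights = pickle.load(f)
#     test_lander(saved_weights, improvedSmallFeatureExtractor)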
def main():
    """
    Train and evaluate agent.

    This function does essentially the same as the checker that evaluates
    your agent. You can use it for debugging your agent and visualizing
    what it does.
    """
    from lunar_lander import LunarLander
    from gym.wrappers.monitoring.video_recorder import VideoRecorder

    env = LunarLander()
    agent = Agent(env)
    agent.train()

    rec = VideoRecorder(env, "policy.mp4")
    episode_length = 300
    n_eval = 100
    returns = []
    print("Evaluating agent...")

    for i in range(n_eval):
        print(f"Testing policy: episode {i+1}/{n_eval}")
        state = env.reset()
        cumulative_return = 0
        # The environment will set terminal to True if an episode is done.
        terminal = False
        for t in range(episode_length):
            # if i <= 10:
            #     rec.capture_frame()
            # Take an action in the environment.
            action = agent.get_action(torch.as_tensor(state, dtype=torch.float32))
            state, reward, terminal = env.transition(action)
            cumulative_return += reward
            if terminal:
                break
        returns.append(cumulative_return)
        print(f"Achieved {cumulative_return:.2f} return.")
        # if i == 10:
        #     rec.close()
        #     print("Saved video of 10 episodes to 'policy.mp4'.")

    env.close()
    print(f"Average return: {np.mean(returns):.2f}")
def __init__(self, env=LunarLander(), QNet=QNetwork, exploration_type=0,
             epsilon=0.9, discount=0.99, max_episodes=1000,
             max_episode_length=1000, batch_size=32,
             discount_decay_episodes=300, plot_point=25, num_policy_exe=10,
             continue_learning=False, execute_policy=0,
             # Portable replacement for the original hard-coded '\\Weights\\'
             # suffix; the trailing '' keeps the trailing path separator.
             filepath=os.path.join(os.path.abspath(os.path.dirname(sys.argv[0])), 'Weights', '')):
    self.env = env
    self.exploration_type = exploration_type
    self.epsilon = epsilon if exploration_type != 2 else 2
    self.phi = 100 if exploration_type == 2 else 1
    self.discount = discount
    self.max_episodes = max_episodes
    self.max_episode_length = max_episode_length
    self.batch_size = batch_size
    self.continue_learning = continue_learning
    self.execute_policy = execute_policy
    self.buffer = ReplayBuffer(100000)
    self.rew_plotter = GraphCollector()
    self.loss_plotter = GraphCollector()
    self.num_actions = self.env.action_space.n
    self.NUM_COMPONENT = self.env.num_reward_components
    self.discount_decay_episodes = discount_decay_episodes
    self.plot_point = plot_point
    self.main_nn = []
    self.target_nn = []
    self.optimizer = []
    self.mse = tf.keras.losses.MeanSquaredError()
    self.num_policy_exe = num_policy_exe
    self.filepath = filepath
    self.explainer = Explainer(self)
    # One main/target network pair (plus optimizer) per reward component.
    for c in range(self.NUM_COMPONENT):
        self.main_nn.append(QNet(64, self.num_actions))
        self.target_nn.append(QNet(64, self.num_actions))
        self.optimizer.append(tf.optimizers.Adam(0.01))  # 5e-4
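# The constructor above instantiates `QNet(64, self.num_actions)` per reward
# component, but the QNetwork class itself is not shown in this excerpt.
# A minimal sketch of a compatible network, assuming a single ReLU hidden
# layer (the real architecture may differ):
class QNetwork(tf.keras.Model):
    def __init__(self, hidden_units, num_actions):
        super().__init__()
        self.hidden = tf.keras.layers.Dense(hidden_units, activation='relu')
        self.out = tf.keras.layers.Dense(num_actions)  # linear Q-value head

    def call(self, states):
        # states: batch of observations, shape (batch, state_dim).
        # Returns one Q-value per action, shape (batch, num_actions).
        return self.out(self.hidden(states))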
help="Collect the data in a pickle file.", ) args = parser.parse_args() samples = { "state": [], "state_img": [], "next_state": [], "next_state_img": [], "reward": [], "action": [], "terminal": [], } env = LunarLander() env.render() env.viewer.window.on_key_press = key_press env.viewer.window.on_key_release = key_release a = np.array([0]) episode_rewards = [] steps = 0 while True: episode_reward = 0 state = env.reset() state_img = env.render( mode="rgb_array")[::4, ::4, :] # downsampling (every 4th pixel). while True:
        if done or step > max_timesteps:
            break

    return episode_reward


if __name__ == "__main__":
    # Important: evaluation will probably not work with rendering set to False.
    rendering = True

    conf = Config()
    agent = BCAgent(conf)

    model_name = 'agent_2020-03-07--19-42.pt'
    agent.load(f"models/{model_name}", to_cpu=True)

    env = LunarLander()
    episode_rewards = []
    for i in range(conf.n_test_episodes):
        episode_reward = run_episode(env, agent, conf, rendering=rendering)
        episode_rewards.append(episode_reward)

    # Save results in a dictionary and write them into a .json file.
    results = dict()
    results["episode_rewards"] = episode_rewards
    results["mean"] = np.array(episode_rewards).mean()
    results["std"] = np.array(episode_rewards).std()

    timestamp = model_name.split(sep='_')[1][0:-3]  # strip the '.pt' extension
    fname = f"results/results_bc_agent-{timestamp}.json"
    fh = open(fname, "w")
EPS_END = opt.min_epsilon
EPS_DECAY = opt.epsilon_decay
EPS_OFFSET = opt.initial_memory_size
TARGET_SYNC = opt.sync_freq
LOG_FREQ = opt.log_freq
RENDER = opt.render
MAX_FRAMES = opt.frames
LR = opt.lr
INITIAL_MEMORY = opt.initial_memory_size
MEMORY_SIZE = opt.memory_size
PLAY_STEPS = opt.play_steps
HUMAN = opt.human

# Create environment.
if 'lunar' in opt.env:
    env = LunarLander()
    RAM = True
else:
    env = gym.make(env_id)
    if not opt.evaluate:
        env = ptan.common.wrappers.wrap_dqn(env)
    else:
        env = ptan.common.wrappers.wrap_dqn(env, episodic_life=False, reward_clipping=False)
    RAM = False

N_ACTIONS = env.action_space.n

# Human control mode and saliency rendering.
if HUMAN:
    RENDER = True
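# The epsilon schedule itself is not shown in this excerpt. EPS_OFFSET equal
# to the initial memory size suggests decay begins only after the replay
# buffer is pre-filled. A plausible sketch under that assumption (the linear
# form and eps_start are assumptions, not taken from the original):
def epsilon_at(frame, eps_start=1.0):
    effective = max(0, frame - EPS_OFFSET)  # no decay during memory pre-fill
    return max(EPS_END, eps_start - effective / EPS_DECAY)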
def plot_io_bounds(x, y, vx, vy, theta, omega, a, steps, discrete=True):
    import matplotlib.pyplot as plt

    statebox = [x, y, vx, vy, theta, omega]
    centerstate = [box[0] + .5 * (box[1] - box[0]) for box in statebox]
    envstate = [i for i in centerstate]

    # Zero-order hold on actions if needed.
    if discrete and isinstance(a, int):
        a = a * np.ones(steps, dtype=np.int32)
    elif not discrete:
        a = [np.array(a) for i in range(steps)]

    # System-identified model trajectory.
    centerstatehist = [centerstate]
    for i in range(steps):
        centerstate = lander_dynamics(*centerstate, a=a[i], discrete=discrete)
        centerstatehist.append(centerstate)

    # Actual OpenAI Gym model trajectory.
    envstatehist = [envstate]
    if discrete:
        from lunar_lander import LunarLander
        env = LunarLander()
    else:
        from lunar_lander import LunarLanderContinuous
        env = LunarLanderContinuous()
    s = env.reset(envstate)
    for i in range(steps):
        s, _, _, _ = env.step(a[i])
        envstatehist.append(s[0:6])

    # Overapproximated trajectory.
    stateboxhist = [statebox]
    for i in range(steps):
        statebox = lander_box_dynamics(*statebox, a=a[i], steps=1, discrete=discrete)
        stateboxhist.append(statebox)

    centerstatehist = np.array(centerstatehist)
    envstatehist = np.array(envstatehist)
    stateboxhist = np.array(stateboxhist)

    t = np.linspace(0, steps, steps + 1)
    fig, axs = plt.subplots(6, 1, figsize=(4, 9))
    # fig.set_size_inches(5, 7, forward=True)

    limits = [[-1, 1], [0, 1], [-1, 1], [-1, 1], [-np.pi / 3, np.pi / 3], [-.5, .5]]
    for i in range(6):
        axs[i].fill_between(t, stateboxhist[:, i, 0], stateboxhist[:, i, 1], alpha=0.3)
        axs[i].plot(centerstatehist[:, i], 'r')
        axs[i].plot(envstatehist[:, i], 'b.')
        axs[i].set_ylim(bottom=limits[i][0], top=limits[i][1])
        axs[i].set_yticks(np.linspace(limits[i][0], limits[i][1], 17), minor=True)
        axs[i].grid(which='minor', alpha=.4)

    axs[0].set_title('Action {0}'.format(a))
    plt.show()
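# Hedged usage sketch for plot_io_bounds: each state argument is an
# (lower, upper) interval. The bounds below are illustrative values, not
# taken from the original experiments.
#
#     plot_io_bounds(x=(-0.1, 0.1), y=(0.4, 0.6),
#                    vx=(-0.05, 0.05), vy=(-0.1, 0.0),
#                    theta=(-0.1, 0.1), omega=(-0.05, 0.05),
#                    a=0, steps=20, discrete=True)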
from cartpole import TimedCartPoleEnv
from lunar_lander import LunarLander
from simple_lander import SimpleLander
from model import QVModel
from kerlas import ReplayMemory
from kerlas.policies import BoltzmannQPolicy, GreedyEpsPolicy
import numpy as np, random
from time_limit import TimeLimit

np.set_printoptions(precision=4, suppress=True)

Gamma = 0.99

# game_env = SimpleEnv(200)

# Lunar lander
env = LunarLander()
game_env = TimeLimit(env, time_limit=300, timeout_reward=-1.0)

# Simple lander (note: this reassigns game_env, overriding the lunar lander above)
game_env = SimpleLander()

state_dim = game_env.observation_space.shape[-1]
nactions = game_env.action_space.n
print("state_dim:", state_dim)

qvmodel = QVModel(state_dim, nactions, Gamma)
memory = ReplayMemory(10000)

NGames = 100000
NextTrain = TrainInterval = 5  # train after every 5 games
from lunar_lander import LunarLander, FPS
import random, time, getopt
import numpy as np

np.set_printoptions(precision=3, suppress=True)

if __name__ == "__main__":
    env = LunarLander()
    dt = 1.0 / FPS

    obs = env.reset()
    env.render()

    done = False
    t = 0
    t0 = time.time()
    while not done and t < 500:
        a = random.randint(0, 3)
        s1, r, done, info = env.step(a)
        print(s1, r, info)
        # time.sleep(dt * 10)
        env.render()
        t += 1

    print("rate:", t / (time.time() - t0))
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)
        self.epsilon = 0.3
        # self.model = load_model(name)
        self.model.summary()

    def save(self, name):
        self.model.save_weights(name)


if __name__ == "__main__":
    # env = gym.make('LunarLander-v3')
    env = LunarLander()
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    done = False
    batch_size = 1

    if 0:
        agent.load("model.dat")

    for e in range(EPISODES):
        # agent.load("../../Downloads/model_900.h5")
        # agent.epsilon = 0.0
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        tot_rew = 0
        for time in range(300):
from lunar_lander import demo_heuristic_lander, LunarLander

total_reward_array = []
myLunarLander = LunarLander()
dorender = True
num_iters = 100
isdumb = True

for i in range(num_iters):
    end_reward = demo_heuristic_lander(myLunarLander, render=dorender, dumb=isdumb)
    total_reward_array.append(end_reward)
    myLunarLander.reset()
    print("Iteration: " + str(i))

average_reward = sum(total_reward_array) / len(total_reward_array)
print("Average reward over " + str(num_iters) + " trials:")
print(average_reward)