def main():
    running_reward = 10
    episode_durations = []
    for i_episode in count(1):
        state, ep_reward = env.reset(), 0
        for t in range(1, 10000):  # Don't infinite loop while learning
            action = select_action(state)
            state, reward, done, _ = env.step(action)
            if args.render:
                env.render()
            policy.rewards.append(reward)
            ep_reward += reward
            if done:
                episode_durations.append(t + 1)
                plot_durations(episode_durations)
                break

        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        finish_episode()
        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                i_episode, ep_reward, running_reward))
        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(
                      running_reward, t))
            break
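# For reference, `finish_episode` above performs the REINFORCE update. A
# minimal sketch of such an update, in the style of the PyTorch examples
# (illustrative -- this script's own implementation may differ; assumes
# `select_action` appends log-probabilities to `policy.saved_log_probs`
# and that `args.gamma` is the discount factor):
def finish_episode():
    R = 0
    returns = []
    # Walk the episode backwards to accumulate discounted returns
    for r in policy.rewards[::-1]:
        R = r + args.gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns)
    # Normalize returns to stabilize the gradient (1e-8 avoids div-by-zero)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    policy_loss = [-log_prob * R
                   for log_prob, R in zip(policy.saved_log_probs, returns)]
    optimizer.zero_grad()
    torch.stack(policy_loss).sum().backward()
    optimizer.step()
    del policy.rewards[:]
    del policy.saved_log_probs[:]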
from model_DDDQN import load_checkpoint
from utils import plot_durations
import matplotlib.pyplot as plt
import torch

# IMPORTANT: Set value for i_episode to indicate which checkpoint you want
# to use for evaluation.
i_episode = 400
ckpt_dir = "DDDQN_CartPoleV1_obs_checkpoints/"
input_size = 4
output_size = 2

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Read checkpoint
policy_net, _, _, _, _, episode_rewards, episode_loss = \
    load_checkpoint(ckpt_dir, i_episode, input_size, output_size, device=device)

# Plot figure
plot_durations(episode_rewards, episode_loss)
optimize_model(policy_net, batch_log_prob, batch_rewards, optimizer,
               GAMMA, device=device)

# Clear trajectories batch
batch_log_prob = []
batch_rewards = []

# Reset flags
if not render_each_episode:
    finished_rendering_this_epoch = False

# Record stats
training_info["epoch mean durations"].append(sum(epoch_durations) / batch_size)
training_info["epoch mean rewards"].append(sum(epoch_rewards) / batch_size)
if (i_epoch + 1) % num_avg_epoch == 0:
    training_info["past %d epochs mean reward" % num_avg_epoch] = \
        (sum(training_info["epoch mean rewards"][-num_avg_epoch:]) / num_avg_epoch) \
        if len(training_info["epoch mean rewards"]) >= num_avg_epoch else 0

# Plot stats
plot_durations(training_info["epoch mean rewards"])

# Update counter
i_epoch += 1

# Every save_ckpt_interval, save a checkpoint according to the current i_epoch.
if i_epoch % save_ckpt_interval == 0:
    save_checkpoint(ckpt_dir, policy_net, optimizer, i_epoch,
                    learning_rate=learning_rate, **training_info)
def trainDQN(file_name="DQN", env=GridworldEnv(1), batch_size=128, gamma=0.999, eps_start=0.9, eps_end=0.05, eps_decay=1000, is_plot=False, num_episodes=500, max_num_steps_per_episode=1000, learning_rate=0.0001, memory_replay_size=10000): """ DQN training routine. Retuns rewards and durations logs. Plot environment screen """ if is_plot: env.reset() plt.ion() plt.figure() plt.imshow(get_screen(env).cpu().squeeze(0).squeeze(0).numpy(), interpolation='none') plt.title("") plt.draw() plt.pause(0.00001) num_actions = env.action_space.n model = DQN(num_actions) optimizer = optim.Adam(model.parameters(), lr=learning_rate) use_cuda = torch.cuda.is_available() if use_cuda: model.cuda() memory = ReplayMemory(memory_replay_size) episode_durations = [] mean_durations = [] episode_rewards = [] mean_rewards = [] steps_done = 0 # total steps for i_episode in range(num_episodes): if i_episode % 20 == 0: clear_output() print("Cur episode:", i_episode, "steps done:", steps_done, "exploration factor:", eps_end + (eps_start - eps_end) * \ math.exp(-1. * steps_done / eps_decay)) # Initialize the environment and state env.reset() # last_screen = env.current_grid_map # (1, 1, 8, 8) current_screen = get_screen(env) state = current_screen # - last_screen for t in count(): # Select and perform an action action = select_action(state, model, num_actions, eps_start, eps_end, eps_decay, steps_done) steps_done += 1 _, reward, done, _ = env.step(action[0, 0]) reward = Tensor([reward]) # Observe new state last_screen = current_screen current_screen = get_screen(env) if not done: next_state = current_screen # - last_screen else: next_state = None # Store the transition in memory memory.push(state, action, next_state, reward) # Move to the next state state = next_state # plot_state(state) # env.render() # Perform one step of the optimization (on the target network) optimize_model(model, optimizer, memory, batch_size, gamma) if done or t + 1 >= max_num_steps_per_episode: episode_durations.append(t + 1) episode_rewards.append(env.episode_total_reward) if is_plot: plot_durations(episode_durations, mean_durations) plot_rewards(episode_rewards, mean_rewards) break print('Complete') env.render(close=True) env.close() if is_plot: plt.ioff() plt.show() ## Store Results np.save(file_name + '-dqn-rewards', episode_rewards) np.save(file_name + '-dqn-durations', episode_durations) return model, episode_rewards, episode_durations
def trainSQL0(file_name="SQL0", env=GridworldEnv(1), batch_size=128, gamma=0.999, beta=5, eps_start=0.9, eps_end=0.05, eps_decay=1000, is_plot=False, num_episodes=200, max_num_steps_per_episode=1000, learning_rate=0.0001, memory_replay_size=10000, n_step=10, target_update=10): """ Soft Q-learning training routine when observation vector is input Retuns rewards and durations logs. """ num_actions = env.action_space.n input_size = env.observation_space.shape[0] model = DQN(input_size, num_actions) target_model = DQN(input_size, num_actions) target_model.load_state_dict(model.state_dict()) optimizer = optim.Adam(model.parameters(), lr=learning_rate) # optimizer = optim.RMSprop(model.parameters(), ) use_cuda = torch.cuda.is_available() if use_cuda: model.cuda() memory = ReplayMemory(memory_replay_size, n_step, gamma) episode_durations = [] mean_durations = [] episode_rewards = [] mean_rewards = [] steps_done, t = 0, 0 for i_episode in range(num_episodes): if i_episode % 20 == 0: clear_output() if i_episode != 0: print("Cur episode:", i_episode, "steps done:", episode_durations[-1], "exploration factor:", eps_end + (eps_start - eps_end) * \ math.exp(-1. * steps_done / eps_decay), "reward:", env.episode_total_reward) # Initialize the environment and state state = torch.from_numpy(env.reset()).type(torch.FloatTensor).view( -1, input_size) for t in count(): # Select and perform an action action = select_action(state, model, num_actions, eps_start, eps_end, eps_decay, steps_done) next_state_tmp, reward, done, _ = env.step(action[0, 0]) reward = Tensor([reward]) # Observe new state next_state = torch.from_numpy(next_state_tmp).type( torch.FloatTensor).view(-1, input_size) if done: next_state = None # Store the transition in memory memory.push(model, target_model, state, action, next_state, reward) # Move to the next state state = next_state # plot_state(state) # env.render() # Perform one step of the optimization (on the target network) optimize_model(model, target_model, optimizer, memory, batch_size, gamma, beta) #### Difference w.r.t DQN if done or t + 1 >= max_num_steps_per_episode: episode_durations.append(t + 1) episode_rewards.append( env.episode_total_reward ) ##### Modify for OpenAI envs such as CartPole if is_plot: plot_durations(episode_durations, mean_durations) plot_rewards(episode_rewards, mean_rewards) steps_done += 1 break if i_episode % target_update == 0 and i_episode != 0: target_model.load_state_dict(model.state_dict()) print('Complete') env.render(close=True) env.close() if is_plot: plt.ioff() plt.show() ## Store Results np.save(file_name + '-sql0-rewards', episode_rewards) np.save(file_name + '-sql0-durations', episode_durations) return model, episode_rewards, episode_durations
training_info["episode reward"].append(running_reward) if running_reward > training_info["max reward achieved"]: training_info["max reward achieved"] = running_reward training_info["past 100 episodes mean reward"] = \ (sum(training_info["episode reward"][-100:]) / 100) if len(training_info["episode reward"])>=100 else 0 training_info["training loss"].append(running_minibatch_loss / (t + 1)) training_info["episode loss"].append(running_episode_loss / t) if (running_episode_loss / t) > training_info["max episode loss recorded"]: training_info[ "max episode loss recorded"] = running_episode_loss / t # Plot stats plot_durations(training_info["episode reward"], training_info["training loss"], training_info["episode loss"]) print("============= Episode: %d =============" % (i_episode + 1)) print("Episode reward: %d" % training_info["episode reward"][-1]) print("Episode duration: %d" % (t + 1)) print("Training loss: %f" % training_info["training loss"][-1]) print("Episode loss: %f \n" % training_info["episode loss"][-1]) print("Max reward achieved: %f" % training_info["max reward achieved"]) print("Max TD loss recorded: %f" % training_info["max TD loss recorded"]) print("Max episode loss recorded: %f" % training_info["max episode loss recorded"]) print("Past 100 episodes avg reward: %f \n\n" %
loss = optimize_model(memory, policy_net, target_net, optimizer,
                      GAMMA=GAMMA, device=device)
if loss is not None:
    running_loss += loss

if done:
    # Save and print episode stats (duration and episode loss)
    episode_durations.append(t + 1)
    mean_duration = (sum(episode_durations[-100:]) / 100) \
        if len(episode_durations) >= 100 else 0
    episode_loss.append(running_loss / (t + 1))
    plot_durations(episode_durations, episode_loss)
    print("Episode: %d Cumulative Rewards: %d Episode Loss: %f, "
          "past 100 episodes avg reward: %f"
          % (i_episode + 1, t + 1, running_loss / (t + 1), mean_duration))

    # Check if the problem is solved
    # CartPole standard: average reward for the past 100 episodes above 195
    if mean_duration > 195:
        print("\n\n\t Problem Solved !!!\n\n\n")
        break

i_episode += 1

# Update the target network, copying all weights and biases in DQN
if i_episode % target_update == 0:
    target_net.load_state_dict(policy_net.state_dict())
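# `optimize_model` above returns the step loss (or None before the replay
# buffer holds a full batch). A minimal sketch matching that call signature,
# assuming the usual Transition namedtuple with state/action/next_state/
# reward fields (illustrative, not necessarily this project's version):
import torch
import torch.nn.functional as F

def optimize_model(memory, policy_net, target_net, optimizer,
                   GAMMA=0.999, device="cpu", batch_size=128):
    if len(memory) < batch_size:
        return None  # matches the `if loss is not None` guard above
    transitions = memory.sample(batch_size)
    batch = Transition(*zip(*transitions))
    # Mask out terminal transitions (next_state is None)
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next = torch.cat([s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    # Q(s, a) for the actions actually taken
    q_values = policy_net(state_batch).gather(1, action_batch)
    # max_a' Q_target(s', a'); zero for terminal states
    next_values = torch.zeros(batch_size, device=device)
    with torch.no_grad():
        next_values[non_final_mask] = target_net(non_final_next).max(1)[0]
    expected = reward_batch + GAMMA * next_values
    loss = F.smooth_l1_loss(q_values, expected.unsqueeze(1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()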
last_screen = current_screen
current_screen = get_screen(env, device)
if not done:
    next_state = current_screen - last_screen
else:
    next_state = None

memory.push(state, action, next_state, reward)
state = next_state

# if done:
#     print("Episode Done")
# else:
#     print(state.size())

optimize_model(policy_net, optimizer)
if done:
    episode_durations.append(t + 1)
    plot_durations(episode_durations, AVERAGE_SIZE)
    break

if i_episode % TARGET_UPDATE == 0:
    target_net.load_state_dict(policy_net.state_dict())

print("Complete")
env.render()
env.close()
plt.ioff()
plt.show()
def train():
    global memory
    try:
        memory, ct, steps = pickle.load(open("cache.p", "rb"))
    except Exception:
        print("Starting from scratch........!!!!!!")
        memory = ReplayMemory(10000)
        steps = 0
        ct = 0
    game_evn.jump()
    try:
        while True:
            score = 0
            current_screen = game_evn.capture_screen() / 255
            current_screen_torch = torch.from_numpy(
                current_screen).unsqueeze(0).unsqueeze(0)
            state = current_screen_torch
            for t in count():
                # Epsilon-greedy action selection with exponential decay
                sample = random.random()
                threshold = eps_end + (eps_start - eps_end) * math.exp(
                    -1. * steps / eps_decay)
                steps += 1
                if sample > threshold:
                    with torch.no_grad():
                        action = policy_net(state.float()).max(1)[1].view(1, 1)
                else:
                    action = torch.tensor([[random.randrange(3)]],
                                          device=device, dtype=torch.long)

                current_screen, reward, is_gameover, score = \
                    game_state.get_state(action.item())
                reward = torch.tensor([reward], device=device)
                score += reward

                current_screen = game_evn.capture_screen() / 255
                current_screen_torch = torch.from_numpy(
                    current_screen).unsqueeze(0).unsqueeze(0)
                if not is_gameover:
                    next_state = current_screen_torch
                else:
                    next_state = None

                memory.push(state, action, next_state, reward)
                state = next_state

                optimize_model()
                if is_gameover:
                    episode_durations.append(t + 1)
                    plot_durations(episode_durations)
                    break

            if ct % 100 == 0:
                # Periodically persist the replay memory, sync the target
                # network, and checkpoint the model weights
                game_evn.pause_game()
                with open("cache.p", "wb") as cache:
                    pickle.dump((memory, ct, steps), cache)
                target_net.load_state_dict(policy_net.state_dict())
                gc.collect()
                torch.save({
                    "policy_net": policy_net.state_dict(),
                    "target_net": target_net.state_dict(),
                    "optimizer": optimizer.state_dict()
                }, checkpoint_name)
                game_evn.resume_game()
            print(f"{ct} running.....")
            ct += 1
    except KeyboardInterrupt:
        # Save a final checkpoint when training is interrupted
        torch.save({
            "policy_net": policy_net.state_dict(),
            "target_net": target_net.state_dict(),
            "optimizer": optimizer.state_dict()
        }, checkpoint_name)
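# `ReplayMemory(10000)` above is assumed to be the usual fixed-capacity
# transition buffer; a minimal sketch in the common PyTorch-tutorial style
# (the project's own class may differ, e.g. in what a transition carries).
# Note it must be picklable, since train() caches it to "cache.p":
import random
from collections import deque, namedtuple

Transition = namedtuple("Transition",
                        ("state", "action", "next_state", "reward"))

class ReplayMemory:
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)  # oldest transitions drop off

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)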
from save_and_load import load_checkpoint
from utils import plot_durations
import matplotlib.pyplot as plt
import torch

# IMPORTANT: Set value for i_episode to indicate which checkpoint you want
# to use for evaluation.
i_episode = 7300
start_idx = 6500
end_idx = 7200
ckpt_dir = "DDDQN_SGD_CartPoleV1_obs_checkpoints/"
input_size = 4
output_size = 2

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Read checkpoint
_, _, _, training_info = \
    load_checkpoint(ckpt_dir, i_episode, input_size, output_size, device=device)

# Plot figure
plot_durations(training_info["episode reward"],
               training_info["training loss"],
               training_info["episode loss"],
               (start_idx, end_idx))
def train():
    # Graph Part
    print("Graph initialization...")
    xdim = xtrim[1] - xtrim[0]
    ydim = ytrim[1] - ytrim[0]
    channel = 3
    num_action = env.action_space.n
    policy_net = NETWORK(ydim=ydim, xdim=xdim, channel=channel,
                         num_action=num_action, learning_rate=learning_rate,
                         batch_size=batch_size)
    target_net = NETWORK(ydim=ydim, xdim=xdim, channel=channel,
                         num_action=num_action, learning_rate=learning_rate,
                         batch_size=batch_size)
    policy_net.to(DEVICE)
    target_net.to(DEVICE)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    # Memory
    memory = utils.ReplayMemory(10000)

    # ETCs
    steps_done = 0
    episode_durations = []
    policy_net.float()
    target_net.float()

    print("Training Start.....")
    for episode in range(num_episodes):
        REWARD = 0
        previous_screenshot = utils.dimension_manipulation(
            env.reset()[xtrim[0]:xtrim[1], ytrim[0]:ytrim[1]])
        current_screenshot = previous_screenshot
        state = torch.from_numpy(
            current_screenshot - previous_screenshot).float().to(DEVICE)

        for t in count():
            # env.render()
            action = utils.select_action(state, steps_done, policy_net)
            observation, reward, done, _ = env.step(action.item())

            previous_screenshot = current_screenshot
            current_screenshot = utils.dimension_manipulation(
                observation[xtrim[0]:xtrim[1], ytrim[0]:ytrim[1]])
            if not done:
                next_status = torch.from_numpy(
                    current_screenshot - previous_screenshot).float().to(DEVICE)
                REWARD += reward
            else:
                next_status = None

            # Note: the episode duration (t + 1) is stored as the reward signal
            memory.push(state, action, next_status,
                        torch.tensor(float(t + 1)).to(DEVICE)[None])
            state = next_status

            utils.optimize_model(policy_net, target_net, memory, batch_size)
            if done:
                utils.optimize_model(policy_net, target_net, memory, batch_size)
                episode_durations.append(t + 1)
                utils.plot_durations(episode_durations)
                if REWARD != 0:
                    print("\n######## Episode " + str(episode))
                    print("Duration : " + str(t + 1))
                    print("REWARD : " + str(REWARD))
                    print("loss : " + str(policy_net.loss.item()))
                break

        if episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
if not done:
    next_state = current_screen - last_screen
else:
    next_state = None

# Store the transition in memory
memory.push(state, action, next_state, reward)

# Move to the next state
state = next_state

# Perform one step of the optimization (on the policy network)
optimize_model()
if done:
    episode_durations.append(t + 1)
    plot_durations(episode_durations)
    break

# Update the target network, copying all weights and biases in DQN
if i_episode % TARGET_UPDATE == 0:
    target_net.load_state_dict(policy_net.state_dict())

print('Complete')
env.render()
env.close()
plt.ioff()
plt.show()

######################################################################
# Here is the diagram that illustrates the overall resulting data flow.
#
# .. figure:: /_static/img/reinforcement_learning_diagram.jpg
training_info["value net loss"].append(value_net_mse) if (i_epoch + 1) % num_avg_epoch: training_info["past %d epochs mean reward" % (num_avg_epoch)] = \ (sum(training_info["epoch mean rewards"][-num_avg_epoch:]) / num_avg_epoch) \ if len(training_info["epoch mean rewards"]) >= num_avg_epoch else 0 # Print stats print("\n\n============= Epoch: %d =============" % (i_epoch + 1)) print("epoch mean durations: %f" % (epoch_durations[-1])) print("epoch mean rewards: %f" % (epoch_rewards[-1])) print("Max reward achieved: %f" % training_info["max reward achieved"]) print("value net loss: %f" % value_net_mse) # Plot stats if plot: plot_durations(training_info["epoch mean rewards"], training_info["value net loss"]) # Update counter i_epoch += 1 # Every save_ckpt_interval, save a checkpoint according to current i_episode. if i_epoch % save_ckpt_interval == 0: save_checkpoint(ckpt_dir, policy_net, value_net, policynet_optimizer, valuenet_optimizer, i_epoch, policy_lr=policy_lr, valuenet_lr=valuenet_lr, **training_info)
n_games = 1000
score = 0
print("Save is currently:", A.save)
for i in range(n_games):
    A.env.reset()
    last_screen = A.get_state()
    current_screen = A.get_state()
    state = current_screen - last_screen
    done = False
    score = 0
    if i % 20 == 0 and i > 0:
        plot_durations(scores, 0.001)
        print('----------------- training --------------------')
        print('episode number', i)
        print("Average score ", avg_score[-1])
        print('----------------- training --------------------')
    while not done:
        action = A.choose_action(state)
        _, reward, done, _ = A.env.step(action)
        last_screen = current_screen
        current_screen = A.get_state()
        next_state = current_screen - last_screen
from save_and_load import load_checkpoint
from utils import plot_durations
import matplotlib.pyplot as plt
import torch

# IMPORTANT: Set value for i_epoch to indicate which checkpoint you want
# to use for evaluation.
i_epoch = 650
start_idx = 0
end_idx = i_epoch
input_size = 8
output_size = 4
layer_sizes = [input_size, 128, 128, 128, output_size]  # The MLP network architecture
env_name = "LunarLander-v2"
ckpt_dir = "simplePG_Adam_%s_obs_checkpoints/" % (env_name)

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Read checkpoint
_, _, training_info = \
    load_checkpoint(ckpt_dir, i_epoch, layer_sizes, device=device)

# Plot figure
plot_durations(training_info["epoch mean rewards"], (start_idx, end_idx))