def demo_saved_agent(env, agent_name, n_episodes=3, epsilon=0.05, seed=0,
                     train_mode=False, verbose=False):
    # Initialize environment and scenario info.
    brain, brain_name, state, action_size, state_size = env_initialize(
        env, train_mode=train_mode)

    # Load the agent params and create the agent.
    params, local_weights, target_weights = load_dqn(agent_name, verbose=verbose)
    agent = DQN_Agent(state_size, action_size, brain_name, seed, params=params)
    print(agent.display_params())

    # Set trained agent weights.
    agent.qnetwork_local.load_state_dict(local_weights)
    agent.qnetwork_target.load_state_dict(target_weights)

    # Run demo.
    return demo_agent(env, agent, n_episodes=n_episodes, epsilon=epsilon,
                      seed=seed, train_mode=train_mode)
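# A minimal usage sketch, not part of the original code: it assumes the Unity
# ML-Agents environment this helper targets and a previously saved agent name;
# "Banana.app" and "trained_dqn" are placeholder values.
if __name__ == '__main__':
    from unityagents import UnityEnvironment
    env = UnityEnvironment(file_name="Banana.app")
    scores = demo_saved_agent(env, "trained_dqn", n_episodes=3, epsilon=0.05)
    env.close()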
def eval():
    agent = DQN_Agent(learning=False, rew_attr="wait_time")
    env = Environment(agent)
    generate_routefile(2000)
    env.run()
    # Report the mean of each tracked traffic attribute over the evaluation run.
    for key in TRAFFIC_ATTRS:
        print("STATS: ", sum(env.stats[key]) / len(env.stats[key]))
def create_agent(test_case_id, *args, **kwargs):
    """
    Method that will be called to create your agent during testing.
    You can, for example, initialize a different class of agent depending on
    the test case.
    """
    # return Base_Agent(test_case_id=test_case_id)
    return DQN_Agent(test_case_id=test_case_id)
def learn():
    for i in range(100):
        print("Inside learning step: ", i)
        generate_routefile(2000)

        # Decaying schedules for the learning rate and exploration probability
        # (currently not passed to the agent; see the commented-out kwargs).
        learning_rate = 10 / (50 + i)
        eps_prob = 10 / (10 + i)
        print("Loop: ", i)

        agent = DQN_Agent(learning=True, rew_attr="wait_time", Lnorm=3,
                          # learning_rate=learning_rate,
                          # exploration_eps=eps_prob
                          )
        env = Environment(agent)
        env.run()
def main():
    env = make(game='SonicTheHedgehog2-Genesis', state='EmeraldHillZone.Act1')

    # Parameters for observation image size processing.
    img_rows = 128
    img_cols = 128
    img_stack = 4
    action_size = 8  # 8 valid button combinations

    # Inputs to the agent's prediction network will have the following shape.
    input_size = (img_rows, img_cols, img_stack)

    # File paths
    stat_path = '../statistics/dqn_n-step'
    model_path = '../models/dqn_n-step'

    # Prioritized Experience Replay.
    if (PER_AGENT):
        print('PER agent')
        stat_path += '_PER'
        model_path += '_PER'
        dqn_agent = DQN_PER_Agent(input_size, action_size)
    else:
        dqn_agent = DQN_Agent(input_size, action_size)

    # Use the Noisy Dueling Network.
    if (NOISY):
        stat_path += '_noisy_dueling'
        model_path += '_noisy_dueling'
        print('NOISY Dueling agent')
        dqn_agent.main_model = Networks.noisy_dueling_dqn(
            input_size, action_size, dqn_agent.main_lr)
        dqn_agent.target_model = Networks.noisy_dueling_dqn(
            input_size, action_size, dqn_agent.target_lr)
        dqn_agent.noisy = True
    # Use the normal dueling network.
    elif (DUELING):
        stat_path += '_dueling'
        model_path += '_dueling'
        print('Dueling agent')
        dqn_agent.main_model = Networks.dueling_dqn(input_size, action_size,
                                                    dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dueling_dqn(input_size, action_size,
                                                      dqn_agent.target_lr)
    # Normal DQN.
    else:
        dqn_agent.main_model = Networks.dqn(input_size, action_size,
                                            dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dqn(input_size, action_size,
                                              dqn_agent.target_lr)

    # Append correct suffix and filetype to paths.
    stat_path += '_stats.csv'
    main_model_path = model_path + '_main.h5'
    target_model_path = model_path + '_target.h5'

    # Load previous models, or keep the freshly instantiated networks.
    if (LOAD_MODELS):
        dqn_agent.load_models(main_model_path, target_model_path)

    # Modify the starting epsilon value.
    if (EPSILON == START):
        dqn_agent.epsilon = dqn_agent.initial_epsilon
    elif (EPSILON == MIDDLE):
        dqn_agent.epsilon = ((dqn_agent.initial_epsilon -
                              dqn_agent.final_epsilon) / 2)
    else:
        dqn_agent.epsilon = dqn_agent.final_epsilon

    # Store rewards and states from the previous n state, action pairs to
    # create experiences.
    prev_n_rewards = deque(maxlen=dqn_agent.n_step)
    prev_n_exp = deque(maxlen=dqn_agent.n_step)

    # One episode is 4500 steps if not completed:
    # 5 minutes of frames at 1/15th of a second = 4 60Hz frames.
    total_timestep = 0  # Total number of timesteps over all episodes.
    for episode in range(EPISODES):
        done = False
        reward_sum = 0  # Cumulative reward within the episode.
        timestep = 0    # Track timesteps within the episode.

        # Rewards and states must be consecutive to improve temporal awareness.
        # Reset at the start of each episode to compensate for the sudden scene
        # change.
        prev_n_rewards.clear()
        prev_n_exp.clear()

        # Experiences are a stack of the img_stack most recent frames to provide
        # temporal information. Initialize this sequence to the first
        # observation stacked 4 times.
        first_obs = env.reset()
        processed = preprocess_obs(first_obs, size=(img_rows, img_cols))
        # (img_rows, img_cols, img_stack)
        exp_stack = np.stack(([processed] * img_stack), axis=2)
        # Expand dimensions to stack and submit multiple exp_stacks in a batch:
        # (1, img_rows, img_cols, img_stack).
        exp_stack = np.expand_dims(exp_stack, axis=0)

        # Continue until the end of the zone is reached or 4500 timesteps have
        # passed.
        while not done:
            # Predict an action to take based on the most recent experience.
            #
            # Note that the first dimension (1, img_rows, img_cols, img_stack)
            # is ignored by the network here as it represents a batch size of 1.
            act_idx, action = dqn_agent.act(exp_stack)
            obs, reward, done, info = env.step(action)
            # env.render()
            timestep += 1
            total_timestep += 1
            reward_sum += reward

            # Create a 1st dimension for stacking experiences and a 4th for
            # stacking img_stack frames.
            obs = preprocess_obs(obs, size=(img_rows, img_cols))
            obs = np.reshape(obs, (1, img_rows, img_cols, 1))

            # Append the new observation to the front of the stack and remove
            # the oldest (4th) frame.
            exp_stack_new = np.append(obs, exp_stack[:, :, :, :3], axis=3)

            # Save the previous state, selected action, and resulting reward.
            prev_n_rewards.appendleft(reward)
            prev_n_exp.append((exp_stack, act_idx, done))
            exp_stack = exp_stack_new

            # Once sufficient steps have been taken, discount rewards and save
            # the nth previous experience.
            if (len(prev_n_rewards) >= dqn_agent.n_step):
                # Compute the discounted reward.
                discounted_reward = 0
                for idx in range(len(prev_n_rewards)):
                    prev_reward = prev_n_rewards[idx]
                    # Rewards are appended left so that the most recent rewards
                    # are discounted the least.
                    discounted_reward += ((dqn_agent.gamma**idx) * prev_reward)

                # Experiences are pushed forward into the deque as more are
                # appended. The nth previous experience is at the last index.
                original_state, original_act, _ = prev_n_exp[-1]
                nth_state, _, nth_done = prev_n_exp[0]

                # Save the nth previous state and predicted action with the
                # discounted sum of rewards and the final state over the next
                # n steps.
                dqn_agent.save_memory(original_state, original_act,
                                      discounted_reward, nth_state, nth_done)

            # In the observation phase, skip training updates and decrementing
            # epsilon.
            if (total_timestep >= dqn_agent.observation_timesteps):
                # Update the target model with the main model's weights.
                if ((total_timestep % dqn_agent.update_target_freq) == 0):
                    dqn_agent.update_target_model()

                # Train the agent on saved experiences.
                if ((total_timestep % dqn_agent.timestep_per_train) == 0):
                    dqn_agent.replay_update()
                    dqn_agent.save_models(main_model_path, target_model_path)

                if (dqn_agent.epsilon > dqn_agent.final_epsilon):
                    # Decrease epsilon by a fraction of the range such that
                    # epsilon decreases for "exploration_timesteps".
                    dec = ((dqn_agent.initial_epsilon -
                            dqn_agent.final_epsilon) /
                           dqn_agent.exploration_timesteps)
                    dqn_agent.epsilon -= dec

            # print(info)
            print("Episode:", episode, " Timestep:", timestep,
                  " Action:", act_idx, " Episode Reward Sum:", reward_sum,
                  " Epsilon:", dqn_agent.epsilon)

        # Save the episode reward at the end of the episode - append to the
        # stats file.
        with open(stat_path, "a") as stats_fd:
            reward_str = ("Episode Cumulative Reward: " + str(reward_sum) +
                          ", Episode Timesteps: " + str(timestep) + ",\n")
            stats_fd.write(str(reward_str))
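# A small worked example of the discounting loop above (not in the original
# code): with n_step = 3 and gamma = 0.99, rewards pushed with appendleft put
# the newest reward at index 0, so the loop evaluates
#   discounted = r_newest * 0.99**0 + r_mid * 0.99**1 + r_oldest * 0.99**2.
from collections import deque

gamma, n_step = 0.99, 3
prev_n_rewards = deque(maxlen=n_step)
for r in [1.0, 0.0, 2.0]:           # oldest ... newest
    prev_n_rewards.appendleft(r)    # newest reward ends up at index 0

discounted_reward = 0.0
for idx, prev_reward in enumerate(prev_n_rewards):
    discounted_reward += (gamma ** idx) * prev_reward
print(discounted_reward)            # 2.0 + 0.0*0.99 + 1.0*0.99**2 = 2.9801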
def demo4_LearningPathPlanning(setting):
    n_sample = 100

    # Environment
    env = FireEnvironment(64, 64)

    # Vehicle to generate observation mask
    vehicle = Vehicle(n_time_windows=512, grid_size=(64, 64),
                      planner_type='Default')

    # Trainer and Estimator
    dyn_autoencoder = DynamicAutoEncoder(SETTING,
                                         grid_size=(env.map_width, env.map_height),
                                         n_state=3, n_obs=3,
                                         encoding_dim=16, gru_hidden_dim=16)

    ### DQN agent ###
    dqn_agent = DQN_Agent(state_size=16, action_size=4,
                          replay_memory_size=1000, batch_size=64, gamma=0.99,
                          learning_rate=0.01, target_tau=0.01, update_rate=1,
                          seed=0)

    # Train Data Buffer
    memory = SingleTrajectoryBuffer(N_MEMORY_SIZE)

    # Train Iteration Logger
    writer = SummaryWriter()

    # Video Writer
    video_writer1 = ImageStreamWriter('LearningPlanner.avi', FPS,
                                      image_size=(1200, 820))

    # Log the setting as concatenated text.
    setting_text = ''
    for k, v in setting.items():
        setting_text += k
        setting_text += ':'
        setting_text += str(v)
        setting_text += '\t'
    writer.add_text('setting', setting_text)

    ########################################
    ### Interacting with the Environment ###
    ########################################
    mask_obs, obs, state = env.reset()
    state_est_grid = dyn_autoencoder.u_k

    ### Loss Monitors ###
    list_loss = []
    list_cross_entropy_loss = []
    list_entropy_loss = []
    list_rewards = []
    list_new_fire_count = []
    list_action = []

    ### Filling the Data Buffer ###
    for i in tqdm.tqdm(range(N_TRAIN_WAIT)):
        map_visit_mask, img_resized = vehicle.full_mask()
        mask_obs, obs, state, reward, info = env.step(map_visit_mask)
        memory.add(mask_obs.detach().long(), state.detach().long(),
                   map_visit_mask.detach().long())

    for i in tqdm.tqdm(range(N_TOTAL_TIME_STEPS)):

        # Determine the epsilon-greedy action from the current state.
        h_k = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()
        epsilon = 0.1
        action = dqn_agent.act(h_k, epsilon)
        list_action.append(action)

        ### Collect Data from the Env. ###
        map_visit_mask, img_resized = vehicle.plan_a_trajectory(
            state_est_grid, n_sample, action)
        mask_obs, obs, state, reward, info = env.step(map_visit_mask)
        memory.add(mask_obs.detach().long(), state.detach().long(),
                   map_visit_mask.detach().long())

        ### Run the Estimator ###
        state_est_grid = dyn_autoencoder.step(mask_obs, map_visit_mask)
        h_kp1 = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()

        ### Update the reinforcement learning agent ###
        dqn_agent.step(h_k, action, reward, h_kp1, done=False)
        list_rewards.append(reward)
        list_new_fire_count.append(info['new_fire_count'])

        ################################
        ### Rendering and Save Video ###
        ################################
        img_env = env.output_image()
        img_agent = dyn_autoencoder.output_image(state_est_grid)  # State Est.

        #blank = np.zeros((400, 200, 3))
        img_top = img_env  #np.concatenate((blank, img_env[:,:800], blank), axis=1)
        blank = np.zeros((20, 1200, 3))
        img_top = np.concatenate((img_top, blank), axis=0)
        img_top = (img_top * 255).astype('uint8')

        img_state_est_grid_uint8 = (img_agent * 255).astype('uint8')
        backtorgb = cv2.cvtColor(img_state_est_grid_uint8, cv2.COLOR_GRAY2RGB)

        img_bayes_uint8 = np.concatenate((img_top, backtorgb), axis=0)  # <-- to be saved
        render('Dynamic Auto Encoder', img_bayes_uint8, 1)

        # Save video #
        video_writer1.write_image_frame(img_bayes_uint8)

        ### Training ###
        loss_val, loss_val_cross, loss_val_ent, O_np_val = dyn_autoencoder.update(
            memory, N_TRAIN_BATCH, N_TRAIN_WINDOW)
        list_loss.append(loss_val)
        list_cross_entropy_loss.append(loss_val_cross)
        list_entropy_loss.append(loss_val_ent)

        if i % N_LOGGING_PERIOD == 0:
            avg_loss = np.mean(np.array(list_loss))
            list_loss = []
            writer.add_scalar('dynautoenc/loss', avg_loss, i)

            avg_loss_cross = np.mean(np.array(list_cross_entropy_loss))
            list_cross_entropy_loss = []
            writer.add_scalar('dynautoenc/crossentropy', avg_loss_cross, i)

            avg_loss_entropy = np.mean(np.array(list_entropy_loss))
            list_entropy_loss = []
            writer.add_scalar('dynautoenc/shannonentropy', avg_loss_entropy, i)

            avg_reward = np.mean(np.array(list_rewards))
            list_rewards = []
            writer.add_scalar('perform/rewards', avg_reward, i)

            avg_new_fire_count = np.mean(np.array(list_new_fire_count))
            list_new_fire_count = []
            writer.add_scalar('perform/new_fire_counts', avg_new_fire_count, i)
            writer.add_scalar('perform/pc_coverd_new_fire',
                              avg_reward / avg_new_fire_count, i)

            action_0_count = list_action.count(0)
            action_1_count = list_action.count(1)
            action_2_count = list_action.count(2)
            action_3_count = list_action.count(3)
            writer.add_scalar('action_count/0', action_0_count / len(list_action), i)
            writer.add_scalar('action_count/1', action_1_count / len(list_action), i)
            writer.add_scalar('action_count/2', action_2_count / len(list_action), i)
            writer.add_scalar('action_count/3', action_3_count / len(list_action), i)
            list_action = []

            writer.add_scalar('obs_state0/o00', O_np_val[0][0], i)
            writer.add_scalar('obs_state1/o01', O_np_val[0][1], i)
            writer.add_scalar('obs_state2/o02', O_np_val[0][2], i)
            writer.add_scalar('obs_state0/o10', O_np_val[1][0], i)
            writer.add_scalar('obs_state1/o11', O_np_val[1][1], i)
            writer.add_scalar('obs_state2/o12', O_np_val[1][2], i)
            writer.add_scalar('obs_state0/o20', O_np_val[2][0], i)
            writer.add_scalar('obs_state1/o21', O_np_val[2][1], i)
            writer.add_scalar('obs_state2/o22', O_np_val[2][2], i)

            print('losses at iteration: %d, losses: total %.3f, cross %.3f, shannon %.3f'
                  % (i, avg_loss, avg_loss_cross, avg_loss_entropy))
            print('memory size at iteration: %d, size: %d'
                  % (i, len(memory.obs_memory)))

        if (i + 1) % N_SAVING_PERIOD == 0:
            f_name = setting['name']
            dyn_autoencoder.save_the_model(i, f_name)
            dqn_agent.save_the_model(i, f_name)

    video_writer1.close()
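# A minimal invocation sketch, not taken from the original code: demo4 itself
# only reads setting['name'] (for checkpoint file names) and logs the rest of
# the dictionary to TensorBoard, so a single placeholder entry is enough here.
if __name__ == '__main__':
    demo4_LearningPathPlanning({'name': 'learning_planner_demo'})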
from dqn_agent import DQN_Agent
from tqdm import tqdm
import networkx as nx
import numpy as np
import pylab as plt
import pandas as pd

input_dim = 1
output_dim = 9
exp_replay_size = 256
agent = DQN_Agent(seed=0, layer_sizes=[input_dim, 16, output_dim], lr=1e-3,
                  sync_freq=5, exp_replay_size=exp_replay_size)

# Main training loop
losses_list, reward_list, episode_len_list, epsilon_list = [], [], [], []
episodes = 10000
epsilon = 1

# Initialize the graph
edge_list = [(0, 2), (0, 1), (0, 3), (2, 4), (5, 6), (7, 4), (0, 6), (5, 3),
             (3, 7), (0, 8)]
goal = 7
SIZE_MATRIX = 9
G = nx.Graph()
G.add_edges_from(edge_list)
import gym
from tqdm import tqdm
from time import sleep

from dqn_agent import DQN_Agent

env = gym.make('CartPole-v0')
input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n
exp_replay_size = 256
agent = DQN_Agent(seed=1423, layer_sizes=[input_dim, 64, output_dim], lr=1e-3,
                  sync_freq=5, exp_replay_size=exp_replay_size)
agent.load_pretrained_model("cartpole-dqn.pth")

# Evaluate the pretrained agent greedily (epsilon=0) for 100 episodes.
reward_arr = []
for i in tqdm(range(100)):
    obs, done, rew = env.reset(), False, 0
    while not done:
        A = agent.get_action(obs, env.action_space.n, epsilon=0)
        obs, reward, done, info = env.step(A.item())
        rew += reward
        sleep(0.01)
        env.render()
    reward_arr.append(rew)
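# A small follow-up sketch, not in the original script: report the mean
# evaluation reward gathered above (uses only the reward_arr list defined here).
print("Average reward over %d episodes: %.2f"
      % (len(reward_arr), sum(reward_arr) / len(reward_arr)))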
def demo5_ComparePolicies(setting, env):
    n_sample = 2048

    # Vehicle to generate observation mask
    vehicle = Vehicle(n_time_windows=64, grid_size=(64, 64),
                      planner_type='Default')

    # Trainer and Estimator
    dyn_autoencoder = DynamicAutoEncoder(SETTING,
                                         grid_size=(env.map_width, env.map_height),
                                         n_state=3, n_obs=3,
                                         encoding_dim=4, gru_hidden_dim=4)

    ### DQN agent ###
    dqn_agent = DQN_Agent(state_size=4, action_size=4,
                          replay_memory_size=1000, batch_size=64, gamma=0.99,
                          learning_rate=0.01, target_tau=0.01, update_rate=1,
                          seed=0)

    # Train Data Buffer
    memory = SingleTrajectoryBuffer(N_MEMORY_SIZE)

    # Video Writer
    '''
    video_f_name = 'UsePlanner' + '_' + setting['name'] + '_' + setting['policy_type'] + '.avi'
    video_writer1 = ImageStreamWriter(video_f_name, FPS, image_size=(1200, 820))
    '''

    # Train Iteration Logger
    writer = SummaryWriter()

    # Log the setting as concatenated text.
    setting_text = ''
    for k, v in setting.items():
        setting_text += k
        setting_text += ':'
        setting_text += str(v)
        setting_text += '\t'
    writer.add_text('setting', setting_text)

    ########################################
    ### Interacting with the Environment ###
    ########################################

    ### Loss Monitors ###
    list_rewards = []
    list_new_fire_count = []
    list_action = []
    list_loss = []

    ### Filling the Data Buffer ###
    for i in tqdm.tqdm(range(N_TRAIN_WAIT)):
        map_visit_mask, img_resized = vehicle.full_mask()
        mask_obs, obs, state, reward, info = env.step(map_visit_mask)
        memory.add(mask_obs.detach().long(), state.detach().long(),
                   map_visit_mask.detach().long())

    mask_obs, obs, state = env.reset()
    state_est_grid = dyn_autoencoder.u_k

    for i in tqdm.tqdm(range(N_TOTAL_TIME_STEPS)):

        # Determine the epsilon-greedy action from the current state.
        h_k = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()
        epsilon = 0.1
        action = dqn_agent.act(h_k, epsilon)

        ### Collect Data from the Env. ###
        # Plan a trajectory
        policy_type = setting['policy_type']
        if policy_type == 'Default':
            map_visit_mask, img_resized = vehicle.plan_a_trajectory(
                state_est_grid, n_sample, action)
        elif policy_type == 'Random':
            action = 777
            map_visit_mask, img_resized = vehicle.generate_a_random_trajectory()
        elif policy_type == 'Act0':
            action = 0
            map_visit_mask, img_resized = vehicle.plan_a_trajectory(
                state_est_grid, n_sample, action)
        elif policy_type == 'Act1':
            action = 1
            map_visit_mask, img_resized = vehicle.plan_a_trajectory(
                state_est_grid, n_sample, action)
        elif policy_type == 'Act2':
            action = 2
            map_visit_mask, img_resized = vehicle.plan_a_trajectory(
                state_est_grid, n_sample, action)
        else:
            action = 3
            map_visit_mask, img_resized = vehicle.plan_a_trajectory(
                state_est_grid, n_sample, action)
        list_action.append(action)

        # Collect the masked observation
        mask_obs, obs, state, reward, info = env.step(map_visit_mask)
        memory.add(mask_obs.detach().long(), state.detach().long(),
                   map_visit_mask.detach().long())

        ### Run the Estimator ###
        state_est_grid = dyn_autoencoder.step(mask_obs, map_visit_mask)
        h_kp1 = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()

        list_rewards.append(reward)
        list_new_fire_count.append(info['new_fire_count'])

        update = True
        ### Update the reinforcement learning agent and Dyn Auto Enc ###
        if policy_type != 'Random':
            dqn_agent.step(h_k, action, reward, h_kp1, False, update)
            loss_val, loss_val_cross, loss_val_ent, O_np_val = dyn_autoencoder.update(
                memory, N_TRAIN_BATCH, N_TRAIN_WINDOW, update)
            list_loss.append(loss_val)

        ################################
        ### Rendering and Save Video ###
        ################################
        img_env = env.output_image()
        img_agent = dyn_autoencoder.output_image(state_est_grid)  # State Est.

        #blank = np.zeros((400, 200, 3))
        img_top = img_env  #np.concatenate((blank, img_env[:,:800], blank), axis=1)
        blank = np.zeros((20, 1200, 3))
        img_top = np.concatenate((img_top, blank), axis=0)
        img_top = (img_top * 255).astype('uint8')

        img_state_est_grid_uint8 = (img_agent * 255).astype('uint8')
        backtorgb = cv2.cvtColor(img_state_est_grid_uint8, cv2.COLOR_GRAY2RGB)

        img_bayes_uint8 = np.concatenate((img_top, backtorgb), axis=0)  # <-- to be saved
        render('Dynamic Auto Encoder', img_bayes_uint8, 1)

        # Save video #
        #video_writer1.write_image_frame(img_bayes_uint8)

        if i % N_LOGGING_PERIOD == 0:
            avg_reward = np.mean(np.array(list_rewards))
            list_rewards = []
            writer.add_scalar('perform/rewards', avg_reward, i)

            # Use at least 1 to avoid division by zero.
            avg_new_fire_count = max(np.mean(np.array(list_new_fire_count)), 1)
            list_new_fire_count = []
            writer.add_scalar('perform/new_fire_counts', avg_new_fire_count, i)
            writer.add_scalar('perform/pc_coverd_new_fire',
                              avg_reward / avg_new_fire_count, i)

            if policy_type != 'Random':
                avg_loss = np.mean(np.array(list_loss))
                list_loss = []
                writer.add_scalar('dynautoenc/loss', avg_loss, i)

            action_0_count = list_action.count(0)
            action_1_count = list_action.count(1)
            action_2_count = list_action.count(2)
            action_3_count = list_action.count(3)
            writer.add_scalar('action_count/0', action_0_count / len(list_action), i)
            writer.add_scalar('action_count/1', action_1_count / len(list_action), i)
            writer.add_scalar('action_count/2', action_2_count / len(list_action), i)
            writer.add_scalar('action_count/3', action_3_count / len(list_action), i)
            list_action = []

            if policy_type != 'Random':
                # O_np_val is only produced when the estimator update above runs.
                writer.add_scalar('obs_state0/o00', O_np_val[0][0], i)
                writer.add_scalar('obs_state1/o01', O_np_val[0][1], i)
                writer.add_scalar('obs_state2/o02', O_np_val[0][2], i)
                writer.add_scalar('obs_state0/o10', O_np_val[1][0], i)
                writer.add_scalar('obs_state1/o11', O_np_val[1][1], i)
                writer.add_scalar('obs_state2/o12', O_np_val[1][2], i)
                writer.add_scalar('obs_state0/o20', O_np_val[2][0], i)
                writer.add_scalar('obs_state1/o21', O_np_val[2][1], i)
                writer.add_scalar('obs_state2/o22', O_np_val[2][2], i)
def train(fullcover, name, setting):
    n_sample = 20

    # Environment
    env = FireEnvironment(64, 64)

    # Vehicle to generate observation mask
    vehicle = Vehicle(n_time_windows=1000, grid_size=(64, 64),
                      planner_type=setting['planner_type'])

    # Trainer and Estimator
    dyn_autoencoder = DynamicAutoEncoder(SETTING,
                                         grid_size=(env.map_width, env.map_height),
                                         n_state=3, n_obs=3,
                                         encoding_dim=16, gru_hidden_dim=16)

    # Train Data Buffer
    memory = SingleTrajectoryBuffer(N_MEMORY_SIZE)

    ### DQN agent ###
    dqn_agent = DQN_Agent(state_size=16, action_size=4,
                          replay_memory_size=1000, batch_size=64, gamma=0.99,
                          learning_rate=0.01, target_tau=0.01, update_rate=1,
                          seed=0)

    # Train Iteration Logger
    from torch.utils.tensorboard import SummaryWriter
    writer = SummaryWriter()

    # Log the setting as concatenated text.
    setting_text = ''
    for k, v in setting.items():
        setting_text += k
        setting_text += str(v)
        setting_text += '\t'
    writer.add_text('setting', setting_text)

    ########################################
    ### Interacting with the Environment ###
    ########################################
    mask_obs, obs, state = env.reset()
    map_visit_mask, img_resized = vehicle.full_mask()
    state_est_grid = dyn_autoencoder.u_k

    ### Loss Monitors ###
    list_loss = []
    list_cross_entropy_loss = []
    list_entropy_loss = []
    list_rewards = []
    list_count_fire_visit = []
    list_count_all_fire = []
    list_action = []

    ### Filling the Data Buffer ###
    for i in tqdm.tqdm(range(N_TRAIN_WAIT)):
        # No agent action exists yet, so the full-coverage mask is used while
        # the buffer is filled (the planner requires an action).
        map_visit_mask, img_resized = vehicle.full_mask()
        mask_obs, obs, state, reward = env.step(map_visit_mask)
        memory.add(mask_obs, state, map_visit_mask)

    for i in tqdm.tqdm(range(N_TOTAL_TIME_STEPS)):

        # Determine the epsilon-greedy action from the current state.
        h_k = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()
        epsilon = 0.1
        action = dqn_agent.act(h_k, epsilon)
        list_action.append(action)

        ### Collect Data from the Env. ###
        if fullcover:
            map_visit_mask, img_resized = vehicle.full_mask()
        else:
            map_visit_mask, img_resized = vehicle.plan_a_trajectory(
                state_est_grid, n_sample, action)
        mask_obs, obs, state, reward = env.step(map_visit_mask)
        memory.add(mask_obs, state, map_visit_mask)

        ### Run the Estimator ###
        state_est_grid = dyn_autoencoder.step(mask_obs, map_visit_mask)
        h_kp1 = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()

        ### Update the reinforcement learning agent ###
        dqn_agent.step(h_k, action, reward, h_kp1, done=False)
        list_rewards.append(reward)

        fire_count = (torch.sum(state[2])).item()
        fire_visit = (torch.sum(mask_obs.permute(2, 0, 1) * state[2].unsqueeze(0))).item()

        if fire_count < 1:
            print('no fire')
        else:
            list_count_fire_visit.append(fire_visit)
            list_count_all_fire.append(fire_count)

        ### Render the Env. and the Est. ###
        if i % N_RENDER_PERIOD == 0:
            img_env = env.output_image()
            img_state_est_grid = dyn_autoencoder.output_image(state_est_grid)
            render('env', img_env, 1)
            render('img_state_est_grid', img_state_est_grid, 1)

        ### Training ###
        loss_val, loss_val_cross, loss_val_ent, O_np_val = dyn_autoencoder.update(
            memory, N_TRAIN_BATCH, N_TRAIN_WINDOW)
        list_loss.append(loss_val)
        list_cross_entropy_loss.append(loss_val_cross)
        list_entropy_loss.append(loss_val_ent)

        if i % N_LOGGING_PERIOD == 0:
            avg_loss = np.mean(np.array(list_loss))
            list_loss = []
            writer.add_scalar('dynautoenc/loss', avg_loss, i)

            avg_loss_cross = np.mean(np.array(list_cross_entropy_loss))
            list_cross_entropy_loss = []
            writer.add_scalar('dynautoenc/crossentropy', avg_loss_cross, i)

            avg_loss_entropy = np.mean(np.array(list_entropy_loss))
            list_entropy_loss = []
            writer.add_scalar('dynautoenc/shannonentropy', avg_loss_entropy, i)

            avg_reward = np.mean(np.array(list_rewards))
            list_rewards = []
            writer.add_scalar('perform/rewards', avg_reward, i)

            avg_count_fire_visit = np.mean(np.array(list_count_fire_visit))
            list_count_fire_visit = []
            writer.add_scalar('perform/avg_count_fire_visit', avg_count_fire_visit, i)

            avg_count_all_fire = np.mean(np.array(list_count_all_fire))
            list_count_all_fire = []
            writer.add_scalar('perform/avg_count_all_fire', avg_count_all_fire, i)

            action_0_count = list_action.count(0)
            action_1_count = list_action.count(1)
            action_2_count = list_action.count(2)
            action_3_count = list_action.count(3)
            list_action = []

            if setting['planner_type'] == 'Default':
                writer.add_scalar('action_count/0', action_0_count, i)
                writer.add_scalar('action_count/1', action_1_count, i)
                writer.add_scalar('action_count/2', action_2_count, i)
                writer.add_scalar('action_count/3', action_3_count, i)

            writer.add_scalar('obs_state0/o00', O_np_val[0][0], i)
            writer.add_scalar('obs_state1/o01', O_np_val[0][1], i)
            writer.add_scalar('obs_state2/o02', O_np_val[0][2], i)
            writer.add_scalar('obs_state0/o10', O_np_val[1][0], i)
            writer.add_scalar('obs_state1/o11', O_np_val[1][1], i)
            writer.add_scalar('obs_state2/o12', O_np_val[1][2], i)
            writer.add_scalar('obs_state0/o20', O_np_val[2][0], i)
            writer.add_scalar('obs_state1/o21', O_np_val[2][1], i)
            writer.add_scalar('obs_state2/o22', O_np_val[2][2], i)

            print('losses at iteration: %d, losses: total %.3f, cross %.3f, shannon %.3f'
                  % (i, avg_loss, avg_loss_cross, avg_loss_entropy))
            print('memory size at iteration: %d, size: %d'
                  % (i, len(memory.obs_memory)))

        if (i + 1) % N_SAVING_PERIOD == 0:
            f_name = name
            dyn_autoencoder.save_the_model(i, f_name)
from tqdm import tqdm
from time import sleep

from dqn_agent import DQN_Agent
import numpy as np

input_dim = 1
output_dim = 9
exp_replay_size = 256
agent = DQN_Agent(seed=1423, layer_sizes=[input_dim, 16, output_dim], lr=1e-3,
                  sync_freq=5, exp_replay_size=exp_replay_size)
agent.load_pretrained_model("shortest-path-dqn.pth")

# Follow the greedy policy (epsilon=0) from the start node until the goal.
goal = 7
obs = 4
done = False
steps = [obs]
while not obs == goal:
    A = agent.get_action(np.array([obs]), 9, epsilon=0)
    print(str(obs) + ' -> ' + str(A.item()))
    obs = A.item()
    steps.append(A.item())
print(steps)
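# Optional visualization sketch, not part of the original script: it assumes
# the same edge_list used in the training script and that networkx and
# matplotlib are installed. It highlights the nodes visited on the greedy
# path stored in `steps`.
import networkx as nx
import pylab as plt

edge_list = [(0, 2), (0, 1), (0, 3), (2, 4), (5, 6), (7, 4), (0, 6), (5, 3),
             (3, 7), (0, 8)]
G = nx.Graph()
G.add_edges_from(edge_list)
pos = nx.spring_layout(G, seed=0)
nx.draw(G, pos, with_labels=True, node_color='lightgray')
nx.draw_networkx_nodes(G, pos, nodelist=steps, node_color='orange')
plt.show()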
def main():
    env = make(game='SonicTheHedgehog2-Genesis', state='EmeraldHillZone.Act1')

    # Parameters for observation image size processing.
    img_rows = 128
    img_cols = 128
    img_stack = 4
    action_size = 8  # 8 valid button combinations

    # Inputs to the agent's prediction network will have the following shape.
    input_size = (img_rows, img_cols, img_stack)

    # File paths
    stat_path = '../statistics/dqn'
    model_path = '../models/dqn'

    # Prioritized Experience Replay.
    if (PER_AGENT):
        print('PER agent')
        stat_path += '_PER'
        model_path += '_PER'
        dqn_agent = DQN_PER_Agent(input_size, action_size)
    elif (DIST_AGENT):
        stat_path += '_DIST'
        model_path += '_DIST'
        dqn_agent = DistributionalDQN(input_size, action_size)
    else:
        dqn_agent = DQN_Agent(input_size, action_size)

    # Use the Noisy Dueling Network.
    if (NOISY):
        stat_path += '_noisy_dueling'
        model_path += '_noisy_dueling'
        print('NOISY Dueling agent')
        dqn_agent.main_model = Networks.noisy_dueling_dqn(input_size, action_size,
                                                          dqn_agent.main_lr)
        dqn_agent.target_model = Networks.noisy_dueling_dqn(input_size, action_size,
                                                            dqn_agent.target_lr)
        dqn_agent.noisy = True
    # Use the dueling distributional network.
    elif (DUELING and DIST_AGENT):
        stat_path += '_dueling'
        model_path += '_dueling'
        print('Dueling distributional')
        dqn_agent.main_model = Networks.dueling_C51(input_size, action_size,
                                                    dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dueling_C51(input_size, action_size,
                                                      dqn_agent.target_lr)
    # Use the normal dueling network.
    elif (DUELING):
        stat_path += '_dueling'
        model_path += '_dueling'
        print('Dueling agent')
        dqn_agent.main_model = Networks.dueling_dqn(input_size, action_size,
                                                    dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dueling_dqn(input_size, action_size,
                                                      dqn_agent.target_lr)
    # Distributional (C51) network.
    elif (DIST_AGENT):
        dqn_agent.main_model = Networks.C51(input_size, action_size,
                                            dqn_agent.main_lr)
        dqn_agent.target_model = Networks.C51(input_size, action_size,
                                              dqn_agent.target_lr)
    # Normal DQN.
    else:
        dqn_agent.main_model = Networks.dqn(input_size, action_size,
                                            dqn_agent.main_lr)
        dqn_agent.target_model = Networks.dqn(input_size, action_size,
                                              dqn_agent.target_lr)

    # Append correct suffix and filetype to paths.
    stat_path += '_stats.csv'
    main_model_path = model_path + '_main.h5'
    target_model_path = model_path + '_target.h5'

    # Load previous models.
    if (LOAD_MODELS):
        dqn_agent.load_models(main_model_path, target_model_path)

    # Modify the starting epsilon value.
    if (EPSILON == START):
        dqn_agent.epsilon = dqn_agent.initial_epsilon
    elif (EPSILON == MIDDLE):
        dqn_agent.epsilon = ((dqn_agent.initial_epsilon -
                              dqn_agent.final_epsilon) / 2)
    else:
        dqn_agent.epsilon = dqn_agent.final_epsilon

    # One episode is 4500 steps if not completed:
    # 5 minutes of frames at 1/15th of a second = 4 60Hz frames.
    total_timestep = 0  # Total number of timesteps over all episodes.
    for episode in range(EPISODES):
        done = False
        reward_sum = 0  # Cumulative reward within the episode.
        timestep = 0    # Track timesteps within the episode.

        first_obs = env.reset()
        # Experiences are a stack of the img_stack most recent frames to provide
        # temporal information. Initialize this sequence to the first
        # observation stacked 4 times.
        processed = preprocess_obs(first_obs, size=(img_rows, img_cols))
        # (img_rows, img_cols, img_stack)
        exp_stack = np.stack(([processed] * img_stack), axis=2)
        # Expand dimensions to stack and submit multiple exp_stacks in a batch:
        # (1, img_rows, img_cols, img_stack).
        exp_stack = np.expand_dims(exp_stack, axis=0)

        # Punish the agent for not moving forward.
        prev_state = {}
        steps_stuck = 0

        # Continue until the end of the zone is reached or 4500 timesteps have
        # passed.
        while not done:
            # Predict an action to take based on the most recent experience.
            #
            # Note that the first dimension (1, img_rows, img_cols, img_stack)
            # is ignored by the network here as it represents a batch size of 1.
            act_idx, action = dqn_agent.act(exp_stack)
            obs, reward, done, info = env.step(action)
            # env.render()

            # Punish the agent for standing still for too long.
            if (prev_state == info):
                steps_stuck += 1
            else:
                steps_stuck = 0
            prev_state = info

            # Position-based reward does not include the stagnation punishment.
            reward_sum += reward
            if (steps_stuck > 20):
                reward -= 1

            # Track various events.
            timestep += 1
            total_timestep += 1

            obs = preprocess_obs(obs, size=(img_rows, img_cols))
            # Create a 1st dimension for stacking experiences and a 4th for
            # stacking img_stack frames.
            obs = np.reshape(obs, (1, img_rows, img_cols, 1))

            # Append the new observation to the front of the stack and remove
            # the oldest (4th) frame.
            exp_stack_new = np.append(obs, exp_stack[:, :, :, :3], axis=3)

            # Save the experience: <state, action, reward, next_state, done>.
            dqn_agent.save_memory(exp_stack, act_idx, reward, exp_stack_new, done)
            exp_stack = exp_stack_new

            # In the observation phase, skip training updates and decrementing
            # epsilon.
            if (total_timestep >= dqn_agent.observation_timesteps):
                # Update the target model with the main model's weights.
                if ((total_timestep % dqn_agent.update_target_freq) == 0):
                    dqn_agent.update_target_model()

                # Train the agent on saved experiences.
                if ((total_timestep % dqn_agent.timestep_per_train) == 0):
                    dqn_agent.replay_update()
                    dqn_agent.save_models(main_model_path, target_model_path)

                if (dqn_agent.epsilon > dqn_agent.final_epsilon):
                    # Decrease epsilon by a fraction of the range such that
                    # epsilon decreases for "exploration_timesteps".
                    dec = ((dqn_agent.initial_epsilon -
                            dqn_agent.final_epsilon) /
                           dqn_agent.exploration_timesteps)
                    dqn_agent.epsilon -= dec

            # print(info)
            print("Episode:", episode, " Timestep:", timestep,
                  " Action:", act_idx, " Episode Reward Sum:", reward_sum,
                  " Epsilon:", dqn_agent.epsilon)

        # Save the episode reward at the end of the episode - append to the
        # stats file.
        with open(stat_path, "a") as stats_fd:
            reward_str = ("Episode Cumulative Reward: " + str(reward_sum) +
                          ", Episode Timesteps: " + str(timestep) + ",\n")
            stats_fd.write(str(reward_str))
from dqn_agent import DQN_Agent
import gym
from tqdm import tqdm

env = gym.make('CartPole-v0')
input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n
exp_replay_size = 256
agent = DQN_Agent(seed=1423, layer_sizes=[input_dim, 64, output_dim], lr=1e-3,
                  sync_freq=5, exp_replay_size=exp_replay_size)

# Main training loop
losses_list, reward_list, episode_len_list, epsilon_list = [], [], [], []
episodes = 10000
epsilon = 1

# Initialize the experience replay buffer with random-policy transitions.
index = 0
for i in range(exp_replay_size):
    obs = env.reset()
    done = False
    while not done:
        A = agent.get_action(obs, env.action_space.n, epsilon=1)
        obs_next, reward, done, _ = env.step(A.item())
        agent.collect_experience([obs, A.item(), reward, obs_next])
        obs = obs_next
        index += 1
        if index > exp_replay_size:
            # Stop pre-filling once the replay buffer is full.
            break
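# A hedged sketch of the main training loop implied by the bookkeeping lists
# above. The agent.train(batch_size=...) call returning a loss is an assumption
# about the DQN_Agent API and may need to be adapted; the epsilon decay rate,
# update period, and batch size are illustrative choices, not values taken from
# the original script.
for i in tqdm(range(episodes)):
    obs, done = env.reset(), False
    losses, ep_len, rew = 0, 0, 0
    while not done:
        ep_len += 1
        A = agent.get_action(obs, env.action_space.n, epsilon)
        obs_next, reward, done, _ = env.step(A.item())
        agent.collect_experience([obs, A.item(), reward, obs_next])
        obs = obs_next
        rew += reward
        index += 1
        # Periodically train on minibatches sampled from the replay buffer.
        if index > 128:
            index = 0
            for j in range(4):
                loss = agent.train(batch_size=16)  # assumed API
                losses += loss
    # Linearly decay exploration.
    if epsilon > 0.05:
        epsilon -= 1 / 5000
    losses_list.append(losses / max(ep_len, 1))
    reward_list.append(rew)
    episode_len_list.append(ep_len)
    epsilon_list.append(epsilon)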