def __init__(self, env_name, params):
    self.env = envs.make(env_name)
    self.params = params
    self.action_bound = self.env.action_bound[1]

    self.iterations = params["iterations"]
    self.mem_len = params["mem_len"]
    self.seed = params["seed"]
    self.render = params["render"]
    self.log_interval = params["log_interval"]
    self.warmup = params["warmup"]
    self.batch_size = params["batch_size"]
    self.save = params["save"]

    hidden_dim = params["hidden_dim"]
    state_dim = self.env.observation_space
    action_dim = self.env.action_space
    cuda = params["cuda"]
    network_settings = params["network_settings"]

    actor = utils.Actor(state_dim, hidden_dim, action_dim)
    target_actor = utils.Actor(state_dim, hidden_dim, action_dim)
    critic = utils.Critic(state_dim + action_dim, hidden_dim, 1)
    target_critic = utils.Critic(state_dim + action_dim, hidden_dim, 1)

    self.agent = sw.Sleepwalk(actor, critic, target_actor, target_critic,
                              network_settings, GPU=cuda)

    self.noise = utils.OUNoise(action_dim)
    self.noise.set_seed(self.seed)

    # replay memory sized from the experiment config (single initialization)
    self.memory = utils.ReplayMemory(self.mem_len)

    self.pol_opt = torch.optim.Adam(actor.parameters())
    self.crit_opt = torch.optim.Adam(critic.parameters())

    if cuda:
        self.Tensor = torch.cuda.FloatTensor
    else:
        self.Tensor = torch.Tensor

    if self.render:
        self.env.init_rendering()

    self.best = None

    # initialize experiment logging
    self.logging = params["logging"]
    if self.logging:
        self.directory = os.getcwd()
        filename = self.directory + "/data/qprop.csv"
        with open(filename, "w") as csvfile:
            self.writer = csv.writer(csvfile)
            self.writer.writerow(["episode", "reward"])
            self.train()
    else:
        self.train()
def play(save_path):
    '''
    Loads network from the location of save_path and plays a game of Pong.
    '''
    # Initialize the Pong gym environment, set seeds
    env = gym.make('Pong-v0')
    replay_memory = u.ReplayMemory()
    G = tf.Graph()
    with G.as_default():
        # Import TF graph
        saver = tf.train.import_meta_graph(save_path + '.meta', clear_devices=True)
        G.device('/cpu:0')  # Run graph on CPU so play can be done without taking GPU resources

        # Get input/output tensors
        X = G.get_tensor_by_name('X:0')
        Y = G.get_tensor_by_name('Y:0')

        # Initialize TF session
        sess_config = tf.ConfigProto(device_count={'CPU': 1, 'GPU': 0})
        with tf.Session(config=sess_config) as sess:
            print('Reloading parameters...')
            saver.restore(sess, save_path)

            # Iterate over episodes
            while True:
                obs = u.preprocess_image(env.reset())
                for i in range(3):
                    replay_memory.add_frame(
                        np.zeros((160 // DOWNSAMPLE, 160 // DOWNSAMPLE)))
                replay_memory.add_frame(obs)

                # Iterate over frames
                done = False
                while not done:
                    # Feed state into DQN
                    s = np.stack(
                        [replay_memory.frames[i] for i in range(-4, 0)],
                        axis=-1).reshape(1, 160 // DOWNSAMPLE,
                                         160 // DOWNSAMPLE, 4)
                    y = sess.run(Y, feed_dict={X: s})

                    # Decide on action greedily
                    a = np.argmax(y) + 1

                    # Take action, observe environment, reward
                    obs, r, done, _ = env.step(a)
                    for i in range(STEPS_TO_SKIP):
                        obs, r, done_temp, _ = env.step(1)
                        if done_temp:
                            done = True
                    env.render()

                    # Add new frame to replay memory
                    replay_memory.add_frame(u.preprocess_image(obs))

                q = input('Play again? ')
                if q in ['', 'y', 'Y']:
                    pass
                else:
                    env.render(close=True)
                    break
def __init__(self, num_inputs, action_space, args, writer=None, outdir=None,
             device=torch.device("cpu")):
    self.index = 0
    self.gamma = args.gamma
    self.tau = args.tau
    self.alpha = args.alpha
    self.writer = writer
    self.outdir = outdir
    self.batch_size = args.batch_size
    self.save_freq = args.save_freq
    self.policy_type = args.policy
    self.target_update_interval = args.target_update_interval
    self.automatic_entropy_tuning = args.automatic_entropy_tuning
    self.device = device

    self.replay_buffer = utils.ReplayMemory(capacity=args.buffer_max_size,
                                            seed=args.seed)

    self.critic = QNetwork(num_inputs, action_space.shape[0],
                           args.hidden_size).to(device=self.device)
    self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=args.lr)
    self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                  args.hidden_size).to(self.device)
    hard_update(self.critic_target, self.critic)

    if self.policy_type == "Gaussian":
        # Target Entropy = -dim(A) (e.g. -6 for HalfCheetah-v2) as given in the paper
        if self.automatic_entropy_tuning is True:
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = torch.optim.Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                     args.hidden_size, action_space).to(self.device)
        self.policy_optim = torch.optim.Adam(self.policy.parameters(), lr=args.lr)
    else:
        self.alpha = 0
        self.automatic_entropy_tuning = False
        self.policy = DeterministicPolicy(num_inputs, action_space.shape[0],
                                          args.hidden_size, action_space).to(self.device)
        self.policy_optim = torch.optim.Adam(self.policy.parameters(), lr=args.lr)
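# NOTE: `hard_update` above copies the online critic's weights into the target
# critic at construction time; during SAC training the target is usually tracked
# with a Polyak (soft) update driven by `tau`. The repo's own helpers are not
# shown here, so the following is only a minimal sketch of the standard PyTorch
# parameter-copy idiom, not the actual implementation used above.
import torch


def hard_update(target, source):
    # copy every parameter of `source` into `target` (used once at init)
    with torch.no_grad():
        for t, s in zip(target.parameters(), source.parameters()):
            t.copy_(s)


def soft_update(target, source, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target
    with torch.no_grad():
        for t, s in zip(target.parameters(), source.parameters()):
            t.mul_(1.0 - tau).add_(tau * s)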
def __init__(self, model_name, env):
    super(DQNAgent, self).__init__(model_name, env)
    self.episode = self.configs.episode
    self.batch_size = self.configs.batch_size
    self.gamma = self.configs.gamma
    self.eps_start = self.configs.eps_start
    self.eps_end = self.configs.eps_end
    self.eps_decay = self.configs.eps_decay
    self.target_update_episode = self.configs.target_update_episode
    self.model_path = self.configs.save_path
    self.save_episode = self.configs.save_episode
    self.plot_episode = self.configs.plot_episode

    self.policy_net = models.DQN(self.configs, env).to(self.device)
    self.target_net = models.DQN(self.configs, env).to(self.device)
    self.load_model(self.model_path)

    self.optimizer = optim.Adam(
        self.policy_net.parameters(),
        lr=self.configs.optimizer_lr,
        betas=(self.configs.optimizer_beta1, self.configs.optimizer_beta2),
        eps=self.configs.optimizer_eps,
        weight_decay=self.configs.optimizer_weight_decay)

    self.memory = utils.ReplayMemory(10000)
    self.num_random_choose = 0

    self.num_choice_per_dim = self.configs.num_choice_per_dim
    self.action_dim = env.action_spec().shape
    self.action_min = env.action_spec().minimum
    self.action_max = env.action_spec().maximum
    self.action_space = utils.enumerate(self.num_choice_per_dim,
                                        self.action_min, self.action_max)
def play(G, save_path):
    # Initialize the Pong gym environment, set seeds
    env = gym.make('Pong-v0')
    replay_memory = u.ReplayMemory()
    with G.as_default():
        # Get input/output tensors
        X = G.get_tensor_by_name('X:0')
        Y = G.get_tensor_by_name('Y:0')
        saver = tf.train.Saver(var_list=None, max_to_keep=5)

        # Initialize TF session
        with tf.Session() as sess:
            print('Reloading parameters...')
            saver.restore(sess, save_path)

            # Iterate over episodes
            while True:
                obs = u.preprocess_image(env.reset())
                for i in range(3):
                    replay_memory.add_frame(
                        np.zeros((160 // DOWNSAMPLE, 160 // DOWNSAMPLE)))
                replay_memory.add_frame(obs)

                # Iterate over frames
                done = False
                while not done:
                    # Feed state into DQN
                    s = np.stack(
                        [replay_memory.frames[i] for i in range(-4, 0)],
                        axis=-1).reshape(1, 160 // DOWNSAMPLE,
                                         160 // DOWNSAMPLE, 4)
                    y = sess.run(Y, feed_dict={X: s})

                    # Decide on action greedily
                    a = np.argmax(y) + 1

                    # Take action, observe environment, reward
                    obs, r, done, _ = env.step(a)
                    for i in range(STEPS_TO_SKIP):
                        obs, r, done_temp, _ = env.step(1)
                        if done_temp:
                            done = True
                    env.render()

                    # Add new state/reward to replay memory
                    replay_memory.add_frame(u.preprocess_image(obs))

                q = input('Play again? ')
                if q in ['', 'y', 'Y']:
                    pass
                else:
                    env.render(close=True)
                    break
def __init__(self, agent_id, num_countries, replay_capacity, num_node_actions,
             num_global_actions, gamma, device):
    # more node features because we will add indicators for the agent's own
    # country and its allied countries
    num_node_features, num_edge_features = 4, 7

    # create two DQNs for stable learning
    self.policy_net = net.RecurGraphAgent(num_node_features, num_edge_features,
                                          num_node_actions,
                                          num_global_actions).to(device)
    self.target_net = net.RecurGraphAgent(num_node_features, num_edge_features,
                                          num_node_actions,
                                          num_global_actions).to(device)
    self.optimizer = torch.optim.RMSprop(self.policy_net.parameters())
    self.memory = utils.ReplayMemory(replay_capacity)

    # ensure policy and target networks start with identical weights
    self.target_net.load_state_dict(self.policy_net.state_dict())

    self.agent_id = agent_id
    self.num_countries = num_countries
    self.num_node_actions = num_node_actions
    self.num_global_actions = num_global_actions
    self.gamma = gamma
    self.device = device
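# NOTE: several of these agents construct `utils.ReplayMemory(capacity)` and
# later call `push`/`sample` on it (see the DQN training loops further below),
# but the class itself is not included in this collection. The following is a
# minimal sketch of the usual fixed-capacity ring buffer with uniform sampling,
# assuming a (state, action, next_state, reward) transition tuple; it is an
# illustration, not the repo's actual class.
import random
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    """Fixed-capacity ring buffer of transitions with uniform random sampling."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # overwrite the oldest transition once the buffer is full
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)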
def play(env, args, transpose=True, fps=30, zoom=None, callback=None,
         keys_to_action=None):
    """Allows one to play the game using the keyboard.

    To simply play the game use:

        play(gym.make("Pong-v4"))

    The above also works if env is wrapped, so it is particularly useful for
    verifying that frame-level preprocessing does not render the game
    unplayable.

    If you wish to plot real-time statistics as you play, you can use
    gym.utils.play.PlayPlot. Here's sample code for plotting the reward over
    the last 5 seconds of gameplay:

        def callback(obs_t, obs_tp1, action, rew, done, info):
            return [rew, ]
        plotter = PlayPlot(callback, 30 * 5, ["reward"])
        env = gym.make("Pong-v4")
        play(env, callback=plotter.callback)

    Arguments
    ---------
    env: gym.Env
        Environment to use for playing.
    transpose: bool
        If True the output of observation is transposed. Defaults to True.
    fps: int
        Maximum number of steps of the environment executed every second.
        Defaults to 30.
    zoom: float
        Make screen edge this many times bigger.
    callback: lambda or None
        If a callback is provided it will be executed after every step.
        It takes the following input:
            obs_t: observation before performing action
            obs_tp1: observation after performing action
            action: action that was executed
            rew: reward that was received
            done: whether the environment is done or not
            info: debug info
    keys_to_action: dict: tuple(int) -> int or None
        Mapping from keys pressed to action performed. For example, if
        pressing 'w' and the space bar at the same time should trigger
        action number 2, then keys_to_action would look like this:

            {
                # ...
                tuple(sorted((ord('w'), ord(' ')))): 2,
                # ...
            }

        If None, the default keys_to_action mapping for that env is used,
        if provided.
    """
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    data_list = []
    obs = env.reset(sess)
    rendered = env.env.render(mode='rgb_array')

    if keys_to_action is None:
        if hasattr(env.env, 'get_keys_to_action'):
            keys_to_action = env.env.get_keys_to_action()
        elif hasattr(env.env.unwrapped, 'get_keys_to_action'):
            keys_to_action = env.env.unwrapped.get_keys_to_action()
        else:
            assert False, env.env.spec.id + " does not have explicit key to action mapping, " + \
                "please specify one manually"
    relevant_keys = set(sum(map(list, keys_to_action.keys()), []))

    video_size = [rendered.shape[1], rendered.shape[0]]
    if zoom is not None:
        video_size = int(video_size[0] * zoom), int(video_size[1] * zoom)

    pressed_keys = []
    running = True
    env_done = True

    screen = pygame.display.set_mode(video_size)
    clock = pygame.time.Clock()

    count = 0
    num_traj = 0
    while running:
        if env_done and count > 0:
            env_done = False
            num_traj += 1
            obs = env.reset(sess)
            print(num_traj, count)

            # dump the recorded human trajectory as a pickled replay memory
            replay_mem = utils.ReplayMemory(len(data_list))
            for i in range(len(data_list)):
                action = data_list[i][0]
                obs = data_list[i][1]
                rew = data_list[i][2]
                terminal = data_list[i][3]
                replay_mem.add_experience(action=action, frame=obs[:, :, 0],
                                          reward=rew, terminal=terminal)
            pickle.dump(replay_mem,
                        open("human_" + args.env + "_" + str(num_traj) + ".pkl", "wb"),
                        protocol=4)
        else:
            action = keys_to_action.get(tuple(sorted(pressed_keys)), 0)
            obs, rew, env_done, terminal, frame = env.step(sess, action)
            data_list.append([action, obs, rew, terminal])
            count += 1

        if obs is not None:
            rendered = env.env.render(mode='rgb_array')
            display_arr(screen, rendered, transpose=transpose, video_size=video_size)

        # process pygame events
        for event in pygame.event.get():
            # test events, set key states
            if event.type == pygame.KEYDOWN:
                if event.key in relevant_keys:
                    pressed_keys.append(event.key)
                elif event.key == 27:  # ESC quits
                    running = False
            elif event.type == pygame.KEYUP:
                if event.key in relevant_keys:
                    pressed_keys.remove(event.key)
            elif event.type == pygame.QUIT:
                running = False
            elif event.type == VIDEORESIZE:
                video_size = event.size
                screen = pygame.display.set_mode(video_size)
                print(video_size)

        pygame.display.flip()
        clock.tick(fps)
    pygame.quit()
# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

logs = open(log_path, 'w')

transform = T.Compose([T.ToPILImage(), T.ToTensor()])

policy_net = dqn.DQN(n_angle, n_actions, hidden_layer1_size,
                     hidden_layer2_size).to(device)
target_net = dqn.DQN(n_angle, n_actions, hidden_layer1_size,
                     hidden_layer2_size).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = utils.ReplayMemory(100000)

steps_done = 0


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # exploit: pick the action with the largest predicted Q value
            return policy_net(state).max(1)[1].view(1, 1)
    else:
        # explore: pick a uniformly random action
        return torch.tensor([[random.randrange(n_actions)]], device=device,
                            dtype=torch.long)
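# NOTE: the counterpart to `select_action` is the batched Q-learning update;
# the `utils.optimize_model` called in the training loop below is not
# reproduced in this collection. The following is a minimal sketch of a
# standard DQN update against `policy_net`/`target_net`, assuming the
# ring-buffer ReplayMemory/Transition layout sketched earlier, a single (CPU)
# device, and an assumed discount constant GAMMA; the signature (including an
# explicit optimizer argument) is illustrative, not the repo's.
import torch
import torch.nn.functional as F

GAMMA = 0.999  # assumed discount factor


def optimize_model(policy_net, target_net, memory, optimizer, batch_size):
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    # mask out terminal transitions (next_state is None)
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s, a) for the actions that were actually taken
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # max_a' Q_target(s', a'), zero for terminal states
    next_state_values = torch.zeros(batch_size)
    with torch.no_grad():
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0]
    expected = reward_batch + GAMMA * next_state_values

    # Huber loss between predicted and bootstrapped targets
    loss = F.smooth_l1_loss(state_action_values, expected.unsqueeze(1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()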
def __init__(self, env_name, params):
    # initialize environment
    self.env = envs.make(env_name)
    self.env_name = env_name

    # save important experiment parameters for the training loop
    self.iterations = params["iterations"]
    self.mem_len = params["mem_len"]
    self.seed = params["seed"]
    self.render = params["render"]
    self.log_interval = params["log_interval"]
    self.warmup = params["warmup"]
    self.batch_size = params["batch_size"]
    self.save = params["save"]

    # initialize DDPG agent using experiment parameters from config file
    self.action_bound = self.env.action_bound[1]
    state_dim = self.env.observation_space
    action_dim = self.env.action_space
    hidden_dim = params["hidden_dim"]
    cuda = params["cuda"]
    network_settings = params["network_settings"]
    actor = ddpg.Actor(state_dim, hidden_dim, action_dim)
    target_actor = ddpg.Actor(state_dim, hidden_dim, action_dim)
    critic = utils.Critic(state_dim + action_dim, hidden_dim, 1)
    target_critic = utils.Critic(state_dim + action_dim, hidden_dim, 1)
    self.agent = ddpg.DDPG(actor, target_actor, critic, target_critic,
                           network_settings, GPU=cuda)

    # initialize Ornstein-Uhlenbeck noise for random action exploration
    ou_scale = params["ou_scale"]
    ou_mu = params["ou_mu"]
    ou_sigma = params["ou_sigma"]
    self.noise = utils.OUNoise(action_dim, scale=ou_scale, mu=ou_mu,
                               sigma=ou_sigma)
    self.noise.set_seed(self.seed)

    self.memory = utils.ReplayMemory(self.mem_len)
    self.pol_opt = torch.optim.Adam(actor.parameters())
    self.crit_opt = torch.optim.Adam(critic.parameters())

    # want to save the best policy
    self.best = None

    # send to GPU if flagged in experiment config file
    if cuda:
        self.Tensor = torch.cuda.FloatTensor
        self.agent = self.agent.cuda()
    else:
        self.Tensor = torch.Tensor

    if self.render:
        self.env.init_rendering()

    # initialize experiment logging. This wipes any previous file with the same name
    self.logging = params["logging"]
    if self.logging:
        self.directory = os.getcwd()
        filename = self.directory + "/data/ddpg.csv"
        with open(filename, "w") as csvfile:
            self.writer = csv.writer(csvfile)
            self.writer.writerow(["episode", "reward"])
            self.train()
    else:
        self.train()
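# NOTE: `utils.OUNoise` above is only constructed and seeded; its body is not
# part of this collection. The following is a minimal sketch of the usual
# Ornstein-Uhlenbeck exploration noise used with DDPG, assuming the
# scale/mu/sigma parameters seen in the constructor and an assumed
# mean-reversion rate `theta`; it is an illustration, not the repo's class.
import numpy as np


class OUNoise:
    """Temporally correlated exploration noise: dx = theta*(mu - x) + sigma*N(0, 1)."""

    def __init__(self, action_dim, scale=0.1, mu=0.0, sigma=0.2, theta=0.15):
        self.action_dim = action_dim
        self.scale = scale
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.reset()

    def set_seed(self, seed):
        np.random.seed(seed)

    def reset(self):
        # start the process at its mean
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state * self.scale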
def train():
    # Graph Part
    print("Graph initialization...")
    xdim = xtrim[1] - xtrim[0]
    ydim = ytrim[1] - ytrim[0]
    channel = 3
    num_action = env.action_space.n
    policy_net = NETWORK(ydim=ydim, xdim=xdim, channel=channel,
                         num_action=num_action, learning_rate=learning_rate,
                         batch_size=batch_size)
    target_net = NETWORK(ydim=ydim, xdim=xdim, channel=channel,
                         num_action=num_action, learning_rate=learning_rate,
                         batch_size=batch_size)
    policy_net.to(DEVICE)
    target_net.to(DEVICE)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    # Memory
    memory = utils.ReplayMemory(10000)

    # ETCs
    steps_done = 0
    episode_durations = []
    policy_net.float()
    target_net.float()

    print("Training Start.....")
    for episode in range(num_episodes):
        REWARD = 0
        previous_screenshot = utils.dimension_manipulation(
            env.reset()[xtrim[0]:xtrim[1], ytrim[0]:ytrim[1]])
        current_screenshot = previous_screenshot
        state = torch.from_numpy(
            current_screenshot - previous_screenshot).float().to(DEVICE)

        for t in count():
            # env.render()
            action = utils.select_action(state, steps_done, policy_net)
            observation, reward, done, _ = env.step(action.item())

            previous_screenshot = current_screenshot
            current_screenshot = utils.dimension_manipulation(
                observation[xtrim[0]:xtrim[1], ytrim[0]:ytrim[1]])

            if not done:
                next_status = torch.from_numpy(
                    current_screenshot - previous_screenshot).float().to(DEVICE)
                REWARD += reward
            else:
                next_status = None

            memory.push(state, action, next_status,
                        torch.tensor(float(t + 1)).to(DEVICE)[None])
            state = next_status

            utils.optimize_model(policy_net, target_net, memory, batch_size)

            if done:
                utils.optimize_model(policy_net, target_net, memory, batch_size)
                episode_durations.append(t + 1)
                utils.plot_durations(episode_durations)
                if REWARD != 0:
                    print("\n######## Episode " + str(episode))
                    print("Duration : " + str(t + 1))
                    print("REWARD : " + str(REWARD))
                    print("loss : " + str(policy_net.loss.item()))
                break

        if episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
def save_gif(gif_save_path, save_path):
    '''
    Loads network from the location of save_path, plays a game of Pong, and
    saves the rendered frames as a gif.
    '''
    # Initialize the Pong gym environment, set seeds
    env = gym.make('Pong-v0')
    replay_memory = u.ReplayMemory()
    G = tf.Graph()

    gifwriter = matplotlib.animation.ImageMagickFileWriter(fps=20)
    plt.ioff()
    fig = plt.figure('Pong')
    gifwriter.setup(fig, gif_save_path, dpi=100)

    with G.as_default():
        # Import TF graph
        saver = tf.train.import_meta_graph(save_path + '.meta', clear_devices=False)
        G.device('/gpu:0')

        # Get input/output tensors
        X = G.get_tensor_by_name('X:0')
        Y = G.get_tensor_by_name('Y:0')

        # Initialize TF session
        sess_config = tf.ConfigProto(device_count={'CPU': 1, 'GPU': 1})
        with tf.Session(config=sess_config) as sess:
            print('Reloading parameters...')
            saver.restore(sess, save_path)

            # Play a single episode
            obs = env.reset()
            plt.clf()
            fig.clf()
            plt.imshow(obs)
            gifwriter.grab_frame()

            obs = u.preprocess_image(obs)
            for i in range(3):
                replay_memory.add_frame(
                    np.zeros((160 // DOWNSAMPLE, 160 // DOWNSAMPLE)))
            replay_memory.add_frame(obs)

            # Iterate over frames
            done = False
            f = 0
            while not done:
                f += 1
                print('Frame {}'.format(f))

                # Feed state into DQN
                s = np.stack([replay_memory.frames[i] for i in range(-4, 0)],
                             axis=-1).reshape(1, 160 // DOWNSAMPLE,
                                              160 // DOWNSAMPLE, 4)
                y = sess.run(Y, feed_dict={X: s})

                # Decide on action greedily
                a = np.argmax(y) + 1

                # Take action, observe environment, reward
                obs, r, done, _ = env.step(a)
                plt.clf()
                fig.clf()
                plt.imshow(obs)
                gifwriter.grab_frame()
                for i in range(STEPS_TO_SKIP):
                    obs, r, done_temp, _ = env.step(1)
                    plt.clf()
                    fig.clf()
                    plt.imshow(obs)
                    gifwriter.grab_frame()
                    if done_temp:
                        done = True
                # env.render()

                # Add new frame to replay memory
                replay_memory.add_frame(u.preprocess_image(obs))

            # Save gif
            gifwriter.finish()
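# NOTE: all of these Pong scripts feed frames through `u.preprocess_image` and
# a DOWNSAMPLE constant before stacking them into a (160//DOWNSAMPLE,
# 160//DOWNSAMPLE, 4) state, but the helper itself is not included here. The
# following is a minimal sketch of a typical preprocessing step (crop the play
# field, subsample, binarize), assuming DOWNSAMPLE = 2 and Pong's 210x160 RGB
# frames; the background colour values follow the common Pong-v0 recipe and
# are assumptions, not the repo's exact implementation.
import numpy as np

DOWNSAMPLE = 2  # assumed; the scripts above only reference the constant


def preprocess_image(obs):
    """Crop the 210x160x3 Pong frame to the 160x160 play field, subsample, binarize."""
    cropped = obs[34:194]                              # drop the score bar and bottom border
    small = cropped[::DOWNSAMPLE, ::DOWNSAMPLE, 0]     # keep one channel, subsample pixels
    binary = (small != 144) & (small != 109)           # background -> 0, paddles/ball -> 1
    return binary.astype(np.float32)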
def train(G, max_episodes, save_path):
    '''
    Trains a DQN to play Pong. Periodically saves progress to a checkpoint
    file, and saves plots of several metrics to monitor training.

    Input:
        G: computational graph by which the action-value function Q is calculated.
        max_episodes: the maximum number of episodes to run before terminating training.
        save_path: a file path to the location of the checkpoint files.

    Output:
        none
    '''
    # Define some constants, lists, metrics, etc
    action_map = {1: 'x', 2: '^', 3: 'v'}  # Stay, up, down
    replay_memory = u.ReplayMemory(max_exp_len=REPLAY_MEM_LEN)
    step_list = []
    reward_list = []
    avg_reward = None
    val_Q_list = []
    episode_length_list = []
    episode_time_list = []
    avg_episode_length_list = []
    avg_episode_length = None
    episode_score_list = {'player': [], 'computer': []}
    X_val = u.load_validation_screens()

    # Initialize the Pong gym environment, set seeds
    env = gym.make('Pong-v0')
    np.random.seed(SEED)
    tf.set_random_seed(SEED)
    plt.ioff()

    # Initialize computational graph
    with G.as_default():
        # Get input/output tensors
        X = G.get_tensor_by_name('X:0')
        Y = G.get_tensor_by_name('Y:0')
        Q = G.get_tensor_by_name('Q:0')
        A = G.get_tensor_by_name('A:0')
        L = G.get_tensor_by_name('L:0')
        LR = G.get_tensor_by_name('LR:0')
        train_op = G.get_operation_by_name('TrainOp')
        saver = tf.train.Saver()

        # Initialize TF session
        with tf.Session() as sess:
            # Reload/initialize variables
            if RELOAD_PARAMETERS:
                print('Reloading from last checkpoint...')
                saver.restore(sess, save_path)
            else:
                print('Initializing variables...')
                sess.run(tf.global_variables_initializer())

            # Iterate over episodes
            global_steps = 0
            for episode in range(max_episodes):
                tic = time.time()
                obs = u.preprocess_image(env.reset())
                for i in range(3):
                    replay_memory.add_frame(
                        np.zeros((160 // DOWNSAMPLE, 160 // DOWNSAMPLE),
                                 dtype=bool))
                replay_memory.add_frame(obs)

                # Iterate over frames
                done = False
                frame = 0
                episode_score = [0, 0]
                while not done:
                    if global_steps >= OBSERVE_STEPS:
                        # Feed state into DQN
                        s = np.stack(
                            [replay_memory.frames[i] for i in range(-4, 0)],
                            axis=-1).reshape(1, 160 // DOWNSAMPLE,
                                             160 // DOWNSAMPLE, 4)
                        y = sess.run(Y, feed_dict={X: s})

                        # Decide on action (epsilon-greedy with annealed epsilon)
                        epsilon = max(
                            MAX_EPSILON * (1 - global_steps / EPSILON_ANNEALING_STEPS),
                            MIN_EPSILON)
                        if np.random.rand() < epsilon:
                            a = np.random.choice([1, 2, 3])
                        else:
                            a = np.argmax(y) + 1
                    else:
                        a = np.random.choice([1, 2, 3])

                    # Take action, observe environment, reward
                    obs, r, done, _ = env.step(a)
                    r_sum = r
                    for i in range(STEPS_TO_SKIP):
                        obs, r, done_temp, _ = env.step(1)
                        r_sum += r
                        if done_temp:
                            done = True
                    if r_sum > 0:
                        episode_score[0] += int(r_sum)
                    elif r_sum < 0:
                        episode_score[1] -= int(r_sum)

                    # Add new state/reward to replay memory
                    replay_memory.add_frame(u.preprocess_image(obs))
                    experience = (np.stack(list(replay_memory.frames),
                                           axis=-1).astype(bool), a, r_sum, done)
                    replay_memory.add_exp(experience)

                    # Do training batch update
                    if global_steps >= OBSERVE_STEPS:
                        S, A_, R, D = replay_memory.sample(BATCH_SIZE)
                        y2 = sess.run(Y, feed_dict={X: S[:, :, :, -4:]})
                        q = R + (1 - D) * GAMMA * np.max(y2, axis=1)
                        _, batch_loss = sess.run(
                            [train_op, L],
                            feed_dict={
                                X: S[:, :, :, -5:-1],
                                Q: q,
                                A: (A_ - 1),
                                LR: LEARNING_RATE
                            })
                        if np.isnan(batch_loss):
                            print('nan error, exiting training')
                            exit()
                        elif np.mean(np.max(y2, axis=-1)) > 1e2:
                            print('unstable Q value, exiting training')
                            exit()

                        # Print updates
                        print(
                            'Episode: {}/{},\tframe: {},\tscore: {},\t<max(Q)>: {:.3e},\n'
                            'max(Q): {:.3e},\taction: {},\tcurrent std(Q)/mean(Q): {:.3e}'
                            .format(episode + 1, max_episodes,
                                    (frame + 1) * (STEPS_TO_SKIP + 1),
                                    episode_score, np.mean(np.max(y2, axis=-1)),
                                    np.max(y), action_map[a],
                                    np.std(y) / np.mean(y)))

                        # Plot frame-by-frame metrics
                        if avg_reward is None:
                            avg_reward = r_sum
                        else:
                            avg_reward = (1 - np.exp(-1 / 500)) * r_sum + \
                                np.exp(-1 / 500) * avg_reward
                        if global_steps % PLOT_EVERY_N_STEPS == 0:
                            step_list.append(global_steps)
                            reward_list.append(10 * avg_reward)
                            y_val = sess.run(Y, feed_dict={X: X_val})
                            val_Q_list.append(np.mean(np.max(y_val, axis=-1)))
                            u.plot_metrics(step_list, 'PongMetrics', 'Pong Metrics',
                                           'Global step', '',
                                           (val_Q_list, 'Validation <max(Q)>'),
                                           (reward_list, '10*<R>'))
                    else:
                        print('Observation step {}/{}'.format(global_steps,
                                                              OBSERVE_STEPS))

                    # Update state variables
                    global_steps += 1
                    frame += 1

                # Save parameters at end of episode, plot episode metrics
                print('Saving parameters...')
                saver.save(sess, SAVE_PATH)
                episode_length_list.append(frame * (STEPS_TO_SKIP + 1) / 1000)
                if avg_episode_length is None:
                    avg_episode_length = frame * (STEPS_TO_SKIP + 1)
                else:
                    avg_episode_length = (1 - np.exp(-1 / 10)) * frame * \
                        (STEPS_TO_SKIP + 1) + np.exp(-1 / 10) * avg_episode_length
                avg_episode_length_list.append(avg_episode_length / 1000)
                toc = time.time()
                episode_time_list.append((toc - tic) / 60)
                episode_score_list['player'].append(episode_score[0])
                episode_score_list['computer'].append(episode_score[1])
                u.plot_metrics(range(episode + 1), 'EpisodeLength', 'Episode Length',
                               'Episode', 'Steps/1000',
                               (episode_length_list, 'Steps/episode'),
                               (avg_episode_length_list, 'Average'))
                u.plot_metrics(range(episode + 1), 'EpisodeScore', 'Episode Score',
                               'Episode', 'Score',
                               (episode_score_list['player'], 'Player'),
                               (episode_score_list['computer'], 'Computer'))
                u.plot_metrics(range(episode + 1), 'EpisodeTime', 'Episode time',
                               'Episode', 'Time (min)',
                               (episode_time_list, 'Episode time'))
for agent_start in agent_start_list:
    reward_mean_dict[agent_start] = 0
    reward_std_dict[agent_start] = 0
    step_counts_mean_dict[agent_start] = 0
    reward_max_dict[agent_start] = 0
    reward_accum_dict[agent_start] = 0
    step_counts_min_dict[agent_start] = 0
    test_reward_mean_dict[agent_start] = 0
    test_reward_std_dict[agent_start] = 0
    test_step_counts_mean_dict[agent_start] = 0
    test_reward_max_dict[agent_start] = 0
    test_reward_accum_dict[agent_start] = 0
    test_step_counts_min_dict[agent_start] = 0

memory = utils.ReplayMemory(10000)
test_memory = utils.ReplayMemory(1)  # dummy replay memory for test (to reuse code)
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

steps_done = 0
steps_done_test = 0
min_reward = round(-env.steps_remaining * 0.1, 1)
global_reward_max = min_reward
test_global_reward_max = min_reward
total_train_reward_accum = 0
total_test_reward_accum = 0
eval_ep_batch = 10

for epoch in range(EPOCHS):
def agent_training(agent_file_path, agent_file_name, fig_path,
                   num_steps_train_total=5000):
    # training parameters
    num_epochs = 5
    num_steps_train_epoch = num_steps_train_total / num_epochs  # steps per epoch of training
    num_steps_test = 100
    update_frequency = 10  # step frequency of model training/updates

    epsilon = 0.15  # fraction of the time we perform a random action, to help exploration
    epsilon_steps = 1000  # decay steps
    epsilon_min = 0.1
    epsilon_rate = (epsilon - epsilon_min) / epsilon_steps

    # memory settings
    max_memory_size = 10000
    min_memory_size = 60  # number needed before model training starts

    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True, force_fps=True,
              state_preprocessor=process_state)
    my_agent = init_agent(env)
    memory = utils.ReplayMemory(max_memory_size, min_memory_size)
    env.init()

    # Logging configuration and figure plotting
    logging.basicConfig(filename='../learning.log', filemode='w',
                        level=logging.DEBUG, format='%(levelname)s:%(message)s')
    logging.info('========================================================')
    logging.info('Training started for total training steps: '
                 + str(num_steps_train_total) + '.\n')

    learning_rewards = [0]
    testing_rewards = [0]

    for epoch in range(1, num_epochs + 1):
        steps, num_episodes = 0, 0
        losses, rewards = [], []
        env.display_screen = False

        # training loop
        while steps < num_steps_train_epoch:
            episode_reward = 0.0
            my_agent.start_episode()

            while env.game_over() == False and steps < num_steps_train_epoch:
                state = env.getGameState()
                reward, action = my_agent.act(state, epsilon=epsilon)
                memory.add([state, action, reward, env.game_over()])

                if steps % update_frequency == 0:
                    loss = memory.train_agent_batch(my_agent)
                    if loss is not None:
                        losses.append(loss)
                        epsilon = max(epsilon_min, epsilon - epsilon_rate)

                episode_reward += reward
                steps += 1
                if steps < num_steps_train_epoch:
                    learning_rewards.append(episode_reward)

            if num_episodes % 5 == 0:
                logging.info("Episode {:01d}: Reward {:0.1f}".format(num_episodes,
                                                                     episode_reward))

            rewards.append(episode_reward)
            num_episodes += 1
            my_agent.end_episode()

        logging.info("Train Epoch {:02d}: Epsilon {:0.4f} | Avg. Loss {:0.3f} | Avg. Reward {:0.3f}\n"
                     .format(epoch, epsilon, np.mean(losses),
                             np.sum(rewards) / num_episodes))

        steps, num_episodes = 0, 0
        losses, rewards = [], []

        # display the screen
        # env.display_screen = True
        # slow it down so we can watch it fail!
        # env.force_fps = True

        # testing loop
        while steps < num_steps_test:
            episode_reward = 0.0
            my_agent.start_episode()

            while env.game_over() == False and steps < num_steps_test:
                state = env.getGameState()
                reward, action = my_agent.act(state, epsilon=0.05)
                episode_reward += reward
                testing_rewards.append(testing_rewards[-1] + reward)
                steps += 1

                # done watching after 500 steps
                if steps > 500:
                    env.display_screen = False

            if num_episodes % 5 == 0:
                logging.info("Episode {:01d}: Reward {:0.1f}".format(num_episodes,
                                                                     episode_reward))

            if steps < num_steps_test:
                testing_rewards.append(episode_reward)

            rewards.append(episode_reward)
            num_episodes += 1
            my_agent.end_episode()

        logging.info("Test Epoch {:02d}: Best Reward {:0.3f} | Avg. Reward {:0.3f}\n"
                     .format(epoch, np.max(rewards), np.sum(rewards) / num_episodes))

    logging.info("Training complete.\n\n")
    plot_figure(fig_path, learning_rewards, 'reward', 'reward_in_training',
                num_steps_train_total)
    plot_figure(fig_path, testing_rewards, 'reward', 'reward_in_testing',
                num_steps_train_total)
    save_agent(my_agent, agent_file_path, agent_file_name)
import sys
sys.path.append('.')

import gym
import torch
from torch import optim

import agent, train, utils

# hyperparameters
replay_mem_size = int(1e6)
mini_batch_size = 32
num_episodes = int(2e3)

agt = agent.DQNAgent()
replay_memory = utils.ReplayMemory(replay_mem_size, mini_batch_size)
obs_history = utils.ObsHistory()
optimizer = optim.RMSprop(agt.qnet.parameters())

env = gym.envs.make('PongNoFrameskip-v4')

for episode in range(num_episodes):  # loop over episodes
    obs_init = env.reset()  # reset environment to start new episode
    obs_history.reset(obs_init)  # reset observations for new episode
    done = False
    print('Episode #{}'.format(episode))

    if episode % 10 == 9:
        torch.save(agt.qnet.state_dict(), 'dqn_agt.pt')

    cumulative_loss = 0
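# NOTE: `utils.ObsHistory` above is only reset per episode and its body is not
# included here. The following is a minimal sketch of the usual frame-history
# helper that keeps the last few observations to form the DQN state; the
# `store`/`get_state` method names and the history length of 4 are assumptions
# for illustration, not the repo's actual API.
from collections import deque
import numpy as np


class ObsHistory:
    """Keeps the most recent `length` observations and stacks them into one state."""

    def __init__(self, length=4):
        self.length = length
        self.frames = deque(maxlen=length)

    def reset(self, obs_init):
        # fill the history with the first observation of the episode
        self.frames.clear()
        for _ in range(self.length):
            self.frames.append(obs_init)

    def store(self, obs):
        self.frames.append(obs)

    def get_state(self):
        # stack along a new leading axis: (length, H, W, C) for raw Atari frames
        return np.stack(self.frames, axis=0)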
def train(G, max_episodes, save_path):
    '''
    Trains a DQN to play Pong.
    '''
    # Define some constants, lists, metrics, etc
    action_map = {1: 'x', 2: '^', 3: 'v'}  # Stay, up, down
    replay_memory = u.ReplayMemory(max_exp_len=REPLAY_MEM_LEN)
    step_list = []
    reward_list = []
    avg_reward = None
    val_Q_list = []
    episode_length_list = []
    avg_episode_length_list = []
    episode_score_list = {'player': [], 'computer': []}
    X_val = u.load_validation_screens()

    # Initialize the Pong gym environment, set seeds
    env = gym.make('Pong-v0')
    np.random.seed(SEED)
    tf.set_random_seed(SEED)
    plt.ioff()

    # Initialize computational graph
    with G.as_default():
        # Get input/output tensors
        X = G.get_tensor_by_name('X:0')
        Y = G.get_tensor_by_name('Y:0')

        # Append loss function to graph
        Q = tf.placeholder(dtype=tf.float32, shape=[None], name='Q')
        A = tf.placeholder(dtype=tf.int32, shape=[None], name='A')
        mask = tf.one_hot(A, depth=3, dtype=tf.float32, axis=-1)
        L = tf.reduce_mean(tf.square(tf.reduce_sum(mask * Y, axis=-1) - Q),
                           name='L')

        # Define optimizer, training op, gradient clipping, etc.
        if not RELOAD_PARAMETERS:
            optimizer = tf.train.AdamOptimizer(LEARNING_RATE, name='Adam')
        else:
            optimizer = G.get_operation_by_name('Adam')
        saver = tf.train.Saver()

        # Initialize TF session
        with tf.Session() as sess:
            # Reload/initialize variables
            if RELOAD_PARAMETERS:
                print('Reloading from last checkpoint...')
                saver.restore(sess, save_path)
            else:
                print('Initializing variables...')
                gradients, variables = zip(*optimizer.compute_gradients(L))
                train_op = optimizer.apply_gradients(zip(gradients, variables))
                sess.run(tf.global_variables_initializer())

            # Iterate over episodes
            global_steps = 0
            for episode in range(max_episodes):
                obs = u.preprocess_image(env.reset())
                for i in range(3):
                    replay_memory.add_frame(
                        np.zeros((160 // DOWNSAMPLE, 160 // DOWNSAMPLE)))
                replay_memory.add_frame(obs)

                # Iterate over frames
                done = False
                frame = 0
                episode_score = [0, 0]
                while not done:
                    if global_steps >= OBSERVE_STEPS:
                        # Feed state into DQN
                        s = np.stack(
                            [replay_memory.frames[i] for i in range(-4, 0)],
                            axis=-1).reshape(1, 160 // DOWNSAMPLE,
                                             160 // DOWNSAMPLE, 4)
                        y = sess.run(Y, feed_dict={X: s})

                        # Decide on action (epsilon-greedy with annealed epsilon)
                        epsilon = max(
                            MAX_EPSILON * (1 - global_steps / EPSILON_ANNEALING_STEPS),
                            MIN_EPSILON)
                        if np.random.rand() < epsilon:
                            a = np.random.choice([1, 2, 3])
                        else:
                            a = np.argmax(y) + 1
                    else:
                        a = np.random.choice([1, 2, 3])

                    # Take action, observe environment, reward
                    obs, r, done, _ = env.step(a)
                    r_sum = r
                    for i in range(STEPS_TO_SKIP):
                        obs, r, done_temp, _ = env.step(1)
                        r_sum += r
                        if done_temp:
                            done = True
                    if r_sum > 0:
                        episode_score[0] += r_sum
                    elif r_sum < 0:
                        episode_score[1] -= r_sum

                    # Add new state/reward to replay memory
                    replay_memory.add_frame(u.preprocess_image(obs))
                    experience = (np.stack(list(replay_memory.frames), axis=-1),
                                  a, r_sum, done)
                    replay_memory.add_exp(experience)

                    # Do training batch update
                    if global_steps >= OBSERVE_STEPS:
                        S, A_, R, D = replay_memory.sample(BATCH_SIZE)
                        y2 = sess.run(Y, feed_dict={X: S[:, :, :, -4:]})
                        q = R + (1 - D) * GAMMA * np.max(y2, axis=1)
                        _, batch_loss = sess.run([train_op, L],
                                                 feed_dict={
                                                     X: S[:, :, :, -5:-1],
                                                     Q: q,
                                                     A: (A_ - 1)
                                                 })
                        if np.isnan(batch_loss):
                            print('nan error, exiting training')
                            exit()
                        elif np.mean(np.max(y2, axis=-1)) > 1e2:
                            print('unstable Q value, exiting training')
                            exit()

                        # Print updates
                        print(
                            'Episode: {}/{},\tframe: {},\treward: {},\t<max(Q)>: {:.3e},\n'
                            'max(Q): {:.3e},\taction: {},\tcurrent std(Q)/mean(Q): {:.3e}'
                            .format(episode + 1, max_episodes, frame + 1,
                                    int(r_sum), np.mean(np.max(y2, axis=-1)),
                                    np.max(y), action_map[a],
                                    np.std(y) / np.mean(y)))

                        # Plot frame-by-frame metrics
                        if avg_reward is None:
                            avg_reward = r_sum
                        else:
                            avg_reward = (1 - np.exp(-1 / 500)) * r_sum + \
                                np.exp(-1 / 500) * avg_reward
                        if global_steps % PLOT_EVERY_N_STEPS == 0:
                            step_list.append(global_steps)
                            reward_list.append(10 * avg_reward)
                            y_val = sess.run(Y, feed_dict={X: X_val})
                            val_Q_list.append(np.mean(np.max(y_val, axis=-1)))
                            u.plot_metrics(step_list, 'PongMetrics', 'Pong Metrics',
                                           'Global step', '',
                                           (val_Q_list, 'Validation <max(Q)>'),
                                           (reward_list, '10*<R>'))
                    else:
                        print('Observation step {}/{}'.format(global_steps,
                                                              OBSERVE_STEPS))

                    # Update state variables
                    global_steps += 1
                    frame += 1

                # Save parameters at end of episode, plot episode metrics
                saver.save(sess, SAVE_PATH)
                episode_length_list.append(frame * (STEPS_TO_SKIP + 1) / 1000)
                if episode == 0:
                    avg_episode_length = frame * (STEPS_TO_SKIP + 1)
                else:
                    avg_episode_length = (1 - np.exp(-1 / 10)) * frame * \
                        (STEPS_TO_SKIP + 1) + np.exp(-1 / 10) * avg_episode_length
                avg_episode_length_list.append(avg_episode_length / 1000)
                episode_score_list['player'].append(episode_score[0])
                episode_score_list['computer'].append(episode_score[1])
                u.plot_metrics(range(episode + 1), 'EpisodeLength', 'Episode Length',
                               'Episode', 'Length/1000',
                               (episode_length_list, 'Episode length'),
                               (avg_episode_length_list, 'Average'))
                u.plot_metrics(range(episode + 1), 'EpisodeScore', 'Episode Score',
                               'Episode', 'Score',
                               (episode_score_list['player'], 'Player'),
                               (episode_score_list['computer'], 'Computer'))