class Game(object):
    """Drives the full DQN lifecycle: buffer warm-up, training, evaluation.

    Relies on the module-level ``agent`` object for argument parsing, session
    creation, action selection, training steps, and model persistence.
    """

    def __init__(self):
        # Share parsed CLI options between the instance and a local alias.
        self.args = args = agent.parse_args()
        self.ep = EnvPool(args.env, self.args.env_size)
        # eps[0]: annealed exploration schedule used during training.
        # eps[1]: constant 0 epsilon used during evaluation (greedy policy).
        self.eps = [
            MultiStageEpsilon([
                LinearAnnealEpsilon(1.0, 0.1, int(1e6)),
                LinearAnnealEpsilon(0.1, 0.05, int(1e7 - 1e6))
            ]),
            0
        ]
        self.replay = ReplayBuffer(args.replay_buffer_size)
        # 84*84*4 bytes per stacked frame, two frames (s, s') plus ~8 bytes of
        # bookkeeping per transition — rough upper bound on buffer memory.
        main_logger.info("Replay Buffer Max Size: {}B".format(
            pretty_num(args.replay_buffer_size * (84 * 84 * 4 * 2 + 8), True)))
        self.sess = agent.make_session()
        # Enter the session context manually; closed implicitly on process exit.
        self.sess.__enter__()
        agent.setup(self.ep.action_num, self.replay)
        self.train_epi = 0
        self.max_reward = agent.score

    def random(self):
        """Fill half the replay buffer with transitions from a random policy."""
        random_step = self.args.replay_buffer_size // 2
        obs = self.ep.reset()
        with tqdm(total=random_step, desc="random", ascii=True) as t:
            while t.n < random_step:
                action, (obs_, reward, done, info) = self.ep.random()
                # Plain loop (not a side-effect list comprehension).
                for i in range(self.ep.size):
                    self.replay.add(obs[i], action[i], reward[i],
                                    float(done[i]), obs_[i])
                obs, info = self.ep.auto_reset()
                t.update(self.ep.size)
        total_epi = sum(len(info[i]['rewards']) for i in range(self.ep.size))
        mean_reward = np.mean([
            np.mean(info[i]['rewards'])
            for i in range(self.ep.size) if info[i]['rewards']
        ])
        record = Record()
        record.add_key_value('Phase', 'Random')
        record.add_key_value('Episodes', pretty_num(total_epi))
        record.add_key_value('Mean Reward', np.round(mean_reward, 2))
        main_logger.info("\n" + record.dumps())
        # Seed the best-so-far score if the agent had none saved.
        if not self.max_reward:
            self.max_reward = mean_reward

    def train(self):
        """Run one training iteration of 250k environment steps."""
        train_step = 250000
        self.ep.reset_state()
        obs = self.ep.reset()
        with tqdm(total=train_step, desc="Train", ascii=True) as t:
            while t.n < train_step:
                # Global step index feeds the annealed epsilon schedule.
                action = agent.take_action(
                    obs, self.eps[0].get(self.train_epi * train_step + t.n))
                obs_, reward, done, info = self.ep.step(action)
                for i in range(self.ep.size):
                    self.replay.add(obs[i], action[i], reward[i],
                                    float(done[i]), obs_[i])
                obs, info = self.ep.auto_reset()
                if t.n % self.args.target_update_freq == 0:
                    agent.update_target()
                if t.n % self.args.learning_freq == 0:
                    agent.train(self.ep.size)
                t.update(self.ep.size)
        self.train_epi += 1
        completion = np.round(self.train_epi / self.args.num_iters, 2)
        total_epi = sum(len(info[i]['rewards']) for i in range(self.ep.size))
        # Mean over the most recent 100 episodes of each env in the pool.
        mean_reward = np.mean([
            np.mean(info[i]['rewards'][-100:])
            for i in range(self.ep.size) if info[i]['rewards']
        ])
        record = Record()
        record.add_key_value('Phase', 'Train')
        record.add_key_value('% Completion', completion)
        record.add_key_value('Episodes', pretty_num(total_epi))
        record.add_key_value(
            '% Exploration',
            np.round(self.eps[0].get(self.train_epi * train_step) * 100, 2))
        record.add_key_value('Reward (100 epi mean)', np.round(mean_reward, 2))
        main_logger.info("\n" + record.dumps())

    def test(self):
        """Evaluate the greedy policy; persist the model on improvement."""
        test_step = 200000
        self.ep.reset_state()
        obs = self.ep.reset()
        with tqdm(total=test_step, desc="Evaluation", ascii=True) as t:
            while t.n < test_step:
                # eps[1] == 0: fully greedy evaluation.
                action = agent.take_action(obs, self.eps[1])
                self.ep.step(action)
                obs, info = self.ep.auto_reset()
                t.update(self.ep.size)
        total_epi = sum(len(info[i]['rewards']) for i in range(self.ep.size))
        mean_reward = np.mean([
            np.mean(info[i]['rewards'])
            for i in range(self.ep.size) if info[i]['rewards']
        ])
        record = Record()
        record.add_key_value('Phase', 'Evaluation')
        record.add_key_value('Episodes', pretty_num(total_epi))
        record.add_key_value('Mean Reward', np.round(mean_reward, 2))
        main_logger.info("\n" + record.dumps())
        if self.max_reward < mean_reward:
            self.max_reward = mean_reward
            agent.score = mean_reward
            agent.save_model()

    def run(self):
        """Full schedule: warm-up, then alternate train/test, then cleanup."""
        self.random()
        for i in range(self.args.num_iters):
            self.train()
            self.test()
        self.exit()

    def exit(self):
        """Release environment resources."""
        self.ep.close()
class TrainDQN:
    """Deep Q-Network trainer for gym-like environments (TF1 graph mode)."""

    def __init__(self,
                 env,
                 sess,
                 learning_rate=1e-3,
                 seed=1234,
                 gamma=0.99,
                 max_eps=1.0,
                 min_eps=0.1,
                 render=False,
                 print_freq=20,
                 load_path=None,
                 save_path=None,
                 batch_size=32,
                 log_dir='logs/train',
                 max_steps=100000,
                 buffer_capacity=None,
                 max_episode_len=2000,
                 eps_decay_rate=-0.0001,
                 target_update_freq=1000,
                 ):
        """Trains an openai gym-like environment with deep q learning.

        Args:
            env: gym.Env where our agent resides
            sess: tf.Session used to run all graph operations
            learning_rate: Optimizer learning rate for the Q network
            seed: Random seed for reproducibility
            gamma: Discount factor
            max_eps: Starting exploration factor
            min_eps: Exploration factor to decay towards
            render: True to render the environment, else False
            print_freq: Displays logging information every 'print_freq' episodes
            load_path: (str) Path to load existing model from
            save_path: (str) Path to save model during training
            batch_size: Number of transitions sampled per gradient step
            log_dir: Directory for TensorBoard summaries
            max_steps: maximum number of times to sample the environment
            buffer_capacity: How many state, action, next state, reward tuples
                the replay buffer should store (defaults to max_steps // 2)
            max_episode_len: Maximum number of timesteps in an episode
            eps_decay_rate: lambda parameter in exponential decay for epsilon
            target_update_freq: Number of gradient updates between target
                network synchronizations
        """
        np.random.seed(seed)
        self.sess = sess
        self.env = env
        self.input_dim = env.observation_space.shape[0]
        self.output_dim = env.action_space.n
        self.max_steps = max_steps
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.eps_decay_rate = eps_decay_rate
        self.max_episode_len = max_episode_len
        self.render = render
        self.print_freq = print_freq
        self.rewards = []
        self.metrics = []
        self.save_path = save_path
        self.load_path = load_path
        self.batch_size = batch_size
        self.num_updates = 0
        self.gamma = gamma
        self.buffer = ReplayBuffer(capacity=max_steps // 2
                                   if buffer_capacity is None else buffer_capacity)
        self.target_update_freq = target_update_freq
        self.learning_rate = learning_rate
        with tf.variable_scope('q_network'):
            self.q_network = QNetworkBuilder(self.input_dim, self.output_dim,
                                             (64,))
        with tf.variable_scope('target_network'):
            self.target_network = QNetworkBuilder(self.input_dim,
                                                  self.output_dim, (64,))
        # Assign ops that copy q_network weights INTO target_network when run
        # (each `old` target variable is assigned the matching `new` q value).
        self.update_target_network = [old.assign(new) for (new, old) in
                                      zip(tf.trainable_variables('q_network'),
                                          tf.trainable_variables('target_network'))]
        if self.load_path is not None:
            self.load()
        self.add_summaries(log_dir)

    def add_summaries(self, log_dir):
        """Registers TensorBoard scalar summaries and the graph file writer."""
        tf.summary.scalar('Loss', self.q_network.loss, )
        tf.summary.scalar('Mean Estimated Value',
                          tf.reduce_mean(self.q_network.output_pred))
        # Merge all the summaries and write them out to log_dir
        self.merged = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter(log_dir, self.sess.graph)

    def learn(self):
        """Learns via Deep-Q-Networks (DQN)"""
        obs = self.env.reset()
        mean_reward = None
        total_reward = 0
        ep = 0
        ep_len = 0
        rand_actions = 0
        for t in range(self.max_steps):
            # Exponential epsilon decay, from
            # https://jaromiru.com/2016/10/03/lets-make-a-dqn-implementation/
            eps = self.min_eps + (self.max_eps - self.min_eps) * np.exp(
                self.eps_decay_rate * t)
            if self.render:
                self.env.render()
            # Take exploratory action with probability epsilon
            if np.random.uniform() < eps:
                action = self.env.action_space.sample()
                rand_actions += 1
            else:
                action = self.act(obs)
            # Execute action in emulator and observe reward and next state
            new_obs, reward, done, info = self.env.step(action)
            total_reward += reward
            # Store transition s_t, a_t, r_t, s_t+1 in replay buffer
            self.buffer.add((obs, action, reward, new_obs, done))
            # Perform learning step
            self.update()
            obs = new_obs
            ep_len += 1
            if done or ep_len >= self.max_episode_len:
                # print("Episode Length:", ep_len)
                # print(f"Episode {ep} Reward:{total_reward}")
                # print(f"Random Action Percent: {rand_actions/ep_len}")
                ep += 1
                ep_len = 0
                rand_actions = 0
                self.rewards.append(total_reward)
                total_reward = 0
                obs = self.env.reset()
                if ep % self.print_freq == 0 and ep > 0:
                    # NOTE(review): slice takes print_freq + 1 episodes, not
                    # print_freq — presumably intentional smoothing; confirm.
                    new_mean_reward = np.mean(
                        self.rewards[-self.print_freq - 1:])
                    print(f"-------------------------------------------------------")
                    print(f"Mean {self.print_freq} Episode Reward: {new_mean_reward}")
                    print(f"Exploration fraction: {eps}")
                    print(f"Total Episodes: {ep}")
                    print(f"Total timesteps: {t}")
                    print(f"-------------------------------------------------------")
                    # Add reward summary
                    summary = tf.Summary()
                    summary.value.add(tag=f'Mean {self.print_freq} Episode Reward',
                                      simple_value=new_mean_reward)
                    summary.value.add(tag=f'Epsilon', simple_value=eps)
                    self.train_writer.add_summary(summary, self.num_updates)
                    # Model saving inspired by Open AI Baseline implementation
                    if (mean_reward is None or new_mean_reward >= mean_reward) and self.save_path is not None:
                        print(f"Saving model due to mean reward increase:{mean_reward} -> {new_mean_reward}")
                        print(f'Location: {self.save_path}')
                        # save_path = f"{self.save_path}_model"
                        self.save()
                        mean_reward = new_mean_reward

    def act(self, observation):
        """Takes an action given the observation.

        Args:
            observation: observation from the environment

        Returns:
            integer index of the selected action
        """
        pred = self.sess.run([self.q_network.output_pred],
                             feed_dict={self.q_network.input_ph:
                                        np.reshape(observation,
                                                   (1, self.input_dim))})
        return np.argmax(pred)

    def update(self):
        """Applies gradients to the Q network computed from a minibatch of
        self.batch_size."""
        # Only start learning once the buffer can supply a full minibatch.
        if self.batch_size <= self.buffer.size():
            self.num_updates += 1
            # Sync the target network from the online Q network every
            # `target_update_freq` updates (the assign ops copy
            # q_network -> target_network; the original comment had the
            # direction reversed).
            if self.num_updates % self.target_update_freq == 0:
                self.sess.run(self.update_target_network)
                print('Updated Target Network')
            # Sample random minibatch of transitions from the replay buffer
            sample = self.buffer.sample(self.batch_size)
            states, action, reward, next_states, done = sample
            # Calculate discounted predictions for the subsequent states using
            # the target network (gamma is folded in here, so max_q below is
            # already discounted).
            next_state_pred = self.gamma * self.sess.run(
                self.target_network.output_pred,
                feed_dict={self.target_network.input_ph: next_states}, )
            # Adjust the targets for non-terminal states
            reward = reward.reshape(len(reward), 1)
            targets = reward
            # Indices of the non-terminal transitions in the batch.
            loc = np.argwhere(done != True).flatten()
            if len(loc) > 0:
                max_q = np.amax(next_state_pred, axis=1)
                # casting='unsafe' permits the float max_q to be added into
                # targets regardless of reward's dtype.
                targets[loc] = np.add(
                    targets[loc],
                    max_q[loc].reshape(max_q[loc].shape[0], 1),
                    casting='unsafe')
            # Update discount factor and train model on batch
            _, loss = self.sess.run(
                [self.q_network.opt, self.q_network.loss],
                feed_dict={self.q_network.input_ph: states,
                           self.q_network.target_ph: targets.flatten(),
                           self.q_network.action_indices_ph: action})

    def save(self):
        """Saves the Q network."""
        self.q_network.saver.save(self.sess, self.save_path)

    def load(self):
        """Loads the Q network."""
        self.q_network.saver.restore(self.sess, self.save_path)

    def plot_rewards(self, path=None):
        """Plots rewards per episode.

        Args:
            path: Location to save the rewards plot. If None, image will be
                displayed with plt.show()
        """
        plt.plot(self.rewards)
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        if path is None:
            plt.show()
        else:
            plt.savefig(path)
            plt.close('all')
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed returns None; attribute kept for interface stability.
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Record a transition and learn every UPDATE_EVERY steps."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Temporarily switch to eval mode for a deterministic forward pass.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # get targets
        self.qnetwork_target.eval()
        with torch.no_grad():
            Q_targets_next = torch.max(self.qnetwork_target.forward(next_states),
                                       dim=1, keepdim=True)[0]
        # FIX: use the `gamma` parameter instead of the module-global GAMMA,
        # which the original silently read, making the argument a no-op.
        # Terminal transitions (dones == 1) bootstrap nothing.
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # get outputs
        self.qnetwork_local.train()
        Q_expected = self.qnetwork_local.forward(states).gather(1, actions)

        # compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # clear gradients
        self.optimizer.zero_grad()
        # update weights local network
        loss.backward()
        # take one SGD step
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
# NOTE(review): this span is the body of an outer training loop — it reads `i`,
# `o_n`, `e`, `env`, the memories and the networks, all defined before this
# chunk. Confirm against the enclosing file before reusing.

# Log the running mean of each agent's recent rewards once per 1000 iterations.
for agent_index in range(3):
    summary_writer.add_summary(
        sess.run(reward_1000_op[agent_index],
                 {reward_1000[agent_index]:
                  np.mean(reward_100_list[agent_index])}),
        i // 1000)

agent1_action, agent2_action, agent3_action = get_agents_action(
    o_n, sess, noise_rate=0.2)

# Build the 5-component action vector per agent; `i` here is the comprehension
# variable (it does not leak and does not clobber the outer loop counter).
a = [[0, i[0][0], 0, i[0][1], 0]
     for i in [agent1_action, agent2_action, agent3_action]]
# Fourth entity acts randomly (presumably an unlearned adversary/prey — confirm).
a.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0])

o_n_next, r_n, d_n, i_n = env.step(a)

# Keep only the most recent 1000 rewards per agent (name says 100 — historical).
for agent_index in range(3):
    reward_100_list[agent_index].append(r_n[agent_index])
    reward_100_list[agent_index] = reward_100_list[agent_index][-1000:]

# Each agent stores only its OWN observation/action (independent learners,
# unlike the joint-observation variant elsewhere in this file).
agent1_memory.add(o_n[0], agent1_action[0], r_n[0], o_n_next[0], False)
agent2_memory.add(o_n[1], agent2_action[0], r_n[1], o_n_next[1], False)
agent3_memory.add(o_n[2], agent3_action[0], r_n[2], o_n_next[2], False)

# Start training only after 50k warm-up steps of pure experience collection.
if i > 50000:
    #print('train')
    e *= 0.9999  # decay exploration factor
    # agent1 train
    train_agent(agent1_ddpg, agent1_ddpg_target, agent1_memory,
                agent1_actor_target_update, agent1_critic_target_update, sess)
    train_agent(agent2_ddpg, agent2_ddpg_target, agent2_memory,
                agent2_actor_target_update, agent2_critic_target_update, sess)
    train_agent(agent3_ddpg, agent3_ddpg_target, agent3_memory,
                agent3_actor_target_update, agent3_critic_target_update, sess)
# NOTE(review): body of an outer training loop (reads `i`, `o_n`, `env`, etc.
# defined before this chunk) — the joint-observation MADDPG variant. The span
# appears truncated: agent3's train_agent call is not visible here.

# Log the running mean of each agent's recent rewards once per 1000 iterations.
for agent_index in range(3):
    summary_writer.add_summary(
        sess.run(reward_1000_op[agent_index],
                 {reward_1000[agent_index]:
                  np.mean(reward_100_list[agent_index])}),
        i // 1000)

agent1_action, agent2_action, agent3_action = get_agents_action(
    o_n, sess, noise_rate=0.2)

# Build the 5-component action vector per agent; the comprehension's `i` does
# not clobber the outer loop counter.
a = [[0, i[0][0], 0, i[0][1], 0]
     for i in [agent1_action, agent2_action, agent3_action]]
# Fourth entity acts randomly (presumably an unlearned adversary/prey — confirm).
a.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0])

o_n_next, r_n, d_n, i_n = env.step(a)

# Keep only the most recent 1000 rewards per agent (name says 100 — historical).
for agent_index in range(3):
    reward_100_list[agent_index].append(r_n[agent_index])
    reward_100_list[agent_index] = reward_100_list[agent_index][-1000:]

# Centralized critic memories: each agent stores ALL agents' observations and
# actions, rotated so its own entry comes first.
agent1_memory.add(np.vstack([o_n[0], o_n[1], o_n[2]]),
                  np.vstack([agent1_action[0], agent2_action[0],
                             agent3_action[0]]),
                  r_n[0],
                  np.vstack([o_n_next[0], o_n_next[1], o_n_next[2]]), False)

agent2_memory.add(np.vstack([o_n[1], o_n[2], o_n[0]]),
                  np.vstack([agent2_action[0], agent3_action[0],
                             agent1_action[0]]),
                  r_n[1],
                  np.vstack([o_n_next[1], o_n_next[2], o_n_next[0]]), False)

agent3_memory.add(np.vstack([o_n[2], o_n[0], o_n[1]]),
                  np.vstack([agent3_action[0], agent1_action[0],
                             agent2_action[0]]),
                  r_n[2],
                  np.vstack([o_n_next[2], o_n_next[0], o_n_next[1]]), False)

# Start training only after 50k warm-up steps of pure experience collection.
if i > 50000:
    # e *= 0.9999
    # agent1 train
    # Each agent trains against the OTHER agents' target policies.
    train_agent(agent1_ddpg, agent1_ddpg_target, agent1_memory,
                agent1_actor_target_update, agent1_critic_target_update, sess,
                [agent2_ddpg_target, agent3_ddpg_target])

    train_agent(agent2_ddpg, agent2_ddpg_target, agent2_memory,
                agent2_actor_target_update, agent2_critic_target_update, sess,
                [agent3_ddpg_target, agent1_ddpg_target])
class NeuralNetworkAgent(Agent):
    """NES Tetris agent driven by a neural network (Python 2 codebase).

    Hooks into an emulator API: peeks CPU memory for game state, writes
    gamepad input, and records (state, action, reward, next-state)
    transitions into a replay buffer for training.
    """

    def __init__(self, api, network_class, sess, save_path, history_size=15,
                 restore_path=None, verbose=False, train=False, test=False):
        super(NeuralNetworkAgent, self).__init__(api, verbose=verbose)

        # currently 7500 w/ 1000
        # Network
        self.network = network_class(sess, save_path,
                                     restore_path=restore_path,
                                     hist_size=history_size)
        self.replay_buffer = ReplayBuffer(max_size=2500)
        self.train = train
        self.history_size = history_size

        # Internal
        self.launched = False
        self.placed_move = False
        self.ctr = 0
        self.restart_game = 1       # frames left to hold the restart button
        self.game_restarted = True
        self.show_board = False
        self.last_move = -2         # -2 is a sentinel for "no move yet"
        self.start_state = np.zeros((20, 10, 1))  # empty 20x10 board plane
        # Gamepad button indices; -1 means "do nothing", 0/6/7 are buttons.
        self.possible_moves = [-1, 0, 6, 7]
        self.training_begun = False if not test else True
        # Fully random at first; greedy (epsilon 0) in test mode.
        self.epsilon = 1. if not test else 0
        self.decay = 0.999
        self.test = test
        self.prev_states = [self.start_state] * self.history_size

    def _controller_listener(self):
        """Per-frame hook: trains, picks a move, and records transitions.

        0x0042 holds the current piece id (19 = no active piece) and 0x0048
        the game state (1 = actively playing) — emulator memory map; confirm
        against the Tetris RAM map.
        """
        piece_id = self.api.peekCPU(0x0042)
        game_state = self.api.peekCPU(0x0048)
        if piece_id != 19 and game_state == 1:
            # Train
            if self.train and self.replay_buffer.size(
            ) > 250 and not self.test:
                batch = self.replay_buffer.sample(batch_sz=250)
                self.network.train(batch)
                self.training_begun = True
                # Decay exploration, floored at 1%.
                self.epsilon *= self.decay
                if self.epsilon < 0.010:
                    self.epsilon = 0.010
            if not self.placed_move:  # and (random_move >= 0 or self.restart_game > 0):
                # os.system('clear')
                print '--------------'
                is_random = False
                move = None
                # Epsilon-greedy; always random until training has begun.
                if np.random.random() < self.epsilon or not self.training_begun:
                    move = np.random.choice(self.possible_moves)
                    is_random = True
                else:
                    # Stack the current board on top of the state history.
                    tensor = np.dstack([self.grid] + self.prev_states)
                    pred = self.network.predict(tensor)[0]
                    move = self.possible_moves[pred]
                if self.restart_game > 0:
                    # Press start to (re)start the game; suppress the move.
                    self.api.writeGamepad(0, 3, True)
                    self.restart_game -= 1
                    move = -2
                else:
                    if move >= 0:
                        self.api.writeGamepad(0, move, True)
                self.placed_move = True
                self.show_board = True
                # Record the previous transition once a real move exists.
                if self.last_move != -2 and piece_id != 19:
                    print 'Random:', is_random
                    S = self.grid.copy()
                    self._update_board(self.api.peekCPU(0x0042))
                    board = self._simulate_piece_drop(self.api.peekCPU(0x0042))
                    n_empty = self._count_empty(self.grid)
                    n_holes = self._count_holes(self.grid)
                    height = self._count_height(board)
                    levelness = self._determine_levelness(board)
                    A = self.last_move
                    # R = self._count_total() + self._get_score() - n_empty
                    #R = (-50 * height) + (-20 * n_holes) + (self._get_score())
                    # Hand-shaped reward: reward low stacks, punish height,
                    # holes; reward flatness.
                    if height <= 2:
                        R = 1000
                    else:
                        R = -200 * height
                    R += -20 * n_holes + 10 * levelness  # 10 * self._get_score()
                    SP = self.grid.copy()
                    self.prev_states.insert(0, S)
                    print np.dstack(self.prev_states).shape
                    self.replay_buffer.add(
                        np.dstack(self.prev_states),
                        self.possible_moves.index(A), R,
                        np.dstack([SP] + self.prev_states[:self.history_size]))
                    # Trim history back to its fixed window.
                    self.prev_states = self.prev_states[:self.history_size]
                    print self.epsilon
                    self._print_transition(S, A, board, R)
                self.last_move = move
        else:
            self.placed_move = False

    def _frame_render_finished(self):
        """
        Renders the board the the current piece
        TODO: do this lazily, so we aren't calling read too often O_o
        """
        # To make things easier, we're going to modify the next piece drop
        # Always drop a certain type of block (currently square).
        self.api.writeCPU(0x00bf, 0x0a)
        piece_id = self.api.peekCPU(0x0042)
        game_state = self.api.peekCPU(0x0048)

        # Restart the game
        if piece_id == 19 and (game_state == 10 or game_state == 0):
            self.prev_states = [self.start_state] * self.history_size
            self.game_restarted = True
            self.restart_game = 1
            return

        # Probably a line clear... Skip
        if piece_id == 19 and game_state != 1:
            return

    def _piece_update(self, access_type, address, value):
        """
        Can be used to control the piece being dropped
        """
        # While playing, force the piece id to 0x0a (square block).
        if self.api.readCPU(0x0048) == 1:
            return 0x0a
        return value

    def agent_name(self):
        """Identifier used by the surrounding framework."""
        return 'NeuralNetworkAgent'
class DQN:
    """Deep Q-Network trainer for gym-like environments (TF2 eager mode)."""

    def __init__(
            self,
            env,
            learning_rate=1e-3,
            seed=1234,
            gamma=0.99,
            max_eps=1.0,
            min_eps=0.1,
            render=False,
            print_freq=1,
            load_path=None,
            save_path=None,
            batch_size=32,
            log_dir='logs/train',
            max_steps=100000,
            buffer_capacity=None,
            max_episode_len=None,
            eps_decay_rate=-1e-4,
            target_update_freq=1000,
    ):
        """Configure networks, optimizer, replay buffer, and logging.

        Args mirror TrainDQN; max_episode_len defaults to the env spec's
        episode limit when not given.
        """
        tf.random.set_seed(seed)
        np.random.seed(seed)
        self.gamma = gamma
        self.render = render
        self.batch_size = batch_size
        self.print_freq = print_freq
        self.q_lr = learning_rate
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.eps_decay_rate = eps_decay_rate
        self.buffer = ReplayBuffer(buffer_capacity)
        self.max_steps = max_steps
        self.target_update = target_update_freq
        self.model = QNetwork(env.action_space.n, name='q_network')
        self.target = QNetwork(env.action_space.n, name='target_network')
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
        self.summary_writer = tf.summary.create_file_writer(log_dir)
        self.env = env
        # Fall back to the environment's registered episode limit.
        self.max_episode_len = max_episode_len if max_episode_len else self.env.spec.max_episode_steps
        self.rewards = []
        self.save_path = save_path
        if load_path is not None:
            self.model.load_weights(load_path)

    def act(self, state):
        """Returns the greedy action for `state` (expects a batched input)."""
        return np.argmax(self.model(state))

    @tf.function
    def train_step(self, states, indices, targets):
        """ Performs a single step of gradient descent on the Q network

        Args:
            states: numpy array of states with shape (batch size, state dim)
            indices: list indices of the selected actions
            targets: targets for computing the MSE loss
        """
        with tf.GradientTape() as tape:
            # Q-values of the actions actually taken.
            action_values = tf.gather_nd(self.model(states), indices)
            mse_loss = tf.keras.losses.MeanSquaredError()(action_values,
                                                          targets)

        gradients = tape.gradient(mse_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.model.trainable_variables))

        # Log training information
        with self.summary_writer.as_default():
            tf.summary.scalar('MSE Loss', mse_loss,
                              step=self.optimizer.iterations)
            tf.summary.scalar('Estimated Q Value',
                              tf.reduce_mean(action_values),
                              step=self.optimizer.iterations)

    def update(self):
        """ Computes the target for the MSE loss and calls the tf.function
        for gradient descent """
        # Only learn once the buffer can supply a full minibatch.
        if len(self.buffer) >= self.batch_size:
            # Sample random minibatch of N transitions
            states, actions, rewards, next_states, dones = self.buffer.sample(
                self.batch_size)

            # Adjust the targets for non-terminal states: terminal transitions
            # (dones == 1) bootstrap nothing.
            next_state_pred = self.target(next_states)
            targets = rewards + self.gamma * next_state_pred.numpy().max(
                axis=1) * (1 - dones)

            # (row, action) index pairs for tf.gather_nd.
            batch_range = tf.range(start=0, limit=actions.shape[0])
            indices = tf.stack((batch_range, actions), axis=1)

            # update critic by minimizing the MSE loss
            self.train_step(states, indices, targets)

    def learn(self):
        """Learns via Deep-Q-Networks (DQN)"""
        obs = self.env.reset()
        total_reward = 0
        ep = 0
        ep_len = 0
        rand_actions = 0
        mean_reward = None
        for t in range(self.max_steps):
            # Hard-copy online weights into the target network periodically.
            if t % self.target_update == 0:
                copy_weights(self.model.variables, self.target.variables)

            # Exponential epsilon decay, from
            # https://jaromiru.com/2016/10/03/lets-make-a-dqn-implementation/
            eps = self.min_eps + (self.max_eps - self.min_eps) * np.exp(
                self.eps_decay_rate * t)
            if self.render:
                self.env.render()

            # Take exploratory action with probability epsilon
            if np.random.uniform() < eps:
                action = self.env.action_space.sample()
                rand_actions += 1
            else:
                action = self.act(np.expand_dims(obs, axis=0))

            # Execute action in emulator and observe reward and next state
            new_obs, reward, done, info = self.env.step(action)
            total_reward += reward

            # Store transition s_t, a_t, r_t, s_t+1 in replay buffer
            self.buffer.add((obs, action, reward, new_obs, done))

            # Perform learning step
            self.update()

            obs = new_obs
            ep_len += 1
            if done or ep_len >= self.max_episode_len:
                with self.summary_writer.as_default():
                    ep += 1
                    self.rewards.append(total_reward)
                    total_reward = 0
                    obs = self.env.reset()
                    if ep % self.print_freq == 0 and ep > 0:
                        # NOTE(review): slice takes print_freq + 1 episodes,
                        # matching the TrainDQN variant above.
                        new_mean_reward = np.mean(
                            self.rewards[-self.print_freq - 1:])
                        print(
                            f"-------------------------------------------------------"
                        )
                        print(
                            f"Mean {self.print_freq} Episode Reward: {new_mean_reward}"
                        )
                        print(f"Exploration fraction: {rand_actions / ep_len}")
                        print(f"Total Episodes: {ep}")
                        print(f"Total timesteps: {t}")
                        print(
                            f"-------------------------------------------------------"
                        )
                        tf.summary.scalar(
                            f'Mean {self.print_freq} Episode Reward',
                            new_mean_reward, step=t)
                        tf.summary.scalar(f'Epsilon', eps, step=t)

                        # Model saving inspired by Open AI Baseline implementation
                        if (mean_reward is None or new_mean_reward >= mean_reward) and self.save_path is not None:
                            print(
                                f"Saving model due to mean reward increase:{mean_reward} -> {new_mean_reward}"
                            )
                            print(f'Location: {self.save_path}')
                            mean_reward = new_mean_reward
                            self.model.save_weights(self.save_path)
                    # Reset per-episode counters.
                    ep_len = 0
                    rand_actions = 0
def play(train_indicator):
    """Run DDPG on the TORCS racing environment.

    Args:
        train_indicator: truthy to train the actor/critic each step and
            periodically save weights; falsy to only drive with the loaded
            policy.
    """
    buffer_size = 100000
    batch_size = 32
    gamma = 0.99  # discount factor
    tau = 0.001  # Target Network HyperParameter
    lra = 0.0001  # Learning rate for Actor
    lrc = 0.001  # Learning rate for Critic
    ou_sigma = 0.3  # Ornstein-Uhlenbeck exploration noise scale

    action_dim = 1  # Steering angle
    state_dim = 21  # num of sensors input

    episodes_num = 2000
    max_steps = 100000

    step = 0

    train_stat_file = "data/train_stat.txt"
    actor_weights_file = "data/actor.h5"
    critic_weights_file = "data/critic.h5"

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    tf_session = tf.Session(config=config)

    keras_backend.set_session(tf_session)

    actor = ActorNetwork(tf_session=tf_session, state_size=state_dim,
                         action_size=action_dim, hidden_units=(300, 600),
                         tau=tau, lr=lra)
    critic = CriticNetwork(tf_session=tf_session, state_size=state_dim,
                           action_size=action_dim, hidden_units=(300, 600),
                           tau=tau, lr=lrc)
    buffer = ReplayBuffer(buffer_size)

    # noise function for exploration
    ou = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim),
                                      sigma=ou_sigma * np.ones(action_dim))

    # Torcs environment - throttle and gear change controlled by client
    env = TorcsEnv(vision=False, throttle=False, gear_change=False)

    # Best-effort weight restore; starts from scratch when files are missing.
    # NOTE(review): bare except swallows every error (not just missing files);
    # narrow it when the exact exception types raised by load_weights are known.
    try:
        actor.model.load_weights(actor_weights_file)
        critic.model.load_weights(critic_weights_file)
        actor.target_model.load_weights(actor_weights_file)
        critic.target_model.load_weights(critic_weights_file)
        print("Weights loaded successfully")
    except:
        print("Cannot load weights")

    for i in range(episodes_num):
        print("Episode : %s Replay buffer %s" % (i, len(buffer)))

        if i % 3 == 0:
            ob = env.reset(
                relaunch=True
            )  # relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        # 21 len state dimensions - https://arxiv.org/abs/1304.1672
        state = np.hstack((ob.angle, ob.track, ob.trackPos))

        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            # Actor action plus Ornstein-Uhlenbeck exploration noise.
            action_predicted = actor.model.predict(
                state.reshape(1, state.shape[0])) + ou()  # predict and add noise

            observation, reward, done, info = env.step(action_predicted[0])
            state1 = np.hstack(
                (observation.angle, observation.track, observation.trackPos))
            buffer.add((state, action_predicted[0], reward, state1,
                        done))  # add replay buffer

            # batch update
            batch = buffer.get_batch(batch_size)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            # NOTE(review): y_t is seeded from the ACTION column (e[1]) only to
            # get an array of the right shape; every entry is overwritten in
            # the loop below. A zeros_like would be clearer — confirm intent.
            y_t = np.asarray([e[1] for e in batch])

            # Bellman targets from the target actor/critic pair.
            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + gamma * target_q_values[k]

            if train_indicator:
                # Critic step on TD targets, then actor step along the
                # critic's action-gradient, then soft target updates.
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.get_gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.train_target_model()
                critic.train_target_model()

            total_reward += reward
            state = state1

            print("Episode %s - Step %s - Action %s - Reward %s" % (
                i, step, action_predicted[0][0], reward))

            step += 1
            if done:
                break

        # Persist weights every third episode while training.
        if i % 3 == 0 and train_indicator:
            print("Saving weights...")
            actor.model.save_weights(actor_weights_file, overwrite=True)
            critic.model.save_weights(critic_weights_file, overwrite=True)

        tm = time.strftime("%Y-%m-%d %H:%M:%S")
        episode_stat = "%s -th Episode. %s total steps. Total reward: %s. Time %s" % (
            i, step, total_reward, tm)
        print(episode_stat)
        with open(train_stat_file, "a") as outfile:
            outfile.write(episode_stat + "\n")

    env.end()
def train(conf, env, model, num_episodes=500, batch_size=100,
          buffer_size=10000):
    """Train `model` on `env` with experience replay and epsilon-greedy search.

    Args:
        conf: configuration object; `discount_rate`, `initial_eps`, and
            `decay_factor` are read from it, and `buffer_size` / `batch_size`
            are recorded back onto it for experiment tracking.
        env: gym-like environment (reset/step/close).
        model: Keras-like Q model exposing predict() and fit(); outputs one
            Q-value per action (4 actions assumed).
        num_episodes: number of episodes to run.
        batch_size: minibatch size sampled from the replay buffer each step.
        buffer_size: replay buffer capacity.
    """
    conf.buffer_size = buffer_size
    conf.batch_size = batch_size
    replay_buffer = ReplayBuffer(size=buffer_size)
    discount_rate = conf.discount_rate
    eps = conf.initial_eps
    decay_factor = conf.decay_factor
    for episode in range(num_episodes):
        print("Episode {}".format(episode))
        observation = env.reset()
        # Epsilon decays once per episode, not per step.
        eps *= decay_factor
        done = False
        total_food = 0
        step = 0
        while not done:
            model_input = np.array([observation])
            prediction = model.predict(model_input)
            # Epsilon-greedy over the 4 discrete actions.
            if np.random.random() < eps:
                action = np.random.randint(0, 4)
                was_random = True
            else:
                action = np.argmax(prediction)
                was_random = False
            debugger.print_step_before_move(step, observation, prediction,
                                            action, was_random)
            debugger.render_env_until_key_press(env)
            new_observation, reward, done, _ = env.step(action)
            replay_buffer.add(observation, action, reward, new_observation,
                              float(done))

            # Minibatch Q-learning update from replayed transitions.
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            labels = model.predict(obses_t)
            # Discounted bootstrap values; terminal states contribute nothing.
            targets = discount_rate * np.max(model.predict(obses_tp1), axis=1)
            for i in range(len(dones)):
                if dones[i]:
                    targets[i] = 0
                targets[i] += rewards[i]
                labels[i][actions[i]] = targets[i]
            model.fit(obses_t, labels, epochs=1, verbose=0)

            # Reward > 0 signals food eaten in this environment —
            # TODO(review): confirm against the env's reward scheme.
            if reward > 0:
                total_food += 1
            step += 1
            observation = new_observation
        wandb.log({
            'episode': episode,
            'total_food': total_food,
            'eps': eps,
            'lifetime': step
        })
        print('Score: {}'.format(total_food))
        print()
    env.close()