class DQN: """ Implementation of deep q learning algorithm """ def __init__(self): self.prob_random = 1.0 # Probability to play random action self.y = .99 # Discount factor self.batch_size = 64 # How many experiences to use for each training step self.prob_random_end = .01 # Ending chance of random action self.prob_random_decay = .996 # Decrease decay of the prob random self.max_episode = 300 # Max number of episodes you are allowes to played to train the game self.expected_goal = 200 # Expected goal self.dnn = DNN() self.env = gym.make('CartPole-v0') self.memory = ExperienceReplay(buffer_size=10000) self.metadata = [ ] # we will store here info score, at the end of each episode def choose_action(self, state, prob_random): if np.random.rand() <= prob_random: action = np.random.randint(self.env.action_space.n) else: action = np.argmax(self.dnn.model.predict(state)) return action def run_one_step(self, state): action = self.choose_action(state, self.prob_random) next_state, reward, done, _ = self.env.step(action) next_state = np.expand_dims(next_state, axis=0) return state, action, reward, next_state, done def generate_target_q(self, train_state, train_action, train_reward, train_next_state, train_done): # Our predictions (actions to take) from the main Q network target_q = self.dnn.model.predict(train_state) # Tells us whether game over or not # We will multiply our rewards by this value # to ensure we don't train on the last move train_gameover = train_done == 0 # Q value of the next state based on action target_q_next_state = self.dnn.model.predict(train_next_state) train_next_state_values = np.max(target_q_next_state[range( self.batch_size)], axis=1) # Reward from the action chosen in the train batch actual_reward = train_reward + (self.y * train_next_state_values * train_gameover) target_q[range(self.batch_size), train_action] = actual_reward return target_q def train_one_step(self): batch_data = self.memory.sample(self.batch_size) train_state = np.array([i[0] for i in batch_data]) train_action = np.array([i[1] for i in batch_data]) train_reward = np.array([i[2] for i in batch_data]) train_next_state = np.array([i[3] for i in batch_data]) train_done = np.array([i[4] for i in batch_data]) # These lines remove useless dimension of the matrix train_state = np.squeeze(train_state) train_next_state = np.squeeze(train_next_state) # Generate target Q target_q = self.generate_target_q(train_state=train_state, train_action=train_action, train_reward=train_reward, train_next_state=train_next_state, train_done=train_done) loss = self.dnn.model.train_on_batch(train_state, target_q) return loss def train(self): scores = [] for e in range(self.max_episode): # Init New episode state = self.env.reset() state = np.expand_dims(state, axis=0) episode_score = 0 while True: state, action, reward, next_state, done = self.run_one_step( state) self.memory.add( experiences=[[state, action, reward, next_state, done]]) episode_score += reward state = next_state if len(self.memory.buffer) > self.batch_size: self.train_one_step() if self.prob_random > self.prob_random_end: self.prob_random *= self.prob_random_decay if done: now = datetime.now() dt_string = now.strftime("%d/%m/%Y %H:%M:%S") self.metadata.append( [now, e, episode_score, self.prob_random]) print( "{} - episode: {}/{}, score: {:.1f} - prob_random {:.3f}" .format(dt_string, e, self.max_episode, episode_score, self.prob_random)) break scores.append(episode_score) # Average score of last 100 episode means_last_10_scores = np.mean(scores[-10:]) if 
means_last_10_scores == self.expected_goal: print('\n Task Completed! \n') break print("Average over last 10 episode: {0:.2f} \n".format( means_last_10_scores)) print("Maximum number of episode played: %d" % self.max_episode)
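# The ExperienceReplay class used above is defined elsewhere. A minimal
# sketch matching only the interface this agent relies on (a `buffer`
# attribute, add(experiences=...), and sample(batch_size)), assuming plain
# FIFO storage with uniform sampling:
import random
from collections import deque


class ExperienceReplay:
    def __init__(self, buffer_size=10000):
        # a deque drops the oldest experiences once the buffer is full
        self.buffer = deque(maxlen=buffer_size)

    def add(self, experiences):
        # experiences: a list of [state, action, reward, next_state, done] entries
        self.buffer.extend(experiences)

    def sample(self, batch_size):
        # uniform sampling without replacement
        return random.sample(self.buffer, batch_size)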
from datetime import datetime

import numpy as np

# gameEnv, Qnetwork and ExperienceReplay are project classes defined
# elsewhere; K is the Keras backend (e.g. `from keras import backend as K`).


class DQN:
    def __init__(self):
        self.batch_size = 64  # How many experiences to use for each training step
        self.train_frequency = 5  # How often to update the network
        self.num_epochs = 20  # How many epochs to train when updating the network
        self.y = 0.99  # Discount factor
        self.prob_random_start = 0.6  # Starting chance of random action
        self.prob_random_end = 0.1  # Ending chance of random action
        self.annealing_steps = 1000.  # Steps of training over which to anneal start_e -> end_e
        self.max_num_episodes = 10000  # Max number of episodes allowed to train the agent
        self.min_pre_train_episodes = 100  # Episodes played with random actions before training starts
        self.max_num_step = 50  # Maximum allowed episode length
        self.goal = 15  # Target mean reward per episode that ends training

        # Set up the environment
        self.env = gameEnv(partial=False, size=5)

        # Reset everything from the Keras session
        K.clear_session()

        # Set up our Q-networks
        self.main_qn = Qnetwork()
        self.target_qn = Qnetwork()

        # Set up our experience replay
        self.experience_replay = ExperienceReplay()

    def update_target_graph(self):
        updated_weights = np.array(self.main_qn.model.get_weights())
        self.target_qn.model.set_weights(updated_weights)

    def choose_action(self, state, prob_random, num_episode):
        if np.random.rand() < prob_random or \
                num_episode < self.min_pre_train_episodes:
            # Act randomly based on prob_random or if we
            # have not accumulated enough pre-train episodes
            action = np.random.randint(self.env.actions)
        else:
            # Decide what action to take from the Q network.
            # First add a batch dimension so the state matches the
            # input shape the network expects.
            state = np.expand_dims(state, axis=0)
            action = np.argmax(self.main_qn.model.predict(state))
        return action

    def run_one_episode(self, num_episode, prob_random):
        # Create an experience buffer for the current episode
        experiences_episode = []

        # Get the game state from the environment
        state = self.env.reset()

        done = False  # Game is complete
        cur_step = 0  # Running count of steps taken in the episode

        while cur_step < self.max_num_step and not done:
            cur_step += 1
            action = self.choose_action(state=state,
                                        prob_random=prob_random,
                                        num_episode=num_episode)

            # Take the action and retrieve the next state, reward and done
            next_state, reward, done = self.env.step(action)

            # Set up the experience to be stored in the episode buffer
            experience = [state, action, reward, next_state, done]

            # Store the experience in the episode buffer
            experiences_episode.append(experience)

            # Update the state
            state = next_state

        return experiences_episode

    def generate_target_q(self, train_state, train_action, train_reward,
                          train_next_state, train_done):
        # Our predictions (actions to take) from the main Q network
        target_q = self.main_qn.model.predict(train_state)

        # Tells us whether the game is over or not.
        # We multiply our rewards by this value
        # to ensure we don't train on the last move.
        train_gameover = train_done == 0

        # Q value of the next state based on action, from the target network
        target_q_next_state = self.target_qn.model.predict(train_next_state)
        train_next_state_values = np.max(
            target_q_next_state[range(self.batch_size)], axis=1)

        # Reward from the action chosen in the train batch
        actual_reward = train_reward + (self.y * train_next_state_values * train_gameover)
        target_q[range(self.batch_size), train_action] = actual_reward
        return target_q

    def train_one_step(self):
        # A train batch is [[state, action, reward, next_state, done], ...]
        train_batch = self.experience_replay.sample(self.batch_size)

        # Separate the batch into a numpy array for each component
        train_state = np.array([x[0] for x in train_batch])
        train_action = np.array([x[1] for x in train_batch])
        train_reward = np.array([x[2] for x in train_batch])
        train_next_state = np.array([x[3] for x in train_batch])
        train_done = np.array([x[4] for x in train_batch])

        # Generate target Q
        target_q = self.generate_target_q(train_state=train_state,
                                          train_action=train_action,
                                          train_reward=train_reward,
                                          train_next_state=train_next_state,
                                          train_done=train_done)

        # Train the main model
        loss = self.main_qn.model.train_on_batch(train_state, target_q)
        return loss

    def train(self):
        # Make the networks equal
        self.update_target_graph()

        # We begin by acting completely randomly. As we gain experience and
        # improve, we reduce the probability of acting randomly and instead
        # take the actions our Q network suggests.
        prob_random = self.prob_random_start
        prob_random_drop = (self.prob_random_start - self.prob_random_end) / self.annealing_steps

        # Init variables
        num_steps = []  # Tracks number of steps per episode
        rewards = []  # Tracks rewards per episode
        print_every = 50  # How often to print status
        losses = [0]  # Tracks training losses
        num_episode = 0

        while True:
            # Run one episode
            experiences_episode = self.run_one_episode(num_episode, prob_random)

            # Save the episode in the replay buffer
            self.experience_replay.add(experiences_episode)

            # Once enough episodes have been played, start training
            if num_episode > self.min_pre_train_episodes:
                # Anneal the probability of a random action until
                # it reaches prob_random_end
                if prob_random > self.prob_random_end:
                    prob_random -= prob_random_drop

                # Every train_frequency episodes, train the model
                if num_episode % self.train_frequency == 0:
                    for num_epoch in range(self.num_epochs):
                        loss = self.train_one_step()
                        losses.append(loss)

                    # Update the target model with values from the main model
                    self.update_target_graph()

            # Increment the episode
            num_episode += 1
            num_steps.append(len(experiences_episode))
            rewards.append(sum([e[2] for e in experiences_episode]))

            # Print info
            if num_episode % print_every == 0:
                # datetime object containing current date and time
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print("{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}"
                      .format(dt_string, num_episode,
                              np.mean(rewards[-print_every:]),
                              prob_random, mean_loss))

            # Stop condition
            if np.mean(rewards[-print_every:]) >= self.goal:
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print("{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}"
                      .format(dt_string, num_episode,
                              np.mean(rewards[-print_every:]),
                              prob_random, mean_loss))
                print("Training complete because we reached the goal reward.")
                break
            if num_episode > self.max_num_episodes:
                print("Training stopped because we reached the max number of episodes.")
                break
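# A hypothetical entry point for the class above; gameEnv and Qnetwork come
# from the surrounding project and are not shown here.
if __name__ == "__main__":
    dqn = DQN()
    dqn.train()  # runs until the mean reward reaches dqn.goal
                 # or max_num_episodes is exceeded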
# Excerpt from inside a training loop; config, env, qnet, epsilon and the
# replay buffers are set up by the surrounding script.

# get action
if (config.total_step < config.args.num_pretrain_step
        or np.random.rand(1) < epsilon):
    action = np.random.randint(env.num_action)
    num_random_step += 1
else:
    action = qnet.get_actions(state)[0]

# take the action and observe the next state
newstate, reward, done, _ = env.step(action)
if newstate == []:
    print("Terminate")
    break

replay_ep.add(
    np.reshape(np.array([state, action, reward, done, newstate]), [1, 5]))

# train
if config.total_step > config.args.num_pretrain_step:
    if epsilon > config.args.end_epsilon:
        epsilon -= epsilon_decay
    if config.total_step % config.args.online_update_freq == 0:
        train_batch = replay.sample(config.args.batch_size)
        loss = qnet.learn_on_minibatch(train_batch, config.args.gamma)
        sys.stdout.write(
            "\rTrain step at {}th step | loss {} | epsilon {}".format(
                config.total_step, loss, epsilon))
        sys.stdout.flush()
    if config.total_step % config.args.target_update_freq == 0:
        pass  # target-network sync (body not shown in the source)
# Excerpt from inside an episode loop; g is the game environment (a
# 2048-style board, judging by the tile values), replay the buffer and
# logger the metrics logger.
next_state = preprocess(g.state)
next_state = np.expand_dims(next_state, axis=2)  # channel axis
action_onehot = np.zeros(NUM_ACTIONS)
action_onehot[action] = 1
state = np.expand_dims(state, axis=2)

if reward > 0:
    extra_bonus = 0
    if np.max(state) == state[0, 0]:  # max tile sits in the corner
        extra_bonus += math.log2(2**20)
    if np.argmax(np.sum(state, axis=1)):
        extra_bonus += math.log2(2**20)
    reward = math.log2(reward) + num_merges + extra_bonus
replay.add((state, action_onehot, reward, next_state))

if g.is_game_over():
    logger.log("stats/score", g.score, i)
    logger.log("stats/num_moves", num_moves, i)
    logger.log("stats/max_tile", np.max(g.state), i)
    logger.log("stats/best_score", best_score, i)
    logger.log("settings/epsilon", epsilon, i)
    logger.log("settings/num_random_moves", num_random_moves, i)
    logger.log("settings/perc_random_moves", num_random_moves / num_moves, i)
    logger.log("settings/experience", len(replay), i)
    reward = 0
    replay.add(  # (call truncated in the source)
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# Model, ExperienceReplay, update, the `device` handle and the constants
# module `c` are defined elsewhere in the project.


class Agent:
    def __init__(self, s_size, a_size, seed):
        """
        Parameters:
            s_size (int): dimension of each state
            a_size (int): dimension of each action
            seed (int): random seed
        """
        self.s_size = s_size
        self.a_size = a_size
        self.seed = random.seed(seed)

        # Initialize both Q-networks
        self.local_dqn = Model(s_size, a_size, seed).to(device)
        self.target_dqn = Model(s_size, a_size, seed).to(device)
        self.optimizer = optim.Adam(self.local_dqn.parameters(),
                                    lr=c.LEARNING_RATE)

        # Initialize the experience replay deque
        self.buffer = ExperienceReplay(a_size, c.REPLAY_BUFFER_SIZE,
                                       c.BATCH_SIZE, seed)

        # Time step counter used for updating as per UPDATE_FREQUENCY
        self.t_step = 0

    def step(self, s, a, r, s_next, done, transfer_method):
        # Add experience to the deque
        self.buffer.add(s, a, r, s_next, done)

        # Learn every UPDATE_FREQUENCY time steps
        self.t_step = (self.t_step + 1) % c.UPDATE_FREQUENCY
        if self.t_step == 0:
            # Get random experiences to learn from
            if len(self.buffer) > c.BATCH_SIZE:
                es = self.buffer.sample()
                self.learn(es, transfer_method, c.GAMMA)

    def act(self, state, transfer_method, eps=0.):
        """Returns an action for the given state as per the current policy.

        Parameters:
            state (array_like): current state
            transfer_method (int): 0 if pre-trained weights are to be used,
                another int otherwise
            eps (float): epsilon, for exploration
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.local_dqn.eval()
        with torch.no_grad():
            a_values = self.local_dqn(state, transfer_method)
        self.local_dqn.train()

        # Epsilon-greedy selection: greedy with probability 1 - eps,
        # random otherwise
        if random.random() > eps:
            return np.argmax(a_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.a_size))

    def learn(self, es, transfer_method, gamma):
        """Update parameters based on experiences.

        Parameters:
            es (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        s_, a_, r_, s_next_, d_ = es

        # Max predicted Q-values for the next states
        target_Q_next = self.target_dqn(
            s_next_, transfer_method).detach().max(1)[0].unsqueeze(1)

        # Target Q-values
        target_Q = r_ + (gamma * target_Q_next * (1 - d_))

        # Expected Q-values
        expected_Q = self.local_dqn(s_, transfer_method).gather(1, a_)

        loss = F.mse_loss(expected_Q, target_Q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update the target network
        update(self.local_dqn, self.target_dqn, c.TAU)
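# The `update` helper called at the end of learn() is not shown above. A
# minimal sketch, assuming it is the usual Polyak soft update
# (theta_target <- tau * theta_local + (1 - tau) * theta_target) with tau = c.TAU:
def update(local_model, target_model, tau):
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        # blend a small fraction of the local weights into the target network
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)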
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model, model_from_json
from tqdm import tqdm

# DoubleDQN and ExperienceReplay are project classes defined elsewhere.


class DQNAgent:
    def __init__(self, env, net_update_rate: int = 25,
                 exploration_rate: float = 1.0,
                 exploration_decay: float = 0.00005):
        # set hyperparameters
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.net_updating_rate = net_update_rate

        # set environment
        self.env = env
        self.state_shape = env.get_state_shape()
        self.action_shape = env.get_action_shape()

        # Experience replay for batch learning
        self.exp_rep = ExperienceReplay()
        # Deep Q network
        self.net = None

    def set_model(self, model):
        """Sets the model the agent trains. Receives a compiled tf Model with
        input_shape = env.observation_space and
        output_shape = env.action_space."""
        self.net = DoubleDQN(model)

    def get_action(self, state: np.ndarray, eps=0) -> int:
        """Given a state, returns a random action with probability eps and
        argmax(q_net(state)) with probability 1 - eps.
        (Only legal actions are considered.)"""
        if self.net is None:
            raise NotImplementedError(
                'agent.get_action was called before the model was initiated.\n'
                'Please set the agent\'s model using the set_model method. '
                'You can access the state and action shapes using the '
                'agent\'s methods \'get_state_shape\' and \'get_action_shape\'')
        legal_actions = self.env.get_legal_actions(state)
        if np.random.random() >= eps:  # Exploitation
            # Calculate the Q-value of each action
            q_values = self.net.predict(state[np.newaxis, ...],
                                        np.expand_dims(legal_actions, 0))
            # Make sure we only choose between available actions
            legal_actions = np.logical_and(legal_actions,
                                           q_values == np.max(q_values))
        # Exploration falls through to a uniform choice among legal actions;
        # after exploitation the mask has been narrowed to the best legal actions.
        return np.random.choice(np.flatnonzero(legal_actions))

    def update_net(self, batch_size: int):
        """If there are more than batch_size experiences, optimizes the
        network's weights on a batch of experiences using the
        double-Q-learning algorithm; otherwise returns."""
        if self.exp_rep.get_num() < batch_size:
            return
        batch = self.exp_rep.get_batch(batch_size)
        self.net.fit(*batch)

    def train(self, episodes: int, path: str, checkpoint_rate=100,
              batch_size: int = 64,
              exp_decay_func=lambda exp_rate, exp_decay, i: 0.01 + (exp_rate - 0.01) * np.exp(exp_decay * (i + 1)),
              show_progress=False):
        """Runs a training session for the agent.

        :param episodes: number of episodes to train.
        :param path: a path to a directory where the trained weights will be saved.
        :param batch_size: number of experiences to learn from in each net update.
        """
        if self.net is None:
            raise NotImplementedError(
                'agent.train was called before the model was initiated.\n'
                'Please set the agent\'s model using the set_model method. '
                'You can access the state and action shapes using the '
                'agent\'s methods \'get_state_shape\' and \'get_action_shape\'')

        # set hyperparameters
        exploration_rate = self.exploration_rate
        total_rewards = []

        # start training
        for episode in tqdm(range(episodes)):
            state = self.env.reset()  # Reset the environment for a new episode
            step, episode_reward = 0, 0
            run = True

            # Run until the max number of actions is reached or the episode has ended
            while run:
                step += 1
                # choose an action using epsilon-greedy exploration
                action = self.get_action(state, exploration_rate)

                # apply the chosen action to the environment and
                # observe the next state and reward
                obs = self.env.step(action)
                next_state, reward, is_terminal = obs[:3]
                episode_reward += reward

                # Add the experience to memory
                self.exp_rep.add(state, action, reward, next_state,
                                 self.env.get_legal_actions(state),
                                 is_terminal)

                # Optimize the double-Q net
                self.update_net(batch_size)
                if is_terminal:
                    # The action taken led to a terminal state
                    run = False
                if (step % self.net_updating_rate) == 0 and step > 0:
                    # update the target network
                    self.net.align_target_model()
                state = next_state

            # Update total_rewards to keep track of progress
            total_rewards.append(episode_reward)

            # Update the target network at the end of the episode
            self.net.align_target_model()

            # Update the exploration rate
            exploration_rate = exp_decay_func(exploration_rate,
                                              self.exploration_decay, episode)
            if episode % checkpoint_rate == 0 and self.exp_rep.get_num() > batch_size:
                self.save_weights(
                    os.path.join(path, f'episode_{episode}_weights'))
                if show_progress:
                    # Plot a moving average of the last 10 episodes
                    self.plot_progress(total_rewards)

        # update the agent's exploration rate in case more training is needed
        self.exploration_rate = exploration_rate

        # save the total_rewards as a csv file to the specified path
        with open(os.path.join(path, 'rewards.csv'), 'w') as reward_file:
            rewards = pd.DataFrame(total_rewards)
            rewards.to_csv(reward_file)
        self.save_weights(os.path.join(path, 'final_weights'))

    def plot_progress(self, total_rewards):
        w = np.ones(10) / 10
        moving_average = np.convolve(total_rewards, w, mode='valid')
        plt.plot(np.arange(len(moving_average)), moving_average)
        plt.title('Moving average of rewards across episodes')
        plt.xlabel('episodes')
        plt.ylabel('average reward over last 10 episodes')
        plt.show()

    def get_state_shape(self):
        return self.state_shape

    def get_action_shape(self):
        return self.action_shape

    # Handles saving/loading the model as explained here:
    # https://www.tensorflow.org/guide/keras/save_and_serialize
    def load_weights(self, path):
        self.net.load_weights(path)

    def save_weights(self, path):
        self.net.save_weights(path)

    def save_model(self, path):
        if self.net is None:
            raise NotImplementedError(
                'agent.save_model was called before the model was initiated.\n'
                'Please set the agent\'s model using the set_model method. '
                'You can access the state and action shapes using the '
                'agent\'s methods \'get_state_shape\' and \'get_action_shape\'')
        self.net.save_model(path)

    def load_model(self, path):
        model = load_model(path)
        self.set_model(model)

    def to_json(self, **kwargs):
        if self.net is None:
            raise NotImplementedError(
                'agent.to_json was called before the model was initiated.\n'
                'Please set the agent\'s model using the set_model method. '
                'You can access the state and action shapes using the '
                'agent\'s methods \'get_state_shape\' and \'get_action_shape\'')
        return self.net.to_json(**kwargs)

    def from_json(self, json_config):
        model = model_from_json(json_config)
        self.set_model(model)
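# A minimal usage sketch (assumptions flagged inline): `env` must provide the
# interface the agent calls (reset, step, get_state_shape, get_action_shape,
# get_legal_actions), and build_model() is a hypothetical helper returning a
# compiled tf.keras model whose input/output shapes match the environment.
agent = DQNAgent(env, net_update_rate=25, exploration_rate=1.0)
agent.set_model(build_model(agent.get_state_shape(), agent.get_action_shape()))
agent.train(episodes=500, path='./weights', batch_size=64, show_progress=True)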
# Skeleton variant of the same agent, with the core methods left as TODO
# exercises (a completed implementation appears in the example above).
class DQN:
    def __init__(self):
        self.batch_size = 64  # How many experiences to use for each training step
        self.train_frequency = 5  # How often to update the network
        self.num_epochs = 20  # How many epochs to train when updating the network
        self.y = 0.99  # Discount factor
        self.prob_random_start = 0.6  # Starting chance of random action
        self.prob_random_end = 0.1  # Ending chance of random action
        self.annealing_steps = 1000.  # Steps of training over which to anneal start_e -> end_e
        self.max_num_episodes = 10000  # Max number of episodes allowed to train the agent
        self.min_pre_train_episodes = 100  # Episodes played with random actions before training starts
        self.max_num_step = 50  # Maximum allowed episode length
        self.goal = 15  # Target mean reward per episode that ends training

        # Set up the environment
        self.env = gameEnv(partial=False, size=5)

        # Reset everything from the Keras session
        K.clear_session()

        # Set up our Q-networks
        self.main_qn = Qnetwork()
        self.target_qn = Qnetwork()

        # Set up our experience replay
        self.experience_replay = ExperienceReplay()

    def update_target_graph(self):
        # TODO
        return

    def choose_action(self, state, prob_random, num_episode):
        # TODO
        return action

    def run_one_episode(self, num_episode, prob_random):
        # TODO
        return experiences_episode

    def generate_target_q(self, train_state, train_action, train_reward,
                          train_next_state, train_done):
        # TODO
        return target_q

    def train_one_step(self):
        # A train batch is [[state, action, reward, next_state, done], ...]
        train_batch = self.experience_replay.sample(self.batch_size)

        # Separate the batch into a numpy array for each component
        train_state = np.array([x[0] for x in train_batch])
        train_action = np.array([x[1] for x in train_batch])
        train_reward = np.array([x[2] for x in train_batch])
        train_next_state = np.array([x[3] for x in train_batch])
        train_done = np.array([x[4] for x in train_batch])

        # Generate target Q
        target_q = self.generate_target_q(
            train_state=train_state,
            train_action=train_action,
            train_reward=train_reward,
            train_next_state=train_next_state,
            train_done=train_done
        )

        # Train the main model
        loss = self.main_qn.model.train_on_batch(train_state, target_q)
        return loss

    def train(self):
        # Make the networks equal
        self.update_target_graph()

        # We begin by acting completely randomly. As we gain experience and
        # improve, we reduce the probability of acting randomly and instead
        # take the actions our Q network suggests.
        prob_random = self.prob_random_start
        prob_random_drop = (self.prob_random_start - self.prob_random_end) / self.annealing_steps

        # Init variables
        num_steps = []  # Tracks number of steps per episode
        rewards = []  # Tracks rewards per episode
        print_every = 50  # How often to print status
        losses = [0]  # Tracks training losses
        num_episode = 0

        while True:
            # Run one episode
            experiences_episode = self.run_one_episode(num_episode, prob_random)

            # Save the episode in the replay buffer
            self.experience_replay.add(experiences_episode)

            # Once enough episodes have been played, start training
            if num_episode > self.min_pre_train_episodes:
                # Anneal the probability of a random action until
                # it reaches prob_random_end
                if prob_random > self.prob_random_end:
                    prob_random -= prob_random_drop

                # Every train_frequency episodes, train the model
                if num_episode % self.train_frequency == 0:
                    for num_epoch in range(self.num_epochs):
                        loss = self.train_one_step()
                        losses.append(loss)

                    # Update the target model with values from the main model
                    self.update_target_graph()

            # Increment the episode
            num_episode += 1
            num_steps.append(len(experiences_episode))
            rewards.append(sum([e[2] for e in experiences_episode]))

            # Print info
            if num_episode % print_every == 0:
                # datetime object containing current date and time
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print("{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}".format(
                    dt_string, num_episode, np.mean(rewards[-print_every:]),
                    prob_random, mean_loss))

            # Stop condition
            if np.mean(rewards[-print_every:]) >= self.goal:
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print("{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}".format(
                    dt_string, num_episode, np.mean(rewards[-print_every:]),
                    prob_random, mean_loss))
                print("Training complete because we reached the goal reward.")
                break
            if num_episode > self.max_num_episodes:
                print("Training stopped because we reached the max number of episodes.")
                break
import numpy as np
import tensorflow as tf  # TF1-style graph code

# GridEnv, the replay classes (ExperienceReplay, ProportionalReplay,
# RankBasedReplay), save_quantiles and the module-level settings referenced
# below are defined elsewhere in the script.


class DQN_agent:
    def __init__(self):
        self.eps = 0.1
        self.env = GridEnv(3)
        self.batch_size = 20

        if prioritized_replay and replay_type == "proportional":
            self.replay = ProportionalReplay(max_buffer_size,
                                             prioritized_replay_alpha)
        elif prioritized_replay and replay_type == "ranked":
            N_list = [self.batch_size] + [
                int(x) for x in np.linspace(100, max_buffer_size, 5)
            ]
            save_quantiles(N_list=N_list,
                           k=self.batch_size,
                           alpha=prioritized_replay_alpha)
            self.replay = RankBasedReplay(max_buffer_size,
                                          prioritized_replay_alpha)
        else:
            self.replay = ExperienceReplay(max_buffer_size)  # passing size of buffer

        # define graph
        self.inputs = tf.placeholder(tf.float32,
                                     shape=(None, self.env.state_size))
        self.target_values = tf.placeholder(tf.float32, shape=(None,))
        self.actions = tf.placeholder(tf.int32, shape=(None,))
        # importance sampling weights for prioritized replay
        self.is_weights = tf.placeholder(tf.float32, shape=(None,))
        # build the main network
        self.Q_out_op, self.Q_update_op, self.td_error_op = self.build_graph()
        # build an identical target network
        self.target_Q_out_op, _, _ = self.build_graph('target')

        self.init_op = tf.global_variables_initializer()
        self.sess = tf.Session()

    def build_graph(self, scope='main'):
        with tf.variable_scope(scope):
            h = tf.layers.dense(self.inputs, 16,
                                activation=tf.nn.relu, name="h")
            outputs = tf.layers.dense(h, self.env.num_actions,
                                      activation=tf.nn.softmax, name="outputs")

            # everything below has the same shape: (batch_size, num_actions);
            # the error is nonzero only for the selected actions
            action_mask = tf.one_hot(self.actions, self.env.num_actions,
                                     on_value=True, off_value=False)
            targets = tf.tile(tf.expand_dims(self.target_values, 1),
                              [1, self.env.num_actions])
            # takes the target value where the mask is true,
            # the network output otherwise
            target_outputs = tf.where(action_mask, targets, outputs)
            td_error = target_outputs - outputs  # only one element per row is non-zero
            weights = tf.tile(tf.expand_dims(self.is_weights, 1),
                              [1, self.env.num_actions])  # all ones when not using prioritized replay
            weighted_td_error = weights * td_error  # element-wise multiplication
            loss = tf.reduce_sum(tf.square(weighted_td_error))
            update = tf.train.AdamOptimizer().minimize(loss)
        return outputs, update, td_error

    def train(self):
        steps_per_ep = np.zeros(episodes)
        for episode in range(episodes):
            print(episode)
            self.env.reset()
            state = self.env.state
            done = False
            num_steps = 0
            while not done:
                num_steps += 1
                action = self.get_eps_action(state, self.eps)
                next_state, reward, done, _ = self.env.step(action)
                # store the transition in the experience replay
                self.replay.add((state, action, reward, next_state, done))

                # sample from the experience replay
                if prioritized_replay:
                    # linear annealing schedule for the IS weights
                    beta = beta0 + episode * (1 - beta0) / episodes
                    states, actions, rewards, next_states, dones, weights, indices = \
                        self.replay.sample(self.batch_size, beta)
                    self.net_update(states, actions, rewards, next_states,
                                    dones, weights, indices)  # Q-learning
                else:
                    states, actions, rewards, next_states, dones = \
                        self.replay.sample(self.batch_size)
                    self.net_update(states, actions, rewards,
                                    next_states, dones)  # Q-learning

                # slowly update the target network
                if num_steps % update_every == 0:
                    self.target_net_update()

                # sort the max heap periodically
                if num_steps % sort_every == 0:
                    if prioritized_replay and replay_type == "ranked":
                        self.replay.sort()

                state = next_state
            steps_per_ep[episode] = num_steps
        return steps_per_ep

    # from https://tomaxent.com/2017/07/09/Using-Tensorflow-and-Deep-Q-Network-Double-DQN-to-Play-Breakout/
    def target_net_update(self):
        # get sorted lists of parameters in each of the networks
        main_params = [
            t for t in tf.trainable_variables() if t.name.startswith("main")
        ]
        main_params = sorted(main_params, key=lambda v: v.name)
        target_params = [
            t for t in tf.trainable_variables() if t.name.startswith("target")
        ]
        target_params = sorted(target_params, key=lambda v: v.name)

        update_ops = []
        for main_v, target_v in zip(main_params, target_params):
            op = target_v.assign(main_v)
            update_ops.append(op)
        self.sess.run(update_ops)

    # minibatch Q-learning
    def net_update(self, states, actions, rewards, next_states, dones,
                   weights=None, indices=None):
        not_dones = np.logical_not(dones)

        # create a shape (batch_size,) array of target values
        target_values = rewards.astype(float)  # np.array of shape (batch_size,)
        next_inputs = next_states[not_dones]  # np.array of shape (#not_done, state_size)
        next_Qs = self.sess.run(self.Q_out_op,
                                {self.inputs: next_inputs})  # shape (#not_done, num_actions)
        max_Qs = np.max(next_Qs, axis=1)  # np.array of shape (#not_done,)
        target_values[not_dones] += gamma * max_Qs

        # if not using prioritized replay
        if weights is None:
            weights = np.ones(self.batch_size)

        # compute gradients and update parameters
        _, td_error = self.sess.run(
            [self.Q_update_op, self.td_error_op],
            {self.inputs: states,
             self.target_values: target_values,
             self.actions: actions,
             self.is_weights: weights})

        # update the prioritized-replay priorities
        if indices is not None:
            td_error = td_error.ravel()[np.flatnonzero(td_error)]  # shape (batch_size,)
            # add a small constant so zero-error transitions can still be sampled
            self.replay.update_priorities(indices, np.abs(td_error) + 1e-3)

    # returns the eps-greedy action with respect to Q
    def get_eps_action(self, state, eps):
        if self.env.np_random.uniform() < eps:
            action = self.env.sample()
        else:
            Q = self.sess.run(self.Q_out_op,
                              {self.inputs: np.array([state])})
            max_actions = np.where(np.ravel(Q) == Q.max())[0]
            # break ties among argmax actions randomly
            action = self.env.np_random.choice(max_actions)
        return action
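# The class above reads its configuration from module-level globals. A sketch
# of those settings with illustrative values (the real values live in the
# surrounding script), followed by a minimal run:
prioritized_replay = False      # True to use ProportionalReplay / RankBasedReplay
replay_type = "proportional"    # or "ranked"
max_buffer_size = 10000
prioritized_replay_alpha = 0.6  # how strongly priorities skew sampling
beta0 = 0.4                     # starting exponent for importance-sampling weights
episodes = 500
gamma = 0.99                    # discount factor
update_every = 100              # steps between target-network syncs
sort_every = 500                # steps between heap sorts (ranked replay only)

agent = DQN_agent()
agent.sess.run(agent.init_op)   # TF1 variables must be initialized before training
steps_per_ep = agent.train()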
import traceback
from typing import List, Union

import torch

# The oracle/optimizer/logger base classes, GANLogger, ExperienceReplay,
# ShiftedOracle, make_box_barriers, add_barriers_to_oracle and the global
# `device` come from the surrounding project.


def end_to_end_training(epochs: int,
                        model_cls: BaseConditionalGenerationOracle,
                        optimizer_cls: BaseOptimizer,
                        optimized_function_cls: BaseConditionalGenerationOracle,
                        logger: BaseLogger,
                        model_config: dict,
                        optimizer_config: dict,
                        n_samples_per_dim: int,
                        step_data_gen: float,
                        n_samples: int,
                        current_psi: Union[List[float], torch.Tensor],
                        reuse_optimizer: bool = False,
                        reuse_model: bool = False,
                        shift_model: bool = False,
                        finetune_model: bool = False,
                        use_experience_replay: bool = True,
                        add_box_constraints: bool = False,
                        experiment=None,
                        scale_psi=False):
    """
    :param epochs: int, number of local training steps to perform
    :param model_cls: BaseConditionalGenerationOracle, a model that is able
        to generate samples and calculate the loss function
    :param optimizer_cls: BaseOptimizer
    :param logger: BaseLogger
    :param model_config: dict
    :param optimizer_config: dict
    :param n_samples_per_dim: int
    :param step_data_gen: float
    :param n_samples: int
    :param current_psi:
    :param reuse_model:
    :param reuse_optimizer:
    :param finetune_model:
    :param shift_model:
    :return:
    """
    gan_logger = GANLogger(experiment)
    # gan_logger = RegressionLogger(experiment)
    # gan_logger = None

    y_sampler = optimized_function_cls(device=device, psi_init=current_psi)
    model = model_cls(y_model=y_sampler, **model_config,
                      logger=gan_logger).to(device)
    optimizer = optimizer_cls(oracle=model, x=current_psi, **optimizer_config)
    print(model_config)

    exp_replay = ExperienceReplay(psi_dim=model_config['psi_dim'],
                                  y_dim=model_config['y_dim'],
                                  x_dim=model_config['x_dim'],
                                  device=device)
    weights = None
    logger.log_performance(y_sampler=y_sampler,
                           current_psi=current_psi,
                           n_samples=n_samples)

    for epoch in range(epochs):
        # generate a new data sample
        x, condition = y_sampler.generate_local_data_lhs(
            n_samples_per_dim=n_samples_per_dim,
            step=step_data_gen,
            current_psi=current_psi,
            n_samples=n_samples)
        if x is None and condition is None:
            print("Empty training set, continue")
            continue

        x_exp_replay, condition_exp_replay = exp_replay.extract(
            psi=current_psi, step=step_data_gen)
        exp_replay.add(y=x, condition=condition)
        x = torch.cat([x, x_exp_replay], dim=0)
        condition = torch.cat([condition, condition_exp_replay], dim=0)
        used_samples = n_samples

        # breaking things
        if model_config.get("predict_risk", False):
            condition = condition[::n_samples_per_dim, :current_psi.shape[0]]
            x = y_sampler.func(
                condition,
                num_repetitions=n_samples_per_dim).reshape(-1, x.shape[1])
            print(x.shape, condition.shape)

        # Scale the training set
        if scale_psi:
            scale_factor = 10
            feature_max = condition[:, :model_config['psi_dim']].max(axis=0)[0]
            y_sampler.scale_factor = scale_factor
            y_sampler.feature_max = feature_max
            y_sampler.scale_psi = True
            print("MAX FEATURES", feature_max)
            condition[:, :model_config['psi_dim']] /= feature_max * scale_factor
            current_psi = current_psi / feature_max * scale_factor
            print(feature_max.shape, current_psi.shape)
            print("MAX PSI", current_psi)

        model.train()
        if reuse_model:
            if shift_model:
                if isinstance(model, ShiftedOracle):
                    model.set_shift(current_psi.clone().detach())
                else:
                    model = ShiftedOracle(oracle=model,
                                          shift=current_psi.clone().detach())
                model.fit(x, condition=condition, weights=weights)
            else:
                model.fit(x, condition=condition, weights=weights)
        else:
            # if we are not reusing the model,
            # re-initialize and re-fit it at each epoch
            model = model_cls(y_model=y_sampler, **model_config,
                              logger=gan_logger).to(device)
            print("y_shape: {}, cond: {}".format(x.shape, condition.shape))
            model.fit(x, condition=condition, weights=weights)
        model.eval()

        if reuse_optimizer:
            optimizer.update(oracle=model, x=current_psi)
        else:
            # find new psi
            optimizer = optimizer_cls(oracle=model, x=current_psi,
                                      **optimizer_config)

        if add_box_constraints:
            box_barriers = make_box_barriers(current_psi, step_data_gen)
            add_barriers_to_oracle(oracle=model, barriers=box_barriers)

        previous_psi = current_psi.clone()
        current_psi, status, history = optimizer.optimize()

        if scale_psi:
            current_psi, status, history = optimizer.optimize()
            current_psi = current_psi / scale_factor * feature_max
            y_sampler.scale_psi = False

        print("NEW_PSI: ", current_psi)
        try:
            # log the optimization, i.e. statistics of psi
            logger.log_grads(model, y_sampler, current_psi,
                             n_samples_per_dim, log_grad_diff=False)
            logger.log_optimizer(optimizer)
            logger.log_performance(y_sampler=y_sampler,
                                   current_psi=current_psi,
                                   n_samples=n_samples)
            experiment.log_metric("used_samples_per_step", used_samples)
            experiment.log_metric("sample_size", len(x))
        except Exception as e:
            print(e)
            print(traceback.format_exc())
            # raise

        torch.cuda.empty_cache()

    logger.func_saver.join()
    return
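# A hypothetical invocation sketch. GANOracle, LBFGSOptimizer, SimulatorModel
# and base_logger are illustrative placeholders for the project's
# BaseConditionalGenerationOracle / BaseOptimizer subclasses and logger; only
# the model_config keys (psi_dim, y_dim, x_dim) are taken from the code above.
end_to_end_training(
    epochs=50,
    model_cls=GANOracle,
    optimizer_cls=LBFGSOptimizer,
    optimized_function_cls=SimulatorModel,
    logger=base_logger,
    model_config={'psi_dim': 10, 'y_dim': 1, 'x_dim': 3},
    optimizer_config={},
    n_samples_per_dim=3000,
    step_data_gen=0.1,
    n_samples=5,
    current_psi=torch.zeros(10),
    use_experience_replay=True,
)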