def create_model(input_shape, num_actions, model_name, create_network_fn, learning_rate):
    """Create the Q-network model."""
    with tf.name_scope(model_name):
        input_frames = tf.placeholder(tf.float32, [None, input_shape],
                                      name='input_frames')
        q_network, network_parameters = create_network_fn(
            input_frames, input_shape, num_actions)
        mean_max_Q = tf.reduce_mean(
            tf.reduce_max(q_network, axis=[1]), name='mean_max_Q')
        Q_vector_indexes = tf.placeholder(tf.int32, [None, 2],
                                          name='Q_vector_indexes')
        gathered_outputs = tf.gather_nd(q_network, Q_vector_indexes,
                                        name='gathered_outputs')
        y_ph = tf.placeholder(tf.float32, name='y_ph')
        loss = mean_huber_loss(y_ph, gathered_outputs)
        train_step = tf.train.RMSPropOptimizer(
            learning_rate, decay=RMSP_DECAY, momentum=RMSP_MOMENTUM,
            epsilon=RMSP_EPSILON).minimize(loss)

        model = {
            'q_network': q_network,
            'input_frames': input_frames,
            'Q_vector_indexes': Q_vector_indexes,
            'y_ph': y_ph,
            'train_step': train_step,
            'mean_max_Q': mean_max_Q,
        }
    return model, network_parameters
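# `create_model` depends on a `create_network_fn` callable and on module-level
# RMSProp constants (RMSP_DECAY, RMSP_MOMENTUM, RMSP_EPSILON) that are defined
# elsewhere in this repository. Below is a minimal sketch of what such a network
# builder and constants might look like; the hidden-layer size, the hyperparameter
# values, and the name `create_fully_connected_network` are illustrative
# assumptions, not the repository's actual settings.
import tensorflow as tf

# Assumed RMSProp hyperparameters; the real values live elsewhere in the repo.
RMSP_DECAY = 0.95
RMSP_MOMENTUM = 0.95
RMSP_EPSILON = 0.01


def create_fully_connected_network(input_frames, input_shape, num_actions):
    """Hypothetical create_network_fn: one hidden layer, returns (q_network, parameters)."""
    W1 = tf.Variable(tf.truncated_normal([input_shape, 256], stddev=0.1), name='W1')
    b1 = tf.Variable(tf.constant(0.1, shape=[256]), name='b1')
    hidden = tf.nn.relu(tf.matmul(input_frames, W1) + b1)
    W2 = tf.Variable(tf.truncated_normal([256, num_actions], stddev=0.1), name='W2')
    b2 = tf.Variable(tf.constant(0.1, shape=[num_actions]), name='b2')
    q_network = tf.matmul(hidden, W2) + b2
    return q_network, [W1, b1, W2, b2]


# Usage sketch (flattened 84x84x4 Atari input, 6 actions):
# model, params = create_model(
#     input_shape=84 * 84 * 4, num_actions=6, model_name='source',
#     create_network_fn=create_fully_connected_network, learning_rate=0.00025)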
def testHuberLoss():
    with tf.Session() as sess:
        y_true = tf.constant([0.7, 0.3, 0.8, 0.1])
        y_pred = tf.constant([2.1, 0.4, 0.9, 3.2])
        loss = sess.run(huber_loss(y_true, y_pred))
        mean_loss = sess.run(mean_huber_loss(y_true, y_pred))
        assert np.isclose(loss, [0.9, 0.005, 0.005, 2.6]).all()
        assert np.isclose(mean_loss, 0.8775)
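# The test above pins down the loss the repository expects: a Huber loss with a
# threshold of 1 (quadratic below the threshold, linear above it) and its mean
# over all entries. Below is a minimal sketch of helpers consistent with those
# test values; the `_sketch` names and the `max_grad` parameter are assumptions,
# and the repository's own huber_loss/mean_huber_loss may differ in signature.
def huber_loss_sketch(y_true, y_pred, max_grad=1.0):
    """Elementwise Huber loss: 0.5*err^2 below the threshold, linear above it."""
    err = tf.abs(y_true - y_pred)
    quadratic = 0.5 * tf.square(err)
    linear = max_grad * (err - 0.5 * max_grad)
    return tf.where(err <= max_grad, quadratic, linear)


def mean_huber_loss_sketch(y_true, y_pred, max_grad=1.0):
    """Mean of the elementwise Huber loss over all entries."""
    return tf.reduce_mean(huber_loss_sketch(y_true, y_pred, max_grad))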
def create_dueling_dqn_model(self, window, input_shape, num_actions, model_name):
    """Create the dueling Q-network model."""
    with tf.name_scope(model_name) as scope:
        # Input and target placeholders.
        self.x = tf.placeholder(
            tf.float32,
            shape=[None, input_shape[0], input_shape[1], window],
            name='input_state')
        self.y_true = tf.placeholder(tf.float32, shape=[None, 1],
                                     name='target_q_val')

        # conv1 layer
        self.W_conv1 = tf.Variable(tf.truncated_normal([8, 8, window, 16],
                                                       stddev=0.1), name='W_conv1')
        self.b_conv1 = tf.Variable(tf.constant(0.1, shape=[16]), name='b_conv1')
        self.conv1 = tf.nn.conv2d(
            self.x, self.W_conv1, strides=[1, 4, 4, 1],
            padding='VALID') + self.b_conv1
        self.relu_conv1 = tf.nn.relu(self.conv1)

        # conv2 layer
        self.W_conv2 = tf.Variable(tf.truncated_normal([4, 4, 16, 32],
                                                       stddev=0.1), name='W_conv2')
        self.b_conv2 = tf.Variable(tf.constant(0.1, shape=[32]), name='b_conv2')
        self.conv2 = tf.nn.conv2d(self.relu_conv1, self.W_conv2,
                                  strides=[1, 2, 2, 1],
                                  padding='VALID') + self.b_conv2
        self.relu_conv2 = tf.nn.relu(self.conv2)
        self.relu_conv2_flat = tf.reshape(self.relu_conv2, [-1, 9 * 9 * 32])

        # The convolutional layers above are shared; the fully connected layers
        # split into separate advantage and value streams, each with 512 hidden
        # units (dueling architecture).

        # Advantage stream.
        self.W_fc3_adv = tf.Variable(tf.truncated_normal([9 * 9 * 32, 512],
                                                         stddev=0.1), name='W_fc3_adv')
        self.b_fc3_adv = tf.Variable(tf.constant(0.1, shape=[512]), name='b_fc3_adv')
        self.fc3_adv = tf.matmul(self.relu_conv2_flat, self.W_fc3_adv) + self.b_fc3_adv
        self.relu_fc3_adv = tf.nn.relu(self.fc3_adv)

        self.W_fc4_adv = tf.Variable(tf.truncated_normal(
            [512, num_actions], stddev=0.1), name='W_fc4_adv')
        self.b_fc4_adv = tf.Variable(tf.constant(0.1, shape=[num_actions]),
                                     name='b_fc4_adv')
        self.fc4_adv = tf.matmul(self.relu_fc3_adv, self.W_fc4_adv) + self.b_fc4_adv

        # Value stream.
        self.W_fc3_val = tf.Variable(tf.truncated_normal([9 * 9 * 32, 512],
                                                         stddev=0.1), name='W_fc3_val')
        self.b_fc3_val = tf.Variable(tf.constant(0.1, shape=[512]), name='b_fc3_val')
        self.fc3_val = tf.matmul(self.relu_conv2_flat, self.W_fc3_val) + self.b_fc3_val
        self.relu_fc3_val = tf.nn.relu(self.fc3_val)

        self.W_fc4_val = tf.Variable(tf.truncated_normal([512, 1], stddev=0.1),
                                     name='W_fc4_val')
        self.b_fc4_val = tf.Variable(tf.constant(0.1, shape=[1]), name='b_fc4_val')
        self.fc4_val = tf.matmul(self.relu_fc3_val, self.W_fc4_val) + self.b_fc4_val

        # Merge into Q values, subtracting the per-state mean advantage over the
        # actions to disambiguate the value and advantage streams.
        self.pred_q = tf.add(
            self.fc4_val,
            tf.subtract(self.fc4_adv,
                        tf.reduce_mean(self.fc4_adv, axis=1, keep_dims=True)),
            name='pred_q')

        # Selected action is a one-hot encoding of which actions were chosen.
        self.selected_action = tf.placeholder(
            tf.float32, shape=[None, num_actions], name='selected_action')

        # For the source network only (not the target network).
        if model_name.startswith('source'):
            # Predicted Q at the executed action, kept as [batch, 1] to match
            # y_true's shape.
            self.pred_y = tf.reduce_sum(tf.multiply(
                self.pred_q, self.selected_action), axis=1, keep_dims=True)

            # Loss value.
            self.loss = mean_huber_loss(self.y_true, self.pred_y)

            # Evaluation reward.
            self.accumulated_avg_reward = tf.placeholder(
                tf.float32, shape=(), name='accumulated_avg_reward')

            # Train with Adam.
            self.train = tf.train.AdamOptimizer(1e-4).minimize(
                self.loss, name='Adam_minimizer')

            self.maxq_summary = tf.summary.scalar(
                'Max_Q', tf.reduce_max(self.pred_q))
            self.loss_summary = tf.summary.scalar('Loss', self.loss)
            self.merged = tf.summary.merge_all()

            # Placeholder and summary for training reward.
            self.train_reward_val = tf.placeholder(tf.float32, shape=(),
                                                   name='train_reward_val')
            self.reward_train_summary = tf.summary.scalar(
                'Training_reward', self.train_reward_val)
            self.reward_summary = tf.summary.scalar(
                'Average_reward', self.accumulated_avg_reward)
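# The `model_name.startswith('source')` branch implies a separate target network
# whose weights are periodically copied from the source network. Below is a
# minimal sketch of how that sync might be built, assuming both models are
# constructed with tf.Variable under name scopes whose names begin with 'source'
# and 'target'; the helper name and the scope prefixes are assumptions, and the
# repository's actual sync logic may differ.
def make_target_sync_ops(source_scope='source', target_scope='target'):
    """Build assign ops that copy every source variable onto its target twin."""
    source_vars = sorted(
        [v for v in tf.global_variables() if v.op.name.startswith(source_scope)],
        key=lambda v: v.op.name)
    target_vars = sorted(
        [v for v in tf.global_variables() if v.op.name.startswith(target_scope)],
        key=lambda v: v.op.name)
    return [t.assign(s) for s, t in zip(source_vars, target_vars)]


# Usage sketch: build the ops once, then run them every fixed number of steps.
# sync_ops = make_target_sync_ops()
# sess.run(sync_ops)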
def create_dqn_model(self, window, input_shape, num_actions, model_name):
    """Create the Q-network model.

    We highly recommend that you use tf.name_scope as discussed in class
    when creating the model and the layers. This will make it far easier
    to understand your network architecture if you are logging with
    tensorboard.

    Parameters
    ----------
    window: int
      Each input to the network is a sequence of frames. This value defines
      how many frames are in the sequence.
    input_shape: tuple(int, int)
      The expected input image size.
    num_actions: int
      Number of possible actions. Defined by the gym environment.
    model_name: str
      Name scope for the model (e.g. 'source' or 'target').
    """
    with tf.name_scope(model_name) as scope:
        # Input and target placeholders.
        self.x = tf.placeholder(
            tf.float32,
            shape=[None, input_shape[0], input_shape[1], window],
            name='input_state')
        self.y_true = tf.placeholder(tf.float32, shape=[None, 1],
                                     name='target_q_val')

        # conv1 layer
        self.W_conv1 = tf.Variable(tf.truncated_normal([8, 8, window, 16],
                                                       stddev=0.1), name='W_conv1')
        self.b_conv1 = tf.Variable(tf.constant(0.1, shape=[16]), name='b_conv1')
        self.conv1 = tf.nn.conv2d(
            self.x, self.W_conv1, strides=[1, 4, 4, 1],
            padding='VALID') + self.b_conv1
        self.relu_conv1 = tf.nn.relu(self.conv1)

        # conv2 layer
        self.W_conv2 = tf.Variable(tf.truncated_normal([4, 4, 16, 32],
                                                       stddev=0.1), name='W_conv2')
        self.b_conv2 = tf.Variable(tf.constant(0.1, shape=[32]), name='b_conv2')
        self.conv2 = tf.nn.conv2d(self.relu_conv1, self.W_conv2,
                                  strides=[1, 2, 2, 1],
                                  padding='VALID') + self.b_conv2
        self.relu_conv2 = tf.nn.relu(self.conv2)
        self.relu_conv2_flat = tf.reshape(self.relu_conv2, [-1, 9 * 9 * 32])

        # fc3 layer
        self.W_fc3 = tf.Variable(tf.truncated_normal([9 * 9 * 32, 256],
                                                     stddev=0.1), name='W_fc3')
        self.b_fc3 = tf.Variable(tf.constant(0.1, shape=[256]), name='b_fc3')
        self.fc3 = tf.matmul(self.relu_conv2_flat, self.W_fc3) + self.b_fc3
        self.relu_fc3 = tf.nn.relu(self.fc3)

        # output layer
        self.W_fc4 = tf.Variable(tf.truncated_normal([256, num_actions],
                                                     stddev=0.1), name='W_output')
        self.b_fc4 = tf.Variable(tf.constant(0.1, shape=[num_actions]),
                                 name='b_output')

        # Selected action is a one-hot encoding of which actions were chosen.
        self.selected_action = tf.placeholder(
            tf.float32, shape=[None, num_actions], name='selected_action')

        # Extract predicted Q values.
        self.pred_q = tf.add(tf.matmul(self.relu_fc3, self.W_fc4), self.b_fc4,
                             name='pred_q')

        # For the source network only (not the target network).
        if model_name.startswith('source'):
            # Predicted Q at the executed action, kept as [batch, 1] to match
            # y_true's shape.
            self.pred_y = tf.reduce_sum(tf.multiply(
                self.pred_q, self.selected_action), axis=1, keep_dims=True)

            # Loss value.
            self.loss = mean_huber_loss(self.y_true, self.pred_y)

            # Evaluation reward.
            self.accumulated_avg_reward = tf.placeholder(
                tf.float32, shape=(), name='accumulated_avg_reward')

            # Train with Adam.
            self.train = tf.train.AdamOptimizer(1e-4).minimize(
                self.loss, name='Adam_minimizer')

            self.maxq_summary = tf.summary.scalar(
                'Max_Q', tf.reduce_max(self.pred_q))
            self.loss_summary = tf.summary.scalar('Loss', self.loss)
            self.merged = tf.summary.merge_all()

            # Placeholder and summary for training reward.
            self.train_reward_val = tf.placeholder(tf.float32, shape=(),
                                                   name='train_reward_val')
            self.reward_train_summary = tf.summary.scalar(
                'Training_reward', self.train_reward_val)
            self.reward_summary = tf.summary.scalar(
                'Average_reward', self.accumulated_avg_reward)
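# The placeholders above (y_true, selected_action) imply a standard DQN update:
# targets come from the target network's maximum Q value, and the executed
# actions are fed as a one-hot mask. Below is a minimal sketch of a single
# training step under those assumptions; the helper name, the `source_model`/
# `target_model` arguments, the batch layout, and the discount factor value are
# illustrative and not taken from the repository.
import numpy as np

GAMMA = 0.99  # assumed discount factor


def train_step_sketch(sess, source_model, target_model, batch):
    """One DQN update: y = r + gamma * max_a Q_target(s', a), masked by the taken action."""
    # batch is assumed to be (states, actions, rewards, next_states, terminals),
    # with terminals as a 0/1 float array.
    states, actions, rewards, next_states, terminals = batch

    # Bootstrap targets from the target network; zero them at terminal states.
    next_q = sess.run(target_model.pred_q, feed_dict={target_model.x: next_states})
    y = rewards + GAMMA * (1.0 - terminals) * np.max(next_q, axis=1)

    # One-hot encode the executed actions for the selected_action mask.
    num_actions = next_q.shape[1]
    action_one_hot = np.eye(num_actions)[actions]

    loss, _ = sess.run(
        [source_model.loss, source_model.train],
        feed_dict={source_model.x: states,
                   source_model.y_true: y.reshape(-1, 1),
                   source_model.selected_action: action_one_hot})
    return loss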
def create_linear_model(self, window, input_shape, num_actions, model_name):
    """Create the linear Q-network model.

    Parameters
    ----------
    window: int
      Each input to the network is a sequence of frames. This value defines
      how many frames are in the sequence.
    input_shape: tuple(int, int)
      The expected input image size.
    num_actions: int
      Number of possible actions. Defined by the gym environment.
    model_name: str
      Name scope for the model (e.g. 'source' or 'target').
    """
    with tf.name_scope(model_name) as scope:
        # Input and target placeholders.
        self.x = tf.placeholder(
            tf.float32,
            shape=[None, input_shape[0], input_shape[1], window],
            name='input_state')
        self.y_true = tf.placeholder(tf.float32, shape=[None, 1],
                                     name='target_q_val')

        # Flatten the input (e.g. 84 * 84 * 4 for stacked Atari frames).
        flat_size = input_shape[0] * input_shape[1] * window
        self.x_flat = tf.reshape(self.x, [-1, flat_size], name='flat_input')

        # Linear layer.
        self.W = tf.Variable(tf.truncated_normal(
            [flat_size, num_actions], stddev=0.1), name='weight')
        self.b = tf.Variable(tf.constant(0.1, shape=[num_actions]), name='bias')

        # Extract predicted Q values.
        self.pred_q = tf.add(tf.matmul(self.x_flat, self.W), self.b,
                             name='pred_q')

        # Selected action is a one-hot encoding of which actions were chosen.
        self.selected_action = tf.placeholder(
            tf.float32, shape=[None, num_actions], name='selected_action')

        # Create the following ops and summaries only for the source network.
        if model_name.startswith('source'):
            # Predicted y: Q values for the selected action, kept as [batch, 1]
            # to match y_true's shape.
            self.pred_y = tf.reduce_sum(tf.multiply(
                self.pred_q, self.selected_action), axis=1, keep_dims=True)

            # Loss.
            self.loss = mean_huber_loss(self.y_true, self.pred_y)

            # Evaluation reward.
            self.accumulated_avg_reward = tf.placeholder(
                tf.float32, shape=(), name='accumulated_avg_reward')

            # Train with Adam.
            self.train = tf.train.AdamOptimizer(1e-4).minimize(
                self.loss, name='Adam_minimizer')

            self.maxq_summary = tf.summary.scalar(
                'Max_Q', tf.reduce_max(self.pred_q))
            self.loss_summary = tf.summary.scalar('Loss', self.loss)
            self.merged = tf.summary.merge_all()

            # Placeholder and summary for training reward.
            self.train_reward_val = tf.placeholder(tf.float32, shape=(),
                                                   name='train_reward_val')
            self.reward_train_summary = tf.summary.scalar(
                'Training_reward', self.train_reward_val)

            # Evaluation reward summary.
            self.reward_summary = tf.summary.scalar(
                'Average_reward', self.accumulated_avg_reward)
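# All three models define Max_Q, Loss, and reward scalar summaries; those are
# only useful once written out for TensorBoard. Below is a minimal usage sketch,
# assuming a session, a built source model, and a batch feed_dict that covers x,
# y_true, and selected_action; the helper name, the log directory, and the step
# counters are illustrative assumptions.
def write_summaries_sketch(sess, model, writer, step, batch_feed,
                           episode_reward, eval_avg_reward):
    """Write the merged loss/Max_Q summaries plus the training and evaluation reward scalars."""
    # Merged summaries (Loss, Max_Q) need a full training batch to evaluate.
    summary = sess.run(model.merged, feed_dict=batch_feed)
    writer.add_summary(summary, global_step=step)

    # Reward scalars are fed directly from the training/evaluation loops.
    writer.add_summary(
        sess.run(model.reward_train_summary,
                 feed_dict={model.train_reward_val: episode_reward}),
        global_step=step)
    writer.add_summary(
        sess.run(model.reward_summary,
                 feed_dict={model.accumulated_avg_reward: eval_avg_reward}),
        global_step=step)


# Usage sketch:
# writer = tf.summary.FileWriter('logs/dqn_run', sess.graph)
# write_summaries_sketch(sess, agent, writer, step, feed, episode_reward, eval_avg_reward)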