def setConfig():
  config = network_config.getConfig()
  config['train_dir'] = 'cifar10_train_highway'
  # Vinh: use ResNet's max steps (200 epochs; earlier values: 64000, 1000000).
  config['max_steps'] = 78125
  config['log_device_placement'] = False
  config['batch_size'] = 128
  config['data_dir'] = 'cifar10_data'
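# Sanity check on max_steps above (an illustrative sketch, not part of the
# original pipeline): with the 50,000 CIFAR-10 training images and a batch
# size of 128, one epoch is 50000 / 128 = 390.625 steps, so 200 epochs is
# 200 * 390.625 = 78125 steps, which is where the value comes from.
def _check_max_steps():
  num_train_examples = 50000  # CIFAR-10 training set size
  batch_size = 128
  epochs = 200
  steps = epochs * num_train_examples / batch_size
  assert steps == 78125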
def inputs(eval_data):
  """Construct input for CIFAR evaluation using the Reader ops.

  Args:
    eval_data: bool, indicating if one should use the train or eval data set.

  Returns:
    images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
    labels: Labels. 1D tensor of [batch_size] size.

  Raises:
    ValueError: If no data_dir.
  """
  config = network_config.getConfig()
  data_dir = config['data_dir']
  batch_size = config['batch_size']

  # Vinh: debug
  print("The batch size for input test is %d" % batch_size)

  # if not FLAGS.data_dir:
  #   raise ValueError('Please supply a data_dir')
  data_dir = os.path.join(data_dir, 'cifar-10-batches-bin')
  return cifar10_input.inputs(eval_data=eval_data, data_dir=data_dir,
                              batch_size=batch_size)
def main(argv=None):  # pylint: disable=unused-argument
  cifar10_train.setConfig()
  setConfig()
  eval_dir = network_config.getConfig()['eval_dir']
  cifar10.maybe_download_and_extract()
  if gfile.Exists(eval_dir):
    gfile.DeleteRecursively(eval_dir)
  gfile.MakeDirs(eval_dir)
  evaluate()
def main(argv=None):  # pylint: disable=unused-argument
  setConfig()
  config = network_config.getConfig()
  train_dir = config['train_dir']
  cifar10.maybe_download_and_extract()
  if gfile.Exists(train_dir):
    gfile.DeleteRecursively(train_dir)
  gfile.MakeDirs(train_dir)
  train()
def setConfig():
  config = network_config.getConfig()
  config['eval_dir'] = 'cifar10_eval'
  config['eval_data'] = 'test'
  config['checkpoint_dir'] = 'cifar10_train'
  config['eval_interval_secs'] = 60 * 5
  config['num_examples'] = 10000
  config['run_once'] = False
  # To make sure all the test instances are evaluated.
  config['batch_size'] = 100
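# Why batch_size = 100 covers every test instance (an illustrative sketch,
# not part of the original pipeline): eval_once() runs
# ceil(num_examples / batch_size) batches, so a batch size that divides
# 10000 evenly gives num_iter * batch_size == num_examples, and no example
# is dropped or double-counted.
def _check_eval_batch_coverage():
  import math
  num_examples, batch_size = 10000, 100
  num_iter = int(math.ceil(num_examples / float(batch_size)))  # 100 batches
  assert num_iter * batch_size == num_examples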
def main(argv=None):  # pylint: disable=unused-argument
  # Have to set the config first.
  # TODO: remove the need for this; check how Python initializes a module.
  setConfig()
  cifar10.maybe_download_and_extract()
  config = network_config.getConfig()
  train_dir = config['train_dir']
  if gfile.Exists(train_dir):
    gfile.DeleteRecursively(train_dir)
  gfile.MakeDirs(train_dir)
  train()
def loss(logits, labels):
  """Add L2Loss to all the trainable variables.

  Add summary for "Loss" and "Loss/avg".

  Args:
    logits: Logits from inference().
    labels: Labels from distorted_inputs or inputs(). 1-D tensor
            of shape [batch_size]

  Returns:
    Loss tensor of type float.
  """
  # Reshape the labels into a dense Tensor of
  # shape [batch_size, NUM_CLASSES].
  config = network_config.getConfig()
  batch_size = config['batch_size']
  sparse_labels = tf.reshape(labels, [batch_size, 1])
  indices = tf.reshape(tf.range(batch_size), [batch_size, 1])
  concated = tf.concat(1, [indices, sparse_labels])
  dense_labels = tf.sparse_to_dense(concated,
                                    [batch_size, NUM_CLASSES],
                                    1.0, 0.0)

  # Vinh: the NaN problem turns out to be exactly as I suspected:
  # http://stackoverflow.com/questions/33712178/tensorflow-nan-bug
  # Can't prevent NaN with this method because I have no control over the
  # scaled logits. Why didn't they prevent this in their (presumably C++?)
  # implementation of softmax_cross_entropy_with_logits?
  # cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, onehot_labels)
  # Thanks to the solution from the above link. The NaN fix makes the loss
  # converge very slowly:
  # logits = tf.clip_by_value(tf.nn.softmax(logits), 1e-10, 1.0)
  # cross_entropy = -onehot_labels * tf.log(logits)
  # Why does it change so much in this case?
  # loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
  #
  # In the case of the highway network, reduce_sum converges much faster
  # than reduce_mean, which is a bit strange because reduce_mean is supposed
  # to reduce the variance in stochastic gradient descent. Will fix this
  # after I get back, or tomorrow.
  # loss = tf.reduce_sum(cross_entropy, name='xentropy_sum')

  # Calculate the average cross entropy loss across the batch.
  cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
      logits, dense_labels, name='cross_entropy_per_example')
  cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
  tf.add_to_collection('losses', cross_entropy_mean)

  # The total loss is defined as the cross entropy loss plus all of the weight
  # decay terms (L2 loss).
  return tf.add_n(tf.get_collection('losses'), name='total_loss')
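# What the sparse-to-dense conversion in loss() produces, shown with plain
# numpy (an illustrative sketch, not used by the model): each label index
# becomes a one-hot row, which is the dense_labels tensor that
# softmax_cross_entropy_with_logits consumes in this TF version.
def _demo_dense_labels():
  import numpy as np
  labels = np.array([3, 0, 2])  # stand-in for a [batch_size] labels tensor
  num_classes = 4
  dense = np.zeros((labels.size, num_classes), dtype=np.float32)
  dense[np.arange(labels.size), labels] = 1.0
  # dense == [[0, 0, 0, 1], [1, 0, 0, 0], [0, 0, 1, 0]]
  return dense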
def distorted_inputs():
  """Construct distorted input for CIFAR training using the Reader ops.

  Returns:
    images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
    labels: Labels. 1D tensor of [batch_size] size.

  Raises:
    ValueError: If no data_dir.
  """
  # if not FLAGS.data_dir:
  #   raise ValueError('Please supply a data_dir')
  config = network_config.getConfig()
  data_dir = config['data_dir']
  batch_size = config['batch_size']
  data_dir = os.path.join(data_dir, 'cifar-10-batches-bin')
  return cifar10_input.distorted_inputs(data_dir=data_dir,
                                        batch_size=batch_size)
def maybe_download_and_extract():
  """Download and extract the tarball from Alex's website."""
  config = network_config.getConfig()
  data_dir = config['data_dir']
  dest_directory = data_dir
  if not os.path.exists(dest_directory):
    os.makedirs(dest_directory)
  filename = DATA_URL.split('/')[-1]
  filepath = os.path.join(dest_directory, filename)
  if not os.path.exists(filepath):
    def _progress(count, block_size, total_size):
      sys.stdout.write('\r>> Downloading %s %.1f%%' %
                       (filename,
                        float(count * block_size) / float(total_size) * 100.0))
      sys.stdout.flush()
    filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath,
                                             reporthook=_progress)
    print()
    statinfo = os.stat(filepath)
    print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
    tarfile.open(filepath, 'r:gz').extractall(dest_directory)
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(), tf.device('/cpu:0'): # Create a variable to count the number of train() calls. This equals the # number of batches processed * FLAGS.num_gpus. global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False) config = network_config.getConfig() batch_size = config['batch_size'] num_gpus = config['num_gpus'] log_device_placement = config['log_device_placement'] train_dir = config['train_dir'] max_steps = config['max_steps'] # Calculate the learning rate schedule. # num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / # batch_size) # decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY) # # # Decay the learning rate exponentially based on the number of steps. # lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE, # global_step, # decay_steps, # cifar10.LEARNING_RATE_DECAY_FACTOR, # staircase=True) # Vinh: use ResNet's simple learning rate lr = tf.placeholder(tf.float32, [], "learning_rate") momentum = 0.9 # Create an optimizer that performs gradient descent. opt = tf.train.GradientDescentOptimizer(lr) # Vinh: Use momentum as in ResNet #opt = tf.train.MomentumOptimizer(lr, momentum) # Calculate the gradients for each model tower. tower_grads = [] for i in xrange(num_gpus): with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope: # Calculate the loss for one tower of the CIFAR model. This function # constructs the entire CIFAR model but shares the variables across # all towers. loss = tower_loss(scope) # Reuse variables for the next tower. tf.get_variable_scope().reuse_variables() # Retain the summaries from the final tower. summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) # Calculate the gradients for the batch of data on this CIFAR tower. grads = opt.compute_gradients(loss) # Keep track of the gradients across all towers. tower_grads.append(grads) # We must calculate the mean of each gradient. Note that this is the # synchronization point across all towers. grads = average_gradients(tower_grads) # Add a summary to track the learning rate. summaries.append(tf.scalar_summary('learning_rate', lr)) # Add histograms for gradients. for grad, var in grads: if grad: summaries.append( tf.histogram_summary(var.op.name + '/gradients', grad)) # Apply the gradients to adjust the shared variables. apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) # Add histograms for trainable variables. for var in tf.trainable_variables(): summaries.append(tf.histogram_summary(var.op.name, var)) # Track the moving averages of all trainable variables. variable_averages = tf.train.ExponentialMovingAverage( cifar10.MOVING_AVERAGE_DECAY, global_step) # Vinh: don't use moving average for weights for now to have # fair comparison with ResNet paper. Well, I'll just save them here # and I'll do 2 evaluations: one without restoring them and one # with: variables_averages_op = variable_averages.apply( tf.trainable_variables()) batchNormCol = tf.get_collection(BatchNormLayer.batchNormCollectionID) batchNormAverageOp = variable_averages.apply(batchNormCol) # Group all updates to into a single train op. train_op = tf.group(apply_gradient_op, variables_averages_op, batchNormAverageOp) # Create a saver. saver = tf.train.Saver(tf.all_variables()) # Build the summary operation from the last tower summaries. summary_op = tf.merge_summary(summaries) # Build an initialization operation to run below. 
init = tf.initialize_all_variables() # Start running operations on the Graph. allow_soft_placement must be set to # True to build towers on GPU, as some of the ops do not have GPU # implementations. sess = tf.Session( config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=log_device_placement)) sess.run(init) # Start the queue runners. tf.train.start_queue_runners(sess=sess) summary_writer = tf.train.SummaryWriter(train_dir, graph_def=sess.graph_def) # Vinh: change learning rates at steps 32k and 48k, terminating # at step 64k (counts from 1) (as in ResNet paper) feed_dict = {lr: 0.01} for step in xrange(max_steps): start_time = time.time() if (step + 1) == 800: feed_dict = {lr: 0.1} elif (step + 1) == 32000: feed_dict = {lr: 0.01} elif (step + 1) == 48000: feed_dict = {lr: 0.001} _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = batch_size * num_gpus examples_per_sec = num_examples_per_step / duration sec_per_batch = duration / num_gpus format_str = ( '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) if step % 100 == 0: summary_str = sess.run(summary_op, feed_dict) summary_writer.add_summary(summary_str, step) # Save the model checkpoint periodically. if step % 1000 == 0 or (step + 1) == max_steps: checkpoint_path = os.path.join(train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step)
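# average_gradients() above is the cross-tower synchronization point; it is
# defined elsewhere in this repo. A minimal sketch of the idea, following the
# stock TensorFlow multi-GPU CIFAR-10 example (assumes every tower lists its
# (gradient, variable) pairs in the same order):
def _average_gradients_sketch(tower_grads):
  average_grads = []
  for grad_and_vars in zip(*tower_grads):
    # grad_and_vars is ((grad0_gpu0, var0_gpu0), ..., (grad0_gpuN, var0_gpuN)).
    grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
    # Stack the per-tower gradients along a new dimension and average them.
    grad = tf.reduce_mean(tf.concat(0, grads), 0)
    # Variables are shared across towers, so the first tower's var suffices.
    average_grads.append((grad, grad_and_vars[0][1]))
  return average_grads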
def eval_once(saver, summary_writer, top_k_op, summary_op, ema):
  """Run Eval once.

  Args:
    saver: Saver.
    summary_writer: Summary writer.
    top_k_op: Top K op.
    summary_op: Summary op.
    ema: ExponentialMovingAverage used to name the batch norm
      moving-average variables to restore.
  """
  config = network_config.getConfig()
  checkpoint_dir = config['checkpoint_dir']
  num_examples = config['num_examples']
  batch_size = config['batch_size']

  with tf.Session() as sess:
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    if ckpt and ckpt.model_checkpoint_path:
      # Restores from checkpoint.
      saver.restore(sess, ckpt.model_checkpoint_path)
      # Assuming model_checkpoint_path looks something like:
      #   /my-favorite-path/cifar10_train/model.ckpt-0,
      # extract global_step from it.
      global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]

      print('Gonna restore for batch norm here')
      # Vinh: restore the moving averages of batch norm's mean and variance.
      # It seems the normal Saver object doesn't restore the moving averages
      # of untrainable variables, so I have to hack around it by borrowing
      # some ideas from moving_average()'s source code.
      # TODO: temporarily commented out because I forgot to make the change
      # in the BatchNorm layer with regard to get_variable().
      # TODO: test the two options: using the population mean and variance,
      # and using the mini-batch mean and variance.
      # untrainableVars = list(set(variables.all_variables()) -
      #                        set(variables.trainable_variables()))
      # variables_to_restore = {val.name: val  # {ema.average_name(val): val
      #                         for val in untrainableVars}
      # batchNormCol = tf.get_collection(BatchNormLayer.batchNormCollectionID)
      # # a bit clunky
      # restoredVarMap = {ema.average_name(variables_to_restore[key]):
      #                       variables_to_restore[key]
      #                   for key in batchNormCol
      #                   if key in variables_to_restore}
      # batchNormSaver = tf.train.Saver(restoredVarMap)
      # batchNormSaver.restore(sess, ckpt.model_checkpoint_path)

      # Vinh: adjusted after a new change in BatchNormLayer.
      batchNormVarCol = tf.get_collection(BatchNormLayer.batchNormCollectionID)
      restoredVarMap = {ema.average_name(var): var
                        for var in batchNormVarCol}
      batchNormSaver = tf.train.Saver(restoredVarMap)
      batchNormSaver.restore(sess, ckpt.model_checkpoint_path)
      # Vinh: from this point, batch norm's means and variances are restored
      # to their moving average values.
    else:
      print('No checkpoint file found')
      return

    # Start the queue runners.
    coord = tf.train.Coordinator()
    try:
      threads = []
      for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
        threads.extend(qr.create_threads(sess, coord=coord, daemon=True,
                                         start=True))

      # Vinh: the original TensorFlow way will underestimate the true
      # precision.
      num_iter = int(math.ceil(num_examples / batch_size))
      true_count = 0  # Counts the number of correct predictions.
      # If num_examples is not divisible by batch_size, then
      # total_sample_count will be greater than num_examples. If we assume,
      # in this case, that the filename_queue is read until the last file,
      # then the precision calculated here will be less than the correct
      # precision.
      total_sample_count = num_iter * batch_size
      step = 0
      while step < num_iter and not coord.should_stop():
        predictions = sess.run([top_k_op])
        true_count += np.sum(predictions)
        step += 1

      # Compute precision @ 1.
      precision = true_count / total_sample_count
      print('%s: Original precision @ 1 = %.3f' % (datetime.now(), precision))

      totalExamplesEvaluated = step * batch_size
      correctPrecision = true_count / totalExamplesEvaluated
      print('The correct precision over %d evaluated examples is %.3f' %
            (totalExamplesEvaluated, correctPrecision))

      summary = tf.Summary()
      summary.ParseFromString(sess.run(summary_op))
      summary.value.add(tag='Precision @ 1', simple_value=precision)
      summary_writer.add_summary(summary, global_step)
      print('Reached here with the total steps: %d' % step)
    except Exception as e:  # pylint: disable=broad-except
      coord.request_stop(e)

    coord.request_stop()
    coord.join(threads, stop_grace_period_secs=10)
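# A worked example of the underestimate described in eval_once() (an
# illustrative sketch, not part of the pipeline): with num_examples = 10000
# and the default batch_size = 128, num_iter = ceil(10000 / 128) = 79 and
# total_sample_count = 79 * 128 = 10112 > 10000. If the input queue stops
# early, dividing true_count by 10112 rather than by the examples actually
# evaluated (step * batch_size) biases the reported precision downward,
# which is why correctPrecision is computed separately above and why the
# eval config sets batch_size = 100, a divisor of 10000.
def _demo_precision_bias():
  import math
  num_examples, batch_size = 10000, 128
  num_iter = int(math.ceil(num_examples / float(batch_size)))  # 79
  total_sample_count = num_iter * batch_size                   # 10112
  assert total_sample_count > num_examples
  return num_iter, total_sample_count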
def evaluate():
  """Eval CIFAR-10 for a number of steps."""
  config = network_config.getConfig()
  test_or_train = config['eval_data']
  eval_dir = config['eval_dir']
  run_once = config['run_once']
  eval_interval_secs = config['eval_interval_secs']

  with tf.Graph().as_default():
    # Get images and labels for CIFAR-10.
    eval_data = test_or_train == 'test'
    images, labels = cifar10.inputs(eval_data=eval_data)

    # Build a Graph that computes the logits predictions from the
    # inference model.
    # logits = cifar10.inference(images)
    # 20 layer network:
    # logits = cifar10_model.buildResidualStyleNetwork(images,
    #                                                  is_train_phase=False)
    # 56 layers:
    # logits = cifar10_model.buildResidualStyleNetwork(images,
    #                                                  is_train_phase=False,
    #                                                  numStackedBlocks=9)
    # is_train_phase should be False, but I need to change BatchNormLayer
    # first, so use the poor man's batch norm for now:
    # logits = cifar10_model.buildNetworkWithVariableScope(
    #     images, is_train_phase=True,
    #     gateType=MywayFFLayer.HIGHWAY_GATE, numStackedBlocks=3)
    # logits = cifar10_model.inference(images)
    logits = cifar10_model.buildNetworkWithVariableScope(
        images,
        is_train_phase=False,
        gateType=MywayFFLayer.RESIDUAL_GATE,
        numStackedBlocks=3)
    # logits = cifar10_model.buildNetworkWithVariableScope(
    #     images, is_train_phase=False,
    #     gateType=MywayFFLayer.HIGHWAY_GATE, numStackedBlocks=9)

    # Calculate predictions.
    top_k_op = tf.nn.in_top_k(logits, labels, 1)

    # Restore the moving average version of the learned variables for eval.
    # Vinh: disabled for now.
    variable_averages = tf.train.ExponentialMovingAverage(
        cifar10.MOVING_AVERAGE_DECAY)
    # Vinh: will restore the weights' moving averages after I do a fair
    # comparison with the ResNet paper.
    # variables_to_restore = variable_averages.variables_to_restore()
    # saver = tf.train.Saver(variables_to_restore)
    saver = tf.train.Saver()

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    graph_def = tf.get_default_graph().as_graph_def()
    summary_writer = tf.train.SummaryWriter(eval_dir, graph_def=graph_def)

    while True:
      eval_once(saver, summary_writer, top_k_op, summary_op,
                variable_averages)
      if run_once:
        break
      time.sleep(eval_interval_secs)
def train():
  """Train CIFAR-10 for a number of steps."""
  # setConfig()  # Already set in main
  config = network_config.getConfig()
  train_dir = config['train_dir']
  max_steps = config['max_steps']
  log_device_placement = config['log_device_placement']
  batch_size = config['batch_size']

  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    # 20 layer network:
    # logits = cifar10_model.buildResidualStyleNetwork(images,
    #                                                  is_train_phase=True)
    # 56 layers:
    # logits = cifar10_model.buildResidualStyleNetwork(images,
    #                                                  is_train_phase=True,
    #                                                  numStackedBlocks=9)
    logits = cifar10_model.buildNetworkWithVariableScope(
        images,
        is_train_phase=True,
        gateType=MywayFFLayer.HIGHWAY_GATE,
        numStackedBlocks=9)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)
    print('Loss used logits from cifar10_model')

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    lr = tf.placeholder(tf.float32, [], "learning_rate")
    train_op = cifar10.train(loss, global_step, lr)

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=log_device_placement))
    sess.run(init)

    # Create a saver.
    # Vinh: this should save the moving averages of the batch mean and
    # variance. ====== Maybe not; I need to check it now. ======
    saver = tf.train.Saver(tf.all_variables())
    # saver = tf.train.Saver()  # What is the difference here?

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(train_dir,
                                            graph_def=sess.graph_def)

    # Vinh: change learning rates at steps 32k and 48k, terminating at
    # step 64k (counting from 1), as in the ResNet paper. Initialized once
    # before the loop so the reductions persist across steps.
    feed_dict = {lr: 0.1}  # Not reducing the learning rate for now
    for step in xrange(max_steps):
      start_time = time.time()
      if (step + 1) == 32000:
        feed_dict = {lr: 0.01}
      elif (step + 1) == 48000:
        feed_dict = {lr: 0.001}
      _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)
        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print(format_str % (datetime.now(), step, loss_value,
                            examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        # Vinh: if I monitor the learning rate in cifar10.py, I'd need to
        # pass the feed_dict above to the running of summary_op.
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == max_steps:
        checkpoint_path = os.path.join(train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
def train2(total_loss, global_step, optimizer):
  """Train CIFAR-10 model.

  Create an optimizer and apply to all trainable variables. Add moving
  average for all trainable variables.

  Args:
    total_loss: Total loss from loss().
    global_step: Integer Variable counting the number of training steps
      processed.
    optimizer: Optimizer to apply to the trainable variables.

  Returns:
    train_op: op for training.
  """
  config = network_config.getConfig()
  batch_size = config['batch_size']

  # Variables that affect learning rate.
  num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / batch_size
  decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

  # Decay the learning rate exponentially based on the number of steps.
  # lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
  #                                 global_step,
  #                                 decay_steps,
  #                                 LEARNING_RATE_DECAY_FACTOR,
  #                                 staircase=True)

  # Vinh: implement ResNet's simple learning rate reduction. Use mrry's
  # answer to change the learning rate:
  # http://stackoverflow.com/questions/33919948/how-to-set-adaptive-learning-rate-for-gradientdescentoptimizer
  # lr = INITIAL_LEARNING_RATE
  # lr = tf.placeholder(tf.float32, [], "learning_rate")
  # No need to monitor this now because I use ResNet's adaptive learning
  # rates.
  # tf.scalar_summary('learning_rate', lr)
  # momentum = 0.9

  # Generate moving averages of all losses and associated summaries.
  loss_averages_op = _add_loss_summaries(total_loss)

  # Compute gradients.
  with tf.control_dependencies([loss_averages_op]):
    # Vinh: as in the ResNet paper:
    # opt = tf.train.GradientDescentOptimizer(lr)
    # opt = tf.train.MomentumOptimizer(lr, momentum)
    # grads = opt.compute_gradients(total_loss)
    grads = optimizer.compute_gradients(total_loss)

  # Apply gradients.
  apply_gradient_op = optimizer.apply_gradients(grads, global_step=global_step)

  # Add histograms for trainable variables.
  for var in tf.trainable_variables():
    tf.histogram_summary(var.op.name, var)

  # Add histograms for gradients.
  for grad, var in grads:
    if grad is not None:
      tf.histogram_summary(var.op.name + '/gradients', grad)

  # Track the moving averages of all trainable variables.
  # Vinh: moved this to cifar10_train to see if that solves the problem of
  # not all variables having an ExponentialMovingAverage. Disabled for now;
  # I'm going to fix it later with a much smaller subset of CIFAR-10.
  # variable_averages = tf.train.ExponentialMovingAverage(
  #     MOVING_AVERAGE_DECAY, global_step)
  # variables_averages_op = variable_averages.apply(tf.trainable_variables())
  # with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
  with tf.control_dependencies([apply_gradient_op]):
    train_op = tf.no_op(name='train')

  return train_op
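# How train2() is meant to be driven (an assumed usage sketch; the helper
# name is hypothetical): the caller owns the optimizer and the learning-rate
# placeholder, which is what enables the ResNet-style step schedule fed in
# from the training loop.
def _train2_usage_sketch(total_loss, global_step):
  lr = tf.placeholder(tf.float32, [], "learning_rate")
  optimizer = tf.train.MomentumOptimizer(lr, 0.9)
  train_op = train2(total_loss, global_step, optimizer)
  # Feed {lr: 0.1}, then {lr: 0.01} / {lr: 0.001} at the schedule boundaries,
  # into sess.run([train_op, total_loss], feed_dict=...).
  return train_op, lr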
def inference(images):
  """Build the CIFAR-10 model.

  Args:
    images: Images returned from distorted_inputs() or inputs().

  Returns:
    Logits.
  """
  # We instantiate all variables using tf.get_variable() instead of
  # tf.Variable() in order to share variables across multiple GPU training
  # runs. If we only ran this model on a single GPU, we could simplify this
  # function by replacing all instances of tf.get_variable() with
  # tf.Variable().
  #
  # conv1
  with tf.variable_scope('conv1') as scope:
    kernel = _variable_with_weight_decay('weights', shape=[5, 5, 3, 64],
                                         stddev=1e-4, wd=0.0)
    conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME')
    biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
    bias = tf.nn.bias_add(conv, biases)
    conv1 = tf.nn.relu(bias, name=scope.name)
    _activation_summary(conv1)

  # pool1
  pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                         padding='SAME', name='pool1')
  # norm1
  norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                    name='norm1')

  # conv2
  with tf.variable_scope('conv2') as scope:
    kernel = _variable_with_weight_decay('weights', shape=[5, 5, 64, 64],
                                         stddev=1e-4, wd=0.0)
    conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME')
    biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
    bias = tf.nn.bias_add(conv, biases)
    conv2 = tf.nn.relu(bias, name=scope.name)
    _activation_summary(conv2)

  # norm2
  norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                    name='norm2')
  # pool2
  pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                         padding='SAME', name='pool2')

  config = network_config.getConfig()
  batch_size = config['batch_size']

  # local3
  with tf.variable_scope('local3') as scope:
    # Move everything into depth so we can perform a single matrix multiply.
    dim = 1
    for d in pool2.get_shape()[1:].as_list():
      dim *= d
    reshape = tf.reshape(pool2, [batch_size, dim])

    weights = _variable_with_weight_decay('weights', shape=[dim, 384],
                                          stddev=0.04, wd=0.004)
    biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1))
    local3 = tf.nn.relu_layer(reshape, weights, biases, name=scope.name)
    _activation_summary(local3)

  # local4
  with tf.variable_scope('local4') as scope:
    weights = _variable_with_weight_decay('weights', shape=[384, 192],
                                          stddev=0.04, wd=0.004)
    biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1))
    local4 = tf.nn.relu_layer(local3, weights, biases, name=scope.name)
    _activation_summary(local4)

  # softmax, i.e. softmax(WX + b)
  with tf.variable_scope('softmax_linear') as scope:
    weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES],
                                          stddev=1 / 192.0, wd=0.0)
    biases = _variable_on_cpu('biases', [NUM_CLASSES],
                              tf.constant_initializer(0.0))
    softmax_linear = tf.nn.xw_plus_b(local4, weights, biases, name=scope.name)
    _activation_summary(softmax_linear)

  return softmax_linear
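# The reshape in local3, worked through (an illustrative sketch that assumes
# the standard 24x24 distorted-input pipeline of the stock CIFAR-10 example;
# this repo may use a different crop size): two stride-2 max pools take a
# [batch, 24, 24, 3] input to a pool2 of shape [batch, 6, 6, 64], so
# dim = 6 * 6 * 64 = 2304 and the reshape is [batch_size, 2304] before the
# first fully connected layer.
def _demo_local3_dim():
  pool2_shape = [6, 6, 64]  # spatial dims after two stride-2 pools on 24x24
  dim = 1
  for d in pool2_shape:
    dim *= d
  assert dim == 2304
  return dim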