def projection(inputs, channels, is_training, data_format):
    """Applies a 1x1 conv + batch norm + ReLU projection (as in ResNet).

    Args:
      inputs: tensor handed to `base_ops.conv_bn_relu`.
      channels: number of output channels requested from the convolution.
      is_training: training flag forwarded to `base_ops.conv_bn_relu`.
      data_format: data layout string forwarded to `base_ops.conv_bn_relu`.

    Returns:
      The tensor produced by `base_ops.conv_bn_relu` with kernel size 1.
    """
    kernel_size = 1  # A 1x1 kernel only remaps the channel dimension.
    with tf.variable_scope('projection'):
        return base_ops.conv_bn_relu(
            inputs, kernel_size, channels, is_training, data_format)
def build_model(features, spec, config, mode=tf.estimator.ModeKeys.TRAIN):
    """Builds the network graph and returns the classification logits.

    The graph is: a 3x3 stem convolution, then `config['num_stacks']` stacks
    of `config['num_modules_per_stack']` modules each, a global average pool
    over the two spatial axes, and a final dense layer. Every stack except the
    first begins with a 2x2 stride-2 max-pool and doubles the channel count
    requested from its modules.

    Args:
      features: input tensor fed to the stem convolution. Assumed to be a
        rank-4 image batch laid out as `config['data_format']` describes
        (TODO confirm against callers).
      spec: module specification, forwarded unchanged to `build_module`.
      config: mapping that must provide 'data_format', 'stem_filter_size',
        'num_stacks', 'num_modules_per_stack' and 'num_labels'.
      mode: a `tf.estimator.ModeKeys` value. The `is_training` flag passed to
        the layers is True only for `ModeKeys.TRAIN`.

    Returns:
      The output tensor of the final dense layer, which has
      `config['num_labels']` units.

    Raises:
      ValueError: if `config['data_format']` is neither 'channels_last' nor
        'channels_first'.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # Resolve the layout once so the channel axis and the axes reduced by the
    # global average pool can never disagree.
    data_format = config['data_format']
    if data_format == 'channels_last':
        channel_axis = 3
        spatial_axes = [1, 2]
    elif data_format == 'channels_first':
        # Currently this is not well supported
        channel_axis = 1
        spatial_axes = [2, 3]
    else:
        raise ValueError('invalid data_format')

    # Initial stem convolution
    with tf.variable_scope('stem'):
        net = base_ops.conv_bn_relu(
            features, 3, config['stem_filter_size'], is_training, data_format)

    for stack_num in range(config['num_stacks']):
        channels = net.get_shape()[channel_axis].value

        # Downsample at start (except first)
        if stack_num > 0:
            net = tf.layers.max_pooling2d(
                inputs=net,
                pool_size=(2, 2),
                strides=(2, 2),
                padding='same',
                data_format=data_format)

            # Double output channels each time we downsample
            channels *= 2

        with tf.variable_scope('stack{}'.format(stack_num)):
            for module_num in range(config['num_modules_per_stack']):
                with tf.variable_scope('module{}'.format(module_num)):
                    net = build_module(
                        spec,
                        inputs=net,
                        channels=channels,
                        is_training=is_training)

    # Global average pool over the spatial axes
    net = tf.reduce_mean(net, spatial_axes)

    # Fully-connected layer to labels
    logits = tf.layers.dense(inputs=net, units=config['num_labels'])

    return logits
def nasbench_tensorflow_model_builder(model_spec, config, in_shape,
                                      is_training=True):
    """Builds a placeholder-fed network graph and returns its endpoints.

    Creates a float32 placeholder named "g_input" and stacks on top of it a
    3x3 stem convolution, `config["num_stacks"]` stacks of
    `config["num_modules_per_stack"]` modules, a global average pool over the
    spatial axes and a final dense layer. Every stack except the first begins
    with a 2x2 stride-2 max-pool and doubles the channel count requested from
    its modules.

    Args:
      model_spec: module specification, forwarded unchanged to
        `model_builder.build_module`.
      config: mapping that must provide "data_format", "stem_filter_size",
        "num_stacks", "num_modules_per_stack" and "num_labels".
      in_shape: shape given to the input placeholder. Assumed to be a rank-4
        NHWC shape (TODO confirm against callers).
      is_training: training flag forwarded to the stem and to every module.

    Returns:
      A `(features, logits)` tuple: the input placeholder and the output of
      the final dense layer with `config["num_labels"]` units.

    Raises:
      AssertionError: if `config["data_format"]` is not "channels_last".
    """
    data_format = config["data_format"]
    if data_format != "channels_last":
        # Raised explicitly instead of `assert False`: a bare assert is
        # stripped under `python -O`, which would leave `channel_axis` unbound
        # and surface later as an unrelated UnboundLocalError. The exception
        # type is kept so existing callers see the same failure.
        raise AssertionError(
            "unsupported data_format {!r}; only 'channels_last' is "
            "supported".format(data_format))
    channel_axis = 3

    # setup inputs
    features = tf.placeholder(tf.float32, shape=in_shape, name="g_input")

    # build the stem
    with tf.variable_scope("stem"):
        net = base_ops.conv_bn_relu(
            features, 3, config["stem_filter_size"], is_training, data_format)

    # Build stacks
    for stack_num in range(config["num_stacks"]):
        channels = net.get_shape()[channel_axis].value

        # Downsample at start (except first)
        if stack_num > 0:
            net = tf.layers.max_pooling2d(
                inputs=net,
                pool_size=(2, 2),
                strides=(2, 2),
                padding="same",
                data_format=data_format,
            )

            # Double output channels each time we downsample
            channels *= 2

        with tf.variable_scope("stack{}".format(stack_num)):
            for module_num in range(config["num_modules_per_stack"]):
                with tf.variable_scope("module{}".format(module_num)):
                    net = model_builder.build_module(
                        model_spec,
                        inputs=net,
                        channels=channels,
                        is_training=is_training,
                    )

    # Global average pool. The layout was checked to be channels_last above,
    # so the spatial axes are always 1 and 2.
    net = tf.reduce_mean(net, [1, 2])

    # Fully-connected layer to labels
    logits = tf.layers.dense(inputs=net, units=config["num_labels"])

    return features, logits
def model_fn(features, labels, mode, params):
    """Builds the model graph and returns a TPUEstimatorSpec for `mode`.

    This function reads several names it does not define itself and which are
    expected to come from the enclosing scope: `spec`, `config`,
    `channel_axis`, `num_train_images`, `build_module`, `_covariance_matrix`,
    `training_time`, `base_ops` and `np`.

    Args:
      features: input tensor fed to the stem convolution. The PREDICT branch
        reduces its gradient over axes [1, 2, 3], so it is expected to be
        rank 4.
      labels: label tensor. When `config['use_KD']` is set, column 0 is used
        as the integer class id and the remaining columns are softened with
        the temperature and used as the distillation target (presumably
        teacher logits - verify against the input pipeline). Otherwise the
        whole tensor is cast to int32 and one-hot encoded.
      mode: a `tf.estimator.ModeKeys` value.
      params: unused; deleted immediately.

    Returns:
      A `tf.contrib.tpu.TPUEstimatorSpec` for PREDICT, TRAIN or EVAL. For any
      other mode the function falls through and returns None.

    Raises:
      ValueError: if `config['data_format']` or `config['lr_decay_method']`
        has an unrecognised value.
    """
    del params  # Unused
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # Store auxiliary activations increasing in depth of network. First
    # activation occurs immediately after the stem and the others immediately
    # follow each stack.
    aux_activations = []

    # Initial stem convolution
    with tf.variable_scope('stem'):
        net = base_ops.conv_bn_relu(
            features, 3, config['stem_filter_size'], is_training,
            config['data_format'])
        aux_activations.append(net)

    for stack_num in range(config['num_stacks']):
        channels = net.get_shape()[channel_axis].value

        # Downsample at start (except first)
        if stack_num > 0:
            net = tf.layers.max_pooling2d(
                inputs=net,
                pool_size=(2, 2),
                strides=(2, 2),
                padding='same',
                data_format=config['data_format'])

            # Double output channels each time we downsample
            channels *= 2

        with tf.variable_scope('stack{}'.format(stack_num)):
            for module_num in range(config['num_modules_per_stack']):
                with tf.variable_scope('module{}'.format(module_num)):
                    net = build_module(
                        spec,
                        inputs=net,
                        channels=channels,
                        is_training=is_training)
            aux_activations.append(net)

    # Global average pool
    if config['data_format'] == 'channels_last':
        net = tf.reduce_mean(net, [1, 2])
    elif config['data_format'] == 'channels_first':
        net = tf.reduce_mean(net, [2, 3])
    else:
        raise ValueError('invalid data_format')

    # Fully-connected layer to labels
    logits = tf.layers.dense(inputs=net, units=config['num_labels'])

    if mode == tf.estimator.ModeKeys.PREDICT and not config['use_tpu']:
        # It is a known limitation of Estimator that the labels
        # are not passed during PREDICT mode when running on CPU/GPU
        # (https://github.com/tensorflow/tensorflow/issues/17824), thus we
        # cannot compute the loss or anything dependent on it (i.e., the
        # gradients).
        loss = tf.constant(0.0)
    else:
        if config['use_KD']:
            imitation_lmb = config['imitation_lmb']
            temperature = config['temperature']

            # tf.keras.losses.KLD(y_true, y_pred) expects *probabilities* for
            # both arguments (it clips them to [eps, 1] and takes the log
            # itself), with the target distribution first. Passing student
            # log-probabilities as y_true would clip every entry to eps and
            # give a constant loss with no gradient.
            soft_targets = tf.math.softmax(labels[:, 1:] / temperature)
            soft_predictions = tf.math.softmax(logits / temperature)
            loss_soft = tf.math.reduce_mean(
                tf.keras.losses.KLD(soft_targets, soft_predictions))
            # Rescale so the soft-loss gradients stay comparable in magnitude
            # to the hard-label loss as the temperature changes.
            loss_soft *= (temperature**2.0)

            loss_ce = tf.losses.softmax_cross_entropy(
                onehot_labels=tf.one_hot(
                    tf.dtypes.cast(labels[:, 0], tf.int32),
                    config['num_labels']),
                logits=logits)

            loss = ((1.0 - imitation_lmb) * loss_ce
                    + imitation_lmb * loss_soft)
        else:
            loss = tf.losses.softmax_cross_entropy(
                onehot_labels=tf.one_hot(
                    tf.dtypes.cast(labels, tf.int32), config['num_labels']),
                logits=logits)

        # L2 weight decay over every trainable variable.
        loss += config['weight_decay'] * tf.add_n(
            [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

    # Use inference mode to compute some useful metrics on a fixed sample
    # Due to the batch being sharded on TPU, these metrics should be run on
    # CPU only to ensure that the metrics are computed on the whole batch. We
    # add a leading dimension because PREDICT expects batch-shaped tensors.
    if mode == tf.estimator.ModeKeys.PREDICT:
        trainable = tf.trainable_variables()

        parameter_norms = {
            'param:' + var.name: tf.expand_dims(tf.norm(var, ord=2), 0)
            for var in trainable
        }

        # Compute gradients of all parameters and the input simultaneously.
        # The input is appended last so its gradient is grads[-1].
        grads = tf.gradients(loss, list(trainable) + [features])
        input_grad = grads[-1]

        param_gradient_norms = {}
        for var, grad in zip(trainable, grads[:-1]):
            # tf.gradients yields None for tensors the loss does not depend
            # on; report a zero norm for those.
            if grad is not None:
                grad_norm = tf.norm(grad, ord=2)
            else:
                grad_norm = tf.constant(0.0)
            param_gradient_norms['param_grad_norm:' + var.name] = (
                tf.expand_dims(grad_norm, 0))

        if input_grad is not None:
            input_grad_norm = tf.sqrt(
                tf.reduce_sum(tf.square(input_grad), axis=[1, 2, 3]))
        else:
            input_grad_norm = tf.expand_dims(tf.constant(0.0), 0)

        covariance_matrices = {
            'cov_matrix_%d' % i: tf.expand_dims(_covariance_matrix(aux), 0)
            for i, aux in enumerate(aux_activations)
        }

        predictions = {
            'logits': logits,
            'loss': tf.expand_dims(loss, 0),
            'input_grad_norm': input_grad_norm,
        }
        predictions.update(parameter_norms)
        predictions.update(param_gradient_norms)
        predictions.update(covariance_matrices)

        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, predictions=predictions)

    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_or_create_global_step()
        base_lr = config['learning_rate']
        if config['use_tpu']:
            base_lr *= config['tpu_num_shards']

        lr_decay_method = config['lr_decay_method']
        if lr_decay_method == 'COSINE_BY_STEP':
            total_steps = int(config['train_epochs'] * num_train_images /
                              config['batch_size'])
            progress_fraction = (
                tf.cast(global_step, tf.float32) / total_steps)
            learning_rate = (
                0.5 * base_lr * (1 + tf.cos(np.pi * progress_fraction)))
        elif lr_decay_method == 'COSINE_BY_TIME':
            # Requires training_time.limit hooks to be added to Estimator
            elapsed_time = tf.cast(
                training_time.get_total_time(), dtype=tf.float32)
            progress_fraction = elapsed_time / config['train_seconds']
            learning_rate = (
                0.5 * base_lr * (1 + tf.cos(np.pi * progress_fraction)))
        elif lr_decay_method == 'STEPWISE':
            # Drop the LR at 1/2, 2/3, and 5/6 of total epochs.
            # NOTE(review): the last multiplier is 0.0001, i.e. a 100x drop
            # rather than the 10x of the earlier steps. Left unchanged because
            # altering it would change training results - confirm intent.
            total_steps = (config['train_epochs'] * num_train_images /
                           config['batch_size'])
            boundaries = [
                int(0.5 * total_steps),
                int(0.667 * total_steps),
                int(0.833 * total_steps),
            ]
            values = [
                1.0 * base_lr,
                0.1 * base_lr,
                0.01 * base_lr,
                0.0001 * base_lr,
            ]
            learning_rate = tf.train.piecewise_constant(
                global_step, boundaries, values)
        else:
            raise ValueError('invalid lr_decay_method')

        # Set LR to 0 for step 0 to initialize the weights without training
        learning_rate = tf.where(
            tf.equal(global_step, 0), 0.0, learning_rate)

        # The 'optimizer' key is optional; RMSProp is the fallback.
        optimizer_name = config['optimizer'] if 'optimizer' in config else None
        if optimizer_name == 'Adam':
            # tf.train.AdamOptimizer accepts no `momentum` argument (its
            # first-moment decay is `beta1`); passing one raises TypeError
            # while the graph is being built, so only the LR is supplied.
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        elif optimizer_name == 'SGD':
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        elif optimizer_name == 'Momentum':
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=learning_rate,
                momentum=config['momentum'],
            )
        else:
            optimizer = tf.train.RMSPropOptimizer(
                learning_rate=learning_rate,
                momentum=config['momentum'],
                epsilon=1.0)

        if config['use_tpu']:
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

        # Update ops required for batch norm moving variables
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, train_op=train_op)

    elif mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Returns top-1 accuracy of `logits` against the hard labels."""
            predictions = tf.argmax(logits, axis=1)
            if config['use_KD']:
                # With distillation only column 0 carries the class id.
                accuracy = tf.metrics.accuracy(
                    tf.dtypes.cast(labels[:, 0], tf.int32), predictions)
            else:
                accuracy = tf.metrics.accuracy(labels, predictions)
            return {'accuracy': accuracy}

        eval_metrics = (metric_fn, [labels, logits])

        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, eval_metrics=eval_metrics)