def testInceptionV2_TotalCost(self):
  conv_params = {
      'activation_fn': tf.nn.relu6,
      'weights_regularizer': contrib_layers.l2_regularizer(0.00004),
      'weights_initializer': tf.random_normal_initializer(stddev=0.03),
      'trainable': True,
      'biases_initializer': tf.constant_initializer(0.0),
      'normalizer_fn': contrib_layers.batch_norm,
      'normalizer_params': {
          'is_training': False,
          'decay': 0.9997,
          'scale': True,
          'epsilon': 0.001,
      }
  }

  tf.reset_default_graph()
  with slim.arg_scope([slim.layers.conv2d, slim.layers.separable_conv2d],
                      **conv_params):
    # Build model.
    image = tf.zeros([1, 224, 224, 3])
    net, _ = inception.inception_v2_base(image)
    logits = slim.layers.fully_connected(
        net,
        1001,
        activation_fn=None,
        scope='logits',
        weights_initializer=tf.random_normal_initializer(stddev=1e-3),
        biases_initializer=tf.constant_initializer(0.0))

    # Instantiate regularizers.
    flop_reg = flop_regularizer.GammaFlopsRegularizer(
        [logits.op], gamma_threshold=0.5)
    p100_reg = latency_regularizer.GammaLatencyRegularizer(
        [logits.op], gamma_threshold=0.5, hardware='P100')
    v100_reg = latency_regularizer.GammaLatencyRegularizer(
        [logits.op], gamma_threshold=0.5, hardware='V100')
    model_size_reg = model_size_regularizer.GammaModelSizeRegularizer(
        [logits.op], gamma_threshold=0.5)

  with self.cached_session():
    tf.global_variables_initializer().run()

    # Verify costs are expected.
    self.assertAllClose(3.86972e+09, flop_reg.get_cost())
    self.assertAllClose(517536.0, p100_reg.get_cost())
    self.assertAllClose(173330.453125, v100_reg.get_cost())
    self.assertAllClose(1.11684e+07, model_size_reg.get_cost())
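# Reading the assertions above (a hedged interpretation, not documentation
# from the library): the FLOP cost (~3.87e9) is the network's total FLOP
# count, i.e. twice the multiply-accumulate count, matching the
# "2 * NHWRSCK" convention used in testInceptionV2 below, and the
# model-size cost (~1.117e7) is consistent with InceptionV2's roughly 11M
# parameters. The two latency costs are in hardware-dependent time units
# derived from resource_function's peak-compute and memory-bandwidth tables.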
def build_model(self):
  # Our test model is:
  #
  #         -> conv1 --+     -> conv3 -->
  #        /           |    /
  #  image             [concat]
  #        \           |    \
  #         -> conv2 --+     -> conv4 -->
  #
  # (the model has two "outputs", conv3 and conv4).
  image = tf.constant(0.0, shape=[1, 17, 19, NUM_CHANNELS])
  conv1 = slim.layers.conv2d(
      image, 13, [7, 5], padding='SAME', scope='conv1')
  conv2 = slim.layers.conv2d(
      image, 23, [1, 1], padding='SAME', scope='conv2')
  concat = tf.concat([conv1, conv2], 3)
  self.conv3 = slim.layers.conv2d(
      concat, 29, [3, 3], stride=2, padding='SAME', scope='conv3')
  self.conv4 = slim.layers.conv2d(
      concat, 31, [1, 1], stride=1, padding='SAME', scope='conv4')
  self.name_to_var = {v.op.name: v for v in tf.global_variables()}

  self.regularizer = latency_regularizer.GammaLatencyRegularizer(
      [self.conv3.op, self.conv4.op],
      gamma_threshold=0.45,
      hardware=HARDWARE)
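# A minimal usage sketch, not a test from the original suite: it assumes
# only that build_model() has run on the default graph and that variables
# have been initialized.
def testLatencyCostIsPositive(self):
  self.build_model()
  with self.cached_session():
    tf.global_variables_initializer().run()
    # At initialization every channel is alive, so the estimated latency
    # for conv3/conv4 (and everything upstream) should be positive.
    self.assertGreater(self.regularizer.get_cost().eval(), 0.0)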
def testInceptionV2(self, hardware):
  image = tf.zeros([1, 224, 224, 3])
  net, _ = inception.inception_v2_base(image)
  g = tf.get_default_graph()
  self.regularizer = latency_regularizer.GammaLatencyRegularizer(
      [net.op], gamma_threshold=0.5, hardware=hardware)

  # Compute-bound convolution.
  op = g.get_operation_by_name(
      'InceptionV2/Mixed_3c/Branch_2/Conv2d_0c_3x3/Conv2D')
  # FLOP cost = 2 * NHWRSCK
  expected_cost = (2 * 28 * 28 * 3 * 3 * 96 * 96 /
                   resource_function.PEAK_COMPUTE[hardware])
  self.assertAllClose(expected_cost, self.get_cost([op]))

  # Memory-bound convolution.
  op = g.get_operation_by_name(
      'InceptionV2/Conv2d_1a_7x7/separable_conv2d')
  # Memory cost = input_tensor + weight_tensor + output_tensor
  #             = NHWC + RSCK + NHWK
  # Note that this is a pointwise convolution with kernel 1x1 (R = S = 1),
  # and each float32 element is 4 bytes.
  expected_cost = ((112 * 112 * 24 + 24 * 64 + 112 * 112 * 64) * 4 /
                   resource_function.MEMORY_BANDWIDTH[hardware])
  self.assertAllClose(expected_cost, self.get_cost([op]))
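# A minimal sketch of the roofline-style model the two assertions above
# imply; this helper is an illustration written for this note, not part of
# the library. Under this assumption an op's latency is whichever of its
# compute time or memory time dominates, which is why Conv2d_0c_3x3 is
# charged against PEAK_COMPUTE and the pointwise separable_conv2d against
# MEMORY_BANDWIDTH.
def _roofline_latency(flops, bytes_moved, hardware):
  compute_time = flops / resource_function.PEAK_COMPUTE[hardware]
  memory_time = bytes_moved / resource_function.MEMORY_BANDWIDTH[hardware]
  return max(compute_time, memory_time)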
def main(args):
  # Load MNIST data.
  train_data, test_data = tf.keras.datasets.mnist.load_data()
  X_train, y_train = train_data[0], train_data[1]
  X_test, y_test = test_data[0], test_data[1]

  global_step = tf.train.get_or_create_global_step()

  N, H, W = X_train.shape
  X_ph = tf.placeholder(tf.float32, [None, H, W, 1])
  y_ph = tf.placeholder(tf.int64, [None])

  # Define the model.
  logits, pred = mnist_model(X_ph, scope='base')
  loss_op = tf.losses.sparse_softmax_cross_entropy(labels=y_ph, logits=logits)
  acc_op = tf.reduce_mean(tf.cast(tf.equal(pred, y_ph), tf.float32))

  # Set up the regularizer and loss ops.
  if args.reg_type == "activation":
    network_regularizer = activation_regularizer.GammaActivationRegularizer(
        output_boundary=[logits.op],
        input_boundary=[X_ph.op, y_ph.op],
        gamma_threshold=args.gamma_threshold)
  elif args.reg_type == "flop":
    network_regularizer = flop_regularizer.GammaFlopsRegularizer(
        output_boundary=[logits.op],
        input_boundary=[X_ph.op, y_ph.op],
        gamma_threshold=args.gamma_threshold)
  elif args.reg_type == "latency":
    network_regularizer = latency_regularizer.GammaLatencyRegularizer(
        output_boundary=[logits.op],
        input_boundary=[X_ph.op, y_ph.op],
        hardware=args.hardware,
        gamma_threshold=args.gamma_threshold)
  else:
    raise ValueError('Unknown reg_type: {}'.format(args.reg_type))

  reg_loss_op = (network_regularizer.get_regularization_term()
                 * args.reg_penalty)
  cost_op = network_regularizer.get_cost()

  exporter = structure_exporter.StructureExporter(
      network_regularizer.op_regularizer_manager)

  optimizer = tf.train.AdamOptimizer(learning_rate=args.lr)
  train_op = optimizer.minimize(loss_op + reg_loss_op,
                                global_step=global_step)

  hooks = [
      tf.train.StopAtStepHook(last_step=args.steps + 1),
      tf.train.LoggingTensorHook(
          tensors={'step': global_step, 'loss': loss_op}, every_n_iter=10)
  ]

  # Let TensorFlow allocate GPU memory as needed instead of grabbing it all.
  config = tf.ConfigProto()
  config.gpu_options.allow_growth = True

  # Training loop.
  with tf.train.MonitoredTrainingSession(
      checkpoint_dir=args.outdir, hooks=hooks, config=config) as mon_sess:
    while not mon_sess.should_stop():
      idx = np.random.choice(N, args.batch_size, replace=False)
      x_t, y_t = np.expand_dims(X_train[idx], axis=-1), y_train[idx]
      train_dict = {X_ph: x_t, y_ph: y_t}

      val_idx = np.random.choice(X_test.shape[0], 5000, replace=False)
      x_v, y_v = np.expand_dims(X_test[val_idx], axis=-1), y_test[val_idx]
      val_dict = {X_ph: x_v, y_ph: y_v}

      global_step_val = mon_sess.run(global_step, feed_dict=train_dict)
      structure_exporter_tensors, v_loss, v_acc, reg_cost = mon_sess.run(
          [exporter.tensors, loss_op, acc_op, cost_op], feed_dict=val_dict)
      mon_sess.run(train_op, feed_dict=train_dict)

      print("Step: ", global_step_val)
      print("Validation Loss: ", v_loss)
      print("Validation Acc: ", v_acc)
      print("Reg Cost: ", reg_cost)

      # Export the current alive counts of the model structure to JSON.
      if global_step_val % 1000 == 0:
        exporter.populate_tensor_values(structure_exporter_tensors)
        exporter.create_file_and_save_alive_counts(
            args.outdir, global_step_val)
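# A hedged sketch of the flag parsing main() expects. The flag names mirror
# the args.* attributes used above; the defaults are illustrative assumptions,
# not values taken from the original example.
if __name__ == '__main__':
  import argparse

  parser = argparse.ArgumentParser()
  parser.add_argument('--reg_type', default='flop',
                      choices=['activation', 'flop', 'latency'])
  parser.add_argument('--hardware', default='V100')
  parser.add_argument('--gamma_threshold', type=float, default=1e-3)
  parser.add_argument('--reg_penalty', type=float, default=1e-5)
  parser.add_argument('--lr', type=float, default=1e-3)
  parser.add_argument('--batch_size', type=int, default=128)
  parser.add_argument('--steps', type=int, default=5000)
  parser.add_argument('--outdir', default='/tmp/morphnet_mnist')
  main(parser.parse_args())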