def model_fn(features, labels, mode):
    """Inception_Resnet_V2 model body.

    Supports single-host training with one or more GPUs. Parameters can be
    distributed in either of the following schemes:
      1. The CPU is the parameter server and manages gradient updates.
      2. Parameters are distributed evenly across all GPUs, and the first GPU
         manages gradient updates.

    Args:
        features: a list of tensors, one for each tower
        labels: a list of tensors, one for each tower
        mode: ModeKeys.TRAIN, EVAL, or PREDICT
    Returns:
        An EstimatorSpec object.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    tower_features = features
    tower_labels = labels
    tower_losses = []
    tower_gradvars = []
    tower_preds = []

    # channels_first (NCHW) is normally optimal on GPU and channels_last (NHWC)
    # on CPU. The exception is Intel MKL on CPU, which is optimal with
    # channels_last.
    data_format = None
    if not data_format:
        if GPU_COUNT == 0:
            data_format = 'channels_last'
        else:
            data_format = 'channels_first'

    if GPU_COUNT == 0:
        num_devices = 1
        device_type = 'cpu'
    else:
        num_devices = GPU_COUNT
        device_type = 'gpu'

    for i in range(num_devices):
        worker_device = '/{}:{}'.format(device_type, i)
        if VARIABLE_STRATEGY == 'CPU':
            device_setter = utils.local_device_setter(
                worker_device=worker_device)
        elif VARIABLE_STRATEGY == 'GPU':
            device_setter = utils.local_device_setter(
                ps_device_type='gpu',
                worker_device=worker_device,
                ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                    GPU_COUNT, tf.contrib.training.byte_size_load_fn))
        with tf.variable_scope('', reuse=bool(i != 0)):
            with tf.name_scope('tower_%d' % i) as name_scope:
                with tf.device(device_setter):
                    loss, gradvars, preds = tower_fn(
                        is_training, tower_features[i],
                        tower_labels and tower_labels[i], num_classes)
                    tower_losses.append(loss)
                    tower_gradvars.append(gradvars)
                    tower_preds.append(preds)
                    if i == 0:
                        # Only trigger batch_norm moving mean and variance
                        # updates from the first tower. Ideally we would grab
                        # the updates from all towers, but these stats
                        # accumulate extremely fast, so we can ignore the other
                        # towers' stats without significant detriment.
                        update_ops = tf.get_collection(
                            tf.GraphKeys.UPDATE_OPS, name_scope)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        # Now compute global loss and gradients.
        gradvars = []
        with tf.name_scope('gradient_averaging'):
            all_grads = {}
            for grad, var in itertools.chain(*tower_gradvars):
                if grad is not None:
                    all_grads.setdefault(var, []).append(grad)
            for var, grads in six.iteritems(all_grads):
                # Average gradients on the same device as the variables to
                # which they apply.
                with tf.device(var.device):
                    if len(grads) == 1:
                        avg_grad = grads[0]
                    else:
                        avg_grad = tf.multiply(
                            tf.add_n(grads), 1. / len(grads))
                gradvars.append((avg_grad, var))

        # Device that runs the ops to apply global gradient updates.
        consolidation_device = ('/gpu:0' if VARIABLE_STRATEGY == 'GPU'
                                else '/cpu:0')
        with tf.device(consolidation_device):
            loss = tf.reduce_mean(tower_losses, name='loss')
            examples_sec_hook = utils.ExamplesPerSecondHook(
                BATCH_SIZE, every_n_steps=10)
            global_step = tf.train.get_global_step()
            learning_rate = tf.constant(LEARNING_RATE)
            tensors_to_log = {'learning_rate': learning_rate, 'loss': loss}
            logging_hook = tf.train.LoggingTensorHook(
                tensors=tensors_to_log, every_n_iter=100)
            initializer_hook = utils.IteratorInitializerHook()
            train_hooks = [initializer_hook, logging_hook, examples_sec_hook]

            optimizer = tf.train.MomentumOptimizer(
                learning_rate=LEARNING_RATE, momentum=MOMENTUM)

            # Create a single grouped train op.
            train_op = [
                optimizer.apply_gradients(gradvars, global_step=global_step)
            ]
            train_op.extend(update_ops)
            train_op = tf.group(*train_op)

            predictions = {
                'classes':
                    tf.concat([p['classes'] for p in tower_preds], axis=0),
                'probabilities':
                    tf.concat([p['probabilities'] for p in tower_preds],
                              axis=0)
            }
            stacked_labels = tf.concat(labels, axis=0)
            metrics = {
                'accuracy':
                    tf.metrics.accuracy(stacked_labels, predictions['classes'])
            }

        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            loss=loss,
            train_op=train_op,
            training_hooks=train_hooks,
            eval_metric_ops=metrics)
    else:
        predictions = {
            'classes':
                tf.concat([p['classes'] for p in tower_preds], axis=0),
            'probabilities':
                tf.concat([p['probabilities'] for p in tower_preds], axis=0),
            'features':
                tf.concat([feature for feature in features], axis=0)
        }
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
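# Illustrative only: a minimal sketch of how a model_fn like the one above could
# be wrapped in a tf.estimator.Estimator. The model_dir default and the toy
# input_fn below are assumptions made for this example (the real input pipeline
# lives elsewhere); 299x299 RGB inputs simply match the usual
# Inception-Resnet-V2 input size.
def _example_build_estimator(model_dir='/tmp/inception_resnet_v2_example'):
    """Hypothetical usage sketch; not called anywhere in this module."""

    def _toy_input_fn():
        # One tower of random images/labels, shaped the way model_fn expects:
        # a list of per-tower tensors.
        images = tf.random_uniform([BATCH_SIZE, 299, 299, 3])
        labels = tf.random_uniform(
            [BATCH_SIZE], maxval=num_classes, dtype=tf.int32)
        return [images], [labels]

    estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir=model_dir)
    # estimator.train(input_fn=_toy_input_fn, steps=10)
    return estimator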
def model_signature(input_placeholder, mode, params):
    """Multi-GPU model body for the signature model.

    Splits the inputs held by `input_placeholder` across `params.num_gpus`
    towers, computes per-tower losses, distances and predictions, and
    consolidates them on the CPU.
    """
    features = input_placeholder.x
    labels = input_placeholder.y
    length = input_placeholder.length
    len_per_stroke = input_placeholder.len_per_stroke
    is_training = input_placeholder.is_training
    strokes_features = input_placeholder.strokes_features
    global_features = input_placeholder.global_features

    loss_function = _loss_2logits
    # loss_function = _loss_siamese

    losses_all_tower = []
    stroke_losses_all_tower = []
    distance_all_tower = []
    prediction_all_tower = []

    if params.stroke_base:
        features = tf.reshape(features,
                              [-1, params.length_per_signature,
                               params.length_per_stroke, params.features])
        len_stroke_tower = tf.split(len_per_stroke, params.num_gpus, axis=0)
    else:
        features = tf.reshape(
            features, [-1, params.max_sequence_length, params.features])
    features = tf.cast(features, tf.float32)

    input_all_tower = tf.split(features, params.num_gpus, axis=0)
    length_all_tower = tf.split(length, params.num_gpus, axis=0)
    strokes_features_tower = tf.split(
        strokes_features, params.num_gpus, axis=0)
    global_features_tower = tf.split(
        global_features, params.num_gpus, axis=0)

    labels_all_tower = [None] * params.num_gpus
    if labels is not None:
        labels = tf.cast(labels, tf.float32)
        labels = tf.reshape(labels, [-1])
        labels_all_tower = tf.split(labels, params.num_gpus, axis=0)

    for i in range(params.num_gpus):
        worker_device = '/{}:{}'.format('gpu', i)
        input_tower = input_all_tower[i]
        device_setter = utils.local_device_setter(
            ps_device_type='gpu',
            worker_device=worker_device,
            ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                params.num_gpus, tf.contrib.training.byte_size_load_fn))
        with tf.device(device_setter):
            len_stroke = (None if not params.stroke_base
                          else len_stroke_tower[i])
            loss, stroke_loss, distance, prediction = loss_function(
                input_tower, labels_all_tower[i], length_all_tower[i],
                len_stroke, strokes_features_tower[i],
                global_features_tower[i], params, is_training=is_training)
            if labels_all_tower[i] is not None:
                losses_all_tower.append(loss)
            if stroke_loss is not None:
                stroke_losses_all_tower.append(stroke_loss)
            distance_all_tower.append(distance)
            prediction_all_tower.append(prediction)

    consolidation_device = '/cpu:0'
    with tf.device(consolidation_device):
        distance = tf.concat(distance_all_tower, 0)
        distance = tf.reshape(distance, [-1, 1])
        prediction = tf.concat(prediction_all_tower, 0)
        prediction = tf.reshape(prediction, [-1, 1])

        if mode == tf.estimator.ModeKeys.PREDICT:
            predictions = {'distance': distance}
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=predictions)

        loss = tf.reduce_mean(losses_all_tower, 0)
        stroke_loss = tf.reduce_mean(stroke_losses_all_tower, 0)
        distance_norm = _normlize_distance(distance)

        labels = tf.reshape(labels, [-1, 1])
        accuracy_ops = tf.metrics.accuracy(labels, prediction)

        labels_2value = tf.where(
            tf.equal(labels, 2.0), tf.zeros_like(labels), labels)
        labels_2value = tf.reshape(labels_2value, [-1, 1])
        labels_reversal = tf.reshape(
            tf.subtract(tf.cast(1.0, tf.float32), labels_2value),
            [-1, 1])  # labels_ = !labels

        positive_distance = tf.reduce_mean(
            tf.multiply(labels_2value, distance))
        negative_distance = tf.reduce_mean(
            tf.multiply(labels_reversal, distance))

        loss_summary = tf.summary.scalar('loss', loss)
        stroke_loss_summary = tf.summary.scalar('stroke_loss', stroke_loss)
        pos_summary = tf.summary.scalar('positive_distance', positive_distance)
        neg_summary = tf.summary.scalar('negative_distance', negative_distance)

        metric_ops = tf.metrics.auc(
            labels_reversal, distance_norm, name='auc_all')
        auc_summary = tf.summary.scalar('auc', metric_ops[1])
        accuracy_summary = tf.summary.scalar('accuracy', accuracy_ops[1])
        sec_at_spe_metric = tf.metrics.sensitivity_at_specificity(
            labels_reversal, distance_norm, 0.90)

        merged_summary = tf.summary.merge(
            [loss_summary, stroke_loss_summary, pos_summary, neg_summary,
             auc_summary, accuracy_summary])

        if mode == tf.estimator.ModeKeys.EVAL:
            eval_metric_ops = {'evaluation_auc': metric_ops,
                               'accuracy': accuracy_ops,
                               'sec_at_spe': sec_at_spe_metric}
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, eval_metric_ops=eval_metric_ops)
        else:
            return loss, stroke_loss, distance, accuracy_ops[1], merged_summary
def _hg_model_fn(features, labels, mode, params):
    """HG model body.

    Supports single-host training with one or more GPUs. Parameters can be
    distributed in either of the following schemes:
      1. The CPU is the parameter server and manages gradient updates.
      2. Parameters are distributed evenly across all GPUs, and the first GPU
         manages gradient updates.

    Args:
        features: a list of tensors, one for each tower
        labels: a list of tensors, one for each tower
        mode: ModeKeys.TRAIN, EVAL, or PREDICT
        params: Hyperparameters suitable for tuning
    Returns:
        An EstimatorSpec object.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    weight_decay = params.weight_decay
    momentum = params.momentum
    decay_factor = params.decay_factor
    decay_step = params.decay_step
    init_learning_rate = params.init_learning_rate
    num_stacks = params.num_stacks
    num_joints = params.num_joints

    tower_features = features
    if mode == tf.estimator.ModeKeys.PREDICT:
        if num_gpus < 1:
            tower_labels = [None]
        else:
            tower_labels = [None for i in range(num_gpus)]
    else:
        tower_labels = labels

    tower_losses = []
    tower_gradvars = []
    tower_preds = []

    # channels_first (NCHW) is normally optimal on GPU and channels_last (NHWC)
    # on CPU. The exception is Intel MKL on CPU, which is optimal with
    # channels_last.
    data_format = params.data_format
    if not data_format:
        if num_gpus == 0:
            data_format = 'channels_last'
        else:
            data_format = 'channels_first'

    if num_gpus == 0:
        num_devices = 1
        device_type = 'cpu'
    else:
        num_devices = num_gpus
        device_type = 'gpu'

    for i in range(num_devices):
        worker_device = '/{}:{}'.format(device_type, i)
        if variable_strategy == 'CPU':
            device_setter = utils.local_device_setter(
                worker_device=worker_device)
        elif variable_strategy == 'GPU':
            device_setter = utils.local_device_setter(
                ps_device_type='gpu',
                worker_device=worker_device,
                ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                    num_gpus, tf.contrib.training.byte_size_load_fn))
        if mode == tf.estimator.ModeKeys.TRAIN:
            batch_size = params.train_batch_size / num_devices
        else:
            batch_size = params.eval_batch_size / num_devices
        with tf.variable_scope('hg', reuse=bool(i != 0)):
            with tf.name_scope('tower_%d' % i) as name_scope:
                with tf.device(device_setter):
                    loss, gradvars, preds = _tower_fn(
                        mode, weight_decay, tower_features[i][0],
                        tower_labels[i], data_format, params.batch_norm_decay,
                        params.batch_norm_epsilon, params.num_stacks,
                        params.num_out, params.n_low, params.num_joints,
                        batch_size, params.seq_length)
                    tower_losses.append(loss)
                    tower_gradvars.append(gradvars)
                    tower_preds.append(preds)
                    if i == 0:
                        # Only trigger batch_norm moving mean and variance
                        # updates from the first tower. Ideally we would grab
                        # the updates from all towers, but these stats
                        # accumulate extremely fast, so we can ignore the other
                        # towers' stats without significant detriment.
                        update_ops = tf.get_collection(
                            tf.GraphKeys.UPDATE_OPS, name_scope)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        # Now compute global loss and gradients.
        gradvars = []
        with tf.name_scope('gradient_averaging'):
            all_grads = {}
            for grad, var in itertools.chain(*tower_gradvars):
                if grad is not None:
                    all_grads.setdefault(var, []).append(grad)
            for var, grads in six.iteritems(all_grads):
                # Average gradients on the same device as the variables to
                # which they apply.
                with tf.device(var.device):
                    if len(grads) == 1:
                        avg_grad = grads[0]
                    else:
                        avg_grad = tf.multiply(
                            tf.add_n(grads), 1. / len(grads))
                gradvars.append((avg_grad, var))

        # Device that runs the ops to apply global gradient updates.
        consolidation_device = ('/gpu:0' if variable_strategy == 'GPU'
                                else '/cpu:0')
        with tf.device(consolidation_device):
            learning_rate = tf.train.exponential_decay(
                init_learning_rate,
                tf.train.get_global_step(),
                decay_step,
                decay_factor,
                staircase=True,
                name='learning_rate')

            loss = tf.reduce_mean(tower_losses, name='loss')
            examples_sec_hook = utils.ExamplesPerSecondHook(
                params.train_batch_size, every_n_steps=10)
            tensors_to_log = {'learning_rate': learning_rate, 'loss': loss}
            logging_hook = tf.train.LoggingTensorHook(
                tensors=tensors_to_log, every_n_iter=100)
            train_hooks = [logging_hook, examples_sec_hook]

            optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)

            if params.sync:
                optimizer = tf.train.SyncReplicasOptimizer(
                    optimizer, replicas_to_aggregate=num_workers)
                sync_replicas_hook = optimizer.make_session_run_hook(
                    params.is_chief)
                train_hooks.append(sync_replicas_hook)

            # Create a single grouped train op.
            train_op = [
                optimizer.apply_gradients(
                    gradvars, global_step=tf.train.get_global_step())
            ]
            train_op.extend(update_ops)
            train_op = tf.group(*train_op)

            predictions = {
                'heatmaps':
                    tf.concat([p['heatmaps'] for p in tower_preds], axis=0),
                'images':
                    tf.concat([i for i in tower_features], axis=0)
            }
            if mode == tf.estimator.ModeKeys.EVAL:
                hm = predictions['heatmaps']
                stacked_labels = tf.concat(labels[0][0][0], axis=0)
                gt_labels = tf.transpose(stacked_labels, [1, 0, 3, 4, 2])
                joint_accur = []
                for j in range(params.seq_length):
                    for i in range(params.num_joints):
                        joint_accur.append(
                            _pck_hm(hm[j, :, -1, :, :, i],
                                    gt_labels[j, :, :, :, i],
                                    params.eval_batch_size / num_devices))
                accuracy = tf.stack(joint_accur)
                metrics = {'Mean Pixel Error': tf.metrics.mean(accuracy)}
                tf.logging.info('Accuracy op computed')
            else:
                metrics = None
    else:
        train_op = None
        loss = None
        train_hooks = None
        metrics = None
        predictions = {
            'heatmaps':
                tf.concat([p['heatmaps'] for p in tower_preds], axis=0),
            'images':
                tf.concat([i for i in tower_features], axis=0)
        }

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train_op,
        training_hooks=train_hooks,
        eval_metric_ops=metrics)
def _model_fn(features, labels, mode, params):
    """Resnet model body.

    Supports single-host training with one or more GPUs. Parameters can be
    distributed in either of the following schemes:
      1. The CPU is the parameter server and manages gradient updates.
      2. Parameters are distributed evenly across all GPUs, and the first GPU
         manages gradient updates.

    Args:
        features: a list of tensors, one for each tower
        labels: a list of tensors, one for each tower
        mode: ModeKeys.TRAIN or EVAL
        params: Hyperparameters suitable for tuning
    Returns:
        An EstimatorSpec object.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    weight_decay = params.weight_decay
    momentum = params.momentum

    tower_features = features
    tower_labels = labels
    tower_losses = []
    tower_gradvars = []
    tower_preds = []

    # channels_first (NCHW) is normally optimal on GPU and channels_last (NHWC)
    # on CPU. The exception is Intel MKL on CPU, which is optimal with
    # channels_last.
    data_format = params.data_format
    if not data_format:
        if num_gpus == 0:
            data_format = 'channels_last'
        else:
            data_format = 'channels_first'

    if num_gpus == 0:
        num_devices = 1
        device_type = 'cpu'
    else:
        num_devices = num_gpus
        device_type = 'gpu'

    for i in range(num_devices):
        worker_device = '/{}:{}'.format(device_type, i)
        if variable_strategy == 'CPU':
            device_setter = utils.local_device_setter(
                worker_device=worker_device)
        elif variable_strategy == 'GPU':
            device_setter = utils.local_device_setter(
                ps_device_type='gpu',
                worker_device=worker_device,
                ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                    num_gpus, tf.contrib.training.byte_size_load_fn))
        with tf.variable_scope(params.model_name, reuse=bool(i != 0)):
            with tf.name_scope('tower_%d' % i) as name_scope:
                with tf.device(device_setter):
                    loss, gradvars, preds = _tower_fn(
                        is_training, params.dp_keep_prob, weight_decay,
                        tower_features[i], tower_labels[i], data_format,
                        params.num_layers, params.batch_norm_decay,
                        params.batch_norm_epsilon, params)
                    tower_losses.append(loss)
                    tower_gradvars.append(gradvars)
                    tower_preds.append(preds)
                    if i == 0:
                        # Only trigger batch_norm moving mean and variance
                        # updates from the first tower. Ideally we would grab
                        # the updates from all towers, but these stats
                        # accumulate extremely fast, so we can ignore the other
                        # towers' stats without significant detriment.
                        update_ops = tf.get_collection(
                            tf.GraphKeys.UPDATE_OPS, name_scope)

    # Now compute global loss and gradients.
    gradvars = []
    with tf.name_scope('gradient_averaging'):
        all_grads = {}
        for grad, var in itertools.chain(*tower_gradvars):
            if grad is not None:
                all_grads.setdefault(var, []).append(grad)
        for var, grads in six.iteritems(all_grads):
            # Average gradients on the same device as the variables to which
            # they apply.
            with tf.device(var.device):
                if len(grads) == 1:
                    avg_grad = grads[0]
                else:
                    avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
            gradvars.append((avg_grad, var))

    # Device that runs the ops to apply global gradient updates.
    consolidation_device = ('/gpu:0' if variable_strategy == 'GPU'
                            else '/cpu:0')
    with tf.device(consolidation_device):
        # Suggested learning rate scheduling from
        # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155
        num_batches_per_epoch = imagenet.ImageNetDataSet.num_examples_per_epoch(
            'train') // (params.train_batch_size * num_workers)
        boundaries = [
            num_batches_per_epoch * x
            for x in np.array([30, 60, 90], dtype=np.int64)
        ]
        staged_lr = [
            params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]
        ]
        learning_rate = tf.train.piecewise_constant(
            tf.train.get_global_step(), boundaries, staged_lr)

        loss = tf.reduce_mean(tower_losses, name='loss')
        examples_sec_hook = utils.ExamplesPerSecondHook(
            params.train_batch_size, every_n_steps=10)
        train_hooks = [examples_sec_hook]

        # optimizer = tf.train.MomentumOptimizer(
        #     learning_rate=learning_rate, momentum=momentum)
        optimizer = tf.train.AdamOptimizer()

        if params.sync:
            optimizer = tf.train.SyncReplicasOptimizer(
                optimizer, replicas_to_aggregate=num_workers)
            sync_replicas_hook = optimizer.make_session_run_hook(
                params.is_chief)
            train_hooks.append(sync_replicas_hook)

        # Create a single grouped train op.
        train_op = [
            optimizer.apply_gradients(
                gradvars, global_step=tf.train.get_global_step())
        ]
        train_op.extend(update_ops)
        train_op = tf.group(*train_op)

        predictions = {
            'classes':
                tf.concat([p['classes'] for p in tower_preds], axis=0),
            'probabilities':
                tf.concat([p['probabilities'] for p in tower_preds], axis=0)
        }
        stacked_labels = tf.concat(labels, axis=0)
        metrics = {
            'accuracy':
                tf.metrics.accuracy(stacked_labels, predictions['classes'])
        }
        tensors_to_log = {
            'learning_rate': learning_rate,
            'loss': loss,
            'acc': metrics['accuracy'][0]
        }
        logging_hook = tf.train.LoggingTensorHook(
            tensors=tensors_to_log, every_n_iter=100)
        train_hooks.append(logging_hook)

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train_op,
        training_hooks=train_hooks,
        eval_metric_ops=metrics)
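# A small, self-contained illustration (TF 1.x assumed) of the piecewise-constant
# schedule used in _model_fn above: the base learning rate is scaled by 1, 0.1,
# 0.01 and 0.002 once training passes the epoch-30/60/90 boundaries. The numbers
# below are hypothetical and only serve to show the mechanics.
def _example_piecewise_lr():
    base_lr = 0.1
    num_batches_per_epoch = 100  # hypothetical dataset/batch-size combination
    boundaries = [num_batches_per_epoch * x for x in [30, 60, 90]]
    staged_lr = [base_lr * x for x in [1, 0.1, 0.01, 0.002]]

    step = tf.placeholder(tf.int32, [], name='fake_global_step')
    lr = tf.train.piecewise_constant(step, boundaries, staged_lr)
    with tf.Session() as sess:
        for s in [0, 3500, 6500, 9500]:
            # Prints 0.1, 0.01, 0.001, 0.0002 for the steps above.
            print('step %d -> lr %g' % (s, sess.run(lr, feed_dict={step: s})))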
def model_fn_signature(features, labels, mode, params):
    """Model function for tf.estimator.

    Args:
        features: input batch of images
        labels: batch of binary labels (or None at prediction time)
        mode: can be one of tf.estimator.ModeKeys.{TRAIN, EVAL}
        params: contains hyperparameters of the model
            (e.g. `params.learning_rate`)
    Returns:
        model_spec: tf.estimator.EstimatorSpec object
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    loss_function = models[params.model]

    losses_all_tower = []
    distance_all_tower = []

    images_all_tower = tf.split(features, params.num_gpus, axis=0)
    labels_all_tower = None
    if labels is not None:
        labels = tf.reshape(labels, [-1])
        labels_all_tower = tf.split(labels, params.num_gpus, axis=0)

    for i in range(params.num_gpus):
        worker_device = '/{}:{}'.format('gpu', i)
        images_tower = images_all_tower[i]
        device_setter = utils.local_device_setter(
            ps_device_type='gpu',
            worker_device=worker_device,
            ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                params.num_gpus, tf.contrib.training.byte_size_load_fn))
        with tf.device(device_setter):
            if labels_all_tower is not None:
                loss, distance = loss_function(
                    images_tower, labels_all_tower[i], params, is_training)
                losses_all_tower.append(loss)
            else:
                _, distance = loss_function(
                    images_tower, None, params, is_training)
            distance_all_tower.append(distance)

    consolidation_device = '/cpu:0'
    with tf.device(consolidation_device):
        distance = tf.concat(distance_all_tower, 0)

        if mode == tf.estimator.ModeKeys.PREDICT:
            predictions = {'distance': distance}
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=predictions)

        loss = tf.reduce_mean(losses_all_tower, name='loss_mean')

        labels = tf.reshape(labels, [-1, 1])
        labels_reversal = tf.reshape(
            tf.subtract(1.0, labels), [-1, 1])  # labels_ = !labels
        positive_distance = tf.reduce_mean(tf.multiply(labels, distance))
        negative_distance = tf.reduce_mean(
            tf.multiply(labels_reversal, distance))

        tf.summary.scalar('loss', loss)
        tf.summary.scalar('positive_distance', positive_distance)
        tf.summary.scalar('negative_distance', negative_distance)

        distance_norm = _normlize_distance(distance)
        metric_ops = tf.metrics.auc(labels_reversal, distance_norm)
        tf.summary.scalar('auc', metric_ops[1])

        if mode == tf.estimator.ModeKeys.EVAL:
            sec_at_spe_metric = tf.metrics.sensitivity_at_specificity(
                labels_reversal, distance_norm, 0.90)
            eval_metric_ops = {'evaluation_auc': metric_ops,
                               'sec_at_spe': sec_at_spe_metric}
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, eval_metric_ops=eval_metric_ops)
        else:
            logging_hook = tf.train.LoggingTensorHook(
                {"positive_distance": positive_distance,
                 "negative_distance": negative_distance,
                 "auc": metric_ops[1]},
                every_n_iter=100)
            # optimizer = tf.train.RMSPropOptimizer(params.learning_rate)
            optimizer = tf.train.AdamOptimizer(params.learning_rate)
            global_step = tf.train.get_global_step()
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(
                    loss,
                    global_step=global_step,
                    colocate_gradients_with_ops=True)
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, train_op=train_op,
                training_hooks=[logging_hook])
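# A minimal, standalone illustration (TF 1.x assumed) of the streaming metrics
# used above: tf.metrics.auc and tf.metrics.sensitivity_at_specificity each
# return a (value, update_op) pair, and the update op must run before the value
# is meaningful. The labels/scores below are made up purely for demonstration.
def _example_streaming_metrics():
    labels = tf.constant([1.0, 0.0, 1.0, 0.0])
    scores = tf.constant([0.9, 0.2, 0.7, 0.4])
    auc_value, auc_update = tf.metrics.auc(labels, scores)
    sens_value, sens_update = tf.metrics.sensitivity_at_specificity(
        labels, scores, 0.90)
    with tf.Session() as sess:
        # Streaming metrics keep their state in local variables.
        sess.run(tf.local_variables_initializer())
        sess.run([auc_update, sens_update])
        print(sess.run([auc_value, sens_value]))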
def _linearregression_model_fn_sync(features, labels, mode, params):
    """Linear regression model body.

    Supports single-host training with one or more GPUs. Parameters can be
    distributed in either of the following schemes:
      1. The CPU is the parameter server and manages gradient updates.
      2. Parameters are distributed evenly across all GPUs, and the first GPU
         manages gradient updates.

    Args:
        features: a list of tensors, one for each tower
        labels: a list of tensors, one for each tower
        mode: ModeKeys.TRAIN or EVAL
        params: Hyperparameters suitable for tuning
    Returns:
        An EstimatorSpec object.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    weight_decay = params.weight_decay

    features = features[0:num_gpus]
    labels = labels[0:num_gpus]
    tower_features = features
    tower_labels = labels
    tower_losses = []
    tower_gradvars = []
    tower_preds = []

    if num_gpus == 0:
        num_devices = 1
        device_type = 'cpu'
    else:
        num_devices = num_gpus
        device_type = 'gpu'

    for i in range(num_devices):
        worker_device = '/{}:{}'.format(device_type, i)
        if variable_strategy == 'CPU':
            device_setter = utils.local_device_setter(
                worker_device=worker_device)
        elif variable_strategy == 'GPU':
            device_setter = utils.local_device_setter(
                ps_device_type='gpu',
                worker_device=worker_device,
                ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                    num_gpus, tf.contrib.training.byte_size_load_fn))
        with tf.variable_scope('LinearRegression',
                               reuse=bool(i != 0)) as var_scope:
            with tf.name_scope('tower_%d' % i) as name_scope:
                with tf.device(device_setter):
                    loss, gradvars, preds = _tower_fn(
                        is_training, weight_decay, tower_features[i],
                        tower_labels[i], params.feature_dim, var_scope.name,
                        params.problem)
                    tower_losses.append(loss)
                    tower_gradvars.append(gradvars)
                    tower_preds.append(preds)

    # Now compute global loss and gradients.
    gradvars = []
    with tf.name_scope('gradient_averaging'):
        all_grads = {}
        for grad, var in itertools.chain(*tower_gradvars):
            if grad is not None:
                all_grads.setdefault(var, []).append(grad)
        for var, grads in six.iteritems(all_grads):
            # Average gradients on the same device as the variables to which
            # they apply.
            with tf.device(var.device):
                if len(grads) == 1:
                    avg_grad = grads[0]
                else:
                    avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
            gradvars.append((avg_grad, var))

    # Device that runs the ops to apply global gradient updates.
    consolidation_device = ('/gpu:0' if variable_strategy == 'GPU'
                            else '/cpu:0')
    with tf.device(consolidation_device):
        loss = tf.reduce_mean(tower_losses, name='loss')
        examples_sec_hook = utils.ExamplesPerSecondHook(
            params.train_batch_size, every_n_steps=100)
        tensors_to_log = {'loss': loss}
        logging_hook = tf.train.LoggingTensorHook(
            tensors=tensors_to_log, every_n_iter=100)
        train_hooks = [logging_hook, examples_sec_hook]

        # optimizer = tf.train.GradientDescentOptimizer(
        #     learning_rate=params.learning_rate)
        optimizer = tf.train.AdamOptimizer(learning_rate=params.learning_rate)

        if params.run_type == 'sync':
            optimizer = tf.train.SyncReplicasOptimizer(
                optimizer, replicas_to_aggregate=num_workers)
            sync_replicas_hook = optimizer.make_session_run_hook(
                params.is_chief)
            train_hooks.append(sync_replicas_hook)

        # Create a single grouped train op.
        train_op = [
            optimizer.apply_gradients(
                gradvars, global_step=tf.train.get_global_step())
        ]
        train_op = tf.group(*train_op)

    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, train_op=train_op, training_hooks=train_hooks)
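# The tower-gradient averaging above is repeated verbatim in several model_fns
# in this module. A minimal refactoring sketch of that step (a suggestion, not
# used by the functions above), assuming TF 1.x graph mode:
def _average_tower_gradvars(tower_gradvars):
    """Average per-tower (grad, var) pairs into a single (avg_grad, var) list."""
    all_grads = {}
    for grad, var in itertools.chain(*tower_gradvars):
        if grad is not None:
            all_grads.setdefault(var, []).append(grad)
    averaged = []
    for var, grads in six.iteritems(all_grads):
        # Average each gradient on the same device as its variable.
        with tf.device(var.device):
            if len(grads) == 1:
                avg_grad = grads[0]
            else:
                avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
        averaged.append((avg_grad, var))
    return averaged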
def _linearregression_model_fn_local(features, labels, mode, params):
    """Linear regression model body with per-tower parameter copies.

    Each tower builds its variables in its own 'LinearRegression_{i}' scope and
    applies its own gradients; when params.run_type == 'multi', a SyncHook
    periodically synchronizes the copies.

    Args:
        features: a list of tensors, one for each tower
        labels: a list of tensors, one for each tower
        mode: ModeKeys.TRAIN or EVAL
        params: Hyperparameters suitable for tuning
    Returns:
        An EstimatorSpec object.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    weight_decay = params.weight_decay

    # features = features[0:num_gpus]
    # labels = labels[0:num_gpus]
    tower_features = features
    tower_labels = labels
    tower_losses = []
    tower_ops = []
    tower_preds = []
    var_scopes = []

    if num_gpus == 0:
        num_devices = 1
        device_type = 'cpu'
    else:
        num_devices = num_gpus
        device_type = 'gpu'

    for i in range(num_devices):
        worker_device = '/{}:{}'.format(device_type, i)
        if variable_strategy == 'CPU':
            device_setter = utils.local_device_setter(
                worker_device=worker_device)
            # device_setter = tf.train.replica_device_setter(
            #     worker_device=worker_device)
        elif variable_strategy == 'GPU':
            device_setter = utils.local_device_setter(
                ps_device_type='gpu',
                worker_device=worker_device,
                ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                    num_gpus, tf.contrib.training.byte_size_load_fn))
            # device_setter = tf.train.replica_device_setter(
            #     ps_device=worker_device,
            #     worker_device=worker_device)
        with tf.variable_scope('LinearRegression_{}'.format(i)) as var_scope:
            with tf.name_scope('tower_%d' % i) as name_scope:
                with tf.device(device_setter):
                    loss, gradvars, preds = _tower_fn(
                        is_training, weight_decay, tower_features[i],
                        tower_labels[i], params.feature_dim, var_scope.name,
                        params.problem)
                    var_scopes.append(var_scope.name)
                    tower_losses.append(loss)
                    # tower_gradvars.append(gradvars)
                    tower_preds.append(preds)

        global_step = tf.cast(tf.train.get_global_step(), tf.float32)
        lr = params.learning_rate
        # optimizer = tf.train.GradientDescentOptimizer(
        #     learning_rate=params.learning_rate)
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        # optimizer = tf.train.MomentumOptimizer(
        #     learning_rate=params.learning_rate, momentum=0.97)

        # One apply_gradients op per tower; they are grouped into a single
        # train op below.
        train_op = optimizer.apply_gradients(
            gradvars,
            global_step=tf.train.get_global_step(),
            name='apply_gradient_tower_{}'.format(i))
        tower_ops.append(train_op)

    # Device that runs the ops to apply global gradient updates.
    consolidation_device = ('/gpu:0' if variable_strategy == 'GPU'
                            else '/cpu:0')
    with tf.device(consolidation_device):
        examples_sec_hook = utils.ExamplesPerSecondHook(
            params.train_batch_size * (1 + params.redundancy),
            every_n_steps=100)
        loss = tf.reduce_mean(tower_losses, name='loss')
        tensors_to_log = {'loss': loss}
        logging_hook = tf.train.LoggingTensorHook(
            tensors=tensors_to_log, every_n_iter=100)
        train_hooks = [logging_hook, examples_sec_hook]

        if params.run_type == 'multi':
            if params.adaptive:
                alpha = 2 / (params.num_comm + 1) * (
                    params.train_steps / (params.num_comm * params.sync_step))
                local_updates = [
                    params.sync_step * (1 + alpha * i)
                    for i in range(params.num_comm + 1)
                ]
                sync_hook = utils.SyncHook(scopes=var_scopes,
                                           every_n_steps=params.sync_step,
                                           adaptive=local_updates)
            else:
                sync_hook = utils.SyncHook(scopes=var_scopes,
                                           every_n_steps=params.sync_step)
            train_hooks.append(sync_hook)

        train_ops = tf.group(*tower_ops)

    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, train_op=train_ops, training_hooks=train_hooks)