Example #1
    def allreduce(self, grads):
        if self.hvd.size() == 1:
            return grads
        # copied from https://github.com/uber/horovod/blob/master/horovod/tensorflow/__init__.py
        # Apply gradient compression using GRACE: Top-K sparsification (keep
        # ratio 0.3) with residual memory, exchanged via allgather. The
        # communicator is built once, outside the loop, so its residual
        # memory state is shared across all gradients.
        import horovod.tensorflow as hvd
        from grace_dl.tensorflow.communicator.allgather import Allgather
        from grace_dl.tensorflow.compressor.topk import TopKCompressor
        from grace_dl.tensorflow.memory.residual import ResidualMemory

        world_size = hvd.size()
        grc = Allgather(TopKCompressor(0.3), ResidualMemory(), world_size)

        averaged_gradients = []
        with tf.name_scope("AllReduce"):
            for grad, var in grads:
                if grad is not None:
                    if self._compression is not None and self._has_compression:
                        avg_grad = self.hvd.allreduce(grad, grace=grc, average=self._average,
                                                      compression=self._compression)
                    else:
                        avg_grad = self.hvd.allreduce(grad, grace=grc, average=self._average)
                    averaged_gradients.append((avg_grad, var))
                else:
                    averaged_gradients.append((None, var))
        return averaged_gradients
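The allreduce hook above is what a Horovod-style optimizer wrapper calls between computing and applying gradients. A minimal sketch of that call path, assuming the wrapper stores the underlying optimizer as `self._optimizer` (a name introduced here for illustration, not from the original):

    # Hypothetical compute_gradients override on the wrapper that owns
    # the allreduce method above.
    def compute_gradients(self, loss, *args, **kwargs):
        grads_and_vars = self._optimizer.compute_gradients(loss, *args, **kwargs)
        # Compress, exchange, and average the (grad, var) pairs via GRACE.
        return self.allreduce(grads_and_vars)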
Example #2
def grace_from_params(params):
    comp = params.get('compressor', 'none')
    mem = params.get('memory', 'none')
    comm = params.get('communicator', 'allreduce')
    if comp == 'adapsparse':
        from grace_dl.tensorflow.compressor.adapsparse import AdapSparseCompressor
        compressor = AdapSparseCompressor()
    elif comp == 'adaq':
        from grace_dl.tensorflow.compressor.adaq import AdaqCompressor
        compressor = AdaqCompressor()
    elif comp == 'dgc':
        from grace_dl.tensorflow.compressor.dgc import DgcCompressor
        compressor = DgcCompressor()
    elif comp == 'efsignsgd':
        from grace_dl.tensorflow.compressor.efsignsgd import EFSignSGDCompressor
        compressor = EFSignSGDCompressor()
    elif comp == 'fp16':
        from grace_dl.tensorflow.compressor.fp16 import FP16Compressor
        compressor = FP16Compressor()
    elif comp == 'inceptionnc':
        from grace_dl.tensorflow.compressor.inceptionn import INCEPTIONNCompressor
        compressor = INCEPTIONNCompressor()
    elif comp == 'natural':
        from grace_dl.tensorflow.compressor.natural import NaturalCompressor
        compressor = NaturalCompressor()
    elif comp == 'none':
        from grace_dl.tensorflow.compressor.none import NoneCompressor
        compressor = NoneCompressor()
    elif comp == 'onebit':
        from grace_dl.tensorflow.compressor.onebit import OneBitCompressor
        compressor = OneBitCompressor()
    elif comp == 'powersgd':
        from grace_dl.tensorflow.compressor.powersgd import PowerSGDCompressor
        compressor = PowerSGDCompressor()
    elif comp == 'qsgd':
        from grace_dl.tensorflow.compressor.qsgd import QSGDCompressor
        compressor = QSGDCompressor()
    elif comp == 'randomk':
        from grace_dl.tensorflow.compressor.randomk import RandomKCompressor
        compressor = RandomKCompressor()
    elif comp == 'signsgd':
        from grace_dl.tensorflow.compressor.signsgd import SignSGDCompressor
        compressor = SignSGDCompressor()
    elif comp == 'signum':
        from grace_dl.tensorflow.compressor.signum import SignumCompressor
        compressor = SignumCompressor()
    elif comp == 'sketch':
        from grace_dl.tensorflow.compressor.sketch import SketchCompressor
        compressor = SketchCompressor()
    elif comp == 'terngrad':
        from grace_dl.tensorflow.compressor.terngrad import TernGradCompressor
        compressor = TernGradCompressor()
    elif comp == 'threshold':
        from grace_dl.tensorflow.compressor.threshold import ThresholdCompressor
        compressor = ThresholdCompressor()
    elif comp == 'topk':
        from grace_dl.tensorflow.compressor.topk import TopKCompressor
        compressor = TopKCompressor()
    elif comp == 'u8bit':
        from grace_dl.tensorflow.compressor.u8bit import U8bitCompressor
        compressor = U8bitCompressor()
    else:
        raise NotImplementedError(comp)

    if mem == 'dgc':
        from grace_dl.tensorflow.memory.dgc import DgcMemory
        memory = DgcMemory()
    elif mem == 'none':
        from grace_dl.tensorflow.memory.none import NoneMemory
        memory = NoneMemory()
    elif mem == 'powersgd':
        from grace_dl.tensorflow.memory.powersgd import PowerSGDMemory
        memory = PowerSGDMemory()
    elif mem == 'residual':
        from grace_dl.tensorflow.memory.residual import ResidualMemory
        memory = ResidualMemory()
    else:
        raise NotImplementedError(mem)

    if comm == 'allreduce':
        from grace_dl.tensorflow.communicator.allreduce import Allreduce
        return Allreduce(compressor, memory, params['world_size'])
    elif comm == 'allgather':
        from grace_dl.tensorflow.communicator.allgather import Allgather
        return Allgather(compressor, memory, params['world_size'])
    elif comm == 'broadcast':
        from grace_dl.tensorflow.communicator.broadcast import Broadcast
        return Broadcast(compressor, memory, params['world_size'])
    else:
        raise NotImplementedError(comm)
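A minimal sketch of calling this factory, assuming Horovod has been initialized; the keys mirror the dict lookups above, and 'world_size' is required by all three communicator constructors:

import horovod.tensorflow as hvd

hvd.init()
# Illustrative choices; FP16 compression is used here because it takes no
# constructor arguments in this variant.
grc = grace_from_params({
    'compressor': 'fp16',
    'memory': 'residual',
    'communicator': 'allgather',
    'world_size': hvd.size(),
})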
Example #3
data = tf.random.uniform([args.batch_size, 224, 224, 3])
target = tf.random.uniform([args.batch_size, 1],
                           minval=0,
                           maxval=999,
                           dtype=tf.int64)

loss = tf.losses.SparseCategoricalCrossentropy()

# Horovod: adjust learning rate based on number of GPUs.
opt = tf.optimizers.Adam(0.001 * hvd.size())

checkpoint_dir = './checkpoints'
checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)

# GRACE: compression algorithm
grc = Allgather(TopKCompressor(0.3), ResidualMemory(), hvd.size())


@tf.function
def benchmark_step(first_batch):
    # Horovod: (optional) compression algorithm.
    #compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: use DistributedGradientTape
    with tf.GradientTape() as tape:
        probs = model(data, training=True)
        loss = tf.losses.sparse_categorical_crossentropy(target, probs)

    # Horovod: wrap the tape so gradients are compressed with GRACE and
    # exchanged across workers.
    tape = hvd.DistributedGradientTape(tape, grace=grc)

    gradients = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(gradients, model.trainable_variables))

    # Horovod: broadcast initial variable states from rank 0 to all other
    # processes on the first batch, so workers start from consistent weights.
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)
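A hypothetical driver for the step above; passing first_batch=True on the first call triggers the one-time broadcast of initial states:

num_batches = 100  # illustrative count
for batch_idx in range(num_batches):
    benchmark_step(first_batch=(batch_idx == 0))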
Example #4
    def __call__(self, features, labels, mode, params):

        if "debug_verbosity" not in params.keys():
            raise RuntimeError("Parameter `debug_verbosity` is missing...")

        if mode == tf.estimator.ModeKeys.TRAIN:

            if "rmsprop_decay" not in params.keys():
                raise RuntimeError("Parameter `rmsprop_decay` is missing...")

            if "rmsprop_momentum" not in params.keys():
                raise RuntimeError(
                    "Parameter `rmsprop_momentum` is missing...")

            if "learning_rate" not in params.keys():
                raise RuntimeError("Parameter `learning_rate` is missing...")

            if "learning_rate_decay_steps" not in params.keys():
                raise RuntimeError("Parameter `learning_rate` is missing...")

            if "learning_rate_decay_factor" not in params.keys():
                raise RuntimeError("Parameter `learning_rate` is missing...")

            if "weight_decay" not in params.keys():
                raise RuntimeError("Parameter `weight_decay` is missing...")

            if "loss_fn_name" not in params.keys():
                raise RuntimeError("Parameter `loss_fn_name` is missing...")

        if mode == tf.estimator.ModeKeys.PREDICT:
            y_pred, y_pred_logits = self.build_model(
                features,
                training=False,
                reuse=False,
                debug_verbosity=params["debug_verbosity"])

            predictions = {'logits': y_pred}
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=predictions)

        input_image, mask_image = features

        with tf.device("/gpu:0"):

            tf.identity(input_image, name="input_image_ref")
            tf.identity(mask_image, name="mask_image_ref")
            tf.identity(labels, name="labels_ref")

            y_pred, y_pred_logits = self.build_model(
                input_image,
                training=mode == tf.estimator.ModeKeys.TRAIN,
                reuse=False,
                debug_verbosity=params["debug_verbosity"])

            all_trainable_vars = tf.reduce_sum(
                [tf.reduce_prod(v.shape) for v in tf.trainable_variables()])
            tf.identity(all_trainable_vars,
                        name='trainable_parameters_count_ref')

            if mode == tf.estimator.ModeKeys.EVAL:
                eval_metrics = dict()

            # ==================== Samples ==================== #

            image_uint8 = tf.cast((input_image + 1) * 127.5, dtype=tf.uint8)
            input_image_jpeg = tf.image.encode_jpeg(image_uint8[0],
                                                    format='grayscale',
                                                    quality=100)
            tf.identity(input_image_jpeg, name="input_image_jpeg_ref")

            for threshold in [
                    None, 0.05, 0.125, 0.25, 0.5, 0.75, 0.85, 0.95, 0.99
            ]:
                binarize_img, binarize_img_jpeg = image_processing.binarize_output(
                    y_pred[0], threshold=threshold)

                tf.identity(binarize_img_jpeg,
                            name="output_sample_ths_%s_ref" % threshold)
                tf.summary.image('output_sample_ths_%s' % threshold,
                                 binarize_img, 10)

            # ==================== Evaluation Metrics ==================== #

            with tf.name_scope("IoU_Metrics"):

                for threshold in [
                        0.05, 0.125, 0.25, 0.5, 0.75, 0.85, 0.95, 0.99
                ]:

                    iou_score = metrics.iou_score(y_pred=y_pred,
                                                  y_true=mask_image,
                                                  threshold=threshold)

                    tf.identity(iou_score,
                                name='iou_score_ths_%s_ref' % threshold)
                    tf.summary.scalar('iou_score_ths_%s' % threshold,
                                      iou_score)

                    if mode == tf.estimator.ModeKeys.EVAL:
                        eval_metrics["IoU_THS_%s" %
                                     threshold] = tf.metrics.mean(iou_score)

            labels = tf.cast(labels, tf.float32)
            labels_preds = tf.reduce_max(y_pred, axis=(1, 2, 3))

            # NOTE: `assert (cond, msg)` asserts a non-empty tuple and can
            # never fail; use a graph-mode check instead so the assertion
            # actually runs before clipping.
            with tf.control_dependencies([
                    tf.debugging.assert_near(
                        labels_preds,
                        tf.clip_by_value(labels_preds, 0, 1),
                        atol=1e-5,
                        message="Clipping labels_preds introduces non-trivial loss.")
            ]):
                labels_preds = tf.clip_by_value(labels_preds, 0, 1)

            with tf.variable_scope("Confusion_Matrix") as scope:

                tp, update_tp = tf.metrics.true_positives_at_thresholds(
                    labels=labels,
                    predictions=labels_preds,
                    thresholds=[
                        0.05, 0.125, 0.25, 0.5, 0.75, 0.85, 0.95, 0.99
                    ],
                )

                tn, update_tn = tf.metrics.true_negatives_at_thresholds(
                    labels=labels,
                    predictions=labels_preds,
                    thresholds=[
                        0.05, 0.125, 0.25, 0.5, 0.75, 0.85, 0.95, 0.99
                    ],
                )

                fp, update_fp = tf.metrics.false_positives_at_thresholds(
                    labels=labels,
                    predictions=labels_preds,
                    thresholds=[
                        0.05, 0.125, 0.25, 0.5, 0.75, 0.85, 0.95, 0.99
                    ],
                )

                fn, update_fn = tf.metrics.false_negatives_at_thresholds(
                    labels=labels,
                    predictions=labels_preds,
                    thresholds=[
                        0.05, 0.125, 0.25, 0.5, 0.75, 0.85, 0.95, 0.99
                    ],
                )

                if mode == tf.estimator.ModeKeys.TRAIN:
                    local_vars = tf.get_collection(
                        tf.GraphKeys.LOCAL_VARIABLES, scope=scope.name)
                    confusion_matrix_reset_op = tf.initializers.variables(
                        local_vars, name='reset_op')

                    with tf.control_dependencies([confusion_matrix_reset_op]):
                        with tf.control_dependencies(
                            [update_tp, update_tn, update_fp, update_fn]):
                            tp = tf.identity(tp)
                            tn = tf.identity(tn)
                            fp = tf.identity(fp)
                            fn = tf.identity(fn)

                else:
                    eval_metrics["Confusion_Matrix_TP"] = tp, update_tp
                    eval_metrics["Confusion_Matrix_TN"] = tn, update_tn
                    eval_metrics["Confusion_Matrix_FP"] = fp, update_fp
                    eval_metrics["Confusion_Matrix_FN"] = fn, update_fn

                tf.identity(tp, name='true_positives_ref'
                            )  # Confusion_Matrix/true_positives_ref:0
                tf.identity(tn, name='true_negatives_ref'
                            )  # Confusion_Matrix/true_negatives_ref:0
                tf.identity(fp, name='false_positives_ref'
                            )  # Confusion_Matrix/false_positives_ref:0
                tf.identity(fn, name='false_negatives_ref'
                            )  # Confusion_Matrix/false_negatives_ref:0

                tf.summary.scalar('true_positives', tp[3])  # For Ths = 0.5
                tf.summary.scalar('true_negatives', tn[3])  # For Ths = 0.5
                tf.summary.scalar('false_positives', fp[3])  # For Ths = 0.5
                tf.summary.scalar('false_negatives', fn[3])  # For Ths = 0.5

            binarized_mask, binarized_mask_jpeg = image_processing.binarize_output(
                mask_image[0], threshold=0.5)
            tf.identity(binarized_mask_jpeg, name="mask_sample_ref")
            tf.summary.image('sample_mask', binarized_mask, 10)

            ##########################

            mask_max_val = tf.reduce_max(mask_image)
            tf.identity(mask_max_val, name='mask_max_val_ref')

            mask_min_val = tf.reduce_min(mask_image)
            tf.identity(mask_min_val, name='mask_min_val_ref')

            mask_mean_val = tf.reduce_mean(mask_image)
            tf.identity(mask_mean_val, name='mask_mean_val_ref')

            mask_std_val = tf.math.reduce_std(mask_image)
            tf.identity(mask_std_val, name='mask_std_val_ref')

            ##########################

            output_max_val = tf.reduce_max(y_pred)
            tf.identity(output_max_val, name='output_max_val_ref')

            output_min_val = tf.reduce_min(y_pred)
            tf.identity(output_min_val, name='output_min_val_ref')

            output_mean_val = tf.reduce_mean(y_pred)
            tf.identity(output_mean_val, name='output_mean_val_ref')

            output_std_val = tf.math.reduce_std(y_pred)
            tf.identity(output_std_val, name='output_std_val_ref')

            with tf.variable_scope("losses"):

                # ==================== Reconstruction Loss ==================== #

                if params["loss_fn_name"] == "x-entropy":
                    reconstruction_loss = losses.reconstruction_x_entropy(
                        y_pred=y_pred, y_true=mask_image)

                elif params["loss_fn_name"] == "l2_loss":
                    reconstruction_loss = losses.reconstruction_l2loss(
                        y_pred=y_pred, y_true=mask_image)

                elif params["loss_fn_name"] == "dice_sorensen":
                    reconstruction_loss = 1 - losses.dice_coe(
                        y_pred=y_pred, y_true=mask_image, loss_type='sorensen')

                elif params["loss_fn_name"] == "dice_jaccard":
                    reconstruction_loss = 1 - losses.dice_coe(
                        y_pred=y_pred, y_true=mask_image, loss_type='jaccard')

                elif params["loss_fn_name"] == "adaptive_loss":
                    reconstruction_loss = losses.adaptive_loss(
                        y_pred=y_pred,
                        y_pred_logits=y_pred_logits,
                        y_true=mask_image,
                        switch_at_threshold=0.3,
                        loss_type='sorensen')

                else:
                    raise ValueError("Unknown loss function received: %s" %
                                     params["loss_fn_name"])

                tf.identity(reconstruction_loss,
                            name='reconstruction_loss_ref')
                tf.summary.scalar('reconstruction_loss', reconstruction_loss)

                if mode == tf.estimator.ModeKeys.TRAIN:

                    # ============== Regularization Loss ==================== #

                    l2_loss = losses.regularization_l2loss(
                        weight_decay=params["weight_decay"])

                    tf.identity(l2_loss, name='l2_loss_ref')
                    tf.summary.scalar('l2_loss', l2_loss)

                    total_loss = tf.add(reconstruction_loss,
                                        l2_loss,
                                        name="total_loss")

                else:
                    total_loss = reconstruction_loss

                tf.identity(total_loss, name='total_loss_ref')
                tf.summary.scalar('total_loss', total_loss)

            if mode == tf.estimator.ModeKeys.TRAIN:

                with tf.variable_scope("optimizers"):

                    # Update Global Step
                    global_step = tf.train.get_or_create_global_step()
                    tf.identity(global_step, name="global_step_ref")

                    learning_rate = tf.train.exponential_decay(
                        learning_rate=params["learning_rate"],
                        decay_steps=params["learning_rate_decay_steps"],
                        decay_rate=params["learning_rate_decay_factor"],
                        global_step=global_step,
                        staircase=True)

                    tf.identity(learning_rate, name="learning_rate_ref")
                    tf.summary.scalar('learning_rate_ref', learning_rate)

                    opt = tf.train.RMSPropOptimizer(
                        learning_rate=learning_rate,
                        use_locking=False,
                        centered=True,
                        decay=params["rmsprop_decay"],
                        momentum=params["rmsprop_momentum"],
                    )

                    if hvd_utils.is_using_hvd():
                        # Apply gradient compression using GRACE.
                        from grace_dl.tensorflow.communicator.allgather import Allgather
                        from grace_dl.tensorflow.compressor.topk import TopKCompressor
                        from grace_dl.tensorflow.memory.residual import ResidualMemory

                        world_size = hvd.size()
                        grc = Allgather(TopKCompressor(0.3), ResidualMemory(),
                                        world_size)
                        opt = hvd.DistributedOptimizer(opt,
                                                       grace=grc,
                                                       device_dense='/gpu:0')

                    if params["apply_manual_loss_scaling"]:

                        # if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
                        #     Logger.log("Applying manual Loss Scaling ...")

                        loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(
                            init_loss_scale=2**32,  # 4,294,967,296
                            incr_every_n_steps=1000)
                        opt = tf.contrib.mixed_precision.LossScaleOptimizer(
                            opt, loss_scale_manager)

                    deterministic = True
                    gate_gradients = (tf.train.Optimizer.GATE_OP
                                      if deterministic else
                                      tf.train.Optimizer.GATE_NONE)

                    backprop_op = opt.minimize(total_loss,
                                               gate_gradients=gate_gradients,
                                               global_step=global_step)

                    train_op = tf.group(
                        backprop_op,
                        tf.get_collection(tf.GraphKeys.UPDATE_OPS))

                    return tf.estimator.EstimatorSpec(
                        mode,
                        loss=total_loss,
                        train_op=train_op,
                    )

            elif mode == tf.estimator.ModeKeys.EVAL:

                return tf.estimator.EstimatorSpec(
                    mode,
                    loss=total_loss,
                    eval_metric_ops=eval_metrics,
                    predictions={"output": y_pred})

            else:
                raise NotImplementedError('Unknown mode {}'.format(mode))
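The guard clauses at the top of __call__ spell out the hyper-parameters this model_fn expects; a minimal illustrative params dict that would pass them (values are placeholders, not tuned):

params = {
    'debug_verbosity': 0,
    'rmsprop_decay': 0.9,
    'rmsprop_momentum': 0.8,
    'learning_rate': 1e-4,
    'learning_rate_decay_steps': 500,
    'learning_rate_decay_factor': 0.8,
    'weight_decay': 1e-5,
    # One of: 'x-entropy', 'l2_loss', 'dice_sorensen', 'dice_jaccard', 'adaptive_loss'.
    'loss_fn_name': 'adaptive_loss',
    'apply_manual_loss_scaling': False,
}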
Example #5
def ncf_model_ops(users,
                  items,
                  labels,
                  dup_mask,
                  params,
                  mode='TRAIN'):
    """
    Constructs the training and evaluation graphs
    """
    # Validation params
    val_batch_size = params['val_batch_size']
    K = params['top_k']
    # Training params
    learning_rate = params['learning_rate']
    beta_1 = params['beta_1']
    beta_2 = params['beta_2']
    epsilon = params['epsilon']
    # Model params
    fp16 = params['fp16']
    nb_users = params['num_users']
    nb_items = params['num_items']
    mf_dim = params['num_factors']
    mf_reg = params['mf_reg']
    mlp_layer_sizes = params['layer_sizes']
    mlp_layer_regs = params['layer_regs']
    dropout = params['dropout']
    sigmoid = False  # params['sigmoid']
    loss_scale = params['loss_scale']

    model_dtype = tf.float16 if fp16 else tf.float32

    # If manually enabling mixed precision, use the custom variable getter
    custom_getter = None if not fp16 else float32_variable_storage_getter
    # Allow soft device placement
    with tf.device(None), \
         tf.variable_scope('neumf', custom_getter=custom_getter):
        # Model graph
        logits = neural_mf(
            users,
            items,
            model_dtype,
            nb_users,
            nb_items,
            mf_dim,
            mf_reg,
            mlp_layer_sizes,
            mlp_layer_regs,
            dropout,
            sigmoid
        )
        logits = tf.squeeze(logits)

        if mode == 'INFERENCE':
            return logits

        # Evaluation Ops
        found_positive, dcg = compute_eval_metrics(logits, dup_mask, val_batch_size, K)
        # Metrics
        hit_rate = tf.metrics.mean(found_positive, name='hit_rate')
        ndcg = tf.metrics.mean(dcg, name='ndcg')

        eval_op = tf.group(hit_rate[1], ndcg[1])

        if mode == 'EVAL':
            return hit_rate[0], ndcg[0], eval_op, None

        # Labels
        labels = tf.reshape(labels, [-1, 1])
        logits = tf.reshape(logits, [-1, 1])

        # Use adaptive momentum optimizer
        optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate,
            beta1=beta_1, beta2=beta_2,
            epsilon=epsilon)

        loss = tf.losses.sigmoid_cross_entropy(
            labels,
            logits,
            reduction=tf.losses.Reduction.MEAN)

        # Apply loss scaling if manually enabling mixed precision
        if fp16:
            if loss_scale is None:
                loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(2**32, 1000)
            else:
                loss_scale_manager = tf.contrib.mixed_precision.FixedLossScaleManager(loss_scale)
            optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager)

        # Horovod wrapper for distributed training
        # Apply gradient compression using GRACE.
        from grace_dl.tensorflow.communicator.allgather import Allgather
        from grace_dl.tensorflow.compressor.topk import TopKCompressor
        from grace_dl.tensorflow.memory.residual import ResidualMemory

        world_size = hvd.size()
        grc = Allgather(TopKCompressor(0.3), ResidualMemory(), world_size)
        optimizer = hvd.DistributedOptimizer(optimizer, grace=grc)

        # Update ops
        global_step = tf.train.get_global_step()
        train_op = optimizer.minimize(loss, global_step=global_step)

        return hit_rate[0], ndcg[0], eval_op, train_op
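Hypothetical call sites for the three modes; the tensors and params are assumed to be defined elsewhere. TRAIN returns all four values, EVAL returns None for train_op, and INFERENCE returns only the logits:

hit_rate, ndcg, eval_op, train_op = ncf_model_ops(
    users, items, labels, dup_mask, params, mode='TRAIN')
logits = ncf_model_ops(users, items, None, None, params, mode='INFERENCE')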
Example #6
def grace_from_params(params):
    import horovod.tensorflow as hvd
    world_size = hvd.size()
    comp = params.get('compressor', 'none')
    mem = params.get('memory', 'none')
    comm = params.get('communicator', 'allreduce')
    if comp == 'adaq':
        from grace_dl.tensorflow.compressor.adaq import AdaqCompressor
        compressor = AdaqCompressor(compress_ratio=0.01)
    elif comp == 'dgc':
        from grace_dl.tensorflow.compressor.dgc import DgcCompressor
        compressor = DgcCompressor(compress_ratio=0.01)
    elif comp == 'efsignsgd':
        from grace_dl.tensorflow.compressor.efsignsgd import EFSignSGDCompressor
        compressor = EFSignSGDCompressor(lr=0.1)
    elif comp == 'fp16':
        from grace_dl.tensorflow.compressor.fp16 import FP16Compressor
        compressor = FP16Compressor()
    elif comp == 'inceptionn':
        from grace_dl.tensorflow.compressor.inceptionn import INCEPTIONNCompressor
        compressor = INCEPTIONNCompressor(error_bound=2e-10)
    elif comp == 'natural':
        from grace_dl.tensorflow.compressor.natural import NaturalCompressor
        compressor = NaturalCompressor()
    elif comp == 'none':
        from grace_dl.tensorflow.compressor.none import NoneCompressor
        compressor = NoneCompressor()
    elif comp == 'onebit':
        from grace_dl.tensorflow.compressor.onebit import OneBitCompressor
        compressor = OneBitCompressor()
    elif comp == 'powersgd':
        from grace_dl.tensorflow.compressor.powersgd import PowerSGDCompressor
        compressor = PowerSGDCompressor(momentum_factor=0.9,
                                        world_size=world_size)
    elif comp == 'qsgd':
        from grace_dl.tensorflow.compressor.qsgd import QSGDCompressor
        compressor = QSGDCompressor(quantum_num=64)
    elif comp == 'randomk':
        from grace_dl.tensorflow.compressor.randomk import RandomKCompressor
        compressor = RandomKCompressor(compress_ratio=0.01)
    elif comp == 'signsgd':
        from grace_dl.tensorflow.compressor.signsgd import SignSGDCompressor
        compressor = SignSGDCompressor()
    elif comp == 'signum':
        from grace_dl.tensorflow.compressor.signum import SignumCompressor
        compressor = SignumCompressor(momentum=0.9)
    elif comp == 'sketch':
        from grace_dl.tensorflow.compressor.sketch import SketchCompressor
        compressor = SketchCompressor(quantiles=64)
    elif comp == 'terngrad':
        from grace_dl.tensorflow.compressor.terngrad import TernGradCompressor
        compressor = TernGradCompressor()
    elif comp == 'threshold':
        from grace_dl.tensorflow.compressor.threshold import ThresholdCompressor
        compressor = ThresholdCompressor(threshold=0.01)
    elif comp == 'topk':
        from grace_dl.tensorflow.compressor.topk import TopKCompressor
        compressor = TopKCompressor(compress_ratio=0.01)
    elif comp == 'u8bit':
        from grace_dl.tensorflow.compressor.u8bit import U8bitCompressor
        compressor = U8bitCompressor()
    else:
        raise NotImplementedError(comp)

    if mem == 'dgc':
        from grace_dl.tensorflow.memory.dgc import DgcMemory
        memory = DgcMemory(momentum=0.9,
                           gradient_clipping=False,
                           world_size=world_size)
    elif mem == 'none':
        from grace_dl.tensorflow.memory.none import NoneMemory
        memory = NoneMemory()
    elif mem == 'powersgd':
        from grace_dl.tensorflow.memory.powersgd import PowerSGDMemory
        memory = PowerSGDMemory(
            q_memory=compressor.q_memory,
            compress_rank=1,
        )
    elif mem == 'residual':
        from grace_dl.tensorflow.memory.residual import ResidualMemory
        memory = ResidualMemory()
    elif mem == 'efsignsgd':
        from grace_dl.tensorflow.memory.efsignsgd import EFSignSGDMemory
        memory = EFSignSGDMemory(lr=0.1)
    else:
        raise NotImplementedError(mem)

    if comm == 'allreduce':
        from grace_dl.tensorflow.communicator.allreduce import Allreduce
        return Allreduce(compressor, memory, world_size)
    elif comm == 'allgather':
        from grace_dl.tensorflow.communicator.allgather import Allgather
        return Allgather(compressor, memory, world_size)
    elif comm == 'broadcast':
        from grace_dl.tensorflow.communicator.broadcast import Broadcast
        return Broadcast(compressor, memory, world_size)
    else:
        raise NotImplementedError(comm)
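Unlike Example #2, this variant wires concrete hyper-parameters into each component and derives world_size internally from hvd.size(), so no 'world_size' key is needed. Compressor and memory should be paired consistently; an illustrative Deep Gradient Compression setup:

grc = grace_from_params({
    'compressor': 'dgc',         # keep the top 1% of gradient entries
    'memory': 'dgc',             # momentum-corrected residual accumulation
    'communicator': 'allgather',
})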
Example #7
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.AdamOptimizer(0.001 * hvd.size())

    # GRACE: compression algorithm (Top-K with residual memory over allgather)
    from grace_dl.tensorflow.communicator.allgather import Allgather
    from grace_dl.tensorflow.compressor.topk import TopKCompressor
    from grace_dl.tensorflow.memory.residual import ResidualMemory

    grc = Allgather(TopKCompressor(0.3), ResidualMemory(), hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt, grace=grc)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),
        tf.train.LoggingTensorHook(tensors={
            'step': global_step,
            'loss': loss
        },
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train,
                                                     batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
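Horovod programs such as this one run one process per GPU and are typically started with the Horovod launcher, for example (the script name is illustrative):

horovodrun -np 4 python tensorflow_mnist.py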