def main(unused_argv):
  hvd.init()
  model_class = import_symbol(FLAGS.model_name, 'em_mask')
  model_args = json.loads(FLAGS.model_args)
  fov_size = tuple([int(i) for i in model_args['fov_size']])
  if 'label_size' in model_args:
    label_size = tuple([int(i) for i in model_args['label_size']])
  else:
    label_size = fov_size
  model_args['label_size'] = label_size
  num_classes = int(model_args['num_classes'])
  if num_classes == 1:
    # model_fn = model_utils.mask_model_fn_regression
    model_fn = model_utils.mask_model_fn_binary
  else:
    model_fn = model_utils.mask_model_fn_classfication

  params = {
      'model_class': model_class,
      'model_args': model_args,
      'batch_size': FLAGS.batch_size,
      'num_classes': num_classes,
      'learning_rate': FLAGS.learning_rate,
      'weighted': FLAGS.weighted
  }

  gpus = tf.config.experimental.list_physical_devices('GPU')
  for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
  if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
  sess_config = tf.compat.v1.ConfigProto()
  sess_config.gpu_options.allow_growth = True
  sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

  model_dir = FLAGS.train_dir if hvd.rank() == 0 else None
  save_summary_steps = 90 if hvd.rank() == 0 else None
  save_checkpoints_secs = 540 if hvd.rank() == 0 else None
  config = tf.estimator.RunConfig(
      model_dir=model_dir,
      save_summary_steps=save_summary_steps,
      save_checkpoints_secs=save_checkpoints_secs,
      session_config=sess_config,
      keep_checkpoint_max=10,
  )
  mask_estimator = tf.estimator.Estimator(
      model_fn=model_fn, config=config, params=params)
  bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

  if FLAGS.weights_volumes:
    input_fn = io_utils.train_input_fn_with_weight(
        FLAGS.data_volumes,
        FLAGS.label_volumes,
        FLAGS.weights_volumes,
        FLAGS.tf_coords,
        num_classes,
        fov_size,
        label_size,
        FLAGS.batch_size,
        FLAGS.image_mean,
        FLAGS.image_stddev,
        FLAGS.rotation)
  else:
    # input_fn = io_utils.train_input_fn(
    #     FLAGS.data_volumes,
    #     FLAGS.label_volumes,
    #     FLAGS.tf_coords,
    #     num_classes,
    #     fov_size,
    #     label_size,
    #     FLAGS.batch_size,
    #     FLAGS.image_mean,
    #     FLAGS.image_stddev,
    #     FLAGS.rotation)
    # input_fn = io_utils.train_input_rebalance_fn(
    input_fn = io_utils.train_input_mult_fn(
        FLAGS.data_volumes,
        FLAGS.label_volumes,
        FLAGS.tf_coords,
        num_classes,
        fov_size,
        label_size,
        FLAGS.batch_size,
        FLAGS.image_mean,
        FLAGS.image_stddev,
        FLAGS.rotation,
        False)

  mask_estimator.train(
      input_fn=input_fn, steps=FLAGS.max_steps, hooks=[bcast_hook])
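# Hedged sketch (not part of the original script): the minimal Horovod +
# tf.estimator recipe that main() above instantiates, isolated for clarity.
# `my_model_fn` and `my_input_fn` are hypothetical stand-ins for the model
# and input pipeline.
import horovod.tensorflow as hvd
import tensorflow.compat.v1 as tf

def run_distributed_training(my_model_fn, my_input_fn, train_dir, max_steps):
    hvd.init()
    # Pin each process to a single GPU.
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
    # Only rank 0 writes checkpoints/summaries so workers do not clobber them.
    config = tf.estimator.RunConfig(
        model_dir=train_dir if hvd.rank() == 0 else None,
        session_config=sess_config)
    estimator = tf.estimator.Estimator(model_fn=my_model_fn, config=config)
    # Broadcast rank-0 variables so all workers start from identical weights.
    estimator.train(input_fn=my_input_fn,
                    steps=max_steps,
                    hooks=[hvd.BroadcastGlobalVariablesHook(0)])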
# optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate_).minimize(loss)

# Horovod inclusion: scale the learning rate by the number of workers.
opt = tf.train.AdamOptimizer(learning_rate=learning_rate_ * hvd.size())

# Add Horovod Distributed Optimizer.
opt = hvd.DistributedOptimizer(opt)

prediction = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))

global_step = tf.train.get_or_create_global_step()

# Add hook to broadcast variables from rank 0 to all other processes during
# initialization.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]

# Make training operation.
train_op = opt.minimize(loss, global_step=global_step)

# Save checkpoints only on worker 0 to prevent other workers from
# corrupting them.
checkpoint_dir = '/tmp/train_logs' if hvd.rank() == 0 else None

# Initialize the variables.
init = tf.global_variables_initializer()

# Implementing a dynamic graph using the TensorFlow API.
stime = time()
runtime = 0
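# Hedged sketch: one way the graph fragment above is typically driven to
# completion. `loss`, `x`, `y`, `train_op`, `accuracy`, `hooks`, and
# `checkpoint_dir` are the names defined above; `batch_generator` is a
# hypothetical stand-in for whatever feeds the placeholders. The
# StopAtStepHook divides the step budget by the worker count, the usual
# Horovod adjustment.
hooks.append(tf.train.StopAtStepHook(last_step=10000 // hvd.size()))
with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                       hooks=hooks) as mon_sess:
    while not mon_sess.should_stop():
        batch_x, batch_y = next(batch_generator)
        _, acc = mon_sess.run([train_op, accuracy],
                              feed_dict={x: batch_x, y: batch_y})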
def train(self,
          iter_unit,
          num_iter,
          batch_size,
          warmup_steps=50,
          weight_decay=1e-4,
          lr_init=0.1,
          lr_warmup_epochs=5,
          momentum=0.9,
          log_every_n_steps=1,
          loss_scale=256,
          label_smoothing=0.0,
          mixup=0.0,
          use_cosine_lr=False,
          use_static_loss_scaling=False,
          is_benchmark=False):

    if iter_unit not in ["epoch", "batch"]:
        raise ValueError(
            '`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])' % iter_unit)

    if self.run_hparams.data_dir is None and not is_benchmark:
        raise ValueError('`data_dir` must be specified for training!')

    if self.run_hparams.use_tf_amp or self.run_hparams.dtype == tf.float16:
        if use_static_loss_scaling:
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "0"
        else:
            LOGGER.log("TF Loss Auto Scaling is activated")
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "1"
    else:
        use_static_loss_scaling = False  # Make sure it hasn't been set to True on FP32 training

    num_gpus = 1 if not hvd_utils.is_using_hvd() else hvd.size()
    global_batch_size = batch_size * num_gpus

    if self.run_hparams.data_dir is not None:
        filenames, num_samples, num_steps, num_epochs, num_decay_steps = runner_utils.parse_tfrecords_dataset(
            data_dir=self.run_hparams.data_dir,
            mode="train",
            iter_unit=iter_unit,
            num_iter=num_iter,
            global_batch_size=global_batch_size,
        )
        steps_per_epoch = num_steps / num_epochs
    else:
        num_epochs = 1
        num_steps = num_iter
        steps_per_epoch = num_steps
        num_decay_steps = num_steps
        num_samples = num_steps * batch_size

    if self.run_hparams.data_idx_dir is not None:
        idx_filenames = runner_utils.parse_dali_idx_dataset(
            data_idx_dir=self.run_hparams.data_idx_dir, mode="train")

    training_hooks = []

    if hvd.rank() == 0:
        LOGGER.log('Starting Model Training...')
        LOGGER.log("Training Epochs", num_epochs)
        LOGGER.log("Total Steps", num_steps)
        LOGGER.log("Steps per Epoch", steps_per_epoch)
        LOGGER.log("Decay Steps", num_decay_steps)
        LOGGER.log("Weight Decay Factor", weight_decay)
        LOGGER.log("Init Learning Rate", lr_init)
        LOGGER.log("Momentum", momentum)
        LOGGER.log("Num GPUs", num_gpus)
        LOGGER.log("Per-GPU Batch Size", batch_size)

        if is_benchmark:
            benchmark_logging_hook = hooks.BenchmarkLoggingHook(
                log_file_path=os.path.join(self.run_hparams.log_dir,
                                           "training_benchmark.json"),
                global_batch_size=global_batch_size,
                log_every=log_every_n_steps,
                warmup_steps=warmup_steps)
            training_hooks.append(benchmark_logging_hook)
        else:
            training_logging_hook = hooks.TrainingLoggingHook(
                log_file_path=os.path.join(self.run_hparams.log_dir,
                                           "training.json"),
                global_batch_size=global_batch_size,
                num_steps=num_steps,
                num_samples=num_samples,
                num_epochs=num_epochs,
                log_every=log_every_n_steps)
            training_hooks.append(training_logging_hook)

    if hvd_utils.is_using_hvd():
        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
        training_hooks.append(bcast_hook)

    training_hooks.append(hooks.PrefillStagingAreasHook())

    estimator_params = {
        'batch_size': batch_size,
        'steps_per_epoch': steps_per_epoch,
        'num_gpus': num_gpus,
        'momentum': momentum,
        'lr_init': lr_init,
        'lr_warmup_epochs': lr_warmup_epochs,
        'weight_decay': weight_decay,
        'loss_scale': loss_scale,
        'apply_loss_scaling': use_static_loss_scaling,
        'label_smoothing': label_smoothing,
        'mixup': mixup,
        'num_decay_steps': num_decay_steps,
        'use_cosine_lr': use_cosine_lr
    }

    image_classifier = self._get_estimator(
        mode='train',
        run_params=estimator_params,
        use_xla=self.run_hparams.use_xla,
        use_dali=self.run_hparams.use_dali,
        gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
        gpu_id=self.run_hparams.gpu_id)

    def training_data_fn():
        if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
            if hvd.rank() == 0:
                LOGGER.log("Using DALI input... ")
            return data_utils.get_dali_input_fn(
                filenames=filenames,
                idx_filenames=idx_filenames,
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                training=True,
                distort_color=self.run_hparams.distort_colors,
                num_threads=self.run_hparams.num_preprocessing_threads,
                deterministic=False if self.run_hparams.seed is None else True)
        elif self.run_hparams.data_dir is not None:
            return data_utils.get_tfrecords_input_fn(
                filenames=filenames,
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                training=True,
                distort_color=self.run_hparams.distort_colors,
                num_threads=self.run_hparams.num_preprocessing_threads,
                deterministic=False if self.run_hparams.seed is None else True)
        else:
            if hvd.rank() == 0:
                LOGGER.log("Using Synthetic Data ...")
            return data_utils.get_synth_input_fn(
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                num_channels=self.run_hparams.n_channels,
                data_format=self.run_hparams.input_format,
                num_classes=self.run_hparams.n_classes,
                dtype=self.run_hparams.dtype,
            )

    try:
        image_classifier.train(
            input_fn=training_data_fn,
            steps=num_steps,
            hooks=training_hooks,
        )
    except KeyboardInterrupt:
        print("Keyboard interrupt")

    if hvd.rank() == 0:
        LOGGER.log('Ending Model Training ...')
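# Hedged sketch: the epoch-to-step bookkeeping that parse_tfrecords_dataset()
# is assumed to perform for iter_unit="epoch" (the real implementation lives
# in runner_utils, which is not shown here).
def steps_from_epochs(num_samples, num_epochs, global_batch_size):
    # One epoch covers every sample once; round partial batches up.
    steps_per_epoch = (num_samples + global_batch_size - 1) // global_batch_size
    return num_epochs * steps_per_epoch

# e.g. an ImageNet-sized run: 1,281,167 samples, 90 epochs, 8 GPUs x 256/GPU
# gives a global batch of 2048, so
# steps_from_epochs(1281167, 90, 2048) == 56340 total training steps.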
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
  """Model definition entry.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class targets
      and box targets which are dense label maps. The labels are generated from
      get_input_fn function in data/dataloader.py.
    mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
    params: the dictionary defines hyperparameters of model. The default
      settings are in default_hparams function in this file.
    model: the model outputs class logits and box regression outputs.
    variable_filter_fn: the filter function that takes trainable_variables and
      returns the variable list after applying the filter rule.

  Returns:
    tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.

  Raises:
    RuntimeError: if both ckpt and backbone_ckpt are set.
  """
  # Convert params (dict) to Config for easier access.
  training_hooks = None
  if params['data_format'] == 'channels_first':
    features = tf.transpose(features, [0, 3, 1, 2])

  def _model_outputs(inputs):
    return model(inputs, config=hparams_config.Config(params))

  cls_outputs, box_outputs = utils.build_model_with_precision(
      params['precision'], _model_outputs, features)

  levels = cls_outputs.keys()
  for level in levels:
    cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
    box_outputs[level] = tf.cast(box_outputs[level], tf.float32)

  # First check if it is in PREDICT mode.
  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        'image': features,
    }
    for level in levels:
      predictions['cls_outputs_%d' % level] = cls_outputs[level]
      predictions['box_outputs_%d' % level] = box_outputs[level]
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

  # Set up training loss and learning rate.
  update_learning_rate_schedule_parameters(params)
  global_step = tf.train.get_or_create_global_step()
  learning_rate = learning_rate_schedule(params, global_step)

  # cls_loss and box_loss are for logging. only total_loss is optimized.
  det_loss, cls_loss, box_loss, box_iou_loss = detection_loss(
      cls_outputs, box_outputs, labels, params)
  reg_l2loss = reg_l2_loss(params['weight_decay'])
  total_loss = det_loss + reg_l2loss

  if mode == tf.estimator.ModeKeys.TRAIN:
    utils.scalar('lrn_rate', learning_rate)
    utils.scalar('trainloss/cls_loss', cls_loss)
    utils.scalar('trainloss/box_loss', box_loss)
    utils.scalar('trainloss/box_iou_loss', box_iou_loss)
    utils.scalar('trainloss/det_loss', det_loss)
    utils.scalar('trainloss/reg_l2_loss', reg_l2loss)
    utils.scalar('trainloss/loss', total_loss)

  moving_average_decay = params['moving_average_decay']
  if moving_average_decay:
    ema = tf.train.ExponentialMovingAverage(
        decay=moving_average_decay, num_updates=global_step)
    ema_vars = utils.get_ema_vars()

  if params['strategy'] == 'horovod':
    import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
    learning_rate = learning_rate * hvd.size()

  if mode == tf.estimator.ModeKeys.TRAIN:
    if params['optimizer'].lower() == 'sgd':
      optimizer = tf.train.MomentumOptimizer(
          learning_rate, momentum=params['momentum'])
    elif params['optimizer'].lower() == 'adam':
      optimizer = tf.train.AdamOptimizer(learning_rate)
    else:
      raise ValueError('optimizers should be adam or sgd')

    if params['strategy'] == 'tpu':
      optimizer = tf.tpu.CrossShardOptimizer(optimizer)
    elif params['strategy'] == 'horovod':
      optimizer = hvd.DistributedOptimizer(optimizer)
      training_hooks = [hvd.BroadcastGlobalVariablesHook(0)]

    # Batch norm requires update_ops to be added as a train_op dependency.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    var_list = tf.trainable_variables()
    if variable_filter_fn:
      var_list = variable_filter_fn(var_list)

    if params.get('clip_gradients_norm', 0) > 0:
      logging.info('clip gradients norm by %f', params['clip_gradients_norm'])
      grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
      with tf.name_scope('clip'):
        grads = [gv[0] for gv in grads_and_vars]
        tvars = [gv[1] for gv in grads_and_vars]
        clipped_grads, gnorm = tf.clip_by_global_norm(
            grads, params['clip_gradients_norm'])
        utils.scalar('gnorm', gnorm)
        grads_and_vars = list(zip(clipped_grads, tvars))
      with tf.control_dependencies(update_ops):
        train_op = optimizer.apply_gradients(grads_and_vars, global_step)
    else:
      with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(
            total_loss, global_step, var_list=var_list)

    if moving_average_decay:
      with tf.control_dependencies([train_op]):
        train_op = ema.apply(ema_vars)
  else:
    train_op = None

  eval_metrics = None
  if mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(**kwargs):
      """Returns a dictionary that has the evaluation metrics."""
      batch_size = params['batch_size']
      if params['strategy'] == 'tpu':
        batch_size = params['batch_size'] * params['num_shards']
      eval_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                     params['num_scales'],
                                     params['aspect_ratios'],
                                     params['anchor_scale'],
                                     params['image_size'])
      anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                             params['num_classes'])
      cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
      box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])

      if params.get('testdev_dir', None):
        logging.info('Eval testdev_dir %s', params['testdev_dir'])
        coco_metrics = coco_metric_fn(
            batch_size,
            anchor_labeler,
            params['val_json_file'],
            testdev_dir=params['testdev_dir'],
            disable_pyfun=params.get('disable_pyfun', None),
            **kwargs)
      else:
        logging.info('Eval val with groundtruths %s.', params['val_json_file'])
        coco_metrics = coco_metric_fn(batch_size, anchor_labeler,
                                      params['val_json_file'], **kwargs)

      # Add metrics to output.
      output_metrics = {
          'cls_loss': cls_loss,
          'box_loss': box_loss,
      }
      output_metrics.update(coco_metrics)
      return output_metrics

    cls_loss_repeat = tf.reshape(
        tf.tile(tf.expand_dims(cls_loss, 0), [params['batch_size'],]),
        [params['batch_size'], 1])
    box_loss_repeat = tf.reshape(
        tf.tile(tf.expand_dims(box_loss, 0), [params['batch_size'],]),
        [params['batch_size'], 1])
    metric_fn_inputs = {
        'cls_loss_repeat': cls_loss_repeat,
        'box_loss_repeat': box_loss_repeat,
        'source_ids': labels['source_ids'],
        'groundtruth_data': labels['groundtruth_data'],
        'image_scales': labels['image_scales'],
    }
    add_metric_fn_inputs(params, cls_outputs, box_outputs, metric_fn_inputs)
    eval_metrics = (metric_fn, metric_fn_inputs)

  checkpoint = params.get('ckpt') or params.get('backbone_ckpt')
  if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
    # Initialize the model from an EfficientDet or backbone checkpoint.
    if params.get('ckpt') and params.get('backbone_ckpt'):
      raise RuntimeError(
          '--backbone_ckpt and --checkpoint are mutually exclusive')
    if params.get('backbone_ckpt'):
      var_scope = params['backbone_name'] + '/'
      if params['ckpt_var_scope'] is None:
        # Use backbone name as default checkpoint scope.
        ckpt_scope = params['backbone_name'] + '/'
      else:
        ckpt_scope = params['ckpt_var_scope'] + '/'
    else:
      # Load every var in the given checkpoint.
      var_scope = ckpt_scope = '/'

    def scaffold_fn():
      """Loads pretrained model through scaffold function."""
      logging.info('restore variables from %s', checkpoint)
      var_map = utils.get_ckpt_var_map(
          ckpt_path=checkpoint,
          ckpt_scope=ckpt_scope,
          var_scope=var_scope,
          var_exclude_expr=params.get('var_exclude_expr', None))
      tf.train.init_from_checkpoint(checkpoint, var_map)
      return tf.train.Scaffold()
  elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

    def scaffold_fn():
      """Load moving average variables for eval."""
      logging.info('Load EMA vars with ema_decay=%f', moving_average_decay)
      restore_vars_dict = ema.variables_to_restore(ema_vars)
      saver = tf.train.Saver(restore_vars_dict)
      return tf.train.Scaffold(saver=saver)
  else:
    scaffold_fn = None

  return tf.estimator.tpu.TPUEstimatorSpec(
      mode=mode,
      loss=total_loss,
      train_op=train_op,
      eval_metrics=eval_metrics,
      host_call=utils.get_tpu_host_call(global_step, params),
      scaffold_fn=scaffold_fn,
      training_hooks=training_hooks)
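# Hedged sketch: how the EMA bookkeeping inside _model_fn fits together end
# to end, isolated from the estimator plumbing. During training, shadow
# variables track the weights; at eval time, a Saver built from
# variables_to_restore() loads the shadow values instead of the raw weights.
# `train_op` and the decay value are assumptions standing in for the real
# graph state.
ema = tf.train.ExponentialMovingAverage(decay=0.9998)
ema_vars = tf.trainable_variables()
with tf.control_dependencies([train_op]):
    train_op = ema.apply(ema_vars)  # update shadow copies after each step
# At eval time, restore the shadow (averaged) values:
saver = tf.train.Saver(ema.variables_to_restore(ema_vars))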
def main(_):
    tf.get_logger().setLevel(logging.ERROR)
    hvd.init()

    FLAGS = PARSER.parse_args()

    backends = [StdOutBackend(Verbosity.DEFAULT)]
    if FLAGS.log_dir:
        backends += [JSONStreamBackend(Verbosity.DEFAULT, FLAGS.log_dir)]
    DLLogger.init(backends=backends)

    os.environ['CUDA_CACHE_DISABLE'] = '0'
    os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
    os.environ['TF_ADJUST_HUE_FUSED'] = '1'
    os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
    os.environ['TF_DISABLE_NVTX_RANGES'] = '1'

    if hvd.rank() == 0:
        DLLogger.log(step=tuple(),
                     data={"mixed_precision": "ENABLED" if FLAGS.use_amp else "DISABLED"})

    dataset = MSDDataset(json_path=os.path.join(FLAGS.data_dir, 'dataset.json'),
                         dst_size=FLAGS.input_shape,
                         seed=FLAGS.seed,
                         interpolator=FLAGS.resize_interpolator,
                         data_normalization=FLAGS.data_normalization,
                         batch_size=FLAGS.batch_size,
                         train_split=FLAGS.train_split,
                         split_seed=FLAGS.split_seed)

    FLAGS.labels = dataset.labels

    gpu_options = tf.GPUOptions()
    config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    run_config = tf.estimator.RunConfig(
        save_summary_steps=None,
        save_checkpoints_steps=dataset.train_steps * FLAGS.train_epochs,
        save_checkpoints_secs=None,
        tf_random_seed=None,
        session_config=config,
        keep_checkpoint_max=1)

    estimator = tf.estimator.Estimator(
        model_fn=vnet_v2,
        model_dir=FLAGS.model_dir if hvd.rank() == 0 else None,
        config=run_config,
        params=FLAGS)

    train_hooks = [hvd.BroadcastGlobalVariablesHook(0)]

    if 'train' in FLAGS.exec_mode:
        steps = dataset.train_steps * FLAGS.train_epochs

        if FLAGS.benchmark:
            steps = FLAGS.warmup_steps * 2
            if hvd.rank() == 0:
                train_hooks += [ProfilingHook(FLAGS.warmup_steps,
                                              FLAGS.batch_size * hvd.size(),
                                              DLLogger)]
        else:
            if hvd.rank() == 0:
                train_hooks += [TrainHook(FLAGS.log_every, DLLogger)]

        estimator.train(
            input_fn=lambda: dataset.train_fn(FLAGS.augment),
            steps=steps,
            hooks=train_hooks)

    if 'evaluate' in FLAGS.exec_mode:
        if hvd.rank() == 0:
            if FLAGS.train_split >= 1.0:
                raise ValueError("Missing argument: --train_split < 1.0")

            result = estimator.evaluate(
                input_fn=dataset.eval_fn,
                steps=dataset.eval_steps,
                hooks=[])

            DLLogger.log(step=tuple(), data={'background_dice': result['background dice']})
            DLLogger.log(step=tuple(), data={'anterior_dice': result['Anterior dice']})
            DLLogger.log(step=tuple(), data={'posterior_dice': result['Posterior dice']})

    if 'predict' in FLAGS.exec_mode:
        count = 1
        hooks = []

        if hvd.rank() == 0:
            if FLAGS.benchmark:
                count = math.ceil((FLAGS.warmup_steps * 2) / dataset.test_steps)
                hooks += [ProfilingHook(FLAGS.warmup_steps,
                                        FLAGS.batch_size * hvd.size(),
                                        DLLogger,
                                        training=False)]

            predictions = estimator.predict(
                input_fn=lambda: dataset.test_fn(count=count), hooks=hooks)
            pred = [p['prediction'] for p in predictions]

            predict_path = os.path.join(FLAGS.model_dir, 'predictions')
            if os.path.exists(predict_path):
                shutil.rmtree(predict_path)
            os.makedirs(predict_path)

            pickle.dump(pred, open(os.path.join(predict_path, 'predictions.pkl'), 'wb'))
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if FLAGS.amp:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"

    if FLAGS.horovod:
        import horovod.tensorflow as hvd
        hvd.init()

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info("  %s" % input_file)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    config = tf.ConfigProto()
    if FLAGS.horovod:
        config.gpu_options.visible_device_list = str(hvd.local_rank())
    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoint_steps
        if not FLAGS.horovod or hvd.rank() == 0 else None,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host),
        # This variable controls how often estimator reports examples/sec.
        # Default value is every 100 steps.
        # When --report_loss is True, we set to very large value to prevent
        # default info reporting from estimator.
        # Ideally we should set it to None, but that does not work.
        log_step_count_steps=10000 if FLAGS.report_loss else 100)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=FLAGS.num_train_steps,
        num_warmup_steps=FLAGS.num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
        hvd=None if not FLAGS.horovod else hvd)

    training_hooks = []
    if FLAGS.horovod and hvd.size() > 1:
        training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
    if FLAGS.report_loss:
        global_batch_size = (FLAGS.train_batch_size if not FLAGS.horovod
                             else FLAGS.train_batch_size * hvd.size())
        training_hooks.append(
            _LogSessionRunHook(global_batch_size, 1,
                               -1 if not FLAGS.horovod else hvd.rank()))

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)

    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        train_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True,
            hvd=None if not FLAGS.horovod else hvd)
        estimator.train(input_fn=train_input_fn,
                        hooks=training_hooks,
                        max_steps=FLAGS.num_train_steps)

    if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0):
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        eval_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False,
            hvd=None if not FLAGS.horovod else hvd)
        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=FLAGS.max_eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
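# Hedged sketch: what passing `hvd` into model_fn_builder() typically enables
# inside the resulting model_fn. The builder itself is defined elsewhere and
# not shown, so this is an illustrative stand-in (the real BERT code uses a
# custom weight-decay optimizer rather than plain Adam).
def create_optimizer_sketch(loss, learning_rate, hvd=None):
    if hvd is not None:
        learning_rate = learning_rate * hvd.size()   # linear LR scaling
    optimizer = tf.train.AdamOptimizer(learning_rate)
    if hvd is not None:
        # Allreduce gradients across workers before applying them.
        optimizer = hvd.DistributedOptimizer(optimizer)
    return optimizer.minimize(
        loss, global_step=tf.train.get_or_create_global_step())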
def main(_):
    hvd.init()
    FLAGS.output_dir = FLAGS.output_dir if hvd.rank() == 0 else os.path.join(
        FLAGS.output_dir, str(hvd.rank()))
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
        "cla": ClaProcessor,
        "pair": PairProcessor
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host),
        log_step_count_steps=25,
        session_config=config)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
        # Shard the step budget across Horovod workers.
        num_train_steps = num_train_steps // hvd.size()
        num_warmup_steps = num_warmup_steps // hvd.size()

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        hooks = [hvd.BroadcastGlobalVariablesHook(0)]
        estimator.train(input_fn=train_input_fn,
                        max_steps=num_train_steps,
                        hooks=hooks)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the
            # number of examples must be a multiple of the batch size, or else
            # examples will get dropped. So we pad with fake examples which are
            # ignored later on. These do NOT count towards the metric (all
            # tf.metrics support a per-instance weight, and these get a weight
            # of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        # Evaluate all checkpoints; you can use the checkpoint with the best
        # dev accuracy.
        steps_and_files = []
        filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
        for filename in filenames:
            if filename.endswith(".index"):
                ckpt_name = filename[:-6]
                cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
                global_step = int(cur_filename.split("-")[-1])
                tf.logging.info("Add {} to eval list.".format(cur_filename))
                steps_and_files.append([global_step, cur_filename])
        steps_and_files = sorted(steps_and_files, key=lambda x: x[0])

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        print("output_eval_file:", output_eval_file)
        tf.logging.info("output_eval_file:" + output_eval_file)
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
                result = estimator.evaluate(input_fn=eval_input_fn,
                                            steps=eval_steps,
                                            checkpoint_path=filename)
                tf.logging.info("***** Eval results %s *****" % (filename))
                writer.write("***** Eval results %s *****\n" % (filename))
                for key in sorted(result.keys()):
                    tf.logging.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        # Single-checkpoint evaluation, kept for reference:
        # result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        # output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        # with tf.gfile.GFile(output_eval_file, "w") as writer:
        #     tf.logging.info("***** Eval results *****")
        #     for key in sorted(result.keys()):
        #         tf.logging.info("  %s = %s", key, str(result[key]))
        #         writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        true_labels = []
        with open(os.path.join(FLAGS.data_dir, "test.tsv"), 'r',
                  encoding='utf-8') as f:
            for line in f.readlines():
                line = line.strip()
                true_labels.append(int(line.split('\t')[0]))

        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the
            # number of examples must be a multiple of the batch size, or else
            # examples will get dropped. So we pad with fake examples which are
            # ignored later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                predict_file)

        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)

        predictions = []
        output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                a = probabilities.tolist()
                predictions.append(a.index(max(a)))
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples

        count = 0
        for i in range(len(predictions)):
            if predictions[i] == true_labels[i]:
                count += 1
        print("Average accuracy: ", count / len(predictions))

        with open(os.path.join(FLAGS.data_dir, "id2label.json"), 'r',
                  encoding='utf-8') as f:
            id2label = json.load(f)
        cla_labels = [i for i in range(FLAGS.cla_nums)]
        report = metrics.classification_report(
            y_true=true_labels,
            y_pred=predictions,
            labels=cla_labels,
            target_names=[id2label[str(i)].split()[0] for i in cla_labels],
            digits=4)
        confusion_matrix = metrics.confusion_matrix(y_true=true_labels,
                                                    y_pred=predictions,
                                                    labels=cla_labels)
        print(report)
        print(confusion_matrix)
        with open(os.path.join(FLAGS.output_dir, "eval_report.txt"), 'w',
                  encoding='utf-8') as f:
            f.write(report)
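# Hedged worked example: how the per-worker step count in main() above is
# derived. With 9,600 train examples, per-GPU batch 32, 3 epochs, and 4
# Horovod workers (numbers chosen for illustration):
num_train_steps = int(9600 / 32 * 3)    # 900 optimizer steps on one GPU
num_train_steps = num_train_steps // 4  # 225 steps per worker; the effective
                                        # global batch is 32 * 4 = 128, so the
                                        # data is still covered ~3 times total.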
def main(_):
    # Initialize Horovod.
    hvd.init()

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
    # Set inter_op to 1 and intra_op to the number of physical cores.
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False,
                            intra_op_parallelism_threads=FLAGS.num_cpu_threads,
                            inter_op_parallelism_threads=1,
                            gpu_options=gpu_options)

    num_gpus = validate_batch_size_for_multi_gpu(FLAGS.batch_size)

    # Horovod: save checkpoints only on worker 0 to prevent other workers
    # from corrupting them.
    model_dir_hvd = FLAGS.model_dir if hvd.rank() == 0 else None

    # Set up a RunConfig to only save checkpoints once per training cycle.
    run_config = tf.estimator.RunConfig().replace(
        save_checkpoints_secs=FLAGS.save_checkpoints_secs,
        save_checkpoints_steps=None,
        save_summary_steps=FLAGS.save_summary_steps,
        keep_checkpoint_max=5,
        tf_random_seed=FLAGS.tf_random_seed,
        log_step_count_steps=FLAGS.log_every_n_steps,
        session_config=config,
        model_dir=model_dir_hvd)

    replicate_ssd_model_fn = tf.contrib.estimator.replicate_model_fn(
        ssd_model_fn, loss_reduction=tf.losses.Reduction.MEAN)
    ssd_detector = tf.estimator.Estimator(
        model_fn=replicate_ssd_model_fn,
        model_dir=model_dir_hvd,
        config=run_config,
        params={
            'num_gpus': num_gpus,
            'data_format': FLAGS.data_format,
            'batch_size': FLAGS.batch_size,
            'model_scope': FLAGS.model_scope,
            'num_classes': FLAGS.num_classes,
            'negative_ratio': FLAGS.negative_ratio,
            'match_threshold': FLAGS.match_threshold,
            'neg_threshold': FLAGS.neg_threshold,
            'weight_decay': FLAGS.weight_decay,
            'momentum': FLAGS.momentum,
            'learning_rate': FLAGS.learning_rate,
            'end_learning_rate': FLAGS.end_learning_rate,
            'decay_boundaries': parse_comma_list(FLAGS.decay_boundaries),
            'lr_decay_factors': parse_comma_list(FLAGS.lr_decay_factors),
        })

    tensors_to_log = {
        'lr': 'learning_rate_log',
        'ce': 'cross_entropy_loss',
        'loc': 'location_loss',
        'loss': 'total_loss',
        'l2': 'l2_loss',
        'acc': 'post_forward/cls_accuracy',
    }
    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log,
        every_n_iter=FLAGS.log_every_n_steps,
        formatter=lambda dicts: (', '.join(['%s=%.6f' % (k, v)
                                            for k, v in dicts.items()])))

    # Horovod: broadcast initial variable states from rank 0 to all other
    # processes. This ensures consistent initialization when training is
    # started from random weights or a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
    # hook = tf.train.ProfilerHook(save_steps=50, output_dir='.', show_memory=True)

    print('Starting a training cycle.')
    ssd_detector.train(
        input_fn=input_pipeline(dataset_pattern='train-*',
                                is_training=True,
                                batch_size=FLAGS.batch_size),
        hooks=[logging_hook, bcast_hook],
        max_steps=FLAGS.max_number_of_steps)
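# Hedged sketch: a plausible implementation of the parse_comma_list() helper
# used above. The real helper lives elsewhere in the repository, so this is
# an assumption about its behavior, not the actual source.
def parse_comma_list(args):
    return [float(s.strip()) for s in args.split(',')]

# e.g. parse_comma_list('0.1, 0.01, 0.001') -> [0.1, 0.01, 0.001]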
def train_eval_fn(FLAGS,
                  worker_count,
                  task_index,
                  is_chief,
                  target,
                  init_checkpoint,
                  train_file,
                  dev_file,
                  checkpoint_dir,
                  is_debug):

    graph = tf.Graph()
    with graph.as_default():
        import json
        config = json.load(open(FLAGS.config_file, "r"))
        config = Bunch(config)
        config.use_one_hot_embeddings = True
        config.scope = "bert"
        config.dropout_prob = 0.1
        config.label_type = "single_label"

        if FLAGS.if_shard == "0":
            train_size = FLAGS.train_size
            epoch = int(FLAGS.epoch / worker_count)
        elif FLAGS.if_shard == "1":
            train_size = int(FLAGS.train_size / worker_count)
            epoch = FLAGS.epoch

        init_lr = 2e-5

        label_dict = json.load(open(FLAGS.label_id))

        num_train_steps = int(train_size / FLAGS.batch_size * epoch)
        num_warmup_steps = int(num_train_steps * 0.1)
        num_storage_steps = int(train_size / FLAGS.batch_size)
        num_eval_steps = int(FLAGS.eval_size / FLAGS.batch_size)

        if is_debug == "0":
            num_storage_steps = 2
            num_eval_steps = 10
            num_train_steps = 10

        print("num_train_steps {}, num_eval_steps {}, num_storage_steps {}".format(
            num_train_steps, num_eval_steps, num_storage_steps))
        print(" model type {}".format(FLAGS.model_type))
        print(num_train_steps, num_warmup_steps, "=============")

        opt_config = Bunch({
            "init_lr": init_lr / worker_count,
            "num_train_steps": num_train_steps,
            "num_warmup_steps": num_warmup_steps,
            "worker_count": worker_count,
            "opt_type": FLAGS.opt_type
        })

        model_io_config = Bunch({"fix_lm": False})

        model_io_fn = model_io.ModelIO(model_io_config)
        optimizer_fn = optimizer.Optimizer(opt_config)

        num_classes = FLAGS.num_classes

        model_train_fn = model_fn_builder(config, num_classes, init_checkpoint,
                                          model_reuse=None,
                                          load_pretrained=True,
                                          model_io_fn=model_io_fn,
                                          optimizer_fn=optimizer_fn,
                                          model_io_config=model_io_config,
                                          opt_config=opt_config,
                                          exclude_scope="",
                                          not_storage_params=[],
                                          target="")

        model_eval_fn = model_fn_builder(config, num_classes, init_checkpoint,
                                         model_reuse=True,
                                         load_pretrained=True,
                                         model_io_fn=model_io_fn,
                                         optimizer_fn=optimizer_fn,
                                         model_io_config=model_io_config,
                                         opt_config=opt_config,
                                         exclude_scope="",
                                         not_storage_params=[],
                                         target="")

        def eval_metric_fn(features, eval_op_dict):
            logits = eval_op_dict["logits"]
            print(logits.get_shape(), "===logits shape===")
            pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
            prob = tf.nn.softmax(logits)
            correct = tf.equal(
                tf.cast(pred_label, tf.int32),
                tf.cast(features["label_ids"], tf.int32))
            accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
            return {"accuracy": accuracy,
                    "loss": eval_op_dict["loss"],
                    "pred_label": pred_label,
                    "label_ids": features["label_ids"]}

        def train_metric_fn(features, train_op_dict):
            logits = train_op_dict["logits"]
            print(logits.get_shape(), "===logits shape===")
            pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
            prob = tf.nn.softmax(logits)
            correct = tf.equal(
                tf.cast(pred_label, tf.int32),
                tf.cast(features["label_ids"], tf.int32))
            accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
            return {"accuracy": accuracy,
                    "loss": train_op_dict["loss"],
                    "train_op": train_op_dict["train_op"]}

        name_to_features = {
            "input_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "input_mask": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "segment_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "label_ids": tf.FixedLenFeature([], tf.int64),
        }

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example."""
            example = tf.parse_single_example(record, name_to_features)

            # tf.Example only supports tf.int64, but the TPU only supports
            # tf.int32, so cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t
            return example

        params = Bunch({})
        params.epoch = FLAGS.epoch
        params.batch_size = FLAGS.batch_size

        train_features = tf_data_utils.train_input_fn(
            train_file, _decode_record, name_to_features, params,
            if_shard=FLAGS.if_shard,
            worker_count=worker_count,
            task_index=task_index)

        eval_features = tf_data_utils.eval_input_fn(
            dev_file, _decode_record, name_to_features, params,
            if_shard=FLAGS.if_shard,
            worker_count=worker_count,
            task_index=task_index)

        train_op_dict = model_train_fn(train_features, [],
                                       tf.estimator.ModeKeys.TRAIN)
        eval_op_dict = model_eval_fn(eval_features, [],
                                     tf.estimator.ModeKeys.EVAL)

        eval_dict = eval_metric_fn(eval_features, eval_op_dict["eval"])
        train_dict = train_metric_fn(train_features, train_op_dict["train"])

        def eval_fn(eval_dict, sess):
            i = 0
            total_accuracy = 0
            eval_total_dict = {}
            while True:
                try:
                    eval_result = sess.run(eval_dict)
                    for key in eval_result:
                        if key not in eval_total_dict:
                            if key in ["pred_label", "label_ids"]:
                                eval_total_dict[key] = []
                                eval_total_dict[key].extend(eval_result[key])
                            if key in ["accuracy", "loss"]:
                                eval_total_dict[key] = 0.0
                                eval_total_dict[key] += eval_result[key]
                        else:
                            if key in ["pred_label", "label_ids"]:
                                eval_total_dict[key].extend(eval_result[key])
                            if key in ["accuracy", "loss"]:
                                eval_total_dict[key] += eval_result[key]
                    i += 1
                    if np.mod(i, num_eval_steps) == 0:
                        break
                except tf.errors.OutOfRangeError:
                    print("End of dataset")
                    break
            label_id = eval_total_dict["label_ids"]
            pred_label = eval_total_dict["pred_label"]
            result = classification_report(
                label_id, pred_label,
                target_names=list(label_dict["label2id"].keys()))
            print(result, task_index)
            eval_total_dict["classification_report"] = result
            return eval_total_dict

        def train_fn(train_op_dict, sess):
            i = 0
            cnt = 0
            loss_dict = {}
            monitoring_train = []
            monitoring_eval = []
            while True:
                try:
                    [train_result, step] = sess.run(
                        [train_op_dict, tf.train.get_global_step()])
                    for key in train_result:
                        if key == "train_op":
                            continue
                        if np.isnan(train_result[key]):
                            print(train_result[key], "get nan loss")
                            break
                        if key in loss_dict:
                            loss_dict[key] += train_result[key]
                        else:
                            loss_dict[key] = train_result[key]
                    i += 1
                    cnt += 1
                    if np.mod(i, num_storage_steps) == 0:
                        string = ""
                        for key in loss_dict:
                            tmp = key + " " + str(loss_dict[key] / cnt) + "\t"
                            string += tmp
                        print(string, step)
                        monitoring_train.append(loss_dict)

                        eval_finial_dict = eval_fn(eval_dict, sess)
                        monitoring_eval.append(eval_finial_dict)

                        for key in loss_dict:
                            loss_dict[key] = 0.0
                        cnt = 0
                    if is_debug == "0":
                        if i == num_train_steps:
                            break
                except tf.errors.OutOfRangeError:
                    print("==Succeeded in training model==")
                    break  # end of dataset reached

        print("===========begin to train============")
        sess_config = tf.ConfigProto(allow_soft_placement=False,
                                     log_device_placement=False)

        # Save checkpoints only on the chief worker.
        checkpoint_dir = checkpoint_dir if task_index == 0 else None
        print("==checkpoint_dir==", checkpoint_dir)
        print("start training")

        hooks = []
        if FLAGS.opt_type == "ps":
            sync_replicas_hook = optimizer_fn.opt.make_session_run_hook(
                is_chief, num_tokens=0)
            hooks.append(sync_replicas_hook)
            sess = tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=is_chief,
                config=sess_config,
                hooks=hooks,
                checkpoint_dir=checkpoint_dir,
                save_checkpoint_steps=num_storage_steps)
        elif FLAGS.opt_type == "pai_soar" and pai:
            sess = tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=is_chief,
                config=sess_config,
                hooks=hooks,
                checkpoint_dir=checkpoint_dir,
                save_checkpoint_steps=num_storage_steps)
        elif FLAGS.opt_type == "hvd" and hvd:
            bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
            hooks.append(bcast_hook)
            sess_config.gpu_options.allow_growth = True
            sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
            sess = tf.train.MonitoredTrainingSession(
                checkpoint_dir=checkpoint_dir,
                hooks=hooks,
                config=sess_config,
                save_checkpoint_steps=num_storage_steps)
        else:
            print("==single sess==")
            sess = tf.train.MonitoredTrainingSession(
                config=sess_config,
                hooks=hooks,
                checkpoint_dir=checkpoint_dir,
                save_checkpoint_steps=num_storage_steps)

        step = sess.run(optimizer_fn.global_step)
        print(step)

        train_fn(train_dict, sess)

        if task_index == 0:
            print("===========begin to eval============")
            eval_finial_dict = eval_fn(eval_dict, sess)
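# Hedged sketch: the metric accumulation in eval_fn above sums "accuracy"
# and "loss" over steps without dividing by the step count; a minimal
# averaged variant would look like this. All names here are illustrative.
def averaged_eval(sess, eval_dict, num_eval_steps):
    totals = {"accuracy": 0.0, "loss": 0.0}
    for _ in range(num_eval_steps):
        result = sess.run(eval_dict)
        for key in totals:
            totals[key] += result[key]
    return {key: value / num_eval_steps for key, value in totals.items()}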
def __init__(self, model_info):
    # Horovod: initialize Horovod.
    print(hvd.size())

    # First parse the application info and define the network.
    self.game_name = model_info[0]

    # Set remaining variables.
    self.batch_size = 128
    self.learning_rate = model_info[5]

    if self.game_name == 'min_cover_s2v':
        self.num_nodes = model_info[1]
        self.embed_dim = model_info[2]
        self.batch_size = 2
        self.epochs = 50

        ### Define placeholders as input.
        # Input of the model (X decided by selected or not).
        self.x = tf.placeholder(shape=[None, self.num_nodes], dtype=tf.float32)
        self.ad_matrix = tf.placeholder(
            shape=[None, self.num_nodes, self.num_nodes], dtype=tf.float32)
        # Re-declared with fully dynamic shapes (overrides the fixed-size
        # placeholders above, as in the original code).
        self.x = tf.placeholder(shape=[None, None], dtype=tf.float32)
        self.ad_matrix = tf.placeholder(shape=[None, None, None],
                                        dtype=tf.float32)
        # Label value.
        self.y = tf.placeholder(shape=[1, None], dtype=tf.float32)

        # Used to obtain mu_v.
        self.select_vec = tf.placeholder(shape=[None, None, 1],
                                         dtype=tf.float32)
        # Used to obtain sigma mu_u.
        self.all_vec = tf.placeholder(shape=[None, None, 1],
                                      dtype=tf.float32)

        self.init_limit = 0.5
        self.embed_layer = 5
        self.hid_num_units = 16

        def _uniform(shape):
            # Same initializer the original repeated for every parameter.
            return tf.random_uniform(shape,
                                     minval=-self.init_limit,
                                     maxval=self.init_limit,
                                     dtype=tf.float32, seed=1)

        ### Define parameters for the embedding model.
        self.embed_param = {
            'theta1': tf.Variable(_uniform([self.embed_dim, 1])),
            'theta2': tf.Variable(_uniform([self.embed_dim, self.embed_dim])),
            'theta3': tf.Variable(_uniform([self.embed_dim, self.embed_dim])),
            'theta4': tf.Variable(_uniform([self.embed_dim, 1]))
        }

        ### Define parameters for the Q model.
        self.value_param = {
            'hid': tf.Variable(_uniform([2 * self.embed_dim, self.hid_num_units])),
            'theta5': tf.Variable(_uniform([self.hid_num_units, 1])),
            'theta6': tf.Variable(_uniform([self.embed_dim, self.embed_dim])),
            'theta7': tf.Variable(_uniform([self.embed_dim, self.embed_dim]))
        }

        self.size = tf.shape(self.ad_matrix)[0]
        self.size_nodes = tf.shape(self.ad_matrix)[1]

        ### First level of embedding.
        self.theta4_relu = tf.nn.relu(
            tf.tile(self.embed_param['theta4'],
                    tf.stack([1, self.size_nodes])))
        # tf.broadcast_to is only supported after 1.13.0, so tile instead:
        # self.theta4_relu = tf.broadcast_to(
        #     self.theta4_relu, [self.size, self.embed_dim, self.num_nodes])
        self.theta4_relu = tf.expand_dims(self.theta4_relu, 0)
        self.theta4_relu = tf.tile(self.theta4_relu, [self.size, 1, 1])

        self.theta3 = tf.expand_dims(self.embed_param['theta3'], 0)
        self.theta3 = tf.tile(self.theta3, [self.size, 1, 1])
        self.mu_0_sigma = tf.matmul(self.theta3,
                                    tf.matmul(self.theta4_relu, self.ad_matrix))

        self.theta1 = tf.expand_dims(self.embed_param['theta1'], 0)
        self.theta1 = tf.tile(self.theta1, [self.size, 1, 1])

        self.x_reshape = tf.expand_dims(self.x, 0)
        self.x_reshape = tf.transpose(self.x_reshape, perm=[1, 0, 2])

        self.mu_0 = tf.nn.relu(tf.add(tf.matmul(self.theta1, self.x_reshape),
                                      self.mu_0_sigma))

        ### Second level of embedding.
        self.theta2 = tf.expand_dims(self.embed_param['theta2'], 0)
        self.theta2 = tf.tile(self.theta2, [self.size, 1, 1])
        self.mu_1_sigma = tf.matmul(self.theta2,
                                    tf.matmul(self.mu_0, self.ad_matrix))
        self.mu_1 = tf.add(tf.matmul(self.theta1, self.x_reshape),
                           self.mu_1_sigma)
        self.mu_1 = tf.nn.relu(tf.add(self.mu_1, self.mu_1_sigma))

        ### Remaining embedding layers.
        for i in range(2, self.embed_layer):
            self.mu_1_sigma = tf.matmul(self.theta2,
                                        tf.matmul(self.mu_1, self.ad_matrix))
            self.mu_1 = tf.add(tf.matmul(self.theta1, self.x_reshape),
                               self.mu_1_sigma)
            self.mu_1 = tf.nn.relu(tf.add(self.mu_1, self.mu_1_sigma))

        ### Q value function.
        self.Q_sigma = tf.matmul(self.mu_1, self.all_vec)
        self.Q_sigma = tf.matmul(self.value_param['theta6'],
                                 tf.reshape(self.Q_sigma, [self.embed_dim, -1]))
        self.mu_u = tf.matmul(self.mu_1, self.select_vec)
        self.mu_u = tf.matmul(self.value_param['theta7'],
                              tf.reshape(self.mu_u, [self.embed_dim, -1]))
        self.Q_vec = tf.nn.relu(tf.concat([self.Q_sigma, self.mu_u], 0))
        self.output_hidden = tf.matmul(tf.transpose(self.value_param['hid']),
                                       self.Q_vec)
        self.output_layer = tf.matmul(tf.transpose(self.value_param['theta5']),
                                      self.output_hidden)
    else:
        self.epochs = 3
        self.in_num_units = model_info[1]
        self.hid_num_units = model_info[2]
        self.out_num_units = model_info[3]

        # Define placeholders.
        self.x = tf.placeholder(shape=[None, self.in_num_units],
                                dtype=tf.float32)
        self.y = tf.placeholder(shape=[None, self.out_num_units],
                                dtype=tf.float32)

        ### Define weights and biases of the neural network.
        self.weights = {
            'hidden1': tf.Variable(tf.random_normal(
                [self.in_num_units, self.hid_num_units], seed=1)),
            'hidden2': tf.Variable(tf.random_normal(
                [self.hid_num_units, int(self.hid_num_units / 2)], seed=1)),
            'output': tf.Variable(tf.random_normal(
                [int(self.hid_num_units / 2), self.out_num_units], seed=1))
        }
        self.biases = {
            'hidden1': tf.Variable(tf.random_normal([self.hid_num_units], seed=1)),
            'hidden2': tf.Variable(tf.random_normal([int(self.hid_num_units / 2)], seed=1)),
            'output': tf.Variable(tf.random_normal([self.out_num_units], seed=1))
        }

        self.hidden_layer1 = tf.add(tf.matmul(self.x, self.weights['hidden1']),
                                    self.biases['hidden1'])
        self.hidden_layer1 = tf.nn.relu(self.hidden_layer1)
        self.hidden_layer2 = tf.add(tf.matmul(self.hidden_layer1,
                                              self.weights['hidden2']),
                                    self.biases['hidden2'])
        self.hidden_layer2 = tf.nn.relu(self.hidden_layer2)
        self.output_layer = (tf.matmul(self.hidden_layer2, self.weights['output'])
                             + self.biases['output'])
        # self.output_layer = tf.nn.relu(self.output_layer)

    self.cost = tf.reduce_sum(tf.square(self.output_layer - self.y))
    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
    # self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate)

    global_step = tf.train.get_or_create_global_step()
    # Horovod: wrap the optimizer so gradients are averaged across workers.
    self.hvd_opt = hvd.DistributedOptimizer(self.optimizer)
    self.train_op = self.hvd_opt.minimize(self.cost, global_step=global_step)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # config.gpu_options.per_process_gpu_memory_fraction = 1
    # Horovod: pin GPU to be used to process local rank (one GPU per process).
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    print(config.gpu_options.visible_device_list)

    self.sess = tf.Session(config=config)
    self.sess.run(tf.global_variables_initializer())
    bcast = hvd.broadcast_global_variables(0)
    print("it has been initialized")

    self.hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable
        # states from rank 0 to all other processes. This is necessary to
        # ensure consistent initialization of all workers when training is
        # started with random weights or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),
        # Horovod: adjust number of steps based on number of GPUs.
        # tf.train.StopAtStepHook(last_step=self.epochs // hvd.size()),
        # tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': self.cost},
        #                            every_n_iter=10),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers
    # from corrupting them.
    self.checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    # with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
    #                                        hooks=self.hooks,
    #                                        config=config) as mon_sess:
    #     while not mon_sess.should_stop():
    #         # Run a training step synchronously.
    #         image_, label_ = next(training_batch_generator)
    #         mon_sess.run(train_op, feed_dict={image: image_, label: label_})
    self.mon_sess = tf.train.MonitoredTrainingSession(hooks=self.hooks,
                                                      config=config)
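# Hedged usage sketch: driving the graph built above for one training step in
# the 'min_cover_s2v' branch. It assumes `model` is a constructed instance of
# this class; shapes follow the placeholder definitions (a batch of adjacency
# matrices plus per-node selection vectors), and the random arrays are
# stand-ins for real graph data.
import numpy as np

batch, n = 2, 10
feed = {
    model.x: np.random.rand(batch, n).astype(np.float32),
    model.ad_matrix: np.random.rand(batch, n, n).astype(np.float32),
    model.y: np.random.rand(1, batch).astype(np.float32),
    model.select_vec: np.random.rand(batch, n, 1).astype(np.float32),
    model.all_vec: np.ones((batch, n, 1), dtype=np.float32),
}
_, loss_value = model.mon_sess.run([model.train_op, model.cost],
                                   feed_dict=feed)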
def main(_):
    hvd.init()

    # Only see a single unique GPU based on process rank.
    config = tf.ConfigProto()
    # We don't need allow_growth=True as we will use a whole GPU per process.
    # config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Only one of the workers saves checkpoints and summaries in the
    # model directory.
    if hvd.rank() == 0:
        config = tf.estimator.RunConfig(
            model_dir=FLAGS.model_dir,
            keep_checkpoint_every_n_hours=5,
            save_summary_steps=100,
            save_checkpoints_secs=60 * 5,
            session_config=config)
    else:
        config = tf.estimator.RunConfig(
            session_config=config,
            keep_checkpoint_max=1)

    if FLAGS.mobilenet_checkpoint_path is not None:
        # ^((?!badword).)*$ matches all strings which do not contain the badword.
        ws = tf.estimator.WarmStartSettings(
            ckpt_to_initialize_from=FLAGS.mobilenet_checkpoint_path,
            vars_to_warm_start='.*' if FLAGS.restore_last_layer else "^((?!Logits).)*$",
        )
    else:
        ws = None

    estimator = tf.estimator.Estimator(
        model_fn=model.model_fn,
        config=config,
        warm_start_from=ws)

    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
    image_counter_hook = ImageRateHook()

    # Caching file writers, which will then be retrieved during evaluation.
    writer = tf.summary.FileWriter(logdir=FLAGS.model_dir, flush_secs=30)
    eval_writer = tf.summary.FileWriter(
        logdir=os.path.join(FLAGS.model_dir, "eval"), flush_secs=30)

    try:
        steps = estimator.get_variable_value('global_step')
    except ValueError:
        steps = 0

    evaluate_every_n = 1000
    evaluate(estimator, True)
    evaluate(estimator, False)
    # if hvd.rank() == 0 and FLAGS.evaluate:
    #     evaluate(estimator, False)
    # NOTE: this unconditional exit makes the training loop below unreachable.
    sys.exit()

    print("Steps", steps, "Max steps", FLAGS.max_steps)
    while steps < FLAGS.max_steps:
        evaluate_every_n = min(evaluate_every_n, FLAGS.max_steps - steps)
        estimator.train(
            input_fn=lambda: model.imagenet_iterator(is_training=True,
                                                     num_epochs=10000),
            steps=evaluate_every_n,
            hooks=[bcast_hook, image_counter_hook])
        if hvd.rank() == 0 and FLAGS.evaluate:
            # Evaluate on training set only for metric_learning.
            if FLAGS.model in ['metric_learning', 'cifar100']:
                evaluate(estimator, True)
                evaluate(estimator, False)
            else:
                evaluate(estimator, False)
        steps += evaluate_every_n
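# Hedged sketch: what the warm-start regex above selects. The negative
# lookahead "^((?!Logits).)*$" matches every variable whose name does NOT
# contain "Logits", i.e. everything except the classification head. Variable
# names below are illustrative.
import re

pattern = re.compile(r'^((?!Logits).)*$')
assert pattern.match('MobilenetV2/Conv/weights')             # warm-started
assert not pattern.match('MobilenetV2/Logits/Conv2d_1c')     # randomly re-init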
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

hvd.init()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
classifier = tf.estimator.Estimator(
    model_fn=my_model,
    model_dir='./checkpoints_inceptionv3_%s' % hvd.rank(),
    # params={
    #     'feature_columns': my_feature_columns,
    #     'hidden_units': [10, 10],  # Two hidden layers of 10 nodes each.
    #     'n_classes': 3,  # The model must choose between 3 classes.
    # }
)
bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
train = classifier.train(input_fn=train_input_fn, steps=100, hooks=[bcast_hook])
eval_result = classifier.evaluate(input_fn=test_input_fn, steps=10)
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
def train_schedule( estimator, train_eval_iterations, single_iteration_train_steps=None, single_iteration_train_epochs=None, bleu_source=None, bleu_ref=None, bleu_threshold=None): """Train and evaluate model, and optionally compute model's BLEU score. **Step vs. Epoch vs. Iteration** Steps and epochs are canonical terms used in TensorFlow and general machine learning. They are used to describe running a single process (train/eval): - Step refers to running the process through a single or batch of examples. - Epoch refers to running the process through an entire dataset. E.g. training a dataset with 100 examples. The dataset is divided into 20 batches with 5 examples per batch. A single training step trains the model on one batch. After 20 training steps, the model will have trained on every batch in the dataset, or, in other words, one epoch. Meanwhile, iteration is used in this implementation to describe running multiple processes (training and eval). - A single iteration: 1. trains the model for a specific number of steps or epochs. 2. evaluates the model. 3. (if source and ref files are provided) compute BLEU score. This function runs through multiple train+eval+bleu iterations. Args: estimator: tf.Estimator containing model to train. train_eval_iterations: Number of times to repeat the train+eval iteration. single_iteration_train_steps: Number of steps to train in one iteration. single_iteration_train_epochs: Number of epochs to train in one iteration. bleu_source: File containing text to be translated for BLEU calculation. bleu_ref: File containing reference translations for BLEU calculation. bleu_threshold: minimum BLEU score before training is stopped. Raises: ValueError: if both or none of single_iteration_train_steps and single_iteration_train_epochs were defined. """ # Ensure that exactly one of single_iteration_train_steps and # single_iteration_train_epochs is defined. if single_iteration_train_steps is None: if single_iteration_train_epochs is None: raise ValueError( "Exactly one of single_iteration_train_steps or " "single_iteration_train_epochs must be defined. Both were none.") else: if single_iteration_train_epochs is not None: raise ValueError( "Exactly one of single_iteration_train_steps or " "single_iteration_train_epochs must be defined. Both were defined.") evaluate_bleu = bleu_source is not None and bleu_ref is not None # Print out training schedule print("Training schedule:") if single_iteration_train_epochs is not None: print("\t1. Train for %d epochs." % single_iteration_train_epochs) else: print("\t1. Train for %d steps." % single_iteration_train_steps) print("\t2. Evaluate model.") if evaluate_bleu: print("\t3. Compute BLEU score.") if bleu_threshold is not None: print("Repeat above steps until the BLEU score reaches", bleu_threshold) if not evaluate_bleu or bleu_threshold is None: print("Repeat above steps %d times." % train_eval_iterations) if evaluate_bleu: # Set summary writer to log bleu score. bleu_writer = tf.compat.v1.summary.FileWriter( os.path.join(estimator.model_dir, BLEU_DIR)) if bleu_threshold is not None: # Change loop stopping condition if bleu_threshold is defined. 
train_eval_iterations = INF # Loop training/evaluation/bleu cycles mlperf_log.transformer_print(key=mlperf_log.TRAIN_LOOP) # Profiling with timeline if FLAGS.save_profile == "Yes": profile_hooks = [tf.compat.v1.train.ProfilerHook(save_steps=1, output_dir=FLAGS.profile_dir)] # the json file #profile file will be saved in in profile_dir #Creating hooks for printing Examples per Second, used with estimator.train training_batch_size = estimator.params.batch_size if FLAGS.batch_size != -1: training_batch_size = FLAGS.batch_size train_hooks = hooks_helper.get_train_hooks( ["ExamplesPerSecondHook"], model_dir=FLAGS.model_dir, batch_size=training_batch_size, every_n_steps=FLAGS.print_iter, warm_steps=50 ) if FLAGS.save_profile == "Yes": hooks = profile_hooks else: hooks = train_hooks for i in xrange(train_eval_iterations): print("Starting iteration", i + 1) if single_iteration_train_epochs is not None: mlperf_log.transformer_print(key=mlperf_log.TRAIN_EPOCH, value=i * single_iteration_train_epochs + 1) #Can we move the following out of the loop if is_mpi: train_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) # Train the model for single_iteration_train_steps or until the input fn # runs out of examples (if single_iteration_train_steps is None). estimator.train(dataset.train_input_fn, steps=single_iteration_train_steps, hooks=hooks) mlperf_log.transformer_print(key=mlperf_log.EVAL_START) # To save training time, we can turn off evaluation # Otherwise it will be turned on if FLAGS.do_eval == "Yes": eval_results = estimator.evaluate(dataset.eval_input_fn) print("Evaluation results (iter %d/%d):" % (i + 1, train_eval_iterations), eval_results) if evaluate_bleu: uncased_score, _ = evaluate_and_log_bleu( estimator, bleu_writer, bleu_source, bleu_ref) if bleu_threshold is not None and uncased_score > bleu_threshold: bleu_writer.close() break mlperf_log.transformer_print(key=mlperf_log.EVAL_TARGET, value=bleu_threshold) mlperf_log.transformer_print(key=mlperf_log.EVAL_ACCURACY, value=uncased_score) mlperf_log.transformer_print(key=mlperf_log.EVAL_STOP)
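The docstring above works through the bookkeeping in words: 100 examples split into 20 batches of 5 means one epoch is 20 steps. The same arithmetic as a tiny helper (a sketch; `train_schedule` itself receives steps or epochs already resolved by its caller):

def epochs_to_steps(num_epochs, num_examples, global_batch_size):
    # One epoch = one pass over the dataset = num_examples / batch_size steps.
    steps_per_epoch = num_examples // global_batch_size
    return num_epochs * steps_per_epoch

assert epochs_to_steps(1, 100, 5) == 20   # the docstring's example
assert epochs_to_steps(3, 100, 5) == 60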
def train(infer_func, params): image_width = params['image_width'] image_height = params['image_height'] image_format = params['image_format'] batch_size = params['batch_size'] distort_color = params['distort_color'] data_dir = params['data_dir'] data_idx_dir = params['data_idx_dir'] log_dir = params['log_dir'] precision = params['precision'] momentum = params['momentum'] learning_rate_init = params['learning_rate_init'] learning_rate_power = params['learning_rate_power'] weight_decay = params['weight_decay'] loss_scale = params['loss_scale'] larc_eta = params['larc_eta'] larc_mode = params['larc_mode'] num_iter = params['num_iter'] checkpoint_secs = params['checkpoint_secs'] display_every = params['display_every'] iter_unit = params['iter_unit'] use_dali = params['use_dali'] # Determinism is not fully supported by all TF ops. # Disabling until remaining wrinkles can be ironed out. deterministic = False if deterministic: tf.set_random_seed(2 * (1 + hvd.rank())) random.seed(3 * (1 + hvd.rank())) np.random.seed(2) log_dir = None if log_dir == "" else log_dir data_dir = None if data_dir == "" else data_dir data_idx_dir = None if data_idx_dir == "" else data_idx_dir global_batch_size = batch_size * hvd.size() if data_dir is not None: filename_pattern = os.path.join(data_dir, '%s-*') train_filenames = sorted(tf.gfile.Glob(filename_pattern % 'train')) num_training_samples = _get_num_records(train_filenames) else: num_training_samples = global_batch_size train_idx_filenames = None if data_idx_dir is not None: filename_pattern = os.path.join(data_idx_dir, '%s-*') train_idx_filenames = sorted(tf.gfile.Glob(filename_pattern % 'train')) if iter_unit.lower() == 'epoch': nstep = num_training_samples * num_iter // global_batch_size decay_steps = nstep else: nstep = num_iter num_epochs = max(nstep * global_batch_size // num_training_samples, 1) decay_steps = 90 * num_training_samples // global_batch_size nstep_per_epoch = num_training_samples // global_batch_size # Horovod: pin GPU to be used to process local rank (one GPU per process) gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7) config = tf.ConfigProto(gpu_options=gpu_options) #config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) config.gpu_options.force_gpu_compatible = True # Force pinned memory config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads config.inter_op_parallelism_threads = max(2, 40//hvd.size()-2) classifier = tf.estimator.Estimator( model_fn=_cnn_model_function, model_dir=log_dir, params={ 'model': infer_func, 'format': image_format, 'dtype' : tf.float16 if precision == 'fp16' else tf.float32, 'momentum' : momentum, 'learning_rate_init' : learning_rate_init, 'learning_rate_power' : learning_rate_power, 'decay_steps' : decay_steps, 'weight_decay' : weight_decay, 'loss_scale' : loss_scale, 'larc_eta' : larc_eta, 'larc_mode' : larc_mode, 'deterministic' : deterministic, 'n_classes': 1000, 'use_dali': use_dali, }, config=tf.estimator.RunConfig( tf_random_seed=2 * (1 + hvd.rank()) if deterministic else None, session_config=config, save_checkpoints_secs=checkpoint_secs if hvd.rank() == 0 else None, save_checkpoints_steps=nstep if hvd.rank() == 0 else None, keep_checkpoint_every_n_hours=3)) print("Training") if not deterministic and not use_dali: num_preproc_threads = 10 elif not deterministic and use_dali: num_preproc_threads = 2 elif deterministic: num_preproc_threads = 1 training_hooks = [hvd.BroadcastGlobalVariablesHook(0), _PrefillStagingAreasHook()] if 
hvd.rank() == 0: training_hooks.append( _LogSessionRunHook(global_batch_size, num_training_samples, display_every)) if data_dir is not None: input_func = lambda: nvutils.image_set( train_filenames, batch_size, image_height, image_width, training=True, distort_color=distort_color, deterministic=deterministic, num_threads=num_preproc_threads, use_dali=use_dali, idx_filenames=train_idx_filenames) else: input_func = lambda: nvutils.fake_image_set( batch_size, image_height, image_width) try: classifier.train( input_fn=input_func, max_steps=nstep, hooks=training_hooks) except KeyboardInterrupt: print("Keyboard interrupt")
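The step bookkeeping near the top of `train` converts `num_iter` into a step budget differently depending on `iter_unit`. Factored out as a sketch (same formulas as above; the ImageNet-sized sample count in the example is only illustrative):

def compute_schedule(iter_unit, num_iter, num_training_samples,
                     global_batch_size):
    if iter_unit.lower() == 'epoch':
        # num_iter counts epochs: one epoch is samples/global_batch steps.
        nstep = num_training_samples * num_iter // global_batch_size
        decay_steps = nstep
    else:
        # num_iter counts steps directly; decay over a nominal 90 epochs.
        nstep = num_iter
        decay_steps = 90 * num_training_samples // global_batch_size
    nstep_per_epoch = num_training_samples // global_batch_size
    return nstep, decay_steps, nstep_per_epoch

# 90 epochs, ~1.28M samples, 8 GPUs x per-GPU batch 256:
print(compute_schedule('epoch', 90, 1281167, 256 * 8))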
def main_train(args, files, tf_config): assert args.logdir != '', 'logdir cannot be empty' logdir = os.path.join(args.logdir, 'tf_output') if os.path.isdir(logdir): do_not_delete = True if args.ngpus > 1: if hvd.rank() == 0: if args.force_continue: do_not_delete = True else: do_not_delete = False else: do_not_delete = True elif HEADLESS: if args.force_continue: do_not_delete = True else: raise ValueError('{} exists'.format(logdir)) else: while True: try: key = input( '{} \n do you want to continue?'.format(logdir)) except NameError: key = 'y' if key == 'y': break elif key == 'n': do_not_delete = False break else: print('invalid key') if not do_not_delete: print('******* Deleting {} *******'.format(logdir)) os.system('rm -r {}'.format(logdir)) else: print('continuing') elif args.ngpus == 1 or hvd.rank() == 0: os.makedirs(logdir) print('logdir is {}'.format(logdir)) tf_output, pc_reader = build_tf_ops( args=args, data_dict=None, files=files, ) train_op, summary_op, tf_data_dict, logger_dict, tf_step = tf_output summary_hook = tf.train.SummarySaverHook( summary_op=summary_op, output_dir=logdir, save_steps=args.save_steps, ) logging_hook = tf.train.LoggingTensorHook( tensors=logger_dict, every_n_iter=args.log_steps, ) hooks = [] if args.ngpus > 1: hooks.append(hvd.BroadcastGlobalVariablesHook(0)) if hvd.rank() == 0: checkpoint_dir = logdir save_checkpoint_secs = 300 hooks += [logging_hook, summary_hook] else: checkpoint_dir = None save_checkpoint_secs = 0 hooks += [logging_hook] else: hooks = [logging_hook, summary_hook] checkpoint_dir = logdir save_checkpoint_secs = 300 if args.init_checkpoint_folder != '': checkpoint = tf.train.latest_checkpoint(args.init_checkpoint_folder) tf.train.init_from_checkpoint(checkpoint, {'/': '/'}) yaml_path = os.path.join(args.logdir, 'args.yaml') with open(yaml_path, 'w') as yaml_file: yaml.dump(args, yaml_file) with tf.train.MonitoredTrainingSession( checkpoint_dir=checkpoint_dir, hooks=hooks, save_summaries_secs=0, save_checkpoint_secs=save_checkpoint_secs, config=tf_config, ) as mon_sess: start_time = time.time() print(time.time() - start_time) writer = SummaryWriterCache.get(logdir) while not mon_sess.should_stop(): # print('hvd rank = {}, current_index = {}, nfiles = {}'.format(current_index, hvd.rank(), len(my_files))) tensor_list = [tf_step, tf_data_dict] if args.training_splits == 'train': tensor_list += [train_op] + tensor_list # _, step, data_dict = mon_sess.run(tensor_list) mon_sess.run(tensor_list)
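The rank-dependent session setup in `main_train` follows the usual Horovod split: every rank gets the broadcast and logging hooks, but only rank 0 writes summaries and checkpoints. A distilled sketch of that decision (a hypothetical helper, not the code above verbatim):

import horovod.tensorflow as hvd

def session_setup(logdir, logging_hook, summary_hook, ngpus):
    if ngpus > 1:
        hooks = [hvd.BroadcastGlobalVariablesHook(0), logging_hook]
        if hvd.rank() == 0:
            hooks.append(summary_hook)
            return hooks, logdir, 300   # chief: checkpoint every 5 minutes
        return hooks, None, 0           # non-chief: no checkpoint dir
    # Single-GPU run: no Horovod hooks needed.
    return [logging_hook, summary_hook], logdir, 300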
def main(_): os.environ[ "TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false" #causes memory fragmentation for bert leading to OOM tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path) if FLAGS.horovod: hvd.init() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "xnli": XnliProcessor, } if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.io.gfile.makedirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) master_process = True training_hooks = [] global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps hvd_rank = 0 config = tf.compat.v1.ConfigProto() if FLAGS.horovod: tf.compat.v1.logging.info("Multi-GPU training with TF Horovod") tf.compat.v1.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(), hvd.rank()) global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size( ) master_process = (hvd.rank() == 0) hvd_rank = hvd.rank() config.gpu_options.visible_device_list = str(hvd.local_rank()) if hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1 run_config = tf.estimator.RunConfig( model_dir=FLAGS.output_dir if master_process else None, session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None, keep_checkpoint_max=1) if master_process: tf.compat.v1.logging.info("***** Configuaration *****") for key in FLAGS.__flags.keys(): tf.compat.v1.logging.info(' {}: {}'.format( key, getattr(FLAGS, key))) tf.compat.v1.logging.info("**************************") train_examples = None num_train_steps = None num_warmup_steps = None training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank)) if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / global_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) start_index = 0 end_index = len(train_examples) tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")] if FLAGS.horovod: tmp_filenames = [ os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i)) for i in range(hvd.size()) ] num_examples_per_rank = len(train_examples) // hvd.size() remainder = len(train_examples) % hvd.size() if hvd.rank() < remainder: start_index = hvd.rank() * (num_examples_per_rank + 1) end_index = start_index + num_examples_per_rank + 1 else: start_index = hvd.rank() * num_examples_per_rank + remainder end_index = start_index + (num_examples_per_rank) model_fn = model_fn_builder(task_name=task_name, bert_config=bert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate if not 
FLAGS.horovod else FLAGS.learning_rate * hvd.size(), num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_one_hot_embeddings=False, hvd=None if not FLAGS.horovod else hvd) estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) if FLAGS.do_train: file_based_convert_examples_to_features( train_examples[start_index:end_index], label_list, FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank]) tf.compat.v1.logging.info("***** Running training *****") tf.compat.v1.logging.info(" Num examples = %d", len(train_examples)) tf.compat.v1.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.compat.v1.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=tmp_filenames, batch_size=FLAGS.train_batch_size, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, hvd=None if not FLAGS.horovod else hvd) train_start_time = time.time() estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=training_hooks) train_time_elapsed = time.time() - train_start_time train_time_wo_overhead = training_hooks[-1].total_time avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed ss_sentences_per_second = ( num_train_steps - training_hooks[-1].skipped ) * global_batch_size * 1.0 / train_time_wo_overhead if master_process: tf.compat.v1.logging.info("-----------------------------") tf.compat.v1.logging.info( "Total Training Time = %0.2f for Sentences = %d", train_time_elapsed, num_train_steps * global_batch_size) tf.compat.v1.logging.info( "Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead, (num_train_steps - training_hooks[-1].skipped) * global_batch_size) tf.compat.v1.logging.info( "Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second) tf.compat.v1.logging.info( "Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) tf.compat.v1.logging.info("-----------------------------") if FLAGS.do_eval and master_process: eval_examples = processor.get_dev_examples(FLAGS.data_dir) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") file_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.compat.v1.logging.info("***** Running evaluation *****") tf.compat.v1.logging.info(" Num examples = %d", len(eval_examples)) tf.compat.v1.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_drop_remainder = False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, batch_size=FLAGS.eval_batch_size, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)] eval_start_time = time.time() result = estimator.evaluate(input_fn=eval_input_fn, hooks=eval_hooks) eval_time_elapsed = time.time() - eval_start_time time_list = eval_hooks[-1].time_list time_list.sort() # Removing outliers (init/warmup) in throughput computation. 
        eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.99)])
        num_sentences = int(len(time_list) * 0.99) * FLAGS.eval_batch_size
        avg = np.mean(time_list)
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead
        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info(
            "Total Inference Time = %0.2f for Sentences = %d",
            eval_time_elapsed, eval_hooks[-1].count * FLAGS.eval_batch_size)
        tf.compat.v1.logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            eval_time_wo_overhead, num_sentences)
        tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s",
                                  "fp16" if FLAGS.use_fp16 else "fp32")
        tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f",
                                  cf_50 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f",
                                  cf_90 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f",
                                  cf_95 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f",
                                  cf_99 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f",
                                  cf_100 * 1000)
        tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f",
                                  ss_sentences_per_second)
        dllogging.logger.log(step=(),
                             data={"throughput_val": ss_sentences_per_second},
                             verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            tf.compat.v1.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                dllogging.logger.log(step=(),
                                     data={key: float(result[key])},
                                     verbosity=Verbosity.DEFAULT)
                tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict and master_process:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                predict_file)
        tf.compat.v1.logging.info("***** Running prediction *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(predict_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            batch_size=FLAGS.predict_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)
        predict_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
        predict_start_time = time.time()
        output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
        with tf.io.gfile.GFile(output_predict_file, "w") as writer:
            tf.compat.v1.logging.info("***** Predict results *****")
            for prediction in estimator.predict(input_fn=predict_input_fn,
                                                hooks=predict_hooks,
                                                yield_single_examples=False):
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in prediction) + "\n"
                writer.write(output_line)
        predict_time_elapsed = time.time() - predict_start_time
        predict_time_wo_overhead = predict_hooks[-1].total_time
        time_list = predict_hooks[-1].time_list
        time_list.sort()
        num_sentences = (predict_hooks[-1].count -
                         predict_hooks[-1].skipped) * FLAGS.predict_batch_size
        avg = np.mean(time_list)
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / predict_time_wo_overhead
        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info(
            "Total Inference Time = %0.2f for Sentences = %d",
            predict_time_elapsed,
            predict_hooks[-1].count * FLAGS.predict_batch_size)
        tf.compat.v1.logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            predict_time_wo_overhead,
            (predict_hooks[-1].count - predict_hooks[-1].skipped) *
            FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Summary Inference Statistics on TEST SET")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s",
                                  "fp16" if FLAGS.use_fp16 else "fp32")
        tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f",
                                  cf_50 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f",
                                  cf_90 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f",
                                  cf_95 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f",
                                  cf_99 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f",
                                  cf_100 * 1000)
        tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f",
                                  ss_sentences_per_second)
        dllogging.logger.log(step=(),
                             data={"throughput_val": ss_sentences_per_second},
                             verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")
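The confidence-level numbers above are computed by sorting the per-batch latencies and taking the max of the first p% of them, i.e. an empirical p-th percentile. In isolation, with made-up timings:

def latency_percentile(time_list, p):
    ordered = sorted(time_list)
    return max(ordered[:int(len(ordered) * p)])

times = [0.010, 0.011, 0.012, 0.013, 0.050,
         0.011, 0.012, 0.011, 0.013, 0.012]
for p in (0.50, 0.90, 0.95, 0.99):
    print("cf_%d = %.1f ms" % (round(p * 100),
                               latency_percentile(times, p) * 1000))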
def train(hparams, scope=None, target_session=""): # Horovod hvd.init() """Train a translation model.""" log_device_placement = hparams.log_device_placement out_dir = hparams.out_dir num_train_steps = hparams.num_train_steps steps_per_stats = hparams.steps_per_stats steps_per_external_eval = hparams.steps_per_external_eval steps_per_eval = 10 * steps_per_stats avg_ckpts = hparams.avg_ckpts if not steps_per_external_eval: steps_per_external_eval = 5 * steps_per_eval # Create model model_creator = get_model_creator(hparams) #train_model = model_helper.create_train_model(model_creator, hparams, scope) # Horovod train_model = model_helper.create_train_model(model_creator, hparams, scope, num_workers=hvd.size(), jobid=hvd.rank()) eval_model = model_helper.create_eval_model(model_creator, hparams, scope) infer_model = model_helper.create_infer_model(model_creator, hparams, scope) # Preload data for sample decoding. dev_src_file = "%s.%s" % (hparams.dev_prefix, hparams.src) dev_tgt_file = "%s.%s" % (hparams.dev_prefix, hparams.tgt) sample_src_data = inference.load_data(dev_src_file) sample_tgt_data = inference.load_data(dev_tgt_file) summary_name = "train_log" model_dir = hparams.out_dir # Log and output files log_file = os.path.join(out_dir, "log_%d" % time.time()) log_f = tf.gfile.GFile(log_file, mode="a") utils.print_out("# log_file=%s" % log_file, log_f) # TensorFlow model config_proto = utils.get_config_proto( log_device_placement=log_device_placement, num_intra_threads=hparams.num_intra_threads, num_inter_threads=hparams.num_inter_threads) #train_sess = tf.Session( # target=target_session, config=config_proto, graph=train_model.graph) # Horovod with train_model.graph.as_default(): hooks = [hvd.BroadcastGlobalVariablesHook(0)] with tf.train.MonitoredTrainingSession( #checkpoint_dir=flags.out_dir, hooks=hooks, config=config_proto) as train_sess: eval_sess = tf.Session(target=target_session, config=config_proto, graph=eval_model.graph) infer_sess = tf.Session(target=target_session, config=config_proto, graph=infer_model.graph) loaded_train_model, global_step = model_helper.horovod_create_or_load_model( train_model.model, model_dir, train_sess, "train") # Summary writer summary_writer = tf.summary.FileWriter( os.path.join(out_dir, summary_name), train_model.graph) # First evaluation run_full_eval(model_dir, infer_model, infer_sess, eval_model, eval_sess, hparams, summary_writer, sample_src_data, sample_tgt_data, avg_ckpts) last_stats_step = global_step last_eval_step = global_step last_external_eval_step = global_step # This is the training loop. stats, info, start_train_time = before_train( loaded_train_model, train_model, train_sess, global_step, hparams, log_f) while global_step < num_train_steps: ### Run a step ### start_time = time.time() try: step_result = loaded_train_model.train(train_sess) hparams.epoch_step += 1 except tf.errors.OutOfRangeError: # Finished going through the training dataset. Go to next epoch. hparams.epoch_step = 0 utils.print_out( "# Finished an epoch, step %d. 
Perform external evaluation" % global_step) run_sample_decode(infer_model, infer_sess, model_dir, hparams, summary_writer, sample_src_data, sample_tgt_data) run_external_eval(infer_model, infer_sess, model_dir, hparams, summary_writer) if avg_ckpts: run_avg_external_eval(infer_model, infer_sess, model_dir, hparams, summary_writer, global_step) train_sess.run( train_model.iterator.initializer, feed_dict={train_model.skip_count_placeholder: 0}) continue # Process step_result, accumulate stats, and write summary global_step, info[ "learning_rate"], step_summary = update_stats( stats, start_time, step_result) summary_writer.add_summary(step_summary, global_step) # Once in a while, we print statistics. if global_step - last_stats_step >= steps_per_stats: last_stats_step = global_step is_overflow = process_stats(stats, info, global_step, steps_per_stats, log_f) print_step_info(" ", global_step, info, get_best_results(hparams), log_f) if is_overflow: break # Reset statistics stats = init_stats() if global_step - last_eval_step >= steps_per_eval: last_eval_step = global_step utils.print_out("# Save eval, global step %d" % global_step) add_info_summaries(summary_writer, global_step, info) # Save checkpoint loaded_train_model.saver.save( train_sess._sess._sess._sess._sess, os.path.join(out_dir, "translate.ckpt"), global_step=global_step) # Evaluate on dev/test run_sample_decode(infer_model, infer_sess, model_dir, hparams, summary_writer, sample_src_data, sample_tgt_data) run_internal_eval(eval_model, eval_sess, model_dir, hparams, summary_writer) if global_step - last_external_eval_step >= steps_per_external_eval: last_external_eval_step = global_step # Save checkpoint loaded_train_model.saver.save( train_sess._sess._sess._sess._sess, os.path.join(out_dir, "translate.ckpt"), global_step=global_step) run_sample_decode(infer_model, infer_sess, model_dir, hparams, summary_writer, sample_src_data, sample_tgt_data) run_external_eval(infer_model, infer_sess, model_dir, hparams, summary_writer) if avg_ckpts: run_avg_external_eval(infer_model, infer_sess, model_dir, hparams, summary_writer, global_step) # Done training # Now out of the training loop. 
Doing the rest of full and best evaluation # by only rank 0 if hvd.rank() == 0: loaded_train_model.saver.save( train_sess._sess._sess._sess._sess, os.path.join(out_dir, "translate.ckpt"), global_step=global_step) (result_summary, _, final_eval_metrics) = (run_full_eval( model_dir, infer_model, infer_sess, eval_model, eval_sess, hparams, summary_writer, sample_src_data, sample_tgt_data, avg_ckpts)) print_step_info("# Final, ", global_step, info, result_summary, log_f) utils.print_time("# Done training!", start_train_time) summary_writer.close() utils.print_out("# Start evaluating saved best models.") for metric in hparams.metrics: best_model_dir = getattr(hparams, "best_" + metric + "_dir") summary_writer = tf.summary.FileWriter( os.path.join(best_model_dir, summary_name), infer_model.graph) result_summary, best_global_step, _ = run_full_eval( best_model_dir, infer_model, infer_sess, eval_model, eval_sess, hparams, summary_writer, sample_src_data, sample_tgt_data) print_step_info("# Best %s, " % metric, best_global_step, info, result_summary, log_f) summary_writer.close() if avg_ckpts: best_model_dir = getattr(hparams, "avg_best_" + metric + "_dir") summary_writer = tf.summary.FileWriter( os.path.join(best_model_dir, summary_name), infer_model.graph) result_summary, best_global_step, _ = run_full_eval( best_model_dir, infer_model, infer_sess, eval_model, eval_sess, hparams, summary_writer, sample_src_data, sample_tgt_data) print_step_info("# Averaged Best %s, " % metric, best_global_step, info, result_summary, log_f) summary_writer.close() return final_eval_metrics, global_step
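The `train_sess._sess._sess._sess._sess` chain above digs through the MonitoredTrainingSession wrappers to reach the raw tf.Session, because Saver.save cannot be called on a monitored session directly. A less fragile alternative (a sketch under the same TF1 APIs, not what the code above does) is to let a CheckpointSaverHook own the saving:

import tensorflow as tf

def make_checkpoint_hook(out_dir, saver, save_steps=1000):
    # The hook calls save() on the underlying session at step boundaries;
    # no private attributes are involved.
    return tf.train.CheckpointSaverHook(
        checkpoint_dir=out_dir,
        save_steps=save_steps,
        saver=saver,
        checkpoint_basename="translate.ckpt")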
def main(_): tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path) if not FLAGS.do_train and not FLAGS.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if FLAGS.use_fp16: os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" if FLAGS.horovod: import horovod.tensorflow as hvd hvd.init() bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) tf.io.gfile.makedirs(FLAGS.output_dir) input_files = [] for input_file_dir in FLAGS.input_files_dir.split(","): input_files.extend(tf.io.gfile.glob(os.path.join(input_file_dir, "*"))) if FLAGS.horovod and len(input_files) < hvd.size(): raise ValueError("Input Files must be sharded") if FLAGS.use_fp16 and FLAGS.manual_fp16: raise ValueError( "AMP and Manual Mixed Precision Training are both activated! Error" ) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 config = tf.compat.v1.ConfigProto() if FLAGS.horovod: config.gpu_options.visible_device_list = str(hvd.local_rank()) if hvd.rank() == 0: tf.compat.v1.logging.info("***** Configuaration *****") for key in FLAGS.__flags.keys(): tf.compat.v1.logging.info(' {}: {}'.format( key, getattr(FLAGS, key))) tf.compat.v1.logging.info("**************************") # config.gpu_options.per_process_gpu_memory_fraction = 0.7 if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1 config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT run_config = tf.estimator.RunConfig( model_dir=FLAGS.output_dir, session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps if not FLAGS.horovod or hvd.rank() == 0 else None, # This variable controls how often estimator reports examples/sec. # Default value is every 100 steps. # When --report_loss is True, we set to very large value to prevent # default info reporting from estimator. # Ideally we should set it to None, but that does not work. 
log_step_count_steps=10000 if FLAGS.report_loss else 100) model_fn = model_fn_builder(bert_config=bert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(), num_train_steps=FLAGS.num_train_steps, num_warmup_steps=FLAGS.num_warmup_steps, use_one_hot_embeddings=False, hvd=None if not FLAGS.horovod else hvd) training_hooks = [] if FLAGS.report_loss and (not FLAGS.horovod or hvd.rank() == 0): global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps if not FLAGS.horovod else FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size( ) training_hooks.append( _LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, dllogging, FLAGS.display_loss_steps)) if FLAGS.horovod and hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) if FLAGS.do_train: tf.compat.v1.logging.info("***** Running training *****") tf.compat.v1.logging.info(" Batch size = %d", FLAGS.train_batch_size) train_input_fn = input_fn_builder( input_files=input_files, batch_size=FLAGS.train_batch_size, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=True, hvd=None if not FLAGS.horovod else hvd) estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=FLAGS.num_train_steps) if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0): tf.compat.v1.logging.info("***** Running evaluation *****") tf.compat.v1.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_files = [] for eval_file_dir in FLAGS.eval_files_dir.split(","): eval_files.extend( tf.io.gfile.glob(os.path.join(eval_file_dir, "*"))) eval_input_fn = input_fn_builder( input_files=eval_files, batch_size=FLAGS.eval_batch_size, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=False, hvd=None if not FLAGS.horovod else hvd) eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)] eval_start_time = time.time() result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps, hooks=eval_hooks) eval_time_elapsed = time.time() - eval_start_time eval_time_wo_overhead = eval_hooks[-1].total_time num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead tf.compat.v1.logging.info("-----------------------------") tf.compat.v1.logging.info( "Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed, eval_hooks[-1].count * FLAGS.eval_batch_size) tf.compat.v1.logging.info( "Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead, (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size) tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set") tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size) tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length) tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32") tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) dllogging.logger.log(step=(), data={"throughput_val": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT) tf.compat.v1.logging.info("-----------------------------") output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.io.gfile.GFile(output_eval_file, "w") as writer: tf.compat.v1.logging.info("***** 
Eval results *****") for key in sorted(result.keys()): tf.compat.v1.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def main(_): tf.logging.set_verbosity(tf.logging.INFO) hvd.init() if not FLAGS.do_train and not FLAGS.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") albert_config = modeling.AlbertConfig.from_json_file( FLAGS.albert_config_file) tf.gfile.MakeDirs(FLAGS.output_dir) input_files = [] for input_pattern in FLAGS.input_file.split(","): input_files.extend(tf.gfile.Glob(input_pattern)) tf.logging.info("*** Input Files ***") for input_file in input_files: tf.logging.info(" %s" % input_file) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) # Horovod: pin GPU to be used to process local rank (one GPU per process) hvd_config = tf.ConfigProto() hvd_config.gpu_options.allow_growth = True hvd_config.gpu_options.visible_device_list = str(hvd.local_rank()) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 run_config = contrib_tpu.RunConfig( session_config=hvd_config, cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, keep_checkpoint_max=FLAGS.keep_checkpoint_max, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) model_fn = model_fn_builder(albert_config=albert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=int(FLAGS.num_train_steps / hvd.size()), num_warmup_steps=FLAGS.num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, optimizer=FLAGS.optimizer, poly_power=FLAGS.poly_power, start_warmup_step=FLAGS.start_warmup_step) estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size) if FLAGS.do_train: tf.logging.info("***** Running training *****") tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) train_input_fn = input_fn_builder( input_files=input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=True) bcast_hook = hvd.BroadcastGlobalVariablesHook(0) estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps, hooks=[bcast_hook]) if FLAGS.do_eval: tf.logging.info("***** Running evaluation *****") tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) global_step = -1 output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") writer = tf.gfile.GFile(output_eval_file, "w") tf.gfile.MakeDirs(FLAGS.export_dir) eval_input_fn = input_fn_builder( input_files=input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=False) while global_step < FLAGS.num_train_steps: if estimator.latest_checkpoint() is None: tf.logging.info("No checkpoint found yet. Sleeping.") time.sleep(1) else: result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) global_step = result["global_step"] tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
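The eval branch above polls for checkpoints instead of assuming one exists: it sleeps until the trainer writes its first checkpoint, then re-evaluates after each new one until the target step count is reached. The same pattern as a standalone sketch:

import time

def evaluate_until_done(estimator, eval_input_fn, num_train_steps,
                        max_eval_steps):
    result = None
    global_step = -1
    while global_step < num_train_steps:
        if estimator.latest_checkpoint() is None:
            time.sleep(1)  # the trainer has not checkpointed yet
            continue
        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=max_eval_steps)
        global_step = result["global_step"]
    return result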
def main(_):
    # liangaws: inspect the arguments that SageMaker passes to the Python program.
    import sys
    print(sys.argv)
    # liangaws: initialize Horovod.
    hvd.init()
    # ------ check Arguments ------
    if FLAGS.dt_dir == "":
        FLAGS.dt_dir = (date.today() + timedelta(-1)).strftime('%Y%m%d')
    # FLAGS.model_dir = FLAGS.model_dir + FLAGS.dt_dir
    # FLAGS.data_dir = FLAGS.data_dir + FLAGS.dt_dir
    print('task_type ', FLAGS.task_type)
    print('model_dir ', FLAGS.model_dir)
    print('data_dir ', FLAGS.data_dir)
    print('dt_dir ', FLAGS.dt_dir)
    print('num_epochs ', FLAGS.num_epochs)
    print('feature_size ', FLAGS.feature_size)
    print('field_size ', FLAGS.field_size)
    print('embedding_size ', FLAGS.embedding_size)
    print('batch_size ', FLAGS.batch_size)
    print('deep_layers ', FLAGS.deep_layers)
    print('dropout ', FLAGS.dropout)
    print('loss_type ', FLAGS.loss_type)
    print('optimizer ', FLAGS.optimizer)
    print('learning_rate ', FLAGS.learning_rate)
    print('batch_norm_decay ', FLAGS.batch_norm_decay)
    print('batch_norm ', FLAGS.batch_norm)
    print('l2_reg ', FLAGS.l2_reg)
    # ------ init Envs ------
    # liangaws: glob.glob collects all training file names under data_dir into
    # a list, which can then be passed directly to TextLineDataset.
    tr_files = glob.glob("%s/tr*libsvm" % FLAGS.data_dir)
    random.shuffle(tr_files)
    print("tr_files:", tr_files)
    va_files = glob.glob("%s/va*libsvm" % FLAGS.data_dir)
    print("va_files:", va_files)
    te_files = glob.glob("%s/te*libsvm" % FLAGS.data_dir)
    print("te_files:", te_files)
    if FLAGS.clear_existing_model:
        try:
            shutil.rmtree(FLAGS.model_dir)
        except Exception as e:
            print(e, "at clear_existing_model")
        else:
            print("existing model cleaned at %s" % FLAGS.model_dir)
    # liangaws: the call that sets up the environment for parameter-server
    # style distributed training is commented out here, because the training
    # environment is controlled by SageMaker.
    # set_dist_env()
    # ------ build Tasks ------
    model_params = {
        "field_size": FLAGS.field_size,
        "feature_size": FLAGS.feature_size,
        "embedding_size": FLAGS.embedding_size,
        "learning_rate": FLAGS.learning_rate,
        "batch_norm_decay": FLAGS.batch_norm_decay,
        "l2_reg": FLAGS.l2_reg,
        "deep_layers": FLAGS.deep_layers,
        "dropout": FLAGS.dropout
    }
    # liangaws: this config setup is commented out and not used for now.
    """
    config = tf.estimator.RunConfig().replace(
        session_config=tf.ConfigProto(
            device_count={'GPU': 0, 'CPU': FLAGS.num_threads}),
        log_step_count_steps=FLAGS.log_steps,
        save_summary_steps=FLAGS.log_steps)
    """
    # liangaws: set the checkpoint interval and the maximum number of
    # checkpoints to keep.
    # config = tf.estimator.RunConfig().replace(
    #     save_checkpoints_secs=5,
    #     keep_checkpoint_max=5,
    #     log_step_count_steps=FLAGS.log_steps,
    #     save_summary_steps=FLAGS.log_steps)
    # liangaws: with Horovod, pin GPU to be used to process local rank (one
    # GPU per process).
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    # liangaws: with Horovod, save checkpoints only on worker 0 to prevent
    # other workers from corrupting them.
    print('current horovod rank is ', hvd.rank())
    print('input model dir is ', FLAGS.model_dir)
    print("host is ", FLAGS.hosts)
    print('current host is ', FLAGS.current_host)
    if hvd.rank() == 0:
        DeepFM = tf.estimator.Estimator(
            model_fn=model_fn,
            model_dir=FLAGS.model_dir,
            params=model_params,
            config=tf.estimator.RunConfig().replace(session_config=config))
    else:
        DeepFM = tf.estimator.Estimator(
            model_fn=model_fn,
            model_dir=None,
            params=model_params,
            config=tf.estimator.RunConfig().replace(session_config=config))
    # liangaws: with Horovod, BroadcastGlobalVariablesHook broadcasts initial
    # variable states from rank 0 to all other processes. This is necessary to
    # ensure consistent initialization of all workers when training is started
    # with random weights or restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
    # liangaws: to run multiple Horovod worker processes per machine under
    # SageMaker pipe mode, the SageMaker estimator's fit() call needs several
    # channels -- at least one channel per worker on each machine. The names
    # of all current channels can be read from the SM_CHANNELS environment
    # variable set by SageMaker, and each worker then reads its data from its
    # own channel.
    # Note that the order of the channel names differs from the order in which
    # they were written in the SageMaker estimator fit() call. For example,
    # for the three channels {'training': train_s3, 'training-2': train2_s3,
    # 'evaluation': validate_s3}, SageMaker sets the environment variable to
    # ['evaluation', 'training', 'training-2']; that is, the last channel
    # 'evaluation' comes first in SM_CHANNELS while the other channels keep
    # their original order.
    channel_names = json.loads(os.environ['SM_CHANNELS'])
    print("channel name", channel_names)
    print("first channel", channel_names[0])
    print("last channel name", channel_names[-1])
    eval_channel = channel_names[0]
    if FLAGS.task_type == 'train':
        # liangaws: add the broadcast hook to the TrainSpec.
        """
        train_spec = tf.estimator.TrainSpec(
            input_fn=lambda: input_fn(tr_files, channel='training',
                                      num_epochs=FLAGS.num_epochs,
                                      batch_size=FLAGS.batch_size),
            hooks=[bcast_hook])
        eval_spec = tf.estimator.EvalSpec(
            input_fn=lambda: input_fn(va_files, channel='evaluation',
                                      num_epochs=1,
                                      batch_size=FLAGS.batch_size),
            steps=None, start_delay_secs=1000, throttle_secs=1200)
        tf.estimator.train_and_evaluate(DeepFM, train_spec, eval_spec)
        """
        if FLAGS.pipe_mode == 0:  # file mode
            for _ in range(FLAGS.num_epochs):
                DeepFM.train(input_fn=lambda: input_fn(
                    tr_files, num_epochs=1, batch_size=FLAGS.batch_size),
                             hooks=[bcast_hook])
                if hvd.rank() == 0:
                    # liangaws: evaluation only needs to run on the Horovod master.
                    DeepFM.evaluate(input_fn=lambda: input_fn(
                        va_files, num_epochs=1, batch_size=FLAGS.batch_size))
        else:  # pipe mode
            # liangaws: under horovod + pipe mode, when a worker enters
            # input_fn a second time during training, reading from the same
            # FIFO again with PipeModeDataset runs into problems.
            """
            train_spec = tf.estimator.TrainSpec(
                input_fn=lambda: input_fn(
                    channel=channel_names[1 + hvd.local_rank()],
                    num_epochs=FLAGS.num_epochs,
                    batch_size=FLAGS.batch_size),
                hooks=[bcast_hook])
            eval_spec = tf.estimator.EvalSpec(
                input_fn=lambda: input_fn(channel=eval_channel, num_epochs=1,
                                          batch_size=FLAGS.batch_size),
                steps=None, start_delay_secs=1000, throttle_secs=1200)
            tf.estimator.train_and_evaluate(DeepFM, train_spec, eval_spec)
            """
            DeepFM.train(input_fn=lambda: input_fn(
                channel=channel_names[1 + hvd.local_rank()],
                num_epochs=FLAGS.num_epochs,
                batch_size=FLAGS.batch_size),
                         hooks=[bcast_hook])
            if hvd.rank() == 0:
                # liangaws: evaluation only needs to run on the Horovod master.
                DeepFM.evaluate(input_fn=lambda: input_fn(
                    channel=eval_channel, num_epochs=1,
                    batch_size=FLAGS.batch_size))
    elif FLAGS.task_type == 'eval':
        DeepFM.evaluate(input_fn=lambda: input_fn(
            va_files, num_epochs=1, batch_size=FLAGS.batch_size))
    elif FLAGS.task_type == 'infer':
        preds = DeepFM.predict(input_fn=lambda: input_fn(
            te_files, num_epochs=1, batch_size=FLAGS.batch_size),
                               predict_keys="prob")
        with open(FLAGS.data_dir + "/pred.txt", "w") as fo:
            for prob in preds:
                fo.write("%f\n" % (prob['prob']))
    # liangaws: save the model when the task type is either train or export.
    if FLAGS.task_type == 'export' or FLAGS.task_type == 'train':
        # feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
        # feature_spec = {
        #     'feat_ids': tf.FixedLenFeature(dtype=tf.int64,
        #                                    shape=[None, FLAGS.field_size]),
        #     'feat_vals': tf.FixedLenFeature(dtype=tf.float32,
        #                                     shape=[None, FLAGS.field_size])
        # }
        # serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
        feature_spec = {
            'feat_ids': tf.placeholder(dtype=tf.int64,
                                       shape=[None, FLAGS.field_size],
                                       name='feat_ids'),
            'feat_vals': tf.placeholder(dtype=tf.float32,
                                        shape=[None, FLAGS.field_size],
                                        name='feat_vals')
        }
        serving_input_receiver_fn = \
            tf.estimator.export.build_raw_serving_input_receiver_fn(feature_spec)
        # liangaws: with Horovod, save the model and history only on worker 0
        # (i.e. the master).
        if hvd.rank() == 0:
            DeepFM.export_savedmodel(FLAGS.servable_model_dir,
                                     serving_input_receiver_fn)
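The channel bookkeeping described in the comments above can be checked without SageMaker. A sketch, assuming SM_CHANNELS is set the way SageMaker sets it for the three channels in the example (the evaluation channel first, training channels in their original order):

import json
import os

os.environ.setdefault("SM_CHANNELS", '["evaluation", "training", "training-2"]')
channel_names = json.loads(os.environ["SM_CHANNELS"])
eval_channel = channel_names[0]

def training_channel_for(local_rank):
    # worker 0 -> 'training', worker 1 -> 'training-2', ...
    return channel_names[1 + local_rank]

assert eval_channel == "evaluation"
assert training_channel_for(0) == "training"
assert training_channel_for(1) == "training-2"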
def main(args): # Horovod: initialize Horovod. hvd.init() # Keras automatically creates a cache directory in ~/.keras/datasets for # storing the downloaded MNIST data. This creates a race # condition among the workers that share the same filesystem. If the # directory already exists by the time this worker gets around to creating # it, ignore the resulting exception and continue. cache_dir = os.path.join(os.path.expanduser("~"), ".keras", "datasets") if not os.path.exists(cache_dir): try: os.mkdir(cache_dir) except OSError as e: if e.errno == errno.EEXIST and os.path.isdir(cache_dir): pass else: raise # Download and load MNIST dataset. (train_data, train_labels), (eval_data, eval_labels) = keras.datasets.mnist.load_data( "MNIST-data-%d" % hvd.rank() ) # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it # into (-1, 784) to feed into our network. Also, need to normalize the # features between 0 and 1. train_data = np.reshape(train_data, (-1, 784)) / 255.0 eval_data = np.reshape(eval_data, (-1, 784)) / 255.0 # Horovod: pin GPU to be used to process local rank (one GPU per process) if not args.use_only_cpu: config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) estimator_config = tf.estimator.RunConfig(session_config=config) else: estimator_config = None # Horovod: save checkpoints only on worker 0 to prevent other workers from # corrupting them. model_dir = args.model_dir if hvd.rank() == 0 else None # Create the Estimator mnist_classifier = tf.estimator.Estimator( model_fn=cnn_model_fn, model_dir=model_dir, config=estimator_config ) # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from # rank 0 to all other processes. This is necessary to ensure consistent # initialization of all workers when training is started with random weights or # restored from a checkpoint. bcast_hook = hvd.BroadcastGlobalVariablesHook(0) # Train the model train_input_fn = tf.estimator.inputs.numpy_input_fn( x={"x": train_data}, y=train_labels, batch_size=100, num_epochs=None, shuffle=True ) # Horovod: adjust number of steps based on number of GPUs. mnist_classifier.train( input_fn=train_input_fn, steps=args.num_steps // hvd.size(), hooks=[bcast_hook] ) # Evaluate the model and print results eval_input_fn = tf.estimator.inputs.numpy_input_fn( x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False ) eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn) print(eval_results)
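The race-tolerant mkdir at the top of `main` is worth keeping around as a pattern; the EEXIST check is what lets several workers sharing a filesystem create the cache directory concurrently. As a reusable sketch:

import errno
import os

def ensure_dir(path):
    try:
        os.mkdir(path)
    except OSError as e:
        # Another worker won the race: fine. Anything else is a real error.
        if not (e.errno == errno.EEXIST and os.path.isdir(path)):
            raise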
def main(_): tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) if FLAGS.horovod: hvd.init() if FLAGS.use_fp16: os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" processors = {'consensus': ConsensusProcessor} tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) tf.io.gfile.makedirs(FLAGS.output_dir) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 master_process = True training_hooks = [] global_batch_size = FLAGS.train_batch_size hvd_rank = 0 config = tf.compat.v1.ConfigProto() if FLAGS.horovod: global_batch_size = FLAGS.train_batch_size * hvd.size() master_process = (hvd.rank() == 0) hvd_rank = hvd.rank() config.gpu_options.visible_device_list = str(hvd.local_rank()) if hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1 run_config = tf.estimator.RunConfig( model_dir=FLAGS.output_dir if master_process else None, session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None, keep_checkpoint_max=1) if master_process: tf.compat.v1.logging.info("***** Configuration *****") for key in FLAGS.__flags.keys(): tf.compat.v1.logging.info(' {}: {}'.format( key, getattr(FLAGS, key))) tf.compat.v1.logging.info("**************************") train_examples = None num_train_steps = None num_warmup_steps = None training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank)) if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / global_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) start_index = 0 end_index = len(train_examples) tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")] if FLAGS.horovod: tmp_filenames = [ os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i)) for i in range(hvd.size()) ] num_examples_per_rank = len(train_examples) // hvd.size() remainder = len(train_examples) % hvd.size() if hvd.rank() < remainder: start_index = hvd.rank() * (num_examples_per_rank + 1) end_index = start_index + num_examples_per_rank + 1 else: start_index = hvd.rank() * num_examples_per_rank + remainder end_index = start_index + (num_examples_per_rank) model_fn = model_fn_builder(bert_config=bert_config, num_labels=len(label_list) + 1, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(), num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_one_hot_embeddings=False, hvd=None if not FLAGS.horovod else hvd, use_fp16=FLAGS.use_fp16) estimator = 
tf.estimator.Estimator(model_fn=model_fn, config=run_config) if FLAGS.do_train: filed_based_convert_examples_to_features( train_examples[start_index:end_index], label_list, FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank]) tf.compat.v1.logging.info("***** Running training *****") tf.compat.v1.logging.info(" Num examples = %d", len(train_examples)) tf.compat.v1.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.compat.v1.logging.info(" Num steps = %d", num_train_steps) tf.compat.v1.logging.info(" Num of labels = %d", len(label_list)) train_input_fn = file_based_input_fn_builder( input_file=tmp_filenames, batch_size=FLAGS.train_batch_size, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, hvd=None if not FLAGS.horovod else hvd) train_start_time = time.time() estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=training_hooks) train_time_elapsed = time.time() - train_start_time train_time_wo_overhead = training_hooks[-1].total_time avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed ss_sentences_per_second = ( num_train_steps - training_hooks[-1].skipped ) * global_batch_size * 1.0 / train_time_wo_overhead if master_process: tf.compat.v1.logging.info("-----------------------------") tf.compat.v1.logging.info( "Total Training Time = %0.2f for Sentences = %d", train_time_elapsed, num_train_steps * global_batch_size) tf.compat.v1.logging.info( "Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead, (num_train_steps - training_hooks[-1].skipped) * global_batch_size) tf.compat.v1.logging.info( "Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second) tf.compat.v1.logging.info( "Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) tf.compat.v1.logging.info("-----------------------------") if FLAGS.do_eval and master_process: eval_examples = processor.get_dev_examples(FLAGS.data_dir) num_actual_eval_examples = len(eval_examples) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") filed_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.compat.v1.logging.info("***** Running evaluation *****") tf.compat.v1.logging.info( " Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.compat.v1.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. 
eval_steps = None eval_drop_remainder = False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, batch_size=FLAGS.eval_batch_size, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.io.gfile.GFile(output_eval_file, "w") as writer: tf.compat.v1.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.compat.v1.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if FLAGS.do_predict and master_process: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") filed_based_convert_examples_to_features(predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file) tf.compat.v1.logging.info("***** Running prediction*****") tf.compat.v1.logging.info( " Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.compat.v1.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, batch_size=FLAGS.predict_batch_size, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder) eval_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)] eval_start_time = time.time() output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") with tf.io.gfile.GFile(output_predict_file, "w") as writer: num_written_lines = 0 tf.compat.v1.logging.info("***** Predict results *****") for prediction in estimator.predict(input_fn=predict_input_fn, hooks=eval_hooks, yield_single_examples=True): probabilities = prediction["probabilities"] output_line = "\t".join( str(class_probability) for class_probability in probabilities) + "\n" writer.write(output_line) num_written_lines += 1 assert num_written_lines == num_actual_predict_examples eval_time_elapsed = time.time() - eval_start_time eval_time_wo_overhead = eval_hooks[-1].total_time time_list = eval_hooks[-1].time_list time_list.sort() num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size avg = np.mean(time_list) cf_50 = max(time_list[:int(len(time_list) * 0.50)]) cf_90 = max(time_list[:int(len(time_list) * 0.90)]) cf_95 = max(time_list[:int(len(time_list) * 0.95)]) cf_99 = max(time_list[:int(len(time_list) * 0.99)]) cf_100 = max(time_list[:int(len(time_list) * 1)]) ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead tf.compat.v1.logging.info("-----------------------------") tf.compat.v1.logging.info( "Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed, eval_hooks[-1].count * FLAGS.predict_batch_size) tf.compat.v1.logging.info( "Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead, (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size) tf.compat.v1.logging.info("Summary Inference Statistics") tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size) tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length) tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32") tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000) 
tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000) tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000) tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) tf.compat.v1.logging.info("-----------------------------")
def train(*tf_records: "Records to train on"): """Train on examples.""" tf.logging.set_verbosity(tf.logging.INFO) estimator = dual_net.get_estimator() effective_batch_size = FLAGS.train_batch_size if FLAGS.dist_train: effective_batch_size = int(FLAGS.train_batch_size / hvd.size()) if FLAGS.use_tpu: effective_batch_size *= FLAGS.num_tpu_cores if FLAGS.use_tpu: if FLAGS.use_bt: def _input_fn(params): games = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table) games_nr = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table + '-nr') return preprocessing.get_tpu_bt_input_tensors( games, games_nr, params['batch_size'], number_of_games=FLAGS.window_size, random_rotation=True) else: def _input_fn(params): return preprocessing.get_tpu_input_tensors( params['batch_size'], tf_records, random_rotation=True) # Hooks are broken with TPUestimator at the moment. hooks = [] else: def _input_fn(): return preprocessing.get_input_tensors( effective_batch_size, tf_records, filter_amount=FLAGS.filter_amount, shuffle_buffer_size=FLAGS.shuffle_buffer_size, random_rotation=True, seed=FLAGS.training_seed, dist_train=FLAGS.dist_train) hooks = [ UpdateRatioSessionHook(FLAGS.work_dir), EchoStepCounterHook(output_dir=FLAGS.work_dir) ] if FLAGS.dist_train: hooks.append(hvd.BroadcastGlobalVariablesHook(0)) steps = FLAGS.steps_to_train logging.info("Training, steps = %s, batch = %s -> %s examples", steps or '?', effective_batch_size, (steps * effective_batch_size) if steps else '?') if FLAGS.use_bt: games = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table) if not games.read_wait_cell(): games.require_fresh_games(20000) latest_game = games.latest_game_number index_from = max(latest_game, games.read_wait_cell()) print("== Last game before training:", latest_game, flush=True) print("== Wait cell:", games.read_wait_cell(), flush=True) try: estimator.train(_input_fn, steps=steps, hooks=hooks) if FLAGS.use_bt: bigtable_input.set_fresh_watermark(games, index_from, FLAGS.window_size) except: if FLAGS.use_bt: games.require_fresh_games(0) raise
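# A hedged sketch of the effective-batch-size bookkeeping in `train` above
# (flag names assumed from the surrounding code): under Horovod data
# parallelism each worker reads train_batch_size / hvd.size() examples, so a
# single optimizer step still consumes train_batch_size examples globally,
# while TPUEstimator instead scales the batch up by the core count.
def effective_batch_size(train_batch_size, dist_train=False, num_workers=1,
                         use_tpu=False, num_tpu_cores=8):
    batch = train_batch_size
    if dist_train:
        batch = int(train_batch_size / num_workers)
    if use_tpu:
        batch *= num_tpu_cores
    return batch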
def main(_): hvd.init() # Read/download local dataset. Different copy for each process. mnist = tf.contrib.learn.datasets.mnist.read_data_sets( "mnist_data_{}".format(hvd.rank())) # Name images placeholder to be able to retrieve it from saved meta graph. images_placeholder = tf.placeholder(tf.float32, [None, 784], name=INPUT_NAME) dense_dropout_placeholder = tf.placeholder_with_default(1.0, []) labels_placeholder = tf.placeholder(tf.int64, [None]) logits, scores, predictions = build_net(images_placeholder, dense_dropout_placeholder) # Exporting meta graph right now takes care of removing Horovod specific ops before serving. Graph right now # also does not contain any training specific ops, so it is optimized for serving too. tf.train.export_meta_graph("graph.meta", as_text=True) loss = tf.losses.softmax_cross_entropy(tf.one_hot(labels_placeholder, 10), logits) accuracy = tf.reduce_mean( tf.cast(tf.equal(predictions, labels_placeholder), tf.float32)) # Define summary ops to save summaries for later use in tensorboard. tf.summary.scalar("accuracy", accuracy) tf.summary.scalar("loss", loss) summary_op = tf.summary.merge_all() # Horovod: adjust learning rate based on number of workers. optimizer = tf.train.RMSPropOptimizer(0.001 * hvd.size()) global_step = tf.contrib.framework.get_or_create_global_step() # Wrap standard optimizer in Horovod distributed one. train = hvd.DistributedOptimizer(optimizer).minimize( loss, global_step=global_step) hooks = [ # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states # from rank 0 to all other processes. This is necessary to ensure consistent # initialization of all workers when training is started with random weights # or restored from a checkpoint. hvd.BroadcastGlobalVariablesHook(0), # Horovod: adjust number of steps based on number of workers. tf.train.StopAtStepHook(last_step=2000 // hvd.size()), tf.train.LoggingTensorHook(tensors={ 'step': global_step, 'loss': loss }, every_n_iter=10), ] # Only master saves summaries. if hvd.rank() == 0: hooks += [ # As previously mentioned summaries are saved to EXPERIMENT_OUTPUT_PATH so that they can be discovered by # tensorboard. tf.train.SummarySaverHook(save_steps=1, output_dir=os.path.join( EXPERIMENT_OUTPUT_PATH, "tensorboard"), summary_op=summary_op) ] # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. As previously mentioned # checkpoints are saved to EXPERIMNET_OUTPUT_PATH which makes them accessible by user. checkpoint_dir = os.path.join(EXPERIMENT_OUTPUT_PATH, "checkpoints") if hvd.rank() == 0 else None # The MonitoredTrainingSession takes care of session initialization, # restoring from a checkpoint, saving to a checkpoint, and closing when done # or an error occurs. with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, hooks=hooks) as mon_sess: while not mon_sess.should_stop(): images, labels = mnist.train.next_batch(64) _, loss_val, accuracy_val, global_step_val = mon_sess.run( [train, loss, accuracy, global_step], feed_dict={ images_placeholder: images, labels_placeholder: labels, dense_dropout_placeholder: 0.5 }) # Only master publishes metrics. if hvd.rank() == 0: # Publish metrics just like in the single node example. publish({ "loss": str(loss_val), "accuracy": str(accuracy_val), "global_step": str(global_step_val) }) # Save servable model only from Horovod master. if hvd.rank() == 0: # Create a new graph to import the previously exported one. with tf.Graph().as_default(): # Import previously saved meta graph. 
restorer = tf.train.import_meta_graph("graph.meta") with tf.Session() as session: checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir) restorer.restore(session, checkpoint_file) # Get handlers for images placeholder and scores op with names defined before. images_placeholder = tf.get_default_graph().get_tensor_by_name( INPUT_NAME + ":0") scores = tf.get_default_graph().get_tensor_by_name( SCORES_NAME + ":0") # Save servable model to EXPERIMENT_OUTPUT_PATH to make it accessible to the user. builder = tf.saved_model.builder.SavedModelBuilder( os.path.join(EXPERIMENT_OUTPUT_PATH, "models", "00001")) prediction_signature = ( tf.saved_model.signature_def_utils.build_signature_def( inputs={ MODEL_INPUT_NAME: tf.saved_model.utils.build_tensor_info( images_placeholder) }, outputs={ MODEL_OUTPUT_NAME: tf.saved_model.utils.build_tensor_info(scores) }, method_name=tf.saved_model.signature_constants. PREDICT_METHOD_NAME)) builder.add_meta_graph_and_variables( session, [tf.saved_model.tag_constants.SERVING], signature_def_map={ MODEL_SIGNATURE_NAME: prediction_signature }, main_op=tf.tables_initializer(), strip_default_attrs=True) builder.save()
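# A hedged sketch (not part of the script above) of loading the exported
# SavedModel back for inference with the TF1 loader API. The export path and
# the signature/tensor keys below stand in for the script's
# EXPERIMENT_OUTPUT_PATH and MODEL_SIGNATURE_NAME / MODEL_INPUT_NAME /
# MODEL_OUTPUT_NAME constants, which are defined elsewhere.
import numpy as np
import tensorflow as tf

export_dir = "models/00001"                 # assumed export location
SIGNATURE = "prediction"                    # assumed MODEL_SIGNATURE_NAME
INPUT_KEY, OUTPUT_KEY = "images", "scores"  # assumed MODEL_INPUT/OUTPUT_NAME

with tf.Graph().as_default(), tf.Session() as sess:
    meta_graph = tf.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING], export_dir)
    signature = meta_graph.signature_def[SIGNATURE]
    input_name = signature.inputs[INPUT_KEY].name
    output_name = signature.outputs[OUTPUT_KEY].name
    scores = sess.run(output_name,
                      feed_dict={input_name: np.zeros((1, 784), np.float32)})
    print(scores.shape)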
def main(_): # Horovod: initialize Horovod. hvd.init() # Keras automatically creates a cache directory in ~/.keras/datasets for # storing the downloaded MNIST data. This creates a race # condition among the workers that share the same filesystem. If the # directory already exists by the time this worker gets around to creating # it, ignore the resulting exception and continue. cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets') if not os.path.exists(cache_dir): try: os.mkdir(cache_dir) except OSError as e: if e.errno == errno.EEXIST and os.path.isdir(cache_dir): pass else: raise # Download and load MNIST dataset. (x_train, y_train), (x_test, y_test) = \ keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank()) # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it # into (-1, 784) to feed into our network. Also, need to normalize the # features between 0 and 1. x_train = np.reshape(x_train, (-1, 784)) / 255.0 x_test = np.reshape(x_test, (-1, 784)) / 255.0 # Build model... with tf.name_scope('input'): image = tf.placeholder(tf.float32, [None, 784], name='image') label = tf.placeholder(tf.float32, [None], name='label') predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN) # Horovod: adjust learning rate based on number of GPUs. opt = tf.train.RMSPropOptimizer(0.001 * hvd.size()) # Horovod: add Horovod Distributed Optimizer. opt = hvd.DistributedOptimizer(opt) global_step = tf.train.get_or_create_global_step() train_op = opt.minimize(loss, global_step=global_step) hooks = [ # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states # from rank 0 to all other processes. This is necessary to ensure consistent # initialization of all workers when training is started with random weights # or restored from a checkpoint. hvd.BroadcastGlobalVariablesHook(0), # Horovod: adjust number of steps based on number of GPUs. tf.train.StopAtStepHook(last_step=2000 // hvd.size()), tf.train.LoggingTensorHook(tensors={ 'step': global_step, 'loss': loss }, every_n_iter=100) ] # Horovod: pin GPU to be used to process local rank (one GPU per process) config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) # Horovod: save checkpoints only on worker 0 to prevent other workers from # corrupting them. checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None training_batch_generator = train_input_generator(x_train, y_train, batch_size=100) # The MonitoredTrainingSession takes care of session initialization, # restoring from a checkpoint, saving to a checkpoint, and closing when done # or an error occurs. builder = option_builder.ProfileOptionBuilder opts1 = builder(builder.time_and_memory()).\ order_by('micros').\ with_max_depth(10).\ with_file_output("./pctx/opts1-rank-%d" % hvd.rank()).\ build() opts2 = builder.trainable_variables_parameter() # with profile_context.ProfileContext("./pctx", # trace_steps=range(100, 110), # dump_steps=[110]) as pctx: with profile_context.ProfileContext("./pctx") as pctx: pctx.add_auto_profiling('op', opts1, [800, 900, 1000]) pctx.add_auto_profiling('scope', opts2, [1000]) with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, hooks=hooks, config=config) as mon_sess: while not mon_sess.should_stop(): # Run a training step synchronously. image_, label_ = next(training_batch_generator) mon_sess.run(train_op, feed_dict={ image: image_, label: label_ }) pctx.profiler.advise(options=model_analyzer.ALL_ADVICE)
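# `train_input_generator` is referenced above but defined elsewhere; a
# plausible minimal sketch of such a helper (an assumption, not the original
# implementation): loop forever over shuffled mini-batches of the in-memory
# MNIST arrays, reshuffling at the start of every pass.
import numpy as np

def train_input_generator(x, y, batch_size=100):
    assert len(x) == len(y)
    while True:
        perm = np.random.permutation(len(x))
        x, y = x[perm], y[perm]
        for i in range(0, len(x) - batch_size + 1, batch_size):
            yield x[i:i + batch_size], y[i:i + batch_size]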
def main(): gpu_thread_count = 2 os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private' os.environ['TF_GPU_THREAD_COUNT'] = str(gpu_thread_count) os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1' os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' hvd.init() # random.seed(5 * (1 + hvd.rank())) # np.random.seed(7 * (1 + hvd.rank())) # tf.set_random_seed(31 * (1 + hvd.rank())) cmdline = add_cli_args() FLAGS, unknown_args = cmdline.parse_known_args() if len(unknown_args) > 0: for bad_arg in unknown_args: print("ERROR: Unknown command line arg: %s" % bad_arg) raise ValueError("Invalid command line arg(s)") FLAGS.data_dir = None if FLAGS.data_dir == "" else FLAGS.data_dir FLAGS.log_dir = None if FLAGS.log_dir == "" else FLAGS.log_dir if FLAGS.eval: FLAGS.log_name = 'eval' + FLAGS.log_name if FLAGS.local_ckpt: do_checkpoint = hvd.local_rank() == 0 else: do_checkpoint = hvd.rank() == 0 if do_checkpoint and not os.path.isdir(FLAGS.log_dir): os.makedirs(FLAGS.log_dir) logger = logging.getLogger(FLAGS.log_name) logger.setLevel(logging.INFO) # INFO, ERROR # file handler which logs debug messages # console handler ch = logging.StreamHandler() ch.setLevel(logging.INFO) # add formatter to the handlers # formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') formatter = logging.Formatter('%(message)s') ch.setFormatter(formatter) logger.addHandler(ch) if not hvd.local_rank(): fh = logging.FileHandler(os.path.join(FLAGS.log_dir, FLAGS.log_name)) fh.setLevel(logging.DEBUG) fh.setFormatter(formatter) # add handlers to logger logger.addHandler(fh) height, width = 224, 224 global_batch_size = FLAGS.batch_size * hvd.size() if FLAGS.data_dir: filename_pattern = os.path.join(FLAGS.data_dir, '%s-*') train_filenames = sorted(tf.gfile.Glob(filename_pattern % 'train')) eval_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation')) num_training_samples = get_num_records(train_filenames) else: train_filenames = eval_filenames = [] num_training_samples = 1281167 training_samples_per_rank = num_training_samples // hvd.size() if FLAGS.num_epochs: nstep = num_training_samples * FLAGS.num_epochs // global_batch_size elif FLAGS.num_batches: nstep = FLAGS.num_batches FLAGS.num_epochs = max( nstep * global_batch_size // num_training_samples, 1) else: raise ValueError("Either num_epochs or num_batches has to be passed") nstep_per_epoch = num_training_samples // global_batch_size decay_steps = nstep if FLAGS.lr_decay_mode == 'steps': steps = [ int(x) * nstep_per_epoch for x in FLAGS.lr_decay_steps.split(',') ] lr_steps = [FLAGS.lr] for i in range(len(FLAGS.lr_decay_steps.split(','))): lr_steps.append(FLAGS.lr * pow(FLAGS.lr_decay_factor, i + 1)) else: steps = [] lr_steps = [] if not FLAGS.save_checkpoints_steps: # default to save one checkpoint per epoch FLAGS.save_checkpoints_steps = nstep_per_epoch if not FLAGS.save_summary_steps: # default to save one checkpoint per epoch FLAGS.save_summary_steps = nstep_per_epoch warmup_it = nstep_per_epoch * FLAGS.warmup_epochs rank0log(logger, 'PY' + str(sys.version) + 'TF' + str(tf.__version__)) config = tf.ConfigProto() config.gpu_options.visible_device_list = str(hvd.local_rank()) config.gpu_options.force_gpu_compatible = True # Force pinned memory config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads config.inter_op_parallelism_threads = 5 rank0log(logger, "Horovod size: ", hvd.size()) classifier = tf.estimator.Estimator( model_fn=cnn_model_function, model_dir=FLAGS.log_dir, params={ 'model': FLAGS.model, 'decay_steps': 
decay_steps, 'n_classes': 1000, 'dtype': tf.float16 if FLAGS.fp16 else tf.float32, 'format': 'channels_first', 'device': '/gpu:0', 'lr': FLAGS.lr, 'mom': FLAGS.mom, 'wdecay': FLAGS.wdecay, 'steps': steps, 'lr_steps': lr_steps, 'lr_decay_mode': FLAGS.lr_decay_mode, 'warmup_it': warmup_it, 'warmup_lr': FLAGS.warmup_lr, 'loss_scale': FLAGS.loss_scale, 'adv_bn_init': FLAGS.adv_bn_init, 'conv_init': tf.variance_scaling_initializer() if FLAGS.adv_conv_init else None }, config=tf.estimator.RunConfig( # tf_random_seed=31 * (1 + hvd.rank()), session_config=config, save_summary_steps=FLAGS.save_summary_steps if do_checkpoint else None, save_checkpoints_steps=FLAGS.save_checkpoints_steps if do_checkpoint else None, keep_checkpoint_max=None)) if not FLAGS.eval: num_preproc_threads = 5 rank0log(logger, "Preproc threads", num_preproc_threads) training_hooks = [ hvd.BroadcastGlobalVariablesHook(0), PrefillStagingAreasHook() ] if hvd.rank() == 0: training_hooks.append( LogSessionRunHook(global_batch_size, num_training_samples, FLAGS.display_every, logger)) try: start_time = time.time() classifier.train( input_fn=lambda: make_dataset(train_filenames, training_samples_per_rank, FLAGS.batch_size, height, width, training=True, num_threads=num_preproc_threads, shard=True, synthetic=FLAGS.synthetic), max_steps=nstep, hooks=training_hooks) rank0log(logger, "Finished in ", time.time() - start_time) except KeyboardInterrupt: print("Keyboard interrupt") elif FLAGS.eval and not FLAGS.synthetic: rank0log(logger, "Evaluating") rank0log( logger, "Validation dataset size: {}".format( get_num_records(eval_filenames))) barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32)) tf.Session(config=config).run(barrier) time.sleep(5) # a little extra margin... if FLAGS.num_gpus == 1: rank0log( logger, """If you are evaluating checkpoints of a multi-GPU run on a single GPU, ensure you set --num_gpus to the number of GPUs it was trained on. This will ensure that the epoch number is accurately displayed in the below logs.""" ) try: ckpts = sort_and_load_ckpts(FLAGS.log_dir) for i, c in enumerate(ckpts): if i < len(ckpts) - 1: if (not FLAGS.eval_interval) or \ (i % FLAGS.eval_interval != 0): continue eval_result = classifier.evaluate( input_fn=lambda: make_dataset(eval_filenames, get_num_records( eval_filenames), FLAGS.batch_size, height, width, training=False, shard=True, synthetic=FLAGS.synthetic), checkpoint_path=c['path']) c['epoch'] = (c['step'] * FLAGS.num_gpus) / (nstep_per_epoch * hvd.size()) c['top1'] = eval_result['val-top1acc'] c['top5'] = eval_result['val-top5acc'] c['loss'] = eval_result['loss'] rank0log( logger, ' step epoch top1 top5 loss checkpoint_time(UTC)') barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32)) for i, c in enumerate(ckpts): tf.Session(config=config).run(barrier) if 'top1' not in c: continue rank0log( logger, '{:5d} {:5.1f} {:5.3f} {:6.2f} {:6.2f} {time}'.format( c['step'], c['epoch'], c['top1'] * 100, c['top5'] * 100, c['loss'], time=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['mtime'])))) rank0log(logger, "Finished evaluation") except KeyboardInterrupt: logger.error("Keyboard interrupt")
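# The evaluation path above relies on a `sort_and_load_ckpts` helper defined
# elsewhere. A hedged sketch of its assumed behavior: enumerate the
# model.ckpt-<step>.index files under log_dir and return per-checkpoint dicts
# (step, path, mtime) ordered by global step, matching the fields the report
# loop reads.
import os
import re

def sort_and_load_ckpts(log_dir):
    ckpts = []
    for f in os.listdir(log_dir):
        m = re.match(r'model\.ckpt-([0-9]+)\.index$', f)
        if m is None:
            continue
        ckpts.append({
            'step': int(m.group(1)),
            'path': os.path.join(log_dir, f[:-len('.index')]),
            'mtime': os.stat(os.path.join(log_dir, f)).st_mtime,
        })
    ckpts.sort(key=lambda x: x['step'])
    return ckpts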
def main(_): '''Main routine for Horovod Tensorflow Mnist example.''' # Horovod: initialize Horovod. hvd.init() # Horovod: pin GPU to be used to process local rank (one GPU per process) gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=str(hvd.local_rank())) config = tf.ConfigProto(gpu_options=gpu_options) batch_size = 100 # Download and load MNIST dataset. if hvd.rank() == 0: # mnist = learn.datasets.mnist.read_data_sets(MNIST_DATADIR) image, label = get_data_mnist(batch_size) # hvd.allreduce(tf.constant([0]), average=False) # Barrier (not working) with tf.Session(config=config): # download/unzip in rank 0 only. hvd_keras.allreduce([0], name="Barrier") if hvd.rank() != 0: # mnist = learn.datasets.mnist.read_data_sets(MNIST_DATADIR) image, label = get_data_mnist(batch_size) # Build model... # with tf.name_scope('input'): # image = tf.placeholder(tf.float32, [None, 784], name='image') # label = tf.placeholder(tf.float32, [None], name='label') predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN) # Horovod: adjust learning rate based on number of GPUs. opt = tf.train.RMSPropOptimizer(0.001 * hvd.size()) # Horovod: add Horovod Distributed Optimizer. opt = hvd.DistributedOptimizer(opt) # global_step = tf.contrib.framework.get_or_create_global_step() global_step = tf.train.get_or_create_global_step() train_op = opt.minimize(loss, global_step=global_step) hooks = [ # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable # states from rank 0 to all other processes. This is necessary to # ensure consistent initialization of all workers when training is # started with random weights or restored from a checkpoint. hvd.BroadcastGlobalVariablesHook(0), # Horovod: adjust number of steps based on number of GPUs. tf.train.StopAtStepHook(last_step=20000 // hvd.size()), tf.train.LoggingTensorHook(tensors={ 'step': global_step, 'loss': loss }, every_n_iter=10), ] # Horovod: save checkpoints only on worker 0 to prevent other workers from # corrupting them. checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None # The MonitoredTrainingSession takes care of session initialization, # restoring from a checkpoint, saving to a checkpoint, and closing when # done or an error occurs. with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, hooks=hooks, config=config) as mon_sess: while not mon_sess.should_stop(): # Run a training step synchronously. # image_, label_ = mnist.train.next_batch(100) # mon_sess.run(train_op, feed_dict={image: image_, label: label_}) mon_sess.run(train_op)
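# The hvd_keras.allreduce([0], name="Barrier") call above uses a collective
# as a barrier so that only rank 0 downloads the dataset. A minimal sketch of
# the same trick in plain Horovod/TF1 (my own wrapper, not from the script;
# assumes hvd.init() has been called): every rank must enter the allreduce
# before any rank can leave it, so non-zero ranks wait for rank 0.
import horovod.tensorflow as hvd
import tensorflow as tf

def barrier(session):
    # Builds a fresh allreduce op; acceptable for one-off synchronization.
    session.run(hvd.allreduce(tf.constant(0.0, name="barrier_in")))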
def train(self,
          iter_unit,
          num_iter,
          batch_size,
          weight_decay,
          learning_rate,
          learning_rate_decay_factor,
          learning_rate_decay_steps,
          rmsprop_decay,
          rmsprop_momentum,
          use_auto_loss_scaling,
          augment_data,
          warmup_steps=50,
          is_benchmark=False):

    if iter_unit not in ["epoch", "batch"]:
        raise ValueError(
            '`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])' % iter_unit)

    if self.run_hparams.data_dir is None and not is_benchmark:
        raise ValueError('`data_dir` must be specified for training!')

    if self.run_hparams.use_tf_amp:
        if use_auto_loss_scaling:
            if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
                LOGGER.log("TF Loss Auto Scaling is activated - Experimental Feature")
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "1"
            apply_manual_loss_scaling = False
        else:
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "0"
            apply_manual_loss_scaling = True
    else:
        apply_manual_loss_scaling = False

    if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
        LOGGER.log('Defining Model Estimator ...\n')

    global_batch_size = batch_size * self.num_gpus

    if self.run_hparams.data_dir is not None:
        filenames, num_samples, num_steps, num_epochs = self.dataset.get_dataset_runtime_specs(
            training=True,
            iter_unit=iter_unit,
            num_iter=num_iter,
            global_batch_size=global_batch_size)
        steps_per_epoch = int(num_steps / num_epochs)
    else:
        num_epochs = 1
        num_steps = num_iter
        steps_per_epoch = 625

    training_hooks = []

    if hvd_utils.is_using_hvd():
        training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
        training_hooks.append(
            ProfilerHook(global_batch_size=global_batch_size,
                         log_every=self.run_hparams.log_every_n_steps,
                         warmup_steps=warmup_steps,
                         is_training=True,
                         sample_dir=self.run_hparams.sample_dir))

        LOGGER.log('Starting Model Training ...\n')
        LOGGER.log("=> Epochs: %d" % num_epochs)
        LOGGER.log("=> Total Steps: %d" % num_steps)
        LOGGER.log("=> Steps per Epoch: %d" % steps_per_epoch)
        LOGGER.log("=> Weight Decay Factor: %.1e" % weight_decay)
        LOGGER.log("=> Learning Rate: %.1e" % learning_rate)
        LOGGER.log("=> Learning Rate Decay Factor: %.2f" % learning_rate_decay_factor)
        LOGGER.log("=> Learning Rate Decay Steps: %d" % learning_rate_decay_steps)
        LOGGER.log("=> RMSProp - Decay: %.1f" % rmsprop_decay)
        LOGGER.log("=> RMSProp - Momentum: %.1f" % rmsprop_momentum)
        LOGGER.log("=> Loss Function Name: %s" % self.run_hparams.loss_fn_name)
        if self.run_hparams.use_tf_amp:
            LOGGER.log("=> Use Auto Loss Scaling: %s" % use_auto_loss_scaling)
        LOGGER.log("=> # GPUs: %d" % self.num_gpus)
        LOGGER.log("=> GPU Batch Size: %d" % batch_size)
        LOGGER.log("=> Global Batch Size: %d" % global_batch_size)
        LOGGER.log("=> Total Files to be Processed: %d\n" % (num_steps * global_batch_size))

    estimator_params = {
        'batch_size': batch_size,
        'steps_per_epoch': steps_per_epoch,
        'learning_rate': learning_rate,
        'learning_rate_decay_steps': learning_rate_decay_steps,
        'learning_rate_decay_factor': learning_rate_decay_factor,
        'rmsprop_decay': rmsprop_decay,
        'rmsprop_momentum': rmsprop_momentum,
        'weight_decay': weight_decay,
        'apply_manual_loss_scaling': apply_manual_loss_scaling,
        'loss_fn_name': self.run_hparams.loss_fn_name,
        'debug_verbosity': self.run_hparams.debug_verbosity,
    }

    def training_data_fn():
        if not is_benchmark or self.run_hparams.data_dir is not None:
            return self.dataset.dataset_fn(
                batch_size=batch_size,
                training=True,
                only_defective_images=True,
                augment_data=augment_data,
                input_shape=list(self.run_hparams.input_shape) + [self.run_hparams.n_channels],
                mask_shape=list(self.run_hparams.mask_shape) + [self.run_hparams.n_channels],
                num_threads=64,
                use_gpu_prefetch=True,
                normalize_data_method="zero_centered",
                seed=self.run_hparams.seed)
        else:
            if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
                LOGGER.log("Using Synthetic Data ...")
            return self.dataset.synth_dataset_fn(
                batch_size=batch_size,
                training=True,
                input_shape=list(self.run_hparams.input_shape) + [self.run_hparams.n_channels],
                mask_shape=list(self.run_hparams.mask_shape) + [self.run_hparams.n_channels],
                num_threads=64,
                use_gpu_prefetch=True,
                normalize_data_method="zero_centered",
                only_defective_images=True,
                augment_data=augment_data,
                seed=self.run_hparams.seed)

    model = self._get_estimator(mode='train',
                                run_params=estimator_params,
                                use_xla=self.use_xla)

    try:
        model.train(
            input_fn=training_data_fn,
            steps=num_steps,
            hooks=training_hooks,
        )
    except KeyboardInterrupt:
        print("Keyboard interrupt")

    if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
        LOGGER.log('Ending Model Training ...')
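# The runners above branch repeatedly on `hvd_utils.is_using_hvd()`, a helper
# defined elsewhere. A hedged sketch of its assumed implementation: detect a
# Horovod launch from the Open MPI environment variables that
# mpirun/horovodrun export to every worker (the real helper may differ).
import os

def is_using_hvd():
    return all(var in os.environ
               for var in ("OMPI_COMM_WORLD_RANK", "OMPI_COMM_WORLD_SIZE"))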
def main(_):
    # causes memory fragmentation for bert leading to OOM
    if os.environ.get("TF_XLA_FLAGS", None) is not None:
        os.environ["TF_XLA_FLAGS"] += "--tf_xla_enable_lazy_compilation=false"
    else:
        os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false"

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)

    if FLAGS.horovod:
        hvd.init()

    processors = {
        "bc5cdr": BC5CDRProcessor,
        "clefe": CLEFEProcessor,
        'i2b2': I2b22012Processor
    }
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    tf.io.gfile.makedirs(FLAGS.output_dir)

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    master_process = True
    training_hooks = []
    global_batch_size = FLAGS.train_batch_size
    hvd_rank = 0

    config = tf.compat.v1.ConfigProto()
    if FLAGS.horovod:
        global_batch_size = FLAGS.train_batch_size * hvd.size()
        master_process = (hvd.rank() == 0)
        hvd_rank = hvd.rank()
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        if hvd.size() > 1:
            training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
        tf.enable_resource_variables()

    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir if master_process else None,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None,
        keep_checkpoint_max=1)

    if master_process:
        tf.compat.v1.logging.info("***** Configuration *****")
        for key in FLAGS.__flags.keys():
            tf.compat.v1.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
        tf.compat.v1.logging.info("**************************")

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank))

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        start_index = 0
        end_index = len(train_examples)
        tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]

        if FLAGS.horovod:
            tmp_filenames = [
                os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i))
                for i in range(hvd.size())
            ]
            num_examples_per_rank = len(train_examples) // hvd.size()
            remainder = len(train_examples) % hvd.size()
            if hvd.rank() < remainder:
                start_index = hvd.rank() * (num_examples_per_rank + 1)
                end_index = start_index + num_examples_per_rank + 1
            else:
                start_index = hvd.rank() * num_examples_per_rank + remainder
                end_index = start_index + (num_examples_per_rank)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(),
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_one_hot_embeddings=False,
        hvd=None if not FLAGS.horovod else hvd,
        amp=FLAGS.amp)

    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if FLAGS.do_train:
        # train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        # filed_based_convert_examples_to_features(
        #     train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
        filed_based_convert_examples_to_features(
            train_examples[start_index:end_index], label_list,
            FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank])
        tf.compat.v1.logging.info("***** Running training *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(train_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.compat.v1.logging.info("  Num steps = %d", num_train_steps)

        train_input_fn = file_based_input_fn_builder(
            input_file=tmp_filenames,  # train_file,
            batch_size=FLAGS.train_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            hvd=None if not FLAGS.horovod else hvd)

        # estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
        train_start_time = time.time()
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps,
                        hooks=training_hooks)
        train_time_elapsed = time.time() - train_start_time
        train_time_wo_overhead = training_hooks[-1].total_time
        avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed
        ss_sentences_per_second = (num_train_steps - training_hooks[-1].skipped) * global_batch_size * 1.0 / train_time_wo_overhead

        if master_process:
            tf.compat.v1.logging.info("-----------------------------")
            tf.compat.v1.logging.info("Total Training Time = %0.2f for Sentences = %d",
                                      train_time_elapsed,
                                      num_train_steps * global_batch_size)
            tf.compat.v1.logging.info("Total Training Time W/O Overhead = %0.2f for Sentences = %d",
                                      train_time_wo_overhead,
                                      (num_train_steps - training_hooks[-1].skipped) * global_batch_size)
            tf.compat.v1.logging.info("Throughput Average (sentences/sec) with overhead = %0.2f",
                                      avg_sentences_per_second)
            tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f",
                                      ss_sentences_per_second)
            dllogging.logger.log(step=(),
                                 data={"throughput_train": ss_sentences_per_second},
                                 verbosity=Verbosity.DEFAULT)
            tf.compat.v1.logging.info("-----------------------------")

    if FLAGS.do_eval and master_process:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        filed_based_convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)

        tf.compat.v1.logging.info("***** Running evaluation *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(eval_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_steps = None
        eval_drop_remainder = False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            batch_size=FLAGS.eval_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            tf.compat.v1.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
                dllogging.logger.log(step=(), data={key: float(result[key])},
                                     verbosity=Verbosity.DEFAULT)
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict and master_process:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples, label_list,
                                                 FLAGS.max_seq_length, tokenizer,
                                                 predict_file, mode="test")

        with tf.io.gfile.GFile(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
        if tf.io.gfile.exists(token_path):
            tf.io.gfile.remove(token_path)

        tf.compat.v1.logging.info("***** Running prediction *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(predict_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            batch_size=FLAGS.predict_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        eval_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
        eval_start_time = time.time()

        output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")
        test_labels_file = os.path.join(FLAGS.output_dir, "test_labels.txt")
        test_labels_err_file = os.path.join(FLAGS.output_dir, "test_labels_errs.txt")
        with tf.io.gfile.GFile(output_predict_file, 'w') as writer, \
                tf.io.gfile.GFile(test_labels_file, 'w') as tl, \
                tf.io.gfile.GFile(test_labels_err_file, 'w') as tle:
            print(id2label)
            i = 0
            for prediction in estimator.predict(input_fn=predict_input_fn,
                                                hooks=eval_hooks,
                                                yield_single_examples=True):
                output_line = "\n".join(id2label[id] for id in prediction if id != 0) + "\n"
                writer.write(output_line)
                result_to_pair(predict_examples[i], prediction, id2label, tl, tle)
                i = i + 1

        eval_time_elapsed = time.time() - eval_start_time
        time_list = eval_hooks[-1].time_list
        time_list.sort()
        # Removing outliers (init/warmup) in throughput computation.
        eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.99)])
        num_sentences = (int(len(time_list) * 0.99)) * FLAGS.predict_batch_size

        avg = np.mean(time_list)
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info("Total Inference Time = %0.2f for Sentences = %d",
                                  eval_time_elapsed,
                                  eval_hooks[-1].count * FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
                                  eval_time_wo_overhead, num_sentences)
        tf.compat.v1.logging.info("Summary Inference Statistics")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32")
        tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000)
        tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f",
                                  ss_sentences_per_second)
        dllogging.logger.log(step=(),
                             data={"throughput_val": ss_sentences_per_second},
                             verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")

        tf.compat.v1.logging.info('Reading: %s', test_labels_file)
        with tf.io.gfile.GFile(test_labels_file, "r") as f:
            counts = evaluate(f)
        eval_result = report_notprint(counts)
        print(''.join(eval_result))
        with tf.io.gfile.GFile(os.path.join(FLAGS.output_dir, 'test_results_conlleval.txt'), 'w') as fd:
            fd.write(''.join(eval_result))
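# Unlike the earlier script, the prediction path above drops the slowest 1%
# of per-batch times (which mostly contain session start-up and warm-up)
# before computing throughput. A standalone sketch of that correction; the
# helper name and `keep` parameter are my own.
def throughput_wo_outliers(time_list, batch_size, keep=0.99):
    times = sorted(time_list)
    kept = times[:int(len(times) * keep)]
    return (len(kept) * batch_size) / sum(kept)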
def bnn(args):
    # %% Model
    class Dummy():
        pass

    hvd.init()
    print("Rank is:", hvd.rank())
    tf.reset_default_graph()
    tf.set_random_seed(args.seed + hvd.rank())
    np.random.seed(args.seed + hvd.rank())
    tfd = tf.contrib.distributions

    N = args.X_train.shape[0]
    dim = list(args.X_train.shape[1:])
    K = args.Y_train.shape[1]  # num of classes
    X = tf.placeholder(tf.float32, [None] + dim)
    y = tf.placeholder(tf.float32, [None, K])

    neural_net = nnet.convnet(activation=args.activation,
                              numclass=K,
                              inshape=args.inshape,
                              isBay=True,
                              regularizer=args.regularizer,
                              priorstd=args.priorstd,
                              poststd=args.poststd,
                              repeatConv=args.repeatConv)
    logits = neural_net(X)
    labels_distribution = tfd.Categorical(logits=logits)

    # %% Loss
    neg_log_likelihood = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=logits))
    kl = sum(neural_net.losses) / N
    elbo_loss = neg_log_likelihood + args.KLscale * kl

    # %% Metrics
    correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # %% Posterior
    # names = []
    # qmeans = []
    # qstds = []
    # Wsample = []
    # for i, layer in enumerate(neural_net.layers):
    #     if hasattr(layer, "kernel_posterior"):
    #         q = layer.kernel_posterior
    #         names.append("Layer {}".format(i))
    #         qmeans.append(q.mean())
    #         qstds.append(q.stddev())
    #         Wsample.append(q.sample(args.num_monte_carlo))

    # Horovod training
    opt = tf.train.AdamOptimizer(args.learning_rate * hvd.size())
    opt = hvd.DistributedOptimizer(opt)
    global_step = tf.train.get_or_create_global_step()
    train_opt = opt.minimize(elbo_loss, global_step=global_step)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    # checkpoint_dir = './train_logs' if hvd.rank() == 0 else None
    hooks = [
        hvd.BroadcastGlobalVariablesHook(0)
        # tf.train.StopAtStepHook(last_step=max_iter // hvd.size())
    ]

    res_return = Dummy()
    res_return.plot = Dummy()
    res_return.plot.niter = []
    res_return.plot.runtime = []
    res_return.plot.loss = []
    res_return.plot.devAcc = []
    # res_return.plot.devAccMean = []
    # res_return.plot.devAccUp = []
    # res_return.plot.devAccDown = []
    # print("total iteration is", max_iter // hvd.size())

    with tf.train.MonitoredTrainingSession(hooks=hooks, config=config) as mon_sess:
        if hvd.rank() == 0:
            print("=" * 21 + "Optimization Start" + "=" * 21)
        start_time, algstart = time.time(), time.time()
        niter = 0
        for epoch in range(args.training_epochs // hvd.size()):
            perm = np.random.permutation(N)
            for i in range(0, N, args.batch_size):
                batch_x = args.X_train[perm[i:i + args.batch_size]]
                batch_y = args.Y_train[perm[i:i + args.batch_size]]
                _, loss_val, acc_val = mon_sess.run(
                    [train_opt, elbo_loss, accuracy],
                    feed_dict={X: batch_x, y: batch_y})
                niter += 1
                # print(niter, end=", ")
                if (niter * hvd.size()) % args.viz_steps == 0:
                    end_time = time.time()
                    # eval on dev set
                    acc_val_dev = np.asarray([
                        mon_sess.run(accuracy,
                                     feed_dict={X: args.X_test, y: args.Y_test})
                        for xyz in range(args.num_monte_carlo // hvd.size())
                    ])
                    # save
                    timediff = end_time - start_time
                    AccMean = np.mean(acc_val_dev)
                    AccStd = np.std(acc_val_dev)
                    res_return.plot.niter.append(niter)
                    res_return.plot.runtime.append(timediff)
                    res_return.plot.loss.append(loss_val)
                    res_return.plot.devAcc.append(acc_val_dev)
                    # res_return.plot.devAccMean.append(AccMean)
                    # res_return.plot.devAccUp.append(AccMean + AccStd)
                    # res_return.plot.devAccDown.append(AccMean - AccStd)
                    if hvd.rank() == 0:
                        print("Step: {:>3d} RunTime: {:.3f} Loss: {:.3f} "
                              "ACC: {:.3f} AccDevM: {:.3f} AccDevU: {:.3f}".format(
                                  niter, timediff, loss_val, acc_val, AccMean,
                                  AccMean + AccStd))
                    start_time = time.time()

        eval_start = time.time()
        if hvd.rank() == 0:
            print("=" * 21 + "Optimization Finish" + "=" * 21)
        tmp = [
            mon_sess.run([accuracy, labels_distribution.probs],
                         feed_dict={X: args.X_test, y: args.Y_test})
            for xyz in range(args.num_monte_carlo_test // hvd.size())
        ]
        [acc_val_test, probs] = list(zip(*tmp))
        acc_val_test = np.asarray(acc_val_test)
        eval_end = time.time()
        tot_time = eval_end - algstart
        eval_time = eval_end - eval_start
        if hvd.rank() == 0:
            print("Step: {:>3d} RunTime: {:.3f} TestAcc: {:.3f}".format(
                niter, tot_time, np.mean(acc_val_test)))

    res_return.tot_time = tot_time
    res_return.eval_time = eval_time
    # Return result
    res_return.probs = np.asarray(probs)
    res_return.acc = np.asarray(acc_val_test)
    # res_return.posterior = Dummy()
    # res_return.posterior.mean = qm_vals
    # res_return.posterior.std = qs_vals
    # res_return.posterior.samples = W_postsam
    # res_return.names = names
    return res_return
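# The test-time loop above draws num_monte_carlo_test stochastic forward
# passes and averages them. A compact sketch of that Monte Carlo estimate
# outside the session; the function and argument names are my own.
import numpy as np

def mc_predict(run_probs, num_samples=32):
    """run_probs() -> (batch, K) probabilities from one stochastic pass."""
    probs = np.mean([run_probs() for _ in range(num_samples)], axis=0)
    return probs.argmax(axis=1), probs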