def main(unused_argv):
  hvd.init()
  model_class = import_symbol(FLAGS.model_name, 'em_mask')
  model_args = json.loads(FLAGS.model_args)
  fov_size = tuple([int(i) for i in model_args['fov_size']])
  if 'label_size' in model_args:
    label_size = tuple([int(i) for i in model_args['label_size']])
  else:
    label_size = fov_size
  model_args['label_size'] = label_size
  num_classes = int(model_args['num_classes'])
  if num_classes == 1:
    # model_fn = model_utils.mask_model_fn_regression
    model_fn = model_utils.mask_model_fn_binary
  else:
    model_fn = model_utils.mask_model_fn_classfication

  params = {
      'model_class': model_class,
      'model_args': model_args,
      'batch_size': FLAGS.batch_size,
      'num_classes': num_classes,
      'learning_rate': FLAGS.learning_rate,
      'weighted': FLAGS.weighted
  }

  gpus = tf.config.experimental.list_physical_devices('GPU')
  for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
  if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
  sess_config = tf.compat.v1.ConfigProto()
  sess_config.gpu_options.allow_growth = True
  sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

  model_dir = FLAGS.train_dir if hvd.rank() == 0 else None
  save_summary_steps = 90 if hvd.rank() == 0 else None
  save_checkpoints_secs = 540 if hvd.rank() == 0 else None
  config = tf.estimator.RunConfig(
      model_dir=model_dir,
      save_summary_steps=save_summary_steps,
      save_checkpoints_secs=save_checkpoints_secs,
      session_config=sess_config,
      keep_checkpoint_max=10,
  )
  mask_estimator = tf.estimator.Estimator(
      model_fn=model_fn, config=config, params=params)
  bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

  if FLAGS.weights_volumes:
    input_fn = io_utils.train_input_fn_with_weight(
        FLAGS.data_volumes,
        FLAGS.label_volumes,
        FLAGS.weights_volumes,
        FLAGS.tf_coords,
        num_classes,
        fov_size,
        label_size,
        FLAGS.batch_size,
        FLAGS.image_mean,
        FLAGS.image_stddev,
        FLAGS.rotation)
  else:
    # input_fn = io_utils.train_input_fn(
    #     FLAGS.data_volumes,
    #     FLAGS.label_volumes,
    #     FLAGS.tf_coords,
    #     num_classes,
    #     fov_size,
    #     label_size,
    #     FLAGS.batch_size,
    #     FLAGS.image_mean,
    #     FLAGS.image_stddev,
    #     FLAGS.rotation)
    # input_fn = io_utils.train_input_rebalance_fn(
    input_fn = io_utils.train_input_mult_fn(
        FLAGS.data_volumes,
        FLAGS.label_volumes,
        FLAGS.tf_coords,
        num_classes,
        fov_size,
        label_size,
        FLAGS.batch_size,
        FLAGS.image_mean,
        FLAGS.image_stddev,
        FLAGS.rotation,
        False)

  mask_estimator.train(
      input_fn=input_fn, steps=FLAGS.max_steps, hooks=[bcast_hook])
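# Hedged sketch (not part of the original script): the minimal Horovod +
# tf.estimator recipe that main() above instantiates, isolated for clarity.
# `my_model_fn` and `my_input_fn` are hypothetical stand-ins for the model
# and input pipeline.
import horovod.tensorflow as hvd
import tensorflow.compat.v1 as tf

def run_distributed_training(my_model_fn, my_input_fn, train_dir, max_steps):
    hvd.init()
    # Pin each process to a single GPU.
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
    # Only rank 0 writes checkpoints/summaries so workers do not clobber them.
    config = tf.estimator.RunConfig(
        model_dir=train_dir if hvd.rank() == 0 else None,
        session_config=sess_config)
    estimator = tf.estimator.Estimator(model_fn=my_model_fn, config=config)
    # Broadcast rank-0 variables so all workers start from identical weights.
    estimator.train(input_fn=my_input_fn,
                    steps=max_steps,
                    hooks=[hvd.BroadcastGlobalVariablesHook(0)])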
# optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate_).minimize(loss)

# Horovod inclusion: scale the learning rate by the number of workers.
opt = tf.train.AdamOptimizer(learning_rate=learning_rate_ * hvd.size())

# Add Horovod Distributed Optimizer.
opt = hvd.DistributedOptimizer(opt)

prediction = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))

global_step = tf.train.get_or_create_global_step()

# Add hook to broadcast variables from rank 0 to all other processes during
# initialization.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]

# Make training operation.
train_op = opt.minimize(loss, global_step=global_step)

# Save checkpoints only on worker 0 to prevent other workers from
# corrupting them.
checkpoint_dir = '/tmp/train_logs' if hvd.rank() == 0 else None

# Initialize the variables.
init = tf.global_variables_initializer()

# Implementing a dynamic graph using the TensorFlow API.
stime = time()
runtime = 0
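# Hedged sketch: one way the graph fragment above is typically driven to
# completion. `loss`, `x`, `y`, `train_op`, `accuracy`, `hooks`, and
# `checkpoint_dir` are the names defined above; `batch_generator` is a
# hypothetical stand-in for whatever feeds the placeholders. The
# StopAtStepHook divides the step budget by the worker count, the usual
# Horovod adjustment.
hooks.append(tf.train.StopAtStepHook(last_step=10000 // hvd.size()))
with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                       hooks=hooks) as mon_sess:
    while not mon_sess.should_stop():
        batch_x, batch_y = next(batch_generator)
        _, acc = mon_sess.run([train_op, accuracy],
                              feed_dict={x: batch_x, y: batch_y})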
def train(self,
          iter_unit,
          num_iter,
          batch_size,
          warmup_steps=50,
          weight_decay=1e-4,
          lr_init=0.1,
          lr_warmup_epochs=5,
          momentum=0.9,
          log_every_n_steps=1,
          loss_scale=256,
          label_smoothing=0.0,
          mixup=0.0,
          use_cosine_lr=False,
          use_static_loss_scaling=False,
          is_benchmark=False):

    if iter_unit not in ["epoch", "batch"]:
        raise ValueError(
            '`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])' % iter_unit)

    if self.run_hparams.data_dir is None and not is_benchmark:
        raise ValueError('`data_dir` must be specified for training!')

    if self.run_hparams.use_tf_amp or self.run_hparams.dtype == tf.float16:
        if use_static_loss_scaling:
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "0"
        else:
            LOGGER.log("TF Loss Auto Scaling is activated")
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "1"
    else:
        use_static_loss_scaling = False  # Make sure it hasn't been set to True on FP32 training

    num_gpus = 1 if not hvd_utils.is_using_hvd() else hvd.size()
    global_batch_size = batch_size * num_gpus

    if self.run_hparams.data_dir is not None:
        filenames, num_samples, num_steps, num_epochs, num_decay_steps = runner_utils.parse_tfrecords_dataset(
            data_dir=self.run_hparams.data_dir,
            mode="train",
            iter_unit=iter_unit,
            num_iter=num_iter,
            global_batch_size=global_batch_size,
        )
        steps_per_epoch = num_steps / num_epochs
    else:
        num_epochs = 1
        num_steps = num_iter
        steps_per_epoch = num_steps
        num_decay_steps = num_steps
        num_samples = num_steps * batch_size

    if self.run_hparams.data_idx_dir is not None:
        idx_filenames = runner_utils.parse_dali_idx_dataset(
            data_idx_dir=self.run_hparams.data_idx_dir, mode="train")

    training_hooks = []

    if hvd.rank() == 0:
        LOGGER.log('Starting Model Training...')
        LOGGER.log("Training Epochs", num_epochs)
        LOGGER.log("Total Steps", num_steps)
        LOGGER.log("Steps per Epoch", steps_per_epoch)
        LOGGER.log("Decay Steps", num_decay_steps)
        LOGGER.log("Weight Decay Factor", weight_decay)
        LOGGER.log("Init Learning Rate", lr_init)
        LOGGER.log("Momentum", momentum)
        LOGGER.log("Num GPUs", num_gpus)
        LOGGER.log("Per-GPU Batch Size", batch_size)

        if is_benchmark:
            benchmark_logging_hook = hooks.BenchmarkLoggingHook(
                log_file_path=os.path.join(self.run_hparams.log_dir,
                                           "training_benchmark.json"),
                global_batch_size=global_batch_size,
                log_every=log_every_n_steps,
                warmup_steps=warmup_steps)
            training_hooks.append(benchmark_logging_hook)
        else:
            training_logging_hook = hooks.TrainingLoggingHook(
                log_file_path=os.path.join(self.run_hparams.log_dir,
                                           "training.json"),
                global_batch_size=global_batch_size,
                num_steps=num_steps,
                num_samples=num_samples,
                num_epochs=num_epochs,
                log_every=log_every_n_steps)
            training_hooks.append(training_logging_hook)

    if hvd_utils.is_using_hvd():
        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
        training_hooks.append(bcast_hook)

    training_hooks.append(hooks.PrefillStagingAreasHook())

    estimator_params = {
        'batch_size': batch_size,
        'steps_per_epoch': steps_per_epoch,
        'num_gpus': num_gpus,
        'momentum': momentum,
        'lr_init': lr_init,
        'lr_warmup_epochs': lr_warmup_epochs,
        'weight_decay': weight_decay,
        'loss_scale': loss_scale,
        'apply_loss_scaling': use_static_loss_scaling,
        'label_smoothing': label_smoothing,
        'mixup': mixup,
        'num_decay_steps': num_decay_steps,
        'use_cosine_lr': use_cosine_lr
    }

    image_classifier = self._get_estimator(
        mode='train',
        run_params=estimator_params,
        use_xla=self.run_hparams.use_xla,
        use_dali=self.run_hparams.use_dali,
        gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
        gpu_id=self.run_hparams.gpu_id)

    def training_data_fn():
        if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
            if hvd.rank() == 0:
                LOGGER.log("Using DALI input... ")
            return data_utils.get_dali_input_fn(
                filenames=filenames,
                idx_filenames=idx_filenames,
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                training=True,
                distort_color=self.run_hparams.distort_colors,
                num_threads=self.run_hparams.num_preprocessing_threads,
                deterministic=False if self.run_hparams.seed is None else True)
        elif self.run_hparams.data_dir is not None:
            return data_utils.get_tfrecords_input_fn(
                filenames=filenames,
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                training=True,
                distort_color=self.run_hparams.distort_colors,
                num_threads=self.run_hparams.num_preprocessing_threads,
                deterministic=False if self.run_hparams.seed is None else True)
        else:
            if hvd.rank() == 0:
                LOGGER.log("Using Synthetic Data ...")
            return data_utils.get_synth_input_fn(
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                num_channels=self.run_hparams.n_channels,
                data_format=self.run_hparams.input_format,
                num_classes=self.run_hparams.n_classes,
                dtype=self.run_hparams.dtype,
            )

    try:
        image_classifier.train(
            input_fn=training_data_fn,
            steps=num_steps,
            hooks=training_hooks,
        )
    except KeyboardInterrupt:
        print("Keyboard interrupt")

    if hvd.rank() == 0:
        LOGGER.log('Ending Model Training ...')
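# Hedged sketch: the epoch-to-step bookkeeping that parse_tfrecords_dataset()
# is assumed to perform for iter_unit="epoch" (the real implementation lives
# in runner_utils, which is not shown here).
def steps_from_epochs(num_samples, num_epochs, global_batch_size):
    # One epoch covers every sample once; round partial batches up.
    steps_per_epoch = (num_samples + global_batch_size - 1) // global_batch_size
    return num_epochs * steps_per_epoch

# e.g. an ImageNet-sized run: 1,281,167 samples, 90 epochs, 8 GPUs x 256/GPU
# gives a global batch of 2048, so
# steps_from_epochs(1281167, 90, 2048) == 56340 total training steps.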
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
  """Model definition entry.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class targets
      and box targets which are dense label maps. The labels are generated from
      get_input_fn function in data/dataloader.py.
    mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
    params: the dictionary defines hyperparameters of model. The default
      settings are in default_hparams function in this file.
    model: the model outputs class logits and box regression outputs.
    variable_filter_fn: the filter function that takes trainable_variables and
      returns the variable list after applying the filter rule.

  Returns:
    tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.

  Raises:
    RuntimeError: if both ckpt and backbone_ckpt are set.
  """
  # Convert params (dict) to Config for easier access.
  training_hooks = None
  if params['data_format'] == 'channels_first':
    features = tf.transpose(features, [0, 3, 1, 2])

  def _model_outputs(inputs):
    return model(inputs, config=hparams_config.Config(params))

  cls_outputs, box_outputs = utils.build_model_with_precision(
      params['precision'], _model_outputs, features)

  levels = cls_outputs.keys()
  for level in levels:
    cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
    box_outputs[level] = tf.cast(box_outputs[level], tf.float32)

  # First check if it is in PREDICT mode.
  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        'image': features,
    }
    for level in levels:
      predictions['cls_outputs_%d' % level] = cls_outputs[level]
      predictions['box_outputs_%d' % level] = box_outputs[level]
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

  # Set up training loss and learning rate.
  update_learning_rate_schedule_parameters(params)
  global_step = tf.train.get_or_create_global_step()
  learning_rate = learning_rate_schedule(params, global_step)

  # cls_loss and box_loss are for logging. only total_loss is optimized.
  det_loss, cls_loss, box_loss, box_iou_loss = detection_loss(
      cls_outputs, box_outputs, labels, params)
  reg_l2loss = reg_l2_loss(params['weight_decay'])
  total_loss = det_loss + reg_l2loss

  if mode == tf.estimator.ModeKeys.TRAIN:
    utils.scalar('lrn_rate', learning_rate)
    utils.scalar('trainloss/cls_loss', cls_loss)
    utils.scalar('trainloss/box_loss', box_loss)
    utils.scalar('trainloss/box_iou_loss', box_iou_loss)
    utils.scalar('trainloss/det_loss', det_loss)
    utils.scalar('trainloss/reg_l2_loss', reg_l2loss)
    utils.scalar('trainloss/loss', total_loss)

  moving_average_decay = params['moving_average_decay']
  if moving_average_decay:
    ema = tf.train.ExponentialMovingAverage(
        decay=moving_average_decay, num_updates=global_step)
    ema_vars = utils.get_ema_vars()

  if params['strategy'] == 'horovod':
    import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
    learning_rate = learning_rate * hvd.size()

  if mode == tf.estimator.ModeKeys.TRAIN:
    if params['optimizer'].lower() == 'sgd':
      optimizer = tf.train.MomentumOptimizer(
          learning_rate, momentum=params['momentum'])
    elif params['optimizer'].lower() == 'adam':
      optimizer = tf.train.AdamOptimizer(learning_rate)
    else:
      raise ValueError('optimizers should be adam or sgd')

    if params['strategy'] == 'tpu':
      optimizer = tf.tpu.CrossShardOptimizer(optimizer)
    elif params['strategy'] == 'horovod':
      optimizer = hvd.DistributedOptimizer(optimizer)
      training_hooks = [hvd.BroadcastGlobalVariablesHook(0)]

    # Batch norm requires update_ops to be added as a train_op dependency.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    var_list = tf.trainable_variables()
    if variable_filter_fn:
      var_list = variable_filter_fn(var_list)

    if params.get('clip_gradients_norm', 0) > 0:
      logging.info('clip gradients norm by %f', params['clip_gradients_norm'])
      grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
      with tf.name_scope('clip'):
        grads = [gv[0] for gv in grads_and_vars]
        tvars = [gv[1] for gv in grads_and_vars]
        clipped_grads, gnorm = tf.clip_by_global_norm(
            grads, params['clip_gradients_norm'])
        utils.scalar('gnorm', gnorm)
        grads_and_vars = list(zip(clipped_grads, tvars))
      with tf.control_dependencies(update_ops):
        train_op = optimizer.apply_gradients(grads_and_vars, global_step)
    else:
      with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(
            total_loss, global_step, var_list=var_list)

    if moving_average_decay:
      with tf.control_dependencies([train_op]):
        train_op = ema.apply(ema_vars)
  else:
    train_op = None

  eval_metrics = None
  if mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(**kwargs):
      """Returns a dictionary that has the evaluation metrics."""
      batch_size = params['batch_size']
      if params['strategy'] == 'tpu':
        batch_size = params['batch_size'] * params['num_shards']
      eval_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                     params['num_scales'],
                                     params['aspect_ratios'],
                                     params['anchor_scale'],
                                     params['image_size'])
      anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                             params['num_classes'])
      cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
      box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])

      if params.get('testdev_dir', None):
        logging.info('Eval testdev_dir %s', params['testdev_dir'])
        coco_metrics = coco_metric_fn(
            batch_size,
            anchor_labeler,
            params['val_json_file'],
            testdev_dir=params['testdev_dir'],
            disable_pyfun=params.get('disable_pyfun', None),
            **kwargs)
      else:
        logging.info('Eval val with groundtruths %s.', params['val_json_file'])
        coco_metrics = coco_metric_fn(batch_size, anchor_labeler,
                                      params['val_json_file'], **kwargs)

      # Add metrics to output.
      output_metrics = {
          'cls_loss': cls_loss,
          'box_loss': box_loss,
      }
      output_metrics.update(coco_metrics)
      return output_metrics

    cls_loss_repeat = tf.reshape(
        tf.tile(tf.expand_dims(cls_loss, 0), [params['batch_size'],]),
        [params['batch_size'], 1])
    box_loss_repeat = tf.reshape(
        tf.tile(tf.expand_dims(box_loss, 0), [params['batch_size'],]),
        [params['batch_size'], 1])
    metric_fn_inputs = {
        'cls_loss_repeat': cls_loss_repeat,
        'box_loss_repeat': box_loss_repeat,
        'source_ids': labels['source_ids'],
        'groundtruth_data': labels['groundtruth_data'],
        'image_scales': labels['image_scales'],
    }
    add_metric_fn_inputs(params, cls_outputs, box_outputs, metric_fn_inputs)
    eval_metrics = (metric_fn, metric_fn_inputs)

  checkpoint = params.get('ckpt') or params.get('backbone_ckpt')
  if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
    # Initialize the model from an EfficientDet or backbone checkpoint.
    if params.get('ckpt') and params.get('backbone_ckpt'):
      raise RuntimeError(
          '--backbone_ckpt and --checkpoint are mutually exclusive')
    if params.get('backbone_ckpt'):
      var_scope = params['backbone_name'] + '/'
      if params['ckpt_var_scope'] is None:
        # Use backbone name as default checkpoint scope.
        ckpt_scope = params['backbone_name'] + '/'
      else:
        ckpt_scope = params['ckpt_var_scope'] + '/'
    else:
      # Load every var in the given checkpoint.
      var_scope = ckpt_scope = '/'

    def scaffold_fn():
      """Loads pretrained model through scaffold function."""
      logging.info('restore variables from %s', checkpoint)
      var_map = utils.get_ckpt_var_map(
          ckpt_path=checkpoint,
          ckpt_scope=ckpt_scope,
          var_scope=var_scope,
          var_exclude_expr=params.get('var_exclude_expr', None))
      tf.train.init_from_checkpoint(checkpoint, var_map)
      return tf.train.Scaffold()
  elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

    def scaffold_fn():
      """Load moving average variables for eval."""
      logging.info('Load EMA vars with ema_decay=%f', moving_average_decay)
      restore_vars_dict = ema.variables_to_restore(ema_vars)
      saver = tf.train.Saver(restore_vars_dict)
      return tf.train.Scaffold(saver=saver)
  else:
    scaffold_fn = None

  return tf.estimator.tpu.TPUEstimatorSpec(
      mode=mode,
      loss=total_loss,
      train_op=train_op,
      eval_metrics=eval_metrics,
      host_call=utils.get_tpu_host_call(global_step, params),
      scaffold_fn=scaffold_fn,
      training_hooks=training_hooks)
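# Hedged sketch: how the EMA bookkeeping inside _model_fn fits together end
# to end, isolated from the estimator plumbing. During training, shadow
# variables track the weights; at eval time, a Saver built from
# variables_to_restore() loads the shadow values instead of the raw weights.
# `train_op` and the decay value are assumptions standing in for the real
# graph state.
ema = tf.train.ExponentialMovingAverage(decay=0.9998)
ema_vars = tf.trainable_variables()
with tf.control_dependencies([train_op]):
    train_op = ema.apply(ema_vars)  # update shadow copies after each step
# At eval time, restore the shadow (averaged) values:
saver = tf.train.Saver(ema.variables_to_restore(ema_vars))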
def main(_):
    tf.get_logger().setLevel(logging.ERROR)
    hvd.init()

    FLAGS = PARSER.parse_args()

    backends = [StdOutBackend(Verbosity.DEFAULT)]
    if FLAGS.log_dir:
        backends += [JSONStreamBackend(Verbosity.DEFAULT, FLAGS.log_dir)]
    DLLogger.init(backends=backends)

    os.environ['CUDA_CACHE_DISABLE'] = '0'
    os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
    os.environ['TF_ADJUST_HUE_FUSED'] = '1'
    os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
    os.environ['TF_DISABLE_NVTX_RANGES'] = '1'

    if hvd.rank() == 0:
        DLLogger.log(step=tuple(),
                     data={"mixed_precision": "ENABLED" if FLAGS.use_amp else "DISABLED"})

    dataset = MSDDataset(json_path=os.path.join(FLAGS.data_dir, 'dataset.json'),
                         dst_size=FLAGS.input_shape,
                         seed=FLAGS.seed,
                         interpolator=FLAGS.resize_interpolator,
                         data_normalization=FLAGS.data_normalization,
                         batch_size=FLAGS.batch_size,
                         train_split=FLAGS.train_split,
                         split_seed=FLAGS.split_seed)

    FLAGS.labels = dataset.labels

    gpu_options = tf.GPUOptions()
    config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    run_config = tf.estimator.RunConfig(
        save_summary_steps=None,
        save_checkpoints_steps=dataset.train_steps * FLAGS.train_epochs,
        save_checkpoints_secs=None,
        tf_random_seed=None,
        session_config=config,
        keep_checkpoint_max=1)

    estimator = tf.estimator.Estimator(
        model_fn=vnet_v2,
        model_dir=FLAGS.model_dir if hvd.rank() == 0 else None,
        config=run_config,
        params=FLAGS)

    train_hooks = [hvd.BroadcastGlobalVariablesHook(0)]

    if 'train' in FLAGS.exec_mode:
        steps = dataset.train_steps * FLAGS.train_epochs

        if FLAGS.benchmark:
            steps = FLAGS.warmup_steps * 2
            if hvd.rank() == 0:
                train_hooks += [ProfilingHook(FLAGS.warmup_steps,
                                              FLAGS.batch_size * hvd.size(),
                                              DLLogger)]
        else:
            if hvd.rank() == 0:
                train_hooks += [TrainHook(FLAGS.log_every, DLLogger)]

        estimator.train(
            input_fn=lambda: dataset.train_fn(FLAGS.augment),
            steps=steps,
            hooks=train_hooks)

    if 'evaluate' in FLAGS.exec_mode:
        if hvd.rank() == 0:
            if FLAGS.train_split >= 1.0:
                raise ValueError("Missing argument: --train_split < 1.0")

            result = estimator.evaluate(
                input_fn=dataset.eval_fn,
                steps=dataset.eval_steps,
                hooks=[])

            DLLogger.log(step=tuple(), data={'background_dice': result['background dice']})
            DLLogger.log(step=tuple(), data={'anterior_dice': result['Anterior dice']})
            DLLogger.log(step=tuple(), data={'posterior_dice': result['Posterior dice']})

    if 'predict' in FLAGS.exec_mode:
        count = 1
        hooks = []

        if hvd.rank() == 0:
            if FLAGS.benchmark:
                count = math.ceil((FLAGS.warmup_steps * 2) / dataset.test_steps)
                hooks += [ProfilingHook(FLAGS.warmup_steps,
                                        FLAGS.batch_size * hvd.size(),
                                        DLLogger,
                                        training=False)]

            predictions = estimator.predict(
                input_fn=lambda: dataset.test_fn(count=count), hooks=hooks)
            pred = [p['prediction'] for p in predictions]

            predict_path = os.path.join(FLAGS.model_dir, 'predictions')
            if os.path.exists(predict_path):
                shutil.rmtree(predict_path)
            os.makedirs(predict_path)

            pickle.dump(pred, open(os.path.join(predict_path, 'predictions.pkl'), 'wb'))
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if FLAGS.amp:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"

    if FLAGS.horovod:
        import horovod.tensorflow as hvd
        hvd.init()

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info("  %s" % input_file)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    config = tf.ConfigProto()
    if FLAGS.horovod:
        config.gpu_options.visible_device_list = str(hvd.local_rank())
    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoint_steps
        if not FLAGS.horovod or hvd.rank() == 0 else None,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host),
        # This variable controls how often estimator reports examples/sec.
        # Default value is every 100 steps.
        # When --report_loss is True, we set to very large value to prevent
        # default info reporting from estimator.
        # Ideally we should set it to None, but that does not work.
        log_step_count_steps=10000 if FLAGS.report_loss else 100)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=FLAGS.num_train_steps,
        num_warmup_steps=FLAGS.num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
        hvd=None if not FLAGS.horovod else hvd)

    training_hooks = []
    if FLAGS.horovod and hvd.size() > 1:
        training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
    if FLAGS.report_loss:
        global_batch_size = (FLAGS.train_batch_size if not FLAGS.horovod
                             else FLAGS.train_batch_size * hvd.size())
        training_hooks.append(
            _LogSessionRunHook(global_batch_size, 1,
                               -1 if not FLAGS.horovod else hvd.rank()))

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)

    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        train_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True,
            hvd=None if not FLAGS.horovod else hvd)
        estimator.train(input_fn=train_input_fn,
                        hooks=training_hooks,
                        max_steps=FLAGS.num_train_steps)

    if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0):
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        eval_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False,
            hvd=None if not FLAGS.horovod else hvd)
        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=FLAGS.max_eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
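# Hedged sketch: what passing `hvd` into model_fn_builder() typically enables
# inside the resulting model_fn. The builder itself is defined elsewhere and
# not shown, so this is an illustrative stand-in (the real BERT code uses a
# custom weight-decay optimizer rather than plain Adam).
def create_optimizer_sketch(loss, learning_rate, hvd=None):
    if hvd is not None:
        learning_rate = learning_rate * hvd.size()   # linear LR scaling
    optimizer = tf.train.AdamOptimizer(learning_rate)
    if hvd is not None:
        # Allreduce gradients across workers before applying them.
        optimizer = hvd.DistributedOptimizer(optimizer)
    return optimizer.minimize(
        loss, global_step=tf.train.get_or_create_global_step())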
def main(_):
    hvd.init()
    FLAGS.output_dir = FLAGS.output_dir if hvd.rank() == 0 else os.path.join(
        FLAGS.output_dir, str(hvd.rank()))
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
        "cla": ClaProcessor,
        "pair": PairProcessor
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host),
        log_step_count_steps=25,
        session_config=config)

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
        # Shard the step budget across Horovod workers.
        num_train_steps = num_train_steps // hvd.size()
        num_warmup_steps = num_warmup_steps // hvd.size()

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        hooks = [hvd.BroadcastGlobalVariablesHook(0)]
        estimator.train(input_fn=train_input_fn,
                        max_steps=num_train_steps,
                        hooks=hooks)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the
            # number of examples must be a multiple of the batch size, or else
            # examples will get dropped. So we pad with fake examples which are
            # ignored later on. These do NOT count towards the metric (all
            # tf.metrics support a per-instance weight, and these get a weight
            # of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        # Evaluate all checkpoints; you can use the checkpoint with the best
        # dev accuracy.
        steps_and_files = []
        filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
        for filename in filenames:
            if filename.endswith(".index"):
                ckpt_name = filename[:-6]
                cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
                global_step = int(cur_filename.split("-")[-1])
                tf.logging.info("Add {} to eval list.".format(cur_filename))
                steps_and_files.append([global_step, cur_filename])
        steps_and_files = sorted(steps_and_files, key=lambda x: x[0])

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        print("output_eval_file:", output_eval_file)
        tf.logging.info("output_eval_file:" + output_eval_file)
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
                result = estimator.evaluate(input_fn=eval_input_fn,
                                            steps=eval_steps,
                                            checkpoint_path=filename)
                tf.logging.info("***** Eval results %s *****" % (filename))
                writer.write("***** Eval results %s *****\n" % (filename))
                for key in sorted(result.keys()):
                    tf.logging.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        # Single-checkpoint evaluation, kept for reference:
        # result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        # output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        # with tf.gfile.GFile(output_eval_file, "w") as writer:
        #     tf.logging.info("***** Eval results *****")
        #     for key in sorted(result.keys()):
        #         tf.logging.info("  %s = %s", key, str(result[key]))
        #         writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        true_labels = []
        with open(os.path.join(FLAGS.data_dir, "test.tsv"), 'r',
                  encoding='utf-8') as f:
            for line in f.readlines():
                line = line.strip()
                true_labels.append(int(line.split('\t')[0]))

        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the
            # number of examples must be a multiple of the batch size, or else
            # examples will get dropped. So we pad with fake examples which are
            # ignored later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                predict_file)

        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)

        predictions = []
        output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                a = probabilities.tolist()
                predictions.append(a.index(max(a)))
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples

        count = 0
        for i in range(len(predictions)):
            if predictions[i] == true_labels[i]:
                count += 1
        print("Average accuracy: ", count / len(predictions))

        with open(os.path.join(FLAGS.data_dir, "id2label.json"), 'r',
                  encoding='utf-8') as f:
            id2label = json.load(f)
        cla_labels = [i for i in range(FLAGS.cla_nums)]
        report = metrics.classification_report(
            y_true=true_labels,
            y_pred=predictions,
            labels=cla_labels,
            target_names=[id2label[str(i)].split()[0] for i in cla_labels],
            digits=4)
        confusion_matrix = metrics.confusion_matrix(y_true=true_labels,
                                                    y_pred=predictions,
                                                    labels=cla_labels)
        print(report)
        print(confusion_matrix)
        with open(os.path.join(FLAGS.output_dir, "eval_report.txt"), 'w',
                  encoding='utf-8') as f:
            f.write(report)
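# Hedged worked example: how the per-worker step count in main() above is
# derived. With 9,600 train examples, per-GPU batch 32, 3 epochs, and 4
# Horovod workers (numbers chosen for illustration):
num_train_steps = int(9600 / 32 * 3)    # 900 optimizer steps on one GPU
num_train_steps = num_train_steps // 4  # 225 steps per worker; the effective
                                        # global batch is 32 * 4 = 128, so the
                                        # data is still covered ~3 times total.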
def main(_):
    # Initialize Horovod.
    hvd.init()

    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
    # Set inter_op to 1 and intra_op to the number of physical cores.
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False,
                            intra_op_parallelism_threads=FLAGS.num_cpu_threads,
                            inter_op_parallelism_threads=1,
                            gpu_options=gpu_options)

    num_gpus = validate_batch_size_for_multi_gpu(FLAGS.batch_size)

    # Horovod: save checkpoints only on worker 0 to prevent other workers
    # from corrupting them.
    model_dir_hvd = FLAGS.model_dir if hvd.rank() == 0 else None

    # Set up a RunConfig to only save checkpoints once per training cycle.
    run_config = tf.estimator.RunConfig().replace(
        save_checkpoints_secs=FLAGS.save_checkpoints_secs,
        save_checkpoints_steps=None,
        save_summary_steps=FLAGS.save_summary_steps,
        keep_checkpoint_max=5,
        tf_random_seed=FLAGS.tf_random_seed,
        log_step_count_steps=FLAGS.log_every_n_steps,
        session_config=config,
        model_dir=model_dir_hvd)

    replicate_ssd_model_fn = tf.contrib.estimator.replicate_model_fn(
        ssd_model_fn, loss_reduction=tf.losses.Reduction.MEAN)
    ssd_detector = tf.estimator.Estimator(
        model_fn=replicate_ssd_model_fn,
        model_dir=model_dir_hvd,
        config=run_config,
        params={
            'num_gpus': num_gpus,
            'data_format': FLAGS.data_format,
            'batch_size': FLAGS.batch_size,
            'model_scope': FLAGS.model_scope,
            'num_classes': FLAGS.num_classes,
            'negative_ratio': FLAGS.negative_ratio,
            'match_threshold': FLAGS.match_threshold,
            'neg_threshold': FLAGS.neg_threshold,
            'weight_decay': FLAGS.weight_decay,
            'momentum': FLAGS.momentum,
            'learning_rate': FLAGS.learning_rate,
            'end_learning_rate': FLAGS.end_learning_rate,
            'decay_boundaries': parse_comma_list(FLAGS.decay_boundaries),
            'lr_decay_factors': parse_comma_list(FLAGS.lr_decay_factors),
        })

    tensors_to_log = {
        'lr': 'learning_rate_log',
        'ce': 'cross_entropy_loss',
        'loc': 'location_loss',
        'loss': 'total_loss',
        'l2': 'l2_loss',
        'acc': 'post_forward/cls_accuracy',
    }
    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log,
        every_n_iter=FLAGS.log_every_n_steps,
        formatter=lambda dicts: (', '.join(['%s=%.6f' % (k, v)
                                            for k, v in dicts.items()])))

    # Horovod: broadcast initial variable states from rank 0 to all other
    # processes. This ensures consistent initialization when training is
    # started from random weights or a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
    # hook = tf.train.ProfilerHook(save_steps=50, output_dir='.', show_memory=True)

    print('Starting a training cycle.')
    ssd_detector.train(
        input_fn=input_pipeline(dataset_pattern='train-*',
                                is_training=True,
                                batch_size=FLAGS.batch_size),
        hooks=[logging_hook, bcast_hook],
        max_steps=FLAGS.max_number_of_steps)
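# Hedged sketch: a plausible implementation of the parse_comma_list() helper
# used above. The real helper lives elsewhere in the repository, so this is
# an assumption about its behavior, not the actual source.
def parse_comma_list(args):
    return [float(s.strip()) for s in args.split(',')]

# e.g. parse_comma_list('0.1, 0.01, 0.001') -> [0.1, 0.01, 0.001]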
def train_eval_fn(FLAGS,
                  worker_count,
                  task_index,
                  is_chief,
                  target,
                  init_checkpoint,
                  train_file,
                  dev_file,
                  checkpoint_dir,
                  is_debug):

    graph = tf.Graph()
    with graph.as_default():
        import json
        config = json.load(open(FLAGS.config_file, "r"))
        config = Bunch(config)
        config.use_one_hot_embeddings = True
        config.scope = "bert"
        config.dropout_prob = 0.1
        config.label_type = "single_label"

        if FLAGS.if_shard == "0":
            train_size = FLAGS.train_size
            epoch = int(FLAGS.epoch / worker_count)
        elif FLAGS.if_shard == "1":
            train_size = int(FLAGS.train_size / worker_count)
            epoch = FLAGS.epoch

        init_lr = 2e-5

        label_dict = json.load(open(FLAGS.label_id))

        num_train_steps = int(train_size / FLAGS.batch_size * epoch)
        num_warmup_steps = int(num_train_steps * 0.1)
        num_storage_steps = int(train_size / FLAGS.batch_size)
        num_eval_steps = int(FLAGS.eval_size / FLAGS.batch_size)

        if is_debug == "0":
            num_storage_steps = 2
            num_eval_steps = 10
            num_train_steps = 10

        print("num_train_steps {}, num_eval_steps {}, num_storage_steps {}".format(
            num_train_steps, num_eval_steps, num_storage_steps))
        print(" model type {}".format(FLAGS.model_type))
        print(num_train_steps, num_warmup_steps, "=============")

        opt_config = Bunch({
            "init_lr": init_lr / worker_count,
            "num_train_steps": num_train_steps,
            "num_warmup_steps": num_warmup_steps,
            "worker_count": worker_count,
            "opt_type": FLAGS.opt_type
        })

        model_io_config = Bunch({"fix_lm": False})

        model_io_fn = model_io.ModelIO(model_io_config)
        optimizer_fn = optimizer.Optimizer(opt_config)

        num_classes = FLAGS.num_classes

        model_train_fn = model_fn_builder(config, num_classes, init_checkpoint,
                                          model_reuse=None,
                                          load_pretrained=True,
                                          model_io_fn=model_io_fn,
                                          optimizer_fn=optimizer_fn,
                                          model_io_config=model_io_config,
                                          opt_config=opt_config,
                                          exclude_scope="",
                                          not_storage_params=[],
                                          target="")

        model_eval_fn = model_fn_builder(config, num_classes, init_checkpoint,
                                         model_reuse=True,
                                         load_pretrained=True,
                                         model_io_fn=model_io_fn,
                                         optimizer_fn=optimizer_fn,
                                         model_io_config=model_io_config,
                                         opt_config=opt_config,
                                         exclude_scope="",
                                         not_storage_params=[],
                                         target="")

        def eval_metric_fn(features, eval_op_dict):
            logits = eval_op_dict["logits"]
            print(logits.get_shape(), "===logits shape===")
            pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
            prob = tf.nn.softmax(logits)
            correct = tf.equal(
                tf.cast(pred_label, tf.int32),
                tf.cast(features["label_ids"], tf.int32))
            accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
            return {"accuracy": accuracy,
                    "loss": eval_op_dict["loss"],
                    "pred_label": pred_label,
                    "label_ids": features["label_ids"]}

        def train_metric_fn(features, train_op_dict):
            logits = train_op_dict["logits"]
            print(logits.get_shape(), "===logits shape===")
            pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
            prob = tf.nn.softmax(logits)
            correct = tf.equal(
                tf.cast(pred_label, tf.int32),
                tf.cast(features["label_ids"], tf.int32))
            accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
            return {"accuracy": accuracy,
                    "loss": train_op_dict["loss"],
                    "train_op": train_op_dict["train_op"]}

        name_to_features = {
            "input_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "input_mask": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "segment_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "label_ids": tf.FixedLenFeature([], tf.int64),
        }

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example."""
            example = tf.parse_single_example(record, name_to_features)

            # tf.Example only supports tf.int64, but the TPU only supports
            # tf.int32, so cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t
            return example

        params = Bunch({})
        params.epoch = FLAGS.epoch
        params.batch_size = FLAGS.batch_size

        train_features = tf_data_utils.train_input_fn(
            train_file, _decode_record, name_to_features, params,
            if_shard=FLAGS.if_shard,
            worker_count=worker_count,
            task_index=task_index)

        eval_features = tf_data_utils.eval_input_fn(
            dev_file, _decode_record, name_to_features, params,
            if_shard=FLAGS.if_shard,
            worker_count=worker_count,
            task_index=task_index)

        train_op_dict = model_train_fn(train_features, [],
                                       tf.estimator.ModeKeys.TRAIN)
        eval_op_dict = model_eval_fn(eval_features, [],
                                     tf.estimator.ModeKeys.EVAL)

        eval_dict = eval_metric_fn(eval_features, eval_op_dict["eval"])
        train_dict = train_metric_fn(train_features, train_op_dict["train"])

        def eval_fn(eval_dict, sess):
            i = 0
            total_accuracy = 0
            eval_total_dict = {}
            while True:
                try:
                    eval_result = sess.run(eval_dict)
                    for key in eval_result:
                        if key not in eval_total_dict:
                            if key in ["pred_label", "label_ids"]:
                                eval_total_dict[key] = []
                                eval_total_dict[key].extend(eval_result[key])
                            if key in ["accuracy", "loss"]:
                                eval_total_dict[key] = 0.0
                                eval_total_dict[key] += eval_result[key]
                        else:
                            if key in ["pred_label", "label_ids"]:
                                eval_total_dict[key].extend(eval_result[key])
                            if key in ["accuracy", "loss"]:
                                eval_total_dict[key] += eval_result[key]
                    i += 1
                    if np.mod(i, num_eval_steps) == 0:
                        break
                except tf.errors.OutOfRangeError:
                    print("End of dataset")
                    break
            label_id = eval_total_dict["label_ids"]
            pred_label = eval_total_dict["pred_label"]
            result = classification_report(
                label_id, pred_label,
                target_names=list(label_dict["label2id"].keys()))
            print(result, task_index)
            eval_total_dict["classification_report"] = result
            return eval_total_dict

        def train_fn(train_op_dict, sess):
            i = 0
            cnt = 0
            loss_dict = {}
            monitoring_train = []
            monitoring_eval = []
            while True:
                try:
                    [train_result, step] = sess.run(
                        [train_op_dict, tf.train.get_global_step()])
                    for key in train_result:
                        if key == "train_op":
                            continue
                        if np.isnan(train_result[key]):
                            print(train_result[key], "get nan loss")
                            break
                        if key in loss_dict:
                            loss_dict[key] += train_result[key]
                        else:
                            loss_dict[key] = train_result[key]
                    i += 1
                    cnt += 1
                    if np.mod(i, num_storage_steps) == 0:
                        string = ""
                        for key in loss_dict:
                            tmp = key + " " + str(loss_dict[key] / cnt) + "\t"
                            string += tmp
                        print(string, step)
                        monitoring_train.append(loss_dict)

                        eval_finial_dict = eval_fn(eval_dict, sess)
                        monitoring_eval.append(eval_finial_dict)

                        for key in loss_dict:
                            loss_dict[key] = 0.0
                        cnt = 0
                    if is_debug == "0":
                        if i == num_train_steps:
                            break
                except tf.errors.OutOfRangeError:
                    print("==Succeeded in training model==")
                    break  # end of dataset reached

        print("===========begin to train============")
        sess_config = tf.ConfigProto(allow_soft_placement=False,
                                     log_device_placement=False)

        # Save checkpoints only on the chief worker.
        checkpoint_dir = checkpoint_dir if task_index == 0 else None
        print("==checkpoint_dir==", checkpoint_dir)
        print("start training")

        hooks = []
        if FLAGS.opt_type == "ps":
            sync_replicas_hook = optimizer_fn.opt.make_session_run_hook(
                is_chief, num_tokens=0)
            hooks.append(sync_replicas_hook)
            sess = tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=is_chief,
                config=sess_config,
                hooks=hooks,
                checkpoint_dir=checkpoint_dir,
                save_checkpoint_steps=num_storage_steps)
        elif FLAGS.opt_type == "pai_soar" and pai:
            sess = tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=is_chief,
                config=sess_config,
                hooks=hooks,
                checkpoint_dir=checkpoint_dir,
                save_checkpoint_steps=num_storage_steps)
        elif FLAGS.opt_type == "hvd" and hvd:
            bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
            hooks.append(bcast_hook)
            sess_config.gpu_options.allow_growth = True
            sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
            sess = tf.train.MonitoredTrainingSession(
                checkpoint_dir=checkpoint_dir,
                hooks=hooks,
                config=sess_config,
                save_checkpoint_steps=num_storage_steps)
        else:
            print("==single sess==")
            sess = tf.train.MonitoredTrainingSession(
                config=sess_config,
                hooks=hooks,
                checkpoint_dir=checkpoint_dir,
                save_checkpoint_steps=num_storage_steps)

        step = sess.run(optimizer_fn.global_step)
        print(step)

        train_fn(train_dict, sess)

        if task_index == 0:
            print("===========begin to eval============")
            eval_finial_dict = eval_fn(eval_dict, sess)
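# Hedged sketch: the metric accumulation in eval_fn above sums "accuracy"
# and "loss" over steps without dividing by the step count; a minimal
# averaged variant would look like this. All names here are illustrative.
def averaged_eval(sess, eval_dict, num_eval_steps):
    totals = {"accuracy": 0.0, "loss": 0.0}
    for _ in range(num_eval_steps):
        result = sess.run(eval_dict)
        for key in totals:
            totals[key] += result[key]
    return {key: value / num_eval_steps for key, value in totals.items()}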
def __init__(self, model_info):
    # Horovod: initialize Horovod.
    print(hvd.size())

    # First parse the application info and define the network.
    self.game_name = model_info[0]

    # Set remaining variables.
    self.batch_size = 128
    self.learning_rate = model_info[5]

    if self.game_name == 'min_cover_s2v':
        self.num_nodes = model_info[1]
        self.embed_dim = model_info[2]
        self.batch_size = 2
        self.epochs = 50

        ### Define placeholders as input.
        # Input of the model (X decided by selected or not).
        self.x = tf.placeholder(shape=[None, self.num_nodes], dtype=tf.float32)
        self.ad_matrix = tf.placeholder(
            shape=[None, self.num_nodes, self.num_nodes], dtype=tf.float32)
        # Re-declared with fully dynamic shapes (overrides the fixed-size
        # placeholders above, as in the original code).
        self.x = tf.placeholder(shape=[None, None], dtype=tf.float32)
        self.ad_matrix = tf.placeholder(shape=[None, None, None],
                                        dtype=tf.float32)
        # Label value.
        self.y = tf.placeholder(shape=[1, None], dtype=tf.float32)

        # Used to obtain mu_v.
        self.select_vec = tf.placeholder(shape=[None, None, 1],
                                         dtype=tf.float32)
        # Used to obtain sigma mu_u.
        self.all_vec = tf.placeholder(shape=[None, None, 1],
                                      dtype=tf.float32)

        self.init_limit = 0.5
        self.embed_layer = 5
        self.hid_num_units = 16

        def _uniform(shape):
            # Same initializer the original repeated for every parameter.
            return tf.random_uniform(shape,
                                     minval=-self.init_limit,
                                     maxval=self.init_limit,
                                     dtype=tf.float32, seed=1)

        ### Define parameters for the embedding model.
        self.embed_param = {
            'theta1': tf.Variable(_uniform([self.embed_dim, 1])),
            'theta2': tf.Variable(_uniform([self.embed_dim, self.embed_dim])),
            'theta3': tf.Variable(_uniform([self.embed_dim, self.embed_dim])),
            'theta4': tf.Variable(_uniform([self.embed_dim, 1]))
        }

        ### Define parameters for the Q model.
        self.value_param = {
            'hid': tf.Variable(_uniform([2 * self.embed_dim, self.hid_num_units])),
            'theta5': tf.Variable(_uniform([self.hid_num_units, 1])),
            'theta6': tf.Variable(_uniform([self.embed_dim, self.embed_dim])),
            'theta7': tf.Variable(_uniform([self.embed_dim, self.embed_dim]))
        }

        self.size = tf.shape(self.ad_matrix)[0]
        self.size_nodes = tf.shape(self.ad_matrix)[1]

        ### First level of embedding.
        self.theta4_relu = tf.nn.relu(
            tf.tile(self.embed_param['theta4'],
                    tf.stack([1, self.size_nodes])))
        # tf.broadcast_to is only supported after 1.13.0, so tile instead:
        # self.theta4_relu = tf.broadcast_to(
        #     self.theta4_relu, [self.size, self.embed_dim, self.num_nodes])
        self.theta4_relu = tf.expand_dims(self.theta4_relu, 0)
        self.theta4_relu = tf.tile(self.theta4_relu, [self.size, 1, 1])

        self.theta3 = tf.expand_dims(self.embed_param['theta3'], 0)
        self.theta3 = tf.tile(self.theta3, [self.size, 1, 1])
        self.mu_0_sigma = tf.matmul(self.theta3,
                                    tf.matmul(self.theta4_relu, self.ad_matrix))

        self.theta1 = tf.expand_dims(self.embed_param['theta1'], 0)
        self.theta1 = tf.tile(self.theta1, [self.size, 1, 1])

        self.x_reshape = tf.expand_dims(self.x, 0)
        self.x_reshape = tf.transpose(self.x_reshape, perm=[1, 0, 2])

        self.mu_0 = tf.nn.relu(tf.add(tf.matmul(self.theta1, self.x_reshape),
                                      self.mu_0_sigma))

        ### Second level of embedding.
        self.theta2 = tf.expand_dims(self.embed_param['theta2'], 0)
        self.theta2 = tf.tile(self.theta2, [self.size, 1, 1])
        self.mu_1_sigma = tf.matmul(self.theta2,
                                    tf.matmul(self.mu_0, self.ad_matrix))
        self.mu_1 = tf.add(tf.matmul(self.theta1, self.x_reshape),
                           self.mu_1_sigma)
        self.mu_1 = tf.nn.relu(tf.add(self.mu_1, self.mu_1_sigma))

        ### Remaining embedding layers.
        for i in range(2, self.embed_layer):
            self.mu_1_sigma = tf.matmul(self.theta2,
                                        tf.matmul(self.mu_1, self.ad_matrix))
            self.mu_1 = tf.add(tf.matmul(self.theta1, self.x_reshape),
                               self.mu_1_sigma)
            self.mu_1 = tf.nn.relu(tf.add(self.mu_1, self.mu_1_sigma))

        ### Q value function.
        self.Q_sigma = tf.matmul(self.mu_1, self.all_vec)
        self.Q_sigma = tf.matmul(self.value_param['theta6'],
                                 tf.reshape(self.Q_sigma, [self.embed_dim, -1]))
        self.mu_u = tf.matmul(self.mu_1, self.select_vec)
        self.mu_u = tf.matmul(self.value_param['theta7'],
                              tf.reshape(self.mu_u, [self.embed_dim, -1]))
        self.Q_vec = tf.nn.relu(tf.concat([self.Q_sigma, self.mu_u], 0))
        self.output_hidden = tf.matmul(tf.transpose(self.value_param['hid']),
                                       self.Q_vec)
        self.output_layer = tf.matmul(tf.transpose(self.value_param['theta5']),
                                      self.output_hidden)
    else:
        self.epochs = 3
        self.in_num_units = model_info[1]
        self.hid_num_units = model_info[2]
        self.out_num_units = model_info[3]

        # Define placeholders.
        self.x = tf.placeholder(shape=[None, self.in_num_units],
                                dtype=tf.float32)
        self.y = tf.placeholder(shape=[None, self.out_num_units],
                                dtype=tf.float32)

        ### Define weights and biases of the neural network.
        self.weights = {
            'hidden1': tf.Variable(tf.random_normal(
                [self.in_num_units, self.hid_num_units], seed=1)),
            'hidden2': tf.Variable(tf.random_normal(
                [self.hid_num_units, int(self.hid_num_units / 2)], seed=1)),
            'output': tf.Variable(tf.random_normal(
                [int(self.hid_num_units / 2), self.out_num_units], seed=1))
        }
        self.biases = {
            'hidden1': tf.Variable(tf.random_normal([self.hid_num_units], seed=1)),
            'hidden2': tf.Variable(tf.random_normal([int(self.hid_num_units / 2)], seed=1)),
            'output': tf.Variable(tf.random_normal([self.out_num_units], seed=1))
        }

        self.hidden_layer1 = tf.add(tf.matmul(self.x, self.weights['hidden1']),
                                    self.biases['hidden1'])
        self.hidden_layer1 = tf.nn.relu(self.hidden_layer1)
        self.hidden_layer2 = tf.add(tf.matmul(self.hidden_layer1,
                                              self.weights['hidden2']),
                                    self.biases['hidden2'])
        self.hidden_layer2 = tf.nn.relu(self.hidden_layer2)
        self.output_layer = (tf.matmul(self.hidden_layer2, self.weights['output'])
                             + self.biases['output'])
        # self.output_layer = tf.nn.relu(self.output_layer)

    self.cost = tf.reduce_sum(tf.square(self.output_layer - self.y))
    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
    # self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate)

    global_step = tf.train.get_or_create_global_step()
    # Horovod: wrap the optimizer so gradients are averaged across workers.
    self.hvd_opt = hvd.DistributedOptimizer(self.optimizer)
    self.train_op = self.hvd_opt.minimize(self.cost, global_step=global_step)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # config.gpu_options.per_process_gpu_memory_fraction = 1
    # Horovod: pin GPU to be used to process local rank (one GPU per process).
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    print(config.gpu_options.visible_device_list)

    self.sess = tf.Session(config=config)
    self.sess.run(tf.global_variables_initializer())
    bcast = hvd.broadcast_global_variables(0)
    print("it has been initialized")

    self.hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable
        # states from rank 0 to all other processes. This is necessary to
        # ensure consistent initialization of all workers when training is
        # started with random weights or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),
        # Horovod: adjust number of steps based on number of GPUs.
        # tf.train.StopAtStepHook(last_step=self.epochs // hvd.size()),
        # tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': self.cost},
        #                            every_n_iter=10),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers
    # from corrupting them.
    self.checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    # with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
    #                                        hooks=self.hooks,
    #                                        config=config) as mon_sess:
    #     while not mon_sess.should_stop():
    #         # Run a training step synchronously.
    #         image_, label_ = next(training_batch_generator)
    #         mon_sess.run(train_op, feed_dict={image: image_, label: label_})
    self.mon_sess = tf.train.MonitoredTrainingSession(hooks=self.hooks,
                                                      config=config)
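# Hedged usage sketch: driving the graph built above for one training step in
# the 'min_cover_s2v' branch. It assumes `model` is a constructed instance of
# this class; shapes follow the placeholder definitions (a batch of adjacency
# matrices plus per-node selection vectors), and the random arrays are
# stand-ins for real graph data.
import numpy as np

batch, n = 2, 10
feed = {
    model.x: np.random.rand(batch, n).astype(np.float32),
    model.ad_matrix: np.random.rand(batch, n, n).astype(np.float32),
    model.y: np.random.rand(1, batch).astype(np.float32),
    model.select_vec: np.random.rand(batch, n, 1).astype(np.float32),
    model.all_vec: np.ones((batch, n, 1), dtype=np.float32),
}
_, loss_value = model.mon_sess.run([model.train_op, model.cost],
                                   feed_dict=feed)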
def main(_):
    hvd.init()

    # Only see a single unique GPU based on process rank.
    config = tf.ConfigProto()
    # We don't need allow_growth=True as we will use a whole GPU per process.
    # config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Only one of the workers saves checkpoints and summaries in the
    # model directory.
    if hvd.rank() == 0:
        config = tf.estimator.RunConfig(
            model_dir=FLAGS.model_dir,
            keep_checkpoint_every_n_hours=5,
            save_summary_steps=100,
            save_checkpoints_secs=60 * 5,
            session_config=config)
    else:
        config = tf.estimator.RunConfig(
            session_config=config,
            keep_checkpoint_max=1)

    if FLAGS.mobilenet_checkpoint_path is not None:
        # ^((?!badword).)*$ matches all strings which do not contain the badword.
        ws = tf.estimator.WarmStartSettings(
            ckpt_to_initialize_from=FLAGS.mobilenet_checkpoint_path,
            vars_to_warm_start='.*' if FLAGS.restore_last_layer else "^((?!Logits).)*$",
        )
    else:
        ws = None

    estimator = tf.estimator.Estimator(
        model_fn=model.model_fn,
        config=config,
        warm_start_from=ws)

    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
    image_counter_hook = ImageRateHook()

    # Caching file writers, which will then be retrieved during evaluation.
    writer = tf.summary.FileWriter(logdir=FLAGS.model_dir, flush_secs=30)
    eval_writer = tf.summary.FileWriter(
        logdir=os.path.join(FLAGS.model_dir, "eval"), flush_secs=30)

    try:
        steps = estimator.get_variable_value('global_step')
    except ValueError:
        steps = 0

    evaluate_every_n = 1000
    evaluate(estimator, True)
    evaluate(estimator, False)
    # if hvd.rank() == 0 and FLAGS.evaluate:
    #     evaluate(estimator, False)
    # NOTE: this unconditional exit makes the training loop below unreachable.
    sys.exit()

    print("Steps", steps, "Max steps", FLAGS.max_steps)
    while steps < FLAGS.max_steps:
        evaluate_every_n = min(evaluate_every_n, FLAGS.max_steps - steps)
        estimator.train(
            input_fn=lambda: model.imagenet_iterator(is_training=True,
                                                     num_epochs=10000),
            steps=evaluate_every_n,
            hooks=[bcast_hook, image_counter_hook])
        if hvd.rank() == 0 and FLAGS.evaluate:
            # Evaluate on training set only for metric_learning.
            if FLAGS.model in ['metric_learning', 'cifar100']:
                evaluate(estimator, True)
                evaluate(estimator, False)
            else:
                evaluate(estimator, False)
        steps += evaluate_every_n
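# Hedged sketch: what the warm-start regex above selects. The negative
# lookahead "^((?!Logits).)*$" matches every variable whose name does NOT
# contain "Logits", i.e. everything except the classification head. Variable
# names below are illustrative.
import re

pattern = re.compile(r'^((?!Logits).)*$')
assert pattern.match('MobilenetV2/Conv/weights')             # warm-started
assert not pattern.match('MobilenetV2/Logits/Conv2d_1c')     # randomly re-init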
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

hvd.init()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
classifier = tf.estimator.Estimator(
    model_fn=my_model,
    model_dir='./checkpoints_inceptionv3_%s' % hvd.rank(),
    # params={
    #     'feature_columns': my_feature_columns,
    #     'hidden_units': [10, 10],  # Two hidden layers of 10 nodes each.
    #     'n_classes': 3,  # The model must choose between 3 classes.
    # }
)
bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
train = classifier.train(input_fn=train_input_fn, steps=100, hooks=[bcast_hook])
eval_result = classifier.evaluate(input_fn=test_input_fn, steps=10)
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
def train_schedule( estimator, train_eval_iterations, single_iteration_train_steps=None, single_iteration_train_epochs=None, bleu_source=None, bleu_ref=None, bleu_threshold=None): """Train and evaluate model, and optionally compute model's BLEU score. **Step vs. Epoch vs. Iteration** Steps and epochs are canonical terms used in TensorFlow and general machine learning. They are used to describe running a single process (train/eval): - Step refers to running the process through a single or batch of examples. - Epoch refers to running the process through an entire dataset. E.g. training a dataset with 100 examples. The dataset is divided into 20 batches with 5 examples per batch. A single training step trains the model on one batch. After 20 training steps, the model will have trained on every batch in the dataset, or, in other words, one epoch. Meanwhile, iteration is used in this implementation to describe running multiple processes (training and eval). - A single iteration: 1. trains the model for a specific number of steps or epochs. 2. evaluates the model. 3. (if source and ref files are provided) compute BLEU score. This function runs through multiple train+eval+bleu iterations. Args: estimator: tf.Estimator containing model to train. train_eval_iterations: Number of times to repeat the train+eval iteration. single_iteration_train_steps: Number of steps to train in one iteration. single_iteration_train_epochs: Number of epochs to train in one iteration. bleu_source: File containing text to be translated for BLEU calculation. bleu_ref: File containing reference translations for BLEU calculation. bleu_threshold: minimum BLEU score before training is stopped. Raises: ValueError: if both or none of single_iteration_train_steps and single_iteration_train_epochs were defined. """ # Ensure that exactly one of single_iteration_train_steps and # single_iteration_train_epochs is defined. if single_iteration_train_steps is None: if single_iteration_train_epochs is None: raise ValueError( "Exactly one of single_iteration_train_steps or " "single_iteration_train_epochs must be defined. Both were none.") else: if single_iteration_train_epochs is not None: raise ValueError( "Exactly one of single_iteration_train_steps or " "single_iteration_train_epochs must be defined. Both were defined.") evaluate_bleu = bleu_source is not None and bleu_ref is not None # Print out training schedule print("Training schedule:") if single_iteration_train_epochs is not None: print("\t1. Train for %d epochs." % single_iteration_train_epochs) else: print("\t1. Train for %d steps." % single_iteration_train_steps) print("\t2. Evaluate model.") if evaluate_bleu: print("\t3. Compute BLEU score.") if bleu_threshold is not None: print("Repeat above steps until the BLEU score reaches", bleu_threshold) if not evaluate_bleu or bleu_threshold is None: print("Repeat above steps %d times." % train_eval_iterations) if evaluate_bleu: # Set summary writer to log bleu score. bleu_writer = tf.compat.v1.summary.FileWriter( os.path.join(estimator.model_dir, BLEU_DIR)) if bleu_threshold is not None: # Change loop stopping condition if bleu_threshold is defined. 
train_eval_iterations = INF # Loop training/evaluation/bleu cycles mlperf_log.transformer_print(key=mlperf_log.TRAIN_LOOP) # Profiling with timeline if FLAGS.save_profile == "Yes": profile_hooks = [tf.compat.v1.train.ProfilerHook(save_steps=1, output_dir=FLAGS.profile_dir)] # the json file #profile file will be saved in in profile_dir #Creating hooks for printing Examples per Second, used with estimator.train training_batch_size = estimator.params.batch_size if FLAGS.batch_size != -1: training_batch_size = FLAGS.batch_size train_hooks = hooks_helper.get_train_hooks( ["ExamplesPerSecondHook"], model_dir=FLAGS.model_dir, batch_size=training_batch_size, every_n_steps=FLAGS.print_iter, warm_steps=50 ) if FLAGS.save_profile == "Yes": hooks = profile_hooks else: hooks = train_hooks for i in xrange(train_eval_iterations): print("Starting iteration", i + 1) if single_iteration_train_epochs is not None: mlperf_log.transformer_print(key=mlperf_log.TRAIN_EPOCH, value=i * single_iteration_train_epochs + 1) #Can we move the following out of the loop if is_mpi: train_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) # Train the model for single_iteration_train_steps or until the input fn # runs out of examples (if single_iteration_train_steps is None). estimator.train(dataset.train_input_fn, steps=single_iteration_train_steps, hooks=hooks) mlperf_log.transformer_print(key=mlperf_log.EVAL_START) # To save training time, we can turn off evaluation # Otherwise it will be turned on if FLAGS.do_eval == "Yes": eval_results = estimator.evaluate(dataset.eval_input_fn) print("Evaluation results (iter %d/%d):" % (i + 1, train_eval_iterations), eval_results) if evaluate_bleu: uncased_score, _ = evaluate_and_log_bleu( estimator, bleu_writer, bleu_source, bleu_ref) if bleu_threshold is not None and uncased_score > bleu_threshold: bleu_writer.close() break mlperf_log.transformer_print(key=mlperf_log.EVAL_TARGET, value=bleu_threshold) mlperf_log.transformer_print(key=mlperf_log.EVAL_ACCURACY, value=uncased_score) mlperf_log.transformer_print(key=mlperf_log.EVAL_STOP)
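The docstring above works through the bookkeeping in words: 100 examples split into 20 batches of 5 means one epoch is 20 steps. The same arithmetic as a tiny helper (a sketch; `train_schedule` itself receives steps or epochs already resolved by its caller):

def epochs_to_steps(num_epochs, num_examples, global_batch_size):
    # One epoch = one pass over the dataset = num_examples / batch_size steps.
    steps_per_epoch = num_examples // global_batch_size
    return num_epochs * steps_per_epoch

assert epochs_to_steps(1, 100, 5) == 20   # the docstring's example
assert epochs_to_steps(3, 100, 5) == 60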
def train(infer_func, params): image_width = params['image_width'] image_height = params['image_height'] image_format = params['image_format'] batch_size = params['batch_size'] distort_color = params['distort_color'] data_dir = params['data_dir'] data_idx_dir = params['data_idx_dir'] log_dir = params['log_dir'] precision = params['precision'] momentum = params['momentum'] learning_rate_init = params['learning_rate_init'] learning_rate_power = params['learning_rate_power'] weight_decay = params['weight_decay'] loss_scale = params['loss_scale'] larc_eta = params['larc_eta'] larc_mode = params['larc_mode'] num_iter = params['num_iter'] checkpoint_secs = params['checkpoint_secs'] display_every = params['display_every'] iter_unit = params['iter_unit'] use_dali = params['use_dali'] # Determinism is not fully supported by all TF ops. # Disabling until remaining wrinkles can be ironed out. deterministic = False if deterministic: tf.set_random_seed(2 * (1 + hvd.rank())) random.seed(3 * (1 + hvd.rank())) np.random.seed(2) log_dir = None if log_dir == "" else log_dir data_dir = None if data_dir == "" else data_dir data_idx_dir = None if data_idx_dir == "" else data_idx_dir global_batch_size = batch_size * hvd.size() if data_dir is not None: filename_pattern = os.path.join(data_dir, '%s-*') train_filenames = sorted(tf.gfile.Glob(filename_pattern % 'train')) num_training_samples = _get_num_records(train_filenames) else: num_training_samples = global_batch_size train_idx_filenames = None if data_idx_dir is not None: filename_pattern = os.path.join(data_idx_dir, '%s-*') train_idx_filenames = sorted(tf.gfile.Glob(filename_pattern % 'train')) if iter_unit.lower() == 'epoch': nstep = num_training_samples * num_iter // global_batch_size decay_steps = nstep else: nstep = num_iter num_epochs = max(nstep * global_batch_size // num_training_samples, 1) decay_steps = 90 * num_training_samples // global_batch_size nstep_per_epoch = num_training_samples // global_batch_size # Horovod: pin GPU to be used to process local rank (one GPU per process) gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7) config = tf.ConfigProto(gpu_options=gpu_options) #config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) config.gpu_options.force_gpu_compatible = True # Force pinned memory config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads config.inter_op_parallelism_threads = max(2, 40//hvd.size()-2) classifier = tf.estimator.Estimator( model_fn=_cnn_model_function, model_dir=log_dir, params={ 'model': infer_func, 'format': image_format, 'dtype' : tf.float16 if precision == 'fp16' else tf.float32, 'momentum' : momentum, 'learning_rate_init' : learning_rate_init, 'learning_rate_power' : learning_rate_power, 'decay_steps' : decay_steps, 'weight_decay' : weight_decay, 'loss_scale' : loss_scale, 'larc_eta' : larc_eta, 'larc_mode' : larc_mode, 'deterministic' : deterministic, 'n_classes': 1000, 'use_dali': use_dali, }, config=tf.estimator.RunConfig( tf_random_seed=2 * (1 + hvd.rank()) if deterministic else None, session_config=config, save_checkpoints_secs=checkpoint_secs if hvd.rank() == 0 else None, save_checkpoints_steps=nstep if hvd.rank() == 0 else None, keep_checkpoint_every_n_hours=3)) print("Training") if not deterministic and not use_dali: num_preproc_threads = 10 elif not deterministic and use_dali: num_preproc_threads = 2 elif deterministic: num_preproc_threads = 1 training_hooks = [hvd.BroadcastGlobalVariablesHook(0), _PrefillStagingAreasHook()] if 
hvd.rank() == 0: training_hooks.append( _LogSessionRunHook(global_batch_size, num_training_samples, display_every)) if data_dir is not None: input_func = lambda: nvutils.image_set( train_filenames, batch_size, image_height, image_width, training=True, distort_color=distort_color, deterministic=deterministic, num_threads=num_preproc_threads, use_dali=use_dali, idx_filenames=train_idx_filenames) else: input_func = lambda: nvutils.fake_image_set( batch_size, image_height, image_width) try: classifier.train( input_fn=input_func, max_steps=nstep, hooks=training_hooks) except KeyboardInterrupt: print("Keyboard interrupt")
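The step bookkeeping near the top of `train` converts `num_iter` into a step budget differently depending on `iter_unit`. Factored out as a sketch (same formulas as above; the ImageNet-sized sample count in the example is only illustrative):

def compute_schedule(iter_unit, num_iter, num_training_samples,
                     global_batch_size):
    if iter_unit.lower() == 'epoch':
        # num_iter counts epochs: one epoch is samples/global_batch steps.
        nstep = num_training_samples * num_iter // global_batch_size
        decay_steps = nstep
    else:
        # num_iter counts steps directly; decay over a nominal 90 epochs.
        nstep = num_iter
        decay_steps = 90 * num_training_samples // global_batch_size
    nstep_per_epoch = num_training_samples // global_batch_size
    return nstep, decay_steps, nstep_per_epoch

# 90 epochs, ~1.28M samples, 8 GPUs x per-GPU batch 256:
print(compute_schedule('epoch', 90, 1281167, 256 * 8))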
def main_train(args, files, tf_config): assert args.logdir != '', 'logdir cannot be empty' logdir = os.path.join(args.logdir, 'tf_output') if os.path.isdir(logdir): do_not_delete = True if args.ngpus > 1: if hvd.rank() == 0: if args.force_continue: do_not_delete = True else: do_not_delete = False else: do_not_delete = True elif HEADLESS: if args.force_continue: do_not_delete = True else: raise ValueError('{} exists'.format(logdir)) else: while True: try: key = input( '{} \n do you want to continue?'.format(logdir)) except NameError: key = 'y' if key == 'y': break elif key == 'n': do_not_delete = False break else: print('invalid key') if not do_not_delete: print('******* Deleting {} *******'.format(logdir)) os.system('rm -r {}'.format(logdir)) else: print('continuing') elif args.ngpus == 1 or hvd.rank() == 0: os.makedirs(logdir) print('logdir is {}'.format(logdir)) tf_output, pc_reader = build_tf_ops( args=args, data_dict=None, files=files, ) train_op, summary_op, tf_data_dict, logger_dict, tf_step = tf_output summary_hook = tf.train.SummarySaverHook( summary_op=summary_op, output_dir=logdir, save_steps=args.save_steps, ) logging_hook = tf.train.LoggingTensorHook( tensors=logger_dict, every_n_iter=args.log_steps, ) hooks = [] if args.ngpus > 1: hooks.append(hvd.BroadcastGlobalVariablesHook(0)) if hvd.rank() == 0: checkpoint_dir = logdir save_checkpoint_secs = 300 hooks += [logging_hook, summary_hook] else: checkpoint_dir = None save_checkpoint_secs = 0 hooks += [logging_hook] else: hooks = [logging_hook, summary_hook] checkpoint_dir = logdir save_checkpoint_secs = 300 if args.init_checkpoint_folder != '': checkpoint = tf.train.latest_checkpoint(args.init_checkpoint_folder) tf.train.init_from_checkpoint(checkpoint, {'/': '/'}) yaml_path = os.path.join(args.logdir, 'args.yaml') with open(yaml_path, 'w') as yaml_file: yaml.dump(args, yaml_file) with tf.train.MonitoredTrainingSession( checkpoint_dir=checkpoint_dir, hooks=hooks, save_summaries_secs=0, save_checkpoint_secs=save_checkpoint_secs, config=tf_config, ) as mon_sess: start_time = time.time() print(time.time() - start_time) writer = SummaryWriterCache.get(logdir) while not mon_sess.should_stop(): # print('hvd rank = {}, current_index = {}, nfiles = {}'.format(current_index, hvd.rank(), len(my_files))) tensor_list = [tf_step, tf_data_dict] if args.training_splits == 'train': tensor_list += [train_op] + tensor_list # _, step, data_dict = mon_sess.run(tensor_list) mon_sess.run(tensor_list)
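The rank-dependent session setup in `main_train` follows the usual Horovod split: every rank gets the broadcast and logging hooks, but only rank 0 writes summaries and checkpoints. A distilled sketch of that decision (a hypothetical helper, not the code above verbatim):

import horovod.tensorflow as hvd

def session_setup(logdir, logging_hook, summary_hook, ngpus):
    if ngpus > 1:
        hooks = [hvd.BroadcastGlobalVariablesHook(0), logging_hook]
        if hvd.rank() == 0:
            hooks.append(summary_hook)
            return hooks, logdir, 300   # chief: checkpoint every 5 minutes
        return hooks, None, 0           # non-chief: no checkpoint dir
    # Single-GPU run: no Horovod hooks needed.
    return [logging_hook, summary_hook], logdir, 300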
def main(_): os.environ[ "TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false" #causes memory fragmentation for bert leading to OOM tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path) if FLAGS.horovod: hvd.init() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "xnli": XnliProcessor, } if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.io.gfile.makedirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) master_process = True training_hooks = [] global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps hvd_rank = 0 config = tf.compat.v1.ConfigProto() if FLAGS.horovod: tf.compat.v1.logging.info("Multi-GPU training with TF Horovod") tf.compat.v1.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(), hvd.rank()) global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size( ) master_process = (hvd.rank() == 0) hvd_rank = hvd.rank() config.gpu_options.visible_device_list = str(hvd.local_rank()) if hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1 run_config = tf.estimator.RunConfig( model_dir=FLAGS.output_dir if master_process else None, session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None, keep_checkpoint_max=1) if master_process: tf.compat.v1.logging.info("***** Configuaration *****") for key in FLAGS.__flags.keys(): tf.compat.v1.logging.info(' {}: {}'.format( key, getattr(FLAGS, key))) tf.compat.v1.logging.info("**************************") train_examples = None num_train_steps = None num_warmup_steps = None training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank)) if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / global_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) start_index = 0 end_index = len(train_examples) tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")] if FLAGS.horovod: tmp_filenames = [ os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i)) for i in range(hvd.size()) ] num_examples_per_rank = len(train_examples) // hvd.size() remainder = len(train_examples) % hvd.size() if hvd.rank() < remainder: start_index = hvd.rank() * (num_examples_per_rank + 1) end_index = start_index + num_examples_per_rank + 1 else: start_index = hvd.rank() * num_examples_per_rank + remainder end_index = start_index + (num_examples_per_rank) model_fn = model_fn_builder(task_name=task_name, bert_config=bert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate if not 
FLAGS.horovod else FLAGS.learning_rate * hvd.size(), num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_one_hot_embeddings=False, hvd=None if not FLAGS.horovod else hvd) estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) if FLAGS.do_train: file_based_convert_examples_to_features( train_examples[start_index:end_index], label_list, FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank]) tf.compat.v1.logging.info("***** Running training *****") tf.compat.v1.logging.info(" Num examples = %d", len(train_examples)) tf.compat.v1.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.compat.v1.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=tmp_filenames, batch_size=FLAGS.train_batch_size, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, hvd=None if not FLAGS.horovod else hvd) train_start_time = time.time() estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=training_hooks) train_time_elapsed = time.time() - train_start_time train_time_wo_overhead = training_hooks[-1].total_time avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed ss_sentences_per_second = ( num_train_steps - training_hooks[-1].skipped ) * global_batch_size * 1.0 / train_time_wo_overhead if master_process: tf.compat.v1.logging.info("-----------------------------") tf.compat.v1.logging.info( "Total Training Time = %0.2f for Sentences = %d", train_time_elapsed, num_train_steps * global_batch_size) tf.compat.v1.logging.info( "Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead, (num_train_steps - training_hooks[-1].skipped) * global_batch_size) tf.compat.v1.logging.info( "Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second) tf.compat.v1.logging.info( "Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) tf.compat.v1.logging.info("-----------------------------") if FLAGS.do_eval and master_process: eval_examples = processor.get_dev_examples(FLAGS.data_dir) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") file_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.compat.v1.logging.info("***** Running evaluation *****") tf.compat.v1.logging.info(" Num examples = %d", len(eval_examples)) tf.compat.v1.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_drop_remainder = False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, batch_size=FLAGS.eval_batch_size, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)] eval_start_time = time.time() result = estimator.evaluate(input_fn=eval_input_fn, hooks=eval_hooks) eval_time_elapsed = time.time() - eval_start_time time_list = eval_hooks[-1].time_list time_list.sort() # Removing outliers (init/warmup) in throughput computation. 
        eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.99)])
        num_sentences = int(len(time_list) * 0.99) * FLAGS.eval_batch_size
        avg = np.mean(time_list)
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead
        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info(
            "Total Inference Time = %0.2f for Sentences = %d",
            eval_time_elapsed, eval_hooks[-1].count * FLAGS.eval_batch_size)
        tf.compat.v1.logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            eval_time_wo_overhead, num_sentences)
        tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s",
                                  "fp16" if FLAGS.use_fp16 else "fp32")
        tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f",
                                  cf_50 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f",
                                  cf_90 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f",
                                  cf_95 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f",
                                  cf_99 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f",
                                  cf_100 * 1000)
        tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f",
                                  ss_sentences_per_second)
        dllogging.logger.log(step=(),
                             data={"throughput_val": ss_sentences_per_second},
                             verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            tf.compat.v1.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                dllogging.logger.log(step=(),
                                     data={key: float(result[key])},
                                     verbosity=Verbosity.DEFAULT)
                tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict and master_process:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                predict_file)
        tf.compat.v1.logging.info("***** Running prediction *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(predict_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            batch_size=FLAGS.predict_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)
        predict_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
        predict_start_time = time.time()
        output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
        with tf.io.gfile.GFile(output_predict_file, "w") as writer:
            tf.compat.v1.logging.info("***** Predict results *****")
            for prediction in estimator.predict(input_fn=predict_input_fn,
                                                hooks=predict_hooks,
                                                yield_single_examples=False):
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in prediction) + "\n"
                writer.write(output_line)
        predict_time_elapsed = time.time() - predict_start_time
        predict_time_wo_overhead = predict_hooks[-1].total_time
        time_list = predict_hooks[-1].time_list
        time_list.sort()
        num_sentences = (predict_hooks[-1].count -
                         predict_hooks[-1].skipped) * FLAGS.predict_batch_size
        avg = np.mean(time_list)
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / predict_time_wo_overhead
        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info(
            "Total Inference Time = %0.2f for Sentences = %d",
            predict_time_elapsed,
            predict_hooks[-1].count * FLAGS.predict_batch_size)
        tf.compat.v1.logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            predict_time_wo_overhead,
            (predict_hooks[-1].count - predict_hooks[-1].skipped) *
            FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Summary Inference Statistics on TEST SET")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s",
                                  "fp16" if FLAGS.use_fp16 else "fp32")
        tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f",
                                  cf_50 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f",
                                  cf_90 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f",
                                  cf_95 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f",
                                  cf_99 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f",
                                  cf_100 * 1000)
        tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f",
                                  ss_sentences_per_second)
        dllogging.logger.log(step=(),
                             data={"throughput_val": ss_sentences_per_second},
                             verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")
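The confidence-level numbers above are computed by sorting the per-batch latencies and taking the max of the first p% of them, i.e. an empirical p-th percentile. In isolation, with made-up timings:

def latency_percentile(time_list, p):
    ordered = sorted(time_list)
    return max(ordered[:int(len(ordered) * p)])

times = [0.010, 0.011, 0.012, 0.013, 0.050,
         0.011, 0.012, 0.011, 0.013, 0.012]
for p in (0.50, 0.90, 0.95, 0.99):
    print("cf_%d = %.1f ms" % (round(p * 100),
                               latency_percentile(times, p) * 1000))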
def train(hparams, scope=None, target_session=""): # Horovod hvd.init() """Train a translation model.""" log_device_placement = hparams.log_device_placement out_dir = hparams.out_dir num_train_steps = hparams.num_train_steps steps_per_stats = hparams.steps_per_stats steps_per_external_eval = hparams.steps_per_external_eval steps_per_eval = 10 * steps_per_stats avg_ckpts = hparams.avg_ckpts if not steps_per_external_eval: steps_per_external_eval = 5 * steps_per_eval # Create model model_creator = get_model_creator(hparams) #train_model = model_helper.create_train_model(model_creator, hparams, scope) # Horovod train_model = model_helper.create_train_model(model_creator, hparams, scope, num_workers=hvd.size(), jobid=hvd.rank()) eval_model = model_helper.create_eval_model(model_creator, hparams, scope) infer_model = model_helper.create_infer_model(model_creator, hparams, scope) # Preload data for sample decoding. dev_src_file = "%s.%s" % (hparams.dev_prefix, hparams.src) dev_tgt_file = "%s.%s" % (hparams.dev_prefix, hparams.tgt) sample_src_data = inference.load_data(dev_src_file) sample_tgt_data = inference.load_data(dev_tgt_file) summary_name = "train_log" model_dir = hparams.out_dir # Log and output files log_file = os.path.join(out_dir, "log_%d" % time.time()) log_f = tf.gfile.GFile(log_file, mode="a") utils.print_out("# log_file=%s" % log_file, log_f) # TensorFlow model config_proto = utils.get_config_proto( log_device_placement=log_device_placement, num_intra_threads=hparams.num_intra_threads, num_inter_threads=hparams.num_inter_threads) #train_sess = tf.Session( # target=target_session, config=config_proto, graph=train_model.graph) # Horovod with train_model.graph.as_default(): hooks = [hvd.BroadcastGlobalVariablesHook(0)] with tf.train.MonitoredTrainingSession( #checkpoint_dir=flags.out_dir, hooks=hooks, config=config_proto) as train_sess: eval_sess = tf.Session(target=target_session, config=config_proto, graph=eval_model.graph) infer_sess = tf.Session(target=target_session, config=config_proto, graph=infer_model.graph) loaded_train_model, global_step = model_helper.horovod_create_or_load_model( train_model.model, model_dir, train_sess, "train") # Summary writer summary_writer = tf.summary.FileWriter( os.path.join(out_dir, summary_name), train_model.graph) # First evaluation run_full_eval(model_dir, infer_model, infer_sess, eval_model, eval_sess, hparams, summary_writer, sample_src_data, sample_tgt_data, avg_ckpts) last_stats_step = global_step last_eval_step = global_step last_external_eval_step = global_step # This is the training loop. stats, info, start_train_time = before_train( loaded_train_model, train_model, train_sess, global_step, hparams, log_f) while global_step < num_train_steps: ### Run a step ### start_time = time.time() try: step_result = loaded_train_model.train(train_sess) hparams.epoch_step += 1 except tf.errors.OutOfRangeError: # Finished going through the training dataset. Go to next epoch. hparams.epoch_step = 0 utils.print_out( "# Finished an epoch, step %d. 
Perform external evaluation" % global_step) run_sample_decode(infer_model, infer_sess, model_dir, hparams, summary_writer, sample_src_data, sample_tgt_data) run_external_eval(infer_model, infer_sess, model_dir, hparams, summary_writer) if avg_ckpts: run_avg_external_eval(infer_model, infer_sess, model_dir, hparams, summary_writer, global_step) train_sess.run( train_model.iterator.initializer, feed_dict={train_model.skip_count_placeholder: 0}) continue # Process step_result, accumulate stats, and write summary global_step, info[ "learning_rate"], step_summary = update_stats( stats, start_time, step_result) summary_writer.add_summary(step_summary, global_step) # Once in a while, we print statistics. if global_step - last_stats_step >= steps_per_stats: last_stats_step = global_step is_overflow = process_stats(stats, info, global_step, steps_per_stats, log_f) print_step_info(" ", global_step, info, get_best_results(hparams), log_f) if is_overflow: break # Reset statistics stats = init_stats() if global_step - last_eval_step >= steps_per_eval: last_eval_step = global_step utils.print_out("# Save eval, global step %d" % global_step) add_info_summaries(summary_writer, global_step, info) # Save checkpoint loaded_train_model.saver.save( train_sess._sess._sess._sess._sess, os.path.join(out_dir, "translate.ckpt"), global_step=global_step) # Evaluate on dev/test run_sample_decode(infer_model, infer_sess, model_dir, hparams, summary_writer, sample_src_data, sample_tgt_data) run_internal_eval(eval_model, eval_sess, model_dir, hparams, summary_writer) if global_step - last_external_eval_step >= steps_per_external_eval: last_external_eval_step = global_step # Save checkpoint loaded_train_model.saver.save( train_sess._sess._sess._sess._sess, os.path.join(out_dir, "translate.ckpt"), global_step=global_step) run_sample_decode(infer_model, infer_sess, model_dir, hparams, summary_writer, sample_src_data, sample_tgt_data) run_external_eval(infer_model, infer_sess, model_dir, hparams, summary_writer) if avg_ckpts: run_avg_external_eval(infer_model, infer_sess, model_dir, hparams, summary_writer, global_step) # Done training # Now out of the training loop. 
Doing the rest of full and best evaluation # by only rank 0 if hvd.rank() == 0: loaded_train_model.saver.save( train_sess._sess._sess._sess._sess, os.path.join(out_dir, "translate.ckpt"), global_step=global_step) (result_summary, _, final_eval_metrics) = (run_full_eval( model_dir, infer_model, infer_sess, eval_model, eval_sess, hparams, summary_writer, sample_src_data, sample_tgt_data, avg_ckpts)) print_step_info("# Final, ", global_step, info, result_summary, log_f) utils.print_time("# Done training!", start_train_time) summary_writer.close() utils.print_out("# Start evaluating saved best models.") for metric in hparams.metrics: best_model_dir = getattr(hparams, "best_" + metric + "_dir") summary_writer = tf.summary.FileWriter( os.path.join(best_model_dir, summary_name), infer_model.graph) result_summary, best_global_step, _ = run_full_eval( best_model_dir, infer_model, infer_sess, eval_model, eval_sess, hparams, summary_writer, sample_src_data, sample_tgt_data) print_step_info("# Best %s, " % metric, best_global_step, info, result_summary, log_f) summary_writer.close() if avg_ckpts: best_model_dir = getattr(hparams, "avg_best_" + metric + "_dir") summary_writer = tf.summary.FileWriter( os.path.join(best_model_dir, summary_name), infer_model.graph) result_summary, best_global_step, _ = run_full_eval( best_model_dir, infer_model, infer_sess, eval_model, eval_sess, hparams, summary_writer, sample_src_data, sample_tgt_data) print_step_info("# Averaged Best %s, " % metric, best_global_step, info, result_summary, log_f) summary_writer.close() return final_eval_metrics, global_step
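The `train_sess._sess._sess._sess._sess` chain above digs through the MonitoredTrainingSession wrappers to reach the raw tf.Session, because Saver.save cannot be called on a monitored session directly. A less fragile alternative (a sketch under the same TF1 APIs, not what the code above does) is to let a CheckpointSaverHook own the saving:

import tensorflow as tf

def make_checkpoint_hook(out_dir, saver, save_steps=1000):
    # The hook calls save() on the underlying session at step boundaries;
    # no private attributes are involved.
    return tf.train.CheckpointSaverHook(
        checkpoint_dir=out_dir,
        save_steps=save_steps,
        saver=saver,
        checkpoint_basename="translate.ckpt")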
def main(_): tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path) if not FLAGS.do_train and not FLAGS.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if FLAGS.use_fp16: os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" if FLAGS.horovod: import horovod.tensorflow as hvd hvd.init() bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) tf.io.gfile.makedirs(FLAGS.output_dir) input_files = [] for input_file_dir in FLAGS.input_files_dir.split(","): input_files.extend(tf.io.gfile.glob(os.path.join(input_file_dir, "*"))) if FLAGS.horovod and len(input_files) < hvd.size(): raise ValueError("Input Files must be sharded") if FLAGS.use_fp16 and FLAGS.manual_fp16: raise ValueError( "AMP and Manual Mixed Precision Training are both activated! Error" ) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 config = tf.compat.v1.ConfigProto() if FLAGS.horovod: config.gpu_options.visible_device_list = str(hvd.local_rank()) if hvd.rank() == 0: tf.compat.v1.logging.info("***** Configuaration *****") for key in FLAGS.__flags.keys(): tf.compat.v1.logging.info(' {}: {}'.format( key, getattr(FLAGS, key))) tf.compat.v1.logging.info("**************************") # config.gpu_options.per_process_gpu_memory_fraction = 0.7 if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1 config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT run_config = tf.estimator.RunConfig( model_dir=FLAGS.output_dir, session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps if not FLAGS.horovod or hvd.rank() == 0 else None, # This variable controls how often estimator reports examples/sec. # Default value is every 100 steps. # When --report_loss is True, we set to very large value to prevent # default info reporting from estimator. # Ideally we should set it to None, but that does not work. 
log_step_count_steps=10000 if FLAGS.report_loss else 100) model_fn = model_fn_builder(bert_config=bert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(), num_train_steps=FLAGS.num_train_steps, num_warmup_steps=FLAGS.num_warmup_steps, use_one_hot_embeddings=False, hvd=None if not FLAGS.horovod else hvd) training_hooks = [] if FLAGS.report_loss and (not FLAGS.horovod or hvd.rank() == 0): global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps if not FLAGS.horovod else FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size( ) training_hooks.append( _LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, dllogging, FLAGS.display_loss_steps)) if FLAGS.horovod and hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) if FLAGS.do_train: tf.compat.v1.logging.info("***** Running training *****") tf.compat.v1.logging.info(" Batch size = %d", FLAGS.train_batch_size) train_input_fn = input_fn_builder( input_files=input_files, batch_size=FLAGS.train_batch_size, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=True, hvd=None if not FLAGS.horovod else hvd) estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=FLAGS.num_train_steps) if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0): tf.compat.v1.logging.info("***** Running evaluation *****") tf.compat.v1.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_files = [] for eval_file_dir in FLAGS.eval_files_dir.split(","): eval_files.extend( tf.io.gfile.glob(os.path.join(eval_file_dir, "*"))) eval_input_fn = input_fn_builder( input_files=eval_files, batch_size=FLAGS.eval_batch_size, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=False, hvd=None if not FLAGS.horovod else hvd) eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)] eval_start_time = time.time() result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps, hooks=eval_hooks) eval_time_elapsed = time.time() - eval_start_time eval_time_wo_overhead = eval_hooks[-1].total_time num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead tf.compat.v1.logging.info("-----------------------------") tf.compat.v1.logging.info( "Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed, eval_hooks[-1].count * FLAGS.eval_batch_size) tf.compat.v1.logging.info( "Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead, (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size) tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set") tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size) tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length) tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32") tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) dllogging.logger.log(step=(), data={"throughput_val": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT) tf.compat.v1.logging.info("-----------------------------") output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.io.gfile.GFile(output_eval_file, "w") as writer: tf.compat.v1.logging.info("***** 
Eval results *****") for key in sorted(result.keys()): tf.compat.v1.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def main(_): tf.logging.set_verbosity(tf.logging.INFO) hvd.init() if not FLAGS.do_train and not FLAGS.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") albert_config = modeling.AlbertConfig.from_json_file( FLAGS.albert_config_file) tf.gfile.MakeDirs(FLAGS.output_dir) input_files = [] for input_pattern in FLAGS.input_file.split(","): input_files.extend(tf.gfile.Glob(input_pattern)) tf.logging.info("*** Input Files ***") for input_file in input_files: tf.logging.info(" %s" % input_file) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) # Horovod: pin GPU to be used to process local rank (one GPU per process) hvd_config = tf.ConfigProto() hvd_config.gpu_options.allow_growth = True hvd_config.gpu_options.visible_device_list = str(hvd.local_rank()) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 run_config = contrib_tpu.RunConfig( session_config=hvd_config, cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, keep_checkpoint_max=FLAGS.keep_checkpoint_max, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) model_fn = model_fn_builder(albert_config=albert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=int(FLAGS.num_train_steps / hvd.size()), num_warmup_steps=FLAGS.num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, optimizer=FLAGS.optimizer, poly_power=FLAGS.poly_power, start_warmup_step=FLAGS.start_warmup_step) estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size) if FLAGS.do_train: tf.logging.info("***** Running training *****") tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) train_input_fn = input_fn_builder( input_files=input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=True) bcast_hook = hvd.BroadcastGlobalVariablesHook(0) estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps, hooks=[bcast_hook]) if FLAGS.do_eval: tf.logging.info("***** Running evaluation *****") tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) global_step = -1 output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") writer = tf.gfile.GFile(output_eval_file, "w") tf.gfile.MakeDirs(FLAGS.export_dir) eval_input_fn = input_fn_builder( input_files=input_files, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=False) while global_step < FLAGS.num_train_steps: if estimator.latest_checkpoint() is None: tf.logging.info("No checkpoint found yet. Sleeping.") time.sleep(1) else: result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) global_step = result["global_step"] tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
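The eval branch above polls for checkpoints instead of assuming one exists: it sleeps until the trainer writes its first checkpoint, then re-evaluates after each new one until the target step count is reached. The same pattern as a standalone sketch:

import time

def evaluate_until_done(estimator, eval_input_fn, num_train_steps,
                        max_eval_steps):
    result = None
    global_step = -1
    while global_step < num_train_steps:
        if estimator.latest_checkpoint() is None:
            time.sleep(1)  # the trainer has not checkpointed yet
            continue
        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=max_eval_steps)
        global_step = result["global_step"]
    return result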
def main(_):
    # liangaws: inspect the arguments that SageMaker passes to the Python program.
    import sys
    print(sys.argv)
    # liangaws: initialize Horovod.
    hvd.init()
    # ------ check Arguments ------
    if FLAGS.dt_dir == "":
        FLAGS.dt_dir = (date.today() + timedelta(-1)).strftime('%Y%m%d')
    # FLAGS.model_dir = FLAGS.model_dir + FLAGS.dt_dir
    # FLAGS.data_dir = FLAGS.data_dir + FLAGS.dt_dir
    print('task_type ', FLAGS.task_type)
    print('model_dir ', FLAGS.model_dir)
    print('data_dir ', FLAGS.data_dir)
    print('dt_dir ', FLAGS.dt_dir)
    print('num_epochs ', FLAGS.num_epochs)
    print('feature_size ', FLAGS.feature_size)
    print('field_size ', FLAGS.field_size)
    print('embedding_size ', FLAGS.embedding_size)
    print('batch_size ', FLAGS.batch_size)
    print('deep_layers ', FLAGS.deep_layers)
    print('dropout ', FLAGS.dropout)
    print('loss_type ', FLAGS.loss_type)
    print('optimizer ', FLAGS.optimizer)
    print('learning_rate ', FLAGS.learning_rate)
    print('batch_norm_decay ', FLAGS.batch_norm_decay)
    print('batch_norm ', FLAGS.batch_norm)
    print('l2_reg ', FLAGS.l2_reg)
    # ------ init Envs ------
    # liangaws: glob.glob collects all training file names under data_dir into
    # a list, which can then be passed directly to TextLineDataset.
    tr_files = glob.glob("%s/tr*libsvm" % FLAGS.data_dir)
    random.shuffle(tr_files)
    print("tr_files:", tr_files)
    va_files = glob.glob("%s/va*libsvm" % FLAGS.data_dir)
    print("va_files:", va_files)
    te_files = glob.glob("%s/te*libsvm" % FLAGS.data_dir)
    print("te_files:", te_files)
    if FLAGS.clear_existing_model:
        try:
            shutil.rmtree(FLAGS.model_dir)
        except Exception as e:
            print(e, "at clear_existing_model")
        else:
            print("existing model cleaned at %s" % FLAGS.model_dir)
    # liangaws: the call that sets up the environment for parameter-server
    # style distributed training is commented out here, because the training
    # environment is controlled by SageMaker.
    # set_dist_env()
    # ------ build Tasks ------
    model_params = {
        "field_size": FLAGS.field_size,
        "feature_size": FLAGS.feature_size,
        "embedding_size": FLAGS.embedding_size,
        "learning_rate": FLAGS.learning_rate,
        "batch_norm_decay": FLAGS.batch_norm_decay,
        "l2_reg": FLAGS.l2_reg,
        "deep_layers": FLAGS.deep_layers,
        "dropout": FLAGS.dropout
    }
    # liangaws: this config setup is commented out and not used for now.
    """
    config = tf.estimator.RunConfig().replace(
        session_config=tf.ConfigProto(
            device_count={'GPU': 0, 'CPU': FLAGS.num_threads}),
        log_step_count_steps=FLAGS.log_steps,
        save_summary_steps=FLAGS.log_steps)
    """
    # liangaws: set the checkpoint interval and the maximum number of
    # checkpoints to keep.
    # config = tf.estimator.RunConfig().replace(
    #     save_checkpoints_secs=5,
    #     keep_checkpoint_max=5,
    #     log_step_count_steps=FLAGS.log_steps,
    #     save_summary_steps=FLAGS.log_steps)
    # liangaws: with Horovod, pin GPU to be used to process local rank (one
    # GPU per process).
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    # liangaws: with Horovod, save checkpoints only on worker 0 to prevent
    # other workers from corrupting them.
    print('current horovod rank is ', hvd.rank())
    print('input model dir is ', FLAGS.model_dir)
    print("host is ", FLAGS.hosts)
    print('current host is ', FLAGS.current_host)
    if hvd.rank() == 0:
        DeepFM = tf.estimator.Estimator(
            model_fn=model_fn,
            model_dir=FLAGS.model_dir,
            params=model_params,
            config=tf.estimator.RunConfig().replace(session_config=config))
    else:
        DeepFM = tf.estimator.Estimator(
            model_fn=model_fn,
            model_dir=None,
            params=model_params,
            config=tf.estimator.RunConfig().replace(session_config=config))
    # liangaws: with Horovod, BroadcastGlobalVariablesHook broadcasts initial
    # variable states from rank 0 to all other processes. This is necessary to
    # ensure consistent initialization of all workers when training is started
    # with random weights or restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
    # liangaws: to run multiple Horovod worker processes per machine under
    # SageMaker pipe mode, the SageMaker estimator's fit() call needs several
    # channels -- at least one channel per worker on each machine. The names
    # of all current channels can be read from the SM_CHANNELS environment
    # variable set by SageMaker, and each worker then reads its data from its
    # own channel.
    # Note that the order of the channel names differs from the order in which
    # they were written in the SageMaker estimator fit() call. For example,
    # for the three channels {'training': train_s3, 'training-2': train2_s3,
    # 'evaluation': validate_s3}, SageMaker sets the environment variable to
    # ['evaluation', 'training', 'training-2']; that is, the last channel
    # 'evaluation' comes first in SM_CHANNELS while the other channels keep
    # their original order.
    channel_names = json.loads(os.environ['SM_CHANNELS'])
    print("channel name", channel_names)
    print("first channel", channel_names[0])
    print("last channel name", channel_names[-1])
    eval_channel = channel_names[0]
    if FLAGS.task_type == 'train':
        # liangaws: add the broadcast hook to the TrainSpec.
        """
        train_spec = tf.estimator.TrainSpec(
            input_fn=lambda: input_fn(tr_files, channel='training',
                                      num_epochs=FLAGS.num_epochs,
                                      batch_size=FLAGS.batch_size),
            hooks=[bcast_hook])
        eval_spec = tf.estimator.EvalSpec(
            input_fn=lambda: input_fn(va_files, channel='evaluation',
                                      num_epochs=1,
                                      batch_size=FLAGS.batch_size),
            steps=None, start_delay_secs=1000, throttle_secs=1200)
        tf.estimator.train_and_evaluate(DeepFM, train_spec, eval_spec)
        """
        if FLAGS.pipe_mode == 0:  # file mode
            for _ in range(FLAGS.num_epochs):
                DeepFM.train(input_fn=lambda: input_fn(
                    tr_files, num_epochs=1, batch_size=FLAGS.batch_size),
                             hooks=[bcast_hook])
                if hvd.rank() == 0:
                    # liangaws: evaluation only needs to run on the Horovod master.
                    DeepFM.evaluate(input_fn=lambda: input_fn(
                        va_files, num_epochs=1, batch_size=FLAGS.batch_size))
        else:  # pipe mode
            # liangaws: under horovod + pipe mode, when a worker enters
            # input_fn a second time during training, reading from the same
            # FIFO again with PipeModeDataset runs into problems.
            """
            train_spec = tf.estimator.TrainSpec(
                input_fn=lambda: input_fn(
                    channel=channel_names[1 + hvd.local_rank()],
                    num_epochs=FLAGS.num_epochs,
                    batch_size=FLAGS.batch_size),
                hooks=[bcast_hook])
            eval_spec = tf.estimator.EvalSpec(
                input_fn=lambda: input_fn(channel=eval_channel, num_epochs=1,
                                          batch_size=FLAGS.batch_size),
                steps=None, start_delay_secs=1000, throttle_secs=1200)
            tf.estimator.train_and_evaluate(DeepFM, train_spec, eval_spec)
            """
            DeepFM.train(input_fn=lambda: input_fn(
                channel=channel_names[1 + hvd.local_rank()],
                num_epochs=FLAGS.num_epochs,
                batch_size=FLAGS.batch_size),
                         hooks=[bcast_hook])
            if hvd.rank() == 0:
                # liangaws: evaluation only needs to run on the Horovod master.
                DeepFM.evaluate(input_fn=lambda: input_fn(
                    channel=eval_channel, num_epochs=1,
                    batch_size=FLAGS.batch_size))
    elif FLAGS.task_type == 'eval':
        DeepFM.evaluate(input_fn=lambda: input_fn(
            va_files, num_epochs=1, batch_size=FLAGS.batch_size))
    elif FLAGS.task_type == 'infer':
        preds = DeepFM.predict(input_fn=lambda: input_fn(
            te_files, num_epochs=1, batch_size=FLAGS.batch_size),
                               predict_keys="prob")
        with open(FLAGS.data_dir + "/pred.txt", "w") as fo:
            for prob in preds:
                fo.write("%f\n" % (prob['prob']))
    # liangaws: save the model when the task type is either train or export.
    if FLAGS.task_type == 'export' or FLAGS.task_type == 'train':
        # feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
        # feature_spec = {
        #     'feat_ids': tf.FixedLenFeature(dtype=tf.int64,
        #                                    shape=[None, FLAGS.field_size]),
        #     'feat_vals': tf.FixedLenFeature(dtype=tf.float32,
        #                                     shape=[None, FLAGS.field_size])
        # }
        # serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
        feature_spec = {
            'feat_ids': tf.placeholder(dtype=tf.int64,
                                       shape=[None, FLAGS.field_size],
                                       name='feat_ids'),
            'feat_vals': tf.placeholder(dtype=tf.float32,
                                        shape=[None, FLAGS.field_size],
                                        name='feat_vals')
        }
        serving_input_receiver_fn = \
            tf.estimator.export.build_raw_serving_input_receiver_fn(feature_spec)
        # liangaws: with Horovod, save the model and history only on worker 0
        # (i.e. the master).
        if hvd.rank() == 0:
            DeepFM.export_savedmodel(FLAGS.servable_model_dir,
                                     serving_input_receiver_fn)
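The channel bookkeeping described in the comments above can be checked without SageMaker. A sketch, assuming SM_CHANNELS is set the way SageMaker sets it for the three channels in the example (the evaluation channel first, training channels in their original order):

import json
import os

os.environ.setdefault("SM_CHANNELS", '["evaluation", "training", "training-2"]')
channel_names = json.loads(os.environ["SM_CHANNELS"])
eval_channel = channel_names[0]

def training_channel_for(local_rank):
    # worker 0 -> 'training', worker 1 -> 'training-2', ...
    return channel_names[1 + local_rank]

assert eval_channel == "evaluation"
assert training_channel_for(0) == "training"
assert training_channel_for(1) == "training-2"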
def main(args): # Horovod: initialize Horovod. hvd.init() # Keras automatically creates a cache directory in ~/.keras/datasets for # storing the downloaded MNIST data. This creates a race # condition among the workers that share the same filesystem. If the # directory already exists by the time this worker gets around to creating # it, ignore the resulting exception and continue. cache_dir = os.path.join(os.path.expanduser("~"), ".keras", "datasets") if not os.path.exists(cache_dir): try: os.mkdir(cache_dir) except OSError as e: if e.errno == errno.EEXIST and os.path.isdir(cache_dir): pass else: raise # Download and load MNIST dataset. (train_data, train_labels), (eval_data, eval_labels) = keras.datasets.mnist.load_data( "MNIST-data-%d" % hvd.rank() ) # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it # into (-1, 784) to feed into our network. Also, need to normalize the # features between 0 and 1. train_data = np.reshape(train_data, (-1, 784)) / 255.0 eval_data = np.reshape(eval_data, (-1, 784)) / 255.0 # Horovod: pin GPU to be used to process local rank (one GPU per process) if not args.use_only_cpu: config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) estimator_config = tf.estimator.RunConfig(session_config=config) else: estimator_config = None # Horovod: save checkpoints only on worker 0 to prevent other workers from # corrupting them. model_dir = args.model_dir if hvd.rank() == 0 else None # Create the Estimator mnist_classifier = tf.estimator.Estimator( model_fn=cnn_model_fn, model_dir=model_dir, config=estimator_config ) # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from # rank 0 to all other processes. This is necessary to ensure consistent # initialization of all workers when training is started with random weights or # restored from a checkpoint. bcast_hook = hvd.BroadcastGlobalVariablesHook(0) # Train the model train_input_fn = tf.estimator.inputs.numpy_input_fn( x={"x": train_data}, y=train_labels, batch_size=100, num_epochs=None, shuffle=True ) # Horovod: adjust number of steps based on number of GPUs. mnist_classifier.train( input_fn=train_input_fn, steps=args.num_steps // hvd.size(), hooks=[bcast_hook] ) # Evaluate the model and print results eval_input_fn = tf.estimator.inputs.numpy_input_fn( x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False ) eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn) print(eval_results)
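The race-tolerant mkdir at the top of `main` is worth keeping around as a pattern; the EEXIST check is what lets several workers sharing a filesystem create the cache directory concurrently. As a reusable sketch:

import errno
import os

def ensure_dir(path):
    try:
        os.mkdir(path)
    except OSError as e:
        # Another worker won the race: fine. Anything else is a real error.
        if not (e.errno == errno.EEXIST and os.path.isdir(path)):
            raise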
def main(_): tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) if FLAGS.horovod: hvd.init() if FLAGS.use_fp16: os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" processors = {'consensus': ConsensusProcessor} tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) tf.io.gfile.makedirs(FLAGS.output_dir) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 master_process = True training_hooks = [] global_batch_size = FLAGS.train_batch_size hvd_rank = 0 config = tf.compat.v1.ConfigProto() if FLAGS.horovod: global_batch_size = FLAGS.train_batch_size * hvd.size() master_process = (hvd.rank() == 0) hvd_rank = hvd.rank() config.gpu_options.visible_device_list = str(hvd.local_rank()) if hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1 run_config = tf.estimator.RunConfig( model_dir=FLAGS.output_dir if master_process else None, session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None, keep_checkpoint_max=1) if master_process: tf.compat.v1.logging.info("***** Configuration *****") for key in FLAGS.__flags.keys(): tf.compat.v1.logging.info(' {}: {}'.format( key, getattr(FLAGS, key))) tf.compat.v1.logging.info("**************************") train_examples = None num_train_steps = None num_warmup_steps = None training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank)) if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / global_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) start_index = 0 end_index = len(train_examples) tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")] if FLAGS.horovod: tmp_filenames = [ os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i)) for i in range(hvd.size()) ] num_examples_per_rank = len(train_examples) // hvd.size() remainder = len(train_examples) % hvd.size() if hvd.rank() < remainder: start_index = hvd.rank() * (num_examples_per_rank + 1) end_index = start_index + num_examples_per_rank + 1 else: start_index = hvd.rank() * num_examples_per_rank + remainder end_index = start_index + (num_examples_per_rank) model_fn = model_fn_builder(bert_config=bert_config, num_labels=len(label_list) + 1, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(), num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_one_hot_embeddings=False, hvd=None if not FLAGS.horovod else hvd, use_fp16=FLAGS.use_fp16) estimator = 
tf.estimator.Estimator(model_fn=model_fn, config=run_config) if FLAGS.do_train: filed_based_convert_examples_to_features( train_examples[start_index:end_index], label_list, FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank]) tf.compat.v1.logging.info("***** Running training *****") tf.compat.v1.logging.info(" Num examples = %d", len(train_examples)) tf.compat.v1.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.compat.v1.logging.info(" Num steps = %d", num_train_steps) tf.compat.v1.logging.info(" Num of labels = %d", len(label_list)) train_input_fn = file_based_input_fn_builder( input_file=tmp_filenames, batch_size=FLAGS.train_batch_size, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, hvd=None if not FLAGS.horovod else hvd) train_start_time = time.time() estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=training_hooks) train_time_elapsed = time.time() - train_start_time train_time_wo_overhead = training_hooks[-1].total_time avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed ss_sentences_per_second = ( num_train_steps - training_hooks[-1].skipped ) * global_batch_size * 1.0 / train_time_wo_overhead if master_process: tf.compat.v1.logging.info("-----------------------------") tf.compat.v1.logging.info( "Total Training Time = %0.2f for Sentences = %d", train_time_elapsed, num_train_steps * global_batch_size) tf.compat.v1.logging.info( "Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead, (num_train_steps - training_hooks[-1].skipped) * global_batch_size) tf.compat.v1.logging.info( "Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second) tf.compat.v1.logging.info( "Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) tf.compat.v1.logging.info("-----------------------------") if FLAGS.do_eval and master_process: eval_examples = processor.get_dev_examples(FLAGS.data_dir) num_actual_eval_examples = len(eval_examples) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") filed_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.compat.v1.logging.info("***** Running evaluation *****") tf.compat.v1.logging.info( " Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.compat.v1.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. 
eval_steps = None eval_drop_remainder = False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, batch_size=FLAGS.eval_batch_size, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.io.gfile.GFile(output_eval_file, "w") as writer: tf.compat.v1.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.compat.v1.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if FLAGS.do_predict and master_process: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") filed_based_convert_examples_to_features(predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file) tf.compat.v1.logging.info("***** Running prediction*****") tf.compat.v1.logging.info( " Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.compat.v1.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, batch_size=FLAGS.predict_batch_size, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder) eval_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)] eval_start_time = time.time() output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") with tf.io.gfile.GFile(output_predict_file, "w") as writer: num_written_lines = 0 tf.compat.v1.logging.info("***** Predict results *****") for prediction in estimator.predict(input_fn=predict_input_fn, hooks=eval_hooks, yield_single_examples=True): probabilities = prediction["probabilities"] output_line = "\t".join( str(class_probability) for class_probability in probabilities) + "\n" writer.write(output_line) num_written_lines += 1 assert num_written_lines == num_actual_predict_examples eval_time_elapsed = time.time() - eval_start_time eval_time_wo_overhead = eval_hooks[-1].total_time time_list = eval_hooks[-1].time_list time_list.sort() num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size avg = np.mean(time_list) cf_50 = max(time_list[:int(len(time_list) * 0.50)]) cf_90 = max(time_list[:int(len(time_list) * 0.90)]) cf_95 = max(time_list[:int(len(time_list) * 0.95)]) cf_99 = max(time_list[:int(len(time_list) * 0.99)]) cf_100 = max(time_list[:int(len(time_list) * 1)]) ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead tf.compat.v1.logging.info("-----------------------------") tf.compat.v1.logging.info( "Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed, eval_hooks[-1].count * FLAGS.predict_batch_size) tf.compat.v1.logging.info( "Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead, (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size) tf.compat.v1.logging.info("Summary Inference Statistics") tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size) tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length) tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32") tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000) 
tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000) tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000) tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) tf.compat.v1.logging.info("-----------------------------")
def train(*tf_records: "Records to train on"): """Train on examples.""" tf.logging.set_verbosity(tf.logging.INFO) estimator = dual_net.get_estimator() effective_batch_size = FLAGS.train_batch_size if FLAGS.dist_train: effective_batch_size = int(FLAGS.train_batch_size / hvd.size()) if FLAGS.use_tpu: effective_batch_size *= FLAGS.num_tpu_cores if FLAGS.use_tpu: if FLAGS.use_bt: def _input_fn(params): games = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table) games_nr = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table + '-nr') return preprocessing.get_tpu_bt_input_tensors( games, games_nr, params['batch_size'], number_of_games=FLAGS.window_size, random_rotation=True) else: def _input_fn(params): return preprocessing.get_tpu_input_tensors( params['batch_size'], tf_records, random_rotation=True) # Hooks are broken with TPUestimator at the moment. hooks = [] else: def _input_fn(): return preprocessing.get_input_tensors( effective_batch_size, tf_records, filter_amount=FLAGS.filter_amount, shuffle_buffer_size=FLAGS.shuffle_buffer_size, random_rotation=True, seed=FLAGS.training_seed, dist_train=FLAGS.dist_train) hooks = [ UpdateRatioSessionHook(FLAGS.work_dir), EchoStepCounterHook(output_dir=FLAGS.work_dir) ] if FLAGS.dist_train: hooks.append(hvd.BroadcastGlobalVariablesHook(0)) steps = FLAGS.steps_to_train logging.info("Training, steps = %s, batch = %s -> %s examples", steps or '?', effective_batch_size, (steps * effective_batch_size) if steps else '?') if FLAGS.use_bt: games = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table) if not games.read_wait_cell(): games.require_fresh_games(20000) latest_game = games.latest_game_number index_from = max(latest_game, games.read_wait_cell()) print("== Last game before training:", latest_game, flush=True) print("== Wait cell:", games.read_wait_cell(), flush=True) try: estimator.train(_input_fn, steps=steps, hooks=hooks) if FLAGS.use_bt: bigtable_input.set_fresh_watermark(games, index_from, FLAGS.window_size) except: if FLAGS.use_bt: games.require_fresh_games(0) raise
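# A hedged sketch of the effective-batch-size bookkeeping in `train` above
# (flag names assumed from the surrounding code): under Horovod data
# parallelism each worker reads train_batch_size / hvd.size() examples, so a
# single optimizer step still consumes train_batch_size examples globally,
# while TPUEstimator instead scales the batch up by the core count.
def effective_batch_size(train_batch_size, dist_train=False, num_workers=1,
                         use_tpu=False, num_tpu_cores=8):
    batch = train_batch_size
    if dist_train:
        batch = int(train_batch_size / num_workers)
    if use_tpu:
        batch *= num_tpu_cores
    return batch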
def main(_): hvd.init() # Read/download local dataset. Different copy for each process. mnist = tf.contrib.learn.datasets.mnist.read_data_sets( "mnist_data_{}".format(hvd.rank())) # Name images placeholder to be able to retrieve it from saved meta graph. images_placeholder = tf.placeholder(tf.float32, [None, 784], name=INPUT_NAME) dense_dropout_placeholder = tf.placeholder_with_default(1.0, []) labels_placeholder = tf.placeholder(tf.int64, [None]) logits, scores, predictions = build_net(images_placeholder, dense_dropout_placeholder) # Exporting meta graph right now takes care of removing Horovod specific ops before serving. Graph right now # also does not contain any training specific ops, so it is optimized for serving too. tf.train.export_meta_graph("graph.meta", as_text=True) loss = tf.losses.softmax_cross_entropy(tf.one_hot(labels_placeholder, 10), logits) accuracy = tf.reduce_mean( tf.cast(tf.equal(predictions, labels_placeholder), tf.float32)) # Define summary ops to save summaries for later use in tensorboard. tf.summary.scalar("accuracy", accuracy) tf.summary.scalar("loss", loss) summary_op = tf.summary.merge_all() # Horovod: adjust learning rate based on number of workers. optimizer = tf.train.RMSPropOptimizer(0.001 * hvd.size()) global_step = tf.contrib.framework.get_or_create_global_step() # Wrap standard optimizer in Horovod distributed one. train = hvd.DistributedOptimizer(optimizer).minimize( loss, global_step=global_step) hooks = [ # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states # from rank 0 to all other processes. This is necessary to ensure consistent # initialization of all workers when training is started with random weights # or restored from a checkpoint. hvd.BroadcastGlobalVariablesHook(0), # Horovod: adjust number of steps based on number of workers. tf.train.StopAtStepHook(last_step=2000 // hvd.size()), tf.train.LoggingTensorHook(tensors={ 'step': global_step, 'loss': loss }, every_n_iter=10), ] # Only master saves summaries. if hvd.rank() == 0: hooks += [ # As previously mentioned summaries are saved to EXPERIMENT_OUTPUT_PATH so that they can be discovered by # tensorboard. tf.train.SummarySaverHook(save_steps=1, output_dir=os.path.join( EXPERIMENT_OUTPUT_PATH, "tensorboard"), summary_op=summary_op) ] # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. As previously mentioned # checkpoints are saved to EXPERIMNET_OUTPUT_PATH which makes them accessible by user. checkpoint_dir = os.path.join(EXPERIMENT_OUTPUT_PATH, "checkpoints") if hvd.rank() == 0 else None # The MonitoredTrainingSession takes care of session initialization, # restoring from a checkpoint, saving to a checkpoint, and closing when done # or an error occurs. with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, hooks=hooks) as mon_sess: while not mon_sess.should_stop(): images, labels = mnist.train.next_batch(64) _, loss_val, accuracy_val, global_step_val = mon_sess.run( [train, loss, accuracy, global_step], feed_dict={ images_placeholder: images, labels_placeholder: labels, dense_dropout_placeholder: 0.5 }) # Only master publishes metrics. if hvd.rank() == 0: # Publish metrics just like in the single node example. publish({ "loss": str(loss_val), "accuracy": str(accuracy_val), "global_step": str(global_step_val) }) # Save servable model only from Horovod master. if hvd.rank() == 0: # Create a new graph to import the previously exported one. with tf.Graph().as_default(): # Import previously saved meta graph. 
restorer = tf.train.import_meta_graph("graph.meta") with tf.Session() as session: checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir) restorer.restore(session, checkpoint_file) # Get handlers for images placeholder and scores op with names defined before. images_placeholder = tf.get_default_graph().get_tensor_by_name( INPUT_NAME + ":0") scores = tf.get_default_graph().get_tensor_by_name( SCORES_NAME + ":0") # Save servable model to EXPERIMENT_OUTPUT_PATH to make it accessible to the user. builder = tf.saved_model.builder.SavedModelBuilder( os.path.join(EXPERIMENT_OUTPUT_PATH, "models", "00001")) prediction_signature = ( tf.saved_model.signature_def_utils.build_signature_def( inputs={ MODEL_INPUT_NAME: tf.saved_model.utils.build_tensor_info( images_placeholder) }, outputs={ MODEL_OUTPUT_NAME: tf.saved_model.utils.build_tensor_info(scores) }, method_name=tf.saved_model.signature_constants. PREDICT_METHOD_NAME)) builder.add_meta_graph_and_variables( session, [tf.saved_model.tag_constants.SERVING], signature_def_map={ MODEL_SIGNATURE_NAME: prediction_signature }, main_op=tf.tables_initializer(), strip_default_attrs=True) builder.save()
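# A hedged sketch (not part of the script above) of loading the exported
# SavedModel back for inference with the TF1 loader API. The export path and
# the signature/tensor keys below stand in for the script's
# EXPERIMENT_OUTPUT_PATH and MODEL_SIGNATURE_NAME / MODEL_INPUT_NAME /
# MODEL_OUTPUT_NAME constants, which are defined elsewhere.
import numpy as np
import tensorflow as tf

export_dir = "models/00001"                 # assumed export location
SIGNATURE = "prediction"                    # assumed MODEL_SIGNATURE_NAME
INPUT_KEY, OUTPUT_KEY = "images", "scores"  # assumed MODEL_INPUT/OUTPUT_NAME

with tf.Graph().as_default(), tf.Session() as sess:
    meta_graph = tf.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING], export_dir)
    signature = meta_graph.signature_def[SIGNATURE]
    input_name = signature.inputs[INPUT_KEY].name
    output_name = signature.outputs[OUTPUT_KEY].name
    scores = sess.run(output_name,
                      feed_dict={input_name: np.zeros((1, 784), np.float32)})
    print(scores.shape)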
def main(_): # Horovod: initialize Horovod. hvd.init() # Keras automatically creates a cache directory in ~/.keras/datasets for # storing the downloaded MNIST data. This creates a race # condition among the workers that share the same filesystem. If the # directory already exists by the time this worker gets around to creating # it, ignore the resulting exception and continue. cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets') if not os.path.exists(cache_dir): try: os.mkdir(cache_dir) except OSError as e: if e.errno == errno.EEXIST and os.path.isdir(cache_dir): pass else: raise # Download and load MNIST dataset. (x_train, y_train), (x_test, y_test) = \ keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank()) # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it # into (-1, 784) to feed into our network. Also, need to normalize the # features between 0 and 1. x_train = np.reshape(x_train, (-1, 784)) / 255.0 x_test = np.reshape(x_test, (-1, 784)) / 255.0 # Build model... with tf.name_scope('input'): image = tf.placeholder(tf.float32, [None, 784], name='image') label = tf.placeholder(tf.float32, [None], name='label') predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN) # Horovod: adjust learning rate based on number of GPUs. opt = tf.train.RMSPropOptimizer(0.001 * hvd.size()) # Horovod: add Horovod Distributed Optimizer. opt = hvd.DistributedOptimizer(opt) global_step = tf.train.get_or_create_global_step() train_op = opt.minimize(loss, global_step=global_step) hooks = [ # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states # from rank 0 to all other processes. This is necessary to ensure consistent # initialization of all workers when training is started with random weights # or restored from a checkpoint. hvd.BroadcastGlobalVariablesHook(0), # Horovod: adjust number of steps based on number of GPUs. tf.train.StopAtStepHook(last_step=2000 // hvd.size()), tf.train.LoggingTensorHook(tensors={ 'step': global_step, 'loss': loss }, every_n_iter=100) ] # Horovod: pin GPU to be used to process local rank (one GPU per process) config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) # Horovod: save checkpoints only on worker 0 to prevent other workers from # corrupting them. checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None training_batch_generator = train_input_generator(x_train, y_train, batch_size=100) # The MonitoredTrainingSession takes care of session initialization, # restoring from a checkpoint, saving to a checkpoint, and closing when done # or an error occurs. builder = option_builder.ProfileOptionBuilder opts1 = builder(builder.time_and_memory()).\ order_by('micros').\ with_max_depth(10).\ with_file_output("./pctx/opts1-rank-%d" % hvd.rank()).\ build() opts2 = builder.trainable_variables_parameter() # with profile_context.ProfileContext("./pctx", # trace_steps=range(100, 110), # dump_steps=[110]) as pctx: with profile_context.ProfileContext("./pctx") as pctx: pctx.add_auto_profiling('op', opts1, [800, 900, 1000]) pctx.add_auto_profiling('scope', opts2, [1000]) with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, hooks=hooks, config=config) as mon_sess: while not mon_sess.should_stop(): # Run a training step synchronously. image_, label_ = next(training_batch_generator) mon_sess.run(train_op, feed_dict={ image: image_, label: label_ }) pctx.profiler.advise(options=model_analyzer.ALL_ADVICE)
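# `train_input_generator` is referenced above but defined elsewhere; a
# plausible minimal sketch of such a helper (an assumption, not the original
# implementation): loop forever over shuffled mini-batches of the in-memory
# MNIST arrays, reshuffling at the start of every pass.
import numpy as np

def train_input_generator(x, y, batch_size=100):
    assert len(x) == len(y)
    while True:
        perm = np.random.permutation(len(x))
        x, y = x[perm], y[perm]
        for i in range(0, len(x) - batch_size + 1, batch_size):
            yield x[i:i + batch_size], y[i:i + batch_size]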
def main(): gpu_thread_count = 2 os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private' os.environ['TF_GPU_THREAD_COUNT'] = str(gpu_thread_count) os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1' os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' hvd.init() # random.seed(5 * (1 + hvd.rank())) # np.random.seed(7 * (1 + hvd.rank())) # tf.set_random_seed(31 * (1 + hvd.rank())) cmdline = add_cli_args() FLAGS, unknown_args = cmdline.parse_known_args() if len(unknown_args) > 0: for bad_arg in unknown_args: print("ERROR: Unknown command line arg: %s" % bad_arg) raise ValueError("Invalid command line arg(s)") FLAGS.data_dir = None if FLAGS.data_dir == "" else FLAGS.data_dir FLAGS.log_dir = None if FLAGS.log_dir == "" else FLAGS.log_dir if FLAGS.eval: FLAGS.log_name = 'eval' + FLAGS.log_name if FLAGS.local_ckpt: do_checkpoint = hvd.local_rank() == 0 else: do_checkpoint = hvd.rank() == 0 if do_checkpoint and not os.path.isdir(FLAGS.log_dir): os.makedirs(FLAGS.log_dir) logger = logging.getLogger(FLAGS.log_name) logger.setLevel(logging.INFO) # INFO, ERROR # file handler which logs debug messages # console handler ch = logging.StreamHandler() ch.setLevel(logging.INFO) # add formatter to the handlers # formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') formatter = logging.Formatter('%(message)s') ch.setFormatter(formatter) logger.addHandler(ch) if not hvd.local_rank(): fh = logging.FileHandler(os.path.join(FLAGS.log_dir, FLAGS.log_name)) fh.setLevel(logging.DEBUG) fh.setFormatter(formatter) # add handlers to logger logger.addHandler(fh) height, width = 224, 224 global_batch_size = FLAGS.batch_size * hvd.size() if FLAGS.data_dir: filename_pattern = os.path.join(FLAGS.data_dir, '%s-*') train_filenames = sorted(tf.gfile.Glob(filename_pattern % 'train')) eval_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation')) num_training_samples = get_num_records(train_filenames) else: train_filenames = eval_filenames = [] num_training_samples = 1281167 training_samples_per_rank = num_training_samples // hvd.size() if FLAGS.num_epochs: nstep = num_training_samples * FLAGS.num_epochs // global_batch_size elif FLAGS.num_batches: nstep = FLAGS.num_batches FLAGS.num_epochs = max( nstep * global_batch_size // num_training_samples, 1) else: raise ValueError("Either num_epochs or num_batches has to be passed") nstep_per_epoch = num_training_samples // global_batch_size decay_steps = nstep if FLAGS.lr_decay_mode == 'steps': steps = [ int(x) * nstep_per_epoch for x in FLAGS.lr_decay_steps.split(',') ] lr_steps = [FLAGS.lr] for i in range(len(FLAGS.lr_decay_steps.split(','))): lr_steps.append(FLAGS.lr * pow(FLAGS.lr_decay_factor, i + 1)) else: steps = [] lr_steps = [] if not FLAGS.save_checkpoints_steps: # default to save one checkpoint per epoch FLAGS.save_checkpoints_steps = nstep_per_epoch if not FLAGS.save_summary_steps: # default to save one checkpoint per epoch FLAGS.save_summary_steps = nstep_per_epoch warmup_it = nstep_per_epoch * FLAGS.warmup_epochs rank0log(logger, 'PY' + str(sys.version) + 'TF' + str(tf.__version__)) config = tf.ConfigProto() config.gpu_options.visible_device_list = str(hvd.local_rank()) config.gpu_options.force_gpu_compatible = True # Force pinned memory config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads config.inter_op_parallelism_threads = 5 rank0log(logger, "Horovod size: ", hvd.size()) classifier = tf.estimator.Estimator( model_fn=cnn_model_function, model_dir=FLAGS.log_dir, params={ 'model': FLAGS.model, 'decay_steps': 
decay_steps, 'n_classes': 1000, 'dtype': tf.float16 if FLAGS.fp16 else tf.float32, 'format': 'channels_first', 'device': '/gpu:0', 'lr': FLAGS.lr, 'mom': FLAGS.mom, 'wdecay': FLAGS.wdecay, 'steps': steps, 'lr_steps': lr_steps, 'lr_decay_mode': FLAGS.lr_decay_mode, 'warmup_it': warmup_it, 'warmup_lr': FLAGS.warmup_lr, 'loss_scale': FLAGS.loss_scale, 'adv_bn_init': FLAGS.adv_bn_init, 'conv_init': tf.variance_scaling_initializer() if FLAGS.adv_conv_init else None }, config=tf.estimator.RunConfig( # tf_random_seed=31 * (1 + hvd.rank()), session_config=config, save_summary_steps=FLAGS.save_summary_steps if do_checkpoint else None, save_checkpoints_steps=FLAGS.save_checkpoints_steps if do_checkpoint else None, keep_checkpoint_max=None)) if not FLAGS.eval: num_preproc_threads = 5 rank0log(logger, "Preproc threads", num_preproc_threads) training_hooks = [ hvd.BroadcastGlobalVariablesHook(0), PrefillStagingAreasHook() ] if hvd.rank() == 0: training_hooks.append( LogSessionRunHook(global_batch_size, num_training_samples, FLAGS.display_every, logger)) try: start_time = time.time() classifier.train( input_fn=lambda: make_dataset(train_filenames, training_samples_per_rank, FLAGS.batch_size, height, width, training=True, num_threads=num_preproc_threads, shard=True, synthetic=FLAGS.synthetic), max_steps=nstep, hooks=training_hooks) rank0log(logger, "Finished in ", time.time() - start_time) except KeyboardInterrupt: print("Keyboard interrupt") elif FLAGS.eval and not FLAGS.synthetic: rank0log(logger, "Evaluating") rank0log( logger, "Validation dataset size: {}".format( get_num_records(eval_filenames))) barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32)) tf.Session(config=config).run(barrier) time.sleep(5) # a little extra margin... if FLAGS.num_gpus == 1: rank0log( logger, """If you are evaluating checkpoints of a multi-GPU run on a single GPU, ensure you set --num_gpus to the number of GPUs it was trained on. This will ensure that the epoch number is accurately displayed in the below logs.""" ) try: ckpts = sort_and_load_ckpts(FLAGS.log_dir) for i, c in enumerate(ckpts): if i < len(ckpts) - 1: if (not FLAGS.eval_interval) or \ (i % FLAGS.eval_interval != 0): continue eval_result = classifier.evaluate( input_fn=lambda: make_dataset(eval_filenames, get_num_records( eval_filenames), FLAGS.batch_size, height, width, training=False, shard=True, synthetic=FLAGS.synthetic), checkpoint_path=c['path']) c['epoch'] = (c['step'] * FLAGS.num_gpus) / (nstep_per_epoch * hvd.size()) c['top1'] = eval_result['val-top1acc'] c['top5'] = eval_result['val-top5acc'] c['loss'] = eval_result['loss'] rank0log( logger, ' step epoch top1 top5 loss checkpoint_time(UTC)') barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32)) for i, c in enumerate(ckpts): tf.Session(config=config).run(barrier) if 'top1' not in c: continue rank0log( logger, '{:5d} {:5.1f} {:5.3f} {:6.2f} {:6.2f} {time}'.format( c['step'], c['epoch'], c['top1'] * 100, c['top5'] * 100, c['loss'], time=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['mtime'])))) rank0log(logger, "Finished evaluation") except KeyboardInterrupt: logger.error("Keyboard interrupt")
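# The evaluation path above relies on a `sort_and_load_ckpts` helper defined
# elsewhere. A hedged sketch of its assumed behavior: enumerate the
# model.ckpt-<step>.index files under log_dir and return per-checkpoint dicts
# (step, path, mtime) ordered by global step, matching the fields the report
# loop reads.
import os
import re

def sort_and_load_ckpts(log_dir):
    ckpts = []
    for f in os.listdir(log_dir):
        m = re.match(r'model\.ckpt-([0-9]+)\.index$', f)
        if m is None:
            continue
        ckpts.append({
            'step': int(m.group(1)),
            'path': os.path.join(log_dir, f[:-len('.index')]),
            'mtime': os.stat(os.path.join(log_dir, f)).st_mtime,
        })
    ckpts.sort(key=lambda x: x['step'])
    return ckpts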
def main(_): '''Main routine for Horovod Tensorflow Mnist example.''' # Horovod: initialize Horovod. hvd.init() # Horovod: pin GPU to be used to process local rank (one GPU per process) gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=str(hvd.local_rank())) config = tf.ConfigProto(gpu_options=gpu_options) batch_size = 100 # Download and load MNIST dataset. if hvd.rank() == 0: # mnist = learn.datasets.mnist.read_data_sets(MNIST_DATADIR) image, label = get_data_mnist(batch_size) # hvd.allreduce(tf.constant([0]), average=False) # Barrier (not working) with tf.Session(config=config): # download/unzip in rank 0 only. hvd_keras.allreduce([0], name="Barrier") if hvd.rank() != 0: # mnist = learn.datasets.mnist.read_data_sets(MNIST_DATADIR) image, label = get_data_mnist(batch_size) # Build model... # with tf.name_scope('input'): # image = tf.placeholder(tf.float32, [None, 784], name='image') # label = tf.placeholder(tf.float32, [None], name='label') predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN) # Horovod: adjust learning rate based on number of GPUs. opt = tf.train.RMSPropOptimizer(0.001 * hvd.size()) # Horovod: add Horovod Distributed Optimizer. opt = hvd.DistributedOptimizer(opt) # global_step = tf.contrib.framework.get_or_create_global_step() global_step = tf.train.get_or_create_global_step() train_op = opt.minimize(loss, global_step=global_step) hooks = [ # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable # states from rank 0 to all other processes. This is necessary to # ensure consistent initialization of all workers when training is # started with random weights or restored from a checkpoint. hvd.BroadcastGlobalVariablesHook(0), # Horovod: adjust number of steps based on number of GPUs. tf.train.StopAtStepHook(last_step=20000 // hvd.size()), tf.train.LoggingTensorHook(tensors={ 'step': global_step, 'loss': loss }, every_n_iter=10), ] # Horovod: save checkpoints only on worker 0 to prevent other workers from # corrupting them. checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None # The MonitoredTrainingSession takes care of session initialization, # restoring from a checkpoint, saving to a checkpoint, and closing when # done or an error occurs. with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, hooks=hooks, config=config) as mon_sess: while not mon_sess.should_stop(): # Run a training step synchronously. # image_, label_ = mnist.train.next_batch(100) # mon_sess.run(train_op, feed_dict={image: image_, label: label_}) mon_sess.run(train_op)
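# The hvd_keras.allreduce([0], name="Barrier") call above uses a collective
# as a barrier so that only rank 0 downloads the dataset. A minimal sketch of
# the same trick in plain Horovod/TF1 (my own wrapper, not from the script;
# assumes hvd.init() has been called): every rank must enter the allreduce
# before any rank can leave it, so non-zero ranks wait for rank 0.
import horovod.tensorflow as hvd
import tensorflow as tf

def barrier(session):
    # Builds a fresh allreduce op; acceptable for one-off synchronization.
    session.run(hvd.allreduce(tf.constant(0.0, name="barrier_in")))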
def train(self,
          iter_unit,
          num_iter,
          batch_size,
          weight_decay,
          learning_rate,
          learning_rate_decay_factor,
          learning_rate_decay_steps,
          rmsprop_decay,
          rmsprop_momentum,
          use_auto_loss_scaling,
          augment_data,
          warmup_steps=50,
          is_benchmark=False):

    if iter_unit not in ["epoch", "batch"]:
        raise ValueError(
            '`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])' % iter_unit)

    if self.run_hparams.data_dir is None and not is_benchmark:
        raise ValueError('`data_dir` must be specified for training!')

    if self.run_hparams.use_tf_amp:
        if use_auto_loss_scaling:
            if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
                LOGGER.log("TF Loss Auto Scaling is activated - Experimental Feature")
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "1"
            apply_manual_loss_scaling = False
        else:
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "0"
            apply_manual_loss_scaling = True
    else:
        apply_manual_loss_scaling = False

    if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
        LOGGER.log('Defining Model Estimator ...\n')

    global_batch_size = batch_size * self.num_gpus

    if self.run_hparams.data_dir is not None:
        filenames, num_samples, num_steps, num_epochs = self.dataset.get_dataset_runtime_specs(
            training=True,
            iter_unit=iter_unit,
            num_iter=num_iter,
            global_batch_size=global_batch_size)
        steps_per_epoch = int(num_steps / num_epochs)
    else:
        num_epochs = 1
        num_steps = num_iter
        steps_per_epoch = 625

    training_hooks = []

    if hvd_utils.is_using_hvd():
        training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
        training_hooks.append(
            ProfilerHook(global_batch_size=global_batch_size,
                         log_every=self.run_hparams.log_every_n_steps,
                         warmup_steps=warmup_steps,
                         is_training=True,
                         sample_dir=self.run_hparams.sample_dir))

        LOGGER.log('Starting Model Training ...\n')
        LOGGER.log("=> Epochs: %d" % num_epochs)
        LOGGER.log("=> Total Steps: %d" % num_steps)
        LOGGER.log("=> Steps per Epoch: %d" % steps_per_epoch)
        LOGGER.log("=> Weight Decay Factor: %.1e" % weight_decay)
        LOGGER.log("=> Learning Rate: %.1e" % learning_rate)
        LOGGER.log("=> Learning Rate Decay Factor: %.2f" % learning_rate_decay_factor)
        LOGGER.log("=> Learning Rate Decay Steps: %d" % learning_rate_decay_steps)
        LOGGER.log("=> RMSProp - Decay: %.1f" % rmsprop_decay)
        LOGGER.log("=> RMSProp - Momentum: %.1f" % rmsprop_momentum)
        LOGGER.log("=> Loss Function Name: %s" % self.run_hparams.loss_fn_name)
        if self.run_hparams.use_tf_amp:
            LOGGER.log("=> Use Auto Loss Scaling: %s" % use_auto_loss_scaling)
        LOGGER.log("=> # GPUs: %d" % self.num_gpus)
        LOGGER.log("=> GPU Batch Size: %d" % batch_size)
        LOGGER.log("=> Global Batch Size: %d" % global_batch_size)
        LOGGER.log("=> Total Files to be Processed: %d\n" % (num_steps * global_batch_size))

    estimator_params = {
        'batch_size': batch_size,
        'steps_per_epoch': steps_per_epoch,
        'learning_rate': learning_rate,
        'learning_rate_decay_steps': learning_rate_decay_steps,
        'learning_rate_decay_factor': learning_rate_decay_factor,
        'rmsprop_decay': rmsprop_decay,
        'rmsprop_momentum': rmsprop_momentum,
        'weight_decay': weight_decay,
        'apply_manual_loss_scaling': apply_manual_loss_scaling,
        'loss_fn_name': self.run_hparams.loss_fn_name,
        'debug_verbosity': self.run_hparams.debug_verbosity,
    }

    def training_data_fn():
        if not is_benchmark or self.run_hparams.data_dir is not None:
            return self.dataset.dataset_fn(
                batch_size=batch_size,
                training=True,
                only_defective_images=True,
                augment_data=augment_data,
                input_shape=list(self.run_hparams.input_shape) + [self.run_hparams.n_channels],
                mask_shape=list(self.run_hparams.mask_shape) + [self.run_hparams.n_channels],
                num_threads=64,
                use_gpu_prefetch=True,
                normalize_data_method="zero_centered",
                seed=self.run_hparams.seed)
        else:
            if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
                LOGGER.log("Using Synthetic Data ...")
            return self.dataset.synth_dataset_fn(
                batch_size=batch_size,
                training=True,
                input_shape=list(self.run_hparams.input_shape) + [self.run_hparams.n_channels],
                mask_shape=list(self.run_hparams.mask_shape) + [self.run_hparams.n_channels],
                num_threads=64,
                use_gpu_prefetch=True,
                normalize_data_method="zero_centered",
                only_defective_images=True,
                augment_data=augment_data,
                seed=self.run_hparams.seed)

    model = self._get_estimator(mode='train',
                                run_params=estimator_params,
                                use_xla=self.use_xla)

    try:
        model.train(
            input_fn=training_data_fn,
            steps=num_steps,
            hooks=training_hooks,
        )
    except KeyboardInterrupt:
        print("Keyboard interrupt")

    if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
        LOGGER.log('Ending Model Training ...')
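# The runners above branch repeatedly on `hvd_utils.is_using_hvd()`, a helper
# defined elsewhere. A hedged sketch of its assumed implementation: detect a
# Horovod launch from the Open MPI environment variables that
# mpirun/horovodrun export to every worker (the real helper may differ).
import os

def is_using_hvd():
    return all(var in os.environ
               for var in ("OMPI_COMM_WORLD_RANK", "OMPI_COMM_WORLD_SIZE"))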
def main(_):
    # causes memory fragmentation for bert leading to OOM
    if os.environ.get("TF_XLA_FLAGS", None) is not None:
        os.environ["TF_XLA_FLAGS"] += "--tf_xla_enable_lazy_compilation=false"
    else:
        os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false"

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)

    if FLAGS.horovod:
        hvd.init()

    processors = {
        "bc5cdr": BC5CDRProcessor,
        "clefe": CLEFEProcessor,
        'i2b2': I2b22012Processor
    }
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    tf.io.gfile.makedirs(FLAGS.output_dir)

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    master_process = True
    training_hooks = []
    global_batch_size = FLAGS.train_batch_size
    hvd_rank = 0

    config = tf.compat.v1.ConfigProto()
    if FLAGS.horovod:
        global_batch_size = FLAGS.train_batch_size * hvd.size()
        master_process = (hvd.rank() == 0)
        hvd_rank = hvd.rank()
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        if hvd.size() > 1:
            training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
        tf.enable_resource_variables()

    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir if master_process else None,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None,
        keep_checkpoint_max=1)

    if master_process:
        tf.compat.v1.logging.info("***** Configuration *****")
        for key in FLAGS.__flags.keys():
            tf.compat.v1.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
        tf.compat.v1.logging.info("**************************")

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank))

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        start_index = 0
        end_index = len(train_examples)
        tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]

        if FLAGS.horovod:
            tmp_filenames = [
                os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i))
                for i in range(hvd.size())
            ]
            num_examples_per_rank = len(train_examples) // hvd.size()
            remainder = len(train_examples) % hvd.size()
            if hvd.rank() < remainder:
                start_index = hvd.rank() * (num_examples_per_rank + 1)
                end_index = start_index + num_examples_per_rank + 1
            else:
                start_index = hvd.rank() * num_examples_per_rank + remainder
                end_index = start_index + (num_examples_per_rank)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(),
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_one_hot_embeddings=False,
        hvd=None if not FLAGS.horovod else hvd,
        amp=FLAGS.amp)

    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if FLAGS.do_train:
        # train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        # filed_based_convert_examples_to_features(
        #     train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
        filed_based_convert_examples_to_features(
            train_examples[start_index:end_index], label_list,
            FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank])
        tf.compat.v1.logging.info("***** Running training *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(train_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.compat.v1.logging.info("  Num steps = %d", num_train_steps)

        train_input_fn = file_based_input_fn_builder(
            input_file=tmp_filenames,  # train_file,
            batch_size=FLAGS.train_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            hvd=None if not FLAGS.horovod else hvd)

        # estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
        train_start_time = time.time()
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps,
                        hooks=training_hooks)
        train_time_elapsed = time.time() - train_start_time
        train_time_wo_overhead = training_hooks[-1].total_time
        avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed
        ss_sentences_per_second = (num_train_steps - training_hooks[-1].skipped) * global_batch_size * 1.0 / train_time_wo_overhead

        if master_process:
            tf.compat.v1.logging.info("-----------------------------")
            tf.compat.v1.logging.info("Total Training Time = %0.2f for Sentences = %d",
                                      train_time_elapsed,
                                      num_train_steps * global_batch_size)
            tf.compat.v1.logging.info("Total Training Time W/O Overhead = %0.2f for Sentences = %d",
                                      train_time_wo_overhead,
                                      (num_train_steps - training_hooks[-1].skipped) * global_batch_size)
            tf.compat.v1.logging.info("Throughput Average (sentences/sec) with overhead = %0.2f",
                                      avg_sentences_per_second)
            tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f",
                                      ss_sentences_per_second)
            dllogging.logger.log(step=(),
                                 data={"throughput_train": ss_sentences_per_second},
                                 verbosity=Verbosity.DEFAULT)
            tf.compat.v1.logging.info("-----------------------------")

    if FLAGS.do_eval and master_process:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        filed_based_convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)

        tf.compat.v1.logging.info("***** Running evaluation *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(eval_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_steps = None
        eval_drop_remainder = False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            batch_size=FLAGS.eval_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            tf.compat.v1.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
                dllogging.logger.log(step=(), data={key: float(result[key])},
                                     verbosity=Verbosity.DEFAULT)
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict and master_process:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples, label_list,
                                                 FLAGS.max_seq_length, tokenizer,
                                                 predict_file, mode="test")

        with tf.io.gfile.GFile(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
        if tf.io.gfile.exists(token_path):
            tf.io.gfile.remove(token_path)

        tf.compat.v1.logging.info("***** Running prediction *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(predict_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            batch_size=FLAGS.predict_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        eval_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
        eval_start_time = time.time()

        output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")
        test_labels_file = os.path.join(FLAGS.output_dir, "test_labels.txt")
        test_labels_err_file = os.path.join(FLAGS.output_dir, "test_labels_errs.txt")
        with tf.io.gfile.GFile(output_predict_file, 'w') as writer, \
                tf.io.gfile.GFile(test_labels_file, 'w') as tl, \
                tf.io.gfile.GFile(test_labels_err_file, 'w') as tle:
            print(id2label)
            i = 0
            for prediction in estimator.predict(input_fn=predict_input_fn,
                                                hooks=eval_hooks,
                                                yield_single_examples=True):
                output_line = "\n".join(id2label[id] for id in prediction if id != 0) + "\n"
                writer.write(output_line)
                result_to_pair(predict_examples[i], prediction, id2label, tl, tle)
                i = i + 1

        eval_time_elapsed = time.time() - eval_start_time
        time_list = eval_hooks[-1].time_list
        time_list.sort()
        # Removing outliers (init/warmup) in throughput computation.
        eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.99)])
        num_sentences = (int(len(time_list) * 0.99)) * FLAGS.predict_batch_size

        avg = np.mean(time_list)
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info("Total Inference Time = %0.2f for Sentences = %d",
                                  eval_time_elapsed,
                                  eval_hooks[-1].count * FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
                                  eval_time_wo_overhead, num_sentences)
        tf.compat.v1.logging.info("Summary Inference Statistics")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32")
        tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000)
        tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f",
                                  ss_sentences_per_second)
        dllogging.logger.log(step=(),
                             data={"throughput_val": ss_sentences_per_second},
                             verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")

        tf.compat.v1.logging.info('Reading: %s', test_labels_file)
        with tf.io.gfile.GFile(test_labels_file, "r") as f:
            counts = evaluate(f)
        eval_result = report_notprint(counts)
        print(''.join(eval_result))
        with tf.io.gfile.GFile(os.path.join(FLAGS.output_dir, 'test_results_conlleval.txt'), 'w') as fd:
            fd.write(''.join(eval_result))
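# Unlike the earlier script, the prediction path above drops the slowest 1%
# of per-batch times (which mostly contain session start-up and warm-up)
# before computing throughput. A standalone sketch of that correction; the
# helper name and `keep` parameter are my own.
def throughput_wo_outliers(time_list, batch_size, keep=0.99):
    times = sorted(time_list)
    kept = times[:int(len(times) * keep)]
    return (len(kept) * batch_size) / sum(kept)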
def bnn(args):
    # %% Model
    class Dummy():
        pass

    hvd.init()
    print("Rank is:", hvd.rank())
    tf.reset_default_graph()
    tf.set_random_seed(args.seed + hvd.rank())
    np.random.seed(args.seed + hvd.rank())
    tfd = tf.contrib.distributions

    N = args.X_train.shape[0]
    dim = list(args.X_train.shape[1:])
    K = args.Y_train.shape[1]  # num of classes
    X = tf.placeholder(tf.float32, [None] + dim)
    y = tf.placeholder(tf.float32, [None, K])

    neural_net = nnet.convnet(activation=args.activation,
                              numclass=K,
                              inshape=args.inshape,
                              isBay=True,
                              regularizer=args.regularizer,
                              priorstd=args.priorstd,
                              poststd=args.poststd,
                              repeatConv=args.repeatConv)
    logits = neural_net(X)
    labels_distribution = tfd.Categorical(logits=logits)

    # %% Loss
    neg_log_likelihood = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=logits))
    kl = sum(neural_net.losses) / N
    elbo_loss = neg_log_likelihood + args.KLscale * kl

    # %% Metrics
    correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # %% Posterior
    # names = []
    # qmeans = []
    # qstds = []
    # Wsample = []
    # for i, layer in enumerate(neural_net.layers):
    #     if hasattr(layer, "kernel_posterior"):
    #         q = layer.kernel_posterior
    #         names.append("Layer {}".format(i))
    #         qmeans.append(q.mean())
    #         qstds.append(q.stddev())
    #         Wsample.append(q.sample(args.num_monte_carlo))

    # Horovod training
    opt = tf.train.AdamOptimizer(args.learning_rate * hvd.size())
    opt = hvd.DistributedOptimizer(opt)
    global_step = tf.train.get_or_create_global_step()
    train_opt = opt.minimize(elbo_loss, global_step=global_step)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    # checkpoint_dir = './train_logs' if hvd.rank() == 0 else None
    hooks = [
        hvd.BroadcastGlobalVariablesHook(0)
        # tf.train.StopAtStepHook(last_step=max_iter // hvd.size())
    ]

    res_return = Dummy()
    res_return.plot = Dummy()
    res_return.plot.niter = []
    res_return.plot.runtime = []
    res_return.plot.loss = []
    res_return.plot.devAcc = []
    # res_return.plot.devAccMean = []
    # res_return.plot.devAccUp = []
    # res_return.plot.devAccDown = []
    # print("total iteration is", max_iter // hvd.size())

    with tf.train.MonitoredTrainingSession(hooks=hooks, config=config) as mon_sess:
        if hvd.rank() == 0:
            print("=" * 21 + "Optimization Start" + "=" * 21)
        start_time, algstart = time.time(), time.time()
        niter = 0
        for epoch in range(args.training_epochs // hvd.size()):
            perm = np.random.permutation(N)
            for i in range(0, N, args.batch_size):
                batch_x = args.X_train[perm[i:i + args.batch_size]]
                batch_y = args.Y_train[perm[i:i + args.batch_size]]
                _, loss_val, acc_val = mon_sess.run(
                    [train_opt, elbo_loss, accuracy],
                    feed_dict={X: batch_x, y: batch_y})
                niter += 1
                # print(niter, end=", ")
                if (niter * hvd.size()) % args.viz_steps == 0:
                    end_time = time.time()
                    # eval on dev set
                    acc_val_dev = np.asarray([
                        mon_sess.run(accuracy,
                                     feed_dict={X: args.X_test, y: args.Y_test})
                        for xyz in range(args.num_monte_carlo // hvd.size())
                    ])
                    # save
                    timediff = end_time - start_time
                    AccMean = np.mean(acc_val_dev)
                    AccStd = np.std(acc_val_dev)
                    res_return.plot.niter.append(niter)
                    res_return.plot.runtime.append(timediff)
                    res_return.plot.loss.append(loss_val)
                    res_return.plot.devAcc.append(acc_val_dev)
                    # res_return.plot.devAccMean.append(AccMean)
                    # res_return.plot.devAccUp.append(AccMean + AccStd)
                    # res_return.plot.devAccDown.append(AccMean - AccStd)
                    if hvd.rank() == 0:
                        print("Step: {:>3d} RunTime: {:.3f} Loss: {:.3f} "
                              "ACC: {:.3f} AccDevM: {:.3f} AccDevU: {:.3f}".format(
                                  niter, timediff, loss_val, acc_val, AccMean,
                                  AccMean + AccStd))
                    start_time = time.time()

        eval_start = time.time()
        if hvd.rank() == 0:
            print("=" * 21 + "Optimization Finish" + "=" * 21)
        tmp = [
            mon_sess.run([accuracy, labels_distribution.probs],
                         feed_dict={X: args.X_test, y: args.Y_test})
            for xyz in range(args.num_monte_carlo_test // hvd.size())
        ]
        [acc_val_test, probs] = list(zip(*tmp))
        acc_val_test = np.asarray(acc_val_test)
        eval_end = time.time()
        tot_time = eval_end - algstart
        eval_time = eval_end - eval_start
        if hvd.rank() == 0:
            print("Step: {:>3d} RunTime: {:.3f} TestAcc: {:.3f}".format(
                niter, tot_time, np.mean(acc_val_test)))

    res_return.tot_time = tot_time
    res_return.eval_time = eval_time
    # Return result
    res_return.probs = np.asarray(probs)
    res_return.acc = np.asarray(acc_val_test)
    # res_return.posterior = Dummy()
    # res_return.posterior.mean = qm_vals
    # res_return.posterior.std = qs_vals
    # res_return.posterior.samples = W_postsam
    # res_return.names = names
    return res_return
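# The test-time loop above draws num_monte_carlo_test stochastic forward
# passes and averages them. A compact sketch of that Monte Carlo estimate
# outside the session; the function and argument names are my own.
import numpy as np

def mc_predict(run_probs, num_samples=32):
    """run_probs() -> (batch, K) probabilities from one stochastic pass."""
    probs = np.mean([run_probs() for _ in range(num_samples)], axis=0)
    return probs.argmax(axis=1), probs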