def wait_for_training_examples(state, num_games):
    """Wait for training examples to be generated by the latest model.

    Args:
        state: the RL loop State instance.
        num_games: number of games to wait for.
    """
    model_dir = os.path.join(FLAGS.selfplay_dir, state.selfplay_model_name)
    pattern = os.path.join(model_dir, '*', '*', '*.tfrecord.zz')
    for i in itertools.count():
        try:
            paths = sorted(tf.io.gfile.glob(pattern))
        except tf.errors.OpError:
            paths = []
        if len(paths) >= num_games:
            mllogger = mllog.get_mllogger()
            mllog.config(filename="train.log")
            mllogger.event(key='actual_selfplay_games_per_generation',
                           value=len(paths))
            break
        if i % 30 == 0:
            logging.info('Waiting for %d games in %s (found %d)',
                         num_games, model_dir, len(paths))
        time.sleep(1)
def __init__(self, filename, benchmark, organization):
    self.mllogger = mllog.get_mllogger()
    self.comm_rank = comm.get_rank()
    self.comm_size = comm.get_size()
    self.constants = constants

    # create logging dir if it does not exist
    logdir = os.path.dirname(filename)
    if self.comm_rank == 0:
        if not os.path.isdir(logdir):
            os.makedirs(logdir)
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        torch.distributed.barrier()

    # create config
    mllog.config(filename=filename)
    self.mllogger.logger.propagate = False
    self.log_event(key=constants.SUBMISSION_BENCHMARK, value=benchmark)
    self.log_event(key=constants.SUBMISSION_ORG, value=organization)
    self.log_event(key=constants.SUBMISSION_DIVISION, value='closed')
    self.log_event(key=constants.SUBMISSION_STATUS, value='onprem')
    self.log_event(
        key=constants.SUBMISSION_PLATFORM,
        value=f'{self.comm_size}xSUBMISSION_PLATFORM_PLACEHOLDER')
def export_model(model_path):
    """Take the latest checkpoint and copy it to model_path.

    Assumes that all relevant model files are prefixed by the same name.
    (For example, foo.index, foo.meta and foo.data-00000-of-00001).

    Args:
        model_path: The path (can be a gs:// path) to export model.
    """
    FLAGS.use_bfloat16 = False
    estimator = tf.estimator.Estimator(model_fn, model_dir=FLAGS.work_dir,
                                       params=FLAGS.flag_values_dict())
    latest_checkpoint = estimator.latest_checkpoint()
    all_checkpoint_files = tf.io.gfile.glob(latest_checkpoint + '*')

    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")
    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False)

    for filename in all_checkpoint_files:
        suffix = filename.partition(latest_checkpoint)[2]
        destination_path = model_path + suffix
        logging.info('Copying {} to {}'.format(filename, destination_path))
        tf.io.gfile.copy(filename, destination_path)
def main(unused_argv):
    models = load_train_times()
    # Skip all models earlier than start and apply step.
    models = [x for x in models if int(x[1]) >= FLAGS.start][::FLAGS.step]

    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")
    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False)

    for i, (timestamp, name, path) in enumerate(models):
        epoch_num = FLAGS.start + i
        mllogger.start(key=mllog.constants.EVAL_START, value=epoch_num)
        winrate = evaluate_model(path, epoch_num)
        mllogger.end(key=mllog.constants.EVAL_STOP, value=epoch_num)
        if winrate >= FLAGS.winrate:
            print('Model {} beat target after {}s'.format(name, timestamp))
            break

    mllogger.event(key='eval_games', value=len(models))
    mllogger.event(key='gating_win_rate', value=FLAGS.winrate)
    mllogger.end(key=mllog.constants.RUN_STOP, value="success")
def main(argv): """Train on examples and export the updated model weights.""" tf_records = argv[1:] logging.info("Training on %s records: %s to %s", len(tf_records), tf_records[0], tf_records[-1]) if FLAGS.dist_train: hvd.init() mllogger = mllog.get_mllogger() mllog.config(filename="train.log") mllog.config(default_namespace="worker1", default_stack_offset=1, default_clear_line=False) with utils.logged_timer("Training"): train(*tf_records) if (not FLAGS.dist_train) or hvd.rank() == 0: if FLAGS.export_path: dual_net.export_model(FLAGS.export_path) epoch = int(os.path.basename(FLAGS.export_path)) mllogger.event(key="save_model", value={"Iteration": epoch}) if FLAGS.freeze: dual_net.freeze_graph(FLAGS.export_path, FLAGS.use_trt, FLAGS.trt_max_batch_size, FLAGS.trt_precision, FLAGS.selfplay_precision)
def mlperf_submission_log(benchmark):
    required_dist_init = ['RANK', 'WORLD_SIZE', 'MASTER_ADDR', 'MASTER_PORT']

    if all(var in os.environ for var in required_dist_init):
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    num_nodes = os.environ.get('SLURM_NNODES', 1)

    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'transformer.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False

    log_event(
        key=constants.SUBMISSION_BENCHMARK,
        value=benchmark,
    )
    log_event(key=constants.SUBMISSION_ORG, value='Fujitsu')
    log_event(key=constants.SUBMISSION_DIVISION, value='closed')
    log_event(key=constants.SUBMISSION_STATUS, value='onprem')
    log_event(key=constants.SUBMISSION_PLATFORM, value='1xGX2570M5')
def mlperf_submission_log(benchmark):
    num_nodes = os.environ.get('SLURM_NNODES', 1)

    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False

    log_event(
        key=constants.SUBMISSION_BENCHMARK,
        value=benchmark,
    )
    log_event(key=constants.SUBMISSION_ORG, value='NVIDIA')
    log_event(key=constants.SUBMISSION_DIVISION, value='closed')
    log_event(key=constants.SUBMISSION_STATUS, value='onprem')
    log_event(key=constants.SUBMISSION_PLATFORM,
              value=f'{num_nodes}xSUBMISSION_PLATFORM_PLACEHOLDER')
def main(_):
    if not tf.gfile.Exists(FLAGS.mlperf_log_dir):
        print("Creating directory %s" % FLAGS.mlperf_log_dir)
        tf.gfile.MakeDirs(FLAGS.mlperf_log_dir)

    mllog.config(filename=os.path.join(FLAGS.mlperf_log_dir,
                                       "mlperf_compliance.log"),
                 root_dir=os.path.normpath(
                     os.path.dirname(os.path.realpath(__file__))))
    mllogger = mllog.get_mllogger()
    mllogger.start(key=mllog_const.INIT_START)

    # Set logging level to INFO to display training progress (logged by the
    # estimator).
    tf.logging.set_verbosity(tf.logging.INFO)

    # Set random seed.
    if FLAGS.random_seed is None:
        raise Exception('No random seed given')
    print('Setting random seed =', FLAGS.random_seed)
    seed = FLAGS.random_seed
    random.seed(seed)
    tf.set_random_seed(seed)
    numpy.random.seed(seed)

    # Determine training schedule based on flags.
    if FLAGS.train_steps is not None and FLAGS.train_epochs is not None:
        raise ValueError("Both --train_steps and --train_epochs were set. "
                         "Only one may be defined.")
    if FLAGS.train_steps is None and FLAGS.train_epochs is None:
        FLAGS.train_epochs = mlbox_const.DEFAULT_TRAIN_EPOCHS

    params = mlbox_model_params.MLBoxTransformerParams(FLAGS)

    # Make sure that the BLEU source and ref files exist, if set.
    if FLAGS.bleu_source is not None and FLAGS.bleu_ref is not None:
        if not tf.gfile.Exists(FLAGS.bleu_source):
            raise ValueError("BLEU source file %s does not exist" %
                             FLAGS.bleu_source)
        if not tf.gfile.Exists(FLAGS.bleu_ref):
            raise ValueError("BLEU ref file %s does not exist" %
                             FLAGS.bleu_ref)

    mllogger.end(key=mllog_const.INIT_STOP)
    mllogger.start(key=mllog_const.RUN_START)

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=FLAGS.model_dir,
                                       params=params)
    train_schedule(estimator, params.train_eval_iterations,
                   params.single_iteration_train_steps,
                   params.single_iteration_train_epochs,
                   FLAGS.bleu_source, FLAGS.bleu_ref, FLAGS.bleu_threshold)

    mllogger.end(key=mllog_const.RUN_STOP)
def get_mllog_mlloger():
    from mlperf_logging import mllog
    from mlperf_compliance import tf_mlperf_log

    str_hvd_rank = str(hvd.rank()) if horovod_enabled() else "0"
    mllogger = mllog.get_mllogger()
    filenames = "resnet50v1.5.log-" + str_hvd_rank
    mllog.config(filename=filenames)
    workername = "worker" + str_hvd_rank
    mllog.config(
        default_namespace=workername,
        default_stack_offset=1,
        default_clear_line=False,
        root_dir=os.path.normpath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "..", "..")))

    return mllogger, mllog, tf_mlperf_log
def main(argv):
    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")
    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False)
    mllogger.event(key=mllog.constants.OPT_BASE_LR, value=FLAGS.lr_rates)
    mllogger.event(key='lr_rates', value=FLAGS.lr_rates)
    mllogger.event(key=mllog.constants.OPT_LR_DECAY_BOUNDARY_EPOCHS,
                   value=FLAGS.lr_boundaries[1])
    mllogger.event(key='lr_boundaries', value=FLAGS.lr_boundaries[1])
    mllogger.event(key=mllog.constants.OPT_WEIGHT_DECAY,
                   value=FLAGS.l2_strength)
    mllogger.event(key='opt_learning_rate_decay_boundary_steps',
                   value=FLAGS.lr_boundaries)
    mllogger.event(key='train_batch_size', value=FLAGS.train_batch_size)
def get_mllog_mlloger(output_dir=None):
    from mlperf_logging import mllog

    str_hvd_rank = str(hvd.rank()) if horovod_enabled() else "0"
    mllogger = mllog.get_mllogger()
    mllogger.propagate = False
    mllog.propagate = False
    if output_dir is None:
        output_dir = './log'
    filenames = os.path.normpath(output_dir) + "/result_rank_" + str_hvd_rank + ".txt"
    mllog.config(filename=filenames)
    workername = "worker" + str_hvd_rank
    mllog.config(
        default_namespace=workername,
        default_stack_offset=1,
        default_clear_line=False,
        root_dir=os.path.normpath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "..", "..")))

    return mllogger, mllog
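
# Usage sketch (not taken from the original submission): assuming Horovod is
# initialized and the helper above is importable as shown, a call site might
# look like this. The output directory and the logged values are illustrative.
mllogger, mllog = get_mllog_mlloger(output_dir="/tmp/mlperf_logs")
mllogger.start(key=mllog.constants.RUN_START)
mllogger.event(key=mllog.constants.GLOBAL_BATCH_SIZE, value=256)
mllogger.end(key=mllog.constants.RUN_STOP, metadata={"status": "success"})
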
def main(argv):
    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")
    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False,
                 root_dir=os.path.normpath("/tmp/"))
    mllogger.event(key='num_readouts', value=FLAGS.num_readouts)
    mllogger.event(key='value_init_penalty', value=FLAGS.value_init_penalty)
    mllogger.event(key='holdout_pct', value=FLAGS.holdout_pct)
    mllogger.event(key='disable_resign_pct', value=FLAGS.disable_resign_pct)
    mllogger.event(key='min_resign_threshold', value=FLAGS.min_resign_threshold)
    mllogger.event(key='max_resign_threshold', value=FLAGS.max_resign_threshold)
    mllogger.event(key='selfplay_threads', value=FLAGS.selfplay_threads)
    mllogger.event(key='parallel_games', value=FLAGS.parallel_inference)
    mllogger.event(key='virtual_losses', value=FLAGS.virtual_losses)
def main(argv): """Entry point for running one selfplay game.""" del argv # Unused flags.mark_flag_as_required('load_file') mllogger = mllog.get_mllogger() mllog.config(filename="train.log") mllog.config( default_namespace = "worker1", default_stack_offset = 1, default_clear_line = False) mllogger.event(key='parallel_games', value=FLAGS.parallel_inference) run_game( load_file=FLAGS.load_file, selfplay_dir=FLAGS.selfplay_dir, holdout_dir=FLAGS.holdout_dir, holdout_pct=FLAGS.holdout_pct, sgf_dir=FLAGS.sgf_dir)
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import numpy as np
import os

from mlperf_logging import mllog
from mlperf_logging.mllog import constants as mllog_const

mllogger = mllog.get_mllogger()
mllog.config(filename=(os.getenv("COMPLIANCE_FILE") or "mlperf_compliance.log"),
             root_dir=os.path.normpath(
                 os.path.dirname(os.path.realpath(__file__))))


def ssd_print(*args, sync=True, **kwargs):
    use_cuda = os.getenv('USE_CUDA')
    if sync and use_cuda == 'True':
        barrier()
    if get_rank() == 0:
        kwargs['stack_offset'] = 2
        mllogger.event(*args, **kwargs)


def barrier():
    """ Works as a temporary distributed barrier, currently pytorch
def main(unused_argv): """Run the reinforcement learning loop.""" logger = logging.getLogger() logger.setLevel(logging.INFO) formatter = logging.Formatter('[%(asctime)s] %(message)s', '%Y-%m-%d %H:%M:%S') for handler in logger.handlers: handler.setFormatter(formatter) mllogger = mllog.get_mllogger() mllog.config(filename="train.log") mllog.config(default_namespace="worker1", default_stack_offset=1, default_clear_line=False) mllogger.event(key=mllog.constants.SUBMISSION_ORG, value="Intel") mllogger.event(key=mllog.constants.SUBMISSION_PLATFORM, value="8 nodes x 4s CPX") mllogger.event(key=mllog.constants.SUBMISSION_DIVISION, value="closed") mllogger.event(key=mllog.constants.SUBMISSION_STATUS, value="onprem") mllogger.event(key=mllog.constants.SUBMISSION_BENCHMARK, value="minigo") mllogger.event(key='cache_clear', value=True) mllogger.event(key="filter_amount", value=FLAGS.train_filter) # The training loop must be bootstrapped; either by running bootstrap.sh # to generate training data from random games, or by running # copy_checkpoint.sh to copy an already generated checkpoint. model_dirs = list_selfplay_dirs(FLAGS.selfplay_dir) if not model_dirs: raise RuntimeError( 'Couldn\'t find any selfplay games under %s. Either bootstrap.sh ' 'or init_from_checkpoint.sh must be run before the train loop is ' 'started') model_num = int(os.path.basename(model_dirs[0])) mllogger.end(key=mllog.constants.INIT_STOP) mllogger.start(key=mllog.constants.RUN_START) with logged_timer('Total time'): try: state = State(model_num) wait( checked_run([ 'python3', 'parse_flags_train.py', '--flagfile={}'.format( os.path.join(FLAGS.flags_dir, 'train.flags')) ])) wait( checked_run([ 'python3', 'parse_flags_selfplay.py', '--flagfile={}'.format( os.path.join(FLAGS.flags_dir, 'selfplay.flags')) ])) mllogger.event(key="window_size", value=FLAGS.window_size) while state.iter_num <= FLAGS.iterations: mllogger.event(key=mllog.constants.EPOCH_START, value=None, metadata={"epoch_num": state.iter_num}) state.iter_num += 1 train(state) mllogger.event(key=mllog.constants.EPOCH_STOP, value=None, metadata={"epoch_num": state.iter_num}) if (FLAGS.precision == 'int8'): post_train(state) finally: asyncio.get_event_loop().close()
def configure_mllogger(log_dir):
    """Setup the MLPerf logger."""
    if not have_mlperf_logging:
        raise RuntimeError('mlperf_logging package unavailable')
    mllog.config(filename=os.path.join(log_dir, 'mlperf.log'))
    return mllog.get_mllogger()
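
# Note (assumption, not from the original module): the have_mlperf_logging
# flag checked above is presumably set by a guarded import at module level,
# along the lines of the following sketch.
try:
    from mlperf_logging import mllog
    have_mlperf_logging = True
except ImportError:
    have_mlperf_logging = False
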
def get_mlperf_logger(path, filename='mlperf.log'):
    mllog.config(filename=os.path.join(path, filename))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False
    return mllogger
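
# Usage sketch (illustrative, not from the original code): configure once at
# startup, then emit start/event/end records. The directory and the logged
# benchmark name below are examples, and a writable './results' is assumed.
from mlperf_logging.mllog import constants

mllogger = get_mlperf_logger('./results')
mllogger.start(key=constants.INIT_START)
mllogger.event(key=constants.SUBMISSION_BENCHMARK, value='bert')
mllogger.end(key=constants.INIT_STOP)
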
def main(async_executor=None):
    # Setup MLPerf logger
    mllog.config()
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False

    # Start MLPerf benchmark
    log_start(key=mlperf_constants.INIT_START, uniq=False)

    # Parse args
    args = parse_args()

    ############################################################################
    # Initialize various libraries (horovod, logger, amp ...)
    ############################################################################
    # Initialize async executor
    if args.async_val:
        assert async_executor is not None, \
            'Please use ssd_main_async.py to launch with async support'
    else:
        # (Force) disable async validation
        async_executor = None

    # Initialize horovod
    hvd.init()

    # Initialize AMP
    if args.precision == 'amp':
        amp.init(layout_optimization=True)

    # Set MXNET_SAFE_ACCUMULATION=1 if necessary
    if args.precision == 'fp16':
        os.environ["MXNET_SAFE_ACCUMULATION"] = "1"

    # Results folder
    network_name = f'ssd_{args.backbone}_{args.data_layout}_{args.dataset}_{args.data_shape}'
    save_prefix = None
    if args.results:
        save_prefix = os.path.join(args.results, network_name)
    else:
        logging.info("No results folder was provided. The script will not "
                     "write logs or save weights to disk")

    # Initialize logger
    log_file = None
    if args.results:
        log_file = f'{save_prefix}_{args.mode}_{hvd.rank()}.log'
    setup_logger(level=args.log_level
                 if hvd.local_rank() in args.log_local_ranks else 'CRITICAL',
                 log_file=log_file)

    # Set seed
    args.seed = set_seed_distributed(args.seed)
    ############################################################################

    ############################################################################
    # Validate arguments and print some useful information
    ############################################################################
    logging.info(args)

    assert not (args.resume_from and args.pretrained_backbone), (
        "--resume-from and --pretrained_backbone are mutually exclusive.")
    assert args.data_shape == 300, "only data_shape=300 is supported at the moment."
    assert args.input_batch_multiplier >= 1, "input_batch_multiplier must be >= 1"
    assert not (hvd.size() == 1 and args.gradient_predivide_factor > 1), (
        "Gradient predivide factor is not supported with a single GPU")
    if args.data_layout == 'NCHW' or args.precision == 'fp32':
        assert args.bn_group == 1, \
            "Group batch norm doesn't support FP32 data format or NCHW data layout."
        if not args.no_fuse_bn_relu:
            logging.warning("WARNING: fused batch norm relu is only supported "
                            "with NHWC layout. A non fused version will be forced.")
            args.no_fuse_bn_relu = True
        if not args.no_fuse_bn_add_relu:
            logging.warning("WARNING: fused batch norm add relu is only supported "
                            "with NHWC layout. A non fused version will be forced.")
            args.no_fuse_bn_add_relu = True
    if args.profile_no_horovod and hvd.size() > 1:
        logging.warning("WARNING: hvd.size() > 1, so must IGNORE requested "
                        "--profile-no-horovod")
        args.profile_no_horovod = False

    logging.info(f'Seed: {args.seed}')
    logging.info(f'precision: {args.precision}')
    if args.precision == 'fp16':
        logging.info(f'loss scaling: {args.fp16_loss_scale}')
    logging.info(f'network name: {network_name}')
    logging.info(f'fuse bn relu: {not args.no_fuse_bn_relu}')
    logging.info(f'fuse bn add relu: {not args.no_fuse_bn_add_relu}')
    logging.info(f'bn group: {args.bn_group}')
    logging.info(f'bn all reduce fp16: {args.bn_fp16}')
    logging.info(f'MPI size: {hvd.size()}')
    logging.info(f'MPI global rank: {hvd.rank()}')
    logging.info(f'MPI local rank: {hvd.local_rank()}')
    logging.info(f'async validation: {args.async_val}')
    ############################################################################

    # TODO(ahmadki): load network and anchors based on args.backbone (JoC)
    # Load network
    net = ssd_300_resnet34_v1_mlperf_coco(
        pretrained_base=False,
        nms_overlap_thresh=args.nms_overlap_thresh,
        nms_topk=args.nms_topk,
        nms_valid_thresh=args.nms_valid_thresh,
        post_nms=args.post_nms,
        layout=args.data_layout,
        fuse_bn_add_relu=not args.no_fuse_bn_add_relu,
        fuse_bn_relu=not args.no_fuse_bn_relu,
        bn_fp16=args.bn_fp16,
        norm_kwargs={'bn_group': args.bn_group})

    # precomputed anchors
    anchors_np = mlperf_xywh_anchors(image_size=args.data_shape, clip=True,
                                     normalize=True)
    if args.test_anchors and hvd.rank() == 0:
        logging.info(f'Normalized anchors: {anchors_np}')

    # Training mode
    train_net = None
    train_pipeline = None
    trainer_fn = None
    lr_scheduler = None
    if args.mode in ['train', 'train_val']:
        # Training iterator
        num_cropping_iterations = 1
        if args.use_tfrecord:
            tfrecord_files = glob.glob(
                os.path.join(args.tfrecord_root, 'train.*.tfrecord'))
            index_files = glob.glob(
                os.path.join(args.tfrecord_root, 'train.*.idx'))
            tfrecords = [(tfrecod, index)
                         for tfrecod, index in zip(tfrecord_files, index_files)]
        train_pipeline = get_training_pipeline(
            coco_root=args.coco_root if not args.use_tfrecord else None,
            tfrecords=tfrecords if args.use_tfrecord else None,
            anchors=anchors_np,
            num_shards=hvd.size(),
            shard_id=hvd.rank(),
            device_id=hvd.local_rank(),
            batch_size=args.batch_size * args.input_batch_multiplier,
            dataset_size=args.dataset_size,
            data_layout=args.data_layout,
            data_shape=args.data_shape,
            num_cropping_iterations=num_cropping_iterations,
            num_workers=args.dali_workers,
            fp16=args.precision == 'fp16',
            input_jpg_decode=args.input_jpg_decode,
            hw_decoder_load=args.hw_decoder_load,
            decoder_cache_size=(min((100 * 1024 + hvd.size() - 1) // hvd.size(),
                                    12 * 1024)
                                if args.input_jpg_decode == 'cache' else 0),
            seed=args.seed)
        log_event(key=mlperf_constants.TRAIN_SAMPLES,
                  value=train_pipeline.epoch_size)
        log_event(key=mlperf_constants.MAX_SAMPLES,
                  value=num_cropping_iterations)

        # Training network
        train_net = SSDMultiBoxLoss(net=net,
                                    local_batch_size=args.batch_size,
                                    bulk_last_wgrad=args.bulk_last_wgrad)

        # Trainer function. SSDModel expects a function that takes 1 parameter -
        # HybridBlock
        trainer_fn = functools.partial(
            sgd_trainer,
            learning_rate=args.lr,
            weight_decay=args.weight_decay,
            momentum=args.momentum,
            precision=args.precision,
            fp16_loss_scale=args.fp16_loss_scale,
            gradient_predivide_factor=args.gradient_predivide_factor,
            num_groups=args.horovod_num_groups,
            profile_no_horovod=args.profile_no_horovod)

        # Learning rate scheduler
        lr_scheduler = MLPerfLearningRateScheduler(
            learning_rate=args.lr,
            decay_factor=args.lr_decay_factor,
            decay_epochs=args.lr_decay_epochs,
            warmup_factor=args.lr_warmup_factor,
            warmup_epochs=args.lr_warmup_epochs,
            epoch_size=train_pipeline.epoch_size,
            global_batch_size=args.batch_size * hvd.size())

    # Validation mode
    infer_net = None
    val_iterator = None
    if args.mode in ['infer', 'val', 'train_val']:
        # Validation iterator
        tfrecord_files = glob.glob(
            os.path.join(args.tfrecord_root, 'val.*.tfrecord'))
        index_files = glob.glob(os.path.join(args.tfrecord_root, 'val.*.idx'))
        tfrecords = [(tfrecod, index)
                     for tfrecod, index in zip(tfrecord_files, index_files)]
        val_pipeline = get_inference_pipeline(
            coco_root=args.coco_root if not args.use_tfrecord else None,
            tfrecords=tfrecords if args.use_tfrecord else None,
            num_shards=hvd.size(),
            shard_id=hvd.rank(),
            device_id=hvd.local_rank(),
            batch_size=args.eval_batch_size,
            dataset_size=args.eval_dataset_size,
            data_layout=args.data_layout,
            data_shape=args.data_shape,
            num_workers=args.dali_workers,
            fp16=args.precision == 'fp16')
        log_event(key=mlperf_constants.EVAL_SAMPLES,
                  value=val_pipeline.epoch_size)

        # Inference network
        infer_net = COCOInference(net=net, ltrb=False, scale_bboxes=True,
                                  score_threshold=0.0)

        # annotations file
        cocoapi_annotation_file = os.path.join(
            args.coco_root, 'annotations', 'bbox_only_instances_val2017.json')

    # Prepare model
    model = SSDModel(net=net,
                     anchors_np=anchors_np,
                     precision=args.precision,
                     fp16_loss_scale=args.fp16_loss_scale,
                     train_net=train_net,
                     trainer_fn=trainer_fn,
                     lr_scheduler=lr_scheduler,
                     metric=mx.metric.Loss(),
                     infer_net=infer_net,
                     async_executor=async_executor,
                     save_prefix=save_prefix,
                     ctx=mx.gpu(hvd.local_rank()))

    # Do training and validation runs on fake data.
    # This will set layer shapes (needed before loading a pre-trained backbone),
    # allocate tensors and cache the optimized graph.
    # Training dry run:
    logging.info('Running training dry runs')
    dummy_train_pipeline = get_training_pipeline(
        coco_root=None,
        tfrecords=[('dummy.tfrecord', 'dummy.idx')],
        anchors=anchors_np,
        num_shards=1,
        shard_id=0,
        device_id=hvd.local_rank(),
        batch_size=args.batch_size * args.input_batch_multiplier,
        dataset_size=None,
        data_layout=args.data_layout,
        data_shape=args.data_shape,
        num_workers=args.dali_workers,
        fp16=args.precision == 'fp16',
        seed=args.seed)
    dummy_train_iterator = get_training_iterator(pipeline=dummy_train_pipeline,
                                                 batch_size=args.batch_size)
    for images, box_targets, cls_targets in dummy_train_iterator:
        model.train_step(images=images, box_targets=box_targets,
                         cls_targets=cls_targets)
    # Freeing memory is disabled due a bug in CUDA graphs
    # del dummy_train_pipeline
    # del dummy_train_iterator
    mx.ndarray.waitall()
    logging.info('Done')

    # Validation dry run:
    logging.info('Running inference dry runs')
    dummy_val_pipeline = get_inference_pipeline(
        coco_root=None,
        tfrecords=[('dummy.tfrecord', 'dummy.idx')],
        num_shards=1,
        shard_id=0,
        device_id=hvd.local_rank(),
        batch_size=args.eval_batch_size,
        dataset_size=None,
        data_layout=args.data_layout,
        data_shape=args.data_shape,
        num_workers=args.dali_workers,
        fp16=args.precision == 'fp16')
    dummy_val_iterator = get_inference_iterator(pipeline=dummy_val_pipeline)
    model.infer(data_iterator=dummy_val_iterator, log_interval=None)
    # Freeing memory is disabled due a bug in CUDA graphs
    # del dummy_val_pipeline
    # del dummy_val_iterator
    mx.ndarray.waitall()
    logging.info('Done')

    # re-initialize the model as a precaution in case the dry runs changed
    # the parameters
    model.init_model(force_reinit=True)
    model.zero_grads()
    mx.ndarray.waitall()

    # load saved model or pretrained backbone
    if args.resume_from:
        model.load_parameters(filename=args.resume_from)
    elif args.pretrained_backbone:
        model.load_pretrain_backbone(picklefile_name=args.pretrained_backbone)

    # broadcast parameters
    model.broadcast_params()
    mx.ndarray.waitall()

    if args.test_initialization and hvd.rank() == 0:
        model.print_params_stats(net)

    log_end(key=mlperf_constants.INIT_STOP)

    # Main MLPerf loop (training+validation)
    mpiwrapper.barrier()
    log_start(key=mlperf_constants.RUN_START)
    mpiwrapper.barrier()

    # Real data iterators
    train_iterator = None
    val_iterator = None
    if train_pipeline:
        train_iterator = get_training_iterator(pipeline=train_pipeline,
                                               batch_size=args.batch_size,
                                               synthetic=args.synthetic)
    if val_pipeline:
        val_iterator = get_inference_iterator(pipeline=val_pipeline)
    model_map, epoch = model.train_val(train_iterator=train_iterator,
                                       start_epoch=args.start_epoch,
                                       end_epoch=args.epochs,
                                       val_iterator=val_iterator,
                                       val_interval=args.val_interval,
                                       val_epochs=args.val_epochs,
                                       annotation_file=cocoapi_annotation_file,
                                       target_map=args.target_map,
                                       train_log_interval=args.log_interval,
                                       val_log_interval=args.log_interval,
                                       save_interval=args.save_interval,
                                       cocoapi_threads=args.cocoapi_threads,
                                       profile_start=args.profile_start,
                                       profile_stop=args.profile_stop)

    status = 'success' if (model_map and model_map >= args.target_map) else 'aborted'
    mx.ndarray.waitall()
    log_end(key=mlperf_constants.RUN_STOP, metadata={"status": status})

    logging.info(f'Rank {hvd.rank()} done. map={model_map} @ epoch={epoch}')
    mx.nd.waitall()
    hvd.shutdown()
from hccl.manage.api import get_rank_size
from hccl.manage.api import get_rank_id
from npu_bridge.estimator.npu import npu_compile
from npu_bridge.helper import helper

gen_npu_ops = helper.get_gen_ops()

###### npu ######
rank_size = int(os.getenv('RANK_SIZE'))
rank_id = int(os.getenv('RANK_ID').split("-")[-1])
device_id = int(os.getenv('DEVICE_ID')) + rank_id * 8
###############################

# MLPerf log
if device_id == 0:
    mllogger = mllog.get_mllogger()
    mllog.config(filename='resnet_close.log')
    mllog.config(
        default_namespace='worker1',
        default_stack_offset=1,
        default_clear_line=False,
        root_dir=os.path.normpath(os.path.dirname(os.path.realpath(__file__)))
    )
    mllogger.event(key=mllog.constants.SUBMISSION_BENCHMARK, value="resnet")
    mllogger.event(key=mllog.constants.SUBMISSION_DIVISION, value="open")
    mllogger.event(key=mllog.constants.SUBMISSION_ORG, value="SIAT")
    mllogger.event(key=mllog.constants.SUBMISSION_PLATFORM, value="Ascend 910")
    mllogger.event(key=mllog.constants.SUBMISSION_STATUS, value="cloud")
    mllogger.event(key=mllog.constants.CACHE_CLEAR)

params = {
    # 'data_dir': '/opt/dataset/imagenet_TF',
def main():
    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'unet3d.log'))
    mllog.config(filename=os.path.join("/results", 'unet3d.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False

    mllog_start(key=constants.INIT_START)

    flags = PARSER.parse_args()
    dllogger = get_dllogger(flags)
    local_rank = flags.local_rank
    device = get_device(local_rank)
    is_distributed = init_distributed()
    world_size = get_world_size()
    local_rank = get_rank()
    worker_seeds, shuffling_seeds = setup_seeds(flags.seed, flags.epochs, device)
    worker_seed = worker_seeds[local_rank]
    seed_everything(worker_seed)
    mllog_event(key=constants.SEED,
                value=flags.seed if flags.seed != -1 else worker_seed,
                sync=False)

    if is_main_process and flags.verbose:
        mlperf_submission_log()
        mlperf_run_param_log(flags)

    callbacks = get_callbacks(flags, dllogger, local_rank, world_size)
    flags.seed = worker_seed
    model = Unet3D(1, 3, normalization=flags.normalization,
                   activation=flags.activation)
    mllog_end(key=constants.INIT_STOP, sync=True)

    mllog_start(key=constants.RUN_START, sync=True)
    train_dataloader, val_dataloader = get_data_loaders(flags,
                                                        num_shards=world_size)
    mllog_event(key=constants.GLOBAL_BATCH_SIZE,
                value=flags.batch_size * world_size,
                sync=False)
    loss_fn = DiceCELoss(to_onehot_y=True, use_softmax=True,
                         layout=flags.layout,
                         include_background=flags.include_background)
    score_fn = DiceScore(to_onehot_y=True, use_argmax=True,
                         layout=flags.layout,
                         include_background=flags.include_background)

    if flags.exec_mode == 'train':
        train(flags, model, train_dataloader, val_dataloader, loss_fn, score_fn,
              device=device, callbacks=callbacks, is_distributed=is_distributed)
    elif flags.exec_mode == 'evaluate':
        eval_metrics = evaluate(flags, model, val_dataloader, loss_fn, score_fn,
                                device=device, is_distributed=is_distributed)
        if local_rank == 0:
            for key in eval_metrics.keys():
                print(key, eval_metrics[key])
    else:
        print("Invalid exec_mode.")
        pass
def dummy_example():
    """Example usage of mllog."""
    # Get the mllogger instance; this needs to be called in every module that
    # needs logging.
    mllogger = mllog.get_mllogger()

    # Customize mllogger configuration.
    # These configurations only need to be set once in your entire program.
    # Try tweaking the following configurations to see the difference.
    #   logger: Customize the underlying logger to change the logging behavior.
    #   filename: a log file to use. If set, a default file handler will be
    #     added to the logger so it can log to the specified file. For more
    #     advanced customizations, please set the 'logger' parameter instead.
    #   default_namespace: the default namespace to use if one isn't provided.
    #   default_stack_offset: the default depth to go into the stack to find
    #     the call site.
    #   default_clear_line: the default behavior of line clearing (i.e. print
    #     an extra new line to clear any pre-existing text in the log line).
    #   root_dir: directory prefix which will be trimmed when reporting calling
    #     file for logging.

    # Customize the underlying logger to use a file in addition to stdout.
    # 1. Simple way
    # Provide a filename; this adds a log file with default behavior.
    mllog.config(filename="example_simple.log")

    # 2. Advanced way
    # You may pass a logging.Logger instance to mllog.config().
    # To use the advanced way, comment out the "Simple way" above and uncomment
    # the following:
    #
    # # Notice that a proper log level needs to be set for both logger and
    # # handler.
    # logger = logging.getLogger("custom_logger")
    # logger.propagate = False
    # logger.setLevel(logging.DEBUG)
    # # add file handler for file logging
    # _file_handler = logging.FileHandler("example_advanced.log")
    # _file_handler.setLevel(logging.DEBUG)
    # logger.addHandler(_file_handler)
    # # add stream handler for stdout logging
    # _stream_handler = logging.StreamHandler(stream=sys.stdout)
    # _stream_handler.setLevel(logging.INFO)
    # logger.addHandler(_stream_handler)
    # mllog.config(logger=logger)

    # Set other logger configurations.
    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False,
                 root_dir=os.path.normpath(
                     os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  "..", "..")))

    # Example log messages.
    # The methods to use are "start", "end", and "event".
    # You may check out the detailed APIs in mllog.mllog.
    # Try to use the keys from mllog.constants to avoid wrong keys.
    mllogger.start(key=mllog.constants.INIT_START)
    mllogger.event(key=mllog.constants.SUBMISSION_ORG, value="Intel")
    mllogger.event(key=mllog.constants.SUBMISSION_PLATFORM,
                   value="1 node x 8s CPX")
    mllogger.event(key=mllog.constants.SUBMISSION_DIVISION, value="closed")
    mllogger.event(key=mllog.constants.SUBMISSION_STATUS, value="onprem")
    mllogger.event(key=mllog.constants.SUBMISSION_BENCHMARK, value="resnet")
    mllogger.event(key=mllog.constants.SUBMISSION_POC_NAME,
                   value="Wei Wang, Christine Cheng")
    mllogger.event(key=mllog.constants.SUBMISSION_POC_EMAIL,
                   value="[email protected], [email protected]")
    mllogger.event(key=mllog.constants.TRAIN_SAMPLES, value=1281167)
    mllogger.event(key="lars_opt_momentum", value=0.9)
    mllogger.end(key=mllog.constants.INIT_STOP)
def config_logger(benchmark):
    """Initiates the MLPerf logger."""
    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log'))
    _MLLOGGER.logger.propagate = False
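
# Note (assumption): _MLLOGGER is not defined in this snippet; presumably it
# is a module-level handle obtained from mllog. A sketch of what that setup
# and a call site might look like, with an illustrative benchmark name:
from mlperf_logging import mllog

_MLLOGGER = mllog.get_mllogger()

config_logger('dlrm')
_MLLOGGER.event(key=mllog.constants.SUBMISSION_BENCHMARK, value='dlrm')
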
def main(args):
    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)

    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'transformer.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False

    log_start(key=constants.INIT_START, log_all_ranks=True)

    # preinit and warmup streams/groups for allreduce communicators
    allreduce_communicators = None
    if args.distributed_world_size > 1 and args.enable_parallel_backward_allred_opt:
        allreduce_groups = [
            torch.distributed.new_group()
            for _ in range(args.parallel_backward_allred_cuda_nstreams)
        ]
        allreduce_streams = [
            torch.cuda.Stream()
            for _ in range(args.parallel_backward_allred_cuda_nstreams)
        ]
        for group, stream in zip(allreduce_groups, allreduce_streams):
            with torch.cuda.stream(stream):
                torch.distributed.all_reduce(torch.cuda.FloatTensor(1),
                                             group=group)
        allreduce_communicators = (allreduce_groups, allreduce_streams)

    if args.max_tokens is None:
        args.max_tokens = 6000

    print(args)

    log_event(key=constants.GLOBAL_BATCH_SIZE,
              value=args.max_tokens * args.distributed_world_size)
    log_event(key=constants.OPT_NAME, value=args.optimizer)
    assert len(args.lr) == 1
    log_event(key=constants.OPT_BASE_LR,
              value=args.lr[0] if len(args.lr) == 1 else args.lr)
    log_event(key=constants.OPT_LR_WARMUP_STEPS, value=args.warmup_updates)
    assert args.max_source_positions == args.max_target_positions
    log_event(key=constants.MAX_SEQUENCE_LENGTH,
              value=args.max_target_positions,
              metadata={'method': 'discard'})
    log_event(key=constants.OPT_ADAM_BETA_1, value=eval(args.adam_betas)[0])
    log_event(key=constants.OPT_ADAM_BETA_2, value=eval(args.adam_betas)[1])
    log_event(key=constants.OPT_ADAM_EPSILON, value=args.adam_eps)
    log_event(key=constants.SEED, value=args.seed)

    # L2 Sector Promotion
    pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
    result = ctypes.CDLL('libcudart.so').cudaDeviceSetLimit(ctypes.c_int(0x05),
                                                            ctypes.c_int(128))
    result = ctypes.CDLL('libcudart.so').cudaDeviceGetLimit(pValue,
                                                            ctypes.c_int(0x05))

    worker_seeds, shuffling_seeds = setup_seeds(
        args.seed,
        args.max_epoch + 1,
        torch.device('cuda'),
        args.distributed_rank,
        args.distributed_world_size,
    )
    worker_seed = worker_seeds[args.distributed_rank]
    print(f'Worker {args.distributed_rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.numel() for p in model.parameters())))

    # Build trainer
    if args.fp16:
        if args.distributed_weight_update != 0:
            from fairseq.fp16_trainer import DistributedFP16Trainer
            trainer = DistributedFP16Trainer(
                args, task, model, criterion,
                allreduce_communicators=allreduce_communicators)
        else:
            from fairseq.fp16_trainer import FP16Trainer
            trainer = FP16Trainer(
                args, task, model, criterion,
                allreduce_communicators=allreduce_communicators)
    else:
        if torch.cuda.get_device_capability(0)[0] >= 7:
            print('| NOTICE: your device may support faster training with --fp16')
        trainer = Trainer(args, task, model, criterion,
                          allreduce_communicators=None)

    # if (args.online_eval or args.target_bleu) and not args.remove_bpe:
    #     args.remove_bpe = '@@ '

    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Initialize dataloader
    max_positions = trainer.get_model().max_positions()

    # Send a dummy batch to warm the caching allocator
    dummy_batch = language_pair_dataset.get_dummy_batch_isolated(
        args.max_tokens, max_positions, 8)
    trainer.dummy_train_step(dummy_batch)

    # Train until the learning rate gets too small or model reaches target score
    max_epoch = args.max_epoch if args.max_epoch >= 0 else math.inf
    max_update = args.max_update or math.inf
    tgt_bleu = args.target_bleu or math.inf
    current_bleu = 0.0
    lr = trainer.get_lr()

    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]

    # mlperf compliance synchronization
    if args.distributed_world_size > 1:
        assert torch.distributed.is_initialized()
        torch.distributed.all_reduce(torch.cuda.FloatTensor(1))
        torch.cuda.synchronize()

    log_end(key=constants.INIT_STOP, sync=False)

    log_start(key=constants.RUN_START, sync=True)
    # second sync after RUN_START tag is printed.
    # this ensures no rank touches data until after RUN_START tag is printed.
    barrier()

    # Load dataset splits
    load_dataset_splits(task, ['train', 'test'])
    log_event(key=constants.TRAIN_SAMPLES,
              value=len(task.dataset(args.train_subset)),
              sync=False)
    log_event(key=constants.EVAL_SAMPLES,
              value=len(task.dataset(args.gen_subset)),
              sync=False)

    ctr = 0

    start = time.time()
    epoch_itr = data.EpochBatchIterator(
        dataset=task.dataset(args.train_subset),
        dataloader_num_workers=args.dataloader_num_workers,
        dataloader_pin_memory=args.enable_dataloader_pin_memory,
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences_valid,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seeds=shuffling_seeds,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
        epoch=epoch_itr.epoch if ctr != 0 else 0,
        bucket_growth_factor=args.bucket_growth_factor,
        seq_len_multiple=args.seq_len_multiple,
        batching_scheme=args.batching_scheme,
        batch_multiple_strategy=args.batch_multiple_strategy,
    )
    print("got epoch iterator", time.time() - start)

    # Main training loop
    while (lr >= args.min_lr and epoch_itr.epoch < max_epoch
           and trainer.get_num_updates() < max_update
           and current_bleu < tgt_bleu):
        first_epoch = epoch_itr.epoch + 1
        log_start(key=constants.BLOCK_START,
                  metadata={'first_epoch_num': first_epoch, 'epoch_count': 1},
                  sync=False)
        log_start(key=constants.EPOCH_START,
                  metadata={'epoch_num': first_epoch},
                  sync=False)
        gc.disable()

        # Load the latest checkpoint if one is available
        if ctr == 0:
            load_checkpoint(args, trainer, epoch_itr)

        # train for one epoch
        start = time.time()
        # exit(1)
        train(args, trainer, task, epoch_itr, shuffling_seeds)
        print("epoch time ", time.time() - start)

        start = time.time()
        log_end(key=constants.EPOCH_STOP,
                metadata={'epoch_num': first_epoch},
                sync=False)

        # Eval BLEU score
        if args.online_eval or (tgt_bleu is not math.inf):
            current_bleu = score(args, trainer, task, epoch_itr, args.gen_subset)
            log_event(key=constants.EVAL_ACCURACY,
                      value=float(current_bleu) / 100.0,
                      metadata={'epoch_num': first_epoch})

        gc.enable()

        # Only use first validation loss to update the learning rate
        # lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # Save checkpoint
        # if epoch_itr.epoch % args.save_interval == 0:
        #     save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        ctr = ctr + 1
        print("validation and scoring ", time.time() - start)
        log_end(key=constants.BLOCK_STOP,
                metadata={'first_epoch_num': first_epoch},
                sync=False)

    train_meter.stop()
    status = 'success' if current_bleu >= tgt_bleu else 'aborted'
    log_end(key=constants.RUN_STOP, metadata={'status': status})
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
    damping_each_step = np.array(damping_each_step).astype(np.float32)
    damping_now = damping_each_step[current_step:]
    return damping_now


if __name__ == '__main__':
    context.set_auto_parallel_context(device_num=device_num,
                                      parallel_mode=ParallelMode.DATA_PARALLEL,
                                      mirror_mean=True,
                                      parameter_broadcast=True)
    auto_parallel_context().set_all_reduce_fusion_split_indices(
        [43], "hccl_world_groupsum1")
    auto_parallel_context().set_all_reduce_fusion_split_indices(
        [27], "hccl_world_groupsum2")
    auto_parallel_context().set_all_reduce_fusion_split_indices(
        [27], "hccl_world_groupsum3")
    auto_parallel_context().set_all_reduce_fusion_split_indices(
        [27], "hccl_world_groupsum4")
    auto_parallel_context().set_all_reduce_fusion_split_indices(
        [27], "hccl_world_groupsum5")

    # add mllog
    mllog.config(filename=log_filename)
    mllog.config(
        default_namespace="mindspore",
        default_stack_offset=1,
        default_clear_line=False,
        root_dir=os.path.normpath(
            os.path.dirname(os.path.realpath(__file__))))
    mllogger = mllog.get_mllogger()
    # submission
    mllogger.event(key=mllog.constants.SUBMISSION_BENCHMARK, value="resnet")
    mllogger.event(key=mllog.constants.SUBMISSION_DIVISION, value="open")
    mllogger.event(key=mllog.constants.SUBMISSION_ORG, value="SIAT")
    mllogger.event(key=mllog.constants.SUBMISSION_PLATFORM, value="Ascend 910")
    mllogger.event(key=mllog.constants.SUBMISSION_STATUS, value="cloud")
    mllogger.event(key=mllog.constants.CACHE_CLEAR)
def set_defaults(opts):
    # Logs and checkpoint paths
    # Must be run last
    opts['summary_str'] += "Logging\n"
    name = opts['name']

    if opts["name_suffix"]:
        name = name + "_" + opts["name_suffix"]

    if opts.get("poplar_version"):
        name += "_v" + _extract_poplar_version(opts['poplar_version'])

    # We want this to be random even if random seeds have been set so that we
    # don't overwrite when re-running with the same seed.
    random_state = random.getstate()
    random.seed()
    rnd_str = ''.join(
        random.choice(string.ascii_uppercase + string.digits) for _ in range(3))
    random.setstate(random_state)

    name += "_{}".format(rnd_str)
    opts['summary_str'] += " Name: {name}\n"

    # Only instance 0 creates a log dir and logs to disk.
    # A log dir is also created when using validation.py (aka opts['training']==False).
    # Using train.py with --restore-path logs training results into that folder.
    if ((not opts['no_logs'])
            and (not opts['restore_path'] or not opts.get('training'))
            and (opts['distributed_worker_index'] == 0 or opts['log_all_instances'])):
        if "logs_path" not in opts or opts["logs_path"] is None:
            opts["logs_path"] = os.path.join(opts["log_dir"], '{}'.format(name))
        opts["checkpoint_path"] = os.path.join(opts["logs_path"], "ckpt")
        if not os.path.isdir(opts["logs_path"]):
            os.makedirs(opts["logs_path"])
        opts['summary_str'] += " Saving to {logs_path}\n"

        fname = os.path.join(opts["logs_path"], 'arguments.json')
        if os.path.isfile(fname):
            fname = os.path.join(opts["logs_path"], 'arguments_restore.json')
        with open(fname, 'w') as fp:
            json.dump(opts, fp, sort_keys=True, indent=4, separators=(',', ': '))
    elif (opts['restore_path']
          and (opts['distributed_worker_index'] == 0 or opts['log_all_instances'])):
        opts['logs_path'] = opts['restore_path']
        opts['checkpoint_path'] = os.path.join(opts['logs_path'], 'ckpt')
    else:
        opts["logs_path"] = None
        opts["log_dir"] = None
        opts["mlperf_logging"] = False
        opts["checkpoint_path"] = os.path.join('/tmp/', '{}/ckpt'.format(name))
        if not os.path.isdir(
                os.path.dirname(os.path.abspath(opts["checkpoint_path"]))):
            os.makedirs(
                os.path.dirname(os.path.abspath(opts["checkpoint_path"])))

    global MLPERF_LOGGING
    if (opts["mlperf_logging"] and MLPERF_LOGGING
            and opts['distributed_worker_index'] == 0):
        MLPERF_LOGGING = True
        seed = opts.get("seed", "None")
        try:
            mllog.config(default_namespace=mllog.constants.RESNET,
                         default_stack_offset=2,
                         default_clear_line=False,
                         root_dir=os.path.split(os.path.abspath(__file__))[0],
                         filename=os.path.join(opts["logs_path"],
                                               "result_{}.txt".format(seed)))
        except NameError:
            pass
    else:
        MLPERF_LOGGING = False

    return opts
from mlperf_utils.logs import hooks_helper
from mlperf_utils.logs import logger
from mlperf_utils.misc import model_helpers

global is_mpi
try:
    import horovod.tensorflow as hvd
    hvd.init()
    is_mpi = hvd.size()
except ImportError:
    is_mpi = 0
    print("No MPI horovod support, this is running in no-MPI mode!")

mllogger = mllog.get_mllogger()
filenames = "resnet50v1.5.log-" + str(hvd.rank())
mllog.config(filename=filenames)
workername = "worker" + str(hvd.rank())
mllog.config(
    default_namespace=workername,
    default_stack_offset=1,
    default_clear_line=False,
    root_dir=os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..")))
mllogger.event(key=mllog.constants.CACHE_CLEAR)
mllogger.start(key=mllog.constants.RUN_START)

_NUM_EXAMPLES_NAME = "num_examples"

_NUM_IMAGES = {
    'train': 1281167,
    'validation': 50000,
}
def configure_logger(benchmark):
    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False
def main(): """ Launches data-parallel multi-gpu training. """ mllog.config(filename=os.path.join( os.path.dirname(os.path.abspath(__file__)), 'gnmt.log')) mllogger = mllog.get_mllogger() mllogger.logger.propagate = False gnmt_start(key=constants.INIT_START) args = parse_args() device = utils.set_device(args.cuda, args.local_rank) distributed = utils.init_distributed(args.cuda) args.rank = utils.get_rank() if args.rank == 0: mlperf_submission_log(benchmark=constants.GNMT) if not args.cudnn: torch.backends.cudnn.enabled = False # create directory for results save_path = os.path.join(args.results_dir, args.save) args.save_path = save_path os.makedirs(save_path, exist_ok=True) # setup logging log_filename = f'log_rank_{utils.get_rank()}.log' utils.setup_logging(os.path.join(save_path, log_filename)) if args.env: utils.log_env_info() logging.info(f'Saving results to: {save_path}') logging.info(f'Run arguments: {args}') # automatically set train_iter_size based on train_global_batch_size, # world_size and per-worker train_batch_size if args.train_global_batch_size is not None: global_bs = args.train_global_batch_size bs = args.train_batch_size world_size = utils.get_world_size() assert global_bs % (bs * world_size) == 0 args.train_iter_size = global_bs // (bs * world_size) logging.info(f'Global batch size was set in the config, ' f'Setting train_iter_size to {args.train_iter_size}') gnmt_event(key='seed', value=args.seed) worker_seeds, shuffling_seeds = utils.setup_seeds(args.seed, args.epochs, device) worker_seed = worker_seeds[args.rank] logging.info(f'Worker {args.rank} is using worker seed: {worker_seed}') torch.manual_seed(worker_seed) # build tokenizer pad_vocab = utils.pad_vocabulary(args.math) tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME), pad_vocab) vocab_size = tokenizer.vocab_size # build GNMT model model_config = { 'hidden_size': args.hidden_size, 'num_layers': args.num_layers, 'dropout': args.dropout, 'batch_first': False, 'share_embedding': args.share_embedding } model = GNMT(vocab_size=vocab_size, **model_config) logging.info(model) batch_first = model.batch_first # define loss function (criterion) and optimizer criterion = build_criterion(vocab_size, config.PAD, args.smoothing) opt_config = {'optimizer': args.optimizer, 'lr': args.lr} opt_config.update(literal_eval(args.optimizer_extra)) logging.info(f'Training optimizer config: {opt_config}') scheduler_config = { 'warmup_steps': args.warmup_steps, 'remain_steps': args.remain_steps, 'decay_interval': args.decay_interval, 'decay_steps': args.decay_steps, 'decay_factor': args.decay_factor } logging.info(f'Training LR schedule config: {scheduler_config}') num_parameters = sum([l.nelement() for l in model.parameters()]) logging.info(f'Number of parameters: {num_parameters}') gnmt_end(key=constants.INIT_STOP, sync=True) gnmt_start(key=constants.RUN_START, sync=True) # build datasets gnmt_event(key=constants.MAX_SEQUENCE_LENGTH, value=args.max_length_train, sync=False, metadata={'method': 'discard'}) train_data = LazyParallelDataset( src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME), tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME), tokenizer=tokenizer, min_len=args.min_length_train, max_len=args.max_length_train, sort=False, max_size=args.max_size) val_data = ParallelDataset(src_fname=os.path.join(args.dataset_dir, config.SRC_VAL_FNAME), tgt_fname=os.path.join(args.dataset_dir, config.TGT_VAL_FNAME), tokenizer=tokenizer, min_len=args.min_length_val, 
max_len=args.max_length_val, sort=True) test_data = TextDataset(src_fname=os.path.join(args.dataset_dir, config.SRC_TEST_FNAME), tokenizer=tokenizer, min_len=args.min_length_test, max_len=args.max_length_test, sort=True) batching_opt = { 'shard_size': args.shard_size, 'num_buckets': args.num_buckets } # get data loaders train_loader = train_data.get_loader(batch_size=args.train_batch_size, seeds=shuffling_seeds, batch_first=batch_first, shuffle=True, batching=args.batching, batching_opt=batching_opt, num_workers=args.train_loader_workers) gnmt_event(key=constants.GLOBAL_BATCH_SIZE, value=args.train_batch_size * utils.get_world_size(), sync=False) val_loader = val_data.get_loader(batch_size=args.val_batch_size, batch_first=batch_first, shuffle=False, num_workers=args.val_loader_workers) test_loader = test_data.get_loader(batch_size=args.test_batch_size, batch_first=batch_first, shuffle=False, pad=True, num_workers=args.test_loader_workers) gnmt_event(key='training_samples', value=len(train_loader), sync=False) gnmt_event(key='evaluation_samples', value=len(val_loader), sync=False) translator = Translator(model=model, tokenizer=tokenizer, loader=test_loader, beam_size=args.beam_size, max_seq_len=args.max_length_test, len_norm_factor=args.len_norm_factor, len_norm_const=args.len_norm_const, cov_penalty_factor=args.cov_penalty_factor, cuda=args.cuda, print_freq=args.print_freq, dataset_dir=args.dataset_dir, target_bleu=args.target_bleu, save_path=args.save_path) # create trainer total_train_iters = len(train_loader) // args.train_iter_size * args.epochs save_info = { 'model_config': model_config, 'config': args, 'tokenizer': tokenizer.get_state() } trainer_options = dict(criterion=criterion, grad_clip=args.grad_clip, iter_size=args.train_iter_size, save_path=save_path, save_freq=args.save_freq, save_info=save_info, opt_config=opt_config, scheduler_config=scheduler_config, train_iterations=total_train_iters, batch_first=batch_first, keep_checkpoints=args.keep_checkpoints, math=args.math, print_freq=args.print_freq, cuda=args.cuda, distributed=distributed, intra_epoch_eval=args.intra_epoch_eval, translator=translator) trainer_options['model'] = model trainer = trainers.Seq2SeqTrainer(**trainer_options) # optionally resume from a checkpoint if args.resume: checkpoint_file = args.resume if os.path.isdir(checkpoint_file): checkpoint_file = os.path.join(checkpoint_file, 'model_best.pth') if os.path.isfile(checkpoint_file): trainer.load(checkpoint_file) else: logging.error(f'No checkpoint found at {args.resume}') # training loop best_loss = float('inf') break_training = False test_bleu = None for epoch in range(args.start_epoch, args.epochs): gnmt_start(key=constants.BLOCK_START, metadata={ 'first_epoch_num': epoch + 1, 'epoch_count': 1 }, sync=True) gnmt_start(key=constants.EPOCH_START, metadata={'epoch_num': epoch + 1}, sync=True) logging.info(f'Starting epoch {epoch}') train_loader.sampler.set_epoch(epoch) trainer.epoch = epoch train_loss, train_perf = trainer.optimize(train_loader) gnmt_end(key=constants.EPOCH_STOP, metadata={'epoch_num': epoch + 1}, sync=True) # evaluate on validation set if args.eval: logging.info(f'Running validation on dev set') val_loss, val_perf = trainer.evaluate(val_loader) # remember best prec@1 and save checkpoint if args.rank == 0: is_best = val_loss < best_loss best_loss = min(val_loss, best_loss) trainer.save(save_all=args.save_all, is_best=is_best) if args.eval: gnmt_start(key=constants.EVAL_START, value=epoch, metadata={'epoch_num': epoch + 1}, sync=True) 
test_bleu, break_training = translator.run(calc_bleu=True, epoch=epoch) gnmt_event(key=constants.EVAL_ACCURACY, value={ "epoch": epoch, "value": round(test_bleu, 2) }, metadata={'epoch_num': epoch + 1}, sync=False) gnmt_end(key=constants.EVAL_STOP, metadata={'epoch_num': epoch + 1}, sync=True) acc_log = [] acc_log += [f'Summary: Epoch: {epoch}'] acc_log += [f'Training Loss: {train_loss:.4f}'] if args.eval: acc_log += [f'Validation Loss: {val_loss:.4f}'] acc_log += [f'Test BLEU: {test_bleu:.2f}'] perf_log = [] perf_log += [f'Performance: Epoch: {epoch}'] perf_log += [f'Training: {train_perf:.0f} Tok/s'] if args.eval: perf_log += [f'Validation: {val_perf:.0f} Tok/s'] if args.rank == 0: logging.info('\t'.join(acc_log)) logging.info('\t'.join(perf_log)) gnmt_end(key=constants.BLOCK_STOP, metadata={ 'first_epoch_num': epoch + 1, 'epoch_count': 1 }, sync=True) logging.info(f'Finished epoch {epoch}') if break_training: break gnmt_end(key=constants.RUN_STOP, metadata={'status': 'success' if break_training else 'aborted'}, sync=True)