def wait_for_training_examples(state, num_games):
    """Wait for training examples to be generated by the latest model.

    Args:
        state: the RL loop State instance.
        num_games: number of games to wait for.
    """
    model_dir = os.path.join(FLAGS.selfplay_dir, state.selfplay_model_name)
    pattern = os.path.join(model_dir, '*', '*', '*.tfrecord.zz')
    for i in itertools.count():
        try:
            paths = sorted(tf.io.gfile.glob(pattern))
        except tf.errors.OpError:
            paths = []
        if len(paths) >= num_games:
            mllogger = mllog.get_mllogger()
            mllog.config(filename="train.log")
            mllogger.event(key='actual_selfplay_games_per_generation',
                           value=len(paths))
            break
        if i % 30 == 0:
            logging.info('Waiting for %d games in %s (found %d)',
                         num_games, model_dir, len(paths))
        time.sleep(1)
def main(unused_argv):
    models = load_train_times()
    # Skip all models earlier than start and apply step.
    models = [x for x in models if int(x[1]) >= FLAGS.start][::FLAGS.step]

    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")
    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False)

    for i, (timestamp, name, path) in enumerate(models):
        epoch_num = FLAGS.start + i
        mllogger.start(key=mllog.constants.EVAL_START, value=epoch_num)
        winrate = evaluate_model(path, epoch_num)
        mllogger.end(key=mllog.constants.EVAL_STOP, value=epoch_num)
        if winrate >= FLAGS.winrate:
            print('Model {} beat target after {}s'.format(name, timestamp))
            break

    mllogger.event(key='eval_games', value=len(models))
    mllogger.event(key='gating_win_rate', value=FLAGS.winrate)
    mllogger.end(key=mllog.constants.RUN_STOP, value="success")
def mlperf_submission_log(benchmark):
    num_nodes = os.environ.get('SLURM_NNODES', 1)

    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False

    log_event(key=constants.SUBMISSION_BENCHMARK, value=benchmark)
    log_event(key=constants.SUBMISSION_ORG, value='NVIDIA')
    log_event(key=constants.SUBMISSION_DIVISION, value='closed')
    log_event(key=constants.SUBMISSION_STATUS, value='onprem')
    log_event(key=constants.SUBMISSION_PLATFORM,
              value=f'{num_nodes}xSUBMISSION_PLATFORM_PLACEHOLDER')
def export_model(model_path):
    """Take the latest checkpoint and copy it to model_path.

    Assumes that all relevant model files are prefixed by the same name.
    (For example, foo.index, foo.meta and foo.data-00000-of-00001).

    Args:
        model_path: The path (can be a gs:// path) to export the model to.
    """
    FLAGS.use_bfloat16 = False
    estimator = tf.estimator.Estimator(model_fn, model_dir=FLAGS.work_dir,
                                       params=FLAGS.flag_values_dict())
    latest_checkpoint = estimator.latest_checkpoint()
    all_checkpoint_files = tf.io.gfile.glob(latest_checkpoint + '*')

    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")
    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False)

    for filename in all_checkpoint_files:
        suffix = filename.partition(latest_checkpoint)[2]
        destination_path = model_path + suffix
        logging.info('Copying {} to {}'.format(filename, destination_path))
        tf.io.gfile.copy(filename, destination_path)
def mlperf_submission_log(benchmark):
    required_dist_init = ['RANK', 'WORLD_SIZE', 'MASTER_ADDR', 'MASTER_PORT']
    if all(var in os.environ for var in required_dist_init):
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    num_nodes = os.environ.get('SLURM_NNODES', 1)

    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'transformer.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False

    log_event(key=constants.SUBMISSION_BENCHMARK, value=benchmark)
    log_event(key=constants.SUBMISSION_ORG, value='Fujitsu')
    log_event(key=constants.SUBMISSION_DIVISION, value='closed')
    log_event(key=constants.SUBMISSION_STATUS, value='onprem')
    log_event(key=constants.SUBMISSION_PLATFORM, value='1xGX2570M5')
def main(argv): """Train on examples and export the updated model weights.""" tf_records = argv[1:] logging.info("Training on %s records: %s to %s", len(tf_records), tf_records[0], tf_records[-1]) if FLAGS.dist_train: hvd.init() mllogger = mllog.get_mllogger() mllog.config(filename="train.log") mllog.config(default_namespace="worker1", default_stack_offset=1, default_clear_line=False) with utils.logged_timer("Training"): train(*tf_records) if (not FLAGS.dist_train) or hvd.rank() == 0: if FLAGS.export_path: dual_net.export_model(FLAGS.export_path) epoch = int(os.path.basename(FLAGS.export_path)) mllogger.event(key="save_model", value={"Iteration": epoch}) if FLAGS.freeze: dual_net.freeze_graph(FLAGS.export_path, FLAGS.use_trt, FLAGS.trt_max_batch_size, FLAGS.trt_precision, FLAGS.selfplay_precision)
def __init__(self, filename, benchmark, organization):
    self.mllogger = mllog.get_mllogger()
    self.comm_rank = comm.get_rank()
    self.comm_size = comm.get_size()
    self.constants = constants

    # Create the logging dir if it does not exist.
    logdir = os.path.dirname(filename)
    if self.comm_rank == 0:
        if not os.path.isdir(logdir):
            os.makedirs(logdir)
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        torch.distributed.barrier()

    # Create config.
    mllog.config(filename=filename)
    self.mllogger.logger.propagate = False
    self.log_event(key=constants.SUBMISSION_BENCHMARK, value=benchmark)
    self.log_event(key=constants.SUBMISSION_ORG, value=organization)
    self.log_event(key=constants.SUBMISSION_DIVISION, value='closed')
    self.log_event(key=constants.SUBMISSION_STATUS, value='onprem')
    self.log_event(key=constants.SUBMISSION_PLATFORM,
                   value=f'{self.comm_size}xSUBMISSION_PLATFORM_PLACEHOLDER')
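# The constructor above calls self.log_event, which is not part of this
# snippet. A minimal sketch of what such a helper might look like,
# assuming rank-0 gating with an optional log_all_ranks override (the
# keyword names here are illustrative assumptions, not the original method):
def log_event(self, *args, log_all_ranks=False, **kwargs):
    # Only the rank-0 worker logs by default, to avoid duplicate lines.
    if log_all_ranks or self.comm_rank == 0:
        self.mllogger.event(*args, **kwargs)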
def maybe_set_seed():
    if FLAGS.training_seed != 0:
        random.seed(FLAGS.training_seed)
        tf.set_random_seed(FLAGS.training_seed)
        np.random.seed(FLAGS.training_seed)
        mllogger = mllog.get_mllogger()
        mllogger.event(key=mllog.constants.SEED, value=FLAGS.training_seed)
def log_submission_info(benchmark='cosmoflow',
                        org='UNDEFINED',
                        division='UNDEFINED',
                        status='UNDEFINED',
                        platform='UNDEFINED'):
    """Log general MLPerf submission details from config"""
    mllogger = mllog.get_mllogger()
    mllogger.event(key=mllog.constants.SUBMISSION_BENCHMARK, value=benchmark)
    mllogger.event(key=mllog.constants.SUBMISSION_ORG, value=org)
    mllogger.event(key=mllog.constants.SUBMISSION_DIVISION, value=division)
    mllogger.event(key=mllog.constants.SUBMISSION_STATUS, value=status)
    mllogger.event(key=mllog.constants.SUBMISSION_PLATFORM, value=platform)
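# A hypothetical call site, reading the submission details from a parsed
# config mapping (the 'submission' dict and its values are illustrative,
# not from the original repo):
submission = {'org': 'ExampleOrg', 'division': 'closed',
              'status': 'onprem', 'platform': '4xExampleNode'}
log_submission_info(benchmark='cosmoflow',
                    org=submission['org'],
                    division=submission['division'],
                    status=submission['status'],
                    platform=submission['platform'])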
def build_model(input_shape, target_size, conv_size=32, kernel_size=3,
                n_conv_layers=5, fc1_size=128, fc2_size=64, l2=0,
                hidden_activation='LeakyReLU', pooling_type='MaxPool3D',
                dropout=0.5):
    """Construct the CosmoFlow 3D CNN model"""

    if have_mlperf_logging:
        mllogger = mllog.get_mllogger()
        mllogger.event(key=mllog.constants.OPT_WEIGHT_DECAY, value=l2)
        mllogger.event(key='dropout', value=dropout)

    conv_args = dict(kernel_size=kernel_size, padding='same')
    hidden_activation = getattr(layers, hidden_activation)
    pooling_type = getattr(layers, pooling_type)

    model = tf.keras.models.Sequential()

    # First convolutional layer
    model.add(layers.Conv3D(conv_size, input_shape=input_shape, **conv_args))
    model.add(hidden_activation())
    model.add(pooling_type(pool_size=2))

    # Additional conv layers
    for i in range(1, n_conv_layers):
        # Double conv channels at every layer
        model.add(layers.Conv3D(conv_size * 2**i, **conv_args))
        model.add(hidden_activation())
        model.add(pooling_type(pool_size=2))

    model.add(layers.Flatten())

    # Fully-connected layers
    model.add(layers.Dense(fc1_size, kernel_regularizer=regularizers.l2(l2)))
    model.add(hidden_activation())
    model.add(layers.Dropout(dropout))
    model.add(layers.Dense(fc2_size, kernel_regularizer=regularizers.l2(l2)))
    model.add(hidden_activation())
    model.add(layers.Dropout(dropout))

    # Output layers
    model.add(layers.Dense(target_size, activation='tanh'))
    model.add(layers.Lambda(scale_1p2))

    return model
def get_optimizer(name, distributed=False, **opt_args):
    """Configure the optimizer"""

    # MLPerf logging
    if utils.distributed.rank() == 0 and have_mlperf_logging:
        mllogger = mllog.get_mllogger()
        mllogger.event(key=mllog.constants.OPT_NAME, value=name)

    # Construct the optimizer
    OptType = getattr(keras.optimizers, name)
    opt = OptType(**opt_args)

    # Distributed optimizer wrapper
    if distributed:
        opt = hvd.DistributedOptimizer(opt)

    return opt
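# A hypothetical call, constructing a Horovod-wrapped Keras SGD optimizer
# (the argument values are illustrative, not from the original config):
opt = get_optimizer('SGD', distributed=True,
                    learning_rate=0.001, momentum=0.9)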
def get_mllog_mlloger():
    from mlperf_logging import mllog
    from mlperf_compliance import tf_mlperf_log

    str_hvd_rank = str(hvd.rank()) if horovod_enabled() else "0"
    mllogger = mllog.get_mllogger()
    filenames = "resnet50v1.5.log-" + str_hvd_rank
    mllog.config(filename=filenames)
    workername = "worker" + str_hvd_rank
    mllog.config(
        default_namespace=workername,
        default_stack_offset=1,
        default_clear_line=False,
        root_dir=os.path.normpath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "..", "..")))
    return mllogger, mllog, tf_mlperf_log
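# Typical use at the start of training (a sketch of a plausible call site;
# RUN_START is a standard MLPerf marker, not code from this snippet):
mllogger, mllog, tf_mlperf_log = get_mllog_mlloger()
mllogger.start(key=mllog.constants.RUN_START)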
def test_mllog_end_simple(self):
    prefix = ":::MLLOG"
    expected_log_json = json.dumps(
        json.loads(r'''
        {
          "namespace": "",
          "time_ms": 1234567890123,
          "event_type": "INTERVAL_END",
          "key": "run_stop",
          "value": null,
          "metadata": {"file": "mybenchmark/file.py", "lineno": 42}
        }''', object_pairs_hook=collections.OrderedDict))
    expected_output = " ".join([prefix, expected_log_json])

    with _captured_stdout() as out:
        mllogger = mllog.get_mllogger()
        mllogger.end(mllog.constants.RUN_STOP, None)
    self.assertEqual(out.getvalue().splitlines()[0], expected_output)
def test_mllog_event_simple(self):
    prefix = ":::MLLOG"
    expected_log_json = json.dumps(
        json.loads(r'''
        {
          "namespace": "",
          "time_ms": 1234567890123,
          "event_type": "POINT_IN_TIME",
          "key": "eval_accuracy",
          "value": 0.99,
          "metadata": {"file": "mybenchmark/file.py", "lineno": 42}
        }''', object_pairs_hook=collections.OrderedDict))
    expected_output = " ".join([prefix, expected_log_json])

    with _captured_stdout() as out:
        mllogger = mllog.get_mllogger()
        mllogger.event(mllog.constants.EVAL_ACCURACY, 0.99)
    self.assertEqual(out.getvalue().splitlines()[0], expected_output)
def main(argv):
    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")
    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False)
    mllogger.event(key=mllog.constants.OPT_BASE_LR, value=FLAGS.lr_rates)
    mllogger.event(key='lr_rates', value=FLAGS.lr_rates)
    mllogger.event(key=mllog.constants.OPT_LR_DECAY_BOUNDARY_EPOCHS,
                   value=FLAGS.lr_boundaries[1])
    mllogger.event(key='lr_boundaries', value=FLAGS.lr_boundaries[1])
    mllogger.event(key=mllog.constants.OPT_WEIGHT_DECAY,
                   value=FLAGS.l2_strength)
    mllogger.event(key='opt_learning_rate_decay_boundary_steps',
                   value=FLAGS.lr_boundaries)
    mllogger.event(key='train_batch_size', value=FLAGS.train_batch_size)
def evaluate_model(eval_model_path, epoch):
    processes = []
    for i, device in enumerate(FLAGS.devices):
        a = i * FLAGS.num_games // len(FLAGS.devices)
        b = (i + 1) * FLAGS.num_games // len(FLAGS.devices)
        num_games = b - a
        env = os.environ.copy()
        env['CUDA_VISIBLE_DEVICES'] = device
        processes.append(checked_run([
            'numactl',
            '--physcpubind={}'.format(i),
            'bazel-bin/cc/eval',
            '--flagfile={}'.format(os.path.join(FLAGS.flags_dir,
                                                'eval.flags')),
            '--eval_model={}'.format(eval_model_path),
            '--target_model={}'.format(FLAGS.target),
            '--sgf_dir={}'.format(FLAGS.sgf_dir),
            '--parallel_games={}'.format(num_games),
            '--eval_device=cpu',
            '--target_device=cpu',
            '--verbose=false'
        ], env, False))

    all_output = wait(processes)
    total_wins = 0
    total_num_games = 0
    for output in all_output:
        lines = output.split('\n')
        eval_stats, target_stats = parse_win_stats_table(lines[-7:])
        num_games = eval_stats.total_wins + target_stats.total_wins
        total_wins += eval_stats.total_wins
        total_num_games += num_games

    mllogger = mllog.get_mllogger()
    mllogger.event(key=mllog.constants.EVAL_SAMPLES, value=total_num_games)

    win_rate = total_wins / total_num_games
    logging.info('Win rate %s vs %s: %.3f',
                 eval_stats.model_name, target_stats.model_name, win_rate)
    mllogger.event(key=mllog.constants.EVAL_ACCURACY, value=win_rate,
                   metadata={"epoch_num": epoch})
    return win_rate
def mx_resnet_print(key, val=None, metadata=None, deferred=False,
                    stack_offset=1, sync=False, uniq=True):
    rank = mpiwrapper.rank()
    if sync:
        mpiwrapper.barrier()

    if (uniq and rank == 0) or (not uniq):
        mllogger = mllog.get_mllogger()
        if key == mlperf_constants.RUN_START:
            mllogger.start(key=key, value=val, metadata=metadata)
        elif key == mlperf_constants.RUN_STOP:
            mllogger.end(key=key, value=val, metadata=metadata)
        else:
            mllogger.event(key=key, value=val, metadata=metadata)

    if sync:
        mpiwrapper.barrier()
    return
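# Hypothetical call sites bracketing a run with synchronized start/stop
# markers (the accuracy value and epoch number below are illustrative):
mx_resnet_print(mlperf_constants.RUN_START, sync=True)
mx_resnet_print(mlperf_constants.EVAL_ACCURACY, val=0.759,
                metadata={'epoch_num': 36})
mx_resnet_print(mlperf_constants.RUN_STOP,
                metadata={'status': 'success'}, sync=True)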
def get_mllog_mlloger(output_dir=None):
    from mlperf_logging import mllog

    str_hvd_rank = str(hvd.rank()) if horovod_enabled() else "0"
    mllogger = mllog.get_mllogger()
    # Stop log records from propagating to the root logger; the original
    # snippet set 'propagate' on the wrapper and the module, which has no
    # effect -- the attribute lives on the underlying logging.Logger.
    mllogger.logger.propagate = False
    if output_dir is None:
        output_dir = './log'
    filenames = (os.path.normpath(output_dir) + "/result_rank_"
                 + str_hvd_rank + ".txt")
    mllog.config(filename=filenames)
    workername = "worker" + str_hvd_rank
    mllog.config(
        default_namespace=workername,
        default_stack_offset=1,
        default_clear_line=False,
        root_dir=os.path.normpath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "..", "..")))
    return mllogger, mllog
def main(argv):
    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")
    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False,
                 root_dir=os.path.normpath("/tmp/"))
    mllogger.event(key='num_readouts', value=FLAGS.num_readouts)
    mllogger.event(key='value_init_penalty', value=FLAGS.value_init_penalty)
    mllogger.event(key='holdout_pct', value=FLAGS.holdout_pct)
    mllogger.event(key='disable_resign_pct', value=FLAGS.disable_resign_pct)
    mllogger.event(key='min_resign_threshold',
                   value=FLAGS.min_resign_threshold)
    mllogger.event(key='max_resign_threshold',
                   value=FLAGS.max_resign_threshold)
    mllogger.event(key='selfplay_threads', value=FLAGS.selfplay_threads)
    mllogger.event(key='parallel_games', value=FLAGS.parallel_inference)
    mllogger.event(key='virtual_losses', value=FLAGS.virtual_losses)
def main(argv): """Entry point for running one selfplay game.""" del argv # Unused flags.mark_flag_as_required('load_file') mllogger = mllog.get_mllogger() mllog.config(filename="train.log") mllog.config( default_namespace = "worker1", default_stack_offset = 1, default_clear_line = False) mllogger.event(key='parallel_games', value=FLAGS.parallel_inference) run_game( load_file=FLAGS.load_file, selfplay_dir=FLAGS.selfplay_dir, holdout_dir=FLAGS.holdout_dir, holdout_pct=FLAGS.holdout_pct, sgf_dir=FLAGS.sgf_dir)
def test_mllog_event_override_param(self):
    prefix = ":::MLLOG"
    expected_log_json = json.dumps(
        json.loads(r'''
        {
          "namespace": "worker1",
          "time_ms": 1231231230123,
          "event_type": "POINT_IN_TIME",
          "key": "eval_accuracy",
          "value": 0.99,
          "metadata": {"file": "mybenchmark/file.py", "lineno": 42}
        }''', object_pairs_hook=collections.OrderedDict))
    expected_output = "\n" + " ".join([prefix, expected_log_json]) + "\n"

    with _captured_stdout() as out:
        mllogger = mllog.get_mllogger()
        mllogger.event(mllog.constants.EVAL_ACCURACY, 0.99,
                       namespace="worker1", time_ms=1231231230123,
                       clear_line=True)
    self.assertEqual(out.getvalue(), expected_output)
def get_lr_schedule(base_lr, global_batch_size, base_batch_size=None,
                    scaling=None, n_warmup_epochs=0, decay_schedule={}):
    """Get the learning rate schedule function"""
    if scaling == 'linear':
        scale_factor = global_batch_size / base_batch_size
    elif scaling == 'sqrt':
        scale_factor = math.sqrt(global_batch_size / base_batch_size)
    else:
        scale_factor = 1.
    peak_lr = base_lr * scale_factor

    # MLPerf logging
    # NOTE: there is currently a confusing mismatch between the parameter
    # naming convention in this implementation and MLPerf's hyperparameter
    # conventions. Here we define the base LR to be the LR at a baseline
    # batch size and the "peak" LR to be the value scaled according to the
    # current batch size. We will leave things as-is for now.
    if utils.distributed.rank() == 0 and have_mlperf_logging:
        mllogger = mllog.get_mllogger()
        mllogger.event(key=mllog.constants.OPT_BASE_LR, value=peak_lr)
        mllogger.event(key=mllog.constants.OPT_LR_WARMUP_EPOCHS,
                       value=n_warmup_epochs)
        mllogger.event(key=mllog.constants.OPT_LR_WARMUP_FACTOR,
                       value=scale_factor)
        mllogger.event(key=mllog.constants.OPT_LR_DECAY_BOUNDARY_EPOCHS,
                       value=sorted(decay_schedule.keys()))
        mllogger.event(key=mllog.constants.OPT_LR_DECAY_FACTOR,
                       value=(max(decay_schedule.values())
                              if len(decay_schedule) > 0 else 1))

    return partial(_lr_schedule, base_lr=base_lr, peak_lr=peak_lr,
                   n_warmup_epochs=n_warmup_epochs,
                   decay_schedule=decay_schedule)
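# The _lr_schedule helper bound by the partial above is not shown. A
# minimal sketch of what it might look like, assuming linear warmup from
# base_lr to peak_lr followed by multiplicative decays at the scheduled
# epoch boundaries (an illustration, not the original implementation):
def _lr_schedule(epoch, base_lr, peak_lr, n_warmup_epochs, decay_schedule):
    if n_warmup_epochs > 0 and epoch < n_warmup_epochs:
        # Linear warmup toward the batch-size-scaled peak LR.
        return base_lr + (peak_lr - base_lr) * (epoch + 1) / n_warmup_epochs
    lr = peak_lr
    # Apply each decay factor once its boundary epoch has been passed.
    for decay_epoch, decay_factor in sorted(decay_schedule.items()):
        if epoch >= decay_epoch:
            lr *= decay_factor
    return lr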
def configure_logger(benchmark):
    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import torch
from mlperf_logging import mllog
from mlperf_logging.mllog import constants

mllogger = mllog.get_mllogger()


def configure_logger(benchmark):
    # get_mllogger() returns the shared MLLogger instance, so this also
    # reconfigures the module-level mllogger above.
    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False


def log_start(*args, **kwargs):
    _log(mllogger.start, *args, **kwargs)


def log_end(*args, **kwargs):
    _log(mllogger.end, *args, **kwargs)
def main(args):
    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)

    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'transformer.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False

    log_start(key=constants.INIT_START, log_all_ranks=True)

    # Preinit and warmup streams/groups for allreduce communicators
    allreduce_communicators = None
    if (args.distributed_world_size > 1
            and args.enable_parallel_backward_allred_opt):
        allreduce_groups = [
            torch.distributed.new_group()
            for _ in range(args.parallel_backward_allred_cuda_nstreams)
        ]
        allreduce_streams = [
            torch.cuda.Stream()
            for _ in range(args.parallel_backward_allred_cuda_nstreams)
        ]
        for group, stream in zip(allreduce_groups, allreduce_streams):
            with torch.cuda.stream(stream):
                torch.distributed.all_reduce(torch.cuda.FloatTensor(1),
                                             group=group)
        allreduce_communicators = (allreduce_groups, allreduce_streams)

    if args.max_tokens is None:
        args.max_tokens = 6000

    print(args)

    log_event(key=constants.GLOBAL_BATCH_SIZE,
              value=args.max_tokens * args.distributed_world_size)
    log_event(key=constants.OPT_NAME, value=args.optimizer)
    assert len(args.lr) == 1
    log_event(key=constants.OPT_BASE_LR,
              value=args.lr[0] if len(args.lr) == 1 else args.lr)
    log_event(key=constants.OPT_LR_WARMUP_STEPS, value=args.warmup_updates)
    assert args.max_source_positions == args.max_target_positions
    log_event(key=constants.MAX_SEQUENCE_LENGTH,
              value=args.max_target_positions,
              metadata={'method': 'discard'})
    log_event(key=constants.OPT_ADAM_BETA_1, value=eval(args.adam_betas)[0])
    log_event(key=constants.OPT_ADAM_BETA_2, value=eval(args.adam_betas)[1])
    log_event(key=constants.OPT_ADAM_EPSILON, value=args.adam_eps)
    log_event(key=constants.SEED, value=args.seed)

    # L2 Sector Promotion
    pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
    result = ctypes.CDLL('libcudart.so').cudaDeviceSetLimit(
        ctypes.c_int(0x05), ctypes.c_int(128))
    result = ctypes.CDLL('libcudart.so').cudaDeviceGetLimit(
        pValue, ctypes.c_int(0x05))

    worker_seeds, shuffling_seeds = setup_seeds(
        args.seed,
        args.max_epoch + 1,
        torch.device('cuda'),
        args.distributed_rank,
        args.distributed_world_size,
    )
    worker_seed = worker_seeds[args.distributed_rank]
    print(f'Worker {args.distributed_rank} is using worker seed: '
          f'{worker_seed}')
    torch.manual_seed(worker_seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.numel() for p in model.parameters())))

    # Build trainer
    if args.fp16:
        if args.distributed_weight_update != 0:
            from fairseq.fp16_trainer import DistributedFP16Trainer
            trainer = DistributedFP16Trainer(
                args, task, model, criterion,
                allreduce_communicators=allreduce_communicators)
        else:
            from fairseq.fp16_trainer import FP16Trainer
            trainer = FP16Trainer(
                args, task, model, criterion,
                allreduce_communicators=allreduce_communicators)
    else:
        if torch.cuda.get_device_capability(0)[0] >= 7:
            print('| NOTICE: your device may support faster training '
                  'with --fp16')
        trainer = Trainer(args, task, model, criterion,
                          allreduce_communicators=None)

    #if (args.online_eval or args.target_bleu) and not args.remove_bpe:
    #    args.remove_bpe = '@@ '

    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Initialize dataloader
    max_positions = trainer.get_model().max_positions()

    # Send a dummy batch to warm the caching allocator
    dummy_batch = language_pair_dataset.get_dummy_batch_isolated(
        args.max_tokens, max_positions, 8)
    trainer.dummy_train_step(dummy_batch)

    # Train until the learning rate gets too small or the model reaches
    # the target score
    max_epoch = args.max_epoch if args.max_epoch >= 0 else math.inf
    max_update = args.max_update or math.inf
    tgt_bleu = args.target_bleu or math.inf
    current_bleu = 0.0
    lr = trainer.get_lr()

    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]

    # mlperf compliance synchronization
    if args.distributed_world_size > 1:
        assert torch.distributed.is_initialized()
        torch.distributed.all_reduce(torch.cuda.FloatTensor(1))
        torch.cuda.synchronize()

    log_end(key=constants.INIT_STOP, sync=False)
    log_start(key=constants.RUN_START, sync=True)

    # Second sync after the RUN_START tag is printed. This ensures no rank
    # touches data until after the RUN_START tag is printed.
    barrier()

    # Load dataset splits
    load_dataset_splits(task, ['train', 'test'])
    log_event(key=constants.TRAIN_SAMPLES,
              value=len(task.dataset(args.train_subset)),
              sync=False)
    log_event(key=constants.EVAL_SAMPLES,
              value=len(task.dataset(args.gen_subset)),
              sync=False)

    ctr = 0
    start = time.time()
    epoch_itr = data.EpochBatchIterator(
        dataset=task.dataset(args.train_subset),
        dataloader_num_workers=args.dataloader_num_workers,
        dataloader_pin_memory=args.enable_dataloader_pin_memory,
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences_valid,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seeds=shuffling_seeds,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
        epoch=epoch_itr.epoch if ctr != 0 else 0,
        bucket_growth_factor=args.bucket_growth_factor,
        seq_len_multiple=args.seq_len_multiple,
        batching_scheme=args.batching_scheme,
        batch_multiple_strategy=args.batch_multiple_strategy,
    )
    print("got epoch iterator", time.time() - start)

    # Main training loop
    while (lr >= args.min_lr and epoch_itr.epoch < max_epoch
           and trainer.get_num_updates() < max_update
           and current_bleu < tgt_bleu):
        first_epoch = epoch_itr.epoch + 1
        log_start(key=constants.BLOCK_START,
                  metadata={'first_epoch_num': first_epoch,
                            'epoch_count': 1},
                  sync=False)
        log_start(key=constants.EPOCH_START,
                  metadata={'epoch_num': first_epoch},
                  sync=False)
        gc.disable()

        # Load the latest checkpoint if one is available
        if ctr == 0:
            load_checkpoint(args, trainer, epoch_itr)

        # Train for one epoch
        start = time.time()
        #exit(1)
        train(args, trainer, task, epoch_itr, shuffling_seeds)
        print("epoch time ", time.time() - start)

        start = time.time()
        log_end(key=constants.EPOCH_STOP,
                metadata={'epoch_num': first_epoch},
                sync=False)

        # Eval BLEU score
        if args.online_eval or tgt_bleu is not math.inf:
            current_bleu = score(args, trainer, task, epoch_itr,
                                 args.gen_subset)
            log_event(key=constants.EVAL_ACCURACY,
                      value=float(current_bleu) / 100.0,
                      metadata={'epoch_num': first_epoch})

        gc.enable()

        # Only use first validation loss to update the learning rate
        #lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # Save checkpoint
        #if epoch_itr.epoch % args.save_interval == 0:
        #    save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        ctr = ctr + 1
        print("validation and scoring ", time.time() - start)
        log_end(key=constants.BLOCK_STOP,
                metadata={'first_epoch_num': first_epoch},
                sync=False)

    train_meter.stop()
    status = 'success' if current_bleu >= tgt_bleu else 'aborted'
    log_end(key=constants.RUN_STOP, metadata={'status': status})
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def __init__(self, metric='val_mae', log_key='eval_error'):
    self.mllogger = mllog.get_mllogger()
    self.metric = metric
    self.log_key = log_key
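# This looks like the constructor of a Keras callback; its epoch-end hook
# is not part of this snippet. A plausible sketch, assuming the class
# derives from tf.keras.callbacks.Callback and reads the tracked metric
# from the logs dict (illustrative, not the original code):
def on_epoch_end(self, epoch, logs=None):
    logs = logs or {}
    if self.metric in logs:
        # Report the tracked validation metric under the MLPerf key.
        self.mllogger.event(key=self.log_key,
                            value=float(logs[self.metric]),
                            metadata={'epoch_num': epoch + 1})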
def train(*tf_records: "Records to train on"):
    """Train on examples."""
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    estimator = dual_net.get_estimator(FLAGS.num_intra_threads,
                                       FLAGS.num_inter_threads)

    if FLAGS.dist_train:
        effective_batch_size = int(FLAGS.train_batch_size / hvd.size())
        global_batch_size = effective_batch_size * hvd.size()
        mllogger = mllog.get_mllogger()
        mllogger.event(key=mllog.constants.GLOBAL_BATCH_SIZE,
                       value=global_batch_size)
    else:
        effective_batch_size = FLAGS.train_batch_size
        global_batch_size = FLAGS.train_batch_size
    logging.info("Real global batch size = {}, local batch size = {}.".format(
        global_batch_size, effective_batch_size))

    if FLAGS.use_tpu:
        effective_batch_size *= FLAGS.num_tpu_cores

    if FLAGS.use_tpu:
        if FLAGS.use_bt:
            def _input_fn(params):
                games = bigtable_input.GameQueue(
                    FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table)
                games_nr = bigtable_input.GameQueue(
                    FLAGS.cbt_project, FLAGS.cbt_instance,
                    FLAGS.cbt_table + '-nr')
                return preprocessing.get_tpu_bt_input_tensors(
                    games,
                    games_nr,
                    params['batch_size'],
                    params['input_layout'],
                    number_of_games=FLAGS.window_size,
                    random_rotation=True)
        else:
            def _input_fn(params):
                return preprocessing.get_tpu_input_tensors(
                    params['batch_size'],
                    params['input_layout'],
                    tf_records,
                    filter_amount=FLAGS.filter_amount,
                    shuffle_examples=FLAGS.shuffle_examples,
                    shuffle_buffer_size=FLAGS.shuffle_buffer_size,
                    random_rotation=True)
        # Hooks are broken with TPUEstimator at the moment.
        hooks = []
    else:
        def _input_fn():
            return preprocessing.get_input_tensors(
                effective_batch_size,
                FLAGS.input_layout,
                tf_records,
                filter_amount=FLAGS.filter_amount,
                shuffle_examples=FLAGS.shuffle_examples,
                shuffle_buffer_size=FLAGS.shuffle_buffer_size,
                random_rotation=True,
                seed=FLAGS.training_seed,
                dist_train=FLAGS.dist_train,
                use_bf16=FLAGS.use_bfloat16)

        hooks = [
            UpdateRatioSessionHook(FLAGS.work_dir),
            EchoStepCounterHook(output_dir=FLAGS.work_dir)
        ]
        if FLAGS.dist_train:
            hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    steps = FLAGS.steps_to_train
    if not steps and FLAGS.num_examples:
        batch_size = effective_batch_size
        if FLAGS.use_tpu:
            batch_size *= FLAGS.num_tpu_cores
        steps = math.floor(FLAGS.num_examples / batch_size)

    logging.info("Training, steps = %s, batch = %s -> %s examples",
                 steps or '?', effective_batch_size,
                 (steps * effective_batch_size) if steps else '?')

    if FLAGS.use_bt:
        games = bigtable_input.GameQueue(
            FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table)
        if not games.read_wait_cell():
            games.require_fresh_games(20000)
        latest_game = games.latest_game_number
        index_from = max(latest_game, games.read_wait_cell())
        print("== Last game before training:", latest_game, flush=True)
        print("== Wait cell:", games.read_wait_cell(), flush=True)

    try:
        estimator.train(_input_fn, steps=steps, hooks=hooks)
        if FLAGS.use_bt:
            bigtable_input.set_fresh_watermark(games, index_from,
                                               FLAGS.window_size)
    except:
        if FLAGS.use_bt:
            games.require_fresh_games(0)
        raise
""" Utilities for MLPerf logging """ import collections import os import subprocess from mlperf_logging import mllog from mlperf_logging.mllog import constants import torch _MLLOGGER = mllog.get_mllogger() def log_start(*args, **kwargs): "log with start tag" _log_print(_MLLOGGER.start, *args, **kwargs) def log_end(*args, **kwargs): "log with end tag" _log_print(_MLLOGGER.end, *args, **kwargs) def log_event(*args, **kwargs): "log with event tag" _log_print(_MLLOGGER.event, *args, **kwargs) def _log_print(logger, *args, **kwargs):
def main():
    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'unet3d.log'))
    # Note: this second config call overrides the log file set just above.
    mllog.config(filename=os.path.join("/results", 'unet3d.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False

    mllog_start(key=constants.INIT_START)

    flags = PARSER.parse_args()
    dllogger = get_dllogger(flags)
    local_rank = flags.local_rank
    device = get_device(local_rank)
    is_distributed = init_distributed()
    world_size = get_world_size()
    local_rank = get_rank()
    worker_seeds, shuffling_seeds = setup_seeds(flags.seed, flags.epochs,
                                                device)
    worker_seed = worker_seeds[local_rank]
    seed_everything(worker_seed)
    mllog_event(key=constants.SEED,
                value=flags.seed if flags.seed != -1 else worker_seed,
                sync=False)

    if is_main_process and flags.verbose:
        mlperf_submission_log()
        mlperf_run_param_log(flags)

    callbacks = get_callbacks(flags, dllogger, local_rank, world_size)
    flags.seed = worker_seed
    model = Unet3D(1, 3, normalization=flags.normalization,
                   activation=flags.activation)

    mllog_end(key=constants.INIT_STOP, sync=True)
    mllog_start(key=constants.RUN_START, sync=True)

    train_dataloader, val_dataloader = get_data_loaders(
        flags, num_shards=world_size)
    mllog_event(key=constants.GLOBAL_BATCH_SIZE,
                value=flags.batch_size * world_size,
                sync=False)

    loss_fn = DiceCELoss(to_onehot_y=True, use_softmax=True,
                         layout=flags.layout,
                         include_background=flags.include_background)
    score_fn = DiceScore(to_onehot_y=True, use_argmax=True,
                         layout=flags.layout,
                         include_background=flags.include_background)

    if flags.exec_mode == 'train':
        train(flags, model, train_dataloader, val_dataloader, loss_fn,
              score_fn, device=device, callbacks=callbacks,
              is_distributed=is_distributed)
    elif flags.exec_mode == 'evaluate':
        eval_metrics = evaluate(flags, model, val_dataloader, loss_fn,
                                score_fn, device=device,
                                is_distributed=is_distributed)
        if local_rank == 0:
            for key in eval_metrics.keys():
                print(key, eval_metrics[key])
    else:
        print("Invalid exec_mode.")
def dummy_example():
    """Example usage of mllog"""

    # Get the mllogger instance; this needs to be called in every module
    # that needs logging.
    mllogger = mllog.get_mllogger()

    # Customize mllogger configuration.
    # These configurations only need to be set once in your entire program.
    # Try tweaking the following configurations to see the difference.
    #   logger: Customize the underlying logger to change the logging
    #     behavior.
    #   filename: a log file to use. If set, a default file handler will be
    #     added to the logger so it can log to the specified file. For more
    #     advanced customizations, please set the 'logger' parameter
    #     instead.
    #   default_namespace: the default namespace to use if one isn't
    #     provided.
    #   default_stack_offset: the default depth to go into the stack to
    #     find the call site.
    #   default_clear_line: the default behavior of line clearing (i.e.
    #     print an extra new line to clear any pre-existing text in the
    #     log line).
    #   root_dir: directory prefix which will be trimmed when reporting
    #     the calling file for logging.

    # Customize the underlying logger to use a file in addition to stdout.

    # 1. Simple way
    # Provide a filename; this adds a log file with default behavior.
    mllog.config(filename="example_simple.log")

    # 2. Advanced way
    # You may pass a logging.Logger instance to mllog.config().
    # To use the advanced way, comment out the "Simple way" above and
    # uncomment the following:
    #
    # # Notice that the proper log level needs to be set for both logger
    # # and handler.
    # logger = logging.getLogger("custom_logger")
    # logger.propagate = False
    # logger.setLevel(logging.DEBUG)
    # # add file handler for file logging
    # _file_handler = logging.FileHandler("example_advanced.log")
    # _file_handler.setLevel(logging.DEBUG)
    # logger.addHandler(_file_handler)
    # # add stream handler for stdout logging
    # _stream_handler = logging.StreamHandler(stream=sys.stdout)
    # _stream_handler.setLevel(logging.INFO)
    # logger.addHandler(_stream_handler)
    # mllog.config(logger=logger)

    # Set other logger configurations.
    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False,
                 root_dir=os.path.normpath(
                     os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  "..", "..")))

    # Example log messages.
    # The methods to use are "start", "end", and "event".
    # You may check out the detailed APIs in mllog.mllog.
    # Try to use the keys from mllog.constants to avoid wrong keys.
    mllogger.start(key=mllog.constants.INIT_START)
    mllogger.event(key=mllog.constants.SUBMISSION_ORG, value="Intel")
    mllogger.event(key=mllog.constants.SUBMISSION_PLATFORM,
                   value="1 node x 8s CPX")
    mllogger.event(key=mllog.constants.SUBMISSION_DIVISION, value="closed")
    mllogger.event(key=mllog.constants.SUBMISSION_STATUS, value="onprem")
    mllogger.event(key=mllog.constants.SUBMISSION_BENCHMARK, value="resnet")
    mllogger.event(key=mllog.constants.SUBMISSION_POC_NAME,
                   value="Wei Wang, Christine Cheng")
    mllogger.event(key=mllog.constants.SUBMISSION_POC_EMAIL,
                   value="[email protected], [email protected]")
    mllogger.event(key=mllog.constants.TRAIN_SAMPLES, value=1281167)
    mllogger.event(key="lars_opt_momentum", value=0.9)
    mllogger.end(key=mllog.constants.INIT_STOP)