def run_epoch(self, state: TrainingState, data, metric_reporter: MetricReporter):
    """Our run_epoch is a bit different, because we're wrapping the model forward
    call with model.train_batch, which arranges tensors and gets loss, etc."""
    report_metric = state.stage != Stage.TRAIN or self.config.report_train_metrics
    model = state.model

    for batch_id, batch in enumerate(data):
        self.zero_grads(state)
        with timing.time("model.train_batch"):
            loss, metric_data = model.train_batch(model, batch)
        self.backprop(state, loss)

        if report_metric:
            with timing.time("add metrics"):
                metric_reporter.add_batch_stats(
                    batch_id, *metric_data, **metric_reporter.batch_context(batch)
                )

    metrics = None
    if report_metric:
        with timing.time("report metrics"):
            metrics = metric_reporter.report_metric(
                model, state.stage, state.epoch, print_to_channels=(state.rank == 0)
            )
    else:
        metric_reporter._reset()
    return metrics
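# For illustration only: a minimal sketch of the kind of method the docstring
# above refers to, i.e. a train_batch that "arranges tensors and gets loss".
# The helper names (arrange_model_inputs, arrange_targets), the get_pred
# signature, and the metric_data ordering are assumptions, not taken from the
# code in this file; the grounded part is the contract run_epoch relies on:
# the call returns (loss, metric_data), and metric_data is later splatted into
# metric_reporter.add_batch_stats.
def train_batch_sketch(model, batch):
    inputs = model.arrange_model_inputs(batch)  # assumed helper
    targets = model.arrange_targets(batch)      # assumed helper
    logits = model(*inputs)
    loss = model.get_loss(logits, targets, None)
    preds, scores = model.get_pred(logits)      # assumed signature
    metric_data = (preds, targets, scores, loss.item(), inputs)
    return loss, metric_data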
def run_epoch(
    self, state: TrainingState, data: BatchIterator, metric_reporter: MetricReporter
):
    # This method is due for some refactoring, pushing it off because it interacts
    # with the metric reporter too much. Much of the logic here either changes in
    # the NewTaskTrainer or should change with a better metric reporter design.
    report_metric = state.stage != Stage.TRAIN or self.config.report_train_metrics
    model = state.model
    samples = []
    is_data_empty = True

    """
    Sometimes a batch of inputs is too large to fit on the GPU and has to be
    split into several micro-batches. For efficiency, it helps to sync
    params/gradients only at the original batch boundaries instead of at every
    micro-batch boundary. num_accumulated_batches specifies how many batches of
    gradients to accumulate locally before syncing, so the effective
    training_batch_size = train_batch_size x num_accumulated_batches. This
    improves system performance by reducing the total bytes transferred over
    the network.
    """
    for sample in enumerate(data):  # each sample is a (batch_id, batch) pair
        is_data_empty = False
        samples.append(sample)
        if (
            state.stage != Stage.TRAIN
            or len(samples) == self.config.num_accumulated_batches
        ):
            self.run_step(samples, state, metric_reporter, report_metric)
            samples = []
    if samples:
        self.run_step(samples, state, metric_reporter, report_metric)
        samples = []

    metrics = None
    if report_metric:
        if is_data_empty:
            error_msg = (
                f"Trying to report metric for stage {state.stage}, but no data was "
                "found. Either disable metric reporting for this stage, pass in "
                "non-empty data, or see if data fields are misnamed (warnings "
                "would appear in preceding stdout logs)."
            )
            raise ValueError(error_msg)
        with timing.time("report metrics"):
            metrics = metric_reporter.report_metric(
                model,
                state.stage,
                state.epoch,
                print_to_channels=(state.rank == 0),
                # optimizer is not present during test
                optimizer=getattr(state, "optimizer", None),
                privacy_engine=getattr(state, "privacy_engine", None),
            )
    else:
        metric_reporter._reset()
    return metrics
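# For illustration only: a self-contained sketch (plain PyTorch, not this
# trainer's run_step) of the accumulation pattern described in the docstring
# above. Gradient sync is skipped for all but the last micro-batch via DDP's
# no_sync(), so the all-reduce and the optimizer step happen once per
# accumulated group instead of once per micro-batch.
import contextlib


def accumulate_and_step(ddp_model, optimizer, micro_batches, loss_fn):
    """micro_batches: list of (inputs, targets) pairs forming one logical batch."""
    optimizer.zero_grad()
    n = len(micro_batches)
    for idx, (inputs, targets) in enumerate(micro_batches):
        # Only the last micro-batch's backward triggers the gradient all-reduce.
        sync = contextlib.nullcontext() if idx == n - 1 else ddp_model.no_sync()
        with sync:
            # Divide by n so the accumulated gradient matches one large batch.
            loss = loss_fn(ddp_model(inputs), targets) / n
            loss.backward()
    optimizer.step()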
def run_epoch(
    self, state: TrainingState, data: BatchIterator, metric_reporter: MetricReporter
):
    # This method is due for some refactoring, pushing it off because it interacts
    # with the metric reporter too much. Much of the logic here either changes in
    # the NewTaskTrainer or should change with a better metric reporter design.
    report_metric = state.stage != Stage.TRAIN or self.config.report_train_metrics
    model = state.model

    for batch_id, (inputs, targets, context) in enumerate(data):
        self.zero_grads(state)

        # pass context to model to use in forward call if needed
        model.contextualize(context)
        with timing.time("model.forward"):
            logits = model(*inputs)

        with timing.time("compute loss"):
            loss = model.get_loss(logits, targets, context)
            if BatchContext.IGNORE_LOSS in context:
                loss *= 0

        self.backprop(state, loss)

        if report_metric:
            with timing.time("add metrics"):
                preds, scores = model.get_pred(
                    logits, targets, context, state.stage, *inputs
                )
                metric_reporter.add_batch_stats(
                    batch_id, preds, targets, scores, loss.item(), inputs, **context
                )
            if (
                state.rank == 0
                and batch_id % self.config.num_samples_to_log_progress == 0
            ):
                print(
                    f"Running batch {batch_id} for epoch {state.epoch} "
                    f"in {state.stage} stage",
                    flush=True,
                )

    metrics = None
    if report_metric:
        with timing.time("report metrics"):
            metrics = metric_reporter.report_metric(
                model, state.stage, state.epoch, print_to_channels=(state.rank == 0)
            )
    else:
        metric_reporter._reset()
    return metrics
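# For illustration only: why the IGNORE_LOSS path scales the loss by zero
# instead of skipping backprop entirely. Multiplying by zero keeps the autograd
# graph (and any backward hooks, e.g. distributed gradient sync) running while
# contributing exactly zero gradient. The distributed motivation is an
# assumption; the zero-gradient behaviour itself is standard autograd:
import torch

w = torch.nn.Parameter(torch.tensor(2.0))
loss = (w * 3.0) ** 2
loss = loss * 0  # same graph, zero contribution
loss.backward()
print(w.grad)    # tensor(0.)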
def run_epoch(
    self, state: TrainingState, data: BatchIterator, metric_reporter: MetricReporter
):
    # This method is due for some refactoring, pushing it off because it interacts
    # with the metric reporter too much. Much of the logic here either changes in
    # the NewTaskTrainer or should change with a better metric reporter design.
    report_metric = state.stage != Stage.TRAIN or self.config.report_train_metrics
    model = state.model
    samples = []

    """
    Sometimes a batch of inputs is too large to fit on the GPU and has to be
    split into several micro-batches. For efficiency, it helps to sync
    params/gradients only at the original batch boundaries instead of at every
    micro-batch boundary. num_accumulated_batches specifies how many batches of
    gradients to accumulate locally before syncing, so the effective
    training_batch_size = train_batch_size x num_accumulated_batches. This
    improves system performance by reducing the total bytes transferred over
    the network.
    """
    for sample in enumerate(data):  # each sample is a (batch_id, batch) pair
        samples.append(sample)
        if (
            state.stage != Stage.TRAIN
            or len(samples) == self.config.num_accumulated_batches
        ):
            self.run_step(samples, state, metric_reporter, report_metric)
            samples = []
    if samples:
        self.run_step(samples, state, metric_reporter, report_metric)
        samples = []

    metrics = None
    if report_metric:
        with timing.time("report metrics"):
            metrics = metric_reporter.report_metric(
                model,
                state.stage,
                state.epoch,
                print_to_channels=(state.rank == 0),
                # optimizer is not present during test
                optimizer=getattr(state, "optimizer", None),
            )
    else:
        metric_reporter._reset()
    return metrics
def _run_epoch(
    self,
    stage: Stage,
    epoch: int,
    batches,
    model: Model,
    metric_reporter: MetricReporter,
    pre_batch=lambda: None,
    backprop=lambda loss: None,
    rank=0,
    num_samples_to_log_progress: int = None,
):
    """Our run_epoch is a bit different, because we're wrapping the model forward
    call with model.train_batch, which arranges tensors and gets loss, etc."""
    print(f"Rank {rank} worker: Running epoch #{epoch} for {stage}")
    report_metric = stage != Stage.TRAIN or self.config.report_train_metrics

    for batch_id, batch in enumerate(batches):
        pre_batch()
        with timing.time("model.train_batch"):
            loss, metric_data = model.train_batch(batch)
        with timing.time("backprop"):
            backprop(loss)
        if report_metric:
            with timing.time("add metrics"):
                metric_reporter.add_batch_stats(
                    batch_id, *metric_data, **metric_reporter.batch_context(batch)
                )

    metrics = None
    if report_metric:
        with timing.time("report metrics"):
            metrics = metric_reporter.report_metric(
                model, stage, epoch, print_to_channels=(rank == 0)
            )
    else:
        metric_reporter._reset()
    return metrics
def _run_epoch(
    self,
    stage: Stage,
    epoch: int,
    batches,
    model: Model,
    metric_reporter: MetricReporter,
    pre_batch=lambda: None,
    backprop=lambda loss, timer=None: None,
    rank=0,
):
    """Our run_epoch is a bit different, because we're wrapping the model forward
    call with model.train_batch, which arranges tensors and gets loss, etc."""
    print(f"Rank {rank} worker: Running epoch #{epoch} for {stage}")
    report_metric = stage != Stage.TRAIN or self.config.report_train_metrics

    for batch_id, (batch, tensors) in enumerate(batches):
        print(f"Batch {batch_id} has {len(batch)} examples")
        pre_batch()
        context = metric_reporter.batch_context(batch)
        # pass context to model to use in forward call if needed
        model.contextualize(context)
        loss, metric_data = model.train_batch(tensors)
        if BatchContext.IGNORE_LOSS in context:
            loss *= 0
        backprop(loss)
        if report_metric:
            metric_reporter.add_batch_stats(batch_id, *metric_data, **context)

    metrics = None
    if report_metric:
        metrics = metric_reporter.report_metric(
            stage, epoch, print_to_channels=(rank == 0)
        )
    else:
        metric_reporter._reset()
    return metrics