예제 #1
0
    def run_epoch(self, state: TrainingState, data, metric_reporter: MetricReporter):
        """Run one pass over ``data`` for the stage recorded in ``state``.

        Unlike a plain forward/loss loop, each batch is handed to
        ``model.train_batch``, which returns the loss together with the data
        that is forwarded to the metric reporter. Gradients are cleared via
        ``self.zero_grads`` before each batch and the loss is passed to
        ``self.backprop`` afterwards.

        Args:
            state: Supplies the model as well as the stage, epoch and rank
                used when reporting.
            data: Iterable of batches.
            metric_reporter: Receives per-batch stats and produces the
                end-of-epoch metrics.

        Returns:
            Whatever ``metric_reporter.report_metric`` returns when metrics are
            reported for this stage, otherwise ``None``.
        """
        # Metrics are always reported outside of the TRAIN stage; during
        # training only when the config enables it.
        should_report = state.stage != Stage.TRAIN or self.config.report_train_metrics
        model = state.model

        for batch_id, batch in enumerate(data):
            self.zero_grads(state)
            with timing.time("model.train_batch"):
                loss, metric_data = model.train_batch(model, batch)
            self.backprop(state, loss)

            if not should_report:
                continue
            with timing.time("add metrics"):
                metric_reporter.add_batch_stats(
                    batch_id, *metric_data, **metric_reporter.batch_context(batch)
                )

        if not should_report:
            # Nothing is reported for this stage; invoke the reporter's reset
            # hook instead of report_metric.
            metric_reporter._reset()
            return None

        with timing.time("report metrics"):
            # Only the rank-0 worker is asked to print to the channels.
            return metric_reporter.report_metric(
                model, state.stage, state.epoch, print_to_channels=(state.rank == 0)
            )
예제 #2
0
파일: trainer.py 프로젝트: mksifakis/pytext
    def run_epoch(
        self, state: TrainingState, data: BatchIterator, metric_reporter: MetricReporter
    ):
        """Run one epoch over ``data``, delegating each step to ``self.run_step``.

        Batches are collected into groups and every group is passed to
        ``self.run_step``. Outside of the TRAIN stage each batch forms its own
        group; during training a group holds up to
        ``self.config.num_accumulated_batches`` batches.

        Args:
            state: Supplies the model, stage, epoch and rank (and, when present,
                the optimizer and privacy engine) used for reporting.
            data: Iterator over batches.
            metric_reporter: Forwarded to ``run_step`` and used for the final
                end-of-epoch report.

        Returns:
            The result of ``metric_reporter.report_metric`` when metrics are
            reported for this stage, otherwise ``None``.

        Raises:
            ValueError: If metrics are to be reported but ``data`` yielded no
                batches at all.
        """
        # NOTE: this method is overdue for a refactor; it is postponed because
        # it is tightly coupled to the metric reporter. Much of this logic
        # either changes in the NewTaskTrainer or should change with a better
        # metric reporter design.
        should_report = state.stage != Stage.TRAIN or self.config.report_train_metrics
        model = state.model
        pending = []
        saw_batch = False

        # Gradient accumulation: a batch that is too large for the GPU has to
        # be split into several micro-batches. For efficiency, params/gradients
        # should be synced at the original batch boundaries rather than at
        # every micro-batch boundary. num_accumulated_batches is the number of
        # micro-batches whose gradients are accumulated locally before syncing,
        # so the effective training batch size is
        # train_batch_size x num_accumulated_batches, which reduces the total
        # number of bytes sent over the network.
        for indexed_batch in enumerate(data):
            saw_batch = True
            pending.append(indexed_batch)
            flush_now = (
                state.stage != Stage.TRAIN
                or len(pending) == self.config.num_accumulated_batches
            )
            if flush_now:
                self.run_step(pending, state, metric_reporter, should_report)
                # Rebind rather than clear, so the list handed to run_step is
                # left untouched.
                pending = []
        if pending:
            # Leftover batches that did not fill a complete group.
            self.run_step(pending, state, metric_reporter, should_report)
            pending = []

        if not should_report:
            metric_reporter._reset()
            return None

        if not saw_batch:
            raise ValueError(
                f"Trying to report metric for stage {state.stage}, but no data was "
                "found. Either disable metric reporting for this stage, pass in "
                "non-empty data, or see if data fields are misnamed (warnings "
                "would appear in preceding stdout logs)."
            )

        with timing.time("report metrics"):
            return metric_reporter.report_metric(
                model,
                state.stage,
                state.epoch,
                print_to_channels=(state.rank == 0),
                # The optimizer is not present on the state during test.
                optimizer=getattr(state, "optimizer", None),
                privacy_engine=getattr(state, "privacy_engine", None),
            )
예제 #3
0
파일: trainer.py 프로젝트: buptkang/pytext
    def run_epoch(
        self, state: TrainingState, data: BatchIterator, metric_reporter: MetricReporter
    ):
        """Run one epoch: forward pass, loss, backprop and metric collection.

        Each item of ``data`` is unpacked into ``(inputs, targets, context)``.
        The model is called on the inputs, the loss is obtained from
        ``model.get_loss`` and handed to ``self.backprop``. On rank 0 a progress
        line is printed every ``self.config.num_samples_to_log_progress``
        batches.

        Args:
            state: Supplies the model, stage, epoch and rank.
            data: Iterator yielding ``(inputs, targets, context)`` triples.
            metric_reporter: Receives per-batch stats and produces the
                end-of-epoch metrics.

        Returns:
            The result of ``metric_reporter.report_metric`` when metrics are
            reported for this stage, otherwise ``None``.
        """
        # NOTE: this method is overdue for a refactor; it is postponed because
        # it is tightly coupled to the metric reporter. Much of this logic
        # either changes in the NewTaskTrainer or should change with a better
        # metric reporter design.
        should_report = state.stage != Stage.TRAIN or self.config.report_train_metrics
        model = state.model

        for batch_id, batch in enumerate(data):
            inputs, targets, context = batch
            self.zero_grads(state)
            # Hand the context to the model so the forward call can use it if
            # needed.
            model.contextualize(context)
            with timing.time("model.forward"):
                logits = model(*inputs)

            with timing.time("compute loss"):
                loss = model.get_loss(logits, targets, context)
                if BatchContext.IGNORE_LOSS in context:
                    # In-place scaling to zero; presumably so that this batch
                    # does not contribute to the update - TODO confirm.
                    loss *= 0

            self.backprop(state, loss)

            if should_report:
                with timing.time("add metrics"):
                    preds, scores = model.get_pred(
                        logits, targets, context, state.stage, *inputs
                    )
                    metric_reporter.add_batch_stats(
                        batch_id, preds, targets, scores, loss.item(), inputs, **context
                    )

            # Progress logging is limited to the rank-0 worker.
            if state.rank == 0:
                if batch_id % self.config.num_samples_to_log_progress == 0:
                    print(
                        f"Running batch {batch_id} for epoch {state.epoch} in {state.stage} stage",
                        flush=True,
                    )

        if not should_report:
            metric_reporter._reset()
            return None

        with timing.time("report metrics"):
            return metric_reporter.report_metric(
                model,
                state.stage,
                state.epoch,
                print_to_channels=(state.rank == 0),
            )
예제 #4
0
    def run_epoch(
        self, state: TrainingState, data: BatchIterator, metric_reporter: MetricReporter
    ):
        """Run one epoch over ``data``, delegating each step to ``self.run_step``.

        Batches are grouped before being passed to ``self.run_step``: outside
        of the TRAIN stage every batch is its own group, while during training
        a group holds up to ``self.config.num_accumulated_batches`` batches.

        Args:
            state: Supplies the model, stage, epoch and rank (and, when present,
                the optimizer) used for reporting.
            data: Iterator over batches.
            metric_reporter: Forwarded to ``run_step`` and used for the final
                end-of-epoch report.

        Returns:
            The result of ``metric_reporter.report_metric`` when metrics are
            reported for this stage, otherwise ``None``.
        """
        # NOTE: this method is overdue for a refactor; it is postponed because
        # it is tightly coupled to the metric reporter. Much of this logic
        # either changes in the NewTaskTrainer or should change with a better
        # metric reporter design.
        should_report = state.stage != Stage.TRAIN or self.config.report_train_metrics
        model = state.model
        group = []

        # Gradient accumulation: a batch that is too large for the GPU has to
        # be split into several micro-batches. For efficiency, params/gradients
        # should be synced at the original batch boundaries rather than at
        # every micro-batch boundary. num_accumulated_batches is the number of
        # micro-batches whose gradients are accumulated locally before syncing,
        # so the effective training batch size is
        # train_batch_size x num_accumulated_batches, which reduces the total
        # number of bytes sent over the network.
        for indexed_batch in enumerate(data):
            group.append(indexed_batch)
            group_is_ready = (
                state.stage != Stage.TRAIN
                or len(group) == self.config.num_accumulated_batches
            )
            if group_is_ready:
                self.run_step(group, state, metric_reporter, should_report)
                # Start a fresh list; the one handed to run_step is left as is.
                group = []
        if group:
            # Flush the trailing, incomplete group.
            self.run_step(group, state, metric_reporter, should_report)
            group = []

        if not should_report:
            metric_reporter._reset()
            return None

        with timing.time("report metrics"):
            return metric_reporter.report_metric(
                model,
                state.stage,
                state.epoch,
                print_to_channels=(state.rank == 0),
                # The optimizer is not present on the state during test.
                optimizer=getattr(state, "optimizer", None),
            )
예제 #5
0
    def _run_epoch(
        self,
        stage: Stage,
        epoch: int,
        batches,
        model: Model,
        metric_reporter: MetricReporter,
        pre_batch=lambda: None,
        backprop=lambda loss: None,
        rank=0,
        num_samples_to_log_progress: int = None,
    ):
        """Run one epoch, obtaining the loss through ``model.train_batch``.

        This differs from a plain forward/loss loop in that the model forward
        call is wrapped by ``model.train_batch``, which arranges tensors and
        returns the loss along with the data for the metric reporter.

        Args:
            stage: Stage being run; metrics are always reported when it is not
                ``Stage.TRAIN``.
            epoch: Epoch number, used in the log line and the metric report.
            batches: Iterable of batches.
            model: Model whose ``train_batch`` is called for every batch.
            metric_reporter: Receives per-batch stats and produces the
                end-of-epoch metrics.
            pre_batch: Callable invoked with no arguments before each batch.
            backprop: Callable invoked with the loss after each batch.
            rank: Worker rank; only rank 0 prints the report to the channels.
            num_samples_to_log_progress: Accepted but not used by this method.

        Returns:
            The result of ``metric_reporter.report_metric`` when metrics are
            reported for this stage, otherwise ``None``.
        """
        print(f"Rank {rank} worker: Running epoch #{epoch} for {stage}")
        should_report = stage != Stage.TRAIN or self.config.report_train_metrics

        for batch_id, batch in enumerate(batches):
            pre_batch()
            with timing.time("model.train_batch"):
                loss, metric_data = model.train_batch(batch)
            with timing.time("backprop"):
                backprop(loss)

            if not should_report:
                continue
            with timing.time("add metrics"):
                metric_reporter.add_batch_stats(
                    batch_id, *metric_data, **metric_reporter.batch_context(batch)
                )

        if not should_report:
            metric_reporter._reset()
            return None

        with timing.time("report metrics"):
            return metric_reporter.report_metric(
                model, stage, epoch, print_to_channels=(rank == 0)
            )
예제 #6
0
    def _run_epoch(
        self,
        stage: Stage,
        epoch: int,
        batches,
        model: Model,
        metric_reporter: MetricReporter,
        pre_batch=lambda: None,
        backprop=lambda loss, timer=None: None,
        rank=0,
    ):
        """Run one epoch, obtaining the loss through ``model.train_batch``.

        This differs from a plain forward/loss loop in that the model forward
        call is wrapped by ``model.train_batch``, which arranges tensors and
        returns the loss along with the data for the metric reporter.

        Args:
            stage: Stage being run; metrics are always reported when it is not
                ``Stage.TRAIN``.
            epoch: Epoch number, used in the log line and the metric report.
            batches: Iterable of ``(batch, tensors)`` pairs.
            model: Model that receives the batch context and the tensors.
            metric_reporter: Builds the batch context, receives per-batch stats
                and produces the end-of-epoch metrics.
            pre_batch: Callable invoked with no arguments before each batch.
            backprop: Callable invoked with the loss after each batch.
            rank: Worker rank; only rank 0 prints the report to the channels.

        Returns:
            The result of ``metric_reporter.report_metric`` when metrics are
            reported for this stage, otherwise ``None``.
        """
        print(f"Rank {rank} worker: Running epoch #{epoch} for {stage}")
        should_report = stage != Stage.TRAIN or self.config.report_train_metrics

        for batch_id, pair in enumerate(batches):
            batch, tensors = pair
            print(f"Batch {batch_id} has {len(batch)} examples")
            pre_batch()
            context = metric_reporter.batch_context(batch)
            # Hand the context to the model so the forward call can use it if
            # needed.
            model.contextualize(context)
            loss, metric_data = model.train_batch(tensors)
            if BatchContext.IGNORE_LOSS in context:
                # In-place scaling to zero; presumably so that this batch does
                # not contribute to the update - TODO confirm.
                loss *= 0
            backprop(loss)

            if not should_report:
                continue
            metric_reporter.add_batch_stats(batch_id, *metric_data, **context)

        if not should_report:
            metric_reporter._reset()
            return None

        return metric_reporter.report_metric(
            stage, epoch, print_to_channels=(rank == 0)
        )