Example #1
    def test_recursive_copy_to_gpu(self):
        tensor_a = get_mock_tensor()
        tensor_b = get_mock_tensor()

        valid_gpu_copy_value = tensor_a
        gpu_value = util.recursive_copy_to_gpu(valid_gpu_copy_value)
        self.assertTrue(gpu_value.is_cuda)

        valid_recursive_copy_value = [[tensor_a]]
        gpu_value = util.recursive_copy_to_gpu(valid_recursive_copy_value)
        self.assertTrue(gpu_value[0][0].is_cuda)

        valid_gpu_copy_collections = [
            (tensor_a, tensor_b),
            [tensor_a, tensor_b],
            {
                "tensor_a": tensor_a,
                "tensor_b": tensor_b
            },
        ]
        for value in valid_gpu_copy_collections:
            gpu_value = util.recursive_copy_to_gpu(value)
            if isinstance(value, dict):
                self.assertTrue(gpu_value["tensor_a"].is_cuda)
                self.assertTrue(gpu_value["tensor_b"].is_cuda)
            else:
                self.assertEqual(len(gpu_value), 2)
                self.assertTrue(gpu_value[0].is_cuda)
                self.assertTrue(gpu_value[1].is_cuda)

        value = {"a": "b"}
        self.assertEqual(value, util.recursive_copy_to_gpu(value))
Example #2
    def test_recursive_copy_to_gpu(self):
        tensor_a = get_mock_tensor()
        tensor_b = get_mock_tensor()

        valid_gpu_copy_value = tensor_a
        gpu_value = util.recursive_copy_to_gpu(valid_gpu_copy_value)
        self.assertTrue(gpu_value.is_cuda)

        valid_recursive_copy_value = [[tensor_a]]
        gpu_value = util.recursive_copy_to_gpu(valid_recursive_copy_value)
        self.assertTrue(gpu_value[0][0].is_cuda)

        valid_gpu_copy_collections = [
            (tensor_a, tensor_b),
            [tensor_a, tensor_b],
            {
                "tensor_a": tensor_a,
                "tensor_b": tensor_b
            },
        ]
        for value in valid_gpu_copy_collections:
            gpu_value = util.recursive_copy_to_gpu(value)
            if isinstance(value, dict):
                self.assertTrue(gpu_value["tensor_a"].is_cuda)
                self.assertTrue(gpu_value["tensor_b"].is_cuda)
            else:
                self.assertEqual(len(gpu_value), 2)
                self.assertTrue(gpu_value[0].is_cuda)
                self.assertTrue(gpu_value[1].is_cuda)

        invalid_gpu_copy_values = [1234, True, 1.0]
        for value in invalid_gpu_copy_values:
            with self.assertRaises(AttributeError):
                gpu_value = util.recursive_copy_to_gpu(value)

        invalid_gpu_copy_depth = [
            ((((tensor_a, tensor_b), tensor_b), tensor_b), tensor_b),
            {
                "tensor_map_a": {
                    "tensor_map_b": {
                        "tensor_map_c": {
                            "tensor": tensor_a
                        }
                    }
                }
            },
            [[[[tensor_a, tensor_b], tensor_b], tensor_b], tensor_b],
            "abcd",  # Strings are sequences, includeing single char strings
        ]
        for value in invalid_gpu_copy_depth:
            with self.assertRaises(ValueError):
                gpu_value = util.recursive_copy_to_gpu(value, max_depth=3)
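The test variants above (and the one in Example #9 below) pin down the helper's contract: tensors come back as CUDA copies; lists, tuples, and dicts are walked recursively; and nesting deeper than max_depth raises ValueError. The variants disagree on leaves: Example #1 and Example #9 pass non-tensor values such as {"a": "b"} through unchanged, while Example #2 expects AttributeError for scalars and recurses into strings. The sketch below is a minimal approximation consistent with the pass-through behavior, assuming that is the later of the two; it is not Classy Vision's verbatim source.

import collections.abc
from typing import Any


def recursive_copy_to_gpu(value: Any, non_blocking: bool = True,
                          max_depth: int = 3, curr_depth: int = 0) -> Any:
    """Recursively copy tensors in nested lists/tuples/dicts to the GPU."""
    if curr_depth >= max_depth:
        raise ValueError("Depth of value object is too deep")

    if callable(getattr(value, "cuda", None)):  # tensors (and tensor-likes)
        return value.cuda(non_blocking=non_blocking)
    if isinstance(value, (list, tuple)):
        gpu_val = [recursive_copy_to_gpu(v, non_blocking, max_depth, curr_depth + 1)
                   for v in value]
        return gpu_val if isinstance(value, list) else tuple(gpu_val)
    if isinstance(value, collections.abc.Mapping):
        return {k: recursive_copy_to_gpu(v, non_blocking, max_depth, curr_depth + 1)
                for k, v in value.items()}
    # Non-tensor leaves (numbers, strings, ...) pass through unchanged
    return value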
Example #3
    def eval_step(self, use_gpu):
        self.last_batch = None

        # Process next sample
        sample = next(self.get_data_iterator())

        assert isinstance(
            sample, dict) and "input" in sample and "target" in sample, (
                f"Returned sample [{sample}] is not a map with 'input' and" +
                "'target' keys")

        # Copy sample to GPU
        target = sample["target"]
        if use_gpu:
            for key, value in sample.items():
                sample[key] = recursive_copy_to_gpu(value, non_blocking=True)

        with torch.no_grad():
            output = self.model(sample["input"])

            local_loss = self.compute_loss(output, sample)

            loss = local_loss.detach().clone()
            loss = all_reduce_mean(loss)

            self.losses.append(loss.data.cpu().item() * target.size(0))

            self.update_meters(output, sample)

        # Move some data to the task so hooks get a chance to access it
        self.last_batch = LastBatchInfo(loss=loss,
                                        output=output,
                                        target=target,
                                        sample=sample)
Example #4
    def train_step(self):
        """Train step to be executed in train loop."""

        self.last_batch = None

        # Process next sample
        with Timer() as timer:
            sample = next(self.data_iterator)

        assert isinstance(
            sample, dict) and "input" in sample and "target" in sample, (
                f"Returned sample [{sample}] is not a map with 'input' and" +
                "'target' keys")

        # Copy sample to GPU
        target = sample["target"]
        if self.use_gpu:
            sample = recursive_copy_to_gpu(sample, non_blocking=True)

        if self.mixup_transform is not None:
            sample = self.mixup_transform(sample)

        # Optional PyTorch AMP context
        torch_amp_context = (torch.cuda.amp.autocast() if self.amp_type
                             == AmpType.PYTORCH else contextlib.suppress())

        # only sync with DDP when we need to perform an optimizer step
        # an optimizer step can be skipped if gradient accumulation is enabled
        do_step = self._should_do_step()
        ctx_mgr_model = (self.distributed_model.no_sync()
                         if self.distributed_model is not None and not do_step
                         else contextlib.suppress())
        ctx_mgr_loss = (self.distributed_loss.no_sync()
                        if self.distributed_loss is not None and not do_step
                        else contextlib.suppress())

        with ctx_mgr_model, ctx_mgr_loss:
            # Forward pass
            with torch.enable_grad(), torch_amp_context:
                output = self.compute_model(sample)

                local_loss = self.compute_loss(output, sample)
                loss = local_loss.detach().clone()
                self.losses.append(loss.data.cpu().item())

                self.update_meters(output, sample)

            # Backwards pass + optimizer step
            self.run_optimizer(local_loss)

        self.num_updates += self.get_global_batchsize()

        # Move some data to the task so hooks get a chance to access it
        self.last_batch = LastBatchInfo(
            loss=loss,
            output=output,
            target=target,
            sample=sample,
            step_data={"sample_fetch_time": timer.elapsed_time},
        )
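The no_sync() juggling above is what makes gradient accumulation work under DistributedDataParallel: on pure accumulation steps the context manager suppresses the gradient all-reduce, and only the step that actually runs the optimizer pays for synchronization. A stripped-down sketch of that pattern follows, with illustrative names (ddp_model, criterion, optimizer, loader, accum_steps are stand-ins, not the task's attributes); contextlib.nullcontext() plays the no-op role that contextlib.suppress() plays in the snippet.

import contextlib


def train_epoch(ddp_model, criterion, optimizer, loader, accum_steps=4):
    # Hypothetical helper: accumulate gradients over accum_steps batches,
    # paying for DDP's gradient all-reduce only on the real optimizer step.
    for i, batch in enumerate(loader):
        do_step = (i + 1) % accum_steps == 0
        # no_sync() disables DDP gradient synchronization for this backward
        ctx = ddp_model.no_sync() if not do_step else contextlib.nullcontext()
        with ctx:
            loss = criterion(ddp_model(batch["input"]), batch["target"])
            loss.backward()  # grads accumulate locally until the sync step
        if do_step:
            optimizer.step()
            optimizer.zero_grad()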
Example #5
    def train_step(self):
        """Train step to be executed in train loop."""

        self.last_batch = None

        # Process next sample
        with Timer() as timer:
            sample = next(self.get_data_iterator())

        assert isinstance(sample, dict) and "input" in sample and "target" in sample, (
            f"Returned sample [{sample}] is not a map with 'input' and"
            + "'target' keys"
        )

        # Copy sample to GPU
        target = sample["target"]
        if self.use_gpu:
            sample = recursive_copy_to_gpu(sample, non_blocking=True)

        if self.mixup_transform is not None:
            sample = self.mixup_transform(sample)

        with torch.enable_grad():
            # Forward pass
            output = self.model(sample["input"])

            local_loss = self.compute_loss(output, sample)

            loss = local_loss.detach().clone()

            self.losses.append(loss.data.cpu().item() * target.size(0))

            self.update_meters(output, sample)

        # Run backwards pass / update optimizer
        if self.amp_args is not None:
            self.optimizer.zero_grad()
            with apex.amp.scale_loss(
                local_loss, self.optimizer.optimizer
            ) as scaled_loss:
                scaled_loss.backward()
        else:
            self.optimizer.backward(local_loss)

        self.check_inf_nan(loss)

        self.optimizer.update_schedule_on_step(self.where)
        self.optimizer.step()

        self.num_updates += self.get_global_batchsize()

        # Move some data to the task so hooks get a chance to access it
        self.last_batch = LastBatchInfo(
            loss=loss,
            output=output,
            target=target,
            sample=sample,
            step_data={"sample_fetch_time": timer.elapsed_time},
        )
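The apex.amp.scale_loss branch above comes from NVIDIA Apex, the mixed-precision API that predates AMP support in PyTorch itself (the AmpType.PYTORCH path in Example #4). For comparison, here is a minimal sketch of the equivalent backward/step sequence with native torch.cuda.amp; model, compute_loss, optimizer, and scaler are stand-ins, not the task's attributes.

import torch


def native_amp_step(model, compute_loss, optimizer, sample, scaler):
    # scaler is a torch.cuda.amp.GradScaler owned by the caller
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():  # forward pass in mixed precision
        output = model(sample["input"])
        local_loss = compute_loss(output, sample)
    scaler.scale(local_loss).backward()  # scale loss to avoid fp16 underflow
    scaler.step(optimizer)  # unscales grads; skips the step on inf/nan
    scaler.update()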
Example #6
    def __next__(self) -> Any:
        result = None

        with torch.cuda.stream(self.stream):
            if self.cache is not None:
                # Make sure that an ongoing transfer is done
                torch.cuda.current_stream().wait_stream(self.stream)
                result = self.cache
            else:
                result = recursive_copy_to_gpu(next(self._iter))

            # Lookahead and start upload
            try:
                self.cache = recursive_copy_to_gpu(next(self._iter))
            except StopIteration:
                self.cache = None
        assert result is not None

        return result
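This __next__ presupposes an iterator object that owns a side CUDA stream (self.stream), the wrapped iterator (self._iter), and a one-batch lookahead slot (self.cache). A hypothetical host class using those names follows; the constructor is a sketch, not the library's actual wrapper.

from typing import Any, Iterable, Iterator

import torch


class GpuPrefetcher:
    # Hypothetical shell: uploads the next batch on a side stream while the
    # current one is consumed, with __next__ implemented as shown above.
    def __init__(self, loader: Iterable) -> None:
        self._iter: Iterator = iter(loader)
        self.stream = torch.cuda.Stream()  # dedicated copy stream
        self.cache: Any = None  # batch already uploaded one step ahead

    def __iter__(self) -> "GpuPrefetcher":
        return self

The wait_stream call in __next__ is what keeps this safe: the consumer's current stream is made to wait until the copy queued on the side stream has finished before the cached batch is handed out.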
Example #7
    def train_step(self, use_gpu):
        """Train step to be executed in train loop

        Args:
            use_gpu: if true, execute training on GPU
        """

        self.last_batch = None

        # Process next sample
        sample = next(self.get_data_iterator())

        assert isinstance(
            sample, dict) and "input" in sample and "target" in sample, (
                f"Returned sample [{sample}] is not a map with 'input' and" +
                "'target' keys")

        # Copy sample to GPU
        target = sample["target"]
        if use_gpu:
            for key, value in sample.items():
                sample[key] = recursive_copy_to_gpu(value, non_blocking=True)

        with torch.enable_grad():
            # Forward pass
            output = self.model(sample["input"])

            local_loss = self.compute_loss(output, sample)

            loss = local_loss.detach().clone()
            loss = all_reduce_mean(loss)

            self.losses.append(loss.data.cpu().item() * target.size(0))

            self.update_meters(output, sample)

        # Run backwards pass / update optimizer
        if self.amp_opt_level is not None:
            self.optimizer.zero_grad()
            with apex.amp.scale_loss(local_loss,
                                     self.optimizer.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            self.optimizer.backward(local_loss)

        self.optimizer.update_schedule_on_step(self.where)
        self.optimizer.step()

        self.num_updates += self.get_global_batchsize()

        # Move some data to the task so hooks get a chance to access it
        self.last_batch = LastBatchInfo(loss=loss,
                                        output=output,
                                        target=target,
                                        sample=sample)
Example #8
    def preload(self):
        # Get data from the iterator
        try:
            self.cache_next = next(self._iter)

            # Copy to the device, in a parallel CUDA stream
            with torch.cuda.stream(self.stream):
                self.cache = recursive_copy_to_gpu(self.cache_next,
                                                   non_blocking=True)

        except StopIteration:
            self.cache = None
            return
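preload only starts the transfer; whoever consumes self.cache must still synchronize with self.stream before touching the data. A hypothetical consuming __next__ that would pair with it is sketched below (attribute names follow the snippet, but the pairing itself is an assumption).

    def __next__(self) -> Any:
        if self.cache is None:
            # preload() hit StopIteration while fetching the next batch
            raise StopIteration

        # Make the consumer stream wait for the asynchronous copy to finish
        torch.cuda.current_stream().wait_stream(self.stream)
        result = self.cache

        self.preload()  # immediately start uploading the following batch
        return result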
Example #9
    def test_recursive_copy_to_gpu(self):
        tensor_a = get_mock_tensor()
        tensor_b = get_mock_tensor()

        valid_gpu_copy_value = tensor_a
        gpu_value = util.recursive_copy_to_gpu(valid_gpu_copy_value)
        self.assertTrue(gpu_value.is_cuda)

        valid_recursive_copy_value = [[tensor_a]]
        gpu_value = util.recursive_copy_to_gpu(valid_recursive_copy_value)
        self.assertTrue(gpu_value[0][0].is_cuda)

        valid_gpu_copy_collections = [
            (tensor_a, tensor_b),
            [tensor_a, tensor_b],
            {"tensor_a": tensor_a, "tensor_b": tensor_b},
        ]
        for value in valid_gpu_copy_collections:
            gpu_value = util.recursive_copy_to_gpu(value)
            if isinstance(value, dict):
                self.assertTrue(gpu_value["tensor_a"].is_cuda)
                self.assertTrue(gpu_value["tensor_b"].is_cuda)
            else:
                self.assertEqual(len(gpu_value), 2)
                self.assertTrue(gpu_value[0].is_cuda)
                self.assertTrue(gpu_value[1].is_cuda)

        invalid_gpu_copy_depth = [
            ((((tensor_a, tensor_b), tensor_b), tensor_b), tensor_b),
            {"tensor_map_a": {"tensor_map_b": {"tensor_map_c": {"tensor": tensor_a}}}},
            [[[[tensor_a, tensor_b], tensor_b], tensor_b], tensor_b],
        ]
        for value in invalid_gpu_copy_depth:
            with self.assertRaises(ValueError):
                gpu_value = util.recursive_copy_to_gpu(value, max_depth=3)

        value = {"a": "b"}
        self.assertEqual(value, util.recursive_copy_to_gpu(value))
Example #10
    def eval_step(self, use_gpu, local_variables=None):
        if local_variables is None:
            local_variables = {}

        # Process next sample
        sample = next(self.get_data_iterator())
        local_variables["sample"] = sample

        assert (
            isinstance(local_variables["sample"], dict)
            and "input" in local_variables["sample"]
            and "target" in local_variables["sample"]), (
                f"Returned sample [{sample}] is not a map with 'input' and" +
                "'target' keys")

        # Copy sample to GPU
        local_variables["target"] = local_variables["sample"]["target"]
        if use_gpu:
            for key, value in local_variables["sample"].items():
                local_variables["sample"][key] = recursive_copy_to_gpu(
                    value, non_blocking=True)

        with torch.no_grad():
            local_variables["output"] = self.model(
                local_variables["sample"]["input"])

            self.run_hooks(local_variables,
                           ClassyHookFunctions.on_forward.name)

            local_variables["local_loss"] = self.compute_loss(
                local_variables["output"], local_variables["sample"])

            local_variables["loss"] = local_variables["local_loss"].detach(
            ).clone()
            local_variables["loss"] = all_reduce_mean(local_variables["loss"])

            self.losses.append(local_variables["loss"].data.cpu().item() *
                               local_variables["target"].size(0))

            self.update_meters(local_variables["output"],
                               local_variables["sample"])

            self.run_hooks(local_variables,
                           ClassyHookFunctions.on_loss_and_meter.name)
Example #11
    def eval_step(self):
        self.last_batch = None

        # Process next sample
        with Timer() as timer:
            sample = next(self.data_iterator)

        assert isinstance(sample, dict) and "input" in sample and "target" in sample, (
            f"Returned sample [{sample}] is not a map with 'input' and"
            + "'target' keys"
        )

        target = sample["target"]
        if self.use_gpu:
            sample = recursive_copy_to_gpu(sample, non_blocking=True)

        # Optional PyTorch AMP context
        torch_amp_context = (
            torch.cuda.amp.autocast()
            if self.amp_type == AmpType.PYTORCH
            else contextlib.suppress()
        )

        with torch.no_grad(), torch_amp_context:
            output = self.model(sample["input"])

            local_loss = self.compute_loss(output, sample)

            loss = local_loss.detach().clone()

            self.check_inf_nan(loss)

            self.losses.append(loss.data.cpu().item())

            self.update_meters(output, sample)

        # Move some data to the task so hooks get a chance to access it
        self.last_batch = LastBatchInfo(
            loss=loss,
            output=output,
            target=target,
            sample=sample,
            step_data={"sample_fetch_time": timer.elapsed_time},
        )
Example #12
    def eval_step(self):
        self.last_batch = None

        # Process next sample
        with Timer() as timer:
            sample = next(self.get_data_iterator())

        assert isinstance(
            sample, dict) and "input" in sample and "target" in sample, (
                f"Returned sample [{sample}] is not a map with 'input' and" +
                "'target' keys")

        target = sample["target"]
        if self.use_gpu:
            sample = recursive_copy_to_gpu(sample, non_blocking=True)

        with torch.no_grad():
            output = self.model(sample["input"])

            local_loss = self.compute_loss(output, sample)

            loss = local_loss.detach().clone()

            self.check_inf_nan(loss)

            self.losses.append(loss.data.cpu().item() * target.size(0))

            self.update_meters(output, sample)

        # Move some data to the task so hooks get a chance to access it
        self.last_batch = LastBatchInfo(
            loss=loss,
            output=output,
            target=target,
            sample=sample,
            step_data={"sample_fetch_time": timer.elapsed_time},
        )
Example #13
def _get_iterator(data_iter, use_gpu):
    for elem in data_iter:
        if use_gpu:
            elem = recursive_copy_to_gpu(elem, non_blocking=True)

        yield elem["input"]
Example #14
    def __next__(self) -> Any:
        # Get data from the iterator and move to GPU
        # This can raise `StopIteration`
        return recursive_copy_to_gpu(next(self._iter), non_blocking=True)
Example #15
def _get_iterator(cache, use_gpu):
    for elem in cache:
        if use_gpu:
            elem = recursive_copy_to_gpu(elem, non_blocking=True)
        yield elem
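A practical note on the non_blocking=True used throughout these helpers: a host-to-device copy only overlaps with compute when the source tensors live in pinned (page-locked) host memory, which a DataLoader provides via its standard pin_memory flag. A hypothetical wiring (dataset and the batch size are illustrative):

import torch

loader = torch.utils.data.DataLoader(dataset, batch_size=32, pin_memory=True)
# Without pin_memory=True, the non_blocking copies are not actually
# asynchronous and quietly fall back to ordinary blocking transfers.
for elem in _get_iterator(loader, use_gpu=torch.cuda.is_available()):
    ...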
Example #16
    def train_step(self, use_gpu, local_variables=None):
        """Train step to be executed in train loop

        Args:
            use_gpu: if true, execute training on GPU
            local_variables: Dict containing intermediate values
                in train_step for access by hooks
        """
        from classy_vision.hooks import ClassyHookFunctions

        if local_variables is None:
            local_variables = {}

        # Process next sample
        sample = next(self.get_data_iterator())
        local_variables["sample"] = sample

        assert (
            isinstance(local_variables["sample"], dict)
            and "input" in local_variables["sample"]
            and "target" in local_variables["sample"]), (
                f"Returned sample [{sample}] is not a map with 'input' and" +
                "'target' keys")

        # Copy sample to GPU
        local_variables["target"] = local_variables["sample"]["target"]
        if use_gpu:
            for key, value in local_variables["sample"].items():
                local_variables["sample"][key] = recursive_copy_to_gpu(
                    value, non_blocking=True)

        # Only need gradients during training
        context = torch.enable_grad() if self.train else torch.no_grad()
        with context:
            # Forward pass
            local_variables["output"] = self.model(
                local_variables["sample"]["input"])

            self.run_hooks(local_variables,
                           ClassyHookFunctions.on_forward.name)

            local_variables["local_loss"] = self.compute_loss(
                local_variables["output"], local_variables["sample"])

            # NOTE: This performs an all_reduce_mean() on the losses across the
            # replicas.  The reduce should ideally be weighted by the length of
            # the targets on each replica. This will only be an issue when
            # there are dummy samples present (once an epoch) and will only
            # impact the loss reporting (slightly).
            local_variables["loss"] = local_variables["local_loss"].detach(
            ).clone()
            local_variables["loss"] = all_reduce_mean(local_variables["loss"])

            self.losses.append(local_variables["loss"].data.cpu().item() *
                               local_variables["target"].size(0))

            self.update_meters(local_variables["output"],
                               local_variables["sample"])

            # After both loss and meters are updated, we run hooks. Among hooks,
            # `LossLrMeterLoggingHook` will log both loss and meter status
            self.run_hooks(local_variables,
                           ClassyHookFunctions.on_loss_and_meter.name)

        num_samples_in_step = self.get_global_batchsize()
        self.num_samples_this_phase += num_samples_in_step

        # For training phases, run backwards pass / update optimizer
        if self.train:
            if self.amp_opt_level is not None:
                self.optimizer.zero_grad()
                with apex.amp.scale_loss(
                        local_variables["local_loss"],
                        self.optimizer.optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                self.optimizer.backward(local_variables["local_loss"])

            self.optimizer.update_schedule_on_step(self.where)
            self.optimizer.step()

            self.run_hooks(local_variables, ClassyHookFunctions.on_update.name)

            self.num_updates += num_samples_in_step
Example #17
    def train_step(self, use_gpu, local_variables=None):
        """Train step to be executed in train loop

        Args:
            use_gpu: if true, execute training on GPU
            local_variables: Dict containing intermediate values
                in train_step for access by hooks
        """

        if local_variables is None:
            local_variables = {}

        # Process next sample
        sample = next(self.get_data_iterator())
        local_variables["sample"] = sample

        assert (
            isinstance(local_variables["sample"], dict)
            and "input" in local_variables["sample"]
            and "target" in local_variables["sample"]), (
                f"Returned sample [{sample}] is not a map with 'input' and" +
                "'target' keys")

        # Copy sample to GPU
        local_variables["target"] = local_variables["sample"]["target"]
        if use_gpu:
            for key, value in local_variables["sample"].items():
                local_variables["sample"][key] = recursive_copy_to_gpu(
                    value, non_blocking=True)

        with torch.enable_grad():
            # Forward pass
            local_variables["output"] = self.model(
                local_variables["sample"]["input"])

            local_variables["local_loss"] = self.compute_loss(
                local_variables["output"], local_variables["sample"])

            local_variables["loss"] = local_variables["local_loss"].detach(
            ).clone()
            local_variables["loss"] = all_reduce_mean(local_variables["loss"])

            self.losses.append(local_variables["loss"].data.cpu().item() *
                               local_variables["target"].size(0))

            self.update_meters(local_variables["output"],
                               local_variables["sample"])

        # Run backwards pass / update optimizer
        if self.amp_opt_level is not None:
            self.optimizer.zero_grad()
            with apex.amp.scale_loss(local_variables["local_loss"],
                                     self.optimizer.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            self.optimizer.backward(local_variables["local_loss"])

        self.optimizer.update_schedule_on_step(self.where)
        self.optimizer.step()

        self.num_updates += self.get_global_batchsize()
Example #18
    def train_step(self, use_gpu, local_variables=None):
        """Train step to be executed in train loop

        Args:
            use_gpu: if true, execute training on GPU
            local_variables: Dict containing intermediate values
                in train_step for access by hooks
        """
        from classy_vision.hooks import ClassyHookFunctions

        if local_variables is None:
            local_variables = {}

        # We'll time train_step and some of its sections, and accumulate values
        # into perf_stats if it is defined in local_variables:
        perf_stats = local_variables.get("perf_stats", None)
        timer_train_step = PerfTimer("train_step_total", perf_stats)
        timer_train_step.start()

        # Process next sample
        with PerfTimer("read_sample", perf_stats):
            sample = next(self.get_data_iterator())
            local_variables["sample"] = sample

            assert (
                isinstance(local_variables["sample"], dict)
                and "input" in local_variables["sample"]
                and "target" in local_variables["sample"]
            ), (f"Returned sample [{sample}] is not a map with 'input' and" +
                "'target' keys")

        self.run_hooks(local_variables, ClassyHookFunctions.on_sample.name)

        # Copy sample to GPU
        local_variables["target"] = local_variables["sample"]["target"]
        if use_gpu:
            for key, value in local_variables["sample"].items():
                local_variables["sample"][key] = recursive_copy_to_gpu(
                    value, non_blocking=True)

        # Only need gradients during training
        context = torch.enable_grad() if self.train else torch.no_grad()
        with context:
            # Forward pass
            with PerfTimer("forward", perf_stats):
                local_variables["output"] = self.model(
                    local_variables["sample"]["input"])

            self.run_hooks(local_variables,
                           ClassyHookFunctions.on_forward.name)

            model_output = local_variables["output"]
            target = local_variables["sample"]["target"]
            local_variables["local_loss"] = self.loss(model_output, target)

            # NOTE: This performs an all_reduce_mean() on the losses across the
            # replicas.  The reduce should ideally be weighted by the length of
            # the targets on each replica. This will only be an issue when
            # there are dummy samples present (once an epoch) and will only
            # impact the loss reporting (slightly).
            with PerfTimer("loss_allreduce", perf_stats):
                local_variables["loss"] = local_variables["local_loss"].detach(
                ).clone()
                local_variables["loss"] = all_reduce_mean(
                    local_variables["loss"])

            self.losses.append(local_variables["loss"].data.cpu().item() *
                               local_variables["target"].size(0))

            model_output_cpu = model_output.cpu() if use_gpu else model_output

            # Update meters
            with PerfTimer("meters_update", perf_stats):
                for meter in self.meters:
                    meter.update(model_output_cpu,
                                 target.detach().cpu(),
                                 is_train=self.train)
            # After both loss and meters are updated, we run hooks. Among hooks,
            # `LossLrMeterLoggingHook` will log both loss and meter status
            self.run_hooks(local_variables,
                           ClassyHookFunctions.on_loss_and_meter.name)

        num_samples_in_step = self.get_global_batchsize()
        self.num_samples_this_phase += num_samples_in_step

        # For training phases, run backwards pass / update optimizer
        if self.train:
            with PerfTimer("backward", perf_stats):
                self.optimizer.backward(local_variables["local_loss"])

            self.run_hooks(local_variables,
                           ClassyHookFunctions.on_backward.name)

            self.optimizer.update_schedule_on_step(self.where)
            with PerfTimer("optimizer_step", perf_stats):
                self.optimizer.step()

            self.run_hooks(local_variables, ClassyHookFunctions.on_update.name)

            self.num_updates += num_samples_in_step

        timer_train_step.stop()
        timer_train_step.record()