class EpisodeFPSHandler:
    def __init__(self):
        self._timer = Timer(average=True)

    def attach(self, engine: Engine):
        self._timer.attach(engine, step=EngineEvents.ITERATION_COMPLETED)
        engine.add_event_handler(EndOfEpisodeHandler.Events.EPISODE_COMPLETED, self)

    def __call__(self, engine: Engine):
        t_val = self._timer.value()
        if engine.state.iteration == 1:
            self._timer.reset()
        else:
            engine.state.metrics['fps'] = 1./t_val
        engine.state.metrics['time_passed'] = t_val * self._timer.step_count
class Frequency(Metric):
    """Provides metrics for the number of examples processed per second.

    Examples:

        .. code-block:: python

            # Compute number of tokens processed
            wps_metric = Frequency(output_transformer=lambda x: x['ntokens'])
            wps_metric.attach(trainer, name='wps')
            # Logging with TQDM
            ProgressBar(persist=True).attach(trainer, metric_names=['wps'])
            # Progress bar will looks like
            # Epoch [2/10]: [12/24]  50%|█████      , wps=400 [00:17<1:23]
    """
    def __init__(self, output_transform=lambda x: x, device=None):
        self._timer = None
        self._acc = None
        self._n = None
        self._elapsed = None
        super(Frequency, self).__init__(output_transform=output_transform,
                                        device=device)

    @reinit__is_reduced
    def reset(self):
        self._timer = Timer()
        self._acc = 0
        self._n = 0
        self._elapsed = 0.0
        super(Frequency, self).reset()

    @reinit__is_reduced
    def update(self, output):
        self._acc += output
        self._n = self._acc
        self._elapsed = torch.tensor(self._timer.value(), device=self._device)

    @sync_all_reduce("_n", "_elapsed")
    def compute(self):
        time_divisor = 1.0

        if dist.is_available() and dist.is_initialized():
            time_divisor *= dist.get_world_size()

        # Returns the average processed objects per second across all workers
        return self._n / self._elapsed.item() * time_divisor

    def completed(self, engine, name):
        engine.state.metrics[name] = int(self.compute())

    def attach(self, engine, name, event_name=Events.ITERATION_COMPLETED):
        engine.add_event_handler(Events.EPOCH_STARTED, self.started)
        engine.add_event_handler(event_name, self.iteration_completed)
        engine.add_event_handler(event_name, self.completed, name)
示例#3
0
    def __init__(self) -> None:
        self._dataflow_timer = Timer()
        self._processing_timer = Timer()
        self._event_handlers_timer = Timer()

        self.dataflow_times = []  # type: List[float]
        self.processing_times = []  # type: List[float]
        self.event_handlers_times = {
        }  # type: Dict[EventEnum, Dict[str, List[float]]]
示例#4
0
class EpisodeFPSHandler:
    FPS_METRIC = 'fps'
    AVG_FPS_METRIC = 'avg_fps'
    TIME_PASSED_METRIC = 'time_passed'

    def __init__(self, fps_mul: float = 1.0, fps_smooth_alpha: float = 0.98):
        self._timer = Timer(average=True)
        self._fps_mul = fps_mul
        self._started_ts = time.time()
        self._fps_smooth_alpha = fps_smooth_alpha

    def attach(self, engine: Engine, manual_step: bool = False):
        self._timer.attach(
            engine, step=None if manual_step else Events.ITERATION_COMPLETED)
        engine.add_event_handler(EpisodeEvents.EPISODE_COMPLETED, self)

    def step(self):
        """
        If manual_step=True on attach(), this method should be used every time we've communicated with environment
        to get proper FPS
        :return:
        """
        self._timer.step()

    def __call__(self, engine: Engine):
        t_val = self._timer.value()
        if engine.state.iteration > 1:
            fps = self._fps_mul / t_val
            avg_fps = engine.state.metrics.get(self.AVG_FPS_METRIC)
            if avg_fps is None:
                avg_fps = fps
            else:
                avg_fps *= self._fps_smooth_alpha
                avg_fps += (1 - self._fps_smooth_alpha) * fps
            engine.state.metrics[self.AVG_FPS_METRIC] = avg_fps
            engine.state.metrics[self.FPS_METRIC] = fps
        engine.state.metrics[
            self.TIME_PASSED_METRIC] = time.time() - self._started_ts
        self._timer.reset()
示例#5
0
    def __init__(self) -> None:
        self._dataflow_timer = Timer()
        self._processing_timer = Timer()
        self._event_handlers_timer = Timer()

        self.dataflow_times = torch.zeros(1)
        self.processing_times = torch.zeros(1)
        self.event_handlers_times = {}  # type: Dict[EventEnum, torch.Tensor]

        self._events = [
            Events.EPOCH_STARTED,
            Events.EPOCH_COMPLETED,
            Events.ITERATION_STARTED,
            Events.ITERATION_COMPLETED,
            Events.GET_BATCH_STARTED,
            Events.GET_BATCH_COMPLETED,
            Events.COMPLETED,
        ]
        self._fmethods = [
            self._as_first_epoch_started,
            self._as_first_epoch_completed,
            self._as_first_iter_started,
            self._as_first_iter_completed,
            self._as_first_get_batch_started,
            self._as_first_get_batch_completed,
            self._as_first_completed,
        ]
        self._lmethods = [
            self._as_last_epoch_started,
            self._as_last_epoch_completed,
            self._as_last_iter_started,
            self._as_last_iter_completed,
            self._as_last_get_batch_started,
            self._as_last_get_batch_completed,
            self._as_last_completed,
        ]
示例#6
0
 def reset(self) -> None:
     self._timer = Timer()
     self._acc = 0
     self._n = 0
     self._elapsed = 0.0
     super(Frequency, self).reset()
示例#7
0
class Frequency(Metric):
    """Provides metrics for the number of examples processed per second.

    Examples:
        .. code-block:: python

            # Compute number of tokens processed
            wps_metric = Frequency(output_transform=lambda x: x['ntokens'])
            wps_metric.attach(trainer, name='wps')
            # Logging with TQDM
            ProgressBar(persist=True).attach(trainer, metric_names=['wps'])
            # Progress bar will look like
            # Epoch [2/10]: [12/24]  50%|█████      , wps=400 [00:17<1:23]


        To compute examples processed per second every 50th iteration:

        .. code-block:: python

            # Compute number of tokens processed
            wps_metric = Frequency(output_transform=lambda x: x['ntokens'])
            wps_metric.attach(trainer, name='wps', event_name=Events.ITERATION_COMPLETED(every=50))
            # Logging with TQDM
            ProgressBar(persist=True).attach(trainer, metric_names=['wps'])
            # Progress bar will look like
            # Epoch [2/10]: [50/100]  50%|█████      , wps=400 [00:17<00:35]
    """

    def __init__(
        self, output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu")
    ) -> None:
        super(Frequency, self).__init__(output_transform=output_transform, device=device)

    @reinit__is_reduced
    def reset(self) -> None:
        self._timer = Timer()
        self._acc = 0
        self._n = 0
        self._elapsed = 0.0
        super(Frequency, self).reset()

    @reinit__is_reduced
    def update(self, output: int) -> None:
        self._acc += output
        self._n = self._acc
        self._elapsed = self._timer.value()

    @sync_all_reduce("_n", "_elapsed")
    def compute(self) -> float:
        time_divisor = 1.0

        if idist.get_world_size() > 1:
            time_divisor *= idist.get_world_size()

        # Returns the average processed objects per second across all workers
        return self._n / self._elapsed * time_divisor

    def completed(self, engine: Engine, name: str) -> None:
        engine.state.metrics[name] = int(self.compute())

    # TODO: see issue https://github.com/pytorch/ignite/issues/1405
    def attach(  # type: ignore
        self, engine: Engine, name: str, event_name: Events = Events.ITERATION_COMPLETED
    ) -> None:
        engine.add_event_handler(Events.EPOCH_STARTED, self.started)
        engine.add_event_handler(Events.ITERATION_COMPLETED, self.iteration_completed)
        engine.add_event_handler(event_name, self.completed, name)
示例#8
0
class HandlersTimeProfiler:
    """
    HandlersTimeProfiler can be used to profile the handlers,
    data loading and data processing times. Custom events are also
    profiled by this profiler

    Examples:

    .. code-block:: python

        from ignite.handlers import HandlersTimeProfiler

        trainer = Engine(train_updater)

        # Create an object of the profiler and attach an engine to it
        profiler = HandlersTimeProfiler()
        profiler.attach(trainer)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_intermediate_results():
            profiler.print_results(profiler.get_results())

        trainer.run(dataloader, max_epochs=3)

        profiler.write_results('path_to_dir/time_profiling.csv')

    .. versionadded:: 0.4.6
    """

    EVENT_FILTER_THESHOLD_TIME = 0.0001

    def __init__(self) -> None:
        self._dataflow_timer = Timer()
        self._processing_timer = Timer()
        self._event_handlers_timer = Timer()

        self.dataflow_times = []  # type: List[float]
        self.processing_times = []  # type: List[float]
        self.event_handlers_times = {
        }  # type: Dict[EventEnum, Dict[str, List[float]]]

    @staticmethod
    def _get_callable_name(handler: Callable) -> str:
        # get name of the callable handler
        return getattr(handler, "__qualname__", handler.__class__.__name__)

    def _create_wrapped_handler(self, handler: Callable,
                                event: EventEnum) -> Callable:
        @functools.wraps(handler)
        def _timeit_handler(*args: Any, **kwargs: Any) -> None:
            self._event_handlers_timer.reset()
            handler(*args, **kwargs)
            t = self._event_handlers_timer.value()
            hname = self._get_callable_name(handler)
            # filter profiled time if the handler was attached to event with event filter
            if not hasattr(handler,
                           "_parent") or t >= self.EVENT_FILTER_THESHOLD_TIME:
                self.event_handlers_times[event][hname].append(t)

        # required to revert back to original handler after profiling
        setattr(_timeit_handler, "_profiler_original", handler)
        return _timeit_handler

    def _timeit_processing(self) -> None:
        # handler used for profiling processing times
        t = self._processing_timer.value()
        self.processing_times.append(t)

    def _timeit_dataflow(self) -> None:
        # handler used for profiling dataflow times
        t = self._dataflow_timer.value()
        self.dataflow_times.append(t)

    def _reset(self, event_handlers_names: Mapping[EventEnum,
                                                   List[str]]) -> None:
        # reset the variables used for profiling
        self.dataflow_times = []
        self.processing_times = []

        self.event_handlers_times = {
            e: {h: []
                for h in event_handlers_names[e]}
            for e in event_handlers_names
        }

    @staticmethod
    def _is_internal_handler(handler: Callable) -> bool:
        # checks whether the handler is internal
        return any(n in repr(handler)
                   for n in ["HandlersTimeProfiler.", "Timer."])

    def _detach_profiler_handlers(self, engine: Engine) -> None:
        # reverts handlers to original handlers
        for e in engine._event_handlers:
            for i, (func, args,
                    kwargs) in enumerate(engine._event_handlers[e]):
                if hasattr(func, "_profiler_original"):
                    engine._event_handlers[e][i] = (func._profiler_original,
                                                    args, kwargs)

    def _as_first_started(self, engine: Engine) -> None:
        # wraps original handlers for profiling

        self.event_handlers_names = {
            e: [
                self._get_callable_name(h)
                for (h, _, _) in engine._event_handlers[e]
                if not self._is_internal_handler(h)
            ]
            for e in engine._allowed_events
        }

        self._reset(self.event_handlers_names)

        for e in engine._allowed_events:
            for i, (func, args,
                    kwargs) in enumerate(engine._event_handlers[e]):
                if not self._is_internal_handler(func):
                    engine._event_handlers[e][i] = (
                        self._create_wrapped_handler(func, e), args, kwargs)

        # processing timer
        engine.add_event_handler(Events.ITERATION_STARTED,
                                 self._processing_timer.reset)
        engine._event_handlers[Events.ITERATION_COMPLETED].insert(
            0, (self._timeit_processing, (), {}))

        # dataflow timer
        engine.add_event_handler(Events.GET_BATCH_STARTED,
                                 self._dataflow_timer.reset)
        engine._event_handlers[Events.GET_BATCH_COMPLETED].insert(
            0, (self._timeit_dataflow, (), {}))

        # revert back the wrapped handlers with original handlers at the end
        engine.add_event_handler(Events.COMPLETED,
                                 self._detach_profiler_handlers)

    def attach(self, engine: Engine) -> None:
        """Attach HandlersTimeProfiler to the given engine.

        Args:
            engine: the instance of Engine to attach
        """
        if not isinstance(engine, Engine):
            raise TypeError(
                f"Argument engine should be ignite.engine.Engine, but given {type(engine)}"
            )

        if not engine.has_event_handler(self._as_first_started):
            engine._event_handlers[Events.STARTED].insert(
                0, (self._as_first_started, (engine, ), {}))

    def get_results(self) -> List[List[Union[str, float]]]:
        """
        Method to fetch the aggregated profiler results after the engine is run

        .. code-block:: python

            results = profiler.get_results()

        """
        total_eh_time = sum([
            sum(self.event_handlers_times[e][h])
            for e in self.event_handlers_times
            for h in self.event_handlers_times[e]
        ])
        total_eh_time = round(
            float(total_eh_time),
            5,
        )

        def compute_basic_stats(
            times: Union[Sequence, torch.Tensor]
        ) -> List[Union[str, float, Tuple[Union[str, float], Union[str,
                                                                   float]]]]:
            data = torch.as_tensor(times, dtype=torch.float32)
            # compute on non-zero data:
            data = data[data > 0]
            total = round(torch.sum(data).item(), 5) if len(
                data) > 0 else "not triggered"  # type: Union[str, float]
            min_index = ("None", "None"
                         )  # type: Tuple[Union[str, float], Union[str, float]]
            max_index = ("None", "None"
                         )  # type: Tuple[Union[str, float], Union[str, float]]
            mean = "None"  # type: Union[str, float]
            std = "None"  # type: Union[str, float]
            if len(data) > 0:
                min_index = (round(torch.min(data).item(),
                                   5), torch.argmin(data).item())
                max_index = (round(torch.max(data).item(),
                                   5), torch.argmax(data).item())
                mean = round(torch.mean(data).item(), 5)
                if len(data) > 1:
                    std = round(torch.std(data).item(), 5)
            return [total, min_index, max_index, mean, std]

        event_handler_stats = [[
            h,
            getattr(e, "name", str(e)),
            *compute_basic_stats(
                torch.tensor(self.event_handlers_times[e][h],
                             dtype=torch.float32)),
        ] for e in self.event_handlers_times
                               for h in self.event_handlers_times[e]]
        event_handler_stats.append(
            ["Total", "", total_eh_time, "", "", "", ""])
        event_handler_stats.append([
            "Processing", "None", *compute_basic_stats(self.processing_times)
        ])
        event_handler_stats.append(
            ["Dataflow", "None", *compute_basic_stats(self.dataflow_times)])

        return event_handler_stats

    def write_results(self, output_path: str) -> None:
        """
        Method to store the unaggregated profiling results to a csv file

        Args:
            output_path: file output path containing a filename

        .. code-block:: python

            profiler.write_results('path_to_dir/awesome_filename.csv')

        Example output:

        .. code-block:: text

            -----------------------------------------------------------------
            # processing_stats dataflow_stats training.<locals>.log_elapsed_time (EPOCH_COMPLETED) ...
            1     0.00003         0.252387                          0.125676
            2     0.00029         0.252342                          0.125123

        """
        try:
            import pandas as pd
        except ImportError:
            raise RuntimeError("Need pandas to write results as files")

        processing_stats = torch.tensor(self.processing_times,
                                        dtype=torch.float32)
        dataflow_stats = torch.tensor(self.dataflow_times, dtype=torch.float32)

        cols = [processing_stats, dataflow_stats]
        headers = ["processing_stats", "dataflow_stats"]
        for e in self.event_handlers_times:
            for h in self.event_handlers_times[e]:
                headers.append(f"{h} ({getattr(e, 'name', str(e))})")
                cols.append(
                    torch.tensor(self.event_handlers_times[e][h],
                                 dtype=torch.float32))
        # Determine maximum length
        max_len = max([x.numel() for x in cols])

        count_col = torch.arange(max_len, dtype=torch.float32) + 1
        cols.insert(0, count_col)
        headers.insert(0, "#")

        # pad all tensors to have same length
        cols = [
            torch.nn.functional.pad(x,
                                    pad=(0, max_len - x.numel()),
                                    mode="constant",
                                    value=0) for x in cols
        ]

        results_dump = torch.stack(
            cols,
            dim=1,
        ).numpy()

        results_df = pd.DataFrame(
            data=results_dump,
            columns=headers,
        )
        results_df.to_csv(output_path, index=False)

    @staticmethod
    def print_results(results: List[List[Union[str, float]]]) -> None:
        """
        Method to print the aggregated results from the profiler

        Args:
            results: the aggregated results from the profiler

        .. code-block:: python

            profiler.print_results(results)

        Example output:

        .. code-block:: text

            -----------------------------------------  -----------------------  -------------- ...
            Handler                                    Event Name                     Total(s)
            -----------------------------------------  -----------------------  --------------
            run.<locals>.log_training_results          EPOCH_COMPLETED                19.43245
            run.<locals>.log_validation_results        EPOCH_COMPLETED                 2.55271
            run.<locals>.log_time                      EPOCH_COMPLETED                 0.00049
            run.<locals>.log_intermediate_results      EPOCH_COMPLETED                 0.00106
            run.<locals>.log_training_loss             ITERATION_COMPLETED               0.059
            run.<locals>.log_time                      COMPLETED                 not triggered
            -----------------------------------------  -----------------------  --------------
            Total                                                                     22.04571
            -----------------------------------------  -----------------------  --------------
            Processing took total 11.29543s [min/index: 0.00393s/1875, max/index: 0.00784s/0,
             mean: 0.00602s, std: 0.00034s]
            Dataflow took total 16.24365s [min/index: 0.00533s/1874, max/index: 0.01129s/937,
             mean: 0.00866s, std: 0.00113s]

        """
        # adopted implementation of torch.autograd.profiler.build_table
        handler_column_width = max([len(item[0]) for item in results
                                    ]) + 4  # type: ignore[arg-type]
        event_column_width = max([len(item[1]) for item in results
                                  ]) + 4  # type: ignore[arg-type]

        DEFAULT_COLUMN_WIDTH = 14

        headers = [
            "Handler",
            "Event Name",
            "Total(s)",
            "Min(s)/IDX",
            "Max(s)/IDX",
            "Mean(s)",
            "Std(s)",
        ]

        # Have to use a list because nonlocal is Py3 only...
        SPACING_SIZE = 2
        row_format_lst = [""]
        header_sep_lst = [""]
        line_length_lst = [-SPACING_SIZE]

        def add_column(padding: int, text_dir: str = ">") -> None:
            row_format_lst[0] += "{: " + text_dir + str(padding) + "}" + (
                " " * SPACING_SIZE)
            header_sep_lst[0] += "-" * padding + (" " * SPACING_SIZE)
            line_length_lst[0] += padding + SPACING_SIZE

        add_column(handler_column_width, text_dir="<")
        add_column(event_column_width, text_dir="<")
        for _ in headers[2:]:
            add_column(DEFAULT_COLUMN_WIDTH)

        row_format = row_format_lst[0]
        header_sep = header_sep_lst[0]

        result = []

        def append(s: str) -> None:
            result.append(s)
            result.append("\n")

        result.append("\n")
        append(header_sep)
        append(row_format.format(*headers))
        append(header_sep)

        for row in results[:-3]:
            # format min/idx and max/idx
            row[3] = "{}/{}".format(*row[3])  # type: ignore[misc]
            row[4] = "{}/{}".format(*row[4])  # type: ignore[misc]

            append(row_format.format(*row))

        append(header_sep)
        # print total handlers time row
        append(row_format.format(*results[-3]))
        append(header_sep)

        summary_format = "{} took total {}s [min/index: {}, max/index: {}, mean: {}s, std: {}s]"
        for row in results[-2:]:
            row[3] = "{}s/{}".format(*row[3])  # type: ignore[misc]
            row[4] = "{}s/{}".format(*row[4])  # type: ignore[misc]
            del row[1]
            append(summary_format.format(*row))
        print("".join(result))
示例#9
0
class BasicTimeProfiler:
    """
    BasicTimeProfiler can be used to profile the handlers,
    events, data loading and data processing times.

    Examples:

    .. code-block:: python

        from ignite.handlers import BasicTimeProfiler

        trainer = Engine(train_updater)

        # Create an object of the profiler and attach an engine to it
        profiler = BasicTimeProfiler()
        profiler.attach(trainer)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_intermediate_results():
            profiler.print_results(profiler.get_results())

        trainer.run(dataloader, max_epochs=3)

        profiler.write_results('path_to_dir/time_profiling.csv')

    .. versionadded:: 0.4.6
    """

    events_to_ignore = [
        Events.EXCEPTION_RAISED,
        Events.TERMINATE,
        Events.TERMINATE_SINGLE_EPOCH,
        Events.DATALOADER_STOP_ITERATION,
    ]

    def __init__(self) -> None:
        self._dataflow_timer = Timer()
        self._processing_timer = Timer()
        self._event_handlers_timer = Timer()

        self.dataflow_times = torch.zeros(1)
        self.processing_times = torch.zeros(1)
        self.event_handlers_times = {}  # type: Dict[EventEnum, torch.Tensor]

        self._events = [
            Events.EPOCH_STARTED,
            Events.EPOCH_COMPLETED,
            Events.ITERATION_STARTED,
            Events.ITERATION_COMPLETED,
            Events.GET_BATCH_STARTED,
            Events.GET_BATCH_COMPLETED,
            Events.COMPLETED,
        ]
        self._fmethods = [
            self._as_first_epoch_started,
            self._as_first_epoch_completed,
            self._as_first_iter_started,
            self._as_first_iter_completed,
            self._as_first_get_batch_started,
            self._as_first_get_batch_completed,
            self._as_first_completed,
        ]
        self._lmethods = [
            self._as_last_epoch_started,
            self._as_last_epoch_completed,
            self._as_last_iter_started,
            self._as_last_iter_completed,
            self._as_last_get_batch_started,
            self._as_last_get_batch_completed,
            self._as_last_completed,
        ]

    def _reset(self, num_epochs: int, total_num_iters: int) -> None:
        self.dataflow_times = torch.zeros(total_num_iters)
        self.processing_times = torch.zeros(total_num_iters)
        self.event_handlers_times = {
            Events.STARTED: torch.zeros(1),
            Events.COMPLETED: torch.zeros(1),
            Events.EPOCH_STARTED: torch.zeros(num_epochs),
            Events.EPOCH_COMPLETED: torch.zeros(num_epochs),
            Events.ITERATION_STARTED: torch.zeros(total_num_iters),
            Events.ITERATION_COMPLETED: torch.zeros(total_num_iters),
            Events.GET_BATCH_COMPLETED: torch.zeros(total_num_iters),
            Events.GET_BATCH_STARTED: torch.zeros(total_num_iters),
        }

    def _as_first_started(self, engine: Engine) -> None:
        if hasattr(engine.state.dataloader, "__len__"):
            num_iters_per_epoch = len(
                engine.state.dataloader)  # type: ignore[arg-type]
        else:
            if engine.state.epoch_length is None:
                raise ValueError(
                    "As epoch_length is not set, we can not use BasicTimeProfiler in this case."
                    "Please, set trainer.run(..., epoch_length=epoch_length) in order to fix this."
                )
            num_iters_per_epoch = engine.state.epoch_length

        self.max_epochs = cast(int, engine.state.max_epochs)
        self.total_num_iters = self.max_epochs * num_iters_per_epoch
        self._reset(self.max_epochs, self.total_num_iters)

        self.event_handlers_names = {
            e: [
                h.__qualname__
                if hasattr(h, "__qualname__") else h.__class__.__name__
                for (h, _, _) in engine._event_handlers[e]
                if "BasicTimeProfiler." not in repr(
                    h)  # avoid adding internal handlers into output
            ]
            for e in Events if e not in self.events_to_ignore
        }

        # Setup all other handlers:
        engine._event_handlers[Events.STARTED].append(
            (self._as_last_started, (engine, ), {}))

        for e, m in zip(self._events, self._fmethods):
            engine._event_handlers[e].insert(0, (m, (engine, ), {}))

        for e, m in zip(self._events, self._lmethods):
            engine._event_handlers[e].append((m, (engine, ), {}))

        # Let's go
        self._event_handlers_timer.reset()

    def _as_last_started(self, engine: Engine) -> None:
        self.event_handlers_times[
            Events.STARTED][0] = self._event_handlers_timer.value()

    def _as_first_epoch_started(self, engine: Engine) -> None:
        self._event_handlers_timer.reset()

    def _as_last_epoch_started(self, engine: Engine) -> None:
        t = self._event_handlers_timer.value()
        e = engine.state.epoch - 1
        self.event_handlers_times[Events.EPOCH_STARTED][e] = t

    def _as_first_get_batch_started(self, engine: Engine) -> None:
        self._event_handlers_timer.reset()
        self._dataflow_timer.reset()

    def _as_last_get_batch_started(self, engine: Engine) -> None:
        t = self._event_handlers_timer.value()
        i = engine.state.iteration - 1
        self.event_handlers_times[Events.GET_BATCH_STARTED][i] = t

    def _as_first_get_batch_completed(self, engine: Engine) -> None:
        self._event_handlers_timer.reset()

    def _as_last_get_batch_completed(self, engine: Engine) -> None:
        t = self._event_handlers_timer.value()
        i = engine.state.iteration - 1
        self.event_handlers_times[Events.GET_BATCH_COMPLETED][i] = t

        d = self._dataflow_timer.value()
        self.dataflow_times[i] = d

        self._dataflow_timer.reset()

    def _as_first_iter_started(self, engine: Engine) -> None:
        self._event_handlers_timer.reset()

    def _as_last_iter_started(self, engine: Engine) -> None:
        t = self._event_handlers_timer.value()
        i = engine.state.iteration - 1
        self.event_handlers_times[Events.ITERATION_STARTED][i] = t

        self._processing_timer.reset()

    def _as_first_iter_completed(self, engine: Engine) -> None:
        t = self._processing_timer.value()
        i = engine.state.iteration - 1
        self.processing_times[i] = t

        self._event_handlers_timer.reset()

    def _as_last_iter_completed(self, engine: Engine) -> None:
        t = self._event_handlers_timer.value()
        i = engine.state.iteration - 1
        self.event_handlers_times[Events.ITERATION_COMPLETED][i] = t

    def _as_first_epoch_completed(self, engine: Engine) -> None:
        self._event_handlers_timer.reset()

    def _as_last_epoch_completed(self, engine: Engine) -> None:
        t = self._event_handlers_timer.value()
        e = engine.state.epoch - 1
        self.event_handlers_times[Events.EPOCH_COMPLETED][e] = t

    def _as_first_completed(self, engine: Engine) -> None:
        self._event_handlers_timer.reset()

    def _as_last_completed(self, engine: Engine) -> None:
        self.event_handlers_times[
            Events.COMPLETED][0] = self._event_handlers_timer.value()

        # Remove added handlers:
        engine.remove_event_handler(self._as_last_started, Events.STARTED)

        for e, m in zip(self._events, self._fmethods):
            engine.remove_event_handler(m, e)

        for e, m in zip(self._events, self._lmethods):
            engine.remove_event_handler(m, e)

    def attach(self, engine: Engine) -> None:
        """Attach BasicTimeProfiler to the given engine.

        Args:
            engine: the instance of Engine to attach
        """
        if not isinstance(engine, Engine):
            raise TypeError(
                f"Argument engine should be ignite.engine.Engine, but given {type(engine)}"
            )

        if not engine.has_event_handler(self._as_first_started):
            engine._event_handlers[Events.STARTED].insert(
                0, (self._as_first_started, (engine, ), {}))

    @staticmethod
    def _compute_basic_stats(
        data: torch.Tensor
    ) -> Dict[str, Union[str, float, Tuple[Union[float], Union[float]]]]:
        # compute on non-zero data:
        data = data[data > 0]
        out = [
            ("total",
             torch.sum(data).item() if len(data) > 0 else "not yet triggered")
        ]  # type: List[Tuple[str, Union[str, float, Tuple[Union[float], Union[float]]]]]
        if len(data) > 1:
            out += [
                ("min/index", (torch.min(data).item(),
                               torch.argmin(data).item())),
                ("max/index", (torch.max(data).item(),
                               torch.argmax(data).item())),
                ("mean", torch.mean(data).item()),
                ("std", torch.std(data).item()),
            ]
        return OrderedDict(out)

    def get_results(self) -> Dict[str, Dict[str, Any]]:
        """
        Method to fetch the aggregated profiler results after the engine is run

        .. code-block:: python

            results = profiler.get_results()

        """
        total_eh_time = sum([(self.event_handlers_times[e]).sum()
                             for e in Events if e not in self.events_to_ignore
                             ])  # type: Union[int, torch.Tensor]
        event_handlers_stats = dict(
            [(str(e.name).replace(".", "_"),
              self._compute_basic_stats(self.event_handlers_times[e]))
             for e in Events if e not in self.events_to_ignore] +
            [("total_time", total_eh_time)]  # type: ignore[list-item]
        )

        return OrderedDict([
            ("processing_stats",
             self._compute_basic_stats(self.processing_times)),
            ("dataflow_stats", self._compute_basic_stats(self.dataflow_times)),
            (
                "event_handlers_stats",
                event_handlers_stats,
            ),
            (
                "event_handlers_names",
                {
                    str(e.name).replace(".", "_") + "_names": v
                    for e, v in self.event_handlers_names.items()
                },
            ),
        ])

    def write_results(self, output_path: str) -> None:
        """
        Method to store the unaggregated profiling results to a csv file

        Args:
            output_path: file output path containing a filename

        .. code-block:: python

            profiler.write_results('path_to_dir/awesome_filename.csv')

        Example output:

        .. code-block:: text

            -----------------------------------------------------------------
            epoch iteration processing_stats dataflow_stats Event_STARTED ...
            1.0     1.0        0.00003         0.252387        0.125676
            1.0     2.0        0.00029         0.252342        0.125123

        """
        try:
            import pandas as pd
        except ImportError:
            raise RuntimeError("Need pandas to write results as files")

        iters_per_epoch = self.total_num_iters // self.max_epochs

        epochs = torch.arange(
            self.max_epochs,
            dtype=torch.float32).repeat_interleave(iters_per_epoch) + 1
        iterations = torch.arange(self.total_num_iters,
                                  dtype=torch.float32) + 1
        processing_stats = self.processing_times
        dataflow_stats = self.dataflow_times

        event_started = self.event_handlers_times[
            Events.STARTED].repeat_interleave(self.total_num_iters)
        event_completed = self.event_handlers_times[
            Events.COMPLETED].repeat_interleave(self.total_num_iters)
        event_epoch_started = self.event_handlers_times[
            Events.EPOCH_STARTED].repeat_interleave(iters_per_epoch)
        event_epoch_completed = self.event_handlers_times[
            Events.EPOCH_COMPLETED].repeat_interleave(iters_per_epoch)

        event_iter_started = self.event_handlers_times[
            Events.ITERATION_STARTED]
        event_iter_completed = self.event_handlers_times[
            Events.ITERATION_COMPLETED]
        event_batch_started = self.event_handlers_times[
            Events.GET_BATCH_STARTED]
        event_batch_completed = self.event_handlers_times[
            Events.GET_BATCH_COMPLETED]

        results_dump = torch.stack(
            [
                epochs,
                iterations,
                processing_stats,
                dataflow_stats,
                event_started,
                event_completed,
                event_epoch_started,
                event_epoch_completed,
                event_iter_started,
                event_iter_completed,
                event_batch_started,
                event_batch_completed,
            ],
            dim=1,
        ).numpy()

        results_df = pd.DataFrame(
            data=results_dump,
            columns=[
                "epoch",
                "iteration",
                "processing_stats",
                "dataflow_stats",
                "Event_STARTED",
                "Event_COMPLETED",
                "Event_EPOCH_STARTED",
                "Event_EPOCH_COMPLETED",
                "Event_ITERATION_STARTED",
                "Event_ITERATION_COMPLETED",
                "Event_GET_BATCH_STARTED",
                "Event_GET_BATCH_COMPLETED",
            ],
        )
        results_df.to_csv(output_path, index=False)

    @staticmethod
    def print_results(results: Dict) -> str:
        """
        Method to print the aggregated results from the profiler

        Args:
            results: the aggregated results from the profiler

        .. code-block:: python

            profiler.print_results(results)

        Example output:

        .. code-block:: text

             ----------------------------------------------------
            | Time profiling stats (in seconds):                 |
             ----------------------------------------------------
            total  |  min/index  |  max/index  |  mean  |  std

            Processing function:
            157.46292 | 0.01452/1501 | 0.26905/0 | 0.07730 | 0.01258

            Dataflow:
            6.11384 | 0.00008/1935 | 0.28461/1551 | 0.00300 | 0.02693

            Event handlers:
            2.82721

            - Events.STARTED: []
            0.00000

            - Events.EPOCH_STARTED: []
            0.00006 | 0.00000/0 | 0.00000/17 | 0.00000 | 0.00000

            - Events.ITERATION_STARTED: ['PiecewiseLinear']
            0.03482 | 0.00001/188 | 0.00018/679 | 0.00002 | 0.00001

            - Events.ITERATION_COMPLETED: ['TerminateOnNan']
            0.20037 | 0.00006/866 | 0.00089/1943 | 0.00010 | 0.00003

            - Events.EPOCH_COMPLETED: ['empty_cuda_cache', 'training.<locals>.log_elapsed_time', ]
            2.57860 | 0.11529/0 | 0.14977/13 | 0.12893 | 0.00790

            - Events.COMPLETED: []
            not yet triggered

        """
        def to_str(v: Union[str, tuple]) -> str:
            if isinstance(v, str):
                return v
            elif isinstance(v, tuple):
                return f"{v[0]:.5f}/{v[1]}"
            return f"{v:.5f}"

        def odict_to_str(d: Mapping) -> str:
            out = " | ".join([to_str(v) for v in d.values()])
            return out

        others = {
            k: odict_to_str(v) if isinstance(v, OrderedDict) else v
            for k, v in results["event_handlers_stats"].items()
        }

        others.update(results["event_handlers_names"])

        output_message = """
 ----------------------------------------------------
| Time profiling stats (in seconds):                 |
 ----------------------------------------------------
total  |  min/index  |  max/index  |  mean  |  std

Processing function:
{processing_stats}

Dataflow:
{dataflow_stats}

Event handlers:
{total_time:.5f}

- Events.STARTED: {STARTED_names}
{STARTED}

- Events.EPOCH_STARTED: {EPOCH_STARTED_names}
{EPOCH_STARTED}

- Events.ITERATION_STARTED: {ITERATION_STARTED_names}
{ITERATION_STARTED}

- Events.ITERATION_COMPLETED: {ITERATION_COMPLETED_names}
{ITERATION_COMPLETED}

- Events.EPOCH_COMPLETED: {EPOCH_COMPLETED_names}
{EPOCH_COMPLETED}

- Events.COMPLETED: {COMPLETED_names}
{COMPLETED}
""".format(
            processing_stats=odict_to_str(results["processing_stats"]),
            dataflow_stats=odict_to_str(results["dataflow_stats"]),
            **others,
        )
        print(output_message)
        return output_message
示例#10
0
 def __init__(self, fps_mul: float = 1.0, fps_smooth_alpha: float = 0.98):
     self._timer = Timer(average=True)
     self._fps_mul = fps_mul
     self._started_ts = time.time()
     self._fps_smooth_alpha = fps_smooth_alpha
示例#11
0
文件: train.py 项目: bouthilx/dpd
def train(data,
          model,
          optimizer,
          model_seed=1,
          sampler_seed=1,
          max_epochs=120,
          patience=None,
          stopping_rule=None,
          compute_test_error_rates=False,
          loading_file_path=None,
          callback=None):

    # Checkpointing file path is named based on Mahler task ID
    checkpointing_file_path = get_checkpoint_file_path()

    if loading_file_path is None:
        loading_file_path = checkpointing_file_path
    # Else, we are branching from another configuration.

    print("\n\nLoading file path:")
    print(loading_file_path)

    print("\n\nCheckpointing file path:")
    print(checkpointing_file_path)
    print("\n\n")

    dataset, model, optimizer, lr_scheduler, device, seeds = build_experiment(
        data=data,
        model=model,
        optimizer=optimizer,
        model_seed=model_seed,
        sampler_seed=sampler_seed)

    if lr_scheduler is None and patience is None:
        patience = 20
    elif patience is None:
        patience = lr_scheduler.patience * 2

    print("\n\nMax epochs: {}\n\n".format(max_epochs))

    print("\n\nEarly stopping with patience: {}\n\n".format(patience))

    print('Building timers, training and evaluation loops...')
    timer = Timer(average=True)

    print('    Stopping timer')
    stopping_timer = Timer(average=True)

    print('    Training loop')
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        torch.nn.functional.cross_entropy,
                                        device=device)

    print('    Evaluator loop')
    evaluators, early_stopping = build_evaluators(trainer, model, device,
                                                  patience,
                                                  compute_test_error_rates)

    print('    Set timer events')
    timer.attach(trainer, start=Events.STARTED, step=Events.EPOCH_COMPLETED)

    print('    Metric logger')
    metric_logger = Logger()
    print('Done')

    all_stats = []
    best_stats = {}

    @trainer.on(Events.STARTED)
    def trainer_load_checkpoint(engine):
        engine.state.last_checkpoint = datetime.utcnow()
        metadata = load_checkpoint(loading_file_path, model, optimizer,
                                   lr_scheduler)
        if metadata:
            print('Resuming from epoch {}'.format(metadata['epoch']))
            print('Optimizer:')
            print('    lr:', optimizer.param_groups[0]['lr'])
            print('    momentum:', optimizer.param_groups[0]['momentum'])
            print('    weight decay:',
                  optimizer.param_groups[0]['weight_decay'])

            print('LR schedule:')
            print('    best:', lr_scheduler.best)
            print('    num_bad_epochs:', lr_scheduler.num_bad_epochs)
            print('    cooldown:', lr_scheduler.cooldown)

            engine.state.epoch = metadata['epoch']
            engine.state.iteration = metadata['iteration']
            for epoch_stats in metadata['all_stats']:
                tmp = engine.state.metrics
                engine.state.metrics = epoch_stats['valid']
                early_stopping(engine)
                engine.state.metrics = tmp

                all_stats.append(epoch_stats)
                if (not best_stats
                        or (epoch_stats['valid']['error_rate']['mean'] <
                            best_stats['valid']['error_rate']['mean'])):
                    best_stats.update(epoch_stats)

            print('Early stopping:')
            print('    best_score:', early_stopping.best_score)
            print('    counter:', early_stopping.counter)
        else:
            engine.state.epoch = 0
            engine.state.iteration = 0
            engine.state.output = 0.0
            # trainer_save_checkpoint(engine)

    @trainer.on(Events.EPOCH_STARTED)
    def trainer_seeding(engine):
        print(seeds['sampler'] + engine.state.epoch)
        seed(int(seeds['sampler'] + engine.state.epoch))
        model.train()

    @trainer.on(Events.EPOCH_COMPLETED)
    def trainer_save_checkpoint(engine):
        model.eval()

        stats = dict(epoch=engine.state.epoch)

        for name in ['valid', 'train', 'test']:
            evaluator = evaluators.get(name, None)
            if evaluator is None:
                continue

            loader = dataset[name]
            metrics = evaluator.run(loader).metrics
            stats[name] = dict(loss=metrics['nll'],
                               error_rate=metrics['error_rate'])

        print('Early stopping')
        print('{}   {} < {}'.format(early_stopping.best_score,
                                    early_stopping.counter,
                                    early_stopping.patience))

        current_v_error_rate = stats['valid']['error_rate']['mean']
        best_v_error_rate = best_stats.get('valid',
                                           {}).get('error_rate',
                                                   {}).get('mean', 100)

        if lr_scheduler:
            lr_scheduler.step(current_v_error_rate)
            print('Lr schedule')
            print('{}   last_epoch: {} bads: {} cooldown: {}'.format(
                lr_scheduler.best, lr_scheduler.last_epoch,
                lr_scheduler.num_bad_epochs, lr_scheduler.cooldown_counter))

        if not best_stats or current_v_error_rate < best_v_error_rate:
            best_stats.update(stats)

        # TODO: load all tasks with the same tags in mahler, compute the error_rate at that point
        #       (compare median of best error_rates up to that point vs this best_stats
        #       if below median, suspend
        #       maybe, interrupt and increase priority, or not... Because we would need to wait for
        #       it to completed anyway
        #       Grace period? Like 60 epochs? :/
        #       Or reduce quantile as time grows (stop worst 95th quantile at 10 epochs, 50th at
        #       100, 75th at 150 and so on...) Meh to much novelty.
        #       min trials at that point?
        #       or interrupt after each 10/20 epochs, so that number of trials is quickly high
        #       but that means we need a way to log results during execution, not just output.

        print(("Epoch {:>4} Iteration {:>12} Loss {:>8.3f} "
               "Best-Valid-ER {:>8.4f} Time {:>8.3f}").format(
                   engine.state.epoch, engine.state.iteration,
                   engine.state.output, best_v_error_rate, timer.value()))

        metric_logger.add_metric(stats)

        all_stats.append(stats)

        # TODO: Checkpoint lr_scheduler as well
        if (datetime.utcnow() -
                engine.state.last_checkpoint).total_seconds() > TIME_BUFFER:
            print('Checkpointing epoch {}'.format(engine.state.epoch))
            save_checkpoint(checkpointing_file_path,
                            model,
                            optimizer,
                            lr_scheduler,
                            epoch=engine.state.epoch,
                            iteration=engine.state.iteration,
                            all_stats=all_stats)
            engine.state.last_checkpoint = datetime.utcnow()

        if callback:
            callback(step=engine.state.epoch,
                     objective=stats['valid']['error_rate']['mean'],
                     finished=False)

    print("Training")
    trainer.run(dataset['train'], max_epochs=max_epochs)

    metric_logger.close()

    # Remove checkpoint to avoid cluttering the FS.
    clear_checkpoint(checkpointing_file_path)

    if callback:
        callback(step=max_epochs,
                 objective=all_stats[-1]['valid']['error_rate']['mean'],
                 finished=True)

    return {'best': best_stats, 'all': tuple(all_stats)}
 def __init__(self):
     self._timer = Timer(average=True)