Example #1
0
    def profile(self, device_name, options, executor=None):
        """Profile the network with the given device spec.

        Args:
            device_name: Key into ``device.DEVICES`` selecting the device
                spec to profile against.
            options: Profiler options. NOTE: mutated in place when
                ``executor == 'tensorflow'`` (cudnn heuristics are disabled).
            executor: Optional executor-backed profiler to also run on GPU
                devices: ``'cudnn'``, ``'tensorflow'``, or ``None``.

        Returns:
            A list of tuples, one per layer that was profiled with an
            executor-backed profiler, each of the form:
              (layer_name, flop_based_total_time, executor_total_time,
               executor_std, flops_message, executor_msg)
            ``executor_std`` is currently always 0.
        """
        device_spec = device.DEVICES[device_name]
        logger.info('Profiling for device %s', device_spec.name)

        if executor == 'tensorflow':
            # Disable the cudnn heuristics once up front (loop invariant).
            # Tensorflow requires creating a cuda stream and does not allow
            # multiple context under one process.
            # We cannot use cuda stream because of the python wrapper.
            options.use_cudnn_heuristics = False

        results = []
        for layer_spec in self.graph.topology_order:
            layer = layer_spec.layer_op

            # Always run flop-based profiler.
            flops_profiler = profilers.FlopsProfiler(options, device_spec)
            flop_based_time = flops_profiler.profile(layer)

            logger.info('Layer: %s', layer_spec.name)
            logger.info('- %s: %s  %s', flops_profiler.name, flop_based_time,
                        flops_profiler.message)

            # Executor-backed profilers are only available on GPU devices.
            if device_spec.is_gpu:
                profiler = None
                if executor == 'cudnn':
                    from profilers.cudnn_profiler import CudnnProfiler
                    profiler = CudnnProfiler(options)
                elif executor == 'tensorflow':
                    from profilers.tensorflow_profiler import (
                        TensorFlowProfiler)
                    profiler = TensorFlowProfiler(options)

                if profiler:
                    executor_time = profiler.profile(layer)
                    logger.info('- %s: %s  %s', profiler.name, executor_time,
                                profiler.message)

                    # The fourth field (executor_std) is a placeholder and is
                    # always 0 here.
                    results.append(
                        (layer_spec.name, flop_based_time.total_time,
                         executor_time.total_time, 0, flops_profiler.message,
                         profiler.message))
        return results
Example #2
0
def _profile_for_batch_size(layer_list,
                            direction,
                            device,
                            batch_size,
                            use_only_gemm,
                            ppp_comp,
                            ppp_comm,
                            cross_device_bandwidth=None):
    """Use the flops profiler to estimate execution time under the given spec.

    Args:
        layer_list: Iterable of layer specs to profile, in order.
        direction: Pass direction assigned to ``ProfilerOptions.direction``.
        device: Device spec to profile against.
        batch_size: If truthy, overrides each layer's batch size in place.
        use_only_gemm: If True, disables cudnn heuristics so only GEMM-based
            estimates are used.
        ppp_comp: Computation performance-penalty parameter.
        ppp_comm: Communication performance-penalty parameter.
        cross_device_bandwidth: Optional bandwidth used for cross-device
            communication costs.

    Returns:
        A ``(times, params_in_bytes)`` tuple: the per-layer profiled times
        and the total parameter size in bytes.
    """
    logger.debug('Profile for\n  pass: %s\n  device: %s\n  batch size: %s',
                 direction, device.name, batch_size)

    # The options are identical for every layer, so build them (and the
    # profiler) once instead of per layer.
    # NOTE(review): assumes FlopsProfiler keeps no per-layer state between
    # profile() calls — the original re-created it each iteration while
    # questioning why; confirm against profilers.FlopsProfiler.
    options = profilers.ProfilerOptions()
    options.direction = direction
    options.gradient_wrt = None
    if use_only_gemm:
        options.use_cudnn_heuristics = False
    # FIXME: we don't include bias and activation for simplicity.
    options.include_bias_and_activation = False
    options.ppp_comp = ppp_comp
    options.ppp_comm = ppp_comm
    flops_profiler = profilers.FlopsProfiler(options, device)

    times = []
    params_in_bytes = 0

    # Estimate the time of each layer for the requested pass direction.
    for layer_spec in layer_list:
        layer = layer_spec.layer_op
        if batch_size:
            layer.batch_size = batch_size

        layer_time = flops_profiler.profile(
            layer, layer_spec.device_id,
            [p.device_id for p in layer_spec.parents], cross_device_bandwidth)
        params_in_bytes += layer.weights_in_bytes
        times.append(layer_time)

    return times, params_in_bytes
Example #3
0
def _profile_for_apply_updates(params_in_bytes, device):
    """Estimate the time to apply parameter updates of the given size.

    Args:
        params_in_bytes: Total size of the parameters, in bytes.
        device: Device spec to profile against.

    Returns:
        The flops profiler's estimate for applying the updates.
    """
    default_options = profilers.ProfilerOptions()
    profiler = profilers.FlopsProfiler(default_options, device)
    return profiler.profile_apply_updates(params_in_bytes)