示例#1
0
    def __init__(self, kernel, machine, cores=1):
        """Initialize cache simulation based predictor from kernel and machine object."""
        CachePredictor.__init__(self, kernel, machine, cores)
        # Get the machine's cache model and simulator
        csim = self.machine.get_cachesim(self.cores)

        # FIXME handle multiple datatypes
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        cacheline_size = self.machine['cacheline size']
        elements_per_cacheline = int(cacheline_size // element_size)

        # Gathering some loop information:
        inner_loop = list(self.kernel.get_loop_stack(subs_consts=True))[-1]
        inner_index = symbol_pos_int(inner_loop['index'])
        inner_increment = inner_loop[
            'increment']  # Calculate the number of iterations for warm-up
        max_cache_size = max(
            map(lambda c: c.size(), csim.levels(with_mem=False)))
        max_array_size = max(
            self.kernel.array_sizes(in_bytes=True, subs_consts=True).values())

        # Regular Initialization
        warmup_indices = {
            symbol_pos_int(l['index']):
            ((l['stop'] - l['start']) // l['increment']) // 3
            for l in self.kernel.get_loop_stack(subs_consts=True)
        }
        warmup_iteration_count = self.kernel.indices_to_global_iterator(
            warmup_indices)
        # Make sure we are not handeling gigabytes of data, but 1.5x the maximum cache size
        while warmup_iteration_count * element_size > max_cache_size * 1.5:
            # Decreasing indices (starting from outer), until total size is small enough
            for l in self.kernel.get_loop_stack():
                index = symbol_pos_int(l['index'])
                if warmup_indices[index] > l['start']:
                    warmup_indices[index] -= 1
                    # get offset in iterations and size that a change on this level provokes
                    diff_iterations = (
                        warmup_iteration_count -
                        self.kernel.indices_to_global_iterator(warmup_indices))
                    diff_size = diff_iterations * element_size
                    warmup_indices[index] = max(
                        l['start'],
                        (warmup_iteration_count -
                         (max_cache_size * 1.5) // element_size) // diff_size)
                    break
            warmup_iteration_count = self.kernel.indices_to_global_iterator(
                warmup_indices)
        offsets = []

        if max_array_size < 2 * max_cache_size:
            # Full caching possible, go through all itreration before actual initialization
            offsets = list(
                self.kernel.compile_global_offsets(
                    iteration=range(0, self.kernel.iteration_length())))

        # Align iteration count with cachelines
        # do this by aligning either writes (preferred) or reads
        # Assumption: writes (and reads) increase linearly
        o = list(
            self.kernel.compile_global_offsets(
                iteration=warmup_iteration_count))[0]
        if o[1]:
            # we have a write to work with:
            first_offset = min(o[1])
        else:
            # we use reads
            first_offset = min(o[0])

        # Distance from cacheline boundary (in bytes)
        diff = first_offset - \
            (int(first_offset) >> csim.first_level.cl_bits << csim.first_level.cl_bits)
        warmup_iteration_count -= (diff // element_size) // inner_increment
        warmup_indices = self.kernel.global_iterator_to_indices(
            warmup_iteration_count)

        offsets += list(
            self.kernel.compile_global_offsets(
                iteration=range(0, warmup_iteration_count)))

        # Do the warm-up
        csim.loadstore(offsets, length=element_size)
        # FIXME compile_global_offsets should already expand to element_size

        # Reset stats to conclude warm-up phase
        csim.reset_stats()

        # Benchmark iterations:
        # Starting point is one past the last warmup element
        bench_iteration_start = warmup_iteration_count
        # End point is the end of the current dimension (cacheline alligned)
        first_dim_factor = int(
            (inner_loop['stop'] - warmup_indices[inner_index] - 1) //
            (elements_per_cacheline // inner_increment))
        # If end point is less than one cacheline away, go beyond for 100 cachelines and
        # warn user of potentially inaccurate results
        if first_dim_factor == 0:
            # TODO a nicer solution would be to do less warmup iterations to select a
            # cacheline within a first dimension, if possible
            print(
                'Warning: (automatic) warmup vs benchmark iteration choice was not perfect '
                'and may lead to inaccurate cache miss predictions. This is most likely the '
                'result of too few inner loop iterations ({} from {} to {}).'.
                format(inner_loop['index'], inner_loop['start'],
                       inner_loop['stop']))
            first_dim_factor = 100
        bench_iteration_end = (
            bench_iteration_start +
            elements_per_cacheline * inner_increment * first_dim_factor)

        # compile access needed for one cache-line
        offsets = list(
            self.kernel.compile_global_offsets(
                iteration=range(bench_iteration_start, bench_iteration_end)))
        # simulate
        csim.loadstore(offsets, length=element_size)
        # FIXME compile_global_offsets should already expand to element_size

        # use stats to build results
        self.stats = list(csim.stats())
        self.first_dim_factor = first_dim_factor
示例#2
0
    def __init__(self, kernel, machine, cores=1):
        """Initialize layer condition based predictor from kernel and machine object."""
        CachePredictor.__init__(self, kernel, machine, cores=cores)

        # check that layer conditions can be applied on this kernel:
        # 1. All iterations may only have a step width of 1
        loop_stack = list(self.kernel.get_loop_stack())
        if any([l['increment'] != 1 for l in loop_stack]):
            raise ValueError(
                "Can not apply layer condition, since not all loops are of step "
                "length 1.")

        # 2. The order of iterations must be reflected in the order of indices in all array
        #    references containing the inner loop index. If the inner loop index is not part of the
        #    reference, the reference is simply ignored
        index_order = [symbol_pos_int(l['index']) for l in loop_stack]
        for var_name, arefs in chain(self.kernel.sources.items(),
                                     self.kernel.destinations.items()):
            if next(iter(arefs)) is None:
                # Anything that is a scalar may be ignored
                continue
            for a in [self.kernel.access_to_sympy(var_name, a) for a in arefs]:
                for t in a.expand().as_ordered_terms():
                    # Check each and every term if they are valid according to loop order and array
                    # initialization
                    idx = t.free_symbols.intersection(index_order)

                    # Terms without any indices can be treat as constant offsets and are acceptable
                    if not idx:
                        continue

                    if len(idx) != 1:
                        raise ValueError(
                            "Only one loop counter may appear per term. "
                            "Problematic term: {}.".format(t))
                    else:  # len(idx) == 1
                        idx = idx.pop()
                        # Check that number of multiplication match access order of iterator
                        pow_dict = {
                            k: v
                            for k, v in t.as_powers_dict().items() if k != idx
                        }
                        stride_dim = sum(pow_dict.values())
                        error = False
                        try:
                            if loop_stack[-stride_dim -
                                          1]['index'] != idx.name:
                                error = True
                        except IndexError:
                            error = True
                        if error:
                            raise ValueError(
                                "Number of multiplications in index term does not "
                                "match loop counter order. "
                                "Problematic term: {}.".format(t))

        # 3. Indices may only increase with one
        inner_index = symbol_pos_int(loop_stack[-1]['index'])
        inner_increment = loop_stack[-1]['increment']
        # TODO use a public interface, not self.kernel._*
        for arefs in chain(chain(*self.kernel.sources.values()),
                           chain(*self.kernel.destinations.values())):
            if arefs is None:
                continue
            for expr in arefs:
                diff = expr.subs(inner_index, 1 + inner_increment) - expr.subs(
                    inner_index, 1)
                if diff != 0 and diff != 1:
                    # TODO support -1 aswell
                    raise ValueError(
                        "Can not apply layer condition, array references may not "
                        "increment more then one per iteration.")
        # FIXME handle multiple datatypes
        element_size = self.kernel.datatypes_size[self.kernel.datatype]

        accesses = {}
        destinations = set()
        distances = []
        for var_name in self.kernel.variables:
            # Gather all access to current variable/array
            accesses[var_name] = self.kernel.sources.get(
                var_name,
                set()).union(self.kernel.destinations.get(var_name, set()))
            # Skip non-variable offsets, where acs is [None, None, None] (or similar) or only made
            # up from constant offsets
            if not any(accesses[var_name]) or not any([
                    a == inner_index or a.coeff(inner_index) != 0
                    for a in chain.from_iterable(accesses[var_name])
            ]):
                continue
            destinations.update([
                (var_name, tuple(r))
                for r in self.kernel.destinations.get(var_name, [])
            ])
            acs = accesses[var_name]
            # Transform them into sympy expressions
            acs = [self.kernel.access_to_sympy(var_name, r) for r in acs]
            # Replace constants with their integer counter parts, to make the entries sortable
            acs = [self.kernel.subs_consts(e) for e in acs]
            # Sort accesses by decreasing order
            acs.sort(reverse=True)

            # Create reuse distances by substracting accesses pairwise in decreasing order
            distances += [(acs[i - 1] - acs[i]).simplify()
                          for i in range(1, len(acs))]
            # Add infinity for each array
            distances.append(sympy.oo)

        # Sort distances by decreasing order
        distances.sort(reverse=True)
        # Create copy of distances in bytes:
        distances_bytes = [d * element_size for d in distances]
        # CAREFUL! From here on we are working in byte offsets and not in indices anymore.

        # converting access sets to lists, otherwise pprint will fail during obligatory sorting step
        results = {
            'accesses': {k: list(v)
                         for k, v in accesses.items()},
            'distances': distances,
            'destinations': destinations,
            'distances_bytes': distances_bytes,
            'cache': []
        }

        sum_array_sizes = sum(
            self.kernel.array_sizes(in_bytes=True, subs_consts=True).values())

        for c in self.machine.get_cachesim(self.cores).levels(with_mem=False):
            # Assuming increasing order of cache sizes
            hits = 0
            misses = len(distances_bytes)
            cache_requirement = 0

            tail = 0
            # Test for full caching
            if c.size() > sum_array_sizes:
                hits = misses
                misses = 0
                cache_requirement = sum_array_sizes
            else:
                for tail in sorted(set(distances_bytes), reverse=True):
                    # Assuming decreasing order of tails
                    # Ignoring infinity tail:
                    if tail is sympy.oo:
                        continue
                    cache_requirement = (
                        # Sum of inter-access caches
                        sum([d for d in distances_bytes if d <= tail]) +
                        # Tails
                        tail * len([d for d in distances_bytes if d > tail]))

                    if cache_requirement <= c.size():
                        # If we found a tail that fits into our available cache size
                        # note hits and misses and break
                        hits = len([d for d in distances_bytes if d <= tail])
                        misses = len([d for d in distances_bytes if d > tail])
                        break

            # Resulting analysis for current cache level
            # TODO include loads and stores
            results['cache'].append({
                'name':
                c.name,
                'hits':
                hits,
                'misses':
                misses,
                'evicts':
                len(destinations) if c.size() < sum_array_sizes else 0,
                'requirement':
                cache_requirement,
                'tail':
                tail
            })

        self.results = results
示例#3
0
    def __init__(self, kernel, machine, cores=1):
        """Initialize layer condition based predictor from kernel and machine object."""
        CachePredictor.__init__(self, kernel, machine, cores=cores)

        # check that layer conditions can be applied on this kernel:
        # 1. All iterations may only have a step width of 1
        loop_stack = list(self.kernel.get_loop_stack())
        if any([l['increment'] != 1 for l in loop_stack]):
            raise ValueError("Can not apply layer condition, since not all loops are of step "
                             "length 1.")

        # 2. The order of iterations must be reflected in the order of indices in all array
        #    references containing the inner loop index. If the inner loop index is not part of the
        #    reference, the reference is simply ignored
        index_order = [symbol_pos_int(l['index']) for l in loop_stack]
        for var_name, arefs in chain(self.kernel.sources.items(), self.kernel.destinations.items()):
            if next(iter(arefs)) is None:
                # Anything that is a scalar may be ignored
                continue
            for a in [self.kernel.access_to_sympy(var_name, a) for a in arefs]:
                for t in a.expand().as_ordered_terms():
                    # Check each and every term if they are valid according to loop order and array
                    # initialization
                    idx = t.free_symbols.intersection(index_order)

                    # Terms without any indices can be treat as constant offsets and are acceptable
                    if not idx:
                        continue

                    if len(idx) != 1:
                        raise ValueError("Only one loop counter may appear per term. "
                                         "Problematic term: {}.".format(t))
                    else:  # len(idx) == 1
                        idx = idx.pop()
                        # Check that number of multiplication match access order of iterator
                        pow_dict = {k: v for k, v in t.as_powers_dict().items()
                                    if k != idx}
                        stride_dim = sum(pow_dict.values())
                        error = False
                        try:
                            if loop_stack[-stride_dim-1]['index'] != idx.name:
                                error = True
                        except IndexError:
                            error = True
                        if error:
                            raise ValueError("Number of multiplications in index term does not "
                                             "match loop counter order. "
                                             "Problematic term: {}.".format(t))

        # 3. Indices may only increase with one
        inner_index = symbol_pos_int(loop_stack[-1]['index'])
        inner_increment = loop_stack[-1]['increment']
        # TODO use a public interface, not self.kernel._*
        for arefs in chain(chain(*self.kernel.sources.values()),
                           chain(*self.kernel.destinations.values())):
            if arefs is None:
                continue
            for expr in arefs:
                diff = expr.subs(inner_index, 1+inner_increment) - expr.subs(inner_index, 1)
                if diff != 0 and diff != 1:
                    # TODO support -1 aswell
                    raise ValueError("Can not apply layer condition, array references may not "
                                     "increment more then one per iteration.")
        # FIXME handle multiple datatypes
        element_size = self.kernel.datatypes_size[self.kernel.datatype]

        accesses = {}
        destinations = set()
        distances = []
        for var_name in self.kernel.variables:
            # Gather all access to current variable/array
            accesses[var_name] = self.kernel.sources.get(var_name, set()).union(
                                    self.kernel.destinations.get(var_name, set()))
            # Skip non-variable offsets, where acs is [None, None, None] (or similar) or only made
            # up from constant offsets
            if not any(accesses[var_name]) or not any(
                    [a == inner_index or a.coeff(inner_index) != 0
                     for a in chain.from_iterable(accesses[var_name])]):
                continue
            destinations.update(
                [(var_name, tuple(r)) for r in self.kernel.destinations.get(var_name, [])])
            acs = accesses[var_name]
            # Transform them into sympy expressions
            acs = [self.kernel.access_to_sympy(var_name, r) for r in acs]
            # Replace constants with their integer counter parts, to make the entries sortable
            acs = [self.kernel.subs_consts(e) for e in acs]
            # Sort accesses by decreasing order
            acs.sort(reverse=True)

            # Create reuse distances by substracting accesses pairwise in decreasing order
            distances += [(acs[i-1]-acs[i]).simplify() for i in range(1, len(acs))]
            # Add infinity for each array
            distances.append(sympy.oo)

        # Sort distances by decreasing order
        distances.sort(reverse=True)
        # Create copy of distances in bytes:
        distances_bytes = [d*element_size for d in distances]
        # CAREFUL! From here on we are working in byte offsets and not in indices anymore.

        # converting access sets to lists, otherwise pprint will fail during obligatory sorting step
        results = {'accesses': {k: list(v) for k,v in accesses.items()},
                   'distances': distances,
                   'destinations': destinations,
                   'distances_bytes': distances_bytes,
                   'cache': []}

        sum_array_sizes = sum(self.kernel.array_sizes(in_bytes=True, subs_consts=True).values())

        for c in self.machine.get_cachesim(self.cores).levels(with_mem=False):
            # Assuming increasing order of cache sizes
            hits = 0
            misses = len(distances_bytes)
            cache_requirement = 0

            tail = 0
            # Test for full caching
            if c.size() > sum_array_sizes:
                hits = misses
                misses = 0
                cache_requirement = sum_array_sizes
            else:
                for tail in sorted(set(distances_bytes), reverse=True):
                    # Assuming decreasing order of tails
                    # Ignoring infinity tail:
                    if tail is sympy.oo:
                        continue
                    cache_requirement = (
                        # Sum of inter-access caches
                        sum([d for d in distances_bytes if d <= tail]) +
                        # Tails
                        tail*len([d for d in distances_bytes if d > tail]))

                    if cache_requirement <= c.size():
                        # If we found a tail that fits into our available cache size
                        # note hits and misses and break
                        hits = len([d for d in distances_bytes if d <= tail])
                        misses = len([d for d in distances_bytes if d > tail])
                        break

            # Resulting analysis for current cache level
            # TODO include loads and stores
            results['cache'].append({
                'name': c.name,
                'hits': hits,
                'misses': misses,
                'evicts': len(destinations) if c.size() < sum_array_sizes else 0,
                'requirement': cache_requirement,
                'tail': tail})

        self.results = results
示例#4
0
    def __init__old(self, kernel, machine, cores=1):
        """Initialize cache simulation based predictor from kernel and machine object."""
        CachePredictor.__init__(self, kernel, machine, cores)
        # Get the machine's cache model and simulator
        csim = self.machine.get_cachesim(self.cores)

        # FIXME handle multiple datatypes
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        cacheline_size = self.machine['cacheline size']
        elements_per_cacheline = int(cacheline_size // element_size)

        # Gathering some loop information:
        inner_loop = list(self.kernel.get_loop_stack(subs_consts=True))[-1]
        inner_index = symbol_pos_int(inner_loop['index'])
        inner_increment = inner_loop['increment']  # Calculate the number of iterations for warm-up
        max_cache_size = max(map(lambda c: c.size(), csim.levels(with_mem=False)))
        max_array_size = max(self.kernel.array_sizes(in_bytes=True, subs_consts=True).values())

        # Regular Initialization
        warmup_indices = {
            symbol_pos_int(l['index']): ((l['stop']-l['start'])//l['increment'])//3
            for l in self.kernel.get_loop_stack(subs_consts=True)}
        warmup_iteration_count = self.kernel.indices_to_global_iterator(warmup_indices)
        # Make sure we are not handeling gigabytes of data, but 1.5x the maximum cache size
        while warmup_iteration_count * element_size > max_cache_size*1.5:
            # Decreasing indices (starting from outer), until total size is small enough
            for l in self.kernel.get_loop_stack():
                index = symbol_pos_int(l['index'])
                if warmup_indices[index] > l['start']:
                    warmup_indices[index] -= 1
                    # get offset in iterations and size that a change on this level provokes
                    diff_iterations = (warmup_iteration_count -
                                       self.kernel.indices_to_global_iterator(warmup_indices))
                    diff_size = diff_iterations*element_size
                    warmup_indices[index] = max(
                        l['start'],
                        (warmup_iteration_count - (max_cache_size*1.5) // element_size) // diff_size
                    )
                    break
            warmup_iteration_count = self.kernel.indices_to_global_iterator(warmup_indices)

        # Align iteration count with cachelines
        # do this by aligning either writes (preferred) or reads
        # Assumption: writes (and reads) increase linearly
        o = self.kernel.compile_global_offsets(iteration=warmup_iteration_count)[0]
        if len(o[1]):
            # we have a write to work with:
            first_offset = min(o[1])
        else:
            # we use reads
            first_offset = min(o[0])

        # Distance from cacheline boundary (in bytes)
        diff = first_offset - \
            (int(first_offset) >> csim.first_level.cl_bits << csim.first_level.cl_bits)
        warmup_iteration_count -= (diff//element_size)//inner_increment
        warmup_indices = self.kernel.global_iterator_to_indices(warmup_iteration_count)
        offsets = self.kernel.compile_global_offsets(
            iteration=range(0, warmup_iteration_count))

        if max_array_size < 2*max_cache_size:
            # Full caching possible, go through all itreration before actual initialization
            offsets = np.concatenate((self.kernel.compile_global_offsets(
                                         iteration=range(0, self.kernel.iteration_length())),
                                      offsets))

        # Do the warm-up
        csim.loadstore(offsets, length=element_size)
        # FIXME compile_global_offsets should already expand to element_size

        # Reset stats to conclude warm-up phase
        csim.reset_stats()

        # Benchmark iterations:
        # Starting point is one past the last warmup element
        bench_iteration_start = warmup_iteration_count
        # End point is the end of the current dimension (cacheline alligned)
        first_dim_factor = int((inner_loop['stop'] - warmup_indices[inner_index] - 1)
                               // (elements_per_cacheline // inner_increment))
        # If end point is less than one cacheline away, go beyond for 100 cachelines and
        # warn user of potentially inaccurate results
        if first_dim_factor == 0:
            # TODO a nicer solution would be to do less warmup iterations to select a
            # cacheline within a first dimension, if possible
            print('Warning: (automatic) warmup vs benchmark iteration choice was not perfect '
                  'and may lead to inaccurate cache miss predictions. This is most likely the '
                  'result of too few inner loop iterations ({} from {} to {}).'.format(
                      inner_loop['index'], inner_loop['start'], inner_loop['stop']
                  ))
            first_dim_factor = 100
        bench_iteration_end = (bench_iteration_start +
                               elements_per_cacheline * inner_increment * first_dim_factor)

        # compile access needed for one cache-line
        offsets = self.kernel.compile_global_offsets(
            iteration=range(bench_iteration_start, bench_iteration_end))
        # simulate
        csim.loadstore(offsets, length=element_size)
        # FIXME compile_global_offsets should already expand to element_size

        # use stats to build results
        self.stats = list(csim.stats())
        self.first_dim_factor = first_dim_factor
示例#5
0
    def __init__(self, kernel, machine, cores=1):
        """Initialize cache simulation based predictor from kernel and machine object."""
        CachePredictor.__init__(self, kernel, machine, cores)
        # Get the machine's cache model and simulator
        self.csim = self.machine.get_cachesim(self.cores)

        # FIXME handle multiple datatypes
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        cacheline_size = self.machine['cacheline size']
        elements_per_cacheline = int(cacheline_size // element_size)
        iterations_per_cacheline = (sympy.Integer(self.machine['cacheline size']) /
                                    sympy.Integer(self.kernel.bytes_per_iteration))

        # Gathering some loop information:
        inner_loop = list(self.kernel.get_loop_stack(subs_consts=True))[-1]
        inner_index = symbol_pos_int(inner_loop['index'])
        inner_increment = inner_loop['increment']  # Calculate the number of iterations for warm-up
        total_length = self.kernel.iteration_length()
        max_iterations = self.kernel.subs_consts(total_length)
        max_cache_size = sum([c.size() for c in self.csim.levels(with_mem=False)])


        # Warmup
        # Phase 1:
        # define warmup interval boundaries
        max_steps = 100
        warmup_increment = ceildiv(max_cache_size // element_size, max_steps // 2)
        invalid_entries = self.csim.count_invalid_entries()
        step = 0
        warmup_iteration = 0
        complete_sweep = False
        while invalid_entries > 0 and step < max_steps and not complete_sweep:
            prev_warmup_iteration = warmup_iteration
            warmup_iteration = warmup_iteration + warmup_increment
            if warmup_iteration > max_iterations:
                warmup_iteration = max_iterations
                complete_sweep = True

            # print("warmup_iteration1", warmup_iteration)
            offsets = self.kernel.compile_global_offsets(
                iteration=range(prev_warmup_iteration, warmup_iteration))
            self.csim.loadstore(offsets, length=element_size)
            invalid_entries = self.csim.count_invalid_entries()
            # TODO more intelligent break criteria based on change of invalid entries might be
            #      useful for early termination.
            # print("invalid_entries", invalid_entries)

            step += 1

        # Phase 2:
        # Check that there is enough space left for benchmarking
        if complete_sweep:
            warmup_iteration = 0
        else:
            # Are we far away from max_iterations?
            # print("max_iterations", max_iterations)
            if warmup_iteration > max_iterations - 100000:
                # To close to end, need to complete sweep
                complete_sweep = True
                prev_warmup_iteration = warmup_iteration
                warmup_iteration = max_iterations
                # print("warmup_iteration2", warmup_iteration, end="; ")
                offsets = self.kernel.compile_global_offsets(
                    iteration=range(prev_warmup_iteration, warmup_iteration))
                self.csim.loadstore(offsets, length=element_size)
                warmup_iteration = 0
            if not complete_sweep and invalid_entries > 0:
                print("Warning: Unable to perform complete sweep nor initialize cache completely. "
                      "This might introduce inaccuracies (additional cache misses) in the cache "
                      "prediction.")

        # Phase 3:
        # Iterate to safe handover point
        prev_warmup_iteration = warmup_iteration
        warmup_iteration = self._align_iteration_with_cl_boundary(warmup_iteration, subtract=False)
        if warmup_iteration != prev_warmup_iteration:
            # print("warmup_iteration3", warmup_iteration)
            offsets = self.kernel.compile_global_offsets(
                iteration=range(prev_warmup_iteration, warmup_iteration))
            self.csim.loadstore(offsets, length=element_size)

        # Reset stats to conclude warm-up phase
        self.csim.reset_stats()

        # Benchmark
        bench_iteration = self._align_iteration_with_cl_boundary(min(
            warmup_iteration + 100000, max_iterations - 1))
        # print("bench_iteration", bench_iteration)
        first_dim_factor = float((bench_iteration - warmup_iteration) / iterations_per_cacheline)
        # If end point is less than 100 cacheline away, warn user of inaccuracy
        if first_dim_factor < 1000:
            print("Warning: benchmark iterations are very low ({} CL). This may lead to inaccurate "
                  "cache predictions.".format(first_dim_factor))

        # Compile access needed for one cache-line
        offsets = self.kernel.compile_global_offsets(
            iteration=range(warmup_iteration, bench_iteration))
        # Run cache simulation
        self.csim.loadstore(offsets, length=element_size)
        # FIXME compile_global_offsets should already expand to element_size

        # use stats to build results
        self.stats = list(self.csim.stats())
        self.first_dim_factor = first_dim_factor
示例#6
0
    def __init__(self, kernel, machine, cores=1):
        """Initialize cache simulation based predictor from kernel and machine object."""
        CachePredictor.__init__(self, kernel, machine, cores)
        # Get the machine's cache model and simulator
        self.csim = self.machine.get_cachesim(self.cores)

        # FIXME handle multiple datatypes
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        cacheline_size = self.machine['cacheline size']
        elements_per_cacheline = int(cacheline_size // element_size)

        # Gathering some loop information:
        inner_loop = list(self.kernel.get_loop_stack(subs_consts=True))[-1]
        inner_index = symbol_pos_int(inner_loop['index'])
        inner_increment = inner_loop[
            'increment']  # Calculate the number of iterations for warm-up
        total_length = self.kernel.iteration_length()
        max_iterations = self.kernel.subs_consts(total_length)
        max_cache_size = sum(
            [c.size() for c in self.csim.levels(with_mem=False)])

        # Warmup
        # Phase 1:
        # define warmup interval boundaries
        max_steps = 100
        warmup_increment = ceildiv(max_cache_size // element_size, max_steps)
        invalid_entries = self.csim.count_invalid_entries()
        step = 0
        warmup_iteration = 0
        complete_sweep = False
        while invalid_entries > 0 and step < max_steps and not complete_sweep:
            prev_warmup_iteration = warmup_iteration
            warmup_iteration = warmup_iteration + warmup_increment
            if warmup_iteration > max_iterations:
                warmup_iteration = max_iterations
                complete_sweep = True

            # print("warmup_iteration1", warmup_iteration)
            offsets = self.kernel.compile_global_offsets(
                iteration=range(prev_warmup_iteration, warmup_iteration))
            self.csim.loadstore(offsets, length=element_size)
            invalid_entries = self.csim.count_invalid_entries()
            # TODO more intelligent break criteria based on change of invalid entries might be
            #      useful for early termination.
            # print("invalid_entries", invalid_entries)

            step += 1

        # Phase 2:
        # Check that there is enough space left for benchmarking
        if complete_sweep:
            warmup_iteration = 0
        else:
            # Are we far away from max_iterations?
            # print("max_iterations", max_iterations)
            if warmup_iteration > max_iterations - 100000:
                # To close to end, need to complete sweep
                complete_sweep = True
                prev_warmup_iteration = warmup_iteration
                warmup_iteration = max_iterations
                # print("warmup_iteration2", warmup_iteration, end="; ")
                offsets = self.kernel.compile_global_offsets(
                    iteration=range(prev_warmup_iteration, warmup_iteration))
                self.csim.loadstore(offsets, length=element_size)
                warmup_iteration = 0
            if not complete_sweep and invalid_entries > 0:
                print(
                    "Warning: Unable to perform complete sweep nor initialize cache completely. "
                    "This might introduce inaccuracies (additional cache misses) in the cache "
                    "prediction.")

        # Phase 3:
        # Iterate to safe handover point
        prev_warmup_iteration = warmup_iteration
        warmup_iteration = self._align_iteration_with_cl_boundary(
            warmup_iteration, subtract=False)
        if warmup_iteration != prev_warmup_iteration:
            # print("warmup_iteration3", warmup_iteration)
            offsets = self.kernel.compile_global_offsets(
                iteration=range(prev_warmup_iteration, warmup_iteration))
            self.csim.loadstore(offsets, length=element_size)

        # Reset stats to conclude warm-up phase
        self.csim.reset_stats()

        # Benchmark
        bench_iteration = self._align_iteration_with_cl_boundary(
            min(warmup_iteration + 100000, max_iterations - 1))
        # print("bench_iteration", bench_iteration)
        first_dim_factor = float(
            (bench_iteration - warmup_iteration) / elements_per_cacheline)
        # If end point is less than 100 cacheline away, warn user of inaccuracy
        if first_dim_factor < 1000:
            print(
                "Warning: benchmark iterations are very low ({} CL). This may lead to inaccurate "
                "cache predictions.".format(first_dim_factor))

        # Compile access needed for one cache-line
        offsets = self.kernel.compile_global_offsets(
            iteration=range(warmup_iteration, bench_iteration))
        # Run cache simulation
        self.csim.loadstore(offsets, length=element_size)
        # FIXME compile_global_offsets should already expand to element_size

        # use stats to build results
        self.stats = list(self.csim.stats())
        self.first_dim_factor = first_dim_factor
示例#7
0
    def __init__(self, kernel, machine, cores=1, symbolic=False):
        """Initialize layer condition based predictor from kernel and machine object."""
        CachePredictor.__init__(self, kernel, machine, cores=cores)

        # check that layer conditions can be applied on this kernel:
        # 1. All iterations may only have a step width of 1
        loop_stack = list(self.kernel.get_loop_stack())
        if any([l['increment'] != 1 for l in loop_stack]):
            raise ValueError("Can not apply layer condition, since not all loops are of step "
                             "length 1.")

        # 2. The order of iterations must be reflected in the order of indices in all array
        #    references containing the inner loop index. If the inner loop index is not part of the
        #    reference, the reference is simply ignored
        index_order = [symbol_pos_int(l['index']) for l in loop_stack]
        for var_name, arefs in chain(self.kernel.sources.items(), self.kernel.destinations.items()):
            if next(iter(arefs)) is None:
                # Anything that is a scalar may be ignored
                continue

            for a in [self.kernel.access_to_sympy(var_name, a) for a in arefs]:
                for t in a.expand().as_ordered_terms():
                    # Check each and every term if they are valid according to loop order and array
                    # initialization
                    idx = t.free_symbols.intersection(index_order)

                    # Terms without any indices can be treat as constant offsets and are acceptable
                    if not idx:
                        continue

                    if len(idx) != 1:
                        raise ValueError("Only one loop counter may appear per term. "
                                         "Problematic term: {}.".format(t))
                    else:  # len(idx) == 1
                        idx = idx.pop()
                        # Check that number of multiplication match access order of iterator
                        pow_dict = {k: v for k, v in t.as_powers_dict().items()
                                    if k != idx}
                        stride_dim = sum(pow_dict.values())
                        error = False
                        try:
                            if loop_stack[-stride_dim-1]['index'] != idx.name:
                                error = True
                        except IndexError:
                            error = True
                        if error:
                            raise ValueError("Number of multiplications in index term does not "
                                             "match loop counter order. "
                                             "Problematic term: {}.".format(t))

        # 3. Indices may only increase with one
        inner_index = symbol_pos_int(loop_stack[-1]['index'])
        inner_increment = loop_stack[-1]['increment']
        # TODO use a public interface, not self.kernel._*
        for aref in chain(chain(*self.kernel.sources.values()),
                          chain(*self.kernel.destinations.values())):
            if aref is None:
                continue

            for expr in aref:
                diff = expr.subs(inner_index, 1+inner_increment) - expr.subs(inner_index, 1)
                if diff != 0 and diff != 1:
                    # TODO support -1 aswell
                    raise ValueError("Can not apply layer condition, array references may not "
                                     "increment more then one per iteration.")
        # FIXME handle multiple datatypes
        element_size = self.kernel.datatypes_size[self.kernel.datatype]

        indices = list([symbol_pos_int(l[0]) for l in self.kernel._loop_stack])
        sympy_accesses = self.kernel.compile_sympy_accesses()
        accesses = {}
        destinations = set()
        distances = []
        for var_name in sorted(self.kernel.variables):
            # Gather all access to current variable/array and delinearize accesses
            accesses[var_name] = []
            dimension_factors = None
            for a in sympy_accesses[var_name]:
                o, df = split_sympy_access_in_dim_offset_and_factor(a, indices)
                accesses[var_name].append(o)
                if dimension_factors is None:
                    dimension_factors = df
                elif dimension_factors[-len(indices):] != df[-len(indices):]:
                    raise ValueError("Extracted dimension factors are different within one "
                                     "variable. Must be a bug. {!r} != {!r}".format(
                        df, dimension_factors))
                elif len(dimension_factors) < len(df):
                    dimension_factors = df
            # Skip non-variable offsets, where acs is [None, None, None] (or similar) or only made
            # up from constant offsets
            if not any(accesses[var_name]) or not any(
                    [a == inner_index or a.coeff(inner_index) != 0
                     for a in chain.from_iterable(accesses[var_name])]):
                continue
            destinations.update(
                [(var_name, tuple(r)) for r in self.kernel.destinations.get(var_name, [])])
            acs = list(accesses[var_name])
            # If accesses are of unequal length, pad with leading zero elements
            max_dims = max(map(len, acs))
            for i in range(len(acs)):
                if len(acs[i]) < max_dims:
                    acs[i] = (sympy.Integer(0),)*(max_dims-len(acs[i])) + acs[i]
            # Sort accesses by decreasing order
            acs.sort(reverse=True)
            # Transform back into sympy expressions
            for i in range(len(acs)):
                acs[i] = reduce(sympy.Add, [f*df for f, df in zip(acs[i], dimension_factors)])
            # Create reuse distances by substracting accesses pairwise in decreasing order
            distances += [(acs[i-1]-acs[i]).simplify() for i in range(1, len(acs))]
            # Add infinity for each array
            distances.append(sympy.oo)

        # Sort distances by decreasing order
        distances.sort(reverse=True, key=sympy_expr_abs_distance_key)
        # Create copy of distances in bytes:
        distances_bytes = [d*element_size for d in distances]
        # CAREFUL! From here on we are working in byte offsets and not in indices anymore.

        # converting access sets to lists, otherwise pprint will fail during obligatory sorting step
        results = {'accesses': {k: sorted(list(v), key=cmp_to_key(uneven_tuple_cmp))
                                for k,v in accesses.items()},
                   'distances': distances,
                   'destinations': destinations,
                   'distances_bytes': distances_bytes,
                   'cache': []}

        sum_array_sizes = sum(self.kernel.array_sizes(in_bytes=True, subs_consts=False).values())

        for c in self.machine.get_cachesim(self.cores).levels(with_mem=False):
            # Assuming increasing order of cache sizes
            options = []
            # Full caching
            options.append({
                'condition': (c.size() > sum_array_sizes).simplify().expand(),
                'hits': len(distances),
                'misses': 0,
                'evicts': 0,
                'tail': sympy.oo,
            })

            for tail in sorted(set([d.simplify().expand() for d in distances_bytes]), reverse=True,
                               key=sympy_expr_abs_distance_key):
                # Assuming decreasing order of tails
                # Ignoring infinity tail:
                if tail is sympy.oo:
                    continue
                cache_requirement = (
                    # Sum of inter-access caches
                    sum([d for d in distances_bytes
                         if sympy_expr_abs_distance_key(d) <= sympy_expr_abs_distance_key(tail)]
                        ) +
                    # Tails
                    tail*len([d for d in distances_bytes
                              if sympy_expr_abs_distance_key(d) >
                                 sympy_expr_abs_distance_key(tail)]))
                condition = (cache_requirement <= c.size()).simplify().expand()

                hits = len(
                    [d for d in distances_bytes
                     if sympy_expr_abs_distance_key(d) <= sympy_expr_abs_distance_key(tail)])
                misses = len(
                    [d for d in distances_bytes
                     if sympy_expr_abs_distance_key(d) > sympy_expr_abs_distance_key(tail)])

                # Resulting analysis
                options.append({
                    'condition': condition,
                    'hits': hits,
                    'misses': misses,
                    'evicts': len(destinations),
                    'tail': tail})
                # If we encountered a True condition, break to not include multiple such.
                if isinstance(condition, BooleanTrue):
                    break
            if not isinstance(options[-1]['condition'], BooleanTrue):
                # Fallback: no condition matched
                options.append({
                    'condition': True,
                    'hits': 0,
                    'misses': len(distances),
                    'evicts': len(destinations),
                    'tail': 0
                })

            if symbolic:
                results['cache'].append(options)
            else:
                for o in options:
                    if self.kernel.subs_consts(o['condition']):
                        results['cache'].append(o)
                        break
        self.results = results