def _factorize(self, cluster, **kwargs):
    """
    Collect terms in each expression of the cluster based on the following
    heuristic:

        * Collect all literals;
        * Collect all temporaries produced by CSE;
        * If the expression has an operation count higher than
          ``self.thresholds['min-cost-factorize']``, then collection is applied
          recursively until no more factorization opportunities are available.
    """
    processed = []
    for expr in cluster.exprs:
        handle = collect_nested(expr)
        cost_handle = estimate_cost(handle)

        if cost_handle >= self.thresholds['min-cost-factorize']:
            handle_prev = handle
            cost_prev = estimate_cost(expr)
            while cost_handle < cost_prev:
                handle_prev, handle = handle, collect_nested(handle)
                cost_prev, cost_handle = cost_handle, estimate_cost(handle)
            cost_handle, handle = cost_prev, handle_prev

        processed.append(handle)

    return cluster.rebuild(processed)
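# A minimal, self-contained sketch of the collect-based heuristic above. It uses
# plain sympy.collect as a stand-in for the Devito-internal collect_nested and
# sympy.count_ops in place of estimate_cost; all symbol names are illustrative.
from sympy import symbols, collect, count_ops

a, b, c = symbols('a b c')
expr = 2.0*a*c + 2.0*b*c          # 5 operations
collected = collect(expr, c)      # c*(2.0*a + 2.0*b), 4 operations
assert count_ops(collected) < count_ops(expr)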
def _extract_time_invariants(self, cluster, template, with_cse=True,
                             costmodel=None, **kwargs):
    """
    Extract time-invariant subexpressions, and assign them to temporaries.
    """
    # Extract time invariants
    make = lambda i: ScalarFunction(name=template(i)).indexify()
    rule = iq_timeinvariant(cluster.trace)
    costmodel = costmodel or (lambda e: estimate_cost(e) > 0)
    processed, found = xreplace_constrained(cluster.exprs, make, rule, costmodel)

    if with_cse:
        leaves = [i for i in processed if i not in found]

        # Search for common sub-expressions amongst them (and only them)
        make = lambda i: ScalarFunction(name=template(i + len(found))).indexify()
        found = common_subexprs_elimination(found, make)

        # Some temporaries may be droppable at this point
        processed = compact_temporaries(found + leaves)

    return cluster.reschedule(processed)
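# A hedged sketch of the rewrite performed above, with plain sympy objects in
# place of ScalarFunction and xreplace_constrained; 'ti0' mimics the name that
# template(0) might produce. The time-invariant a[x,y,z] + b[x,y,z] is assigned
# to a temporary so it is no longer recomputed at every time level t.
from sympy import Eq, IndexedBase, Symbol, symbols

t, x, y, z = symbols('t x y z')
a, b, c, u = IndexedBase('a'), IndexedBase('b'), IndexedBase('c'), IndexedBase('u')
ti0 = Symbol('ti0')

expr = Eq(u[t, x, y, z], (a[x, y, z] + b[x, y, z])*c[t, x, y, z])
extracted = [Eq(ti0, a[x, y, z] + b[x, y, z]),
             Eq(u[t, x, y, z], ti0*c[t, x, y, z])]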
def wrapper(self, state, **kwargs):
    if self.mode.intersection(set(self.triggers[func.__name__])):
        tic = time()
        state.update(flatten([func(self, c) for c in state.clusters]))
        toc = time()

        key = '%s%d' % (func.__name__, len(self.timings))
        self.timings[key] = toc - tic
        if self.profile:
            candidates = [c.exprs for c in state.clusters if c.is_dense]
            self.ops[key] = estimate_cost(flatten(candidates))
def common_subexprs_elimination(exprs, make, mode='default'):
    """
    Perform common sub-expressions elimination.

    Note: the output is not guaranteed to be topologically sorted.

    :param exprs: The target SymPy expression, or a collection of SymPy
                  expressions.
    :param make: A function to construct symbols used for replacement.
                 The function takes as input an integer ID; ID is computed
                 internally and used as a unique identifier for the
                 constructed symbols.
    """
    # Note: not defaulting to SymPy's CSE() function for three reasons:
    # - it also captures array index access functions (eg, i+1 in A[i+1] and B[i+1]);
    # - it sometimes "captures too much", losing factorization opportunities;
    # - very slow
    # TODO: a second "sympy" mode will be provided, relying on SymPy's CSE() but
    # also ensuring some sort of post-processing
    assert mode == 'default'  # Only supported mode ATM

    processed = list(exprs)
    mapped = []
    while True:
        # Detect redundancies
        counted = count(mapped + processed, q_op).items()
        targets = OrderedDict([(k, estimate_cost(k)) for k, v in counted if v > 1])
        if not targets:
            break

        # Create temporaries
        hit = max(targets.values())
        picked = [k for k, v in targets.items() if v == hit]
        mapper = OrderedDict([(e, make(len(mapped) + i)) for i, e in enumerate(picked)])

        # Apply replacements
        processed = [e.xreplace(mapper) for e in processed]
        mapped = [e.xreplace(mapper) for e in mapped]
        mapped = [Eq(v, k) for k, v in reversed(list(mapper.items()))] + mapped

        # Prepare for the next round
        for k in picked:
            targets.pop(k)

    processed = mapped + processed

    # Simply renumber the temporaries in ascending order
    mapper = {i.lhs: j.lhs for i, j in zip(mapped, reversed(mapped))}
    processed = [e.xreplace(mapper) for e in processed]

    # Some temporaries may be droppable at this point
    processed = compact_temporaries(processed)

    return processed
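# A self-contained sketch of the kind of rewrite common_subexprs_elimination
# performs, assuming the illustrative symbol names below: the subexpression
# a + b appears twice, so it is hoisted into a temporary named in the style of
# what make(0) would construct (here simply the Symbol 'r0').
from sympy import Eq, symbols

a, b, c, d, e0, e1, r0 = symbols('a b c d e0 e1 r0')

exprs = [Eq(e0, 2.0*(a + b) + c), Eq(e1, 3.0*(a + b) + d)]
mapper = {a + b: r0}
processed = [Eq(r0, a + b)] + [e.xreplace(mapper) for e in exprs]
# processed == [Eq(r0, a + b), Eq(e0, 2.0*r0 + c), Eq(e1, 3.0*r0 + d)]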
def _extract_time_varying(self, cluster, template, **kwargs):
    """
    Extract time-varying subexpressions, and assign them to temporaries.
    Time-varying subexpressions arise, for example, when approximating
    derivatives through finite differences.
    """
    make = lambda i: ScalarFunction(name=template(i)).indexify()
    rule = iq_timevarying(cluster.trace)
    costmodel = lambda i: estimate_cost(i) > 0
    processed, _ = xreplace_constrained(cluster.exprs, make, rule, costmodel)

    return cluster.reschedule(processed)
def _extract_time_varying(self, cluster, **kwargs):
    """
    Extract time-varying subexpressions, and assign them to temporaries.
    Time-varying subexpressions arise, for example, when approximating
    derivatives through finite differences.
    """
    template = self.conventions['time-dependent'] + "%d"
    make = lambda i: ScalarFunction(name=template % i).indexify()
    rule = iq_timevarying(cluster.trace)
    cm = lambda i: estimate_cost(i) > 0
    processed, _ = xreplace_constrained(cluster.exprs, make, rule, cm)

    return cluster.rebuild(processed)
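# A hedged sketch of a time-varying subexpression of the kind mentioned in the
# docstring: a forward difference in time reads u at two time levels, so the
# subexpression depends on the index t and would be assigned to a temporary
# named after self.conventions['time-dependent'] (here illustratively 'td0');
# plain sympy objects stand in for ScalarFunction.
from sympy import Eq, IndexedBase, Symbol, symbols

t, x, dt = symbols('t x dt')
u = IndexedBase('u')
td0 = Symbol('td0')

dudt = (u[t + 1, x] - u[t, x])/dt   # time-varying: involves the index t
extracted = Eq(td0, dudt)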
def wrapper(self, state, **kwargs):
    # A template to construct temporaries
    tempname = self.conventions.get(func.__name__)
    if tempname:
        start = kwargs.get('start')
        tempname += '%d' if start is None else (('_%d_' % start) + '%d')
        template = lambda i: tempname % i
    else:
        template = None

    # Invoke the DSE pass
    tic = time()
    state.update(flatten([func(self, c, template, **kwargs)
                          for c in state.clusters]))
    toc = time()

    # Profiling
    key = '%s%d' % (func.__name__, len(self.timings))
    self.timings[key] = toc - tic
    if self.profile:
        candidates = [c.exprs for c in state.clusters if c.is_dense]
        self.ops[key] = estimate_cost(flatten(candidates))
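# A minimal sketch of the name construction above, assuming a hypothetical
# convention 'ti' for the decorated pass: with no 'start' the temporaries are
# named ti0, ti1, ..., while start=2 yields ti_2_0, ti_2_1, ...
tempname, start = 'ti', None
tempname += '%d' if start is None else (('_%d_' % start) + '%d')
template = lambda i: tempname % i
assert template(3) == 'ti3'

tempname, start = 'ti', 2
tempname += '%d' if start is None else (('_%d_' % start) + '%d')
template = lambda i: tempname % i
assert template(3) == 'ti_2_3'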
def _extract_time_invariants(self, cluster, **kwargs):
    """
    Extract time-invariant subexpressions, and assign them to temporaries.
    """
    # Extract time invariants
    template = self.conventions['time-invariant'] + "%d"
    make = lambda i: ScalarFunction(name=template % i).indexify()
    rule = iq_timeinvariant(cluster.trace)
    cm = lambda e: estimate_cost(e) > 0
    processed, found = xreplace_constrained(cluster.exprs, make, rule, cm)
    leaves = [i for i in processed if i not in found]

    # Search for common sub-expressions amongst them (and only them)
    template = "%s%s%s" % (self.conventions['redundancy'],
                           self.conventions['time-invariant'], '%d')
    make = lambda i: ScalarFunction(name=template % i).indexify()
    found = common_subexprs_elimination(found, make)

    return cluster.rebuild(found + leaves)
def _eliminate_inter_stencil_redundancies(self, cluster, **kwargs):
    """
    Search for redundancies across the expressions and expose them to the
    later stages of the optimisation pipeline by introducing new temporaries
    of suitable rank.

    Two types of redundancies are sought:

        * Time-invariants, and
        * Across different space points

    Examples
    ========
    Let ``t`` be the time dimension, ``x, y, z`` the space dimensions. Then:

    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]
       >>> ti[x,y,z] = a[x,y,z] + b[x,y,z]
       temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]
       >>> ti[x,y,z] = a[x,y,z]*b[x,y,z]
       temp1 = 2.0*ti[x,y,z]
       temp2 = 3.0*ti[x,y,z+1]
    """
    if cluster.is_sparse:
        return cluster

    # For more information about "aliases", refer to collect_aliases.__doc__
    mapper, aliases = collect_aliases(cluster.exprs)

    # Redundancies will be stored in space-varying temporaries
    g = cluster.trace
    indices = g.space_indices
    shape = g.space_shape

    # Template for captured redundancies
    name = self.conventions['redundancy'] + "%d"
    template = lambda i: TensorFunction(name=name % i, shape=shape,
                                        dimensions=indices).indexed

    # Find the candidate expressions
    processed = []
    candidates = OrderedDict()
    for k, v in g.items():
        # Cost check (to keep the memory footprint under control)
        naliases = len(mapper.get(v.rhs, []))
        cost = estimate_cost(v, True)*naliases
        if cost >= self.thresholds['min-cost-time-hoist'] and g.time_invariant(v):
            candidates[v.rhs] = k
        elif cost >= self.thresholds['min-cost-space-hoist'] and naliases > 1:
            candidates[v.rhs] = k
        else:
            processed.append(Eq(k, v.rhs))

    # Create temporaries capturing redundant computation
    found = []
    rules = OrderedDict()
    stencils = []
    for c, (origin, alias) in enumerate(aliases.items()):
        temporary = Indexed(template(c), *indices)
        found.append(Eq(temporary, origin))
        # Track the stencil of each TensorFunction introduced
        stencils.append(alias.anti_stencil.anti(cluster.stencil))
        for aliased, distance in alias.with_distance:
            coordinates = [sum([i, j]) for i, j in distance.items() if i in indices]
            rules[candidates[aliased]] = Indexed(template(c), *tuple(coordinates))

    # Create the alias clusters
    alias_clusters = clusterize(found, stencils)
    alias_clusters = sorted(alias_clusters, key=lambda i: i.is_dense)

    # Switch temporaries in the expression trees
    processed = [e.xreplace(rules) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]
def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
    """
    Search for redundancies across the expressions and expose them to the
    later stages of the optimisation pipeline by introducing new temporaries
    of suitable rank.

    Two types of redundancies are sought:

        * Time-invariants, and
        * Across different space points

    Examples
    ========
    Let ``t`` be the time dimension, ``x, y, z`` the space dimensions. Then:

    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]
       >>> ti[x,y,z] = a[x,y,z] + b[x,y,z]
       temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]
       >>> ti[x,y,z] = a[x,y,z]*b[x,y,z]
       temp1 = 2.0*ti[x,y,z]
       temp2 = 3.0*ti[x,y,z+1]
    """
    if cluster.is_sparse:
        return cluster

    # For more information about "aliases", refer to collect.__doc__
    mapper, aliases = collect(cluster.exprs)

    # Redundancies will be stored in space-varying temporaries
    g = cluster.trace
    indices = g.space_indices
    time_invariants = {v.rhs: g.time_invariant(v) for v in g.values()}

    # Template for captured redundancies
    shape = tuple(i.symbolic_size for i in indices)
    make = lambda i: TensorFunction(name=template(i), shape=shape,
                                    dimensions=indices).indexed

    # Find the candidate expressions
    processed = []
    candidates = OrderedDict()
    for k, v in g.items():
        # Cost check (to keep the memory footprint under control)
        naliases = len(mapper.get(v.rhs, []))
        cost = estimate_cost(v, True)*naliases
        if cost >= self.thresholds['min-cost-alias'] and \
                (naliases > 1 or time_invariants[v.rhs]):
            candidates[v.rhs] = k
        else:
            processed.append(Eq(k, v.rhs))

    # Create temporaries capturing redundant computation
    expressions = []
    stencils = []
    rules = OrderedDict()
    for c, (origin, alias) in enumerate(aliases.items()):
        if all(i not in candidates for i in alias.aliased):
            continue
        # Build alias expression
        function = make(c)
        expressions.append(Eq(Indexed(function, *indices), origin))
        # Build substitution rules
        for aliased, distance in alias.with_distance:
            coordinates = [sum([i, j]) for i, j in distance.items() if i in indices]
            temporary = Indexed(function, *tuple(coordinates))
            rules[candidates[aliased]] = temporary
            rules[aliased] = temporary
        # Build cluster stencil
        stencil = alias.anti_stencil.anti(cluster.stencil)
        if all(time_invariants[i] for i in alias.aliased):
            # Optimization: drop time dimension if time-invariant and the
            # alias involves a complex calculation
            stencil = stencil.section(g.time_indices)
        stencils.append(stencil)

    # Create the alias clusters
    alias_clusters = clusterize(expressions, stencils, indices)
    alias_clusters = sorted(alias_clusters, key=lambda i: i.is_dense)

    # Switch temporaries in the expression trees
    processed = [e.xreplace(rules) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]
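# A self-contained sketch of example 2) from the docstring, with plain sympy
# Indexed objects standing in for the TensorFunction that make(c) would build:
# the product a*b is computed once per space point into r0, and both aliased
# expressions are rewritten to read it, shifted by their distance in z.
from sympy import Eq, IndexedBase, symbols

x, y, z = symbols('x y z')
a, b, r0 = IndexedBase('a'), IndexedBase('b'), IndexedBase('r0')
temp1, temp2 = symbols('temp1 temp2')

alias = Eq(r0[x, y, z], a[x, y, z]*b[x, y, z])
rewritten = [Eq(temp1, 2.0*r0[x, y, z]),
             Eq(temp2, 3.0*r0[x, y, z + 1])]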