def mapping_point_generator_function(resource, layer, schedule=None, verbose=False):
    '''
    Mapping point generator.

    Lazily yields one MappingPoint for every loop order of every
    blocking/partitioning candidate that passes
    cost_model.valid_mapping_point.

    resource -- hardware description; must provide buffer_levels()
    layer    -- layer description, forwarded to the helpers unchanged
    schedule -- optional schedule hint, forwarded to the candidate generator
    verbose  -- forwarded to cost_model.valid_mapping_point
    '''
    num_levels = resource.buffer_levels()

    blocking_partitioning_generator = \
        blocking_partitioning_generator_function(resource, layer, schedule)

    for blocking_partitioning in blocking_partitioning_generator:
        # The candidate generator yields [blocking, partitioning, para_dim].
        # Unpacking into only two names raised ValueError on the very first
        # candidate, and para_dim was never handed on to the MappingPoint.
        blocking, partitioning, para_dim = blocking_partitioning

        # dummy_mapping_point (loop order None) is used to validate the
        # current candidate. The validation does not depend on loop orders,
        # so doing it here avoids generating every loop order for a
        # candidate that is going to be rejected anyway.
        dummy_mapping_point = MappingPoint(None, blocking, partitioning, para_dim)
        if not cost_model.valid_mapping_point(resource, dummy_mapping_point,
                                              layer, verbose):
            continue

        order_generator = \
            opt_order_generator_function(dummy_mapping_point, le.NUM, num_levels)
        for loop_order in order_generator:
            yield MappingPoint(loop_order, blocking, partitioning, para_dim)
def dataflow_exploration(resource, layer, file_name, verbose=False): ''' Dataflow exploration. Generates a table, with unrolled loops being keys, the best energy (and utilization) being the values. ''' dataflow_tb = {} num_levels = resource.buffer_levels() parallel_levels = resource.para_index blocking_partitioning_generator = \ blocking_partitioning_generator_function(resource, layer, None) # dummy_partitioning = [(1,) * num_levels] * le.NUM smallest_cost = float("inf") # best_mapping_point = None for blocking_partitioning in blocking_partitioning_generator: ''' dummy_mapping_point is used to validate the current blocking_partitioning, and abandon the ones that exceed the buffer size at any level. Since this validation does not depend on loop_orders, we perform the validation at this early stage, so that we can avoid generating all the loop orders for an invalid blocking_partitioning ''' if verbose >= 2: print "Find best order for schedule: ", blocking_partitioning [blocking, partitioning, para_dim] = blocking_partitioning dummy_mapping_point = MappingPoint(None, blocking, partitioning, para_dim) # print "partitioning: ", partitioning unrolled_loops, utilized = partitioned_loop_string( partitioning, parallel_levels, para_dim) utilization = get_utilization(utilized, resource) if resource.replication and utilization < resource.utilization_threshold: continue cost, loop_order = opt_get_best_loop_order(resource, layer, dummy_mapping_point, verbose) if unrolled_loops not in dataflow_tb or dataflow_tb[unrolled_loops][ 0] > cost: best_mapping_point = MappingPoint(loop_order, blocking, partitioning, para_dim) dataflow_tb[unrolled_loops] = (cost, utilization, best_mapping_point ) # TODO utilization if verbose: print "unrolled loops: ", unrolled_loops, " with utilization ", utilization # print "best loop order: ", best_mapping_point.loop_orders print "blocking: ", blocking print "partitioning: ", partitioning print "Update smallest cost: ", dataflow_tb[unrolled_loops][0] # 
print "Update best shedule: ", utils.print_loop_nest(best_mapping_point) # assert best_mapping_point, "No valid mapping point found." pickle_file_name = file_name + ".pickle" pickle.dump(dataflow_tb, open(pickle_file_name, "wb")) return dataflow_tb
def blocking_partitioning_generator_function(resource, layer, schedule, verbose=False):
    '''
    Generate all blocking and partitioning choices.

    Yields [blocking_list, partitioning_list, para_dim] for every candidate
    accepted by cost_model.valid_partitioning.

    NOTE(review): the original note says only power-of-2 sizes are explored
    for speed reasons; that restriction would live in the upstream
    generators and is not visible in this function -- confirm there.
    '''
    level_count = resource.buffer_levels()

    tilings = blocking_generator_function(resource, layer, schedule, verbose)
    for tiling in tilings:
        # Transpose so that each row holds the sizes of one level.
        tiling_by_level = zip(*tiling)

        unrollings = parallel_blocking_generator_function(
            tiling_by_level, resource, layer, schedule)
        for unrolling_by_level, para_dim in unrollings:
            # Ceiling division of every total size by its unrolled size.
            # TODO check if using two maps with floordiv is faster
            remaining_by_level = [
                [(total + unrolled - 1) // unrolled
                 for total, unrolled in zip(tiling_by_level[lvl],
                                            unrolling_by_level[lvl])]
                for lvl in xrange(level_count)
            ]

            # Transpose back to one row per loop.
            blocking_list = zip(*remaining_by_level)
            partitioning_list = zip(*unrolling_by_level)

            probe = MappingPoint(None, blocking_list, partitioning_list, para_dim)
            if not cost_model.valid_partitioning(resource, probe, layer, verbose):
                continue
            yield [blocking_list, partitioning_list, para_dim]
def opt_valid_blocking(blocking_cache, resource, layer, blocking):
    '''
    Return True when the blocking passes
    cost_model.valid_blocking_size_current_level at every buffer level.

    The check is done on a mapping point with an all-ones partitioning and
    no loop order. The verdict for level 0 is memoized in blocking_cache,
    keyed by the level-0 column of the blocking; higher levels are always
    recomputed.

    blocking_cache -- object providing read_cache(level, key) and
                      write_cache(level, key, value); read_cache is
                      expected to return None on a miss
    resource       -- hardware description; must provide buffer_levels()
    layer          -- layer description, forwarded to the cost model
    blocking       -- per-loop blocking factors, one row per loop
    '''
    num_levels = resource.buffer_levels()
    blocking_tuple = zip(*blocking)
    dummy_partitioning = [(1, ) * num_levels] * le.NUM
    dummy_mapping_point = MappingPoint(None, list(blocking), dummy_partitioning)

    # Use the cache for the first level.
    level = 0
    valid = blocking_cache.read_cache(level, blocking_tuple[level])
    # Identity test: a cached False verdict is a hit, only None is a miss.
    if valid is None:
        valid = cost_model.valid_blocking_size_current_level(
            resource, dummy_mapping_point, layer, level)
        blocking_cache.write_cache(level, blocking_tuple[level], valid)
    if not valid:
        return False

    # all() short-circuits on the first failing level, like the explicit loop.
    return all(
        cost_model.valid_blocking_size_current_level(
            resource, dummy_mapping_point, layer, level)
        for level in xrange(1, num_levels))
def opt_mapping_point_generator_function(resource, layer, schedule=None, verbose=False): ''' Mapping point generator. Generates a new mapping point each iteration. ''' num_levels = resource.buffer_levels() blocking_partitioning_generator = \ blocking_partitioning_generator_function(resource, layer, schedule) # dummy_partitioning = [(1,) * num_levels] * le.NUM smallest_cost = float("inf") best_mapping_point = None for blocking_partitioning in blocking_partitioning_generator: ''' dummy_mapping_point is used to validate the current blocking_partitioning, and abandon the ones that exceed the buffer size at any level. Since this validation does not depend on loop_orders, we perform the validation at this early stage, so that we can avoid generating all the loop orders for an invalid blocking_partitioning ''' if verbose >= 2: print "Find best order for schedule: ", blocking_partitioning [blocking, partitioning, para_dim] = blocking_partitioning dummy_mapping_point = MappingPoint(None, blocking, partitioning, para_dim) # print "blocking_partitioning: ", blocking_partitioning cost, loop_order = opt_get_best_loop_order(resource, layer, dummy_mapping_point, verbose) if cost < smallest_cost: smallest_cost = cost best_mapping_point = MappingPoint(loop_order, blocking, partitioning, para_dim) if verbose >= 2: print "best loop order: ", best_mapping_point.loop_orders print "Update smallest cost: ", smallest_cost print "Update best schedule: ", utils.print_loop_nest( best_mapping_point) assert best_mapping_point, "No valid mapping point found." return smallest_cost, best_mapping_point
def opt_mapping_point_generator_function(resource, layer, schedule=None, verbose=False): ''' Mapping point generator. Generates a new mapping point each iteration. ''' num_levels = resource.buffer_levels() parallel_levels = resource.para_index ideal_perf = cost_model.get_ideal_performance(layer, resource) blocking_partitioning_generator = \ blocking_partitioning_generator_function(resource, layer, schedule) smallest_cost = float("inf") best_mapping_point = None for blocking_partitioning in blocking_partitioning_generator: ''' dummy_mapping_point is used to validate the current blocking_partitioning, and abandon the ones that exceed the buffer size at any level. Since this validation does not depend on loop_orders, we perform the validation at this early stage, so that we can avoid generating all the loop orders for an invalid blocking_partitioning ''' if verbose >= 3: print "Find best order for schedule: ", blocking_partitioning [blocking, partitioning, para_dim] = blocking_partitioning dummy_mapping_point = MappingPoint(None, blocking, partitioning, para_dim) cost, loop_order = opt_get_best_loop_order(resource, layer, dummy_mapping_point, verbose) if cost < smallest_cost: smallest_cost = cost best_mapping_point = MappingPoint(loop_order, blocking, partitioning, para_dim) unrolled_loops, utilized = partitioned_loop_string(partitioning, parallel_levels, para_dim) utilization = get_utilization(utilized, resource) perf = ideal_perf / utilization if verbose >= 3: print "best loop order: ", best_mapping_point.loop_orders if verbose >= 2: print "runtime (cycles): ", perf, "utilization: ", utilization print "Update smallest cost (pJ): ", smallest_cost print "Update best shedule: ", utils.print_loop_nest(best_mapping_point) assert best_mapping_point, "No valid mapping point found." return smallest_cost, perf, best_mapping_point
def opt_get_best_loop_order(resource, layer, point, verbose=False):
    '''
    Choose the cheapest loop order one buffer level at a time.

    Per the original note: without partitioning, the cost of a level only
    depends on that level's loop order once the blocking factors are fixed,
    so each level can be optimized on its own.

    Returns (best_cost, loop_orders): the sum of the per-level minimum
    costs and the chosen per-level orders transposed with zip(*...).
    '''
    level_count = resource.buffer_levels()
    tile_sizes = point.loop_blockings
    unroll_sizes = point.loop_partitionings
    unroll_dims = point.para_loop_dim
    active_loops = get_non_empty_loops(point, level_count)

    total_cost = 0
    orders_by_level = []
    for level in xrange(level_count):
        level_min_cost = float("inf")
        candidates = level_order_generator_function(point, le.NUM, active_loops, level)
        for candidate in candidates:
            # All-zero orders everywhere except the level under test.
            trial_orders = [[0] * le.NUM] * level_count
            trial_orders[level] = candidate
            trial_point = MappingPoint(zip(*trial_orders), tile_sizes,
                                       unroll_sizes, unroll_dims)

            level_only = (level <= 0
                          or resource.paras[level - 1].count <= 1
                          or resource.paras[level - 1].access_mode < 1)
            if level_only:
                estimate = cost_model.get_level_cost
            else:
                estimate = cost_model.get_array_and_curr_level_cost
            candidate_cost = estimate(resource, trial_point, layer, level, verbose)

            if candidate_cost < level_min_cost:
                level_best_order, level_min_cost = candidate, candidate_cost

            # With no MAC capacity only the first order of level 0 is tried.
            if resource.mac_capacity == 0 and level == 0:
                break

        # level_best_order is deliberately not reset per level, exactly as
        # in the original.
        orders_by_level.append(level_best_order)
        total_cost += level_min_cost

    return total_cost, zip(*orders_by_level)
def opt_get_best_loop_order(resource, layer, point, verbose=False):
    '''
    Find the cheapest loop order for every buffer level independently.

    Returns (best_cost, loop_orders): best_cost is the sum of the smallest
    cost found at each level, loop_orders is the list of chosen per-level
    orders transposed with zip(*...).

    Background notes from the original author:

    [HW template right now: systolic array]
    [SRAM only talks to the PEs on the edge, most PEs get their data from a
    neighbour PE]

    When there is no partitioning (parallelism), the cost of the current
    level only depends on the current level loop order, given the blocking
    factors. Thus we can leverage this to find the best loop order for each
    level individually.

    When there is partitioning (parallelism), the number of times the
    parallelized memory level passes data to its neighbour PE (the energy
    spent on the interconnect, array_cost) depends on the parallelism size
    of the current level and on the memory accesses from the level above.

    The lowest level memory access (talking to the MAC) only depends on the
    NN layer size.

    level access: [input, weight, output], in number of elements
    level order:  [fx, fy, ox, oy, oc, ic, on]; '0' is the innermost loop,
                  '6' the outermost loop or a non-existent loop
    '''
    num_levels = resource.buffer_levels()
    best_loop_order = []
    blocking = point.loop_blockings
    partitioning = point.loop_partitionings
    para_dim = point.para_loop_dim
    non_empty_loops = get_non_empty_loops(point, num_levels)
    # print blocking, partitioning
    best_cost = 0
    para_level = 0  # NOTE(review): never read below
    for level in xrange(num_levels):
        smallest_cost = float("inf")
        # LMEI: later, the exhaustive order search might be sped up by
        # identifying symmetrical terms (e.g. ox and oy, fx and fy) and
        # dropping the equivalent orders.
        for curr_level_order in level_order_generator_function(
                point, le.NUM, non_empty_loops, level):
            # All-zero orders for every level, then the candidate order is
            # dropped in for the level under test. The row is replaced, not
            # mutated, so the aliasing created by list repetition is harmless.
            dummy_loop_order = [[0] * le.NUM] * num_levels
            dummy_loop_order[level] = curr_level_order
            mapping_point = MappingPoint(zip(*dummy_loop_order), blocking,
                                         partitioning, para_dim)
            # Plain level cost for level 0, or when the paras entry below
            # this level has count <= 1 or access_mode < 1; otherwise the
            # combined array + level cost.
            # NOTE(review): the original author marked this condition
            # "don't get it" -- the meaning of access_mode should be
            # confirmed against the resource definition.
            if level <= 0 or resource.paras[level - 1].count <= 1 \
                    or resource.paras[level - 1].access_mode < 1:
                curr_cost = cost_model.get_level_cost(resource, mapping_point,
                                                      layer, level, verbose)
            else:
                curr_cost = cost_model.get_array_and_curr_level_cost(
                    resource, mapping_point, layer, level, verbose)
            if curr_cost < smallest_cost:
                best_curr_level_order = curr_level_order
                smallest_cost = curr_cost
            # NOTE(review): best_curr_level_order is only bound once some
            # candidate beat the running minimum; it is not reset per level.
            if verbose >= 3:
                print "Level", level, "Current order:", curr_level_order, " Best order:", best_curr_level_order
                print "Level", level, "Current energy:", '%20d' % curr_cost, " Best energy:", '%20d' % smallest_cost
                print ""
            # LMEI: later, instead of using mac_capacity, a 4-level memory
            # model could treat mac_capacity as the innermost memory level
            # for the output.
            # Per the original author, the loop order of the innermost level
            # does not matter in this case, so the search stops after the
            # first candidate.
            if resource.mac_capacity == 0 and level == 0:
                break
        best_loop_order.append(best_curr_level_order)
        best_cost += smallest_cost
    return best_cost, zip(*best_loop_order)
def blocking_partitioning_generator_function(resource, layer, schedule, verbose=False): ''' loop_blocking_list and loop_partitioning_list generator. loop_blocking: [[Total size (temporal+spatial) of Fx @ mem level 0,1,2],[Fy],[OX],[OY],[OC],[IC],[ON]] loop_blocking_reshape: [(All loops' total size (temporal+spatial) @ mem level 0),(@ level 1),(@ level 2)] partition: [[All loops' spatial unrolled size @ mem level 0],[@ level 1],[@ level 2]] para_dim: [[Spatial unrolled loop dimensions @ mem level 0],[@ level 1],[@ level 2]] partitioned_loop_blocking_reshape: [[All loops' temporal unrolled size @ mem level 0],[@ level 1],[@ level 2]] blocking_list: [[Temporal unrolled size of Fx @ mem level 0,1,2],[Fy],[OX],[OY],[OC],[IC],[ON]] partitioning_list: [[Spatial unrolled size of Fx @ mem level 0,1,2],[Fy],[OX],[OY],[OC],[IC],[ON]] ''' num_level = resource.buffer_levels() blocking_generator = blocking_generator_function(resource, layer, schedule, verbose) for loop_blocking in blocking_generator: if verbose == 3: print "loop_tilling: ", loop_blocking loop_blocking_reshape = zip(*loop_blocking) pb_generator = parallel_blocking_generator_function( loop_blocking_reshape, resource, layer, schedule) for pi in pb_generator: partition, para_dim = pi partitioned_loop_blocking_reshape = [] for level in xrange(num_level): partitioned_loop_blocking_reshape.append([ (x + y - 1) // y for x, y in zip( loop_blocking_reshape[level], partition[level]) ]) # TODO check if using two maps with floordiv is faster blocking_list = zip(*partitioned_loop_blocking_reshape) partitioning_list = zip(*partition) if verbose == 3: print "loop_blocking: ", blocking_list print "loop_partition: ", partitioning_list print "para_dimension: ", para_dim dummy_mapping_point = MappingPoint(None, blocking_list, partitioning_list, para_dim) if cost_model.valid_partitioning(resource, dummy_mapping_point, layer, verbose): # if cost_model.valid_mapping_point(resource, dummy_mapping_point, layer, verbose): if verbose == 
3: print "Valid" print "" yield [blocking_list, partitioning_list, para_dim] # else: # print "invalid" # print "" else: if verbose == 3: print "invalid" print ""