def mapping_point_generator_function(resource, layer, schedule=None, verbose=False):
    '''
    Mapping point generator.

    Generates a new mapping point each iteration.
    '''

    num_levels = resource.buffer_levels()

    blocking_partitioning_generator = \
        blocking_partitioning_generator_function(resource, layer, schedule)

    for blocking_partitioning in blocking_partitioning_generator:
        '''
        dummy_mapping_point is used to validate the current blocking_partitioning
        and abandon the ones that exceed the buffer size at any level. Since this
        validation does not depend on loop_orders, we perform it at this early
        stage to avoid generating all the loop orders for an invalid
        blocking_partitioning.
        '''
        [blocking, partitioning] = blocking_partitioning
        dummy_mapping_point = MappingPoint(None, blocking, partitioning)
        if cost_model.valid_mapping_point(resource, dummy_mapping_point, layer, verbose):
            order_generator = \
                opt_order_generator_function(dummy_mapping_point, le.NUM, num_levels)
            for loop_order in order_generator:
                mapping_point = MappingPoint(loop_order, \
                                blocking, \
                                partitioning)
                yield mapping_point
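

# A minimal, self-contained sketch of the validate-early pattern used above:
# check each candidate blocking once, before enumerating its (factorially
# many) loop orders. The candidates and the validity test are made up for
# illustration; only the structure mirrors mapping_point_generator_function.
from itertools import permutations

def toy_mapping_point_generator(candidates, budget):
    for blocking in candidates:
        # cheap, order-independent validity check first, so invalid
        # blockings never pay for permutation enumeration
        if sum(blocking) > budget:
            continue
        for order in permutations(range(len(blocking))):
            yield order, blocking

# list(toy_mapping_point_generator([(1, 2), (8, 8)], 6)) enumerates loop
# orders only for (1, 2); (8, 8) is abandoned up front.
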
def dataflow_exploration(resource, layer, file_name, verbose=False):
    '''
    Dataflow exploration.

    Generates a table that maps unrolled loops (keys) to the best energy and
    utilization (values).
    '''

    dataflow_tb = {}
    num_levels = resource.buffer_levels()
    parallel_levels = resource.para_index

    blocking_partitioning_generator = \
        blocking_partitioning_generator_function(resource, layer, None)

    # dummy_partitioning = [(1,) * num_levels] * le.NUM

    smallest_cost = float("inf")
    # best_mapping_point = None
    for blocking_partitioning in blocking_partitioning_generator:
        '''
        dummy_mapping_point is used to validate the current blocking_partitioning
        and abandon the ones that exceed the buffer size at any level. Since this
        validation does not depend on loop_orders, we perform it at this early
        stage to avoid generating all the loop orders for an invalid
        blocking_partitioning.
        '''
        if verbose >= 2:
            print "Find best order for schedule: ", blocking_partitioning
        [blocking, partitioning, para_dim] = blocking_partitioning
        dummy_mapping_point = MappingPoint(None, blocking, partitioning,
                                           para_dim)
        # print "partitioning: ", partitioning
        unrolled_loops, utilized = partitioned_loop_string(
            partitioning, parallel_levels, para_dim)
        utilization = get_utilization(utilized, resource)
        if resource.replication and utilization < resource.utilization_threshold:
            continue
        cost, loop_order = opt_get_best_loop_order(resource, layer,
                                                   dummy_mapping_point,
                                                   verbose)
        if unrolled_loops not in dataflow_tb or dataflow_tb[unrolled_loops][0] > cost:
            best_mapping_point = MappingPoint(loop_order, blocking,
                                              partitioning, para_dim)
            dataflow_tb[unrolled_loops] = (cost, utilization,
                                           best_mapping_point
                                           )  # TODO utilization
            if verbose:
                print "unrolled loops: ", unrolled_loops, " with utilization ", utilization
                # print "best loop order: ", best_mapping_point.loop_orders
                print "blocking: ", blocking
                print "partitioning: ", partitioning
                print "Update smallest cost: ", dataflow_tb[unrolled_loops][0]
                # print "Update best shedule: ", utils.print_loop_nest(best_mapping_point)
    # assert best_mapping_point, "No valid mapping point found."
    pickle_file_name = file_name + ".pickle"
    with open(pickle_file_name, "wb") as f:
        pickle.dump(dataflow_tb, f)
    return dataflow_tb
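

# A small companion helper, assuming only what dataflow_exploration does above:
# the table is pickled to file_name + ".pickle". Unpickling the stored
# (cost, utilization, best_mapping_point) tuples requires MappingPoint to be
# importable in the reading process.
import pickle

def load_dataflow_table(file_name):
    with open(file_name + ".pickle", "rb") as f:
        return pickle.load(f)
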
def blocking_partitioning_generator_function(resource, layer, schedule, verbose=False):
    '''
    Generate all blocking and partitioning choices; only explore sizes that are
    powers of 2, for speed.
    ''' 
    #loop_blocking_list and loop_partitioning_list generator.
    
    num_level = resource.buffer_levels()
    blocking_generator = blocking_generator_function(resource, layer, schedule, verbose)

    for loop_blocking in blocking_generator:
        
        loop_blocking_reshape = zip(*loop_blocking)
        pb_generator = parallel_blocking_generator_function(loop_blocking_reshape, resource, layer, schedule)
        
        for pi in pb_generator:
            partition, para_dim = pi
            partitioned_loop_blocking_reshape = []
            for level in xrange(num_level):
                partitioned_loop_blocking_reshape.append([ (x+y-1) // y 
                    for x, y in zip(loop_blocking_reshape[level], partition[level])])   #TODO check if using two maps with floordiv is faster 
            blocking_list = zip(*partitioned_loop_blocking_reshape)
            partitioning_list = zip(*partition)
            
            dummy_mapping_point = MappingPoint(None, blocking_list, partitioning_list, para_dim)
            if cost_model.valid_partitioning(resource, dummy_mapping_point, layer, verbose):
                yield [blocking_list, partitioning_list, para_dim]
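

# A worked toy example (made-up numbers) of the reshape arithmetic above:
# transpose the per-loop blocking into per-level tuples, then ceiling-divide
# each total (temporal + spatial) size by its spatial partition.
loop_blocking = [(4, 2, 1), (6, 1, 1)]       # 2 loops x 3 levels, total sizes
loop_blocking_reshape = zip(*loop_blocking)  # [(4, 6), (2, 1), (1, 1)], per level
partition = [(2, 3), (1, 1), (1, 1)]         # spatial (unrolled) sizes per level
temporal = [[(x + y - 1) // y for x, y in zip(t, p)]
            for t, p in zip(loop_blocking_reshape, partition)]
# temporal == [[2, 2], [2, 1], [1, 1]]; zip(*temporal) restores per-loop tuples
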
def opt_valid_blocking(blocking_cache, resource, layer, blocking):
    num_levels = resource.buffer_levels()
    blocking_tuple = zip(*blocking)
    dummy_partitioning = [(1, ) * num_levels] * le.NUM
    dummy_mapping_point = MappingPoint(None, list(blocking),
                                       dummy_partitioning)
    '''
    Use the cache to look up the validity of the first level.
    '''
    level = 0
    value_in_cache = blocking_cache.read_cache(level, blocking_tuple[level])
    if value_in_cache is None:
        valid = cost_model.valid_blocking_size_current_level(
            resource, dummy_mapping_point, layer, level)
        blocking_cache.write_cache(level, blocking_tuple[level], valid)
    else:
        valid = value_in_cache
    if not valid:
        return False

    for level in xrange(1, num_levels):
        if not cost_model.valid_blocking_size_current_level(
                resource, dummy_mapping_point, layer, level):
            return False
    return True
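

# A minimal dict-backed sketch of the cache consumed above. The
# read_cache/write_cache signatures are inferred from the calls in
# opt_valid_blocking, not taken from the project's actual implementation.
class BlockingCache(object):
    def __init__(self):
        self._table = {}

    def read_cache(self, level, blocking_tuple):
        # returns the cached validity, or None on a miss
        return self._table.get((level, blocking_tuple))

    def write_cache(self, level, blocking_tuple, valid):
        self._table[(level, blocking_tuple)] = valid
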
def opt_mapping_point_generator_function(resource,
                                         layer,
                                         schedule=None,
                                         verbose=False):
    '''
    Mapping point generator.

    Generates a new mapping point each iteration.
    '''
    num_levels = resource.buffer_levels()
    blocking_partitioning_generator = \
        blocking_partitioning_generator_function(resource, layer, schedule)

    # dummy_partitioning = [(1,) * num_levels] * le.NUM

    smallest_cost = float("inf")
    best_mapping_point = None
    for blocking_partitioning in blocking_partitioning_generator:
        '''
        dummy_mapping_point is used to validate the current blocking_partitioning
        and abandon the ones that exceed the buffer size at any level. Since this
        validation does not depend on loop_orders, we perform it at this early
        stage to avoid generating all the loop orders for an invalid
        blocking_partitioning.
        '''
        if verbose >= 2:
            print "Find best order for schedule: ", blocking_partitioning
        [blocking, partitioning, para_dim] = blocking_partitioning
        dummy_mapping_point = MappingPoint(None, blocking, partitioning,
                                           para_dim)
        # print "blocking_partitioning: ", blocking_partitioning
        cost, loop_order = opt_get_best_loop_order(resource, layer,
                                                   dummy_mapping_point,
                                                   verbose)
        if cost < smallest_cost:
            smallest_cost = cost
            best_mapping_point = MappingPoint(loop_order, blocking,
                                              partitioning, para_dim)
            if verbose >= 2:
                print "best loop order: ", best_mapping_point.loop_orders
                print "Update smallest cost: ", smallest_cost
                print "Update best schedule: ", utils.print_loop_nest(
                    best_mapping_point)
    assert best_mapping_point, "No valid mapping point found."
    return smallest_cost, best_mapping_point


def opt_mapping_point_generator_function(resource, layer, schedule=None, verbose=False):
    '''
    Mapping point generator.

    Generates a new mapping point each iteration.
    '''
    num_levels = resource.buffer_levels()
    parallel_levels = resource.para_index 
    ideal_perf = cost_model.get_ideal_performance(layer, resource)

    blocking_partitioning_generator = \
        blocking_partitioning_generator_function(resource, layer, schedule)

    smallest_cost = float("inf")
    best_mapping_point = None 
    for blocking_partitioning in blocking_partitioning_generator:
        '''
        dummy_mapping_point is used to validate the current blocking_partitioning
        and abandon the ones that exceed the buffer size at any level. Since this
        validation does not depend on loop_orders, we perform it at this early
        stage to avoid generating all the loop orders for an invalid
        blocking_partitioning.
        '''
        if verbose >= 3:
            print "Find best order for schedule: ", blocking_partitioning
        [blocking, partitioning, para_dim] = blocking_partitioning
        dummy_mapping_point = MappingPoint(None, blocking, partitioning, para_dim)
        cost, loop_order = opt_get_best_loop_order(resource, layer, dummy_mapping_point, verbose)
        if cost < smallest_cost:
            smallest_cost = cost
            best_mapping_point = MappingPoint(loop_order, blocking, partitioning, para_dim)
            unrolled_loops, utilized = partitioned_loop_string(partitioning, parallel_levels, para_dim)
            utilization = get_utilization(utilized, resource)
            perf = ideal_perf / utilization
            if verbose >= 3:
                print "best loop order: ", best_mapping_point.loop_orders
            if verbose >= 2:
                print "runtime (cycles): ", perf, "utilization: ", utilization
                print "Update smallest cost (pJ): ", smallest_cost
                print "Update best shedule: ", utils.print_loop_nest(best_mapping_point)
    assert best_mapping_point, "No valid mapping point found."
    return smallest_cost, perf, best_mapping_point
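

# The runtime estimate above is just ideal cycles scaled by utilization; a
# made-up numeric example: at 25% PE utilization, a layer that needs 1e6
# ideal cycles is estimated at 4e6 cycles.
ideal_perf = 1.0e6   # hypothetical ideal cycle count at full utilization
utilization = 0.25   # hypothetical fraction of PEs doing useful work
perf = ideal_perf / utilization
assert perf == 4.0e6
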
def opt_get_best_loop_order(resource, layer, point, verbose=False):
    '''
    When there is no partitioning, the cost of the current level depends only on
    the current level's loop order, given the blocking factors. We leverage this
    to find the best loop order for each level individually.
    '''
    num_levels = resource.buffer_levels()
    best_loop_order = []
    blocking = point.loop_blockings
    partitioning = point.loop_partitionings
    para_dim = point.para_loop_dim

    non_empty_loops = get_non_empty_loops(point, num_levels)

    best_cost = 0
    para_level = 0
    for level in xrange(num_levels):
        smallest_cost = float("inf") 
        for curr_level_order in level_order_generator_function(point, le.NUM, non_empty_loops, level):
            dummy_loop_order = [[0] * le.NUM] * num_levels
            # the assignment below replaces the aliased row, so the list
            # replication above is safe (rows are never mutated in place)
            dummy_loop_order[level] = curr_level_order
            mapping_point = MappingPoint(zip(*dummy_loop_order), blocking, partitioning, para_dim)        
            if level <= 0 or resource.paras[level-1].count <= 1 \
                or resource.paras[level-1].access_mode < 1:
                curr_cost = cost_model.get_level_cost(resource, mapping_point, layer, level, verbose)
            else:
                curr_cost = cost_model.get_array_and_curr_level_cost(resource, mapping_point, layer, level, verbose) 
            if curr_cost < smallest_cost:
                best_curr_level_order = curr_level_order 
                smallest_cost = curr_cost
            if resource.mac_capacity == 0 and level == 0:
                break
        best_loop_order.append(best_curr_level_order)
        best_cost += smallest_cost

    return best_cost, zip(*best_loop_order)
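

# Shape note for the return value above, with toy values: best_loop_order is
# indexed [level][loop], and zip(*...) transposes it to [loop][level], which
# appears to be the loop_orders layout MappingPoint expects.
best_loop_order = [[0, 1, 2], [2, 0, 1]]  # 2 levels x 3 loops (toy sizes)
loop_orders = zip(*best_loop_order)       # [(0, 2), (1, 0), (2, 1)], per loop
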
def opt_get_best_loop_order(resource, layer, point, verbose=False):
    '''
    [HW template right now: systolic array]
    [SRAM only talks to the PEs on the edge; most PEs get data from a
    neighbouring PE]

    When there is no partitioning (parallelism), the cost of the current level
    depends only on the current level's loop order, given the blocking factors.
    We leverage this to find the best loop order for each level individually.

    When there is partitioning (parallelism), the number of times the
    parallelized memory level passes data to neighbouring PEs (corresponding to
    the energy spent on the interconnect, array_cost) depends on the current
    level's parallelism size and on the memory accesses from the level above.

    The lowest-level memory accesses (to the MACs) depend only on the NN layer
    size.

    level access: number of elements accessed, as [input, weight, output]
    level order: [fx, fy, ox, oy, oc, ic, on]; '0' marks the innermost loop,
    '6' the outermost (or a nonexistent) loop.
    '''
    num_levels = resource.buffer_levels()
    best_loop_order = []
    blocking = point.loop_blockings
    partitioning = point.loop_partitionings
    para_dim = point.para_loop_dim

    non_empty_loops = get_non_empty_loops(point, num_levels)
    # print blocking, partitioning

    best_cost = 0
    para_level = 0
    for level in xrange(num_levels):
        smallest_cost = float("inf")
        # LMEI: later, the exhaustive order search might be sped up by
        #  identifying symmetric terms, e.g. ox and oy, or fx and fy, to prune
        #  equivalent orders.

        for curr_level_order in level_order_generator_function(
                point, le.NUM, non_empty_loops, level):
            dummy_loop_order = [[0] * le.NUM] * num_levels
            dummy_loop_order[level] = curr_level_order
            mapping_point = MappingPoint(zip(*dummy_loop_order), blocking,
                                         partitioning, para_dim)
            if level <= 0 or resource.paras[level - 1].count <= 1 \
                    or resource.paras[level - 1].access_mode < 1:
                curr_cost = cost_model.get_level_cost(resource, mapping_point,
                                                      layer, level, verbose)
            else:
                curr_cost = cost_model.get_array_and_curr_level_cost(
                    resource, mapping_point, layer, level, verbose)
            if curr_cost < smallest_cost:
                best_curr_level_order = curr_level_order
                smallest_cost = curr_cost

            if verbose >= 3:
                print "Level", level, "Current order:", curr_level_order, "     Best order:", best_curr_level_order
                print "Level", level, "Current energy:", '%20d' % curr_cost, "     Best energy:", '%20d' % smallest_cost
                print ""

            # LMEI: later, instead of using mac_capacity, we could use a 4-level
            #  memory model, treating mac_capacity as the innermost memory level
            #  for the output.
            # the author assumes the loop order at the innermost level doesn't
            # matter when mac_capacity == 0, so stop after the first order
            if resource.mac_capacity == 0 and level == 0:
                break

        best_loop_order.append(best_curr_level_order)
        best_cost += smallest_cost

    return best_cost, zip(*best_loop_order)


def blocking_partitioning_generator_function(resource,
                                             layer,
                                             schedule,
                                             verbose=False):
    '''
    loop_blocking_list and loop_partitioning_list generator.

    loop_blocking: [[Total size (temporal+spatial) of Fx @ mem level 0,1,2],[Fy],[OX],[OY],[OC],[IC],[ON]]
    loop_blocking_reshape: [(All loops' total size (temporal+spatial) @ mem level 0),(@ level 1),(@ level 2)]

    partition: [[All loops' spatial unrolled size @ mem level 0],[@ level 1],[@ level 2]]
    para_dim: [[Spatial unrolled loop dimensions @ mem level 0],[@ level 1],[@ level 2]]

    partitioned_loop_blocking_reshape: [[All loops' temporal unrolled size @ mem level 0],[@ level 1],[@ level 2]]
    blocking_list: [[Temporal unrolled size of Fx @ mem level 0,1,2],[Fy],[OX],[OY],[OC],[IC],[ON]]
    partitioning_list: [[Spatial unrolled size of Fx @ mem level 0,1,2],[Fy],[OX],[OY],[OC],[IC],[ON]]
    '''

    num_level = resource.buffer_levels()
    blocking_generator = blocking_generator_function(resource, layer, schedule,
                                                     verbose)

    for loop_blocking in blocking_generator:
        if verbose == 3:
            print "loop_tilling: ", loop_blocking

        loop_blocking_reshape = zip(*loop_blocking)
        pb_generator = parallel_blocking_generator_function(
            loop_blocking_reshape, resource, layer, schedule)

        for pi in pb_generator:
            partition, para_dim = pi
            partitioned_loop_blocking_reshape = []
            for level in xrange(num_level):
                partitioned_loop_blocking_reshape.append([
                    (x + y - 1) // y for x, y in zip(
                        loop_blocking_reshape[level], partition[level])
                ])  # TODO check if using two maps with floordiv is faster
            blocking_list = zip(*partitioned_loop_blocking_reshape)
            partitioning_list = zip(*partition)

            if verbose == 3:
                print "loop_blocking: ", blocking_list
                print "loop_partition: ", partitioning_list
                print "para_dimension: ", para_dim

            dummy_mapping_point = MappingPoint(None, blocking_list,
                                               partitioning_list, para_dim)
            if cost_model.valid_partitioning(resource, dummy_mapping_point,
                                             layer, verbose):
                # if cost_model.valid_mapping_point(resource, dummy_mapping_point, layer, verbose):
                if verbose == 3:
                    print "Valid"
                    print ""
                yield [blocking_list, partitioning_list, para_dim]
            else:
                if verbose == 3:
                    print "invalid"
                    print ""