def setUp(self):
    self.alex_net = import_network('alex_net')
    self.vgg_net = import_network('vgg_net')

    self.map_strategy = MapStrategyEyeriss

    self.resource = Resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                               type=NodeRegion.PROC),
        data_regions=(NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                 type=NodeRegion.DATA),),
        dim_array=PhyDim2(16, 16),
        size_gbuf=128 * 1024 // 2,  # 128 kB
        size_regf=512 // 2,  # 512 B
    )

    self.cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1),
                     noc_hop=0, unit_static=0)

    self.options = Option()
def test_access_cost_same_lbs(self):
    ''' get_access_cost same lbs. '''
    lbs = self._lbs(self._make_bl_ts((0, 1, 1), (1, 0, 1), (1, 1, 0)),
                    rsrckey='LG')
    self.assertTrue(lbs.is_valid())

    c1 = lbs.get_access_cost(Cost(mac_op=1, mem_hier=(200, 6, 2, 1),
                                  noc_hop=50, idl_unit=50))
    c2 = lbs.get_access_cost(Cost(mac_op=-1, mem_hier=(-200, -6, -2, -1),
                                  noc_hop=-50, idl_unit=-50))
    self.assertAlmostEqual(c1, -c2)
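# The check above relies on get_access_cost being linear in the unit costs:
# with the same loop blocking scheme the access counts are fixed, so negating
# every unit cost should exactly negate the total. This is an inference from
# what the test asserts, not a documented contract of the scheme class.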
def test_mem_hier_at_error(self):
    ''' Accessor mem_hier error. '''
    cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1), noc_hop=10,
                unit_static=0)
    self.assertIsNone(cost.mem_hier_at(me.NUM))
    self.assertIsNone(cost.mem_hier_at(None))
def test_mem_hier_at(self):
    ''' Accessor mem_hier. '''
    cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1), noc_hop=10,
                unit_static=0)
    self.assertEqual(cost.mem_hier_at(me.DRAM), 200, 'mem_hier: DRAM')
    self.assertEqual(cost.mem_hier_at(me.GBUF), 6, 'mem_hier: GBUF')
    self.assertEqual(cost.mem_hier_at(me.ITCN), 2, 'mem_hier: ITCN')
    self.assertEqual(cost.mem_hier_at(me.REGF), 1, 'mem_hier: REGF')
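# A minimal, self-contained sketch of the accessor behavior the two tests
# above assume: mem_hier is indexed by the memory hierarchy enum (DRAM, GBUF,
# ITCN, REGF) and an out-of-range or non-integer index yields None. The real
# Cost class may differ; _CostDemo and _CostSketch are illustrative names.
import collections

_CostSketch = collections.namedtuple(
    '_CostSketch', ['mac_op', 'mem_hier', 'noc_hop', 'unit_static'])

class _CostDemo(_CostSketch):
    __slots__ = ()

    def mem_hier_at(self, mhe):
        ''' Unit access cost at hierarchy level mhe, or None if invalid. '''
        try:
            return self.mem_hier[mhe]
        except (IndexError, TypeError):
            return None

# _CostDemo(1, (200, 6, 2, 1), 10, 0).mem_hier_at(0) == 200
# _CostDemo(1, (200, 6, 2, 1), 10, 0).mem_hier_at(None) is None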
def setUp(self):
    self.alex_net = import_network('alex_net')
    self.vgg_net = import_network('vgg_net')

    net = Network('simple')
    net.set_input_layer(InputLayer(4, 2))
    net.add('1', ConvLayer(4, 4, 2, 1))
    net.add('2', ConvLayer(4, 4, 2, 1))
    # Two more layers to avoid single-segment case.
    net.add('a1', ConvLayer(4, 1, 1, 1, strd=2))
    net.add('a2', ConvLayer(1, 1, 1, 1))
    self.simple_net = net

    net = Network('complex')
    net.set_input_layer(InputLayer(8, 8))
    net.add('1', ConvLayer(8, 8, 8, 1))
    net.add('2a', ConvLayer(8, 8, 8, 1), prevs=('1',))
    net.add('3a', ConvLayer(8, 8, 8, 1))
    net.add('2b', ConvLayer(8, 8, 8, 1), prevs=('1',))
    net.add('3b', ConvLayer(8, 8, 8, 1))
    net.add('4', ConvLayer(16, 8, 8, 1), prevs=('3a', '3b'))
    self.complex_net = net

    self.map_strategy = MapStrategyEyeriss

    self.resource = Resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                               type=NodeRegion.PROC),
        dram_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                               type=NodeRegion.DRAM),
        src_data_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                   type=NodeRegion.DRAM),
        dst_data_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                   type=NodeRegion.DRAM),
        dim_array=PhyDim2(16, 16),
        size_gbuf=128 * 1024 // 2,  # 128 kB
        size_regf=512 // 2,  # 512 B
        array_bus_width=float('inf'),
        dram_bandwidth=float('inf'),
        no_time_mux=False,
    )

    self.cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1),
                     noc_hop=0, idl_unit=0)

    self.options = Option()
def test_invalid_mem_hier_type(self):
    ''' Invalid mem_hier type. '''
    with self.assertRaisesRegexp(TypeError, 'Cost: .*mem_hier.*'):
        _ = Cost(mac_op=1, mem_hier=200, noc_hop=10, unit_static=0)
    with self.assertRaisesRegexp(TypeError, 'Cost: .*mem_hier.*'):
        _ = Cost(mac_op=1, mem_hier=[200, 6, 2, 1], noc_hop=10,
                 unit_static=0)
def setUp(self):
    self.layers = {}
    self.layers['BASE'] = ConvLayer(8, 16, 28, 3)
    self.layers['POOL'] = PoolingLayer(16, 28, 2)
    self.layers['LR'] = LocalRegionLayer(16, 28, nreg=3, sreg=1)

    self.batch_size = 4

    self.cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1),
                     noc_hop=50, unit_static=50)

    self.resource = Resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                               type=NodeRegion.PROC),
        data_regions=(NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 1),
                                 type=NodeRegion.DATA),),
        dim_array=PhyDim2(16, 16),
        size_gbuf=65536,
        size_regf=64)

    self.options = Option(partition_hybrid=True,
                          partition_batch=True,
                          partition_ifmaps=True,
                          ntops=10)

    self.ifmap_layouts = {}
    part = PartitionScheme(order=(pe.INPP, pe.BATP, pe.OUTP, pe.OFMP),
                           pdims=((1, 2), (2, 1), (1, 2), (2, 1)))
    for wlkey in self.layers:
        self.ifmap_layouts[wlkey] = partition.get_ofmap_layout(
            self.layers[wlkey].input_layer(), self.batch_size, part,
            self.resource.src_data_region())
def setUp(self):
    self.layers = {}
    self.layers['BASE'] = ConvLayer(8, 16, 28, 3)
    self.layers['POOL'] = PoolingLayer(16, 28, 2)
    self.layers['LR'] = LocalRegionLayer(16, 28, nreg=3, sreg=1)

    self.batch_size = 4

    self.cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1),
                     noc_hop=50, idl_unit=50)

    self.none_cstr = SchedulingConstraint()
    self.cstr = SchedulingConstraint(topofm=1, topbat=self.batch_size)

    self.resource = Resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                               type=NodeRegion.PROC),
        dram_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 1),
                               type=NodeRegion.DRAM),
        src_data_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 1),
                                   type=NodeRegion.DRAM),
        dst_data_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 1),
                                   type=NodeRegion.DRAM),
        dim_array=PhyDim2(16, 16),
        size_gbuf=65536,
        size_regf=64,
        array_bus_width=float('inf'),
        dram_bandwidth=float('inf'),
        no_time_mux=False)

    self.options = Option(partition_hybrid=True,
                          partition_batch=True,
                          partition_ifmaps=True,
                          ntops=10)

    self.ifmap_layouts = {}
    part = PartitionScheme(order=(pe.INPP, pe.BATP, pe.OUTP, pe.OFMP),
                           pdims=((1, 2), (2, 1), (1, 2), (2, 1)))
    for wlkey in self.layers:
        input_layer = self.layers[wlkey].input_layer()
        self.ifmap_layouts[wlkey] = DataLayout(
            frngs=(FmapRange((0, 0, 0, 0),
                             FmapPosition(b=self.batch_size,
                                          n=input_layer.nofm,
                                          h=input_layer.hofm,
                                          w=input_layer.wofm)),),
            regions=(self.resource.src_data_region,),
            parts=(part.projection(self.resource.src_data_region,
                                   appl2frng=True),))

    self.sched_seq = (2, 0, 1)
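# Sanity check on the partition scheme used in both setUps above
# (illustrative arithmetic only): the per-type factors of pdims multiply out
# to the PROC region shape. h: 1 * 2 * 1 * 2 = 4 and w: 2 * 1 * 2 * 1 = 4,
# matching the PhyDim2(4, 4) processing region.
def _pdims_product(pdims):
    ''' Total (h, w) node usage of a tuple of per-type partition dims. '''
    h = w = 1
    for ph, pw in pdims:
        h *= ph
        w *= pw
    return h, w

assert _pdims_product(((1, 2), (2, 1), (1, 2), (2, 1))) == (4, 4)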
def test_invalid_idl_unit(self):
    ''' Invalid idl_unit. '''
    with self.assertRaisesRegex(TypeError, 'Cost: .*idl_unit.*'):
        _ = Cost(mac_op=1, mem_hier=(200, 6, 2, 1), noc_hop=10,
                 idl_unit=set([1, 2]))
def test_invalid_mac_op(self):
    ''' Invalid mac_op. '''
    with self.assertRaisesRegex(TypeError, 'Cost: .*mac_op.*'):
        _ = Cost(mac_op=(1, 2), mem_hier=(200, 6, 2, 1), noc_hop=10,
                 idl_unit=0)
def test_invalid_noc_hop(self):
    ''' Invalid noc_hop. '''
    with self.assertRaisesRegexp(TypeError, 'Cost: .*noc_hop.*'):
        _ = Cost(mac_op=1, mem_hier=(200, 6, 2, 1), noc_hop=[10, 10],
                 unit_static=0)
def test_invalid_unit_static(self):
    ''' Invalid unit_static. '''
    with self.assertRaisesRegexp(TypeError, 'Cost: .*unit_static.*'):
        _ = Cost(mac_op=1, mem_hier=(200, 6, 2, 1), noc_hop=10,
                 unit_static=set([1, 2]))
def test_invalid_mem_hier_len(self):
    ''' Invalid mem_hier len. '''
    with self.assertRaisesRegexp(ValueError, 'Cost: .*mem_hier.*'):
        _ = Cost(mac_op=1, mem_hier=(200, 6), noc_hop=10, unit_static=0)
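# A minimal sketch of the constructor validation the tests above exercise,
# assuming Cost overrides namedtuple.__new__; the real class may differ.
# Scalar fields must be numbers (TypeError otherwise); mem_hier must be a
# tuple (TypeError) with exactly one entry per hierarchy level (ValueError).
import collections
import numbers

_NUM_MEM_HIER = 4  # DRAM, GBUF, ITCN, REGF.

class _CostCheck(collections.namedtuple(
        '_CostCheck', ['mac_op', 'mem_hier', 'noc_hop', 'unit_static'])):
    __slots__ = ()

    def __new__(cls, *args, **kwargs):
        ntp = super(_CostCheck, cls).__new__(cls, *args, **kwargs)
        for name in ('mac_op', 'noc_hop', 'unit_static'):
            if not isinstance(getattr(ntp, name), numbers.Number):
                raise TypeError('Cost: {} must be a number.'.format(name))
        if not isinstance(ntp.mem_hier, tuple):
            raise TypeError('Cost: mem_hier must be a tuple.')
        if len(ntp.mem_hier) != _NUM_MEM_HIER:
            raise ValueError('Cost: mem_hier must have {} levels.'
                             .format(_NUM_MEM_HIER))
        return ntp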
class TestMLP_network():

    def __init__(self, mlp_network):
        self.net = mlp_network  # MLP_network(18, 32, 64, 32, 2)

        self.map_strategy = MapStrategyEyeriss

        self.resource = Resource(
            proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                   type=NodeRegion.PROC),
            data_regions=(NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                     type=NodeRegion.DATA),),
            dim_array=PhyDim2(16, 16),
            size_gbuf=128 * 1024 // 2,  # 128 kB
            size_regf=512 // 2,  # 512 B
        )

        self.cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1),
                         noc_hop=0, unit_static=0)

        self.options = Option()

    def test_eyeriss_isca16(self):
        network = self.net
        batch_size = 16

        nnd = NNDataflow(network, batch_size, self.resource, self.cost,
                         self.map_strategy)
        tops, cache_stats = nnd.schedule_search(self.options)
        if not tops:
            sys.stderr.write("No valid dataflow found!\n")
            return None
        dfsch = tops[0]

        ## Write results.
        res_map = OrderedDict()
        res_map['net'] = "MLP_L"
        res_map['batch'] = batch_size
        res_map['resource'] = self.resource._asdict()
        res_map['cost'] = self.cost._asdict()
        res_map['options'] = self.options._asdict()
        res_map['cache_stats'] = cache_stats
        stats = stats_dict(dfsch, self.cost)
        for key, val in stats.items():
            res_map[key] = val
        return res_map
def test_valid_args(self):
    ''' Valid arguments. '''
    cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1), noc_hop=10, idl_unit=0)
    self.assertEqual(cost.mac_op, 1, 'mac_op')
    self.assertEqual(cost.mem_hier, (200, 6, 2, 1), 'mem_hier')
    self.assertEqual(cost.noc_hop, 10, 'noc_hop')
    self.assertEqual(cost.idl_unit, 0, 'idl_unit')
def test_valid_args(self):
    ''' Valid arguments. '''
    cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1), noc_hop=10,
                unit_static=0)
    self.assertEqual(cost.mac_op, 1, 'mac_op')
    self.assertEqual(cost.mem_hier, (200, 6, 2, 1), 'mem_hier')
    self.assertEqual(cost.noc_hop, 10, 'noc_hop')
    self.assertEqual(cost.unit_static, 0, 'unit_static')
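# How these unit costs combine into a total energy, mirroring the arithmetic
# used by the Eyeriss reproduction tests below. A sketch only: the real
# accounting lives in the scheduling results, not in Cost itself, and the
# getattr fallback is just to cover both API generations of the field name.
def _total_energy(cost, ops, accesses, hops, idle_units):
    ''' ops * mac_op + per-level accesses * mem_hier + hops * noc_hop
        + idle_units * static/idle unit cost. '''
    static_unit = getattr(cost, 'idl_unit', getattr(cost, 'unit_static', 0))
    return (ops * cost.mac_op
            + sum(a * c for a, c in zip(accesses, cost.mem_hier))
            + hops * cost.noc_hop
            + idle_units * static_unit)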
def __init__(self):
    self.alex_net = import_network('alex_net')
    self.mock_net = import_network('mock_net')

    self.map_strategy = MapStrategyEyeriss

    value_mult = {}
    value_control = 1
    my_weights = {}
    self.cost = Cost(value_control=value_control,
                     value_mult=value_mult,
                     adder_cost=1,
                     mac_op=1,
                     mem_hier=(200, 6, 2, 1),
                     noc_hop=0,
                     idl_unit=0,
                     my_weights=my_weights,
                     mem_cycles=(200, 6, 2, 1))

    self.options = Option()
def __init__(self):
    self.alex_net = import_network('alex_net')

    self.map_strategy = MapStrategyEyeriss

    value_mult = {}
    value_control = {}
    my_weights = {}
    self.cost = Cost(value_control=value_control,
                     value_mult=value_mult,
                     mac_op=1,
                     mem_hier=(200, 6, 2, 1),
                     noc_hop=0,
                     idl_unit=0,
                     my_weights=my_weights)

    self.options = Option()

    print('mapping is : {}'.format(self.map_strategy))
    print('cost is: {}'.format(self.cost))
    print('options are: {}'.format(self.options))
class TestNNDataflow(unittest.TestCase):
    ''' Tests for NNDataflow module. '''

    def setUp(self):
        self.alex_net = import_network('alex_net')
        self.vgg_net = import_network('vgg_net')

        self.map_strategy = MapStrategyEyeriss

        self.resource = Resource(
            proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                   type=NodeRegion.PROC),
            data_regions=(NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                     type=NodeRegion.DATA),),
            dim_array=PhyDim2(16, 16),
            size_gbuf=128 * 1024 // 2,  # 128 kB
            size_regf=512 // 2,  # 512 B
        )

        self.cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1),
                         noc_hop=0, unit_static=0)

        self.options = Option()

    def test_invalid_network(self):
        ''' Invalid network argument. '''
        with self.assertRaisesRegexp(TypeError, 'NNDataflow: .*network.*'):
            _ = NNDataflow(self.alex_net.input_layer(), 4,
                           self.resource, self.cost, self.map_strategy)

    def test_invalid_resource(self):
        ''' Invalid resource argument. '''
        with self.assertRaisesRegexp(TypeError, 'NNDataflow: .*resource.*'):
            _ = NNDataflow(self.alex_net, 4,
                           self.resource.proc_region,
                           self.cost, self.map_strategy)

    def test_invalid_cost(self):
        ''' Invalid cost argument. '''
        with self.assertRaisesRegexp(TypeError, 'NNDataflow: .*cost.*'):
            _ = NNDataflow(self.alex_net, 4, self.resource,
                           self.cost._asdict(), self.map_strategy)

    def test_invalid_map_strategy(self):
        ''' Invalid map_strategy argument. '''
        class _DummyClass(object):  # pylint: disable=too-few-public-methods
            pass

        with self.assertRaisesRegexp(TypeError,
                                     'NNDataflow: .*map_strategy.*'):
            _ = NNDataflow(self.alex_net, 4, self.resource, self.cost,
                           _DummyClass)

    def test_verbose(self):
        ''' Verbose mode. '''
        network = self.alex_net
        batch_size = 16

        options = Option(sw_gbuf_bypass=(True, True, True),
                         sw_solve_loopblocking=True,
                         verbose=True)

        nnd = NNDataflow(network, batch_size, self.resource, self.cost,
                         self.map_strategy)

        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = stdout = StringIO.StringIO()
        sys.stderr = stderr = StringIO.StringIO()

        tops, _ = nnd.schedule_search(options)

        sys.stdout = old_stdout
        sys.stderr = old_stderr
        stdout_value = stdout.getvalue()
        stderr_value = stderr.getvalue()
        stdout.close()
        stderr.close()

        self.assertTrue(tops)
        self.assertFalse(stdout_value)
        for layer in network:
            self.assertIn(layer, stderr_value)

    def test_no_valid_dataflow(self):
        ''' No valid dataflow is found. '''
        # Very small REGF.
        self.resource = Resource(
            proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                   type=NodeRegion.PROC),
            data_regions=(NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                     type=NodeRegion.DATA),),
            dim_array=PhyDim2(16, 16),
            size_gbuf=128 * 1024 // 2,  # 128 kB
            size_regf=2,
        )

        nnd = NNDataflow(self.alex_net, 4, self.resource, self.cost,
                         self.map_strategy)
        tops, _ = nnd.schedule_search(self.options)
        self.assertFalse(tops)

    def test_scheduling_failure(self):
        ''' Layer scheduling failure. '''
        network = self.alex_net
        batch_size = 16

        nnd = NNDataflow(network, batch_size, self.resource, self.cost,
                         MapStrategy)

        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = stdout = StringIO.StringIO()
        sys.stderr = stderr = StringIO.StringIO()

        with self.assertRaises(NotImplementedError):
            _ = nnd.schedule_search(self.options)

        sys.stdout = old_stdout
        sys.stderr = old_stderr
        stdout_value = stdout.getvalue()
        stderr_value = stderr.getvalue()
        stdout.close()
        stderr.close()

        self.assertFalse(stdout_value)
        self.assertIn('Failed', stderr_value)

    def test_eyeriss_isca16(self):
        ''' Reproduce Eyeriss ISCA'16 paper Fig. 10. '''
        network = self.alex_net
        batch_size = 16

        nnd = NNDataflow(network, batch_size, self.resource, self.cost,
                         self.map_strategy)
        tops, _ = nnd.schedule_search(self.options)
        self.assertTrue(tops)
        dfsch = tops[0]

        ## Check results.

        # Results as cost for each component:
        header = 'ALU, DRAM, Buffer, Array, RF'
        cost_bkdn = {}
        for layer in ['conv{}'.format(i) for i in range(1, 6)] \
                + ['fc{}'.format(i) for i in range(1, 4)]:
            op_cost = 0
            access_cost = [0] * me.NUM
            for layer_part in network:
                if not layer_part or not layer_part.startswith(layer):
                    continue
                sr = dfsch[layer_part]
                op_cost += sr.total_ops * self.cost.mac_op
                access_cost = [ac + a * c for ac, a, c
                               in zip(access_cost, sr.total_accesses,
                                      self.cost.mem_hier)]
            cost_bkdn[layer] = []
            # To 1e9.
            cost_bkdn[layer].append(op_cost / 1e9)
            cost_bkdn[layer].append(access_cost[me.DRAM] / 1e9)
            cost_bkdn[layer].append(access_cost[me.GBUF] / 1e9)
            cost_bkdn[layer].append(access_cost[me.ITCN] / 1e9)
            cost_bkdn[layer].append(access_cost[me.REGF] / 1e9)

        # Check the major parts: ALU, DRAM, RF.
        major_cost_bkdn_ref = {'conv1': [1.69, 2.46, 6.75],
                               'conv2': [3.58, 2.27, 14.33],
                               'conv3': [2.39, 2.02, 9.57],
                               'conv4': [1.79, 1.57, 7.18],
                               'conv5': [1.20, 1.05, 4.78],
                               'fc1': [0.60, 7.78, 2.42],
                               'fc2': [0.27, 3.39, 1.07],
                               'fc3': [0.07, 0.84, 0.26],
                              }
        for layer in cost_bkdn:
            success = all(abs(a - b) < 0.1 for a, b
                          in zip(cost_bkdn[layer][:2]
                                 + cost_bkdn[layer][-1:],
                                 major_cost_bkdn_ref[layer]))
            self.assertTrue(success,
                            'test_eyeriss_isca16: '
                            'ALU, DRAM, RF cost diff in layer {}.\n'
                            'header: {}\n'
                            'actual: {}\nref: {}'
                            .format(layer, header, cost_bkdn[layer],
                                    major_cost_bkdn_ref[layer]))

    def test_eyeriss_isscc16(self):
        '''
        Reproduce Eyeriss ISSCC'16 paper Fig. 14.5.6, JSSC'17 paper Table V.
        '''
        network = self.alex_net
        batch_size = 4

        resource = Resource(
            proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                   type=NodeRegion.PROC),
            data_regions=(NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                     type=NodeRegion.DATA),),
            dim_array=PhyDim2(12, 14),
            size_gbuf=108 * 1024 // 2,  # 108 kB
            size_regf=261,  # 225 + 12 + 24
        )
        cost = Cost(mac_op=2e-12,
                    mem_hier=(460e-12, 15e-12, 4e-12, 1e-12),  # pJ/16-b
                    noc_hop=0,
                    unit_static=30e-3 / 200e6)  # 30 mW GBUF + REGF

        nnd = NNDataflow(network, batch_size, resource, cost,
                         self.map_strategy)
        tops, _ = nnd.schedule_search(self.options)
        self.assertTrue(tops)
        dfsch = tops[0]

        ## Check results.

        # Results as stats of the rows in the table.
        header = 'Power, Processing Latency, Ops, Active PEs, Filter size'
        stats = {}
        for layer in ['conv{}'.format(i) for i in range(1, 6)]:
            onchip_cost = 0
            time = 0
            ops = 0
            fil_size = 0
            for layer_part in network:
                if not layer_part or not layer_part.startswith(layer):
                    continue
                sr = dfsch[layer_part]
                onchip_cost += sr.total_cost \
                        - sr.total_accesses[me.DRAM] * cost.mem_hier[me.DRAM]
                time += sr.total_time
                ops += sr.total_ops
                fil_size += network[layer_part].total_filter_size()
            power = onchip_cost / (time / 200e6) * 1e3  # mW
            active_pes = int(ops / time)
            stats[layer] = []
            stats[layer].append(power)
            stats[layer].append(time / 200.e3)  # cycles to ms
            stats[layer].append(ops / 1e6)  # to MOPs
            stats[layer].append(active_pes)
            stats[layer].append(fil_size / 1e3)  # to k

        # Check.
        stats_ref = {'conv1': [332, 16.5, 421.66, 151, 34.8],  # Act PE 154
                     'conv2': [288, 39.2, 895.79, 135, 307.2],
                     'conv3': [266, 21.8, 598.1, 156, 884.7],
                     'conv4': [235, 16.0, 448.6, 156, 663.6],
                     'conv5': [236, 10.0, 299.0, 156, 442.4],
                    }
        for layer in stats:
            success = (0.6 * stats_ref[layer][0]
                       < stats[layer][0] < stats_ref[layer][0]) \
                    and (0.8 * stats_ref[layer][1]
                         < stats[layer][1] < stats_ref[layer][1]) \
                    and all(abs(a - b) < 0.1 for a, b
                            in zip(stats[layer][2:], stats_ref[layer][2:]))
            self.assertTrue(success,
                            'test_eyeriss_isscc16: '
                            'stats diff in layer {}.\n'
                            'header: {}\n'
                            'actual: {}\nref: {}'
                            .format(layer, header, stats[layer],
                                    stats_ref[layer]))

    def test_eyeriss_asplos17(self):
        ''' Reproduce TETRIS ASPLOS'17 paper Figure 8. '''
        network = self.alex_net
        batch_size = 16

        ## L-1 configuration.
        resource = Resource(
            proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                   type=NodeRegion.PROC),
            data_regions=(NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                     type=NodeRegion.DATA),),
            dim_array=PhyDim2(16, 16),
            size_gbuf=576056 // 2,  # 576 kB
            size_regf=1024 // 2,  # 1 kB
        )
        cost = Cost(mac_op=2e-12,
                    mem_hier=(240e-12, 28e-12, 4e-12, 1e-12),  # pJ/16-b
                    noc_hop=0,
                    unit_static=320e-12)

        nnd = NNDataflow(network, batch_size, resource, cost,
                         self.map_strategy)
        tops, _ = nnd.schedule_search(self.options)
        self.assertTrue(tops)
        dfsch_l1 = tops[0]

        ## T-16 configuration.
        resource = Resource(
            proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                                   type=NodeRegion.PROC),
            data_regions=(NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                                     type=NodeRegion.DATA),),
            dim_array=PhyDim2(14, 14),
            size_gbuf=133032 // 2,  # 133 kB
            size_regf=512 // 2,  # 512 B
        )
        cost = Cost(mac_op=2e-12,
                    mem_hier=(80e-12, 14e-12, 4e-12, 0.6e-12),  # pJ/16-b
                    noc_hop=40e-12,
                    unit_static=200e-12)
        options = Option(sw_gbuf_bypass=(True, True, True),
                         sw_solve_loopblocking=True,
                         partition_hybrid=True)

        nnd = NNDataflow(network, batch_size, resource, cost,
                         self.map_strategy)
        tops, _ = nnd.schedule_search(options)
        self.assertTrue(tops)
        dfsch_t16 = tops[0]

        ## Check results.

        # Same workload.
        self.assertAlmostEqual(dfsch_t16.total_ops, dfsch_l1.total_ops)

        # Performance of T-16 is proportional to PE resource (20% margin).
        self.assertLess(dfsch_t16.total_time,
                        1.2 * dfsch_l1.total_time
                        * (16 * 16) / (14 * 14 * 16))

        # Energy reduced by > 30%.
        self.assertLess(dfsch_t16.total_cost, dfsch_l1.total_cost * 0.7)
def setUp(self):
    # Workload.
    self.layer = {}
    self.layer['BASE'] = ConvLayer(12, 10, 28, 3)
    self.layer['LGFIL'] = ConvLayer(2, 4, 28, 20)
    self.layer['POOL'] = PoolingLayer(32, 28, 2)
    self.batch_size = 4

    # Resource.
    self.resource = {}
    dim_array = PhyDim2(16, 16)
    proc_region = NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                             type=NodeRegion.PROC)
    data_regions = (NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                               type=NodeRegion.DATA),)
    # Typical resource.
    self.resource['BASE'] = Resource(proc_region=proc_region,
                                     data_regions=data_regions,
                                     dim_array=dim_array,
                                     size_gbuf=65536, size_regf=64)
    # Larger resource with sufficient capacity, to make all schemes valid.
    self.resource['LG'] = Resource(proc_region=proc_region,
                                   data_regions=data_regions,
                                   dim_array=dim_array,
                                   size_gbuf=1024**3, size_regf=1024**3)
    # Small resource.
    self.resource['SM'] = Resource(proc_region=proc_region,
                                   data_regions=data_regions,
                                   dim_array=dim_array,
                                   size_gbuf=4096, size_regf=16)

    # Nested loop description after mapping.
    self.nld = {}
    self.nld['BASE'] = next(MapStrategyEyeriss(
        self.layer['BASE'], self.batch_size,
        dim_array).gen_nested_loop_desc())
    self.nld['LGFIL'] = next(MapStrategyEyeriss(
        self.layer['LGFIL'], self.batch_size,
        dim_array).gen_nested_loop_desc())
    self.nld['POOL'] = next(MapStrategyEyeriss(
        self.layer['POOL'], self.batch_size,
        dim_array).gen_nested_loop_desc())
    # Fake nested loop, with zero filter size.
    self.nld['ZERO_FIL'] = NestedLoopDesc(
        loopcnt=(12, 10, 4),
        usize_gbuf=(0, 1000, 800),
        usize_regf=(0, 3, 1),
        unit_access=((0, 1000, 800), (0, 1000, 800),
                     (3, 9, 7), (1, 1, 1)),
        data_loops=(DataDimLoops(le.IFM, le.OFM),
                    DataDimLoops(le.IFM, le.BAT),
                    DataDimLoops(le.OFM, le.BAT)),
        unit_ops=1, unit_time=1)
    # Fake nested loop, with zero ifmap size.
    self.nld['ZERO_IFM'] = NestedLoopDesc(
        loopcnt=(12, 10, 4),
        usize_gbuf=(9, 0, 800),
        usize_regf=(3, 0, 1),
        unit_access=((9, 0, 800), (9, 0, 800),
                     (3, 9, 7), (1, 1, 1)),
        data_loops=(DataDimLoops(le.IFM, le.OFM),
                    DataDimLoops(le.IFM, le.BAT),
                    DataDimLoops(le.OFM, le.BAT)),
        unit_ops=1, unit_time=1)

    # Options.
    self.options = {}
    # Basic.
    self.options['BASE'] = Option(ntops=2**30)
    # Multiprocessing.
    self.options['MP'] = Option(ntops=2**30, nprocesses=8)
    # Limited top schemes.
    self.options['NTOPS'] = Option(ntops=10)
    # Bypass.
    self.options['BYP'] = Option(sw_gbuf_bypass=(True,) * 3, ntops=2**30)
    # Bypass solver.
    self.options['BYPSOL'] = Option(sw_gbuf_bypass=(True,) * 3,
                                    sw_solve_loopblocking=True,
                                    ntops=2**30)

    # Cost.
    self.cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1),
                     noc_hop=50, unit_static=50)

    # Partition occupation.
    self.part_occ = 0.91
def test_eyeriss_asplos17(self):
    ''' Reproduce TETRIS ASPLOS'17 paper Figure 8. '''
    network = self.alex_net
    batch_size = 16

    ## L-1 configuration.
    resource = Resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                               type=NodeRegion.PROC),
        dram_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                               type=NodeRegion.DRAM),
        src_data_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                   type=NodeRegion.DRAM),
        dst_data_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                   type=NodeRegion.DRAM),
        dim_array=PhyDim2(16, 16),
        size_gbuf=576056 // 2,  # 576 kB
        size_regf=1024 // 2,  # 1 kB
        array_bus_width=float('inf'),
        dram_bandwidth=float('inf'),
        no_time_mux=False,
    )
    cost = Cost(mac_op=2e-12,
                mem_hier=(240e-12, 28e-12, 4e-12, 1e-12),  # pJ/16-b
                noc_hop=0,
                idl_unit=320e-12)

    nnd = NNDataflow(network, batch_size, resource, cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(self.options)
    self.assertTrue(tops)
    dfsch_l1 = tops[0]

    ## T-16 configuration.
    resource = Resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                               type=NodeRegion.PROC),
        dram_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                               type=NodeRegion.DRAM),
        src_data_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                                   type=NodeRegion.DRAM),
        dst_data_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                                   type=NodeRegion.DRAM),
        dim_array=PhyDim2(14, 14),
        size_gbuf=133032 // 2,  # 133 kB
        size_regf=512 // 2,  # 512 B
        array_bus_width=float('inf'),
        dram_bandwidth=float('inf'),
        no_time_mux=False,
    )
    cost = Cost(mac_op=2e-12,
                mem_hier=(80e-12, 14e-12, 4e-12, 0.6e-12),  # pJ/16-b
                noc_hop=40e-12,
                idl_unit=200e-12)
    options = Option(sw_gbuf_bypass=(True, True, True),
                     sw_solve_loopblocking=True,
                     partition_hybrid=True)

    nnd = NNDataflow(network, batch_size, resource, cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(options)
    self.assertTrue(tops)
    dfsch_t16 = tops[0]

    ## Check results.

    # Same workload.
    self.assertAlmostEqual(dfsch_t16.total_ops, dfsch_l1.total_ops)

    # Performance of T-16 is proportional to PE resource (20% margin).
    self.assertLess(dfsch_t16.total_time,
                    1.2 * dfsch_l1.total_time * (16 * 16) / (14 * 14 * 16))

    # Energy reduced by > 30%.
    # self.assertLess(dfsch_t16.total_cost, dfsch_l1.total_cost * 0.7)
    # With dimension restriction on partitioning, this is slightly
    # violated.
    self.assertLess(dfsch_t16.total_cost, dfsch_l1.total_cost * 0.72)
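# The performance bound above, worked out (illustrative arithmetic only):
# L-1 has one 16x16 array (256 PEs); T-16 has 16 nodes of 14x14 arrays
# (3136 PEs). Ideal scaling gives t16_time = l1_time * 256 / 3136, and the
# assertion allows a 20% margin on top of that ideal bound.
_L1_PES = 16 * 16
_T16_PES = 14 * 14 * 16
assert abs(_T16_PES / _L1_PES - 12.25) < 1e-9  # 12.25x more PEs.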
def test_eyeriss_isscc16(self):
    '''
    Reproduce Eyeriss ISSCC'16 paper Fig. 14.5.6, JSSC'17 paper Table V.
    '''
    network = self.alex_net
    batch_size = 4

    resource = Resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                               type=NodeRegion.PROC),
        dram_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                               type=NodeRegion.DRAM),
        src_data_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                   type=NodeRegion.DRAM),
        dst_data_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                   type=NodeRegion.DRAM),
        dim_array=PhyDim2(12, 14),
        size_gbuf=108 * 1024 // 2,  # 108 kB
        size_regf=261,  # 225 + 12 + 24
        array_bus_width=float('inf'),
        dram_bandwidth=float('inf'),
        no_time_mux=False,
    )
    cost = Cost(mac_op=2e-12,
                mem_hier=(460e-12, 15e-12, 4e-12, 1e-12),  # pJ/16-b
                noc_hop=0,
                idl_unit=30e-3 / 200e6)  # 30 mW GBUF + REGF

    nnd = NNDataflow(network, batch_size, resource, cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(self.options)
    self.assertTrue(tops)
    dfsch = tops[0]

    ## Check results.

    # Results as stats of the rows in the table.
    header = 'Power, Processing Latency, Ops, Active PEs, Filter size'
    stats = {}
    for layer in ['conv{}'.format(i) for i in range(1, 6)]:
        onchip_cost = 0
        time = 0
        ops = 0
        fil_size = 0
        for layer_part in network:
            if not layer_part or not layer_part.startswith(layer):
                continue
            sr = dfsch[layer_part]
            onchip_cost += sr.total_cost \
                    - sr.total_accesses[me.DRAM] * cost.mem_hier[me.DRAM]
            time += sr.total_time
            ops += sr.total_ops
            fil_size += network[layer_part].total_filter_size()
        power = onchip_cost / (time / 200e6) * 1e3  # mW
        active_pes = int(ops / time)
        stats[layer] = []
        stats[layer].append(power)
        stats[layer].append(time / 200.e3)  # cycles to ms
        stats[layer].append(ops / 1e6)  # to MOPs
        stats[layer].append(active_pes)
        stats[layer].append(fil_size / 1e3)  # to k

    # Check.
    stats_ref = {'conv1': [332, 16.5, 421.66, 151, 34.8],  # Act PE 154
                 'conv2': [288, 39.2, 895.79, 135, 307.2],
                 'conv3': [266, 21.8, 598.1, 156, 884.7],
                 'conv4': [235, 16.0, 448.6, 156, 663.6],
                 'conv5': [236, 10.0, 299.0, 156, 442.4],
                }
    for layer in stats:
        success = (0.6 * stats_ref[layer][0]
                   < stats[layer][0] < stats_ref[layer][0]) \
                and (0.8 * stats_ref[layer][1]
                     < stats[layer][1] < stats_ref[layer][1]) \
                and all(abs(a - b) < 0.1 for a, b
                        in zip(stats[layer][2:], stats_ref[layer][2:]))
        self.assertTrue(success,
                        'test_eyeriss_isscc16: '
                        'stats diff in layer {}.\n'
                        'header: {}\n'
                        'actual: {}\nref: {}'
                        .format(layer, header, stats[layer],
                                stats_ref[layer]))
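# A small, self-contained restatement of the power and utilization arithmetic
# used in test_eyeriss_isscc16 above, assuming the same 200 MHz clock as the
# test. Only the formulas are taken from the test; the function name and
# signature are illustrative.
def _power_and_active_pes(onchip_cost, time_cycles, ops, freq_hz=200e6):
    ''' Return (power in mW, average active PEs).

    onchip_cost is the total energy in J excluding DRAM accesses;
    time_cycles is the processing latency in cycles; ops is the total MAC
    op count.
    '''
    power_mw = onchip_cost / (time_cycles / freq_hz) * 1e3
    active_pes = int(ops / time_cycles)
    return power_mw, active_pes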
class TestNNDataflow(unittest.TestCase):
    ''' Tests for NNDataflow module. '''

    def setUp(self):
        self.alex_net = import_network('alex_net')
        self.vgg_net = import_network('vgg_net')

        net = Network('simple')
        net.set_input_layer(InputLayer(4, 2))
        net.add('1', ConvLayer(4, 4, 2, 1))
        net.add('2', ConvLayer(4, 4, 2, 1))
        # Two more layers to avoid single-segment case.
        net.add('a1', ConvLayer(4, 1, 1, 1, strd=2))
        net.add('a2', ConvLayer(1, 1, 1, 1))
        self.simple_net = net

        net = Network('complex')
        net.set_input_layer(InputLayer(8, 8))
        net.add('1', ConvLayer(8, 8, 8, 1))
        net.add('2a', ConvLayer(8, 8, 8, 1), prevs=('1',))
        net.add('3a', ConvLayer(8, 8, 8, 1))
        net.add('2b', ConvLayer(8, 8, 8, 1), prevs=('1',))
        net.add('3b', ConvLayer(8, 8, 8, 1))
        net.add('4', ConvLayer(16, 8, 8, 1), prevs=('3a', '3b'))
        self.complex_net = net

        self.map_strategy = MapStrategyEyeriss

        self.resource = Resource(
            proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                   type=NodeRegion.PROC),
            dram_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                   type=NodeRegion.DRAM),
            src_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                       dim=PhyDim2(1, 1),
                                       type=NodeRegion.DRAM),
            dst_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                       dim=PhyDim2(1, 1),
                                       type=NodeRegion.DRAM),
            dim_array=PhyDim2(16, 16),
            size_gbuf=128 * 1024 // 2,  # 128 kB
            size_regf=512 // 2,  # 512 B
            array_bus_width=float('inf'),
            dram_bandwidth=float('inf'),
            no_time_mux=False,
        )

        self.cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1),
                         noc_hop=0, idl_unit=0)

        self.options = Option()

    def test_invalid_network(self):
        ''' Invalid network argument. '''
        with self.assertRaisesRegex(TypeError, 'NNDataflow: .*network.*'):
            _ = NNDataflow(self.alex_net.input_layer(), 4,
                           self.resource, self.cost, self.map_strategy)

    def test_invalid_resource(self):
        ''' Invalid resource argument. '''
        with self.assertRaisesRegex(TypeError, 'NNDataflow: .*resource.*'):
            _ = NNDataflow(self.alex_net, 4,
                           self.resource.proc_region,
                           self.cost, self.map_strategy)

    def test_invalid_cost(self):
        ''' Invalid cost argument. '''
        with self.assertRaisesRegex(TypeError, 'NNDataflow: .*cost.*'):
            _ = NNDataflow(self.alex_net, 4, self.resource,
                           self.cost._asdict(), self.map_strategy)

    def test_invalid_map_strategy(self):
        ''' Invalid map_strategy argument. '''
        class _DummyClass():  # pylint: disable=too-few-public-methods
            pass

        with self.assertRaisesRegex(TypeError,
                                    'NNDataflow: .*map_strategy.*'):
            _ = NNDataflow(self.alex_net, 4, self.resource, self.cost,
                           _DummyClass)

    def test_verbose(self):
        ''' Verbose mode. '''
        network = self.alex_net
        batch_size = 16

        options = Option(sw_gbuf_bypass=(True, True, True),
                         sw_solve_loopblocking=True,
                         verbose=True)

        nnd = NNDataflow(network, batch_size, self.resource, self.cost,
                         self.map_strategy)

        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = stdout = StringIO()
        sys.stderr = stderr = StringIO()

        tops, _ = nnd.schedule_search(options)

        sys.stdout = old_stdout
        sys.stderr = old_stderr
        stdout_value = stdout.getvalue()
        stderr_value = stderr.getvalue()
        stdout.close()
        stderr.close()

        self.assertTrue(tops)
        self.assertFalse(stdout_value)
        for layer in network:
            self.assertIn(layer, stderr_value)

    def test_pipelining(self):
        ''' Pipelining. '''
        network = self.alex_net
        batch_size = 1

        options = Option(hw_gbuf_save_writeback=True,
                         partition_interlayer=True)
        nnd = NNDataflow(network, batch_size, self.resource, self.cost,
                         self.map_strategy)
        tops, _ = nnd.schedule_search(options)
        self.assertTrue(tops)

    def test_fast_forward_infeasible(self):
        ''' Enter fast forward due to infeasible constraint. '''
        network = self.simple_net
        batch_size = 1

        # Very small gbuf size. Small fmap tpart is infeasible.
        resource = self.resource._replace(dim_array=PhyDim2(2, 2),
                                          size_gbuf=16)

        options = Option(hw_gbuf_save_writeback=True,
                         partition_interlayer=True)
        nnd = NNDataflow(network, batch_size, resource, self.cost,
                         self.map_strategy)
        tops, _ = nnd.schedule_search(options)
        self.assertTrue(tops)

        # No pipelining is feasible.
        for dtfl in tops:
            self.assertTupleEqual(dtfl['1'].sched_seq, (0, 0, 0))
            self.assertTupleEqual(dtfl['2'].sched_seq, (1, 0, 0))

    def test_fast_forward_found(self):
        ''' Enter fast forward due to early found. '''
        network = self.simple_net
        batch_size = 1

        # No time overhead limit.
        options = Option(hw_gbuf_save_writeback=True,
                         partition_interlayer=True,
                         layer_pipeline_time_ovhd=float('inf'))
        nnd = NNDataflow(network, batch_size, self.resource, self.cost,
                         self.map_strategy)
        tops, _ = nnd.schedule_search(options)
        self.assertTrue(tops)

    def test_fast_forward_crit_time(self):
        ''' Enter fast forward due to long critical time. '''
        network = self.simple_net
        batch_size = 1

        # Multiple nodes for spatial pipelining.
        resource = self.resource._replace(
            proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(8, 8),
                                   type=NodeRegion.PROC),
            dim_array=PhyDim2(1, 1),
        )

        # Very strict time overhead limit.
        # At large fmap tpart, utilization decreases and critical time
        # would increase.
        options = Option(hw_gbuf_save_writeback=True,
                         partition_interlayer=True,
                         layer_pipeline_time_ovhd=1e-3)
        nnd = NNDataflow(network, batch_size, resource, self.cost,
                         self.map_strategy)
        tops, _ = nnd.schedule_search(options)
        self.assertTrue(tops)

    def test_fast_forward_frontier(self):
        ''' Enter fast forward due to off-frontier. '''
        network = self.simple_net
        batch_size = 16

        # Multiple nodes for spatial pipelining.
        resource = self.resource._replace(
            proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(8, 8),
                                   type=NodeRegion.PROC),
            dim_array=PhyDim2(2, 2),
        )

        # No time overhead limit.
        options = Option(hw_gbuf_save_writeback=True,
                         partition_interlayer=True,
                         layer_pipeline_time_ovhd=float('inf'))
        nnd = NNDataflow(network, batch_size, resource, self.cost,
                         self.map_strategy)
        tops, _ = nnd.schedule_search(options)
        self.assertTrue(tops)

    def test_fmap_fwd(self):
        '''
        Fmap forward with shared mem sources or both on/off-chip
        destinations.
        '''
        network = self.complex_net
        batch_size = 16

        # Multiple nodes for spatial pipelining.
        resource = self.resource._replace(
            proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(8, 8),
                                   type=NodeRegion.PROC),
        )

        # No time overhead limit.
        options = Option(hw_gbuf_save_writeback=True,
                         partition_interlayer=True,
                         layer_pipeline_time_ovhd=float('inf'))
        nnd = NNDataflow(network, batch_size, resource, self.cost,
                         self.map_strategy)
        tops, _ = nnd.schedule_search(options)
        self.assertTrue(tops)

    def test_sched_instance_sharing(self):
        ''' Scheduling instance sharing between layers. '''
        network = self.alex_net
        batch_size = 1

        nnd = NNDataflow(network, batch_size, self.resource, self.cost,
                         self.map_strategy)

        self.assertIs(nnd.layer_sched_dict['conv1_a'],
                      nnd.layer_sched_dict['conv1_b'])
        self.assertIs(nnd.layer_sched_dict['conv2_a'],
                      nnd.layer_sched_dict['conv2_b'])
        self.assertIs(nnd.layer_sched_dict['pool1_a'],
                      nnd.layer_sched_dict['pool1_b'])

    def test_opt_goal(self):
        ''' Optimization goal. '''
        network = self.alex_net
        batch_size = 8

        resource = self.resource._replace(
            proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(8, 8),
                                   type=NodeRegion.PROC),
        )

        nnd = NNDataflow(network, batch_size, resource, self.cost,
                         self.map_strategy)

        options_e = Option(sw_gbuf_bypass=(True, True, True),
                           sw_solve_loopblocking=True,
                           partition_hybrid=True, partition_batch=True,
                           opt_goal='e', ntops=16)
        tops_e, _ = nnd.schedule_search(options_e)
        self.assertTrue(tops_e)

        options_d = Option(sw_gbuf_bypass=(True, True, True),
                           sw_solve_loopblocking=True,
                           partition_hybrid=True, partition_batch=True,
                           opt_goal='d', ntops=16)
        tops_d, _ = nnd.schedule_search(options_d)
        self.assertTrue(tops_d)

        options_ed = Option(sw_gbuf_bypass=(True, True, True),
                            sw_solve_loopblocking=True,
                            partition_hybrid=True, partition_batch=True,
                            opt_goal='ed', ntops=16)
        tops_ed, _ = nnd.schedule_search(options_ed)
        self.assertTrue(tops_ed)

        self.assertLess(tops_e[0].total_cost, tops_d[0].total_cost)
        self.assertLess(tops_e[0].total_cost, tops_ed[0].total_cost)

        self.assertLess(tops_d[0].total_time, tops_e[0].total_time)
        self.assertLess(tops_d[0].total_time, tops_ed[0].total_time)

        # Sum of the smallest ED may not be the smallest; allow for error.
        self.assertLess(tops_ed[0].total_cost * tops_ed[0].total_time,
                        tops_e[0].total_cost * tops_e[0].total_time * 1.05)
        self.assertLess(tops_ed[0].total_cost * tops_ed[0].total_time,
                        tops_d[0].total_cost * tops_d[0].total_time * 1.05)

    def test_ext_layer(self):
        ''' With external layers. '''
        network = self.alex_net
        network.add_ext('e0', InputLayer(4, 1))
        network.add('l1', FCLayer(1000, 4))
        network.add('l2', FCLayer(8, 4), prevs=('e0', 'l1'))
        batch_size = 16

        options = Option(sw_gbuf_bypass=(True, True, True),
                         sw_solve_loopblocking=True)
        nnd = NNDataflow(network, batch_size, self.resource, self.cost,
                         self.map_strategy)
        tops, _ = nnd.schedule_search(options)
        self.assertTrue(tops)

    def test_no_valid_dataflow(self):
        ''' No valid dataflow is found. '''
        # Very small REGF.
        self.resource = Resource(
            proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                                   type=NodeRegion.PROC),
            dram_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                   type=NodeRegion.DRAM),
            src_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                       dim=PhyDim2(4, 4),
                                       type=NodeRegion.DRAM),
            dst_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                       dim=PhyDim2(4, 4),
                                       type=NodeRegion.DRAM),
            dim_array=PhyDim2(16, 16),
            size_gbuf=128 * 1024 // 2,  # 128 kB
            size_regf=2,
            array_bus_width=float('inf'),
            dram_bandwidth=float('inf'),
            no_time_mux=False,
        )

        nnd = NNDataflow(self.alex_net, 4, self.resource, self.cost,
                         self.map_strategy)
        tops, _ = nnd.schedule_search(self.options)
        self.assertFalse(tops)

        # With inter-layer pipelining.
        options = Option(hw_gbuf_save_writeback=True,
                         partition_interlayer=True)
        tops, _ = nnd.schedule_search(options)
        self.assertFalse(tops)

    def test_scheduling_failure(self):
        ''' Layer scheduling failure. '''
        network = self.alex_net
        batch_size = 16

        nnd = NNDataflow(network, batch_size, self.resource, self.cost,
                         MapStrategy)

        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = stdout = StringIO()
        sys.stderr = stderr = StringIO()

        with self.assertRaises(NotImplementedError):
            _ = nnd.schedule_search(self.options)

        sys.stdout = old_stdout
        sys.stderr = old_stderr
        stdout_value = stdout.getvalue()
        stderr_value = stderr.getvalue()
        stdout.close()
        stderr.close()

        self.assertFalse(stdout_value)
        self.assertIn('Failed', stderr_value)

    def test_eyeriss_isca16(self):
        ''' Reproduce Eyeriss ISCA'16 paper Fig. 10. '''
        network = self.alex_net
        batch_size = 16

        nnd = NNDataflow(network, batch_size, self.resource, self.cost,
                         self.map_strategy)
        tops, _ = nnd.schedule_search(self.options)
        self.assertTrue(tops)
        dfsch = tops[0]

        ## Check results.

        # Results as cost for each component:
        header = 'ALU, DRAM, Buffer, Array, RF'
        cost_bkdn = {}
        for layer in ['conv{}'.format(i) for i in range(1, 6)] \
                + ['fc{}'.format(i) for i in range(1, 4)]:
            op_cost = 0
            access_cost = [0] * me.NUM
            for layer_part in network:
                if not layer_part or not layer_part.startswith(layer):
                    continue
                sr = dfsch[layer_part]
                op_cost += sr.total_ops * self.cost.mac_op
                access_cost = [ac + a * c for ac, a, c
                               in zip(access_cost, sr.total_accesses,
                                      self.cost.mem_hier)]
            cost_bkdn[layer] = []
            # To 1e9.
            cost_bkdn[layer].append(op_cost / 1e9)
            cost_bkdn[layer].append(access_cost[me.DRAM] / 1e9)
            cost_bkdn[layer].append(access_cost[me.GBUF] / 1e9)
            cost_bkdn[layer].append(access_cost[me.ITCN] / 1e9)
            cost_bkdn[layer].append(access_cost[me.REGF] / 1e9)

        # Check the major parts: ALU, DRAM, RF.
        major_cost_bkdn_ref = {'conv1': [1.69, 2.46, 6.75],
                               'conv2': [3.58, 2.27, 14.33],
                               'conv3': [2.39, 2.02, 9.57],
                               'conv4': [1.79, 1.57, 7.18],
                               'conv5': [1.20, 1.05, 4.78],
                               'fc1': [0.60, 7.78, 2.42],
                               'fc2': [0.27, 3.39, 1.07],
                               'fc3': [0.07, 0.84, 0.26],
                              }
        for layer in cost_bkdn:
            success = all(abs(a - b) < 0.1 for a, b
                          in zip(cost_bkdn[layer][:2]
                                 + cost_bkdn[layer][-1:],
                                 major_cost_bkdn_ref[layer]))
            self.assertTrue(success,
                            'test_eyeriss_isca16: '
                            'ALU, DRAM, RF cost diff in layer {}.\n'
                            'header: {}\n'
                            'actual: {}\nref: {}'
                            .format(layer, header, cost_bkdn[layer],
                                    major_cost_bkdn_ref[layer]))

    def test_eyeriss_isscc16(self):
        '''
        Reproduce Eyeriss ISSCC'16 paper Fig. 14.5.6, JSSC'17 paper Table V.
        '''
        network = self.alex_net
        batch_size = 4

        resource = Resource(
            proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                   type=NodeRegion.PROC),
            dram_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                   type=NodeRegion.DRAM),
            src_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                       dim=PhyDim2(1, 1),
                                       type=NodeRegion.DRAM),
            dst_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                       dim=PhyDim2(1, 1),
                                       type=NodeRegion.DRAM),
            dim_array=PhyDim2(12, 14),
            size_gbuf=108 * 1024 // 2,  # 108 kB
            size_regf=261,  # 225 + 12 + 24
            array_bus_width=float('inf'),
            dram_bandwidth=float('inf'),
            no_time_mux=False,
        )
        cost = Cost(mac_op=2e-12,
                    mem_hier=(460e-12, 15e-12, 4e-12, 1e-12),  # pJ/16-b
                    noc_hop=0,
                    idl_unit=30e-3 / 200e6)  # 30 mW GBUF + REGF

        nnd = NNDataflow(network, batch_size, resource, cost,
                         self.map_strategy)
        tops, _ = nnd.schedule_search(self.options)
        self.assertTrue(tops)
        dfsch = tops[0]

        ## Check results.

        # Results as stats of the rows in the table.
        header = 'Power, Processing Latency, Ops, Active PEs, Filter size'
        stats = {}
        for layer in ['conv{}'.format(i) for i in range(1, 6)]:
            onchip_cost = 0
            time = 0
            ops = 0
            fil_size = 0
            for layer_part in network:
                if not layer_part or not layer_part.startswith(layer):
                    continue
                sr = dfsch[layer_part]
                onchip_cost += sr.total_cost \
                        - sr.total_accesses[me.DRAM] * cost.mem_hier[me.DRAM]
                time += sr.total_time
                ops += sr.total_ops
                fil_size += network[layer_part].total_filter_size()
            power = onchip_cost / (time / 200e6) * 1e3  # mW
            active_pes = int(ops / time)
            stats[layer] = []
            stats[layer].append(power)
            stats[layer].append(time / 200.e3)  # cycles to ms
            stats[layer].append(ops / 1e6)  # to MOPs
            stats[layer].append(active_pes)
            stats[layer].append(fil_size / 1e3)  # to k

        # Check.
        stats_ref = {'conv1': [332, 16.5, 421.66, 151, 34.8],  # Act PE 154
                     'conv2': [288, 39.2, 895.79, 135, 307.2],
                     'conv3': [266, 21.8, 598.1, 156, 884.7],
                     'conv4': [235, 16.0, 448.6, 156, 663.6],
                     'conv5': [236, 10.0, 299.0, 156, 442.4],
                    }
        for layer in stats:
            success = (0.6 * stats_ref[layer][0]
                       < stats[layer][0] < stats_ref[layer][0]) \
                    and (0.8 * stats_ref[layer][1]
                         < stats[layer][1] < stats_ref[layer][1]) \
                    and all(abs(a - b) < 0.1 for a, b
                            in zip(stats[layer][2:], stats_ref[layer][2:]))
            self.assertTrue(success,
                            'test_eyeriss_isscc16: '
                            'stats diff in layer {}.\n'
                            'header: {}\n'
                            'actual: {}\nref: {}'
                            .format(layer, header, stats[layer],
                                    stats_ref[layer]))

    def test_eyeriss_asplos17(self):
        ''' Reproduce TETRIS ASPLOS'17 paper Figure 8. '''
        network = self.alex_net
        batch_size = 16

        ## L-1 configuration.
        resource = Resource(
            proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                   type=NodeRegion.PROC),
            dram_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                   type=NodeRegion.DRAM),
            src_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                       dim=PhyDim2(1, 1),
                                       type=NodeRegion.DRAM),
            dst_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                       dim=PhyDim2(1, 1),
                                       type=NodeRegion.DRAM),
            dim_array=PhyDim2(16, 16),
            size_gbuf=576056 // 2,  # 576 kB
            size_regf=1024 // 2,  # 1 kB
            array_bus_width=float('inf'),
            dram_bandwidth=float('inf'),
            no_time_mux=False,
        )
        cost = Cost(mac_op=2e-12,
                    mem_hier=(240e-12, 28e-12, 4e-12, 1e-12),  # pJ/16-b
                    noc_hop=0,
                    idl_unit=320e-12)

        nnd = NNDataflow(network, batch_size, resource, cost,
                         self.map_strategy)
        tops, _ = nnd.schedule_search(self.options)
        self.assertTrue(tops)
        dfsch_l1 = tops[0]

        ## T-16 configuration.
        resource = Resource(
            proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                                   type=NodeRegion.PROC),
            dram_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                                   type=NodeRegion.DRAM),
            src_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                       dim=PhyDim2(4, 4),
                                       type=NodeRegion.DRAM),
            dst_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                       dim=PhyDim2(4, 4),
                                       type=NodeRegion.DRAM),
            dim_array=PhyDim2(14, 14),
            size_gbuf=133032 // 2,  # 133 kB
            size_regf=512 // 2,  # 512 B
            array_bus_width=float('inf'),
            dram_bandwidth=float('inf'),
            no_time_mux=False,
        )
        cost = Cost(mac_op=2e-12,
                    mem_hier=(80e-12, 14e-12, 4e-12, 0.6e-12),  # pJ/16-b
                    noc_hop=40e-12,
                    idl_unit=200e-12)
        options = Option(sw_gbuf_bypass=(True, True, True),
                         sw_solve_loopblocking=True,
                         partition_hybrid=True)

        nnd = NNDataflow(network, batch_size, resource, cost,
                         self.map_strategy)
        tops, _ = nnd.schedule_search(options)
        self.assertTrue(tops)
        dfsch_t16 = tops[0]

        ## Check results.

        # Same workload.
        self.assertAlmostEqual(dfsch_t16.total_ops, dfsch_l1.total_ops)

        # Performance of T-16 is proportional to PE resource (20% margin).
        self.assertLess(dfsch_t16.total_time,
                        1.2 * dfsch_l1.total_time
                        * (16 * 16) / (14 * 14 * 16))

        # Energy reduced by > 30%.
        # self.assertLess(dfsch_t16.total_cost, dfsch_l1.total_cost * 0.7)
        # With dimension restriction on partitioning, this is slightly
        # violated.
        self.assertLess(dfsch_t16.total_cost, dfsch_l1.total_cost * 0.72)
def eyerissAsplos17(self):
    ''' Reproduce TETRIS ASPLOS'17 paper Figure 8. '''
    network = self.alex_net
    batch_size = 16

    resource = Resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                               type=NodeRegion.PROC),
        dram_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                               type=NodeRegion.DRAM),
        src_data_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                                   type=NodeRegion.DRAM),
        dst_data_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                                   type=NodeRegion.DRAM),
        dim_array=PhyDim2(14, 14),
        size_gbuf=133032 // 2,  # 133 kB
        size_regf=512 // 2,  # 512 B
        array_bus_width=float('inf'),
        dram_bandwidth=float('inf'),
        no_time_mux=False,
    )
    cost = Cost(mac_op=2e-12,
                mem_hier=(80e-12, 14e-12, 4e-12, 0.6e-12),  # pJ/16-b
                noc_hop=40e-12,
                idl_unit=200e-12)
    options = Option(sw_gbuf_bypass=(True, True, True),
                     sw_solve_loopblocking=True,
                     partition_hybrid=True)

    nnd = NNDataflow(network, batch_size, resource, cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(options)
    self.assertTrue(tops)
    dfsch_t16 = tops[0]

    ## Check results.

    # Without the L-1 baseline, the original cross-configuration
    # assertions are kept as comments and only the T-16 stats are printed.
    # self.assertAlmostEqual(dfsch_t16.total_ops, dfsch_l1.total_ops)
    print('t16 ops: {}'.format(dfsch_t16.total_ops))
    # Performance of T-16 is proportional to PE resource (20% margin).
    # self.assertLess(dfsch_t16.total_time,
    #                 1.2 * dfsch_l1.total_time
    #                 * (16 * 16) / (14 * 14 * 16))
    print('t16_time: {}'.format(dfsch_t16.total_time))
    # Energy reduced by > 30%. With dimension restriction on partitioning,
    # this is slightly violated (0.72 instead of 0.7).
    # self.assertLess(dfsch_t16.total_cost, dfsch_l1.total_cost * 0.72)
    print('t16_energy: {}'.format(dfsch_t16.total_cost))

    for i in dfsch_t16:
        print(str(i) + ',')

    # Results as cost for each component:
    header = 'ALU, DRAM, Buffer, Array, RF'
    cost_bkdn = {}
    for layer in dfsch_t16:
        layer = str(layer)
        op_cost = 0
        access_cost = [0] * me.NUM
        for layer_part in network:
            if not layer_part or not layer_part.startswith(layer):
                continue
            sr = dfsch_t16[layer_part]
            op_cost += sr.total_ops * cost.mac_op
            access_cost = [ac + a * c for ac, a, c
                           in zip(access_cost, sr.total_accesses,
                                  cost.mem_hier)]
        cost_bkdn[layer] = []
        # To 1e9.
        cost_bkdn[layer].append(op_cost * 1e12 / 1e9)
        cost_bkdn[layer].append(access_cost[me.DRAM] * 1e12 / 1e9)
        cost_bkdn[layer].append(access_cost[me.GBUF] * 1e12 / 1e9)
        cost_bkdn[layer].append(access_cost[me.ITCN] * 1e12 / 1e9)
        cost_bkdn[layer].append(access_cost[me.REGF] * 1e12 / 1e9)

    for layer in cost_bkdn:
        print(cost_bkdn[layer])
def do_scheduling(args):
    '''
    Get optimal scheduling for given problem. Return a result schedule.
    '''

    ## Network.

    network = import_network(args.net)
    batch_size = args.batch

    ## Resource.

    dim_nodes = PhyDim2(*args.nodes)
    dim_array = PhyDim2(*args.array)
    # Sizes of gbuf and regf are in words; use integer division so the
    # word-granular sizes stay integral.
    word = (args.word + 7) // 8
    size_gbuf = args.gbuf // word
    size_regf = args.regf // word

    proc_region = NodeRegion(dim=dim_nodes, origin=PhyDim2(0, 0),
                             type=NodeRegion.PROC)
    if args.mem_type == '2D':
        # Memory nodes are on two sides.
        data_regions = (NodeRegion(dim=PhyDim2(h=dim_nodes.h, w=1),
                                   origin=PhyDim2(h=0, w=0),
                                   type=NodeRegion.DATA),
                        NodeRegion(dim=PhyDim2(h=dim_nodes.h, w=1),
                                   origin=PhyDim2(h=0, w=dim_nodes.w - 1),
                                   type=NodeRegion.DATA))
    elif args.mem_type == '3D':
        # All nodes have memory.
        data_regions = (NodeRegion(dim=dim_nodes, origin=PhyDim2(0, 0),
                                   type=NodeRegion.DATA),)
    else:
        raise ValueError('Unknown mem_type: {}'.format(args.mem_type))

    resource = Resource(proc_region=proc_region,
                        data_regions=data_regions,
                        dim_array=dim_array,
                        size_gbuf=size_gbuf,
                        size_regf=size_regf)

    ## Cost.

    hier_cost = [0] * me.NUM
    hier_cost[me.DRAM] = args.hier_cost[0]
    hier_cost[me.GBUF] = args.hier_cost[1]
    hier_cost[me.ITCN] = args.hier_cost[2]
    hier_cost[me.REGF] = args.hier_cost[3]
    cost = Cost(mac_op=args.op_cost,
                mem_hier=tuple(hier_cost),
                noc_hop=args.hop_cost,
                unit_static=args.unit_static_cost)

    ## Options.

    bypass = [True] * de.NUM
    bypass[de.IFM] = 'i' not in args.disable_bypass
    bypass[de.OFM] = 'o' not in args.disable_bypass
    bypass[de.FIL] = 'f' not in args.disable_bypass
    options = Option(sw_gbuf_bypass=tuple(bypass),
                     sw_solve_loopblocking=args.solve_loopblocking,
                     partition_hybrid=args.hybrid_partition,
                     partition_batch=args.batch_partition,
                     partition_ifmaps=args.ifmaps_partition,
                     ntops=args.top,
                     nprocesses=args.processes,
                     verbose=args.verbose)

    ## Search schedules.

    nnd = NNDataflow(network, batch_size, resource, cost,
                     MapStrategyEyeriss)
    tops, cache_stats = nnd.schedule_search(options)

    if not tops:
        sys.stderr.write('No valid dataflow found.\n')
        return None

    top = tops[0]

    ## Write results.

    res_map = OrderedDict()
    res_map['version'] = get_version(with_local=True)
    res_map['net'] = args.net
    res_map['batch'] = args.batch
    res_map['resource'] = resource._asdict()
    res_map['cost'] = cost._asdict()
    res_map['options'] = options._asdict()
    res_map['cache_stats'] = cache_stats
    stats = stats_dict(top, cost)
    for key, val in stats.items():
        res_map[key] = val

    return res_map
def do_scheduling(args):
    '''
    Get optimal scheduling for given problem. Return a result schedule.
    '''

    ## Network.

    network = import_network(args.net)
    batch_size = args.batch

    ## Resource.

    dim_nodes = PhyDim2(*args.nodes)
    dim_array = PhyDim2(*args.array)
    # Sizes of gbuf and regf are in words; use integer division so the
    # word-granular sizes stay integral.
    word = (args.word + 7) // 8
    size_gbuf = args.gbuf // word
    size_regf = args.regf // word

    array_bus_width = args.bus_width // args.word
    if not array_bus_width:
        array_bus_width = float('inf')
    dram_bandwidth = args.dram_bw / word

    proc_region = NodeRegion(dim=dim_nodes, origin=PhyDim2(0, 0),
                             type=NodeRegion.PROC)

    if args.mem_type == '2D':
        # Memory nodes are on two sides.
        data_region = NodeRegion(dim=PhyDim2(2, 2), origin=PhyDim2(0, 0),
                                 dist=dim_nodes - PhyDim2(1, 1),
                                 type=NodeRegion.DRAM)
        assert data_region.rel2abs(PhyDim2(1, 1)) + PhyDim2(1, 1) \
                == proc_region.dim
    elif args.mem_type == '3D':
        # Memory nodes are on the top.
        data_region = NodeRegion(dim=dim_nodes, origin=PhyDim2(0, 0),
                                 type=NodeRegion.DRAM)
    else:
        raise ValueError('Unknown mem_type: {}'.format(args.mem_type))

    resource = Resource(proc_region=proc_region,
                        dram_region=data_region,
                        src_data_region=data_region,
                        dst_data_region=data_region,
                        dim_array=dim_array,
                        size_gbuf=size_gbuf,
                        size_regf=size_regf,
                        array_bus_width=array_bus_width,
                        dram_bandwidth=dram_bandwidth,
                        no_time_mux=False)

    ## Cost.

    hier_cost = [0] * me.NUM
    hier_cost[me.DRAM] = args.hier_cost[0]
    hier_cost[me.GBUF] = args.hier_cost[1]
    hier_cost[me.ITCN] = args.hier_cost[2]
    hier_cost[me.REGF] = args.hier_cost[3]
    cost = Cost(mac_op=args.op_cost,
                mem_hier=tuple(hier_cost),
                noc_hop=args.hop_cost,
                idl_unit=args.unit_idle_cost)

    ## Options.

    bypass = [True] * de.NUM
    bypass[de.IFM] = 'i' not in args.disable_bypass
    bypass[de.OFM] = 'o' not in args.disable_bypass
    bypass[de.FIL] = 'f' not in args.disable_bypass
    options = Option(
        sw_gbuf_bypass=tuple(bypass),
        sw_solve_loopblocking=args.solve_loopblocking,
        hw_access_forwarding=args.enable_access_forwarding,
        hw_gbuf_sharing=args.enable_gbuf_sharing,
        hw_gbuf_save_writeback=args.enable_save_writeback,
        partition_hybrid=args.hybrid_partition,
        partition_batch=args.batch_partition,
        partition_ifmaps=args.ifmaps_partition,
        partition_interlayer=args.interlayer_partition,
        layer_pipeline_time_ovhd=args.layer_pipeline_time_overhead,
        layer_pipeline_max_degree=args.layer_pipeline_max_degree,
        layer_pipeline_opt=not args.disable_interlayer_opt,
        opt_goal=args.goal.lower(),
        ntops=args.top,
        nprocesses=args.processes,
        verbose=args.verbose)

    ## Search schedules.

    nnd = NNDataflow(network, batch_size, resource, cost,
                     MapStrategyEyeriss)
    tbeg = time.time()
    tops, cache_stats = nnd.schedule_search(options)
    tend = time.time()
    telapsed = tend - tbeg

    if not tops:
        sys.stderr.write('No valid dataflow found.\n')
        return None

    top = tops[0]

    ## Write results.

    res_map = OrderedDict()
    res_map['version'] = get_version(with_local=True)
    res_map['net'] = args.net
    res_map['batch'] = args.batch
    res_map['resource'] = resource._asdict()
    res_map['cost'] = cost._asdict()
    res_map['options'] = options._asdict()
    res_map['cache_stats'] = cache_stats
    res_map['elapsed'] = telapsed
    stats = stats_dict(top, cost)
    for key, val in stats.items():
        res_map[key] = val

    return res_map
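# A minimal sketch of driving do_scheduling() directly, e.g. from a test
# harness. The attribute names mirror the args fields read above; the values
# are illustrative placeholders, not from any particular experiment.
import argparse

_example_args = argparse.Namespace(
    net='alex_net', batch=16,
    nodes=(4, 4), array=(16, 16), word=16,
    gbuf=128 * 1024, regf=512,
    bus_width=0, dram_bw=float('inf'), mem_type='3D',
    op_cost=1, hier_cost=[200, 6, 2, 1], hop_cost=10, unit_idle_cost=0,
    disable_bypass='', solve_loopblocking=False,
    enable_access_forwarding=False, enable_gbuf_sharing=False,
    enable_save_writeback=False,
    hybrid_partition=False, batch_partition=False, ifmaps_partition=False,
    interlayer_partition=False,
    layer_pipeline_time_overhead=float('inf'),
    layer_pipeline_max_degree=float('inf'),
    disable_interlayer_opt=True, goal='E',
    top=1, processes=1, verbose=False)

# res_map = do_scheduling(_example_args)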
def setUp(self):
    # Workload.
    self.layer = {}
    self.layer['BASE'] = ConvLayer(12, 10, 28, 3)
    self.layer['LGFIL'] = ConvLayer(2, 4, 28, 20)
    self.layer['POOL'] = PoolingLayer(32, 28, 2)
    self.layer['PAR'] = ConvLayer(24, 36, 56, 3)
    self.batch_size = 4

    # Resource.
    self.resource = {}
    dim_array = PhyDim2(16, 16)
    proc_region = NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                             type=NodeRegion.PROC)
    data_region = NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                             type=NodeRegion.DRAM)
    # Typical resource.
    self.resource['BASE'] = Resource(
        proc_region=proc_region, dram_region=data_region,
        src_data_region=data_region, dst_data_region=data_region,
        dim_array=dim_array, size_gbuf=65536, size_regf=64,
        array_bus_width=float('inf'), dram_bandwidth=float('inf'),
        no_time_mux=False)
    # Larger resource with sufficient capacity, to make all schemes valid.
    self.resource['LG'] = Resource(
        proc_region=proc_region, dram_region=data_region,
        src_data_region=data_region, dst_data_region=data_region,
        dim_array=dim_array, size_gbuf=1024**3, size_regf=1024**3,
        array_bus_width=float('inf'), dram_bandwidth=float('inf'),
        no_time_mux=False)
    # Small resource.
    self.resource['SM'] = Resource(
        proc_region=proc_region, dram_region=data_region,
        src_data_region=data_region, dst_data_region=data_region,
        dim_array=dim_array, size_gbuf=4096, size_regf=16,
        array_bus_width=float('inf'), dram_bandwidth=float('inf'),
        no_time_mux=False)
    # Multi-node parallel resource.
    self.resource['PAR'] = Resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 2),
                               type=NodeRegion.PROC),
        dram_region=data_region,
        src_data_region=data_region, dst_data_region=data_region,
        dim_array=dim_array, size_gbuf=25000, size_regf=64,
        array_bus_width=float('inf'), dram_bandwidth=float('inf'),
        no_time_mux=False)
    # Resource with no data regions.
    proc_data_region = NodeRegion(origin=PhyDim2(1, 1), dim=PhyDim2(1, 1),
                                  type=NodeRegion.PROC)
    self.resource['SRCNOTDATA'] = Resource(
        proc_region=proc_region, dram_region=data_region,
        src_data_region=proc_data_region, dst_data_region=data_region,
        dim_array=dim_array, size_gbuf=1024**3, size_regf=1024**3,
        array_bus_width=float('inf'), dram_bandwidth=float('inf'),
        no_time_mux=False)
    self.resource['DSTNOTDATA'] = Resource(
        proc_region=proc_region, dram_region=data_region,
        src_data_region=data_region, dst_data_region=proc_data_region,
        dim_array=dim_array, size_gbuf=1024**3, size_regf=1024**3,
        array_bus_width=float('inf'), dram_bandwidth=float('inf'),
        no_time_mux=False)
    self.resource['DATALOCAL'] = Resource(
        proc_region=proc_region, dram_region=data_region,
        src_data_region=proc_region, dst_data_region=proc_region,
        dim_array=dim_array, size_gbuf=1024**3, size_regf=1024**3,
        array_bus_width=float('inf'), dram_bandwidth=float('inf'),
        no_time_mux=False)
    # Filter pinning.
    self.resource['FILPIN'] = Resource(
        proc_region=proc_region, dram_region=data_region,
        src_data_region=data_region, dst_data_region=data_region,
        dim_array=dim_array, size_gbuf=1024**3, size_regf=1024**3,
        array_bus_width=float('inf'), dram_bandwidth=float('inf'),
        no_time_mux=True)

    # Nested loop description after mapping.
    self.nld = {}
    self.nld['BASE'] = next(MapStrategyEyeriss(
        self.layer['BASE'], self.batch_size, 1,
        dim_array).gen_nested_loop_desc())
    self.nld['LGFIL'] = next(MapStrategyEyeriss(
        self.layer['LGFIL'], self.batch_size, 1,
        dim_array).gen_nested_loop_desc())
    self.nld['POOL'] = next(MapStrategyEyeriss(
        self.layer['POOL'], self.batch_size, 1,
        dim_array).gen_nested_loop_desc())
    # Fake nested loop, with zero filter size.
    self.nld['ZERO_FIL'] = NestedLoopDesc(
        loopcnt=(12, 10, 4),
        usize_gbuf=(0, 1000, 800),
        usize_regf=(0, 3, 1),
        unit_access=((0, 1000, 800), (0, 1000, 800),
                     (3, 9, 7), (1, 1, 1)),
        data_loops=(DataDimLoops(le.IFM, le.OFM),
                    DataDimLoops(le.IFM, le.BAT),
                    DataDimLoops(le.OFM, le.BAT)),
        unit_ops=1, unit_time=1)
    # Fake nested loop, with zero ifmap size.
    self.nld['ZERO_IFM'] = NestedLoopDesc(
        loopcnt=(12, 10, 4),
        usize_gbuf=(9, 0, 800),
        usize_regf=(3, 0, 1),
        unit_access=((9, 0, 800), (9, 0, 800),
                     (3, 9, 7), (1, 1, 1)),
        data_loops=(DataDimLoops(le.IFM, le.OFM),
                    DataDimLoops(le.IFM, le.BAT),
                    DataDimLoops(le.OFM, le.BAT)),
        unit_ops=1, unit_time=1)

    # Fake partition scheme.
    self.part = PartitionScheme(range(pe.NUM), ((1, 1),) * pe.NUM)

    # Fake buffer sharing scheme.
    self.bufshr = BufShrScheme(proc_region, self.part)

    # Options.
    self.options = {}
    # Basic.
    self.options['BASE'] = Option(ntops=2**30)
    # Multiprocessing.
    self.options['MP'] = Option(ntops=2**30, nprocesses=8)
    # Limited top schemes.
    self.options['NTOPS'] = Option(ntops=10)
    # Bypass.
    self.options['BYP'] = Option(sw_gbuf_bypass=(True,) * 3, ntops=2**30)
    # Bypass solver.
    self.options['BYPSOL'] = Option(sw_gbuf_bypass=(True,) * 3,
                                    sw_solve_loopblocking=True,
                                    ntops=2**30)
    # Access forwarding.
    self.options['ACCFWD'] = Option(hw_access_forwarding=True, ntops=2**30)
    # Buffer sharing.
    self.options['BUFSHR'] = Option(hw_gbuf_sharing=True, ntops=2**30)
    # Buffer sharing with bypassing.
    self.options['BUFSHR-BYP'] = Option(sw_gbuf_bypass=(True,) * 3,
                                        hw_gbuf_sharing=True,
                                        ntops=2**30)

    # Constraint.
    self.none_cstr = SchedulingConstraint()
    self.cstr = SchedulingConstraint(topifm=1, topbat=1)

    # Cost.
    self.cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1),
                     noc_hop=50, idl_unit=50)
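# Note on the fake NestedLoopDesc entries above (an inference from how they
# are used, not a documented contract): the per-data tuples appear ordered
# as (FIL, IFM, OFM), so usize_gbuf=(0, 1000, 800) in 'ZERO_FIL' gives the
# filters a zero buffer footprint, and usize_gbuf=(9, 0, 800) in 'ZERO_IFM'
# does the same for ifmaps; data_loops then says filters vary with the
# IFM/OFM loops, ifmaps with IFM/BAT, and ofmaps with OFM/BAT.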
def eyerissAsplos17(self):
    ''' Reproduce TETRIS ASPLOS'17 paper Figure 8. '''
    # network = self.alex_net
    network = self.mock_net
    batch_size = 1

    resource = Resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                               type=NodeRegion.PROC),
        dram_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                               type=NodeRegion.DRAM),
        src_data_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                                   type=NodeRegion.DRAM),
        dst_data_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                                   type=NodeRegion.DRAM),
        dim_array=PhyDim2(14, 14),
        size_gbuf=133032 // 2,  # 133 kB
        size_regf=512 // 2,  # 512 B
        array_bus_width=float('inf'),
        dram_bandwidth=float('inf'),
        no_time_mux=False,
        num_value_pes=256,
    )

    # Model values.
    print('converting weights')
    q_weight_dict = {}
    weights_dict = read_weights()
    for w_layer in ['conv1', 'conv2', 'conv3', 'conv4', 'conv5',
                    'fc6', 'fc7', 'fc8']:
        array = convertToArray(weights_dict, w_layer)
        array_qint8 = quantizeWeights(array, 'qint8')
        q_weight_dict[w_layer] = array_qint8

    # Hardware costs.
    mult_cost = readValueMult8Cost()
    # control_cost = readValueControl8Cost()
    print('done converting weights')

    # Debug scratch, kept for reference:
    # with open('weights.pickle', 'wb') as f:
    #     pickle.dump(q_weight_dict, f)
    # counter = 0
    # c = 0
    # for m in mult_cost.keys():
    #     c += mult_cost[m]
    #     counter += 1
    # ave = c / counter
    # print('{} '.format(counter))
    # print('average = {}'.format(ave))
    # print('conv3 weights are')
    # for w in q_weight_dict['conv1']:
    #     print(w)
    # exit()

    cost = Cost(value_control=1.92e-13,
                value_mult=mult_cost,
                mac_op=2e-12,
                adder_cost=(1.178e-5) / 200000000,
                mem_hier=(80e-12, 14e-12, 4e-12, 0.6e-12),  # pJ/16-b
                noc_hop=40e-12,
                idl_unit=200e-12,
                my_weights=q_weight_dict,
                mem_cycles=(200, 6, 2, 1))
    # Alternative with a measured control cost:
    # cost = Cost(value_control=control_cost,
    #             value_mult=mult_cost,
    #             mac_op=2e-12,
    #             mem_hier=(80e-12, 14e-12, 4e-12, 0.6e-12),  # pJ/16-b
    #             noc_hop=40e-12,
    #             idl_unit=200e-12)

    options = Option(sw_gbuf_bypass=(True, True, True),
                     sw_solve_loopblocking=True,
                     partition_hybrid=True)

    nnd = NNDataflow(network, batch_size, resource, cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(options)
    self.assertTrue(tops)
    dfsch_t16 = tops[0]

    ## Check results.

    # Without the L-1 baseline, the original cross-configuration
    # assertions are kept as comments and only the T-16 stats are printed.
    # self.assertAlmostEqual(dfsch_t16.total_ops, dfsch_l1.total_ops)
    print('t16 ops: {}'.format(dfsch_t16.total_ops))
    # Performance of T-16 is proportional to PE resource (20% margin).
    # self.assertLess(dfsch_t16.total_time,
    #                 1.2 * dfsch_l1.total_time
    #                 * (16 * 16) / (14 * 14 * 16))
    print('t16_time: {}'.format(dfsch_t16.total_time))
    # Energy reduced by > 30%. With dimension restriction on partitioning,
    # this is slightly violated (0.72 instead of 0.7).
    # self.assertLess(dfsch_t16.total_cost, dfsch_l1.total_cost * 0.72)
    print('t16_energy: {}'.format(dfsch_t16.total_cost))

    for i in dfsch_t16:
        print(str(i) + ',')

    # Results as cost for each component:
    header = 'ALU, DRAM, Buffer, Array, RF'
    cost_bkdn = {}
    for layer in dfsch_t16:
        layer = str(layer)
        op_cost = 0
        access_cost = [0] * me.NUM
        for layer_part in network:
            if not layer_part or not layer_part.startswith(layer):
                continue
            sr = dfsch_t16[layer_part]
            op_cost += sr.total_ops * cost.mac_op
            access_cost = [ac + a * c for ac, a, c
                           in zip(access_cost, sr.total_accesses,
                                  cost.mem_hier)]
        cost_bkdn[layer] = []
        # To 1e9.
        cost_bkdn[layer].append(op_cost * 1e12 / 1e9)
        cost_bkdn[layer].append(access_cost[me.DRAM] * 1e12 / 1e9)
        cost_bkdn[layer].append(access_cost[me.GBUF] * 1e12 / 1e9)
        cost_bkdn[layer].append(access_cost[me.ITCN] * 1e12 / 1e9)
        cost_bkdn[layer].append(access_cost[me.REGF] * 1e12 / 1e9)

    for layer in cost_bkdn:
        print(cost_bkdn[layer])