def setUp(self):
    ''' Create the workloads, cost, resource, options and ifmap layouts. '''
    # Workloads, keyed by a short name.
    self.layers = {
        'BASE': ConvLayer(8, 16, 28, 3),
        'POOL': PoolingLayer(16, 28, 2),
        'LR': LocalRegionLayer(16, 28, nreg=3, sreg=1),
    }
    self.batch_size = 4

    self.cost = Cost(mac_op=1,
                     mem_hier=(200, 6, 2, 1),
                     noc_hop=50,
                     unit_static=50)

    # Processing region of dim (4, 4) with a single data region of dim (4, 1).
    proc_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(4, 4),
                             type=NodeRegion.PROC)
    data_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(4, 1),
                             type=NodeRegion.DATA)
    self.resource = Resource(proc_region=proc_region,
                             data_regions=(data_region,),
                             dim_array=PhyDim2(16, 16),
                             size_gbuf=65536,
                             size_regf=64)

    self.options = Option(partition_hybrid=True,
                          partition_batch=True,
                          partition_ifmaps=True,
                          ntops=10)

    # One ifmap layout per workload, all derived from the same partition
    # scheme over the source data region.
    part = PartitionScheme(order=(pe.INPP, pe.BATP, pe.OUTP, pe.OFMP),
                           pdims=((1, 2), (2, 1), (1, 2), (2, 1)))
    self.ifmap_layouts = {}
    for wlkey, layer in self.layers.items():
        self.ifmap_layouts[wlkey] = partition.get_ofmap_layout(
            layer.input_layer(), self.batch_size, part,
            self.resource.src_data_region())
def test_invalid_data_region(self):
    ''' Invalid src/dst_data_region. '''
    # Arguments that are valid for every case; each case below replaces
    # exactly one data region with a PhyDim2, which is not a NodeRegion.
    valid_kwargs = dict(
        proc_region=self.proc_region,
        dram_region=self.dram_region,
        src_data_region=self.src_data_region,
        dst_data_region=self.dst_data_region,
        dim_array=PhyDim2(16, 16),
        size_gbuf=131072,
        size_regf=512,
        array_bus_width=8,
        dram_bandwidth=128,
        no_time_mux=False,
    )

    bad_src = dict(valid_kwargs, src_data_region=PhyDim2(2, 1))
    with self.assertRaisesRegex(TypeError, 'Resource: .*src_data_.*'):
        _ = Resource(**bad_src)

    bad_dst = dict(valid_kwargs, dst_data_region=PhyDim2(2, 1))
    with self.assertRaisesRegex(TypeError, 'Resource: .*dst_data_.*'):
        _ = Resource(**bad_dst)
def setUp(self):
    ''' Create the workloads, cost, constraints, resource, options, ifmap
    layouts and schedule sequence. '''
    # Workloads, keyed by a short name.
    self.layers = {
        'BASE': ConvLayer(8, 16, 28, 3),
        'POOL': PoolingLayer(16, 28, 2),
        'LR': LocalRegionLayer(16, 28, nreg=3, sreg=1),
    }
    self.batch_size = 4

    self.cost = Cost(mac_op=1,
                     mem_hier=(200, 6, 2, 1),
                     noc_hop=50,
                     idl_unit=50)

    # An unconstrained constraint, and one fixing topofm and topbat.
    self.none_cstr = SchedulingConstraint()
    self.cstr = SchedulingConstraint(topofm=1, topbat=self.batch_size)

    # Processing region of dim (4, 4); DRAM, source and destination regions
    # are each a separate region of dim (4, 1) at the origin.
    proc_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(4, 4),
                             type=NodeRegion.PROC)
    dram_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(4, 1),
                             type=NodeRegion.DRAM)
    src_region = NodeRegion(origin=PhyDim2(0, 0),
                            dim=PhyDim2(4, 1),
                            type=NodeRegion.DRAM)
    dst_region = NodeRegion(origin=PhyDim2(0, 0),
                            dim=PhyDim2(4, 1),
                            type=NodeRegion.DRAM)
    self.resource = Resource(proc_region=proc_region,
                             dram_region=dram_region,
                             src_data_region=src_region,
                             dst_data_region=dst_region,
                             dim_array=PhyDim2(16, 16),
                             size_gbuf=65536,
                             size_regf=64,
                             array_bus_width=float('inf'),
                             dram_bandwidth=float('inf'),
                             no_time_mux=False)

    self.options = Option(partition_hybrid=True,
                          partition_batch=True,
                          partition_ifmaps=True,
                          ntops=10)

    # One ifmap layout per workload: a single fmap range covering the whole
    # output of the workload's input layer, placed in the source data region.
    part = PartitionScheme(order=(pe.INPP, pe.BATP, pe.OUTP, pe.OFMP),
                           pdims=((1, 2), (2, 1), (1, 2), (2, 1)))
    self.ifmap_layouts = {}
    for wlkey, layer in self.layers.items():
        input_layer = layer.input_layer()
        end_pos = FmapPosition(b=self.batch_size,
                               n=input_layer.nofm,
                               h=input_layer.hofm,
                               w=input_layer.wofm)
        full_range = FmapRange((0, 0, 0, 0), end_pos)
        self.ifmap_layouts[wlkey] = DataLayout(
            frngs=(full_range,),
            regions=(self.resource.src_data_region,),
            parts=(part.projection(self.resource.src_data_region,
                                   appl2frng=True),))

    self.sched_seq = (2, 0, 1)
def setUp(self):
    ''' Create the networks, map strategy, resource, cost and options. '''
    # Networks imported by name.
    self.alex_net = import_network('alex_net')
    self.vgg_net = import_network('vgg_net')

    # A small hand-built network.
    simple = Network('simple')
    simple.set_input_layer(InputLayer(4, 2))
    simple.add('1', ConvLayer(4, 4, 2, 1))
    simple.add('2', ConvLayer(4, 4, 2, 1))
    # Two more layers to avoid single-segment case.
    simple.add('a1', ConvLayer(4, 1, 1, 1, strd=2))
    simple.add('a2', ConvLayer(1, 1, 1, 1))
    self.simple_net = simple

    # A hand-built network in which layer '1' feeds both '2a' and '2b',
    # and layer '4' takes both '3a' and '3b' as previous layers.
    forked = Network('complex')
    forked.set_input_layer(InputLayer(8, 8))
    forked.add('1', ConvLayer(8, 8, 8, 1))
    forked.add('2a', ConvLayer(8, 8, 8, 1), prevs=('1', ))
    forked.add('3a', ConvLayer(8, 8, 8, 1))
    forked.add('2b', ConvLayer(8, 8, 8, 1), prevs=('1', ))
    forked.add('3b', ConvLayer(8, 8, 8, 1))
    forked.add('4', ConvLayer(16, 8, 8, 1), prevs=('3a', '3b'))
    self.complex_net = forked

    self.map_strategy = MapStrategyEyeriss

    # Single-node resource; every region has dim (1, 1) at the origin.
    proc_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(1, 1),
                             type=NodeRegion.PROC)
    dram_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(1, 1),
                             type=NodeRegion.DRAM)
    src_region = NodeRegion(origin=PhyDim2(0, 0),
                            dim=PhyDim2(1, 1),
                            type=NodeRegion.DRAM)
    dst_region = NodeRegion(origin=PhyDim2(0, 0),
                            dim=PhyDim2(1, 1),
                            type=NodeRegion.DRAM)
    self.resource = Resource(
        proc_region=proc_region,
        dram_region=dram_region,
        src_data_region=src_region,
        dst_data_region=dst_region,
        dim_array=PhyDim2(16, 16),
        size_gbuf=128 * 1024 // 2,  # 128 kB
        size_regf=512 // 2,  # 512 B
        array_bus_width=float('inf'),
        dram_bandwidth=float('inf'),
        no_time_mux=False,
    )

    self.cost = Cost(mac_op=1,
                     mem_hier=(200, 6, 2, 1),
                     noc_hop=0,
                     idl_unit=0)

    self.options = Option()
def setUp(self):
    ''' Create the resource, constraint, ifmap layout and schedule
    sequence. '''
    # Single-node resource; every region has dim (1, 1) at the origin.
    proc_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(1, 1),
                             type=NodeRegion.PROC)
    dram_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(1, 1),
                             type=NodeRegion.DRAM)
    src_region = NodeRegion(origin=PhyDim2(0, 0),
                            dim=PhyDim2(1, 1),
                            type=NodeRegion.DRAM)
    dst_region = NodeRegion(origin=PhyDim2(0, 0),
                            dim=PhyDim2(1, 1),
                            type=NodeRegion.DRAM)
    self.resource = Resource(proc_region=proc_region,
                             dram_region=dram_region,
                             src_data_region=src_region,
                             dst_data_region=dst_region,
                             dim_array=PhyDim2(16, 16),
                             size_gbuf=65536,
                             size_regf=64,
                             array_bus_width=float('inf'),
                             dram_bandwidth=float('inf'),
                             no_time_mux=False)

    self.none_cstr = SchedulingConstraint()

    # Partition scheme with every partitioning dimension set to (1, 1).
    unit_part = PartitionScheme(order=range(pe.NUM),
                                pdims=[(1, 1)] * pe.NUM)
    whole_range = FmapRange((0, 0, 0, 0), (2, 4, 16, 16))
    self.ifmap_layout = DataLayout(
        frngs=(whole_range,),
        regions=(self.resource.src_data_region,),
        parts=(unit_part,))

    self.sched_seq = (2, 0, 0)
def test_pernode_sched_cache_key(self):
    ''' Per-node scheduling cache key must be hash-able. '''
    base_layer = self.layers['BASE']
    base_layout = self.ifmap_layouts['BASE']

    scheduler = Scheduling(base_layer, self.batch_size, self.cost,
                           MapStrategyEyeriss)

    # Run a full search first, then record the cache statistics.
    _ = scheduler.schedule_search(
        SchedulingCondition(resource=self.resource,
                            ifmap_layout=base_layout),
        self.options)
    hits, misses = scheduler.cache_stats()
    self.assertEqual(hits, 0)

    # Make another instance.
    resource_copy = Resource(**self.resource._asdict())
    options_copy = Option(**self.options._asdict())
    self.assertNotEqual(id(resource_copy), id(self.resource))
    self.assertNotEqual(id(options_copy), id(self.options))

    # A per-node search with the copied (distinct but equal-valued)
    # arguments must add one cache hit and no cache miss.
    part = PartitionScheme(order=(pe.BATP, pe.INPP, pe.OUTP, pe.OFMP),
                           pdims=((2, 2), (2, 2), (1, 1), (1, 1)))
    _ = scheduler.schedule_search_per_node(part, resource_copy,
                                           options_copy)

    hits_after, misses_after = scheduler.cache_stats()
    self.assertEqual(hits_after, hits + 1)
    self.assertEqual(misses_after, misses)
def setUp(self):
    ''' Create the networks, map strategy, resource, cost and options. '''
    # Networks imported by name.
    self.alex_net = import_network('alex_net')
    self.vgg_net = import_network('vgg_net')

    self.map_strategy = MapStrategyEyeriss

    # Single-node resource with one data region of dim (1, 1).
    proc_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(1, 1),
                             type=NodeRegion.PROC)
    data_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(1, 1),
                             type=NodeRegion.DATA)
    self.resource = Resource(
        proc_region=proc_region,
        data_regions=(data_region, ),
        dim_array=PhyDim2(16, 16),
        size_gbuf=128 * 1024 // 2,  # 128 kB
        size_regf=512 // 2,  # 512 B
    )

    self.cost = Cost(mac_op=1,
                     mem_hier=(200, 6, 2, 1),
                     noc_hop=0,
                     unit_static=0)

    self.options = Option()
def test_invalid_array_bus_width(self):
    ''' Invalid array_bus_width. '''
    # Each case pairs the expected exception type with the offending
    # width: a fractional value, a negative value, and zero.
    bad_widths = (
        (TypeError, 1.2),
        (ValueError, -2),
        (ValueError, 0),
    )
    for exc_type, width in bad_widths:
        with self.assertRaisesRegex(exc_type,
                                    'Resource: .*array_bus_width.*'):
            _ = Resource(
                proc_region=self.proc_region,
                dram_region=self.dram_region,
                src_data_region=self.src_data_region,
                dst_data_region=self.dst_data_region,
                dim_array=PhyDim2(16, 16),
                size_gbuf=131072,
                size_regf=512,
                array_bus_width=width,
                dram_bandwidth=128,
                no_time_mux=False,
            )
def __init__(self, mlp_network):
    ''' Store the network and create the map strategy, resource, cost and
    options used to search its dataflow.

    `mlp_network` is kept as-is in `self.net`.
    '''
    self.net = mlp_network  # MLP_network(18,32,64,32,2)
    self.map_strategy = MapStrategyEyeriss

    # Single-node resource with one data region of dim (1, 1).
    node_origin = PhyDim2(0, 0)
    node_dim = PhyDim2(1, 1)
    proc_region = NodeRegion(origin=node_origin,
                             dim=node_dim,
                             type=NodeRegion.PROC)
    data_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(1, 1),
                             type=NodeRegion.DATA)
    self.resource = Resource(
        proc_region=proc_region,
        data_regions=(data_region, ),
        dim_array=PhyDim2(16, 16),
        size_gbuf=128 * 1024 // 2,  # 128 kB
        size_regf=512 // 2,  # 512 B
    )

    self.cost = Cost(mac_op=1,
                     mem_hier=(200, 6, 2, 1),
                     noc_hop=0,
                     unit_static=0)
    self.options = Option()
class TestMLP_network():
    ''' Dataflow search for a given MLP network, using MapStrategyEyeriss on
    a single-node resource. '''

    def __init__(self, mlp_network):
        ''' Store the network and create the search configuration. '''
        self.net = mlp_network  # MLP_network(18,32,64,32,2)
        self.map_strategy = MapStrategyEyeriss

        # Single-node resource with one data region of dim (1, 1).
        proc_region = NodeRegion(origin=PhyDim2(0, 0),
                                 dim=PhyDim2(1, 1),
                                 type=NodeRegion.PROC)
        data_region = NodeRegion(origin=PhyDim2(0, 0),
                                 dim=PhyDim2(1, 1),
                                 type=NodeRegion.DATA)
        self.resource = Resource(
            proc_region=proc_region,
            data_regions=(data_region, ),
            dim_array=PhyDim2(16, 16),
            size_gbuf=128 * 1024 // 2,  # 128 kB
            size_regf=512 // 2,  # 512 B
        )

        self.cost = Cost(mac_op=1,
                         mem_hier=(200, 6, 2, 1),
                         noc_hop=0,
                         unit_static=0)
        self.options = Option()

    def test_eyeriss_isca16(self):
        ''' Search the dataflow for the stored network with batch size 16.

        Returns an OrderedDict with the configuration, the cache statistics
        and the statistics of the top schedule, or None (after writing a
        message to stderr) when the search returns no schedule.
        '''
        batch_size = 16

        searcher = NNDataflow(self.net, batch_size, self.resource,
                              self.cost, self.map_strategy)
        tops, cache_stats = searcher.schedule_search(self.options)

        if not tops:
            sys.stderr.write("No valid dataflow found!")
            return None

        best = tops[0]

        ## Write results.
        res_map = OrderedDict([
            ('net', "MLP_L"),
            ('batch', batch_size),
            ('resource', self.resource._asdict()),
            ('cost', self.cost._asdict()),
            ('options', self.options._asdict()),
            ('cache_stats', cache_stats),
        ])

        # Append the statistics of the top schedule after the fixed entries.
        for key, val in stats_dict(best, self.cost).items():
            res_map[key] = val

        return res_map
def test_invalid_data_regions_type(self):
    ''' Invalid data_regions type. '''
    # Each case is a data_regions value expected to be rejected with a
    # TypeError: a list instead of a tuple, a bare NodeRegion that is not
    # wrapped in a tuple, and a tuple containing a non-NodeRegion element.
    invalid_data_regions = (
        [
            NodeRegion(dim=PhyDim2(2, 1),
                       origin=PhyDim2(0, 0),
                       type=NodeRegion.DATA),
        ],
        NodeRegion(dim=PhyDim2(2, 1),
                   origin=PhyDim2(0, 0),
                   type=NodeRegion.DATA),
        (NodeRegion(dim=PhyDim2(2, 1),
                    origin=PhyDim2(0, 0),
                    type=NodeRegion.DATA),
         PhyDim2(2, 1)),
    )

    for data_regions in invalid_data_regions:
        # assertRaisesRegex replaces the deprecated assertRaisesRegexp
        # alias, which no longer exists in Python 3.12.
        with self.assertRaisesRegex(TypeError,
                                    'Resource: .*data_regions.*'):
            _ = Resource(
                proc_region=NodeRegion(dim=PhyDim2(2, 2),
                                       origin=PhyDim2(0, 0),
                                       type=NodeRegion.PROC),
                data_regions=data_regions,
                dim_array=PhyDim2(16, 16),
                size_gbuf=131072,
                # Scalar size, so that data_regions is the only invalid
                # argument in this call.
                size_regf=512,
            )
def setUp(self):
    ''' Create the resource and the ifmap layout. '''
    # Single-node resource with one data region of dim (1, 1).
    proc_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(1, 1),
                             type=NodeRegion.PROC)
    data_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(1, 1),
                             type=NodeRegion.DATA)
    self.resource = Resource(proc_region=proc_region,
                             data_regions=(data_region,),
                             dim_array=PhyDim2(16, 16),
                             size_gbuf=65536,
                             size_regf=64)

    # Map one fmap range to the single coordinate (0, 0).
    range_map = FmapRangeMap()
    whole_range = FmapRange((0, 0, 0, 0), (2, 4, 16, 16))
    range_map.add(whole_range, (PhyDim2(0, 0),))

    self.ifmap_layout = DataLayout(origin=PhyDim2(0, 0),
                                   frmap=range_map,
                                   type=NodeRegion.DATA)
def setUp(self):
    ''' Create the layers and the resource used by the tests. '''
    # AlexNet.
    self.convlayers = OrderedDict([
        ('conv1', ConvLayer(3, 96, 55, 11, 4)),
        ('conv2', ConvLayer(48, 256, 27, 5)),
        ('conv3', ConvLayer(256, 384, 13, 3)),
        ('conv4', ConvLayer(192, 384, 13, 3)),
        ('conv5', ConvLayer(192, 256, 13, 3)),
    ])
    self.fclayers = {
        'fc1': FCLayer(256, 4096, 6),
        'fc2': FCLayer(4096, 4096),
        'fc3': FCLayer(4096, 1000),
    }

    # LocalRegionLayer.
    self.lrlayers = {
        'pool1': PoolingLayer(64, 7, 2),
        'pool2': PoolingLayer(29, 13, 3),
        'pool3': PoolingLayer(32, 7, 2, strd=3),
        'lr1': LocalRegionLayer(32, 7, nreg=5, sreg=1),
        'lr2': LocalRegionLayer(32, 7, nreg=5, sreg=1, strd=2),
    }

    # Fake layers.
    self.fake_layers = {
        # With irregular nifm/nofm.
        'IRR': ConvLayer(255, 383, 13, 3),
        # With small numbers of fmaps.
        'SM': ConvLayer(5, 3, 13, 3),
        # With large FIL height.
        'LGFIL': ConvLayer(64, 64, 13, 22),
    }

    # Resource.
    single_proc = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(1, 1),
                             type=NodeRegion.PROC)
    # The same region object is used as DRAM, source and destination.
    single_dram = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(1, 1),
                             type=NodeRegion.DRAM)
    self.resource = {
        # Eyeriss, ISSCC'16, JSSC'17.
        'BASE': Resource(proc_region=single_proc,
                         dram_region=single_dram,
                         src_data_region=single_dram,
                         dst_data_region=single_dram,
                         dim_array=PhyDim2(12, 14),
                         size_gbuf=108 * 1024,
                         size_regf=520,
                         array_bus_width=float('inf'),
                         dram_bandwidth=float('inf'),
                         no_time_mux=False),
    }
def test_no_valid_dataflow(self):
    ''' No valid dataflow is found. '''
    # Very small REGF.
    proc_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(4, 4),
                             type=NodeRegion.PROC)
    dram_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(1, 1),
                             type=NodeRegion.DRAM)
    src_region = NodeRegion(origin=PhyDim2(0, 0),
                            dim=PhyDim2(4, 4),
                            type=NodeRegion.DRAM)
    dst_region = NodeRegion(origin=PhyDim2(0, 0),
                            dim=PhyDim2(4, 4),
                            type=NodeRegion.DRAM)
    self.resource = Resource(
        proc_region=proc_region,
        dram_region=dram_region,
        src_data_region=src_region,
        dst_data_region=dst_region,
        dim_array=PhyDim2(16, 16),
        size_gbuf=128 * 1024 // 2,  # 128 kB
        size_regf=2,
        array_bus_width=float('inf'),
        dram_bandwidth=float('inf'),
        no_time_mux=False,
    )

    searcher = NNDataflow(self.alex_net, 4, self.resource, self.cost,
                          self.map_strategy)

    # The search with the fixture options must return nothing.
    found, _ = searcher.schedule_search(self.options)
    self.assertFalse(found)

    # With inter-layer pipelining.
    pipelined = Option(hw_gbuf_save_writeback=True,
                       partition_interlayer=True)
    found, _ = searcher.schedule_search(pipelined)
    self.assertFalse(found)
def test_invalid_proc_region(self):
    ''' Invalid proc_region. '''
    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which no longer exists in Python 3.12.
    with self.assertRaisesRegex(TypeError, 'Resource: .*proc_region.*'):
        _ = Resource(
            # A PhyDim2 is passed where a NodeRegion is expected.
            proc_region=PhyDim2(2, 2),
            data_regions=(NodeRegion(dim=PhyDim2(2, 1),
                                     origin=PhyDim2(0, 0),
                                     type=NodeRegion.DATA),
                          NodeRegion(dim=PhyDim2(2, 1),
                                     origin=PhyDim2(0, 1),
                                     type=NodeRegion.DATA)),
            dim_array=PhyDim2(16, 16),
            size_gbuf=131072,
            size_regf=512,
        )
def test_invalid_proc_region_dram(self):
    ''' Invalid proc_region with type DRAM. '''
    # A well-formed region, but tagged as DRAM instead of PROC.
    dram_typed_region = NodeRegion(dim=PhyDim2(2, 2),
                                   origin=PhyDim2(0, 0),
                                   type=NodeRegion.DRAM)

    with self.assertRaisesRegex(ValueError, 'Resource: .*proc_.*type.*'):
        _ = Resource(
            proc_region=dram_typed_region,
            dram_region=self.dram_region,
            src_data_region=self.src_data_region,
            dst_data_region=self.dst_data_region,
            dim_array=PhyDim2(16, 16),
            size_gbuf=131072,
            size_regf=512,
            array_bus_width=8,
            dram_bandwidth=128,
            no_time_mux=False,
        )
def test_invalid_data_regions_proc(self):
    ''' Invalid data_regions with type PROC. '''
    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which no longer exists in Python 3.12.
    with self.assertRaisesRegex(ValueError, 'Resource: .*data_.*type.*'):
        _ = Resource(
            proc_region=NodeRegion(dim=PhyDim2(2, 2),
                                   origin=PhyDim2(0, 0),
                                   type=NodeRegion.PROC),
            # The first data region is tagged as PROC instead of DATA.
            data_regions=(NodeRegion(dim=PhyDim2(2, 1),
                                     origin=PhyDim2(0, 0),
                                     type=NodeRegion.PROC),
                          NodeRegion(dim=PhyDim2(2, 1),
                                     origin=PhyDim2(0, 1),
                                     type=NodeRegion.DATA)),
            dim_array=PhyDim2(16, 16),
            size_gbuf=131072,
            size_regf=512,
        )
def test_valid_args(self):
    ''' Valid arguments. '''
    proc_region = NodeRegion(dim=PhyDim2(2, 2),
                             origin=PhyDim2(0, 0),
                             type=NodeRegion.PROC)
    first_data = NodeRegion(dim=PhyDim2(2, 1),
                            origin=PhyDim2(0, 0),
                            type=NodeRegion.DATA)
    second_data = NodeRegion(dim=PhyDim2(2, 1),
                             origin=PhyDim2(0, 1),
                             type=NodeRegion.DATA)

    resource = Resource(
        proc_region=proc_region,
        data_regions=(first_data, second_data),
        dim_array=PhyDim2(16, 16),
        size_gbuf=131072,
        size_regf=512,
    )

    # The constructed resource must expose the values it was given.
    self.assertTupleEqual(resource.proc_region.dim, (2, 2), 'proc_region')
    self.assertTupleEqual(resource.dim_array, (16, 16), 'dim_array')
    for attr, expected in (('size_gbuf', 131072), ('size_regf', 512)):
        self.assertEqual(getattr(resource, attr), expected, attr)
def test_no_valid_dataflow(self):
    ''' No valid dataflow is found. '''
    # Very small REGF.
    proc_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(1, 1),
                             type=NodeRegion.PROC)
    data_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(1, 1),
                             type=NodeRegion.DATA)
    self.resource = Resource(
        proc_region=proc_region,
        data_regions=(data_region, ),
        dim_array=PhyDim2(16, 16),
        size_gbuf=128 * 1024 // 2,  # 128 kB
        size_regf=2,
    )

    searcher = NNDataflow(self.alex_net, 4, self.resource, self.cost,
                          self.map_strategy)

    # The search must return nothing.
    found, _ = searcher.schedule_search(self.options)
    self.assertFalse(found)
def test_valid_args(self):
    ''' Valid arguments. '''
    given = dict(
        proc_region=self.proc_region,
        dram_region=self.dram_region,
        src_data_region=self.src_data_region,
        dst_data_region=self.dst_data_region,
        dim_array=PhyDim2(16, 16),
        size_gbuf=131072,
        size_regf=512,
        array_bus_width=8,
        dram_bandwidth=128,
        no_time_mux=False,
    )
    resource = Resource(**given)

    # Tuple-valued fields.
    self.assertTupleEqual(resource.proc_region.dim, (2, 2), 'proc_region')
    self.assertTupleEqual(resource.dram_region.dim, (2, 2), 'dram_region')
    self.assertTupleEqual(resource.dim_array, (16, 16), 'dim_array')

    # Scalar fields.
    scalar_expectations = (
        ('size_gbuf', 131072),
        ('size_regf', 512),
        ('array_bus_width', 8),
        ('dram_bandwidth', 128),
    )
    for attr, expected in scalar_expectations:
        self.assertEqual(getattr(resource, attr), expected, attr)

    self.assertFalse(resource.no_time_mux, 'no_time_mux')
def test_dst_data_region(self):
    ''' Accessor dst_data_region. '''
    nr1 = NodeRegion(dim=PhyDim2(2, 1),
                     origin=PhyDim2(0, 0),
                     type=NodeRegion.DATA)
    nr2 = NodeRegion(dim=PhyDim2(2, 1),
                     origin=PhyDim2(0, 1),
                     type=NodeRegion.DATA)

    # With two data regions the accessor must return the second one; with
    # a single data region it must return that one.
    cases = (
        ((nr1, nr2), nr2),
        ((nr1, ), nr1),
    )
    for data_regions, expected in cases:
        resource = Resource(proc_region=NodeRegion(dim=PhyDim2(4, 4),
                                                   origin=PhyDim2(0, 0),
                                                   type=NodeRegion.PROC),
                            dim_array=PhyDim2(16, 16),
                            size_gbuf=131072,
                            size_regf=512,
                            data_regions=data_regions)
        self.assertEqual(resource.dst_data_region(), expected,
                         'dst_data_region')
def setUp(self):
    ''' Create the workloads, resources, nested loop descriptions,
    partition and buffer sharing schemes, options, constraints and cost. '''

    # Workload.
    self.layer = {
        'BASE': ConvLayer(12, 10, 28, 3),
        'LGFIL': ConvLayer(2, 4, 28, 20),
        'POOL': PoolingLayer(32, 28, 2),
        'PAR': ConvLayer(24, 36, 56, 3),
    }
    self.batch_size = 4

    # Resource.
    dim_array = PhyDim2(16, 16)
    proc_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(1, 1),
                             type=NodeRegion.PROC)
    data_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(1, 1),
                             type=NodeRegion.DRAM)

    def _make_resource(**overrides):
        ''' Make a Resource from the common settings below, replacing any
        field given in `overrides`. '''
        kwargs = dict(proc_region=proc_region,
                      dram_region=data_region,
                      src_data_region=data_region,
                      dst_data_region=data_region,
                      dim_array=dim_array,
                      size_gbuf=1024**3,
                      size_regf=1024**3,
                      array_bus_width=float('inf'),
                      dram_bandwidth=float('inf'),
                      no_time_mux=False)
        kwargs.update(overrides)
        return Resource(**kwargs)

    self.resource = {}
    # Typical resource.
    self.resource['BASE'] = _make_resource(size_gbuf=65536, size_regf=64)
    # Larger resource with sufficient capacity, to make all schemes valid.
    self.resource['LG'] = _make_resource()
    # Small resource.
    self.resource['SM'] = _make_resource(size_gbuf=4096, size_regf=16)
    # Multi-node parallel resource.
    self.resource['PAR'] = _make_resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0),
                               dim=PhyDim2(4, 2),
                               type=NodeRegion.PROC),
        size_gbuf=25000,
        size_regf=64)
    # Resource with no data regions.
    proc_data_region = NodeRegion(origin=PhyDim2(1, 1),
                                  dim=PhyDim2(1, 1),
                                  type=NodeRegion.PROC)
    self.resource['SRCNOTDATA'] = _make_resource(
        src_data_region=proc_data_region)
    self.resource['DSTNOTDATA'] = _make_resource(
        dst_data_region=proc_data_region)
    self.resource['DATALOCAL'] = _make_resource(
        src_data_region=proc_region,
        dst_data_region=proc_region)
    # Filter pinning.
    self.resource['FILPIN'] = _make_resource(no_time_mux=True)

    # Nested loop description after mapping.
    self.nld = {}
    for wlkey in ('BASE', 'LGFIL', 'POOL'):
        mapper = MapStrategyEyeriss(self.layer[wlkey], self.batch_size, 1,
                                    dim_array)
        # Only the first generated description is kept.
        self.nld[wlkey] = next(mapper.gen_nested_loop_desc())
    # Fake nested loop, with zero filter size.
    self.nld['ZERO_FIL'] = NestedLoopDesc(
        loopcnt=(12, 10, 4),
        usize_gbuf=(0, 1000, 800),
        usize_regf=(0, 3, 1),
        unit_access=((0, 1000, 800), (0, 1000, 800), (3, 9, 7), (1, 1, 1)),
        data_loops=(DataDimLoops(le.IFM, le.OFM),
                    DataDimLoops(le.IFM, le.BAT),
                    DataDimLoops(le.OFM, le.BAT)),
        unit_ops=1,
        unit_time=1)
    # Fake nested loop, with zero ifmap size.
    self.nld['ZERO_IFM'] = NestedLoopDesc(
        loopcnt=(12, 10, 4),
        usize_gbuf=(9, 0, 800),
        usize_regf=(3, 0, 1),
        unit_access=((9, 0, 800), (9, 0, 800), (3, 9, 7), (1, 1, 1)),
        data_loops=(DataDimLoops(le.IFM, le.OFM),
                    DataDimLoops(le.IFM, le.BAT),
                    DataDimLoops(le.OFM, le.BAT)),
        unit_ops=1,
        unit_time=1)

    # Fake partition scheme.
    self.part = PartitionScheme(range(pe.NUM), ((1, 1), ) * pe.NUM)

    # Fake buffer sharing scheme.
    self.bufshr = BufShrScheme(proc_region, self.part)

    # Options.
    self.options = {
        # Basic.
        'BASE': Option(ntops=2**30),
        # Multiprocessing.
        'MP': Option(ntops=2**30, nprocesses=8),
        # Limited top schemes.
        'NTOPS': Option(ntops=10),
        # Bypass.
        'BYP': Option(sw_gbuf_bypass=(True, ) * 3, ntops=2**30),
        # Bypass solver.
        'BYPSOL': Option(sw_gbuf_bypass=(True, ) * 3,
                         sw_solve_loopblocking=True,
                         ntops=2**30),
        # Access forwarding.
        'ACCFWD': Option(hw_access_forwarding=True, ntops=2**30),
        # Buffer sharing.
        'BUFSHR': Option(hw_gbuf_sharing=True, ntops=2**30),
        # Buffer sharing with bypassing.
        'BUFSHR-BYP': Option(sw_gbuf_bypass=(True, ) * 3,
                             hw_gbuf_sharing=True,
                             ntops=2**30),
    }

    # Constraint.
    self.none_cstr = SchedulingConstraint()
    self.cstr = SchedulingConstraint(topifm=1, topbat=1)

    # Cost.
    self.cost = Cost(mac_op=1,
                     mem_hier=(200, 6, 2, 1),
                     noc_hop=50,
                     idl_unit=50)
def do_scheduling(args):
    '''
    Get optimal scheduling for given problem. Return a result schedule.

    `args` is an attribute namespace (e.g., parsed command-line arguments).
    The fields read here are: net, batch, nodes, array, word, gbuf, regf,
    bus_width, dram_bw, mem_type, hier_cost, op_cost, hop_cost,
    unit_idle_cost, disable_bypass, solve_loopblocking,
    enable_access_forwarding, enable_gbuf_sharing, enable_save_writeback,
    hybrid_partition, batch_partition, ifmaps_partition,
    interlayer_partition, layer_pipeline_time_overhead,
    layer_pipeline_max_degree, disable_interlayer_opt, goal, top, processes
    and verbose.

    Return an OrderedDict with the configuration, the cache statistics, the
    elapsed search time and the statistics of the top schedule; or None
    (after writing a message to stderr) if the search returns no schedule.

    Raise ValueError if `args.mem_type` is neither '2D' nor '3D'.
    '''

    ## Network.

    network = import_network(args.net)
    batch_size = args.batch

    ## Resource.

    dim_nodes = PhyDim2(*args.nodes)
    dim_array = PhyDim2(*args.array)

    # Sizes of gbuf and regf are in words.
    # Use floor division so the word size in bytes (bits rounded up to whole
    # bytes) and the buffer sizes in words do not become fractional under
    # true division.
    word = (args.word + 7) // 8
    size_gbuf = args.gbuf // word
    size_regf = args.regf // word

    # A zero bus width (in words) is treated as an unlimited width.
    array_bus_width = args.bus_width // args.word
    if not array_bus_width:
        array_bus_width = float('inf')
    dram_bandwidth = args.dram_bw / word

    proc_region = NodeRegion(dim=dim_nodes,
                             origin=PhyDim2(0, 0),
                             type=NodeRegion.PROC)

    if args.mem_type == '2D':
        # Memory nodes are on two sides.
        data_region = NodeRegion(dim=PhyDim2(2, 2),
                                 origin=PhyDim2(0, 0),
                                 dist=dim_nodes - PhyDim2(1, 1),
                                 type=NodeRegion.DRAM)
        assert data_region.rel2abs(PhyDim2(1, 1)) + PhyDim2(1, 1) \
                == proc_region.dim
    elif args.mem_type == '3D':
        # Memory nodes are on the top.
        data_region = NodeRegion(dim=dim_nodes,
                                 origin=PhyDim2(0, 0),
                                 type=NodeRegion.DRAM)
    else:
        # Fail with a clear message rather than an unbound local variable.
        raise ValueError('do_scheduling: unknown mem_type {!r}; '
                         'must be \'2D\' or \'3D\'.'.format(args.mem_type))

    resource = Resource(proc_region=proc_region,
                        dram_region=data_region,
                        src_data_region=data_region,
                        dst_data_region=data_region,
                        dim_array=dim_array,
                        size_gbuf=size_gbuf,
                        size_regf=size_regf,
                        array_bus_width=array_bus_width,
                        dram_bandwidth=dram_bandwidth,
                        no_time_mux=False)

    ## Cost.

    # The given hierarchy costs are ordered as DRAM, GBUF, ITCN, REGF.
    hier_cost = [0] * me.NUM
    hier_cost[me.DRAM] = args.hier_cost[0]
    hier_cost[me.GBUF] = args.hier_cost[1]
    hier_cost[me.ITCN] = args.hier_cost[2]
    hier_cost[me.REGF] = args.hier_cost[3]
    cost = Cost(mac_op=args.op_cost,
                mem_hier=tuple(hier_cost),
                noc_hop=args.hop_cost,
                idl_unit=args.unit_idle_cost)

    ## Options.

    # Bypass is enabled for a data category unless its letter ('i', 'o' or
    # 'f') appears in the disable list.
    bypass = [True] * de.NUM
    bypass[de.IFM] = 'i' not in args.disable_bypass
    bypass[de.OFM] = 'o' not in args.disable_bypass
    bypass[de.FIL] = 'f' not in args.disable_bypass
    options = Option(
        sw_gbuf_bypass=tuple(bypass),
        sw_solve_loopblocking=args.solve_loopblocking,
        hw_access_forwarding=args.enable_access_forwarding,
        hw_gbuf_sharing=args.enable_gbuf_sharing,
        hw_gbuf_save_writeback=args.enable_save_writeback,
        partition_hybrid=args.hybrid_partition,
        partition_batch=args.batch_partition,
        partition_ifmaps=args.ifmaps_partition,
        partition_interlayer=args.interlayer_partition,
        layer_pipeline_time_ovhd=args.layer_pipeline_time_overhead,
        layer_pipeline_max_degree=args.layer_pipeline_max_degree,
        layer_pipeline_opt=not args.disable_interlayer_opt,
        opt_goal=args.goal.lower(),
        ntops=args.top,
        nprocesses=args.processes,
        verbose=args.verbose)

    ## Search schedules.

    nnd = NNDataflow(network, batch_size, resource, cost, MapStrategyEyeriss)
    tbeg = time.time()
    tops, cache_stats = nnd.schedule_search(options)
    tend = time.time()
    telapsed = tend - tbeg

    if not tops:
        sys.stderr.write('No valid dataflow found.\n')
        return None

    top = tops[0]

    ## Write results.

    res_map = OrderedDict()

    res_map['version'] = get_version(with_local=True)

    res_map['net'] = args.net
    res_map['batch'] = args.batch

    res_map['resource'] = resource._asdict()
    res_map['cost'] = cost._asdict()
    res_map['options'] = options._asdict()

    res_map['cache_stats'] = cache_stats
    res_map['elapsed'] = telapsed

    # Append the statistics of the top schedule after the fixed entries.
    stats = stats_dict(top, cost)
    for key, val in stats.items():
        res_map[key] = val

    return res_map
def setUp(self):
    ''' Create the workloads, resources, nested loop descriptions, options,
    cost and partition occupation. '''

    # Workload.
    self.layer = {
        'BASE': ConvLayer(12, 10, 28, 3),
        'LGFIL': ConvLayer(2, 4, 28, 20),
        'POOL': PoolingLayer(32, 28, 2),
    }
    self.batch_size = 4

    # Resource.
    dim_array = PhyDim2(16, 16)
    proc_region = NodeRegion(origin=PhyDim2(0, 0),
                             dim=PhyDim2(1, 1),
                             type=NodeRegion.PROC)
    data_regions = (NodeRegion(origin=PhyDim2(0, 0),
                               dim=PhyDim2(1, 1),
                               type=NodeRegion.DATA), )

    def _make_resource(size_gbuf, size_regf):
        ''' Make a single-node Resource with the given buffer sizes. '''
        return Resource(proc_region=proc_region,
                        data_regions=data_regions,
                        dim_array=dim_array,
                        size_gbuf=size_gbuf,
                        size_regf=size_regf)

    self.resource = {}
    # Typical resource.
    self.resource['BASE'] = _make_resource(65536, 64)
    # Larger resource with sufficient capacity, to make all schemes valid.
    self.resource['LG'] = _make_resource(1024**3, 1024**3)
    # Small resource.
    self.resource['SM'] = _make_resource(4096, 16)

    # Nested loop description after mapping.
    self.nld = {}
    for wlkey in ('BASE', 'LGFIL', 'POOL'):
        mapper = MapStrategyEyeriss(self.layer[wlkey], self.batch_size,
                                    dim_array)
        # Only the first generated description is kept.
        self.nld[wlkey] = next(mapper.gen_nested_loop_desc())
    # Fake nested loop, with zero filter size.
    self.nld['ZERO_FIL'] = NestedLoopDesc(
        loopcnt=(12, 10, 4),
        usize_gbuf=(0, 1000, 800),
        usize_regf=(0, 3, 1),
        unit_access=((0, 1000, 800), (0, 1000, 800), (3, 9, 7), (1, 1, 1)),
        data_loops=(DataDimLoops(le.IFM, le.OFM),
                    DataDimLoops(le.IFM, le.BAT),
                    DataDimLoops(le.OFM, le.BAT)),
        unit_ops=1,
        unit_time=1)
    # Fake nested loop, with zero ifmap size.
    self.nld['ZERO_IFM'] = NestedLoopDesc(
        loopcnt=(12, 10, 4),
        usize_gbuf=(9, 0, 800),
        usize_regf=(3, 0, 1),
        unit_access=((9, 0, 800), (9, 0, 800), (3, 9, 7), (1, 1, 1)),
        data_loops=(DataDimLoops(le.IFM, le.OFM),
                    DataDimLoops(le.IFM, le.BAT),
                    DataDimLoops(le.OFM, le.BAT)),
        unit_ops=1,
        unit_time=1)

    # Options.
    self.options = {
        # Basic.
        'BASE': Option(ntops=2**30),
        # Multiprocessing.
        'MP': Option(ntops=2**30, nprocesses=8),
        # Limited top schemes.
        'NTOPS': Option(ntops=10),
        # Bypass.
        'BYP': Option(sw_gbuf_bypass=(True, ) * 3, ntops=2**30),
        # Bypass solver.
        'BYPSOL': Option(sw_gbuf_bypass=(True, ) * 3,
                         sw_solve_loopblocking=True,
                         ntops=2**30),
    }

    # Cost.
    self.cost = Cost(mac_op=1,
                     mem_hier=(200, 6, 2, 1),
                     noc_hop=50,
                     unit_static=50)

    # Partition occupation.
    self.part_occ = 0.91
class TestScheduling(unittest.TestCase): ''' Tests for Scheduling module. ''' def setUp(self): self.layers = {} self.layers['BASE'] = ConvLayer(8, 16, 28, 3) self.layers['POOL'] = PoolingLayer(16, 28, 2) self.layers['LR'] = LocalRegionLayer(16, 28, nreg=3, sreg=1) self.batch_size = 4 self.cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1), noc_hop=50, idl_unit=50) self.none_cstr = SchedulingConstraint() self.cstr = SchedulingConstraint(topofm=1, topbat=self.batch_size) self.resource = Resource( proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4), type=NodeRegion.PROC), dram_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 1), type=NodeRegion.DRAM), src_data_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 1), type=NodeRegion.DRAM), dst_data_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 1), type=NodeRegion.DRAM), dim_array=PhyDim2(16, 16), size_gbuf=65536, size_regf=64, array_bus_width=float('inf'), dram_bandwidth=float('inf'), no_time_mux=False) self.options = Option(partition_hybrid=True, partition_batch=True, partition_ifmaps=True, ntops=10) self.ifmap_layouts = {} part = PartitionScheme(order=(pe.INPP, pe.BATP, pe.OUTP, pe.OFMP), pdims=((1, 2), (2, 1), (1, 2), (2, 1))) for wlkey in self.layers: input_layer = self.layers[wlkey].input_layer() self.ifmap_layouts[wlkey] = DataLayout( frngs=(FmapRange((0, 0, 0, 0), FmapPosition(b=self.batch_size, n=input_layer.nofm, h=input_layer.hofm, w=input_layer.wofm)), ), regions=(self.resource.src_data_region, ), parts=(part.projection(self.resource.src_data_region, appl2frng=True), )) self.sched_seq = (2, 0, 1) def test_valid_args(self): ''' Valid arguments for constructor. 
''' schd = Scheduling(self.layers['BASE'], self.batch_size, self.cost, MapStrategyEyeriss) self.assertEqual(schd.layer, self.layers['BASE']) self.assertEqual(schd.batch_size, self.batch_size) self.assertEqual(schd.cost, self.cost) self.assertEqual(schd.map_strategy_class, MapStrategyEyeriss) def test_invalid_layer(self): ''' Invalid layer argument. ''' with self.assertRaisesRegexp(TypeError, 'Scheduling: .*layer.*'): _ = Scheduling((64, 128, 28, 3), self.batch_size, self.cost, MapStrategyEyeriss) def test_invalid_cost(self): ''' Invalid cost argument. ''' with self.assertRaisesRegexp(TypeError, 'Scheduling: .*cost.*'): _ = Scheduling(self.layers['BASE'], self.batch_size, tuple(self.cost), MapStrategyEyeriss) def test_invalid_map_strategy(self): ''' Invalid cost argument. ''' class _DummyClass(object): # pylint: disable=too-few-public-methods pass with self.assertRaisesRegexp(TypeError, 'Scheduling: .*map_strategy_class.*'): _ = Scheduling(self.layers['BASE'], self.batch_size, self.cost, _DummyClass) def test_schedule_search(self): ''' Schedule search. ''' for wlkey in self.layers: layer = self.layers[wlkey] ifmap_layout = self.ifmap_layouts[wlkey] schd = Scheduling(layer, self.batch_size, self.cost, MapStrategyEyeriss) condition = SchedulingCondition(resource=self.resource, constraint=self.cstr, ifmap_layout=ifmap_layout, sched_seq=self.sched_seq) res = schd.schedule_search(condition, self.options) # Top N. self.assertLessEqual(len(res), self.options.ntops) self.assertTrue(all(isinstance(r, SchedulingResult) for r in res)) for idx in range(len(res) - 1): self.assertLessEqual(res[idx].total_cost, res[idx + 1].total_cost) # Combination of loop blocking and partitioning. 
for r in res: self.assertAlmostEqual( r.total_cost, r.scheme['cost_op'] + r.scheme['cost_access'] + r.scheme['cost_noc'] + r.scheme['cost_static']) self.assertEqual(r.total_ops, layer.total_ops(self.batch_size)) self.assertSequenceEqual(r.scheme['total_nhops'], [ nh * f for nh, f in zip(r.scheme['unit_nhops'], r.scheme['fetch'][0]) ]) self.assertEqual(r.num_nodes, self.resource.proc_region.dim.size()) # Constraint. for r in res: self.assertEqual(r.scheme['to'][0], 1) # Ofmap layout. for r in res: self.assertEqual(r.ofmap_layout.complete_fmap_range().size(), layer.total_ofmap_size(self.batch_size)) # Sequence number. for r in res: self.assertTupleEqual(r.sched_seq, condition.sched_seq) def test_schedule_search_ilayout(self): ''' Invalid ifmap_layout. ''' layer = self.layers['BASE'] schd = Scheduling(layer, self.batch_size, self.cost, MapStrategyEyeriss) # Shift ifmap out of memory region. condition = SchedulingCondition( resource=self.resource, constraint=self.none_cstr, ifmap_layout=self.ifmap_layouts['BASE']._replace(regions=tuple( r._replace(origin=PhyDim2(-10, -10)) for r in self.ifmap_layouts['BASE'].regions)), sched_seq=self.sched_seq) with self.assertRaisesRegexp(ValueError, 'Scheduling: .*ifmap.*'): _ = schd.schedule_search(condition, self.options) # Not match layer. condition = SchedulingCondition( resource=self.resource, constraint=self.none_cstr, ifmap_layout=self.ifmap_layouts['POOL'], sched_seq=self.sched_seq) with self.assertRaisesRegexp(ValueError, 'Scheduling: .*ifmap.*'): _ = schd.schedule_search(condition, self.options) def test_schedule_search_nolbs(self): ''' Schedule search with no lbs. 
''' layer = self.layers['BASE'] ifmap_layout = self.ifmap_layouts['BASE'] schd = Scheduling(layer, self.batch_size, self.cost, MapStrategyEyeriss) condition = SchedulingCondition( resource=self.resource._replace(size_regf=0), constraint=self.none_cstr, ifmap_layout=ifmap_layout, sched_seq=self.sched_seq) res = schd.schedule_search(condition, self.options) self.assertFalse(res) def test_pernode_sched_cache(self): ''' Per-node scheduling cache. ''' # pylint: disable=no-member Scheduling.schedule_search_per_node.cache_clear() layer = self.layers['BASE'] ifmap_layout = self.ifmap_layouts['BASE'] schd = Scheduling(layer, self.batch_size, self.cost, MapStrategyEyeriss) self.assertEqual(schd.schedule_search_per_node.cache_info().currsize, 0) self.assertTupleEqual(schd.cache_stats(), (0, 0)) condition = SchedulingCondition(resource=self.resource, constraint=self.cstr, ifmap_layout=ifmap_layout, sched_seq=self.sched_seq) Scheduling.schedule_search.cache_clear() _ = schd.schedule_search(condition, self.options) h, m = schd.cache_stats() self.assertEqual(schd.schedule_search_per_node.cache_info().currsize, m) self.assertEqual(h, 0) n = m Scheduling.schedule_search.cache_clear() _ = schd.schedule_search(condition, self.options) self.assertEqual(schd.schedule_search_per_node.cache_info().currsize, n) self.assertTupleEqual(schd.cache_stats(), (n, n)) def test_pernode_sched_cache_key(self): ''' Per-node scheduling cache key must be hash-able. ''' # pylint: disable=no-member Scheduling.schedule_search.cache_clear() Scheduling.schedule_search_per_node.cache_clear() layer = self.layers['BASE'] ifmap_layout = self.ifmap_layouts['BASE'] schd = Scheduling(layer, self.batch_size, self.cost, MapStrategyEyeriss) condition = SchedulingCondition(resource=self.resource, constraint=self.cstr, ifmap_layout=ifmap_layout, sched_seq=self.sched_seq) _ = schd.schedule_search(condition, self.options) h, m = schd.cache_stats() self.assertEqual(h, 0) # Make another instance. 
rsrc = Resource(**self.resource._asdict()) cstr = self.cstr opts = Option(**self.options._asdict()) self.assertNotEqual(id(rsrc), id(self.resource)) self.assertNotEqual(id(opts), id(self.options)) part = PartitionScheme(order=(pe.BATP, pe.INPP, pe.OUTP, pe.OFMP), pdims=((2, 4), (2, 1), (1, 1), (1, 1))) _ = schd.schedule_search_per_node(part, rsrc, cstr, opts) h2, m2 = schd.cache_stats() self.assertEqual(h2, h + 1) self.assertEqual(m2, m)
def test_eyeriss_asplos17(self):
    ''' Reproduce TETRIS ASPLOS'17 paper Figure 8. '''

    def _square_region(side, region_type):
        ''' Make a side-by-side node region anchored at the origin. '''
        return NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(side, side),
                          type=region_type)

    network = self.alex_net
    batch_size = 16

    ## L-1 configuration: a single node with a 16x16 PE array.

    resource_l1 = Resource(
        proc_region=_square_region(1, NodeRegion.PROC),
        dram_region=_square_region(1, NodeRegion.DRAM),
        src_data_region=_square_region(1, NodeRegion.DRAM),
        dst_data_region=_square_region(1, NodeRegion.DRAM),
        dim_array=PhyDim2(16, 16),
        size_gbuf=576056 // 2,  # 576 kB
        size_regf=1024 // 2,  # 1 kB
        array_bus_width=float('inf'),
        dram_bandwidth=float('inf'),
        no_time_mux=False,
    )

    cost_l1 = Cost(mac_op=2e-12,
                   mem_hier=(240e-12, 28e-12, 4e-12, 1e-12),  # pJ/16-b
                   noc_hop=0,
                   idl_unit=320e-12)

    nnd_l1 = NNDataflow(network, batch_size, resource_l1, cost_l1,
                        self.map_strategy)
    tops_l1, _ = nnd_l1.schedule_search(self.options)
    self.assertTrue(tops_l1)
    dfsch_l1 = tops_l1[0]

    ## T-16 configuration: 4x4 nodes, each with a 14x14 PE array.

    resource_t16 = Resource(
        proc_region=_square_region(4, NodeRegion.PROC),
        dram_region=_square_region(4, NodeRegion.DRAM),
        src_data_region=_square_region(4, NodeRegion.DRAM),
        dst_data_region=_square_region(4, NodeRegion.DRAM),
        dim_array=PhyDim2(14, 14),
        size_gbuf=133032 // 2,  # 133 kB
        size_regf=512 // 2,  # 512 B
        array_bus_width=float('inf'),
        dram_bandwidth=float('inf'),
        no_time_mux=False,
    )

    cost_t16 = Cost(mac_op=2e-12,
                    mem_hier=(80e-12, 14e-12, 4e-12, 0.6e-12),  # pJ/16-b
                    noc_hop=40e-12,
                    idl_unit=200e-12)

    options_t16 = Option(sw_gbuf_bypass=(True, True, True),
                         sw_solve_loopblocking=True,
                         partition_hybrid=True)

    nnd_t16 = NNDataflow(network, batch_size, resource_t16, cost_t16,
                         self.map_strategy)
    tops_t16, _ = nnd_t16.schedule_search(options_t16)
    self.assertTrue(tops_t16)
    dfsch_t16 = tops_t16[0]

    ## Check results.

    # Same workload.
    self.assertAlmostEqual(dfsch_t16.total_ops, dfsch_l1.total_ops)

    # Performance of T-16 is proportional to PE resource (20% margin).
    time_bound = 1.2 * dfsch_l1.total_time \
        * (16 * 16) / (14 * 14 * 16)
    self.assertLess(dfsch_t16.total_time, time_bound)

    # Energy reduced by > 30% would mean a factor of 0.7. With dimension
    # restriction on partitioning, this is slightly violated, so a factor
    # of 0.72 is checked instead.
    cost_bound = dfsch_l1.total_cost * 0.72
    self.assertLess(dfsch_t16.total_cost, cost_bound)
def test_eyeriss_isscc16(self):
    ''' Reproduce Eyeriss ISSCC'16 paper Fig. 14.5.6, JSSC'17 paper Table V.
    '''
    network = self.alex_net
    batch_size = 4

    one_by_one = PhyDim2(1, 1)
    resource = Resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=one_by_one,
                               type=NodeRegion.PROC),
        dram_region=NodeRegion(origin=PhyDim2(0, 0), dim=one_by_one,
                               type=NodeRegion.DRAM),
        src_data_region=NodeRegion(origin=PhyDim2(0, 0), dim=one_by_one,
                                   type=NodeRegion.DRAM),
        dst_data_region=NodeRegion(origin=PhyDim2(0, 0), dim=one_by_one,
                                   type=NodeRegion.DRAM),
        dim_array=PhyDim2(12, 14),
        size_gbuf=108 * 1024 // 2,  # 108 kB
        size_regf=261,  # 225 + 12 + 24
        array_bus_width=float('inf'),
        dram_bandwidth=float('inf'),
        no_time_mux=False,
    )

    cost = Cost(mac_op=2e-12,
                mem_hier=(460e-12, 15e-12, 4e-12, 1e-12),  # pJ/16-b
                noc_hop=0,
                idl_unit=30e-3 / 200e6)  # 30 mW GBUF + REGF

    nnd = NNDataflow(network, batch_size, resource, cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(self.options)
    self.assertTrue(tops)
    dfsch = tops[0]

    ## Check results.

    # Results as stats of the rows in the table.
    header = 'Power, Processing Latency, Ops, Active PEs, Filter size'
    dram_unit_cost = cost.mem_hier[me.DRAM]
    stats = {}

    for idx in range(1, 6):
        layer = 'conv{}'.format(idx)

        # Every part of the network whose name starts with this layer name
        # contributes to the layer's row.
        part_names = [name for name in network
                      if name and name.startswith(layer)]
        part_results = [dfsch[name] for name in part_names]

        # On-chip cost is the total cost minus the DRAM access cost.
        onchip_cost = sum(
            sr.total_cost - sr.total_accesses[me.DRAM] * dram_unit_cost
            for sr in part_results)
        time = sum(sr.total_time for sr in part_results)
        ops = sum(sr.total_ops for sr in part_results)
        fil_size = sum(network[name].total_filter_size()
                       for name in part_names)

        power = onchip_cost / (time / 200e6) * 1e3  # mW
        active_pes = int(ops / time)

        stats[layer] = [
            power,
            time / 200.e3,  # cycles to ms
            ops / 1e6,  # to MOPs
            active_pes,
            fil_size / 1e3,  # to k
        ]

    # Check.
    stats_ref = {'conv1': [332, 16.5, 421.66, 151, 34.8],  # Act PE 154
                 'conv2': [288, 39.2, 895.79, 135, 307.2],
                 'conv3': [266, 21.8, 598.1, 156, 884.7],
                 'conv4': [235, 16.0, 448.6, 156, 663.6],
                 'conv5': [236, 10.0, 299.0, 156, 442.4],
                }
    for layer in stats:
        actual = stats[layer]
        ref = stats_ref[layer]

        # Power within (60%, 100%) and latency within (80%, 100%) of the
        # reference; the remaining columns within 0.1 of the reference.
        power_ok = 0.6 * ref[0] < actual[0] < ref[0]
        latency_ok = 0.8 * ref[1] < actual[1] < ref[1]
        rest_ok = all(abs(a - b) < 0.1
                      for a, b in zip(actual[2:], ref[2:]))

        message = ('test_eyeriss_isscc16: '
                   'stats diff in layer {}.\n'
                   'header: {}\n'
                   'actual: {}\nref: {}'
                   .format(layer, header, actual, ref))
        self.assertTrue(power_ok and latency_ok and rest_ok, message)
class TestNNDataflow(unittest.TestCase): ''' Tests for NNDataflow module. ''' def setUp(self): self.alex_net = import_network('alex_net') self.vgg_net = import_network('vgg_net') net = Network('simple') net.set_input_layer(InputLayer(4, 2)) net.add('1', ConvLayer(4, 4, 2, 1)) net.add('2', ConvLayer(4, 4, 2, 1)) # Two more layers to avoid single-segment case. net.add('a1', ConvLayer(4, 1, 1, 1, strd=2)) net.add('a2', ConvLayer(1, 1, 1, 1)) self.simple_net = net net = Network('complex') net.set_input_layer(InputLayer(8, 8)) net.add('1', ConvLayer(8, 8, 8, 1)) net.add('2a', ConvLayer(8, 8, 8, 1), prevs=('1',)) net.add('3a', ConvLayer(8, 8, 8, 1)) net.add('2b', ConvLayer(8, 8, 8, 1), prevs=('1',)) net.add('3b', ConvLayer(8, 8, 8, 1)) net.add('4', ConvLayer(16, 8, 8, 1), prevs=('3a', '3b')) self.complex_net = net self.map_strategy = MapStrategyEyeriss self.resource = Resource(proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.PROC), dram_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), src_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), dst_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), dim_array=PhyDim2(16, 16), size_gbuf=128 * 1024 // 2, # 128 kB size_regf=512 // 2, # 512 B array_bus_width=float('inf'), dram_bandwidth=float('inf'), no_time_mux=False, ) self.cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1), noc_hop=0, idl_unit=0) self.options = Option() def test_invalid_network(self): ''' Invalid network argument. ''' with self.assertRaisesRegex(TypeError, 'NNDataflow: .*network.*'): _ = NNDataflow(self.alex_net.input_layer(), 4, self.resource, self.cost, self.map_strategy) def test_invalid_resource(self): ''' Invalid network argument. 
''' with self.assertRaisesRegex(TypeError, 'NNDataflow: .*resource.*'): _ = NNDataflow(self.alex_net, 4, self.resource.proc_region, self.cost, self.map_strategy) def test_invalid_cost(self): ''' Invalid network argument. ''' with self.assertRaisesRegex(TypeError, 'NNDataflow: .*cost.*'): _ = NNDataflow(self.alex_net, 4, self.resource, self.cost._asdict(), self.map_strategy) def test_invalid_map_strategy(self): ''' Invalid map_strategy argument. ''' class _DummyClass(): # pylint: disable=too-few-public-methods pass with self.assertRaisesRegex(TypeError, 'NNDataflow: .*map_strategy.*'): _ = NNDataflow(self.alex_net, 4, self.resource, self.cost, _DummyClass) def test_verbose(self): ''' Verbose mode. ''' network = self.alex_net batch_size = 16 options = Option(sw_gbuf_bypass=(True, True, True), sw_solve_loopblocking=True, verbose=True) nnd = NNDataflow(network, batch_size, self.resource, self.cost, self.map_strategy) old_stdout = sys.stdout old_stderr = sys.stderr sys.stdout = stdout = StringIO() sys.stderr = stderr = StringIO() tops, _ = nnd.schedule_search(options) sys.stdout = old_stdout sys.stderr = old_stderr stdout_value = stdout.getvalue() stderr_value = stderr.getvalue() stdout.close() stderr.close() self.assertTrue(tops) self.assertFalse(stdout_value) for layer in network: self.assertIn(layer, stderr_value) def test_pipelining(self): ''' Pipelining. ''' network = self.alex_net batch_size = 1 options = Option(hw_gbuf_save_writeback=True, partition_interlayer=True) nnd = NNDataflow(network, batch_size, self.resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(options) self.assertTrue(tops) def test_fast_forward_infeasible(self): ''' Enter fast forward due to infeasible constraint. ''' network = self.simple_net batch_size = 1 # Very small gbuf size. Small fmap tpart is infeasible. 
resource = self.resource._replace( dim_array=PhyDim2(2, 2), size_gbuf=16) options = Option(hw_gbuf_save_writeback=True, partition_interlayer=True) nnd = NNDataflow(network, batch_size, resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(options) self.assertTrue(tops) # No pipelining is feasible. for dtfl in tops: self.assertTupleEqual(dtfl['1'].sched_seq, (0, 0, 0)) self.assertTupleEqual(dtfl['2'].sched_seq, (1, 0, 0)) def test_fast_forward_found(self): ''' Enter fast forward due to early found. ''' network = self.simple_net batch_size = 1 # No time overhead limit. options = Option(hw_gbuf_save_writeback=True, partition_interlayer=True, layer_pipeline_time_ovhd=float('inf')) nnd = NNDataflow(network, batch_size, self.resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(options) self.assertTrue(tops) def test_fast_forward_crit_time(self): ''' Enter fast forward due to long critical time. ''' network = self.simple_net batch_size = 1 # Multiple nodes for spatial pipelining. resource = self.resource._replace( proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(8, 8), type=NodeRegion.PROC), dim_array=PhyDim2(1, 1), ) # Very strict time overhead limit. # At large fmap tpart, utilization decreases and critical time would # increase. options = Option(hw_gbuf_save_writeback=True, partition_interlayer=True, layer_pipeline_time_ovhd=1e-3) nnd = NNDataflow(network, batch_size, resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(options) self.assertTrue(tops) def test_fast_forward_frontier(self): ''' Enter fast forward due to off-frontier. ''' network = self.simple_net batch_size = 16 # Multiple nodes for spatial pipelining. resource = self.resource._replace( proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(8, 8), type=NodeRegion.PROC), dim_array=PhyDim2(2, 2), ) # No time overhead limit. 
options = Option(hw_gbuf_save_writeback=True, partition_interlayer=True, layer_pipeline_time_ovhd=float('inf')) nnd = NNDataflow(network, batch_size, resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(options) self.assertTrue(tops) def test_fmap_fwd(self): ''' Fmap forward with shared mem sources or both on/off-chip destinations. ''' network = self.complex_net batch_size = 16 # Multiple nodes for spatial pipelining. resource = self.resource._replace( proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(8, 8), type=NodeRegion.PROC), ) # No time overhead limit. options = Option(hw_gbuf_save_writeback=True, partition_interlayer=True, layer_pipeline_time_ovhd=float('inf')) nnd = NNDataflow(network, batch_size, resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(options) self.assertTrue(tops) def test_sched_instance_sharing(self): ''' Scheduling instance sharing between layers. ''' network = self.alex_net batch_size = 1 nnd = NNDataflow(network, batch_size, self.resource, self.cost, self.map_strategy) self.assertIs(nnd.layer_sched_dict['conv1_a'], nnd.layer_sched_dict['conv1_b']) self.assertIs(nnd.layer_sched_dict['conv2_a'], nnd.layer_sched_dict['conv2_b']) self.assertIs(nnd.layer_sched_dict['pool1_a'], nnd.layer_sched_dict['pool1_b']) def test_opt_goal(self): ''' Optimization goal. 
''' network = self.alex_net batch_size = 8 resource = self.resource._replace( proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(8, 8), type=NodeRegion.PROC) ) nnd = NNDataflow(network, batch_size, resource, self.cost, self.map_strategy) options_e = Option(sw_gbuf_bypass=(True, True, True), sw_solve_loopblocking=True, partition_hybrid=True, partition_batch=True, opt_goal='e', ntops=16) tops_e, _ = nnd.schedule_search(options_e) self.assertTrue(tops_e) options_d = Option(sw_gbuf_bypass=(True, True, True), sw_solve_loopblocking=True, partition_hybrid=True, partition_batch=True, opt_goal='d', ntops=16) tops_d, _ = nnd.schedule_search(options_d) self.assertTrue(tops_d) options_ed = Option(sw_gbuf_bypass=(True, True, True), sw_solve_loopblocking=True, partition_hybrid=True, partition_batch=True, opt_goal='ed', ntops=16) tops_ed, _ = nnd.schedule_search(options_ed) self.assertTrue(tops_ed) self.assertLess(tops_e[0].total_cost, tops_d[0].total_cost) self.assertLess(tops_e[0].total_cost, tops_ed[0].total_cost) self.assertLess(tops_d[0].total_time, tops_e[0].total_time) self.assertLess(tops_d[0].total_time, tops_ed[0].total_time) # Sum of the smallest ED may not be the smallest; allow for error. self.assertLess(tops_ed[0].total_cost * tops_ed[0].total_time, tops_e[0].total_cost * tops_e[0].total_time * 1.05) self.assertLess(tops_ed[0].total_cost * tops_ed[0].total_time, tops_d[0].total_cost * tops_d[0].total_time * 1.05) def test_ext_layer(self): ''' With external layers. ''' network = self.alex_net network.add_ext('e0', InputLayer(4, 1)) network.add('l1', FCLayer(1000, 4)) network.add('l2', FCLayer(8, 4), prevs=('e0', 'l1')) batch_size = 16 options = Option(sw_gbuf_bypass=(True, True, True), sw_solve_loopblocking=True) nnd = NNDataflow(network, batch_size, self.resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(options) self.assertTrue(tops) def test_no_valid_dataflow(self): ''' No valid dataflow is found. ''' # Very small REGF. 
self.resource = Resource(proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4), type=NodeRegion.PROC), dram_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), src_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(4, 4), type=NodeRegion.DRAM), dst_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(4, 4), type=NodeRegion.DRAM), dim_array=PhyDim2(16, 16), size_gbuf=128 * 1024 // 2, # 128 kB size_regf=2, array_bus_width=float('inf'), dram_bandwidth=float('inf'), no_time_mux=False, ) nnd = NNDataflow(self.alex_net, 4, self.resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(self.options) self.assertFalse(tops) # With inter-layer pipelining. options = Option(hw_gbuf_save_writeback=True, partition_interlayer=True) tops, _ = nnd.schedule_search(options) self.assertFalse(tops) def test_scheduling_failure(self): ''' Layer scheduling failure. ''' network = self.alex_net batch_size = 16 nnd = NNDataflow(network, batch_size, self.resource, self.cost, MapStrategy) old_stdout = sys.stdout old_stderr = sys.stderr sys.stdout = stdout = StringIO() sys.stderr = stderr = StringIO() with self.assertRaises(NotImplementedError): _ = nnd.schedule_search(self.options) sys.stdout = old_stdout sys.stderr = old_stderr stdout_value = stdout.getvalue() stderr_value = stderr.getvalue() stdout.close() stderr.close() self.assertFalse(stdout_value) self.assertIn('Failed', stderr_value) def test_eyeriss_isca16(self): ''' Reproduce Eyeriss ISCA'16 paper Fig. 10. ''' network = self.alex_net batch_size = 16 nnd = NNDataflow(network, batch_size, self.resource, self.cost, self.map_strategy) tops, _ = nnd.schedule_search(self.options) self.assertTrue(tops) dfsch = tops[0] ## Check results. 
# Results as cost for each component: header = 'ALU, DRAM, Buffer, Array, RF' cost_bkdn = {} for layer in ['conv{}'.format(i) for i in range(1, 6)] \ + ['fc{}'.format(i) for i in range(1, 4)]: op_cost = 0 access_cost = [0] * me.NUM for layer_part in network: if not layer_part or not layer_part.startswith(layer): continue sr = dfsch[layer_part] op_cost += sr.total_ops * self.cost.mac_op access_cost = [ac + a * c for ac, a, c in zip(access_cost, sr.total_accesses, self.cost.mem_hier)] cost_bkdn[layer] = [] # To 1e9. cost_bkdn[layer].append(op_cost / 1e9) cost_bkdn[layer].append(access_cost[me.DRAM] / 1e9) cost_bkdn[layer].append(access_cost[me.GBUF] / 1e9) cost_bkdn[layer].append(access_cost[me.ITCN] / 1e9) cost_bkdn[layer].append(access_cost[me.REGF] / 1e9) # Check the major parts: ALU, DRAM, RF. major_cost_bkdn_ref = {'conv1': [1.69, 2.46, 6.75], 'conv2': [3.58, 2.27, 14.33], 'conv3': [2.39, 2.02, 9.57], 'conv4': [1.79, 1.57, 7.18], 'conv5': [1.20, 1.05, 4.78], 'fc1': [0.60, 7.78, 2.42], 'fc2': [0.27, 3.39, 1.07], 'fc3': [0.07, 0.84, 0.26], } for layer in cost_bkdn: success = all(abs(a - b) < 0.1 for a, b in zip(cost_bkdn[layer][:2] + cost_bkdn[layer][-1:], major_cost_bkdn_ref[layer])) self.assertTrue(success, 'test_eyeriss_isca16: ' 'ALU, DRAM, RF cost diff in layer {}.\n' 'header: {}\n' 'actual: {}\nref: {}' .format(layer, header, cost_bkdn[layer], major_cost_bkdn_ref[layer])) def test_eyeriss_isscc16(self): ''' Reproduce Eyeriss ISSCC'16 paper Fig. 14.5.6, JSSC'17 paper Table V. 
''' network = self.alex_net batch_size = 4 resource = Resource(proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.PROC), dram_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), src_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), dst_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), dim_array=PhyDim2(12, 14), size_gbuf=108 * 1024 // 2, # 108 kB size_regf=261, # 225 + 12 + 24 array_bus_width=float('inf'), dram_bandwidth=float('inf'), no_time_mux=False, ) cost = Cost(mac_op=2e-12, mem_hier=(460e-12, 15e-12, 4e-12, 1e-12), # pJ/16-b noc_hop=0, idl_unit=30e-3 / 200e6) # 30 mW GBUF + REGF nnd = NNDataflow(network, batch_size, resource, cost, self.map_strategy) tops, _ = nnd.schedule_search(self.options) self.assertTrue(tops) dfsch = tops[0] ## Check results. # Results as stats of the rows in the table. header = 'Power, Processing Latency, Ops, Active PEs, Filter size' stats = {} for layer in ['conv{}'.format(i) for i in range(1, 6)]: onchip_cost = 0 time = 0 ops = 0 fil_size = 0 for layer_part in network: if not layer_part or not layer_part.startswith(layer): continue sr = dfsch[layer_part] onchip_cost += sr.total_cost \ - sr.total_accesses[me.DRAM] * cost.mem_hier[me.DRAM] time += sr.total_time ops += sr.total_ops fil_size += network[layer_part].total_filter_size() power = onchip_cost / (time / 200e6) * 1e3 # mW active_pes = int(ops / time) stats[layer] = [] stats[layer].append(power) stats[layer].append(time / 200.e3) # cycles to ms stats[layer].append(ops / 1e6) # to MOPs stats[layer].append(active_pes) stats[layer].append(fil_size / 1e3) # to k # Check. 
stats_ref = {'conv1': [332, 16.5, 421.66, 151, 34.8], # Act PE 154 'conv2': [288, 39.2, 895.79, 135, 307.2], 'conv3': [266, 21.8, 598.1, 156, 884.7], 'conv4': [235, 16.0, 448.6, 156, 663.6], 'conv5': [236, 10.0, 299.0, 156, 442.4], } for layer in stats: success = (0.6 * stats_ref[layer][0] < stats[layer][0] < stats_ref[layer][0]) \ and (0.8 * stats_ref[layer][1] < stats[layer][1] < stats_ref[layer][1]) \ and all(abs(a - b) < 0.1 for a, b in zip(stats[layer][2:], stats_ref[layer][2:])) self.assertTrue(success, 'test_eyeriss_isscc16: ' 'stats diff in layer {}.\n' 'header: {}\n' 'actual: {}\nref: {}' .format(layer, header, stats[layer], stats_ref[layer])) def test_eyeriss_asplos17(self): ''' Reproduce TETRIS ASPLOS'17 paper Figure 8. ''' network = self.alex_net batch_size = 16 ## L-1 configuration. resource = Resource(proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.PROC), dram_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), src_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), dst_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(1, 1), type=NodeRegion.DRAM), dim_array=PhyDim2(16, 16), size_gbuf=576056 // 2, # 576 kB size_regf=1024 // 2, # 1 kB array_bus_width=float('inf'), dram_bandwidth=float('inf'), no_time_mux=False, ) cost = Cost(mac_op=2e-12, mem_hier=(240e-12, 28e-12, 4e-12, 1e-12), # pJ/16-b noc_hop=0, idl_unit=320e-12) nnd = NNDataflow(network, batch_size, resource, cost, self.map_strategy) tops, _ = nnd.schedule_search(self.options) self.assertTrue(tops) dfsch_l1 = tops[0] ## T-16 configuration. 
resource = Resource(proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4), type=NodeRegion.PROC), dram_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(4, 4), type=NodeRegion.DRAM), src_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(4, 4), type=NodeRegion.DRAM), dst_data_region=NodeRegion( origin=PhyDim2(0, 0), dim=PhyDim2(4, 4), type=NodeRegion.DRAM), dim_array=PhyDim2(14, 14), size_gbuf=133032 // 2, # 133 kB size_regf=512 // 2, # 512 B array_bus_width=float('inf'), dram_bandwidth=float('inf'), no_time_mux=False, ) cost = Cost(mac_op=2e-12, mem_hier=(80e-12, 14e-12, 4e-12, 0.6e-12), # pJ/16-b noc_hop=40e-12, idl_unit=200e-12) options = Option(sw_gbuf_bypass=(True, True, True), sw_solve_loopblocking=True, partition_hybrid=True) nnd = NNDataflow(network, batch_size, resource, cost, self.map_strategy) tops, _ = nnd.schedule_search(options) self.assertTrue(tops) dfsch_t16 = tops[0] ## Check results. # Same workload. self.assertAlmostEqual(dfsch_t16.total_ops, dfsch_l1.total_ops) # Performance of T-16 is proportional to PE resource (20% margin). self.assertLess(dfsch_t16.total_time, 1.2 * dfsch_l1.total_time * (16 * 16) / (14 * 14 * 16)) # Energy reduced by > 30%. # self.assertLess(dfsch_t16.total_cost, dfsch_l1.total_cost * 0.7) # With dimension restriction on partitioning, this is slightly violated. self.assertLess(dfsch_t16.total_cost, dfsch_l1.total_cost * 0.72)
class TestScheduling(unittest.TestCase): ''' Tests for Scheduling module. ''' def setUp(self): self.layers = {} self.layers['BASE'] = ConvLayer(8, 16, 28, 3) self.layers['POOL'] = PoolingLayer(16, 28, 2) self.layers['LR'] = LocalRegionLayer(16, 28, nreg=3, sreg=1) self.batch_size = 4 self.cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1), noc_hop=50, unit_static=50) self.resource = Resource( proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4), type=NodeRegion.PROC), data_regions=(NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 1), type=NodeRegion.DATA),), dim_array=PhyDim2(16, 16), size_gbuf=65536, size_regf=64) self.options = Option(partition_hybrid=True, partition_batch=True, partition_ifmaps=True, ntops=10) self.ifmap_layouts = {} part = PartitionScheme(order=(pe.INPP, pe.BATP, pe.OUTP, pe.OFMP), pdims=((1, 2), (2, 1), (1, 2), (2, 1))) for wlkey in self.layers: self.ifmap_layouts[wlkey] = partition.get_ofmap_layout( self.layers[wlkey].input_layer(), self.batch_size, part, self.resource.src_data_region()) def test_valid_args(self): ''' Valid arguments for constructor. ''' schd = Scheduling(self.layers['BASE'], self.batch_size, self.cost, MapStrategyEyeriss) self.assertEqual(schd.layer, self.layers['BASE']) self.assertEqual(schd.batch_size, self.batch_size) self.assertEqual(schd.cost, self.cost) self.assertEqual(schd.map_strategy_class, MapStrategyEyeriss) def test_invalid_layer(self): ''' Invalid layer argument. ''' with self.assertRaisesRegexp(TypeError, 'Scheduling: .*layer.*'): _ = Scheduling((64, 128, 28, 3), self.batch_size, self.cost, MapStrategyEyeriss) def test_invalid_cost(self): ''' Invalid cost argument. ''' with self.assertRaisesRegexp(TypeError, 'Scheduling: .*cost.*'): _ = Scheduling(self.layers['BASE'], self.batch_size, tuple(self.cost), MapStrategyEyeriss) def test_invalid_map_strategy(self): ''' Invalid cost argument. 
''' class _DummyClass(object): # pylint: disable=too-few-public-methods pass with self.assertRaisesRegexp(TypeError, 'Scheduling: .*map_strategy_class.*'): _ = Scheduling(self.layers['BASE'], self.batch_size, self.cost, _DummyClass) def test_schedule_search(self): ''' Schedule search. ''' for wlkey in self.layers: layer = self.layers[wlkey] ifmap_layout = self.ifmap_layouts[wlkey] schd = Scheduling(layer, self.batch_size, self.cost, MapStrategyEyeriss) condition = SchedulingCondition(resource=self.resource, ifmap_layout=ifmap_layout) res = schd.schedule_search(condition, self.options) # Top N. self.assertLessEqual(len(res), self.options.ntops) self.assertTrue(all(isinstance(r, SchedulingResult) for r in res)) for idx in range(len(res) - 1): self.assertLessEqual(res[idx].total_cost, res[idx + 1].total_cost) # Combination of loop blocking and partitioning. for r in res: self.assertEqual(r.total_cost, r.dict_loop['cost'] + r.dict_part['cost']) self.assertEqual(r.dict_loop['ops'], layer.total_ops(self.batch_size)) self.assertSequenceEqual(r.dict_part['total_nhops'], [nh * f for nh, f in zip(r.dict_part['unit_nhops'], r.dict_loop['fetch'][0])]) self.assertEqual(r.dict_part['num_nodes'], self.resource.proc_region.dim.size()) # Ofmap layout. for r in res: self.assertEqual(r.ofmap_layout.frmap.complete_fmap_range() .size(), layer.total_ofmap_size(self.batch_size)) def test_schedule_search_ilayout(self): ''' Invalid ifmap_layout. ''' layer = self.layers['BASE'] ifmap_layout = self.ifmap_layouts['BASE'] schd = Scheduling(layer, self.batch_size, self.cost, MapStrategyEyeriss) # Shift ifmap out of memory region. condition = SchedulingCondition( resource=self.resource, ifmap_layout=ifmap_layout.view(PhyDim2(1, 1))) with self.assertRaisesRegexp(ValueError, 'Scheduling: .*ifmap.*'): _ = schd.schedule_search(condition, self.options) # Not match layer. 
condition = SchedulingCondition( resource=self.resource, ifmap_layout=self.ifmap_layouts['POOL']) with self.assertRaisesRegexp(ValueError, 'Scheduling: .*ifmap.*'): _ = schd.schedule_search(condition, self.options) def test_pernode_sched_cache(self): ''' Per-node scheduling cache. ''' layer = self.layers['BASE'] ifmap_layout = self.ifmap_layouts['BASE'] schd = Scheduling(layer, self.batch_size, self.cost, MapStrategyEyeriss) self.assertEqual(len(schd.pernode_sched_cache), 0) self.assertTupleEqual(schd.cache_stats(), (0, 0)) condition = SchedulingCondition(resource=self.resource, ifmap_layout=ifmap_layout) _ = schd.schedule_search(condition, self.options) h, m = schd.cache_stats() self.assertEqual(len(schd.pernode_sched_cache), m) self.assertEqual(h, 0) n = m _ = schd.schedule_search(condition, self.options) self.assertEqual(len(schd.pernode_sched_cache), n) self.assertTupleEqual(schd.cache_stats(), (n, n)) def test_pernode_sched_cache_key(self): ''' Per-node scheduling cache key must be hash-able. ''' layer = self.layers['BASE'] ifmap_layout = self.ifmap_layouts['BASE'] schd = Scheduling(layer, self.batch_size, self.cost, MapStrategyEyeriss) condition = SchedulingCondition(resource=self.resource, ifmap_layout=ifmap_layout) _ = schd.schedule_search(condition, self.options) h, m = schd.cache_stats() self.assertEqual(h, 0) # Make another instance. rsrc = Resource(**self.resource._asdict()) opts = Option(**self.options._asdict()) self.assertNotEqual(id(rsrc), id(self.resource)) self.assertNotEqual(id(opts), id(self.options)) part = PartitionScheme(order=(pe.BATP, pe.INPP, pe.OUTP, pe.OFMP), pdims=((2, 2), (2, 2), (1, 1), (1, 1))) _ = schd.schedule_search_per_node(part, rsrc, opts) h2, m2 = schd.cache_stats() self.assertEqual(h2, h + 1) self.assertEqual(m2, m)
def eyerissAsplos17(self):
    '''
    Reproduce TETRIS ASPLOS'17 paper Figure 8.

    Schedules `self.mock_net` at batch size 1 with quantized (qint8)
    weights and a value-dependent multiplier cost table, asserts that at
    least one schedule is found, and prints the best schedule's total
    ops, time and cost, its layers, and a per-layer cost breakdown.
    '''
    # Runs on the mock network; the `self.alex_net` variant is disabled.
    network = self.mock_net

    batch_size = 1

    resource = Resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                               type=NodeRegion.PROC),
        dram_region=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 4),
                               type=NodeRegion.DRAM),
        src_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                   dim=PhyDim2(4, 4),
                                   type=NodeRegion.DRAM),
        dst_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                   dim=PhyDim2(4, 4),
                                   type=NodeRegion.DRAM),
        dim_array=PhyDim2(14, 14),
        size_gbuf=133032 // 2,  # 133 kB
        size_regf=512 // 2,  # 512 B
        array_bus_width=float('inf'),
        dram_bandwidth=float('inf'),
        no_time_mux=False,
        num_value_pes=256,
    )

    # model values
    print('converting weights')
    weights_dict = read_weights()
    q_weight_dict = {
        w_layer: quantizeWeights(convertToArray(weights_dict, w_layer),
                                 'qint8')
        for w_layer in ('conv1', 'conv2', 'conv3', 'conv4', 'conv5',
                        'fc6', 'fc7', 'fc8')
    }

    # hardware costs
    mult_cost = readValueMult8Cost()
    print('done converting weights')

    cost = Cost(
        value_control=1.92e-13,
        value_mult=mult_cost,
        mac_op=2e-12,
        adder_cost=(1.178e-5) / 200000000,
        mem_hier=(80e-12, 14e-12, 4e-12, 0.6e-12),  # pj/16-b
        noc_hop=40e-12,
        idl_unit=200e-12,
        my_weights=q_weight_dict,
        mem_cycles=(200, 6, 2, 1))

    options = Option(sw_gbuf_bypass=(True, True, True),
                     sw_solve_loopblocking=True,
                     partition_hybrid=True)

    nnd = NNDataflow(network, batch_size, resource, cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(options)
    self.assertTrue(tops)
    dfsch_t16 = tops[0]

    ## Check results.

    # The comparisons against a baseline schedule (dfsch_l1: same total
    # ops, time within a 20% margin of the PE-resource ratio, energy
    # reduced by > 30%, relaxed to 0.72x with dimension restriction on
    # partitioning) are disabled; no baseline is computed in this
    # method, so the figures are only printed.
    print('t16 ops: {}'.format(dfsch_t16.total_ops))
    print('t16_time: {}'.format(dfsch_t16.total_time))
    print('t16_energy: {}'.format(dfsch_t16.total_cost))
    for i in dfsch_t16:
        print(str(i) + ',')

    ## Check results.

    # Results as cost for each component:
    header = 'ALU, DRAM, Buffer, Array, RF'
    cost_bkdn = {}

    for sched_layer in dfsch_t16:
        layer = str(sched_layer)
        op_cost = 0
        access_cost = [0] * me.NUM

        # Accumulate over every network layer whose name starts with
        # this scheduled layer's name.
        for layer_part in network:
            if not layer_part or not layer_part.startswith(layer):
                continue
            sr = dfsch_t16[layer_part]
            op_cost += sr.total_ops * cost.mac_op
            access_cost = [
                ac + a * c
                for ac, a, c in zip(access_cost, sr.total_accesses,
                                    cost.mem_hier)
            ]

        # To 1e9. Columns follow the order given in `header`.
        cost_bkdn[layer] = [
            op_cost * 1e12 / 1e9,
            access_cost[me.DRAM] * 1e12 / 1e9,
            access_cost[me.GBUF] * 1e12 / 1e9,
            access_cost[me.ITCN] * 1e12 / 1e9,
            access_cost[me.REGF] * 1e12 / 1e9,
        ]

    # Label the columns once, then print one breakdown row per layer.
    print(header)
    for bkdn in cost_bkdn.values():
        print(bkdn)