def test_opt_goal(self):
    ''' Optimization goal. '''
    network = self.alex_net
    batch_size = 8

    resource = self.resource._replace(
        proc_region=NodeRegion(origin=PhyDim2(0, 0),
                               dim=PhyDim2(8, 8),
                               type=NodeRegion.PROC))

    nnd = NNDataflow(network, batch_size, resource, self.cost,
                     self.map_strategy)

    options_e = Option(sw_gbuf_bypass=(True, True, True),
                       sw_solve_loopblocking=True,
                       partition_hybrid=True,
                       partition_batch=True,
                       opt_goal='e',
                       ntops=16)
    tops_e, _ = nnd.schedule_search(options_e)
    self.assertTrue(tops_e)

    options_d = Option(sw_gbuf_bypass=(True, True, True),
                       sw_solve_loopblocking=True,
                       partition_hybrid=True,
                       partition_batch=True,
                       opt_goal='d',
                       ntops=16)
    tops_d, _ = nnd.schedule_search(options_d)
    self.assertTrue(tops_d)

    options_ed = Option(sw_gbuf_bypass=(True, True, True),
                        sw_solve_loopblocking=True,
                        partition_hybrid=True,
                        partition_batch=True,
                        opt_goal='ed',
                        ntops=16)
    tops_ed, _ = nnd.schedule_search(options_ed)
    self.assertTrue(tops_ed)

    self.assertLess(tops_e[0].total_cost, tops_d[0].total_cost)
    self.assertLess(tops_e[0].total_cost, tops_ed[0].total_cost)

    self.assertLess(tops_d[0].total_time, tops_e[0].total_time)
    self.assertLess(tops_d[0].total_time, tops_ed[0].total_time)

    # Sum of the smallest ED may not be the smallest; allow for error.
    self.assertLess(tops_ed[0].total_cost * tops_ed[0].total_time,
                    tops_e[0].total_cost * tops_e[0].total_time * 1.05)
    self.assertLess(tops_ed[0].total_cost * tops_ed[0].total_time,
                    tops_d[0].total_cost * tops_d[0].total_time * 1.05)
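
# The assertions above compare schedules by energy (total_cost), delay
# (total_time), and their energy-delay (ED) product. Below is a minimal
# sketch of how one might rank search results by ED product; this helper is
# illustrative only and assumes nothing beyond the total_cost/total_time
# attributes already used above.
def _min_ed_product(tops):
    ''' Hypothetical helper: return the result with the smallest ED product. '''
    return min(tops, key=lambda t: t.total_cost * t.total_time)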

def test_verbose(self):
    ''' Verbose mode. '''
    network = self.alex_net
    batch_size = 16
    options = Option(sw_gbuf_bypass=(True, True, True),
                     sw_solve_loopblocking=True,
                     verbose=True)
    nnd = NNDataflow(network, batch_size, self.resource, self.cost,
                     self.map_strategy)

    old_stdout = sys.stdout
    old_stderr = sys.stderr

    sys.stdout = stdout = StringIO()
    sys.stderr = stderr = StringIO()

    tops, _ = nnd.schedule_search(options)

    sys.stdout = old_stdout
    sys.stderr = old_stderr

    stdout_value = stdout.getvalue()
    stderr_value = stderr.getvalue()
    stdout.close()
    stderr.close()

    self.assertTrue(tops)

    self.assertFalse(stdout_value)
    for layer in network:
        self.assertIn(layer, stderr_value)

def test_scheduling_failure(self):
    ''' Layer scheduling failure. '''
    network = self.alex_net
    batch_size = 16
    nnd = NNDataflow(network, batch_size, self.resource, self.cost,
                     MapStrategy)

    old_stdout = sys.stdout
    old_stderr = sys.stderr

    sys.stdout = stdout = StringIO()
    sys.stderr = stderr = StringIO()

    with self.assertRaises(NotImplementedError):
        _ = nnd.schedule_search(self.options)

    sys.stdout = old_stdout
    sys.stderr = old_stderr

    stdout_value = stdout.getvalue()
    stderr_value = stderr.getvalue()
    stdout.close()
    stderr.close()

    self.assertFalse(stdout_value)
    self.assertIn('Failed', stderr_value)
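
# test_verbose and test_scheduling_failure both capture stdout/stderr by
# swapping sys.stdout/sys.stderr by hand. Below is a minimal sketch of the
# same capture pattern using the standard library's contextlib; it is an
# alternative illustration, not what the tests above use.
def _captured_output(fn):
    ''' Hypothetical helper: run fn() and return (result, stdout, stderr). '''
    import contextlib
    import io
    out, err = io.StringIO(), io.StringIO()
    with contextlib.redirect_stdout(out), contextlib.redirect_stderr(err):
        result = fn()
    return result, out.getvalue(), err.getvalue()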

def test_eyeriss_isca16(self):
    network = self.net
    batch_size = 16

    nnd = NNDataflow(network, batch_size, self.resource, self.cost,
                     self.map_strategy)
    tops, cache_stats = nnd.schedule_search(self.options)
    if not tops:
        sys.stderr.write("No valid dataflow found!")
        return None

    dfsch = tops[0]

    ## Write results.

    res_map = OrderedDict()

    res_map['net'] = "MLP_L"
    res_map['batch'] = batch_size

    res_map['resource'] = self.resource._asdict()
    res_map['cost'] = self.cost._asdict()
    res_map['options'] = self.options._asdict()

    res_map['cache_stats'] = cache_stats

    stats = stats_dict(dfsch, self.cost)
    for key, val in stats.items():
        res_map[key] = val

    return res_map

def test_pipelining(self):
    ''' Pipelining. '''
    network = self.alex_net
    batch_size = 1

    options = Option(hw_gbuf_save_writeback=True,
                     partition_interlayer=True)

    nnd = NNDataflow(network, batch_size, self.resource, self.cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(options)
    self.assertTrue(tops)

def test_fast_forward_found(self):
    ''' Enter fast forward due to an early-found solution. '''
    network = self.simple_net
    batch_size = 1

    # No time overhead limit.
    options = Option(hw_gbuf_save_writeback=True,
                     partition_interlayer=True,
                     layer_pipeline_time_ovhd=float('inf'))

    nnd = NNDataflow(network, batch_size, self.resource, self.cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(options)
    self.assertTrue(tops)

def test_no_valid_dataflow(self):
    ''' No valid dataflow is found. '''
    # Very small REGF.
    self.resource = Resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0),
                               dim=PhyDim2(4, 4),
                               type=NodeRegion.PROC),
        dram_region=NodeRegion(origin=PhyDim2(0, 0),
                               dim=PhyDim2(1, 1),
                               type=NodeRegion.DRAM),
        src_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                   dim=PhyDim2(4, 4),
                                   type=NodeRegion.DRAM),
        dst_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                   dim=PhyDim2(4, 4),
                                   type=NodeRegion.DRAM),
        dim_array=PhyDim2(16, 16),
        size_gbuf=128 * 1024 // 2,  # 128 kB
        size_regf=2,
        array_bus_width=float('inf'),
        dram_bandwidth=float('inf'),
        no_time_mux=False,
    )

    nnd = NNDataflow(self.alex_net, 4, self.resource, self.cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(self.options)
    self.assertFalse(tops)

    # With inter-layer pipelining.
    options = Option(hw_gbuf_save_writeback=True,
                     partition_interlayer=True)
    tops, _ = nnd.schedule_search(options)
    self.assertFalse(tops)

def test_ext_layer(self):
    ''' With external layers. '''
    network = self.alex_net
    network.add_ext('e0', InputLayer(4, 1))
    network.add('l1', FCLayer(1000, 4))
    network.add('l2', FCLayer(8, 4), prevs=('e0', 'l1'))
    batch_size = 16
    options = Option(sw_gbuf_bypass=(True, True, True),
                     sw_solve_loopblocking=True)

    nnd = NNDataflow(network, batch_size, self.resource, self.cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(options)
    self.assertTrue(tops)

def test_fmap_fwd(self):
    ''' Fmap forward with shared mem sources or both on/off-chip
    destinations. '''
    network = self.complex_net
    batch_size = 16

    # Multiple nodes for spatial pipelining.
    resource = self.resource._replace(
        proc_region=NodeRegion(origin=PhyDim2(0, 0),
                               dim=PhyDim2(8, 8),
                               type=NodeRegion.PROC))

    # No time overhead limit.
    options = Option(hw_gbuf_save_writeback=True,
                     partition_interlayer=True,
                     layer_pipeline_time_ovhd=float('inf'))

    nnd = NNDataflow(network, batch_size, resource, self.cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(options)
    self.assertTrue(tops)

def test_no_valid_dataflow(self):
    ''' No valid dataflow is found. '''
    # Very small REGF.
    self.resource = Resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0),
                               dim=PhyDim2(1, 1),
                               type=NodeRegion.PROC),
        data_regions=(NodeRegion(origin=PhyDim2(0, 0),
                                 dim=PhyDim2(1, 1),
                                 type=NodeRegion.DATA),),
        dim_array=PhyDim2(16, 16),
        size_gbuf=128 * 1024 // 2,  # 128 kB
        size_regf=2,
    )

    nnd = NNDataflow(self.alex_net, 4, self.resource, self.cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(self.options)
    self.assertFalse(tops)

def test_fast_forward_infeasible(self):
    ''' Enter fast forward due to an infeasible constraint. '''
    network = self.simple_net
    batch_size = 1

    # Very small gbuf size. Small fmap tpart is infeasible.
    resource = self.resource._replace(dim_array=PhyDim2(2, 2),
                                      size_gbuf=16)

    options = Option(hw_gbuf_save_writeback=True,
                     partition_interlayer=True)

    nnd = NNDataflow(network, batch_size, resource, self.cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(options)
    self.assertTrue(tops)

    # No pipelining is feasible.
    for dtfl in tops:
        self.assertTupleEqual(dtfl['1'].sched_seq, (0, 0, 0))
        self.assertTupleEqual(dtfl['2'].sched_seq, (1, 0, 0))
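
# Hedged note on the sched_seq assertions above (my reading of the API, not
# stated in this file): sched_seq appears to be a (segment index, spatial
# index, temporal index) tuple, so layers '1' and '2' landing in segments 0
# and 1 means they are scheduled one after another rather than pipelined
# together.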

def test_fast_forward_frontier(self):
    ''' Enter fast forward due to off-frontier. '''
    network = self.simple_net
    batch_size = 16

    # Multiple nodes for spatial pipelining.
    resource = self.resource._replace(
        proc_region=NodeRegion(origin=PhyDim2(0, 0),
                               dim=PhyDim2(8, 8),
                               type=NodeRegion.PROC),
        dim_array=PhyDim2(2, 2),
    )

    # No time overhead limit.
    options = Option(hw_gbuf_save_writeback=True,
                     partition_interlayer=True,
                     layer_pipeline_time_ovhd=float('inf'))

    nnd = NNDataflow(network, batch_size, resource, self.cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(options)
    self.assertTrue(tops)

def test_fast_forward_crit_time(self):
    ''' Enter fast forward due to long critical time. '''
    network = self.simple_net
    batch_size = 1

    # Multiple nodes for spatial pipelining.
    resource = self.resource._replace(
        proc_region=NodeRegion(origin=PhyDim2(0, 0),
                               dim=PhyDim2(8, 8),
                               type=NodeRegion.PROC),
        dim_array=PhyDim2(1, 1),
    )

    # Very strict time overhead limit.
    # At large fmap tpart, utilization decreases and critical time would
    # increase.
    options = Option(hw_gbuf_save_writeback=True,
                     partition_interlayer=True,
                     layer_pipeline_time_ovhd=1e-3)

    nnd = NNDataflow(network, batch_size, resource, self.cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(options)
    self.assertTrue(tops)

def do_scheduling(args):
    '''
    Get optimal scheduling for given problem. Return a result schedule.
    '''

    ## Network.

    network = import_network(args.net)
    batch_size = args.batch

    ## Resource.

    dim_nodes = PhyDim2(*args.nodes)
    dim_array = PhyDim2(*args.array)

    # Sizes of gbuf and regf are in words.
    word = (args.word + 7) // 8
    size_gbuf = args.gbuf // word
    size_regf = args.regf // word

    array_bus_width = args.bus_width // args.word
    if not array_bus_width:
        array_bus_width = float('inf')

    dram_bandwidth = args.dram_bw / word

    proc_region = NodeRegion(dim=dim_nodes, origin=PhyDim2(0, 0),
                             type=NodeRegion.PROC)

    if args.mem_type == '2D':
        # Memory nodes are on two sides.
        data_region = NodeRegion(dim=PhyDim2(2, 2), origin=PhyDim2(0, 0),
                                 dist=dim_nodes - PhyDim2(1, 1),
                                 type=NodeRegion.DRAM)
        assert data_region.rel2abs(PhyDim2(1, 1)) + PhyDim2(1, 1) \
                == proc_region.dim
    elif args.mem_type == '3D':
        # Memory nodes are on the top.
        data_region = NodeRegion(dim=dim_nodes, origin=PhyDim2(0, 0),
                                 type=NodeRegion.DRAM)

    resource = Resource(proc_region=proc_region,
                        dram_region=data_region,
                        src_data_region=data_region,
                        dst_data_region=data_region,
                        dim_array=dim_array,
                        size_gbuf=size_gbuf,
                        size_regf=size_regf,
                        array_bus_width=array_bus_width,
                        dram_bandwidth=dram_bandwidth,
                        no_time_mux=False)

    ## Cost.

    hier_cost = [0] * me.NUM
    hier_cost[me.DRAM] = args.hier_cost[0]
    hier_cost[me.GBUF] = args.hier_cost[1]
    hier_cost[me.ITCN] = args.hier_cost[2]
    hier_cost[me.REGF] = args.hier_cost[3]
    cost = Cost(mac_op=args.op_cost,
                mem_hier=tuple(hier_cost),
                noc_hop=args.hop_cost,
                idl_unit=args.unit_idle_cost)

    ## Options.

    bypass = [True] * de.NUM
    bypass[de.IFM] = 'i' not in args.disable_bypass
    bypass[de.OFM] = 'o' not in args.disable_bypass
    bypass[de.FIL] = 'f' not in args.disable_bypass
    options = Option(
        sw_gbuf_bypass=tuple(bypass),
        sw_solve_loopblocking=args.solve_loopblocking,
        hw_access_forwarding=args.enable_access_forwarding,
        hw_gbuf_sharing=args.enable_gbuf_sharing,
        hw_gbuf_save_writeback=args.enable_save_writeback,
        partition_hybrid=args.hybrid_partition,
        partition_batch=args.batch_partition,
        partition_ifmaps=args.ifmaps_partition,
        partition_interlayer=args.interlayer_partition,
        layer_pipeline_time_ovhd=args.layer_pipeline_time_overhead,
        layer_pipeline_max_degree=args.layer_pipeline_max_degree,
        layer_pipeline_opt=not args.disable_interlayer_opt,
        opt_goal=args.goal.lower(),
        ntops=args.top,
        nprocesses=args.processes,
        verbose=args.verbose)

    ## Search schedules.

    nnd = NNDataflow(network, batch_size, resource, cost,
                     MapStrategyEyeriss)
    tbeg = time.time()
    tops, cache_stats = nnd.schedule_search(options)
    tend = time.time()
    telapsed = tend - tbeg

    if not tops:
        sys.stderr.write('No valid dataflow found.\n')
        return None

    top = tops[0]

    ## Write results.

    res_map = OrderedDict()

    res_map['version'] = get_version(with_local=True)

    res_map['net'] = args.net
    res_map['batch'] = args.batch

    res_map['resource'] = resource._asdict()
    res_map['cost'] = cost._asdict()
    res_map['options'] = options._asdict()

    res_map['cache_stats'] = cache_stats

    res_map['elapsed'] = telapsed

    stats = stats_dict(top, cost)
    for key, val in stats.items():
        res_map[key] = val

    return res_map
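
# Worked example of the word-size conversion above. The gbuf/regf sizes given
# on the command line are in bytes, while Resource takes sizes in words; for
# the 16-bit, 108 kB / 512 B Eyeriss-like configuration used elsewhere in this
# repo the conversion works out as below. Purely illustrative numbers.
def _example_word_sizing():
    ''' Hypothetical helper: byte sizes to word counts for 16-bit data. '''
    word = (16 + 7) // 8            # 2 bytes per 16-bit word
    size_gbuf = 108 * 1024 // word  # 110592 B -> 55296 words
    size_regf = 512 // word         # 512 B -> 256 words
    return word, size_gbuf, size_regf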

def test_eyeriss_asplos17(self):
    ''' Reproduce TETRIS ASPLOS'17 paper Figure 8. '''
    network = self.alex_net
    batch_size = 16

    ## L-1 configuration.

    resource = Resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0),
                               dim=PhyDim2(1, 1),
                               type=NodeRegion.PROC),
        dram_region=NodeRegion(origin=PhyDim2(0, 0),
                               dim=PhyDim2(1, 1),
                               type=NodeRegion.DRAM),
        src_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                   dim=PhyDim2(1, 1),
                                   type=NodeRegion.DRAM),
        dst_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                   dim=PhyDim2(1, 1),
                                   type=NodeRegion.DRAM),
        dim_array=PhyDim2(16, 16),
        size_gbuf=576056 // 2,  # 576 kB
        size_regf=1024 // 2,  # 1 kB
        array_bus_width=float('inf'),
        dram_bandwidth=float('inf'),
        no_time_mux=False,
    )

    cost = Cost(mac_op=2e-12,
                mem_hier=(240e-12, 28e-12, 4e-12, 1e-12),  # pJ/16-b
                noc_hop=0,
                idl_unit=320e-12)

    nnd = NNDataflow(network, batch_size, resource, cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(self.options)
    self.assertTrue(tops)
    dfsch_l1 = tops[0]

    ## T-16 configuration.

    resource = Resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0),
                               dim=PhyDim2(4, 4),
                               type=NodeRegion.PROC),
        dram_region=NodeRegion(origin=PhyDim2(0, 0),
                               dim=PhyDim2(4, 4),
                               type=NodeRegion.DRAM),
        src_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                   dim=PhyDim2(4, 4),
                                   type=NodeRegion.DRAM),
        dst_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                   dim=PhyDim2(4, 4),
                                   type=NodeRegion.DRAM),
        dim_array=PhyDim2(14, 14),
        size_gbuf=133032 // 2,  # 133 kB
        size_regf=512 // 2,  # 512 B
        array_bus_width=float('inf'),
        dram_bandwidth=float('inf'),
        no_time_mux=False,
    )

    cost = Cost(mac_op=2e-12,
                mem_hier=(80e-12, 14e-12, 4e-12, 0.6e-12),  # pJ/16-b
                noc_hop=40e-12,
                idl_unit=200e-12)

    options = Option(sw_gbuf_bypass=(True, True, True),
                     sw_solve_loopblocking=True,
                     partition_hybrid=True)

    nnd = NNDataflow(network, batch_size, resource, cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(options)
    self.assertTrue(tops)
    dfsch_t16 = tops[0]

    ## Check results.

    # Same workload.
    self.assertAlmostEqual(dfsch_t16.total_ops, dfsch_l1.total_ops)

    # Performance of T-16 is proportional to PE resource (20% margin).
    self.assertLess(dfsch_t16.total_time,
                    1.2 * dfsch_l1.total_time * (16 * 16) / (14 * 14 * 16))

    # Energy reduced by > 30%.
    # self.assertLess(dfsch_t16.total_cost, dfsch_l1.total_cost * 0.7)
    # With dimension restriction on partitioning, this is slightly violated.
    self.assertLess(dfsch_t16.total_cost, dfsch_l1.total_cost * 0.72)
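
# Worked arithmetic for the performance bound above: L-1 has one 16x16 array
# (256 PEs) while T-16 has sixteen 14x14 arrays (3136 PEs), so an ideal
# speedup is 3136 / 256 = 12.25x; the assertion allows a 20% margin on top of
# that. This restates the expression in the test, nothing more.
def _example_t16_time_bound(l1_time):
    ''' Hypothetical helper: upper bound on T-16 total time given L-1 time. '''
    pe_ratio = (16 * 16) / (14 * 14 * 16)  # = 1 / 12.25
    return 1.2 * l1_time * pe_ratio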

def test_eyeriss_isscc16(self):
    ''' Reproduce Eyeriss ISSCC'16 paper Fig. 14.5.6, JSSC'17 paper
    Table V. '''
    network = self.alex_net
    batch_size = 4

    resource = Resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0),
                               dim=PhyDim2(1, 1),
                               type=NodeRegion.PROC),
        dram_region=NodeRegion(origin=PhyDim2(0, 0),
                               dim=PhyDim2(1, 1),
                               type=NodeRegion.DRAM),
        src_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                   dim=PhyDim2(1, 1),
                                   type=NodeRegion.DRAM),
        dst_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                   dim=PhyDim2(1, 1),
                                   type=NodeRegion.DRAM),
        dim_array=PhyDim2(12, 14),
        size_gbuf=108 * 1024 // 2,  # 108 kB
        size_regf=261,  # 225 + 12 + 24
        array_bus_width=float('inf'),
        dram_bandwidth=float('inf'),
        no_time_mux=False,
    )

    cost = Cost(mac_op=2e-12,
                mem_hier=(460e-12, 15e-12, 4e-12, 1e-12),  # pJ/16-b
                noc_hop=0,
                idl_unit=30e-3 / 200e6)  # 30 mW GBUF + REGF

    nnd = NNDataflow(network, batch_size, resource, cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(self.options)
    self.assertTrue(tops)
    dfsch = tops[0]

    ## Check results.

    # Results as stats of the rows in the table.
    header = 'Power, Processing Latency, Ops, Active PEs, Filter size'
    stats = {}
    for layer in ['conv{}'.format(i) for i in range(1, 6)]:
        onchip_cost = 0
        time = 0
        ops = 0
        fil_size = 0
        for layer_part in network:
            if not layer_part or not layer_part.startswith(layer):
                continue
            sr = dfsch[layer_part]
            onchip_cost += sr.total_cost \
                    - sr.total_accesses[me.DRAM] * cost.mem_hier[me.DRAM]
            time += sr.total_time
            ops += sr.total_ops
            fil_size += network[layer_part].total_filter_size()
        power = onchip_cost / (time / 200e6) * 1e3  # mW
        active_pes = int(ops / time)

        stats[layer] = []
        stats[layer].append(power)
        stats[layer].append(time / 200.e3)  # cycles to ms
        stats[layer].append(ops / 1e6)  # to MOPs
        stats[layer].append(active_pes)
        stats[layer].append(fil_size / 1e3)  # to k

    # Check.
    stats_ref = {'conv1': [332, 16.5, 421.66, 151, 34.8],  # Act PE 154
                 'conv2': [288, 39.2, 895.79, 135, 307.2],
                 'conv3': [266, 21.8, 598.1, 156, 884.7],
                 'conv4': [235, 16.0, 448.6, 156, 663.6],
                 'conv5': [236, 10.0, 299.0, 156, 442.4],
                }

    for layer in stats:
        success = (0.6 * stats_ref[layer][0]
                   < stats[layer][0]
                   < stats_ref[layer][0]) \
                and (0.8 * stats_ref[layer][1]
                     < stats[layer][1]
                     < stats_ref[layer][1]) \
                and all(abs(a - b) < 0.1 for a, b
                        in zip(stats[layer][2:], stats_ref[layer][2:]))
        self.assertTrue(success,
                        'test_eyeriss_isscc16: '
                        'stats diff in layer {}.\n'
                        'header: {}\n'
                        'actual: {}\nref: {}'
                        .format(layer, header, stats[layer],
                                stats_ref[layer]))
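
# The power figure above is on-chip energy (total cost minus DRAM access
# cost) divided by runtime at the 200 MHz clock noted in the test, converted
# from W to mW. Below is a minimal sketch of that conversion; the default
# frequency is taken from the comment above, not from any API.
def _example_power_mw(onchip_cost_j, cycles, freq_hz=200e6):
    ''' Hypothetical helper: on-chip energy (J) and cycle count to average mW. '''
    runtime_s = cycles / freq_hz
    return onchip_cost_j / runtime_s * 1e3  # W -> mW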

def test_eyeriss_isca16(self):
    ''' Reproduce Eyeriss ISCA'16 paper Fig. 10. '''
    network = self.alex_net
    batch_size = 16

    nnd = NNDataflow(network, batch_size, self.resource, self.cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(self.options)
    self.assertTrue(tops)
    dfsch = tops[0]

    ## Check results.

    # Results as cost for each component:
    header = 'ALU, DRAM, Buffer, Array, RF'
    cost_bkdn = {}
    for layer in ['conv{}'.format(i) for i in range(1, 6)] \
            + ['fc{}'.format(i) for i in range(1, 4)]:
        op_cost = 0
        access_cost = [0] * me.NUM
        for layer_part in network:
            if not layer_part or not layer_part.startswith(layer):
                continue
            sr = dfsch[layer_part]
            op_cost += sr.total_ops * self.cost.mac_op
            access_cost = [ac + a * c for ac, a, c
                           in zip(access_cost, sr.total_accesses,
                                  self.cost.mem_hier)]

        cost_bkdn[layer] = []
        # To 1e9.
        cost_bkdn[layer].append(op_cost / 1e9)
        cost_bkdn[layer].append(access_cost[me.DRAM] / 1e9)
        cost_bkdn[layer].append(access_cost[me.GBUF] / 1e9)
        cost_bkdn[layer].append(access_cost[me.ITCN] / 1e9)
        cost_bkdn[layer].append(access_cost[me.REGF] / 1e9)

    # Check the major parts: ALU, DRAM, RF.
    major_cost_bkdn_ref = {'conv1': [1.69, 2.46, 6.75],
                           'conv2': [3.58, 2.27, 14.33],
                           'conv3': [2.39, 2.02, 9.57],
                           'conv4': [1.79, 1.57, 7.18],
                           'conv5': [1.20, 1.05, 4.78],
                           'fc1': [0.60, 7.78, 2.42],
                           'fc2': [0.27, 3.39, 1.07],
                           'fc3': [0.07, 0.84, 0.26],
                          }

    for layer in cost_bkdn:
        success = all(abs(a - b) < 0.1 for a, b
                      in zip(cost_bkdn[layer][:2] + cost_bkdn[layer][-1:],
                             major_cost_bkdn_ref[layer]))
        self.assertTrue(success,
                        'test_eyeriss_isca16: '
                        'ALU, DRAM, RF cost diff in layer {}.\n'
                        'header: {}\n'
                        'actual: {}\nref: {}'
                        .format(layer, header, cost_bkdn[layer],
                                major_cost_bkdn_ref[layer]))

def eyerissAsplos17(self):
    ''' Reproduce TETRIS ASPLOS'17 paper Figure 8. '''
    network = self.alex_net
    batch_size = 16

    resource = Resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0),
                               dim=PhyDim2(4, 4),
                               type=NodeRegion.PROC),
        dram_region=NodeRegion(origin=PhyDim2(0, 0),
                               dim=PhyDim2(4, 4),
                               type=NodeRegion.DRAM),
        src_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                   dim=PhyDim2(4, 4),
                                   type=NodeRegion.DRAM),
        dst_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                   dim=PhyDim2(4, 4),
                                   type=NodeRegion.DRAM),
        dim_array=PhyDim2(14, 14),
        size_gbuf=133032 // 2,  # 133 kB
        size_regf=512 // 2,  # 512 B
        array_bus_width=float('inf'),
        dram_bandwidth=float('inf'),
        no_time_mux=False,
    )

    cost = Cost(mac_op=2e-12,
                mem_hier=(80e-12, 14e-12, 4e-12, 0.6e-12),  # pJ/16-b
                noc_hop=40e-12,
                idl_unit=200e-12)

    options = Option(sw_gbuf_bypass=(True, True, True),
                     sw_solve_loopblocking=True,
                     partition_hybrid=True)

    pdb.set_trace()

    nnd = NNDataflow(network, batch_size, resource, cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(options)
    self.assertTrue(tops)
    dfsch_t16 = tops[0]

    ## Check results.

    # Same workload.
    # self.assertAlmostEqual(dfsch_t16.total_ops, dfsch_l1.total_ops)
    print('t16 ops: {}'.format(dfsch_t16.total_ops))

    # Performance of T-16 is proportional to PE resource (20% margin).
    # self.assertLess(dfsch_t16.total_time,
    #                 1.2 * dfsch_l1.total_time * (16 * 16) / (14 * 14 * 16))
    print('t16_time: {}'.format(dfsch_t16.total_time))

    # Energy reduced by > 30%.
    # self.assertLess(dfsch_t16.total_cost, dfsch_l1.total_cost * 0.7)
    # With dimension restriction on partitioning, this is slightly violated.
    # self.assertLess(dfsch_t16.total_cost, dfsch_l1.total_cost * 0.72)
    print('t16_energy: {}'.format(dfsch_t16.total_cost))

    for i in dfsch_t16:
        print(str(i) + ',')

    ## Check results.

    # Results as cost for each component:
    header = 'ALU, DRAM, Buffer, Array, RF'
    cost_bkdn = {}
    for layer in dfsch_t16:
        layer = str(layer)
        op_cost = 0
        access_cost = [0] * me.NUM
        for layer_part in network:
            if not layer_part or not layer_part.startswith(layer):
                continue
            sr = dfsch_t16[layer_part]
            op_cost += sr.total_ops * cost.mac_op
            access_cost = [ac + a * c for ac, a, c
                           in zip(access_cost, sr.total_accesses,
                                  cost.mem_hier)]

        cost_bkdn[layer] = []
        # To 1e9.
        cost_bkdn[layer].append(op_cost * 1e12 / 1e9)
        cost_bkdn[layer].append(access_cost[me.DRAM] * 1e12 / 1e9)
        cost_bkdn[layer].append(access_cost[me.GBUF] * 1e12 / 1e9)
        cost_bkdn[layer].append(access_cost[me.ITCN] * 1e12 / 1e9)
        cost_bkdn[layer].append(access_cost[me.REGF] * 1e12 / 1e9)

    for layer in cost_bkdn:
        print(cost_bkdn[layer])

def do_scheduling(args):
    '''
    Get optimal scheduling for given problem. Return a result schedule.
    '''

    ## Network.

    network = import_network(args.net)
    batch_size = args.batch

    ## Resource.

    dim_nodes = PhyDim2(*args.nodes)
    dim_array = PhyDim2(*args.array)
    # Sizes of gbuf and regf are in words.
    word = (args.word + 7) // 8
    size_gbuf = args.gbuf // word
    size_regf = args.regf // word

    proc_region = NodeRegion(dim=dim_nodes, origin=PhyDim2(0, 0),
                             type=NodeRegion.PROC)
    if args.mem_type == '2D':
        # Memory nodes are on two sides.
        data_regions = (NodeRegion(dim=PhyDim2(h=dim_nodes.h, w=1),
                                   origin=PhyDim2(h=0, w=0),
                                   type=NodeRegion.DATA),
                        NodeRegion(dim=PhyDim2(h=dim_nodes.h, w=1),
                                   origin=PhyDim2(h=0, w=dim_nodes.w - 1),
                                   type=NodeRegion.DATA))
    elif args.mem_type == '3D':
        # All nodes have memory.
        data_regions = (NodeRegion(dim=dim_nodes, origin=PhyDim2(0, 0),
                                   type=NodeRegion.DATA),)

    resource = Resource(proc_region=proc_region,
                        data_regions=data_regions,
                        dim_array=dim_array,
                        size_gbuf=size_gbuf,
                        size_regf=size_regf)

    ## Cost.

    hier_cost = [0] * me.NUM
    hier_cost[me.DRAM] = args.hier_cost[0]
    hier_cost[me.GBUF] = args.hier_cost[1]
    hier_cost[me.ITCN] = args.hier_cost[2]
    hier_cost[me.REGF] = args.hier_cost[3]
    cost = Cost(mac_op=args.op_cost,
                mem_hier=tuple(hier_cost),
                noc_hop=args.hop_cost,
                unit_static=args.unit_static_cost)

    ## Options.

    bypass = [True] * de.NUM
    bypass[de.IFM] = 'i' not in args.disable_bypass
    bypass[de.OFM] = 'o' not in args.disable_bypass
    bypass[de.FIL] = 'f' not in args.disable_bypass
    options = Option(sw_gbuf_bypass=tuple(bypass),
                     sw_solve_loopblocking=args.solve_loopblocking,
                     partition_hybrid=args.hybrid_partition,
                     partition_batch=args.batch_partition,
                     partition_ifmaps=args.ifmaps_partition,
                     ntops=args.top,
                     nprocesses=args.processes,
                     verbose=args.verbose)

    ## Search schedules.

    nnd = NNDataflow(network, batch_size, resource, cost,
                     MapStrategyEyeriss)
    tops, cache_stats = nnd.schedule_search(options)

    if not tops:
        sys.stderr.write('No valid dataflow found.\n')
        return None

    top = tops[0]

    ## Write results.

    res_map = OrderedDict()

    res_map['version'] = get_version(with_local=True)

    res_map['net'] = args.net
    res_map['batch'] = args.batch

    res_map['resource'] = resource._asdict()
    res_map['cost'] = cost._asdict()
    res_map['options'] = options._asdict()

    res_map['cache_stats'] = cache_stats

    stats = stats_dict(top, cost)
    for key, val in stats.items():
        res_map[key] = val

    return res_map

def eyerissAsplos17(self):
    ''' Reproduce TETRIS ASPLOS'17 paper Figure 8. '''
    # network = self.alex_net
    network = self.mock_net
    batch_size = 1

    resource = Resource(
        proc_region=NodeRegion(origin=PhyDim2(0, 0),
                               dim=PhyDim2(4, 4),
                               type=NodeRegion.PROC),
        dram_region=NodeRegion(origin=PhyDim2(0, 0),
                               dim=PhyDim2(4, 4),
                               type=NodeRegion.DRAM),
        src_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                   dim=PhyDim2(4, 4),
                                   type=NodeRegion.DRAM),
        dst_data_region=NodeRegion(origin=PhyDim2(0, 0),
                                   dim=PhyDim2(4, 4),
                                   type=NodeRegion.DRAM),
        dim_array=PhyDim2(14, 14),
        size_gbuf=133032 // 2,  # 133 kB
        size_regf=512 // 2,  # 512 B
        array_bus_width=float('inf'),
        dram_bandwidth=float('inf'),
        no_time_mux=False,
        num_value_pes=256,
    )

    # Model values.
    print('converting weights')
    q_weight_dict = {}
    weights_dict = read_weights()
    for w_layer in ['conv1', 'conv2', 'conv3', 'conv4', 'conv5',
                    'fc6', 'fc7', 'fc8']:
        array = convertToArray(weights_dict, w_layer)
        array_qint8 = quantizeWeights(array, 'qint8')
        q_weight_dict[w_layer] = array_qint8
        # print('''Hey num weights in conv1 are {} '''.format(len(array_qint8)))

    # Hardware costs.
    mult_cost = readValueMult8Cost()
    # control_cost = readValueControl8Cost()
    print('done converting weights')

    # with open('weights.pickle', 'wb') as f:
    #     pickle.dump(q_weight_dict, f)

    # counter = 0
    # c = 0
    # for m in mult_cost.keys():
    #     c += mult_cost[m]
    #     counter += 1
    # ave = c / counter
    # print('{} '.format(counter))
    # print('average = {}'.format(ave))

    # print('conv3 weights are')
    # for w in q_weight_dict['conv1']:
    #     print(w)
    # exit()

    cost = Cost(value_control=1.92e-13,
                value_mult=mult_cost,
                mac_op=2e-12,
                adder_cost=(1.178e-5) / 200000000,
                mem_hier=(80e-12, 14e-12, 4e-12, 0.6e-12),  # pJ/16-b
                noc_hop=40e-12,
                idl_unit=200e-12,
                my_weights=q_weight_dict,
                mem_cycles=(200, 6, 2, 1))
    # cost = Cost(value_control=control_cost,
    #             value_mult=mult_cost,
    #             mac_op=2e-12,
    #             mem_hier=(80e-12, 14e-12, 4e-12, 0.6e-12),  # pJ/16-b
    #             noc_hop=40e-12,
    #             idl_unit=200e-12)

    options = Option(sw_gbuf_bypass=(True, True, True),
                     sw_solve_loopblocking=True,
                     partition_hybrid=True)

    # pdb.set_trace()
    nnd = NNDataflow(network, batch_size, resource, cost,
                     self.map_strategy)
    tops, _ = nnd.schedule_search(options)
    self.assertTrue(tops)
    dfsch_t16 = tops[0]

    ## Check results.

    # Same workload.
    # self.assertAlmostEqual(dfsch_t16.total_ops, dfsch_l1.total_ops)
    print('t16 ops: {}'.format(dfsch_t16.total_ops))

    # Performance of T-16 is proportional to PE resource (20% margin).
    # self.assertLess(dfsch_t16.total_time,
    #                 1.2 * dfsch_l1.total_time * (16 * 16) / (14 * 14 * 16))
    print('t16_time: {}'.format(dfsch_t16.total_time))

    # Energy reduced by > 30%.
    # self.assertLess(dfsch_t16.total_cost, dfsch_l1.total_cost * 0.7)
    # With dimension restriction on partitioning, this is slightly violated.
    # self.assertLess(dfsch_t16.total_cost, dfsch_l1.total_cost * 0.72)
    print('t16_energy: {}'.format(dfsch_t16.total_cost))

    for i in dfsch_t16:
        print(str(i) + ',')

    ## Check results.

    # Results as cost for each component:
    header = 'ALU, DRAM, Buffer, Array, RF'
    cost_bkdn = {}
    for layer in dfsch_t16:
        layer = str(layer)
        op_cost = 0
        access_cost = [0] * me.NUM
        for layer_part in network:
            if not layer_part or not layer_part.startswith(layer):
                continue
            sr = dfsch_t16[layer_part]
            op_cost += sr.total_ops * cost.mac_op
            access_cost = [ac + a * c for ac, a, c
                           in zip(access_cost, sr.total_accesses,
                                  cost.mem_hier)]

        cost_bkdn[layer] = []
        # To 1e9.
        cost_bkdn[layer].append(op_cost * 1e12 / 1e9)
        cost_bkdn[layer].append(access_cost[me.DRAM] * 1e12 / 1e9)
        cost_bkdn[layer].append(access_cost[me.GBUF] * 1e12 / 1e9)
        cost_bkdn[layer].append(access_cost[me.ITCN] * 1e12 / 1e9)
        cost_bkdn[layer].append(access_cost[me.REGF] * 1e12 / 1e9)

    for layer in cost_bkdn:
        print(cost_bkdn[layer])