def __str__(self):
    strbuf = 'NSWG: \n'
    strbuf += '{0:>8} {1:>8}\n'.format('MatrixH', 'MatrixW')
    for i in range(len(self.net.layers)):
        if lb.isMatrixLayer(self.net.layers[i]) or lfpga.isFPGAMatrixLayer(
                self.net.layers[i]):
            strbuf += '{0:>8} {1:>8}\n'.format(str(self.matrixH[i]),
                                               str(self.matrixW[i]))
    strbuf += '\n'
    strbuf += '{0:>8} {1:>8}\n'.format('Synaptic', 'Neuron')
    for i in range(len(self.net.layers)):
        if lb.isMatrixLayer(self.net.layers[i]) or lfpga.isFPGAMatrixLayer(
                self.net.layers[i]):
            strbuf += '{0:>8} {1:>8}\n'.format(self.synapse_fold[i],
                                               self.neuron_fold[i])
    strbuf += '\n'
    strbuf += '{0:>16} {1:>20} {2:>20} {3:>20} {4:>20}\n'.format(
        'Initial Buffer', 'Write Block Cycles', 'Read Block Cycles',
        'Total Cycles', 'Input Multiplier')
    # buffer statistics are only tracked for conv (sliding-window) and
    # FPGA matrix layers, hence the different condition here
    for i in range(len(self.net.layers)):
        if lb.isConvLayer(self.net.layers[i]) or lfpga.isFPGAMatrixLayer(
                self.net.layers[i]):
            strbuf += '{0:>16} {1:>20} {2:>20} {3:>20} {4:>20}\n'.format(
                self.initial_buffer[i], self.write_block_cycles[i],
                self.read_block_cycles[i], self.total_cycles[i],
                self.input_multiplier[i])
    return strbuf
def print_topology(self):
    print "\nNetwork Topology: "
    print '{0:>35} {1:>10} {2:>10} {3:>10} {4:>10} {5:>8} {6:>8} {7:>8}'.format(
        'NAME', 'idx', 'out_dim', 'filter_dim', 'in_chan', 'out_chan',
        'stride', 'in_dim')
    for i in range(len(self.net.layers)):
        if lb.isMatrixLayer(self.net.layers[i]) or lfpga.isFPGAMatrixLayer(
                self.net.layers[i]):
            print '{0:>35} {1:>10} {2:>10} {3:>10} {4:>10} {5:>8} {6:>8} {7:>8}'.format(
                self.net.layers[i].get_type(), i,
                self.net.layers[i].get_out_dim(),
                self.net.layers[i].get_filter_dim(),
                self.net.layers[i].getInputSize(),
                self.net.layers[i].getOutputSize(),
                self.net.layers[i].get_stride(),
                self.net.layers[i].get_in_dim())
    print ""
def ops_per_cycle(self, layer_idx):
    if lb.isMatrixLayer(self.net.layers[layer_idx]) or lfpga.isFPGAMatrixLayer(
            self.net.layers[layer_idx]):
        # factor of 2: each MAC counts as two ops (multiply + accumulate)
        return self.SIMD[layer_idx] * self.PE[layer_idx] * self.MMV[layer_idx] * 2
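# Worked example for ops_per_cycle() with assumed folding factors (these
# values are illustrative, not defaults from this code): SIMD=32 input
# lanes, PE=16 processing elements and MMV=1 output vectors in flight
# give 32 * 16 * 1 = 512 MACs, i.e. 1024 ops per cycle.
def _example_ops_per_cycle():
    simd, pe, mmv = 32, 16, 1
    return simd * pe * mmv * 2  # one MAC counts as 2 ops -> 1024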
def print_folding_factors(self):
    print "\nFolding factors: "
    print '{0:>35} {1:>8} {2:>5} {3:>5} {4:>5}'.format('NAME', 'idx', 'SIMD',
                                                       'PE', 'MMV')
    for i in range(len(self.net.layers)):
        if lb.isMatrixLayer(self.net.layers[i]) or lfpga.isFPGAMatrixLayer(
                self.net.layers[i]):
            print '{0:>35} {1:>8} {2:>5} {3:>5} {4:>5}'.format(
                self.net.layers[i].get_type(), i, self.SIMD[i], self.PE[i],
                self.MMV[i])
    print ""
def passFwdPropagateLinear(pipeline):
    "Move linear layers past matrix and pooling layers."
    inStages = pipeline
    inStages.reverse()
    numChanges = 0
    ret = []
    while len(inStages) > 1:
        layerA = inStages.pop()
        layerB = inStages.pop()
        if lb.isLinearLayer(layerA) and lb.isMatrixLayer(layerB):
            # Move the scalar ax+b to after the matrix layer Wx.
            # Originally we compute W(ax+b) = aWx + Wb; the desired form is
            # a(Wx) + Wb, i.e. the matrix layer followed by a new linear
            # layer with scale a and shift Wb.
            # Repeat a and b to make appropriately-sized vectors.
            a = layerA.A
            b = layerA.B
            W = layerB.W
            matrixLayerOutSize = W.shape[0]
            scaleNew = a * np.ones(matrixLayerOutSize)
            shiftNew = np.dot(W, b * np.ones(W.shape[1]))
            ret += [layerB, lb.LinearLayer(scaleNew, shiftNew)]
            numChanges += 1
        elif lb.isLinearLayer(layerA) and lb.isPoolingLayer(layerB):
            # TODO do we need to check layerA.A < 0 and maxpooling here?
            ret += [layerB, layerA]
            numChanges += 1
        else:
            ret += [layerA]
            inStages.append(layerB)
    # pop final element, if any left
    if len(inStages) == 1:
        ret += [inStages.pop()]
    return (ret, numChanges)
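# Quick numerical check of the algebra behind passFwdPropagateLinear
# (shapes and values are arbitrary, chosen only for illustration):
# W.dot(a*x + b) must equal a * W.dot(x) + W.dot(b * ones).
def _check_linear_past_matrix():
    import numpy as np
    W = np.arange(6.0).reshape(2, 3)   # stand-in 2x3 matrix-layer weights
    a, b = 0.5, 1.25                   # scalar scale and shift
    x = np.array([1.0, -2.0, 3.0])
    lhs = W.dot(a * x + b)
    rhs = a * W.dot(x) + W.dot(b * np.ones(3))
    assert np.allclose(lhs, rhs)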
def print_hardware_cost(self):
    print "\nHardware Cost:"
    print '{0:>35} {1:>20} {2:>20} {3:>20} {4:>20} {5:>20}'.format(
        'Layer', 'idx', 'Input BRAMs', 'Weight BRAMs', 'Total LUTs',
        'Total BRAMs')
    total_input_brams = 0
    total_weights_brams = 0
    total_luts = 0
    total_brams = 0
    for i in range(len(self.net.layers)):
        if lb.isMatrixLayer(self.net.layers[i]) or lfpga.isFPGAMatrixLayer(
                self.net.layers[i]):
            # compute the per-layer costs once and reuse them
            brams = self.bram_cost(i)
            luts = self.lut_cost(i)
            print '{0:>35} {1:>20} {2:>20} {3:>20} {4:>20} {5:>20}'.format(
                self.net.layers[i].get_type(), i, brams[0], brams[1],
                luts, sum(brams))
            total_input_brams += brams[0]
            total_weights_brams += brams[1]
            total_luts += luts
            total_brams += sum(brams)
    print '{0:>35} {1:>20} {2:>20} {3:>20} {4:>20} {5:>20}'.format(
        "Totals", "ALL", total_input_brams, total_weights_brams,
        total_luts, total_brams)
    print ""
def print_cycles(self):
    print "\nCycles per layer: "
    layer_cycles = self.calculate_layer_cycles()  # same as estimated MVC
    print '{0:>35} {1:>8} {2:>10} {3:>10}'.format('NAME', 'idx', 'ops/layer',
                                                  'MVC')
    for i in range(len(self.net.layers)):
        if lb.isMatrixLayer(self.net.layers[i]) or lfpga.isFPGAMatrixLayer(
                self.net.layers[i]):
            print '{0:>35} {1:>8} {2:>10} {3:>10}'.format(
                self.net.layers[i].get_type(), i,
                self.net.ops_per_layer(self.net.layers[i]), layer_cycles[i])
    print ""
def find_slowest_layer(self):
    """Find the worst-case (slowest) layer, returned as an index into layers."""
    slowest_layer = self.find_first_matrix_layer()
    cycles = self.calculate_layer_cycles()
    for idx, cycle in enumerate(cycles):
        if cycle > cycles[slowest_layer] and (
                lb.isMatrixLayer(self.net.layers[idx])
                or lfpga.isFPGAMatrixLayer(self.net.layers[idx])):
            slowest_layer = idx
    return slowest_layer
def find_first_matrix_layer(self):
    first = -1
    for idx, layer in enumerate(self.net.layers):
        if lb.isMatrixLayer(layer) or lfpga.isFPGAMatrixLayer(layer):
            first = idx
            break
    assert first != -1
    return first
def directlyQuantizeLayer(layer, bits):
    "Apply direct quantization to given layer; returns [quantized layer, scaling layer]."
    assert lb.isMatrixLayer(layer)
    qlayer = copy.deepcopy(layer)
    (Wint, alpha) = qnt.quantize_matrix(qlayer.W, bits)
    qlayer.W = Wint
    qlayer.wbits = bits
    # follow-on linear layer restores the quantization scale alpha
    slayer = lb.LinearLayer(A=alpha, B=np.zeros(alpha.shape))
    return [qlayer, slayer]
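# Sketch of the decomposition directlyQuantizeLayer relies on, shown with a
# hand-rolled per-row uniform quantizer. This is an assumed model of what
# qnt.quantize_matrix returns (integer weights Wint and a scale alpha with
# W ~= alpha * Wint), not the project's actual implementation.
def _example_direct_quantization(bits=4):
    import numpy as np
    W = np.random.randn(4, 8)
    levels = 2.0 ** (bits - 1) - 1
    alpha = np.abs(W).max(axis=1) / levels   # per-output-row scale
    Wint = np.round(W / alpha[:, None])      # integer weight matrix
    x = np.random.randn(8)
    # integer matmul followed by the scaling layer approximates the
    # original float matmul; return the worst-case error
    return np.abs(W.dot(x) - alpha * Wint.dot(x)).max()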
def calculate_matrix_cycles(self):
    cycles = []
    for idx, layer in enumerate(self.net.layers):
        if lb.isMatrixLayer(layer) or lfpga.isFPGAMatrixLayer(layer):
            cycles.append(self.net.ops_per_layer(layer) /
                          (self.ops_per_cycle(idx) * layer.get_parallel()))
        else:
            cycles.append(0)
    return cycles
def ops_per_layer(self, layer):
    """Operations performed by a layer per inference.

    Matrix (conv/fc) layers report layer.getNumOps(), which corresponds to
    parallel * 2 * out_dim * out_dim * filter_dim * filter_dim *
    in_channels * out_channels. Other layers count as zero here, although a
    pooling layer nominally performs
    out_dim * out_dim * filter_dim * filter_dim comparisons.
    """
    if layers.isMatrixLayer(layer) or lfpga.isFPGAMatrixLayer(layer):
        return layer.getNumOps()
    return 0
def calculate_neural_folding(self):
    self.synapse_fold = self._zeros()
    self.neuron_fold = self._zeros()
    for i in range(len(self.net.layers)):
        layer = self.net.layers[i]
        if lb.isMatrixLayer(layer) or lfpga.isFPGAMatrixLayer(layer):
            self.synapse_fold[i] = self.matrixH[i] / self.perf.SIMD[i]
            self.neuron_fold[i] = self.matrixW[i] / self.perf.PE[i]
def calculate_layer_cycles(self):
    """For each layer, calculate the cycles required (same as estimated MVC).

    Formula: ops_per_layer() / (ops_per_cycle() * parallel_per_layer())."""
    layer_cycles = []
    for idx, layer in enumerate(self.net.layers):
        if lb.isMatrixLayer(layer) or lfpga.isFPGAMatrixLayer(layer):
            layer_cycles.append(
                self.net.ops_per_layer(layer) /
                (self.ops_per_cycle(idx) * self.net.parallel_per_layer(layer)))
        else:
            layer_cycles.append(0)
    return layer_cycles
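# Worked example for calculate_layer_cycles() with made-up numbers (not
# from any shipped network): a conv layer producing 28x28 outputs with 3x3
# filters and 64 in/out channels performs 2 * 28*28 * 3*3 * 64 * 64 =
# 57,802,752 ops; folded to SIMD=32, PE=16, MMV=1 (1024 ops/cycle) with
# parallel=1, it needs 56,448 cycles.
def _example_layer_cycles():
    ops = 2 * 28 * 28 * 3 * 3 * 64 * 64   # ops_per_layer for the conv above
    ops_per_cycle = 32 * 16 * 1 * 2       # SIMD * PE * MMV * 2
    return ops / ops_per_cycle            # -> 56448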
def calculate_matrix_sizes(self):
    self.matrixH = self._zeros()
    self.matrixW = self._zeros()
    for i in range(len(self.net.layers)):
        if lb.isMatrixLayer(self.net.layers[i]) or lfpga.isFPGAMatrixLayer(
                self.net.layers[i]):
            # lowered weight matrix: one column per output channel, one row
            # per (input channel, filter pixel) combination
            self.matrixW[i] = self.net.layers[i].getOutputSize()
            self.matrixH[i] = (self.net.layers[i].getInputSize()
                               * self.net.layers[i].get_filter_dim()
                               * self.net.layers[i].get_filter_dim())
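# Illustrative sizes and folds (example numbers, not defaults): a 3x3 conv
# with 64 input and 128 output channels lowers to a weight matrix with
# matrixH = 64 * 3 * 3 = 576 rows and matrixW = 128 columns; with SIMD=32
# and PE=16 this gives synapse_fold = 576 / 32 = 18 and
# neuron_fold = 128 / 16 = 8.
def _example_matrix_size_and_fold():
    matrixH = 64 * 3 * 3    # in_channels * filter_dim^2
    matrixW = 128           # out_channels
    simd, pe = 32, 16
    return (matrixH / simd, matrixW / pe)  # -> (18, 8)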
def directlyQuantizeAllFloatWeights(pipeline, bits):
    "Quantize all float weights in network to given number of bits."
    ret = []
    pipeline_copy = copy.deepcopy(pipeline)
    for L in pipeline_copy:
        if lb.isMatrixLayer(L) and L.wbits == 32:
            # 32-bit weights are treated as unquantized floats
            ret += directlyQuantizeLayer(L, bits)
        else:
            ret += [L]
    return ret
def summarizePipeline(pipeline):
    totalParams = 0
    totalParamBits = 0
    totalOps = 0
    totalComputeLayers = 0
    output = ""
    output += "Per-layer details\n"
    output += "==================\n"
    for l in pipeline:
        # debug: trace layer types as we walk the pipeline
        print l.__class__.__name__
        if layers.isMatrixLayer(l):
            # nparams (not np) so the numpy import is not shadowed
            nparams = l.getParamSize()
            op = l.getNumOps()
            ins = l.getInputSize()
            outs = l.getOutputSize()
            npbits = l.getTotalParamBits()
            totalParamBits += npbits
            totalParams += nparams
            totalOps += op
            totalComputeLayers += 1
            output += "Type: %s, params: %d, ops: %d, in = %s, out = %s\n" % (
                l.__class__.__name__, nparams, op, str(ins), str(outs))
            output += "Bitwidths: input %d, weight %d, output %d\n" % (
                l.ibits, l.wbits, l.obits)
            inbits = l.getTotalInputBits()
            outbits = l.getTotalOutputBits()
            output += "Total in bits: %d, total weight bits: %d, total out bits: %d\n" % (
                inbits, npbits, outbits)
            # arithmetic intensity with some components kept on-chip
            # TODO include output activations once threshold fusion is in place
            # other variants, currently unused:
            # ai_none = float(op) / float(inbits + outbits + npbits)
            # ai_w = float(op) / float(inbits + outbits)
            # ai_wi = float(op) / float(outbits)
            # ai_i = float(op) / float(npbits + outbits)
            ai_wo = float(op) / float(inbits)           # weights+outputs on-chip
            ai_io = float(op) / float(npbits)           # inputs+outputs on-chip
            ai_o = float(op) / float(inbits + npbits)   # only outputs on-chip
            output += "AI on-chip wo: %f, io: %f, o: %f\n" % (ai_wo, ai_io, ai_o)
            output += "-----\n"
    output += "Neural network pipeline summary\n"
    output += "================================\n"
    output += "Pipeline contains %d layers, %d of which are matrix layers\n" % (
        len(pipeline), totalComputeLayers)
    output += "Number of parameters: %f million\n" % (float(totalParams) / 1000000.0)
    output += "Total parameter volume: %f MB\n" % (float(totalParamBits) / (8 * 1024 * 1024))
    output += "Operations per inference: %f million\n" % (float(totalOps) / 1000000.0)
    return output
def passFuseActivations(pipeline):
    "Replace (Matrix, Threshold) layer pairs with fused equivalents."
    inStages = pipeline
    inStages.reverse()
    numChanges = 0
    ret = []
    while len(inStages) > 1:
        layerA = inStages.pop()
        layerB = inStages.pop()
        if lb.isMatrixLayer(layerA) and lb.isThresholdLayer(layerB):
            ret += [lb.MatrixThresholdLayer("", layerA, layerB)]
            numChanges += 1
        else:
            ret += [layerA]
            inStages.append(layerB)
    # pop final element, if any left
    if len(inStages) == 1:
        ret += [inStages.pop()]
    return (ret, numChanges)
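# Typical driver sketch (hypothetical; no such driver is defined in this
# module): passes return (newPipeline, numChanges), so they can be applied
# repeatedly until the pipeline stops changing.
def _run_passes_to_fixpoint(pipeline):
    changed = True
    while changed:
        changed = False
        for p in [passFwdPropagateLinear, passFuseActivations]:
            (pipeline, n) = p(pipeline)
            changed = changed or (n > 0)
    return pipeline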
def calculate_activation_counts(self):
    for layer in self.layers:
        if layers.isMatrixLayer(layer) or lfpga.isFPGAMatrixLayer(layer):
            self.num_activations.append(layer.get_out_dim() *
                                        layer.get_out_dim() *
                                        layer.getOutputSize())
def calculate_weight_counts(self):
    for layer in self.layers:
        if layers.isMatrixLayer(layer):
            self.num_weights.append(layer.getParamSize())
def filter_relevant_layers(self):
    # use a differently-named local list so the layers module is not
    # shadowed (the original local was also called 'layers', which broke
    # the layers.isMatrixLayer() lookup)
    relevant = []
    for l in self.layers:
        if layers.isMatrixLayer(l):
            relevant.append(l)
    self.layers = relevant
def count_matrix_layers(self):
    count = 0
    for l in self.layers:
        if layers.isMatrixLayer(l):
            count += 1
    return count
def parallel_per_layer(self, layer):
    parallel = 1
    if layers.isMatrixLayer(layer) and hasattr(layer, 'parallel'):
        parallel = layer.parallel
    return parallel