def test_validate_mn1_quantized1(mn1q_graph, mn1f_graph):
    """Compare the quantized mn1 import with the float import and stored references.

    Checks, in order: the dequantized biases of the first filter contained in
    step 2, the float graph output, the quantized graph executed without
    quantization, and the quantized graph executed with dequantized outputs.
    """
    def prepare(graph):
        # identical preparation sequence for both imported graphs
        graph.add_dimensions()
        graph.adjust_order()
        get_pow2_match_group().match(graph)
        graph.add_dimensions()
        return graph

    def max_abs_diff(left, right):
        return np.max(np.abs(left - right))

    Gf = prepare(
        TfliteImporter().create_graph(mn1f_graph, {'load_tensors': True}))
    G = prepare(
        TfliteImporter().create_graph(mn1q_graph, {
            'load_tensors': True,
            'load_quantization': True
        }))

    float_filter = Gf.graph_state.steps[2]['node'].contained_filters()[0]
    quant_fusion = G.graph_state.steps[2]['node']
    quant_filter = quant_fusion.contained_filters()[0]
    filter_qrec = G.quantization[NodeId(quant_fusion, quant_filter)]
    dequantized_biases = filter_qrec.biases_q.get_dequantized(
        quant_filter.biases)
    assert max_abs_diff(float_filter.biases, dequantized_biases) < 0.1

    image = np.load('tests/mobv1_valid/COCO_val2014_000000362331_0.npy')
    image = image.reshape((224, 224, 3)).transpose((2, 0, 1))

    float_outputs = GraphExecuter(Gf).execute([image])
    float_reference = np.load(
        'tests/mobv1_valid/output_COCO_val2014_000000362331_0_float.npy')
    assert max_abs_diff(float_outputs[-1][0], float_reference[0]) < 0.0001

    unquantized_outputs = GraphExecuter(G, qrecs=G.quantization).execute(
        [image], qmode=QuantizationMode.none())
    assert max_abs_diff(unquantized_outputs[-1][0], float_reference[0]) < 0.2

    dequantized_outputs = GraphExecuter(G, qrecs=G.quantization).execute(
        [image], qmode=QuantizationMode.all_dequantize())
    quant_reference = np.load(
        'tests/mobv1_valid/output_COCO_val2014_000000362331_0_quant.npy')
    # a tighter bound of 0.16 was previously used here and is disabled
    assert max_abs_diff(dequantized_outputs[-1][0], quant_reference[0]) < 0.28
def _collect(self, G, input_tensors, step_idx) -> Mapping[NodeId, Mapping]:
    """Execute the graph twice and build per-node comparison statistics.

    The first run uses the graph's quantization records only when
    ``G.has_quantized_parameters`` is set; the second run always uses them
    with the ``all_dequantize`` mode. Each step, and each fused node inside a
    step, gets one entry produced by ``self._collect_one``.

    ``step_idx`` is accepted but not used by this method.

    Returns:
        An ``OrderedDict`` keyed by ``NodeId``.
    """
    LOG.debug("gather quantization statistics")

    quantization = G.quantization if G.has_quantized_parameters else None
    foutputs = self._collect_execution(
        GraphExecuter(G, qrecs=quantization), input_tensors, quantization)
    qoutputs = self._collect_execution(
        GraphExecuter(G, qrecs=G.quantization),
        input_tensors,
        G.quantization,
        qmode=QuantizationMode.all_dequantize())

    stats = OrderedDict()
    for position, float_stat in enumerate(foutputs):
        quant_stat = qoutputs[position]
        step_node = float_stat['node']

        # fused nodes first, so that they precede their parent in the mapping
        for inner_pos, inner_float in enumerate(float_stat['fusion_outputs'] or ()):
            inner_id = NodeId(step_node, inner_float['node'])
            stats[inner_id] = self._collect_one(
                inner_float,
                quant_stat['fusion_outputs'][inner_pos],
                G.quantization[inner_id],
                quant_compare=self._quant_compare)

        step_id = NodeId(step_node, None)
        stats[step_id] = self._collect_one(
            float_stat,
            quant_stat,
            G.quantization[step_id],
            quant_compare=self._quant_compare)
    return stats
def _collect(self, G, input_tensors) -> Mapping[NodeId, Mapping]:
    """Compare an unquantized run with a fully quantized run, step by step.

    Both runs are limited by ``self._limit``. For every step the absolute
    error between the first output tensors is summarised together with the
    QSNR and the overflow counters reported in the quantized run's details.

    Returns:
        An ``OrderedDict`` keyed by ``NodeId(node, None)``.
    """
    LOG.debug("gather quantization statistics")

    float_outputs = execute(G, input_tensors, limit=self._limit)
    all_details = []
    quant_outputs = execute(G,
                            input_tensors,
                            limit=self._limit,
                            qrecs=G.quantization,
                            qmode=QuantizationMode.all(),
                            all_details=all_details)

    stats = OrderedDict()
    for idx, float_out in enumerate(float_outputs):
        quant_first = quant_outputs[idx][0]
        abs_error = np.abs(float_out[0] - quant_first)
        node = G.graph_state.steps[idx]['node']

        step_details = all_details[idx]
        # steps without details report empty strings for both counters
        overflow_dot = step_details['overflow_dot'] if step_details else ""
        overflow_acc = step_details['overflow_acc'] if step_details else ""

        entry = {
            'name': node.name,
            'op_name': node.op_name,
            'step': idx,
            'av_err': np.mean(abs_error),
            'max_err': np.max(abs_error),
            'min_err': np.min(abs_error),
            'qsnr': qsnr(float_out[0], quant_outputs[idx][0]),
            'overflow_dot': overflow_dot,
            'overflow_acc': overflow_acc,
        }
        stats[NodeId(node, None)] = entry
    return stats
def test_graph_imu_auto_quant_and_execute_quant():
    """Quantize the imu graph from collected statistics and run it fully quantized.

    The test only checks that the whole pipeline executes; the outputs of the
    quantized run are not asserted.
    """
    input_files = ['tests/images/imu0.pgm']

    def load(path):
        return import_data(path, offset=0, divisor=256, nptype='int16')

    G = create_graph("tests/graph/imu.tflite", opts={"load_tensors": True})
    G.add_dimensions()
    G.adjust_order()
    get_pow2_match_group().match(G)
    G.add_dimensions()

    activation_collector = ActivationStatsCollector()
    for path in input_files:
        activation_collector.collect_stats(G, [load(path)])
    astats = activation_collector.reduce_stats()

    fstats = FilterStatsCollector().collect_stats(G)

    qrecs = SymmetricQuantizer(astats, fstats, force_width=16).quantize(G)
    G.quantization = qrecs

    executer = GraphExecuter(G, qrecs=qrecs)
    for path in input_files:
        tensor = load(path)
        executer.execute([tensor], qmode=QuantizationMode.all())
def execute(self,
            in_tensors: Sequence[np.ndarray],
            step_idx_limit=None,
            only_yield_step=False,
            qmode: QuantizationMode = None,
            all_details=None,
            yield_fusions=False,
            silent=False):
    """Run the graph and return copies of the tensors produced by each yielded step.

    Args:
        in_tensors: Input tensors handed to the underlying iterator.
        step_idx_limit: Forwarded to the iterator to limit the executed steps.
        only_yield_step: Forwarded to ``execute_iterator``.
        qmode: Quantization mode; ``QuantizationMode.none()`` when omitted.
            When ``qmode.is_step_all`` is set ``execute_qnoq_iterator`` is
            used and its quantized outputs/details are collected.
        all_details: Optional list. When given, one details entry per
            returned element is appended to it.
        yield_fusions: When true, results of nodes inside a fusion are
            grouped under the step that contains them.
        silent: Forwarded to the iterator.

    Returns:
        Without ``yield_fusions``: a list with one list of tensor copies per
        yielded element. With ``yield_fusions``: a list with one dict per
        step, ``{'outputs': [...], 'fusion_outputs': [[...], ...]}``, and
        ``all_details`` receives ``{'details': ..., 'fusion_details': [...]}``.
    """
    if qmode is None:
        qmode = QuantizationMode.none()

    collect_details = all_details is not None

    if qmode.is_step_all:
        iterator = [(qoutput, qdetails, fnode)
                    for _, _, _, _, qoutput, qdetails, fnode
                    in self.execute_qnoq_iterator(in_tensors,
                                                  yield_fusions=yield_fusions,
                                                  step_idx_limit=step_idx_limit,
                                                  silent=silent)]
    else:
        iterator = [(output_tensors, details, fnode)
                    for _, _, fnode, output_tensors, details
                    in self.execute_iterator(in_tensors,
                                             step_idx_limit=step_idx_limit,
                                             qmode=qmode,
                                             yield_fusions=yield_fusions,
                                             only_yield_step=only_yield_step,
                                             yield_details=collect_details,
                                             silent=silent)]

    outputs = []
    # results of fused nodes are buffered until their enclosing step is seen
    fusion_outputs = []
    fusion_details = []

    for output_tensors, details, fnode in iterator:
        copied = [output_tensor.copy() for output_tensor in output_tensors]

        if not yield_fusions:
            outputs.append(copied)
            if collect_details:
                all_details.append(details)
        elif fnode:
            fusion_outputs.append(copied)
            if collect_details:
                fusion_details.append(details)
        else:
            # Store the copies directly. The previous code used the return
            # value of ``outputs.append(...)`` here, which is always None and
            # additionally inserted a stray bare list into ``outputs``.
            outputs.append({
                'outputs': copied,
                'fusion_outputs': fusion_outputs.copy(),
            })
            fusion_outputs.clear()
            if collect_details:
                all_details.append({
                    'details': details,
                    'fusion_details': fusion_details.copy()
                })
                fusion_details.clear()
    return outputs
def test_graph_calc_quantize_one_2(value_cache, mnist_unfused_16bit_state, mnist_images):
    """Quantizing only step 4 must keep the step 7 output within +/-2 of the plain run."""
    def load_image():
        # the image is re-imported for each run, as two independent arrays
        tensor = import_data(mnist_images[0],
                             height=28,
                             width=28,
                             offset=0,
                             divisor=255)
        return tensor.reshape((28, 28, 1))

    G = load_state(mnist_unfused_16bit_state, value_cache=value_cache)

    plain_outputs = execute(G, [load_image()])
    step_outputs = execute(G, [load_image()],
                           qmode=QuantizationMode.step(4),
                           qrecs=G.quantization)

    diffs = [
        plain[0] - step_outputs[idx][0]
        for idx, plain in enumerate(plain_outputs)
    ]
    step7_diff = diffs[7]
    assert np.min(step7_diff) > -2 and np.max(step7_diff) < 2
def get_base_inputs(self, nodes, progress, quantize):
    """Return the baseline input set, validating without compression on first use.

    On the first call compression is switched off on every node in ``nodes``
    and ``self.validate`` is run over ``self._input_files``, using the
    ``all_dequantize`` mode when ``quantize`` is true and no quantization
    otherwise. The first element of the validation result (reported as
    "good") is cached in ``self._base_inputs``; later calls return the cached
    value without touching ``nodes`` or ``progress``.
    """
    cached = self._base_inputs
    if cached is not None:
        return cached

    input_files = self._input_files
    for node in nodes:
        node.use_compressed = False

    mode_label = 'quantized: ' if quantize else ': '
    progress(f"validation without compression {mode_label}", False)

    def report(pred):
        # one mark per validated input
        progress('+' if pred else '-', False)

    if quantize:
        qmode = QuantizationMode.all_dequantize()
    else:
        qmode = QuantizationMode.none()

    base_inputs, good_margin, bad_inputs, bad_margin = self.validate(
        qmode, inputs=input_files, progress=report)
    progress('', True)
    progress(
        f'good {len(base_inputs)} ({good_margin:.2f}) bad {len(bad_inputs)} ({bad_margin:.2f})',
        True)

    self._base_inputs = base_inputs
    return base_inputs
def do_gen(self, args):
    """
    Generate AutoTiler model C code and optionally dump tensors. If no destination file is
    given the generated code will be outputed to the screen. Check the 'set' command for
    settings related to code generation."""
    # the graph must be loaded, quantized and adjusted before generating code
    self._check_graph()
    self._check_quantized()
    self._check_adjusted()
    if args.checksums:
        # run the checksum input through the fully quantized graph; the
        # returned outputs are not kept here
        input_args = self._get_input_args(None)
        LOG.info("input file %s", args.checksums)
        data = import_data(args.checksums, **input_args)
        executer = GraphExecuter(self.G, qrecs=self.G.quantization)
        executer.execute([data], qmode=QuantizationMode.all())
        self.settings['checksum_file'] = args.checksums
        self.settings['generate_checksums'] = True

    # command line arguments override the stored settings
    if args.tensor_directory:
        self.settings['tensor_directory'] = args.tensor_directory
    if args.model_directory:
        self.settings['model_directory'] = args.model_directory
    self.settings['basic_kernel_source_file'] = args.basic_kernel_source_file
    self.settings['basic_kernel_header_file'] = args.basic_kernel_header_file
    code_gen = CodeGenerator(self.G, DefaultNamingConvension(self.G),
                             self.settings)

    # a user supplied template takes precedence over the default one
    if self.settings['template_file']:
        code_template = dynamic_template(self.settings['template_file'])
    else:
        code_template = default_template

    if args.model_file:
        # write the model (and the expression kernels if the graph has any)
        # into the model directory
        with open(
                os.path.join(self.settings['model_directory'],
                             args.model_file), "w") as output_fp:
            output_fp.write(code_template(self.G, code_generator=code_gen))
        if self.G.has_expressions:
            with open(
                    os.path.join(self.settings['model_directory'],
                                 args.basic_kernel_source_file),
                    "w") as output_fp:
                output_fp.write(
                    basic_kernel_source_template(self.G,
                                                 code_generator=code_gen))
            with open(
                    os.path.join(self.settings['model_directory'],
                                 args.basic_kernel_header_file),
                    "w") as output_fp:
                output_fp.write(
                    basic_kernel_header_template(self.G,
                                                 code_generator=code_gen))
    else:
        # no destination file: page the generated code to the screen
        self.ppaged(code_template(self.G, code_generator=code_gen))
        if self.G.has_expressions:
            self.ppaged(
                basic_kernel_source_template(self.G,
                                             code_generator=code_gen))
            self.ppaged(
                basic_kernel_header_template(self.G,
                                             code_generator=code_gen))
    if args.output_tensors:
        code_gen.write_constants()

    if args.header_file:
        with open(
                os.path.join(self.settings['model_directory'],
                             args.header_file), "w") as output_fp:
            output_fp.write(header_template(self.G, code_generator=code_gen))
def _collect(self, G, input_tensors) -> Mapping[NodeId, Mapping]:
    """Pair an unquantized run with a fully quantized run and summarise each node.

    Every step, and every fused node listed in a step's ``fusion_outputs``,
    gets one entry computed by ``self._collect_one``.

    Returns:
        An ``OrderedDict`` keyed by ``NodeId``; fused nodes precede the step
        that contains them.
    """
    LOG.debug("gather quantization statistics")

    float_stats = self._collect_execution(G, input_tensors)
    quant_stats = self._collect_execution(G,
                                          input_tensors,
                                          qrecs=G.quantization,
                                          qmode=QuantizationMode.all())

    stats = OrderedDict()
    for position, float_stat in enumerate(float_stats):
        quant_stat = quant_stats[position]
        step_node = float_stat['node']

        for inner_pos, inner_float in enumerate(float_stat['fusion_outputs'] or ()):
            inner_key = NodeId(step_node, inner_float['node'])
            stats[inner_key] = self._collect_one(
                inner_float, quant_stat['fusion_outputs'][inner_pos])

        stats[NodeId(step_node, None)] = self._collect_one(float_stat, quant_stat)
    return stats
def test_external_biases_sq8(qvww_graph):
    """Dequantized execution must stay close to float execution on the qvww graph.

    This model has at the end an external biases layer as constant add.
    """
    G = TfliteImporter().create_graph(qvww_graph, {
        "load_quantization": True,
        "load_tensors": True
    })
    G.add_dimensions()
    get_scale8_match_group().match(G)
    G.add_dimensions()

    picture = Image.open('tests/vwwimages/COCO_val2014_000000174838_1.png')
    picture = picture.resize((238, 208))
    pixels = np.array(picture, dtype=np.uint8)
    input_tensor = (pixels.astype(np.float32) - 128) / 128

    executer = GraphExecuter(G, qrecs=G.quantization)
    # check if nntool can execute
    quant_outputs = executer.execute([input_tensor],
                                     qmode=QuantizationMode.all_dequantize())
    float_outputs = executer.execute([input_tensor], qmode=None)

    largest_diff = max(
        np.max(quant[0] - flt[0])
        for quant, flt in zip(quant_outputs, float_outputs))
    assert largest_diff < 2.2
def test_validate_mn1_dequant_quantfloat(mn1q_graph):
    """A graph loaded dequantized must match the quantized graph executed in float."""
    def prepare(graph):
        graph.add_dimensions()
        graph.adjust_order()
        get_pow2_match_group().match(graph)
        graph.add_dimensions()
        return graph

    # one importer is used for both loads
    importer = TfliteImporter()
    G = prepare(
        importer.create_graph(mn1q_graph, {
            'load_tensors': True,
            'load_quantization': True
        }))
    Gdq = prepare(
        importer.create_graph(mn1q_graph, {
            'load_tensors': True,
            'load_dequantized': True
        }))

    image = np.load('tests/mobv1_valid/COCO_val2014_000000362331_0.npy')
    image = image.reshape((224, 224, 3)).transpose((2, 0, 1))

    quant_float_outputs = GraphExecuter(G, qrecs=G.quantization).execute(
        [image], qmode=QuantizationMode.none())
    dequant_outputs = GraphExecuter(Gdq).execute([image])

    per_step_max = [
        np.max(np.abs(dequant[0] - quant_float[0]))
        for dequant, quant_float in zip(dequant_outputs, quant_float_outputs)
    ]
    assert max(per_step_max) < 0.003
def test_graph_calc_quantized8(mnist_unfused_8bit_state, mnist_images):
    """Outputs at step 7 of the plain and dequantized runs must differ by less than 9."""
    last_step = 7

    def load_image():
        tensor = import_data(mnist_images[0],
                             height=28,
                             width=28,
                             offset=0,
                             divisor=255)
        return tensor.reshape((28, 28, 1))

    G = load_state(mnist_unfused_8bit_state)
    executer = GraphExecuter(G, qrecs=G.quantization)

    plain_outputs = executer.execute([load_image()], step_idx_limit=last_step)
    dequant_outputs = executer.execute([load_image()],
                                       qmode=QuantizationMode.all_dequantize(),
                                       step_idx_limit=last_step)

    diffs = [
        plain_outputs[idx][0] - dequant_outputs[idx][0] for idx in range(8)
    ]
    assert np.max(np.abs(diffs[7])) < 9
def do_dump(self, args: argparse.Namespace):
    """
    Dump the activations resulting from running an input file through the graph.
    You can use the current quantization settings and can also just quantify one
    specific step of the graph."""
    self._check_graph()
    # when not given explicitly, dequantize unless the result is pickled or saved
    dequantize = args.dequantize if args.dequantize is not None\
        else not (args.pickle or args.save)
    # select the quantization mode from the command line flags
    if args.quantize or args.quantize_step or args.quantize_all_steps:
        self._check_quantized()
        if args.quantize:
            if dequantize:
                qmode = QuantizationMode.all_dequantize()
            else:
                qmode = QuantizationMode.all()
        elif args.quantize_all_steps:
            qmode = QuantizationMode.step_all()
            dequantize = True
        else:
            qmode = QuantizationMode.step(args.quantize_step)
    elif args.quantize_and_dequantize:
        qmode = QuantizationMode.all_float_quantize_dequantize()
    else:
        qmode = QuantizationMode.none()
    if args.step is not None:
        step = args.step
        num_steps = len(self.G.graph_state.steps)
        # negative steps count back from the end of the step list
        if step < 0:
            step = num_steps + step
        # NOTE(review): step == num_steps passes this check although the step
        # list has num_steps entries - confirm that this bound is intended
        if step < 0 or step > num_steps:
            self.perror("step must be from {} to {}".format(
                -num_steps, num_steps))
            return
    else:
        step = None

    input_args = self._get_input_args(args)

    # one entry of outputs per group of input files
    pickles = []

    for file_per_input in glob_input_files(args.input_files,
                                           self.G.num_inputs):
        LOG.info("input file %s", file_per_input)
        data = [
            import_data(input_file, **input_args)
            for input_file in file_per_input
        ]
        executer = GraphExecuter(self.G, qrecs=self.G.quantization)
        outputs = executer.execute(data, step_idx_limit=step, qmode=qmode)

        # either keep the outputs for later or print them straight away
        if args.pickle or self._in_py or args.save:
            pickles.append(outputs)
        else:
            self.G.print_intermediates(outputs,
                                       limit=step,
                                       width=args.number_width,
                                       precision=args.precision,
                                       channel=args.channel,
                                       order=['c', 'h', 'w'],
                                       checksum=args.checksum)

        if args.visualize_detection:
            # show the first input image resized to the input dimensions;
            # a dimension of -1 in the input arguments keeps the image's own
            img_in = Image.open(file_per_input[0]).convert('RGBA')
            height = img_in.size[1] if input_args[
                'height'] == -1 else input_args['height']
            width = img_in.size[0] if input_args[
                'width'] == -1 else input_args['width']
            img_in = img_in.resize((width, height))

            if self.G.has_ssd_postprocess:
                # the four graph outputs are unpacked as boxes, classes,
                # scores and one ignored value
                bboxes, classes, scores, _ = [
                    outputs[graph_out.step_idx][0]
                    for graph_out in self.G.outputs()
                ]
                draw = ImageDraw.Draw(img_in, 'RGBA')

                for box, score, class_id in zip(bboxes, scores, classes):
                    if args.quantize and not args.dequantize:
                        # outputs are still quantized: apply the output
                        # scales of the SSD detector node
                        ssd_node = [
                            node for node in self.G.nodes()
                            if isinstance(node, SSDDetectorParameters)
                        ][0]
                        ssd_qrec = self.G.quantization[NodeId(ssd_node)]
                        x0, x1 = int(box[1] * width * ssd_qrec.out_qs[0].scale), int(
                            box[3] * width * ssd_qrec.out_qs[0].scale)
                        y0, y1 = int(box[0] * height * ssd_qrec.out_qs[0].scale), int(
                            box[2] * height * ssd_qrec.out_qs[0].scale)
                        score = score * ssd_qrec.out_qs[2].scale
                    else:
                        x0, x1 = int(box[1] * width), int(box[3] * width)
                        y0, y1 = int(box[0] * height), int(box[2] * height)
                    # closed rectangle outline with a "class@score%" label above it
                    rect_points = (x0, y0), (x1, y0), (x1, y1), (x0, y1), (x0, y0)
                    draw.line(rect_points, fill='red', width=2)
                    txt = '{}@{}%'.format(class_id, int(score * 100))
                    draw.text([x0, y0 - 10], txt, fill=(0, 255, 0))
            img_in.show()

    if args.pickle or args.save or self._in_py:
        if not pickles:
            self.perror("no input files found")
            return
        # a single group of inputs is stored directly rather than as a list
        if len(args.input_files) == self.G.num_inputs:
            pickles = pickles[0]
        if args.pickle:
            with open(args.pickle, 'wb') as pickle_fp:
                pickle.dump(pickles, pickle_fp)
        if args.save:
            if len(args.input_files) != self.G.num_inputs:
                self.perror(
                    "can only save dumps on one input to tensor store")
                return
            self.tensor_store[args.save] = pickles
        if self._in_py:
            self.last_result = pickles
def execute_iterator(self,
                     in_tensors: Sequence[np.ndarray],
                     step_idx_limit: Optional[int] = None,
                     start_node: Optional[Parameters] = None,
                     qmode: Optional[QuantizationMode] = None,
                     yield_fusions=True,
                     yield_details=True,
                     only_yield_step=False,
                     record_inputs: Optional[Mapping] = None,
                     silent=False):
    """Execute the graph step by step, yielding the outputs of each step.

    Args:
        in_tensors: Tensors passed to input nodes.
        step_idx_limit: Last step index to execute; later steps are not run.
        start_node: When set, every step whose node differs from it is skipped.
        qmode: Quantization mode; ``QuantizationMode.none()`` when omitted.
        yield_fusions: Also yield the outputs of nodes contained in a fusion.
        yield_details: Create a details dict for the kernels to fill in.
        only_yield_step: Only yield the step equal to ``step_idx_limit``.
        record_inputs: Optional mapping that receives copies of the inputs
            of each node, keyed by ``NodeId``.
        silent: Suppress logging and progress reporting.

    Yields:
        Tuples ``(step_idx, node, fusion_node, output_tensors, details)``;
        ``fusion_node`` is None for the result of the step itself.
    """
    if qmode is None:
        qmode = QuantizationMode.none()

    # outputs of executed nodes, handed to save_output / collect_outputs
    saved_outputs = {}

    if not silent:
        LOG.info("execute uncached: quantization mode %s", qmode)
        ExecutionProgress.start()
    for step_idx, step in enumerate(self._G.graph_state.steps):

        if step_idx_limit is not None and step_idx > step_idx_limit:
            break

        node = step['node']

        if start_node and start_node != node:
            continue

        # collect outputs from previous nodes
        # InputNode is already set above
        output_tensors = self.collect_outputs(saved_outputs, node)

        if not silent:
            ExecutionProgress.progress(step_idx, node.name)
        nid = NodeId(node, None)
        if record_inputs is not None:
            if output_tensors is None:
                record_inputs[nid] = output_tensors
            else:
                record_inputs[nid] = [
                    np.copy(output_tensor)
                    for output_tensor in output_tensors
                ]

        qrec = self._qrecs[nid] if self._qrecs is not None else None
        # pick the kernel set; in step mode the inputs of a quantized step
        # are quantized first
        if qmode.get_quantized(node, step_idx):
            switch = self._quantized_kernel_switch
            if qmode.is_step and output_tensors:
                output_tensors = [
                    qrec.in_qs[i].quantize(output_tensor)
                    for i, output_tensor in enumerate(output_tensors)
                ]
        else:
            switch = self._kernel_switch

        details = {} if yield_details and (
            not only_yield_step or step_idx == step_idx_limit) else None
        if isinstance(node, (ConvFusionParameters, ActivationFusion)):
            # execute the contained nodes in order, feeding each one the
            # outputs of the previous one
            for fusion_node in node.contained_nodes():
                fnid = NodeId(node, fusion_node)
                fqrec = None if not qrec else self._qrecs[fnid]
                # NOTE(review): this records under the parent's nid, not
                # fnid, so each contained node overwrites the previous
                # entry - confirm whether fnid was intended
                if record_inputs is not None:
                    record_inputs[nid] = [
                        np.copy(output_tensor)
                        for output_tensor in output_tensors
                    ]
                details = {} if yield_fusions and yield_details else None
                output_tensors = switch.execute(fusion_node, output_tensors,
                                                fqrec, details)
                if yield_fusions:
                    if qmode.dequantize:
                        qoutput_tensors = [
                            fqrec.out_qs[i].dequantize(output_tensor)
                            for i, output_tensor in enumerate(output_tensors)
                        ]
                        yield step_idx, node, fusion_node, qoutput_tensors, details
                    elif qmode.is_float_q_deq:
                        # float result passed through quantize + dequantize
                        qoutput_tensors = [
                            fqrec.out_qs[i].dequantize(
                                fqrec.out_qs[i].quantize(output_tensor))
                            for i, output_tensor in enumerate(
                                output_tensors)
                        ]
                        yield step_idx, node, fusion_node, qoutput_tensors, details
                    else:
                        yield step_idx, node, fusion_node, output_tensors, details
        elif isinstance(node, InputParameters):
            # input nodes consume the externally supplied tensors
            output_tensors = switch.execute(node, in_tensors, qrec, details)
        else:
            output_tensors = switch.execute(node, output_tensors, qrec,
                                            details)

        # yield the result of the step itself in the form the mode asks for
        if qmode.dequantize:
            qoutput_tensors = [
                qrec.out_qs[i].dequantize(output_tensor)
                for i, output_tensor in enumerate(output_tensors)
            ]
            if not only_yield_step or step_idx == step_idx_limit:
                yield step_idx, node, None, qoutput_tensors, details
            # in step mode the dequantized values are what later steps see
            if qmode.is_step and qmode.get_quantized(node, step_idx):
                output_tensors = qoutput_tensors
        elif qmode.is_float_q_deq:
            if qmode.is_step and qmode.get_quantized(node, step_idx):
                output_tensors = [
                    qrec.out_qs[i].dequantize(output_tensor)
                    for i, output_tensor in enumerate(output_tensors)
                ]
            qoutput_tensors = [
                qrec.out_qs[i].dequantize(
                    qrec.out_qs[i].quantize(output_tensor))
                for i, output_tensor in enumerate(output_tensors)
            ]
            if not only_yield_step or step_idx == step_idx_limit:
                yield step_idx, node, None, qoutput_tensors, details
        else:
            if qmode.is_step and qmode.get_quantized(node, step_idx):
                output_tensors = [
                    qrec.out_qs[i].dequantize(output_tensor)
                    for i, output_tensor in enumerate(output_tensors)
                ]
            if not only_yield_step or step_idx == step_idx_limit:
                yield step_idx, node, None, output_tensors, details

        self.save_output(saved_outputs, node, output_tensors)

    if not silent:
        ExecutionProgress.end()
def execute_iterator(self,
                     in_tensors: Sequence[np.ndarray],
                     step_idx_limit: Optional[int] = None,
                     start_node: Optional[Parameters] = None,
                     qmode: Optional[QuantizationMode] = None,
                     yield_fusions=True,
                     yield_details=True,
                     only_yield_step=False,
                     record_inputs: Optional[Mapping] = None,
                     silent=False,
                     parent_node=None,
                     parent_step_idx=None,
                     saved_outputs=None,
                     G=None):
    """Execute a graph node by node, recursing into the subgraphs of fusions.

    Args:
        in_tensors: Tensors passed to input nodes (and to fusion input nodes
            when executing a subgraph).
        step_idx_limit: Last step index to execute.
        start_node: When set, every node that differs from it is skipped.
        qmode: Quantization mode; ``QuantizationMode.none()`` when omitted.
        yield_fusions: Also yield the results of nodes inside fusions.
        yield_details: Create a details dict for the kernels to fill in.
        only_yield_step: Only yield the step equal to ``step_idx_limit``.
        record_inputs: Optional mapping that receives copies of the inputs
            of each node, keyed by ``NodeId``.
        silent: Suppress logging and progress reporting.
        parent_node: Fusion node that owns the subgraph being executed; set
            by the recursive call.
        parent_step_idx: Step index of ``parent_node``; set by the recursive
            call.
        saved_outputs: Output store shared with the recursive call.
        G: Graph to execute; ``self._G`` when omitted.

    Yields:
        Tuples ``(step_idx, node, fusion_node, output_tensors, details)``.
        For nodes inside a fusion the first two elements are those of the
        parent and ``fusion_node`` is the contained node.
    """
    if qmode is None:
        qmode = QuantizationMode.none()

    # top level call: use the executer's graph and a fresh output store
    if G is None:
        G = self._G
        saved_outputs = {}

    if not silent:
        LOG.info("execute uncached: quantization mode %s", qmode)
        ExecutionProgress.start()
    for node in G.dfs():
        step_idx = node.step_idx
        if step_idx_limit is not None and step_idx > step_idx_limit:
            break

        if start_node and start_node != node:
            continue

        # collect outputs from previous nodes
        # InputNode is already set above
        output_tensors = self.collect_outputs(G, saved_outputs, node)

        if not silent:
            ExecutionProgress.progress(step_idx, node.name)
        # nodes inside a fusion are identified by (parent, node)
        if parent_node:
            nid = NodeId(parent_node, node)
        else:
            nid = NodeId(node, None)

        if record_inputs is not None:
            if output_tensors is None:
                record_inputs[nid] = output_tensors
            else:
                record_inputs[nid] = [
                    np.copy(output_tensor)
                    for output_tensor in output_tensors
                ]

        # fusion input/output nodes are always executed without a qrec
        if isinstance(node, (FusionInputParameters, FusionOutputParameters)):
            qrec = None
        else:
            if self._qrecs and qmode.get_quantized(node, step_idx):
                if nid not in self._qrecs:
                    LOG.warning("no quantization parameters on %s",
                                node.name)
                    qrec = None
                else:
                    qrec = self._qrecs[nid]
                # NOTE(review): if the warning branch above was taken qrec
                # is None here and this would fail in step mode - confirm
                if qmode.is_step and output_tensors:
                    output_tensors = [
                        qrec.in_qs[i].quantize(output_tensor)
                        for i, output_tensor in enumerate(output_tensors)
                    ]
            else:
                qrec = None

        details = {} if yield_details and (
            not only_yield_step or step_idx == step_idx_limit) else None
        if isinstance(
                node,
            (FilterFusionBase, ActivationFusionBase,
             PaddedAddFusionParameters, MatMulOpFusionParameters)):

            # run the fusion's subgraph with the same output store and pass
            # on the results of its real (non input/output) nodes
            for f_step_idx, f_pnode, f_node, f_output_tensors, f_details in self.execute_iterator(
                    output_tensors,
                    qmode=qmode,
                    yield_fusions=yield_fusions,
                    yield_details=yield_details,
                    silent=True,
                    parent_node=node,
                    parent_step_idx=step_idx,
                    saved_outputs=saved_outputs,
                    G=node.subgraph):
                if yield_fusions and not isinstance(
                        f_node,
                        (FusionInputParameters, FusionOutputParameters)):
                    yield f_step_idx, f_pnode, f_node, f_output_tensors, f_details
            # the fusion's outputs are read back from the shared store,
            # placed at the index of each subgraph output node
            f_outputs = node.subgraph.outputs()
            num_outputs = max(f_output.idx for f_output in f_outputs) + 1
            output_tensors = [None] * num_outputs
            for f_output in f_outputs:
                output_tensors[f_output.idx] = saved_outputs[f_output][0]

        elif isinstance(node, (InputParameters, FusionInputParameters)):
            output_tensors = KernelExecuter.execute(
                node, in_tensors, qrec, details)
        else:
            output_tensors = KernelExecuter.execute(
                node, output_tensors, qrec, details)

        # yield the node's result in the form the mode asks for
        if qmode.dequantize and qrec:
            qoutput_tensors = [
                qrec.out_qs[i].dequantize(output_tensor)
                for i, output_tensor in enumerate(output_tensors)
            ]
            if parent_node:
                yield parent_step_idx, parent_node, node, qoutput_tensors, details
            elif not only_yield_step or step_idx == step_idx_limit:
                yield step_idx, node, None, qoutput_tensors, details
            # in step mode the dequantized values are what later nodes see
            if qmode.is_step and qmode.get_quantized(node, step_idx):
                output_tensors = qoutput_tensors
        elif qmode.is_float_q_deq and qrec:
            if qmode.is_step and qmode.get_quantized(node, step_idx):
                output_tensors = [
                    qrec.out_qs[i].dequantize(output_tensor)
                    for i, output_tensor in enumerate(output_tensors)
                ]
            # float result passed through quantize + dequantize
            qoutput_tensors = [
                qrec.out_qs[i].dequantize(
                    qrec.out_qs[i].quantize(output_tensor))
                for i, output_tensor in enumerate(output_tensors)
            ]
            if parent_node:
                yield parent_step_idx, parent_node, node, qoutput_tensors, details
            elif not only_yield_step or step_idx == step_idx_limit:
                yield step_idx, node, None, qoutput_tensors, details
        else:
            if qmode.is_step and qmode.get_quantized(node,
                                                     step_idx) and qrec:
                output_tensors = [
                    qrec.out_qs[i].dequantize(output_tensor)
                    for i, output_tensor in enumerate(output_tensors)
                ]
            if parent_node:
                yield parent_step_idx, parent_node, node, output_tensors, details
            elif not only_yield_step or step_idx == step_idx_limit:
                yield step_idx, node, None, output_tensors, details

        self.save_output(saved_outputs, node, output_tensors)

    if not silent:
        ExecutionProgress.end()
def tune_all(self, nodes, progress, quantize=False):
    """Search for a QSNR setting for ``nodes`` that still validates, then apply it.

    Args:
        nodes: Nodes passed to ``tune_qsnr``.
        progress: Callable ``progress(message, newline)`` used for reporting.
        quantize: When true, a further search pass validates with the
            ``all_dequantize`` mode.

    Returns:
        The second element of the last ``Maximizer.run`` result, which is the
        value passed to the final ``tune_qsnr`` call.

    Raises:
        CompressionError: From the search callback when a QSNR of 0 does not
            give an acceptable compression.
    """
    base_inputs = self.get_base_inputs(nodes, progress, quantize)

    def opt_func(qsnr, state):
        # callback for the maximizer: compress at this QSNR and validate
        progress('compressing: ', False)
        compression = self.tune_qsnr(
            nodes,
            qsnr,
            progress=lambda _, comp: progress('+' if comp else '-', False))
        # reject when nothing was compressed or the result is below the best so far
        if not compression or ('best_compression' in state and
                               state['best_compression'] > compression):
            if qsnr == 0:
                raise CompressionError("could not compress graph")
            return None
        state['best_compression'] = compression
        progress('', True)
        progress('validating: ', False)
        good_inputs, good_margin, bad_inputs, bad_margin = self.validate(
            state['qmode'],
            inputs=state['cur_inputs'],
            progress=lambda pred: progress('+' if pred else '-', False))
        progress('', True)
        progress(
            f'good {len(good_inputs)} ({good_margin:.2f}) bad {len(bad_inputs)} ({bad_margin:.2f})',
            True)
        if bad_inputs:
            # outside the final pass, continue with only the failing inputs
            if not state['final']:
                state['cur_inputs'] = bad_inputs
            del state['best_compression']
            return None
        return compression

    # the first pass validates without quantization, searching downwards
    qmode = QuantizationMode.none()
    dir_start = 'down'
    opt_state = {
        'cur_inputs': base_inputs.copy(),
        'final': False,
        'qmode': qmode
    }
    start_qsnr = 30
    start_step = 15
    maximizer = Maximizer(opt_func, 0, 120)
    while True:
        res = maximizer.run(
            start_qsnr,
            opt_state,
            progress=lambda cur, step, direct: progress(
                f'QSNR {cur} step {step} direction {direct}', True),
            start_step=start_step,
            dir_start=dir_start)
        if quantize and opt_state['qmode'] == QuantizationMode.none():
            # repeat the search validating in quantized mode
            progress('analysing quantized', True)
            opt_state['qmode'] = QuantizationMode.all_dequantize()
        elif opt_state['cur_inputs'] != base_inputs:
            # the input set was narrowed: check again against all inputs
            progress('check with all inputs', True)
            opt_state['final'] = True
        else:
            break
        # restart from the last result with a finer step, searching upwards
        opt_state['cur_inputs'] = base_inputs.copy()
        start_qsnr = res[1]
        start_step = 0.5
        dir_start = 'up'

    progress(f'tune QSNR to best {res[1]} compressed by {res[0]} bytes', True)
    # apply the selected QSNR to the nodes
    self.tune_qsnr(
        nodes,
        res[1],
        progress=lambda _, comp: progress('+' if comp else '-', False))
    progress('', True)
    return res[1]
def finetune(self, nodes, progress, quantize=False):
    """Try to lower the bit count of each compressed node while validation still passes.

    Args:
        nodes: Candidate nodes; only those that have a compressed value and
            use it are tuned.
        progress: Callable ``progress(message, newline)`` used for reporting.
        quantize: When true, a further search pass validates with the
            ``all_dequantize`` mode.

    Raises:
        CompressionError: From the search callback when 8 bits with
            ``sparse`` set does not give an acceptable compression.
    """
    sizes = [(node, node.compressed_value) for node in nodes
             if node.compressed_value and node.use_compressed]
    nodes = [size[0] for size in sizes]
    base_inputs = self.get_base_inputs(nodes, progress, quantize)
    # get_base_inputs may have switched compression off; switch it back on
    for node in nodes:
        if node.compressed_value:
            node.use_compressed = True

    def opt_func(bits, threshold, sparse, node, state):
        # callback for the maximizer: compress one node and validate
        progress('compressing: ', False)
        compression = self.tune_bits(
            [node],
            bits,
            threshold=threshold,
            sparse=sparse,
            progress=lambda _, comp: progress('+' if comp else '-', False))
        # reject when nothing was compressed or the result is below the best so far
        if not compression or ('best_compression' in state and
                               state['best_compression'] > compression):
            if bits == 8 and sparse:
                raise CompressionError("could not compress graph")
            return None
        state['best_compression'] = compression
        progress('', True)
        progress('validating: ', False)
        good_inputs, good_margin, bad_inputs, bad_margin = self.validate(
            state['qmode'],
            inputs=state['cur_inputs'],
            break_on_error=state['final'],
            progress=lambda pred: progress('+' if pred else '-', False))
        progress('', True)
        progress(
            f'good {len(good_inputs)} ({good_margin:.2f}) bad {len(bad_inputs)} ({bad_margin:.2f})',
            True)
        if bad_inputs:
            # continue with only the failing inputs
            state['cur_inputs'] = bad_inputs
            del state['best_compression']
            return None
        return compression

    maximizer = Maximizer(opt_func, 2, 8, int_step=True)
    while sizes:
        # candidates are taken from the end of the list sorted by size
        sizes.sort(key=lambda x: x[1].size)
        tune_idx = -1
        node = None
        # walk back from the end to the first node that has more than 2 bits
        while node is None and abs(tune_idx) <= len(sizes):
            node, comp_val = sizes[tune_idx]
            cur_bits = comp_val.bits
            if cur_bits > 2:
                # start the search at roughly half the current bits, minimum 2
                cur_step = max(cur_bits // 2, 1)
                cur_bits = max(cur_bits - cur_step, 2)
            else:
                tune_idx -= 1
                node = None
        if node is None:
            break
        progress(f'finetuning {node.name}', True)
        qmode = QuantizationMode.none()
        dir_start = 'down'
        opt_state = {
            'cur_inputs': base_inputs.copy(),
            'final': False,
            'qmode': qmode
        }
        while True:
            res = maximizer.run(
                cur_bits,
                None,
                False,
                node,
                opt_state,
                progress=lambda cur, step, direct: progress(
                    f'bits {cur} step {step} direction {direct}', True),
                start_step=cur_step,
                dir_start=dir_start)
            # NOTE(review): this runs on every pass of the loop, so a second
            # pass removes a further entry from sizes - confirm intended
            del sizes[tune_idx]
            if res is None:
                break
            if quantize and opt_state['qmode'] == QuantizationMode.none():
                # repeat the search validating in quantized mode
                progress('analysing quantized', True)
                opt_state['qmode'] = QuantizationMode.all_dequantize()
            elif opt_state['cur_inputs'] != base_inputs:
                progress('check with all inputs', True)
            else:
                break
            # restart from the last result with a step of 1, searching upwards
            opt_state['final'] = True
            opt_state['cur_inputs'] = base_inputs.copy()
            cur_bits = res[1]
            cur_step = 1
            dir_start = 'up'
        if res is None:
            # no acceptable setting: go back to the node's original bits
            progress(f'{node.name} cannot be further optimised', True)
            self.tune_bits(
                [node],
                comp_val.bits,
                progress=lambda _, comp: progress('+' if comp else '-', False))
        else:
            progress(
                f'{node.name} tune bits to {res[1]} compressed by {res[0]} bytes',
                True)
            self.tune_bits(
                [node],
                res[1],
                progress=lambda _, comp: progress('+' if comp else '-', False))
        progress('', True)
def gen_project(G,
                settings,
                project_folder,
                script_commands,
                overwrite=False,
                performance=False,
                quantized=False,
                test_results=False,
                save_inputs=False,
                input_file=None,
                input_args=None,
                gen_atproject=False,
                dump_tensors=False,
                input_tensors=None,
                tolerance=0.0):
    """Write a project for graph ``G`` into ``project_folder``.

    Generates the main source and header, ``common.mk`` and an
    ``nntool_script`` file, optionally the AutoTiler model and constants, and
    copies the project template from ``$NNTOOL_PATH``.

    Args:
        G: Graph to generate the project for.
        settings: Settings; a deep copy is modified and used.
        project_folder: Destination directory; created when missing.
        script_commands: Command history written to the ``nntool_script`` file.
        overwrite: Regenerate files that already exist.
        performance: Not used in this function.
        quantized: Forwarded to the makefile generators.
        test_results: Execute the graph fully quantized and embed the inputs
            and expected outputs in the main source.
        save_inputs: With ``test_results``, save each dequantized input as
            ``fake_input_<i>.npy``.
        input_file: Input files for ``test_results``; random data is used
            when not given.
        input_args: Keyword arguments for ``import_data``.
        gen_atproject: Also generate ``Model.c``, the expression kernels and
            the constants, and use the atproject makefile.
        dump_tensors: Add ``set graph_dump_tensor 7`` to the script.
        input_tensors: Passed to the main template; replaced when
            ``test_results`` is set.
        tolerance: Passed to the main template.
    """
    settings = deepcopy(settings)
    # these settings are always switched on for generated projects
    settings['graph_monitor_cycles'] = True
    settings['graph_produce_node_names'] = True
    settings['graph_produce_operinfos'] = True

    code_gen = CodeGenerator(G, DefaultNamingConvension(G), settings)

    if not os.path.exists(project_folder):
        os.mkdir(project_folder)

    qoutputs = None
    if test_results:
        # fixed seed so that generated random inputs are reproducible
        np.random.seed(12345)
        finput_tensors = []
        input_tensors = []
        for i, node in enumerate(G.input_nodes()):
            out_q = G.quantization[NodeId(node)].out_qs[0]
            if input_file:
                file_per_input = glob_input_files(input_file, G.num_inputs)[0]
                finput = import_data(file_per_input[i], **input_args)
            else:
                # random data in the range of the input's quantization,
                # or -1.0 to 1.0 when it is floating point
                min_val = out_q.min if not out_q.is_floating else -1.0
                max_val = out_q.max if not out_q.is_floating else 1.0
                finput = get_rand(node.out_dims[0].shape,
                                  low_high=(min_val, max_val))
            finput_tensors.append(finput)
        executer = GraphExecuter(G, qrecs=G.quantization)
        qoutput_tensors = executer.execute(finput_tensors.copy(),
                                           qmode=QuantizationMode.all())
        # expected results are the first output of each graph output step
        qoutputs = []
        for params in G.outputs():
            outp = qoutput_tensors[params.step_idx][0]
            qoutputs.append(outp)
        # inputs are taken from the executed input steps
        for i, params in enumerate(G.input_nodes()):
            inp = qoutput_tensors[params.step_idx][0]
            input_tensors.append(inp)
            if save_inputs:
                nodeq = G.quantization[NodeId(params, None)].out_qs[0]
                np.save(os.path.join(project_folder, f"fake_input_{i}.npy"),
                        nodeq.dequantize(inp))

    main = os.path.join(project_folder, f"{code_gen.project_name}")
    main_c = main + '.c'
    main_h = main + '.h'
    common_mk = os.path.join(project_folder, "common.mk")
    nntool_script = os.path.join(project_folder, "nntool_script")
    # each file below is only written when missing or when overwrite is set
    if overwrite or not os.path.exists(main_c):
        with open(os.path.join(project_folder, f"{code_gen.project_name}.c"),
                  "w") as output_fp:
            output_fp.write(
                generate_main_appl_template(G, code_gen, input_tensors,
                                            qoutputs, tolerance))
    if overwrite or not os.path.exists(main_h):
        with open(os.path.join(project_folder, f"{code_gen.project_name}.h"),
                  "w") as output_fp:
            output_fp.write(generate_main_appl_header(G, code_gen))
    if overwrite or not os.path.exists(common_mk):
        # the last open command of the script is passed to the makefile
        open_args = parse_last_open(script_commands)
        open_args = build_last_open_args(open_args) if open_args else ""
        with open(os.path.join(project_folder, "common.mk"), "w") as output_fp:
            if gen_atproject:
                output_fp.write(
                    generate_main_appl_make_atproject(G, code_gen, quantized,
                                                      'Model.c'))
            else:
                output_fp.write(
                    generate_main_appl_make(G,
                                            code_gen,
                                            quantized,
                                            open_args=open_args))
    if overwrite or not os.path.exists(nntool_script):
        with open(nntool_script, 'w') as fp:
            # NOTE - gen_template_project is excluded so that tests work. Normally it will not be in the
            # history.
            fp.writelines(process_script(script_commands))
            # always add performance since the main template uses it
            for setting in [
                    'set graph_produce_node_names true',
                    'set graph_produce_operinfos true',
                    'set graph_monitor_cycles true'
            ]:
                fp.write(f'{setting}\n')
            if dump_tensors:
                fp.write('set graph_dump_tensor 7\n')

            # make sure that the script finishes by saving the state
            if script_commands[-1] != "save_state":
                fp.write('save_state\n')

    if gen_atproject:
        # a fresh code generator is used for the model, the expression
        # kernels and the constants
        code_gen = CodeGenerator(G, DefaultNamingConvension(G), settings)
        with open(os.path.join(project_folder, 'Model.c'), "w") as output_fp:
            output_fp.write(default_template(G, code_generator=code_gen))
        if G.has_expressions:
            with open(os.path.join(project_folder, "Expression_Kernels.c"),
                      "w") as output_fp:
                output_fp.write(
                    basic_kernel_source_template(G, code_generator=code_gen))
            with open(os.path.join(project_folder, "Expression_Kernels.h"),
                      "w") as output_fp:
                output_fp.write(
                    basic_kernel_header_template(G, code_generator=code_gen))
        code_gen.write_constants(tensor_directory=project_folder)
    # without overwrite, skip_existing_files supplies the ignore callback
    ignore_function = None if overwrite else skip_existing_files(
        project_folder)
    # NOTE(review): os.environ.get returns None when NNTOOL_PATH is not set,
    # which makes os.path.join raise - confirm the variable is always set
    shutil.copytree(os.path.join(os.environ.get("NNTOOL_PATH"),
                                 'generation/project_template'),
                    project_folder,
                    dirs_exist_ok=True,
                    ignore=ignore_function)

    if not gen_atproject:
        # copy the original graph file next to the generated project
        try:
            shutil.copy(
                G.graph_identity.filename,
                os.path.join(project_folder,
                             os.path.split(G.graph_identity.filename)[1]))
        except shutil.SameFileError:
            pass
def do_validate(self, args: argparse.Namespace):
    """
    Validate the model (quantized [-q] or not) in terms of prediction accuracy rate on a given
    dataset (images folder). Ground truth labels can be embedded in files names
    ("filename_03.[png, ppm, pgm]", the number of digits must be coherent with the number of
    networks outputs: e.g. in a 1000 classes problem the last digits must be 3, "file_45.png"
    will raise an error) or can be written in a .json object
    (example: {'file0':label0, 'file1':label1, ...}) and given to the function with --label_json
    """
    # NOTE(review): presumably this docstring doubles as the command's help
    # text (do_* naming) - its wording is kept as in the original; confirm
    # before rewording it.
    #
    # Reads from ``args``: quantize, input_files, vww_instances_file,
    # label_json, class_number, class_thr, binary_classification, silent,
    # prediction_step_idx and progress_every.
    # Side effects: logs per-sample and aggregate results and stores
    # ``validation.labels`` / ``validation.predictions`` in ``self.py_locals``.
    # Returns None.
    self._check_graph()
    if args.quantize:
        self._check_quantized()
        qmode = QuantizationMode.all_dequantize()
    else:
        qmode = QuantizationMode.none()

    LOG.info("quantization mode - %s", qmode)
    input_args = self._get_input_args(args)

    good_predictions = []
    good_margin = 0
    bad_margin = 0

    # Resolve the input groups once, grouped by the graph's number of inputs,
    # so that the total reported to the progress listeners is exactly the
    # number of iterations of the loop below (the count used to be taken from
    # a second glob call made without the number of graph inputs).
    input_groups = list(glob_input_files(args.input_files, self.G.num_inputs))
    number_samples = len(input_groups)

    # Pick the ground-truth source by priority: VWW instances file, JSON
    # labels, one fixed class number, otherwise labels taken from file names.
    if args.vww_instances_file:
        validation = ValidateFromVWWInstances(
            args.vww_instances_file,
            class_thr=args.class_thr,
            binary_classification=args.binary_classification)
    elif args.label_json:
        validation = ValidateFromJSON(
            args.label_json,
            class_thr=args.class_thr,
            binary_classification=args.binary_classification)
    elif args.class_number is not None:
        validation = ValidateFromClass(
            args.class_number,
            class_thr=args.class_thr,
            binary_classification=args.binary_classification)
    else:
        validation = ValidateFromName(
            class_thr=args.class_thr,
            binary_classification=args.binary_classification)

    ExecutionProgress.start()
    try:
        for i, file_per_input in enumerate(input_groups):
            if not args.silent:
                LOG.info("input file %s", file_per_input)
            data = [
                import_data(input_file, **input_args)
                for input_file in file_per_input
            ]

            executer = GraphExecuter(self.G, qrecs=self.G.quantization)
            outputs = executer.execute(data, qmode=qmode, silent=args.silent)

            # Only the output of the step selected by prediction_step_idx is
            # scored; the first file of the group identifies the sample.
            predicted_values = np.asarray(outputs[args.prediction_step_idx])
            good_prediction, class_predicted, real_class, margin = \
                validation.validate(file_per_input[0], predicted_values)
            good_predictions.append(good_prediction)
            if good_prediction:
                good_margin += margin
            else:
                bad_margin += margin

            if not args.silent:
                LOG.info(
                    'Prediction is %s predicted %s correct %s margin %s',
                    good_prediction, class_predicted, real_class, margin)
            if not i % args.progress_every and i > 0:
                LOG.info(
                    'ACCURACY: %.3f %%',
                    100 * sum(good_predictions) / len(good_predictions))

            ExecutionProgress.progress(i, number_samples)
    except (KeyboardInterrupt, SystemExit):
        # Deliberate best effort: an interrupted run still reports the
        # statistics gathered so far.
        pass
    finally:
        # Always pair start() with end(), including when the run is
        # interrupted or a sample raises.
        ExecutionProgress.end()

    self.py_locals['labels'] = validation.labels
    self.py_locals['predictions'] = validation.predictions

    cnt = len(good_predictions)
    if cnt:
        ngood = sum(good_predictions)
        nbad = cnt - ngood
        if nbad:
            LOG.info(
                "%s out of %s predicted falsly with %s average margin",
                nbad, cnt, bad_margin / nbad)
        if ngood:
            LOG.info(
                "%s out of %s predicted correctly with %s average margin",
                ngood, cnt, good_margin / ngood)
        # Same value as 100 * sum(good_predictions) / len(good_predictions).
        accuracy_rate = 100 * ngood / cnt
        LOG.info('Total accuracy: %.3f %%', accuracy_rate)
def do_dump(self, args: argparse.Namespace):
    """
    Dump the activations resulting from running an input file through the graph.
    You can use the current quantization settings and can also just quantify one
    specific step of the graph."""
    # NOTE(review): presumably this docstring doubles as the command's help
    # text (do_* naming) - its wording is kept as in the original.
    #
    # Depending on ``args`` the collected outputs are printed, pickled to the
    # file ``args.pickle``, stored in ``self.tensor_store[args.save]`` and/or
    # kept in ``self.last_result`` (when ``self._in_py`` is set).
    # Returns None.
    self._check_graph()

    # An explicit args.dequantize wins; otherwise dequantize only when the
    # result is neither pickled nor saved.
    if args.dequantize is None:
        dequantize = not (args.pickle or args.save)
    else:
        dequantize = args.dequantize

    # Any of the quantized modes requires a quantized graph.
    if args.quantize or args.quantize_step or args.quantize_all_steps:
        self._check_quantized()

    # Mode priority: quantize, quantize_all_steps, quantize_step,
    # quantize_and_dequantize, then plain float execution.
    if args.quantize:
        qmode = (QuantizationMode.all_dequantize() if dequantize
                 else QuantizationMode.all())
    elif args.quantize_all_steps:
        qmode = QuantizationMode.step_all()
        # NOTE(review): this value is not read again in this method; the call
        # to format_dump_file below receives args.dequantize - confirm intent.
        dequantize = True
    elif args.quantize_step:
        qmode = QuantizationMode.step(args.quantize_step)
    elif args.quantize_and_dequantize:
        qmode = QuantizationMode.all_float_quantize_dequantize()
    else:
        qmode = QuantizationMode.none()

    # Normalise the optional step limit; negative values count from the end.
    step = args.step
    if step is not None:
        num_steps = len(self.G.graph_state.steps)
        if step < 0:
            step += num_steps
        if not 0 <= step <= num_steps:
            self.perror("step must be from {} to {}".format(-num_steps, num_steps))
            return

    input_args = self._get_input_args(args)
    collected = []

    for file_group in glob_input_files(args.input_files, self.G.num_inputs):
        LOG.info("input file %s", file_group)
        tensors = [import_data(path, **input_args) for path in file_group]
        outputs = GraphExecuter(self.G, qrecs=self.G.quantization).execute(
            tensors, step_idx_limit=step, qmode=qmode)

        if args.pickle or self._in_py or args.save:
            # The result is wanted as data: keep it instead of printing.
            collected.append(
                format_dump_file(self.G, outputs, not qmode.is_none,
                                 args.dequantize, args.quantize_step))
        else:
            self.G.print_intermediates(outputs,
                                       limit=step,
                                       width=args.number_width,
                                       precision=args.precision,
                                       channel=args.channel,
                                       order=['c', 'h', 'w'])

    # Nothing more to do when the outputs were only printed.
    if not (args.pickle or args.save or self._in_py):
        return

    if not collected:
        self.perror("no input files found")
        return

    # NOTE(review): unwrapping is keyed on the number of input_files
    # arguments, not on how many results were collected - confirm that a
    # single argument can never yield several results.
    result = collected[0] if len(args.input_files) == 1 else collected

    if args.pickle:
        with open(args.pickle, 'wb') as pickle_fp:
            pickle.dump(result, pickle_fp)
    if args.save:
        self.tensor_store[args.save] = result
    if self._in_py:
        self.last_result = result