def quantize_statically(model, inputs, data_loader, linear_only=False, module_swap=False):
    log_feature_usage("export.quantize.statically")
    if (
        hasattr(model, "encoder")
        and isinstance(model.encoder, RoBERTaEncoder)
        and linear_only
    ):
        log_accelerator_feature_usage("quantize.statically")
        qconfig = QConfig(
            activation=HistogramObserver.with_args(reduce_range=False),
            weight=default_weight_observer,
        )
        qconfig_dict = {"": None}
        if module_swap:
            layers = model.encoder.encoder.transformer.layers.layers
            layers_str = "encoder.encoder.transformer.layers.layers"
        else:
            layers = model.encoder.encoder.transformer.layers
            layers_str = "encoder.encoder.transformer.layers"
        # skip the first layer
        for layer_idx in range(1, len(layers)):
            qconfig_dict[
                layers_str + ".{}.attention.input_projection".format(layer_idx)
            ] = qconfig
            qconfig_dict[
                layers_str + ".{}.attention.output_projection".format(layer_idx)
            ] = qconfig
            for mlp_idx, m in enumerate(layers[layer_idx].residual_mlp.mlp):
                # Only quantize the first linear layer, otherwise there are accuracy issues
                if type(m) == torch.nn.Linear and mlp_idx < 1:
                    qconfig_dict[
                        layers_str
                        + ".{}.residual_mlp.mlp.{}".format(layer_idx, mlp_idx)
                    ] = qconfig
        trace = model.graph_mode_quantize(
            inputs, data_loader, qconfig_dict=qconfig_dict, force_quantize=True
        )
    else:
        trace = model.graph_mode_quantize(inputs, data_loader)
    return trace

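# --- Illustrative sketch, not part of the original module ---
# quantize_statically builds a qconfig_dict that maps fully qualified submodule
# paths to a QConfig (a histogram observer for activations, the default weight
# observer for weights), while the root key "" stays None so nothing is
# quantized by default. The helper below only demonstrates that mapping shape;
# the path "encoder.layer.1.linear" is a made-up example, real keys follow the
# RoBERTa layer layout used above.
def _example_qconfig_dict():
    from torch.quantization import (
        QConfig,
        HistogramObserver,
        default_weight_observer,
    )

    example_qconfig = QConfig(
        activation=HistogramObserver.with_args(reduce_range=False),
        weight=default_weight_observer,
    )
    # "" is the root module: None means "do not quantize by default";
    # individual submodules opt in by their dotted path.
    return {"": None, "encoder.layer.1.linear": example_qconfig}
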
def torchscript_export(
    self,
    model,
    export_path=None,
    sort_input=False,
    sort_key=1,
    export_config=None,
):
    # TODO(T88310041) Remove These
    torch._C._jit_set_profiling_executor(True)
    torch._C._jit_set_profiling_mode(False)
    # unpack export config
    if export_config is None:
        export_config = ExportConfig()

    quantize = export_config.torchscript_quantize
    seq_padding_control = export_config.seq_padding_control
    batch_padding_control = export_config.batch_padding_control

    accel = AccelerateOptions(export_config.accelerate)
    print(f"Using accelerate options: {accel.__dict__}")

    # what hosts can this model run on
    # by default, pytext works on CPU and CUDA (because it implements set_device)
    model_host = ["cpu", "cuda"]
    if accel.use_cuda:
        # CUDA FP16 models only work on CUDA
        model_host = ["cuda"]
    if accel.use_nnpi:
        model_host = ["nnpi"]
    if hasattr(model, "set_host"):
        model.set_host(model_host)

    # what is the type of this model
    # pytext models are nlp models
    model_type = ["nlp"]
    instance_paths_p = any(
        True for _ in find_module_instances(model, RoBERTaEncoder, [])
    )
    if instance_paths_p:
        model_type.append("transformer")
    instance_paths_p = any(
        True for _ in find_module_instances(model, BiLSTM, [])
    )
    if instance_paths_p:
        model_type.append("BiLSTM")
    if hasattr(model, "set_type"):
        model.set_type(model_type)

    # Make sure to put the model on CPU and disable CUDA before exporting to
    # ONNX to disable any data_parallel pieces
    cuda.CUDA_ENABLED = False
    model.cpu()
    optimizer = self.trainer.optimizer
    optimizer.pre_export(model)

    model = rewrite_nnpi_modules(model, accel)

    # Trace needs eval mode, to disable dropout etc
    model.eval()
    model.prepare_for_onnx_export_()

    unused_raw_batch, batch = next(
        iter(self.data.batches(Stage.TRAIN, load_early=True))
    )
    inputs = model.onnx_trace_input(batch)
    # call model forward to set correct device types
    if sort_input:
        _, sorted_indices = sort(inputs[sort_key], descending=True)
        inputs = [i.index_select(0, sorted_indices) for i in inputs]
    model(*inputs)

    # Default to dynamic
    if accel.use_fx_quantize:
        data_loader = self.data.batches(Stage.TRAIN, load_early=False)
        trace = quantize_fx(
            model,
            inputs,
            data_loader,
            accel.use_nnpi_fx_dynamic_quantize or accel.use_cpu_fx_dynamic_quantize,
            accel.use_nnpi_fx_static_selectively_quantize
            or accel.use_cpu_fx_static_selectively_quantize,
        )
    elif (quantize or accel.use_nnpi_quantize) and hasattr(
        model, "graph_mode_quantize"
    ):
        data_loader = self.data.batches(Stage.TRAIN, load_early=False)
        print("Quantizing the model ...")
        # recognize legacy nnpi_q or $platform:$option syntax
        quantize_linear_only = accel.use_nnpi_quantize
        module_swap = accel.use_nnpi
        trace = quantize_statically(
            model, inputs, data_loader, quantize_linear_only, module_swap
        )
    else:
        if quantize:
            log_feature_usage("quantize.dynamically.CPU")
            model.quantize()
        if accel.use_cuda and (accel.use_cuda_half_ft or accel.use_cuda_dq):
            log_accelerator_feature_usage("build.CUDA.half.faster_transformers")
            # We need a separate path for GPU-only tracing, as we can't just trace a CPU model
            # and invoke .cuda().half(),
            # as we don't have equivalent CPU implementations of these operators.
            precision.FP16_ENABLED = True
            cuda.CUDA_ENABLED = True
            if accel.use_cuda_dq:
                model = swap_modules(model, MODULE_TO_REWRITER["cuda-dq"])
            else:
                model = swap_modules(model, MODULE_TO_REWRITER["cuda"])
            model.eval()
            model.half().cuda()
            # obtain new inputs with cuda/fp16 enabled.
            unused_raw_batch, batch = next(
                iter(self.data.batches(Stage.TRAIN, load_early=True))
            )
            inputs = model.onnx_trace_input(batch)
            trace = model.trace(inputs)
            print("Traced (faster_transformers)!")
            # should be unnecessary.
            trace.cuda().half()
            unused_raw_batch, batch = next(
                iter(self.data.batches(Stage.TRAIN, load_early=True))
            )
            inputs = model.onnx_trace_input(batch)
            results = trace(*inputs)
            assert results
        else:
            trace = model.trace(inputs)
            print("Traced!")
            if accel.use_cuda and accel.use_cuda_half:
                log_accelerator_feature_usage("build.CUDA.half")
                # convert trace to half precision
                trace.cuda().half()

                #### trace test: demonstrate that it is usable
                precision.FP16_ENABLED = True
                cuda.CUDA_ENABLED = True
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True))
                )
                inputs = model.onnx_trace_input(batch)
                assert trace(*inputs)
                #### end of trace test

    if hasattr(model, "torchscriptify"):
        trace = model.torchscriptify(self.data.tensorizers, trace)
    if hasattr(trace, "validate"):
        trace.validate(export_config)
    if seq_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("sequence_length", seq_padding_control)
        else:
            print(
                "Padding_control not supported by model. Ignoring seq_padding_control"
            )
    if batch_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("batch_length", batch_padding_control)
        else:
            print(
                "Padding_control not supported by model. Ignoring batch_padding_control"
            )
    trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)
    if accel.use_nnpi and not accel.use_nnpi_split:
        print("Lowering using to_glow")
        trace = lower_modules_to_accelerator(
            model,
            trace,
            export_config,
            accel.use_nnpi_throughput_optimized,
            accel.use_nnpi_throughput_maximized,
            accel.use_nnpi_gelu_clip,
        )
    if accel.use_nnpi_split:
        print("Lowering split model to Glow")
        trace = lower_split_model_to_accelerator(model, trace, export_config)

    if export_path is not None:
        print(f"Saving torchscript model to: {export_path}")
        with PathManager.open(export_path, "wb") as f:
            torch.jit.save(trace, f)
    return trace

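# --- Illustrative sketch, not part of the original module ---
# A minimal example of how this exporter is typically driven, assuming a
# task-like object that owns the trained model and exposes this
# torchscript_export method. The ExportConfig field names mirror those read at
# the top of the method; the "cuda:half" accelerate string and the padding
# value lists are illustrative assumptions, not the only valid options.
def example_export(task, output_path="/tmp/model.torchscript"):
    config = ExportConfig()
    config.accelerate = ["cuda:half"]
    config.torchscript_quantize = False
    config.seq_padding_control = [0, 64, 256]
    config.batch_padding_control = [0, 1, 8]
    return task.torchscript_export(
        task.model, export_path=output_path, export_config=config
    )
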
def torchscript_export(
    self,
    model,
    export_path=None,
    sort_input=False,
    sort_key=1,
    export_config=None,
):
    # unpack export config
    if export_config is None:
        export_config = ExportConfig()

    quantize = export_config.torchscript_quantize
    accelerate = export_config.accelerate
    seq_padding_control = export_config.seq_padding_control
    batch_padding_control = export_config.batch_padding_control
    inference_interface = export_config.inference_interface

    # introduce a single nnpi:quantize that obviates the need for torchscript quantize on NNPI
    use_nnpi = ("nnpi" in accelerate) or ("nnpi:quantize" in accelerate)
    use_cuda_half = "cuda:half" in accelerate
    use_nnpi_quantize = "nnpi:quantize" in accelerate

    # Make sure to put the model on CPU and disable CUDA before exporting to
    # ONNX to disable any data_parallel pieces
    cuda.CUDA_ENABLED = False
    model.cpu()
    optimizer = self.trainer.optimizer
    optimizer.pre_export(model)

    if use_nnpi:
        model = swap_modules_for_accelerator(model)

    # Trace needs eval mode, to disable dropout etc
    model.eval()
    model.prepare_for_onnx_export_()

    unused_raw_batch, batch = next(
        iter(self.data.batches(Stage.TRAIN, load_early=True))
    )
    inputs = model.onnx_trace_input(batch)
    # call model forward to set correct device types
    if sort_input:
        _, sorted_indices = sort(inputs[sort_key], descending=True)
        inputs = [i.index_select(0, sorted_indices) for i in inputs]
    model(*inputs)

    if (quantize or use_nnpi_quantize) and hasattr(model, "graph_mode_quantize"):
        data_loader = self.data.batches(Stage.TRAIN, load_early=False)
        print("Quantizing the model ...")
        # recognize legacy nnpi_q or $platform:$option syntax
        quantize_linear_only = use_nnpi_quantize or ("nnpi_quantize" in accelerate)
        module_swap = use_nnpi
        trace = quantize_statically(
            model, inputs, data_loader, quantize_linear_only, module_swap
        )
    else:
        if quantize:
            log_feature_usage("quantize.dynamically.CPU")
            model.quantize()
        trace = model.trace(inputs)
        print("traced!")
        if use_cuda_half:
            log_accelerator_feature_usage("build.CUDA.half")
            # convert trace to half precision
            trace.cuda().half()

            #### trace test: demonstrate that it is usable
            precision.FP16_ENABLED = True
            cuda.CUDA_ENABLED = True
            unused_raw_batch, batch = next(
                iter(self.data.batches(Stage.TRAIN, load_early=True))
            )
            inputs = model.onnx_trace_input(batch)
            assert trace(*inputs)
            #### end of trace test

    if hasattr(model, "torchscriptify"):
        trace = model.torchscriptify(self.data.tensorizers, trace)
    if hasattr(trace, "validate"):
        trace.validate(export_config)
    if seq_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("sequence_length", seq_padding_control)
        else:
            print(
                "Padding_control not supported by model. Ignoring seq_padding_control"
            )
    if batch_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("batch_length", batch_padding_control)
        else:
            print(
                "Padding_control not supported by model. Ignoring batch_padding_control"
            )
    if inference_interface is not None:
        if hasattr(trace, "inference_interface"):
            trace.inference_interface(inference_interface)
        else:
            print(
                "inference_interface not supported by model. Ignoring inference_interface"
            )
    trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)
    if use_nnpi:
        print("lowering using to_glow")
        trace = lower_modules_to_accelerator(model, trace, export_config)

    if export_path is not None:
        print(f"Saving torchscript model to: {export_path}")
        with PathManager.open(export_path, "wb") as f:
            torch.jit.save(trace, f)
    return trace

def torchscript_export(
    self,
    model,
    export_path=None,
    sort_input=False,
    sort_key=1,
    export_config=None,
):
    # TODO(T88310041) Remove These
    torch._C._jit_set_profiling_executor(True)
    torch._C._jit_set_profiling_mode(False)
    # unpack export config
    if export_config is None:
        export_config = ExportConfig()

    quantize = export_config.torchscript_quantize
    accelerate = export_config.accelerate
    seq_padding_control = export_config.seq_padding_control
    batch_padding_control = export_config.batch_padding_control

    # introduce a single nnpi:quantize that obviates the need for torchscript quantize on NNPI
    use_nnpi = ("nnpi" in accelerate) or ("nnpi:quantize" in accelerate)
    use_nnpi_throughput_optimized = "nnpi:throughput_optimized" in accelerate
    use_cuda_half = "cuda:half" in accelerate
    use_cuda_half_faster_transformers = "cuda:half:ft" in accelerate

    use_nnpi_quantize = "nnpi:quantize" in accelerate
    use_nnpi_fx_static_quantize = "nnpi:fx_static_quantize" in accelerate
    use_nnpi_fx_static_selectively_quantize = (
        "nnpi:fx_static_selectively_quantize" in accelerate
    )
    use_nnpi_fx_dynamic_quantize = "nnpi:fx_dynamic_quantize" in accelerate
    use_cpu_fx_static_quantize = "cpu:fx_static_quantize" in accelerate
    use_cpu_fx_static_selectively_quantize = (
        "cpu:fx_static_selectively_quantize" in accelerate
    )
    use_cpu_fx_dynamic_quantize = "cpu:fx_dynamic_quantize" in accelerate
    use_fx_quantize = (
        use_nnpi_fx_static_quantize
        or use_nnpi_fx_static_selectively_quantize
        or use_nnpi_fx_dynamic_quantize
        or use_cpu_fx_static_quantize
        or use_cpu_fx_static_selectively_quantize
        or use_cpu_fx_dynamic_quantize
    )

    # Make sure to put the model on CPU and disable CUDA before exporting to
    # ONNX to disable any data_parallel pieces
    cuda.CUDA_ENABLED = False
    model.cpu()
    optimizer = self.trainer.optimizer
    optimizer.pre_export(model)

    if use_nnpi or use_fx_quantize:
        model = swap_modules(model, MODULE_TO_REWRITER["nnpi"])
    if "nnpi:split" in accelerate:
        model = split_model_for_accelerator(model)

    # Trace needs eval mode, to disable dropout etc
    model.eval()
    model.prepare_for_onnx_export_()

    unused_raw_batch, batch = next(
        iter(self.data.batches(Stage.TRAIN, load_early=True))
    )
    inputs = model.onnx_trace_input(batch)
    # call model forward to set correct device types
    if sort_input:
        _, sorted_indices = sort(inputs[sort_key], descending=True)
        inputs = [i.index_select(0, sorted_indices) for i in inputs]
    model(*inputs)

    # Default to dynamic
    if use_fx_quantize:
        data_loader = self.data.batches(Stage.TRAIN, load_early=False)
        trace = quantize_fx(
            model,
            inputs,
            data_loader,
            use_nnpi_fx_dynamic_quantize or use_cpu_fx_dynamic_quantize,
            use_nnpi_fx_static_selectively_quantize
            or use_cpu_fx_static_selectively_quantize,
        )
    elif (quantize or use_nnpi_quantize) and hasattr(model, "graph_mode_quantize"):
        data_loader = self.data.batches(Stage.TRAIN, load_early=False)
        print("Quantizing the model ...")
        # recognize legacy nnpi_q or $platform:$option syntax
        quantize_linear_only = use_nnpi_quantize or ("nnpi_quantize" in accelerate)
        module_swap = use_nnpi
        trace = quantize_statically(
            model, inputs, data_loader, quantize_linear_only, module_swap
        )
    else:
        if quantize:
            log_feature_usage("quantize.dynamically.CPU")
            model.quantize()
        if use_cuda_half_faster_transformers:
            log_accelerator_feature_usage("build.CUDA.half.faster_transformers")
            # We need a separate path for GPU-only tracing, as we can't just trace a CPU model
            # and invoke .cuda().half(),
            # as we don't have equivalent CPU implementations of these operators.
            precision.FP16_ENABLED = True
            cuda.CUDA_ENABLED = True
            model = swap_modules(model, MODULE_TO_REWRITER["cuda"])
            model.eval()
            model.half().cuda()
            # obtain new inputs with cuda/fp16 enabled.
            unused_raw_batch, batch = next(
                iter(self.data.batches(Stage.TRAIN, load_early=True))
            )
            inputs = model.onnx_trace_input(batch)
            trace = model.trace(inputs)
            print("traced (faster_transformers)!")
            # should be unnecessary.
            trace.cuda().half()
            unused_raw_batch, batch = next(
                iter(self.data.batches(Stage.TRAIN, load_early=True))
            )
            inputs = model.onnx_trace_input(batch)
            assert trace(*inputs)
        else:
            trace = model.trace(inputs)
            print("traced!")
            if use_cuda_half:
                log_accelerator_feature_usage("build.CUDA.half")
                # convert trace to half precision
                trace.cuda().half()

                #### trace test: demonstrate that it is usable
                precision.FP16_ENABLED = True
                cuda.CUDA_ENABLED = True
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True))
                )
                inputs = model.onnx_trace_input(batch)
                assert trace(*inputs)
                #### end of trace test

    if hasattr(model, "torchscriptify"):
        trace = model.torchscriptify(self.data.tensorizers, trace)
    if hasattr(trace, "validate"):
        trace.validate(export_config)
    if seq_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("sequence_length", seq_padding_control)
        else:
            print(
                "Padding_control not supported by model. Ignoring seq_padding_control"
            )
    if batch_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("batch_length", batch_padding_control)
        else:
            print(
                "Padding_control not supported by model. Ignoring batch_padding_control"
            )
    trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)
    if use_nnpi:
        print("lowering using to_glow")
        trace = lower_modules_to_accelerator(
            model, trace, export_config, use_nnpi_throughput_optimized
        )
    if "split" in accelerate:
        print("lowering split model to glow")
        trace = lower_split_model_to_accelerator(model, trace, export_config)

    if export_path is not None:
        print(f"Saving torchscript model to: {export_path}")
        with PathManager.open(export_path, "wb") as f:
            torch.jit.save(trace, f)
    return trace