def __init__(self, config: Config, output_encoded_layers: bool, **kwarg) -> None:
    """Assemble the transformer encoder described by ``config``.

    The constructor resolves ``config.model_path`` through
    ``resources.roberta.RESOURCE_MAP``, builds ``config.num_encoder_layers``
    transformer layers with the attention flavour selected by the config,
    wraps them in either a ``SentenceEncoder`` or a ``PostEncoder``, applies
    ``init_params`` and finally loads a stored state when a model path is set.

    Args:
        config: Encoder configuration. ``config.model_path`` is overwritten
            with the resolved path.
        output_encoded_layers: Forwarded unchanged to the parent constructor.
        **kwarg: Accepted for call compatibility; not used in this method.
    """
    super().__init__(config, output_encoded_layers=output_encoded_layers)

    # Translate a known resource alias into the real model path.
    resource_map = resources.roberta.RESOURCE_MAP
    requested_path = config.model_path
    if requested_path in resource_map:
        config.model_path = resource_map[requested_path]
    else:
        config.model_path = requested_path

    # With linear multi-head attention a single compression layer is created
    # up front and shared by every transformer layer.
    if config.use_linformer_encoder:
        uncompressed_len = config.max_seq_len - 2
        compress_layer = nn.Linear(
            uncompressed_len,
            uncompressed_len // config.linformer_compressed_ratio,
        )

    self.use_selfie_encoder = config.use_selfie_encoder

    # Pick the attention factory once; it is invoked once per layer below.
    if not config.use_linformer_encoder:

        def make_attention():
            return MultiheadSelfAttention(
                embed_dim=config.embedding_dim,
                num_heads=config.num_attention_heads,
            )

    else:
        if config.linformer_quantize:
            linear_attention_cls = QuantizedMultiheadLinearAttention
        else:
            linear_attention_cls = MultiheadLinearAttention

        def make_attention():
            return linear_attention_cls(
                embed_dim=config.embedding_dim,
                num_heads=config.num_attention_heads,
                compress_layer=compress_layer,
            )

    layers = []
    for _ in range(config.num_encoder_layers):
        layers.append(
            TransformerLayer(
                embedding_dim=config.embedding_dim,
                attention=make_attention(),
            )
        )

    # Both transformer variants take the same constructor arguments here.
    transformer_kwargs = dict(
        vocab_size=config.vocab_size,
        embedding_dim=config.embedding_dim,
        layers=layers,
        max_seq_len=config.max_seq_len,
    )
    if self.use_selfie_encoder:
        self.encoder = PostEncoder(
            transformer=SELFIETransformer(**transformer_kwargs)
        )
    else:
        self.encoder = SentenceEncoder(transformer=Transformer(**transformer_kwargs))

    self.apply(init_params)

    if config.model_path:

        def restore_on_cpu(storage, location):
            return default_restore_location(storage, "cpu")

        with PathManager.open(config.model_path, "rb") as checkpoint_file:
            roberta_state = torch.load(checkpoint_file, map_location=restore_on_cpu)

        # A state saved after finetuning in this framework is loaded as-is;
        # otherwise the encoder translates the original state dict itself.
        if config.is_finetuned:
            self.load_state_dict(roberta_state)
        else:
            self.encoder.load_roberta_state_dict(roberta_state["model"])

    self.export_encoder = config.export_encoder
    self.variable_size_embedding = config.variable_size_embedding
    self.use_linformer_encoder = config.use_linformer_encoder
    log_class_usage(__class__)
def __init__(  # noqa C901
    self,
    config: Config,
    output_encoded_layers: bool,
    token_embedding: nn.Embedding = None,
    **kwarg,
) -> None:
    """Assemble the transformer encoder described by ``config``.

    Steps, in order: resolve ``config.model_path`` through
    ``resources.roberta.RESOURCE_MAP``; build ``config.num_encoder_layers``
    transformer layers; wrap them in a ``SentenceEncoder``, ``PostEncoder``
    or ``PassthroughEncoder``; apply ``init_params``; optionally prune
    (before or after loading, per ``config.prune_before_load``); load a
    stored state when a model path is set; optionally freeze every
    non-bias encoder parameter.

    Args:
        config: Encoder configuration. ``config.model_path`` is overwritten
            with the resolved path.
        output_encoded_layers: Forwarded unchanged to the parent constructor.
        token_embedding: Passed to ``Transformer`` as ``token_embedding``;
            only used on the ``SentenceEncoder`` path.
        **kwarg: Accepted for call compatibility; not used in this method.
    """
    super().__init__(config, output_encoded_layers=output_encoded_layers)

    # map to the real model_path
    config.model_path = (
        resources.roberta.RESOURCE_MAP[config.model_path]
        if config.model_path in resources.roberta.RESOURCE_MAP
        else config.model_path
    )

    # sharing compression across each layers
    # create compress layer if use linear multihead attention
    if config.use_linformer_encoder:
        compress_layer = nn.Linear(
            config.max_seq_len - 2,
            (config.max_seq_len - 2) // config.linformer_compressed_ratio,
        )

    self.use_selfie_encoder = config.use_selfie_encoder
    self.skip_token_embed = config.skip_token_embed

    if config.use_linformer_encoder:
        if config.linformer_quantize:
            layers = [
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=QuantizedMultiheadLinearAttention(
                        embed_dim=config.embedding_dim,
                        num_heads=config.num_attention_heads,
                        compress_layer=compress_layer,
                    ),
                )
                for _ in range(config.num_encoder_layers)
            ]
        else:
            layers = [
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=MultiheadLinearAttention(
                        embed_dim=config.embedding_dim,
                        num_heads=config.num_attention_heads,
                        compress_layer=compress_layer,
                    ),
                )
                for _ in range(config.num_encoder_layers)
            ]
    else:
        layers = [
            TransformerLayer(
                embedding_dim=config.embedding_dim,
                attention=MultiheadSelfAttention(
                    embed_dim=config.embedding_dim,
                    num_heads=config.num_attention_heads,
                    scaling=config.scaling,
                ),
                normalize_before=config.normalize_before,
            )
            for _ in range(config.num_encoder_layers)
        ]

    if config.skip_token_embed:
        self.encoder = PassthroughEncoder(
            transformer=PassthroughTransformer(
                vocab_size=config.vocab_size,
                embedding_dim=config.embedding_dim,
                layers=layers,
                max_seq_len=config.max_seq_len,
                normalize_before=config.normalize_before,
            )
        )
    elif self.use_selfie_encoder:
        self.encoder = PostEncoder(
            transformer=SELFIETransformer(
                vocab_size=config.vocab_size,
                embedding_dim=config.embedding_dim,
                layers=layers,
                max_seq_len=config.max_seq_len,
            )
        )
    else:
        self.encoder = SentenceEncoder(
            transformer=Transformer(
                vocab_size=config.vocab_size,
                embedding_dim=config.embedding_dim,
                layers=layers,
                max_seq_len=config.max_seq_len,
                normalize_before=config.normalize_before,
                token_embedding=token_embedding,
            )
        )

    self.apply(init_params)

    if config.prune_before_load:
        self._prune_transformer_layers_and_heads(config)

    if config.model_path:
        # NOTE(review): torch.load unpickles the file; model_path is assumed
        # to point at a trusted checkpoint.
        with PathManager.open(config.model_path, "rb") as f:
            roberta_state = torch.load(
                f, map_location=lambda s, l: default_restore_location(s, "cpu")
            )
        # In case the model has previously been loaded in PyText and finetuned,
        # then we dont need to do the special state dict translation. Load
        # it directly
        if not config.is_finetuned:
            self.encoder.load_roberta_state_dict(roberta_state["model"])
        elif config.load_partial_model is not None:
            # Select the entries that live under "<load_partial_model>." and
            # strip exactly that leading prefix. Matching on the dotted
            # prefix keeps out siblings that merely share the same leading
            # characters, and slicing (rather than str.replace) leaves any
            # later occurrence of the same text inside the key untouched.
            prefix = config.load_partial_model + "."
            roberta_state = {
                k[len(prefix):]: v
                for k, v in roberta_state["model_state"].items()
                if k.startswith(prefix)
            }
            self.load_state_dict(roberta_state)
        else:
            self.load_state_dict(roberta_state)

    if config.use_bias_finetuning:
        for (n, p) in self.encoder.named_parameters():
            # "encoder.transformer.layers.0.attention.input_projection.weight" -> false
            # "encoder.transformer.layers.0.attention.input_projection.bias" -> true
            if n.split(".")[-1] != "bias":
                p.requires_grad_(False)

    if not config.prune_before_load:
        self._prune_transformer_layers_and_heads(config)

    self.export_encoder = config.export_encoder
    self.variable_size_embedding = config.variable_size_embedding
    self.use_linformer_encoder = config.use_linformer_encoder
    log_class_usage(__class__)
def __init__(
    self, config: Config, output_encoded_layers: bool, *args, **kwargs
) -> None:
    """Create a ``BertModel`` from a checkpoint directory and store it as ``self.bert``.

    The model configuration is read from ``config.json`` inside
    ``config.bert_cpt_dir``. When ``config.load_weights`` is set and
    ``pytorch_model.bin`` exists in that directory, its weights are copied
    into the model; missing keys, unexpected keys and load errors are
    reported on stdout rather than raised.

    Args:
        config: Encoder configuration providing ``bert_cpt_dir`` and
            ``load_weights``.
        output_encoded_layers: Forwarded unchanged to the parent constructor.
        *args: Accepted for call compatibility; not used in this method.
        **kwargs: Accepted for call compatibility; not used in this method.
    """
    super().__init__(config, output_encoded_layers=output_encoded_layers)

    # Load config
    config_file = os.path.join(config.bert_cpt_dir, "config.json")
    local_config_path = PathManager.get_local_path(config_file)
    bert_config = BertConfig.from_json_file(local_config_path)
    print("Bert model config {}".format(bert_config))

    # Instantiate model.
    model = BertModel(bert_config)
    weights_path = os.path.join(config.bert_cpt_dir, "pytorch_model.bin")

    # load pre-trained weights if weights_path exists
    if config.load_weights and PathManager.isfile(weights_path):
        # Map storages to CPU so a checkpoint written from a GPU process can
        # still be read on a host without that device; the values are copied
        # into the model's own parameters below.
        # NOTE(review): torch.load unpickles the file; the checkpoint
        # directory is assumed to be trusted.
        with PathManager.open(weights_path, "rb") as fd:
            state_dict = torch.load(fd, map_location="cpu")

        missing_keys: List[str] = []
        unexpected_keys: List[str] = []
        error_msgs: List[str] = []

        # Keep the metadata so it can be re-attached after keys are renamed.
        metadata = getattr(state_dict, "_metadata", None)
        for key in list(state_dict.keys()):
            new_key = None
            if key.endswith("LayerNorm.gamma"):  # compatibility with v0.5 models
                new_key = key.replace("LayerNorm.gamma", "LayerNorm.weight")
            if key.endswith("LayerNorm.beta"):  # compatibility with v0.5 models
                new_key = key.replace("LayerNorm.beta", "LayerNorm.bias")
            if new_key is not None:
                state_dict[new_key] = state_dict.pop(key)

        if metadata is not None:
            state_dict._metadata = metadata

        def load(module, prefix=""):
            # Load this module's own entries, then recurse into its children
            # with the child name appended to the key prefix.
            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
            module._load_from_state_dict(
                state_dict,
                prefix,
                local_metadata,
                True,
                missing_keys,
                unexpected_keys,
                error_msgs,
            )
            for name, child in module._modules.items():
                if child is not None:
                    load(child, prefix + name + ".")

        load(model, prefix="" if hasattr(model, "bert") else "bert.")

        if missing_keys:
            print(
                "Weights of {} not initialized from pretrained model: {}".format(
                    model.__class__.__name__, missing_keys
                )
            )
        if unexpected_keys:
            print(
                "Weights from pretrained model not used in {}: {}".format(
                    model.__class__.__name__, unexpected_keys
                )
            )
        # Errors collected by _load_from_state_dict used to be dropped
        # silently; surface them the same way as the key reports above
        # (reported, not raised, to keep loading best-effort).
        if error_msgs:
            print(
                "Error(s) in loading state_dict for {}:\n\t{}".format(
                    model.__class__.__name__, "\n\t".join(error_msgs)
                )
            )

    self.bert = model
    log_class_usage(__class__)
def torchscript_export(
    self,
    model,
    export_path=None,
    sort_input=False,
    sort_key=1,
    export_config=None,
):
    """Trace ``model`` to TorchScript, optionally quantize/lower it, and save it.

    The ``accelerate`` entries of the export config select the host list set
    on the model, module rewrites, the quantization path, half-precision CUDA
    tracing and accelerator lowering. The method mutates the module-level
    flags ``cuda.CUDA_ENABLED`` and ``precision.FP16_ENABLED`` and does not
    restore them.

    Args:
        model: Model to export. It is moved to CPU and put in eval mode
            before tracing.
        export_path: When not ``None``, the trace is written there with
            ``torch.jit.save``.
        sort_input: When true, every trace input is reordered along dim 0 by
            a descending sort of ``inputs[sort_key]``.
        sort_key: Index into the trace inputs used as the sort key.
        export_config: Export options; a default ``ExportConfig()`` is used
            when ``None``.

    Returns:
        The traced (and possibly quantized / wrapped / lowered) module.
    """
    # TODO(T88310041) Remove These
    torch._C._jit_set_profiling_executor(True)
    torch._C._jit_set_profiling_mode(False)

    # unpack export config
    if export_config is None:
        export_config = ExportConfig()

    quantize = export_config.torchscript_quantize
    accelerate = export_config.accelerate
    seq_padding_control = export_config.seq_padding_control
    batch_padding_control = export_config.batch_padding_control

    # introduce a single nnpi:quantize that obviates need for torchscript quantize on NNPI
    use_nnpi = ("nnpi" in accelerate) or ("nnpi:quantize" in accelerate)
    use_nnpi_throughput_optimized = "nnpi:throughput_optimized" in accelerate
    use_nnpi_gelu_clip = "nnpi:gelu_clip" in accelerate
    use_cuda_half = "cuda:half" in accelerate
    use_cuda_half_faster_transformers = "cuda:half:ft" in accelerate

    use_nnpi_quantize = "nnpi:quantize" in accelerate
    use_nnpi_fx_static_quantize = "nnpi:fx_static_quantize" in accelerate
    use_nnpi_fx_static_selectively_quantize = (
        "nnpi:fx_static_selectively_quantize" in accelerate
    )
    use_nnpi_fx_dynamic_quantize = "nnpi:fx_dynamic_quantize" in accelerate
    use_cpu_fx_static_quantize = "cpu:fx_static_quantize" in accelerate
    use_cpu_fx_static_selectively_quantize = (
        "cpu:fx_static_selectively_quantize" in accelerate
    )
    use_cpu_fx_dynamic_quantize = "cpu:fx_dynamic_quantize" in accelerate
    use_fx_quantize = (
        use_nnpi_fx_static_quantize
        or use_nnpi_fx_static_selectively_quantize
        or use_nnpi_fx_dynamic_quantize
        or use_cpu_fx_static_quantize
        or use_cpu_fx_static_selectively_quantize
        or use_cpu_fx_dynamic_quantize
    )

    # what hosts can this model run on
    # by default, pytext works on CPU and CUDA (because it implements set_device)
    model_host = ["cpu", "cuda"]
    if use_cuda_half or use_cuda_half_faster_transformers:
        # CUDA FP16 models only work on CUDA
        model_host = ["cuda"]
    if (
        use_nnpi
        or use_nnpi_quantize
        or use_nnpi_gelu_clip
        or use_nnpi_throughput_optimized
    ):
        model_host = ["nnpi"]
    if hasattr(model, "set_host"):
        model.set_host(model_host)

    # what is the type of this model
    # pytext models are nlp models
    model_type = ["nlp"]

    instance_paths_p = any(
        True for _ in find_module_instances(model, RoBERTaEncoder, [])
    )
    if instance_paths_p:
        model_type.append("transformer")

    instance_paths_p = any(True for _ in find_module_instances(model, BiLSTM, []))
    if instance_paths_p:
        model_type.append("BiLSTM")

    # Guard on the method that is actually invoked (mirrors the set_host
    # guard above); the previous check looked for "set_model" instead.
    if hasattr(model, "set_type"):
        model.set_type(model_type)

    # Make sure to put the model on CPU and disable CUDA before exporting to
    # ONNX to disable any data_parallel pieces
    cuda.CUDA_ENABLED = False
    model.cpu()
    optimizer = self.trainer.optimizer
    optimizer.pre_export(model)

    if use_nnpi or use_fx_quantize:
        model = swap_modules(model, MODULE_TO_REWRITER["nnpi"])

    if "nnpi:split" in accelerate:
        model = split_model_for_accelerator(model)

    # Trace needs eval mode, to disable dropout etc
    model.eval()
    model.prepare_for_onnx_export_()

    unused_raw_batch, batch = next(
        iter(self.data.batches(Stage.TRAIN, load_early=True))
    )
    inputs = model.onnx_trace_input(batch)
    # call model forward to set correct device types
    if sort_input:
        _, sorted_indices = sort(inputs[sort_key], descending=True)
        inputs = [i.index_select(0, sorted_indices) for i in inputs]
    model(*inputs)

    # Default to dynamic
    if use_fx_quantize:
        data_loader = self.data.batches(Stage.TRAIN, load_early=False)
        trace = quantize_fx(
            model,
            inputs,
            data_loader,
            use_nnpi_fx_dynamic_quantize or use_cpu_fx_dynamic_quantize,
            use_nnpi_fx_static_selectively_quantize
            or use_cpu_fx_static_selectively_quantize,
        )
    elif (quantize or use_nnpi_quantize) and hasattr(model, "graph_mode_quantize"):
        data_loader = self.data.batches(Stage.TRAIN, load_early=False)
        print("Quantizing the model ...")
        # recognize legacy nnpi_q or $platform:$option syntax
        quantize_linear_only = use_nnpi_quantize or ("nnpi_quantize" in accelerate)
        module_swap = use_nnpi
        trace = quantize_statically(
            model, inputs, data_loader, quantize_linear_only, module_swap
        )
    else:
        if quantize:
            log_feature_usage("quantize.dynamically.CPU")
            model.quantize()
        if use_cuda_half_faster_transformers:
            log_accelerator_feature_usage("build.CUDA.half.faster_transformers")
            # We need a separate path for GPU-only tracing, as we can't just trace a CPU model
            # and invoke .cuda().half(),
            # as we don't have equivalent CPU implementations of these operators.
            precision.FP16_ENABLED = True
            cuda.CUDA_ENABLED = True

            model = swap_modules(model, MODULE_TO_REWRITER["cuda"])
            model.eval()
            model.half().cuda()
            # obtain new inputs with cuda/fp16 enabled.
            unused_raw_batch, batch = next(
                iter(self.data.batches(Stage.TRAIN, load_early=True))
            )
            inputs = model.onnx_trace_input(batch)
            trace = model.trace(inputs)
            print("traced (faster_transformers)!")
            # should be unnecessary.
            trace.cuda().half()
            unused_raw_batch, batch = next(
                iter(self.data.batches(Stage.TRAIN, load_early=True))
            )
            inputs = model.onnx_trace_input(batch)
            results = trace(*inputs)
            assert results
            print(results)
        else:
            trace = model.trace(inputs)
            print("traced!")
            if use_cuda_half:
                log_accelerator_feature_usage("build.CUDA.half")
                # convert trace to half precision
                trace.cuda().half()

                #### trace test: demonstrate that it is usable
                precision.FP16_ENABLED = True
                cuda.CUDA_ENABLED = True
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True))
                )
                inputs = model.onnx_trace_input(batch)
                assert trace(*inputs)
                #### end of trace test

    if hasattr(model, "torchscriptify"):
        trace = model.torchscriptify(self.data.tensorizers, trace)
    if hasattr(trace, "validate"):
        trace.validate(export_config)

    if seq_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("sequence_length", seq_padding_control)
        else:
            print(
                "Padding_control not supported by model. Ignoring seq_padding_control"
            )
    if batch_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("batch_length", batch_padding_control)
        else:
            print(
                "Padding_control not supported by model. Ignoring batch_padding_control"
            )

    trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)

    if use_nnpi:
        print("lowering using to_glow")
        trace = lower_modules_to_accelerator(
            model,
            trace,
            export_config,
            use_nnpi_throughput_optimized,
            use_nnpi_gelu_clip,
        )
    # NOTE(review): the model is split above when "nnpi:split" is requested,
    # but lowering is keyed on "split" — confirm which spelling is intended.
    if "split" in accelerate:
        print("lowering split model to glow")
        trace = lower_split_model_to_accelerator(model, trace, export_config)

    if export_path is not None:
        print(f"Saving torchscript model to: {export_path}")
        with PathManager.open(export_path, "wb") as f:
            torch.jit.save(trace, f)
    return trace
def get_test_sample():
    """Return the parsed JSON content of the file at ``RAW_TEST_PATH``."""
    with PathManager.open(RAW_TEST_PATH, "r") as sample_file:
        return json.load(sample_file)