def __init__(self, config: Config, output_encoded_layers: bool, **kwarg) -> None:
    super().__init__(config, output_encoded_layers=output_encoded_layers)
    # assert config.pretrained_encoder.load_path, "Load path cannot be empty."
    self.encoder = SentenceEncoder(
        transformer=Transformer(
            vocab_size=config.vocab_size,
            embedding_dim=config.embedding_dim,
            layers=[
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=MultiheadSelfAttention(
                        config.embedding_dim, config.num_attention_heads
                    ),
                )
                for _ in range(config.num_encoder_layers)
            ],
        )
    )
    self.apply(init_params)
    if config.model_path:
        with PathManager.open(config.model_path, "rb") as f:
            roberta_state = torch.load(
                f,
                map_location=lambda s, l: default_restore_location(s, "cpu"),
            )
        # If the model has previously been loaded in PyText and fine-tuned,
        # we don't need the special state-dict translation; load it directly.
        if not config.is_finetuned:
            self.encoder.load_roberta_state_dict(roberta_state["model"])
        else:
            self.load_state_dict(roberta_state)
    self.representation_dim = self._embedding().weight.size(-1)
    log_class_usage(__class__)
def __init__(
    self,
    vocab_size: int,
    embedding_dim: int,
    num_attention_heads: int,
    num_encoder_layers: int,
    output_dropout: float,
    model_path: Optional[str] = None,
):
    super().__init__()
    self.transformer = Transformer(
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        layers=[
            TransformerLayer(
                embedding_dim=embedding_dim,
                attention=MultiheadSelfAttention(
                    embedding_dim, num_attention_heads
                ),
            )
            for _ in range(num_encoder_layers)
        ],
    )
    self.output_dropout = nn.Dropout(output_dropout)
    self.apply(init_params)
    if model_path:
        with PathManager.open(model_path, "rb") as f:
            roberta_state = torch.load(
                f, map_location=lambda s, l: default_restore_location(s, "cpu")
            )
        if "model" in roberta_state:
            roberta_state = translate_roberta_state_dict(roberta_state["model"])
        self.load_state_dict(roberta_state)
def testLoweringTransformerToTracedNVFastTransformer(self):
    V = 1000
    transformer = Transformer(vocab_size=V).cuda().eval().half()
    faster_transformer = NVFasterTransformerEncoder(transformer)
    faster_transformer_jit = None
    for _ in range(10):
        B = np.random.randint(low=0, high=64)
        max_T = np.random.randint(low=0, high=64)
        lengths = np.random.randint(low=0, high=max_T + 1, size=(B,))
        tokens = torch.zeros(B, max_T).cuda().long()
        for b in range(B):
            length = lengths[b]
            tokens[b, :length] = (
                torch.randint(
                    transformer.padding_idx + 1, V - 1, size=(1, length)
                )
                .cuda()
                .long()
            )
            tokens[b, length:] = transformer.padding_idx
        if not faster_transformer_jit:
            faster_transformer_jit = torch.jit.trace(
                faster_transformer, (tokens,)
            )
        ref = transformer(tokens)
        fast = faster_transformer_jit(tokens)
        for rref, ffast in zip(ref, fast):
            for b in range(B):
                length = lengths[b]
                torch.testing.assert_allclose(
                    rref[:length, b], ffast[:length, b], atol=2e-2, rtol=2e-2
                )
def testLoweringBaseTransformerToNVFastTransformerPadded(self):
    V = 1000
    transformer = Transformer(vocab_size=V).cuda().eval().half()
    faster_transformer = NVFasterTransformerEncoder(transformer)
    for B in range(1, 32):
        for max_T in [0, 1, 2, 6, 40, 127]:
            lengths = np.random.randint(low=0, high=max_T + 1, size=(B,))
            tokens = torch.zeros(B, max_T).cuda().long()
            for b in range(B):
                length = lengths[b]
                tokens[b, :length] = (
                    torch.randint(
                        transformer.padding_idx + 1, V - 1, size=(1, length)
                    )
                    .cuda()
                    .long()
                )
                tokens[b, length:] = transformer.padding_idx
            ref = transformer(tokens)
            fast = faster_transformer(tokens)
            for rref, ffast in zip(ref, fast):
                for b in range(B):
                    length = lengths[b]
                    torch.testing.assert_allclose(
                        rref[:length, b], ffast[:length, b], atol=2e-2, rtol=2e-2
                    )
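# The padded-batch construction above is repeated in several of these tests.
# Below is a self-contained sketch of that pattern as a standalone helper; the
# helper name and the idea of factoring it out are mine, not the source's.
# Each row is filled with random non-padding token ids up to its sampled
# length and padded with padding_idx after that (the tests additionally move
# the result to CUDA).
import numpy as np
import torch


def _random_padded_tokens(batch_size, max_len, vocab_size, padding_idx):
    # Hypothetical helper mirroring the test setup: sample a length per row,
    # fill the prefix with random token ids in (padding_idx, vocab_size - 1),
    # and pad the remainder with padding_idx.
    lengths = np.random.randint(low=0, high=max_len + 1, size=(batch_size,))
    tokens = torch.full((batch_size, max_len), padding_idx, dtype=torch.long)
    for b in range(batch_size):
        length = int(lengths[b])
        tokens[b, :length] = torch.randint(
            padding_idx + 1, vocab_size - 1, size=(length,)
        )
    return tokens, lengths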
def __init__(self, config: Config, output_encoded_layers: bool, **kwarg) -> None:
    super().__init__(config, output_encoded_layers=output_encoded_layers)
    # map to the real model_path
    config.model_path = (
        resources.roberta.RESOURCE_MAP[config.model_path]
        if config.model_path in resources.roberta.RESOURCE_MAP
        else config.model_path
    )
    # assert config.pretrained_encoder.load_path, "Load path cannot be empty."
    # The compression projection is shared across layers: create it once
    # when using linear (Linformer) multihead attention.
    if config.use_linformer_encoder:
        compress_layer = nn.Linear(
            config.max_seq_len - 2,
            (config.max_seq_len - 2) // config.linformer_compressed_ratio,
        )
    self.encoder = SentenceEncoder(
        transformer=Transformer(
            vocab_size=config.vocab_size,
            embedding_dim=config.embedding_dim,
            layers=[
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=MultiheadLinearAttention(
                        embed_dim=config.embedding_dim,
                        num_heads=config.num_attention_heads,
                        compress_layer=compress_layer,
                    )
                    if config.use_linformer_encoder
                    else MultiheadSelfAttention(
                        embed_dim=config.embedding_dim,
                        num_heads=config.num_attention_heads,
                    ),
                )
                for _ in range(config.num_encoder_layers)
            ],
            max_seq_len=config.max_seq_len,
        )
    )
    self.apply(init_params)
    if config.model_path:
        with PathManager.open(config.model_path, "rb") as f:
            roberta_state = torch.load(
                f,
                map_location=lambda s, l: default_restore_location(s, "cpu"),
            )
        # If the model has previously been loaded in PyText and fine-tuned,
        # we don't need the special state-dict translation; load it directly.
        if not config.is_finetuned:
            self.encoder.load_roberta_state_dict(roberta_state["model"])
        else:
            self.load_state_dict(roberta_state)
    self.representation_dim = self._embedding().weight.size(-1)
    self.export_encoder = config.export_encoder
    self.variable_size_embedding = config.variable_size_embedding
    log_class_usage(__class__)
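# A small worked example of the compress-layer shape above, with assumed
# values (not taken from any config in the source): max_seq_len=256 and
# linformer_compressed_ratio=4 yield a shared projection from 254 positions
# (max_seq_len minus the two special tokens) down to 63.
import torch.nn as nn

example_max_seq_len = 256
example_ratio = 4
example_compress = nn.Linear(
    example_max_seq_len - 2, (example_max_seq_len - 2) // example_ratio
)  # Linear(in_features=254, out_features=63)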
def _small_encoder(self):
    layers = [
        TransformerLayer(
            embedding_dim=12,
            attention=MultiheadSelfAttention(
                embed_dim=12, num_heads=12, scaling=0.125
            ),
        )
        for _ in range(2)
    ]
    transformer = Transformer(vocab_size=100, embedding_dim=12, layers=layers)
    return SentenceEncoder(transformer)
def testLoweringBaseTransformerToNVFastTransformer(self):
    V = 1000
    transformer = Transformer(vocab_size=V).cuda().eval().half()
    faster_transformer = NVFasterTransformerEncoder(transformer)
    for B in range(1, 32):
        for T in [0, 1, 7, 8, 16]:
            tokens = (
                torch.randint(transformer.padding_idx + 1, V - 1, size=(B, T))
                .cuda()
                .long()
            )
            ref = transformer(tokens)
            fast = faster_transformer(tokens)
            for rref, ffast in zip(ref, fast):
                torch.testing.assert_allclose(rref, ffast, atol=2e-2, rtol=2e-2)
def __init__(self, config: Config, output_encoded_layers: bool, **kwarg) -> None:
    super().__init__(config, output_encoded_layers=output_encoded_layers)
    # assert config.pretrained_encoder.load_path, "Load path cannot be empty."
    self.encoder = SentenceEncoder(
        transformer=Transformer(
            embedding_dim=config.embedding_dim,
            layers=[
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=MultiheadSelfAttention(
                        config.embedding_dim, config.num_attention_heads
                    ),
                )
                for _ in range(config.num_encoder_layers)
            ],
        )
    )
    roberta_state = torch.load(
        config.model_path,
        map_location=lambda s, l: default_restore_location(s, "cpu"),
    )
    self.encoder.load_roberta_state_dict(roberta_state["model"])
    self.representation_dim = self.encoder.transformer.token_embedding.weight.size(-1)
def testLoweringLargeTransformerToNVFastTransformer(self):
    V = 1000
    L = 24
    D = 1024
    H = 16
    layers = [
        TransformerLayer(
            embedding_dim=D,
            attention=MultiheadSelfAttention(embed_dim=D, num_heads=H),
        )
        for _ in range(L)
    ]
    transformer = (
        Transformer(vocab_size=V, embedding_dim=D, layers=layers)
        .cuda()
        .eval()
        .half()
    )
    faster_transformer = NVFasterTransformerEncoder(transformer)
    for _ in range(10):
        B = np.random.randint(low=0, high=32)
        max_T = np.random.randint(low=0, high=32)
        lengths = np.random.randint(low=0, high=max_T + 1, size=(B,))
        tokens = torch.zeros(B, max_T).cuda().long()
        for b in range(B):
            length = lengths[b]
            tokens[b, :length] = (
                torch.randint(
                    transformer.padding_idx + 1, V - 1, size=(1, length)
                )
                .cuda()
                .long()
            )
            tokens[b, length:] = transformer.padding_idx
        ref = transformer(tokens)
        fast = faster_transformer(tokens)
        for rref, ffast in zip(ref, fast):
            for b in range(B):
                length = lengths[b]
                torch.testing.assert_allclose(
                    rref[:length, b], ffast[:length, b], atol=3e-2, rtol=2e-2
                )
def __init__(self, config: Config, output_encoded_layers: bool, **kwarg) -> None:
    super().__init__(config, output_encoded_layers=output_encoded_layers)
    # map to the real model_path
    config.model_path = (
        resources.roberta.RESOURCE_MAP[config.model_path]
        if config.model_path in resources.roberta.RESOURCE_MAP
        else config.model_path
    )
    # assert config.pretrained_encoder.load_path, "Load path cannot be empty."
    # The compression projection is shared across layers: create it once
    # when using linear (Linformer) multihead attention.
    if config.use_linformer_encoder:
        compress_layer = nn.Linear(
            config.max_seq_len - 2,
            (config.max_seq_len - 2) // config.linformer_compressed_ratio,
        )
    self.use_selfie_encoder = config.use_selfie_encoder
    if config.use_linformer_encoder:
        if config.linformer_quantize:
            layers = [
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=QuantizedMultiheadLinearAttention(
                        embed_dim=config.embedding_dim,
                        num_heads=config.num_attention_heads,
                        compress_layer=compress_layer,
                    ),
                )
                for _ in range(config.num_encoder_layers)
            ]
        else:
            layers = [
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=MultiheadLinearAttention(
                        embed_dim=config.embedding_dim,
                        num_heads=config.num_attention_heads,
                        compress_layer=compress_layer,
                    ),
                )
                for _ in range(config.num_encoder_layers)
            ]
    else:
        layers = [
            TransformerLayer(
                embedding_dim=config.embedding_dim,
                attention=MultiheadSelfAttention(
                    embed_dim=config.embedding_dim,
                    num_heads=config.num_attention_heads,
                ),
            )
            for _ in range(config.num_encoder_layers)
        ]
    self.encoder = (
        SentenceEncoder(
            transformer=Transformer(
                vocab_size=config.vocab_size,
                embedding_dim=config.embedding_dim,
                layers=layers,
                max_seq_len=config.max_seq_len,
            )
        )
        if not self.use_selfie_encoder
        else PostEncoder(
            transformer=SELFIETransformer(
                vocab_size=config.vocab_size,
                embedding_dim=config.embedding_dim,
                layers=layers,
                max_seq_len=config.max_seq_len,
            )
        )
    )
    self.apply(init_params)
    if config.model_path:
        with PathManager.open(config.model_path, "rb") as f:
            roberta_state = torch.load(
                f,
                map_location=lambda s, l: default_restore_location(s, "cpu"),
            )
        # If the model has previously been loaded in PyText and fine-tuned,
        # we don't need the special state-dict translation; load it directly.
        if not config.is_finetuned:
            self.encoder.load_roberta_state_dict(roberta_state["model"])
        else:
            self.load_state_dict(roberta_state)
    if config.use_bias_finetuning:
        for (n, p) in self.encoder.named_parameters():
            # "encoder.transformer.layers.0.attention.input_projection.weight" -> False
            # "encoder.transformer.layers.0.attention.input_projection.bias" -> True
            if n.split(".")[-1] != "bias":
                p.requires_grad_(False)
    self.export_encoder = config.export_encoder
    self.variable_size_embedding = config.variable_size_embedding
    self.use_linformer_encoder = config.use_linformer_encoder
    log_class_usage(__class__)
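# The use_bias_finetuning branch above freezes every encoder parameter whose
# name does not end in "bias" (a BitFit-style, bias-only fine-tuning setup).
# Below is a self-contained sketch of that filter as a reusable function; the
# function name and the stand-in module in the usage note are mine, not the
# source's.
import torch.nn as nn


def freeze_all_but_biases(module: nn.Module) -> None:
    # Leave *.bias parameters trainable, freeze everything else.
    for name, param in module.named_parameters():
        if name.split(".")[-1] != "bias":
            param.requires_grad_(False)


# Usage on a stand-in module:
#   mlp = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 2))
#   freeze_all_but_biases(mlp)
#   [n for n, p in mlp.named_parameters() if p.requires_grad]  # ["0.bias", "2.bias"]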
def _small_encoder(self):
    layers = [TransformerLayer(embedding_dim=12) for _ in range(2)]
    transformer = Transformer(vocab_size=100, embedding_dim=12, layers=layers)
    return SentenceEncoder(transformer)
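# A minimal end-to-end sketch using the same building blocks as the helpers
# above: build a small Transformer and run it on random token ids, following
# the call pattern in the tests (tokens shaped [batch, seq_len], returned
# states indexed as [seq_len, batch, dim]). The import path is an assumption
# about where these snippets come from (PyText's transformer package); adjust
# it to your layout, and likewise the small hyperparameter values are only
# illustrative.
import torch
from pytext.models.representations.transformer import (
    MultiheadSelfAttention,
    Transformer,
    TransformerLayer,
)

embedding_dim, num_heads, vocab_size = 12, 2, 100
layers = [
    TransformerLayer(
        embedding_dim=embedding_dim,
        attention=MultiheadSelfAttention(embedding_dim, num_heads),
    )
    for _ in range(2)
]
transformer = Transformer(
    vocab_size=vocab_size, embedding_dim=embedding_dim, layers=layers
).eval()

# Batch of 3 sequences of length 7, drawn from non-padding token ids.
tokens = torch.randint(transformer.padding_idx + 1, vocab_size - 1, size=(3, 7))
with torch.no_grad():
    states = transformer(tokens)  # list of encoder states, each roughly [7, 3, 12]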