def __init__(
    self,
    vocab_path: Optional[str] = None,
    vocab_list: Optional[List[str]] = None,
    special_token_replacements=SPECIAL_TOKEN_REPLACEMENT,
):
    super().__init__()
    assert vocab_path or vocab_list, "vocab_path or vocab_list is required"
    assert not (
        vocab_path and vocab_list
    ), "vocab_path and vocab_list are mutually exclusive"
    if vocab_list:
        self.vocab = ScriptVocabulary(vocab_list)
    else:
        with PathManager.open(vocab_path) as f:
            vocab = build_fairseq_vocab(
                f, special_token_replacements=special_token_replacements
            )
            self.vocab = ScriptVocabulary(
                list(vocab),
                pad_idx=vocab.get_pad_index(-1),
                bos_idx=vocab.get_bos_index(-1),
                eos_idx=vocab.get_eos_index(-1),
                unk_idx=vocab.get_unk_index(-1),
                unk_token=vocab.unk_token,
            )
def __init__(self):
    super().__init__()
    self.vocab = ScriptVocabulary(input_vocab, unk_idx=input_vocab.idx[UNK])
    self.normalizer = tensorizers["dense"].normalizer
    self.model = traced_model
    self.output_layer = output_layer
    self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)
def __init__(
    self, vocab_path: Optional[str] = None, vocab_list: Optional[List[str]] = None
):
    super().__init__()
    assert vocab_path or vocab_list, "vocab_path or vocab_list is required"
    assert not (
        vocab_path and vocab_list
    ), "vocab_path and vocab_list are mutually exclusive"
    if vocab_list:
        self.vocab = ScriptVocabulary(vocab_list)
    else:
        with PathManager.open(vocab_path) as f:
            special_token_replacements = {
                "[UNK]": UNK,
                "[PAD]": PAD,
                "[CLS]": BOS,
                "[MASK]": MASK,
                "[SEP]": EOS,
            }
            vocab = build_fairseq_vocab(
                f, special_token_replacements=special_token_replacements
            )
            self.vocab = ScriptVocabulary(
                list(vocab),
                pad_idx=vocab.get_pad_index(-1),
                bos_idx=vocab.get_bos_index(-1),
                eos_idx=vocab.get_eos_index(-1),
                unk_idx=vocab.get_unk_index(-1),
            )
def __init__(
    self,
    vocab_path: Optional[str] = None,
    vocab_list: Optional[List[str]] = None,
    special_token_replacements=SPECIAL_TOKEN_REPLACEMENT,
    add_bos: bool = False,
    add_eos: bool = False,
    max_seq_len: int = 2**30,
):
    super().__init__()
    assert vocab_path or vocab_list, "vocab_path or vocab_list is required"
    assert not (
        vocab_path and vocab_list
    ), "vocab_path and vocab_list are mutually exclusive"
    if vocab_list:
        self.vocab = ScriptVocabulary(vocab_list)
    else:
        with PathManager.open(vocab_path) as f:
            vocab = build_fairseq_vocab(
                f, special_token_replacements=special_token_replacements
            )
            self.vocab = ScriptVocabulary(
                list(vocab),
                pad_idx=vocab.get_pad_index(-1),
                bos_idx=vocab.get_bos_index(-1),
                eos_idx=vocab.get_eos_index(-1),
                unk_idx=vocab.get_unk_index(-1),
                unk_token=vocab.unk_token,
            )
    # TODO T77728853: truncation needs to be combined with BOS/EOS handling
    # since they impact each other; this can't simply be chained.
    self.add_bos = add_bos
    self.add_eos = add_eos
    # Reserve room for BOS and EOS within max_seq_len when they are enabled
    self.truncate_transform = TruncateTransform(max_seq_len - add_bos - add_eos)
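A minimal sketch of the length budget above, using hypothetical values: Python treats bools as ints, so enabling BOS and EOS each reserves one slot out of max_seq_len before truncation.

# Hypothetical values illustrating the truncation budget arithmetic
max_seq_len = 8
add_bos, add_eos = True, True
token_budget = max_seq_len - add_bos - add_eos  # 8 - 1 - 1 = 6 slots left for real tokens
assert token_budget == 6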
def __init__(self):
    super().__init__()
    self.vocab = ScriptVocabulary(input_vocab, unk_idx=input_vocab.idx[UNK])
    self.model = traced_model
    self.output_layer = output_layer
    self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)
    self.max_seq_len = jit.Attribute(max_seq_len, int)
def __init__(self, vocab: Vocabulary):
    super().__init__()
    self.vocab = ScriptVocabulary(
        list(vocab),
        pad_idx=vocab.get_pad_index(-1),
        bos_idx=vocab.get_bos_index(-1),
        eos_idx=vocab.get_eos_index(-1),
        unk_idx=vocab.get_unk_index(-1),
    )
def __init__(self):
    super().__init__()
    self.vocab = ScriptVocabulary(input_vocab, unk_idx=input_vocab.idx[UNK])
    self.max_byte_len = jit.Attribute(max_byte_len, int)
    self.byte_offset_for_non_padding = jit.Attribute(
        byte_offset_for_non_padding, int
    )
    self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)
    self.model = traced_model
    self.output_layer = output_layer
def __init__(self):
    super().__init__()
    self.vocab = ScriptVocabulary(
        input_vocab,
        input_vocab.get_unk_index(),
        input_vocab.get_pad_index(),
    )
    self.model = traced_model
    self.output_layer = output_layer
    self.pad_idx = jit.Attribute(input_vocab.get_pad_index(), int)
    self.max_seq_len = jit.Attribute(max_seq_len, int)
def _run_benchmark_pytext_script_vocab(toks, v: PytextScriptVocabulary):
    # list lookup: one lookup per token list
    if isinstance(toks, list) and isinstance(toks[0], list):
        for tokens_list in toks:
            v.lookup_indices_1d(tokens_list)
    # single token lookup: one lookup per token
    elif isinstance(toks, list):
        for token in toks:
            v.lookup_indices_1d([token])
    else:
        raise RuntimeError("Received tokens of incorrect type {}.".format(type(toks)))
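A hypothetical invocation of the benchmark helper above, assuming PytextScriptVocabulary is pytext's ScriptVocabulary; both accepted input shapes are exercised.

from pytext.torchscript.vocab import ScriptVocabulary

# Hypothetical three-token vocab; not from the original benchmark
v = ScriptVocabulary(["the", "cat", "sat"])
_run_benchmark_pytext_script_vocab(["the", "cat", "sat"], v)      # flat list: one lookup per token
_run_benchmark_pytext_script_vocab([["the", "cat"], ["sat"]], v)  # nested lists: one lookup per list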
def __init__(self):
    super().__init__()
    self.vocab = ScriptVocabulary(
        input_vocab,
        input_vocab.get_unk_index(),
        input_vocab.get_pad_index(),
    )
    self.normalizer = tensorizers["dense"].normalizer
    self.model = traced_model
    self.output_layer = output_layer
    self.pad_idx = jit.Attribute(input_vocab.get_pad_index(), int)
    self.max_seq_len = jit.Attribute(max_seq_len, int)
    self.tokenizer = scripted_tokenizer
def __init__(
    self,
    pretrained_embeddings_path: str,
    embedding_dim: int,
    mlp_layer_dims: Optional[Sequence[int]] = None,
    lowercase_tokens: bool = False,
    skip_header: bool = True,
    delimiter: str = " ",
    vocab: ScriptVocabulary = None,
) -> None:
    super().__init__()
    vocab = vocab or build_vocab(pretrained_embeddings_path)
    pretrained_embedding = PretrainedEmbedding(
        pretrained_embeddings_path,
        lowercase_tokens=lowercase_tokens,
        skip_header=skip_header,
        delimiter=delimiter,
    )
    embeddings_weight = pretrained_embedding.initialize_embeddings_weights(
        vocab.idx,  # tensorizer.vocab.idx
        vocab.unk_token,  # tensorizer.vocab.unk_token
        embedding_dim,
        EmbedInitStrategy.RANDOM,
    )
    num_embeddings = len(vocab.idx)
    self.embedding = nn.Embedding(
        num_embeddings,
        embedding_dim,
        _weight=embeddings_weight,
        padding_idx=vocab.get_pad_index(),
    )
    # Initialize the unk embedding with zeros to guard the model
    # against randomized decisions based on unknown words
    unk_token_idx = vocab.get_unk_index()
    if unk_token_idx >= 0:
        self.embedding.weight.data[unk_token_idx].fill_(0.0)
    # Create MLP layers
    if mlp_layer_dims is None:
        mlp_layer_dims = []
    self.mlp = nn.Sequential(
        *(
            nn.Sequential(nn.Linear(m, n), nn.ReLU())
            for m, n in zip([embedding_dim] + list(mlp_layer_dims), mlp_layer_dims)
        )
    )
    self.output_dim = mlp_layer_dims[-1] if mlp_layer_dims else embedding_dim
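A small sketch with hypothetical dims showing how the zip above pairs input and output sizes: each (m, n) pair becomes Linear(m, n) followed by ReLU, chaining from embedding_dim through mlp_layer_dims.

# Hypothetical dims, not from the original module
embedding_dim = 300
mlp_layer_dims = [128, 64]
pairs = list(zip([embedding_dim] + mlp_layer_dims, mlp_layer_dims))
assert pairs == [(300, 128), (128, 64)]  # Linear(300, 128)+ReLU -> Linear(128, 64)+ReLU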
def __init__(self):
    super().__init__()
    self.vocab = ScriptVocabulary(
        input_vocab,
        input_vocab.get_unk_index(),
        input_vocab.get_pad_index(),
    )
    self.normalizer = tensorizers["dense"].normalizer
    self.max_seq_len = jit.Attribute(max_seq_len, int)
    self.max_byte_len = jit.Attribute(max_byte_len, int)
    self.byte_offset_for_non_padding = jit.Attribute(
        byte_offset_for_non_padding, int
    )
    self.pad_idx = jit.Attribute(input_vocab.get_pad_index(), int)
    self.model = traced_model
    self.output_layer = output_layer
class ModelWithDenseFeat(jit.ScriptModule):
    def __init__(self):
        super().__init__()
        self.vocab = ScriptVocabulary(input_vocab, unk_idx=input_vocab.idx[UNK])
        self.normalizer = tensorizers["dense"].normalizer
        self.max_byte_len = jit.Attribute(max_byte_len, int)
        self.byte_offset_for_non_padding = jit.Attribute(
            byte_offset_for_non_padding, int
        )
        self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)
        self.model = traced_model
        self.output_layer = output_layer

    @jit.script_method
    def forward(self, tokens: List[List[str]], dense_feat: List[List[float]]):
        seq_lens = make_sequence_lengths(tokens)
        word_ids = self.vocab.lookup_indices_2d(tokens)
        word_ids = pad_2d(word_ids, seq_lens, self.pad_idx)
        token_bytes, _ = make_byte_inputs(
            tokens, self.max_byte_len, self.byte_offset_for_non_padding
        )
        dense_feat = self.normalizer.normalize(dense_feat)
        logits = self.model(
            torch.tensor(word_ids),
            token_bytes,
            torch.tensor(seq_lens),
            torch.tensor(dense_feat, dtype=torch.float),
        )
        return self.output_layer(logits)
def __init__(
    self,
    add_bos_token: bool,
    add_eos_token: bool,
    use_eos_token_for_bos: bool,
    max_seq_len: int,
    vocab: Vocabulary,
    tokenizer: Optional[Tokenizer],
):
    super().__init__()
    if tokenizer is not None and hasattr(tokenizer, "torchscriptify"):
        try:
            self.tokenizer = tokenizer.torchscriptify()
        except NotImplementedError:
            # This is fine as long as the exported tokenizer is only used
            # in pre-tokenized mode
            self.tokenizer = None
    else:
        self.tokenizer = None
    self.do_nothing_tokenizer = ScriptDoNothingTokenizer()
    self.vocab = ScriptVocabulary(
        list(vocab),
        pad_idx=vocab.get_pad_index(),
        bos_idx=vocab.get_bos_index() if add_bos_token else -1,
        eos_idx=vocab.get_eos_index() if add_eos_token else -1,
    )
    self.vocab_lookup_1d = VocabLookup(self.vocab)
    self.add_bos_token = add_bos_token
    self.add_eos_token = add_eos_token
    self.use_eos_token_for_bos = use_eos_token_for_bos
    self.max_seq_len = max_seq_len
class VocabTransform(nn.Module):
    def __init__(
        self,
        vocab_path: Optional[str] = None,
        vocab_list: Optional[List[str]] = None,
        special_token_replacements=SPECIAL_TOKEN_REPLACEMENT,
    ):
        super().__init__()
        assert vocab_path or vocab_list, "vocab_path or vocab_list is required"
        assert not (
            vocab_path and vocab_list
        ), "vocab_path and vocab_list are mutually exclusive"
        if vocab_list:
            self.vocab = ScriptVocabulary(vocab_list)
        else:
            with PathManager.open(vocab_path) as f:
                vocab = build_fairseq_vocab(
                    f, special_token_replacements=special_token_replacements
                )
                self.vocab = ScriptVocabulary(
                    list(vocab),
                    pad_idx=vocab.get_pad_index(-1),
                    bos_idx=vocab.get_bos_index(-1),
                    eos_idx=vocab.get_eos_index(-1),
                    unk_idx=vocab.get_unk_index(-1),
                    unk_token=vocab.unk_token,
                )

    def forward(self, tokens: List[List[str]]) -> List[List[int]]:
        return self.vocab.lookup_indices_2d(tokens)
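A minimal, hypothetical use of VocabTransform with an in-memory vocab list; assuming ScriptVocabulary's default unknown index, tokens missing from the list fall back to index 0.

# Hypothetical vocab list; not from the original code
transform = VocabTransform(vocab_list=["<unk>", "hello", "world"])
ids = transform([["hello", "world"], ["hello", "moon"]])
# ids is a List[List[int]]; "moon" maps to the unk index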
class ModelWithDenseFeat(jit.ScriptModule):
    def __init__(self):
        super().__init__()
        self.vocab = ScriptVocabulary(input_vocab, unk_idx=input_vocab.idx[UNK])
        self.normalizer = tensorizers["dense"].normalizer
        self.model = traced_model
        self.output_layer = output_layer
        self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)

    @jit.script_method
    def forward(
        self,
        texts: Optional[List[str]] = None,
        tokens: Optional[List[List[str]]] = None,
        languages: Optional[List[str]] = None,
        dense_feat: Optional[List[List[float]]] = None,
    ):
        if tokens is None:
            raise RuntimeError("tokens is required")
        seq_lens = make_sequence_lengths(tokens)
        word_ids = self.vocab.lookup_indices_2d(tokens)
        word_ids = pad_2d(word_ids, seq_lens, self.pad_idx)
        if dense_feat is not None:
            dense_feat = self.normalizer.normalize(dense_feat)
        else:
            raise RuntimeError("dense_feat is required")
        logits = self.model(
            torch.tensor(word_ids),
            torch.tensor(seq_lens),
            torch.tensor(dense_feat, dtype=torch.float),
        )
        return self.output_layer(logits)
class Model(jit.ScriptModule):
    def __init__(self):
        super().__init__()
        self.vocab = ScriptVocabulary(input_vocab, unk_idx=input_vocab.idx[UNK])
        self.model = traced_model
        self.output_layer = output_layer
        self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)
        self.max_seq_len = jit.Attribute(max_seq_len, int)

    @jit.script_method
    def forward(
        self,
        texts: Optional[List[str]] = None,
        tokens: Optional[List[List[str]]] = None,
        languages: Optional[List[str]] = None,
    ):
        if tokens is None:
            raise RuntimeError("tokens is required")
        trimmed_tokens: List[List[str]] = []
        if self.max_seq_len >= 0:
            for token_list in tokens:
                trimmed_tokens.append(token_list[0 : self.max_seq_len])
        else:
            trimmed_tokens = tokens
        seq_lens = make_sequence_lengths(trimmed_tokens)
        word_ids = self.vocab.lookup_indices_2d(trimmed_tokens)
        word_ids = pad_2d(word_ids, seq_lens, self.pad_idx)
        logits = self.model(torch.tensor(word_ids), torch.tensor(seq_lens))
        return self.output_layer(logits)
def test_xlm_token_tensorizer(self):
    vocab = self._mock_vocab()
    xlm = ScriptXLMTensorizer(
        tokenizer=ScriptDoNothingTokenizer(),
        token_vocab=vocab,
        language_vocab=ScriptVocabulary(["ar", "cn", "en"]),
        max_seq_len=256,
        default_language="en",
    )
    rand_tokens = [
        [str(random.randint(100, 200)) for i in range(20)],
        [str(random.randint(100, 200)) for i in range(10)],
    ]

    tokens, pad_masks, languages, positions = xlm.tensorize(
        tokens=squeeze_2d(rand_tokens)
    )
    tokens = tokens.tolist()
    # eos token
    self.assertEqual(tokens[0][0], 202)
    self.assertEqual(tokens[0][-1], 202)
    # pad token
    self.assertEqual(tokens[1][12:], [200] * 10)

    languages = languages.tolist()
    self.assertEqual(languages[0], [2] * len(tokens[0]))
    self.assertEqual(languages[1][12:], [0] * 10)

    tokens, pad_masks, languages, positions = xlm.tensorize(
        tokens=squeeze_2d(rand_tokens), languages=squeeze_1d(["cn", "en"])
    )
    languages = languages.tolist()
    self.assertEqual(languages[0][:], [1] * len(tokens[0]))
    self.assertEqual(languages[1][:12], [2] * 12)
class Model(jit.ScriptModule):
    def __init__(self):
        super().__init__()
        self.vocab = ScriptVocabulary(input_vocab, unk_idx=input_vocab.idx[UNK])
        self.max_byte_len = jit.Attribute(max_byte_len, int)
        self.byte_offset_for_non_padding = jit.Attribute(
            byte_offset_for_non_padding, int
        )
        self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)
        self.model = traced_model
        self.output_layer = output_layer

    @jit.script_method
    def forward(
        self,
        texts: Optional[List[str]] = None,
        tokens: Optional[List[List[str]]] = None,
        languages: Optional[List[str]] = None,
    ):
        if tokens is None:
            raise RuntimeError("tokens is required")
        seq_lens = make_sequence_lengths(tokens)
        word_ids = self.vocab.lookup_indices_2d(tokens)
        word_ids = pad_2d(word_ids, seq_lens, self.pad_idx)
        token_bytes, _ = make_byte_inputs(
            tokens, self.max_byte_len, self.byte_offset_for_non_padding
        )
        logits = self.model(
            torch.tensor(word_ids), token_bytes, torch.tensor(seq_lens)
        )
        return self.output_layer(logits)
class Model(jit.ScriptModule):
    def __init__(self):
        super().__init__()
        self.vocab = ScriptVocabulary(
            input_vocab,
            input_vocab.get_unk_index(),
            input_vocab.get_pad_index(),
        )
        self.model = traced_model
        self.output_layer = output_layer
        self.pad_idx = jit.Attribute(input_vocab.get_pad_index(), int)
        self.max_seq_len = jit.Attribute(max_seq_len, int)

    @jit.script_method
    def forward(
        self,
        texts: Optional[List[str]] = None,
        multi_texts: Optional[List[List[str]]] = None,
        tokens: Optional[List[List[str]]] = None,
        languages: Optional[List[str]] = None,
    ):
        if tokens is None:
            raise RuntimeError("tokens is required")
        tokens = truncate_tokens(tokens, self.max_seq_len, self.vocab.pad_token)
        seq_lens = make_sequence_lengths(tokens)
        word_ids = self.vocab.lookup_indices_2d(tokens)
        word_ids = pad_2d(word_ids, seq_lens, self.pad_idx)
        logits = self.model(torch.tensor(word_ids), torch.tensor(seq_lens))
        return self.output_layer(logits)
def _mock_vocab(self):
    # tokens "0" through "9" map to indices 0-9
    return ScriptVocabulary(
        [str(i) for i in range(0, 10)],
        pad_idx=-1,
        bos_idx=0,
        unk_idx=-1,
    )
def _mock_xlm_tensorizer(self, max_seq_len=256):
    return ScriptXLMTensorizer(
        tokenizer=ScriptDoNothingTokenizer(),
        token_vocab=self._mock_vocab(),
        language_vocab=ScriptVocabulary(["ar", "cn", "en"]),
        max_seq_len=max_seq_len,
        default_language="en",
    )
class LabelTransform(nn.Module):
    def __init__(self, label_names: List[str]):
        super().__init__()
        self.label_vocab = ScriptVocabulary(sorted(label_names))

    def forward(self, labels: List[str]) -> List[int]:
        return self.label_vocab.lookup_indices_1d(labels)
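Hypothetical usage of LabelTransform: the label names are sorted before the vocab is built, so index assignment is deterministic regardless of the order the labels arrive in.

# Hypothetical label names; not from the original code
transform = LabelTransform(["positive", "negative", "neutral"])
transform(["negative", "positive"])  # -> [0, 2]; sorted order is negative, neutral, positive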
def torchscriptify(self):
    languages = [0] * (max(list(self.lang2id.values())) + 1)
    for k, v in self.lang2id.items():
        languages[v] = k
    return ScriptXLMTensorizer(
        tokenizer=self.tokenizer.torchscriptify(),
        token_vocab=ScriptVocabulary(
            list(self.vocab),
            pad_idx=self.vocab.get_pad_index(),
            # XLM reuses the EOS token in place of BOS
            bos_idx=self.vocab.get_eos_index(),
            eos_idx=self.vocab.get_eos_index(),
            unk_idx=self.vocab.get_unk_index(),
        ),
        language_vocab=ScriptVocabulary(languages),
        max_seq_len=self.max_seq_len,
        default_language=self.default_language,
    )
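A tiny sketch, with a hypothetical mapping, of the language-list construction above: the loop inverts lang2id into a dense index-to-language list suitable for ScriptVocabulary.

# Hypothetical lang2id mapping
lang2id = {"ar": 0, "cn": 1, "en": 2}
languages = [0] * (max(list(lang2id.values())) + 1)
for k, v in lang2id.items():
    languages[v] = k
assert languages == ["ar", "cn", "en"]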
def torchscriptify(self):
    return ScriptBERTTensorizer(
        tokenizer=self.tokenizer.torchscriptify(),
        vocab=ScriptVocabulary(
            list(self.vocab),
            pad_idx=self.vocab.get_pad_index(),
            bos_idx=self.vocab.get_bos_index(),
            eos_idx=self.vocab.get_eos_index(),
        ),
        max_seq_len=self.max_seq_len,
    )
def __init__(
    self,
    tokenizer: Tokenizer,
    vocab: Vocabulary,
    max_seq_len: int,
    language_vocab: List[str],
    default_language: str,
):
    super().__init__(tokenizer, vocab, max_seq_len)
    self.language_vocab = ScriptVocabulary(language_vocab)
    self.default_language = torch.jit.Attribute(default_language, str)
def build_pytext_vocab_pipeline(vocab_file):
    tokenizer = BasicEnglishNormalize()
    with open(vocab_file, 'r') as f:
        vocab_list = [line.rstrip() for line in f]

    # Insert token in vocab to match a pretrained vocab
    pipeline = TextDataPipeline(
        tokenizer, PyTextVocabTransform(ScriptVocabulary(vocab_list))
    )
    jit_pipeline = torch.jit.script(pipeline)
    print('jit PyText pipeline success!')
    return pipeline, jit_pipeline
def __init__(self, tokenizer: Tokenizer, vocab: Vocabulary, max_seq_len: int):
    super().__init__()
    self.tokenizer = tokenizer
    self.vocab = ScriptVocabulary(
        list(vocab),
        pad_idx=vocab.get_pad_index(),
        bos_idx=vocab.get_bos_index(-1),
        eos_idx=vocab.get_eos_index(-1),
        unk_idx=vocab.get_unk_index(),
    )
    self.vocab_lookup = VocabLookup(self.vocab)
    self.max_seq_len = max_seq_len
def build_pytext_vocab_pipeline(vocab_file):
    from pytext.torchscript.vocab import ScriptVocabulary

    tokenizer = basic_english_normalize()
    with open(vocab_file, 'r') as f:
        vocab_list = [line.rstrip() for line in f]

    # Insert token in vocab to match a pretrained vocab
    pipeline = TextSequentialTransforms(
        tokenizer,
        PyTextVocabTransform(ScriptVocabulary(vocab_list)),
        ToLongTensor(),
    )
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit PyText pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
def torchscriptify(self):
    return ScriptRoBERTaTensorizer(
        tokenizer=self.tokenizer.torchscriptify(),
        vocab=ScriptVocabulary(
            list(self.vocab),
            pad_idx=self.vocab.get_pad_index(),
            bos_idx=self.vocab.get_bos_index(),
            eos_idx=self.vocab.get_eos_index(),
        ),
        max_seq_len=self.max_seq_len,
        add_bos_token=True,
        use_eos_token_for_bos=False,
    )