def test_initialize_word_tensorizer(self):
    tensorizer = TokenTensorizer(text_column="text")
    init = tensorizer.initialize()
    init.send(None)  # kick
    for row in self.data.train:
        init.send(row)
    init.close()
    self.assertEqual(49, len(tensorizer.vocab))
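# The tests below call self._initialize_tensorizer, a helper not shown in this
# section. A minimal sketch, assuming it simply factors out the send-loop used
# in test_initialize_word_tensorizer above (name and body inferred from that
# pattern, not the repo's verified implementation):
def _initialize_tensorizer(self, tensorizer):
    init = tensorizer.initialize()
    init.send(None)  # kick the generator to its first yield
    for row in self.data.train:
        init.send(row)
    init.close()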
def test_initialize_token_tensorizer(self):
    # default (build from data)
    tensorizer = TokenTensorizer(text_column="text")
    self._initialize_tensorizer(tensorizer)
    self.assertEqual(49, len(tensorizer.vocab))

    # size limit on tokens from data
    tensorizer = TokenTensorizer(
        text_column="text", vocab_config=VocabConfig(size_from_data=3)
    )
    self._initialize_tensorizer(tensorizer)
    self.assertEqual(5, len(tensorizer.vocab))  # 3 + unk token + pad token

    embed_file = tests_module.test_file("pretrained_embed_raw")

    # vocab from data + vocab_file
    tensorizer = TokenTensorizer(
        text_column="text",
        vocab_config=VocabConfig(
            size_from_data=3,
            vocab_files=[
                VocabFileConfig(filepath=embed_file, skip_header_line=True)
            ],
        ),
    )
    self._initialize_tensorizer(tensorizer)
    self.assertEqual(15, len(tensorizer.vocab))

    # vocab just from vocab_file
    tensorizer = TokenTensorizer(
        text_column="text",
        vocab_config=VocabConfig(
            build_from_data=False,
            vocab_files=[
                VocabFileConfig(
                    filepath=embed_file, skip_header_line=True, size_limit=5
                )
            ],
        ),
    )
    init = tensorizer.initialize()
    # Should skip initialization
    with self.assertRaises(StopIteration):
        init.send(None)
    self.assertEqual(7, len(tensorizer.vocab))  # 5 + unk token + pad token
def test_initialize_tensorizers(self):
    tensorizers = {
        "tokens": TokenTensorizer(text_column="text"),
        "labels": LabelTensorizer(label_column="label"),
        "chars": ByteTensorizer(text_column="text"),
    }
    initialize_tensorizers(tensorizers, self.data.train)
    self.assertEqual(49, len(tensorizers["tokens"].vocab))
    self.assertEqual(7, len(tensorizers["labels"].vocab))
def test_data_initializes_tensorizers(self):
    tensorizers = {
        "tokens": TokenTensorizer(text_column="text"),
        "labels": LabelTensorizer(label_column="label"),
    }
    # verify TokenTensorizer isn't in an initialized state yet
    assert tensorizers["tokens"].vocab is None
    Data(self.data_source, tensorizers)
    # Tensorizers should have been initialized
    self.assertEqual(49, len(tensorizers["tokens"].vocab))
    self.assertEqual(7, len(tensorizers["labels"].vocab))
def test_create_batches_different_tensorizers(self):
    tensorizers = {"tokens": TokenTensorizer(text_column="text")}
    data = Data(self.data_source, tensorizers, Batcher(train_batch_size=16))
    batches = list(data.batches(Stage.TRAIN))
    self.assertEqual(1, len(batches))
    raw_batch, batch = next(iter(batches))
    self.assertEqual({"tokens"}, set(batch))
    tokens, seq_lens, _ = batch["tokens"]
    self.assertEqual((10,), seq_lens.size())
    self.assertEqual(10, len(tokens))
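# The assertions above imply the tiny train set has 10 rows: with
# train_batch_size=16, all 10 examples fit in a single (partial) batch, so
# seq_lens has shape (10,). A minimal sketch of that arithmetic (plain Python,
# illustrative only; no assumptions about Batcher internals):
import math

num_rows, batch_size = 10, 16
assert math.ceil(num_rows / batch_size) == 1  # one partial batch of 10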
@classmethod
def from_config(cls, config: Config, **kwargs):
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    vocab = None
    if isinstance(tokenizer, WordPieceTokenizer):
        print("Using WordPieceTokenizer")
        # Map WordPiece special tokens onto PyText's canonical special tokens
        # so downstream components find the strings they expect.
        replacements = {
            "[UNK]": UNK,
            "[PAD]": PAD,
            "[CLS]": BOS,
            "[SEP]": EOS,
            "[MASK]": MASK,
        }
        vocab = Vocabulary(
            [token for token, _ in tokenizer.vocab.items()],
            replacements=replacements,
        )
    # Document and question tensorizers share the same tokenizer and vocab.
    doc_tensorizer = TokenTensorizer(
        text_column=config.doc_column,
        tokenizer=tokenizer,
        vocab=vocab,
        max_seq_len=config.max_doc_seq_len,
    )
    ques_tensorizer = TokenTensorizer(
        text_column=config.ques_column,
        tokenizer=tokenizer,
        vocab=vocab,
        max_seq_len=config.max_ques_seq_len,
    )
    return cls(
        doc_tensorizer=doc_tensorizer,
        ques_tensorizer=ques_tensorizer,
        doc_column=config.doc_column,
        ques_column=config.ques_column,
        answers_column=config.answers_column,
        answer_starts_column=config.answer_starts_column,
        tokenizer=tokenizer,
        vocab=vocab,
        **kwargs,
    )
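# The replacements mapping above renames WordPiece specials to PyText's
# canonical special tokens. A minimal sketch of the idea (illustrative only;
# assumes pytext.data.utils.Vocabulary renames matching entries in place,
# preserving their indices, when given `replacements`):
from pytext.data.utils import PAD, UNK, Vocabulary

wordpiece_tokens = ["[PAD]", "[UNK]", "the", "##ing"]
vocab = Vocabulary(wordpiece_tokens, replacements={"[PAD]": PAD, "[UNK]": UNK})
assert vocab.idx[PAD] == 0  # "[PAD]" renamed in place, index preserved
assert vocab.idx[UNK] == 1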
def test_numberize_with_token_tensorizer(self):
    tensorizer = TokenTensorizer(text_column="text")
    self._initialize_tensorizer(tensorizer)

    rows = [{"text": "I want some coffee"}, {"text": "Turn it up"}]
    tensors = (tensorizer.numberize(row) for row in rows)

    tokens, seq_len, token_ranges = next(tensors)
    self.assertEqual([24, 0, 0, 0], tokens)
    self.assertEqual(4, seq_len)
    self.assertEqual([(0, 1), (2, 6), (7, 11), (12, 18)], token_ranges)

    tokens, seq_len, token_ranges = next(tensors)
    self.assertEqual([13, 47, 9], tokens)
    self.assertEqual(3, seq_len)
    self.assertEqual([(0, 4), (5, 7), (8, 10)], token_ranges)
def test_create_word_tensors(self):
    tensorizer = TokenTensorizer(text_column="text")
    init = tensorizer.initialize()
    init.send(None)  # kick
    for row in self.data.train:
        init.send(row)
    init.close()

    rows = [{"text": "I want some coffee"}, {"text": "Turn it up"}]
    tensors = (tensorizer.numberize(row) for row in rows)

    tokens, seq_len = next(tensors)
    self.assertEqual([24, 0, 0, 0], tokens)
    self.assertEqual(4, seq_len)

    tokens, seq_len = next(tensors)
    self.assertEqual([13, 47, 9], tokens)
    self.assertEqual(3, seq_len)
def setUp(self):
    self.data_source = TSVDataSource(
        SafeFileWrapper(tests_module.test_file("train_dense_features_tiny.tsv")),
        SafeFileWrapper(tests_module.test_file("test_dense_features_tiny.tsv")),
        eval_file=None,
        field_names=["label", "slots", "text", "dense"],
        schema={"text": str, "label": str},
    )
    self.tensorizers = {
        "tokens": TokenTensorizer(text_column="text"),
        "labels": LabelTensorizer(label_column="label", allow_unknown=True),
    }