def get_tensorizers(add_dict_feat=False, add_contextual_feat=False):
    """Build and initialize the tensorizers used by the seq2seq model tests.

    Args:
        add_dict_feat: also build and initialize a GazetteerTensorizer over
            the "dict_feat" column.
        add_contextual_feat: also build and initialize a ByteTokenTensorizer
            over the source sequence for contextual token embeddings.

    Returns:
        Dict mapping model-input names to initialized tensorizers.
    """
    schema = {
        "source_sequence": str,
        "dict_feat": Gazetteer,
        "target_sequence": str,
    }
    data_source = TSVDataSource.from_config(
        TSVDataSource.Config(
            train_filename=TEST_FILE_NAME,
            field_names=["source_sequence", "dict_feat", "target_sequence"],
        ),
        schema,
    )
    src_tensorizer = TokenTensorizer.from_config(
        TokenTensorizer.Config(
            column="source_sequence", add_eos_token=True, add_bos_token=True
        )
    )
    tgt_tensorizer = TokenTensorizer.from_config(
        TokenTensorizer.Config(
            column="target_sequence", add_eos_token=True, add_bos_token=True
        )
    )
    tensorizers = {"src_seq_tokens": src_tensorizer, "trg_seq_tokens": tgt_tensorizer}
    initialize_tensorizers(tensorizers, data_source.train)
    if add_dict_feat:
        tensorizers["dict_feat"] = GazetteerTensorizer.from_config(
            GazetteerTensorizer.Config(
                text_column="source_sequence", dict_column="dict_feat"
            )
        )
        initialize_tensorizers(
            {"dict_feat": tensorizers["dict_feat"]}, data_source.train
        )
    # FIX: add_contextual_feat was previously accepted but silently ignored,
    # so callers requesting contextual embeddings (the tokens/dictfeat/
    # contextual test passes add_contextual_feat=True) never got a tensorizer
    # for the model's contextual_token_embedding input.
    if add_contextual_feat:
        tensorizers["contextual_token_embedding"] = ByteTokenTensorizer.from_config(
            ByteTokenTensorizer.Config(column="source_sequence")
        )
        initialize_tensorizers(
            {"contextual_token_embedding": tensorizers["contextual_token_embedding"]},
            data_source.train,
        )
    return tensorizers
def test_gazetteer_tensor(self):
    """Numberize the single dict-feature row and check ids/weights/lengths."""
    data = TSVDataSource(
        train_file=SafeFileWrapper(
            tests_module.test_file("train_dict_features.tsv")
        ),
        test_file=None,
        eval_file=None,
        field_names=["text", "dict"],
        schema={"text": str, "dict": Gazetteer},
    )
    tensorizer = GazetteerTensorizer()
    initializer = tensorizer.initialize()
    initializer.send(None)  # prime the generator
    for train_row in data.train:
        initializer.send(train_row)
    initializer.close()
    # Vocabulary is UNK + PAD plus the 3 gazetteer labels.
    self.assertEqual(5, len(tensorizer.vocab))

    # The train file holds exactly one utterance:
    # "Order coffee from Starbucks please"
    for train_row in data.train:
        ids, label_weights, token_lens = tensorizer.numberize(train_row)
        self.assertEqual([1, 1, 2, 3, 1, 1, 4, 1, 1, 1], ids)
        self.assertEqual(
            [0.0, 0.0, 0.8, 0.2, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], label_weights
        )
        self.assertEqual([1, 2, 1, 1, 1], token_lens)
def test_tokens_dictfeat_contextual(self):
    """Torchscriptify a seq2seq model with dict + contextual features and run it."""
    # TODO (T65593688): this should be removed after
    # https://github.com/pytorch/pytorch/pull/33645 is merged.
    with torch.no_grad():
        model_config = Seq2SeqModel.Config(
            source_embedding=WordEmbedding.Config(embed_dim=512),
            target_embedding=WordEmbedding.Config(embed_dim=512),
            inputs=Seq2SeqModel.Config.ModelInput(
                dict_feat=GazetteerTensorizer.Config(
                    text_column="source_sequence"
                ),
                contextual_token_embedding=ByteTokenTensorizer.Config(),
            ),
            encoder_decoder=RNNModel.Config(
                encoder=LSTMSequenceEncoder.Config(embed_dim=619)
            ),
            dict_embedding=DictEmbedding.Config(),
            contextual_token_embedding=ContextualTokenEmbedding.Config(
                embed_dim=7
            ),
        )
        model = Seq2SeqModel.from_config(
            model_config,
            get_tensorizers(add_dict_feat=True, add_contextual_feat=True),
        )
        model.eval()
        scripted_model = model.torchscriptify()
        result = scripted_model(
            ["call", "mom"],
            (["call", "mom"], [0.42, 0.17], [4, 3]),
            [0.42] * (7 * 2),
        )
        assert result is not None
def test_gazetteer_tensor(self):
    """Numberize and tensorize both dict-feature rows end to end."""
    data = TSVDataSource(
        train_file=SafeFileWrapper(
            tests_module.test_file("train_dict_features.tsv")
        ),
        test_file=None,
        eval_file=None,
        field_names=["text", "dict"],
        schema={"text": str, "dict": Gazetteer},
    )
    tensorizer = GazetteerTensorizer()
    initializer = tensorizer.initialize()
    initializer.send(None)  # prime the generator
    for train_row in data.train:
        initializer.send(train_row)
    initializer.close()
    # Vocabulary is UNK + PAD plus the 5 gazetteer labels.
    self.assertEqual(7, len(tensorizer.vocab))

    # The train file holds exactly two utterances:
    #   "Order coffee from Starbucks please"
    #   "Order some fries from McDonalds please"
    expected_per_row = {
        0: (
            [1, 1, 2, 3, 1, 1, 4, 1, 1, 1],
            [0.0, 0.0, 0.8, 0.2, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
            [1, 2, 1, 1, 1],
        ),
        1: (
            [1, 1, 5, 1, 6, 1],
            [0.0, 0.0, 1.0, 0.0, 1.0, 0.0],
            [1, 1, 1, 1, 1, 1],
        ),
    }
    for row_index, train_row in enumerate(data.train):
        if row_index in expected_per_row:
            want_idx, want_weights, want_lens = expected_per_row[row_index]
            got_idx, got_weights, got_lens = tensorizer.numberize(train_row)
            self.assertEqual(want_idx, got_idx)
            self.assertEqual(want_weights, got_weights)
            self.assertEqual(want_lens, got_lens)

    feats, weights, lens = tensorizer.tensorize(
        tensorizer.numberize(train_row) for train_row in data.train
    )
    self.assertEqual(
        [
            [1, 1, 2, 3, 1, 1, 4, 1, 1, 1, 1, 1],
            [1, 1, 1, 1, 5, 1, 1, 1, 6, 1, 1, 1],
        ],
        feats.numpy().tolist(),
    )
    # Compare weights as strings after rounding to dodge float noise.
    rounded_weights = [
        [round(w, 2) for w in utt_weights] for utt_weights in weights.numpy()
    ]
    self.assertEqual(
        str(
            [
                [0.0, 0.0, 0.8, 0.2, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
            ]
        ),
        str(rounded_weights),
    )
    self.assertEqual(
        [[1, 2, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]], lens.numpy().tolist()
    )
def test_gazetteer_tensor_bad_json(self):
    """Malformed dict-feature JSON must raise during vocab initialization."""
    data = TSVDataSource(
        train_file=SafeFileWrapper(
            tests_module.test_file("train_dict_features_bad_json.tsv")
        ),
        test_file=None,
        eval_file=None,
        field_names=["text", "dict"],
        schema={"text": str, "dict": Gazetteer},
    )
    tensorizer = GazetteerTensorizer()
    initializer = tensorizer.initialize()
    initializer.send(None)  # prime the generator
    with self.assertRaises(Exception):
        for train_row in data.train:
            initializer.send(train_row)
    initializer.close()
def test_tokens_dictfeat(self):
    """Torchscriptify a seq2seq model with a dict feature and run it."""
    model_config = Seq2SeqModel.Config(
        source_embedding=WordEmbedding.Config(embed_dim=512),
        target_embedding=WordEmbedding.Config(embed_dim=512),
        inputs=Seq2SeqModel.Config.ModelInput(
            dict_feat=GazetteerTensorizer.Config(text_column="source_sequence")
        ),
        encoder_decoder=RNNModel.Config(
            encoder=LSTMSequenceEncoder.Config(embed_dim=612)
        ),
        dict_embedding=DictEmbedding.Config(),
    )
    model = Seq2SeqModel.from_config(
        model_config, get_tensorizers(add_dict_feat=True)
    )
    model.eval()
    scripted_model = model.torchscriptify()
    result = scripted_model(
        ["call", "mom"], (["call", "mom"], [0.42, 0.17], [4, 3])
    )
    assert result is not None