def get_tensorizers(add_dict_feat=False, add_contextual_feat=False):
    schema = {"source_sequence": str, "dict_feat": Gazetteer, "target_sequence": str}
    data_source = TSVDataSource.from_config(
        TSVDataSource.Config(
            train_filename=TEST_FILE_NAME,
            field_names=["source_sequence", "dict_feat", "target_sequence"],
        ),
        schema,
    )
    src_tensorizer = TokenTensorizer.from_config(
        TokenTensorizer.Config(
            column="source_sequence", add_eos_token=True, add_bos_token=True
        )
    )
    tgt_tensorizer = TokenTensorizer.from_config(
        TokenTensorizer.Config(
            column="target_sequence", add_eos_token=True, add_bos_token=True
        )
    )
    tensorizers = {"src_seq_tokens": src_tensorizer, "trg_seq_tokens": tgt_tensorizer}
    initialize_tensorizers(tensorizers, data_source.train)

    if add_dict_feat:
        tensorizers["dict_feat"] = GazetteerTensorizer.from_config(
            GazetteerTensorizer.Config(
                text_column="source_sequence", dict_column="dict_feat"
            )
        )
        initialize_tensorizers(
            {"dict_feat": tensorizers["dict_feat"]}, data_source.train
        )
    return tensorizers

def test_read_data_source_with_utf8_issues(self):
    schema = {"text": str, "label": str}
    data_source = TSVDataSource.from_config(
        TSVDataSource.Config(
            train_filename=tests_module.test_file("test_utf8_errors.tsv"),
            field_names=["label", "text"],
        ),
        schema,
    )
    # Consuming the iterator should not raise despite the malformed UTF-8.
    list(data_source.train)

def test_load_saved_model(self):
    with tempfile.NamedTemporaryFile() as snapshot_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(
                data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    )
                )
            ),
            version=LATEST_VERSION,
            save_snapshot_path=snapshot_file.name,
        )
        task = create_task(config.task)
        model = task.model
        save(config, model, meta=None, tensorizers=task.data.tensorizers)
        task2, config2, training_state_none = load(snapshot_file.name)

        self.assertEqual(config.export, config2.export)
        self.assertEqual(config.export_list, config2.export_list)
        self.assertEqual(config.task, config2.task)
        self.assertEqual(config, config2)
        self.assertModulesEqual(model, task2.model)
        self.assertIsNone(training_state_none)

        model.eval()
        task2.model.eval()
        inputs = torch.LongTensor([[1, 2, 3]]), torch.LongTensor([3])
        self.assertEqual(model(*inputs).tolist(), task2.model(*inputs).tolist())

def test_seq_tensor_with_bos_eos_eol_bol(self):
    tensorizer = SeqTokenTensorizer(
        add_bos_token=True,
        add_eos_token=True,
        add_bol_token=True,
        add_eol_token=True,
    )
    data = TSVDataSource(
        train_file=SafeFileWrapper(tests_module.test_file("train_seq_features.tsv")),
        test_file=None,
        eval_file=None,
        field_names=["text_seq"],
        schema={"text_seq": List[str]},
    )
    self._initialize_tensorizer(tensorizer, data)
    # UNK + PAD + BOS + EOS + BOL + EOL + 6 tokens
    self.assertEqual(12, len(tensorizer.vocab))

    # only one row in the test file:
    # ["where do you wanna meet?", "MPK"]
    for row in data.train:
        idx, lens = tensorizer.numberize(row)
        self.assertEqual(4, lens)
        self.assertEqual(
            [
                [2, 4, 3, 1, 1, 1, 1],
                [2, 6, 7, 8, 9, 10, 3],
                [2, 11, 3, 1, 1, 1, 1],
                [2, 5, 3, 1, 1, 1, 1],
            ],
            idx,
        )

def test_seq_tensor(self):
    tensorizer = SeqTokenTensorizer()
    data = TSVDataSource(
        train_file=SafeFileWrapper(tests_module.test_file("train_seq_features.tsv")),
        test_file=None,
        eval_file=None,
        field_names=["text_seq"],
        schema={"text_seq": List[str]},
    )
    self._initialize_tensorizer(tensorizer, data)
    # UNK + PAD + 6 tokens
    self.assertEqual(8, len(tensorizer.vocab))

    # only one row in the test file:
    # ["where do you wanna meet?", "MPK"]
    for row in data.train:
        tokens, token_lens, seq_lens = tensorizer.prepare_input(row)
        idx, sentence_lens, lens = tensorizer.numberize(row)
        self.assertEqual(2, lens)
        self.assertEqual([[2, 3, 4, 5, 6], [7, 1, 1, 1, 1]], idx)
        self.assertEqual([5, 1], sentence_lens)
        self.assertEqual(2, seq_lens)
        self.assertEqual(
            [
                ["where", "do", "you", "wanna", "meet?"],
                ["mpk", "__PAD__", "__PAD__", "__PAD__", "__PAD__"],
            ],
            tokens,
        )

def test_seq_tensor(self):
    tensorizer = SeqTokenTensorizer()
    data = TSVDataSource(
        train_file=SafeFileWrapper(tests_module.test_file("train_seq_features.tsv")),
        test_file=None,
        eval_file=None,
        field_names=["text_seq"],
        schema={"text_seq": List[str]},
    )
    init = tensorizer.initialize()
    init.send(None)  # kick
    for row in data.train:
        init.send(row)
    init.close()

    # UNK + PAD + 6 tokens
    self.assertEqual(8, len(tensorizer.vocab))

    # only one row in the test file:
    # ["where do you wanna meet?", "MPK"]
    for row in data.train:
        idx, lens = tensorizer.numberize(row)
        self.assertEqual(2, lens)
        self.assertEqual([[2, 3, 4, 5, 6], [7, 1, 1, 1, 1]], idx)

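# Several tests in this section call self._initialize_tensorizer instead of
# spelling out the generator protocol inline as test_seq_tensor does above.
# A minimal sketch of such a helper (hypothetical; the actual test-case
# method may differ) that drives the tensorizer's initialize() coroutine
# over the training data:
def _initialize_tensorizer(self, tensorizer, data):
    init = tensorizer.initialize()
    init.send(None)  # prime the coroutine before sending rows
    for row in data.train:
        init.send(row)
    init.close()
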
def test_seq_tensor_pad_batch(self):
    tensorizer = SeqTokenTensorizer()
    data = TSVDataSource(
        train_file=SafeFileWrapper(tests_module.test_file("train_seq_features.tsv")),
        test_file=None,
        eval_file=None,
        field_names=["text_seq"],
        schema={"text_seq": List[str]},
    )
    self._initialize_tensorizer(tensorizer, data)

    token_idx_1 = [[2, 3], [2, 1]]
    token_count_1 = [2, 1]
    seq_len_1 = 2
    token_idx_2 = [[2, 3, 4]]
    token_count_2 = [3]
    seq_len_2 = 1
    token_idx_tensor, token_count_tensor, seq_len_tensor = tensorizer.tensorize(
        [
            (token_idx_1, token_count_1, seq_len_1),
            (token_idx_2, token_count_2, seq_len_2),
        ]
    )
    np.testing.assert_array_almost_equal(
        np.array([[[2, 3, 1], [2, 1, 1]], [[2, 3, 4], [1, 1, 1]]]),
        token_idx_tensor.detach().numpy(),
    )
    np.testing.assert_array_almost_equal(
        np.array([[2, 1], [3, 1]]), token_count_tensor.detach().numpy()
    )
    np.testing.assert_array_almost_equal(
        np.array([2, 1]), seq_len_tensor.detach().numpy()
    )

def _get_pytext_config(
    self,
    test_file_name: TestFileName,
    task_class: Type[NewTask],
    model_class: Type[Model],
) -> PyTextConfig:
    test_file_metadata = get_test_file_metadata(test_file_name)
    return PyTextConfig(
        task=task_class.Config(
            data=Data.Config(
                source=TSVDataSource.Config(
                    train_filename=test_file_metadata.filename,
                    eval_filename=test_file_metadata.filename,
                    test_filename=test_file_metadata.filename,
                    field_names=test_file_metadata.field_names,
                ),
                # Use the plain Batcher to avoid shuffling.
                batcher=Batcher.Config(),
            ),
            trainer=TaskTrainer.Config(epochs=1),
            model=model_class.Config(
                inputs=type(model_class.Config.inputs)(
                    dense=FloatListTensorizer.Config(
                        column=test_file_metadata.dense_col_name,
                        dim=test_file_metadata.dense_feat_dim,
                    )
                )
            ),
        ),
        use_tensorboard=False,
        use_cuda_if_available=False,
        version=LATEST_VERSION,
    )

def test_annotation_num(self):
    data = TSVDataSource(
        SafeFileWrapper(tests_module.test_file("compositional_seq2seq_unit.tsv")),
        test_file=None,
        eval_file=None,
        field_names=["text", "seqlogical"],
        schema={"text": str, "seqlogical": str},
    )
    nbrz = AnnotationNumberizer()
    init = nbrz.initialize()
    init.send(None)  # kick
    for row in data.train:
        init.send(row)
    init.close()

    # vocab = {'IN:GET_INFO_TRAFFIC': 0, 'SHIFT': 1, 'SL:LOCATION': 2,
    # 'REDUCE': 3, 'IN:GET_DIRECTIONS': 4, 'SL:DESTINATION': 5, 'SL:SOURCE': 6,
    # 'IN:GET_LOCATION_HOME': 7, 'SL:CONTACT': 8, 'SL:DATE_TIME_DEPARTURE': 9,
    # 'IN:UNSUPPORTED_NAVIGATION': 10, 'IN:GET_ESTIMATED_DURATION': 11,
    # 'IN:GET_LOCATION_WORK': 12, 'SL:PATH_AVOID': 13, 'IN:GET_DISTANCE': 14}
    self.assertEqual(15, len(nbrz.vocab))
    self.assertEqual(1, nbrz.shift_idx)
    self.assertEqual(3, nbrz.reduce_idx)
    self.assertEqual([10], nbrz.ignore_subNTs_roots)
    self.assertEqual(
        [0, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], nbrz.valid_NT_idxs
    )
    self.assertEqual([0, 4, 7, 10, 11, 12, 14], nbrz.valid_IN_idxs)
    self.assertEqual([2, 5, 6, 8, 9, 13], nbrz.valid_SL_idxs)

    for row, expected in zip(data.train, EXPECTED_ACTIONS):
        actions = nbrz.numberize(row)
        self.assertEqual(expected, actions)

def test_gazetteer_tensor(self):
    tensorizer = GazetteerTensorizer()
    data = TSVDataSource(
        train_file=SafeFileWrapper(tests_module.test_file("train_dict_features.tsv")),
        test_file=None,
        eval_file=None,
        field_names=["text", "dict"],
        schema={"text": str, "dict": Gazetteer},
    )
    init = tensorizer.initialize()
    init.send(None)  # kick
    for row in data.train:
        init.send(row)
    init.close()

    # UNK + PAD + 3 labels
    self.assertEqual(5, len(tensorizer.vocab))

    # only one row in the test file:
    # "Order coffee from Starbucks please"
    for row in data.train:
        idx, weights, lens = tensorizer.numberize(row)
        # idx/weights are flattened to 5 tokens x 2 slots (the max number
        # of dict matches for any token); lens holds the matches per token.
        self.assertEqual([1, 1, 2, 3, 1, 1, 4, 1, 1, 1], idx)
        self.assertEqual(
            [0.0, 0.0, 0.8, 0.2, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], weights
        )
        self.assertEqual([1, 2, 1, 1, 1], lens)

def test_gazetteer_tensor(self):
    tensorizer = GazetteerTensorizer()
    data = TSVDataSource(
        train_file=SafeFileWrapper(tests_module.test_file("train_dict_features.tsv")),
        test_file=None,
        eval_file=None,
        field_names=["text", "dict"],
        schema={"text": str, "dict": Gazetteer},
    )
    init = tensorizer.initialize()
    init.send(None)  # kick
    for row in data.train:
        init.send(row)
    init.close()

    # UNK + PAD + 5 labels
    self.assertEqual(7, len(tensorizer.vocab))

    # only two rows in the test file:
    # "Order coffee from Starbucks please"
    # "Order some fries from McDonalds please"
    for i, row in enumerate(data.train):
        if i == 0:
            idx, weights, lens = tensorizer.numberize(row)
            self.assertEqual([1, 1, 2, 3, 1, 1, 4, 1, 1, 1], idx)
            self.assertEqual(
                [0.0, 0.0, 0.8, 0.2, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], weights
            )
            self.assertEqual([1, 2, 1, 1, 1], lens)
        if i == 1:
            idx, weights, lens = tensorizer.numberize(row)
            self.assertEqual([1, 1, 5, 1, 6, 1], idx)
            self.assertEqual([0.0, 0.0, 1.0, 0.0, 1.0, 0.0], weights)
            self.assertEqual([1, 1, 1, 1, 1, 1], lens)

    feats, weights, lens = tensorizer.tensorize(
        tensorizer.numberize(row) for row in data.train
    )
    self.assertEqual(
        [
            [1, 1, 2, 3, 1, 1, 4, 1, 1, 1, 1, 1],
            [1, 1, 1, 1, 5, 1, 1, 1, 6, 1, 1, 1],
        ],
        feats.numpy().tolist(),
    )
    self.assertEqual(
        str(
            [
                [0.0, 0.0, 0.8, 0.2, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
            ]
        ),
        str([[round(w, 2) for w in utt_weights] for utt_weights in weights.numpy()]),
    )
    self.assertEqual(
        [[1, 2, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]], lens.numpy().tolist()
    )

def setUp(self):
    self.data = TSVDataSource(
        SafeFileWrapper(tests_module.test_file("train_dense_features_tiny.tsv")),
        SafeFileWrapper(tests_module.test_file("test_dense_features_tiny.tsv")),
        eval_file=None,
        field_names=["label", "slots", "text", "dense"],
        schema={"text": types.Text, "label": types.Label},
    )

def _get_data_source(test_path, field_names, task):
    if test_path:
        data_source = TSVDataSource(
            test_file=SafeFileWrapper(test_path),
            schema=task.data.data_source.schema,
            field_names=field_names,
        )
    else:
        data_source = task.data.data_source
    return data_source

def test_reset_incremental_states(self):
    """
    This test might seem trivial. However, interacting with the scripted
    sequence generator crosses the Torchscript boundary, which can lead to
    weird behavior. If the incremental states don't get properly reset, the
    model will produce garbage _after_ the first call, which is a pain to
    debug when you only catch it after training.
    """
    tensorizers = get_tensorizers()

    # Avoid numeric issues with quantization by setting a known seed.
    torch.manual_seed(42)
    model = Seq2SeqModel.from_config(
        Seq2SeqModel.Config(
            source_embedding=WordEmbedding.Config(embed_dim=512),
            target_embedding=WordEmbedding.Config(embed_dim=512),
        ),
        tensorizers,
    )

    # Get sample inputs using a data source.
    schema = {
        "source_sequence": str,
        "dict_feat": Gazetteer,
        "target_sequence": str,
    }
    data = Data.from_config(
        Data.Config(
            source=TSVDataSource.Config(
                train_filename=TEST_FILE_NAME,
                field_names=["source_sequence", "dict_feat", "target_sequence"],
            )
        ),
        schema,
        tensorizers,
    )
    data.batcher = Batcher(1, 1, 1)
    raw_batch, batch = next(iter(data.batches(Stage.TRAIN, load_early=True)))
    inputs = model.arrange_model_inputs(batch)

    model.eval()
    outputs = model(*inputs)
    pred, scores = model.get_pred(outputs, {"stage": Stage.TEST})

    # Verify that the incremental states reset correctly.
    decoder = model.sequence_generator.beam_search.decoder_ens
    decoder.reset_incremental_states()
    self.assertDictEqual(decoder.incremental_states, {"0": {}})

    # Verify that the model returns the same predictions.
    new_pred, new_scores = model.get_pred(outputs, {"stage": Stage.TEST})
    self.assertEqual(new_scores, scores)

def setUp(self):
    self.data_source = TSVDataSource(
        SafeFileWrapper(tests_module.test_file("train_dense_features_tiny.tsv")),
        SafeFileWrapper(tests_module.test_file("test_dense_features_tiny.tsv")),
        eval_file=None,
        field_names=["label", "slots", "text", "dense"],
        schema={"text": types.Text, "label": types.Label},
    )
    self.tensorizers = {
        "tokens": WordTensorizer(column="text"),
        "labels": LabelTensorizer(column="label", allow_unknown=True),
    }

def test_csv(self):
    data_source = TSVDataSource(
        SafeFileWrapper(tests_module.test_file("test_data_tiny_csv.tsv")),
        test_file=None,
        eval_file=None,
        field_names=["label", "slots", "text"],
        delimiter=",",
        schema={"text": str, "label": str},
        quoted=True,
    )
    for row in data_source.train:
        self.assertEqual("alarm/set_alarm", row["label"])
        self.assertTrue(row["text"].startswith("this is the text"))

def _get_tensorizers(self):
    schema = {"source_sequence": str, "target_sequence": str}
    data_source = TSVDataSource.from_config(
        TSVDataSource.Config(
            train_filename=tests_module.test_file("compositional_seq2seq_unit.tsv"),
            field_names=["source_sequence", "target_sequence"],
        ),
        schema,
    )
    src_tensorizer = TokenTensorizer.from_config(
        TokenTensorizer.Config(
            column="source_sequence", add_eos_token=True, add_bos_token=True
        )
    )
    tgt_tensorizer = TokenTensorizer.from_config(
        TokenTensorizer.Config(
            column="target_sequence", add_eos_token=True, add_bos_token=True
        )
    )
    tensorizers = {
        "src_seq_tokens": src_tensorizer,
        "trg_seq_tokens": tgt_tensorizer,
    }
    initialize_tensorizers(tensorizers, data_source.train)
    return tensorizers

def test_read_data_source_with_column_remapping(self):
    data_source = TSVDataSource(
        SafeFileWrapper(tests_module.test_file("train_dense_features_tiny.tsv")),
        SafeFileWrapper(tests_module.test_file("test_dense_features_tiny.tsv")),
        eval_file=None,
        field_names=["remapped_label", "slots", "remapped_text", "dense"],
        column_mapping={"remapped_label": "label", "remapped_text": "text"},
        schema={"text": str, "label": str},
    )
    data = list(data_source.train)
    self.assertEqual(10, len(data))
    example = next(iter(data))
    self.assertEqual(2, len(example))
    self.assertEqual({"label", "text"}, set(example))

def test_quoting(self):
    """
    The text column of the first row of this file opens a quote but
    does not close it.
    """
    data_source = TSVDataSource(
        SafeFileWrapper(tests_module.test_file("test_tsv_quoting.tsv")),
        SafeFileWrapper(tests_module.test_file("test_tsv_quoting.tsv")),
        eval_file=None,
        field_names=["label", "text"],
        schema={"text": str, "label": str},
    )
    data = list(data_source.train)
    self.assertEqual(4, len(data))

def test_load_checkpoint(self):
    with tempfile.NamedTemporaryFile() as checkpoint_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(
                data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    )
                )
            ),
            version=LATEST_VERSION,
            save_snapshot_path=checkpoint_file.name,
        )
        task = create_task(config.task)
        model = task.model

        # Test checkpoint saving and loading.
        optimizer = create_optimizer(Adam.Config(), model)
        scheduler = create_scheduler(Scheduler.Config(), optimizer)
        training_state = TrainingState(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            start_time=0,
            epoch=0,
            rank=0,
            stage=Stage.TRAIN,
            epochs_since_last_improvement=0,
            best_model_state=None,
            best_model_metric=None,
            tensorizers=task.data.tensorizers,
        )
        checkpoint_id = "epoch-1"
        saved_path = save(
            config, model, None, task.data.tensorizers, training_state, checkpoint_id
        )
        # TODO: fix get_latest_checkpoint_path T53664139
        # self.assertEqual(saved_path, get_latest_checkpoint_path())
        task_restored, config_restored, training_state_restored = load(saved_path)
        self.assertCheckpointEqual(
            model,
            config,
            training_state,
            task_restored.model,
            config_restored,
            training_state_restored,
        )

def setUp(self):
    self.data = TSVDataSource(
        SafeFileWrapper(tests_module.test_file("compositional_seq2seq_unit.tsv")),
        test_file=None,
        eval_file=None,
        field_names=["text", "seqlogical"],
        schema={"text": str, "seqlogical": str},
    )
    self.masked_tensorizer = MaskedTokenTensorizer.from_config(
        MaskedTokenTensorizer.Config(
            column="seqlogical", masking_function=TreeMask.Config()
        )
    )
    self._initialize_tensorizer(self.masked_tensorizer)

def test_gazetteer_tensor_bad_json(self):
    tensorizer = GazetteerTensorizer()
    data = TSVDataSource(
        train_file=SafeFileWrapper(
            tests_module.test_file("train_dict_features_bad_json.tsv")
        ),
        test_file=None,
        eval_file=None,
        field_names=["text", "dict"],
        schema={"text": str, "dict": Gazetteer},
    )
    init = tensorizer.initialize()
    init.send(None)  # kick
    # Initializing over malformed JSON in the dict column should raise.
    with self.assertRaises(Exception):
        for row in data.train:
            init.send(row)
        init.close()

def test_force_predictions_on_eval(self):
    tensorizers = get_tensorizers()
    model = Seq2SeqModel.from_config(
        Seq2SeqModel.Config(
            source_embedding=WordEmbedding.Config(embed_dim=512),
            target_embedding=WordEmbedding.Config(embed_dim=512),
        ),
        tensorizers,
    )

    # Get sample inputs using a data source.
    schema = {
        "source_sequence": str,
        "dict_feat": Gazetteer,
        "target_sequence": str,
    }
    data = Data.from_config(
        Data.Config(
            source=TSVDataSource.Config(
                train_filename=TEST_FILE_NAME,
                field_names=["source_sequence", "dict_feat", "target_sequence"],
            )
        ),
        schema,
        tensorizers,
    )
    data.batcher = Batcher(1, 1, 1)
    raw_batch, batch = next(iter(data.batches(Stage.TRAIN, load_early=True)))
    inputs = model.arrange_model_inputs(batch)

    # Verify that the model does not run sequence generation on prediction.
    outputs = model(*inputs)
    pred = model.get_pred(outputs, {"stage": Stage.EVAL})
    self.assertEqual(pred, (None, None))

    # Verify that attempting to set force_eval_predictions is correctly
    # accounted for.
    model.force_eval_predictions = True
    with self.assertRaises(AssertionError):
        _ = model.get_pred(outputs, {"stage": Stage.EVAL})

def test_seq_tensor_max_turn(self):
    tensorizer = SeqTokenTensorizer(max_turn=1)
    data = TSVDataSource(
        train_file=SafeFileWrapper(tests_module.test_file("train_seq_features.tsv")),
        test_file=None,
        eval_file=None,
        field_names=["text_seq"],
        schema={"text_seq": List[str]},
    )
    self._initialize_tensorizer(tensorizer, data)

    # only one row in the test file:
    # ["where do you wanna meet?", "MPK"]
    for row in data.train:
        idx, sentence_lens, seq_len = tensorizer.numberize(row)
        # max_turn=1 caps the sequence at its first sentence.
        self.assertEqual(1, seq_len)
        self.assertEqual([[2, 3, 4, 5, 6]], idx)
        self.assertEqual([5], sentence_lens)

def test_create_normalized_float_list_tensor(self):
    def round_list(l):
        return [float("%.4f" % n) for n in l]

    data = TSVDataSource(
        SafeFileWrapper(tests_module.test_file("train_dense_features_tiny.tsv")),
        eval_file=None,
        field_names=["label", "slots", "text", "dense_feat"],
        schema={"text": str, "label": str, "dense_feat": List[float]},
    )
    tensorizer = FloatListTensorizer(
        column="dense_feat", dim=10, error_check=True, normalize=True
    )
    self._initialize_tensorizer(tensorizer, data)

    self.assertEqual(10, tensorizer.normalizer.num_rows)
    self.assertEqual(
        round_list(
            [7.56409, 8.2388, 0.5531, 0.2403, 1.03130,
             6.2888, 3.1595, 0.1538, 0.2403, 5.3463]
        ),
        round_list(tensorizer.normalizer.feature_sums),
    )
    self.assertEqual(
        round_list(
            [5.80172, 7.57586, 0.30591, 0.05774, 0.52762,
             5.22811, 2.51727, 0.02365, 0.05774, 4.48798]
        ),
        round_list(tensorizer.normalizer.feature_squared_sums),
    )
    self.assertEqual(
        round_list(
            [0.75640, 0.82388, 0.05531, 0.02403, 0.10313,
             0.62888, 0.31595, 0.01538, 0.02403, 0.53463]
        ),
        round_list(tensorizer.normalizer.feature_avgs),
    )
    self.assertEqual(
        round_list(
            [0.08953, 0.28072, 0.16593, 0.07209, 0.20524,
             0.35682, 0.38974, 0.04614, 0.07209, 0.40369]
        ),
        round_list(tensorizer.normalizer.feature_stddevs),
    )

    row = [0.64840776, 0.7575, 0.5531, 0.2403, 0, 0.9481, 0, 0.1538, 0.2403, 0.3564]
    output = tensorizer.numberize({"dense_feat": row})
    self.assertEqual(
        round_list(
            [-1.20619, -0.23646, 2.99999, 3.0, -0.50246,
             0.89462, -0.81066, 2.99999, 3.0, -0.44149]
        ),
        round_list(output),
    )

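# The expected outputs above are consistent with a plain z-score,
# (x - mean) / stddev, with mean = sum / num_rows and
# stddev = sqrt(squared_sum / num_rows - mean ** 2). A self-contained sketch
# of that arithmetic (an assumption inferred from the numbers in the test,
# not the normalizer's actual implementation):
import math

def z_score(x, feature_sum, feature_squared_sum, num_rows):
    mean = feature_sum / num_rows
    stddev = math.sqrt(feature_squared_sum / num_rows - mean ** 2)
    return (x - mean) / stddev

# e.g. z_score(0.64840776, 7.56409, 5.80172, 10) ≈ -1.206, matching the
# first expected value in the final assertion.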