Example #1
def test_label_list_tensors_pad_missing(self):
    ds = SessionPandasDataSource(
        test_df=pd.DataFrame(
            # exercise both a None label and an empty-string label
            {
                "session_id": [1, 1, 1, 1],
                "label": ["positive", "negative", None, ""],
            }
        ),
        schema={"label": List[str]},
        id_col="session_id",
    )
    tensorizers = {
        "label": LabelListTensorizer(
            pad_missing=True,
            label_column="label",
            pad_in_vocab=False,
            allow_unknown=False,
        )
    }
    initialize_tensorizers(tensorizers, ds.test)
    # vocab holds only "positive" and "negative"; None and "" are skipped
    self.assertEqual(2, len(tensorizers["label"].vocab))
    # all four entries share session_id 1, so the source yields a single row
    label_idx_list, lens = tensorizers["label"].numberize(next(iter(ds.test)))
    self.assertEqual([0, 1, -1, -1], label_idx_list)

    tensorizers["label"].pad_missing = False
    with self.assertRaises(Exception):
        tensorizers["label"].numberize(next(iter(ds.test)))
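
The pad_missing flag controls what happens to the None and empty labels: when it is True they numberize to the padding index (-1 here, because pad_in_vocab=False keeps padding out of the vocabulary), and when it is False numberize raises on the same row.
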
Example #2
def get_tensorizers(add_dict_feat=False, add_contextual_feat=False):
    schema = {"source_sequence": str, "dict_feat": Gazetteer, "target_sequence": str}
    data_source = TSVDataSource.from_config(
        TSVDataSource.Config(
            train_filename=TEST_FILE_NAME,
            field_names=["source_sequence", "dict_feat", "target_sequence"],
        ),
        schema,
    )
    src_tensorizer = TokenTensorizer.from_config(
        TokenTensorizer.Config(
            column="source_sequence", add_eos_token=True, add_bos_token=True
        )
    )
    tgt_tensorizer = TokenTensorizer.from_config(
        TokenTensorizer.Config(
            column="target_sequence", add_eos_token=True, add_bos_token=True
        )
    )
    tensorizers = {"src_seq_tokens": src_tensorizer, "trg_seq_tokens": tgt_tensorizer}
    initialize_tensorizers(tensorizers, data_source.train)

    if add_dict_feat:
        tensorizers["dict_feat"] = GazetteerTensorizer.from_config(
            GazetteerTensorizer.Config(
                text_column="source_sequence", dict_column="dict_feat"
            )
        )
        initialize_tensorizers(
            {"dict_feat": tensorizers["dict_feat"]}, data_source.train
        )
    return tensorizers
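
A hypothetical usage sketch (not part of the original test): after get_tensorizers() has initialized the vocabularies, rows can be numberized one at a time and then batched, following the numberize/tensorize pattern from Examples 1 and 4. The data source is rebuilt here only because get_tensorizers() does not return it.

# Hypothetical usage sketch, reusing TEST_FILE_NAME and the schema from above.
tensorizers = get_tensorizers()
data_source = TSVDataSource.from_config(
    TSVDataSource.Config(
        train_filename=TEST_FILE_NAME,
        field_names=["source_sequence", "dict_feat", "target_sequence"],
    ),
    {"source_sequence": str, "dict_feat": Gazetteer, "target_sequence": str},
)
# numberize row by row, then pad the whole batch into tensors
numberized = [
    tensorizers["src_seq_tokens"].numberize(row) for row in data_source.train
]
batch = tensorizers["src_seq_tokens"].tensorize(numberized)
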
Example #3
def _init_tensorizer(self, tsv=False):
    tensorizer_dict = {
        "wordpiece": self.tensorizer_with_wordpiece,
        "alphanumeric": self.tensorizer_with_alphanumeric,
    }
    data_source = self.tsv_data_source.train if tsv else self.json_data_source.train
    initialize_tensorizers(tensorizer_dict, data_source)
Example #4
def test_create_label_list_tensors(self):
    tensorizers = {
        "intent": LabelListTensorizer(
            label_column="intent", pad_in_vocab=True, allow_unknown=True
        )
    }
    initialize_tensorizers(tensorizers, self.data.train)
    tensors = [tensorizers["intent"].numberize(row) for row in self.data.train]
    # test label idx
    self.assertEqual([2, 3], tensors[0][0])
    self.assertEqual([4, 5], tensors[1][0])
    self.assertEqual([6, 7, 8], tensors[2][0])
    # test seq lens
    self.assertEqual(2, tensors[0][1])
    self.assertEqual(2, tensors[1][1])
    self.assertEqual(3, tensors[2][1])
    self.assertEqual(3, len(tensors))
    tensors, lens = tensorizers["intent"].tensorize(tensors)
    np.testing.assert_array_almost_equal(
        np.array([[2, 3, 1], [4, 5, 1], [6, 7, 8]]), tensors.detach().numpy()
    )
    np.testing.assert_array_almost_equal(np.array([2, 2, 3]), lens.detach().numpy())
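
The expected batch shows the padding convention when pad_in_vocab=True: the special tokens take the low indices (unknown at 0, cf. Example 8, and padding at 1), so real labels start at index 2 and the two shorter rows are padded out with 1.
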
Example #5
def test_initialize_tensorizers(self):
    tensorizers = {
        "tokens": WordTensorizer(column="text"),
        "labels": LabelTensorizer(column="label"),
        "chars": CharacterTensorizer(column="text"),
    }
    initialize_tensorizers(tensorizers, self.data.train)
    self.assertEqual(49, len(tensorizers["tokens"].vocab))
    self.assertEqual(7, len(tensorizers["labels"].labels))
Example #6
def test_initialize_tensorizers(self):
    tensorizers = {
        "tokens": TokenTensorizer(text_column="text"),
        "labels": LabelTensorizer(label_column="label"),
        "chars": ByteTensorizer(text_column="text"),
    }
    initialize_tensorizers(tensorizers, self.data.train)
    self.assertEqual(49, len(tensorizers["tokens"].vocab))
    self.assertEqual(7, len(tensorizers["labels"].vocab))
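
Examples 5 and 6 are the same test written against two generations of the API: the older WordTensorizer/CharacterTensorizer take a column argument and LabelTensorizer exposes labels, while the newer TokenTensorizer/ByteTensorizer take text_column/label_column and expose a uniform vocab attribute.
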
Example #7
def test_initialize_list_tensorizers(self):
    tensorizers = {
        "intent": LabelListTensorizer(
            label_column="intent", pad_in_vocab=True, allow_unknown=True
        ),
        "goal": LabelListTensorizer(label_column="goal"),
    }
    initialize_tensorizers(tensorizers, self.data.train)
    self.assertEqual(9, len(tensorizers["intent"].vocab))
    self.assertEqual(7, len(tensorizers["goal"].vocab))
Example #8
def test_label_list_tensors_no_pad_in_vocab(self):
    tensorizers = {
        "intent": LabelListTensorizer(
            label_column="intent", pad_in_vocab=False, allow_unknown=True
        )
    }
    initialize_tensorizers(tensorizers, self.data.train)
    self.assertEqual(8, len(tensorizers["intent"].vocab))
    tensors = []
    for row in self.data.train:
        row["intent"].append("unknown")
        tensors.append(tensorizers["intent"].numberize(row))
    tensors, lens = tensorizers["intent"].tensorize(tensors)
    np.testing.assert_array_almost_equal(
        np.array([[1, 2, 0, -1], [3, 4, 0, -1], [5, 6, 7, 0]]),
        tensors.detach().numpy(),
    )
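
This is the opposite convention to Example 4: with pad_in_vocab=False the pad value is -1, outside the 8-entry vocabulary, and allow_unknown=True maps the appended, unseen "unknown" label to index 0.
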
Example #9
def _get_tensorizers(self):
    schema = {"source_sequence": str, "target_sequence": str}
    data_source = TSVDataSource.from_config(
        TSVDataSource.Config(
            train_filename=tests_module.test_file("compositional_seq2seq_unit.tsv"),
            field_names=["source_sequence", "target_sequence"],
        ),
        schema,
    )
    src_tensorizer = TokenTensorizer.from_config(
        TokenTensorizer.Config(
            column="source_sequence", add_eos_token=True, add_bos_token=True
        )
    )
    tgt_tensorizer = TokenTensorizer.from_config(
        TokenTensorizer.Config(
            column="target_sequence", add_eos_token=True, add_bos_token=True
        )
    )
    tensorizers = {"src_seq_tokens": src_tensorizer, "trg_seq_tokens": tgt_tensorizer}
    initialize_tensorizers(tensorizers, data_source.train)
    return tensorizers
Example #10
def _init_tensorizer(self):
    tensorizer_dict = {
        "wordpiece": self.tensorizer_with_wordpiece,
        "alphanumeric": self.tensorizer_with_alphanumeric,
    }
    initialize_tensorizers(tensorizer_dict, self.data_source.train)
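
Every example above funnels into initialize_tensorizers, which makes a single pass over a data source so that each tensorizer can build its vocabulary before any numberizing happens. A rough sketch of what such a helper might look like, with hypothetical count_row/build_vocab hooks (an illustration only, not PyText's actual implementation):

def initialize_tensorizers_sketch(tensorizers, rows):
    # Illustration only: let every tensorizer observe each row once...
    for row in rows:
        for tensorizer in tensorizers.values():
            tensorizer.count_row(row)  # hypothetical per-row hook
    # ...then finalize whatever state was accumulated (typically a vocab)
    for tensorizer in tensorizers.values():
        tensorizer.build_vocab()  # hypothetical finalization hook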