def test_label_list_tensors_pad_missing(self):
    """With pad_missing=True, missing labels (None / "") numberize to -1;
    with pad_missing=False the same row raises."""
    # test None and empty case
    frame = pd.DataFrame(
        {
            "session_id": [1, 1, 1, 1],
            "label": ["positive", "negative", None, ""],
        }
    )
    ds = SessionPandasDataSource(
        test_df=frame,
        schema={"label": List[str]},
        id_col="session_id",
    )
    label_tensorizer = LabelListTensorizer(
        pad_missing=True,
        label_column="label",
        pad_in_vocab=False,
        allow_unknown=False,
    )
    initialize_tensorizers({"label": label_tensorizer}, ds.test)
    # only the two real labels make it into the vocab
    self.assertEqual(2, len(label_tensorizer.vocab))
    # only one row in test data
    label_idx_list, lens = label_tensorizer.numberize(next(iter(ds.test)))
    self.assertEqual([0, 1, -1, -1], label_idx_list)
    # without pad_missing, missing labels must be rejected
    label_tensorizer.pad_missing = False
    with self.assertRaises(Exception):
        label_tensorizer.numberize(next(iter(ds.test)))
def get_tensorizers(add_dict_feat=False, add_contextual_feat=False):
    """Build initialized source/target token tensorizers for the test TSV,
    optionally adding a gazetteer (dict feature) tensorizer.

    NOTE(review): add_contextual_feat is accepted but not used here —
    kept for caller compatibility.
    """
    schema = {"source_sequence": str, "dict_feat": Gazetteer, "target_sequence": str}
    data_source = TSVDataSource.from_config(
        TSVDataSource.Config(
            train_filename=TEST_FILE_NAME,
            field_names=["source_sequence", "dict_feat", "target_sequence"],
        ),
        schema,
    )

    def _token_tensorizer(column):
        # shared config: BOS/EOS markers wrap every sequence
        return TokenTensorizer.from_config(
            TokenTensorizer.Config(
                column=column, add_eos_token=True, add_bos_token=True
            )
        )

    tensorizers = {
        "src_seq_tokens": _token_tensorizer("source_sequence"),
        "trg_seq_tokens": _token_tensorizer("target_sequence"),
    }
    initialize_tensorizers(tensorizers, data_source.train)

    if add_dict_feat:
        gazetteer = GazetteerTensorizer.from_config(
            GazetteerTensorizer.Config(
                text_column="source_sequence", dict_column="dict_feat"
            )
        )
        tensorizers["dict_feat"] = gazetteer
        initialize_tensorizers({"dict_feat": gazetteer}, data_source.train)

    return tensorizers
def _init_tensorizer(self, tsv=False):
    """Initialize both tokenizer variants from the TSV or JSON train split."""
    source = (self.tsv_data_source if tsv else self.json_data_source).train
    initialize_tensorizers(
        {
            "wordpiece": self.tensorizer_with_wordpiece,
            "alphanumeric": self.tensorizer_with_alphanumeric,
        },
        source,
    )
def test_create_label_list_tensors(self):
    """numberize yields (label indices, length) per row; tensorize pads rows
    to the max length in the batch."""
    tensorizer = LabelListTensorizer(
        label_column="intent", pad_in_vocab=True, allow_unknown=True
    )
    initialize_tensorizers({"intent": tensorizer}, self.data.train)
    numberized = [tensorizer.numberize(row) for row in self.data.train]
    self.assertEqual(3, len(numberized))
    # (expected label indices, expected seq len) per training row
    expected = [([2, 3], 2), ([4, 5], 2), ([6, 7, 8], 3)]
    for i, (want_idx, want_len) in enumerate(expected):
        self.assertEqual(want_idx, numberized[i][0])
        self.assertEqual(want_len, numberized[i][1])
    padded, lens = tensorizer.tensorize(numberized)
    # shorter rows are padded (pad idx 1 — pad_in_vocab=True)
    np.testing.assert_array_almost_equal(
        np.array([[2, 3, 1], [4, 5, 1], [6, 7, 8]]), padded.detach().numpy()
    )
    np.testing.assert_array_almost_equal(np.array([2, 2, 3]), lens.detach().numpy())
def test_initialize_tensorizers(self):
    """Vocab/label sizes after initializing from the train split."""
    words = WordTensorizer(column="text")
    labels = LabelTensorizer(column="label")
    chars = CharacterTensorizer(column="text")
    initialize_tensorizers(
        {"tokens": words, "labels": labels, "chars": chars}, self.data.train
    )
    # sizes observed in the training fixture
    self.assertEqual(49, len(words.vocab))
    self.assertEqual(7, len(labels.labels))
def test_initialize_tensorizers(self):
    """Vocab sizes after initializing token/label/byte tensorizers."""
    tokens = TokenTensorizer(text_column="text")
    labels = LabelTensorizer(label_column="label")
    chars = ByteTensorizer(text_column="text")
    initialize_tensorizers(
        {"tokens": tokens, "labels": labels, "chars": chars}, self.data.train
    )
    # sizes observed in the training fixture
    self.assertEqual(49, len(tokens.vocab))
    self.assertEqual(7, len(labels.vocab))
def test_initialize_list_tensorizers(self):
    """Vocab sizes for list-label tensorizers with and without
    pad/unknown entries in the vocab."""
    intent = LabelListTensorizer(
        label_column="intent", pad_in_vocab=True, allow_unknown=True
    )
    goal = LabelListTensorizer(label_column="goal")
    initialize_tensorizers({"intent": intent, "goal": goal}, self.data.train)
    # pad_in_vocab/allow_unknown add extra vocab entries for "intent"
    self.assertEqual(9, len(intent.vocab))
    self.assertEqual(7, len(goal.vocab))
def test_label_list_tensors_no_pad_in_vocab(self):
    """With pad_in_vocab=False, unknown labels map to idx 0 and padding
    positions are filled with -1."""
    tensorizer = LabelListTensorizer(
        label_column="intent", pad_in_vocab=False, allow_unknown=True
    )
    initialize_tensorizers({"intent": tensorizer}, self.data.train)
    self.assertEqual(8, len(tensorizer.vocab))
    numberized = []
    for row in self.data.train:
        # force an out-of-vocab label onto every row
        row["intent"].append("unknown")
        numberized.append(tensorizer.numberize(row))
    padded, _ = tensorizer.tensorize(numberized)
    np.testing.assert_array_almost_equal(
        np.array([[1, 2, 0, -1], [3, 4, 0, -1], [5, 6, 7, 0]]),
        padded.detach().numpy(),
    )
def _get_tensorizers(self):
    """Build initialized source/target token tensorizers for the
    compositional seq2seq unit-test TSV."""
    data_source = TSVDataSource.from_config(
        TSVDataSource.Config(
            train_filename=tests_module.test_file("compositional_seq2seq_unit.tsv"),
            field_names=["source_sequence", "target_sequence"],
        ),
        {"source_sequence": str, "target_sequence": str},
    )

    def _make(column):
        # shared config: BOS/EOS markers wrap every sequence
        return TokenTensorizer.from_config(
            TokenTensorizer.Config(
                column=column, add_eos_token=True, add_bos_token=True
            )
        )

    tensorizers = {
        "src_seq_tokens": _make("source_sequence"),
        "trg_seq_tokens": _make("target_sequence"),
    }
    initialize_tensorizers(tensorizers, data_source.train)
    return tensorizers
def _init_tensorizer(self):
    """Initialize both tokenizer variants from the shared train data source."""
    initialize_tensorizers(
        {
            "wordpiece": self.tensorizer_with_wordpiece,
            "alphanumeric": self.tensorizer_with_alphanumeric,
        },
        self.data_source.train,
    )