예제 #1
0
 def test_uppercase_tokens(self):
     """Verify casing is preserved in the vocab when lowercase_tokens=False."""
     handler = CompositionalDataHandler.from_config(
         CompositionalDataHandler.Config(),
         FeatureConfig(
             word_feat=WordFeatConfig(vocab_from_all_data=True, min_freq=1)
         ),
         featurizer=SimpleFeaturizer.from_config(
             SimpleFeaturizer.Config(lowercase_tokens=False), FeatureConfig()
         ),
     )
     handler.init_metadata_from_raw_data(
         self.train_data, self.eval_data, self.test_data
     )
     # Expected vocab: every surface form from the three splits, case intact,
     # plus the unknown-token marker.
     expected_vocab = set(
         "<unk> What EVENTS can I go today "
         "Are there any adult events this weekend".split()
     )
     self.assertSetEqual(
         set(handler.features["word_feat"].vocab.stoi), expected_vocab
     )
 def setUp(self):
     """Create a joint doc/word-label data handler with default settings."""
     featurizer = SimpleFeaturizer.from_config(
         SimpleFeaturizer.Config(), FeatureConfig()
     )
     self.data_handler = JointModelDataHandler.from_config(
         JointModelDataHandler.Config(),
         FeatureConfig(),
         [DocLabelConfig(), WordLabelConfig()],
         featurizer=featurizer,
     )
예제 #3
0
 def _init_data_handler(self):
     """Build an LM data handler over FILE_NAME with shuffling disabled."""
     featurizer = create_featurizer(SimpleFeaturizer.Config(), FeatureConfig())
     handler = LanguageModelDataHandler.from_config(
         LanguageModelDataHandler.Config(),
         FeatureConfig(),
         WordLabelConfig(),
         featurizer=featurizer,
         shuffle=False,
     )
     # The same file serves as train/eval/test split.
     handler.init_metadata_from_path(FILE_NAME, FILE_NAME, FILE_NAME)
     return handler
예제 #4
0
    def test_intializing_embeds_from_config(self):
        """
        Check pretrained-embedding initialization strategies: RANDOM must
        produce non-zero values, ZERO must produce an all-zero row.

        The original body duplicated the whole handler-construction stanza
        for each strategy; the shared setup is factored into a local helper.
        """

        def embeds_for_strategy(strategy):
            # Build a handler for `strategy` and return the pretrained
            # embedding weight tensor derived from its metadata.
            feature_config = FeatureConfig(
                word_feat=WordFeatConfig(
                    embedding_init_strategy=strategy,
                    embed_dim=5,
                    pretrained_embeddings_path=tests_module.TEST_BASE_DIR,
                )
            )
            data_handler = JointModelDataHandler.from_config(
                JointModelDataHandler.Config(),
                feature_config,
                [DocLabelConfig(), WordLabelConfig()],
                featurizer=SimpleFeaturizer.from_config(
                    SimpleFeaturizer.Config(), feature_config
                ),
            )
            data_handler.init_metadata_from_path(TRAIN_FILE, EVAL_FILE, TEST_FILE)
            return data_handler.metadata.features[
                DatasetFieldName.TEXT_FIELD
            ].pretrained_embeds_weight

        # test random initialization (values should be non-0)
        pretrained_embeds = embeds_for_strategy(EmbedInitStrategy.RANDOM)
        np.testing.assert_array_less(
            [0, 0, 0, 0, 0], np.absolute(pretrained_embeds[11].numpy())
        )

        # test zero initialization (values should all be 0)
        pretrained_embeds = embeds_for_strategy(EmbedInitStrategy.ZERO)
        np.testing.assert_array_equal([0, 0, 0, 0, 0], pretrained_embeds[11].numpy())
예제 #5
0
    def _create_dummy_data_handler(self):
        """Create a doc-classification handler fed a single '<pad>' record."""
        word_feat = WordFeatConfig(
            vocab_size=4,
            vocab_from_all_data=True,
            vocab_from_train_data=True,
            vocab_from_pretrained_embeddings=False,
            pretrained_embeddings_path=None,
        )
        handler = DocClassificationDataHandler.from_config(
            DocClassificationDataHandler.Config(),
            ModelInputConfig(word_feat=word_feat),
            TargetConfig(),
            featurizer=create_featurizer(
                SimpleFeaturizer.Config(), FeatureConfig(word_feat=word_feat)
            ),
        )
        # The same one-row dataset is generated for train, eval and test.
        train_split, eval_split, test_split = (
            handler.gen_dataset([{"text": "<pad>"}], include_label_fields=False)
            for _ in range(3)
        )
        handler.init_feature_metadata(train_split, eval_split, test_split)

        return handler
    def test_split_with_regex(self):
        """Tokenizing with a custom split_regex strips the listed punctuation."""
        featurizer = SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(split_regex=r"[\s,;!.?\"\(\)\-]+"),
            FeatureConfig(),
        )
        sentence = """
            Your bones don't break, mine do. That's clear. Your cells react to
            bacteria and viruses differently than mine. You don't get sick,
            I do. That's also clear. But for some reason, you and I react the
            exact same way to water. We swallow it too fast, we choke. We get
            some in our lungs, we drown. However unreal it may seem, we are
            connected, you and I. We're on the same curve, just on opposite
            ends.
        """
        expected = """
            your bones don't break mine do that's clear your cells react to
            bacteria and viruses differently than mine you don't get sick
            i do that's also clear but for some reason you and i react the
            exact same way to water we swallow it too fast we choke we get
            some in our lungs we drown however unreal it may seem we are
            connected you and i we're on the same curve just on opposite ends
        """.split()
        # Exercise both the long passage and a short punctuation-heavy line.
        cases = [
            (sentence, expected),
            (
                '"Please, buy me a coffee?" He implored-in vain.',
                "please buy me a coffee he implored in vain".split(),
            ),
        ]
        for raw_text, want in cases:
            got = featurizer.featurize(InputRecord(raw_text=raw_text)).tokens
            self.assertListEqual(want, got)
예제 #7
0
    def test_read_partially_from_csv(self):
        """
        Reading a TSV with a partial column mapping should populate exactly
        the mapped columns with the expected values.
        """
        file_name = tests_module.test_file("train_data_tiny.tsv")
        columns = {DFColumn.DOC_LABEL: 0, DFColumn.UTTERANCE: 2}

        feat = WordFeatConfig(
            vocab_from_all_data=True,
            vocab_from_train_data=False,
            vocab_from_pretrained_embeddings=False,
        )
        featurizer = create_featurizer(
            SimpleFeaturizer.Config(), FeatureConfig(word_feat=feat)
        )
        data_handler = DocClassificationDataHandler.from_config(
            DocClassificationDataHandler.Config(),
            ModelInputConfig(word_feat=feat),
            TargetConfig(),
            featurizer=featurizer,
        )
        data = list(data_handler.read_from_file(file_name, columns))
        for col in columns:
            # Fixed assertion message grammar ("must in" -> "must be in").
            self.assertTrue(col in data[0], "{} must be in the data".format(col))
        self.assertEqual("alarm/modify_alarm", data[0][DFColumn.DOC_LABEL])
        self.assertEqual(
            "change my alarm tomorrow to wake me up 30 minutes earlier",
            data[0][DFColumn.UTTERANCE],
        )
예제 #8
0
    def get_feature_metadata(cls, feature_config: FeatureConfig,
                             feature_meta: Dict[str, FieldMeta]):
        """
        Collect export input names, dummy model inputs and itos maps for each
        configured feature, appending length tensors for token/seq features.
        """
        # input_names must stay index-aligned with dummy_model_input: the
        # exporter pairs them positionally.
        input_names: List[str] = []
        dummy_model_input: List = []
        feature_itos_map = {}

        for name, feat_config in feature_config._asdict().items():
            if not isinstance(feat_config, ConfigBase):
                continue
            input_names.extend(feat_config.export_input_names)
            meta = feature_meta[name]
            if getattr(meta, "vocab", None):
                feature_itos_map[feat_config.export_input_names[0]] = meta.vocab.itos
            dummy_model_input.append(meta.dummy_model_input)

        # Variable-length token inputs carry a companion lengths tensor.
        if "tokens_vals" in input_names:
            dummy_model_input.append(torch.tensor([1, 1], dtype=torch.long))
            input_names.append("tokens_lens")
        if "seq_tokens_vals" in input_names:
            dummy_model_input.append(torch.tensor([1, 1], dtype=torch.long))
            input_names.append("seq_tokens_lens")
        return input_names, tuple(dummy_model_input), feature_itos_map
예제 #9
0
    def test_tokenize(self):
        """Default featurizer lowercases tokens and splits on whitespace."""
        featurizer = SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(), FeatureConfig()
        )
        result = featurizer.featurize(InputRecord(raw_text="At eight o'clock"))
        self.assertEqual(["at", "eight", "o'clock"], result.tokens)
예제 #10
0
 def test_convert_to_bytes(self):
     """convert_to_bytes=True tokenizes into single characters, case kept."""
     featurizer = SimpleFeaturizer.from_config(
         SimpleFeaturizer.Config(convert_to_bytes=True,
                                 lowercase_tokens=False),
         FeatureConfig(),
     )
     tokens = featurizer.featurize(InputRecord(raw_text=self.sentence)).tokens
     # One token per character of the sentence, spaces included.
     self.assertListEqual(tokens, list("Order me a coffee"))
예제 #11
0
 class Config(ConfigBase):
     """
     Task-level configuration: each field selects and parameterizes one
     pipeline component. All fields have defaults except data_handler,
     which must be supplied by the user.
     """
     # Input feature configuration (word/dict/char features etc.).
     features: FeatureConfig = FeatureConfig()
     # Raw-text -> token featurization step.
     featurizer: Featurizer.Config = SimpleFeaturizer.Config()
     # No default: each task must pick its own data handler.
     data_handler: DataHandler.Config
     trainer: Trainer.Config = Trainer.Config()
     optimizer: Optimizer.Config = Adam.Config()
     # Optional components may be disabled by setting them to None.
     scheduler: Optional[Scheduler.Config] = Scheduler.Config()
     exporter: Optional[ModelExporter.Config] = None
 def test_tokenize_add_sentence_markers(self):
     """Sentence markers wrap the token stream when configured."""
     featurizer = SimpleFeaturizer.from_config(
         SimpleFeaturizer.Config(sentence_markers=("<s>", "</s>")),
         FeatureConfig(),
     )
     result = featurizer.featurize(InputRecord(raw_text=self.sentence))
     self.assertListEqual(
         result.tokens, ["<s>", "order", "me", "a", "coffee", "</s>"]
     )
예제 #13
0
 def test_freeze_all_embedding(self):
     """FeatureConfig(freeze=True) must freeze every embedding parameter."""
     model = create_model(
         DocModel_Deprecated.Config(),
         FeatureConfig(freeze=True),
         metadata=mock_metadata(),
     )
     # No embedding parameter may still require gradients.
     for weight in model.embedding.parameters():
         self.assertFalse(weight.requires_grad)
예제 #14
0
 def test_tokenize_dont_lowercase(self):
     """With lowercase_tokens=False both tokens and characters keep case."""
     featurizer = SimpleFeaturizer.from_config(
         SimpleFeaturizer.Config(lowercase_tokens=False), FeatureConfig()
     )
     result = featurizer.featurize(InputRecord(raw_text=self.sentence))
     want_tokens = ["Order", "me", "a", "coffee"]
     self.assertListEqual(result.tokens, want_tokens)
     # Character features mirror the tokens, split into single characters.
     self.assertListEqual(result.characters, [list(t) for t in want_tokens])
 def setUp(self):
     """Create a doc-classification handler that also reads DENSE_FEAT."""
     handler_config = DocClassificationDataHandler.Config()
     handler_config.columns_to_read.append(ModelInput.DENSE_FEAT)
     self.data_handler = DocClassificationDataHandler.from_config(
         # Bug fix: the customized config was built above but a fresh
         # default Config() was passed here, silently dropping the
         # DENSE_FEAT column. Pass the customized config instead.
         handler_config,
         ModelInputConfig(),
         [],
         featurizer=SimpleFeaturizer.from_config(SimpleFeaturizer.Config(),
                                                 FeatureConfig()),
     )
 def test_min_freq(self):
     """
     Test that UNKification is triggered when min_freq is 2.
     """
     custom_dh = CompositionalDataHandler.from_config(
         CompositionalDataHandler.Config(),
         FeatureConfig(word_feat=WordFeatConfig(vocab_from_all_data=True,
                                                min_freq=2)),
         featurizer=SimpleFeaturizer.from_config(
             SimpleFeaturizer.Config(lowercase_tokens=True),
             FeatureConfig()),
     )
     custom_dh.init_metadata_from_raw_data(self.train_data, self.eval_data,
                                           self.test_data)
     # Tokens under min_freq collapse to <unk>; numeric tokens get the
     # dedicated <unk>-NUM marker. The original literal listed "<unk>"
     # three times — duplicate elements in a set literal are redundant
     # and have been removed (the resulting set is identical).
     self.assertSetEqual(
         set(custom_dh.features["word_feat"].vocab.stoi),
         {"<unk>", "<unk>-NUM", "events"},
     )
예제 #17
0
 def test_convert_to_bytes(self):
     """Byte mode yields one token per character; characters mirror tokens."""
     featurizer = SimpleFeaturizer.from_config(
         SimpleFeaturizer.Config(convert_to_bytes=True, lowercase_tokens=False),
         FeatureConfig(),
     )
     result = featurizer.featurize(InputRecord(raw_text=self.sentence))
     want = list("Order me a coffee")
     self.assertListEqual(result.tokens, want)
     # Each single-character token maps to a one-element character list.
     self.assertListEqual(result.characters, [[ch] for ch in want])
    def setUp(self):
        """Create a pairwise-ranking handler with byte-level featurization."""
        # Empty split_regex + bytes mode => character-level tokens.
        featurizer_config = SimpleFeaturizer.Config()
        featurizer_config.split_regex = r""
        featurizer_config.convert_to_bytes = True

        featurizer = SimpleFeaturizer.from_config(
            featurizer_config, FeatureConfig()
        )
        self.data_handler = QueryDocumentPairwiseRankingDataHandler.from_config(
            QueryDocumentPairwiseRankingDataHandler.Config(),
            ModelInputConfig(),
            [],
            featurizer=featurizer,
        )
예제 #19
0
    def test_data_handler(self):
        """BPTT handler splits the tiny corpus into two fixed-content batches."""
        handler = BPTTLanguageModelDataHandler.from_config(
            BPTTLanguageModelDataHandler.Config(bptt_len=4),
            FeatureConfig(),
            WordLabelConfig(),
            featurizer=SimpleFeaturizer.from_config(
                SimpleFeaturizer.Config(), FeatureConfig()
            ),
        )
        handler.init_metadata_from_path(FILE_NAME, FILE_NAME, FILE_NAME)

        batches = list(handler.get_train_iter_from_path(FILE_NAME, BATCH_SIZE))
        # There are two batches in the tiny dataset
        self.assertEqual(len(batches), 2)

        # Each batch is tuple(input, target, context) where
        #   input = (input_sequences, sequence_length),
        #   input_sequences is (bsize, max_seq_length),
        #   sequence_length is (bsize,),
        #   target[0] mirrors input_sequences' shape.
        expected = [
            # (input_sequences, sequence_lengths, targets)
            (
                [[15, 19, 12, 16], [3, 13, 21, 8], [20, 7, 23, 4], [6, 5, 7, 22]],
                [4, 4, 4, 4],
                [[19, 12, 16, 14], [13, 21, 8, 3], [7, 23, 4, 3], [5, 7, 22, 10]],
            ),
            (
                [[14, 17, 11], [3, 5, 18], [3, 8, 4], [10, 4, 9]],
                [3, 3, 3, 3],
                [[17, 11, 4], [5, 18, 6], [8, 4, 3], [4, 9, 1]],
            ),
        ]
        for batch, (inputs, lengths, targets) in zip(batches, expected):
            np.testing.assert_array_equal(batch[0][0], inputs)
            np.testing.assert_array_equal(batch[0][1], lengths)
            np.testing.assert_array_equal(batch[1][0], targets)
예제 #20
0
 def _create_dummy_model(self):
     """Build a deprecated doc model whose submodules save to temp paths."""
     model_config = DocModel_Deprecated.Config(
         representation=BiLSTMDocAttention.Config(
             save_path=self.representation_path
         ),
         decoder=MLPDecoder.Config(save_path=self.decoder_path),
     )
     feature_config = FeatureConfig(
         word_feat=WordEmbedding.Config(
             embed_dim=300, save_path=self.word_embedding_path
         ),
         save_path=self.embedding_path,
     )
     return create_model(
         model_config, feature_config, self._create_dummy_meta_data()
     )
예제 #21
0
    def setUp(self):
        """Prepare tiny seq-classification splits and a SeqModel data handler."""
        # Each utterance column holds a JSON-encoded list of turns.
        self.train_data = [
            {
                DFColumn.DOC_LABEL: "cu:discuss_where",
                DFColumn.UTTERANCE: '["where do you wanna meet?", "MPK"]',
            }
        ]

        self.eval_data = [
            {
                DFColumn.DOC_LABEL: "cu:discuss_where",
                DFColumn.UTTERANCE: '["how about SF?", "sounds good"]',
            },
            {DFColumn.DOC_LABEL: "cu:other", DFColumn.UTTERANCE: '["lol"]'},
        ]

        self.test_data = [
            {
                DFColumn.DOC_LABEL: "cu:discuss_where",
                DFColumn.UTTERANCE: '["MPK sounds good to me"]',
            },
            {
                DFColumn.DOC_LABEL: "cu:other",
                DFColumn.UTTERANCE: '["great", "awesome"]',
            },
        ]

        self.dh = SeqModelDataHandler.from_config(
            SeqModelDataHandler.Config(),
            FeatureConfig(),
            DocLabelConfig(),
            featurizer=SimpleFeaturizer.from_config(
                SimpleFeaturizer.Config(), FeatureConfig()
            ),
        )
예제 #22
0
 def setUp(self):
     """Build the KD doc-classification handler over the tiny fixture file."""
     file_name = tests_module.test_file(
         "knowledge_distillation_test_tiny.tsv")
     # Keyword arguments are spelled out directly instead of going through
     # intermediate **dicts.
     self.data_handler = KDDocClassificationDataHandler.from_config(
         KDDocClassificationDataHandler.Config(
             columns_to_read=["text", "target_probs", "target_labels",
                              "doc_label"]
         ),
         ModelInputConfig(),
         TargetConfig(target_prob=True),
         featurizer=SimpleFeaturizer.from_config(SimpleFeaturizer.Config(),
                                                 FeatureConfig()),
     )
     self.data = self.data_handler.read_from_file(
         file_name, self.data_handler.raw_columns)
예제 #23
0
    def create_language_model_data_handler(cls) -> LanguageModelDataHandler:
        """Construct an LM data handler directly, bypassing from_config."""
        # TODO: Refactor this after Shicong refactors PyText config and removes
        # Thrift. After that directly use Data Handler's from config method
        # with synthetic configs
        text_field = TextFeatureField(
            eos_token=VocabMeta.EOS_TOKEN, init_token=VocabMeta.INIT_TOKEN
        )
        return LanguageModelDataHandler(
            raw_columns=[DFColumn.UTTERANCE],
            features={DatasetFieldName.TEXT_FIELD: text_field},
            labels={},
            featurizer=create_featurizer(
                SimpleFeaturizer.Config(), FeatureConfig()
            ),
        )
예제 #24
0
    def DISABLED_test_freeze_word_embedding(self):
        """Freezing word_feat freezes only the word-embedding weights."""
        model = create_model(
            DocModel.Config(),
            FeatureConfig(
                word_feat=WordFeatConfig(freeze=True, mlp_layer_dims=[4]),
                dict_feat=DictFeatConfig(),
            ),
            metadata=mock_metadata(),
        )
        word_module = model.embedding[0]
        # Word-embedding weights must be frozen...
        for weight in word_module.word_embedding.parameters():
            self.assertFalse(weight.requires_grad)
        # ...while the word module's MLP stays trainable.
        for weight in word_module.mlp.parameters():
            self.assertTrue(weight.requires_grad)

        # The dict-feature embedding is unaffected by the word freeze.
        for weight in model.embedding[1].parameters():
            self.assertTrue(weight.requires_grad)
예제 #25
0
    def setup_data(self):
        """Initialize the pairwise-ranking handler over the tiny TSV fixture."""
        # Empty split_regex + bytes mode => character-level featurization.
        featurizer_config = SimpleFeaturizer.Config()
        featurizer_config.split_regex = r""
        featurizer_config.convert_to_bytes = True

        self.data_handler = QueryDocumentPairwiseRankingDataHandler.from_config(
            QueryDocumentPairwiseRankingDataHandler.Config(),
            ModelInputConfig(),
            [],
            featurizer=SimpleFeaturizer.from_config(
                featurizer_config, FeatureConfig()
            ),
        )
        self.file_name = tests_module.test_file(
            "query_document_pairwise_ranking_tiny.tsv"
        )
        self.data_handler.shuffle = False
        # The same file is reused for the train/eval/test splits.
        self.data_handler.init_metadata_from_path(
            self.file_name, self.file_name, self.file_name
        )
예제 #26
0
    def _get_exportable_metadata(
        cls,
        exportable_filter: Callable,
        feature_config: FeatureConfig,
        feature_meta: Dict[str, FieldMeta],
    ) -> Tuple[List[str], List, Dict]:
        # The number of names in input_names *must* be equal to the number of
        # tensors passed in dummy_input
        input_names: List[str] = []
        dummy_model_input: List = []
        feature_itos_map = {}

        for name, feat_config in feature_config._asdict().items():
            if exportable_filter(feat_config):
                input_names.extend(feat_config.export_input_names)
                if getattr(feature_meta[name], "vocab", None):
                    feature_itos_map[feat_config.export_input_names[
                        0]] = feature_meta[name].vocab.itos
                dummy_model_input.append(feature_meta[name].dummy_model_input)
        return input_names, dummy_model_input, feature_itos_map
예제 #27
0
    def create_sub_embs(cls, emb_config: FeatureConfig,
                        metadata: CommonMetadata) -> Dict[str, EmbeddingBase]:
        """
        Creates the embedding modules defined in the `emb_config`.

        Args:
            emb_config (FeatureConfig): Object containing all the sub-embedding
                configurations.
            metadata (CommonMetadata): Object containing features and label metadata.

        Returns:
            Dict[str, EmbeddingBase]: Named dictionary of embedding modules.
        """
        sub_embs = {}
        for name, sub_config in emb_config._asdict().items():
            # Only configs whose component class is an EmbeddingBase subclass
            # produce a module; everything else is reported and skipped.
            component_cls = getattr(sub_config, "__COMPONENT__", object)
            if not issubclass(component_cls, EmbeddingBase):
                print(f"{name} is not a config of embedding, skipping")
                continue
            sub_embs[name] = create_module(
                sub_config, metadata=metadata.features[name]
            )
        return sub_embs
예제 #28
0
    def test_load_save(self):
        """Saved modules load back equal; both differ from a random model."""
        text_field_meta = FieldMeta()
        text_field_meta.vocab = VocabStub()
        text_field_meta.vocab_size = 4
        text_field_meta.unk_token_idx = 1
        text_field_meta.pad_token_idx = 0
        text_field_meta.pretrained_embeds_weight = None
        label_meta = FieldMeta()
        label_meta.vocab = VocabStub()
        label_meta.vocab_size = 3
        metadata = CommonMetadata()
        metadata.features = {DatasetFieldName.TEXT_FIELD: text_field_meta}
        metadata.target = label_meta

        saved_model = create_model(
            DocModel.Config(
                representation=BiLSTMDocAttention.Config(
                    save_path=self.representation_path
                ),
                decoder=MLPDecoder.Config(save_path=self.decoder_path),
            ),
            FeatureConfig(save_path=self.embedding_path),
            metadata,
        )
        saved_model.save_modules()

        loaded_model = create_model(
            DocModel.Config(
                representation=BiLSTMDocAttention.Config(
                    load_path=self.representation_path
                ),
                decoder=MLPDecoder.Config(load_path=self.decoder_path),
            ),
            FeatureConfig(load_path=self.embedding_path),
            metadata,
        )

        random_model = create_model(
            DocModel.Config(
                representation=BiLSTMDocAttention.Config(),
                decoder=MLPDecoder.Config(),
            ),
            FeatureConfig(),
            metadata,
        )

        # Loaded and saved modules should be equal. Neither should be equal to
        # a randomly initialised model. The same check runs for each submodule.
        for module_name in ("embedding", "representation", "decoder"):
            for saved_p, loaded_p, random_p in itertools.zip_longest(
                getattr(saved_model, module_name).parameters(),
                getattr(loaded_model, module_name).parameters(),
                getattr(random_model, module_name).parameters(),
            ):
                self.assertTrue(saved_p.equal(loaded_p))
                self.assertFalse(random_p.equal(saved_p))
                self.assertFalse(random_p.equal(loaded_p))
예제 #29
0
    def test_init_feature_metadata(self):
        """
        Build word-feature metadata from raw text and/or pretrained embeddings
        under three vocab-source configurations, and verify the resulting
        vocab tokens and the number of pretrained-embedding rows.
        """
        # Specify data
        feat_name = ModelInput.WORD_FEAT
        train_text = "Hi there you"
        eval_text = ""
        test_text = "Go away"
        # Fixture expected to contain 5-dim vectors for exactly the tokens
        # listed below — TODO confirm against pretrained_embed_raw.
        pretrained_embedding_file = tests_module.test_file("pretrained_embed_raw")
        pretrained_tokens = {
            "</s>",
            "the",
            "to",
            "and",
            "a",
            "I",
            "you",
            "is",
            "aloha",
            "for",
        }

        # Specify test cases
        test_cases = (
            # Vocab from train / eval / test data
            {
                "feat": WordFeatConfig(
                    vocab_from_all_data=True,
                    vocab_from_train_data=False,
                    vocab_from_pretrained_embeddings=False,
                ),
                # Lower-cased tokens from all three splits plus specials.
                "expected_tokens": {
                    "hi",
                    "there",
                    "you",
                    "go",
                    "away",
                    VocabMeta.UNK_TOKEN,
                    VocabMeta.PAD_TOKEN,
                },
                "expected_num_pretrained_tokens": 0,
            },
            # Vocab from train data or pretrained embeddings
            {
                "feat": WordFeatConfig(
                    vocab_from_all_data=False,
                    vocab_from_train_data=True,
                    vocab_from_pretrained_embeddings=True,
                    pretrained_embeddings_path=pretrained_embedding_file,
                    embed_dim=5,
                ),
                "expected_tokens": pretrained_tokens.union(
                    {"hi", "there", VocabMeta.UNK_TOKEN, VocabMeta.PAD_TOKEN}
                ),
                "expected_num_pretrained_tokens": len(pretrained_tokens) + 4,
            },
            # Vocab from limited number of pretrained embeddings
            {
                "feat": WordFeatConfig(
                    vocab_from_all_data=False,
                    vocab_from_train_data=False,
                    vocab_from_pretrained_embeddings=True,
                    pretrained_embeddings_path=pretrained_embedding_file,
                    embed_dim=5,
                    vocab_size=2,
                ),
                "expected_tokens": {
                    "</s>",
                    "the",
                    VocabMeta.UNK_TOKEN,
                    VocabMeta.PAD_TOKEN,
                },
                # special tokens excluded from vocab_size = 2
                "expected_num_pretrained_tokens": 4,
            },
        )

        for case in test_cases:
            # Setup data handler
            featurizer = create_featurizer(
                SimpleFeaturizer.Config(), FeatureConfig(word_feat=case["feat"])
            )
            data_handler = DocClassificationDataHandler.from_config(
                DocClassificationDataHandler.Config(),
                ModelInputConfig(word_feat=case["feat"]),
                TargetConfig(),
                featurizer=featurizer,
            )
            train_data = data_handler.gen_dataset(
                [{"text": train_text}], include_label_fields=False
            )
            eval_data = data_handler.gen_dataset(
                [{"text": eval_text}], include_label_fields=False
            )
            test_data = data_handler.gen_dataset(
                [{"text": test_text}], include_label_fields=False
            )
            data_handler.init_feature_metadata(train_data, eval_data, test_data)

            # Check created vocab
            meta = data_handler.metadata.features[feat_name]
            self.assertEqual(set(meta.vocab.stoi.keys()), case["expected_tokens"])
            if case["expected_num_pretrained_tokens"] == 0:
                self.assertIsNone(meta.pretrained_embeds_weight)
            else:
                self.assertEqual(
                    meta.pretrained_embeds_weight.size(0),
                    case["expected_num_pretrained_tokens"],
                )
예제 #30
0
    def setUp(self):
        """
        Build tiny train/eval/test fixtures for compositional (seqlogical)
        parsing and a CompositionalDataHandler over them.

        Each record carries the doc label, slot annotations with character
        spans into the utterance, the raw utterance, (empty) dict features,
        and the bracketed seqlogical form of the parse.
        """
        self.train_data = [{
            DFColumn.DOC_LABEL:
            "IN:GET_EVENT",
            DFColumn.WORD_LABEL: [{
                "id": "SL:DATE_TIME",
                "span": {
                    "start": 21,
                    "end": 26
                },
                "text": "today",
            }],
            DFColumn.UTTERANCE:
            "What EVENTS can I go today",
            DFColumn.DICT_FEAT:
            "",
            DFColumn.SEQLOGICAL:
            "[IN:GET_EVENT What EVENTS can I go [SL:DATE_TIME today ] ]",
        }]

        self.eval_data = [{
            DFColumn.DOC_LABEL:
            "IN:GET_EVENT",
            DFColumn.WORD_LABEL: [
                {
                    "id": "SL:ATTRIBUTE_EVENT",
                    "span": {
                        "start": 14,
                        "end": 19
                    },
                    "text": "adult",
                },
                {
                    "id": "SL:DATE_TIME",
                    "span": {
                        "start": 27,
                        "end": 39
                    },
                    "text": "this weekend",
                },
            ],
            DFColumn.UTTERANCE:
            "Are there any adult events this weekend",
            DFColumn.DICT_FEAT:
            "",
            DFColumn.SEQLOGICAL:
            "[IN:GET_EVENT Are there any [SL:ATTRIBUTE_EVENT adult ] events [SL:DATE_TIME this weekend ] ]",
        }]

        # The test record exercises a nested parse: the SL:DESTINATION slot
        # contains a subframe with its own intent and slot (spans inside the
        # subframe are relative to the subframe's utterance).
        self.test_data = [{
            DFColumn.DOC_LABEL:
            "IN:GET_INFO_ROAD_CONDITION",
            DFColumn.WORD_LABEL: [
                {
                    "id": "SL:ROAD_CONDITION",
                    "span": {
                        "start": 9,
                        "end": 21
                    },
                    "text": "any flooding",
                },
                {
                    "id": "SL:DESTINATION",
                    "span": {
                        "start": 36,
                        "end": 41
                    },
                    "text": "Karen",
                    "subframe": {
                        "utterance":
                        "Karen",
                        "domain":
                        "",
                        "intent":
                        "IN:GET_LOCATION_HOME",
                        "slots": [{
                            "id": "SL:CONTACT",
                            "span": {
                                "start": 0,
                                "end": 5
                            },
                            "text": "Karen",
                        }],
                        "span": {
                            "start": 0,
                            "end": 5
                        },
                    },
                },
            ],
            DFColumn.UTTERANCE:
            "Is there any flooding on the way to Karen's?",
            DFColumn.DICT_FEAT:
            "",
            DFColumn.SEQLOGICAL:
            "[IN:GET_INFO_ROAD_CONDITION Is there [SL:ROAD_CONDITION any flooding ] on the way to [SL:DESTINATION [IN:GET_LOCATION_HOME [SL:CONTACT Karen 's ? ] ] ] ]",
        }]

        # Handler under test: vocab built from all splits, no min-frequency
        # cutoff, lower-cased tokens.
        self.dh = CompositionalDataHandler.from_config(
            CompositionalDataHandler.Config(),
            FeatureConfig(word_feat=WordFeatConfig(vocab_from_all_data=True,
                                                   min_freq=1)),
            featurizer=SimpleFeaturizer.from_config(
                SimpleFeaturizer.Config(lowercase_tokens=True),
                FeatureConfig()),
        )