Example #1
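Checks embedding weight initialization: with EmbedInitStrategy.RANDOM the pretrained embedding weights come out non-zero, while with EmbedInitStrategy.ZERO they are all zero.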
    def test_initializing_embeds_from_config(self):
        feature_config = FeatureConfig(
            word_feat=WordFeatConfig(
                embedding_init_strategy=EmbedInitStrategy.RANDOM,
                embed_dim=5,
                pretrained_embeddings_path=tests_module.TEST_BASE_DIR,
            )
        )
        data_handler = JointModelDataHandler.from_config(
            JointModelDataHandler.Config(),
            feature_config,
            [DocLabelConfig(), WordLabelConfig()],
            featurizer=SimpleFeaturizer.from_config(
                SimpleFeaturizer.Config(), feature_config
            ),
        )

        data_handler.init_metadata_from_path(TRAIN_FILE, EVAL_FILE, TEST_FILE)

        pretrained_embeds = data_handler.metadata.features[
            DatasetFieldName.TEXT_FIELD
        ].pretrained_embeds_weight
        # test random initialization (values should be non-0)
        np.testing.assert_array_less(
            [0, 0, 0, 0, 0], np.absolute(pretrained_embeds[11].numpy())
        )

        feature_config = FeatureConfig(
            word_feat=WordFeatConfig(
                embedding_init_strategy=EmbedInitStrategy.ZERO,
                embed_dim=5,
                pretrained_embeddings_path=tests_module.TEST_BASE_DIR,
            )
        )
        data_handler = JointModelDataHandler.from_config(
            JointModelDataHandler.Config(),
            feature_config,
            [DocLabelConfig(), WordLabelConfig()],
            featurizer=SimpleFeaturizer.from_config(
                SimpleFeaturizer.Config(), feature_config
            ),
        )
        data_handler.init_metadata_from_path(TRAIN_FILE, EVAL_FILE, TEST_FILE)

        pretrained_embeds = data_handler.metadata.features[
            DatasetFieldName.TEXT_FIELD
        ].pretrained_embeds_weight
        # test zero initialization (values should all be 0)
        np.testing.assert_array_equal([0, 0, 0, 0, 0], pretrained_embeds[11].numpy())
Example #2
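Builds a minimal DocClassificationDataHandler over a single "<pad>" utterance, using a WordFeatConfig whose vocabulary comes from the data rather than from pretrained embeddings.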
    def _create_dummy_data_handler(self):
        feat = WordFeatConfig(
            vocab_size=4,
            vocab_from_all_data=True,
            vocab_from_train_data=True,
            vocab_from_pretrained_embeddings=False,
            pretrained_embeddings_path=None,
        )
        featurizer = create_featurizer(
            SimpleFeaturizer.Config(), FeatureConfig(word_feat=feat)
        )
        data_handler = DocClassificationDataHandler.from_config(
            DocClassificationDataHandler.Config(),
            ModelInputConfig(word_feat=feat),
            TargetConfig(),
            featurizer=featurizer,
        )
        train_data = data_handler.gen_dataset(
            [{"text": "<pad>"}], include_label_fields=False
        )
        eval_data = data_handler.gen_dataset(
            [{"text": "<pad>"}], include_label_fields=False
        )
        test_data = data_handler.gen_dataset(
            [{"text": "<pad>"}], include_label_fields=False
        )
        data_handler.init_feature_metadata(train_data, eval_data, test_data)

        return data_handler
Example #3
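Verifies that tokens keep their original casing when the featurizer is configured with lowercase_tokens=False.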
 def test_uppercase_tokens(self):
     """
     Test that the text is not lower-cased when lowercase_tokens is False.
     """
     custom_dh = CompositionalDataHandler.from_config(
         CompositionalDataHandler.Config(),
         FeatureConfig(word_feat=WordFeatConfig(vocab_from_all_data=True,
                                                min_freq=1)),
         featurizer=SimpleFeaturizer.from_config(
             SimpleFeaturizer.Config(lowercase_tokens=False),
             FeatureConfig()),
     )
     custom_dh.init_metadata_from_raw_data(self.train_data, self.eval_data,
                                           self.test_data)
     self.assertSetEqual(
         set(custom_dh.features["word_feat"].vocab.stoi),
         {
             "<unk>",
             "What",
             "EVENTS",
             "can",
             "I",
             "go",
             "today",
             "Are",
             "there",
             "any",
             "adult",
             "events",
             "this",
             "weekend",
         },
     )
Example #4
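Reads only the DOC_LABEL and UTTERANCE columns from a TSV file and checks the parsed values of the first row.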
    def test_read_partially_from_csv(self):
        file_name = tests_module.test_file("train_data_tiny.tsv")
        columns = {DFColumn.DOC_LABEL: 0, DFColumn.UTTERANCE: 2}

        feat = WordFeatConfig(
            vocab_from_all_data=True,
            vocab_from_train_data=False,
            vocab_from_pretrained_embeddings=False,
        )
        featurizer = create_featurizer(
            SimpleFeaturizer.Config(), FeatureConfig(word_feat=feat)
        )
        data_handler = DocClassificationDataHandler.from_config(
            DocClassificationDataHandler.Config(),
            ModelInputConfig(word_feat=feat),
            TargetConfig(),
            featurizer=featurizer,
        )
        data = list(data_handler.read_from_file(file_name, columns))
        for col in columns:
            self.assertTrue(col in data[0], "{} must be in the data".format(col))
        self.assertEqual("alarm/modify_alarm", data[0][DFColumn.DOC_LABEL])
        self.assertEqual(
            "change my alarm tomorrow to wake me up 30 minutes earlier",
            data[0][DFColumn.UTTERANCE],
        )
Example #5
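A model Config that combines NewModel.Config and DocModel.Config, pairing a WordFeatConfig embedding with word and label tensorizers.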
    class Config(NewModel.Config, DocModel.Config):
        embedding: WordFeatConfig = WordFeatConfig()

        class ModelInput(NewModel.Config.ModelInput):
            tokens: WordTensorizer.Config = WordTensorizer.Config()
            labels: LabelTensorizer.Config = LabelTensorizer.Config(allow_unknown=True)

        inputs: ModelInput = ModelInput()
Example #6
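Builds a (deprecated) query/document pairwise ranking model, giving the query and the positive and negative responses their own WordFeatConfig instances and optionally sharing the representation layer.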
    def create_model(self, shared_rep):
        # shared_rep: do query and response share representation layer?
        metadata = self.data_handler.metadata
        model_config = QueryDocumentPairwiseRankingModel_Deprecated.Config()

        model_config.representation = QueryDocumentPairwiseRankingRep.Config()
        model_config.representation.shared_representations = shared_rep

        model_config.decoder = MLPDecoderQueryResponse.Config()
        model_config.decoder.hidden_dims = [64]
        model_config.output_layer = PairwiseRankingOutputLayer.Config()

        feat_config = ModelInputConfig()
        feat_config.pos_response = WordFeatConfig()
        feat_config.pos_response.embed_dim = 64
        feat_config.neg_response = WordFeatConfig()
        feat_config.query = WordFeatConfig()
        return QueryDocumentPairwiseRankingModel_Deprecated.from_config(
            model_config, feat_config, metadata)
Example #7
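A DocModel Config whose ModelInput adds a raw-text meta field for the metric reporter alongside the token and label tensorizers.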
    class Config(DocModel.Config):
        class ModelInput(Model.Config.ModelInput):
            tokens: WordTensorizer.Config = WordTensorizer.Config()
            labels: LabelTensorizer.Config = LabelTensorizer.Config(
                allow_unknown=True)
            # for metric reporter
            raw_text: MetaInput.Config = MetaInput.Config(column="text")

        inputs: ModelInput = ModelInput()
        embedding: WordFeatConfig = WordFeatConfig()
Example #8
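A (currently disabled) test that freeze=True on a WordFeatConfig stops gradient updates for the word embedding while its MLP layers and the dict-feature embedding remain trainable.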
    def DISABLED_test_freeze_word_embedding(self):
        model = create_model(
            DocModel.Config(),
            FeatureConfig(
                word_feat=WordFeatConfig(freeze=True, mlp_layer_dims=[4]),
                dict_feat=DictFeatConfig(),
            ),
            metadata=mock_metadata(),
        )
        # word embedding
        for param in model.embedding[0].word_embedding.parameters():
            self.assertFalse(param.requires_grad)
        for param in model.embedding[0].mlp.parameters():
            self.assertTrue(param.requires_grad)

        # dict feat embedding
        for param in model.embedding[1].parameters():
            self.assertTrue(param.requires_grad)
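Example #9
Verifies that tokens occurring fewer than min_freq=2 times are replaced by UNK placeholders in the vocabulary.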
 def test_min_freq(self):
     """
     Test that UNKification is triggered when min_freq is 2.
     """
     custom_dh = CompositionalDataHandler.from_config(
         CompositionalDataHandler.Config(),
         FeatureConfig(word_feat=WordFeatConfig(vocab_from_all_data=True,
                                                min_freq=2)),
         featurizer=SimpleFeaturizer.from_config(
             SimpleFeaturizer.Config(lowercase_tokens=True),
             FeatureConfig()),
     )
     custom_dh.init_metadata_from_raw_data(self.train_data, self.eval_data,
                                           self.test_data)
     # <unk>-NUM = <unk> for numeric tokens
     self.assertSetEqual(
         set(custom_dh.features["word_feat"].vocab.stoi),
         {"<unk>", "<unk>-NUM", "<unk>", "<unk>", "events"},
     )
Example #10
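Test fixture: builds small train/eval/test frames with slot annotations and seqlogical strings, plus a CompositionalDataHandler whose vocabulary is drawn from all of the data.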
    def setUp(self):
        self.train_data = [
            {
                DFColumn.DOC_LABEL: "IN:GET_EVENT",
                DFColumn.WORD_LABEL: [
                    {
                        "id": "SL:DATE_TIME",
                        "span": {"start": 21, "end": 26},
                        "text": "today",
                    }
                ],
                DFColumn.UTTERANCE: "What EVENTS can I go today",
                DFColumn.DICT_FEAT: "",
                DFColumn.SEQLOGICAL: "[IN:GET_EVENT What EVENTS can I go [SL:DATE_TIME today ] ]",
            }
        ]

        self.eval_data = [
            {
                DFColumn.DOC_LABEL: "IN:GET_EVENT",
                DFColumn.WORD_LABEL: [
                    {
                        "id": "SL:ATTRIBUTE_EVENT",
                        "span": {"start": 14, "end": 19},
                        "text": "adult",
                    },
                    {
                        "id": "SL:DATE_TIME",
                        "span": {"start": 27, "end": 39},
                        "text": "this weekend",
                    },
                ],
                DFColumn.UTTERANCE: "Are there any adult events this weekend",
                DFColumn.DICT_FEAT: "",
                DFColumn.SEQLOGICAL: "[IN:GET_EVENT Are there any [SL:ATTRIBUTE_EVENT adult ] events [SL:DATE_TIME this weekend ] ]",
            }
        ]

        self.test_data = [
            {
                DFColumn.DOC_LABEL: "IN:GET_INFO_ROAD_CONDITION",
                DFColumn.WORD_LABEL: [
                    {
                        "id": "SL:ROAD_CONDITION",
                        "span": {"start": 9, "end": 21},
                        "text": "any flooding",
                    },
                    {
                        "id": "SL:DESTINATION",
                        "span": {"start": 36, "end": 41},
                        "text": "Karen",
                        "subframe": {
                            "utterance": "Karen",
                            "domain": "",
                            "intent": "IN:GET_LOCATION_HOME",
                            "slots": [
                                {
                                    "id": "SL:CONTACT",
                                    "span": {"start": 0, "end": 5},
                                    "text": "Karen",
                                }
                            ],
                            "span": {"start": 0, "end": 5},
                        },
                    },
                ],
                DFColumn.UTTERANCE: "Is there any flooding on the way to Karen's?",
                DFColumn.DICT_FEAT: "",
                DFColumn.SEQLOGICAL: "[IN:GET_INFO_ROAD_CONDITION Is there [SL:ROAD_CONDITION any flooding ] on the way to [SL:DESTINATION [IN:GET_LOCATION_HOME [SL:CONTACT Karen 's ? ] ] ] ]",
            }
        ]

        self.dh = CompositionalDataHandler.from_config(
            CompositionalDataHandler.Config(),
            FeatureConfig(word_feat=WordFeatConfig(vocab_from_all_data=True, min_freq=1)),
            featurizer=SimpleFeaturizer.from_config(
                SimpleFeaturizer.Config(lowercase_tokens=True), FeatureConfig()
            ),
        )
Example #11
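Exercises init_feature_metadata with three vocabulary sources (all data; train data plus pretrained embeddings; a size-limited pretrained-embedding vocabulary) and checks the resulting tokens and the size of the pretrained embedding matrix.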
    def test_init_feature_metadata(self):
        # Specify data
        feat_name = ModelInput.WORD_FEAT
        train_text = "Hi there you"
        eval_text = ""
        test_text = "Go away"
        pretrained_embedding_file = tests_module.test_file("pretrained_embed_raw")
        pretrained_tokens = {
            "</s>",
            "the",
            "to",
            "and",
            "a",
            "I",
            "you",
            "is",
            "aloha",
            "for",
        }

        # Specify test cases
        test_cases = (
            # Vocab from train / eval / test data
            {
                "feat": WordFeatConfig(
                    vocab_from_all_data=True,
                    vocab_from_train_data=False,
                    vocab_from_pretrained_embeddings=False,
                ),
                "expected_tokens": {
                    "hi",
                    "there",
                    "you",
                    "go",
                    "away",
                    VocabMeta.UNK_TOKEN,
                    VocabMeta.PAD_TOKEN,
                },
                "expected_num_pretrained_tokens": 0,
            },
            # Vocab from train data or pretrained embeddings
            {
                "feat": WordFeatConfig(
                    vocab_from_all_data=False,
                    vocab_from_train_data=True,
                    vocab_from_pretrained_embeddings=True,
                    pretrained_embeddings_path=pretrained_embedding_file,
                    embed_dim=5,
                ),
                "expected_tokens": pretrained_tokens.union(
                    {"hi", "there", VocabMeta.UNK_TOKEN, VocabMeta.PAD_TOKEN}
                ),
                "expected_num_pretrained_tokens": len(pretrained_tokens) + 4,
            },
            # Vocab from limited number of pretrained embeddings
            {
                "feat": WordFeatConfig(
                    vocab_from_all_data=False,
                    vocab_from_train_data=False,
                    vocab_from_pretrained_embeddings=True,
                    pretrained_embeddings_path=pretrained_embedding_file,
                    embed_dim=5,
                    vocab_size=2,
                ),
                "expected_tokens": {
                    "</s>",
                    "the",
                    VocabMeta.UNK_TOKEN,
                    VocabMeta.PAD_TOKEN,
                },
                # special tokens excluded from vocab_size = 2
                "expected_num_pretrained_tokens": 4,
            },
        )

        for case in test_cases:
            # Setup data handler
            featurizer = create_featurizer(
                SimpleFeaturizer.Config(), FeatureConfig(word_feat=case["feat"])
            )
            data_handler = DocClassificationDataHandler.from_config(
                DocClassificationDataHandler.Config(),
                ModelInputConfig(word_feat=case["feat"]),
                TargetConfig(),
                featurizer=featurizer,
            )
            train_data = data_handler.gen_dataset(
                [{"text": train_text}], include_label_fields=False
            )
            eval_data = data_handler.gen_dataset(
                [{"text": eval_text}], include_label_fields=False
            )
            test_data = data_handler.gen_dataset(
                [{"text": test_text}], include_label_fields=False
            )
            data_handler.init_feature_metadata(train_data, eval_data, test_data)

            # Check created vocab
            meta = data_handler.metadata.features[feat_name]
            self.assertEqual(set(meta.vocab.stoi.keys()), case["expected_tokens"])
            if case["expected_num_pretrained_tokens"] == 0:
                self.assertIsNone(meta.pretrained_embeds_weight)
            else:
                self.assertEqual(
                    meta.pretrained_embeds_weight.size(0),
                    case["expected_num_pretrained_tokens"],
                )
Example #12
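A ModelInputConfig whose sequence word feature keeps every token seen at least once (min_freq=1).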
class ModelInputConfig(ModuleConfig):
    seq_word_feat: WordFeatConfig = WordFeatConfig(min_freq=1)
Example #13
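A Config that mixes Model.Config and doc_model.DocModel.Config, mapping token and label tensorizers in inputs and using a WordFeatConfig embedding.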
 class Config(Model.Config, doc_model.DocModel.Config):
     inputs: Dict[str, Tensorizer.Config] = {
         "tokens": WordTensorizer.Config(),
         "labels": LabelTensorizer.Config(),
     }
     embedding: WordFeatConfig = WordFeatConfig()
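Taken together, the examples share one pattern: wrap a WordFeatConfig in a FeatureConfig (or a model-specific input config) and hand it to a from_config factory. The sketch below restates that pattern in isolation; the import path and the illustrative values are assumptions (verify against your PyText version), and only fields that already appear in the examples above are set.

# Minimal sketch of the shared pattern; not taken from any example above.
# Assumption: WordFeatConfig and FeatureConfig are importable from
# pytext.config.field_config; check the PyText version you use.
from pytext.config.field_config import FeatureConfig, WordFeatConfig

feature_config = FeatureConfig(
    word_feat=WordFeatConfig(
        embed_dim=100,               # size of each word embedding vector
        freeze=False,                # True would stop gradient updates (see Example #8)
        vocab_from_train_data=True,  # build the vocabulary from the training split
        min_freq=1,                  # keep every token seen at least once
    )
)
# feature_config is then passed to a data handler or model factory via
# from_config(...), as the examples above do with JointModelDataHandler,
# DocClassificationDataHandler, CompositionalDataHandler, and create_model.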