Exemplo n.º 1
0
    def test_tokenize(self):
        """Featurizing raw text yields lowercased, whitespace-split tokens."""
        config = SimpleFeaturizer.Config()
        featurizer = SimpleFeaturizer.from_config(config, FeatureConfig())
        record = InputRecord(raw_text="At eight o'clock")
        result = featurizer.featurize(record)
        self.assertEqual(['at', 'eight', "o'clock"], result.tokens)
Exemplo n.º 2
0
 def test_uppercase_tokens(self):
     """
     Verify that token casing is preserved when lowercase_tokens is False.
     """
     featurizer = SimpleFeaturizer.from_config(
         SimpleFeaturizer.Config(lowercase_tokens=False), FeatureConfig()
     )
     data_handler = CompositionalDataHandler.from_config(
         CompositionalDataHandler.Config(),
         FeatureConfig(
             word_feat=WordFeatConfig(vocab_from_all_data=True, min_freq=1)
         ),
         featurizer=featurizer,
     )
     data_handler.init_metadata_from_raw_data(
         self.train_data, self.eval_data, self.test_data
     )
     # Mixed-case tokens ("What", "EVENTS", "Are", "I") must survive intact.
     expected_vocab = {
         "<unk>",
         "<unk>-NUM",
         "What",
         "EVENTS",
         "can",
         "I",
         "go",
         "to",
         "at",
         "pm",
         "today",
         "Are",
         "there",
         "any",
         "fun",
         "events",
         "this",
         "weekend",
     }
     self.assertSetEqual(
         set(data_handler.features["word_feat"].vocab.stoi), expected_vocab
     )
Exemplo n.º 3
0
    def create_language_model_data_handler(cls) -> LanguageModelDataHandler:
        """Build a LanguageModelDataHandler over the utterance column.

        The single text feature field carries explicit init/EOS tokens.

        TODO: Refactor this after Shicong refactors PyText config and removes
        Thrift; afterwards use the data handler's from_config method directly
        with synthetic configs.
        """
        text_field = TextFeatureField(
            eos_token=VocabMeta.EOS_TOKEN, init_token=VocabMeta.INIT_TOKEN
        )
        features: Dict[str, Field] = {DatasetFieldName.TEXT_FIELD: text_field}
        return LanguageModelDataHandler(
            raw_columns=[DFColumn.UTTERANCE],
            features=features,
            labels={},
            featurizer=create_featurizer(SimpleFeaturizer.Config(), FeatureConfig()),
        )
Exemplo n.º 4
0
 class Config(ConfigBase):
     # Input feature configuration; defaults to a plain FeatureConfig.
     features: FeatureConfig = FeatureConfig()
     # Featurizer used to preprocess raw input; defaults to SimpleFeaturizer.
     featurizer: Featurizer.Config = SimpleFeaturizer.Config()
     # Data handler configuration; no default, so callers must supply one.
     data_handler: DataHandler.Config
     # Training loop configuration.
     trainer: Trainer.Config = Trainer.Config()
     # Optional model exporter configuration; None means no export.
     exporter: Optional[ModelExporter.Config] = None
Exemplo n.º 5
0
    def setUp(self):
        """Build one-frame train/eval/test fixtures plus a default data handler.

        Each record carries the raw utterance and its seqlogical (bracketed
        intent/slot) annotation; slot "span" values are character offsets
        into the utterance.
        """
        # Training split: single intent with one slot ("today").
        self.train_data = [{
            DFColumn.DOC_LABEL:
            "IN:GET_EVENT",
            DFColumn.WORD_LABEL: [{
                "id": "SL:DATE_TIME",
                "span": {
                    "start": 21,
                    "end": 26
                },
                "text": "today",
            }],
            DFColumn.UTTERANCE:
            "What EVENTS can I go today",
            DFColumn.DICT_FEAT:
            "",
            DFColumn.SEQLOGICAL:
            "[IN:GET_EVENT What EVENTS can I go [SL:DATE_TIME today ] ]",
        }]

        # Eval split: one intent with two slots.
        self.eval_data = [{
            DFColumn.DOC_LABEL:
            "IN:GET_EVENT",
            DFColumn.WORD_LABEL: [
                {
                    "id": "SL:ATTRIBUTE_EVENT",
                    "span": {
                        "start": 14,
                        "end": 19
                    },
                    "text": "adult",
                },
                {
                    "id": "SL:DATE_TIME",
                    "span": {
                        "start": 27,
                        "end": 39
                    },
                    "text": "this weekend",
                },
            ],
            DFColumn.UTTERANCE:
            "Are there any adult events this weekend",
            DFColumn.DICT_FEAT:
            "",
            DFColumn.SEQLOGICAL:
            "[IN:GET_EVENT Are there any [SL:ATTRIBUTE_EVENT adult ] events [SL:DATE_TIME this weekend ] ]",
        }]

        # Test split: compositional example — the destination slot contains a
        # nested "subframe" with its own intent, slot, and span offsets
        # (relative to the subframe utterance, not the outer one).
        self.test_data = [{
            DFColumn.DOC_LABEL:
            "IN:GET_INFO_ROAD_CONDITION",
            DFColumn.WORD_LABEL: [
                {
                    "id": "SL:ROAD_CONDITION",
                    "span": {
                        "start": 9,
                        "end": 21
                    },
                    "text": "any flooding",
                },
                {
                    "id": "SL:DESTINATION",
                    "span": {
                        "start": 36,
                        "end": 41
                    },
                    "text": "Karen",
                    "subframe": {
                        "utterance":
                        "Karen",
                        "domain":
                        "",
                        "intent":
                        "IN:GET_LOCATION_HOME",
                        "slots": [{
                            "id": "SL:CONTACT",
                            "span": {
                                "start": 0,
                                "end": 5
                            },
                            "text": "Karen",
                        }],
                        "span": {
                            "start": 0,
                            "end": 5
                        },
                    },
                },
            ],
            DFColumn.UTTERANCE:
            "Is there any flooding on the way to Karen's?",
            DFColumn.DICT_FEAT:
            "",
            DFColumn.SEQLOGICAL:
            "[IN:GET_INFO_ROAD_CONDITION Is there [SL:ROAD_CONDITION any flooding ] on the way to [SL:DESTINATION [IN:GET_LOCATION_HOME [SL:CONTACT Karen 's ? ] ] ] ]",
        }]

        # Default handler for these tests: vocab built from all three splits,
        # minimum frequency of 1, tokens lowercased.
        self.dh = CompositionalDataHandler.from_config(
            CompositionalDataHandler.Config(),
            FeatureConfig(word_feat=WordFeatConfig(vocab_from_all_data=True,
                                                   min_freq=1)),
            featurizer=SimpleFeaturizer.from_config(
                SimpleFeaturizer.Config(lowercase_tokens=True),
                FeatureConfig()),
        )
Exemplo n.º 6
0
    def test_init_feature_metadata(self):
        """Vocab construction honors the vocab_from_* flags on WordFeatConfig.

        Runs three configurations through a DocClassificationDataHandler:
        vocab from all splits, vocab from train data merged with pretrained
        embeddings, and vocab limited to ``vocab_size`` pretrained tokens.
        For each case, checks the resulting vocab and, when pretrained
        embeddings are used, the pretrained-weight row count.
        """
        # Specify data
        feat_name = ModelInput.WORD_FEAT
        train_text = "Hi there you"
        eval_text = ""
        test_text = "Go away"
        pretrained_embedding_file = tests_module.test_file("pretrained_embed_raw")
        # Tokens present in the pretrained embedding fixture file.
        pretrained_tokens = {
            "</s>",
            "the",
            "to",
            "and",
            "a",
            "I",
            "you",
            "is",
            "aloha",
            "for",
        }

        # Specify test cases
        test_cases = (
            # Vocab from train / eval / test data
            {
                "feat": WordFeatConfig(
                    vocab_from_all_data=True,
                    vocab_from_train_data=False,
                    vocab_from_pretrained_embeddings=False,
                ),
                "expected_tokens": {
                    "hi",
                    "there",
                    "you",
                    "go",
                    "away",
                    VocabMeta.UNK_TOKEN,
                    VocabMeta.PAD_TOKEN,
                },
                "expected_num_pretrained_tokens": 0,
            },
            # Vocab from train data or pretrained embeddings
            {
                "feat": WordFeatConfig(
                    vocab_from_all_data=False,
                    vocab_from_train_data=True,
                    vocab_from_pretrained_embeddings=True,
                    pretrained_embeddings_path=pretrained_embedding_file,
                    embed_dim=5,
                ),
                "expected_tokens": pretrained_tokens.union(
                    {"hi", "there", VocabMeta.UNK_TOKEN, VocabMeta.PAD_TOKEN}
                ),
                "expected_num_pretrained_tokens": len(pretrained_tokens) + 4,
            },
            # Vocab from limited number of pretrained embeddings
            {
                "feat": WordFeatConfig(
                    vocab_from_all_data=False,
                    vocab_from_train_data=False,
                    vocab_from_pretrained_embeddings=True,
                    pretrained_embeddings_path=pretrained_embedding_file,
                    embed_dim=5,
                    vocab_size=2,
                ),
                "expected_tokens": {
                    "</s>",
                    "the",
                    VocabMeta.UNK_TOKEN,
                    VocabMeta.PAD_TOKEN,
                },
                # special tokens excluded from vocab_size = 2
                "expected_num_pretrained_tokens": 4,
            },
        )

        for case in test_cases:
            # Setup data handler
            featurizer = create_featurizer(
                SimpleFeaturizer.Config(), FeatureConfig(word_feat=case["feat"])
            )
            data_handler = DocClassificationDataHandler.from_config(
                DocClassificationDataHandler.Config(),
                ModelInputConfig(word_feat=case["feat"]),
                TargetConfig(),
                featurizer=featurizer,
            )
            # Labels are not needed for vocab building, so they are skipped.
            train_data = data_handler.gen_dataset(
                [{"text": train_text}], include_label_fields=False
            )
            eval_data = data_handler.gen_dataset(
                [{"text": eval_text}], include_label_fields=False
            )
            test_data = data_handler.gen_dataset(
                [{"text": test_text}], include_label_fields=False
            )
            data_handler.init_feature_metadata(train_data, eval_data, test_data)

            # Check created vocab
            meta = data_handler.metadata.features[feat_name]
            self.assertEqual(set(meta.vocab.stoi.keys()), case["expected_tokens"])
            if case["expected_num_pretrained_tokens"] == 0:
                # No pretrained embeddings requested -> no weight tensor at all.
                self.assertIsNone(meta.pretrained_embeds_weight)
            else:
                self.assertEqual(
                    meta.pretrained_embeds_weight.size(0),
                    case["expected_num_pretrained_tokens"],
                )
 def test_tokenize_dont_lowercase(self):
     """Tokens keep their original casing when lowercase_tokens is False."""
     featurizer = SimpleFeaturizer.from_config(
         SimpleFeaturizer.Config(lowercase_tokens=False), FeatureConfig()
     )
     record = InputRecord(raw_text=self.sentence)
     self.assertListEqual(
         featurizer.featurize(record).tokens, ["Order", "me", "a", "coffee"]
     )
Exemplo n.º 8
0
 def test_tokenize_add_sentence_markers(self):
     """Configured sentence markers are added around the token sequence."""
     config = SimpleFeaturizer.Config(sentence_markers=("<s>", "</s>"))
     featurizer = SimpleFeaturizer.from_config(config, FeatureConfig())
     result = featurizer.featurize(InputRecord(raw_text=self.sentence))
     self.assertListEqual(
         result.tokens, ["<s>", "order", "me", "a", "coffee", "</s>"]
     )