示例#1
0
 def test_uppercase_tokens(self):
     """
     Verify that token casing is kept in the word vocabulary when the
     featurizer is configured with lowercase_tokens=False.

     Both "EVENTS" and "events" are expected as distinct vocab entries.
     """
     handler_config = CompositionalDataHandler.Config()
     feature_config = FeatureConfig(
         word_feat=WordFeatConfig(vocab_from_all_data=True, min_freq=1)
     )
     case_preserving_featurizer = SimpleFeaturizer.from_config(
         SimpleFeaturizer.Config(lowercase_tokens=False), FeatureConfig()
     )
     custom_dh = CompositionalDataHandler.from_config(
         handler_config,
         feature_config,
         featurizer=case_preserving_featurizer,
     )
     custom_dh.init_metadata_from_raw_data(
         self.train_data, self.eval_data, self.test_data
     )

     actual_vocab = set(custom_dh.features["word_feat"].vocab.stoi)
     # Tokens keep the casing they have in the fixture utterances.
     expected_vocab = {
         "<unk>",
         "What", "EVENTS", "can", "I", "go", "today",
         "Are", "there", "any", "adult", "events", "this", "weekend",
     }
     self.assertSetEqual(actual_vocab, expected_vocab)
示例#2
0
文件: tasks.py 项目: LinHR000/pytext
 class Config(Task.Config):
     """
     Task configuration: default sub-configs for the RNNG parser model,
     the Hogwild trainer, the compositional data handler and the
     compositional metric reporter. ``labels`` is optional and unset by
     default.
     """

     model: RNNGParser.Config = RNNGParser.Config()
     trainer: HogwildTrainer.Config = HogwildTrainer.Config()
     data_handler: CompositionalDataHandler.Config = (
         CompositionalDataHandler.Config()
     )
     labels: Optional[WordLabelConfig] = None
     metric_reporter: CompositionalMetricReporter.Config = (
         CompositionalMetricReporter.Config()
     )
 def test_min_freq(self):
     """
     Test that UNKification is triggered when min_freq is 2.

     The handler is built with lowercase_tokens=True and min_freq=2, and
     the resulting word vocabulary is expected to contain only the unknown
     markers plus "events".
     """
     custom_dh = CompositionalDataHandler.from_config(
         CompositionalDataHandler.Config(),
         FeatureConfig(
             word_feat=WordFeatConfig(vocab_from_all_data=True, min_freq=2)
         ),
         featurizer=SimpleFeaturizer.from_config(
             SimpleFeaturizer.Config(lowercase_tokens=True), FeatureConfig()
         ),
     )
     custom_dh.init_metadata_from_raw_data(
         self.train_data, self.eval_data, self.test_data
     )
     # <unk>-NUM = <unk> for numeric tokens
     # Each expected entry is listed once: repeated members of a set
     # literal collapse to a single element and only obscure the
     # expectation.
     self.assertSetEqual(
         set(custom_dh.features["word_feat"].vocab.stoi),
         {"<unk>", "<unk>-NUM", "events"},
     )
示例#4
0
    def setUp(self):
        """
        Create the raw train/eval/test fixtures (one example each) and a
        default data handler configured with lowercase_tokens=True and
        min_freq=1.

        Slot spans are character offsets into the utterance with an
        exclusive end, e.g. 21-26 covers "today" in the train utterance.
        """
        # Train example: one date-time slot.
        train_example = {
            DFColumn.DOC_LABEL: "IN:GET_EVENT",
            DFColumn.WORD_LABEL: [
                {
                    "id": "SL:DATE_TIME",
                    "span": {"start": 21, "end": 26},
                    "text": "today",
                }
            ],
            DFColumn.UTTERANCE: "What EVENTS can I go today",
            DFColumn.DICT_FEAT: "",
            DFColumn.SEQLOGICAL: "[IN:GET_EVENT What EVENTS can I go [SL:DATE_TIME today ] ]",
        }
        self.train_data = [train_example]

        # Eval example: two sibling slots under the same intent.
        eval_example = {
            DFColumn.DOC_LABEL: "IN:GET_EVENT",
            DFColumn.WORD_LABEL: [
                {
                    "id": "SL:ATTRIBUTE_EVENT",
                    "span": {"start": 14, "end": 19},
                    "text": "adult",
                },
                {
                    "id": "SL:DATE_TIME",
                    "span": {"start": 27, "end": 39},
                    "text": "this weekend",
                },
            ],
            DFColumn.UTTERANCE: "Are there any adult events this weekend",
            DFColumn.DICT_FEAT: "",
            DFColumn.SEQLOGICAL: "[IN:GET_EVENT Are there any [SL:ATTRIBUTE_EVENT adult ] events [SL:DATE_TIME this weekend ] ]",
        }
        self.eval_data = [eval_example]

        # Test example: the destination slot carries a nested subframe
        # with its own intent and slot list.
        test_example = {
            DFColumn.DOC_LABEL: "IN:GET_INFO_ROAD_CONDITION",
            DFColumn.WORD_LABEL: [
                {
                    "id": "SL:ROAD_CONDITION",
                    "span": {"start": 9, "end": 21},
                    "text": "any flooding",
                },
                {
                    "id": "SL:DESTINATION",
                    "span": {"start": 36, "end": 41},
                    "text": "Karen",
                    "subframe": {
                        "utterance": "Karen",
                        "domain": "",
                        "intent": "IN:GET_LOCATION_HOME",
                        "slots": [
                            {
                                "id": "SL:CONTACT",
                                "span": {"start": 0, "end": 5},
                                "text": "Karen",
                            }
                        ],
                        "span": {"start": 0, "end": 5},
                    },
                },
            ],
            DFColumn.UTTERANCE: "Is there any flooding on the way to Karen's?",
            DFColumn.DICT_FEAT: "",
            DFColumn.SEQLOGICAL: "[IN:GET_INFO_ROAD_CONDITION Is there [SL:ROAD_CONDITION any flooding ] on the way to [SL:DESTINATION [IN:GET_LOCATION_HOME [SL:CONTACT Karen 's ? ] ] ] ]",
        }
        self.test_data = [test_example]

        # Default handler shared by tests that do not build their own.
        handler_config = CompositionalDataHandler.Config()
        feature_config = FeatureConfig(
            word_feat=WordFeatConfig(vocab_from_all_data=True, min_freq=1)
        )
        lowercasing_featurizer = SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(lowercase_tokens=True), FeatureConfig()
        )
        self.dh = CompositionalDataHandler.from_config(
            handler_config,
            feature_config,
            featurizer=lowercasing_featurizer,
        )