def test_uppercase_tokens(self):
    """
    Check that token casing is preserved when lowercase_tokens is False.

    A dedicated data handler is built with a case-preserving featurizer,
    its metadata is initialised from the fixture rows, and the resulting
    word vocabulary is compared against the original-cased tokens.
    """
    handler_config = CompositionalDataHandler.Config()
    feature_config = FeatureConfig(
        word_feat=WordFeatConfig(vocab_from_all_data=True, min_freq=1)
    )
    case_preserving_featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(lowercase_tokens=False), FeatureConfig()
    )
    custom_dh = CompositionalDataHandler.from_config(
        handler_config,
        feature_config,
        featurizer=case_preserving_featurizer,
    )
    custom_dh.init_metadata_from_raw_data(
        self.train_data, self.eval_data, self.test_data
    )

    actual_vocab = set(custom_dh.features["word_feat"].vocab.stoi)
    # "EVENTS" and "events" must both be present: with lowercasing off they
    # are distinct vocabulary entries.
    expected_vocab = {
        "<unk>",
        "What", "EVENTS", "can", "I", "go", "today",
        "Are", "there", "any", "adult", "events", "this", "weekend",
    }
    self.assertSetEqual(actual_vocab, expected_vocab)
class Config(Task.Config):
    """
    Component configuration with defaults for each part of the task.

    Every field except ``labels`` defaults to the default-constructed
    config of its component; ``labels`` is optional and defaults to None.
    """

    # Parser model configuration.
    model: RNNGParser.Config = RNNGParser.Config()
    # Trainer configuration.
    trainer: HogwildTrainer.Config = HogwildTrainer.Config()
    # Data handler configuration.
    data_handler: CompositionalDataHandler.Config = (
        CompositionalDataHandler.Config()
    )
    # Optional word-label configuration; unset by default.
    labels: Optional[WordLabelConfig] = None
    # Metric reporter configuration.
    metric_reporter: CompositionalMetricReporter.Config = (
        CompositionalMetricReporter.Config()
    )
def test_min_freq(self):
    """
    Test that UNKification is triggered when min_freq is 2.

    With lowercasing enabled, "events" is the only token that occurs in
    both the train utterance (as "EVENTS") and the eval utterance, so it
    is the only real word expected to remain in the vocabulary.
    """
    custom_dh = CompositionalDataHandler.from_config(
        CompositionalDataHandler.Config(),
        FeatureConfig(
            word_feat=WordFeatConfig(vocab_from_all_data=True, min_freq=2)
        ),
        featurizer=SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(lowercase_tokens=True), FeatureConfig()
        ),
    )
    custom_dh.init_metadata_from_raw_data(
        self.train_data, self.eval_data, self.test_data
    )
    # NOTE(review): "there" and "any" also occur in the test utterance but
    # are not expected here -- presumably test rows do not contribute to
    # the frequency counts; confirm against the data handler.
    # <unk>-NUM = <unk> for numeric tokens
    # Each member is listed once: repeated "<unk>" entries in a set literal
    # collapse to a single element and only obscure the expectation.
    self.assertSetEqual(
        set(custom_dh.features["word_feat"].vocab.stoi),
        {"<unk>", "<unk>-NUM", "events"},
    )
def setUp(self):
    """
    Create the raw train/eval/test fixture rows and a default data handler.

    Each split is a one-element list holding a dict keyed by DFColumn
    members. ``self.dh`` is a CompositionalDataHandler whose featurizer
    lower-cases tokens and whose word feature uses min_freq=1.
    """

    def char_span(start, end):
        # Character-offset span as stored in the word-label annotations.
        return {"start": start, "end": end}

    def slot(label, start, end, text):
        # One slot annotation; key order matches the fixture format.
        return {"id": label, "span": char_span(start, end), "text": text}

    self.train_data = [
        {
            DFColumn.DOC_LABEL: "IN:GET_EVENT",
            DFColumn.WORD_LABEL: [slot("SL:DATE_TIME", 21, 26, "today")],
            DFColumn.UTTERANCE: "What EVENTS can I go today",
            DFColumn.DICT_FEAT: "",
            DFColumn.SEQLOGICAL: "[IN:GET_EVENT What EVENTS can I go [SL:DATE_TIME today ] ]",
        }
    ]

    self.eval_data = [
        {
            DFColumn.DOC_LABEL: "IN:GET_EVENT",
            DFColumn.WORD_LABEL: [
                slot("SL:ATTRIBUTE_EVENT", 14, 19, "adult"),
                slot("SL:DATE_TIME", 27, 39, "this weekend"),
            ],
            DFColumn.UTTERANCE: "Are there any adult events this weekend",
            DFColumn.DICT_FEAT: "",
            DFColumn.SEQLOGICAL: "[IN:GET_EVENT Are there any [SL:ATTRIBUTE_EVENT adult ] events [SL:DATE_TIME this weekend ] ]",
        }
    ]

    # The destination slot carries a nested frame (an intent inside a slot).
    destination_slot = slot("SL:DESTINATION", 36, 41, "Karen")
    destination_slot["subframe"] = {
        "utterance": "Karen",
        "domain": "",
        "intent": "IN:GET_LOCATION_HOME",
        "slots": [slot("SL:CONTACT", 0, 5, "Karen")],
        "span": char_span(0, 5),
    }
    self.test_data = [
        {
            DFColumn.DOC_LABEL: "IN:GET_INFO_ROAD_CONDITION",
            DFColumn.WORD_LABEL: [
                slot("SL:ROAD_CONDITION", 9, 21, "any flooding"),
                destination_slot,
            ],
            DFColumn.UTTERANCE: "Is there any flooding on the way to Karen's?",
            DFColumn.DICT_FEAT: "",
            DFColumn.SEQLOGICAL: "[IN:GET_INFO_ROAD_CONDITION Is there [SL:ROAD_CONDITION any flooding ] on the way to [SL:DESTINATION [IN:GET_LOCATION_HOME [SL:CONTACT Karen 's ? ] ] ] ]",
        }
    ]

    handler_config = CompositionalDataHandler.Config()
    feature_config = FeatureConfig(
        word_feat=WordFeatConfig(vocab_from_all_data=True, min_freq=1)
    )
    lowercasing_featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(lowercase_tokens=True), FeatureConfig()
    )
    self.dh = CompositionalDataHandler.from_config(
        handler_config,
        feature_config,
        featurizer=lowercasing_featurizer,
    )