def test_tokenize(self):
    """Tokens are lower-cased by default."""
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(), FeatureConfig()
    )
    tokens = featurizer.featurize(InputRecord(raw_text="At eight o'clock")).tokens
    self.assertEqual(["at", "eight", "o'clock"], tokens)
def test_uppercase_tokens(self):
    """Test that the text is not lower-cased when lowercase_tokens is False."""
    custom_dh = CompositionalDataHandler.from_config(
        CompositionalDataHandler.Config(),
        FeatureConfig(
            word_feat=WordFeatConfig(vocab_from_all_data=True, min_freq=1)
        ),
        featurizer=SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(lowercase_tokens=False), FeatureConfig()
        ),
    )
    custom_dh.init_metadata_from_raw_data(
        self.train_data, self.eval_data, self.test_data
    )
    self.assertSetEqual(
        set(custom_dh.features["word_feat"].vocab.stoi),
        {
            "<unk>",
            "<unk>-NUM",
            "What",
            "EVENTS",
            "can",
            "I",
            "go",
            "to",
            "at",
            "pm",
            "today",
            "Are",
            "there",
            "any",
            "fun",
            "events",
            "this",
            "weekend",
        },
    )
def create_language_model_data_handler(cls) -> LanguageModelDataHandler:
    # TODO: Refactor this after Shicong refactors PyText config and removes
    # Thrift. After that, directly use the data handler's from_config method
    # with synthetic configs.
    columns = [DFColumn.UTTERANCE]
    features: Dict[str, Field] = {
        DatasetFieldName.TEXT_FIELD: TextFeatureField(
            eos_token=VocabMeta.EOS_TOKEN, init_token=VocabMeta.INIT_TOKEN
        )
    }
    return LanguageModelDataHandler(
        raw_columns=columns,
        features=features,
        labels={},
        featurizer=create_featurizer(SimpleFeaturizer.Config(), FeatureConfig()),
    )
class Config(ConfigBase):
    features: FeatureConfig = FeatureConfig()
    featurizer: Featurizer.Config = SimpleFeaturizer.Config()
    data_handler: DataHandler.Config
    trainer: Trainer.Config = Trainer.Config()
    exporter: Optional[ModelExporter.Config] = None
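
# A minimal usage sketch (not part of the original source): `data_handler` has no
# default, so a concrete DataHandler.Config must be supplied when the task config
# is built. CompositionalDataHandler.Config is used here purely as an example of
# one such concrete config, and `example_task_config` is a hypothetical name.
example_task_config = Config(
    data_handler=CompositionalDataHandler.Config(),
    featurizer=SimpleFeaturizer.Config(lowercase_tokens=True),
)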
def setUp(self):
    self.train_data = [
        {
            DFColumn.DOC_LABEL: "IN:GET_EVENT",
            DFColumn.WORD_LABEL: [
                {
                    "id": "SL:DATE_TIME",
                    "span": {"start": 21, "end": 26},
                    "text": "today",
                }
            ],
            DFColumn.UTTERANCE: "What EVENTS can I go today",
            DFColumn.DICT_FEAT: "",
            DFColumn.SEQLOGICAL: "[IN:GET_EVENT What EVENTS can I go [SL:DATE_TIME today ] ]",
        }
    ]
    self.eval_data = [
        {
            DFColumn.DOC_LABEL: "IN:GET_EVENT",
            DFColumn.WORD_LABEL: [
                {
                    "id": "SL:ATTRIBUTE_EVENT",
                    "span": {"start": 14, "end": 19},
                    "text": "adult",
                },
                {
                    "id": "SL:DATE_TIME",
                    "span": {"start": 27, "end": 39},
                    "text": "this weekend",
                },
            ],
            DFColumn.UTTERANCE: "Are there any adult events this weekend",
            DFColumn.DICT_FEAT: "",
            DFColumn.SEQLOGICAL: "[IN:GET_EVENT Are there any [SL:ATTRIBUTE_EVENT adult ] events [SL:DATE_TIME this weekend ] ]",
        }
    ]
    self.test_data = [
        {
            DFColumn.DOC_LABEL: "IN:GET_INFO_ROAD_CONDITION",
            DFColumn.WORD_LABEL: [
                {
                    "id": "SL:ROAD_CONDITION",
                    "span": {"start": 9, "end": 21},
                    "text": "any flooding",
                },
                {
                    "id": "SL:DESTINATION",
                    "span": {"start": 36, "end": 41},
                    "text": "Karen",
                    "subframe": {
                        "utterance": "Karen",
                        "domain": "",
                        "intent": "IN:GET_LOCATION_HOME",
                        "slots": [
                            {
                                "id": "SL:CONTACT",
                                "span": {"start": 0, "end": 5},
                                "text": "Karen",
                            }
                        ],
                        "span": {"start": 0, "end": 5},
                    },
                },
            ],
            DFColumn.UTTERANCE: "Is there any flooding on the way to Karen's?",
            DFColumn.DICT_FEAT: "",
            DFColumn.SEQLOGICAL: "[IN:GET_INFO_ROAD_CONDITION Is there [SL:ROAD_CONDITION any flooding ] on the way to [SL:DESTINATION [IN:GET_LOCATION_HOME [SL:CONTACT Karen 's ? ] ] ] ]",
        }
    ]
    self.dh = CompositionalDataHandler.from_config(
        CompositionalDataHandler.Config(),
        FeatureConfig(
            word_feat=WordFeatConfig(vocab_from_all_data=True, min_freq=1)
        ),
        featurizer=SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(lowercase_tokens=True), FeatureConfig()
        ),
    )
def test_init_feature_metadata(self):
    # Specify data
    feat_name = ModelInput.WORD_FEAT
    train_text = "Hi there you"
    eval_text = ""
    test_text = "Go away"
    pretrained_embedding_file = tests_module.test_file("pretrained_embed_raw")
    pretrained_tokens = {
        "</s>",
        "the",
        "to",
        "and",
        "a",
        "I",
        "you",
        "is",
        "aloha",
        "for",
    }

    # Specify test cases
    test_cases = (
        # Vocab from train / eval / test data
        {
            "feat": WordFeatConfig(
                vocab_from_all_data=True,
                vocab_from_train_data=False,
                vocab_from_pretrained_embeddings=False,
            ),
            "expected_tokens": {
                "hi",
                "there",
                "you",
                "go",
                "away",
                VocabMeta.UNK_TOKEN,
                VocabMeta.PAD_TOKEN,
            },
            "expected_num_pretrained_tokens": 0,
        },
        # Vocab from train data or pretrained embeddings
        {
            "feat": WordFeatConfig(
                vocab_from_all_data=False,
                vocab_from_train_data=True,
                vocab_from_pretrained_embeddings=True,
                pretrained_embeddings_path=pretrained_embedding_file,
                embed_dim=5,
            ),
            "expected_tokens": pretrained_tokens.union(
                {"hi", "there", VocabMeta.UNK_TOKEN, VocabMeta.PAD_TOKEN}
            ),
            "expected_num_pretrained_tokens": len(pretrained_tokens) + 4,
        },
        # Vocab from limited number of pretrained embeddings
        {
            "feat": WordFeatConfig(
                vocab_from_all_data=False,
                vocab_from_train_data=False,
                vocab_from_pretrained_embeddings=True,
                pretrained_embeddings_path=pretrained_embedding_file,
                embed_dim=5,
                vocab_size=2,
            ),
            "expected_tokens": {
                "</s>",
                "the",
                VocabMeta.UNK_TOKEN,
                VocabMeta.PAD_TOKEN,
            },
            # special tokens excluded from vocab_size = 2
            "expected_num_pretrained_tokens": 4,
        },
    )

    for case in test_cases:
        # Setup data handler
        featurizer = create_featurizer(
            SimpleFeaturizer.Config(), FeatureConfig(word_feat=case["feat"])
        )
        data_handler = DocClassificationDataHandler.from_config(
            DocClassificationDataHandler.Config(),
            ModelInputConfig(word_feat=case["feat"]),
            TargetConfig(),
            featurizer=featurizer,
        )
        train_data = data_handler.gen_dataset(
            [{"text": train_text}], include_label_fields=False
        )
        eval_data = data_handler.gen_dataset(
            [{"text": eval_text}], include_label_fields=False
        )
        test_data = data_handler.gen_dataset(
            [{"text": test_text}], include_label_fields=False
        )
        data_handler.init_feature_metadata(train_data, eval_data, test_data)

        # Check created vocab
        meta = data_handler.metadata.features[feat_name]
        self.assertEqual(set(meta.vocab.stoi.keys()), case["expected_tokens"])
        if case["expected_num_pretrained_tokens"] == 0:
            self.assertIsNone(meta.pretrained_embeds_weight)
        else:
            self.assertEqual(
                meta.pretrained_embeds_weight.size(0),
                case["expected_num_pretrained_tokens"],
            )
def test_tokenize_dont_lowercase(self):
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(lowercase_tokens=False), FeatureConfig()
    )
    tokens = featurizer.featurize(InputRecord(raw_text=self.sentence)).tokens
    self.assertListEqual(tokens, ["Order", "me", "a", "coffee"])
def test_tokenize_add_sentence_markers(self):
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(sentence_markers=("<s>", "</s>")), FeatureConfig()
    )
    tokens = featurizer.featurize(InputRecord(raw_text=self.sentence)).tokens
    self.assertListEqual(tokens, ["<s>", "order", "me", "a", "coffee", "</s>"])
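
def test_tokenize_lowercase_with_sentence_markers(self):
    # Hypothetical extra test (not in the original file) combining the two
    # SimpleFeaturizer options exercised above, assuming they compose as the
    # previous tests suggest: tokens are lower-cased and then wrapped in the
    # configured sentence markers.
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(
            lowercase_tokens=True, sentence_markers=("<s>", "</s>")
        ),
        FeatureConfig(),
    )
    tokens = featurizer.featurize(InputRecord(raw_text=self.sentence)).tokens
    self.assertListEqual(tokens, ["<s>", "order", "me", "a", "coffee", "</s>"])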