def _pretrained_embeds_for_strategy(self, init_strategy):
    """Build a joint-model data handler whose word feature uses the given
    embedding init strategy, init its metadata from the test files, and
    return the resulting pretrained-embedding weight tensor."""
    feature_config = FeatureConfig(
        word_feat=WordFeatConfig(
            embedding_init_strategy=init_strategy,
            embed_dim=5,
            pretrained_embeddings_path=tests_module.TEST_BASE_DIR,
        )
    )
    data_handler = JointModelDataHandler.from_config(
        JointModelDataHandler.Config(),
        feature_config,
        [DocLabelConfig(), WordLabelConfig()],
        featurizer=SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(), feature_config
        ),
    )
    data_handler.init_metadata_from_path(TRAIN_FILE, EVAL_FILE, TEST_FILE)
    return data_handler.metadata.features[
        DatasetFieldName.TEXT_FIELD
    ].pretrained_embeds_weight

def test_intializing_embeds_from_config(self):
    """Pretrained-embedding rows for out-of-vocab tokens should follow the
    configured init strategy: RANDOM gives non-zero values, ZERO gives zeros.

    NOTE(review): method name has a typo ("intializing"); kept as-is so any
    external references to the test name continue to resolve.
    """
    # RANDOM initialization: every component of row 11 should be non-zero.
    pretrained_embeds = self._pretrained_embeds_for_strategy(
        EmbedInitStrategy.RANDOM
    )
    np.testing.assert_array_less(
        [0, 0, 0, 0, 0], np.absolute(pretrained_embeds[11].numpy())
    )

    # ZERO initialization: the same row should be all zeros.
    pretrained_embeds = self._pretrained_embeds_for_strategy(
        EmbedInitStrategy.ZERO
    )
    np.testing.assert_array_equal([0, 0, 0, 0, 0], pretrained_embeds[11].numpy())
def example_config(cls):
    """Return a sample task config: joint doc/word labels on top of a
    deprecated single-model bagging doc ensemble."""
    ensemble_config = BaggingDocEnsemble_Deprecated.Config(
        models=[DocModel_Deprecated.Config()]
    )
    return cls.Config(
        labels=[DocLabelConfig(), WordLabelConfig()],
        model=ensemble_config,
    )
class Config(Task.Config):
    """Task configuration for word tagging, with defaults for every component."""

    # Sequence-tagging model architecture.
    model: WordTaggingModel.Config = WordTaggingModel.Config()
    # Training loop settings (epochs, optimizer wiring, etc. per Trainer defaults).
    trainer: Trainer.Config = Trainer.Config()
    # Word-level (per-token) label space configuration.
    labels: WordLabelConfig = WordLabelConfig()
    # Reads and batches the data for this task.
    data_handler: JointModelDataHandler.Config = JointModelDataHandler.Config()
    # Computes and reports word-tagging metrics.
    metric_reporter: WordTaggingMetricReporter.Config = (
        WordTaggingMetricReporter.Config()
    )
def setUp(self):
    """Create a joint-model data handler with all-default configs."""
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(), FeatureConfig()
    )
    self.data_handler = JointModelDataHandler.from_config(
        JointModelDataHandler.Config(),
        FeatureConfig(),
        [DocLabelConfig(), WordLabelConfig()],
        featurizer=featurizer,
    )
def _init_data_handler(self):
    """Build an unshuffled language-model data handler and initialize its
    metadata from the tiny test file."""
    featurizer = create_featurizer(SimpleFeaturizer.Config(), FeatureConfig())
    handler = LanguageModelDataHandler.from_config(
        LanguageModelDataHandler.Config(),
        FeatureConfig(),
        WordLabelConfig(),
        featurizer=featurizer,
        shuffle=False,
    )
    handler.init_metadata_from_path(FILE_NAME, FILE_NAME, FILE_NAME)
    return handler
def setUp(self):
    """Read the tiny contextual intent/slot training file through a
    contextual data handler built from default configs."""
    train_file = tests_module.test_file("contextual_intent_slot_train_tiny.tsv")
    featurizer = SimpleFeaturizer(SimpleFeaturizer.Config(), ModelInputConfig())
    self.dh = ContextualIntentSlotModelDataHandler.from_config(
        ContextualIntentSlotModelDataHandler.Config(),
        ModelInputConfig(),
        [DocLabelConfig(), WordLabelConfig()],
        featurizer=featurizer,
    )
    self.data = self.dh.read_from_file(train_file, self.dh.raw_columns)
def test_read_file_with_dense_features(self):
    """Reading a TSV that carries a dense-feature column should expose the
    raw dense string under ModelInput.DENSE in the parsed rows."""
    data_handler_config = ContextualIntentSlotModelDataHandler.Config()
    # Rebind with a fresh list instead of appending in place:
    # columns_to_read may be a class-level default list shared across
    # Config instances, and an in-place append would leak the extra
    # column into every other Config (and test) in the process.
    data_handler_config.columns_to_read = list(
        data_handler_config.columns_to_read
    ) + [ModelInput.DENSE]
    dense_file_name = tests_module.test_file(
        "contextual_intent_slot_train_tiny_dense.tsv"
    )
    data_handler = ContextualIntentSlotModelDataHandler.from_config(
        data_handler_config,
        ModelInputConfig(),
        [DocLabelConfig(), WordLabelConfig()],
        featurizer=SimpleFeaturizer(SimpleFeaturizer.Config(), ModelInputConfig()),
    )
    dense_data = list(
        data_handler.read_from_file(dense_file_name, data_handler.raw_columns)
    )
    self.assertEqual(dense_data[0][ModelInput.DENSE], "[0,1,2,3,4]")
def test_data_handler(self):
    """BPTT language-model batching: check batch count, input token ids,
    sequence lengths, and shifted targets for both batches."""
    data_handler = BPTTLanguageModelDataHandler.from_config(
        BPTTLanguageModelDataHandler.Config(bptt_len=4),
        FeatureConfig(),
        WordLabelConfig(),
        featurizer=SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(), FeatureConfig()
        ),
    )
    data_handler.init_metadata_from_path(FILE_NAME, FILE_NAME, FILE_NAME)
    train_iter = data_handler.get_train_iter_from_path(FILE_NAME, BATCH_SIZE)
    batches = list(train_iter)
    # The tiny dataset splits into exactly two batches.
    self.assertEqual(len(batches), 2)

    # Each batch is (input, target, context), where input is the pair
    # (input_sequences, sequence_length):
    #   input_sequences -> tensor of dim (bsize, max_seq_length)
    #   sequence_length -> tensor of dim (bsize)
    # and target[0] has the same dim as input_sequences.
    first_input, first_target = batches[0][0], batches[0][1]
    np.testing.assert_array_equal(
        first_input[0],
        [[15, 19, 12, 16], [3, 13, 21, 8], [20, 7, 23, 4], [6, 5, 7, 22]],
    )
    np.testing.assert_array_equal(first_input[1], [4, 4, 4, 4])
    np.testing.assert_array_equal(
        first_target[0],
        [[19, 12, 16, 14], [13, 21, 8, 3], [7, 23, 4, 3], [5, 7, 22, 10]],
    )

    second_input, second_target = batches[1][0], batches[1][1]
    np.testing.assert_array_equal(
        second_input[0], [[14, 17, 11], [3, 5, 18], [3, 8, 4], [10, 4, 9]]
    )
    np.testing.assert_array_equal(second_input[1], [3, 3, 3, 3])
    np.testing.assert_array_equal(
        second_target[0], [[17, 11, 4], [5, 18, 6], [8, 4, 3], [4, 9, 1]]
    )
def example_config(cls):
    """Return a sample task config with joint doc and word label spaces."""
    label_configs = [DocLabelConfig(), WordLabelConfig()]
    return cls.Config(labels=label_configs)