def test_read_from_file(self): file_name = tests_module.test_file("train_dense_features_tiny.tsv") data = self.data_handler.read_from_file(file_name, self.data_handler.raw_columns) # Check if the data has 10 rows and 6 columns self.assertEqual(len(data), 10) self.assertEqual(len(data[0]), 4) self.assertEqual(data[0][RawData.DOC_LABEL], "alarm/modify_alarm")
def test_tokenization(self): file_name = tests_module.test_file("train_dense_features_tiny.tsv") data = self.data_handler.read_from_file(file_name, self.data_handler.raw_columns) data = list(self.data_handler.preprocess(data)) # test tokenization without language-specific tokenizers self.assertEqual(data[0][ModelInput.WORD_FEAT][0], "16:24:datetime,39:57:datetime") self.assertIsNotNone(data[0][ModelInput.DENSE_FEAT])
def test_read_partially_from_csv(self):
    file_name = tests_module.test_file("train_data_tiny.tsv")
    columns = {DFColumn.DOC_LABEL: 0, DFColumn.UTTERANCE: 2}
    data = DataHandler.read_from_file(file_name, columns)
    for col in columns:
        self.assertTrue(col in data[0], "{} must be in the data".format(col))
    self.assertEqual("alarm/modify_alarm", data[0][DFColumn.DOC_LABEL])
    self.assertEqual(
        "change my alarm tomorrow to wake me up 30 minutes earlier",
        data[0][DFColumn.UTTERANCE],
    )
def setUp(self):
    file_name = tests_module.test_file("contextual_intent_slot_train_tiny.tsv")
    self.dh = ContextualIntentSlotModelDataHandler.from_config(
        ContextualIntentSlotModelDataHandler.Config(),
        ModelInputConfig(),
        [DocLabelConfig(), WordLabelConfig()],
        featurizer=SimpleFeaturizer(SimpleFeaturizer.Config(), ModelInputConfig()),
    )
    self.data = self.dh.read_from_file(file_name, self.dh.raw_columns)
def test_read_from_file(self): file_name = tests_module.test_file("train_data_tiny.tsv") data = self.data_handler.read_from_file(file_name, self.data_handler.raw_columns) # Check if the data has 10 rows and 6 columns self.assertEqual(len(data), 10) self.assertEqual(len(data[0]), 6) self.assertEqual( data[0][DFColumn.UTTERANCE], "change my alarm tomorrow to wake me up 30 minutes earlier", )
def test_tokenization(self): file_name = tests_module.test_file("train_data_tiny.tsv") data = self.data_handler.read_from_file(file_name, self.data_handler.raw_columns) data = list(self.data_handler.preprocess(data)) # test tokenization without language-specific tokenizers self.assertEqual(data[0][DatasetFieldName.TEXT_FIELD][0], "change") self.assertEqual(data[4][DatasetFieldName.TEXT_FIELD][2], "alarm") # test token ranges self.assertEqual(data[0][DatasetFieldName.TOKEN_RANGE][0], (0, 6)) self.assertEqual(data[4][DatasetFieldName.TOKEN_RANGE][2], (12, 17))
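# Note on the TOKEN_RANGE assertions above: each range is treated as (start, end)
# character offsets into the raw utterance, so slicing the utterance with a range
# recovers the corresponding token. Illustrative sketch only (the utterance string
# is the one asserted elsewhere in these tests; this is not part of the handler API):
_utterance = "change my alarm tomorrow to wake me up 30 minutes earlier"
_start, _end = (0, 6)  # first token range asserted above
assert _utterance[_start:_end] == "change"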
def test_read_from_csv(self):
    file_name = tests_module.test_file("train_data_tiny.tsv")
    columns = [
        DFColumn.DOC_LABEL,
        DFColumn.WORD_LABEL,
        DFColumn.UTTERANCE,
        DFColumn.DICT_FEAT,
    ]
    data = DataHandler.read_from_file(file_name, columns)
    for col in columns:
        self.assertTrue(col in data[0], "{} must be in the data".format(col))
    self.assertEqual("alarm/modify_alarm", data[0][DFColumn.DOC_LABEL])
    self.assertEqual(
        "16:24:datetime,39:57:datetime", data[0][DFColumn.WORD_LABEL]
    )
    self.assertEqual(
        "change my alarm tomorrow to wake me up 30 minutes earlier",
        data[0][DFColumn.UTTERANCE],
    )
    self.assertEqual("", data[0][DFColumn.DICT_FEAT])
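# For reference, the first row of train_data_tiny.tsv implied by the assertions
# above. This is a reconstruction (an assumption about the file layout, not the
# actual repository data file); the column order follows the `columns` list and
# the dict-feature column is empty for this row:
_expected_first_row = "\t".join(
    [
        "alarm/modify_alarm",             # DFColumn.DOC_LABEL
        "16:24:datetime,39:57:datetime",  # DFColumn.WORD_LABEL
        "change my alarm tomorrow to wake me up 30 minutes earlier",  # DFColumn.UTTERANCE
        "",                               # DFColumn.DICT_FEAT
    ]
)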
import tempfile
import unittest

import numpy as np
from pytext.common.constants import DatasetFieldName
from pytext.config.field_config import (
    DocLabelConfig,
    EmbedInitStrategy,
    FeatureConfig,
    WordFeatConfig,
    WordLabelConfig,
)
from pytext.data import JointModelDataHandler
from pytext.data.featurizer import SimpleFeaturizer
from pytext.utils.embeddings_utils import PretrainedEmbedding
from pytext.utils.test_utils import tests_module


TRAIN_FILE = tests_module.test_file("train_data_tiny.tsv")
EVAL_FILE = tests_module.test_file("test_data_tiny.tsv")
TEST_FILE = tests_module.test_file("test_data_tiny.tsv")
EMBED_RAW_PATH = tests_module.test_file("pretrained_embed_raw")
EMBED_CACHED_PATH = tests_module.test_file("test_embed.cached")
EMBED_XLU_CACHED_PATH = tests_module.test_file("test_embed_xlu.cached")


class PretrainedEmbedsTest(unittest.TestCase):
    def test_cache_embeds(self):
        embeddings_ref = PretrainedEmbedding()
        embeddings_ref.load_pretrained_embeddings(EMBED_RAW_PATH)
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".{}".format("cached")
        ) as cached_path:
            embeddings_ref.cache_pretrained_embeddings(cached_path.name)
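            # Sketch of one way the truncated round-trip check could continue:
            # reload from the cache and compare with the reference embeddings.
            # `load_cached_embeddings`, `embed_vocab`, and `embedding_vectors`
            # are assumed names here, not confirmed by the code above.
            embeddings_cached = PretrainedEmbedding()
            embeddings_cached.load_cached_embeddings(cached_path.name)
            self.assertEqual(
                embeddings_cached.embed_vocab, embeddings_ref.embed_vocab
            )
            np.testing.assert_allclose(
                embeddings_cached.embedding_vectors,
                embeddings_ref.embedding_vectors,
            )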
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import unittest
from typing import Dict

import numpy as np
from pytext.common.constants import BatchContext, DatasetFieldName, DFColumn, VocabMeta
from pytext.config.component import create_featurizer
from pytext.config.field_config import FeatureConfig, WordLabelConfig
from pytext.data import LanguageModelDataHandler
from pytext.data.featurizer import SimpleFeaturizer
from pytext.fields import Field, TextFeatureField
from pytext.utils.test_utils import tests_module


FILE_NAME = tests_module.test_file("alarm_lm_tiny.tsv")
BATCH_SIZE = 5


class LanguageModelDataHandlerTest(unittest.TestCase):
    @classmethod
    def create_language_model_data_handler(cls) -> LanguageModelDataHandler:
        # TODO: Refactor this after Shicong refactors PyText config and removes
        # Thrift. After that, directly use the data handler's from_config method
        # with synthetic configs.
        columns = [DFColumn.UTTERANCE]
        features: Dict[str, Field] = {
            DatasetFieldName.TEXT_FIELD: TextFeatureField(
                eos_token=VocabMeta.EOS_TOKEN, init_token=VocabMeta.INIT_TOKEN
            )
        }
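        # Sketch of the assumed continuation (the original method is truncated
        # above): build the handler from the declared columns/features and return
        # it. The constructor keywords (raw_columns, features, labels, featurizer)
        # are assumptions based on the imports above, not confirmed by this snippet.
        featurizer = create_featurizer(SimpleFeaturizer.Config(), FeatureConfig())
        data_handler = LanguageModelDataHandler(
            raw_columns=columns,
            features=features,
            labels={},
            featurizer=featurizer,
        )
        return data_handler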