def test_read_from_file(self):
        """Raw rows of the dense-features fixture load with the expected shape."""
        path = tests_module.test_file("train_dense_features_tiny.tsv")
        rows = self.data_handler.read_from_file(path,
                                                self.data_handler.raw_columns)

        # Expect 10 rows, each with 4 raw columns (original comment said 6,
        # but the assertion below has always checked for 4).
        self.assertEqual(10, len(rows))
        self.assertEqual(4, len(rows[0]))

        self.assertEqual(rows[0][RawData.DOC_LABEL], "alarm/modify_alarm")
    def test_tokenization(self):
        """Preprocessing keeps word features and dense features populated."""
        path = tests_module.test_file("train_dense_features_tiny.tsv")

        raw = self.data_handler.read_from_file(path,
                                               self.data_handler.raw_columns)
        processed = list(self.data_handler.preprocess(raw))

        # No language-specific tokenizer is configured for this handler.
        self.assertEqual(processed[0][ModelInput.WORD_FEAT][0],
                         "16:24:datetime,39:57:datetime")
        self.assertIsNotNone(processed[0][ModelInput.DENSE_FEAT])
# 示例#3  (Example #3 — snippet separator from the original paste)
# 0
    def test_read_partially_from_csv(self):
        """Reading a subset of columns still populates each requested column.

        ``columns`` maps DFColumn names to their 0-based positions in the TSV,
        so only those two fields should be extracted per row.
        """
        file_name = tests_module.test_file("train_data_tiny.tsv")
        columns = {DFColumn.DOC_LABEL: 0, DFColumn.UTTERANCE: 2}

        data = DataHandler.read_from_file(file_name, columns)
        for col in columns:
            # Message grammar fixed: "must in" -> "must be in".
            self.assertTrue(col in data[0], "{} must be in the data".format(col))
        self.assertEqual("alarm/modify_alarm", data[0][DFColumn.DOC_LABEL])
        self.assertEqual(
            "change my alarm tomorrow to wake me up 30 minutes earlier",
            data[0][DFColumn.UTTERANCE],
        )
    def setUp(self):
        """Build a contextual intent-slot data handler and load the tiny fixture."""
        featurizer = SimpleFeaturizer(SimpleFeaturizer.Config(),
                                      ModelInputConfig())
        self.dh = ContextualIntentSlotModelDataHandler.from_config(
            ContextualIntentSlotModelDataHandler.Config(),
            ModelInputConfig(),
            [DocLabelConfig(), WordLabelConfig()],
            featurizer=featurizer,
        )

        self.data = self.dh.read_from_file(
            tests_module.test_file("contextual_intent_slot_train_tiny.tsv"),
            self.dh.raw_columns,
        )
# 示例#5  (Example #5 — snippet separator from the original paste)
# 0
    def test_read_from_file(self):
        """The tiny training TSV yields 10 rows of 6 raw columns."""
        path = tests_module.test_file("train_data_tiny.tsv")
        rows = self.data_handler.read_from_file(path,
                                                self.data_handler.raw_columns)

        # 10 rows x 6 columns expected in the fixture.
        self.assertEqual(10, len(rows))
        self.assertEqual(6, len(rows[0]))

        self.assertEqual(
            rows[0][DFColumn.UTTERANCE],
            "change my alarm tomorrow to wake me up 30 minutes earlier",
        )
# 示例#6  (Example #6 — snippet separator from the original paste)
# 0
    def test_tokenization(self):
        """Default tokenization splits utterances and records character spans."""
        path = tests_module.test_file("train_data_tiny.tsv")

        raw = self.data_handler.read_from_file(path,
                                               self.data_handler.raw_columns)
        processed = list(self.data_handler.preprocess(raw))

        # Tokens produced without any language-specific tokenizer.
        self.assertEqual(processed[0][DatasetFieldName.TEXT_FIELD][0], "change")
        self.assertEqual(processed[4][DatasetFieldName.TEXT_FIELD][2], "alarm")

        # Character (start, end) ranges of the corresponding tokens.
        self.assertEqual(processed[0][DatasetFieldName.TOKEN_RANGE][0], (0, 6))
        self.assertEqual(processed[4][DatasetFieldName.TOKEN_RANGE][2], (12, 17))
# 示例#7  (Example #7 — snippet separator from the original paste)
# 0
    def test_read_from_csv(self):
        """All four raw columns are read verbatim from the tiny TSV fixture."""
        file_name = tests_module.test_file("train_data_tiny.tsv")
        columns = [
            DFColumn.DOC_LABEL,
            DFColumn.WORD_LABEL,
            DFColumn.UTTERANCE,
            DFColumn.DICT_FEAT,
        ]

        data = DataHandler.read_from_file(file_name, columns)
        for col in columns:
            # Message grammar fixed: "must in" -> "must be in".
            self.assertTrue(col in data[0], "{} must be in the data".format(col))
        self.assertEqual("alarm/modify_alarm", data[0][DFColumn.DOC_LABEL])
        self.assertEqual("16:24:datetime,39:57:datetime",
                         data[0][DFColumn.WORD_LABEL])
        self.assertEqual(
            "change my alarm tomorrow to wake me up 30 minutes earlier",
            data[0][DFColumn.UTTERANCE],
        )
        # Dict-feature column is present but empty for this row.
        self.assertEqual("", data[0][DFColumn.DICT_FEAT])
# 示例#8  (Example #8 — snippet separator from the original paste)
# 0
import os
import tempfile
import unittest

import numpy as np
from pytext.common.constants import DatasetFieldName
from pytext.config.field_config import (
    DocLabelConfig,
    EmbedInitStrategy,
    FeatureConfig,
    WordFeatConfig,
    WordLabelConfig,
)
from pytext.data import JointModelDataHandler
from pytext.data.featurizer import SimpleFeaturizer
from pytext.utils.embeddings_utils import PretrainedEmbedding
from pytext.utils.test_utils import tests_module

# Paths to the tiny dataset fixtures bundled with the tests.
# NOTE(review): EVAL_FILE and TEST_FILE point at the same fixture here.
TRAIN_FILE = tests_module.test_file("train_data_tiny.tsv")
EVAL_FILE = tests_module.test_file("test_data_tiny.tsv")
TEST_FILE = tests_module.test_file("test_data_tiny.tsv")

# Pretrained-embedding fixtures: a raw-text file plus cached variants.
EMBED_RAW_PATH = tests_module.test_file("pretrained_embed_raw")
EMBED_CACHED_PATH = tests_module.test_file("test_embed.cached")
EMBED_XLU_CACHED_PATH = tests_module.test_file("test_embed_xlu.cached")


class PretrainedEmbedsTest(unittest.TestCase):
    """Tests for caching pretrained embeddings to disk."""

    def test_cache_embeds(self):
        """Loading raw embeddings and caching them to a file should succeed."""
        embeddings_ref = PretrainedEmbedding()
        embeddings_ref.load_pretrained_embeddings(EMBED_RAW_PATH)
        # delete=False so the file can be re-opened by name (required on
        # Windows while the handle is open); previously the temp file was
        # never removed, leaking one file per test run.
        with tempfile.NamedTemporaryFile(
                delete=False, suffix=".{}".format("cached")) as cached_path:
            try:
                embeddings_ref.cache_pretrained_embeddings(cached_path.name)
            finally:
                # Explicit cleanup of the delete=False temp file.
                os.remove(cached_path.name)
# 示例#9  (Example #9 — snippet separator from the original paste)
# 0
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import unittest
from typing import Dict

import numpy as np
from pytext.common.constants import BatchContext, DatasetFieldName, DFColumn, VocabMeta
from pytext.config.component import create_featurizer
from pytext.config.field_config import FeatureConfig, WordLabelConfig
from pytext.data import LanguageModelDataHandler
from pytext.data.featurizer import SimpleFeaturizer
from pytext.fields import Field, TextFeatureField
from pytext.utils.test_utils import tests_module


# Tiny language-model fixture and the batch size shared by these tests.
FILE_NAME = tests_module.test_file("alarm_lm_tiny.tsv")
BATCH_SIZE = 5


class LanguageModelDataHandlerTest(unittest.TestCase):
    @classmethod
    def create_language_model_data_handler(cls) -> LanguageModelDataHandler:
        # TODO: Refactor this after Shicong refactors PyText config and removes
        # Thrift. After that directly use Data Handler's from config method
        # with synthetic configs
        columns = [DFColumn.UTTERANCE]
        features: Dict[str, Field] = {
            DatasetFieldName.TEXT_FIELD: TextFeatureField(
                eos_token=VocabMeta.EOS_TOKEN, init_token=VocabMeta.INIT_TOKEN
            )
        }