예제 #1
0
    def test_errors_vectors_python(self):
        """Verify vectors() and GloVe raise the expected exception types on bad input."""
        # Empty tokens/vectors with no user-defined unk_tensor -> ValueError.
        empty_tokens = []
        empty_vecs = torch.empty(0, dtype=torch.float)
        with self.assertRaises(ValueError):
            vectors(empty_tokens, empty_vecs)

        # A vector tensor that is not torch.float -> TypeError.
        int8_row = torch.tensor([1, 0, 0], dtype=torch.int8)
        with self.assertRaises(TypeError):
            vectors(['a'], int8_row.unsqueeze(0))

        # Incorrect filename or dim passed into GloVe -> ValueError.
        with tempfile.TemporaryDirectory() as dir_name:
            asset_name = 'glove.6B.zip'
            asset_path = get_asset_path(asset_name)
            shutil.copy(asset_path, os.path.join(dir_name, asset_name))

            with self.assertRaises(ValueError):
                # incorrect name
                GloVe(name='UNK', dim=50, root=dir_name, validate_file=False)

            with self.assertRaises(ValueError):
                # incorrect dim
                GloVe(name='6B', dim=500, root=dir_name, validate_file=False)
예제 #2
0
 def test_vocab_transform(self):
     """VocabTransform maps tokens to ids, both eagerly and under TorchScript."""
     asset_name = 'vocab_test2.txt'
     asset_path = get_asset_path(asset_name)
     # Use a context manager so the asset file is always closed
     # (the original opened it and never closed the handle).
     with open(asset_path, 'r') as f:
         vocab_transform = VocabTransform(vocab_from_file(f))
     self.assertEqual(vocab_transform(['of', 'that', 'new']), [7, 18, 24])
     # The scripted (torchbind) transform must agree with the eager one.
     jit_vocab_transform = torch.jit.script(vocab_transform.to_ivalue())
     self.assertEqual(jit_vocab_transform(['of', 'that', 'new']), [7, 18, 24])
예제 #3
0
 def test_text_sequential_transform(self):
     """A tokenizer + vocab pipeline yields the same ids eagerly and when scripted."""
     asset_name = 'vocab_test2.txt'
     asset_path = get_asset_path(asset_name)
     with open(asset_path, 'r') as f:
         pipeline = TextSequentialTransforms(basic_english_normalize(), vocab_from_file(f))
         jit_pipeline = torch.jit.script(pipeline.to_ivalue())
         expected_ids = [7, 18, 24]
         self.assertEqual(pipeline('of that new'), expected_ids)
         self.assertEqual(jit_pipeline('of that new'), expected_ids)
예제 #4
0
 def test_vocab_from_file(self):
     """Building a vocab from a file honours a custom unk_token placed first."""
     asset_path = get_asset_path('vocab_test.txt')
     with open(asset_path, 'r') as f:
         v = vocab_from_file(f, unk_token='<new_unk>')
         expected_itos = ['<new_unk>', 'b', 'a', 'c']
         expected_stoi = {token: idx for idx, token in enumerate(expected_itos)}
         self.assertEqual(v.get_itos(), expected_itos)
         self.assertEqual(dict(v.get_stoi()), expected_stoi)
예제 #5
0
 def test_no_download(self):
     """download_from_url returns the pre-existing local file instead of downloading."""
     asset_name = 'glove.840B.300d.zip'
     asset_path = get_asset_path(asset_name)
     root = os.path.abspath('.data')
     if not os.path.exists(root):
         os.makedirs(root)
     data_path = os.path.abspath(os.path.join('.data', asset_name))
     shutil.copy(asset_path, data_path)
     try:
         # The fake URL would fail if a real download were attempted, so a
         # matching return value proves the local copy was used.
         file_path = utils.download_from_url('fakedownload/glove.840B.300d.zip')
         self.assertEqual(file_path, data_path)
     finally:
         # Remove the staged copy even when the assertion fails
         # (the original skipped cleanup on test failure).
         conditional_remove(data_path)
예제 #6
0
 def test_sentencepiece_processor(self):
     """sentencepiece_processor round-trips ids, eagerly and under TorchScript."""
     model_path = get_asset_path('spm_example.model')
     spm_transform = sentencepiece_processor(model_path)
     jit_spm_transform = torch.jit.script(spm_transform.to_ivalue())
     test_sample = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
     ref_results = [15340, 4286, 981, 1207, 1681, 17, 84, 684, 8896, 5366,
                    144, 3689, 9, 5602, 12114, 6, 560, 649, 5602, 12114]
     # Encoding must match the reference ids for both variants.
     for transform in (spm_transform, jit_spm_transform):
         self.assertEqual(transform(test_sample), ref_results)
     # Decoding the ids must reconstruct the original sentence.
     for transform in (spm_transform, jit_spm_transform):
         self.assertEqual(transform.decode(ref_results), test_sample)
예제 #7
0
    def test_errors(self):
        """Vectors and GloVe must raise the right exception types on invalid input."""
        float_a = torch.tensor([1, 0, 0], dtype=torch.float)
        float_b = torch.tensor([0, 1, 0], dtype=torch.float)
        float_c = torch.tensor([0, 0, 1], dtype=torch.float)

        # Empty tokens/vectors and no user-defined unk_tensor -> ValueError.
        with self.assertRaises(ValueError):
            Vectors([], torch.empty(0, dtype=torch.float))

        # Tokens and vectors of different sizes -> RuntimeError.
        with self.assertRaises(RuntimeError):
            Vectors(['a', 'b', 'c'], torch.stack((float_a, float_b), 0))

        # Duplicate tokens -> RuntimeError.
        # TODO (Nayef211): use self.assertRaisesRegex() to check
        # the key of the duplicate token in the error message
        with self.assertRaises(RuntimeError):
            Vectors(['a', 'a', 'c'], torch.stack((float_a, float_b, float_c), 0))

        # A vector tensor that is not torch.float -> TypeError.
        with self.assertRaises(TypeError):
            Vectors(['a'], torch.tensor([0, 0, 1], dtype=torch.int8).unsqueeze(0))

        # Incorrect filename or dim passed into GloVe -> ValueError.
        with tempfile.TemporaryDirectory() as dir_name:
            asset_name = 'glove.6B.zip'
            asset_path = get_asset_path(asset_name)
            shutil.copy(asset_path, os.path.join(dir_name, asset_name))

            with self.assertRaises(ValueError):
                # incorrect name
                GloVe(name='UNK', dim=50, root=dir_name, validate_file=False)

            with self.assertRaises(ValueError):
                # incorrect dim
                GloVe(name='6B', dim=500, root=dir_name, validate_file=False)
예제 #8
0
    def test_glove_different_dims(self):
        """Each GloVe dimensionality loads its own file from the shared 6B zip."""
        # The asset is just a zip of 1-line txt files, used to verify that the
        # correct per-dimension file inside the archive is loaded.
        asset_name = 'glove.6B.zip'
        asset_path = get_asset_path(asset_name)

        # The first 3 entries of the vector for 'the' in each per-dim file.
        expected_firsts = {
            50: [0.418, 0.24968, -0.41242],
            100: [-0.038194, -0.24487, 0.72812],
            200: [-0.071549, 0.093459, 0.023738],
            300: [0.04656, 0.21318, -0.0074364],
        }

        with tempfile.TemporaryDirectory() as dir_name:
            # copy the asset file into the expected download location
            shutil.copy(asset_path, os.path.join(dir_name, asset_name))

            for dim, expected_head in expected_firsts.items():
                vectors_obj = GloVe(name='6B',
                                    dim=dim,
                                    root=dir_name,
                                    validate_file=False)
                self.assertEqual(vectors_obj['the'][:3], expected_head)
예제 #9
0
    def test_vector_transform(self):
        """VectorTransform looks up FastText vectors, eagerly and when scripted."""
        asset_name = 'wiki.en.vec'
        asset_path = get_asset_path(asset_name)

        with tempfile.TemporaryDirectory() as dir_name:
            shutil.copy(asset_path, os.path.join(dir_name, asset_name))
            vector_transform = VectorTransform(FastText(root=dir_name, validate_file=False))
            jit_vector_transform = torch.jit.script(vector_transform.to_ivalue())
            # Only the first 3 entries of each vector are compared.
            expected_fasttext_simple_en = torch.tensor(
                [[-0.065334, -0.093031, -0.017571], [-0.32423, -0.098845, -0.0073467]])
            for transform in (vector_transform, jit_vector_transform):
                self.assertEqual(transform(['the', 'world'])[:, 0:3], expected_fasttext_simple_en)
예제 #10
0
    def test_vectors_from_file(self):
        """Vectors built from a csv file return stored rows and a zero unk vector."""
        asset_name = 'vectors_test.csv'
        asset_path = get_asset_path(asset_name)
        # Use a context manager so the asset file is always closed
        # (the original opened it and never closed the handle).
        with open(asset_path, 'r') as f:
            vectors_obj = vectors_from_file_object(f)

        expected_tensorA = torch.tensor([1, 0, 0], dtype=torch.float)
        expected_tensorB = torch.tensor([0, 1, 0], dtype=torch.float)
        # Tokens absent from the file fall back to the all-zero unk tensor.
        expected_unk_tensor = torch.tensor([0, 0, 0], dtype=torch.float)

        self.assertEqual(vectors_obj['a'], expected_tensorA)
        self.assertEqual(vectors_obj['b'], expected_tensorB)
        self.assertEqual(vectors_obj['not_in_it'], expected_unk_tensor)
예제 #11
0
    def test_vocab_from_file(self):
        """Special tokens are appended after regular tokens when specials_first=False."""
        asset_name = 'vocab_test.txt'
        asset_path = get_asset_path(asset_name)
        # Use a context manager so the asset file is always closed
        # (the original opened it and never closed the handle).
        with open(asset_path, 'r') as f:
            v = vocab_from_file_object(f,
                                       specials=('<unk>', '<pad>', '<eos>'),
                                       specials_first=False)

        expected_itos = ['a', 'b', 'c', '<unk>', '<pad>', '<eos>']
        expected_stoi = {x: index for index, x in enumerate(expected_itos)}

        self.assertEqual(v.get_itos(), expected_itos)
        self.assertEqual(dict(v.get_stoi()), expected_stoi)
예제 #12
0
 def test_vocab_transform(self):
     """VocabTransform handles batched (nested) token lists, eagerly and scripted."""
     asset_path = get_asset_path('vocab_test2.txt')
     expected_ids = [[21, 26, 20], [21, 26, 20, 26]]
     with open(asset_path, 'r') as f:
         vocab_transform = VocabTransform(vocab_from_file(f))
         self.assertEqual(
             vocab_transform([['of', 'that', 'new'],
                              ['of', 'that', 'new', 'that']]),
             expected_ids)
         # The scripted (torchbind) transform must agree.
         jit_vocab_transform = torch.jit.script(vocab_transform.to_ivalue())
         self.assertEqual(
             jit_vocab_transform([['of', 'that', 'new'],
                                  ['of', 'that', 'new', 'that']]),
             expected_ids)
예제 #13
0
    def test_sentencepiece_tokenizer(self):
        """The sentencepiece tokenizer splits into subword pieces and decodes back."""
        model_path = get_asset_path('spm_example.model')
        spm_tokenizer = sentencepiece_tokenizer(model_path)
        jit_spm_tokenizer = torch.jit.script(spm_tokenizer.to_ivalue())
        test_sample = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
        # \u2581 is the sentencepiece word-boundary marker.
        ref_results = ['\u2581Sent', 'ence', 'P', 'ie', 'ce', '\u2581is',
                       '\u2581an', '\u2581un', 'super', 'vis', 'ed', '\u2581text',
                       '\u2581to', 'ken', 'izer', '\u2581and',
                       '\u2581de', 'to', 'ken', 'izer']

        for tokenizer in (spm_tokenizer, jit_spm_tokenizer):
            self.assertEqual(tokenizer(test_sample), ref_results)
            self.assertEqual(tokenizer.decode(ref_results), test_sample)
예제 #14
0
File: test_vocab.py  Project: cpuhrsch/text
 def test_vocab_from_raw_text_file(self):
     """A vocab built from raw text via a scripted tokenizer yields the expected itos/stoi."""
     asset_path = get_asset_path('vocab_raw_text_test.txt')
     with open(asset_path, 'r') as f:
         # vocab_from_raw_text_file consumes a scripted tokenizer.
         jit_tokenizer = torch.jit.script(basic_english_normalize().to_ivalue())
         v = vocab_from_raw_text_file(f, jit_tokenizer, unk_token='<new_unk>')
         expected_itos = ['<new_unk>', "'", 'after', 'talks', '.', 'are', 'at', 'disappointed',
                          'fears', 'federal', 'firm', 'for', 'mogul', 'n', 'newall', 'parent',
                          'pension', 'representing', 'say', 'stricken', 't', 'they', 'turner',
                          'unions', 'with', 'workers']
         expected_stoi = {token: idx for idx, token in enumerate(expected_itos)}
         self.assertEqual(v.get_itos(), expected_itos)
         self.assertEqual(dict(v.get_stoi()), expected_stoi)
예제 #15
0
    def test_sentencepiece_load_and_save(self):
        """A sentencepiece tokenizer survives a torch.save/torch.load round trip."""
        model_path = get_asset_path('spm_example.model')
        sentence = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
        expected = [
            '▁Sent', 'ence', 'P', 'ie', 'ce',
            '▁is', '▁an', '▁un', 'super', 'vis', 'ed',
            '▁text', '▁to', 'ken', 'izer', '▁and',
            '▁de', 'to', 'ken', 'izer',
        ]

        with self.subTest('pybind'):
            # Plain (pybind) tokenizer: save, reload, and re-tokenize.
            save_path = os.path.join(self.test_dir, 'spm_pybind.pt')
            spm = sentencepiece_tokenizer((model_path))
            torch.save(spm, save_path)
            loaded_spm = torch.load(save_path)
            self.assertEqual(expected, loaded_spm(sentence))

        with self.subTest('torchscript'):
            save_path = os.path.join(self.test_dir, 'spm_torchscript.pt')
            # __prepare_scriptable__() converts the building block to the torchbind
            # version; users are not expected to use that form in eager mode, but
            # CI still needs to cover saving and loading it.
            spm = sentencepiece_tokenizer(
                (model_path)).__prepare_scriptable__()
            torch.save(spm, save_path)
            loaded_spm = torch.load(save_path)
            self.assertEqual(expected, loaded_spm(sentence))
예제 #16
0
    def test_sentencepiece_load_and_save(self):
        """Saved and reloaded sentencepiece tokenizers keep producing the same pieces."""
        model_path = get_asset_path('spm_example.model')
        sample = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
        expected = [
            '▁Sent', 'ence', 'P', 'ie', 'ce',
            '▁is', '▁an', '▁un', 'super', 'vis', 'ed',
            '▁text', '▁to', 'ken', 'izer', '▁and',
            '▁de', 'to', 'ken', 'izer',
        ]

        def save_and_reload(tokenizer, file_name):
            # Round-trip the tokenizer through torch.save / torch.load.
            save_path = os.path.join(self.test_dir, file_name)
            torch.save(tokenizer, save_path)
            return torch.load(save_path)

        with self.subTest('pybind'):
            loaded_spm = save_and_reload(
                sentencepiece_tokenizer((model_path)), 'spm_pybind.pt')
            self.assertEqual(expected, loaded_spm(sample))

        with self.subTest('torchscript'):
            loaded_spm = save_and_reload(
                sentencepiece_tokenizer((model_path)).to_ivalue(), 'spm_torchscript.pt')
            self.assertEqual(expected, loaded_spm(sample))
예제 #17
0
    def test_glove(self):
        """GloVe 840B vectors are retrievable from both eager and scripted objects."""
        # The asset is a zip containing the first 100 entries of the GloVe 840B dataset,
        # copied into the expected download location.
        asset_name = 'glove.840B.300d.zip'
        asset_path = get_asset_path(asset_name)

        with tempfile.TemporaryDirectory() as dir_name:
            shutil.copy(asset_path, os.path.join(dir_name, asset_name))
            vectors_obj = GloVe(root=dir_name, validate_file=False)
            jit_vectors_obj = torch.jit.script(vectors_obj.to_ivalue())

            # The first 3 entries of each expected vector.
            expected_glove = {
                'the': [0.27204, -0.06203, -0.1884],
                'people': [-0.19686, 0.11579, -0.41091],
            }

            for word, head in expected_glove.items():
                self.assertEqual(vectors_obj[word][:3], head)
                self.assertEqual(jit_vectors_obj[word][:3], head)
예제 #18
0
    def test_fast_text(self):
        """FastText vectors are retrievable from both eager and scripted objects."""
        # The asset holds the first 100 entries of the FastText english dataset,
        # copied into the expected download location.
        asset_name = 'wiki.en.vec'
        asset_path = get_asset_path(asset_name)

        with tempfile.TemporaryDirectory() as dir_name:
            shutil.copy(asset_path, os.path.join(dir_name, asset_name))
            vectors_obj = FastText(root=dir_name, validate_file=False)
            jit_vectors_obj = torch.jit.script(vectors_obj)

            # The first 3 entries of each expected vector.
            expected_fasttext_simple_en = {
                'the': [-0.065334, -0.093031, -0.017571],
                'world': [-0.32423, -0.098845, -0.0073467],
            }

            for word, head in expected_fasttext_simple_en.items():
                self.assertEqual(vectors_obj[word][:3], head)
                self.assertEqual(jit_vectors_obj[word][:3], head)