示例#1
0
 def test_create_with_existing_file(self, tmp_path):
     """``create`` must not replace an existing file unless ``overwrite=True``."""
     target = tmp_path / 'existing'
     target.touch()  # the destination has to exist before create() is called
     pairs = [('blah', numpy.array([1, 2]))]

     # Overwriting disabled: the existing file must be reported as an error.
     with pytest.raises(FileExistsError):
         MockEmbFile.create(target, pairs, overwrite=False)

     # Overwriting enabled: the very same call is expected to go through.
     MockEmbFile.create(target, pairs, overwrite=True)
示例#2
0
 def test_create_warns_for_wrong_vocab_size(self, reference_pairs):
     """A ``vocab_size`` that disagrees with the data warns and gets corrected."""
     declared_size = REFERENCE_VOCAB_SIZE + 5  # deliberately wrong
     with pytest.warns(UserWarning):
         MockEmbFile.create(
             'fake_path', reference_pairs, vocab_size=declared_size)

     # NOTE(review): _create_kwargs presumably holds the arguments that the
     # mock's _create received -- confirm against MockEmbFile.
     recorded = MockEmbFile._create_kwargs
     assert recorded.vocab_size == REFERENCE_VOCAB_SIZE
示例#3
0
    def test_create_from_file_returned_path(self, tmpdir):
        """Check the path returned by ``create_from_file`` for a plain source file."""
        source = MockEmbFile(path='path/to/file.txt')

        # Neither out_dir nor out_filename given: the result is expected next
        # to the source, with the mock extension and the compression suffix.
        default_path = MockEmbFile.create_from_file(source, compression='gz')
        assert default_path == Path('path/to/file.mock.gz')

        # Both out_dir and out_filename given explicitly.
        custom_path = MockEmbFile.create_from_file(
            source,
            out_dir='out/dir/',
            out_filename='out_filename.mock',
        )
        assert custom_path == Path('out/dir/out_filename.mock')
示例#4
0
    def test_create_from_file_returned_path_with_compressed_source_file(
            self, tmpdir):
        """Check the path returned by ``create_from_file`` for a gzipped source."""
        # A real compressed file is required because EmbFile decompresses it.
        gz_path = tmpdir / 'file.txt.gz'
        with gzip.open(gz_path, 'wt') as stream:
            stream.write('ciao')
        source = MockEmbFile(path=gz_path)

        # Default out_dir and out_filename with a compressed source file.
        default_path = MockEmbFile.create_from_file(
            source, compression='bz2')
        assert default_path == Path(tmpdir / 'file.mock.bz2')

        # An explicit out_filename is expected to be used verbatim.
        renamed_path = MockEmbFile.create_from_file(
            source, out_filename='changed', compression='bz2')
        assert renamed_path == Path(tmpdir / 'changed')
示例#5
0
def test_build_matrix_raises_for_dict_with_empty_slots():
    """Build a matrix from a ``word2index`` dict whose indexes have gaps.

    Despite the name, no exception is expected here: the test checks that
    rows not referenced by ``word2index`` are all zeros, that rows of known
    words hold their vectors, and that the out-of-vocabulary word gets the
    vector produced by ``oov_initializer``.
    """
    vector_size = 5
    word2vec = {
        'a': numpy.random.rand(vector_size),
        'b': numpy.random.rand(vector_size),
        'c': numpy.random.rand(vector_size),
        'd': numpy.random.rand(vector_size),
    }
    oov_vector = numpy.array([1, 2, 3, 4, 5])
    # Indexes 0, 2 and 4 are intentionally left unassigned; 'missing' is not
    # among the words stored in the file.
    word2index = {'a': 1, 'c': 3, 'missing': 5}
    emb_file = MockEmbFile(list(word2vec.items()))

    out = embfile.build_matrix(emb_file,
                               word2index,
                               oov_initializer=lambda shape: oov_vector)

    assert out.missing_words == {'missing'}
    # This comparison used to be a bare expression statement, so it was
    # evaluated and thrown away without ever being checked.
    assert out.word2index == word2index

    for empty_row in (0, 2, 4):
        assert numpy.allclose(out.matrix[empty_row], numpy.zeros(vector_size))
    assert numpy.allclose(out.matrix[1], word2vec['a'])
    assert numpy.allclose(out.matrix[3], word2vec['c'])
    assert numpy.allclose(out.matrix[5], oov_vector)
示例#6
0
 def file(self, tmp_path, reference_pairs):
     """Return a mock file built from the reference pairs, located under ``tmp_path``."""
     mock_path = tmp_path / 'file.mock'
     return MockEmbFile(reference_pairs, path=mock_path)
示例#7
0
 def pairs_and_file(self, tmp_path, pairs_factory, dtype):
     """Yield freshly generated pairs together with an open mock file holding them.

     The file is used as a context manager, so it is exited once the
     consumer of this generator is done with the yielded values.
     """
     # NOTE(review): dtype is not used directly here; presumably it is
     # requested because pairs_factory depends on it -- confirm.
     expected_pairs = pairs_factory()
     mock_path = tmp_path / 'file.mock'
     with MockEmbFile(expected_pairs, path=mock_path) as opened_file:
         yield expected_pairs, opened_file
示例#8
0
 def test_create_with_wrong_type(self):
     """``create`` raises ``TypeError`` when ``word_vectors`` is an int."""
     not_word_vectors = 5
     with pytest.raises(TypeError):
         # noinspection PyTypeChecker
         MockEmbFile.create('fake_path', word_vectors=not_word_vectors)
示例#9
0
 def test_create_with_iterator(self, reference_pairs):
     """Passing an iterator of pairs: no vocab size, vector size still detected."""
     pairs_iterator = iter(reference_pairs)
     MockEmbFile.create('fake_path', pairs_iterator)

     recorded = MockEmbFile._create_kwargs
     # NOTE(review): vocab_size is presumably None because the length of an
     # iterator cannot be known up front -- confirm against create().
     assert recorded.vocab_size is None
     assert recorded.vector_size == REFERENCE_VECTOR_SIZE
     # The pairs are materialized so they can be compared with the reference list.
     assert list(recorded.pairs) == reference_pairs
示例#10
0
 def test_create_with_dict(self, reference_dict):
     """Passing a dict: sizes are detected and its items are forwarded as pairs."""
     MockEmbFile.create('fake_path', reference_dict)

     recorded = MockEmbFile._create_kwargs
     assert recorded.vocab_size == REFERENCE_VOCAB_SIZE
     assert recorded.vector_size == REFERENCE_VECTOR_SIZE
     assert recorded.pairs == reference_dict.items()
示例#11
0
def file(reference_pairs):
    """Return a mock embedding file built from the reference pairs."""
    mock_file = MockEmbFile(reference_pairs)
    return mock_file