def test_create_with_existing_file(self, tmp_path):
    """``create`` on an already existing path fails unless ``overwrite=True``."""
    existing_path = tmp_path.joinpath('existing')
    existing_path.touch()
    word_vectors = [('blah', numpy.array([1, 2]))]

    # Without permission to overwrite, the existing file must be reported.
    with pytest.raises(FileExistsError):
        MockEmbFile.create(existing_path, word_vectors, overwrite=False)

    # The same call with overwrite enabled is expected not to raise.
    MockEmbFile.create(existing_path, word_vectors, overwrite=True)
def test_create_warns_for_wrong_vocab_size(self, reference_pairs):
    """A mismatching ``vocab_size`` triggers a ``UserWarning`` and is corrected.

    After the call, the ``vocab_size`` recorded in
    ``MockEmbFile._create_kwargs`` must be ``REFERENCE_VOCAB_SIZE``.
    """
    with pytest.warns(UserWarning):
        MockEmbFile.create(
            'fake_path',
            reference_pairs,
            vocab_size=REFERENCE_VOCAB_SIZE + 5,  # deliberately wrong
        )

    recorded_kwargs = MockEmbFile._create_kwargs
    assert recorded_kwargs.vocab_size == REFERENCE_VOCAB_SIZE
def test_create_from_file_returned_path(self, tmpdir):
    """Check the path returned by ``create_from_file`` for an uncompressed source."""
    source = MockEmbFile(path='path/to/file.txt')

    # Default out_dir and out_filename: the result sits next to the source.
    default_result = MockEmbFile.create_from_file(source, compression='gz')
    assert default_result == Path('path/to/file.mock.gz')

    # Custom out_dir and out_filename are used for the result.
    custom_kwargs = {'out_dir': 'out/dir/', 'out_filename': 'out_filename.mock'}
    custom_result = MockEmbFile.create_from_file(source, **custom_kwargs)
    assert custom_result == Path('out/dir/out_filename.mock')
def test_create_from_file_returned_path_with_compressed_source_file(
        self, tmpdir):
    """Check the path returned by ``create_from_file`` for a gzipped source."""
    # A real compressed file is needed because EmbFile decompresses it.
    gz_path = tmpdir / 'file.txt.gz'
    with gzip.open(gz_path, 'wt') as gz_stream:
        gz_stream.write('ciao')
    source = MockEmbFile(path=gz_path)

    # Default out_dir and out_filename with a compressed source file.
    default_result = MockEmbFile.create_from_file(source, compression='bz2')
    expected_default = Path(tmpdir / 'file.mock.bz2')
    assert default_result == expected_default

    # Non-default out_filename argument.
    renamed_result = MockEmbFile.create_from_file(
        source,
        out_filename='changed',
        compression='bz2',
    )
    expected_renamed = Path(tmpdir / 'changed')
    assert renamed_result == expected_renamed
def test_build_matrix_raises_for_dict_with_empty_slots():
    """Check ``build_matrix`` with a word-to-index dict whose indexes have gaps.

    The dict only uses indexes 1, 3 and 5. The test expects that:

    * the word absent from the file is reported in ``missing_words`` and its
      row holds the vector produced by ``oov_initializer``;
    * the returned ``word2index`` equals the one passed in;
    * the unused rows (0, 2, 4) are all zeros;
    * rows of words present in the file hold the file's vectors.

    NOTE(review): the name says "raises", but no exception is asserted here.
    """
    word2vec = {word: numpy.random.rand(5) for word in ('a', 'b', 'c', 'd')}
    oov_vector = numpy.array([1, 2, 3, 4, 5])
    word2index = {'a': 1, 'c': 3, 'missing': 5}
    emb_file = MockEmbFile(list(word2vec.items()))

    out = embfile.build_matrix(emb_file, word2index,
                               oov_initializer=lambda shape: oov_vector)

    assert out.missing_words == {'missing'}
    # This comparison used to be a bare expression, so it never checked anything.
    assert out.word2index == word2index

    # Indexes not mapped to any word must be left as zero vectors.
    for empty_row in (0, 2, 4):
        assert numpy.allclose(out.matrix[empty_row], numpy.zeros(5))

    assert numpy.allclose(out.matrix[1], word2vec['a'])
    assert numpy.allclose(out.matrix[3], word2vec['c'])
    assert numpy.allclose(out.matrix[5], oov_vector)
def file(self, tmp_path, reference_pairs):
    """Return a ``MockEmbFile`` over ``reference_pairs`` at ``tmp_path / 'file.mock'``.

    Presumably registered as a pytest fixture; the decorator is not visible here.
    """
    mock_path = tmp_path / 'file.mock'
    mock_file = MockEmbFile(reference_pairs, path=mock_path)
    return mock_file
def pairs_and_file(self, tmp_path, pairs_factory, dtype):
    """Yield ``(pairs, file)`` where ``file`` is a ``MockEmbFile`` over ``pairs``.

    The file is used as a context manager, so it is exited once the consumer
    of this generator is done. ``dtype`` is not used in the body; presumably
    it is requested for parametrization (TODO confirm).
    """
    expected_pairs = pairs_factory()
    mock_file = MockEmbFile(expected_pairs, path=tmp_path / 'file.mock')
    with mock_file as opened_file:
        yield expected_pairs, opened_file
def test_create_with_wrong_type(self):
    """``create`` must raise ``TypeError`` when ``word_vectors`` is an int."""
    invalid_word_vectors = 5
    with pytest.raises(TypeError):
        # noinspection PyTypeChecker
        MockEmbFile.create('fake_path', word_vectors=invalid_word_vectors)
def test_create_with_iterator(self, reference_pairs):
    """Check what ``create`` records in ``_create_kwargs`` for a plain iterator.

    Expected: ``vocab_size`` is ``None``, ``vector_size`` is the reference
    one, and the recorded pairs iterate to the reference pairs.
    """
    pairs_iterator = iter(reference_pairs)
    MockEmbFile.create('fake_path', pairs_iterator)

    recorded_kwargs = MockEmbFile._create_kwargs
    assert recorded_kwargs.vocab_size is None
    assert recorded_kwargs.vector_size == REFERENCE_VECTOR_SIZE
    # Materialize the recorded pairs last: iterating may consume them.
    assert list(recorded_kwargs.pairs) == reference_pairs
def test_create_with_dict(self, reference_dict):
    """Check what ``create`` records in ``_create_kwargs`` for a dict.

    Expected: the reference vocabulary and vector sizes, and pairs equal to
    the dict's items view.
    """
    MockEmbFile.create('fake_path', reference_dict)

    recorded_kwargs = MockEmbFile._create_kwargs
    assert recorded_kwargs.vocab_size == REFERENCE_VOCAB_SIZE
    assert recorded_kwargs.vector_size == REFERENCE_VECTOR_SIZE
    assert recorded_kwargs.pairs == reference_dict.items()
def file(reference_pairs):
    """Return a ``MockEmbFile`` built from ``reference_pairs`` with no explicit path.

    Presumably registered as a pytest fixture; the decorator is not visible here.
    """
    mock_file = MockEmbFile(reference_pairs)
    return mock_file