def test_embeddings_text_file(self):
    txt_path = str(self.FIXTURES_ROOT / "utf-8_sample/utf-8_sample.txt")

    # Read the reference text with a plain open(); this is the ground
    # truth for every comparison below.
    with open(txt_path, "rt", encoding="utf-8") as f:
        correct_text = f.read()

    # Check that we get the correct text from the plain file and from
    # compressed versions of it.
    paths = [txt_path] + [txt_path + ext for ext in [".gz", ".zip"]]
    for path in paths:
        with EmbeddingsTextFile(path) as f:
            text = f.read()
            assert text == correct_text, "Test failed for file: " + path

    # Check a file contained inside an archive with multiple files.
    for ext in [".zip", ".tar.gz", ".tar.bz2", ".tar.lzma"]:
        archive_path = str(self.FIXTURES_ROOT / "utf-8_sample/archives/utf-8") + ext
        file_uri = format_embeddings_file_uri(archive_path, "folder/utf-8_sample.txt")
        with EmbeddingsTextFile(file_uri) as f:
            text = f.read()
            assert text == correct_text, "Test failed for file: " + archive_path

    # Passing a second-level path when not reading an archive must fail.
    with pytest.raises(ValueError):
        with EmbeddingsTextFile(format_embeddings_file_uri(txt_path, "a/fake/path")):
            pass
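# The single-file compressed fixtures exercised above (.gz, .zip) can be
# regenerated from the plain text file with the standard library alone.
# A minimal sketch, not part of the test suite; the helper name and the
# one-member-per-zip layout are assumptions.
def make_compressed_variants(txt_path):
    import gzip
    import os
    import shutil
    import zipfile

    # foo.txt -> foo.txt.gz (streamed, so the file is never fully in memory)
    with open(txt_path, "rb") as src, gzip.open(txt_path + ".gz", "wb") as dst:
        shutil.copyfileobj(src, dst)

    # foo.txt -> foo.txt.zip containing a single member named foo.txt
    with zipfile.ZipFile(txt_path + ".zip", "w", zipfile.ZIP_DEFLATED) as zf:
        zf.write(txt_path, arcname=os.path.basename(txt_path))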
def test_read_pretrained_words(self):
    # The fixture "fake_embeddings.5d.txt" was generated using the words in this random quote
    words = set("If you think you are too small to make a difference "
                "try to sleeping with a mosquito àèìòù".split(" "))

    # Reading from a single (compressed) file or a single-file archive
    base_path = str(self.FIXTURES_ROOT / "embeddings/fake_embeddings.5d.txt")
    for ext in ["", ".gz", ".lzma", ".bz2", ".zip", ".tar.gz"]:
        file_path = base_path + ext
        words_read = set(_read_pretrained_tokens(file_path))
        assert words_read == words, (f"Wrong words for file {file_path}\n"
                                     f"   Read: {sorted(words_read)}\n"
                                     f"Correct: {sorted(words)}")

    # Reading from a multi-file archive
    base_path = str(self.FIXTURES_ROOT / "embeddings/multi-file-archive")
    file_path = "folder/fake_embeddings.5d.txt"
    for ext in [".zip", ".tar.gz"]:
        archive_path = base_path + ext
        embeddings_file_uri = format_embeddings_file_uri(archive_path, file_path)
        words_read = set(_read_pretrained_tokens(embeddings_file_uri))
        assert words_read == words, (f"Wrong words for file {archive_path}\n"
                                     f"   Read: {sorted(words_read)}\n"
                                     f"Correct: {sorted(words)}")
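# The token set the assertions above expect is just the first
# whitespace-separated field of every line in the embeddings file. A minimal
# sketch of that parsing (the helper name is hypothetical; the library's
# _read_pretrained_tokens must additionally handle the compressed files and
# archives that the loops above exercise):
def _first_field_per_line(embeddings_text):
    """Return the first whitespace-separated field of every non-blank line."""
    return [line.split()[0] for line in embeddings_text.splitlines() if line.strip()]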
def test_from_instances_exclusive_embeddings_file_inside_archive(self):
    """Ensure that pretrained tokens can be read from a file inside an archive."""
    # Build an archive containing two files: the embeddings file we will
    # select, and a dummy one that must be ignored.
    archive_path = str(self.TEST_DIR / "embeddings-archive.zip")
    with zipfile.ZipFile(archive_path, 'w') as archive:
        file_path = 'embedding.3d.vec'
        with archive.open(file_path, 'w') as embeddings_file:
            embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
            embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))
        with archive.open('dummy.vec', 'w') as dummy_file:
            dummy_file.write("c 1.0 2.3 -1.0 3.0\n".encode('utf-8'))
    embeddings_file_uri = format_embeddings_file_uri(archive_path, file_path)

    # With a min_count, a token survives only if it is frequent enough in the
    # dataset AND present in the selected pretrained file: 'a' is both, 'b' is
    # too rare, and 'c' only appears in the dummy file.
    vocab = Vocabulary.from_instances(self.dataset,
                                      min_count={'tokens': 4},
                                      pretrained_files={'tokens': embeddings_file_uri},
                                      only_include_pretrained_words=True)
    words = set(vocab.get_index_to_token_vocabulary().values())
    assert 'a' in words
    assert 'b' not in words
    assert 'c' not in words

    # Without a min_count, every dataset token found in the selected file
    # survives.
    vocab = Vocabulary.from_instances(self.dataset,
                                      pretrained_files={'tokens': embeddings_file_uri},
                                      only_include_pretrained_words=True)
    words = set(vocab.get_index_to_token_vocabulary().values())
    assert 'a' in words
    assert 'b' in words
    assert 'c' not in words
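# The filtering the assertions above rely on can be modeled in a few lines:
# with only_include_pretrained_words=True, a token is kept only if it clears
# min_count (when one is given) AND appears in the pretrained file. A sketch
# of that rule as implied by the assertions, not AllenNLP's implementation:
def _kept_tokens(counts, pretrained_words, min_count=None):
    return {token for token, count in counts.items()
            if (min_count is None or count >= min_count) and token in pretrained_words}

# e.g. if 'a' occurs 5 times and 'b' twice in the dataset (hypothetical
# counts), and the selected file contains {'a', 'b'}:
# _kept_tokens({'a': 5, 'b': 2}, {'a', 'b'}, min_count=4) -> {'a'}
# _kept_tokens({'a': 5, 'b': 2}, {'a', 'b'})              -> {'a', 'b'}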
def test_read_embedding_file_inside_archive(self):
    token2vec = {
        "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
        "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
        "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
        "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
    }
    vocab = Vocabulary()
    for token in token2vec:
        vocab.add_token_to_namespace(token)

    # A multi-file archive without a path inside it is ambiguous and must
    # be rejected.
    params = Params({
        'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
        'embedding_dim': 5
    })
    with pytest.raises(ValueError,
                       match="The archive .*/embeddings/multi-file-archive.zip contains multiple files, "
                             "so you must select one of the files inside "
                             "providing a uri of the type: "
                             "\\(path_or_url_to_archive\\)#path_inside_archive\\."):
        Embedding.from_params(vocab, params)

    for ext in ['.zip', '.tar.gz']:
        archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
        file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
        params = Params({
            'pretrained_file': file_uri,
            'embedding_dim': 5
        })
        embeddings = Embedding.from_params(vocab, params).weight.data
        for tok, vec in token2vec.items():
            i = vocab.get_token_index(tok)
            assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
def test_decode_embeddings_file_uri(self):
    first_level_paths = [
        "path/to/embeddings.gz",
        "unicode/path/òàè+ù.vec",
        "http://www.embeddings.com/path/to/embeddings.gz",
        "http://www.embeddings.com/àèìòù?query=blabla.zip",
    ]
    second_level_paths = [
        "path/to/glove.27B.300d.vec",
        "òàè+ù.vec",
        "crawl-300d-2M.vec"
    ]

    # A first-level path on its own decodes to (path, None).
    for simple_path in first_level_paths:
        assert parse_embeddings_file_uri(simple_path) == (simple_path, None)

    # format and parse must round-trip both components.
    for path1, path2 in zip(first_level_paths, second_level_paths):
        uri = format_embeddings_file_uri(path1, path2)
        decoded = parse_embeddings_file_uri(uri)
        assert decoded == (path1, path2)
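# The round trip above is consistent with the "(archive)#inner" convention the
# multi-file-archive error message spells out ("(path_or_url_to_archive)#path_inside_archive").
# A minimal sketch of a pair of functions honoring that contract; the names
# are hypothetical and this illustrates the format, not the library's actual
# implementation:
import re

def _format_uri(main_path, path_inside_archive=None):
    if path_inside_archive:
        return "({})#{}".format(main_path, path_inside_archive)
    return main_path

def _parse_uri(uri):
    match = re.fullmatch(r"\((.*)\)#(.*)", uri)
    if match:
        return match.group(1), match.group(2)  # (archive, path inside archive)
    return uri, None  # plain path, no second level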