def __init__(self, words: Iterable[str], word2vec: Word2Vector,
             word2index: Optional[Callable[[str], int]] = None,
             missing_ok: bool = True, verbose: bool = False,
             close_hook: Optional[Callable] = None):
    """
    Args:
        words: the words to load vectors for
        word2vec: object that implements ``word2vec[word]`` and ``word in word2vec``
        word2index: function that returns the index (position) of a word inside the
            file; this enables an optimization for formats like VVM that store vectors
            sequentially in the same file.
        missing_ok: if True, words not found in the file are silently skipped;
            if False, a ``KeyError`` is raised
        verbose: if True, print progress information
        close_hook: function to call when closing this loader
    """
    super().__init__(words, missing_ok)
    self.verbose = verbose
    self.word2vec = word2vec
    self.word2pos = word2index
    echo = print if verbose else noop

    missing_words = set(words)
    num_words = len(missing_words)
    available_words = []
    for word in progbar(self.words, enable=verbose,
                        desc='Collecting all available words'):
        if word in word2vec:
            available_words.append(word)
            missing_words.remove(word)
        elif not missing_ok:
            raise KeyError('word not found in the file: ' + word)

    if word2index:
        echo('Sorting words based on their position in the file')
        available_words.sort(key=self.word2pos)

    self._missing_words = missing_words
    self._available_words = available_words
    self._iterator = (WordVector(word, word2vec[word]) for word in available_words)
    desc = 'Loading available vectors (%d of %d)' % (len(available_words), num_words)
    self._progbar = progbar(enable=verbose, total=len(available_words), desc=desc)
    self.close_hook = close_hook
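
# --- Hedged usage sketch (illustration only, not part of the library) --------
# Shows the word2index optimization described above: when the backing file
# stores vectors sequentially, sorting the requested words by their position
# turns random access into a single forward pass. The plain dicts below stand
# in for any object supporting ``word in word2vec`` / ``word2vec[word]`` and
# for a hypothetical word -> file-position mapping.
import numpy as np

word2vec = {'cat': np.array([0.1, 0.2]),
            'dog': np.array([0.3, 0.4]),
            'fish': np.array([0.5, 0.6])}
file_position = {'dog': 0, 'fish': 1, 'cat': 2}      # hypothetical on-disk order

requested = ['cat', 'dog', 'unicorn']
available = [w for w in requested if w in word2vec]  # 'unicorn' skipped (missing_ok)
available.sort(key=file_position.__getitem__)        # read in file order: dog, then cat
vectors = [(w, word2vec[w]) for w in available]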
def __init__(self, file: 'EmbFile', words: Iterable[str],
             missing_ok: bool = True, verbose: bool = False):
    super().__init__(words, missing_ok)
    self.file = file
    self.verbose = verbose
    self._missing_words = set(words)
    self._reader = file.reader()
    self._progbar = progbar(enable=verbose, total=self.file.vocab_size,
                            desc='Reading')
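
# --- Hedged usage sketch (illustration only, not part of the library) --------
# The loader above scans the file once with ``file.reader()`` and crosses words
# off a "missing" set as they are encountered. The generator below mimics such a
# sequential reader of (word, vector) pairs.
import numpy as np

def fake_reader():
    yield 'cat', np.zeros(3)
    yield 'dog', np.ones(3)

missing = {'dog', 'unicorn'}
found = {}
for word, vector in fake_reader():
    if word in missing:
        found[word] = vector
        missing.remove(word)
# found == {'dog': array([1., 1., 1.])}; 'unicorn' remains in ``missing``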
def _create(cls, out_path: Path, word_vectors: Iterable[Tuple[str, VectorType]],
            vector_size: int, vocab_size: int, compression: Optional[str] = None,
            verbose: bool = True, encoding: str = DEFAULT_ENCODING,
            precision: int = 5) -> Path:
    number_fmt = '%.{}f'.format(precision)

    if not vocab_size:
        raise ValueError('unable to infer vocab_size; you must manually provide it')

    # Because of a bug in io.TextIOWrapper when used in combination with bz2 and lzma,
    # we have to complicate things a little bit. For more info about the bug (which I
    # discovered while testing this code) see:
    # https://stackoverflow.com/questions/55171439/python-bz2-and-lzma-in-mode-wt-dont-write-the-bom-while-gzip-does-why
    # The file is opened in binary mode and the text is encoded with an
    # IncrementalEncoder, so that the BOM is written only at the beginning.
    encoder = codecs.getincrementalencoder(encoding)()
    encode = encoder.encode

    with open_file(out_path, 'wb', compression=compression) as fout:
        fout.write(encode('%d %d\n' % (vocab_size, vector_size)))
        i = -1
        for i, (word, vector) in progbar(enumerate(word_vectors), enable=verbose,
                                         desc='Writing', total=vocab_size):
            if ' ' in word:
                raise ValueError(
                    'word number %d contains one or more spaces: %r' % (i, word))
            check_vector_size(i, vector, vector_size)
            fout.write(encode(word + ' '))
            vector_string = ' '.join(number_fmt % num for num in vector)
            fout.write(encode(vector_string))
            fout.write(encode('\n'))

    warn_if_wrong_vocab_size(
        vocab_size, actual_size=i + 1,
        extra_info='As a consequence, the header of the file has a wrong vocab_size. '
                   'You can change it by editing the file.')
    return out_path
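
# --- Hedged sketch of the IncrementalEncoder trick used above ----------------
# ``str.encode('utf-16')`` prepends a BOM on every call, while an
# IncrementalEncoder emits it only for the first chunk, which is what we need
# when writing a text file line by line through a binary stream.
import codecs

encode = codecs.getincrementalencoder('utf-16')().encode
first = encode('2 3\n')                # header chunk: starts with the BOM
second = encode('cat 0.1 0.2 0.3\n')   # later chunks: no BOM
assert first.startswith(codecs.BOM_UTF16)
assert not second.startswith(codecs.BOM_UTF16)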
def _create(cls, out_path: Path, word_vectors: Iterable[Tuple[str, VectorType]],
            vector_size: int, vocab_size: Optional[int], compression: Optional[str] = None,
            verbose: bool = True, encoding: str = DEFAULT_ENCODING,
            dtype: Optional[DType] = None) -> Path:
    echo = print if verbose else noop
    encoding = _bom_free_version(encoding)

    if not dtype:
        (_, first_vector), word_vectors = glance_first_element(word_vectors)
        dtype = first_vector.dtype
    else:
        dtype = numpy.dtype(dtype)

    if not vocab_size:
        raise ValueError('unable to infer vocab_size; you must manually provide it')

    with open_file(out_path, 'wb', compression=compression) as file:
        header_line = '%d %d\n' % (vocab_size, vector_size)
        echo('Writing the header: %s' % header_line)
        header_bytes = header_line.encode(encoding)
        file.write(header_bytes)

        i = -1
        for i, (word, vector) in progbar(enumerate(word_vectors), verbose,
                                         total=vocab_size):
            if ' ' in word:
                raise ValueError(
                    'word number %d contains one or more spaces: %r' % (i, word))
            file.write((word + ' ').encode(encoding))
            check_vector_size(i, vector, vector_size)
            file.write(numpy.asarray(vector, dtype).tobytes())

    warn_if_wrong_vocab_size(
        vocab_size, actual_size=i + 1,
        extra_info='As a consequence, the header of the file has a wrong vocab_size')
    return out_path
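
# --- Hedged sketch of the binary record layout written above ------------------
# After the text header, each record is the encoded word, a space, then the raw
# bytes of the vector (no separator), i.e. vector_size * dtype.itemsize bytes.
# This roundtrip uses an in-memory buffer purely for illustration.
import io
import numpy as np

buf = io.BytesIO()
buf.write(b'1 3\n')                                   # header: vocab_size vector_size
vec = np.array([0.1, 0.2, 0.3], dtype=np.float32)
buf.write(b'cat ' + vec.tobytes())

buf.seek(0)
assert buf.readline() == b'1 3\n'
word = b''
while not word.endswith(b' '):                        # read the word up to the space
    word += buf.read(1)
restored = np.frombuffer(buf.read(3 * 4), dtype=np.float32)
assert word.strip() == b'cat' and np.allclose(restored, vec)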
def _create(cls, out_path: Path, word_vectors: Iterable[Tuple[str, VectorType]],
            vector_size: int, vocab_size: Optional[int], compression: Optional[str] = None,
            verbose: bool = True, encoding: str = DEFAULT_ENCODING,
            dtype: Optional[DType] = None) -> Path:
    echo = print if verbose else noop

    if not dtype:
        (_, vector), word_vectors = glance_first_element(word_vectors)
        dtype = vector.dtype
    else:
        dtype = numpy.dtype(dtype)

    # Write everything in a temporary directory and then pack it into a TAR file
    tempdir = Path(tempfile.mkdtemp())
    vocab_tmp_path = tempdir / VOCAB_FILENAME
    vectors_tmp_path = tempdir / VECTORS_FILENAME
    meta_tmp_path = tempdir / META_FILENAME

    with open(vocab_tmp_path, 'wt', encoding=encoding) as vocab_file, \
         open(vectors_tmp_path, 'wb') as vectors_file:  # noqa
        desc = 'Generating {} and {} file'.format(VOCAB_FILENAME, VECTORS_FILENAME)
        i = -1
        for i, (word, vector) in progbar(enumerate(word_vectors), verbose,
                                         desc=desc, total=vocab_size):
            if '\n' in word:
                raise ValueError(
                    'word number %d contains one or more newline characters: %r'
                    % (i, word))
            vocab_file.write(word)
            vocab_file.write('\n')
            check_vector_size(i, vector, vector_size)
            vectors_file.write(numpy.asarray(vector, dtype).tobytes())

    actual_vocab_size = i + 1
    warn_if_wrong_vocab_size(
        vocab_size, actual_vocab_size,
        extra_info='the actual size will be written in meta.json')
    vocab_size = actual_vocab_size

    echo('Writing {}...'.format(META_FILENAME))
    metadata = {
        "vocab_size": vocab_size,
        "vector_size": vector_size,
        "dtype": dtype.str,
        "encoding": encoding,
    }
    with open(meta_tmp_path, 'w') as meta_file:
        json.dump(metadata, meta_file, indent=2)

    if not compression:
        tar_path = out_path
        tar_mode = 'w'
    elif compression in _TAR_COMPRESSIONS:
        tar_path = out_path
        tar_mode = 'w:' + compression
    else:
        warnings.warn(
            'A VVM file is just a TAR file; you should compress it using one of the '
            'formats directly supported by tarfile ({}). Using another compression '
            'format requires creating a temporary uncompressed TAR file first, '
            'doubling the required time!'.format(', '.join(_TAR_COMPRESSIONS)))
        tar_path = out_path.with_suffix(out_path.suffix + '.tmp')
        tar_mode = 'w'

    echo('Packing all the files together')
    with tarfile.open(tar_path, tar_mode) as tar_file:
        tar_file.add(str(vocab_tmp_path), VOCAB_FILENAME)
        tar_file.add(str(vectors_tmp_path), VECTORS_FILENAME)
        tar_file.add(str(meta_tmp_path), META_FILENAME)
    shutil.rmtree(tempdir)

    if compression and compression not in _TAR_COMPRESSIONS:
        echo('Compressing to %s file: %s' % (compression, out_path))
        with open_file(out_path, 'wb', compression=compression) as compressed_file:
            with open(tar_path, 'rb') as non_compressed_file:
                shutil.copyfileobj(non_compressed_file, compressed_file)
        os.remove(tar_path)

    return out_path
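
# --- Hedged sketch of the VVM layout produced above ---------------------------
# A VVM file is a plain TAR archive with three members: the vocabulary (one word
# per line), the raw vector bytes, and a JSON metadata file. The member names
# below ('vocab.txt', 'vectors.bin', 'meta.json') are hypothetical stand-ins for
# VOCAB_FILENAME, VECTORS_FILENAME and META_FILENAME; the archive is built in
# memory purely for illustration.
import io
import json
import tarfile
import numpy as np

def add_member(tar, name, data: bytes):
    info = tarfile.TarInfo(name)
    info.size = len(data)
    tar.addfile(info, io.BytesIO(data))

meta = {'vocab_size': 2, 'vector_size': 3, 'dtype': '<f4', 'encoding': 'utf-8'}
buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode='w') as tar:
    add_member(tar, 'vocab.txt', b'cat\ndog\n')
    add_member(tar, 'vectors.bin', np.zeros((2, 3), dtype='<f4').tobytes())
    add_member(tar, 'meta.json', json.dumps(meta).encode())

buf.seek(0)
with tarfile.open(fileobj=buf, mode='r') as tar:
    assert sorted(tar.getnames()) == ['meta.json', 'vectors.bin', 'vocab.txt']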