def __enter__(self):
    """Open the input file, build the decompress/decode/deserialize
    pipeline, and return a lazy iterator over the filtered and
    transformed records.
    """
    codec = detect_compression(self.file)

    # (1) Open the input file. Keep a handle (`self.f`) to the raw file
    # so it can be closed on exit: calling `close()` on the
    # ZstdDecompressor stream does not close the underlying resource.
    self.f = self.file.open("rb")
    self.fb = self.f  # will become the decompressed, UTF-8 decoded stream

    # (2) Insert a decompressor when the file is zstd-compressed.
    # NOTE(review): `dictionary` is not defined in this scope — presumably
    # a module-level Path to the zstd dictionary; verify against the module.
    if codec == CompressionFormat.Zstandard:
        ctx = ZstdDecompressor(
            dict_data=ZstdCompressionDict(dictionary.read_bytes()))
        self.fb = ctx.stream_reader(self.fb, read_across_frames=True)

    # (3) Decode the byte stream to text.
    self.fb = TextIOWrapper(self.fb, "utf-8")

    # (4) Deserialize each line into a record.
    records = map(json_tryloads, self.fb)

    # (5) Keep only the records accepted by every filter.
    records = (rec for rec in records
               if all(flt.keep(rec) for flt in self.filters))

    # (6) Apply the transformers, in order.
    for transform in self.transformers:
        records = map(transform, records)
    return records
def zstdlines(filename, encoding="utf-8", bufsize=65536):
    """
    Generator over lines from a zstd compressed file.

    >>> for line in zstdlines("file.zst"):
    ...     print(line)

    Lines are yielded without their trailing newline. A final line that
    is not newline-terminated is yielded as well.
    """
    with open(filename, "rb") as f:
        decomp = ZstdDecompressor()
        with decomp.stream_reader(f) as reader:
            prev_line = ""
            while True:
                chunk = reader.read(bufsize)
                if not chunk:
                    break
                # We start with bytes but want unicode, which might not
                # align; so we pull one byte at a time off the reader
                # until the chunk decodes cleanly.
                while True:
                    try:
                        string_data = chunk.decode(encoding)
                    except UnicodeDecodeError:
                        extra = reader.read(1)
                        if not extra:
                            # EOF in the middle of a multi-byte sequence:
                            # re-raise rather than spinning forever on
                            # read(1) returning b"".
                            raise
                        chunk += extra
                    else:
                        break
                lines = string_data.split("\n")
                for i, line in enumerate(lines[:-1]):
                    if i == 0:
                        # Complete the line started in the previous chunk.
                        line = prev_line + line
                    yield line
                prev_line = lines[-1]
            # Emit the trailing line when the file does not end with a
            # newline (the previous version silently dropped it).
            if prev_line:
                yield prev_line
def from_file(cls, file):
    """Build an instance from a (possibly zstd-compressed) table file.

    Each data line is `prefix<TAB>origin,origin,...`; lines starting
    with ';' are comments and are skipped.
    """
    # TODO: Unified "file loader"
    # (detect_compression is used in many places)
    codec = detect_compression(file)
    entries = []
    with open(file, "rb") as fh:
        stream = fh
        if codec == CompressionFormat.Zstandard:
            stream = ZstdDecompressor().stream_reader(stream)
        stream = TextIOWrapper(stream, "utf-8")
        for line in stream:
            if line.startswith(";"):
                continue
            prefix, origins = line.split("\t")
            entries.append((prefix, [int(o) for o in origins.split(",")]))
    return cls(entries)
def zopen(fn, mode="r", *args, **kwargs):
    """Generator-based opener for possibly-compressed files.

    Dispatches on the filename suffix (.gz, .bz2, .xz, .zst, or plain)
    and yields exactly one readable file object. Intended to be driven
    as a context manager (e.g. wrapped with contextlib.contextmanager):
    every underlying handle is closed afterwards, even if the consumer
    raises or the generator is closed early.

    With a text mode (no "b"), the stream is decoded using
    ``kwargs["encoding"]`` (default "utf-8"); with "b" in `mode`, raw
    decompressed bytes are yielded.
    """
    import codecs

    if fn.endswith(".gz"):
        import gzip
        objs = [gzip.open(fn, "rb")]
    elif fn.endswith(".bz2"):
        import bz2
        objs = [bz2.open(fn, "rb")]
    elif fn.endswith(".xz"):
        import lzma
        objs = [lzma.open(fn, "rb")]
    elif fn.endswith(".zst"):
        from zstandard import ZstdDecompressor
        try:
            # documentation says KiB but it seems to be bytes
            ctx = ZstdDecompressor(max_window_size=1024 * 1024 * 1024 * 2)
        except Exception:
            # fallback in case that changes
            # (was a bare `except:`, which also swallowed KeyboardInterrupt)
            ctx = ZstdDecompressor(max_window_size=1024 * 1024 * 2)
        f1 = open(fn, "rb", 512 * 1024)
        f2 = ctx.stream_reader(f1)
        # Close order matters: the reader first, then the raw file.
        objs = [f2, f1]
    else:
        objs = [open(fn, "rb", 512 * 1024)]

    try:
        if "b" not in mode:
            enc = kwargs.get("encoding", "utf-8")
            # yield io.TextIOWrapper(io.BufferedReader(f2))
            yield codecs.getreader(enc)(objs[0])
        else:
            yield objs[0]
    finally:
        # The original only closed on a clean exit; an exception in the
        # consumer (or generator .close()) leaked every handle.
        for obj in objs:
            obj.close()
def _read_rel_zs_rows(filepath, chunk_size=8 * 1000 * 1000):
    """Read all rows from a zstd-compressed file.

    Streams the decompressed bytes in `chunk_size` pieces, handing each
    batch of chunks to `_consume_rows` once at least one newline (i.e.
    at least one complete row) has been seen, and carrying the trailing
    partial row over to the next batch.
    """
    from zstandard import ZstdDecompressor
    rows = []
    with open(filepath, "rb") as fh:
        with ZstdDecompressor().stream_reader(fh) as reader:
            pending = []
            done = False
            while not done:
                # Accumulate chunks until a full row appears or EOF.
                while True:
                    piece = reader.read(chunk_size)
                    if not piece:
                        done = True
                        break
                    pending.append(piece)
                    if b"\n" in piece:
                        break
                # _consume_rows returns the complete rows plus the
                # unterminated remainder, which seeds the next batch.
                (complete, remainder) = _consume_rows(pending)
                rows += complete
                pending = [remainder]
    return rows
continue out += ' '.join(tokenizer.tokenize(sent)) + '\n' sent = '' if sent: out += ' '.join(tokenizer.tokenize(sent)) + '\n' return out dctx = ZstdDecompressor() for group in tqdm(os.listdir(source_dir), ncols=80, position=0): group_dir = os.path.join(source_dir, group) group_outs = [] for filename in tqdm(os.listdir(group_dir), ncols=80, desc=group, position=1): filepath = os.path.join(group_dir, filename) with open(filepath, 'rb') as f: with dctx.stream_reader(f) as r: text_stream = io.TextIOWrapper(r, encoding='utf-8') out = convert_file(text_stream) group_outs.append(out) out = '\n'.join(group_outs) out_path = os.path.join(dest_dir, '{}.txt'.format(group)) with open(out_path, 'w') as f: f.write(out)
useBlockCompression = blockMagic == b'NCZBLOCK' if useBlockCompression: BlockHeader = Block(f) if BlockHeader.blockSizeExponent < 14 or BlockHeader.blockSizeExponent > 32: raise ValueError( "Corrupted NCZBLOCK header: Block size must be between 14 and 32" ) blockSize = 2**BlockHeader.blockSizeExponent pos = f.tell() with open(argv[2], 'wb+') as o: o.write(header) decompressedBytes = 0 blockID = 0 dctx = ZstdDecompressor() if not useBlockCompression: decompressor = dctx.stream_reader(f) while True: if useBlockCompression: if BlockHeader.compressedBlockSizeList[blockID] < blockSize: decompressor = dctx.stream_reader(f) inputChunk = decompressor.read(blockSize) decompressedBytes += len(inputChunk) o.write(inputChunk) decompressor.flush() o.flush() print( 'Block', str(blockID + 1) + '/' + str(BlockHeader.numberOfBlocks)) else: o.write(f.read(blockSize))