Example #1
    def __enter__(self):
        codec = detect_compression(self.file)

        # (1) Open the input file

        # We keep a reference (`f`) to the original file so we can
        # close it on exit: calling `close()` on the zstd stream
        # reader does not close the underlying resource.
        self.f = self.file.open("rb")

        # `fb` will become the UTF-8-decoded, decompressed view of the file
        self.fb = self.f

        # (2) Setup the decompressor, if needed
        if codec == CompressionFormat.Zstandard:
            dict_data = ZstdCompressionDict(dictionary.read_bytes())
            ctx = ZstdDecompressor(dict_data=dict_data)
            self.fb = ctx.stream_reader(self.fb, read_across_frames=True)

        # (3) Decode the file
        self.fb = TextIOWrapper(self.fb, "utf-8")

        # (4) Deserialize the records
        stream = map(json_tryloads, self.fb)

        # (5) Apply the filters
        stream = filter(
            lambda record: all(fn.keep(record) for fn in self.filters), stream)

        # (6) Apply the transformers
        for fn in self.transformers:
            stream = map(fn, stream)

        return stream
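Not shown above is the matching `__exit__`. A plausible counterpart (an assumption, not the project's actual code) closes the outermost wrapper first and then the raw handle kept in `self.f`:

    def __exit__(self, exc_type, exc_value, traceback):
        # Closing the TextIOWrapper cascades to the zstd reader, but (per
        # the comment in step (1)) not to the raw file, so close that too.
        self.fb.close()
        self.f.close()
        return False  # propagate any exception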
Example #2
File: utils.py Project: miku/fuzzycat
def zstdlines(filename, encoding="utf-8", bufsize=65536):
    """
    Generator over lines from a zstd compressed file.

    >>> for line in zstdlines("file.zst"):
    ...     print(line)

    """
    with open(filename, "rb") as f:
        decomp = ZstdDecompressor()
        with decomp.stream_reader(f) as reader:
            prev_line = ""
            while True:
                chunk = reader.read(bufsize)
                if not chunk:
                    break
                while True:
                    # We start with bytes but want unicode, which might not
                    # align; so we jitter around the end to complete the
                    # codepoint.
                    try:
                        string_data = chunk.decode(encoding)
                    except UnicodeDecodeError:
                        extra = reader.read(1)
                        if not extra:
                            # EOF in the middle of a codepoint: the input
                            # is truncated or not valid in this encoding.
                            raise
                        chunk = chunk + extra
                    else:
                        break
                lines = string_data.split("\n")
                for i, line in enumerate(lines[:-1]):
                    if i == 0:
                        line = prev_line + line
                    yield line
                prev_line = lines[-1]
            # A file that does not end in a newline leaves its final
            # line in the buffer; yield it as well.
            if prev_line:
                yield prev_line
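For comparison, the byte/codepoint juggling can be delegated to io.TextIOWrapper, which buffers the stream and handles multi-byte sequences that straddle read boundaries. A minimal sketch of the same generator in that style (our naming, not fuzzycat's):

import io
from zstandard import ZstdDecompressor

def zstdlines_textio(filename, encoding="utf-8"):
    # TextIOWrapper takes care of decoding and line splitting, so no
    # manual jitter around chunk boundaries is needed.
    with open(filename, "rb") as f:
        with ZstdDecompressor().stream_reader(f) as reader:
            for line in io.TextIOWrapper(reader, encoding=encoding):
                yield line.rstrip("\n")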
Example #3
@classmethod
def from_file(cls, file):
    data = []
    # TODO: Unified "file loader"
    # (detect_compression is used in many places)
    codec = detect_compression(file)
    with open(file, "rb") as f:
        if codec == CompressionFormat.Zstandard:
            ctx = ZstdDecompressor()
            f = ctx.stream_reader(f)
        f = TextIOWrapper(f, "utf-8")
        for line in f:
            # Skip comment lines.
            if line.startswith(";"):
                continue
            prefix, origins = line.split("\t")
            origins = [int(x) for x in origins.split(",")]
            data.append((prefix, origins))
    return cls(data)
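The TODO above asks for a unified file loader. A minimal sketch of such a helper (hypothetical name, assuming the detect_compression/CompressionFormat utilities used in these examples):

from contextlib import contextmanager
from io import TextIOWrapper

from zstandard import ZstdDecompressor

@contextmanager
def open_maybe_zstd(path, encoding="utf-8"):
    # Open `path` as text, transparently decompressing Zstandard input.
    f = open(path, "rb")
    try:
        fb = f
        if detect_compression(path) == CompressionFormat.Zstandard:
            fb = ZstdDecompressor().stream_reader(fb)
        yield TextIOWrapper(fb, encoding)
    finally:
        # As noted in Example #1, closing the zstd reader does not close
        # the raw file, so close the original handle explicitly.
        f.close()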
Example #4
File: util.py Project: 9001/softchat
def zopen(fn, mode="r", *args, **kwargs):
    import codecs

    objs = None
    if fn.endswith(".gz"):
        import gzip

        objs = [gzip.open(fn, "rb")]

    elif fn.endswith(".bz2"):
        import bz2

        objs = [bz2.open(fn, "rb")]

    elif fn.endswith(".xz"):
        import lzma

        objs = [lzma.open(fn, "rb")]

    elif fn.endswith(".zst"):
        from zstandard import ZstdDecompressor

        try:
            # documentation says KiB but it seems to be bytes
            ctx = ZstdDecompressor(max_window_size=1024 * 1024 * 1024 * 2)
        except Exception:
            # fallback in case that changes
            ctx = ZstdDecompressor(max_window_size=1024 * 1024 * 2)

        f1 = open(fn, "rb", 512 * 1024)
        f2 = ctx.stream_reader(f1)
        objs = [f2, f1]

    else:
        objs = [open(fn, "rb", 512 * 1024)]

    if "b" not in mode:
        enc = kwargs.get("encoding", "utf-8")
        # yield io.TextIOWrapper(io.BufferedReader(f2))
        yield codecs.getreader(enc)(objs[0])
    else:
        yield objs[0]

    for obj in objs:
        obj.close()
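Since zopen yields exactly once and then closes every handle it opened, it is shaped like a context-manager body. A hypothetical usage sketch (softchat may wrap it differently):

from contextlib import contextmanager

zopen_cm = contextmanager(zopen)

with zopen_cm("chat.jsonl.zst") as f:  # hypothetical input file
    for line in f:
        print(line, end="")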
Example #5
def _read_rel_zs_rows(filepath, chunk_size=8 * 1000 * 1000):
    from zstandard import ZstdDecompressor

    with open(filepath, "rb") as fh:
        ctx = ZstdDecompressor()
        with ctx.stream_reader(fh) as reader:
            over = False
            chunks = []
            rows = []
            while not over:
                have_row = False
                while not have_row:
                    chunk = reader.read(chunk_size)
                    if not chunk:
                        over = True
                        break
                    if b"\n" in chunk:
                        have_row = True
                    chunks.append(chunk)
                (new_rows, semi_row) = _consume_rows(chunks)
                rows += new_rows
                chunks = [semi_row]
    return rows
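The _consume_rows helper is not shown. A plausible sketch, assuming its contract is "take the buffered byte chunks, return the complete rows plus the trailing partial row":

def _consume_rows(chunks):
    # Join the buffered chunks, split into rows on newlines, and hand the
    # trailing partial row back as the seed for the next buffer.
    buf = b"".join(chunk for chunk in chunks if chunk)
    parts = buf.split(b"\n")
    return [row.decode("utf-8") for row in parts[:-1]], parts[-1]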
Example #6
                continue

            out += ' '.join(tokenizer.tokenize(sent)) + '\n'
            sent = ''

    if sent:
        out += ' '.join(tokenizer.tokenize(sent)) + '\n'

    return out


dctx = ZstdDecompressor()

for group in tqdm(os.listdir(source_dir), ncols=80, position=0):
    group_dir = os.path.join(source_dir, group)
    group_outs = []

    for filename in tqdm(os.listdir(group_dir), ncols=80, desc=group, position=1):
        filepath = os.path.join(group_dir, filename)

        with open(filepath, 'rb') as f:
            with dctx.stream_reader(f) as r:
                text_stream = io.TextIOWrapper(r, encoding='utf-8')
                out = convert_file(text_stream)
        group_outs.append(out)

    out = '\n'.join(group_outs)
    out_path = os.path.join(dest_dir, '{}.txt'.format(group))
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(out)
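Note that a single ZstdDecompressor instance (dctx) is reused across every file; python-zstandard permits reusing a decompressor for multiple sequential operations, though instances are not thread-safe. A minimal illustration of the same pattern (function name is ours):

import io
from zstandard import ZstdDecompressor

dctx = ZstdDecompressor()  # one decompressor, many streams

def read_zst_text(path, encoding='utf-8'):
    # Each call creates a fresh stream reader from the shared decompressor.
    with open(path, 'rb') as f, dctx.stream_reader(f) as r:
        return io.TextIOWrapper(r, encoding=encoding).read()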
Example #7
useBlockCompression = blockMagic == b'NCZBLOCK'
if useBlockCompression:
    BlockHeader = Block(f)
    if BlockHeader.blockSizeExponent < 14 or BlockHeader.blockSizeExponent > 32:
        raise ValueError(
            "Corrupted NCZBLOCK header: Block size must be between 14 and 32"
        )
    blockSize = 2**BlockHeader.blockSizeExponent
pos = f.tell()
with open(argv[2], 'wb+') as o:
    o.write(header)
    decompressedBytes = 0
    blockID = 0
    dctx = ZstdDecompressor()
    if not useBlockCompression:
        # Stream mode: a single zstd frame covers the whole payload.
        decompressor = dctx.stream_reader(f)
    while True:
        if useBlockCompression:
            # Block mode: each block is an independent frame, so a fresh
            # stream reader is created per compressed block.
            if BlockHeader.compressedBlockSizeList[blockID] < blockSize:
                decompressor = dctx.stream_reader(f)
                inputChunk = decompressor.read(blockSize)
                decompressedBytes += len(inputChunk)
                o.write(inputChunk)
                decompressor.flush()
                o.flush()
                print('Block', str(blockID + 1) + '/' + str(BlockHeader.numberOfBlocks))
            else:
                # Blocks that did not shrink are stored raw; copy them as-is.
                o.write(f.read(blockSize))