예제 #1
0
def test(integration_case: _IntegrationCase, data_pattern: bytes,
         tmp_path: Path) -> None:
    """Re-create a reference archive in write mode and compare SHA-256 digests.

    The stream/block layout found in the case metadata is replayed through
    ``xz.open(..., "w")``, each block receiving the next ``length`` bytes of
    ``data_pattern``. The digest of the generated file must equal the digest
    of the reference archive.
    """
    reference_path, layout = integration_case
    remaining = memoryview(data_pattern)

    if "padding" in reference_path.name:
        pytest.skip("Write mode does not support stream padding yet")

    output_path = tmp_path / "archive.xz"

    with xz.open(output_path, "w") as writer:
        for stream_spec in layout["streams"]:
            # The check is assigned before the stream change is requested.
            writer.check = stream_spec["check"]
            writer.change_stream()
            for block_spec in stream_spec["blocks"]:
                # Likewise, filters are assigned before the block change.
                writer.filters = block_spec.get("filters")
                writer.change_block()
                payload = remaining[:block_spec["length"]]
                remaining = remaining[block_spec["length"]:]
                writer.write(payload)

    # The layout must account for every byte of the pattern.
    assert not remaining

    reference_digest = sha256(reference_path.read_bytes()).hexdigest()
    output_digest = sha256(output_path.read_bytes()).hexdigest()

    assert output_digest == reference_digest
예제 #2
0
 def _tryOpenGlobalFile(filename):
     """Open ``filename`` into the module-level reader slot unless it is already set.

     This is not thread-safe! It is meant to be executed in a process pool, in which each worker has its own
     set of global variables. Using a global variable is safe there because there is one process pool per
     BlockParallelReader, meaning the filename is a constant for each worker.
     """
     global _parallelXzReaderFile
     if _parallelXzReaderFile is not None:
         # Already opened earlier in this process; keep the existing file object.
         return
     _parallelXzReaderFile = xz.open(filename, 'rb')
예제 #3
0
    def __init__(self, filename: str, parallelization: Optional[int] = None):
        """Open the xz archive and initialize the base reader with its block boundaries.

        Args:
            filename: Path of the xz file; it is opened here and also passed on
                as the single argument accompanying ``_initWorker2``.
            parallelization: Forwarded unchanged to the base class.
        """
        xzFile = xz.open(filename, 'rb')

        # Work on a copy so that appending the final boundary does not modify
        # the object returned by the file's own ``block_boundaries``.
        boundaries = xzFile.block_boundaries.copy()
        boundaries.append(len(xzFile))

        initArgs = (filename, )
        super().__init__(filename, xzFile, boundaries, parallelization,
                         ParallelXZReader._initWorker2, initArgs)
        self._openFiles()
예제 #4
0
    def _testSequentialReading(archivePath: str, bufferSize: int,
                               parallelization: int):
        """Read the archive sequentially with two readers and compare each chunk.

        The reference reader is ``xz.open``. The reader under test is
        ``lzma.open`` when ``parallelization`` is 1 and ``ParallelXZReader``
        otherwise. Both are read in ``bufferSize`` steps until a short read.
        """
        with xz.open(archivePath, 'rb') as serialFile:
            if parallelization == 1:
                readerUnderTest = lzma.open(archivePath)
            else:
                readerUnderTest = ParallelXZReader(archivePath, parallelization)

            with readerUnderTest as parallelFile:
                bytesRead = 0
                moreData = True
                while moreData:
                    expected = serialFile.read(bufferSize)
                    actual = parallelFile.read(bufferSize)
                    assert len(expected) == len(actual)
                    assert expected == actual
                    bytesRead += len(expected)
                    # A read shorter than requested ends the loop.
                    moreData = len(expected) >= bufferSize

                # Only readers exposing block boundaries can be checked
                # against their last boundary.
                if hasattr(parallelFile, 'blockBoundaries'):
                    assert bytesRead == parallelFile.blockBoundaries[-1]
예제 #5
0
def benchmark_python_xz():
    """Decompress the xz file named by ``sys.argv[1]`` and report time and memory usage.

    The file is read in 32 MiB requests. While reading, a progress line with
    the elapsed time and ``ru_maxrss`` is printed whenever more than 5 seconds
    have passed since the previous report. Afterwards ``ru_maxrss`` is printed
    again after each cleanup step (explicit close, leaving the ``with`` block,
    ``del``, garbage collection), followed by the total size and duration.
    """
    print("== Benchmark xz file decompression ==")

    # Heap inspection handle. It is only used when ``result`` holds a baseline,
    # which only the disabled snippet inside the loop would set.
    h = guppy.hpy()
    result = None

    size = 0  # Total number of bytes returned by read() so far.
    t0 = time.time()
    with xz.open(sys.argv[1], 'rb') as file:
        # Inside the loop, t1 is the time of the most recent progress report.
        t1 = time.time()

        while True:
            readSize = len(file.read(32*1024*1024))
            if readSize == 0:
                break
            size += readSize

            if time.time() - t1 > 5:
                t1 = time.time()
                # NOTE(review): the "MiB" label assumes ru_maxrss is reported in KiB (as on Linux) - confirm
                # for other platforms.
                print(f"{t1 - t0:.2f}s {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss // 1024} MiB RSS")
                gc.collect()
                # Disabled heap diff against a baseline taken at the first report.
                # Note that this heap call would add a reference count to the file object...
                #if result is None:
                #    result = h.heap()
                #else:
                #    print((h.heap() - result))

        print(f"After finishing loop: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss // 1024} MiB RSS")
        # Close explicitly so that a measurement can be printed before leaving the with block.
        # NOTE(review): leaving the with block presumably closes again; assumed harmless - confirm.
        file.close()
        print(f"After closing file: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss // 1024} MiB RSS")

    print("file closed?", file.closed)
    # From here on t1 is the end timestamp; it is taken after the file has been closed.
    t1 = time.time()

    print("File type:", file)
    print("File type:", type(file))
    print("File referred by:", gc.get_referrers(file))
    # NOTE(review): ru_maxrss is documented as the maximum resident set size, so the values printed below
    # presumably cannot drop after cleanup - confirm this is the intended metric.
    print(f"After closing file: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss // 1024} MiB RSS")
    del file
    print(f"After deleting file: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss // 1024} MiB RSS")
    gc.collect()
    print(f"After garbage collection: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss // 1024} MiB RSS")
    # Never true while the baseline snippet in the loop stays commented out.
    if result is not None:
        print((h.heap() - result))
    print(f"Reading {size} B took: {t1-t0}s")
예제 #6
0
    def _testRandomReads(archivePath: str, samples: int, parallelization: int):
        """Compare random-access reads of two readers on the same archive.

        The reference reader is ``xz.open``. The reader under test is
        ``lzma.open`` when ``parallelization`` is 1 and ``ParallelXZReader``
        otherwise. For each sample a random offset and a random read length
        are drawn, both readers seek and read, and the results must be equal.

        Args:
            archivePath: Path of the xz archive to read.
            samples: Number of random seek/read pairs to perform.
            parallelization: Selects the reader under test (see above) and is
                passed on to ``ParallelXZReader``.

        Raises:
            AssertionError: If the two readers return different data.
        """
        with xz.open(archivePath, 'rb') as serialFile, lzma.open(
                archivePath) if parallelization == 1 else ParallelXZReader(
                    archivePath, parallelization) as parallelFile:
            if hasattr(parallelFile, 'blockBoundaries'):
                fileSize = parallelFile.blockBoundaries[-1]
            else:
                # seek() takes (offset, whence). Passing io.SEEK_END as the
                # only argument would seek to absolute offset 2 instead of
                # the end of the file.
                parallelFile.seek(0, io.SEEK_END)
                fileSize = parallelFile.tell()
                parallelFile.seek(0)

            for _ in range(samples):
                # Offsets may lie one byte past the end on purpose.
                offset = random.randint(0, fileSize + 1)
                # Kept separate from fileSize so that every sample is drawn
                # against the real file size. Half the time read past the end.
                readSize = random.randint(0, (fileSize + 1 - offset) * 2)

                serialFile.seek(offset)
                serialData = serialFile.read(readSize)
                parallelFile.seek(offset)
                parallelData = parallelFile.read(readSize)
                assert len(serialData) == len(parallelData)
                assert serialData == parallelData
예제 #7
0
 'rar':
 CompressionInfo(
     ['rar'],
     [],
     'rarfile',
     lambda x: x.read(6) == b'Rar!\x1A\x07',
     lambda x: rarfile.RarFile(x),
 ),
 'xz':
 CompressionInfo(
     ['xz'],
     ['txz'],
     'lzmaffi' if 'lzmaffi' in sys.modules else 'xz',
     lambda x: x.read(6) == b"\xFD7zXZ\x00",
     (lambda x: lzmaffi.open(x)) if 'lzmaffi' in sys.modules else
     (lambda x: xz.open(x)),
 ),
 'zip':
 CompressionInfo(
     ['zip'],
     [],
     'zipfile',
     lambda x: x.read(2) == b'PK',
     lambda x: zipfile.ZipFile(x),
 ),
 'zst':
 CompressionInfo(
     ['zst', 'zstd'],
     ['tzst'],
     'indexed_zstd',
     lambda x: x.read(4) == (0xFD2FB528).to_bytes(4, 'little'),
예제 #8
0

if __name__ == '__main__':
    # Print the versions of the libraries involved. sys.modules.get() is used
    # so that a module that was never imported is skipped instead of raising
    # KeyError (hasattr(None, '__version__') is False).
    for module in ('zstandard', 'indexed_zstd', 'ratarmountcore'):
        loadedModule = sys.modules.get(module)
        if hasattr(loadedModule, '__version__'):
            print(module, "version:", loadedModule.__version__)
    print()

    # Accept either the base name or one of the compressed file names, and
    # reduce it to the base name by stripping at most one known suffix.
    filename = sys.argv[1]
    for suffix in ('.xz', '.zst'):
        if filename.endswith(suffix):
            filename = filename[:-len(suffix)]
            break

    # NOTE(review): the readers below are handed to the helpers without being
    # closed here; presumably the helpers consume them - confirm.
    xzPath = filename + '.xz'
    if os.path.isfile(xzPath):
        compareReading(xz.open(xzPath, 'rb'), ParallelXZReader(xzPath, os.cpu_count()))
        benchmarkReading(xz.open(xzPath, 'rb'))
        benchmarkReading(lzma.open(xzPath, 'rb'))
        benchmarkReading(ParallelXZReader(xzPath, os.cpu_count()))

    print()

    zstPath = filename + '.zst'
    if os.path.isfile(zstPath):
        # Additional manual checks, disabled by default:
        #simpleParallelZstdReading(filename + '.zst')
        #testZstdSeeking(filename + '.zst')

        compareReading(zstandard.open(zstPath, 'rb'), ParallelZstdReader(zstPath, os.cpu_count()))
        benchmarkReading(zstandard.open(zstPath, 'rb'))
        benchmarkReading(indexed_zstd.IndexedZstdFile(zstPath))
        benchmarkReading(ParallelZstdReader(zstPath, os.cpu_count()))