import io

# ``seek_delimiter`` is assumed to be imported from the module under test.


def test_seek_delimiter_endline():
    f = io.BytesIO(b'123\n456\n789')

    # if at zero, stay at zero
    seek_delimiter(f, b'\n', 5)
    assert f.tell() == 0

    # stop just after the first delimiter, regardless of blocksize
    for bs in [1, 5, 100]:
        f.seek(1)
        seek_delimiter(f, b'\n', blocksize=bs)
        assert f.tell() == 4

    # handle long delimiters well, even with short blocksizes
    f = io.BytesIO(b'123abc456abc789')
    for bs in [1, 2, 3, 4, 5, 6, 10]:
        f.seek(1)
        seek_delimiter(f, b'abc', blocksize=bs)
        assert f.tell() == 6

    # if the delimiter is not found, end up at the end of the file
    f = io.BytesIO(b'123\n456')
    f.seek(5)
    seek_delimiter(f, b'\n', 5)
    assert f.tell() == 7


def read_avro(urlpath,
              block_finder='auto',
              blocksize=100000000,
              check_headers=True,
              **kwargs):
    """Read set of avro files into dask dataframes

    Use this only with avro schema that make sense as tabular data, i.e.,
    not deeply nested with arrays and maps.

    Parameters
    ----------
    urlpath: string
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    block_finder: auto|scan|seek|none
        Method for chunking avro files.
        - scan: read the first bytes of every block to find the size of all
            blocks and therefore the boundaries
        - seek: find the block delimiter bytestring every blocksize bytes
        - none: do not chunk files, parallelism will be only across files
        - auto: use seek if the first block is larger than 0.2*blocksize, else
            scan.
    check_headers: bool (True)
        Read the small header at the start of every file. If False, the headers
        must be exactly byte-equivalent. This is a useful optimisation,
        especially if block_finder='none'
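
    Examples
    --------
    Illustrative calls only; the paths and glob patterns are hypothetical:

    >>> df = read_avro('s3://bucket/data/*.avro', blocksize=50000000)  # doctest: +SKIP
    >>> df = read_avro('data/part.*.avro', block_finder='none',
    ...                check_headers=False)  # doctest: +SKIP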
    """
    if block_finder not in ['auto', 'scan', 'seek', 'none']:
        raise ValueError("block_finder must be in ['auto', 'scan', 'seek',"
                         " 'none'], got %s" % block_finder)
    fs, paths, myopen = get_fs_paths_myopen(urlpath, None, 'rb', None,
                                            **kwargs)
    chunks = []
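    # header/metadata parsed from the first file; reused when constructing
    # every chunk and for the output dataframe's dtypes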
    head = None
    read = delayed(read_avro_bytes)
    for path in paths:
        if head is None:
            # sample first file
            with myopen(path, 'rb') as f:
                head = read_header(f)
        size = fs.size(path)
        # ratio of the target partition size to the size of the first block
        b_to_s = blocksize / head['first_block_bytes']
        if (block_finder == 'none' or blocksize > 0.9 * size
                or head['first_block_bytes'] > 0.9 * size):
            # one chunk per file
            chunks.append(read(path, myopen, 0, size, None))
        elif block_finder == 'scan' or (block_finder == 'auto'
                                        and b_to_s < 0.2):
            # hop through the file picking block boundaries ~blocksize
            # apart, and append the resulting chunks
            with myopen(path, 'rb') as f:
                head['blocks'] = []
                scan_blocks(f, head, size)
            blocks = head['blocks']
            head['blocks'] = []
            loc0 = head['header_size']
            loc = loc0
            nrows = 0
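            # accumulate consecutive blocks until the running byte count
            # exceeds blocksize, then emit that range as one delayed chunk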
            for o in blocks:
                loc += o['size'] + SYNC_SIZE
                nrows += o['nrows']
                if loc - loc0 > blocksize:
                    chunks.append(
                        read(path, myopen, loc0, loc - loc0, head,
                             nrows=nrows))
                    loc0 = loc
                    nrows = 0
            chunks.append(
                read(path, myopen, loc0, size - loc0, head, nrows=nrows))
        else:
            # block-seek case: find sync markers
            loc0 = head['header_size']
            with myopen(path, 'rb') as f:
                while True:
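                    # jump ahead ~blocksize bytes, then scan forward to the
                    # next sync marker so each chunk ends on a block boundary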
                    f.seek(blocksize, 1)
                    seek_delimiter(f, head['sync'],
                                   head['first_block_bytes'] * 4)
                    loc = f.tell()
                    chunks.append(read(path, myopen, loc0, loc - loc0, head))
                    if f.tell() >= size:
                        break
                    loc0 = loc
    return from_delayed(chunks,
                        meta=head['dtypes'],
                        divisions=[None] * (len(chunks) + 1))