import io

from dask import delayed
from dask.dataframe import from_delayed


def test_seek_delimiter_endline():
    f = io.BytesIO(b'123\n456\n789')

    # if at zero, stay at zero
    seek_delimiter(f, b'\n', 5)
    assert f.tell() == 0

    # choose the first block
    for bs in [1, 5, 100]:
        f.seek(1)
        seek_delimiter(f, b'\n', blocksize=bs)
        assert f.tell() == 4

    # handle long delimiters well, even with short blocksizes
    f = io.BytesIO(b'123abc456abc789')
    for bs in [1, 2, 3, 4, 5, 6, 10]:
        f.seek(1)
        seek_delimiter(f, b'abc', blocksize=bs)
        assert f.tell() == 6

    # end at the end when no delimiter is found
    f = io.BytesIO(b'123\n456')
    f.seek(5)
    seek_delimiter(f, b'\n', 5)
    assert f.tell() == 7
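# The test above pins down the expected semantics of ``seek_delimiter``: if the
# file is already at offset 0 it stays there; otherwise the file is advanced to
# the first byte *after* the next occurrence of ``delimiter``, even when the
# delimiter straddles a read boundary, and to end-of-file if no delimiter is
# found.  The sketch below is a minimal implementation consistent with those
# expectations, not necessarily the exact library code.


def seek_delimiter(file, delimiter, blocksize):
    """Move ``file`` to the byte just after the next ``delimiter``.

    Reads ``blocksize`` bytes at a time, keeping the tail of the previous
    read so that delimiters spanning two reads are still found.  Leaves the
    file at end-of-data if the delimiter never occurs.
    """
    if file.tell() == 0:
        # already at the start: treat position 0 as a valid block boundary
        return

    last = b''
    while True:
        current = file.read(blocksize)
        if not current:
            # end of file reached without finding the delimiter
            return
        full = last + current
        i = full.find(delimiter)
        if i >= 0:
            # rewind so the file points just past the delimiter
            file.seek(file.tell() - (len(full) - i) + len(delimiter))
            return
        # keep enough of the tail to catch a delimiter split across reads
        last = full[-len(delimiter):]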
def read_avro(urlpath, block_finder='auto', blocksize=100000000,
              check_headers=True, **kwargs):
    """Read a set of avro files into dask dataframes

    Use this only with avro schemas that make sense as tabular data, i.e.,
    not deeply nested with arrays and maps.

    Parameters
    ----------
    urlpath: string
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    block_finder: auto|scan|seek|none
        Method for chunking avro files.

        - scan: read the first bytes of every block to find the size of all
          blocks and therefore the boundaries
        - seek: find the block delimiter bytestring every blocksize bytes
        - none: do not chunk files, parallelism will be only across files
        - auto: use seek if the first block is larger than 0.2*blocksize,
          else scan.
    blocksize: int (100000000)
        Approximate target size of each output partition, in bytes; used by
        the scan and seek block finders.
    check_headers: bool (True)
        Read the small header at the start of every file. If False, only the
        first file's header is read, so the headers must be exactly
        byte-equivalent. This is a useful optimisation, especially if
        block_finder='none'.
    **kwargs: dict
        Further options passed on to ``get_fs_paths_myopen`` (typically
        file-system/storage options).
    """
    if block_finder not in ['auto', 'scan', 'seek', 'none']:
        raise ValueError("block_finder must be in ['auto', 'scan', 'seek',"
                         " 'none'], got %s" % block_finder)
    fs, paths, myopen = get_fs_paths_myopen(urlpath, None, 'rb', None,
                                            **kwargs)
    chunks = []
    head = None
    read = delayed(read_avro_bytes)
    for path in paths:
        if head is None or check_headers:
            # read the header of every file when check_headers is set;
            # otherwise reuse the first file's header, which requires the
            # headers (including the sync marker) to be byte-equivalent
            with myopen(path, 'rb') as f:
                head = read_header(f)
        size = fs.size(path)
        # ratio of the first block's size to the target blocksize; 'auto'
        # uses scan for small blocks and seek otherwise (see docstring)
        b_to_s = head['first_block_bytes'] / blocksize
        if (block_finder == 'none' or blocksize > 0.9 * size or
                head['first_block_bytes'] > 0.9 * size):
            # one chunk per file
            chunks.append(read(path, myopen, 0, size, None))
        elif block_finder == 'scan' or (block_finder == 'auto' and
                                        b_to_s < 0.2):
            # hop through the file reading block headers, then group blocks
            # into chunks of roughly ``blocksize`` bytes
            with myopen(path, 'rb') as f:
                head['blocks'] = []
                scan_blocks(f, head, size)
            blocks = head['blocks']
            head['blocks'] = []
            loc0 = head['header_size']
            loc = loc0
            nrows = 0
            for o in blocks:
                loc += o['size'] + SYNC_SIZE
                nrows += o['nrows']
                if loc - loc0 > blocksize:
                    chunks.append(read(path, myopen, loc0, loc - loc0, head,
                                       nrows=nrows))
                    loc0 = loc
                    nrows = 0
            chunks.append(read(path, myopen, loc0, size - loc0, head,
                               nrows=nrows))
        else:
            # block-seek case: find sync markers every ~blocksize bytes
            loc0 = head['header_size']
            with myopen(path, 'rb') as f:
                while True:
                    f.seek(blocksize, 1)
                    seek_delimiter(f, head['sync'],
                                   head['first_block_bytes'] * 4)
                    loc = f.tell()
                    chunks.append(read(path, myopen, loc0, loc - loc0, head))
                    if f.tell() >= size:
                        break
                    loc0 = loc
    return from_delayed(chunks, meta=head['dtypes'],
                        divisions=[None] * (len(chunks) + 1))
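# A usage sketch for ``read_avro``: the path below is hypothetical, and this
# assumes the module-local helpers it relies on (``get_fs_paths_myopen``,
# ``read_header``, ``scan_blocks``, ``read_avro_bytes``, ``SYNC_SIZE``) are
# defined elsewhere in this module.
if __name__ == '__main__':
    # build a lazy dask dataframe with ~32MB partitions; the 'auto' block
    # finder picks scan or seek per file based on the first block's size
    df = read_avro('/data/events/*.avro', blocksize=32 * 2**20)
    print(df.npartitions)
    print(df.head())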