def _byte_block_counts(
    urlpath,
    blocksize,
    lineterminator=None,
    compression="infer",
    storage_options=None,
    **kwargs,
):
    """Return a list of paths and block counts.

    Logic copied from dask.bytes.read_bytes
    """
    if lineterminator is not None and len(lineterminator) == 1:
        kwargs["lineterminator"] = lineterminator
    else:
        lineterminator = "\n"

    if compression == "infer":
        paths = get_fs_token_paths(urlpath, mode="rb", storage_options=storage_options)[2]
        compression = infer_compression(paths[0])

    if isinstance(blocksize, str):
        blocksize = parse_bytes(blocksize)
    if blocksize and compression:
        blocksize = None

    b_out = read_bytes(
        urlpath,
        delimiter=lineterminator.encode(),
        blocksize=blocksize,
        sample=False,
        compression=compression,
        include_path=True,
        **(storage_options or {}),
    )
    _, values, paths = b_out

    if not isinstance(values[0], (tuple, list)):
        values = [values]

    return paths, [len(v) for v in values]
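
# A minimal usage sketch (illustrative only, not part of the original source).
# _byte_block_counts is a private helper, so calling it directly is shown here just
# to clarify its return shape: the resolved paths and, for each path, the number of
# byte blocks read_bytes would produce. The glob "data/part-*.csv" is a hypothetical
# example path.
def _example_byte_block_counts():
    paths, counts = _byte_block_counts("data/part-*.csv", blocksize="64MiB")
    for path, n_blocks in zip(paths, counts):
        print(f"{path}: {n_blocks} block(s)")
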
def read_json(url_path, orient="records", lines=None, storage_options=None, blocksize=None, sample=2**20, encoding="utf-8", errors="strict", compression="infer", meta=None, engine=pd.read_json, **kwargs): """Create a dataframe from a set of JSON files This utilises ``pandas.read_json()``, and most parameters are passed through - see its docstring. Differences: orient is 'records' by default, with lines=True; this is appropriate for line-delimited "JSON-lines" data, the kind of JSON output that is most common in big-data scenarios, and which can be chunked when reading (see ``read_json()``). All other options require blocksize=None, i.e., one partition per input file. Parameters ---------- url_path: str, list of str Location to read from. If a string, can include a glob character to find a set of file names. Supports protocol specifications such as ``"s3://"``. encoding, errors: The text encoding to implement, e.g., "utf-8" and how to respond to errors in the conversion (see ``str.encode()``). orient, lines, kwargs passed to pandas; if not specified, lines=True when orient='records', False otherwise. storage_options: dict Passed to backend file-system implementation blocksize: None or int If None, files are not blocked, and you get one partition per input file. If int, which can only be used for line-delimited JSON files, each partition will be approximately this size in bytes, to the nearest newline character. sample: int Number of bytes to pre-load, to provide an empty dataframe structure to any blocks wihout data. Only relevant is using blocksize. encoding, errors: Text conversion, ``see bytes.decode()`` compression : string or None String like 'gzip' or 'xz'. engine : function object, default ``pd.read_json`` The underlying function that dask will use to read JSON files. By default, this will be the pandas JSON reader (``pd.read_json``). $META Returns ------- dask.DataFrame Examples -------- Load single file >>> dd.read_json('myfile.1.json') # doctest: +SKIP Load multiple files >>> dd.read_json('myfile.*.json') # doctest: +SKIP >>> dd.read_json(['myfile.1.json', 'myfile.2.json']) # doctest: +SKIP Load large line-delimited JSON files using partitions of approx 256MB size >> dd.read_json('data/file*.csv', blocksize=2**28) """ import dask.dataframe as dd if lines is None: lines = orient == "records" if orient != "records" and lines: raise ValueError("Line-delimited JSON is only available with" 'orient="records".') if blocksize and (orient != "records" or not lines): raise ValueError("JSON file chunking only allowed for JSON-lines" "input (orient='records', lines=True).") storage_options = storage_options or {} if blocksize: first, chunks = read_bytes(url_path, b"\n", blocksize=blocksize, sample=sample, compression=compression, **storage_options) chunks = list(dask.core.flatten(chunks)) if meta is None: meta = read_json_chunk(first, encoding, errors, engine, kwargs) meta = make_meta(meta) parts = [ dask.delayed(read_json_chunk)(chunk, encoding, errors, engine, kwargs, meta=meta) for chunk in chunks ] return dd.from_delayed(parts, meta=meta) else: files = open_files(url_path, "rt", encoding=encoding, errors=errors, compression=compression, **storage_options) parts = [ dask.delayed(read_json_file)(f, orient, lines, engine, kwargs) for f in files ] return dd.from_delayed(parts, meta=meta)
def read_text(
    urlpath,
    blocksize=None,
    compression="infer",
    encoding=system_encoding,
    errors="strict",
    linedelimiter=None,
    collection=True,
    storage_options=None,
    files_per_partition=None,
    include_path=False,
):
    """Read lines from text files

    Parameters
    ----------
    urlpath : string or list
        Absolute or relative filepath(s). Prefix with a protocol like
        ``s3://`` to read from alternative filesystems. To read from multiple
        files you can pass a globstring or a list of paths, with the caveat
        that they must all have the same protocol.
    blocksize: None, int, or str
        Size (in bytes) to cut up larger files. Streams by default.
        Can be ``None`` for streaming, an integer number of bytes, or a string
        like "128MiB"
    compression: string
        Compression format like 'gzip' or 'xz'. Defaults to 'infer'
    encoding: string
    errors: string
    linedelimiter: string or None
    collection: bool, optional
        Return dask.bag if True, or list of delayed values if false
    storage_options: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.
    files_per_partition: None or int
        If set, group input files into partitions of the requested size,
        instead of one partition per file. Mutually exclusive with blocksize.
    include_path: bool
        Whether or not to include the path in the bag.
        If true, elements are tuples of (line, path).
        Default is False.

    Examples
    --------
    >>> b = read_text('myfiles.1.txt')  # doctest: +SKIP
    >>> b = read_text('myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('myfiles.*.txt.gz')  # doctest: +SKIP
    >>> b = read_text('s3://bucket/myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('s3://key:secret@bucket/myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('hdfs://namenode.example.com/myfiles.*.txt')  # doctest: +SKIP

    Parallelize a large file by providing the number of uncompressed bytes to
    load into each partition.

    >>> b = read_text('largefile.txt', blocksize='10MB')  # doctest: +SKIP

    Get file paths of the bag by setting include_path=True

    >>> b = read_text('myfiles.*.txt', include_path=True)  # doctest: +SKIP
    >>> b.take(1)  # doctest: +SKIP
    (('first line of the first file', '/home/dask/myfiles.0.txt'),)

    Returns
    -------
    dask.bag.Bag or list
        dask.bag.Bag if collection is True or list of Delayed lists otherwise.

    See Also
    --------
    from_sequence: Build bag from Python sequence
    """
    if blocksize is not None and files_per_partition is not None:
        raise ValueError("Only one of blocksize or files_per_partition can be set")
    if isinstance(blocksize, str):
        blocksize = parse_bytes(blocksize)

    if blocksize is None:
        if linedelimiter in [None, "", "\n", "\r", "\r\n"]:
            newline = linedelimiter
            linedelimiter = None
        else:
            newline = ""
        files = open_files(
            urlpath,
            mode="rt",
            encoding=encoding,
            errors=errors,
            compression=compression,
            newline=newline,
            **(storage_options or {}),
        )
        if files_per_partition is None:
            blocks = [
                delayed(list)(
                    delayed(
                        partial(file_to_blocks, include_path, delimiter=linedelimiter)
                    )(fil)
                )
                for fil in files
            ]
        else:
            blocks = []
            for start in range(0, len(files), files_per_partition):
                block_files = files[start:(start + files_per_partition)]
                block_lines = delayed(concat)(
                    delayed(map)(
                        partial(file_to_blocks, include_path, delimiter=linedelimiter),
                        block_files,
                    )
                )
                blocks.append(block_lines)
    else:
        # special case for linedelimiter=None: we will need to split on an
        # actual bytestring and the line reader will then use "universal"
        # newline mode. Just as well that \r\n and \n will both work
        # (thankfully \r for MacOS is no longer a thing)
        o = read_bytes(
            urlpath,
            delimiter=linedelimiter.encode() if linedelimiter is not None else b"\n",
            blocksize=blocksize,
            sample=False,
            compression=compression,
            include_path=include_path,
            **(storage_options or {}),
        )
        raw_blocks = o[1]
        blocks = [
            delayed(decode)(b, encoding, errors, linedelimiter)
            for b in concat(raw_blocks)
        ]
        if include_path:
            paths = list(
                concat([[path] * len(raw_blocks[i]) for i, path in enumerate(o[2])])
            )
            blocks = [
                delayed(attach_path)(entry, path)
                for entry, path in zip(blocks, paths)
            ]

    if not blocks:
        raise ValueError("No files found", urlpath)

    if collection:
        blocks = from_delayed(blocks)

    return blocks
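
# A minimal usage sketch (illustrative, not from the original source): build a
# dask.bag with one element per line and keep the originating path alongside each
# line. The glob "logs/*.log.gz" is a hypothetical path; blocksize stays at the
# default None because gzip-compressed files cannot be split into byte blocks.
def _example_read_text():
    bag = read_text("logs/*.log.gz", include_path=True)
    # With include_path=True each element is a (line, path) tuple.
    return bag.filter(lambda pair: "ERROR" in pair[0]).count()
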
def read_pandas(
    reader,
    urlpath,
    blocksize="default",
    lineterminator=None,
    compression="infer",
    sample=256000,
    sample_rows=10,
    enforce=False,
    assume_missing=False,
    storage_options=None,
    include_path_column=False,
    **kwargs,
):
    reader_name = reader.__name__
    if lineterminator is not None and len(lineterminator) == 1:
        kwargs["lineterminator"] = lineterminator
    else:
        lineterminator = "\n"
    if include_path_column and isinstance(include_path_column, bool):
        include_path_column = "path"
    if "index" in kwargs or "index_col" in kwargs:
        raise ValueError(
            "Keywords 'index' and 'index_col' not supported. "
            f"Use dd.{reader_name}(...).set_index('my-index') instead"
        )
    for kw in ["iterator", "chunksize"]:
        if kw in kwargs:
            raise ValueError(f"{kw} not supported for dd.{reader_name}")
    if kwargs.get("nrows", None):
        raise ValueError(
            "The 'nrows' keyword is not supported by `dd.{0}`. To achieve "
            "the same behavior, it's recommended to use "
            "`dd.{0}(...).head(n=nrows)`".format(reader_name)
        )
    if isinstance(kwargs.get("skiprows"), int):
        skiprows = lastskiprow = firstrow = kwargs.get("skiprows")
    elif kwargs.get("skiprows") is None:
        skiprows = lastskiprow = firstrow = 0
    else:
        # When skiprows is a list, we expect more than max(skiprows) to be
        # included in the sample. This means that [0, 2] will work well,
        # but [0, 440] might not work.
        skiprows = set(kwargs.get("skiprows"))
        lastskiprow = max(skiprows)
        # find the firstrow that is not skipped, for use as header
        firstrow = min(set(range(len(skiprows) + 1)) - set(skiprows))
    if isinstance(kwargs.get("header"), list):
        raise TypeError(f"List of header rows not supported for dd.{reader_name}")
    if isinstance(kwargs.get("converters"), dict) and include_path_column:
        path_converter = kwargs.get("converters").get(include_path_column, None)
    else:
        path_converter = None

    # If compression is "infer", inspect the (first) path suffix and
    # set the proper compression option if the suffix is recognized.
if compression == "infer": # Translate the input urlpath to a simple path list paths = get_fs_token_paths(urlpath, mode="rb", storage_options=storage_options)[2] # Check for at least one valid path if len(paths) == 0: raise OSError(f"{urlpath} resolved to no files") # Infer compression from first path compression = infer_compression(paths[0]) if blocksize == "default": blocksize = AUTO_BLOCKSIZE if isinstance(blocksize, str): blocksize = parse_bytes(blocksize) if blocksize and compression: # NONE of the compressions should use chunking warn("Warning %s compression does not support breaking apart files\n" "Please ensure that each individual file can fit in memory and\n" "use the keyword ``blocksize=None to remove this message``\n" "Setting ``blocksize=None``" % compression) blocksize = None if compression not in compr: raise NotImplementedError("Compression format %s not installed" % compression) if blocksize and sample and blocksize < sample and lastskiprow != 0: warn("Unexpected behavior can result from passing skiprows when\n" "blocksize is smaller than sample size.\n" "Setting ``sample=blocksize``") sample = blocksize b_lineterminator = lineterminator.encode() b_out = read_bytes( urlpath, delimiter=b_lineterminator, blocksize=blocksize, sample=sample, compression=compression, include_path=include_path_column, **(storage_options or {}), ) if include_path_column: b_sample, values, paths = b_out path = (include_path_column, path_converter) else: b_sample, values = b_out path = None if not isinstance(values[0], (tuple, list)): values = [values] # If we have not sampled, then use the first row of the first values # as a representative sample. if b_sample is False and len(values[0]): b_sample = values[0][0].compute() # Get header row, and check that sample is long enough. If the file # contains a header row, we need at least 2 nonempty rows + the number of # rows to skip. names = kwargs.get("names", None) header = kwargs.get("header", "infer" if names is None else None) need = 1 if header is None else 2 if kwargs.get("comment"): # if comment is provided, step through lines of b_sample and strip out comments parts = [] for part in b_sample.split(b_lineterminator): split_comment = part.decode().split(kwargs.get("comment")) if len(split_comment) > 1: # if line starts with comment, don't include that line in parts. if len(split_comment[0]) > 0: parts.append(split_comment[0].strip().encode()) else: parts.append(part) if len(parts) > need: break else: parts = b_sample.split(b_lineterminator, lastskiprow + need) # If the last partition is empty, don't count it nparts = 0 if not parts else len(parts) - int(not parts[-1]) if sample is not False and nparts < lastskiprow + need and len( b_sample) >= sample: raise ValueError("Sample is not large enough to include at least one " "row of data. Please increase the number of bytes " "in `sample` in the call to `read_csv`/`read_table`") if isinstance(header, int): firstrow += header header = b"" if header is None else parts[firstrow] + b_lineterminator # Use sample to infer dtypes and check for presence of include_path_column head_kwargs = kwargs.copy() head_kwargs.pop("skipfooter", None) try: head = reader(BytesIO(b_sample), nrows=sample_rows, **head_kwargs) except pd.errors.ParserError as e: if "EOF" in str(e): raise ValueError( "EOF encountered while reading header. 
\n" "Pass argument `sample_rows` and make sure the value of `sample` " "is large enough to accommodate that many rows of data") from e raise if include_path_column and (include_path_column in head.columns): raise ValueError("Files already contain the column name: %s, so the " "path column cannot use this name. Please set " "`include_path_column` to a unique name." % include_path_column) specified_dtypes = kwargs.get("dtype", {}) if specified_dtypes is None: specified_dtypes = {} # If specified_dtypes is a single type, then all columns were specified if assume_missing and isinstance(specified_dtypes, dict): # Convert all non-specified integer columns to floats for c in head.columns: if is_integer_dtype(head[c].dtype) and c not in specified_dtypes: head[c] = head[c].astype(float) values = [[list(dsk.dask.values()) for dsk in block] for block in values] return text_blocks_to_pandas( reader, values, header, head, kwargs, enforce=enforce, specified_dtypes=specified_dtypes, path=path, blocksize=blocksize, urlpath=urlpath, )
def read_json( url_path, orient="records", lines=None, storage_options=None, blocksize=None, sample=2**20, encoding="utf-8", errors="strict", compression="infer", meta=None, engine=pd.read_json, include_path_column=False, path_converter=None, **kwargs, ): """Create a dataframe from a set of JSON files This utilises ``pandas.read_json()``, and most parameters are passed through - see its docstring. Differences: orient is 'records' by default, with lines=True; this is appropriate for line-delimited "JSON-lines" data, the kind of JSON output that is most common in big-data scenarios, and which can be chunked when reading (see ``read_json()``). All other options require blocksize=None, i.e., one partition per input file. Parameters ---------- url_path: str, list of str Location to read from. If a string, can include a glob character to find a set of file names. Supports protocol specifications such as ``"s3://"``. encoding, errors: The text encoding to implement, e.g., "utf-8" and how to respond to errors in the conversion (see ``str.encode()``). orient, lines, kwargs passed to pandas; if not specified, lines=True when orient='records', False otherwise. storage_options: dict Passed to backend file-system implementation blocksize: None or int If None, files are not blocked, and you get one partition per input file. If int, which can only be used for line-delimited JSON files, each partition will be approximately this size in bytes, to the nearest newline character. sample: int Number of bytes to pre-load, to provide an empty dataframe structure to any blocks without data. Only relevant when using blocksize. encoding, errors: Text conversion, ``see bytes.decode()`` compression : string or None String like 'gzip' or 'xz'. engine : function object, default ``pd.read_json`` The underlying function that dask will use to read JSON files. By default, this will be the pandas JSON reader (``pd.read_json``). include_path_column : bool or str, optional Include a column with the file path where each row in the dataframe originated. If ``True``, a new column is added to the dataframe called ``path``. If ``str``, sets new column name. Default is ``False``. path_converter : function or None, optional A function that takes one argument and returns a string. Used to convert paths in the ``path`` column, for instance, to strip a common prefix from all the paths. $META Returns ------- dask.DataFrame Examples -------- Load single file >>> dd.read_json('myfile.1.json') # doctest: +SKIP Load multiple files >>> dd.read_json('myfile.*.json') # doctest: +SKIP >>> dd.read_json(['myfile.1.json', 'myfile.2.json']) # doctest: +SKIP Load large line-delimited JSON files using partitions of approx 256MB size >> dd.read_json('data/file*.csv', blocksize=2**28) """ if lines is None: lines = orient == "records" if orient != "records" and lines: raise ValueError( "Line-delimited JSON is only available with" 'orient="records".' ) if blocksize and (orient != "records" or not lines): raise ValueError( "JSON file chunking only allowed for JSON-lines" "input (orient='records', lines=True)." 
        )
    storage_options = storage_options or {}
    if include_path_column is True:
        include_path_column = "path"

    if path_converter is None:
        path_converter = lambda x: x

    if blocksize:
        b_out = read_bytes(
            url_path,
            b"\n",
            blocksize=blocksize,
            sample=sample,
            compression=compression,
            include_path=include_path_column,
            **storage_options,
        )
        if include_path_column:
            first, chunks, paths = b_out
            first_path = path_converter(paths[0])
            path_dtype = pd.CategoricalDtype(path_converter(p) for p in paths)
            flat_paths = flatten(
                [path_converter(p)] * len(chunk) for p, chunk in zip(paths, chunks)
            )
        else:
            first, chunks = b_out
            first_path = None
            flat_paths = (None,)
            path_dtype = None

        flat_chunks = flatten(chunks)
        if meta is None:
            meta = read_json_chunk(
                first,
                encoding,
                errors,
                engine,
                include_path_column,
                first_path,
                path_dtype,
                kwargs,
            )
        meta = make_meta(meta)
        parts = [
            delayed(read_json_chunk)(
                chunk,
                encoding,
                errors,
                engine,
                include_path_column,
                path,
                path_dtype,
                kwargs,
                meta=meta,
            )
            for chunk, path in zip_longest(flat_chunks, flat_paths)
        ]
    else:
        files = open_files(
            url_path,
            "rt",
            encoding=encoding,
            errors=errors,
            compression=compression,
            **storage_options,
        )
        path_dtype = pd.CategoricalDtype(path_converter(f.path) for f in files)
        parts = [
            delayed(read_json_file)(
                f,
                orient,
                lines,
                engine,
                include_path_column,
                path_converter(f.path),
                path_dtype,
                kwargs,
            )
            for f in files
        ]

    return from_delayed(parts, meta=meta)
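
# A minimal usage sketch (illustrative only, not from the original source):
# include_path_column adds a categorical column recording each row's source file,
# and path_converter can strip a common prefix from those paths. The prefix and
# glob below are hypothetical.
def _example_read_json_with_paths():
    prefix = "data/events/"
    return read_json(
        prefix + "*.jsonl",
        blocksize=2**27,  # ~128 MB JSON-lines partitions
        include_path_column="source",
        path_converter=lambda p: p[len(prefix):] if p.startswith(prefix) else p,
    )
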
def read_json(url_path, orient='records', lines=None, storage_options=None,
              blocksize=None, sample=2**20, encoding='utf-8', errors='strict',
              **kwargs):
    """Create a dataframe from a set of JSON files

    This utilises ``pandas.read_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    is appropriate for line-delimited "JSON-lines" data, the kind of JSON output
    that is most common in big-data scenarios, and which can be chunked when
    reading (see ``read_json()``). All other options require blocksize=None,
    i.e., one partition per input file.

    Parameters
    ----------
    url_path: str, list of str
        Location to read from. If a string, can include a glob character to
        find a set of file names.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    blocksize: None or int
        If None, files are not blocked, and you get one partition per input
        file. If int, which can only be used for line-delimited JSON files,
        each partition will be approximately this size in bytes, to the
        nearest newline character.
    sample: int
        Number of bytes to pre-load, to provide an empty dataframe structure
        to any blocks without data. Only relevant when using blocksize.
    encoding, errors:
        Text conversion, ``see bytes.decode()``

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    Load single file

    >>> dd.read_json('myfile.1.json')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_json('myfile.*.json')  # doctest: +SKIP

    >>> dd.read_json(['myfile.1.json', 'myfile.2.json'])  # doctest: +SKIP

    Load large line-delimited JSON files using partitions of approx 256MB size

    >>> dd.read_json('data/file*.json', blocksize=2**28)  # doctest: +SKIP
    """
    import dask.dataframe as dd
    if lines is None:
        lines = orient == 'records'
    if orient != 'records' and lines:
        raise ValueError('Line-delimited JSON is only available with '
                         'orient="records".')
    if blocksize and (orient != 'records' or not lines):
        raise ValueError("JSON file chunking only allowed for JSON-lines "
                         "input (orient='records', lines=True).")
    storage_options = storage_options or {}
    if blocksize:
        first, chunks = read_bytes(url_path, b'\n', blocksize=blocksize,
                                   sample=sample, **storage_options)
        chunks = list(dask.core.flatten(chunks))
        first = read_json_chunk(first, encoding, errors, kwargs)
        parts = [dask.delayed(read_json_chunk)(
            chunk, encoding, errors, kwargs, meta=first[:0]
        ) for chunk in chunks]
    else:
        files = open_files(url_path, 'rt', encoding=encoding, errors=errors,
                           **storage_options)
        parts = [dask.delayed(read_json_file)(f, orient, lines, kwargs)
                 for f in files]
    return dd.from_delayed(parts)
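
# A hedged usage sketch (not from the original source): with the default
# blocksize=None every matched file becomes exactly one partition, which is the
# only mode available for non-line-delimited orients such as orient='columns'.
# The glob 'archive/*.json' is a hypothetical path.
def _example_read_json_whole_files():
    ddf = read_json('archive/*.json', orient='columns', lines=False)
    return ddf.npartitions  # equals the number of files matched by the glob
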