def parse(fname, **kwargs):
    """Parse a byte range of a JSON-lines file into partitioned frames.

    When ``start``/``end`` are supplied, only that slice of the file is read
    and split for ``num_splits`` readers; otherwise the whole file is handed
    to pandas directly (single-worker default).
    """
    num_splits = kwargs.pop("num_splits", None)
    start = kwargs.pop("start", None)
    end = kwargs.pop("end", None)
    if start is None or end is None:
        # This only happens when we are reading with only one worker (Default)
        return pandas.read_json(fname, **kwargs)
    # pop "compression" from kwargs because bio is uncompressed
    bio = FileDispatcher.file_open(
        fname, "rb", kwargs.pop("compression", "infer")
    )
    bio.seek(start)
    chunk_bytes = bio.read(end - start)
    bio.close()
    expected_columns = kwargs.pop("columns")
    pandas_df = pandas.read_json(BytesIO(chunk_bytes), **kwargs)
    if not pandas_df.columns.equals(expected_columns):
        raise NotImplementedError("Columns must be the same across all rows.")
    # Ship row count, dtypes and columns alongside the split partitions so
    # the caller can rebuild the frame's metadata.
    return _split_result_for_readers(1, num_splits, pandas_df) + [
        len(pandas_df),
        pandas_df.dtypes,
        pandas_df.columns,
    ]
def parse(fname, **kwargs):
    """Parse a byte range of a fixed-width file into partitioned frames.

    Reads only the ``start``..``end`` slice (plus the header line when an
    encoding is set) and splits the result for ``num_splits`` readers;
    without a range the whole file is read in one shot.
    """
    num_splits = kwargs.pop("num_splits", None)
    start = kwargs.pop("start", None)
    end = kwargs.pop("end", None)
    index_col = kwargs.get("index_col", None)
    if start is None or end is None:
        # This only happens when we are reading with only one worker (Default)
        return pandas.read_fwf(fname, **kwargs)
    # pop "compression" from kwargs because bio is uncompressed
    bio = FileDispatcher.file_open(
        fname, "rb", kwargs.pop("compression", "infer")
    )
    # Prepend the header line so this partition parses with column names.
    if kwargs.get("encoding", None) is not None:
        header_bytes = bio.readline()
    else:
        header_bytes = b""
    bio.seek(start)
    payload = header_bytes + bio.read(end - start)
    bio.close()
    pandas_df = pandas.read_fwf(BytesIO(payload), **kwargs)
    # With an explicit index column we ship the materialized index;
    # otherwise the row count, which becomes the RangeIndex downstream.
    index = pandas_df.index if index_col is not None else len(pandas_df)
    return _split_result_for_readers(1, num_splits, pandas_df) + [
        index,
        pandas_df.dtypes,
    ]
def parse(chunks, **kwargs):
    """Parse a list of ``(fname, start, end)`` CSV chunks into one frame.

    Each ranged chunk is read as raw bytes (header line prepended when an
    encoding is set) and parsed separately; the pieces are concatenated and
    split for ``num_splits`` readers. A chunk without a range falls back to
    a plain single-worker ``read_csv`` of that file.
    """
    warnings.filterwarnings("ignore")
    num_splits = kwargs.pop("num_splits", None)
    index_col = kwargs.get("index_col", None)
    frames = []
    for fname, start, end in chunks:
        if start is None or end is None:
            # This only happens when we are reading with only one worker (Default)
            return pandas.read_csv(fname, **kwargs)
        # pop "compression" from kwargs because bio is uncompressed
        # (note: popped inside the loop on purpose — later chunks open with
        # the default "infer", matching the original behavior)
        bio = FileDispatcher.file_open(
            fname, "rb", kwargs.pop("compression", "infer")
        )
        if kwargs.get("encoding", None) is not None:
            header = bio.readline()
        else:
            header = b""
        bio.seek(start)
        payload = header + bio.read(end - start)
        bio.close()
        frames.append(pandas.read_csv(BytesIO(payload), **kwargs))
    # Combine read in data.
    if not frames:
        pandas_df = pandas.DataFrame()
    elif len(frames) == 1:
        pandas_df = frames[0]
    else:
        pandas_df = pandas.concat(frames)
    # Set internal index: real index when one was requested, otherwise the
    # row count, which becomes the RangeIndex downstream.
    index = pandas_df.index if index_col is not None else len(pandas_df)
    return _split_result_for_readers(1, num_splits, pandas_df) + [
        index,
        pandas_df.dtypes,
    ]
def parse(cls, fname, **kwargs):
    """Parse a CSV file (or a byte range of it) and store partitions on a GPU.

    Parameters
    ----------
    cls : type
        Parser class exposing ``frame_partition_cls`` whose ``put`` stores a
        partition on a GPU manager.
    fname : str
        Path of the CSV file to read.
    **kwargs : dict
        ``pandas.read_csv`` keyword arguments plus the internal keys
        ``num_splits``, ``start``, ``end`` and ``gpu``.

    Returns
    -------
    list
        Keys of the stored partitions, followed by the index (or row count)
        and the dtypes of the parsed frame.
    """
    warnings.filterwarnings("ignore")
    num_splits = kwargs.pop("num_splits", None)
    start = kwargs.pop("start", None)
    end = kwargs.pop("end", None)
    index_col = kwargs.get("index_col", None)
    gpu_selected = kwargs.pop("gpu", 0)
    # BUGFIX: bind `put_func` unconditionally. It was previously assigned
    # only on the ranged-read branch, so the single-worker fallback below
    # raised NameError when building `key`.
    put_func = cls.frame_partition_cls.put
    if start is not None and end is not None:
        # pop "compression" from kwargs because bio is uncompressed
        bio = FileDispatcher.file_open(
            fname, "rb", kwargs.pop("compression", "infer")
        )
        # Prepend the header line so this partition parses with column names.
        if kwargs.get("encoding", None) is not None:
            header = b"" + bio.readline()
        else:
            header = b""
        bio.seek(start)
        to_read = header + bio.read(end - start)
        bio.close()
        pandas_df = pandas.read_csv(BytesIO(to_read), **kwargs)
    else:
        # This only happens when we are reading with only one worker (Default)
        pandas_df = pandas.read_csv(fname, **kwargs)
        # force num_splits to be 1 here because we don't want it partitioning
        num_splits = 1
    if index_col is not None:
        index = pandas_df.index
    else:
        # The row count becomes the RangeIndex downstream.
        index = len(pandas_df)
    partition_dfs = _split_result_for_readers(1, num_splits, pandas_df)
    key = [
        put_func(GPU_MANAGERS[gpu_selected], partition_df)
        for partition_df in partition_dfs
    ]
    return key + [index, pandas_df.dtypes]
def parse(fname, **kwargs):
    """Parse a byte range of a CSV file, prepending ``header_size`` header lines.

    With ``start``/``end`` given, the partition's bytes are read raw and the
    file's header lines are prepended so pandas parses them with column
    names; without a range the whole file is read in one shot.
    """
    warnings.filterwarnings("ignore")
    num_splits = kwargs.pop("num_splits", None)
    start = kwargs.pop("start", None)
    end = kwargs.pop("end", None)
    header_size = kwargs.pop("header_size", None)
    if start is None or end is None:
        # This only happens when we are reading with only one worker (Default)
        return pandas.read_csv(fname, **kwargs)
    # pop "compression" from kwargs because bio is uncompressed
    bio = FileDispatcher.file_open(
        fname, "rb", kwargs.pop("compression", "infer")
    )
    header = b""
    if kwargs.get("encoding", None) is not None and header_size == 0:
        # The first line can contain a BOM, so include it in `header` for
        # decoding and then skip it via `skiprows`.
        header += bio.readline()
        # `skiprows` can be only None here, so don't check its type
        # and just set it to 1.
        kwargs["skiprows"] = 1
    for _ in range(header_size):
        header += bio.readline()
    bio.seek(start)
    payload = header + bio.read(end - start)
    bio.close()
    pandas_df = pandas.read_csv(BytesIO(payload), **kwargs)
    # Ship the materialized index only when it is not a trivial RangeIndex;
    # otherwise the row count suffices to rebuild it downstream.
    if isinstance(pandas_df.index, pandas.RangeIndex):
        index = len(pandas_df)
    else:
        index = pandas_df.index
    return _split_result_for_readers(1, num_splits, pandas_df) + [
        index,
        pandas_df.dtypes,
    ]