示例#1
0
 def parse(fname, **kwargs):
     """Parse a JSON file, or one byte range of it, with ``pandas.read_json``.

     Parameters
     ----------
     fname : str or file-like
         Source handed to ``FileDispatcher.file_open`` (byte-range path) or
         directly to ``pandas.read_json`` (whole-file path).
     **kwargs : dict
         Arguments forwarded to ``pandas.read_json``. The following keys are
         consumed here and not forwarded: ``num_splits``, ``start``, ``end``
         and, on the byte-range path only, ``compression`` and ``columns``.

     Returns
     -------
     pandas.DataFrame or list
         If ``start`` or ``end`` is missing, the DataFrame produced by
         ``pandas.read_json(fname, **kwargs)``. Otherwise the output of
         ``_split_result_for_readers(1, num_splits, pandas_df)`` extended with
         the row count, the dtypes and the columns of the parsed chunk.

     Raises
     ------
     KeyError
         If a byte range is given but ``columns`` is not in `kwargs`.
     NotImplementedError
         If the columns of the parsed chunk differ from ``columns``.
     """
     num_splits = kwargs.pop("num_splits", None)
     start = kwargs.pop("start", None)
     end = kwargs.pop("end", None)
     if start is None or end is None:
         # This only happens when we are reading with only one worker (Default)
         return pandas.read_json(fname, **kwargs)

     # pop "compression" from kwargs because bio is uncompressed
     bio = FileDispatcher.file_open(fname, "rb", kwargs.pop("compression", "infer"))
     try:
         bio.seek(start)
         to_read = bio.read(end - start)
     finally:
         # Release the handle even when seeking/reading fails.
         bio.close()

     columns = kwargs.pop("columns")
     pandas_df = pandas.read_json(BytesIO(to_read), **kwargs)
     if not pandas_df.columns.equals(columns):
         raise NotImplementedError("Columns must be the same across all rows.")
     partition_columns = pandas_df.columns
     return _split_result_for_readers(1, num_splits, pandas_df) + [
         len(pandas_df),
         pandas_df.dtypes,
         partition_columns,
     ]
示例#2
0
 def parse(fname, **kwargs):
     """Parse a fixed-width file, or one byte range of it, with ``pandas.read_fwf``.

     Parameters
     ----------
     fname : str or file-like
         Source handed to ``FileDispatcher.file_open`` (byte-range path) or
         directly to ``pandas.read_fwf`` (whole-file path).
     **kwargs : dict
         Arguments forwarded to ``pandas.read_fwf``. ``num_splits``, ``start``
         and ``end`` are consumed here; ``compression`` is consumed on the
         byte-range path only. ``index_col`` and ``encoding`` are read but
         still forwarded.

     Returns
     -------
     pandas.DataFrame or list
         If ``start`` or ``end`` is missing, the DataFrame produced by
         ``pandas.read_fwf(fname, **kwargs)``. Otherwise the output of
         ``_split_result_for_readers(1, num_splits, pandas_df)`` extended with
         the index (or the row count when no ``index_col`` was given) and the
         dtypes of the parsed chunk.
     """
     num_splits = kwargs.pop("num_splits", None)
     start = kwargs.pop("start", None)
     end = kwargs.pop("end", None)
     index_col = kwargs.get("index_col", None)
     if start is None or end is None:
         # This only happens when we are reading with only one worker (Default)
         return pandas.read_fwf(fname, **kwargs)

     # pop "compression" from kwargs because bio is uncompressed
     bio = FileDispatcher.file_open(fname, "rb", kwargs.pop("compression", "infer"))
     try:
         # With an explicit encoding the first line of the file is prepended
         # to the chunk (presumably so the parser sees any leading encoding
         # marker -- confirm against the dispatcher).
         if kwargs.get("encoding", None) is not None:
             header = bio.readline()
         else:
             header = b""
         bio.seek(start)
         to_read = header + bio.read(end - start)
     finally:
         # Release the handle even when seeking/reading fails.
         bio.close()

     pandas_df = pandas.read_fwf(BytesIO(to_read), **kwargs)
     if index_col is not None:
         index = pandas_df.index
     else:
         # The lengths will become the RangeIndex
         index = len(pandas_df)
     return _split_result_for_readers(1, num_splits, pandas_df) + [
         index,
         pandas_df.dtypes,
     ]
示例#3
0
    def parse(chunks, **kwargs):
        """Parse several byte ranges of CSV files and combine them into one frame.

        Parameters
        ----------
        chunks : iterable of (fname, start, end)
            Each item names a source and the byte range to read from it.
            If ``start`` or ``end`` of an item is None, that source is read
            in full with ``pandas.read_csv`` and returned immediately.
        **kwargs : dict
            Arguments forwarded to ``pandas.read_csv``. ``num_splits`` and
            ``compression`` are consumed here; ``index_col`` and ``encoding``
            are read but still forwarded.

        Returns
        -------
        pandas.DataFrame or list
            The plain DataFrame for the whole-file case. Otherwise the output
            of ``_split_result_for_readers(1, num_splits, pandas_df)`` extended
            with the index (or the row count when no ``index_col`` was given)
            and the dtypes of the combined frame.

        Notes
        -----
        Calling this function installs a global ``"ignore"`` warnings filter.
        """
        warnings.filterwarnings("ignore")
        num_splits = kwargs.pop("num_splits", None)
        index_col = kwargs.get("index_col", None)
        # Pop "compression" exactly once: popping it inside the loop would hand
        # the caller's value to the first chunk only and "infer" to the rest.
        # It is removed from kwargs because the bytes read from bio are
        # uncompressed.
        compression = kwargs.pop("compression", "infer")
        has_encoding = kwargs.get("encoding", None) is not None

        pandas_dfs = []
        for fname, start, end in chunks:
            if start is None or end is None:
                # This only happens when we are reading with only one worker (Default)
                return pandas.read_csv(fname, compression=compression, **kwargs)

            bio = FileDispatcher.file_open(fname, "rb", compression)
            try:
                # With an explicit encoding the first line of the file is
                # prepended to the chunk.
                header = bio.readline() if has_encoding else b""
                bio.seek(start)
                to_read = header + bio.read(end - start)
            finally:
                # Release the handle even when seeking/reading fails.
                bio.close()
            pandas_dfs.append(pandas.read_csv(BytesIO(to_read), **kwargs))

        # Combine read in data.
        if len(pandas_dfs) > 1:
            pandas_df = pandas.concat(pandas_dfs)
        elif pandas_dfs:
            pandas_df = pandas_dfs[0]
        else:
            pandas_df = pandas.DataFrame()

        # Set internal index.
        if index_col is not None:
            index = pandas_df.index
        else:
            # The lengths will become the RangeIndex
            index = len(pandas_df)
        return _split_result_for_readers(1, num_splits, pandas_df) + [
            index,
            pandas_df.dtypes,
        ]
示例#4
0
    def parse(cls, fname, **kwargs):
        """Parse a CSV file (or one byte range of it) and store the parts via ``put``.

        Parameters
        ----------
        cls : type
            Class providing ``frame_partition_cls.put`` (this looks like a
            classmethod; the decorator is outside this block -- confirm).
        fname : str or file-like
            Source handed to ``FileDispatcher.file_open`` (byte-range path) or
            directly to ``pandas.read_csv`` (whole-file path).
        **kwargs : dict
            Arguments forwarded to ``pandas.read_csv``. ``num_splits``,
            ``start``, ``end`` and ``gpu`` are consumed here; ``compression``
            is consumed on the byte-range path only. ``index_col`` and
            ``encoding`` are read but still forwarded.

        Returns
        -------
        list
            One ``put`` result per frame returned by
            ``_split_result_for_readers(1, num_splits, pandas_df)``, followed
            by the index (or the row count when no ``index_col`` was given)
            and the dtypes of the parsed frame.

        Notes
        -----
        Calling this function installs a global ``"ignore"`` warnings filter.
        """
        warnings.filterwarnings("ignore")
        num_splits = kwargs.pop("num_splits", None)
        start = kwargs.pop("start", None)
        end = kwargs.pop("end", None)
        index_col = kwargs.get("index_col", None)
        gpu_selected = kwargs.pop("gpu", 0)
        # Bound before branching: both paths below need it, and binding it only
        # in the byte-range branch made the single-worker path fail with
        # UnboundLocalError.
        put_func = cls.frame_partition_cls.put

        if start is not None and end is not None:
            # pop "compression" from kwargs because bio is uncompressed
            bio = FileDispatcher.file_open(
                fname, "rb", kwargs.pop("compression", "infer")
            )
            try:
                # With an explicit encoding the first line of the file is
                # prepended to the chunk.
                if kwargs.get("encoding", None) is not None:
                    header = bio.readline()
                else:
                    header = b""
                bio.seek(start)
                to_read = header + bio.read(end - start)
            finally:
                # Release the handle even when seeking/reading fails.
                bio.close()
            pandas_df = pandas.read_csv(BytesIO(to_read), **kwargs)
        else:
            # This only happens when we are reading with only one worker (Default)
            pandas_df = pandas.read_csv(fname, **kwargs)
            # force num_splits to be 1 here because we don't want it partitioning
            num_splits = 1

        if index_col is not None:
            index = pandas_df.index
        else:
            index = len(pandas_df)
        partition_dfs = _split_result_for_readers(1, num_splits, pandas_df)
        key = [
            put_func(GPU_MANAGERS[gpu_selected], partition_df)
            for partition_df in partition_dfs
        ]
        return key + [index, pandas_df.dtypes]
示例#5
0
 def parse(fname, **kwargs):
     """Parse a CSV file, or one byte range of it plus its header lines.

     Parameters
     ----------
     fname : str or file-like
         Source handed to ``FileDispatcher.file_open`` (byte-range path) or
         directly to ``pandas.read_csv`` (whole-file path).
     **kwargs : dict
         Arguments forwarded to ``pandas.read_csv``. ``num_splits``, ``start``,
         ``end`` and ``header_size`` are consumed here; ``compression`` is
         consumed on the byte-range path only. ``header_size`` is the number
         of leading lines of the file to prepend to the chunk; None is
         treated as 0.

     Returns
     -------
     pandas.DataFrame or list
         If ``start`` or ``end`` is missing, the DataFrame produced by
         ``pandas.read_csv(fname, **kwargs)``. Otherwise the output of
         ``_split_result_for_readers(1, num_splits, pandas_df)`` extended with
         the index (or the row count when the index is a ``RangeIndex``) and
         the dtypes of the parsed chunk.

     Notes
     -----
     Calling this function installs a global ``"ignore"`` warnings filter.
     """
     warnings.filterwarnings("ignore")
     num_splits = kwargs.pop("num_splits", None)
     start = kwargs.pop("start", None)
     end = kwargs.pop("end", None)
     header_size = kwargs.pop("header_size", None)
     if header_size is None:
         # A missing header size means "no header lines"; range(None) would
         # otherwise raise TypeError below.
         header_size = 0
     if start is None or end is None:
         # This only happens when we are reading with only one worker (Default)
         return pandas.read_csv(fname, **kwargs)

     # pop "compression" from kwargs because bio is uncompressed
     bio = FileDispatcher.file_open(fname, "rb", kwargs.pop("compression", "infer"))
     try:
         header = b""
         # In this case we beware that the first line can contain a BOM, so
         # we add this line to the `header` for reading and then skip it
         if kwargs.get("encoding", None) is not None and header_size == 0:
             header += bio.readline()
             # `skiprows` can be only None here, so don't check its type
             # and just set to 1
             kwargs["skiprows"] = 1
         for _ in range(header_size):
             header += bio.readline()
         bio.seek(start)
         to_read = header + bio.read(end - start)
     finally:
         # Release the handle even when seeking/reading fails.
         bio.close()

     pandas_df = pandas.read_csv(BytesIO(to_read), **kwargs)
     if isinstance(pandas_df.index, pandas.RangeIndex):
         index = len(pandas_df)
     else:
         index = pandas_df.index
     return _split_result_for_readers(1, num_splits, pandas_df) + [
         index,
         pandas_df.dtypes,
     ]