def _read_csv_with_offset_pyarrow_on_ray(
    fname, num_splits, start, end, kwargs, header
):  # pragma: no cover
    """Use a Ray task to read a chunk of a CSV into a pyarrow Table.

    Note: Ray functions are not detected by codecov (thus pragma: no cover)

    Args:
        fname: The filename of the file to open.
        num_splits: The number of splits (partitions) to separate the DataFrame into.
        start: The start byte offset.
        end: The end byte offset.
        kwargs: The kwargs for the pyarrow `read_csv` function.
        header: The header of the file.

    Returns:
        A list containing the split pyarrow Tables and the number of rows of
        the tables as the last element. This is used to determine the total
        length of the DataFrame to build a default Index.
    """
    # Read the byte range [start, end) and prepend the file's header line so
    # this chunk parses as a complete, self-contained CSV. `with` guarantees
    # the file is closed even if a read fails (the original open()/close()
    # pair leaked the handle on exception).
    with open(fname, "rb") as bio:
        # The header line for the CSV file
        first_line = bio.readline()
        bio.seek(start)
        to_read = header + first_line + bio.read(end - start)
    # NOTE(review): `kwargs` is accepted but never forwarded to `read_csv`;
    # confirm whether caller-supplied read options are intentionally ignored.
    # NOTE(review): `ParseOptions(header_rows=...)` is an old pyarrow API —
    # newer pyarrow moved header handling to `ReadOptions`; verify against the
    # pinned pyarrow version.
    table = csv.read_csv(
        BytesIO(to_read), parse_options=csv.ParseOptions(header_rows=1)
    )
    # Partition the parsed table column-wise into `num_splits` pieces.
    chunksize = get_default_chunksize(table.num_columns, num_splits)
    chunks = [
        pa.Table.from_arrays(table.columns[chunksize * i : chunksize * (i + 1)])
        for i in range(num_splits)
    ]
    # Trailing metadata: row count (for building a default Index) and a Series
    # mapping column name -> pandas dtype.
    return chunks + [
        table.num_rows,
        pandas.Series(
            [t.to_pandas_dtype() for t in table.schema.types],
            index=table.schema.names,
        ),
    ]
def parse(self, **kwargs):
    """Parse a byte range of a CSV file into split pyarrow Tables.

    Expected keys in `kwargs` (each popped; remaining kwargs are unused):
        fname: The filename of the file to open.
        num_splits: The number of splits (partitions) to produce.
        start: The start byte offset.
        end: The end byte offset.
        header: The header line (bytes) to prepend to the chunk.

    Returns:
        A list of `num_splits` pyarrow Tables, followed by the number of rows
        in the parsed table and a pandas Series mapping column names to their
        pandas dtypes.
    """
    import pyarrow as pa
    import pyarrow.csv as csv

    fname = kwargs.pop("fname", None)
    num_splits = kwargs.pop("num_splits", None)
    start = kwargs.pop("start", None)
    end = kwargs.pop("end", None)
    header = kwargs.pop("header", None)
    # Read the byte range [start, end) and prepend the file's header line so
    # this chunk parses as a complete CSV. `with` guarantees the file is
    # closed even if a read fails (the original open()/close() pair leaked
    # the handle on exception).
    with open(fname, "rb") as bio:
        # The header line for the CSV file
        first_line = bio.readline()
        bio.seek(start)
        to_read = header + first_line + bio.read(end - start)
    # NOTE(review): `ParseOptions(header_rows=...)` is an old pyarrow API —
    # newer pyarrow moved header handling to `ReadOptions`; verify against the
    # pinned pyarrow version.
    table = csv.read_csv(
        BytesIO(to_read), parse_options=csv.ParseOptions(header_rows=1)
    )
    # Partition the parsed table column-wise into `num_splits` pieces.
    chunksize = get_default_chunksize(table.num_columns, num_splits)
    chunks = [
        pa.Table.from_arrays(table.columns[chunksize * i : chunksize * (i + 1)])
        for i in range(num_splits)
    ]
    # Trailing metadata: row count and a Series of per-column pandas dtypes.
    return chunks + [
        table.num_rows,
        pandas.Series(
            [t.to_pandas_dtype() for t in table.schema.types],
            index=table.schema.names,
        ),
    ]
def parse(self, fname, num_splits, start, end, header, **kwargs):
    """
    Parse CSV file into PyArrow tables.

    Parameters
    ----------
    fname : str
        Name of the CSV file to parse.
    num_splits : int
        Number of partitions to split the resulted PyArrow table into.
    start : int
        Position in the specified file to start parsing from.
    end : int
        Position in the specified file to end parsing at.
    header : str
        Header line that will be interpreted as the first line of the parsed
        CSV file.
    **kwargs : kwargs
        Serves the compatibility purpose. Does not affect the result.

    Returns
    -------
    list
        List with splitted parse results and it's metadata:

        - First `num_split` elements are PyArrow tables, representing the
          corresponding chunk.
        - Next element is the number of rows in the parsed table.
        - Last element is the pandas Series, containing the data-types for
          each column of the parsed table.
    """
    import pyarrow as pa
    import pyarrow.csv as csv

    # Read the byte range [start, end) and prepend the file's header line so
    # this chunk parses as a complete CSV. `with` guarantees the file is
    # closed even if a read fails (the original open()/close() pair leaked
    # the handle on exception).
    with open(fname, "rb") as bio:
        # The header line for the CSV file
        first_line = bio.readline()
        bio.seek(start)
        to_read = header + first_line + bio.read(end - start)
    # NOTE(review): `ParseOptions(header_rows=...)` is an old pyarrow API —
    # newer pyarrow moved header handling to `ReadOptions`; verify against the
    # pinned pyarrow version.
    table = csv.read_csv(
        BytesIO(to_read), parse_options=csv.ParseOptions(header_rows=1)
    )
    # Partition the parsed table column-wise into `num_splits` pieces.
    chunksize = get_default_chunksize(table.num_columns, num_splits)
    chunks = [
        pa.Table.from_arrays(table.columns[chunksize * i : chunksize * (i + 1)])
        for i in range(num_splits)
    ]
    # Trailing metadata: row count and a Series of per-column pandas dtypes.
    return chunks + [
        table.num_rows,
        pandas.Series(
            [t.to_pandas_dtype() for t in table.schema.types],
            index=table.schema.names,
        ),
    ]