def _read_csv_with_offset_pyarrow_on_ray(
    fname, num_splits, start, end, kwargs, header
):  # pragma: no cover
    """Use a Ray task to read a chunk of a CSV into a pyarrow Table.

    Note: Ray functions are not detected by codecov (thus pragma: no cover)

    Args:
        fname: The filename of the file to open.
        num_splits: The number of splits (partitions) to separate the DataFrame into.
        start: The start byte offset.
        end: The end byte offset.
        kwargs: The kwargs for the pyarrow `read_csv` function.
        header: The header of the file.

    Returns:
        A list containing the split pyarrow Tables and the number of rows of
        the tables as the last element. This is used to determine the total
        length of the DataFrame to build a default Index.
    """
    # Read the byte range [start, end) and prepend the file's header line so
    # this chunk parses as a complete, self-contained CSV. `with` guarantees
    # the file is closed even if a read fails (the original open()/close()
    # pair leaked the handle on exception).
    with open(fname, "rb") as bio:
        # The header line for the CSV file
        first_line = bio.readline()
        bio.seek(start)
        to_read = header + first_line + bio.read(end - start)
    # NOTE(review): `kwargs` is accepted but never forwarded to `read_csv`;
    # confirm whether caller-supplied read options are intentionally ignored.
    # NOTE(review): `ParseOptions(header_rows=...)` is an old pyarrow API —
    # newer pyarrow moved header handling to `ReadOptions`; verify against the
    # pinned pyarrow version.
    table = csv.read_csv(
        BytesIO(to_read), parse_options=csv.ParseOptions(header_rows=1)
    )
    # Partition the parsed table column-wise into `num_splits` pieces.
    chunksize = get_default_chunksize(table.num_columns, num_splits)
    chunks = [
        pa.Table.from_arrays(table.columns[chunksize * i : chunksize * (i + 1)])
        for i in range(num_splits)
    ]
    # Trailing metadata: row count (for building a default Index) and a Series
    # mapping column name -> pandas dtype.
    return chunks + [
        table.num_rows,
        pandas.Series(
            [t.to_pandas_dtype() for t in table.schema.types],
            index=table.schema.names,
        ),
    ]
def parse(self, **kwargs):
    """Parse a byte range of a CSV file into split pyarrow Tables.

    Expected keys in `kwargs` (each popped; remaining kwargs are unused):
        fname: The filename of the file to open.
        num_splits: The number of splits (partitions) to produce.
        start: The start byte offset.
        end: The end byte offset.
        header: The header line (bytes) to prepend to the chunk.

    Returns:
        A list of `num_splits` pyarrow Tables, followed by the number of rows
        in the parsed table and a pandas Series mapping column names to their
        pandas dtypes.
    """
    import pyarrow as pa
    import pyarrow.csv as csv

    fname = kwargs.pop("fname", None)
    num_splits = kwargs.pop("num_splits", None)
    start = kwargs.pop("start", None)
    end = kwargs.pop("end", None)
    header = kwargs.pop("header", None)
    # Read the byte range [start, end) and prepend the file's header line so
    # this chunk parses as a complete CSV. `with` guarantees the file is
    # closed even if a read fails (the original open()/close() pair leaked
    # the handle on exception).
    with open(fname, "rb") as bio:
        # The header line for the CSV file
        first_line = bio.readline()
        bio.seek(start)
        to_read = header + first_line + bio.read(end - start)
    # NOTE(review): `ParseOptions(header_rows=...)` is an old pyarrow API —
    # newer pyarrow moved header handling to `ReadOptions`; verify against the
    # pinned pyarrow version.
    table = csv.read_csv(
        BytesIO(to_read), parse_options=csv.ParseOptions(header_rows=1)
    )
    # Partition the parsed table column-wise into `num_splits` pieces.
    chunksize = get_default_chunksize(table.num_columns, num_splits)
    chunks = [
        pa.Table.from_arrays(table.columns[chunksize * i : chunksize * (i + 1)])
        for i in range(num_splits)
    ]
    # Trailing metadata: row count and a Series of per-column pandas dtypes.
    return chunks + [
        table.num_rows,
        pandas.Series(
            [t.to_pandas_dtype() for t in table.schema.types],
            index=table.schema.names,
        ),
    ]
def parse(self, fname, num_splits, start, end, header, **kwargs):
    """
    Parse CSV file into PyArrow tables.

    Parameters
    ----------
    fname : str
        Name of the CSV file to parse.
    num_splits : int
        Number of partitions to split the resulted PyArrow table into.
    start : int
        Position in the specified file to start parsing from.
    end : int
        Position in the specified file to end parsing at.
    header : str
        Header line that will be interpreted as the first line of the parsed
        CSV file.
    **kwargs : kwargs
        Serves the compatibility purpose. Does not affect the result.

    Returns
    -------
    list
        List with splitted parse results and it's metadata:

        - First `num_split` elements are PyArrow tables, representing the
          corresponding chunk.
        - Next element is the number of rows in the parsed table.
        - Last element is the pandas Series, containing the data-types for
          each column of the parsed table.
    """
    import pyarrow as pa
    import pyarrow.csv as csv

    # Read the byte range [start, end) and prepend the file's header line so
    # this chunk parses as a complete CSV. `with` guarantees the file is
    # closed even if a read fails (the original open()/close() pair leaked
    # the handle on exception).
    with open(fname, "rb") as bio:
        # The header line for the CSV file
        first_line = bio.readline()
        bio.seek(start)
        to_read = header + first_line + bio.read(end - start)
    # NOTE(review): `ParseOptions(header_rows=...)` is an old pyarrow API —
    # newer pyarrow moved header handling to `ReadOptions`; verify against the
    # pinned pyarrow version.
    table = csv.read_csv(
        BytesIO(to_read), parse_options=csv.ParseOptions(header_rows=1)
    )
    # Partition the parsed table column-wise into `num_splits` pieces.
    chunksize = get_default_chunksize(table.num_columns, num_splits)
    chunks = [
        pa.Table.from_arrays(table.columns[chunksize * i : chunksize * (i + 1)])
        for i in range(num_splits)
    ]
    # Trailing metadata: row count and a Series of per-column pandas dtypes.
    return chunks + [
        table.num_rows,
        pandas.Series(
            [t.to_pandas_dtype() for t in table.schema.types],
            index=table.schema.names,
        ),
    ]