def read_avro(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    skiprows=None,
    num_rows=None,
    **kwargs,
):
    """{docstring}"""
    from cudf import DataFrame

    # Multi-file reads are not implemented for Avro yet.
    is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
        path_or_data=filepath_or_buffer, **kwargs,
    )
    if not is_single_filepath_or_buffer:
        raise NotImplementedError(
            "`read_avro` does not yet support reading multiple files"
        )

    # Resolve remote/local paths to something the GPU reader accepts.
    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs
    )
    if compression is not None:
        # BUG FIX: the ValueError was previously constructed but never
        # raised, so unsupported compressed content was silently accepted.
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    if engine == "cudf":
        return DataFrame._from_table(
            libcudf.avro.read_avro(
                filepath_or_buffer, columns, skiprows, num_rows
            )
        )
    else:
        raise NotImplementedError("read_avro currently only supports cudf")
def read_orc_statistics(
    filepath_or_buffer,
    columns=None,
    **kwargs,
):
    """{docstring}"""
    # Multi-file reads are not supported for ORC statistics.
    is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
        path_or_data=filepath_or_buffer, **kwargs,
    )
    if not is_single_filepath_or_buffer:
        raise NotImplementedError(
            "`read_orc_statistics` does not support reading multiple files"
        )

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs
    )
    if compression is not None:
        # BUG FIX: the ValueError was previously constructed but never
        # raised, so unsupported compressed content was silently accepted.
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    # Read in statistics and unpack
    (
        column_names,
        raw_file_statistics,
        raw_stripes_statistics,
    ) = libcudf.orc.read_raw_orc_statistics(filepath_or_buffer)

    # Parse column names (returned as raw bytes by the C++ layer)
    column_names = [
        column_name.decode("utf-8") for column_name in column_names
    ]

    # Parse statistics; a single protobuf message object is reused
    # across all columns by _parse_column_statistics.
    cs = cs_pb2.ColumnStatistics()

    file_statistics = {
        column_names[i]: _parse_column_statistics(cs, raw_file_stats)
        for i, raw_file_stats in enumerate(raw_file_statistics)
        if columns is None or column_names[i] in columns
    }
    # If any selected column's statistics failed to parse, bail out.
    if any(
        not parsed_statistics
        for parsed_statistics in file_statistics.values()
    ):
        return None

    stripes_statistics = []
    for raw_stripe_statistics in raw_stripes_statistics:
        stripe_statistics = {
            column_names[i]: _parse_column_statistics(cs, raw_file_stats)
            for i, raw_file_stats in enumerate(raw_stripe_statistics)
            if columns is None or column_names[i] in columns
        }
        if any(
            not parsed_statistics
            for parsed_statistics in stripe_statistics.values()
        ):
            return None
        # (redundant `else:` removed — the guard above returns on failure)
        stripes_statistics.append(stripe_statistics)

    return file_statistics, stripes_statistics
def read_json(
    path_or_buf,
    engine="auto",
    dtype=True,
    lines=False,
    compression="infer",
    byte_range=None,
    *args,
    **kwargs,
):
    """{docstring}"""
    # The GPU reader only understands newline-delimited JSON.
    if engine == "cudf" and not lines:
        raise ValueError("cudf engine only supports JSON Lines format")
    if engine == "auto":
        engine = "cudf" if lines else "pandas"

    # Only a single source is supported for now.
    if not ioutils.ensure_single_filepath_or_buffer(
        path_or_data=path_or_buf, **kwargs,
    ):
        raise NotImplementedError(
            "`read_json` does not yet support reading multiple files"
        )

    path_or_buf, compression = ioutils.get_filepath_or_buffer(
        path_or_data=path_or_buf,
        compression=compression,
        iotypes=(BytesIO, StringIO),
        **kwargs,
    )

    if engine == "cudf":
        return cudf.DataFrame._from_table(
            libjson.read_json(
                path_or_buf, dtype, lines, compression, byte_range
            )
        )

    # CPU fallback via pandas.
    warnings.warn(
        "Using CPU via Pandas to read JSON dataset, this may "
        "be GPU accelerated in the future"
    )
    if kwargs.get("orient") == "table":
        # pandas rejects `dtype` together with orient="table".
        pd_value = pd.read_json(
            path_or_buf,
            lines=lines,
            compression=compression,
            *args,
            **kwargs,
        )
    else:
        pd_value = pd.read_json(
            path_or_buf,
            lines=lines,
            dtype=dtype,
            compression=compression,
            *args,
            **kwargs,
        )
    return cudf.from_pandas(pd_value)
def read_json(
    path_or_buf,
    engine="auto",
    dtype=True,
    lines=False,
    compression="infer",
    byte_range=None,
    *args,
    **kwargs,
):
    """{docstring}"""
    # The GPU reader only understands newline-delimited JSON.
    if engine == "cudf" and not lines:
        raise ValueError("cudf engine only supports JSON Lines format")
    if engine == "auto":
        engine = "cudf" if lines else "pandas"

    if engine == "cudf":
        # Normalize to a list so single and multiple sources share one
        # code path downstream.
        sources = path_or_buf if is_list_like(path_or_buf) else [path_or_buf]

        filepaths_or_buffers = []
        for source in sources:
            if ioutils.is_directory(source, **kwargs):
                # Expand a directory into a glob over its *.json files.
                fs = ioutils._ensure_filesystem(
                    passed_filesystem=None, path=source
                )
                source = ioutils.stringify_pathlike(source)
                source = fs.sep.join([source, "*.json"])

            resolved, compression = ioutils.get_filepath_or_buffer(
                path_or_data=source,
                compression=compression,
                iotypes=(BytesIO, StringIO),
                **kwargs,
            )
            if isinstance(resolved, list):
                filepaths_or_buffers.extend(resolved)
            else:
                filepaths_or_buffers.append(resolved)

        return cudf.DataFrame._from_data(
            *libjson.read_json(
                filepaths_or_buffers, dtype, lines, compression, byte_range
            )
        )

    # CPU fallback via pandas — limited to a single source.
    warnings.warn(
        "Using CPU via Pandas to read JSON dataset, this may "
        "be GPU accelerated in the future"
    )
    if not ioutils.ensure_single_filepath_or_buffer(
        path_or_data=path_or_buf, **kwargs,
    ):
        raise NotImplementedError(
            "`read_json` does not yet support reading "
            "multiple files via pandas"
        )

    path_or_buf, compression = ioutils.get_filepath_or_buffer(
        path_or_data=path_or_buf,
        compression=compression,
        iotypes=(BytesIO, StringIO),
        **kwargs,
    )

    if kwargs.get("orient") == "table":
        # pandas rejects `dtype` together with orient="table".
        pd_value = pd.read_json(
            path_or_buf,
            lines=lines,
            compression=compression,
            *args,
            **kwargs,
        )
    else:
        pd_value = pd.read_json(
            path_or_buf,
            lines=lines,
            dtype=dtype,
            compression=compression,
            *args,
            **kwargs,
        )
    return cudf.from_pandas(pd_value)
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    stripes=None,
    skiprows=None,
    num_rows=None,
    use_index=True,
    decimals_as_float=True,
    force_decimal_scale=None,
    timestamp_type=None,
    **kwargs,
):
    """{docstring}"""
    from cudf import DataFrame

    # Multi-file reads are not implemented for ORC yet.
    is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
        path_or_data=filepath_or_buffer, **kwargs,
    )
    if not is_single_filepath_or_buffer:
        raise NotImplementedError(
            "`read_orc` does not yet support reading multiple files"
        )

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs
    )
    if compression is not None:
        # BUG FIX: the ValueError was previously constructed but never
        # raised, so unsupported compressed content was silently accepted.
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    if filters is not None:
        # Use stripe-level statistics to prune stripes up front.
        selected_stripes = _filter_stripes(
            filters, filepath_or_buffer, stripes, skiprows, num_rows
        )

        # Return empty if everything was filtered
        if len(selected_stripes) == 0:
            return _make_empty_df(filepath_or_buffer, columns)
        else:
            stripes = selected_stripes

    if engine == "cudf":
        df = DataFrame._from_table(
            libcudf.orc.read_orc(
                filepath_or_buffer,
                columns,
                stripes,
                skiprows,
                num_rows,
                use_index,
                decimals_as_float,
                force_decimal_scale,
                timestamp_type,
            )
        )
    else:

        def read_orc_stripe(orc_file, stripe, columns):
            # Normalize a RecordBatch stripe to a Table for concatenation.
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
            return pa_table

        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(filepath_or_buffer)
        if stripes is not None and len(stripes) > 0:
            pa_tables = [
                read_orc_stripe(orc_file, i, columns) for i in stripes
            ]
            pa_table = pa.concat_tables(pa_tables)
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df
def read_csv(
    filepath_or_buffer,
    lineterminator="\n",
    quotechar='"',
    quoting=0,
    doublequote=True,
    header="infer",
    mangle_dupe_cols=True,
    usecols=None,
    sep=",",
    delimiter=None,
    delim_whitespace=False,
    skipinitialspace=False,
    names=None,
    dtype=None,
    skipfooter=0,
    skiprows=0,
    dayfirst=False,
    compression="infer",
    thousands=None,
    decimal=".",
    true_values=None,
    false_values=None,
    nrows=None,
    byte_range=None,
    skip_blank_lines=True,
    parse_dates=None,
    comment=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    prefix=None,
    index_col=None,
    **kwargs,
):
    """{docstring}"""
    # Reading several files at once is not implemented yet.
    if not ioutils.ensure_single_filepath_or_buffer(
        path_or_data=filepath_or_buffer, **kwargs,
    ):
        raise NotImplementedError(
            "`read_csv` does not yet support reading multiple files"
        )

    # Resolve remote/local paths into something the GPU reader accepts;
    # `compression` may be updated (e.g. inferred during resolution).
    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer,
        compression=compression,
        iotypes=(BytesIO, StringIO),
        **kwargs,
    )

    # libcudf expects a list of NA sentinels; wrap a lone scalar.
    if na_values is not None and is_scalar(na_values):
        na_values = [na_values]

    if keep_default_na is False:
        # TODO: Remove this error once the following issue is fixed:
        # https://github.com/rapidsai/cudf/issues/6680
        raise NotImplementedError(
            "keep_default_na=False is currently not supported, please refer "
            "to: https://github.com/rapidsai/cudf/issues/6680"
        )

    return libcudf.csv.read_csv(
        filepath_or_buffer,
        lineterminator=lineterminator,
        quotechar=quotechar,
        quoting=quoting,
        doublequote=doublequote,
        header=header,
        mangle_dupe_cols=mangle_dupe_cols,
        usecols=usecols,
        sep=sep,
        delimiter=delimiter,
        delim_whitespace=delim_whitespace,
        skipinitialspace=skipinitialspace,
        names=names,
        dtype=dtype,
        skipfooter=skipfooter,
        skiprows=skiprows,
        dayfirst=dayfirst,
        compression=compression,
        thousands=thousands,
        decimal=decimal,
        true_values=true_values,
        false_values=false_values,
        nrows=nrows,
        byte_range=byte_range,
        skip_blank_lines=skip_blank_lines,
        parse_dates=parse_dates,
        comment=comment,
        na_values=na_values,
        keep_default_na=keep_default_na,
        na_filter=na_filter,
        prefix=prefix,
        index_col=index_col,
    )
def read_csv(
    filepath_or_buffer,
    lineterminator="\n",
    quotechar='"',
    quoting=0,
    doublequote=True,
    header="infer",
    mangle_dupe_cols=True,
    usecols=None,
    sep=",",
    delimiter=None,
    delim_whitespace=False,
    skipinitialspace=False,
    names=None,
    dtype=None,
    skipfooter=0,
    skiprows=0,
    dayfirst=False,
    compression="infer",
    thousands=None,
    decimal=".",
    true_values=None,
    false_values=None,
    nrows=None,
    byte_range=None,
    skip_blank_lines=True,
    parse_dates=None,
    comment=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    prefix=None,
    index_col=None,
    use_python_file_object=True,
    **kwargs,
):
    """{docstring}"""
    # Reading several files at once is not implemented yet.
    if not ioutils.ensure_single_filepath_or_buffer(
        path_or_data=filepath_or_buffer, **kwargs,
    ):
        raise NotImplementedError(
            "`read_csv` does not yet support reading multiple files"
        )

    # Resolve remote/local paths into something the GPU reader accepts;
    # Arrow NativeFile objects may be passed through directly, and
    # `compression` may be updated (e.g. inferred during resolution).
    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer,
        compression=compression,
        iotypes=(BytesIO, StringIO, NativeFile),
        use_python_file_object=use_python_file_object,
        **kwargs,
    )

    # libcudf expects a list of NA sentinels; wrap a lone scalar.
    if na_values is not None and is_scalar(na_values):
        na_values = [na_values]

    return libcudf.csv.read_csv(
        filepath_or_buffer,
        lineterminator=lineterminator,
        quotechar=quotechar,
        quoting=quoting,
        doublequote=doublequote,
        header=header,
        mangle_dupe_cols=mangle_dupe_cols,
        usecols=usecols,
        sep=sep,
        delimiter=delimiter,
        delim_whitespace=delim_whitespace,
        skipinitialspace=skipinitialspace,
        names=names,
        dtype=dtype,
        skipfooter=skipfooter,
        skiprows=skiprows,
        dayfirst=dayfirst,
        compression=compression,
        thousands=thousands,
        decimal=decimal,
        true_values=true_values,
        false_values=false_values,
        nrows=nrows,
        byte_range=byte_range,
        skip_blank_lines=skip_blank_lines,
        parse_dates=parse_dates,
        comment=comment,
        na_values=na_values,
        keep_default_na=keep_default_na,
        na_filter=na_filter,
        prefix=prefix,
        index_col=index_col,
    )