def quantile(column, quant, method, exact):
    """Compute the quantiles listed in *quant* for *column*.

    Returns a list with one value per requested quantile.
    """
    gdf_context = ffi.new('gdf_context*')
    method_api = _join_method_api['sort']
    libgdf.gdf_context_view(gdf_context, 0, method_api, 0, 0, 0)
    res = []
    for q in quant:
        px = ffi.new("double *")
        if exact:
            libgdf.gdf_quantile_exact(column.cffi_view,
                                      get_quantile_method(method),
                                      q,
                                      ffi.cast('void *', px),
                                      gdf_context)
        else:
            # Spelling matches the libgdf symbol.
            libgdf.gdf_quantile_aprrox(column.cffi_view,
                                       q,
                                       ffi.cast('void *', px),
                                       gdf_context)
        res.append(px[0])
    return res
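# Hedged usage sketch: `Series` and its `._column` attribute follow the cudf
# internals used elsewhere in this module; the 'linear' interpolation name is
# an assumption about what get_quantile_method accepts.
def _example_quantile():
    import numpy as np
    from cudf.dataframe.series import Series
    sr = Series(np.arange(10, dtype=np.float64))
    # Exact algorithm; returns a plain Python list of three floats.
    return quantile(sr._column, [0.25, 0.5, 0.75], 'linear', exact=True)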
def hash_partition(input_columns, key_indices, nparts, output_columns):
    """Partition the input_columns by the hash values on the keys.

    Parameters
    ----------
    input_columns : sequence of Column
    key_indices : sequence of int
        Indices into `input_columns` that indicate the key columns.
    nparts : int
        Number of partitions.
    output_columns : sequence of Column
        Preallocated output columns; must match `input_columns` in number,
        dtype and total size.

    Returns
    -------
    partition_offsets : list of int
        Each index indicates the start of a partition.
    """
    assert len(input_columns) == len(output_columns)

    col_inputs = [col.cffi_view for col in input_columns]
    col_outputs = [col.cffi_view for col in output_columns]
    offsets = ffi.new('int[]', nparts)
    hashfn = libgdf.GDF_HASH_MURMUR3

    libgdf.gdf_hash_partition(len(col_inputs), col_inputs, key_indices,
                              len(key_indices), nparts, col_outputs, offsets,
                              hashfn)

    offsets = list(offsets)
    return offsets
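# Hedged usage sketch: `in_cols` and the preallocated `out_cols` are assumed
# to be cudf Column objects of matching dtypes and sizes; only hash_partition
# itself is defined in this module.
def _example_hash_partition(in_cols, out_cols, nparts=4):
    # Partition on the first column; rows with equal keys land in the same
    # output partition.
    offsets = hash_partition(in_cols, key_indices=[0], nparts=nparts,
                             output_columns=out_cols)
    # offsets[i] is the row at which partition i starts in the output columns.
    return offsets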
def columnview(size, data, mask=None, dtype=None):
    """Make a column view.

    Parameters
    ----------
    size : int
        Data count.
    data : Buffer
        The data buffer.
    mask : Buffer; optional
        The mask buffer.
    dtype : numpy.dtype; optional
        The dtype of the data.  Defaults to *data.dtype*.
    """
    def unwrap(buffer):
        if buffer is None:
            return ffi.NULL
        assert buffer.mem.is_c_contiguous(), "libGDF expects contiguous memory"
        devary = buffer.to_gpu_array()
        return unwrap_devary(devary)

    dtype = dtype or data.dtype
    colview = ffi.new('gdf_column*')
    libgdf.gdf_column_view(colview, unwrap(data), unwrap(mask), size,
                           np_to_gdf_dtype(dtype))
    return colview
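# Hedged usage sketch: `Buffer` is assumed to be cudf's buffer wrapper of
# this era (cudf.dataframe.buffer.Buffer) and to accept a host numpy array.
def _example_columnview():
    import numpy as np
    from cudf.dataframe.buffer import Buffer
    buf = Buffer(np.arange(4, dtype=np.int32))
    view = columnview(size=4, data=buf)  # dtype defaults to buf.dtype
    return view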
def _apply_basic_agg(self, agg_type):
    """
    Parameters
    ----------
    agg_type : str
        The aggregation function to run.
    """
    result = DataFrame()
    add_col_values = True

    ctx = ffi.new('gdf_context*')
    ctx.flag_sorted = 0
    ctx.flag_method = self._method
    ctx.flag_distinct = 0

    val_columns = self._val_columns
    val_columns_out = [agg_type + "_" + column for column in val_columns]

    return self._apply_agg(agg_type, result, add_col_values, ctx,
                           val_columns, val_columns_out, sort_result=False)
def apply_join(col_lhs, col_rhs, how):
    """Yield a tuple of the left and right joined indices as GPU arrays,
    then free the libgdf join result.
    """
    if len(col_lhs) != len(col_rhs):
        msg = "Unequal #columns in list 'col_lhs' and list 'col_rhs'"
        raise ValueError(msg)

    joiner = _join_how_api[how]
    join_result_ptr = ffi.new("gdf_join_result_type**", None)

    if how == 'left':
        list_lhs = [col.cffi_view for col in col_lhs]
        list_rhs = [col.cffi_view for col in col_rhs]
        # Call libgdf
        joiner(len(col_lhs), list_lhs, list_rhs, join_result_ptr)
    else:
        joiner(col_lhs[0].cffi_view, col_rhs[0].cffi_view, join_result_ptr)

    # Extract result
    join_result = join_result_ptr[0]
    dataptr = libgdf.gdf_join_result_data(join_result)
    datasize = libgdf.gdf_join_result_size(join_result)
    ary = _as_numba_devarray(intaddr=int(ffi.cast("uintptr_t", dataptr)),
                             nelem=datasize, dtype=np.int32)
    ary = ary.reshape(2, datasize // 2)
    yield (ary[0], ary[1]) if datasize > 0 else (ary, ary)
    libgdf.gdf_join_result_free(join_result)
def _call_join_multi(api, ncols, col_left, col_right, ctxt):
    join_result_ptr = ffi.new("gdf_join_result_type**", None)
    api(ncols, col_left, col_right, join_result_ptr, ctxt)
    join_result = join_result_ptr[0]
    dataptr = libgdf.gdf_join_result_data(join_result)
    datasize = libgdf.gdf_join_result_size(join_result)
    addr = ctypes.c_uint64(int(ffi.cast("uintptr_t", dataptr)))
    # Wrap the raw result buffer as an int32 device array (4 bytes/element).
    memptr = cuda.driver.MemoryPointer(context=cuda.current_context(),
                                       pointer=addr, size=4 * datasize)
    ary = cuda.devicearray.DeviceNDArray(shape=(datasize,), strides=(4,),
                                         dtype=np.dtype(np.int32),
                                         gpu_data=memptr)
    joined_idx = ary.reshape(2, datasize // 2).copy_to_host()
    libgdf.gdf_join_result_free(join_result)
    return joined_idx
def count_nonzero_mask(mask, size):
    assert mask.size * mask_bitsize >= size
    nnz = ffi.new('int*')
    nnz[0] = 0
    mask_ptr, addr = unwrap_mask(mask)

    if addr != ffi.NULL:
        libgdf.gdf_count_nonzero_mask(mask_ptr, size, nnz)

    return nnz[0]
def apply_join(col_lhs, col_rhs, how, method='hash'):
    """Yield a tuple of the left and right joined indices as GPU arrays,
    then free the libgdf result columns.
    """
    if len(col_lhs) != len(col_rhs):
        msg = "Unequal #columns in list 'col_lhs' and list 'col_rhs'"
        raise ValueError(msg)

    joiner = _join_how_api[how]
    method_api = _join_method_api[method]
    gdf_context = ffi.new('gdf_context*')

    if method == 'hash':
        libgdf.gdf_context_view(gdf_context, 0, method_api, 0)
    elif method == 'sort':
        libgdf.gdf_context_view(gdf_context, 1, method_api, 0)
    else:
        msg = "method not supported"
        raise ValueError(msg)

    col_result_l = columnview(0, None, dtype=np.int32)
    col_result_r = columnview(0, None, dtype=np.int32)

    if how in ['left', 'inner']:
        list_lhs = [col.cffi_view for col in col_lhs]
        list_rhs = [col.cffi_view for col in col_rhs]
        # Call libgdf
        joiner(len(col_lhs), list_lhs, list_rhs, col_result_l, col_result_r,
               gdf_context)
    else:
        joiner(col_lhs[0].cffi_view, col_rhs[0].cffi_view, col_result_l,
               col_result_r)

    # Extract result
    left = _as_numba_devarray(
        intaddr=int(ffi.cast("uintptr_t", col_result_l.data)),
        nelem=col_result_l.size, dtype=np.int32)
    right = _as_numba_devarray(
        intaddr=int(ffi.cast("uintptr_t", col_result_r.data)),
        nelem=col_result_r.size, dtype=np.int32)
    yield (left, right)

    libgdf.gdf_column_free(col_result_l)
    libgdf.gdf_column_free(col_result_r)
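# Hedged usage sketch: apply_join is a generator, so the libgdf result stays
# alive while the caller consumes the index arrays; exhausting the generator
# triggers the frees above.
def _example_apply_join(lhs_cols, rhs_cols):
    join_iter = apply_join(lhs_cols, rhs_cols, how='inner', method='hash')
    left_idx, right_idx = next(join_iter)
    left_host = left_idx.copy_to_host()    # copy out before freeing
    right_host = right_idx.copy_to_host()
    for _ in join_iter:  # run to completion so the result columns are freed
        pass
    return left_host, right_host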
def count_nulls(mask, size):
    mask_bitsize = 8
    assert mask.size * mask_bitsize >= size
    nnz = ffi.new('int*')
    nnz[0] = 0
    mask_ptr = unwrap_devary(mask)

    if mask_ptr != ffi.NULL:
        libgdf.gdf_count_nonzero_mask(mask_ptr, size, nnz)

    return size - nnz[0]
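# Hedged illustration of the bitmask layout assumed above: one validity bit
# per row, packed into bytes, so ceil(size / 8) mask bytes cover `size` rows.
def _example_null_count():
    import numpy as np
    size = 10
    # Rows 0..7 valid, rows 8..9 null -> bytes [0xFF, 0x00].
    host_mask = np.array([0xFF, 0x00], dtype=np.uint8)
    d_mask = rmm.to_device(host_mask)
    return count_nulls(d_mask, size)  # expected: 2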
def _apply_basic_agg(self, agg_type, sort_results=False):
    """
    Parameters
    ----------
    agg_type : str
        The aggregation function to run.
    """
    result = DataFrame()
    add_col_values = True

    ctx = ffi.new('gdf_context*')
    ctx.flag_sorted = 0
    ctx.flag_method = self._method
    ctx.flag_distinct = 0

    val_columns = self._val_columns
    val_columns_out = self._val_columns

    result = self._apply_agg(agg_type, result, add_col_values, ctx,
                             val_columns, val_columns_out,
                             sort_result=sort_results)

    # If a Groupby has one index column and one value column
    # and as_index is set, return a Series instead of a df
    if isinstance(val_columns, (str, Number)) and self._as_index:
        result_series = result[val_columns]
        idx = index.as_index(result[self._by[0]])
        if self.level == 0:
            idx.name = self._original_index_name
        else:
            idx.name = self._by[0]
        result_series = result_series.set_index(idx)
        return result_series

    # TODO: Do MultiIndex here
    if self._as_index:
        idx = index.as_index(result[self._by[0]])
        idx.name = self._by[0]
        result.drop_column(idx.name)
        if self.level == 0:
            idx.name = self._original_index_name
        else:
            idx.name = self._by[0]
        result = result.set_index(idx)

    nvtx_range_pop()

    return result
def _columnview(size, data, mask, dtype, null_count):
    colview = ffi.new('gdf_column*')
    extra_dtype_info = ffi.new('gdf_dtype_extra_info*')
    extra_dtype_info.time_unit = libgdf.TIME_UNIT_NONE
    if null_count is None:
        libgdf.gdf_column_view(
            colview,
            data,
            mask,
            size,
            np_to_gdf_dtype(dtype),
        )
    else:
        libgdf.gdf_column_view_augmented(
            colview,
            data,
            mask,
            size,
            np_to_gdf_dtype(dtype),
            null_count,
            extra_dtype_info[0],
        )
    return colview
def _columnview(size, data, mask, dtype, null_count, nvcat):
    colview = ffi.new('gdf_column*')
    extra_dtype_info = ffi.new('gdf_dtype_extra_info*')
    extra_dtype_info.time_unit = libgdf.TIME_UNIT_NONE
    if nvcat is not None:
        extra_dtype_info.category = ffi.cast('void*', nvcat.get_cpointer())
    else:
        extra_dtype_info.category = ffi.NULL

    if mask is None:
        null_count = 0
        mask = ffi.NULL

    libgdf.gdf_column_view_augmented(
        colview,
        data,
        mask,
        size,
        np_to_gdf_dtype(dtype),
        null_count,
        extra_dtype_info[0],
    )
    return colview
def apply_join(col_lhs, col_rhs, how):
    """Yield a tuple of the left and right joined indices as GPU arrays,
    then free the libgdf join result.
    """
    joiner = _join_how_api[how]
    join_result_ptr = ffi.new("gdf_join_result_type**", None)

    # Call libgdf
    joiner(col_lhs.cffi_view, col_rhs.cffi_view, join_result_ptr)

    # Extract result
    join_result = join_result_ptr[0]
    dataptr = libgdf.gdf_join_result_data(join_result)
    datasize = libgdf.gdf_join_result_size(join_result)
    ary = _as_numba_devarray(intaddr=int(ffi.cast("uintptr_t", dataptr)),
                             nelem=datasize, dtype=np.int32)
    ary = ary.reshape(2, datasize // 2)
    yield (ary[0], ary[1]) if datasize > 0 else (ary, ary)
    libgdf.gdf_join_result_free(join_result)
def columnview(size, data, mask=None, dtype=None):
    """Make a column view."""
    def unwrap(buffer):
        if buffer is None:
            return ffi.NULL
        devary = buffer.to_gpu_array()
        return unwrap_devary(devary)

    dtype = dtype or data.dtype
    colview = ffi.new('gdf_column*')
    libgdf.gdf_column_view(colview, unwrap(data), unwrap(mask), size,
                           np_to_gdf_dtype(dtype))
    return colview
def columnview(size, data, mask=None, dtype=None):
    """Make a column view."""
    def unwrap(buffer):
        if buffer is None:
            return ffi.NULL
        devary = buffer.to_gpu_array()
        return ffi.cast('void*', devary.device_ctypes_pointer.value)

    dtype = dtype or data.dtype
    colview = ffi.new('gdf_column*')
    libgdf.gdf_column_view(colview, unwrap(data), unwrap(mask), size,
                           np_to_gdf_dtype(dtype))
    return colview
def _columnview(size, data, mask, dtype, null_count): colview = ffi.new('gdf_column*') if null_count is None: libgdf.gdf_column_view( colview, data, mask, size, np_to_gdf_dtype(dtype), ) else: libgdf.gdf_column_view_augmented( colview, data, mask, size, np_to_gdf_dtype(dtype), null_count, ) return colview
def nvtx_range_push(name, color='green'):
    """Demarcate the beginning of a user-defined NVTX range.

    Parameters
    ----------
    name : str
        The name of the NVTX range.
    color : str
        The color to use for the range.  Can be a named color or a hex RGB
        string.
    """
    name_c = ffi.new("char[]", name.encode('ascii'))
    try:
        color = int(color, 16)  # only works if color is a hex string
        libgdf.gdf_nvtx_range_push_hex(name_c,
                                       ffi.cast('unsigned int', color))
    except ValueError:
        color = str_to_gdf_color(color)
        libgdf.gdf_nvtx_range_push(name_c, color)
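# Hedged usage sketch: NVTX ranges must be strictly nested, so every push
# needs a matching nvtx_range_pop() (used elsewhere in this module), even on
# an error path.
def _example_nvtx_range():
    nvtx_range_push("MY_RANGE", "purple")  # named color; hex also accepted
    try:
        pass  # ... GPU work to be profiled ...
    finally:
        nvtx_range_pop()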
def test_prefixsum_masked(dtype, nelem):
    if dtype == np.int8:
        # Keep small-int data in a narrow range to avoid overflow.
        data = gen_rand(dtype, nelem, low=-2, high=2)
    else:
        data = gen_rand(dtype, nelem)
    mask = gen_rand(np.int8, (nelem + 8 - 1) // 8)
    dummy_mask = gen_rand(np.int8, (nelem + 8 - 1) // 8)

    d_data = rmm.to_device(data)
    d_mask = rmm.to_device(mask)

    d_result = rmm.device_array(d_data.size, dtype=d_data.dtype)
    d_result_mask = rmm.to_device(dummy_mask)

    gdf_dtype = get_dtype(dtype)

    extra_dtype_info = ffi.new('gdf_dtype_extra_info*')
    extra_dtype_info.time_unit = libgdf.TIME_UNIT_NONE

    col_data = new_column()
    libgdf.gdf_column_view_augmented(col_data, unwrap_devary(d_data),
                                     unwrap_devary(d_mask), nelem, gdf_dtype,
                                     count_nulls(d_mask, nelem),
                                     extra_dtype_info[0])

    col_result = new_column()
    libgdf.gdf_column_view(col_result, unwrap_devary(d_result),
                           unwrap_devary(d_result_mask), nelem, gdf_dtype)

    inclusive = True
    libgdf.gdf_prefixsum(col_data, col_result, inclusive)

    boolmask = buffer_as_bits(mask)[:nelem]
    expect = np.cumsum(data[boolmask])
    got = d_result.copy_to_host()[boolmask]
    if not inclusive:
        # An exclusive scan starts with 0 and drops the final total.
        expect = expect[:-1]
        assert got[0] == 0
        got = got[1:]

    decimal = 4 if dtype == np.float32 else 6
    np.testing.assert_array_almost_equal(expect, got, decimal=decimal)
def libgdf_join(col_lhs, col_rhs, on, how, method='sort'):
    joiner = _join_how_api[how]
    method_api = _join_method_api[method]
    gdf_context = ffi.new('gdf_context*')

    libgdf.gdf_context_view(gdf_context, 0, method_api, 0, 0, 0)

    if how not in ['left', 'inner', 'outer']:
        msg = "new join api only supports left, inner or outer"
        raise ValueError(msg)

    list_lhs = []
    list_rhs = []
    result_cols = []
    result_col_names = []

    left_idx = []
    right_idx = []

    for name, col in col_lhs.items():
        list_lhs.append(col._column.cffi_view)
        if name not in on:
            result_cols.append(columnview(0, None, dtype=col._column.dtype))
            result_col_names.append(name)

    for name in on:
        result_cols.append(columnview(0, None,
                                      dtype=col_lhs[name]._column.dtype))
        result_col_names.append(name)
        left_idx.append(list(col_lhs.keys()).index(name))
        right_idx.append(list(col_rhs.keys()).index(name))

    for name, col in col_rhs.items():
        list_rhs.append(col._column.cffi_view)
        if name not in on:
            result_cols.append(columnview(0, None, dtype=col._column.dtype))
            result_col_names.append(name)

    num_cols_to_join = len(on)
    result_num_cols = len(list_lhs) + len(list_rhs) - num_cols_to_join

    joiner(list_lhs, len(list_lhs), left_idx, list_rhs, len(list_rhs),
           right_idx, num_cols_to_join, result_num_cols, result_cols,
           ffi.NULL, ffi.NULL, gdf_context)

    res = []
    valids = []

    for col in result_cols:
        intaddr = int(ffi.cast("uintptr_t", col.data))
        res.append(rmm.device_array_from_ptr(
            ptr=intaddr,
            nelem=col.size,
            dtype=gdf_to_np_dtype(col.dtype),
            finalizer=rmm._make_finalizer(intaddr, 0)))
        intaddr = int(ffi.cast("uintptr_t", col.valid))
        valids.append(rmm.device_array_from_ptr(
            ptr=intaddr,
            nelem=calc_chunk_size(col.size, mask_bitsize),
            dtype=mask_dtype,
            finalizer=rmm._make_finalizer(intaddr, 0)))

    return res, valids
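# Hedged usage sketch: `lhs` and `rhs` are assumed to be name -> Series
# mappings, matching the `.items()` / `._column.cffi_view` access pattern
# used above.
def _example_libgdf_join(lhs, rhs):
    res, valids = libgdf_join(lhs, rhs, on=['key'], how='inner',
                              method='hash')
    # res[i] / valids[i] are rmm-backed device arrays for the i-th result
    # column, in the order built above: lhs non-key columns, the key
    # columns, then rhs non-key columns.
    return res, valids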
def new_column():
    return ffi.new('gdf_column*')
def new_context():
    return ffi.new('gdf_context*')
def _columnview(size, data, mask, dtype):
    colview = ffi.new('gdf_column*')
    libgdf.gdf_column_view(colview, data, mask, size, np_to_gdf_dtype(dtype))
    return colview
def read_csv(filepath_or_buffer, lineterminator='\n',
             quotechar='"', quoting=0, doublequote=True,
             header='infer',
             mangle_dupe_cols=True, usecols=None,
             sep=',', delimiter=None, delim_whitespace=False,
             skipinitialspace=False, names=None, dtype=None,
             skipfooter=0, skiprows=0, dayfirst=False, compression='infer',
             thousands=None, decimal='.', true_values=None, false_values=None,
             nrows=None, byte_range=None, skip_blank_lines=True, comment=None,
             na_values=None, keep_default_na=True, na_filter=True,
             prefix=None, index_col=None):
    """Load and parse a CSV file into a DataFrame

    Parameters
    ----------
    filepath_or_buffer : str
        Path of file to be read or a file-like object containing the file.
    sep : char, default ','
        Delimiter to be used.
    delimiter : char, default None
        Alternative argument name for sep.
    delim_whitespace : bool, default False
        Determines whether to use whitespace as delimiter.
    lineterminator : char, default '\\n'
        Character to indicate end of line.
    skipinitialspace : bool, default False
        Skip spaces after delimiter.
    names : list of str, default None
        List of column names to be used.
    dtype : list of str or dict of {col: dtype}, default None
        List of data types in the same order of the column names
        or a dictionary with column_name:dtype (pandas style).
    quotechar : char, default '"'
        Character to indicate start and end of quote item.
    quoting : str or int, default 0
        Controls quoting behavior. Set to one of
        0 (csv.QUOTE_MINIMAL), 1 (csv.QUOTE_ALL),
        2 (csv.QUOTE_NONNUMERIC) or 3 (csv.QUOTE_NONE).
        Quoting is enabled with all values except 3.
    doublequote : bool, default True
        When quoting is enabled, indicates whether to interpret two
        consecutive quotechar inside fields as single quotechar.
    header : int, default 'infer'
        Row number to use as the column names. Default behavior is to infer
        the column names: if no names are passed, header=0;
        if column names are passed explicitly, header=None.
    usecols : list of int or str, default None
        Returns subset of the columns given in the list. All elements must be
        either integer indices (column number) or strings that correspond to
        column names.
    mangle_dupe_cols : boolean, default True
        Duplicate columns will be specified as 'X','X.1',...'X.N'.
    skiprows : int, default 0
        Number of rows to be skipped from the start of file.
    skipfooter : int, default 0
        Number of rows to be skipped at the bottom of file.
    compression : {'infer', 'gzip', 'zip', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then detect
        compression from the following extensions: '.gz', '.zip' (otherwise
        no decompression). If using 'zip', the ZIP file must contain only one
        data file to be read in, otherwise the first non-zero-sized file will
        be used. Set to None for no decompression.
    decimal : char, default '.'
        Character used as a decimal point.
    thousands : char, default None
        Character used as a thousands delimiter.
    true_values : list, default None
        Values to consider as boolean True.
    false_values : list, default None
        Values to consider as boolean False.
    nrows : int, default None
        If specified, maximum number of rows to read.
    byte_range : list or tuple, default None
        Byte range within the input file to be read. The first number is the
        offset in bytes, the second number is the range size in bytes. Set
        the size to zero to read all data after the offset location. Reads
        the row that starts before or at the end of the range, even if it
        ends after the end of the range.
    skip_blank_lines : bool, default True
        If True, discard and do not parse empty lines.
        If False, interpret empty lines as NaN values.
    comment : char, default None
        Character used as a comments indicator. If found at the beginning of
        a line, the line will be ignored altogether.
    na_values : list, default None
        Values to consider as invalid.
    keep_default_na : bool, default True
        Whether or not to include the default NA values when parsing the
        data.
    na_filter : bool, default True
        Detect missing values (empty strings and the values in na_values).
        Passing False can improve performance.
    prefix : str, default None
        Prefix to add to column numbers when parsing without a header row.
    index_col : int or string, default None
        Column to use as the row labels.

    Returns
    -------
    GPU ``DataFrame`` object.

    Examples
    --------
    Create a test csv file

    >>> import cudf
    >>> filename = 'foo.csv'
    >>> lines = [
    ...     "num1,datetime,text",
    ...     "123,2018-11-13T12:00:00,abc",
    ...     "456,2018-11-14T12:35:01,def",
    ...     "789,2018-11-15T18:02:59,ghi"
    ... ]
    >>> with open(filename, 'w') as fp:
    ...     fp.write('\\n'.join(lines)+'\\n')

    Read the file with ``cudf.read_csv``

    >>> cudf.read_csv(filename)
      num1                datetime text
    0  123 2018-11-13T12:00:00.000 5451
    1  456 2018-11-14T12:35:01.000 5784
    2  789 2018-11-15T18:02:59.000 6117

    See Also
    --------
    .read_csv_strings
    """

    if delim_whitespace:
        if delimiter is not None:
            raise ValueError("cannot set both delimiter and delim_whitespace")
        if sep != ',':
            raise ValueError("cannot set both sep and delim_whitespace")

    # Alias sep -> delimiter.
    if delimiter is None:
        delimiter = sep

    if dtype is not None:
        if isinstance(dtype, collections.abc.Mapping):
            dtype_dict = True
        elif isinstance(dtype, collections.abc.Iterable):
            dtype_dict = False
        else:
            msg = '''dtype must be 'list like' or 'dict' '''
            raise TypeError(msg)
        if names is not None and len(dtype) != len(names):
            msg = '''All column dtypes must be specified.'''
            raise TypeError(msg)

    nvtx_range_push("CUDF_READ_CSV", "purple")

    csv_reader = ffi.new('csv_read_arg*')

    # Populate csv_reader struct
    if is_file_like(filepath_or_buffer):
        if compression == 'infer':
            compression = None
        buffer = filepath_or_buffer.read()
        # check if StringIO is used
        if hasattr(buffer, 'encode'):
            buffer_as_bytes = buffer.encode()
        else:
            buffer_as_bytes = buffer
        buffer_data_holder = ffi.new("char[]", buffer_as_bytes)

        csv_reader.input_data_form = libgdf.HOST_BUFFER
        csv_reader.filepath_or_buffer = buffer_data_holder
        csv_reader.buffer_size = len(buffer_as_bytes)
    else:
        if not os.path.isfile(filepath_or_buffer):
            raise FileNotFoundError(filepath_or_buffer)

        file_path = _wrap_string(filepath_or_buffer)

        csv_reader.input_data_form = libgdf.FILE_PATH
        csv_reader.filepath_or_buffer = file_path

    if header == 'infer':
        header = -1
    header_infer = header
    arr_names = []
    arr_dtypes = []
    if names is None:
        if header == -1:
            header_infer = 0
        if header is None:
            header_infer = -1
        csv_reader.names = ffi.NULL
        csv_reader.num_cols = 0
    else:
        if header is None:
            header_infer = -1
        csv_reader.num_cols = len(names)
        for col_name in names:
            arr_names.append(_wrap_string(col_name))
            if dtype is not None:
                if dtype_dict:
                    arr_dtypes.append(_wrap_string(str(dtype[col_name])))
        names_ptr = ffi.new('char*[]', arr_names)
        csv_reader.names = names_ptr

    if dtype is None:
        csv_reader.dtype = ffi.NULL
    else:
        if not dtype_dict:
            for col_dtype in dtype:
                arr_dtypes.append(_wrap_string(str(col_dtype)))
        dtype_ptr = ffi.new('char*[]', arr_dtypes)
        csv_reader.dtype = dtype_ptr

    csv_reader.use_cols_int = ffi.NULL
    csv_reader.use_cols_int_len = 0
    csv_reader.use_cols_char = ffi.NULL
    csv_reader.use_cols_char_len = 0

    if usecols is not None:
        arr_col_names = []
        if all(isinstance(x, int) for x in usecols):
            usecols_ptr = ffi.new('int[]', usecols)
            csv_reader.use_cols_int = usecols_ptr
            csv_reader.use_cols_int_len = len(usecols)
        else:
            for col_name in usecols:
                arr_col_names.append(_wrap_string(col_name))
            col_names_ptr = ffi.new('char*[]', arr_col_names)
            csv_reader.use_cols_char = col_names_ptr
            csv_reader.use_cols_char_len = len(usecols)

    if decimal == delimiter:
        raise ValueError("decimal cannot be the same as delimiter")

    if thousands == delimiter:
        raise ValueError("thousands cannot be the same as delimiter")

    if nrows is not None and skipfooter != 0:
        raise ValueError("cannot use both nrows and skipfooter parameters")

    if byte_range is not None:
        if skipfooter != 0 or skiprows != 0 or nrows is not None:
            raise ValueError("""cannot manually limit rows to be read when
                                using the byte range parameter""")

    arr_true_values = []
    for value in true_values or []:
        arr_true_values.append(_wrap_string(str(value)))
    arr_true_values_ptr = ffi.new('char*[]', arr_true_values)
    csv_reader.true_values = arr_true_values_ptr
    csv_reader.num_true_values = len(arr_true_values)

    arr_false_values = []
    for value in false_values or []:
        arr_false_values.append(_wrap_string(str(value)))
    false_values_ptr = ffi.new('char*[]', arr_false_values)
    csv_reader.false_values = false_values_ptr
    csv_reader.num_false_values = len(arr_false_values)

    arr_na_values = []
    for value in na_values or []:
        arr_na_values.append(_wrap_string(str(value)))
    arr_na_values_ptr = ffi.new('char*[]', arr_na_values)
    csv_reader.na_values = arr_na_values_ptr
    csv_reader.num_na_values = len(arr_na_values)

    compression_bytes = _wrap_string(compression)
    prefix_bytes = _wrap_string(prefix)

    csv_reader.delimiter = delimiter.encode()
    csv_reader.lineterminator = lineterminator.encode()
    csv_reader.quotechar = quotechar.encode()
    csv_reader.quoting = _quoting_enum[quoting]
    csv_reader.doublequote = doublequote
    csv_reader.delim_whitespace = delim_whitespace
    csv_reader.skipinitialspace = skipinitialspace
    csv_reader.dayfirst = dayfirst
    csv_reader.header = header_infer
    csv_reader.skiprows = skiprows
    csv_reader.skipfooter = skipfooter
    csv_reader.mangle_dupe_cols = mangle_dupe_cols
    csv_reader.windowslinetermination = False
    csv_reader.compression = compression_bytes
    csv_reader.decimal = decimal.encode()
    csv_reader.thousands = thousands.encode() if thousands else b'\0'
    csv_reader.nrows = nrows if nrows is not None else -1
    if byte_range is not None:
        csv_reader.byte_range_offset = byte_range[0]
        csv_reader.byte_range_size = byte_range[1]
    else:
        csv_reader.byte_range_offset = 0
        csv_reader.byte_range_size = 0
    csv_reader.skip_blank_lines = skip_blank_lines
    csv_reader.comment = comment.encode() if comment else b'\0'
    csv_reader.keep_default_na = keep_default_na
    csv_reader.na_filter = na_filter
    csv_reader.prefix = prefix_bytes

    # Call read_csv
    libgdf.read_csv(csv_reader)

    out = csv_reader.data
    if out == ffi.NULL:
        raise ValueError("Failed to parse CSV")

    # Extract parsed columns
    outcols = []
    new_names = []
    for i in range(csv_reader.num_cols_out):
        newcol = Column.from_cffi_view(out[i])
        new_names.append(ffi.string(out[i].col_name).decode())
        if newcol.dtype == np.dtype('datetime64[ms]'):
            outcols.append(newcol.view(DatetimeColumn,
                                       dtype='datetime64[ms]'))
        else:
            outcols.append(newcol.view(NumericalColumn, dtype=newcol.dtype))

    # Build dataframe
    df = DataFrame()
    for k, v in zip(new_names, outcols):
        df[k] = v

    # Set index if the index_col parameter is passed
    if index_col is not None and index_col is not False:
        if isinstance(index_col, int):
            df = df.set_index(df.columns[index_col])
        else:
            df = df.set_index(index_col)

    nvtx_range_pop()

    return df
def read_csv(filepath, lineterminator='\n',
             quotechar='"', quoting=True, doublequote=True,
             delimiter=',', sep=None, delim_whitespace=False,
             skipinitialspace=False, names=None, dtype=None,
             skipfooter=0, skiprows=0, dayfirst=False):
    """Load and parse a CSV file into a DataFrame

    Parameters
    ----------
    filepath : str
        Path of file to be read.
    delimiter : char, default ','
        Delimiter to be used.
    delim_whitespace : bool, default False
        Determines whether to use whitespace as delimiter.
    lineterminator : char, default '\\n'
        Character to indicate end of line.
    skipinitialspace : bool, default False
        Skip spaces after delimiter.
    names : list of str, default None
        List of column names to be used.
    dtype : list of str or dict of {col: dtype}, default None
        List of data types in the same order of the column names
        or a dictionary with column_name:dtype (pandas style).
    quotechar : char, default '"'
        Character to indicate start and end of quote item.
    quoting : bool, default True
        If True, start and end quotechar are removed from returned strings.
        If False, start and end quotechar are kept in returned strings.
    doublequote : bool, default True
        When quotechar is specified and quoting is True, indicates whether to
        interpret two consecutive quotechar inside fields as single
        quotechar.
    skiprows : int, default 0
        Number of rows to be skipped from the start of file.
    skipfooter : int, default 0
        Number of rows to be skipped at the bottom of file.

    Returns
    -------
    GPU ``DataFrame`` object.

    Examples
    --------
    foo.txt : ::

        50,50|40,60|30,70|20,80|

    >>> import cudf
    >>> df = cudf.read_csv('foo.txt', delimiter=',', lineterminator='|',
    ...                    names=['col1', 'col2'], dtype=['int64', 'int64'],
    ...                    skiprows=1, skipfooter=1)
    >>> df
      col1 col2
    0   40   60
    1   30   70
    """
    if names is None or dtype is None:
        msg = '''Automatic dtype detection not implemented:
        Column names and dtypes must be specified.'''
        raise TypeError(msg)

    if isinstance(dtype, dict):
        dtype_dict = True
    elif isinstance(dtype, list):
        dtype_dict = False
        if len(dtype) != len(names):
            msg = '''All column dtypes must be specified.'''
            raise TypeError(msg)
    else:
        msg = '''dtype must be 'list' or 'dict' '''
        raise TypeError(msg)

    nvtx_range_push("PYGDF_READ_CSV", "purple")

    csv_reader = ffi.new('csv_read_arg*')

    # Populate csv_reader struct
    file_path = _wrap_string(filepath)
    csv_reader.file_path = file_path

    arr_names = []
    arr_dtypes = []
    for col_name in names:
        arr_names.append(_wrap_string(col_name))
        if dtype_dict:
            arr_dtypes.append(_wrap_string(str(dtype[col_name])))
    names_ptr = ffi.new('char*[]', arr_names)
    csv_reader.names = names_ptr

    if not dtype_dict:
        for col_dtype in dtype:
            arr_dtypes.append(_wrap_string(str(col_dtype)))
    dtype_ptr = ffi.new('char*[]', arr_dtypes)
    csv_reader.dtype = dtype_ptr

    csv_reader.delimiter = delimiter.encode()
    csv_reader.lineterminator = lineterminator.encode()
    csv_reader.quotechar = quotechar.encode()
    csv_reader.quoting = quoting
    csv_reader.doublequote = doublequote
    csv_reader.delim_whitespace = delim_whitespace
    csv_reader.skipinitialspace = skipinitialspace
    csv_reader.dayfirst = dayfirst
    csv_reader.num_cols = len(names)
    csv_reader.skiprows = skiprows
    csv_reader.skipfooter = skipfooter

    # Call read_csv
    libgdf.read_csv(csv_reader)

    out = csv_reader.data
    if out == ffi.NULL:
        raise ValueError("Failed to parse CSV")

    # Extract parsed columns
    outcols = []
    for i in range(csv_reader.num_cols_out):
        newcol = Column.from_cffi_view(out[i])
        if newcol.dtype == np.dtype('datetime64[ms]'):
            outcols.append(newcol.view(DatetimeColumn,
                                       dtype='datetime64[ms]'))
        else:
            outcols.append(newcol.view(NumericalColumn, dtype=newcol.dtype))

    # Build dataframe
    df = DataFrame()
    for k, v in zip(names, outcols):
        df[k] = v

    nvtx_range_pop()

    return df
def read_csv_strings(filepath_or_buffer, lineterminator='\n',
                     quotechar='"', quoting=0, doublequote=True,
                     header='infer',
                     sep=',', delimiter=None, delim_whitespace=False,
                     skipinitialspace=False, names=None, dtype=None,
                     skipfooter=0, skiprows=0, dayfirst=False,
                     compression='infer', thousands=None, decimal='.',
                     true_values=None, false_values=None, nrows=None,
                     byte_range=None, skip_blank_lines=True, comment=None,
                     na_values=None, keep_default_na=True, na_filter=True,
                     prefix=None, index_col=None):
    """
    **Experimental**: This function exists only as a beta way to use
    `nvstrings <https://nvstrings.readthedocs.io/en/latest/>`_ with cudf.
    Future versions of cuDF will provide cleaner integration.

    Uses mostly the same arguments as read_csv.
    Note: Doesn't currently support auto-column detection, header, usecols
    and mangle_dupe_cols args.

    Returns
    -------
    columns : ordered list of cudf.dataframe.Series and nvstrings objects
        Numeric or date dtyped columns will be Series.
        'str' dtyped columns will be
        `nvstrings <https://nvstrings.readthedocs.io/en/latest/>`_.

    Examples
    --------
    Create a test csv file

    >>> import cudf
    >>> filename = 'foo.csv'
    >>> lines = [
    ...     "num1,datetime,text",
    ...     "123,2018-11-13T12:00:00,abc",
    ...     "456,2018-11-14T12:35:01,def",
    ...     "789,2018-11-15T18:02:59,ghi"
    ... ]
    >>> with open(filename, 'w') as fp:
    ...     fp.write('\\n'.join(lines)+'\\n')

    Read the file with cudf

    >>> names = ['num1', 'datetime', 'text']
    >>> dtypes = ['int', 'date', 'str']
    >>> columns = cudf.io.csv.read_csv_strings(filename, delimiter=',',
    ...                                        names=names, dtype=dtypes,
    ...                                        skiprows=1)

    Display results

    >>> print(columns[0])
    0 123
    1 456
    2 789
    >>> print(columns[2])
    ['abc', 'def', 'ghi']

    See Also
    --------
    .read_csv
    """
    import nvstrings
    from cudf.dataframe.series import Series

    # Alias sep -> delimiter.
    if delimiter is None:
        delimiter = sep

    if dtype is not None:
        if isinstance(dtype, collections.abc.Mapping):
            dtype_dict = True
        elif isinstance(dtype, collections.abc.Iterable):
            dtype_dict = False
        else:
            msg = '''dtype must be 'list like' or 'dict' '''
            raise TypeError(msg)
        if names is not None and len(dtype) != len(names):
            msg = '''All column dtypes must be specified.'''
            raise TypeError(msg)

    csv_reader = ffi.new('csv_read_arg*')

    # Populate csv_reader struct
    if is_file_like(filepath_or_buffer):
        buffer = filepath_or_buffer.read()
        # check if StringIO is used
        if hasattr(buffer, 'encode'):
            buffer_as_bytes = buffer.encode()
        else:
            buffer_as_bytes = buffer
        buffer_data_holder = ffi.new("char[]", buffer_as_bytes)

        csv_reader.input_data_form = libgdf.HOST_BUFFER
        csv_reader.filepath_or_buffer = buffer_data_holder
        csv_reader.buffer_size = len(buffer_as_bytes)
    else:
        if not os.path.isfile(filepath_or_buffer):
            raise FileNotFoundError(filepath_or_buffer)

        file_path = _wrap_string(filepath_or_buffer)

        csv_reader.input_data_form = libgdf.FILE_PATH
        csv_reader.filepath_or_buffer = file_path

    if header == 'infer':
        header = -1
    header_infer = header
    arr_names = []
    arr_dtypes = []
    if names is None:
        if header == -1:
            header_infer = 0
        if header is None:
            header_infer = -1
        csv_reader.names = ffi.NULL
        csv_reader.num_cols = 0
    else:
        if header is None:
            header_infer = -1
        csv_reader.num_cols = len(names)
        for col_name in names:
            arr_names.append(_wrap_string(col_name))
            if dtype is not None:
                if dtype_dict:
                    arr_dtypes.append(_wrap_string(str(dtype[col_name])))
        names_ptr = ffi.new('char*[]', arr_names)
        csv_reader.names = names_ptr

    if dtype is None:
        csv_reader.dtype = ffi.NULL
    else:
        if not dtype_dict:
            for col_dtype in dtype:
                arr_dtypes.append(_wrap_string(str(col_dtype)))
        dtype_ptr = ffi.new('char*[]', arr_dtypes)
        csv_reader.dtype = dtype_ptr

    if decimal == delimiter:
        raise ValueError("decimal cannot be the same as delimiter")

    if thousands == delimiter:
        raise ValueError("thousands cannot be the same as delimiter")

    if nrows is not None and skipfooter != 0:
        raise ValueError("cannot use both nrows and skipfooter parameters")

    if byte_range is not None:
        if skipfooter != 0 or skiprows != 0 or nrows is not None:
            raise ValueError("""cannot manually limit rows to be read when
                                using the byte range parameter""")

    # Start with default values recognized as boolean
    arr_true_values = [_wrap_string(str('True')), _wrap_string(str('TRUE'))]
    arr_false_values = [_wrap_string(str('False')),
                        _wrap_string(str('FALSE'))]

    for value in true_values or []:
        arr_true_values.append(_wrap_string(str(value)))
    arr_true_values_ptr = ffi.new('char*[]', arr_true_values)
    csv_reader.true_values = arr_true_values_ptr
    csv_reader.num_true_values = len(arr_true_values)

    for value in false_values or []:
        arr_false_values.append(_wrap_string(str(value)))
    false_values_ptr = ffi.new('char*[]', arr_false_values)
    csv_reader.false_values = false_values_ptr
    csv_reader.num_false_values = len(arr_false_values)

    arr_na_values = []
    for value in na_values or []:
        arr_na_values.append(_wrap_string(str(value)))
    arr_na_values_ptr = ffi.new('char*[]', arr_na_values)
    csv_reader.na_values = arr_na_values_ptr
    csv_reader.num_na_values = len(arr_na_values)

    compression_bytes = _wrap_string(compression)
    prefix_bytes = _wrap_string(prefix)

    csv_reader.delimiter = delimiter.encode()
    csv_reader.lineterminator = lineterminator.encode()
    csv_reader.quotechar = quotechar.encode()
    csv_reader.quoting = _quoting_enum[quoting]
    csv_reader.doublequote = doublequote
    csv_reader.delim_whitespace = delim_whitespace
    csv_reader.skipinitialspace = skipinitialspace
    csv_reader.dayfirst = dayfirst
    csv_reader.header = header_infer
    csv_reader.skiprows = skiprows
    csv_reader.skipfooter = skipfooter
    csv_reader.compression = compression_bytes
    csv_reader.decimal = decimal.encode()
    csv_reader.thousands = thousands.encode() if thousands else b'\0'
    csv_reader.nrows = nrows if nrows is not None else -1
    if byte_range is not None:
        csv_reader.byte_range_offset = byte_range[0]
        csv_reader.byte_range_size = byte_range[1]
    else:
        csv_reader.byte_range_offset = 0
        csv_reader.byte_range_size = 0
    csv_reader.skip_blank_lines = skip_blank_lines
    csv_reader.comment = comment.encode() if comment else b'\0'
    csv_reader.keep_default_na = keep_default_na
    csv_reader.na_filter = na_filter
    csv_reader.prefix = prefix_bytes

    # Call read_csv
    libgdf.read_csv(csv_reader)

    out = csv_reader.data
    if out == ffi.NULL:
        raise ValueError("Failed to parse CSV")

    # Extract parsed columns
    outcols = []
    for i in range(csv_reader.num_cols_out):
        if out[i].dtype == libgdf.GDF_STRING:
            ptr = int(ffi.cast("uintptr_t", out[i].data))
            outcols.append(nvstrings.bind_cpointer(ptr))
        else:
            newcol = Column.from_cffi_view(out[i])
            if newcol.dtype == np.dtype('datetime64[ms]'):
                col = newcol.view(DatetimeColumn, dtype='datetime64[ms]')
            else:
                col = newcol.view(NumericalColumn, dtype=newcol.dtype)
            outcols.append(Series(col))

    return outcols
def read_csv(filepath_or_buffer, lineterminator='\n',
             quotechar='"', quoting=True, doublequote=True,
             header='infer',
             mangle_dupe_cols=True, usecols=None,
             sep=',', delimiter=None, delim_whitespace=False,
             skipinitialspace=False, names=None, dtype=None,
             skipfooter=0, skiprows=0, dayfirst=False, compression='infer',
             thousands=None, decimal='.', true_values=None, false_values=None,
             nrows=None):
    """Load and parse a CSV file into a DataFrame

    Parameters
    ----------
    filepath_or_buffer : str
        Path of file to be read or a file-like object containing the file.
    sep : char, default ','
        Delimiter to be used.
    delimiter : char, default None
        Alternative argument name for sep.
    delim_whitespace : bool, default False
        Determines whether to use whitespace as delimiter.
    lineterminator : char, default '\\n'
        Character to indicate end of line.
    skipinitialspace : bool, default False
        Skip spaces after delimiter.
    names : list of str, default None
        List of column names to be used.
    dtype : list of str or dict of {col: dtype}, default None
        List of data types in the same order of the column names
        or a dictionary with column_name:dtype (pandas style).
    quotechar : char, default '"'
        Character to indicate start and end of quote item.
    quoting : bool, default True
        If True, start and end quotechar are removed from returned strings.
        If False, start and end quotechar are kept in returned strings.
    doublequote : bool, default True
        When quotechar is specified and quoting is True, indicates whether to
        interpret two consecutive quotechar inside fields as single
        quotechar.
    header : int, default 'infer'
        Row number to use as the column names. Default behavior is to infer
        the column names: if no names are passed, header=0;
        if column names are passed explicitly, header=None.
    usecols : list of int or str, default None
        Returns subset of the columns given in the list. All elements must be
        either integer indices (column number) or strings that correspond to
        column names.
    mangle_dupe_cols : boolean, default True
        Duplicate columns will be specified as 'X','X.1',...'X.N'.
    skiprows : int, default 0
        Number of rows to be skipped from the start of file.
    skipfooter : int, default 0
        Number of rows to be skipped at the bottom of file.
    compression : {'infer', 'gzip', 'zip', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then detect
        compression from the following extensions: '.gz', '.zip' (otherwise
        no decompression). If using 'zip', the ZIP file must contain only one
        data file to be read in, otherwise the first non-zero-sized file will
        be used. Set to None for no decompression.
    decimal : char, default '.'
        Character used as a decimal point.
    thousands : char, default None
        Character used as a thousands delimiter.
    true_values : list, default None
        Values to consider as boolean True.
    false_values : list, default None
        Values to consider as boolean False.
    nrows : int, default None
        If specified, maximum number of rows to read.

    Returns
    -------
    GPU ``DataFrame`` object.

    Examples
    --------
    .. code-block:: python

        import cudf

        # Create a test csv file
        filename = 'foo.csv'
        lines = [
            "num1,datetime,text",
            "123,2018-11-13T12:00:00,abc",
            "456,2018-11-14T12:35:01,def",
            "789,2018-11-15T18:02:59,ghi"
        ]
        with open(filename, 'w') as fp:
            fp.write('\\n'.join(lines)+'\\n')

        # Read the file with cudf
        names = ['num1', 'datetime', 'text']
        # Note 'int' for 3rd column- text will be hashed
        dtypes = ['int', 'date', 'int']
        df = cudf.read_csv(filename, delimiter=',',
                           names=names, dtype=dtypes,
                           skiprows=1)

        # Display results
        print(df)

    Output:

    .. code-block:: python

          num1                datetime text
        0  123 2018-11-13T12:00:00.000 5451
        1  456 2018-11-14T12:35:01.000 5784
        2  789 2018-11-15T18:02:59.000 6117

    See Also
    --------
    .read_csv_strings
    """
    # Alias sep -> delimiter.
    if delimiter is None:
        delimiter = sep

    if dtype is not None:
        if isinstance(dtype, collections.abc.Mapping):
            dtype_dict = True
        elif isinstance(dtype, collections.abc.Iterable):
            dtype_dict = False
        else:
            msg = '''dtype must be 'list like' or 'dict' '''
            raise TypeError(msg)
        if names is not None and len(dtype) != len(names):
            msg = '''All column dtypes must be specified.'''
            raise TypeError(msg)

    nvtx_range_push("PYGDF_READ_CSV", "purple")

    csv_reader = ffi.new('csv_read_arg*')

    # Populate csv_reader struct
    if is_file_like(filepath_or_buffer):
        if compression == 'infer':
            compression = None
        buffer = filepath_or_buffer.read()
        # check if StringIO is used
        if hasattr(buffer, 'encode'):
            buffer_as_bytes = buffer.encode()
        else:
            buffer_as_bytes = buffer
        buffer_data_holder = ffi.new("char[]", buffer_as_bytes)

        csv_reader.input_data_form = libgdf.HOST_BUFFER
        csv_reader.filepath_or_buffer = buffer_data_holder
        csv_reader.buffer_size = len(buffer_as_bytes)
    else:
        file_path = _wrap_string(filepath_or_buffer)

        csv_reader.input_data_form = libgdf.FILE_PATH
        csv_reader.filepath_or_buffer = file_path

    if header == 'infer':
        header = -1
    header_infer = header
    arr_names = []
    arr_dtypes = []
    if names is None:
        if header == -1:
            header_infer = 0
        if header is None:
            header_infer = -1
        csv_reader.names = ffi.NULL
        csv_reader.num_cols = 0
    else:
        if header is None:
            header_infer = -1
        csv_reader.num_cols = len(names)
        for col_name in names:
            arr_names.append(_wrap_string(col_name))
            if dtype is not None:
                if dtype_dict:
                    arr_dtypes.append(_wrap_string(str(dtype[col_name])))
        names_ptr = ffi.new('char*[]', arr_names)
        csv_reader.names = names_ptr

    if dtype is None:
        csv_reader.dtype = ffi.NULL
    else:
        if not dtype_dict:
            for col_dtype in dtype:
                arr_dtypes.append(_wrap_string(str(col_dtype)))
        dtype_ptr = ffi.new('char*[]', arr_dtypes)
        csv_reader.dtype = dtype_ptr

    csv_reader.use_cols_int = ffi.NULL
    csv_reader.use_cols_int_len = 0
    csv_reader.use_cols_char = ffi.NULL
    csv_reader.use_cols_char_len = 0

    if usecols is not None:
        arr_col_names = []
        if all(isinstance(x, int) for x in usecols):
            usecols_ptr = ffi.new('int[]', usecols)
            csv_reader.use_cols_int = usecols_ptr
            csv_reader.use_cols_int_len = len(usecols)
        else:
            for col_name in usecols:
                arr_col_names.append(_wrap_string(col_name))
            col_names_ptr = ffi.new('char*[]', arr_col_names)
            csv_reader.use_cols_char = col_names_ptr
            csv_reader.use_cols_char_len = len(usecols)

    if decimal == delimiter:
        raise ValueError("decimal cannot be the same as delimiter")

    if thousands == delimiter:
        raise ValueError("thousands cannot be the same as delimiter")

    if nrows is not None and skipfooter != 0:
        raise ValueError("cannot use both nrows and skipfooter parameters")

    # Start with default values recognized as boolean
    arr_true_values = [_wrap_string(str('True')), _wrap_string(str('TRUE'))]
    arr_false_values = [_wrap_string(str('False')),
                        _wrap_string(str('FALSE'))]

    for value in true_values or []:
        arr_true_values.append(_wrap_string(str(value)))
    arr_true_values_ptr = ffi.new('char*[]', arr_true_values)
    csv_reader.true_values = arr_true_values_ptr
    csv_reader.num_true_values = len(arr_true_values)

    for value in false_values or []:
        arr_false_values.append(_wrap_string(str(value)))
    false_values_ptr = ffi.new('char*[]', arr_false_values)
    csv_reader.false_values = false_values_ptr
    csv_reader.num_false_values = len(arr_false_values)

    compression_bytes = _wrap_string(compression)

    csv_reader.delimiter = delimiter.encode()
    csv_reader.lineterminator = lineterminator.encode()
    csv_reader.quotechar = quotechar.encode()
    csv_reader.quoting = quoting
    csv_reader.doublequote = doublequote
    csv_reader.delim_whitespace = delim_whitespace
    csv_reader.skipinitialspace = skipinitialspace
    csv_reader.dayfirst = dayfirst
    csv_reader.header = header_infer
    csv_reader.skiprows = skiprows
    csv_reader.skipfooter = skipfooter
    csv_reader.mangle_dupe_cols = mangle_dupe_cols
    csv_reader.windowslinetermination = False
    csv_reader.compression = compression_bytes
    csv_reader.decimal = decimal.encode()
    csv_reader.thousands = thousands.encode() if thousands else b'\0'
    csv_reader.nrows = nrows if nrows is not None else -1

    # Call read_csv
    libgdf.read_csv(csv_reader)

    out = csv_reader.data
    if out == ffi.NULL:
        raise ValueError("Failed to parse CSV")

    # Extract parsed columns
    outcols = []
    new_names = []
    for i in range(csv_reader.num_cols_out):
        newcol = Column.from_cffi_view(out[i])
        new_names.append(ffi.string(out[i].col_name).decode())
        if newcol.dtype == np.dtype('datetime64[ms]'):
            outcols.append(newcol.view(DatetimeColumn,
                                       dtype='datetime64[ms]'))
        else:
            outcols.append(newcol.view(NumericalColumn, dtype=newcol.dtype))

    # Build dataframe
    df = DataFrame()
    for k, v in zip(new_names, outcols):
        df[k] = v

    nvtx_range_pop()

    return df
def _wrap_string(text):
    if text is None:
        return ffi.NULL
    else:
        return ffi.new("char[]", text.encode())
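# Hedged note on ownership: ffi.new() allocations are freed once the last
# Python reference goes away, so the bytes returned by _wrap_string must stay
# referenced (e.g. in a list such as arr_names above) for as long as the C
# side may read the pointer.
def _example_wrap_string():
    kept = [_wrap_string(s) for s in ('a', 'b')]  # keepalive list
    ptrs = ffi.new('char*[]', kept)               # C array of char*
    return kept, ptrs  # both must outlive any libgdf call that uses `ptrs`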
def agg(self, args):
    """Invoke aggregation functions on the groups.

    Parameters
    ----------
    args : dict, list, str, callable
        - str
            The aggregate function name.
        - list
            List of *str* of the aggregate function.
        - dict
            key-value pairs of source column name and list of
            aggregate functions as *str*.

    Returns
    -------
    result : DataFrame

    Notes
    -----
    Since multi-indexes aren't supported, aggregation results are returned
    in columns using the naming scheme of `aggregation_columnname`.
    """
    result = DataFrame()
    add_col_values = True

    ctx = ffi.new('gdf_context*')
    ctx.flag_sorted = 0
    ctx.flag_method = self._method
    ctx.flag_distinct = 0

    sort_result = True

    if not isinstance(args, str) and isinstance(
            args, collections.abc.Sequence):
        if len(args) == 1 and len(self._val_columns) == 1:
            sort_result = False
        for agg_type in args:
            val_columns_out = [agg_type + '_' + val
                               for val in self._val_columns]
            result = self._apply_agg(agg_type, result, add_col_values,
                                     ctx, self._val_columns,
                                     val_columns_out,
                                     sort_result=sort_result)
            add_col_values = False  # we only want to add them once
    elif isinstance(args, collections.abc.Mapping):
        if len(args.keys()) == 1:
            if len(list(args.values())[0]) == 1:
                sort_result = False
        for val, agg_type in args.items():
            if not isinstance(agg_type, str) and \
                    isinstance(agg_type, collections.abc.Sequence):
                for sub_agg_type in agg_type:
                    val_columns_out = [sub_agg_type + '_' + val]
                    result = self._apply_agg(sub_agg_type, result,
                                             add_col_values, ctx, [val],
                                             val_columns_out,
                                             sort_result=sort_result)
            elif isinstance(agg_type, str):
                val_columns_out = [agg_type + '_' + val]
                result = self._apply_agg(agg_type, result,
                                         add_col_values, ctx, [val],
                                         val_columns_out,
                                         sort_result=sort_result)
            add_col_values = False  # we only want to add them once
    else:
        result = self.agg([args])

    nvtx_range_pop()
    return result
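# Hedged usage sketch of the three accepted `args` shapes; the column names
# 'key' and 'val' are illustrative only.
def _example_groupby_agg(df):
    gb = df.groupby(['key'])
    by_name = gb.agg('max')                       # single function name
    by_list = gb.agg(['min', 'max'])              # list of function names
    by_dict = gb.agg({'val': ['mean', 'count']})  # per-column functions
    # Results use the `aggregation_columnname` scheme, e.g. 'max_val'.
    return by_name, by_list, by_dict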
def read_csv_strings(filepath_or_buffer, lineterminator='\n',
                     quotechar='"', quoting=True, doublequote=True,
                     sep=',', delimiter=None, delim_whitespace=False,
                     skipinitialspace=False, names=None, dtype=None,
                     skipfooter=0, skiprows=0, dayfirst=False,
                     compression='infer', thousands=None, decimal='.',
                     true_values=None, false_values=None, nrows=None):
    """
    **Experimental**: This function exists only as a beta way to use
    `nvstrings <https://nvstrings.readthedocs.io/en/latest/>`_ with cudf.
    Future versions of cuDF will provide cleaner integration.

    Uses mostly the same arguments as read_csv.
    Note: Doesn't currently support auto-column detection, header, usecols
    and mangle_dupe_cols args.

    Returns
    -------
    columns : ordered list of cudf.dataframe.Series and nvstrings objects
        Numeric or date dtyped columns will be Series.
        'str' dtyped columns will be
        `nvstrings <https://nvstrings.readthedocs.io/en/latest/>`_.

    Examples
    --------
    .. code-block:: python

        import cudf

        # Create a test csv file
        filename = 'foo.csv'
        lines = [
            "num1,datetime,text",
            "123,2018-11-13T12:00:00,abc",
            "456,2018-11-14T12:35:01,def",
            "789,2018-11-15T18:02:59,ghi"
        ]
        with open(filename, 'w') as fp:
            fp.write('\\n'.join(lines)+'\\n')

        # Read the file with cudf
        names = ['num1', 'datetime', 'text']
        dtypes = ['int', 'date', 'str']
        columns = cudf.io.csv.read_csv_strings(filename, delimiter=',',
                                               names=names, dtype=dtypes,
                                               skiprows=1)

        # Display results
        print(columns[0])
        print(columns[2])

    Output:

    .. code-block:: python

        <cudf.Series nrows=3 >
        0 123
        1 456
        2 789

        <nvstrings count=3>
        ['abc', 'def', 'ghi']

    See Also
    --------
    .read_csv
    """
    import nvstrings
    from cudf.dataframe.series import Series

    if names is None or dtype is None:
        msg = '''Automatic dtype detection not implemented:
        Column names and dtypes must be specified.'''
        raise TypeError(msg)

    # Alias sep -> delimiter.
    if delimiter is None:
        delimiter = sep

    if isinstance(dtype, dict):
        dtype_dict = True
    elif isinstance(dtype, list):
        dtype_dict = False
        if len(dtype) != len(names):
            msg = '''All column dtypes must be specified.'''
            raise TypeError(msg)
    else:
        msg = '''dtype must be 'list' or 'dict' '''
        raise TypeError(msg)

    csv_reader = ffi.new('csv_read_arg*')

    # Populate csv_reader struct
    if is_file_like(filepath_or_buffer):
        buffer = filepath_or_buffer.read()
        # check if StringIO is used
        if hasattr(buffer, 'encode'):
            buffer_as_bytes = buffer.encode()
        else:
            buffer_as_bytes = buffer
        buffer_data_holder = ffi.new("char[]", buffer_as_bytes)

        csv_reader.input_data_form = libgdf.HOST_BUFFER
        csv_reader.filepath_or_buffer = buffer_data_holder
        csv_reader.buffer_size = len(buffer_as_bytes)
    else:
        file_path = _wrap_string(filepath_or_buffer)

        csv_reader.input_data_form = libgdf.FILE_PATH
        csv_reader.filepath_or_buffer = file_path

    arr_names = []
    arr_dtypes = []
    for col_name in names:
        arr_names.append(_wrap_string(col_name))
        if dtype_dict:
            arr_dtypes.append(_wrap_string(str(dtype[col_name])))
    names_ptr = ffi.new('char*[]', arr_names)
    csv_reader.names = names_ptr

    if not dtype_dict:
        for col_dtype in dtype:
            arr_dtypes.append(_wrap_string(str(col_dtype)))
    dtype_ptr = ffi.new('char*[]', arr_dtypes)
    csv_reader.dtype = dtype_ptr

    if decimal == delimiter:
        raise ValueError("decimal cannot be the same as delimiter")

    if thousands == delimiter:
        raise ValueError("thousands cannot be the same as delimiter")

    if nrows is not None and skipfooter != 0:
        raise ValueError("cannot use both nrows and skipfooter parameters")

    # Start with default values recognized as boolean
    arr_true_values = [_wrap_string(str('True')), _wrap_string(str('TRUE'))]
    arr_false_values = [_wrap_string(str('False')),
                        _wrap_string(str('FALSE'))]

    for value in true_values or []:
        arr_true_values.append(_wrap_string(str(value)))
    arr_true_values_ptr = ffi.new('char*[]', arr_true_values)
    csv_reader.true_values = arr_true_values_ptr
    csv_reader.num_true_values = len(arr_true_values)

    for value in false_values or []:
        arr_false_values.append(_wrap_string(str(value)))
    false_values_ptr = ffi.new('char*[]', arr_false_values)
    csv_reader.false_values = false_values_ptr
    csv_reader.num_false_values = len(arr_false_values)

    compression_bytes = _wrap_string(compression)

    csv_reader.delimiter = delimiter.encode()
    csv_reader.lineterminator = lineterminator.encode()
    csv_reader.quotechar = quotechar.encode()
    csv_reader.quoting = quoting
    csv_reader.doublequote = doublequote
    csv_reader.delim_whitespace = delim_whitespace
    csv_reader.skipinitialspace = skipinitialspace
    csv_reader.dayfirst = dayfirst
    csv_reader.num_cols = len(names)
    csv_reader.skiprows = skiprows
    csv_reader.skipfooter = skipfooter
    csv_reader.compression = compression_bytes
    csv_reader.decimal = decimal.encode()
    csv_reader.thousands = thousands.encode() if thousands else b'\0'
    csv_reader.nrows = nrows if nrows is not None else -1

    # Call read_csv
    libgdf.read_csv(csv_reader)

    out = csv_reader.data
    if out == ffi.NULL:
        raise ValueError("Failed to parse CSV")

    # Extract parsed columns
    outcols = []
    for i in range(csv_reader.num_cols_out):
        if out[i].dtype == libgdf.GDF_STRING:
            ptr = int(ffi.cast("uintptr_t", out[i].data))
            outcols.append(nvstrings.bind_cpointer(ptr))
        else:
            newcol = Column.from_cffi_view(out[i])
            if newcol.dtype == np.dtype('datetime64[ms]'):
                col = newcol.view(DatetimeColumn, dtype='datetime64[ms]')
            else:
                col = newcol.view(NumericalColumn, dtype=newcol.dtype)
            outcols.append(Series(col))

    return outcols
def read_csv_strings(filepath, lineterminator='\n',
                     quotechar='"', quoting=True, doublequote=True,
                     delimiter=',', sep=None, delim_whitespace=False,
                     skipinitialspace=False, names=None, dtype=None,
                     skipfooter=0, skiprows=0, dayfirst=False):
    """
    **Experimental**: This function is provided only as an alpha way of
    using nvstrings alongside cudf.
    Future versions of cuDF will provide cleaner integration.

    Uses the same arguments as read_csv.

    Returns a list of Series objects for numeric or date columns and
    nvstrings objects for those columns that are strings (dtype='str').

    Examples
    --------
    foo.txt : ::

        50,abc|40,def|30,ghi|20,jkl|

    .. code-block:: python

        import cudf
        fn = 'foo.txt'
        cols = cudf.io.read_csv_strings(fn, delimiter=',',
                                        lineterminator='|',
                                        names=['col1', 'col2'],
                                        dtype=['int64', 'str'],
                                        skiprows=1, skipfooter=1)
        type(cols[0])
        print(cols[0])
        type(cols[1])
        print(cols[1])

    Output:

    .. code-block:: python

        <class 'cudf.series.Series'>
        0 40
        1 30

        <class 'nvstrings.nvstrings'>
        ['def', 'ghi']
    """
    import nvstrings
    from cudf.dataframe.series import Series

    if names is None or dtype is None:
        msg = '''Automatic dtype detection not implemented:
        Column names and dtypes must be specified.'''
        raise TypeError(msg)

    if isinstance(dtype, dict):
        dtype_dict = True
    elif isinstance(dtype, list):
        dtype_dict = False
        if len(dtype) != len(names):
            msg = '''All column dtypes must be specified.'''
            raise TypeError(msg)
    else:
        msg = '''dtype must be 'list' or 'dict' '''
        raise TypeError(msg)

    csv_reader = ffi.new('csv_read_arg*')

    # Populate csv_reader struct
    file_path = _wrap_string(filepath)
    csv_reader.file_path = file_path

    arr_names = []
    arr_dtypes = []
    for col_name in names:
        arr_names.append(_wrap_string(col_name))
        if dtype_dict:
            arr_dtypes.append(_wrap_string(str(dtype[col_name])))
    names_ptr = ffi.new('char*[]', arr_names)
    csv_reader.names = names_ptr

    if not dtype_dict:
        for col_dtype in dtype:
            arr_dtypes.append(_wrap_string(str(col_dtype)))
    dtype_ptr = ffi.new('char*[]', arr_dtypes)
    csv_reader.dtype = dtype_ptr

    csv_reader.delimiter = delimiter.encode()
    csv_reader.lineterminator = lineterminator.encode()
    csv_reader.quotechar = quotechar.encode()
    csv_reader.quoting = quoting
    csv_reader.doublequote = doublequote
    csv_reader.delim_whitespace = delim_whitespace
    csv_reader.skipinitialspace = skipinitialspace
    csv_reader.dayfirst = dayfirst
    csv_reader.num_cols = len(names)
    csv_reader.skiprows = skiprows
    csv_reader.skipfooter = skipfooter

    # Call read_csv
    libgdf.read_csv(csv_reader)

    out = csv_reader.data
    if out == ffi.NULL:
        raise ValueError("Failed to parse CSV")

    # Extract parsed columns
    outcols = []
    for i in range(csv_reader.num_cols_out):
        if out[i].dtype == libgdf.GDF_STRING:
            ptr = int(ffi.cast("uintptr_t", out[i].data))
            outcols.append(nvstrings.bind_cpointer(ptr))
        else:
            newcol = Column.from_cffi_view(out[i])
            if newcol.dtype == np.dtype('datetime64[ms]'):
                col = newcol.view(DatetimeColumn, dtype='datetime64[ms]')
            else:
                col = newcol.view(NumericalColumn, dtype=newcol.dtype)
            outcols.append(Series(col))

    return outcols