def typeof_pd_dataframe(val, c):
    """Map a concrete pandas DataFrame object to its numba DataFrameType.

    Column names are taken from the frame, per-column array types are resolved
    via get_hiframes_dtypes, and the index type is inferred separately.
    """
    # TODO: support other types like string and timestamp
    names = tuple(val.columns.tolist())
    dtypes = get_hiframes_dtypes(val)
    idx_type = _infer_index_type(val.index)
    return DataFrameType(dtypes, idx_type, names, True)
def csv_reader_infer_nb_pandas_type(
        filepath_or_buffer,
        delimiter=',',
        names=None,
        usecols=None,
        dtype=None,
        skiprows=None,
        parse_dates=False
):
    """Infer the numba DataFrame type of a CSV source by typing a sample block.

    Reads the first rows with pandas and asks numba to type the resulting
    frame; when whole-frame typing fails, columns are typed individually with
    a string-array fallback for columns numba cannot type.
    """
    # infer column types from the first block (similarly as Arrow does this)
    # TODO: tune the block size or allow user to configure it via env var
    sample_rows = 1000
    sample_df = pd.read_csv(filepath_or_buffer,
                            delimiter=delimiter,
                            names=names,
                            usecols=usecols,
                            dtype=dtype,
                            skiprows=skiprows,
                            nrows=sample_rows,
                            parse_dates=parse_dates)

    try:
        return numba.typeof(sample_df)
    except ValueError:
        pass

    # whole-frame typeof failed: fall back to per-column inference, using
    # string_array_type for any column numba itself cannot type
    inferred_types = []
    for name in sample_df.columns:
        try:
            inferred_types.append(numba.typeof(sample_df[name]).data)
        except ValueError:
            inferred_types.append(string_array_type)

    typs_tuple = tuple(inferred_types)
    names_tuple = tuple(sample_df.columns)
    column_loc, _, _ = get_structure_maps(typs_tuple, names_tuple)
    return DataFrameType(typs_tuple, PositionalIndexType(), names_tuple,
                         column_loc=column_loc)
def init_dataframe(typingctx, *args):
    """Create a DataFrame with provided data, index and columns values.

    Used as a single constructor for DataFrame and assigning its data, so that
    optimization passes can look for init_dataframe() to see if underlying
    data has changed, and get the array variables from init_dataframe() args
    if not changed.

    args layout: n data array types, then the index type, then n literal
    column-name types; returns (signature, codegen) as a numba intrinsic does.
    """
    n_cols = len(args) // 2
    data_typs = tuple(args[:n_cols])
    index_typ = args[n_cols]
    column_names = tuple(a.literal_value for a in args[n_cols + 1:])

    def codegen(context, builder, signature, args):
        # Runtime argument is a single tuple: unpack data arrays and the index.
        in_tup = args[0]
        data_arrs = [builder.extract_value(in_tup, i) for i in range(n_cols)]
        index = builder.extract_value(in_tup, n_cols)
        # Materialize column names as runtime unicode string constants.
        column_strs = [
            numba.unicode.make_string_from_constant(context, builder, string_type, c)
            for c in column_names
        ]

        # create dataframe struct and store values
        dataframe = cgutils.create_struct_proxy(signature.return_type)(context, builder)
        data_tup = context.make_tuple(builder, types.Tuple(data_typs), data_arrs)
        column_tup = context.make_tuple(builder, types.UniTuple(string_type, n_cols), column_strs)

        dataframe.data = data_tup
        dataframe.index = index
        dataframe.columns = column_tup
        # no parent pyobject — this frame owns its data
        dataframe.parent = context.get_constant_null(types.pyobject)

        # increase refcount of stored values
        if context.enable_nrt:
            context.nrt.incref(builder, index_typ, index)
            for var, typ in zip(data_arrs, data_typs):
                context.nrt.incref(builder, typ, var)
            for var in column_strs:
                context.nrt.incref(builder, string_type, var)

        return dataframe._getvalue()

    ret_typ = DataFrameType(data_typs, index_typ, column_names)
    sig = signature(ret_typ, types.Tuple(args))
    return sig, codegen
def _gen_pandas_read_csv_func_text(col_names, col_typs, py_col_dtypes, usecols, signature=None):
    """Generate source text for a csv_reader_py function.

    The generated function calls pandas_read_csv inside numba objmode with a
    statically-known DataFrame result type built from col_names/col_typs.

    Returns a tuple (func_text, func_name, global_vars) where global_vars are
    the names the generated code must see when exec'd.
    """
    func_name = 'csv_reader_py'

    # usecols given as names restricts/reorders the returned columns
    return_columns = usecols if usecols and isinstance(usecols[0], str) else col_names
    column_loc, _, _ = get_structure_maps(col_typs, return_columns)
    df_type = DataFrameType(
        tuple(col_typs),
        types.none,
        tuple(col_names),
        column_loc=column_loc
    )

    df_type_repr = repr(df_type)
    # for some reason pandas and pyarrow read_csv() return CategoricalDtype with
    # ordered=False in case when dtype is with ordered=None
    df_type_repr = df_type_repr.replace('ordered=None', 'ordered=False')

    # TODO: support non-numpy types like strings
    date_inds = ", ".join(
        str(i) for i, t in enumerate(col_typs) if t.dtype == types.NPDatetime('ns'))

    if signature is None:
        signature = "filepath_or_buffer"

    # map generated func params into values used in inner call of pandas_read_csv
    # if no transformation is needed just use outer param name (since APIs match)
    # otherwise use value in the dictionary
    inner_call_params = {'parse_dates': f"[{date_inds}]"}
    used_read_csv_params = (
        'filepath_or_buffer',
        'names',
        'skiprows',
        'parse_dates',
        'dtype',
        'usecols',
        'sep',
        'delimiter'
    )

    # pyarrow reads unnamed header as " ", pandas reads it as "Unnamed: N"
    # during inference from file names should be replaced with "Unnamed: N"
    # passing names to pyarrow means that one row is header and should be skipped
    if col_names and any(map(lambda x: x.startswith('Unnamed: '), col_names)):
        inner_call_params['names'] = str(col_names)
        inner_call_params['skiprows'] = "(skiprows and skiprows + 1) or 1"

    # dtype parameter of compiled function is not used at all, instead a python dict
    # of columns dtypes is captured at compile time, because some dtypes (like datetime)
    # are converted and also to avoid penalty of creating dict in objmode
    inner_call_params['dtype'] = 'read_as_dtypes'

    params_str = '\n'.join([
        f"      {param}={inner_call_params.get(param, param)},"
        for param in used_read_csv_params
    ])
    func_text = '\n'.join([
        f"def {func_name}({signature}):",
        f"  with objmode(df=\"{df_type_repr}\"):",
        f"    df = pandas_read_csv(\n{params_str}",
        f"    )",
        f"  return df"
    ])

    global_vars = {
        'read_as_dtypes': py_col_dtypes,
        'objmode': objmode,
        'pandas_read_csv': pandas_read_csv,
    }

    return func_text, func_name, global_vars