Example No. 1
def typeof_pd_dataframe(val, c):
    # Collect column names and per-column array types, plus the index type,
    # to build the Numba DataFrameType for this pandas DataFrame
    col_names = tuple(val.columns.tolist())
    # TODO: support other types like string and timestamp
    col_types = get_hiframes_dtypes(val)
    index_type = _infer_index_type(val.index)

    return DataFrameType(col_types, index_type, col_names, True)
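For context, the (val, c) signature matches Numba's typeof hook protocol; a minimal sketch of how such a function is typically registered, assuming the function above is in scope:

# Sketch: register the function above as Numba's typeof implementation
# for pandas.DataFrame, so numba.typeof(df) yields a DataFrameType.
import pandas as pd
from numba.extending import typeof_impl

typeof_impl.register(pd.DataFrame)(typeof_pd_dataframe)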
Example No. 2
def csv_reader_infer_nb_pandas_type(
    filepath_or_buffer, delimiter=',', names=None, usecols=None, dtype=None, skiprows=None, parse_dates=False
):

    # infer column types from the first block (similar to what Arrow does)
    # TODO: tune the block size or allow users to configure it via an env var
    rows_to_read = 1000
    df = pd.read_csv(filepath_or_buffer, delimiter=delimiter, names=names,
                     usecols=usecols, dtype=dtype, skiprows=skiprows, nrows=rows_to_read,
                     parse_dates=parse_dates)

    try:
        df_type = numba.typeof(df)
    except ValueError:
        # typing the whole frame failed (e.g. on object-dtype columns);
        # fall back to typing each column separately, assuming columns
        # that cannot be typed hold strings
        nb_col_types = []
        for col_name in df.columns:
            try:
                series_type = numba.typeof(df[col_name])
                col_type = series_type.data
            except ValueError:
                col_type = string_array_type
            nb_col_types.append(col_type)

        nb_col_types = tuple(nb_col_types)
        nb_col_names = tuple(df.columns)
        column_loc, _, _ = get_structure_maps(nb_col_types, nb_col_names)
        df_type = DataFrameType(nb_col_types, PositionalIndexType(), nb_col_names, column_loc=column_loc)

    return df_type
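A hypothetical usage sketch (the file path and options below are illustrative, not from the source):

# Hypothetical call: infer the Numba DataFrame type of a CSV up front,
# based on the first 1000 rows, before compiling a typed reader for it.
inferred = csv_reader_infer_nb_pandas_type('data.csv', delimiter=',',
                                           parse_dates=[0])
print(inferred)  # a DataFrameType describing column types, index and names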
Example No. 3
def init_dataframe(typingctx, *args):
    """Create a DataFrame with provided data, index and columns values.
    Used as a single constructor for DataFrame and assigning its data, so that
    optimization passes can look for init_dataframe() to see if underlying
    data has changed, and get the array variables from init_dataframe() args if
    not changed.
    """

    # args layout: n data arrays, then the index, then n literal column
    # names, so len(args) == 2 * n_cols + 1
    n_cols = len(args) // 2
    data_typs = tuple(args[:n_cols])
    index_typ = args[n_cols]
    column_names = tuple(a.literal_value for a in args[n_cols + 1:])

    def codegen(context, builder, signature, args):
        in_tup = args[0]
        data_arrs = [builder.extract_value(in_tup, i) for i in range(n_cols)]
        index = builder.extract_value(in_tup, n_cols)
        column_strs = [
            numba.unicode.make_string_from_constant(context, builder,
                                                    string_type, c)
            for c in column_names
        ]
        # create dataframe struct and store values
        dataframe = cgutils.create_struct_proxy(signature.return_type)(context,
                                                                       builder)

        data_tup = context.make_tuple(builder, types.Tuple(data_typs),
                                      data_arrs)
        column_tup = context.make_tuple(builder,
                                        types.UniTuple(string_type, n_cols),
                                        column_strs)

        dataframe.data = data_tup
        dataframe.index = index
        dataframe.columns = column_tup
        dataframe.parent = context.get_constant_null(types.pyobject)

        # increase refcount of stored values
        if context.enable_nrt:
            context.nrt.incref(builder, index_typ, index)
            for var, typ in zip(data_arrs, data_typs):
                context.nrt.incref(builder, typ, var)
            for var in column_strs:
                context.nrt.incref(builder, string_type, var)

        return dataframe._getvalue()

    ret_typ = DataFrameType(data_typs, index_typ, column_names)
    sig = signature(ret_typ, types.Tuple(args))
    return sig, codegen
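In the source project this typing/codegen pair is exposed through numba.extending.intrinsic, which makes init_dataframe callable from jitted code. A minimal usage sketch under that assumption (the arrays and column names below are hypothetical):

# Assumes init_dataframe is decorated with @numba.extending.intrinsic,
# as in the source project; the arrays and names are illustrative only.
import numpy as np
from numba import njit

@njit
def build_frame(index):
    a = np.arange(3)
    b = np.ones(3)
    # argument layout: n data arrays, then the index, then n column
    # names, which must resolve to string literals at typing time so
    # that a.literal_value is available in the typing function above
    return init_dataframe(a, b, index, 'A', 'B')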
Example No. 4
def _gen_pandas_read_csv_func_text(col_names,
                                   col_typs,
                                   py_col_dtypes,
                                   usecols,
                                   signature=None):

    func_name = 'csv_reader_py'
    return_columns = usecols if usecols and isinstance(usecols[0],
                                                       str) else col_names

    column_loc, _, _ = get_structure_maps(col_typs, return_columns)
    df_type = DataFrameType(tuple(col_typs),
                            types.none,
                            tuple(col_names),
                            column_loc=column_loc)

    df_type_repr = repr(df_type)
    # for some reason pandas and pyarrow read_csv() return CategoricalDtype
    # with ordered=False when the requested dtype has ordered=None
    df_type_repr = df_type_repr.replace('ordered=None', 'ordered=False')

    # TODO: support non-numpy types like strings
    date_inds = ", ".join(
        str(i) for i, t in enumerate(col_typs)
        if t.dtype == types.NPDatetime('ns'))

    if signature is None:
        signature = "filepath_or_buffer"

    # map generated func params onto the values used in the inner call of
    # pandas_read_csv: if no transformation is needed, just use the outer
    # param name (the APIs match); otherwise use the value from the dict
    inner_call_params = {'parse_dates': f"[{date_inds}]"}
    used_read_csv_params = ('filepath_or_buffer', 'names', 'skiprows',
                            'parse_dates', 'dtype', 'usecols', 'sep',
                            'delimiter')

    # pyarrow reads an unnamed header as " " while pandas reads it as
    # "Unnamed: N", so names inferred from the file should be replaced with
    # "Unnamed: N"; passing names to pyarrow means the first row is a header
    # and should be skipped
    if col_names and any(map(lambda x: x.startswith('Unnamed: '), col_names)):
        inner_call_params['names'] = str(col_names)
        inner_call_params['skiprows'] = "(skiprows and skiprows + 1) or 1"

    # the dtype parameter of the compiled function is not used at all; instead
    # a Python dict of column dtypes is captured at compile time, both because
    # some dtypes (like datetime) are converted and to avoid the penalty of
    # creating the dict in objmode
    inner_call_params['dtype'] = 'read_as_dtypes'

    params_str = '\n'.join([
        f"      {param}={inner_call_params.get(param, param)},"
        for param in used_read_csv_params
    ])
    func_text = '\n'.join([
        f"def {func_name}({signature}):",
        f"  with objmode(df=\"{df_type_repr}\"):",
        f"    df = pandas_read_csv(\n{params_str}", f"    )", f"  return df"
    ])

    global_vars = {
        'read_as_dtypes': py_col_dtypes,
        'objmode': objmode,
        'pandas_read_csv': pandas_read_csv,
    }

    return func_text, func_name, global_vars
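The generated source is typically materialized with exec() and compiled with Numba; a minimal sketch of that final step, assuming the inputs were prepared as for the function above:

# Hypothetical final step: compile the generated reader text into a
# jitted function (the input variables are assumed prepared earlier).
import numba

func_text, func_name, global_vars = _gen_pandas_read_csv_func_text(
    col_names, col_typs, py_col_dtypes, usecols)

loc_vars = {}
exec(func_text, global_vars, loc_vars)      # defines csv_reader_py
csv_reader_py = loc_vars[func_name]
csv_reader_jit = numba.njit(csv_reader_py)  # compiled CSV reader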