Exemplo n.º 1
0
def arrow_to_pydf(data: pa.Table,
                  columns: Optional[Sequence[str]] = None,
                  rechunk: bool = True) -> "PyDataFrame":
    """
    Build a PyDataFrame out of an Arrow Table.

    Parameters
    ----------
    data
        The Arrow table to convert.
    columns
        Optional replacement column names; when given, the table is renamed
        with them before conversion.
    rechunk
        When true, ``rechunk()`` is called on the resulting PyDataFrame.

    Raises
    ------
    ValueError
        If Arrow rejects the rename with ``ArrowInvalid``.
    """
    if columns is not None:
        try:
            data = data.rename_columns(columns)
        except pa.lib.ArrowInvalid as err:
            raise ValueError(
                "Dimensions of columns arg must match data dimensions.") from err

    coerced = {}
    for idx, arr in enumerate(data):
        # The name has to be read before coercion replaces the array object.
        label = f"column_{idx}" if arr._name is None else arr._name
        coerced[label] = coerce_arrow(arr)

    pydf = PyDataFrame.from_arrow_record_batches(
        pa.table(coerced).to_batches())
    return pydf.rechunk() if rechunk else pydf
Exemplo n.º 2
0
def _run_field_mapping(
    table: pyarrow.Table, field_mapping: Dict[str, str],
) -> pyarrow.Table:
    """Rename the table's columns using ``field_mapping`` (forward direction).

    A column whose name is a key of ``field_mapping`` is renamed to the mapped
    value; every other column keeps its current name. The renamed table is
    returned.
    """
    renamed = [field_mapping.get(name, name) for name in table.column_names]
    return table.rename_columns(renamed)
Exemplo n.º 3
0
def run_forward_field_mapping(table: pyarrow.Table,
                              feature_view: FeatureView) -> pyarrow.Table:
    """Apply the feature view's input field mapping to the table's columns.

    The table is returned unchanged when it is ``None`` or when
    ``feature_view.input.field_mapping`` is ``None``. Otherwise each column
    whose name is a key of the mapping is renamed to the mapped value, and
    all other columns keep their name.
    """
    if table is None:
        return table

    mapping = feature_view.input.field_mapping
    if mapping is None:
        return table

    renamed = [mapping.get(name, name) for name in table.column_names]
    return table.rename_columns(renamed)
Exemplo n.º 4
0
def arrow_to_pydf(data: pa.Table,
                  columns: ColumnsType | None = None,
                  rechunk: bool = True) -> PyDataFrame:
    """
    Build a PyDataFrame out of an Arrow Table.

    Parameters
    ----------
    data
        The Arrow table to convert.
    columns
        Optional column specification. It is unpacked with
        ``_unpack_columns`` into names (used to rename the table) and dtypes.
    rechunk
        When true, ``rechunk()`` is called on the frame built from the
        non-dictionary columns; the flag is also forwarded to
        ``arrow_to_pyseries`` for dictionary columns.

    Raises
    ------
    ImportError
        If pyarrow is not available.
    ValueError
        If Arrow rejects the rename with ``ArrowInvalid``.
    """
    if not _PYARROW_AVAILABLE:  # pragma: no cover
        raise ImportError(
            "'pyarrow' is required when constructing a PyDataFrame from an Arrow Table."
        )
    original_columns = columns
    if columns is not None:
        columns, dtypes = _unpack_columns(columns)
        try:
            data = data.rename_columns(columns)
        except pa.lib.ArrowInvalid as err:
            raise ValueError(
                "Dimensions of columns arg must match data dimensions.") from err

    plain_arrays = {}
    # Dictionaries cannot be built in different batches (categorical does not
    # allow that), so those columns are converted to series separately.
    categorical_series = []
    ordered_names = []
    for idx, arr in enumerate(data):
        # The name has to be read before coercion replaces the array object.
        label = f"column_{idx}" if arr._name is None else arr._name
        ordered_names.append(label)

        arr = coerce_arrow(arr)
        if not pa.types.is_dictionary(arr.type):
            plain_arrays[label] = arr
            continue
        categorical_series.append(
            pli.wrap_s(arrow_to_pyseries(label, arr, rechunk)))

    if not plain_arrays:
        pydf = pli.DataFrame([])._df
    else:
        tbl = pa.table(plain_arrays)
        if tbl.shape[0] == 0:
            # Row-less tables go through pandas so the datatypes are kept.
            pydf = pli.DataFrame._from_pandas(tbl.to_pandas())._df
        else:
            pydf = PyDataFrame.from_arrow_record_batches(tbl.to_batches())
    if rechunk:
        pydf = pydf.rechunk()

    if categorical_series:
        # Add the separately-built columns, then restore the original order.
        frame = pli.wrap_df(pydf).with_columns(
            [pli.lit(s).alias(s.name) for s in categorical_series])
        pydf = frame[ordered_names]._df

    if columns is not None and dtypes and original_columns:
        pydf = _post_apply_columns(pydf, original_columns)
    return pydf