Example #1
def resample_single_real_table(table: pa.Table, freq: Frequency) -> pa.Table:
    """Resample table that contains only a single realization.
    The table must contain a DATE column and it must be sorted on DATE
    """

    # Note:
    # Getting metadata using json.loads() takes quite a bit of time,
    # so this info should ideally be provided in another way.

    schema = table.schema

    raw_dates_np = table.column("DATE").to_numpy()
    raw_dates_np_as_uint = raw_dates_np.astype(np.uint64)

    min_raw_date = np.min(raw_dates_np)
    max_raw_date = np.max(raw_dates_np)

    sample_dates_np = generate_normalized_sample_dates(
        min_raw_date, max_raw_date, freq=freq
    )
    sample_dates_np_as_uint = sample_dates_np.astype(np.uint64)

    column_arrays = []

    for colname in schema.names:
        if colname == "DATE":
            column_arrays.append(sample_dates_np)
        elif colname == "REAL":
            column_arrays.append(
                np.full(len(sample_dates_np), table.column("REAL")[0].as_py())
            )
        else:
            raw_numpy_arr = table.column(colname).to_numpy()
            if is_rate_from_field_meta(table.field(colname)):
                # Rate vectors are backfilled, with 0 outside the raw date range
                inter = interpolate_backfill(
                    sample_dates_np_as_uint, raw_dates_np_as_uint, raw_numpy_arr, 0, 0
                )
            else:
                # Non-rate vectors are linearly interpolated
                inter = np.interp(
                    sample_dates_np_as_uint, raw_dates_np_as_uint, raw_numpy_arr
                )

            column_arrays.append(inter)

    ret_table = pa.table(column_arrays, schema=schema)

    return ret_table
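
The helper interpolate_backfill is not defined in the snippet above. Below is a minimal sketch of what it could look like, assuming the signature implied by the call sites (sample x-values, raw x-values, raw y-values, and fill values for samples left/right of the raw range):

import numpy as np

def interpolate_backfill(
    x: np.ndarray, xp: np.ndarray, yp: np.ndarray, yleft: float, yright: float
) -> np.ndarray:
    """Backfill interpolation: each sample in x takes the value of the first
    raw point at or after it. Samples outside the range of xp get yleft/yright.
    This is a sketch, not the original implementation.
    """
    # Index of the first raw point >= each sample point
    idx = np.searchsorted(xp, x, side="left")
    idx = np.clip(idx, 0, len(xp) - 1)
    ret = yp[idx].astype(np.float64)
    # Fill values outside the raw range (the call sites pass 0 for rates)
    ret[x < xp[0]] = yleft
    ret[x > xp[-1]] = yright
    return ret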
Example #2
def read_columns(table: pa.Table, full: bool = True) -> List[Column]:
    """Read Column definitions and validate Workbench assumptions.

    Raise ValidateError if:

    * table has metadata
    * table has more than one record batch
    * columns have invalid metadata (e.g., a "format" on a "text" column, or
      a timestamp with unit!=ns or a timezone)
    * column values disagree with metadata (e.g., date32 "2021-04-12" with
      `ColumnType.Date("month")`)

    Be sure the Arrow file backing the table was validated with
    `validate_arrow_file()` first. Otherwise, you'll get undefined behavior.

    If `full=False`, skip costly checks. Only pass `full=False` when you can
    guarantee the data was generated by a source you trust. (In particular,
    module output is not trusted and must use the default `full=True`.)
    """
    if table.schema.metadata is not None:
        raise TableSchemaHasMetadata()

    seen_column_names: Dict[str, int] = {}
    ret = []

    for position, column in enumerate(table.itercolumns()):
        field = table.field(position)
        if column.num_chunks > 1:
            raise TableHasTooManyRecordBatches(column.num_chunks)

        if field.name in seen_column_names:
            raise DuplicateColumnName(
                field.name, seen_column_names[field.name], position
            )
        seen_column_names[field.name] = position

        ret.append(Column(field.name, _read_column_type(column, field, full=full)))

    return ret
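
A hypothetical usage sketch, assuming the Column type, _read_column_type(), and the exception classes come from the same module and are already in scope:

import pyarrow as pa

# A small single-batch table with no schema metadata, as the validator requires
table = pa.table({"name": pa.array(["a", "b"]), "count": pa.array([1, 2])})

columns = read_columns(table)  # full (costly) validation
trusted = read_columns(table, full=False)  # only for data from a trusted source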
Example #3
def sample_segmented_multi_real_table_at_date(
    table: pa.Table, np_datetime: np.datetime64
) -> pa.Table:
    """Sample table containing multiple realizations at the specified date.
    The table must contain both a REAL and a DATE column.
    The table must be segmented on REAL (so that all rows from a single
    realization are contiguous) and within each REAL segment, it must be
    sorted on DATE.
    """
    # pylint: disable=too-many-locals

    unique_reals_arr_np, first_occurrence_idx, real_counts = np.unique(
        table.column("REAL").to_numpy(), return_index=True, return_counts=True
    )

    all_dates_arr_np = table.column("DATE").to_numpy()

    # Will receive row indices into the full input table for the two values we
    # should interpolate/blend between.
    # To keep things simple, we always add two indices per realization, even when
    # we know no interpolation will be needed (e.g. exact matches).
    row_indices = []

    # Will receive the blending weights for doing interpolation
    interpolate_t_arr = np.zeros(len(unique_reals_arr_np))

    # Mask used when backfilling rate vectors: a value of 1 selects v1, while a
    # value of 0 yields 0.
    backfill_mask_arr = np.ones(len(unique_reals_arr_np))

    for i, _real in enumerate(unique_reals_arr_np):
        # Starting row of this realization and number of rows belonging to realization
        start_row_idx = first_occurrence_idx[i]
        row_count = real_counts[i]

        # Get slice of the dates for just this realization
        dates_arr_np = all_dates_arr_np[start_row_idx : start_row_idx + row_count]
        assert len(dates_arr_np) > 0

        # OUTSIDE RANGE (query date is before our first date)
        if np_datetime < dates_arr_np[0]:
            row_indices.append(start_row_idx)
            row_indices.append(start_row_idx)
            # Non-rates extrapolate flat from the first value; rates are zeroed
            # interpolate_t_arr[i] = 0
            backfill_mask_arr[i] = 0

        # OUTSIDE RANGE (query date is beyond our last date)
        elif np_datetime > dates_arr_np[-1]:
            row_indices.append(start_row_idx + row_count - 1)
            row_indices.append(start_row_idx + row_count - 1)
            # Non-rates extrapolate flat from the last value; rates are zeroed.
            # For interpolation, t should really be 1, but since the rows are
            # duplicated it does not matter.
            # interpolate_t_arr[i] = 0
            backfill_mask_arr[i] = 0

        # EXACT MATCH on the LAST DATE
        elif np_datetime == dates_arr_np[-1]:
            row_indices.append(start_row_idx + row_count - 1)
            row_indices.append(start_row_idx + row_count - 1)
            # interpolate_t_arr[i] = 0
            # backfill_mask_arr[i] = 1

        else:
            # Search for query date amongst the realization's dates.
            # last_insertion_index is the last legal insertion index of the queried value
            last_insertion_index: int = np.searchsorted(
                dates_arr_np, np_datetime, side="right"
            ).item()

            assert 0 < last_insertion_index < len(dates_arr_np)
            assert dates_arr_np[last_insertion_index - 1] <= np_datetime
            assert dates_arr_np[last_insertion_index] > np_datetime

            if dates_arr_np[last_insertion_index - 1] == np_datetime:
                # Exact match
                row_indices.append(start_row_idx + last_insertion_index - 1)
                row_indices.append(start_row_idx + last_insertion_index - 1)
                # interpolate_t_arr[i] = 0
                # backfill_mask_arr[i] = 1
            else:
                row_indices.append(start_row_idx + last_insertion_index - 1)
                row_indices.append(start_row_idx + last_insertion_index)
                interpolate_t_arr[i] = _compute_interpolation_weight(
                    np_datetime,
                    dates_arr_np[last_insertion_index - 1],
                    dates_arr_np[last_insertion_index],
                )
                # backfill_mask_arr[i] = 1

    column_arrays = []

    for colname in table.schema.names:
        if colname == "REAL":
            column_arrays.append(unique_reals_arr_np)
        elif colname == "DATE":
            column_arrays.append(np.full(len(unique_reals_arr_np), np_datetime))
        else:
            records_np = table.column(colname).take(row_indices).to_numpy()
            if is_rate_from_field_meta(table.field(colname)):
                v1_arr = records_np[1::2]
                interpolated_vec_values = v1_arr * backfill_mask_arr
            else:
                v0_arr = records_np[0::2]
                v1_arr = records_np[1::2]
                delta_arr = v1_arr - v0_arr
                interpolated_vec_values = v0_arr + (delta_arr * interpolate_t_arr)

            column_arrays.append(pa.array(interpolated_vec_values))

    ret_table = pa.table(column_arrays, schema=table.schema)

    return ret_table
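
_compute_interpolation_weight() is referenced but not shown. A minimal sketch, assuming it returns the linear blend factor t for the query date between the two bracketing dates:

import numpy as np

def _compute_interpolation_weight(
    d: np.datetime64, d0: np.datetime64, d1: np.datetime64
) -> float:
    """Return the linear blend weight t in [0, 1] for d between d0 and d1.
    This is a sketch, not the original implementation.
    """
    # View the datetimes as integer ticks so the division is well defined
    d_as_uint = d.astype(np.uint64)
    d0_as_uint = d0.astype(np.uint64)
    d1_as_uint = d1.astype(np.uint64)
    return float(d_as_uint - d0_as_uint) / float(d1_as_uint - d0_as_uint)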
Example #4
def resample_segmented_multi_real_table(table: pa.Table, freq: Frequency) -> pa.Table:
    """Resample table containing multiple realizations.
    The table must contain both a REAL and a DATE column.
    The table must be segmented on REAL (so that all rows from a single
    realization are contiguous) and within each REAL segment, it must be
    sorted on DATE.
    The segmentation is needed since interpolation must be done per realization,
    and row slicing is used for speed.
    """
    # pylint: disable=too-many-locals

    real_arr_np = table.column("REAL").to_numpy()
    unique_reals, first_occurrence_idx, real_counts = np.unique(
        real_arr_np, return_index=True, return_counts=True
    )

    output_columns_dict: Dict[str, pa.ChunkedArray] = {}

    real_interpolation_info_dict: Dict[int, RealInterpolationInfo] = {}

    for colname in table.schema.names:
        if colname in ["DATE", "REAL"]:
            continue

        is_rate = is_rate_from_field_meta(table.field(colname))
        raw_whole_numpy_arr = table.column(colname).to_numpy()

        vec_arr_list = []
        for i, real in enumerate(unique_reals):
            start_row_idx = first_occurrence_idx[i]
            row_count = real_counts[i]

            rii = real_interpolation_info_dict.get(real)
            if rii is None:
                rii = _extract_real_interpolation_info(
                    table, start_row_idx, row_count, freq
                )
                real_interpolation_info_dict[real] = rii

            raw_numpy_arr = raw_whole_numpy_arr[
                start_row_idx : start_row_idx + row_count
            ]

            if is_rate:
                inter = interpolate_backfill(
                    rii.sample_dates_np_as_uint,
                    rii.raw_dates_np_as_uint,
                    raw_numpy_arr,
                    0,
                    0,
                )
            else:
                inter = np.interp(
                    rii.sample_dates_np_as_uint,
                    rii.raw_dates_np_as_uint,
                    raw_numpy_arr,
                )

            arr_length = len(rii.sample_dates_np_as_uint)
            assert arr_length == len(inter)

            vec_arr_list.append(inter)

        output_columns_dict[colname] = pa.chunked_array(vec_arr_list)

    date_arr_list = []
    real_arr_list = []
    for real in unique_reals:
        rii = real_interpolation_info_dict[real]
        arr_length = len(rii.sample_dates_np)
        date_arr_list.append(rii.sample_dates_np)
        real_arr_list.append(np.full(arr_length, real))

    output_columns_dict["DATE"] = pa.chunked_array(date_arr_list)
    output_columns_dict["REAL"] = pa.chunked_array(real_arr_list)

    ret_table = pa.table(output_columns_dict, schema=table.schema)

    return ret_table
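
RealInterpolationInfo and _extract_real_interpolation_info() are not shown above. A sketch of what they could look like, assuming each realization caches its raw dates together with the normalized sample dates generated for them; the field names are inferred from the usage above, and Frequency and generate_normalized_sample_dates come from the surrounding module:

from dataclasses import dataclass

import numpy as np
import pyarrow as pa


@dataclass(frozen=True)
class RealInterpolationInfo:
    raw_dates_np: np.ndarray
    raw_dates_np_as_uint: np.ndarray
    sample_dates_np: np.ndarray
    sample_dates_np_as_uint: np.ndarray


def _extract_real_interpolation_info(
    table: pa.Table, start_row_idx: int, row_count: int, freq: Frequency
) -> RealInterpolationInfo:
    # Dates for just this realization's row segment
    raw_dates_np = table.column("DATE").slice(start_row_idx, row_count).to_numpy()
    sample_dates_np = generate_normalized_sample_dates(
        raw_dates_np[0], raw_dates_np[-1], freq=freq
    )
    return RealInterpolationInfo(
        raw_dates_np=raw_dates_np,
        raw_dates_np_as_uint=raw_dates_np.astype(np.uint64),
        sample_dates_np=sample_dates_np,
        sample_dates_np_as_uint=sample_dates_np.astype(np.uint64),
    )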