Example #1
def resample_single_real_table(table: pa.Table, freq: Frequency) -> pa.Table:
    """Resample table that contains only a single realization.
    The table must contain a DATE column and it must be sorted on DATE
    """

    # Note:
    # Getting metadata using json.loads() takes quite a bit of time,
    # so this info should ideally be provided in another way.

    schema = table.schema

    raw_dates_np = table.column("DATE").to_numpy()
    raw_dates_np_as_uint = raw_dates_np.astype(np.uint64)

    min_raw_date = np.min(raw_dates_np)
    max_raw_date = np.max(raw_dates_np)

    sample_dates_np = generate_normalized_sample_dates(
        min_raw_date, max_raw_date, freq=freq
    )
    sample_dates_np_as_uint = sample_dates_np.astype(np.uint64)

    column_arrays = []

    for colname in schema.names:
        if colname == "DATE":
            column_arrays.append(sample_dates_np)
        elif colname == "REAL":
            column_arrays.append(
                np.full(len(sample_dates_np), table.column("REAL")[0].as_py())
            )
        else:
            raw_numpy_arr = table.column(colname).to_numpy()
            if is_rate_from_field_meta(table.field(colname)):
                # Rate vectors are backfilled, with 0 outside the raw date range
                inter = interpolate_backfill(
                    sample_dates_np_as_uint, raw_dates_np_as_uint, raw_numpy_arr, 0, 0
                )
            else:
                # Non-rate vectors are linearly interpolated
                inter = np.interp(
                    sample_dates_np_as_uint, raw_dates_np_as_uint, raw_numpy_arr
                )

            column_arrays.append(inter)

    ret_table = pa.table(column_arrays, schema=schema)

    return ret_table
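
The helper interpolate_backfill is not defined in the snippet above. Below is a minimal sketch of what it could look like, assuming the signature implied by the call sites (sample x-values, raw x-values, raw y-values, and fill values for samples left/right of the raw range):

import numpy as np

def interpolate_backfill(
    x: np.ndarray, xp: np.ndarray, yp: np.ndarray, yleft: float, yright: float
) -> np.ndarray:
    """Backfill interpolation: each sample in x takes the value of the first
    raw point at or after it. Samples outside the range of xp get yleft/yright.
    This is a sketch, not the original implementation.
    """
    # Index of the first raw point >= each sample point
    idx = np.searchsorted(xp, x, side="left")
    idx = np.clip(idx, 0, len(xp) - 1)
    ret = yp[idx].astype(np.float64)
    # Fill values outside the raw range (the call sites pass 0 for rates)
    ret[x < xp[0]] = yleft
    ret[x > xp[-1]] = yright
    return ret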
Example #2
def read_columns(table: pa.Table, full: bool = True) -> List[Column]:
    """Read Column definitions and validate Workbench assumptions.

    Raise ValidateError if:

    * table has metadata
    * table has more than one record batch
    * columns have invalid metadata (e.g., a "format" on a "text" column, or
      a timestamp with unit!=ns or a timezone)
    * column values disagree with metadata (e.g., date32 "2021-04-12" with
      `ColumnType.Date("month")`)

    Be sure the Arrow file backing the table was validated with
    `validate_arrow_file()` first. Otherwise, you'll get undefined behavior.

    If `full=False`, skip costly checks. Only pass `full=False` when you can
    guarantee the data was generated by a source you trust. (In particular,
    module output is not trusted and must use the default `full=True`.)
    """
    if table.schema.metadata is not None:
        raise TableSchemaHasMetadata()

    seen_column_names: Dict[str, int] = {}
    ret = []

    for position, column in enumerate(table.itercolumns()):
        field = table.field(position)
        if column.num_chunks > 1:
            raise TableHasTooManyRecordBatches(column.num_chunks)

        if field.name in seen_column_names:
            raise DuplicateColumnName(
                field.name, seen_column_names[field.name], position
            )
        seen_column_names[field.name] = position

        ret.append(Column(field.name, _read_column_type(column, field, full=full)))

    return ret
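
A hypothetical usage sketch, assuming the Column type, _read_column_type(), and the exception classes come from the same module and are already in scope:

import pyarrow as pa

# A small single-batch table with no schema metadata, as the validator requires
table = pa.table({"name": pa.array(["a", "b"]), "count": pa.array([1, 2])})

columns = read_columns(table)  # full (costly) validation
trusted = read_columns(table, full=False)  # only for data from a trusted source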
Example #3
def sample_segmented_multi_real_table_at_date(
    table: pa.Table, np_datetime: np.datetime64
) -> pa.Table:
    """Sample table containing multiple realizations at the specified date.
    The table must contain both a REAL and a DATE column.
    The table must be segmented on REAL (so that all rows from a single
    realization are contiguous) and within each REAL segment, it must be
    sorted on DATE.
    """
    # pylint: disable=too-many-locals

    unique_reals_arr_np, first_occurrence_idx, real_counts = np.unique(
        table.column("REAL").to_numpy(), return_index=True, return_counts=True
    )

    all_dates_arr_np = table.column("DATE").to_numpy()

    # Will receive row indices into the full input table for the two values we
    # should interpolate/blend between.
    # To keep things simple, we always add two indices per realization, even when
    # we know no interpolation will be needed (e.g. exact matches).
    row_indices = []

    # Will receive the blending weights for doing interpolation
    interpolate_t_arr = np.zeros(len(unique_reals_arr_np))

    # Mask used when backfilling rate vectors: a value of 1 selects v1, while a
    # value of 0 yields 0.
    backfill_mask_arr = np.ones(len(unique_reals_arr_np))

    for i, _real in enumerate(unique_reals_arr_np):
        # Starting row of this realization and number of rows belonging to realization
        start_row_idx = first_occurrence_idx[i]
        row_count = real_counts[i]

        # Get slice of the dates for just this realization
        dates_arr_np = all_dates_arr_np[start_row_idx : start_row_idx + row_count]
        assert len(dates_arr_np) > 0

        # OUTSIDE RANGE (query date is before our first date)
        if np_datetime < dates_arr_np[0]:
            row_indices.append(start_row_idx)
            row_indices.append(start_row_idx)
            # Non-rates extrapolate flat from the first value; rates are zeroed
            # interpolate_t_arr[i] = 0
            backfill_mask_arr[i] = 0

        # OUTSIDE RANGE (query date is beyond our last date)
        elif np_datetime > dates_arr_np[-1]:
            row_indices.append(start_row_idx + row_count - 1)
            row_indices.append(start_row_idx + row_count - 1)
            # Non-rates extrapolate flat from the last value; rates are zeroed.
            # For interpolation, t should really be 1, but since the rows are
            # duplicated it does not matter.
            # interpolate_t_arr[i] = 0
            backfill_mask_arr[i] = 0

        # EXACT MATCH on the LAST DATE
        elif np_datetime == dates_arr_np[-1]:
            row_indices.append(start_row_idx + row_count - 1)
            row_indices.append(start_row_idx + row_count - 1)
            # interpolate_t_arr[i] = 0
            # backfill_mask_arr[i] = 1

        else:
            # Search for query date amongst the realization's dates.
            # last_insertion_index is the last legal insertion index of the queried value
            last_insertion_index: int = np.searchsorted(
                dates_arr_np, np_datetime, side="right"
            ).item()

            assert 0 < last_insertion_index < len(dates_arr_np)
            assert dates_arr_np[last_insertion_index - 1] <= np_datetime
            assert dates_arr_np[last_insertion_index] > np_datetime

            if dates_arr_np[last_insertion_index - 1] == np_datetime:
                # Exact match
                row_indices.append(start_row_idx + last_insertion_index - 1)
                row_indices.append(start_row_idx + last_insertion_index - 1)
                # interpolate_t_arr[i] = 0
                # backfill_mask_arr[i] = 1
            else:
                row_indices.append(start_row_idx + last_insertion_index - 1)
                row_indices.append(start_row_idx + last_insertion_index)
                interpolate_t_arr[i] = _compute_interpolation_weight(
                    np_datetime,
                    dates_arr_np[last_insertion_index - 1],
                    dates_arr_np[last_insertion_index],
                )
                # backfill_mask_arr[i] = 1

    column_arrays = []

    for colname in table.schema.names:
        if colname == "REAL":
            column_arrays.append(unique_reals_arr_np)
        elif colname == "DATE":
            column_arrays.append(np.full(len(unique_reals_arr_np), np_datetime))
        else:
            records_np = table.column(colname).take(row_indices).to_numpy()
            if is_rate_from_field_meta(table.field(colname)):
                v1_arr = records_np[1::2]
                interpolated_vec_values = v1_arr * backfill_mask_arr
            else:
                v0_arr = records_np[0::2]
                v1_arr = records_np[1::2]
                delta_arr = v1_arr - v0_arr
                interpolated_vec_values = v0_arr + (delta_arr * interpolate_t_arr)

            column_arrays.append(pa.array(interpolated_vec_values))

    ret_table = pa.table(column_arrays, schema=table.schema)

    return ret_table
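
_compute_interpolation_weight() is referenced but not shown. A minimal sketch, assuming it returns the linear blend factor t for the query date between the two bracketing dates:

import numpy as np

def _compute_interpolation_weight(
    d: np.datetime64, d0: np.datetime64, d1: np.datetime64
) -> float:
    """Return the linear blend weight t in [0, 1] for d between d0 and d1.
    This is a sketch, not the original implementation.
    """
    # View the datetimes as integer ticks so the division is well defined
    d_as_uint = d.astype(np.uint64)
    d0_as_uint = d0.astype(np.uint64)
    d1_as_uint = d1.astype(np.uint64)
    return float(d_as_uint - d0_as_uint) / float(d1_as_uint - d0_as_uint)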
Example #4
def resample_segmented_multi_real_table(table: pa.Table, freq: Frequency) -> pa.Table:
    """Resample table containing multiple realizations.
    The table must contain both a REAL and a DATE column.
    The table must be segmented on REAL (so that all rows from a single
    realization are contiguous) and within each REAL segment, it must be
    sorted on DATE.
    The segmentation is needed since interpolation must be done per realization,
    and row slicing is used for speed.
    """
    # pylint: disable=too-many-locals

    real_arr_np = table.column("REAL").to_numpy()
    unique_reals, first_occurrence_idx, real_counts = np.unique(
        real_arr_np, return_index=True, return_counts=True
    )

    output_columns_dict: Dict[str, pa.ChunkedArray] = {}

    real_interpolation_info_dict: Dict[int, RealInterpolationInfo] = {}

    for colname in table.schema.names:
        if colname in ["DATE", "REAL"]:
            continue

        is_rate = is_rate_from_field_meta(table.field(colname))
        raw_whole_numpy_arr = table.column(colname).to_numpy()

        vec_arr_list = []
        for i, real in enumerate(unique_reals):
            start_row_idx = first_occurrence_idx[i]
            row_count = real_counts[i]

            rii = real_interpolation_info_dict.get(real)
            if rii is None:
                rii = _extract_real_interpolation_info(
                    table, start_row_idx, row_count, freq
                )
                real_interpolation_info_dict[real] = rii

            raw_numpy_arr = raw_whole_numpy_arr[
                start_row_idx : start_row_idx + row_count
            ]

            if is_rate:
                inter = interpolate_backfill(
                    rii.sample_dates_np_as_uint,
                    rii.raw_dates_np_as_uint,
                    raw_numpy_arr,
                    0,
                    0,
                )
            else:
                inter = np.interp(
                    rii.sample_dates_np_as_uint,
                    rii.raw_dates_np_as_uint,
                    raw_numpy_arr,
                )

            arr_length = len(rii.sample_dates_np_as_uint)
            assert arr_length == len(inter)

            vec_arr_list.append(inter)

        output_columns_dict[colname] = pa.chunked_array(vec_arr_list)

    date_arr_list = []
    real_arr_list = []
    for real in unique_reals:
        rii = real_interpolation_info_dict[real]
        arr_length = len(rii.sample_dates_np)
        date_arr_list.append(rii.sample_dates_np)
        real_arr_list.append(np.full(arr_length, real))

    output_columns_dict["DATE"] = pa.chunked_array(date_arr_list)
    output_columns_dict["REAL"] = pa.chunked_array(real_arr_list)

    ret_table = pa.table(output_columns_dict, schema=table.schema)

    return ret_table
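
RealInterpolationInfo and _extract_real_interpolation_info() are not shown above. A sketch of what they could look like, assuming each realization caches its raw dates together with the normalized sample dates generated for them; the field names are inferred from the usage above, and Frequency and generate_normalized_sample_dates come from the surrounding module:

from dataclasses import dataclass

import numpy as np
import pyarrow as pa


@dataclass(frozen=True)
class RealInterpolationInfo:
    raw_dates_np: np.ndarray
    raw_dates_np_as_uint: np.ndarray
    sample_dates_np: np.ndarray
    sample_dates_np_as_uint: np.ndarray


def _extract_real_interpolation_info(
    table: pa.Table, start_row_idx: int, row_count: int, freq: Frequency
) -> RealInterpolationInfo:
    # Dates for just this realization's row segment
    raw_dates_np = table.column("DATE").slice(start_row_idx, row_count).to_numpy()
    sample_dates_np = generate_normalized_sample_dates(
        raw_dates_np[0], raw_dates_np[-1], freq=freq
    )
    return RealInterpolationInfo(
        raw_dates_np=raw_dates_np,
        raw_dates_np_as_uint=raw_dates_np.astype(np.uint64),
        sample_dates_np=sample_dates_np,
        sample_dates_np_as_uint=sample_dates_np.astype(np.uint64),
    )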