Example #1
def group_by_pk_hash_bucket(table: pa.Table, num_buckets: int,
                            primary_keys: List[str]) -> np.ndarray:

    # generate the primary key digest column
    all_pk_column_fields = []
    for pk_name in primary_keys:
        # casting a primary key column to numpy also ensures no nulls exist
        column_fields = table[pk_name].to_numpy()
        all_pk_column_fields.append(column_fields)
    hash_column_generator = hash_pk_bytes_generator(all_pk_column_fields)
    table = sc.append_pk_hash_column(table, hash_column_generator)

    # drop primary key columns to free up memory
    table = table.drop(primary_keys)

    # group hash bucket record indices
    hash_bucket_to_indices = np.empty([num_buckets], dtype="object")
    for record_index, digest in enumerate(sc.pk_hash_column_np(table)):
        hash_bucket = pk_digest_to_hash_bucket_index(digest, num_buckets)
        if hash_bucket_to_indices[hash_bucket] is None:
            hash_bucket_to_indices[hash_bucket] = []
        hash_bucket_to_indices[hash_bucket].append(record_index)

    # generate the ordered record number column
    hash_bucket_to_table = np.empty([num_buckets], dtype="object")
    for hash_bucket, indices in enumerate(hash_bucket_to_indices):
        if indices:
            hash_bucket_to_table[hash_bucket] = sc.append_record_idx_col(
                table.take(indices),
                indices,
            )
    return hash_bucket_to_table
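The sc module, hash_pk_bytes_generator, and pk_digest_to_hash_bucket_index used above come from the surrounding codebase and are not shown in this snippet. Below is a minimal, self-contained sketch of the same bucketing idea, assuming a sha1 digest over the concatenated primary-key values; the name toy_group_by_pk_hash_bucket is hypothetical.

import hashlib
from typing import List

import numpy as np
import pyarrow as pa


def toy_group_by_pk_hash_bucket(table: pa.Table, num_buckets: int,
                                primary_keys: List[str]) -> np.ndarray:
    # hash each row's primary-key values and map the digest to a bucket index
    pk_columns = [table[pk].to_pylist() for pk in primary_keys]
    bucket_to_indices = np.empty([num_buckets], dtype="object")
    for record_index, row in enumerate(zip(*pk_columns)):
        digest = hashlib.sha1("|".join(map(str, row)).encode()).digest()
        bucket = int.from_bytes(digest, "big") % num_buckets
        if bucket_to_indices[bucket] is None:
            bucket_to_indices[bucket] = []
        bucket_to_indices[bucket].append(record_index)
    # materialize one sub-table per non-empty bucket
    bucket_to_table = np.empty([num_buckets], dtype="object")
    for bucket, indices in enumerate(bucket_to_indices):
        if indices:
            bucket_to_table[bucket] = table.take(indices)
    return bucket_to_table


table = pa.table({"id": [1, 2, 3, 4], "region": ["a", "b", "a", "b"]})
buckets = toy_group_by_pk_hash_bucket(table, 2, ["id", "region"])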
Example #2
def drop_duplicates_by_primary_key_hash(table: pa.Table) -> pa.Table:
    # TODO: drop all primary key occurrences for DELETE delta types
    value_to_last_row_idx = {}
    row_idx = 0
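    # later rows overwrite earlier dict entries, so only the last occurrence
    # of each primary-key hash keeps its row index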
    for chunk in sc.pk_hash_column(table).iterchunks():
        for val in chunk.to_numpy(zero_copy_only=False):
            value_to_last_row_idx[val] = row_idx
            row_idx += 1
    return table.take(list(value_to_last_row_idx.values()))
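The same keep-the-last-occurrence technique, sketched on a hypothetical table with a plain string key in place of the primary-key hash column:

import pyarrow as pa


def keep_last_by_key(table: pa.Table, key: str) -> pa.Table:
    # map each key value to the row index of its last occurrence
    last_row_idx = {}
    for row_idx, value in enumerate(table[key].to_pylist()):
        last_row_idx[value] = row_idx
    return table.take(list(last_row_idx.values()))


t = pa.table({"pk": ["a", "b", "a", "c"], "v": [1, 2, 3, 4]})
print(keep_last_by_key(t, "pk").to_pydict())
# {'pk': ['a', 'b', 'c'], 'v': [3, 2, 4]}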
Example #3
def fill_gaps(
        arrow_df: pa.Table,
        gaps: List[Tuple[float, float]],
        sample_interval_micros: float,
        copy: bool = False) -> Tuple[pa.Table, List[Tuple[float, float]]]:
    """
    fills gaps in the table with np.nan or interpolated values by interpolating timestamps based on the
    calculated sample interval

    :param arrow_df: pyarrow table with data.  first column is "timestamps"
    :param gaps: list of tuples of known non-inclusive start and end timestamps of the gaps
    :param sample_interval_micros: known sample interval of the data points
    :param copy: if True, copy the data points, otherwise interpolate from edges, default False
    :return: table without gaps and the list of gaps
    """
    # extract the necessary information to compute gap size and gap timestamps
    data_time_stamps = arrow_df["timestamps"].to_numpy()
    if len(data_time_stamps) > 1:
        data_duration = data_time_stamps[-1] - data_time_stamps[0]
        expected_samples = (
            np.floor(data_duration / sample_interval_micros) +
            (1 if data_duration % sample_interval_micros >=
             sample_interval_micros * DEFAULT_GAP_UPPER_LIMIT else 0)) + 1
        if expected_samples > len(data_time_stamps):
            if copy:
                pcm = DataPointCreationMode["COPY"]
            else:
                pcm = DataPointCreationMode["NAN"]
            # make it safe to alter the gap values
            my_gaps = check_gap_list(gaps, data_time_stamps[0],
                                     data_time_stamps[-1])
            for gap in my_gaps:
                # snap the gap endpoints to the nearest existing timestamps
                before_start = np.argwhere(
                    [t <= gap[0] for t in data_time_stamps])
                after_end = np.argwhere(
                    [t >= gap[1] for t in data_time_stamps])
                if len(before_start) > 0:
                    before_start = before_start[-1][0]
                    # sim = gap[0] - data_time_stamps[before_start]
                    # result_df = add_data_points_to_df(result_df, before_start, sim, point_creation_mode=pcm)
                    gap = (data_time_stamps[before_start], gap[1])
                else:
                    before_start = None
                if len(after_end) > 0:
                    after_end = after_end[0][0]
                    # sim = gap[1] - data_time_stamps[after_end]
                    gap = (gap[0], data_time_stamps[after_end])
                else:
                    after_end = None
                num_new_points = int(
                    (gap[1] - gap[0]) / sample_interval_micros) - 1
                if before_start is not None:
                    arrow_df = add_data_points_to_df(arrow_df, before_start,
                                                     sample_interval_micros,
                                                     num_new_points, pcm)
                elif after_end is not None:
                    arrow_df = add_data_points_to_df(arrow_df, after_end,
                                                     -sample_interval_micros,
                                                     num_new_points, pcm)
        indic = pc.sort_indices(arrow_df,
                                sort_keys=[("timestamps", "ascending")])
        return arrow_df.take(indic), gaps
    return arrow_df, gaps
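check_gap_list, add_data_points_to_df, DataPointCreationMode, and DEFAULT_GAP_UPPER_LIMIT above are helpers from the surrounding codebase. A minimal sketch of the core idea of filling one gap, using plain pyarrow and NaN padding on a hypothetical two-column table (not the library's actual point-creation logic):

from typing import Tuple

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc


def toy_fill_gap(table: pa.Table, gap: Tuple[float, float],
                 sample_interval_micros: float) -> pa.Table:
    # number of samples missing strictly inside the gap
    num_new_points = int((gap[1] - gap[0]) / sample_interval_micros) - 1
    if num_new_points <= 0:
        return table
    # interpolate the missing timestamps and pad the data column with NaN
    new_timestamps = gap[0] + sample_interval_micros * np.arange(
        1, num_new_points + 1)
    filler = pa.table({
        "timestamps": new_timestamps,
        "value": np.full(num_new_points, np.nan),
    })
    combined = pa.concat_tables([table, filler])
    order = pc.sort_indices(combined, sort_keys=[("timestamps", "ascending")])
    return combined.take(order)


t = pa.table({"timestamps": [0.0, 1.0, 5.0, 6.0], "value": [1.0, 2.0, 3.0, 4.0]})
print(toy_fill_gap(t, (1.0, 5.0), 1.0)["timestamps"].to_pylist())
# [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]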
Example #4
def _sort_table_on_real_then_date(table: pa.Table) -> pa.Table:
    indices = pc.sort_indices(
        table, sort_keys=[("REAL", "ascending"), ("DATE", "ascending")]
    )
    sorted_table = table.take(indices)
    return sorted_table
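A usage sketch with hypothetical values, assuming pyarrow.compute is imported as pc alongside the function above:

import pyarrow as pa

table = pa.table({
    "REAL": [1, 0, 0, 1],
    "DATE": ["2020-02-01", "2020-01-01", "2020-02-01", "2020-01-01"],
})
print(_sort_table_on_real_then_date(table).to_pydict())
# {'REAL': [0, 0, 1, 1],
#  'DATE': ['2020-01-01', '2020-02-01', '2020-01-01', '2020-02-01']}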
Example #5
def make_sorted_groups(sorting_table: pa.Table,
                       input_table: pa.Table) -> SortedGroups:
    if not sorting_table.num_columns:
        # Exactly one output group, even for empty-table input
        return SortedGroups(
            sorted_groups=pa.table({
                "A": [None]
            }).select([]),  # 1-row, 0-col table
            # everything is one group (maybe 0-row)
            sorted_input_table=input_table,
            group_splits=np.array([], dtype=np.int64),
        )

    # pyarrow 3.0.0 can't sort dictionary columns.
    # TODO make sort-dictionary work; nix this conversion
    sorting_table_without_dictionary = pa.table(
        [
            column.cast(pa.utf8())
            if pa.types.is_dictionary(column.type) else column
            for column in sorting_table.columns
        ],
        schema=pa.schema([
            pa.field(field.name, pa.utf8())
            if pa.types.is_dictionary(field.type) else field
            for field in sorting_table.schema
        ]),
    )
    indices = pa.compute.sort_indices(
        sorting_table_without_dictionary,
        sort_keys=[(c, "ascending")
                   for c in sorting_table_without_dictionary.column_names],
    )

    sorted_groups_with_dups_and_nulls = sorting_table.take(indices)
    # Behavior we ought to DEPRECATE: to mimic Pandas, we drop all groups that
    # contain NULL. This is mathematically sound for Pandas' "NA" (because if
    # all these unknown things are the same thing, doesn't that mean we know
    # something about them? -- reductio ad absurdum, QED). But Workbench's NULL
    # is a bit closer to SQL NULL, which means "whatever you say, pal".
    #
    # This null-dropping is for backwards compat. TODO make it optional ... and
    # eventually nix the option and always output NULL groups.
    nonnull_indices = indices.filter(
        find_nonnull_table_mask(sorted_groups_with_dups_and_nulls))

    if input_table.num_columns:
        sorted_input_table = input_table.take(nonnull_indices)
    else:
        # Don't .take() on a zero-column Arrow table: its .num_rows would change
        #
        # All rows are identical, so .slice() gives the table we want
        sorted_input_table = input_table.slice(0, len(nonnull_indices))

    sorted_groups_with_dups = sorting_table.take(nonnull_indices)

    # "is_dup": find each row in sorted_groups_with_dups that is _equal_ to
    # the row before it. (The first value compares the first and second row.)
    #
    # We start assuming all are equal; then we search for inequality
    if len(sorted_groups_with_dups):
        is_dup = pa.array(
            np.ones(len(sorted_groups_with_dups) - 1, dtype=bool), pa.bool_())
        for column in sorted_groups_with_dups.itercolumns():
            chunk = column.chunks[0]
            if pa.types.is_dictionary(chunk.type):
                chunk = chunk.indices
            first = chunk.slice(0, len(column) - 1)
            second = chunk.slice(1)
            # TODO when we support NULL groups:
            # both_null = pa.compute.and_(first.is_null(), second.is_null())
            # both_equal_if_not_null = pa.compute.equal(first, second)
            # both_equal = pa.compute.fill_null(both_equal_if_not_null, False)
            # value_is_dup = pa.compute.or_(both_null, both_equal)
            # ... and for now, it's simply:
            value_is_dup = pa.compute.equal(first, second)
            is_dup = pa.compute.and_(is_dup, value_is_dup)

        group_splits = np.where(~(is_dup.to_numpy(
            zero_copy_only=False)))[0] + 1

        sorted_groups = reencode_dictionaries(
            sorted_groups_with_dups.take(np.insert(group_splits, 0, 0)))
    else:
        sorted_groups = sorted_groups_with_dups
        group_splits = np.array([], dtype=np.int64)

    return SortedGroups(
        sorted_groups=sorted_groups,
        sorted_input_table=sorted_input_table,
        group_splits=group_splits,
    )
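The returned group_splits array holds the row index at which each new group starts in the sorted tables. A sketch, under that assumption, of slicing a sorted table back into per-group sub-tables (the SortedGroups container and reencode_dictionaries helper are defined elsewhere in the codebase):

from typing import List

import numpy as np
import pyarrow as pa


def split_sorted_table(sorted_table: pa.Table,
                       group_splits: np.ndarray) -> List[pa.Table]:
    # slice between consecutive group boundaries to get one table per group
    bounds = np.concatenate(([0], group_splits, [sorted_table.num_rows]))
    return [
        sorted_table.slice(int(start), int(end - start))
        for start, end in zip(bounds[:-1], bounds[1:])
        if end > start
    ]


t = pa.table({"g": ["a", "a", "b", "c"], "x": [1, 2, 3, 4]})
for part in split_sorted_table(t, np.array([2, 3])):
    print(part.to_pydict())
# {'g': ['a', 'a'], 'x': [1, 2]}
# {'g': ['b'], 'x': [3]}
# {'g': ['c'], 'x': [4]}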