Example #1
    def add_table(self, table: Table, col: str) -> Table:
        """Registers a table for replaying and returns the associated replay table.

        Args:
            table (Table): the table to be replayed
            col (str): column in the table containing timestamps

        Returns:
            a replay Table

        Raises:
            DHError
        """
        try:
            replay_table = Table(j_table=self._j_replayer.replay(table.j_table, col))
            return replay_table
        except Exception as e:
            raise DHError(e, "failed to add a historical table.") from e
Example #2
def query_update_performance(eval_number: int) -> Table:
    """  Takes in a query evaluation number and returns a view for that query's update performance data.

    You can obtain query evaluation numbers, which uniquely identify a query and its subqueries, via the performance
    data tables obtained from calling query_performance_log() or query_operation_performance_log()

    Args:
        eval_number (int): the evaluation number

    Returns:
        a Table of query update performance data

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JPerformanceQueries.queryUpdatePerformance(eval_number))
    except Exception as e:
        raise DHError(e, "failed to obtain the query update performance data.") from e
Example #3
def merge_sorted(tables: List[Table], order_by: str) -> Table:
    """Combines two or more tables into one sorted, aggregate table. This essentially stacks the tables one on top
    of the other and sorts the result. Null tables are ignored. merge_sorted is more efficient than using merge
    followed by sort.

    Args:
        tables (List[Table]): the source tables
        order_by (str): the name of the key column

    Returns:
         a Table

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JTableTools.mergeSorted(order_by, *[t.j_table for t in tables]))
    except Exception as e:
        raise DHError(e, "merge sorted operation failed.") from e
Example #4
def query_update_performance_map(eval_number: int) -> Dict[str, Table]:
    """ Creates multiple tables with performance data for a given query identified by an evaluation number. The tables
     are returned in a map with the following String keys: 'QueryUpdatePerformance', 'UpdateWorst', 'WorstInterval',
     'UpdateMostRecent', 'UpdateAggregate', 'UpdateSummaryStats'.

    Args:
        eval_number (int): the evaluation number

    Returns:
        a dict mapping the String keys listed above to their corresponding performance Tables

    Raises:
        DHError
    """

    try:
        d = j_map_to_dict(_JPerformanceQueries.queryUpdatePerformanceMap(eval_number))
        for k in d.keys():
            d[k] = Table(j_table=d[k])
        return d
    except Exception as e:
        raise DHError(e, "failed to obtain the query update perf map.") from e
Example #5
def consume(
    kafka_config: Dict,
    cdc_spec: CDCSpec,
    partitions: List[int] = None,
    stream_table: bool = False,
    cols_to_drop: List[str] = None,
) -> Table:
    """ Consume from a Change Data Capture (CDC) Kafka stream (as, eg, produced by Debezium), tracking the underlying
    database table to a Deephaven table.

    Args:
        kafka_config (Dict): configuration for the associated kafka consumer and also the resulting table. Passed
            to the org.apache.kafka.clients.consumer.KafkaConsumer constructor; pass any KafkaConsumer specific desired
            configuration here. Note this should include the relevant property for a schema server URL where the key
            and/or value Avro necessary schemas are stored.
        cdc_spec (CDCSpec): a CDCSpec obtained from calling either the cdc_long_spec or the cdc_short_spec function
        partitions (List[int]: a list of integer partition numbers, default is None indicating all partitions
        stream_table (bool):  if true, produce a streaming table of changed rows keeping the CDC 'op' column
            indicating the type of column change; if false, return a Deephaven ticking table that tracks the underlying
            database table through the CDC Stream.
        cols_to_drop (list[str]): a list of column names to omit from the resulting DHC table. Note that only columns
            not included in the primary key for the table can be dropped at this stage; you can chain a drop column
            operation after this call if you need to do this.

    Returns:
        a Deephaven live table that will update based on the CDC messages consumed for the given topic

    Raises:
        DHError
    """
    try:
        partitions = j_partitions(partitions)
        kafka_config = j_properties(kafka_config)
        return Table(j_table=_JCdcTools.consumeToTable(
            kafka_config, cdc_spec.j_object, partitions, stream_table,
            cols_to_drop))
    except Exception as e:
        raise DHError(e, "failed to consume a CDC stream.") from e
Example #6
def query_operation_performance(eval_number: int) -> Table:
    """ Takes in a query evaluation number and returns a view for that query's individual operation's performance data.

    You can obtain query evaluation numbers, which uniquely identify a query and its subqueries, via the performance
    data tables obtained from calling query_performance_log() or query_operation_performance_log()

    The query operation performance log contains data on how long each individual operation of a query (where(),
    update(), naturalJoin(), etc., as well as internal functions) takes to execute, and the change in resource
    consumption while each was executing.

    Args:
        eval_number (int): the evaluation number

    Returns:
        a table of query operation performance data

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JPerformanceQueries.queryOperationPerformance(eval_number))
    except Exception as e:
        raise DHError(e, "failed to obtain the query operation performance data.") from e
Example #7
def time_window(table: Table, ts_col: str, window: int, bool_col: str) -> Table:
    """Creates a new table by applying a time window to the source table and adding a new Boolean column.

    The value of the new Boolean column is set to false when the timestamp column value is older than the window from
    now or true otherwise. If the timestamp column value is null, the Boolean column value will be null as well. The
    result table ticks whenever the source table ticks, or modifies a row when it passes out of the window.

    Args:
        table (Table): the source table
        ts_col (str): the timestamp column name
        window (int): the size of the window in nanoseconds
        bool_col (str): the name of the new Boolean column.

    Returns:
        a new Table

    Raises:
        DHError
    """
    try:
        return Table(j_table=_JWindowCheck.addTimeWindow(table.j_table, ts_col, window, bool_col))
    except Exception as e:
        raise DHError(e, "failed to create a time window table.") from e
Example #8
def consume(
        kafka_config: Dict,
        topic: str,
        partitions: List[int] = None,
        offsets: Dict[int, int] = None,
        key_spec: KeyValueSpec = None,
        value_spec: KeyValueSpec = None,
        table_type: TableType = TableType.stream(),
) -> Table:
    """Consume from Kafka to a Deephaven table.

    Args:
        kafka_config (Dict): configuration for the associated Kafka consumer and also the resulting table.
            Once the table-specific properties are stripped, the remaining ones are used to call the constructor of
            org.apache.kafka.clients.consumer.KafkaConsumer; pass any KafkaConsumer specific desired configuration here
        topic (str): the Kafka topic name
        partitions (List[int]): a list of integer partition numbers, default is None which means all partitions
        offsets (Dict[int, int]): a mapping between partition numbers and offset numbers, and can be one of the
            predefined ALL_PARTITIONS_SEEK_TO_BEGINNING, ALL_PARTITIONS_SEEK_TO_END or ALL_PARTITIONS_DONT_SEEK.
            The default is None which works the same as ALL_PARTITIONS_DONT_SEEK. The offset numbers may be one
            of the predefined SEEK_TO_BEGINNING, SEEK_TO_END, or DONT_SEEK.
        key_spec (KeyValueSpec): specifies how to map the Key field in Kafka messages to Deephaven column(s).
            It can be the result of calling one of the functions simple_spec(), avro_spec() or json_spec() in this
            module, or the predefined KeyValueSpec.IGNORE or KeyValueSpec.FROM_PROPERTIES. The default is None which
            works the same as KeyValueSpec.FROM_PROPERTIES, in which case the kafka_config param should include values
            for dictionary keys 'deephaven.key.column.name' and 'deephaven.key.column.type', for the single resulting
            column name and type
        value_spec (KeyValueSpec): specifies how to map the Value field in Kafka messages to Deephaven column(s).
            It can be the result of calling one of the functions simple_spec(), avro_spec() or json_spec() in this
            module, or the predefined KeyValueSpec.IGNORE or KeyValueSpec.FROM_PROPERTIES. The default is None which
            works the same as KeyValueSpec.FROM_PROPERTIES, in which case the kafka_config param should include values
            for dictionary keys 'deephaven.value.column.name' and 'deephaven.value.column.type', for the single
            resulting column name and type
        table_type (TableType): a TableType enum, default is TableType.stream()

    Returns:
        a Deephaven live table that will update based on Kafka messages consumed for the given topic

    Raises:
        DHError
    """

    try:
        partitions = j_partitions(partitions)

        if offsets is None or offsets == ALL_PARTITIONS_DONT_SEEK:
            offsets = _ALL_PARTITIONS_DONT_SEEK
        elif offsets == ALL_PARTITIONS_SEEK_TO_BEGINNING:
            offsets = _ALL_PARTITIONS_SEEK_TO_BEGINNING
        elif offsets == ALL_PARTITIONS_SEEK_TO_END:
            offsets = _ALL_PARTITIONS_SEEK_TO_END
        else:
            partitions_array = jpy.array("int", list(offsets.keys()))
            offsets_array = jpy.array("long", list(offsets.values()))
            offsets = _JKafkaTools.partitionToOffsetFromParallelArrays(
                partitions_array, offsets_array)

        key_spec = KeyValueSpec.FROM_PROPERTIES if key_spec is None else key_spec
        value_spec = KeyValueSpec.FROM_PROPERTIES if value_spec is None else value_spec

        if key_spec is KeyValueSpec.IGNORE and value_spec is KeyValueSpec.IGNORE:
            raise ValueError(
                "at least one argument for 'key' or 'value' must be different from KeyValueSpec.IGNORE"
            )

        kafka_config = j_properties(kafka_config)
        return Table(j_table=_JKafkaTools.consumeToTable(
            kafka_config,
            topic,
            partitions,
            offsets,
            key_spec.j_object,
            value_spec.j_object,
            table_type.j_object,
        ))
    except Exception as e:
        raise DHError(e, "failed to consume a Kafka stream.") from e
Example #9
def read(
    path: str,
    header: Dict[str, dht.DType] = None,
    headless: bool = False,
    skip_rows: int = 0,
    num_rows: int = MAX_LONG,
    ignore_empty_lines: bool = False,
    allow_missing_columns: bool = False,
    ignore_excess_columns: bool = False,
    delimiter: str = ",",
    quote: str = '"',
    ignore_surrounding_spaces: bool = True,
    trim: bool = False,
) -> Table:
    """Read the CSV data specified by the path parameter as a table.

    Args:
        path (str): a file path or a URL string
        header (Dict[str, DType]): a dict defining the table columns, with the key being the column name and the
            value being the data type
        headless (bool): indicates whether the CSV data is headless, default is False
        skip_rows (int): number of data rows to skip before processing data. This is useful when you want to parse
            data in chunks. Defaults to 0
        num_rows (int): max number of rows to process. This is useful when you want to parse data in chunks. Defaults
            to the maximum 64-bit integer value (MAX_LONG)
        ignore_empty_lines (bool): indicates whether to ignore empty lines in the input, default is False
        allow_missing_columns (bool): whether to allow missing columns in the input. If this flag is set, then rows
            that are too short (that have fewer columns than the header row) are interpreted as if the missing
            columns contained the empty string. Defaults to False
        ignore_excess_columns (bool): whether to allow excess columns in the input. If this flag is set, then rows
            that are too long (that have more columns than the header row) have those excess columns dropped.
            Defaults to False
        delimiter (str): the delimiter used by the CSV, default is the comma
        quote (str): the quote character for the CSV, default is double quote
        ignore_surrounding_spaces (bool): indicates whether surrounding white space should be ignored for unquoted
            text fields, default is True
        trim (bool): indicates whether to trim white space inside a quoted string, default is False

    Returns:
        a table

    Raises:
        DHError
    """
    try:
        csv_specs_builder = _JCsvTools.builder()

        if header:
            csv_specs_builder.headers(_JArrays.asList(list(header.keys())))
            parser_map = {
                dht.bool_: _JParsers.BOOLEAN,
                dht.byte: _JParsers.BYTE,
                dht.char: _JParsers.CHAR,
                dht.short: _JParsers.SHORT,
                dht.int_: _JParsers.INT,
                dht.long: _JParsers.LONG,
                dht.float_: _JParsers.FLOAT_FAST,
                dht.double: _JParsers.DOUBLE,
                dht.string: _JParsers.STRING,
                dht.DateTime: _JParsers.DATETIME,
            }
            for column_name, column_type in header.items():
                csv_specs_builder.putParserForName(column_name,
                                                   parser_map[column_type])

        csv_specs = (csv_specs_builder
                     .hasHeaderRow(not headless)
                     .skipRows(skip_rows)
                     .numRows(num_rows)
                     .ignoreEmptyLines(ignore_empty_lines)
                     .allowMissingColumns(allow_missing_columns)
                     .ignoreExcessColumns(ignore_excess_columns)
                     .delimiter(ord(delimiter))
                     .quote(ord(quote))
                     .ignoreSurroundingSpaces(ignore_surrounding_spaces)
                     .trim(trim)
                     .build())

        j_table = _JCsvTools.readCsv(path, csv_specs)

        return Table(j_table=j_table)
    except Exception as e:
        raise DHError(e, "read csv failed") from e