class Update(DML):
    """Measure the time it takes for an UPDATE statement to return to client"""

    # Seed helper table `ten`; a 6-way cross join over it populates t1 with
    # 10^6 distinct values (0 .. 999999).
    INIT = Td(
        """
> CREATE TABLE ten (f1 INTEGER);
> INSERT INTO ten VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);

> CREATE TABLE t1 (f1 BIGINT);
> INSERT INTO t1 SELECT a1.f1 + (a2.f1 * 10) + (a3.f1 * 100) + (a4.f1 * 1000) + (a5.f1 * 10000) + (a6.f1 * 100000) FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4, ten AS a5, ten AS a6;
"""
    )

    # Timed span: point A (trivial SELECT baseline) to point B (UPDATE that
    # touches every row returning to the client).
    BENCHMARK = Td(
        """
> /* A */ SELECT 1
1

> /* B */ UPDATE t1 SET f1 = f1 + 10000000
"""
    )
class FastPathFilterNoIndex(FastPath):
    """Measure the time it takes for the fast path to filter out all rows
    from a materialized view and return"""

    # v1 is a 7-way cross join: 10^7 rows, with f2 = 1 for every row so that
    # the benchmark predicate below matches nothing.
    INIT = Td(
        """
> CREATE TABLE ten (f1 INTEGER);
> INSERT INTO ten VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);

> CREATE MATERIALIZED VIEW v1 (f1, f2) AS SELECT a1.f1 + (a2.f1 * 10) + (a3.f1 * 100) + (a4.f1 * 1000) + (a5.f1 * 10000) + (a6.f1 * 100000) + (a7.f1 * 1000000) AS f1, 1 AS f2 FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4, ten AS a5, ten AS a6, ten AS a7;

> SELECT COUNT(*) = 10000000 FROM v1;
true
"""
    )

    # f2 < 0 is false for all rows, so B completes when the empty result set
    # is returned.
    BENCHMARK = Td(
        """
> /* A */ SELECT 1;
1

> /* B */ SELECT * FROM v1 WHERE f2 < 0;
"""
    )
class InsertAndSelect(DML):
    """Measure the time it takes for an INSERT statement to return
    AND for a follow-up SELECT to return data, that is, for the
    dataflow to be completely caught up.
    """

    INIT = Td(
        """
> CREATE TABLE ten (f1 INTEGER);
> INSERT INTO ten VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
"""
    )

    # t1 is recreated on each iteration; B only returns once the inserted
    # data is visible to a SELECT.
    BENCHMARK = Td(
        """
> DROP TABLE IF EXISTS t1;

> /* A */ CREATE TABLE t1 (f1 INTEGER);

> INSERT INTO t1 SELECT a1.f1 + (a2.f1 * 10) + (a3.f1 * 100) + (a4.f1 * 1000) + (a5.f1 * 10000) + (a6.f1 * 100000) FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4, ten AS a5, ten AS a6;

> /* B */ SELECT 1 FROM t1 WHERE f1 = 1;
1
"""
    )
class CountDistinct(Dataflow):
    """Measure the time to COUNT(DISTINCT ...) over a large materialized view."""

    # f1 only takes 100 distinct values (each repeated 10^5 times), while
    # `unique` is distinct per row; the commented-out a8 term would grow the
    # dataset by another 10x.
    INIT = Td(
        """
> CREATE VIEW ten (f1) AS (VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9));

> CREATE MATERIALIZED VIEW v1 AS SELECT a1.f1 + (a2.f1 * 10) AS f1, a1.f1 + (a2.f1 * 10) + (a3.f1 * 100) + (a4.f1 * 1000) + (a5.f1 * 10000) + (a6.f1 * 100000) + (a7.f1 * 1000000) /* + (a8.f1 * 10000000) */ AS unique FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4, ten AS a5, ten AS a6, ten AS a7;

> SELECT COUNT(*) = 10000000 FROM v1;
true
"""
    )

    BENCHMARK = Td(
        """
> /* A */ SELECT 1
1

> /* B */ SELECT COUNT(DISTINCT f1) AS f1 FROM v1;
100
"""
    )
class GroupBy(Dataflow):
    """Measure the time for a GROUP BY aggregation over 10^6 rows."""

    # f1 and f2 are identical per row, so grouping by f2 yields 10^6 groups
    # of one row each.
    INIT = Td(
        """
> CREATE VIEW ten (f1) AS (VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9));

> CREATE MATERIALIZED VIEW v1 AS SELECT a1.f1 + (a2.f1 * 10) + (a3.f1 * 100) + (a4.f1 * 1000) + (a5.f1 * 10000) + (a6.f1 * 100000) AS f1, a1.f1 + (a2.f1 * 10) + (a3.f1 * 100) + (a4.f1 * 1000) + (a5.f1 * 10000) + (a6.f1 * 100000) AS f2 FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4, ten AS a5, ten AS a6;

> SELECT COUNT(*) = 1000000 FROM v1
true
"""
    )

    BENCHMARK = Td(
        """
> /* A */ SELECT 1
1

> /* B */ SELECT COUNT(*), MIN(f1_min), MAX(f1_max) FROM (SELECT f2, MIN(f1) AS f1_min, MAX(f1) AS f1_max FROM v1 GROUP BY f2);
1000000 0 999999
"""
    )
class MinMax(Dataflow):
    """Measure the time for MIN/MAX aggregations over 10^6 rows."""

    INIT = Td(
        """
> CREATE VIEW ten (f1) AS (VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9));

> CREATE MATERIALIZED VIEW v1 AS SELECT a1.f1 + (a2.f1 * 10) + (a3.f1 * 100) + (a4.f1 * 1000) + (a5.f1 * 10000) + (a6.f1 * 100000) AS f1 FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4, ten AS a5, ten AS a6;

> SELECT COUNT(*) = 1000000 FROM v1;
true
"""
    )

    BENCHMARK = Td(
        """
> /* A */ SELECT 1
1

> /* B */ SELECT MIN(f1), MAX(f1) AS f1 FROM v1;
0 999999
"""
    )
class DifferentialJoin(Dataflow):
    """Measure the time for an equi (differential) self-join over 10^6 rows."""

    INIT = Td(
        """
> CREATE VIEW ten (f1) AS (VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9));

> CREATE MATERIALIZED VIEW v1 AS SELECT a1.f1 + (a2.f1 * 10) + (a3.f1 * 100) + (a4.f1 * 1000) + (a5.f1 * 10000) + (a6.f1 * 100000) AS f1, a1.f1 + (a2.f1 * 10) + (a3.f1 * 100) + (a4.f1 * 1000) + (a5.f1 * 10000) + (a6.f1 * 100000) AS f2 FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4, ten AS a5, ten AS a6;
"""
    )

    # f1 is unique per row, so the self-join produces exactly one match per
    # row: 10^6 output rows.
    BENCHMARK = Td(
        """
> /* A */ SELECT 1;
1

> /* B */ SELECT COUNT(*) FROM v1 AS a1 JOIN v1 AS a2 USING (f1);
1000000
"""
    )
class CrossJoin(Dataflow):
    """Measure the time to build a dataflow that cross-joins `ten` six ways
    (10^6 output rows) and fully hydrate it."""

    INIT = Td(
        """
> CREATE VIEW ten (f1) AS (VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9));
"""
    )

    # The view is recreated on each iteration; B completes once the dataflow
    # has produced all 10^6 rows.
    BENCHMARK = Td(
        """
> DROP VIEW IF EXISTS v1;

> /* A */ CREATE MATERIALIZED VIEW v1 AS SELECT a1.f1 + (a2.f1 * 10) + (a3.f1 * 100) + (a4.f1 * 1000) + (a5.f1 * 10000) + (a6.f1 * 100000) FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4, ten AS a5, ten AS a6

> /* B */ SELECT COUNT(*) = 1000000 AS f1 FROM v1;
true
"""
    )
class FinishOrderByLimit(Finish):
    """Benchmark ORDER BY + LIMIT without the benefit of an index"""

    INIT = Td(
        """
> CREATE VIEW ten (f1) AS (VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9));

> CREATE MATERIALIZED VIEW v1 AS SELECT a1.f1 + (a2.f1 * 10) + (a3.f1 * 100) + (a4.f1 * 1000) + (a5.f1 * 10000) + (a6.f1 * 100000) AS f1, a1.f1 + (a2.f1 * 10) + (a3.f1 * 100) + (a4.f1 * 1000) + (a5.f1 * 10000) + (a6.f1 * 100000) AS f2 FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4, ten AS a5, ten AS a6;

> SELECT COUNT(*) = 1000000 FROM v1;
true
"""
    )

    # Ordering on f2 (no index available) exercises the coordinator's Finish
    # stage; the top value of 0..999999 is 999999.
    BENCHMARK = Td(
        """
> /* A */ SELECT 1
1

> /* B */ SELECT f2 FROM v1 ORDER BY 1 DESC LIMIT 1;
999999
"""
    )
class KafkaUpsertUnique(KafkaScenario):
    """Measure the time to ingest 10^6 records with unique keys through a
    Kafka source using ENVELOPE UPSERT."""

    # One-time setup: ingest 10^6 Avro key/value records (one per key) into a
    # 16-partition topic.
    SHARED = Td(
        """
$ set keyschema={"type": "record", "name": "Key", "fields": [ {"name": "f1", "type": "long"} ] }

$ set schema={"type" : "record", "name" : "test", "fields": [ {"name": "f2", "type": "long"} ] }

$ kafka-create-topic topic=upsert-unique partitions=16

$ kafka-ingest format=avro topic=upsert-unique key-format=avro key-schema=${keyschema} schema=${schema} publish=true repeat=1000000
{"f1": ${kafka-ingest.iteration}} {"f2": ${kafka-ingest.iteration}}
"""
    )

    # The source is recreated each iteration; B completes when all records
    # have been ingested and are visible.
    BENCHMARK = Td(
        """
> DROP SOURCE IF EXISTS s1;

> /* A */ CREATE MATERIALIZED SOURCE s1 FROM KAFKA BROKER '${testdrive.kafka-addr}' TOPIC 'testdrive-upsert-unique-${testdrive.seed}' FORMAT AVRO USING CONFLUENT SCHEMA REGISTRY '${testdrive.schema-registry-url}' ENVELOPE UPSERT;

> /* B */ SELECT COUNT(*) FROM s1;
1000000
"""
    )
class FastPathOrderByLimit(FastPath):
    """Benchmark the case SELECT * FROM materialized_view ORDER BY <key> LIMIT <i>"""

    INIT = Td(
        """
> CREATE TABLE ten (f1 INTEGER);
> INSERT INTO ten VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);

> CREATE MATERIALIZED VIEW v1 AS SELECT a1.f1 + (a2.f1 * 10) + (a3.f1 * 100) + (a4.f1 * 1000) + (a5.f1 * 10000) + (a6.f1 * 100000) AS f1 FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4, ten AS a5, ten AS a6;

> SELECT COUNT(*) = 1000000 FROM v1;
true
"""
    )

    # Expected output: the top 1000 values in descending order, i.e.
    # 999999 down to 999000; the generated range below lists them ascending,
    # which is how testdrive compares unordered result sets.
    BENCHMARK = Td(
        """
> /* A */ SELECT 1;
1

> /* B */ SELECT f1 FROM v1 ORDER BY f1 DESC LIMIT 1000;
"""
        + "\n".join([str(x) for x in range(999000, 1000000)])
    )
class Insert(DML):
    """Measure the time it takes for an INSERT statement to return."""

    INIT = Td(
        """
> CREATE TABLE ten (f1 INTEGER);
> INSERT INTO ten VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
"""
    )

    # Unlike InsertAndSelect, only the INSERT itself (point B) is timed; no
    # follow-up read is performed.
    BENCHMARK = Td(
        """
> DROP TABLE IF EXISTS t1;

> /* A */ CREATE TABLE t1 (f1 INTEGER);

> /* B */ INSERT INTO t1 SELECT a1.f1 + (a2.f1 * 10) + (a3.f1 * 100) + (a4.f1 * 1000) + (a5.f1 * 10000) + (a6.f1 * 100000) FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4, ten AS a5, ten AS a6;
"""
    )
class OrderBy(Dataflow):
    """Benchmark ORDER BY as executed by the dataflow layer,
    in contrast with an ORDER BY executed using a Finish step in the coordinator"""

    INIT = Td(
        """
> CREATE TABLE ten (f1 INTEGER);

> CREATE MATERIALIZED VIEW v1 AS SELECT a1.f1 + (a2.f1 * 10) + (a3.f1 * 100) + (a4.f1 * 1000) + (a5.f1 * 10000) + (a6.f1 * 100000) AS f1 FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4, ten AS a5, ten AS a6;

# Just to spice things up a bit, we perform individual
# inserts here so that the rows are assigned separate timestamps
> INSERT INTO ten VALUES (0);
> INSERT INTO ten VALUES (1);
> INSERT INTO ten VALUES (2);
> INSERT INTO ten VALUES (3);
> INSERT INTO ten VALUES (4);
> INSERT INTO ten VALUES (5);
> INSERT INTO ten VALUES (6);
> INSERT INTO ten VALUES (7);
> INSERT INTO ten VALUES (8);
> INSERT INTO ten VALUES (9);

> SELECT COUNT(*) = 1000000 FROM v1;
true
"""
    )

    BENCHMARK = Td(
        """
> DROP VIEW IF EXISTS v2 /* A */

# explicit LIMIT is needed for the ORDER BY to not be optimized away
> CREATE MATERIALIZED VIEW v2 AS SELECT * FROM v1 ORDER BY f1 LIMIT 999999999999

> SELECT COUNT(*) FROM v2 /* B */
1000000
"""
    )
class FastPathFilterIndex(FastPath):
    """Measure the time it takes for the fast path to filter out all rows
    from a materialized view using an index and return"""

    # Note that the view expression only involves a1..a5 but the FROM clause
    # cross-joins six tables, so each f1 value appears 10 times.
    INIT = Td(
        """
> CREATE TABLE ten (f1 INTEGER);
> INSERT INTO ten VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);

> CREATE MATERIALIZED VIEW v1 AS SELECT a1.f1 + (a2.f1 * 10) + (a3.f1 * 100) + (a4.f1 * 1000) + (a5.f1 * 10000) AS f1 FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4, ten AS a5, ten AS a6;

> SELECT COUNT(*) = 1000000 FROM v1;
true
"""
    )

    # Since an individual query of this particular type being benchmarked takes 1ms to execute, the results are susceptible
    # to a lot of random noise. As we can not make the query any slower by using e.g. a large dataset,
    # we run the query 100 times in a row and measure the total execution time.
    BENCHMARK = Td(
        """
> BEGIN

> /* A */ SELECT 1;
1
"""
        + "\n".join(
            [
                """
> SELECT * FROM v1 WHERE f1 = 1;
1
1
1
1
1
1
1
1
1
1
"""
                for i in range(0, 100)
            ]
        )
        + """
> /* B */ SELECT 1;
1
"""
    )
class KafkaRecovery(KafkaScenario):
    """Measure the time it takes to re-hydrate a Kafka UPSERT source of 10^7
    records after a Materialize restart."""

    # One-time setup: ingest 10^7 Avro key/value records into an 8-partition
    # topic.
    SHARED = Td(
        """
$ set keyschema={ "type": "record", "name": "Key", "fields": [ {"name": "f1", "type": "long"} ] }

$ set schema={ "type" : "record", "name" : "test", "fields" : [ {"name":"f2", "type":"long"} ] }

$ kafka-create-topic topic=kafka-recovery partitions=8

$ kafka-ingest format=avro topic=kafka-recovery key-format=avro key-schema=${keyschema} schema=${schema} publish=true repeat=10000000
{"f1": ${kafka-ingest.iteration}} {"f2": ${kafka-ingest.iteration}}
"""
    )

    INIT = Td(
        """
> CREATE MATERIALIZED SOURCE s1 FROM KAFKA BROKER '${testdrive.kafka-addr}' TOPIC 'testdrive-kafka-recovery-${testdrive.seed}' FORMAT AVRO USING CONFLUENT SCHEMA REGISTRY '${testdrive.schema-registry-url}' ENVELOPE UPSERT;

# Make sure we are fully caught up before continuing
> SELECT COUNT(*) = 10000000 FROM s1;
true
"""
    )

    # Restart Materialize before each measurement so the benchmark captures
    # recovery (re-ingestion) time rather than steady-state reads.
    BEFORE = Lambda(lambda e: e.RestartMz())

    BENCHMARK = Td(
        """
> /* A */ SELECT 1;
1

> /* B */ SELECT COUNT(*) = 10000000 FROM s1;
true
"""
    )
class Retraction(Dataflow):
    """Benchmark the time it takes to process a very large retraction"""

    # Setup and measurement happen within the same BENCHMARK script: the
    # DELETE retracts all 10 base rows, which retracts all 10^6 derived rows
    # in v1; B completes once the view is observed empty.
    BENCHMARK = Td(
        """
> DROP VIEW IF EXISTS v1;

> DROP TABLE IF EXISTS ten;

> CREATE TABLE ten (f1 INTEGER);
> INSERT INTO ten VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);

> CREATE MATERIALIZED VIEW v1 AS SELECT a1.f1 + (a2.f1 * 10) + (a3.f1 * 100) + (a4.f1 * 1000) + (a5.f1 * 10000) + (a6.f1 * 100000) FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4, ten AS a5, ten AS a6

> SELECT COUNT(*) = 1000000 AS f1 FROM v1;
true

> /* A */ SELECT 1;
1

> DELETE FROM ten;

> /* B */ SELECT COUNT(*) FROM v1;
0
"""
    )
def benchmark(self) -> MeasurementSource:
    """Recreate the sink and wait for every record to appear in its topic.

    Point A fires after the previous check source is dropped; point B fires
    once the sink1_check counter view reports self.n() records.
    """
    expected_total = str(self.n())
    script = """
> DROP SINK IF EXISTS sink1;

> DROP SOURCE IF EXISTS sink1_check CASCADE;
  /* A */

> CREATE SINK sink1 FROM source1 INTO KAFKA BROKER '${testdrive.kafka-addr}' TOPIC 'testdrive-sink-output-${testdrive.seed}' KEY (f1) WITH (reuse_topic=true) FORMAT AVRO USING CONFLUENT SCHEMA REGISTRY '${testdrive.schema-registry-url}'

# Wait until all the records have been emited from the sink, as observed by the sink1_check source

> CREATE SOURCE sink1_check FROM KAFKA BROKER '${testdrive.kafka-addr}' TOPIC 'testdrive-sink-output-${testdrive.seed}' KEY FORMAT AVRO USING CONFLUENT SCHEMA REGISTRY '${testdrive.schema-registry-url}' VALUE FORMAT AVRO USING CONFLUENT SCHEMA REGISTRY '${testdrive.schema-registry-url}' ENVELOPE UPSERT;

> CREATE MATERIALIZED VIEW sink1_check_v AS SELECT COUNT(*) FROM sink1_check;

> SELECT * FROM sink1_check_v /* B */
"""
    return Td(script + expected_total)
def benchmark(self) -> MeasurementSource:
    """Recreate the Kafka connections and source, then wait until the broker
    statistics show all self.n() messages consumed (point B)."""
    expected_msgs = self.n()
    return Td(
        f"""
> SELECT COUNT(*) = 0 FROM mz_kafka_source_statistics WHERE CAST(statistics->'topics'->'testdrive-kafka-raw-${{testdrive.seed}}'->'partitions'->'0'->'msgs' AS INT) > 0
true

> DROP CONNECTION IF EXISTS s1_kafka_conn CASCADE

> DROP CONNECTION IF EXISTS s1_csr_conn CASCADE

> CREATE CONNECTION s1_kafka_conn FOR KAFKA BROKER '${{testdrive.kafka-addr}}'

> CREATE CONNECTION IF NOT EXISTS s1_csr_conn FOR CONFLUENT SCHEMA REGISTRY URL '${{testdrive.schema-registry-url}}';

> CREATE SOURCE s1 FROM KAFKA CONNECTION s1_kafka_conn TOPIC 'testdrive-kafka-raw-${{testdrive.seed}}' FORMAT AVRO USING CONFLUENT SCHEMA REGISTRY CONNECTION s1_csr_conn ENVELOPE NONE /* A */

> SELECT SUM(CAST(statistics->'topics'->'testdrive-kafka-raw-${{testdrive.seed}}'->'partitions'->'0'->'msgs' AS INT)) = {expected_msgs} /* B */ FROM mz_kafka_source_statistics;
true
"""
    )
def benchmark(self) -> BenchmarkingSequence:
    """Rebuild a FULL OUTER self-join over a scale-dependent column list and
    measure from point A (before v2 exists) to point B (v2 fully updated
    after ten individual inserts)."""
    # Number of join columns grows with the scenario's scale factor.
    width = floor(self.scale())
    select_list = ", ".join(f"a{col + 1}.f1 AS f{col + 1}" for col in range(width))
    using_list = ", ".join(f"f{col + 1}" for col in range(width))
    # Individual INSERTs (values 1..10) so each gets its own timestamp.
    insert_block = "\n".join(
        f"> INSERT INTO ten VALUES ({row})" for row in range(1, 11)
    )
    return [
        Td(
            f"""
> DROP MATERIALIZED VIEW IF EXISTS v2 CASCADE;

> DROP MATERIALIZED VIEW IF EXISTS v1 CASCADE;

> DROP TABLE IF EXISTS ten;

> CREATE TABLE ten (f1 INTEGER);

> CREATE MATERIALIZED VIEW v1 AS SELECT {select_list} FROM {self.join()}

> SELECT 1; /* A */
1

> CREATE MATERIALIZED VIEW v2 AS SELECT COUNT(a1.f1) AS c1, COUNT(a2.f1) AS c2 FROM v1 AS a1 FULL OUTER JOIN v1 AS a2 USING ({using_list});

{insert_block}

> SELECT * FROM v2; /* B */
{self.n()} {self.n()}
"""
        )
    ]
def benchmark(self) -> MeasurementSource:
    """Recreate the Kafka connection and sink, then wait (point B) until the
    sink1_check counter view observes self.n() emitted records."""
    expected_total = str(self.n())
    script = """
> DROP CONNECTION IF EXISTS s1_kafka_conn CASCADE /* A */

> CREATE CONNECTION s1_kafka_conn FOR KAFKA BROKER '${testdrive.kafka-addr}'

> CREATE SINK sink1 FROM source1 INTO KAFKA CONNECTION s1_kafka_conn TOPIC 'testdrive-sink-output-${testdrive.seed}' KEY (f1) FORMAT AVRO USING CONFLUENT SCHEMA REGISTRY CONNECTION csr_conn

# Wait until all the records have been emited from the sink, as observed by the sink1_check source

> CREATE SOURCE sink1_check FROM KAFKA CONNECTION s1_kafka_conn TOPIC 'testdrive-sink-output-${testdrive.seed}' KEY FORMAT AVRO USING CONFLUENT SCHEMA REGISTRY CONNECTION csr_conn VALUE FORMAT AVRO USING CONFLUENT SCHEMA REGISTRY CONNECTION csr_conn ENVELOPE UPSERT;

> CREATE MATERIALIZED VIEW sink1_check_v AS SELECT COUNT(*) FROM sink1_check;

> SELECT * FROM sink1_check_v /* B */
"""
    return Td(script + expected_total)
def benchmark(self) -> MeasurementSource:
    """Time an ORDER BY ... DESC LIMIT 1000 query; expect the top 1000
    values, i.e. self.n()-1000 .. self.n()-1."""
    top = self.n()
    expected_rows = "\n".join(map(str, range(top - 1000, top)))
    return Td(
        """
> SELECT 1; /* A */
1

> SELECT f1 FROM v1 ORDER BY f1 DESC LIMIT 1000 /* B */
"""
        + expected_rows
    )
def benchmark(self) -> MeasurementSource:
    """Time a full-table UPDATE (point B) against a trivial SELECT baseline
    (point A)."""
    increment = self.n()
    return Td(
        f"""
> SELECT 1 /* A */
1

> UPDATE t1 SET f1 = f1 + {increment} /* B */
"""
    )
def benchmark(self) -> BenchmarkingSequence:
    """Restart Materialize, then measure until s1 reports all records (B)."""
    expected_count = self.n()
    restart = Lambda(lambda e: e.RestartMz())
    wait_for_count = Td(
        f"""
> SELECT COUNT(*) /* {expected_count} */ FROM s1; /* B */
{expected_count}
"""
    )
    return [restart, wait_for_count]
def benchmark(self) -> BenchmarkingSequence:
    """Restart Materialize, then measure until s1_is_complete returns true."""
    restart = Lambda(lambda e: e.RestartMz())
    completeness_check = Td(
        """
> SELECT * FROM s1_is_complete /* B */
true
"""
    )
    return [restart, completeness_check]
def benchmark(self) -> BenchmarkingSequence:
    """Restart Materialize and measure until it answers a trivial query (B)."""
    restart = Lambda(lambda e: e.RestartMz())
    probe = Td(
        """
> SELECT 1; /* B */
1
"""
    )
    return [restart, probe]
def benchmark(self) -> MeasurementSource:
    """Recreate t1 (point A) and time the INSERT ... SELECT that fills it
    (point B)."""
    values_expr = self.unique_values()
    source_relations = self.join()
    return Td(
        f"""
> DROP TABLE IF EXISTS t1;

> CREATE TABLE t1 (f1 INTEGER) /* A */

> INSERT INTO t1 SELECT {values_expr} FROM {source_relations} /* B */
"""
    )
def benchmark(self) -> MeasurementSource:
    """Time COUNT(DISTINCT f1) over v1 (point B); expects self.n() distinct
    values."""
    distinct_total = self.n()
    return Td(
        f"""
> SELECT 1 /* A */
1

> SELECT COUNT(DISTINCT f1) AS f1 FROM v1 /* B */
{distinct_total}
"""
    )
def benchmark(self) -> MeasurementSource:
    """Time MIN/MAX over v1 (point B); expects values 0 .. self.n()-1."""
    max_value = self.n() - 1
    return Td(
        f"""
> SELECT 1 /* A */
1

> SELECT MIN(f1), MAX(f1) AS f1 FROM v1 /* B */
0 {max_value}
"""
    )
def benchmark(self) -> MeasurementSource:
    """Time a GROUP BY aggregation over v1 (point B); expects self.n() groups
    spanning values 0 .. self.n()-1."""
    group_total = self.n()
    return Td(
        f"""
> SELECT 1 /* A */
1

> SELECT COUNT(*), MIN(f1_min), MAX(f1_max) FROM (SELECT f2, MIN(f1) AS f1_min, MAX(f1) AS f1_max FROM v1 GROUP BY f2) /* B */
{group_total} 0 {group_total - 1}
"""
    )
def benchmark(self) -> MeasurementSource:
    """Time ORDER BY ... DESC LIMIT 1 over v1 (point B); the top value is
    self.n()-1."""
    top_value = self.n() - 1
    return Td(
        f"""
> SELECT 1 /* A */
1

> SELECT f2 FROM v1 ORDER BY 1 DESC LIMIT 1 /* B */
{top_value}
"""
    )