def test_bool(self, duckdb_cursor):
    conn = duckdb.connect()
    res = conn.execute("select ?, ?, ?", (True, 42, [1, 2, 3])).fetchall()
    assert res[0][0] == True
    assert res[0][1] == 42
    assert res[0][2] == [1, 2, 3]
import duckdb

con = duckdb.connect('robust04db')  # stores db info in filename
c = con.cursor()
c.execute("CREATE TABLE dict (termid INTEGER, term VARCHAR(100), df INTEGER)")
c.execute("CREATE TABLE terms (termid INTEGER, docid INTEGER, count INTEGER)")
c.execute("CREATE TABLE docs (name VARCHAR(50), docid INTEGER, length INTEGER)")
c.close()
con.close()
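# Illustrative only: the tables above are normally bulk-loaded (see the COPY
# example further below), but individual rows could be inserted like this.
# The sample values are made up and not part of the original script.
import duckdb

con = duckdb.connect('robust04db')
c = con.cursor()
c.execute("INSERT INTO dict VALUES (1, 'car', 42)")
c.execute("INSERT INTO terms VALUES (1, 7, 3)")
c.execute("INSERT INTO docs VALUES ('LA010189-0001', 7, 188)")
c.close()
con.close()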
def test_values(self, duckdb_cursor):
    conn = duckdb.connect()
    conn.execute("create table t (a integer)")
    duckdb.values([1], conn).insert_into("t")
    assert conn.execute("select count(*) from t").fetchall()[0] == (1,)
def to_target(self, target: str):
    """
    Emit Pandas DataFrame to target. A target
    is identified by a connection string.

    Examples:

    - duckdb://dwd.duckdb?table=weather
    - influxdb://localhost/?database=dwd&table=weather
    - crate://localhost/?database=dwd&table=weather

    Dispatch data to different data sinks. Currently, SQLite, DuckDB,
    InfluxDB and CrateDB are implemented. However, through the SQLAlchemy
    layer, it should actually work with any supported SQL database.

    - https://docs.sqlalchemy.org/en/13/dialects/

    :param target: Target connection string.
    :return: self
    """
    log.info(f"Exporting records to {target}\n{self.df.count()}")

    connspec = ConnectionString(target)
    protocol = connspec.url.scheme
    database = connspec.get_database()
    tablename = connspec.get_table()

    if target.startswith("file://"):
        filepath = connspec.get_path()

        if target.endswith(".xlsx"):
            log.info(f"Writing to spreadsheet file '{filepath}'")

            # Convert all datetime columns to ISO format.
            df = convert_datetimes(self.df)
            df.to_excel(filepath, index=False)

        elif target.endswith(".feather"):
            # https://arrow.apache.org/docs/python/feather.html
            log.info(f"Writing to Feather file '{filepath}'")
            import pyarrow.feather as feather

            feather.write_feather(self.df, filepath, compression="lz4")

        elif target.endswith(".parquet"):
            """
            # Acquire data and store to Parquet file.
            alias fetch="wetterdienst dwd observations values --station=1048,4411 --parameter=kl --resolution=daily --period=recent"
            fetch --target="file://observations.parquet"

            # Check Parquet file.
            parquet-tools schema observations.parquet
            parquet-tools head observations.parquet

            # References
            - https://arrow.apache.org/docs/python/parquet.html
            """
            log.info(f"Writing to Parquet file '{filepath}'")
            import pyarrow as pa
            import pyarrow.parquet as pq

            table = pa.Table.from_pandas(self.df)
            pq.write_table(table, filepath)

        elif target.endswith(".zarr"):
            """
            # Acquire data and store to Zarr group.
            alias fetch="wetterdienst dwd observations values --station=1048,4411 --parameter=kl --resolution=daily --period=recent"
            fetch --target="file://observations.zarr"

            # References
            - https://xarray.pydata.org/en/stable/generated/xarray.Dataset.from_dataframe.html
            - https://xarray.pydata.org/en/stable/generated/xarray.Dataset.to_zarr.html
            """
            log.info(f"Writing to Zarr group '{filepath}'")
            import xarray

            df = self.df

            # Problem: `ValueError: Cannot setitem on a Categorical with a new category, set the categories first`.
            # Solution: Convert all categorical columns back to their designated type representations.
            # https://stackoverflow.com/questions/32011359/convert-categorical-data-in-pandas-dataframe/32011969#32011969
            if "quality" in df:
                df.quality = df.quality.astype("Int64")
            categorical_columns = df.select_dtypes(["category"]).columns
            df[categorical_columns] = df[categorical_columns].astype("str")

            # Problem: `TypeError: float() argument must be a string or a number, not 'NAType'`.
            # Solution: Fill gaps in the data.
            df = df.fillna(-999)

            # Convert pandas DataFrame to xarray Dataset.
            dataset = xarray.Dataset.from_dataframe(df)
            log.info(f"Converted to xarray Dataset. Size={dataset.sizes}")

            # Export to Zarr format.
            # TODO: Add "group" parameter.
            #       Group path. (a.k.a. `path` in zarr terminology.)
            # TODO: Also use attributes: `store.set_attribute()`
            store = dataset.to_zarr(
                filepath,
                mode="w",
                group=None,
                encoding={"date": {"dtype": "datetime64"}},
            )

            # Reporting.
            dimensions = store.get_dimensions()
            variables = list(store.get_variables().keys())

            log.info(f"Wrote Zarr file with dimensions={dimensions} and variables={variables}")
            log.info(f"Zarr Dataset Group info:\n{store.ds.info}")

        else:
            raise KeyError("Unknown export file type")

        return

    if target.startswith("duckdb://"):
        """
        ====================
        DuckDB database sink
        ====================

        Install Python driver::

            pip install duckdb

        Acquire data::

            wetterdienst dwd observations values --station=1048,4411 --parameter=kl --resolution=daily --period=recent --target="duckdb:///dwd.duckdb?table=weather"

        Example queries::

            python -c 'import duckdb; c = duckdb.connect(database="dwd.duckdb"); print(c.table("weather"))'  # noqa
            python -c 'import duckdb; c = duckdb.connect(database="dwd.duckdb"); print(c.execute("SELECT * FROM weather").df())'  # noqa

        """
        log.info(f"Writing to DuckDB. database={database}, table={tablename}")
        import duckdb

        connection = duckdb.connect(database=database, read_only=False)
        connection.register("origin", self.df)
        connection.execute(f"DROP TABLE IF EXISTS {tablename};")
        connection.execute(f"CREATE TABLE {tablename} AS SELECT * FROM origin;")  # noqa:S608

        weather_table = connection.table(tablename)
        print(weather_table)  # noqa: T001
        print("Cardinalities:")  # noqa: T001
        print(weather_table.to_df().count())  # noqa: T001
        connection.close()

        log.info("Writing to DuckDB finished")

    elif protocol.startswith("influxdb"):
        """
        ==========================
        InfluxDB 1.x database sink
        ==========================

        Install Python driver::

            pip install influxdb

        Run database::

            docker run -it --rm --publish=8086:8086 influxdb:1.8

        Acquire data::

            alias fetch="wetterdienst values --provider=dwd --network=observation --parameter=kl --resolution=daily --period=recent --station=1048,4411"
            fetch --target="influxdb://localhost/?database=dwd&table=weather"

        Example queries::

            http 'localhost:8086/query?db=dwd&q=SELECT * FROM weather;'
            http 'localhost:8086/query?db=dwd&q=SELECT COUNT(*) FROM weather;'

        ==========================
        InfluxDB 2.x database sink
        ==========================

        Install Python driver::

            pip install influxdb_client

        Run database::

            docker run -it --rm --publish=8086:8086 influxdb:2.0
            influx setup --name=default --username=root --password=12345678 --org=acme --bucket=dwd --retention=0 --force

        Acquire data::

            INFLUXDB_ORGANIZATION=acme
            INFLUXDB_TOKEN=t5PJry6TyepGsG7IY_n0K4VHp5uPvt9iap60qNHIXL4E6mW9dLmowGdNz0BDi6aK_bAbtD76Z7ddfho6luL2LA==

            alias fetch="wetterdienst values --provider=dwd --network=observation --parameter=kl --resolution=daily --period=recent --station=1048,4411"
            fetch --target="influxdb2://${INFLUXDB_ORGANIZATION}:${INFLUXDB_TOKEN}@localhost/?database=dwd&table=weather"

        Example queries::

            influx query 'from(bucket:"dwd") |> range(start:-2d) |> limit(n: 10)'
        """

        if protocol in ["influxdb", "influxdbs", "influxdb1", "influxdb1s"]:
            version = 1
        elif protocol in ["influxdb2", "influxdb2s"]:
            version = 2
        else:
            raise KeyError(f"Unknown protocol variant '{protocol}' for InfluxDB")

        log.info(f"Writing to InfluxDB version {version}. database={database}, table={tablename}")

        # 1. Mungle the data frame.
        # Use the "date" column as appropriate timestamp index.
        df = self.df.set_index(pd.DatetimeIndex(self.df["date"]))
        df = df.drop(["date"], axis=1)

        # Compute designated tag fields from some candidates.
        tag_columns = []
        tag_candidates = [
            Columns.STATION_ID.value,
            Columns.QUALITY.value,
            Columns.QUALITY_PREFIX.value,
            Columns.DATASET.value,
            Columns.PARAMETER.value,
        ]
        for tag_candidate in tag_candidates:
            tag_candidate = tag_candidate.lower()
            for column in df.columns:
                if column.startswith(tag_candidate):
                    tag_columns.append(column)

        # Setup the connection.
        if version == 1:
            from influxdb import InfluxDBClient

            client = InfluxDBClient(
                host=connspec.url.hostname,
                port=connspec.url.port or 8086,
                username=connspec.url.username,
                password=connspec.url.password,
                database=database,
                ssl=protocol.endswith("s"),
            )
            client.create_database(database)
        elif version == 2:
            from influxdb_client import InfluxDBClient, Point
            from influxdb_client.client.write_api import SYNCHRONOUS

            ssl = protocol.endswith("s")
            url = f"http{ssl and 's' or ''}://{connspec.url.hostname}:{connspec.url.port or 8086}"
            client = InfluxDBClient(url=url, org=connspec.url.username, token=connspec.url.password)
            write_api = client.write_api(write_options=SYNCHRONOUS)

        points = []
        for items in chunker(df, chunksize=50000):
            for date, record in items.iterrows():
                time = date.isoformat()
                tags = {tag: record.pop(tag) for tag in tag_columns if tag in record}
                fields = record.dropna().to_dict()
                if not fields:
                    continue

                if version == 1:
                    point = {
                        "measurement": tablename,
                        "time": time,
                        "tags": tags,
                        "fields": fields,
                    }
                elif version == 2:
                    point = Point(tablename).time(date.isoformat())
                    for tag, value in tags.items():
                        point = point.tag(tag, value)
                    for field, value in fields.items():
                        point = point.field(field, value)

                points.append(point)

        # Write to InfluxDB.
        if version == 1:
            client.write_points(
                points=points,
                batch_size=50000,
            )
        elif version == 2:
            write_api.write(bucket=database, record=points)
            write_api.close()

        log.info("Writing to InfluxDB finished")

    elif target.startswith("crate://"):
        """
        =====================
        CrateDB database sink
        =====================

        Install Python driver::

            pip install crate[sqlalchemy] crash

        Run database::

            docker run -it --rm --publish=4200:4200 --env CRATE_HEAP_SIZE=2048M crate/crate:nightly

        Acquire data::

            wetterdienst dwd observations values --station=1048,4411 --parameter=kl --resolution=daily --period=recent --target="crate://crate@localhost/dwd?table=weather"

        Example queries::

            psql postgres://crate@localhost --command 'SELECT * FROM dwd.weather;'
            crash -c 'select * from dwd.weather;'
            crash -c 'select count(*) from dwd.weather;'
            crash -c "select *, date_format('%Y-%m-%dT%H:%i:%s.%fZ', date) as datetime from dwd.weather order by datetime limit 10;"  # noqa
        """
        log.info(f"Writing to CrateDB. target={target}, table={tablename}")

        # CrateDB's SQLAlchemy driver doesn't accept `database` or `table` query parameters.
        cratedb_url = connspec.url._replace(path="", query=None)
        cratedb_target = urlunparse(cratedb_url)

        # Convert timezone-aware datetime fields to naive ones.
        # FIXME: Omit this as soon as the CrateDB driver is capable of supporting timezone-qualified timestamps.
        self.df.date = self.df.date.dt.tz_localize(None)

        self.df.to_sql(
            name=tablename,
            con=cratedb_target,
            schema=database,
            if_exists="replace",
            index=False,
            chunksize=5000,
        )
        log.info("Writing to CrateDB finished")

    else:
        """
        ================================
        Generic SQLAlchemy database sink
        ================================

        Install Python driver::

            pip install sqlalchemy

        Examples::

            # Prepare
            alias fetch='wetterdienst dwd observations values --station=1048,4411 --parameter=kl --resolution=daily --period=recent'

            # Acquire data.
            fetch --target="sqlite:///dwd.sqlite?table=weather"

            # Query data.
            sqlite3 dwd.sqlite "SELECT * FROM weather;"
        """

        # Honour SQLite's SQLITE_MAX_VARIABLE_NUMBER, which defaults to 999
        # for SQLite versions prior to 3.32.0 (2020-05-22),
        # see https://www.sqlite.org/limits.html#max_variable_number.
        chunksize = 5000
        if target.startswith("sqlite://"):
            import sqlite3

            if sqlite3.sqlite_version_info < (3, 32, 0):
                chunksize = int(999 / len(self.df.columns))

        log.info("Writing to SQL database")
        self.df.to_sql(
            name=tablename,
            con=target,
            if_exists="replace",
            index=False,
            method="multi",
            chunksize=chunksize,
        )
        log.info("Writing to SQL database finished")
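# For reference, the DuckDB branch of to_target() above boils down to the
# following self-contained sketch: register the DataFrame as a virtual table
# and materialize it. The example DataFrame columns and the file name are
# made up for illustration; the call sequence mirrors the code above.
import duckdb
import pandas as pd

df = pd.DataFrame({"station_id": ["01048"], "value": [1.5]})

con = duckdb.connect(database="dwd.duckdb", read_only=False)
con.register("origin", df)
con.execute("DROP TABLE IF EXISTS weather;")
con.execute("CREATE TABLE weather AS SELECT * FROM origin;")
print(con.table("weather").to_df().count())
con.close()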
import duckdb

# BM25-style scoring over the robust04db schema (dict, terms, docs).
# Note: the query string must actually be passed to execute(); in the
# original script it dangled after the calls and execute() was called empty.
query = """
WITH qterms AS (
    SELECT termid, docid, count FROM terms
    WHERE termid IN (10575, 1285, 191)),
subscores AS (
    SELECT docs.docid, len, term_tf.termid, tf, count,
           (log((528155-count+0.5)/(count+0.5))*((tf*(1.2+1)/
           (tf+1.2*(1-0.75+0.75*(len/188.33)))))) AS subscore
    FROM (SELECT termid, docid, count AS tf FROM qterms) AS term_tf
    JOIN (SELECT docid FROM qterms
          GROUP BY docid
          HAVING COUNT(distinct termid) = 3) AS cdocs
      ON term_tf.docid = cdocs.docid
    JOIN docs ON term_tf.docid = docs.docid
    JOIN dict ON term_tf.termid = dict.termid)
SELECT scores.docid, score
FROM (SELECT docid, sum(subscore) AS score FROM subscores GROUP BY docid) AS scores
JOIN docs ON scores.docid = docs.docid
ORDER BY score DESC;
"""

con = duckdb.connect('robust04db')
c = con.cursor()
c.execute(query)
print(c.fetchall())
c.close()
con.close()
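# Hedged alternative sketch: instead of hard-coding the term ids as literals,
# they could be bound as prepared-statement parameters. Only the inner qterms
# selection is shown; the placeholder mechanism is standard DuckDB API, the
# rest of this snippet is illustrative and not part of the original script.
import duckdb

termids = [10575, 1285, 191]
placeholders = ", ".join("?" for _ in termids)

con = duckdb.connect('robust04db')
rows = con.execute(
    f"SELECT termid, docid, count FROM terms WHERE termid IN ({placeholders})",
    termids,
).fetchall()
print(rows)
con.close()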
def test_union_operator(self, duckdb_cursor):
    conn = duckdb.connect()
    rel = get_relation(conn)
    print(rel.union(rel).execute().fetchall())
    assert rel.union(rel).execute().fetchall() == [
        (1, 'one'), (2, 'two'), (3, 'three'), (4, 'four'),
        (1, 'one'), (2, 'two'), (3, 'three'), (4, 'four')]
def test_create_view_operator(self, duckdb_cursor):
    conn = duckdb.connect()
    test_df = pd.DataFrame.from_dict({"i": [1, 2, 3, 4], "j": ["one", "two", "three", "four"]})
    rel = conn.from_df(test_df)
    rel.create_view("test_df")
    assert conn.query("select * from test_df").execute().fetchall() == [
        (1, 'one'), (2, 'two'), (3, 'three'), (4, 'four')]
def test_conn_prepared_statement_error(self, duckdb_cursor):
    conn = duckdb.connect()
    conn.execute("create table integers (a integer, b integer)")
    with pytest.raises(Exception):
        conn.execute("select * from integers where a =? and b=?", [1])
def test_register_error(self, duckdb_cursor):
    con = duckdb.connect()
    py_obj = "this is a string"
    with pytest.raises(Exception):
        con.register(py_obj, "v")
import duckdb
import numpy as np

con = duckdb.connect('robust04db_new')
c = con.cursor()
print("here1")
c.execute("DROP INDEX IF EXISTS dict_index")
print("here2")
print(c.fetchall())
c.close()
con.close()

# Disabled: single-term BM25 scoring built up via string concatenation.
# (The triple-quoted block below was left unterminated in the original; it is
# closed here so the file parses.)
'''
c.execute("SELECT avg(length) FROM docs")
avgdl = c.fetchall()[0][0]
k1 = 1.2
b = 0.75
query_term = 'car'
c.execute("SELECT COUNT(*) FROM docs")
N = c.fetchall()[0][0]
c.execute("SELECT df FROM dict WHERE term='car'")
df = c.fetchall()[0][0]
idf = np.log2((N-df+0.5)/(df+0.5))
c.execute("SELECT ("+str(idf)+"*(table2.count*"+ str(k1+1) +")/(table2.count+("+str(k1)+"*("+str(1-b) +" +("+str(b)+"*(table2.length/"+ str(avgdl) +")))))) AS score, table2.docid FROM (SELECT table1.count, table1.docid, (SELECT length FROM docs WHERE table1.docid = docs.docid) AS length FROM (SELECT docid, count FROM terms WHERE terms.termid = (SELECT termid FROM dict WHERE term = '"+ query_term +"')) AS table1) AS table2 ORDER BY score DESC LIMIT 5")
print(c.fetchall())
'''
def test_check_same_thread_false(self, duckdb_cursor):
    con = duckdb.connect(check_same_thread=False)
    x = threading.Thread(target=connect_duck, args=(con,))
    x.start()
#####################################################
# VCOC3 Lifestyle Research Team
# PORTFOLIO THEORY (Markowitz)
# Goal: asset correlation
# INPUT: start date, end date and asset tickers (portfolio)
# OUTPUT: portfolio correlation and correlation matrix
#####################################################
import pandas as pd
import numpy as np
from pandas_datareader import data as wb
from datetime import datetime
import duckdb

con = duckdb.connect('')

dt_ini = datetime(2013, 1, 1)
dt_fim = datetime(2020, 1, 21)

# SUMMARY: reduce ,'ITUB3.SA'
#tickers = ['XPML11.SA','RBRF11.SA','BBSE3.SA','TRPL4.SA','PETR4.SA','VVAR3.SA','BPAC3.SA','PSSA3.SA','ENBR3.SA','HYPE3.SA','WEGE3.SA','CPTS11B.SA','TUPY3.SA','HGLG11.SA','ITUB3.SA','MYPK3.SA','LREN3.SA','HGTX3.SA']

# current
#tickers = ['ENBR3.SA','ABEV3.SA','PSSA3.SA','VVAR3.SA','TRPL4.SA','BPAC3.SA','TUPY3.SA','WEGE3.SA','HGTX3.SA','MYPK3.SA','LREN3.SA']

# Diego
#tickers = ['MYPK3.SA','CAML3.SA','SLCE3.SA','MOVI3.SA','COGN3.SA','TUPY3.SA','ITSA4.SA','PRIO3.SA','BBAS3.SA','LCAM3.SA','PTBL3.SA','MDIA3.SA','TAEE11.SA','HGTX3.SA','VVAR3.SA']

# current, without HGTX3 and MYPK3
tickers = ['ENBR3.SA','BBSE3.SA','PSSA3.SA','MRFG3.SA','TRPL4.SA','BPAC3.SA','XPML11.SA','RBRF11.SA','CPTS11B.SA','TUPY3.SA','LREN3.SA','WEGE3.SA','HGLG11.SA','ITUB3.SA','HYPE3.SA']  #,'PETR4.SA','HGTX3.SA','MYPK3.SA']

# Lowest correlation, drop: BBSE3.SA 'ITUB3.SA','HYPE3.SA','HGTX3.SA','MYPK3.SA','LREN3.SA','PETR4.SA'
import duckdb
import json
import sys

from .utils import flush_rows

SCHEMA = "documents(document_id, canon_url, date_publish, language, title, country)"

if __name__ == '__main__':
    conn = duckdb.connect(sys.argv[1])
    conn.begin()

    counter = 0
    rows = []
    for line in sys.stdin:
        counter += 1
        doc = json.loads(line.strip())
        canon_url = doc['canon_url']
        date_publish = doc['date_publish']
        language = doc['language']
        title = doc['title']
        country = doc['country']
        rows.append((counter, canon_url, date_publish, language, title, country))

        if counter % 50000 == 0:
            # Commit changes every now and then
            flush_rows(SCHEMA, conn, rows)
            conn.begin()
            print(counter)

    # Flush the final, incomplete batch as well (assuming flush_rows commits
    # the pending rows; see .utils).
    if rows:
        flush_rows(SCHEMA, conn, rows)
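# flush_rows lives in .utils and is not shown above. A plausible sketch,
# purely an assumption about its behavior (bulk-insert the batch, commit,
# and clear the list in place), might look like this:
def flush_rows(schema, conn, rows):
    table = schema.split("(")[0]                 # e.g. "documents"
    columns = schema[schema.index("(") + 1:-1]   # column list taken from the schema string
    placeholders = ", ".join("?" for _ in columns.split(","))
    conn.executemany(f"INSERT INTO {table} ({columns}) VALUES ({placeholders})", rows)
    conn.commit()
    rows.clear()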
import json
import pickle
import statistics
import sys
import time

import duckdb
import pymonetdb

# Project-local helpers (not shown here): bm.BertModel, Choice and preprocess.


def get_queries():
    # Get the preprocessed queries
    queries = []
    with open("topics.data", 'rb') as filehandle:
        queries = pickle.load(filehandle)
    return queries


def get_content(raw_docs):
    contents = {}
    with open(raw_docs) as json_file:
        contents = json_file.read()
        json_data = json.loads(contents)
        contents = {item['id']: item for item in json_data}
    return contents


def BM25(input_query, c):
    query_words = ""
    for word in input_query.split():
        query_words = query_words + "'" + word.lower() + "',"
    query_words = query_words[:-1]

    c.execute("SELECT termid FROM dict WHERE term IN (" + query_words + ")")
    id_list = c.fetchall()

    query_ids = ""
    for ids in id_list:
        query_ids = query_ids + str(ids[0]) + ","
    query_ids = query_ids[:-1]

    BM = """
    WITH qterms AS (SELECT termid, docid, count as df FROM terms
        WHERE termid IN (""" + query_ids + """)),
    subscores AS (SELECT docs.docid, length, term_tf.termid, tf, df,
        (log((528155.000000-df+0.5)/(df+0.5))*((term_tf.tf*(1.2+1)/
        (term_tf.tf+1.2*(1-0.75+0.75*(length/188.33)))))) AS subscore
        FROM (SELECT termid, docid, df AS tf FROM qterms) AS term_tf
        JOIN (SELECT docid FROM qterms GROUP BY docid) AS cdocs
          ON term_tf.docid = cdocs.docid
        JOIN docs ON term_tf.docid=docs.docid
        JOIN dict ON term_tf.termid=dict.termid)
    SELECT scores.docid, ROUND(score,6)
    FROM (SELECT docid, sum(subscore) AS score FROM subscores GROUP BY docid) AS scores
    JOIN docs ON scores.docid=docs.docid
    ORDER BY ROUND(score,6) DESC, scores.docid ASC LIMIT 100;
    """
    c.execute(BM)
    docids = str(list(c.fetchnumpy()['docid']))[1:-1]
    c.execute("SELECT name FROM docs WHERE docid IN (" + docids + ")")
    results = c.fetchnumpy()['name']
    return results


if __name__ == '__main__':
    raw_docs = sys.argv[1]
    db_name = sys.argv[2]
    option = int(sys.argv[3])
    queries = get_queries()
    times = []
    results = open('results.txt', 'w')

    # DuckDB with Bert
    if option == 1:
        contents = get_content(raw_docs)
        bert_model = bm.BertModel()
        con = duckdb.connect(db_name)
        c = con.cursor()
        for item in queries:
            query_no, query = item['number'], item["title"]
            results.write("Query: " + str(query_no) + "\n")
            start_time = time.time()
            candidate_docs = BM25(query, c)
            choices = []
            for i in range(len(candidate_docs)):
                content = contents[candidate_docs[i] + '.000000']['contents']
                content = preprocess(content)
                choices.append(Choice(i, content.encode('utf-8')))
            ranked = bert_model.rank(query.encode('utf-8'), choices)
            end_time = time.time()
            for i in range(10):
                results.write(str(i + 1) + ") " + candidate_docs[ranked[i]] + "\n")
            times.append(end_time - start_time)
    else:
        con = None
        # DuckDB
        if option == 2:
            con = duckdb.connect(db_name)
        # MonetDB
        else:
            con = pymonetdb.connect(username='******', password='******', hostname='localhost', database=db_name)
        c = con.cursor()
        for item in queries:
            query_no, query = item['number'], item["title"]
            start_time = time.time()
            bm25_results = BM25(query, c)
            end_time = time.time()
            for i in range(10):
                results.write(str(i + 1) + ") " + bm25_results[i] + "\n")
            times.append(end_time - start_time)

    print("Max: " + str(max(times)))
    print("Min: " + str(min(times)))
    print("Average: " + str(sum(times) / len(times)))
    print("Standard Deviation: " + str(statistics.stdev(times)))
    print("Total: " + str(sum(times)))
    print("Number of queries: " + str(len(times)))
    results.close()
    c.close()
    con.close()
def test_aggregate_operator(self, duckdb_cursor):
    conn = duckdb.connect()
    rel = get_relation(conn)
    assert rel.aggregate("sum(i)").execute().fetchall() == [(10,)]
    assert rel.aggregate("j, sum(i)").execute().fetchall() == [
        ('one', 1), ('two', 2), ('three', 3), ('four', 4)]
def create_connection(self):
    connection = duckdb.connect(self.config['dbFilename'])
    return connection
def test_distinct_operator(self, duckdb_cursor):
    conn = duckdb.connect()
    rel = get_relation(conn)
    assert rel.distinct().execute().fetchall() == [(1, 'one'), (2, 'two'), (3, 'three'), (4, 'four')]
def export(self, target: str):
    """
    Emit Pandas DataFrame to target. A target
    is identified by a connection string.

    Examples:

    - duckdb://dwd.duckdb?table=weather
    - influxdb://localhost/?database=dwd&table=weather
    - crate://localhost/?database=dwd&table=weather

    Dispatch data to different data sinks. Currently, SQLite, DuckDB,
    InfluxDB and CrateDB are implemented. However, through the SQLAlchemy
    layer, it should actually work with any supported SQL database.

    - https://docs.sqlalchemy.org/en/13/dialects/

    :param target: Target connection string.
    :return: self
    """
    database, tablename = ConnectionString(target).get()

    if target.startswith("duckdb://"):
        """
        ====================
        DuckDB database sink
        ====================

        Install Python driver::

            pip install duckdb

        Acquire data::

            wetterdienst readings --station=1048,4411 --parameter=kl --resolution=daily --period=recent --target="duckdb:///dwd.duckdb?table=weather"

        Example queries::

            python -c 'import duckdb; c = duckdb.connect(database="dwd.duckdb"); print(c.table("weather"))'  # noqa
            python -c 'import duckdb; c = duckdb.connect(database="dwd.duckdb"); print(c.execute("SELECT * FROM weather").df())'  # noqa

        """
        log.info(f"Writing to DuckDB {database, tablename}")
        import duckdb

        connection = duckdb.connect(database=database, read_only=False)
        connection.register("origin", self.df)
        connection.execute(f"DROP TABLE IF EXISTS {tablename};")
        connection.execute(
            f"CREATE TABLE {tablename} AS SELECT * FROM origin;"  # noqa:S608
        )

        weather_table = connection.table(tablename)
        print(weather_table)
        print("Cardinalities:")
        print(weather_table.to_df().count())
        connection.close()
        log.info("Writing to DuckDB finished")

    elif target.startswith("influxdb://"):
        """
        ======================
        InfluxDB database sink
        ======================

        Install Python driver::

            pip install influxdb

        Run database::

            docker run --publish "8086:8086" influxdb/influxdb:1.8.2

        Acquire data::

            wetterdienst readings --station=1048,4411 --parameter=kl --resolution=daily --period=recent --target="influxdb://localhost/?database=dwd&table=weather"

        Example queries::

            http 'localhost:8086/query?db=dwd&q=SELECT * FROM weather;'
            http 'localhost:8086/query?db=dwd&q=SELECT COUNT(*) FROM weather;'
        """
        log.info(f"Writing to InfluxDB {database, tablename}")
        from influxdb.dataframe_client import DataFrameClient

        # Setup the connection.
        c = DataFrameClient(database=database)
        c.create_database(database)

        # Mungle the data frame.
        df = self.df.set_index(pd.DatetimeIndex(self.df["date"]))
        df = df.drop(["date"], axis=1)
        df = df.dropna()

        # Write to InfluxDB.
        c.write_points(
            dataframe=df,
            measurement=tablename,
            tag_columns=["station_id", "parameter", "element"],
        )
        log.info("Writing to InfluxDB finished")

    elif target.startswith("crate://"):
        """
        =====================
        CrateDB database sink
        =====================

        Install Python driver::

            pip install crate[sqlalchemy] crash

        Run database::

            docker run --publish "4200:4200" --env CRATE_HEAP_SIZE=512M crate/crate:4.2.4

        Acquire data::

            wetterdienst readings --station=1048,4411 --parameter=kl --resolution=daily --period=recent --target="crate://localhost/?database=dwd&table=weather"

        Example queries::

            crash -c 'select * from weather;'
            crash -c 'select count(*) from weather;'
            crash -c "select *, date_format('%Y-%m-%dT%H:%i:%s.%fZ', date) as datetime from weather order by datetime limit 10;"  # noqa
        """
        log.info("Writing to CrateDB")
        self.df.to_sql(
            name=tablename,
            con=target,
            if_exists="replace",
            index=False,
            method="multi",
            chunksize=5000,
        )
        log.info("Writing to CrateDB finished")

    else:
        """
        ========================
        SQLAlchemy database sink
        ========================

        Install Python driver::

            pip install sqlalchemy

        Examples::

            # Prepare
            alias fetch='wetterdienst readings --station=1048,4411 --parameter=kl --resolution=daily --period=recent'

            # Acquire data.
            fetch --target="sqlite:///dwd.sqlite?table=weather"

            # Query data.
            sqlite3 dwd.sqlite "SELECT * FROM weather;"
        """
        log.info("Writing to SQL database")
        self.df.to_sql(
            name=tablename,
            con=target,
            if_exists="replace",
            index=False,
            method="multi",
            chunksize=5000,
        )
        log.info("Writing to SQL database finished")
def test_except_operator(self, duckdb_cursor):
    conn = duckdb.connect()
    test_df = pd.DataFrame.from_dict({"i": [1, 2, 3, 4], "j": ["one", "two", "three", "four"]})
    rel = conn.from_df(test_df)
    rel2 = conn.from_df(test_df)
    assert rel.except_(rel2).execute().fetchall() == []
def test_execute_fail(self, duckdb_cursor):
    conn = duckdb.connect()
    conn.execute("CREATE TABLE test (i INTEGER)")
    rel = conn.table("test")
    with pytest.raises(Exception):
        rel.execute("select j from test")
import os
import pytest
import tempfile

import duckdb

try:
    import pyarrow as pa
    import pyarrow.parquet as pq
    import pyarrow.dataset as ds
    import numpy as np
    import pandas as pd
    import re
    can_run = True
except:
    can_run = False

## DuckDB connection used in this test
duckdb_conn = duckdb.connect()


def numeric_operators(data_type, tbl_name):
    duckdb_conn.execute("CREATE TABLE " + tbl_name + " (a " + data_type + ", b " + data_type + ", c " + data_type + ")")
    duckdb_conn.execute("INSERT INTO " + tbl_name + " VALUES (1,1,1),(10,10,10),(100,10,100),(NULL,NULL,NULL)")
    duck_tbl = duckdb_conn.table(tbl_name)
    arrow_table = duck_tbl.arrow()
    print(arrow_table)
    duckdb_conn.register("testarrow", arrow_table)
    # Try ==
    assert duckdb_conn.execute(
def test_filter_operator(self, duckdb_cursor):
    conn = duckdb.connect()
    rel = get_relation(conn)
    assert rel.filter('i > 1').execute().fetchall() == [(2, 'two'), (3, 'three'), (4, 'four')]
parser.add_argument("vcffile", type=argparse.FileType(mode='rb'))
parser.add_argument("dataset_name", help="name of dataset for this vcf", default="dataset1", type=str)
parser.add_argument("--sqlite", help="sqlite filename", type=str)
parser.add_argument("--duckdb", help="duckdb filename", type=str)
parser.add_argument("--parquet", help="parquet file prefix", type=str)
parser.add_argument("--csv", help="csv file prefix", type=str)
args = parser.parse_args()

print("Reading VCF and generating dataframes...")
tables = tables(args.vcffile, args.dataset_name)

if args.sqlite:
    print("Writing sqlite")
    with sqlite3.connect(args.sqlite) as con:
        pd_to_sql(tables, con)

if args.duckdb:
    print("Writing duckdb")
    con = duckdb.connect(args.duckdb, read_only=False)
    pd_to_sql(tables, con)

if args.csv:
    print("Writing csv")
    pd_to_csv(tables, args.csv)

if args.parquet:
    print("Writing parquet")
    pd_to_parquet(tables, args.parquet)
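# pd_to_sql and the other pd_to_* helpers are defined elsewhere in the
# project. As a rough, assumed sketch only: if `tables` maps table names to
# pandas DataFrames, pd_to_sql might write each frame through the given
# connection, taking a DuckDB-specific path when available.
def pd_to_sql(tables, con):
    import duckdb
    for name, df in tables.items():
        if isinstance(con, duckdb.DuckDBPyConnection):
            # DuckDB path: register the frame and materialize it as a table.
            con.register("df_tmp", df)
            con.execute(f"CREATE OR REPLACE TABLE {name} AS SELECT * FROM df_tmp")
            con.unregister("df_tmp")
        else:
            # sqlite3 path: pandas writes through the DB-API connection.
            df.to_sql(name, con, if_exists="replace", index=False)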
def test_projection_operator(self, duckdb_cursor):
    conn = duckdb.connect()
    rel = get_relation(conn)
    assert rel.project('i').execute().fetchall() == [(1,), (2,), (3,), (4,)]
def test_map(self, duckdb_cursor):
    testrel = duckdb.values([1, 2])
    conn = duckdb.connect()
    conn.execute('CREATE TABLE t (a integer)')
    empty_rel = conn.table('t')
    newdf1 = testrel.map(lambda df: df['col0'].add(42).to_frame())
    newdf2 = testrel.map(lambda df: df['col0'].astype('string').to_frame())
    newdf3 = testrel.map(lambda df: df)

    # column count differs from bind
    def evil1(df):
        if len(df) == 0:
            return df['col0'].to_frame()
        else:
            return df

    # column type differs from bind
    def evil2(df):
        if len(df) == 0:
            df['col0'] = df['col0'].astype('string')
        return df

    # column name differs from bind
    def evil3(df):
        if len(df) == 0:
            df = df.rename(columns={"col0": "col42"})
        return df

    # does not return a df
    def evil4(df):
        return 42

    # straight up throws exception
    def evil5(df):
        this_makes_no_sense()

    def return_dataframe(df):
        return pd.DataFrame({'A': [1]})

    def return_big_dataframe(df):
        return pd.DataFrame({'A': [1] * 5000})

    def return_none(df):
        return None

    def return_empty_df(df):
        return pd.DataFrame()

    with pytest.raises(RuntimeError):
        print(testrel.map(evil1).df())
    with pytest.raises(RuntimeError):
        print(testrel.map(evil2).df())
    with pytest.raises(RuntimeError):
        print(testrel.map(evil3).df())
    with pytest.raises(AttributeError):
        print(testrel.map(evil4).df())
    with pytest.raises(RuntimeError):
        print(testrel.map(evil5).df())

    # not a function
    with pytest.raises(TypeError):
        print(testrel.map(42).df())

    # nothing passed to map
    with pytest.raises(TypeError):
        print(testrel.map().df())

    testrel.map(return_dataframe).df().equals(pd.DataFrame({'A': [1]}))

    with pytest.raises(Exception):
        testrel.map(return_big_dataframe).df()

    empty_rel.map(return_dataframe).df().equals(pd.DataFrame({'A': []}))

    with pytest.raises(Exception):
        testrel.map(return_none).df()
    with pytest.raises(Exception):
        testrel.map(return_empty_df).df()
def test_order_operator(self, duckdb_cursor):
    # Renamed from test_projection_operator: this test exercises order(), and
    # the projection name collided with the projection test above.
    conn = duckdb.connect()
    rel = get_relation(conn)
    assert rel.order('j').execute().fetchall() == [
        (4, 'four'), (1, 'one'), (3, 'three'), (2, 'two')]
def test_length(self, duckdb_cursor):
    con = duckdb.connect()
    rel = initialize(con)
    assert len(rel) == 3
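# initialize() is a helper defined elsewhere in the test module. A plausible
# sketch, chosen purely as an assumption so that len(rel) == 3 holds:
def initialize(con):
    con.execute("CREATE TABLE items (i INTEGER)")
    con.execute("INSERT INTO items VALUES (1), (2), (3)")
    return con.table("items")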
def test_limit_operator(self, duckdb_cursor):
    conn = duckdb.connect()
    rel = get_relation(conn)
    assert rel.limit(2).execute().fetchall() == [(1, 'one'), (2, 'two')]
import duckdb

con = duckdb.connect('robust04db_indexed')
c = con.cursor()
c.execute("COPY dict FROM 'dict.csv' DELIMITER '|'")
c.close()
con.close()
def run(dbFilename, dataFolder):
    print("connecting to duckdb")
    conn = duckdb.connect(dbFilename)
    cursor = conn.cursor()

    '''
    COPY INTO supplier from '${data_folder}/sf_${scale_factor}/supplier.tbl' USING DELIMITERS '|', '|\n';
    '''

    print("loading customer")
    cursor.execute("DROP TABLE IF EXISTS customer")
    cursor.execute("""
    CREATE TABLE if not exists customer
    (
        c_custkey    INT,     -- numeric identifier
        c_name       STRING,  -- varchar(25), variable text, size 25, 'customer'||custkey
        c_address    STRING,  -- varchar(25), variable text, size 25 (city below)
        c_city       STRING,  -- varchar(10), fixed text, size 10 (10/nation: nation_prefix||(0-9))
        c_nation     STRING,  -- varchar(15), fixed text(15) (25 values, longest united kingdom)
        c_region     STRING,  -- varchar(12), fixed text, size 12 (5 values: longest middle east)
        c_phone      STRING,  -- varchar(15), fixed text, size 15 (many values, format: 43-617-354-1222)
        c_mktsegment STRING,  -- varchar(10), fixed text, size 10 (longest is automobile)
        PRIMARY KEY (c_custkey)
    );
    """)
    data_file = str(os.path.join(dataFolder, 'customer.tbl'))
    cursor.execute("copy customer from '" + data_file + "' (delimiter '|')")

    print("loading date_")
    cursor.execute("DROP TABLE IF EXISTS date_")
    cursor.execute("""
    CREATE TABLE if not exists date_
    (
        d_datekey        INT,     -- identifier, unique id -- e.g. 19980327 (what we use)
        d_date           STRING,  -- varchar(18), fixed text, size 18, longest: december 22, 1998
        d_dayofweek      STRING,  -- varchar(8), fixed text, size 8 (sunday, monday, ..., saturday)
        d_month          STRING,  -- varchar(9), fixed text, size 9: january, ..., december
        d_year           INT,     -- unique value 1992-1998
        d_yearmonthnum   INT,     -- numeric (yyyymm) -- e.g. 199803
        d_yearmonth      STRING,  -- varchar(7), fixed text, size 7: mar1998 for example
        d_daynuminweek   INT,     -- numeric 1-7
        d_daynuminmonth  INT,     -- numeric 1-31
        d_daynuminyear   INT,     -- numeric 1-366
        d_monthnuminyear INT,     -- numeric 1-12
        d_weeknuminyear  INT,     -- numeric 1-53
        d_sellingseason  STRING,  -- varchar(12), text, size 12 (christmas, summer, ...)
        d_lastdayinweekfl  INT,   -- 1 bit
        d_lastdayinmonthfl INT,   -- 1 bit
        d_holidayfl        INT,   -- 1 bit
        d_weekdayfl        INT,   -- 1 bit
        PRIMARY KEY (d_datekey)
    );
    """)
    data_file = str(os.path.join(dataFolder, 'date.tbl'))
    cursor.execute("copy date_ from '" + data_file + "' (delimiter '|') ")

    print("loading part")
    cursor.execute("DROP TABLE IF EXISTS part")
    cursor.execute("""
    CREATE TABLE if not exists part
    (
        p_partkey   INT,     -- identifier
        p_name      STRING,  -- varchar(22), variable text, size 22 (not unique per part but never was)
        p_mfgr      STRING,  -- varchar(6), fixed text, size 6 (mfgr#1-5, card = 5)
        p_category  STRING,  -- varchar(7), fixed text, size 7 ('mfgr#'||1-5||1-5: card = 25)
        p_brand1    STRING,  -- varchar(9), fixed text, size 9 (category||1-40: card = 1000)
        p_color     STRING,  -- varchar(11), variable text, size 11 (card = 94)
        p_type      STRING,  -- varchar(25), variable text, size 25 (card = 150)
        p_size      INT,     -- numeric 1-50 (card = 50)
        p_container STRING,  -- varchar(15), fixed text(10) (card = 40)
        PRIMARY KEY (p_partkey)
    );
    """)
    data_file = str(os.path.join(dataFolder, 'part.tbl'))
    cursor.execute("copy part from '" + data_file + "' (delimiter '|') ")

    print("loading supplier")
    cursor.execute("DROP TABLE IF EXISTS supplier")
    cursor.execute("""
    CREATE TABLE if not exists supplier
    (
        s_suppkey INT,     -- identifier
        s_name    STRING,  -- varchar(25), fixed text, size 25: 'supplier'||suppkey
        s_address STRING,  -- varchar(25), variable text, size 25 (city below)
        s_city    STRING,  -- varchar(10), fixed text, size 10 (10/nation: nation_prefix||(0-9))
        s_nation  STRING,  -- varchar(15), fixed text(15) (25 values, longest united kingdom)
        s_region  STRING,  -- varchar(12), fixed text, size 12 (5 values: longest middle east)
        s_phone   STRING,  -- varchar(15), fixed text, size 15 (many values, format: 43-617-354-1222)
        PRIMARY KEY (s_suppkey)
    );
    """)
    data_file = str(os.path.join(dataFolder, 'supplier.tbl'))
    cursor.execute("copy supplier from '" + data_file + "' (delimiter '|') ")

    print("loading lineorder")
    cursor.execute("DROP TABLE IF EXISTS lineorder")
    cursor.execute("""
    CREATE TABLE if not exists lineorder
    (
        lo_orderkey      INT,     -- numeric (int up to sf 300), first 8 of each 32 keys used
        lo_linenumber    INT,     -- numeric 1-7
        lo_custkey       INT,     -- numeric identifier, foreign key reference to c_custkey
        lo_partkey       INT,     -- identifier, foreign key reference to p_partkey
        lo_suppkey       INT,     -- numeric identifier, foreign key reference to s_suppkey
        lo_orderdate     INT,     -- identifier, foreign key reference to d_datekey
        lo_orderpriority STRING,  -- varchar(15), fixed text, size 15 (5 priorities: 1-urgent, etc.)
        lo_shippriority  STRING,  -- varchar(1), fixed text, size 1
        lo_quantity      INT,     -- numeric 1-50 (for part)
        lo_extendedprice INT,     -- numeric, max about 55,450 (for part)
        lo_ordtotalprice INT,     -- numeric, max about 388,000 (for order)
        lo_discount      INT,     -- numeric 0-10 (for part), represents percent
        lo_revenue       INT,     -- numeric (for part: (extendedprice*(100-discount))/100)
        lo_supplycost    INT,     -- numeric (for part, cost from supplier, max = ?)
        lo_tax           INT,     -- numeric 0-8 (for part)
        lo_commitdate    INT,     -- foreign key reference to d_datekey
        lo_shipmode      STRING   -- varchar(10), fixed text, size 10 (modes: reg air, air, etc.)
    );
    """)
    # duckdb does not currently support compound primary keys, so removed it
    '''
    PRIMARY KEY (lo_orderkey, lo_linenumber)  -- Compound Primary Key: ORDERKEY, LINENUMBER
    '''
    # duckdb does not currently support foreign keys, so removed them
    '''
    FOREIGN KEY (lo_orderdate) REFERENCES date_ (d_datekey),   -- identifier foreign key reference to D_DATEKEY
    FOREIGN KEY (lo_commitdate) REFERENCES date_ (d_datekey),  -- Foreign Key reference to D_DATEKEY
    FOREIGN KEY (lo_suppkey) REFERENCES supplier (s_suppkey),  -- numeric identifier foreign key reference to S_SUPPKEY
    FOREIGN KEY (lo_custkey) REFERENCES customer (c_custkey)   -- numeric identifier foreign key reference
    '''
    data_file = str(os.path.join(dataFolder, 'lineorder.tbl'))
    cursor.execute("copy lineorder from '" + data_file + "' (delimiter '|') ")
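# Hypothetical sanity check after the loader has run; "ssb.duckdb" is a
# placeholder path, not a file name from the original script.
import duckdb

con = duckdb.connect("ssb.duckdb")
for table in ("customer", "date_", "part", "supplier", "lineorder"):
    count = con.execute(f"SELECT count(*) FROM {table}").fetchone()[0]
    print(table, count)
con.close()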