Example #1
 def test_bool(self, duckdb_cursor):
     conn = duckdb.connect()
     res = conn.execute("select ?, ?, ?", (True, 42, [1, 2, 3])).fetchall()
     assert res[0][0] == True
     assert res[0][1] == 42
     assert res[0][2] == [1, 2, 3]
Example #2
import duckdb

con = duckdb.connect('robust04db')  # persists the database in the file 'robust04db'
c = con.cursor()

c.execute("CREATE TABLE dict (termid INTEGER,term VARCHAR(100), df INTEGER)")
c.execute("CREATE TABLE terms (termid INTEGER,docid INTEGER, count INTEGER)")
c.execute(
    "CREATE TABLE docs (name VARCHAR(50), docid INTEGER, length INTEGER)")

c.close()
con.close()
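A brief sketch (with made-up values) of how the tables above can be populated and queried; this is only an illustration of the schema, not part of the original script:

import duckdb

con = duckdb.connect('robust04db')
c = con.cursor()

# One dictionary entry, one posting and one document record (hypothetical values).
c.execute("INSERT INTO dict VALUES (1, 'car', 1)")
c.execute("INSERT INTO terms VALUES (1, 100, 3)")
c.execute("INSERT INTO docs VALUES ('DOC-100', 100, 250)")

# Look up the postings for the term 'car'.
c.execute("""
    SELECT docs.name, terms.count
    FROM dict
    JOIN terms ON dict.termid = terms.termid
    JOIN docs ON terms.docid = docs.docid
    WHERE dict.term = 'car'
""")
print(c.fetchall())

c.close()
con.close()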
Example #3
 def test_values(self, duckdb_cursor):
     conn = duckdb.connect()
     conn.execute("create table t (a integer)")
     duckdb.values([1], conn).insert_into("t")
     assert conn.execute("select count(*) from t").fetchall()[0] == (1, )
Example #4
    def to_target(self, target: str):
        """
        Emit Pandas DataFrame to target. A target
        is identified by a connection string.

        Examples:

        - duckdb://dwd.duckdb?table=weather
        - influxdb://localhost/?database=dwd&table=weather
        - crate://localhost/?database=dwd&table=weather

        Dispatch data to different data sinks. Currently, SQLite, DuckDB,
        InfluxDB and CrateDB are implemented. However, through the SQLAlchemy
        layer, it should actually work with any supported SQL database.

        - https://docs.sqlalchemy.org/en/13/dialects/

        :param target: Target connection string.
        :return: self
        """

        log.info(f"Exporting records to {target}\n{self.df.count()}")

        connspec = ConnectionString(target)
        protocol = connspec.url.scheme
        database = connspec.get_database()
        tablename = connspec.get_table()

        if target.startswith("file://"):
            filepath = connspec.get_path()

            if target.endswith(".xlsx"):
                log.info(f"Writing to spreadsheet file '{filepath}'")

                # Convert all datetime columns to ISO format.
                df = convert_datetimes(self.df)
                df.to_excel(filepath, index=False)

            elif target.endswith(".feather"):
                # https://arrow.apache.org/docs/python/feather.html
                log.info(f"Writing to Feather file '{filepath}'")
                import pyarrow.feather as feather

                feather.write_feather(self.df, filepath, compression="lz4")

            elif target.endswith(".parquet"):
                """
                # Acquire data and store to Parquet file.
                alias fetch="wetterdienst dwd observations values --station=1048,4411 --parameter=kl --resolution=daily --period=recent"
                fetch --target="file://observations.parquet"

                # Check Parquet file.
                parquet-tools schema observations.parquet
                parquet-tools head observations.parquet

                # References
                - https://arrow.apache.org/docs/python/parquet.html
                """

                log.info(f"Writing to Parquet file '{filepath}'")
                import pyarrow as pa
                import pyarrow.parquet as pq

                table = pa.Table.from_pandas(self.df)
                pq.write_table(table, filepath)

            elif target.endswith(".zarr"):
                """
                # Acquire data and store to Zarr group.
                alias fetch="wetterdienst dwd observations values --station=1048,4411 --parameter=kl --resolution=daily --period=recent"
                fetch --target="file://observations.zarr"

                # References
                - https://xarray.pydata.org/en/stable/generated/xarray.Dataset.from_dataframe.html
                - https://xarray.pydata.org/en/stable/generated/xarray.Dataset.to_zarr.html
                """

                log.info(f"Writing to Zarr group '{filepath}'")
                import xarray

                df = self.df

                # Problem: `ValueError: Cannot setitem on a Categorical with a new category, set the categories first`.
                # Solution: Let's convert all categorical columns back to their designated type representations.
                #           https://stackoverflow.com/questions/32011359/convert-categorical-data-in-pandas-dataframe/32011969#32011969
                if "quality" in df:
                    df.quality = df.quality.astype("Int64")
                categorical_columns = df.select_dtypes(["category"]).columns
                df[categorical_columns] = df[categorical_columns].astype("str")

                # Problem: `TypeError: float() argument must be a string or a number, not 'NAType'`.
                # Solution: Fill gaps in the data.
                df = df.fillna(-999)

                # Convert pandas DataFrame to xarray Dataset.
                dataset = xarray.Dataset.from_dataframe(df)
                log.info(f"Converted to xarray Dataset. Size={dataset.sizes}")

                # Export to Zarr format.
                # TODO: Add "group" parameter.
                #       Group path. (a.k.a. `path` in zarr terminology.)
                # TODO: Also use attributes: `store.set_attribute()`
                store = dataset.to_zarr(
                    filepath,
                    mode="w",
                    group=None,
                    encoding={"date": {
                        "dtype": "datetime64"
                    }},
                )

                # Reporting.
                dimensions = store.get_dimensions()
                variables = list(store.get_variables().keys())

                log.info(
                    f"Wrote Zarr file with dimensions={dimensions} and variables={variables}"
                )
                log.info(f"Zarr Dataset Group info:\n{store.ds.info}")

            else:
                raise KeyError("Unknown export file type")

            return

        if target.startswith("duckdb://"):
            """
            ====================
            DuckDB database sink
            ====================

            Install Python driver::

                pip install duckdb

            Acquire data::

                wetterdienst dwd observations values --station=1048,4411 --parameter=kl --resolution=daily --period=recent --target="duckdb:///dwd.duckdb?table=weather"

            Example queries::

                python -c 'import duckdb; c = duckdb.connect(database="dwd.duckdb"); print(c.table("weather"))'  # noqa
                python -c 'import duckdb; c = duckdb.connect(database="dwd.duckdb"); print(c.execute("SELECT * FROM weather").df())'  # noqa

            """
            log.info(
                f"Writing to DuckDB. database={database}, table={tablename}")
            import duckdb

            connection = duckdb.connect(database=database, read_only=False)
            connection.register("origin", self.df)
            connection.execute(f"DROP TABLE IF EXISTS {tablename};")
            connection.execute(
                f"CREATE TABLE {tablename} AS SELECT * FROM origin;"
            )  # noqa:S608

            weather_table = connection.table(tablename)
            print(weather_table)  # noqa: T001
            print("Cardinalities:")  # noqa: T001
            print(weather_table.to_df().count())  # noqa: T001
            connection.close()
            log.info("Writing to DuckDB finished")

        elif protocol.startswith("influxdb"):
            """
            ==========================
            InfluxDB 1.x database sink
            ==========================

            Install Python driver::

                pip install influxdb

            Run database::

                docker run -it --rm --publish=8086:8086 influxdb:1.8

            Acquire data::

                alias fetch="wetterdienst values --provider=dwd --network=observation --parameter=kl --resolution=daily --period=recent --station=1048,4411"
                fetch --target="influxdb://localhost/?database=dwd&table=weather"

            Example queries::

                http 'localhost:8086/query?db=dwd&q=SELECT * FROM weather;'
                http 'localhost:8086/query?db=dwd&q=SELECT COUNT(*) FROM weather;'


            ==========================
            InfluxDB 2.x database sink
            ==========================

            Install Python driver::

                pip install influxdb_client

            Run database::

                docker run -it --rm --publish=8086:8086 influxdb:2.0
                influx setup --name=default --username=root --password=12345678 --org=acme --bucket=dwd --retention=0 --force

            Acquire data::

                INFLUXDB_ORGANIZATION=acme
                INFLUXDB_TOKEN=t5PJry6TyepGsG7IY_n0K4VHp5uPvt9iap60qNHIXL4E6mW9dLmowGdNz0BDi6aK_bAbtD76Z7ddfho6luL2LA==

                alias fetch="wetterdienst values --provider=dwd --network=observation --parameter=kl --resolution=daily --period=recent --station=1048,4411"
                fetch --target="influxdb2://${INFLUXDB_ORGANIZATION}:${INFLUXDB_TOKEN}@localhost/?database=dwd&table=weather"

            Example queries::

                influx query 'from(bucket:"dwd") |> range(start:-2d) |> limit(n: 10)'
            """

            if protocol in [
                    "influxdb", "influxdbs", "influxdb1", "influxdb1s"
            ]:
                version = 1
            elif protocol in ["influxdb2", "influxdb2s"]:
                version = 2
            else:
                raise KeyError(
                    f"Unknown protocol variant '{protocol}' for InfluxDB")

            log.info(
                f"Writing to InfluxDB version {version}. database={database}, table={tablename}"
            )

            # 1. Mungle the data frame.
            # Use the "date" column as appropriate timestamp index.
            df = self.df.set_index(pd.DatetimeIndex(self.df["date"]))
            df = df.drop(["date"], axis=1)

            # Compute designated tag fields from some candidates.
            tag_columns = []
            tag_candidates = [
                Columns.STATION_ID.value,
                Columns.QUALITY.value,
                Columns.QUALITY_PREFIX.value,
                Columns.DATASET.value,
                Columns.PARAMETER.value,
            ]
            for tag_candidate in tag_candidates:
                tag_candidate = tag_candidate.lower()
                for column in df.columns:
                    if column.startswith(tag_candidate):
                        tag_columns.append(column)

            # Setup the connection.
            if version == 1:
                from influxdb import InfluxDBClient

                client = InfluxDBClient(
                    host=connspec.url.hostname,
                    port=connspec.url.port or 8086,
                    username=connspec.url.username,
                    password=connspec.url.password,
                    database=database,
                    ssl=protocol.endswith("s"),
                )
                client.create_database(database)
            elif version == 2:
                from influxdb_client import InfluxDBClient, Point
                from influxdb_client.client.write_api import SYNCHRONOUS

                ssl = protocol.endswith("s")
                url = f"http{ssl and 's' or ''}://{connspec.url.hostname}:{connspec.url.port or 8086}"
                client = InfluxDBClient(url=url,
                                        org=connspec.url.username,
                                        token=connspec.url.password)
                write_api = client.write_api(write_options=SYNCHRONOUS)

            points = []
            for items in chunker(df, chunksize=50000):

                for date, record in items.iterrows():
                    time = date.isoformat()
                    tags = {
                        tag: record.pop(tag)
                        for tag in tag_columns if tag in record
                    }

                    fields = record.dropna().to_dict()
                    if not fields:
                        continue

                    if version == 1:
                        point = {
                            "measurement": tablename,
                            "time": time,
                            "tags": tags,
                            "fields": fields,
                        }
                    elif version == 2:
                        point = Point(tablename).time(date.isoformat())
                        for tag, value in tags.items():
                            point = point.tag(tag, value)
                        for field, value in fields.items():
                            point = point.field(field, value)

                    points.append(point)

            # Write to InfluxDB.
            if version == 1:
                client.write_points(
                    points=points,
                    batch_size=50000,
                )
            elif version == 2:
                write_api.write(bucket=database, record=points)
                write_api.close()

            log.info("Writing to InfluxDB finished")

        elif target.startswith("crate://"):
            """
            =====================
            CrateDB database sink
            =====================

            Install Python driver::

                pip install crate[sqlalchemy] crash

            Run database::

                docker run -it --rm --publish=4200:4200 --env CRATE_HEAP_SIZE=2048M crate/crate:nightly

            Acquire data::

                wetterdienst dwd observations values --station=1048,4411 --parameter=kl --resolution=daily --period=recent --target="crate://crate@localhost/dwd?table=weather"

            Example queries::

                psql postgres://crate@localhost --command 'SELECT * FROM dwd.weather;'

                crash -c 'select * from dwd.weather;'
                crash -c 'select count(*) from dwd.weather;'
                crash -c "select *, date_format('%Y-%m-%dT%H:%i:%s.%fZ', date) as datetime from dwd.weather order by datetime limit 10;"  # noqa

            """
            log.info(f"Writing to CrateDB. target={target}, table={tablename}")

            # CrateDB's SQLAlchemy driver doesn't accept `database` or `table` query parameters.
            cratedb_url = connspec.url._replace(path="", query=None)
            cratedb_target = urlunparse(cratedb_url)

            # Convert timezone-aware datetime fields to naive ones.
            # FIXME: Omit this as soon as the CrateDB driver is capable of supporting timezone-qualified timestamps.
            self.df.date = self.df.date.dt.tz_localize(None)

            self.df.to_sql(
                name=tablename,
                con=cratedb_target,
                schema=database,
                if_exists="replace",
                index=False,
                chunksize=5000,
            )
            log.info("Writing to CrateDB finished")

        else:
            """
            ================================
            Generic SQLAlchemy database sink
            ================================

            Install Python driver::

                pip install sqlalchemy

            Examples::

                # Prepare
                alias fetch='wetterdienst dwd observations values --station=1048,4411 --parameter=kl --resolution=daily --period=recent'

                # Acquire data.
                fetch --target="sqlite:///dwd.sqlite?table=weather"

                # Query data.
                sqlite3 dwd.sqlite "SELECT * FROM weather;"

            """

            # Honour SQLite's SQLITE_MAX_VARIABLE_NUMBER, which defaults to 999
            # for SQLite versions prior to 3.32.0 (2020-05-22),
            # see https://www.sqlite.org/limits.html#max_variable_number.
            chunksize = 5000
            if target.startswith("sqlite://"):
                import sqlite3

                if sqlite3.sqlite_version_info < (3, 32, 0):
                    chunksize = int(999 / len(self.df.columns))

            log.info("Writing to SQL database")
            self.df.to_sql(
                name=tablename,
                con=target,
                if_exists="replace",
                index=False,
                method="multi",
                chunksize=chunksize,
            )
            log.info("Writing to SQL database finished")
Example #5
import duckdb

con = duckdb.connect('robust04db')
c = con.cursor()

# BM25 scoring over the robust04 schema from Example #2.
# Constants: 528155 = number of documents, 188.33 = average document length,
# 1.2 = k1 and 0.75 = b (the usual BM25 parameters).
query = """
WITH qterms AS (SELECT termid, docid, count FROM terms
        WHERE termid IN (10575, 1285, 191)),
subscores AS (SELECT docs.docid, length, term_tf.termid,
        tf, df, (log((528155-df+0.5)/(df+0.5))*((tf*(1.2+1)/
        (tf+1.2*(1-0.75+0.75*(length/188.33)))))) AS subscore
    FROM (SELECT termid, docid, count AS tf FROM qterms) AS term_tf
    JOIN (SELECT docid FROM qterms
        GROUP BY docid HAVING COUNT(distinct termid) = 3)
        AS cdocs ON term_tf.docid = cdocs.docid
    JOIN docs ON term_tf.docid = docs.docid
    JOIN dict ON term_tf.termid = dict.termid)
SELECT scores.docid, score FROM (SELECT docid, sum(subscore) AS score
    FROM subscores GROUP BY docid) AS scores JOIN docs ON
    scores.docid = docs.docid ORDER BY score DESC;
"""

c.execute(query)
print(c.fetchall())

c.close()
con.close()
Example #6
 def test_union_operator(self, duckdb_cursor):
     conn = duckdb.connect()
     rel = get_relation(conn)
     print(rel.union(rel).execute().fetchall())
     assert rel.union(rel).execute().fetchall() == [(1, 'one'), (2, 'two'), (3, 'three'), (4, 'four'), (1, 'one'), (2, 'two'), (3, 'three'), (4, 'four')]
Example #7
 def test_create_view_operator(self,duckdb_cursor):
     conn = duckdb.connect()
     test_df = pd.DataFrame.from_dict({"i":[1, 2, 3, 4], "j":["one", "two", "three", "four"]})
     rel = conn.from_df(test_df)
     rel.create_view("test_df")
     assert conn.query("select * from test_df").execute().fetchall() == [(1, 'one'), (2, 'two'), (3, 'three'),(4, 'four')]
Example #8
 def test_conn_prepared_statement_error(self, duckdb_cursor):
     conn = duckdb.connect()
     conn.execute("create table integers (a integer, b integer)")
     with pytest.raises(Exception):
         conn.execute("select * from integers where a =? and b=?", [1])
Example #9
 def test_register_error(self, duckdb_cursor):
     con = duckdb.connect()
     py_obj = "this is a string"
     with pytest.raises(Exception):
         con.register(py_obj, "v")
Example #10
import duckdb
import numpy as np
con = duckdb.connect('robust04db_new')
c = con.cursor()
print("here1")
c.execute("DROP INDEX IF EXISTS dict_index")
print("here2")
print(c.fetchall())

c.close()
con.close()
'''
c.execute("SELECT avg(length) FROM docs");
avgdl = c.fetchall()[0][0]
k1 = 1.2
b = 0.75
query_term = 'car'


c.execute("SELECT COUNT(*) FROM docs");
N = c.fetchall()[0][0]

c.execute("SELECT df FROM dict WHERE term='car'");
df = c.fetchall()[0][0]

idf = np.log2((N-df+0.5)/(df+0.5))


c.execute("SELECT ("+str(idf)+"*(table2.count*"+ str(k1+1) +")/(table2.count+("+str(k1)+"*("+str(1-b) +" +("+str(b)+"*(table2.length/"+ str(avgdl) +")))))) AS score, table2.docid FROM (SELECT table1.count, table1.docid, (SELECT length FROM docs WHERE table1.docid = docs.docid) AS length FROM (SELECT docid, count FROM terms WHERE terms.termid = (SELECT termid FROM dict WHERE term = '"+ query_term +"')) AS table1) AS table2 ORDER BY score DESC LIMIT 5")
print(c.fetchall())
'''
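The commented-out experiment above assembles SQL by string concatenation. A hedged sketch of the same dictionary lookups using prepared-statement parameters instead (parameter binding as in Example #1):

import duckdb

con = duckdb.connect('robust04db_new')
c = con.cursor()

query_term = 'car'

# Number of documents in the collection.
c.execute("SELECT COUNT(*) FROM docs")
N = c.fetchall()[0][0]

# Document frequency of the query term, bound as a parameter.
c.execute("SELECT df FROM dict WHERE term = ?", [query_term])
df = c.fetchall()[0][0]

print(N, df)

c.close()
con.close()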
Example #11
    def test_check_same_thread_false(self, duckdb_cursor):
        con = duckdb.connect(check_same_thread=False)

        x = threading.Thread(target=connect_duck, args=(con, ))
        x.start()
Example #12
#####################################################
# VCOC3 Lifestyle Research Team
# PORTFOLIO THEORY (Markowitz)
# Objective: asset correlation
# INPUT: start date, end date and asset tickers (portfolio)
# OUTPUT: portfolio correlation and correlation matrix
#####################################################

import pandas as pd
import numpy as np
from pandas_datareader import data as wb
from datetime import datetime
import duckdb

con = duckdb.connect('')

dt_ini=datetime(2013, 1, 1) 
dt_fim=datetime(2020, 1, 21) 

# SUMMARY: reduce ,'ITUB3.SA'
#tickers = ['XPML11.SA','RBRF11.SA','BBSE3.SA','TRPL4.SA','PETR4.SA','VVAR3.SA','BPAC3.SA','PSSA3.SA','ENBR3.SA','HYPE3.SA','WEGE3.SA','CPTS11B.SA','TUPY3.SA','HGLG11.SA','ITUB3.SA','MYPK3.SA','LREN3.SA','HGTX3.SA']
# current
#tickers = ['ENBR3.SA','ABEV3.SA','PSSA3.SA','VVAR3.SA','TRPL4.SA','BPAC3.SA','TUPY3.SA','WEGE3.SA','HGTX3.SA','MYPK3.SA','LREN3.SA']

# Diego
#tickers = ['MYPK3.SA','CAML3.SA','SLCE3.SA','MOVI3.SA','COGN3.SA','TUPY3.SA','ITSA4.SA','PRIO3.SA','BBAS3.SA','LCAM3.SA','PTBL3.SA','MDIA3.SA','TAEE11.SA','HGTX3.SA','VVAR3.SA']


# current, excluding HGTX3 and MYPK3
tickers = ['ENBR3.SA','BBSE3.SA','PSSA3.SA','MRFG3.SA','TRPL4.SA','BPAC3.SA','XPML11.SA','RBRF11.SA','CPTS11B.SA','TUPY3.SA','LREN3.SA','WEGE3.SA','HGLG11.SA','ITUB3.SA','HYPE3.SA']#,'PETR4.SA','HGTX3.SA','MYPK3.SA']
# Lowest correlation, remove: BBSE3.SA 'ITUB3.SA','HYPE3.SA','HGTX3.SA','MYPK3.SA', 'LREN3.SA','PETR4.SA'
Example #13
import duckdb
import json
import sys
from .utils import flush_rows

SCHEMA = "documents(document_id, canon_url, date_publish, language, title, country)"

if __name__ == '__main__':
    conn = duckdb.connect(sys.argv[1])
    conn.begin()
    counter = 0
    rows = []

    for line in sys.stdin:
        counter += 1

        doc = json.loads(line.strip())

        canon_url = doc['canon_url']
        date_publish = doc['date_publish']
        language = doc['language']
        title = doc['title']
        country = doc['country']

        rows.append(
            (counter, canon_url, date_publish, language, title, country))

        if counter % 50000 == 0:  # Commit changes every now and then
            flush_rows(SCHEMA, conn, rows)
            conn.begin()
            print(counter)
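flush_rows is imported from the project's own utils module and is not shown above. A purely illustrative sketch of what such a helper might do, assuming it batch-inserts the buffered rows, commits, and clears the buffer:

def flush_rows(schema, conn, rows):
    # Hypothetical sketch only; the real implementation lives in .utils.
    table = schema.split("(")[0]
    placeholders = ", ".join("?" * len(rows[0]))
    conn.executemany(f"INSERT INTO {table} VALUES ({placeholders})", rows)
    conn.commit()
    rows.clear()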
Example #14
import json
import pickle
import statistics
import sys
import time

import duckdb
import pymonetdb

# bm (the BERT ranking model), Choice and preprocess are assumed to come from
# this project's own modules; they are not shown here.


def get_queries():
	# Get the preprocessed queries
	queries = []
	with open("topics.data", 'rb') as filehandle:
		queries = pickle.load(filehandle)
	return queries

def get_content(raw_docs):
	keys = []
	values = []
	contents = {}
	with open(raw_docs) as json_file:
		contents = json_file.read()
		json_data = json.loads(contents)
		contents = {item['id']:item for item in json_data}
	return contents

def BM25(input_query,c):
	query_words= ""
	for word in input_query.split():
			query_words = query_words + "'" + word.lower() + "',"
	query_words = query_words[:-1]
	c.execute("SELECT termid FROM dict WHERE term IN (" + query_words + ")")
	id_list = c.fetchall()
	query_ids= ""
	for ids in id_list:
			query_ids = query_ids + str(ids[0]) + ","
	query_ids = query_ids[:-1]
	BM = """ WITH qterms AS (SELECT termid, docid, count as df FROM terms							 
		WHERE termid IN ("""+ query_ids +""")),										  
		subscores AS (SELECT docs.docid, length, term_tf.termid,						 
		tf, df, (log((528155.000000-df+0.5)/(df+0.5))*((term_tf.tf*(1.2+1)/						  
		(term_tf.tf+1.2*(1-0.75+0.75*(length/188.33)))))) AS subscore							
		FROM (SELECT termid, docid, df AS tf FROM qterms) AS term_tf				  
		JOIN (SELECT docid FROM qterms												
		GROUP BY docid )						   
		AS cdocs ON term_tf.docid = cdocs.docid									 
		JOIN docs ON term_tf.docid=docs.docid										 
		JOIN dict ON term_tf.termid=dict.termid)									  
		SELECT scores.docid, ROUND(score,6) FROM (SELECT docid, sum(subscore) AS score		   
		FROM subscores GROUP BY docid) AS scores JOIN docs ON						 
		scores.docid=docs.docid ORDER BY ROUND(score,6) DESC, scores.docid ASC LIMIT 100; """
	c.execute(BM)
	docids =str(list(c.fetchnumpy()['docid']))[1:-1]
	c.execute("SELECT name FROM docs WHERE docid IN (" + docids + ")")
	results = c.fetchnumpy()['name']
	return results


if __name__ == '__main__':

	raw_docs = sys.argv[1]
	db_name = sys.argv[2]
	option = int(sys.argv[3])
	

	queries = get_queries()
	times = []
	results = open('results.txt','w')

	#DuckDB with Bert
	if option == 1:
		contents = get_content(raw_docs)
		bert_model = bm.BertModel()
		con = duckdb.connect(db_name)
		c = con.cursor()
		for item in queries:
			query_no, query= item['number'],item["title"]
			results.write("Query: " + str(query_no) + "\n")
			start_time = time.time() 
			candidate_docs= BM25(query,c)
			choices = []
			for i in range(len(candidate_docs)):
				content  = contents[candidate_docs[i] + '.000000']['contents']
				content = preprocess(content)
				choices.append(Choice(i,content.encode('utf-8')))
			ranked = bert_model.rank(query.encode('utf-8'),choices)
			end_time = time.time()
			for i in range(10):
				results.write(str(i+1) + ") " + candidate_docs[ranked[i]]+"\n")
			times.append(end_time - start_time)
			
	else:
		con = None
		#DuckDB
		if option == 2:
			con = duckdb.connect(db_name)
		#MonetDB
		else: 
			con = pymonetdb.connect(username='******',password='******',hostname='localhost', database=db_name)
		c = con.cursor()
		for item in queries:
			query_no, query= item['number'],item["title"]
			start_time = time.time() 
			bm25_results = BM25(query,c)
			end_time = time.time()
			for i in range(10):
				results.write(str(i+1) + ") " + bm25_results[i]+"\n")
			times.append(end_time - start_time)

	print("Max: " + str(max(times)))
	print("Min: " + str(min(times)))
	print("Average: " + str(sum(times)/len(times)))
	print("Standard Deviation: " + str(statistics.stdev(times)))
	print("Total: " + str(sum(times)))
	print("Number of queries: " + str(len(times)))

	results.close()
	c.close()
	con.close()
Example #15
 def test_aggregate_operator(self, duckdb_cursor):
     conn = duckdb.connect()
     rel = get_relation(conn)
     assert rel.aggregate("sum(i)").execute().fetchall() == [(10,)]
     assert rel.aggregate("j, sum(i)").execute().fetchall() == [('one', 1), ('two', 2), ('three', 3), ('four', 4)]
Example #16
 def create_connection(self):
     connection = duckdb.connect(self.config['dbFilename'])
     return connection
Example #17
 def test_distinct_operator(self, duckdb_cursor):
     conn = duckdb.connect()
     rel = get_relation(conn)
     assert rel.distinct().execute().fetchall() == [(1, 'one'), (2, 'two'), (3, 'three'),(4, 'four')]
Example #18
    def export(self, target: str):
        """
        Emit Pandas DataFrame to target. A target
        is identified by a connection string.

        Examples:

        - duckdb://dwd.duckdb?table=weather
        - influxdb://localhost/?database=dwd&table=weather
        - crate://localhost/?database=dwd&table=weather

        Dispatch data to different data sinks. Currently, SQLite, DuckDB,
        InfluxDB and CrateDB are implemented. However, through the SQLAlchemy
        layer, it should actually work with any supported SQL database.

        - https://docs.sqlalchemy.org/en/13/dialects/

        :param target: Target connection string.
        :return: self
        """

        database, tablename = ConnectionString(target).get()

        if target.startswith("duckdb://"):
            """
            ====================
            DuckDB database sink
            ====================

            Install Python driver::

                pip install duckdb

            Acquire data::

                wetterdienst readings --station=1048,4411 --parameter=kl --resolution=daily --period=recent --target="duckdb:///dwd.duckdb?table=weather"

            Example queries::

                python -c 'import duckdb; c = duckdb.connect(database="dwd.duckdb"); print(c.table("weather"))'  # noqa
                python -c 'import duckdb; c = duckdb.connect(database="dwd.duckdb"); print(c.execute("SELECT * FROM weather").df())'  # noqa

            """
            log.info(f"Writing to DuckDB {database, tablename}")
            import duckdb

            connection = duckdb.connect(database=database, read_only=False)
            connection.register("origin", self.df)
            connection.execute(f"DROP TABLE IF EXISTS {tablename};")
            connection.execute(
                f"CREATE TABLE {tablename} AS SELECT * FROM origin;"  # noqa:S608
            )

            weather_table = connection.table(tablename)
            print(weather_table)
            print("Cardinalities:")
            print(weather_table.to_df().count())
            connection.close()
            log.info("Writing to DuckDB finished")

        elif target.startswith("influxdb://"):
            """
            ======================
            InfluxDB database sink
            ======================

            Install Python driver::

                pip install influxdb

            Run database::

                docker run --publish "8086:8086" influxdb/influxdb:1.8.2

            Acquire data::

                wetterdienst readings --station=1048,4411 --parameter=kl --resolution=daily --period=recent --target="influxdb://localhost/?database=dwd&table=weather"

            Example queries::

                http 'localhost:8086/query?db=dwd&q=SELECT * FROM weather;'
                http 'localhost:8086/query?db=dwd&q=SELECT COUNT(*) FROM weather;'
            """
            log.info(f"Writing to InfluxDB {database, tablename}")
            from influxdb.dataframe_client import DataFrameClient

            # Setup the connection.
            c = DataFrameClient(database=database)
            c.create_database(database)

            # Mungle the data frame.
            df = self.df.set_index(pd.DatetimeIndex(self.df["date"]))
            df = df.drop(["date"], axis=1)
            df = df.dropna()

            # Write to InfluxDB.
            c.write_points(
                dataframe=df,
                measurement=tablename,
                tag_columns=["station_id", "parameter", "element"],
            )
            log.info("Writing to InfluxDB finished")

        elif target.startswith("crate://"):
            """
            =====================
            CrateDB database sink
            =====================

            Install Python driver::

                pip install crate[sqlalchemy] crash

            Run database::

                docker run --publish "4200:4200" --env CRATE_HEAP_SIZE=512M crate/crate:4.2.4

            Acquire data::

                wetterdienst readings --station=1048,4411 --parameter=kl --resolution=daily --period=recent --target="crate://localhost/?database=dwd&table=weather"

            Example queries::

                crash -c 'select * from weather;'
                crash -c 'select count(*) from weather;'
                crash -c "select *, date_format('%Y-%m-%dT%H:%i:%s.%fZ', date) as datetime from weather order by datetime limit 10;"  # noqa

            """
            log.info("Writing to CrateDB")
            self.df.to_sql(
                name=tablename,
                con=target,
                if_exists="replace",
                index=False,
                method="multi",
                chunksize=5000,
            )
            log.info("Writing to CrateDB finished")

        else:
            """
            ========================
            SQLAlchemy database sink
            ========================

            Install Python driver::

                pip install sqlalchemy

            Examples::

                # Prepare
                alias fetch='wetterdienst readings --station=1048,4411 --parameter=kl --resolution=daily --period=recent'

                # Acquire data.
                fetch --target="sqlite:///dwd.sqlite?table=weather"

                # Query data.
                sqlite3 dwd.sqlite "SELECT * FROM weather;"

            """
            log.info("Writing to SQL database")
            self.df.to_sql(
                name=tablename,
                con=target,
                if_exists="replace",
                index=False,
                method="multi",
                chunksize=5000,
            )
            log.info("Writing to SQL database finished")
Example #19
 def test_except_operator(self,duckdb_cursor):
     conn = duckdb.connect()
     test_df = pd.DataFrame.from_dict({"i":[1, 2, 3, 4], "j":["one", "two", "three", "four"]})
     rel = conn.from_df(test_df)
     rel2 = conn.from_df(test_df)
     assert rel.except_(rel2).execute().fetchall() == []
Example #20
 def test_execute_fail(self,duckdb_cursor):
     conn = duckdb.connect()
     conn.execute("CREATE TABLE test (i INTEGER)")
     rel = conn.table("test")
     with pytest.raises(Exception):
         rel.execute("select j from test")
Example #21
import os
import pytest
import tempfile

import duckdb

try:
    import pyarrow as pa
    import pyarrow.parquet as pq
    import pyarrow.dataset as ds
    import numpy as np
    import pandas as pd
    import re
    can_run = True
except ImportError:
    can_run = False

## DuckDB connection used in this test
duckdb_conn = duckdb.connect()


def numeric_operators(data_type, tbl_name):
    duckdb_conn.execute("CREATE TABLE " + tbl_name + " (a " + data_type +
                        ", b " + data_type + ", c " + data_type + ")")
    duckdb_conn.execute(
        "INSERT INTO  " + tbl_name +
        " VALUES (1,1,1),(10,10,10),(100,10,100),(NULL,NULL,NULL)")
    duck_tbl = duckdb_conn.table(tbl_name)
    arrow_table = duck_tbl.arrow()
    print(arrow_table)

    duckdb_conn.register("testarrow", arrow_table)
    # Try ==
    assert duckdb_conn.execute(
Example #22
 def test_filter_operator(self, duckdb_cursor):
     conn = duckdb.connect()
     rel = get_relation(conn)
     assert rel.filter('i > 1').execute().fetchall() == [(2, 'two'), (3, 'three'), (4, 'four')]
Example #23
    parser.add_argument("vcffile", type=argparse.FileType(mode='rb'))
    parser.add_argument("dataset_name",
                        help="name of dataset for this vcf",
                        default="dataset1",
                        type=str)
    parser.add_argument("--sqlite", help="sqlite filename", type=str)
    parser.add_argument("--duckdb", help="duckdb filename", type=str)
    parser.add_argument("--parquet", help="parquet file prefix", type=str)
    parser.add_argument("--csv", help="csv file prefix", type=str)
    args = parser.parse_args()

    print("Reading VCF and generating dataframes...")
    tables = tables(args.vcffile, args.dataset_name)

    if args.sqlite:
        print("Writing sqlite")
        with sqlite3.connect(args.sqlite) as con:
            pd_to_sql(tables, con)

    if args.duckdb:
        print("Writing duckdb")
        con = duckdb.connect(args.duckdb, read_only=False)
        pd_to_sql(tables, con)

    if args.csv:
        print("Writing csv")
        pd_to_csv(tables, args.csv)

    if args.parquet:
        print("Writing parquet")
        pd_to_parquet(tables, args.parquet)
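pd_to_sql, pd_to_csv and pd_to_parquet are helpers defined elsewhere in this script and are not shown. A purely illustrative sketch of the DuckDB path of pd_to_sql, assuming `tables` maps table names to pandas DataFrames (the real helper also has to handle the sqlite3 connection used above):

def pd_to_sql(tables, con):
    # Hypothetical sketch only; the real helper is defined elsewhere.
    for name, df in tables.items():
        con.register("incoming_df", df)
        con.execute(f"DROP TABLE IF EXISTS {name}")
        con.execute(f"CREATE TABLE {name} AS SELECT * FROM incoming_df")
        con.unregister("incoming_df")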
Example #24
 def test_projection_operator(self, duckdb_cursor):
     conn = duckdb.connect()
     rel = get_relation(conn)
     assert rel.project('i').execute().fetchall() == [(1,), (2,), (3,), (4,)]
Example #25
    def test_map(self, duckdb_cursor):
        testrel = duckdb.values([1, 2])
        conn = duckdb.connect()
        conn.execute('CREATE TABLE t (a integer)')
        empty_rel = conn.table('t')

        newdf1 = testrel.map(lambda df: df['col0'].add(42).to_frame())
        newdf2 = testrel.map(lambda df: df['col0'].astype('string').to_frame())
        newdf3 = testrel.map(lambda df: df)

        # column count differs from bind
        def evil1(df):
            if len(df) == 0:
                return df['col0'].to_frame()
            else:
                return df

        # column type differs from bind
        def evil2(df):
            if len(df) == 0:
                df['col0'] = df['col0'].astype('string')
            return df

        # column name differs from bind
        def evil3(df):
            if len(df) == 0:
                df = df.rename(columns={"col0": "col42"})
            return df

        # does not return a df
        def evil4(df):
            return 42

        # straight up throws exception
        def evil5(df):
            this_makes_no_sense()

        def return_dataframe(df):
            return pd.DataFrame({'A': [1]})

        def return_big_dataframe(df):
            return pd.DataFrame({'A': [1] * 5000})

        def return_none(df):
            return None

        def return_empty_df(df):
            return pd.DataFrame()

        with pytest.raises(RuntimeError):
            print(testrel.map(evil1).df())

        with pytest.raises(RuntimeError):
            print(testrel.map(evil2).df())

        with pytest.raises(RuntimeError):
            print(testrel.map(evil3).df())

        with pytest.raises(AttributeError):
            print(testrel.map(evil4).df())

        with pytest.raises(RuntimeError):
            print(testrel.map(evil5).df())

        # not a function
        with pytest.raises(TypeError):
            print(testrel.map(42).df())

        # nothing passed to map
        with pytest.raises(TypeError):
            print(testrel.map().df())

        testrel.map(return_dataframe).df().equals(pd.DataFrame({'A': [1]}))

        with pytest.raises(Exception):
            testrel.map(return_big_dataframe).df()

        empty_rel.map(return_dataframe).df().equals(pd.DataFrame({'A': []}))

        with pytest.raises(Exception):
            testrel.map(return_none).df()

        with pytest.raises(Exception):
            testrel.map(return_empty_df).df()
Example #26
 def test_order_operator(self, duckdb_cursor):
     conn = duckdb.connect()
     rel = get_relation(conn)
     assert rel.order('j').execute().fetchall() == [(4, 'four'), (1, 'one'), (3, 'three'), (2, 'two')]
Example #27
 def test_length(self, duckdb_cursor):
     con = duckdb.connect()
     rel = initialize(con)
     assert len(rel) == 3
Example #28
 def test_limit_operator(self, duckdb_cursor):
     conn = duckdb.connect()
     rel = get_relation(conn)
     assert rel.limit(2).execute().fetchall() == [(1, 'one'), (2, 'two')]
Example #29
import duckdb

con = duckdb.connect('robust04db_indexed')
c = con.cursor()

c.execute("COPY dict FROM 'dict.csv'  DELIMITER '|'")

c.close()
con.close()
Example #30
import os

import duckdb


def run(dbFilename,dataFolder):
  print("connecting to duckdb")
  conn = duckdb.connect(dbFilename)
  cursor = conn.cursor()

  '''
COPY INTO supplier from '${data_folder}/sf_${scale_factor}/supplier.tbl' USING DELIMITERS '|', '|\n';
  '''
  print("loading customer")
  cursor.execute("DROP TABLE IF EXISTS customer");
  cursor.execute("""
    CREATE TABLE if not exists customer (
      c_custkey    INT,--numeric identifier
      c_name       STRING,     -- varchar(25), --variable text, size 25 'customer'||custkey
      c_address    STRING,     -- varchar(25), --variable text, size 25 (city below)
      c_city       STRING,     -- varchar(10), --fixed text, size 10 (10/nation: nation_prefix||(0-9)
      c_nation     STRING,     -- varchar(15), --fixed text(15) (25 values, longest united kingdom)
      c_region     STRING,     -- varchar(12), --fixed text, size 12 (5 values: longest middle east)
      c_phone      STRING,     -- varchar(15), --fixed text, size 15 (many values, format: 43-617-354-1222)
      c_mktsegment STRING,     -- varchar(10) --fixed text, size 10 (longest is automobile)
      PRIMARY KEY (c_custkey)
    );
        """)
  data_file = str(os.path.join(dataFolder,'customer.tbl'))
  cursor.execute("copy customer from '"+data_file+"' (delimiter '|')")

  print("loading date_")
  cursor.execute("DROP TABLE IF EXISTS date_");
  cursor.execute("""
    CREATE TABLE if not exists date_ (
      d_datekey          INT,     -- identifier, unique id -- e.g. 19980327 (what we use)
      d_date             STRING,  -- varchar(18), --fixed text, size 18, longest: december 22, 1998
      d_dayofweek        STRING,  -- varchar(8), --fixed text, size 8, sunday, monday, ..., saturday)
      d_month            STRING,  -- varchar(9), --fixed text, size 9: january, ..., december
      d_year             INT,     -- unique value 1992-1998
      d_yearmonthnum     INT,     -- numeric (yyyymm) -- e.g. 199803
      d_yearmonth        STRING,  -- varchar(7), --fixed text, size 7: mar1998 for example
      d_daynuminweek     INT,     -- numeric 1-7
      d_daynuminmonth    INT,     -- numeric 1-31
      d_daynuminyear     INT,     -- numeric 1-366
      d_monthnuminyear   INT,     -- numeric 1-12
      d_weeknuminyear    INT,     -- numeric 1-53
      d_sellingseason    STRING,  -- varchar(12), --text, size 12 (christmas, summer,...)
      d_lastdayinweekfl  INT,     -- 1 bit
      d_lastdayinmonthfl INT,     -- 1 bit
      d_holidayfl        INT,     -- 1 bit
      d_weekdayfl        INT,     -- 1 bit
      PRIMARY KEY (d_datekey) 
    );
        """)
  data_file = str(os.path.join(dataFolder,'date.tbl'))
  cursor.execute("copy date_ from '"+data_file+"' (delimiter '|') ")

  print("loading part")
  cursor.execute("DROP TABLE IF EXISTS part");
  cursor.execute("""
    CREATE TABLE if not exists part (
      p_partkey   INT,        -- identifier
      p_name      STRING,     -- varchar(22), --variable text, size 22 (not unique per part but never was)
      p_mfgr      STRING,     -- varchar(6), --fixed text, size 6 (mfgr#1-5, card = 5)
      p_category  STRING,     -- varchar(7), --fixed text, size 7 ('mfgr#'||1-5||1-5: card = 25)
      p_brand1    STRING,     -- varchar(9), --fixed text, size 9 (category||1-40: card = 1000)
      p_color     STRING,     -- varchar(11), --variable text, size 11 (card = 94)
      p_type      STRING,     -- varchar(25), --variable text, size 25 (card = 150)
      p_size      INT,        -- numeric 1-50 (card = 50)
      p_container STRING,     -- varchar(15) --fixed text(10) (card = 40)
      PRIMARY KEY (p_partkey)
    );
        """)
  data_file = str(os.path.join(dataFolder,'part.tbl'))
  cursor.execute("copy part from '"+data_file+"' (delimiter '|') ")
 
  print("loading supplier")
  cursor.execute("DROP TABLE IF EXISTS supplier");
  cursor.execute("""
    CREATE TABLE if not exists supplier (
      s_suppkey INT,     -- identifier
      s_name    STRING,  -- varchar(25), --fixed text, size 25: 'supplier'||suppkey
      s_address STRING,  -- varchar(25), --variable text, size 25 (city below)
      s_city    STRING,  -- varchar(10), --fixed text, size 10 (10/nation: nation_prefix||(0-9))
      s_nation  STRING,  -- varchar(15), --fixed text(15) (25 values, longest united kingdom)
      s_region  STRING,  -- varchar(12), --fixed text, size 12 (5 values: longest middle east)
      s_phone   STRING,  -- varchar(15) --fixed text, size 15 (many values, format: 43-617-354-1222)
      PRIMARY KEY (s_suppkey)
    );
        """)
  data_file = str(os.path.join(dataFolder,'supplier.tbl'))
  cursor.execute("copy supplier from '"+data_file+"' (delimiter '|') ")

  print("loading lineorder")
  cursor.execute("DROP TABLE IF EXISTS lineorder");
  cursor.execute("""
    CREATE TABLE if not exists lineorder (
      lo_orderkey      INT,     -- numeric (int up to sf 300) first 8 of each 32 keys used
      lo_linenumber    INT,     -- numeric 1-7
      lo_custkey       INT,     -- numeric identifier foreign key reference to c_custkey
      lo_partkey       INT,     -- identifier foreign key reference to p_partkey
      lo_suppkey       INT,     -- numeric identifier foreign key reference to s_suppkey
      lo_orderdate     INT,     -- identifier foreign key reference to d_datekey
      lo_orderpriority STRING,  -- varchar(15), --fixed text, size 15 (5 priorities: 1-urgent, etc.)
      lo_shippriority  STRING,  -- varchar(1), --fixed text, size 1
      lo_quantity      INT,     -- numeric 1-50 (for part)
      lo_extendedprice INT,     -- numeric, max about 55,450 (for part)
      lo_ordtotalprice INT,     -- numeric, max about 388,000 (for order)
      lo_discount      INT,     -- numeric 0-10 (for part) -- (represents percent)
      lo_revenue       INT,     -- numeric (for part: (extendedprice*(100-discount))/100)
      lo_supplycost    INT,     -- numeric (for part, cost from supplier, max = ?)
      lo_tax           INT,     -- numeric 0-8 (for part)
      lo_commitdate    INT,     -- foreign key reference to d_datekey
      lo_shipmode      STRING  -- varchar(10) --fixed text, size 10 (modes: reg air, air, etc.)
    );
        """)
  # duckdb does not currently support compound primary keys, so removed it
  '''
      PRIMARY KEY (lo_orderkey, lo_linenumber) --Compound Primary Key: ORDERKEY, LINENUMBER
  '''
  # duckdb does not currently support foreign keys, so removed them
  '''
      FOREIGN KEY (lo_orderdate)  REFERENCES date_    (d_datekey), --identifier foreign key reference to D_DATEKEY
      FOREIGN KEY (lo_commitdate) REFERENCES date_    (d_datekey), --Foreign Key reference to D_DATEKEY
      FOREIGN KEY (lo_suppkey)    REFERENCES supplier (s_suppkey), --numeric identifier foreign key reference to S_SUPPKEY
      FOREIGN KEY (lo_custkey)    REFERENCES customer (c_custkey)  --numeric identifier foreign key reference 
  '''
  data_file = str(os.path.join(dataFolder,'lineorder.tbl'))
  cursor.execute("copy lineorder from '"+data_file+"' (delimiter '|') ")