def compare_results(query):
    true_answer = duckdb.query(query).fetchall()
    # Round-trip the same query through Arrow and back into DuckDB,
    # then check that the result is unchanged.
    arrow_table = duckdb.query(query).arrow()
    from_arrow = duckdb.from_arrow_table(arrow_table).fetchall()
    assert true_answer == from_arrow
def test_object_integer(self, duckdb_cursor):
    df_in = pd.DataFrame({
        'int8': pd.Series([None, 1, -1], dtype="Int8"),
        'int16': pd.Series([None, 1, -1], dtype="Int16"),
        'int32': pd.Series([None, 1, -1], dtype="Int32"),
        'int64': pd.Series([None, 1, -1], dtype="Int64")
    })
    # Nullable integer columns come back as float64, with the NULL masked out.
    df_expected_res = pd.DataFrame({
        'int8': np.ma.masked_array([0, 1, -1], mask=[True, False, False], dtype='float64'),
        'int16': np.ma.masked_array([0, 1, -1], mask=[True, False, False], dtype='float64'),
        'int32': np.ma.masked_array([0, 1, -1], mask=[True, False, False], dtype='float64'),
        'int64': np.ma.masked_array([0, 1, -1], mask=[True, False, False], dtype='float64'),
    })
    df_out = duckdb.query(df_in, "data", "SELECT * FROM data").df()
    pd.testing.assert_frame_equal(df_expected_res, df_out)
def compare_results(query, list_values=None):
    # Avoid a mutable default argument.
    if list_values is None:
        list_values = []
    df_duck = duckdb.query(query).df()
    # The query is expected to expose its result column as 'a'.
    for counter, duck_value in enumerate(df_duck['a']):
        assert duck_value == list_values[counter]
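# A brief usage sketch for the helper above (illustrative, not from the source):
# the struct literal is hypothetical; the result column must be named 'a'.
compare_results("SELECT {'i': 1, 'j': 2} AS a", [{'i': 1, 'j': 2}])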
def test_query(self, duckdb_cursor):
    conn = duckdb.connect()
    conn.execute("create table t (a integer)")
    conn.execute("insert into t values (1)")
    assert duckdb.query("select count(*) from t", connection=conn).execute().fetchall()[0] == (1,)
    assert duckdb.from_query("select count(*) from t", connection=conn).execute().fetchall()[0] == (1,)
def test_fromquery(self, duckdb_cursor):
    assert duckdb.from_query('select 42').fetchone()[0] == 42
    assert duckdb.query('select 43').fetchone()[0] == 43
    # assert duckdb_cursor.from_query('select 44').execute().fetchone()[0] == 44
    # assert duckdb_cursor.from_query('select 45').execute().fetchone()[0] == 45
    # cursor = duckdb.connect().cursor()
    # TestRelationApi().test_readonly(cursor)
def test_category_simple(self, duckdb_cursor):
    df_in = pd.DataFrame({
        'float': [1.0, 2.0, 1.0],
        'string': pd.Series(["foo", "bar", "foo"], dtype="category"),
        'int': pd.Series([1, 2, 1], dtype="category")
    })
    df_out = duckdb.query(df_in, "data", "SELECT * FROM data").df()
    assert numpy.all(df_out['float'] == numpy.array([1.0, 2.0, 1.0]))
    assert numpy.all(df_out['string'] == numpy.array(["foo", "bar", "foo"]))
    assert numpy.all(df_out['int'] == numpy.array([1, 2, 1]))
def test_category_nulls(self, duckdb_cursor):
    df_in = pd.DataFrame({
        'string': pd.Series(["foo", "bar", None], dtype="category"),
        'int': pd.Series([1, 2, None], dtype="category")
    })
    df_out = duckdb.query(df_in, "data", "SELECT * FROM data").df()
    assert df_out['string'][0] == "foo"
    assert df_out['string'][1] == "bar"
    assert numpy.isnan(df_out['string'][2])
    assert df_out['int'][0] == 1
    assert df_out['int'][1] == 2
    assert numpy.isnan(df_out['int'][2])
def test_timestamp_tz(self, duckdb_cursor):
    df_in = pd.DataFrame({
        'datetime': [pd.Timestamp('20180310T11:17:54Z')],
        'string': ['foo']
    })
    # Timezone-aware timestamps are expected back as naive timestamps.
    df_expected_res = pd.DataFrame({
        'datetime': [pd.Timestamp('20180310T11:17:54')],
        'string': ['foo']
    })
    df_out = duckdb.query(df_in, "data", "SELECT * FROM data").df()
    pd.testing.assert_frame_equal(df_expected_res, df_out)
def test_lists_basic(self, duckdb_cursor):
    if not can_run:
        return
    # Constant list
    query = duckdb.query(
        "SELECT a from (select list_value(3,5,10) as a) as t").arrow()['a'].to_numpy()
    assert query[0][0] == 3
    assert query[0][1] == 5
    assert query[0][2] == 10
    # Empty list
    query = duckdb.query(
        "SELECT a from (select list_value() as a) as t").arrow()['a'].to_numpy()
    assert len(query[0]) == 0
    # Constant list with NULL
    query = duckdb.query(
        "SELECT a from (select list_value(3,NULL) as a) as t").arrow()['a'].to_numpy()
    assert query[0][0] == 3
    assert np.isnan(query[0][1])
def test_duckdb_query(self, duckdb_cursor):
    # we can use duckdb.query to run both DDL statements and select statements
    duckdb.query('create view v1 as select 42 i')
    rel = duckdb.query('select * from v1')
    assert rel.fetchall()[0][0] == 42
    # also multiple statements
    duckdb.query('create view v2 as select i*2 j from v1; create view v3 as select j * 2 from v2;')
    rel = duckdb.query('select * from v3')
    assert rel.fetchall()[0][0] == 168
    # we can run multiple select statements, but we get no result
    res = duckdb.query('select 42; select 84;')
    assert res is None
def test_pandas_string(self, duckdb_cursor):
    strings = numpy.array(['foo', 'bar', 'baz'])

    # https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html
    df_in = pd.DataFrame({
        'object': pd.Series(strings, dtype='object'),
    })

    # pd.StringDtype is only available since pandas 1.0.0
    if hasattr(pd, 'StringDtype'):
        df_in['string'] = pd.Series(strings, dtype=pd.StringDtype())

    df_out = duckdb.query(df_in, "data", "SELECT * FROM data").df()

    assert numpy.all(df_out['object'] == strings)
    if hasattr(pd, 'StringDtype'):
        assert numpy.all(df_out['string'] == strings)
def _filter_by_sql(df: pd.DataFrame, sql: str) -> pd.DataFrame:
    """
    Filter a Pandas DataFrame using an SQL query.
    The virtual table name is "data", so queries should look like ``SELECT * FROM data;``.

    This implementation is based on DuckDB, so please have a look at its SQL documentation.

    - https://duckdb.org/docs/sql/introduction

    :param df: The DataFrame to filter.
    :param sql: An SQL expression.
    :return: Filtered DataFrame
    """
    import duckdb
    return duckdb.query(df, "data", sql).df()
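# A minimal usage sketch for _filter_by_sql (illustrative, not from the source);
# the DataFrame contents and the threshold are hypothetical.
import pandas as pd

df = pd.DataFrame({"name": ["a", "b", "c"], "value": [1, 2, 3]})
# The virtual table is always called "data", as documented above.
print(_filter_by_sql(df, "SELECT * FROM data WHERE value >= 2"))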
def arrow_to_pandas(query):
    # Run the query, convert the Arrow result to pandas, and return column 'a' as a list.
    return duckdb.query(query).arrow().to_pandas()['a'].values.tolist()
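# Hedged usage example (not from the source): the inline VALUES query is
# illustrative; the helper requires the result column to be named 'a'.
assert arrow_to_pandas("SELECT * FROM (VALUES (1), (2), (3)) t(a)") == [1, 2, 3]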
def run():
    """
    Usage:
      phenodata info
      phenodata list-species --source=dwd [--format=csv]
      phenodata list-phases --source=dwd [--format=csv]
      phenodata list-stations --source=dwd --dataset=immediate [--all] [--filter=berlin] [--sort=Stationsname] [--format=csv]
      phenodata nearest-station --source=dwd --dataset=immediate --latitude=52.520007 --longitude=13.404954 [--format=csv]
      phenodata nearest-stations --source=dwd --dataset=immediate [--all] --latitude=52.520007 --longitude=13.404954 [--limit=10] [--format=csv]
      phenodata list-quality-levels --source=dwd [--format=csv]
      phenodata list-quality-bytes --source=dwd [--format=csv]
      phenodata list-filenames --source=dwd --dataset=immediate --partition=recent [--filename=Hasel,Schneegloeckchen] [--year=2017]
      phenodata list-urls --source=dwd --dataset=immediate --partition=recent [--filename=Hasel,Schneegloeckchen] [--year=2017]
      phenodata (observations|forecast) --source=dwd --dataset=immediate --partition=recent [--filename=Hasel,Schneegloeckchen] [--station-id=164,717] [--species-id=113,127] [--phase-id=5] [--quality-level=10] [--quality-byte=1,2,3] [--station=berlin,brandenburg] [--species=hazel,snowdrop] [--species-preset=mellifera-de-primary] [--phase=flowering] [--quality=ROUTKLI] [--year=2017] [--forecast-year=2021] [--humanize] [--show-ids] [--language=german] [--long-station] [--sort=Datum] [--sql=sql] [--format=csv] [--verbose]
      phenodata drop-cache --source=dwd
      phenodata --version
      phenodata (-h | --help)

    Data acquisition options:
      --source=<source>          Data source. Currently "dwd" only.
      --dataset=<dataset>        Data set. Use "immediate" or "annual" for --source=dwd.
      --partition=<partition>    Partition. Use "recent" or "historical" for --source=dwd.
      --filename=<file>          Filter by file names (comma-separated list)

    Direct filtering options:
      --year=<year>              Filter by year (comma-separated list)
      --station-id=<station-id>  Filter by station ids (comma-separated list)
      --species-id=<species-id>  Filter by species ids (comma-separated list)
      --phase-id=<phase-id>      Filter by phase ids (comma-separated list)

    Humanized filtering options:
      --station=<station>        Filter by strings from "stations" data (comma-separated list)
      --species=<species>        Filter by strings from "species" data (comma-separated list)
      --phase=<phase>            Filter by strings from "phases" data (comma-separated list)
      --species-preset=<preset>  Filter by strings from "species" data (comma-separated list)
                                 The preset will get loaded from the ``presets.json`` file.

    Forecasting options:
      --forecast-year=<year>     Use as designated forecast year.

    Postprocess filtering options:
      --sql=<sql>                Apply given SQL query before output

    Data output options:
      --format=<format>          Output data in designated format. Choose one of
                                 "tabular", "json", "csv" or "string". With "tabular",
                                 it is also possible to specify the table format,
                                 see https://bitbucket.org/astanin/python-tabulate,
                                 e.g. "tabular:presto". [default: tabular:psql]
      --sort=<sort>              Sort by given column names (comma-separated list)
      --humanize                 Resolve ID-based columns to real names with
                                 "observations" and "forecast" output.
      --show-ids                 Show IDs alongside resolved text representation
                                 when using ``--humanize``.
      --language=<language>      Use labels in designated language when using
                                 ``--humanize`` [default: english].
      --long-station             Use long station name including "Naturraumgruppe"
                                 and "Naturraum".
      --limit=<limit>            Limit output of "nearest-stations" to designated
                                 number of entries.
                                 [default: 10]
      --verbose                  Turn on verbose output
    """

    # Use generic commandline options schema and amend with current program name
    commandline_schema = run.__doc__

    # Read commandline options
    options = docopt(commandline_schema, version=APP_NAME + ' ' + __version__)

    # Initialize logging
    boot_logging(options)

    # Normalize commandline options
    options = normalize_options(options, encoding='utf-8')

    # Expand options
    preset_name = options['species-preset']
    if preset_name:
        options['species'] = DwdPhenoData.load_preset('options', 'species', preset_name)

    # Coerce comma-separated list fields
    options_convert_lists(options, list_items=[
        # Acquisition parameters
        'filename',
        # Filter parameters
        'year',
        # ID parameters
        'quality-level', 'quality-byte', 'station-id', 'species-id', 'phase-id',
        # Humanized parameters
        'quality', 'station', 'species', 'phase',
        # Sorting parameters
        'sort',
    ])

    # Command line argument debugging
    # import pprint; print('options:\n{}'.format(pprint.pformat(options)))

    if options['info']:
        print('Name: phenodata-{version}'.format(version=__version__))
        print('Description: phenodata is a data acquisition and manipulation toolkit for open access phenology data')
        print('Data sources: DWD')
        # TODO: Add cache location and info
        return

    # Create data source adapter
    if options['source'] == 'dwd':
        cdc_client = DwdCdcClient(ftp=FTPSession())
        humanizer = DwdPhenoDataHumanizer(
            language=options['language'],
            long_station=options['long-station'],
            show_ids=options['show-ids'])
        client = DwdPhenoData(cdc=cdc_client, humanizer=humanizer, dataset=options.get('dataset'))
    else:
        message = 'Data source "{}" not implemented'.format(options['source'])
        logger.error(message)
        raise DocoptExit(message)

    # Dispatch command
    data = None
    if options['list-species']:
        data = client.get_species()
    elif options['list-phases']:
        data = client.get_phases()
    elif options['list-stations']:
        data = client.get_stations(filter=options['filter'], all=options['all'])
    elif options['list-quality-levels']:
        data = client.get_quality_levels()
    elif options['list-quality-bytes']:
        data = client.get_quality_bytes()
    elif options['list-filenames']:
        files = client.scan_files(options['partition'], include=options['filename'], field='name')
        print('\n'.join(files))
        return
    elif options['list-urls']:
        files = client.scan_files(options['partition'], include=options['filename'], field='url')
        print('\n'.join(files))
        return
    elif options['observations']:
        data = client.get_observations(options, humanize=options['humanize'])
    elif options['forecast']:
        data = client.get_forecast(options, forecast_year=options['forecast-year'], humanize=options['humanize'])
    elif options['nearest-station']:
        data = client.nearest_station(float(options['latitude']), float(options['longitude']), all=options['all'])
    elif options['nearest-stations']:
        data = client.nearest_stations(
            float(options['latitude']), float(options['longitude']),
            all=options['all'], limit=int(options['limit']))
    elif options['drop-cache']:
        client.cdc.ftp.ensure_cache_manager()
        if client.cdc.ftp.cache.drop():
            logger.info('Dropping the cache succeeded')
        else:
            logger.warning('Dropping the cache failed')
        return

    # Postprocess results with SQL, using a virtual table called "data"
    if data is not None and options["sql"]:
        import duckdb
        data = duckdb.query(data, "data", options["sql"]).df()

    # Format and output results
    if data is not None:
        output_format = options['format']

        # Whether to show the index column or not
        showindex = True
        if options['observations'] or options['forecast']:
            showindex = False

        # Sort columns
        if options['sort']:
            data.sort_values(options['sort'], inplace=True)
        output = None
        if output_format.startswith('tabular'):
            try:
                tablefmt = output_format.split(':')[1]
            except IndexError:
                tablefmt = 'psql'
            # TODO: How to make "tabulate" print the index column name?
            output = tabulate(data, headers=data.columns, showindex=showindex, tablefmt=tablefmt)

        elif output_format == 'csv':
            output = data.to_csv(encoding='utf-8', index=showindex)

        elif output_format == 'json':
            output = data.to_json(orient='table', date_format='iso')

        elif output_format == 'string':
            output = data.to_string()

        else:
            message = 'Unknown output format "{}"'.format(output_format)
            logger.error(message)
            sys.exit(1)

        if output is not None:
            if sys.version_info.major == 2:
                print(output.encode('utf-8'))
            else:
                print(output)
        else:
            logger.warning('Empty output')
# (continued from earlier in the script: `conn` is a DuckDB connection,
# `rel` a relation, and `test_df` a pandas DataFrame)

# convert a relation back to a pandas data frame
print(rel.to_df())
# df() is shorthand for to_df() on relations
print(rel.df())

# create a table in duckdb from the relation
print(rel.create("test_table2"))

# insert the relation's data into an existing table
conn.execute("CREATE TABLE test_table3 (i INTEGER, j STRING)")
print(rel.insert("test_table3"))

# create a SQL-accessible view of the relation
print(rel.create_view('test_view'))

# we can also directly run SQL queries on relation objects without explicitly creating a view;
# the first parameter gives the rel object a view name so we can refer to it in queries
res = rel.query('my_name_for_rel', 'SELECT * FROM my_name_for_rel')
print(res)

# res is a query result; we can fetch with the methods described above, e.g.
print(res.fetchone())
print(res.fetchdf())
# or just use df(), a shorthand for fetchdf() on query results
print(res.df())

# this also works directly on data frames
res = duckdb.query(test_df, 'my_name_for_test_df', 'SELECT * FROM my_name_for_test_df')
print(res.df())
def pandas_replacement():
    # DuckDB's replacement scan lets SQL reference the local DataFrame `df` by name.
    df = pd.DataFrame({"x": np.random.rand(1_000_000)})
    duckdb.query("select sum(x) from df").fetchall()
def test_nested_mix(self, duckdb_cursor):
    # List of structs, with a struct that is NULL entirely
    compare_results(
        "SELECT [{'i':1,'j':2},NULL,{'i':2,'j':NULL}] as a",
        [[{'i': 1, 'j': 2}, None, {'i': 2, 'j': None}]])

    # Lists of structs with lists
    compare_results(
        "SELECT [{'i':1,'j':[2,3]},NULL] as a",
        [[{'i': 1, 'j': [2, 3]}, None]])

    # Maps embedded in a struct
    compare_results(
        "SELECT {'i':mp,'j':mp2} as a FROM (SELECT MAP(LIST_VALUE(1, 2, 3, 4),LIST_VALUE(10, 9, 8, 7)) as mp, MAP(LIST_VALUE(1, 2, 3, 5),LIST_VALUE(10, 9, 8, 7)) as mp2) as t",
        [{'i': {'key': [1, 2, 3, 4], 'value': [10, 9, 8, 7]},
          'j': {'key': [1, 2, 3, 5], 'value': [10, 9, 8, 7]}}])

    # List of maps
    compare_results(
        "SELECT [mp,mp2] as a FROM (SELECT MAP(LIST_VALUE(1, 2, 3, 4),LIST_VALUE(10, 9, 8, 7)) as mp, MAP(LIST_VALUE(1, 2, 3, 5),LIST_VALUE(10, 9, 8, 7)) as mp2) as t",
        [[{'key': [1, 2, 3, 4], 'value': [10, 9, 8, 7]},
          {'key': [1, 2, 3, 5], 'value': [10, 9, 8, 7]}]])

    # Map with lists as keys and values
    compare_results(
        "SELECT MAP(LIST_VALUE([1,2],[3,4],[5,4]),LIST_VALUE([1,2],[3,4],[5,4])) as a",
        [{'key': [[1, 2], [3, 4], [5, 4]], 'value': [[1, 2], [3, 4], [5, 4]]}])

    # Map with structs as keys and values
    compare_results(
        "SELECT MAP(LIST_VALUE({'i':1,'j':2},{'i':3,'j':4}),LIST_VALUE({'i':1,'j':2},{'i':3,'j':4})) as a",
        [{'key': [{'i': 1, 'j': 2}, {'i': 3, 'j': 4}],
          'value': [{'i': 1, 'j': 2}, {'i': 3, 'j': 4}]}])

    # Null checks on lists with structs
    compare_results(
        "SELECT [{'i':1,'j':[2,3]},NULL,{'i':1,'j':[2,3]}] as a",
        [[{'i': 1, 'j': [2, 3]}, None, {'i': 1, 'j': [2, 3]}]])

    # Struct that is NULL entirely
    df_duck = duckdb.query(
        "SELECT col0 as a FROM (VALUES ({'i':1,'j':2}), (NULL), ({'i':1,'j':2}), (NULL))").df()
    duck_values = df_duck['a']
    assert duck_values[0] == {'i': 1, 'j': 2}
    assert np.isnan(duck_values[1])
    assert duck_values[2] == {'i': 1, 'j': 2}
    assert np.isnan(duck_values[3])

    # MAP that is NULL entirely
    df_duck = duckdb.query(
        "SELECT col0 as a FROM (VALUES (MAP(LIST_VALUE(1,2),LIST_VALUE(3,4))),(NULL), (MAP(LIST_VALUE(1,2),LIST_VALUE(3,4))), (NULL))").df()
    duck_values = df_duck['a']
    assert duck_values[0] == {'key': [1, 2], 'value': [3, 4]}
    assert np.isnan(duck_values[1])
    assert duck_values[2] == {'key': [1, 2], 'value': [3, 4]}
    assert np.isnan(duck_values[3])
def test_2273(self, duckdb_cursor):
    # The DataFrame `df_in` is picked up via replacement scan by its variable name.
    df_in = pd.DataFrame([[datetime.date(1992, 7, 30)]])
    assert duckdb.query("Select * from df_in").fetchall() == [('1992-07-30',)]
def arrow_replacement():
    # The replacement scan also works on pyarrow Tables referenced by variable name.
    data = pa.array(np.random.rand(1_000_000), type=pa.float32())
    arrow_table = pa.Table.from_arrays([data], ['a'])
    duckdb.query("select sum(a) from arrow_table").fetchall()