예제 #1
0
def compare_results(query):
    """Assert that fetching *query* directly and via an Arrow round-trip
    through DuckDB yields identical rows.

    :param query: SQL text to execute.
    """
    true_answer = duckdb.query(query).fetchall()
    # Materialize the Arrow table once and reuse it. The original ran the
    # query three times and bound one result to an unused variable `t`.
    arrow_table = duckdb.query(query).arrow()
    from_arrow = duckdb.from_arrow_table(arrow_table).fetchall()

    assert true_answer == from_arrow
예제 #2
0
 def test_object_integer(self, duckdb_cursor):
     """Nullable pandas integer columns (Int8..Int64) round-trip through
     DuckDB as float64 columns where the missing entry becomes masked/NaN."""
     dtypes = ["Int8", "Int16", "Int32", "Int64"]
     df_in = pd.DataFrame({
         dtype.lower(): pd.Series([None, 1, -1], dtype=dtype)
         for dtype in dtypes
     })
     # All four columns share the same expected masked payload: the None
     # slot is masked out, the concrete values survive as floats.
     expected_column = np.ma.masked_array([0, 1, -1],
                                          mask=[True, False, False],
                                          dtype='float64')
     df_expected_res = pd.DataFrame({
         dtype.lower(): expected_column for dtype in dtypes
     })
     df_out = duckdb.query(df_in, "data", "SELECT * FROM data").df()
     pd.testing.assert_frame_equal(df_expected_res, df_out)
예제 #3
0
def compare_results(query, list_values=None):
    """Run *query* through DuckDB and assert that column ``a`` of the
    resulting data frame equals *list_values* element-wise.

    :param query: SQL text to execute.
    :param list_values: expected values for column ``a``; defaults to an
        empty list.
    """
    # Bug fix: the original used a mutable default argument
    # (``list_values=[]``), which is shared across all calls. Use the
    # ``None`` sentinel idiom instead.
    expected = [] if list_values is None else list_values
    df_duck = duckdb.query(query).df()
    # enumerate replaces the hand-rolled counter; an IndexError still
    # surfaces if the query yields more rows than expected, as before.
    for counter, duck_value in enumerate(df_duck['a']):
        assert duck_value == expected[counter]
예제 #4
0
 def test_query(self, duckdb_cursor):
     """Both duckdb.query and duckdb.from_query accept an explicit
     connection and run against that connection's catalog."""
     conn = duckdb.connect()
     conn.execute("create table t (a integer)")
     conn.execute("insert into t values (1)")
     # The two module-level entry points must behave identically here.
     for entry_point in (duckdb.query, duckdb.from_query):
         rel = entry_point("select count(*) from t", connection=conn)
         assert rel.execute().fetchall()[0] == (1, )
예제 #5
0
    def test_fromquery(self, duckdb_cursor):
        """The module-level query helpers return relations that can be
        fetched from directly."""
        assert duckdb.from_query('select 42').fetchone()[0] == 42
        result = duckdb.query('select 43').fetchone()
        assert result[0] == 43

        # assert duckdb_cursor.from_query('select 44').execute().fetchone()[0] == 44
        # assert duckdb_cursor.from_query('select 45').execute().fetchone()[0] == 45
        # assert duckdb_cursor.from_query('select 45').execute().fetchone()[0] == 45


# cursor = duckdb.connect().cursor()
# TestRelationApi().test_readonly(cursor)
예제 #6
0
    def test_category_simple(self, duckdb_cursor):
        """Categorical pandas columns come back from DuckDB holding their
        plain (non-categorical) values."""
        df_in = pd.DataFrame({
            'float': [1.0, 2.0, 1.0],
            'string': pd.Series(["foo", "bar", "foo"], dtype="category"),
            'int': pd.Series([1, 2, 1], dtype="category")
        })

        df_out = duckdb.query(df_in, "data", "SELECT * FROM data").df()
        # Compare each output column against its expected dense values.
        expected = {
            'float': numpy.array([1.0, 2.0, 1.0]),
            'string': numpy.array(["foo", "bar", "foo"]),
            'int': numpy.array([1, 2, 1]),
        }
        for column, values in expected.items():
            assert numpy.all(df_out[column] == values)
예제 #7
0
    def test_category_nulls(self, duckdb_cursor):
        """NULL entries in categorical columns surface as NaN after the
        round-trip through DuckDB."""
        df_in = pd.DataFrame({
            'string': pd.Series(["foo", "bar", None], dtype="category"),
            'int': pd.Series([1, 2, None], dtype="category")
        })

        df_out = duckdb.query(df_in, "data", "SELECT * FROM data").df()
        strings = df_out['string']
        assert strings[0] == "foo"
        assert strings[1] == "bar"
        assert numpy.isnan(strings[2])

        ints = df_out['int']
        assert ints[0] == 1
        assert ints[1] == 2
        assert numpy.isnan(ints[2])
예제 #8
0
 def test_timestamp_tz(self, duckdb_cursor):
     """A timezone-aware timestamp round-trips as its naive equivalent.

     The input carries a UTC marker ('Z'); the frame coming back from
     DuckDB compares equal to the same wall-clock time without tzinfo.
     """
     df_in = pd.DataFrame({
         'datetime': [pd.Timestamp('20180310T11:17:54Z')],
         'string': ['foo']
     })
     df_expected_res = pd.DataFrame({
         'datetime': [pd.Timestamp('20180310T11:17:54')],
         'string': ['foo']
     })
     # Leftover debug print() calls removed: assert_frame_equal already
     # reports the full mismatch detail on failure.
     df_out = duckdb.query(df_in, "data", "SELECT * FROM data").df()
     pd.testing.assert_frame_equal(df_expected_res, df_out)
예제 #9
0
    def test_lists_basic(self, duckdb_cursor):
        """LIST values fetched through Arrow arrive as one array per row."""
        if not can_run:
            return

        def fetch_a(sql):
            # Run the query, convert to an Arrow table, pull column 'a'
            # out as a numpy array of per-row list payloads.
            return duckdb.query(sql).arrow()['a'].to_numpy()

        # Constant list
        rows = fetch_a("SELECT a from (select list_value(3,5,10) as a) as t")
        assert rows[0][0] == 3
        assert rows[0][1] == 5
        assert rows[0][2] == 10

        # Empty list
        rows = fetch_a("SELECT a from (select list_value() as a) as t")
        assert len(rows[0]) == 0

        # Constant list containing a NULL element
        rows = fetch_a("SELECT a from (select list_value(3,NULL) as a) as t")
        assert rows[0][0] == 3
        assert np.isnan(rows[0][1])
예제 #10
0
    def test_duckdb_query(self, duckdb_cursor):
        """duckdb.query runs DDL, selects, and multi-statement strings."""
        # DDL, then a select against the freshly created view.
        duckdb.query('create view v1 as select 42 i')
        assert duckdb.query('select * from v1').fetchall()[0][0] == 42

        # Several statements in a single call are also accepted.
        duckdb.query('create view v2 as select i*2 j from v1; create view v3 as select j * 2 from v2;')
        assert duckdb.query('select * from v3').fetchall()[0][0] == 168

        # Multiple select statements run, but produce no result object.
        res = duckdb.query('select 42; select 84;')
        assert res is None
예제 #11
0
    def test_pandas_string(self, duckdb_cursor):
        """Both object-dtype and pandas StringDtype columns round-trip
        through DuckDB unchanged."""
        strings = numpy.array(['foo', 'bar', 'baz'])

        # https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html
        df_in = pd.DataFrame({
            'object': pd.Series(strings, dtype='object'),
        })
        # pd.StringDtype only exists from pandas 1.0.0 onwards.
        has_string_dtype = hasattr(pd, 'StringDtype')
        if has_string_dtype:
            df_in['string'] = pd.Series(strings, dtype=pd.StringDtype())

        df_out = duckdb.query(df_in, "data", "SELECT * FROM data").df()

        assert numpy.all(df_out['object'] == strings)
        if has_string_dtype:
            assert numpy.all(df_out['string'] == strings)
예제 #12
0
    def _filter_by_sql(df: pd.DataFrame, sql: str) -> pd.DataFrame:
        """
        Filter a Pandas DataFrame with an SQL query.

        The frame is exposed under the virtual table name "data", so
        queries should look like ``SELECT * FROM data;``.

        Backed by DuckDB; see its SQL documentation for the dialect:

        - https://duckdb.org/docs/sql/introduction

        :param df: DataFrame to filter.
        :param sql: A SQL expression.
        :return: Filtered DataFrame
        """
        # Imported lazily so duckdb is only required when SQL filtering
        # is actually used.
        import duckdb

        relation = duckdb.query(df, "data", sql)
        return relation.df()
예제 #13
0
def arrow_to_pandas(query):
    """Run *query*, convert the result to pandas via Arrow, and return
    column ``a`` as a plain Python list."""
    arrow_table = duckdb.query(query).arrow()
    column_a = arrow_table.to_pandas()['a']
    return column_a.values.tolist()
예제 #14
0
def run():
    # NOTE: the docstring below is parsed by docopt as the command-line
    # schema at runtime — its exact text is program behavior, not prose.
    """
    Usage:
      phenodata info
      phenodata list-species --source=dwd [--format=csv]
      phenodata list-phases --source=dwd [--format=csv]
      phenodata list-stations --source=dwd --dataset=immediate [--all] [--filter=berlin] [--sort=Stationsname] [--format=csv]
      phenodata nearest-station --source=dwd --dataset=immediate --latitude=52.520007 --longitude=13.404954 [--format=csv]
      phenodata nearest-stations --source=dwd --dataset=immediate [--all] --latitude=52.520007 --longitude=13.404954 [--limit=10] [--format=csv]
      phenodata list-quality-levels --source=dwd [--format=csv]
      phenodata list-quality-bytes --source=dwd [--format=csv]
      phenodata list-filenames --source=dwd --dataset=immediate --partition=recent [--filename=Hasel,Schneegloeckchen] [--year=2017]
      phenodata list-urls --source=dwd --dataset=immediate --partition=recent [--filename=Hasel,Schneegloeckchen] [--year=2017]
      phenodata (observations|forecast) --source=dwd --dataset=immediate --partition=recent [--filename=Hasel,Schneegloeckchen] [--station-id=164,717] [--species-id=113,127] [--phase-id=5] [--quality-level=10] [--quality-byte=1,2,3] [--station=berlin,brandenburg] [--species=hazel,snowdrop] [--species-preset=mellifera-de-primary] [--phase=flowering] [--quality=ROUTKLI] [--year=2017] [--forecast-year=2021] [--humanize] [--show-ids] [--language=german] [--long-station] [--sort=Datum] [--sql=sql] [--format=csv] [--verbose]
      phenodata drop-cache --source=dwd
      phenodata --version
      phenodata (-h | --help)

    Data acquisition options:
      --source=<source>         Data source. Currently "dwd" only.
      --dataset=<dataset>       Data set. Use "immediate" or "annual" for --source=dwd.
      --partition=<dataset>     Partition. Use "recent" or "historical" for --source=dwd.
      --filename=<file>         Filter by file names (comma-separated list)

    Direct filtering options:
      --year=<year>             Filter by year (comma-separated list)
      --station-id=<station-id> Filter by station ids (comma-separated list)
      --species-id=<species-id> Filter by species ids (comma-separated list)
      --phase-id=<phase-id>     Filter by phase ids (comma-separated list)

    Humanized filtering options:
      --station=<station>       Filter by strings from "stations" data (comma-separated list)
      --species=<species>       Filter by strings from "species" data (comma-separated list)
      --phase=<phase>           Filter by strings from "phases" data (comma-separated list)
      --species-preset=<preset> Filter by strings from "species" data (comma-separated list)
                                The preset will get loaded from the ``presets.json`` file.

    Forecasting options:
      --forecast-year=<year>    Use as designated forecast year.

    Postprocess filtering options:
      --sql=<sql>               Apply given SQL query before output

    Data output options:
      --format=<format>         Output data in designated format. Choose one of "tabular", "json", "csv" or "string".
                                With "tabular", it is also possible to specify the table format,
                                see https://bitbucket.org/astanin/python-tabulate. e.g. "tabular:presto".
                                [default: tabular:psql]
      --sort=<sort>             Sort by given column names (comma-separated list)
      --humanize                Resolve ID-based columns to real names with "observations" and "forecast" output.
      --show-ids                Show IDs alongside resolved text representation when using ``--humanize``.
      --language=<language>     Use labels in designated language when using ``--humanize`` [default: english].
      --long-station            Use long station name including "Naturraumgruppe" and "Naturraum".
      --limit=<limit>           Limit output of "nearest-stations" to designated number of entries.
                                [default: 10]
      --verbose                 Turn on verbose output
    """

    # Use generic commandline options schema and amend with current program name
    commandline_schema = run.__doc__

    # Read commandline options
    options = docopt(commandline_schema, version=APP_NAME + ' ' + __version__)

    # Initialize logging
    boot_logging(options)

    # Normalize commandline options
    options = normalize_options(options, encoding='utf-8')

    # Expand options
    # A species preset (e.g. "mellifera-de-primary") expands into a concrete
    # species list loaded from presets.json.
    preset_name = options['species-preset']
    if preset_name:
        options['species'] = DwdPhenoData.load_preset('options', 'species', preset_name)

    # Coerce comma-separated list fields
    options_convert_lists(options, list_items=[

        # Acquisition parameters
        'filename',

        # Filter parameters
        'year',

        # ID parameters
        'quality-level',
        'quality-byte',
        'station-id',
        'species-id',
        'phase-id',

        # Humanized parameters
        'quality',
        'station',
        'species',
        'phase',

        # Sorting parameters
        'sort',
    ])

    # Command line argument debugging
    #import pprint; print 'options:\n{}'.format(pprint.pformat(options))

    if options['info']:
        print('Name:         phenodata-{version}'.format(version=__version__))
        print('Description:  phenodata is a data acquisition and manipulation toolkit for open access phenology data')
        print('Data sources: DWD')
        # TODO: Add cache location and info
        return

    # Create data source adapter
    # Only the DWD backend is implemented; anything else is a usage error.
    if options['source'] == 'dwd':
        cdc_client = DwdCdcClient(ftp=FTPSession())
        humanizer = DwdPhenoDataHumanizer(language=options['language'], long_station=options['long-station'], show_ids=options['show-ids'])
        client = DwdPhenoData(cdc=cdc_client, humanizer=humanizer, dataset=options.get('dataset'))
    else:
        message = 'Data source "{}" not implemented'.format(options['source'])
        logger.error(message)
        raise DocoptExit(message)

    # Dispatch command
    # Commands either populate `data` (rendered below) or print and return.
    data = None
    if options['list-species']:
        data = client.get_species()
    elif options['list-phases']:
        data = client.get_phases()
    elif options['list-stations']:
        data = client.get_stations(filter=options['filter'], all=options['all'])
    elif options['list-quality-levels']:
        data = client.get_quality_levels()
    elif options['list-quality-bytes']:
        data = client.get_quality_bytes()

    elif options['list-filenames']:
        files = client.scan_files(options['partition'], include=options['filename'], field='name')
        print('\n'.join(files))
        return
    elif options['list-urls']:
        files = client.scan_files(options['partition'], include=options['filename'], field='url')
        print('\n'.join(files))
        return

    elif options['observations']:
        data = client.get_observations(options, humanize=options['humanize'])

    elif options['forecast']:
        data = client.get_forecast(options, forecast_year=options['forecast-year'], humanize=options['humanize'])

    elif options['nearest-station']:
        data = client.nearest_station(float(options['latitude']), float(options['longitude']), all=options['all'])

    elif options['nearest-stations']:
        data = client.nearest_stations(float(options['latitude']), float(options['longitude']), all=options['all'], limit=int(options['limit']))

    elif options['drop-cache']:
        client.cdc.ftp.ensure_cache_manager()
        if client.cdc.ftp.cache.drop():
            logger.info('Dropping the cache succeeded')
        else:
            logger.warning('Dropping the cache failed')
        return

    # Query results
    # Optional SQL postprocessing via DuckDB: the frame is exposed to the
    # query under the virtual table name "data".
    if data is not None and options["sql"]:
        import duckdb
        data = duckdb.query(data, "data", options["sql"]).df()

    # Format and output results
    if data is not None:

        output_format = options['format']

        # Whether to show the index column or not
        showindex = True
        if options['observations'] or options['forecast']:
            showindex = False

        # Sort columns
        if options['sort']:
            data.sort_values(options['sort'], inplace=True)

        output = None
        if output_format.startswith('tabular'):

            # "tabular:<fmt>" selects a tabulate table format.
            # NOTE(review): bare except — any parse failure silently falls
            # back to "psql".
            try:
                tablefmt = options['format'].split(':')[1]
            except:
                tablefmt = 'psql'

            # TODO: How to make "tabulate" print index column name?
            output = tabulate(data, headers=data.columns, showindex=showindex, tablefmt=tablefmt)

        elif output_format == 'csv':
            output = data.to_csv(encoding='utf-8', index=showindex)

        elif output_format == 'json':
            output = data.to_json(orient='table', date_format='iso')

        elif output_format == 'string':
            output = data.to_string()

        else:
            message = 'Unknown output format "{}"'.format(options['format'])
            logger.error(message)
            sys.exit(1)

        if output is not None:
            # Python 2 needs explicit encoding before printing to stdout.
            if sys.version_info.major == 2:
                print(output.encode('utf-8'))
            else:
                print(output)
        else:
            logger.warning('Empty output')
예제 #15
0
# NOTE(review): `rel`, `conn` and `test_df` are defined earlier in this
# script, outside the span shown here.

# convert a relation back to a pandas data frame
print(rel.to_df())

# df() is shorthand for to_df() on relations
print(rel.df())

# create a table in duckdb from the relation
print(rel.create("test_table2"))

# insert the relation's data into an existing table
conn.execute("CREATE TABLE test_table3 (i INTEGER, j STRING)")
print(rel.insert("test_table3"))

# create a SQL-accessible view of the relation
print(rel.create_view('test_view'))

# we can also directly run SQL queries on relation objects without explicitly creating a view
# the first parameter gives the rel object a view name so we can refer to it in queries
res = rel.query('my_name_for_rel', 'SELECT * FROM my_name_for_rel')
print(res)
# res is a query result, we can fetch with the methods described above, e.g.
print(res.fetchone())
print(res.fetchdf())
# or just use df(), a shorthand for fetchdf() on query results
print(res.df())

# this also works directly on data frames: the second argument names the
# frame so the SQL text can refer to it
res = duckdb.query(test_df, 'my_name_for_test_df',
                   'SELECT * FROM my_name_for_test_df')
print(res.df())
예제 #16
0
def pandas_replacement():
    """Demonstrate DuckDB's replacement scan over a pandas DataFrame.

    The SQL text references ``df`` by name; DuckDB resolves it to the
    local DataFrame without explicit registration.

    :return: the query result rows (previously computed and discarded).
    """
    df = pd.DataFrame({"x": np.random.rand(1_000_000)})
    # Return the rows instead of throwing them away so callers can
    # inspect the computed sum; existing callers ignoring the return
    # value are unaffected.
    return duckdb.query("select sum(x) from df").fetchall()
예제 #17
0
    def test_nested_mix(self, duckdb_cursor):
        """Nested type combinations (lists, structs, maps) round-trip into
        pandas; container values that are NULL entirely surface as NaN."""
        # List of structs, including a NULL list element and a NULL field
        compare_results("SELECT [{'i':1,'j':2},NULL,{'i':2,'j':NULL}] as a",
                        [[{
                            'i': 1,
                            'j': 2
                        }, None, {
                            'i': 2,
                            'j': None
                        }]])

        # Lists of structs whose fields are themselves lists
        compare_results("SELECT [{'i':1,'j':[2,3]},NULL] as a", [[{
            'i': 1,
            'j': [2, 3]
        }, None]])

        # Maps embedded in a struct
        compare_results(
            "SELECT {'i':mp,'j':mp2} as a FROM (SELECT MAP(LIST_VALUE(1, 2, 3, 4),LIST_VALUE(10, 9, 8, 7)) as mp, MAP(LIST_VALUE(1, 2, 3, 5),LIST_VALUE(10, 9, 8, 7)) as mp2) as t",
            [{
                'i': {
                    'key': [1, 2, 3, 4],
                    'value': [10, 9, 8, 7]
                },
                'j': {
                    'key': [1, 2, 3, 5],
                    'value': [10, 9, 8, 7]
                }
            }])

        # List of maps
        compare_results(
            "SELECT [mp,mp2] as a FROM (SELECT MAP(LIST_VALUE(1, 2, 3, 4),LIST_VALUE(10, 9, 8, 7)) as mp, MAP(LIST_VALUE(1, 2, 3, 5),LIST_VALUE(10, 9, 8, 7)) as mp2) as t",
            [[{
                'key': [1, 2, 3, 4],
                'value': [10, 9, 8, 7]
            }, {
                'key': [1, 2, 3, 5],
                'value': [10, 9, 8, 7]
            }]])

        # Map with lists as keys and values
        compare_results(
            "SELECT MAP(LIST_VALUE([1,2],[3,4],[5,4]),LIST_VALUE([1,2],[3,4],[5,4])) as a",
            [{
                'key': [[1, 2], [3, 4], [5, 4]],
                'value': [[1, 2], [3, 4], [5, 4]]
            }])

        # Map with structs as keys and values
        compare_results(
            "SELECT MAP(LIST_VALUE({'i':1,'j':2},{'i':3,'j':4}),LIST_VALUE({'i':1,'j':2},{'i':3,'j':4})) as a",
            [{
                'key': [{
                    'i': 1,
                    'j': 2
                }, {
                    'i': 3,
                    'j': 4
                }],
                'value': [{
                    'i': 1,
                    'j': 2
                }, {
                    'i': 3,
                    'j': 4
                }]
            }])

        # Null checks on lists with structs
        compare_results(
            "SELECT [{'i':1,'j':[2,3]},NULL,{'i':1,'j':[2,3]}] as a",
            [[{
                'i': 1,
                'j': [2, 3]
            }, None, {
                'i': 1,
                'j': [2, 3]
            }]])

        # A struct value that is NULL entirely becomes NaN in the frame
        df_duck = duckdb.query(
            "SELECT col0 as a FROM (VALUES ({'i':1,'j':2}), (NULL), ({'i':1,'j':2}), (NULL))"
        ).df()
        duck_values = df_duck['a']
        assert duck_values[0] == {'i': 1, 'j': 2}
        assert np.isnan(duck_values[1])
        assert duck_values[2] == {'i': 1, 'j': 2}
        assert np.isnan(duck_values[3])

        # A MAP value that is NULL entirely becomes NaN in the frame
        df_duck = duckdb.query(
            "SELECT col0 as a FROM (VALUES (MAP(LIST_VALUE(1,2),LIST_VALUE(3,4))),(NULL), (MAP(LIST_VALUE(1,2),LIST_VALUE(3,4))), (NULL))"
        ).df()
        duck_values = df_duck['a']
        assert duck_values[0] == {'key': [1, 2], 'value': [3, 4]}
        assert np.isnan(duck_values[1])
        assert duck_values[2] == {'key': [1, 2], 'value': [3, 4]}
        assert np.isnan(duck_values[3])
예제 #18
0
 def test_2273(self, duckdb_cursor):
     """Regression test for issue #2273: a DATE column in a replacement-
     scanned DataFrame is fetched as an ISO-formatted string."""
     df_in = pd.DataFrame([[datetime.date(1992, 7, 30)]])
     # The SQL references `df_in` by name via DuckDB's replacement scan.
     result = duckdb.query("Select * from df_in").fetchall()
     assert result == [('1992-07-30', )]
예제 #19
0
def arrow_replacement():
    """Demonstrate DuckDB's replacement scan over a pyarrow Table.

    The SQL text references ``arrow_table`` by name; DuckDB resolves it
    to the local Arrow table without explicit registration.

    :return: the query result rows (previously computed and discarded).
    """
    data = pa.array(np.random.rand(1_000_000), type=pa.float32())
    arrow_table = pa.Table.from_arrays([data], ['a'])
    # Return the rows so the computed aggregate is observable by callers;
    # existing callers ignoring the return value are unaffected.
    return duckdb.query("select sum(a) from arrow_table").fetchall()