Пример #1
0
 def test_category_nulls(self, duckdb_cursor):
     """Round-trip an integer categorical column with a missing value.

     The two real categories must come back unchanged, and the missing
     entry must satisfy ``numpy.isnan``.
     """
     sql = "SELECT * FROM data"
     frame = pd.DataFrame(
         {'int': pd.Series([1, 2, None], dtype="category")})
     result = duckdb.query_df(frame, "data", sql).df()
     # Debug aid: dump the raw rows produced by the same query.
     print(duckdb.query_df(frame, "data", sql).fetchall())
     column = result['int']
     assert column[0] == 1
     assert column[1] == 2
     assert numpy.isnan(column[2])
Пример #2
0
    def test_category_simple(self, duckdb_cursor):
        """Check that a float column and an integer categorical column
        keep their values after a DuckDB round trip."""
        sql = "SELECT * FROM data"
        source = pd.DataFrame({
            'float': [1.0, 2.0, 1.0],
            'int': pd.Series([1, 2, 1], dtype="category"),
        })

        fetched = duckdb.query_df(source, "data", sql).df()
        # Debug output: raw rows first, then the converted 'int' column.
        print(duckdb.query_df(source, "data", sql).fetchall())
        print(fetched['int'])

        float_matches = fetched['float'] == numpy.array([1.0, 2.0, 1.0])
        assert numpy.all(float_matches)
        int_matches = fetched['int'] == numpy.array([1, 2, 1])
        assert numpy.all(int_matches)
Пример #3
0
 def test_query_df(self, duckdb_cursor):
     """Join a DuckDB table with a pandas DataFrame on an explicit connection.

     The DataFrame is handed to ``query_df`` under the name ``t_2`` and the
     SQL joins it against table ``t``, which is created on the same
     connection. The first joined row must be ``(1, 1)``.
     """
     conn = duckdb.connect()
     try:
         conn.execute("create table t (a integer)")
         conn.execute("insert into t values (1),(4)")
         test_df = pd.DataFrame.from_dict({"i": [1, 2, 3, 4]})
         rel = duckdb.query_df(test_df, 't_2',
                               'select * from t inner join t_2 on (a = i)',
                               connection=conn)
         assert rel.fetchall()[0] == (1, 1)
     finally:
         # Release the connection even when an assertion above fails.
         conn.close()
Пример #4
0
 def test_object_integer(self, duckdb_cursor):
     """Compare nullable-integer input columns against float64 output.

     Each input column uses one of the pandas nullable integer dtypes with
     a leading null; the expected frame holds float64 masked arrays whose
     first element is masked.
     """
     nullable_dtypes = ("Int8", "Int16", "Int32", "Int64")
     # Column names are the lower-cased dtype names: int8, int16, ...
     df_in = pd.DataFrame({
         dtype_name.lower(): pd.Series([None, 1, -1], dtype=dtype_name)
         for dtype_name in nullable_dtypes
     })
     df_expected_res = pd.DataFrame({
         dtype_name.lower(): np.ma.masked_array([0, 1, -1],
                                                mask=[True, False, False],
                                                dtype='float64')
         for dtype_name in nullable_dtypes
     })
     result = duckdb.query_df(df_in, "data", "SELECT * FROM data")
     df_out = result.df()
     pd.testing.assert_frame_equal(df_expected_res, df_out)
Пример #5
0
def query_df(df, query):
    """ Run a simple query ('select' from one table, without subqueries and joins) on a DataFrame.

        The query is parsed with the 'mysql' dialect, its table qualifiers
        are stripped, and it is then executed by DuckDB against ``df``
        under the table name 'df_table'.

        Args:
            df (pandas.DataFrame): data
            query (mindsdb_sql.parser.ast.Select | str): select query

        Returns:
            pandas.DataFrame

        Raises:
            Exception: if the parsed query is not a SELECT whose FROM
                clause is a plain identifier.
    """

    ast = parse_sql(str(query), dialect='mysql')
    is_simple_select = (isinstance(ast, Select)
                        and isinstance(ast.from_table, Identifier))
    if not is_simple_select:
        raise Exception(
            "Only 'SELECT from TABLE' statements supported for internal query")

    # Point the FROM clause at the name the DataFrame is queried under.
    ast.from_table.parts = ['df_table']

    # Keep only the last name part of every selected identifier.
    for target in ast.targets:
        if isinstance(target, Identifier):
            target.parts = [target.parts[-1]]

    # Same reduction for ORDER BY fields.
    if isinstance(ast.order_by, list):
        for item in ast.order_by:
            if not isinstance(item, OrderBy):
                continue
            if isinstance(item.field, Identifier):
                item.field.parts = [item.field.parts[-1]]

    _remove_table_name(ast.where)

    # FIXME https://github.com/mindsdb/mindsdb_sql/issues/130
    # we need a way to dump the query in the postgres dialect;
    # until then the backticks are simply removed from the rendered query
    rendered = str(ast)
    sql_query = rendered.replace('`', '')
    result_df = duckdb.query_df(df, 'df_table', sql_query).df()
    not_null_mask = pd.notnull(result_df)
    return result_df.where(not_null_mask, None)
Пример #6
0
    def _filter_by_sql(df: pd.DataFrame, sql: str) -> pd.DataFrame:
        """
        Run an SQL query over a Pandas DataFrame and return the result.

        The DataFrame is exposed to the query under the virtual table
        name "data", so queries should look like ``SELECT * FROM data;``.

        The query is executed by DuckDB, see its SQL documentation:

        - https://duckdb.org/docs/sql/introduction

        After the query, each of the from-date, to-date and date columns
        that is present in the result is localized to UTC.

        :param df: DataFrame to query.
        :param sql: A SQL expression.
        :return: Filtered DataFrame
        """
        import duckdb

        filtered = duckdb.query_df(df, "data", sql).df()

        date_columns = (
            Columns.FROM_DATE.value,
            Columns.TO_DATE.value,
            Columns.DATE.value,
        )
        for name in date_columns:
            try:
                filtered[name] = filtered[name].dt.tz_localize(pytz.UTC)
            except KeyError:
                # Lookup failed, typically because the query did not
                # return this column; leave the frame as it is.
                continue

        return filtered
Пример #7
0
 def test_pandas_encoded_utf8(self, duckdb_cursor):
     """Pass UTF-8 encoded bytes through an object column.

     The fetched value is compared with the ``str()`` form of the bytes
     object that was stored.
     """
     encoded = u'\u00c3'.encode('utf8')  # Unicode data as UTF-8 bytes
     column = pd.Series([encoded], dtype='object')
     df_in = pd.DataFrame({'object': column})
     row = duckdb.query_df(df_in, "data", "SELECT * FROM data").fetchone()
     assert row[0] == str(encoded)
Пример #8
0
def round_trip(data, pandas_type):
    """Assert that ``data`` survives a DuckDB round trip unchanged.

    ``data`` is stored in a single column named 'object' with dtype
    ``pandas_type``; the frame read back from DuckDB must equal the input.
    Both frames are printed (output first, then input) for debugging.
    """
    column = pd.Series(data, dtype=pandas_type)
    df_in = pd.DataFrame({'object': column})

    relation = duckdb.query_df(df_in, "data", "SELECT * FROM data")
    df_out = relation.df()
    for frame in (df_out, df_in):
        print(frame)
    assert df_out.equals(df_in)
Пример #9
0
    def test_category_mix(self, duckdb_cursor):
        """Round-trip a frame mixing a float column with an ordered
        categorical column that contains missing values."""
        floats = [1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 0.0]
        labels = ['foo', 'bla', None, 'zoo', 'foo', 'foo', None, 'bla']
        df_in = pd.DataFrame({
            'float': floats,
            'x': pd.Categorical(labels, ordered=True),
        })

        relation = duckdb.query_df(df_in, "data", "SELECT * FROM data")
        df_out = relation.df()
        assert df_out.equals(df_in)
Пример #10
0
    def test_pandas_float32(self, duckdb_cursor):
        """Round-trip a float32 column whose last element is NaN."""
        values = numpy.array([0.1, 0.32, 0.78, numpy.nan])
        df_in = pd.DataFrame({'object': pd.Series(values, dtype='float32')})

        relation = duckdb.query_df(df_in, "data", "SELECT * FROM data")
        df_out = relation.df()

        # The three finite values must match exactly; NaN never compares
        # equal, so the last slot is checked with isnan instead.
        for idx in range(3):
            assert df_out['object'][idx] == df_in['object'][idx]
        assert numpy.isnan(df_out['object'][3])
Пример #11
0
    def test_pandas_boolean(self, duckdb_cursor):
        """Round-trip a nullable 'boolean' column.

        The input holds three different missing-value markers (``None``,
        ``pd.NA`` and ``numpy.nan``) between two ``True`` values; each
        missing slot must satisfy ``numpy.isnan`` in the output.
        """
        raw = numpy.array([True, None, pd.NA, numpy.nan, True])
        df_in = pd.DataFrame({'object': pd.Series(raw, dtype='boolean')})

        relation = duckdb.query_df(df_in, "data", "SELECT * FROM data")
        df_out = relation.df()

        assert df_out['object'][0] == df_in['object'][0]
        for missing_idx in (1, 2, 3):
            assert numpy.isnan(df_out['object'][missing_idx])
        assert df_out['object'][4] == df_in['object'][4]
Пример #12
0
    def test_category_nulls(self, duckdb_cursor):
        """Round-trip string and integer categorical columns with nulls.

        For each column the first two values must come back unchanged and
        the missing third value must satisfy ``numpy.isnan``.
        """
        df_in = pd.DataFrame({
            'string': pd.Series(["foo", "bar", None], dtype="category"),
            'int': pd.Series([1, 2, None], dtype="category"),
        })

        relation = duckdb.query_df(df_in, "data", "SELECT * FROM data")
        df_out = relation.df()

        expectations = (
            ('string', ("foo", "bar")),
            ('int', (1, 2)),
        )
        for column_name, (first, second) in expectations:
            column = df_out[column_name]
            assert column[0] == first
            assert column[1] == second
            assert numpy.isnan(column[2])
Пример #13
0
 def test_timestamp_tz(self, duckdb_cursor):
     """Check how a 'Z'-suffixed timestamp comes back from DuckDB.

     The input timestamp string ends in 'Z'; the expected output is the
     same wall-clock time parsed without that suffix. All three frames
     are printed for debugging.
     """
     with_suffix = pd.Timestamp('20180310T11:17:54Z')
     without_suffix = pd.Timestamp('20180310T11:17:54')
     df_in = pd.DataFrame({'datetime': [with_suffix], 'string': ['foo']})
     df_expected_res = pd.DataFrame({
         'datetime': [without_suffix],
         'string': ['foo'],
     })
     print(df_in)
     print(df_expected_res)
     relation = duckdb.query_df(df_in, "data", "SELECT * FROM data")
     df_out = relation.df()
     print(df_out)
     pd.testing.assert_frame_equal(df_expected_res, df_out)
Пример #14
0
    def test_pandas_interval(self, duckdb_cursor):
        if pd.__version__ != '1.2.4':
            return

        data = numpy.array([2069211000000000, numpy.datetime64("NaT")])
        df_in = pd.DataFrame({
            'object':
            pd.Series(data, dtype='timedelta64[ns]'),
        })

        df_out = duckdb.query_df(df_in, "data", "SELECT * FROM data").df()

        assert df_out['object'][0] == df_in['object'][0]
        assert pd.isnull(df_out['object'][1])
Пример #15
0
 def test_categorical_fetchall(self, duckdb_cursor):
     """Fetch an ordered categorical column as plain Python rows.

     Every input value, including the ``None`` entries, must come back
     as a one-element tuple in the original order.
     """
     labels = ['foo', 'bla', None, 'zoo', 'foo', 'foo', None, 'bla']
     df_in = pd.DataFrame({'x': pd.Categorical(labels, ordered=True)})
     rows = duckdb.query_df(df_in, "data", "SELECT * FROM data").fetchall()
     expected_rows = [(label, ) for label in labels]
     assert rows == expected_rows
Пример #16
0
    def test_pandas_string(self, duckdb_cursor):
        """Round-trip string data stored as 'object' and, when the
        installed pandas provides it, as ``StringDtype``."""
        strings = numpy.array(['foo', 'bar', 'baz'])
        # StringDtype is only available from pandas 1.0.0 on, see
        # https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html
        has_string_dtype = hasattr(pd, 'StringDtype')

        df_in = pd.DataFrame({'object': pd.Series(strings, dtype='object')})
        if has_string_dtype:
            df_in['string'] = pd.Series(strings, dtype=pd.StringDtype())

        relation = duckdb.query_df(df_in, "data", "SELECT * FROM data")
        df_out = relation.df()

        assert numpy.all(df_out['object'] == strings)
        if has_string_dtype:
            assert numpy.all(df_out['string'] == strings)
Пример #17
0
def query_df(df, query):
    """ Perform simple query ('select' from one table, without subqueries and joins) on DataFrame.

        A string query is parsed with the 'mysql' dialect. A query that is
        already an AST is deep-copied first, because the AST is rewritten
        in place below and the caller's object must stay untouched.
        The rewritten query is rendered for the 'postgres' dialect and
        executed by DuckDB against ``df`` under the table name 'df_table'.

        Args:
            df (pandas.DataFrame): data
            query (mindsdb_sql.parser.ast.Select | str): select query

        Returns:
            pandas.DataFrame: query result with NaN values replaced by None

        Raises:
            Exception: if the query is not a SELECT whose FROM clause is
                a plain identifier.
    """
    # Local import keeps this block self-contained.
    import copy

    if isinstance(query, str):
        query_ast = parse_sql(query, dialect='mysql')
    else:
        # Work on a private copy so the caller's AST is not mutated.
        query_ast = copy.deepcopy(query)

    if not isinstance(query_ast, Select) or not isinstance(
            query_ast.from_table, Identifier):
        raise Exception(
            "Only 'SELECT from TABLE' statements supported for internal query")

    # Point the FROM clause at the name the DataFrame is queried under and
    # reduce every column reference to its last name part.
    query_ast.from_table.parts = ['df_table']
    for identifier in query_ast.targets:
        if isinstance(identifier, Identifier):
            identifier.parts = [identifier.parts[-1]]
    if isinstance(query_ast.order_by, list):
        for orderby in query_ast.order_by:
            if isinstance(orderby, OrderBy) and isinstance(
                    orderby.field, Identifier):
                orderby.field.parts = [orderby.field.parts[-1]]
    _remove_table_name(query_ast.where)

    render = SqlalchemyRender('postgres')
    try:
        query_str = render.get_string(query_ast, with_failback=False)
    except Exception as e:
        # Strict rendering failed: report it and retry with the
        # renderer's fallback enabled.
        print(
            f"Exception during query casting to 'postgres' dialect. Query: {str(query)}. Error: {e}"
        )
        query_str = render.get_string(query_ast, with_failback=True)

    res = duckdb.query_df(df, 'df_table', query_str)
    result_df = res.df()
    result_df = result_df.replace({np.nan: None})
    return result_df
Пример #18
0
    def filter_by_sql(self, sql: str) -> StationsResult:
        """
        Filter the station listing with an SQL query.

        The DataFrame taken from ``self.all().df`` is queried with DuckDB
        under the table name "data". Afterwards the from-date and to-date
        columns of the result are localized to ``self.tz``.

        :param sql: SQL statement to run against the table "data"
        :return: StationsResult built from the filtered frame, with its
            index reset
        """
        import duckdb

        stations_df = self.all().df

        filtered: pd.DataFrame = duckdb.query_df(stations_df, "data",
                                                 sql).df()

        for column in (Columns.FROM_DATE.value, Columns.TO_DATE.value):
            localized = filtered.loc[:, column].dt.tz_localize(self.tz)
            filtered.loc[:, column] = localized

        return StationsResult(stations=self,
                              df=filtered.reset_index(drop=True))
Пример #19
0
def check_create_table(category):
    """Exercise categorical data through table creation, joins and casts.

    Builds a frame with two ordered categorical columns from ``category``,
    checks the ``query_df`` round trip, then creates tables ``t1`` and
    ``t2`` from it and runs fetch, insert, filter, join, alter and drop
    statements on them.

    Args:
        category: values used for both categorical columns.
    """
    conn = duckdb.connect()
    try:
        conn.execute("PRAGMA enable_verification")
        # NOTE: the CREATE TABLE statements below refer to this frame by
        # its Python variable name, so ``df_in`` must not be renamed.
        df_in = pd.DataFrame({
            'x': pd.Categorical(category, ordered=True),
            'y': pd.Categorical(category, ordered=True)
        })

        df_out = duckdb.query_df(df_in, "data", "SELECT * FROM data").df()
        assert df_in.equals(df_out)

        conn.execute("CREATE TABLE t1 AS SELECT * FROM df_in")
        conn.execute("CREATE TABLE t2 AS SELECT * FROM df_in")

        # Check fetchall
        res = conn.execute("SELECT t1.x FROM t1").fetchall()
        check_result_list(res, category)

        # Do an insert to trigger string -> cat
        conn.execute("INSERT INTO t1 VALUES ('2','2')")

        res = conn.execute("SELECT x FROM t1 where x = '1'").fetchall()
        assert res == [('1', )]

        res = conn.execute(
            "SELECT t1.x FROM t1 inner join t2 on (t1.x = t2.x)").fetchall()
        assert res == conn.execute("SELECT x FROM t1").fetchall()

        res = conn.execute(
            "SELECT t1.x FROM t1 inner join t2 on (t1.x = t2.y)").fetchall()
        assert res == conn.execute("SELECT x FROM t1").fetchall()

        # Triggering the cast with ENUM as a src
        conn.execute("ALTER TABLE t1 ALTER x SET DATA TYPE VARCHAR")
        # We should be able to drop the table without any dependencies
        conn.execute("DROP TABLE t1")
    finally:
        # Release the connection even when a check above fails.
        conn.close()
Пример #20
0
def check_category_equal(category):
    """Assert that an ordered categorical column built from ``category``
    is unchanged after a DuckDB round trip."""
    categorical = pd.Categorical(category, ordered=True)
    df_in = pd.DataFrame({'x': categorical})
    relation = duckdb.query_df(df_in, "data", "SELECT * FROM data")
    df_out = relation.df()
    assert df_in.equals(df_out)