def convert(self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context") -> DataContainer:
    components = list(map(str, sql.getTable().names))

    # some queries might also include the database
    # as we do not have such a concept, we just get rid of it
    components = components[-2:]
    tableName = components[-1]

    if len(components) == 2:
        if components[0] != context.schema_name:
            raise AttributeError(f"Schema {components[0]} is not defined.")

    try:
        dc = context.tables[tableName]
    except KeyError:  # pragma: no cover
        raise AttributeError(f"Table {tableName} is not defined.")

    cols = dc.column_container.columns
    dtypes = list(map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))

    df = pd.DataFrame(
        {
            "Column": cols,
            "Type": dtypes,
            "Extra": [""] * len(cols),
            "Comment": [""] * len(cols),
        }
    )

    cc = ColumnContainer(df.columns)
    dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
    return dc

def convert(self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context") -> DataContainer:
    components = list(map(str, sql.getTable().names))
    tableName = components[-1]

    if len(components) == 2:
        if components[0] != context.schema_name:
            raise AttributeError(f"Schema {components[0]} is not defined.")
    elif len(components) > 2:
        raise AttributeError("Table specification must be in the form [schema.]table")

    dc = context.tables[tableName]

    cols = dc.column_container.columns
    dtypes = list(map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))

    df = pd.DataFrame(
        {
            "Column": cols,
            "Type": dtypes,
            "Extra": [""] * len(cols),
            "Comment": [""] * len(cols),
        }
    )

    cc = ColumnContainer(df.columns)
    dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
    return dc

def _add_parameters_from_description(function_description, dask_function):
    for parameter in function_description.parameters:
        param_name, param_type = parameter
        sql_param_type = python_to_sql_type(param_type)

        dask_function.addParameter(param_name, sql_param_type, False)

    return dask_function

def _prepare_schema(self):
    """
    Create a schema filled with the dataframes
    and functions we have currently in our list
    """
    schema = DaskSchema(self.schema_name)

    if not self.tables:
        logger.warning("No tables are registered.")

    for name, dc in self.tables.items():
        table = DaskTable(name)
        df = dc.df
        logger.debug(
            f"Adding table '{name}' to schema with columns: {list(df.columns)}"
        )
        for column in df.columns:
            data_type = df[column].dtype
            sql_data_type = python_to_sql_type(data_type)

            table.addColumn(column, sql_data_type)

        schema.addTable(table)

    if not self.functions:
        logger.debug("No custom functions defined.")

    for function_description in self.function_list:
        name = function_description.name
        sql_return_type = python_to_sql_type(function_description.return_type)
        if function_description.aggregation:
            logger.debug(f"Adding function '{name}' to schema as aggregation.")
            dask_function = DaskAggregateFunction(name, sql_return_type)
        else:
            logger.debug(f"Adding function '{name}' to schema as scalar function.")
            dask_function = DaskScalarFunction(name, sql_return_type)
        dask_function = self._add_parameters_from_description(
            function_description, dask_function
        )

        schema.addFunction(dask_function)

    return schema

def get_column_description(df):
    sql_types = [str(python_to_sql_type(t)) for t in df.dtypes]
    column_names = df.columns
    return [
        {
            "name": column_name,
            "type": sql_type.lower(),
            "typeSignature": {"rawType": sql_type.lower(), "arguments": []},
        }
        for column_name, sql_type in zip(column_names, sql_types)
    ]

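# A small, hypothetical usage sketch for get_column_description() above; the
# example dataframe and printed fields are illustrative, not from the source.
# Each pandas dtype is mapped to a lower-cased SQL type name inside a
# presto-style column description.
import pandas as pd

example_df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
for entry in get_column_description(example_df):
    # Each entry has the shape {"name": ..., "type": ..., "typeSignature": {...}}
    print(entry["name"], entry["type"])
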
def convert(self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context") -> DataContainer:
    schema_name, name = context.fqn(sql.getTable())
    dc = context.schema[schema_name].tables[name]

    cols = dc.column_container.columns
    dtypes = list(map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))

    df = pd.DataFrame(
        {
            "Column": cols,
            "Type": dtypes,
            "Extra": [""] * len(cols),
            "Comment": [""] * len(cols),
        }
    )

    cc = ColumnContainer(df.columns)
    dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
    return dc

def convert(
    self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
) -> DataContainer:
    components = list(map(str, sql.getTable().names))
    dc = get_table_from_compound_identifier(context, components)

    cols = dc.column_container.columns
    dtypes = list(map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))

    df = pd.DataFrame(
        {
            "Column": cols,
            "Type": dtypes,
            "Extra": [""] * len(cols),
            "Comment": [""] * len(cols),
        }
    )

    cc = ColumnContainer(df.columns)
    dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
    return dc

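# A minimal usage sketch, not part of the source above: assuming the convert()
# method belongs to the plugin handling "SHOW COLUMNS FROM ...", it is reached
# through dask_sql.Context.sql(). The table name "df" is made up for illustration.
import pandas as pd
import dask.dataframe as dd
from dask_sql import Context

c = Context()
c.create_table(
    "df", dd.from_pandas(pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}), npartitions=1)
)

# Returns one row per column with its SQL type, as assembled by convert() above
print(c.sql("SHOW COLUMNS FROM df").compute())
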
def convert(
    self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
) -> DataContainer:
    components = list(map(str, sql.getTableName().names))
    dc = get_table_from_compound_identifier(context, components)
    columns = list(map(str, sql.getColumnList()))

    if not columns:
        columns = dc.column_container.columns

    # Define some useful shortcuts
    mapping = dc.column_container.get_backend_by_frontend_name
    df = dc.df

    # Calculate statistics
    statistics = dd.from_pandas(
        pd.DataFrame({col: [] for col in columns}), npartitions=1
    )
    statistics = statistics.append(df[[mapping(col) for col in columns]].describe())

    # Add additional information
    statistics = statistics.append(
        pd.Series(
            {
                col: str(python_to_sql_type(df[mapping(col)].dtype)).lower()
                for col in columns
            },
            name="data_type",
        )
    )
    statistics = statistics.append(
        pd.Series({col: col for col in columns}, name="col_name")
    )

    cc = ColumnContainer(statistics.columns)
    dc = DataContainer(statistics, cc)
    return dc

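# A hedged sketch of exercising the statistics plugin above through the dask-sql
# Context. The table name "df" is an assumption for illustration; the exact rows
# returned come from pandas' describe() plus the "data_type" and "col_name" rows
# appended in convert().
import pandas as pd
import dask.dataframe as dd
from dask_sql import Context

c = Context()
c.create_table(
    "df", dd.from_pandas(pd.DataFrame({"a": [1.0, 2.0, 3.0]}), npartitions=1)
)

stats = c.sql("ANALYZE TABLE df COMPUTE STATISTICS FOR ALL COLUMNS").compute()
print(stats)
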
def test_python_to_sql():
    assert str(python_to_sql_type(np.dtype("int32"))) == "INTEGER"
    assert str(python_to_sql_type(np.dtype(">M8[ns]"))) == "TIMESTAMP"

def test_python_to_sql_to_python():
    assert (
        type(sql_to_python_value(str(python_to_sql_type(np.dtype("int64"))), 54))
        == np.int64
    )

def test_python_to_sql():
    assert str(python_to_sql_type(np.dtype("int32"))) == "INTEGER"
    assert str(python_to_sql_type(np.dtype(">M8[ns]"))) == "TIMESTAMP"
    assert (
        str(python_to_sql_type(pd.DatetimeTZDtype(unit="ns", tz="UTC")))
        == "TIMESTAMP_WITH_LOCAL_TIME_ZONE"
    )

def _prepare_schemas(self):
    """
    Create a list of schemas filled with the dataframes
    and functions we have currently in our schema list
    """
    schema_list = []

    DaskTable = com.dask.sql.schema.DaskTable
    DaskAggregateFunction = com.dask.sql.schema.DaskAggregateFunction
    DaskScalarFunction = com.dask.sql.schema.DaskScalarFunction
    DaskSchema = com.dask.sql.schema.DaskSchema

    for schema_name, schema in self.schema.items():
        java_schema = DaskSchema(schema_name)

        if not schema.tables:
            logger.warning("No tables are registered.")

        for name, dc in schema.tables.items():
            row_count = (
                schema.statistics[name].row_count
                if name in schema.statistics
                else None
            )
            if row_count is not None:
                row_count = float(row_count)

            table = DaskTable(name, row_count)
            df = dc.df
            logger.debug(
                f"Adding table '{name}' to schema with columns: {list(df.columns)}"
            )
            for column in df.columns:
                data_type = df[column].dtype
                sql_data_type = python_to_sql_type(data_type)

                table.addColumn(column, sql_data_type)

            java_schema.addTable(table)

        if not schema.functions:
            logger.debug("No custom functions defined.")

        for function_description in schema.function_lists:
            name = function_description.name
            sql_return_type = python_to_sql_type(function_description.return_type)
            if function_description.aggregation:
                logger.debug(f"Adding function '{name}' to schema as aggregation.")
                dask_function = DaskAggregateFunction(name, sql_return_type)
            else:
                logger.debug(
                    f"Adding function '{name}' to schema as scalar function."
                )
                dask_function = DaskScalarFunction(name, sql_return_type)
            dask_function = self._add_parameters_from_description(
                function_description, dask_function
            )

            java_schema.addFunction(dask_function)

        schema_list.append(java_schema)

    return schema_list