def _get_ral(self, sql):
    """Translate a SQL query into an (optimized) relational algebra plan.

    Returns a triple ``(rel, select_names, rel_string)`` where ``rel`` is the
    Calcite plan (or the raw parse node for dask-sql's own statements),
    ``select_names`` the output column names of a SELECT (else ``None``) and
    ``rel_string`` the printable form of the plan.
    """
    # Build the Calcite generator from everything registered so far.
    registered_schema = self._prepare_schema()
    generator = RelationalAlgebraGenerator(registered_schema)
    default_dialect = generator.getDialect()

    logger.debug(f"Using dialect: {get_java_class(default_dialect)}")

    try:
        sqlNode = generator.getSqlNode(sql)
        sqlNodeClass = get_java_class(sqlNode)

        if not sqlNodeClass.startswith("com.dask.sql.parser."):
            # Plain SQL: validate, translate to algebra and optimize.
            validated = generator.getValidatedNode(sqlNode)
            unoptimized = generator.getRelationalAlgebra(validated)
            rel = generator.getOptimizedRelationalAlgebra(unoptimized)
            rel_string = str(generator.getRelationalAlgebraString(rel))
        else:
            # Custom dask-sql statement: hand the parse node back directly.
            rel = sqlNode
            rel_string = ""
    except (ValidationException, SqlParseException) as e:
        logger.debug(f"Original exception raised by Java:\n {e}")
        # We do not want to re-raise an exception here
        # as this would print the full java stack trace
        # if debug is not set.
        # Instead, we raise a nice exception
        raise ParsingException(sql, str(e.message())) from None

    # Calcite names internal, temporary results EXPR$N (N a number), which is
    # not very helpful to the user - replace those with the actual query
    # strings. This logic probably fails in some edge cases (if the outer
    # SQLNode is not a select node), but no such case has shown up so far.
    # Please raise an issue if you have found one!
    if sqlNodeClass == "org.apache.calcite.sql.SqlOrderBy":
        # Look through the ORDER BY wrapper at the inner query node.
        sqlNode = sqlNode.query
        sqlNodeClass = get_java_class(sqlNode)

    if sqlNodeClass != "org.apache.calcite.sql.SqlSelect":
        logger.debug(
            "Not extracting output column names as the SQL is not a SELECT call"
        )
        select_names = None
    else:
        select_names = [
            self._to_sql_string(s, default_dialect=default_dialect)
            for s in sqlNode.getSelectList()
        ]

    logger.debug(f"Extracted relational algebra:\n {rel_string}")

    return rel, select_names, rel_string
def convert(
    cls,
    rex: "org.apache.calcite.rex.RexNode",
    dc: DataContainer,
    context: "dask_sql.Context",
) -> Union[dd.DataFrame, Any]:
    """Convert a Calcite REX node (java instance) into a python expression.

    Dispatches to the registered plugin for the node's java class and lets it
    perform the conversion against the given data container and context.
    """
    # The fully-qualified java class decides which plugin is responsible.
    class_name = get_java_class(rex)

    try:
        plugin_instance = cls.get_plugin(class_name)
    except KeyError:  # pragma: no cover
        # No plugin registered for this node type - fail loudly.
        raise NotImplementedError(
            f"No conversion for class {class_name} available (yet).")

    logger.debug(
        f"Processing REX {rex} using {plugin_instance.__class__.__name__}..."
    )
    converted = plugin_instance.convert(rex, dc, context=context)
    logger.debug(f"Processed REX {rex} into {LoggableDataFrame(converted)}")

    return converted
def check_special_operator(operator: "org.apache.calcite.sql.fun"):
    """Return the dedicated name for an overloaded operator class, if any.

    Some operator classes share their SQL type/kind with another operation,
    e.g. SqlDatetimeSubtractionOperator carries the type and kind of the `-`
    (minus) operation. For those we need a distinct name; for every other
    operator this returns ``None``.
    """
    # Java operator classes that must not be resolved via their SQL kind.
    overloaded_operators = {
        "org.apache.calcite.sql.fun.SqlDatetimeSubtractionOperator":
            "datetime_subtraction",
    }
    return overloaded_operators.get(get_java_class(operator))
def convert(self, rel: "org.apache.calcite.rel.RelNode",
            context: "dask_sql.Context") -> DataContainer:
    """Apply a Calcite projection: evaluate each projected expression and
    return a new DataContainer with exactly the projected columns, in order.
    """
    # Get the input of the previous step
    (dc, ) = self.assert_inputs(rel, 1, context)

    df = dc.df
    cc = dc.column_container

    # Collect all (new) columns
    named_projects = rel.getNamedProjects()

    # column_names: final output order; new_columns: expressions that need a
    # real dataframe assignment; new_mappings: frontend -> backend name pairs.
    column_names = []
    new_columns = {}
    new_mappings = {}
    for expr, key in named_projects:
        key = str(key)
        column_names.append(key)

        # shortcut: if we have a column already, there is no need to re-assign it again
        # this is only the case if the expr is a RexInputRef
        if get_java_class(expr) == RexInputRefPlugin.class_name:
            # A pure column reference - just map the frontend name onto the
            # already existing backend column instead of copying data.
            index = expr.getIndex()
            backend_column_name = cc.get_backend_by_frontend_index(index)
            logger.debug(
                f"Not re-adding the same column {key} (but just referencing it)"
            )
            new_mappings[key] = backend_column_name
        else:
            # A computed expression - convert it into a dask series now.
            new_columns[key] = RexConverter.convert(expr, dc, context=context)
            logger.debug(f"Adding a new column {key} out of {expr}")
            # NOTE(review): this key is added to cc both here and in the
            # new_mappings loop below - presumably cc.add is idempotent for
            # identical (frontend, backend) pairs; confirm before changing.
            cc = cc.add(key, key)
            new_mappings[key] = key

    # Actually add the new columns
    if new_columns:
        df = df.assign(**new_columns)

    # and the new mappings
    for key, backend_column_name in new_mappings.items():
        cc = cc.add(key, backend_column_name)

    # Make sure the order is correct
    cc = cc.limit_to(column_names)

    # Align frontend names and dtypes with the row type Calcite expects.
    cc = self.fix_column_to_row_type(cc, rel.getRowType())
    dc = DataContainer(df, cc)
    dc = self.fix_dtype_to_row_type(dc, rel.getRowType())

    return dc
def _get_ral(self, sql):
    """Helper function to turn the sql query into a relational algebra and resulting column names.

    Returns either ``(sqlNode, [])`` for dask-sql's own custom statements or
    ``(rel, select_names)`` where ``rel`` is the optimized Calcite plan and
    ``select_names`` the SELECT output column names (``None`` for non-SELECT).
    """
    # get the schema of what we currently have registered
    schema = self._prepare_schema()

    # Now create a relational algebra from that
    generator = RelationalAlgebraGenerator(schema)

    sqlNode = generator.getSqlNode(sql)
    sqlNodeClass = get_java_class(sqlNode)
    if sqlNodeClass.startswith("com.dask.sql.parser."):
        # Custom dask-sql statement: no algebra/columns to derive.
        return sqlNode, []
    validatedSqlNode = generator.getValidatedNode(sqlNode)
    nonOptimizedRelNode = generator.getRelationalAlgebra(validatedSqlNode)
    rel = generator.getOptimizedRelationalAlgebra(nonOptimizedRelNode)

    default_dialect = generator.getDialect()
    logger.debug(f"Using dialect: {get_java_class(default_dialect)}")

    # Internal, temporary results of calcite are sometimes
    # named EXPR$N (with N a number), which is not very helpful
    # to the user. We replace these cases therefore with
    # the actual query string. This logic probably fails in some
    # edge cases (if the outer SQLNode is not a select node),
    # but so far I did not find such a case.
    # So please raise an issue if you have found one!
    def toSqlString(s):
        # Fall back to the node's default string form if the dialect
        # rendering fails. Narrowed from a bare `except:` so that
        # KeyboardInterrupt/SystemExit are not swallowed here.
        try:
            return str(s.toSqlString(default_dialect))
        except Exception:  # pragma: no cover
            return str(s)

    if sqlNodeClass == "org.apache.calcite.sql.SqlSelect":
        select_names = [toSqlString(s) for s in sqlNode.getSelectList()]
    else:
        logger.debug(
            "Not extracting output column names as the SQL is not a SELECT call"
        )
        select_names = None

    logger.debug(
        f"Extracted relational algebra:\n {generator.getRelationalAlgebraString(rel)}"
    )
    return rel, select_names
def _get_ral(self, sql): """Helper function to turn the sql query into a relational algebra and resulting column names""" # get the schema of what we currently have registered schemas = self._prepare_schemas() RelationalAlgebraGeneratorBuilder = ( com.dask.sql.application.RelationalAlgebraGeneratorBuilder) # True if the SQL query should be case sensitive and False otherwise case_sensitive = dask_config.get("sql.identifier.case_sensitive", default=True) generator_builder = RelationalAlgebraGeneratorBuilder( self.schema_name, case_sensitive, java.util.ArrayList()) for schema in schemas: generator_builder = generator_builder.addSchema(schema) generator = generator_builder.build() default_dialect = generator.getDialect() logger.debug(f"Using dialect: {get_java_class(default_dialect)}") ValidationException = org.apache.calcite.tools.ValidationException SqlParseException = org.apache.calcite.sql.parser.SqlParseException CalciteContextException = org.apache.calcite.runtime.CalciteContextException try: sqlNode = generator.getSqlNode(sql) sqlNodeClass = get_java_class(sqlNode) select_names = None rel = sqlNode rel_string = "" if not sqlNodeClass.startswith("com.dask.sql.parser."): nonOptimizedRelNode = generator.getRelationalAlgebra(sqlNode) # Optimization might remove some alias projects. Make sure to keep them here. select_names = [ str(name) for name in nonOptimizedRelNode.getRowType().getFieldNames() ] rel = generator.getOptimizedRelationalAlgebra( nonOptimizedRelNode) rel_string = str(generator.getRelationalAlgebraString(rel)) except (ValidationException, SqlParseException, CalciteContextException) as e: logger.debug(f"Original exception raised by Java:\n {e}") # We do not want to re-raise an exception here # as this would print the full java stack trace # if debug is not set. 
# Instead, we raise a nice exception raise ParsingException(sql, str(e.message())) from None # Internal, temporary results of calcite are sometimes # named EXPR$N (with N a number), which is not very helpful # to the user. We replace these cases therefore with # the actual query string. This logic probably fails in some # edge cases (if the outer SQLNode is not a select node), # but so far I did not find such a case. # So please raise an issue if you have found one! if sqlNodeClass == "org.apache.calcite.sql.SqlOrderBy": sqlNode = sqlNode.query sqlNodeClass = get_java_class(sqlNode) if sqlNodeClass == "org.apache.calcite.sql.SqlSelect": select_names = [ self._to_sql_string(s, default_dialect=default_dialect) if current_name.startswith("EXPR$") else current_name for s, current_name in zip(sqlNode.getSelectList(), select_names) ] else: logger.debug( "Not extracting output column names as the SQL is not a SELECT call" ) logger.debug(f"Extracted relational algebra:\n {rel_string}") return rel, select_names, rel_string