def convert(self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context") -> DataContainer:
    (dc,) = self.assert_inputs(rel, 1, context)
    df = dc.df
    cc = dc.column_container

    sort_collation = rel.getCollation().getFieldCollations()
    sort_columns = [
        cc.get_backend_by_frontend_index(int(x.getFieldIndex()))
        for x in sort_collation
    ]
    sort_ascending = [str(x.getDirection()) == "ASCENDING" for x in sort_collation]

    offset = rel.offset
    if offset:
        offset = RexConverter.convert(offset, df, context=context)

    end = rel.fetch
    if end:
        end = RexConverter.convert(end, df, context=context)

        if offset:
            end += offset

    if sort_columns:
        df = self._apply_sort(df, sort_columns, sort_ascending)

    if offset is not None or end is not None:
        df = self._apply_offset(df, offset, end)

    cc = self.fix_column_to_row_type(cc, rel.getRowType())
    # No column type has changed, so no need to cast again
    return DataContainer(df, cc)
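# _apply_offset itself is not shown here. One plausible way to cut a global row
# range [offset, end) out of a dask dataframe (an illustrative sketch under my
# own assumptions, not necessarily dask_sql's actual helper) is to compute the
# partition lengths once and then trim every partition to its share of the window:

import dask.dataframe as dd

def apply_offset_sketch(df: dd.DataFrame, offset: int, end: int) -> dd.DataFrame:
    offset = offset or 0
    partition_lengths = df.map_partitions(len).compute().tolist()
    partition_starts = [sum(partition_lengths[:i]) for i in range(len(partition_lengths))]

    def trim(partition, partition_info=None):
        if partition_info is None:  # called on the meta during graph construction
            return partition
        start = partition_starts[partition_info["number"]]
        local_start = max(0, offset - start)
        if end is None:
            local_end = len(partition)
        else:
            local_end = max(local_start, min(len(partition), end - start))
        return partition.iloc[local_start:local_end]

    return df.map_partitions(trim, meta=df._meta)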
def convert(
    self,
    rex: "org.apache.calcite.rex.RexNode",
    dc: DataContainer,
    context: "dask_sql.Context",
) -> SeriesOrScalar:
    # Prepare the operands by turning the RexNodes into python expressions
    operands = [
        RexConverter.convert(o, dc, context=context) for o in rex.getOperands()
    ]

    # Now use the operator name in the mapping
    operator_name = str(rex.getOperator().getName())
    operator_name = operator_name.lower()

    try:
        operation = self.OPERATION_MAPPING[operator_name]
    except KeyError:
        try:
            operation = context.functions[operator_name]
        except KeyError:
            raise NotImplementedError(f"{operator_name} not (yet) implemented")

    logger.debug(
        f"Executing {operator_name} on {[str(LoggableDataFrame(df)) for df in operands]}"
    )
    if hasattr(operation, "needs_dc") and operation.needs_dc:
        return operation(*operands, dc=dc)
    else:
        return operation(*operands)
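# The lookup above is a plain two-level dispatch: first the built-in operator
# mapping, then whatever the user registered on the context, with a "needs_dc"
# attribute deciding whether the full DataContainer is passed along. A minimal
# self-contained sketch of that pattern (the dicts and names here are
# illustrative only, not dask_sql's API):

OPERATION_MAPPING_SKETCH = {"+": lambda a, b: a + b, "abs": abs}
user_functions_sketch = {"my_func": lambda x: x * 2}

def dispatch_sketch(operator_name, *operands, dc=None):
    operator_name = operator_name.lower()
    try:
        operation = OPERATION_MAPPING_SKETCH[operator_name]
    except KeyError:
        try:
            operation = user_functions_sketch[operator_name]
        except KeyError:
            raise NotImplementedError(f"{operator_name} not (yet) implemented")
    # Some operations need access to the full DataContainer, flagged via an attribute
    if getattr(operation, "needs_dc", False):
        return operation(*operands, dc=dc)
    return operation(*operands)

# dispatch_sketch("ABS", -3) returns 3, dispatch_sketch("my_func", 5) returns 10.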
def convert(self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context") -> DataContainer:
    # There should not be any input. This is the first step.
    self.assert_inputs(rel, 0)

    rex_expression_rows = list(rel.getTuples())
    rows = []
    for rex_expression_row in rex_expression_rows:
        # We convert each of the cells in the row
        # using a RexConverter.
        # As we do not have any information on the
        # column headers, we just name them with
        # their index.
        rows.append(
            {
                str(i): RexConverter.convert(rex_cell, None, context=context)
                for i, rex_cell in enumerate(rex_expression_row)
            }
        )

    # TODO: we explicitly reference pandas and dask here -> might be worth making this more general
    # We assume here that when using the values plan, the resulting dataframe will be quite small
    if rows:
        df = pd.DataFrame(rows)
    else:
        field_names = [str(x) for x in rel.getRowType().getFieldNames()]
        df = pd.DataFrame(columns=field_names)

    df = dd.from_pandas(df, npartitions=1)
    cc = ColumnContainer(df.columns)

    cc = self.fix_column_to_row_type(cc, rel.getRowType())
    dc = DataContainer(df, cc)
    dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
    return dc
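# A VALUES plan always materializes a small literal table, so building it in
# pandas and wrapping it in a single dask partition is cheap. A standalone
# sketch of the same construction with made-up literal rows:

import pandas as pd
import dask.dataframe as dd

literal_rows = [{"0": 1, "1": "a"}, {"0": 2, "1": "b"}]
small_df = dd.from_pandas(pd.DataFrame(literal_rows), npartitions=1)
# small_df.compute() yields a two-row frame with the index-named columns "0" and "1".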
def convert(self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context") -> DataContainer:
    (dc,) = self.assert_inputs(rel, 1, context)
    df = dc.df
    cc = dc.column_container

    offset = rel.getOffset()
    if offset:
        offset = RexConverter.convert(offset, df, context=context)

    end = rel.getFetch()
    if end:
        end = RexConverter.convert(end, df, context=context)

        if offset:
            end += offset

    df = self._apply_limit(df, offset, end)

    cc = self.fix_column_to_row_type(cc, rel.getRowType())
    # No column type has changed, so no need to cast again
    return DataContainer(df, cc)
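# _apply_limit is not shown either. In the simple case without an OFFSET, a
# LIMIT on a dask dataframe can be expressed as a lazy head() across all
# partitions; this is an assumption about one possible implementation, not
# necessarily dask_sql's:

def apply_limit_sketch(df, offset, end):
    if not offset:
        # npartitions=-1 searches all partitions, compute=False keeps it lazy
        return df.head(end, npartitions=-1, compute=False)
    # With an offset, global row positions are needed; see the offset sketch
    # after the sort plugin above.
    raise NotImplementedError("offset handling omitted in this sketch")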
def convert(
    self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context"
) -> DataContainer:
    # Get the input of the previous step
    (dc,) = self.assert_inputs(rel, 1, context)

    df = dc.df
    cc = dc.column_container

    # Collect all (new) columns
    named_projects = rel.getNamedProjects()

    column_names = []
    new_columns = {}
    new_mappings = {}
    for expr, key in named_projects:
        key = str(key)
        column_names.append(key)

        # shortcut: if we have a column already, there is no need to re-assign it again
        # this is only the case if the expr is a RexInputRef
        if isinstance(expr, org.apache.calcite.rex.RexInputRef):
            index = expr.getIndex()
            backend_column_name = cc.get_backend_by_frontend_index(index)
            logger.debug(
                f"Not re-adding the same column {key} (but just referencing it)"
            )
            new_mappings[key] = backend_column_name
        else:
            random_name = new_temporary_column(df)
            new_columns[random_name] = RexConverter.convert(
                expr, dc, context=context
            )
            logger.debug(f"Adding a new column {key} out of {expr}")
            new_mappings[key] = random_name

    # Actually add the new columns
    if new_columns:
        df = df.assign(**new_columns)

    # and the new mappings
    for key, backend_column_name in new_mappings.items():
        cc = cc.add(key, backend_column_name)

    # Make sure the order is correct
    cc = cc.limit_to(column_names)

    cc = self.fix_column_to_row_type(cc, rel.getRowType())
    dc = DataContainer(df, cc)
    dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
    return dc
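# The projection never renames columns in the dask dataframe itself: computed
# expressions are assigned under throw-away backend names, and the SQL-visible
# names only live in the frontend-to-backend mapping. A toy version of that
# idea (a plain dict instead of ColumnContainer, uuid instead of
# new_temporary_column, all names illustrative):

import uuid
import pandas as pd
import dask.dataframe as dd

df_sketch = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)

temp_name = f"col_{uuid.uuid4().hex}"  # backend name, never shown to the user
df_sketch = df_sketch.assign(**{temp_name: df_sketch["a"] * 2})

frontend_to_backend = {"a": "a", "a_times_two": temp_name}
# A later step resolves "a_times_two" via the mapping instead of renaming the frame.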
def __init__(self):
    """
    Create a new context.
    """
    # Storage for the registered tables
    self.tables = {}

    # Storage for the registered functions
    self.functions: Dict[str, Callable] = {}
    self.function_list: List[FunctionDescription] = []

    # Storage for the registered aggregations
    self.aggregations = {}

    # Storage for the trained models
    self.models = {}

    # Name of the root schema (not changeable so far)
    self.schema_name = "schema"

    # Register any default plugins, if nothing was registered before.
    RelConverter.add_plugin_class(logical.LogicalAggregatePlugin, replace=False)
    RelConverter.add_plugin_class(logical.LogicalFilterPlugin, replace=False)
    RelConverter.add_plugin_class(logical.LogicalJoinPlugin, replace=False)
    RelConverter.add_plugin_class(logical.LogicalProjectPlugin, replace=False)
    RelConverter.add_plugin_class(logical.LogicalSortPlugin, replace=False)
    RelConverter.add_plugin_class(logical.LogicalTableScanPlugin, replace=False)
    RelConverter.add_plugin_class(logical.LogicalUnionPlugin, replace=False)
    RelConverter.add_plugin_class(logical.LogicalValuesPlugin, replace=False)
    RelConverter.add_plugin_class(logical.SamplePlugin, replace=False)
    RelConverter.add_plugin_class(custom.AnalyzeTablePlugin, replace=False)
    RelConverter.add_plugin_class(custom.CreateModelPlugin, replace=False)
    RelConverter.add_plugin_class(custom.CreateTableAsPlugin, replace=False)
    RelConverter.add_plugin_class(custom.CreateTablePlugin, replace=False)
    RelConverter.add_plugin_class(custom.PredictModelPlugin, replace=False)
    RelConverter.add_plugin_class(custom.DropModelPlugin, replace=False)
    RelConverter.add_plugin_class(custom.DropTablePlugin, replace=False)
    RelConverter.add_plugin_class(custom.ShowColumnsPlugin, replace=False)
    RelConverter.add_plugin_class(custom.ShowSchemasPlugin, replace=False)
    RelConverter.add_plugin_class(custom.ShowTablesPlugin, replace=False)
    RexConverter.add_plugin_class(core.RexCallPlugin, replace=False)
    RexConverter.add_plugin_class(core.RexInputRefPlugin, replace=False)
    RexConverter.add_plugin_class(core.RexLiteralPlugin, replace=False)
    InputUtil.add_plugin_class(input_utils.DaskInputPlugin, replace=False)
    InputUtil.add_plugin_class(input_utils.PandasInputPlugin, replace=False)
    InputUtil.add_plugin_class(input_utils.HiveInputPlugin, replace=False)
    InputUtil.add_plugin_class(input_utils.IntakeCatalogInputPlugin, replace=False)
    # needs to be the last entry, as it only checks for string
    InputUtil.add_plugin_class(input_utils.LocationInputPlugin, replace=False)
def convert(self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context") -> DataContainer:
    (dc,) = self.assert_inputs(rel, 1, context)
    df = dc.df
    cc = dc.column_container

    # All the logic is handled in the RexConverter;
    # we just need to apply it here
    condition = rel.getCondition()
    df_condition = RexConverter.convert(condition, dc, context=context)
    df = filter_or_scalar(df, df_condition)

    cc = self.fix_column_to_row_type(cc, rel.getRowType())
    # No column type has changed, so no need to convert again
    return DataContainer(df, cc)
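# filter_or_scalar exists because the converted condition can either be a
# boolean dask series (the usual case) or a plain scalar, e.g. for WHERE TRUE.
# A hedged sketch of that contract, written from the call sites above rather
# than from the real helper's source:

import numpy as np

def filter_or_scalar_sketch(df, condition):
    if np.isscalar(condition):
        # A constant predicate keeps everything or nothing
        return df if condition else df.head(0, npartitions=-1, compute=False)
    # Otherwise the condition is a boolean series aligned with df
    return df[condition]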
def convert(
    self,
    rex: "org.apache.calcite.rex.RexNode",
    dc: DataContainer,
    context: "dask_sql.Context",
) -> SeriesOrScalar:
    # Prepare the operands by turning the RexNodes into python expressions
    operands = [
        RexConverter.convert(o, dc, context=context) for o in rex.getOperands()
    ]

    # Now use the operator name in the mapping
    schema_name, operator_name = context.fqn(rex.getOperator().getNameAsId())
    if special_op := check_special_operator(rex.getOperator()):
        operator_name = special_op
def __init__(self):
    """
    Create a new context.
    """
    # Storage for the registered tables
    self.tables = {}

    # Storage for the registered functions
    self.functions: Dict[str, Callable] = {}
    self.function_list: List[FunctionDescription] = []

    # Storage for the registered aggregations
    self.aggregations = {}

    # Name of the root schema (not changeable so far)
    self.schema_name = "schema"

    # Register any default plugins, if nothing was registered before.
    RelConverter.add_plugin_class(logical.LogicalAggregatePlugin, replace=False)
    RelConverter.add_plugin_class(logical.LogicalFilterPlugin, replace=False)
    RelConverter.add_plugin_class(logical.LogicalJoinPlugin, replace=False)
    RelConverter.add_plugin_class(logical.LogicalProjectPlugin, replace=False)
    RelConverter.add_plugin_class(logical.LogicalSortPlugin, replace=False)
    RelConverter.add_plugin_class(logical.LogicalTableScanPlugin, replace=False)
    RelConverter.add_plugin_class(logical.LogicalUnionPlugin, replace=False)
    RelConverter.add_plugin_class(logical.LogicalValuesPlugin, replace=False)
    RelConverter.add_plugin_class(custom.CreateAsPlugin, replace=False)
    RelConverter.add_plugin_class(custom.CreateTablePlugin, replace=False)
    RelConverter.add_plugin_class(custom.ShowColumnsPlugin, replace=False)
    RelConverter.add_plugin_class(custom.ShowSchemasPlugin, replace=False)
    RelConverter.add_plugin_class(custom.ShowTablesPlugin, replace=False)
    RexConverter.add_plugin_class(core.RexCallPlugin, replace=False)
    RexConverter.add_plugin_class(core.RexInputRefPlugin, replace=False)
    RexConverter.add_plugin_class(core.RexLiteralPlugin, replace=False)
def convert(self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context") -> DataContainer:
    # Joining is a bit more complicated, so let's do it in steps:

    # 1. We now have two inputs (from left and right), so we fetch them both
    dc_lhs, dc_rhs = self.assert_inputs(rel, 2, context)
    cc_lhs = dc_lhs.column_container
    cc_rhs = dc_rhs.column_container

    # 2. dask's merge will do some smart things with columns, which have the same name
    # on lhs and rhs (which also includes reordering).
    # However, that will confuse our column numbering in SQL.
    # So we make our life easier by converting the column names into unique names.
    # We will convert back in the end.
    cc_lhs_renamed = cc_lhs.make_unique("lhs")
    cc_rhs_renamed = cc_rhs.make_unique("rhs")

    dc_lhs_renamed = DataContainer(dc_lhs.df, cc_lhs_renamed)
    dc_rhs_renamed = DataContainer(dc_rhs.df, cc_rhs_renamed)

    df_lhs_renamed = dc_lhs_renamed.assign()
    df_rhs_renamed = dc_rhs_renamed.assign()

    join_type = rel.getJoinType()
    join_type = self.JOIN_TYPE_MAPPING[str(join_type)]

    # 3. The join condition can have two forms that we can understand:
    # (a) a = b
    # (b) X AND Y AND a = b AND Z ... (can also be multiple a = b)
    # The first case is very simple and we do not need any additional filter.
    # In the second case we do a merge on all the a = b,
    # and then apply a filter using the other expressions.
    # In all other cases, we need to do a full table cross join and filter afterwards.
    # This is probably nonsense for large tables, but there is no other
    # known solution so far.
    join_condition = rel.getCondition()
    lhs_on, rhs_on, filter_condition = self._split_join_condition(join_condition)

    logger.debug(f"Joining with type {join_type} on columns {lhs_on}, {rhs_on}.")

    # lhs_on and rhs_on are the indices of the columns to merge on.
    # The given column indices are for the full, merged table which consists
    # of lhs and rhs put side-by-side (in this order).
    # We therefore need to normalize the rhs indices relative to the rhs table.
    rhs_on = [index - len(df_lhs_renamed.columns) for index in rhs_on]

    # 4. dask can only merge on the same column names.
    # We therefore create new columns on purpose, which have a distinct name.
    assert len(lhs_on) == len(rhs_on)
    if lhs_on:
        # 5. Now we can finally merge on these columns.
        # The resulting dataframe will contain all (renamed) columns from the lhs and rhs
        # plus the added columns
        df = self._join_on_columns(
            df_lhs_renamed,
            df_rhs_renamed,
            lhs_on,
            rhs_on,
            join_type,
        )
    else:
        # 5. We are in the complex join case
        # where we have no column to merge on.
        # This means we have no other chance than to merge
        # everything with everything...

        # TODO: we should implement a shortcut
        # for filter conditions that are always false

        def merge_single_partitions(lhs_partition, rhs_partition):
            # Do a cross join with the two partitions
            # TODO: it would be nice to apply the filter already here
            # problem: this would mean we need to ship the rex to the
            # workers (as this is executed on the workers),
            # which is definitely not possible (java dependency, JVM start...)
            lhs_partition = lhs_partition.assign(common=1)
            rhs_partition = rhs_partition.assign(common=1)

            return lhs_partition.merge(rhs_partition, on="common").drop(
                columns="common"
            )

        # Iterate nested over all partitions from lhs and rhs and merge them
        name = "cross-join-" + tokenize(df_lhs_renamed, df_rhs_renamed)
        dsk = {
            (name, i * df_rhs_renamed.npartitions + j): (
                merge_single_partitions,
                (df_lhs_renamed._name, i),
                (df_rhs_renamed._name, j),
            )
            for i in range(df_lhs_renamed.npartitions)
            for j in range(df_rhs_renamed.npartitions)
        }

        graph = HighLevelGraph.from_collections(
            name, dsk, dependencies=[df_lhs_renamed, df_rhs_renamed]
        )

        meta = dd.dispatch.concat(
            [df_lhs_renamed._meta_nonempty, df_rhs_renamed._meta_nonempty], axis=1
        )
        # TODO: Do we know the divisions in any way here?
        divisions = [None] * (len(dsk) + 1)
        df = dd.DataFrame(graph, name, meta=meta, divisions=divisions)

        warnings.warn(
            "Need to do a cross-join, which is typically very resource heavy",
            ResourceWarning,
        )

    # 6. So the next step is to make sure
    # we have the correct column order (and to remove the temporary join columns)
    correct_column_order = list(df_lhs_renamed.columns) + list(df_rhs_renamed.columns)
    cc = ColumnContainer(df.columns).limit_to(correct_column_order)

    # and to rename them like the rel specifies
    row_type = rel.getRowType()
    field_specifications = [str(f) for f in row_type.getFieldNames()]
    cc = cc.rename(
        {
            from_col: to_col
            for from_col, to_col in zip(cc.columns, field_specifications)
        }
    )
    cc = self.fix_column_to_row_type(cc, row_type)
    dc = DataContainer(df, cc)

    # 7. Last but not least we apply any filters by and-chaining together the filters
    if filter_condition:
        # This is a bit of code duplication with RexCallPlugin - but I guess it is worth keeping it separate
        filter_condition = reduce(
            operator.and_,
            [
                RexConverter.convert(rex, dc, context=context)
                for rex in filter_condition
            ],
        )
        logger.debug(f"Additionally applying filter {filter_condition}")
        df = filter_or_scalar(df, filter_condition)
        dc = DataContainer(df, cc)

    dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
    return dc
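# The else-branch above builds the cross join "by hand": one task per
# (lhs partition, rhs partition) pair, stitched together with a HighLevelGraph.
# A minimal standalone version of that pattern with toy data (the same mechanics;
# note this relies on the classic dask.dataframe internals, not the newer
# query-planning backend):

import pandas as pd
import dask.dataframe as dd
from dask.base import tokenize
from dask.highlevelgraph import HighLevelGraph

left = dd.from_pandas(pd.DataFrame({"x": [1, 2, 3]}), npartitions=2)
right = dd.from_pandas(pd.DataFrame({"y": ["a", "b"]}), npartitions=1)

def cross_merge(lhs_part, rhs_part):
    # Cross join two pandas partitions via a constant key column
    return (
        lhs_part.assign(common=1)
        .merge(rhs_part.assign(common=1), on="common")
        .drop(columns="common")
    )

name = "cross-join-" + tokenize(left, right)
dsk = {
    (name, i * right.npartitions + j): (cross_merge, (left._name, i), (right._name, j))
    for i in range(left.npartitions)
    for j in range(right.npartitions)
}
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[left, right])
meta = cross_merge(left._meta_nonempty, right._meta_nonempty)
crossed = dd.DataFrame(graph, name, meta=meta, divisions=[None] * (len(dsk) + 1))
# crossed.compute() has len(left) * len(right) == 6 rows.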
def convert(self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context") -> DataContainer:
    # Joining is a bit more complicated, so let's do it in steps:

    # 1. We now have two inputs (from left and right), so we fetch them both
    dc_lhs, dc_rhs = self.assert_inputs(rel, 2, context)
    cc_lhs = dc_lhs.column_container
    cc_rhs = dc_rhs.column_container

    # 2. dask's merge will do some smart things with columns, which have the same name
    # on lhs and rhs (which also includes reordering).
    # However, that will confuse our column numbering in SQL.
    # So we make our life easier by converting the column names into unique names.
    # We will convert back in the end.
    cc_lhs_renamed = cc_lhs.make_unique("lhs")
    cc_rhs_renamed = cc_rhs.make_unique("rhs")

    dc_lhs_renamed = DataContainer(dc_lhs.df, cc_lhs_renamed)
    dc_rhs_renamed = DataContainer(dc_rhs.df, cc_rhs_renamed)

    df_lhs_renamed = dc_lhs_renamed.assign()
    df_rhs_renamed = dc_rhs_renamed.assign()

    join_type = rel.getJoinType()
    join_type = self.JOIN_TYPE_MAPPING[str(join_type)]

    # 3. The join condition can have two forms that we can understand:
    # (a) a = b
    # (b) X AND Y AND a = b AND Z ... (can also be multiple a = b)
    # The first case is very simple and we do not need any additional filter.
    # In the second case we do a merge on all the a = b,
    # and then apply a filter using the other expressions.
    # In all other cases, we need to do a full table cross join and filter afterwards.
    # This is probably nonsense for large tables, but there is no other
    # known solution so far.
    join_condition = rel.getCondition()
    lhs_on, rhs_on, filter_condition = self._split_join_condition(join_condition)

    logger.debug(f"Joining with type {join_type} on columns {lhs_on}, {rhs_on}.")

    # lhs_on and rhs_on are the indices of the columns to merge on.
    # The given column indices are for the full, merged table which consists
    # of lhs and rhs put side-by-side (in this order).
    # We therefore need to normalize the rhs indices relative to the rhs table.
    rhs_on = [index - len(df_lhs_renamed.columns) for index in rhs_on]

    # 4. dask can only merge on the same column names.
    # We therefore create new columns on purpose, which have a distinct name.
    assert len(lhs_on) == len(rhs_on)
    if lhs_on:
        lhs_columns_to_add = {
            f"common_{i}": df_lhs_renamed.iloc[:, index]
            for i, index in enumerate(lhs_on)
        }
        rhs_columns_to_add = {
            f"common_{i}": df_rhs_renamed.iloc[:, index]
            for i, index in enumerate(rhs_on)
        }
    else:
        # We are in the complex join case
        # where we have no column to merge on.
        # This means we have no other chance than to merge
        # everything with everything...

        # We add a 1-column to merge on
        lhs_columns_to_add = {"common": 1}
        rhs_columns_to_add = {"common": 1}

        warnings.warn(
            "Need to do a cross-join, which is typically very resource heavy",
            ResourceWarning,
        )

    df_lhs_with_tmp = df_lhs_renamed.assign(**lhs_columns_to_add)
    df_rhs_with_tmp = df_rhs_renamed.assign(**rhs_columns_to_add)
    added_columns = list(lhs_columns_to_add.keys())

    # 5. Now we can finally merge on these columns.
    # The resulting dataframe will contain all (renamed) columns from the lhs and rhs
    # plus the added columns
    df = dd.merge(df_lhs_with_tmp, df_rhs_with_tmp, on=added_columns, how=join_type)

    # 6. So the next step is to make sure
    # we have the correct column order (and to remove the temporary join columns)
    correct_column_order = list(df_lhs_renamed.columns) + list(df_rhs_renamed.columns)
    cc = ColumnContainer(df.columns).limit_to(correct_column_order)

    # and to rename them like the rel specifies
    row_type = rel.getRowType()
    field_specifications = [str(f) for f in row_type.getFieldNames()]
    cc = cc.rename(
        {
            from_col: to_col
            for from_col, to_col in zip(cc.columns, field_specifications)
        }
    )
    cc = self.fix_column_to_row_type(cc, rel.getRowType())
    dc = DataContainer(df, cc)

    # 7. Last but not least we apply any filters by and-chaining together the filters
    if filter_condition:
        # This is a bit of code duplication with RexCallPlugin - but I guess it is worth keeping it separate
        filter_condition = reduce(
            operator.and_,
            [
                RexConverter.convert(rex, dc, context=context)
                for rex in filter_condition
            ],
        )
        logger.debug(f"Additionally applying filter {filter_condition}")
        df = filter_or_scalar(df, filter_condition)
        dc = DataContainer(df, cc)

    dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
    return dc
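# Steps 4 and 5 boil down to one trick: dask can only merge on equally named
# columns, so both sides get temporary "common_i" columns (or a constant
# "common" column for a cross join). A toy version of the same idea with
# made-up column names:

import pandas as pd
import dask.dataframe as dd

left = dd.from_pandas(pd.DataFrame({"lhs_id": [1, 2, 3], "a": [10, 20, 30]}), npartitions=1)
right = dd.from_pandas(pd.DataFrame({"rhs_id": [2, 3, 4], "b": ["x", "y", "z"]}), npartitions=1)

# Equi-join on lhs_id = rhs_id without renaming the original columns:
left_tmp = left.assign(common_0=left["lhs_id"])
right_tmp = right.assign(common_0=right["rhs_id"])
joined = dd.merge(left_tmp, right_tmp, on=["common_0"], how="inner")
# joined keeps lhs_id, a, rhs_id, b plus the temporary common_0 column,
# which a later step drops again.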
def __init__(self):
    """
    Create a new context.
    """
    # Name of the root schema
    self.schema_name = self.DEFAULT_SCHEMA_NAME
    # All schema information
    self.schema = {self.schema_name: SchemaContainer(self.schema_name)}

    # A started SQL server (useful for jupyter notebooks)
    self.sql_server = None

    # Register any default plugins, if nothing was registered before.
    RelConverter.add_plugin_class(logical.DaskAggregatePlugin, replace=False)
    RelConverter.add_plugin_class(logical.DaskFilterPlugin, replace=False)
    RelConverter.add_plugin_class(logical.DaskJoinPlugin, replace=False)
    RelConverter.add_plugin_class(logical.DaskLimitPlugin, replace=False)
    RelConverter.add_plugin_class(logical.DaskProjectPlugin, replace=False)
    RelConverter.add_plugin_class(logical.DaskSortPlugin, replace=False)
    RelConverter.add_plugin_class(logical.DaskTableScanPlugin, replace=False)
    RelConverter.add_plugin_class(logical.DaskUnionPlugin, replace=False)
    RelConverter.add_plugin_class(logical.DaskValuesPlugin, replace=False)
    RelConverter.add_plugin_class(logical.DaskWindowPlugin, replace=False)
    RelConverter.add_plugin_class(logical.SamplePlugin, replace=False)
    RelConverter.add_plugin_class(custom.AnalyzeTablePlugin, replace=False)
    RelConverter.add_plugin_class(custom.CreateExperimentPlugin, replace=False)
    RelConverter.add_plugin_class(custom.CreateModelPlugin, replace=False)
    RelConverter.add_plugin_class(custom.CreateSchemaPlugin, replace=False)
    RelConverter.add_plugin_class(custom.CreateTableAsPlugin, replace=False)
    RelConverter.add_plugin_class(custom.CreateTablePlugin, replace=False)
    RelConverter.add_plugin_class(custom.DropModelPlugin, replace=False)
    RelConverter.add_plugin_class(custom.DropSchemaPlugin, replace=False)
    RelConverter.add_plugin_class(custom.DropTablePlugin, replace=False)
    RelConverter.add_plugin_class(custom.ExportModelPlugin, replace=False)
    RelConverter.add_plugin_class(custom.PredictModelPlugin, replace=False)
    RelConverter.add_plugin_class(custom.ShowColumnsPlugin, replace=False)
    RelConverter.add_plugin_class(custom.ShowModelParamsPlugin, replace=False)
    RelConverter.add_plugin_class(custom.ShowModelsPlugin, replace=False)
    RelConverter.add_plugin_class(custom.ShowSchemasPlugin, replace=False)
    RelConverter.add_plugin_class(custom.ShowTablesPlugin, replace=False)
    RelConverter.add_plugin_class(custom.SwitchSchemaPlugin, replace=False)
    RelConverter.add_plugin_class(custom.AlterSchemaPlugin, replace=False)
    RelConverter.add_plugin_class(custom.AlterTablePlugin, replace=False)
    RelConverter.add_plugin_class(custom.DistributeByPlugin, replace=False)
    RexConverter.add_plugin_class(core.RexCallPlugin, replace=False)
    RexConverter.add_plugin_class(core.RexInputRefPlugin, replace=False)
    RexConverter.add_plugin_class(core.RexLiteralPlugin, replace=False)
    InputUtil.add_plugin_class(input_utils.DaskInputPlugin, replace=False)
    InputUtil.add_plugin_class(input_utils.PandasLikeInputPlugin, replace=False)
    InputUtil.add_plugin_class(input_utils.HiveInputPlugin, replace=False)
    InputUtil.add_plugin_class(input_utils.IntakeCatalogInputPlugin, replace=False)
    InputUtil.add_plugin_class(input_utils.SqlalchemyHiveInputPlugin, replace=False)
    # needs to be the last entry, as it only checks for string
    InputUtil.add_plugin_class(input_utils.LocationInputPlugin, replace=False)
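# Every add_plugin_class(..., replace=False) call above only installs a default
# plugin if nothing has been registered for that name yet, so user-registered
# plugins always win. A minimal sketch of such a registry (illustrative only,
# not dask_sql's actual Pluggable base class):

class PluginRegistry:
    def __init__(self):
        self._plugins = {}

    def add_plugin_class(self, plugin_class, replace=True):
        name = getattr(plugin_class, "class_name", plugin_class.__name__)
        if not replace and name in self._plugins:
            return  # keep the already registered (possibly user-supplied) plugin
        self._plugins[name] = plugin_class()

    def get_plugin(self, name):
        return self._plugins[name]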