def test_join_tables_plan_default_namespace(self):
    """Two bare table names in a join both resolve against the default namespace 'int'."""
    query = Select(
        targets=[Identifier('tab1.column1'), Identifier('tab2.column1'), Identifier('tab2.column2')],
        from_table=Join(
            left=Identifier('tab1'),
            right=Identifier('tab2'),
            condition=BinaryOperation(op='=',
                                      args=[Identifier('tab1.column1'), Identifier('tab2.column1')]),
            join_type=JoinType.INNER_JOIN,
        ),
    )

    expected_steps = [
        FetchDataframeStep(integration='int',
                           query=Select(targets=[Star()], from_table=Identifier('tab1'))),
        FetchDataframeStep(integration='int',
                           query=Select(targets=[Star()], from_table=Identifier('tab2'))),
        JoinStep(left=Result(0), right=Result(1),
                 query=Join(
                     left=Identifier('tab1'),
                     right=Identifier('tab2'),
                     condition=BinaryOperation(op='=',
                                               args=[Identifier('tab1.column1'),
                                                     Identifier('tab2.column1')]),
                     join_type=JoinType.INNER_JOIN,
                 )),
        ProjectStep(dataframe=Result(2),
                    columns=[Identifier('tab1.column1'), Identifier('tab2.column1'),
                             Identifier('tab2.column2')]),
    ]
    expected_plan = QueryPlan(integrations=['int'], default_namespace='int', steps=expected_steps)

    plan = plan_query(query, integrations=['int'], default_namespace='int')
    assert plan.steps == expected_plan.steps
def test_join_predictor_plan_default_namespace_predictor(self):
    """A bare predictor name in a join resolves to the default (predictor) namespace 'mindsdb'."""
    query = Select(
        targets=[Identifier('tab1.column1'), Identifier('pred.predicted')],
        from_table=Join(left=Identifier('int.tab1'),
                        right=Identifier('pred'),
                        join_type=JoinType.INNER_JOIN,
                        implicit=True),
    )
    expected_plan = QueryPlan(
        default_namespace='mindsdb',
        steps=[
            FetchDataframeStep(integration='int',
                               query=Select(targets=[Star()], from_table=Identifier('tab1'))),
            ApplyPredictorStep(namespace='mindsdb', dataframe=Result(0),
                               predictor=Identifier('pred')),
            JoinStep(left=Result(0), right=Result(1),
                     query=Join(left=Identifier('result_0', alias=Identifier('tab1')),
                                right=Identifier('result_1', alias=Identifier('pred')),
                                join_type=JoinType.INNER_JOIN)),
            ProjectStep(dataframe=Result(2),
                        columns=[Identifier('tab1.column1'), Identifier('pred.predicted')]),
        ],
    )
    plan = plan_query(query, integrations=['int'], predictor_namespace='mindsdb',
                      default_namespace='mindsdb', predictor_metadata={'pred': {}})

    # Fix: assert the step counts match first. The original per-index loop over
    # plan.steps silently passed when the plan produced FEWER steps than expected.
    assert len(plan.steps) == len(expected_plan.steps)
    # Per-step comparison is kept (rather than comparing whole lists) so a failure
    # points at the first diverging step.
    for actual_step, expected_step in zip(plan.steps, expected_plan.steps):
        assert actual_step == expected_step
def test_join_tables_disambiguate_identifiers_in_condition(self):
    """An integration prefix on a join-condition column is stripped during planning."""
    query = Select(
        targets=[Identifier('tab1.column1'), Identifier('tab2.column1'), Identifier('tab2.column2')],
        from_table=Join(
            left=Identifier('int.tab1'),
            right=Identifier('int.tab2'),
            condition=BinaryOperation(
                op='=',
                args=[Identifier('int.tab1.column1'),  # integration name included
                      Identifier('tab2.column1')]),
            join_type=JoinType.INNER_JOIN,
        ),
    )

    plan = plan_query(query, integrations=['int'])

    expected_steps = [
        FetchDataframeStep(integration='int',
                           query=Select(targets=[Star()], from_table=Identifier('tab1'))),
        FetchDataframeStep(integration='int',
                           query=Select(targets=[Star()], from_table=Identifier('tab2'))),
        JoinStep(left=Result(0), right=Result(1),
                 query=Join(
                     left=Identifier('tab1'),
                     right=Identifier('tab2'),
                     condition=BinaryOperation(
                         op='=',
                         args=[Identifier('tab1.column1'),  # integration name gets stripped out
                               Identifier('tab2.column1')]),
                     join_type=JoinType.INNER_JOIN)),
        ProjectStep(dataframe=Result(2),
                    columns=[Identifier('tab1.column1'), Identifier('tab2.column1'),
                             Identifier('tab2.column2')]),
    ]
    expected_plan = QueryPlan(integrations=['int'], steps=expected_steps)

    assert plan.steps == expected_plan.steps
def test_join_predictor_plan_order_by(self):
    """WHERE/ORDER BY/LIMIT/OFFSET are pushed down into the table fetch of a predictor join."""
    query = Select(
        targets=[Identifier('tab.column1'), Identifier('pred.predicted')],
        from_table=Join(left=Identifier('int.tab'),
                        right=Identifier('mindsdb.pred'),
                        join_type=JoinType.INNER_JOIN,
                        implicit=True),
        where=BinaryOperation('=', args=[Identifier('tab.product_id'), Constant('x')]),
        limit=Constant(10),
        offset=Constant(15),
        order_by=[OrderBy(field=Identifier('tab.column1'))],
    )

    # The whole where/order/limit/offset set is expected inside the fetch query.
    fetch_step = FetchDataframeStep(
        integration='int',
        query=Select(
            targets=[Star()],
            from_table=Identifier('tab'),
            where=BinaryOperation('=', args=[Identifier('tab.product_id'), Constant('x')]),
            limit=Constant(10),
            offset=Constant(15),
            order_by=[OrderBy(field=Identifier('tab.column1'))],
        ),
    )
    expected_plan = QueryPlan(
        steps=[
            fetch_step,
            ApplyPredictorStep(namespace='mindsdb', dataframe=Result(0),
                               predictor=Identifier('pred')),
            JoinStep(left=Result(0), right=Result(1),
                     query=Join(left=Identifier('result_0', alias=Identifier('tab')),
                                right=Identifier('result_1', alias=Identifier('pred')),
                                join_type=JoinType.INNER_JOIN)),
            ProjectStep(dataframe=Result(2),
                        columns=[Identifier('tab.column1'), Identifier('pred.predicted')]),
        ],
    )

    plan = plan_query(query, integrations=['int'], predictor_namespace='mindsdb',
                      predictor_metadata={'pred': {}})
    assert plan.steps == expected_plan.steps
def test_plan_union_queries(self):
    """UNION ALL of a plain integration select and a predictor join: each branch is
    planned independently and the results merged with a UnionStep."""
    query1 = Select(
        targets=[Identifier('column1'), Constant(None, alias=Identifier('predicted'))],
        from_table=Identifier('int.tab'),
        where=BinaryOperation('and', args=[
            BinaryOperation('=', args=[Identifier('column1'), Identifier('column2')]),
            BinaryOperation('>', args=[Identifier('column3'), Constant(0)]),
        ]))
    query2 = Select(
        targets=[Identifier('tab1.column1'),
                 Identifier('pred.predicted', alias=Identifier('predicted'))],
        from_table=Join(left=Identifier('int.tab1'),
                        right=Identifier('mindsdb.pred'),
                        join_type=JoinType.INNER_JOIN,
                        implicit=True))
    query = Union(left=query1, right=query2, unique=False)

    expected_plan = QueryPlan(
        steps=[
            # Query 1
            FetchDataframeStep(
                integration='int',
                query=Select(
                    targets=[Identifier('tab.column1', alias=Identifier('column1')),
                             Constant(None, alias=Identifier('predicted'))],
                    from_table=Identifier('tab'),
                    where=BinaryOperation('and', args=[
                        BinaryOperation('=', args=[Identifier('tab.column1'),
                                                   Identifier('tab.column2')]),
                        BinaryOperation('>', args=[Identifier('tab.column3'),
                                                   Constant(0)]),
                    ]))),
            # Query 2
            FetchDataframeStep(integration='int',
                               query=Select(targets=[Star()], from_table=Identifier('tab1'))),
            ApplyPredictorStep(namespace='mindsdb', dataframe=Result(1),
                               predictor=Identifier('pred')),
            JoinStep(left=Result(1), right=Result(2),
                     query=Join(left=Identifier('result_1', alias=Identifier('tab1')),
                                right=Identifier('result_2', alias=Identifier('pred')),
                                join_type=JoinType.INNER_JOIN)),
            ProjectStep(dataframe=Result(3),
                        columns=[Identifier('tab1.column1'),
                                 Identifier('pred.predicted', alias=Identifier('predicted'))]),
            # Union
            UnionStep(left=Result(0), right=Result(4), unique=False),
        ],
    )

    plan = plan_query(query, integrations=['int'], predictor_namespace='mindsdb',
                      predictor_metadata={'pred': {}})

    # Fix: compare step counts first. The original loop indexed by len(plan.steps)
    # and never noticed when expected steps beyond that length were missing.
    assert len(plan.steps) == len(expected_plan.steps)
    for actual_step, expected_step in zip(plan.steps, expected_plan.steps):
        assert actual_step == expected_step
def test_join_tables_plan_groupby(self):
    """GROUP BY / HAVING on a two-table join become a GroupByStep and a FilterStep after the JoinStep."""
    sum_target = Function('sum', args=[Identifier('tab2.column2')], alias=Identifier('total'))
    query = Select(
        targets=[Identifier('tab1.column1'), Identifier('tab2.column1'), sum_target],
        from_table=Join(left=Identifier('int.tab1'),
                        right=Identifier('int.tab2'),
                        condition=BinaryOperation(op='=',
                                                  args=[Identifier('tab1.column1'),
                                                        Identifier('tab2.column1')]),
                        join_type=JoinType.INNER_JOIN),
        group_by=[Identifier('tab1.column1'), Identifier('tab2.column1')],
        having=BinaryOperation(op='=', args=[Identifier('tab1.column1'), Constant(0)]),
    )

    plan = plan_query(query, integrations=['int'])

    expected_steps = [
        FetchDataframeStep(integration='int',
                           query=Select(targets=[Star()], from_table=Identifier('tab1'))),
        FetchDataframeStep(integration='int',
                           query=Select(targets=[Star()], from_table=Identifier('tab2'))),
        JoinStep(left=Result(0), right=Result(1),
                 query=Join(left=Identifier('tab1'),
                            right=Identifier('tab2'),
                            condition=BinaryOperation(op='=',
                                                      args=[Identifier('tab1.column1'),
                                                            Identifier('tab2.column1')]),
                            join_type=JoinType.INNER_JOIN)),
        # grouping targets carry no aliases; the alias reappears in the projection
        GroupByStep(dataframe=Result(2),
                    targets=[Identifier('tab1.column1'), Identifier('tab2.column1'),
                             Function('sum', args=[Identifier('tab2.column2')])],
                    columns=[Identifier('tab1.column1'), Identifier('tab2.column1')]),
        FilterStep(dataframe=Result(3),
                   query=BinaryOperation(op='=', args=[Identifier('tab1.column1'), Constant(0)])),
        ProjectStep(dataframe=Result(4),
                    columns=[Identifier('tab1.column1'), Identifier('tab2.column1'),
                             Function(op='sum', args=[Identifier('tab2.column2')],
                                      alias=Identifier('total'))]),
    ]
    expected_plan = QueryPlan(integrations=['int'], steps=expected_steps)

    assert plan.steps == expected_plan.steps
def plan_join_two_tables(self, join):
    """Plan a join of two integration tables.

    Fetches each side with a ``SELECT * FROM <table>`` step, rewrites the join
    condition so column identifiers are qualified relative to their table
    (integration prefixes stripped), and emits a JoinStep over the two results.

    :param join: Join AST node whose ``left``/``right`` are integration table Identifiers
    :return: the JoinStep added to the plan
    :raises PlanningException: if a condition column can't be matched to either table
    """
    select_left_step = self.plan_integration_select(
        Select(targets=[Star()], from_table=join.left))
    select_right_step = self.plan_integration_select(
        Select(targets=[Star()], from_table=join.right))

    left_integration_name, left_table = self.get_integration_path_from_identifier_or_error(
        join.left)
    right_integration_name, right_table = self.get_integration_path_from_identifier_or_error(
        join.right)

    left_table_path = left_table.to_string(alias=False)
    right_table_path = right_table.to_string(alias=False)

    new_join = copy.deepcopy(join)

    # Fix: a join may carry no ON condition (e.g. an implicit/cross join);
    # the original code dereferenced join.condition.args unconditionally and
    # raised AttributeError in that case.
    if join.condition is not None:
        new_condition_args = []
        for arg in join.condition.args:
            if isinstance(arg, Identifier):
                # Attribute each condition column to the table whose name appears
                # among its parts, then re-qualify it for that integration.
                if left_table_path in arg.parts:
                    new_condition_args.append(
                        disambiguate_integration_column_identifier(
                            arg, left_integration_name, left_table))
                elif right_table_path in arg.parts:
                    new_condition_args.append(
                        disambiguate_integration_column_identifier(
                            arg, right_integration_name, right_table))
                else:
                    raise PlanningException(
                        f'Wrong table or no source table in join condition for column: {str(arg)}'
                    )
            else:
                new_condition_args.append(arg)
        new_join.condition.args = new_condition_args

    new_join.left = Identifier(left_table_path, alias=left_table.alias)
    new_join.right = Identifier(right_table_path, alias=right_table.alias)

    # FIXME: INFORMATION_SCHEMA with condition
    # clear join condition for INFORMATION_SCHEMA
    if right_integration_name == 'INFORMATION_SCHEMA':
        new_join.condition = None

    return self.plan.add_step(
        JoinStep(left=select_left_step.result,
                 right=select_right_step.result,
                 query=new_join))
def test_join_tables_plan_order_by(self):
    """ORDER BY and LIMIT/OFFSET on a two-table join run after the JoinStep."""
    query = Select(
        targets=[Identifier('tab1.column1'), Identifier('tab2.column1'), Identifier('tab2.column2')],
        from_table=Join(left=Identifier('int.tab1'),
                        right=Identifier('int.tab2'),
                        condition=BinaryOperation(op='=',
                                                  args=[Identifier('tab1.column1'),
                                                        Identifier('tab2.column1')]),
                        join_type=JoinType.INNER_JOIN),
        limit=Constant(10),
        offset=Constant(15),
        order_by=[OrderBy(field=Identifier('tab1.column1'))],
    )

    plan = plan_query(query, integrations=['int'])

    expected_join = Join(left=Identifier('tab1'),
                         right=Identifier('tab2'),
                         condition=BinaryOperation(op='=',
                                                   args=[Identifier('tab1.column1'),
                                                         Identifier('tab2.column1')]),
                         join_type=JoinType.INNER_JOIN)
    expected_plan = QueryPlan(
        integrations=['int'],
        steps=[
            FetchDataframeStep(integration='int',
                               query=Select(targets=[Star()], from_table=Identifier('tab1'))),
            FetchDataframeStep(integration='int',
                               query=Select(targets=[Star()], from_table=Identifier('tab2'))),
            JoinStep(left=Result(0), right=Result(1), query=expected_join),
            OrderByStep(dataframe=Result(2), order_by=[OrderBy(field=Identifier('tab1.column1'))]),
            LimitOffsetStep(dataframe=Result(3), limit=10, offset=15),
            ProjectStep(dataframe=Result(4),
                        columns=[Identifier('tab1.column1'), Identifier('tab2.column1'),
                                 Identifier('tab2.column2')]),
        ],
    )
    assert plan.steps == expected_plan.steps
def test_join_tables_where_plan(self):
    """WHERE on a two-table join becomes a FilterStep over the joined dataframe."""

    def make_where():
        # Build a fresh clause tree each time so query and expectation hold
        # distinct (but structurally equal) objects, as in the original test.
        return BinaryOperation('and', args=[
            BinaryOperation('and', args=[
                BinaryOperation('=', args=[Identifier('tab1.column1'), Constant(1)]),
                BinaryOperation('=', args=[Identifier('tab2.column1'), Constant(0)]),
            ]),
            BinaryOperation('=', args=[Identifier('tab1.column3'), Identifier('tab2.column3')]),
        ])

    query = Select(
        targets=[Identifier('tab1.column1'), Identifier('tab2.column1'), Identifier('tab2.column2')],
        from_table=Join(left=Identifier('int.tab1'),
                        right=Identifier('int.tab2'),
                        condition=BinaryOperation(op='=',
                                                  args=[Identifier('tab1.column1'),
                                                        Identifier('tab2.column1')]),
                        join_type=JoinType.INNER_JOIN),
        where=make_where(),
    )

    plan = plan_query(query, integrations=['int'])

    expected_plan = QueryPlan(
        integrations=['int'],
        steps=[
            FetchDataframeStep(integration='int',
                               query=Select(targets=[Star()], from_table=Identifier('tab1'))),
            FetchDataframeStep(integration='int',
                               query=Select(targets=[Star()], from_table=Identifier('tab2'))),
            JoinStep(left=Result(0), right=Result(1),
                     query=Join(left=Identifier('tab1'),
                                right=Identifier('tab2'),
                                condition=BinaryOperation(op='=',
                                                          args=[Identifier('tab1.column1'),
                                                                Identifier('tab2.column1')]),
                                join_type=JoinType.INNER_JOIN)),
            FilterStep(dataframe=Result(2), query=make_where()),
            ProjectStep(dataframe=Result(3),
                        columns=[Identifier('tab1.column1'), Identifier('tab2.column1'),
                                 Identifier('tab2.column2')]),
        ],
    )
    assert plan.steps == expected_plan.steps
def plan_join(self, query, integration=None):
    """Plan a JOIN query: either table JOIN table, or table JOIN predictor.

    :param query: Select whose ``from_table`` is a Join; the left side may be a
        nested Select (dbt-style), in which case its clauses are hoisted first
    :param integration: fallback integration name for dbt-created tables whose
        identifier carries no integration prefix
    :return: the final step added to the plan (projection of requested columns)
    :raises PlanningException: for joins of two predictors or unsupported operands
    """
    join = query.from_table
    join_left = join.left
    join_right = join.right

    if isinstance(join_left, Select):
        # dbt query.
        # TODO support complex query. Only one table is supported at the moment.
        if not isinstance(join_left.from_table, Identifier):
            raise PlanningException(f'Statement not supported: {query.to_string()}')

        # move properties to upper query
        query = join_left

        if query.from_table.alias is not None:
            table_alias = [query.from_table.alias.parts[0]]
        else:
            table_alias = query.from_table.parts

        def add_aliases(node, is_table, **kwargs):
            if not is_table and isinstance(node, Identifier):
                if len(node.parts) == 1:
                    # add table alias to field
                    node.parts = table_alias + node.parts

        query_traversal(query.where, add_aliases)

        if isinstance(query.from_table, Identifier):
            # DBT workaround: allow use tables without integration.
            # if table.part[0] not in integration - take integration name from create table command
            if (integration is not None
                    and query.from_table.parts[0] not in self.integrations):
                # add integration name to table
                query.from_table.parts.insert(0, integration)

        join_left = join_left.from_table

    aliased_fields = self.get_aliased_fields(query.targets)

    recursively_check_join_identifiers_for_ambiguity(query.where)
    recursively_check_join_identifiers_for_ambiguity(query.group_by, aliased_fields=aliased_fields)
    recursively_check_join_identifiers_for_ambiguity(query.having)
    recursively_check_join_identifiers_for_ambiguity(query.order_by, aliased_fields=aliased_fields)

    if isinstance(join_left, Identifier) and isinstance(join_right, Identifier):
        if self.is_predictor(join_left) and self.is_predictor(join_right):
            # Fix: the original message referenced join_left twice
            # (join_left.parts[0] / join_left.parts[1]) — it never named the right
            # predictor and parts[1] could be out of range for a one-part name.
            raise PlanningException(
                f'Can\'t join two predictors {str(join_left)} and {str(join_right)}'
            )

        predictor_namespace = None
        predictor = None
        table = None
        predictor_is_left = False
        if self.is_predictor(join_left):
            predictor_namespace, predictor = get_predictor_namespace_and_name_from_identifier(
                join_left, self.default_namespace)
            predictor_is_left = True
        else:
            table = join_left
        if self.is_predictor(join_right):
            predictor_namespace, predictor = get_predictor_namespace_and_name_from_identifier(
                join_right, self.default_namespace)
        else:
            table = join_right

        last_step = None
        if predictor:
            # One argument is a table, another is a predictor
            # Apply mindsdb model to result of last dataframe fetch
            # Then join results of applying mindsdb with table
            predictor_name = self.predictor_names[predictor.to_string(alias=False).lower()]
            if self.predictor_metadata[predictor_name].get('timeseries'):
                predictor_steps = self.plan_timeseries_predictor(
                    query, table, predictor_namespace, predictor)
            else:
                predictor_steps = self.plan_predictor(
                    query, table, predictor_namespace, predictor)

            # add join
            # Update reference
            _, table = self.get_integration_path_from_identifier_or_error(table)
            table_alias = table.alias or Identifier(
                table.to_string(alias=False).replace('.', '_'))

            left = Identifier(
                predictor_steps['predictor'].result.ref_name,
                alias=predictor.alias or Identifier(predictor.to_string(alias=False)))
            right = Identifier(predictor_steps['data'].result.ref_name, alias=table_alias)

            if not predictor_is_left:
                # swap join
                left, right = right, left
            new_join = Join(left=left, right=right, join_type=join.join_type)

            left = predictor_steps['predictor'].result
            right = predictor_steps['data'].result
            if not predictor_is_left:
                # swap join
                left, right = right, left

            last_step = self.plan.add_step(JoinStep(left=left, right=right, query=new_join))

            # limit from timeseries
            if predictor_steps.get('saved_limit'):
                last_step = self.plan.add_step(
                    LimitOffsetStep(dataframe=last_step.result,
                                    limit=predictor_steps['saved_limit']))
        else:
            # Both arguments are tables, join results of 2 dataframe fetches
            join_step = self.plan_join_two_tables(join)
            last_step = join_step

            if query.where:
                # FIXME: INFORMATION_SCHEMA with Where
                right_integration_name, _ = self.get_integration_path_from_identifier_or_error(
                    join.right)
                if right_integration_name == 'INFORMATION_SCHEMA':
                    ...
                else:
                    last_step = self.plan.add_step(
                        FilterStep(dataframe=last_step.result, query=query.where))

            if query.group_by:
                # grouping targets are the select targets with aliases stripped
                group_by_targets = []
                for t in query.targets:
                    target_copy = copy.deepcopy(t)
                    target_copy.alias = None
                    group_by_targets.append(target_copy)
                last_step = self.plan.add_step(
                    GroupByStep(dataframe=last_step.result,
                                columns=query.group_by,
                                targets=group_by_targets))

            if query.having:
                last_step = self.plan.add_step(
                    FilterStep(dataframe=last_step.result, query=query.having))

            if query.order_by:
                last_step = self.plan.add_step(
                    OrderByStep(dataframe=last_step.result, order_by=query.order_by))

            if query.limit is not None or query.offset is not None:
                limit = query.limit.value if query.limit is not None else None
                offset = query.offset.value if query.offset is not None else None
                last_step = self.plan.add_step(
                    LimitOffsetStep(dataframe=last_step.result, limit=limit, offset=offset))
    else:
        raise PlanningException(
            'Join of unsupported objects, currently only tables and predictors can be joined.'
        )
    return self.plan_project(query, last_step.result)
def test_nested_select(self):
    """Nested (Tableau-style) selects over a predictor join are planned as inner-query
    steps followed by outer projection / limit / group-by steps."""
    # for tableau
    sql = '''
        SELECT time FROM (
           select * from int.covid
           join mindsdb.pred
           limit 10
        ) `Custom SQL Query`
        limit 1
    '''
    query = parse_sql(sql, dialect='mindsdb')

    expected_plan = QueryPlan(
        default_namespace='mindsdb',
        steps=[
            FetchDataframeStep(integration='int',
                               query=parse_sql('select * from covid limit 10')),
            ApplyPredictorStep(namespace='mindsdb', dataframe=Result(0),
                               predictor=Identifier('pred')),
            JoinStep(left=Result(0), right=Result(1),
                     query=Join(left=Identifier('result_0', alias=Identifier('covid')),
                                right=Identifier('result_1', alias=Identifier('pred')),
                                join_type=JoinType.JOIN)),
            ProjectStep(dataframe=Result(2), columns=[Star()]),
            ProjectStep(dataframe=Result(3), columns=[Identifier('time')], ignore_doubles=True),
            LimitOffsetStep(dataframe=Result(4), limit=1),
        ],
    )
    plan = plan_query(query, integrations=['int'], predictor_namespace='mindsdb',
                      default_namespace='mindsdb', predictor_metadata={'pred': {}})

    # Fix: assert step counts match; the original per-index loop over plan.steps
    # silently passed when the plan had fewer steps than expected.
    assert len(plan.steps) == len(expected_plan.steps)
    for actual_step, expected_step in zip(plan.steps, expected_plan.steps):
        assert actual_step == expected_step

    sql = '''
        SELECT `time` FROM (
           select * from int.covid
           join mindsdb.pred
        ) `Custom SQL Query`
        GROUP BY 1
    '''
    query = parse_sql(sql, dialect='mindsdb')

    expected_plan = QueryPlan(
        default_namespace='mindsdb',
        steps=[
            FetchDataframeStep(integration='int', query=parse_sql('select * from covid')),
            ApplyPredictorStep(namespace='mindsdb', dataframe=Result(0),
                               predictor=Identifier('pred')),
            JoinStep(left=Result(0), right=Result(1),
                     query=Join(left=Identifier('result_0', alias=Identifier('covid')),
                                right=Identifier('result_1', alias=Identifier('pred')),
                                join_type=JoinType.JOIN)),
            ProjectStep(dataframe=Result(2), columns=[Star()]),
            GroupByStep(dataframe=Result(3), columns=[Constant(1)],
                        targets=[Identifier('time')]),
        ],
    )
    plan = plan_query(query, integrations=['int'], predictor_namespace='mindsdb',
                      default_namespace='mindsdb', predictor_metadata={'pred': {}})

    # Same length-then-per-step comparison for the second scenario.
    assert len(plan.steps) == len(expected_plan.steps)
    for actual_step, expected_step in zip(plan.steps, expected_plan.steps):
        assert actual_step == expected_step