def get_predictors(self, mindsdb_sql_query):
    """Run a query against the predictors frame and return its rows.

    The query's ``from_table.parts`` is overwritten with ``['predictors']``
    so that it addresses the frame returned by ``self._select_predictors()``,
    which is handed to dfsql under the name ``predictors``.

    Returns:
        A ``(records, columns)`` tuple: the result as a list of dicts
        (``orient='records'``) and the list of result column names. Null
        cells are replaced with ``''`` before conversion.
    """
    predictors_table = self._select_predictors()
    mindsdb_sql_query.from_table.parts = ['predictors']
    query_text = str(mindsdb_sql_query)

    def execute(sql_text):
        # Single place where the dfsql call options are spelled out.
        return dfsql.sql_query(
            sql_text,
            ds_kwargs={'case_sensitive': False},
            reduce_output=False,
            predictors=predictors_table,
        )

    # +++ FIXME https://github.com/mindsdb/dfsql/issues/37
    #           https://github.com/mindsdb/mindsdb_sql/issues/53
    if ' 1 = 0' in query_text:
        # Drop everything from the WHERE keyword on and ask for zero rows.
        where_position = query_text.lower().find('where')
        output = execute(query_text[:where_position] + ' limit 0')
    elif 'AND (1 = 1)' in query_text:
        output = execute(query_text.replace('AND (1 = 1)', ' '))
    # ---
    else:
        try:
            output = execute(query_text)
        except Exception:
            # FIXME https://github.com/mindsdb/dfsql/issues/38
            # Fall back to the unfiltered predictors frame.
            output = predictors_table

    # FIXME https://github.com/mindsdb/dfsql/issues/38
    output = output.where(pd.notnull(output), '')

    records = output.to_dict(orient='records')
    return records, list(output.columns)
def select(self, query):
    """Execute a single-table query against the information schema.

    The one table referenced by ``query`` (compared upper-cased) selects
    which ``self._get_*`` loader supplies the dataframe; the query is then
    run through dfsql with that dataframe bound to the name taken from the
    last part of ``query.from_table``.

    Returns:
        A ``(records, columns)`` tuple. When dfsql raises on the regular
        path, the exception is printed and ``([], [])`` is returned.

    Raises:
        Exception: if the query does not reference exactly one table, or
            the table is not one of the supported names.
    """
    referenced_tables = get_all_tables(query)
    if len(referenced_tables) != 1:
        raise Exception(
            f'Only one table can be used in query to information_schema: {query}'
        )

    table = referenced_tables[0].upper()

    # Loader method per supported table; resolved lazily so only the
    # requested loader is ever looked up and called.
    loader_names = {
        'TABLES': '_get_tables',
        'COLUMNS': '_get_columns',
        'SCHEMATA': '_get_schemata',
        'EVENTS': '_get_events',
        'ROUTINES': '_get_routines',
        'TRIGGERS': '_get_triggers',
        'PLUGINS': '_get_plugins',
    }
    loader_name = loader_names.get(table)
    if loader_name is None:
        raise Exception('Information schema: Not implemented.')
    dataframe = getattr(self, loader_name)()

    table_name = query.from_table.parts[-1]
    query_text = str(query)

    # region FIXME https://github.com/mindsdb/dfsql/issues/37
    #              https://github.com/mindsdb/mindsdb_sql/issues/53
    if ' 1 = 0' in query_text:
        # Cut the query at WHERE and request zero rows instead.
        shortened = query_text[:query_text.lower().find('where')] + ' limit 0'
        data = dfsql.sql_query(
            shortened,
            ds_kwargs={'case_sensitive': False},
            reduce_output=False,
            **{table_name: dataframe}
        )
    # endregion
    else:
        if table == 'TABLES':
            # Queries to TABLES are replaced wholesale by a plain select.
            query_text = 'select * from TABLES'
            table_name = 'TABLES'
        try:
            data = dfsql.sql_query(
                query_text,
                ds_kwargs={'case_sensitive': False},
                reduce_output=False,
                **{table_name: dataframe}
            )
        except Exception as e:
            print(f'Exception! {e}')
            return [], []

    return data.to_dict(orient='records'), data.columns.to_list()
def get_datasources(self, mindsdb_sql_query):
    """Run a query against the datasources frame and return its rows.

    The query's ``from_table.parts`` is overwritten with ``['datasources']``
    and the frame from ``self._select_datasources()`` is passed to dfsql
    under that name.

    Returns:
        A ``(records, columns)`` tuple: the result rows as a list of dicts
        and the list of result column names.
    """
    source_frame = self._select_datasources()
    mindsdb_sql_query.from_table.parts = ['datasources']

    query_text = str(mindsdb_sql_query)
    output = dfsql.sql_query(
        query_text,
        ds_kwargs={'case_sensitive': False},
        reduce_output=False,
        datasources=source_frame,
    )

    records = output.to_dict(orient='records')
    column_names = list(output.columns)
    return records, column_names
def _parse_query(self, sql):
    """Parse ``sql``, execute the resulting plan and store the result.

    The statement is parsed with the ``mindsdb`` dialect. Three kinds of
    query are answered directly, without planning:

    * a select of constants only (no FROM, no WHERE);
    * a query to the ``predictors`` table of the mindsdb database;
    * a query to the ``commands`` table of the mindsdb database.

    Anything else is planned with ``plan_query`` and the plan steps are
    executed in order; each step's output is appended to a local list so
    that later steps can refer to earlier ones by ``step_num``.

    Results are communicated through ``self.fetched_data`` and
    ``self.columns_list``. Each ``columns_list`` entry is a 5-tuple made of
    the 3-part table key followed by ``(column_name, column_alias)``.
    The method returns ``None`` except on the JoinStep early exits flagged
    inline.

    Args:
        sql: text of the SQL statement.

    Raises:
        Exception: for unsupported step types, unsupported JOIN forms,
            duplicated keys fed to a predictor, unresolvable or ambiguous
            column names, or a failed external datasource fetch.
    """
    # +++ FIXME https://github.com/mindsdb/mindsdb_sql/issues/53
    # A "where 1 = 0" tail is replaced by "limit 0" before parsing.
    is_crutch = False
    lowered_sql = sql.lower()
    for marker in ('where 1 = 0', 'where 1=0'):
        marker_position = lowered_sql.find(marker)
        if marker_position != -1:
            sql = sql[:marker_position] + ' limit 0'
            is_crutch = True
            break
    # ---

    mindsdb_sql_struct = parse_sql(sql, dialect='mindsdb')

    # is it query with only constants?
    if (
        mindsdb_sql_struct.from_table is None
        and mindsdb_sql_struct.where is None
        and {isinstance(x, Constant) for x in mindsdb_sql_struct.targets} == {True}
    ):
        table_name = (None, None, None)
        self.fetched_data = [{table_name: {}}]
        self.columns_list = []
        for column in mindsdb_sql_struct.targets:
            if column.alias is not None:
                alias = '.'.join(column.alias.parts)
            else:
                alias = column.value
            self.fetched_data[0][table_name][alias] = column.value
            self.columns_list.append(table_name + (alias, alias))
        return

    # is it query to 'predictors'?
    if (
        isinstance(mindsdb_sql_struct.from_table, Identifier)
        and mindsdb_sql_struct.from_table.parts[-1].lower() == 'predictors'
        and (
            self.database == 'mindsdb'
            or mindsdb_sql_struct.from_table.parts[0].lower() == 'mindsdb'
        )
    ):
        dn = self.datahub.get(self.mindsdb_database_name)
        data, columns = dn.get_predictors(mindsdb_sql_struct)
        table_name = ('mindsdb', 'predictors', 'predictors')
        self.fetched_data = [{table_name: row} for row in data]
        self.columns_list = [
            (table_name + (column_name, column_name))
            for column_name in columns
        ]
        return

    # is it query to 'commands'?
    if (
        isinstance(mindsdb_sql_struct.from_table, Identifier)
        and mindsdb_sql_struct.from_table.parts[-1].lower() == 'commands'
        and (
            self.database == 'mindsdb'
            or mindsdb_sql_struct.from_table.parts[0].lower() == 'mindsdb'
        )
    ):
        self.fetched_data = []
        self.columns_list = [('mindsdb', 'commands', 'commands', 'command', 'command')]
        return

    # Copy before appending so the list handed out by the datahub is not
    # modified in place (it may be shared between calls).
    integrations_names = list(self.datahub.get_integrations_names())
    integrations_names.append('INFORMATION_SCHEMA')
    integrations_names.append('information_schema')
    integrations_names.append('datasource')

    all_tables = get_all_tables(mindsdb_sql_struct)

    # Collect planner metadata for every referenced table that is a
    # healthy predictor of the current company.
    predictor_metadata = {}
    predictors = db.session.query(
        db.Predictor).filter_by(company_id=self.session.company_id)
    for model_name in set(all_tables):
        for p in predictors:
            if p.name == model_name:
                if isinstance(p.data, dict) and 'error' not in p.data:
                    ts_settings = p.learn_args.get('timeseries_settings', {})
                    if ts_settings.get('is_timeseries') is True:
                        window = ts_settings.get('window')
                        order_by = ts_settings.get('order_by')[0]
                        # 'group_by' may be absent: only unwrap the first
                        # column when a list was actually stored, instead
                        # of subscripting None.
                        # NOTE(review): confirm the planner accepts a
                        # group_by_column of None.
                        group_by = ts_settings.get('group_by')
                        if isinstance(group_by, (list, tuple)):
                            group_by = group_by[0] if group_by else None
                        predictor_metadata[model_name] = {
                            'timeseries': True,
                            'window': window,
                            'order_by_column': order_by,
                            'group_by_column': group_by
                        }
                    else:
                        predictor_metadata[model_name] = {
                            'timeseries': False
                        }
                    self.model_types.update(p.data.get('dtypes', {}))

    # FIXME https://github.com/mindsdb/mindsdb_sql/issues/53
    if is_crutch is True:
        sql = sql[:sql.lower().find(' limit 0')] + " where when_data = '{}' limit 0"
        mindsdb_sql_struct = parse_sql(sql, dialect='mindsdb')

    plan = plan_query(
        mindsdb_sql_struct,
        integrations=integrations_names,
        predictor_namespace=self.mindsdb_database_name,
        predictor_metadata=predictor_metadata,
        default_namespace=self.database
    )

    # steps_data[i] holds the output of plan step i.
    steps_data = []
    for step in plan.steps:
        data = []
        if isinstance(step, FetchDataframeStep):
            data = self._fetch_dataframe_step(step)
        elif isinstance(step, UnionStep):
            left_data = steps_data[step.left.step_num]
            right_data = steps_data[step.right.step_num]
            data = left_data + right_data
        elif isinstance(step, MapReduceStep):
            if step.reduce != 'union':
                raise Exception(
                    f'Unknown MapReduceStep type: {step.reduce}')

            # Flatten every cell of the referenced step into one list.
            step_data = steps_data[step.values.step_num]
            values = []
            for row in step_data:
                for row_data in row.values():
                    values.extend(row_data.values())

            data = []
            substep = step.step
            if isinstance(substep, FetchDataframeStep):
                query = substep.query
                markQueryVar(query.where)
                # Re-run the substep once per value and concatenate.
                for value in values:
                    replaceQueryVar(query.where, value)
                    data.extend(self._fetch_dataframe_step(substep))
            elif isinstance(substep, MultipleSteps):
                data = self._multiple_steps_reduce(substep, values)
            else:
                raise Exception(f'Unknown step type: {step.step}')
        elif isinstance(step, ApplyPredictorRowStep):
            predictor = '.'.join(step.predictor.parts)
            dn = self.datahub.get(self.mindsdb_database_name)
            where_data = step.row_dict

            # +++ external datasource
            if 'external_datasource' in where_data:
                external_datasource_sql = where_data['external_datasource']
                # A bare name is treated as a table to select from.
                if 'select ' not in external_datasource_sql.lower():
                    external_datasource_sql = f'select * from {external_datasource_sql}'
                temp_session = copy.copy(self.session)
                temp_session.database = 'datasource'
                query = SQLQuery(external_datasource_sql, session=temp_session)
                result = query.fetch(self.datahub, view='dict')
                if result['success'] is False:
                    raise Exception(
                        f"Something wrong with getting data from {where_data['external_datasource']}"
                    )
                # Every fetched row also carries the original conditions.
                for row in result['result']:
                    row.update(where_data)
                where_data = result['result']
            # ---

            data = dn.select(
                table=predictor,
                columns=None,
                where_data=where_data,
                integration_name=self.session.integration,
                integration_type=self.session.integration_type,
            )
            data = [{
                get_preditor_alias(step, self.database): x
            } for x in data]
        elif isinstance(step, ApplyPredictorStep):
            dn = self.datahub.get(self.mindsdb_database_name)
            predictor = '.'.join(step.predictor.parts)

            # Merge the per-table dicts of each row into one flat dict.
            where_data = []
            for row in steps_data[step.dataframe.step_num]:
                new_row = {}
                for table_name in row:
                    keys_intersection = set(new_row) & set(row[table_name])
                    if keys_intersection:
                        raise Exception(
                            f'The predictor got two identical keys from different datasources: {keys_intersection}'
                        )
                    new_row.update(row[table_name])
                where_data.append(new_row)

            is_timeseries = predictor_metadata[predictor]['timeseries']
            _mdb_make_predictions = None
            if is_timeseries:
                _mdb_make_predictions = 'LATEST' not in self.raw
                for row in where_data:
                    if '__mdb_make_predictions' not in row:
                        row['__mdb_make_predictions'] = _mdb_make_predictions

            # Dates are passed to the predictor as strings.
            for row in where_data:
                for key in row:
                    if isinstance(row[key], datetime.date):
                        row[key] = str(row[key])

            data = dn.select(
                table=predictor,
                columns=None,
                where_data=where_data,
                integration_name=self.session.integration,
                integration_type=self.session.integration_type,
                is_timeseries=_mdb_make_predictions
            )
            data = [{
                get_preditor_alias(step, self.database): x
            } for x in data]
        elif isinstance(step, JoinStep):
            left_data = steps_data[step.left.step_num]
            right_data = steps_data[step.right.step_num]

            # NOTE(review): these three exits return from _parse_query
            # itself, so the remaining steps are skipped and
            # self.fetched_data is not assigned here. This looks like it
            # was meant to set `data` instead - confirm before changing.
            if len(left_data) == 0 and len(right_data) == 0:
                return []
            if len(left_data) == 0:
                return right_data
            if len(right_data) == 0:
                return left_data

            left_keys = left_data[0].keys()
            right_keys = right_data[0].keys()

            if len(left_keys) != 1 or len(right_keys) != 1:
                raise Exception(
                    'At this moment supported only one JOIN supported')
            if step.query.condition is not None:
                raise Exception(
                    'At this moment supported only JOIN without conditions'
                )

            left_key = list(left_keys)[0]
            right_key = list(right_keys)[0]

            # Rows are paired on the '__mindsdb_row_id' value.
            data = []
            if step.query.join_type.upper() == 'LEFT JOIN':
                for left_row in left_data:
                    left_row_data = left_row[left_key]
                    for right_row in right_data:
                        right_row_data = right_row[right_key]
                        if left_row_data['__mindsdb_row_id'] == right_row_data['__mindsdb_row_id']:
                            data.append({
                                left_key: left_row_data,
                                right_key: right_row_data
                            })
                            break
                    else:
                        # No partner found: keep the left row alone.
                        data.append({
                            left_key: left_row_data,
                            right_key: {}
                        })
            elif step.query.join_type.upper() == 'JOIN':
                right_used_ids = []
                for left_row in left_data:
                    left_row_data = left_row[left_key]
                    for right_row in right_data:
                        right_row_data = right_row[right_key]
                        if left_row_data['__mindsdb_row_id'] == right_row_data['__mindsdb_row_id']:
                            data.append({
                                left_key: left_row_data,
                                right_key: right_row_data
                            })
                            right_used_ids.append(
                                right_row_data['__mindsdb_row_id'])
                            break
                # Right rows that were never paired are appended with an
                # empty left side.
                for right_row in right_data:
                    right_row_data = right_row[right_key]
                    if right_row_data['__mindsdb_row_id'] not in right_used_ids:
                        data.append({
                            left_key: {},
                            right_key: right_row_data
                        })
            else:
                raise Exception(
                    f'Unknown JOIN type: {step.query.join_type}')
        elif isinstance(step, FilterStep):
            raise Exception('FilterStep not implemented')
        elif isinstance(step, ProjectStep):
            step_data = steps_data[step.dataframe.step_num]
            # TODO if rowcount = 0 (indexing an empty result raises here)
            row = step_data[0]
            tables_columns = {}
            for table_name in row:
                tables_columns[table_name] = list(row[table_name].keys())

            columns_list = []
            for column_full_name in step.columns:
                if isinstance(column_full_name, Star) is False:
                    column_name_parts = column_full_name.parts
                    if column_full_name.alias is None:
                        column_alias = None
                    else:
                        column_alias = '.'.join(column_full_name.alias.parts)

                    if len(column_name_parts) > 2:
                        # Join the name parts, not the identifier object.
                        raise Exception(
                            f'Column name must contain no more than 2 parts. Got name: {".".join(column_name_parts)}'
                        )
                    elif len(column_name_parts) == 1:
                        column_name = column_name_parts[0]

                        appropriate_table = None
                        for table_name, table_columns in tables_columns.items():
                            if column_name in table_columns:
                                if appropriate_table is not None:
                                    # f-prefix so the column name is
                                    # actually interpolated.
                                    raise Exception(
                                        f'Fount multiple appropriate tables for column {column_name}'
                                    )
                                else:
                                    appropriate_table = table_name
                        if appropriate_table is None:
                            # Not a column of any table: treat it as a
                            # constant and write it into every row of
                            # every table under its name or alias.
                            column_name = column_name.strip("'")
                            name_or_alias = column_alias or column_name
                            column_alias = name_or_alias
                            for row in step_data:
                                for table in row:
                                    row[table][name_or_alias] = column_name
                            appropriate_table = list(step_data[0].keys())[0]
                            columns_list.append(
                                appropriate_table + (column_alias, column_alias))
                        else:
                            columns_list.append(
                                appropriate_table + (column_name, column_alias))
                    elif len(column_name_parts) == 2:
                        table_name_or_alias = column_name_parts[0]
                        column_name = column_name_parts[1]

                        appropriate_table = None
                        for table_name, table_columns in tables_columns.items():
                            # Third element of the table key is preferred,
                            # the second is the fallback.
                            checkig_table_name_or_alias = table_name[2] or table_name[1]
                            if table_name_or_alias == checkig_table_name_or_alias:
                                if column_name not in table_columns:
                                    raise Exception(
                                        f'Can not find column "{column_name}" in table "{table_name}"'
                                    )
                                appropriate_table = table_name
                                break
                        if appropriate_table is None:
                            raise Exception(
                                f'Can not find approproate table for column {column_name}'
                            )
                        columns_list.append(
                            appropriate_table + (column_name, column_alias))
                    else:
                        raise Exception('Undefined column name')
                else:
                    # '*': every column of every table, without alias.
                    for table_name, table_columns in tables_columns.items():
                        for column_name in table_columns:
                            columns_list.append(
                                table_name + (column_name, None))

            self.columns_list = columns_list
            data = step_data
        steps_data.append(data)

    if self.outer_query is not None:
        # +++
        # Flatten the last step into one dict per row keyed by the
        # column alias (or name when there is no alias).
        result = []
        for row in steps_data[-1]:
            data_row = {}
            for column_record in self.columns_list:
                table_name = column_record[:3]
                column_name = column_record[3]
                data_row[column_record[4] or column_record[3]] = row[table_name][column_name]
            result.append(data_row)
        # ---
        data = self._make_list_result_view(result)
        df = pd.DataFrame(data)
        result = dfsql.sql_query(
            self.outer_query,
            ds_kwargs={'case_sensitive': False},
            reduce_output=False,
            dataframe=df
        )

        try:
            self.columns_list = [('', '', '', x, x) for x in result.columns]
        except Exception:
            # Result without `.columns`: fall back to its `.name`.
            self.columns_list = [('', '', '', result.name, result.name)]

        # +++ make list result view
        new_result = []
        for row in result.to_dict(orient='records'):
            data_row = []
            for column_record in self.columns_list:
                column_name = column_record[4] or column_record[3]
                data_row.append(row.get(column_name))
            new_result.append(data_row)
        result = new_result
        # ---

        self.fetched_data = result
    else:
        self.fetched_data = steps_data[-1]

    # No step assigned columns_list: derive it from the fetched rows,
    # keeping first-seen order and skipping duplicates.
    if not hasattr(self, 'columns_list'):
        self.columns_list = []
        for row in self.fetched_data:
            for table_key in row:
                for column_name in row[table_key]:
                    column_record = table_key + (column_name, column_name)
                    if column_record not in self.columns_list:
                        self.columns_list.append(column_record)

    # The internal row-id column is never reported as a result column.
    self.columns_list = [
        x for x in self.columns_list if x[3] != '__mindsdb_row_id'
    ]