import time

# NOTE: the import paths below are assumptions based on the surrounding MindsDB
# codebase; `format` is moz_sql_parser's JSON-tree -> SQL renderer (it shadows
# the builtin `format`, as in the original source).
from moz_sql_parser import format
from mindsdb.api.mysql.mysql_proxy.datahub.datanodes.datanode import DataNode
from mindsdb.interfaces.datastore.datastore import DataStore


class IntegrationDataNode(DataNode):
    type = 'integration'

    def __init__(self, config, integration_name):
        self.config = config
        self.integration_name = integration_name
        self.default_store = DataStore()

    def getType(self):
        return self.type

    def getTables(self):
        return []

    def hasTable(self, tableName):
        return True

    def getTableColumns(self, tableName):
        return []

    def select(self, table=None, columns=None, where=None, where_data=None,
               order_by=None, group_by=None, came_from=None):
        if isinstance(where, dict):
            where = [where]
        if isinstance(where, list):
            # strip the 'table.' prefix from qualified column names in conditions
            for el in where:
                if isinstance(el, dict):
                    for key in el:
                        if isinstance(el[key], list) and len(el[key]) > 0 \
                                and isinstance(el[key][0], str) and '.' in el[key][0]:
                            el[key][0] = el[key][0][el[key][0].find('.') + 1:]
            where = {'and': where}

        query = format({'from': table, 'select': columns, 'where': where})

        # run the query through a throwaway datasource on the target integration
        ds, ds_name = self.default_store.save_datasource(
            f'temp_ds_{int(time.time() * 100)}',
            self.integration_name,
            {'query': query}
        )
        dso = self.default_store.get_datasource_obj(ds_name)
        data = dso.df.T.to_dict().values()  # rows as dicts
        self.default_store.delete_datasource(ds_name)

        return data
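
# --- Usage sketch (not from the original source) ---
# A hedged illustration of how IntegrationDataNode.select() composes a query.
# The integration name 'example_db' is an assumption, and the where-tree uses
# the moz_sql_parser-style operators ('eq', 'gt') the prefix-stripping loop
# above expects; running it for real requires a configured MindsDB instance.
def _integration_select_example():
    node = IntegrationDataNode(config={}, integration_name='example_db')
    # roughly: SELECT a, b FROM tbl WHERE a = 1 AND b > 0
    return node.select(
        table='tbl',
        columns=['a', 'b'],
        where=[{'eq': ['tbl.a', 1]}, {'gt': ['tbl.b', 0]}],
    )
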
import json

import pandas

# Config, NativeInterface, CustomModels, AITable_store, DataStore, the integration
# clients (Clickhouse, Mariadb, MySQL, PostgreSQL, MSSQL), cast_row_types and
# NumpyJSONEncoder are imported from elsewhere in the MindsDB codebase.


class MindsDBDataNode(DataNode):
    type = 'mindsdb'

    def __init__(self, config):
        # note: the passed-in config is superseded by the global Config()
        self.config = Config()
        self.mindsdb_native = NativeInterface()
        self.custom_models = CustomModels()
        self.ai_table = AITable_store()
        self.default_store = DataStore()

    def getTables(self):
        models = self.mindsdb_native.get_models()
        models = [x['name'] for x in models if x['status'] == 'complete']
        models += ['predictors', 'commands']
        models += [x['name'] for x in self.custom_models.get_models()]

        ai_tables = self.ai_table.get_ai_tables()
        models += [x['name'] for x in ai_tables]

        return models

    def hasTable(self, table):
        return table in self.getTables()

    def _get_ai_table_columns(self, table_name):
        aitable_record = self.ai_table.get_ai_table(table_name)
        columns = (
            [x['name'] for x in aitable_record.query_fields]
            + [x['name'] for x in aitable_record.predictor_columns]
        )
        return columns

    def _get_model_columns(self, table_name):
        model = self.mindsdb_native.get_model_data(name=table_name)
        columns = []
        columns += model['columns']
        columns += [f'{x}_original' for x in model['predict']]
        for col in model['predict']:
            if model['data_analysis_v2'][col]['typing']['data_type'] == 'Numeric':
                columns += [f'{col}_min', f'{col}_max']
            columns += [f'{col}_confidence']
            columns += [f'{col}_explain']
        return columns

    def getTableColumns(self, table):
        try:
            columns = self.custom_models.get_model_data(table)['columns']
            columns += ['external_datasource', 'select_data_query', 'when_data']
            return columns
        except Exception:
            pass

        if table == 'predictors':
            return ['name', 'status', 'accuracy', 'predict', 'select_data_query',
                    'external_datasource', 'training_options']
        if table == 'commands':
            return ['command']

        columns = []
        ai_table_record = self.ai_table.get_ai_table(table)
        if ai_table_record is not None:
            columns = self._get_ai_table_columns(table)
        elif table in [x['name'] for x in self.mindsdb_native.get_models()]:
            columns = self._get_model_columns(table)
            columns += ['when_data', 'select_data_query', 'external_datasource']

        return columns

    def _select_predictors(self):
        models = self.mindsdb_native.get_models()
        # TODO add custom models
        return [{
            'name': x['name'],
            'status': x['status'],
            'accuracy': str(x['accuracy']) if x['accuracy'] is not None else None,
            'predict': ', '.join(x['predict']),
            'select_data_query': '',
            'external_datasource': '',  # TODO
            'training_options': ''  # TODO ?
        } for x in models]

    def delete_predictor(self, name):
        self.mindsdb_native.delete_model(name)

    def _select_from_ai_table(self, table, columns, where):
        aitable_record = self.ai_table.get_ai_table(table)
        integration = aitable_record.integration_name
        query = aitable_record.integration_query
        predictor_name = aitable_record.predictor_name

        # pull the source rows through a throwaway datasource, then predict
        ds, ds_name = self.default_store.save_datasource('temp_ds', integration, {'query': query})
        dso = self.default_store.get_datasource_obj(ds_name)
        res = self.mindsdb_native.predict(name=predictor_name, when_data=dso)
        self.default_store.delete_datasource(ds_name)

        keys_map = {}
        for f in aitable_record.predictor_columns:
            keys_map[f['value']] = f['name']
        for f in aitable_record.query_fields:
            keys_map[f['name']] = f['name']
        keys = list(keys_map.keys())

        data = []
        for i, el in enumerate(res):
            data.append({keys_map[key]: el[key] for key in keys})

        return data

    def select(self, table, columns=None, where=None, where_data=None,
               order_by=None, group_by=None, came_from=None):
        '''NOTE: WHERE statements can only be $eq conditions joined with 'and'.'''
        if table == 'predictors':
            return self._select_predictors()
        if table == 'commands':
            return []
        if self.ai_table.get_ai_table(table):
            return self._select_from_ai_table(table, columns, where)

        if where is None:
            where = {}

        original_when_data = None
        if 'when_data' in where:
            if len(where) > 1:
                raise ValueError("No other keys may be used in 'where' when 'when_data' is used")
            try:
                original_when_data = where['when_data']['$eq']
                where_data = json.loads(where['when_data']['$eq'])
                if not isinstance(where_data, list):
                    where_data = [where_data]
            except Exception:
                raise ValueError(f'''Error while parsing 'when_data'="{where_data}"''')

        external_datasource = None
        if 'external_datasource' in where:
            external_datasource = where['external_datasource']['$eq']
            del where['external_datasource']

        select_data_query = None
        if came_from is not None and 'select_data_query' in where:
            select_data_query = where['select_data_query']['$eq']
            del where['select_data_query']

            # fetch the rows to predict on from the integration the query came from
            dbtype = self.config['integrations'][came_from]['type']
            if dbtype == 'clickhouse':
                ch = Clickhouse(self.config, came_from)
                res = ch._query(select_data_query.strip(' ;\n') + ' FORMAT JSON')
                data = res.json()['data']
            elif dbtype == 'mariadb':
                maria = Mariadb(self.config, came_from)
                data = maria._query(select_data_query)
            elif dbtype == 'mysql':
                mysql = MySQL(self.config, came_from)
                data = mysql._query(select_data_query)
            elif dbtype == 'postgres':
                postgres = PostgreSQL(self.config, came_from)
                data = postgres._query(select_data_query)
            elif dbtype == 'mssql':
                mssql = MSSQL(self.config, came_from)
                data = mssql._query(select_data_query, fetch=True)
            else:
                raise Exception(f'Unknown database type: {dbtype}')

            if where_data is None:
                where_data = data
            else:
                where_data += data

        new_where = {}
        if where_data is not None:
            where_data = pandas.DataFrame(where_data)
        else:
            for key, value in where.items():
                if not isinstance(value, dict) or len(value.keys()) != 1 or list(value.keys())[0] != '$eq':
                    # TODO value should be just string or number
                    raise Exception("Only {'$eq': value} conditions are supported in 'where'")
                new_where[key] = value['$eq']

            if len(new_where) == 0:
                return []

            where_data = [new_where]

        try:
            model = self.custom_models.get_model_data(name=table)
        except Exception:
            model = self.mindsdb_native.get_model_data(name=table)

        predicted_columns = model['predict']

        # remember the original target values so they can be returned as '<col>_original'
        original_target_values = {}
        for col in predicted_columns:
            if where_data is not None:
                if col in where_data:
                    original_target_values[col + '_original'] = list(where_data[col])
                else:
                    original_target_values[col + '_original'] = [None] * len(where_data)
            else:
                original_target_values[col + '_original'] = [None]

        if table in [x['name'] for x in self.custom_models.get_models()]:
            res = self.custom_models.predict(name=table, when_data=where_data)

            data = []
            fields = model['columns']
            for i, ele in enumerate(res):
                row = {
                    'select_data_query': select_data_query,
                    'external_datasource': external_datasource,
                    'when_data': original_when_data
                }

                for key in ele:
                    row[key] = ele[key]['predicted_value']
                    # FIXME prefer get int from mindsdb_native in this case
                    if model['data_analysis_v2'][key]['typing']['data_subtype'] == 'Int':
                        row[key] = int(row[key])

                # fill non-predicted fields from the input rows
                for k in fields:
                    if k not in ele:
                        if isinstance(where_data, list):
                            if k in where_data[i]:
                                row[k] = where_data[i][k]
                            else:
                                row[k] = None
                        elif k in where_data.columns:
                            row[k] = where_data[k].iloc[i]
                        else:
                            row[k] = None

                for k in original_target_values:
                    row[k] = original_target_values[k][i]

                data.append(row)

            field_types = {
                f: model['data_analysis_v2'][f]['typing']['data_subtype']
                for f in fields if 'typing' in model['data_analysis_v2'][f]
            }
            for row in data:
                cast_row_types(row, field_types)

            return data
        else:
            res = self.mindsdb_native.predict(name=table, when_data=where_data)

            keys = [x for x in list(res._data.keys()) if x in columns]
            min_max_keys = []
            for col in predicted_columns:
                if model['data_analysis_v2'][col]['typing']['data_type'] == 'Numeric':
                    min_max_keys.append(col)

            data = []
            explains = []
            for i, el in enumerate(res):
                data.append({key: el[key] for key in keys})
                explains.append(el.explain())

            field_types = {
                f: model['data_analysis_v2'][f]['typing']['data_subtype']
                for f in model['columns'] if 'typing' in model['data_analysis_v2'][f]
            }

            for i, row in enumerate(data):
                cast_row_types(row, field_types)

                row['select_data_query'] = select_data_query
                row['external_datasource'] = external_datasource
                row['when_data'] = original_when_data

                for k in original_target_values:
                    row[k] = original_target_values[k][i]

                # attach confidence, explanation and (for numeric targets) bounds
                explanation = explains[i]
                for key in predicted_columns:
                    row[key + '_confidence'] = explanation[key]['confidence']
                    row[key + '_explain'] = json.dumps(explanation[key], cls=NumpyJSONEncoder, ensure_ascii=False)
                for key in min_max_keys:
                    row[key + '_min'] = min(explanation[key]['confidence_interval'])
                    row[key + '_max'] = max(explanation[key]['confidence_interval'])

            return data
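
# --- Usage sketch (not from the original source) ---
# A hedged illustration of the '$eq'-only WHERE contract documented in select():
# a query such as  SELECT * FROM mindsdb.home_prices WHERE sqft = 1200  arrives
# as the nested dict below. 'home_prices' and 'sqft' are made-up names; a real
# call needs a trained predictor and a running MindsDB instance.
def _mindsdb_select_example(node):
    return node.select(
        table='home_prices',
        columns=node.getTableColumns('home_prices'),
        where={'sqft': {'$eq': 1200}},
    )
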
# Revised IntegrationDataNode: builds the WHERE clause only when conditions are
# present, returns rows via to_dict(orient='records'), and converts datetime
# columns to POSIX timestamps.
import time

import pandas as pd


class IntegrationDataNode(DataNode):
    type = 'integration'

    def __init__(self, config, integration_name):
        self.config = config
        self.integration_name = integration_name
        self.default_store = DataStore()

    def getType(self):
        return self.type

    def getTables(self):
        return []

    def hasTable(self, tableName):
        return True

    def getTableColumns(self, tableName):
        return []

    def select(self, table=None, columns=None, where=None, where_data=None,
               order_by=None, group_by=None, came_from=None):
        has_where = isinstance(where, (dict, list)) and len(where) > 0
        if isinstance(where, dict):
            where = [where]
        if isinstance(where, list):
            # strip the 'table.' prefix from qualified column names in conditions
            for el in where:
                if isinstance(el, dict):
                    for key in el:
                        if isinstance(el[key], list) and len(el[key]) > 0 \
                                and isinstance(el[key][0], str) and '.' in el[key][0]:
                            el[key][0] = el[key][0][el[key][0].find('.') + 1:]
            where = {'and': where}

        format_data = {'from': table, 'select': columns}
        if has_where:
            format_data['where'] = where
        query = format(format_data)

        ds, ds_name = self.default_store.save_datasource(
            f'temp_ds_{int(time.time() * 100)}',
            self.integration_name,
            {'query': query}
        )
        dso = self.default_store.get_datasource_obj(ds_name)

        data = dso.df.to_dict(orient='records')

        # datetime columns can't be passed through as-is; convert to POSIX timestamps
        for column_name in dso.df.columns:
            if pd.core.dtypes.common.is_datetime_or_timedelta_dtype(dso.df[column_name]):
                pass_data = dso.df[column_name].dt.to_pydatetime()
                for i, rec in enumerate(data):
                    rec[column_name] = pass_data[i].timestamp()

        self.default_store.delete_datasource(ds_name)

        return data
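
# --- Behavior sketch (not from the original source) ---
# A minimal standalone demonstration of the datetime handling above, using only
# pandas: every datetime column becomes a POSIX timestamp in the record dicts.
def _datetime_to_timestamp_example():
    df = pd.DataFrame({'ts': pd.to_datetime(['2021-01-01', '2021-01-02'])})
    data = df.to_dict(orient='records')
    for column_name in df.columns:
        if pd.core.dtypes.common.is_datetime_or_timedelta_dtype(df[column_name]):
            pass_data = df[column_name].dt.to_pydatetime()
            for i, rec in enumerate(data):
                rec[column_name] = pass_data[i].timestamp()
    # e.g. [{'ts': 1609459200.0}, {'ts': 1609545600.0}] when local time is UTC
    return data
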
import os
from threading import Thread, Event
from time import time

import pandas as pd

# db, Config, DataStore, Cache, ModelInterface and ModelInterfaceWrapper are
# imported from elsewhere in the MindsDB codebase.


class StreamController:
    def __init__(self, name, predictor, stream_in, stream_out,
                 anomaly_stream=None, learning_stream=None, learning_threshold=100):
        self.name = name
        self.predictor = predictor
        self.stream_in = stream_in
        self.stream_out = stream_out
        self.anomaly_stream = anomaly_stream

        self.learning_stream = learning_stream
        self.learning_threshold = learning_threshold
        self.learning_data = []

        self.company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        self.stop_event = Event()
        self.model_interface = ModelInterfaceWrapper(ModelInterface())
        self.data_store = DataStore()
        self.config = Config()

        p = db.session.query(db.Predictor).filter_by(
            company_id=self.company_id, name=self.predictor).first()
        if p is None:
            raise Exception(f"Predictor {predictor} doesn't exist")

        self.target = p.to_predict[0]

        ts_settings = p.learn_args.get('timeseries_settings', None)
        # guard against a missing 'timeseries_settings' record before reading it
        if ts_settings is not None and not ts_settings.get('is_timeseries'):
            ts_settings = None

        if ts_settings is None:
            self.thread = Thread(target=StreamController._make_predictions, args=(self,))
        else:
            self.ts_settings = ts_settings
            self.thread = Thread(target=StreamController._make_ts_predictions, args=(self,))

        self.thread.start()

    def _is_anomaly(self, res):
        for k in res:
            if k.endswith('_anomaly') and res[k] is not None:
                return True
        return False

    def _consider_learning(self):
        if self.learning_stream is not None:
            self.learning_data.extend(self.learning_stream.read())
            if len(self.learning_data) >= self.learning_threshold:
                p = db.session.query(db.Predictor).filter_by(
                    company_id=self.company_id, name=self.predictor).first()
                ds_record = db.session.query(db.Datasource).filter_by(id=p.datasource_id).first()

                # dump the accumulated records to a CSV-backed datasource
                df = pd.DataFrame.from_records(self.learning_data)
                name = 'name_' + str(time()).replace('.', '_')
                path = os.path.join(self.config['paths']['datasources'], name)
                df.to_csv(path)

                from_data = {
                    'class': 'FileDS',
                    'args': [path],
                    'kwargs': {},
                }

                self.data_store.save_datasource(name=name, source_type='file',
                                                source=path, file_path=path,
                                                company_id=self.company_id)
                ds = self.data_store.get_datasource(name, self.company_id)

                self.model_interface.adjust(p.name, from_data, ds['id'], self.company_id)
                self.learning_data.clear()

    def _make_predictions(self):
        while not self.stop_event.wait(0.5):
            self._consider_learning()
            for when_data in self.stream_in.read():
                preds = self.model_interface.predict(self.predictor, when_data, 'dict')
                for res in preds:
                    if self.anomaly_stream is not None and self._is_anomaly(res):
                        self.anomaly_stream.write(res)
                    else:
                        self.stream_out.write(res)

    def _make_ts_predictions(self):
        window = self.ts_settings['window']

        order_by = self.ts_settings['order_by']
        order_by = [order_by] if isinstance(order_by, str) else order_by

        group_by = self.ts_settings.get('group_by', None)
        group_by = [group_by] if isinstance(group_by, str) else group_by

        cache = Cache(self.name)

        while not self.stop_event.wait(0.5):
            self._consider_learning()
            for when_data in self.stream_in.read():
                for ob in order_by:
                    if ob not in when_data:
                        raise Exception(f"when_data doesn't contain order_by[{ob}]")

                # group_by is optional, so only validate it when present
                if group_by is not None:
                    for gb in group_by:
                        if gb not in when_data:
                            raise Exception(f"when_data doesn't contain group_by[{gb}]")

                gb_value = tuple(when_data[gb] for gb in group_by) if group_by is not None else ''

                # because cache doesn't work for tuples
                # (raises Exception: tuple doesn't have "encode" attribute)
                gb_value = str(gb_value)

                with cache:
                    if gb_value not in cache:
                        cache[gb_value] = []

                    # do this because shelve-cache doesn't support in-place changing
                    records = cache[gb_value]
                    records.append(when_data)
                    cache[gb_value] = records

            with cache:
                for gb_value in cache.keys():
                    if len(cache[gb_value]) >= window:
                        cache[gb_value] = [*sorted(
                            cache[gb_value],
                            # WARNING: assuming wd[ob] is numeric
                            key=lambda wd: tuple(wd[ob] for ob in order_by)
                        )]
                        # slide a window over the sorted records, emitting one
                        # prediction per step and dropping the oldest record
                        while len(cache[gb_value]) >= window:
                            res_list = self.model_interface.predict(
                                self.predictor, cache[gb_value][:window], 'dict')
                            if self.anomaly_stream is not None and self._is_anomaly(res_list[-1]):
                                self.anomaly_stream.write(res_list[-1])
                            else:
                                self.stream_out.write(res_list[-1])
                            cache[gb_value] = cache[gb_value][1:]
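
# --- Usage sketch (not from the original source) ---
# A hedged illustration of the stream contract StreamController relies on:
# anything with read()/write() works as a stream. The predictor name
# 'my_predictor' is an assumption, and a real run also needs the MindsDB
# db/session machinery initialised before the controller is constructed.
class ListStream:
    def __init__(self):
        self._buffer = []

    def read(self):
        # drain and return all buffered records
        out, self._buffer = self._buffer, []
        return out

    def write(self, record):
        self._buffer.append(record)


def _stream_controller_example():
    stream_in, stream_out = ListStream(), ListStream()
    stream_in.write({'sqft': 1200})
    # starts a background thread that reads stream_in and writes predictions
    # to stream_out every 0.5 seconds
    controller = StreamController('demo', 'my_predictor', stream_in, stream_out)
    return controller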