# NOTE: assumes the usual module-level imports, e.g.
#   import os, json, walrus
#   from threading import Thread, Event
# plus the MindsDB internals used below (log, session, DBPredictor, NativeInterface).
class RedisStream(Thread):
    """Background worker that reads records from a Redis stream and writes predictions back."""

    def __init__(self, host, port, database, stream_in, stream_out, predictor, _type):
        self.host = host
        self.port = port
        self.db = database
        self.predictor = predictor
        self.client = self._get_client()
        self.stream_in_name = stream_in
        self.stream_out_name = stream_out
        self.stream_in = self.client.Stream(stream_in)
        self.stream_out = self.client.Stream(stream_out)
        self._type = _type
        self.native_interface = NativeInterface()
        self.format_flag = 'explain'
        self.stop_event = Event()
        self.company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        super().__init__(target=RedisStream.make_prediction, args=(self,))

    def _get_client(self):
        return walrus.Database(host=self.host, port=self.port, db=self.db)

    def make_prediction(self):
        predict_record = session.query(DBPredictor).filter_by(company_id=self.company_id, name=self.predictor).first()
        if predict_record is None:
            log.error(f"Error creating stream: requested predictor {self.predictor} does not exist")
            return
        while not self.stop_event.wait(0.5):
            # block=0 puts XREAD into blocking mode
            predict_info = self.stream_in.read(block=0)
            for record in predict_info:
                record_id = record[0]
                raw_when_data = record[1]
                when_data = self.decode(raw_when_data)
                result = self.native_interface.predict(self.predictor, self.format_flag, when_data=when_data)
                log.error(f"STREAM: got {result}")
                for res in result:
                    in_json = json.dumps(res)
                    self.stream_out.add({"prediction": in_json})
                self.stream_in.delete(record_id)
        session.close()

    def decode(self, redis_data):
        # Redis returns field/value pairs as bytes; convert both sides to str
        decoded = {}
        for k in redis_data:
            decoded[k.decode('utf8')] = redis_data[k].decode('utf8')
        return decoded
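# Usage sketch for the class above (illustrative only: host/port, stream names,
# and the predictor name are assumptions; a running Redis server and a trained
# predictor are required):
stream = RedisStream(
    host='127.0.0.1', port=6379, database=0,
    stream_in='predict_in', stream_out='predict_out',
    predictor='my_predictor', _type='simple',
)
stream.start()            # runs make_prediction on a background thread
# ... add records to 'predict_in', read results from 'predict_out' ...
stream.stop_event.set()   # ask the worker loop to exit
stream.join()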
class KafkaStream(Thread):
    """Kafka counterpart of RedisStream; also supports time-series predictors via per-group caches."""

    def __init__(self, connection_info, advanced_info, topic_in, topic_out, predictor, _type, **ts_params):
        self.connection_info = connection_info
        self.advanced_info = advanced_info
        self.predictor = predictor
        self.stream_in_name = topic_in
        self.stream_out_name = topic_out
        self.consumer = kafka.KafkaConsumer(**self.connection_info, **self.advanced_info.get('consumer', {}))
        self.consumer.subscribe(topics=[self.stream_in_name])
        self.producer = kafka.KafkaProducer(**self.connection_info, **self.advanced_info.get('producer', {}))
        self.admin = kafka.KafkaAdminClient(**self.connection_info)
        try:
            self.topic = NewTopic(self.stream_out_name, num_partitions=1, replication_factor=1)
            self.admin.create_topics([self.topic])
        except kafka.errors.TopicAlreadyExistsError:
            pass
        self._type = _type
        self.native_interface = NativeInterface()
        self.format_flag = 'explain'
        self.stop_event = Event()
        self.company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        self.caches = {}
        self.ts_params = ts_params
        if self._type.lower() == StreamTypes.timeseries:
            self.target = self.ts_params.get('target')
            self.window = self.ts_params.get('window_size')
            self.gb = self.ts_params.get('group_by')
            self.dt = self.ts_params.get('order_by')
            super().__init__(target=KafkaStream.make_timeseries_predictions, args=(self,))
        else:
            super().__init__(target=KafkaStream.make_prediction, args=(self,))

    def predict_ts(self, cache_name):
        when_list = list(self.caches[cache_name])
        for x in when_list:
            # flag whether the row already contains the target column
            x['make_predictions'] = self.target in x
        result = self.native_interface.predict(self.predictor, self.format_flag, when_data=when_list)
        log.error(f"TIMESERIES STREAM: got {result}")
        for res in result:
            in_json = json.dumps(res)
            to_send = in_json.encode('utf-8')
            log.error(f"sending {to_send}")
            self.producer.send(self.stream_out_name, to_send)
        # drop the oldest record so the cache behaves as a sliding window
        self.caches[cache_name] = self.caches[cache_name][1:]

    def make_prediction_from_cache(self, cache_name):
        cache = self.caches[cache_name]
        log.error("STREAM: in make_prediction_from_cache")
        if len(cache) >= self.window:
            log.error(f"STREAM: make_prediction_from_cache - len(cache) = {len(cache)}")
            self.predict_ts(cache_name)

    def to_cache(self, record):
        gb_val = record[self.gb]
        cache_name = f"cache.{gb_val}"
        if cache_name not in self.caches:
            self.caches[cache_name] = []
            log.error(f"STREAM: cache {cache_name} has been created")
        self.make_prediction_from_cache(cache_name)
        self.handle_record(cache_name, record)
        self.make_prediction_from_cache(cache_name)
        log.error("STREAM in cache: current iteration is done.")

    def handle_record(self, cache_name, record):
        log.error(f"STREAM: handling cache {cache_name} and record {record}.")
        cache = self.caches[cache_name]
        cache.append(record)
        self.caches[cache_name] = self.sort_cache(cache)

    def sort_cache(self, cache):
        return sorted(cache, key=lambda x: x[self.dt])

    def make_timeseries_predictions(self):
        log.error("STREAM: running 'make_timeseries_predictions'")
        predict_record = session.query(DBPredictor).filter_by(company_id=self.company_id, name=self.predictor).first()
        if predict_record is None:
            log.error(f"Error creating stream: requested predictor {self.predictor} does not exist")
            return
        while not self.stop_event.wait(0.5):
            try:
                msg_str = next(self.consumer)
                when_data = json.loads(msg_str.value)
                self.to_cache(when_data)
            except StopIteration:
                pass
        log.error("Stopping stream...")
        self.producer.close()
        self.consumer.close()
        session.close()

    def make_prediction(self):
        predict_record = session.query(DBPredictor).filter_by(company_id=self.company_id, name=self.predictor).first()
        if predict_record is None:
            log.error(f"Error creating stream: requested predictor {self.predictor} does not exist")
            return
        while not self.stop_event.wait(0.5):
            try:
                msg_str = next(self.consumer)
                when_data = json.loads(msg_str.value)
                result = self.native_interface.predict(self.predictor, self.format_flag, when_data=when_data)
                log.error(f"STREAM: got {result}")
                for res in result:
                    in_json = json.dumps({"prediction": res})
                    to_send = in_json.encode('utf-8')
                    log.error(f"sending {to_send}")
                    self.producer.send(self.stream_out_name, to_send)
            except StopIteration:
                pass
        log.error("Stopping stream...")
        self.producer.close()
        self.consumer.close()
        session.close()
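# Usage sketch for the Kafka stream (illustrative only: the broker address,
# topic names, predictor name, and time-series parameters are all assumptions):
stream = KafkaStream(
    connection_info={'bootstrap_servers': 'localhost:9092'},
    advanced_info={},
    topic_in='predict_in', topic_out='predict_out',
    predictor='my_ts_predictor', _type='timeseries',
    target='sales', window_size=10, group_by='store', order_by='date',
)
stream.start()            # caches records per group_by value and predicts once a window fills
# ... produce JSON records to 'predict_in', consume predictions from 'predict_out' ...
stream.stop_event.set()   # the loop exits and closes producer/consumer
stream.join()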
class RedisStream(Thread):
    """Redis stream worker with time-series support; per-group caches are kept as Redis streams."""

    def __init__(self, name, connection_info, advanced_info, stream_in, stream_out, predictor, _type):
        self.stream_name = name
        self.connection_info = connection_info
        self.connection_info.update(advanced_info)
        self.predictor = predictor
        self.client = self._get_client()
        self.stream_in_name = stream_in
        self.stream_out_name = stream_out
        self.stream_in = self.client.Stream(stream_in)
        self.stream_out = self.client.Stream(stream_out)
        self._type = _type
        self.native_interface = NativeInterface()
        self.format_flag = 'explain'
        self.stop_event = Event()
        self.company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        if self._type == 'timeseries':
            super().__init__(target=RedisStream.make_timeseries_predictions, args=(self,))
        else:
            super().__init__(target=RedisStream.make_predictions, args=(self,))

    def _get_client(self):
        return walrus.Database(**self.connection_info)

    # FIXME: the four getters below are hard-coded stubs and should be replaced
    # with real lookups of the predictor's time-series settings.
    def _get_target(self):
        return "pnew_case"

    def _get_window_size(self):
        return 10

    def _get_gb(self):
        return "state"

    def _get_dt(self):
        return "time"

    def predict(self, stream_in, stream_out, timeseries_mode=False):
        predict_info = stream_in.read(block=0)
        when_list = []
        for record in predict_info:
            record_id = record[0]
            raw_when_data = record[1]
            when_data = self.decode(raw_when_data)
            if timeseries_mode:
                # if self.target not in when_data:
                #     when_data['make_predictions'] = False
                # else:
                #     when_data['make_predictions'] = True
                when_list.append(when_data)
            else:
                result = self.native_interface.predict(self.predictor, self.format_flag, when_data=when_data)
                log.error(f"STREAM: got {result}")
                for res in result:
                    in_json = json.dumps(res)
                    stream_out.add({"prediction": in_json})
                stream_in.delete(record_id)
        if timeseries_mode:
            result = self.native_interface.predict(self.predictor, self.format_flag, when_data=when_list)
            log.error(f"TIMESERIES STREAM: got {result}")
            for res in result:
                in_json = json.dumps(res)
                stream_out.add({"prediction": in_json})
            # XTRIM to len-1: drop the oldest record so the cache acts as a sliding window
            stream_in.trim(len(stream_in) - 1, approximate=False)

    def make_prediction_from_cache(self, cache):
        log.error("STREAM: in make_prediction_from_cache")
        if len(cache) >= self.window:
            log.error(f"STREAM: make_prediction_from_cache - len(cache) = {len(cache)}")
            self.predict(cache, self.stream_out, timeseries_mode=True)

    def make_timeseries_predictions(self):
        log.error("STREAM: running 'make_timeseries_predictions'")
        predict_record = session.query(DBPredictor).filter_by(company_id=self.company_id, name=self.predictor).first()
        if predict_record is None:
            log.error(f"Error creating stream: requested predictor {self.predictor} does not exist")
            return
        self.target = self._get_target()
        self.window = self._get_window_size()
        self.gb = self._get_gb()
        self.dt = self._get_dt()
        while not self.stop_event.wait(0.5):
            # block=0 puts XREAD into blocking mode
            predict_info = self.stream_in.read(block=0)
            for record in predict_info:
                record_id = record[0]
                raw_when_data = record[1]
                when_data = self.decode(raw_when_data)
                log.error(f"STREAM: next record has been read from {self.stream_in.key}: {when_data}")
                self.to_cache(when_data)
                self.stream_in.delete(record_id)
        session.close()

    def to_cache(self, record):
        gb_val = record[self.gb]
        cache = self.client.Stream(f"{self.stream_name}.cache.{gb_val}")
        log.error(f"STREAM: cache {cache.key} has been created")
        self.make_prediction_from_cache(cache)
        self.handle_record(cache, record)
        self.make_prediction_from_cache(cache)
        log.error("STREAM in cache: current iteration is done.")

    def handle_record(self, cache, record):
        log.error(f"STREAM: handling cache {cache.key} and record {record}.")
        records = cache.read()
        # log.error(f"STREAM: current {cache.key} state: {records}")
        records = [self.decode(x[1]) for x in records]
        log.error(f"STREAM: read {records} from cache.")
        records.append(record)
        records = self.sort_cache(records)
        log.error(f"STREAM: after updating and sorting - {records}.")
        # rewrite the cache stream with the freshly sorted records
        cache.trim(0, approximate=False)
        for rec in records:
            cache.add(rec)
        log.error(f"STREAM: finished updating {cache.key}")

    def sort_cache(self, cache):
        return sorted(cache, key=lambda x: x[self.dt])

    def make_predictions(self):
        predict_record = session.query(DBPredictor).filter_by(company_id=self.company_id, name=self.predictor).first()
        if predict_record is None:
            log.error(f"Error creating stream: requested predictor {self.predictor} does not exist")
            return
        while not self.stop_event.wait(0.5):
            predict_info = self.stream_in.read()
            for record in predict_info:
                record_id = record[0]
                raw_when_data = record[1]
                when_data = self.decode(raw_when_data)
                result = self.native_interface.predict(self.predictor, self.format_flag, when_data=when_data)
                log.error(f"STREAM: got {result}")
                for res in result:
                    in_json = json.dumps(res)
                    self.stream_out.add({"prediction": in_json})
                self.stream_in.delete(record_id)
        session.close()
        log.error("STREAM: stopping...")

    def decode(self, redis_data):
        # Redis returns field/value pairs as bytes; convert both sides to str
        decoded = {}
        for k in redis_data:
            decoded[k.decode('utf8')] = redis_data[k].decode('utf8')
        return decoded
class MindsDBDataNode(DataNode):
    type = 'mindsdb'

    def __init__(self, config):
        # NOTE: the passed-in config is ignored here; a fresh Config() is used instead
        self.config = Config()
        self.mindsdb_native = NativeInterface()
        self.custom_models = CustomModels()
        self.ai_table = AITable_store()
        self.default_store = DataStore()

    def getTables(self):
        models = self.mindsdb_native.get_models()
        models = [x['name'] for x in models if x['status'] == 'complete']
        models += ['predictors', 'commands']
        models += [x['name'] for x in self.custom_models.get_models()]
        ai_tables = self.ai_table.get_ai_tables()
        models += [x['name'] for x in ai_tables]
        return models

    def hasTable(self, table):
        return table in self.getTables()

    def _get_ai_table_columns(self, table_name):
        aitable_record = self.ai_table.get_ai_table(table_name)
        columns = (
            [x['name'] for x in aitable_record.query_fields]
            + [x['name'] for x in aitable_record.predictor_columns]
        )
        return columns

    def _get_model_columns(self, table_name):
        model = self.mindsdb_native.get_model_data(name=table_name)
        columns = []
        columns += model['columns']
        columns += [f'{x}_original' for x in model['predict']]
        for col in model['predict']:
            if model['data_analysis_v2'][col]['typing']['data_type'] == 'Numeric':
                columns += [f"{col}_min", f"{col}_max"]
            columns += [f"{col}_confidence"]
            columns += [f"{col}_explain"]
        return columns

    def getTableColumns(self, table):
        try:
            columns = self.custom_models.get_model_data(table)['columns']
            columns += ['external_datasource', 'select_data_query', 'when_data']
            return columns
        except Exception:
            pass

        if table == 'predictors':
            return ['name', 'status', 'accuracy', 'predict', 'select_data_query', 'external_datasource', 'training_options']
        if table == 'commands':
            return ['command']

        columns = []
        ai_table = self.ai_table.get_ai_table(table)
        if ai_table is not None:
            columns = self._get_ai_table_columns(table)
        elif table in [x['name'] for x in self.mindsdb_native.get_models()]:
            columns = self._get_model_columns(table)
            columns += ['when_data', 'select_data_query', 'external_datasource']
        return columns

    def _select_predictors(self):
        models = self.mindsdb_native.get_models()
        # TODO add custom models
        return [{
            'name': x['name'],
            'status': x['status'],
            'accuracy': str(x['accuracy']) if x['accuracy'] is not None else None,
            'predict': ', '.join(x['predict']),
            'select_data_query': '',
            'external_datasource': '',  # TODO
            'training_options': ''      # TODO ?
        } for x in models]

    def delete_predictor(self, name):
        self.mindsdb_native.delete_model(name)

    def _select_from_ai_table(self, table, columns, where):
        aitable_record = self.ai_table.get_ai_table(table)
        integration = aitable_record.integration_name
        query = aitable_record.integration_query
        predictor_name = aitable_record.predictor_name

        # materialize the integration query as a temporary datasource, predict, then clean up
        ds, ds_name = self.default_store.save_datasource('temp_ds', integration, {'query': query})
        dso = self.default_store.get_datasource_obj(ds_name, raw=True)
        res = self.mindsdb_native.predict(predictor_name, 'dict', when_data=dso)
        self.default_store.delete_datasource(ds_name)

        keys_map = {}
        for f in aitable_record.predictor_columns:
            keys_map[f['value']] = f['name']
        for f in aitable_record.query_fields:
            keys_map[f['name']] = f['name']
        keys = list(keys_map.keys())

        data = []
        for el in res:
            data.append({keys_map[key]: el[key] for key in keys})
        return data

    def select(self, table, columns=None, where=None, where_data=None, order_by=None, group_by=None, came_from=None):
        ''' NOTE WHERE statements can be just $eq joined with 'and' '''
        if table == 'predictors':
            return self._select_predictors()
        if table == 'commands':
            return []
        if self.ai_table.get_ai_table(table):
            return self._select_from_ai_table(table, columns, where)

        original_when_data = None
        if 'when_data' in where:
            if len(where) > 1:
                raise ValueError("No other keys may be used in 'where' when 'when_data' is used")
            try:
                original_when_data = where['when_data']['$eq']
                where_data = json.loads(where['when_data']['$eq'])
                if not isinstance(where_data, list):
                    where_data = [where_data]
            except Exception:
                raise ValueError(f'''Error while parsing 'when_data'="{where_data}"''')

        external_datasource = None
        if 'external_datasource' in where:
            external_datasource = where['external_datasource']['$eq']
            del where['external_datasource']

        select_data_query = None
        if came_from is not None and 'select_data_query' in where:
            select_data_query = where['select_data_query']['$eq']
            del where['select_data_query']

            dbtype = self.config['integrations'][came_from]['type']
            if dbtype == 'clickhouse':
                ch = Clickhouse(self.config, came_from)
                res = ch._query(select_data_query.strip(' ;\n') + ' FORMAT JSON')
                data = res.json()['data']
            elif dbtype == 'mariadb':
                maria = Mariadb(self.config, came_from)
                data = maria._query(select_data_query)
            elif dbtype == 'mysql':
                mysql = MySQL(self.config, came_from)
                data = mysql._query(select_data_query)
            elif dbtype == 'postgres':
                postgres = PostgreSQL(self.config, came_from)
                data = postgres._query(select_data_query)
            elif dbtype == 'mssql':
                mssql = MSSQL(self.config, came_from)
                data = mssql._query(select_data_query, fetch=True)
            else:
                raise Exception(f'Unknown database type: {dbtype}')

            if where_data is None:
                where_data = data
            else:
                where_data += data

        if where_data is None:
            new_where = {}
            for key, value in where.items():
                if not isinstance(value, dict) or len(value.keys()) != 1 or list(value.keys())[0] != '$eq':
                    # TODO value should be just a string or number
                    raise Exception("Only simple '$eq' conditions are supported in 'where'")
                new_where[key] = value['$eq']
            if len(new_where) == 0:
                return []
            where_data = [new_where]

        try:
            model = self.custom_models.get_model_data(name=table)
        except Exception:
            model = self.mindsdb_native.get_model_data(name=table)

        predicted_columns = model['predict']

        original_target_values = {}
        for col in predicted_columns:
            if where_data is not None:
                if col in where_data:
                    original_target_values[col + '_original'] = list(where_data[col])
                else:
                    original_target_values[col + '_original'] = [None] * len(where_data)
            else:
                original_target_values[col + '_original'] = [None]

        if table in [x['name'] for x in self.custom_models.get_models()]:
            res = self.custom_models.predict(name=table, when_data=where_data)

            data = []
            fields = model['columns']
            for i, ele in enumerate(res):
                row = {
                    'select_data_query': select_data_query,
                    'external_datasource': external_datasource,
                    'when_data': original_when_data
                }
                for key in ele:
                    row[key] = ele[key]['predicted_value']
                    # FIXME prefer to get an int from mindsdb_native in this case
                    if model['data_analysis_v2'][key]['typing']['data_subtype'] == 'Int':
                        row[key] = int(row[key])
                # fill in the remaining fields from the input data, if available
                for k in fields:
                    if k not in ele:
                        if isinstance(where_data, list):
                            row[k] = where_data[i][k] if k in where_data[i] else None
                        elif k in where_data.columns:
                            row[k] = where_data[k].iloc[i]
                        else:
                            row[k] = None
                for k in original_target_values:
                    row[k] = original_target_values[k][i]
                data.append(row)

            field_types = {
                f: model['data_analysis_v2'][f]['typing']['data_subtype']
                for f in fields if 'typing' in model['data_analysis_v2'][f]
            }
            for row in data:
                cast_row_types(row, field_types)
            return data
        else:
            pred_dicts, explanations = self.mindsdb_native.predict(table, 'dict&explain', when_data=where_data)

            keys = [x for x in pred_dicts[0] if x in columns]
            min_max_keys = []
            for col in predicted_columns:
                if model['data_analysis_v2'][col]['typing']['data_type'] == 'Numeric':
                    min_max_keys.append(col)

            data = []
            explains = []
            for i, el in enumerate(pred_dicts):
                data.append({key: el[key] for key in keys})
                explains.append(explanations[i])

            field_types = {
                f: model['data_analysis_v2'][f]['typing']['data_subtype']
                for f in model['columns'] if 'typing' in model['data_analysis_v2'][f]
            }

            for i, row in enumerate(data):
                cast_row_types(row, field_types)
                row['select_data_query'] = select_data_query
                row['external_datasource'] = external_datasource
                row['when_data'] = original_when_data
                for k in original_target_values:
                    row[k] = original_target_values[k][i]

                explanation = explains[i]
                for key in predicted_columns:
                    row[key + '_confidence'] = explanation[key]['confidence']
                    row[key + '_explain'] = json.dumps(explanation[key], cls=NumpyJSONEncoder, ensure_ascii=False)
                for key in min_max_keys:
                    row[key + '_min'] = min(explanation[key]['confidence_interval'])
                    row[key + '_max'] = max(explanation[key]['confidence_interval'])
            return data
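# Illustrative query against the data node (a sketch: the predictor name,
# columns, and 'when_data' payload are assumptions):
node = MindsDBDataNode(config)   # 'config' comes from the surrounding application
rows = node.select(
    'home_rentals_model',
    columns=['rental_price', 'rental_price_confidence'],
    where={'when_data': {'$eq': json.dumps({'sqft': 800, 'location': 'good'})}},
)
# Each returned row carries the prediction plus *_confidence / *_explain metadata.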