예제 #1
0
class RedisStream(Thread):
    """Worker thread that turns records from one Redis stream into predictions.

    Each record read from ``stream_in`` is decoded, handed to
    ``NativeInterface.predict`` and every resulting item is appended to
    ``stream_out`` as ``{"prediction": <json string>}``. The processed input
    record is then deleted. The loop ends once ``stop_event`` is set.
    """

    def __init__(self, host, port, database, stream_in, stream_out, predictor, _type):
        """Connect to Redis and prepare (but do not start) the worker thread.

        Args:
            host: Redis host passed to ``walrus.Database``.
            port: Redis port passed to ``walrus.Database``.
            database: Redis database index passed to ``walrus.Database``.
            stream_in: Name of the stream that records are read from.
            stream_out: Name of the stream that predictions are written to.
            predictor: Name of the predictor used for predictions.
            _type: Stream type; stored but not otherwise used by this class.
        """
        self.host = host
        self.port = port
        self.db = database
        self.predictor = predictor
        self.client = self._get_client()
        self.stream_in_name = stream_in
        self.stream_out_name = stream_out
        self.stream_in = self.client.Stream(stream_in)
        self.stream_out = self.client.Stream(stream_out)
        self._type = _type
        self.native_interface = NativeInterface()
        self.format_flag = 'explain'

        self.stop_event = Event()
        self.company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        super().__init__(target=RedisStream.make_prediction, args=(self,))

    def _get_client(self):
        """Return a ``walrus.Database`` built from the stored host, port and db."""
        return walrus.Database(host=self.host, port=self.port, db=self.db)

    def make_prediction(self):
        """Thread body: poll the input stream and publish predictions.

        Returns immediately (after logging an error) when the predictor is
        not found for the current company. The DB session is always closed
        on exit, including on early return or when an exception escapes.
        """
        try:
            predict_record = session.query(DBPredictor).filter_by(
                company_id=self.company_id, name=self.predictor).first()
            if predict_record is None:
                log.error(f"Error creating stream: requested predictor {self.predictor} is not exist")
                return

            # stop_event.wait(0.5) doubles as the polling interval.
            while not self.stop_event.wait(0.5):
                # Non-blocking read: a read with block=0 waits indefinitely on
                # an empty stream, so stop_event would never be re-checked and
                # the thread could not be stopped.
                predict_info = self.stream_in.read()
                for record in predict_info:
                    record_id = record[0]
                    when_data = self.decode(record[1])

                    result = self.native_interface.predict(
                        self.predictor, self.format_flag, when_data=when_data)
                    log.error(f"STREAM: got {result}")
                    for res in result:
                        self.stream_out.add({"prediction": json.dumps(res)})
                    # Delete only after the predictions have been written out.
                    self.stream_in.delete(record_id)
        finally:
            session.close()

    def decode(self, redis_data):
        """Return a copy of ``redis_data`` with UTF-8 decoded keys and values.

        Args:
            redis_data: Mapping of ``bytes`` keys to ``bytes`` values.

        Returns:
            A new ``dict`` mapping ``str`` to ``str``.
        """
        return {
            key.decode('utf8'): value.decode('utf8')
            for key, value in redis_data.items()
        }
예제 #2
0
class KafkaStream(Thread):
    """Worker thread that consumes a Kafka topic and publishes predictions.

    Messages from ``topic_in`` are JSON-decoded and passed to
    ``NativeInterface.predict``; the results are JSON-encoded and sent to
    ``topic_out``. When ``_type`` is the timeseries stream type, records are
    first collected in per-group in-memory caches and predictions are made
    once a cache holds at least ``window_size`` records.
    """

    def __init__(self, connection_info, advanced_info, topic_in, topic_out,
                 predictor, _type, **ts_params):
        """Create the Kafka clients and prepare (but do not start) the thread.

        Args:
            connection_info: Keyword arguments shared by the consumer,
                producer and admin client.
            advanced_info: Mapping that may hold extra ``'consumer'`` and
                ``'producer'`` keyword-argument dicts.
            topic_in: Topic that records are consumed from.
            topic_out: Topic that predictions are sent to; it is created
                here unless it already exists.
            predictor: Name of the predictor used for predictions.
            _type: Stream type; compared case-insensitively against
                ``StreamTypes.timeseries``.
            **ts_params: Timeseries settings read in timeseries mode:
                ``target``, ``window_size``, ``group_by`` and ``order_by``.
        """
        self.connection_info = connection_info
        self.advanced_info = advanced_info
        self.predictor = predictor
        self.stream_in_name = topic_in
        self.stream_out_name = topic_out
        self.consumer = kafka.KafkaConsumer(
            **self.connection_info, **self.advanced_info.get('consumer', {}))
        self.consumer.subscribe(topics=[self.stream_in_name])
        self.producer = kafka.KafkaProducer(
            **self.connection_info, **self.advanced_info.get('producer', {}))
        self.admin = kafka.KafkaAdminClient(**self.connection_info)
        try:
            self.topic = NewTopic(self.stream_out_name,
                                  num_partitions=1,
                                  replication_factor=1)
            self.admin.create_topics([self.topic])
        except kafka.errors.TopicAlreadyExistsError:
            # The output topic is already there; nothing to create.
            pass
        self._type = _type
        self.native_interface = NativeInterface()
        self.format_flag = 'explain'

        self.stop_event = Event()
        self.company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        # Maps "cache.<group value>" to a list of records sorted by self.dt.
        self.caches = {}
        self.ts_params = ts_params
        if self._type.lower() == StreamTypes.timeseries:
            self.target = self.ts_params.get('target')
            self.window = self.ts_params.get('window_size')
            self.gb = self.ts_params.get('group_by')
            self.dt = self.ts_params.get('order_by')
            super().__init__(target=KafkaStream.make_timeseries_predictions,
                             args=(self, ))
        else:
            super().__init__(target=KafkaStream.make_prediction, args=(self, ))

    def _predictor_exists(self):
        """Return True if the predictor is registered; log an error otherwise."""
        predict_record = session.query(DBPredictor).filter_by(
            company_id=self.company_id, name=self.predictor).first()
        if predict_record is None:
            log.error(
                f"Error creating stream: requested predictor {self.predictor} is not exist"
            )
            return False
        return True

    def _shutdown(self):
        """Close producer, consumer and DB session, even if one close fails."""
        log.error("Stopping stream..")
        try:
            self.producer.close()
        finally:
            try:
                self.consumer.close()
            finally:
                session.close()

    def predict_ts(self, cache_name):
        """Predict on the whole cache, publish results, drop the oldest record.

        Every cached record gets a ``make_predictions`` flag: ``True`` when
        the record contains the target column, ``False`` otherwise. The flag
        is written into the cached record itself.

        Args:
            cache_name: Key of the cache in ``self.caches``.
        """
        when_list = list(self.caches[cache_name])
        for record in when_list:
            record['make_predictions'] = self.target in record

        result = self.native_interface.predict(self.predictor,
                                               self.format_flag,
                                               when_data=when_list)
        log.error(f"TIMESERIES STREAM: got {result}")
        for res in result:
            to_send = json.dumps(res).encode('utf-8')
            log.error(f"sending {to_send}")
            self.producer.send(self.stream_out_name, to_send)
        # Slide the window forward by discarding the first (oldest) record.
        self.caches[cache_name] = self.caches[cache_name][1:]

    def make_prediction_from_cache(self, cache_name):
        """Run ``predict_ts`` if the cache holds at least ``window`` records."""
        cache = self.caches[cache_name]
        log.error("STREAM: in make_prediction_from_cache")
        if len(cache) >= self.window:
            log.error(
                f"STREAM: make_prediction_from_cache - len(cache) = {len(cache)}"
            )
            self.predict_ts(cache_name)

    def to_cache(self, record):
        """Add ``record`` to its group's cache, predicting before and after.

        Args:
            record: Decoded message; must contain the ``group_by`` and
                ``order_by`` columns.
        """
        gb_val = record[self.gb]
        cache_name = f"cache.{gb_val}"
        self.caches.setdefault(cache_name, [])

        log.error(f"STREAM: cache {cache_name} has been created")
        self.make_prediction_from_cache(cache_name)
        self.handle_record(cache_name, record)
        self.make_prediction_from_cache(cache_name)
        log.error("STREAM in cache: current iteration has done.")

    def handle_record(self, cache_name, record):
        """Append ``record`` to the named cache and re-sort it by ``self.dt``."""
        log.error(f"STREAM: handling cache {cache_name} and {record} record.")
        cache = self.caches[cache_name]
        cache.append(record)
        self.caches[cache_name] = self.sort_cache(cache)

    def sort_cache(self, cache):
        """Return a new list of the records ordered by the ``self.dt`` column."""
        return sorted(cache, key=lambda x: x[self.dt])

    def make_timeseries_predictions(self):
        """Thread body for timeseries mode: feed consumed records into caches.

        Returns right away when the predictor is not found. Producer,
        consumer and DB session are always closed on exit.
        """
        log.error("STREAM: running 'make_timeseries_predictions'")
        try:
            if not self._predictor_exists():
                return

            while not self.stop_event.wait(0.5):
                # NOTE(review): next() presumably blocks until a message
                # arrives unless `consumer_timeout_ms` was supplied through
                # the consumer settings - confirm against the caller.
                try:
                    msg_str = next(self.consumer)
                except StopIteration:
                    continue
                when_data = json.loads(msg_str.value)
                self.to_cache(when_data)
        finally:
            self._shutdown()

    def make_prediction(self):
        """Thread body for plain mode: predict on every consumed message.

        Each prediction is sent to the output topic as the UTF-8 encoded JSON
        of ``{"prediction": <result item>}``. Returns right away when the
        predictor is not found. Producer, consumer and DB session are always
        closed on exit.
        """
        try:
            if not self._predictor_exists():
                return

            while not self.stop_event.wait(0.5):
                # NOTE(review): see make_timeseries_predictions about next()
                # blocking when no consumer timeout is configured.
                try:
                    msg_str = next(self.consumer)
                except StopIteration:
                    continue
                when_data = json.loads(msg_str.value)
                result = self.native_interface.predict(self.predictor,
                                                       self.format_flag,
                                                       when_data=when_data)
                log.error(f"STREAM: got {result}")
                for res in result:
                    to_send = json.dumps({"prediction": res}).encode('utf-8')
                    log.error(f"sending {to_send}")
                    self.producer.send(self.stream_out_name, to_send)
        finally:
            self._shutdown()
예제 #3
0
class RedisStream(Thread):
    """Worker thread that serves predictions over a pair of Redis streams.

    In plain mode every record from ``stream_in`` is predicted on its own.
    In ``'timeseries'`` mode records are first collected in per-group cache
    streams (named ``<stream name>.cache.<group value>``) and a prediction is
    made once a cache holds at least the window size. Predictions are written
    to ``stream_out`` as ``{"prediction": <json string>}``.
    """

    def __init__(self, name, connection_info, advanced_info, stream_in, stream_out, predictor, _type):
        """Connect to Redis and prepare (but do not start) the worker thread.

        Args:
            name: Stream name; used as the prefix of the cache stream keys.
            connection_info: Keyword arguments for ``walrus.Database``.
            advanced_info: Extra keyword arguments merged over
                ``connection_info``.
            stream_in: Name of the stream that records are read from.
            stream_out: Name of the stream that predictions are written to.
            predictor: Name of the predictor used for predictions.
            _type: ``'timeseries'`` selects the timeseries loop; any other
                value selects the plain loop.
        """
        self.stream_name = name
        # Merge into a copy so the caller's connection_info dict is not mutated.
        merged_info = dict(connection_info)
        merged_info.update(advanced_info)
        self.connection_info = merged_info
        self.predictor = predictor
        self.client = self._get_client()
        self.stream_in_name = stream_in
        self.stream_out_name = stream_out
        self.stream_in = self.client.Stream(stream_in)
        self.stream_out = self.client.Stream(stream_out)
        self._type = _type
        self.native_interface = NativeInterface()
        self.format_flag = 'explain'
        self.stop_event = Event()
        self.company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        if self._type == 'timeseries':
            super().__init__(target=RedisStream.make_timeseries_predictions, args=(self,))
        else:
            super().__init__(target=RedisStream.make_predictions, args=(self,))

    def _get_client(self):
        """Return a ``walrus.Database`` built from the merged connection info."""
        return walrus.Database(**self.connection_info)

    # NOTE(review): the four getters below return hard-coded values;
    # presumably placeholders until they are read from the predictor.

    def _get_target(self):
        """Return the name of the target column."""
        return "pnew_case"

    def _get_window_size(self):
        """Return the number of cached records needed before predicting."""
        return 10

    def _get_gb(self):
        """Return the name of the group-by column."""
        return "state"

    def _get_dt(self):
        """Return the name of the order-by column."""
        return "time"

    def predict(self, stream_in, stream_out, timeseries_mode=False):
        """Predict on the records of ``stream_in`` and write to ``stream_out``.

        In plain mode every record is predicted separately and deleted from
        ``stream_in`` afterwards. In timeseries mode all records are predicted
        together and ``stream_in`` is then trimmed to one record fewer.

        Args:
            stream_in: Stream to read records from.
            stream_out: Stream that receives ``{"prediction": <json>}``.
            timeseries_mode: Predict on all records at once when true.
        """
        # block=0 waits until the stream has data; kept as-is because the
        # timeseries path only calls this on a cache that already has records.
        predict_info = stream_in.read(block=0)
        when_list = []
        for record in predict_info:
            record_id = record[0]
            when_data = self.decode(record[1])
            if timeseries_mode:
                when_list.append(when_data)
                continue

            result = self.native_interface.predict(self.predictor, self.format_flag, when_data=when_data)
            log.error(f"STREAM: got {result}")
            for res in result:
                stream_out.add({"prediction": json.dumps(res)})
            stream_in.delete(record_id)

        if timeseries_mode:
            result = self.native_interface.predict(self.predictor, self.format_flag, when_data=when_list)
            log.error(f"TIMESERIES STREAM: got {result}")
            for res in result:
                stream_out.add({"prediction": json.dumps(res)})
            # Shrink the stream by one record so the window slides forward.
            stream_in.trim(len(stream_in) - 1, approximate=False)

    def make_prediction_from_cache(self, cache):
        """Run a timeseries prediction if ``cache`` holds at least ``window`` records."""
        log.error("STREAM: in make_prediction_from_cache")
        if len(cache) >= self.window:
            log.error(f"STREAM: make_prediction_from_cache - len(cache) = {len(cache)}")
            self.predict(cache, self.stream_out, timeseries_mode=True)

    def make_timeseries_predictions(self):
        """Thread body for timeseries mode: move input records into caches.

        Returns right away when the predictor is not found. The DB session is
        always closed on exit.
        """
        log.error("STREAM: running 'make_timeseries_predictions'")
        try:
            predict_record = session.query(DBPredictor).filter_by(company_id=self.company_id, name=self.predictor).first()
            if predict_record is None:
                log.error(f"Error creating stream: requested predictor {self.predictor} is not exist")
                return
            self.target = self._get_target()
            self.window = self._get_window_size()
            self.gb = self._get_gb()
            self.dt = self._get_dt()

            while not self.stop_event.wait(0.5):
                # Non-blocking read, as in make_predictions: a read with
                # block=0 waits indefinitely on an empty stream, so
                # stop_event would never be re-checked.
                predict_info = self.stream_in.read()
                for record in predict_info:
                    record_id = record[0]
                    when_data = self.decode(record[1])
                    log.error(f"STREAM: next record have read from {self.stream_in.key}: {when_data}")
                    self.to_cache(when_data)
                    self.stream_in.delete(record_id)
        finally:
            session.close()

    def to_cache(self, record):
        """Store ``record`` in its group's cache stream, predicting before and after.

        Args:
            record: Decoded record; must contain the group-by and order-by
                columns.
        """
        gb_val = record[self.gb]
        cache = self.client.Stream(f"{self.stream_name}.cache.{gb_val}")
        log.error(f"STREAM: cache {cache.key} has been created")
        self.make_prediction_from_cache(cache)
        self.handle_record(cache, record)
        self.make_prediction_from_cache(cache)
        log.error("STREAM in cache: current iteration has done.")

    def handle_record(self, cache, record):
        """Rewrite ``cache`` so it holds its old records plus ``record``, sorted.

        The cache stream is emptied and every record is added again in
        ``self.dt`` order.
        """
        log.error(f"STREAM: handling cache {cache.key} and {record} record.")
        records = [self.decode(x[1]) for x in cache.read()]
        log.error(f"STREAM: read {records} from cache.")
        records.append(record)
        records = self.sort_cache(records)
        log.error(f"STREAM: after updating and sorting - {records}.")
        cache.trim(0, approximate=False)
        for rec in records:
            cache.add(rec)
        log.error(f"STREAM: finish updating {cache.key}")

    def sort_cache(self, cache):
        """Return a new list of the records ordered by the ``self.dt`` column."""
        return sorted(cache, key=lambda x: x[self.dt])

    def make_predictions(self):
        """Thread body for plain mode: predict on every input record.

        Returns right away when the predictor is not found. The DB session is
        always closed on exit.
        """
        try:
            predict_record = session.query(DBPredictor).filter_by(company_id=self.company_id, name=self.predictor).first()
            if predict_record is None:
                log.error(f"Error creating stream: requested predictor {self.predictor} is not exist")
                return

            while not self.stop_event.wait(0.5):
                predict_info = self.stream_in.read()
                for record in predict_info:
                    record_id = record[0]
                    when_data = self.decode(record[1])

                    result = self.native_interface.predict(self.predictor, self.format_flag, when_data=when_data)
                    log.error(f"STREAM: got {result}")
                    for res in result:
                        self.stream_out.add({"prediction": json.dumps(res)})
                    self.stream_in.delete(record_id)
        finally:
            session.close()
            log.error("STREAM: stopping...")

    def decode(self, redis_data):
        """Return a copy of ``redis_data`` with UTF-8 decoded keys and values.

        Args:
            redis_data: Mapping of ``bytes`` keys to ``bytes`` values.

        Returns:
            A new ``dict`` mapping ``str`` to ``str``.
        """
        return {
            key.decode('utf8'): value.decode('utf8')
            for key, value in redis_data.items()
        }
예제 #4
0
class MindsDBDataNode(DataNode):
    """Data node that exposes predictors as tables that can be selected from.

    Besides native and custom models it serves two fixed tables,
    ``predictors`` and ``commands``, and the registered AI tables.
    """

    type = 'mindsdb'

    def __init__(self, config):
        """Create the interfaces used to reach models, AI tables and datasources.

        Args:
            config: NOTE(review): not used here; a fresh ``Config()`` is
                built instead - confirm this is intended.
        """
        self.config = Config()
        self.mindsdb_native = NativeInterface()
        self.custom_models = CustomModels()
        self.ai_table = AITable_store()
        self.default_store = DataStore()

    def getTables(self):
        """Return the names of all tables served by this node.

        That is: native models whose status is ``'complete'``, the fixed
        ``predictors`` and ``commands`` tables, custom models and AI tables.
        """
        models = self.mindsdb_native.get_models()
        tables = [x['name'] for x in models if x['status'] == 'complete']
        tables += ['predictors', 'commands']
        tables += [x['name'] for x in self.custom_models.get_models()]
        tables += [x['name'] for x in self.ai_table.get_ai_tables()]
        return tables

    def hasTable(self, table):
        """Return True if ``table`` is one of the names from ``getTables``."""
        return table in self.getTables()

    def _get_ai_table_columns(self, table_name):
        """Return query field names followed by predictor column names of an AI table."""
        aitable_record = self.ai_table.get_ai_table(table_name)
        columns = ([x['name'] for x in aitable_record.query_fields] +
                   [x['name'] for x in aitable_record.predictor_columns])
        return columns

    def _get_model_columns(self, table_name):
        """Return the column names exposed for a native model.

        These are the model's own columns, ``<target>_original`` for every
        predicted column, then ``_min``/``_max`` (numeric targets only),
        ``_confidence`` and ``_explain`` per predicted column.
        """
        model = self.mindsdb_native.get_model_data(name=table_name)
        columns = list(model['columns'])
        columns += [f'{x}_original' for x in model['predict']]
        for col in model['predict']:
            if model['data_analysis_v2'][col]['typing'][
                    'data_type'] == 'Numeric':
                columns += [f"{col}_min", f"{col}_max"]
            columns += [f"{col}_confidence", f"{col}_explain"]
        return columns

    def getTableColumns(self, table):
        """Return the list of column names for ``table``.

        Custom models are tried first; then the fixed tables, AI tables and
        native models. An empty list is returned for an unknown table.
        """
        try:
            # Build a new list so the model data's own 'columns' list is not
            # extended in place.
            columns = list(self.custom_models.get_model_data(table)['columns'])
            columns += [
                'external_datasource', 'select_data_query', 'when_data'
            ]
            return columns
        except Exception:
            # Best effort: not a usable custom model, fall through to the
            # other table kinds.
            pass

        if table == 'predictors':
            return [
                'name', 'status', 'accuracy', 'predict', 'select_data_query',
                'external_datasource', 'training_options'
            ]
        if table == 'commands':
            return ['command']

        columns = []

        ai_tables = self.ai_table.get_ai_table(table)
        if ai_tables is not None:
            columns = self._get_ai_table_columns(table)
        elif any(x['name'] == table
                 for x in self.mindsdb_native.get_models()):
            columns = self._get_model_columns(table)
            columns += [
                'when_data', 'select_data_query', 'external_datasource'
            ]

        return columns

    def _select_predictors(self):
        """Return one row per native model for the ``predictors`` table."""
        models = self.mindsdb_native.get_models()
        # TODO add custom models
        return [
            {
                'name': x['name'],
                'status': x['status'],
                'accuracy':
                str(x['accuracy']) if x['accuracy'] is not None else None,
                'predict': ', '.join(x['predict']),
                'select_data_query': '',
                'external_datasource': '',  # TODO
                'training_options': ''  # TODO ?
            } for x in models
        ]

    def delete_predictor(self, name):
        """Delete the native model called ``name``."""
        self.mindsdb_native.delete_model(name)

    def _select_from_ai_table(self, table, columns, where):
        """Run the AI table's query through its predictor and return the rows.

        A temporary datasource is created from the table's integration query
        and removed again afterwards, even when prediction fails.

        Args:
            table: Name of the AI table.
            columns: Not used.
            where: Not used.

        Returns:
            A list of dicts keyed by the AI table's column names.
        """
        aitable_record = self.ai_table.get_ai_table(table)
        integration = aitable_record.integration_name
        query = aitable_record.integration_query
        predictor_name = aitable_record.predictor_name

        _, ds_name = self.default_store.save_datasource(
            'temp_ds', integration, {'query': query})
        try:
            dso = self.default_store.get_datasource_obj(ds_name, raw=True)
            res = self.mindsdb_native.predict(predictor_name,
                                              'dict',
                                              when_data=dso)
        finally:
            self.default_store.delete_datasource(ds_name)

        # Maps a key of the prediction result to the output column name.
        keys_map = {}
        for f in aitable_record.predictor_columns:
            keys_map[f['value']] = f['name']
        for f in aitable_record.query_fields:
            keys_map[f['name']] = f['name']

        return [{out_name: el[key] for key, out_name in keys_map.items()}
                for el in res]

    def select(self,
               table,
               columns=None,
               where=None,
               where_data=None,
               order_by=None,
               group_by=None,
               came_from=None):
        """Return the rows of ``table``, predicting when it is a model.

        NOTE: WHERE statements can be just ``$eq`` joined with 'and'.

        Args:
            table: Table (model) name.
            columns: Names of the columns to keep from each prediction;
                ``None`` keeps every column.
            where: Mapping ``column -> {'$eq': value}``. The special keys
                ``when_data`` (JSON text), ``external_datasource`` and
                ``select_data_query`` are handled separately. ``None`` is
                treated as no conditions.
            where_data: Rows to predict on, if already available.
            order_by: Not used.
            group_by: Not used.
            came_from: Name of the integration that ``select_data_query``
                is executed against.

        Returns:
            A list of row dicts.

        Raises:
            ValueError: ``when_data`` is combined with other keys or cannot
                be parsed.
            Exception: Unknown integration type, or a condition that is not
                a single ``$eq``.
        """
        if table == 'predictors':
            return self._select_predictors()
        if table == 'commands':
            return []
        if self.ai_table.get_ai_table(table):
            return self._select_from_ai_table(table, columns, where)

        # Work on a copy: keys are deleted below and the caller's dict must
        # stay intact. A missing `where` means "no conditions".
        where = {} if where is None else dict(where)

        original_when_data = None
        if 'when_data' in where:
            if len(where) > 1:
                raise ValueError(
                    "Should not be used any other keys in 'where', if 'when_data' used"
                )
            raw_when_data = where['when_data']
            try:
                original_when_data = raw_when_data['$eq']
                parsed = json.loads(original_when_data)
            except Exception as e:
                # Report the value that failed to parse, not the unrelated
                # `where_data` argument.
                raise ValueError(
                    f'''Error while parse 'when_data'="{raw_when_data}"'''
                ) from e
            where_data = parsed if isinstance(parsed, list) else [parsed]

        external_datasource = None
        if 'external_datasource' in where:
            external_datasource = where['external_datasource']['$eq']
            del where['external_datasource']

        select_data_query = None
        if came_from is not None and 'select_data_query' in where:
            select_data_query = where['select_data_query']['$eq']
            del where['select_data_query']

            dbtype = self.config['integrations'][came_from]['type']
            if dbtype == 'clickhouse':
                ch = Clickhouse(self.config, came_from)
                res = ch._query(
                    select_data_query.strip(' ;\n') + ' FORMAT JSON')
                data = res.json()['data']
            elif dbtype == 'mariadb':
                maria = Mariadb(self.config, came_from)
                data = maria._query(select_data_query)
            elif dbtype == 'mysql':
                mysql = MySQL(self.config, came_from)
                data = mysql._query(select_data_query)
            elif dbtype == 'postgres':
                postgres = PostgreSQL(self.config, came_from)
                data = postgres._query(select_data_query)
            elif dbtype == 'mssql':
                mssql = MSSQL(self.config, came_from)
                data = mssql._query(select_data_query, fetch=True)
            else:
                raise Exception(f'Unknown database type: {dbtype}')

            if where_data is None:
                where_data = data
            else:
                where_data += data

        if where_data is None:
            # No explicit rows: build a single row from the `$eq` conditions.
            new_where = {}
            for key, value in where.items():
                if not isinstance(value, dict) or list(value) != ['$eq']:
                    # TODO value should be just string or number
                    raise Exception(
                        f"Only '$eq' conditions are supported in 'where', "
                        f"got {key}={value!r}")
                new_where[key] = value['$eq']

            if len(new_where) == 0:
                return []

            where_data = [new_where]

        try:
            model = self.custom_models.get_model_data(name=table)
        except Exception:
            model = self.mindsdb_native.get_model_data(name=table)

        predicted_columns = model['predict']

        # `where_data` is never None at this point.
        original_target_values = {}
        for col in predicted_columns:
            if col in where_data:
                original_target_values[col + '_original'] = list(
                    where_data[col])
            else:
                original_target_values[col +
                                       '_original'] = [None] * len(where_data)

        if table in [x['name'] for x in self.custom_models.get_models()]:
            res = self.custom_models.predict(name=table, when_data=where_data)

            data = []
            fields = model['columns']
            for i, ele in enumerate(res):
                row = {}
                row['select_data_query'] = select_data_query
                row['external_datasource'] = external_datasource
                row['when_data'] = original_when_data

                for key in ele:
                    row[key] = ele[key]['predicted_value']
                    # FIXME prefer get int from mindsdb_native in this case
                    if model['data_analysis_v2'][key]['typing'][
                            'data_subtype'] == 'Int':
                        row[key] = int(row[key])

                # Fill the non-predicted columns from the input row.
                for k in fields:
                    if k not in ele:
                        if isinstance(where_data, list):
                            if k in where_data[i]:
                                row[k] = where_data[i][k]
                            else:
                                row[k] = None
                        elif k in where_data.columns:
                            row[k] = where_data[k].iloc[i]
                        else:
                            row[k] = None

                for k in original_target_values:
                    row[k] = original_target_values[k][i]

                data.append(row)

            field_types = {
                f: model['data_analysis_v2'][f]['typing']['data_subtype']
                for f in fields if 'typing' in model['data_analysis_v2'][f]
            }
            for row in data:
                cast_row_types(row, field_types)

            return data
        else:
            pred_dicts, explanations = self.mindsdb_native.predict(
                table, 'dict&explain', when_data=where_data)

            if len(pred_dicts) == 0:
                # Nothing was predicted, so there is no first row to take
                # the keys from.
                return []

            if columns is None:
                keys = list(pred_dicts[0])
            else:
                keys = [x for x in pred_dicts[0] if x in columns]

            min_max_keys = []
            for col in predicted_columns:
                if model['data_analysis_v2'][col]['typing'][
                        'data_type'] == 'Numeric':
                    min_max_keys.append(col)

            data = [{key: el[key] for key in keys} for el in pred_dicts]

            field_types = {
                f: model['data_analysis_v2'][f]['typing']['data_subtype']
                for f in model['columns']
                if 'typing' in model['data_analysis_v2'][f]
            }

            for i, row in enumerate(data):
                cast_row_types(row, field_types)

                row['select_data_query'] = select_data_query
                row['external_datasource'] = external_datasource
                row['when_data'] = original_when_data

                for k in original_target_values:
                    row[k] = original_target_values[k][i]

                explanation = explanations[i]
                for key in predicted_columns:
                    row[key + '_confidence'] = explanation[key]['confidence']
                    row[key + '_explain'] = json.dumps(explanation[key],
                                                       cls=NumpyJSONEncoder,
                                                       ensure_ascii=False)
                for key in min_max_keys:
                    row[key + '_min'] = min(
                        explanation[key]['confidence_interval'])
                    row[key + '_max'] = max(
                        explanation[key]['confidence_interval'])

            return data