Example #1
    def _get_integration(self, db_alias):
        if self.config['integrations'][db_alias]['publish']:
            db_type = self.config['integrations'][db_alias]['type']
            if db_type == 'clickhouse':
                return Clickhouse(self.config, db_alias)
            elif db_type == 'mariadb':
                return Mariadb(self.config, db_alias)
            elif db_type == 'mysql':
                return MySQL(self.config, db_alias)
            elif db_type == 'postgres':
                return PostgreSQL(self.config, db_alias)
            elif db_type == 'mssql':
                return MSSQL(self.config, db_alias)
            elif db_type == 'mongodb':
                return MongoDB(self.config, db_alias)
            else:
                logger.warning(f'Unknown integration type: {db_type} for database called: {db_alias}')
        # not published, or unknown type: no integration to return
        return None
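The if/elif chain above is a natural candidate for a lookup table. Below is a minimal sketch of the same dispatch as a dict, reusing the integration classes and logger from the example; the registry name is illustrative, and it assumes all integration classes share the (config, db_alias) constructor seen above.

    # Hypothetical registry-based variant; INTEGRATION_CLASSES is not part of the original code.
    INTEGRATION_CLASSES = {
        'clickhouse': Clickhouse,
        'mariadb': Mariadb,
        'mysql': MySQL,
        'postgres': PostgreSQL,
        'mssql': MSSQL,
        'mongodb': MongoDB,
    }

    def _get_integration(self, db_alias):
        if not self.config['integrations'][db_alias]['publish']:
            return None
        db_type = self.config['integrations'][db_alias]['type']
        integration_class = INTEGRATION_CLASSES.get(db_type)
        if integration_class is None:
            logger.warning(f'Unknown integration type: {db_type} for database called: {db_alias}')
            return None
        return integration_class(self.config, db_alias)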
Example #2
    def _get_integrations(self):
        # @TODO Once we have persistent state sorted out, this should be simplified so it does not refresh the existing integrations every single time
        integration_arr = []
        for db_alias in self.config['integrations']:
            if self.config['integrations'][db_alias]['enabled']:
                db_type = self.config['integrations'][db_alias]['type']
                if db_type == 'clickhouse':
                    integration_arr.append(Clickhouse(self.config, db_alias))
                elif db_type == 'mariadb':
                    integration_arr.append(Mariadb(self.config, db_alias))
                elif db_type == 'mysql':
                    integration_arr.append(MySQL(self.config, db_alias))
                elif db_type == 'postgres':
                    integration_arr.append(PostgreSQL(self.config, db_alias))
                elif db_type == 'mssql':
                    integration_arr.append(MSSQL(self.config, db_alias))
                elif db_type == 'mongodb':
                    pass
                else:
                    print(
                        f'Unknown integration type: {db_type} for database called: {db_alias}'
                    )

        return integration_arr
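Both helpers read the same config['integrations'] layout: a mapping from database alias to a dict carrying at least 'type' plus the 'publish' / 'enabled' flags they check. Here is a minimal sketch of such a config; the aliases and values are illustrative, only the keys read above come from the code.

    config = {
        'integrations': {
            'my_clickhouse': {           # db_alias
                'type': 'clickhouse',    # dispatch key used by both helpers
                'publish': True,         # checked by _get_integration
                'enabled': True,         # checked by _get_integrations
            },
            'my_maria': {
                'type': 'mariadb',
                'publish': False,
                'enabled': False,
            },
        }
    }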
Example #3
    def select(self,
               table,
               columns=None,
               where=None,
               where_data=None,
               order_by=None,
               group_by=None,
               integration_name=None,
               integration_type=None):
        ''' NOTE: WHERE conditions may only be '$eq' comparisons joined with 'and'
        '''
        if table == 'predictors':
            return self._select_predictors()
        if table == 'commands':
            return []
        if table == 'datasources':
            return self._select_datasources()
        if self.ai_table.get_ai_table(table):
            return self._select_from_ai_table(table, columns, where)

        original_when_data = None
        if 'when_data' in where_data:
            if len(where_data) > 1:
                raise ValueError(
                    "No other keys may be used in 'where' if 'when_data' is used"
                )
            try:
                original_when_data = where_data['when_data']
                where_data = json.loads(where_data['when_data'])
                if isinstance(where_data, list) is False:
                    where_data = [where_data]
            except Exception:
                raise ValueError(
                    f'''Error while parsing 'when_data'="{where_data}"''')

        select_data_query = None
        if integration_name is not None and 'select_data_query' in where_data:
            select_data_query = where_data['select_data_query']
            del where_data['select_data_query']

            integration_data = self.datasource_interface.get_db_integration(
                integration_name)
            if integration_type == 'clickhouse':
                ch = Clickhouse(self.config, integration_name,
                                integration_data)
                res = ch._query(
                    select_data_query.strip(' ;\n') + ' FORMAT JSON')
                data = res.json()['data']
            elif integration_type == 'mariadb':
                maria = Mariadb(self.config, integration_name,
                                integration_data)
                data = maria._query(select_data_query)
            elif integration_type == 'mysql':
                mysql = MySQL(self.config, integration_name, integration_data)
                data = mysql._query(select_data_query)
            elif integration_type == 'postgres':
                postgres = PostgreSQL(self.config, integration_name,
                                      integration_data)
                data = postgres._query(select_data_query)
            elif integration_type == 'mssql':
                mssql = MSSQL(self.config, integration_name, integration_data)
                data = mssql._query(select_data_query, fetch=True)
            else:
                raise Exception(f'Unknown database type: {integration_type}')

            where_data = data

        new_where = {}
        if isinstance(where_data, dict):
            # where_data is still the raw dict of '$eq' conditions
            for key, value in where_data.items():
                if isinstance(value, dict) is False or len(value.keys()) != 1 or list(value.keys())[0] != '$eq':
                    # TODO value should be just string or number
                    raise Exception("Only '$eq' conditions are supported in 'where'")
                new_where[key] = value['$eq']

            if len(new_where) == 0:
                return []

            where_data = [new_where]

        model = self.model_interface.get_model_data(name=table)
        columns = list(model['dtype_dict'].keys())

        predicted_columns = model['predict']
        if not isinstance(predicted_columns, list):
            predicted_columns = [predicted_columns]

        original_target_values = {}
        for col in predicted_columns:
            if where_data is not None:
                if col in where_data:
                    original_target_values[col + '_original'] = list(where_data[col])
                else:
                    original_target_values[col + '_original'] = [None] * len(where_data)
            else:
                original_target_values[col + '_original'] = [None]

        pred_dicts, explanations = self.model_interface.predict(
            table, where_data, 'dict&explain')

        # transform predictions into a more convenient form
        new_pred_dicts = []
        for row in pred_dicts:
            new_row = {}
            for key in row:
                new_row.update(row[key])
                new_row[key] = new_row['predicted_value']
            del new_row['predicted_value']
            new_pred_dicts.append(new_row)
        pred_dicts = new_pred_dicts

        timeseries_settings = model['problem_definition']['timeseries_settings']

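        # For time series predictors: group rows by the series keys, then expand
        # each group's final row into nr_predictions forecast rows.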
        if timeseries_settings['is_timeseries'] is True:
            __mdb_make_predictions = set([
                row.get('__mdb_make_predictions', True) for row in where_data
            ]) == {True}

            predict = model['predict']
            group_by = timeseries_settings['group_by']
            order_by_column = timeseries_settings['order_by'][0]
            nr_predictions = timeseries_settings['nr_predictions']

            groups = set()
            for row in pred_dicts:
                groups.add(tuple([row[x] for x in group_by]))

            # split rows by groups
            rows_by_groups = {}
            for group in groups:
                rows_by_groups[group] = {'rows': [], 'explanations': []}
                for row_index, row in enumerate(pred_dicts):
                    is_wrong_group = False
                    for i, group_by_key in enumerate(group_by):
                        if row[group_by_key] != group[i]:
                            is_wrong_group = True
                            break
                    if not is_wrong_group:
                        rows_by_groups[group]['rows'].append(row)
                        rows_by_groups[group]['explanations'].append(
                            explanations[row_index])

            for group, data in rows_by_groups.items():
                rows = data['rows']
                explanations = data['explanations']

                if len(rows) == 0:
                    break

                for row in rows:
                    predictions = row[predict]
                    if isinstance(predictions, list) is False:
                        predictions = [predictions]

                    date_values = row[order_by_column]
                    if isinstance(date_values, list) is False:
                        date_values = [date_values]

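                # all rows except the last carry history: keep only their first
                # predicted value and order-by value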
                for i in range(len(rows) - 1):
                    rows[i][predict] = rows[i][predict][0]
                    rows[i][order_by_column] = rows[i][order_by_column][0]
                    for col in ('predicted_value', 'confidence',
                                'confidence_lower_bound',
                                'confidence_upper_bound'):
                        explanations[i][predict][col] = explanations[i][
                            predict][col][0]

                last_row = rows.pop()
                last_explanation = explanations.pop()
                for i in range(nr_predictions):
                    new_row = copy.deepcopy(last_row)
                    if nr_predictions == 1:
                        new_row[predict] = new_row[predict]
                        new_row[order_by_column] = new_row[order_by_column]
                    else:
                        new_row[predict] = new_row[predict][i]
                        new_row[order_by_column] = new_row[order_by_column][i]
                    if '__mindsdb_row_id' in new_row and (
                            i > 0 or __mdb_make_predictions is False):
                        new_row['__mindsdb_row_id'] = None
                    rows.append(new_row)

                    new_explanation = copy.deepcopy(last_explanation)
                    for col in ('predicted_value', 'confidence',
                                'confidence_lower_bound',
                                'confidence_upper_bound'):
                        if nr_predictions == 1:
                            new_explanation[predict][col] = new_explanation[
                                predict][col]
                        else:
                            new_explanation[predict][col] = new_explanation[
                                predict][col][i]
                    if i != 0:
                        new_explanation[predict]['anomaly'] = None
                        new_explanation[predict]['truth'] = None
                    explanations.append(new_explanation)

            pred_dicts = []
            explanations = []
            for group, data in rows_by_groups.items():
                pred_dicts.extend(data['rows'])
                explanations.extend(data['explanations'])

            original_target_values[f'{predict}_original'] = []
            for i in range(len(pred_dicts)):
                original_target_values[f'{predict}_original'].append(
                    explanations[i][predict].get('truth', None))

            if model['dtype_dict'][order_by_column] == dtype.date:
                for row in pred_dicts:
                    if isinstance(row[order_by_column], (int, float)):
                        row[order_by_column] = str(
                            datetime.fromtimestamp(
                                row[order_by_column]).date())
            elif model['dtype_dict'][order_by_column] == dtype.datetime:
                for row in pred_dicts:
                    if isinstance(row[order_by_column], (int, float)):
                        row[order_by_column] = str(
                            datetime.fromtimestamp(row[order_by_column]))

        keys = [x for x in pred_dicts[0] if x in columns]
        min_max_keys = []
        for col in predicted_columns:
            if model['dtype_dict'][col] in (dtype.integer, dtype.float):
                min_max_keys.append(col)

        data = []
        explains = []
        keys_to_save = [
            *keys, '__mindsdb_row_id', 'select_data_query', 'when_data'
        ]
        for i, el in enumerate(pred_dicts):
            data.append({key: el.get(key) for key in keys_to_save})
            explains.append(explanations[i])

        for i, row in enumerate(data):
            cast_row_types(row, model['dtype_dict'])

            row['select_data_query'] = select_data_query
            row['when_data'] = original_when_data

            for k in original_target_values:
                try:
                    row[k] = original_target_values[k][i]
                except Exception:
                    row[k] = None

            for column_name in columns:
                if column_name not in row:
                    row[column_name] = None

            explanation = explains[i]
            for key in predicted_columns:
                row[key + '_confidence'] = explanation[key]['confidence']
                row[key + '_explain'] = json.dumps(explanation[key],
                                                   cls=NumpyJSONEncoder,
                                                   ensure_ascii=False)
                if 'anomaly' in explanation[key]:
                    row[key + '_anomaly'] = explanation[key]['anomaly']
            for key in min_max_keys:
                row[key + '_min'] = explanation[key]['confidence_lower_bound']
                row[key + '_max'] = explanation[key]['confidence_upper_bound']

        return data
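The new_where block above flattens a dict of {'column': {'$eq': value}} conditions into the single-row list the predictor expects. A self-contained sketch of that step follows; the function name is illustrative.

    def flatten_eq_where(where):
        """Turn {'col': {'$eq': val}, ...} into [{'col': val, ...}],
        raising on anything that is not a plain '$eq' comparison,
        mirroring the check in select() above."""
        new_where = {}
        for key, value in where.items():
            if not isinstance(value, dict) or list(value.keys()) != ['$eq']:
                raise ValueError(f"Only '$eq' conditions are supported, got: {value!r}")
            new_where[key] = value['$eq']
        return [new_where] if new_where else []

    assert flatten_eq_where({'sqft': {'$eq': 1000}}) == [{'sqft': 1000}]
    assert flatten_eq_where({}) == []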
Example #4
    def select(self,
               table,
               columns=None,
               where=None,
               where_data=None,
               order_by=None,
               group_by=None,
               came_from=None):
        ''' NOTE: WHERE conditions may only be '$eq' comparisons joined with 'and'
        '''
        if table == 'predictors':
            return self._select_predictors()
        if table == 'commands':
            return []

        original_when_data = None
        if 'when_data' in where:
            if len(where) > 1:
                raise ValueError(
                    "No other keys may be used in 'where' if 'when_data' is used"
                )
            try:
                original_when_data = where['when_data']['$eq']
                where_data = json.loads(where['when_data']['$eq'])
                if isinstance(where_data, list) is False:
                    where_data = [where_data]
            except Exception:
                raise ValueError(
                    f'''Error while parsing 'when_data'="{where_data}"''')

        external_datasource = None
        if 'external_datasource' in where:
            external_datasource = where['external_datasource']['$eq']
            del where['external_datasource']

        select_data_query = None
        if came_from is not None and 'select_data_query' in where:
            select_data_query = where['select_data_query']['$eq']
            del where['select_data_query']

            dbtype = self.config['integrations'][came_from]['type']
            if dbtype == 'clickhouse':
                ch = Clickhouse(self.config, came_from)
                res = ch._query(
                    select_data_query.strip(' ;\n') + ' FORMAT JSON')
                data = res.json()['data']
            elif dbtype == 'mariadb':
                maria = Mariadb(self.config, came_from)
                data = maria._query(select_data_query)
            elif dbtype == 'mysql':
                mysql = MySQL(self.config, came_from)
                data = mysql._query(select_data_query)
            elif dbtype == 'postgres':
                postgres = PostgreSQL(self.config, came_from)
                data = postgres._query(select_data_query)
            elif dbtype == 'mssql':
                mssql = MSSQL(self.config, came_from)
                data = mssql._query(select_data_query, fetch=True)
            else:
                raise Exception(f'Unknown database type: {dbtype}')

            if where_data is None:
                where_data = data
            else:
                where_data += data

        new_where = {}
        if where_data is not None:
            where_data = pandas.DataFrame(where_data)
        else:
            for key, value in where.items():
                if isinstance(value, dict) is False or len(value.keys()) != 1 or list(value.keys())[0] != '$eq':
                    # TODO value should be just string or number
                    raise Exception("Only '$eq' conditions are supported in 'where'")
                new_where[key] = value['$eq']

            if len(new_where) == 0:
                return []

            where_data = [new_where]

        try:
            model = self.custom_models.get_model_data(name=table)
        except Exception:
            model = self.mindsdb_native.get_model_data(name=table)

        predicted_columns = model['predict']

        original_target_values = {}
        for col in predicted_columns:
            if where_data is not None:
                if col in where_data:
                    original_target_values[col + '_original'] = list(where_data[col])
                else:
                    original_target_values[col + '_original'] = [None] * len(where_data)
            else:
                original_target_values[col + '_original'] = [None]

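        # custom models take a separate predict path; everything else goes through mindsdb_native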
        if table in [x['name'] for x in self.custom_models.get_models()]:
            res = self.custom_models.predict(name=table, when_data=where_data)

            data = []
            fields = model['data_analysis_v2']['columns']
            for i, ele in enumerate(res):
                row = {}
                row['select_data_query'] = select_data_query
                row['external_datasource'] = external_datasource
                row['when_data'] = original_when_data

                for key in ele:
                    row[key] = ele[key]['predicted_value']
                    # FIXME prefer get int from mindsdb_native in this case
                    if model['data_analysis_v2'][key]['typing'][
                            'data_subtype'] == 'Int':
                        row[key] = int(row[key])

                for k in fields:
                    if k not in ele:
                        if isinstance(where_data, list):
                            if k in where_data[i]:
                                row[k] = where_data[i][k]
                            else:
                                row[k] = None
                        elif k in where_data.columns:
                            row[k] = where_data[k].iloc[i]
                        else:
                            row[k] = None

                for k in original_target_values:
                    row[k] = original_target_values[k][i]

                data.append(row)

            field_types = {
                f: model['data_analysis_v2'][f]['typing']['data_subtype']
                for f in fields if 'typing' in model['data_analysis_v2'][f]
            }
            for row in data:
                cast_row_types(row, field_types)

            return data
        else:
            res = self.mindsdb_native.predict(name=table, when_data=where_data)

            keys = [x for x in list(res._data.keys()) if x in columns]
            min_max_keys = []
            for col in predicted_columns:
                if model['data_analysis_v2'][col]['typing'][
                        'data_type'] == 'Numeric':
                    min_max_keys.append(col)

            data = []
            explains = []
            for i, el in enumerate(res):
                data.append({key: el[key] for key in keys})
                explains.append(el.explain())

            field_types = {
                f: model['data_analysis_v2'][f]['typing']['data_subtype']
                for f in model['data_analysis_v2']['columns']
                if 'typing' in model['data_analysis_v2'][f]
            }

            for row in data:
                cast_row_types(row, field_types)

                row['select_data_query'] = select_data_query
                row['external_datasource'] = external_datasource
                row['when_data'] = original_when_data

                for k in original_target_values:
                    row[k] = original_target_values[k][i]

                explanation = explains[i]
                for key in predicted_columns:
                    row[key + '_confidence'] = explanation[key]['confidence']
                    row[key + '_explain'] = json.dumps(explanation[key],
                                                       cls=NumpyJSONEncoder)
                for key in min_max_keys:
                    row[key + '_min'] = min(
                        explanation[key]['confidence_interval'])
                    row[key + '_max'] = max(
                        explanation[key]['confidence_interval'])

            return data
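The 'when_data' branch shared by examples #3-#5 parses a JSON payload and normalizes it to a list of rows. A self-contained sketch of that normalization follows; the helper name is illustrative.

    import json

    def parse_when_data(raw):
        """Parse a JSON 'when_data' payload into a list of row dicts,
        wrapping a single object in a list as select() does above."""
        try:
            rows = json.loads(raw)
        except Exception as e:
            raise ValueError(f"Error while parsing 'when_data'={raw!r}") from e
        if not isinstance(rows, list):
            rows = [rows]
        return rows

    assert parse_when_data('{"sqft": 1000}') == [{'sqft': 1000}]
    assert parse_when_data('[{"sqft": 1000}, {"sqft": 1200}]') == [{'sqft': 1000}, {'sqft': 1200}]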
Example #5
    def select(self,
               table,
               columns=None,
               where=None,
               where_data=None,
               order_by=None,
               group_by=None,
               came_from=None):
        ''' NOTE: WHERE conditions may only be '$eq' comparisons joined with 'and'
        '''
        if table == 'predictors':
            return self._select_predictors()
        if table == 'commands':
            return []

        original_when_data = None
        if 'when_data' in where:
            if len(where) > 1:
                raise ValueError(
                    "No other keys may be used in 'where' if 'when_data' is used"
                )
            try:
                original_when_data = where['when_data']['$eq']
                where_data = json.loads(where['when_data']['$eq'])
                if isinstance(where_data, list) is False:
                    where_data = [where_data]
            except Exception:
                raise ValueError(
                    f'''Error while parsing 'when_data'="{where_data}"''')

        external_datasource = None
        if 'external_datasource' in where:
            external_datasource = where['external_datasource']['$eq']
            del where['external_datasource']

        select_data_query = None
        if came_from is not None and 'select_data_query' in where:
            select_data_query = where['select_data_query']['$eq']
            del where['select_data_query']

            dbtype = self.config['integrations'][came_from]['type']
            if dbtype == 'clickhouse':
                ch = Clickhouse(self.config, came_from)
                res = ch._query(
                    select_data_query.strip(' ;\n') + ' FORMAT JSON')
                data = res.json()['data']
            elif dbtype == 'mariadb':
                maria = Mariadb(self.config, came_from)
                data = maria._query(select_data_query)
            elif dbtype == 'mysql':
                mysql = MySQL(self.config, came_from)
                data = mysql._query(select_data_query)
            elif dbtype == 'postgres':
                postgres = PostgreSQL(self.config, came_from)
                data = postgres._query(select_data_query)
            elif dbtype == 'mssql':
                mssql = MSSQL(self.config, came_from)
                data = mssql._query(select_data_query, fetch=True)
            else:
                raise Exception(f'Unknown database type: {dbtype}')

            if where_data is None:
                where_data = data
            else:
                where_data += data

        new_where = {}
        if where_data is not None:
            where_data = pandas.DataFrame(where_data)
        else:
            for key, value in where.items():
                if isinstance(value, dict) is False or len(value.keys()) != 1 or list(value.keys())[0] != '$eq':
                    # TODO value should be just string or number
                    raise Exception("Only '$eq' conditions are supported in 'where'")
                new_where[key] = value['$eq']

            if len(new_where) == 0:
                return []

            where_data = [new_where]

        model = self.mindsdb_native.get_model_data(name=table)
        predicted_columns = model['predict']

        original_target_values = {}
        for col in predicted_columns:
            if where_data is not None:
                if col in where_data:
                    original_target_values[col + '_original'] = list(where_data[col])
                else:
                    original_target_values[col + '_original'] = [None] * len(where_data)
            else:
                original_target_values[col + '_original'] = [None]

        res = self.mindsdb_native.predict(name=table, when_data=where_data)

        data = []
        keys = [x for x in list(res._data.keys()) if x in columns]
        min_max_keys = []
        for col in predicted_columns:
            if model['data_analysis_v2'][col]['typing'][
                    'data_type'] == 'Numeric':
                min_max_keys.append(col)

        length = len(res._data[predicted_columns[0]])
        for i in range(length):
            row = {}
            explanation = res[i].explain()
            for key in keys:
                row[key] = res._data[key][i]
                # +++ FIXME: workaround until issue https://github.com/mindsdb/mindsdb/issues/591 is resolved
                typing = None
                if key in model['data_analysis_v2']:
                    typing = model['data_analysis_v2'][key]['typing'][
                        'data_subtype']

                if typing == 'Timestamp' and row[key] is not None:
                    timestamp = datetime.datetime.utcfromtimestamp(row[key])
                    row[key] = timestamp.strftime('%Y-%m-%d %H:%M:%S')
                elif typing == 'Date' and row[key] is not None:
                    timestamp = datetime.datetime.utcfromtimestamp(row[key])
                    row[key] = timestamp.strftime('%Y-%m-%d')
                # ---
            for key in predicted_columns:
                row[key + '_confidence'] = explanation[key]['confidence']
                row[key + '_explain'] = json.dumps(explanation[key])
            for key in min_max_keys:
                row[key + '_min'] = min(
                    explanation[key]['confidence_interval'])
                row[key + '_max'] = max(
                    explanation[key]['confidence_interval'])
            row['select_data_query'] = select_data_query
            row['external_datasource'] = external_datasource
            row['when_data'] = original_when_data
            for k in original_target_values:
                row[k] = original_target_values[k][i]
            data.append(row)

        return data
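The Timestamp/Date workaround near the end of example #5 converts epoch-second values into formatted strings. A minimal standalone version of that casting, assuming epoch-second inputs as the code above does; the helper name is illustrative.

    import datetime

    def cast_epoch(value, typing):
        """Format an epoch-seconds value the way select() does above;
        other typings (and None values) pass through unchanged."""
        if value is None:
            return None
        if typing == 'Timestamp':
            return datetime.datetime.utcfromtimestamp(value).strftime('%Y-%m-%d %H:%M:%S')
        if typing == 'Date':
            return datetime.datetime.utcfromtimestamp(value).strftime('%Y-%m-%d')
        return value

    assert cast_epoch(0, 'Date') == '1970-01-01'
    assert cast_epoch(0, 'Timestamp') == '1970-01-01 00:00:00'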