# Relies on the module-level imports of this file: json, datetime and pandas,
# plus the MindsDB-internal DataNode, MindsdbNative and the
# Clickhouse/Mariadb/MySQL/PostgreSQL integration helpers.
class MindsDBDataNode(DataNode):
    type = 'mindsdb'

    def __init__(self, config):
        self.config = config
        self.mindsdb_native = MindsdbNative(config)

    def getTables(self):
        models = self.mindsdb_native.get_models()
        models = [x['name'] for x in models if x['status'] == 'complete']
        models += ['predictors', 'commands']
        return models

    def hasTable(self, table):
        return table in self.getTables()

    def getTableColumns(self, table):
        if table == 'predictors':
            return ['name', 'status', 'accuracy', 'predict', 'select_data_query',
                    'external_datasource', 'training_options']
        if table == 'commands':
            return ['command']

        model = self.mindsdb_native.get_model_data(name=table)

        columns = []
        columns += [x['column_name'] for x in model['data_analysis']['input_columns_metadata']]
        columns += [x['column_name'] for x in model['data_analysis']['target_columns_metadata']]
        columns += [f'{x}_original' for x in model['predict']]
        for col in model['predict']:
            if model['data_analysis_v2'][col]['typing']['data_type'] == 'Numeric':
                columns += [f'{col}_min', f'{col}_max']
            columns += [f'{col}_confidence']
            columns += [f'{col}_explain']

        # TODO this should be added just for clickhouse queries
        columns += ['when_data', 'select_data_query', 'external_datasource']

        return columns

    def _select_predictors(self):
        models = self.mindsdb_native.get_models()
        return [{
            'name': x['name'],
            'status': x['status'],
            'accuracy': str(x['accuracy']) if x['accuracy'] is not None else None,
            'predict': ', '.join(x['predict']),
            'select_data_query': x['data_source'],
            'external_datasource': '',  # TODO
            'training_options': ''  # TODO ?
        } for x in models]

    def delete_predictor(self, name):
        self.mindsdb_native.delete_model(name)

    def select(self, table, columns=None, where=None, where_data=None,
               order_by=None, group_by=None, came_from=None):
        ''' NOTE: WHERE statements can only be $eq conditions joined with 'and' '''
        if table == 'predictors':
            return self._select_predictors()
        if table == 'commands':
            return []

        # Input rows may arrive JSON-encoded under the 'when_data' key.
        original_when_data = None
        if 'when_data' in where:
            if len(where) > 1:
                raise ValueError("No other keys may be used in 'where' when 'when_data' is used")
            try:
                original_when_data = where['when_data']['$eq']
                where_data = json.loads(where['when_data']['$eq'])
                if isinstance(where_data, list) is False:
                    where_data = [where_data]
            except Exception:
                raise ValueError(f'''Error while parsing 'where_data'="{where_data}"''')

        external_datasource = None
        if 'external_datasource' in where:
            external_datasource = where['external_datasource']['$eq']
            del where['external_datasource']

        # If the query came from an integration, fetch the input rows by
        # running 'select_data_query' against that integration's database.
        select_data_query = None
        if came_from is not None and 'select_data_query' in where:
            select_data_query = where['select_data_query']['$eq']
            del where['select_data_query']

            dbtype = self.config['integrations'][came_from]['type']
            if dbtype == 'clickhouse':
                ch = Clickhouse(self.config, came_from)
                res = ch._query(select_data_query.strip(' ;\n') + ' FORMAT JSON')
                data = res.json()['data']
            elif dbtype == 'mariadb':
                maria = Mariadb(self.config, came_from)
                data = maria._query(select_data_query)
            elif dbtype == 'mysql':
                mysql = MySQL(self.config, came_from)
                data = mysql._query(select_data_query)
            elif dbtype == 'postgres':
                postgres = PostgreSQL(self.config, came_from)
                data = postgres._query(select_data_query)
            else:
                raise Exception(f'Unknown database type: {dbtype}')

            if where_data is None:
                where_data = data
            else:
                where_data += data

        new_where = {}
        if where_data is not None:
            where_data = pandas.DataFrame(where_data)
        else:
            # Fall back to plain '$eq' conditions as the prediction input.
            for key, value in where.items():
                if isinstance(value, dict) is False or len(value.keys()) != 1 or list(value.keys())[0] != '$eq':
                    # TODO value should be just string or number
                    raise Exception("The only supported 'where' condition is a plain '$eq' per column")
                new_where[key] = value['$eq']
            if len(new_where) == 0:
                return []
            where_data = [new_where]

        model = self.mindsdb_native.get_model_data(name=table)
        predicted_columns = model['predict']

        # Keep the original target values (if provided) so they can be
        # returned alongside the predictions as '<column>_original'.
        original_target_values = {}
        for col in predicted_columns:
            if where_data is not None:
                if col in where_data:
                    original_target_values[col + '_original'] = list(where_data[col])
                else:
                    original_target_values[col + '_original'] = [None] * len(where_data)
            else:
                original_target_values[col + '_original'] = [None]

        res = self.mindsdb_native.predict(name=table, when_data=where_data)

        data = []
        keys = [x for x in list(res._data.keys()) if x in columns]
        min_max_keys = []
        for col in predicted_columns:
            if model['data_analysis_v2'][col]['typing']['data_type'] == 'Numeric':
                min_max_keys.append(col)

        length = len(res._data[predicted_columns[0]])
        for i in range(length):
            row = {}
            explanation = res[i].explain()
            for key in keys:
                row[key] = res._data[key][i]
                # +++ FIXME this fix until issue https://github.com/mindsdb/mindsdb/issues/591 not resolved
                typing = None
                if key in model['data_analysis_v2']:
                    typing = model['data_analysis_v2'][key]['typing']['data_subtype']
                if typing == 'Timestamp' and row[key] is not None:
                    timestamp = datetime.datetime.utcfromtimestamp(row[key])
                    row[key] = timestamp.strftime('%Y-%m-%d %H:%M:%S')
                elif typing == 'Date':
                    timestamp = datetime.datetime.utcfromtimestamp(row[key])
                    row[key] = timestamp.strftime('%Y-%m-%d')
                # ---
            for key in predicted_columns:
                row[key + '_confidence'] = explanation[key]['confidence']
                row[key + '_explain'] = json.dumps(explanation[key])
            for key in min_max_keys:
                row[key + '_min'] = min(explanation[key]['confidence_interval'])
                row[key + '_max'] = max(explanation[key]['confidence_interval'])
            row['select_data_query'] = select_data_query
            row['external_datasource'] = external_datasource
            row['when_data'] = original_when_data
            for k in original_target_values:
                row[k] = original_target_values[k][i]
            data.append(row)

        return data
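
# --- Usage sketch (illustrative, not part of the original module) ---
# Shows how the SQL proxies would resolve a SELECT against this data node:
# 'predictors' is queried like a plain table, while a query against a trained
# model passes its input row(s) JSON-encoded under 'when_data'. The model
# name 'home_rentals', its input columns, and the 'rental_price' target are
# hypothetical placeholders; `config` is assumed to be a loaded MindsDB
# config object.
def _example_mindsdb_datanode_query(config):
    node = MindsDBDataNode(config)

    # 'predictors' and 'commands' are always listed alongside trained models.
    print(node.getTables())

    for row in node.select('predictors'):
        print(row['name'], row['status'], row['predict'])

    # Ask for a prediction the same way the SQL layer does.
    when = json.dumps({'sqft': 900, 'neighborhood': 'downtown'})  # placeholder inputs
    rows = node.select(
        'home_rentals',
        columns=node.getTableColumns('home_rentals'),
        where={'when_data': {'$eq': when}}
    )
    print(rows[0].get('rental_price_confidence'))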
def initialize_interfaces(config, app):
    app.default_store = DataStore(config)
    app.mindsdb_native = MindsdbNative(config)
    app.custom_models = CustomModels(config)
    app.config_obj = config
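
# --- Usage sketch (illustrative, not part of the original code) ---
# `initialize_interfaces` only attaches the shared interface objects to the
# HTTP server's `app` object, so any attribute-assignable object works for
# demonstration. The SimpleNamespace stand-in and the way `config` is
# obtained are assumptions, not part of the original API.
def _example_initialize_interfaces(config):
    from types import SimpleNamespace

    app = SimpleNamespace()
    initialize_interfaces(config, app)

    print([ds['name'] for ds in app.default_store.get_datasources()])
    print([m['name'] for m in app.custom_models.get_models()])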
} for api in api_arr]

# Every requested API must have a matching entry in the config.
for api in api_arr:
    api_name = api['name']
    if api_name not in config['api']:
        print(f"Trying to run the '{api_name}' API, but there is no config for it.")
        print(f"Please fill in config['api']['{api_name}']")
        sys.exit(0)

# Entry points for the individual API servers, imported elsewhere in this file.
start_functions = {
    'http': start_http,
    'mysql': start_mysql,
    'mongodb': start_mongo
}

mdb = MindsdbNative(config)
cst = CustomModels(config)
# @TODO Maybe just use `get_model_data` directly here ? Seems like a useless abstraction
model_data_arr = [{
    'name': x['name'],
    'predict': x['predict'],
    'data_analysis': mdb.get_model_data(x['name'])['data_analysis_v2']
} for x in mdb.get_models()]

for m in model_data_arr:
    if 'columns_to_ignore' in m['data_analysis']:
        del m['data_analysis']['columns_to_ignore']
    if 'train_std_dev' in m['data_analysis']:
        del m['data_analysis']['train_std_dev']
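
# --- Illustrative config fragment (hypothetical values) ---
# The check above only requires that every API in `api_arr` has a matching
# entry under config['api']; the hosts and ports below are placeholders.
_example_api_config = {
    'api': {
        'http': {'host': '127.0.0.1', 'port': '47334'},
        'mysql': {'host': '127.0.0.1', 'port': '47335'},
        'mongodb': {'host': '127.0.0.1', 'port': '47336'}
    }
}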
# Relies on the module-level imports of this file: os, json, shutil, pickle,
# datetime and parse_dt, plus MindsdbNative and the datasource classes
# (FileDS, ClickhouseDS, MariaDS, MySqlDS, PostgresDS, MSSQLDS, MongoDS,
# SnowflakeDS) from mindsdb_native.
class DataStore():
    def __init__(self, config):
        self.config = config
        self.dir = config.paths['datasources']
        self.mindsdb_native = MindsdbNative(config)

    def get_analysis(self, ds):
        if isinstance(ds, str):
            return self.mindsdb_native.analyse_dataset(self.get_datasource_obj(ds))
        else:
            return self.mindsdb_native.analyse_dataset(ds)

    def get_datasources(self):
        datasource_arr = []
        for ds_name in os.listdir(self.dir):
            try:
                with open(os.path.join(self.dir, ds_name, 'metadata.json'), 'r') as fp:
                    try:
                        datasource = json.load(fp)
                        datasource['created_at'] = parse_dt(datasource['created_at'].split('.')[0])
                        datasource['updated_at'] = parse_dt(datasource['updated_at'].split('.')[0])
                        datasource_arr.append(datasource)
                    except Exception as e:
                        print(e)
            except Exception as e:
                print(e)
        return datasource_arr

    def get_data(self, name, where=None, limit=None, offset=None):
        if offset is None:
            offset = 0
        ds = self.get_datasource_obj(name)
        # @TODO Remove and add `offset` to the `filter` method of the datasource
        if limit is not None:
            filtered_ds = ds.filter(where=where, limit=limit + offset)
        else:
            filtered_ds = ds.filter(where=where)

        filtered_ds = filtered_ds.iloc[offset:]
        data = filtered_ds.to_dict(orient='records')
        return {
            'data': data,
            'rowcount': len(ds),
            'columns_names': filtered_ds.columns
        }

    def get_datasource(self, name):
        for ds in self.get_datasources():
            if ds['name'] == name:
                return ds
        return None

    def delete_datasource(self, name):
        shutil.rmtree(os.path.join(self.dir, name))

    def save_datasource(self, name, source_type, source, file_path=None):
        if source_type == 'file' and (file_path is None):
            raise Exception('`file_path` argument required when source_type == "file"')

        # If a datasource with this name already exists, append a '__<n>__' suffix.
        for i in range(1, 1000):
            if name in [x['name'] for x in self.get_datasources()]:
                previous_index = i - 1
                name = name.replace(f'__{previous_index}__', '')
                name = f'{name}__{i}__'
            else:
                break

        ds_meta_dir = os.path.join(self.dir, name)
        os.mkdir(ds_meta_dir)

        try:
            if source_type == 'file':
                source = os.path.join(ds_meta_dir, source)
                shutil.move(file_path, source)
                ds = FileDS(source)
                picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}
            elif source_type in self.config['integrations']:
                integration = self.config['integrations'][source_type]

                ds_class_map = {
                    'clickhouse': ClickhouseDS,
                    'mariadb': MariaDS,
                    'mysql': MySqlDS,
                    'postgres': PostgresDS,
                    'mssql': MSSQLDS,
                    'mongodb': MongoDS,
                    'snowflake': SnowflakeDS
                }

                try:
                    dsClass = ds_class_map[integration['type']]
                except KeyError:
                    raise KeyError(f"Unknown DS type: {source_type}, type is {integration['type']}")

                if integration['type'] in ['clickhouse']:
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }
                    ds = dsClass(**picklable['kwargs'])
                elif integration['type'] in ['mssql', 'postgres', 'mariadb', 'mysql']:
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }
                    if 'database' in integration:
                        picklable['kwargs']['database'] = integration['database']
                    if 'database' in source:
                        picklable['kwargs']['database'] = source['database']
                    ds = dsClass(**picklable['kwargs'])
                elif integration['type'] == 'snowflake':
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'schema': source['schema'],
                            'warehouse': source['warehouse'],
                            'database': source['database'],
                            'host': integration['host'],
                            'password': integration['password'],
                            'user': integration['user'],
                            'account': integration['account']
                        }
                    }
                    ds = dsClass(**picklable['kwargs'])
                elif integration['type'] == 'mongodb':
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'database': source['database'],
                            'collection': source['collection'],
                            'query': source['find'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }
                    ds = dsClass(**picklable['kwargs'])
            else:
                # This probably only happens for urls
                ds = FileDS(source)
                picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}

            df = ds.df

            if '' in df.columns or len(df.columns) != len(set(df.columns)):
                shutil.rmtree(ds_meta_dir)
                raise Exception('Each column in the datasource must have a unique name')

            # Not sure if needed
            # summary_analysis = self.get_analysis(ds.filter(limit=200))['data_analysis_v2']

            with open(os.path.join(ds_meta_dir, 'ds.pickle'), 'wb') as fp:
                pickle.dump(picklable, fp)

            with open(os.path.join(ds_meta_dir, 'metadata.json'), 'w') as fp:
                meta = {
                    'name': name,
                    'source_type': source_type,
                    'source': source,
                    'created_at': str(datetime.datetime.now()).split('.')[0],
                    'updated_at': str(datetime.datetime.now()).split('.')[0],
                    'row_count': len(df),
                    'columns': [dict(name=x) for x in list(df.keys())]
                }
                json.dump(meta, fp, indent=4, sort_keys=True)

            with open(os.path.join(ds_meta_dir, 'versions.json'), 'wt') as fp:
                json.dump(self.config.versions, fp, indent=4, sort_keys=True)

        except Exception:
            if os.path.isdir(ds_meta_dir):
                shutil.rmtree(ds_meta_dir)
            raise

        return self.get_datasource_obj(name, raw=True), name

    def get_datasource_obj(self, name, raw=False):
        ds_meta_dir = os.path.join(self.dir, name)
        ds = None
        try:
            with open(os.path.join(ds_meta_dir, 'ds.pickle'), 'rb') as fp:
                picklable = pickle.load(fp)
                if raw:
                    return picklable
                try:
                    ds = eval(picklable['class'])(*picklable['args'], **picklable['kwargs'])
                except Exception:
                    ds = picklable
            return ds
        except Exception as e:
            print(f'\n{e}\n')
            return None
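
# --- Usage sketch (illustrative, not part of the original module) ---
# Round-trips a file-backed datasource through the DataStore API above.
# `config` is assumed to be a loaded MindsDB config object; the datasource
# name and the CSV path are placeholders. Only methods defined on DataStore
# are used.
def _example_datastore_roundtrip(config):
    store = DataStore(config)

    # save_datasource() returns the picklable datasource description and the
    # final name (which may gain a '__<n>__' suffix if the name is taken).
    _, ds_name = store.save_datasource(
        name='home_rentals',
        source_type='file',
        source='home_rentals.csv',
        file_path='/tmp/home_rentals.csv'  # placeholder path
    )

    meta = store.get_datasource(ds_name)
    print(meta['row_count'], [c['name'] for c in meta['columns']])

    print(store.get_data(ds_name, limit=5)['data'])

    store.delete_datasource(ds_name)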
# Relies on the module-level imports of this file: os, sys, json, shutil,
# pandas as pd and mindsdb_native, plus the MindsDB-internal MindsdbNative
# and DatabaseWrapper.
class CustomModels():
    def __init__(self, config):
        self.config = config
        self.dbw = DatabaseWrapper(self.config)
        self.storage_dir = os.path.join(config['storage_dir'], 'misc')
        os.makedirs(self.storage_dir, exist_ok=True)
        self.model_cache = {}
        self.mindsdb_native = MindsdbNative(self.config)

    def _dir(self, name):
        return str(os.path.join(self.storage_dir, 'custom_model_' + name))

    def _internal_load(self, name):
        # Caching (2 lines below), currently disabled due to multiprocessing cache invalidation issues
        # if name in self.model_cache:
        #     return self.model_cache[name]

        # "Proper" model loading (3 lines below), currently disabled due to pickling issues
        # spec = importlib.util.spec_from_file_location(name, self._dir(name) + '/model.py')
        # module = importlib.util.module_from_spec(spec)
        # spec.loader.exec_module(module)

        sys.path.insert(0, self._dir(name))
        module = __import__(name)

        try:
            model = module.Model.load(os.path.join(self._dir(name), 'model.pickle'))
        except Exception:
            model = module.Model()
            model.initialize_column_types()
            if hasattr(model, 'setup'):
                model.setup()

        self.model_cache[name] = model

        return model

    def learn(self, name, from_data, to_predict, kwargs={}):
        model_data = self.get_model_data(name)
        model_data['status'] = 'training'
        self.save_model_data(name, model_data)

        to_predict = to_predict if isinstance(to_predict, list) else [to_predict]
        data_source = getattr(mindsdb_native, from_data['class'])(*from_data['args'], **from_data['kwargs'])
        data_frame = data_source._df
        model = self._internal_load(name)
        model.to_predict = to_predict

        model_data = self.get_model_data(name)
        model_data['predict'] = model.to_predict
        self.save_model_data(name, model_data)

        data_analysis = self.mindsdb_native.analyse_dataset(data_source)['data_analysis_v2']

        model_data = self.get_model_data(name)
        model_data['data_analysis'] = data_analysis
        self.save_model_data(name, model_data)

        model.fit(data_frame, to_predict, data_analysis, kwargs)

        model.save(os.path.join(self._dir(name), 'model.pickle'))
        self.model_cache[name] = model

        model_data = self.get_model_data(name)
        model_data['status'] = 'completed'
        self.save_model_data(name, model_data)

        self.dbw.unregister_predictor(name)
        self.dbw.register_predictors([self.get_model_data(name)], setup=False)

    def predict(self, name, when_data=None, from_data=None, kwargs={}):
        if from_data is not None:
            data_source = getattr(mindsdb_native, from_data['class'])(*from_data['args'], **from_data['kwargs'])
            data_frame = data_source._df
        elif when_data is not None:
            if isinstance(when_data, dict):
                for k in when_data:
                    when_data[k] = [when_data[k]]
                data_frame = pd.DataFrame(when_data)
            else:
                data_frame = pd.DataFrame(when_data)

        model = self._internal_load(name)
        predictions = model.predict(data_frame, kwargs)

        pred_arr = []
        for i in range(len(predictions)):
            pred_arr.append({})
            for col in predictions.columns:
                pred_arr[-1][col] = {}
                pred_arr[-1][col]['predicted_value'] = predictions[col].iloc[i]

        return pred_arr

    def get_model_data(self, name):
        with open(os.path.join(self._dir(name), 'metadata.json'), 'r') as fp:
            return json.load(fp)

    def save_model_data(self, name, data):
        with open(os.path.join(self._dir(name), 'metadata.json'), 'w') as fp:
            json.dump(data, fp)

    def get_models(self, status='any'):
        models = []
        for model_dir in os.listdir(self.storage_dir):
            if 'custom_model_' in model_dir:
                name = model_dir.replace('custom_model_', '')
                try:
                    models.append(self.get_model_data(name))
                except Exception:
                    print(f'Model {name} not found!')

        return models

    def delete_model(self, name):
        shutil.rmtree(self._dir(name))
        self.dbw.unregister_predictor(name)

    def rename_model(self, name, new_name):
        self.dbw.unregister_predictor(name)
        shutil.move(self._dir(name), self._dir(new_name))
        shutil.move(os.path.join(self._dir(new_name), f'{name}.py'),
                    os.path.join(self._dir(new_name), f'{new_name}.py'))
        self.dbw.register_predictors([self.get_model_data(new_name)], setup=False)

    def export_model(self, name):
        shutil.make_archive(base_name=name, format='zip', root_dir=self._dir(name))
        return str(self._dir(name)) + '.zip'

    def load_model(self, fpath, name, trained_status):
        shutil.unpack_archive(fpath, self._dir(name), 'zip')
        shutil.move(os.path.join(self._dir(name), 'model.py'),
                    os.path.join(self._dir(name), f'{name}.py'))
        model = self._internal_load(name)
        model.to_predict = model.to_predict if isinstance(model.to_predict, list) else [model.to_predict]
        self.save_model_data(name, {
            'name': name,
            'data_analysis': model.column_type_map,
            'predict': model.to_predict,
            'status': trained_status,
            'is_custom': True
        })

        with open(os.path.join(self._dir(name), '__init__.py'), 'w') as fp:
            fp.write('')

        if trained_status == 'trained':
            self.dbw.register_predictors([self.get_model_data(name)], setup=False)
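
# --- Usage sketch (illustrative, not part of the original module) ---
# Trains and queries a custom model through the wrapper above. It assumes the
# model was previously imported with load_model() (so '<name>.py' and
# metadata.json already exist in its directory), that `config` is a loaded
# MindsDB config, and that the CSV path, feature names and 'target' column
# are placeholders.
def _example_custom_model(config):
    custom = CustomModels(config)

    custom.learn(
        name='my_model',
        from_data={'class': 'FileDS', 'args': ['/tmp/train.csv'], 'kwargs': {}},
        to_predict='target'
    )

    preds = custom.predict(
        name='my_model',
        when_data={'feature_a': 1, 'feature_b': 'x'}  # placeholder inputs
    )
    print(preds[0]['target']['predicted_value'])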