def rename_model(self, old_name, new_name, company_id: int):
    db_p = db.session.query(db.Predictor).filter_by(company_id=company_id, name=old_name).first()
    db_p.name = new_name
    db.session.commit()

    dbw = DatabaseWrapper(company_id)
    dbw.unregister_predictor(old_name)
    dbw.register_predictors([self.get_model_data(new_name, company_id)])
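# Usage sketch (hypothetical values; assumes an active SQLAlchemy session on `db`
# and a DatabaseWrapper that proxies predictor registration to the integrations):
#
#   interface.rename_model('home_rentals', 'home_rentals_v2', company_id=1)
#
# Note the ordering: the DB record is renamed and committed first, then the old
# name is unregistered from the database integrations and the new one registered.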
class CustomModels():
    def __init__(self):
        self.config = Config()
        self.fs_store = FsSotre()
        self.company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        self.dbw = DatabaseWrapper()
        self.storage_dir = self.config['paths']['custom_models']
        os.makedirs(self.storage_dir, exist_ok=True)
        self.model_cache = {}
        self.mindsdb_native = NativeInterface()

    def _dir(self, name):
        return str(os.path.join(self.storage_dir, name))

    def _internal_load(self, name):
        self.fs_store.get(name, f'custom_model_{self.company_id}_{name}', self.storage_dir)
        sys.path.insert(0, self._dir(name))
        module = __import__(name)

        try:
            model = module.Model.load(os.path.join(self._dir(name), 'model.pickle'))
        except Exception:
            model = module.Model()
            model.initialize_column_types()
            if hasattr(model, 'setup'):
                model.setup()

        self.model_cache[name] = model
        return model

    def learn(self, name, from_data, to_predict, datasource_id, kwargs={}):
        model_data = self.get_model_data(name)
        model_data['status'] = 'training'
        self.save_model_data(name, model_data)

        to_predict = to_predict if isinstance(to_predict, list) else [to_predict]
        data_source = getattr(mindsdb_datasources, from_data['class'])(*from_data['args'], **from_data['kwargs'])
        data_frame = data_source.df
        model = self._internal_load(name)
        model.to_predict = to_predict

        model_data = self.get_model_data(name)
        model_data['predict'] = model.to_predict
        self.save_model_data(name, model_data)

        data_analysis = self.mindsdb_native.analyse_dataset(data_source)['data_analysis_v2']

        model_data = self.get_model_data(name)
        model_data['data_analysis_v2'] = data_analysis
        self.save_model_data(name, model_data)

        model.fit(data_frame, to_predict, data_analysis, kwargs)

        model.save(os.path.join(self._dir(name), 'model.pickle'))
        self.model_cache[name] = model

        model_data = self.get_model_data(name)
        model_data['status'] = 'completed'
        model_data['columns'] = list(data_analysis.keys())
        self.save_model_data(name, model_data)

        self.fs_store.put(name, f'custom_model_{self.company_id}_{name}', self.storage_dir)

        self.dbw.unregister_predictor(name)
        self.dbw.register_predictors([self.get_model_data(name)])

    def predict(self, name, when_data=None, from_data=None, kwargs=None):
        self.fs_store.get(name, f'custom_model_{self.company_id}_{name}', self.storage_dir)
        if kwargs is None:
            kwargs = {}
        if from_data is not None:
            if isinstance(from_data, dict):
                data_source = getattr(mindsdb_datasources, from_data['class'])(*from_data['args'], **from_data['kwargs'])
            else:
                # assume that a particular instance of some DataSource class was provided
                data_source = from_data
            data_frame = data_source.df
        elif when_data is not None:
            if isinstance(when_data, dict):
                for k in when_data:
                    when_data[k] = [when_data[k]]
                data_frame = pd.DataFrame(when_data)
            else:
                data_frame = pd.DataFrame(when_data)

        model = self._internal_load(name)
        predictions = model.predict(data_frame, kwargs)

        pred_arr = []
        for i in range(len(predictions)):
            pred_arr.append({})
            for col in predictions.columns:
                pred_arr[-1][col] = {}
                pred_arr[-1][col]['predicted_value'] = predictions[col].iloc[i]

        return pred_arr

    def get_model_data(self, name):
        predictor_record = Predictor.query.filter_by(company_id=self.company_id, name=name, is_custom=True).first()
        return predictor_record.data

    def save_model_data(self, name, data):
        predictor_record = Predictor.query.filter_by(company_id=self.company_id, name=name, is_custom=True).first()
        if predictor_record is None:
            predictor_record = Predictor(company_id=self.company_id, name=name, is_custom=True, data=data)
            session.add(predictor_record)
        else:
            predictor_record.data = data
        session.commit()

    def get_models(self):
        predictor_names = [
            x.name for x in Predictor.query.filter_by(company_id=self.company_id, is_custom=True)
        ]
        models = []
        for name in predictor_names:
            models.append(self.get_model_data(name))
        return models

    def delete_model(self, name):
        Predictor.query.filter_by(company_id=self.company_id, name=name, is_custom=True).delete()
        session.commit()
        shutil.rmtree(self._dir(name))
        self.dbw.unregister_predictor(name)
        self.fs_store.delete(f'custom_model_{self.company_id}_{name}')

    def rename_model(self, name, new_name):
        self.fs_store.get(name, f'custom_model_{self.company_id}_{name}', self.storage_dir)
        self.dbw.unregister_predictor(name)
        shutil.move(self._dir(name), self._dir(new_name))
        # bug fix: a missing comma made the first argument a string concatenation
        shutil.move(os.path.join(self._dir(new_name), f'{name}.py'), os.path.join(self._dir(new_name), f'{new_name}.py'))
        predictor_record = Predictor.query.filter_by(company_id=self.company_id, name=name, is_custom=True).first()
        predictor_record.name = new_name
        session.commit()
        self.dbw.register_predictors([self.get_model_data(new_name)])
        # bug fix: store the renamed directory (was `name`, which no longer exists locally)
        self.fs_store.put(new_name, f'custom_model_{self.company_id}_{new_name}', self.storage_dir)
        self.fs_store.delete(f'custom_model_{self.company_id}_{name}')

    def export_model(self, name):
        shutil.make_archive(base_name=name, format='zip', root_dir=self._dir(name))
        return str(self._dir(name)) + '.zip'

    def load_model(self, fpath, name, trained_status):
        shutil.unpack_archive(fpath, self._dir(name), 'zip')
        shutil.move(os.path.join(self._dir(name), 'model.py'), os.path.join(self._dir(name), f'{name}.py'))
        model = self._internal_load(name)
        model.to_predict = model.to_predict if isinstance(model.to_predict, list) else [model.to_predict]
        self.save_model_data(name, {
            'name': name,
            'data_analysis_v2': model.column_type_map,
            'predict': model.to_predict,
            'status': trained_status,
            'is_custom': True,
            'columns': list(model.column_type_map.keys())
        })

        with open(os.path.join(self._dir(name), '__init__.py'), 'w') as fp:
            fp.write('')

        self.fs_store.put(name, f'custom_model_{self.company_id}_{name}', self.storage_dir)

        if trained_status == 'trained':
            self.dbw.register_predictors([self.get_model_data(name)])
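# Usage sketch (hypothetical names; assumes MINDSDB_COMPANY_ID is set and a model
# archive containing model.py plus an optional model.pickle):
#
#   custom_models = CustomModels()
#   custom_models.load_model('/tmp/my_model.zip', 'my_model', trained_status='trained')
#   custom_models.learn('my_model',
#                       {'class': 'FileDS', 'args': ['data.csv'], 'kwargs': {}},
#                       to_predict='target', datasource_id=1)
#   preds = custom_models.predict('my_model', when_data={'feature_a': 1})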
class NativeInterface():
    def __init__(self, config):
        self.config = config
        self.dbw = DatabaseWrapper(self.config)
        self.predictor_cache = {}

    def _invalidate_cached_predictors(self):
        # @TODO: The cache can become stale if this NativeInterface instance is not invoked
        # while a bunch of predictors remain cached, no matter where we invoke it.
        # In practice this shouldn't be a big issue though.
        for predictor_name in list(self.predictor_cache.keys()):
            if (datetime.datetime.now() - self.predictor_cache[predictor_name]['created']).total_seconds() > 1200:
                del self.predictor_cache[predictor_name]

    def _setup_for_creation(self, name):
        if name in self.predictor_cache:
            del self.predictor_cache[name]
        # Here for no particular reason, because we want to run this sometimes but not too often
        self._invalidate_cached_predictors()

        predictor_dir = Path(self.config.paths['predictors']).joinpath(name)
        create_directory(predictor_dir)
        versions_file_path = predictor_dir.joinpath('versions.json')
        with open(str(versions_file_path), 'wt') as f:
            json.dump(self.config.versions, f, indent=4, sort_keys=True)

    def create(self, name):
        self._setup_for_creation(name)
        predictor = mindsdb_native.Predictor(name=name, run_env={'trigger': 'mindsdb'})
        return predictor

    def learn(self, name, from_data, to_predict, kwargs={}):
        join_learn_process = kwargs.get('join_learn_process', False)
        if 'join_learn_process' in kwargs:
            del kwargs['join_learn_process']

        self._setup_for_creation(name)

        p = LearnProcess(name, from_data, to_predict, kwargs, self.config.get_all())
        p.start()
        if join_learn_process is True:
            p.join()
            if p.exitcode != 0:
                raise Exception('Learning process failed!')

    def predict(self, name, when_data=None, kwargs={}):
        if name not in self.predictor_cache:
            # Clear the cache entirely if we have less than 1.2 GB left
            if psutil.virtual_memory().available < 1.2 * pow(10, 9):
                self.predictor_cache = {}

            if F.get_model_data(name)['status'] == 'complete':
                self.predictor_cache[name] = {
                    'predictor': mindsdb_native.Predictor(name=name, run_env={'trigger': 'mindsdb'}),
                    'created': datetime.datetime.now()
                }

        predictions = self.predictor_cache[name]['predictor'].predict(when_data=when_data, **kwargs)
        return predictions

    def analyse_dataset(self, ds):
        return F.analyse_dataset(ds)

    def get_model_data(self, name, db_fix=True):
        model = F.get_model_data(name)

        # Make some corrections for databases not to break when dealing with empty columns
        if db_fix:
            data_analysis = model['data_analysis_v2']
            for column in data_analysis['columns']:
                analysis = data_analysis.get(column)
                if isinstance(analysis, dict) and (len(analysis) == 0 or analysis.get('empty', {}).get('is_empty', False)):
                    data_analysis[column]['typing'] = {
                        'data_subtype': DATA_SUBTYPES.INT
                    }

        return model

    def get_models(self):
        models = []
        predictors = [
            x for x in Path(self.config.paths['predictors']).iterdir()
            if x.is_dir()
            and x.joinpath('light_model_metadata.pickle').is_file()
            and x.joinpath('heavy_model_metadata.pickle').is_file()
        ]
        for p in predictors:
            model_name = p.name
            try:
                model_data = self.get_model_data(model_name, db_fix=False)
                if model_data['status'] == 'training' and parse_datetime(model_data['created_at']) < parse_datetime(self.config['mindsdb_last_started_at']):
                    continue

                reduced_model_data = {}
                for k in ['name', 'version', 'is_active', 'predict', 'status', 'current_phase', 'accuracy', 'data_source']:
                    reduced_model_data[k] = model_data.get(k, None)

                for k in ['train_end_at', 'updated_at', 'created_at']:
                    reduced_model_data[k] = model_data.get(k, None)
                    if reduced_model_data[k] is not None:
                        try:
                            reduced_model_data[k] = parse_datetime(str(reduced_model_data[k]).split('.')[0])
                        except Exception as e:
                            # @TODO Does this ever happen?
                            print(f'Date parsing exception while parsing: {k} in get_models: ', e)
                            reduced_model_data[k] = parse_datetime(str(reduced_model_data[k]))

                models.append(reduced_model_data)
            except Exception as e:
                print(f"Can't list data for model: '{model_name}' when calling `get_models()`, error: {e}")

        return models

    def delete_model(self, name):
        F.delete_model(name)
        self.dbw.unregister_predictor(name)

    def rename_model(self, name, new_name):
        # bug fix: unregister by name (the model data dict was being passed in)
        self.dbw.unregister_predictor(name)
        F.rename_model(name, new_name)
        # register_predictors expects a list of model data dicts
        self.dbw.register_predictors([self.get_model_data(new_name)])

    def load_model(self, fpath):
        name = F.import_model(model_archive_path=fpath)
        self.dbw.register_predictors([self.get_model_data(name)], setup=False)

    def export_model(self, name):
        F.export_predictor(model_name=name)
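# Usage sketch (hypothetical config and data source; assumes LearnProcess and the
# mindsdb_native functional API `F` are importable in this module):
#
#   native = NativeInterface(config)
#   native.learn('home_rentals', file_ds, 'rental_price', {'join_learn_process': True})
#   preds = native.predict('home_rentals', when_data={'sqft': 1200})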
class MindsdbNative():
    def __init__(self, config):
        self.config = config
        self.dbw = DatabaseWrapper(self.config)

    def _setup_for_creation(self, name):
        predictor_dir = Path(self.config.paths['predictors']).joinpath(name)
        create_directory(predictor_dir)
        versions_file_path = predictor_dir.joinpath('versions.json')
        with open(str(versions_file_path), 'wt') as f:
            json.dump(self.config.versions, f, indent=4, sort_keys=True)

    def create(self, name):
        self._setup_for_creation(name)
        predictor = mindsdb_native.Predictor(name=name, run_env={'trigger': 'mindsdb'})
        return predictor

    def learn(self, name, from_data, to_predict, kwargs={}):
        join_learn_process = kwargs.get('join_learn_process', False)
        if 'join_learn_process' in kwargs:
            del kwargs['join_learn_process']

        self._setup_for_creation(name)

        p = PredictorProcess(name, from_data, to_predict, kwargs, self.config.get_all(), 'learn')
        p.start()
        if join_learn_process is True:
            p.join()
            if p.exitcode != 0:
                raise Exception('Learning process failed!')

    def predict(self, name, when_data=None, kwargs={}):
        # @TODO Separate into two paths, one for "normal" predictions and one for "real time"
        # predictions. Use the multiprocessing code commented out below for normal predictions
        # (once we figure out how to return the prediction object), else use the inline code
        # but with the "real time" predict functionality of mindsdb_native that will be
        # implemented later.
        '''
        from_data = when if when is not None else when_data
        p = PredictorProcess(name, from_data, to_predict=None, kwargs=kwargs, config=self.config.get_all(), 'predict')
        p.start()
        predictions = p.join()
        '''
        mdb = mindsdb_native.Predictor(name=name, run_env={'trigger': 'mindsdb'})
        predictions = mdb.predict(when_data=when_data, **kwargs)
        return predictions

    def analyse_dataset(self, ds):
        return F.analyse_dataset(ds)

    def get_model_data(self, name, native_view=False):
        model = F.get_model_data(name)
        if native_view:
            return model

        # Make some corrections for databases not to break when dealing with empty columns
        data_analysis = model['data_analysis_v2']
        for column in data_analysis['columns']:
            if len(data_analysis[column]) == 0 or data_analysis[column].get('empty', {}).get('is_empty', False):
                data_analysis[column]['typing'] = {
                    'data_subtype': DATA_SUBTYPES.INT
                }

        return model

    def get_models(self, status='any'):
        models = F.get_models()
        if status != 'any':
            models = [x for x in models if x['status'] == status]
        # Hide 'training' models left over from before the last server restart
        models = [
            x for x in models
            if x['status'] != 'training'
            or parse_datetime(x['created_at']) > parse_datetime(self.config['mindsdb_last_started_at'])
        ]

        for i in range(len(models)):
            for k in ['train_end_at', 'updated_at', 'created_at']:
                if k in models[i] and models[i][k] is not None:
                    try:
                        models[i][k] = parse_datetime(str(models[i][k]).split('.')[0])
                    except Exception:
                        models[i][k] = parse_datetime(str(models[i][k]))
        return models

    def delete_model(self, name):
        F.delete_model(name)
        self.dbw.unregister_predictor(name)

    def rename_model(self, name, new_name):
        # bug fix: unregister by name (the model data dict was being passed in)
        self.dbw.unregister_predictor(name)
        F.rename_model(name, new_name)
        # register_predictors expects a list of model data dicts
        self.dbw.register_predictors([self.get_model_data(new_name)], setup=False)

    def load_model(self, fpath):
        F.import_model(model_archive_path=fpath)
        # @TODO How do we figure out the name here?
        # dbw.register_predictors(...)

    def export_model(self, name):
        F.export_predictor(model_name=name)
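# Usage sketch (hypothetical): training runs in a separate PredictorProcess so a
# crashing learn job cannot take down the server; pass join_learn_process=True to
# block until it finishes and surface a non-zero exit code as an exception.
#
#   mdb = MindsdbNative(config)
#   mdb.learn('home_rentals', file_ds, 'rental_price', {'join_learn_process': True})
#   completed = mdb.get_models(status='complete')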
class MindsdbNative():
    def __init__(self, config):
        self.config = config
        self.dbw = DatabaseWrapper(self.config)

    def learn(self, name, from_data, to_predict, kwargs={}):
        join_learn_process = kwargs.get('join_learn_process', False)
        if 'join_learn_process' in kwargs:
            del kwargs['join_learn_process']

        p = PredictorProcess(name, from_data, to_predict, kwargs, self.config.get_all(), 'learn')
        p.start()
        if join_learn_process is True:
            p.join()
            if p.exitcode != 0:
                raise Exception('Learning process failed!')

    def predict(self, name, when_data=None, kwargs={}):
        # @TODO Separate into two paths, one for "normal" predictions and one for "real time"
        # predictions. Use the multiprocessing code commented out below for normal predictions
        # (once we figure out how to return the prediction object), else use the inline code
        # but with the "real time" predict functionality of mindsdb_native that will be
        # implemented later.
        '''
        from_data = when if when is not None else when_data
        p = PredictorProcess(name, from_data, to_predict=None, kwargs=kwargs, config=self.config.get_all(), 'predict')
        p.start()
        predictions = p.join()
        '''
        mdb = mindsdb_native.Predictor(name=name)
        predictions = mdb.predict(
            when_data=when_data,
            # Only run the expensive per-row confidence analysis for single-row inputs
            run_confidence_variation_analysis=isinstance(when_data, list) is False or len(when_data) == 1,
            **kwargs
        )
        return predictions

    def analyse_dataset(self, ds):
        return F.analyse_dataset(ds)

    def get_model_data(self, name):
        return F.get_model_data(name)

    def get_models(self, status='any'):
        models = F.get_models()
        if status != 'any':
            models = [x for x in models if x['status'] == status]

        for i in range(len(models)):
            for k in ['train_end_at', 'updated_at', 'created_at']:
                if k in models[i] and models[i][k] is not None:
                    try:
                        models[i][k] = parse_datetime(str(models[i][k]).split('.')[0])
                    except Exception:
                        models[i][k] = parse_datetime(str(models[i][k]))
        return models

    def delete_model(self, name):
        F.delete_model(name)
        self.dbw.unregister_predictor(name)

    def rename_model(self, name, new_name):
        self.dbw.unregister_predictor(name)
        F.rename_model(name, new_name)
        # bug fix: register_predictors expects a list of model data dicts, not a bare name
        self.dbw.register_predictors([self.get_model_data(new_name)])

    def load_model(self, fpath):
        F.import_model(model_archive_path=fpath)
        # @TODO How do we figure out the name here?
        # dbw.register_predictor(...)

    def export_model(self, name):
        F.export_predictor(model_name=name)
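# Usage sketch (hypothetical): this older variant of the class differs mainly in
# enabling run_confidence_variation_analysis for single-row inputs.
#
#   mdb = MindsdbNative(config)
#   preds = mdb.predict('home_rentals', when_data={'sqft': 1200})  # single row -> analysis on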
class ModelController():
    def __init__(self, ray_based):
        self.config = Config()
        self.fs_store = FsSotre()
        self.company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        self.dbw = DatabaseWrapper()
        self.predictor_cache = {}
        self.ray_based = ray_based

    def _pack(self, obj):
        if self.ray_based:
            return obj
        return xmlrpc.client.Binary(pickle.dumps(obj))

    def _invalidate_cached_predictors(self):
        from mindsdb_datasources import FileDS, ClickhouseDS, MariaDS, MySqlDS, PostgresDS, MSSQLDS, MongoDS, SnowflakeDS, AthenaDS
        import mindsdb_native
        from mindsdb_native import F
        from mindsdb_native.libs.constants.mindsdb import DATA_SUBTYPES
        from mindsdb.interfaces.storage.db import session, Predictor

        # @TODO: The cache can become stale if this interface is not invoked while a bunch of
        # predictors remain cached, no matter where we invoke it. In practice this shouldn't
        # be a big issue though.
        for predictor_name in list(self.predictor_cache.keys()):
            if (datetime.datetime.now() - self.predictor_cache[predictor_name]['created']).total_seconds() > 1200:
                del self.predictor_cache[predictor_name]

    def _lock_predictor(self, id, mode='write'):
        from mindsdb.interfaces.storage.db import session, Semaphor

        while True:
            semaphor_record = session.query(Semaphor).filter_by(company_id=self.company_id, entity_id=id, entity_type='predictor').first()
            if semaphor_record is not None:
                if mode == 'read' and semaphor_record.action == 'read':
                    return True
            try:
                semaphor_record = Semaphor(company_id=self.company_id, entity_id=id, entity_type='predictor', action=mode)
                session.add(semaphor_record)
                session.commit()
                return True
            except Exception:  # fixed typo: was `Excpetion`
                pass
            time.sleep(1)

    def _unlock_predictor(self, id):
        from mindsdb.interfaces.storage.db import session, Semaphor
        semaphor_record = session.query(Semaphor).filter_by(company_id=self.company_id, entity_id=id, entity_type='predictor').first()
        if semaphor_record is not None:
            session.delete(semaphor_record)
            session.commit()

    @contextmanager
    def _lock_context(self, id, mode='write'):
        try:
            self._lock_predictor(id, mode)  # bug fix: the `id` argument was missing
            yield True
        finally:
            self._unlock_predictor(id)

    def _setup_for_creation(self, name):
        from mindsdb_datasources import FileDS, ClickhouseDS, MariaDS, MySqlDS, PostgresDS, MSSQLDS, MongoDS, SnowflakeDS, AthenaDS
        import mindsdb_native
        from mindsdb_native import F
        from mindsdb_native.libs.constants.mindsdb import DATA_SUBTYPES
        from mindsdb.interfaces.storage.db import session, Predictor

        if name in self.predictor_cache:
            del self.predictor_cache[name]
        # Here for no particular reason, because we want to run this sometimes but not too often
        self._invalidate_cached_predictors()

        predictor_dir = Path(self.config.paths['predictors']).joinpath(name)
        create_directory(predictor_dir)
        predictor_record = Predictor(company_id=self.company_id, name=name, is_custom=False)

        session.add(predictor_record)
        session.commit()

    def _try_outdate_db_status(self, predictor_record):
        from mindsdb_native import __version__ as native_version
        from mindsdb import __version__ as mindsdb_version
        from mindsdb.interfaces.storage.db import session

        if predictor_record.update_status == 'update_failed':
            return predictor_record

        if predictor_record.native_version != native_version:
            predictor_record.update_status = 'available'
        if predictor_record.mindsdb_version != mindsdb_version:
            predictor_record.update_status = 'available'

        session.commit()
        return predictor_record

    def _update_db_status(self, predictor_record):
        from mindsdb_native import __version__ as native_version
        from mindsdb import __version__ as mindsdb_version
        from mindsdb.interfaces.storage.db import session

        predictor_record.native_version = native_version
        predictor_record.mindsdb_version = mindsdb_version
        predictor_record.update_status = 'up_to_date'
        session.commit()
        return predictor_record

    def create(self, name):
        from mindsdb_datasources import FileDS, ClickhouseDS, MariaDS, MySqlDS, PostgresDS, MSSQLDS, MongoDS, SnowflakeDS, AthenaDS
        import mindsdb_native
        from mindsdb_native import F
        from mindsdb_native.libs.constants.mindsdb import DATA_SUBTYPES
        from mindsdb.interfaces.storage.db import session, Predictor

        self._setup_for_creation(name)
        predictor = mindsdb_native.Predictor(name=name, run_env={'trigger': 'mindsdb'})
        return predictor

    def learn(self, name, from_data, to_predict, datasource_id, kwargs={}):
        from mindsdb.interfaces.model.learn_process import LearnProcess, run_learn

        create_process_mark('learn')
        join_learn_process = kwargs.get('join_learn_process', False)
        if 'join_learn_process' in kwargs:
            del kwargs['join_learn_process']

        self._setup_for_creation(name)

        if self.ray_based:
            run_learn(name, from_data, to_predict, kwargs, datasource_id)
        else:
            p = LearnProcess(name, from_data, to_predict, kwargs, datasource_id)
            p.start()
            if join_learn_process is True:
                p.join()
                if p.exitcode != 0:
                    delete_process_mark('learn')
                    raise Exception('Learning process failed!')

        delete_process_mark('learn')
        return 0

    def predict(self, name, pred_format, when_data=None, kwargs={}):
        import mindsdb_datasources  # needed for the getattr(mindsdb_datasources, ...) lookup below
        from mindsdb_datasources import FileDS, ClickhouseDS, MariaDS, MySqlDS, PostgresDS, MSSQLDS, MongoDS, SnowflakeDS, AthenaDS
        import mindsdb_native
        from mindsdb.interfaces.storage.db import session, Predictor

        create_process_mark('predict')
        if name not in self.predictor_cache:
            # Clear the cache entirely if we have less than 1.2 GB left
            if psutil.virtual_memory().available < 1.2 * pow(10, 9):
                self.predictor_cache = {}

            predictor_record = Predictor.query.filter_by(company_id=self.company_id, name=name, is_custom=False).first()
            if predictor_record.data['status'] == 'complete':
                self.fs_store.get(name, f'predictor_{self.company_id}_{predictor_record.id}', self.config['paths']['predictors'])
                self.predictor_cache[name] = {
                    'predictor': mindsdb_native.Predictor(name=name, run_env={'trigger': 'mindsdb'}),
                    'created': datetime.datetime.now()
                }

        if isinstance(when_data, dict) and 'kwargs' in when_data and 'args' in when_data:
            data_source = getattr(mindsdb_datasources, when_data['class'])(*when_data['args'], **when_data['kwargs'])
        else:
            # @TODO: Replace with Datasource
            try:
                data_source = pd.DataFrame(when_data)
            except Exception:
                data_source = when_data

        predictions = self.predictor_cache[name]['predictor'].predict(
            when_data=data_source,  # bug fix: data_source was built but the raw when_data was passed
            **kwargs
        )
        if pred_format == 'explain' or pred_format == 'new_explain':
            predictions = [p.explain() for p in predictions]
        elif pred_format == 'dict':
            predictions = [p.as_dict() for p in predictions]
        elif pred_format == 'dict&explain':
            predictions = [[p.as_dict() for p in predictions], [p.explain() for p in predictions]]
        else:
            delete_process_mark('predict')
            raise Exception(f'Unknown predictions format: {pred_format}')

        delete_process_mark('predict')
        return self._pack(predictions)

    def analyse_dataset(self, ds):
        from mindsdb_datasources import FileDS, ClickhouseDS, MariaDS, MySqlDS, PostgresDS, MSSQLDS, MongoDS, SnowflakeDS, AthenaDS
        from mindsdb_native import F

        create_process_mark('analyse')
        ds = eval(ds['class'])(*ds['args'], **ds['kwargs'])
        analysis = F.analyse_dataset(ds)
        delete_process_mark('analyse')
        return self._pack(analysis)

    def get_model_data(self, name, db_fix=True):
        import mindsdb_native  # needed for the mindsdb_native.F.get_model_data call below
        from mindsdb_native import F
        from mindsdb_native.libs.constants.mindsdb import DATA_SUBTYPES
        from mindsdb.interfaces.storage.db import session, Predictor

        predictor_record = Predictor.query.filter_by(company_id=self.company_id, name=name, is_custom=False).first()
        predictor_record = self._try_outdate_db_status(predictor_record)
        model = predictor_record.data
        if model is None or model['status'] == 'training':
            try:
                self.fs_store.get(name, f'predictor_{self.company_id}_{predictor_record.id}', self.config['paths']['predictors'])
                new_model_data = mindsdb_native.F.get_model_data(name)
            except Exception:
                new_model_data = None

            if predictor_record.data is None or (new_model_data is not None and len(new_model_data) > len(predictor_record.data)):
                predictor_record.data = new_model_data
                model = new_model_data
                session.commit()

        # Make some corrections for databases not to break when dealing with empty columns
        if db_fix:
            data_analysis = model['data_analysis_v2']
            for column in model['columns']:
                analysis = data_analysis.get(column)
                if isinstance(analysis, dict) and (len(analysis) == 0 or analysis.get('empty', {}).get('is_empty', False)):
                    data_analysis[column]['typing'] = {
                        'data_subtype': DATA_SUBTYPES.INT
                    }

        model['created_at'] = str(parse_datetime(str(predictor_record.created_at).split('.')[0]))
        model['updated_at'] = str(parse_datetime(str(predictor_record.updated_at).split('.')[0]))
        model['predict'] = predictor_record.to_predict
        model['update'] = predictor_record.update_status
        return self._pack(model)

    def get_models(self):
        from mindsdb.interfaces.storage.db import session, Predictor

        models = []
        predictor_records = Predictor.query.filter_by(company_id=self.company_id, is_custom=False)
        predictor_names = [x.name for x in predictor_records]
        for model_name in predictor_names:
            try:
                if self.ray_based:
                    model_data = self.get_model_data(model_name, db_fix=False)
                else:
                    bin_data = self.get_model_data(model_name, db_fix=False)  # renamed from `bin` (shadowed the builtin)
                    model_data = pickle.loads(bin_data.data)

                reduced_model_data = {}
                for k in ['name', 'version', 'is_active', 'predict', 'status', 'current_phase', 'accuracy', 'data_source', 'update']:
                    reduced_model_data[k] = model_data.get(k, None)

                for k in ['train_end_at', 'updated_at', 'created_at']:
                    reduced_model_data[k] = model_data.get(k, None)
                    if reduced_model_data[k] is not None:
                        try:
                            reduced_model_data[k] = parse_datetime(str(reduced_model_data[k]).split('.')[0])
                        except Exception as e:
                            # @TODO Does this ever happen?
                            log.error(f'Date parsing exception while parsing: {k} in get_models: {e}')
                            reduced_model_data[k] = parse_datetime(str(reduced_model_data[k]))

                models.append(reduced_model_data)
            except Exception as e:
                log.error(f"Can't list data for model: '{model_name}' when calling `get_models()`, error: {e}")

        return self._pack(models)

    def delete_model(self, name):
        from mindsdb_native import F
        from mindsdb_native.libs.constants.mindsdb import DATA_SUBTYPES
        from mindsdb.interfaces.storage.db import session, Predictor

        predictor_record = Predictor.query.filter_by(company_id=self.company_id, name=name, is_custom=False).first()
        id = predictor_record.id
        session.delete(predictor_record)
        session.commit()

        F.delete_model(name)
        self.dbw.unregister_predictor(name)
        self.fs_store.delete(f'predictor_{self.company_id}_{id}')
        return 0

    def update_model(self, name):
        from mindsdb_native import F
        from mindsdb_worker.updater.update_model import update_model
        from mindsdb.interfaces.storage.db import session, Predictor
        from mindsdb.interfaces.datastore.datastore import DataStore

        try:
            predictor_record = Predictor.query.filter_by(company_id=self.company_id, name=name, is_custom=False).first()
            predictor_record.update_status = 'updating'
            session.commit()

            update_model(name, self.delete_model, F.delete_model, self.learn, self._lock_context, self.company_id, self.config['paths']['predictors'], predictor_record, self.fs_store, DataStore())

            predictor_record = self._update_db_status(predictor_record)
        except Exception as e:
            log.error(e)
            predictor_record.update_status = 'update_failed'
            session.commit()
            return str(e)
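# Usage sketch (hypothetical wiring; assumes the storage DB, fs_store bucket and
# process-mark helpers are configured). With ray_based=False every public method
# returns its payload pickled inside an xmlrpc.client.Binary, so callers unwrap it:
#
#   controller = ModelController(ray_based=False)
#   controller.learn('home_rentals', file_ds_dict, 'rental_price', datasource_id=1)
#   packed = controller.predict('home_rentals', 'dict', when_data={'sqft': 1200})
#   predictions = pickle.loads(packed.data)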
class CustomModels():
    def __init__(self, config):
        self.config = config
        self.dbw = DatabaseWrapper(self.config)
        self.storage_dir = os.path.join(config['storage_dir'], 'misc')
        os.makedirs(self.storage_dir, exist_ok=True)
        self.model_cache = {}
        self.mindsdb_native = MindsdbNative(self.config)

    def _dir(self, name):
        return str(os.path.join(self.storage_dir, 'custom_model_' + name))

    def _internal_load(self, name):
        # Caching (2 lines below), currently disabled due to multiprocessing cache invalidation issues
        #if name in self.model_cache:
        #    return self.model_cache[name]

        # "Proper" model loading (3 lines below), currently disabled due to pickling issues
        #spec = importlib.util.spec_from_file_location(name, self._dir(name) + '/model.py')
        #module = importlib.util.module_from_spec(spec)
        #spec.loader.exec_module(module)

        sys.path.insert(0, self._dir(name))
        module = __import__(name)

        try:
            model = module.Model.load(os.path.join(self._dir(name), 'model.pickle'))
        except Exception:
            model = module.Model()
            model.initialize_column_types()
            if hasattr(model, 'setup'):
                model.setup()

        self.model_cache[name] = model
        return model

    def learn(self, name, from_data, to_predict, kwargs={}):
        model_data = self.get_model_data(name)
        model_data['status'] = 'training'
        self.save_model_data(name, model_data)

        to_predict = to_predict if isinstance(to_predict, list) else [to_predict]
        data_source = getattr(mindsdb_native, from_data['class'])(*from_data['args'], **from_data['kwargs'])
        data_frame = data_source.df
        model = self._internal_load(name)
        model.to_predict = to_predict

        model_data = self.get_model_data(name)
        model_data['predict'] = model.to_predict
        self.save_model_data(name, model_data)

        data_analysis = self.mindsdb_native.analyse_dataset(data_source)['data_analysis_v2']

        model_data = self.get_model_data(name)
        model_data['data_analysis'] = data_analysis
        self.save_model_data(name, model_data)

        model.fit(data_frame, to_predict, data_analysis, kwargs)

        model.save(os.path.join(self._dir(name), 'model.pickle'))
        self.model_cache[name] = model

        model_data = self.get_model_data(name)
        model_data['status'] = 'completed'
        self.save_model_data(name, model_data)

        self.dbw.unregister_predictor(name)
        self.dbw.register_predictors([self.get_model_data(name)], setup=False)

    def predict(self, name, when_data=None, from_data=None, kwargs={}):
        if from_data is not None:
            data_source = getattr(mindsdb_native, from_data['class'])(*from_data['args'], **from_data['kwargs'])
            data_frame = data_source.df
        elif when_data is not None:
            if isinstance(when_data, dict):
                for k in when_data:
                    when_data[k] = [when_data[k]]
                data_frame = pd.DataFrame(when_data)
            else:
                data_frame = pd.DataFrame(when_data)

        model = self._internal_load(name)
        predictions = model.predict(data_frame, kwargs)

        pred_arr = []
        for i in range(len(predictions)):
            pred_arr.append({})
            for col in predictions.columns:
                pred_arr[-1][col] = {}
                pred_arr[-1][col]['predicted_value'] = predictions[col].iloc[i]

        return pred_arr

    def get_model_data(self, name):
        with open(os.path.join(self._dir(name), 'metadata.json'), 'r') as fp:
            return json.load(fp)

    def save_model_data(self, name, data):
        with open(os.path.join(self._dir(name), 'metadata.json'), 'w') as fp:
            json.dump(data, fp)

    def get_models(self, status='any'):
        models = []
        for model_dir in os.listdir(self.storage_dir):
            if 'custom_model_' in model_dir:
                name = model_dir.replace('custom_model_', '')
                try:
                    models.append(self.get_model_data(name))
                except Exception:
                    print(f'Model {name} not found!')

        return models

    def delete_model(self, name):
        shutil.rmtree(self._dir(name))
        self.dbw.unregister_predictor(name)

    def rename_model(self, name, new_name):
        self.dbw.unregister_predictor(name)
        shutil.move(self._dir(name), self._dir(new_name))
        # bug fix: a missing comma made the first argument a string concatenation
        shutil.move(os.path.join(self._dir(new_name), f'{name}.py'), os.path.join(self._dir(new_name), f'{new_name}.py'))
        self.dbw.register_predictors([self.get_model_data(new_name)], setup=False)

    def export_model(self, name):
        shutil.make_archive(base_name=name, format='zip', root_dir=self._dir(name))
        return str(self._dir(name)) + '.zip'

    def load_model(self, fpath, name, trained_status):
        shutil.unpack_archive(fpath, self._dir(name), 'zip')
        shutil.move(os.path.join(self._dir(name), 'model.py'), os.path.join(self._dir(name), f'{name}.py'))
        model = self._internal_load(name)
        model.to_predict = model.to_predict if isinstance(model.to_predict, list) else [model.to_predict]
        self.save_model_data(name, {
            'name': name,
            'data_analysis': model.column_type_map,
            'predict': model.to_predict,
            'status': trained_status,
            'is_custom': True
        })

        with open(os.path.join(self._dir(name), '__init__.py'), 'w') as fp:
            fp.write('')

        if trained_status == 'trained':
            self.dbw.register_predictors([self.get_model_data(name)], setup=False)
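# Usage sketch (hypothetical): this older variant keeps all model metadata in a
# per-model metadata.json under storage_dir instead of in the database.
#
#   custom_models = CustomModels(config)
#   custom_models.load_model('/tmp/my_model.zip', 'my_model', trained_status='untrained')
#   custom_models.learn('my_model', {'class': 'FileDS', 'args': ['data.csv'], 'kwargs': {}}, 'target')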
class NativeInterface():
    def __init__(self):
        self.config = Config()
        self.fs_store = FsSotre()
        self.company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        self.dbw = DatabaseWrapper()
        self.predictor_cache = {}

    def _invalidate_cached_predictors(self):
        # @TODO: The cache can become stale if this NativeInterface instance is not invoked
        # while a bunch of predictors remain cached, no matter where we invoke it.
        # In practice this shouldn't be a big issue though.
        for predictor_name in list(self.predictor_cache.keys()):
            if (datetime.datetime.now() - self.predictor_cache[predictor_name]['created']).total_seconds() > 1200:
                del self.predictor_cache[predictor_name]

    def _setup_for_creation(self, name):
        if name in self.predictor_cache:
            del self.predictor_cache[name]
        # Here for no particular reason, because we want to run this sometimes but not too often
        self._invalidate_cached_predictors()

        predictor_dir = Path(self.config.paths['predictors']).joinpath(name)
        create_directory(predictor_dir)
        predictor_record = Predictor(company_id=self.company_id, name=name, is_custom=False)

        session.add(predictor_record)
        session.commit()

    def create(self, name):
        self._setup_for_creation(name)
        predictor = mindsdb_native.Predictor(name=name, run_env={'trigger': 'mindsdb'})
        return predictor

    def learn(self, name, from_data, to_predict, datasource_id, kwargs={}):
        join_learn_process = kwargs.get('join_learn_process', False)
        if 'join_learn_process' in kwargs:
            del kwargs['join_learn_process']

        self._setup_for_creation(name)

        p = LearnProcess(name, from_data, to_predict, kwargs, datasource_id)
        p.start()
        if join_learn_process is True:
            p.join()
            if p.exitcode != 0:
                raise Exception('Learning process failed!')

    def predict(self, name, when_data=None, kwargs={}):
        try:
            original_process_title = setproctitle.getproctitle()
            setproctitle.setproctitle('mindsdb_native_process')
        except Exception:
            pass

        if name not in self.predictor_cache:
            # Clear the cache entirely if we have less than 1.2 GB left
            if psutil.virtual_memory().available < 1.2 * pow(10, 9):
                self.predictor_cache = {}

            predictor_record = Predictor.query.filter_by(company_id=self.company_id, name=name, is_custom=False).first()
            if predictor_record.data['status'] == 'complete':
                self.fs_store.get(name, f'predictor_{self.company_id}_{predictor_record.id}', self.config['paths']['predictors'])
                self.predictor_cache[name] = {
                    'predictor': mindsdb_native.Predictor(name=name, run_env={'trigger': 'mindsdb'}),
                    'created': datetime.datetime.now()
                }

        predictions = self.predictor_cache[name]['predictor'].predict(when_data=when_data, **kwargs)

        try:
            setproctitle.setproctitle(original_process_title)
        except Exception:
            pass

        return predictions

    # @TODO Move somewhere else to avoid circular import issues in the future
    def analyse_dataset(self, ds):
        return F.analyse_dataset(ds)

    def get_model_data(self, name, db_fix=True):
        predictor_record = Predictor.query.filter_by(company_id=self.company_id, name=name, is_custom=False).first()
        model = predictor_record.data
        if model is None or model['status'] == 'training':
            try:
                self.fs_store.get(name, f'predictor_{self.company_id}_{predictor_record.id}', self.config['paths']['predictors'])
                new_model_data = mindsdb_native.F.get_model_data(name)
            except Exception:
                new_model_data = None

            if predictor_record.data is None or (new_model_data is not None and len(new_model_data) > len(predictor_record.data)):
                predictor_record.data = new_model_data
                model = new_model_data
                session.commit()

        # Make some corrections for databases not to break when dealing with empty columns
        if db_fix:
            data_analysis = model['data_analysis_v2']
            for column in model['columns']:
                analysis = data_analysis.get(column)
                if isinstance(analysis, dict) and (len(analysis) == 0 or analysis.get('empty', {}).get('is_empty', False)):
                    data_analysis[column]['typing'] = {
                        'data_subtype': DATA_SUBTYPES.INT
                    }

        return model

    def get_models(self):
        models = []
        predictor_names = [
            x.name for x in Predictor.query.filter_by(company_id=self.company_id, is_custom=False)
        ]
        for model_name in predictor_names:
            try:
                model_data = self.get_model_data(model_name, db_fix=False)
                if model_data['status'] == 'training' and parse_datetime(model_data['created_at']) < parse_datetime(self.config['mindsdb_last_started_at']):
                    continue

                reduced_model_data = {}
                for k in ['name', 'version', 'is_active', 'predict', 'status', 'current_phase', 'accuracy', 'data_source']:
                    reduced_model_data[k] = model_data.get(k, None)

                for k in ['train_end_at', 'updated_at', 'created_at']:
                    reduced_model_data[k] = model_data.get(k, None)
                    if reduced_model_data[k] is not None:
                        try:
                            reduced_model_data[k] = parse_datetime(str(reduced_model_data[k]).split('.')[0])
                        except Exception as e:
                            # @TODO Does this ever happen?
                            print(f'Date parsing exception while parsing: {k} in get_models: ', e)
                            reduced_model_data[k] = parse_datetime(str(reduced_model_data[k]))

                models.append(reduced_model_data)
            except Exception as e:
                print(f"Can't list data for model: '{model_name}' when calling `get_models()`, error: {e}")

        return models

    def delete_model(self, name):
        predictor_record = Predictor.query.filter_by(company_id=self.company_id, name=name, is_custom=False).first()
        id = predictor_record.id
        session.delete(predictor_record)
        session.commit()
        F.delete_model(name)
        self.dbw.unregister_predictor(name)
        self.fs_store.delete(f'predictor_{self.company_id}_{id}')
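# Usage sketch (hypothetical; assumes Predictor records exist in the storage DB and
# fs_store mirrors trained predictors): predictors are pulled from fs_store on first
# use and cached in memory for 20 minutes, with the whole cache dropped when
# available RAM falls under 1.2 GB.
#
#   native = NativeInterface()
#   preds = native.predict('home_rentals', when_data={'sqft': 1200})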