def start(config, initial=False):
    """Build a ``Config`` from ``config`` and hand it to ``run_server``.

    :param config: value passed straight to the ``Config`` constructor.
    :param initial: when falsy, a warning is printed before the server is
        started; the server is started in either case.
    """
    started_needlessly = not initial
    if started_needlessly:
        print(
            '\n\nWarning, this process should not have been started... nothing is "wrong" but it needlessly ate away a tiny bit of precious compute !\n\n'
        )

    run_server(Config(config))
def start(verbose, no_studio):
    """Configure the Flask application and run the HTTP API server.

    The server implementation is selected with the ``MINDSDB_DEFAULT_SERVER``
    environment variable: ``waitress`` (default), ``flask`` or ``gunicorn``.
    Any other value is reported through the ``http`` log and nothing is started.

    :param verbose: accepted for interface compatibility; it is not read by
        this function.
    :param no_studio: when truthy, the static-files initialization thread is
        not started (and therefore not waited for).
    """
    config = Config()

    initialize_log(config, 'http', wrap_print=True)

    # start static initialization in a separate thread
    init_static_thread = None
    if not no_studio:
        init_static_thread = threading.Thread(target=initialize_static)
        init_static_thread.start()

    app, api = initialize_flask(config, init_static_thread, no_studio)
    Compress(app)
    initialize_interfaces(app)

    # A relative static path is resolved against the current working directory.
    static_root = config['paths']['static']
    if not os.path.isabs(static_root):
        static_root = os.path.join(os.getcwd(), static_root)
    static_root = Path(static_root)

    @app.route('/', defaults={'path': ''}, methods=['GET'])
    @app.route('/<path:path>', methods=['GET'])
    def root_index(path):
        """Serve a static file, falling back to ``index.html``."""
        if path.startswith('api/'):
            return {'message': 'wrong query'}, 400
        if static_root.joinpath(path).is_file():
            return send_from_directory(static_root, path)
        return send_from_directory(static_root, 'index.html')

    for namespace in (predictor_ns, datasource_ns, utils_ns, conf_ns, stream_ns):
        api.add_namespace(namespace)

    @api.errorhandler(Exception)
    def handle_exception(e):
        """Log the exception and turn it into a JSON error response."""
        get_log('http').error(f'http exception: {e}')
        # pass through HTTP errors
        if isinstance(e, HTTPException):
            return {'message': str(e)}, e.code, e.get_response().headers
        name = getattr(type(e), '__name__') or 'Unknown error'
        return {'message': f'{name}: {str(e)}'}, 500

    @app.teardown_appcontext
    def remove_session(*args, **kwargs):
        session.close()

    @app.before_request
    def before_request():
        """Attach the company id and company-scoped interfaces to the request."""
        company_id = request.headers.get('company-id')

        if company_id is not None:
            try:
                company_id = int(company_id)
            except (TypeError, ValueError) as e:
                # A malformed header is treated as if no company id was sent.
                get_log('http').error(
                    f'Could not parse company id: {company_id} | exception: {e}'
                )
                company_id = None

        request.company_id = company_id

        request.default_store = WithKWArgsWrapper(
            current_app.original_data_store,
            company_id=company_id
        )
        request.model_interface = WithKWArgsWrapper(
            current_app.original_model_interface,
            company_id=company_id
        )
        request.datasource_interface = WithKWArgsWrapper(
            current_app.original_datasource_interface,
            company_id=company_id
        )

    port = config['api']['http']['port']
    host = config['api']['http']['host']

    server = os.environ.get('MINDSDB_DEFAULT_SERVER', 'waitress')
    server_kind = server.lower()

    # waiting static initialization
    if not no_studio:
        init_static_thread.join()

    if server_kind == 'waitress':
        # An empty or 0.0.0.0 host is passed to waitress as the '*' wildcard.
        listen_host = '*' if host in ('', '0.0.0.0') else host
        size_limit = 1073741824 * 10
        serve(
            app,
            port=port,
            host=listen_host,
            max_request_body_size=size_limit,
            inbuf_overflow=size_limit
        )
    elif server_kind == 'flask':
        # that will 'disable access' log in console
        log = logging.getLogger('werkzeug')
        log.setLevel(logging.WARNING)

        app.run(debug=False, port=port, host=host)
    elif server_kind == 'gunicorn':
        try:
            from mindsdb.api.http.gunicorn_wrapper import StandaloneApplication
        except ImportError:
            print(
                "Gunicorn server is not available by default. If you wish to use it, please install 'gunicorn'"
            )
            return

        options = {
            'bind': f'{host}:{port}',
            'workers': min(max(mp.cpu_count(), 2), 3),
            'timeout': 600,
            'reuse_port': True,
            'threads': 4
        }
        StandaloneApplication(app, options).run()
    else:
        # Previously an unrecognised value made this function return silently.
        get_log('http').error(
            f"Unknown value of MINDSDB_DEFAULT_SERVER: '{server}'; HTTP API was not started"
        )
def add_db_integration(self, name, data, company_id=None):
    """Create an ``Integration`` record and store the files it refers to.

    ``data`` is modified in place: ``database_name`` and ``publish`` receive
    defaults when missing, and bundle/certificate entries are replaced by the
    bare file name before the record is saved.

    * ``cassandra`` / ``scylla`` with a non-empty ``secure_connect_bundle``:
      the bundle file is copied into the integration folder.
    * ``mysql`` / ``mariadb`` with ``ssl`` set to ``True``: each of ``ssl_ca``,
      ``ssl_cert`` and ``ssl_key`` may be a path to an existing file or the
      certificate text itself. Inline text is first written to a temporary
      directory, which is removed again before this method returns.

    :param name: integration name.
    :param data: integration settings (mutated, see above).
    :param company_id: stored on the record and used in the folder name.
    :raises Exception: if the bundle file is not accessible, or an ssl entry
        is neither an existing file nor a non-empty string.
    """
    if 'database_name' not in data:
        data['database_name'] = name
    if 'publish' not in data:
        data['publish'] = True

    integration_type = data.get('type')
    files = {}       # data key -> path of the file to copy next to the record
    temp_dir = None  # created lazily, only for inline certificates

    try:
        if integration_type in ('cassandra', 'scylla'):
            bundle_path = data.get('secure_connect_bundle')
            if self._is_not_empty_str(bundle_path):
                if not os.path.isfile(bundle_path):
                    raise Exception(f'Can not get access to file: {bundle_path}')
                files['secure_connect_bundle'] = bundle_path
                data['secure_connect_bundle'] = Path(bundle_path).name
        elif integration_type in ('mysql', 'mariadb') and data.get('ssl') is True:
            for key in ('ssl_ca', 'ssl_cert', 'ssl_key'):
                if key not in data:
                    continue
                value = data[key]
                try:
                    is_existing_file = os.path.isfile(value)
                except TypeError:
                    # e.g. None: report it through the validation error below
                    is_existing_file = False
                if not is_existing_file:
                    if self._is_not_empty_str(value) is False:
                        raise Exception(
                            "'ssl_ca', 'ssl_cert' and 'ssl_key' must be paths or inline certs"
                        )
                    if temp_dir is None:
                        temp_dir = tempfile.mkdtemp(prefix='integration_files_')
                    # Keep only the last path component of a caller-supplied
                    # name so the file is created directly inside temp_dir.
                    cert_file_name = (
                        Path(data.get(f'{key}_name', f'{key}.pem')).name
                        or f'{key}.pem'
                    )
                    cert_file_path = os.path.join(temp_dir, cert_file_name)
                    with open(cert_file_path, 'wt') as f:
                        f.write(value)
                    value = cert_file_path
                files[key] = value
                data[key] = Path(value).name

        integration_record = Integration(name=name, data=data, company_id=company_id)
        session.add(integration_record)
        session.commit()

        if files:
            # The record id is only known after the commit above.
            integrations_dir = Config()['paths']['integrations']
            folder_name = f'integration_files_{company_id}_{integration_record.id}'
            integration_dir = os.path.join(integrations_dir, folder_name)
            create_directory(integration_dir)
            for file_path in files.values():
                shutil.copyfile(
                    file_path,
                    os.path.join(integration_dir, Path(file_path).name)
                )
            FsStore().put(folder_name, integration_dir, integrations_dir)
    finally:
        # Inline certificates/keys must not linger in the temp directory.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
def initialize_interfaces(app):
    """Attach a data store, a model interface and the config object to ``app``.

    :param app: application object that receives the ``original_data_store``,
        ``original_model_interface`` and ``config_obj`` attributes.
    """
    app.original_data_store = DataStore()
    app.original_model_interface = ModelInterface()
    app.config_obj = Config()
def __init__(self, model_interface, data_store, integration_controller):
    """Store the collaborating interfaces and create a ``Config`` instance.

    :param model_interface: kept as ``self.model_interface``.
    :param data_store: kept as ``self.data_store``.
    :param integration_controller: kept as ``self.integration_controller``.
    """
    config = Config()
    self.config = config

    # Collaborators are stored as given, without copying or validation.
    self.model_interface = model_interface
    self.data_store = data_store
    self.integration_controller = integration_controller
else: root_storage_dir = get_or_create_data_dir() os.environ['MINDSDB_STORAGE_DIR'] = root_storage_dir if os.path.isdir(root_storage_dir) is False: os.makedirs(root_storage_dir) if 'storage_db' in user_config: os.environ['MINDSDB_DB_CON'] = user_config['storage_db'] elif os.environ.get('MINDSDB_DB_CON', '') == '': os.environ['MINDSDB_DB_CON'] = 'sqlite:///' + os.path.join( os.environ['MINDSDB_STORAGE_DIR'], 'mindsdb.sqlite3.db') + '?check_same_thread=False&timeout=30' from mindsdb.utilities.config import Config mindsdb_config = Config() create_dirs_recursive(mindsdb_config['paths']) os.environ['DEFAULT_LOG_LEVEL'] = os.environ.get('DEFAULT_LOG_LEVEL', 'ERROR') os.environ['LIGHTWOOD_LOG_LEVEL'] = os.environ.get('LIGHTWOOD_LOG_LEVEL', 'ERROR') os.environ['MINDSDB_STORAGE_PATH'] = mindsdb_config['paths']['predictors'] if telemetry_file_exists(mindsdb_config['storage_dir']): os.environ['CHECK_FOR_UPDATES'] = '0' print('\n x telemetry disabled! \n') elif os.getenv('CHECK_FOR_UPDATES', '1').lower() in [ '0', 'false', 'False' ] or mindsdb_config.get('cloud', False): disable_telemetry(mindsdb_config['storage_dir'])
from mindsdb.utilities.wizards import cli_config
from mindsdb.utilities.config import Config
from mindsdb.utilities.functions import args_parse

# Resolve the configuration and storage directories.
config_dir, storage_dir = get_or_create_dir_struct()

config_path = os.path.join(config_dir, 'config.json')
# No config file at the default location yet: run the config wizard with
# use_default=True (presumably writes a default config.json - TODO confirm).
if not os.path.exists(config_path):
    _ = cli_config(None, None, storage_dir, config_dir, use_default=True)

args = args_parse()
# A config path given in the parsed arguments replaces the default one.
if args.config is not None:
    config_path = args.config

# Any failure while loading the config is printed and ends the process with
# exit status 1.
try:
    config = Config(config_path)
except Exception as e:
    print(str(e))
    sys.exit(1)

# Make sure the directories named in the config exist.
paths = config.paths
create_directory(paths['datasources'])
create_directory(paths['predictors'])
create_directory(paths['static'])
create_directory(paths['tmp'])

# Set before mindsdb_native is imported below.
# NOTE(review): presumably mindsdb_native reads this variable - confirm.
os.environ['MINDSDB_STORAGE_PATH'] = paths['predictors']

# NOTE: the wildcard import brings every public name of mindsdb_native into
# this namespace.
from mindsdb_native import *
# Figure out how to add this as a module
import lightwood
def __init__(self, company_id):
    """Create the config and a company-scoped integration controller.

    :param company_id: stored on the instance and passed as the
        ``company_id`` keyword to ``WithKWArgsWrapper``.
    """
    self.config = Config()
    self.company_id = company_id

    controller = IntegrationController()
    self.integration_controller = WithKWArgsWrapper(controller, company_id=company_id)
class ModelController():
    """Create, train, query and manage predictors stored in the database."""

    config: Config
    fs_store: FsStore
    # cache key ('<company_id>@@@@@<name>') -> cached predictor entry
    predictor_cache: Dict[str, Dict[str, Union[Any]]]
    ray_based: bool

    def __init__(self, ray_based: bool) -> None:
        """
        :param ray_based: stored as ``self.ray_based``; not read by this class.
        """
        self.config = Config()
        self.fs_store = FsStore()
        self.predictor_cache = {}
        self.ray_based = ray_based

    def _invalidate_cached_predictors(self) -> None:
        """Drop cached predictors that were created more than 1200 seconds ago."""
        # @TODO: Cache will become stale if the respective ModelInterface is not invoked yet a bunch of predictors remained cached, no matter where we invoke it. In practice shouldn't be a big issue though
        for predictor_name in list(self.predictor_cache.keys()):
            age = datetime.datetime.now() - self.predictor_cache[predictor_name]['created']
            if age.total_seconds() > 1200:
                del self.predictor_cache[predictor_name]

    def _lock_predictor(self, id: int, mode: str) -> bool:
        """Block until a ``Semaphor`` row for the predictor could be stored.

        A ``read`` request returns immediately when the existing row is also a
        ``read`` lock. Otherwise inserting the row is retried once per second.

        :param id: predictor id.
        :param mode: lock action stored on the row (``'read'`` is special-cased).
        :returns: always ``True``.
        """
        from mindsdb.interfaces.storage.db import session, Semaphor

        while True:
            semaphor_record = session.query(Semaphor).filter_by(entity_id=id, entity_type='predictor').first()
            if semaphor_record is not None:
                if mode == 'read' and semaphor_record.action == 'read':
                    return True
            try:
                semaphor_record = Semaphor(entity_id=id, entity_type='predictor', action=mode)
                session.add(semaphor_record)
                session.commit()
                return True
            except Exception:
                # A failed commit leaves the session unusable until it is
                # rolled back; without this every retry would fail as well.
                session.rollback()
            time.sleep(1)

    def _unlock_predictor(self, id: int) -> None:
        """Delete the ``Semaphor`` row of the predictor, if there is one."""
        from mindsdb.interfaces.storage.db import session, Semaphor

        semaphor_record = session.query(Semaphor).filter_by(entity_id=id, entity_type='predictor').first()
        if semaphor_record is not None:
            session.delete(semaphor_record)
            session.commit()

    @contextmanager
    def _lock_context(self, id, mode: str):
        """Hold the predictor lock for the duration of the ``with`` block."""
        # Acquire outside the ``try``: if acquiring fails we must not release
        # a lock that belongs to somebody else.
        self._lock_predictor(id, mode)
        try:
            yield True
        finally:
            self._unlock_predictor(id)

    def _start_process(self, process, join: bool) -> None:
        """Start ``process``; when ``join`` is truthy wait for it and close it."""
        process.start()
        if join:
            process.join()
            if not IS_PY36:
                process.close()

    def _get_from_data_df(self, from_data: dict) -> DataFrame:
        """Instantiate the datasource described by ``from_data`` and return its ``df``.

        :param from_data: dict with ``class``, ``args`` and ``kwargs`` keys.
        """
        if from_data['class'] == 'QueryDS':
            ds = QueryDS(*from_data['args'], **from_data['kwargs'])
        else:
            ds_cls = getattr(mindsdb_datasources, from_data['class'])
            ds = ds_cls(*from_data['args'], **from_data['kwargs'])
        return ds.df

    def _unpack_old_args(
        self, from_data: dict, kwargs: dict, to_predict: Optional[Union[str, list]] = None
    ) -> Tuple[Optional[pd.DataFrame], dict, bool, dict]:
        """Translate legacy learn arguments into a problem definition.

        ``kwargs`` is modified in place: dotted keys (``'a.b'``) are expanded
        into nested dicts and ``join_learn_process`` is removed. When ``kwargs``
        is non-empty the returned problem definition is that same dict.

        :param from_data: datasource description, or ``None`` for no dataframe.
        :param kwargs: legacy learn arguments; ``None`` is treated as empty.
        :param to_predict: target column name, or a one-element list with it.
        :returns: ``(df, problem_definition, join_learn_process, json_ai_override)``.
        :raises Exception: if no target can be determined.
        """
        if kwargs is None:
            kwargs = {}
        problem_definition = kwargs or {}

        if isinstance(to_predict, str):
            problem_definition['target'] = to_predict
        elif isinstance(to_predict, list) and len(to_predict) == 1:
            problem_definition['target'] = to_predict[0]
        elif problem_definition.get('target') is None:
            raise Exception(
                f"Predict target must be 'str' or 'list' with 1 element. Got: {to_predict}"
            )

        # Expand 'a.b.c' keys into nested dicts. Keys created here never
        # contain a dot, so one pass over a snapshot of the keys is enough.
        for k in list(kwargs.keys()):
            if isinstance(k, str) and '.' in k:
                nks = k.split('.')
                obj = kwargs
                for nk in nks[:-1]:
                    if nk not in obj:
                        obj[nk] = {}
                    obj = obj[nk]
                obj[nks[-1]] = kwargs[k]
                del kwargs[k]

        join_learn_process = kwargs.pop('join_learn_process', False)

        # Adapt kwargs to problem definition
        if 'timeseries_settings' in kwargs:
            problem_definition['timeseries_settings'] = kwargs['timeseries_settings']

        if 'stop_training_in_x_seconds' in kwargs:
            problem_definition['time_aim'] = kwargs['stop_training_in_x_seconds']

        if kwargs.get('ignore_columns') is not None:
            problem_definition['ignore_features'] = kwargs['ignore_columns']

        # Arguments named like JsonAI fields are passed on as overrides.
        json_ai_keys = list(lightwood.JsonAI.__dict__['__annotations__'].keys())
        json_ai_override = {k: kwargs[k] for k in kwargs if k in json_ai_keys}

        if (
            problem_definition.get('ignore_features') is not None and isinstance(problem_definition['ignore_features'], list) is False
        ):
            problem_definition['ignore_features'] = [problem_definition['ignore_features']]

        df = self._get_from_data_df(from_data) if from_data is not None else None

        return df, problem_definition, join_learn_process, json_ai_override

    @mark_process(name='learn')
    def learn(self, name: str, from_data: dict, to_predict: str, dataset_id: int, kwargs: dict,
              company_id: int, delete_ds_on_fail: Optional[bool] = False) -> None:
        """Create a predictor record and start its learn process.

        When the problem definition has a ``url`` entry a custom (remotely
        served) predictor is registered instead, and a remote learn process is
        started only if a train url is given.

        :raises Exception: if a predictor with this name already exists.
        """
        predictor_record = db.session.query(db.Predictor).filter_by(company_id=company_id, name=name).first()
        if predictor_record is not None:
            raise Exception('Predictor name must be unique.')

        df, problem_definition, join_learn_process, json_ai_override = self._unpack_old_args(from_data, kwargs, to_predict)

        if 'url' in problem_definition:
            train_url = problem_definition['url'].get('train', None)
            predict_url = problem_definition['url'].get('predict', None)
            com_format = problem_definition['format']

            predictor_record = db.Predictor(
                company_id=company_id,
                name=name,
                dataset_id=dataset_id,
                mindsdb_version=mindsdb_version,
                lightwood_version=lightwood_version,
                to_predict=problem_definition['target'],
                learn_args=ProblemDefinition.from_dict(problem_definition).to_dict(),
                data={'name': name, 'train_url': train_url, 'predict_url': predict_url, 'format': com_format,
                      'status': 'complete' if train_url is None else 'training'},
                is_custom=True,
                # @TODO: For testing purposes, remove afterwards!
                dtype_dict=json_ai_override['dtype_dict'],
            )

            db.session.add(predictor_record)
            db.session.commit()
            if train_url is not None:
                p = LearnRemoteProcess(df, predictor_record.id)
                self._start_process(p, join_learn_process)
                db.session.refresh(predictor_record)
            return

        problem_definition = ProblemDefinition.from_dict(problem_definition)

        predictor_record = db.Predictor(
            company_id=company_id,
            name=name,
            dataset_id=dataset_id,
            mindsdb_version=mindsdb_version,
            lightwood_version=lightwood_version,
            to_predict=problem_definition.target,
            learn_args=problem_definition.to_dict(),
            data={'name': name},
        )

        db.session.add(predictor_record)
        db.session.commit()
        predictor_id = predictor_record.id

        p = LearnProcess(df, problem_definition, predictor_id, delete_ds_on_fail, json_ai_override)
        self._start_process(p, join_learn_process)
        db.session.refresh(predictor_record)

    @mark_process(name='predict')
    def predict(self, name: str, when_data: Union[dict, list, pd.DataFrame], pred_format: str, company_id: int):
        """Run a prediction with the named predictor.

        :param when_data: a datasource description (dict with ``class``,
            ``args`` and ``kwargs``), a single row as dict, or anything
            ``pd.DataFrame`` accepts.
        :param pred_format: ``'explain'``, ``'dict'`` or ``'dict&explain'``
            select the legacy output shapes; any other value returns the
            prediction rows as a list of dicts.
        :raises Exception: if the predictor is not in status ``complete`` or a
            custom predictor has an unsupported format.
        """
        original_name = name
        name = f'{company_id}@@@@@{name}'

        predictor_record = db.session.query(db.Predictor).filter_by(company_id=company_id, name=original_name).first()
        assert predictor_record is not None
        predictor_data = self.get_model_data(name, company_id)

        if isinstance(when_data, dict) and 'kwargs' in when_data and 'args' in when_data:
            ds_cls = getattr(mindsdb_datasources, when_data['class'])
            df = ds_cls(*when_data['args'], **when_data['kwargs']).df
        else:
            if isinstance(when_data, dict):
                when_data = [when_data]
            df = pd.DataFrame(when_data)

        if predictor_record.is_custom:
            if predictor_data['format'] == 'mlflow':
                resp = requests.post(predictor_data['predict_url'],
                                     data=df.to_json(orient='records'),
                                     headers={'content-type': 'application/json; format=pandas-records'})
                answer: List[object] = resp.json()
                predictions = pd.DataFrame({'prediction': answer})
            elif predictor_data['format'] == 'ray_server':
                serialized_df = json.dumps(df.to_dict())
                resp = requests.post(predictor_data['predict_url'], json={'df': serialized_df})
                predictions = pd.DataFrame(resp.json())
            else:
                # Previously this fell through and failed later on the unbound
                # ``predictions`` variable.
                raise Exception(
                    f"Unsupported format of custom predictor {original_name}: {predictor_data['format']}"
                )
        else:
            fs_name = f'predictor_{company_id}_{predictor_record.id}'

            # The record changed since it was cached: reload it.
            if (
                name in self.predictor_cache
                and self.predictor_cache[name]['updated_at'] != predictor_record.updated_at
            ):
                del self.predictor_cache[name]

            if name not in self.predictor_cache:
                # Clear the cache entirely if we have less than 1.2 GB left
                if psutil.virtual_memory().available < 1.2 * pow(10, 9):
                    self.predictor_cache = {}

                if predictor_data['status'] == 'complete':
                    self.fs_store.get(fs_name, fs_name, self.config['paths']['predictors'])
                    self.predictor_cache[name] = {
                        'predictor': lightwood.predictor_from_state(
                            os.path.join(self.config['paths']['predictors'], fs_name),
                            predictor_record.code
                        ),
                        'updated_at': predictor_record.updated_at,
                        'created': datetime.datetime.now(),
                        'code': predictor_record.code,
                        'pickle': str(os.path.join(self.config['paths']['predictors'], fs_name))
                    }
                else:
                    raise Exception(
                        f'Trying to predict using predictor {original_name} with status: {predictor_data["status"]}. Error is: {predictor_data.get("error", "unknown")}'
                    )
            predictions = self.predictor_cache[name]['predictor'].predict(df)
            # Bellow is useful for debugging caching and storage issues
            # del self.predictor_cache[name]

        predictions = predictions.to_dict(orient='records')
        target = predictor_record.to_predict[0]
        if pred_format in ('explain', 'dict', 'dict&explain'):
            explain_arr = []
            dict_arr = []
            for i, row in enumerate(predictions):
                obj = {
                    target: {
                        'predicted_value': row['prediction'],
                        'confidence': row.get('confidence', None),
                        'anomaly': row.get('anomaly', None),
                        'truth': row.get('truth', None)
                    }
                }
                if 'lower' in row:
                    obj[target]['confidence_lower_bound'] = row.get('lower', None)
                    obj[target]['confidence_upper_bound'] = row.get('upper', None)

                explain_arr.append(obj)

                td = {'predicted_value': row['prediction']}
                for col in df.columns:
                    if col in row:
                        td[col] = row[col]
                    elif f'order_{col}' in row:
                        td[col] = row[f'order_{col}']
                    elif f'group_{col}' in row:
                        td[col] = row[f'group_{col}']
                    else:
                        # Fall back to the input row the prediction refers to.
                        orginal_index = row.get('original_index')
                        if orginal_index is None:
                            log.warning('original_index is None')
                            orginal_index = i
                        td[col] = df.iloc[orginal_index][col]
                dict_arr.append({target: td})
            if pred_format == 'explain':
                return explain_arr
            elif pred_format == 'dict':
                return dict_arr
            elif pred_format == 'dict&explain':
                return dict_arr, explain_arr
        # New format -- Try switching to this in 2-3 months for speed, for now above is ok
        else:
            return predictions

    @mark_process(name='analyse')
    def analyse_dataset(self, ds: dict, company_id: int) -> lightwood.DataAnalysis:
        """Load the datasource described by ``ds`` and return its analysis as a dict."""
        ds_cls = getattr(mindsdb_datasources, ds['class'])
        df = ds_cls(*ds['args'], **ds['kwargs']).df
        analysis = lightwood.analyze_dataset(df)
        return analysis.to_dict()  # type: ignore

    def get_model_data(self, name, company_id: int):
        """Return the stored data of a predictor, enriched with record fields.

        :param name: predictor name, optionally prefixed as
            ``'<company_id>@@@@@<name>'`` (the prefix is ignored).
        :returns: dict with the record's ``data`` plus ``status``, ``predict``,
            ``code``, ``json_ai`` and related fields.
        :raises AssertionError: if the name contains more than one ``@@@@@``
            separator or the predictor does not exist.
        """
        if '@@@@@' in name:
            sn = name.split('@@@@@')
            # security: explicit raise so the check also runs under ``python -O``
            if len(sn) >= 3:
                raise AssertionError('Invalid predictor name')
            name = sn[1]

        predictor_record = db.session.query(db.Predictor).filter_by(company_id=company_id, name=name).first()
        assert predictor_record is not None

        linked_dataset = db.session.query(db.Dataset).get(predictor_record.dataset_id)

        # ``data`` may be None on the record; start from an empty dict then
        # instead of failing on the first item assignment.
        record_data = predictor_record.data
        data = deepcopy(record_data) if record_data is not None else {}
        data['dtype_dict'] = predictor_record.dtype_dict
        data['created_at'] = str(parse_datetime(str(predictor_record.created_at).split('.')[0]))
        data['updated_at'] = str(parse_datetime(str(predictor_record.updated_at).split('.')[0]))
        data['predict'] = predictor_record.to_predict[0]
        data['update'] = predictor_record.update_status
        data['mindsdb_version'] = predictor_record.mindsdb_version
        data['name'] = predictor_record.name
        data['code'] = predictor_record.code
        data['json_ai'] = predictor_record.json_ai
        data['data_source_name'] = linked_dataset.name if linked_dataset else None
        data['problem_definition'] = predictor_record.learn_args

        # assume older models are complete, only temporary
        if record_data is not None and 'status' in record_data:
            data['status'] = record_data['status']
        elif record_data is not None and 'error' in record_data:
            data['status'] = 'error'
        elif predictor_record.update_status == 'available':
            data['status'] = 'complete'
        elif predictor_record.json_ai is None and predictor_record.code is None:
            data['status'] = 'generating'
        elif record_data is None:
            data['status'] = 'editable'
        elif 'training_log' in record_data:
            data['status'] = 'training'
        else:
            # 'error' in record_data was already handled above
            data['status'] = 'complete'

        if data.get('accuracies', None) is not None:
            if len(data['accuracies']) > 0:
                data['accuracy'] = float(np.mean(list(data['accuracies'].values())))
        return data

    def get_model_description(self, name: str, company_id: int):
        """
        Similar to `get_model_data` but meant to be seen directly by the user, rather than parsed by something like the Studio predictor view.

        Uses `get_model_data` to compose this, but in the future we might want to make this independent if we deprecate `get_model_data`

        :returns: Dictionary of the analysis (meant to be formatted by the APIs and displayed as json/yml/whatever)
        """ # noqa
        model_description = {}
        model_data = self.get_model_data(name, company_id)

        model_description['accuracies'] = model_data['accuracies']
        model_description['column_importances'] = model_data['column_importances']
        model_description['outputs'] = [model_data['predict']]
        model_description['inputs'] = [col for col in model_data['dtype_dict'] if col not in model_description['outputs']]
        model_description['datasource'] = model_data['data_source_name']
        model_description['model'] = ' --> '.join(str(k) for k in model_data['json_ai'])

        return model_description

    def get_models(self, company_id: int):
        """Return a reduced description of every predictor of the company."""
        models = []
        for db_p in db.session.query(db.Predictor).filter_by(company_id=company_id):
            model_data = self.get_model_data(db_p.name, company_id=company_id)
            reduced_model_data = {}

            for k in ['name', 'version', 'is_active', 'predict', 'status',
                      'current_phase', 'accuracy', 'data_source', 'update',
                      'data_source_name', 'mindsdb_version', 'error']:
                reduced_model_data[k] = model_data.get(k, None)

            for k in ['train_end_at', 'updated_at', 'created_at']:
                reduced_model_data[k] = model_data.get(k, None)
                if reduced_model_data[k] is not None:
                    try:
                        reduced_model_data[k] = parse_datetime(str(reduced_model_data[k]).split('.')[0])
                    except Exception as e:
                        # @TODO Does this ever happen
                        # The exception is part of the message: passing it as
                        # an extra logging argument without a placeholder
                        # breaks message formatting.
                        log.error(f'Date parsing exception while parsing: {k} in get_models: {e}')
                        reduced_model_data[k] = parse_datetime(str(reduced_model_data[k]))

            models.append(reduced_model_data)
        return models

    def delete_model(self, name, company_id: int):
        """Delete a predictor, its non-file dataset and its stored files.

        :returns: ``0``.
        :raises Exception: if the predictor does not exist.
        """
        original_name = name
        name = f'{company_id}@@@@@{name}'

        db_p = db.session.query(db.Predictor).filter_by(company_id=company_id, name=original_name).first()
        if db_p is None:
            raise Exception(f"Predictor '{name}' does not exist")

        # Read what is needed later before the row is deleted, so nothing
        # below depends on the state of a deleted instance.
        predictor_id = db_p.id
        dataset_id = db_p.dataset_id

        db.session.delete(db_p)
        if dataset_id is not None:
            try:
                dataset_record = db.Datasource.query.get(dataset_id)
                if (
                    isinstance(dataset_record.data, str)
                    and json.loads(dataset_record.data).get('source_type') != 'file'
                ):
                    DataStore().delete_datasource(dataset_record.name, company_id)
            except Exception as e:
                # Best effort: the predictor is deleted regardless, but the
                # failure is no longer invisible.
                log.warning(f'Could not delete dataset {dataset_id} of predictor {original_name}: {e}')
        db.session.commit()

        DatabaseWrapper(company_id).unregister_predictor(name)

        # delete from s3
        self.fs_store.delete(f'predictor_{company_id}_{predictor_id}')

        return 0

    def rename_model(self, old_name, new_name, company_id: int):
        """Rename a predictor and re-register it under the new name.

        :raises Exception: if the predictor does not exist.
        """
        db_p = db.session.query(db.Predictor).filter_by(company_id=company_id, name=old_name).first()
        if db_p is None:
            raise Exception(f"Predictor '{old_name}' does not exist")
        db_p.name = new_name
        db.session.commit()
        dbw = DatabaseWrapper(company_id)
        dbw.unregister_predictor(old_name)
        dbw.register_predictors([self.get_model_data(new_name, company_id)])

    @mark_process(name='learn')
    def update_model(self, name: str, company_id: int):
        """Mark the predictor as ``updating`` and start its update process."""
        # TODO: Add version check here once we're done debugging
        predictor_record = db.session.query(db.Predictor).filter_by(company_id=company_id, name=name).first()
        assert predictor_record is not None
        predictor_record.update_status = 'updating'
        db.session.commit()
        p = UpdateProcess(name, company_id)
        p.start()
        return 'Updated in progress'

    @mark_process(name='learn')
    def generate_predictor(self, name: str, from_data: dict, dataset_id, problem_definition_dict: dict,
                           join_learn_process: bool, company_id: int):
        """Create a predictor record and start its generate process.

        :raises Exception: if a predictor with this name already exists.
        """
        predictor_record = db.session.query(db.Predictor).filter_by(company_id=company_id, name=name).first()
        if predictor_record is not None:
            raise Exception('Predictor name must be unique.')

        df, problem_definition, _, _ = self._unpack_old_args(from_data, problem_definition_dict)

        problem_definition = ProblemDefinition.from_dict(problem_definition)

        predictor_record = db.Predictor(
            company_id=company_id,
            name=name,
            dataset_id=dataset_id,
            mindsdb_version=mindsdb_version,
            lightwood_version=lightwood_version,
            to_predict=problem_definition.target,
            learn_args=problem_definition.to_dict(),
            data={'name': name}
        )

        db.session.add(predictor_record)
        db.session.commit()
        predictor_id = predictor_record.id

        p = GenerateProcess(df, problem_definition, predictor_id)
        self._start_process(p, join_learn_process)
        db.session.refresh(predictor_record)

    def edit_json_ai(self, name: str, json_ai: dict, company_id=None):
        """Replace the predictor's JsonAI and regenerate its code from it."""
        predictor_record = db.session.query(db.Predictor).filter_by(company_id=company_id, name=name).first()
        assert predictor_record is not None

        json_ai = lightwood.JsonAI.from_dict(json_ai)
        predictor_record.code = lightwood.code_from_json_ai(json_ai)
        predictor_record.json_ai = json_ai.to_dict()
        db.session.commit()

    def code_from_json_ai(self, json_ai: dict, company_id=None):
        """Return the predictor code generated from ``json_ai``."""
        json_ai = lightwood.JsonAI.from_dict(json_ai)
        code = lightwood.code_from_json_ai(json_ai)
        return code

    def edit_code(self, name: str, code: str, company_id=None):
        """Edit an existing predictor's code"""
        if self.config.get('cloud', False):
            raise Exception('Code editing prohibited on cloud')

        predictor_record = db.session.query(db.Predictor).filter_by(company_id=company_id, name=name).first()
        assert predictor_record is not None

        # Called before saving; its result is not used.
        # NOTE(review): presumably this validates the code - confirm.
        lightwood.predictor_from_code(code)
        predictor_record.code = code
        predictor_record.json_ai = None
        db.session.commit()

    @mark_process(name='learn')
    def fit_predictor(self, name: str, from_data: dict, join_learn_process: bool, company_id: int) -> None:
        """Start the fit process of an existing predictor on ``from_data``."""
        predictor_record = db.session.query(db.Predictor).filter_by(company_id=company_id, name=name).first()
        assert predictor_record is not None

        df = self._get_from_data_df(from_data)
        p = FitProcess(predictor_record.id, df)
        self._start_process(p, join_learn_process)
def __init__(self):
    """Create the config, the file store and the model interface.

    ``self.dir`` is the ``datasources`` entry of the configured paths.
    """
    config = Config()
    self.config = config
    self.fs_store = FsStore()

    configured_paths = config['paths']
    self.dir = configured_paths['datasources']

    self.model_interface = ModelInterface()
sys.stdout.flush() process.terminate() process.join() sys.stdout.flush() if ray_based: os.system('ray stop --force') except KeyboardInterrupt: sys.exit(0) except psutil.NoSuchProcess: pass if __name__ == '__main__': mp.freeze_support() args = args_parse() config = Config() if args.verbose is True: # Figure this one out later pass os.environ['DEFAULT_LOG_LEVEL'] = config['log']['level']['console'] os.environ['LIGHTWOOD_LOG_LEVEL'] = config['log']['level']['console'] # Switch to this once the native interface has it's own thread :/ ctx = mp.get_context('spawn') from mindsdb.__about__ import __version__ as mindsdb_version print(f'Version {mindsdb_version}') print(f'Configuration file:\n {config.config_path}')
def __init__(self, config):
    """Create the config object and the interfaces used by this instance.

    :param config: accepted but not read here; ``self.config`` is a newly
        constructed ``Config()``.
    """
    # Each attribute gets its own freshly constructed object, in this order.
    self.config = Config()
    self.mindsdb_native = NativeInterface()
    self.custom_models = CustomModels()
    self.ai_table = AITable_store()
    self.default_store = DataStore()
def start(verbose, no_studio):
    """Configure the Flask application and run the HTTP API server.

    The server implementation is selected with the ``MINDSDB_DEFAULT_SERVER``
    environment variable: ``waitress`` (default), ``flask`` or ``gunicorn``.

    :param verbose: when truthy, the console log level is set to ``DEBUG``.
    :param no_studio: when truthy, the static-files initialization thread is
        not started (and therefore not waited for).
    """
    config = Config()
    if verbose:
        config.set(['log', 'level', 'console'], 'DEBUG')

    initialize_log(config, 'http', wrap_print=True)

    # start static initialization in a separate thread
    studio_enabled = not no_studio
    init_static_thread = None
    if studio_enabled:
        init_static_thread = threading.Thread(target=initialize_static, args=(config, ))
        init_static_thread.start()

    app, api = initialize_flask(config, init_static_thread, no_studio)
    initialize_interfaces(app)

    static_root = Path(config.paths['static'])

    @app.route('/', defaults={'path': ''}, methods=['GET'])
    @app.route('/<path:path>', methods=['GET'])
    def root_index(path):
        """Serve a static file, falling back to ``index.html``."""
        if path.startswith('api/'):
            return {'message': 'wrong query'}, 400
        file_to_send = path if static_root.joinpath(path).is_file() else 'index.html'
        return send_from_directory(config.paths['static'], file_to_send)

    for namespace in (predictor_ns, datasource_ns, utils_ns, conf_ns):
        api.add_namespace(namespace)

    @api.errorhandler(Exception)
    def handle_exception(e):
        """Log the exception and turn it into a JSON error response."""
        get_log('http').error(f'http exception: {e}')
        # pass through HTTP errors
        if isinstance(e, HTTPException):
            return {'message': str(e)}, e.code, e.get_response().headers
        name = type(e).__name__ or 'Unknown error'
        return {'message': f'{name}: {str(e)}'}, 500

    @app.teardown_appcontext
    def remove_session(*args, **kwargs):
        session.close()

    http_settings = config['api']['http']
    port = http_settings['port']
    host = http_settings['host']

    server_kind = os.environ.get('MINDSDB_DEFAULT_SERVER', 'waitress').lower()

    # waiting static initialization
    if studio_enabled:
        init_static_thread.join()

    if server_kind == 'waitress':
        # An empty or 0.0.0.0 host is passed to waitress as the '*' wildcard.
        listen_host = '*' if host in ('', '0.0.0.0') else host
        serve(app, port=port, host=listen_host)
    elif server_kind == 'flask':
        # that will 'disable access' log in console
        logging.getLogger('werkzeug').setLevel(logging.WARNING)

        app.run(debug=False, port=port, host=host)
    elif server_kind == 'gunicorn':
        try:
            from mindsdb.api.http.gunicorn_wrapper import StandaloneApplication
        except ImportError:
            print(
                "Gunicorn server is not available by default. If you wish to use it, please install 'gunicorn'"
            )
            return

        worker_count = min(max(multiprocessing.cpu_count(), 2), 3)
        options = {
            'bind': f'{host}:{port}',
            'workers': worker_count
        }
        StandaloneApplication(app, options).run()
class ModelController():
    """Create, train, query, inspect and delete predictors.

    Predictor metadata is read from and written to ``db.Predictor`` rows;
    predictor state is fetched through ``FsStore``; loaded predictors are
    kept in an in-memory cache keyed by ``'{company_id}@@@@@{name}'``.
    """

    config: Config
    fs_store: FsStore
    predictor_cache: Dict[str, Dict[str, Union[Any]]]
    ray_based: bool

    def __init__(self, ray_based: bool) -> None:
        """Set up config, file store and an empty predictor cache."""
        self.config = Config()
        self.fs_store = FsStore()
        self.predictor_cache = {}
        self.ray_based = ray_based

    def _invalidate_cached_predictors(self) -> None:
        """Drop cache entries that were created more than 1200 seconds ago."""
        # @TODO: Cache will become stale if the respective ModelInterface is
        # not invoked yet a bunch of predictors remained cached, no matter
        # where we invoke it. In practice shouldn't be a big issue though
        for predictor_name in list(self.predictor_cache.keys()):
            created = self.predictor_cache[predictor_name]['created']
            if (datetime.datetime.now() - created).total_seconds() > 1200:
                del self.predictor_cache[predictor_name]

    def _lock_predictor(self, id: int, mode: str) -> bool:
        """Block until a ``Semaphor`` row for predictor ``id`` is obtained.

        Returns ``True`` immediately when ``mode`` is ``'read'`` and the
        existing row's action is also ``'read'``. Otherwise it tries to
        insert a new row and retries once per second until the insert
        commits.
        """
        from mindsdb.interfaces.storage.db import session, Semaphor

        while True:
            semaphor_record = session.query(Semaphor).filter_by(
                entity_id=id, entity_type='predictor').first()
            if semaphor_record is not None:
                if mode == 'read' and semaphor_record.action == 'read':
                    return True
            try:
                semaphor_record = Semaphor(entity_id=id,
                                           entity_type='predictor',
                                           action=mode)
                session.add(semaphor_record)
                session.commit()
                return True
            except Exception:
                # NOTE(review): presumably the insert fails on a uniqueness
                # constraint while another holder exists -- confirm against
                # the Semaphor model. After a failed commit the session has
                # to be rolled back, otherwise the query at the top of the
                # next iteration fails on the inactive transaction.
                session.rollback()
            time.sleep(1)

    def _unlock_predictor(self, id: int) -> None:
        """Delete the first ``Semaphor`` row for predictor ``id``, if any."""
        from mindsdb.interfaces.storage.db import session, Semaphor

        semaphor_record = session.query(Semaphor).filter_by(
            entity_id=id, entity_type='predictor').first()
        if semaphor_record is not None:
            session.delete(semaphor_record)
            session.commit()

    @contextmanager
    def _lock_context(self, id, mode: str):
        """Context manager: lock predictor ``id`` on entry, unlock on exit."""
        try:
            self._lock_predictor(id, mode)
            yield True
        finally:
            self._unlock_predictor(id)

    def _unpack_old_args(
        self,
        from_data: dict,
        kwargs: dict,
        to_predict: Optional[Union[str, list]] = None
    ) -> Tuple[pd.DataFrame, dict, bool]:
        """Translate legacy learn arguments into the current inputs.

        Args:
            from_data: datasource description with ``'class'``, ``'args'``
                and ``'kwargs'`` keys; the class is looked up on
                ``mindsdb_datasources``.
            kwargs: legacy keyword arguments. Not modified.
            to_predict: target column name, or a list whose first element
                is used. When ``None``, ``kwargs`` itself (minus
                ``'join_learn_process'``) becomes the problem definition.

        Returns:
            ``(df, problem_definition, join_learn_process)`` where
            ``problem_definition`` is a plain dict.
        """
        # Work on a shallow copy so the caller's dict is left untouched.
        kwargs = dict(kwargs)
        join_learn_process = kwargs.pop('join_learn_process', False)

        if to_predict is not None:
            problem_definition = {
                'target': to_predict if isinstance(to_predict, str) else to_predict[0]
            }
        else:
            problem_definition = kwargs

        # Adapt kwargs to problem definition
        if 'timeseries_settings' in kwargs:
            problem_definition['timeseries_settings'] = kwargs['timeseries_settings']

        if 'stop_training_in_x_seconds' in kwargs:
            problem_definition['time_aim'] = kwargs['stop_training_in_x_seconds']

        ds_cls = getattr(mindsdb_datasources, from_data['class'])
        ds = ds_cls(*from_data['args'], **from_data['kwargs'])
        df = ds.df

        return df, problem_definition, join_learn_process

    @mark_process(name='learn')
    def learn(self, name: str, from_data: dict, to_predict: str,
              datasource_id: int, kwargs: dict, company_id: int) -> None:
        """Start a ``LearnProcess`` for predictor ``name``.

        Waits for the process only when ``kwargs`` carries a truthy
        ``'join_learn_process'``.
        """
        df, problem_definition, join_learn_process = self._unpack_old_args(
            from_data, kwargs, to_predict)
        p = LearnProcess(df, ProblemDefinition.from_dict(problem_definition),
                         name, company_id, datasource_id)
        p.start()
        if join_learn_process:
            p.join()
            if not IS_PY36:
                p.close()

    @mark_process(name='predict')
    def predict(self, name: str, when_data: Union[dict, list, pd.DataFrame],
                pred_format: str, company_id: int):
        """Run predictor ``name`` on ``when_data``.

        Args:
            name: predictor name (without the company prefix).
            when_data: a datasource description dict (with ``'class'``,
                ``'args'`` and ``'kwargs'``), a single row dict, or anything
                ``pd.DataFrame`` accepts.
            pred_format: ``'explain'``, ``'dict'`` or ``'dict&explain'`` for
                the legacy formats; anything else returns the raw records.
            company_id: owner of the predictor.

        Returns:
            A list (or, for ``'dict&explain'``, a tuple of two lists) in the
            requested format.

        Raises:
            AssertionError: no predictor record exists for the name.
            Exception: the predictor is not cached and its status is not
                ``'complete'``.
        """
        original_name = name
        name = f'{company_id}@@@@@{name}'

        predictor_record = db.session.query(db.Predictor).filter_by(
            company_id=company_id, name=original_name).first()
        assert predictor_record is not None
        predictor_data = self.get_model_data(name, company_id)
        fs_name = f'predictor_{company_id}_{predictor_record.id}'

        if name not in self.predictor_cache:
            # Clear the cache entirely if we have less than 1.2 GB left
            if psutil.virtual_memory().available < 1.2 * pow(10, 9):
                self.predictor_cache = {}

            if predictor_data['status'] == 'complete':
                predictors_dir = self.config['paths']['predictors']
                self.fs_store.get(fs_name, fs_name, predictors_dir)
                predictor_path = os.path.join(predictors_dir, fs_name)
                self.predictor_cache[name] = {
                    'predictor': lightwood.predictor_from_state(
                        predictor_path, predictor_record.code),
                    'created': datetime.datetime.now(),
                    'code': predictor_record.code,
                    'pickle': str(predictor_path)
                }
            else:
                raise Exception(
                    f'Trying to predict using predictor {original_name} with status: {predictor_data["status"]}'
                )

        if isinstance(when_data, dict) and 'kwargs' in when_data and 'args' in when_data:
            ds_cls = getattr(mindsdb_datasources, when_data['class'])
            df = ds_cls(*when_data['args'], **when_data['kwargs']).df
        else:
            if isinstance(when_data, dict):
                when_data = [when_data]
            df = pd.DataFrame(when_data)

        predictions = self.predictor_cache[name]['predictor'].predict(df)
        predictions = predictions.to_dict(orient='records')
        # Bellow is useful for debugging caching and storage issues
        # del self.predictor_cache[name]

        target = predictor_record.to_predict[0]
        if pred_format in ('explain', 'dict', 'dict&explain'):
            explain_arr = []
            dict_arr = []
            for i, row in enumerate(predictions):
                explain_arr.append({
                    target: {
                        'predicted_value': row['prediction'],
                        'confidence': row.get('confidence', None),
                        'confidence_lower_bound': row.get('lower', None),
                        'confidence_upper_bound': row.get('upper', None),
                        'anomaly': row.get('anomaly', None),
                        'truth': row.get('truth', None)
                    }
                })

                td = {'predicted_value': row['prediction']}
                for col in df.columns:
                    # Prefer the value echoed back in the prediction row
                    # (plain, order_ or group_ prefixed); fall back to the
                    # input frame.
                    if col in row:
                        td[col] = row[col]
                    elif f'order_{col}' in row:
                        td[col] = row[f'order_{col}']
                    elif f'group_{col}' in row:
                        td[col] = row[f'group_{col}']
                    else:
                        td[col] = df.iloc[i][col]
                dict_arr.append({target: td})
            if pred_format == 'explain':
                return explain_arr
            elif pred_format == 'dict':
                return dict_arr
            elif pred_format == 'dict&explain':
                return dict_arr, explain_arr
        # New format -- Try switching to this in 2-3 months for speed, for now above is ok
        else:
            return predictions

    @mark_process(name='analyse')
    def analyse_dataset(self, ds: dict, company_id: int) -> lightwood.DataAnalysis:
        """Load the datasource described by ``ds`` and return its analysis as a dict."""
        ds_cls = getattr(mindsdb_datasources, ds['class'])
        df = ds_cls(*ds['args'], **ds['kwargs']).df
        analysis = lightwood.analyze_dataset(df)
        return analysis.to_dict()  # type: ignore

    def get_model_data(self, name, company_id: int):
        """Return a dict describing predictor ``name``.

        ``name`` may be given with or without the ``'<company>@@@@@'``
        prefix; only the part after the separator is used for the lookup.
        Marks the record's ``update_status`` as ``'available'`` (and
        commits) when it was stored by an older mindsdb version.

        The ``'status'`` key is one of ``'complete'``, ``'generating'``,
        ``'editable'``, ``'training'`` or ``'error'``.

        Raises:
            AssertionError: the name has more than one separator, or no
                predictor record exists.
        """
        if '@@@@@' in name:
            sn = name.split('@@@@@')
            assert len(sn) < 3  # security
            name = sn[1]

        predictor_record = db.session.query(db.Predictor).filter_by(
            company_id=company_id, name=name).first()
        assert predictor_record is not None

        linked_db_ds = db.session.query(db.Datasource).filter_by(
            company_id=company_id, id=predictor_record.datasource_id).first()

        # check update availability
        if version.parse(predictor_record.mindsdb_version) < version.parse(mindsdb_version):
            predictor_record.update_status = 'available'
            db.session.commit()

        # A record may have no data yet (see the 'editable' branch below);
        # start from an empty dict so the assignments that follow do not
        # fail on None.
        data = deepcopy(predictor_record.data)
        if data is None:
            data = {}

        data['dtype_dict'] = predictor_record.dtype_dict
        data['created_at'] = str(
            parse_datetime(str(predictor_record.created_at).split('.')[0]))
        data['updated_at'] = str(
            parse_datetime(str(predictor_record.updated_at).split('.')[0]))
        data['predict'] = predictor_record.to_predict[0]
        data['update'] = predictor_record.update_status
        data['name'] = predictor_record.name
        data['code'] = predictor_record.code
        data['json_ai'] = predictor_record.json_ai
        data['data_source_name'] = linked_db_ds.name if linked_db_ds else None
        data['problem_definition'] = predictor_record.learn_args

        # assume older models are complete, only temporary
        if predictor_record.update_status == 'available':
            data['status'] = 'complete'
        elif predictor_record.json_ai is None and predictor_record.code is None:
            data['status'] = 'generating'
        elif predictor_record.data is None:
            data['status'] = 'editable'
        elif 'training_log' in predictor_record.data:
            data['status'] = 'training'
        elif 'error' not in predictor_record.data:
            data['status'] = 'complete'
        else:
            data['status'] = 'error'

        if data.get('accuracies', None) is not None:
            if len(data['accuracies']) > 0:
                data['accuracy'] = float(
                    np.mean(list(data['accuracies'].values())))
        return data

    def get_models(self, company_id: int):
        """Return a list of reduced model descriptions for ``company_id``.

        Each entry holds a fixed set of keys taken from ``get_model_data``;
        missing keys are ``None``. The date fields are parsed with
        ``parse_datetime`` when present.
        """
        models = []
        for db_p in db.session.query(db.Predictor).filter_by(company_id=company_id):
            model_data = self.get_model_data(db_p.name, company_id=company_id)
            reduced_model_data = {}

            for k in [
                'name', 'version', 'is_active', 'predict', 'status',
                'current_phase', 'accuracy', 'data_source', 'update',
                'data_source_name'
            ]:
                reduced_model_data[k] = model_data.get(k, None)

            for k in ['train_end_at', 'updated_at', 'created_at']:
                reduced_model_data[k] = model_data.get(k, None)
                if reduced_model_data[k] is not None:
                    try:
                        reduced_model_data[k] = parse_datetime(
                            str(reduced_model_data[k]).split('.')[0])
                    except Exception as e:
                        # @TODO Does this ever happen
                        # The exception is formatted into the message; a
                        # stray positional argument has no placeholder to
                        # fill.
                        log.error(
                            f'Date parsing exception while parsing: {k} in get_models: {e}')
                        reduced_model_data[k] = parse_datetime(
                            str(reduced_model_data[k]))

            models.append(reduced_model_data)
        return models

    def delete_model(self, name, company_id: int):
        """Delete predictor ``name``: its record, its registration and its stored state.

        Returns:
            ``0``.
        """
        original_name = name
        name = f'{company_id}@@@@@{name}'

        db_p = db.session.query(db.Predictor).filter_by(
            company_id=company_id, name=original_name).first()
        db.session.delete(db_p)
        db.session.commit()

        DatabaseWrapper(company_id).unregister_predictor(name)

        # delete from s3
        self.fs_store.delete(f'predictor_{company_id}_{db_p.id}')

        return 0

    def update_model(self, name: str, company_id: int):
        """Start an ``UpdateProcess`` for ``name`` and return a status message."""
        # TODO: Add version check here once we're done debugging
        p = UpdateProcess(name, company_id)
        p.start()
        return 'Updated in progress'

    @mark_process(name='learn')
    def generate_predictor(self, name: str, from_data: dict, datasource_id,
                           problem_definition_dict: dict,
                           join_learn_process: bool, company_id: int):
        """Start a ``GenerateProcess`` for ``name``; wait for it when ``join_learn_process`` is truthy."""
        df, problem_definition, _ = self._unpack_old_args(
            from_data, problem_definition_dict)
        p = GenerateProcess(df,
                            ProblemDefinition.from_dict(problem_definition),
                            name, company_id, datasource_id)
        p.start()
        if join_learn_process:
            p.join()
            if not IS_PY36:
                p.close()

    def edit_json_ai(self, name: str, json_ai: dict, company_id=None):
        """Replace the predictor's JsonAI and regenerate its code from it.

        Raises:
            AssertionError: no predictor record exists for the name.
        """
        predictor_record = db.session.query(db.Predictor).filter_by(
            company_id=company_id, name=name).first()
        assert predictor_record is not None

        json_ai = lightwood.JsonAI.from_dict(json_ai)
        predictor_record.code = lightwood.code_from_json_ai(json_ai)
        predictor_record.json_ai = json_ai.to_dict()
        db.session.commit()

    def code_from_json_ai(self, json_ai: dict, company_id=None):
        """Return the code lightwood generates for the given JsonAI dict."""
        json_ai = lightwood.JsonAI.from_dict(json_ai)
        code = lightwood.code_from_json_ai(json_ai)
        return code

    def edit_code(self, name: str, code: str, company_id=None):
        """Edit an existing predictor's code.

        Stores ``code`` on the record and clears its JsonAI.

        Raises:
            Exception: the ``'cloud'`` config flag is truthy.
            AssertionError: no predictor record exists for the name.
        """
        if self.config.get('cloud', False):
            raise Exception('Code editing prohibited on cloud')

        predictor_record = db.session.query(db.Predictor).filter_by(
            company_id=company_id, name=name).first()
        assert predictor_record is not None

        # The result is discarded; presumably this call only checks that
        # the code can be turned into a predictor -- TODO confirm.
        lightwood.predictor_from_code(code)
        predictor_record.code = code
        predictor_record.json_ai = None
        db.session.commit()

    @mark_process(name='learn')
    def fit_predictor(self, name: str, from_data: dict,
                      join_learn_process: bool, company_id: int) -> None:
        """Start a ``FitProcess`` for ``name``; wait for it when ``join_learn_process`` is truthy.

        Raises:
            AssertionError: no predictor record exists for the name.
        """
        predictor_record = db.session.query(db.Predictor).filter_by(
            company_id=company_id, name=name).first()
        assert predictor_record is not None

        df, _, _ = self._unpack_old_args(from_data, {}, None)
        p = FitProcess(predictor_record.id, df)
        p.start()
        if join_learn_process:
            p.join()
            if not IS_PY36:
                p.close()
# Make sure the root storage directory exists. exist_ok avoids the
# check-then-create race when two processes start at the same time.
os.makedirs(root_storage_dir, exist_ok=True)

# Storage DB connection string: the user config wins, then an already-set
# environment variable, then a sqlite file inside the storage directory.
if 'storage_db' in user_config:
    os.environ['MINDSDB_DB_CON'] = user_config['storage_db']
elif os.environ.get('MINDSDB_DB_CON', '') == '':
    os.environ['MINDSDB_DB_CON'] = 'sqlite:///' + os.path.join(
        os.environ['MINDSDB_STORAGE_DIR'],
        'mindsdb.sqlite3.db') + '?check_same_thread=False'

if 'company_id' in user_config:
    os.environ['MINDSDB_COMPANY_ID'] = user_config['company_id']

# Imported here, after the environment variables above have been set.
from mindsdb.utilities.config import Config
mindsdb_config = Config()
create_dirs_recursive(mindsdb_config['paths'])

# Log levels default to ERROR unless already set in the environment.
os.environ['DEFAULT_LOG_LEVEL'] = os.environ.get('DEFAULT_LOG_LEVEL', 'ERROR')
os.environ['LIGHTWOOD_LOG_LEVEL'] = os.environ.get('LIGHTWOOD_LOG_LEVEL', 'ERROR')
os.environ['MINDSDB_STORAGE_PATH'] = mindsdb_config['paths']['predictors']

if telemetry_file_exists(mindsdb_config['storage_dir']):
    os.environ['CHECK_FOR_UPDATES'] = '0'
    print('\n x telemetry disabled! \n')
# The value is lower-cased before the comparison, so only lower-case
# spellings need to be listed.
elif os.getenv('CHECK_FOR_UPDATES', '1').lower() in ('0', 'false'):
    disable_telemetry(mindsdb_config['storage_dir'])
    print('\n x telemetry disabled \n')
def __init__(self, ray_based: bool) -> None:
    """Store the ``ray_based`` flag and create config, file store and an empty cache."""
    self.config = Config()
    self.fs_store = FsStore()
    # Starts empty; filled by other methods of the class.
    self.predictor_cache = dict()
    self.ray_based = ray_based
import os import shelve import json from abc import ABC, abstractmethod import walrus from mindsdb.utilities.config import Config CONFIG = Config() class BaseCache(ABC): def __init__(self): self.config = Config() @abstractmethod def delete(self): pass @abstractmethod def __getitem__(self, key): pass @abstractmethod def __setitem__(self, key, value): pass class LocalCache(BaseCache): def __init__(self, name, *args, **kwargs): super().__init__()
def __init__(self, company_id):
    """Create a config object and remember the given ``company_id``."""
    config = Config()
    self.config = config
    self.company_id = company_id
process = api['process'] childs = get_child_pids(process.pid) for p in childs: os.kill(p, signal.SIGTERM) sys.stdout.flush() process.terminate() process.join() sys.stdout.flush() except KeyboardInterrupt: sys.exit(0) if __name__ == '__main__': mp.freeze_support() args = args_parse() config = Config() if args.verbose is True: config.set(['log', 'level', 'console'], 'DEBUG') os.environ['DEFAULT_LOG_LEVEL'] = config['log']['level']['console'] os.environ['LIGHTWOOD_LOG_LEVEL'] = config['log']['level']['console'] config.set(['mindsdb_last_started_at'], str(datetime.datetime.now())) from lightwood.__about__ import __version__ as lightwood_version from mindsdb_native.__about__ import __version__ as mindsdb_native_version from mindsdb.__about__ import __version__ as mindsdb_version print('Versions:') print(f' - lightwood {lightwood_version}') print(f' - MindsDB_native {mindsdb_native_version}') print(f' - MindsDB {mindsdb_version}')
import os import sys import logging import traceback from mindsdb.interfaces.storage.db import session, Log from mindsdb.utilities.config import Config global_config = Config().get_all() telemtry_enabled = os.getenv('CHECK_FOR_UPDATES', '1').lower() not in ['0', 'false', 'False'] if telemtry_enabled: import sentry_sdk from sentry_sdk import capture_exception, capture_message, add_breadcrumb sentry_sdk.init( "https://[email protected]/5633566", traces_sample_rate=0 #Set to `1` to experiment with performance metrics ) class LoggerWrapper(object): def __init__(self, writer_arr, default_writer_pos): self._writer_arr = writer_arr self.default_writer_pos = default_writer_pos def write(self, message): if len(message.strip(' \n')) == 0: return if 'DEBUG:' in message: self._writer_arr[0](message)
def __init__(self):
    """Create the instance's config object."""
    config = Config()
    self.config = config
def __init__(self):
    """Create config, file store and database wrapper; start with an empty cache.

    The company id is read from the ``MINDSDB_COMPANY_ID`` environment
    variable and is ``None`` when the variable is not set.
    """
    self.config = Config()
    # NOTE(review): 'FsSotre' looks like a misspelling of 'FsStore' --
    # confirm against the name this file imports before renaming.
    self.fs_store = FsSotre()
    self.company_id = os.environ.get('MINDSDB_COMPANY_ID')
    self.dbw = DatabaseWrapper()
    self.predictor_cache = dict()
from mindsdb.utilities.wizards import cli_config
from mindsdb.utilities.config import Config
from mindsdb.utilities.functions import args_parse

# Locate (or create) the config and storage directories.
config_dir, storage_dir = get_or_create_dir_struct()

# When no config.json exists yet, generate one with default settings.
config_path = os.path.join(config_dir, 'config.json')
if not os.path.exists(config_path):
    _ = cli_config(None, None, storage_dir, config_dir, use_default=True)

# A config path given on the command line overrides the default location.
args = args_parse()
if args.config is not None:
    config_path = args.config

# A config that cannot be loaded is fatal: print the error and exit with 1.
try:
    mindsdb_config = Config(config_path)
except Exception as e:
    print(str(e))
    sys.exit(1)

# Create every directory listed in the config's paths.
paths = mindsdb_config.paths
for path in paths.values():
    create_directory(path)

# These are set before the imports below.
# NOTE(review): presumably mindsdb_native / lightwood read them at import
# time -- confirm.
os.environ['MINDSDB_STORAGE_PATH'] = paths['predictors']
os.environ['DEFAULT_LOG_LEVEL'] = 'ERROR'
os.environ['LIGHTWOOD_LOG_LEVEL'] = 'ERROR'

from mindsdb_native import *
# Figure out how to add this as a module
import lightwood
def run(self):
    '''
    Execute one 'learn' or 'predict' transaction with mindsdb_native.

    Runs in a subprocess because of
    "ValueError: signal only works in main thread".
    (Open question kept from the original note: would this also be the
    place for a celery worker?)

    self._args is unpacked as
    (name, from_data, to_predict, kwargs, config, trx_type).

    For 'learn': trains the predictor, then makes a best-effort attempt to
    register it with the default ClickHouse and MariaDB integrations when
    they are enabled in the config. Returns None.

    For 'predict': returns the predictions produced by the predictor.
    '''
    import mindsdb_native

    from mindsdb.utilities.config import Config

    name, from_data, to_predict, kwargs, config, trx_type = self._args
    config = Config(config)

    mdb = mindsdb_native.Predictor(name=name)

    if trx_type == 'learn':
        data_source = getattr(mindsdb_native, from_data['class'])(
            *from_data['args'], **from_data['kwargs'])

        kwargs['use_gpu'] = config.get('use_gpu', None)
        mdb.learn(from_data=data_source, to_predict=to_predict, **kwargs)

        stats = mdb.get_model_data()['data_analysis_v2']

        # Registration is deliberately best-effort: a missing config
        # section, a missing driver or a failed connection must not fail
        # the training run. The 'enabled' check is a plain condition (not
        # an assert) so it still applies when Python runs with -O, and
        # only Exception is caught so KeyboardInterrupt / SystemExit
        # propagate.
        try:
            if config['integrations']['default_clickhouse']['enabled'] == True:  # noqa: E712
                from mindsdb.interfaces.clickhouse.clickhouse import Clickhouse
                clickhouse = Clickhouse(config)
                clickhouse.register_predictor(name, stats)
        except Exception:
            pass

        try:
            if config['integrations']['default_mariadb']['enabled'] == True:  # noqa: E712
                from mindsdb.interfaces.mariadb.mariadb import Mariadb
                mariadb = Mariadb(config)
                mariadb.register_predictor(name, stats)
        except Exception:
            pass

    if trx_type == 'predict':
        # A dict is passed through as a single 'when' condition; anything
        # else is treated as a datasource description.
        # NOTE(review): the else branch indexes from_data by string keys
        # although it is not a dict -- confirm what callers pass here.
        if isinstance(from_data, dict):
            when = from_data
            when_data = None
        else:
            when_data = getattr(mindsdb_native, from_data['class'])(
                *from_data['args'], **from_data['kwargs'])
            when = None

        kwargs['use_gpu'] = config.get('use_gpu', None)

        predictions = mdb.predict(
            when=when,
            when_data=when_data,
            run_confidence_variation_analysis=True,
            **kwargs
        )

        return predictions
import inspect import subprocess import MySQLdb from mindsdb.interfaces.native.mindsdb import MindsdbNative from mindsdb.utilities.config import Config TEST_CONFIG = '/home/maxs/dev/mdb/venv/sources/mindsdb/test_config.json' test_csv = 'tests/home_rentals.csv' test_data_table = 'home_rentals_400' test_predictor_name = 'test_predictor_400' config = Config(TEST_CONFIG) def query_ch(query): if 'CREATE ' not in query.upper() and 'INSERT ' not in query.upper(): query += ' FORMAT JSON' user = config['integrations']['default_clickhouse']['user'] password = config['integrations']['default_clickhouse']['password'] connect_string = 'http://{}:{}'.format( 'localhost', 8123 ) params = {}
def __init__(self, company_id):
    """Create config and a datasource interface bound to ``company_id``."""
    self.config = Config()
    self.company_id = company_id
    # Wrap a fresh DatasourceController together with this company id.
    datasource_controller = DatasourceController()
    self.datasource_interface = WithKWArgsWrapper(
        datasource_controller,
        company_id=company_id
    )