def get_models(self, company_id: int):
    models = []
    for db_p in db.session.query(db.Predictor).filter_by(company_id=company_id):
        model_data = self.get_model_data(db_p.name, company_id=company_id)
        reduced_model_data = {}

        for k in ['name', 'version', 'is_active', 'predict', 'status',
                  'current_phase', 'accuracy', 'data_source', 'update',
                  'data_source_name', 'mindsdb_version', 'error']:
            reduced_model_data[k] = model_data.get(k, None)

        for k in ['train_end_at', 'updated_at', 'created_at']:
            reduced_model_data[k] = model_data.get(k, None)
            if reduced_model_data[k] is not None:
                try:
                    reduced_model_data[k] = parse_datetime(str(reduced_model_data[k]).split('.')[0])
                except Exception as e:
                    # @TODO Does this ever happen
                    log.error(f'Date parsing exception while parsing: {k} in get_models: {e}')
                    reduced_model_data[k] = parse_datetime(str(reduced_model_data[k]))

        models.append(reduced_model_data)
    return models
def _to_mysql_table(self, dtype_dict, predicted_cols, columns):
    subtype_map = {
        dtype.integer: 'int',
        dtype.float: 'double',
        dtype.date: 'Date',
        dtype.datetime: 'Datetime',
        dtype.binary: 'VARCHAR(500)',
        dtype.categorical: 'VARCHAR(500)',
        dtype.tags: 'VARCHAR(500)',
        dtype.image: 'VARCHAR(500)',
        dtype.video: 'VARCHAR(500)',
        dtype.audio: 'VARCHAR(500)',
        dtype.short_text: 'VARCHAR(500)',
        dtype.rich_text: 'VARCHAR(500)',
        dtype.array: 'VARCHAR(500)'
    }

    column_declaration = []
    for name in columns:
        try:
            col_subtype = dtype_dict[name]
            new_type = subtype_map[col_subtype]
            column_declaration.append(f' `{name}` {new_type} ')
            if name in predicted_cols:
                column_declaration.append(f' `{name}_original` {new_type} ')
        except Exception as e:
            log.error(f'Error: can not determine mysql data type for column {name}: {e}')

    return column_declaration
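# Hypothetical usage sketch (not part of the original source): the declaration
# strings produced by the _to_*_table helpers are typically joined into a
# CREATE TABLE statement. The database, table and column names below are
# placeholders chosen only for illustration.
column_declaration = [' `sqft` int ', ' `rental_price` double ', ' `rental_price_original` double ']
create_stmt = f"CREATE TABLE mindsdb.some_predictor ({','.join(column_declaration)})"
# -> CREATE TABLE mindsdb.some_predictor ( `sqft` int , `rental_price` double , `rental_price_original` double )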
def _to_clickhouse_table(self, stats, predicted_cols, columns):
    subtype_map = {
        DATA_SUBTYPES.INT: 'Nullable(Int64)',
        DATA_SUBTYPES.FLOAT: 'Nullable(Float64)',
        DATA_SUBTYPES.BINARY: 'Nullable(UInt8)',
        DATA_SUBTYPES.DATE: 'Nullable(Date)',
        DATA_SUBTYPES.TIMESTAMP: 'Nullable(Datetime)',
        DATA_SUBTYPES.SINGLE: 'Nullable(String)',
        DATA_SUBTYPES.MULTIPLE: 'Nullable(String)',
        DATA_SUBTYPES.TAGS: 'Nullable(String)',
        DATA_SUBTYPES.IMAGE: 'Nullable(String)',
        DATA_SUBTYPES.VIDEO: 'Nullable(String)',
        DATA_SUBTYPES.AUDIO: 'Nullable(String)',
        DATA_SUBTYPES.SHORT: 'Nullable(String)',
        DATA_SUBTYPES.RICH: 'Nullable(String)',
        DATA_SUBTYPES.ARRAY: 'Nullable(String)'
    }

    column_declaration = []
    for name in columns:
        try:
            col_subtype = stats[name]['typing']['data_subtype']
            new_type = subtype_map[col_subtype]
            column_declaration.append(f' `{name}` {new_type} ')
            if name in predicted_cols:
                column_declaration.append(f' `{name}_original` {new_type} ')
        except Exception as e:
            log.error(f'Error: can not determine clickhouse data type for column {name}: {e}')

    return column_declaration
def predict(self, stream_in, stream_out, timeseries_mode=False):
    predict_info = stream_in.read(block=0)
    when_list = []
    for record in predict_info:
        record_id = record[0]
        raw_when_data = record[1]
        when_data = self.decode(raw_when_data)
        if timeseries_mode:
            # if self.target not in when_data:
            #     when_data['make_predictions'] = False
            # else:
            #     when_data['make_predictions'] = True
            when_list.append(when_data)
        else:
            result = self.native_interface.predict(self.predictor, self.format_flag, when_data=when_data)
            log.error(f"STREAM: got {result}")
            for res in result:
                in_json = json.dumps(res)
                stream_out.add({"prediction": in_json})
            stream_in.delete(record_id)

    if timeseries_mode:
        result = self.native_interface.predict(self.predictor, self.format_flag, when_data=when_list)
        log.error(f"TIMESERIES STREAM: got {result}")
        for res in result:
            in_json = json.dumps(res)
            stream_out.add({"prediction": in_json})
        stream_in.trim(len(stream_in) - 1, approximate=False)
def save_datasource(self, name, source_type, source, file_path=None, company_id=None):
    if source_type == 'file' and (file_path is None):
        raise Exception('`file_path` argument required when source_type == "file"')

    datasource_record = session.query(Datasource).filter_by(company_id=company_id, name=name).first()
    if datasource_record is not None:
        raise Exception(f'Datasource with name {name} already exists')

    try:
        datasource_record = Datasource(
            company_id=company_id,
            name=name,
            datasources_version=mindsdb_datasources.__version__,
            mindsdb_version=mindsdb_version
        )
        session.add(datasource_record)
        session.commit()

        ds_meta_dir = os.path.join(self.dir, f'{company_id}@@@@@{name}')
        os.mkdir(ds_meta_dir)

        ds, creation_info = self.create_datasource(source_type, source, file_path, company_id, ds_meta_dir)

        if hasattr(ds, 'get_columns') and hasattr(ds, 'get_row_count'):
            try:
                column_names = ds.get_columns()
                row_count = ds.get_row_count()
            except Exception:
                df = ds.df
                column_names = list(df.keys())
                row_count = len(df)
        else:
            df = ds.df
            column_names = list(df.keys())
            row_count = len(df)

        if '' in column_names or len(column_names) != len(set(column_names)):
            shutil.rmtree(ds_meta_dir)
            raise Exception('Each column in datasource must have unique non-empty name')

        datasource_record.creation_info = json.dumps(creation_info)
        datasource_record.data = json.dumps({
            'source_type': source_type,
            'source': source,
            'row_count': row_count,
            'columns': [dict(name=x) for x in column_names]
        })

        self.fs_store.put(f'{company_id}@@@@@{name}', f'datasource_{company_id}_{datasource_record.id}', self.dir)
        session.commit()
    except Exception as e:
        log.error(f'Error creating datasource {name}, exception: {e}')
        try:
            self.delete_datasource(name, company_id=company_id)
        except Exception:
            pass
        raise e

    return self.get_datasource_obj(name, raw=True, company_id=company_id)
def select(self, query):
    if isinstance(query, str):
        query_str = query
    else:
        if self.ds_type in ('postgres', 'snowflake'):
            dialect = 'postgres'
        else:
            dialect = 'mysql'
        render = SqlalchemyRender(dialect)
        try:
            query_str = render.get_string(query, with_failback=False)
        except Exception as e:
            log.error(f"Exception during query casting to '{dialect}' dialect. Query: {query}. Error: {e}")
            query_str = render.get_string(query, with_failback=True)

    dso, _creation_info = self.data_store.create_datasource(self.integration_name, {'query': query_str})
    data = dso.df.to_dict(orient='records')
    column_names = list(dso.df.columns)

    for column_name in column_names:
        if pd.core.dtypes.common.is_datetime_or_timedelta_dtype(dso.df[column_name]):
            pass_data = dso.df[column_name].dt.to_pydatetime()
            for i, rec in enumerate(data):
                rec[column_name] = pass_data[i].timestamp()

    if len(column_names) == 0:
        column_names = ['dataframe_is_empty']

    return data, column_names
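# Minimal standalone sketch (pandas only; column name and dates are placeholder
# values) of the datetime handling in select() above: datetime-typed columns are
# converted to Unix timestamps (float seconds) before the records are returned.
import pandas as pd

df = pd.DataFrame({'created_at': pd.to_datetime(['2021-01-01', '2021-06-01'])})
records = df.to_dict(orient='records')
if pd.core.dtypes.common.is_datetime_or_timedelta_dtype(df['created_at']):
    as_py = df['created_at'].dt.to_pydatetime()
    for i, rec in enumerate(records):
        rec['created_at'] = as_py[i].timestamp()
# records now holds float epoch seconds instead of pandas Timestamp objects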
def get_models(self):
    from mindsdb.interfaces.storage.db import session, Predictor

    models = []
    predictor_records = Predictor.query.filter_by(company_id=self.company_id, is_custom=False)
    predictor_names = [x.name for x in predictor_records]
    for model_name in predictor_names:
        try:
            if self.ray_based:
                model_data = self.get_model_data(model_name, db_fix=False)
            else:
                bin = self.get_model_data(model_name, db_fix=False)
                model_data = pickle.loads(bin.data)

            reduced_model_data = {}

            for k in ['name', 'version', 'is_active', 'predict', 'status',
                      'current_phase', 'accuracy', 'data_source', 'update']:
                reduced_model_data[k] = model_data.get(k, None)

            for k in ['train_end_at', 'updated_at', 'created_at']:
                reduced_model_data[k] = model_data.get(k, None)
                if reduced_model_data[k] is not None:
                    try:
                        reduced_model_data[k] = parse_datetime(str(reduced_model_data[k]).split('.')[0])
                    except Exception as e:
                        # @TODO Does this ever happen
                        log.error(f'Date parsing exception while parsing: {k} in get_models: {e}')
                        reduced_model_data[k] = parse_datetime(str(reduced_model_data[k]))

            models.append(reduced_model_data)
        except Exception as e:
            log.error(f"Can't list data for model: '{model_name}' when calling `get_models()`, error: {e}")

    return self._pack(models)
def get_datasource_obj(self, name=None, id=None, raw=False, company_id=None):
    try:
        if name is not None:
            datasource_record = session.query(Datasource).filter_by(company_id=company_id, name=name).first()
        else:
            datasource_record = session.query(Datasource).filter_by(company_id=company_id, id=id).first()

        self.fs_store.get(f'{company_id}@@@@@{name}', f'datasource_{company_id}_{datasource_record.id}', self.dir)
        creation_info = json.loads(datasource_record.creation_info)
        if raw:
            return creation_info
        else:
            return eval(creation_info['class'])(*creation_info['args'], **creation_info['kwargs'])
    except Exception as e:
        log.error(f'Error getting datasource {name}, exception: {e}')
        return None
def put(self, name):
    params = request.json.get('params')
    if not isinstance(params, dict):
        abort(400, "type of 'params' must be dict")

    is_test = params.get('test', False)
    if is_test:
        del params['test']
        db_type = params.get('type')
        checker_class = CHECKERS.get(db_type, None)
        if checker_class is None:
            abort(400, f"Unknown integration type: {db_type}")
        checker = checker_class(**params)
        return {'success': checker.check_connection()}, 200

    integration = get_integration(name)
    if integration is not None:
        abort(400, f"Integration with name '{name}' already exists")

    try:
        if 'enabled' in params:
            params['publish'] = params['enabled']
            del params['enabled']
        ca.config_obj.add_db_integration(name, params)

        model_data_arr = get_all_models_meta_data(ca.naitve_interface, ca.custom_models)
        ca.dbw.setup_integration(name)
        if is_test is False:
            ca.dbw.register_predictors(model_data_arr, name)
    except Exception as e:
        log.error(str(e))
        abort(500, f'Error during config update: {str(e)}')

    return '', 200
def analyzing_thread(name, default_store):
    try:
        from mindsdb.interfaces.storage.db import session
        analysis = default_store.start_analysis(name)
        session.close()
    except Exception as e:
        log.error(e)
def _to_postgres_table(self, stats, predicted_cols, columns):
    subtype_map = {
        DATA_SUBTYPES.INT: 'int8',
        DATA_SUBTYPES.FLOAT: 'float8',
        DATA_SUBTYPES.BINARY: 'bool',
        DATA_SUBTYPES.DATE: 'date',
        DATA_SUBTYPES.TIMESTAMP: 'timestamp',
        DATA_SUBTYPES.SINGLE: 'text',
        DATA_SUBTYPES.MULTIPLE: 'text',
        DATA_SUBTYPES.TAGS: 'text',
        DATA_SUBTYPES.IMAGE: 'text',
        DATA_SUBTYPES.VIDEO: 'text',
        DATA_SUBTYPES.AUDIO: 'text',
        DATA_SUBTYPES.SHORT: 'text',
        DATA_SUBTYPES.RICH: 'text',
        DATA_SUBTYPES.ARRAY: 'text'
    }

    column_declaration = []
    for name in columns:
        try:
            col_subtype = stats[name]['typing']['data_subtype']
            new_type = subtype_map[col_subtype]
            column_declaration.append(f' "{name}" {new_type} ')
            if name in predicted_cols:
                column_declaration.append(f' "{name}_original" {new_type} ')
        except Exception as e:
            log.error(f'Error: can not determine postgres data type for column {name}: {e}')

    return column_declaration
def start_analysis(self, name):
    datasource_record = session.query(Datasource).filter_by(company_id=self.company_id, name=name).first()
    if datasource_record.analysis is not None:
        return None

    semaphor_record = session.query(Semaphor).filter_by(
        company_id=self.company_id, entity_id=datasource_record.id, entity_type='datasource'
    ).first()
    if semaphor_record is None:
        semaphor_record = Semaphor(
            company_id=self.company_id,
            entity_id=datasource_record.id,
            entity_type='datasource',
            action='write'
        )
        session.add(semaphor_record)
        session.commit()
    else:
        return

    try:
        analysis = self.mindsdb_native.analyse_dataset(self.get_datasource_obj(name, raw=True))
        datasource_record = session.query(Datasource).filter_by(company_id=self.company_id, name=name).first()
        datasource_record.analysis = json.dumps(analysis)
        session.commit()
    except Exception as e:
        log.error(e)
    finally:
        semaphor_record = session.query(Semaphor).filter_by(
            company_id=self.company_id, entity_id=datasource_record.id, entity_type='datasource'
        ).first()
        session.delete(semaphor_record)
        session.commit()
def _to_clickhouse_table(self, dtype_dict, predicted_cols, columns):
    subtype_map = {
        dtype.integer: 'Nullable(Int64)',
        dtype.float: 'Nullable(Float64)',
        dtype.date: 'Nullable(Date)',
        dtype.datetime: 'Nullable(Datetime)',
        dtype.binary: 'Nullable(String)',
        dtype.categorical: 'Nullable(String)',
        dtype.tags: 'Nullable(String)',
        dtype.image: 'Nullable(String)',
        dtype.video: 'Nullable(String)',
        dtype.audio: 'Nullable(String)',
        dtype.short_text: 'Nullable(String)',
        dtype.rich_text: 'Nullable(String)',
        dtype.quantity: 'Nullable(String)',
        dtype.num_array: 'Nullable(String)',
        dtype.cat_array: 'Nullable(String)',
        dtype.num_tsarray: 'Nullable(String)',
        dtype.cat_tsarray: 'Nullable(String)',
        'default': 'Nullable(String)'
    }

    column_declaration = []
    for name in columns:
        try:
            col_subtype = dtype_dict[name]
            new_type = subtype_map.get(col_subtype, subtype_map.get('default'))
            column_declaration.append(f' `{name}` {new_type} ')
            if name in predicted_cols:
                column_declaration.append(f' `{name}_original` {new_type} ')
        except Exception as e:
            log.error(f'Error: can not determine clickhouse data type for column {name}: {e}')

    return column_declaration
def _to_postgres_table(self, dtype_dict, predicted_cols, columns):
    subtype_map = {
        dtype.integer: 'int8',
        dtype.float: 'float8',
        dtype.date: 'date',
        dtype.datetime: 'timestamp',
        dtype.binary: 'text',
        dtype.categorical: 'text',
        dtype.tags: 'text',
        dtype.image: 'text',
        dtype.video: 'text',
        dtype.audio: 'text',
        dtype.short_text: 'text',
        dtype.rich_text: 'text',
        dtype.array: 'text',
        dtype.quantity: 'text',
        dtype.tsarray: 'text',
        'default': 'text'
    }

    column_declaration = []
    for name in columns:
        try:
            col_subtype = dtype_dict[name]
            new_type = subtype_map.get(col_subtype, subtype_map.get('default'))
            column_declaration.append(f' "{name}" {new_type} ')
            if name in predicted_cols:
                column_declaration.append(f' "{name}_original" {new_type} ')
        except Exception as e:
            log.error(f'Error: can not determine postgres data type for column {name}: {e}')

    return column_declaration
def _to_mariadb_table(self, stats, predicted_cols, columns):
    subtype_map = {
        DATA_SUBTYPES.INT: 'int',
        DATA_SUBTYPES.FLOAT: 'double',
        DATA_SUBTYPES.BINARY: 'bool',
        DATA_SUBTYPES.DATE: 'Date',
        DATA_SUBTYPES.TIMESTAMP: 'Datetime',
        DATA_SUBTYPES.SINGLE: 'VARCHAR(500)',
        DATA_SUBTYPES.MULTIPLE: 'VARCHAR(500)',
        DATA_SUBTYPES.TAGS: 'VARCHAR(500)',
        DATA_SUBTYPES.IMAGE: 'VARCHAR(500)',
        DATA_SUBTYPES.VIDEO: 'VARCHAR(500)',
        DATA_SUBTYPES.AUDIO: 'VARCHAR(500)',
        DATA_SUBTYPES.SHORT: 'VARCHAR(500)',
        DATA_SUBTYPES.RICH: 'VARCHAR(500)',
        DATA_SUBTYPES.ARRAY: 'VARCHAR(500)'
    }

    column_declaration = []
    for name in columns:
        try:
            col_subtype = stats[name]['typing']['data_subtype']
            new_type = subtype_map[col_subtype]
            column_declaration.append(f' `{name}` {new_type} ')
            if name in predicted_cols:
                column_declaration.append(f' `{name}_original` {new_type} ')
        except Exception as e:
            log.error(f'Error: can not determine mariadb data type for column {name}: {e}')

    return column_declaration
def delete(self, name):
    '''delete datasource'''
    try:
        ca.default_store.delete_datasource(name)
    except Exception as e:
        log.error(e)
        abort(400, str(e))
    return '', 200
def make_prediction_from_cache(self, cache_name):
    cache = self.caches[cache_name]
    log.error("STREAM: in make_prediction_from_cache")
    if len(cache) >= self.window:
        log.error(f"STREAM: make_prediction_from_cache - len(cache) = {len(cache)}")
        self.predict_ts(cache_name)
def to_cache(self, record):
    gb_val = record[self.gb]
    cache = self.client.Stream(f"{self.stream_name}.cache.{gb_val}")
    log.error(f"STREAM: cache {cache.key} has been created")
    self.make_prediction_from_cache(cache)
    self.handle_record(cache, record)
    self.make_prediction_from_cache(cache)
    log.error("STREAM in cache: current iteration is done.")
def run_update(name: str, company_id: int):
    original_name = name
    name = f'{company_id}@@@@@{name}'

    fs_store = FsStore()
    config = Config()
    data_store = DataStoreWrapper(DataStore(), company_id)

    try:
        predictor_record = Predictor.query.filter_by(company_id=company_id, name=original_name).first()
        assert predictor_record is not None

        predictor_record.update_status = 'updating'
        session.commit()

        ds = data_store.get_datasource_obj(None, raw=False, id=predictor_record.datasource_id)
        df = ds.df

        problem_definition = predictor_record.learn_args
        problem_definition['target'] = predictor_record.to_predict[0]

        if 'join_learn_process' in problem_definition:
            del problem_definition['join_learn_process']

        # Adapt kwargs to problem definition
        if 'timeseries_settings' in problem_definition:
            problem_definition['timeseries_settings'] = problem_definition['timeseries_settings']
        if 'stop_training_in_x_seconds' in problem_definition:
            problem_definition['time_aim'] = problem_definition['stop_training_in_x_seconds']

        json_ai = lightwood.json_ai_from_problem(df, problem_definition)
        predictor_record.json_ai = json_ai.to_dict()
        predictor_record.code = lightwood.code_from_json_ai(json_ai)
        predictor_record.data = {'training_log': 'training'}
        session.commit()

        predictor: lightwood.PredictorInterface = lightwood.predictor_from_code(predictor_record.code)
        predictor.learn(df)

        fs_name = f'predictor_{predictor_record.company_id}_{predictor_record.id}'
        pickle_path = os.path.join(config['paths']['predictors'], fs_name)
        predictor.save(pickle_path)
        fs_store.put(fs_name, fs_name, config['paths']['predictors'])

        predictor_record.data = predictor.model_analysis.to_dict()  # type: ignore
        session.commit()

        predictor_record.lightwood_version = lightwood.__version__
        predictor_record.mindsdb_version = mindsdb_version
        predictor_record.update_status = 'up_to_date'
        session.commit()
    except Exception as e:
        log.error(e)
        predictor_record.update_status = 'update_failed'  # type: ignore
        session.commit()
        return str(e)
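# Rough standalone sketch of the lightwood high-level flow used in run_update()
# above (json_ai -> generated code -> predictor). Assumes lightwood's
# json_ai_from_problem / code_from_json_ai / predictor_from_code API;
# 'home_rentals.csv' and 'rental_price' are placeholder names.
import pandas as pd
import lightwood

df = pd.read_csv('home_rentals.csv')
json_ai = lightwood.json_ai_from_problem(df, {'target': 'rental_price'})
code = lightwood.code_from_json_ai(json_ai)
predictor = lightwood.predictor_from_code(code)
predictor.learn(df)
predictions = predictor.predict(df)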
def delete(self, name):
    try:
        session.query(StreamDB).filter_by(company_id=COMPANY_ID, name=name).delete()
        session.commit()
    except Exception as e:
        log.error(e)
        abort(400, str(e))
    return '', 200
def get_datasource(self, name, company_id=None):
    datasource_arr = self.get_datasources(name, company_id=company_id)
    if len(datasource_arr) == 1:
        return datasource_arr[0]
    # @TODO: Remove when the db switch is more stable; this should never happen, but it's a good sanity check while this is kinda buggy
    elif len(datasource_arr) > 1:
        log.error(f'Two or more datasources with the same name ({len(datasource_arr)}) | Full list: {datasource_arr}')
        raise Exception('Two or more datasources with the same name')
    return None
def delete(self, name):
    integration = get_db_integration(name, request.company_id)
    if integration is None:
        abort(400, f"Nothing to delete. '{name}' does not exist.")
    try:
        remove_db_integration(name, request.company_id)
    except Exception as e:
        log.error(str(e))
        abort(500, f'Error during integration delete: {str(e)}')
    return '', 200
def delete(self, name):
    integration = request.integration_controller.get(name)
    if integration is None:
        abort(400, f"Nothing to delete. '{name}' does not exist.")
    try:
        request.integration_controller.delete(name)
    except Exception as e:
        log.error(str(e))
        abort(500, f'Error during integration delete: {str(e)}')
    return '', 200
def save_datasource(self, name, source_type, source=None, file_path=None, company_id=None):
    dataset_record = session.query(Dataset).filter_by(company_id=company_id, name=name).first()
    if dataset_record is not None:
        raise Exception(f'Dataset with name {name} already exists')

    if source_type == 'views':
        source_type = 'view_query'
    elif source_type == 'files':
        source_type = 'file'

    try:
        dataset_record = Dataset(
            company_id=company_id,
            name=name,
            datasources_version=mindsdb_datasources.__version__,
            mindsdb_version=mindsdb_version
        )
        session.add(dataset_record)
        session.commit()

        ds, creation_info = self.create_datasource(source_type, source, file_path, company_id)

        ds_meta = self._get_ds_meta(ds)
        column_names = ds_meta['column_names']
        row_count = ds_meta['row_count']

        dataset_record.ds_class = creation_info['class']
        dataset_record.creation_info = json.dumps(creation_info)
        dataset_record.data = json.dumps({
            'source_type': source_type,
            'source': source,
            'row_count': row_count,
            'columns': [dict(name=x) for x in column_names]
        })
        session.commit()
    except Exception as e:
        log.error(f'Error creating dataset {name}, exception: {e}')
        try:
            self.delete_datasource(name, company_id=company_id)
        except Exception:
            pass
        raise e

    return self.get_datasource_obj(name, raw=True, company_id=company_id)
def _decode(self, b_dict):
    """convert binary key/value into strings"""
    decoded = {}
    if not isinstance(b_dict, dict):
        log.error(f"Integration {self.name}: got unexpected data format from redis control stream {self.control_stream_name}: {b_dict}")
        return {}
    for k in b_dict:
        decoded[k.decode('utf8')] = b_dict[k].decode('utf8')
    return decoded
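# Illustration only (values are made up): redis delivers stream fields as raw
# bytes, and _decode() turns them into plain str keys and values like this.
b_dict = {b'action': b'create', b'predictor': b'home_rentals_model'}
decoded = {k.decode('utf8'): v.decode('utf8') for k, v in b_dict.items()}
# decoded == {'action': 'create', 'predictor': 'home_rentals_model'}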
async def wait_apis_start():
    futures = [
        wait_api_start(api_name, api_data['process'].pid, api_data['port'])
        for api_name, api_data in apis.items() if 'port' in api_data
    ]
    for i, future in enumerate(asyncio.as_completed(futures)):
        api_name, port, started = await future
        if started:
            print(f"{api_name} API: started on {port}")
        else:
            log.error(f"ERROR: {api_name} API can't start on {port}")
def to_cache(self, record):
    gb_val = record[self.gb]
    cache_name = f"cache.{gb_val}"
    if cache_name not in self.caches:
        cache = []
        self.caches[cache_name] = cache
        log.error(f"STREAM: cache {cache_name} has been created")
    self.make_prediction_from_cache(cache_name)
    self.handle_record(cache_name, record)
    self.make_prediction_from_cache(cache_name)
    log.error("STREAM in cache: current iteration is done.")
def delete(self, name):
    '''delete datasource'''
    try:
        request.default_store.delete_datasource(name)
    except Exception as e:
        log.error(e)
        return http_error(
            400,
            "Error deleting datasource",
            f"There was an error while trying to delete datasource with name '{name}'"
        )
    return '', 200
def get(self, name):
    analysis = request.default_store.get_analysis(name)
    if analysis is not None:
        return analysis, 200

    ds = request.default_store.get_datasource(name)
    if ds is None:
        log.error('No valid datasource given')
        abort(400, 'No valid datasource given')

    x = threading.Thread(target=analyzing_thread, args=(name, request.default_store))
    x.start()
    return {'status': 'analyzing'}, 200
def make_prediction(self):
    predict_record = session.query(DBPredictor).filter_by(company_id=self.company_id, name=self.predictor).first()
    if predict_record is None:
        log.error(f"Error creating stream: requested predictor {self.predictor} does not exist")
        return

    while not self.stop_event.wait(0.5):
        try:
            msg_str = next(self.consumer)
            when_data = json.loads(msg_str.value)
            result = self.native_interface.predict(self.predictor, self.format_flag, when_data=when_data)
            log.error(f"STREAM: got {result}")
            for res in result:
                in_json = json.dumps({"prediction": res})
                to_send = in_json.encode('utf-8')
                log.error(f"sending {to_send}")
                self.producer.send(self.stream_out_name, to_send)
        except StopIteration:
            pass

    log.error("Stopping stream..")
    self.producer.close()
    self.consumer.close()
    session.close()
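# Hypothetical standalone sketch of the producer side used in make_prediction()
# above, assuming the kafka-python client; 'localhost:9092', 'predictions' and
# the payload contents are placeholder values.
import json
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers='localhost:9092')
payload = json.dumps({"prediction": {"rental_price": 1200}}).encode('utf-8')
producer.send('predictions', payload)
producer.flush()
producer.close()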