class Predictor:
    """
    Public API of a MindsDB 'mind': an object that can learn from data and
    then predict with the resulting model.

    All model artifacts are read from / written to CONFIG.MINDSDB_STORAGE_PATH
    as `<name>_light_model_metadata.pickle` and
    `<name>_heavy_model_metadata.pickle`.
    """

    def __init__(self,
                 name,
                 root_folder=CONFIG.MINDSDB_STORAGE_PATH,
                 log_level=CONFIG.DEFAULT_LOG_LEVEL,
                 log_server=CONFIG.MINDSDB_SERVER_URL):
        """
        Create a predictor and make sure the MindsDB storage folder is usable.

        :param name: the namespace you want to identify this mind instance with
        :param root_folder: stored on the instance as `self.root_folder`; note that every
            storage operation in this class uses CONFIG.MINDSDB_STORAGE_PATH, not this value
        :param log_level: the desired log level
        :param log_server: the url for a server that can accept log streams

        :raises ValueError: if the storage folder cannot be created or is not writable
        """
        # initialize variables
        self.name = name
        self.root_folder = root_folder
        self.uuid = str(uuid.uuid1())

        # initialize log
        self.log = MindsdbLogger(log_level=log_level,
                                 send_logs=False,
                                 log_url=log_server,
                                 uuid=self.uuid)

        # check for updates in a background thread so construction is not blocked
        _thread.start_new_thread(check_for_updates, ())

        # make sure the storage folder exists; try to create it when it does not
        storage_ok = True
        if not os.path.exists(CONFIG.MINDSDB_STORAGE_PATH):
            try:
                self.log.info(
                    '{folder} does not exist, creating it now'.format(
                        folder=CONFIG.MINDSDB_STORAGE_PATH))
                path = Path(CONFIG.MINDSDB_STORAGE_PATH)
                path.mkdir(exist_ok=True, parents=True)
            except Exception:
                self.log.info(traceback.format_exc())
                storage_ok = False
                self.log.error(
                    'MindsDB storage foldler: {folder} does not exist and could not be created'
                    .format(folder=CONFIG.MINDSDB_STORAGE_PATH))

        # Without a writable storage folder nothing can be persisted, so fail early
        if not storage_ok or not os.access(CONFIG.MINDSDB_STORAGE_PATH, os.W_OK):
            error_message = '''Cannot write into storage path, please either set the config variable mindsdb.config.set('MINDSDB_STORAGE_PATH',<path>) or give write access to {folder}'''
            raise ValueError(
                error_message.format(folder=CONFIG.MINDSDB_STORAGE_PATH))

    def get_models(self):
        """
        List a summary of every model found in the storage folder.

        A model is detected by the presence of a file whose name contains
        '_light_model_metadata.pickle'. Models whose metadata cannot be adapted
        are reported on stdout and left out of the result.

        :return: list of dicts, one per model, holding the summary keys
            (missing keys are set to None)
        """
        summary_keys = [
            'name', 'version', 'is_active', 'data_source', 'predict',
            'accuracy', 'status', 'train_end_at', 'updated_at', 'created_at',
            'current_phase'
        ]

        models = []
        for fn in os.listdir(CONFIG.MINDSDB_STORAGE_PATH):
            if '_light_model_metadata.pickle' not in fn:
                continue

            model_name = fn.replace('_light_model_metadata.pickle', '')
            try:
                amd = self.get_model_data(model_name)
                model = {}
                for k in summary_keys:
                    if k in amd:
                        model[k] = amd[k]
                    else:
                        model[k] = None
                        print(
                            f'Key {k} not found in the light model metadata !')
                models.append(model)
            except Exception as e:
                print(e)
                print(
                    f"Can't adapt metadata for model: '{model_name}' when calling `get_models()`"
                )

        return models

    def _adapt_column(self, col_stats, col):
        """
        Convert the raw statistics of one column into the "adapted" column
        metadata layout built by `get_model_data`.

        Every raw score `s` read from `col_stats` is reported as
        `round(10 * (1 - s))`.

        :param col_stats: dict with the statistics of the column; the keys read here
            ('data_type', 'data_type_dist', 'histogram', the '*_score' / '*_warning'
            entries, ...) must be present or a KeyError is raised
        :param col: the column name
        :return: dict describing the column (types, distributions, histogram and the
            'consistency', 'redundancy' and 'variability' score groups)
        """

        def to_ten_scale(raw_score):
            # Scores are reported inverted and scaled: round(10 * (1 - raw))
            return round(10 * (1 - raw_score))

        icm = {}
        icm['column_name'] = col
        icm['data_type'] = col_stats['data_type']
        icm['data_subtype'] = col_stats['data_subtype']

        type_dist = col_stats['data_type_dist']
        icm['data_type_distribution'] = {
            'type': "categorical",
            'x': [k for k in type_dist],
            'y': [type_dist[k] for k in type_dist]
        }

        subtype_dist = col_stats['data_subtype_dist']
        icm['data_subtype_distribution'] = {
            'type': "categorical",
            'x': [k for k in subtype_dist],
            'y': [subtype_dist[k] for k in subtype_dist]
        }

        histogram = col_stats['histogram']
        data_histogram = {"type": "categorical", 'x': [], 'y': []}
        for i, x_value in enumerate(histogram['x']):
            data_histogram['x'].append(x_value)
            data_histogram['y'].append(histogram['y'][i])

        icm['data_distribution'] = {}
        icm['data_distribution']['data_histogram'] = data_histogram
        icm['data_distribution']['clusters'] = [{"group": [], "members": []}]

        scores = ['consistency_score', 'redundancy_score', 'variability_score']
        for score in scores:
            metrics = []

            if score == 'consistency_score':
                simple_description = "A low value indicates the data is not very consistent, it's either missing a lot of valus or the type (e.g. number, text, category, date) of values varries quite a lot."
                metrics.append({
                    "type": "score",
                    "name": "Type Distribution",
                    "score": to_ten_scale(col_stats['data_type_distribution_score']),
                    "description": "A low value indicates that we can't consistently determine a single data type (e.g. number, text, category, date) for most values in this column",
                    "warning": col_stats['data_type_distribution_score_warning']
                })
                metrics.append({
                    "type": "score",
                    "score": to_ten_scale(col_stats['empty_cells_score']),
                    "name": "Empty Cells",
                    "description": "A low value indicates that a lot of the values in this column are empty or null. A value of 10 means no cell is missing data, a value of 0 means no cell has any data.",
                    "warning": col_stats['empty_cells_score_warning']
                })
                if 'duplicates_score' in col_stats:
                    metrics.append({
                        "type": "score",
                        "name": "Value Duplication",
                        "score": to_ten_scale(col_stats['duplicates_score']),
                        "description": "A low value indicates that a lot of the values in this columns are duplicates, as in, the same value shows up more than once in the column. This is not necessarily bad and could be normal for certain data types.",
                        "warning": col_stats['duplicates_score_warning']
                    })

            if score == 'variability_score':
                simple_description = "A low value indicates a high possibility of some noise affecting your data collection process. This could mean that the values for this column are not collected or processed correctly."
                if 'lof_based_outlier_score' in col_stats and 'z_test_based_outlier_score' in col_stats:
                    # NOTE(review): this LOF-based metric carries the same name as the
                    # z-test metric below ("Z Outlier Score"); it looks like a copy-paste
                    # slip, but the string is kept because consumers may key on it.
                    metrics.append({
                        "type": "score",
                        "name": "Z Outlier Score",
                        "score": to_ten_scale(col_stats['lof_based_outlier_score']),
                        "description": "A low value indicates a large number of outliers in your dataset. This is based on distance from the center of 20 clusters as constructed via KNN.",
                        "warning": col_stats['lof_based_outlier_score_warning']
                    })
                    metrics.append({
                        "type": "score",
                        "name": "Z Outlier Score",
                        "score": to_ten_scale(col_stats['z_test_based_outlier_score']),
                        "description": "A low value indicates a large number of data points are more than 3 standard deviations away from the mean value of this column. This means that this column likely has a large amount of outliers",
                        "warning": col_stats['z_test_based_outlier_score_warning']
                    })
                metrics.append({
                    "type": "score",
                    "name": "Value Distribution",
                    "score": to_ten_scale(col_stats['value_distribution_score']),
                    "description": "A low value indicates the possibility of a large number of outliers, the clusters in which your data is distributed aren't evenly sized.",
                    "warning": col_stats['value_distribution_score_warning']
                })

            if score == 'redundancy_score':
                simple_description = "A low value indicates that the data in this column is highly redundant (useless) for making any sort of prediction. You should make sure that values heavily related to this column are no already expressed in another column (e.g. if this column is a timestamp, make sure you don't have another column representing the exact same time in ISO datetime format)"
                metrics.append({
                    "type": "score",
                    "name": "Matthews Correlation Score",
                    "score": to_ten_scale(col_stats['similarity_score']),
                    "description": "A low value indicates a large number of values in this column being similar to values on the same row of other columns",
                    "warning": col_stats['similarity_score_warning']
                })

            icm[score.replace('_score', '')] = {
                "score": to_ten_scale(col_stats[score]),
                "metrics": metrics,
                "description": simple_description,
                "warning": col_stats[f'{score}_warning']
            }

        return icm

    def get_model_data(self, model_name):
        """
        Load the light metadata of a model and adapt it into the structure
        returned to API consumers.

        :param model_name: name of the model whose
            `<model_name>_light_model_metadata.pickle` file is read
        :return: dict with the model status, the shared metadata keys, 'data_analysis'
            (target / input column metadata), 'model_analysis' (one entry per
            predicted column) and 'force_vectors'
        """
        # SECURITY: pickle.load can execute arbitrary code, the storage folder must
        # only ever contain files written by MindsDB itself.
        with open(
                os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                             f'{model_name}_light_model_metadata.pickle'),
                'rb') as fp:
            lmd = pickle.load(fp)

        # ADAPTOR CODE
        amd = {}

        if lmd['current_phase'] == MODEL_STATUS_TRAINED:
            amd['status'] = 'complete'
        elif lmd['current_phase'] == MODEL_STATUS_ERROR:
            amd['status'] = 'error'
        else:
            amd['status'] = 'training'

        # Shared keys
        for k in [
                'name', 'version', 'is_active', 'data_source', 'predict',
                'accuracy', 'current_phase', 'train_end_at', 'updated_at',
                'created_at', 'data_preparation', 'validation_set_accuracy'
        ]:
            if k == 'predict':
                amd[k] = lmd['predict_columns']
            elif k in lmd:
                amd[k] = lmd[k]
                if k == 'validation_set_accuracy':
                    # The reported accuracy is the rounded validation set accuracy
                    if lmd['validation_set_accuracy'] is not None:
                        amd['accuracy'] = round(lmd['validation_set_accuracy'], 3)
                    else:
                        amd['accuracy'] = None
            else:
                amd[k] = None
                print(f'Key {k} not found in the light model metadata !')

        amd['data_analysis'] = {
            'target_columns_metadata': [],
            'input_columns_metadata': []
        }
        amd['model_analysis'] = []
        # Initialised once, before the loop: resetting it per column used to wipe the
        # force vectors of every previously processed target column.
        amd['force_vectors'] = {}

        malformed_columns = lmd['malformed_columns']['names']

        for col in lmd['model_columns_map'].keys():
            if col in malformed_columns:
                continue

            try:
                icm = self._adapt_column(lmd['column_stats'][col], col)
            except Exception:
                # Columns without usable statistics are left out of the analysis
                continue

            if col not in lmd['predict_columns']:
                # Input column: only the importance score is attached
                if 'column_importances' in lmd and lmd['column_importances'] is not None:
                    icm['importance_score'] = lmd['column_importances'][col]
                amd['data_analysis']['input_columns_metadata'].append(icm)
                continue

            # Histograms for plotting the force vectors
            if 'all_columns_prediction_distribution' in lmd and lmd[
                    'all_columns_prediction_distribution'] is not None:
                normal_distribution = lmd['all_columns_prediction_distribution'][col]
                normal_distribution['type'] = 'categorical'

                missing_distribution = {}
                columnless = lmd['columnless_prediction_distribution'][col]
                for missing_column in columnless:
                    columnless[missing_column]['type'] = 'categorical'
                    missing_distribution[missing_column] = columnless[missing_column]

                amd['force_vectors'][col] = {
                    'normal_data_distribution': normal_distribution,
                    'missing_data_distribution': missing_distribution
                }

            icm['importance_score'] = None
            amd['data_analysis']['target_columns_metadata'].append(icm)

            # Model analysis building for each of the predict columns
            mao = {
                'column_name': col,
                'overall_input_importance': {
                    "type": "categorical",
                    "x": [],
                    "y": []
                },
                "train_accuracy_over_time": {
                    "type": "categorical",
                    "x": [],
                    "y": []
                },
                "test_accuracy_over_time": {
                    "type": "categorical",
                    "x": [],
                    "y": []
                },
                "accuracy_histogram": {
                    "x": [],
                    "y": [],
                    'x_explained': []
                }
            }

            model_accuracy = lmd['model_accuracy'] if 'model_accuracy' in lmd else None

            # This is a check to see if model analysis has run on this data
            if (model_accuracy is not None and 'train' in model_accuracy
                    and 'combined' in model_accuracy['train']
                    and model_accuracy['train']['combined'] is not None):
                train_acc = model_accuracy['train']['combined']
                test_acc = model_accuracy['test']['combined']

                for i in range(len(train_acc)):
                    mao['train_accuracy_over_time']['x'].append(i)
                    mao['train_accuracy_over_time']['y'].append(train_acc[i])

                for i in range(len(test_acc)):
                    mao['test_accuracy_over_time']['x'].append(i)
                    # Append the accuracy value itself (this used to append `[i]`)
                    mao['test_accuracy_over_time']['y'].append(test_acc[i])

            if model_accuracy is not None and lmd['column_importances'] is not None:
                mao['accuracy_histogram']['x'] = [
                    f'{x}' for x in lmd['accuracy_histogram'][col]['buckets']
                ]
                mao['accuracy_histogram']['y'] = lmd['accuracy_histogram'][col][
                    'accuracies']

                # Bucket importances are optional: guard against None / missing column
                buckets_importances = lmd['columns_buckets_importances']
                if buckets_importances is not None and col in buckets_importances:
                    for output_col_bucket in buckets_importances[col]:
                        bucket_stats = buckets_importances[col][output_col_bucket]
                        x_explained_member = [
                            self._adapt_column(bucket_stats[input_col], input_col)
                            for input_col in bucket_stats
                        ]
                        mao['accuracy_histogram']['x_explained'].append(
                            x_explained_member)

                for icol in lmd['model_columns_map'].keys():
                    if icol in malformed_columns:
                        continue
                    if icol in lmd['predict_columns']:
                        continue
                    try:
                        # Look the value up first so a missing importance cannot leave
                        # the x and y lists with different lengths.
                        importance = lmd['column_importances'][icol]
                    except Exception:
                        print(f'No column importances found for {icol} !')
                        continue
                    mao['overall_input_importance']['x'].append(icol)
                    mao['overall_input_importance']['y'].append(importance)

            amd['model_analysis'].append(mao)

        return amd

    def export(self, mindsdb_storage_dir='mindsdb_storage'):
        """
        Export the whole MindsDB storage folder to a zip archive.

        :param mindsdb_storage_dir: path of the archive to create, without the '.zip'
            extension (it is appended by the archiver)
        :return: bool (True/False) True if the storage was exported successfully
        """
        try:
            shutil.make_archive(base_name=mindsdb_storage_dir,
                                format='zip',
                                root_dir=CONFIG.MINDSDB_STORAGE_PATH)
            print(f'Exported mindsdb storage to {mindsdb_storage_dir}.zip')
            return True
        except Exception:
            return False

    def export_model(self, model_name=None):
        """
        Export a single model to `<model_name>.zip` in the current working directory.

        The archive holds the heavy and light metadata pickles and, when present, the
        files found under the `<model_name>_ludwig_data` folder.

        :param model_name: this is the name of the model you wish to export (defaults to the name of the current Predictor)
        :return: bool (True/False) True if the model was exported successfully
        """
        if model_name is None:
            model_name = self.name

        try:
            storage_file = model_name + '.zip'
            with zipfile.ZipFile(storage_file, 'w') as zip_fp:
                for file_name in [
                        model_name + '_heavy_model_metadata.pickle',
                        model_name + '_light_model_metadata.pickle'
                ]:
                    full_path = os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                                             file_name)
                    zip_fp.write(full_path, os.path.basename(full_path))

                # If the backend is ludwig, save the ludwig files.
                # Best effort on purpose: a model may have no ludwig data at all.
                try:
                    ludwig_model_path = os.path.join(
                        CONFIG.MINDSDB_STORAGE_PATH,
                        model_name + '_ludwig_data')
                    for root, dirs, files in os.walk(ludwig_model_path):
                        for file in files:
                            full_path = os.path.join(root, file)
                            # Store the file under its path relative to the storage folder
                            zip_fp.write(
                                full_path,
                                full_path[len(CONFIG.MINDSDB_STORAGE_PATH):])
                except Exception:
                    pass

            print(f'Exported model to {storage_file}')
            return True
        except Exception as e:
            print(e)
            return False

    def load(self, mindsdb_storage_dir='mindsdb_storage.zip'):
        """
        Import a MindsDB storage archive into the storage folder.

        SECURITY: the archive is unpacked as-is, only load archives you trust.

        :param mindsdb_storage_dir: path of the archive that contains your mind
        :return: None; errors raised while unpacking are propagated to the caller
        """
        shutil.unpack_archive(mindsdb_storage_dir,
                              extract_dir=CONFIG.MINDSDB_STORAGE_PATH)

    def load_model(self, model_archive_path=None):
        """
        Import a model archive into the storage folder.

        SECURITY: the archive is unpacked as-is, only load archives you trust.

        :param model_archive_path: this is the path to the archive where your model resides
        :return: None; errors raised while unpacking are propagated to the caller
        """
        shutil.unpack_archive(model_archive_path,
                              extract_dir=CONFIG.MINDSDB_STORAGE_PATH)

    def delete_model(self, model_name=None):
        """
        Delete the heavy and light metadata pickles of a model from the storage folder.

        :param model_name: this is the name of the model you wish to delete (defaults to the name of the current Predictor)
        :return: bool (True/False) True if both files were removed, False as soon as one removal fails
        """
        if model_name is None:
            model_name = self.name

        try:
            for file_name in [
                    model_name + '_heavy_model_metadata.pickle',
                    model_name + '_light_model_metadata.pickle'
            ]:
                os.remove(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, file_name))
            return True
        except Exception as e:
            print(e)
            return False

    def learn(self,
              to_predict,
              from_data=None,
              test_from_data=None,
              group_by=None,
              window_size_samples=None,
              window_size_seconds=None,
              window_size=None,
              order_by=[],
              sample_margin_of_error=CONFIG.DEFAULT_MARGIN_OF_ERROR,
              ignore_columns=[],
              rename_strange_columns=False,
              stop_training_in_x_seconds=None,
              stop_training_in_accuracy=None,
              send_logs=CONFIG.SEND_LOGS,
              backend='ludwig',
              rebuild_model=True,
              use_gpu=True,
              disable_optional_analysis=False):
        """
        Tells the mind to learn to predict a column or columns from the data in 'from_data'

        Mandatory arguments:
        :param to_predict: what column or columns you want to predict
        :param from_data: the data that you want to learn from, this can be either a file, a pandas data frame, or url to a file

        Optional arguments:
        :param test_from_data: If you would like to test this learning from a different data set
        :param backend: stored in the model metadata as 'model_backend'
        :param rebuild_model: when False, the stored metadata of this model is loaded and only the
            run-specific settings (data sources, sampling and stopping options) are overridden
        :param use_gpu: stored in the model metadata as 'use_gpu'
        :param disable_optional_analysis: stored in the model metadata as 'disable_optional_analysis'

        Optional Time series arguments:
        :param order_by: this order by defines the time series, it can be a list. By default it sorts each sort by column in ascending manner, if you want to change this pass a touple ('column_name', 'boolean_for_ascending <default=true>')
        :param group_by: This argument tells the time series that it should learn by grouping rows by a given id
        :param window_size_samples: The number of samples to learn from in the time series
        :param window_size_seconds: stored in the model metadata as 'window_size_seconds'
        :param window_size: legacy alias of window_size_samples, it takes precedence when given

        Optional data transformation arguments:
        :param ignore_columns: it simply removes the columns from the data sources
        :param rename_strange_columns: this tells mindsDB that if columns have special characters, it should try to rename them, this is a legacy argument, as now mindsdb supports any column name

        Optional sampling parameters:
        :param sample_margin_of_error: Maximum expected difference between the true population parameter, such as the mean, and the sample estimate.

        Optional debug arguments:
        :param send_logs: If you want to stream these logs to a server (accepted for compatibility, not used in this method)
        :param stop_training_in_x_seconds: (default None), if set, you want training to finish in a given number of seconds
        :param stop_training_in_accuracy: stored in the model metadata as 'stop_training_in_accuracy'

        :raises ValueError: if no column to predict is given
        :return: None
        """
        # Backwards compatibility of interface
        if window_size is not None:
            window_size_samples = window_size

        from_ds = getDS(from_data)
        test_from_ds = test_from_data if test_from_data is None else getDS(
            test_from_data)

        transaction_type = TRANSACTION_LEARN
        sample_confidence_level = 1 - sample_margin_of_error

        # `ignore_columns` is stored in the metadata handed to the transaction: copy it
        # so neither the shared default list nor the caller's list can be mutated.
        if isinstance(ignore_columns, list):
            ignore_columns = list(ignore_columns)

        # lets turn into lists: predict, order_by and group by
        predict_columns = to_predict if isinstance(to_predict, list) else [to_predict]
        if not isinstance(group_by, list):
            group_by = [group_by] if group_by else []
        if not isinstance(order_by, list):
            order_by = [order_by] if order_by else []

        if len(predict_columns) == 0:
            error = 'You need to specify a column to predict'
            self.log.error(error)
            raise ValueError(error)

        # lets turn order by into tuples if not already
        # each element ('column_name', 'boolean_for_ascending <default=true>')
        order_by = [
            col_name if isinstance(col_name, tuple) else (col_name, True)
            for col_name in order_by
        ]

        is_time_series = len(order_by) > 0

        predict_columns_map = {}
        if rename_strange_columns is False:
            # Use the column names exactly as the data source exposes them
            for predict_col in predict_columns:
                predict_col_as_in_df = from_ds.getColNameAsInDF(predict_col)
                predict_columns_map[predict_col_as_in_df] = predict_col

            predict_columns = list(predict_columns_map.keys())
        else:
            self.log.warning(
                'Note that after version 1.0, the default value for argument rename_strange_columns in MindsDB().learn, will be flipped from True to False, this means that if your data has columns with special characters, MindsDB will not try to rename them by default.'
            )

        heavy_transaction_metadata = {}
        heavy_transaction_metadata['name'] = self.name
        heavy_transaction_metadata['from_data'] = from_ds
        heavy_transaction_metadata['test_from_data'] = test_from_ds
        heavy_transaction_metadata['bucketing_algorithms'] = {}
        heavy_transaction_metadata['predictions'] = None

        light_transaction_metadata = {}
        light_transaction_metadata['version'] = str(__version__)
        light_transaction_metadata['name'] = self.name
        light_transaction_metadata['data_preparation'] = {}
        light_transaction_metadata['model_backend'] = backend
        light_transaction_metadata['predict_columns'] = predict_columns
        light_transaction_metadata[
            'model_columns_map'] = {} if rename_strange_columns else from_ds._col_map
        light_transaction_metadata['model_group_by'] = group_by
        light_transaction_metadata['model_order_by'] = order_by
        light_transaction_metadata['window_size_samples'] = window_size_samples
        light_transaction_metadata['window_size_seconds'] = window_size_seconds
        light_transaction_metadata['model_is_time_series'] = is_time_series
        light_transaction_metadata['data_source'] = from_data
        light_transaction_metadata['type'] = transaction_type
        light_transaction_metadata['ignore_columns'] = ignore_columns
        light_transaction_metadata[
            'sample_margin_of_error'] = sample_margin_of_error
        light_transaction_metadata[
            'sample_confidence_level'] = sample_confidence_level
        light_transaction_metadata[
            'stop_training_in_x_seconds'] = stop_training_in_x_seconds
        light_transaction_metadata[
            'stop_training_in_accuracy'] = stop_training_in_accuracy
        light_transaction_metadata['rebuild_model'] = rebuild_model
        light_transaction_metadata['model_accuracy'] = {
            'train': {},
            'test': {}
        }
        light_transaction_metadata['column_importances'] = None
        light_transaction_metadata['columns_buckets_importances'] = None
        light_transaction_metadata['columnless_prediction_distribution'] = None
        light_transaction_metadata[
            'all_columns_prediction_distribution'] = None
        light_transaction_metadata['use_gpu'] = use_gpu
        light_transaction_metadata['malformed_columns'] = {
            'names': [],
            'indices': []
        }
        light_transaction_metadata[
            'disable_optional_analysis'] = disable_optional_analysis
        light_transaction_metadata['validation_set_accuracy'] = None

        if rebuild_model is False:
            # Keep the freshly built settings, reload the stored metadata of the model and
            # then override only the run-specific values that were actually provided.
            old_lmd = dict(light_transaction_metadata)
            old_hmd = dict(heavy_transaction_metadata)

            # SECURITY: pickle.load can execute arbitrary code, the storage folder must
            # only ever contain files written by MindsDB itself.
            with open(
                    os.path.join(
                        CONFIG.MINDSDB_STORAGE_PATH,
                        light_transaction_metadata['name'] +
                        '_light_model_metadata.pickle'), 'rb') as fp:
                light_transaction_metadata = pickle.load(fp)

            with open(
                    os.path.join(
                        CONFIG.MINDSDB_STORAGE_PATH,
                        heavy_transaction_metadata['name'] +
                        '_heavy_model_metadata.pickle'), 'rb') as fp:
                heavy_transaction_metadata = pickle.load(fp)

            for k in [
                    'data_preparation', 'rebuild_model', 'data_source', 'type',
                    'ignore_columns', 'sample_margin_of_error',
                    'sample_confidence_level', 'stop_training_in_x_seconds',
                    'stop_training_in_accuracy'
            ]:
                if old_lmd[k] is not None:
                    light_transaction_metadata[k] = old_lmd[k]

            for k in ['from_data', 'test_from_data']:
                if old_hmd[k] is not None:
                    heavy_transaction_metadata[k] = old_hmd[k]

        # The transaction is only constructed here; its result is not used by this method
        Transaction(session=self,
                    light_transaction_metadata=light_transaction_metadata,
                    heavy_transaction_metadata=heavy_transaction_metadata,
                    logger=self.log)

    def predict(self,
                when={},
                when_data=None,
                update_cached_model=False,
                use_gpu=True):
        """
        You have a mind trained already and you want to make a prediction

        :param when: use this if you have certain conditions for a single prediction
        :param when_data: (optional) use this when you have data in either a file, a pandas data frame, or url to a file that you want to predict from
        :param update_cached_model: (optional, default:False) accepted for compatibility, it is not used in this method
        :param use_gpu: stored in the transaction metadata as 'use_gpu'

        :return: the `output_data` of the prediction transaction
        """
        transaction_type = TRANSACTION_PREDICT
        when_ds = None if when_data is None else getDS(when_data)

        # lets turn into lists: when
        if when is None:
            when = [None]
        elif isinstance(when, dict):
            # Wrap a copy: the shared default `{}` (or the caller's dict) must never be
            # handed to the transaction, where it could be mutated.
            when = [dict(when)]

        heavy_transaction_metadata = {}
        heavy_transaction_metadata['when_data'] = when_ds
        heavy_transaction_metadata['model_when_conditions'] = when
        heavy_transaction_metadata['name'] = self.name

        light_transaction_metadata = {}
        light_transaction_metadata['name'] = self.name
        light_transaction_metadata['type'] = transaction_type
        light_transaction_metadata['use_gpu'] = use_gpu
        light_transaction_metadata['data_preparation'] = {}

        transaction = Transaction(
            session=self,
            light_transaction_metadata=light_transaction_metadata,
            heavy_transaction_metadata=heavy_transaction_metadata)

        return transaction.output_data
class Predictor: def __init__(self, name, root_folder=CONFIG.MINDSDB_STORAGE_PATH, log_level=CONFIG.DEFAULT_LOG_LEVEL): """ This controller defines the API to a MindsDB 'mind', a mind is an object that can learn and predict from data :param name: the namespace you want to identify this mind instance with :param root_folder: the folder where you want to store this mind or load from :param log_level: the desired log level """ # initialize variables self.name = name self.root_folder = root_folder self.uuid = str(uuid.uuid1()) # initialize log self.log = MindsdbLogger(log_level=log_level, uuid=self.uuid) if CONFIG.CHECK_FOR_UPDATES: try: check_for_updates() except: self.log.warning('Could not check for updates !') if not CONFIG.SAGEMAKER: # If storage path is not writable, raise an exception as this can no longer be if not os.access(CONFIG.MINDSDB_STORAGE_PATH, os.W_OK): error_message = '''Cannot write into storage path, please either set the config variable mindsdb.config.set('MINDSDB_STORAGE_PATH',<path>) or give write access to {folder}''' self.log.warning( error_message.format(folder=CONFIG.MINDSDB_STORAGE_PATH)) raise ValueError( error_message.format(folder=CONFIG.MINDSDB_STORAGE_PATH)) # If storage path is not writable, raise an exception as this can no longer be if not os.access(CONFIG.MINDSDB_STORAGE_PATH, os.R_OK): error_message = '''Cannot read from storage path, please either set the config variable mindsdb.config.set('MINDSDB_STORAGE_PATH',<path>) or give write access to {folder}''' self.log.warning( error_message.format(folder=CONFIG.MINDSDB_STORAGE_PATH)) raise ValueError( error_message.format(folder=CONFIG.MINDSDB_STORAGE_PATH)) def get_models(self): models = [] for fn in os.listdir(CONFIG.MINDSDB_STORAGE_PATH): if '_light_model_metadata.pickle' in fn: model_name = fn.replace('_light_model_metadata.pickle', '') try: amd = self.get_model_data(model_name) model = {} for k in [ 'name', 'version', 'is_active', 'data_source', 'predict', 'status', 
'train_end_at', 'updated_at', 'created_at', 'current_phase', 'accuracy' ]: if k in amd: model[k] = amd[k] else: model[k] = None models.append(model) except Exception as e: print(e) print(traceback.format_exc()) print( f"Can't adapt metadata for model: '{model_name}' when calling `get_models()`" ) return models def _adapt_column(self, col_stats, col): icm = {} icm['column_name'] = col icm['data_type'] = col_stats['data_type'] icm['data_subtype'] = col_stats['data_subtype'] icm['data_type_distribution'] = { 'type': "categorical", 'x': [], 'y': [] } for k in col_stats['data_type_dist']: icm['data_type_distribution']['x'].append(k) icm['data_type_distribution']['y'].append( col_stats['data_type_dist'][k]) icm['data_subtype_distribution'] = { 'type': "categorical", 'x': [], 'y': [] } for k in col_stats['data_subtype_dist']: icm['data_subtype_distribution']['x'].append(k) icm['data_subtype_distribution']['y'].append( col_stats['data_subtype_dist'][k]) icm['data_distribution'] = {} icm['data_distribution']['data_histogram'] = { "type": "categorical", 'x': [], 'y': [] } icm['data_distribution']['clusters'] = [{"group": [], "members": []}] for i in range(len(col_stats['histogram']['x'])): icm['data_distribution']['data_histogram']['x'].append( col_stats['histogram']['x'][i]) icm['data_distribution']['data_histogram']['y'].append( col_stats['histogram']['y'][i]) scores = ['consistency_score', 'redundancy_score', 'variability_score'] for score in scores: metrics = [] if score == 'consistency_score': simple_description = "A low value indicates the data is not very consistent, it's either missing a lot of valus or the type (e.g. number, text, category, date) of values varries quite a lot." metrics.append({ "type": "score", "name": "Type Distribution", "score": col_stats['data_type_distribution_score'], #"description": col_stats['data_type_distribution_score_description'], "description": "A low value indicates that we can't consistently determine a single data type (e.g. 
number, text, category, date) for most values in this column", "warning": col_stats['data_type_distribution_score_warning'] }) metrics.append({ "type": "score", "score": col_stats['empty_cells_score'], "name": "Empty Cells", #"description": col_stats['empty_cells_score_description'], "description": "A low value indicates that a lot of the values in this column are empty or null. A value of 10 means no cell is missing data, a value of 0 means no cell has any data.", "warning": col_stats['empty_cells_score_warning'] }) if 'duplicates_score' in col_stats: metrics.append({ "type": "score", "name": "Value Duplication", "score": col_stats['duplicates_score'], #"description": col_stats['duplicates_score_description'], "description": "A low value indicates that a lot of the values in this columns are duplicates, as in, the same value shows up more than once in the column. This is not necessarily bad and could be normal for certain data types.", "warning": col_stats['duplicates_score_warning'] }) if score == 'variability_score': simple_description = "A low value indicates a high possibility of some noise affecting your data collection process. This could mean that the values for this column are not collected or processed correctly." if 'lof_based_outlier_score' in col_stats and 'z_test_based_outlier_score' in col_stats: metrics.append({ "type": "score", "name": "Z Outlier Score", "score": col_stats['lof_based_outlier_score'], #"description": col_stats['lof_based_outlier_score_description'], "description": "A low value indicates a large number of outliers in your dataset. 
This is based on distance from the center of 20 clusters as constructed via KNN.", "warning": col_stats['lof_based_outlier_score_warning'] }) metrics.append({ "type": "score", "name": "Z Outlier Score", "score": col_stats['z_test_based_outlier_score'], #"description": col_stats['z_test_based_outlier_score_description'], "description": "A low value indicates a large number of data points are more than 3 standard deviations away from the mean value of this column. This means that this column likely has a large amount of outliers", "warning": col_stats['z_test_based_outlier_score_warning'] }) metrics.append({ "type": "score", "name": "Value Distribution", "score": col_stats['value_distribution_score'], #"description": col_stats['value_distribution_score_description'], "description": "A low value indicates the possibility of a large number of outliers, the clusters in which your data is distributed aren't evenly sized.", "warning": col_stats['value_distribution_score_warning'] }) if score == 'redundancy_score': # CLF based score to be included here once we find a faster way of computing it... similarity_score_based_most_correlated_column = col_stats[ 'most_similar_column_name'] simple_description = f"A low value indicates that the data in this column is highly redundant (useless) for making any sort of prediction. You should make sure that values heavily related to this column are not already expressed in the \"{similarity_score_based_most_correlated_column}\" column (e.g. 
def get_model_data(self, model_name=None, lmd=None):
    """
    Adapt a model's light metadata into the dictionary served to API clients.

    :param model_name: name of the model to describe (defaults to the name of the current Predictor)
    :param lmd: an already loaded light model metadata dict; when None it is
        unpickled from `<model_name>_light_model_metadata.pickle` in the storage folder

    :return: dict holding the shared model keys, 'status', 'data_analysis',
        'model_analysis' and 'force_vectors'
    """
    if model_name is None:
        model_name = self.name

    if lmd is None:
        metadata_path = os.path.join(
            CONFIG.MINDSDB_STORAGE_PATH,
            f'{model_name}_light_model_metadata.pickle')
        with open(metadata_path, 'rb') as fp:
            lmd = pickle.load(fp)

    # ADAPTOR CODE
    amd = {}

    if lmd['current_phase'] == MODEL_STATUS_TRAINED:
        amd['status'] = 'complete'
    elif lmd['current_phase'] == MODEL_STATUS_ERROR:
        amd['status'] = 'error'
    else:
        amd['status'] = 'training'

    # Shared keys
    for k in [
            'name', 'version', 'is_active', 'data_source', 'predict',
            'current_phase', 'train_end_at', 'updated_at', 'created_at',
            'data_preparation', 'validation_set_accuracy'
    ]:
        if k == 'predict':
            amd[k] = lmd['predict_columns']
        elif k in lmd:
            amd[k] = lmd[k]
            if k == 'validation_set_accuracy':
                if lmd['validation_set_accuracy'] is not None:
                    amd['accuracy'] = round(lmd['validation_set_accuracy'], 3)
                else:
                    amd['accuracy'] = None
        else:
            amd[k] = None

    amd['data_analysis'] = {
        'target_columns_metadata': [],
        'input_columns_metadata': []
    }
    amd['model_analysis'] = []
    # Created once, before the loop: re-creating it per column discarded the
    # force vectors of every target column except the last one processed.
    amd['force_vectors'] = {}

    model_accuracy = lmd.get('model_accuracy')
    column_importances = lmd.get('column_importances')

    for col in lmd['model_columns_map'].keys():
        if col in lmd['columns_to_ignore']:
            continue

        try:
            icm = self._adapt_column(lmd['column_stats'][col], col)
        except Exception:
            # Best effort: still report the column, just without statistics
            icm = {'column_name': col}

        if col not in lmd['predict_columns']:
            if column_importances is not None:
                icm['importance_score'] = column_importances[col]
            amd['data_analysis']['input_columns_metadata'].append(icm)
            continue

        # Histograms for plotting the force vectors
        if lmd.get('all_columns_prediction_distribution') is not None:
            force_vector = {}
            force_vector['normal_data_distribution'] = lmd[
                'all_columns_prediction_distribution'][col]
            force_vector['normal_data_distribution']['type'] = 'categorical'

            force_vector['missing_data_distribution'] = {}
            columnless = lmd['columnless_prediction_distribution'][col]
            for missing_column in columnless:
                force_vector['missing_data_distribution'][
                    missing_column] = columnless[missing_column]
                force_vector['missing_data_distribution'][missing_column][
                    'type'] = 'categorical'
            amd['force_vectors'][col] = force_vector

        icm['importance_score'] = None
        amd['data_analysis']['target_columns_metadata'].append(icm)

        if 'confusion_matrices' in lmd and col in lmd['confusion_matrices']:
            confusion_matrix = lmd['confusion_matrices'][col]
        else:
            confusion_matrix = None

        # Model analysis building for each of the predict columns
        mao = {
            'column_name': col,
            'overall_input_importance': {
                "type": "categorical",
                "x": [],
                "y": []
            },
            "train_accuracy_over_time": {
                "type": "categorical",
                "x": [],
                "y": []
            },
            "test_accuracy_over_time": {
                "type": "categorical",
                "x": [],
                "y": []
            },
            "accuracy_histogram": {
                "x": [],
                "y": [],
                'x_explained': []
            },
            "confusion_matrix": confusion_matrix
        }

        # This is a check to see if model analysis has run on this data
        if (model_accuracy is not None and 'train' in model_accuracy
                and 'combined' in model_accuracy['train']
                and model_accuracy['train']['combined'] is not None):
            train_acc = model_accuracy['train']['combined']
            test_acc = model_accuracy['test']['combined']

            for i, accuracy in enumerate(train_acc):
                mao['train_accuracy_over_time']['x'].append(i)
                mao['train_accuracy_over_time']['y'].append(accuracy)

            for i, accuracy in enumerate(test_acc):
                mao['test_accuracy_over_time']['x'].append(i)
                # Store the accuracy itself (this used to append `[i]`)
                mao['test_accuracy_over_time']['y'].append(accuracy)

        if model_accuracy is not None and column_importances is not None:
            mao['accuracy_histogram']['x'] = [
                f'{x}' for x in lmd['accuracy_histogram'][col]['buckets']
            ]
            mao['accuracy_histogram']['y'] = lmd['accuracy_histogram'][col][
                'accuracies']

            buckets_importances = lmd['columns_buckets_importances']
            if buckets_importances is not None and col in buckets_importances:
                for output_col_bucket in buckets_importances[col]:
                    bucket = buckets_importances[col][output_col_bucket]
                    x_explained_member = [
                        self._adapt_column(bucket[input_col], input_col)
                        for input_col in bucket
                    ]
                    mao['accuracy_histogram']['x_explained'].append(
                        x_explained_member)

            for icol in lmd['model_columns_map'].keys():
                if icol in lmd['columns_to_ignore']:
                    continue
                if icol in lmd['predict_columns']:
                    continue
                # Resolve the importance before touching `x`, so a missing
                # entry cannot leave `x` and `y` with different lengths.
                try:
                    importance = round(column_importances[icol], 1)
                except Exception:
                    print(f'No column importances found for {icol} !')
                    continue
                mao['overall_input_importance']['x'].append(icol)
                mao['overall_input_importance']['y'].append(importance)

        amd['model_analysis'].append(mao)

    return amd

def export(self, mindsdb_storage_dir='mindsdb_storage'):
    """
    If you want to export this mindsdb's instance storage to a file

    :param mindsdb_storage_dir: base path of the archive to create, '.zip' is appended to it

    :return: bool (True/False) True if the storage folder was exported successfully
    """
    try:
        shutil.make_archive(base_name=mindsdb_storage_dir,
                            format='zip',
                            root_dir=CONFIG.MINDSDB_STORAGE_PATH)
        print(f'Exported mindsdb storage to {mindsdb_storage_dir}.zip')
        return True
    except Exception as e:
        # Report the reason, like `export_model` does, instead of failing silently
        print(e)
        return False

def export_model(self, model_name=None):
    """
    If you want to export a model to a file

    :param model_name: this is the name of the model you wish to export (defaults to the name of the current Predictor)

    :return: bool (True/False) True if the model was exported successfully
    """
    if model_name is None:
        model_name = self.name
    try:
        storage_file = model_name + '.zip'
        with zipfile.ZipFile(storage_file, 'w') as zip_fp:
            for file_name in [
                    model_name + '_heavy_model_metadata.pickle',
                    model_name + '_light_model_metadata.pickle',
                    model_name + '_lightwood_data'
            ]:
                full_path = os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                                         file_name)
                zip_fp.write(full_path, os.path.basename(full_path))

            # If the backend is ludwig, save the ludwig files
            try:
                ludwig_model_path = os.path.join(
                    CONFIG.MINDSDB_STORAGE_PATH, model_name + '_ludwig_data')
                for root, dirs, files in os.walk(ludwig_model_path):
                    for file in files:
                        full_path = os.path.join(root, file)
                        # Archive name is the path relative to the storage folder
                        zip_fp.write(
                            full_path,
                            full_path[len(CONFIG.MINDSDB_STORAGE_PATH):])
            except Exception:
                # The ludwig files are optional, keep the export best-effort
                pass

        print(f'Exported model to {storage_file}')
        return True
    except Exception as e:
        print(e)
        return False
def load(self, model_archive_path):
    """
    If you want to import one or more predictors from an archive

    The archive is unpacked into the MindsDB storage folder and the save paths
    stored in the light metadata of every newly added model are re-pointed to
    that folder.

    :param model_archive_path: full path of the archive that contains your mindsdb predictor(s)

    :return: bool, True once the archive was imported (errors are raised, not returned)
    """
    storage_path = CONFIG.MINDSDB_STORAGE_PATH
    previous_models = os.listdir(storage_path)
    shutil.unpack_archive(model_archive_path, extract_dir=storage_path)

    # Only files that did not exist before unpacking are treated as imported
    new_model_files = set(os.listdir(storage_path)) - set(previous_models)
    model_names = []
    for file in new_model_files:
        if '_light_model_metadata.pickle' in file:
            model_names.append(
                file.replace('_light_model_metadata.pickle', ''))

    for model_name in model_names:
        metadata_path = os.path.join(
            storage_path, model_name + '_light_model_metadata.pickle')

        # NOTE(review): this unpickles data that came out of the archive,
        # only import archives from a trusted source.
        with open(metadata_path, 'rb') as fp:
            lmd = pickle.load(fp)

        if 'ludwig_data' in lmd and 'ludwig_save_path' in lmd['ludwig_data']:
            lmd['ludwig_data']['ludwig_save_path'] = str(
                os.path.join(
                    storage_path,
                    os.path.basename(
                        lmd['ludwig_data']['ludwig_save_path'])))

        if 'lightwood_data' in lmd and 'save_path' in lmd['lightwood_data']:
            lmd['lightwood_data']['save_path'] = str(
                os.path.join(
                    storage_path,
                    os.path.basename(lmd['lightwood_data']['save_path'])))

        with open(metadata_path, 'wb') as fp:
            pickle.dump(lmd, fp, protocol=pickle.HIGHEST_PROTOCOL)

    return True

def load_model(self, model_archive_path=None):
    """
    If you want to load a model from a file

    :param model_archive_path: this is the path to the archive where your model resides

    :return: bool, True once the model was imported (errors are raised, not returned)
    """
    return self.load(model_archive_path)

def rename_model(self, old_model_name, new_model_name):
    """
    If you want to rename a stored model

    :param old_model_name: this is the name of the model you wish to rename
    :param new_model_name: this is the new name of the model

    :return: bool (True/False) True if the model was renamed successfully
    """
    if old_model_name == new_model_name:
        return True

    storage_path = CONFIG.MINDSDB_STORAGE_PATH

    # A model has the data of at least one backend, move whichever exists
    moved_a_backend = False
    for extension in ['_lightwood_data', '_ludwig_data']:
        try:
            shutil.move(
                os.path.join(storage_path, old_model_name + extension),
                os.path.join(storage_path, new_model_name + extension))
            moved_a_backend = True
        except Exception:
            pass

    if not moved_a_backend:
        return False

    old_light_path = os.path.join(
        storage_path, old_model_name + '_light_model_metadata.pickle')
    old_heavy_path = os.path.join(
        storage_path, old_model_name + '_heavy_model_metadata.pickle')

    with open(old_light_path, 'rb') as fp:
        lmd = pickle.load(fp)

    with open(old_heavy_path, 'rb') as fp:
        hmd = pickle.load(fp)

    lmd['name'] = new_model_name
    hmd['name'] = new_model_name

    # Point the stored save paths at the renamed backend data
    renamed_one_backend = False
    try:
        lmd['ludwig_data']['ludwig_save_path'] = lmd['ludwig_data'][
            'ludwig_save_path'].replace(old_model_name, new_model_name)
        renamed_one_backend = True
    except Exception:
        pass

    try:
        lmd['lightwood_data']['save_path'] = lmd['lightwood_data'][
            'save_path'].replace(old_model_name, new_model_name)
        renamed_one_backend = True
    except Exception:
        pass

    if not renamed_one_backend:
        return False

    with open(
            os.path.join(storage_path,
                         new_model_name + '_light_model_metadata.pickle'),
            'wb') as fp:
        pickle.dump(lmd, fp, protocol=pickle.HIGHEST_PROTOCOL)

    with open(
            os.path.join(storage_path,
                         new_model_name + '_heavy_model_metadata.pickle'),
            'wb') as fp:
        pickle.dump(hmd, fp, protocol=pickle.HIGHEST_PROTOCOL)

    # The old metadata is removed only after the new files were written
    os.remove(old_light_path)
    os.remove(old_heavy_path)
    return True
def delete_model(self, model_name=None):
    """
    If you want to delete a stored model

    :param model_name: this is the name of the model you wish to delete (defaults to the name of the current Predictor)

    :return: bool (True/False) True if the model metadata files were removed successfully
    """
    # Resolve the default first: the metadata path below is built from the
    # name, so a None name used to raise a TypeError before it was replaced.
    if model_name is None:
        model_name = self.name

    with open(
            os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                         model_name + '_light_model_metadata.pickle'),
            'rb') as fp:
        lmd = pickle.load(fp)

    # Backend data is removed on a best-effort basis, a model normally has
    # the data of only one of the two backends.
    try:
        os.remove(lmd['lightwood_data']['save_path'])
    except Exception:
        pass

    try:
        shutil.rmtree(lmd['ludwig_data']['ludwig_save_path'])
    except Exception:
        pass

    try:
        for file_name in [
                model_name + '_heavy_model_metadata.pickle',
                model_name + '_light_model_metadata.pickle'
        ]:
            os.remove(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, file_name))
        return True
    except Exception as e:
        print(e)
        return False

def analyse_dataset(self, from_data, sample_margin_of_error=0.005):
    """
    Analyse the particular dataset being given

    :param from_data: the data to analyse, it is turned into a data source via `getDS`
    :param sample_margin_of_error: margin of error stored in the transaction
        metadata, the confidence level is set to `1 - sample_margin_of_error`

    :return: the adapted model data built by `get_model_data` from the metadata of the analysis transaction
    """
    from_ds = getDS(from_data)
    transaction_type = TRANSACTION_ANALYSE
    sample_confidence_level = 1 - sample_margin_of_error

    heavy_transaction_metadata = {}
    heavy_transaction_metadata['name'] = self.name
    heavy_transaction_metadata['from_data'] = from_ds

    light_transaction_metadata = {}
    light_transaction_metadata['version'] = str(__version__)
    light_transaction_metadata['name'] = self.name
    light_transaction_metadata['model_columns_map'] = from_ds._col_map
    light_transaction_metadata['type'] = transaction_type
    light_transaction_metadata[
        'sample_margin_of_error'] = sample_margin_of_error
    light_transaction_metadata[
        'sample_confidence_level'] = sample_confidence_level
    light_transaction_metadata['model_is_time_series'] = False
    light_transaction_metadata['model_group_by'] = []
    light_transaction_metadata['model_order_by'] = []
    light_transaction_metadata['columns_to_ignore'] = []
    light_transaction_metadata['data_preparation'] = {}
    light_transaction_metadata['predict_columns'] = []
    light_transaction_metadata['empty_columns'] = []

    light_transaction_metadata['handle_foreign_keys'] = True
    light_transaction_metadata['force_categorical_encoding'] = []
    light_transaction_metadata['handle_text_as_categorical'] = False

    Transaction(session=self,
                light_transaction_metadata=light_transaction_metadata,
                heavy_transaction_metadata=heavy_transaction_metadata,
                logger=self.log)

    # The same metadata dict that was handed to the transaction is adapted here
    return self.get_model_data(model_name=None,
                               lmd=light_transaction_metadata)
def learn(self,
          to_predict,
          from_data,
          test_from_data=None,
          group_by=None,
          window_size=None,
          order_by=None,
          sample_margin_of_error=0.005,
          ignore_columns=None,
          stop_training_in_x_seconds=None,
          stop_training_in_accuracy=None,
          backend='lightwood',
          rebuild_model=True,
          use_gpu=None,
          disable_optional_analysis=False,
          equal_accuracy_for_all_output_categories=True,
          output_categories_importance_dictionary=None,
          unstable_parameters_dict=None):
    """
    Learn to predict a column or columns from the data in 'from_data'

    Mandatory arguments:
    :param to_predict: the column, or list of columns, you want to predict
    :param from_data: the data to learn from, it is turned into a data source via `getDS`

    Optional arguments:
    :param test_from_data: a separate data set to test the learning on

    Optional Time series arguments:
    :param order_by: column(s) defining the time series; bare column names are
        sorted ascending, pass a tuple ('column_name', ascending_bool) to change that
    :param group_by: column(s) identifying the groups the time series is learned by
    :param window_size: stored in the metadata as 'window_size'

    Optional data transformation arguments:
    :param ignore_columns: columns stored as 'columns_to_ignore'

    Optional sampling parameters:
    :param sample_margin_of_error: stored in the metadata, the confidence level is `1 - sample_margin_of_error`

    Optional debug arguments:
    :param stop_training_in_x_seconds: (default None) stored in the metadata as the training time limit

    Remaining arguments (backend, rebuild_model, use_gpu, ...) are copied into
    the transaction metadata; `unstable_parameters_dict` may override the
    experimental flags listed in the body.

    :return:
    """
    if order_by is None:
        order_by = []
    if ignore_columns is None:
        ignore_columns = []
    if unstable_parameters_dict is None:
        unstable_parameters_dict = {}

    from_ds = getDS(from_data)
    test_from_ds = None if test_from_data is None else getDS(test_from_data)

    transaction_type = TRANSACTION_LEARN
    sample_confidence_level = 1 - sample_margin_of_error

    # lets turn into lists: predict, order_by and group by
    if type(to_predict) is list:
        predict_columns = to_predict
    else:
        predict_columns = [to_predict]

    if type(group_by) is not list:
        group_by = [group_by] if group_by else []

    if type(order_by) is not list:
        order_by = [order_by] if order_by else []

    if len(predict_columns) == 0:
        error = 'You need to specify a column to predict'
        self.log.error(error)
        raise ValueError(error)

    # lets turn order by into tuples if not already
    # each element ('column_name', 'boolean_for_ascending <default=true>')
    order_by = [
        entry if type(entry) is tuple else (entry, True) for entry in order_by
    ]

    is_time_series = len(order_by) > 0

    # We don't implement "name" as a concept in mindsdb data sources, it is
    # only available for files. The server doesn't handle non-file data
    # sources at the moment, so this shouldn't prove an issue; once datasources
    # such as s3 and databases are supported "name" needs to become a concept.
    if type(from_data) == str:
        data_source_name = from_data
    else:
        data_source_name = 'Unkown'

    heavy_transaction_metadata = {
        'name': self.name,
        'from_data': from_ds,
        'test_from_data': test_from_ds,
        'bucketing_algorithms': {},
        'predictions': None,
        'model_backend': backend,
    }

    if output_categories_importance_dictionary is None:
        output_categories_importance_dictionary = {}

    light_transaction_metadata = {
        'version': str(__version__),
        'name': self.name,
        'data_preparation': {},
        'predict_columns': predict_columns,
        'model_columns_map': from_ds._col_map,
        'model_group_by': group_by,
        'model_order_by': order_by,
        'model_is_time_series': is_time_series,
        'data_source': data_source_name,
        'type': transaction_type,
        'window_size': window_size,
        'sample_margin_of_error': sample_margin_of_error,
        'sample_confidence_level': sample_confidence_level,
        'stop_training_in_x_seconds': stop_training_in_x_seconds,
        'rebuild_model': rebuild_model,
        'model_accuracy': {'train': {}, 'test': {}},
        'column_importances': None,
        'columns_buckets_importances': None,
        'columnless_prediction_distribution': None,
        'all_columns_prediction_distribution': None,
        'use_gpu': use_gpu,
        'columns_to_ignore': ignore_columns,
        'disable_optional_analysis': disable_optional_analysis,
        'validation_set_accuracy': None,
        'lightwood_data': {},
        'ludwig_data': {},
        'weight_map': {},
        'confusion_matrices': {},
        'empty_columns': [],
        'equal_accuracy_for_all_output_categories':
        equal_accuracy_for_all_output_categories,
        'output_categories_importance_dictionary':
        output_categories_importance_dictionary,
    }

    # Experimental flags: the caller's value when given, else the default
    unstable_defaults = (
        ('skip_model_training', False),
        ('skip_stats_generation', False),
        ('optimize_model', False),
        ('force_disable_cache', False),
        ('force_categorical_encoding', []),
        ('handle_foreign_keys', False),
        ('handle_text_as_categorical', False),
        ('use_selfaware_model', True),
    )
    for flag, fallback in unstable_defaults:
        if flag in unstable_parameters_dict:
            light_transaction_metadata[flag] = unstable_parameters_dict[flag]
        else:
            light_transaction_metadata[flag] = fallback

    if rebuild_model is False:
        # Keep what this call asked for, then continue from the stored metadata
        requested_lmd = dict(light_transaction_metadata)
        requested_hmd = dict(heavy_transaction_metadata)

        light_path = os.path.join(
            CONFIG.MINDSDB_STORAGE_PATH,
            requested_lmd['name'] + '_light_model_metadata.pickle')
        with open(light_path, 'rb') as fp:
            light_transaction_metadata = pickle.load(fp)

        heavy_path = os.path.join(
            CONFIG.MINDSDB_STORAGE_PATH,
            requested_hmd['name'] + '_heavy_model_metadata.pickle')
        with open(heavy_path, 'rb') as fp:
            heavy_transaction_metadata = pickle.load(fp)

        # Values of this call override the stored ones, unless they are None
        for k in [
                'data_preparation', 'rebuild_model', 'data_source', 'type',
                'columns_to_ignore', 'sample_margin_of_error',
                'sample_confidence_level', 'stop_training_in_x_seconds'
        ]:
            if requested_lmd[k] is not None:
                light_transaction_metadata[k] = requested_lmd[k]

        for k in ['from_data', 'test_from_data']:
            if requested_hmd[k] is not None:
                heavy_transaction_metadata[k] = requested_hmd[k]

    Transaction(session=self,
                light_transaction_metadata=light_transaction_metadata,
                heavy_transaction_metadata=heavy_transaction_metadata,
                logger=self.log)
def test(self,
         when_data,
         accuracy_score_functions,
         score_using='predicted_value',
         predict_args=None):
    """
    Predict from `when_data` and score every predicted column

    :param when_data: the data to predict from, passed on to `predict`
    :param accuracy_score_functions: a single function, or a dictionary of the form `{f'{target_name}': acc_func}` when there are multiple targets
    :param score_using: which value of the target's `explanation` is handed to the score function
    :param predict_args: dictionary of arguments to be passed to `predict`, e.g: `predict_args={'use_gpu': True}`

    :return: a dictionary of the form `{f'{target_name}_accuracy': accuracy_func_return}`, e.g. {'rental_price_accuracy':0.99}
    """
    if predict_args is None:
        predict_args = {}

    predictions = self.predict(when_data=when_data, **predict_args)

    metadata_path = os.path.join(
        CONFIG.MINDSDB_STORAGE_PATH,
        f'{self.name}_light_model_metadata.pickle')
    with open(metadata_path, 'rb') as fp:
        lmd = pickle.load(fp)

    functions_per_column = type(accuracy_score_functions) == type({})

    accuracy_dict = {}
    for col in lmd['predict_columns']:
        if functions_per_column:
            acc_f = accuracy_score_functions[col]
        else:
            acc_f = accuracy_score_functions

        # Score function receives the observed values first, then the predicted ones
        observed = [x[f'__observed_{col}'] for x in predictions]
        predicted = [x.explanation[col][score_using] for x in predictions]
        accuracy_dict[f'{col}_accuracy'] = acc_f(observed, predicted)

    return accuracy_dict

def predict(self,
            when=None,
            when_data=None,
            update_cached_model=False,
            use_gpu=None,
            unstable_parameters_dict=None,
            backend=None,
            run_confidence_variation_analysis=False):
    """
    You have a mind trained already and you want to make a prediction

    :param when: the conditions of a single prediction
    :param when_data: the data to predict from, it is turned into a data source via `getDS`
    :param update_cached_model: (optional, default:False) accepted as part of the interface, not read by this method
    :param use_gpu: stored in the transaction metadata as 'use_gpu'
    :param unstable_parameters_dict: may carry 'force_disable_cache' (default False)
    :param backend: when given, stored in the transaction metadata as 'model_backend'
    :param run_confidence_variation_analysis: run a confidence variation analysis, only valid when predicting a single data point via `when`

    :return: the `output_data` of the prediction transaction
    """
    if when is None:
        when = {}
    if unstable_parameters_dict is None:
        unstable_parameters_dict = {}

    if run_confidence_variation_analysis is True and when_data is not None:
        error_msg = 'run_confidence_variation_analysis=True is a valid option only when predicting a single data point via `when`'
        self.log.error(error_msg)
        raise ValueError(error_msg)

    transaction_type = TRANSACTION_PREDICT
    if when_data is None:
        when_ds = None
    else:
        when_ds = getDS(when_data)

    # lets turn into lists: when
    if type(when) in [type(None), type({})]:
        when = [when]

    heavy_transaction_metadata = {
        'when_data': when_ds,
        'model_when_conditions': when,
        'name': self.name,
    }
    if backend is not None:
        heavy_transaction_metadata['model_backend'] = backend

    light_transaction_metadata = {
        'name': self.name,
        'type': transaction_type,
        'use_gpu': use_gpu,
        'data_preparation': {},
        'run_confidence_variation_analysis':
        run_confidence_variation_analysis,
    }

    if 'force_disable_cache' in unstable_parameters_dict:
        light_transaction_metadata[
            'force_disable_cache'] = unstable_parameters_dict[
                'force_disable_cache']
    else:
        light_transaction_metadata['force_disable_cache'] = False

    transaction = Transaction(
        session=self,
        light_transaction_metadata=light_transaction_metadata,
        heavy_transaction_metadata=heavy_transaction_metadata)

    return transaction.output_data
class Predictor:

    def __init__(self, name, root_folder=CONFIG.MINDSDB_STORAGE_PATH, log_level=CONFIG.DEFAULT_LOG_LEVEL, log_server=CONFIG.MINDSDB_SERVER_URL):
        """
        This controller defines the API to a MindsDB 'mind', a mind is an object that can learn and predict from data

        :param name: the namespace you want to identify this mind instance with
        :param root_folder: the folder where you want to store this mind or load from
        :param log_level: the desired log level
        :param log_server: the url for a server that can accept log streams

        :raises ValueError: when the storage folder could not be created or is not writable
        """

        # initialize variables
        self.name = name
        self.root_folder = root_folder
        self.uuid = str(uuid.uuid1())

        # initialize log
        self.log = MindsdbLogger(log_level=log_level, send_logs=False, log_url=log_server, uuid=self.uuid)

        # check for updates, in a separate thread
        _thread.start_new_thread(check_for_updates, ())

        # set the mindsdb storage folder
        # NOTE: the folder checked here is CONFIG.MINDSDB_STORAGE_PATH, `root_folder` is only stored
        storage_ok = True  # default state

        # if it does not exist try to create it
        if not os.path.exists(CONFIG.MINDSDB_STORAGE_PATH):
            try:
                self.log.info('{folder} does not exist, creating it now'.format(folder=CONFIG.MINDSDB_STORAGE_PATH))
                path = Path(CONFIG.MINDSDB_STORAGE_PATH)
                path.mkdir(exist_ok=True, parents=True)
            except Exception:
                # `Exception` rather than a bare except, so that Ctrl-C and
                # interpreter shutdown are not swallowed here
                self.log.info(traceback.format_exc())
                storage_ok = False
                self.log.error('MindsDB storage foldler: {folder} does not exist and could not be created'.format(
                    folder=CONFIG.MINDSDB_STORAGE_PATH))

        # If storage path is not writable, raise an exception as this can no longer be
        if not storage_ok or not os.access(CONFIG.MINDSDB_STORAGE_PATH, os.W_OK):
            error_message = '''Cannot write into storage path, please either set the config variable mindsdb.config.set('MINDSDB_STORAGE_PATH',<path>) or give write access to {folder}'''
            raise ValueError(error_message.format(folder=CONFIG.MINDSDB_STORAGE_PATH))

    def export(self, mindsdb_storage_dir='mindsdb_storage'):
        """
        If you want to export this mind to a file

        :param mindsdb_storage_dir: base path of the archive to create, it will be a zip file

        :return: bool (True/False) True if mind was exported successfully
        """
        try:
            shutil.make_archive(mindsdb_storage_dir, 'zip', CONFIG.MINDSDB_STORAGE_PATH)
            return True
        except Exception:
            return False
def get_models(self):
    """
    List a summary of every model found in the MindsDB storage folder

    A model is recognised by its `*_light_model_metadata.pickle` file; its
    summary is read from the result of `get_model_data`.

    :return: list of dicts, one per model, holding the summary keys (None for keys that are missing)
    """
    metadata_suffix = '_light_model_metadata.pickle'
    summary_keys = ['name', 'version', 'is_active', 'data_source', 'predict', 'accuracy', 'status', 'train_end_at', 'updated_at', 'created_at']

    models = []
    for file_name in os.listdir(CONFIG.MINDSDB_STORAGE_PATH):
        if metadata_suffix not in file_name:
            continue

        model_data = self.get_model_data(file_name.replace(metadata_suffix, ''))

        summary = {}
        for key in summary_keys:
            if key not in model_data:
                summary[key] = None
                print(f'Key {key} not found in the light model metadata !')
                continue
            summary[key] = model_data[key]

        models.append(summary)

    return models
def _adapt_column(self, col_stats, col):
    """
    Adapt the statistics of one column into the structure served to API clients

    :param col_stats: dict with the statistics of the column; it must hold the
        type information, the type distributions, the histogram and, for every
        score used below, `<score>`, `<score>_description` and `<score>_warning`
    :param col: the name of the column

    :return: dict with the column metadata; the quality scores are stored
        under 'consistency', 'redundancy' and 'variability'
    """

    def metric(score_name):
        # One entry of a score's `metrics` list
        return {
            "type": "score",
            "score": col_stats[score_name],
            "description": col_stats[f'{score_name}_description'],
            "warning": col_stats[f'{score_name}_warning']
        }

    icm = {}
    icm['column_name'] = col
    icm['data_type'] = col_stats['data_type']
    icm['data_subtype'] = col_stats['data_subtype']

    icm['data_type_distribution'] = {
        'type': "categorical",
        'x': list(col_stats['data_type_dist'].keys()),
        'y': list(col_stats['data_type_dist'].values())
    }

    icm['data_subtype_distribution'] = {
        'type': "categorical",
        'x': list(col_stats['data_subtype_dist'].keys()),
        'y': list(col_stats['data_subtype_dist'].values())
    }

    icm['data_distribution'] = {}
    icm['data_distribution']['data_histogram'] = {
        "type": "categorical",
        'x': [],
        'y': []
    }
    icm['data_distribution']['clusters'] = {}

    histogram = col_stats['histogram']
    for x_value, y_value in zip(histogram['x'], histogram['y']):
        icm['data_distribution']['data_histogram']['x'].append(x_value)
        icm['data_distribution']['data_histogram']['y'].append(y_value)

    scores = ['consistency_score', 'redundancy_score', 'variability_score']
    for score in scores:
        metrics = []
        if score == 'consistency_score':
            metrics.append(metric('data_type_distribution_score'))
            metrics.append(metric('empty_cells_score'))
            # The duplicates score is not computed for every column
            if 'duplicates_score' in col_stats:
                metrics.append(metric('duplicates_score'))

        if score == 'variability_score':
            # The outlier scores are only reported when both are available
            if 'lof_based_outlier_score' in col_stats and 'z_test_based_outlier_score' in col_stats:
                metrics.append(metric('lof_based_outlier_score'))
                metrics.append(metric('z_test_based_outlier_score'))
            metrics.append(metric('value_distribution_score'))

        if score == 'redundancy_score':
            metrics.append(metric('similarity_score'))

        # Strip the suffix: 'consistency_score' -> 'consistency'.
        # (`score.replace('', '_score')` inserted '_score' between every character.)
        icm[score.replace('_score', '')] = {
            'score': col_stats[score],
            'metrics': metrics,
            "description": col_stats[f'{score}_description'],
            "warning": col_stats[f'{score}_warning']
        }

    return icm
def get_model_data(self, model_name):
    """
    Adapt a model's light metadata into the dictionary served to API clients

    :param model_name: name of the model, its metadata is unpickled from
        `<model_name>_light_model_metadata.pickle` in the storage folder

    :return: dict holding the shared model keys, 'data_analysis',
        'model_analysis' and 'force_vectors'
    """
    with open(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, f'{model_name}_light_model_metadata.pickle'), 'rb') as fp:
        lmd = pickle.load(fp)

    # ADAPTOR CODE
    amd = {}

    # Shared keys
    for k in ['name', 'version', 'is_active', 'data_source', 'predict', 'accuracy', 'status', 'train_end_at', 'updated_at', 'created_at', 'data_preparation']:
        if k == 'predict':
            amd[k] = lmd['predict_columns']
        elif k in lmd:
            amd[k] = lmd[k]
        else:
            amd[k] = None
            print(f'Key {k} not found in the light model metadata !')

    amd['data_analysis'] = {
        'target_columns_metadata': [],
        'input_columns_metadata': []
    }
    amd['model_analysis'] = []
    # Created once, before the loop: re-creating it per column discarded the
    # force vectors of every target column except the last one processed.
    amd['force_vectors'] = {}

    for col in lmd['model_columns_map'].keys():
        if col in lmd['malformed_columns']['names']:
            continue

        try:
            icm = self._adapt_column(lmd['column_stats'][col], col)
        except Exception:
            # Report the column that failed (this used to print the undefined
            # name `icol`, which raised a NameError) and skip it
            print(f'Issue processing column: {col} !')
            continue

        if col not in lmd['predict_columns']:
            icm['importance_score'] = lmd['column_importances'][col]
            amd['data_analysis']['input_columns_metadata'].append(icm)
            continue

        # Histograms for plotting the force vectors
        force_vector = {}
        force_vector['normal_data_distribution'] = lmd['all_columns_prediction_distribution'][col]
        force_vector['normal_data_distribution']['type'] = 'categorical'

        force_vector['missing_data_distribution'] = {}
        columnless = lmd['columnless_prediction_distribution'][col]
        for missing_column in columnless:
            force_vector['missing_data_distribution'][missing_column] = columnless[missing_column]
            force_vector['missing_data_distribution'][missing_column]['type'] = 'categorical'
        amd['force_vectors'][col] = force_vector

        icm['importance_score'] = None
        amd['data_analysis']['target_columns_metadata'].append(icm)

        # Model analysis building for each of the predict columns
        mao = {
            'column_name': col,
            'overall_input_importance': {
                "type": "categorical",
                "x": [],
                "y": []
            },
            "train_accuracy_over_time": {
                "type": "categorical",
                "x": [],
                "y": []
            },
            "test_accuracy_over_time": {
                "type": "categorical",
                "x": [],
                "y": []
            },
            "accuracy_histogram": {
                "x": [],
                "y": [],
                'x_explained': []
            }
        }

        train_acc = lmd['model_accuracy']['train']['combined']
        test_acc = lmd['model_accuracy']['test']['combined']

        for i, accuracy in enumerate(train_acc):
            mao['train_accuracy_over_time']['x'].append(i)
            mao['train_accuracy_over_time']['y'].append(accuracy)

        for i, accuracy in enumerate(test_acc):
            mao['test_accuracy_over_time']['x'].append(i)
            # Store the accuracy itself (this used to append `[i]`)
            mao['test_accuracy_over_time']['y'].append(accuracy)

        bucket_stats = lmd['unusual_columns_buckets_importances']
        bucket_importance_keys = list(bucket_stats.keys())

        for incol in lmd['column_importances']:
            mao['accuracy_histogram']['x'].append(incol)
            mao['accuracy_histogram']['y'].append(lmd['column_importances'][incol])

            # Bucket keys are matched to the input column by substring
            incol_bucket_importance_keys = [key for key in bucket_importance_keys if incol in key]
            if len(incol_bucket_importance_keys) > 0:
                sub_group_stats = [
                    self._adapt_column(bucket_stats[sub_incol], sub_incol)
                    for sub_incol in incol_bucket_importance_keys
                ]
            else:
                sub_group_stats = [None]

            mao['accuracy_histogram']['x_explained'].append(sub_group_stats)

        for icol in lmd['model_columns_map'].keys():
            if icol in lmd['malformed_columns']['names']:
                continue
            if icol in lmd['predict_columns']:
                continue
            # Resolve the importance before touching `x`, so a missing
            # entry cannot leave `x` and `y` with different lengths.
            try:
                importance = lmd['column_importances'][icol]
            except Exception:
                print(f'No column importances found for {icol} !')
                continue
            mao['overall_input_importance']['x'].append(icol)
            mao['overall_input_importance']['y'].append(importance)

        amd['model_analysis'].append(mao)
    # ADAPTOR CODE

    return amd

def load(self, mindsdb_storage_dir='mindsdb_storage.zip'):
    """
    If you want to import a mind from a file

    :param mindsdb_storage_dir: this is the full_path of the archive that contains your mind

    :return: bool, True once the archive was unpacked into the storage folder (errors are raised, not returned)
    """
    shutil.unpack_archive(mindsdb_storage_dir, extract_dir=CONFIG.MINDSDB_STORAGE_PATH)
    return True
def learn(self, to_predict, from_data=None, test_from_data=None, group_by=None,
          window_size_samples=None, window_size_seconds=None, window_size=None,
          order_by=None, sample_margin_of_error=CONFIG.DEFAULT_MARGIN_OF_ERROR,
          ignore_columns=None, rename_strange_columns=False,
          stop_training_in_x_seconds=None, stop_training_in_accuracy=None,
          send_logs=CONFIG.SEND_LOGS, backend='ludwig', rebuild_model=True):
    """
    Tells the mind to learn to predict a column or columns from the data in 'from_data'

    Mandatory arguments:
    :param to_predict: what column or columns you want to predict (a single name or a list of names)
    :param from_data: the data that you want to learn from, this can be either a file, a pandas data frame, or url to a file

    Optional arguments:
    :param test_from_data: If you would like to test this learning from a different data set
    :param backend: stored in the transaction metadata as 'model_backend' (default 'ludwig')
    :param rebuild_model: (default True) when False, the light and heavy metadata previously pickled
        under this mind's name are loaded from the storage folder and only a fixed set of
        per-call settings (data source, sampling and stop-training options, ...) is overridden

    Optional Time series arguments:
    :param order_by: this order by defines the time series, it can be a single column or a list.
        By default each column is sorted in ascending manner, if you want to change this pass a
        tuple ('column_name', 'boolean_for_ascending <default=true>'). None or empty means
        the data is not treated as a time series.
    :param group_by: This argument tells the time series that it should learn by grouping rows by a given id
    :param window_size_samples: The number of samples to learn from in the time series
    :param window_size_seconds: stored in the transaction metadata as 'window_size_seconds'
        (presumably a time-based alternative to window_size_samples -- TODO confirm)
    :param window_size: legacy alias; when not None it overrides window_size_samples

    Optional data transformation arguments:
    :param ignore_columns: it simply removes the columns from the data sources (None means no columns)
    :param rename_strange_columns: this tells mindsDB that if columns have special characters, it should try to rename them, this is a legacy argument, as now mindsdb supports any column name

    Optional sampling parameters:
    :param sample_margin_of_error: Maximum expected difference between the true population parameter,
        such as the mean, and the sample estimate (defaults to CONFIG.DEFAULT_MARGIN_OF_ERROR).
        The sample confidence level is derived from it as 1 - sample_margin_of_error.

    Optional debug arguments:
    :param send_logs: If you want to stream these logs to a server (accepted for interface
        compatibility; this method does not read it)
    :param stop_training_in_x_seconds: (default None), if set, you want training to finish in a given number of seconds
    :param stop_training_in_accuracy: (default None), stored in the transaction metadata
        (presumably an accuracy threshold at which training stops -- TODO confirm)

    :raises ValueError: if to_predict is an empty list

    :return: None, the work is carried out by constructing a Transaction
    """

    def _as_list(value):
        # A list is kept as is, any other truthy value is wrapped, falsy values become [].
        if isinstance(value, list):
            return value
        return [value] if value else []

    # The defaults used to be shared list literals; ignore_columns in particular is stored
    # by reference in the metadata below, so every call now gets its own fresh list.
    if ignore_columns is None:
        ignore_columns = []

    # Backwards compatibility of interface
    if window_size is not None:
        window_size_samples = window_size

    from_ds = getDS(from_data)
    test_from_ds = None if test_from_data is None else getDS(test_from_data)

    debug_breakpoint = CONFIG.DEBUG_BREAK_POINT
    transaction_type = TRANSACTION_LEARN
    sample_confidence_level = 1 - sample_margin_of_error
    predict_columns_map = {}

    # lets turn into lists: predict, order_by and group by
    predict_columns = to_predict if isinstance(to_predict, list) else [to_predict]
    group_by = _as_list(group_by)
    order_by = _as_list(order_by)

    if not predict_columns:
        error = 'You need to specify a column to predict'
        self.log.error(error)
        raise ValueError(error)

    # lets turn order by into tuples if not already
    # each element ('column_name', 'boolean_for_ascending <default=true>')
    order_by = [col if isinstance(col, tuple) else (col, True) for col in order_by]

    # Any order_by column makes this a time series problem.
    is_time_series = len(order_by) > 0

    if rename_strange_columns is False:
        # Translate the requested names into the names used inside the data source,
        # remembering the original name for each of them.
        for predict_col in predict_columns:
            predict_col_as_in_df = from_ds.getColNameAsInDF(predict_col)
            predict_columns_map[predict_col_as_in_df] = predict_col

        predict_columns = list(predict_columns_map)
    else:
        self.log.warning('Note that after version 1.0, the default value for argument rename_strange_columns in MindsDB().learn, will be flipped from True to False, this means that if your data has columns with special characters, MindsDB will not try to rename them by default.')

    heavy_transaction_metadata = {
        'name': self.name,
        'from_data': from_ds,
        'test_from_data': test_from_ds,
        'bucketing_algorithms': {},
    }

    light_transaction_metadata = {
        'version': str(__version__),
        'name': self.name,
        'data_preparation': {},
        'model_backend': backend,
        'predict_columns': predict_columns,
        'model_columns_map': {} if rename_strange_columns else from_ds._col_map,
        'model_group_by': group_by,
        'model_order_by': order_by,
        'window_size_samples': window_size_samples,
        'window_size_seconds': window_size_seconds,
        'model_is_time_series': is_time_series,
        'data_source': from_data,
        'type': transaction_type,
        'ignore_columns': ignore_columns,
        'sample_margin_of_error': sample_margin_of_error,
        'sample_confidence_level': sample_confidence_level,
        'stop_training_in_x_seconds': stop_training_in_x_seconds,
        'stop_training_in_accuracy': stop_training_in_accuracy,
        'rebuild_model': rebuild_model,
        'model_accuracy': {'train': {}, 'test': {}},
        'column_importances': None,
        'unusual_columns_buckets_importances': None,
        'columnless_prediction_distribution': None,
        'all_columns_prediction_distribution': None,
        'malformed_columns': {'names': [], 'indices': []},
    }

    if rebuild_model is False:
        # Keep the values requested for this call, then start from the stored metadata.
        old_lmd = dict(light_transaction_metadata)
        old_hmd = dict(heavy_transaction_metadata)

        # NOTE: pickle.load can execute arbitrary code contained in the file; this assumes
        # the MindsDB storage folder only holds files written by MindsDB itself.
        light_path = os.path.join(CONFIG.MINDSDB_STORAGE_PATH, self.name + '_light_model_metadata.pickle')
        with open(light_path, 'rb') as fp:
            light_transaction_metadata = pickle.load(fp)

        heavy_path = os.path.join(CONFIG.MINDSDB_STORAGE_PATH, self.name + '_heavy_model_metadata.pickle')
        with open(heavy_path, 'rb') as fp:
            heavy_transaction_metadata = pickle.load(fp)

        # Only these per-call settings override what was stored, and only when they are set.
        for k in ['data_preparation', 'rebuild_model', 'data_source', 'type', 'ignore_columns', 'sample_margin_of_error', 'sample_confidence_level', 'stop_training_in_x_seconds', 'stop_training_in_accuracy']:
            if old_lmd[k] is not None:
                light_transaction_metadata[k] = old_lmd[k]

        for k in ['from_data', 'test_from_data']:
            if old_hmd[k] is not None:
                heavy_transaction_metadata[k] = old_hmd[k]

    # Constructing the Transaction is what runs the learning; its result is not kept.
    Transaction(session=self, light_transaction_metadata=light_transaction_metadata, heavy_transaction_metadata=heavy_transaction_metadata, logger=self.log, breakpoint=debug_breakpoint)
def predict(self, when={}, when_data=None, update_cached_model=False):
    """
    You have a mind trained already and you want to make a prediction

    :param when: use this if you have certain conditions for a single prediction; a dict
        (or None) is wrapped into a one-element list, any other value is passed on unchanged
    :param when_data: (optional) use this when you have data in either a file, a pandas data frame, or url to a file that you want to predict from
    :param update_cached_model: (optional, default:False) when you run predict for the first time, it loads the latest model in memory, you can force it to do this on this run by flipping it to True
        (NOTE(review): this method does not read the argument -- confirm where it is meant to be used)

    :return: the output_data attribute of the Transaction that was run
        (a TransactionOutputData object according to the original documentation)
    """
    transaction_type = TRANSACTION_PREDICT
    debug_breakpoint = CONFIG.DEBUG_BREAK_POINT
    when_ds = None if when_data is None else getDS(when_data)

    # lets turn into lists: when
    # The `{}` default is kept in the signature because an explicit None is a distinct
    # accepted value (it yields [None]). A dict is copied so that neither the shared
    # default object nor the caller's dict is handed to the transaction by reference.
    if isinstance(when, dict):
        when = [dict(when)]
    elif when is None:
        when = [None]

    heavy_transaction_metadata = {
        'when_data': when_ds,
        'model_when_conditions': when,
        'name': self.name,
    }

    light_transaction_metadata = {
        'name': self.name,
        'type': transaction_type,
        'data_preparation': {},
    }

    # NOTE(review): unlike learn(), no logger is passed here, so the Transaction
    # falls back to whatever its own default is -- confirm this is intended.
    transaction = Transaction(session=self, light_transaction_metadata=light_transaction_metadata, heavy_transaction_metadata=heavy_transaction_metadata, breakpoint=debug_breakpoint)

    return transaction.output_data