def test_evaluate_array(self):
    predictions = {'y': [[1], [2], [3], [4]]}
    col_stats = {
        'y': {
            'typing': {
                'data_type': DATA_TYPES.SEQUENTIAL,
                'data_subtype': DATA_SUBTYPES.ARRAY
            }
        }
    }
    output_columns = ['y']
    data_frame = pd.DataFrame({'y': [1, 2, 3, 5]})
    accuracy = evaluate_accuracy(predictions, data_frame, col_stats, output_columns)
    assert round(accuracy, 2) == 0.8

    predictions = {'y': [[1, 2, 3, 4], [2, 3, 4, 5]]}
    data_frame = pd.DataFrame({'y': [[1, 2, 3, 5], [2, 3, 4, 6]]})
    accuracy = evaluate_accuracy(predictions, data_frame, col_stats, output_columns)
    assert round(accuracy, 2) == 0.8
def test_evaluate_weird_data_types(self):
    for dtype, data_subtype in [(DATA_TYPES.DATE, DATA_SUBTYPES.DATE),
                                (DATA_TYPES.TEXT, DATA_SUBTYPES.SHORT),
                                (DATA_TYPES.SEQUENTIAL, DATA_SUBTYPES.ARRAY),
                                (DATA_TYPES.FILE_PATH, None)]:
        predictions = {'y': ["1", "2", "3", "4"]}
        col_stats = {
            'y': {
                'typing': {
                    'data_type': dtype,
                    'data_subtype': data_subtype
                }
            }
        }
        output_columns = ['y']
        data_frame = pd.DataFrame({'y': ["1", "2", "3", "5"]})
        accuracy = evaluate_accuracy(predictions, data_frame, col_stats, output_columns)
        assert round(accuracy, 2) == 0.75
def test_evaluate_regression(self):
    predictions = {
        'y': [1, 2, 3, 4],
        'y_confidence_range': [
            [0, 2],
            [0, 2],
            [1, 3],
            [4, 4],
        ]
    }
    col_stats = {
        'y': {
            'typing': {
                'data_type': DATA_TYPES.NUMERIC,
                'data_subtype': DATA_SUBTYPES.INT
            }
        }
    }
    output_columns = ['y']
    data_frame = pd.DataFrame({'y': [1, 2, 3, 5]})
    accuracy = evaluate_accuracy(predictions, data_frame, col_stats, output_columns)
    assert round(accuracy, 2) == 0.75
def test_evaluate_two_columns(self):
    predictions = {
        'y1': [1, 2, 3, 4],
        'y1_confidence_range': [
            [0, 2],
            [0, 2],
            [1, 3],
            [4, 4],
        ],
        'y2': [0, 0, 1, 1]
    }
    col_stats = {
        'y1': {
            'typing': {
                'data_type': DATA_TYPES.NUMERIC,
                'data_subtype': DATA_SUBTYPES.FLOAT
            }
        },
        'y2': {
            'typing': {
                'data_type': DATA_TYPES.CATEGORICAL,
                'data_subtype': DATA_SUBTYPES.MULTIPLE
            }
        }
    }
    output_columns = ['y1', 'y2']
    data_frame = pd.DataFrame({'y1': [1, 2, 3, 5], 'y2': [1, 0, 1, 0]})
    accuracy = evaluate_accuracy(predictions, data_frame, col_stats, output_columns)
    assert round(accuracy, 2) == round((0.75 + 0.5) / 2, 2)
def test_evaluate_classification(self):
    predictions = {'y': [1, 2, 3, 4]}
    col_stats = {
        'y': {
            'typing': {
                'data_type': DATA_TYPES.CATEGORICAL,
                'data_subtype': DATA_SUBTYPES.MULTIPLE
            }
        }
    }
    output_columns = ['y']
    data_frame = pd.DataFrame({'y': [1, 2, 3, 5]})
    accuracy = evaluate_accuracy(predictions, data_frame, col_stats, output_columns)
    assert round(accuracy, 2) == 0.75
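# Taken together, the tests above pin down the contract of evaluate_accuracy:
# one score per output column, averaged across columns. Numeric targets with a
# '{col}_confidence_range' entry are scored by coverage (the fraction of true
# values falling inside the predicted range); categorical and similar targets
# are scored as exact-match accuracy. A minimal sketch of that contract,
# checked against the same test data; this is an illustration, not the actual
# mindsdb_native implementation, and it omits the separate element-wise
# comparison that SEQUENTIAL/ARRAY targets receive in test_evaluate_array:
def evaluate_accuracy_sketch(predictions, data_frame, output_columns):
    from sklearn.metrics import accuracy_score

    column_scores = []
    for col in output_columns:
        reals = list(data_frame[col])
        if f'{col}_confidence_range' in predictions:
            # Coverage: test_evaluate_regression expects 3 of 4 truths in range
            ranges = predictions[f'{col}_confidence_range']
            hits = [lo <= real <= hi for real, (lo, hi) in zip(reals, ranges)]
            column_scores.append(sum(hits) / len(hits))
        else:
            # Exact match: test_evaluate_classification expects 3 of 4 correct
            preds = [str(p) for p in predictions[col]]
            column_scores.append(accuracy_score([str(r) for r in reals], preds))
    return sum(column_scores) / len(column_scores)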
def run(self):
    """
    Runs the model on the validation set in order to fit a probabilistic
    model that will evaluate the accuracy of future predictions.
    """
    np.seterr(divide='warn', invalid='warn')

    output_columns = self.transaction.lmd['predict_columns']
    input_columns = [
        col for col in self.transaction.lmd['columns']
        if col not in output_columns
        and col not in self.transaction.lmd['columns_to_ignore']
    ]

    # Make predictions on the validation dataset normally and with various columns missing
    normal_predictions = self.transaction.model_backend.predict('validate')
    normal_predictions_test = self.transaction.model_backend.predict('test')
    normal_accuracy = evaluate_accuracy(
        normal_predictions,
        self.transaction.input_data.validation_df,
        self.transaction.lmd['stats_v2'],
        output_columns,
        backend=self.transaction.model_backend)

    for col in output_columns:
        reals = self.transaction.input_data.validation_df[col]
        preds = normal_predictions[col]

        fails = False
        data_type = self.transaction.lmd['stats_v2'][col]['typing']['data_type']
        data_subtype = self.transaction.lmd['stats_v2'][col]['typing']['data_subtype']

        if data_type == DATA_TYPES.CATEGORICAL:
            if data_subtype == DATA_SUBTYPES.TAGS:
                encoder = self.transaction.model_backend.predictor._mixer.encoders[col]
                if accuracy_score(encoder.encode(reals), encoder.encode(preds)) \
                        <= self.transaction.lmd['stats_v2'][col]['guess_probability']:
                    fails = True
            else:
                if accuracy_score(reals, preds) \
                        <= self.transaction.lmd['stats_v2'][col]['guess_probability']:
                    fails = True
        elif data_type == DATA_TYPES.NUMERIC:
            if r2_score(reals, preds) < 0:
                fails = True

        if fails:
            if not self.transaction.lmd['force_predict']:
                def predict_wrapper(*args, **kwargs):
                    raise Exception('Failed to train model')
                self.session.predict = predict_wrapper
            log.error('Failed to train model to predict {}'.format(col))

    empty_input_predictions = {}
    empty_input_accuracy = {}
    empty_input_predictions_test = {}

    ignorable_input_columns = [
        x for x in input_columns
        if self.transaction.lmd['stats_v2'][x]['typing']['data_type'] != DATA_TYPES.FILE_PATH
        and x not in [y[0] for y in self.transaction.lmd['model_order_by']]
    ]

    for col in ignorable_input_columns:
        empty_input_predictions[col] = self.transaction.model_backend.predict(
            'validate', ignore_columns=[col])
        empty_input_predictions_test[col] = self.transaction.model_backend.predict(
            'test', ignore_columns=[col])
        empty_input_accuracy[col] = evaluate_accuracy(
            empty_input_predictions[col],
            self.transaction.input_data.validation_df,
            self.transaction.lmd['stats_v2'],
            output_columns,
            backend=self.transaction.model_backend)

    # Get some information about the importance of each column
    self.transaction.lmd['column_importances'] = {}
    for col in ignorable_input_columns:
        accuracy_increase = normal_accuracy - empty_input_accuracy[col]
        # Normalize from 0 to 10
        self.transaction.lmd['column_importances'][col] = 10 * max(0, accuracy_increase)

    # Run the Probabilistic Validator
    overall_accuracy_arr = []
    self.transaction.lmd['accuracy_histogram'] = {}
    self.transaction.lmd['confusion_matrices'] = {}
    self.transaction.lmd['accuracy_samples'] = {}
    self.transaction.hmd['probabilistic_validators'] = {}

    self.transaction.lmd['train_data_accuracy'] = {}
    self.transaction.lmd['test_data_accuracy'] = {}
    self.transaction.lmd['valid_data_accuracy'] = {}

    for col in output_columns:
        # Training data accuracy
        predictions = self.transaction.model_backend.predict(
            'predict_on_train_data',
            ignore_columns=self.transaction.lmd['stats_v2']['columns_to_ignore'])
        self.transaction.lmd['train_data_accuracy'][col] = evaluate_accuracy(
            predictions,
            self.transaction.input_data.train_df,
            self.transaction.lmd['stats_v2'],
            [col],
            backend=self.transaction.model_backend)

        # Testing data accuracy
        predictions = self.transaction.model_backend.predict(
            'test',
            ignore_columns=self.transaction.lmd['stats_v2']['columns_to_ignore'])
        self.transaction.lmd['test_data_accuracy'][col] = evaluate_accuracy(
            predictions,
            self.transaction.input_data.test_df,
            self.transaction.lmd['stats_v2'],
            [col],
            backend=self.transaction.model_backend)

        # Validation data accuracy
        predictions = self.transaction.model_backend.predict(
            'validate',
            ignore_columns=self.transaction.lmd['stats_v2']['columns_to_ignore'])
        self.transaction.lmd['valid_data_accuracy'][col] = evaluate_accuracy(
            predictions,
            self.transaction.input_data.validation_df,
            self.transaction.lmd['stats_v2'],
            [col],
            backend=self.transaction.model_backend)

    for col in output_columns:
        pval = ProbabilisticValidator(
            col_stats=self.transaction.lmd['stats_v2'][col],
            col_name=col,
            input_columns=input_columns)
        predictions_arr = [normal_predictions_test] + list(empty_input_predictions_test.values())

        pval.fit(self.transaction.input_data.test_df, predictions_arr,
                 [[ignored_column] for ignored_column in empty_input_predictions_test])
        overall_accuracy, accuracy_histogram, cm, accuracy_samples = pval.get_accuracy_stats()
        overall_accuracy_arr.append(overall_accuracy)

        self.transaction.lmd['accuracy_histogram'][col] = accuracy_histogram
        self.transaction.lmd['confusion_matrices'][col] = cm
        self.transaction.lmd['accuracy_samples'][col] = accuracy_samples
        self.transaction.hmd['probabilistic_validators'][col] = pickle_obj(pval)

    self.transaction.lmd['validation_set_accuracy'] = sum(overall_accuracy_arr) / len(overall_accuracy_arr)
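# A quick worked example of the column-importance normalization in run() above
# (illustrative numbers only, not values from a real run): a column whose
# removal costs 7 accuracy points scores 0.7 on the 0-10 scale, and a column
# whose removal *helps* is clamped to zero rather than going negative.
normal_accuracy_example = 0.80
empty_input_accuracy_example = {'age': 0.73, 'noise_col': 0.82}
importances_example = {
    col: 10 * max(0, normal_accuracy_example - acc)
    for col, acc in empty_input_accuracy_example.items()
}
assert round(importances_example['age'], 2) == 0.7
assert importances_example['noise_col'] == 0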
def run(self):
    """
    Runs the model on the validation set in order to fit a probabilistic
    model that will evaluate the accuracy of future predictions.
    """
    np.seterr(divide='warn', invalid='warn')

    output_columns = self.transaction.lmd['predict_columns']
    input_columns = [
        col for col in self.transaction.lmd['columns']
        if col not in output_columns
        and col not in self.transaction.lmd['columns_to_ignore']
    ]

    # Make predictions on the validation dataset normally and with various columns missing
    normal_predictions = self.transaction.model_backend.predict('validate')
    normal_predictions_test = self.transaction.model_backend.predict('test')
    normal_accuracy = evaluate_accuracy(
        normal_predictions,
        self.transaction.input_data.validation_df,
        self.transaction.lmd['stats_v2'],
        output_columns,
        backend=self.transaction.model_backend)

    for col in output_columns:
        if self.transaction.lmd['tss']['is_timeseries']:
            # For timeseries, only rows flagged for prediction count towards accuracy
            validation_df = self.transaction.input_data.validation_df
            reals = list(validation_df[validation_df['make_predictions'] == True][col])
        else:
            reals = self.transaction.input_data.validation_df[col]
        preds = normal_predictions[col]

        fails = False
        data_type = self.transaction.lmd['stats_v2'][col]['typing']['data_type']
        data_subtype = self.transaction.lmd['stats_v2'][col]['typing']['data_subtype']

        if data_type == DATA_TYPES.CATEGORICAL:
            if data_subtype == DATA_SUBTYPES.TAGS:
                encoder = self.transaction.model_backend.predictor._mixer.encoders[col]
                if balanced_accuracy_score(
                        encoder.encode(reals).argmax(axis=1),
                        encoder.encode(preds).argmax(axis=1)) \
                        <= self.transaction.lmd['stats_v2'][col]['balanced_guess_probability']:
                    fails = True
            else:
                if balanced_accuracy_score(reals, preds) \
                        <= self.transaction.lmd['stats_v2'][col]['balanced_guess_probability']:
                    fails = True
        elif data_type == DATA_TYPES.NUMERIC:
            if r2_score(reals, preds) < 0:
                fails = True

        if fails:
            if not self.transaction.lmd['force_predict']:
                def predict_wrapper(*args, **kwargs):
                    raise Exception('Failed to train model')
                self.session.predict = predict_wrapper
            log.error('Failed to train model to predict {}'.format(col))

    empty_input_predictions = {}
    empty_input_accuracy = {}
    empty_input_predictions_test = {}

    ignorable_input_columns = [
        x for x in input_columns
        if self.transaction.lmd['stats_v2'][x]['typing']['data_type'] != DATA_TYPES.FILE_PATH
        and (not self.transaction.lmd['tss']['is_timeseries']
             or x not in self.transaction.lmd['tss']['order_by'])
    ]

    for col in ignorable_input_columns:
        empty_input_predictions[col] = self.transaction.model_backend.predict(
            'validate', ignore_columns=[col])
        empty_input_predictions_test[col] = self.transaction.model_backend.predict(
            'test', ignore_columns=[col])
        empty_input_accuracy[col] = evaluate_accuracy(
            empty_input_predictions[col],
            self.transaction.input_data.validation_df,
            self.transaction.lmd['stats_v2'],
            output_columns,
            backend=self.transaction.model_backend)

    # Get some information about the importance of each column
    self.transaction.lmd['column_importances'] = {}
    for col in ignorable_input_columns:
        accuracy_increase = normal_accuracy - empty_input_accuracy[col]
        # Normalize from 0 to 10
        self.transaction.lmd['column_importances'][col] = 10 * max(0, accuracy_increase)

    # Run the Probabilistic Validator
    overall_accuracy_arr = []
    self.transaction.lmd['accuracy_histogram'] = {}
    self.transaction.lmd['confusion_matrices'] = {}
    self.transaction.lmd['accuracy_samples'] = {}
    self.transaction.hmd['probabilistic_validators'] = {}

    self.transaction.lmd['train_data_accuracy'] = {}
    self.transaction.lmd['test_data_accuracy'] = {}
    self.transaction.lmd['valid_data_accuracy'] = {}

    for col in output_columns:
        # Training data accuracy
        predictions = self.transaction.model_backend.predict(
            'predict_on_train_data',
            ignore_columns=self.transaction.lmd['stats_v2']['columns_to_ignore'])
        self.transaction.lmd['train_data_accuracy'][col] = evaluate_accuracy(
            predictions,
            self.transaction.input_data.train_df,
            self.transaction.lmd['stats_v2'],
            [col],
            backend=self.transaction.model_backend)

        # Testing data accuracy
        predictions = self.transaction.model_backend.predict(
            'test',
            ignore_columns=self.transaction.lmd['stats_v2']['columns_to_ignore'])
        self.transaction.lmd['test_data_accuracy'][col] = evaluate_accuracy(
            predictions,
            self.transaction.input_data.test_df,
            self.transaction.lmd['stats_v2'],
            [col],
            backend=self.transaction.model_backend)

        # Validation data accuracy
        predictions = self.transaction.model_backend.predict(
            'validate',
            ignore_columns=self.transaction.lmd['stats_v2']['columns_to_ignore'])
        self.transaction.lmd['valid_data_accuracy'][col] = evaluate_accuracy(
            predictions,
            self.transaction.input_data.validation_df,
            self.transaction.lmd['stats_v2'],
            [col],
            backend=self.transaction.model_backend)

    for col in output_columns:
        pval = ProbabilisticValidator(
            col_stats=self.transaction.lmd['stats_v2'][col],
            col_name=col,
            input_columns=input_columns)
        predictions_arr = [normal_predictions_test] + list(empty_input_predictions_test.values())

        pval.fit(self.transaction.input_data.test_df, predictions_arr,
                 [[ignored_column] for ignored_column in empty_input_predictions_test])
        overall_accuracy, accuracy_histogram, cm, accuracy_samples = pval.get_accuracy_stats()
        overall_accuracy_arr.append(overall_accuracy)

        self.transaction.lmd['accuracy_histogram'][col] = accuracy_histogram
        self.transaction.lmd['confusion_matrices'][col] = cm
        self.transaction.lmd['accuracy_samples'][col] = accuracy_samples
        self.transaction.hmd['probabilistic_validators'][col] = pickle_obj(pval)

    self.transaction.lmd['validation_set_accuracy'] = sum(overall_accuracy_arr) / len(overall_accuracy_arr)

    # Conformal prediction confidence estimation
    self.transaction.lmd['stats_v2']['train_std_dev'] = {}
    self.transaction.hmd['label_encoders'] = {}
    self.transaction.hmd['icp'] = {'active': False}

    for target in output_columns:
        data_type = self.transaction.lmd['stats_v2'][target]['typing']['data_type']
        data_subtype = self.transaction.lmd['stats_v2'][target]['typing']['data_subtype']

        is_classification = data_type == DATA_TYPES.CATEGORICAL

        fit_params = {
            'target': target,
            'all_columns': self.transaction.lmd['columns'],
            'columns_to_ignore': []
        }
        fit_params['columns_to_ignore'].extend(self.transaction.lmd['columns_to_ignore'])
        fit_params['columns_to_ignore'].extend([col for col in output_columns if col != target])

        if is_classification:
            if data_subtype != DATA_SUBTYPES.TAGS:
                all_targets = [
                    elt[1][target].values
                    for elt in inspect.getmembers(self.transaction.input_data)
                    if elt[0] in {'test_df', 'train_df', 'validation_df'}
                ]
                all_classes = np.unique(np.concatenate([np.unique(arr) for arr in all_targets]))
                enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
                enc.fit(all_classes.reshape(-1, 1))
                fit_params['one_hot_enc'] = enc
                self.transaction.hmd['label_encoders'][target] = enc
            else:
                fit_params['one_hot_enc'] = None
                self.transaction.hmd['label_encoders'][target] = None

            adapter = ConformalClassifierAdapter
            # MarginErrFunc works better than inverse probability (IPS) here,
            # as IPS would need the complete distribution over all classes
            nc_function = MarginErrFunc()
            nc_class = ClassifierNc
            icp_class = IcpClassifier
        else:
            adapter = ConformalRegressorAdapter
            nc_function = AbsErrorErrFunc()
            nc_class = RegressorNc
            icp_class = IcpRegressor

        if (data_type == DATA_TYPES.NUMERIC or
                (is_classification and data_subtype != DATA_SUBTYPES.TAGS)) \
                and not self.transaction.lmd['tss']['is_timeseries']:
            model = adapter(self.transaction.model_backend.predictor, fit_params=fit_params)
            nc = nc_class(model, nc_function)

            X = deepcopy(self.transaction.input_data.train_df)
            y = X.pop(target)

            if is_classification:
                self.transaction.hmd['icp'][target] = icp_class(nc, smoothing=False)
            else:
                self.transaction.hmd['icp'][target] = icp_class(nc)
                self.transaction.lmd['stats_v2']['train_std_dev'][target] = \
                    self.transaction.input_data.train_df[target].std()

            X = clean_df(X, self.transaction.lmd['stats_v2'], output_columns)
            self.transaction.hmd['icp'][target].fit(X.values, y.values)
            self.transaction.hmd['icp']['active'] = True

            # Calibrate the conformal estimator on the validation set
            X = deepcopy(self.transaction.input_data.validation_df)
            y = X.pop(target).values

            if is_classification:
                if isinstance(enc.categories_[0][0], str):
                    cats = enc.categories_[0].tolist()
                    y = np.array([cats.index(i) for i in y])
                y = y.astype(int)

            X = clean_df(X, self.transaction.lmd['stats_v2'], output_columns)
            self.transaction.hmd['icp'][target].calibrate(X.values, y)
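# The conformal block above follows the standard inductive conformal prediction
# recipe from the nonconformist library (the source of IcpRegressor, RegressorNc,
# AbsErrorErrFunc, etc.): wrap the model in an adapter, fit on training rows,
# calibrate on held-out rows, then request intervals at a chosen significance.
# A minimal, self-contained regression sketch with a plain sklearn model
# standing in for the lightwood predictor (illustrative only):
import numpy as np
from sklearn.linear_model import LinearRegression
from nonconformist.base import RegressorAdapter
from nonconformist.icp import IcpRegressor
from nonconformist.nc import RegressorNc, AbsErrorErrFunc

rng = np.random.RandomState(0)
X = rng.rand(300, 3)
y = X @ np.array([2.0, -1.0, 0.5]) + rng.normal(scale=0.1, size=300)

# Fit on one split, calibrate on another, mirroring how run() uses
# train_df for .fit() and validation_df for .calibrate()
icp = IcpRegressor(RegressorNc(RegressorAdapter(LinearRegression()), AbsErrorErrFunc()))
icp.fit(X[:200], y[:200])
icp.calibrate(X[200:], y[200:])

# Each row gets a [lower, upper] interval that should cover the truth
# roughly 90% of the time at significance=0.1
intervals = icp.predict(X[:5], significance=0.1)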
def train(self):
    if self.transaction.lmd['use_gpu'] is not None:
        lightwood.config.config.CONFIG.USE_CUDA = self.transaction.lmd['use_gpu']

    secondary_type_dict = {}
    if self.transaction.lmd['tss']['is_timeseries']:
        self.transaction.log.debug('Reshaping data into timeseries format, this may take a while!')
        train_df, secondary_type_dict = self._create_timeseries_df(self.transaction.input_data.train_df)
        test_df, _ = self._create_timeseries_df(self.transaction.input_data.test_df)
        self.transaction.log.debug('Done reshaping data into timeseries format!')
    else:
        if self.transaction.lmd['sample_settings']['sample_for_training']:
            sample_margin_of_error = self.transaction.lmd['sample_settings']['sample_margin_of_error']
            sample_confidence_level = self.transaction.lmd['sample_settings']['sample_confidence_level']
            sample_percentage = self.transaction.lmd['sample_settings']['sample_percentage']
            sample_function = self.transaction.hmd['sample_function']

            train_df = sample_function(
                self.transaction.input_data.train_df,
                sample_margin_of_error,
                sample_confidence_level,
                sample_percentage)
            test_df = sample_function(
                self.transaction.input_data.test_df,
                sample_margin_of_error,
                sample_confidence_level,
                sample_percentage)

            sample_size = len(train_df)
            population_size = len(self.transaction.input_data.train_df)
            self.transaction.log.warning(
                f'Training on a sample of {round(sample_size * 100 / population_size, 1)}% of your data, results may be unexpected.')
        else:
            train_df = self.transaction.input_data.train_df
            test_df = self.transaction.input_data.test_df

    lightwood_config = self._create_lightwood_config(secondary_type_dict)

    lightwood_train_ds = lightwood.api.data_source.DataSource(train_df, config=lightwood_config)
    lightwood_test_ds = lightwood_train_ds.make_child(test_df)

    self.transaction.lmd['lightwood_data']['save_path'] = os.path.join(
        CONFIG.MINDSDB_STORAGE_PATH, self.transaction.lmd['name'], 'lightwood_data')
    Path(CONFIG.MINDSDB_STORAGE_PATH).joinpath(self.transaction.lmd['name']).mkdir(
        mode=0o777, exist_ok=True, parents=True)

    logging.getLogger().setLevel(logging.DEBUG)

    predictors_and_accuracies = []

    use_mixers = self.transaction.lmd.get('use_mixers', None)
    if use_mixers is not None:
        if isinstance(use_mixers, list):
            mixer_classes = use_mixers
        else:
            mixer_classes = [use_mixers]
    else:
        mixer_classes = lightwood.mixers.BaseMixer.__subclasses__()

    for mixer_class in mixer_classes:
        lightwood_config['mixer']['kwargs'] = {}
        lightwood_config['mixer']['class'] = mixer_class

        if lightwood_config['mixer']['class'] == lightwood.mixers.NnMixer:
            # Evaluate less often for larger datasets and vice versa
            eval_every_x_epochs = int(round(1 * pow(10, 6) * (1 / len(train_df))))

            # Within some limits
            if eval_every_x_epochs > 200:
                eval_every_x_epochs = 200
            if eval_every_x_epochs < 3:
                eval_every_x_epochs = 3

            kwargs = lightwood_config['mixer']['kwargs']
            kwargs['callback_on_iter'] = self.callback_on_iter
            kwargs['eval_every_x_epochs'] = eval_every_x_epochs / len(mixer_classes)
            kwargs['stop_training_after_seconds'] = self.transaction.lmd['stop_training_in_x_seconds']

        self.predictor = lightwood.Predictor(lightwood_config.copy())
        self.predictor.learn(from_data=lightwood_train_ds, test_data=lightwood_test_ds)
        self.transaction.log.info('[{}] Training accuracy of: {}'.format(
            mixer_class.__name__, self.predictor.train_accuracy))

        validation_predictions = self.predict('validate')
        validation_df = self.transaction.input_data.validation_df
        if self.transaction.lmd['tss']['is_timeseries']:
            validation_df = validation_df[validation_df['make_predictions'].astype(bool) == True]
        validation_accuracy = evaluate_accuracy(
            validation_predictions,
            validation_df,
            self.transaction.lmd['stats_v2'],
            self.transaction.lmd['predict_columns'],
            backend=self,
            # With confidence intervals disabled, r2_score is used for regression
            use_conf_intervals=False)

        predictors_and_accuracies.append((self.predictor, validation_accuracy))

    best_predictor, best_accuracy = max(predictors_and_accuracies, key=lambda x: x[1])

    # Find the predictor that uses NnMixer
    for predictor, accuracy in predictors_and_accuracies:
        if isinstance(predictor._mixer, lightwood.mixers.NnMixer):
            nn_mixer_predictor, nn_mixer_predictor_accuracy = predictor, accuracy
            break
    else:
        nn_mixer_predictor, nn_mixer_predictor_accuracy = None, None

    self.predictor = best_predictor

    # If the difference between the accuracies of the best predictor and the
    # NnMixer predictor is small, prefer the NnMixer predictor
    if nn_mixer_predictor is not None:
        SMALL_ACCURACY_DIFFERENCE = 0.01
        if (best_accuracy - nn_mixer_predictor_accuracy) < SMALL_ACCURACY_DIFFERENCE:
            self.predictor = nn_mixer_predictor

    self.predictor.save(path_to=self.transaction.lmd['lightwood_data']['save_path'])
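# The eval-frequency heuristic in train() above scales inversely with dataset
# size and is clamped to the range [3, 200]. A compact restatement of just that
# formula (the row counts below are arbitrary examples):
def eval_every_x_epochs_sketch(n_rows):
    return min(200, max(3, int(round(1e6 / n_rows))))

assert eval_every_x_epochs_sketch(1000) == 200        # small data: clamped high
assert eval_every_x_epochs_sketch(50000) == 20        # mid-size: 1e6 / 5e4
assert eval_every_x_epochs_sketch(10000000) == 3      # huge data: clamped low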