def test_fil_skl_classification(n_rows, n_columns, n_estimators, max_depth,
                                storage_type, model_class):
    """Check that FIL reproduces a scikit-learn binary classifier.

    A ``model_class`` estimator is fit on 80% of the data returned by
    ``simulate_data``, loaded into ``ForestInference`` through
    ``load_from_sklearn``, and its class predictions, class probabilities
    and accuracy on the remaining 20% are compared with scikit-learn's.

    Parameters
    ----------
    n_rows, n_columns
        Forwarded to ``simulate_data``.
    n_estimators, max_depth
        Forwarded to the ``model_class`` constructor.
    storage_type
        Forwarded to ``ForestInference.load_from_sklearn``. A truthy value
        selects the ``'NAIVE'`` algorithm, a falsy one
        ``'BATCH_TREE_REORG'``.
        NOTE(review): presumably falsy means dense storage -- confirm
        against the parametrization.
    model_class
        ``RandomForestClassifier`` or another scikit-learn classifier that
        accepts ``init='zero'`` (GradientBoostingClassifier per the
        original comment).
    """
    # skip depth 20 for dense tests; use pytest.skip so the case is
    # reported as skipped rather than silently counted as a pass
    if max_depth == 20 and not storage_type:
        pytest.skip('depth 20 is skipped for dense storage')

    # settings
    classification = True  # change this to false to use regression
    n_categories = 2
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows, n_columns, n_categories,
                         random_state=random_state,
                         classification=classification)
    # identify shape and indices
    train_size = 0.80

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=train_size, random_state=0)

    init_kwargs = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
    }
    if model_class == RandomForestClassifier:
        init_kwargs['max_features'] = 0.3
        init_kwargs['n_jobs'] = -1
    else:
        # model_class == GradientBoostingClassifier
        init_kwargs['init'] = 'zero'

    # Seed the estimator so the trained forest, and therefore the
    # comparison below, is reproducible from run to run.
    skl_model = model_class(**init_kwargs, random_state=random_state)
    skl_model.fit(X_train, y_train)

    skl_preds = skl_model.predict(X_validation)
    skl_preds_int = np.around(skl_preds)
    skl_proba = skl_model.predict_proba(X_validation)

    skl_acc = accuracy_score(y_validation, skl_preds_int)

    algo = 'NAIVE' if storage_type else 'BATCH_TREE_REORG'

    fm = ForestInference.load_from_sklearn(skl_model,
                                           algo=algo,
                                           output_class=True,
                                           threshold=0.50,
                                           storage_type=storage_type)
    # FIL output is converted to NumPy and reshaped to the scikit-learn
    # layout so that the arrays can be compared element by element.
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_preds = np.reshape(fil_preds, np.shape(skl_preds_int))

    fil_proba = np.asarray(fm.predict_proba(X_validation))
    fil_proba = np.reshape(fil_proba, np.shape(skl_proba))

    fil_acc = accuracy_score(y_validation, fil_preds)

    assert fil_acc == pytest.approx(skl_acc, abs=1e-5)
    assert array_equal(fil_preds, skl_preds_int)
    # the third positional argument of np.allclose is the relative tolerance
    assert np.allclose(fil_proba, skl_proba, 1e-3)
def test_fil_skl_classification(n_rows, n_columns, n_estimators, max_depth,
                                n_classes, storage_type, model_class):
    """Compare FIL with the scikit-learn classifier it was loaded from.

    Data with ``n_classes`` classes comes from ``simulate_data``; 80% of it
    trains a ``model_class`` estimator, which is then loaded into
    ``ForestInference``. Accuracy on the remaining rows must agree within
    a tolerance that depends on ``n_classes``; for two classes the
    predicted labels and class probabilities are compared as well.
    """
    rng = np.random.RandomState(43210)
    X, y = simulate_data(n_rows, n_columns, n_classes,
                         random_state=rng, classification=True)

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=0.80, random_state=0)

    # Estimator-specific constructor arguments.
    if model_class == RandomForestClassifier:
        extra_kwargs = {'max_features': 0.3, 'n_jobs': -1}
    else:
        # model_class == GradientBoostingClassifier
        extra_kwargs = {'init': 'zero'}

    skl_model = model_class(n_estimators=n_estimators,
                            max_depth=max_depth,
                            random_state=rng,
                            **extra_kwargs)
    skl_model.fit(X_train, y_train)

    skl_preds_int = np.around(skl_model.predict(X_validation))
    skl_proba = skl_model.predict_proba(X_validation)
    skl_acc = accuracy_score(y_validation, skl_preds_int)

    if storage_type:
        algo = 'NAIVE'
    else:
        algo = 'BATCH_TREE_REORG'

    fm = ForestInference.load_from_sklearn(
        skl_model,
        algo=algo,
        output_class=True,
        threshold=0.50,
        storage_type=storage_type,
    )

    fil_preds = np.reshape(np.asarray(fm.predict(X_validation)),
                           np.shape(skl_preds_int))
    fil_acc = accuracy_score(y_validation, fil_preds)

    # fil_acc is within p99 error bars of skl_acc (diff == 0.017 +- 0.012)
    # however, some tests have a delta as big as 0.04.
    # sklearn uses float64 thresholds, while FIL uses float32
    # TODO(levsnv): once FIL supports float64 accuracy, revisit thresholds
    is_binary = n_classes == 2
    acc_tolerance = 1e-5 if is_binary else 0.1
    assert fil_acc == pytest.approx(skl_acc, abs=acc_tolerance)

    # NOTE(review): the flattened source is read as running the label and
    # probability checks only for two classes -- confirm.
    if not is_binary:
        return

    assert array_equal(fil_preds, skl_preds_int)

    fil_proba = np.reshape(np.asarray(fm.predict_proba(X_validation)),
                           np.shape(skl_proba))
    assert np.allclose(fil_proba, skl_proba, 1e-3)
def test_fil_skl_regression(n_rows, n_columns, n_estimators, max_depth,
                            storage_type, model_class):
    """Check that FIL reproduces a scikit-learn regressor.

    A ``model_class`` estimator is fit on 80% of the data returned by
    ``simulate_data``, loaded into ``ForestInference`` through
    ``load_from_sklearn`` with ``output_class=False``, and its predictions
    and mean squared error on the remaining 20% are compared with
    scikit-learn's.

    Parameters
    ----------
    n_rows, n_columns
        Forwarded to ``simulate_data``.
    n_estimators, max_depth
        Forwarded to the ``model_class`` constructor.
    storage_type
        Forwarded to ``ForestInference.load_from_sklearn``. ``'SPARSE'``
        selects the ``'NAIVE'`` algorithm, anything else
        ``'BATCH_TREE_REORG'``.
    model_class
        ``RandomForestRegressor`` or another scikit-learn regressor that
        accepts ``init='zero'`` (GradientBoostingRegressor per the
        original comment).
    """
    # skip depth 20 for dense tests; use pytest.skip so the case is
    # reported as skipped rather than silently counted as a pass
    if max_depth == 20 and storage_type == 'DENSE':
        pytest.skip('depth 20 is skipped for dense storage')

    # settings
    n_categories = 1
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows, n_columns, n_categories,
                         random_state=random_state,
                         classification=False)

    # identify shape and indices
    train_size = 0.80

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=train_size, random_state=0)

    init_kwargs = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
    }
    if model_class == RandomForestRegressor:
        init_kwargs['max_features'] = 0.3
        init_kwargs['n_jobs'] = -1
    else:
        # model_class == GradientBoostingRegressor
        init_kwargs['init'] = 'zero'

    # Seed the estimator so the trained forest, and therefore the
    # comparison below, is reproducible from run to run.
    skl_model = model_class(**init_kwargs, random_state=random_state)
    skl_model.fit(X_train, y_train)

    skl_preds = skl_model.predict(X_validation)

    skl_mse = mean_squared_error(y_validation, skl_preds)

    algo = 'NAIVE' if storage_type == 'SPARSE' else 'BATCH_TREE_REORG'

    fm = ForestInference.load_from_sklearn(skl_model,
                                           algo=algo,
                                           output_class=False,
                                           storage_type=storage_type)
    # FIL output is converted to NumPy and reshaped to the scikit-learn
    # layout so that the arrays can be compared element by element.
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_preds = np.reshape(fil_preds, np.shape(skl_preds))

    fil_mse = mean_squared_error(y_validation, fil_preds)

    # if fil is better than skl, no need to fail the test
    assert fil_mse <= skl_mse * (1. + 1e-7) + 1e-4
    assert array_equal(fil_preds, skl_preds)