def test_dataset():
    """Check that ``Dataset`` rejects three kinds of invalid feature groups.

    Each case builds a ``feature_groups`` mapping that is expected to be
    refused and verifies that constructing the ``Dataset`` raises.

    The constructor is called directly inside ``pytest.raises`` rather than
    being wrapped in ``assert``: an ``AssertionError`` would itself satisfy
    ``pytest.raises(Exception)``, and ``assert`` statements are removed
    entirely when Python runs with ``-O``.
    """
    df = generate_test_data(1000, text=True)
    features = ["A", "B", "C", "D"]

    # Exception: feature group row count is not equal to the features' row count
    truncated_groups = {
        "interactions": df[["A", "B"]].values[:10] * df[["C", "D"]].values[:10]
    }
    with pytest.raises(Exception):
        Dataset(df=df, target="binary_target", features=features,
                feature_groups=truncated_groups)

    # Exception: Feature group name A is in use by other features
    clashing_groups = {"A": df[["A", "B"]].values * df[["C", "D"]].values}
    with pytest.raises(Exception):
        Dataset(df=df, target="binary_target", features=features,
                feature_groups=clashing_groups)

    # Exception: Feature group type is not numpy.ndarray or scipy.csr.csr_matrix
    dataframe_groups = {"E": df[["A", "B"]]}
    with pytest.raises(Exception):
        Dataset(df=df, target="binary_target", features=features,
                feature_groups=dataframe_groups)
def test_feature_groups():
    """Run LOFO importance on a dataset that mixes features and feature groups.

    Two groups are registered next to the four plain features: character
    3-gram counts of column ``"T"`` (named ``"names"``) and the element-wise
    product of columns A, B with C, D (named ``"interactions"``).  The result
    must contain one row per feature and per group, with ``"names"`` first.
    """
    data = generate_test_data(1000, text=True)
    base_features = ["A", "B", "C", "D"]

    trigram_counter = CountVectorizer(ngram_range=(3, 3), analyzer="char")
    groups = {
        "names": trigram_counter.fit_transform(data["T"]),
        "interactions": data[["A", "B"]].values * data[["C", "D"]].values,
    }

    dataset = Dataset(
        df=data,
        target="binary_target",
        features=base_features,
        feature_groups=groups,
    )

    classifier = LGBMClassifier(random_state=0, n_jobs=4)
    importance_df = LOFOImportance(
        dataset, model=classifier, cv=4, scoring='roc_auc'
    ).get_importance()

    # One importance row is expected for every feature and every group.
    expected_rows = len(base_features) + len(groups)
    assert expected_rows == importance_df.shape[0], "Missing importance value for some features!"

    top_feature = importance_df["feature"].values[0]
    assert top_feature == "names", "Most important feature is different than 'names'!"
def test_flofo_importance():
    """Check FLOFO importance on a held-out split, serially and in parallel.

    A LightGBM classifier is fitted on the training split, then
    ``FLOFOImportance`` is evaluated on the validation split both with the
    default settings and with ``n_jobs=3``.  The test verifies that both runs
    return the same feature ordering, that the validation frame is left
    untouched, that every feature gets an importance row, and that ``"B"``
    ranks first.
    """
    df = generate_test_data(100000)
    # Null out values of "A" below its median so the run includes missing data.
    df.loc[df["A"] < df["A"].median(), "A"] = None

    train_df, val_df = train_test_split(df, test_size=0.2, random_state=0)
    # Snapshot taken before any importance computation; compared at the end.
    val_df_checkpoint = val_df.copy()

    features = ["A", "B", "C", "D"]

    lgbm = LGBMClassifier(random_state=0, n_jobs=1)
    lgbm.fit(train_df[features], train_df["binary_target"])

    # Evaluate on the validation split: it is the frame the mutation check
    # below inspects, and it holds rows the model was not fitted on.
    flofo = FLOFOImportance(lgbm, val_df, features, 'binary_target', scoring='roc_auc')
    flofo_parallel = FLOFOImportance(lgbm, val_df, features, 'binary_target',
                                     scoring='roc_auc', n_jobs=3)

    importance_df = flofo.get_importance()
    importance_df_parallel = flofo_parallel.get_importance()
    is_feature_order_same = (
        importance_df["feature"].values == importance_df_parallel["feature"].values
    )

    plot_importance(importance_df)

    assert is_feature_order_same.sum() == len(features), "Parallel FLOFO returned different result!"
    assert val_df.equals(val_df_checkpoint), "FLOFOImportance mutated the dataframe!"
    assert len(features) == importance_df.shape[0], "Missing importance value for some features!"
    assert importance_df["feature"].values[0] == "B", "Most important feature is different than B!"
def test_lofo_importance():
    """Run LOFO importance with an LGBMRegressor and check the ranking.

    The model, frame, feature list and target name are passed positionally to
    ``LOFOImportance``.  The result must have one row per feature, with
    ``"B"`` first.

    NOTE(review): a later function in this file reuses the name
    ``test_lofo_importance``; at import time that later definition replaces
    this one, so this body is not collected by pytest as written.
    """
    data = generate_test_data(1000)
    feature_names = ["A", "B", "C", "D"]

    regressor = LGBMRegressor(random_state=0, n_jobs=4)
    importance_df = LOFOImportance(
        regressor, data, feature_names, 'binary_target', cv=4, scoring='roc_auc'
    ).get_importance()

    row_count = importance_df.shape[0]
    assert len(feature_names) == row_count, "Missing importance value for some features!"

    top_feature = importance_df["feature"].values[0]
    assert top_feature == "B", "Most important feature is different than B!"
def test_multithreading():
    """Run LOFO importance with ``n_jobs=3`` and check the ranking.

    Uses a liblinear logistic regression and a shuffled 4-fold ``KFold``
    splitter with a fixed seed.  The result must have one row per feature,
    with ``"B"`` first.
    """
    data = generate_test_data(100000)
    feature_names = ["A", "B", "C", "D"]

    model = LogisticRegression(solver='liblinear')
    folds = KFold(n_splits=4, shuffle=True, random_state=0)

    importance_df = LOFOImportance(
        data,
        feature_names,
        'binary_target',
        model=model,
        cv=folds,
        scoring='roc_auc',
        n_jobs=3,
    ).get_importance()

    row_count = importance_df.shape[0]
    assert len(feature_names) == row_count, "Missing importance value for some features!"

    top_feature = importance_df["feature"].values[0]
    assert top_feature == "B", "Most important feature is different than B!"
def test_lofo_importance():
    """Run LOFO importance through a ``Dataset`` and check the ranking.

    Builds a ``Dataset`` from the four plain features, scores it with an
    LGBMClassifier over 4 CV folds, plots the result, and verifies there is
    one importance row per feature with ``"B"`` first.

    NOTE(review): an earlier function in this file has the same name; this
    definition is the one that remains bound after the module is imported.
    """
    data = generate_test_data(1000)
    feature_names = ["A", "B", "C", "D"]

    dataset = Dataset(df=data, target="binary_target", features=feature_names)
    classifier = LGBMClassifier(random_state=0, n_jobs=4)

    importance_df = LOFOImportance(
        dataset, model=classifier, cv=4, scoring='roc_auc'
    ).get_importance()

    # Plotting runs before the assertions, as a smoke check of the plot helper.
    plot_importance(importance_df)

    row_count = importance_df.shape[0]
    assert len(feature_names) == row_count, "Missing importance value for some features!"

    top_feature = importance_df["feature"].values[0]
    assert top_feature == "B", "Most important feature is different than B!"