예제 #1
0
def best_ranker_accuracy(X,
                         y,
                         evaluator,
                         min_features=None,
                         max_features=None):
    """Find the feature count whose evaluation yields the highest mean score.

    ``evaluator(X, y, feature_count)`` is called once for every feature
    count from ``min_features`` to ``max_features`` (both inclusive), and
    the result with the highest mean score is kept.

    Args:
        X: Feature table exposing a ``columns`` attribute.
        y: Target values; passed through to ``evaluator`` unchanged.
        evaluator: Callable taking ``(X, y, feature_count)`` and returning
            ``(scores, selected_columns)``.
        min_features: Smallest feature count to try. Falsy values
            (``None`` or ``0``) fall back to 1.
        max_features: Largest feature count to try. Falsy values
            (``None`` or ``0``) fall back to ``len(X.columns)``.

    Returns:
        A ``(mean_score, selected_columns)`` tuple for the best feature
        count (the first one wins on ties), or ``None`` when
        ``min_features`` is greater than ``max_features``.
    """
    max_feat = max_features or len(X.columns)
    min_feat = min_features or 1
    log.debug(
        f"Calculating accuracy of {min_feat} to {max_feat} ranked columns")
    best = None
    # range() excludes its stop value; +1 makes max_feat itself a candidate.
    for feature_count in range(min_feat, max_feat + 1):
        n_scores, selected_columns = evaluator(X, y, feature_count)
        mean_score = np.mean(n_scores)
        # Strict ">" keeps the earliest (smallest) feature count on ties.
        if best is None or mean_score > best[0]:
            best = (mean_score, selected_columns)

    return best
예제 #2
0
def evaluate_by_rank(X, y, n_features):
    """Score a decision tree restricted to ``n_features`` RFE-selected columns.

    An ``RFE`` selector wrapping a ``DecisionTreeClassifier`` is fitted on
    all of ``X``/``y`` to obtain the selected column names. The same
    selector is then chained with a fresh ``DecisionTreeClassifier`` in a
    pipeline, which is scored for accuracy with repeated stratified
    cross-validation (5 splits, 3 repeats, ``random_state=1``).

    Args:
        X: Feature table exposing a ``columns`` attribute.
        y: Target labels.
        n_features: Number of features the selector should keep.

    Returns:
        ``(n_scores, selected_columns)``: the cross-validation scores and
        the list of column names kept by the selector fitted on all of ``X``.
    """
    selector = RFE(DecisionTreeClassifier(), n_features_to_select=n_features)
    selector.fit(X, y)
    # support_ is the mask of kept features; use it to index the column labels.
    selected_columns = list(X.columns[selector.support_])

    model = Pipeline(steps=[('s', selector), ('m', DecisionTreeClassifier())])
    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

    # error_score='raise' surfaces fitting failures instead of hiding them.
    n_scores = cross_val_score(
        model,
        X,
        y,
        scoring='accuracy',
        cv=folds,
        n_jobs=-1,
        error_score='raise',
    )

    log.debug(
        f"Accuracy: {np.mean(n_scores):.3f} ({np.std(n_scores):.3f}): COLS {selected_columns}"
    )
    return n_scores, selected_columns
예제 #3
0
def decision_tree(X, y, importance_threshold=0.07):
    """Fit a decision tree and log the columns above an importance threshold.

    Nothing is returned; the column index, the fitted classifier and the
    list of important columns are only written to the debug log.

    Args:
        X: Feature table exposing a ``columns`` attribute.
        y: Target labels.
        importance_threshold: A column is reported when its
            ``feature_importances_`` entry is strictly greater than this.
    """
    log.debug(f"churn columns: {X.columns}")
    result = DecisionTreeClassifier().fit(X, y)
    log.debug(f"CLASSER RESULT: {result}")

    # Pair each column label with its importance, in column order.
    important_cols = []
    for column, importance in zip(X.columns, result.feature_importances_):
        if importance > importance_threshold:
            important_cols.append(column)
    log.debug(f"CLASSER IMPORTANT COLUMNS: {important_cols}")
예제 #4
0
def evaluate_by_pca(X, y, n_features):
    """Fit a PCA with ``n_features`` components and return its score on ``X``.

    The explained-variance ratios and singular values of the fitted PCA
    are written to the debug log.

    Args:
        X: Feature data to fit and score.
        y: Forwarded to ``PCA.score`` (NOTE(review): scikit-learn documents
            this argument as ignored - confirm for the installed version).
        n_features: Number of principal components to keep.

    Returns:
        ``(score, 0)``: the value of ``PCA.score(X, y)`` and a constant ``0``
        in the second position.
    """
    model = PCA(n_components=n_features).fit(X)
    for diagnostic in (model.explained_variance_ratio_,
                       model.singular_values_):
        log.debug(diagnostic)
    score = model.score(X, y)
    return score, 0