Example #1
import matplotlib.pyplot as plt
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

# P1, Config, Processor3, Diabetes, and Adult are project-local helpers
# (Example #4 imports Diabetes/Adult from hw1.utils and Processor3 from hw3.main).


def run_lda():
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)
    processor = P1()
    datasets = [Diabetes(), Adult()]
    estimators = [
        Config(name='lda',
               estimator=LinearDiscriminantAnalysis(),
               cv=kfold,
               params={})
    ]

    for dataset in datasets:
        for estimator in estimators:
            estimator = processor.get_default_model(dataset=dataset,
                                                    estimator=estimator)
            processor.process_validations(dataset=dataset, estimator=estimator)
            processor.plot_validation()

    for dataset in datasets:
        for estimator in estimators:
            estimator = processor.get_default_model(dataset=dataset,
                                                    estimator=estimator)
            processor.param_selection(dataset=dataset, estimator=estimator)
            processor.print_best_params()

    for dataset in datasets:
        for estimator in estimators:
            processor.process(dataset=dataset, estimator=estimator)
            processor.plot_learning_curves()


def run_feature_importance():
    processor = Processor3()
    processor.latext_start_figure()
    for dataset in [Diabetes(), Adult()]:
        X_train, X_test, y_train, y_test, _ = dataset.get_data(model='KMeans')
        forest = RandomForestClassifier(n_estimators=500, random_state=1)
        forest.fit(X_train, y_train)
        importances = forest.feature_importances_

        indices = np.argsort(importances)[::-1]
        top_10 = []
        top_10_vals = []
        top_10_idx = []
        # Print and keep the ten most important features.
        for f, idx in enumerate(indices[:10]):
            print("%2d) %-*s %f" % (f + 1, 30, dataset.fields[idx],
                                    importances[idx]))
            top_10.append(dataset.fields[idx])
            top_10_idx.append(idx)
            top_10_vals.append(importances[idx])

        print(top_10)
        print(top_10_idx)

        plt.title('Feature Importance')
        plt.bar(top_10, top_10_vals, align='center')
        plt.xticks(rotation=90)
        plt.tight_layout()
        # plt.show()
        filename = '%s_%s' % ('features', dataset.__class__.__name__)
        chart_path = 'report/images/%s.png' % filename
        plt.savefig(chart_path)
        plt.close()
        processor.latex_subgraph(dataset=dataset.__class__.__name__,
                                 fig=filename,
                                 caption=dataset.__class__.__name__,
                                 filename=filename)

    processor.latex_end_figure(caption="Feature Importance",
                               fig="feature_importance")
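
A note on the ranking loop in run_feature_importance: the same top-10
selection can be cross-checked with a pandas one-liner. A minimal sketch,
assuming dataset.fields is ordered like the columns of X_train:

# Hypothetical cross-check of the ranking loop above (not part of the
# original example); pd.Series keeps the label/value pairs together.
import pandas as pd
top10 = pd.Series(importances, index=dataset.fields).nlargest(10)
print(top10)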
Example #3
        print("Directory '%s' can not be created")

    parser = argparse.ArgumentParser(description='Find X Coding Quiz')

    parser.add_argument('-m',
                        '--mode',
                        help='Mode',
                        default='debug',
                        dest='mode')
    args = parser.parse_args()

    processor = Processor3()

    datasets = [
        Diabetes(),
        Adult(),
    ]

    run_feature_importance()
    compute_kmeans_elbow_curves()
    visualize_kmeans_clusters()
    run_dimension_reductions()
    run_nn_opt()
    run_nn_opt_clusters()

    ## TODO: Uncomment to get the other charts

    # compute_kmeans_elbow_curves()
    # compute_em_elbow_curves()
    # run_bics()
    # plot_pca_variance()
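
Example #3 opens mid-exception-handler, so the directory guard that produces
the "Directory '%s' cannot be created" message is not shown. A plausible
reconstruction, assuming the report/images output path used by Example #1:

import os

output_dir = 'report/images'  # assumed from chart_path in Example #1
try:
    os.makedirs(output_dir, exist_ok=True)
except OSError:
    print("Directory '%s' cannot be created" % output_dir)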
Example #4
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.decomposition import FastICA
from sklearn.decomposition import MiniBatchDictionaryLearning
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline
from sklearn.random_projection import SparseRandomProjection

from hw1.utils import Adult
from hw1.utils import Diabetes
from hw3.main import Processor3

datasets = [Adult(), Diabetes()]

for dataset in datasets:
    processor = Processor3()
    processor.latext_start_figure()
    X_train, X_test, y_train, y_test, _ = dataset.get_data(model='KMeans')
    n_clusters = len(dataset.label_encoder.classes_)
    pca = PCA(n_components=0.95)
    pca.fit(X_train)
    n_components = pca.components_.shape[0]
    print(f"n_components: {n_components}")
    dr_models = [
        PCA(n_components=n_components, random_state=0),
        FastICA(n_components=n_components, random_state=0),
        MiniBatchDictionaryLearning(n_components=n_components,
                                    alpha=1,
                                    batch_size=200,
                                    n_iter=10,
                                    random_state=0),
        SparseRandomProjection(random_state=0, n_components=n_components)
    ]
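
For reference, PCA accepts a float n_components in (0, 1): fit() then keeps
the smallest number of components whose cumulative explained variance exceeds
that fraction, which is where n_components above comes from. A standalone
check on synthetic data:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 10))
pca = PCA(n_components=0.95).fit(X)
# n_components_ is the number of components kept to reach 95% variance.
print(pca.n_components_)
print(pca.explained_variance_ratio_.cumsum())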
Example #5
def run_dimension_reductions():
    for dataset in [Diabetes(), Adult()]:
        processor = Processor3()
        processor.latext_start_figure()
        X_train, X_test, y_train, y_test, _ = dataset.get_data(model='KMeans')
        pca = PCA(n_components=0.95)
        pca.fit(X_train)
        n_components = pca.components_.shape[0]
        print(f"n_components: {n_components}")

        random_state = 0
        dr_models = [
            PCA(n_components=n_components, random_state=random_state),
            FastICA(n_components=n_components, random_state=random_state),
            MiniBatchDictionaryLearning(n_components=n_components,
                                        alpha=1,
                                        batch_size=200,
                                        n_iter=10,
                                        random_state=random_state),
            SparseRandomProjection(random_state=random_state,
                                   n_components=n_components)
        ]
        for model in dr_models:
            X_train = pd.DataFrame(X_train)
            y_train = pd.DataFrame(y_train)

            # Project the data into the reduced space.
            X_train_reduced = pd.DataFrame(data=model.fit_transform(X_train),
                                           index=X_train.index)

            # Map back to the original space to measure reconstruction error.
            if isinstance(model, SparseRandomProjection):
                # No inverse_transform; invert via the sparse projection matrix.
                X_train_inverse = np.array(X_train_reduced).dot(
                    model.components_.todense())
            elif isinstance(model, MiniBatchDictionaryLearning):
                # No inverse_transform; multiply the codes by the dictionary
                # atoms and re-add the per-feature means.
                X_train_inverse = np.array(X_train_reduced).dot(
                    model.components_) + np.array(X_train.mean(axis=0))
            else:
                X_train_inverse = model.inverse_transform(X_train_reduced)
            X_train_inverse = pd.DataFrame(data=X_train_inverse,
                                           index=X_train.index)
            scatterPlot(X_train_reduced, y_train, model.__class__.__name__)

            # plt.show()

            anomaly_scores = anomalyScores(X_train, X_train_inverse)
            mean = np.mean(anomaly_scores)
            print(mean)
            plotResults(y_train, anomaly_scores, True,
                        model.__class__.__name__,
                        dataset.__class__.__name__, mean)
        processor.latex_end_figure(
            caption=f"{dataset.__class__.__name__} Precision-Recall Curve",
            fig=f"pr_{dataset.__class__.__name__}")
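
scatterPlot, anomalyScores, and plotResults are helpers the example assumes
but never defines. A plausible sketch of the first two, following the common
normalized-reconstruction-error pattern; the names, signatures, and details
are assumptions, not the project's actual code:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


def scatterPlot(xDF, yDF, algoName):
    # Scatter of the first two reduced dimensions, colored by label (assumed).
    tempDF = pd.DataFrame(data=xDF.loc[:, 0:1], index=xDF.index)
    tempDF = pd.concat((tempDF, yDF), axis=1, join='inner')
    tempDF.columns = ['First Vector', 'Second Vector', 'Label']
    sns.lmplot(x='First Vector', y='Second Vector', hue='Label',
               data=tempDF, fit_reg=False)
    plt.title(algoName)


def anomalyScores(originalDF, reducedDF):
    # Per-row squared reconstruction error, min-max scaled to [0, 1] (assumed).
    loss = np.sum((np.array(originalDF) - np.array(reducedDF)) ** 2, axis=1)
    loss = pd.Series(data=loss, index=originalDF.index)
    return (loss - loss.min()) / (loss.max() - loss.min())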