示例#1
0
def RCA_Experiment(X, title, folder=""):
    """Plot reconstruction error of sparse random projections on X.

    For every component count n in [2, X.shape[1]) and 5 random seeds,
    fits an RCA (sparse random projection), reconstructs X through the
    pseudo-inverse of the projection matrix, and records the mean squared
    reconstruction error.  Saves mean-vs-n and std-vs-n plots into
    ``folder``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
    title : str
        Appended to the plot titles.
    folder : str
        Output directory for the two PNG files ('' means current dir).
    """
    import os  # local import: only needed for path joining below

    n_components_range = list(np.arange(2, X.shape[1], 1))
    # NOTE(review): despite the axis labels, the recorded metric is a mean
    # *squared error* of the reconstruction, not a correlation.
    recon_error = defaultdict(dict)

    for seed, n in product(range(5), n_components_range):
        rp = RCA(random_state=seed, n_components=n)
        rp.fit(X)
        projections = rp.components_
        if sparse.issparse(projections):
            projections = projections.todense()
        p = pinv(projections)
        # Reconstruct X via the pseudo-inverse: X_hat = (P+ P) X
        reconstructed = ((p @ projections) @ (X.T)).T
        recon_error[n][seed] = np.nanmean(np.square(X - reconstructed))
    recon_error = pd.DataFrame(recon_error).T
    mean_recon = recon_error.mean(axis=1).tolist()
    std_recon = recon_error.std(axis=1).tolist()

    plt.plot(n_components_range, mean_recon)
    plt.xlabel('Random Components')
    plt.ylabel('Mean Reconstruction Correlation')
    plt.title(
        'Sparse Random Projection for Mean Reconstruction Correlation: ' +
        title)
    # Fix: `folder + '/...'` produced an absolute root path ('/RcaMeanRE.png')
    # when folder was left at its default ''.  os.path.join handles both.
    plt.savefig(os.path.join(folder, 'RcaMeanRE.png'))
    plt.close()

    plt.plot(n_components_range, std_recon)
    plt.xlabel('Random Components')
    plt.ylabel('STD Reconstruction Correlation')
    plt.title("Sparse Random Projection for STD Reconstruction Correlation: " +
              title)
    plt.savefig(os.path.join(folder, 'RcaStdRE.png'))
    plt.close()
示例#2
0
def run_RCA(X, title):
    """Plot pairwise-distance correlation of random projections vs. dimension.

    Sweeps component counts 2, 5, 8, ... plus the full dimensionality;
    for each count runs 10 random restarts, scores each projection with
    pairwiseDistCorr, and draws mean (left axis) and std (right axis)
    against the number of components.
    """
    component_counts = list(np.arange(2, (X.shape[1] - 1), 3))
    component_counts.append(X.shape[1])
    scores = defaultdict(dict)

    for seed, k in product(range(10), component_counts):
        projector = RCA(random_state=seed, n_components=k)
        scores[k][seed] = pairwiseDistCorr(projector.fit_transform(X), X)

    score_table = pd.DataFrame(scores).T
    means = score_table.mean(axis=1).tolist()
    stds = score_table.std(axis=1).tolist()

    fig, left_axis = plt.subplots()
    left_axis.plot(component_counts, means, 'b-')
    left_axis.set_xlabel('Random Components')
    left_axis.set_ylabel('Mean Reconstruction Correlation', color='b')
    left_axis.tick_params('y', colors='b')
    plt.grid(False)

    right_axis = left_axis.twinx()
    right_axis.plot(component_counts, stds, 'm-')
    right_axis.set_ylabel('STD Reconstruction Correlation', color='m')
    right_axis.tick_params('y', colors='m')
    plt.grid(False)

    plt.title("Random Components for 5 Restarts: " + title)
    fig.tight_layout()
    plt.show()
示例#3
0
def run_RCA(X, y, title):
    """Plot mean/std pairwise-distance correlation of random projections.

    Sweeps component counts 2, 5, 8, ... plus the full dimensionality
    with 5 random restarts each and plots the mean (left axis) and std
    (right axis) of the pairwise-distance correlation.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
    y : unused; kept for signature compatibility with sibling run_* calls.
    title : str prefix for the plot title.
    """
    dims = list(np.arange(2, (X.shape[1] - 1), 3))
    dims.append(X.shape[1])
    tmp = defaultdict(dict)

    for i, dim in product(range(5), dims):
        rp = RCA(random_state=i, n_components=dim)
        # Fix: the original stored the constant 0 here and never used `rp`,
        # so both curves were flat.  Score the projection with the
        # pairwise-distance correlation, as the sibling run_RCA
        # implementations do.
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(X), X)
    tmp = pd.DataFrame(tmp).T
    mean_recon = tmp.mean(axis=1).tolist()
    std_recon = tmp.std(axis=1).tolist()

    fig, ax1 = plt.subplots()
    ax1.plot(dims, mean_recon, 'b-')
    ax1.set_xlabel('RCA')
    ax1.set_ylabel('Mean RC', color='g')
    ax1.tick_params('y', colors='g')
    plt.grid(False)

    ax2 = ax1.twinx()
    ax2.plot(dims, std_recon, 'm-')
    ax2.set_ylabel('STD RC', color='r')
    ax2.tick_params('y', colors='r')
    plt.grid(False)

    plt.title(title + "RCA")
    fig.tight_layout()
    plt.show()
示例#4
0
def credit_risk_data():
    """Dimensionality-reduction and clustering experiments on the credit data.

    Reads the module-level ``credit_data`` frame: drops a few columns,
    one-hot encodes the categorical features, then runs PCA/ICA/RCA
    projections, k-means and EM clustering on each projection, and
    finally prints the silhouette score of a 3-cluster k-means fit on
    the full encoded data.
    """
    data_X = credit_data.drop([
        'credit_amount', 'other_parties', 'purpose', 'own_telephone',
        'foreign_worker'
    ],
                              axis=1)
    data_y = credit_data[['class']]

    # Categorical columns to one-hot encode (includes the target 'class').
    features_to_encode = [
        'personal_status', 'checking_status', 'credit_history',
        'savings_status', 'employment', 'property_magnitude',
        'other_payment_plans', 'housing', 'job', 'class'
    ]
    enc = my_encoder()
    enc.fit(data_X, features_to_encode)
    X_train = enc.transform(data_X)
    # X_test = enc.transform(X_test)

    run_PCA(X_train, "Credit Data")
    run_ICA(X_train, "Credit Data")
    run_RCA(X_train, "Credit Data")

    pca_credit = PCA(n_components=3, random_state=5).fit_transform(X_train)
    ica_credit = ICA(n_components=2, random_state=5).fit_transform(X_train)
    rca_credit = RCA(n_components=29, random_state=5).fit_transform(X_train)

    run_kmeans(pca_credit, X_train, "KMEANS")
    run_kmeans(ica_credit, X_train, "KMEANS")
    run_kmeans(rca_credit, X_train, "KMEANS")

    run_EM(pca_credit, X_train, 'PCA Credit Risk Data')
    run_EM(ica_credit, X_train, 'ICA Credit Risk Data')
    run_EM(rca_credit, X_train, 'RCA Credit Risk Data')

    km = KMeans(n_clusters=3, random_state=0)
    y_km = km.fit_predict(X_train)

    score = silhouette_score(X_train, km.labels_, metric='euclidean')
    # Fix: message previously read 'Silhouetter Score'.
    print('Silhouette Score: %.3f' % score)

    # kmeans_silhoutte_analysis(X_train)

    elbow_function(X_train)
    run_kmeans(X_train, y_km, "KMEANS")

    em = EM(n_components=2, covariance_type='spherical', random_state=100)
    y_em = em.fit_predict(X_train)
    plot_EM(em, X_train)
    run_EM(X_train, y_em, "EM")
    # evaluate_EM(em, X_train, y_em)
    # NOTE: a trailing train_test_split(data_X, data_y, ...) was removed;
    # its results were never used.
示例#5
0
def chess_game_data():
    """Dimensionality-reduction and clustering experiments on the chess data.

    Uses the module-level ``game_data`` frame: drops id/text columns,
    keeps the first 1000 rows, one-hot encodes the categorical features,
    then runs PCA/RCA projections (ICA is disabled), k-means and EM
    clustering, and prints the silhouette score of a 3-cluster k-means
    fit on the full encoded subset.
    """
    data_X = game_data.drop([
        'id', 'created_at', 'increment_code', 'black_id', 'white_id', 'moves'
    ],
                            axis=1)
    data_y = game_data[['winner']]

    # Subsample to keep the clustering experiments tractable.
    gd = data_X[:1000]

    features_to_encode = [
        'rated', 'victory_status', 'winner', 'opening_eco', 'opening_name'
    ]
    enc = my_encoder()
    enc.fit(gd, features_to_encode)
    X_train = enc.transform(gd)
    # X_test = enc.transform(X_test)

    run_PCA(X_train, "Chess Data")
    run_ICA(X_train, "Chess Data")
    run_RCA(X_train, "Chess Data")

    pca_chess = PCA(random_state=5).fit_transform(X_train)
    # ica_chess = ICA(random_state=5).fit_transform(X_train)
    rca_chess = RCA(n_components=60, random_state=5).fit_transform(X_train)

    run_kmeans(pca_chess, X_train, "KMEANS")
    # run_kmeans(ica_chess, X_train, "KMEANS")
    run_kmeans(rca_chess, X_train, "KMEANS")

    run_EM(pca_chess, X_train, 'PCA Chess Game Data')
    # run_EM(ica_chess, X_train, 'ICA Chess Game Data')
    run_EM(rca_chess, X_train, 'RCA Chess Game Data')

    km = KMeans(n_clusters=3, random_state=0)
    y_km = km.fit_predict(X_train)

    score = silhouette_score(X_train, km.labels_, metric='euclidean')
    # Fix: message previously read 'Silhouetter Score'.
    print('Silhouette Score: %.3f' % score)

    # kmeans_silhoutte_analysis(X_train)

    run_kmeans(X_train, y_km, "KMEANS")
    elbow_function(X_train)

    em = EM(n_components=4, covariance_type='spherical', random_state=100)
    y_em = em.fit_predict(X_train)
    plot_EM(em, X_train)
    run_EM(X_train, y_em, "EM")
    # NOTE: a trailing train_test_split(data_X, data_y, ...) was removed;
    # its results were never used.
示例#6
0
def _evaluate_wine_nn(X_data, y, title, seed):
    """Split X_data/y 70/30, train the reference MLP, and report via evaluate().

    The split uses no fixed random_state (matches original behavior: it
    draws from the global NumPy RNG seeded in main()).
    """
    X_train, X_test, y_train, y_test = train_test_split(np.array(X_data),
                                                        np.array(y),
                                                        test_size=0.30)
    learner = MLPClassifier(hidden_layer_sizes=(22, ),
                            activation='relu',
                            learning_rate_init=0.0051,
                            random_state=seed)
    evaluate(learner, X_train, X_test, y_train, y_test, title=title)


def main():
    """Compare an MLP on the raw wine data vs. PCA/ICA/RCA/RFC reductions.

    Loads the white-wine quality dataset, binarizes the target
    (quality <= 6 -> 0, > 6 -> 1), standardizes the features, builds
    four reduced representations, and evaluates the same MLP on each.
    """
    df = pd.read_csv("../Dataset/winequality-white.csv", delimiter=";")
    seed = 200
    np.random.seed(seed)

    # Binarize the quality column in place.
    lowquality = df.loc[df['quality'] <= 6].index
    highquality = df.loc[df['quality'] > 6].index
    df.iloc[lowquality, df.columns.get_loc('quality')] = 0
    df.iloc[highquality, df.columns.get_loc('quality')] = 1

    X = np.array(df.iloc[:, 0:-1])
    wine_Y = np.array(df.iloc[:, -1])

    standardScalerX = StandardScaler()
    wine_x = standardScalerX.fit_transform(X)

    # Four reduced representations of the standardized features.
    pca_wine = PCA(n_components=7, random_state=seed).fit_transform(wine_x)
    ica_wine = ICA(n_components=9, random_state=seed).fit_transform(wine_x)
    rca_wine = RCA(n_components=8, random_state=seed).fit_transform(wine_x)
    imp_wine, top_columns_wine = run_RFC(wine_x, wine_Y, df)

    # RFC "reduction": keep only the top-importance columns.
    rfc_wine = df[top_columns_wine]
    rfc_wine = np.array(rfc_wine.values, dtype='int64')

    # Same MLP on each representation; order matters for the global RNG
    # consumed by train_test_split, so keep it as in the original.
    for X_data, title in ((wine_x, "FullDataset"), (pca_wine, "PCA"),
                          (ica_wine, "ICA"), (rca_wine, "RP"),
                          (rfc_wine, "RFC")):
        _evaluate_wine_nn(X_data, wine_Y, title, seed)
示例#7
0
from sklearn.datasets import load_digits
# Digits dataset is loaded but disabled; uncomment the two lines below
# (and comment out the wine block) to switch datasets.
data_digits = load_digits()
#X1, Y1 = pd.DataFrame(data_digits["data"]), pd.Series(data_digits["target"])
#Dataset = "digits"

from sklearn.datasets import load_wine
# Active dataset: sklearn wine (features as a named-column DataFrame).
data_wine = load_wine()
X1, Y1 = pd.DataFrame(data_wine["data"],columns=data_wine.feature_names), pd.Series(data_wine["target"])
Dataset = "wine"


# Build the four representations used downstream: raw, PCA, ICA, RCA
# (all reduced to 5 components with a fixed seed).
Xraw = X1
Xpca = PCA(n_components=5,random_state=5).fit_transform(X1)
Xica = ICA(n_components=5,random_state=5).fit_transform(X1)
# Normalize each independent component to unit variance.
Xica /= Xica.std(axis=0)
Xrca = RCA(n_components=5,random_state=5).fit_transform(X1)

# Run RFC
# Random-forest feature selection is currently disabled; when enabled it
# keeps the columns whose cumulative importance is <= 0.35.
#rfc = RFC(n_estimators=500,min_samples_leaf=round(len(X1)*.01),random_state=5,n_jobs=-1)
#imp = rfc.fit(X1,Y1).feature_importances_
#imp = pd.DataFrame(imp,columns=['Feature Importance'])
#imp.sort_values(by=['Feature Importance'],inplace=True,ascending=False)
#imp['Cum Sum'] = imp['Feature Importance'].cumsum()
#imp = imp[imp['Cum Sum']<=0.35]
#top_cols = imp.index.tolist()
#Xrfc = X1[top_cols]
def MLP_classifier(X, Y, datasource):
    param_range = range(1,201,20)
示例#8
0
def _evaluate_km(X_data, Y, n_clusters, title):
    """Run evaluate_kmeans with the shared KMeans configuration."""
    evaluate_kmeans(KMeans(n_clusters=n_clusters,
                           n_init=10,
                           random_state=100,
                           n_jobs=-1),
                    X_data,
                    Y,
                    title=title)


def _evaluate_em(X_data, Y, n_components, title):
    """Run evaluate_EM with the shared GaussianMixture configuration."""
    evaluate_EM(EM(n_components=n_components,
                   covariance_type='diag',
                   n_init=1,
                   warm_start=True,
                   random_state=100),
                X_data,
                Y,
                title=title)


def _nn_phish_eval(X_data, Y, title):
    """Split 80/20, train the reference MLP, plot its learning curve.

    Returns the (train_sizes, train_scores, fit_times, pred_times) tuple
    from plot_learning_curve for the compare_* charts.
    """
    X_train, X_test, y_train, y_test = train_test_split(np.array(X_data),
                                                        np.array(Y),
                                                        test_size=0.20)
    est = MLPClassifier(hidden_layer_sizes=(50, ),
                        solver='adam',
                        activation='logistic',
                        learning_rate_init=0.01,
                        random_state=100)
    samp, score, fit_t, pred_t = plot_learning_curve(est,
                                                     X_train,
                                                     y_train,
                                                     title=title)
    final_classifier_evaluation(est, X_train, X_test, y_train, y_test)
    return samp, score, fit_t, pred_t


def _compare_nn_times(results, dataset_name):
    """Emit the fit/pred/learn comparison charts.

    `results` maps 'Full'/'PCA'/'ICA'/'RCA'/'RFC' to the tuple returned
    by _nn_phish_eval.
    """
    samp = results['Full'][0]
    compare_fit_time(samp, results['Full'][2], results['PCA'][2],
                     results['ICA'][2], results['RCA'][2], results['RFC'][2],
                     dataset_name)
    compare_pred_time(samp, results['Full'][3], results['PCA'][3],
                      results['ICA'][3], results['RCA'][3], results['RFC'][3],
                      dataset_name)
    compare_learn_time(samp, results['Full'][1], results['PCA'][1],
                       results['ICA'][1], results['RCA'][1], results['RFC'][1],
                       dataset_name)


def dimensionality_reduction_analysis():
    """Full DR + clustering + NN comparison pipeline.

    Phishing and Phone Me ("vocal") datasets each get PCA/ICA/RCA/RFC
    reductions followed by k-means and EM evaluation; the phishing
    reductions are then compared as MLP inputs, both with and without
    appended cluster-label features.
    """
    # ---------- Phishing dataset: reductions + clustering ----------
    X_p, Y_p, df_phish = get_phishing_data()
    run_PCA(X_p, Y_p, "Phishing Data")
    run_ICA(X_p, Y_p, "Phishing Data")
    run_RCA(X_p, Y_p, "Phishing Data")
    imp_phish, topcols_phish = run_RFC(X_p, Y_p, df_original=df_phish)
    pca_phish = PCA(n_components=32, random_state=5).fit_transform(X_p)
    ica_phish = ICA(n_components=32, random_state=5).fit_transform(X_p)
    rca_phish = RCA(n_components=32, random_state=5).fit_transform(X_p)
    rfc_phish = df_phish[topcols_phish]
    rfc_phish = np.array(rfc_phish.values, dtype='int64')[:, :32]

    phish_sets = [('PCA', pca_phish), ('ICA', ica_phish),
                  ('RCA', rca_phish), ('RFC', rfc_phish)]

    for name, data in phish_sets:
        run_kmeans(data, Y_p, name + ' Phishing Data')
    # Cluster counts tuned per projection (values preserved from original).
    for (name, data), k in zip(phish_sets, (14, 12, 10, 2)):
        _evaluate_km(data, Y_p, k, name)

    for name, data in phish_sets:
        run_EM(data, Y_p, name + ' Phishing Data')
    for (name, data), c in zip(phish_sets, (67, 64, 64, 32)):
        _evaluate_em(data, Y_p, c, name)

    # ---------- Phone Me dataset: reductions + clustering ----------
    X_v, Y_v, df_vocal = get_vocal_data()
    run_PCA(X_v, Y_v, "Phone Me Data")
    run_ICA(X_v, Y_v, "Phone Me Data")
    run_RCA(X_v, Y_v, "Phone Me Data")
    imp_vocal, topcols_vocal = run_RFC(X_v, Y_v, df_original=df_vocal)
    pca_vocal = PCA(n_components=4, random_state=5).fit_transform(X_v)
    ica_vocal = ICA(n_components=4, random_state=5).fit_transform(X_v)
    rca_vocal = RCA(n_components=4, random_state=5).fit_transform(X_v)
    rfc_vocal = df_vocal[topcols_vocal]
    rfc_vocal = np.array(rfc_vocal.values, dtype='int64')[:, :4]

    vocal_sets = [('PCA', pca_vocal), ('ICA', ica_vocal),
                  ('RCA', rca_vocal), ('RFC', rfc_vocal)]

    for name, data in vocal_sets:
        run_kmeans(data, Y_v, name + ' Phone Me Data')
    for (name, data), k in zip(vocal_sets, (12, 10, 12, 12)):
        _evaluate_km(data, Y_v, k, name)

    for name, data in vocal_sets:
        run_EM(data, Y_v, name + ' Phone Me Data')
    for (name, data), c in zip(vocal_sets, (58, 52, 56, 48)):
        _evaluate_em(data, Y_v, c, name)

    # ---------- Comparing With NN (phishing reductions) ----------
    nn_results = {}
    for key, banner, data in [('Full', 'Original', X_p),
                              ('PCA', 'PCA', pca_phish),
                              ('ICA', 'ICA', ica_phish),
                              ('RCA', 'RCA', rca_phish),
                              ('RFC', 'RFC', rfc_phish)]:
        print(banner)
        nn_results[key] = _nn_phish_eval(data, Y_p,
                                         "Neural Net Phishing: " + key)
    _compare_nn_times(nn_results, 'Phishing Dataset')

    # ---------- NN on data augmented with cluster labels ----------
    print("Training Clustered Label")
    km = KMeans(n_clusters=2, n_init=10, random_state=100, n_jobs=-1).fit(X_p)
    km_labels = km.labels_
    em = EM(n_components=30,
            covariance_type='diag',
            n_init=1,
            warm_start=True,
            random_state=100).fit(X_p)
    em_labels = em.predict(X_p)

    clustered = [('Full', addclusters(X_p, km_labels, em_labels)),
                 ('PCA', addclusters(pca_phish, km_labels, em_labels)),
                 ('ICA', addclusters(ica_phish, km_labels, em_labels)),
                 ('RCA', addclusters(rca_phish, km_labels, em_labels)),
                 ('RFC', addclusters(rfc_phish, km_labels, em_labels))]

    nn_results = {}
    for key, data in clustered:
        print("Training Clustered - " +
              ('Original' if key == 'Full' else key))
        nn_results[key] = _nn_phish_eval(
            data, Y_p, "Neural Net Phishing with Clusters: " + key)
    _compare_nn_times(nn_results, 'Phishing Dataset')
示例#9
0

def pairwiseDistCorr(X1, X2):
    """Correlation between the pairwise-distance matrices of X1 and X2.

    Both inputs must have the same number of rows; the two distance
    matrices are flattened and their Pearson correlation returned.
    """
    assert X1.shape[0] == X2.shape[0]
    flat_a = pairwise_distances(X1).ravel()
    flat_b = pairwise_distances(X2).ravel()
    return np.corrcoef(flat_a, flat_b)[0, 1]


# Run RCA
# Candidate component counts: 2, 5, 8, ... up to X1.shape[1]-2, plus the
# full dimensionality as a reference point.
dims = list(np.arange(2, (X1.shape[1] - 1), 3))
dims.append(X1.shape[1])
tmp = defaultdict(dict)

# 5 random restarts per component count; score each projection by how well
# it preserves pairwise distances.
for i, dim in product(range(5), dims):
    rp = RCA(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(X1), X1)
# Rows = component counts, columns = restarts.
tmp = pd.DataFrame(tmp).T
mean_recon = tmp.mean(axis=1).tolist()
std_recon = tmp.std(axis=1).tolist()

# Plot RCA
fig, ax1 = plt.subplots()
ax1.plot(dims, mean_recon, 'b-')
ax1.set_xlabel('Random Components')
ax1.set_ylabel('Mean Reconstruction Correlation', color='b')
ax1.tick_params('y', colors='b')
plt.grid(False)

ax2 = ax1.twinx()
ax2.plot(dims, std_recon, 'm-')