def em(tx, ty, rx, ry, add="", times=5):
    errs = []

    # this is what we will compare to; GaussianMixture expects a 2-D
    # array, so reshape the 1-D label vector into a column first
    checker = EM(n_components=2)
    checker.fit(ry.reshape(-1, 1))
    truth = checker.predict(ry.reshape(-1, 1))

    # so we do this a bunch of times, once per candidate cluster count
    for i in range(2, times):
        clusters = {x: [] for x in range(i)}

        # create a clusterer
        clf = EM(n_components=i)
        clf.fit(tx)  # fit it to our training data
        test = clf.predict(tx)
        result = clf.predict(rx)  # and test it on the testing set

        # here we make the arguably awful assumption that, for a given
        # cluster, all values in that cluster "should", in a perfect
        # world, belong to one class or the other, meaning that, say,
        # cluster "3" should really be all 0s in our truth, or all 1s
        #
        # So clusters is a dict of lists, where each list contains the
        # indices of all items in a single cluster
        for index, val in enumerate(result):
            clusters[val].append(index)

        # then we take each cluster, average that cluster's counterparts
        # in our "truth", and round the result to decide whether that
        # cluster should count as a 1 or a 0
        mapper = {
            x: round(
                sum(truth[v] for v in clusters[x]) /
                float(len(clusters[x]))) if clusters[x] else 0
            for x in range(i)
        }

        # the processed list holds the results of this mapping: if
        # cluster 3 was found to be of value 1, then processed[index] == 1
        # for every index in clusters[3]
        processed = [mapper[val] for val in result]
        errs.append(sum((processed - truth)**2) / float(len(ry)))
    plot([0, times, min(errs) - .1, max(errs) + .1],
         [range(2, times), errs, "ro"], "Number of Clusters", "Error Rate",
         "Expectation Maximization Error", "EM" + add)

    # reshape the 1-D cluster assignments into column vectors so they
    # can be appended to the feature matrices as an extra feature
    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    newtx = np.append(tx, td, 1)
    newrx = np.append(rx, rd, 1)
    nn(newtx, ty, newrx, ry, add="onEM" + add)
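Throughout these examples, `EM` appears to be an alias for sklearn.mixture.GaussianMixture (i.e. `from sklearn.mixture import GaussianMixture as EM`), judging by the n_components/means_/bic usage. The cluster-to-label majority vote described in the comments above can be isolated into a minimal, self-contained sketch; the arrays below are invented purely for illustration:

import numpy as np

result = np.array([0, 1, 1, 0, 2, 2])  # a cluster id per sample
truth = np.array([0, 1, 1, 0, 1, 1])   # binary "truth" labels

# each cluster votes: round the mean truth label of its members
mapper = {c: int(round(truth[result == c].mean())) for c in np.unique(result)}
processed = np.array([mapper[c] for c in result])

print(mapper)                             # {0: 0, 1: 1, 2: 1}
print(np.mean((processed - truth) ** 2))  # the error rate tracked above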
Example #2
def oneem(tx, ty, rx, ry, add="", times=5):
    scores = []
    clf = EM(n_components=times)
    clf.fit(tx)  # fit it to our training data
    scores.append(clf.predict_proba(tx))
    scores.append(clf.predict_proba(rx))
    return scores
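A hedged usage sketch for oneem (the variable names here are ours): the two predict_proba matrices it returns are soft cluster-membership features, which can be appended to the original data before training a downstream model, mirroring what em() above does with hard labels:

train_probs, test_probs = oneem(tx, ty, rx, ry, times=5)
newtx = np.append(tx, train_probs, axis=1)  # shape (n_train, d + times)
newrx = np.append(rx, test_probs, axis=1)   # shape (n_test, d + times)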
Example #3
def part4_mnist():
    mnist = input_data.read_data_sets("data/")

    X = mnist.train.images
    y = mnist.train.labels

    # One cluster for each digit
    k = 10

    # Run the EM algorithm (a Gaussian mixture) on the MNIST training images.
    expectation_maximization = EM(n_components=k,
                                  max_iter=10,
                                  init_params='kmeans',
                                  covariance_type='diag',
                                  verbose=1,
                                  verbose_interval=1).fit(X)

    means = expectation_maximization.means_
    covs = expectation_maximization.covariances_

    fig, ax = plt.subplots(1, k, figsize=(8, 1))

    for i in range(k):
        ax[i].imshow(means[i].reshape(28, 28), cmap='gray')

    plt.show()

    sample(means, covs, 0)
Example #4
def myem(X, y, nameappendix, krange):
    for n_clusters in krange:
        fig = plt.gcf()
        # fig.set_size_inches(7, 7)
        ax = fig.add_subplot(111)

        # Initialize the clusterer with n_clusters value and a random generator
        # seed of 10 for reproducibility.
        clusterer = EM(n_components=n_clusters, random_state=10).fit(X)
        labels = clusterer.predict(X)
        print("NMI score: %.6f" % normalized_mutual_info_score(y, labels))

        # 2nd Plot showing the actual clusters formed
#         colors = cm.spectral(cluster_labels.astype(float) / n_clusters)
        colors = plt.get_cmap('Spectral')(labels.astype(float) / n_clusters)
        plt.scatter(X[:, 3], X[:, 5], marker='.', s=30, lw=0, alpha=0.7, c=colors, edgecolor='k')

        # Labeling the clusters
        centers = clusterer.means_

        # Draw white circles at cluster centers
        plt.scatter(centers[:, 3], centers[:, 5], marker='o', c="white", alpha=1, s=200, edgecolor='k')

        for i, c in enumerate(centers):
            ax.scatter(c[3], c[5], marker='$%d$' % i, alpha=1, s=50, edgecolor='k')

        ax.set_title("Clustering Visualization")
        ax.set_xlabel("1st feature: Pressure X4")
        ax.set_ylabel("2nd feature: Pressure X5")

        plt.suptitle("Analysis for EM Clustering for " + str(n_clusters) + " Clusters", fontsize=14, fontweight='bold')
        plt.savefig('img/em' + nameappendix + str(n_clusters) + '.png')  # include k so runs don't overwrite each other
        plt.show()
Example #5
def em_experiment(X, y, title, folder=""):
    cluster_range = list(np.arange(2, 11, 1))
    sil_scores, accuracy_scores, homo_scores, sse_scores, ami_scores, bic_scores = (
        [] for i in range(6))
    completeness_scores = []

    for k in cluster_range:
        # print(k)
        em = EM(n_components=k).fit(X)
        em_labels = em.predict(X)
        sil_scores.append(sil_score(X, em_labels))
        sse_scores.append(em.score(X))  # note: score() is the avg log-likelihood, not a true SSE
        # print(sil_score(X,em_labels))
        homo_scores.append(homogeneity_score(y, em_labels))
        completeness_scores.append(completeness_score(y, em_labels))
        ami_scores.append(adjusted_mutual_info_score(y, em_labels))
        bic_scores.append(em.bic(X))

    plt.plot(cluster_range, sil_scores)
    plt.xlabel('No. Components')
    plt.ylabel('Avg Silhouette Score')
    plt.title('Silhouette Score for EM: ' + title)
    plt.savefig(folder + '/EMSIL.png')
    plt.close()

    plt.plot(cluster_range, homo_scores)
    plt.xlabel('No. Components')
    plt.ylabel('Homogeneity Score')
    plt.title('Homogeneity Scores EM: ' + title)
    plt.savefig(folder + '/EMHOMOGENEITY.png')
    plt.close()

    plt.plot(cluster_range, completeness_scores)
    plt.xlabel('No. Components')
    plt.ylabel('Completeness Score')
    plt.title('Completeness Score for EM: ' + title)
    plt.savefig(folder + '/EMCompleteness.png')
    plt.close()

    plt.plot(cluster_range, sse_scores)
    plt.xlabel('No. Components')
    plt.ylabel('Avg Log-Likelihood')
    plt.title('Log-Likelihood Scores EM: ' + title)
    plt.savefig(folder + '/EMSSE.png')  # filename kept for compatibility
    plt.close()

    plt.plot(cluster_range, ami_scores)
    plt.xlabel('No. Components')
    plt.ylabel('AMI Score')
    plt.title('Adjusted Mutual Information Scores EM: ' + title)
    plt.savefig(folder + '/EMAMI.png')
    plt.close()

    plt.plot(cluster_range, bic_scores)
    plt.xlabel('No. Components')
    plt.ylabel('BIC Score')
    plt.title('BIC Scores EM: ' + title)
    plt.savefig(folder + '/EMBIC.png')
    plt.close()
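Since em_experiment collects em.bic(X) for each k, a natural follow-up is to report the BIC-minimizing component count. A sketch, to be placed at the end of the function where cluster_range and bic_scores are in scope:

    best_k = cluster_range[int(np.argmin(bic_scores))]
    print('BIC-optimal number of components:', best_k)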
Example #6
def run_EM(X, y, title):

    kdist = list(np.arange(2, 100, 5))
    sil_scores = []
    f1_scores = []
    homo_scores = []
    train_times = []
    aic_scores = []
    bic_scores = []
    
    for k in kdist:
        start_time = timeit.default_timer()
        em = EM(n_components=k, covariance_type='diag', n_init=1, warm_start=True, random_state=100).fit(X)
        end_time = timeit.default_timer()
        train_times.append(end_time - start_time)

        labels = em.predict(X)
        sil_scores.append(sil_score(X, labels))
        y_mode_vote = cluster_predictions(y, labels)
        f1_scores.append(f1_score(y, y_mode_vote))
        homo_scores.append(homogeneity_score(y, labels))
        aic_scores.append(em.aic(X))
        bic_scores.append(em.bic(X))
        
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, sil_scores)
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('Silhouette')
    plt.title(title + ' Exp Max Silhouette')
    plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, f1_scores)
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('F1 Score')
    plt.title(title + ' Exp Max F1')
    plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, aic_scores, label='AIC')
    ax.plot(kdist, bic_scores, label='BIC')
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('Model Complexity Score')
    plt.title(title + ' Exp Max Model Complexity')
    plt.legend(loc="best")
    plt.show()
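cluster_predictions, used above to turn cluster assignments into class predictions for f1_score, is not defined anywhere in this listing. Judging from its call site, it presumably performs a per-cluster majority vote over y; a minimal sketch of that assumed behavior:

import numpy as np

def cluster_predictions(y, cluster_labels):
    # assumed helper: give every member of a cluster the majority class
    # of y within that cluster (y assumed to be integer-coded)
    y = np.asarray(y).ravel().astype(int)
    pred = np.empty_like(y)
    for c in np.unique(cluster_labels):
        mask = cluster_labels == c
        pred[mask] = np.bincount(y[mask]).argmax()
    return pred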
Example #7
def credit_risk_data():
    data_X = credit_data.drop([
        'credit_amount', 'other_parties', 'purpose', 'own_telephone',
        'foreign_worker'
    ],
                              axis=1)
    data_y = credit_data[['class']]

    features_to_encode = [
        'personal_status', 'checking_status', 'credit_history',
        'savings_status', 'employment', 'property_magnitude',
        'other_payment_plans', 'housing', 'job', 'class'
    ]
    enc = my_encoder()
    enc.fit(data_X, features_to_encode)
    X_train = enc.transform(data_X)
    # X_test = enc.transform(X_test)

    run_PCA(X_train, "Credit Data")
    run_ICA(X_train, "Credit Data")
    run_RCA(X_train, "Credit Data")

    pca_credit = PCA(n_components=3, random_state=5).fit_transform(X_train)
    ica_credit = ICA(n_components=2, random_state=5).fit_transform(X_train)
    rca_credit = RCA(n_components=29, random_state=5).fit_transform(X_train)

    run_kmeans(pca_credit, X_train, "KMEANS")
    run_kmeans(ica_credit, X_train, "KMEANS")
    run_kmeans(rca_credit, X_train, "KMEANS")

    run_EM(pca_credit, X_train, 'PCA Credit Risk Data')
    run_EM(ica_credit, X_train, 'ICA Credit Risk Data')
    run_EM(rca_credit, X_train, 'RCA Credit Risk Data')

    km = KMeans(n_clusters=3, random_state=0)
    y_km = km.fit_predict(X_train)

    score = silhouette_score(X_train, km.labels_, metric='euclidean')
    print('Silhouette Score: %.3f' % score)

    # kmeans_silhoutte_analysis(X_train)

    elbow_function(X_train)
    run_kmeans(X_train, y_km, "KMEANS")

    em = EM(n_components=2, covariance_type='spherical', random_state=100)
    y_em = em.fit_predict(X_train)
    plot_EM(em, X_train)
    run_EM(X_train, y_em, "EM")
    # evaluate_EM(em, X_train, y_em)

    X_train, X_test, y_train, y_test = train_test_split(data_X,
                                                        data_y,
                                                        test_size=0.2,
                                                        random_state=0)
Example #8
def chess_game_data():
    data_X = game_data.drop([
        'id', 'created_at', 'increment_code', 'black_id', 'white_id', 'moves'
    ],
                            axis=1)
    data_y = game_data[['winner']]

    gd = data_X[:1000]

    features_to_encode = [
        'rated', 'victory_status', 'winner', 'opening_eco', 'opening_name'
    ]
    enc = my_encoder()
    enc.fit(gd, features_to_encode)
    X_train = enc.transform(gd)
    # X_test = enc.transform(X_test)

    run_PCA(X_train, "Chess Data")
    run_ICA(X_train, "Chess Data")
    run_RCA(X_train, "Chess Data")

    pca_chess = PCA(random_state=5).fit_transform(X_train)
    # ica_chess = ICA(random_state=5).fit_transform(X_train)
    rca_chess = RCA(n_components=60, random_state=5).fit_transform(X_train)

    run_kmeans(pca_chess, X_train, "KMEANS")
    # run_kmeans(ica_chess, X_train, "KMEANS")
    run_kmeans(rca_chess, X_train, "KMEANS")

    run_EM(pca_chess, X_train, 'PCA Chess Game Data')
    # run_EM(ica_chess, X_train, 'ICA Chess Game Data')
    run_EM(rca_chess, X_train, 'RCA Chess Game Data')

    km = KMeans(n_clusters=3, random_state=0)
    y_km = km.fit_predict(X_train)

    score = silhouette_score(X_train, km.labels_, metric='euclidean')
    print('Silhouette Score: %.3f' % score)

    # kmeans_silhoutte_analysis(X_train)

    run_kmeans(X_train, y_km, "KMEANS")
    elbow_function(X_train)

    em = EM(n_components=4, covariance_type='spherical', random_state=100)
    y_em = em.fit_predict(X_train)
    plot_EM(em, X_train)
    run_EM(X_train, y_em, "EM")

    X_train, X_test, y_train, y_test = train_test_split(data_X,
                                                        data_y,
                                                        test_size=0.2,
                                                        random_state=0)
Example #9
def em_analysis():
    X_p, Y_p, _ = get_phishing_data()
    # run_EM(X_p, Y_p, 'Phishing Data')
    em = EM(n_components=30,
            covariance_type='diag',
            n_init=1,
            warm_start=True,
            random_state=100)
    evaluate_EM(em, X_p, Y_p)
    df = pd.DataFrame(em.means_)
    df.to_csv("Phishing EM Component Means.csv")

    X_v, Y_v, _ = get_vocal_data()
    # run_EM(X_v, Y_v, 'Vocal Data')
    em = EM(n_components=52,
            covariance_type='diag',
            n_init=1,
            warm_start=True,
            random_state=100)
    evaluate_EM(em, X_v, Y_v)
    df = pd.DataFrame(em.means_)
    df.to_csv("Vocal EM Component Means.csv")
Example #10
def main():
    df = pd.read_csv("../Dataset/winequality-white.csv", delimiter=";")
    seed = 200
    np.random.seed(seed)

    lowquality = df.loc[df['quality'] <= 6].index
    highquality = df.loc[df['quality'] > 6].index
    df.iloc[lowquality, df.columns.get_loc('quality')] = 0
    df.iloc[highquality, df.columns.get_loc('quality')] = 1

    X = np.array(df.iloc[:, 0:-1])
    wine_Y = np.array(df.iloc[:, -1])

    standardScalerX = StandardScaler()
    wine_x = standardScalerX.fit_transform(X)

    km = KMeans(n_clusters=2, random_state=200).fit(wine_x)
    km_labels = km.labels_

    df['KM'] = km_labels
    df = df.drop(columns='quality')
    knn_X = np.array(df.values, dtype='int64')

    X_train, X_test, y_train, y_test = train_test_split(np.array(knn_X),
                                                        np.array(wine_Y),
                                                        test_size=0.30)
    learner = MLPClassifier(hidden_layer_sizes=(22, ),
                            activation='relu',
                            learning_rate_init=0.0051,
                            random_state=seed)

    evaluate(learner, X_train, X_test, y_train, y_test, title="Kmeans.png")

    em = EM(n_components=2, random_state=200).fit(wine_x)
    em_labels = em.predict(wine_x)

    df['EM'] = em_labels
    df = df.drop(columns='KM')
    em_X = np.array(df.values, dtype='int64')

    X_train, X_test, y_train, y_test = train_test_split(np.array(em_X),
                                                        np.array(wine_Y),
                                                        test_size=0.30)
    learner = MLPClassifier(hidden_layer_sizes=(22, ),
                            activation='relu',
                            learning_rate_init=0.0051,
                            random_state=seed)

    evaluate(learner, X_train, X_test, y_train, y_test, title="EM.png")
Example #11
def run_EM(X, y, title):
    kdist = list(np.arange(2, 100, 5))
    sil_scores = []
    train_times = []
    aic_scores = []
    bic_scores = []

    for k in kdist:
        start_time = timeit.default_timer()
        em = EM(n_components=k, covariance_type='spherical',
                random_state=100).fit(X)
        end_time = timeit.default_timer()
        train_times.append(end_time - start_time)

        labels = em.predict(X)
        sil_scores.append(sil_score(X, labels))
        # y_mode_vote = cluster_predictions(y, labels)
        # f1_scores.append(f1_score(y, y_mode_vote))
        # homo_scores.append(homogeneity_score(y, labels))
        aic_scores.append(em.aic(X))
        bic_scores.append(em.bic(X))

    # elbow curve for silhouette score
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, sil_scores)
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Avg Silhouette Score')
    plt.title('Silhouette Analysis for EM: ' + title)
    plt.show()

    # plot model AIC and BIC
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, aic_scores, label='AIC')
    ax.plot(kdist, bic_scores, label='BIC')
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Model Complexity Score')
    plt.title('EM Model Complexity: ' + title)
    plt.legend(loc="best")
    plt.show()
Example #12
def plotIt():
    iris = sklearn.datasets.load_iris()
    X = iris['data'][:, 0:2]  # reduce dimensions so we can plot what happens.
    k = 3
    means, covs, priors, llh = em_algorithm(X, k, 100, 0.001)

    fig, ax = plt.subplots(1, 1, figsize=(8,3))
    llhs = []

    for i in range(1):
        _, _, _, llh = em_algorithm(X, k, 100)
        llhs.append(llh)
        ax.plot(llhs, 'bx')
        fig.canvas.draw()
    #plt.show()



    expectation_maximization = EM(n_components=3, init_params='random', covariance_type='diag', verbose=2, verbose_interval=1).fit(X)


def run_EM(X, y, title):

    #kdist =  [2,3,4,5]
    #kdist = list(range(2,51))
    kdist = list(np.arange(2, 20, 2))
    sil_scores = []
    f1_scores = []
    homo_scores = []
    train_times = []
    aic_scores = []
    bic_scores = []

    for k in kdist:
        start_time = timeit.default_timer()
        em = EM(n_components=k, covariance_type='diag', n_init=1, warm_start=True, random_state=100).fit(X)
        end_time = timeit.default_timer()
        train_times.append(end_time - start_time)

        labels = em.predict(X)
        sil_scores.append(sil_score(X, labels))
        y_mode_vote = cluster_predictions(y, labels)
        f1_scores.append(f1_score(y, y_mode_vote))
        homo_scores.append(homogeneity_score(y, labels))
        aic_scores.append(em.aic(X))
        bic_scores.append(em.bic(X))
        
    # elbow curve for silhouette score
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, sil_scores)
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Avg Silhouette Score')
    plt.title('Elbow Plot for EM: ' + title)
    plt.show()
   
    # plot homogeneity scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, homo_scores)
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Homogeneity Score')
    plt.title('Homogeneity Scores EM: ' + title)
    plt.show()

    # plot f1 scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, f1_scores)
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('F1 Score')
    plt.title('F1 Scores EM: ' + title)
    plt.show()

    # plot model AIC and BIC
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, aic_scores, label='AIC')
    ax.plot(kdist, bic_scores, label='BIC')
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Model Complexity Score')
    plt.title('EM Model Complexity: ' + title)
    plt.legend(loc="best")
    plt.show()
    print("Log-likelihood Lower Bound: {:.2f}".format(em.lower_bound_))
    print("F1 Score:  "+"{:.2f}".format(f1))
    print("Accuracy:  "+"{:.2f}".format(accuracy)+"     AUC:       "+"{:.2f}".format(auc))
    print("Precision: "+"{:.2f}".format(precision)+"     Recall:    "+"{:.2f}".format(recall))
    print("*****************************************************")
    plt.figure()
    plot_confusion_matrix(cm, classes=["0","1"], title='Confusion Matrix')
    plt.show()


# In[67]:


ttX, ttY, bankX, bankY = import_data()
run_EM(ttX, ttY, 'Titanic Data')
em = EM(n_components=24, covariance_type='diag', n_init=1, warm_start=True, random_state=100)
evaluate_EM(em, ttX, ttY)
df = pd.DataFrame(em.means_)
df.to_csv("Titanic EM Component Means.csv")


# In[37]:


ttX, ttY, bankX, bankY = import_data()
X_train, X_test, y_train, y_test = train_test_split(np.array(bankX), np.array(bankY), test_size=0.25)
run_EM(X_train, y_train, 'Banking Data')
em = EM(n_components=41, covariance_type='diag', n_init=1, warm_start=True, random_state=100)
evaluate_EM(em, bankX, bankY)
df = pd.DataFrame(em.means_)
df.to_csv("Banking EM Component Means.csv")
Example #15
def main():
    seed = 200
    df = pd.read_csv("../Dataset/winequality-white.csv", delimiter=";")
    np.random.seed(seed)

    #####load wine data

    lowquality = df.loc[df['quality'] <= 6].index
    highquality = df.loc[df['quality'] > 6].index
    df.iloc[lowquality, df.columns.get_loc('quality')] = 0
    df.iloc[highquality, df.columns.get_loc('quality')] = 1

    X = np.array(df.iloc[:, 0:-1])
    wine_Y = np.array(df.iloc[:, -1])

    standardScalerX = StandardScaler()
    wine_x = standardScalerX.fit_transform(X)

    #####run k means to find best

    kmeans_experiment(wine_x, wine_Y, 'Wine Data', folder="part1_wineplots")

    # Plot Kmeans Wine Cluster

    reduced_data = PCA(n_components=2).fit_transform(wine_x)
    kmeans = KMeans(init='k-means++', n_clusters=2, n_init=10)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to
    # each point in the mesh.
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(Z,
               interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto',
               origin='lower')

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # Only for kmeans
    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0],
                centroids[:, 1],
                marker='x',
                s=169,
                linewidths=3,
                color='w',
                zorder=10)
    plt.title('K-means clustering on the wine dataset')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.savefig('part1_wineplots/kmeans_cluster.png')
    plt.close()

    ###end plot

    ######run em to find best components
    em_experiment(wine_x, wine_Y, 'Wine Data', folder="part1_wineplots")

    # Plot EM wine Cluster

    reduced_data = PCA(n_components=2).fit_transform(wine_x)
    em = EM(n_components=2, n_init=10)
    em.fit(reduced_data)

    h = .02

    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    Z = em.predict(np.c_[xx.ravel(), yy.ravel()])

    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(Z,
               interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto',
               origin='lower')

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    plt.title('EM clustering on the wine dataset')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.savefig('part1_wineplots/em_cluster.png')
    plt.close()

    ####Load digits

    df_digits = pd.read_csv("../Dataset/pendigits.csv", header=None)
    np.random.seed(seed)

    X = np.array(df_digits.iloc[:, 0:-1])
    Y = np.array(df_digits.iloc[:, -1])

    standardScalerX = StandardScaler()
    digits_x = standardScalerX.fit_transform(X)

    # #####run k means to find best
    kmeans_experiment(digits_x, Y, 'Digits Data', folder="part1_digitsplots")

    #Plot Kmeans Digit Cluster

    reduced_data = PCA(n_components=2).fit_transform(digits_x)
    kmeans = KMeans(init='k-means++', n_clusters=9, n_init=10)
    kmeans.fit(reduced_data)

    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].

    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(Z,
               interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto',
               origin='lower')

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0],
                centroids[:, 1],
                marker='x',
                s=169,
                linewidths=3,
                color='w',
                zorder=10)
    plt.title('K-means clustering on the digits dataset')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.savefig('part1_digitsplots/kmeans_cluster.png')
    plt.close()

    ######run em to find best components
    em_experiment(digits_x, Y, 'Digits Data', folder="part1_digitsplots")

    # Plot EM Digit Cluster

    reduced_data = PCA(n_components=2).fit_transform(digits_x)
    em = EM(n_components=8, n_init=10)
    em.fit(reduced_data)
    h = .02
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = em.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(Z,
               interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto',
               origin='lower')

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    plt.title('EM clustering on the digits dataset')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.savefig('part1_digitsplots/em_cluster.png')
    plt.close()
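The meshgrid/imshow pattern above is repeated four times (K-means and EM, wine and digits). It could be factored into a small helper; the sketch below is ours, not the source's, and assumes the usual numpy/matplotlib imports:

def plot_decision_regions(model, reduced_data, title, outfile, h=.02):
    # color each point of a fine mesh by the cluster the fitted model predicts
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect='auto', origin='lower')
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    plt.title(title)
    plt.xticks(())
    plt.yticks(())
    plt.savefig(outfile)
    plt.close()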
Example #16
          "{:.2f}".format(auc))
    print("Precision: " + "{:.2f}".format(precision) + "     Recall:    " +
          "{:.2f}".format(recall))
    print("*****************************************************")
    plt.figure()
    plot_confusion_matrix(cm, classes=["0", "1"], title='Confusion Matrix')
    plt.show()


# In[ ]:

phishX, phishY, bankX, bankY = import_data()
run_EM(phishX, phishY, 'Phishing Data')
em = EM(n_components=24,
        covariance_type='diag',
        n_init=1,
        warm_start=True,
        random_state=100)
evaluate_EM(em, phishX, phishY)
df = pd.DataFrame(em.means_)
df.to_csv("Phishing EM Component Means.csv")

# In[ ]:

phishX, phishY, bankX, bankY = import_data()
X_train, X_test, y_train, y_test = train_test_split(np.array(bankX),
                                                    np.array(bankY),
                                                    test_size=0.25)
run_EM(X_train, y_train, 'Banking Data')
em = EM(n_components=41,
        covariance_type='diag',
Example #17
from tensorflow.examples.tutorials.mnist import input_data
from scipy.stats import multivariate_normal
import numpy as np
# imports assumed by the code below (not present in the original snippet):
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture as EM

mnist = input_data.read_data_sets("data/")

X = mnist.train.images
y = mnist.train.labels

# One cluster for each digit
k = 10

# Run the EM algorithm (a Gaussian mixture) on the MNIST training images.
expectation_maximization = EM(n_components=k,
                              max_iter=10,
                              init_params='kmeans',
                              covariance_type='diag',
                              verbose=1,
                              verbose_interval=1).fit(X)

means = expectation_maximization.means_
covs = expectation_maximization.covariances_

fig, ax = plt.subplots(1, k, figsize=(8, 1))

for i in range(k):
    ax[i].imshow(means[i].reshape(28, 28), cmap='gray')

plt.show()


def sample(means, covs, num):
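    # The body of sample() is cut off in this listing. A plausible
    # minimal completion (an assumption, not the source's code): draw
    # one image from component `num`'s Gaussian and display it.
    x = multivariate_normal.rvs(mean=means[num], cov=np.diag(covs[num]))
    plt.imshow(x.reshape(28, 28), cmap='gray')
    plt.show()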
Example #18
def em(x, n_classes, min_iterations=25):
    # note: min_iterations is accepted but never used by this wrapper
    em = EM(n_components=n_classes, n_init=25).fit(x)
    return em.predict(x)
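A quick usage sketch for this wrapper, on synthetic data (purely illustrative):

from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=200, centers=3, random_state=0)
labels = em(X_demo, n_classes=3)  # one mixture-component id per sample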
Example #19
def em(tx, ty, rx, ry, reduced_data, add="", times=5, dataset="", alg=""):
    clf = EM(n_components=times)
    clf.fit(reduced_data)
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # centroids = clf.cluster_centers_
    # plt.scatter(centroids[:, 0], centroids[:, 1],
    #             marker='x', s=169, linewidths=3,
    #             color='w', zorder=10)
    plt.title(dataset + ': EM clustering (' + alg + '-reduced data)')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()

    clf = EM(n_components=times)
    clf.fit(tx)  # fit it to our training data
    test = clf.predict(tx)
    result = clf.predict(rx)
    checker = EM(n_components=times)
    ry = ry.reshape(-1, 1)
    checker.fit(ry)
    truth = checker.predict(ry)
    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    # newtx = np.append(td)
    # newrx = np.append(rd)
    myNN(test, ty, result, ry, alg="EM_"+alg)
    errs = []
    scores = []
    # this is what we will compare to (ry is already a column vector
    # from the reshape above)
    checker = EM(n_components=2)
    checker.fit(ry)
    truth = checker.predict(ry)
    adj_rand = []
    v_meas = []
    mutual_info = []
    adj_mutual_info = []
    # so we do this a bunch of times, once per candidate cluster count
    for i in range(2, times):
        clusters = {x: [] for x in range(i)}
        # create a clusterer
        clf = EM(n_components=i)
        clf.fit(tx)  # fit it to our training data
        test = clf.predict(tx)
        result = clf.predict(rx)  # and test it on the testing set
        for index, val in enumerate(result):
            clusters[val].append(index)
        mapper = {
            x: round(
                sum(truth[v] for v in clusters[x]) /
                float(len(clusters[x]))) if clusters[x] else 0
            for x in range(i)
        }
        processed = [mapper[val] for val in result]
        errs.append(sum((processed - truth) ** 2) / float(len(ry)))
        scores.append(clf.score(tx, ty))
        adj_rand.append(metrics.adjusted_rand_score(ry.ravel(), result))
        v_meas.append(metrics.v_measure_score(ry.ravel(), result))
        mutual_info.append(metrics.fowlkes_mallows_score(ry.ravel(), result))
        adj_mutual_info.append(metrics.homogeneity_score(ry.ravel(), result))
    # plot([0, times, min(scores)-.1, max(scores)+.1],[range(2, times), scores, "-"], "Number of Clusters", "Log Likelihood", dataset+": EM Log Likelihood - " + alg, dataset+"_EM_"+alg)

    # other metrics
    # names = ["Adjusted Random", "V Measure", "Mutual Info", "Adjusted Mutual Info"]
    plt.figure()
    plt.title(dataset + ": EM Clustering measures - " + alg)
    plt.xlabel('Number of clusters')
    plt.ylabel('Score value')
    plt.plot(range(2, times), adj_rand, label="Adjusted Random")
    plt.plot(range(2, times), v_meas, label="V Measure")
    plt.plot(range(2, times), mutual_info, label="Fowlkes Mallows Score")
    plt.plot(range(2, times), adj_mutual_info, label="Homogeneity Score")
    plt.legend()
    plt.savefig("EMMetrics" + dataset + "_" + alg + ".png")

    kmeans = KM(n_clusters=2)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to
    # each point in the mesh.
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)
    plt.title(dataset + ': K-means clustering (' + alg + '-reduced data)\n'
              'Centroids are marked with a white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
Example #20
def dimensionality_reduction_analysis():
    X_p, Y_p, df_phish = get_phishing_data()
    run_PCA(X_p, Y_p, "Phishing Data")
    run_ICA(X_p, Y_p, "Phishing Data")
    run_RCA(X_p, Y_p, "Phishing Data")
    imp_phish, topcols_phish = run_RFC(X_p, Y_p, df_original=df_phish)
    pca_phish = PCA(n_components=32, random_state=5).fit_transform(X_p)
    ica_phish = ICA(n_components=32, random_state=5).fit_transform(X_p)
    rca_phish = RCA(n_components=32, random_state=5).fit_transform(X_p)
    rfc_phish = df_phish[topcols_phish]
    rfc_phish = np.array(rfc_phish.values, dtype='int64')[:, :32]
    #
    run_kmeans(pca_phish, Y_p, 'PCA Phishing Data')
    run_kmeans(ica_phish, Y_p, 'ICA Phishing Data')
    run_kmeans(rca_phish, Y_p, 'RCA Phishing Data')
    run_kmeans(rfc_phish, Y_p, 'RFC Phishing Data')

    evaluate_kmeans(KMeans(n_clusters=14,
                           n_init=10,
                           random_state=100,
                           n_jobs=-1),
                    pca_phish,
                    Y_p,
                    title="PCA")
    evaluate_kmeans(KMeans(n_clusters=12,
                           n_init=10,
                           random_state=100,
                           n_jobs=-1),
                    ica_phish,
                    Y_p,
                    title="ICA")
    evaluate_kmeans(KMeans(n_clusters=10,
                           n_init=10,
                           random_state=100,
                           n_jobs=-1),
                    rca_phish,
                    Y_p,
                    title="RCA")
    evaluate_kmeans(KMeans(n_clusters=2,
                           n_init=10,
                           random_state=100,
                           n_jobs=-1),
                    rfc_phish,
                    Y_p,
                    title="RFC")

    run_EM(pca_phish, Y_p, 'PCA Phishing Data')
    run_EM(ica_phish, Y_p, 'ICA Phishing Data')
    run_EM(rca_phish, Y_p, 'RCA Phishing Data')
    run_EM(rfc_phish, Y_p, 'RFC Phishing Data')

    evaluate_EM(EM(n_components=67,
                   covariance_type='diag',
                   n_init=1,
                   warm_start=True,
                   random_state=100),
                pca_phish,
                Y_p,
                title="PCA")
    evaluate_EM(EM(n_components=64,
                   covariance_type='diag',
                   n_init=1,
                   warm_start=True,
                   random_state=100),
                ica_phish,
                Y_p,
                title="ICA")
    evaluate_EM(EM(n_components=64,
                   covariance_type='diag',
                   n_init=1,
                   warm_start=True,
                   random_state=100),
                rca_phish,
                Y_p,
                title="RCA")
    evaluate_EM(EM(n_components=32,
                   covariance_type='diag',
                   n_init=1,
                   warm_start=True,
                   random_state=100),
                rfc_phish,
                Y_p,
                title="RFC")

    X_v, Y_v, df_vocal = get_vocal_data()
    run_PCA(X_v, Y_v, "Phone Me Data")
    run_ICA(X_v, Y_v, "Phone Me Data")
    run_RCA(X_v, Y_v, "Phone Me Data")
    imp_vocal, topcols_vocal = run_RFC(X_v, Y_v, df_original=df_vocal)
    pca_vocal = PCA(n_components=4, random_state=5).fit_transform(X_v)
    ica_vocal = ICA(n_components=4, random_state=5).fit_transform(X_v)
    rca_vocal = RCA(n_components=4, random_state=5).fit_transform(X_v)
    rfc_vocal = df_vocal[topcols_vocal]
    rfc_vocal = np.array(rfc_vocal.values, dtype='int64')[:, :4]

    run_kmeans(pca_vocal, Y_v, 'PCA Phone Me Data')
    run_kmeans(ica_vocal, Y_v, 'ICA Phone Me Data')
    run_kmeans(rca_vocal, Y_v, 'RCA Phone Me Data')
    run_kmeans(rfc_vocal, Y_v, 'RFC Phone Me Data')

    evaluate_kmeans(KMeans(n_clusters=12,
                           n_init=10,
                           random_state=100,
                           n_jobs=-1),
                    pca_vocal,
                    Y_v,
                    title="PCA")
    evaluate_kmeans(KMeans(n_clusters=10,
                           n_init=10,
                           random_state=100,
                           n_jobs=-1),
                    ica_vocal,
                    Y_v,
                    title="ICA")
    evaluate_kmeans(KMeans(n_clusters=12,
                           n_init=10,
                           random_state=100,
                           n_jobs=-1),
                    rca_vocal,
                    Y_v,
                    title="RCA")
    evaluate_kmeans(KMeans(n_clusters=12,
                           n_init=10,
                           random_state=100,
                           n_jobs=-1),
                    rfc_vocal,
                    Y_v,
                    title="RFC")

    run_EM(pca_vocal, Y_v, 'PCA Phone Me Data')
    run_EM(ica_vocal, Y_v, 'ICA Phone Me Data')
    run_EM(rca_vocal, Y_v, 'RCA Phone Me Data')
    run_EM(rfc_vocal, Y_v, 'RFC Phone Me Data')

    evaluate_EM(EM(n_components=58,
                   covariance_type='diag',
                   n_init=1,
                   warm_start=True,
                   random_state=100),
                pca_vocal,
                Y_v,
                title="PCA")
    evaluate_EM(EM(n_components=52,
                   covariance_type='diag',
                   n_init=1,
                   warm_start=True,
                   random_state=100),
                ica_vocal,
                Y_v,
                title="ICA")
    evaluate_EM(EM(n_components=56,
                   covariance_type='diag',
                   n_init=1,
                   warm_start=True,
                   random_state=100),
                rca_vocal,
                Y_v,
                title="RCA")
    evaluate_EM(EM(n_components=48,
                   covariance_type='diag',
                   n_init=1,
                   warm_start=True,
                   random_state=100),
                rfc_vocal,
                Y_v,
                title="RFC")

    # Comparing With NN
    # Original
    print("Original")
    X_train, X_test, y_train, y_test = train_test_split(np.array(X_p),
                                                        np.array(Y_p),
                                                        test_size=0.20)
    full_est = MLPClassifier(hidden_layer_sizes=(50, ),
                             solver='adam',
                             activation='logistic',
                             learning_rate_init=0.01,
                             random_state=100)
    train_samp_full, NN_train_score_full, NN_fit_time_full, NN_pred_time_full = plot_learning_curve(
        full_est, X_train, y_train, title="Neural Net Phishing: Full")
    final_classifier_evaluation(full_est, X_train, X_test, y_train, y_test)
    # PCA
    print("PCA")

    X_train, X_test, y_train, y_test = train_test_split(np.array(pca_phish),
                                                        np.array(Y_p),
                                                        test_size=0.20)
    pca_est = MLPClassifier(hidden_layer_sizes=(50, ),
                            solver='adam',
                            activation='logistic',
                            learning_rate_init=0.01,
                            random_state=100)
    train_samp_pca, NN_train_score_pca, NN_fit_time_pca, NN_pred_time_pca = plot_learning_curve(
        pca_est, X_train, y_train, title="Neural Net Phishing: PCA")
    final_classifier_evaluation(pca_est, X_train, X_test, y_train, y_test)
    # ICA
    print("ICA")
    X_train, X_test, y_train, y_test = train_test_split(np.array(ica_phish),
                                                        np.array(Y_p),
                                                        test_size=0.20)
    ica_est = MLPClassifier(hidden_layer_sizes=(50, ),
                            solver='adam',
                            activation='logistic',
                            learning_rate_init=0.01,
                            random_state=100)
    train_samp_ica, NN_train_score_ica, NN_fit_time_ica, NN_pred_time_ica = plot_learning_curve(
        ica_est, X_train, y_train, title="Neural Net Phishing: ICA")
    final_classifier_evaluation(ica_est, X_train, X_test, y_train, y_test)
    # Randomised Projection
    print("RCA")
    X_train, X_test, y_train, y_test = train_test_split(np.array(rca_phish),
                                                        np.array(Y_p),
                                                        test_size=0.20)
    rca_est = MLPClassifier(hidden_layer_sizes=(50, ),
                            solver='adam',
                            activation='logistic',
                            learning_rate_init=0.01,
                            random_state=100)
    train_samp_rca, NN_train_score_rca, NN_fit_time_rca, NN_pred_time_rca = plot_learning_curve(
        rca_est, X_train, y_train, title="Neural Net Phishing: RCA")
    final_classifier_evaluation(rca_est, X_train, X_test, y_train, y_test)
    # RFC
    print("RFC")
    X_train, X_test, y_train, y_test = train_test_split(np.array(rfc_phish),
                                                        np.array(Y_p),
                                                        test_size=0.20)
    rfc_est = MLPClassifier(hidden_layer_sizes=(50, ),
                            solver='adam',
                            activation='logistic',
                            learning_rate_init=0.01,
                            random_state=100)
    train_samp_rfc, NN_train_score_rfc, NN_fit_time_rfc, NN_pred_time_rfc = plot_learning_curve(
        rfc_est, X_train, y_train, title="Neural Net Phishing: RFC")
    final_classifier_evaluation(rfc_est, X_train, X_test, y_train, y_test)

    compare_fit_time(train_samp_full, NN_fit_time_full, NN_fit_time_pca,
                     NN_fit_time_ica, NN_fit_time_rca, NN_fit_time_rfc,
                     'Phishing Dataset')
    compare_pred_time(train_samp_full, NN_pred_time_full, NN_pred_time_pca,
                      NN_pred_time_ica, NN_pred_time_rca, NN_pred_time_rfc,
                      'Phishing Dataset')
    compare_learn_time(train_samp_full, NN_train_score_full,
                       NN_train_score_pca, NN_train_score_ica,
                       NN_train_score_rca, NN_train_score_rfc,
                       'Phishing Dataset')

    print("Training Clustered Label")
    # Training NN on Projected data with cluster labels
    km = KMeans(n_clusters=2, n_init=10, random_state=100, n_jobs=-1).fit(X_p)
    km_labels = km.labels_
    em = EM(n_components=30,
            covariance_type='diag',
            n_init=1,
            warm_start=True,
            random_state=100).fit(X_p)
    em_labels = em.predict(X_p)

    clust_full = addclusters(X_p, km_labels, em_labels)
    clust_pca = addclusters(pca_phish, km_labels, em_labels)
    clust_ica = addclusters(ica_phish, km_labels, em_labels)
    clust_rca = addclusters(rca_phish, km_labels, em_labels)
    clust_rfc = addclusters(rfc_phish, km_labels, em_labels)
    print("Training Clustered - Original")

    X_train, X_test, y_train, y_test = train_test_split(np.array(clust_full),
                                                        np.array(Y_p),
                                                        test_size=0.20)
    full_est = MLPClassifier(hidden_layer_sizes=(50, ),
                             solver='adam',
                             activation='logistic',
                             learning_rate_init=0.01,
                             random_state=100)
    train_samp_full, NN_train_score_full, NN_fit_time_full, NN_pred_time_full = plot_learning_curve(
        full_est,
        X_train,
        y_train,
        title="Neural Net Phishing with Clusters: Full")
    final_classifier_evaluation(full_est, X_train, X_test, y_train, y_test)
    print("Training Clustered - PCA")

    X_train, X_test, y_train, y_test = train_test_split(np.array(clust_pca),
                                                        np.array(Y_p),
                                                        test_size=0.20)
    pca_est = MLPClassifier(hidden_layer_sizes=(50, ),
                            solver='adam',
                            activation='logistic',
                            learning_rate_init=0.01,
                            random_state=100)
    train_samp_pca, NN_train_score_pca, NN_fit_time_pca, NN_pred_time_pca = plot_learning_curve(
        pca_est,
        X_train,
        y_train,
        title="Neural Net Phishing with Clusters: PCA")
    final_classifier_evaluation(pca_est, X_train, X_test, y_train, y_test)
    print("Training Clustered - ICA")

    X_train, X_test, y_train, y_test = train_test_split(np.array(clust_ica),
                                                        np.array(Y_p),
                                                        test_size=0.20)
    ica_est = MLPClassifier(hidden_layer_sizes=(50, ),
                            solver='adam',
                            activation='logistic',
                            learning_rate_init=0.01,
                            random_state=100)
    train_samp_ica, NN_train_score_ica, NN_fit_time_ica, NN_pred_time_ica = plot_learning_curve(
        ica_est,
        X_train,
        y_train,
        title="Neural Net Phishing with Clusters: ICA")
    final_classifier_evaluation(ica_est, X_train, X_test, y_train, y_test)
    print("Training Clustered - RCA")

    X_train, X_test, y_train, y_test = train_test_split(np.array(clust_rca),
                                                        np.array(Y_p),
                                                        test_size=0.20)
    rca_est = MLPClassifier(hidden_layer_sizes=(50, ),
                            solver='adam',
                            activation='logistic',
                            learning_rate_init=0.01,
                            random_state=100)
    train_samp_rca, NN_train_score_rca, NN_fit_time_rca, NN_pred_time_rca = plot_learning_curve(
        rca_est,
        X_train,
        y_train,
        title="Neural Net Phishing with Clusters: RCA")
    final_classifier_evaluation(rca_est, X_train, X_test, y_train, y_test)
    print("Training Clustered - RFC")

    X_train, X_test, y_train, y_test = train_test_split(np.array(clust_rfc),
                                                        np.array(Y_p),
                                                        test_size=0.20)
    rfc_est = MLPClassifier(hidden_layer_sizes=(50, ),
                            solver='adam',
                            activation='logistic',
                            learning_rate_init=0.01,
                            random_state=100)
    train_samp_rfc, NN_train_score_rfc, NN_fit_time_rfc, NN_pred_time_rfc = plot_learning_curve(
        rfc_est,
        X_train,
        y_train,
        title="Neural Net Phishing with Clusters: RFC")
    final_classifier_evaluation(rfc_est, X_train, X_test, y_train, y_test)

    compare_fit_time(train_samp_full, NN_fit_time_full, NN_fit_time_pca,
                     NN_fit_time_ica, NN_fit_time_rca, NN_fit_time_rfc,
                     'Phishing Dataset')
    compare_pred_time(train_samp_full, NN_pred_time_full, NN_pred_time_pca,
                      NN_pred_time_ica, NN_pred_time_rca, NN_pred_time_rfc,
                      'Phishing Dataset')
    compare_learn_time(train_samp_full, NN_train_score_full,
                       NN_train_score_pca, NN_train_score_ica,
                       NN_train_score_rca, NN_train_score_rfc,
                       'Phishing Dataset')
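addclusters is not defined in this listing. Judging by its call sites, it presumably appends the K-means and EM cluster labels to the feature matrix as two extra columns; a minimal sketch of that assumed helper:

def addclusters(X, km_labels, em_labels):
    # assumed behavior: stack both cluster-label vectors onto X as columns
    return np.hstack((np.asarray(X),
                      np.asarray(km_labels).reshape(-1, 1),
                      np.asarray(em_labels).reshape(-1, 1)))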
Example #21
    plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, aic_scores, label='AIC')
    ax.plot(kdist, bic_scores, label='BIC')
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('Model Complexity Score')
    plt.title(title + ' Exp Max Model Complexity')
    plt.legend(loc="best")
    plt.show()


run_EM(diabetes_X, diabetes_Y, 'Diabetes Data')
em = EM(n_components=24, covariance_type='diag', warm_start=True, random_state=100)


X_train, X_test, y_train, y_test = train_test_split(np.array(creditX), np.array(creditY), test_size=0.25)
run_EM(X_train, y_train, 'Credit Data')
em = EM(n_components=41, covariance_type='diag', warm_start=True, random_state=100)



def run_PCA(X, y, title):

    pca = PCA(random_state=5).fit(X)  # fit once with all components
    cum_var = np.cumsum(pca.explained_variance_ratio_)

    fig, ax1 = plt.subplots()
    ax1.plot(list(range(len(pca.singular_values_))), pca.singular_values_, 'm-')