Example #1
# Assumed aliases for the sklearn metrics used below.
from sklearn.metrics import (v_measure_score as vms, completeness_score as cs,
                             homogeneity_score as hs, fowlkes_mallows_score as fms,
                             accuracy_score)


def clustering_score(ls_actual, ls_pred):
	# Map the uneven ground-truth labels {0, 1, 2, 6, 9, 11} onto 0..5 in place.
	dc = {0: 0, 1: 1, 2: 2, 6: 3, 9: 4, 11: 5}
	for j in range(len(ls_actual)):
		ls_actual[j] = dc[ls_actual[j]]

	vm_score = vms(ls_actual, ls_pred)
	c_score = cs(ls_actual, ls_pred)
	h_score = hs(ls_actual, ls_pred)
	fm_score = fms(ls_actual, ls_pred)
	accu_score = accuracy_score(ls_actual, ls_pred)

	return vm_score, c_score, h_score, fm_score, accu_score
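
A minimal usage sketch for the function above, assuming the sklearn aliases from the import block; the toy labels are made up for illustration:

# Hypothetical toy data: true labels use the uneven ids {0, 1, 2, 6, 9, 11},
# predictions are already in the 0..5 range.
actual = [0, 0, 1, 2, 6, 9, 11, 11]
pred = [0, 0, 1, 2, 3, 4, 5, 5]
vm, c, h, fm, acc = clustering_score(actual, pred)
print(vm, c, h, fm, acc)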
Example #2
import numpy as np
# Assumed aliases for the sklearn metrics used below.
from sklearn.metrics import (v_measure_score as vms, completeness_score as cs,
                             homogeneity_score as hs, fowlkes_mallows_score as fms,
                             accuracy_score)


def clustering_score(labels_Models):
    # labels_Models[i] holds the ground-truth labels of the samples assigned to
    # cluster/model i, so the predicted label of each sample is its sub-list index.
    ls_pred = []
    for i in range(len(labels_Models)):
        for j in range(len(labels_Models[i])):
            ls_pred.append(i)
    # Flatten the per-cluster ground-truth labels into one list.
    ls_actual = []
    for item in labels_Models:
        ls_actual = ls_actual + item
    print(np.unique(np.array(ls_actual)))
    # Map the uneven labels {0, 1, 2, 6, 9, 11} onto 0..5.
    dc = {0: 0, 1: 1, 2: 2, 6: 3, 9: 4, 11: 5}
    for j in range(len(ls_actual)):
        ls_actual[j] = dc[ls_actual[j]]
    vm_score = vms(ls_actual, ls_pred)
    c_score = cs(ls_actual, ls_pred)
    h_score = hs(ls_actual, ls_pred)
    fm_score = fms(ls_actual, ls_pred)
    accu_score = accuracy_score(ls_actual, ls_pred)

    return vm_score, c_score, h_score, fm_score, accu_score
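
The hard-coded dc mapping used in both examples can also be produced with scikit-learn's LabelEncoder, which assigns consecutive integers to the sorted unique labels; a small sketch, assuming only the six label ids shown actually occur:

from sklearn.preprocessing import LabelEncoder

labels = [0, 1, 2, 6, 9, 11, 6, 9]
# Sorted unique labels [0, 1, 2, 6, 9, 11] are encoded as 0..5, reproducing
# the hard-coded {0: 0, 1: 1, 2: 2, 6: 3, 9: 4, 11: 5} mapping above.
print(list(LabelEncoder().fit_transform(labels)))  # [0, 1, 2, 3, 4, 5, 3, 4]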
Example #3
# Assumed imports/aliases for this snippet: kmeans = sklearn.cluster.KMeans,
# GMM = sklearn.mixture.GaussianMixture, ami = adjusted_mutual_info_score,
# ss = silhouette_score, cs = completeness_score, hs = homogeneity_score, and
# cluster_acc is a project-specific scoring helper (a common version is sketched
# after Example #5). The original timed with time.clock, which was removed in
# Python 3.8, so time.perf_counter is used below.
import time
from collections import defaultdict

import pandas as pd


def run_clustering(out, cancer_x, cancer_y, housing_x, housing_y):
    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    silhouette = defaultdict(lambda: defaultdict(dict))
    completeness = defaultdict(lambda: defaultdict(dict))
    homogeniety = defaultdict(lambda: defaultdict(dict))

    st = time.perf_counter()
    for k in range(2, 20, 1):
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(cancer_x)
        gmm.fit(cancer_x)

        SSE[k]['cancer'] = km.score(cancer_x)  # KMeans.score is the negative SSE
        ll[k]['cancer'] = gmm.score(cancer_x)  # GMM.score is the mean log-likelihood

        acc[k]['cancer']['Kmeans'] = cluster_acc(cancer_y,
                                                 km.predict(cancer_x))
        acc[k]['cancer']['GMM'] = cluster_acc(cancer_y, gmm.predict(cancer_x))

        adjMI[k]['cancer']['Kmeans'] = ami(cancer_y, km.predict(cancer_x))
        adjMI[k]['cancer']['GMM'] = ami(cancer_y, gmm.predict(cancer_x))

        silhouette[k]['cancer']['Kmeans Silhouette'] = ss(
            cancer_x, km.predict(cancer_x))
        silhouette[k]['cancer']['GMM Silhouette'] = ss(cancer_x,
                                                       gmm.predict(cancer_x))

        completeness[k]['cancer']['Kmeans Completeness'] = cs(
            cancer_y, km.predict(cancer_x))
        completeness[k]['cancer']['GMM Completeness'] = cs(
            cancer_y, gmm.predict(cancer_x))

        homogeniety[k]['cancer']['Kmeans Homogeniety'] = hs(
            cancer_y, km.predict(cancer_x))
        homogeniety[k]['cancer']['GMM Homogeniety'] = hs(
            cancer_y, gmm.predict(cancer_x))

        km.fit(housing_x)
        gmm.fit(housing_x)
        SSE[k]['housing'] = km.score(housing_x)
        ll[k]['housing'] = gmm.score(housing_x)

        acc[k]['housing']['Kmeans'] = cluster_acc(housing_y,
                                                  km.predict(housing_x))
        acc[k]['housing']['GMM'] = cluster_acc(housing_y,
                                               gmm.predict(housing_x))

        adjMI[k]['housing']['Kmeans'] = ami(housing_y, km.predict(housing_x))
        adjMI[k]['housing']['GMM'] = ami(housing_y, gmm.predict(housing_x))

        silhouette[k]['housing']['Kmeans Silhouette'] = ss(
            housing_x, km.predict(housing_x))
        silhouette[k]['housing']['GMM Silhouette'] = ss(
            housing_x, gmm.predict(housing_x))

        completeness[k]['housing']['Kmeans Completeness'] = cs(
            housing_y, km.predict(housing_x))
        completeness[k]['housing']['GMM Completeness'] = cs(
            housing_y, gmm.predict(housing_x))

        homogeniety[k]['housing']['Kmeans Homogeniety'] = hs(
            housing_y, km.predict(housing_x))
        homogeniety[k]['housing']['GMM Homogeniety'] = hs(
            housing_y, gmm.predict(housing_x))

        print(k, time.perf_counter() - st)
    # km.score already returned the negative SSE, so negate to report the SSE itself.
    SSE = (-pd.DataFrame(SSE)).T
    SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
    ll = pd.DataFrame(ll).T
    ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)
    # NOTE: pd.Panel (removed in pandas 0.25) and .ix (removed in pandas 1.0) are
    # used below, so this block needs an old pandas; see the sketch after this
    # function for a DataFrame-based alternative.
    acc = pd.Panel(acc)

    adjMI = pd.Panel(adjMI)

    silhouette = pd.Panel(silhouette)
    completeness = pd.Panel(completeness)
    homogeniety = pd.Panel(homogeniety)

    SSE.to_csv(out + 'SSE.csv')
    ll.to_csv(out + 'logliklihood.csv')
    acc.ix[:, :, 'housing'].to_csv(out + 'Housing acc.csv')
    acc.ix[:, :, 'cancer'].to_csv(out + 'Perm acc.csv')

    adjMI.ix[:, :, 'housing'].to_csv(out + 'Housing adjMI.csv')
    adjMI.ix[:, :, 'cancer'].to_csv(out + 'Perm adjMI.csv')

    silhouette.ix[:, :, 'cancer'].to_csv(out + 'Perm silhouette.csv')
    completeness.ix[:, :, 'cancer'].to_csv(out + 'Perm completeness.csv')
    homogeniety.ix[:, :, 'cancer'].to_csv(out + 'Perm homogeniety.csv')

    silhouette.ix[:, :, 'housing'].to_csv(out + 'housing silhouette.csv')
    completeness.ix[:, :, 'housing'].to_csv(out + 'housing completeness.csv')
    homogeniety.ix[:, :, 'housing'].to_csv(out + 'housing homogeniety.csv')
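
The run_clustering function above relies on pd.Panel and .ix, which no longer exist in current pandas. A minimal sketch of an alternative, assuming the same nested results[k][dataset][algorithm] = value layout used for acc, adjMI, silhouette, completeness and homogeniety:

import pandas as pd

def nested_to_frame(results):
    """Flatten a {k: {dataset: {algorithm: value}}} dict into a tidy DataFrame."""
    rows = [
        {'k': k, 'dataset': ds, 'algorithm': algo, 'value': v}
        for k, per_dataset in results.items()
        for ds, per_algo in per_dataset.items()
        for algo, v in per_algo.items()
    ]
    return pd.DataFrame(rows)

# Example: mirror acc.ix[:, :, 'housing'].to_csv(...) with one CSV per dataset.
# df = nested_to_frame(acc)
# (df[df['dataset'] == 'housing']
#    .pivot(index='k', columns='algorithm', values='value')
#    .to_csv(out + 'Housing acc.csv'))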
Example #4

# Assumed context for this snippet: df is a DataFrame with 'x' and 'y' columns,
# model is a KMeans(n_clusters=4) with y = model.fit_predict(df), gmm refers to
# GaussianMixture, plt is matplotlib.pyplot, and purity_score is a helper
# (sketched after this example).
print(model.inertia_)
print("Predicted: ", y)
plt.title("KMeans")
plt.scatter(df[y == 0]['x'], df[y == 0]['y'])
plt.scatter(df[y == 1]['x'], df[y == 1]['y'])
plt.scatter(df[y == 2]['x'], df[y == 2]['y'])
plt.scatter(df[y == 3]['x'], df[y == 3]['y'])
plt.show()
# Ground truth: four balanced clusters of 500 points each.
y_true = [0] * 500 + [1] * 500 + [2] * 500 + [3] * 500
print("Purity score for KMeans: ", purity_score(y_true, y))
model1 = gmm(n_components=4)
y1 = model1.fit_predict(df)
print("predicted: ", y1)
plt.title("GMM")
plt.scatter(df[y1 == 0]['x'], df[y1 == 0]['y'])
plt.scatter(df[y1 == 1]['x'], df[y1 == 1]['y'])
plt.scatter(df[y1 == 2]['x'], df[y1 == 2]['y'])
plt.scatter(df[y1 == 3]['x'], df[y1 == 3]['y'])
plt.show()

print("Purity score for gmm: ", purity_score(y_true, y1))
print("hs for KMeans ", hs(y_true, y))
print("hs for GMM ", hs(y_true, y1))
Example #5
# Assumed aliases/helpers for this snippet: kmeans = sklearn.cluster.KMeans,
# GMM = sklearn.mixture.GaussianMixture, myGMM = a project-specific GaussianMixture
# wrapper usable inside a Pipeline, ami = adjusted_mutual_info_score,
# cs / hs = completeness_score / homogeneity_score, cluster_acc = a custom scoring
# helper (a common version is sketched after this example), and nn_reg / nn_arch =
# lists of MLP alpha values and hidden-layer sizes used in the grid searches.
def main_logic():
    out = './BASE/'
    # Change the path below according to the readme.txt instructions.
    base = './BASE/'
    np.random.seed(0)

    madelon = pd.read_hdf(base + 'datasets.hdf', 'madelon')
    madelon_X = madelon.drop('Class', axis=1).copy().values
    madelon_Y = madelon['Class'].copy().values

    character = pd.read_hdf(base + 'datasets.hdf', 'character')
    character_X = character.drop('Class', axis=1).copy().values
    character_Y = character['Class'].copy().values

    np.random.seed(0)
    # clusters = [2]
    clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
    madelon_X = StandardScaler().fit_transform(madelon_X)
    character_X = StandardScaler().fit_transform(character_X)

    # Data for 1-3
    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    Silhouette_dict = defaultdict(dict)
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    for j in clusters:
        # time.clock-style timing was removed in Python 3.8; use perf_counter instead.
        st = time.perf_counter()
        km.set_params(n_clusters=j)
        gmm.set_params(n_components=j)
        km.fit(madelon_X)
        gmm.fit(madelon_X)

        SSE[j]['Madelon'] = km.score(madelon_X)
        ll[j]['Madelon'] = gmm.score(madelon_X)
        test = km.predict(madelon_X)

        acc[j]['Madelon']['Kmeans'] = cluster_acc(madelon_Y,
                                                  km.predict(madelon_X), j)
        acc[j]['Madelon']['GMM'] = cluster_acc(madelon_Y,
                                               gmm.predict(madelon_X))

        adjMI[j]['Madelon']['Kmeans'] = ami(madelon_Y, km.predict(madelon_X))
        adjMI[j]['Madelon']['GMM'] = ami(madelon_Y, gmm.predict(madelon_X))
        print("Homogenity Score ,{}, Kmeans,".format(j),
              hs(madelon_Y, km.labels_))
        print("Completeness Score ,{} ,Kmeans,".format(j),
              cs(madelon_Y, km.labels_))

        label = km.labels_
        gmmm = gmm.predict_proba(madelon_X)
        sil_coeff = silhouette_score(madelon_X, label, metric='euclidean')
        Silhouette_dict[j]['Madelon'] = sil_coeff
        print("For n_clusters={}, The Silhouette Coefficient is {}".format(
            j, sil_coeff))

        km.fit(character_X)
        gmm.fit(character_X)
        SSE[j]['character'] = km.score(character_X)
        ll[j]['character'] = gmm.score(character_X)
        best = km.predict(character_X)
        acc[j]['character']['Kmeans'] = cluster_acc(character_Y,
                                                    km.predict(character_X), j)
        acc[j]['character']['GMM'] = cluster_acc(character_Y,
                                                 gmm.predict(character_X))
        adjMI[j]['character']['Kmeans'] = ami(character_Y,
                                              km.predict(character_X))
        adjMI[j]['character']['GMM'] = ami(character_Y,
                                           gmm.predict(character_X))
        label = km.labels_
        sil_coeff = silhouette_score(character_X, label, metric='euclidean')
        Silhouette_dict[j]['character'] = sil_coeff
        print(j, time.perf_counter() - st)
        print("Homogenity Score ,{}, Kmeans,".format(j),
              hs(character_Y, km.labels_))
        print("Completeness Score ,{} ,Kmeans,".format(j),
              cs(character_Y, km.labels_))
        print("For n_clusters={}, The Silhouette Coefficient is {}".format(
            j, sil_coeff))

    # DataFrame.to_csv returns None, so don't rebind Silhouette_dict to it.
    pd.DataFrame(Silhouette_dict).to_csv(out + 'Silhouette.csv')
    # km.score already returned the negative SSE, so negate to report the SSE itself.
    SSE = (-pd.DataFrame(SSE)).T
    SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
    ll = pd.DataFrame(ll).T
    ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)
    # NOTE: pd.Panel (removed in pandas 0.25) and .ix (removed in pandas 1.0) are
    # used below, so this needs an old pandas or the DataFrame rewrite sketched
    # after Example #3.
    acc = pd.Panel(acc)
    adjMI = pd.Panel(adjMI)

    SSE.to_csv(out + 'SSE.csv')
    ll.to_csv(out + 'logliklihood.csv')
    acc.ix[:, :, 'character'].to_csv(out + 'character_acc.csv')
    acc.ix[:, :, 'Madelon'].to_csv(out + 'Madelon acc.csv')
    adjMI.ix[:, :, 'character'].to_csv(out + 'character_adjMI.csv')
    adjMI.ix[:, :, 'Madelon'].to_csv(out + 'Madelon adjMI.csv')

    # %% NN fit data (2,3)
    grid = {
        'km__n_clusters': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    madelon = pd.read_hdf(base + 'datasets.hdf', 'madelon')
    madelon_X = madelon.drop('Class', axis=1).copy().values
    madelon_Y = madelon['Class'].copy().values
    X_train, X_test, y_train, y_test = train_test_split(madelon_X,
                                                        madelon_Y,
                                                        test_size=0.3,
                                                        random_state=42)

    np.random.seed(0)

    for k in clusters:
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=5,
                            alpha=10**-5,
                            hidden_layer_sizes=(62, 62),
                            verbose=0)
        km = kmeans(random_state=5, n_clusters=k)
        # KMeans acts as a transformer here: its transform() gives distances to the
        # k cluster centres, which become the features fed to the MLP.
        pipe = Pipeline([('km', km), ('NN', mlp)])
        # gs = GridSearchCV(pipe, grid, verbose=10)
        tick = time.perf_counter()
        pipe.fit(X_train, y_train)
        tock = time.perf_counter() - tick

        print("Training time , {}, k means dataset".format(k), ',', tock)
        tick = time.perf_counter()
        y_pred = pipe.predict(X_test)
        tock = time.perf_counter() - tick
        print("Testing time , {}, k means component".format(k), ',', tock)
        print("Accuracy Score ,  {}, kmeans Madelon".format(k), ',',
              accuracy_score(y_test, y_pred))

        grid = {'gmm__n_components': clusters}
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=5,
                            verbose=0,
                            alpha=10**-5,
                            hidden_layer_sizes=(62, 62))
        gmm = myGMM(random_state=43, n_components=k)
        pipe = Pipeline([('gmm', gmm), ('NN', mlp)])
        # gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
        tick = time.perf_counter()
        pipe.fit(X_train, y_train)
        tock = time.perf_counter() - tick
        print("Training time , {}, gmm dataset".format(k), ',', tock)
        tick = time.perf_counter()
        y_pred = pipe.predict(X_test)
        tock = time.perf_counter() - tick
        print("Testing time , {}, gmm component".format(k), ',', tock)
        print("Accuracy Score , {}, gmm Madelon".format(k), ',',
              accuracy_score(y_test, y_pred))

    grid = {
        'km__n_clusters': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    km = kmeans(random_state=5)
    pipe = Pipeline([('km', km), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10)

    gs.fit(madelon_X, madelon_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon cluster Kmeans.csv')

    grid = {
        'gmm__n_components': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    gmm = myGMM(random_state=5)
    pipe = Pipeline([('gmm', gmm), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(madelon_X, madelon_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon cluster GMM.csv')

    grid = {
        'km__n_clusters': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    km = kmeans(random_state=5)
    pipe = Pipeline([('km', km), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(character_X, character_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'character_cluster_Kmeans.csv')

    grid = {
        'gmm__n_components': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    gmm = myGMM(random_state=5)
    pipe = Pipeline([('gmm', gmm), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(character_X, character_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'character_cluster_GMM.csv')

    # %% For chart 4/5
    madelonX2D = TSNE(verbose=10, random_state=5).fit_transform(madelon_X)
    character_X2D = TSNE(verbose=10, random_state=5).fit_transform(character_X)

    madelon2D = pd.DataFrame(np.hstack(
        (madelonX2D, np.atleast_2d(madelon_Y).T)),
                             columns=['x', 'y', 'target'])
    character2D = pd.DataFrame(np.hstack(
        (character_X2D, np.atleast_2d(character_Y).T)),
                               columns=['x', 'y', 'target'])

    madelon2D.to_csv(out + 'madelon2D.csv')
    character2D.to_csv(out + 'character2D.csv')
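
Examples #3 and #5 both score clusterings with a cluster_acc helper that is not shown. A common formulation (a sketch only; the project's own helper may differ, and the three-argument call in Example #5 suggests it can also take the cluster count) maps each cluster to its majority ground-truth label and then computes plain accuracy:

import numpy as np
from collections import Counter
from sklearn.metrics import accuracy_score

def cluster_acc(y_true, cluster_labels):
    """Relabel each cluster with its most common true label, then score accuracy."""
    y_true = np.asarray(y_true)
    cluster_labels = np.asarray(cluster_labels)
    y_mapped = np.empty_like(y_true)
    for c in np.unique(cluster_labels):
        mask = cluster_labels == c
        # Every member of cluster c gets the majority true label of that cluster.
        y_mapped[mask] = Counter(y_true[mask]).most_common(1)[0][0]
    return accuracy_score(y_true, y_mapped)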