def clustering_score(ls_actual, ls_pred):
    """Score predicted labels against ground-truth labels after remapping.

    Every entry of ``ls_actual`` is translated through a fixed table
    (0, 1, 2, 6, 9, 11 -> 0, 1, 2, 3, 4, 5) before scoring.  The translation
    is done on a copy: the caller's sequence is left untouched, so the same
    labels can be scored repeatedly (the previous in-place remap raised
    ``KeyError`` on a second call because the values had already been
    rewritten).

    Args:
        ls_actual: Sequence of raw ground-truth labels; each must be one of
            0, 1, 2, 6, 9 or 11.
        ls_pred: Sequence of predicted labels, passed to the metrics as-is.

    Returns:
        Tuple ``(vm_score, c_score, h_score, fm_score, accu_score)`` holding
        the results of ``vms``, ``cs``, ``hs``, ``fms`` and
        ``accuracy_score`` in that order.
        NOTE(review): these are presumably scikit-learn's v-measure,
        completeness, homogeneity, Fowlkes-Mallows and accuracy scores;
        confirm against the file's imports.

    Raises:
        KeyError: If ``ls_actual`` contains a label missing from the table.
    """
    dc = {0: 0, 1: 1, 2: 2, 6: 3, 9: 4, 11: 5}
    # Build a fresh list instead of overwriting the caller's data.
    actual = [dc[label] for label in ls_actual]

    vm_score = vms(actual, ls_pred)
    c_score = cs(actual, ls_pred)
    h_score = hs(actual, ls_pred)
    fm_score = fms(actual, ls_pred)
    accu_score = accuracy_score(actual, ls_pred)
    return vm_score, c_score, h_score, fm_score, accu_score
def clustering_score(labels_Models):
    """Score a grouping of true labels, treating each group as one cluster.

    ``labels_Models`` is a sequence of groups; group ``i`` holds the raw
    ground-truth labels of the samples assigned to cluster ``i``.  The
    predicted label of every sample is therefore the index of the group it
    sits in, and the actual labels are the flattened group contents, mapped
    from the uneven raw values (0, 1, 2, 6, 9, 11) onto 0..5.

    Args:
        labels_Models: Sequence of sequences of raw ground-truth labels.

    Returns:
        Tuple ``(vm_score, c_score, h_score, fm_score, accu_score)`` holding
        the results of ``vms``, ``cs``, ``hs``, ``fms`` and
        ``accuracy_score`` in that order.
        NOTE(review): presumably the scikit-learn clustering metrics of the
        same initials; confirm against the file's imports.

    Raises:
        KeyError: If a raw label is not one of 0, 1, 2, 6, 9 or 11.
    """
    # One predicted label per sample: the index of the group containing it.
    ls_pred = [
        cluster_idx
        for cluster_idx, group in enumerate(labels_Models)
        for _ in group
    ]
    # Flatten in a single pass (repeated list concatenation was quadratic).
    raw_actual = [label for group in labels_Models for label in group]
    print(np.unique(np.array(raw_actual)))

    # Map uneven labels from 0 to 5
    dc = {0: 0, 1: 1, 2: 2, 6: 3, 9: 4, 11: 5}
    ls_actual = [dc[label] for label in raw_actual]

    vm_score = vms(ls_actual, ls_pred)
    c_score = cs(ls_actual, ls_pred)
    h_score = hs(ls_actual, ls_pred)
    fm_score = fms(ls_actual, ls_pred)
    accu_score = accuracy_score(ls_actual, ls_pred)
    return vm_score, c_score, h_score, fm_score, accu_score
def _panel_slice(nested, dataset):
    """Return one dataset's scores as a DataFrame (rows: label, columns: k).

    ``nested`` maps ``k -> dataset -> label -> value``.  This produces the
    same table as the former ``pd.Panel(nested).ix[:, :, dataset]`` without
    relying on ``Panel`` or ``.ix``, both of which were removed from pandas.
    """
    return pd.DataFrame({k: per_k[dataset] for k, per_k in nested.items()})


def run_clustering(out, cancer_x, cancer_y, housing_x, housing_y):
    """Sweep k = 2..19 for K-means and GMM on two datasets and save scores.

    For every ``k`` both models are refitted on each dataset and the
    following are recorded: ``km.score`` (written negated as "SSE"),
    ``gmm.score`` (log-likelihood), ``cluster_acc``, ``ami``, ``ss``,
    ``cs`` and ``hs`` of the predicted labels.  Results are written as CSV
    files whose names are ``out`` followed by a fixed suffix.

    Args:
        out: String prefix (typically a directory ending in a separator)
            prepended to every output file name.
        cancer_x: Feature matrix of the "cancer" dataset.
        cancer_y: Ground-truth labels of the "cancer" dataset.
        housing_x: Feature matrix of the "housing" dataset.
        housing_y: Ground-truth labels of the "housing" dataset.

    Returns:
        None.  The function prints ``k`` and the elapsed time after each
        ``k`` and writes the CSV files as a side effect.
    """
    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    silhouette = defaultdict(lambda: defaultdict(dict))
    completeness = defaultdict(lambda: defaultdict(dict))
    homogeniety = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    datasets = (
        ('cancer', cancer_x, cancer_y),
        ('housing', housing_x, housing_y),
    )

    # NOTE(review): ``clock`` is presumably ``time.clock``, which was removed
    # in Python 3.8; the import lives outside this block, so it is kept here.
    st = clock()
    for k in range(2, 20):
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        for name, x, y in datasets:
            km.fit(x)
            gmm.fit(x)
            SSE[k][name] = km.score(x)
            ll[k][name] = gmm.score(x)

            # Predict once per model and reuse the labels for every metric.
            # NOTE(review): assumes cluster_acc does not modify its inputs.
            km_labels = km.predict(x)
            gmm_labels = gmm.predict(x)

            acc[k][name]['Kmeans'] = cluster_acc(y, km_labels)
            acc[k][name]['GMM'] = cluster_acc(y, gmm_labels)
            adjMI[k][name]['Kmeans'] = ami(y, km_labels)
            adjMI[k][name]['GMM'] = ami(y, gmm_labels)
            silhouette[k][name]['Kmeans Silhouette'] = ss(x, km_labels)
            silhouette[k][name]['GMM Silhouette'] = ss(x, gmm_labels)
            completeness[k][name]['Kmeans Completeness'] = cs(y, km_labels)
            completeness[k][name]['GMM Completeness'] = cs(y, gmm_labels)
            homogeniety[k][name]['Kmeans Homogeniety'] = hs(y, km_labels)
            homogeniety[k][name]['GMM Homogeniety'] = hs(y, gmm_labels)
        print(k, clock() - st)

    # km.score is negated so the stored value is positive.
    SSE = (-pd.DataFrame(SSE)).T
    SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
    ll = pd.DataFrame(ll).T
    ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)

    SSE.to_csv(out + 'SSE.csv')
    ll.to_csv(out + 'logliklihood.csv')
    # The cancer results keep their historical "Perm" file prefix.
    _panel_slice(acc, 'housing').to_csv(out + 'Housing acc.csv')
    _panel_slice(acc, 'cancer').to_csv(out + 'Perm acc.csv')
    _panel_slice(adjMI, 'housing').to_csv(out + 'Housing adjMI.csv')
    _panel_slice(adjMI, 'cancer').to_csv(out + 'Perm adjMI.csv')
    _panel_slice(silhouette, 'cancer').to_csv(out + 'Perm silhouette.csv')
    _panel_slice(completeness, 'cancer').to_csv(
        out + 'Perm completeness.csv')
    _panel_slice(homogeniety, 'cancer').to_csv(out + 'Perm homogeniety.csv')
    _panel_slice(silhouette, 'housing').to_csv(
        out + 'housing silhouette.csv')
    _panel_slice(completeness, 'housing').to_csv(
        out + 'housing completeness.csv')
    _panel_slice(homogeniety, 'housing').to_csv(
        out + 'housing homogeniety.csv')
# Report the fitted KMeans model's inertia and the labels it predicted.
print(model.inertia_)
print("Predicted: ", y)

# Plot the KMeans result, one scatter call per cluster id (0..3).
plt.title("KMeans")
for cluster_id in range(4):
    members = df[y == cluster_id]
    plt.scatter(members['x'], members['y'])
plt.show()

# Reference labels: four consecutive runs of 500 samples labelled 0, 1, 2, 3.
y_true = [label for label in range(4) for _ in range(500)]
print("Purity score for KMeans: ", purity_score(y_true, y))

# Fit a 4-component mixture model on the same data and plot it the same way.
model1 = gmm(n_components=4)
y1 = model1.fit_predict(df)
print("predicted: ", y1)
plt.title("GMM")
for cluster_id in range(4):
    members = df[y1 == cluster_id]
    plt.scatter(members['x'], members['y'])
plt.show()

# Compare both clusterings against the reference labels.
print("Purity score for gmm: ", purity_score(y_true, y1))
print("hs for KMeans ", hs(y_true, y))
print("hs for GMM ", hs(y_true, y1))
def _scores_by_k(nested, dataset):
    """Return one dataset's scores as a DataFrame (rows: label, columns: k).

    ``nested`` maps ``k -> dataset -> label -> value``; the result matches
    the former ``pd.Panel(nested).ix[:, :, dataset]`` without using the
    removed ``Panel`` / ``.ix`` APIs.
    """
    return pd.DataFrame({k: per_k[dataset] for k, per_k in nested.items()})


def _print_kmeans_report(k, y_true, labels, sil_coeff):
    """Print the homogeneity, completeness and silhouette lines for one fit."""
    print("Homogenity Score ,{}, Kmeans,".format(k), hs(y_true, labels))
    print("Completeness Score ,{} ,Kmeans,".format(k), cs(y_true, labels))
    print("For n_clusters={}, The Silhouette Coefficient is {}".format(
        k, sil_coeff))


def _time_pipeline(pipe, k, split, train_msg, test_msg, acc_msg):
    """Fit ``pipe`` on a train/test split and print timings and accuracy.

    ``split`` is ``(X_train, X_test, y_train, y_test)``; each ``*_msg`` is a
    format template taking ``k``.
    """
    X_train, X_test, y_train, y_test = split
    tick = time.perf_counter()
    pipe.fit(X_train, y_train)
    tock = time.perf_counter() - tick
    print(train_msg.format(k), ',', tock)
    tick = time.perf_counter()
    y_pred = pipe.predict(X_test)
    tock = time.perf_counter() - tick
    print(test_msg.format(k), ',', tock)
    print(acc_msg.format(k), ',', accuracy_score(y_test, y_pred))


def _grid_search_to_csv(step, clusterer, size_param, clusters, X, y, path,
                        **search_kwargs):
    """Grid-search a clusterer + MLP pipeline and dump ``cv_results_`` to CSV.

    ``size_param`` is the pipeline parameter key swept over ``clusters``
    (e.g. ``'km__n_clusters'``); ``search_kwargs`` go to ``GridSearchCV``.
    """
    grid = {
        size_param: clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch,
    }
    mlp = MLPClassifier(activation='relu', max_iter=2000,
                        early_stopping=True, random_state=5)
    pipe = Pipeline([(step, clusterer), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, **search_kwargs)
    gs.fit(X, y)
    pd.DataFrame(gs.cv_results_).to_csv(path)


def main_logic():
    """Run the Madelon / character clustering experiments and save results.

    Steps, in order:

    1. Load both datasets from ``datasets.hdf`` and standardise the features.
    2. For each cluster count, fit K-means and GMM on both datasets, record
       SSE, log-likelihood, ``cluster_acc``, ``ami`` and silhouette scores,
       and write them as CSV files.
    3. On a 70/30 split of the unscaled Madelon data, time a clusterer + MLP
       pipeline for each cluster count and print its test accuracy.
    4. Grid-search the clusterer + MLP pipelines on both datasets and write
       ``cv_results_`` to CSV.
    5. Project both datasets to 2-D with t-SNE and write the coordinates.

    All files are read from and written to ``./BASE/``.  Returns None.
    """
    out = './BASE/'
    # change the below value based on the readme.txt file instructions
    base = './BASE/'
    np.random.seed(0)

    madelon = pd.read_hdf(base + 'datasets.hdf', 'madelon')
    # ``axis`` is passed by keyword: pandas 2.0 no longer accepts it
    # positionally.
    madelon_X = madelon.drop('Class', axis=1).copy().values
    madelon_Y = madelon['Class'].copy().values

    character = pd.read_hdf(base + 'datasets.hdf', 'character')
    character_X = character.drop('Class', axis=1).copy().values
    character_Y = character['Class'].copy().values

    np.random.seed(0)
    clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]

    madelon_X = StandardScaler().fit_transform(madelon_X)
    character_X = StandardScaler().fit_transform(character_X)

    # Data for 1-3
    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    Silhouette_dict = defaultdict(dict)
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    def record(name, X, Y, k):
        """Fit both models on one dataset, store its scores, return silhouette."""
        km.fit(X)
        gmm.fit(X)
        SSE[k][name] = km.score(X)
        ll[k][name] = gmm.score(X)
        # Predict once per model and reuse the labels.
        # NOTE(review): assumes cluster_acc does not modify its inputs.
        km_pred = km.predict(X)
        gmm_pred = gmm.predict(X)
        # NOTE(review): only the K-means call passes ``k`` to cluster_acc;
        # kept as in the original, confirm this asymmetry is intended.
        acc[k][name]['Kmeans'] = cluster_acc(Y, km_pred, k)
        acc[k][name]['GMM'] = cluster_acc(Y, gmm_pred)
        adjMI[k][name]['Kmeans'] = ami(Y, km_pred)
        adjMI[k][name]['GMM'] = ami(Y, gmm_pred)
        sil = silhouette_score(X, km.labels_, metric='euclidean')
        Silhouette_dict[k][name] = sil
        return sil

    for j in clusters:
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # replacement (it reports wall-clock rather than CPU seconds).
        st = time.perf_counter()
        km.set_params(n_clusters=j)
        gmm.set_params(n_components=j)

        sil_coeff = record('Madelon', madelon_X, madelon_Y, j)
        # Report before km is refitted on the next dataset.
        _print_kmeans_report(j, madelon_Y, km.labels_, sil_coeff)

        sil_coeff = record('character', character_X, character_Y, j)
        print(j, time.perf_counter() - st)
        _print_kmeans_report(j, character_Y, km.labels_, sil_coeff)

    pd.DataFrame(Silhouette_dict).to_csv(out + 'Silhouette.csv')

    # km.score is negated so the stored value is positive.
    SSE = (-pd.DataFrame(SSE)).T
    SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
    ll = pd.DataFrame(ll).T
    ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)

    SSE.to_csv(out + 'SSE.csv')
    ll.to_csv(out + 'logliklihood.csv')
    _scores_by_k(acc, 'character').to_csv(out + 'character_acc.csv')
    _scores_by_k(acc, 'Madelon').to_csv(out + 'Madelon acc.csv')
    _scores_by_k(adjMI, 'character').to_csv(out + 'character_adjMI.csv')
    _scores_by_k(adjMI, 'Madelon').to_csv(out + 'Madelon adjMI.csv')

    # %% NN fit data (2,3)
    # NOTE(review): from here on the Madelon features are the *unscaled*
    # values again (as in the original, which re-read them from disk), while
    # character_X stays standardised. Confirm this is intended.
    madelon_X = madelon.drop('Class', axis=1).copy().values
    split = train_test_split(madelon_X, madelon_Y, test_size=0.3,
                             random_state=42)
    np.random.seed(0)

    for k in clusters:
        mlp = MLPClassifier(activation='relu', max_iter=2000,
                            early_stopping=True, random_state=5,
                            alpha=10**-5, hidden_layer_sizes=(62, 62),
                            verbose=0)
        km = kmeans(random_state=5, n_clusters=k)
        _time_pipeline(
            Pipeline([('km', km), ('NN', mlp)]), k, split,
            "Traning time , {}, k means dataset",
            "Testing time , {}, k means component",
            "Accuracy Score , {}, kmeans Madelon",
        )

        mlp = MLPClassifier(activation='relu', max_iter=2000,
                            early_stopping=True, random_state=5, verbose=0,
                            alpha=10**-5, hidden_layer_sizes=(62, 62))
        gmm = myGMM(random_state=43, n_components=k)
        _time_pipeline(
            Pipeline([('gmm', gmm), ('NN', mlp)]), k, split,
            "Traning time , {}, gmm dataset",
            "Testing time , {}, gmm means component",
            "Accuracy Score , {}, gmm means Madelon",
        )

    # The first search uses GridSearchCV's default ``cv``; the rest use cv=5
    # (kept as in the original).
    _grid_search_to_csv('km', kmeans(random_state=5), 'km__n_clusters',
                        clusters, madelon_X, madelon_Y,
                        out + 'Madelon cluster Kmeans.csv')
    _grid_search_to_csv('gmm', myGMM(random_state=5), 'gmm__n_components',
                        clusters, madelon_X, madelon_Y,
                        out + 'Madelon cluster GMM.csv', cv=5)
    _grid_search_to_csv('km', kmeans(random_state=5), 'km__n_clusters',
                        clusters, character_X, character_Y,
                        out + 'character_cluster_Kmeans.csv', cv=5)
    _grid_search_to_csv('gmm', myGMM(random_state=5), 'gmm__n_components',
                        clusters, character_X, character_Y,
                        out + 'character_cluster_GMM.csv', cv=5)

    # %% For chart 4/5
    madelonX2D = TSNE(verbose=10, random_state=5).fit_transform(madelon_X)
    character_X2D = TSNE(verbose=10,
                         random_state=5).fit_transform(character_X)

    # Append the class label as a third column next to the 2-D coordinates.
    madelon2D = pd.DataFrame(
        np.hstack((madelonX2D, np.atleast_2d(madelon_Y).T)),
        columns=['x', 'y', 'target'])
    character2D = pd.DataFrame(
        np.hstack((character_X2D, np.atleast_2d(character_Y).T)),
        columns=['x', 'y', 'target'])

    madelon2D.to_csv(out + 'madelon2D.csv')
    character2D.to_csv(out + 'character2D.csv')