import numpy as np
import pandas as pd
from sklearn.metrics import silhouette_score as ss  # assumed alias, based on usage


def set_labels(self, data, min_values, max_values, learning_data, fuzzy):
    # Start from the fuzzy clustering labels, then overwrite each point's
    # label with the index of the per-cluster bounding box it falls into.
    data['labels'] = pd.Series(fuzzy.labels_)
    for i in range(len(data.values)):
        for j in range(len(max_values)):
            x, y, z = data.values[i]
            max1, max2 = max_values[j]
            min1, min2 = min_values[j]
            if min1 < x < max1 and min2 < y < max2:
                data.at[i, 'labels'] = j
            # Points beyond a box's maxima keep the largest label seen so far.
            if x > max1 and y > max2:
                data.at[i, 'labels'] = np.max(data['labels'])
    score = ss(data[['param_1', 'param_2']], labels=data['labels'])
    return score
import numpy as np
from time import time
from sklearn.cluster import KMeans
# Assumed metric aliases, based on usage:
from sklearn.metrics import (adjusted_rand_score as ari,
                             normalized_mutual_info_score as nmi,
                             silhouette_score as ss,
                             calinski_harabasz_score as vrc,
                             davies_bouldin_score as dbs)


def run_trial(X, labels, k):
    # Accumulate any scoring errors inside literal quotes so the whole
    # string survives as a single CSV field.
    errors = '"'
    # Run k-means and time the fit.
    start = time()
    db = KMeans(n_clusters=k, n_jobs=12)  # n_jobs was removed from KMeans in scikit-learn 1.0
    pred_labels = db.fit_predict(X)
    elapsed = time() - start
    try:
        ari_score = ari(pred_labels, labels)
    except Exception as e:
        errors += str(e) + '; '
        ari_score = np.nan
    try:
        nmi_score = nmi(pred_labels, labels, average_method='arithmetic')
    except Exception as e:
        errors += str(e) + '; '
        nmi_score = np.nan
    try:
        ss_score = ss(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        ss_score = np.nan
    try:
        vrc_score = vrc(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        vrc_score = np.nan
    try:
        dbs_score = dbs(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        dbs_score = np.nan
    errors += '"'
    return [k, elapsed, ari_score, nmi_score, ss_score, vrc_score, dbs_score, errors]
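# A hedged usage sketch (not part of the original example): make_blobs and
# the column names below are assumptions for illustration only.
import pandas as pd
from sklearn.datasets import make_blobs

X_demo, y_demo = make_blobs(n_samples=500, centers=4, random_state=0)
rows = [run_trial(X_demo, y_demo, k) for k in range(2, 8)]
results = pd.DataFrame(rows, columns=['k', 'elapsed', 'ari', 'nmi',
                                      'silhouette', 'vrc', 'dbs', 'errors'])
print(results.sort_values('silhouette', ascending=False).head())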
import sys
import pickle

import numpy as np
import scipy.sparse
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import silhouette_score as ss  # assumed alias, based on usage

with open(sys.argv[1], 'rb') as fh:
    x = pickle.load(fh)

# Drop rows whose cell type is excluded (EXCLUDED_TYPES is defined elsewhere
# in the source project).
to_remove = []
for i in range(len(x.cell_type)):
    if x.cell_type[i] in EXCLUDED_TYPES:
        to_remove.append(i)
x.drop(index=to_remove, inplace=True)

# Densify the feature matrix if its cells hold sparse matrices.
X = x.drop('cell_type', axis=1).values
if scipy.sparse.issparse(X[0][0]):
    X = np.array(np.concatenate([i[0].todense() for i in X]))
labels = LabelEncoder().fit_transform(x.cell_type)

# Silhouette with plain Euclidean distance.
try:
    ss_euc = str(ss(X, labels, metric='euclidean'))
except Exception as e:
    print(e)
    ss_euc = str(np.nan)

# Standardised Euclidean needs the per-feature variance vector V.
try:
    ss_seu = str(
        ss(X, labels, metric='seuclidean',
           V=np.var(X, axis=0, ddof=1, dtype=np.double)))
except Exception as e:
    print(e)
    ss_seu = str(np.nan)
data_pr = data[["param_1", "param_2"]] plt.scatter(data_pr.param_1, data_pr.param_2) kmeans = km(init='k-means++', n_clusters=3, random_state=0).fit(data_pr.as_matrix()) # data_pr['labels'] =pd.Series(kmeans.labels_) # data_pr.plot.scatter(x='b',y='c',c='labels', colormap='viridis') data_pr1 = data_pr.copy() scores = [] for k in range(2, 10): kmeans = km(init='k-means++', n_clusters=k, random_state=0).fit(data_pr.as_matrix()) data_pr1['labels'] = pd.Series(kmeans.labels_) print(len(kmeans.labels_)) data_pr1.plot.scatter(x='b', y='c', c='labels', colormap='viridis') scores.append(ss(data_pr1[['b', 'c']], labels=data_pr1['labels'])) print(data_pr1) n = [i for i in range(2, 10)] plt.figure() plt.plot(n, scores) plt.show() # zmiana txt do csv # param1 = list() # param2 = list() # depth = list() # all_data = list() # # filepath = "files\\sdmt3.txt"
km.set_params(n_clusters=k)
gmm.set_params(n_components=k)
km.fit(faultsX)
gmm.fit(faultsX)

# Faults dataset
# Visual measurements

# Sum of squared errors for k-means
SSE[k]['Faults'] = km.score(faultsX)
# Log-likelihood for GMM
ll[k]['Faults'] = gmm.score(faultsX)

# Silhouette score: the best value is 1 and the worst value is -1.
# Silhouette analysis can be used to study the separation distance
# between the resulting clusters.
SS[k]['Faults']['Kmeans'] = ss(faultsX, km.predict(faultsX))
SS[k]['Faults']['GMM'] = ss(faultsX, gmm.predict(faultsX))

# Cluster accuracy
acc[k]['Faults']['Kmeans'] = cluster_acc(faultsY, km.predict(faultsX))
acc[k]['Faults']['GMM'] = cluster_acc(faultsY, gmm.predict(faultsX))

# Adjusted mutual information
adjMI[k]['Faults']['Kmeans'] = ami(faultsY, km.predict(faultsX))
adjMI[k]['Faults']['GMM'] = ami(faultsY, gmm.predict(faultsX))

# Breast Cancer dataset
km.fit(bcX)
gmm.fit(bcX)
SSE[k]['BreastC'] = km.score(bcX)
ll[k]['BreastC'] = gmm.score(bcX)
SS[k]['BreastC']['Kmeans'] = ss(bcX, km.predict(bcX))
from collections import defaultdict
from time import perf_counter  # time.clock() was removed in Python 3.8

import pandas as pd
# Assumed aliases, based on usage; cluster_acc is a helper defined elsewhere
# in the source project.
from sklearn.cluster import KMeans as kmeans
from sklearn.mixture import GaussianMixture as GMM
from sklearn.metrics import (adjusted_mutual_info_score as ami,
                             completeness_score as cs,
                             homogeneity_score as hs,
                             silhouette_score as ss)


def run_clustering(out, cancer_x, cancer_y, housing_x, housing_y):
    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)
    silhouette = defaultdict(lambda: defaultdict(dict))
    completeness = defaultdict(lambda: defaultdict(dict))
    homogeneity = defaultdict(lambda: defaultdict(dict))

    st = perf_counter()
    for k in range(2, 20):
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)

        km.fit(cancer_x)
        gmm.fit(cancer_x)
        SSE[k]['cancer'] = km.score(cancer_x)
        ll[k]['cancer'] = gmm.score(cancer_x)
        acc[k]['cancer']['Kmeans'] = cluster_acc(cancer_y, km.predict(cancer_x))
        acc[k]['cancer']['GMM'] = cluster_acc(cancer_y, gmm.predict(cancer_x))
        adjMI[k]['cancer']['Kmeans'] = ami(cancer_y, km.predict(cancer_x))
        adjMI[k]['cancer']['GMM'] = ami(cancer_y, gmm.predict(cancer_x))
        silhouette[k]['cancer']['Kmeans Silhouette'] = ss(cancer_x, km.predict(cancer_x))
        silhouette[k]['cancer']['GMM Silhouette'] = ss(cancer_x, gmm.predict(cancer_x))
        completeness[k]['cancer']['Kmeans Completeness'] = cs(cancer_y, km.predict(cancer_x))
        completeness[k]['cancer']['GMM Completeness'] = cs(cancer_y, gmm.predict(cancer_x))
        homogeneity[k]['cancer']['Kmeans Homogeneity'] = hs(cancer_y, km.predict(cancer_x))
        homogeneity[k]['cancer']['GMM Homogeneity'] = hs(cancer_y, gmm.predict(cancer_x))

        km.fit(housing_x)
        gmm.fit(housing_x)
        SSE[k]['housing'] = km.score(housing_x)
        ll[k]['housing'] = gmm.score(housing_x)
        acc[k]['housing']['Kmeans'] = cluster_acc(housing_y, km.predict(housing_x))
        acc[k]['housing']['GMM'] = cluster_acc(housing_y, gmm.predict(housing_x))
        adjMI[k]['housing']['Kmeans'] = ami(housing_y, km.predict(housing_x))
        adjMI[k]['housing']['GMM'] = ami(housing_y, gmm.predict(housing_x))
        silhouette[k]['housing']['Kmeans Silhouette'] = ss(housing_x, km.predict(housing_x))
        silhouette[k]['housing']['GMM Silhouette'] = ss(housing_x, gmm.predict(housing_x))
        completeness[k]['housing']['Kmeans Completeness'] = cs(housing_y, km.predict(housing_x))
        completeness[k]['housing']['GMM Completeness'] = cs(housing_y, gmm.predict(housing_x))
        homogeneity[k]['housing']['Kmeans Homogeneity'] = hs(housing_y, km.predict(housing_x))
        homogeneity[k]['housing']['GMM Homogeneity'] = hs(housing_y, gmm.predict(housing_x))
        print(k, perf_counter() - st)

    # km.score() returns the negative within-cluster SSE, hence the sign flip.
    SSE = (-pd.DataFrame(SSE)).T
    SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
    ll = pd.DataFrame(ll).T
    ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)

    # pd.Panel and .ix were removed from pandas; flatten the nested dicts
    # into one DataFrame per dataset instead (one row per k).
    def to_frame(d, dataset):
        return pd.DataFrame({k: v[dataset] for k, v in d.items()}).T

    SSE.to_csv(out + 'SSE.csv')
    ll.to_csv(out + 'loglikelihood.csv')
    to_frame(acc, 'housing').to_csv(out + 'Housing acc.csv')
    to_frame(acc, 'cancer').to_csv(out + 'Perm acc.csv')
    to_frame(adjMI, 'housing').to_csv(out + 'Housing adjMI.csv')
    to_frame(adjMI, 'cancer').to_csv(out + 'Perm adjMI.csv')
    to_frame(silhouette, 'cancer').to_csv(out + 'Perm silhouette.csv')
    to_frame(completeness, 'cancer').to_csv(out + 'Perm completeness.csv')
    to_frame(homogeneity, 'cancer').to_csv(out + 'Perm homogeneity.csv')
    to_frame(silhouette, 'housing').to_csv(out + 'housing silhouette.csv')
    to_frame(completeness, 'housing').to_csv(out + 'housing completeness.csv')
    to_frame(homogeneity, 'housing').to_csv(out + 'housing homogeneity.csv')
import numpy as np
from time import time
from sklearn.cluster import DBSCAN
# Assumed metric aliases, based on usage:
from sklearn.metrics import (adjusted_rand_score as ari,
                             normalized_mutual_info_score as nmi,
                             silhouette_score as ss,
                             calinski_harabasz_score as vrc,
                             davies_bouldin_score as dbs)


def run_trial(X, labels, eps, minPts, metric, V):
    errors = '"'
    # Run our DBSCAN; standardised Euclidean needs the variance vector V.
    start = time()
    if metric == 'seuclidean':
        db = DBSCAN(eps=eps, min_samples=minPts, metric=metric,
                    metric_params={'V': V}, n_jobs=6)
    else:
        db = DBSCAN(eps=eps, min_samples=minPts, metric=metric, n_jobs=6)
    pred_labels = db.fit_predict(X)
    elapsed = time() - start

    perc_noise = np.sum(pred_labels == -1) / len(pred_labels)
    n_clust = pred_labels.max()  # clusters are labelled 0..n-1, noise is -1

    # Remove noisy points
    clean_idx = np.where(pred_labels != -1)
    nn_preds = pred_labels[clean_idx]
    nn_labels = labels[clean_idx]
    nn_X = X[clean_idx]

    try:
        ari_score = ari(pred_labels, labels)
    except Exception as e:
        errors += str(e) + '; '
        ari_score = np.nan
    try:
        nmi_score = nmi(pred_labels, labels, average_method='arithmetic')
    except Exception as e:
        errors += str(e) + '; '
        nmi_score = np.nan
    try:
        if metric == 'seuclidean':
            ss_score = ss(X, pred_labels, metric=metric, V=V)
        else:
            ss_score = ss(X, pred_labels, metric=metric)
    except Exception as e:
        errors += str(e) + '; '
        ss_score = np.nan
    try:
        vrc_score = vrc(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        vrc_score = np.nan
    try:
        dbs_score = dbs(X, pred_labels)
    except Exception as e:
        errors += str(e) + '; '
        dbs_score = np.nan

    # Repeat every metric on the noise-free subset.
    try:
        nn_ari_score = ari(nn_preds, nn_labels)
    except Exception as e:
        errors += str(e) + '; '
        nn_ari_score = np.nan
    try:
        nn_nmi_score = nmi(nn_preds, nn_labels, average_method='arithmetic')
    except Exception as e:
        errors += str(e) + '; '
        nn_nmi_score = np.nan
    try:
        if metric == 'seuclidean':
            nn_ss_score = ss(nn_X, nn_preds, metric=metric, V=V)
        else:
            nn_ss_score = ss(nn_X, nn_preds, metric=metric)
    except Exception as e:
        errors += str(e) + '; '
        nn_ss_score = np.nan
    try:
        nn_vrc_score = vrc(nn_X, nn_preds)
    except Exception as e:
        errors += str(e) + '; '
        nn_vrc_score = np.nan
    try:
        nn_dbs_score = dbs(nn_X, nn_preds)
    except Exception as e:
        errors += str(e) + '; '
        nn_dbs_score = np.nan
    errors += '"'
    return [
        metric, eps, minPts, n_clust, perc_noise, elapsed, ari_score,
        nn_ari_score, nmi_score, nn_nmi_score, ss_score, nn_ss_score,
        vrc_score, nn_vrc_score, dbs_score, nn_dbs_score, errors
    ]
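# A hedged driver sketch (not from the original): sweep a small eps/minPts
# grid on synthetic blobs and tabulate the rows that run_trial returns.
import pandas as pd
from sklearn.datasets import make_blobs

Xd, yd = make_blobs(n_samples=400, centers=3, random_state=0)
Vd = np.var(Xd, axis=0, ddof=1, dtype=np.double)
rows = [run_trial(Xd, yd, eps, mp, 'euclidean', Vd)
        for eps in (0.3, 0.5, 1.0) for mp in (5, 15)]
cols = ['metric', 'eps', 'minPts', 'n_clust', 'perc_noise', 'elapsed',
        'ari', 'nn_ari', 'nmi', 'nn_nmi', 'ss', 'nn_ss',
        'vrc', 'nn_vrc', 'dbs', 'nn_dbs', 'errors']
print(pd.DataFrame(rows, columns=cols))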
import pandas as pd
from sklearn.metrics import silhouette_score as ss  # assumed alias, based on usage


def cluster_quality(self, data, labels):
    # Attach the labels and score the two-parameter clustering.
    data['labels'] = pd.Series(labels)
    score = ss(data[['param_1', 'param_2']], labels=data['labels'])
    return score
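# A hedged usage sketch (not in the original): the method expects a DataFrame
# with 'param_1'/'param_2' columns and one label per row; `analyzer` stands in
# for whatever object the method is defined on.
import numpy as np

demo = pd.DataFrame({'param_1': [0.0, 0.1, 5.0, 5.2],
                     'param_2': [0.0, 0.2, 5.1, 4.9]})
demo_labels = np.array([0, 0, 1, 1])
# print(analyzer.cluster_quality(demo, demo_labels))  # near 1.0 for these well-separated points
print(ss(demo[['param_1', 'param_2']], labels=demo_labels))  # the same computation, inlined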
# Context (assumed): data_handler_1 and DataHandler come from the surrounding
# project; FuzzyKMeans is the fuzzy k-means estimator the project uses; np, pd
# and plt are the usual aliases, and ss is sklearn's silhouette_score.
def sklearn(self):
    # Build the learning set from the first data handler's two parameters.
    all_data = list()
    for x in range(len(data_handler_1.param1)):
        all_data.append([data_handler_1.param1[x], data_handler_1.param2[x]])
    data = np.array(all_data)
    datapd = pd.DataFrame(data)

    scores = []
    # for k in range(2, 10):
    k = 4
    fuzzy_kmeans = FuzzyKMeans(k=k, m=4, max_iter=300)
    fuzzy_kmeans.fit(datapd)
    datapd['labels'] = pd.Series(fuzzy_kmeans.labels_)
    score = ss(datapd[[0, 1]], labels=datapd['labels'])
    scores.append(score)

    # datapd.plot.scatter(x=0, y=1, c='labels', colormap='viridis')
    # plt.xlabel("Param 1")
    # plt.ylabel("Param 2")
    # plt.title(f'K = {k}, Silhouette score = {score}')
    for center in fuzzy_kmeans.cluster_centers_:
        plt.plot(center[0], center[1], 'ro')

    # Group the learning data by cluster label.
    group_data = datapd.groupby('labels')
    # print(datapd)
    # print(group_data[1].get_group(1))

    # n = [i for i in range(2, 10)]
    # plt.figure()
    # plt.plot(n, scores)
    # plt.xlabel("K")
    # plt.ylabel("Silhouette score")
    # plt.show()

    # Load the second data set (the original read from data_handler_1 here,
    # presumably a copy-paste slip).
    second_data = DataHandler()
    second_data.open_file("files\\sdmt3.txt")
    second = list()
    for x in range(len(second_data.param1)):
        second.append([second_data.param1[x], second_data.param2[x]])
    sec_data = np.array(second)
    sec_datapd = pd.DataFrame(sec_data)
    scores = []
    # print(second_data.param1)

    # Per-cluster bounding boxes (min/max of both parameters).
    max_values = []
    min_values = []
    for group_label in datapd['labels'].unique():
        max_param1 = np.max(group_data[0].get_group(group_label))
        max_param2 = np.max(group_data[1].get_group(group_label))
        min_param1 = np.min(group_data[0].get_group(group_label))
        min_param2 = np.min(group_data[1].get_group(group_label))
        max_values.append((max_param1, max_param2))
        min_values.append((min_param1, min_param2))
    # print(max_values)
    # print(min_values)
    # print(sec_datapd)

    sec_datapd['labels'] = pd.Series(fuzzy_kmeans.labels_)
    # DataFrame.set_value() was removed from pandas; .at is the replacement.
    sec_datapd.at[2, 'labels'] = 50
    print(sec_datapd)
    # Relabel each point by the bounding box it falls into.
    for i in range(40):
        for j in range(len(max_values)):
            x, y, z = sec_datapd.values[i]
            max1, max2 = max_values[j]
            min1, min2 = min_values[j]
            if min1 < x < max1 and min2 < y < max2:
                sec_datapd.at[i, 'labels'] = j

    sec_score = ss(sec_datapd[[0, 1]], labels=sec_datapd['labels'])
    print(sec_datapd)
    sec_datapd.plot.scatter(x=0, y=1, c='labels', colormap='viridis')
    plt.xlabel("Param 1")
    plt.ylabel("Param 2")
    plt.title(f'K = {k}, Silhouette score = {sec_score}')
    plt.show()
### Please run the entire batch of code below at once.

## Plotting the elbow and silhouette plots (r is the feature matrix,
## defined earlier in the source).
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples

fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(16, 6)

clusters = np.arange(2, 11, 1)
dists = []
means = []
stdevs = []
for l in clusters:
    km = KMeans(n_clusters=l)
    labels = km.fit_predict(r)
    # silhouette_samples() returns one score per point; silhouette_score()
    # would return only their mean, making the std below always zero.
    ss_vals = silhouette_samples(r, labels, metric="euclidean")
    ss_avg = np.mean(ss_vals)
    ss_std = np.std(ss_vals)
    dists.append(km.inertia_)
    means.append(ss_avg)
    stdevs.append(ss_std)

ax1.plot(clusters, dists, c="black", linestyle=":", marker="+", markersize=10)
ax1.set_xlabel("k")
ax1.set_ylabel("Distortion (Within-Cluster SSE)")
ax1.tick_params(axis='y', which='both', left=False, labelleft=False)
ax1.set_title("Elbow Method")

ax2.scatter(clusters, means, c="black")
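# The original snippet stops at the scatter above. A plausible completion
# (an assumption, not the source code) finishes the silhouette panel with
# the collected standard deviations as error bars:
ax2.errorbar(clusters, means, yerr=stdevs, c="black", linestyle=":")
ax2.set_xlabel("k")
ax2.set_ylabel("Mean Silhouette Coefficient")
ax2.set_title("Silhouette Analysis")
plt.show()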