def clustering_rfp(cluster_range, RFE_component_diabetes, dataset, dir):
    """Cluster RFE-reduced features with k-means and EM and plot the results.

    The features (every column of ``dataset.data`` except the last) are
    standardized and then reduced with recursive feature elimination driven
    by a linear ``SVR``, once for every feature count in
    ``RFE_component_diabetes``. Each reduced matrix is clustered with
    ``KMeans`` and ``GaussianMixture`` for every cluster count in
    ``cluster_range``. Cluster accuracy (``common_utils.get_cluster_accuracy``)
    and clustering time in seconds are recorded per (feature count, cluster
    count) and passed to the ``common_utils`` plotting helpers.
    ``nn_experiment`` is run once per feature count on the reduced dataset.

    Args:
        cluster_range: Iterable of cluster counts to evaluate.
        RFE_component_diabetes: Iterable of feature counts for RFE to keep.
        dataset: Object exposing ``data`` (a DataFrame whose last column is
            used as the integer label vector). It is NOT copied: its ``x``
            and ``y`` attributes are overwritten with the reduced features
            and the labels, and stay overwritten after this call returns.
        dir: Output location forwarded unchanged to the plotting helpers.

    Side effects:
        Rebinds the module globals ``diabetes_rp``, ``x_rp`` and
        ``diabetes_dataset_rp`` on every feature count.
    """
    # These names are published at module level; kept so that any code
    # reading them after the call keeps working.
    global diabetes_rp, x_rp, diabetes_dataset_rp

    df = dataset.data
    y = df.iloc[:, -1].astype('int')
    x = StandardScaler().fit_transform(df.iloc[:, 0:-1])

    estimator = SVR(kernel="linear")

    # Collected per feature count; currently neither plotted nor returned.
    NN_RFE_accuracy = defaultdict(dict)
    kmeans_accuracy_RFE = defaultdict(dict)
    kmeans_time_RFE = defaultdict(dict)
    em_accuracy_RFE = defaultdict(dict)
    em_time_RFE = defaultdict(dict)

    for RFE_comp in RFE_component_diabetes:
        # RFE with an SVR estimator is expensive and deterministic, so fit it
        # once and share the result between clustering and the NN experiment.
        diabetes_rp = RFE(estimator, n_features_to_select=RFE_comp)
        x_rp = diabetes_rp.fit_transform(x, y)

        # Aliases (does not copy) the caller's dataset object.
        diabetes_dataset_rp = dataset
        diabetes_dataset_rp.x = x_rp
        diabetes_dataset_rp.y = y

        for cluster in cluster_range:
            # k-means: time only the clustering itself, not the scoring.
            start = datetime.now()
            kmeans_labels = KMeans(
                n_clusters=cluster, random_state=0).fit_predict(x_rp)
            end = datetime.now()
            kmeans_time_RFE[RFE_comp][cluster] = (end - start).total_seconds()
            kmeans_accuracy_RFE[RFE_comp][cluster] = (
                common_utils.get_cluster_accuracy(y, kmeans_labels))

            # EM: seeded like k-means so repeated runs give the same plots.
            start = datetime.now()
            em_labels = GaussianMixture(
                n_components=cluster, random_state=0).fit_predict(x_rp)
            end = datetime.now()
            em_time_RFE[RFE_comp][cluster] = (end - start).total_seconds()
            em_accuracy_RFE[RFE_comp][cluster] = (
                common_utils.get_cluster_accuracy(y, em_labels))

        NN_RFE_accuracy[RFE_comp] = nn_experiment(diabetes_dataset_rp)

    common_utils.plot_feature_transformation_time(
        kmeans_time_RFE, "k-means RFE clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        kmeans_accuracy_RFE, "k-means RFE clusters vs accuracy", dir)
    common_utils.plot_feature_transformation_time(
        em_time_RFE, "EM RFE clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        em_accuracy_RFE, "EM RFE clusters vs accuracy", dir)
def clustering_pca(cluster_range, _pca_components_range, dataset, dir):
    """Cluster PCA-reduced features with k-means and EM and plot the results.

    First writes a scree CSV (explained variance per principal component,
    computed on ``dataset.x`` as passed in) and plots it. Then the features
    (every column of ``dataset.data`` except the last) are standardized and
    projected with ``PCA`` for every component count in
    ``_pca_components_range``. Each projection is clustered with ``KMeans``
    and ``GaussianMixture`` for every cluster count in ``cluster_range``.
    Cluster accuracy (``common_utils.get_cluster_accuracy``) and clustering
    time in seconds are recorded and passed to the ``common_utils`` plotting
    helpers. ``nn_experiment`` is run once per component count.

    Args:
        cluster_range: Iterable of cluster counts to evaluate.
        _pca_components_range: Iterable of PCA component counts. PCA cannot
            produce more components than there are input features.
        dataset: Object exposing ``data`` (a DataFrame whose last column is
            used as the integer label vector), ``x`` and ``dataset_name``.
            It is NOT copied: its ``x`` and ``y`` attributes are overwritten
            with the projected features and the labels, and stay overwritten
            after this call returns.
        dir: Output prefix. It is concatenated directly with the CSV file
            name, so it must already end with a path separator.

    Side effects:
        Writes ``<dir><dataset_name>_pca_scree.csv`` and rebinds the module
        globals ``_pca``, ``df``, ``x``, ``y``, ``x_pca`` and
        ``_dataset_pca``.
    """
    # These names are published at module level; kept so that any code
    # reading them after the call keeps working.
    global _pca, df, x, y, x_pca, _dataset_pca

    # Collected per component count; currently neither plotted nor returned.
    NN_PCA_accuracy = defaultdict(dict)
    kmeans_accuracy_pca = defaultdict(dict)
    kmeans_time_pca = defaultdict(dict)
    _accuracy_em_PCA = defaultdict(dict)
    _time_em_PCA = defaultdict(dict)

    # Scree data: explained variance of a full PCA on the incoming dataset.x.
    _pca = PCA(random_state=0)
    _pca.fit(dataset.x)
    # Keep at most 500 components. Slicing the values (not just the index)
    # keeps both the same length, which pd.Series requires.
    variance = _pca.explained_variance_[:500]
    scree = pd.Series(data=variance, index=range(1, len(variance) + 1))
    scree_path = dir + '{}_pca_scree.csv'.format(dataset.dataset_name)
    scree.to_csv(scree_path)
    # NOTE(review): the CSV is indexed by component number, yet the x-axis
    # label passed here is 'Number of Clusters' — confirm the intended label.
    common_utils.plot_dim_red_scores(
        scree_path, dir, dataset.dataset_name, "PCA",
        multiple_runs=False, xlabel='Number of Clusters', ylabel=None)

    df = dataset.data
    y = df.iloc[:, -1].astype('int')
    x = StandardScaler().fit_transform(df.iloc[:, 0:-1])

    for component in _pca_components_range:
        _pca = PCA(n_components=component, random_state=0)
        x_pca = _pca.fit_transform(x)

        # Aliases (does not copy) the caller's dataset object.
        _dataset_pca = dataset
        _dataset_pca.x = x_pca
        _dataset_pca.y = y

        for cluster in cluster_range:
            # k-means: time only the clustering itself, not the scoring.
            start = datetime.now()
            kmeans_labels = KMeans(
                n_clusters=cluster, random_state=0).fit_predict(x_pca)
            end = datetime.now()
            kmeans_time_pca[component][cluster] = (end - start).total_seconds()
            kmeans_accuracy_pca[component][cluster] = (
                common_utils.get_cluster_accuracy(y, kmeans_labels))

            # EM: seeded like k-means so repeated runs give the same plots.
            start = datetime.now()
            em_labels = GaussianMixture(
                n_components=cluster, random_state=0).fit_predict(x_pca)
            end = datetime.now()
            _time_em_PCA[component][cluster] = (end - start).total_seconds()
            _accuracy_em_PCA[component][cluster] = (
                common_utils.get_cluster_accuracy(y, em_labels))

        NN_PCA_accuracy[component] = nn_experiment(_dataset_pca)

    common_utils.plot_feature_transformation_time(
        kmeans_time_pca, "k-means PCA clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        kmeans_accuracy_pca, "k-means PCA clusters vs accuracy", dir)
    common_utils.plot_feature_transformation_time(
        _time_em_PCA, "EM PCA clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        _accuracy_em_PCA, "EM PCA clusters vs accuracy", dir)
def clustering_ica(cluster_range, ICA_component_, dataset, dir):
    """Cluster ICA-reduced features with k-means and EM and plot the results.

    First, for every component count in ``ICA_component_``, ``FastICA`` is
    fitted on ``dataset.x`` as passed in and the mean absolute kurtosis of
    the resulting components is written to a CSV and plotted. Then the
    features (every column of ``dataset.data`` except the last) are
    standardized and transformed with ``FastICA`` for every component count.
    Each transformed matrix is clustered with ``KMeans`` and
    ``GaussianMixture`` for every cluster count in ``cluster_range``.
    Cluster accuracy (``common_utils.get_cluster_accuracy``) and clustering
    time in seconds are recorded and passed to the ``common_utils`` plotting
    helpers. ``nn_experiment`` is run once per component count.

    Args:
        cluster_range: Iterable of cluster counts to evaluate.
        ICA_component_: Re-iterable collection of ICA component counts (it is
            traversed twice).
        dataset: Object exposing ``data`` (a DataFrame whose last column is
            used as the integer label vector), ``x`` and ``dataset_name``.
            It is NOT copied: its ``x`` and ``y`` attributes are overwritten
            with the transformed features and the labels, and stay
            overwritten after this call returns.
        dir: Output prefix. It is concatenated directly with the CSV file
            name, so it must already end with a path separator.

    Side effects:
        Writes ``<dir><dataset_name>_ica_scree.csv``, prints a kurtosis
        Series to stdout, and rebinds the module globals ``_ica``, ``x_ica``
        and ``_dataset_ica``.
    """
    # These names are published at module level; kept so that any code
    # reading them after the call keeps working.
    global _ica, x_ica, _dataset_ica

    df = dataset.data
    y = df.iloc[:, -1].astype('int')
    x = StandardScaler().fit_transform(df.iloc[:, 0:-1])

    # Collected per component count; currently neither plotted nor returned.
    NN_ICA_accuracy = defaultdict(dict)
    kmeans_accuracy_ICA = defaultdict(dict)
    kmeans_time_ICA = defaultdict(dict)
    em_accuracy_ICA = defaultdict(dict)
    em_time_ICA = defaultdict(dict)

    # Kurtosis "scree": mean |kurtosis| of the components per component count,
    # computed on the incoming dataset.x.
    _data_ICA = FastICA(random_state=0)
    kurt = {}
    for dim in ICA_component_:
        _data_ICA.set_params(n_components=dim)
        components = pd.DataFrame(_data_ICA.fit_transform(dataset.x))
        kurt[dim] = components.kurt(axis=0).abs().mean()
    kurt = pd.Series(kurt)
    scree_path = dir + '{}_ica_scree.csv'.format(dataset.dataset_name)
    kurt.to_csv(scree_path)
    # NOTE(review): the CSV is indexed by component count, yet the x-axis
    # label passed here is 'Number of Clusters' — confirm the intended label.
    common_utils.plot_dim_red_scores(
        scree_path, dir, dataset.dataset_name, "ICA",
        multiple_runs=False, xlabel='Number of Clusters', ylabel=None)

    # Per-component kurtosis on the standardized features, using whatever
    # n_components the loop above left on the estimator.
    _data_ICA_df = pd.DataFrame(data=_data_ICA.fit_transform(x))
    print(_data_ICA_df.kurt())

    for ICA_comp in ICA_component_:
        # FastICA is seeded, so one fit serves both the clustering and the
        # NN experiment.
        _ica = FastICA(n_components=ICA_comp, random_state=0)
        x_ica = _ica.fit_transform(x)

        # Aliases (does not copy) the caller's dataset object.
        _dataset_ica = dataset
        _dataset_ica.x = x_ica
        _dataset_ica.y = y

        for cluster in cluster_range:
            # k-means: time only the clustering itself, not the scoring.
            start = datetime.now()
            kmeans_labels = KMeans(
                n_clusters=cluster, random_state=0).fit_predict(x_ica)
            end = datetime.now()
            kmeans_time_ICA[ICA_comp][cluster] = (end - start).total_seconds()
            kmeans_accuracy_ICA[ICA_comp][cluster] = (
                common_utils.get_cluster_accuracy(y, kmeans_labels))

            # EM: seeded like k-means so repeated runs give the same plots.
            start = datetime.now()
            em_labels = GaussianMixture(
                n_components=cluster, random_state=0).fit_predict(x_ica)
            end = datetime.now()
            em_time_ICA[ICA_comp][cluster] = (end - start).total_seconds()
            em_accuracy_ICA[ICA_comp][cluster] = (
                common_utils.get_cluster_accuracy(y, em_labels))

        NN_ICA_accuracy[ICA_comp] = nn_experiment(_dataset_ica)

    common_utils.plot_feature_transformation_time(
        kmeans_time_ICA, "k-means ICA clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        kmeans_accuracy_ICA, "k-means ICA clusters vs accuracy", dir)
    common_utils.plot_feature_transformation_time(
        em_time_ICA, "EM ICA clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        em_accuracy_ICA, "EM ICA clusters vs accuracy", dir)
def clustering_rp(cluster_range, RP_component, dataset, dir):
    """Cluster randomly projected features with k-means and EM and plot them.

    First, for ten seeds (0-9) and every component count in ``RP_component``,
    a ``GaussianRandomProjection`` is fitted on ``dataset.x`` as passed in.
    Its ``pairwise_dist_corr`` and ``reconstruction_error`` scores are
    written to two CSV files and plotted. Then the features (every column of
    ``dataset.data`` except the last) are standardized and projected (seed 0)
    for every component count. Each projection is clustered with ``KMeans``
    and ``GaussianMixture`` for every cluster count in ``cluster_range``.
    Cluster accuracy (``common_utils.get_cluster_accuracy``) and clustering
    time in seconds are recorded and passed to the ``common_utils`` plotting
    helpers. ``nn_experiment`` is run once per component count.

    Args:
        cluster_range: Iterable of cluster counts to evaluate.
        RP_component: Re-iterable collection of projection sizes (it is
            traversed twice).
        dataset: Object exposing ``data`` (a DataFrame whose last column is
            used as the integer label vector), ``x`` and ``dataset_name``.
            It is NOT copied: its ``x`` and ``y`` attributes are overwritten
            with the projected features and the labels, and stay overwritten
            after this call returns.
        dir: Output prefix. It is concatenated directly with the CSV file
            names, so it must already end with a path separator.

    Side effects:
        Writes ``<dir><dataset_name>_rp_scree1.csv`` and
        ``<dir><dataset_name>_rp_scree2.csv`` and rebinds the module globals
        ``diabetes_rp``, ``x_rp`` and ``diabetes_dataset_rp``.
    """
    # These names are published at module level; kept so that any code
    # reading them after the call keeps working.
    global diabetes_rp, x_rp, diabetes_dataset_rp

    df = dataset.data
    y = df.iloc[:, -1].astype('int')
    x = StandardScaler().fit_transform(df.iloc[:, 0:-1])

    # Collected per component count; currently neither plotted nor returned.
    NN_RP_ = defaultdict(dict)
    kmeans_accuracy_RP = defaultdict(dict)
    kmeans_time_RP = defaultdict(dict)
    em_accuracy_RP = defaultdict(dict)
    em_time_RP = defaultdict(dict)

    # A projection is fully determined by (seed, size), so each one is fitted
    # once and scored with both metrics.
    dist_corr = defaultdict(dict)
    recon_err = defaultdict(dict)
    for i, dim in product(range(10), RP_component):
        rp = GaussianRandomProjection(random_state=i, n_components=dim)
        projected = rp.fit_transform(dataset.x)
        dist_corr[dim][i] = pairwise_dist_corr(projected, dataset.x)
        recon_err[dim][i] = reconstruction_error(rp, dataset.x)

    # NOTE(review): both CSVs are indexed by component count, yet the x-axis
    # label passed below is 'Number of Clusters' — confirm the intended label.
    # NOTE(review): both plot calls pass the same dataset name and "RP" tag;
    # if the helper derives its output file name from them, the second plot
    # would replace the first — confirm.
    scree1_path = dir + '{}_rp_scree1.csv'.format(dataset.dataset_name)
    pd.DataFrame(dist_corr).T.to_csv(scree1_path)
    common_utils.plot_dim_red_scores(
        scree1_path, dir, dataset.dataset_name, "RP",
        multiple_runs=False, xlabel='Number of Clusters', ylabel=None)

    scree2_path = dir + '{}_rp_scree2.csv'.format(dataset.dataset_name)
    pd.DataFrame(recon_err).T.to_csv(scree2_path)
    common_utils.plot_dim_red_scores(
        scree2_path, dir, dataset.dataset_name, "RP",
        multiple_runs=False, xlabel='Number of Clusters', ylabel=None)

    for RP_comp in RP_component:
        # Seeded projection: one fit serves both the clustering and the NN
        # experiment.
        diabetes_rp = GaussianRandomProjection(
            n_components=RP_comp, random_state=0)
        x_rp = diabetes_rp.fit_transform(x)

        # Aliases (does not copy) the caller's dataset object.
        diabetes_dataset_rp = dataset
        diabetes_dataset_rp.x = x_rp
        diabetes_dataset_rp.y = y

        for cluster in cluster_range:
            # k-means: time only the clustering itself, not the scoring.
            start = datetime.now()
            kmeans_labels = KMeans(
                n_clusters=cluster, random_state=0).fit_predict(x_rp)
            end = datetime.now()
            kmeans_time_RP[RP_comp][cluster] = (end - start).total_seconds()
            kmeans_accuracy_RP[RP_comp][cluster] = (
                common_utils.get_cluster_accuracy(y, kmeans_labels))

            # EM: seeded like k-means so repeated runs give the same plots.
            start = datetime.now()
            em_labels = GaussianMixture(
                n_components=cluster, random_state=0).fit_predict(x_rp)
            end = datetime.now()
            em_time_RP[RP_comp][cluster] = (end - start).total_seconds()
            em_accuracy_RP[RP_comp][cluster] = (
                common_utils.get_cluster_accuracy(y, em_labels))

        NN_RP_[RP_comp] = nn_experiment(diabetes_dataset_rp)

    common_utils.plot_feature_transformation_time(
        kmeans_time_RP, "k-means RP clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        kmeans_accuracy_RP, "k-means RP clusters vs score", dir)
    common_utils.plot_feature_transformation_time(
        em_time_RP, "EM RP clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        em_accuracy_RP, "EM RP clusters vs score", dir)
def cluster(cluster_range, dataset, dir):
    """Run k-means and EM on the untransformed dataset and plot diagnostics.

    For every cluster count in ``cluster_range`` a seeded ``KMeans`` and a
    seeded ``GaussianMixture`` are fitted on ``dataset.x``. For each count
    the function records cluster accuracy against ``dataset.y``
    (``common_utils.get_cluster_accuracy``), fit time in seconds,
    silhouette score, adjusted mutual information, the negated
    ``KMeans.score`` (stored as "sse"), and the mixture's log-likelihood
    score and BIC. The tabular metrics are written to CSV files, which the
    ``common_utils.read_and_plot_*`` helpers then read back. Accuracy and
    timing are plotted directly. ``nn_experiment`` is run once on the
    dataset.

    Args:
        cluster_range: Iterable of cluster counts to evaluate.
        dataset: Object exposing ``x``, ``y`` and ``dataset_name``.
        dir: Output prefix. It is concatenated directly with the CSV file
            names, so it must already end with a path separator.

    Side effects:
        Writes ``<dir><dataset_name>_{sse,logliklihood,bic,sil_score,adj_mi}.csv``
        and rebinds the module globals ``start``, ``end``,
        ``kmeans_accuracy_for_k``, ``em_prediction_y`` and
        ``em_pca_accuracy_for_k``.
    """
    # These names are published at module level; kept so that any code
    # reading them after the call keeps working.
    global start, kmeans_accuracy_for_k, end, em_prediction_y, em_pca_accuracy_for_k

    kmeans_accuracy, em_accuracy = {}, {}
    kmeans_timetaken, em_timetaken = {}, {}
    sse = defaultdict(list)
    ll = defaultdict(list)
    bic = defaultdict(list)
    sil = defaultdict(dict)
    adj_mi = defaultdict(dict)

    for k in cluster_range:
        km = KMeans(n_clusters=k, random_state=0)
        gmm = GaussianMixture(n_components=k, random_state=0)

        # Each elapsed time is stored immediately after its own fit, so the
        # k-means figure cannot pick up the EM timestamps.
        start = datetime.now()
        kmeans_predicted_y = km.fit_predict(dataset.x)
        end = datetime.now()
        kmeans_timetaken[k] = (end - start).total_seconds()

        start = datetime.now()
        em_prediction_y = gmm.fit_predict(dataset.x)
        end = datetime.now()
        em_timetaken[k] = (end - start).total_seconds()

        # Accuracy against the reference labels.
        kmeans_accuracy_for_k = common_utils.get_cluster_accuracy(
            dataset.y, kmeans_predicted_y)
        kmeans_accuracy[k] = kmeans_accuracy_for_k
        em_pca_accuracy_for_k = common_utils.get_cluster_accuracy(
            dataset.y, em_prediction_y)
        em_accuracy[k] = em_pca_accuracy_for_k

        # Per-k diagnostics that end up in the CSV files.
        sil[k]['Kmeans'] = sil_score(dataset.x, kmeans_predicted_y)
        sil[k]['GMM'] = sil_score(dataset.x, em_prediction_y)
        sse[k] = [km.score(dataset.x)]
        ll[k] = [gmm.score(dataset.x)]
        bic[k] = [gmm.bic(dataset.x)]
        adj_mi[k]['Kmeans'] = ami(dataset.y, kmeans_predicted_y)
        adj_mi[k]['GMM'] = ami(dataset.y, em_prediction_y)

    name = dataset.dataset_name

    # The k-means score is negated before being stored under the "sse" label.
    sse = (-pd.DataFrame(sse)).T
    sse.index.name = 'k'
    sse.columns = ['{} sse (left)'.format(name)]

    ll = pd.DataFrame(ll).T
    ll.index.name = 'k'
    ll.columns = ['{} log-likelihood'.format(name)]

    bic = pd.DataFrame(bic).T
    bic.index.name = 'k'
    bic.columns = ['{} BIC'.format(name)]

    sil = pd.DataFrame(sil).T
    sil.index.name = 'k'
    adj_mi = pd.DataFrame(adj_mi).T
    adj_mi.index.name = 'k'

    sse_path = dir + '{}_sse.csv'.format(name)
    ll_path = dir + '{}_logliklihood.csv'.format(name)
    bic_path = dir + '{}_bic.csv'.format(name)
    sil_path = dir + '{}_sil_score.csv'.format(name)
    adj_mi_path = dir + '{}_adj_mi.csv'.format(name)

    sse.to_csv(sse_path)
    ll.to_csv(ll_path)
    bic.to_csv(bic_path)
    sil.to_csv(sil_path)
    adj_mi.to_csv(adj_mi_path)

    nn_experiment(dataset)  # return value is not used here

    common_utils.plot_clustering_accuracy(
        kmeans_accuracy, "k-means - clusters vs Accuracy", dir)
    common_utils.plot_clustering_time(
        kmeans_timetaken, "k-means - clusters vs Time", dir)
    common_utils.plot_clustering_accuracy(
        em_accuracy, "EM clusters - vs Accuracy", dir)
    common_utils.plot_clustering_time(
        em_timetaken, "EM clusters - vs Time", dir)

    common_utils.read_and_plot_sse('Clustering', sse_path, dir)
    common_utils.read_and_plot_loglikelihood('Clustering', ll_path, dir)
    common_utils.read_and_plot_bic('Clustering', bic_path, dir)
    common_utils.read_and_plot_sil_score('Clustering', sil_path, dir)
    common_utils.read_and_plot_adj_mi('Clustering', adj_mi_path, dir)