示例#1
0
def clustering_rfp(cluster_range, RFE_component_diabetes, dataset, dir):
    """Cluster RFE-reduced features with k-means and EM, then plot the results.

    The last column of ``dataset.data`` is used as the integer label and the
    remaining columns as features, which are standardized before use. For
    every feature count in ``RFE_component_diabetes`` the features are
    reduced with recursive feature elimination (linear-kernel SVR estimator)
    and then clustered for every size in ``cluster_range``. Accuracy (via
    ``common_utils.get_cluster_accuracy``) and wall-clock time are collected
    per (feature count, cluster count) pair and handed to the
    ``common_utils`` plotting helpers. ``nn_experiment`` is run once per
    feature count on the reduced dataset.

    Args:
        cluster_range: Iterable of cluster counts; it is iterated once per
            feature count, so it must be re-iterable.
        RFE_component_diabetes: Iterable of feature counts to keep.
        dataset: Object exposing ``data`` (a DataFrame). Its ``x`` and ``y``
            attributes are overwritten (see Note).
        dir: Output location forwarded to the plotting helpers.

    Note:
        ``dataset`` is modified in place: after the call its ``x`` holds the
        reduced features of the last feature count processed and its ``y``
        holds the integer labels. The module-level names ``diabetes_rp``,
        ``x_rp`` and ``diabetes_dataset_rp`` are rebound as well.
    """
    global diabetes_rp, x_rp, diabetes_dataset_rp

    df = dataset.data
    y = df.iloc[:, -1].astype('int')
    x = StandardScaler().fit_transform(df.iloc[:, 0:-1])

    estimator = SVR(kernel="linear")
    kmeans_accuracy_RFE = defaultdict(dict)
    kmeans_time_RFE = defaultdict(dict)
    em_accuracy_RFE = defaultdict(dict)
    em_time_RFE = defaultdict(dict)

    for RFE_comp in RFE_component_diabetes:
        # A single RFE fit serves both the clustering input and the dataset
        # handed to the neural-network experiment; fitting it twice with the
        # same arguments only repeated the expensive elimination.
        diabetes_rp = RFE(estimator, n_features_to_select=RFE_comp)
        x_rp = diabetes_rp.fit_transform(x, y)
        reduced_df = pd.DataFrame(data=x_rp)

        # Alias, not a copy: the caller's dataset object is updated in place.
        diabetes_dataset_rp = dataset
        diabetes_dataset_rp.x = x_rp
        diabetes_dataset_rp.y = y

        for cluster in cluster_range:
            # k-means (timing covers fitting and accuracy scoring)
            start = datetime.now()
            kmeans_labels = KMeans(
                n_clusters=cluster, random_state=0).fit_predict(reduced_df)
            kmeans_score = common_utils.get_cluster_accuracy(y, kmeans_labels)
            end = datetime.now()

            kmeans_accuracy_RFE[RFE_comp][cluster] = kmeans_score
            kmeans_time_RFE[RFE_comp][cluster] = (end - start).total_seconds()

            # EM; seeded so that repeated runs give the same curves, matching
            # the seeded KMeans above.
            start = datetime.now()
            em_model = GaussianMixture(n_components=cluster, random_state=0)
            em_labels = em_model.fit(reduced_df).predict(reduced_df)
            em_score = common_utils.get_cluster_accuracy(y, em_labels)
            end = datetime.now()

            em_accuracy_RFE[RFE_comp][cluster] = em_score
            em_time_RFE[RFE_comp][cluster] = (end - start).total_seconds()

        # The return value is not consumed by this function.
        nn_experiment(diabetes_dataset_rp)

    common_utils.plot_feature_transformation_time(
        kmeans_time_RFE, "k-means RFE clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        kmeans_accuracy_RFE, "k-means RFE clusters vs accuracy", dir)
    common_utils.plot_feature_transformation_time(
        em_time_RFE, "EM RFE clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        em_accuracy_RFE, "EM RFE clusters vs accuracy", dir)
示例#2
0
def clustering_pca(cluster_range, _pca_components_range, dataset, dir):
    """Cluster PCA-reduced features with k-means and EM, then plot the results.

    First a full PCA is fitted on ``dataset.x`` and its explained variances
    (at most the first 500) are written to ``<dir><name>_pca_scree.csv`` and
    plotted. Then the last column of ``dataset.data`` is used as the integer
    label and the remaining columns as features, which are standardized. For
    every component count in ``_pca_components_range`` the features are
    projected with PCA and clustered for every size in ``cluster_range``.
    Accuracy (via ``common_utils.get_cluster_accuracy``) and wall-clock time
    are collected per (component count, cluster count) pair and plotted.
    ``nn_experiment`` is run once per component count.

    Args:
        cluster_range: Iterable of cluster counts; iterated once per
            component count, so it must be re-iterable.
        _pca_components_range: Iterable of PCA component counts.
        dataset: Object exposing ``x``, ``data`` (a DataFrame) and
            ``dataset_name``. Its ``x`` and ``y`` attributes are overwritten
            (see Note).
        dir: String prefix for the CSV path; also forwarded to the plotting
            helpers.

    Note:
        ``dataset`` is modified in place: after the call its ``x`` holds the
        projection for the last component count processed and its ``y``
        holds the integer labels. The module-level names ``_pca``, ``df``,
        ``x``, ``y``, ``x_pca`` and ``_dataset_pca`` are rebound as well.
    """
    global _pca, df, x, y, x_pca, _dataset_pca

    # PCA cannot produce more components than there are input features, so
    # the requested component counts are expected to respect that bound.
    kmeans_accuracy_pca = defaultdict(dict)
    kmeans_time_pca = defaultdict(dict)
    _accuracy_em_PCA = defaultdict(dict)
    _time_em_PCA = defaultdict(dict)

    _pca = PCA(random_state=0)
    _pca.fit(dataset.x)
    # Truncate the data together with the index: capping only the index at
    # 500 made pd.Series raise a length-mismatch error for wider inputs.
    explained = _pca.explained_variance_[:500]
    tmp = pd.Series(data=explained, index=range(1, explained.shape[0] + 1))
    scree_path = dir + '{}_pca_scree.csv'.format(dataset.dataset_name)
    tmp.to_csv(scree_path)
    common_utils.plot_dim_red_scores(
        scree_path,
        dir,
        dataset.dataset_name,
        "PCA",
        multiple_runs=False,
        xlabel='Number of Clusters',
        ylabel=None)

    df = dataset.data
    y = df.iloc[:, -1].astype('int')
    x = StandardScaler().fit_transform(df.iloc[:, 0:-1])

    for component in _pca_components_range:
        _pca = PCA(n_components=component, random_state=0)
        x_pca = _pca.fit_transform(x)

        # Alias, not a copy: the caller's dataset object is updated in place.
        _dataset_pca = dataset
        _dataset_pca.x = x_pca
        _dataset_pca.y = y

        for cluster in cluster_range:
            # k-means (timing covers fitting and accuracy scoring)
            start = datetime.now()
            kmeans_pca_prediction_y = KMeans(
                n_clusters=cluster, random_state=0).fit_predict(_dataset_pca.x)
            kmeans_accuracy_for_k = common_utils.get_cluster_accuracy(
                _dataset_pca.y, kmeans_pca_prediction_y)
            end = datetime.now()

            kmeans_accuracy_pca[component][cluster] = kmeans_accuracy_for_k
            kmeans_time_pca[component][cluster] = (end - start).total_seconds()

            # EM; seeded so that repeated runs give the same curves, matching
            # the seeded KMeans above.
            start = datetime.now()
            em_pca_prediction_y = GaussianMixture(
                n_components=cluster, random_state=0).fit_predict(_dataset_pca.x)
            em_pca_accuracy_for_k = common_utils.get_cluster_accuracy(
                _dataset_pca.y, em_pca_prediction_y)
            end = datetime.now()

            _accuracy_em_PCA[component][cluster] = em_pca_accuracy_for_k
            _time_em_PCA[component][cluster] = (end - start).total_seconds()

        # The return value is not consumed by this function.
        nn_experiment(_dataset_pca)

    common_utils.plot_feature_transformation_time(
        kmeans_time_pca, "k-means PCA clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        kmeans_accuracy_pca, "k-means PCA clusters vs accuracy", dir)
    common_utils.plot_feature_transformation_time(
        _time_em_PCA, "EM PCA clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        _accuracy_em_PCA, "EM PCA clusters vs accuracy", dir)
示例#3
0
def clustering_ica(cluster_range, ICA_component_, dataset, dir):
    """Cluster ICA-reduced features with k-means and EM, then plot the results.

    First, for every component count in ``ICA_component_``, FastICA is fitted
    on ``dataset.x`` and the mean absolute kurtosis of the components is
    recorded; the series is written to ``<dir><name>_ica_scree.csv`` and
    plotted. Then the last column of ``dataset.data`` is used as the integer
    label and the remaining columns as features, which are standardized. For
    every component count the standardized features are transformed with
    FastICA and clustered for every size in ``cluster_range``. Accuracy (via
    ``common_utils.get_cluster_accuracy``) and wall-clock time are collected
    per (component count, cluster count) pair and plotted. ``nn_experiment``
    is run once per component count.

    Args:
        cluster_range: Iterable of cluster counts; iterated once per
            component count, so it must be re-iterable.
        ICA_component_: Iterable of ICA component counts; iterated twice, so
            it must be re-iterable.
        dataset: Object exposing ``x``, ``data`` (a DataFrame) and
            ``dataset_name``. Its ``x`` and ``y`` attributes are overwritten
            (see Note).
        dir: String prefix for the CSV path; also forwarded to the plotting
            helpers.

    Note:
        ``dataset`` is modified in place: after the call its ``x`` holds the
        transform for the last component count processed and its ``y`` holds
        the integer labels. The module-level names ``_ica``, ``x_ica`` and
        ``_dataset_ica`` are rebound as well.
    """
    global _ica, x_ica, _dataset_ica

    df = dataset.data
    y = df.iloc[:, -1].astype('int')
    x = StandardScaler().fit_transform(df.iloc[:, 0:-1])

    kmeans_accuracy_ICA = defaultdict(dict)
    kmeans_time_ICA = defaultdict(dict)
    em_accuracy_ICA = defaultdict(dict)
    em_time_ICA = defaultdict(dict)

    # Kurtosis "scree": mean absolute kurtosis of the components per count.
    scree_ica = FastICA(random_state=0)
    kurt = {}
    for dim in ICA_component_:
        scree_ica.set_params(n_components=dim)
        components = pd.DataFrame(scree_ica.fit_transform(dataset.x))
        kurt[dim] = components.kurt(axis=0).abs().mean()

    kurt = pd.Series(kurt)
    scree_path = dir + '{}_ica_scree.csv'.format(dataset.dataset_name)
    kurt.to_csv(scree_path)
    common_utils.plot_dim_red_scores(
        scree_path,
        dir,
        dataset.dataset_name,
        "ICA",
        multiple_runs=False,
        xlabel='Number of Clusters',
        ylabel=None)

    # Per-component kurtosis on the standardized features, using whatever
    # n_components the scree loop set last.
    print(pd.DataFrame(data=scree_ica.fit_transform(x)).kurt())

    for ICA_comp in ICA_component_:
        # A single FastICA fit serves both the clustering input and the
        # dataset handed to the neural-network experiment; two fits with the
        # same arguments and seed only repeated the work.
        _ica = FastICA(n_components=ICA_comp, random_state=0)
        x_ica = _ica.fit_transform(x)
        ica_df = pd.DataFrame(data=x_ica)

        # Alias, not a copy: the caller's dataset object is updated in place.
        _dataset_ica = dataset
        _dataset_ica.x = x_ica
        _dataset_ica.y = y

        for cluster in cluster_range:
            # k-means (timing covers fitting and accuracy scoring)
            start = datetime.now()
            kmeans_labels = KMeans(
                n_clusters=cluster, random_state=0).fit_predict(ica_df)
            kmeans_score = common_utils.get_cluster_accuracy(y, kmeans_labels)
            end = datetime.now()

            kmeans_accuracy_ICA[ICA_comp][cluster] = kmeans_score
            kmeans_time_ICA[ICA_comp][cluster] = (end - start).total_seconds()

            # EM; seeded so that repeated runs give the same curves, matching
            # the seeded KMeans above.
            start = datetime.now()
            em_model = GaussianMixture(n_components=cluster, random_state=0)
            em_labels = em_model.fit(ica_df).predict(ica_df)
            em_score = common_utils.get_cluster_accuracy(y, em_labels)
            end = datetime.now()

            em_accuracy_ICA[ICA_comp][cluster] = em_score
            em_time_ICA[ICA_comp][cluster] = (end - start).total_seconds()

        # The return value is not consumed by this function.
        nn_experiment(_dataset_ica)

    common_utils.plot_feature_transformation_time(
        kmeans_time_ICA, "k-means ICA clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        kmeans_accuracy_ICA, "k-means ICA clusters vs accuracy", dir)
    common_utils.plot_feature_transformation_time(
        em_time_ICA, "EM ICA clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        em_accuracy_ICA, "EM ICA clusters vs accuracy", dir)
示例#4
0
def clustering_rp(cluster_range, RP_component, dataset, dir):
    """Cluster randomly projected features with k-means and EM, then plot.

    Two diagnostics are produced first, each over ten seeds (0-9) and every
    component count in ``RP_component``, using ``dataset.x``:
    ``pairwise_dist_corr`` between the projected and original data
    (``<dir><name>_rp_scree1.csv``) and ``reconstruction_error`` of the
    fitted projection (``<dir><name>_rp_scree2.csv``). Both are plotted.

    Then the last column of ``dataset.data`` is used as the integer label
    and the remaining columns as features, which are standardized. For every
    component count the features are projected with a seeded Gaussian random
    projection and clustered for every size in ``cluster_range``. Accuracy
    (via ``common_utils.get_cluster_accuracy``) and wall-clock time are
    collected per (component count, cluster count) pair and plotted.
    ``nn_experiment`` is run once per component count.

    Args:
        cluster_range: Iterable of cluster counts; iterated once per
            component count, so it must be re-iterable.
        RP_component: Iterable of projection sizes; iterated several times,
            so it must be re-iterable.
        dataset: Object exposing ``x``, ``data`` (a DataFrame) and
            ``dataset_name``. Its ``x`` and ``y`` attributes are overwritten
            (see Note).
        dir: String prefix for the CSV paths; also forwarded to the plotting
            helpers.

    Note:
        ``dataset`` is modified in place: after the call its ``x`` holds the
        projection for the last component count processed and its ``y``
        holds the integer labels. The module-level names ``diabetes_rp``,
        ``x_rp`` and ``diabetes_dataset_rp`` are rebound as well.
    """
    global diabetes_rp, x_rp, diabetes_dataset_rp

    df = dataset.data
    y = df.iloc[:, -1].astype('int')
    x = StandardScaler().fit_transform(df.iloc[:, 0:-1])

    kmeans_accuracy_RP = defaultdict(dict)
    kmeans_time_RP = defaultdict(dict)
    em_accuracy_RP = defaultdict(dict)
    em_time_RP = defaultdict(dict)

    # Diagnostic 1: pairwise-distance correlation per (size, seed).
    tmp = defaultdict(dict)
    for i, dim in product(range(10), RP_component):
        rp = GaussianRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwise_dist_corr(rp.fit_transform(dataset.x),
                                         dataset.x)
    tmp = pd.DataFrame(tmp).T
    scree1_path = dir + '{}_rp_scree1.csv'.format(dataset.dataset_name)
    tmp.to_csv(scree1_path)
    common_utils.plot_dim_red_scores(
        scree1_path,
        dir,
        dataset.dataset_name,
        "RP",
        multiple_runs=False,
        xlabel='Number of Clusters',
        ylabel=None)

    # Diagnostic 2: reconstruction error per (size, seed).
    tmp = defaultdict(dict)
    for i, dim in product(range(10), RP_component):
        rp = GaussianRandomProjection(random_state=i, n_components=dim)
        rp.fit(dataset.x)
        tmp[dim][i] = reconstruction_error(rp, dataset.x)
    tmp = pd.DataFrame(tmp).T
    scree2_path = dir + '{}_rp_scree2.csv'.format(dataset.dataset_name)
    tmp.to_csv(scree2_path)
    common_utils.plot_dim_red_scores(
        scree2_path,
        dir,
        dataset.dataset_name,
        "RP",
        multiple_runs=False,
        xlabel='Number of Clusters',
        ylabel=None)

    for RP_comp in RP_component:
        # A single projection serves both the clustering input and the
        # dataset handed to the neural-network experiment; two projections
        # with the same size and seed only repeated the work.
        diabetes_rp = GaussianRandomProjection(n_components=RP_comp,
                                               random_state=0)
        x_rp = diabetes_rp.fit_transform(x)
        projected_df = pd.DataFrame(data=x_rp)

        # Alias, not a copy: the caller's dataset object is updated in place.
        diabetes_dataset_rp = dataset
        diabetes_dataset_rp.x = x_rp
        diabetes_dataset_rp.y = y

        for cluster in cluster_range:
            # k-means (timing covers fitting and accuracy scoring)
            start = datetime.now()
            kmeans_labels = KMeans(
                n_clusters=cluster, random_state=0).fit_predict(projected_df)
            kmeans_score = common_utils.get_cluster_accuracy(y, kmeans_labels)
            end = datetime.now()

            kmeans_accuracy_RP[RP_comp][cluster] = kmeans_score
            kmeans_time_RP[RP_comp][cluster] = (end - start).total_seconds()

            # EM; seeded so that repeated runs give the same curves, matching
            # the seeded KMeans above.
            start = datetime.now()
            em_model = GaussianMixture(n_components=cluster, random_state=0)
            em_labels = em_model.fit(projected_df).predict(projected_df)
            em_score = common_utils.get_cluster_accuracy(y, em_labels)
            end = datetime.now()

            em_accuracy_RP[RP_comp][cluster] = em_score
            em_time_RP[RP_comp][cluster] = (end - start).total_seconds()

        # The return value is not consumed by this function.
        nn_experiment(diabetes_dataset_rp)

    common_utils.plot_feature_transformation_time(
        kmeans_time_RP, "k-means RP clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        kmeans_accuracy_RP, "k-means RP clusters vs score", dir)
    common_utils.plot_feature_transformation_time(
        em_time_RP, "EM RP clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        em_accuracy_RP, "EM RP clusters vs score", dir)
示例#5
0
def cluster(cluster_range, dataset, dir):
    """Run k-means and EM on ``dataset.x`` for each cluster count and plot.

    For every ``k`` in ``cluster_range`` a seeded ``KMeans`` and a seeded
    ``GaussianMixture`` are fitted on ``dataset.x``. Per ``k`` the function
    records fit time, accuracy against ``dataset.y`` (via
    ``common_utils.get_cluster_accuracy``), silhouette score, adjusted mutual
    information, the negated ``KMeans.score`` (written as "sse"), and the
    mixture's ``score`` (log-likelihood) and ``bic``. The tables are written
    as CSV files under the ``dir`` prefix and then plotted through the
    ``common_utils`` helpers. ``nn_experiment`` is run once on ``dataset``.

    Args:
        cluster_range: Iterable of cluster counts.
        dataset: Object exposing ``x``, ``y`` and ``dataset_name``.
        dir: String prefix for the CSV paths; also forwarded to the plotting
            helpers.

    Note:
        The module-level names ``start``, ``end``, ``kmeans_accuracy_for_k``,
        ``em_prediction_y`` and ``em_pca_accuracy_for_k`` are rebound on
        every iteration.
    """
    global start, kmeans_accuracy_for_k, end, em_prediction_y, em_pca_accuracy_for_k

    kmeans_accuracy, em_accuracy = {}, {}
    kmeans_timetaken, em_timetaken = {}, {}
    sse, ll, bic = {}, {}, {}
    sil, adj_mi = {}, {}

    for k in cluster_range:
        km = KMeans(n_clusters=k, random_state=0)
        gmm = GaussianMixture(n_components=k, random_state=0)

        # k-means: record the duration right away, before start/end are
        # reused for EM. Previously the k-means entry was computed after the
        # EM run and therefore stored the EM duration.
        start = datetime.now()
        kmeans_predicted_y = km.fit_predict(dataset.x)
        end = datetime.now()
        kmeans_timetaken[k] = (end - start).total_seconds()

        # EM
        start = datetime.now()
        em_prediction_y = gmm.fit_predict(dataset.x)
        end = datetime.now()
        em_timetaken[k] = (end - start).total_seconds()

        # Accuracy against the true labels
        kmeans_accuracy_for_k = common_utils.get_cluster_accuracy(
            dataset.y, kmeans_predicted_y)
        kmeans_accuracy[k] = kmeans_accuracy_for_k

        em_pca_accuracy_for_k = common_utils.get_cluster_accuracy(
            dataset.y, em_prediction_y)
        em_accuracy[k] = em_pca_accuracy_for_k

        # Cluster-quality metrics for the CSV tables
        sil[k] = {
            'Kmeans': sil_score(dataset.x, kmeans_predicted_y),
            'GMM': sil_score(dataset.x, em_prediction_y),
        }

        sse[k] = [km.score(dataset.x)]
        ll[k] = [gmm.score(dataset.x)]
        bic[k] = [gmm.bic(dataset.x)]

        adj_mi[k] = {
            'Kmeans': ami(dataset.y, kmeans_predicted_y),
            'GMM': ami(dataset.y, em_prediction_y),
        }

    name = dataset.dataset_name

    # km.score is negated here before being stored as the "sse" column.
    sse = (-pd.DataFrame(sse)).T
    sse.index.name = 'k'
    sse.columns = ['{} sse (left)'.format(name)]

    ll = pd.DataFrame(ll).T
    ll.index.name = 'k'
    ll.columns = ['{} log-likelihood'.format(name)]

    bic = pd.DataFrame(bic).T
    bic.index.name = 'k'
    bic.columns = ['{} BIC'.format(name)]

    sil = pd.DataFrame(sil).T
    sil.index.name = 'k'

    adj_mi = pd.DataFrame(adj_mi).T
    adj_mi.index.name = 'k'

    sse_path = dir + '{}_sse.csv'.format(name)
    ll_path = dir + '{}_logliklihood.csv'.format(name)
    bic_path = dir + '{}_bic.csv'.format(name)
    sil_path = dir + '{}_sil_score.csv'.format(name)
    adj_mi_path = dir + '{}_adj_mi.csv'.format(name)

    sse.to_csv(sse_path)
    ll.to_csv(ll_path)
    bic.to_csv(bic_path)
    sil.to_csv(sil_path)
    adj_mi.to_csv(adj_mi_path)

    # The return value is not consumed by this function.
    nn_experiment(dataset)

    common_utils.plot_clustering_accuracy(
        kmeans_accuracy, "k-means - clusters vs Accuracy", dir)
    common_utils.plot_clustering_time(
        kmeans_timetaken, "k-means - clusters vs Time", dir)
    common_utils.plot_clustering_accuracy(
        em_accuracy, "EM clusters - vs Accuracy", dir)
    common_utils.plot_clustering_time(
        em_timetaken, "EM clusters - vs Time", dir)

    common_utils.read_and_plot_sse('Clustering', sse_path, dir)
    common_utils.read_and_plot_loglikelihood('Clustering', ll_path, dir)
    common_utils.read_and_plot_bic('Clustering', bic_path, dir)
    common_utils.read_and_plot_sil_score('Clustering', sil_path, dir)
    common_utils.read_and_plot_adj_mi('Clustering', adj_mi_path, dir)