Example #1
 def setUpClass(cls):
     file_name = "datasets/cstr.mat"
     matlab_dict = loadmat(file_name)
     X = matlab_dict['fea']  # numpy.ndarray
     model = CoclustMod(n_clusters=4)
     model.fit(X)
     cls.model = model
Example #2
 def setUpClass(cls):
     file_name = "datasets/classic3.mat"
     matlab_dict = loadmat(file_name)
     X = matlab_dict['A']  # scipy.sparse.csc.csc_matrix
     model = CoclustMod(n_clusters=3)
     model.fit(X)
     cls.model = model
Example #3
 def setUpClass(cls):
     file_name = "datasets/classic3.mat"
     matlab_dict = loadmat(file_name)
     X = matlab_dict['A']  # scipy.sparse.csc.csc_matrix
     model = CoclustMod(n_clusters=3)
     model.fit(X)
     cls.model = model
Example #4
 def setUpClass(cls):
     file_name = "datasets/cstr.mat"
     matlab_dict = loadmat(file_name)
     X = matlab_dict['fea']  # numpy.ndarray
     model = CoclustMod(n_clusters=4)
     model.fit(X)
     cls.model = model
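The four fixtures above only build and cache a fitted model. A minimal sketch of a test method that could consume cls.model, assuming a setUpClass like the ones shown here (class name, test name, and assertions are illustrative, not taken from the original suites):

import unittest

class CoclustModTestCase(unittest.TestCase):
    # setUpClass would be one of the fixtures shown above,
    # storing the fitted model on cls.model

    def test_fitted_model(self):
        model = self.model
        # fit() assigns a cluster label to every row and every column
        self.assertTrue(len(model.row_labels_) > 0)
        self.assertTrue(len(model.column_labels_) > 0)
        # labels stay within the requested number of clusters
        self.assertLessEqual(len(set(model.row_labels_)), model.n_clusters)
        # the modularity criterion is exposed once fit() has run
        self.assertIsNotNone(model.modularity)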
Example #5
    def run_coclust(self):
        # co-clustering
        model = CoclustMod(n_clusters=4)
        model.fit(
            self.doc_term_mat
        )  # No errors? Is this right? Gensim types have plug-and-play support?

        top_term_plt = plot_cluster_top_terms(in_data=self.doc_term_mat,
                                              all_terms=self.vocab,
                                              nb_top_terms=5,
                                              model=model,
                                              do_plot=False)

        # print(get_term_graph(X=doc_term_mat,
        #                      model=model,
        #                      terms=vocab,
        #                      n_cluster=2,
        #                      n_top_terms=10,
        #                      n_neighbors=2,
        #                      stopwords=[]))

        clus_sz_plt = plot_cluster_sizes(model=model, do_plot=False)
        mat_plot = plot_reorganized_matrix(X=self.doc_term_mat,
                                           model=model,
                                           do_plot=False)

        return (top_term_plt, clus_sz_plt, mat_plot)
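The method above calls several plotting helpers without showing its imports. A minimal sketch of the imports it presumably relies on, assuming the standard coclust package layout (get_term_graph covers the commented-out call):

from coclust.coclustering import CoclustMod
from coclust.visualization import (get_term_graph,
                                   plot_cluster_sizes,
                                   plot_cluster_top_terms,
                                   plot_reorganized_matrix)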
Example #6
    def cluster(self, data):
        global weighted_edge_list, matrix, model, row_order, column_order, rowMap, colMap, subModels, row_sums_map, column_sums_map
        subModels = {}
        # num_clusters = 9
        weighted_edge_list = data[[
            "RECHTSTRAEGER", "MEDIUM_MEDIENINHABER", "EURO"
        ]]
        weighted_edge_list = weighted_edge_list.groupby(
            by=["RECHTSTRAEGER", "MEDIUM_MEDIENINHABER"]).sum().reset_index()

        G = nx.from_pandas_dataframe(weighted_edge_list,
                                     "RECHTSTRAEGER",
                                     "MEDIUM_MEDIENINHABER",
                                     "EURO",
                                     create_using=nx.DiGraph())
        row_order = np.sort(np.unique(weighted_edge_list["RECHTSTRAEGER"]))
        column_order = np.sort(
            np.unique(weighted_edge_list["MEDIUM_MEDIENINHABER"]))
        matrix_real = biadjacency_matrix(G,
                                         row_order,
                                         column_order=column_order,
                                         weight="EURO")
        matrix = matrix_real.toarray()
        row_sums = matrix.sum(axis=1).round(2)
        row_sums_map = dict(zip(row_order, row_sums))
        row_sums_map = {k: float(v) for k, v in row_sums_map.items()}
        column_sums = matrix.sum(axis=0).round(2)
        column_sums_map = dict(zip(column_order, column_sums))
        column_sums_map = {k: float(v) for k, v in column_sums_map.items()}

        model = CoclustMod(min(min(matrix.shape), num_clusters),
                           random_state=0)  #n_init=500
        model.fit(matrix)

        # test: send a different list
        rowMap = dict(zip(row_order, list(map(str, model.row_labels_))))
        colMap = dict(zip(column_order, list(map(str, model.column_labels_))))
        ret = []

        wel = weighted_edge_list.copy()
        wel["RECHTSTRAEGER"].update(wel["RECHTSTRAEGER"].map(rowMap))
        wel["MEDIUM_MEDIENINHABER"].update(
            wel["MEDIUM_MEDIENINHABER"].map(colMap))
        ret = wel.values.tolist()  # DataFrame.as_matrix() was removed in pandas 1.0

        clusters = self.getElementsbyCluster()

        return {"data": ret, "clusters": clusters}
Example #7
    def cluster(self, data, tempMem, num_clusters):
        #global weighted_edge_list, tempMem["firstGroupIndex"], tempMem["secondGroupIndex"], tempMem["valueIndex"], tempMem["matrix"], tempMem["model"], tempMem["row_order"], tempMem["column_order"], tempMem["rowMap"], tempMem["colMap"], tempMem["subModels"], tempMem["subModels"], tempMem["column_sums_map"]
        tempMem["subModels"] = {}

        dataKeys = data.keys()
        tempMem["firstGroupIndex"] = dataKeys[0]
        tempMem["secondGroupIndex"] = dataKeys[len(dataKeys) - 2]
        tempMem["valueIndex"] = dataKeys[len(dataKeys) - 1]

        # num_clusters = 9
        tempMem["weighted_edge_list"] = data[[tempMem["firstGroupIndex"],tempMem["secondGroupIndex"],tempMem["valueIndex"]]]
        tempMem["weighted_edge_list"] = tempMem["weighted_edge_list"].groupby(by = [tempMem["firstGroupIndex"], tempMem["secondGroupIndex"]]).sum().reset_index()

        G = nx.from_pandas_dataframe(tempMem["weighted_edge_list"],tempMem["firstGroupIndex"],tempMem["secondGroupIndex"],tempMem["valueIndex"], create_using=nx.DiGraph())
        tempMem["row_order"] = np.sort(np.unique(tempMem["weighted_edge_list"][tempMem["firstGroupIndex"]]))
        tempMem["column_order"] = np.sort(np.unique(tempMem["weighted_edge_list"][tempMem["secondGroupIndex"]]))
        matrix_real = biadjacency_matrix(G, tempMem["row_order"], column_order=tempMem["column_order"], weight=tempMem["valueIndex"])
        tempMem["matrix"] = matrix_real.toarray()
        row_sums = tempMem["matrix"].sum(axis=1).round(2)
        tempMem["row_sums_map"] = dict(zip(tempMem["row_order"], row_sums))
        tempMem["row_sums_map"] = {k:float(v) for k,v in tempMem["row_sums_map"].items()}
        column_sums = tempMem["matrix"].sum(axis=0).round(2)
        tempMem["column_sums_map"] = dict(zip(tempMem["column_order"], column_sums))
        tempMem["column_sums_map"] = {k:float(v) for k,v in tempMem["column_sums_map"].items()}

        tempMem["model"] = CoclustMod(min(min(tempMem["matrix"].shape), num_clusters),random_state=0) #n_init=500
        tempMem["model"].fit(tempMem["matrix"])

        # test: send a different list
        tempMem["rowMap"] = dict(zip(tempMem["row_order"], list(map(str, tempMem["model"].row_labels_))))
        tempMem["colMap"] = dict(zip(tempMem["column_order"], list(map(str,tempMem["model"].column_labels_))))
        ret = []

        wel = tempMem["weighted_edge_list"].copy()
        wel[tempMem["firstGroupIndex"]].update(wel[tempMem["firstGroupIndex"]].map(tempMem["rowMap"]))
        wel[tempMem["secondGroupIndex"]].update(wel[tempMem["secondGroupIndex"]].map(tempMem["colMap"]))
        #ret = wel.as_matrix().tolist()
        ret = wel.values.tolist()

        clusters = self.getElementsbyCluster(tempMem)

        return {"data": ret, "clusters": clusters}
Example #8
    def subcluster3(self, clusterID):
        global subModels

        clusterID_array = [int(x) for x in clusterID.split('.')]
        # print(clusterID_array)
        # print("subModels",subModels)
        subMatrix = model.get_submatrix(matrix, clusterID_array[0])
        sub_row_order = row_order[model.get_indices(clusterID_array[0])[0]]
        sub_column_order = column_order[model.get_indices(
            clusterID_array[0])[1]]

        for i, cID in enumerate(clusterID_array[1:]):
            smID = '.'.join(str(x) for x in clusterID_array[:(i + 1)])
            print("smID", smID)
            sm = subModels[smID]
            subMatrix = sm.get_submatrix(subMatrix, cID)
            sub_row_order = sub_row_order[sm.get_indices(cID)[0]]
            sub_column_order = sub_column_order[sm.get_indices(cID)[1]]

        zeros_cols = np.where(~subMatrix.any(axis=0))[0]
        zeros_rows = np.where(~subMatrix.any(axis=1))[0]
        subMatrix = np.delete(subMatrix, zeros_cols, 1)
        subMatrix = np.delete(subMatrix, zeros_rows, 0)
        sub_row_order = np.delete(sub_row_order, zeros_rows)
        sub_column_order = np.delete(sub_column_order, zeros_cols)

        num_clusters2 = min(min(subMatrix.shape), num_clusters)

        subModel = CoclustMod(num_clusters2, random_state=0)

        subModels[clusterID] = subModel
        # print("subModels",subModels)
        subModel.fit(subMatrix)

        for i, label in enumerate(subModel.row_labels_):
            rowMap[sub_row_order[i]] = str(clusterID) + "." + str(label)

        for i, label in enumerate(subModel.column_labels_):
            colMap[sub_column_order[i]] = str(clusterID) + "." + str(label)

        # ret = []
        # wel = weighted_edge_list.copy()
        # wel["RECHTSTRAEGER"].update(wel["RECHTSTRAEGER"].map(rowMap))
        # wel["MEDIUM_MEDIENINHABER"].update(wel["MEDIUM_MEDIENINHABER"].map(colMap))

        rowLabelSet = set(
            [str(clusterID) + "." + str(x) for x in subModel.row_labels_])
        colLabelSet = set(
            [str(clusterID) + "." + str(x) for x in subModel.column_labels_])
        #---

        rowMap2 = {
            k: (v if v in rowLabelSet else "Sonstige")
            for k, v in rowMap.items()
        }
        colMap2 = {
            k: (v if v in colLabelSet else "Sonstige")
            for k, v in colMap.items()
        }

        wel = weighted_edge_list.copy()
        # print(rowLabelSet)

        wel["RECHTSTRAEGER"].update(wel["RECHTSTRAEGER"].map(rowMap2))
        wel["MEDIUM_MEDIENINHABER"].update(
            wel["MEDIUM_MEDIENINHABER"].map(colMap2))

        idc = wel[(
            wel["RECHTSTRAEGER"].astype(str).str[:len(clusterID)] != clusterID)
                  & (wel["MEDIUM_MEDIENINHABER"].astype(
                      str).str[:len(clusterID)] != clusterID)].index
        wel = wel.drop(idc)

        wel2 = weighted_edge_list.copy()
        wel2 = wel2.drop(idc)
        row_sums_map2 = wel2.groupby(
            by=["RECHTSTRAEGER"]).sum().to_dict()["EURO"]
        row_sums_map2 = {k: float(v) for k, v in row_sums_map2.items()}
        column_sums_map2 = wel2.groupby(
            by=["MEDIUM_MEDIENINHABER"]).sum().to_dict()["EURO"]
        column_sums_map2 = {k: float(v) for k, v in column_sums_map2.items()}

        ret = []
        ret = wel.values.tolist()  # DataFrame.as_matrix() was removed in pandas 1.0

        # clusters = self.getElementsbyCluster()
        inv_rowMap2 = {}
        for k, v in rowMap2.items():
            inv_rowMap2.setdefault(v, []).append(k)

        inv_colMap2 = {}
        for k, v in colMap2.items():
            inv_colMap2.setdefault(v, []).append(k)

        clusters = {}
        for label in inv_rowMap2:
            clusters[label] = {
                "rows": {
                    k: row_sums_map2[k]
                    for k in inv_rowMap2[label] if k in row_sums_map2
                },
                "columns": {
                    k: column_sums_map2[k]
                    for k in inv_colMap2[label] if k in column_sums_map2
                }
            }

        return {"data": ret, "clusters": clusters}
Example #9
    df = pd.read_csv(label_file)
    y = np.unique(df['Label'], return_inverse=True)[1]  # as factor

    mat = io.loadmat(mat_file)['X']
    print(mat.shape)

    no_cluster = len(np.unique(y))
    print(no_cluster)

    algo_pipeline = []
    algo_pipeline.append((CoclustInfo(n_row_clusters=no_cluster,
                                      n_col_clusters=no_cluster,
                                      n_init=10,
                                      max_iter=200), "CoclustInfo"))
    algo_pipeline.append((CoclustMod(n_clusters=no_cluster,
                                     n_init=10,
                                     max_iter=200), "CoclustMod"))
    algo_pipeline.append((CoclustSpecMod(n_clusters=no_cluster,
                                         n_init=10,
                                         max_iter=200), "CoclustSpecMod"))

    for model, model_name in algo_pipeline:
        res_nmi, res_ari, res_acc = execute_algo(model, model_name, mat, y)

        # Save results
        out_dir = result_path + "/" + data_version + "/"
        makedir(out_dir)
        out_file = out_dir + dataset + "_" + mat_version + "_" + model_name + ".txt"
        content = str(res_nmi) + ", " + str(res_ari) + ", " + str(
            res_acc) + "\n"
        myfile = open(out_file, "a")
Example #10
from scipy.io import loadmat
from coclust.coclustering import CoclustMod

file_name = "/home/sayon/Dropbox/MyModules/Canvass/cclust_package/datasets/som.mat"
matlab_dict = loadmat(file_name)
X = matlab_dict['fea']

model = CoclustMod(n_clusters=4)
model.fit(X)

print(model.modularity)
predicted_row_labels = model.row_labels_
predicted_column_labels = model.column_labels_
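Example #10 stops at extracting the predicted labels. A short follow-up sketch of how they could be evaluated against ground-truth classes, assuming a label vector true_row_labels is available for the dataset (the variable name is illustrative):

from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# compare predicted document clusters with known classes
nmi = normalized_mutual_info_score(true_row_labels, predicted_row_labels)
ari = adjusted_rand_score(true_row_labels, predicted_row_labels)
print("NMI: %.3f, ARI: %.3f" % (nmi, ari))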
Example #11
plt.matshow(euclidean_distances(Feafile.values, Feafile.values))
plt.colorbar()
plt.title('Show the Euclidean distance matrix')
plt.show()

# %%Combined usage
# The following example shows how easy it is to use coclust to run several algorithms on the same dataset
import matplotlib.pyplot as plt
import numpy as np, scipy.sparse as sp, scipy.io as io
from sklearn.metrics import confusion_matrix
from coclust.coclustering import (CoclustMod, CoclustSpecMod, CoclustInfo)
from coclust.visualization import plot_reorganized_matrix

X = Feafile.values
model_1 = CoclustMod(n_clusters=4, n_init=4)
model_1.fit(X)
model_2 = CoclustSpecMod(n_clusters=4, n_init=4)
model_2.fit(X)
model_3 = CoclustInfo(n_row_clusters=3, n_col_clusters=4, n_init=4)
model_3.fit(X)
plt.figure()

plt.title('Three reorganized matrices for the dataset')
plt.subplot(131)
plot_reorganized_matrix(X, model_1)
plt.subplot(132)
plot_reorganized_matrix(X, model_2)
plt.subplot(133)
plot_reorganized_matrix(X, model_3)
plt.show()
Example #12
 def setUpClass(cls):
     model = CoclustMod(n_clusters=3)
     X = np.diag(range(1, 200))
     model.fit(X)
     cls.model = model
Example #13
 def setUpClass(cls):
     model = CoclustMod(n_clusters=3)
     X = np.diag(range(1, 200))
     model.fit(X)
     cls.model = model
Example #14
def genes_articles():
    print(request.json)
    tfidf = request.json["tfidf"]
    distance = request.json["distance"]
    coclust = int(request.json["coclust"])
    selected_genes = [v["label"] for v in request.json["genes"]]
    nb_cluster = int(request.json["nb"])

    genes_articles_str = [
        ' '.join(str(x) for x in genesjson[g]) for g in selected_genes
    ]

    if tfidf:
        vec = TfidfVectorizer()
    else:
        vec = CountVectorizer()

    X = vec.fit_transform(genes_articles_str)
    nb = nb_cluster
    if nb_cluster == 0 and coclust != 3:
        xn = X.shape[0]
        step = round(xn / 10) if round(xn / 10) > 0 else 1
        rng = range(1, xn, step)
        # _, modularities = best_modularity_partition(X, rng, n_rand_init=1)
        # nb = rng[np.argmax(modularities)]
        modularities = []
        for x in rng:
            print(x)
            m = CoclustMod(n_clusters=x, n_init=1).fit(X)
            modularities.append(m.modularity)
        nb = rng[np.argmax(modularities)]

    if coclust == 1:
        model = CoclustMod(n_clusters=nb, random_state=0)
    if coclust == 2:
        model = CoclustSpecMod(n_clusters=nb, random_state=0)
    if coclust == 3:
        model = CoclustInfo()

    dt = X.toarray()
    model.fit(dt)
    fit_data = dt[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    plt.figure(figsize=(22, 5))
    if nb_cluster == 0 and coclust != 3:
        plt.subplot(131)
        plt.plot(rng, modularities, 'ro-')
        plt.xlabel("Number of cluster")
        plt.ylabel("Modularity")
        plt.title("Max modularity for " + str(nb) + " clusters (" +
                  str(round(np.max(modularities), 3)) + ")")
        plt.axvline(x=nb, color='r', linestyle='-')
    plt.subplot(132)
    sns.heatmap(dt,
                cmap="BuPu",
                yticklabels=False,
                xticklabels=False,
                cbar=False)
    plt.title("Heatmap on Original Data")
    plt.subplot(133)
    sns.heatmap(fit_data,
                cmap="BuPu",
                yticklabels=False,
                xticklabels=False,
                cbar=False)
    plt.title("CoclustMod %i clusters" % nb)
    plt.savefig("img-ga1.jpg", bbox_inches='tight', pad_inches=0)

    # hierarchical clustering
    Z = linkage(dt, 'single', 'euclidean')

    plt.figure(figsize=(15, 7))
    plt.xlabel('')
    plt.ylabel('distance')
    dendrogram(Z, labels=selected_genes)
    plt.savefig("img-ga2.jpg", bbox_inches='tight', pad_inches=0)
    plt.close('all')

    return jsonify({"tab": 1})
Example #15
def genes_termes():
    tfidf = request.json["tfidf"]
    coclust = int(request.json["coclust"])
    selected_genes = [v["label"] for v in request.json["genes"]]
    nb_cluster = int(request.json["nb"])

    # get id articles from all genes
    l = [genesjson[g] for g in selected_genes]
    genes_articles = list(set([item for sublist in l for item in sublist]))
    # get text from article
    articles_text = [
        ' '.join(asthmajson[str(i)]["text"]) for i in genes_articles
    ]

    if tfidf:
        vec = TfidfVectorizer(max_df=0.7, min_df=0.01)
    else:
        vec = CountVectorizer(max_df=0.7, min_df=0.01)

    dt = vec.fit_transform(articles_text)

    matrix_article_terms = dt.toarray()
    matrix_genes_terms = defaultdict(
        lambda: np.zeros(matrix_article_terms.shape[1]).astype(np.float64))
    for idx, row in enumerate(matrix_article_terms):
        article = asthmajson[str(genes_articles[idx])]
        for ge in article["genes"]:
            if ge in selected_genes:
                matrix_genes_terms[ge] += row
    list_matrix_genes_terms = [v for k, v in matrix_genes_terms.items()]
    list_genes = [k for k, v in matrix_genes_terms.items()]
    #df3 = pd.DataFrame(list_matrix_genes_terms, columns=vec.get_feature_names())

    if coclust == 1:
        model = CoclustMod(n_clusters=nb_cluster, random_state=0)
    if coclust == 2:
        model = CoclustSpecMod(n_clusters=nb_cluster, random_state=0)
    if coclust == 3:
        model = CoclustInfo()

    dt = np.array(list_matrix_genes_terms)
    m1 = model.fit(dt)
    fit_data = dt[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    plt.figure(figsize=(20, 8))
    plt.subplot(121)
    sns.heatmap(np.log(dt + 1),
                cmap="BuPu",
                yticklabels=False,
                xticklabels=False,
                cbar=False)
    plt.title("Heatmap on Original Data", fontdict={'fontsize': 20})

    plt.subplot(122)
    sns.heatmap(np.log(fit_data + 1),
                cmap="BuPu",
                yticklabels=False,
                xticklabels=False,
                cbar=False)
    plt.title("CoclustMod %i clusters" % nb_cluster, fontdict={'fontsize': 20})
    plt.savefig("img-gt1.jpg", bbox_inches='tight', pad_inches=0)

    # Top terms by cluster
    nb_clust = np.unique(model.column_labels_)
    nm = np.array(vec.get_feature_names())
    dt_sum = np.sum(dt, axis=0)
    col_label = np.array(model.column_labels_)
    cluster = []
    for c in nb_clust:
        idx = np.argsort(-dt_sum[col_label == c])
        col = nm[np.array(model.column_labels_) == c]
        value = dt_sum[col_label == c][idx]
        name = col[idx]
        cluster.append({"name": list(name[0:8]), "value": list(value[0:8])})

    # hierarchical clustering
    Z = linkage(dt, 'single', 'euclidean')
    plt.figure(figsize=(15, 7))
    # plt.title('Hierarchical Clustering - Hamming')
    plt.xlabel('')
    plt.ylabel('distance')
    dendrogram(Z, labels=list_genes)
    plt.savefig("img-gt2.jpg", bbox_inches='tight', pad_inches=0)
    plt.close('all')

    return jsonify({"tab": 2, "cluster": cluster})