Example #1
 def setUpClass(cls):
     file_name = "datasets/cstr.mat"
     matlab_dict = loadmat(file_name)
     X = matlab_dict['fea']  # numpy.ndarray
     model = CoclustMod(n_clusters=4)
     model.fit(X)
     cls.model = model
Example #2
 def setUpClass(cls):
     file_name = "datasets/classic3.mat"
     matlab_dict = loadmat(file_name)
     X = matlab_dict['A']  # scipy.sparse.csc.csc_matrix
     model = CoclustMod(n_clusters=3)
     model.fit(X)
     cls.model = model
Example #3
 def setUpClass(cls):
     file_name = "datasets/classic3.mat"
     matlab_dict = loadmat(file_name)
     X = matlab_dict['A']  # scipy.sparse.csc.csc_matrix
     model = CoclustMod(n_clusters=3)
     model.fit(X)
     cls.model = model
Example #4
 def setUpClass(cls):
     file_name = "datasets/cstr.mat"
     matlab_dict = loadmat(file_name)
     X = matlab_dict['fea']  # numpy.ndarray
     model = CoclustMod(n_clusters=4)
     model.fit(X)
     cls.model = model
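The four fixtures above only build and cache a fitted model. A minimal sketch of a test method that could consume cls.model, assuming a setUpClass like the ones shown here (class name, test name, and assertions are illustrative, not taken from the original suites):

import unittest

class CoclustModTestCase(unittest.TestCase):
    # setUpClass would be one of the fixtures shown above,
    # storing the fitted model on cls.model

    def test_fitted_model(self):
        model = self.model
        # fit() assigns a cluster label to every row and every column
        self.assertTrue(len(model.row_labels_) > 0)
        self.assertTrue(len(model.column_labels_) > 0)
        # labels stay within the requested number of clusters
        self.assertLessEqual(len(set(model.row_labels_)), model.n_clusters)
        # the modularity criterion is exposed once fit() has run
        self.assertIsNotNone(model.modularity)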
Example #5
    def run_coclust(self):
        # co-clustering
        model = CoclustMod(n_clusters=4)
        model.fit(
            self.doc_term_mat
        )  # No errors? Is this right? Gensim types have plug-and-play support?

        top_term_plt = plot_cluster_top_terms(in_data=self.doc_term_mat,
                                              all_terms=self.vocab,
                                              nb_top_terms=5,
                                              model=model,
                                              do_plot=False)

        # print(get_term_graph(X=doc_term_mat,
        #                      model=model,
        #                      terms=vocab,
        #                      n_cluster=2,
        #                      n_top_terms=10,
        #                      n_neighbors=2,
        #                      stopwords=[]))

        clus_sz_plt = plot_cluster_sizes(model=model, do_plot=False)
        mat_plot = plot_reorganized_matrix(X=self.doc_term_mat,
                                           model=model,
                                           do_plot=False)

        return (top_term_plt, clus_sz_plt, mat_plot)
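The method above calls several plotting helpers without showing its imports. A minimal sketch of the imports it presumably relies on, assuming the standard coclust package layout (get_term_graph covers the commented-out call):

from coclust.coclustering import CoclustMod
from coclust.visualization import (get_term_graph,
                                   plot_cluster_sizes,
                                   plot_cluster_top_terms,
                                   plot_reorganized_matrix)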
Example #6
    def cluster(self, data):
        global weighted_edge_list, matrix, model, row_order, column_order, rowMap, colMap, subModels, row_sums_map, column_sums_map
        subModels = {}
        # num_clusters = 9
        weighted_edge_list = data[[
            "RECHTSTRAEGER", "MEDIUM_MEDIENINHABER", "EURO"
        ]]
        weighted_edge_list = weighted_edge_list.groupby(
            by=["RECHTSTRAEGER", "MEDIUM_MEDIENINHABER"]).sum().reset_index()

        G = nx.from_pandas_dataframe(weighted_edge_list,
                                     "RECHTSTRAEGER",
                                     "MEDIUM_MEDIENINHABER",
                                     "EURO",
                                     create_using=nx.DiGraph())
        row_order = np.sort(np.unique(weighted_edge_list["RECHTSTRAEGER"]))
        column_order = np.sort(
            np.unique(weighted_edge_list["MEDIUM_MEDIENINHABER"]))
        matrix_real = biadjacency_matrix(G,
                                         row_order,
                                         column_order=column_order,
                                         weight="EURO")
        matrix = matrix_real.toarray()
        row_sums = matrix.sum(axis=1).round(2)
        row_sums_map = dict(zip(row_order, row_sums))
        row_sums_map = {k: float(v) for k, v in row_sums_map.items()}
        column_sums = matrix.sum(axis=0).round(2)
        column_sums_map = dict(zip(column_order, column_sums))
        column_sums_map = {k: float(v) for k, v in column_sums_map.items()}

        model = CoclustMod(min(min(matrix.shape), num_clusters),
                           random_state=0)  #n_init=500
        model.fit(matrix)

        # test: send a different list
        rowMap = dict(zip(row_order, list(map(str, model.row_labels_))))
        colMap = dict(zip(column_order, list(map(str, model.column_labels_))))
        ret = []

        wel = weighted_edge_list.copy()
        wel["RECHTSTRAEGER"].update(wel["RECHTSTRAEGER"].map(rowMap))
        wel["MEDIUM_MEDIENINHABER"].update(
            wel["MEDIUM_MEDIENINHABER"].map(colMap))
        ret = wel.values.tolist()  # DataFrame.as_matrix() was removed in pandas 1.0

        clusters = self.getElementsbyCluster()

        return {"data": ret, "clusters": clusters}
Example #7
    def cluster(self, data, tempMem, num_clusters):
        #global weighted_edge_list, tempMem["firstGroupIndex"], tempMem["secondGroupIndex"], tempMem["valueIndex"], tempMem["matrix"], tempMem["model"], tempMem["row_order"], tempMem["column_order"], tempMem["rowMap"], tempMem["colMap"], tempMem["subModels"], tempMem["subModels"], tempMem["column_sums_map"]
        tempMem["subModels"] = {}

        dataKeys = data.keys()
        tempMem["firstGroupIndex"] = dataKeys[0]
        tempMem["secondGroupIndex"] = dataKeys[len(dataKeys) - 2]
        tempMem["valueIndex"] = dataKeys[len(dataKeys) - 1]

        # num_clusters = 9
        tempMem["weighted_edge_list"] = data[[tempMem["firstGroupIndex"],tempMem["secondGroupIndex"],tempMem["valueIndex"]]]
        tempMem["weighted_edge_list"] = tempMem["weighted_edge_list"].groupby(by = [tempMem["firstGroupIndex"], tempMem["secondGroupIndex"]]).sum().reset_index()

        G = nx.from_pandas_dataframe(tempMem["weighted_edge_list"],tempMem["firstGroupIndex"],tempMem["secondGroupIndex"],tempMem["valueIndex"], create_using=nx.DiGraph())
        tempMem["row_order"] = np.sort(np.unique(tempMem["weighted_edge_list"][tempMem["firstGroupIndex"]]))
        tempMem["column_order"] = np.sort(np.unique(tempMem["weighted_edge_list"][tempMem["secondGroupIndex"]]))
        matrix_real = biadjacency_matrix(G, tempMem["row_order"], column_order=tempMem["column_order"], weight=tempMem["valueIndex"])
        tempMem["matrix"] = matrix_real.toarray()
        row_sums = tempMem["matrix"].sum(axis=1).round(2)
        tempMem["row_sums_map"] = dict(zip(tempMem["row_order"], row_sums))
        tempMem["row_sums_map"] = {k:float(v) for k,v in tempMem["row_sums_map"].items()}
        column_sums = tempMem["matrix"].sum(axis=0).round(2)
        tempMem["column_sums_map"] = dict(zip(tempMem["column_order"], column_sums))
        tempMem["column_sums_map"] = {k:float(v) for k,v in tempMem["column_sums_map"].items()}

        tempMem["model"] = CoclustMod(min(min(tempMem["matrix"].shape), num_clusters),random_state=0) #n_init=500
        tempMem["model"].fit(tempMem["matrix"])

        # test: send a different list
        tempMem["rowMap"] = dict(zip(tempMem["row_order"], list(map(str, tempMem["model"].row_labels_))))
        tempMem["colMap"] = dict(zip(tempMem["column_order"], list(map(str,tempMem["model"].column_labels_))))
        ret = []

        wel = tempMem["weighted_edge_list"].copy()
        wel[tempMem["firstGroupIndex"]].update(wel[tempMem["firstGroupIndex"]].map(tempMem["rowMap"]))
        wel[tempMem["secondGroupIndex"]].update(wel[tempMem["secondGroupIndex"]].map(tempMem["colMap"]))
        #ret = wel.as_matrix().tolist()
        ret = wel.values.tolist()

        clusters = self.getElementsbyCluster(tempMem)

        return {"data": ret, "clusters": clusters}
Example #8
    def subcluster3(self, clusterID):
        global subModels

        clusterID_array = [int(x) for x in clusterID.split('.')]
        # print(clusterID_array)
        # print("subModels",subModels)
        subMatrix = model.get_submatrix(matrix, clusterID_array[0])
        sub_row_order = row_order[model.get_indices(clusterID_array[0])[0]]
        sub_column_order = column_order[model.get_indices(
            clusterID_array[0])[1]]

        for i, cID in enumerate(clusterID_array[1:]):
            smID = '.'.join(str(x) for x in clusterID_array[:(i + 1)])
            print("smID", smID)
            sm = subModels[smID]
            subMatrix = sm.get_submatrix(subMatrix, cID)
            sub_row_order = sub_row_order[sm.get_indices(cID)[0]]
            sub_column_order = sub_column_order[sm.get_indices(cID)[1]]

        zeros_cols = np.where(~subMatrix.any(axis=0))[0]
        zeros_rows = np.where(~subMatrix.any(axis=1))[0]
        subMatrix = np.delete(subMatrix, zeros_cols, 1)
        subMatrix = np.delete(subMatrix, zeros_rows, 0)
        sub_row_order = np.delete(sub_row_order, zeros_rows)
        sub_column_order = np.delete(sub_column_order, zeros_cols)

        num_clusters2 = min(min(subMatrix.shape), num_clusters)

        subModel = CoclustMod(num_clusters2, random_state=0)

        subModels[clusterID] = subModel
        # print("subModels",subModels)
        subModel.fit(subMatrix)

        for i, label in enumerate(subModel.row_labels_):
            rowMap[sub_row_order[i]] = str(clusterID) + "." + str(label)

        for i, label in enumerate(subModel.column_labels_):
            colMap[sub_column_order[i]] = str(clusterID) + "." + str(label)

        # ret = []
        # wel = weighted_edge_list.copy()
        # wel["RECHTSTRAEGER"].update(wel["RECHTSTRAEGER"].map(rowMap))
        # wel["MEDIUM_MEDIENINHABER"].update(wel["MEDIUM_MEDIENINHABER"].map(colMap))

        rowLabelSet = set(
            [str(clusterID) + "." + str(x) for x in subModel.row_labels_])
        colLabelSet = set(
            [str(clusterID) + "." + str(x) for x in subModel.column_labels_])
        #---

        rowMap2 = {
            k: (v if v in rowLabelSet else "Sonstige")
            for k, v in rowMap.items()
        }
        colMap2 = {
            k: (v if v in colLabelSet else "Sonstige")
            for k, v in colMap.items()
        }

        wel = weighted_edge_list.copy()
        # print(rowLabelSet)

        wel["RECHTSTRAEGER"].update(wel["RECHTSTRAEGER"].map(rowMap2))
        wel["MEDIUM_MEDIENINHABER"].update(
            wel["MEDIUM_MEDIENINHABER"].map(colMap2))

        idc = wel[(
            wel["RECHTSTRAEGER"].astype(str).str[:len(clusterID)] != clusterID)
                  & (wel["MEDIUM_MEDIENINHABER"].astype(
                      str).str[:len(clusterID)] != clusterID)].index
        wel = wel.drop(idc)

        wel2 = weighted_edge_list.copy()
        wel2 = wel2.drop(idc)
        row_sums_map2 = wel2.groupby(
            by=["RECHTSTRAEGER"]).sum().to_dict()["EURO"]
        row_sums_map2 = {k: float(v) for k, v in row_sums_map2.items()}
        column_sums_map2 = wel2.groupby(
            by=["MEDIUM_MEDIENINHABER"]).sum().to_dict()["EURO"]
        column_sums_map2 = {k: float(v) for k, v in column_sums_map2.items()}

        ret = []
        ret = wel.values.tolist()  # DataFrame.as_matrix() was removed in pandas 1.0

        # clusters = self.getElementsbyCluster()
        inv_rowMap2 = {}
        for k, v in rowMap2.items():
            inv_rowMap2.setdefault(v, []).append(k)

        inv_colMap2 = {}
        for k, v in colMap2.items():
            inv_colMap2.setdefault(v, []).append(k)

        clusters = {}
        for label in inv_rowMap2:
            clusters[label] = {
                "rows": {
                    k: row_sums_map2[k]
                    for k in inv_rowMap2[label] if k in row_sums_map2
                },
                "columns": {
                    k: column_sums_map2[k]
                    for k in inv_colMap2[label] if k in column_sums_map2
                }
            }

        return {"data": ret, "clusters": clusters}
Example #9
    df = pd.read_csv(label_file)
    y = np.unique(df['Label'], return_inverse=True)[1]  # as factor

    mat = io.loadmat(mat_file)['X']
    print(mat.shape)

    no_cluster = len(np.unique(y))
    print(no_cluster)

    algo_pipeline = []
    algo_pipeline.append((CoclustInfo(n_row_clusters=no_cluster,
                                      n_col_clusters=no_cluster,
                                      n_init=10,
                                      max_iter=200), "CoclustInfo"))
    algo_pipeline.append((CoclustMod(n_clusters=no_cluster,
                                     n_init=10,
                                     max_iter=200), "CoclustMod"))
    algo_pipeline.append((CoclustSpecMod(n_clusters=no_cluster,
                                         n_init=10,
                                         max_iter=200), "CoclustSpecMod"))

    for model, model_name in algo_pipeline:
        res_nmi, res_ari, res_acc = execute_algo(model, model_name, mat, y)

        # Save results
        out_dir = result_path + "/" + data_version + "/"
        makedir(out_dir)
        out_file = out_dir + dataset + "_" + mat_version + "_" + model_name + ".txt"
        content = str(res_nmi) + ", " + str(res_ari) + ", " + str(
            res_acc) + "\n"
        myfile = open(out_file, "a")
Example #10
from scipy.io import loadmat
from coclust.coclustering import CoclustMod

file_name = "/home/sayon/Dropbox/MyModules/Canvass/cclust_package/datasets/som.mat"
matlab_dict = loadmat(file_name)
X = matlab_dict['fea']

model = CoclustMod(n_clusters=4)
model.fit(X)

print(model.modularity)
predicted_row_labels = model.row_labels_
predicted_column_labels = model.column_labels_
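Example #10 stops at extracting the predicted labels. A short follow-up sketch of how they could be evaluated against ground-truth classes, assuming a label vector true_row_labels is available for the dataset (the variable name is illustrative):

from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# compare predicted document clusters with known classes
nmi = normalized_mutual_info_score(true_row_labels, predicted_row_labels)
ari = adjusted_rand_score(true_row_labels, predicted_row_labels)
print("NMI: %.3f, ARI: %.3f" % (nmi, ari))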
Example #11
plt.matshow(euclidean_distances(Feafile.values, Feafile.values))
plt.colorbar()
plt.title('Show the Euclidean distance matrix')
plt.show()

# %%Combined usage
# The following example shows how easy it is to use coclust to run several algorithms on the same dataset
import matplotlib.pyplot as plt
import numpy as np, scipy.sparse as sp, scipy.io as io
from sklearn.metrics import confusion_matrix
from coclust.coclustering import (CoclustMod, CoclustSpecMod, CoclustInfo)
from coclust.visualization import plot_reorganized_matrix

X = Feafile.values
model_1 = CoclustMod(n_clusters=4, n_init=4)
model_1.fit(X)
model_2 = CoclustSpecMod(n_clusters=4, n_init=4)
model_2.fit(X)
model_3 = CoclustInfo(n_row_clusters=3, n_col_clusters=4, n_init=4)
model_3.fit(X)
plt.figure()

plt.title('Three reorganized matrices for the dataset')
plt.subplot(131)
plot_reorganized_matrix(X, model_1)
plt.subplot(132)
plot_reorganized_matrix(X, model_2)
plt.subplot(133)
plot_reorganized_matrix(X, model_3)
plt.show()
Example #12
 def setUpClass(cls):
     model = CoclustMod(n_clusters=3)
     X = np.diag(range(1, 200))
     model.fit(X)
     cls.model = model
Example #13
 def setUpClass(cls):
     model = CoclustMod(n_clusters=3)
     X = np.diag(range(1, 200))
     model.fit(X)
     cls.model = model
Example #14
def genes_articles():
    print(request.json)
    tfidf = request.json["tfidf"]
    distance = request.json["distance"]
    coclust = int(request.json["coclust"])
    selected_genes = [v["label"] for v in request.json["genes"]]
    nb_cluster = int(request.json["nb"])

    genes_articles_str = [
        ' '.join(str(x) for x in genesjson[g]) for g in selected_genes
    ]

    if tfidf:
        vec = TfidfVectorizer()
    else:
        vec = CountVectorizer()

    X = vec.fit_transform(genes_articles_str)
    nb = nb_cluster
    if nb_cluster == 0 and coclust != 3:
        xn = X.shape[0]
        step = round(xn / 10) if round(xn / 10) > 0 else 1
        rng = range(1, xn, step)
        # _, modularities = best_modularity_partition(X, rng, n_rand_init=1)
        # nb = rng[np.argmax(modularities)]
        modularities = []
        for x in rng:
            print(x)
            m = CoclustMod(n_clusters=x, n_init=1).fit(X)
            modularities.append(m.modularity)
        nb = rng[np.argmax(modularities)]

    if coclust == 1:
        model = CoclustMod(n_clusters=nb, random_state=0)
    if coclust == 2:
        model = CoclustSpecMod(n_clusters=nb, random_state=0)
    if coclust == 3:
        model = CoclustInfo()

    dt = X.toarray()
    model.fit(dt)
    fit_data = dt[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    plt.figure(figsize=(22, 5))
    if nb_cluster == 0 and coclust != 3:
        plt.subplot(131)
        plt.plot(rng, modularities, 'ro-')
        plt.xlabel("Number of cluster")
        plt.ylabel("Modularity")
        plt.title("Max modularity for " + str(nb) + " clusters (" +
                  str(round(np.max(modularities), 3)) + ")")
        plt.axvline(x=nb, color='r', linestyle='-')
    plt.subplot(132)
    sns.heatmap(dt,
                cmap="BuPu",
                yticklabels=False,
                xticklabels=False,
                cbar=False)
    plt.title("Heatmap on Original Data")
    plt.subplot(133)
    sns.heatmap(fit_data,
                cmap="BuPu",
                yticklabels=False,
                xticklabels=False,
                cbar=False)
    plt.title("CoclustMod %i clusters" % nb)
    plt.savefig("img-ga1.jpg", bbox_inches='tight', pad_inches=0)

    # hierarchical clustering
    Z = linkage(dt, 'single', 'euclidean')

    plt.figure(figsize=(15, 7))
    plt.xlabel('')
    plt.ylabel('distance')
    dendrogram(Z, labels=selected_genes)
    plt.savefig("img-ga2.jpg", bbox_inches='tight', pad_inches=0)
    plt.close('all')

    return jsonify({"tab": 1})
Example #15
def genes_termes():
    tfidf = request.json["tfidf"]
    coclust = int(request.json["coclust"])
    selected_genes = [v["label"] for v in request.json["genes"]]
    nb_cluster = int(request.json["nb"])

    # get id articles from all genes
    l = [genesjson[g] for g in selected_genes]
    genes_articles = list(set([item for sublist in l for item in sublist]))
    # get text from article
    articles_text = [
        ' '.join(asthmajson[str(i)]["text"]) for i in genes_articles
    ]

    if tfidf:
        vec = TfidfVectorizer(max_df=0.7, min_df=0.01)
    else:
        vec = CountVectorizer(max_df=0.7, min_df=0.01)

    dt = vec.fit_transform(articles_text)

    matrix_article_terms = dt.toarray()
    matrix_genes_terms = defaultdict(
        lambda: np.zeros(matrix_article_terms.shape[1]).astype(np.float64))
    for idx, row in enumerate(matrix_article_terms):
        article = asthmajson[str(genes_articles[idx])]
        for ge in article["genes"]:
            if ge in selected_genes:
                matrix_genes_terms[ge] += row
    list_matrix_genes_terms = [v for k, v in matrix_genes_terms.items()]
    list_genes = [k for k, v in matrix_genes_terms.items()]
    #df3 = pd.DataFrame(list_matrix_genes_terms, columns=vec.get_feature_names())

    if coclust == 1:
        model = CoclustMod(n_clusters=nb_cluster, random_state=0)
    if coclust == 2:
        model = CoclustSpecMod(n_clusters=nb_cluster, random_state=0)
    if coclust == 3:
        model = CoclustInfo()

    dt = np.array(list_matrix_genes_terms)
    m1 = model.fit(dt)
    fit_data = dt[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    plt.figure(figsize=(20, 8))
    plt.subplot(121)
    sns.heatmap(np.log(dt + 1),
                cmap="BuPu",
                yticklabels=False,
                xticklabels=False,
                cbar=False)
    plt.title("Heatmap on Original Data", fontdict={'fontsize': 20})

    plt.subplot(122)
    sns.heatmap(np.log(fit_data + 1),
                cmap="BuPu",
                yticklabels=False,
                xticklabels=False,
                cbar=False)
    plt.title("CoclustMod %i clusters" % nb_cluster, fontdict={'fontsize': 20})
    plt.savefig("img-gt1.jpg", bbox_inches='tight', pad_inches=0)

    # Top terms by cluster
    nb_clust = np.unique(model.column_labels_)
    nm = np.array(vec.get_feature_names())
    dt_sum = np.sum(dt, axis=0)
    col_label = np.array(model.column_labels_)
    cluster = []
    for c in nb_clust:
        idx = np.argsort(-dt_sum[col_label == c])
        col = nm[np.array(model.column_labels_) == c]
        value = dt_sum[col_label == c][idx]
        name = col[idx]
        cluster.append({"name": list(name[0:8]), "value": list(value[0:8])})

    # hierarchical clustering
    Z = linkage(dt, 'single', 'euclidean')
    plt.figure(figsize=(15, 7))
    # plt.title('Hierarchical Clustering - Hamming')
    plt.xlabel('')
    plt.ylabel('distance')
    dendrogram(Z, labels=list_genes)
    plt.savefig("img-gt2.jpg", bbox_inches='tight', pad_inches=0)
    plt.close('all')

    return jsonify({"tab": 2, "cluster": cluster})