示例#1
0
def create_model(data, n_clusters, method, random):
    """Build a SpectralBiclustering estimator and fit it on ``data``.

    Args:
        data: Matrix handed to the estimator's ``fit``.
        n_clusters: Forwarded as the estimator's ``n_clusters``.
        method: Forwarded as the estimator's ``method``.
        random: Forwarded as the estimator's ``random_state``.

    Returns:
        The fitted estimator instance.
    """
    estimator_options = dict(n_clusters=n_clusters,
                             method=method,
                             random_state=random)
    fitted = SpectralBiclustering(**estimator_options)
    fitted.fit(data)
    return fitted
示例#2
0
def test_fit_best_piecewise():
    """The two step-shaped rows must be selected ahead of the ramp row."""
    candidates = np.array([
        [0, 0, 0, 1, 1, 1],
        [2, 2, 2, 3, 3, 3],
        [0, 1, 2, 3, 4, 5],
    ])
    estimator = SpectralBiclustering(random_state=0)
    selected = estimator._fit_best_piecewise(candidates,
                                             n_best=2,
                                             n_clusters=2)
    # The expected result is exactly the first two input rows.
    assert_array_equal(selected, candidates[:2])
def test_project_and_cluster():
    """Check ``_project_and_cluster`` on both dense and sparse input.

    The first two and last two rows of ``data`` are identical, so the
    expected row labels are ``[0, 0, 1, 1]``.
    """
    model = SpectralBiclustering(random_state=0)
    data = np.array([[1, 1, 1], [1, 1, 1], [3, 6, 3], [3, 6, 3]])
    vectors = np.array([[1, 0], [0, 1], [0, 0]])
    for mat in (data, csr_matrix(data)):
        # Pass the loop variable: the original always passed the dense
        # ``data``, so the CSR variant was never actually exercised.
        labels = model._project_and_cluster(mat, vectors, n_clusters=2)
        assert_array_equal(labels, [0, 0, 1, 1])
示例#4
0
def test_spectral_biclustering():
    """Test Kluger methods on a checkerboard dataset."""
    seed = 0
    grid = {'method': ['scale', 'bistochastic', 'log'],
            'svd_method': ['randomized', 'arpack'],
            'n_svd_vecs': [None, 20],
            'mini_batch': [False, True],
            'init': ['k-means++'],
            'n_init': [10],
            'n_jobs': [1]}
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5,
                                      random_state=seed)

    # Expectations shared by every successfully fitted model.
    expected_shape = (9, 30)
    expected_membership = np.repeat(3, 30)

    for mat in (S, csr_matrix(S)):
        for kwargs in ParameterGrid(grid):
            model = SpectralBiclustering(n_clusters=3,
                                         random_state=seed,
                                         **kwargs)

            if issparse(mat) and kwargs['method'] == 'log':
                # cannot take log of sparse matrix
                assert_raises(ValueError, model.fit, mat)
                continue

            model.fit(mat)

            assert_equal(model.rows_.shape, expected_shape)
            assert_equal(model.columns_.shape, expected_shape)
            assert_array_equal(model.rows_.sum(axis=0), expected_membership)
            assert_array_equal(model.columns_.sum(axis=0),
                               expected_membership)
            score = consensus_score(model.biclusters_, (rows, cols))
            assert_equal(score, 1)
 def run(self, data):
     """Bicluster ``data`` and return the column cluster labels.

     The estimator is built with
     ``n_clusters=(self.n_gene_classes, self.n_classes)``.

     Args:
         data: Matrix passed to ``SpectralBiclustering.fit``.
             NOTE(review): presumably genes are rows and cells are
             columns -- confirm against the caller.

     Returns:
         ``column_labels_`` of the fitted estimator.
     """
     bc = SpectralBiclustering(n_clusters=(self.n_gene_classes,
                                           self.n_classes))
     bc.fit(data)
     # Only the column labels are returned; the row labels
     # (``bc.row_labels_``) were previously bound to an unused local.
     return bc.column_labels_
示例#6
0
    def biclustering(matrix, distance, callback=None):
        """Search for the biclustering whose reordered matrix scores lowest.

        Args:
            matrix: 2-D array to reorder.
            distance: Callable taking the reordered matrix and returning a
                score; lower is better.
            callback: Optional callable receiving a progress fraction
                (starting at 0.2) before each candidate is fitted.

        Returns:
            A ``(row_order, column_order)`` pair of index arrays. When the
            smaller dimension is <= 2 the identity ordering is returned.
        """
        if min(matrix.shape) <= 2:
            return np.arange(matrix.shape[0]), np.arange(matrix.shape[1])

        # Start from +inf instead of the uint16 maximum: with the old
        # sentinel a distance that never dropped below 65535 left
        # ``best_model`` as None and the final return crashed.
        best_score = np.inf
        best_model = None

        # find the best biclusters (needs revision)
        limit = max(3, min(matrix.shape) // 2 - 1)
        for i in range(2, limit):
            if callback is not None:
                # float() keeps the progress fraction correct under
                # Python 2 integer division as well.
                callback(0.2 + float(i - 2) / (limit - 2) * 0.8)
            # perform biclustering
            model = SpectralBiclustering(
                n_clusters=i, method='log', random_state=0)
            model.fit(matrix)
            fit_data = matrix[np.argsort(model.row_labels_)]
            fit_data = fit_data[:, np.argsort(model.column_labels_)]

            # calculate score and save the lowest one; always keep the first
            # candidate so a NaN/inf score cannot leave us without a model
            score = distance(fit_data)
            if best_model is None or score < best_score:
                best_score = score
                best_model = model

        return (np.argsort(best_model.row_labels_),
                np.argsort(best_model.column_labels_))
示例#7
0
文件: main.py 项目: kg7155/DataMining
    def fit_predict(self, D):
        """Run ConsensusClustering algorithm on data D.

        For every candidate ``k`` in ``self.num_clusters`` a consensus
        matrix is computed and scored with ``self.calc_auc``; the ``k`` with
        the highest score is selected.

        Args:
            D: Input data; ``D.shape[0]`` is used as the number of samples.

        Returns:
            A tuple ``(P, M_k_best)`` where ``P`` is the row labelling
            obtained by biclustering the consensus matrix for the best k and
            ``M_k_best`` is that consensus matrix.
        """
        # number of samples
        n = D.shape[0]

        # AUC score for each k
        AUC_scores = np.zeros(len(self.num_clusters))
        for i, k in enumerate(self.num_clusters):
            M = self.calc_consensus(n, D, k)
            AUC_scores[i] = self.calc_auc(M)

        # find best number of clusters (k_best); the scores are indexed in
        # step with self.num_clusters, so look k up there (the original
        # referenced an undefined name ``K``)
        idx_k_best = np.argmax(AUC_scores)
        k_best = self.num_clusters[idx_k_best]

        # uncomment to see the best k for given input data
        #print("Best number of clusters (k): ", k_best)

        M_k_best = self.calc_consensus(n, D, k_best)

        # partition D into K-best clusters based on M_k_best using
        # SpectralBiclustering
        model = SpectralBiclustering(n_clusters=k_best, method='bistochastic')
        model.fit(M_k_best)
        P = model.row_labels_

        return P, M_k_best
示例#8
0
def test_fit_best_piecewise():
    """``_fit_best_piecewise`` should return the first two input rows."""
    step_low = [0, 0, 0, 1, 1, 1]
    step_high = [2, 2, 2, 3, 3, 3]
    ramp = [0, 1, 2, 3, 4, 5]
    vectors = np.array([step_low, step_high, ramp])

    model = SpectralBiclustering(random_state=0)
    chosen = model._fit_best_piecewise(vectors, n_best=2, n_clusters=2)

    assert_array_equal(chosen, vectors[:2])
示例#9
0
def test_spectral_biclustering():
    """Test Kluger methods on a checkerboard dataset."""
    random_state = 0
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5,
                                      random_state=random_state)
    search_space = {'method': ['scale', 'bistochastic', 'log'],
                    'svd_method': ['randomized', 'arpack'],
                    'n_svd_vecs': [None, 20],
                    'mini_batch': [False, True],
                    'init': ['k-means++'],
                    'n_init': [3],
                    'n_jobs': [1]}

    for mat in (S, csr_matrix(S)):
        for kwargs in ParameterGrid(search_space):
            model = SpectralBiclustering(n_clusters=3,
                                         random_state=random_state,
                                         **kwargs)

            log_on_sparse = issparse(mat) and kwargs['method'] == 'log'
            if log_on_sparse:
                # cannot take log of sparse matrix
                assert_raises(ValueError, model.fit, mat)
                continue

            model.fit(mat)

            # 3 x 3 clusters give 9 biclusters over a 30 x 30 matrix, and
            # each row/column is expected to belong to exactly 3 of them.
            for indicator in (model.rows_, model.columns_):
                assert_equal(indicator.shape, (9, 30))
            assert_array_equal(model.rows_.sum(axis=0),
                               np.repeat(3, 30))
            assert_array_equal(model.columns_.sum(axis=0),
                               np.repeat(3, 30))
            assert_equal(consensus_score(model.biclusters_,
                                         (rows, cols)), 1)
示例#10
0
def fi_selection_algo(metadata, settings, X, target_atts_list=None):
    """Turn feature-importance scores into codes via row biclustering.

    Args:
        metadata: Passed through to ``get_fi_scores``.
        settings: Mapping; ``settings["selection"]["param"]`` is converted
            to ``int`` and used as the number of row clusters.
        X: Passed through to ``get_fi_scores``.
        target_atts_list: Passed to ``get_fi_scores`` and
            ``labels_to_codes``.

    Returns:
        Whatever ``labels_to_codes`` produces for the row labels.
    """
    n_row_clusters = int(settings["selection"]["param"])
    fi_scores = get_fi_scores(X, target_atts_list, metadata)

    # The column side is always split into two clusters.
    biclusterer = SpectralBiclustering(n_clusters=(n_row_clusters, 2),
                                       method="log")
    biclusterer.fit(fi_scores)

    return labels_to_codes(biclusterer.row_labels_, target_atts_list)
示例#11
0
def plotBicluster(df, n_clusters):
    """Bicluster ``df``, plot the reordered frame and return it.

    Args:
        df: DataFrame to bicluster (indexed positionally with ``iloc``).
        n_clusters: Forwarded to ``SpectralBiclustering``.

    Returns:
        ``df`` with rows and columns reordered by their cluster labels.
    """
    biclusterer = SpectralBiclustering(n_clusters=n_clusters,
                                       method='log',
                                       random_state=0)
    biclusterer.fit(df)

    row_order = np.argsort(biclusterer.row_labels_)
    col_order = np.argsort(biclusterer.column_labels_)
    reordered = df.iloc[row_order, :]
    reordered = reordered.iloc[:, col_order]

    plotCorrHeatmap(dmat=reordered)
    return reordered
示例#12
0
def plotBicluster(df, n_clusters, col_labels=None):
    """Bicluster ``df``, plot the reordered frame and return it.

    Args:
        df: DataFrame to bicluster (indexed positionally with ``iloc``).
        n_clusters: Forwarded to ``SpectralBiclustering``.
        col_labels: Forwarded unchanged to ``plotCorrHeatmap``.

    Returns:
        ``df`` with rows and columns reordered by their cluster labels.
    """
    biclusterer = SpectralBiclustering(n_clusters=n_clusters,
                                       method='log',
                                       random_state=0)
    biclusterer.fit(df)

    by_row_cluster = df.iloc[np.argsort(biclusterer.row_labels_), :]
    by_both = by_row_cluster.iloc[:, np.argsort(biclusterer.column_labels_)]

    plotCorrHeatmap(dmat=by_both, col_labels=col_labels)
    return by_both
示例#13
0
	def fit_data_to_model(self,shapey):
		"""Bicluster ``self.data`` and store the reordered matrix and labels.

		Sets ``self.fit_data`` (rows then columns sorted by cluster label),
		``self.rowl``, ``self.coll`` and ``self.shapex``.

		Args:
			shapey: Forwarded as ``n_clusters``; also stored in ``self.shapex``.
		"""
		biclusterer = SpectralBiclustering(
			n_clusters=shapey, method='log', random_state=0)
		biclusterer.fit(self.data)

		row_order = np.argsort(biclusterer.row_labels_)
		col_order = np.argsort(biclusterer.column_labels_)
		self.fit_data = self.data[row_order]
		self.fit_data = self.fit_data[:, col_order]

		self.rowl = biclusterer.row_labels_
		self.coll = biclusterer.column_labels_
		self.shapex = shapey
示例#14
0
    def get_bicluster(self, data):
        """Return ``data`` with rows and columns grouped by bicluster.

        The estimator uses one cluster per column of ``data`` and is fitted
        on a NaN-filled copy; the returned frame is sliced from the original
        ``data``, so missing values are kept in the result.

        Args:
            data: DataFrame to bicluster.

        Returns:
            The reordered DataFrame.
        """
        # Biclustering
        n_columns = data.shape[1]
        biclusterer = SpectralBiclustering(n_clusters=n_columns,
                                           random_state=0)
        # Diagnostic output: column sums, then row sums.
        print(data.sum(axis=0))
        print(data.sum(axis=1))
        biclusterer.fit(data.fillna(0))

        by_rows = data.iloc[np.argsort(biclusterer.row_labels_)]
        return by_rows.iloc[:, np.argsort(biclusterer.column_labels_)]
示例#15
0
def SpectralBiCluster(data, n_clusters=(4, 4)):
    """Bicluster ``data`` and show the reordered matrix with ``matshow``.

    Args:
        data: Array-like converted with ``np.array`` before fitting.
        n_clusters: Forwarded to ``SpectralBiclustering``. The original code
            accepted this argument but never passed it on, so the
            estimator's own default was always used.

    Returns:
        None. The figure is created as a side effect.
    """
    from matplotlib import pyplot as plt
    try:
        from sklearn.cluster import SpectralBiclustering
    except ImportError:
        # Fallback for installations that only expose the legacy module.
        from sklearn.cluster.bicluster import SpectralBiclustering

    model = SpectralBiclustering(n_clusters=n_clusters,
                                 method='log',
                                 random_state=0)
    data = np.array(data)
    model.fit(data)
    fit_data = data[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
示例#16
0
 def fit_data_to_model(self, shapey):
     """Fit a log-method biclustering on ``self.data`` and cache results.

     After the call ``self.fit_data`` holds ``self.data`` with rows and
     columns sorted by cluster label, ``self.rowl`` / ``self.coll`` hold
     the raw labels and ``self.shapex`` holds ``shapey``.

     Args:
         shapey: Value passed as ``n_clusters``.
     """
     settings = dict(n_clusters=shapey, method='log', random_state=0)
     model = SpectralBiclustering(**settings)
     model.fit(self.data)

     # Reorder rows first, then columns of the row-sorted matrix.
     self.fit_data = self.data[np.argsort(model.row_labels_)]
     self.fit_data = self.fit_data[:, np.argsort(model.column_labels_)]

     self.rowl, self.coll = model.row_labels_, model.column_labels_
     self.shapex = shapey
def test_errors():
    """Invalid parameters or 3-D input must make ``fit`` raise ValueError."""
    data = np.arange(25).reshape((5, 5))

    # Each entry is one invalid constructor configuration.
    invalid_settings = [
        {'n_clusters': (3, 3, 3)},
        {'n_clusters': 'abc'},
        {'n_clusters': (3, 'abc')},
        {'method': 'unknown'},
        {'svd_method': 'unknown'},
        {'n_components': 0},
        {'n_best': 0},
        {'n_components': 3, 'n_best': 4},
    ]
    for settings in invalid_settings:
        model = SpectralBiclustering(**settings)
        assert_raises(ValueError, model.fit, data)

    # A default estimator must still reject three-dimensional input.
    model = SpectralBiclustering()
    data = np.arange(27).reshape((3, 3, 3))
    assert_raises(ValueError, model.fit, data)
示例#18
0
def plot_biclustering_with_pearson(time_ms, title):
    """Bicluster z-scored channel data and save the reordered matrix as SVG.

    Reads the module-level ``matrix``, slices it with ``slice_matrix`` and
    transforms it with ``calculate_n_columns`` before z-scoring.

    Args:
        time_ms: Slice length handed to ``slice_matrix``; also used in the
            plot title and the output file name.
        title: Label inserted into the output file name.
    """
    sliced_matrix = slice_matrix(matrix, time_ms)
    channels_data = calculate_n_columns(sliced_matrix)
    z_score = stats.zscore(channels_data)
    # NOTE(review): matshow() below opens a new figure, so this title may
    # end up on a different figure than the one that is saved -- confirm.
    plt.title('Z Score Biclustering Over %i ms' % time_ms)
    spectral_model = SpectralBiclustering()
    spectral_model.fit(z_score)
    fit_data = z_score[np.argsort(spectral_model.row_labels_)]
    fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    # The arguments were previously swapped, feeding ``title`` to ``%i``
    # (a TypeError for any non-numeric title).
    plt.savefig('z_score_%s_biclustering_all_ts_%i.svg' % (title, time_ms))
示例#19
0
def test_project_and_cluster():
    """Check ``_project_and_cluster`` on both dense and sparse input.

    Rows 0-1 and rows 2-3 of ``data`` are identical, so the expected row
    labels are ``[0, 0, 1, 1]``.
    """
    model = SpectralBiclustering(random_state=0)
    data = np.array([[1, 1, 1],
                     [1, 1, 1],
                     [3, 6, 3],
                     [3, 6, 3]])
    vectors = np.array([[1, 0],
                        [0, 1],
                        [0, 0]])
    for mat in (data, csr_matrix(data)):
        # Use ``mat`` here: passing ``data`` (as before) meant the sparse
        # variant was silently never tested.
        labels = model._project_and_cluster(mat, vectors,
                                            n_clusters=2)
        assert_array_equal(labels, [0, 0, 1, 1])
示例#20
0
def biclustering(filtered, checked) :
	"""Bicluster ``filtered['data']`` and draw two reordered graphs.

	The first graph reorders both axes by cluster label; the second keeps
	the x axis in its original order and only reorders rows. Relies on the
	module-level ``group1``, ``group2`` and ``bd``.

	Args:
		filtered: Mapping with 'data', 'genus' and 'pvalue' entries.
		checked: Forwarded unchanged to ``bd.draw_graph``.
	"""
	# With fewer than two data rows fall back to a single cluster per axis.
	n_clusters = (2, 2) if len(filtered['data']) >= 2 else (1, 1)

	model = SpectralBiclustering(n_clusters=n_clusters, method='log', random_state=0)
	data = np.asarray(filtered['data'])
	model.fit(data)

	# biclustering: reorder rows, then columns
	y = np.argsort(model.row_labels_)
	x = np.argsort(model.column_labels_)
	y_fit_data = data[y]
	fit_data = y_fit_data[:, x]

	# set y label: position of every original row after sorting
	y_label = [0] * len(y)
	for rank, source_row in enumerate(y) :
		y_label[source_row] = rank

	# set x label: position of every original column after sorting
	x_label = [0] * len(x)
	for rank, source_col in enumerate(x) :
		x_label[source_col] = rank

	reordered_graph = bd.draw_graph(group1, group2, checked,
		x = x, x_label = x_label,
		y_label = y_label,
		fit_data = fit_data,
		genus_data = filtered['genus'],
		pvalue_label = filtered['pvalue'],
		title = "After biclustering")
	reordered_graph.draw()

	# biclustering of fixed x-axis domain
	n_samples = len(group1+group2)
	fixed_x_graph = bd.draw_graph(group1, group2, checked,
		x_label = list(range(n_samples)),
		y_label = y_label,
		x = list(range(n_samples)),
		fit_data = y_fit_data,
		genus_data = filtered['genus'],
		pvalue_label = filtered['pvalue'],
		title = "After biclustering; fixed x domins")
	fixed_x_graph.draw()
def spectral_biclust(E, ngenes=3, nconditions=1,  spectral_method="bistochastic", n=6, n_best_ratio=0.5, **kwargs):
    """Run spectral biclustering on ``standardize(E)`` and wrap the result.

    Args:
        E: Frame whose ``columns`` are indexed by the column indicators and
            whose ``index`` is indexed by the row indicators.
        ngenes: Number of column clusters.
        nconditions: Number of row clusters.
        spectral_method: Forwarded as ``method``.
        n: Forwarded as ``n_components``.
        n_best_ratio: Fraction of ``n`` used as ``n_best`` (at least 1).
        **kwargs: Accepted but not used.

    Returns:
        A list with one ``Bicluster(genes, conditions)`` per bicluster.
    """
    n_best = max(int(n * n_best_ratio), 1)

    spectral = SpectralBiclustering(
        n_clusters=(nconditions, ngenes),
        method=spectral_method,
        n_components=n,
        n_best=n_best,
    )
    spectral.fit(standardize(E))

    return [
        Bicluster(E.columns[col_mask], E.index[row_mask])
        for col_mask, row_mask in zip(spectral.columns_, spectral.rows_)
    ]
示例#22
0
def spectral_bi_cluster(data, n_clusters, para_jobs=1, random_state=None):
    """Fit a bistochastic spectral biclustering and return both labelings.

    Args:
        data: Matrix passed to ``fit``.
        n_clusters: Two-element sequence; its length is asserted.
        para_jobs: Forwarded as ``n_jobs``.
        random_state: Forwarded as ``random_state``.

    Returns:
        A ``(row_labels, col_labels)`` tuple from the fitted estimator.
    """
    from sklearn.cluster.bicluster import SpectralBiclustering
    pair_given = len(n_clusters) == 2
    assert pair_given, "n_cluster should be a tuple or list that contains 2 integer!"

    estimator = SpectralBiclustering(
        n_clusters,
        random_state=random_state,
        n_jobs=para_jobs,
        method='bistochastic',
        n_best=20,
        n_components=40,
    )
    estimator.fit(data)
    return estimator.row_labels_, estimator.column_labels_
 def _spectral_bicluster(self, n_clusters, interaction_matrix):
     """Bicluster ``self.interaction_matrix`` and return both labelings.

     NOTE(review): the ``interaction_matrix`` argument is not used; the
     instance attribute is fitted instead -- confirm this is intended.

     Args:
         n_clusters: Forwarded to ``SpectralBiclustering``.
         interaction_matrix: Currently ignored (see note above).

     Returns:
         A ``(row_labels, column_labels)`` tuple.
     """
     estimator = SpectralBiclustering(n_clusters=n_clusters, random_state=0)
     fitted = estimator.fit(self.interaction_matrix)
     return fitted.row_labels_, fitted.column_labels_
示例#24
0
def plot_biclustering_raw_data(time_ms, t=False):
    """Bicluster z-scored slices of the module-level ``matrix`` and show them.

    Args:
        time_ms: Slice length handed to ``slice_matrix``; also used in the
            plot title.
        t: When true, the sliced matrix is transposed before clustering.
    """
    # take the transpose of sliced matrix
    if t:
        channels_data = slice_matrix(matrix, time_ms).T
    else:
        channels_data = slice_matrix(matrix, time_ms)
    # Formatted output prints the same "rows cols" text on Python 2 and 3;
    # the former print statement was a syntax error on Python 3.
    print('%s %s' % (len(channels_data), len(channels_data[1])))
    z_score = stats.zscore(channels_data)
    # NOTE(review): matshow() below opens a new figure, so this title may
    # not land on the figure that is displayed -- confirm.
    plt.title('Z Score Biclustering Over %i ms' % time_ms)
    spectral_model = SpectralBiclustering()
    spectral_model.fit(z_score)
    fit_data = z_score[np.argsort(spectral_model.row_labels_)]
    fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    # plt.savefig('z_score_raw_biclustering_all_ts_%i_T_%s.svg' % (time_ms, str(t)))
    plt.show()
示例#25
0
def _fit_timed_biclustering(tfidf, k1, k2):
    """Fit a (k1, k2) SpectralBiclustering on ``tfidf`` and report the time."""
    fit_started = time.time()
    model = SpectralBiclustering(n_clusters=(k1, k2), random_state=0)
    model.fit(tfidf)
    fit_finished = time.time()
    print("Biclustering process takes", int(round(fit_finished - fit_started)),
          "seconds")
    return model


def spectral(dataset_name, full, preprocessing, mindf, k1, k2, ngram_min,
             ngram_max, start, end, n):
    """Bicluster a corpus' TF-IDF matrix, as a whole or per time slice.

    TF-IDF matrices and clustering results are only computed when the
    corresponding ``*_exists`` helper reports them missing.

    Args:
        dataset_name: Dataset identifier used by all storage helpers.
        full: If true, cluster the whole corpus; otherwise split it with
            ``split_data_in_time_slices`` and cluster each slice.
        preprocessing: Preprocessing identifier used by the storage helpers.
        mindf, ngram_min, ngram_max: Forwarded to ``create_tfidf``.
        k1, k2: Row and column cluster counts.
        start, end, n: Time-slicing parameters (used when ``full`` is false).
    """
    if not spectral_directory_exists(dataset_name):
        create_spectral_directory(dataset_name)
    h, c = obtain_file_name_from_dataset(dataset_name, preprocessing)
    corpus = obtain_full_corpus(h, c)
    if full:
        texts = corpus.text.values
        docnames = corpus.text.index.values
        if not tfidf_exists(dataset_name, preprocessing):
            X, v = create_tfidf(texts, mindf, ngram_min, ngram_max)
            words = v.get_feature_names()
            store_data(dataset_name, preprocessing, X, docnames, words)
        tfidf, documents, terms = load_data(dataset_name, preprocessing)
        if not spectral_exists(get_directory_dataset(dataset_name),
                               dataset_name, preprocessing, mindf, k1, k2,
                               ngram_min, ngram_max):
            # Timing lives in the helper so the ``start``/``end`` parameters
            # are no longer overwritten with timestamps.
            model = _fit_timed_biclustering(tfidf, k1, k2)
            save_clasification(get_directory_dataset(dataset_name),
                               dataset_name, preprocessing, mindf, k1, k2,
                               ngram_min, ngram_max, model)
    else:
        time_corpus = split_data_in_time_slices(corpus, start, end, n)
        if not tfidf_periods_exists(dataset_name, preprocessing, start, end,
                                    n):
            os.makedirs(
                get_directory_dataset_periods(dataset_name, preprocessing,
                                              start, end, n))
            for (s, e), corp in time_corpus.items():
                texts = corp.text.values
                docnames = corp.text.index.values
                X, v = create_tfidf(texts, mindf, ngram_min, ngram_max)
                words = v.get_feature_names()
                store_data_periods(dataset_name, preprocessing, start, end, n,
                                   s, e, X, docnames, words)
        for s, e in time_corpus:
            tfidf, documents, terms = load_data_periods(
                dataset_name, preprocessing, start, end, n, s, e)
            if not spectral_periods_exists(dataset_name, preprocessing, mindf,
                                           k1, k2, ngram_min, ngram_max, start,
                                           end, n, s, e):
                model = _fit_timed_biclustering(tfidf, k1, k2)
                save_clasification_periods(dataset_name, preprocessing, mindf,
                                           k1, k2, ngram_min, ngram_max, model,
                                           start, end, n, s, e)
示例#26
0
def plot_biclusters_n_intervals(n_intervals=30000):
    """Average the first 64 rows of ``matrix`` in windows and bicluster them.

    Each row of the module-level ``matrix`` is cut into consecutive windows
    of ``n_intervals`` samples; a window is only used while its end index is
    strictly below the row length. The window means are z-scored,
    biclustered and saved as an SVG.

    Args:
        n_intervals: Window length in samples.
    """
    channels_data = []
    for row in range(64):
        row_data = matrix[row]
        window_means = []
        start, end = 0, n_intervals
        while end < len(row_data):
            window = row_data[start:end]
            window_means.append(float(sum(window)) / len(window))
            start, end = end, end + n_intervals
        channels_data.append(window_means)

    z_score = stats.zscore(np.array(channels_data))
    plt.title('Z Score Biclustering')
    spectral_model = SpectralBiclustering()
    spectral_model.fit(z_score)

    row_order = np.argsort(spectral_model.row_labels_)
    col_order = np.argsort(spectral_model.column_labels_)
    fit_data = z_score[row_order][:, col_order]

    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.savefig('z_score_raw_biclustering_all_%is.svg' % (n_intervals / 1000))
示例#27
0
    def spectral_biclustering(cls, *args):
        """
        Build an instance of ``cls`` around a SpectralBiclustering model.

        :param args: positional arguments forwarded to SpectralBiclustering
        :return: ``cls`` called with the newly created model
        """
        return cls(SpectralBiclustering(*args))
示例#28
0
def spectral_cluster(dataframe, n_clusters=(30, 30), show_plots=False):
    """Fit a log-method spectral biclustering on a NaN-filled DataFrame.

    Args:
        dataframe: Input frame; missing values are replaced by 0.0.
        n_clusters: Forwarded to ``SpectralBiclustering``.
        show_plots: When true, draw the reordered data and the outer
            product of the sorted labels with ``matshow``.

    Returns:
        The fitted estimator.
    """
    data = dataframe.fillna(0.0).values
    model = SpectralBiclustering(n_clusters=n_clusters,
                                 method='log',
                                 random_state=0)
    model.fit(data)

    by_rows = data[np.argsort(model.row_labels_)]
    fit_data = by_rows[:, np.argsort(model.column_labels_)]

    if not show_plots:
        return model

    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")

    checkerboard = np.outer(np.sort(model.row_labels_) + 1,
                            np.sort(model.column_labels_) + 1)
    plt.matshow(checkerboard, cmap=plt.cm.Blues)
    plt.title("Checkerboard structure of rearranged data")

    return model
示例#29
0
def test_perfect_checkerboard():
    """Noise-free checkerboards of three shapes must be recovered exactly."""
    model = SpectralBiclustering(3, svd_method="arpack", random_state=0)

    # Square, tall and wide inputs are fitted with the same estimator.
    for shape in ((30, 30), (40, 30), (30, 40)):
        S, rows, cols = make_checkerboard(shape, 3, noise=0, random_state=0)
        model.fit(S)
        score = consensus_score(model.biclusters_, (rows, cols))
        assert_equal(score, 1)
示例#30
0
def Spectral_BiClustering(M, args):
    '''Function to perform bipartite clustering.

    Args:
        M: Sparse matrix; it is converted with ``tocsc()`` for fitting and
            ``tocoo()`` for the reordered copy.
        args: Options object providing ``arpack`` and ``nClusters`` (it is
            also handed to ``save_clusters``).

    Returns:
        A ``(model, fit_data)`` tuple: the fitted estimator and a COO copy
        of ``M`` whose row/column indices are remapped by cluster order.
    '''
    # Create model
    try:
        model_kwargs = {'n_clusters': args.nClusters}
        if args.arpack:
            model_kwargs['svd_method'] = 'arpack'
        model = SpectralBiclustering(**model_kwargs)
    except Exception:
        # Keep the original hint, but re-raise: swallowing the error left
        # ``model`` unbound and produced an unrelated NameError below.
        print('-r 1 may cause problems when svd_method has been set to arpack')
        raise

    print('Running biclustering')
    model.fit(M.tocsc())
    print('Biclustering done')

    # Fit to data
    # fit_data = M[np.argsort(model.row_labels_)]
    # fit_data = fit_data[:, np.argsort(model.column_labels_)]
    fit_data = M.tocoo()
    fit_data.row = invert_permutation(np.argsort(model.row_labels_))[fit_data.row]
    fit_data.col = invert_permutation(np.argsort(model.column_labels_))[fit_data.col]

    save_clusters(model, fit_data, args, '_BiClustering', 1)

    return model, fit_data
def test_perfect_checkerboard():
    """Currently disabled: the test raises ``SkipTest`` before doing any work."""
    raise SkipTest("This test is failing on the buildbot, but cannot"
                   " reproduce. Temporarily disabling it until it can be"
                   " reproduced and  fixed.")
    # Everything below is unreachable until the skip above is removed.
    model = SpectralBiclustering(3, svd_method="arpack", random_state=0)

    for shape in ((30, 30), (40, 30), (30, 40)):
        S, rows, cols = make_checkerboard(shape, 3, noise=0, random_state=0)
        model.fit(S)
        assert_equal(consensus_score(model.biclusters_, (rows, cols)), 1)
示例#32
0
def _get_clusters_using_spectrals(corrarr, n_clusters=5, mode='co'):
    if mode=='co':
        model = SpectralCoclustering(n_clusters, random_state=0)
        model.fit(corrarr)
        indices = np.arange(corrarr.columns.size)
        clusters = [indices[x].tolist() for x in model.columns_]
        return clusters
    elif mode=='bi':
        model = SpectralBiclustering(n_clusters, random_state=0)
        model.fit(corrarr)
        indices = np.arange(corrarr.columns.size)
        clusters = [indices[x].tolist() for x in model.columns_]
        repetition_start = clusters[1:].index(clusters[0]) + 1
        return clusters[:repetition_start]
    else:
        raise("Mode wrong?")
def test_spectral_biclustering():
    """Test Kluger methods on a checkerboard dataset.

    Each non-default parameter value is applied on its own to an otherwise
    fixed estimator, for both dense and CSR input.
    """
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5,
                                      random_state=0)

    non_default_params = {'method': ['scale', 'log'],
                          'svd_method': ['arpack'],
                          'n_svd_vecs': [20],
                          'mini_batch': [True]}
    expected_membership = np.repeat(3, 30)

    for mat in (S, csr_matrix(S)):
        for param_name, param_values in non_default_params.items():
            for param_value in param_values:
                model = SpectralBiclustering(
                    n_clusters=3,
                    n_init=3,
                    init='k-means++',
                    random_state=0,
                )
                model.set_params(**{param_name: param_value})

                if issparse(mat) and model.get_params().get('method') == 'log':
                    # cannot take log of sparse matrix
                    with pytest.raises(ValueError):
                        model.fit(mat)
                    continue

                model.fit(mat)

                assert model.rows_.shape == (9, 30)
                assert model.columns_.shape == (9, 30)
                assert_array_equal(model.rows_.sum(axis=0),
                                   expected_membership)
                assert_array_equal(model.columns_.sum(axis=0),
                                   expected_membership)
                score = consensus_score(model.biclusters_, (rows, cols))
                assert score == 1

                _test_shape_indices(model)
示例#34
0
 def cocluster(self, mx, blockdiag=False):
     """Reorder ``mx`` (and ``self.prev`` / ``self.case``) by cluster label.

     Args:
         mx: Matrix to cluster and reorder.
         blockdiag: Use SpectralCoclustering when true; otherwise
             SpectralBiclustering with ``n_clusters=(4, 3)``.

     Returns:
         ``mx`` with rows and columns sorted by cluster label.
         ``self.prev`` is reordered like the rows and ``self.case`` like
         the columns.
     """
     logging.info('Co-clustering Tade..')
     if not blockdiag:  # checkerboard
         logging.info('checkerboard')
         clusser = SpectralBiclustering(n_jobs=-1, n_clusters=(4, 3))
         #n_clusters=3, svd_method='randomized',
     else:
         logging.info('blockdiag')
         clusser = SpectralCoclustering(n_jobs=-1)
     clusser.fit(mx)

     logging.info('Argsorting mx rows..')
     row_order = np.argsort(clusser.row_labels_)
     mx = mx[row_order]
     self.prev = self.prev[row_order]

     logging.info('Argsorting mx cases..')
     col_order = np.argsort(clusser.column_labels_)
     mx = mx[:, col_order]
     self.case = self.case[col_order]
     return mx
示例#35
0
def test_perfect_checkerboard():
    """A single estimator must recover noise-free checkerboards exactly."""
    model = SpectralBiclustering(3, svd_method="arpack", random_state=0)
    shapes = [(30, 30), (40, 30), (30, 40)]

    for shape in shapes:
        S, rows, cols = make_checkerboard(shape, 3, noise=0,
                                          random_state=0)
        model.fit(S)
        truth = (rows, cols)
        assert_equal(consensus_score(model.biclusters_, truth), 1)
示例#36
0
def test_perfect_checkerboard():
    """Disabled test: ``SkipTest`` is raised before any fitting happens."""
    raise SkipTest("This test is failing on the buildbot, but cannot"
                   " reproduce. Temporarily disabling it until it can be"
                   " reproduced and  fixed.")
    # Unreachable while the skip above is in place.
    model = SpectralBiclustering(3, svd_method="arpack", random_state=0)
    shapes = [(30, 30), (40, 30), (30, 40)]

    for shape in shapes:
        S, rows, cols = make_checkerboard(shape, 3, noise=0,
                                          random_state=0)
        model.fit(S)
        truth = (rows, cols)
        assert_equal(consensus_score(model.biclusters_, truth), 1)
示例#37
0
def test_spectral_biclustering():
    """Test Kluger methods on a checkerboard dataset.

    Every non-default parameter value is set individually on an otherwise
    fixed estimator, for both dense and CSR input.
    """
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5,
                                      random_state=0)

    non_default_params = {'method': ['scale', 'log'],
                          'svd_method': ['arpack'],
                          'n_svd_vecs': [20],
                          'mini_batch': [True]}
    base_settings = dict(n_clusters=3, n_init=3, init='k-means++',
                         random_state=0)

    for mat in (S, csr_matrix(S)):
        for param_name, param_values in non_default_params.items():
            for param_value in param_values:
                model = SpectralBiclustering(**base_settings)
                model.set_params(**{param_name: param_value})

                if issparse(mat) and model.get_params().get('method') == 'log':
                    # cannot take log of sparse matrix
                    assert_raises(ValueError, model.fit, mat)
                    continue

                model.fit(mat)

                for indicator in (model.rows_, model.columns_):
                    assert_equal(indicator.shape, (9, 30))
                assert_array_equal(model.rows_.sum(axis=0),
                                   np.repeat(3, 30))
                assert_array_equal(model.columns_.sum(axis=0),
                                   np.repeat(3, 30))
                assert_equal(consensus_score(model.biclusters_,
                                             (rows, cols)), 1)

                _test_shape_indices(model)
##0.780023781213

###############################################################################
## Draw  dendrogram
# Ward linkage over the module-level ``data``; leaves are labelled by author.
Z = linkage(data, 'ward')
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
# leaf_rotation rotates the x axis labels; leaf_font_size sets their size.
dendrogram(Z, leaf_rotation=90., leaf_font_size=8., labels=np.array(authors))
plt.show()


###############################################################################
## Biclustering
data = data.astype('float')
bc = SpectralBiclustering(n_clusters=(n_authors, 5))
bc.fit(data)
## Rows are reordered by row label, then columns by column label.
bc_data = data[np.argsort(bc.row_labels_)][:, np.argsort(bc.column_labels_)]
## How to annotate the words?
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")
plt.matshow(bc_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearrange to show biclusters")
示例#39
0
        l = int(sys.argv[4])
        output_mat_name = sys.argv[5]
        tfidf = load_sparse_mat(mat_name,mat_filename).astype(float32)
        data = tfidf.A

        im = plt.matshow(data, aspect='auto', cmap='jet')
        vmax = amax(data)
        vmin = amin(data)
        plt.clim(vmin,vmax)
        plt.colorbar(im)
        m,n = tfidf.shape
        print("Matrix dimensions: ",m,"x",n)
        print("Row clusters:",k)
        print("Column clusters:",l)
        start = time.time()
        model = SpectralBiclustering(n_clusters=(k,l),random_state=0)
        model.fit(tfidf)
        end = time.time()
        print("Biclustering process takes",int(round(end-start)),"seconds")
        fit_data = data[np.argsort(model.row_labels_)]
        fit_data = fit_data[:, np.argsort(model.column_labels_)]
        im = plt.matshow(fit_data, aspect='auto', cmap='jet')
        plt.clim(vmin,vmax)
        plt.colorbar(im)
        im = plt.matshow(np.outer(np.sort(model.row_labels_) + 1,
                     np.sort(model.column_labels_) + 1),
            cmap='jet',aspect='auto')
        plt.clim(vmin,vmax)
        plt.colorbar(im)
        plt.title("Checkerboard structure of rearranged data")
        plt.show()
示例#40
0
    spectral_model.fit(z_score)
    fit_data = z_score[np.argsort(spectral_model.row_labels_)]
    fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.savefig('z_score_raw_biclustering_all_%is.svg' % (n_intervals / 1000))


def dump_raw_z_scores():
    """Pickle the z-scores of the module-level ``matrix``.

    ``matrix`` is converted to an array, passed through ``stats.zscore``
    and the result is written to 'raw_z_npdump.dump' with ``ndarray.dump``.
    """
    raw_values = np.array(matrix)
    z_scores = np.array(stats.zscore(raw_values))
    z_scores.dump('raw_z_npdump.dump')


if __name__ == '__main__':
    # Resolved an unresolved merge conflict (HEAD vs. 51502dc5) that left
    # conflict markers in the file. The HEAD side (load the dumped z-scores,
    # bicluster, save the figure) is kept as the entry point.
    #
    # The other side's driver calls are preserved below for reference. Run
    # dump_raw_z_scores() once beforehand to (re)create 'raw_z_npdump.dump'.
    # Its trailing print(z_score) was dropped: no module-level `z_score` is
    # visible in this file -- NOTE(review): confirm before restoring it.
    # plot_biclustering_with_pearson(30000000000)
    # plot_biclustering_raw_data(60000)
    # plot_biclustering_raw_data(60000, t=True)
    # plot_coclusters_raw_data(60000)
    # plot_coclusters_raw_data(60000, t=True)
    # plot_biclusters_n_intervals(15000)
    # dump_raw_z_scores()

    # The file is written by ndarray.dump(), i.e. it is a pickle, so np.load
    # must be told explicitly that unpickling is allowed. Only load dumps
    # produced by this script: unpickling untrusted files is unsafe.
    z_scores = np.load('raw_z_npdump.dump', allow_pickle=True)

    spectral_model = SpectralBiclustering()
    spectral_model.fit(z_scores)

    # Reorder rows, then columns, by their assigned cluster label.
    fit_data = z_scores[np.argsort(spectral_model.row_labels_)]
    fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]

    # matshow() opens a new figure, so the title has to be set afterwards to
    # end up on the figure that is actually saved.
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title('Z Score Biclustering')
    plt.savefig('z_score_bicluster.svg')
示例#41
0
#         train_features[user]['average_set_score'] = sum_set_scores / float(num_sets)
#         # s average score
#         sum_s_scores = 0
#         for i in range(0, num_sets):
#             sum_s_scores += grades_rowdict[key]['s' + str(i)]
#         train_features[user]['average_s_score'] = sum_set_scores / float(num_sets)
#         # rest of the features
#         train_features[user]['course_score'] = grades_rowdict[key]['course']
#         train_features[user]['final_exam_score'] = grades_rowdict[key]['final']
#         train_features[user]['hw_score'] = grades_rowdict[key]['hw']
#         train_features[user]['letter'] = grades_rowdict[key]['letter']
#         train_features[user]['demerit'] = grades_rowdict[key]['demerit']
#     else:
#         pass

# MACHINE LEARNING CLUSTERING
import numpy as np
from sklearn.cluster import KMeans, DBSCAN
# SpectralBiclustering is exported by sklearn.cluster itself; the old
# sklearn.cluster.bicluster module path is not available in newer releases.
from sklearn.cluster import SpectralBiclustering

# k-means on the feature table, 5 clusters, 10 restarts.
kmeans = KMeans(init='k-means++', n_clusters=5, n_init=10)
kmeans.fit(train_features)

# Spectral biclustering of the same table using the 'log' method.
model = SpectralBiclustering(n_clusters=5, method='log', random_state=0)
model.fit(train_features)

# The two bare expressions below only display a value in an interactive
# session; when run as a script they produce no output.
train_features.loc['bf7aa87b-444a-4eff-9f81-b4078e6dccd3']

model.row_labels_


示例#42
0
    # NOTE: this excerpt starts at the handler of a `try` whose body (and the
    # enclosing loop, if any) is not visible.
    except:
        print('FAYOL!')
    # Set the matrix cell for this media item and the user referenced by
    # db[media_id][3] -- presumably the owner/poster; verify against the db
    # schema.
    media_id_num = picsdict[media_id]
    m[media_id_num, usersdict[db[media_id][3]]] = True
    # Set the cell for every user in `temp` as well; when that assignment
    # fails, the username is collected in `other_users` instead.
    for user in temp:
        try:
            m[media_id_num, usersdict[user.username]] = True
        except:
#            print(':3 ', user.username)
            other_users.add(user.username)


import pickle

import numpy as np
from matplotlib import pyplot as plt
# SpectralBiclustering is exported by sklearn.cluster itself; the old
# sklearn.cluster.bicluster module path is not available in newer releases.
from sklearn.cluster import SpectralBiclustering
from sklearn.metrics import consensus_score

# Round-trip the matrix through a pickle file. The context managers make
# sure both file handles are closed (the data is flushed before re-reading).
with open("save.p", "wb") as dump_file:
    pickle.dump(m, dump_file, protocol=2)
with open("save.p", "rb") as load_file:
    m = pickle.load(load_file)

plt.matshow(m, cmap=plt.cm.Blues)

# NOTE(review): newer scikit-learn releases may no longer accept `n_jobs`
# for SpectralBiclustering -- confirm against the installed version.
model = SpectralBiclustering(method='bistochastic', n_jobs=-1)
model.fit(m)

# Reorder rows, then columns, by their assigned cluster label.
fit_data = m[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")
示例#43
0
from sklearn.manifold import TSNE
# C1 = sklearn.cluster.AgglomerativeClustering(n_clusters=5, affinity='precomputed')
# 
# R1 = C1.fit_predict(Gram)
# 
# Build D[i, j] = Gram[i, i] + Gram[j, j] - 2 * Gram[i, j]
# (squared Euclidean distances if Gram holds inner products -- presumably
# the intent; verify against how Gram is built).
n = len(Gram)
Di = np.diag(Gram).reshape(n, 1)
# Repeat the diagonal column across n columns via broadcasting.
M = Di * np.ones((1, n))

D = M + M.T - 2 * Gram

# Five clustering estimators to compare, each asking for 5 clusters where
# the estimator takes a cluster count.
C1 = KMeans(n_clusters=5)
C2 = AffinityPropagation(affinity='precomputed')
C3 = AgglomerativeClustering(
    n_clusters=5,
    affinity='precomputed',
    linkage='average',
)
C4 = SpectralClustering(n_clusters=5, affinity='precomputed')
C5 = SpectralBiclustering(n_clusters=(5, 5))

# Fit in a fixed order. C4 is fed the shifted Gram matrix rather than D.
R1 = C1.fit_predict(D)
R2 = C2.fit_predict(D)
R3 = C3.fit_predict(D)
R4 = C4.fit_predict(Gram + 11)
# NOTE(review): unlike R1..R4 this stores the return value of fit(), not of
# fit_predict() -- confirm that is what later code expects.
R5 = C5.fit(D)

print(R4)

# 2-D t-SNE embedding computed from the precomputed matrix D.
modèle = TSNE(n_components=2, metric='precomputed')
Trans = modèle.fit_transform(D)

G_ACP = ACP(Gram, precomputed=True)

trace_ACP(G_ACP, [10] * 5)
示例#44
0
        mplpyplot.show()
# nodebox section end

# Generate a 300x300 checkerboard with 4 row clusters and 3 column clusters.
n_clusters = (4, 3)
data, rows, columns = make_checkerboard(
    shape=(300, 300), n_clusters=n_clusters, noise=10,
    shuffle=False, random_state=0)

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")

# Shuffle rows and columns with public NumPy calls instead of the private
# helper sg._shuffle. Seeding RandomState with 0 and drawing the row
# permutation before the column permutation keeps the same shuffle.
rng = np.random.RandomState(0)
row_idx = rng.permutation(data.shape[0])
col_idx = rng.permutation(data.shape[1])
data = data[row_idx][:, col_idx]
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Shuffled dataset")

model = SpectralBiclustering(n_clusters=n_clusters, method='log',
                             random_state=0)
model.fit(data)
# Compare the found biclusters with the ground truth, permuted the same way
# as the data.
score = consensus_score(model.biclusters_,
                        (rows[:, row_idx], columns[:, col_idx]))

print("consensus score: {:.1f}".format(score))

# Reorder rows, then columns, by their assigned cluster label.
fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")

# Outer product of the sorted (1-based) row and column labels.
plt.matshow(np.outer(np.sort(model.row_labels_) + 1,
                     np.sort(model.column_labels_) + 1),
            cmap=plt.cm.Blues)