Example #1
def test_fit_best_piecewise():
    model = SpectralBiclustering(random_state=0)
    vectors = np.array([[0, 0, 0, 1, 1, 1],
                        [2, 2, 2, 3, 3, 3],
                        [0, 1, 2, 3, 4, 5]])
    best = model._fit_best_piecewise(vectors, n_best=2, n_clusters=2)
    assert_array_equal(best, vectors[:2])
Example #2
def test_project_and_cluster():
    model = SpectralBiclustering(random_state=0)
    data = np.array([[1, 1, 1], [1, 1, 1], [3, 6, 3], [3, 6, 3]])
    vectors = np.array([[1, 0], [0, 1], [0, 0]])
    for mat in (data, csr_matrix(data)):
        labels = model._project_and_cluster(mat, vectors, n_clusters=2)
        assert_almost_equal(v_measure_score(labels, [0, 0, 1, 1]), 1.0)
Example #3
def biclustering(dist, genes_1, genes_2, x_label, y_label, out_file, experiment, id_convertor, n_clusters=3, percent_visualize=0.1):
    model = SpectralBiclustering(n_clusters=n_clusters, n_components=12, n_best=6,
                                 init='random', random_state=1)

    m, n = dist.shape
    assert m == len(genes_1) and n == len(genes_2)
    model.fit(dist)
    rows = [(idx, clust_id) for idx, clust_id in enumerate(model.row_labels_)]
    selected_rows = random.sample(rows, k=int(percent_visualize * len(rows)))  # sample without replacement
    selected_rows_name = [genes_1[idx] for idx, _ in selected_rows]
    selected_rows_clust_ids = [clust_id for _, clust_id in selected_rows]
    selected_rows_indices = [idx for idx, _ in selected_rows]
    # Select columns
    cols = [(idx, clust_id) for idx, clust_id in enumerate(model.column_labels_)]
    selected_cols = random.sample(cols, k=int(percent_visualize * len(cols)))  # sample without replacement
    selected_cols_names = [genes_2[idx] for idx, _ in selected_cols]
    selected_cols_clust_ids = [clust_id for _, clust_id in selected_cols]
    selected_cols_indices = [idx for idx, _ in selected_cols]
    # Selected dist
    selected_dist = dist[selected_rows_indices][:, selected_cols_indices]
    # Sort rows
    sorted_rows_indices = np.argsort(selected_rows_clust_ids)
    selected_dist = selected_dist[sorted_rows_indices, :]
    selected_row_names = [selected_rows_name[i] for i in sorted_rows_indices]
    # sort columns
    sorted_cols_indices = np.argsort(selected_cols_clust_ids)
    selected_dist = selected_dist[:, sorted_cols_indices]
    selected_cols_names = [selected_cols_names[i] for i in sorted_cols_indices]

    result = pd.DataFrame(selected_dist, columns=selected_cols_names, index=selected_row_names)

    ax = sns.heatmap(result, cmap="Greens_r", square=True)
    plt.title("Biclustering Results")
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(left=False, bottom=False)
    ax.set_ylabel('{} genes'.format(x_label))
    ax.set_xlabel('{} genes'.format(y_label))
    figure = ax.get_figure()
    figure.savefig(out_file)
    plt.close()

    for bic in range(n_clusters*n_clusters):
        #print(bic)
        r = list(model.rows_[bic])
        rows = [i for (i, b) in zip(genes_1, r) if b]

        c = list(model.columns_[bic])
        columns = [i for (i, b) in zip(genes_2, c) if b]

        rows = id_convertor.ints2ids([int(k) for k in rows])
        columns = id_convertor.ints2ids([int(k) for k in columns])

        cluster_path = os.path.join(experiment, f'{bic}_{x_label}_{y_label}_biclustering.csv')
        with open(cluster_path, 'w') as fout:
            fout.write(','.join(rows))
            fout.write("\n")
            fout.write(','.join(columns))
Example #4
    def get_bicluster(self, data):
        # Biclustering
        model = SpectralBiclustering(n_clusters=data.shape[1], random_state=0)
        # quick sanity check: inspect row/column sums before clustering
        print(data.sum(axis=0))
        print(data.sum(axis=1))
        model.fit(data.fillna(0))
        fit_data = data.iloc[np.argsort(model.row_labels_)]
        fit_data = fit_data.iloc[:, np.argsort(model.column_labels_)]

        return fit_data
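A hedged usage sketch for the method above, with the class wrapper stripped away; the checkerboard data and its size are invented, and note that the hard-coded n_clusters=data.shape[1] only makes sense for matrices with few columns:

import numpy as np
import pandas as pd
from sklearn.cluster import SpectralBiclustering
from sklearn.datasets import make_checkerboard

def get_bicluster(data):
    # same rearranging logic as the method above, minus the debug prints
    model = SpectralBiclustering(n_clusters=data.shape[1], random_state=0)
    model.fit(data.fillna(0))
    fit_data = data.iloc[np.argsort(model.row_labels_)]
    return fit_data.iloc[:, np.argsort(model.column_labels_)]

raw, _, _ = make_checkerboard(shape=(30, 9), n_clusters=3, noise=1,
                              random_state=0)
print(get_bicluster(pd.DataFrame(raw)))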
Example #5
def cluster_validation_dav(df):
    """
    Compute Davies-Bouldin scores for several clustering algorithms
    over a range of cluster counts (3 through 10).
    Argument: a standardized dataframe
    Return: a dataframe with one column of scores per algorithm
    """

    dbs = []
    # calculate scores for the three algorithms
    for i in range(3, 11):
        c1 = KMeans(n_clusters=i, random_state=1, algorithm='auto').fit(df)  # 'auto' was removed in scikit-learn 1.3
        label1 = c1.labels_
        c2 = Birch(n_clusters=i).fit(df)
        label2 = c2.labels_
        c3 = SpectralBiclustering(n_clusters=i, random_state=1).fit(df)
        label3 = c3.row_labels_

        dbs.append([
            davies_bouldin_score(df, label1),
            davies_bouldin_score(df, label2),
            davies_bouldin_score(df, label3)
        ])
    # rename the score columns
    result = pd.DataFrame(dbs).rename(columns={
        0: 'K-Means',
        1: 'Birch',
        2: 'Spectral Biclustering'
    })
    return result
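A hedged usage sketch for cluster_validation_dav; the blob data and its dimensions are invented, and the imports are the ones the snippet above implicitly assumes (note that KMeans(algorithm='auto') requires scikit-learn < 1.3):

import pandas as pd
from sklearn.cluster import Birch, KMeans, SpectralBiclustering
from sklearn.datasets import make_blobs
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import StandardScaler

X, _ = make_blobs(n_samples=200, n_features=12, centers=4, random_state=1)
df = pd.DataFrame(StandardScaler().fit_transform(X))

scores = cluster_validation_dav(df)
scores.index = range(3, 11)  # map rows back to the cluster counts tried
print(scores.idxmin())       # lowest (best) Davies-Bouldin score per algorithm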
Example #6
def SB(data_x, k):
    # SpectralBiclustering: fit and take the row cluster labels
    data_class = SpectralBiclustering(n_clusters=k).fit(data_x).row_labels_

    # translate labels to one-hot encoding
    data_class_np = np.zeros(shape=(len(data_class), k))
    for i in range(len(data_class)):
        data_class_np[i, data_class[i]] = 1
    return data_class_np, data_class
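A quick, hedged check of SB on synthetic checkerboard data; the shapes and assertions are illustrative, and the imports are the ones the snippet assumes:

import numpy as np
from sklearn.cluster import SpectralBiclustering
from sklearn.datasets import make_checkerboard

data, _, _ = make_checkerboard(shape=(30, 30), n_clusters=3, noise=1,
                               random_state=0)
one_hot, labels = SB(data, 3)
assert one_hot.shape == (30, 3)
assert (one_hot.argmax(axis=1) == labels).all()  # exactly one 1 per row, at the label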
Example #7
def spectral_biclust(E, ngenes=3, nconditions=1, spectral_method="bistochastic", n=6, n_best_ratio=0.5, **kwargs):
    """
    Note:
    - method was moved from sklearn.cluster.bicluster.SpectralBiclustering

    """
    n_best = max(int(n * n_best_ratio), 1)

    spectral = SpectralBiclustering(n_clusters=(nconditions, ngenes), method=spectral_method,
                                    n_components=n, n_best=n_best)

    spectral.fit(standardize(E))

    bics = []
    for columns, rows in zip(spectral.columns_, spectral.rows_):
        genes = E.columns[columns]
        conditions = E.index[rows]

        bics.append(Bicluster(genes, conditions))

    return bics
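spectral_biclust depends on a standardize helper and a Bicluster container that the excerpt doesn't show. A plausible minimal reading so the snippet can run; both definitions are assumptions, not the original code (min-max scaling is chosen here so the default bistochastic normalization sees a non-negative matrix):

def standardize(E):
    # assumed implementation: rescale the whole matrix to [0, 1]
    return (E - E.min().min()) / (E.max().max() - E.min().min())

class Bicluster:
    # assumed implementation: bare container for one bicluster
    def __init__(self, genes, conditions):
        self.genes = list(genes)
        self.conditions = list(conditions)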
    
Example #8
def test_perfect_checkerboard():
    # XXX Previously failed on build bot (not reproducible)
    model = SpectralBiclustering(3, svd_method="arpack", random_state=0)

    S, rows, cols = make_checkerboard((30, 30), 3, noise=0, random_state=0)
    model.fit(S)
    assert consensus_score(model.biclusters_, (rows, cols)) == 1

    S, rows, cols = make_checkerboard((40, 30), 3, noise=0, random_state=0)
    model.fit(S)
    assert consensus_score(model.biclusters_, (rows, cols)) == 1

    S, rows, cols = make_checkerboard((30, 40), 3, noise=0, random_state=0)
    model.fit(S)
    assert consensus_score(model.biclusters_, (rows, cols)) == 1
Example #9
def test_spectral_biclustering():
    # Test Kluger methods on a checkerboard dataset.
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5,
                                      random_state=0)

    non_default_params = {'method': ['scale', 'log'],
                          'svd_method': ['arpack'],
                          'n_svd_vecs': [20],
                          'mini_batch': [True]}

    for mat in (S, csr_matrix(S)):
        for param_name, param_values in non_default_params.items():
            for param_value in param_values:

                model = SpectralBiclustering(
                    n_clusters=3,
                    n_init=3,
                    init='k-means++',
                    random_state=0,
                )
                model.set_params(**{param_name: param_value})

                if issparse(mat) and model.get_params().get('method') == 'log':
                    # cannot take log of sparse matrix
                    with pytest.raises(ValueError):
                        model.fit(mat)
                    continue
                else:
                    model.fit(mat)

                assert model.rows_.shape == (9, 30)
                assert model.columns_.shape == (9, 30)
                assert_array_equal(model.rows_.sum(axis=0),
                                   np.repeat(3, 30))
                assert_array_equal(model.columns_.sum(axis=0),
                                   np.repeat(3, 30))
                assert consensus_score(model.biclusters_,
                                       (rows, cols)) == 1

                _test_shape_indices(model)
Example #10
def test_perfect_checkerboard():
    # XXX test always skipped
    raise SkipTest("This test is failing on the buildbot, but cannot"
                   " reproduce. Temporarily disabling it until it can be"
                   " reproduced and  fixed.")
    model = SpectralBiclustering(3, svd_method="arpack", random_state=0)

    S, rows, cols = make_checkerboard((30, 30), 3, noise=0, random_state=0)
    model.fit(S)
    assert consensus_score(model.biclusters_, (rows, cols)) == 1

    S, rows, cols = make_checkerboard((40, 30), 3, noise=0, random_state=0)
    model.fit(S)
    assert consensus_score(model.biclusters_, (rows, cols)) == 1

    S, rows, cols = make_checkerboard((30, 40), 3, noise=0, random_state=0)
    model.fit(S)
    assert consensus_score(model.biclusters_, (rows, cols)) == 1
Example #11
def test_errors(args):
    data = np.arange(25).reshape((5, 5))

    model = SpectralBiclustering(**args)
    with pytest.raises(ValueError):
        model.fit(data)
Example #12
@pytest.mark.parametrize(
    "args",
    [
        # earlier parameter sets were truncated in this listing
        {
            "n_components": 3,
            "n_best": 4
        },
    ],
)
def test_errors(args):
    data = np.arange(25).reshape((5, 5))

    model = SpectralBiclustering(**args)
    with pytest.raises(ValueError):
        model.fit(data)


def test_wrong_shape():
    model = SpectralBiclustering()
    data = np.arange(27).reshape((3, 3, 3))
    with pytest.raises(ValueError):
        model.fit(data)


@pytest.mark.parametrize("est",
                         (SpectralBiclustering(), SpectralCoclustering()))
def test_n_features_in_(est):

    X, _, _ = make_biclusters((3, 3), 3, random_state=0)

    assert not hasattr(est, "n_features_in_")
    est.fit(X)
    assert est.n_features_in_ == 3
Example #13
    def __init__(self, matrix, cluster_args, vectorizer, df, handles):
        self.model = SpectralBiclustering(**cluster_args)
        self.matrix = matrix
        self.vectorizer = vectorizer
        self.df = df
        self.handles = handles
Example #14
start, end = 0, length_of_intervals
interval = 0

while end <= len(band[0]):
    print(interval, end=' ')
    for i in channels:
        for j in range(63):
            if i <= j:
                pearson_data[interval].append(
                    pearsonr(band[i][start:end], band[j][start:end])[0])
    start = end
    end += length_of_intervals
    interval += 1

p = np.array(pearson_data)
spectral_model = SpectralBiclustering()
spectral_model.fit(p)

fit_data = p[np.argsort(spectral_model.row_labels_)]
fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]
plt.matshow(p, cmap=plt.cm.Blues)
plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.matshow(np.outer(
    np.sort(spectral_model.row_labels_) + 1,
    np.sort(spectral_model.column_labels_) + 1),
            cmap=plt.cm.Blues)

with open('media/pearson_30sec_bandpassMedian_clipped_2016.json', 'w+') as f:
    pearson_data_r = np.array(pearson_data)
    p = [[float(column) for column in row] for row in pearson_data_r]
    f.write(simplejson.dumps({'name': 's5d2nap', 'data': p}))
Example #15
# opening lines missing from the listing; the shape below is an assumed value
data, rows, columns = make_checkerboard(shape=(300, 300),
                                        n_clusters=5,
                                        noise=5,
                                        shuffle=False,
                                        random_state=0)

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original datasets")

# shuffle clusters
rng = np.random.RandomState(0)
row_idx = rng.permutation(data.shape[0])
col_idx = rng.permutation(data.shape[1])
data = data[row_idx][:, col_idx]

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Shuffle datasets")

model = SpectralBiclustering(n_clusters=5, random_state=0)
model.fit(data)
score = consensus_score(model.biclusters_,
                        (rows[:, row_idx], columns[:, col_idx]))
print("consensus score :{: .3f}".format(score))

fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclustering")

plt.show()
Example #16
def test_wrong_shape():
    model = SpectralBiclustering()
    data = np.arange(27).reshape((3, 3, 3))
    with pytest.raises(ValueError):
        model.fit(data)
Example #17
def test_spectralbiclustering_parameter_validation(params, type_err, err_msg):
    """Check parameters validation in `SpectralBiClustering`"""
    data = np.arange(25).reshape((5, 5))
    model = SpectralBiclustering(**params)
    with pytest.raises(type_err, match=err_msg):
        model.fit(data)
Example #18
class SpectBiclustUser:
    """Hold data, model, and visualization methods for SpectralBiclustering on a Twitter user-term matrix.
    
    parameters:
    matrix: user-term matrix
    cluster_args: optional arguments for SpectralBiclustering model, as dict
    vectorizer: sklearn.feature_extraction.text vectorizer
    df: DataFrame w/ 'text', 'handle', and 'description' fields
    handles: list/array of document identifiers
    """
    def __init__(self, matrix, cluster_args, vectorizer, df, handles):
        self.model = SpectralBiclustering(**cluster_args)
        self.matrix = matrix
        self.vectorizer = vectorizer
        self.df = df
        self.handles = handles

    def fit(self):
        self.model.fit(self.matrix)

    # works for bi- or co-clustering!
    # but hardly shows up for sparse data
    def draw_matrix(self, filename, lognorm=False):
        """Draw heatmap over data matrix sorted by cluster.

        params:
        lognorm: log & center data first as in SpectralBiclustering(method='log')"""
        if lognorm:
            data = lognormalize(self.matrix)
        else:
            data = self.matrix
        # sort
        data = data[np.argsort(self.model.row_labels_)]
        data = data[:, np.argsort(self.model.column_labels_)]
        try:
            plt.matshow(data, cmap=plt.cm.Blues)
        except ValueError:
            plt.matshow(data.todense(), cmap=plt.cm.Blues)
        plt.savefig(filename, dpi=600)

    # best to call w/ axis from plt.subplots(tight_layout=True)
    def draw_image_matrix(self, filename, lognorm=False, percentile=False):
        """Draw heatmap over a reduced matrix where rows are row clusters and columns column clusters. Cells are shaded based on average values within that cluster x cluster block.

        params:
        lognorm: log & center data first as in SpectralBiclustering(method='log')
        percentile: instead of average, color based on some percentile of block values
        """
        image, counts = self.get_image_matrix(lognorm, percentile)
        #image = image.transpose()
        _, ax = plt.subplots(tight_layout=True)
        ax.matshow(image, cmap=plt.cm.Blues)
        # set tick labels
        yticks = np.array(range(image.shape[0]))
        ylabels = list(
            map(lambda x: "\n".join(x),
                self.get_handles_by_cluster(3, descriptions=True)))
        try:
            # artist-style
            ax.set_yticks(yticks)
            ax.set_yticklabels(ylabels, size=3)
        except AttributeError:
            # scripting-style
            ax.yticks(yticks, labels=[])
        xticks = np.array(range(image.shape[1]))
        if self.vectorizer is not None:
            xlabels = list(
                map(lambda x: "\n".join(x), self.get_terms_by_cluster(3)))
        else:
            xlabels = None
        try:
            # scripting-style
            ax.xticks(ticks=xticks, labels=xlabels, size=4)
        except AttributeError:
            # artist-style
            ax.set_xticks(xticks)
            ax.set_xticklabels(xlabels, rotation=90, size=4)
        # annotate w/ counts
        for i in range(counts.shape[0]):
            for j in range(counts.shape[1]):
                # don't transpose 'counts' b/c plt.matshow orients axes funny
                ax.annotate(counts[i, j], (j, i), size=3, ha='center')
        plt.savefig(filename, dpi=500)

    def get_image_matrix(self, lognorm=False, percentile=False):
        if lognorm:
            data = lognormalize(self.matrix)
        else:
            data = self.matrix
        clusters = [
            pd.unique(self.model.row_labels_),
            pd.unique(self.model.column_labels_)
        ]
        dim = list(map(lambda x: len(x), clusters))
        image = np.zeros(shape=dim)
        counts = np.full(shape=dim, fill_value='', dtype=object)
        for i in clusters[0]:
            for j in clusters[1]:
                submat = self.get_bicluster_submatrix(i, j)
                if percentile is False:
                    image[i, j] = np.mean(submat)
                else:
                    image[i, j] = np.percentile(submat, percentile)
                counts[i, j] = f"{submat.shape[0]}x{submat.shape[1]}"
        return image, counts

    def get_bicluster_submatrix(self, i, j):
        rows = np.where(np.equal(self.model.row_labels_, i))[0]
        columns = np.where(np.equal(self.model.column_labels_, j))[0]
        return self.matrix[rows][:, columns]

    def print_by_cluster(self, n_terms, words=True):
        if words:
            top_terms = self.get_terms_by_cluster(n_terms)
        else:
            top_terms = self.get_handles_by_cluster(n_terms)
        for i in range(len(top_terms)):
            print(i)
            # loop thru handles
            for h in top_terms[i]:
                # print in full
                desc = self.get_handle_description(h, 1000)
                desc = "; ".join(desc.split("\n"))
                print(f"   {h} -- {desc}")

    def get_terms_by_cluster(self, n_terms=5):
        """Get list of top terms for each term/column cluster"""
        # get cluster indices
        col_clusters = pd.unique(self.model.column_labels_)
        col_clusters = np.sort(col_clusters)
        # get term frequencies
        freq = np.sum(self.matrix, axis=0)
        # grr matrices
        if len(freq.shape) > 1:
            freq = np.array(freq)
            freq = freq[0]
        # get int->string vocabulary
        words = self.vectorizer.get_feature_names()
        top_terms = []
        for c in col_clusters:
            # get term indices
            term_inds = np.where(np.equal(self.model.column_labels_, c))[0]
            # get frequencies
            term_freqs = freq[term_inds]
            # get top frequencies
            top_inds = term_inds[np.argsort(term_freqs)[-1 * n_terms:]]
            top_cluster_terms = [words[i] for i in top_inds]
            top_cluster_terms.reverse()
            top_terms.append(top_cluster_terms)
        return top_terms

    def get_handles_by_cluster(self, n, descriptions=False):
        """Get list of top terms for each user/row cluster. 'descriptions=True' prints descriptions in lieu of handles."""
        # get cluster indices
        row_clusters = pd.unique(self.model.row_labels_)
        row_clusters = np.sort(row_clusters)
        # get usage frequencies
        freq = np.sum(self.matrix, axis=1)
        # grr matrices
        if len(freq.shape) > 1:
            freq = np.array(freq)
            freq = freq[:, 0]
        # get int->string vocabulary (here it's rec_handles)
        top_handles = []
        for c in row_clusters:
            # get term indices
            handle_inds = np.where(np.equal(self.model.row_labels_, c))[0]
            # get frequencies
            handle_freqs = freq[handle_inds]
            # get top frequencies
            top_inds = handle_inds[np.argsort(handle_freqs)[-1 * n:]]
            if descriptions:
                top_cluster_handles = [
                    self.get_handle_description(self.handles[i])
                    for i in top_inds
                ]
            else:
                top_cluster_handles = [self.handles[i] for i in top_inds]
            top_cluster_handles.reverse()
            top_handles.append(top_cluster_handles)
        return top_handles

    def get_handle_description(self, handle, nchar=24):
        description = self.df.query('handle == @handle').iloc[0, :].description
        if pd.isna(description):
            description = str(description)
        else:
            description = description[:nchar]
        return description

    # 'bags' should be list of lemmatized tweets, not aggregated by handle
    def print_tweets_by_block(self, row, col, n, word_bags):
        """Print a sample of tweets corresponding to an intersection of row and column clusters -- i.e. a block in the image matrix.

        params:

        row, col: cluster indices
        n: # tweets to print
        word_bags: list of lemmatized tweets (not aggregated by handle)
        """
        if word_bags is None:
            # TODO could get lemmas from dataframe...
            pass
        # get row/col indices from clusterer
        rows = np.where(np.equal(self.model.row_labels_, row))[0]
        cols = np.where(np.equal(self.model.column_labels_, col))[0]
        # resolve row indices to users via self.handles
        users = [self.handles[i] for i in rows]
        # get df row indices from users
        df_user_inds = np.where(np.isin(self.df.handle.values, users))[0]
        # resolve col indices to lemmas from vectorizer
        vocabulary = self.vectorizer.get_feature_names()
        lemmas = np.array([vocabulary[i] for i in cols], dtype=object)
        # sanity check
        print(f"example terms: {list(lemmas[:10])}\n")
        # get df row indices by finding lemmas in list of word bags
        df_inds = []
        for i in df_user_inds:
            b = np.array(word_bags[i], dtype=object)
            if len(np.intersect1d(b, lemmas)) > 0:
                df_inds.append(i)
        # print sample of 'n' tweets
        df_inds = np.random.choice(df_inds,
                                   size=min(n, len(df_inds)),
                                   replace=False)
        for i in df_inds:
            handle = self.df.iloc[i, :].handle
            descr = self.df.iloc[i, :].description
            print(f"@{handle} - {descr}")
            print("   " + self.df.iloc[i, :].text + "\n")
Example #19
# Calculate Bi-clusters via spectral clustering
# NOTE THAT YOUR DATA NEEDS TO BE NORMALIZED
from sklearn.cluster import SpectralCoclustering, SpectralBiclustering

nClusters = 3

#clus = SpectralCoclustering(n_clusters=nClusters)
clus = SpectralBiclustering(n_clusters=nClusters)
clus.fit(data)
clabel = clus.column_labels_.astype(float)
rlabel = clus.row_labels_.astype(float)
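One hedged reading of the normalization note above (not necessarily the original preprocessing): rescale features to a common non-negative range before fitting, since the default bistochastic normalization effectively assumes non-negative data:

import numpy as np
from sklearn.cluster import SpectralBiclustering
from sklearn.preprocessing import MinMaxScaler

rng = np.random.RandomState(0)
scales = np.array([1.0] * 6 + [1000.0] * 6)  # features on wildly different scales
raw = rng.rand(40, 12) * scales
data = MinMaxScaler().fit_transform(raw)     # rescale each column to [0, 1]

clus = SpectralBiclustering(n_clusters=3, random_state=0)
clus.fit(data)
print(clus.row_labels_[:10])
print(clus.column_labels_)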
Example #20
def spectral_biclustering(tfidf_matrix, n_clusters=100):
    return SpectralBiclustering(n_clusters=n_clusters).fit(tfidf_matrix)
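A hedged usage sketch; the toy corpus is invented, and n_clusters is overridden because the default of 100 needs a far larger matrix (the matrix must also have more than SpectralBiclustering's default n_components=6 rows and columns):

from sklearn.cluster import SpectralBiclustering
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["spam spam ham", "ham eggs spam", "spam eggs toast", "toast ham eggs",
        "graph node edge", "edge graph path", "node path graph", "path edge node"]
tfidf_matrix = TfidfVectorizer().fit_transform(docs)  # 8 docs x 8 terms

model = spectral_biclustering(tfidf_matrix, n_clusters=2)
print(model.row_labels_)     # document clusters
print(model.column_labels_)  # term clusters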
Example #21
n_clusters = (4, 3)  # assumed value; the snippet's opening lines are missing from the listing
data, rows, columns = make_checkerboard(
    shape=(300, 300), n_clusters=n_clusters, noise=10,
    shuffle=False, random_state=0)

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")

# shuffle clusters
rng = np.random.RandomState(0)
row_idx = rng.permutation(data.shape[0])
col_idx = rng.permutation(data.shape[1])
data = data[row_idx][:, col_idx]

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Shuffled dataset")

model = SpectralBiclustering(n_clusters=n_clusters, method='log',
                             random_state=0)
model.fit(data)
score = consensus_score(model.biclusters_,
                        (rows[:, row_idx], columns[:, col_idx]))

print("consensus score: {:.1f}".format(score))

fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")

plt.matshow(np.outer(np.sort(model.row_labels_) + 1,
                     np.sort(model.column_labels_) + 1),
            cmap=plt.cm.Blues)
Example #22
    def spectral_biclustering(self):
        self.model = SpectralBiclustering(n_clusters=self.n_clusters)