Пример #1
0
def _sample_by_cluster(labels, names, fraction):
    """Randomly sample entries and return them ordered by cluster label.

    Sampling uses ``random.choices`` and is therefore WITH replacement, so the
    same entry may appear more than once in the sample.

    Args:
        labels: per-entry cluster labels (e.g. ``model.row_labels_``).
        names: per-entry names, aligned with ``labels``.
        fraction: fraction of the entries to draw.

    Returns:
        ``(indices, sampled_names)`` -- positional indices into ``labels`` and
        the matching names, both ordered by ascending cluster label.
    """
    population = list(enumerate(labels))
    sample = random.choices(population, k=int(fraction * len(population)))
    order = np.argsort([clust_id for _, clust_id in sample])
    indices = [sample[i][0] for i in order]
    return indices, [names[i] for i in indices]


def biclustering(dist, genes_1, genes_2, x_label, y_label, out_file, experiment, id_convertor, n_clusters=3, precent_visualize=0.1):
    """Bicluster a matrix, plot a sampled heatmap and dump every bicluster to CSV.

    Args:
        dist: 2-D array of shape ``(len(genes_1), len(genes_2))``.
        genes_1: identifiers of the rows of ``dist``.
        genes_2: identifiers of the columns of ``dist``.
        x_label: name of the ``genes_1`` group; labels the heatmap's y axis
            (rows) and is used in the output file names.
        y_label: name of the ``genes_2`` group; labels the heatmap's x axis
            (columns) and is used in the output file names.
        out_file: path the heatmap figure is saved to.
        experiment: directory receiving one ``*_biclustering.csv`` per bicluster.
        id_convertor: object whose ``ints2ids`` maps a list of ints to a list
            of strings (presumably gene ids -- confirm against its definition).
        n_clusters: forwarded to ``SpectralBiclustering``.
        precent_visualize: fraction of rows/columns sampled for the heatmap.

    Raises:
        AssertionError: if ``dist`` does not match the lengths of the gene lists.
    """
    model = SpectralBiclustering(n_clusters=n_clusters, n_components=12, n_best=6,
                                 init='random', random_state=1)

    m, n = dist.shape
    assert m == len(genes_1) and n == len(genes_2)
    model.fit(dist)

    # Rows are sampled before columns so the random stream is consumed in the
    # same order as before.
    row_indices, row_names = _sample_by_cluster(
        model.row_labels_, genes_1, precent_visualize)
    col_indices, col_names = _sample_by_cluster(
        model.column_labels_, genes_2, precent_visualize)

    # Sub-matrix of the sampled rows/columns, already grouped by cluster.
    selected_dist = dist[row_indices][:, col_indices]

    # The index must use the cluster-sorted row names; the previous code used
    # the unsorted sample, so row labels did not match the reordered rows.
    result = pd.DataFrame(selected_dist, columns=col_names, index=row_names)

    ax = sns.heatmap(result, cmap="Greens_r", square=True)
    plt.title("Biclustering Results")
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(left=False, bottom=False)
    ax.set_ylabel('{} genes'.format(x_label))
    ax.set_xlabel('{} genes'.format(y_label))
    figure = ax.get_figure()
    figure.savefig(out_file)
    plt.close()

    # One file per bicluster: first line the row members, second line the
    # column members. Iterating over the fitted masks (instead of assuming
    # n_clusters ** 2) also works when n_clusters is a (rows, cols) tuple.
    for bic, (row_mask, col_mask) in enumerate(zip(model.rows_, model.columns_)):
        rows = [gene for gene, keep in zip(genes_1, row_mask) if keep]
        columns = [gene for gene, keep in zip(genes_2, col_mask) if keep]

        rows = id_convertor.ints2ids([int(k) for k in rows])
        columns = id_convertor.ints2ids([int(k) for k in columns])

        cluster_path = os.path.join(experiment, f'{bic}_{x_label}_{y_label}_biclustering.csv')
        with open(cluster_path, 'w') as fout:
            fout.write(','.join(rows))
            fout.write("\n")
            fout.write(','.join(columns))
Пример #2
0
    def get_bicluster(self, data):
        """Return ``data`` with rows and columns grouped by bicluster label.

        The model is fitted on a copy of ``data`` whose missing values are
        replaced by 0, but the returned frame is the original ``data``
        (missing values intact), only reordered.
        """
        clusterer = SpectralBiclustering(n_clusters=data.shape[1], random_state=0)

        # Diagnostic output: column totals first, then row totals.
        for axis in (0, 1):
            print(data.sum(axis=axis))

        clusterer.fit(data.fillna(0))

        row_order = np.argsort(clusterer.row_labels_)
        col_order = np.argsort(clusterer.column_labels_)
        return data.iloc[row_order].iloc[:, col_order]
Пример #3
0
def spectral_biclust(E, ngenes=3, nconditions=1,  spectral_method="bistochastic", n=6, n_best_ratio=0.5, **kwargs):
    """Run spectral biclustering on ``E`` and wrap each bicluster.

    ``E`` is standardized before fitting. For every fitted bicluster the
    selected column labels of ``E`` (genes) and the selected index labels of
    ``E`` (conditions) are packed into a ``Bicluster``.

    Note:
    - method was moved from sklearn.cluster.bicluster.SpectralBiclustering
    - extra keyword arguments are accepted but not used.
    """
    # Keep at least one singular vector, whatever the ratio rounds down to.
    best_count = max(int(n * n_best_ratio), 1)

    model = SpectralBiclustering(
        n_clusters=(nconditions, ngenes),
        method=spectral_method,
        n_components=n,
        n_best=best_count,
    )
    model.fit(standardize(E))

    return [
        Bicluster(E.columns[col_mask], E.index[row_mask])
        for col_mask, row_mask in zip(model.columns_, model.rows_)
    ]
    
Пример #4
0
def test_spectral_biclustering():
    """Check the Kluger methods on a checkerboard, one non-default parameter at a time."""
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5,
                                      random_state=0)

    non_default_params = {'method': ['scale', 'log'],
                          'svd_method': ['arpack'],
                          'n_svd_vecs': [20],
                          'mini_batch': [True]}
    # Flatten to (name, value) pairs, preserving the dict's iteration order.
    overrides = [(name, value)
                 for name, values in non_default_params.items()
                 for value in values]

    for mat in (S, csr_matrix(S)):
        for name, value in overrides:
            model = SpectralBiclustering(
                n_clusters=3,
                n_init=3,
                init='k-means++',
                random_state=0,
            )
            model.set_params(**{name: value})

            if issparse(mat) and model.get_params().get('method') == 'log':
                # cannot take log of sparse matrix
                with pytest.raises(ValueError):
                    model.fit(mat)
                continue

            model.fit(mat)

            expected_membership = np.repeat(3, 30)
            assert model.rows_.shape == (9, 30)
            assert model.columns_.shape == (9, 30)
            assert_array_equal(model.rows_.sum(axis=0), expected_membership)
            assert_array_equal(model.columns_.sum(axis=0), expected_membership)
            assert consensus_score(model.biclusters_,
                                   (rows, cols)) == 1

            _test_shape_indices(model)
Пример #5
0
def test_perfect_checkerboard():
    """A noise-free checkerboard must be recovered exactly for several shapes."""
    # XXX Previously failed on build bot (not reproducible)
    model = SpectralBiclustering(3, svd_method="arpack", random_state=0)

    # The same estimator is refitted on square, tall and wide inputs.
    for shape in ((30, 30), (40, 30), (30, 40)):
        S, rows, cols = make_checkerboard(shape, 3, noise=0, random_state=0)
        model.fit(S)
        assert consensus_score(model.biclusters_, (rows, cols)) == 1
Пример #6
0
def test_perfect_checkerboard():
    """Disabled test: always raises ``SkipTest`` before doing any work."""
    # XXX test always skipped
    raise SkipTest("This test is failing on the buildbot, but cannot"
                   " reproduce. Temporarily disabling it until it can be"
                   " reproduced and  fixed.")

    # Everything below is unreachable until the skip above is removed.
    model = SpectralBiclustering(3, svd_method="arpack", random_state=0)
    for shape in ((30, 30), (40, 30), (30, 40)):
        S, rows, cols = make_checkerboard(shape, 3, noise=0, random_state=0)
        model.fit(S)
        assert consensus_score(model.biclusters_, (rows, cols)) == 1
Пример #7
0
# Show the dataset as it is before shuffling.
# `data`, `rows`, `columns` and `n_clusters` are defined before this point.
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")

# shuffle clusters
# A fixed seed keeps the row/column permutations reproducible.
rng = np.random.RandomState(0)
row_idx = rng.permutation(data.shape[0])
col_idx = rng.permutation(data.shape[1])
data = data[row_idx][:, col_idx]

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Shuffled dataset")

model = SpectralBiclustering(n_clusters=n_clusters, method='log',
                             random_state=0)
model.fit(data)
# The reference masks are permuted the same way as the data so that they line
# up with the shuffled matrix.
score = consensus_score(model.biclusters_,
                        (rows[:, row_idx], columns[:, col_idx]))

print("consensus score: {:.1f}".format(score))

# Reorder rows, then columns, so members of the same cluster are adjacent.
fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")

# Outer product of the sorted (1-based) labels gives every row-cluster /
# column-cluster block its own constant value.
plt.matshow(np.outer(np.sort(model.row_labels_) + 1,
                     np.sort(model.column_labels_) + 1),
            cmap=plt.cm.Blues)
plt.title("Checkerboard structure of rearranged data")
# Show the dataset as it is before shuffling.
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original datasets")

# Shuffle clusters; a fixed seed keeps the permutations reproducible.
rng = np.random.RandomState(0)
row_idx = rng.permutation(data.shape[0])
col_idx = rng.permutation(data.shape[1])
data = data[row_idx][:, col_idx]

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Shuffle datasets")

models = SpectralBiclustering(n_clusters=n_clusters,
                              method='log',
                              random_state=0)
models.fit(data)
# NOTE(review): `colums` looks like a typo for `columns`, but its definition
# is outside this snippet -- confirm the name used where the data is created.
score = consensus_score(models.biclusters_,
                        (rows[:, row_idx], colums[:, col_idx]))

print("consensus score: {: .1f}".format(score))

# Reorder rows, then columns, so members of the same cluster are adjacent.
fit_data = data[np.argsort(models.row_labels_)]
fit_data = fit_data[:, np.argsort(models.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")

# Use the estimator fitted above (`models`); the previous code read the labels
# from `model`, which this snippet never fits.
plt.matshow(np.outer(
    np.sort(models.row_labels_) + 1,
    np.sort(models.column_labels_) + 1),
            cmap=plt.cm.Blues)
Пример #9
0
def test_wrong_shape():
    """Fitting a 3-D array must raise ``ValueError``."""
    cube = np.arange(27).reshape((3, 3, 3))
    estimator = SpectralBiclustering()
    with pytest.raises(ValueError):
        estimator.fit(cube)
Пример #10
0
def test_errors(args):
    """Fitting with the constructor arguments in ``args`` must raise ``ValueError``."""
    estimator = SpectralBiclustering(**args)
    square = np.arange(25).reshape((5, 5))

    with pytest.raises(ValueError):
        estimator.fit(square)
Пример #11
0
# Calculate Bi-clusters via spectral clustering
# NOTE THAT YOUR DATA NEEDS TO BE NORMALIZED
from sklearn.cluster import SpectralCoclustering, SpectralBiclustering

# Number of clusters passed to the estimator below.
nClusters = 3

# Alternative estimator, kept for reference:
#clus = SpectralCoclustering(n_clusters=nClusters)
clus = SpectralBiclustering(n_clusters=nClusters)
# `data` is defined before this point.
clus.fit(data)
# Per-column and per-row cluster labels, converted to float.
clabel = clus.column_labels_.astype(float)
rlabel = clus.row_labels_.astype(float)
Пример #12
0
def test_spectralbiclustering_parameter_validation(params, type_err, err_msg):
    """Invalid ``params`` must make ``fit`` raise ``type_err`` matching ``err_msg``."""
    estimator = SpectralBiclustering(**params)
    square = np.arange(25).reshape((5, 5))
    with pytest.raises(type_err, match=err_msg):
        estimator.fit(square)
Пример #13
0
class SpectBiclustUser():
    """Hold data, model, and visualization methods for SpectralBiclustering on a twitter user-term matrix.

    parameters:
    matrix: user-term matrix (rows are users, columns are terms)
    cluster_args: optional arguments for SpectralBiclustering model, as dict (None means no extra arguments)
    vectorizer: sklearn.feature_extraction.text vectorizer
    df: DataFrame w/ 'text', 'handle', and 'description' fields
    handles: list/array of document identifiers, aligned with the rows of 'matrix'
    """
    def __init__(self, matrix, cluster_args, vectorizer, df, handles):
        # accept None so "no extra arguments" does not require an empty dict
        self.model = SpectralBiclustering(**(cluster_args or {}))
        self.matrix = matrix
        self.vectorizer = vectorizer
        self.df = df
        self.handles = handles

    def fit(self):
        """Fit the biclustering model on the stored matrix."""
        self.model.fit(self.matrix)

    def _feature_names(self):
        """Return the vectorizer's index->term vocabulary.

        Prefers 'get_feature_names_out' and falls back to the older
        'get_feature_names' for vectorizers that only provide that one."""
        getter = getattr(self.vectorizer, "get_feature_names_out", None)
        if getter is None:
            getter = self.vectorizer.get_feature_names
        return getter()

    @staticmethod
    def _top_indices_by_cluster(labels, freq, n):
        """For each cluster label (ascending), return the indices of its 'n' highest-frequency members, highest first."""
        labels = np.asarray(labels)
        # sums over matrix types come back 2-D; flatten to one value per member
        freq = np.asarray(freq).ravel()
        top = []
        for c in np.sort(pd.unique(labels)):
            members = np.where(labels == c)[0]
            # reversed ascending sort, so n == 0 yields nothing instead of everything
            order = np.argsort(freq[members])[::-1][:n]
            top.append(members[order])
        return top

    # works for bi- or co-clustering!
    # but hardly shows up for sparse data
    def draw_matrix(self, filename, lognorm=False):
        """Draw heatmap over data matrix sorted by cluster and save it to 'filename'.

        params:
        lognorm: log & center data first as in SpectralBiclustering(method='log')"""
        # previously 'data' was left unbound when lognorm was False
        data = lognormalize(self.matrix) if lognorm else self.matrix
        # sort
        data = data[np.argsort(self.model.row_labels_)]
        data = data[:, np.argsort(self.model.column_labels_)]
        try:
            plt.matshow(data, cmap=plt.cm.Blues)
        except ValueError:
            # fall back to a dense copy when the matrix cannot be drawn directly
            plt.matshow(data.todense(), cmap=plt.cm.Blues)
        plt.savefig(filename, dpi=600)

    def draw_image_matrix(self, filename, lognorm=False, percentile=False, n_labels=3):
        """Draw heatmap over a reduced matrix where rows are row clusters and columns column clusters. Cells are shaded based on average values within that cluster x cluster block.

        params:
        lognorm: log & center data first as in SpectralBiclustering(method='log')
        percentile: instead of average, color based on some percentile of block values
        n_labels: number of top descriptions / terms shown per tick label
        """
        image, counts = self.get_image_matrix(lognorm, percentile)
        # 'ax' is always an Axes here, so only the artist-style API applies
        _, ax = plt.subplots(tight_layout=True)
        ax.matshow(image, cmap=plt.cm.Blues)
        # set tick labels
        ylabels = [
            "\n".join(x)
            for x in self.get_handles_by_cluster(n_labels, descriptions=True)
        ]
        ax.set_yticks(np.arange(image.shape[0]))
        ax.set_yticklabels(ylabels, size=3)
        ax.set_xticks(np.arange(image.shape[1]))
        # term labels need a vocabulary; without a vectorizer keep default ticks
        if self.vectorizer is not None:
            xlabels = [
                "\n".join(x) for x in self.get_terms_by_cluster(n_labels)
            ]
            ax.set_xticklabels(xlabels, rotation=90, size=4)
        # annotate w/ counts
        for i in range(counts.shape[0]):
            for j in range(counts.shape[1]):
                # don't transpose 'counts' b/c plt.matshow orients axes funny
                ax.annotate(counts[i, j], (j, i), size=3, ha='center')
        plt.savefig(filename, dpi=500)

    def get_image_matrix(self, lognorm=False, percentile=False):
        """Summarize every row-cluster x column-cluster block.

        params:
        lognorm: log & center data first as in SpectralBiclustering(method='log')
        percentile: False for the block mean, otherwise the percentile to use

        returns (image, counts): block summary values and "ROWSxCOLS" size
        strings, both indexed by the position of the cluster label in
        ascending order.
        """
        if lognorm:
            data = lognormalize(self.matrix)
        else:
            data = self.matrix
        # sorted labels + positional indexing: consistent with the order used by
        # get_handles_by_cluster / get_terms_by_cluster, and safe when a label
        # value is missing
        row_clusters = np.sort(pd.unique(self.model.row_labels_))
        col_clusters = np.sort(pd.unique(self.model.column_labels_))
        dim = (len(row_clusters), len(col_clusters))
        image = np.zeros(shape=dim)
        counts = np.full(shape=dim, fill_value='', dtype=object)
        for ri, i in enumerate(row_clusters):
            for cj, j in enumerate(col_clusters):
                # slice the (possibly log-normalized) data, not the raw matrix
                submat = self.get_bicluster_submatrix(i, j, data)
                if percentile is False:
                    image[ri, cj] = np.mean(submat)
                else:
                    image[ri, cj] = np.percentile(submat, percentile)
                counts[ri, cj] = f"{submat.shape[0]}x{submat.shape[1]}"
        return image, counts

    def get_bicluster_submatrix(self, i, j, data=None):
        """Return the block of rows labelled 'i' and columns labelled 'j'.

        'data' defaults to the stored matrix; pass a transformed matrix of the
        same shape to slice that instead."""
        if data is None:
            data = self.matrix
        rows = np.where(np.equal(self.model.row_labels_, i))[0]
        columns = np.where(np.equal(self.model.column_labels_, j))[0]
        return data[rows][:, columns]

    def print_by_cluster(self, n_terms, words=True):
        """Print the top 'n_terms' terms (words=True) or handles with descriptions (words=False) per cluster."""
        if words:
            top_terms = self.get_terms_by_cluster(n_terms)
        else:
            top_terms = self.get_handles_by_cluster(n_terms)
        for i, members in enumerate(top_terms):
            print(i)
            for h in members:
                if words:
                    # terms have no description to look up
                    print(f"   {h}")
                    continue
                # print in full
                desc = self.get_handle_description(h, 1000)
                desc = "; ".join(desc.split("\n"))
                print(f"   {h} -- {desc}")

    def get_terms_by_cluster(self, n_terms=5):
        """Get list of top terms for each term/column cluster"""
        # term frequencies = column sums
        freq = np.sum(self.matrix, axis=0)
        # get int->string vocabulary
        words = self._feature_names()
        return [
            [words[i] for i in top_inds]
            for top_inds in self._top_indices_by_cluster(
                self.model.column_labels_, freq, n_terms)
        ]

    def get_handles_by_cluster(self, n, descriptions=False):
        """Get list of top handles for each user/row cluster. 'descriptions=True' returns descriptions in lieu of handles."""
        # usage frequencies = row sums
        freq = np.sum(self.matrix, axis=1)
        top_handles = []
        for top_inds in self._top_indices_by_cluster(
                self.model.row_labels_, freq, n):
            # row index -> handle goes through self.handles
            cluster_handles = [self.handles[i] for i in top_inds]
            if descriptions:
                cluster_handles = [
                    self.get_handle_description(h) for h in cluster_handles
                ]
            top_handles.append(cluster_handles)
        return top_handles

    def get_handle_description(self, handle, nchar=24):
        """Return the first 'nchar' characters of the description of 'handle' (str() of the value when it is missing)."""
        description = self.df.query('handle == @handle').iloc[0, :].description
        if pd.isna(description):
            description = str(description)
        else:
            description = description[:nchar]
        return description

    # 'bags' should be list of lemmatized tweets, not aggregated by handle
    def print_tweets_by_block(self, row, col, n, word_bags):
        """Print a sample of tweets corresponding to an intersection of row and column clusters -- i.e. a block in the image matrix.

        params:

        row, col: cluster indices
        n: # tweets to print
        word_bags: list of lemmatized tweets (not aggregated by handle)

        raises ValueError if 'word_bags' is None.
        """
        if word_bags is None:
            # TODO could get lemmas from dataframe...
            raise ValueError("word_bags is required")
        # get row/col indices from clusterer
        rows = np.where(np.equal(self.model.row_labels_, row))[0]
        cols = np.where(np.equal(self.model.column_labels_, col))[0]
        # resolve row indices to users via self.handles
        users = [self.handles[i] for i in rows]
        # get df row indices from users
        df_user_inds = np.where(np.isin(self.df.handle.values, users))[0]
        # resolve col indices to lemmas from vectorizer
        vocabulary = self._feature_names()
        lemmas = np.array([vocabulary[i] for i in cols], dtype=object)
        # sanity check
        print(f"example terms: {list(lemmas[:10])}\n")
        # get df row indices by finding lemmas in list of word bags
        df_inds = []
        for i in df_user_inds:
            b = np.array(word_bags[i], dtype=object)
            if len(np.intersect1d(b, lemmas)) > 0:
                df_inds.append(i)
        # print sample of 'n' tweets
        df_inds = np.random.choice(df_inds,
                                   size=min(n, len(df_inds)),
                                   replace=False)
        for i in df_inds:
            tweet = self.df.iloc[i, :]
            print(f"@{tweet.handle} - {tweet.description}")
            print("   " + tweet.text + "\n")
Пример #14
0
# Index of the time window currently being processed.
interval = 0

# Walk fixed-length windows [start:end) over the signal. `start`, `end`,
# `band`, `channels`, `pearson_data` and `length_of_intervals` are defined
# before this point. (Python 2 syntax: note the print statement.)
while end <= len(band[0]):
    print interval,
    for i in channels:
        for j in range(63):
            # i <= j: each unordered channel pair (including i == j) once
            if i <= j:
                # keep only the correlation coefficient, not the p-value
                pearson_data[interval].append(
                    pearsonr(band[i][start:end], band[j][start:end])[0])
    start = end
    end += length_of_intervals
    interval += 1

# One row per interval, one column per channel pair.
p = np.array(pearson_data)
spectral_model = SpectralBiclustering()
spectral_model.fit(p)

# Reorder rows, then columns, so members of the same cluster are adjacent.
fit_data = p[np.argsort(spectral_model.row_labels_)]
fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]
# Three figures: raw correlations, the rearranged matrix, and the block
# structure given by the outer product of the sorted (1-based) labels.
plt.matshow(p, cmap=plt.cm.Blues)
plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.matshow(np.outer(
    np.sort(spectral_model.row_labels_) + 1,
    np.sort(spectral_model.column_labels_) + 1),
            cmap=plt.cm.Blues)

# Dump the correlations as plain floats so they are JSON-serializable.
with open('media/pearson_30sec_bandpassMedian_clipped_2016.json', 'w+') as f:
    pearson_data_r = np.array(pearson_data)
    p = [[float(column) for column in row] for row in pearson_data_r]
    f.write(simplejson.dumps({'name': 's5d2nap', 'data': p}))
plt.show()