示例#1
0
    def compare_models(self, n_clusters, score_func):
        """Grid-search every candidate model and report the best scorer.

        The matrix in ``self._data`` is shuffled using ``self.random_state``
        and transformed with ``self.scaler.fit_transform``. Each
        ``(model, params)`` pair in ``self.models_and_params`` is then
        searched with ``GridSearchCV`` and the pair with the highest
        ``best_score_`` wins.

        Side effects: ``self.row_idx`` and ``self.col_idx`` are overwritten
        with the index arrays returned by the shuffle.

        Args:
            n_clusters (int, tuple or list): Number of clusters handed to
                each model constructor. When a tuple or list is given, only
                ``SpectralBiclustering`` (or a subclass) receives it as is;
                every other model receives ``min(n_clusters)``.
            score_func: Passed unchanged as ``scoring`` to ``GridSearchCV``.

        Returns:
            (tuple): ``(model_name, best_params, best_score)`` of the winning
                model. If no candidate achieves a score greater than
                ``-inf`` (for example because ``self.models_and_params`` is
                empty), the result is ``(None, None, -inf)``.

        """

        # Shuffle data matrix, row and col indicators for random state.
        _train, self.row_idx, self.col_idx = sgen._shuffle(
            self._data, random_state=self.random_state)
        # Scale the shuffled matrix with the scaler configured on the
        # instance (original note: subtract mean and divide by std).
        _train_std = self.scaler.fit_transform(_train)

        # ``np.float`` was removed from NumPy; ``np.inf`` is the portable
        # spelling of the same value.
        best_score = -np.inf
        # These names must match the ones returned below, otherwise the
        # method fails with an unbound local when no model wins.
        winner_model, winner_params = None, None
        for model, params in self.models_and_params:

            # Work on a per-model copy so that reducing the tuple for one
            # model does not leak into the following iterations.
            model_n_clusters = n_clusters
            if isinstance(n_clusters, (tuple, list)):
                # ``model`` is a class (it is instantiated below), so the
                # check has to be ``issubclass`` rather than ``isinstance``.
                takes_pair = (isinstance(model, type)
                              and issubclass(model, SpectralBiclustering))
                if not takes_pair:
                    model_n_clusters = min(n_clusters)

            # Determine the best hyperparameter combo for that model
            _grid = GridSearchCV(model(random_state=self.random_state,
                                       n_clusters=model_n_clusters),
                                 param_grid=params,
                                 scoring=score_func,
                                 n_jobs=16,
                                 cv=self.dummy_cv,
                                 return_train_score=True,
                                 refit=False)
            _grid.fit(_train_std, y=None)

            if self.verbose > 1:
                print('Model performance:\nName: {}\nScore: {}\n'
                      ''.format(model.__name__, _grid.best_score_))

            if _grid.best_score_ > best_score:
                winner_model = model.__name__
                best_score = _grid.best_score_
                winner_params = _grid.best_params_

        return (winner_model, winner_params, best_score)
示例#2
0
from sklearn.datasets import make_biclusters
from sklearn.datasets import samples_generator as sg
from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.metrics import consensus_score

# Synthetic 300x300 matrix with five planted biclusters, generated unshuffled.
data, rows, columns = make_biclusters(
    shape=(300, 300), n_clusters=5, noise=5, shuffle=False, random_state=0)

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")

# Shuffle the matrix; row_idx / col_idx are reused below to apply the same
# reordering to the ground-truth indicators.
data, row_idx, col_idx = sg._shuffle(data, random_state=0)
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Shuffled dataset")

# Fit the co-clustering model on the shuffled matrix.
model = SpectralCoclustering(n_clusters=6, random_state=0)
model.fit(data)

# Compare the recovered biclusters with the reordered ground truth.
score = consensus_score(
    model.biclusters_, (rows[:, row_idx], columns[:, col_idx]))

print("consensus score: {:.3f}".format(score))

# Order rows first, then columns, by their assigned cluster labels.
fit_data = data[np.argsort(model.row_labels_)][
    :, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")
示例#3
0
        os.remove(temppath)
        size(W, HEIGHT+dy+40)
else:
    def pltshow(mplpyplot):
        """Display the current figures by calling ``show()`` on *mplpyplot*.

        Args:
            mplpyplot: Object exposing a ``show()`` method (the callers in
                this script are expected to pass ``matplotlib.pyplot``).
        """
        mplpyplot.show()
# nodebox section end

# Checkerboard layout given as (row clusters, column clusters).
n_clusters = (4, 3)
data, rows, columns = make_checkerboard(
    shape=(300, 300),
    n_clusters=n_clusters,
    noise=10,
    shuffle=False,
    random_state=0)

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")

# Shuffle the matrix; row_idx / col_idx are reused below to apply the same
# reordering to the ground-truth indicators.
data, row_idx, col_idx = sg._shuffle(data, random_state=0)
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Shuffled dataset")

# Fit the biclustering model on the shuffled matrix.
model = SpectralBiclustering(
    n_clusters=n_clusters, method='log', random_state=0)
model.fit(data)

# Compare the recovered biclusters with the reordered ground truth.
score = consensus_score(
    model.biclusters_, (rows[:, row_idx], columns[:, col_idx]))

print("consensus score: {:.1f}".format(score))

# Order rows first, then columns, by their assigned cluster labels.
fit_data = data[np.argsort(model.row_labels_)][
    :, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
示例#4
0
def test_co_clustering():
    """Run spectral co-clustering on real, simulated and blob data.

    Three demonstrations are executed in sequence, each ending with a
    ``plt.show()`` call:

    1. Real data: time series inside two ROI masks are L2-normalised, turned
       into a correlation-based similarity matrix and co-clustered.
    2. Simulation: a matrix from ``make_biclusters`` is shuffled,
       co-clustered and scored against the known biclusters.
    3. Blobs: two data sets from ``generate_simple_blobs`` are compared by
       correlation similarity and co-clustered.

    The function depends on the module-level names ``home`` and
    ``generate_simple_blobs`` and on a hard-coded subject file path, so it
    only runs where those files exist.
    """
    # All imports live at the top of the function. Importing ``scipy as sp``
    # further down (as the code used to) makes ``sp`` a local name that is
    # still unbound at its first use, which raises UnboundLocalError.
    # ``cdist`` is imported from its submodule because ``import scipy`` alone
    # does not guarantee that ``scipy.spatial.distance`` is loaded.
    import numpy as np
    import nibabel as nb
    from matplotlib import pyplot as plt
    from scipy.spatial.distance import cdist
    from sklearn.cluster.bicluster import SpectralCoclustering
    from sklearn.datasets import make_biclusters
    from sklearn.datasets import samples_generator as sg
    from sklearn.metrics import consensus_score
    from sklearn.preprocessing import normalize

    def _correlation_similarity(first, second):
        # Similarity = 1 - correlation distance between the rows of the two
        # inputs; undefined (NaN) and negative similarities are set to 0.
        similarity = 1 - np.array(cdist(first, second, metric='correlation'))
        similarity[np.isnan(similarity)] = 0
        similarity[similarity < 0] = 0
        return similarity

    def _show_reordered(matrix, fitted):
        # Sort rows, then columns, by cluster label and display the result.
        reordered = matrix[np.argsort(fitted.row_labels_)]
        reordered = reordered[:, np.argsort(fitted.column_labels_)]
        plt.matshow(reordered, cmap=plt.cm.Blues)
        plt.title("After biclustering; rearranged to show biclusters")
        plt.show()

    # REAL DATA
    subject_file = '/Users/aki.nikolaidis/Desktop/NKI_SampleData/A00060280/3mm_bandpassed_demeaned_filtered_antswarp.nii.gz'
    roi_mask_file = home + '/git_repo/basc/masks/BG_3mm.nii.gz'
    roi2_mask_file = home + '/git_repo/basc/masks/yeo2_3mm.nii.gz'

    data = nb.load(subject_file).get_data().astype('float32')
    print('Data Loaded')

    print('Setting up NIS')
    # Each mask is loaded once and converted to a boolean array.
    roi_mask_nparray = nb.load(roi_mask_file).get_data().astype(
        'float32').astype('bool')
    roi2_mask_nparray = nb.load(roi2_mask_file).get_data().astype(
        'float32').astype('bool')

    roi1data = data[roi_mask_nparray]
    roi2data = data[roi2_mask_nparray]

    # Scale every row to unit L2 norm.
    roi1data = normalize(roi1data, norm='l2')
    roi2data = normalize(roi2data, norm='l2')

    sim_btwn_data_1_2 = _correlation_similarity(roi1data, roi2data)

    # Add a small random jitter (at most 0.01) and cap the result at 1.
    sim_btwn_data_1_2 = sim_btwn_data_1_2 + (
        np.random.rand(*sim_btwn_data_1_2.shape)) / 100
    sim_btwn_data_1_2[sim_btwn_data_1_2 > 1] = 1
    # No inf/NaN check is needed here: NaN and negative entries were zeroed
    # above and everything greater than 1 has just been capped. (The former
    # ``== np.nan`` comparison could never be true and its result was
    # discarded anyway.)

    model = SpectralCoclustering(n_clusters=5, random_state=0, n_init=100)
    model.fit(sim_btwn_data_1_2)

    _show_reordered(sim_btwn_data_1_2, model)

    # SIMULATION DATA
    data, rows, columns = make_biclusters(shape=(300, 100),
                                          n_clusters=5,
                                          noise=5,
                                          shuffle=False,
                                          random_state=0)

    plt.matshow(data, cmap=plt.cm.Blues)
    plt.title("Original dataset")

    # row_idx / col_idx are reused below to reorder the ground truth the
    # same way as the data.
    data, row_idx, col_idx = sg._shuffle(data, random_state=0)
    plt.matshow(data, cmap=plt.cm.Blues)
    plt.title("Shuffled dataset")

    model = SpectralCoclustering(n_clusters=5, random_state=0)
    model.fit(data)
    score = consensus_score(model.biclusters_,
                            (rows[:, row_idx], columns[:, col_idx]))

    print("consensus score: {:.3f}".format(score))

    _show_reordered(data, model)

    # BLOB DATA
    data1 = generate_simple_blobs(27)
    data2 = generate_simple_blobs(27)
    data2 = data2[0:150, :]

    print("Calculating Cross-clustering")
    print("Calculating pairwise distances between areas")

    # Compare the two blob data sets generated above; the earlier code
    # re-used the ROI matrices here and never touched data1/data2.
    sim_btwn_data_1_2 = _correlation_similarity(data1, data2)
    co_cluster = SpectralCoclustering()
    co_cluster.fit(sim_btwn_data_1_2)
    # No consensus score is reported for this section: the only ground truth
    # in scope belongs to the simulated matrix above and does not describe
    # this similarity matrix.

    # Reorder the matrix that was actually clustered.
    _show_reordered(sim_btwn_data_1_2, co_cluster)