예제 #1
0
def Spectral_CoClustering(args):
    """Perform bipartite (co-)clustering on the sparse matrix ``args.M``.

    Builds a SpectralCoclustering model (using the ARPACK SVD solver when
    ``args.arpack`` is set), fits it, reorders the matrix rows and columns
    so members of the same bicluster are adjacent, saves the result and
    returns it.

    Returns:
        (model, fit_data): the fitted model and the permuted COO matrix.
    """
    # Create model. ARPACK can fail for some inputs, so print a helpful
    # hint and re-raise instead of continuing with an undefined `model`
    # (the original bare `except` swallowed everything and then crashed
    # on `model.fit` with a confusing NameError).
    try:
        if args.arpack:
            model = SpectralCoclustering(
                n_clusters=args.nClusters, svd_method='arpack')
        else:
            model = SpectralCoclustering(
                n_clusters=args.nClusters)
    except Exception:
        print('-r 1 may cause problems when svd_method has been set to arpack')
        raise
    print('Running coclustering')
    model.fit(args.M.tocsc())
    print('Coclustering done')

    # Permute COO coordinates directly (equivalent to fancy-indexing a
    # dense matrix with argsort of the labels, but stays sparse).
    fit_data = args.M.tocoo()
    fit_data.row = invert_permutation(np.argsort(model.row_labels_))[fit_data.row]
    fit_data.col = invert_permutation(np.argsort(model.column_labels_))[fit_data.col]

    save_clusters(model, fit_data, args, '_CoClustering')

    return model, fit_data
def test_spectral_coclustering():
    """Dhillon's Spectral CoClustering recovers a simple planted problem."""
    param_grid = {
        'svd_method': ['randomized', 'arpack'],
        'n_svd_vecs': [None, 20],
        'mini_batch': [False, True],
        'init': ['k-means++'],
        'n_init': [10],
        'n_jobs': [1]
    }
    seed = 0
    S, rows, cols = make_biclusters((30, 30), 3,
                                    noise=0.5,
                                    random_state=seed)
    # Shift to nonnegative, then zero small entries so a sparse
    # representation is meaningful.
    S -= S.min()
    S = np.where(S < 1, 0, S)
    for mat in (S, csr_matrix(S)):
        for kwargs in ParameterGrid(param_grid):
            model = SpectralCoclustering(n_clusters=3,
                                         random_state=seed,
                                         **kwargs)
            model.fit(mat)

            # Every row/column belongs to exactly one of the 3 biclusters
            # and the planted structure is recovered perfectly.
            assert_equal(model.rows_.shape, (3, 30))
            assert_array_equal(model.rows_.sum(axis=0), np.ones(30))
            assert_array_equal(model.columns_.sum(axis=0), np.ones(30))
            assert_equal(consensus_score(model.biclusters_, (rows, cols)), 1)

            _test_shape_indices(model)
def bi_clustering(data, args):
    """Co-cluster ``data`` into ``args.k`` biclusters and optionally plot.

    Zero entries are replaced with the matrix maximum, the matrix is
    scaled to [0, 1], row labels are written to ``args.o`` and, when
    ``args.plot`` is set, the original and rearranged matrices are shown.

    NOTE(review): ``data`` is mutated in place (zeros replaced) — confirm
    callers do not rely on the original values.
    """
    # Was a Python 2 print statement; now a function call.
    print('clustering...')

    max_val = np.max(np.max(data))

    # Zeros look like strong similarities otherwise; map them to the
    # maximum, then normalise to [0, 1].
    data[data == 0] = max_val
    data = data / max_val

    model = SpectralCoclustering(n_clusters=args.k, svd_method='arpack')
    model.fit(data)

    np.savetxt(args.o, model.row_labels_, fmt="%d", newline="\n")

    # Rearrange rows/columns so biclusters form contiguous blocks.
    fit_data = data[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    if not args.plot:
        return

    plt.matshow(shuffle(data), cmap=plt.cm.Blues)
    plt.title("Org dataset")

    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")

    plt.show()
예제 #4
0
 def run(self, data):
     """Fit a co-clustering of ``data`` and return the column (cell) labels.

     NOTE(review): SpectralCoclustering documents an integer
     ``n_clusters``; passing a (genes, cells) tuple here looks like it was
     intended for SpectralBiclustering — confirm.
     """
     bc = SpectralCoclustering(n_clusters=(self.n_gene_classes,
                                           self.n_classes))
     bc.fit(data)
     # Row labels (gene clusters) are computed but only the column (cell)
     # labels are returned.
     gene_clusters = bc.row_labels_
     cell_clusters = bc.column_labels_
     return cell_clusters
예제 #5
0
def main():
    """Co-cluster network-flow records read from ``10k.txt``.

    Each line is comma separated; the features used are the hour of day
    from the timestamp, the two IP addresses packed into single integers,
    and a few raw numeric columns. Prints the member indices of each of
    the 5 row clusters.
    """
    # Context manager ensures the file is closed even on error (the
    # original never closed the handle).
    with open('10k.txt', 'r') as origin:
        lines = origin.readlines()

    x = []
    for line in lines:
        fields = line.split(',')
        ip1 = fields[2].split('.')
        ip2 = fields[3].split('.')
        # Hour of day + both IPs packed as 32-bit hex ints + selected
        # raw columns.
        d = [datetime.fromtimestamp(int(fields[1][0:11])).hour,
             int("%02x%02x%02x%02x" % (int(ip1[0]), int(ip1[1]),
                                       int(ip1[2]), int(ip1[3])), 16),
             int("%02x%02x%02x%02x" % (int(ip2[0]), int(ip2[1]),
                                       int(ip2[2]), int(ip2[3])), 16)] \
            + fields[4:6] + fields[7:10]
        x.append(d)

    data = np.array(x, dtype='float32')

    model = SpectralCoclustering(n_clusters=5)
    model.fit(data)

    print(model.rows_)

    # Python 2 print statements converted to print() calls; the trailing
    # comma idiom is replaced with a single joined line per cluster.
    for i in range(5):
        print("Cluster" + str(i) + ':')
        members = [j for j in range(10000) if model.rows_[i][j]]
        print(' '.join(str(j) for j in members))
        print(' ')
예제 #6
0
def update_bicluster(batch_df, task_df, compound_df, mode='RobustMT', K=5):
    """Assign cluster labels and colours to prediction batches/compounds.

    For multi-task modes the minibatch prediction matrix is co-clustered
    into ``K`` groups; for single-task ('ST') rows are binned into ``K``
    rank quantiles instead. Label colours are then propagated from
    ``batch_df`` to ``compound_df``.

    Args:
        batch_df: minibatch prediction dataframe (first ``n_tasks``
            columns are task outputs); gains 'batch_label' and
            'batch_label_color' columns.
        task_df: task dataframe; used only for its column count.
        compound_df: compound dataframe; gains label/colour columns.
        mode: 'RobustMT', 'ST' or 'MT'.
        K: number of clusters / quantile bins.

    Returns:
        (batch_df, task_df, compound_df) with the added columns.

    Raises:
        ValueError: if ``mode`` is not one of the three supported modes.
    """
    if mode == 'RobustMT':
        n_tasks = task_df.shape[1] - 1
    elif mode == 'ST':
        n_tasks = 1
    elif mode == 'MT':
        n_tasks = task_df.shape[1]
    else:
        # Previously an unknown mode fell through silently and crashed
        # later with UnboundLocalError on `n_tasks`; fail fast instead.
        raise ValueError("mode must be 'RobustMT', 'ST' or 'MT', got %r" % (mode,))

    if not mode == 'ST':
        # cocluster of the minibatch predictive matrix
        X = preprocessing.scale(np.matrix(batch_df)[:, 0:n_tasks])
        cocluster = SpectralCoclustering(n_clusters=K, random_state=0)
        cocluster.fit(X)
        batch_df['batch_label'] = cocluster.row_labels_
    else:
        # Single task: bin rows into K rank quantiles instead.
        rank_x = batch_df[batch_df.columns[0]].rank().tolist()
        groups = pd.qcut(rank_x, K, duplicates='drop')
        batch_df['batch_label'] = groups.codes

    # generate color hex for batch_label
    lut = dict(zip(batch_df['batch_label'].unique(), Category20_20))
    batch_df['batch_label_color'] = batch_df['batch_label'].map(lut)

    # generate color hex for compound_df
    lut2 = dict(zip(batch_df['Label_id'], batch_df['batch_label_color']))
    compound_df['batch_label_color'] = compound_df['label'].map(lut2)
    lut22 = dict(zip(batch_df['Label_id'], batch_df['batch_label']))
    compound_df['batch_label'] = compound_df['label'].map(lut22)
    groups = pd.qcut(compound_df['label'].tolist(),
                     len(Category20b_20),
                     duplicates='drop')
    c = [Category20b_20[xx] for xx in groups.codes]
    compound_df['label_color'] = c

    return batch_df, task_df, compound_df
예제 #7
0
def plot_biclusters():
    """Pick the best co-clustering config (lowest TVR in bic_scores.csv),
    refit on the standardized original-image feature matrix and plot the
    bicluster maps."""

    co_grid = ParameterGrid(
        {'n_clusters': np.arange(2, 10, 1), 'n_init': [20]}
    )
    _y = pd.read_csv('./../../data_source/to_analysis/original_images/dfs_original_images.csv', index_col=0)
    y_orig = np.squeeze(_y.values)

    X_orig = pd.read_csv('./../../data_source/to_analysis/original_images/all_features_original_images.csv', index_col=0)

    # Standardize features before clustering.
    scaler = StandardScaler()
    X_orig_std = scaler.fit_transform(X_orig.values)

    #_run_experiment(co_grid, X_orig_std)

    df_avg_co_scores = pd.read_csv('bic_scores.csv', index_col=0)
    # NOTE(review): the `- 1` when mapping the argmin row back to a grid
    # index looks like an off-by-one — confirm against how bic_scores.csv
    # was written by _run_experiment.
    best_co_config = co_grid[
        np.argmin(df_avg_co_scores.loc[:, 'tvr'].values) - 1
    ]
    print(best_co_config, min(df_avg_co_scores.loc[:, 'tvr'].values))

    orig_co_model = SpectralCoclustering(random_state=0, svd_method='arpack')
    orig_co_model.set_params(**best_co_config)
    orig_co_model.fit(X_orig_std)

    #plt.figure()
    #_plot_tve(df_avg_co_scores, co_grid)

    plt.figure()
    _plot_bicmaps(X_orig_std, best_co_config)
예제 #8
0
def correlation_matrix(df):
    """Plot the Pearson correlation matrix of ``df`` reordered into 4
    spectral co-clusters, with a cluster-label strip on the left, and
    save it to reports/correlation_matrix.png."""
    sns.set(style='white', font_scale=.9)
    clusters = 4
    pearson = df.drop(['asset', 'unixtime'], axis=1).corr(method='pearson')
    clust = SpectralCoclustering(n_clusters=clusters, random_state=0)
    clust.fit(pearson)
    # Reorder rows (descending label) and columns (ascending label) so
    # co-clusters form contiguous blocks.
    pearson = pearson.iloc[np.argsort(clust.row_labels_)[::-1],
                           np.argsort(clust.column_labels_)]

    # Narrow strip for cluster labels + wide main heatmap.
    grid = dict(width_ratios=[1.5, pearson.shape[1]])
    fig, axs = plt.subplots(1, 2, figsize=(10, 8), gridspec_kw=grid)

    # Left strip: row cluster labels in the same (descending) order.
    sns.heatmap(data=np.sort(clust.row_labels_)[::-1].reshape(-1, 1),
                ax=axs[0],
                cbar=False,
                linewidths=.005,
                cmap=sns.color_palette('Spectral'))
    axs[0].set(xticks=(), yticks=())

    sns.heatmap(data=pearson,
                cmap=sns.diverging_palette(220, 10, n=11),
                linewidths=.005,
                cbar_kws={'shrink': .75},
                vmax=1,
                vmin=-1,
                ax=axs[1])
    axs[1].set_xticklabels(pearson.columns, rotation='vertical')
    axs[1].set_yticklabels(pearson.index, rotation='horizontal')

    fig.suptitle(f'Variable Correlation Matrix in {clusters} Clusters',
                 fontsize=20)
    fig.tight_layout(w_pad=.5, rect=(.03, 0, 1, .95))
    fig.savefig('reports/correlation_matrix.png')
예제 #9
0
def bicluster(*cotables):
    """Co-cluster the first table and reorder every table consistently.

    Fits SpectralCoclustering (one cluster per column of the first table)
    on ``cotables[0]`` and returns each DataFrame with rows and columns
    permuted so biclusters are contiguous.
    """
    table = cotables[0]
    model = SpectralCoclustering(n_clusters=table.shape[1], random_state=0)
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the
    # supported equivalent.
    model.fit(table.to_numpy())
    return [
        cotable.iloc[np.argsort(model.row_labels_),
                     np.argsort(model.column_labels_)] for cotable in cotables
    ]
예제 #10
0
def spectral_co_cluster(data, n_clusters, para_jobs=1, random_state=None):
    """Co-cluster ``data`` and return ``(row_labels, col_labels)``.

    Args:
        data: 2-D array-like passed to SpectralCoclustering.fit.
        n_clusters: number of biclusters.
        para_jobs: kept for interface compatibility; scikit-learn removed
            the estimator's ``n_jobs`` parameter, so it is now unused.
        random_state: seed forwarded to the estimator.
    """
    # sklearn.cluster.bicluster was removed in scikit-learn 0.24; the
    # estimator now lives directly in sklearn.cluster, and its n_jobs
    # parameter was dropped in the same era.
    from sklearn.cluster import SpectralCoclustering
    model = SpectralCoclustering(n_clusters, random_state=random_state)
    model.fit(data)
    row_labels = model.row_labels_
    col_labels = model.column_labels_
    return row_labels, col_labels
예제 #11
0
    def spectral_coclustering(cls, *args):
        """
        Build a Biclustering wrapper around scikit-learn's
        SpectralCoclustering implementation.

        :param args: positional arguments forwarded verbatim to the
            scikit-learn constructor
        :return: returns the Biclustering object
        """

        return cls(SpectralCoclustering(*args))
예제 #12
0
def get_clusters(data):
    """Co-cluster ``data`` into 5 biclusters.

    Returns:
        (word_clusters, hidden_clusters): for each of the 5 clusters, the
        row indices and the column indices as plain Python lists.
    """
    coclusters = SpectralCoclustering(n_clusters=5, random_state=0)
    coclusters.fit(data)
    word_clusters, hidden_clusters = [], []
    for label in range(5):
        row_idx, col_idx = coclusters.get_indices(label)
        word_clusters.append(row_idx.tolist())
        hidden_clusters.append(col_idx.tolist())
    return word_clusters, hidden_clusters
예제 #13
0
def _plot_bicmaps(X_orig_std, best_co_config):
    """Refit the best co-clustering config on the standardized feature
    matrix, draw the rearranged heat map with bicluster outlines and a
    colorbar, and save it as a PDF."""

    # Train model with best config.
    orig_co_model = SpectralCoclustering(random_state=0, svd_method='arpack')
    orig_co_model.set_params(**best_co_config)
    orig_co_model.fit(X_orig_std)
    # Reorder rows/columns so biclusters form contiguous blocks.
    orig_co_row_sorted = X_orig_std[np.argsort(orig_co_model.row_labels_), :]
    orig_co_fit_data = orig_co_row_sorted[:, np.argsort(orig_co_model.column_labels_)]

    hmap = sns.heatmap(
        orig_co_fit_data,
        robust=True,
        cmap=plt.cm.viridis,
        fmt='f',
        vmin=np.min(orig_co_fit_data),
        vmax=np.max(orig_co_fit_data),
        cbar=False
    )
    # Outline each bicluster as a closed rectangle (5 points: back to x1/y1).
    coords = bic_coords(orig_co_model, best_co_config['n_clusters'])
    for num in coords.index:
        plt.plot(
            (coords.loc[num, ['x1', 'x2', 'x2', 'x1', 'x1']]),
            (coords.loc[num, ['y1', 'y1', 'y2', 'y2', 'y1']]),
            c='darkred'
    )
    plt.ylabel('Patients')
    plt.xlabel('Features')

    plt.yticks([], [])
    plt.xticks([], [])

    # Attach a slim vertical colorbar next to the heat map.
    ax_divider = make_axes_locatable(hmap)
    cax = ax_divider.append_axes('right', size='3%', pad='2%')
    colorbar.colorbar(
        hmap.get_children()[0],
        cax=cax,
        orientation='vertical'
    )
    #cax.xaxis.set_label_text('AUC', fontname='Sans')
    #cax.xaxis.set_label_position('top')
    # Six evenly spaced ticks over the observed data range.
    cbar_ticks = np.linspace(
        np.nanmin(orig_co_fit_data),
        np.nanmax(orig_co_fit_data),
        6
    )
    cax.yaxis.set_ticks(cbar_ticks)
    cax.yaxis.set_ticklabels([f'{num:.01f}' for num in cbar_ticks])

    plt.savefig(
        '../biclustering/bic_map_original_images.pdf',
        bbox_inches='tight',
        transparent=True,
        dpi=CONFIG.DPI,
    )
예제 #14
0
def plot_coclusters_raw_data(time_ms, t=False):
    """Z-score the sliced channel matrix, co-cluster it and save a plot.

    Args:
        time_ms: window length in milliseconds passed to ``slice_matrix``.
        t: when True, transpose the sliced matrix before clustering.
    """
    # The original if/else branches were identical; apply the transpose
    # that the comment promised when ``t`` is set.
    # NOTE(review): assumes slice_matrix returns a numpy array (.T) —
    # confirm.
    if t:
        channels_data = slice_matrix(matrix, time_ms).T
    else:
        channels_data = slice_matrix(matrix, time_ms)
    # Was a Python 2 print statement.
    print(len(channels_data), len(channels_data[1]))
    z_score = stats.zscore(channels_data)
    plt.title('Z Score Biclustering Over %i ms' % time_ms)
    spectral_model = SpectralCoclustering()
    spectral_model.fit(z_score)
    # Reorder rows/columns so biclusters appear as contiguous blocks.
    fit_data = z_score[np.argsort(spectral_model.row_labels_)]
    fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.savefig('z_score_raw_coclustering_all_ts_%i_T_%s.svg' % (time_ms, str(t)))
예제 #15
0
def _get_clusters_using_spectrals(corrarr, n_clusters=5, mode='co'):
    """Cluster the columns of a correlation DataFrame.

    Args:
        corrarr: square pandas DataFrame of correlations.
        n_clusters: number of clusters to extract.
        mode: 'co' for SpectralCoclustering, 'bi' for SpectralBiclustering.

    Returns:
        A list of column-index lists, one per cluster.

    Raises:
        ValueError: if ``mode`` is neither 'co' nor 'bi'.
    """
    if mode == 'co':
        model = SpectralCoclustering(n_clusters, random_state=0)
        model.fit(corrarr)
        indices = np.arange(corrarr.columns.size)
        clusters = [indices[x].tolist() for x in model.columns_]
        return clusters
    elif mode == 'bi':
        model = SpectralBiclustering(n_clusters, random_state=0)
        model.fit(corrarr)
        indices = np.arange(corrarr.columns.size)
        clusters = [indices[x].tolist() for x in model.columns_]
        # The checkerboard column groups repeat; keep only the first
        # period of the repetition.
        repetition_start = clusters[1:].index(clusters[0]) + 1
        return clusters[:repetition_start]
    else:
        # `raise("Mode wrong?")` raised a TypeError (strings are not
        # exceptions); raise a proper ValueError instead.
        raise ValueError("mode must be 'co' or 'bi', got %r" % (mode,))
예제 #16
0
File: coclust_tade.py  Project: makrai/misc
 def cocluster(self, mx, blockdiag=False):
     """Co-cluster the Tade matrix ``mx`` and return it with rows and
     columns permuted so clusters are contiguous.

     ``self.prev`` and ``self.case`` (row/column metadata) are permuted
     in place with the same argsort so they stay aligned with the
     reordered matrix.

     :param blockdiag: True -> SpectralCoclustering (block-diagonal
         structure); False -> SpectralBiclustering (checkerboard, 4x3).
     """
     logging.info('Co-clustering Tade..')
     if blockdiag:
         logging.info('blockdiag')
         clusser = SpectralCoclustering(n_jobs=-1)
     else:  # checkerboard
         logging.info('checkerboard')
         clusser = SpectralBiclustering(n_jobs=-1, n_clusters=(4, 3))
         #n_clusters=3, svd_method='randomized',
     clusser.fit(mx)
     logging.info('Argsorting mx rows..')
     mx = mx[np.argsort(clusser.row_labels_)]
     self.prev = self.prev[np.argsort(clusser.row_labels_)]
     logging.info('Argsorting mx cases..')
     mx = mx[:, np.argsort(clusser.column_labels_)]
     self.case = self.case[np.argsort(clusser.column_labels_)]
     return mx
예제 #17
0
def main(model):
    """Load a fitted trace model from an HDF5 store, build the z-to-z
    transition structure and render the transition matrices to PDF."""
    store = pd.HDFStore(model)
    
    from_ = store['from_'][0][0]
    to = store['to'][0][0]
    assert from_ == 0
    
    trace_fpath = store['trace_fpath'][0][0]
    Theta_zh = store['Theta_zh'].values
    # NOTE(review): Psi_oz is read from the 'Psi_sz' key — confirm this
    # naming mismatch is intentional.
    Psi_oz = store['Psi_sz'].values
    count_z = store['count_z'].values[:, 0]

    # Column-normalise, then build the reverse (object-to-topic) matrix
    # weighted by topic counts.
    Psi_oz = Psi_oz / Psi_oz.sum(axis=0)
    Psi_zo = (Psi_oz * count_z).T
    Psi_zo = Psi_zo / Psi_zo.sum(axis=0)
    obj2id = dict(store['source2id'].values)
    hyper2id = dict(store['hyper2id'].values)
    id2obj = dict((v, k) for k, v in obj2id.items())

    # Topic-to-topic transition probabilities, binarised around the
    # uniform threshold 1/|Z|.
    ZtZ = Psi_zo.dot(Psi_oz)
    ZtZ = ZtZ / ZtZ.sum(axis=0)
    L = ZtZ
    #ZtZ[ZtZ < (ZtZ.mean())] = 0
    L[ZtZ >= 1.0 / (len(ZtZ))] = 1
    L[L != 1] = 0

    colormap = toyplot.color.brewer.map("Purples", domain_min=0, domain_max=1, reverse=True)
    print(colormap)
    canvas = toyplot.matrix((L.T, colormap), label="P[z' | z]", \
            colorshow=False, tlabel="To z'", llabel="From")[0]
    #canvas.axes(ylabel='From z', xlabel='To z\'')
    toyplot.pdf.render(canvas, 'tmat.pdf')

    # Co-cluster the binary transition matrix and render the version
    # reordered by cluster labels.
    model = SpectralCoclustering(n_clusters=3)
    model.fit(L)
    fit_data = L[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]
    canvas = toyplot.matrix((fit_data, colormap), label="P[z' | z']", \
            colorshow=False)[0]
    toyplot.pdf.render(canvas, 'tmat-cluster.pdf')
    
    #AtA = Psi_oz.dot(Psi_zo)
    #np.fill_diagonal(AtA, 0)
    #AtA = AtA / AtA.sum(axis=0)

    store.close()
예제 #18
0
def biclustering(db):
    """Co-cluster the device-by-program viewtime matrix and report income.

    Reads a precomputed viewtime matrix, normalises each row to fractions
    (via the external ``fraction`` helper), fits a 10-cluster
    SpectralCoclustering and, for the first 5 clusters, prints the
    programs involved and looks up the income of the member devices.
    """
    df = pd.read_csv(
        '/home/fan/intern/process_db/analysis/viewtime_matrix_501_0.1.csv')
    dma = 501
    # Python 2 print statements converted to print() calls throughout.
    print(df.shape)
    # DataFrame.ix was removed in pandas 1.0; use .iloc for positional
    # access.
    dev_list = df.iloc[:, 0].values
    prog_list = df.columns.values
    # Drop the device-id column and convert each row to fractions.
    df.drop(df.columns[0], axis=1, inplace=True)
    df = df.apply(fraction, axis=1)

    model = SpectralCoclustering(n_clusters=10)
    model.fit(df)
    print(model.get_params())
    for i in range(0, 5):
        print('Size of one cluster:', model.get_shape(i))
        indices = model.get_indices(i)
        print(prog_list[indices[1]])
        print(model.get_submatrix(i, df.values))
        dev_in_cluster = dev_list[indices[0]]
        print('number of devices within this cluster:', len(dev_in_cluster))
        get_income(db, dma, dev_in_cluster.tolist())
def cluster_ex_by_feature_matrix(sub_ex_by_feat_mat, plot_file):
    """Co-cluster an examples-by-features sparse matrix and plot it.

    Subsets to features (motifs) that are used at least once, fits a
    50-cluster SpectralCoclustering, rearranges rows/columns by cluster
    label and saves a heat map of the first 500 rows to ``plot_file``.
    """
    if sub_ex_by_feat_mat.shape[0] > 50000:
        # Python 2 print statement converted; typo "pleased" fixed.
        print("Matrix too large to be efficient, please reduce number of examples")

    # Subset down to motifs (columns) that are used at least once.
    plot_df = sub_ex_by_feat_mat[:,
                                 np.apply_along_axis(
                                     np.max, 0, sub_ex_by_feat_mat.toarray()
                                 ) != 0]
    # NOTE(review): the original body then overwrote `plot_df` several
    # times using names (`sub_ex_by_feat_mat_1`, `sub_ex_by_feat_df2`)
    # that are undefined in this scope and would raise NameError; that
    # scratch code has been removed. Confirm the intended filtering.

    model = SpectralCoclustering(n_clusters=50)
    model.fit(plot_df)  # fits for 50K
    # NOTE(review): .ix implies a pandas DataFrame here, but plot_df was
    # sliced from a (sparse) matrix above — confirm the expected input
    # type; .ix itself was removed in pandas 1.0.
    fit_data = plot_df.ix[np.argsort(model.row_labels_)]
    fit_data = fit_data.ix[:, np.argsort(model.column_labels_)]

    plt.figure()
    plt.matshow(fit_data.ix[0:500, ], cmap=plt.cm.YlGnBu, aspect='auto')
    plt.savefig(plot_file)

    print("DONE: biclustering plot here: {0}".format(plot_file))

    return "pretty picture"
예제 #20
0
def _run_experiment(co_grid, X_orig_std):
    """Score every co-clustering config over 40 random restarts and write
    the averaged TVR scores to ``bic_scores.csv``."""

    np.random.seed(seed=0)
    random_states = np.random.choice(40, size=40)

    avg_co_scores = {}
    for config_id, config in enumerate(co_grid):
        per_state_scores = []
        for state in random_states:
            co_model = SpectralCoclustering(random_state=state,
                                            svd_method='arpack')
            # NOTE: Outputs a TVE score.
            clustering = biclusters(co_model, X_orig_std, config)
            per_state_scores.append(clustering.external_metrics.values)
        # Average over restarts, ignoring NaNs.
        avg_co_scores[config_id] = np.nanmean(per_state_scores, axis=0)

    avg_orig_co_scores = [np.mean(scores, axis=0)
                          for scores in avg_co_scores.values()]

    df_avg_co_scores = pd.DataFrame(avg_orig_co_scores, columns=['tvr'])
    df_avg_co_scores.index.name = 'ConfigID'

    df_avg_co_scores.to_csv('bic_scores.csv')
def Block_diagonal(input_path, top_sd, n_clusters, output_path):
    """Co-cluster the most variable rows of a data matrix into
    block-diagonal structure and write the results to ``output_path``.

    Args:
        input_path: TSV file (rows = features/CpGs, columns = samples).
        top_sd: fraction (0-1] of rows to keep, ranked by standard
            deviation.
        n_clusters: number of biclusters.
        output_path: directory receiving fit_data.csv, bicluster.png and
            output.xlsx.
    """
    ### input data
    input_dat = pd.read_csv(input_path, index_col=0, sep='\t', comment='#')

    pro_dat = input_dat.fillna(0)

    # Keep the top `top_sd` fraction of rows by standard deviation.
    # DataFrame/Series.ix was removed in pandas 1.0: positional slices
    # use .iloc, label lookups use .loc.
    df_sd = pro_dat.apply(np.std, axis=1)
    df_sd_sort = df_sd.sort_values(ascending=False)
    df_sd_sort_top = df_sd_sort.iloc[:int(len(df_sd_sort) * top_sd)]
    pro_dat = pro_dat.loc[df_sd_sort_top.index, :]

    sd_index = pro_dat.index
    sd_sample_names = pro_dat.columns

    ### model
    model = SpectralCoclustering(n_clusters=n_clusters, random_state=0)
    model.fit(pro_dat)

    # Reorder rows/columns so biclusters form diagonal blocks.
    pro_dat = np.array(pro_dat)
    fit_data = pro_dat[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    ### output the model fitting data (with the permuted labels restored)
    fit_data = pd.DataFrame(fit_data)
    fit_data.index = sd_index[np.argsort(model.row_labels_)]
    fit_data.columns = sd_sample_names[np.argsort(model.column_labels_)]

    out_fit_data_path = os.path.join(output_path, 'fit_data.csv')
    fit_data.to_csv(out_fit_data_path)

    ### output image
    fig = plt.figure(figsize=(20, 40))
    ax = fig.add_subplot(111)
    ax.matshow(fit_data, cmap=plt.cm.Blues)
    ax.set_title("After biclustering; rearranged to show biclusters")

    out_img_path = os.path.join(output_path, 'bicluster.png')
    fig.savefig(out_img_path)

    ### output module membership (row and column labels, sorted)
    a11 = pd.Series(model.row_labels_)
    b11 = pd.Series(model.column_labels_)
    c11 = a11.groupby(a11).size()
    c22 = b11.groupby(b11).size()
    d11 = pd.DataFrame(a11.sort_values().values, fit_data.index.values)
    d22 = pd.DataFrame(b11.sort_values().values, fit_data.columns.values)
    d11.columns = ['cpg_module']
    d22.columns = ['sample_module']

    out_module_path = os.path.join(output_path, 'output.xlsx')
    writer = pd.ExcelWriter(out_module_path)
    d11.to_excel(writer, 'Sheet1')
    d22.to_excel(writer, 'Sheet2')
    # ExcelWriter.save() was removed in pandas 2.0; close() saves and
    # releases the handle.
    writer.close()

    print("\n")
    print("cpg module:")
    print(c11)
    print("\n")
    print("sample module:")
    print(c22)
# exclude 'comp.os.ms-windows.misc'
categories = ['alt.atheism', 'comp.graphics',
              'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
              'comp.windows.x', 'misc.forsale', 'rec.autos',
              'rec.motorcycles', 'rec.sport.baseball',
              'rec.sport.hockey', 'sci.crypt', 'sci.electronics',
              'sci.med', 'sci.space', 'soc.religion.christian',
              'talk.politics.guns', 'talk.politics.mideast',
              'talk.politics.misc', 'talk.religion.misc']
newsgroups = fetch_20newsgroups(categories=categories)
y_true = newsgroups.target

# TF-IDF features; `number_aware_tokenizer` is defined elsewhere
# (presumably digit-aware — confirm).
vectorizer = TfidfVectorizer(stop_words='english', min_df=5,
                             tokenizer=number_aware_tokenizer)
# One cluster per newsgroup category for both algorithms.
cocluster = SpectralCoclustering(n_clusters=len(categories),
                                 svd_method='arpack', random_state=0)
kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,
                         random_state=0)

print("Vectorizing...")
X = vectorizer.fit_transform(newsgroups.data)

# Compare document row-cluster assignments against the true newsgroup
# labels via the V-measure.
print("Coclustering...")
start_time = time()
cocluster.fit(X)
y_cocluster = cocluster.row_labels_
print("Done in {:.2f}s. V-measure: {:.4f}".format(
    time() - start_time,
    v_measure_score(y_cocluster, y_true)))

print("MiniBatchKMeans...")
예제 #23
0
# Co-cluster a tab-separated data file and save before/after images.
os.mkdir('solution')

#n_clusters = (3, 2)
n_clusters = 20

# Each line of dados_v2.txt is tab separated; the trailing field is
# dropped. The Python 2 `map(float, ...)` returned a list; on Python 3
# a lazy map object breaks np.array, so build explicit lists. The file
# is closed via a context manager.
with open('dados_v2.txt') as arq:
    dados = np.array([[float(v) for v in a.split('\t')[:-1]]
                      for a in arq.readlines()])

# zip(*m) transposes for display; it must be materialised with list()
# on Python 3 before matshow can consume it.
plt.matshow(list(zip(*dados)), cmap=cm.PiYG)
plt.title("Original dataset")
pl.savefig('solution/original.png', bbox_inches=0)

#model = SpectralBiclustering(n_clusters=n_clusters, method='log', random_state=0)
model = SpectralCoclustering(n_clusters=n_clusters,
                             svd_method='arpack',
                             random_state=0)
model.fit(dados)
# Rearrange rows/columns so biclusters are contiguous.
fit_data = dados[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(list(zip(*fit_data)), cmap=cm.PiYG)
pl.savefig('solution/biclustered.png', bbox_inches=0)
plt.title("After biclustering; rearranged to show biclusters")
plt.matshow(list(zip(*np.outer(
    np.sort(model.row_labels_) + 1,
    np.sort(model.column_labels_) + 1))),
            cmap=plt.cm.PiYG)
plt.title("Checkerboard structure of rearranged data")
pl.savefig('solution/biclustered and rearranged.png', bbox_inches=0)
예제 #24
0
# Hashtag co-occurrence analysis: correlate, co-cluster and reorder.
print(coOccurencesMatrix)
print(coOccurencesMatrix.shape)

hashtags = vectorizer.get_feature_names()
hashtags = np.array(hashtags)

# NOTE(review): this rewrite replaces zeros with zeros (a no-op) — kept
# from the original; probably a leftover from an earlier transform.
coOccurencesMatrix = np.where(coOccurencesMatrix == 0, 0, coOccurencesMatrix)
#coOccurencesMatrix = StandardScaler().fit_transform(coOccurencesMatrix)

print(coOccurencesMatrix)
import copy
# Keep an independent correlation copy before clustering.
coOccurencesMatrix2 = copy.deepcopy(coOccurencesMatrix)
coOccurencesMatrix2 = np.corrcoef(coOccurencesMatrix2)
coOccurencesMatrix = np.corrcoef(coOccurencesMatrix)
nbClusters = 40
model = SpectralCoclustering(n_clusters=nbClusters, random_state=1)
model.fit(coOccurencesMatrix)
print("fit")
print(coOccurencesMatrix)
# Reorder rows/columns (and matching hashtag labels) by cluster label.
fit_data = coOccurencesMatrix[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]
hashtagsrow = hashtags[np.argsort(model.row_labels_)]
hashtagscolumns = hashtags[np.argsort(model.column_labels_)]
print("rowlavels")
print(model.row_labels_)
print("columnlzbels")
print(model.column_labels_)
print("hashtags")
print(hashtags)
print(fit_data.shape)
print(fit_data)
예제 #25
0
# Heat map of UNCURL-vs-Seurat cluster overlap (row-normalised counts).
plt.figure(figsize=(15, 25))
sns.heatmap(cluster_counts / cluster_counts.sum(1)[:, np.newaxis],
            yticklabels=cluster_cell_types,
            vmin=0,
            vmax=1,
            linewidths=0.5)
plt.xlabel('UNCURL clusters')
plt.ylabel('Seurat clusters')
plt.title('SCH Cerebellum Clusters')
plt.savefig('uncurl_vs_seurat_clusters.png', dpi=200)

# do a biclustering

from sklearn.cluster.bicluster import SpectralCoclustering

spec = SpectralCoclustering(18)
# NOTE(review): this subset (row 31 removed) is built but never used —
# the fit below runs on the full `cluster_counts`; confirm which was
# intended.
cluster_counts_subset = np.vstack(
    [cluster_counts[:31, :], cluster_counts[32:, :]])
# Small additive offset — presumably to avoid all-zero rows/columns in
# the fit; confirm.
spec.fit(cluster_counts + 0.0001)
row_labels = spec.row_labels_
column_labels = spec.column_labels_

row_order = np.argsort(row_labels)
col_order = np.argsort(column_labels)

#row_labels = row_labels[row_order]
#col_labels = column_labels[col_order]

# Reorder the count matrix so biclusters are contiguous.
cluster_counts_reordered = cluster_counts[row_order, :]
cluster_counts_reordered = cluster_counts_reordered[:, col_order]
cluster_cell_types_2 = np.array(
예제 #26
0
def test_co_clustering():
    """Exploratory co-clustering runs on (1) real NKI fMRI data, (2)
    simulated planted biclusters and (3) blob data.

    NOTE(review): this function bundles three standalone experiments and
    references names from enclosing scope (`home`, `sp`,
    `generate_simple_blobs`, `KMeans`, `df`, `k`, `NC`, `SS`); the
    trailing k-means lines look pasted in from another script — confirm.
    """

    import numpy as np
    import nibabel as nb
    from matplotlib import pyplot as plt
    import sklearn as sk
    from sklearn.datasets import make_biclusters
    from sklearn.datasets import samples_generator as sg
    from sklearn.cluster.bicluster import SpectralCoclustering
    from sklearn.metrics import consensus_score

    # REAL DATA
    subject_file = '/Users/aki.nikolaidis/Desktop/NKI_SampleData/A00060280/3mm_bandpassed_demeaned_filtered_antswarp.nii.gz'
    roi_mask_file = home + '/git_repo/basc/masks/BG_3mm.nii.gz'
    roi2_mask_file = home + '/git_repo/basc/masks/yeo2_3mm.nii.gz'

    data = nb.load(subject_file).get_data().astype('float32')
    print('Data Loaded')

    print('Setting up NIS')
    roi_mask_file_nb = nb.load(roi_mask_file)
    roi2_mask_file_nb = nb.load(roi2_mask_file)

    roi_mask_nparray = nb.load(roi_mask_file).get_data().astype(
        'float32').astype('bool')
    roi2_mask_nparray = nb.load(roi2_mask_file).get_data().astype(
        'float32').astype('bool')

    # Boolean-mask the 4D data down to the voxels inside each ROI.
    roi1data = data[roi_mask_nparray]
    roi2data = data[roi2_mask_nparray]

    #add code that uploads the roi1data and roi2data, divides by the mean and standard deviation of the timeseries
    roi1data = sk.preprocessing.normalize(roi1data, norm='l2')
    roi2data = sk.preprocessing.normalize(roi2data, norm='l2')

    # Correlation distance -> similarity, clipped to [0, 1] with NaNs
    # zeroed out.
    dist_btwn_data_1_2 = np.array(
        sp.spatial.distance.cdist(roi1data, roi2data, metric='correlation'))
    sim_btwn_data_1_2 = 1 - dist_btwn_data_1_2
    sim_btwn_data_1_2[np.isnan(sim_btwn_data_1_2)] = 0
    sim_btwn_data_1_2[sim_btwn_data_1_2 < 0] = 0

    # Tiny random jitter — presumably to break ties / avoid degenerate
    # all-equal rows; confirm.
    sim_btwn_data_1_2 = sim_btwn_data_1_2 + (np.random.rand(
        len(sim_btwn_data_1_2), len(sim_btwn_data_1_2[1, :]))) / 100
    sim_btwn_data_1_2[sim_btwn_data_1_2 > 1] = 1

    sum(sum(sim_btwn_data_1_2 == np.inf))
    sum(sum(sim_btwn_data_1_2 == np.nan))

    model = SpectralCoclustering(n_clusters=5, random_state=0, n_init=100)
    model.fit(sim_btwn_data_1_2)

    # Reorder rows/columns so biclusters form contiguous blocks.
    fit_data = sim_btwn_data_1_2[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")

    plt.show()

    #SIMULATION DATA
    import numpy as np
    from matplotlib import pyplot as plt

    from sklearn.datasets import make_biclusters
    from sklearn.datasets import samples_generator as sg
    from sklearn.cluster.bicluster import SpectralCoclustering
    from sklearn.metrics import consensus_score

    #Creating Simulated Data
    data, rows, columns = make_biclusters(shape=(300, 100),
                                          n_clusters=5,
                                          noise=5,
                                          shuffle=False,
                                          random_state=0)

    plt.matshow(data, cmap=plt.cm.Blues)
    plt.title("Original dataset")

    data, row_idx, col_idx = sg._shuffle(data, random_state=0)
    plt.matshow(data, cmap=plt.cm.Blues)
    plt.title("Shuffled dataset")

    #Creating Model
    model = SpectralCoclustering(n_clusters=5, random_state=0)
    model.fit(data)
    # Agreement with the planted biclusters (1.0 = perfect recovery).
    score = consensus_score(model.biclusters_,
                            (rows[:, row_idx], columns[:, col_idx]))

    print("consensus score: {:.3f}".format(score))

    fit_data = data[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")

    plt.show()

    ####################################################################
    ####################################################################
    from sklearn import cluster
    import scipy as sp
    import time
    from sklearn import cluster, datasets
    import numpy as np
    from matplotlib import pyplot as plt

    from sklearn.datasets import make_biclusters
    from sklearn.datasets import samples_generator as sg
    from sklearn.cluster.bicluster import SpectralCoclustering
    from sklearn.metrics import consensus_score

    data1 = generate_simple_blobs(27)
    data2 = generate_simple_blobs(27)
    data2 = data2[0:150, :]

    print("Calculating Cross-clustering")
    print("Calculating pairwise distances between areas")

    # NOTE(review): reuses roi1data/roi2data and rows/row_idx etc. from
    # the earlier experiments rather than data1/data2 — confirm.
    dist_btwn_data_1_2 = np.array(
        sp.spatial.distance.cdist(roi1data, roi2data, metric='correlation'))
    sim_btwn_data_1_2 = 1 - dist_btwn_data_1_2
    sim_btwn_data_1_2[sim_btwn_data_1_2 < 0] = 0
    co_cluster = cluster.SpectralCoclustering()
    co_cluster.fit(sim_btwn_data_1_2)
    score = consensus_score(co_cluster.biclusters_,
                            (rows[:, row_idx], columns[:, col_idx]))

    print("consensus score: {:.3f}".format(score))

    fit_data = data[np.argsort(co_cluster.row_labels_)]
    fit_data = fit_data[:, np.argsort(co_cluster.column_labels_)]

    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")

    plt.show()
    # NOTE(review): the following lines reference `k`, `df` and `SS`,
    # none of which are defined in this function — pasted-in fragment.
    km = KMeans(n_clusters=k)
    km = km.fit(df.iloc[:,2:14])
    SS.append(km.inertia_)
# Elbow plot of k-means inertia (`NC` and `SS` come from an earlier,
# truncated loop).
plt.plot(NC,SS)
plt.xlabel('k')
plt.ylabel('SS')
plt.show()
from sklearn.cluster.bicluster import SpectralCoclustering
# Whisky similarity: correlate flavour profiles across distilleries
# (columns 2-13 hold the flavour scores).
flavour=df.iloc[:,2:14]
corr_whisky=pd.DataFrame.corr(flavour.transpose())
print(corr_whisky)
plt.figure(figsize=(8,8))
plt.pcolor(corr_whisky)
import pandas as pd
plt.colorbar()
model=SpectralCoclustering(n_clusters=5,random_state=45)
x=df["Distillery"]
df["disteliries_group"]=pd.Series(x,index=df.index)

# NOTE(review): `model` is created but never fitted; the grouping below
# just pairs two dataframe columns directly — confirm intent.
cluster=list(zip(df.iloc[:,1],df.iloc[:,13]))

cluster=sorted(cluster, key=lambda x: x[1])

print("the resultant grouped classified whiskey based on their flavour")
print("\n")

c=pd.DataFrame(cluster)
print(c)
model1=pickle.dump(cluster,open('model1.pkl','wb'))
       
예제 #28
0
def createSpectralCoclustering(params):
    """Factory returning a default-configured SpectralCoclustering.

    NOTE(review): ``params`` is accepted but ignored — confirm whether
    e.g. params['n_clusters'] should be forwarded to the constructor.
    """
    # params['n_clusters'] = N
    return SpectralCoclustering()
예제 #29
0
# Map each whisky region to a display colour.
cluster_colors = ["red", "orange", "green", "blue", "purple", "gray"]
regions = [
    "Speyside", "Highlands", "Lowlands", "Islands", "Campbelltown", "Islay"
]
import numpy as np
region_colors = dict(zip(regions, cluster_colors))  ## ENTER CODE HERE! ##
print(region_colors)

#from lectures
import pandas as pd
import pylab as plt
whisky = pd.read_csv('whiskies.txt')
whisky['Region'] = pd.read_csv('regions.txt')
from sklearn.cluster.bicluster import SpectralCoclustering
model = SpectralCoclustering(n_clusters=6, random_state=0)
# Correlate whiskies by their flavour profile (columns 2-13).
flavors = whisky.iloc[:, 2:14]
corr_flavors = pd.DataFrame.corr(flavors)
corr_whisky = pd.DataFrame.corr(flavors.transpose())
model.fit(corr_whisky)
# Reorder whiskies by cluster label and recompute the correlation
# matrix in the new order. (NOTE(review): .ix was removed in pandas 1.0.)
whisky['Group'] = pd.Series(model.row_labels_, index=whisky.index)
whisky = whisky.ix[np.argsort(model.row_labels_)]
whisky = whisky.reset_index(drop=True)
correlations = pd.DataFrame.corr(whisky.iloc[:, 2:14].transpose())
correlations = np.array(correlations)

distilleries = list(whisky.Distillery)
correlation_colors = []
for i in range(len(distilleries)):
    for j in range(len(distilleries)):
        if correlations[i][
예제 #30
0
#build data
# Planted biclusters with noise (SIZE, NB_CLUSTERS, NOIZE are defined
# elsewhere in the file).
data_init, rows, columns = make_biclusters(shape=(SIZE, SIZE),
                                           n_clusters=NB_CLUSTERS,
                                           noise=NOIZE,
                                           shuffle=False,
                                           random_state=0)

# we dont want negative data
data_init = np.absolute(data_init)

#shuffle rows and columns!
data, row_idx, col_idx = sg._shuffle(data_init, random_state=0)

######### sklearn algorithm #########
model = SpectralCoclustering(n_clusters=NB_CLUSTERS, random_state=0)
model.fit(data)
# Agreement with the planted biclusters (1.0 = perfect recovery).
score = consensus_score(model.biclusters_,
                        (rows[:, row_idx], columns[:, col_idx]))
fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]
#################################

######### our algorithm #########
custom_spectralcoclustering_result, r, c = bs.custom_spectral_biclustering(
    data, NB_CLUSTERS, K_MEANS_ITERATIONS)
custom_score = consensus_score((r, c), (rows[:, row_idx], columns[:, col_idx]))
#################################

######### plot results part #########
print("consensus score sklearn: {:.3f}".format(score))