def cluster_array_data_by_proteins(arr_df, sample_inds, num_clusters, ind_dict, method='complete', metric='spearman'):
    """
    Cluster array data for each protein antigen separately using linkage clustering

    Parameters:
    ----------
    arr_df: pandas.DataFrame
        dataframe of array data
    sample_inds: list
        list of samples that we want to cluster - allows clustering a subset of the data in arr_df.
    num_clusters: int
        number of clusters - arbitrary parameter set by the user.
    ind_dict: dictionary
        dictionary mapping prot_names to columns in the arr_df
    method: string
        linkage clustering method. Can be 'average', 'single', or 'complete' (default).
    metric: string
        distance metric used for comparing response vectors. Default set to 'spearman' (rank order correlation).


    Returns:
    -------
    dist_mat: dictionary
        dictionary of distance matrices whose keys are the keys in ind_dict (proteins)
    Z_struct: dictionary
        clustering structure (Matlab-style) returned by linkage, indexed by ind_dict.keys()
    dend: dictionary
        clustering dendrograms for each index set in ind_dict
    clusters: dictionary
        cluster assignment of each datapoint indexed by ind_dict.keys()
    """
    dist_mat = {}  # distance matrices
    Z_struct = {}  # clustering struct
    dend = {}  # dendrogram struct
    clusters = {}  # cluster labels
    
    # clustering occurs for each protein separately: i.e. H7, N9, H1, etc.
    for k in ind_dict.keys():
        # use Andrew's package which allows clustering using Spearman distances 
        # (sch.linkage, and pdist do not support this for some reason, unlike Matlab)
        (dist_mat[k], Z_struct[k], dend[k]) = \
            hcp.computeHCluster(arr_df[sample_inds][ind_dict[k]], method=method, metric=metric)
        clusters[k] = sch.fcluster(Z_struct[k], t=num_clusters, criterion='maxclust')

    return dist_mat, Z_struct, dend, clusters
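
# Example usage of the helper above (a minimal sketch; assumes arr_df, ind_dict,
# time_dict, num_clusters, hcp, and sch are defined/imported as elsewhere in this script):
# dist_mat, Z_struct, dend, clusters = cluster_array_data_by_proteins(
#     arr_df, time_dict['Post'], num_clusters=5, ind_dict=ind_dict,
#     method='complete', metric='spearman')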

#  cluster using Andrew's package:
num_clusters = 5
dMat = {}  # distance matrices
Z_struct = {}  # clustering struct
dend = {}  # dendrogram struct
clusters = {}  # cluster labels
cluster_treatment_stats = {}
pred_treatment_labels = {}  # predicted treatment labels based on clustering

post_inds = time_dict['Post']
p_labels = np.unique(arr_df[post_inds].group_label.values)
for k in ind_dict.keys():
    # use Andrew's package which allows clustering using Spearman distances (sch.linkage, and pdist do not support this for some reason, unlike Matlab)
    (dMat[k], Z_struct[k], dend[k]) = hcp.computeHCluster(arr_df[post_inds][ind_dict[k]], method='complete', metric='spearman')
    clusters[k] = sch.fcluster(Z_struct[k], t=num_clusters, criterion='maxclust')

    # compute cluster homogeneity and completeness (purity and accuracy) for treatment label and for infection status:
    # (each sample is assigned the majority (modal) treatment label of its cluster)
    pred_treatment_labels[k] = np.zeros(shape=(arr_df[post_inds].shape[0]))
    for i in np.arange(1, num_clusters + 1):
        c_inds = np.where(clusters[k] == i)
        val, ind = scipy.stats.mode(arr_df[post_inds]['group_label'].values[c_inds])
        pred_treatment_labels[k][c_inds] = val[0]

    cluster_treatment_stats[k] = metrics.homogeneity_completeness_v_measure(arr_df[post_inds]['group_label'].values, pred_treatment_labels[k])
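
# Note: homogeneity/completeness/V-measure are invariant to cluster label permutation;
# a minimal sanity check on toy labels (illustrative only, not part of this analysis):
# metrics.homogeneity_completeness_v_measure([0, 0, 1, 1], [1, 1, 0, 0])  # -> (1.0, 1.0, 1.0)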

# compute pairwise statistics of clusters using alternate assays as values:
prot_stats = {}
for p in ['SHA_ha', 'SHA_na']:
    p_values = {assay: np.zeros(shape=(num_clusters, num_clusters)) for assay in assays}
    for g in ["Obese", "Normal"]:
        f, axarr = plt.subplots(group_inds[g].shape[0], 1)
        f.set_tight_layout(True)
        for i, curr_g in enumerate(group_inds[g]):
            axarr[i].plot(arr_df.loc[curr_g][ind_dict[p]])
            # axarr[i].set_ylim(0, 20000)
            # axarr[i].set_yticks([])
            axarr[i].set_title(p + " " + curr_g)
        filename = "".join([FIG_PATH, p, "_", g, "_responses_by_ptids.png"])
        f.savefig(filename, dpi=200)


# cluster data:
for k in ind_dict.keys():
    # use Andrew's package which allows clustering using Spearman distances (sch.linkage, and pdist do not support this for some reason, unlike Matlab)
    (dMat[k], Z_struct[k], dend[k]) = hcp.computeHCluster(arr_df[ind_dict[k]], method="complete", metric="spearman")
    clusters[k] = sch.fcluster(Z_struct[k], t=num_clusters, criterion="maxclust")


# Plot figures for a given clustering solution - currently only performed for the Shanghai strain:
for p in ["SHA_ha", "SHA_na"]:
    f, axarr = plt.subplots(num_clusters, 1)
    f.set_tight_layout(True)
    f.set_size_inches(18, 11)
    # plot clusters
    for i in np.arange(num_clusters):

        axarr[i].plot(np.arange(len(ind_dict[p])), arr_df[ind_dict[p]].loc[clusters[p] == i + 1].T)
        axarr[i].set_title(
            p + " cluster " + str(i + 1) + " (n = " + str(np.sum(clusters[p] == i + 1)) + ")"
        )

num_clusters = 4
dMat = {}  # distance matrices
Z_struct = {}  # clustering struct
dend = {}  # dendrogram struct
clusters = {}  # cluster labels

# cluster obese and WT by adjuvant:
for a in adjuvants:
    # pool the obese and WT 'post' samples for the current adjuvant:
    curr_inds = group_inds['Ob_post_' + a].append(group_inds['WT_post_' + a])
    dMat[a] = {}
    Z_struct[a] = {}
    dend[a] = {}
    clusters[a] = {}
    for p in ['SHA_ha', 'SHA_na']:
        # use Andrew's package which allows clustering using Spearman distances (sch.linkage, and pdist do not support this for some reason, unlike Matlab)
        (dMat[a][p], Z_struct[a][p], dend[a][p]) = hcp.computeHCluster(arr_df.loc[curr_inds][ind_dict[p]], method='complete', metric='spearman')
        clusters[a][p] = sch.fcluster(Z_struct[a][p], t=num_clusters, criterion='maxclust')


# compute ranksum p-values for comparisons of HAI, microneut assays for the Shanghai and Cal strains comparing different treatment groups across the same adjuvant
p_values = {assay: np.zeros(shape=(len(adjuvants), len(exp_group_prefixes))) for assay in assays}
q_values = {assay: np.zeros(shape=(len(adjuvants), len(exp_group_prefixes))) for assay in assays}
stats_df = pd.DataFrame()

for assay in assays + arr_summary_stats:
    res = []
    ind_labels = []
    for t in ['pre', 'post']:
        for ad in adjuvants:
            res.append(scipy.stats.ranksums(arr_df.loc[group_inds['Ob_' + t + '_' + ad]][assay], arr_df.loc[group_inds['WT_' + t + '_' + ad]][assay]))
            ind_labels.append(t + '_' + ad)