def cluster_array_data_by_proteins(arr_df, sample_inds, num_clusters, ind_dict, method='complete', metric='spearman'):
    """
    Cluster array data for each protein antigen separately using linkage clustering.

    Parameters
    ----------
    arr_df : pandas.DataFrame
        dataframe of array data
    sample_inds : list
        list of samples that we want to cluster - allows clustering a
        subset of the data in arr_df.
    num_clusters : int
        number of clusters - arbitrary parameter set by the user.
    ind_dict : dictionary
        dictionary mapping prot_names to columns in the arr_df
    method : string
        linkage clustering method. Can be 'average', 'single' and
        'complete' (default).
    metric : string
        distance metric used for comparing response vectors. Default set
        to 'spearman' (rank order correlation).

    Returns
    -------
    dist_mat : dictionary
        dictionary of distance matrices whose keys are the keys in
        ind_dict (proteins)
    Z_struct : dictionary
        clustering structure (matlab style) returned by linkage indexed
        by ind_dict.keys()
    dend : dictionary
        clustering dendrograms for each index set in ind_dict
    clusters : dictionary
        cluster assignment of each datapoint indexed by ind_dict.keys()
    """
    dist_mat = {}  # distance matrices
    Z_struct = {}  # clustering struct
    dend = {}      # dendrogram struct
    clusters = {}  # cluster labels

    # clustering occurs for each protein separately: i.e. H7, N9, H1, etc.
    for k in ind_dict.keys():
        # use Andrew's package which allows clustering using Spearman distances
        # (sch.linkage, and pdist do not support this for some reason, unlike Matlab)
        # BUG FIX: forward the caller-supplied method/metric instead of
        # hard-coding 'complete'/'spearman' (the defaults are unchanged,
        # so existing callers see identical behavior).
        (dist_mat[k], Z_struct[k], dend[k]) = \
            hcp.computeHCluster(arr_df[sample_inds][ind_dict[k]],
                                method=method, metric=metric)
        # flat cluster assignments: cut the tree into num_clusters groups
        clusters[k] = sch.fcluster(Z_struct[k], t=num_clusters, criterion='maxclust')

    return dist_mat, Z_struct, dend, clusters
# cluster using Andrew's package: num_clusters = 5 dMat = {} # distance matrices Z_struct = {} # clustering struct dend = {} # dendrogram struct clusters = {} # cluster labels cluster_treatment_stats = {} pred_treatment_labels = {} # predicted treatment labels based on clustering post_inds = time_dict['Post'] p_labels = np.unique(arr_df[post_inds].group_label.values) for k in ind_dict.keys(): # use Andrew's package which allows clustering using Spearman distances (sch.linkage, and pdist do not support this for some reason, unlike Matlab) (dMat[k], Z_struct[k], dend[k]) = hcp.computeHCluster(arr_df[post_inds][ind_dict[k]], method='complete', metric='spearman') clusters[k] = sch.fcluster(Z_struct[k], t=num_clusters, criterion='maxclust') # compute cluster homogeneity and completness (purity and accuracy) for treatment label and for infection status: pred_treatment_labels[k] = np.zeros(shape=(arr_df[post_inds].shape[0])) for i in np.arange(1, num_clusters+1): c_inds = np.where(clusters[k] == i) val, ind = scipy.stats.mode(arr_df[post_inds]['group_label'].values[c_inds]) pred_treatment_labels[k][c_inds] = val[0] cluster_treatment_stats[k] = metrics.homogeneity_completeness_v_measure(arr_df[post_inds]['group_label'].values, pred_treatment_labels[k]) # compute pairwise statistics of clusters using alternate assays as values: prot_stats = {} for p in ['SHA_ha', 'SHA_na']: p_values = {assay: np.zeros(shape=(num_clusters, num_clusters)) for assay in assays}
# Plot per-ptid raw responses for the obese and normal groups.
# NOTE(review): `p` here is leaked from an earlier loop outside this chunk,
# so these figures use whatever protein `p` last held — confirm intended.
for g in ["Obese", "Normal"]:
    # one subplot row per ptid in the group
    f, axarr = plt.subplots(group_inds[g].shape[0], 1)
    f.set_tight_layout(True)
    for i, curr_g in enumerate(group_inds[g]):
        axarr[i].plot(arr_df.loc[curr_g][ind_dict[p]])
        # axarr[i].set_ylim(0, 20000)
        # axarr[i].set_yticks([])
        axarr[i].set_title(p + " " + curr_g)
    filename = "".join([FIG_PATH, p, "_", g, "_responses_by_ptids.png"])
    f.savefig(filename, dpi=200)

# cluster data:
for k in ind_dict.keys():
    # use Andrew's package which allows clustering using Spearman distances (sch.linkage, and pdist do not support this for some reason, unlike Matlab)
    (dMat[k], Z_struct[k], dend[k]) = hcp.computeHCluster(arr_df[ind_dict[k]], method="complete", metric="spearman")
    # flat cluster labels 1..num_clusters for each sample
    clusters[k] = sch.fcluster(Z_struct[k], t=num_clusters, criterion="maxclust")

# Plot figures for a given clustering solution - currently only performed for the Shanghai strain:
for p in ["SHA_ha", "SHA_na"]:
    # one subplot row per cluster
    f, axarr = plt.subplots(num_clusters, 1)
    f.set_tight_layout(True)
    f.set_size_inches(18, 11)
    # plot clusters
    for i in np.arange(num_clusters):
        # overlay all member responses of cluster i+1 (fcluster labels start at 1)
        axarr[i].plot(np.arange(len(ind_dict[p])), arr_df[ind_dict[p]].loc[clusters[p] == i + 1].T)
        # n = number of samples in the cluster (count of True in the mask)
        axarr[i].set_title(
            p + " cluster " + str(i + 1) + " (n = " + str(len(np.where([clusters[p] == i + 1])[0])) + ")"
        )
num_clusters = 4
dMat = {}      # distance matrices
Z_struct = {}  # clustering struct
dend = {}      # dendrogram struct
clusters = {}  # cluster labels

# cluster obese and WT by adjuvant:
for a in adjuvants:
    # pool obese + WT post-treatment samples for this adjuvant
    # (presumably group_inds values are pandas Index objects, so .append
    #  concatenates and returns a new Index — TODO confirm)
    curr_inds = group_inds['Ob_post_' + a].append(group_inds['WT_post_' + a])
    # nested dicts: results keyed first by adjuvant, then by protein
    dMat[a] = {}
    Z_struct[a] = {}
    dend[a] = {}
    clusters[a] = {}
    for p in ['SHA_ha', 'SHA_na']:
        # use Andrew's package which allows clustering using Spearman distances (sch.linkage, and pdist do not support this for some reason, unlike Matlab)
        (dMat[a][p], Z_struct[a][p], dend[a][p]) = hcp.computeHCluster(arr_df.loc[curr_inds][ind_dict[p]], method='complete', metric='spearman')
        clusters[a][p] = sch.fcluster(Z_struct[a][p], t=num_clusters, criterion='maxclust')

# compute ranksum p-values for comparisons of HAI, microneut assays for the Shanghai and Cal strains comparing different treatment groups across the same adjuvant
p_values = {assay: np.zeros(shape=(len(adjuvants), len(exp_group_prefixes))) for assay in assays}
q_values = {assay: np.zeros(shape=(len(adjuvants), len(exp_group_prefixes))) for assay in assays}
stats_df = pd.DataFrame()
for assay in assays + arr_summary_stats:
    res = []         # ranksum results, one per (timepoint, adjuvant) pair
    ind_labels = []  # matching row labels, e.g. 'post_<adjuvant>'
    for t in ['pre', 'post']:
        for ad in adjuvants:
            # Wilcoxon rank-sum: obese vs. WT for this timepoint/adjuvant
            res.append(scipy.stats.ranksums(arr_df.loc[group_inds['Ob_' + t + '_' + ad]][assay], arr_df.loc[group_inds['WT_' + t + '_' + ad]][assay]))
            ind_labels.append(t + '_' + ad)