def compute_individual_heatmaps(out_drugs, individual_distances, phenotypes, filename="Ustats", metric='euclidean'):
    '''
    DRAWING ONE CLUSTERED HEATMAP PER ENTRY OF individual_distances
    - out_drugs: row labels; rows labelled 'NH' or 'CTRL' are left out of every heatmap
    - individual_distances: indexable collection of 2D arrays whose rows follow the order of out_drugs
    - phenotypes: phenotypes[k] is appended to 'filename' to name the k-th heatmap
    - metric: metric handed to the clustering for both rows and columns (Ward linkage on both axes)
    Nothing is returned; the plotting is delegated to hierarchical_clustering.heatmap.
    '''
    # The row selection is computed on the labels as they were passed in.
    kept_rows = np.where([label not in ('NH', 'CTRL') for label in out_drugs])[0]
    row_labels = np.array(out_drugs)[kept_rows]

    for idx in range(len(individual_distances)):
        kept_data = individual_distances[idx][kept_rows]
        # Columns carry no names here: they are simply numbered.
        column_labels = range(kept_data.shape[1])
        output_name = filename + phenotypes[idx]

        hierarchical_clustering.heatmap(kept_data, row_labels, column_labels,
                                        row_method='ward', column_method='ward',
                                        row_metric=metric, column_metric=metric,
                                        color_gradient='YlOrRd', filename=output_name)
def condition_clustering(distance_name, folder='/media/lalil0u/New/projects/drug_screen/results/',
                         color_gradient='YlOrRd',
                         hit_only=False, compare_to='MITO',
                         level_row=0.4,
                         level_column=0.5, show=False,
                         filename='Clusters_{}_{}.pkl',
                         # to avoid reloading distance files each time
                         distances=None, all_exposures=None,
                         row_method='ward'
                         ):
    '''
    DOING CONDITION CLUSTERING (MEDIAN OF EXPERIMENTS FOR THIS CONDITION)
    - distance_name: key used to load the distances and to look up the plot title
        in drug_screen_utils.DISTANCES
    - folder: results folder; 'DS_hits_1.5IQR.pkl' is read from it and outputs go to
        its 'inference_Freplicates' sub-folder
    - compare_to: We can do drug clustering based on their distances to eachother (value 'DS')
        but we can also do drug clustering based on their distances to Mitocheck (value 'MITO')
    - 'hit_only': to do clustering considering hit distances only or no
    - level_row, level_column, show, color_gradient, row_method: forwarded to
        hierarchical_clustering.heatmap
    - filename: pattern formatted with (distance_name, level_row) for the pickle written
        when hit_only is True
    - distances, all_exposures: already aggregated distance matrix and its row labels.
        When 'distances' is given nothing is reloaded and both are used as they are, so
        'all_exposures' must then be given too, in the row order of 'distances'.

    Returns (distances, all_exposures, who_cluster_hits): the matrix that was clustered,
    the labels of the hit conditions (in the same order as the cluster labels used for
    counting) and, for each cluster id from 1 to the largest one, a Counter of the hit
    conditions it contains.
    '''
    # NOTE(review): pickle.load executes arbitrary code from the file; only use on trusted results folders.
    with open(os.path.join(folder, 'DS_hits_1.5IQR.pkl'), 'rb') as f:
        _, _, exposure_hits = pickle.load(f)

    # A condition is kept as a hit when more than half of its experiments that passed QC are hits.
    hit_counts = Counter(exposure_hits)
    hit_fraction = {el: hit_counts[el] / float(PASSED_QC_COND[el]) for el in hit_counts}
    distinct_exposure = [el for el in hit_fraction if hit_fraction[el] > 0.5]
    distinct_set = set(distinct_exposure)

    if distances is None:
        distances, _, exposure_, _ = _return_right_distance(distance_name, folder,
                                                            filter_replicates=True,
                                                            hit_only=hit_only,
                                                            compare_to=compare_to)
        if not hit_only:
            all_exposures = sorted(Counter(exposure_).keys())
        else:
            all_exposures = sorted(distinct_exposure)

        # One row per condition: median over the experiments of that condition.
        # np.vstack needs a real sequence, generators are not accepted by recent NumPy.
        distances = np.vstack([np.median(distances[np.where(exposure_ == condition)], 0)
                               for condition in all_exposures])

    if compare_to == 'MITO':
        column_header = list(range(distances.shape[1]))
    else:
        column_header = all_exposures

    print(distances.shape)
    flat_distances = distances.flatten()
    clusters = hierarchical_clustering.heatmap(distances,
                                               row_header=all_exposures,
                                               column_header=column_header,
                                               row_method=row_method,
                                               column_method='ward',
                                               row_metric='euclidean',
                                               column_metric='euclidean',
                                               color_gradient=color_gradient,
                                               filename="{}{}".format(int(hit_only), distance_name),
                                               folder='{}/inference_Freplicates'.format(folder),
                                               level_row=level_row,
                                               level_column=level_column,
                                               title=drug_screen_utils.DISTANCES[distance_name],
                                               colorbar_ticks=[-2, 0, 2],
                                               colorbar_ticklabels=[0, '', 1],
                                               show=show,
                                               colorbar_title='Distance (arbitrary units)',
                                               range_normalization=(scoreatpercentile(flat_distances, 10),
                                                                    scoreatpercentile(flat_distances, per=90)))

    global_ = np.bincount(clusters)
    if not hit_only:
        # Keep cluster labels and condition names filtered by the SAME mask so that
        # position i of one still describes position i of the other.
        is_hit = np.array([el in distinct_set for el in all_exposures], dtype=bool)
        hit_clusters = clusters[np.where(is_hit)]
        all_exposures = [el for el in all_exposures if el in distinct_set]
    else:
        hit_clusters = clusters

    print('{} {}'.format(len(hit_clusters), len(distinct_exposure)))
    print('{} {}'.format(global_, np.bincount(hit_clusters)))

    hit_labels = np.array(all_exposures)
    who_cluster_hits = {k: Counter(hit_labels[np.where(hit_clusters == k)])
                        for k in range(1, np.max(clusters) + 1)}

    if hit_only:
        out_path = os.path.join(folder, 'inference_Freplicates', filename.format(distance_name, level_row))
        with open(out_path, 'wb') as f:
            pickle.dump(who_cluster_hits, f)

    return distances, all_exposures, who_cluster_hits
def experiment_clustering(distance_name, folder='/media/lalil0u/New/projects/drug_screen/results/',
                          color_gradient='YlOrRd',
                          hit_only=False, compare_to='MITO',
                          level=0.4):
    '''
    DOING EXPERIMENT CLUSTERING AS OPPOSED TO CONDITION CLUSTERING
    - distance_name: key used to load the distances and to look up the plot title
        in drug_screen_utils.DISTANCES
    - folder: results folder; 'DS_hits_1.5IQR.pkl' is read from it and the figure goes to
        its 'inference_Freplicates' sub-folder
    - compare_to: We can do drug clustering based on their distances to eachother (value 'DS')
        but we can also do drug clustering based on their distances to Mitocheck (value 'MITO')
    - 'hit_only': to do clustering considering hit distances only or no
    - level, color_gradient: forwarded to hierarchical_clustering.heatmap

    Returns (hit_clusters, who_cluster_hits, plate_clusters): the cluster label of every
    retained hit experiment and, for each cluster id from 1 to the largest one, a Counter
    of the conditions and a Counter of the plates of the hit experiments it contains.
    '''
    distances, who_, exposure_, mito_who = _return_right_distance(distance_name, folder,
                                                                  filter_replicates=True,
                                                                  hit_only=hit_only,
                                                                  compare_to=compare_to)
    # Experiment ids are parsed as '<prefix>_<plate>--<rest>'; the plate number is kept as an int.
    plates = np.array([int(el.split('--')[0].split('_')[1]) for el in who_])
    # Row labels shown on the heatmap: condition name followed by the right-aligned plate number.
    exposure_wPL = np.array(['{}{:>10}'.format(exposure, plates[i]) for i, exposure in enumerate(exposure_)])

    # NOTE(review): pickle.load executes arbitrary code from the file; only use on trusted results folders.
    with open(os.path.join(folder, 'DS_hits_1.5IQR.pkl'), 'rb') as f:
        _, who_hits, exposure_hits = pickle.load(f)

    # A condition is kept as a hit when more than half of its experiments that passed QC are hits.
    hit_counts = Counter(exposure_hits)
    hit_fraction = {el: hit_counts[el] / float(PASSED_QC_COND[el]) for el in hit_counts}
    distinct_exposure = [el for el in hit_fraction if hit_fraction[el] > 0.5]
    distinct_set = set(distinct_exposure)

    # Condition of each hit experiment, keyed by experiment id, so that labels can be
    # recovered whatever the relative order of who_ and who_hits.
    exposure_of_hit = dict(zip(who_hits, exposure_hits))

    if hit_only:
        # np.hstack needs a real sequence, generators are not accepted by recent NumPy.
        wh_ = np.hstack([np.where(who_ == who_hits[i])[0]
                         for i in range(len(who_hits))
                         if exposure_hits[i] in distinct_set])
        distances = distances[wh_]
        if compare_to == 'DS':
            # Square experiment-vs-experiment matrix: columns are filtered like rows.
            distances = distances[:, wh_]
        print(wh_)
        who_ = who_[wh_]
        exposure_wPL = exposure_wPL[wh_]
        plates = plates[wh_]

    if compare_to == 'MITO':
        column_header = mito_who
    else:
        column_header = who_

    flat_distances = distances.flatten()
    clusters = hierarchical_clustering.heatmap(distances,
                                               row_header=exposure_wPL,
                                               column_header=column_header,
                                               row_method='ward',
                                               column_method='ward',
                                               row_metric='euclidean',
                                               column_metric='euclidean',
                                               color_gradient=color_gradient,
                                               filename="E{}".format(distance_name),
                                               folder='{}/inference_Freplicates'.format(folder),
                                               level=level,
                                               title=drug_screen_utils.DISTANCES[distance_name],
                                               colorbar_ticks=[-2, 0, 2],
                                               colorbar_ticklabels=[0, '', 1],
                                               colorbar_title='Distance (arbitrary units)',
                                               range_normalization=(scoreatpercentile(flat_distances, 10),
                                                                    scoreatpercentile(flat_distances, per=90)))

    global_ = np.bincount(clusters)
    if not hit_only:
        # One mask, in the row order of the clustering, applied to everything: an
        # experiment is retained when it is a hit of a distinct-hit condition.
        is_hit = np.array([w in exposure_of_hit and exposure_of_hit[w] in distinct_set for w in who_],
                          dtype=bool)
        hit_clusters = clusters[is_hit]
        plates = plates[is_hit]
        hit_exposures = np.array([exposure_of_hit[w] for w in who_[is_hit]])
    else:
        # Rows were already restricted to retained hits before clustering.
        hit_clusters = clusters
        hit_exposures = np.array([exposure_of_hit[w] for w in who_])

    print('{} {} {}'.format(len(hit_clusters), len(hit_exposures), len(plates)))
    print('{} {}'.format(global_, np.bincount(hit_clusters)))

    cluster_ids = range(1, np.max(clusters) + 1)
    who_cluster_hits = {k: Counter(hit_exposures[np.where(hit_clusters == k)]) for k in cluster_ids}
    plate_clusters = {k: Counter(plates[np.where(hit_clusters == k)]) for k in cluster_ids}

    return hit_clusters, who_cluster_hits, plate_clusters