def _plot_and_save_local_ancestry(df, kmer, image_filename, num_chromosomes, id_vars, x_axis, y_scale):
    """Draw each ancestry column as a horizontal track of coloured points and save it.

    ``df`` is melted to long format, then the source-column names
    ``test_<i>`` / ``true_<i>`` are turned into numeric y positions so every
    column becomes its own track:

        test_<i> -> 2*i - 2*y_scale
        true_<i> -> 2*i - 1*y_scale      (i = 1 .. num_chromosomes)

    Args:
        df: wide DataFrame to plot. Presumably contains ``test_<i>`` and
            ``true_<i>`` columns in addition to ``id_vars`` -- confirm with caller.
        kmer: not used by this function; kept so existing call sites still work.
        image_filename: path passed to ``plot.save``.
        num_chromosomes: number of ``test``/``true`` pairs that get a y position.
        id_vars: identifier column(s) forwarded to ``pd.melt``.
        x_axis: name of the column mapped to the x aesthetic.
        y_scale: controls the vertical gap between a ``test`` track and its
            ``true`` track (see formula above).
    """
    print('saving plot as: {}'.format(image_filename))

    var_name = 'chromosome'
    local_ancestry_df_long = pd.melt(
        df, id_vars=id_vars, var_name=var_name, value_name='estimated_ancestry')

    # Insertion order matters: it fixes the order of the axis labels/breaks.
    new_names = {}
    for i in range(1, num_chromosomes + 1):
        new_names['test_{}'.format(i)] = 2 * i - 2 * y_scale
        new_names['true_{}'.format(i)] = 2 * i - 1 * y_scale

    # Relabel only the melted 'chromosome' column, in a single pass.  The
    # previous per-key DataFrame.replace() rescanned every column for every
    # label and could have rewritten matching values in other columns.
    # Names without an entry in new_names are left as they are.
    local_ancestry_df_long[var_name] = local_ancestry_df_long[var_name].map(
        lambda name: new_names.get(name, name))

    plot = (
        ggplot.ggplot(ggplot.aes(x=x_axis, y=var_name, color='estimated_ancestry'),
                      data=local_ancestry_df_long)
        + ggplot.geom_point()
        + ggplot.scale_y_continuous(labels=list(new_names.keys()),
                                    breaks=list(new_names.values()))
        + ggplot.scale_color_manual(values=['#FF0000', '#0000FF', '#73008C'])
        # TODO: this margin should depend on scale
        + ggplot.theme(plot_margin={'top': 0.7, 'bottom': 0.3})
    )
    plot.save(image_filename)
# Reshape the bias statistics into one long frame with a 'method' label.
# brks is sliced into three groups of column names, one group per bias
# definition (assumes brks is a list of 15 names, 5 per method -- TODO confirm).
import pandas as pd  # needed for pd.concat; harmless if pandas is already imported as pd

bias_methods = [
    # total-based
    ('(Total-Expected Total)/Expected Total', brks[:5]),
    # enhancement-based
    ('(Enhanc-Expected Enhanc)/Expected Enhanc', brks[5:10]),
    # enhancements + full sample background
    ('(Enhanc+Expected Backgr-Expected Total)/Expected Total', brks[10:]),
]

pieces = []
for method_label, stat_cols in bias_methods:
    dftmp = df[['n_sub'] + stat_cols].melt(
        id_vars=['n_sub'], value_vars=stat_cols, var_name='stat', value_name='value')
    dftmp['method'] = method_label
    pieces.append(dftmp)

# DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row
# order and the same (per-piece) index that the chained append produced.
df_stacked = pd.concat(pieces)

# Characters 1-2 of the stat name become the percentile label
# (presumably the names embed a two-digit percentile -- verify against brks).
df_stacked['percentile'] = ['{0}th%'.format(a[1:3]) for a in df_stacked['stat']]

# plots
# compare all 3 methods, one facet per method
plt1 = (
    gg.ggplot(df_stacked, gg.aes(x='n_sub', y='value', color='percentile'))
    + gg.geom_line()
    + gg.xlab('N drives')
    + gg.ylab('Bias (%)')
    + gg.theme_bw()
    + gg.scale_color_manual(values=colors)
    + gg.geom_hline(y=[-25, 25], linetype="dashed", color="gray")
    + gg.geom_vline(x=[10, 15], linetype="dashed", color="gray")
    + gg.facet_wrap('method')
    + gg.ggtitle('Bias comparison {0}'.format(title))
)
plt1.save(filename=r'..\charts\drivebias_laqn_{0}.png'.format(species), width=None, height=None, dpi=300)

# plot the total-based method alone for presentation
plt2 = (
    gg.ggplot(df_stacked[df_stacked['method'] == '(Total-Expected Total)/Expected Total'],
              gg.aes(x='n_sub', y='value', color='percentile'))
    + gg.geom_line()
    + gg.xlab('N drives')
    + gg.ylab('Bias (%)')
    + gg.ylim(-100, 100)
    + gg.scale_color_manual(values=colors)
    + gg.geom_hline(y=[-25, 25], linetype="dashed", color="gray")
    + gg.geom_vline(x=[10, 15], linetype="dashed", color="gray")
    + gg.ggtitle('Bias comparison {0}'.format(title))
)
# Larger font for slides; set through the theme object's private _rcParams.
t = gg.theme_bw()
t._rcParams['font.size'] = 16
plt2 = plt2 + t
plt2.save(filename=r'..\charts\drivebias_laqn_{0}_total.png'.format(species), width=None, height=None, dpi=300)

# plot the enhancement + expected-background method alone for presentation
plt3 = (
    gg.ggplot(df_stacked[df_stacked['method'] == '(Enhanc+Expected Backgr-Expected Total)/Expected Total'],
              gg.aes(x='n_sub', y='value', color='percentile'))
    + gg.geom_line()
    + gg.xlab('N drives')
    + gg.ylab('Bias (%)')
    + gg.ylim(-100, 100)
    + gg.scale_color_manual(values=colors)
    + gg.geom_hline(y=[-25, 25], linetype="dashed", color="gray")
    + gg.geom_vline(x=[10, 15], linetype="dashed", color="gray")
    + gg.ggtitle('Bias comparison {0}'.format(title))
)
t = gg.theme_bw()
t._rcParams['font.size'] = 16
plt3 = plt3 + t
plt3.save(filename=r'..\charts\drivebias_laqn_{0}_enhanc.png'.format(species), width=None, height=None, dpi=300)
def _plot_annotated_embedding(desc_column, png_name, show_plot):
    """Colour the embedding points by one column of ``icd9_desc`` and save the chart.

    Args:
        desc_column: column of ``icd9_desc`` that becomes the colour label.
        png_name: file name written under ``base + '/Plots/'``.
        show_plot: when true, also print the chart object after saving.
    """
    # Left join keeps every embedded code even if it has no description row.
    # pd.merge returns a new frame, so df_tsne itself is never modified.
    annotated = pd.merge(df_tsne, icd9_desc, left_on='label', right_on='icd9', how='left')
    annotated = annotated.drop(['label', 'icd9'], axis=1)
    annotated = annotated.rename(columns={desc_column: 'label'})

    chart = (
        ggplot(annotated, aes(x='x1', y='x2', color='label'))
        + geom_point(size=70, alpha=0.6)
        + ggtitle("Word2Vec Embeddings")
    )
    chart.save(base + '/Plots/' + png_name)
    if show_plot:
        print(chart)


def plot_embedding(show_plot=False, label='', n_clusters=5):
    """Plot the 2-D ICD code embedding held in the module-level ``df_tsne``.

    Args:
        show_plot: display the figure/chart in addition to saving it.
        label: selects the colouring:
            ``''``          plain matplotlib scatter of the first two columns;
            ``'Category'``  colour by the ``range`` column of ``icd9_desc``;
            ``'Code_type'`` colour by the ``type`` column of ``icd9_desc``;
            ``'Cluster'``   colour by KMeans cluster of the pickled embedding matrix.
            Any other value does nothing.
        n_clusters: number of KMeans clusters (only used for ``'Cluster'``).

    All images are written below ``base + '/Plots/'``.
    """
    if label == '':
        plt.figure(figsize=(12, 8))
        plt.scatter(df_tsne.iloc[:, 0], df_tsne.iloc[:, 1])
        plt.title('ICD Code Embeddings')
        plt.savefig(base + '/Plots/icd_embedding.png')
        if show_plot:
            plt.show()

    elif label == 'Category':
        _plot_annotated_embedding('range', 'icd_embedding_categories.png', show_plot)

    elif label == 'Code_type':
        _plot_annotated_embedding('type', 'icd_embedding_code_type.png', show_plot)

    elif label == 'Cluster':
        # Work on a copy because a 'label' column is overwritten below.
        plot_df = df_tsne.copy(deep=True)

        # Load embedding dict.
        # NOTE: pickle.load can execute code from the file; only use trusted files.
        with open(base + '/Processed_Data/icd_embedding_dict.pickle', 'rb') as handle:
            embedding_dict = pickle.load(handle)
        X = embedding_dict.get('embedding_matrix')

        # Cluster on the embedding space (fixed seed for repeatable labels).
        icd_clusters = KMeans(n_clusters=n_clusters, random_state=1).fit(X)

        # Renumber clusters by size: value_counts() lists the raw labels from
        # most to least frequent, so lut maps raw label -> size rank (0 = largest).
        idx = pd.DataFrame(icd_clusters.labels_)[0].value_counts().index.values
        lut = np.zeros_like(idx)
        lut[idx] = np.arange(n_clusters)
        plot_df['label'] = lut[icd_clusters.labels_].astype(str)

        # NOTE(review): the palette lists five colours, matching the default
        # n_clusters=5; behaviour with more clusters depends on ggplot -- confirm.
        chart = (
            ggplot(plot_df, aes(x='x1', y='x2', color='label'))
            + geom_point(size=70, alpha=0.6)
            + ggtitle("Word2Vec Embeddings")
            + scale_color_manual(values=["red", "blue", "green", "purple", "orange"])
        )
        chart.save(base + '/Plots/icd_clustered_embedding.png')
        if show_plot:
            print(chart)
def new_plot_ancestry_with_correct_results(test, true, y_scale=0.5, image_filename=None):
    """Plot estimated ancestry next to the true ancestry for every sample column.

    Every column of ``test`` that is not one of the ignored VCF-style columns
    is treated as a sample.  For each sample two tracks are drawn against
    ``POS``: ``<sample>_test`` (values from ``test``) and ``<sample>_true``
    (values from ``true``), coloured by ancestry.

    Track y positions for the i-th sample (0-based):

        <sample>_test -> 2*i - 2*y_scale
        <sample>_true -> 2*i - 1*y_scale

    Args:
        test: DataFrame with a ``POS`` column and one ancestry column per sample.
        true: DataFrame holding the reference ancestry under the same sample names.
        y_scale: controls the gap between a sample's test and true track.
        image_filename: if given, the plot is saved there; otherwise it is shown.

    Raises:
        KeyError: if a sample column of ``test`` is missing from ``true``.
    """
    # we want only 'POS' and the ancestry columns
    columns_to_ignore = ['POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']
    ancestry_cols = [col for col in test.columns if col not in columns_to_ignore]

    # Wide frame:  POS | s1_test | s1_true | s2_test | s2_true | ...
    # spacing maps each of those track names to its y position; insertion
    # order fixes the order of the axis labels/breaks.
    merged = pd.DataFrame(test['POS'])
    spacing = {}
    for i, col_name in enumerate(ancestry_cols):
        if col_name not in true:
            raise KeyError('true ancestry dataframe is missing ancestry for id: {}'.format(col_name))
        merged[col_name + '_test'] = test[col_name]
        merged[col_name + '_true'] = true[col_name]
        spacing[col_name + '_test'] = 2 * i - 2 * y_scale
        spacing[col_name + '_true'] = 2 * i - 1 * y_scale

    # Long frame:  POS | chromosome (track name) | ancestry
    melted = pd.melt(merged, id_vars=['POS'], var_name='chromosome', value_name='ancestry')

    # Turn track names into numeric y positions.  Only the 'chromosome' column
    # is touched and it is done in one pass; the previous per-key
    # DataFrame.replace() rescanned POS and ancestry for every track as well.
    melted['chromosome'] = melted['chromosome'].map(spacing)

    plot = (
        ggplot.ggplot(ggplot.aes(x='POS', y='chromosome', color='ancestry'), data=melted)
        + ggplot.geom_point()
        + ggplot.scale_y_continuous(labels=list(spacing.keys()),
                                    breaks=list(spacing.values()))
        + ggplot.scale_color_manual(values=['#FF0000', '#0000FF', '#73008C'])
        # TODO: this margin should depend on scale
        + ggplot.theme(plot_margin={'top': 0.7, 'bottom': 0.3})
    )

    if image_filename is not None:
        plot.save(image_filename)
    else:
        plot.show()
# Disabled: percentiles split into separate charts, all sites together.
#plt1 = gg.ggplot(df_along, gg.aes(x='n_passes',y='value',color='site_str'))+gg.geom_point()+gg.xlab('N drives')+gg.ylab('Bias (%)')+gg.theme_bw()+gg.xlim(0,100)+gg.facet_wrap('yparam',scales='free_y')
#plt1.save(filename = r'..\charts\bias_{0}.png'.format(c['name']), width=None, height=None, dpi=200)

# Sample size (n_segments) against number of drive periods, one line per site.
plt2 = (
    gg.ggplot(df_a, gg.aes(x='n_passes', y='n_segments', color='site_str'))
    + gg.geom_line()
    + gg.xlab('n, number drive periods')
    + gg.ylab('Sample size (number of drive patterns)')
    + gg.theme_bw()
    + gg.xlim(0, 35)
    + gg.ylim(0, 2000)
)
plt2.save(
    filename=r'..\charts\n_segments_{0}_{1}.png'.format(c['name'], dtstamp),
    width=None,
    height=None,
    dpi=200,
)

# All percentiles in one panel per site; dashed guides at y = +/-25 and x = 10, 15.
plt3 = (
    gg.ggplot(df_along, gg.aes(x='n_passes', y='value', color='yparam'))
    + gg.geom_line()
    + gg.xlab('n, number of drive periods')
    + gg.ylab('Sample error (%)')
    + gg.theme_bw()
    + gg.xlim(0, 35)
    + gg.ylim(-100, 100)
    + gg.geom_hline(y=25, linetype="dashed", color="gray")
    + gg.geom_hline(y=-25, linetype="dashed", color="gray")
    + gg.geom_vline(x=[10, 15], linetype="dashed", color="gray")
    + gg.scale_color_manual(values=colors)
    + gg.facet_wrap('site_str')
)
plt3.save(
    filename=r'..\charts\percentiles_{0}_{1}.png'.format(c['name'], dtstamp),
    width=None,
    height=None,
    dpi=200,
)