示例#1
0
def _plot_and_save_local_ancestry(df, kmer, image_filename, num_chromosomes, id_vars, x_axis, y_scale):
    """Plot estimated ancestry per chromosome as coloured points and save the image.

    ``df`` is melted to long format (one row per ``id_vars`` / column pair).
    The melted column names ``test_<i>`` / ``true_<i>`` are then replaced by
    numeric y positions so each of them is drawn as its own horizontal row,
    labelled with the original name.

    Args:
        df: wide DataFrame; every column not listed in ``id_vars`` is melted.
        kmer: not used by this function; kept so existing callers keep working.
        image_filename: path handed to ``plot.save``.
        num_chromosomes: y positions are generated for ``test_1``..``test_<n>``
            and ``true_1``..``true_<n>``.
        id_vars: identifier column(s) passed through to ``pd.melt``.
        x_axis: name of the column mapped to the x aesthetic.
        y_scale: offset subtracted from each row position (see the loop below).
    """
    print('saving plot as: {}'.format(image_filename))
    var_name = 'chromosome'

    local_ancestry_df_long = pd.melt(df, id_vars=id_vars, var_name=var_name, value_name='estimated_ancestry')

    # Map each melted column name to its y position.
    # NOTE(review): by operator precedence this is 2*i - (2*y_scale) and
    # 2*i - (1*y_scale): the test/true rows of a pair sit y_scale apart and
    # consecutive chromosomes 2 apart. Confirm (2*i - 2) * y_scale was not meant.
    new_names = {}
    for i in range(1, num_chromosomes + 1):
        new_names['test_{}'.format(i)] = 2*i - 2 * y_scale
        new_names['true_{}'.format(i)] = 2*i - 1 * y_scale

    # Replace only within the 'chromosome' column, in one pass. Replacing on the
    # whole frame (once per key) could also rewrite matching values in the id or
    # ancestry columns and rescans every column for every key.
    if new_names:
        local_ancestry_df_long[var_name] = local_ancestry_df_long[var_name].replace(new_names)

    # Breaks/labels put the original column names back on the numeric y axis.
    # The manual colour scale lists three colours; presumably there are at most
    # three distinct ancestry values -- TODO confirm.
    plot = ggplot.ggplot(ggplot.aes(x=x_axis, y=var_name, color='estimated_ancestry'), data=local_ancestry_df_long) \
        + ggplot.geom_point() \
        + ggplot.scale_y_continuous(labels=list(new_names.keys()), breaks=list(new_names.values())) \
        + ggplot.scale_color_manual(values=['#FF0000', '#0000FF', '#73008C']) \
        + ggplot.theme(plot_margin={'top':0.7, 'bottom':0.3}) ### TODO: this should depend on scale

    plot.save(image_filename)
# Stack the three groups of bias columns listed in `brks` (first five, next
# five, remainder) into one long-format frame, tagging every row with the
# formula ("method") that its group represents.
import pandas as pd  # for pd.concat: DataFrame.append was removed in pandas 2.0

#total-based
dftmp = df[['n_sub'] + brks[:5]].melt(id_vars=['n_sub'], value_vars=brks[:5], var_name='stat', value_name='value')
dftmp['method'] = '(Total-Expected Total)/Expected Total'
df_stacked = dftmp
#enhancement-based
dftmp = df[['n_sub'] + brks[5:10]].melt(id_vars=['n_sub'], value_vars=brks[5:10], var_name='stat', value_name='value')
dftmp['method'] = '(Enhanc-Expected Enhanc)/Expected Enhanc'
df_stacked = pd.concat([df_stacked, dftmp])
#enhancements + full sample background
dftmp = df[['n_sub'] + brks[10:]].melt(id_vars=['n_sub'], value_vars=brks[10:], var_name='stat', value_name='value')
dftmp['method'] = '(Enhanc+Expected Backgr-Expected Total)/Expected Total'
df_stacked = pd.concat([df_stacked, dftmp])
# Characters 1-2 of each stat name become the percentile label
# (presumably the names embed a two-digit percentile there -- verify against `brks`).
df_stacked['percentile'] = ['{0}th%'.format(a[1:3]) for a in df_stacked['stat']]

#plots
#compare all 3: one facet per method, dashed gray guides at y = -25/25 and x = 10/15
plt1 = (
    gg.ggplot(df_stacked, gg.aes(x='n_sub', y='value', color='percentile'))
    + gg.geom_line()
    + gg.xlab('N drives')
    + gg.ylab('Bias (%)')
    + gg.theme_bw()
    + gg.scale_color_manual(values=colors)
    + gg.geom_hline(y=[-25, 25], linetype="dashed", color="gray")
    + gg.geom_vline(x=[10, 15], linetype="dashed", color="gray")
    + gg.facet_wrap('method')
    + gg.ggtitle('Bias comparison {0}'.format(title))
)
plt1.save(filename=r'..\charts\drivebias_laqn_{0}.png'.format(species), width=None, height=None, dpi=300)

#plot total alone for presentation
plt2 = (
    gg.ggplot(
        df_stacked[df_stacked['method'] == '(Total-Expected Total)/Expected Total'],
        gg.aes(x='n_sub', y='value', color='percentile'),
    )
    + gg.geom_line()
    + gg.xlab('N drives')
    + gg.ylab('Bias (%)')
    + gg.ylim(-100, 100)
    + gg.scale_color_manual(values=colors)
    + gg.geom_hline(y=[-25, 25], linetype="dashed", color="gray")
    + gg.geom_vline(x=[10, 15], linetype="dashed", color="gray")
    + gg.ggtitle('Bias comparison {0}'.format(title))
)
# Larger font for slides; NOTE(review): this writes to the theme's private _rcParams.
t = gg.theme_bw()
t._rcParams['font.size'] = 16
plt2 = plt2 + t
plt2.save(filename=r'..\charts\drivebias_laqn_{0}_total.png'.format(species), width=None, height=None, dpi=300)

#plot enhancement (+ expected background) alone for presentation
plt3 = (
    gg.ggplot(
        df_stacked[df_stacked['method'] == '(Enhanc+Expected Backgr-Expected Total)/Expected Total'],
        gg.aes(x='n_sub', y='value', color='percentile'),
    )
    + gg.geom_line()
    + gg.xlab('N drives')
    + gg.ylab('Bias (%)')
    + gg.ylim(-100, 100)
    + gg.scale_color_manual(values=colors)
    + gg.geom_hline(y=[-25, 25], linetype="dashed", color="gray")
    + gg.geom_vline(x=[10, 15], linetype="dashed", color="gray")
    + gg.ggtitle('Bias comparison {0}'.format(title))
)
t = gg.theme_bw()
t._rcParams['font.size'] = 16
plt3 = plt3 + t
plt3.save(filename=r'..\charts\drivebias_laqn_{0}_enhanc.png'.format(species), width=None, height=None, dpi=300)
示例#3
0
def _save_embedding_chart(frame, out_path, show_plot, palette=None):
    """Scatter ``x1`` against ``x2`` coloured by ``label``, save it, optionally print it."""
    chart = ggplot(frame, aes(x='x1', y='x2', color='label')) \
        + geom_point(size=70, alpha=0.6) \
        + ggtitle("Word2Vec Embeddings")
    if palette is not None:
        chart = chart + scale_color_manual(values=palette)
    chart.save(out_path)
    if show_plot:
        print(chart)


def _relabel_from_description(source_column):
    """Return a copy of ``df_tsne`` whose ``label`` is taken from ``icd9_desc[source_column]``."""
    annotated = pd.merge(df_tsne.copy(deep=True),
                         icd9_desc,
                         left_on='label',
                         right_on='icd9',
                         how='left')
    # The original code label and the join key are dropped; the description
    # column takes over the 'label' name used by the colour aesthetic.
    annotated = annotated.drop(['label', 'icd9'], axis=1)
    return annotated.rename(columns={source_column: 'label'})


def plot_embedding(show_plot=False, label='', n_clusters=5):
    """Save a scatter plot of the module-level ``df_tsne`` embedding frame.

    Args:
        show_plot: when true, also display the figure after saving it.
        label: selects the colouring:
            ``''`` -- plain matplotlib scatter of the first two columns;
            ``'Category'`` -- colour by ``icd9_desc['range']``;
            ``'Cluster'`` -- colour by KMeans cluster of the pickled embedding matrix;
            ``'Code_type'`` -- colour by ``icd9_desc['type']``.
            Any other value produces no plot.
        n_clusters: number of KMeans clusters (``'Cluster'`` only).
    """
    coords = df_tsne.copy(deep=True)

    if label == '':
        plt.figure(figsize=(12, 8))
        plt.scatter(coords.iloc[:, 0], coords.iloc[:, 1])
        plt.title('ICD Code Embeddings')
        plt.savefig(base + '/Plots/icd_embedding.png')
        if show_plot:
            plt.show()
        return

    if label == 'Category':
        _save_embedding_chart(_relabel_from_description('range'),
                              base + '/Plots/icd_embedding_categories.png',
                              show_plot)
        return

    if label == 'Cluster':
        # NOTE(review): pickle.load runs arbitrary code from the file; only
        # safe while this pickle is produced by the project itself.
        with open(base + '/Processed_Data/icd_embedding_dict.pickle', 'rb') as handle:
            embedding_matrix = pickle.load(handle).get('embedding_matrix')

        fitted = KMeans(n_clusters=n_clusters, random_state=1).fit(embedding_matrix)

        # Renumber clusters by size: value_counts() lists cluster ids from most
        # to least frequent, so the lookup table sends the largest cluster to 0,
        # the next to 1, and so on.
        ids_by_size = pd.DataFrame(fitted.labels_)[0].value_counts().index.values
        rank_of = np.zeros_like(ids_by_size)
        rank_of[ids_by_size] = np.arange(n_clusters)
        coords['label'] = rank_of[fitted.labels_].astype(str)

        # Five colours are listed; n_clusters above five may not have enough
        # entries in the manual scale -- TODO confirm.
        _save_embedding_chart(coords,
                              base + '/Plots/icd_clustered_embedding.png',
                              show_plot,
                              palette=["red", "blue", "green", "purple", "orange"])
        return

    if label == 'Code_type':
        _save_embedding_chart(_relabel_from_description('type'),
                              base + '/Plots/icd_embedding_code_type.png',
                              show_plot)
示例#4
0
def new_plot_ancestry_with_correct_results(test, true, y_scale=0.5, image_filename=None):
    """Plot estimated vs. true ancestry per sample along ``POS``.

    Every ancestry column of ``test`` is paired with the same-named column of
    ``true``; each of the two gets its own horizontal row of points coloured
    by ancestry.

    Args:
        test: DataFrame with a ``POS`` column plus one ancestry column per
            sample; the VCF-style columns listed in ``columns_to_ignore`` are skipped.
        true: DataFrame holding the reference ancestry under the same column names.
        y_scale: offset subtracted from each row position (see ``spacing`` below).
        image_filename: if given, the plot is saved there; otherwise it is shown.

    Raises:
        KeyError: if ``true`` lacks one of the ancestry columns of ``test``.
    """
    columns_to_ignore = ['POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT'] ### we want only 'POS' and ancestry columns
    ancestry_cols = [col for col in test.columns if col not in columns_to_ignore]

    merged = pd.DataFrame(test['POS'])
    for col_name in ancestry_cols:
        if col_name not in true:
            raise KeyError('true ancestry dataframe is missing ancestry for id: {}'.format(col_name))
        merged[col_name+'_test'] = test[col_name]
        merged[col_name+'_true'] = true[col_name]

    melted = pd.melt(merged, id_vars=['POS'], var_name='chromosome', value_name='ancestry')
    # The melt takes `merged` from something like this:
    #
    #   POS   sample1_test  sample1_true  sample2_test  sample2_true
    #   111   pop1          pop1          pop2          pop1
    #   124   pop1          pop1          pop2          pop1
    #
    # to this (blank lines between groups added for clarity):
    #
    #   POS   chromosome    ancestry
    #   111   sample1_test  pop1
    #   124   sample1_test  pop1
    #
    #   111   sample1_true  pop1
    #   124   sample1_true  pop1
    #
    #   111   sample2_test  pop2
    #   124   sample2_test  pop2
    #
    #   111   sample2_true  pop1
    #   124   sample2_true  pop1

    # y position for every melted column name.
    # NOTE(review): by operator precedence this is 2*i - (2*y_scale) and
    # 2*i - (1*y_scale): the test/true rows of a sample sit y_scale apart and
    # consecutive samples 2 apart. Confirm (2*i - 2) * y_scale was not meant.
    spacing = {}
    for i, col_name in enumerate(ancestry_cols):
        spacing[col_name+'_test'] = 2*i - 2 * y_scale
        spacing[col_name+'_true'] = 2*i - 1 * y_scale

    # With the default y_scale=0.5 the example above becomes:
    #
    #   POS   chromosome  ancestry
    #   111   -1.0        pop1      (sample1_test)
    #   111   -0.5        pop1      (sample1_true)
    #   111    1.0        pop2      (sample2_test)
    #   111    1.5        pop1      (sample2_true)
    #
    # Map only the 'chromosome' column: replacing across the whole frame (once
    # per key) could also rewrite an ancestry value that happens to equal a
    # column name. Every melted name is a key of `spacing`, so map() is total
    # and yields a numeric column for the continuous y scale.
    melted['chromosome'] = melted['chromosome'].map(spacing)

    # Breaks/labels put the original column names back on the numeric y axis.
    plot = ggplot.ggplot(ggplot.aes(x='POS', y='chromosome', color='ancestry'), data=melted) \
        + ggplot.geom_point() \
        + ggplot.scale_y_continuous(labels=list(spacing.keys()), breaks=list(spacing.values())) \
        + ggplot.scale_color_manual(values=['#FF0000', '#0000FF', '#73008C']) \
        + ggplot.theme(plot_margin={'top':0.7, 'bottom':0.3}) ### TODO: this should depend on scale

    if image_filename is not None:
        plot.save(image_filename)
    else:
        plot.show()
示例#5
0
    # Disabled chart: split percentiles into different charts (one facet per
    # `yparam`), all sites together.
    #plt1 = gg.ggplot(df_along, gg.aes(x='n_passes',y='value',color='site_str'))+gg.geom_point()+gg.xlab('N drives')+gg.ylab('Bias (%)')+gg.theme_bw()+gg.xlim(0,100)+gg.facet_wrap('yparam',scales='free_y')
    #plt1.save(filename = r'..\charts\bias_{0}.png'.format(c['name']), width=None, height=None, dpi=200)

    # n_segments: line chart of `n_segments` against `n_passes`, one line per
    # `site_str`, with fixed axis limits (x: 0-35, y: 0-2000).
    plt2 = gg.ggplot(
        df_a, gg.aes(x='n_passes', y='n_segments', color='site_str')
    ) + gg.geom_line() + gg.xlab('n, number drive periods') + gg.ylab(
        'Sample size (number of drive patterns)') + gg.theme_bw() + gg.xlim(
            0, 35) + gg.ylim(0, 2000)
    # File name carries c['name'] and `dtstamp` (both defined outside this fragment).
    plt2.save(filename=r'..\charts\n_segments_{0}_{1}.png'.format(
        c['name'], dtstamp),
              width=None,
              height=None,
              dpi=200)
    # Combine percentiles, split sites: `value` against `n_passes`, one line
    # per `yparam`, one facet per `site_str`. Dashed gray guides are drawn at
    # y = 25 and y = -25 and at x = 10 and x = 15; axes are limited to
    # x: 0-35 and y: -100..100.
    plt3 = gg.ggplot(
        df_along, gg.aes(x='n_passes', y='value', color='yparam')
    ) + gg.geom_line() + gg.xlab('n, number of drive periods') + gg.ylab(
        'Sample error (%)') + gg.theme_bw() + gg.xlim(0, 35) + gg.ylim(
            -100, 100) + gg.geom_hline(
                y=25, linetype="dashed", color="gray") + gg.geom_hline(
                    y=-25, linetype="dashed", color="gray") + gg.geom_vline(
                        x=[10, 15], linetype="dashed",
                        color="gray") + gg.scale_color_manual(
                            values=colors) + gg.facet_wrap('site_str')
    # Same naming scheme as above, with a `percentiles_` prefix.
    plt3.save(filename=r'..\charts\percentiles_{0}_{1}.png'.format(
        c['name'], dtstamp),
              width=None,
              height=None,
              dpi=200)