def main():
    """Plot per-histotype exonization-candidate statistics per event type.

    For each alternative-splicing event type, loads the step-2 exonization
    candidate flags and the PSI-filtered event indices, then draws three
    2x3 panels of bar charts (one cell per event type):
      - 'stats'              absolute candidate counts per histotype
      - 'stats_rel'          counts relative to all detected events
      - 'stats_rel_sampsize' counts scaled by per-histotype sample size

    All figures are saved to PLOTDIR as both PDF and PNG.

    Relies on module-level globals: event_types, event_dict, BASEDIR_ICGC,
    CONF, PLOTDIR and the helper modules plt, gridspec, sp, tm, ic, axs.
    """
    figs = dict()
    figs['stats'] = plt.figure(figsize=(15, 8))
    figs['stats_rel'] = plt.figure(figsize=(15, 8))
    figs['stats_rel_sampsize'] = plt.figure(figsize=(15, 8))
    gss = dict()
    gss['stats'] = gridspec.GridSpec(2, 3)  #, wspace=0.0, hspace=0.0)
    gss['stats_rel'] = gridspec.GridSpec(2, 3)  #, wspace=0.0, hspace=0.0)
    gss['stats_rel_sampsize'] = gridspec.GridSpec(
        2, 3)  #, wspace=0.0, hspace=0.0)

    for e, event_type in enumerate(event_types):

        print('Handling %s' % event_type, file=sys.stderr)

        ### is exonization
        ### NOTE: pickle files must be opened in binary mode under Python 3
        ### (the original 'r' text mode raises on load); use context managers
        ### so handles are closed deterministically.
        with open(
                os.path.join(
                    BASEDIR_ICGC,
                    'merge_graphs_%s_C%i.exonize_candidates_step2.pickle' %
                    (event_type, CONF)), 'rb') as fh:
            is_exonization = pickle.load(fh)

        ### load confident events
        IN = h5py.File(
            os.path.join(
                BASEDIR_ICGC,
                'merge_graphs_%s_C%i.counts.hdf5' % (event_type, CONF)), 'r')
        idx_conf_icgc = IN['conf_idx'][:]
        [tumor_dict, histo_dict] = tm.translate([('analysis_id', 'is_tumour'),
                                                 ('analysis_id', 'histotype')])
        ### histotype per sample; 'NA' for samples without a mapping
        ### NOTE(review): assumes IN['strains'] yields str, not bytes —
        ### confirm for the h5py version in use
        htypes_all = sp.array([
            histo_dict[x.split('.')[0]]
            if x.split('.')[0] in histo_dict else 'NA'
            for x in IN['strains'][:]
        ])
        htypes_all_u, htypes_all_cnt = sp.unique(htypes_all,
                                                 return_counts=True)
        ### scale factor per histotype: median cohort size / cohort size
        htypes_all_med = sp.median(htypes_all_cnt).astype('float')
        htypes_sf = htypes_all_med / htypes_all_cnt
        IN.close()
        htypes_sf = dict(zip(htypes_all_u, htypes_sf))

        ### load psi filtered events (index 1 of the pickled tuple holds the
        ### per-(histotype, dpsi-threshold) event-index dictionary)
        with open(
                os.path.join(
                    BASEDIR_ICGC,
                    'merge_graphs_%s_C%i.counts.hdf5.psi_filt_per_ht_normalized.pickle'
                    % (event_type, CONF)), 'rb') as fh:
            idx_psi_icgc = pickle.load(fh)[1]

        ### get all histotypes
        htypes = sp.unique([x[0] for x in list(idx_psi_icgc.keys())])
        colors = ic.get_color_scheme('tumor_subtype', labels=htypes)

        ### get counts at a fixed delta-PSI threshold
        dp = 0.3
        counts_anno = sp.array([
            sp.sum(is_exonization[idx_psi_icgc[(ht, dp)]]) if
            (ht, dp) in idx_psi_icgc else 0 for ht in htypes
        ])
        counts_all = sp.array([
            idx_psi_icgc[(ht, dp)].shape[0] if (ht, dp) in idx_psi_icgc else 0
            for ht in htypes
        ])
        counts_sf = sp.array([htypes_sf[ht] for ht in htypes], dtype='float')

        def _draw_panel(key, values):
            """Draw one histotype bar panel on figure `key` at cell (e//3, e%3)."""
            ### BUG FIX: grid cell index must use integer division (e // 3);
            ### under Python 3 `e / 3` is a float and breaks GridSpec indexing
            ax = figs[key].add_subplot(gss[key][e // 3, e % 3])
            ax.bar(sp.arange(htypes.shape[0]) + 0.2,
                   values,
                   0.6,
                   color=colors,
                   linewidth=0.5)
            axs.set_ticks_outer(ax)
            axs.clean_axis(ax)
            ax.set_xticks(sp.arange(htypes.shape[0]) + 0.5)
            ax.set_xlim([-0.2, htypes.shape[0]])
            ### only the bottom row of the 2x3 grid gets x tick labels
            if e < len(event_types) - 3:
                ax.set_xticklabels([])
            else:
                ax.set_xticklabels(htypes, rotation=90, fontsize=10)
            ax.set_title(event_dict[event_type])
            ax.yaxis.grid(True)

        ### plot stats for events by histotype
        _draw_panel('stats', counts_anno)
        ### relative to events detected
        _draw_panel('stats_rel', counts_anno / counts_all.astype('float'))
        ### relative to sample size
        _draw_panel('stats_rel_sampsize', counts_anno * counts_sf)

    for p in figs:
        figs[p].tight_layout()
        figs[p].savefig(os.path.join(
            PLOTDIR,
            'event_overview_per_ht_exonize_level2_C%i_%s.pdf' % (CONF, p)),
                        format='pdf',
                        bbox_inches='tight')
        figs[p].savefig(os.path.join(
            PLOTDIR,
            'event_overview_per_ht_exonize_level2_C%i_%s.png' % (CONF, p)),
                        format='png',
                        bbox_inches='tight')
        plt.close(figs[p])
### event type under consideration and its input / candidate-output pickles
event_type = 'exon_skip'
event_in = os.path.join(BASEDIR_AS, 'alternative_splicing', 'merge_graphs_%s_C2.pickle' % event_type)
candidate_out = os.path.join(BASEDIR_AS, 'alternative_splicing', 'merge_graphs_%s_C2.exonize_candidates.pickle' % event_type)

VARIANTS = os.path.join(BASEDIR, 'qtl_analysis/variants/mccalls/October_2016_whitelist_2583.snv_mnv_indel.sorted.sparse.hdf5')
coding_genes = sp.loadtxt(os.path.join(BASEDIR, 'annotation/gencode.v19.annotation.hs37d5_chr.gtf.coding_genes.txt'), delimiter='\t', dtype='str')
read_thresh = 3

### prepare bam file dict (bam ID -> full path)
### BUG FIX: escape the dot in the suffix pattern — r'.bam$' would strip any
### character before 'bam'; also use a context manager to close the file.
bam_dict = dict()
with open(os.path.join(BASEDIR_AS, 'alternative_splicing', 'sample_list_merged.txt'), 'r') as fh:
    for line in fh:
        bam_id = re.sub(r'\.bam$', '', line.strip().split('/')[-1])
        bam_dict[bam_id] = line.strip()

### get ID mappings
[strain_dict, tumor_dict, primary_dict, project_dict] = tm.translate([('analysis_id', 'icgc_donor_id'), ('analysis_id', 'is_tumour'), ('analysis_id', 'specimen_type'), ('icgc_donor_id', 'project_code')])

### get conf_idx
EV = h5py.File(os.path.join(BASEDIR_AS, 'alternative_splicing', 'merge_graphs_%s_C2.counts.hdf5' % event_type), 'r')

conf_idx = EV['conf_idx'][:]
strains_ev = sp.array([x.split('.')[0] for x in EV['strains'][:]])
files_ev = sp.array([bam_dict[x] for x in EV['strains'][:]])

### only keep tumor samples
st_idx = sp.where([tumor_dict[x] == 'yes' if x in tumor_dict else False for x in strains_ev])[0]
### only keep primary samples
### BUG FIX: the fallback was the string 'False', which is truthy and would
### have kept samples that are missing from primary_dict; use boolean False.
k_idx = sp.where([primary_dict[x].startswith('Primary') if x in primary_dict else False for x in strains_ev[st_idx]])[0]
st_idx = st_idx[k_idx]
### deduplicate strains, keeping the first occurrence of each
_, u_idx = sp.unique(strains_ev[st_idx], return_index=True)
st_idx = st_idx[u_idx]
# NOTE(review): the two lines below were non-code scrape artifacts (a snippet
# separator and a bare literal) left between unrelated script fragments;
# commented out so the file parses.
# Exemplo n.º 3
# 0
    os.makedirs(PLOTDIR)

### load the tab-separated result table; promote a single row to 2D so the
### downstream code can always index rows uniformly
results = sp.loadtxt(fname, dtype='str', delimiter='\t')
if results.ndim < 2:
    results = results[sp.newaxis, :]

### annotation lookup and optional log-scale tag for output file names
lookup = un.get_lookup_complete()
logtag = '.log' if '--log' in sys.argv else ''

### 2x2 figure layout: wide left column, taller top row
gs = gridspec.GridSpec(2, 2, width_ratios=[5, 1], height_ratios=[2, 1])

### ID mapping dictionaries between analysis and donor identifiers
strain_dict, tumor_dict, primary_dict, donor_dict = tm.translate([
    ('analysis_id', 'icgc_donor_id'),
    ('analysis_id', 'is_tumour'),
    ('analysis_id', 'specimen_type'),
    ('icgc_donor_id', 'analysis_id'),
])
for i in range(min(TOP, results.shape[0])):

    print('Processing %i: %s' % (i, results[i, 0]), file=sys.stderr)

    alt_donors = sp.array(results[i, 20].split(','))
    ref_donors = sp.array(results[i, 21].split(','))
    #projects = results[i, 14].split(',')
    files_alt = results[i, 18].split(',')
    files_ref = results[i, 19].split(',')

    #var_pos = [int(x.split(':')[1]) for x in results[i, 0].split(',')]

    ### generate intron filter